From 449ba46b22250c3cd23ac16a5cc493c4781654b2 Mon Sep 17 00:00:00 2001 From: Jake Goulding Date: Fri, 16 Jul 2021 15:56:38 -0400 Subject: [PATCH 01/27] refactor: Make more use of SNAFU's context methods and ensure! macro --- internal_types/src/schema.rs | 8 +-- server/src/config.rs | 108 +++++++++++++++++------------------ server/src/db/catalog.rs | 21 +++---- server/src/db/chunk.rs | 4 +- 4 files changed, 66 insertions(+), 75 deletions(-) diff --git a/internal_types/src/schema.rs b/internal_types/src/schema.rs index 2afb0cede3..4427540576 100644 --- a/internal_types/src/schema.rs +++ b/internal_types/src/schema.rs @@ -11,7 +11,7 @@ use arrow::datatypes::{ DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, TimeUnit, }; -use snafu::Snafu; +use snafu::{OptionExt, Snafu}; use crate::{ schema::sort::{ColumnSort, SortKey}, @@ -395,11 +395,9 @@ impl Schema { pub fn compute_select_indicies(&self, columns: &[&str]) -> Result> { columns .iter() - .map(|column_name| { + .map(|&column_name| { self.find_index_of(column_name) - .ok_or_else(|| Error::ColumnNotFound { - column_name: column_name.to_string(), - }) + .context(ColumnNotFound { column_name }) }) .collect() } diff --git a/server/src/config.rs b/server/src/config.rs index 3fb35941ae..4bfe453ab7 100644 --- a/server/src/config.rs +++ b/server/src/config.rs @@ -16,9 +16,12 @@ use write_buffer::config::WriteBufferConfig; /// This module contains code for managing the configuration of the server. use crate::{ db::{catalog::Catalog, DatabaseToCommit, Db}, - Error, JobRegistry, Result, + DatabaseAlreadyExists, DatabaseNotFound, DatabaseReserved, Error, + InvalidDatabaseStateTransition, JobRegistry, Result, RulesDatabaseNameMismatch, + ServerShuttingDown, }; use observability_deps::tracing::{self, error, info, warn, Instrument}; +use snafu::{ensure, OptionExt}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; @@ -85,16 +88,14 @@ impl Config { db_name: DatabaseName<'static>, ) -> Result> { let mut state = self.state.write().expect("mutex poisoned"); - if state.reservations.contains(&db_name) { - return Err(Error::DatabaseReserved { - db_name: db_name.to_string(), - }); - } - if state.databases.contains_key(&db_name) { - return Err(Error::DatabaseAlreadyExists { - db_name: db_name.to_string(), - }); - } + ensure!( + !state.reservations.contains(&db_name), + DatabaseReserved { db_name } + ); + ensure!( + !state.databases.contains_key(&db_name), + DatabaseAlreadyExists { db_name } + ); state.reservations.insert(db_name.clone()); Ok(DatabaseHandle { @@ -119,28 +120,23 @@ impl Config { /// without initializing it, see [`block_db`](Self::block_db). 
pub(crate) fn recover_db(&self, db_name: DatabaseName<'static>) -> Result> { let mut state = self.state.write().expect("mutex poisoned"); - if state.reservations.contains(&db_name) { - return Err(Error::DatabaseReserved { - db_name: db_name.to_string(), - }); - } + ensure!( + !state.reservations.contains(&db_name), + DatabaseReserved { db_name } + ); - let db_state = - state - .databases - .get(&db_name) - .cloned() - .ok_or_else(|| Error::DatabaseNotFound { - db_name: db_name.to_string(), - })?; + let db_state = state + .databases + .get(&db_name) + .cloned() + .context(DatabaseNotFound { db_name: &db_name })?; - if db_state.is_initialized() { - return Err(Error::DatabaseAlreadyExists { - db_name: db_name.to_string(), - }); - } + ensure!( + !db_state.is_initialized(), + DatabaseAlreadyExists { db_name } + ); - state.reservations.insert(db_name.clone()); + state.reservations.insert(db_name); Ok(DatabaseHandle { state: Some(db_state), config: &self, @@ -159,16 +155,14 @@ impl Config { db_name: DatabaseName<'static>, ) -> Result> { let mut state = self.state.write().expect("mutex poisoned"); - if state.reservations.contains(&db_name) { - return Err(Error::DatabaseReserved { - db_name: db_name.to_string(), - }); - } - if state.databases.contains_key(&db_name) { - return Err(Error::DatabaseAlreadyExists { - db_name: db_name.to_string(), - }); - } + ensure!( + !state.reservations.contains(&db_name), + DatabaseReserved { db_name } + ); + ensure!( + !state.databases.contains_key(&db_name), + DatabaseAlreadyExists { db_name } + ); state.reservations.insert(db_name.clone()); Ok(BlockDatabaseGuard { @@ -228,9 +222,7 @@ impl Config { // TODO: implement for non-initialized databases let db = self .db_initialized(db_name) - .ok_or_else(|| Error::DatabaseNotFound { - db_name: db_name.to_string(), - })?; + .context(DatabaseNotFound { db_name })?; let mut rules = db.rules.write(); *rules = update(rules.clone()).map_err(UpdateError::Closure)?; @@ -600,12 +592,13 @@ impl<'a> DatabaseHandle<'a> { server_id, db_name, } => { - if db_name != &rules.name { - return Err(Error::RulesDatabaseNameMismatch { - actual: rules.name.to_string(), - expected: db_name.to_string(), - }); - } + ensure!( + db_name == &rules.name, + RulesDatabaseNameMismatch { + actual: rules.name, + expected: db_name, + } + ); self.state = Some(Arc::new(DatabaseState::RulesLoaded { object_store: Arc::clone(&object_store), @@ -616,10 +609,11 @@ impl<'a> DatabaseHandle<'a> { Ok(()) } - state => Err(Error::InvalidDatabaseStateTransition { + state => InvalidDatabaseStateTransition { actual: state.code(), expected: DatabaseStateCode::Known, - }), + } + .fail(), } } @@ -652,10 +646,11 @@ impl<'a> DatabaseHandle<'a> { Ok(()) } - state => Err(Error::InvalidDatabaseStateTransition { + state => InvalidDatabaseStateTransition { actual: state.code(), expected: DatabaseStateCode::RulesLoaded, - }), + } + .fail(), } } @@ -665,7 +660,7 @@ impl<'a> DatabaseHandle<'a> { DatabaseState::Replay { db } => { if self.config.shutdown.is_cancelled() { error!("server is shutting down"); - return Err(Error::ServerShuttingDown); + return ServerShuttingDown.fail(); } let shutdown = self.config.shutdown.child_token(); @@ -688,10 +683,11 @@ impl<'a> DatabaseHandle<'a> { Ok(()) } - state => Err(Error::InvalidDatabaseStateTransition { + state => InvalidDatabaseStateTransition { actual: state.code(), expected: DatabaseStateCode::Replay, - }), + } + .fail(), } } } diff --git a/server/src/db/catalog.rs b/server/src/db/catalog.rs index 6a06b06731..c2953a3de6 100644 --- 
a/server/src/db/catalog.rs +++ b/server/src/db/catalog.rs @@ -8,7 +8,7 @@ use data_types::chunk_metadata::ChunkSummary; use data_types::chunk_metadata::DetailedChunkSummary; use data_types::partition_metadata::{PartitionSummary, TableSummary}; use internal_types::schema::Schema; -use snafu::Snafu; +use snafu::{OptionExt, Snafu}; use tracker::{MappedRwLockReadGuard, RwLock, RwLockReadGuard}; use self::chunk::CatalogChunk; @@ -135,11 +135,8 @@ impl Catalog { /// Get a specific table by name, returning `None` if there is no such table pub fn table(&self, table_name: impl AsRef) -> Result> { let table_name = table_name.as_ref(); - RwLockReadGuard::try_map(self.tables.read(), |tables| tables.get(table_name)).map_err( - |_| Error::TableNotFound { - table: table_name.to_string(), - }, - ) + RwLockReadGuard::try_map(self.tables.read(), |tables| tables.get(table_name)) + .map_err(|_| TableNotFound { table: table_name }.build()) } /// Get a specific partition by name, returning an error if it can't be found @@ -154,9 +151,9 @@ impl Catalog { self.table(table_name)? .partition(partition_key) .cloned() - .ok_or_else(|| Error::PartitionNotFound { - partition: partition_key.to_string(), - table: table_name.to_string(), + .context(PartitionNotFound { + partition: partition_key, + table: table_name, }) } @@ -174,9 +171,9 @@ impl Catalog { .read() .chunk(chunk_id) .cloned() - .ok_or_else(|| Error::ChunkNotFound { - partition: partition_key.to_string(), - table: table_name.to_string(), + .context(ChunkNotFound { + partition: partition_key, + table: table_name, chunk_id, }) } diff --git a/server/src/db/chunk.rs b/server/src/db/chunk.rs index 997030cb08..e4ba6870cd 100644 --- a/server/src/db/chunk.rs +++ b/server/src/db/chunk.rs @@ -5,7 +5,7 @@ use std::{ use data_types::partition_metadata; use partition_metadata::TableSummary; -use snafu::{ResultExt, Snafu}; +use snafu::{OptionExt, ResultExt, Snafu}; use datafusion::physical_plan::SendableRecordBatchStream; use datafusion_util::MemoryStream; @@ -417,7 +417,7 @@ impl QueryChunk for DbChunk { // column out to get the set of values. 
let values = values .remove(column_name) - .ok_or_else(|| Error::ReadBufferError { + .with_context(|| ReadBufferError { chunk_id: self.id(), msg: format!( "failed to find column_name {:?} in results of tag_values", From 1c16988a51a3ce720d5d2aaea58b7360e797e4b3 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 19 Jul 2021 14:09:06 -0400 Subject: [PATCH 02/27] chore: Update datafusion references (#2056) --- Cargo.lock | 2 +- datafusion/Cargo.toml | 2 +- query/src/exec.rs | 19 +- query/src/exec/context.rs | 66 ++-- query/src/frontend/sql.rs | 2 +- query_tests/cases/in/duplicates.expected | 171 +++++----- query_tests/cases/in/duplicates.sql | 6 +- query_tests/cases/in/pushdown.expected | 383 ++++++++++------------- query_tests/cases/in/pushdown.sql | 26 +- query_tests/src/runner.rs | 14 +- 10 files changed, 349 insertions(+), 342 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8474d61947..ec611883bf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -843,7 +843,7 @@ dependencies = [ [[package]] name = "datafusion" version = "4.0.0-SNAPSHOT" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=bd3ee23520a3e6f135891ec32d96fcea7ee2bb55#bd3ee23520a3e6f135891ec32d96fcea7ee2bb55" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=3fb600df48ab1e53903b1a9bb12ebde33ad0856b#3fb600df48ab1e53903b1a9bb12ebde33ad0856b" dependencies = [ "ahash 0.7.4", "arrow", diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index 64a9e97e69..f3a735d307 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -9,4 +9,4 @@ description = "Re-exports datafusion at a specific version" # Rename to workaround doctest bug # Turn off optional datafusion features (function packages) -upstream = { git = "https://github.com/apache/arrow-datafusion.git", rev="bd3ee23520a3e6f135891ec32d96fcea7ee2bb55", default-features = false, package = "datafusion" } +upstream = { git = "https://github.com/apache/arrow-datafusion.git", rev="3fb600df48ab1e53903b1a9bb12ebde33ad0856b", default-features = false, package = "datafusion" } diff --git a/query/src/exec.rs b/query/src/exec.rs index ce5e085581..e5b7034c75 100644 --- a/query/src/exec.rs +++ b/query/src/exec.rs @@ -39,6 +39,7 @@ use crate::plan::{ }; use self::{ + context::IOxExecutionConfig, split::StreamSplitNode, task::{DedicatedExecutor, Error as ExecutorError}, }; @@ -111,6 +112,9 @@ pub struct Executor { /// Executor for running system/reorganization tasks such as /// compact reorg_exec: DedicatedExecutor, + + /// The default configuration options with which to create contexts + config: IOxExecutionConfig, } #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -128,12 +132,25 @@ impl Executor { let query_exec = DedicatedExecutor::new("IOx Query Executor Thread", num_threads); let reorg_exec = DedicatedExecutor::new("IOx Reorg Executor Thread", num_threads); + let config = IOxExecutionConfig::new(); + Self { query_exec, reorg_exec, + config, } } + /// returns the config of this executor + pub fn config(&self) -> &IOxExecutionConfig { + &self.config + } + + /// returns a mutable reference to this executor's config + pub fn config_mut(&mut self) -> &mut IOxExecutionConfig { + &mut self.config + } + /// Executes this plan on the query pool, and returns the /// resulting set of strings pub async fn to_string_set(&self, plan: StringSetPlan) -> Result { @@ -289,7 +306,7 @@ impl Executor { pub fn new_context(&self, executor_type: ExecutorType) -> IOxExecutionContext { let executor = self.executor(executor_type).clone(); - 
IOxExecutionContext::new(executor) + IOxExecutionContext::new(executor, self.config.clone()) } /// Return the execution pool of the specified type diff --git a/query/src/exec/context.rs b/query/src/exec/context.rs index 702e032bac..ffad4541c5 100644 --- a/query/src/exec/context.rs +++ b/query/src/exec/context.rs @@ -5,6 +5,7 @@ use std::{fmt, sync::Arc}; use arrow::record_batch::RecordBatch; use datafusion::{ + catalog::catalog::CatalogProvider, execution::context::{ExecutionContextState, QueryPlanner}, logical_plan::{LogicalPlan, UserDefinedLogicalNode}, physical_plan::{ @@ -105,6 +106,46 @@ impl ExtensionPlanner for IOxExtensionPlanner { } } +// Configuration for an IOx execution context +#[derive(Clone)] +pub struct IOxExecutionConfig { + /// Configuration options to pass to DataFusion + inner: ExecutionConfig, +} + +impl Default for IOxExecutionConfig { + fn default() -> Self { + const BATCH_SIZE: usize = 1000; + + // Setup default configuration + let inner = ExecutionConfig::new() + .with_batch_size(BATCH_SIZE) + .create_default_catalog_and_schema(true) + .with_information_schema(true) + .with_default_catalog_and_schema(DEFAULT_CATALOG, DEFAULT_SCHEMA) + .with_query_planner(Arc::new(IOxQueryPlanner {})); + + Self { inner } + } +} + +impl fmt::Debug for IOxExecutionConfig { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "IOxExecutionConfig ...") + } +} + +impl IOxExecutionConfig { + pub fn new() -> Self { + Default::default() + } + + /// Set execution concurrency + pub fn set_concurrency(&mut self, concurrency: usize) { + self.inner.concurrency = concurrency; + } +} + /// This is an execution context for planning in IOx. It wraps a /// DataFusion execution context with the information needed for planning. /// @@ -136,21 +177,8 @@ impl fmt::Debug for IOxExecutionContext { impl IOxExecutionContext { /// Create an ExecutionContext suitable for executing DataFusion plans - /// - /// The config is created with a default catalog and schema, but this - /// can be overridden at a later date - pub fn new(exec: DedicatedExecutor) -> Self { - const BATCH_SIZE: usize = 1000; - - // TBD: Should we be reusing an execution context across all executions? - let config = ExecutionConfig::new() - .with_batch_size(BATCH_SIZE) - .create_default_catalog_and_schema(true) - .with_information_schema(true) - .with_default_catalog_and_schema(DEFAULT_CATALOG, DEFAULT_SCHEMA) - .with_query_planner(Arc::new(IOxQueryPlanner {})); - - let inner = ExecutionContext::with_config(config); + pub fn new(exec: DedicatedExecutor, config: IOxExecutionConfig) -> Self { + let inner = ExecutionContext::with_config(config.inner); Self { inner, exec } } @@ -160,11 +188,13 @@ impl IOxExecutionContext { &self.inner } - /// returns a mutable reference to the inner datafusion execution context - pub fn inner_mut(&mut self) -> &mut ExecutionContext { - &mut self.inner + /// registers a catalog with the inner context + pub fn register_catalog(&mut self, name: impl Into, catalog: Arc) { + self.inner.register_catalog(name, catalog); } + /// + /// Prepare a SQL statement for execution. 
This assumes that any /// tables referenced in the SQL have been registered with this context pub fn prepare_sql(&mut self, sql: &str) -> Result> { diff --git a/query/src/frontend/sql.rs b/query/src/frontend/sql.rs index be737fd4d6..c08ad7c671 100644 --- a/query/src/frontend/sql.rs +++ b/query/src/frontend/sql.rs @@ -87,7 +87,7 @@ impl SqlQueryPlanner { executor: &Executor, ) -> Result> { let mut ctx = executor.new_context(ExecutorType::Query); - ctx.inner_mut().register_catalog(DEFAULT_CATALOG, database); + ctx.register_catalog(DEFAULT_CATALOG, database); ctx.prepare_sql(query).context(Preparing) } } diff --git a/query_tests/cases/in/duplicates.expected b/query_tests/cases/in/duplicates.expected index 97ba6e6001..d83e04b0dc 100644 --- a/query_tests/cases/in/duplicates.expected +++ b/query_tests/cases/in/duplicates.expected @@ -1,86 +1,87 @@ -- Test Setup: OneMeasurementThreeChunksWithDuplicates --- SQL: explain verbose select time, state, city, min_temp, max_temp, area from h2o order by time, state, city; -+-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Sort: #h2o.time ASC NULLS FIRST, #h2o.state ASC NULLS FIRST, #h2o.city ASC NULLS FIRST | -| | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area | -| | TableScan: h2o projection=None | -| logical_plan after projection_push_down | Sort: #h2o.time ASC NULLS FIRST, #h2o.state ASC NULLS FIRST, #h2o.city ASC NULLS FIRST | -| | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area | -| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) | -| logical_plan after simplify_expressions | Sort: #h2o.time ASC NULLS FIRST, #h2o.state ASC NULLS FIRST, #h2o.city ASC NULLS FIRST | -| | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area | -| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) | -| physical_plan | SortExec: [time@0 ASC,state@1 ASC,city@2 ASC] | -| | ProjectionExec: expr=[time@5 as time, state@4 as state, city@1 as city, min_temp@3 as min_temp, max_temp@2 as max_temp, area@0 as area] | -| | ExecutionPlan(PlaceHolder) | -| | DeduplicateExec: [state@4 ASC,city@1 ASC,time@5 ASC] | -| | SortPreservingMergeExec: [state@4 ASC,city@1 ASC,time@5 ASC] | -| | ExecutionPlan(PlaceHolder) | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -+-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+ --- SQL: explain verbose select time, state, city, min_temp, max_temp, area from h2o; -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | 
-+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area | -| | TableScan: h2o projection=None | -| logical_plan after projection_push_down | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area | -| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) | -| logical_plan after simplify_expressions | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area | -| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) | -| physical_plan | ProjectionExec: expr=[time@5 as time, state@4 as state, city@1 as city, min_temp@3 as min_temp, max_temp@2 as max_temp, area@0 as area] | -| | ExecutionPlan(PlaceHolder) | -| | DeduplicateExec: [state@4 ASC,city@1 ASC,time@5 ASC] | -| | SortPreservingMergeExec: [state@4 ASC,city@1 ASC,time@5 ASC] | -| | ExecutionPlan(PlaceHolder) | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE select state as name from h2o UNION ALL select city as name from h2o; -+-----------------------------------------+-------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+-------------------------------------------------------------------------------+ -| logical_plan | Union | -| | Projection: #h2o.state AS name | -| | TableScan: h2o projection=None | -| | Projection: #h2o.city AS name | -| | TableScan: h2o projection=None | -| logical_plan after projection_push_down | Union | -| | Projection: #h2o.state AS name | -| | TableScan: h2o projection=Some([4]) | -| | Projection: #h2o.city AS name | -| | TableScan: h2o projection=Some([1]) | -| logical_plan after simplify_expressions | Union | -| | Projection: #h2o.state AS name | -| | TableScan: h2o projection=Some([4]) | -| | Projection: #h2o.city AS name | -| | TableScan: h2o projection=Some([1]) | -| physical_plan | ExecutionPlan(PlaceHolder) | -| | ProjectionExec: expr=[state@0 as name] | -| | ExecutionPlan(PlaceHolder) | -| | ProjectionExec: expr=[state@1 as state] | -| | DeduplicateExec: [state@1 ASC,city@0 ASC,time@2 ASC] | -| | SortPreservingMergeExec: [state@1 ASC,city@0 ASC,time@2 ASC] | -| | ExecutionPlan(PlaceHolder) | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | ProjectionExec: expr=[city@0 as name] | -| | ExecutionPlan(PlaceHolder) | -| | ProjectionExec: expr=[city@0 as city] | -| | DeduplicateExec: [state@1 ASC,city@0 ASC,time@2 ASC] | -| | SortPreservingMergeExec: [state@1 ASC,city@0 ASC,time@2 ASC] | -| | ExecutionPlan(PlaceHolder) | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | 
IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | -+-----------------------------------------+-------------------------------------------------------------------------------+ +-- SQL: explain select time, state, city, min_temp, max_temp, area from h2o order by time, state, city; ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Sort: #h2o.time ASC NULLS FIRST, #h2o.state ASC NULLS FIRST, #h2o.city ASC NULLS FIRST | +| | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area | +| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) | +| physical_plan | SortExec: [time@0 ASC,state@1 ASC,city@2 ASC] | +| | CoalescePartitionsExec | +| | ProjectionExec: expr=[time@5 as time, state@4 as state, city@1 as city, min_temp@3 as min_temp, max_temp@2 as max_temp, area@0 as area] | +| | ExecutionPlan(PlaceHolder) | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | DeduplicateExec: [state@4 ASC,city@1 ASC,time@5 ASC] | +| | SortPreservingMergeExec: [state@4 ASC,city@1 ASC,time@5 ASC] | +| | ExecutionPlan(PlaceHolder) | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN select time, state, city, min_temp, max_temp, area from h2o; ++---------------+-----------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+-----------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area | +| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) | +| physical_plan | ProjectionExec: expr=[time@5 as time, state@4 as state, city@1 as city, min_temp@3 as min_temp, max_temp@2 as max_temp, area@0 as area] | +| | ExecutionPlan(PlaceHolder) | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | DeduplicateExec: [state@4 ASC,city@1 ASC,time@5 ASC] | +| | SortPreservingMergeExec: [state@4 ASC,city@1 ASC,time@5 ASC] | +| | ExecutionPlan(PlaceHolder) | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | 
+| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | ++---------------+-----------------------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN select state as name from h2o UNION ALL select city as name from h2o; ++---------------+-----------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+-----------------------------------------------------------------------------------+ +| logical_plan | Union | +| | Projection: #h2o.state AS name | +| | TableScan: h2o projection=Some([4]) | +| | Projection: #h2o.city AS name | +| | TableScan: h2o projection=Some([1]) | +| physical_plan | ExecutionPlan(PlaceHolder) | +| | ProjectionExec: expr=[state@0 as name] | +| | ExecutionPlan(PlaceHolder) | +| | ProjectionExec: expr=[state@1 as state] | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | DeduplicateExec: [state@1 ASC,city@0 ASC,time@2 ASC] | +| | SortPreservingMergeExec: [state@1 ASC,city@0 ASC,time@2 ASC] | +| | ExecutionPlan(PlaceHolder) | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | ProjectionExec: expr=[city@0 as name] | +| | ExecutionPlan(PlaceHolder) | +| | ProjectionExec: expr=[city@0 as city] | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | DeduplicateExec: [state@1 ASC,city@0 ASC,time@2 ASC] | +| | SortPreservingMergeExec: [state@1 ASC,city@0 ASC,time@2 ASC] | +| | ExecutionPlan(PlaceHolder) | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate | ++---------------+-----------------------------------------------------------------------------------+ diff --git a/query_tests/cases/in/duplicates.sql b/query_tests/cases/in/duplicates.sql index c9e0159199..5ef261c75e 100644 --- a/query_tests/cases/in/duplicates.sql +++ b/query_tests/cases/in/duplicates.sql @@ -2,11 +2,11 @@ -- IOX_SETUP: OneMeasurementThreeChunksWithDuplicates -- Plan with order by -explain verbose select time, state, city, min_temp, max_temp, area from h2o order by time, state, city; +explain select time, state, city, min_temp, max_temp, area from h2o order by time, state, city; -- plan without order by -explain verbose select time, state, city, min_temp, max_temp, area from h2o; +EXPLAIN select time, state, city, min_temp, max_temp, area from h2o; -- Union plan -EXPLAIN VERBOSE select state as name from h2o UNION ALL select city as name from h2o; +EXPLAIN select state as name from h2o UNION ALL select city as name from h2o; diff --git a/query_tests/cases/in/pushdown.expected b/query_tests/cases/in/pushdown.expected index 
7173930313..dd58342994 100644 --- a/query_tests/cases/in/pushdown.expected +++ b/query_tests/cases/in/pushdown.expected @@ -1,218 +1,167 @@ -- Test Setup: TwoMeasurementsPredicatePushDown --- SQL: EXPLAIN VERBOSE SELECT * from restaurant; -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200; -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: CAST(count@0 AS Int64) > 200 | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200.0; -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Float64(200) | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: 
#restaurant.count Gt Float64(200) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Float64(200) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: CAST(count@0 AS Float64) > 200 | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 4.0; -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(4) | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(4) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(4) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: system@1 > 4 | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury'; -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury | -| | IOxReadFilterNode: 
table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence'); -+-----------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000; -+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And 
#restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence AND CAST(count@0 AS Int64) < 40000 | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and count < 40000; -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(count@0 AS Int64) < 40000 | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 4.0 and system < 7.0; -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, 
#restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: system@1 > 4 AND system@1 < 7 | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and system < 7.0; -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: system@1 > 5 AND system@1 < 7 | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+---------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system; -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, 
#restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: system@1 > 5 AND CAST(town@3 AS Utf8) != tewsbury AND 7 > system@1 | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading'); -+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: system@1 > 5 AND tewsbury != CAST(town@3 AS Utf8) AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ --- SQL: EXPLAIN VERBOSE SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00'); 
-+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt totimestamp(Utf8("1970-01-01T00:00:00.000000130+00:00")) | -| | TableScan: restaurant projection=None | -| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt totimestamp(Utf8("1970-01-01T00:00:00.000000130+00:00")) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt totimestamp(Utf8("1970-01-01T00:00:00.000000130+00:00")) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | FilterExec: 5 < system@1 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading AND time@2 > totimestamp(1970-01-01T00:00:00.000000130+00:00) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant; ++---------------+---------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | 
++---------------+---------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where count > 200; ++---------------+---------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | ++---------------+---------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where count > 200.0; ++---------------+---------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Float64(200) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Float64) > 200 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | ++---------------+---------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where system > 4.0; ++---------------+---------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(4) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 4 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | ++---------------+---------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury'; ++---------------+---------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq 
Utf8("tewsbury") | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | ++---------------+---------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence'); ++---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | ++---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000; ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence AND CAST(count@0 AS Int64) < 40000 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | 
++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where count > 200 and count < 40000; ++---------------+---------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(count@0 AS Int64) < 40000 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | ++---------------+---------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where system > 4.0 and system < 7.0; ++---------------+---------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 4 AND system@1 < 7 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | ++---------------+---------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where system > 5.0 and system < 7.0; ++---------------+---------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 5 AND system@1 < 7 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | ++---------------+---------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system; 
++---------------+-----------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+-----------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 5 AND CAST(town@3 AS Utf8) != tewsbury AND 7 > system@1 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | ++---------------+-----------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading'); ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 5 AND tewsbury != CAST(town@3 AS Utf8) AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +-- SQL: EXPLAIN SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00'); ++---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | 
++---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt totimestamp(Utf8("1970-01-01T00:00:00.000000130+00:00")) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: 5 < system@1 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading AND time@2 > totimestamp(1970-01-01T00:00:00.000000130+00:00) | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | ++---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ diff --git a/query_tests/cases/in/pushdown.sql b/query_tests/cases/in/pushdown.sql index 6ccf718c32..e97c34ef85 100644 --- a/query_tests/cases/in/pushdown.sql +++ b/query_tests/cases/in/pushdown.sql @@ -2,44 +2,44 @@ -- IOX_SETUP: TwoMeasurementsPredicatePushDown -- Test 1: Select everything -EXPLAIN VERBOSE SELECT * from restaurant; +EXPLAIN SELECT * from restaurant; -- Test 2: One push-down expression: count > 200 -- TODO: Make push-down predicates shown in explain verbose. 
Ticket #1538 -EXPLAIN VERBOSE SELECT * from restaurant where count > 200; +EXPLAIN SELECT * from restaurant where count > 200; -- Test 2.2: One push-down expression: count > 200.0 -EXPLAIN VERBOSE SELECT * from restaurant where count > 200.0; +EXPLAIN SELECT * from restaurant where count > 200.0; -- Test 2.3: One push-down expression: system > 4.0 -EXPLAIN VERBOSE SELECT * from restaurant where system > 4.0; +EXPLAIN SELECT * from restaurant where system > 4.0; -- Test 3: Two push-down expression: count > 200 and town != 'tewsbury' -EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury'; +EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury'; -- Test 4: Still two push-down expression: count > 200 and town != 'tewsbury' -- even though the results are different -EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence'); +EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence'); -- Test 5: three push-down expression: count > 200 and town != 'tewsbury' and count < 40000 -EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000; +EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000; -- Test 6: two push-down expression: count > 200 and count < 40000 -EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and count < 40000; +EXPLAIN SELECT * from restaurant where count > 200 and count < 40000; -- Test 7: two push-down expression on float: system > 4.0 and system < 7.0 -EXPLAIN VERBOSE SELECT * from restaurant where system > 4.0 and system < 7.0; +EXPLAIN SELECT * from restaurant where system > 4.0 and system < 7.0; -- Test 8: two push-down expression on float: system > 5.0 and system < 7.0 -EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and system < 7.0; +EXPLAIN SELECT * from restaurant where system > 5.0 and system < 7.0; -- Test 9: three push-down expression: system > 5.0 and town != 'tewsbury' and system < 7.0 -EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system; +EXPLAIN SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system; -- Test 10: three push-down expression: system > 5.0 and town != 'tewsbury' and system < 7.0 -EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading'); +EXPLAIN SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading'); -- Test 11: four push-down expression: system > 5.0 and town != 'tewsbury' and system < 7.0 and -- time > to_timestamp('1970-01-01T00:00:00.000000120+00:00') rewritten to time GT INT(130) -EXPLAIN VERBOSE SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00'); +EXPLAIN SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00'); diff --git a/query_tests/src/runner.rs b/query_tests/src/runner.rs index 6edf05ce56..7abbf40403 100644 --- a/query_tests/src/runner.rs +++ b/query_tests/src/runner.rs @@ -4,12 +4,16 @@ mod parse; mod setup; use 
arrow::record_batch::RecordBatch; -use query::{exec::ExecutorType, frontend::sql::SqlQueryPlanner}; +use query::{ + exec::{Executor, ExecutorType}, + frontend::sql::SqlQueryPlanner, +}; use snafu::{OptionExt, ResultExt, Snafu}; use std::{ io::LineWriter, io::Write, path::{Path, PathBuf}, + sync::Arc, }; use self::{parse::TestQueries, setup::TestSetup}; @@ -261,7 +265,13 @@ impl Runner { writeln!(self.log, "Running scenario '{}'", scenario_name)?; writeln!(self.log, "SQL: '{:#?}'", sql)?; let planner = SqlQueryPlanner::default(); - let executor = db.executor(); + let num_threads = 1; + let mut executor = Executor::new(num_threads); + + // hardcode concurrency in tests as by default is is the + // number of cores, which varies across machines + executor.config_mut().set_concurrency(4); + let executor = Arc::new(executor); let physical_plan = planner .query(db, &sql, executor.as_ref()) From 45ff5e214c8df63c3aeabab994cdbe554c0dbe56 Mon Sep 17 00:00:00 2001 From: Marko Mikulicic Date: Mon, 19 Jul 2021 18:41:11 +0200 Subject: [PATCH 03/27] feat(iox): Make max http request size configurable --- src/commands/run.rs | 8 +++++ src/influxdb_ioxd.rs | 10 ++++++- src/influxdb_ioxd/http.rs | 63 ++++++++++++++++++++++++++++----------- 3 files changed, 63 insertions(+), 18 deletions(-) diff --git a/src/commands/run.rs b/src/commands/run.rs index cab9cadb4d..878d98af97 100644 --- a/src/commands/run.rs +++ b/src/commands/run.rs @@ -231,6 +231,14 @@ Possible values (case insensitive): default_value = "serving" )] pub initial_serving_state: ServingReadinessState, + + /// Maximum size of HTTP requests. + #[structopt( + long = "--max-http-request-size", + env = "INFLUXDB_IOX_MAX_HTTP_REQUEST_SIZE", + default_value = "10485760" // 10 MiB + )] + pub max_http_request_size: usize, } pub async fn command(config: Config) -> Result<()> { diff --git a/src/influxdb_ioxd.rs b/src/influxdb_ioxd.rs index 696ad257cb..c871bc30c5 100644 --- a/src/influxdb_ioxd.rs +++ b/src/influxdb_ioxd.rs @@ -195,7 +195,15 @@ pub async fn main(config: Config) -> Result<()> { let bind_addr = config.http_bind_address; let addr = AddrIncoming::bind(&bind_addr).context(StartListeningHttp { bind_addr })?; - let http_server = http::serve(addr, Arc::clone(&app_server), frontend_shutdown.clone()).fuse(); + let max_http_request_size = config.max_http_request_size; + + let http_server = http::serve( + addr, + Arc::clone(&app_server), + frontend_shutdown.clone(), + max_http_request_size, + ) + .fuse(); info!(bind_address=?bind_addr, "HTTP server listening"); info!(git_hash, "InfluxDB IOx server ready"); diff --git a/src/influxdb_ioxd/http.rs b/src/influxdb_ioxd/http.rs index f8dab0bd8c..818ba09e74 100644 --- a/src/influxdb_ioxd/http.rs +++ b/src/influxdb_ioxd/http.rs @@ -342,12 +342,26 @@ impl ApplicationError { } } -const MAX_SIZE: usize = 10_485_760; // max write request size of 10MB - -fn router(server: Arc>) -> Router +struct Server where M: ConnectionManager + Send + Sync + Debug + 'static, { + app_server: Arc>, + max_request_size: usize, +} + +fn router( + app_server: Arc>, + max_request_size: usize, +) -> Router +where + M: ConnectionManager + Send + Sync + Debug + 'static, +{ + let server = Server { + app_server, + max_request_size, + }; + // Create a router and specify the the handlers. Router::builder() .data(server) @@ -408,7 +422,7 @@ struct WriteInfo { /// Parse the request's body into raw bytes, applying size limits and /// content encoding as needed. 
-async fn parse_body(req: hyper::Request) -> Result { +async fn parse_body(req: hyper::Request, max_size: usize) -> Result { // clippy says the const needs to be assigned to a local variable: // error: a `const` item with interior mutability should not be borrowed let header_name = CONTENT_ENCODING; @@ -431,9 +445,9 @@ async fn parse_body(req: hyper::Request) -> Result MAX_SIZE { + if (body.len() + chunk.len()) > max_size { return Err(ApplicationError::RequestSizeExceeded { - max_body_size: MAX_SIZE, + max_body_size: max_size, }); } body.extend_from_slice(&chunk); @@ -445,9 +459,9 @@ async fn parse_body(req: hyper::Request) -> Result>>().expect("server state")); + let Server { + app_server: server, + max_request_size, + } = req.data::>().expect("server state"); + let max_request_size = *max_request_size; + let server = Arc::clone(&server); // TODO(edd): figure out best way of catching all errors in this observation. let obs = server.metrics.http_requests.observation(); // instrument request @@ -481,7 +500,7 @@ where let db_name = org_and_bucket_to_database(&write_info.org, &write_info.bucket) .context(BucketMappingError)?; - let body = parse_body(req).await?; + let body = parse_body(req, max_request_size).await?; let body = str::from_utf8(&body).context(ReadingBodyAsUtf8)?; @@ -595,7 +614,7 @@ async fn query( req: Request, ) -> Result, ApplicationError> { let path = req.uri().path().to_string(); - let server = Arc::clone(&req.data::>>().expect("server state")); + let server = Arc::clone(&req.data::>().expect("server state").app_server); // TODO(edd): figure out best way of catching all errors in this observation. let obs = server.metrics.http_requests.observation(); // instrument request @@ -661,7 +680,7 @@ async fn query( async fn health( req: Request, ) -> Result, ApplicationError> { - let server = Arc::clone(&req.data::>>().expect("server state")); + let server = Arc::clone(&req.data::>().expect("server state").app_server); let path = req.uri().path().to_string(); server .metrics @@ -677,7 +696,7 @@ async fn health( async fn handle_metrics( req: Request, ) -> Result, ApplicationError> { - let server = Arc::clone(&req.data::>>().expect("server state")); + let server = Arc::clone(&req.data::>().expect("server state").app_server); let path = req.uri().path().to_string(); server .metrics @@ -700,7 +719,7 @@ async fn list_partitions( ) -> Result, ApplicationError> { let path = req.uri().path().to_string(); - let server = Arc::clone(&req.data::>>().expect("server state")); + let server = Arc::clone(&req.data::>().expect("server state").app_server); // TODO - catch error conditions let obs = server.metrics.http_requests.observation(); @@ -841,11 +860,12 @@ pub async fn serve( addr: AddrIncoming, server: Arc>, shutdown: CancellationToken, + max_request_size: usize, ) -> Result<(), hyper::Error> where M: ConnectionManager + Send + Sync + Debug + 'static, { - let router = router(server); + let router = router(server, max_request_size); let service = RouterService::new(router).unwrap(); hyper::Server::builder(addr) @@ -1234,6 +1254,8 @@ mod tests { .await; } + const TEST_MAX_REQUEST_SIZE: usize = 1024 * 1024; + #[tokio::test] async fn client_hangup_during_parse() { #[derive(Debug, Snafu)] @@ -1253,7 +1275,9 @@ mod tests { .body(body) .unwrap(); - let parse_result = parse_body(request).await.unwrap_err(); + let parse_result = parse_body(request, TEST_MAX_REQUEST_SIZE) + .await + .unwrap_err(); assert_eq!( parse_result.to_string(), "Client hung up while sending body: error reading a body from 
connection: Blarg Error" @@ -1334,7 +1358,12 @@ mod tests { let addr = AddrIncoming::bind(&bind_addr).expect("failed to bind server"); let server_url = format!("http://{}", addr.local_addr()); - tokio::task::spawn(serve(addr, server, CancellationToken::new())); + tokio::task::spawn(serve( + addr, + server, + CancellationToken::new(), + TEST_MAX_REQUEST_SIZE, + )); println!("Started server at {}", server_url); server_url } From 38f4eec20e73050d8013a5c50258caf5fa81b106 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Mon, 19 Jul 2021 19:04:58 +0200 Subject: [PATCH 04/27] feat: implement `seek` for write buffer This is required to control replay ranges. --- write_buffer/src/core.rs | 70 ++++++++++++++++++++++++++--- write_buffer/src/kafka.rs | 54 +++++++++++++++++----- write_buffer/src/mock.rs | 94 ++++++++++++++++++++++++++++----------- 3 files changed, 176 insertions(+), 42 deletions(-) diff --git a/write_buffer/src/core.rs b/write_buffer/src/core.rs index f747fde37b..27c7512884 100644 --- a/write_buffer/src/core.rs +++ b/write_buffer/src/core.rs @@ -25,12 +25,18 @@ pub trait WriteBufferWriting: Sync + Send + std::fmt::Debug + 'static { pub type EntryStream<'a> = BoxStream<'a, Result>; /// Produce streams (one per sequencer) of [`SequencedEntry`]s. +#[async_trait] pub trait WriteBufferReading: Sync + Send + std::fmt::Debug + 'static { /// Returns a stream per sequencer. - fn streams<'life0, 'async_trait>(&'life0 self) -> Vec<(u32, EntryStream<'async_trait>)> - where - 'life0: 'async_trait, - Self: 'async_trait; + /// + /// Calling this method multiple times returns multiple streams that share the same state, i.e. entries for a + /// specific sequencer will only be deliver on on of the streams (likely the first that is polled). If you need + /// independent streams, create multiple [`WriteBufferReading`] objects. + fn streams(&self) -> Vec<(u32, EntryStream<'_>)>; + + /// Seek given sequencer to given sequence number. The next output of related streams will be an entry with at least + /// the given sequence number (the actual sequence number might be skipped due to "holes" in the stream). 
+ async fn seek(&self, sequencer_id: u32, sequence_number: u64) -> Result<(), WriteBufferError>; } pub mod test_utils { @@ -65,6 +71,7 @@ pub mod test_utils { test_multi_stream_io(&adapter).await; test_multi_sequencer_io(&adapter).await; test_multi_writer_multi_reader(&adapter).await; + test_seek(&adapter).await; } async fn test_single_stream_io(adapter: &T) @@ -213,18 +220,67 @@ pub mod test_utils { writer_2.store_entry(&entry_east_2, 0).await.unwrap(); assert_reader_content( - reader_1, + &reader_1, &[(0, &[&entry_east_1, &entry_east_2]), (1, &[&entry_west_1])], ) .await; assert_reader_content( - reader_2, + &reader_2, &[(0, &[&entry_east_1, &entry_east_2]), (1, &[&entry_west_1])], ) .await; } - async fn assert_reader_content(reader: R, expected: &[(u32, &[&Entry])]) + async fn test_seek(adapter: &T) + where + T: TestAdapter, + { + let context = adapter.new_context(2).await; + + let waker = futures::task::noop_waker(); + let mut cx = futures::task::Context::from_waker(&waker); + + let entry_east_1 = lp_to_entry("upc,region=east user=1 100"); + let entry_east_2 = lp_to_entry("upc,region=east user=2 200"); + let entry_east_3 = lp_to_entry("upc,region=east user=3 300"); + let entry_west_1 = lp_to_entry("upc,region=west user=1 200"); + + let writer = context.writing(); + let _sequence_number_east_1 = writer.store_entry(&entry_east_1, 0).await.unwrap().number; + let sequence_number_east_2 = writer.store_entry(&entry_east_2, 0).await.unwrap().number; + let _sequence_number_west_1 = writer.store_entry(&entry_west_1, 1).await.unwrap().number; + + let reader_1 = context.reading().await; + let reader_2 = context.reading().await; + + // forward seek + reader_1.seek(0, sequence_number_east_2).await.unwrap(); + assert_reader_content(&reader_1, &[(0, &[&entry_east_2]), (1, &[&entry_west_1])]).await; + assert_reader_content( + &reader_2, + &[(0, &[&entry_east_1, &entry_east_2]), (1, &[&entry_west_1])], + ) + .await; + + // backward seek + reader_1.seek(0, 0).await.unwrap(); + assert_reader_content(&reader_1, &[(0, &[&entry_east_1, &entry_east_2]), (1, &[])]).await; + + // seek to far end and then at data + reader_1.seek(0, 1_000_000).await.unwrap(); + let _sequence_number_east_3 = writer.store_entry(&entry_east_3, 0).await.unwrap().number; + let mut streams = reader_1.streams(); + assert_eq!(streams.len(), 2); + let (_sequencer_id, mut stream_1) = streams.pop().unwrap(); + let (_sequencer_id, mut stream_2) = streams.pop().unwrap(); + assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); + assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); + + // seeking unknown sequencer is NOT an error + reader_1.seek(0, 42).await.unwrap(); + } + + async fn assert_reader_content(reader: &R, expected: &[(u32, &[&Entry])]) where R: WriteBufferReading, { diff --git a/write_buffer/src/kafka.rs b/write_buffer/src/kafka.rs index 15a27a401c..0624a06d18 100644 --- a/write_buffer/src/kafka.rs +++ b/write_buffer/src/kafka.rs @@ -1,5 +1,7 @@ use std::{ + collections::BTreeMap, convert::{TryFrom, TryInto}, + sync::Arc, time::Duration, }; @@ -13,7 +15,7 @@ use rdkafka::{ error::KafkaError, producer::{FutureProducer, FutureRecord}, util::Timeout, - ClientConfig, Message, TopicPartitionList, + ClientConfig, Message, Offset, TopicPartitionList, }; use crate::core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting}; @@ -94,7 +96,7 @@ impl KafkaBufferProducer { pub struct KafkaBufferConsumer { conn: String, database_name: String, - consumers: Vec<(u32, StreamConsumer)>, + consumers: BTreeMap>, } // Needed 
because rdkafka's StreamConsumer doesn't impl Debug @@ -107,12 +109,9 @@ impl std::fmt::Debug for KafkaBufferConsumer { } } +#[async_trait] impl WriteBufferReading for KafkaBufferConsumer { - fn streams<'life0, 'async_trait>(&'life0 self) -> Vec<(u32, EntryStream<'async_trait>)> - where - 'life0: 'async_trait, - Self: 'async_trait, - { + fn streams(&self) -> Vec<(u32, EntryStream<'_>)> { self.consumers .iter() .map(|(sequencer_id, consumer)| { @@ -133,6 +132,31 @@ impl WriteBufferReading for KafkaBufferConsumer { }) .collect() } + + async fn seek(&self, sequencer_id: u32, sequence_number: u64) -> Result<(), WriteBufferError> { + if let Some(consumer) = self.consumers.get(&sequencer_id) { + let consumer = Arc::clone(consumer); + let database_name = self.database_name.clone(); + let offset = if sequence_number > 0 { + Offset::Offset(sequence_number as i64) + } else { + Offset::Beginning + }; + + tokio::task::spawn_blocking(move || { + consumer.seek( + &database_name, + sequencer_id as i32, + offset, + Duration::from_secs(60), + ) + }) + .await + .expect("subtask failed")?; + } + + Ok(()) + } } impl KafkaBufferConsumer { @@ -169,11 +193,21 @@ impl KafkaBufferConsumer { let mut assignment = TopicPartitionList::new(); assignment.add_partition(&database_name, partition as i32); - consumer.assign(&assignment)?; - Ok((partition, consumer)) + // We must set the offset to `Beginning` here to avoid the following error during seek: + // KafkaError (Seek error: Local: Erroneous state) + // + // Also see: + // - https://github.com/Blizzard/node-rdkafka/issues/237 + // - https://github.com/confluentinc/confluent-kafka-go/issues/121#issuecomment-362308376 + assignment + .set_partition_offset(&database_name, partition as i32, Offset::Beginning) + .expect("partition was set just before"); + + consumer.assign(&assignment)?; + Ok((partition, Arc::new(consumer))) }) - .collect::, KafkaError>>()?; + .collect::>, KafkaError>>()?; Ok(Self { conn, diff --git a/write_buffer/src/mock.rs b/write_buffer/src/mock.rs index fc15ca2534..cb4322199a 100644 --- a/write_buffer/src/mock.rs +++ b/write_buffer/src/mock.rs @@ -153,21 +153,38 @@ impl WriteBufferWriting for MockBufferForWritingThatAlwaysErrors { } } +/// Sequencer-specific playback state +struct PlaybackState { + /// Index within the entry vector. + vector_index: usize, + + /// Offset within the sequencer IDs. 
+ offset: u64, +} + pub struct MockBufferForReading { - state: MockBufferSharedState, - positions: Arc>>, + shared_state: MockBufferSharedState, + playback_states: Arc>>, } impl MockBufferForReading { pub fn new(state: MockBufferSharedState) -> Self { let n_sequencers = state.entries.lock().len() as u32; - let positions: BTreeMap<_, _> = (0..n_sequencers) - .map(|sequencer_id| (sequencer_id, 0)) + let playback_states: BTreeMap<_, _> = (0..n_sequencers) + .map(|sequencer_id| { + ( + sequencer_id, + PlaybackState { + vector_index: 0, + offset: 0, + }, + ) + }) .collect(); Self { - state, - positions: Arc::new(Mutex::new(positions)), + shared_state: state, + playback_states: Arc::new(Mutex::new(playback_states)), } } } @@ -178,38 +195,52 @@ impl std::fmt::Debug for MockBufferForReading { } } +#[async_trait] impl WriteBufferReading for MockBufferForReading { - fn streams<'life0, 'async_trait>(&'life0 self) -> Vec<(u32, EntryStream<'async_trait>)> - where - 'life0: 'async_trait, - Self: 'async_trait, - { + fn streams(&self) -> Vec<(u32, EntryStream<'_>)> { let sequencer_ids: Vec<_> = { - let positions = self.positions.lock(); - positions.keys().copied().collect() + let playback_states = self.playback_states.lock(); + playback_states.keys().copied().collect() }; let mut streams = vec![]; for sequencer_id in sequencer_ids { - let state = self.state.clone(); - let positions = Arc::clone(&self.positions); + let shared_state = self.shared_state.clone(); + let playback_states = Arc::clone(&self.playback_states); let stream = stream::poll_fn(move |_ctx| { - let entries = state.entries.lock(); - let mut positions = positions.lock(); + let entries = shared_state.entries.lock(); + let mut playback_states = playback_states.lock(); let entry_vec = entries.get(&sequencer_id).unwrap(); - let position = positions.get_mut(&sequencer_id).unwrap(); + let playback_state = playback_states.get_mut(&sequencer_id).unwrap(); - if entry_vec.len() > *position { - let entry = match &entry_vec[*position] { - Ok(entry) => Ok(entry.clone()), - Err(e) => Err(e.to_string().into()), - }; - *position += 1; - return Poll::Ready(Some(entry)); + while entry_vec.len() > playback_state.vector_index { + let entry_result = &entry_vec[playback_state.vector_index]; + + // consume entry + playback_state.vector_index += 1; + + match entry_result { + Ok(entry) => { + // found an entry => need to check if it is within the offset + let sequence = entry.sequence().unwrap(); + if sequence.number >= playback_state.offset { + // within offset => return entry to caller + return Poll::Ready(Some(Ok(entry.clone()))); + } else { + // offset is larger then the current entry => ignore entry and try next + continue; + } + } + Err(e) => { + // found an error => return entry to caller + return Poll::Ready(Some(Err(e.to_string().into()))); + } + } } + // we are at the end of the recorded entries => report pending Poll::Pending }) .boxed(); @@ -218,6 +249,19 @@ impl WriteBufferReading for MockBufferForReading { streams } + + async fn seek(&self, sequencer_id: u32, sequence_number: u64) -> Result<(), WriteBufferError> { + let mut playback_states = self.playback_states.lock(); + + if let Some(playback_state) = playback_states.get_mut(&sequencer_id) { + playback_state.offset = sequence_number; + + // reset position to start since seeking might go backwards + playback_state.vector_index = 0; + } + + Ok(()) + } } #[cfg(test)] From 8e5d5928cf700201fe9b0f17b506ebc6dc1b47ac Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies 
<1781103+tustvold@users.noreply.github.com> Date: Tue, 20 Jul 2021 09:46:52 +0100 Subject: [PATCH 05/27] feat: compute WriteSummary from PersistenceWindows (#2030) (#2054) * feat: compute WriteSummary from PersistenceWindows (#2030) * chore: review feedback Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- data_types/src/lib.rs | 1 + data_types/src/write_summary.rs | 20 +++ .../src/persistence_windows.rs | 148 +++++++++++++++++- 3 files changed, 164 insertions(+), 5 deletions(-) create mode 100644 data_types/src/write_summary.rs diff --git a/data_types/src/lib.rs b/data_types/src/lib.rs index bea9629bc3..76d7ca0306 100644 --- a/data_types/src/lib.rs +++ b/data_types/src/lib.rs @@ -22,3 +22,4 @@ pub mod names; pub mod partition_metadata; pub mod server_id; pub mod timestamp; +pub mod write_summary; diff --git a/data_types/src/write_summary.rs b/data_types/src/write_summary.rs new file mode 100644 index 0000000000..9574910262 --- /dev/null +++ b/data_types/src/write_summary.rs @@ -0,0 +1,20 @@ +use chrono::{DateTime, Utc}; + +/// A description of a set of writes +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct WriteSummary { + /// The wall clock timestamp of the last write in this summary + pub time_of_first_write: DateTime, + + /// The wall clock timestamp of the last write in this summary + pub time_of_last_write: DateTime, + + /// The minimum row timestamp for data in this summary + pub min_timestamp: DateTime, + + /// The maximum row timestamp value for data in this summary + pub max_timestamp: DateTime, + + /// The number of rows in this summary + pub row_count: usize, +} diff --git a/persistence_windows/src/persistence_windows.rs b/persistence_windows/src/persistence_windows.rs index 8dd287f5ba..1b5ae73d29 100644 --- a/persistence_windows/src/persistence_windows.rs +++ b/persistence_windows/src/persistence_windows.rs @@ -7,7 +7,7 @@ use std::{ use chrono::{DateTime, TimeZone, Utc}; -use data_types::partition_metadata::PartitionAddr; +use data_types::{partition_metadata::PartitionAddr, write_summary::WriteSummary}; use entry::Sequence; use internal_types::guard::{ReadGuard, ReadLock}; @@ -45,6 +45,16 @@ pub struct PersistenceWindows { late_arrival_period: Duration, closed_window_period: Duration, + /// The datetime this PersistenceWindows was created + /// + /// `PersistenceWindows` internally uses monotonic `Instant`, however, + /// these cannot be rendered. 
To provide a stable rendering of Wall timestamp, + /// a single timestamp is recorded at creation time + created_at_time: DateTime, + + /// The instant this PersistenceWindows was created + created_at_instant: Instant, + /// The last instant passed to PersistenceWindows::add_range last_instant: Instant, @@ -106,6 +116,9 @@ impl PersistenceWindows { let closed_window_count = late_arrival_seconds / closed_window_seconds; + let created_at_time = Utc::now(); + let created_at_instant = Instant::now(); + Self { persistable: ReadLock::new(None), closed: VecDeque::with_capacity(closed_window_count as usize), @@ -113,7 +126,9 @@ impl PersistenceWindows { addr, late_arrival_period, closed_window_period, - last_instant: Instant::now(), + created_at_time, + created_at_instant, + last_instant: created_at_instant, max_sequence_numbers: Default::default(), } } @@ -165,7 +180,7 @@ impl PersistenceWindows { self.rotate(received_at); match self.open.as_mut() { - Some(w) => w.add_range(sequence, row_count, min_time, max_time), + Some(w) => w.add_range(sequence, row_count, min_time, max_time, received_at), None => { self.open = Some(Window::new( received_at, @@ -335,6 +350,34 @@ impl PersistenceWindows { self.windows().next() } + /// Returns approximate summaries of the unpersisted writes contained + /// recorded by this PersistenceWindow instance + /// + /// These are approximate because persistence may partially flush a window, which will + /// update the min row timestamp but not the row count + pub fn summaries(&self) -> impl Iterator + '_ { + self.windows().map(move |window| { + let window_age = chrono::Duration::from_std( + window.created_at.duration_since(self.created_at_instant), + ) + .expect("duration overflow"); + + let time_of_first_write = self.created_at_time + window_age; + + let window_duration = + chrono::Duration::from_std(window.last_instant.duration_since(window.created_at)) + .expect("duration overflow"); + + WriteSummary { + time_of_first_write, + time_of_last_write: time_of_first_write + window_duration, + min_timestamp: window.min_time, + max_timestamp: window.max_time, + row_count: window.row_count, + } + }) + } + /// Returns true if this PersistenceWindows instance is empty pub fn is_empty(&self) -> bool { self.minimum_window().is_none() @@ -374,9 +417,14 @@ struct Window { /// The server time when this window was created. Used to determine how long data in this /// window has been sitting in memory. 
created_at: Instant, + /// The server time of the last write to this window + last_instant: Instant, + /// The number of rows in the window row_count: usize, - min_time: DateTime, // min time value for data in the window - max_time: DateTime, // max time value for data in the window + /// min time value for data in the window + min_time: DateTime, + /// max time value for data in the window + max_time: DateTime, /// maps sequencer_id to the minimum and maximum sequence numbers seen sequencer_numbers: BTreeMap, } @@ -399,6 +447,7 @@ impl Window { Self { created_at, + last_instant: created_at, row_count, min_time, max_time, @@ -414,7 +463,11 @@ impl Window { row_count: usize, min_time: DateTime, max_time: DateTime, + instant: Instant, ) { + assert!(self.created_at <= instant); + self.last_instant = instant; + self.row_count += row_count; if self.min_time > min_time { self.min_time = min_time; @@ -1265,4 +1318,89 @@ mod tests { assert_eq!(w.closed[1].max_time, start + chrono::Duration::seconds(2)); assert_eq!(w.closed[1].row_count, 11); } + + #[test] + fn test_summaries() { + let mut w = make_windows(Duration::from_secs(100)); + let instant = w.created_at_instant; + + // Window 1 + w.add_range( + Some(&Sequence { id: 1, number: 1 }), + 11, + Utc.timestamp_nanos(10), + Utc.timestamp_nanos(11), + instant + Duration::from_millis(1), + ); + + w.add_range( + Some(&Sequence { id: 1, number: 2 }), + 4, + Utc.timestamp_nanos(10), + Utc.timestamp_nanos(340), + instant + Duration::from_millis(30), + ); + + w.add_range( + Some(&Sequence { id: 1, number: 3 }), + 6, + Utc.timestamp_nanos(1), + Utc.timestamp_nanos(5), + instant + Duration::from_millis(50), + ); + + // More than DEFAULT_CLOSED_WINDOW_PERIOD after start of Window 1 => Window 2 + w.add_range( + Some(&Sequence { id: 1, number: 4 }), + 3, + Utc.timestamp_nanos(89), + Utc.timestamp_nanos(90), + instant + DEFAULT_CLOSED_WINDOW_PERIOD + Duration::from_millis(1), + ); + + // More than DEFAULT_CLOSED_WINDOW_PERIOD after start of Window 2 => Window 3 + w.add_range( + Some(&Sequence { id: 1, number: 5 }), + 8, + Utc.timestamp_nanos(3), + Utc.timestamp_nanos(4), + instant + DEFAULT_CLOSED_WINDOW_PERIOD * 3, + ); + + let closed_duration = chrono::Duration::from_std(DEFAULT_CLOSED_WINDOW_PERIOD).unwrap(); + + let summaries: Vec<_> = w.summaries().collect(); + + assert_eq!(summaries.len(), 3); + assert_eq!( + summaries, + vec![ + WriteSummary { + time_of_first_write: w.created_at_time + chrono::Duration::milliseconds(1), + time_of_last_write: w.created_at_time + chrono::Duration::milliseconds(50), + min_timestamp: Utc.timestamp_nanos(1), + max_timestamp: Utc.timestamp_nanos(340), + row_count: 21 + }, + WriteSummary { + time_of_first_write: w.created_at_time + + closed_duration + + chrono::Duration::milliseconds(1), + time_of_last_write: w.created_at_time + + closed_duration + + chrono::Duration::milliseconds(1), + min_timestamp: Utc.timestamp_nanos(89), + max_timestamp: Utc.timestamp_nanos(90), + row_count: 3 + }, + WriteSummary { + time_of_first_write: w.created_at_time + closed_duration * 3, + time_of_last_write: w.created_at_time + closed_duration * 3, + min_timestamp: Utc.timestamp_nanos(3), + max_timestamp: Utc.timestamp_nanos(4), + row_count: 8 + }, + ] + ) + } } From 2c20528c6926f16b5c78f0fcac652a2416326631 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 20 Jul 2021 04:53:46 -0400 Subject: [PATCH 06/27] chore: use upstream versions of some workarounds (#2057) * chore: use upstream versions of some workarounds * docs: update docstring 
Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- internal_types/src/schema.rs | 2 +- internal_types/src/schema/sort.rs | 19 +------------------ query/src/frontend/reorg.rs | 3 ++- query/src/provider/deduplicate/algo.rs | 11 +---------- 4 files changed, 5 insertions(+), 30 deletions(-) diff --git a/internal_types/src/schema.rs b/internal_types/src/schema.rs index 4427540576..9ac26a8731 100644 --- a/internal_types/src/schema.rs +++ b/internal_types/src/schema.rs @@ -786,12 +786,12 @@ macro_rules! assert_column_eq { #[cfg(test)] mod test { + use arrow::compute::SortOptions; use InfluxColumnType::*; use InfluxFieldType::*; use super::{builder::SchemaBuilder, *}; use crate::schema::merge::SchemaMerger; - use crate::schema::sort::SortOptions; fn make_field( name: &str, diff --git a/internal_types/src/schema/sort.rs b/internal_types/src/schema/sort.rs index a56fd0a495..0612b11dbc 100644 --- a/internal_types/src/schema/sort.rs +++ b/internal_types/src/schema/sort.rs @@ -1,5 +1,6 @@ use std::{fmt::Display, str::FromStr}; +use arrow::compute::SortOptions; use indexmap::{map::Iter, IndexMap}; use itertools::Itertools; use snafu::Snafu; @@ -23,24 +24,6 @@ pub enum Error { pub type Result = std::result::Result; -/// Temporary - -#[derive(Debug, Clone, Copy, Eq, PartialEq)] -pub struct SortOptions { - /// Whether to sort in descending order - pub descending: bool, - /// Whether to sort nulls first - pub nulls_first: bool, -} - -impl Default for SortOptions { - fn default() -> Self { - Self { - descending: false, - nulls_first: true, - } - } -} - #[derive(Debug, Clone, Copy, Eq, PartialEq)] pub struct ColumnSort { /// Position of this column in the sort key diff --git a/query/src/frontend/reorg.rs b/query/src/frontend/reorg.rs index a1c2df8599..43e1c824e7 100644 --- a/query/src/frontend/reorg.rs +++ b/query/src/frontend/reorg.rs @@ -268,8 +268,9 @@ struct ScanPlan { #[cfg(test)] mod test { + use arrow::compute::SortOptions; use arrow_util::assert_batches_eq; - use internal_types::schema::{merge::SchemaMerger, sort::SortOptions}; + use internal_types::schema::merge::SchemaMerger; use crate::{ exec::{Executor, ExecutorType}, diff --git a/query/src/provider/deduplicate/algo.rs b/query/src/provider/deduplicate/algo.rs index 6cc7ecb77e..df14aebfa9 100644 --- a/query/src/provider/deduplicate/algo.rs +++ b/query/src/provider/deduplicate/algo.rs @@ -339,21 +339,12 @@ impl RecordBatchDeduplicator { } /// Create a new record batch from offset --> len - /// - /// for adding this upstream fn slice_record_batch( batch: &RecordBatch, offset: usize, len: usize, ) -> ArrowResult { - let schema = batch.schema(); - let new_columns: Vec<_> = batch - .columns() - .iter() - .map(|old_column| old_column.slice(offset, len)) - .collect(); - - let batch = RecordBatch::try_new(schema, new_columns)?; + let batch = batch.slice(offset, len); // At time of writing, `concat_batches` concatenates the // contents of dictionaries as well; Do a post pass to remove the From 767c2a6fe15eff33c134f947a1cd8af20ec8d4b6 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 20 Jul 2021 11:11:18 +0100 Subject: [PATCH 07/27] refactor: explicit server startup state machine (#2040) * refactor: explicit server startup state machine * chore: update `ServerStage` docs * chore: further docs * chore: more logging * chore: format --- server/src/config.rs | 321 +++-------- server/src/db.rs | 4 +- server/src/init.rs | 683 ++++++++--------------- server/src/lib.rs | 506 
+++++++++++------ src/influxdb_ioxd/rpc/error.rs | 2 +- src/influxdb_ioxd/rpc/management.rs | 46 +- tests/end_to_end_cases/management_api.rs | 2 + tests/end_to_end_cases/management_cli.rs | 12 + 8 files changed, 701 insertions(+), 875 deletions(-) diff --git a/server/src/config.rs b/server/src/config.rs index 3088661906..4554e8e912 100644 --- a/server/src/config.rs +++ b/server/src/config.rs @@ -8,7 +8,7 @@ use data_types::{ DatabaseName, }; use metrics::MetricRegistry; -use object_store::{path::ObjectStorePath, ObjectStore}; +use object_store::{path::ObjectStorePath, ObjectStore, ObjectStoreApi}; use parquet_file::catalog::PreservedCatalog; use query::exec::Executor; use write_buffer::config::WriteBufferConfig; @@ -20,6 +20,7 @@ use crate::{ InvalidDatabaseStateTransition, JobRegistry, Result, RulesDatabaseNameMismatch, ServerShuttingDown, }; +use object_store::path::Path; use observability_deps::tracing::{self, error, info, warn, Instrument}; use snafu::{ensure, OptionExt}; use tokio::task::JoinHandle; @@ -37,10 +38,14 @@ pub(crate) const DB_RULES_FILE_NAME: &str = "rules.pb"; /// run to completion if the tokio runtime is dropped #[derive(Debug)] pub(crate) struct Config { - shutdown: CancellationToken, jobs: Arc, - state: RwLock, + object_store: Arc, + exec: Arc, + server_id: ServerId, metric_registry: Arc, + + shutdown: CancellationToken, + state: RwLock, } pub(crate) enum UpdateError { @@ -58,14 +63,20 @@ impl Config { /// Create new empty config. pub(crate) fn new( jobs: Arc, + object_store: Arc, + exec: Arc, + server_id: ServerId, metric_registry: Arc, remote_template: Option, ) -> Self { Self { + jobs, + object_store, + exec, + server_id, + metric_registry, shutdown: Default::default(), state: RwLock::new(ConfigState::new(remote_template)), - jobs, - metric_registry, } } @@ -80,13 +91,7 @@ impl Config { /// This only works if the database is not yet known. To recover a database out of an uninitialized state, see /// [`recover_db`](Self::recover_db). To do maintainance work on data linked to the database (e.g. the catalog) /// without initializing it, see [`block_db`](Self::block_db). - pub(crate) fn create_db( - &self, - object_store: Arc, - exec: Arc, - server_id: ServerId, - db_name: DatabaseName<'static>, - ) -> Result> { + pub(crate) fn create_db(&self, db_name: DatabaseName<'static>) -> Result> { let mut state = self.state.write().expect("mutex poisoned"); ensure!( !state.reservations.contains(&db_name), @@ -99,12 +104,7 @@ impl Config { state.reservations.insert(db_name.clone()); Ok(DatabaseHandle { - state: Some(Arc::new(DatabaseState::Known { - object_store, - exec, - server_id, - db_name, - })), + state: Some(Arc::new(DatabaseState::Known { db_name })), config: &self, }) } @@ -116,7 +116,7 @@ impl Config { /// While the handle is held, no other operations for the given database can be executed. /// /// This only works if the database is known but is uninitialized. To create a new database that is not yet known, - /// see [`create_db`](Self::create_db). To do maintainance work on data linked to the database (e.g. the catalog) + /// see [`create_db`](Self::create_db). To do maintenance work on data linked to the database (e.g. the catalog) /// without initializing it, see [`block_db`](Self::block_db). 
pub(crate) fn recover_db(&self, db_name: DatabaseName<'static>) -> Result> { let mut state = self.state.write().expect("mutex poisoned"); @@ -303,6 +303,24 @@ impl Config { pub fn metrics_registry(&self) -> Arc { Arc::clone(&self.metric_registry) } + + /// Returns the object store of this server + pub fn object_store(&self) -> Arc { + Arc::clone(&self.object_store) + } + + /// Returns the server id of this server + pub fn server_id(&self) -> ServerId { + self.server_id + } + + /// Base location in object store for this server. + pub fn root_path(&self) -> Path { + let id = self.server_id.get(); + let mut path = self.object_store.new_path(); + path.push_dir(format!("{}", id)); + path + } } /// Get object store path for the database config under the given root (= path under with the server with the current ID @@ -365,41 +383,14 @@ impl RemoteTemplate { } /// Internal representation of the different database states. -/// -/// # Shared Data During Transitions -/// The following elements can safely be shared between states because they won't be poisoned by any half-done -/// transition (e.g. starting a transition and then failing due to an IO error): -/// - `object_store` -/// - `exec` -/// -/// The following elements can trivially be copied from one state to the next: -/// - `server_id` -/// - `db_name` -/// -/// The following elements MUST be copied from one state to the next because partial modifications are not allowed: -/// - `rules` -/// -/// Exceptions to the above rules are the following states: -/// - [`Replay`](Self::Replay): replaying twice should (apart from some performance penalties) not do much harm -/// - [`Initialized`](Self::Initialized): the final state is not advanced to anything else #[derive(Debug)] #[allow(clippy::large_enum_variant)] enum DatabaseState { /// Database is known but nothing is loaded. - Known { - object_store: Arc, - exec: Arc, - server_id: ServerId, - db_name: DatabaseName<'static>, - }, + Known { db_name: DatabaseName<'static> }, /// Rules are loaded - RulesLoaded { - object_store: Arc, - exec: Arc, - server_id: ServerId, - rules: Arc, - }, + RulesLoaded { rules: Arc }, /// Catalog is loaded but data from sequencers / write buffers is not yet replayed. Replay { db: Arc }, @@ -457,24 +448,6 @@ impl DatabaseState { } } - fn object_store(&self) -> Arc { - match self { - DatabaseState::Known { object_store, .. } => Arc::clone(object_store), - DatabaseState::RulesLoaded { object_store, .. } => Arc::clone(object_store), - DatabaseState::Replay { db, .. } => Arc::clone(&db.store), - DatabaseState::Initialized { db, .. } => Arc::clone(&db.store), - } - } - - fn server_id(&self) -> ServerId { - match self { - DatabaseState::Known { server_id, .. } => *server_id, - DatabaseState::RulesLoaded { server_id, .. } => *server_id, - DatabaseState::Replay { db, .. } => db.server_id, - DatabaseState::Initialized { db, .. } => db.server_id, - } - } - fn rules(&self) -> Option> { match self { DatabaseState::Known { .. } => None, @@ -540,12 +513,12 @@ impl<'a> DatabaseHandle<'a> { /// Get object store. pub fn object_store(&self) -> Arc { - self.state().object_store() + Arc::clone(&self.config.object_store) } /// Get server ID. pub fn server_id(&self) -> ServerId { - self.state().server_id() + self.config.server_id } /// Get metrics registry. @@ -584,12 +557,7 @@ impl<'a> DatabaseHandle<'a> { /// Advance database state to [`RulesLoaded`](DatabaseStateCode::RulesLoaded). 
pub fn advance_rules_loaded(&mut self, rules: DatabaseRules) -> Result<()> { match self.state().as_ref() { - DatabaseState::Known { - object_store, - exec, - server_id, - db_name, - } => { + DatabaseState::Known { db_name } => { ensure!( db_name == &rules.name, RulesDatabaseNameMismatch { @@ -599,9 +567,6 @@ impl<'a> DatabaseHandle<'a> { ); self.state = Some(Arc::new(DatabaseState::RulesLoaded { - object_store: Arc::clone(&object_store), - exec: Arc::clone(&exec), - server_id: *server_id, rules: Arc::new(rules), })); @@ -623,16 +588,11 @@ impl<'a> DatabaseHandle<'a> { write_buffer: Option, ) -> Result<()> { match self.state().as_ref() { - DatabaseState::RulesLoaded { - object_store, - exec, - server_id, - rules, - } => { + DatabaseState::RulesLoaded { rules } => { let database_to_commit = DatabaseToCommit { - server_id: *server_id, - object_store: Arc::clone(&object_store), - exec: Arc::clone(&exec), + server_id: self.config.server_id, + object_store: Arc::clone(&self.config.object_store), + exec: Arc::clone(&self.config.exec), preserved_catalog, catalog, rules: Arc::clone(&rules), @@ -726,40 +686,32 @@ mod test { use super::*; use std::num::NonZeroU32; + fn make_config(remote_template: Option) -> Config { + let store = Arc::new(ObjectStore::new_in_memory()); + let server_id = ServerId::try_from(1).unwrap(); + let metric_registry = Arc::new(metrics::MetricRegistry::new()); + Config::new( + Arc::new(JobRegistry::new()), + Arc::clone(&store), + Arc::new(Executor::new(1)), + server_id, + Arc::clone(&metric_registry), + remote_template, + ) + } + #[tokio::test] async fn create_db() { // setup let name = DatabaseName::new("foo").unwrap(); - let store = Arc::new(ObjectStore::new_in_memory()); - let exec = Arc::new(Executor::new(1)); - let server_id = ServerId::try_from(1).unwrap(); - let metric_registry = Arc::new(metrics::MetricRegistry::new()); - let config = Config::new( - Arc::new(JobRegistry::new()), - Arc::clone(&metric_registry), - None, - ); + let config = make_config(None); let rules = DatabaseRules::new(name.clone()); // getting handle while DB is reserved => fails { - let _db_reservation = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap(); + let _db_reservation = config.create_db(name.clone()).unwrap(); - let err = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap_err(); + let err = config.create_db(name.clone()).unwrap_err(); assert!(matches!(err, Error::DatabaseReserved { .. 
})); let err = config.block_db(name.clone()).unwrap_err(); @@ -771,14 +723,7 @@ mod test { // name in rules must match reserved name { - let mut db_reservation = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - DatabaseName::new("bar").unwrap(), - ) - .unwrap(); + let mut db_reservation = config.create_db(DatabaseName::new("bar").unwrap()).unwrap(); let err = db_reservation .advance_rules_loaded(rules.clone()) @@ -791,14 +736,7 @@ mod test { // handle.abort just works (aka does not mess up the transaction afterwards) { - let db_reservation = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - DatabaseName::new("bar").unwrap(), - ) - .unwrap(); + let db_reservation = config.create_db(DatabaseName::new("bar").unwrap()).unwrap(); db_reservation.abort(); } @@ -808,21 +746,14 @@ mod test { // create DB successfull { - let mut db_reservation = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap(); + let mut db_reservation = config.create_db(name.clone()).unwrap(); db_reservation.advance_rules_loaded(rules).unwrap(); let (preserved_catalog, catalog) = load_or_create_preserved_catalog( &name, - Arc::clone(&store), - server_id, + config.object_store(), + config.server_id(), config.metrics_registry(), false, ) @@ -862,14 +793,7 @@ mod test { assert!(matches!(err, Error::DatabaseAlreadyExists { .. })); // create DB as second time => fail - let err = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap_err(); + let err = config.create_db(name.clone()).unwrap_err(); assert!(matches!(err, Error::DatabaseAlreadyExists { .. })); // block fully initiliazed DB => fail @@ -884,40 +808,18 @@ mod test { async fn recover_db() { // setup let name = DatabaseName::new("foo").unwrap(); - let store = Arc::new(ObjectStore::new_in_memory()); - let exec = Arc::new(Executor::new(1)); - let server_id = ServerId::try_from(1).unwrap(); - let metric_registry = Arc::new(metrics::MetricRegistry::new()); - let config = Config::new( - Arc::new(JobRegistry::new()), - Arc::clone(&metric_registry), - None, - ); + let config = make_config(None); let rules = DatabaseRules::new(name.clone()); // create DB but don't continue with rules loaded (e.g. because the rules file is broken) { - let db_reservation = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap(); + let db_reservation = config.create_db(name.clone()).unwrap(); db_reservation.commit(); } assert!(config.has_uninitialized_database(&name)); // create DB while it is uninitialized => fail - let err = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap_err(); + let err = config.create_db(name.clone()).unwrap_err(); assert!(matches!(err, Error::DatabaseAlreadyExists { .. 
})); // recover an unknown DB => fail @@ -931,19 +833,19 @@ mod test { let mut db_reservation = config.recover_db(name.clone()).unwrap(); assert_eq!(db_reservation.state_code(), DatabaseStateCode::Known); assert_eq!(db_reservation.db_name(), name); - assert_eq!(db_reservation.server_id(), server_id); + assert_eq!(db_reservation.server_id(), config.server_id()); assert!(db_reservation.rules().is_none()); db_reservation.advance_rules_loaded(rules).unwrap(); assert_eq!(db_reservation.state_code(), DatabaseStateCode::RulesLoaded); assert_eq!(db_reservation.db_name(), name); - assert_eq!(db_reservation.server_id(), server_id); + assert_eq!(db_reservation.server_id(), config.server_id()); assert!(db_reservation.rules().is_some()); let (preserved_catalog, catalog) = load_or_create_preserved_catalog( &name, - Arc::clone(&store), - server_id, + config.object_store(), + config.server_id(), config.metrics_registry(), false, ) @@ -954,13 +856,13 @@ mod test { .unwrap(); assert_eq!(db_reservation.state_code(), DatabaseStateCode::Replay); assert_eq!(db_reservation.db_name(), name); - assert_eq!(db_reservation.server_id(), server_id); + assert_eq!(db_reservation.server_id(), config.server_id()); assert!(db_reservation.rules().is_some()); db_reservation.advance_init().unwrap(); assert_eq!(db_reservation.state_code(), DatabaseStateCode::Initialized); assert_eq!(db_reservation.db_name(), name); - assert_eq!(db_reservation.server_id(), server_id); + assert_eq!(db_reservation.server_id(), config.server_id()); assert!(db_reservation.rules().is_some()); db_reservation.commit(); @@ -974,14 +876,7 @@ mod test { assert!(matches!(err, Error::DatabaseAlreadyExists { .. })); // create recovered DB => fail - let err = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap_err(); + let err = config.create_db(name.clone()).unwrap_err(); assert!(matches!(err, Error::DatabaseAlreadyExists { .. })); // block recovered DB => fail @@ -996,28 +891,13 @@ mod test { async fn block_db() { // setup let name = DatabaseName::new("foo").unwrap(); - let store = Arc::new(ObjectStore::new_in_memory()); - let exec = Arc::new(Executor::new(1)); - let server_id = ServerId::try_from(1).unwrap(); - let metric_registry = Arc::new(metrics::MetricRegistry::new()); - let config = Config::new( - Arc::new(JobRegistry::new()), - Arc::clone(&metric_registry), - None, - ); + let config = make_config(None); // block DB let handle = config.block_db(name.clone()).unwrap(); // create while blocked => fail - let err = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap_err(); + let err = config.create_db(name.clone()).unwrap_err(); assert!(matches!(err, Error::DatabaseReserved { .. 
})); // recover while blocked => fail @@ -1030,14 +910,7 @@ mod test { // unblock => DB can be created drop(handle); - config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap(); + config.create_db(name.clone()).unwrap(); // cleanup config.drain().await @@ -1047,20 +920,12 @@ mod test { async fn test_db_drop() { // setup let name = DatabaseName::new("foo").unwrap(); - let store = Arc::new(ObjectStore::new_in_memory()); - let exec = Arc::new(Executor::new(1)); - let server_id = ServerId::try_from(1).unwrap(); - let metric_registry = Arc::new(metrics::MetricRegistry::new()); - let config = Config::new( - Arc::new(JobRegistry::new()), - Arc::clone(&metric_registry), - None, - ); + let config = make_config(None); let rules = DatabaseRules::new(name.clone()); let (preserved_catalog, catalog) = load_or_create_preserved_catalog( &name, - Arc::clone(&store), - server_id, + config.object_store(), + config.server_id(), config.metrics_registry(), false, ) @@ -1068,14 +933,7 @@ mod test { .unwrap(); // create DB - let mut db_reservation = config - .create_db( - Arc::clone(&store), - Arc::clone(&exec), - server_id, - name.clone(), - ) - .unwrap(); + let mut db_reservation = config.create_db(name.clone()).unwrap(); db_reservation.advance_rules_loaded(rules).unwrap(); db_reservation .advance_replay(preserved_catalog, catalog, None) @@ -1122,12 +980,7 @@ mod test { #[test] fn resolve_remote() { - let metric_registry = Arc::new(metrics::MetricRegistry::new()); - let config = Config::new( - Arc::new(JobRegistry::new()), - Arc::clone(&metric_registry), - Some(RemoteTemplate::new("http://iox-query-{id}:8082")), - ); + let config = make_config(Some(RemoteTemplate::new("http://iox-query-{id}:8082"))); let server_id = ServerId::new(NonZeroU32::new(42).unwrap()); let remote = config.resolve_remote(server_id); diff --git a/server/src/db.rs b/server/src/db.rs index c39655f107..4ec0d92f6b 100644 --- a/server/src/db.rs +++ b/server/src/db.rs @@ -203,10 +203,10 @@ pub type Result = std::result::Result; pub struct Db { rules: RwLock>, - pub server_id: ServerId, // this is also the Query Server ID + server_id: ServerId, // this is also the Query Server ID /// Interface to use for persistence - pub store: Arc, + store: Arc, /// Executor for running queries exec: Arc, diff --git a/server/src/init.rs b/server/src/init.rs index c06cbc61b6..2351821f9c 100644 --- a/server/src/init.rs +++ b/server/src/init.rs @@ -2,29 +2,19 @@ use data_types::{ database_rules::{DatabaseRules, WriteBufferConnection}, database_state::DatabaseStateCode, - server_id::ServerId, + error::ErrorLogger, DatabaseName, }; use futures::TryStreamExt; use generated_types::database_rules::decode_database_rules; -use internal_types::once::OnceNonZeroU32; use object_store::{ path::{parsed::DirsAndFileName, ObjectStorePath, Path}, ObjectStore, ObjectStoreApi, }; -use observability_deps::tracing::{debug, error, info, warn}; -use parking_lot::Mutex; +use observability_deps::tracing::{error, info, warn}; use parquet_file::catalog::PreservedCatalog; -use query::exec::Executor; -use snafu::{OptionExt, ResultExt, Snafu}; -use std::{ - collections::HashMap, - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, -}; -use tokio::sync::Semaphore; +use snafu::{ResultExt, Snafu}; +use std::sync::Arc; use write_buffer::config::WriteBufferConfig; use crate::{ @@ -45,9 +35,6 @@ pub enum Error { source: generated_types::database_rules::DecodeError, }, - #[snafu(display("id already set"))] - IdAlreadySet { id: ServerId }, - 
#[snafu(display("unable to use server until id is set"))] IdNotSet, @@ -97,472 +84,254 @@ pub enum Error { pub type Result = std::result::Result; -#[derive(Debug, Default)] -pub struct CurrentServerId(OnceNonZeroU32); +/// Loads the database configurations based on the databases in the +/// object store. Any databases in the config already won't be +/// replaced. +/// +/// Returns a Vec containing the results of loading the contained databases +pub(crate) async fn initialize_server( + config: Arc, + wipe_on_error: bool, +) -> Result, Result<()>)>> { + let root = config.root_path(); -impl CurrentServerId { - pub fn set(&self, id: ServerId) -> Result<()> { - let id = id.get(); + // get the database names from the object store prefixes + // TODO: update object store to pull back all common prefixes by + // following the next tokens. + let list_result = config + .object_store() + .list_with_delimiter(&root) + .await + .context(StoreError)?; - match self.0.set(id) { - Ok(()) => { - info!(server_id = id, "server ID set"); - Ok(()) - } - Err(id) => Err(Error::IdAlreadySet { - id: ServerId::new(id), - }), - } - } + let handles: Vec<_> = list_result + .common_prefixes + .into_iter() + .filter_map(|mut path| { + let config = Arc::clone(&config); + let root = root.clone(); + path.set_file_name(DB_RULES_FILE_NAME); + let db_name = db_name_from_rules_path(&path) + .log_if_error("invalid database path") + .ok()?; - pub fn get(&self) -> Result { - self.0.get().map(ServerId::new).context(IdNotSet) - } -} - -#[derive(Debug)] -pub struct InitStatus { - pub server_id: CurrentServerId, - - /// Flags that databases are loaded and server is ready to read/write data. - initialized: AtomicBool, - - /// Semaphore that limits the number of jobs that load DBs when the serverID is set. - /// - /// Note that this semaphore is more of a "lock" than an arbitrary semaphore. All the other sync structures (mutex, - /// rwlock) require something to be wrapped which we don't have in our case, so we're using a semaphore here. We - /// want exactly 1 background worker to mess with the server init / DB loading, otherwise everything in the critical - /// section (in [`maybe_initialize_server`](Self::maybe_initialize_server)) will break apart. So this semaphore - /// cannot be configured. - initialize_semaphore: Semaphore, - - /// Error occurred during generic server init (e.g. listing store content). - error_generic: Mutex>>, - - /// Errors that occurred during some DB init. - errors_databases: Arc>>>, - - /// Automatic wipe-on-error recovery - /// - /// See - pub(crate) wipe_on_error: AtomicBool, -} - -impl InitStatus { - /// Create new "not initialized" status. - pub fn new() -> Self { - Self { - server_id: Default::default(), - initialized: AtomicBool::new(false), - // Always set semaphore permits to `1`, see design comments in `Server::initialize_semaphore`. - initialize_semaphore: Semaphore::new(1), - error_generic: Default::default(), - errors_databases: Default::default(), - wipe_on_error: AtomicBool::new(true), - } - } - - /// Base location in object store for this writer. - pub fn root_path(&self, store: &ObjectStore) -> Result { - let id = self.server_id.get()?; - - let mut path = store.new_path(); - path.push_dir(format!("{}", id)); - Ok(path) - } - - /// Check if server is loaded. Databases are loaded and server is ready to read/write. 
- pub fn initialized(&self) -> bool { - // Need `Acquire` ordering because IF we a `true` here, this thread will likely also read data that - // `maybe_initialize_server` wrote before toggling the flag with `Release`. The `Acquire` flag here ensures that - // every data acccess AFTER the following line will also stay AFTER this line. - self.initialized.load(Ordering::Acquire) - } - - /// Error occurred during generic server init (e.g. listing store content). - pub fn error_generic(&self) -> Option> { - let guard = self.error_generic.lock(); - guard.clone() - } - - /// List all databases with errors in sorted order. - pub fn databases_with_errors(&self) -> Vec { - let guard = self.errors_databases.lock(); - let mut names: Vec<_> = guard.keys().cloned().collect(); - names.sort(); - names - } - - /// Error that occurred during initialization of a specific database. - pub fn error_database(&self, db_name: &str) -> Option> { - let guard = self.errors_databases.lock(); - guard.get(db_name).cloned() - } - - /// Loads the database configurations based on the databases in the - /// object store. Any databases in the config already won't be - /// replaced. - /// - /// This requires the serverID to be set (will be a no-op otherwise). - /// - /// It will be a no-op if the configs are already loaded and the server is ready. - pub(crate) async fn maybe_initialize_server( - &self, - store: Arc, - config: Arc, - exec: Arc, - ) { - let server_id = match self.server_id.get() { - Ok(id) => id, - Err(e) => { - debug!(%e, "cannot initialize server because cannot get serverID"); - return; - } - }; - - let _guard = self - .initialize_semaphore - .acquire() - .await - .expect("semaphore should not be closed"); - - // Note that we use Acquire-Release ordering for the atomic within the semaphore to ensure that another thread - // that enters this semaphore after we've left actually sees the correct `is_ready` flag. - if self.initialized.load(Ordering::Acquire) { - // already loaded, so do nothing - return; - } - - // Check if there was a previous failed attempt - if self.error_generic().is_some() { - return; - } - - match self - .maybe_initialize_server_inner(store, config, exec, server_id) - .await - { - Ok(_) => { - // mark as ready (use correct ordering for Acquire-Release) - self.initialized.store(true, Ordering::Release); - info!("loaded databases, server is initalized"); - } - Err(e) => { - error!(%e, "error during server init"); - let mut guard = self.error_generic.lock(); - *guard = Some(Arc::new(e)); - } - } - } - - async fn maybe_initialize_server_inner( - &self, - store: Arc, - config: Arc, - exec: Arc, - server_id: ServerId, - ) -> Result<()> { - let root = self.root_path(&store)?; - - // get the database names from the object store prefixes - // TODO: update object store to pull back all common prefixes by - // following the next tokens. 
- let list_result = store.list_with_delimiter(&root).await.context(StoreError)?; - - let handles: Vec<_> = list_result - .common_prefixes - .into_iter() - .filter_map(|mut path| { - let store = Arc::clone(&store); - let config = Arc::clone(&config); - let exec = Arc::clone(&exec); - let errors_databases = Arc::clone(&self.errors_databases); - let wipe_on_error = self.wipe_on_error.load(Ordering::Relaxed); - let root = root.clone(); - - path.set_file_name(DB_RULES_FILE_NAME); - - match db_name_from_rules_path(&path) { - Ok(db_name) => { - let handle = tokio::task::spawn(async move { - match Self::initialize_database( - server_id, - store, - config, - exec, - root, - db_name.clone(), - wipe_on_error, - ) - .await - { - Ok(()) => { - info!(%db_name, "database initialized"); - } - Err(e) => { - error!(%e, %db_name, "cannot load database"); - let mut guard = errors_databases.lock(); - guard.insert(db_name.to_string(), Arc::new(e)); - } - } - }); - Some(handle) - } - Err(e) => { - error!(%e, "invalid database path"); - None - } - } + Some(async move { + let result = + initialize_database(config, root, db_name.clone(), wipe_on_error).await; + (db_name, result) }) - .collect(); + }) + .collect(); - futures::future::join_all(handles).await; + Ok(futures::future::join_all(handles).await) +} +async fn initialize_database( + config: Arc, + root: Path, + db_name: DatabaseName<'static>, + wipe_on_error: bool, +) -> Result<()> { + // Reserve name before expensive IO (e.g. loading the preserved catalog) + let mut handle = config + .create_db(db_name) + .map_err(Box::new) + .context(InitDbError)?; + + match try_advance_database_init_process_until_complete(&mut handle, &root, wipe_on_error).await + { + Ok(true) => { + // finished init and keep DB + handle.commit(); + Ok(()) + } + Ok(false) => { + // finished but do not keep DB + handle.abort(); + Ok(()) + } + Err(e) => { + // encountered some error, still commit intermediate result + handle.commit(); + Err(e) + } + } +} + +async fn load_database_rules(store: Arc, path: Path) -> Result> { + let serialized_rules = loop { + match get_database_config_bytes(&path, &store).await { + Ok(data) => break data, + Err(e) => { + if let Error::NoDatabaseConfigError { location } = &e { + warn!(?location, "{}", e); + return Ok(None); + } + error!( + "error getting database config {:?} from object store: {}", + path, e + ); + tokio::time::sleep(tokio::time::Duration::from_secs(STORE_ERROR_PAUSE_SECONDS)) + .await; + } + } + }; + let rules = decode_database_rules(serialized_rules.freeze()) + .context(ErrorDeserializingRulesProtobuf)?; + + Ok(Some(rules)) +} + +pub(crate) async fn wipe_preserved_catalog_and_maybe_recover( + config: Arc, + db_name: &DatabaseName<'static>, +) -> Result<()> { + let store = config.object_store(); + + if config.has_uninitialized_database(db_name) { + let mut handle = config + .recover_db(db_name.clone()) + .map_err(|e| Arc::new(e) as _) + .context(RecoverDbError)?; + + if !((handle.state_code() == DatabaseStateCode::Known) + || (handle.state_code() == DatabaseStateCode::RulesLoaded)) + { + // cannot wipe because init state is already too far + return Err(Error::DbPartiallyInitialized { + db_name: db_name.to_string(), + }); + } + + // wipe while holding handle so no other init/wipe process can interact with the catalog + PreservedCatalog::wipe(&store, handle.server_id(), db_name) + .await + .map_err(Box::new) + .context(PreservedCatalogWipeError)?; + + let root = config.root_path(); + + let result = + 
try_advance_database_init_process_until_complete(&mut handle, &root, true).await; + + // Commit changes even if failed + handle.commit(); + result.map(|_| ()) + } else { + let handle = config + .block_db(db_name.clone()) + .map_err(|e| Arc::new(e) as _) + .context(RecoverDbError)?; + + PreservedCatalog::wipe(&store, config.server_id(), db_name) + .await + .map_err(Box::new) + .context(PreservedCatalogWipeError)?; + + drop(handle); + + info!(%db_name, "wiped preserved catalog of non-registered database"); Ok(()) } +} - async fn initialize_database( - server_id: ServerId, - store: Arc, - config: Arc, - exec: Arc, - root: Path, - db_name: DatabaseName<'static>, - wipe_on_error: bool, - ) -> Result<()> { - // Reserve name before expensive IO (e.g. loading the preserved catalog) - let mut handle = config - .create_db(store, exec, server_id, db_name) - .map_err(Box::new) - .context(InitDbError)?; - - match Self::try_advance_database_init_process_until_complete( - &mut handle, - &root, - wipe_on_error, - ) - .await - { - Ok(true) => { - // finished init and keep DB - handle.commit(); - Ok(()) +/// Try to make as much progress as possible with DB init. +/// +/// Returns an error if there was an error along the way (in which case the handle should still be commit to safe +/// the intermediate result). Returns `Ok(true)` if DB init is finished and `Ok(false)` if the DB can be forgotten +/// (e.g. because not rules file is present.) +async fn try_advance_database_init_process_until_complete( + handle: &mut DatabaseHandle<'_>, + root: &Path, + wipe_on_error: bool, +) -> Result { + loop { + match try_advance_database_init_process(handle, root, wipe_on_error).await? { + InitProgress::Unfinished => {} + InitProgress::Done => { + return Ok(true); } - Ok(false) => { - // finished but do not keep DB - handle.abort(); - Ok(()) - } - Err(e) => { - // encountered some error, still commit intermediate result - handle.commit(); - Err(e) + InitProgress::Forget => { + return Ok(false); } } } +} - async fn load_database_rules( - store: Arc, - path: Path, - ) -> Result> { - let serialized_rules = loop { - match get_database_config_bytes(&path, &store).await { - Ok(data) => break data, - Err(e) => { - if let Error::NoDatabaseConfigError { location } = &e { - warn!(?location, "{}", e); - return Ok(None); - } - error!( - "error getting database config {:?} from object store: {}", - path, e - ); - tokio::time::sleep(tokio::time::Duration::from_secs(STORE_ERROR_PAUSE_SECONDS)) - .await; +/// Try to make some progress in the DB init. +async fn try_advance_database_init_process( + handle: &mut DatabaseHandle<'_>, + root: &Path, + wipe_on_error: bool, +) -> Result { + match handle.state_code() { + DatabaseStateCode::Known => { + // known => load DB rules + let path = object_store_path_for_database_config(root, &handle.db_name()); + match load_database_rules(handle.object_store(), path).await? 
{ + Some(rules) => { + handle + .advance_rules_loaded(rules) + .map_err(Box::new) + .context(InitDbError)?; + + // there is still more work to do for this DB + Ok(InitProgress::Unfinished) + } + None => { + // no rules file present, advice to forget his DB + Ok(InitProgress::Forget) } } - }; - let rules = decode_database_rules(serialized_rules.freeze()) - .context(ErrorDeserializingRulesProtobuf)?; - - Ok(Some(rules)) - } - - pub(crate) async fn wipe_preserved_catalog_and_maybe_recover( - &self, - store: Arc, - config: Arc, - server_id: ServerId, - db_name: DatabaseName<'static>, - ) -> Result<()> { - if config.has_uninitialized_database(&db_name) { - let mut handle = config - .recover_db(db_name.clone()) - .map_err(|e| Arc::new(e) as _) - .context(RecoverDbError)?; - - if !((handle.state_code() == DatabaseStateCode::Known) - || (handle.state_code() == DatabaseStateCode::RulesLoaded)) - { - // cannot wipe because init state is already too far - return Err(Error::DbPartiallyInitialized { - db_name: db_name.to_string(), - }); - } - - // wipe while holding handle so no other init/wipe process can interact with the catalog - PreservedCatalog::wipe(&store, handle.server_id(), &db_name) - .await - .map_err(Box::new) - .context(PreservedCatalogWipeError)?; - - let root = self.root_path(&store)?; - let wipe_on_error = self.wipe_on_error.load(Ordering::Relaxed); - match Self::try_advance_database_init_process_until_complete( - &mut handle, - &root, + } + DatabaseStateCode::RulesLoaded => { + // rules already loaded => continue with loading preserved catalog + let (preserved_catalog, catalog) = load_or_create_preserved_catalog( + &handle.db_name(), + handle.object_store(), + handle.server_id(), + handle.metrics_registry(), wipe_on_error, ) .await - { - Ok(_) => { - // yeah, recovered DB - handle.commit(); + .map_err(|e| Box::new(e) as _) + .context(CatalogLoadError)?; - let mut guard = self.errors_databases.lock(); - guard.remove(&db_name.to_string()); - - info!(%db_name, "wiped preserved catalog of registered database and recovered"); - Ok(()) - } - Err(e) => { - // could not recover, but still keep new result - handle.commit(); - - let mut guard = self.errors_databases.lock(); - let e = Arc::new(e); - guard.insert(db_name.to_string(), Arc::clone(&e)); - - warn!(%db_name, %e, "wiped preserved catalog of registered database but still cannot recover"); - Err(Error::RecoverDbError { source: e }) - } - } - } else { - let handle = config - .block_db(db_name.clone()) - .map_err(|e| Arc::new(e) as _) - .context(RecoverDbError)?; - - PreservedCatalog::wipe(&store, server_id, &db_name) + let rules = handle + .rules() + .expect("in this state rules should be loaded"); + let write_buffer = WriteBufferConfig::new(handle.server_id(), &rules) .await + .context(CreateWriteBuffer { + config: rules.write_buffer_connection.clone(), + })?; + info!(write_buffer_enabled=?write_buffer.is_some(), db_name=rules.db_name(), "write buffer config"); + + handle + .advance_replay(preserved_catalog, catalog, write_buffer) .map_err(Box::new) - .context(PreservedCatalogWipeError)?; + .context(InitDbError)?; - drop(handle); - - info!(%db_name, "wiped preserved catalog of non-registered database"); - Ok(()) + // there is still more work to do for this DB + Ok(InitProgress::Unfinished) } - } + DatabaseStateCode::Replay => { + let db = handle + .db_any_state() + .expect("DB should be available in this state"); + db.perform_replay().await; - /// Try to make as much progress as possible with DB init. 
- /// - /// Returns an error if there was an error along the way (in which case the handle should still be commit to safe - /// the intermediate result). Returns `Ok(true)` if DB init is finished and `Ok(false)` if the DB can be forgotten - /// (e.g. because not rules file is present.) - async fn try_advance_database_init_process_until_complete( - handle: &mut DatabaseHandle<'_>, - root: &Path, - wipe_on_error: bool, - ) -> Result { - loop { - match Self::try_advance_database_init_process(handle, root, wipe_on_error).await? { - InitProgress::Unfinished => {} - InitProgress::Done => { - return Ok(true); - } - InitProgress::Forget => { - return Ok(false); - } - } + handle + .advance_init() + .map_err(Box::new) + .context(InitDbError)?; + + // there is still more work to do for this DB + Ok(InitProgress::Unfinished) } - } - - /// Try to make some progress in the DB init. - async fn try_advance_database_init_process( - handle: &mut DatabaseHandle<'_>, - root: &Path, - wipe_on_error: bool, - ) -> Result { - match handle.state_code() { - DatabaseStateCode::Known => { - // known => load DB rules - let path = object_store_path_for_database_config(root, &handle.db_name()); - match Self::load_database_rules(handle.object_store(), path).await? { - Some(rules) => { - handle - .advance_rules_loaded(rules) - .map_err(Box::new) - .context(InitDbError)?; - - // there is still more work to do for this DB - Ok(InitProgress::Unfinished) - } - None => { - // no rules file present, advice to forget his DB - Ok(InitProgress::Forget) - } - } - } - DatabaseStateCode::RulesLoaded => { - // rules already loaded => continue with loading preserved catalog - let (preserved_catalog, catalog) = load_or_create_preserved_catalog( - &handle.db_name(), - handle.object_store(), - handle.server_id(), - handle.metrics_registry(), - wipe_on_error, - ) - .await - .map_err(|e| Box::new(e) as _) - .context(CatalogLoadError)?; - - let rules = handle - .rules() - .expect("in this state rules should be loaded"); - let write_buffer = WriteBufferConfig::new(handle.server_id(), &rules) - .await - .context(CreateWriteBuffer { - config: rules.write_buffer_connection.clone(), - })?; - info!(write_buffer_enabled=?write_buffer.is_some(), db_name=rules.db_name(), "write buffer config"); - - handle - .advance_replay(preserved_catalog, catalog, write_buffer) - .map_err(Box::new) - .context(InitDbError)?; - - // there is still more work to do for this DB - Ok(InitProgress::Unfinished) - } - DatabaseStateCode::Replay => { - let db = handle - .db_any_state() - .expect("DB should be available in this state"); - db.perform_replay().await; - - handle - .advance_init() - .map_err(Box::new) - .context(InitDbError)?; - - // there is still more work to do for this DB - Ok(InitProgress::Unfinished) - } - DatabaseStateCode::Initialized => { - // database fully initialized => nothing to do - Ok(InitProgress::Done) - } + DatabaseStateCode::Initialized => { + // database fully initialized => nothing to do + Ok(InitProgress::Done) } } } diff --git a/server/src/lib.rs b/server/src/lib.rs index 2c4d666888..48246918d1 100644 --- a/server/src/lib.rs +++ b/server/src/lib.rs @@ -74,9 +74,8 @@ use std::sync::Arc; use async_trait::async_trait; use bytes::BytesMut; use db::load::create_preserved_catalog; -use init::InitStatus; -use observability_deps::tracing::{debug, info, warn}; -use parking_lot::Mutex; +use observability_deps::tracing::{debug, error, info, warn}; +use parking_lot::{Mutex, RwLockUpgradableReadGuard}; use snafu::{OptionExt, ResultExt, Snafu}; use 
data_types::{ @@ -93,6 +92,7 @@ use generated_types::influxdata::transfer::column::v1 as pb; use influxdb_line_protocol::ParsedLine; use metrics::{KeyValue, MetricObserverBuilder, MetricRegistry}; use object_store::{ObjectStore, ObjectStoreApi}; +use parking_lot::RwLock; use query::{exec::Executor, DatabaseStore}; use tracker::{TaskId, TaskRegistration, TaskRegistryWithHistory, TaskTracker, TrackedFutureExt}; use write_buffer::config::WriteBufferConfig; @@ -220,11 +220,11 @@ pub enum Error { #[snafu(display("cannot create preserved catalog: {}", source))] CannotCreatePreservedCatalog { source: DatabaseError }, - #[snafu(display("cannot set id: {}", source))] - SetIdError { source: crate::init::Error }, + #[snafu(display("id already set"))] + IdAlreadySet, - #[snafu(display("cannot get id: {}", source))] - GetIdError { source: crate::init::Error }, + #[snafu(display("id not set"))] + IdNotSet, #[snafu(display( "cannot create write buffer with config: {:?}, error: {}", @@ -297,6 +297,8 @@ pub struct ServerConfig { metric_registry: Arc, remote_template: Option, + + wipe_catalog_on_error: bool, } impl ServerConfig { @@ -311,6 +313,7 @@ impl ServerConfig { object_store, metric_registry, remote_template, + wipe_catalog_on_error: true, } } @@ -414,7 +417,6 @@ impl ServerMetrics { /// of these structs, which keeps track of all replication and query rules. #[derive(Debug)] pub struct Server { - config: Arc, connection_manager: Arc, pub store: Arc, exec: Arc, @@ -426,7 +428,50 @@ pub struct Server { /// and populates the endpoint with this data. pub registry: Arc, - init_status: Arc, + /// The state machine for server startup + stage: Arc>, +} + +/// The stage of the server in the startup process +/// +/// The progression is linear Startup -> InitReady -> Initializing -> Initialized +/// with the sole exception that on failure Initializing -> InitReady +/// +/// Errors encountered on server init will be retried, however, errors encountered +/// during database init will require operator intervention +/// +/// These errors are exposed via `Server::error_generic` and `Server::error_database` respectively +/// +/// They do not impact the state machine's progression, but instead are exposed to the +/// gRPC management API to allow an operator to assess the state of the system +#[derive(Debug)] +enum ServerStage { + /// Server has started but doesn't have a server id yet + Startup { + remote_template: Option, + wipe_catalog_on_error: bool, + }, + + /// Server can be initialized + InitReady { + wipe_catalog_on_error: bool, + config: Arc, + last_error: Option>, + }, + + /// Server has a server id, has started loading + Initializing { + wipe_catalog_on_error: bool, + config: Arc, + last_error: Option>, + }, + + /// Server has finish initializing, possibly with errors + Initialized { + config: Arc, + /// Errors that occurred during some DB init. + database_errors: HashMap>, + }, } #[derive(Debug)] @@ -454,22 +499,23 @@ where // to test the metrics provide a different registry to the `ServerConfig`. 
metric_registry, remote_template, + wipe_catalog_on_error, } = config; + let num_worker_threads = num_worker_threads.unwrap_or_else(num_cpus::get); + let exec = Arc::new(Executor::new(num_worker_threads)); Self { - config: Arc::new(Config::new( - Arc::clone(&jobs), - Arc::clone(&metric_registry), - remote_template, - )), store: object_store, connection_manager: Arc::new(connection_manager), - exec: Arc::new(Executor::new(num_worker_threads)), + exec, jobs, metrics: Arc::new(ServerMetrics::new(Arc::clone(&metric_registry))), registry: Arc::clone(&metric_registry), - init_status: Arc::new(InitStatus::new()), + stage: Arc::new(RwLock::new(ServerStage::Startup { + remote_template, + wipe_catalog_on_error, + })), } } @@ -478,68 +524,112 @@ where /// /// A valid server ID Must be non-zero. pub fn set_id(&self, id: ServerId) -> Result<()> { - self.init_status.server_id.set(id).context(SetIdError) - } + let mut stage = self.stage.write(); + match &mut *stage { + ServerStage::Startup { + remote_template, + wipe_catalog_on_error, + } => { + let remote_template = remote_template.take(); - /// Returns the current server ID, or an error if not yet set. - pub fn require_id(&self) -> Result { - self.init_status.server_id.get().context(GetIdError) + *stage = ServerStage::InitReady { + wipe_catalog_on_error: *wipe_catalog_on_error, + config: Arc::new(Config::new( + Arc::clone(&self.jobs), + Arc::clone(&self.store), + Arc::clone(&self.exec), + id, + Arc::clone(&self.registry), + remote_template, + )), + last_error: None, + }; + Ok(()) + } + _ => Err(Error::IdAlreadySet), + } } /// Check if server is loaded. Databases are loaded and server is ready to read/write. pub fn initialized(&self) -> bool { - self.init_status.initialized() + matches!(&*self.stage.read(), ServerStage::Initialized { .. }) + } + + /// Require that server is loaded. Databases are loaded and server is ready to read/write. + fn require_initialized(&self) -> Result> { + match &*self.stage.read() { + ServerStage::Startup { .. } => Err(Error::IdNotSet), + ServerStage::InitReady { config, .. } | ServerStage::Initializing { config, .. } => { + Err(Error::ServerNotInitialized { + server_id: config.server_id(), + }) + } + ServerStage::Initialized { config, .. } => Ok(Arc::clone(&config)), + } + } + + /// Returns the config for this server if server id has been set + fn config(&self) -> Result> { + let stage = self.stage.read(); + match &*stage { + ServerStage::Startup { .. } => Err(Error::IdNotSet), + ServerStage::InitReady { config, .. } + | ServerStage::Initializing { config, .. } + | ServerStage::Initialized { config, .. } => Ok(Arc::clone(&config)), + } + } + + /// Returns the server id for this server if set + pub fn server_id(&self) -> Option { + self.config().map(|x| x.server_id()).ok() } /// Error occurred during generic server init (e.g. listing store content). pub fn error_generic(&self) -> Option> { - self.init_status.error_generic() + let stage = self.stage.read(); + match &*stage { + ServerStage::InitReady { last_error, .. } => last_error.clone(), + ServerStage::Initializing { last_error, .. } => last_error.clone(), + _ => None, + } } /// List all databases with errors in sorted order. pub fn databases_with_errors(&self) -> Vec { - self.init_status.databases_with_errors() + let stage = self.stage.read(); + match &*stage { + ServerStage::Initialized { + database_errors, .. + } => database_errors.keys().cloned().collect(), + _ => Default::default(), + } } /// Error that occurred during initialization of a specific database. 
pub fn error_database(&self, db_name: &str) -> Option> { - self.init_status.error_database(db_name) + let stage = self.stage.read(); + match &*stage { + ServerStage::Initialized { + database_errors, .. + } => database_errors.get(db_name).cloned(), + _ => None, + } } /// Current database init state. pub fn database_state(&self, name: &str) -> Option { - if let Ok(name) = DatabaseName::new(name) { - self.config.db_state(&name) - } else { - None - } - } - - /// Require that server is loaded. Databases are loaded and server is ready to read/write. - fn require_initialized(&self) -> Result { - // since a server ID is the pre-requirement for init, check this first - let server_id = self.require_id()?; - - // ordering here isn't that important since this method is not used to check-and-modify the flag - if self.initialized() { - Ok(server_id) - } else { - Err(Error::ServerNotInitialized { server_id }) - } + let db_name = DatabaseName::new(name).ok()?; + let config = self.config().ok()?; + config.db_state(&db_name) } /// Tells the server the set of rules for a database. pub async fn create_database(&self, rules: DatabaseRules) -> Result<()> { // Return an error if this server is not yet ready - let server_id = self.require_initialized()?; + let config = self.require_initialized()?; // Reserve name before expensive IO (e.g. loading the preserved catalog) - let mut db_reservation = self.config.create_db( - Arc::clone(&self.store), - Arc::clone(&self.exec), - server_id, - rules.name.clone(), - )?; + let mut db_reservation = config.create_db(rules.name.clone())?; // register rules db_reservation.advance_rules_loaded(rules.clone())?; @@ -548,14 +638,14 @@ where let (preserved_catalog, catalog) = create_preserved_catalog( rules.db_name(), Arc::clone(&self.store), - server_id, - self.config.metrics_registry(), + config.server_id(), + config.metrics_registry(), ) .await .map_err(|e| Box::new(e) as _) .context(CannotCreatePreservedCatalog)?; - let write_buffer = WriteBufferConfig::new(server_id, &rules) + let write_buffer = WriteBufferConfig::new(config.server_id(), &rules) .await .map_err(|e| Error::CreatingWriteBuffer { config: rules.write_buffer_connection.clone(), @@ -575,13 +665,8 @@ where } pub async fn persist_database_rules<'a>(&self, rules: DatabaseRules) -> Result<()> { - let location = object_store_path_for_database_config( - &self - .init_status - .root_path(&self.store) - .context(GetIdError)?, - &rules.name, - ); + let config = self.config()?; + let location = object_store_path_for_database_config(&config.root_path(), &rules.name); let mut data = BytesMut::new(); encode_database_rules(rules, &mut data).context(ErrorSerializingRulesProtobuf)?; @@ -604,15 +689,62 @@ where /// object store. Any databases in the config already won't be /// replaced. /// - /// This requires the serverID to be set. It will be a no-op if the configs are already loaded and the server is ready. + /// This requires the serverID to be set. + /// + /// It will be a no-op if the configs are already loaded and the server is ready. 
pub async fn maybe_initialize_server(&self) { - self.init_status - .maybe_initialize_server( - Arc::clone(&self.store), - Arc::clone(&self.config), - Arc::clone(&self.exec), - ) - .await; + // Explicit scope to help async generator + let (wipe_catalog_on_error, config) = { + let state = self.stage.upgradable_read(); + match &*state { + ServerStage::InitReady { + wipe_catalog_on_error, + config, + last_error, + } => { + let config = Arc::clone(config); + let last_error = last_error.clone(); + let wipe_catalog_on_error = *wipe_catalog_on_error; + + // Mark the server as initializing and drop lock + + let mut state = RwLockUpgradableReadGuard::upgrade(state); + *state = ServerStage::Initializing { + config: Arc::clone(&config), + wipe_catalog_on_error, + last_error, + }; + (wipe_catalog_on_error, config) + } + _ => return, + } + }; + + let init_result = init::initialize_server(Arc::clone(&config), wipe_catalog_on_error).await; + let new_stage = match init_result { + // Success -> move to next stage + Ok(results) => { + info!(server_id=%config.server_id(), "server initialized"); + ServerStage::Initialized { + config, + database_errors: results + .into_iter() + .filter_map(|(name, res)| Some((name.to_string(), Arc::new(res.err()?)))) + .collect(), + } + } + // Error -> return to InitReady + Err(err) => { + error!(%err, "error during server init"); + ServerStage::InitReady { + wipe_catalog_on_error, + config, + last_error: Some(Arc::new(err)), + } + } + }; + + *self.stage.write() = new_stage; } pub async fn write_pb(&self, database_batch: pb::DatabaseBatch) -> Result<()> { @@ -640,11 +772,10 @@ where default_time: i64, ) -> Result<()> { // Return an error if this server is not yet ready - self.require_initialized()?; + let config = self.require_initialized()?; let db_name = DatabaseName::new(db_name).context(InvalidDatabaseName)?; - let db = self - .config + let db = config .db_initialized(&db_name) .context(DatabaseNotFound { db_name: &*db_name })?; @@ -744,9 +875,12 @@ where node_group: &[ServerId], entry: Entry, ) -> Result<()> { + // Return an error if this server is not yet ready + let config = self.config()?; + let addrs: Vec<_> = node_group .iter() - .filter_map(|&node| self.config.resolve_remote(node)) + .filter_map(|&node| config.resolve_remote(node)) .collect(); if addrs.is_empty() { return NoRemoteConfigured { node_group }.fail(); @@ -775,11 +909,10 @@ where pub async fn write_entry(&self, db_name: &str, entry_bytes: Vec) -> Result<()> { // Return an error if this server is not yet ready - self.require_initialized()?; + let config = self.require_initialized()?; let db_name = DatabaseName::new(db_name).context(InvalidDatabaseName)?; - let db = self - .config + let db = config .db_initialized(&db_name) .context(DatabaseNotFound { db_name: &*db_name })?; @@ -825,11 +958,11 @@ where } pub fn db(&self, name: &DatabaseName<'_>) -> Option> { - self.config.db_initialized(name) + self.config().ok()?.db_initialized(name) } pub fn db_rules(&self, name: &DatabaseName<'_>) -> Option> { - self.config.db_initialized(name).map(|d| d.rules()) + self.db(name).map(|d| d.rules()) } // Update database rules and save on success. 
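The stage transition above relies on parking_lot's upgradable read lock so that checking the current stage and moving to `Initializing` happen atomically, without holding a write lock across the later `.await`. A minimal, self-contained sketch of that pattern, assuming only the parking_lot API already imported in this patch and an illustrative two-variant stage (the real `ServerStage` carries more states and payloads):

use parking_lot::{RwLock, RwLockUpgradableReadGuard};

// Illustrative stand-in for the richer ServerStage enum.
enum Stage {
    Ready,
    Busy,
}

// Take the Ready -> Busy transition atomically: readers are only blocked for
// the brief upgrade, and no write lock is held afterwards.
fn try_begin(stage: &RwLock<Stage>) -> bool {
    let guard = stage.upgradable_read();
    if matches!(&*guard, Stage::Ready) {
        let mut write = RwLockUpgradableReadGuard::upgrade(guard);
        *write = Stage::Busy;
        true
    } else {
        false
    }
}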
@@ -841,8 +974,8 @@ where where F: FnOnce(DatabaseRules) -> Result + Send, { - let rules = self - .config + let config = self.config()?; + let rules = config .update_db_rules(db_name, update) .map_err(|e| match e { crate::config::UpdateError::Closure(e) => UpdateError::Closure(e), @@ -854,16 +987,23 @@ where Ok(rules) } - pub fn remotes_sorted(&self) -> Vec<(ServerId, String)> { - self.config.remotes_sorted() + pub fn remotes_sorted(&self) -> Result> { + // TODO: Should these be on ConnectionManager and not Config + let config = self.config()?; + Ok(config.remotes_sorted()) } - pub fn update_remote(&self, id: ServerId, addr: GRpcConnectionString) { - self.config.update_remote(id, addr) + pub fn update_remote(&self, id: ServerId, addr: GRpcConnectionString) -> Result<()> { + // TODO: Should these be on ConnectionManager and not Config + let config = self.config()?; + config.update_remote(id, addr); + Ok(()) } - pub fn delete_remote(&self, id: ServerId) -> Option { - self.config.delete_remote(id) + pub fn delete_remote(&self, id: ServerId) -> Result> { + // TODO: Should these be on ConnectionManager and not Config + let config = self.config()?; + Ok(config.delete_remote(id)) } pub fn spawn_dummy_job(&self, nanos: Vec) -> TaskTracker { @@ -893,14 +1033,15 @@ where partition_key: impl Into, chunk_id: u32, ) -> Result> { + let config = self.require_initialized()?; + let db_name = db_name.to_string(); let name = DatabaseName::new(&db_name).context(InvalidDatabaseName)?; let partition_key = partition_key.into(); let table_name = table_name.into(); - let db = self - .config + let db = config .db_initialized(&name) .context(DatabaseNotFound { db_name: &db_name })?; @@ -921,25 +1062,62 @@ where /// DB jobs and this command. pub fn wipe_preserved_catalog( &self, - db_name: DatabaseName<'static>, + db_name: &DatabaseName<'static>, ) -> Result> { - if self.config.db_initialized(&db_name).is_some() { - return Err(Error::DatabaseAlreadyExists { - db_name: db_name.to_string(), - }); - } + // Can only wipe catalog of database that failed to initialize + let config = match &*self.stage.read() { + ServerStage::Initialized { + config, + database_errors, + } => { + if config.db_initialized(db_name).is_some() { + return Err(Error::DatabaseAlreadyExists { + db_name: db_name.to_string(), + }); + } + + if !database_errors.contains_key(db_name.as_str()) { + // TODO: Should this be an error? Some end-to-end tests assume it is non-fatal + warn!(%db_name, "wiping database not present at startup"); + } + Arc::clone(config) + } + ServerStage::Startup { .. } => return Err(Error::IdNotSet), + ServerStage::Initializing { config, .. } | ServerStage::InitReady { config, .. } => { + return Err(Error::ServerNotInitialized { + server_id: config.server_id(), + }) + } + }; let (tracker, registration) = self.jobs.register(Job::WipePreservedCatalog { db_name: db_name.to_string(), }); - let object_store = Arc::clone(&self.store); - let config = Arc::clone(&self.config); - let server_id = self.require_id()?; - let init_status = Arc::clone(&self.init_status); + + let state = Arc::clone(&self.stage); + let db_name = db_name.clone(); + let task = async move { - init_status - .wipe_preserved_catalog_and_maybe_recover(object_store, config, server_id, db_name) - .await + let result = init::wipe_preserved_catalog_and_maybe_recover(config, &db_name).await; + + match &mut *state.write() { + ServerStage::Initialized { + database_errors, .. 
+ } => match result { + Ok(_) => { + info!(%db_name, "wiped preserved catalog of registered database and recovered"); + database_errors.remove(db_name.as_str()); + Ok(()) + } + Err(e) => { + warn!(%db_name, %e, "wiped preserved catalog of registered database but still cannot recover"); + let e = Arc::new(e); + database_errors.insert(db_name.to_string(), Arc::clone(&e)); + Err(e) + } + }, + _ => unreachable!("server cannot become uninitialized"), + } }; tokio::spawn(task.track(registration)); @@ -973,7 +1151,9 @@ where } info!("shutting down background workers"); - self.config.drain().await; + if let Ok(config) = self.config() { + config.drain().await; + } info!("draining tracker registry"); @@ -999,11 +1179,15 @@ where type Error = Error; fn db_names_sorted(&self) -> Vec { - self.config - .db_names_sorted() - .iter() - .map(|i| i.clone().into()) - .collect() + self.config() + .map(|config| { + config + .db_names_sorted() + .iter() + .map(ToString::to_string) + .collect() + }) + .unwrap_or_default() } fn db(&self, name: &str) -> Option> { @@ -1214,25 +1398,15 @@ mod tests { let manager = TestConnectionManager::new(); let server = Server::new(manager, config()); - let resp = server.require_id().unwrap_err(); - assert!(matches!( - resp, - Error::GetIdError { - source: crate::init::Error::IdNotSet - } - )); + let resp = server.config().unwrap_err(); + assert!(matches!(resp, Error::IdNotSet)); let lines = parsed_lines("cpu foo=1 10"); let resp = server .write_lines("foo", &lines, ARBITRARY_DEFAULT_TIME) .await .unwrap_err(); - assert!(matches!( - resp, - Error::GetIdError { - source: crate::init::Error::IdNotSet - } - )); + assert!(matches!(resp, Error::IdNotSet)); } #[tokio::test] @@ -1589,7 +1763,9 @@ mod tests { ); // one remote is configured but it's down and we'll get connection error - server.update_remote(bad_remote_id, BAD_REMOTE_ADDR.into()); + server + .update_remote(bad_remote_id, BAD_REMOTE_ADDR.into()) + .unwrap(); let err = server .write_lines(&db_name, &lines, ARBITRARY_DEFAULT_TIME) .await @@ -1606,8 +1782,12 @@ mod tests { // We configure the address for the other remote, this time connection will succeed // despite the bad remote failing to connect. - server.update_remote(good_remote_id_1, GOOD_REMOTE_ADDR_1.into()); - server.update_remote(good_remote_id_2, GOOD_REMOTE_ADDR_2.into()); + server + .update_remote(good_remote_id_1, GOOD_REMOTE_ADDR_1.into()) + .unwrap(); + server + .update_remote(good_remote_id_2, GOOD_REMOTE_ADDR_2.into()) + .unwrap(); // Remotes are tried in random order, so we need to repeat the test a few times to have a reasonable // probability both the remotes will get hit. 
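The bookkeeping done by the background wipe task above boils down to "run a job, then record its outcome in a shared per-database error map". A stripped-down sketch of that shape, assuming tokio and parking_lot; names here are illustrative, and the real code additionally goes through the job registry and the `ServerStage` lock rather than a bare map:

use std::{collections::HashMap, sync::Arc};
use parking_lot::Mutex;

// Shared map of database name -> last init error, mirroring `database_errors`.
type ErrorMap = Arc<Mutex<HashMap<String, Arc<String>>>>;

// Spawn a job and record success (clearing any previous error) or failure.
fn spawn_recorded<F>(errors: ErrorMap, db_name: String, job: F) -> tokio::task::JoinHandle<()>
where
    F: std::future::Future<Output = Result<(), String>> + Send + 'static,
{
    tokio::spawn(async move {
        match job.await {
            Ok(()) => {
                errors.lock().remove(&db_name);
            }
            Err(e) => {
                errors.lock().insert(db_name, Arc::new(e));
            }
        }
    })
}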
@@ -1844,12 +2024,7 @@ mod tests { let err = create_simple_database(&server, "bananas") .await .unwrap_err(); - assert!(matches!( - err, - Error::GetIdError { - source: crate::init::Error::IdNotSet - } - )); + assert!(matches!(err, Error::IdNotSet)); server.set_id(ServerId::try_from(1).unwrap()).unwrap(); // do NOT call `server.maybe_load_database_configs` so DBs are not loaded and server is not ready @@ -1873,7 +2048,7 @@ mod tests { let t_0 = Instant::now(); loop { - if server.require_initialized().is_ok() { + if server.config().is_ok() { break; } assert!(t_0.elapsed() < Duration::from_secs(10)); @@ -1916,9 +2091,12 @@ mod tests { create_simple_database(&server, "foo") .await .expect("failed to create database"); - let root = server.init_status.root_path(&store).unwrap(); - server.config.drain().await; + + let config = server.require_initialized().unwrap(); + let root = config.root_path(); + config.drain().await; drop(server); + drop(config); // tamper store let path = object_store_path_for_database_config(&root, &DatabaseName::new("bar").unwrap()); @@ -2003,18 +2181,24 @@ mod tests { let server = Server::new(manager, config); server.set_id(server_id).unwrap(); server.maybe_initialize_server().await; + create_simple_database(&server, db_name_existing.clone()) .await .expect("failed to create database"); + create_simple_database(&server, db_name_rules_broken.clone()) .await .expect("failed to create database"); + create_simple_database(&server, db_name_catalog_broken.clone()) .await .expect("failed to create database"); - let root = server.init_status.root_path(&store).unwrap(); - server.config.drain().await; + + let config = server.require_initialized().unwrap(); + let root = config.root_path(); + config.drain().await; drop(server); + drop(config); // tamper store to break one database let path = object_store_path_for_database_config(&root, &db_name_rules_broken); @@ -2045,22 +2229,18 @@ mod tests { let store = Arc::try_unwrap(store).unwrap(); store.get(&path).await.unwrap(); let manager = TestConnectionManager::new(); - let config = config_with_store(store); - let server = Server::new(manager, config); - // need to disable auto-wipe for this test - server - .init_status - .wipe_on_error - .store(false, std::sync::atomic::Ordering::Relaxed); + let mut config = config_with_store(store); + config.wipe_catalog_on_error = false; + let server = Server::new(manager, config); // cannot wipe if server ID is not set assert_eq!( server - .wipe_preserved_catalog(db_name_non_existing.clone()) + .wipe_preserved_catalog(&db_name_non_existing) .unwrap_err() .to_string(), - "cannot get id: unable to use server until id is set" + "id not set" ); server.set_id(ServerId::try_from(1).unwrap()).unwrap(); @@ -2069,31 +2249,29 @@ mod tests { // 1. cannot wipe if DB exists assert_eq!( server - .wipe_preserved_catalog(db_name_existing.clone()) + .wipe_preserved_catalog(&db_name_existing) .unwrap_err() .to_string(), "database already exists: db_existing" ); - assert!(PreservedCatalog::exists( - &server.store, - server.require_id().unwrap(), - &db_name_existing.to_string() - ) - .await - .unwrap()); + assert!( + PreservedCatalog::exists(&server.store, server_id, db_name_existing.as_str()) + .await + .unwrap() + ); // 2. 
wiping a non-existing DB just works, but won't bring DB into existence assert!(server.error_database(&db_name_non_existing).is_none()); PreservedCatalog::new_empty::( Arc::clone(&server.store), - server.require_id().unwrap(), + server_id, db_name_non_existing.to_string(), (), ) .await .unwrap(); let tracker = server - .wipe_preserved_catalog(db_name_non_existing.clone()) + .wipe_preserved_catalog(&db_name_non_existing) .unwrap(); let metadata = tracker.metadata(); let expected_metadata = Job::WipePreservedCatalog { @@ -2103,7 +2281,7 @@ mod tests { tracker.join().await; assert!(!PreservedCatalog::exists( &server.store, - server.require_id().unwrap(), + server_id, &db_name_non_existing.to_string() ) .await @@ -2114,7 +2292,7 @@ mod tests { // 3. wipe DB with broken rules file, this won't bring DB back to life assert!(server.error_database(&db_name_rules_broken).is_some()); let tracker = server - .wipe_preserved_catalog(db_name_rules_broken.clone()) + .wipe_preserved_catalog(&db_name_rules_broken) .unwrap(); let metadata = tracker.metadata(); let expected_metadata = Job::WipePreservedCatalog { @@ -2124,7 +2302,7 @@ mod tests { tracker.join().await; assert!(!PreservedCatalog::exists( &server.store, - server.require_id().unwrap(), + server_id, &db_name_rules_broken.to_string() ) .await @@ -2135,7 +2313,7 @@ mod tests { // 4. wipe DB with broken catalog, this will bring the DB back to life assert!(server.error_database(&db_name_catalog_broken).is_some()); let tracker = server - .wipe_preserved_catalog(db_name_catalog_broken.clone()) + .wipe_preserved_catalog(&db_name_catalog_broken) .unwrap(); let metadata = tracker.metadata(); let expected_metadata = Job::WipePreservedCatalog { @@ -2145,7 +2323,7 @@ mod tests { tracker.join().await; assert!(PreservedCatalog::exists( &server.store, - server.require_id().unwrap(), + server_id, &db_name_catalog_broken.to_string() ) .await @@ -2166,18 +2344,16 @@ mod tests { .unwrap(); assert_eq!( server - .wipe_preserved_catalog(db_name_created.clone()) + .wipe_preserved_catalog(&db_name_created) .unwrap_err() .to_string(), "database already exists: db_created" ); - assert!(PreservedCatalog::exists( - &server.store, - server.require_id().unwrap(), - &db_name_created.to_string() - ) - .await - .unwrap()); + assert!( + PreservedCatalog::exists(&server.store, server_id, &db_name_created.to_string()) + .await + .unwrap() + ); } #[tokio::test] diff --git a/src/influxdb_ioxd/rpc/error.rs b/src/influxdb_ioxd/rpc/error.rs index 4b3b95f314..a6ab258497 100644 --- a/src/influxdb_ioxd/rpc/error.rs +++ b/src/influxdb_ioxd/rpc/error.rs @@ -8,7 +8,7 @@ pub fn default_server_error_handler(error: server::Error) -> tonic::Status { use server::Error; match error { - Error::GetIdError { .. 
} => PreconditionViolation { + Error::IdNotSet => PreconditionViolation { category: "Writer ID".to_string(), subject: "influxdata.com/iox".to_string(), description: "Writer ID must be set".to_string(), diff --git a/src/influxdb_ioxd/rpc/management.rs b/src/influxdb_ioxd/rpc/management.rs index 5f81db5e46..2b26040d04 100644 --- a/src/influxdb_ioxd/rpc/management.rs +++ b/src/influxdb_ioxd/rpc/management.rs @@ -56,7 +56,7 @@ where &self, _: Request, ) -> Result, Status> { - match self.server.require_id().ok() { + match self.server.server_id() { Some(id) => Ok(Response::new(GetServerIdResponse { id: id.get_u32() })), None => return Err(NotFound::default().into()), } @@ -71,7 +71,7 @@ where match self.server.set_id(id) { Ok(_) => Ok(Response::new(UpdateServerIdResponse {})), - Err(e @ Error::SetIdError { .. }) => { + Err(e @ Error::IdAlreadySet) => { return Err(FieldViolation { field: "id".to_string(), description: e.to_string(), @@ -199,15 +199,18 @@ where &self, _: Request, ) -> Result, Status> { - let remotes = self - .server - .remotes_sorted() - .into_iter() - .map(|(id, connection_string)| Remote { - id: id.get_u32(), - connection_string, - }) - .collect(); + let result = self.server.remotes_sorted(); + let remotes = match result { + Ok(remotes) => remotes + .into_iter() + .map(|(id, connection_string)| Remote { + id: id.get_u32(), + connection_string, + }) + .collect(), + Err(e) => return Err(default_server_error_handler(e)), + }; + Ok(Response::new(ListRemotesResponse { remotes })) } @@ -221,8 +224,16 @@ where .ok_or_else(|| FieldViolation::required("remote"))?; let remote_id = ServerId::try_from(remote.id) .map_err(|_| FieldViolation::required("id").scope("remote"))?; - self.server + + let result = self + .server .update_remote(remote_id, remote.connection_string); + + match result { + Ok(_) => {} + Err(e) => return Err(default_server_error_handler(e)), + } + Ok(Response::new(UpdateRemoteResponse {})) } @@ -233,9 +244,12 @@ where let request = request.into_inner(); let remote_id = ServerId::try_from(request.id).map_err(|_| FieldViolation::required("id"))?; - self.server - .delete_remote(remote_id) - .ok_or_else(NotFound::default)?; + + match self.server.delete_remote(remote_id) { + Ok(Some(_)) => {} + Ok(None) => return Err(NotFound::default().into()), + Err(e) => return Err(default_server_error_handler(e)), + } Ok(Response::new(DeleteRemoteResponse {})) } @@ -455,7 +469,7 @@ where let tracker = self .server - .wipe_preserved_catalog(db_name) + .wipe_preserved_catalog(&db_name) .map_err(|e| match e { Error::DatabaseAlreadyExists { db_name } => AlreadyExists { resource_type: "database".to_string(), diff --git a/tests/end_to_end_cases/management_api.rs b/tests/end_to_end_cases/management_api.rs index 2f26969085..d543e7fb68 100644 --- a/tests/end_to_end_cases/management_api.rs +++ b/tests/end_to_end_cases/management_api.rs @@ -65,6 +65,8 @@ async fn test_list_update_remotes() { const TEST_REMOTE_ADDR_2: &str = "4.3.2.1:4321"; const TEST_REMOTE_ADDR_2_UPDATED: &str = "40.30.20.10:4321"; + client.update_server_id(123).await.unwrap(); + let res = client.list_remotes().await.expect("list remotes failed"); assert_eq!(res.len(), 0); diff --git a/tests/end_to_end_cases/management_cli.rs b/tests/end_to_end_cases/management_cli.rs index a09285c695..3e0be27290 100644 --- a/tests/end_to_end_cases/management_cli.rs +++ b/tests/end_to_end_cases/management_cli.rs @@ -244,6 +244,18 @@ async fn test_list_chunks_error() { async fn test_remotes() { let server_fixture = 
ServerFixture::create_single_use().await;
     let addr = server_fixture.grpc_base();
+
+    Command::cargo_bin("influxdb_iox")
+        .unwrap()
+        .arg("server")
+        .arg("set")
+        .arg("32")
+        .arg("--host")
+        .arg(addr)
+        .assert()
+        .success()
+        .stdout(predicate::str::contains("Ok"));
+
     Command::cargo_bin("influxdb_iox")
         .unwrap()
         .arg("server")

From b0663a0337610dfbebc7ec28224a1814e847bf8b Mon Sep 17 00:00:00 2001
From: Marco Neumann
Date: Tue, 20 Jul 2021 12:35:20 +0200
Subject: [PATCH 08/27] feat: disallow multiple write buffer streams and
 seeking while streaming

Multiple streams will mess up ordering. Seeking while streaming is likely a
bug and should not work.
---
 server/src/db.rs          |  5 ++-
 write_buffer/src/core.rs  | 68 ++++++++++++------------------
 write_buffer/src/guard.rs | 87 +++++++++++++++++++++++++++++++++++++++
 write_buffer/src/kafka.rs | 32 +++++++++++---
 write_buffer/src/lib.rs   |  1 +
 write_buffer/src/mock.rs  | 24 +++++++++--
 6 files changed, 167 insertions(+), 50 deletions(-)
 create mode 100644 write_buffer/src/guard.rs

diff --git a/server/src/db.rs b/server/src/db.rs
index c39655f107..59348fc956 100644
--- a/server/src/db.rs
+++ b/server/src/db.rs
@@ -657,7 +657,10 @@ impl Db {
         async {
             if let Some(WriteBufferConfig::Reading(write_buffer)) = &self.write_buffer {
                 let mut futures = vec![];
-                for (_sequencer_id, stream) in write_buffer.streams() {
+                for (_sequencer_id, stream) in write_buffer
+                    .streams()
+                    .expect("no streams should exist at this point")
+                {
                     let fut = self.stream_in_sequenced_entries(stream);
                     futures.push(fut);
                 }
diff --git a/write_buffer/src/core.rs b/write_buffer/src/core.rs
index 27c7512884..1f91d955ab 100644
--- a/write_buffer/src/core.rs
+++ b/write_buffer/src/core.rs
@@ -29,13 +29,13 @@ pub type EntryStream<'a> = BoxStream<'a, Result<SequencedEntry, WriteBufferError>>;
-    fn streams(&self) -> Vec<(u32, EntryStream<'_>)>;
+    /// When calling this method a second time while the streams of the first call are still in use, an error will be returned.
+    fn streams(&self) -> Result<Vec<(u32, EntryStream<'_>)>, WriteBufferError>;
 
     /// Seek given sequencer to given sequence number. The next output of related streams will be an entry with at least
     /// the given sequence number (the actual sequence number might be skipped due to "holes" in the stream).
+    ///
+    /// When calling this method while streams (from [`streams`](Self::streams)) are in use, an error will be returned.
async fn seek(&self, sequencer_id: u32, sequence_number: u64) -> Result<(), WriteBufferError>; } @@ -68,7 +68,7 @@ pub mod test_utils { T: TestAdapter, { test_single_stream_io(&adapter).await; - test_multi_stream_io(&adapter).await; + test_multi_stream(&adapter).await; test_multi_sequencer_io(&adapter).await; test_multi_writer_multi_reader(&adapter).await; test_seek(&adapter).await; @@ -87,7 +87,7 @@ pub mod test_utils { let writer = context.writing(); let reader = context.reading().await; - let mut streams = reader.streams(); + let mut streams = reader.streams().unwrap(); assert_eq!(streams.len(), 1); let (sequencer_id, mut stream) = streams.pop().unwrap(); @@ -127,7 +127,7 @@ pub mod test_utils { let writer = context.writing(); let reader = context.reading().await; - let mut streams = reader.streams(); + let mut streams = reader.streams().unwrap(); assert_eq!(streams.len(), 2); let (sequencer_id_1, mut stream_1) = streams.pop().unwrap(); let (sequencer_id_2, mut stream_2) = streams.pop().unwrap(); @@ -158,45 +158,26 @@ pub mod test_utils { assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); } - async fn test_multi_stream_io(adapter: &T) + async fn test_multi_stream(adapter: &T) where T: TestAdapter, { - let context = adapter.new_context(1).await; - - let entry_1 = lp_to_entry("upc user=1 100"); - let entry_2 = lp_to_entry("upc user=2 200"); - let entry_3 = lp_to_entry("upc user=3 300"); - - let writer = context.writing(); + let context = adapter.new_context(2).await; let reader = context.reading().await; - let mut streams_1 = reader.streams(); - let mut streams_2 = reader.streams(); - assert_eq!(streams_1.len(), 1); - assert_eq!(streams_2.len(), 1); - let (sequencer_id_1, mut stream_1) = streams_1.pop().unwrap(); - let (sequencer_id_2, mut stream_2) = streams_2.pop().unwrap(); - assert_eq!(sequencer_id_1, sequencer_id_2); + let mut streams = reader.streams().unwrap(); + assert_eq!(streams.len(), 2); + let (_sequencer_id, stream_1) = streams.pop().unwrap(); + let (_sequencer_id, stream_2) = streams.pop().unwrap(); - let waker = futures::task::noop_waker(); - let mut cx = futures::task::Context::from_waker(&waker); + // cannot get another stream while streams are in use + assert!(reader.streams().is_err()); + drop(stream_1); + assert!(reader.streams().is_err()); + drop(stream_2); - // empty streams is pending - assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); - assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); - - // streams poll from same source - writer.store_entry(&entry_1, sequencer_id_1).await.unwrap(); - writer.store_entry(&entry_2, sequencer_id_1).await.unwrap(); - writer.store_entry(&entry_3, sequencer_id_1).await.unwrap(); - assert_eq!(stream_1.next().await.unwrap().unwrap().entry(), &entry_1); - assert_eq!(stream_2.next().await.unwrap().unwrap().entry(), &entry_2); - assert_eq!(stream_1.next().await.unwrap().unwrap().entry(), &entry_3); - - // both streams are pending again - assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); - assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); + // when all streams are dropped, we can get new ones + reader.streams().unwrap(); } async fn test_multi_writer_multi_reader(adapter: &T) @@ -269,13 +250,18 @@ pub mod test_utils { // seek to far end and then at data reader_1.seek(0, 1_000_000).await.unwrap(); let _sequence_number_east_3 = writer.store_entry(&entry_east_3, 0).await.unwrap().number; - let mut streams = reader_1.streams(); + let mut streams = reader_1.streams().unwrap(); assert_eq!(streams.len(), 2); let 
(_sequencer_id, mut stream_1) = streams.pop().unwrap(); let (_sequencer_id, mut stream_2) = streams.pop().unwrap(); assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); + // seeking while streams are in use is an error + reader_1.seek(0, 0).await.unwrap_err(); + drop(stream_1); + drop(stream_2); + // seeking unknown sequencer is NOT an error reader_1.seek(0, 42).await.unwrap(); } @@ -284,7 +270,7 @@ pub mod test_utils { where R: WriteBufferReading, { - let mut streams = reader.streams(); + let mut streams = reader.streams().unwrap(); assert_eq!(streams.len(), expected.len()); streams.sort_by_key(|(sequencer_id, _stream)| *sequencer_id); diff --git a/write_buffer/src/guard.rs b/write_buffer/src/guard.rs new file mode 100644 index 0000000000..636193a15c --- /dev/null +++ b/write_buffer/src/guard.rs @@ -0,0 +1,87 @@ +use std::sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, +}; + +/// A semaphore that produces [`Send`]able guards. +pub struct Semaphore { + user_count: Arc, +} + +impl Semaphore { + /// Creates new semaphore with a single permit. + pub fn new() -> Self { + Self { + user_count: Arc::new(AtomicUsize::new(0)), + } + } + + /// Creates guard if no permit exists. + /// + /// To produce multiple guards, you can clone an existing one. + pub fn guard(&self) -> Option { + let count = self.user_count.fetch_add(1, Ordering::SeqCst); + if count > 0 { + self.user_count.fetch_sub(1, Ordering::SeqCst); + None + } else { + Some(Guard { + user_count: Arc::clone(&self.user_count), + }) + } + } +} + +/// Guard that hols a [`Semaphore`] permit. +/// +/// New guards can be produced in two ways: +/// - cloning an existing one +/// - when no guard exists: using [`Semaphore::guard`]. +pub struct Guard { + user_count: Arc, +} + +impl Guard { + /// Use a guard. + /// + /// This is a no-op but is helpful if you need to reference a guard within a closure. + pub fn use_here(&self) {} +} + +impl Clone for Guard { + /// Clone guard and increase usage count. 
+ fn clone(&self) -> Self { + self.user_count.fetch_add(1, Ordering::SeqCst); + Self { + user_count: Arc::clone(&self.user_count), + } + } +} + +impl Drop for Guard { + fn drop(&mut self) { + self.user_count.fetch_sub(1, Ordering::SeqCst); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test() { + let s = Semaphore::new(); + + let g = s.guard().unwrap(); + assert!(s.guard().is_none()); + drop(g); + + let g1 = s.guard().unwrap(); + let g2 = g1.clone(); + assert!(s.guard().is_none()); + drop(g1); + assert!(s.guard().is_none()); + drop(g2); + s.guard().unwrap(); + } +} diff --git a/write_buffer/src/kafka.rs b/write_buffer/src/kafka.rs index 0624a06d18..abd960a521 100644 --- a/write_buffer/src/kafka.rs +++ b/write_buffer/src/kafka.rs @@ -18,7 +18,10 @@ use rdkafka::{ ClientConfig, Message, Offset, TopicPartitionList, }; -use crate::core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting}; +use crate::{ + core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting}, + guard::Semaphore, +}; pub struct KafkaBufferProducer { conn: String, @@ -97,6 +100,7 @@ pub struct KafkaBufferConsumer { conn: String, database_name: String, consumers: BTreeMap>, + semaphore: Semaphore, } // Needed because rdkafka's StreamConsumer doesn't impl Debug @@ -111,13 +115,23 @@ impl std::fmt::Debug for KafkaBufferConsumer { #[async_trait] impl WriteBufferReading for KafkaBufferConsumer { - fn streams(&self) -> Vec<(u32, EntryStream<'_>)> { - self.consumers + fn streams(&self) -> Result)>, WriteBufferError> { + let guard = self + .semaphore + .guard() + .ok_or_else::(|| "stream already in use".to_string().into())?; + + let streams: Vec<_> = self + .consumers .iter() .map(|(sequencer_id, consumer)| { + let guard = guard.clone(); + let stream = consumer .stream() - .map(|message| { + .map(move |message| { + guard.use_here(); + let message = message?; let entry = Entry::try_from(message.payload().unwrap().to_vec())?; let sequence = Sequence { @@ -130,10 +144,17 @@ impl WriteBufferReading for KafkaBufferConsumer { .boxed(); (*sequencer_id, stream) }) - .collect() + .collect(); + + Ok(streams) } async fn seek(&self, sequencer_id: u32, sequence_number: u64) -> Result<(), WriteBufferError> { + let _guard = self + .semaphore + .guard() + .ok_or_else::(|| "stream already in use".to_string().into())?; + if let Some(consumer) = self.consumers.get(&sequencer_id) { let consumer = Arc::clone(consumer); let database_name = self.database_name.clone(); @@ -213,6 +234,7 @@ impl KafkaBufferConsumer { conn, database_name, consumers, + semaphore: Semaphore::new(), }) } diff --git a/write_buffer/src/lib.rs b/write_buffer/src/lib.rs index 9e9472940a..a165fa3eb8 100644 --- a/write_buffer/src/lib.rs +++ b/write_buffer/src/lib.rs @@ -10,5 +10,6 @@ pub mod config; pub mod core; +mod guard; pub mod kafka; pub mod mock; diff --git a/write_buffer/src/mock.rs b/write_buffer/src/mock.rs index cb4322199a..701b8af703 100644 --- a/write_buffer/src/mock.rs +++ b/write_buffer/src/mock.rs @@ -5,7 +5,10 @@ use entry::{Entry, Sequence, SequencedEntry}; use futures::{stream, StreamExt}; use parking_lot::Mutex; -use crate::core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting}; +use crate::{ + core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting}, + guard::Semaphore, +}; type EntryResVec = Vec>; @@ -165,6 +168,7 @@ struct PlaybackState { pub struct MockBufferForReading { shared_state: MockBufferSharedState, playback_states: Arc>>, + semaphore: Semaphore, } 
impl MockBufferForReading { @@ -185,6 +189,7 @@ impl MockBufferForReading { Self { shared_state: state, playback_states: Arc::new(Mutex::new(playback_states)), + semaphore: Semaphore::new(), } } } @@ -197,7 +202,12 @@ impl std::fmt::Debug for MockBufferForReading { #[async_trait] impl WriteBufferReading for MockBufferForReading { - fn streams(&self) -> Vec<(u32, EntryStream<'_>)> { + fn streams(&self) -> Result)>, WriteBufferError> { + let guard = self + .semaphore + .guard() + .ok_or_else::(|| "stream already in use".to_string().into())?; + let sequencer_ids: Vec<_> = { let playback_states = self.playback_states.lock(); playback_states.keys().copied().collect() @@ -207,8 +217,11 @@ impl WriteBufferReading for MockBufferForReading { for sequencer_id in sequencer_ids { let shared_state = self.shared_state.clone(); let playback_states = Arc::clone(&self.playback_states); + let guard = guard.clone(); let stream = stream::poll_fn(move |_ctx| { + guard.use_here(); + let entries = shared_state.entries.lock(); let mut playback_states = playback_states.lock(); @@ -247,10 +260,15 @@ impl WriteBufferReading for MockBufferForReading { streams.push((sequencer_id, stream)); } - streams + Ok(streams) } async fn seek(&self, sequencer_id: u32, sequence_number: u64) -> Result<(), WriteBufferError> { + let _guard = self + .semaphore + .guard() + .ok_or_else::(|| "stream already in use".to_string().into())?; + let mut playback_states = self.playback_states.lock(); if let Some(playback_state) = playback_states.get_mut(&sequencer_id) { From cc0aaa58a7bd71e1a8489ecc5af4dd3731c0ccf2 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Tue, 20 Jul 2021 12:43:10 +0100 Subject: [PATCH 09/27] test: ensure high enough limit --- lifecycle/src/policy.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lifecycle/src/policy.rs b/lifecycle/src/policy.rs index dfe713fa2c..beb35464ea 100644 --- a/lifecycle/src/policy.rs +++ b/lifecycle/src/policy.rs @@ -1399,6 +1399,7 @@ mod tests { let rules = LifecycleRules { late_arrive_window_seconds: NonZeroU32::new(10).unwrap(), persist_row_threshold: NonZeroUsize::new(1_000).unwrap(), + max_active_compactions: NonZeroU32::new(10).unwrap(), ..Default::default() }; @@ -1538,6 +1539,7 @@ mod tests { persist_row_threshold: NonZeroUsize::new(1_000).unwrap(), late_arrive_window_seconds: NonZeroU32::new(10).unwrap(), persist_age_threshold_seconds: NonZeroU32::new(10).unwrap(), + max_active_compactions: NonZeroU32::new(10).unwrap(), ..Default::default() }; let now = Instant::now(); From ec7ebdff2994a3b8ebd85c14d48a7827e388ae5c Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Tue, 20 Jul 2021 13:52:33 +0200 Subject: [PATCH 10/27] refactor: use lifetimes to ensure single stream / no seek while streaming --- server/src/db.rs | 26 +++++++----- write_buffer/src/config.rs | 6 ++- write_buffer/src/core.rs | 79 ++++++++++++++-------------------- write_buffer/src/guard.rs | 87 -------------------------------------- write_buffer/src/kafka.rs | 36 ++++------------ write_buffer/src/lib.rs | 1 - write_buffer/src/mock.rs | 30 ++++--------- 7 files changed, 68 insertions(+), 197 deletions(-) delete mode 100644 write_buffer/src/guard.rs diff --git a/server/src/db.rs b/server/src/db.rs index 59348fc956..020c91fd21 100644 --- a/server/src/db.rs +++ b/server/src/db.rs @@ -656,11 +656,11 @@ impl Db { // streaming from the write buffer loop async { if let Some(WriteBufferConfig::Reading(write_buffer)) = &self.write_buffer { + let mut write_buffer = write_buffer + .try_lock() + .expect("no streams should exist 
at this point"); let mut futures = vec![]; - for (_sequencer_id, stream) in write_buffer - .streams() - .expect("no streams should exist at this point") - { + for (_sequencer_id, stream) in write_buffer.streams() { let fut = self.stream_in_sequenced_entries(stream); futures.push(fut); } @@ -1215,10 +1215,12 @@ mod tests { let write_buffer_state = MockBufferSharedState::empty_with_n_sequencers(1); write_buffer_state .push_entry(SequencedEntry::new_from_sequence(Sequence::new(0, 0), entry).unwrap()); - let write_buffer = Arc::new(MockBufferForReading::new(write_buffer_state)); + let write_buffer = MockBufferForReading::new(write_buffer_state); let db = TestDb::builder() - .write_buffer(WriteBufferConfig::Reading(Arc::clone(&write_buffer) as _)) + .write_buffer(WriteBufferConfig::Reading(Arc::new( + tokio::sync::Mutex::new(Box::new(write_buffer) as _), + ))) .build() .await .db; @@ -1274,10 +1276,12 @@ mod tests { String::from("Something bad happened on the way to creating a SequencedEntry").into(), 0, ); - let write_buffer = Arc::new(MockBufferForReading::new(write_buffer_state)); + let write_buffer = MockBufferForReading::new(write_buffer_state); let test_db = TestDb::builder() - .write_buffer(WriteBufferConfig::Reading(Arc::clone(&write_buffer) as _)) + .write_buffer(WriteBufferConfig::Reading(Arc::new( + tokio::sync::Mutex::new(Box::new(write_buffer) as _), + ))) .build() .await; @@ -2262,10 +2266,12 @@ mod tests { ); write_buffer_state .push_entry(SequencedEntry::new_from_sequence(Sequence::new(0, 1), entry).unwrap()); - let write_buffer = Arc::new(MockBufferForReading::new(write_buffer_state)); + let write_buffer = MockBufferForReading::new(write_buffer_state); let db = TestDb::builder() - .write_buffer(WriteBufferConfig::Reading(Arc::clone(&write_buffer) as _)) + .write_buffer(WriteBufferConfig::Reading(Arc::new( + tokio::sync::Mutex::new(Box::new(write_buffer) as _), + ))) .build() .await .db; diff --git a/write_buffer/src/config.rs b/write_buffer/src/config.rs index 6aacefd530..d6c69e6341 100644 --- a/write_buffer/src/config.rs +++ b/write_buffer/src/config.rs @@ -13,7 +13,7 @@ use crate::{ #[derive(Debug)] pub enum WriteBufferConfig { Writing(Arc), - Reading(Arc), + Reading(Arc>>), } impl WriteBufferConfig { @@ -36,7 +36,9 @@ impl WriteBufferConfig { Some(WriteBufferConnection::Reading(conn)) => { let kafka_buffer = KafkaBufferConsumer::new(conn, server_id, name).await?; - Ok(Some(Self::Reading(Arc::new(kafka_buffer) as _))) + Ok(Some(Self::Reading(Arc::new(tokio::sync::Mutex::new( + Box::new(kafka_buffer) as _, + ))))) } None => Ok(None), } diff --git a/write_buffer/src/core.rs b/write_buffer/src/core.rs index 1f91d955ab..fdefc76746 100644 --- a/write_buffer/src/core.rs +++ b/write_buffer/src/core.rs @@ -28,15 +28,15 @@ pub type EntryStream<'a> = BoxStream<'a, Result Result)>, WriteBufferError>; + fn streams(&mut self) -> Vec<(u32, EntryStream<'_>)>; /// Seek given sequencer to given sequence number. The next output of related streams will be an entry with at least /// the given sequence number (the actual sequence number might be skipped due to "holes" in the stream). - /// - /// When calling this methods while streams (from [`streams`](Self::streams)) are in use, an error will be returned. 
- async fn seek(&self, sequencer_id: u32, sequence_number: u64) -> Result<(), WriteBufferError>; + async fn seek( + &mut self, + sequencer_id: u32, + sequence_number: u64, + ) -> Result<(), WriteBufferError>; } pub mod test_utils { @@ -68,7 +68,6 @@ pub mod test_utils { T: TestAdapter, { test_single_stream_io(&adapter).await; - test_multi_stream(&adapter).await; test_multi_sequencer_io(&adapter).await; test_multi_writer_multi_reader(&adapter).await; test_seek(&adapter).await; @@ -85,9 +84,9 @@ pub mod test_utils { let entry_3 = lp_to_entry("upc user=3 300"); let writer = context.writing(); - let reader = context.reading().await; + let mut reader = context.reading().await; - let mut streams = reader.streams().unwrap(); + let mut streams = reader.streams(); assert_eq!(streams.len(), 1); let (sequencer_id, mut stream) = streams.pop().unwrap(); @@ -125,9 +124,9 @@ pub mod test_utils { let entry_3 = lp_to_entry("upc user=3 300"); let writer = context.writing(); - let reader = context.reading().await; + let mut reader = context.reading().await; - let mut streams = reader.streams().unwrap(); + let mut streams = reader.streams(); assert_eq!(streams.len(), 2); let (sequencer_id_1, mut stream_1) = streams.pop().unwrap(); let (sequencer_id_2, mut stream_2) = streams.pop().unwrap(); @@ -158,28 +157,6 @@ pub mod test_utils { assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); } - async fn test_multi_stream(adapter: &T) - where - T: TestAdapter, - { - let context = adapter.new_context(2).await; - let reader = context.reading().await; - - let mut streams = reader.streams().unwrap(); - assert_eq!(streams.len(), 2); - let (_sequencer_id, stream_1) = streams.pop().unwrap(); - let (_sequencer_id, stream_2) = streams.pop().unwrap(); - - // cannot get another stream while streams are in use - assert!(reader.streams().is_err()); - drop(stream_1); - assert!(reader.streams().is_err()); - drop(stream_2); - - // when all streams are dropped, we can get new ones - reader.streams().unwrap(); - } - async fn test_multi_writer_multi_reader(adapter: &T) where T: TestAdapter, @@ -192,8 +169,8 @@ pub mod test_utils { let writer_1 = context.writing(); let writer_2 = context.writing(); - let reader_1 = context.reading().await; - let reader_2 = context.reading().await; + let mut reader_1 = context.reading().await; + let mut reader_2 = context.reading().await; // TODO: do not hard-code sequencer IDs here but provide a proper interface writer_1.store_entry(&entry_east_1, 0).await.unwrap(); @@ -201,12 +178,12 @@ pub mod test_utils { writer_2.store_entry(&entry_east_2, 0).await.unwrap(); assert_reader_content( - &reader_1, + &mut reader_1, &[(0, &[&entry_east_1, &entry_east_2]), (1, &[&entry_west_1])], ) .await; assert_reader_content( - &reader_2, + &mut reader_2, &[(0, &[&entry_east_1, &entry_east_2]), (1, &[&entry_west_1])], ) .await; @@ -231,46 +208,52 @@ pub mod test_utils { let sequence_number_east_2 = writer.store_entry(&entry_east_2, 0).await.unwrap().number; let _sequence_number_west_1 = writer.store_entry(&entry_west_1, 1).await.unwrap().number; - let reader_1 = context.reading().await; - let reader_2 = context.reading().await; + let mut reader_1 = context.reading().await; + let mut reader_2 = context.reading().await; // forward seek reader_1.seek(0, sequence_number_east_2).await.unwrap(); - assert_reader_content(&reader_1, &[(0, &[&entry_east_2]), (1, &[&entry_west_1])]).await; assert_reader_content( - &reader_2, + &mut reader_1, + &[(0, &[&entry_east_2]), (1, &[&entry_west_1])], + ) + .await; + 
assert_reader_content( + &mut reader_2, &[(0, &[&entry_east_1, &entry_east_2]), (1, &[&entry_west_1])], ) .await; // backward seek reader_1.seek(0, 0).await.unwrap(); - assert_reader_content(&reader_1, &[(0, &[&entry_east_1, &entry_east_2]), (1, &[])]).await; + assert_reader_content( + &mut reader_1, + &[(0, &[&entry_east_1, &entry_east_2]), (1, &[])], + ) + .await; // seek to far end and then at data reader_1.seek(0, 1_000_000).await.unwrap(); let _sequence_number_east_3 = writer.store_entry(&entry_east_3, 0).await.unwrap().number; - let mut streams = reader_1.streams().unwrap(); + let mut streams = reader_1.streams(); assert_eq!(streams.len(), 2); let (_sequencer_id, mut stream_1) = streams.pop().unwrap(); let (_sequencer_id, mut stream_2) = streams.pop().unwrap(); assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); - - // seeking while streams are in use is an error - reader_1.seek(0, 0).await.unwrap_err(); drop(stream_1); drop(stream_2); + drop(streams); // seeking unknown sequencer is NOT an error reader_1.seek(0, 42).await.unwrap(); } - async fn assert_reader_content(reader: &R, expected: &[(u32, &[&Entry])]) + async fn assert_reader_content(reader: &mut R, expected: &[(u32, &[&Entry])]) where R: WriteBufferReading, { - let mut streams = reader.streams().unwrap(); + let mut streams = reader.streams(); assert_eq!(streams.len(), expected.len()); streams.sort_by_key(|(sequencer_id, _stream)| *sequencer_id); diff --git a/write_buffer/src/guard.rs b/write_buffer/src/guard.rs deleted file mode 100644 index 636193a15c..0000000000 --- a/write_buffer/src/guard.rs +++ /dev/null @@ -1,87 +0,0 @@ -use std::sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, -}; - -/// A semaphore that produces [`Send`]able guards. -pub struct Semaphore { - user_count: Arc, -} - -impl Semaphore { - /// Creates new semaphore with a single permit. - pub fn new() -> Self { - Self { - user_count: Arc::new(AtomicUsize::new(0)), - } - } - - /// Creates guard if no permit exists. - /// - /// To produce multiple guards, you can clone an existing one. - pub fn guard(&self) -> Option { - let count = self.user_count.fetch_add(1, Ordering::SeqCst); - if count > 0 { - self.user_count.fetch_sub(1, Ordering::SeqCst); - None - } else { - Some(Guard { - user_count: Arc::clone(&self.user_count), - }) - } - } -} - -/// Guard that hols a [`Semaphore`] permit. -/// -/// New guards can be produced in two ways: -/// - cloning an existing one -/// - when no guard exists: using [`Semaphore::guard`]. -pub struct Guard { - user_count: Arc, -} - -impl Guard { - /// Use a guard. - /// - /// This is a no-op but is helpful if you need to reference a guard within a closure. - pub fn use_here(&self) {} -} - -impl Clone for Guard { - /// Clone guard and increase usage count. 
- fn clone(&self) -> Self { - self.user_count.fetch_add(1, Ordering::SeqCst); - Self { - user_count: Arc::clone(&self.user_count), - } - } -} - -impl Drop for Guard { - fn drop(&mut self) { - self.user_count.fetch_sub(1, Ordering::SeqCst); - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test() { - let s = Semaphore::new(); - - let g = s.guard().unwrap(); - assert!(s.guard().is_none()); - drop(g); - - let g1 = s.guard().unwrap(); - let g2 = g1.clone(); - assert!(s.guard().is_none()); - drop(g1); - assert!(s.guard().is_none()); - drop(g2); - s.guard().unwrap(); - } -} diff --git a/write_buffer/src/kafka.rs b/write_buffer/src/kafka.rs index abd960a521..3255f43b2b 100644 --- a/write_buffer/src/kafka.rs +++ b/write_buffer/src/kafka.rs @@ -18,10 +18,7 @@ use rdkafka::{ ClientConfig, Message, Offset, TopicPartitionList, }; -use crate::{ - core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting}, - guard::Semaphore, -}; +use crate::core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting}; pub struct KafkaBufferProducer { conn: String, @@ -100,7 +97,6 @@ pub struct KafkaBufferConsumer { conn: String, database_name: String, consumers: BTreeMap>, - semaphore: Semaphore, } // Needed because rdkafka's StreamConsumer doesn't impl Debug @@ -115,23 +111,13 @@ impl std::fmt::Debug for KafkaBufferConsumer { #[async_trait] impl WriteBufferReading for KafkaBufferConsumer { - fn streams(&self) -> Result)>, WriteBufferError> { - let guard = self - .semaphore - .guard() - .ok_or_else::(|| "stream already in use".to_string().into())?; - - let streams: Vec<_> = self - .consumers + fn streams(&mut self) -> Vec<(u32, EntryStream<'_>)> { + self.consumers .iter() .map(|(sequencer_id, consumer)| { - let guard = guard.clone(); - let stream = consumer .stream() .map(move |message| { - guard.use_here(); - let message = message?; let entry = Entry::try_from(message.payload().unwrap().to_vec())?; let sequence = Sequence { @@ -144,17 +130,14 @@ impl WriteBufferReading for KafkaBufferConsumer { .boxed(); (*sequencer_id, stream) }) - .collect(); - - Ok(streams) + .collect() } - async fn seek(&self, sequencer_id: u32, sequence_number: u64) -> Result<(), WriteBufferError> { - let _guard = self - .semaphore - .guard() - .ok_or_else::(|| "stream already in use".to_string().into())?; - + async fn seek( + &mut self, + sequencer_id: u32, + sequence_number: u64, + ) -> Result<(), WriteBufferError> { if let Some(consumer) = self.consumers.get(&sequencer_id) { let consumer = Arc::clone(consumer); let database_name = self.database_name.clone(); @@ -234,7 +217,6 @@ impl KafkaBufferConsumer { conn, database_name, consumers, - semaphore: Semaphore::new(), }) } diff --git a/write_buffer/src/lib.rs b/write_buffer/src/lib.rs index a165fa3eb8..9e9472940a 100644 --- a/write_buffer/src/lib.rs +++ b/write_buffer/src/lib.rs @@ -10,6 +10,5 @@ pub mod config; pub mod core; -mod guard; pub mod kafka; pub mod mock; diff --git a/write_buffer/src/mock.rs b/write_buffer/src/mock.rs index 701b8af703..37659ba05b 100644 --- a/write_buffer/src/mock.rs +++ b/write_buffer/src/mock.rs @@ -5,10 +5,7 @@ use entry::{Entry, Sequence, SequencedEntry}; use futures::{stream, StreamExt}; use parking_lot::Mutex; -use crate::{ - core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting}, - guard::Semaphore, -}; +use crate::core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting}; type EntryResVec = Vec>; @@ -168,7 +165,6 @@ struct PlaybackState { pub struct 
MockBufferForReading { shared_state: MockBufferSharedState, playback_states: Arc>>, - semaphore: Semaphore, } impl MockBufferForReading { @@ -189,7 +185,6 @@ impl MockBufferForReading { Self { shared_state: state, playback_states: Arc::new(Mutex::new(playback_states)), - semaphore: Semaphore::new(), } } } @@ -202,12 +197,7 @@ impl std::fmt::Debug for MockBufferForReading { #[async_trait] impl WriteBufferReading for MockBufferForReading { - fn streams(&self) -> Result)>, WriteBufferError> { - let guard = self - .semaphore - .guard() - .ok_or_else::(|| "stream already in use".to_string().into())?; - + fn streams(&mut self) -> Vec<(u32, EntryStream<'_>)> { let sequencer_ids: Vec<_> = { let playback_states = self.playback_states.lock(); playback_states.keys().copied().collect() @@ -217,11 +207,8 @@ impl WriteBufferReading for MockBufferForReading { for sequencer_id in sequencer_ids { let shared_state = self.shared_state.clone(); let playback_states = Arc::clone(&self.playback_states); - let guard = guard.clone(); let stream = stream::poll_fn(move |_ctx| { - guard.use_here(); - let entries = shared_state.entries.lock(); let mut playback_states = playback_states.lock(); @@ -260,15 +247,14 @@ impl WriteBufferReading for MockBufferForReading { streams.push((sequencer_id, stream)); } - Ok(streams) + streams } - async fn seek(&self, sequencer_id: u32, sequence_number: u64) -> Result<(), WriteBufferError> { - let _guard = self - .semaphore - .guard() - .ok_or_else::(|| "stream already in use".to_string().into())?; - + async fn seek( + &mut self, + sequencer_id: u32, + sequence_number: u64, + ) -> Result<(), WriteBufferError> { let mut playback_states = self.playback_states.lock(); if let Some(playback_state) = playback_states.get_mut(&sequencer_id) { From c01cfbc34c21d98c3fda8ac2a90c83139a171be4 Mon Sep 17 00:00:00 2001 From: Marko Mikulicic Date: Tue, 20 Jul 2021 14:17:37 +0200 Subject: [PATCH 11/27] fix: Increase kafka message size --- write_buffer/src/kafka.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/write_buffer/src/kafka.rs b/write_buffer/src/kafka.rs index 15a27a401c..bcdea87659 100644 --- a/write_buffer/src/kafka.rs +++ b/write_buffer/src/kafka.rs @@ -77,8 +77,8 @@ impl KafkaBufferProducer { let mut cfg = ClientConfig::new(); cfg.set("bootstrap.servers", &conn); cfg.set("message.timeout.ms", "5000"); - cfg.set("message.max.bytes", "10000000"); - cfg.set("queue.buffering.max.kbytes", "10485760"); + cfg.set("message.max.bytes", "31457280"); + cfg.set("queue.buffering.max.kbytes", "31457280"); cfg.set("request.required.acks", "all"); // equivalent to acks=-1 let producer: FutureProducer = cfg.create()?; From cf8a60252d03ce52b7960bd23e1c891b5965da6a Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 20 Jul 2021 13:19:20 +0100 Subject: [PATCH 12/27] refactor: split system_tables module into smaller modules (#2061) Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- server/src/db/system_tables.rs | 698 +--------------------- server/src/db/system_tables/chunks.rs | 201 +++++++ server/src/db/system_tables/columns.rs | 404 +++++++++++++ server/src/db/system_tables/operations.rs | 108 ++++ 4 files changed, 729 insertions(+), 682 deletions(-) create mode 100644 server/src/db/system_tables/chunks.rs create mode 100644 server/src/db/system_tables/columns.rs create mode 100644 server/src/db/system_tables/operations.rs diff --git a/server/src/db/system_tables.rs 
b/server/src/db/system_tables.rs index f80f06b9bc..f83c793fa5 100644 --- a/server/src/db/system_tables.rs +++ b/server/src/db/system_tables.rs @@ -7,38 +7,30 @@ //! //! For example `SELECT * FROM system.chunks` -use std::convert::AsRef; +use std::any::Any; use std::sync::Arc; -use std::{any::Any, collections::HashMap}; - -use chrono::{DateTime, Utc}; use arrow::{ - array::{ - ArrayRef, StringArray, StringBuilder, Time64NanosecondArray, TimestampNanosecondArray, - UInt32Array, UInt32Builder, UInt64Array, UInt64Builder, - }, - datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}, + datatypes::{Field, Schema, SchemaRef}, error::Result, record_batch::RecordBatch, }; -use data_types::{ - chunk_metadata::{ChunkSummary, DetailedChunkSummary}, - error::ErrorLogger, - job::Job, - partition_metadata::PartitionSummary, -}; +use chrono::{DateTime, Utc}; + use datafusion::{ catalog::schema::SchemaProvider, datasource::{datasource::Statistics, TableProvider}, error::{DataFusionError, Result as DataFusionResult}, physical_plan::{memory::MemoryExec, ExecutionPlan}, }; -use tracker::TaskTracker; + +use crate::JobRegistry; use super::catalog::Catalog; -use crate::JobRegistry; -use data_types::partition_metadata::TableSummary; + +mod chunks; +mod columns; +mod operations; // The IOx system schema pub const SYSTEM_SCHEMA: &str = "system"; @@ -67,16 +59,16 @@ impl SystemSchemaProvider { pub fn new(db_name: impl Into, catalog: Arc, jobs: Arc) -> Self { let db_name = db_name.into(); let chunks = Arc::new(SystemTableProvider { - inner: ChunksTable::new(Arc::clone(&catalog)), + inner: chunks::ChunksTable::new(Arc::clone(&catalog)), }); let columns = Arc::new(SystemTableProvider { - inner: ColumnsTable::new(Arc::clone(&catalog)), + inner: columns::ColumnsTable::new(Arc::clone(&catalog)), }); let chunk_columns = Arc::new(SystemTableProvider { - inner: ChunkColumnsTable::new(catalog), + inner: columns::ChunkColumnsTable::new(catalog), }); let operations = Arc::new(SystemTableProvider { - inner: OperationsTable::new(db_name, jobs), + inner: operations::OperationsTable::new(db_name, jobs), }); Self { chunks, @@ -162,407 +154,6 @@ fn time_to_ts(time: Option>) -> Option { time.map(|ts| ts.timestamp_nanos()) } -/// Implementation of system.chunks table -#[derive(Debug)] -struct ChunksTable { - schema: SchemaRef, - catalog: Arc, -} - -impl ChunksTable { - fn new(catalog: Arc) -> Self { - Self { - schema: chunk_summaries_schema(), - catalog, - } - } -} - -impl IoxSystemTable for ChunksTable { - fn schema(&self) -> SchemaRef { - Arc::clone(&self.schema) - } - - fn batch(&self) -> Result { - from_chunk_summaries(self.schema(), self.catalog.chunk_summaries()) - .log_if_error("system.chunks table") - } -} - -fn chunk_summaries_schema() -> SchemaRef { - let ts = DataType::Timestamp(TimeUnit::Nanosecond, None); - Arc::new(Schema::new(vec![ - Field::new("id", DataType::UInt32, false), - Field::new("partition_key", DataType::Utf8, false), - Field::new("table_name", DataType::Utf8, false), - Field::new("storage", DataType::Utf8, false), - Field::new("lifecycle_action", DataType::Utf8, true), - Field::new("memory_bytes", DataType::UInt64, false), - Field::new("object_store_bytes", DataType::UInt64, false), - Field::new("row_count", DataType::UInt64, false), - Field::new("time_of_first_write", ts.clone(), true), - Field::new("time_of_last_write", ts.clone(), true), - Field::new("time_closed", ts, true), - ])) -} - -fn from_chunk_summaries(schema: SchemaRef, chunks: Vec) -> Result { - let id = chunks.iter().map(|c| 
Some(c.id)).collect::(); - let partition_key = chunks - .iter() - .map(|c| Some(c.partition_key.as_ref())) - .collect::(); - let table_name = chunks - .iter() - .map(|c| Some(c.table_name.as_ref())) - .collect::(); - let storage = chunks - .iter() - .map(|c| Some(c.storage.as_str())) - .collect::(); - let lifecycle_action = chunks - .iter() - .map(|c| c.lifecycle_action.map(|a| a.name())) - .collect::(); - let memory_bytes = chunks - .iter() - .map(|c| Some(c.memory_bytes as u64)) - .collect::(); - let object_store_bytes = chunks - .iter() - .map(|c| Some(c.object_store_bytes as u64).filter(|&v| v > 0)) - .collect::(); - let row_counts = chunks - .iter() - .map(|c| Some(c.row_count as u64)) - .collect::(); - let time_of_first_write = chunks - .iter() - .map(|c| c.time_of_first_write) - .map(time_to_ts) - .collect::(); - let time_of_last_write = chunks - .iter() - .map(|c| c.time_of_last_write) - .map(time_to_ts) - .collect::(); - let time_closed = chunks - .iter() - .map(|c| c.time_closed) - .map(time_to_ts) - .collect::(); - - RecordBatch::try_new( - schema, - vec![ - Arc::new(id), - Arc::new(partition_key), - Arc::new(table_name), - Arc::new(storage), - Arc::new(lifecycle_action), - Arc::new(memory_bytes), - Arc::new(object_store_bytes), - Arc::new(row_counts), - Arc::new(time_of_first_write), - Arc::new(time_of_last_write), - Arc::new(time_closed), - ], - ) -} - -/// Implementation of `system.columns` system table -#[derive(Debug)] -struct ColumnsTable { - schema: SchemaRef, - catalog: Arc, -} - -impl ColumnsTable { - fn new(catalog: Arc) -> Self { - Self { - schema: partition_summaries_schema(), - catalog, - } - } -} - -impl IoxSystemTable for ColumnsTable { - fn schema(&self) -> SchemaRef { - Arc::clone(&self.schema) - } - fn batch(&self) -> Result { - from_partition_summaries(self.schema(), self.catalog.partition_summaries()) - .log_if_error("system.columns table") - } -} - -fn partition_summaries_schema() -> SchemaRef { - Arc::new(Schema::new(vec![ - Field::new("partition_key", DataType::Utf8, false), - Field::new("table_name", DataType::Utf8, false), - Field::new("column_name", DataType::Utf8, false), - Field::new("column_type", DataType::Utf8, false), - Field::new("influxdb_type", DataType::Utf8, true), - ])) -} - -fn from_partition_summaries( - schema: SchemaRef, - partitions: Vec, -) -> Result { - // Assume each partition has roughly 5 tables with 5 columns - let row_estimate = partitions.len() * 25; - - let mut partition_key = StringBuilder::new(row_estimate); - let mut table_name = StringBuilder::new(row_estimate); - let mut column_name = StringBuilder::new(row_estimate); - let mut column_type = StringBuilder::new(row_estimate); - let mut influxdb_type = StringBuilder::new(row_estimate); - - // Note no rows are produced for partitions with no tabes, or - // tables with no columns: There are other tables to list tables - // and columns - for partition in partitions { - let table = partition.table; - for column in table.columns { - partition_key.append_value(&partition.key)?; - table_name.append_value(&table.name)?; - column_name.append_value(&column.name)?; - column_type.append_value(column.type_name())?; - if let Some(t) = &column.influxdb_type { - influxdb_type.append_value(t.as_str())?; - } else { - influxdb_type.append_null()?; - } - } - } - - RecordBatch::try_new( - schema, - vec![ - Arc::new(partition_key.finish()) as ArrayRef, - Arc::new(table_name.finish()), - Arc::new(column_name.finish()), - Arc::new(column_type.finish()), - Arc::new(influxdb_type.finish()), - ], - ) 
-} - -/// Implementation of system.column_chunks table -#[derive(Debug)] -struct ChunkColumnsTable { - schema: SchemaRef, - catalog: Arc, -} - -impl ChunkColumnsTable { - fn new(catalog: Arc) -> Self { - Self { - schema: chunk_columns_schema(), - catalog, - } - } -} - -impl IoxSystemTable for ChunkColumnsTable { - fn schema(&self) -> SchemaRef { - Arc::clone(&self.schema) - } - - fn batch(&self) -> Result { - assemble_chunk_columns(self.schema(), self.catalog.detailed_chunk_summaries()) - .log_if_error("system.column_chunks table") - } -} - -fn chunk_columns_schema() -> SchemaRef { - Arc::new(Schema::new(vec![ - Field::new("partition_key", DataType::Utf8, false), - Field::new("chunk_id", DataType::UInt32, false), - Field::new("table_name", DataType::Utf8, false), - Field::new("column_name", DataType::Utf8, false), - Field::new("storage", DataType::Utf8, false), - Field::new("row_count", DataType::UInt64, true), - Field::new("min_value", DataType::Utf8, true), - Field::new("max_value", DataType::Utf8, true), - Field::new("memory_bytes", DataType::UInt64, true), - ])) -} - -fn assemble_chunk_columns( - schema: SchemaRef, - chunk_summaries: Vec<(Arc, DetailedChunkSummary)>, -) -> Result { - /// Builds an index from column_name -> size - fn make_column_index(summary: &DetailedChunkSummary) -> HashMap<&str, u64> { - summary - .columns - .iter() - .map(|column_summary| { - ( - column_summary.name.as_ref(), - column_summary.memory_bytes as u64, - ) - }) - .collect() - } - - // Assume each chunk has roughly 5 columns - let row_estimate = chunk_summaries.len() * 5; - - let mut partition_key = StringBuilder::new(row_estimate); - let mut chunk_id = UInt32Builder::new(row_estimate); - let mut table_name = StringBuilder::new(row_estimate); - let mut column_name = StringBuilder::new(row_estimate); - let mut storage = StringBuilder::new(row_estimate); - let mut row_count = UInt64Builder::new(row_estimate); - let mut min_values = StringBuilder::new(row_estimate); - let mut max_values = StringBuilder::new(row_estimate); - let mut memory_bytes = UInt64Builder::new(row_estimate); - - // Note no rows are produced for partitions with no chunks, or - // tables with no partitions: There are other tables to list tables - // and columns - for (table_summary, chunk_summary) in chunk_summaries { - let mut column_index = make_column_index(&chunk_summary); - let storage_value = chunk_summary.inner.storage.as_str(); - - for column in &table_summary.columns { - partition_key.append_value(chunk_summary.inner.partition_key.as_ref())?; - chunk_id.append_value(chunk_summary.inner.id)?; - table_name.append_value(&chunk_summary.inner.table_name)?; - column_name.append_value(&column.name)?; - storage.append_value(storage_value)?; - row_count.append_value(column.count())?; - if let Some(v) = column.stats.min_as_str() { - min_values.append_value(v)?; - } else { - min_values.append(false)?; - } - if let Some(v) = column.stats.max_as_str() { - max_values.append_value(v)?; - } else { - max_values.append(false)?; - } - - let size = column_index.remove(column.name.as_str()); - - memory_bytes.append_option(size)?; - } - } - - RecordBatch::try_new( - schema, - vec![ - Arc::new(partition_key.finish()) as ArrayRef, - Arc::new(chunk_id.finish()), - Arc::new(table_name.finish()), - Arc::new(column_name.finish()), - Arc::new(storage.finish()), - Arc::new(row_count.finish()), - Arc::new(min_values.finish()), - Arc::new(max_values.finish()), - Arc::new(memory_bytes.finish()), - ], - ) -} - -/// Implementation of system.operations table 
-#[derive(Debug)] -struct OperationsTable { - schema: SchemaRef, - db_name: String, - jobs: Arc, -} - -impl OperationsTable { - fn new(db_name: String, jobs: Arc) -> Self { - Self { - schema: operations_schema(), - db_name, - jobs, - } - } -} - -impl IoxSystemTable for OperationsTable { - fn schema(&self) -> SchemaRef { - Arc::clone(&self.schema) - } - - fn batch(&self) -> Result { - from_task_trackers(self.schema(), &self.db_name, self.jobs.tracked()) - .log_if_error("system.operations table") - } -} - -fn operations_schema() -> SchemaRef { - let ts = DataType::Time64(TimeUnit::Nanosecond); - Arc::new(Schema::new(vec![ - Field::new("id", DataType::Utf8, false), - Field::new("status", DataType::Utf8, true), - Field::new("cpu_time_used", ts.clone(), true), - Field::new("wall_time_used", ts, true), - Field::new("partition_key", DataType::Utf8, true), - Field::new("chunk_id", DataType::UInt32, true), - Field::new("description", DataType::Utf8, true), - ])) -} - -fn from_task_trackers( - schema: SchemaRef, - db_name: &str, - jobs: Vec>, -) -> Result { - let jobs = jobs - .into_iter() - .filter(|job| job.metadata().db_name() == Some(db_name)) - .collect::>(); - - let ids = jobs - .iter() - .map(|job| Some(job.id().to_string())) - .collect::(); - let statuses = jobs - .iter() - .map(|job| Some(job.get_status().name())) - .collect::(); - let cpu_time_used = jobs - .iter() - .map(|job| job.get_status().cpu_nanos().map(|n| n as i64)) - .collect::(); - let wall_time_used = jobs - .iter() - .map(|job| job.get_status().wall_nanos().map(|n| n as i64)) - .collect::(); - let partition_keys = jobs - .iter() - .map(|job| job.metadata().partition_key()) - .collect::(); - let chunk_ids = jobs - .iter() - .map(|job| job.metadata().chunk_id()) - .collect::(); - let descriptions = jobs - .iter() - .map(|job| Some(job.metadata().description())) - .collect::(); - - RecordBatch::try_new( - schema, - vec![ - Arc::new(ids) as ArrayRef, - Arc::new(statuses), - Arc::new(cpu_time_used), - Arc::new(wall_time_used), - Arc::new(partition_keys), - Arc::new(chunk_ids), - Arc::new(descriptions), - ], - ) -} - /// Creates a DataFusion ExecutionPlan node that scans a single batch /// of records. 
fn scan_batch( @@ -605,141 +196,10 @@ fn scan_batch( #[cfg(test)] mod tests { - use super::*; + use arrow::array::{ArrayRef, UInt64Array}; use arrow_util::assert_batches_eq; - use chrono::NaiveDateTime; - use data_types::{ - chunk_metadata::{ChunkColumnSummary, ChunkLifecycleAction, ChunkStorage}, - partition_metadata::{ColumnSummary, InfluxDbType, StatValues, Statistics, TableSummary}, - }; - #[test] - fn test_from_chunk_summaries() { - let chunks = vec![ - ChunkSummary { - partition_key: Arc::from("p1"), - table_name: Arc::from("table1"), - id: 0, - storage: ChunkStorage::OpenMutableBuffer, - lifecycle_action: None, - memory_bytes: 23754, - object_store_bytes: 0, - row_count: 11, - time_of_first_write: Some(DateTime::from_utc( - NaiveDateTime::from_timestamp(10, 0), - Utc, - )), - time_of_last_write: None, - time_closed: None, - }, - ChunkSummary { - partition_key: Arc::from("p1"), - table_name: Arc::from("table1"), - id: 1, - storage: ChunkStorage::OpenMutableBuffer, - lifecycle_action: Some(ChunkLifecycleAction::Persisting), - memory_bytes: 23455, - object_store_bytes: 0, - row_count: 22, - time_of_first_write: None, - time_of_last_write: Some(DateTime::from_utc( - NaiveDateTime::from_timestamp(80, 0), - Utc, - )), - time_closed: None, - }, - ChunkSummary { - partition_key: Arc::from("p1"), - table_name: Arc::from("table1"), - id: 2, - storage: ChunkStorage::ObjectStoreOnly, - lifecycle_action: None, - memory_bytes: 1234, - object_store_bytes: 5678, - row_count: 33, - time_of_first_write: Some(DateTime::from_utc( - NaiveDateTime::from_timestamp(100, 0), - Utc, - )), - time_of_last_write: Some(DateTime::from_utc( - NaiveDateTime::from_timestamp(200, 0), - Utc, - )), - time_closed: None, - }, - ]; - - let expected = vec![ - "+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+", - "| id | partition_key | table_name | storage | lifecycle_action | memory_bytes | object_store_bytes | row_count | time_of_first_write | time_of_last_write | time_closed |", - "+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+", - "| 0 | p1 | table1 | OpenMutableBuffer | | 23754 | | 11 | 1970-01-01 00:00:10 | | |", - "| 1 | p1 | table1 | OpenMutableBuffer | Persisting to Object Storage | 23455 | | 22 | | 1970-01-01 00:01:20 | |", - "| 2 | p1 | table1 | ObjectStoreOnly | | 1234 | 5678 | 33 | 1970-01-01 00:01:40 | 1970-01-01 00:03:20 | |", - "+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+", - ]; - - let schema = chunk_summaries_schema(); - let batch = from_chunk_summaries(schema, chunks).unwrap(); - assert_batches_eq!(&expected, &[batch]); - } - - #[test] - fn test_from_partition_summaries() { - let partitions = vec![ - PartitionSummary { - key: "p1".to_string(), - table: TableSummary { - name: "t1".to_string(), - columns: vec![ - ColumnSummary { - name: "c1".to_string(), - influxdb_type: Some(InfluxDbType::Tag), - stats: Statistics::I64(StatValues::new_with_value(23)), - }, - ColumnSummary { - name: "c2".to_string(), - influxdb_type: Some(InfluxDbType::Field), - stats: Statistics::I64(StatValues::new_with_value(43)), - }, - ColumnSummary { - name: "c3".to_string(), - influxdb_type: None, - 
stats: Statistics::String(StatValues::new_with_value( - "foo".to_string(), - )), - }, - ColumnSummary { - name: "time".to_string(), - influxdb_type: Some(InfluxDbType::Timestamp), - stats: Statistics::I64(StatValues::new_with_value(43)), - }, - ], - }, - }, - PartitionSummary { - key: "p3".to_string(), - table: TableSummary { - name: "t1".to_string(), - columns: vec![], - }, - }, - ]; - - let expected = vec![ - "+---------------+------------+-------------+-------------+---------------+", - "| partition_key | table_name | column_name | column_type | influxdb_type |", - "+---------------+------------+-------------+-------------+---------------+", - "| p1 | t1 | c1 | I64 | Tag |", - "| p1 | t1 | c2 | I64 | Field |", - "| p1 | t1 | c3 | String | |", - "| p1 | t1 | time | I64 | Timestamp |", - "+---------------+------------+-------------+-------------+---------------+", - ]; - - let batch = from_partition_summaries(partition_summaries_schema(), partitions).unwrap(); - assert_batches_eq!(&expected, &[batch]); - } + use super::*; fn seq_array(start: u64, end: u64) -> ArrayRef { Arc::new(UInt64Array::from_iter_values(start..end)) @@ -820,130 +280,4 @@ mod tests { err_string ); } - - #[test] - fn test_assemble_chunk_columns() { - let lifecycle_action = None; - - let summaries = vec![ - ( - Arc::new(TableSummary { - name: "t1".to_string(), - columns: vec![ - ColumnSummary { - name: "c1".to_string(), - influxdb_type: Some(InfluxDbType::Field), - stats: Statistics::String(StatValues::new( - Some("bar".to_string()), - Some("foo".to_string()), - 55, - )), - }, - ColumnSummary { - name: "c2".to_string(), - influxdb_type: Some(InfluxDbType::Field), - stats: Statistics::F64(StatValues::new(Some(11.0), Some(43.0), 66)), - }, - ], - }), - DetailedChunkSummary { - inner: ChunkSummary { - partition_key: "p1".into(), - table_name: "t1".into(), - id: 42, - storage: ChunkStorage::ReadBuffer, - lifecycle_action, - memory_bytes: 23754, - object_store_bytes: 0, - row_count: 11, - time_of_first_write: None, - time_of_last_write: None, - time_closed: None, - }, - columns: vec![ - ChunkColumnSummary { - name: "c1".into(), - memory_bytes: 11, - }, - ChunkColumnSummary { - name: "c2".into(), - memory_bytes: 12, - }, - ], - }, - ), - ( - Arc::new(TableSummary { - name: "t1".to_string(), - columns: vec![ColumnSummary { - name: "c1".to_string(), - influxdb_type: Some(InfluxDbType::Field), - stats: Statistics::F64(StatValues::new(Some(110.0), Some(430.0), 667)), - }], - }), - DetailedChunkSummary { - inner: ChunkSummary { - partition_key: "p2".into(), - table_name: "t1".into(), - id: 43, - storage: ChunkStorage::OpenMutableBuffer, - lifecycle_action, - memory_bytes: 23754, - object_store_bytes: 0, - row_count: 11, - time_of_first_write: None, - time_of_last_write: None, - time_closed: None, - }, - columns: vec![ChunkColumnSummary { - name: "c1".into(), - memory_bytes: 100, - }], - }, - ), - ( - Arc::new(TableSummary { - name: "t2".to_string(), - columns: vec![ColumnSummary { - name: "c3".to_string(), - influxdb_type: Some(InfluxDbType::Field), - stats: Statistics::F64(StatValues::new(Some(-1.0), Some(2.0), 4)), - }], - }), - DetailedChunkSummary { - inner: ChunkSummary { - partition_key: "p2".into(), - table_name: "t2".into(), - id: 44, - storage: ChunkStorage::OpenMutableBuffer, - lifecycle_action, - memory_bytes: 23754, - object_store_bytes: 0, - row_count: 11, - time_of_first_write: None, - time_of_last_write: None, - time_closed: None, - }, - columns: vec![ChunkColumnSummary { - name: "c3".into(), - memory_bytes: 200, - 
}], - }, - ), - ]; - - let expected = vec![ - "+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+", - "| partition_key | chunk_id | table_name | column_name | storage | row_count | min_value | max_value | memory_bytes |", - "+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+", - "| p1 | 42 | t1 | c1 | ReadBuffer | 55 | bar | foo | 11 |", - "| p1 | 42 | t1 | c2 | ReadBuffer | 66 | 11 | 43 | 12 |", - "| p2 | 43 | t1 | c1 | OpenMutableBuffer | 667 | 110 | 430 | 100 |", - "| p2 | 44 | t2 | c3 | OpenMutableBuffer | 4 | -1 | 2 | 200 |", - "+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+", - ]; - - let batch = assemble_chunk_columns(chunk_columns_schema(), summaries).unwrap(); - assert_batches_eq!(&expected, &[batch]); - } } diff --git a/server/src/db/system_tables/chunks.rs b/server/src/db/system_tables/chunks.rs new file mode 100644 index 0000000000..90acda0629 --- /dev/null +++ b/server/src/db/system_tables/chunks.rs @@ -0,0 +1,201 @@ +use std::sync::Arc; + +use arrow::array::{StringArray, TimestampNanosecondArray, UInt32Array, UInt64Array}; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; +use arrow::error::Result; +use arrow::record_batch::RecordBatch; + +use data_types::chunk_metadata::ChunkSummary; +use data_types::error::ErrorLogger; + +use crate::db::catalog::Catalog; +use crate::db::system_tables::{time_to_ts, IoxSystemTable}; + +/// Implementation of system.chunks table +#[derive(Debug)] +pub(super) struct ChunksTable { + schema: SchemaRef, + catalog: Arc, +} + +impl ChunksTable { + pub(super) fn new(catalog: Arc) -> Self { + Self { + schema: chunk_summaries_schema(), + catalog, + } + } +} + +impl IoxSystemTable for ChunksTable { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn batch(&self) -> Result { + from_chunk_summaries(self.schema(), self.catalog.chunk_summaries()) + .log_if_error("system.chunks table") + } +} + +fn chunk_summaries_schema() -> SchemaRef { + let ts = DataType::Timestamp(TimeUnit::Nanosecond, None); + Arc::new(Schema::new(vec![ + Field::new("id", DataType::UInt32, false), + Field::new("partition_key", DataType::Utf8, false), + Field::new("table_name", DataType::Utf8, false), + Field::new("storage", DataType::Utf8, false), + Field::new("lifecycle_action", DataType::Utf8, true), + Field::new("memory_bytes", DataType::UInt64, false), + Field::new("object_store_bytes", DataType::UInt64, false), + Field::new("row_count", DataType::UInt64, false), + Field::new("time_of_first_write", ts.clone(), true), + Field::new("time_of_last_write", ts.clone(), true), + Field::new("time_closed", ts, true), + ])) +} + +fn from_chunk_summaries(schema: SchemaRef, chunks: Vec) -> Result { + let id = chunks.iter().map(|c| Some(c.id)).collect::(); + let partition_key = chunks + .iter() + .map(|c| Some(c.partition_key.as_ref())) + .collect::(); + let table_name = chunks + .iter() + .map(|c| Some(c.table_name.as_ref())) + .collect::(); + let storage = chunks + .iter() + .map(|c| Some(c.storage.as_str())) + .collect::(); + let lifecycle_action = chunks + .iter() + .map(|c| c.lifecycle_action.map(|a| a.name())) + .collect::(); + let memory_bytes = chunks + .iter() + .map(|c| Some(c.memory_bytes as u64)) + .collect::(); + let object_store_bytes = chunks + .iter() + .map(|c| Some(c.object_store_bytes as u64).filter(|&v| v > 
0)) + .collect::(); + let row_counts = chunks + .iter() + .map(|c| Some(c.row_count as u64)) + .collect::(); + let time_of_first_write = chunks + .iter() + .map(|c| c.time_of_first_write) + .map(time_to_ts) + .collect::(); + let time_of_last_write = chunks + .iter() + .map(|c| c.time_of_last_write) + .map(time_to_ts) + .collect::(); + let time_closed = chunks + .iter() + .map(|c| c.time_closed) + .map(time_to_ts) + .collect::(); + + RecordBatch::try_new( + schema, + vec![ + Arc::new(id), + Arc::new(partition_key), + Arc::new(table_name), + Arc::new(storage), + Arc::new(lifecycle_action), + Arc::new(memory_bytes), + Arc::new(object_store_bytes), + Arc::new(row_counts), + Arc::new(time_of_first_write), + Arc::new(time_of_last_write), + Arc::new(time_closed), + ], + ) +} + +#[cfg(test)] +mod tests { + use chrono::{DateTime, NaiveDateTime, Utc}; + + use arrow_util::assert_batches_eq; + use data_types::chunk_metadata::{ChunkLifecycleAction, ChunkStorage}; + + use super::*; + + #[test] + fn test_from_chunk_summaries() { + let chunks = vec![ + ChunkSummary { + partition_key: Arc::from("p1"), + table_name: Arc::from("table1"), + id: 0, + storage: ChunkStorage::OpenMutableBuffer, + lifecycle_action: None, + memory_bytes: 23754, + object_store_bytes: 0, + row_count: 11, + time_of_first_write: Some(DateTime::from_utc( + NaiveDateTime::from_timestamp(10, 0), + Utc, + )), + time_of_last_write: None, + time_closed: None, + }, + ChunkSummary { + partition_key: Arc::from("p1"), + table_name: Arc::from("table1"), + id: 1, + storage: ChunkStorage::OpenMutableBuffer, + lifecycle_action: Some(ChunkLifecycleAction::Persisting), + memory_bytes: 23455, + object_store_bytes: 0, + row_count: 22, + time_of_first_write: None, + time_of_last_write: Some(DateTime::from_utc( + NaiveDateTime::from_timestamp(80, 0), + Utc, + )), + time_closed: None, + }, + ChunkSummary { + partition_key: Arc::from("p1"), + table_name: Arc::from("table1"), + id: 2, + storage: ChunkStorage::ObjectStoreOnly, + lifecycle_action: None, + memory_bytes: 1234, + object_store_bytes: 5678, + row_count: 33, + time_of_first_write: Some(DateTime::from_utc( + NaiveDateTime::from_timestamp(100, 0), + Utc, + )), + time_of_last_write: Some(DateTime::from_utc( + NaiveDateTime::from_timestamp(200, 0), + Utc, + )), + time_closed: None, + }, + ]; + + let expected = vec![ + "+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+", + "| id | partition_key | table_name | storage | lifecycle_action | memory_bytes | object_store_bytes | row_count | time_of_first_write | time_of_last_write | time_closed |", + "+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+", + "| 0 | p1 | table1 | OpenMutableBuffer | | 23754 | | 11 | 1970-01-01 00:00:10 | | |", + "| 1 | p1 | table1 | OpenMutableBuffer | Persisting to Object Storage | 23455 | | 22 | | 1970-01-01 00:01:20 | |", + "| 2 | p1 | table1 | ObjectStoreOnly | | 1234 | 5678 | 33 | 1970-01-01 00:01:40 | 1970-01-01 00:03:20 | |", + "+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+", + ]; + + let schema = chunk_summaries_schema(); + let batch = from_chunk_summaries(schema, chunks).unwrap(); + 
assert_batches_eq!(&expected, &[batch]); + } +} diff --git a/server/src/db/system_tables/columns.rs b/server/src/db/system_tables/columns.rs new file mode 100644 index 0000000000..5f0b8f6fdd --- /dev/null +++ b/server/src/db/system_tables/columns.rs @@ -0,0 +1,404 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use arrow::array::{ArrayRef, StringBuilder, UInt32Builder, UInt64Builder}; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow::error::Result; +use arrow::record_batch::RecordBatch; + +use data_types::chunk_metadata::DetailedChunkSummary; +use data_types::error::ErrorLogger; +use data_types::partition_metadata::{PartitionSummary, TableSummary}; + +use crate::db::catalog::Catalog; +use crate::db::system_tables::IoxSystemTable; + +/// Implementation of `system.columns` system table +#[derive(Debug)] +pub(super) struct ColumnsTable { + schema: SchemaRef, + catalog: Arc, +} + +impl ColumnsTable { + pub(super) fn new(catalog: Arc) -> Self { + Self { + schema: partition_summaries_schema(), + catalog, + } + } +} + +impl IoxSystemTable for ColumnsTable { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + fn batch(&self) -> Result { + from_partition_summaries(self.schema(), self.catalog.partition_summaries()) + .log_if_error("system.columns table") + } +} + +fn partition_summaries_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("partition_key", DataType::Utf8, false), + Field::new("table_name", DataType::Utf8, false), + Field::new("column_name", DataType::Utf8, false), + Field::new("column_type", DataType::Utf8, false), + Field::new("influxdb_type", DataType::Utf8, true), + ])) +} + +fn from_partition_summaries( + schema: SchemaRef, + partitions: Vec, +) -> Result { + // Assume each partition has roughly 5 tables with 5 columns + let row_estimate = partitions.len() * 25; + + let mut partition_key = StringBuilder::new(row_estimate); + let mut table_name = StringBuilder::new(row_estimate); + let mut column_name = StringBuilder::new(row_estimate); + let mut column_type = StringBuilder::new(row_estimate); + let mut influxdb_type = StringBuilder::new(row_estimate); + + // Note no rows are produced for partitions with no tabes, or + // tables with no columns: There are other tables to list tables + // and columns + for partition in partitions { + let table = partition.table; + for column in table.columns { + partition_key.append_value(&partition.key)?; + table_name.append_value(&table.name)?; + column_name.append_value(&column.name)?; + column_type.append_value(column.type_name())?; + if let Some(t) = &column.influxdb_type { + influxdb_type.append_value(t.as_str())?; + } else { + influxdb_type.append_null()?; + } + } + } + + RecordBatch::try_new( + schema, + vec![ + Arc::new(partition_key.finish()) as ArrayRef, + Arc::new(table_name.finish()), + Arc::new(column_name.finish()), + Arc::new(column_type.finish()), + Arc::new(influxdb_type.finish()), + ], + ) +} + +/// Implementation of system.column_chunks table +#[derive(Debug)] +pub(super) struct ChunkColumnsTable { + schema: SchemaRef, + catalog: Arc, +} + +impl ChunkColumnsTable { + pub(super) fn new(catalog: Arc) -> Self { + Self { + schema: chunk_columns_schema(), + catalog, + } + } +} + +impl IoxSystemTable for ChunkColumnsTable { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn batch(&self) -> Result { + assemble_chunk_columns(self.schema(), self.catalog.detailed_chunk_summaries()) + .log_if_error("system.column_chunks table") + } +} + +fn 
chunk_columns_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("partition_key", DataType::Utf8, false), + Field::new("chunk_id", DataType::UInt32, false), + Field::new("table_name", DataType::Utf8, false), + Field::new("column_name", DataType::Utf8, false), + Field::new("storage", DataType::Utf8, false), + Field::new("row_count", DataType::UInt64, true), + Field::new("min_value", DataType::Utf8, true), + Field::new("max_value", DataType::Utf8, true), + Field::new("memory_bytes", DataType::UInt64, true), + ])) +} + +fn assemble_chunk_columns( + schema: SchemaRef, + chunk_summaries: Vec<(Arc, DetailedChunkSummary)>, +) -> Result { + /// Builds an index from column_name -> size + fn make_column_index(summary: &DetailedChunkSummary) -> HashMap<&str, u64> { + summary + .columns + .iter() + .map(|column_summary| { + ( + column_summary.name.as_ref(), + column_summary.memory_bytes as u64, + ) + }) + .collect() + } + + // Assume each chunk has roughly 5 columns + let row_estimate = chunk_summaries.len() * 5; + + let mut partition_key = StringBuilder::new(row_estimate); + let mut chunk_id = UInt32Builder::new(row_estimate); + let mut table_name = StringBuilder::new(row_estimate); + let mut column_name = StringBuilder::new(row_estimate); + let mut storage = StringBuilder::new(row_estimate); + let mut row_count = UInt64Builder::new(row_estimate); + let mut min_values = StringBuilder::new(row_estimate); + let mut max_values = StringBuilder::new(row_estimate); + let mut memory_bytes = UInt64Builder::new(row_estimate); + + // Note no rows are produced for partitions with no chunks, or + // tables with no partitions: There are other tables to list tables + // and columns + for (table_summary, chunk_summary) in chunk_summaries { + let mut column_index = make_column_index(&chunk_summary); + let storage_value = chunk_summary.inner.storage.as_str(); + + for column in &table_summary.columns { + partition_key.append_value(chunk_summary.inner.partition_key.as_ref())?; + chunk_id.append_value(chunk_summary.inner.id)?; + table_name.append_value(&chunk_summary.inner.table_name)?; + column_name.append_value(&column.name)?; + storage.append_value(storage_value)?; + row_count.append_value(column.count())?; + if let Some(v) = column.stats.min_as_str() { + min_values.append_value(v)?; + } else { + min_values.append(false)?; + } + if let Some(v) = column.stats.max_as_str() { + max_values.append_value(v)?; + } else { + max_values.append(false)?; + } + + let size = column_index.remove(column.name.as_str()); + + memory_bytes.append_option(size)?; + } + } + + RecordBatch::try_new( + schema, + vec![ + Arc::new(partition_key.finish()) as ArrayRef, + Arc::new(chunk_id.finish()), + Arc::new(table_name.finish()), + Arc::new(column_name.finish()), + Arc::new(storage.finish()), + Arc::new(row_count.finish()), + Arc::new(min_values.finish()), + Arc::new(max_values.finish()), + Arc::new(memory_bytes.finish()), + ], + ) +} + +#[cfg(test)] +mod tests { + use arrow_util::assert_batches_eq; + use data_types::chunk_metadata::{ChunkColumnSummary, ChunkStorage, ChunkSummary}; + use data_types::partition_metadata::{ColumnSummary, InfluxDbType, StatValues, Statistics}; + + use super::*; + + #[test] + fn test_from_partition_summaries() { + let partitions = vec![ + PartitionSummary { + key: "p1".to_string(), + table: TableSummary { + name: "t1".to_string(), + columns: vec![ + ColumnSummary { + name: "c1".to_string(), + influxdb_type: Some(InfluxDbType::Tag), + stats: Statistics::I64(StatValues::new_with_value(23)), + }, + 
ColumnSummary { + name: "c2".to_string(), + influxdb_type: Some(InfluxDbType::Field), + stats: Statistics::I64(StatValues::new_with_value(43)), + }, + ColumnSummary { + name: "c3".to_string(), + influxdb_type: None, + stats: Statistics::String(StatValues::new_with_value( + "foo".to_string(), + )), + }, + ColumnSummary { + name: "time".to_string(), + influxdb_type: Some(InfluxDbType::Timestamp), + stats: Statistics::I64(StatValues::new_with_value(43)), + }, + ], + }, + }, + PartitionSummary { + key: "p3".to_string(), + table: TableSummary { + name: "t1".to_string(), + columns: vec![], + }, + }, + ]; + + let expected = vec![ + "+---------------+------------+-------------+-------------+---------------+", + "| partition_key | table_name | column_name | column_type | influxdb_type |", + "+---------------+------------+-------------+-------------+---------------+", + "| p1 | t1 | c1 | I64 | Tag |", + "| p1 | t1 | c2 | I64 | Field |", + "| p1 | t1 | c3 | String | |", + "| p1 | t1 | time | I64 | Timestamp |", + "+---------------+------------+-------------+-------------+---------------+", + ]; + + let batch = from_partition_summaries(partition_summaries_schema(), partitions).unwrap(); + assert_batches_eq!(&expected, &[batch]); + } + + #[test] + fn test_assemble_chunk_columns() { + let lifecycle_action = None; + + let summaries = vec![ + ( + Arc::new(TableSummary { + name: "t1".to_string(), + columns: vec![ + ColumnSummary { + name: "c1".to_string(), + influxdb_type: Some(InfluxDbType::Field), + stats: Statistics::String(StatValues::new( + Some("bar".to_string()), + Some("foo".to_string()), + 55, + )), + }, + ColumnSummary { + name: "c2".to_string(), + influxdb_type: Some(InfluxDbType::Field), + stats: Statistics::F64(StatValues::new(Some(11.0), Some(43.0), 66)), + }, + ], + }), + DetailedChunkSummary { + inner: ChunkSummary { + partition_key: "p1".into(), + table_name: "t1".into(), + id: 42, + storage: ChunkStorage::ReadBuffer, + lifecycle_action, + memory_bytes: 23754, + object_store_bytes: 0, + row_count: 11, + time_of_first_write: None, + time_of_last_write: None, + time_closed: None, + }, + columns: vec![ + ChunkColumnSummary { + name: "c1".into(), + memory_bytes: 11, + }, + ChunkColumnSummary { + name: "c2".into(), + memory_bytes: 12, + }, + ], + }, + ), + ( + Arc::new(TableSummary { + name: "t1".to_string(), + columns: vec![ColumnSummary { + name: "c1".to_string(), + influxdb_type: Some(InfluxDbType::Field), + stats: Statistics::F64(StatValues::new(Some(110.0), Some(430.0), 667)), + }], + }), + DetailedChunkSummary { + inner: ChunkSummary { + partition_key: "p2".into(), + table_name: "t1".into(), + id: 43, + storage: ChunkStorage::OpenMutableBuffer, + lifecycle_action, + memory_bytes: 23754, + object_store_bytes: 0, + row_count: 11, + time_of_first_write: None, + time_of_last_write: None, + time_closed: None, + }, + columns: vec![ChunkColumnSummary { + name: "c1".into(), + memory_bytes: 100, + }], + }, + ), + ( + Arc::new(TableSummary { + name: "t2".to_string(), + columns: vec![ColumnSummary { + name: "c3".to_string(), + influxdb_type: Some(InfluxDbType::Field), + stats: Statistics::F64(StatValues::new(Some(-1.0), Some(2.0), 4)), + }], + }), + DetailedChunkSummary { + inner: ChunkSummary { + partition_key: "p2".into(), + table_name: "t2".into(), + id: 44, + storage: ChunkStorage::OpenMutableBuffer, + lifecycle_action, + memory_bytes: 23754, + object_store_bytes: 0, + row_count: 11, + time_of_first_write: None, + time_of_last_write: None, + time_closed: None, + }, + columns: 
vec![ChunkColumnSummary { + name: "c3".into(), + memory_bytes: 200, + }], + }, + ), + ]; + + let expected = vec![ + "+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+", + "| partition_key | chunk_id | table_name | column_name | storage | row_count | min_value | max_value | memory_bytes |", + "+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+", + "| p1 | 42 | t1 | c1 | ReadBuffer | 55 | bar | foo | 11 |", + "| p1 | 42 | t1 | c2 | ReadBuffer | 66 | 11 | 43 | 12 |", + "| p2 | 43 | t1 | c1 | OpenMutableBuffer | 667 | 110 | 430 | 100 |", + "| p2 | 44 | t2 | c3 | OpenMutableBuffer | 4 | -1 | 2 | 200 |", + "+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+", + ]; + + let batch = assemble_chunk_columns(chunk_columns_schema(), summaries).unwrap(); + assert_batches_eq!(&expected, &[batch]); + } +} diff --git a/server/src/db/system_tables/operations.rs b/server/src/db/system_tables/operations.rs new file mode 100644 index 0000000000..d8b2af0ac2 --- /dev/null +++ b/server/src/db/system_tables/operations.rs @@ -0,0 +1,108 @@ +use std::sync::Arc; + +use arrow::array::{ArrayRef, StringArray, Time64NanosecondArray, UInt32Array}; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; +use arrow::error::Result; +use arrow::record_batch::RecordBatch; + +use data_types::error::ErrorLogger; +use data_types::job::Job; +use tracker::TaskTracker; + +use crate::db::system_tables::IoxSystemTable; +use crate::JobRegistry; + +/// Implementation of system.operations table +#[derive(Debug)] +pub(super) struct OperationsTable { + schema: SchemaRef, + db_name: String, + jobs: Arc, +} + +impl OperationsTable { + pub(super) fn new(db_name: String, jobs: Arc) -> Self { + Self { + schema: operations_schema(), + db_name, + jobs, + } + } +} + +impl IoxSystemTable for OperationsTable { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn batch(&self) -> Result { + from_task_trackers(self.schema(), &self.db_name, self.jobs.tracked()) + .log_if_error("system.operations table") + } +} + +fn operations_schema() -> SchemaRef { + let ts = DataType::Time64(TimeUnit::Nanosecond); + Arc::new(Schema::new(vec![ + Field::new("id", DataType::Utf8, false), + Field::new("status", DataType::Utf8, true), + Field::new("cpu_time_used", ts.clone(), true), + Field::new("wall_time_used", ts, true), + Field::new("partition_key", DataType::Utf8, true), + Field::new("chunk_id", DataType::UInt32, true), + Field::new("description", DataType::Utf8, true), + ])) +} + +fn from_task_trackers( + schema: SchemaRef, + db_name: &str, + jobs: Vec>, +) -> Result { + let jobs = jobs + .into_iter() + .filter(|job| job.metadata().db_name() == Some(db_name)) + .collect::>(); + + let ids = jobs + .iter() + .map(|job| Some(job.id().to_string())) + .collect::(); + let statuses = jobs + .iter() + .map(|job| Some(job.get_status().name())) + .collect::(); + let cpu_time_used = jobs + .iter() + .map(|job| job.get_status().cpu_nanos().map(|n| n as i64)) + .collect::(); + let wall_time_used = jobs + .iter() + .map(|job| job.get_status().wall_nanos().map(|n| n as i64)) + .collect::(); + let partition_keys = jobs + .iter() + .map(|job| job.metadata().partition_key()) + .collect::(); + let chunk_ids = jobs + .iter() + .map(|job| job.metadata().chunk_id()) + .collect::(); + let descriptions = jobs + .iter() + .map(|job| 
Some(job.metadata().description())) + .collect::(); + + RecordBatch::try_new( + schema, + vec![ + Arc::new(ids) as ArrayRef, + Arc::new(statuses), + Arc::new(cpu_time_used), + Arc::new(wall_time_used), + Arc::new(partition_keys), + Arc::new(chunk_ids), + Arc::new(descriptions), + ], + ) +} From e4d2c51e8b7c2a0883afeb3838b558b62c701938 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 20 Jul 2021 13:44:47 +0100 Subject: [PATCH 13/27] fix: update PersistenceWindows on rules update (#2018) (#2060) * fix: update PersistenceWindows on rules update (#2018) * chore: review feedback Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- .../src/persistence_windows.rs | 6 + server/src/config.rs | 2 +- server/src/db.rs | 35 +++++- server/src/lib.rs | 4 +- tests/end_to_end_cases/persistence.rs | 109 ++++++++++++++---- 5 files changed, 129 insertions(+), 27 deletions(-) diff --git a/persistence_windows/src/persistence_windows.rs b/persistence_windows/src/persistence_windows.rs index 1b5ae73d29..5b66a593f8 100644 --- a/persistence_windows/src/persistence_windows.rs +++ b/persistence_windows/src/persistence_windows.rs @@ -133,6 +133,12 @@ impl PersistenceWindows { } } + /// Updates the late arrival period of this `PersistenceWindows` instance + pub fn set_late_arrival_period(&mut self, late_arrival_period: Duration) { + self.closed_window_period = late_arrival_period.min(DEFAULT_CLOSED_WINDOW_PERIOD); + self.late_arrival_period = late_arrival_period; + } + /// Updates the windows with the information from a batch of rows from a single sequencer /// to the same partition. The min and max times are the times on the row data. The `received_at` /// Instant is when the data was received. Taking it in this function is really just about diff --git a/server/src/config.rs b/server/src/config.rs index 4554e8e912..f869319a87 100644 --- a/server/src/config.rs +++ b/server/src/config.rs @@ -224,7 +224,7 @@ impl Config { .db_initialized(db_name) .context(DatabaseNotFound { db_name })?; - db.update_db_rules(update).map_err(UpdateError::Closure) + db.update_rules(update).map_err(UpdateError::Closure) } /// Get all registered remote servers. 
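A minimal standalone sketch of the clamping relationship established by the new `set_late_arrival_period` method above: the closed window period follows the late arrival period but is capped at the default. The concrete 30-second value of DEFAULT_CLOSED_WINDOW_PERIOD is assumed here for illustration; only the `min`-with-default behaviour comes from the patch.

use std::time::Duration;

// Assumed default, for illustration only; the real constant lives in
// persistence_windows/src/persistence_windows.rs.
const DEFAULT_CLOSED_WINDOW_PERIOD: Duration = Duration::from_secs(30);

// Mirrors set_late_arrival_period: the closed window period tracks the
// late arrival period but never exceeds the default.
fn closed_window_period_for(late_arrival_period: Duration) -> Duration {
    late_arrival_period.min(DEFAULT_CLOSED_WINDOW_PERIOD)
}

fn main() {
    // A short late arrival window (1s) also shortens the closed window period.
    assert_eq!(
        closed_window_period_for(Duration::from_secs(1)),
        Duration::from_secs(1)
    );
    // A long late arrival window (1000s) leaves it at the default.
    assert_eq!(
        closed_window_period_for(Duration::from_secs(1000)),
        DEFAULT_CLOSED_WINDOW_PERIOD
    );
}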
diff --git a/server/src/db.rs b/server/src/db.rs index dc198aa292..f94eb4b28c 100644 --- a/server/src/db.rs +++ b/server/src/db.rs @@ -333,13 +333,40 @@ impl Db { } /// Updates the database rules - pub fn update_db_rules(&self, update: F) -> Result, E> + pub fn update_rules(&self, update: F) -> Result, E> where F: FnOnce(DatabaseRules) -> Result, { - let mut rules = self.rules.write(); - let new_rules = Arc::new(update(rules.as_ref().clone())?); - *rules = Arc::clone(&new_rules); + let (late_arrive_window_updated, new_rules) = { + let mut rules = self.rules.write(); + info!(db_name=%rules.name, "updating rules for database"); + let new_rules = Arc::new(update(rules.as_ref().clone())?); + let late_arrive_window_updated = rules.lifecycle_rules.late_arrive_window_seconds + != new_rules.lifecycle_rules.late_arrive_window_seconds; + + *rules = Arc::clone(&new_rules); + (late_arrive_window_updated, new_rules) + }; + + if late_arrive_window_updated { + // Hold a read lock to prevent concurrent modification and + // use values from re-acquired read guard + let current = self.rules.read(); + + // Update windows + let partitions = self.catalog.partitions(); + for partition in &partitions { + let mut partition = partition.write(); + let addr = partition.addr().clone(); + if let Some(windows) = partition.persistence_windows_mut() { + info!(partition=%addr, "updating persistence windows"); + windows.set_late_arrival_period(Duration::from_secs( + current.lifecycle_rules.late_arrive_window_seconds.get() as u64, + )) + } + } + } + Ok(new_rules) } diff --git a/server/src/lib.rs b/server/src/lib.rs index 48246918d1..e2dc829e31 100644 --- a/server/src/lib.rs +++ b/server/src/lib.rs @@ -1733,7 +1733,7 @@ mod tests { let remote_ids = vec![bad_remote_id, good_remote_id_1, good_remote_id_2]; let db = server.db(&db_name).unwrap(); - db.update_db_rules(|mut rules| { + db.update_rules(|mut rules| { let shard_config = ShardConfig { hash_ring: Some(HashRing { shards: vec![TEST_SHARD_ID].into(), @@ -1976,7 +1976,7 @@ mod tests { let db_name = DatabaseName::new("foo").unwrap(); let db = server.db(&db_name).unwrap(); let rules = db - .update_db_rules(|mut rules| { + .update_rules(|mut rules| { rules.lifecycle_rules.buffer_size_hard = Some(std::num::NonZeroUsize::new(10).unwrap()); Ok::<_, Infallible>(rules) diff --git a/tests/end_to_end_cases/persistence.rs b/tests/end_to_end_cases/persistence.rs index 634c506856..af4dccfa61 100644 --- a/tests/end_to_end_cases/persistence.rs +++ b/tests/end_to_end_cases/persistence.rs @@ -49,16 +49,43 @@ async fn test_chunk_is_persisted_automatically() { assert_eq!(chunks[0].row_count, 1_000); } +async fn write_data( + write_client: &mut influxdb_iox_client::write::Client, + db_name: &str, + num_payloads: u64, + num_duplicates: u64, + payload_size: u64, +) { + let payloads: Vec<_> = (0..num_payloads) + .map(|x| { + (0..payload_size) + .map(|i| format!("data,tag{}=val{} x={} {}", x, i, i * 10, i)) + .join("\n") + }) + .collect(); + + for payload in &payloads { + // Writing the same data multiple times should be compacted away + for _ in 0..=num_duplicates { + let num_lines_written = write_client + .write(db_name, payload) + .await + .expect("successful write"); + assert_eq!(num_lines_written, payload_size as usize); + } + } +} + #[tokio::test] async fn test_full_lifecycle() { let fixture = ServerFixture::create_shared().await; let mut write_client = fixture.write_client(); let num_payloads = 10; - let num_duplicates = 2; + let num_duplicates = 1; let payload_size = 1_000; - let 
total_rows = num_payloads * num_duplicates * payload_size; + let total_rows = num_payloads * (1 + num_duplicates) * payload_size; let db_name = rand_name(); DatabaseBuilder::new(db_name.clone()) @@ -73,24 +100,14 @@ async fn test_full_lifecycle() { .build(fixture.grpc_channel()) .await; - let payloads: Vec<_> = (0..num_payloads) - .map(|x| { - (0..payload_size) - .map(|i| format!("data,tag{}=val{} x={} {}", x, i, i * 10, i)) - .join("\n") - }) - .collect(); - - for payload in &payloads { - // Writing the same data multiple times should be compacted away - for _ in 0..num_duplicates { - let num_lines_written = write_client - .write(&db_name, payload) - .await - .expect("successful write"); - assert_eq!(num_lines_written, payload_size as usize); - } - } + write_data( + &mut write_client, + &db_name, + num_payloads, + num_duplicates, + payload_size, + ) + .await; wait_for_exact_chunk_states( &fixture, @@ -123,6 +140,58 @@ async fn test_full_lifecycle() { assert_eq!(chunks[0].row_count, (num_payloads * payload_size) as usize) } +#[tokio::test] +async fn test_update_late_arrival() { + let fixture = ServerFixture::create_shared().await; + let mut write_client = fixture.write_client(); + + let payload_size = 100; + + let db_name = rand_name(); + DatabaseBuilder::new(db_name.clone()) + .persist(true) + // Don't close MUB automatically + .mub_row_threshold(payload_size * 2) + .persist_row_threshold(payload_size) + .persist_age_threshold_seconds(1000) + // Initially set to be a large value + .late_arrive_window_seconds(1000) + .build(fixture.grpc_channel()) + .await; + + write_data(&mut write_client, &db_name, 1, 0, payload_size).await; + + let mut management = fixture.management_client(); + + let chunks = management.list_chunks(&db_name).await.unwrap(); + assert_eq!(chunks.len(), 1); + assert_eq!( + chunks[0].storage, + influxdb_iox_client::management::generated_types::ChunkStorage::OpenMutableBuffer as i32 + ); + + let mut rules = management.get_database(&db_name).await.unwrap(); + rules + .lifecycle_rules + .as_mut() + .unwrap() + .late_arrive_window_seconds = 1; + + fixture + .management_client() + .update_database(rules) + .await + .unwrap(); + + wait_for_exact_chunk_states( + &fixture, + &db_name, + vec![ChunkStorage::ReadBufferAndObjectStore], + std::time::Duration::from_secs(5), + ) + .await; +} + #[tokio::test] async fn test_query_chunk_after_restart() { // fixtures From 091837420f5fe83fba63a56b01a81aff041970e6 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 20 Jul 2021 14:10:57 +0100 Subject: [PATCH 14/27] feat: add PersistenceWindows sytem table (#2030) (#2062) * feat: add PersistenceWindows sytem table (#2030) * chore: update log Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com> --- .../cases/in/all_chunks_dropped.expected | 46 +++--- query_tests/src/sql.rs | 25 +-- server/src/db/catalog.rs | 20 ++- server/src/db/system_tables.rs | 11 +- server/src/db/system_tables/persistence.rs | 154 ++++++++++++++++++ 5 files changed, 220 insertions(+), 36 deletions(-) create mode 100644 server/src/db/system_tables/persistence.rs diff --git a/query_tests/cases/in/all_chunks_dropped.expected b/query_tests/cases/in/all_chunks_dropped.expected index 5febb4d2e9..65e17df50a 100644 --- a/query_tests/cases/in/all_chunks_dropped.expected +++ b/query_tests/cases/in/all_chunks_dropped.expected @@ -1,25 +1,27 @@ -- Test Setup: OneMeasurementAllChunksDropped -- SQL: SELECT * from information_schema.tables; 
-+---------------+--------------------+---------------+------------+ -| table_catalog | table_schema | table_name | table_type | -+---------------+--------------------+---------------+------------+ -| public | iox | h2o | BASE TABLE | -| public | system | chunks | BASE TABLE | -| public | system | columns | BASE TABLE | -| public | system | chunk_columns | BASE TABLE | -| public | system | operations | BASE TABLE | -| public | information_schema | tables | VIEW | -| public | information_schema | columns | VIEW | -+---------------+--------------------+---------------+------------+ ++---------------+--------------------+---------------------+------------+ +| table_catalog | table_schema | table_name | table_type | ++---------------+--------------------+---------------------+------------+ +| public | iox | h2o | BASE TABLE | +| public | system | chunks | BASE TABLE | +| public | system | columns | BASE TABLE | +| public | system | chunk_columns | BASE TABLE | +| public | system | operations | BASE TABLE | +| public | system | persistence_windows | BASE TABLE | +| public | information_schema | tables | VIEW | +| public | information_schema | columns | VIEW | ++---------------+--------------------+---------------------+------------+ -- SQL: SHOW TABLES; -+---------------+--------------------+---------------+------------+ -| table_catalog | table_schema | table_name | table_type | -+---------------+--------------------+---------------+------------+ -| public | iox | h2o | BASE TABLE | -| public | system | chunks | BASE TABLE | -| public | system | columns | BASE TABLE | -| public | system | chunk_columns | BASE TABLE | -| public | system | operations | BASE TABLE | -| public | information_schema | tables | VIEW | -| public | information_schema | columns | VIEW | -+---------------+--------------------+---------------+------------+ ++---------------+--------------------+---------------------+------------+ +| table_catalog | table_schema | table_name | table_type | ++---------------+--------------------+---------------------+------------+ +| public | iox | h2o | BASE TABLE | +| public | system | chunks | BASE TABLE | +| public | system | columns | BASE TABLE | +| public | system | chunk_columns | BASE TABLE | +| public | system | operations | BASE TABLE | +| public | system | persistence_windows | BASE TABLE | +| public | information_schema | tables | VIEW | +| public | information_schema | columns | VIEW | ++---------------+--------------------+---------------------+------------+ diff --git a/query_tests/src/sql.rs b/query_tests/src/sql.rs index 58072c5a97..6fc2c13550 100644 --- a/query_tests/src/sql.rs +++ b/query_tests/src/sql.rs @@ -184,18 +184,19 @@ async fn sql_select_from_information_schema_tables() { // validate we have access to information schema for listing table // names let expected = vec![ - "+---------------+--------------------+---------------+------------+", - "| table_catalog | table_schema | table_name | table_type |", - "+---------------+--------------------+---------------+------------+", - "| public | information_schema | columns | VIEW |", - "| public | information_schema | tables | VIEW |", - "| public | iox | h2o | BASE TABLE |", - "| public | iox | o2 | BASE TABLE |", - "| public | system | chunk_columns | BASE TABLE |", - "| public | system | chunks | BASE TABLE |", - "| public | system | columns | BASE TABLE |", - "| public | system | operations | BASE TABLE |", - "+---------------+--------------------+---------------+------------+", + 
"+---------------+--------------------+---------------------+------------+", + "| table_catalog | table_schema | table_name | table_type |", + "+---------------+--------------------+---------------------+------------+", + "| public | information_schema | columns | VIEW |", + "| public | information_schema | tables | VIEW |", + "| public | iox | h2o | BASE TABLE |", + "| public | iox | o2 | BASE TABLE |", + "| public | system | chunk_columns | BASE TABLE |", + "| public | system | chunks | BASE TABLE |", + "| public | system | columns | BASE TABLE |", + "| public | system | operations | BASE TABLE |", + "| public | system | persistence_windows | BASE TABLE |", + "+---------------+--------------------+---------------------+------------+", ]; run_sql_test_case!( TwoMeasurementsManyFields {}, diff --git a/server/src/db/catalog.rs b/server/src/db/catalog.rs index c2953a3de6..dff3c37b6b 100644 --- a/server/src/db/catalog.rs +++ b/server/src/db/catalog.rs @@ -6,7 +6,7 @@ use hashbrown::{HashMap, HashSet}; use data_types::chunk_metadata::ChunkSummary; use data_types::chunk_metadata::DetailedChunkSummary; -use data_types::partition_metadata::{PartitionSummary, TableSummary}; +use data_types::partition_metadata::{PartitionAddr, PartitionSummary, TableSummary}; use internal_types::schema::Schema; use snafu::{OptionExt, Snafu}; use tracker::{MappedRwLockReadGuard, RwLock, RwLockReadGuard}; @@ -15,6 +15,7 @@ use self::chunk::CatalogChunk; use self::metrics::CatalogMetrics; use self::partition::Partition; use self::table::Table; +use data_types::write_summary::WriteSummary; pub mod chunk; mod metrics; @@ -225,6 +226,23 @@ impl Catalog { .collect() } + /// Returns a list of persistence window summaries for each partition + pub fn persistence_summaries(&self) -> Vec<(PartitionAddr, WriteSummary)> { + let mut summaries = Vec::new(); + let tables = self.tables.read(); + for table in tables.values() { + for partition in table.partitions() { + let partition = partition.read(); + if let Some(w) = partition.persistence_windows() { + for summary in w.summaries() { + summaries.push((partition.addr().clone(), summary)) + } + } + } + } + summaries + } + pub fn chunk_summaries(&self) -> Vec { let partition_key = None; let table_names = TableNameFilter::AllTables; diff --git a/server/src/db/system_tables.rs b/server/src/db/system_tables.rs index f83c793fa5..bcc474e230 100644 --- a/server/src/db/system_tables.rs +++ b/server/src/db/system_tables.rs @@ -31,6 +31,7 @@ use super::catalog::Catalog; mod chunks; mod columns; mod operations; +mod persistence; // The IOx system schema pub const SYSTEM_SCHEMA: &str = "system"; @@ -39,12 +40,14 @@ const CHUNKS: &str = "chunks"; const COLUMNS: &str = "columns"; const CHUNK_COLUMNS: &str = "chunk_columns"; const OPERATIONS: &str = "operations"; +const PERSISTENCE_WINDOWS: &str = "persistence_windows"; pub struct SystemSchemaProvider { chunks: Arc, columns: Arc, chunk_columns: Arc, operations: Arc, + persistence_windows: Arc, } impl std::fmt::Debug for SystemSchemaProvider { @@ -65,16 +68,20 @@ impl SystemSchemaProvider { inner: columns::ColumnsTable::new(Arc::clone(&catalog)), }); let chunk_columns = Arc::new(SystemTableProvider { - inner: columns::ChunkColumnsTable::new(catalog), + inner: columns::ChunkColumnsTable::new(Arc::clone(&catalog)), }); let operations = Arc::new(SystemTableProvider { inner: operations::OperationsTable::new(db_name, jobs), }); + let persistence_windows = Arc::new(SystemTableProvider { + inner: persistence::PersistenceWindowsTable::new(catalog), + }); 
Self { chunks, columns, chunk_columns, operations, + persistence_windows, } } } @@ -90,6 +97,7 @@ impl SchemaProvider for SystemSchemaProvider { COLUMNS.to_string(), CHUNK_COLUMNS.to_string(), OPERATIONS.to_string(), + PERSISTENCE_WINDOWS.to_string(), ] } @@ -99,6 +107,7 @@ impl SchemaProvider for SystemSchemaProvider { COLUMNS => Some(Arc::clone(&self.columns)), CHUNK_COLUMNS => Some(Arc::clone(&self.chunk_columns)), OPERATIONS => Some(Arc::clone(&self.operations)), + PERSISTENCE_WINDOWS => Some(Arc::clone(&self.persistence_windows)), _ => None, } } diff --git a/server/src/db/system_tables/persistence.rs b/server/src/db/system_tables/persistence.rs new file mode 100644 index 0000000000..3392ff5032 --- /dev/null +++ b/server/src/db/system_tables/persistence.rs @@ -0,0 +1,154 @@ +use std::sync::Arc; + +use arrow::array::{StringArray, TimestampNanosecondArray, UInt64Array}; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; +use arrow::error::Result; +use arrow::record_batch::RecordBatch; + +use data_types::error::ErrorLogger; +use data_types::partition_metadata::PartitionAddr; +use data_types::write_summary::WriteSummary; + +use crate::db::catalog::Catalog; +use crate::db::system_tables::IoxSystemTable; + +/// Implementation of system.persistence_windows table +#[derive(Debug)] +pub(super) struct PersistenceWindowsTable { + schema: SchemaRef, + catalog: Arc, +} + +impl PersistenceWindowsTable { + pub(super) fn new(catalog: Arc) -> Self { + Self { + schema: persistence_windows_schema(), + catalog, + } + } +} + +impl IoxSystemTable for PersistenceWindowsTable { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn batch(&self) -> Result { + from_write_summaries(self.schema(), self.catalog.persistence_summaries()) + .log_if_error("system.persistence_windows table") + } +} + +fn persistence_windows_schema() -> SchemaRef { + let ts = DataType::Timestamp(TimeUnit::Nanosecond, None); + Arc::new(Schema::new(vec![ + Field::new("partition_key", DataType::Utf8, false), + Field::new("table_name", DataType::Utf8, false), + Field::new("row_count", DataType::UInt64, false), + Field::new("time_of_first_write", ts.clone(), false), + Field::new("time_of_last_write", ts.clone(), false), + Field::new("min_timestamp", ts.clone(), false), + Field::new("max_timestamp", ts, false), + ])) +} + +fn from_write_summaries( + schema: SchemaRef, + chunks: Vec<(PartitionAddr, WriteSummary)>, +) -> Result { + let partition_key = chunks + .iter() + .map(|(addr, _)| Some(addr.partition_key.as_ref())) + .collect::(); + let table_name = chunks + .iter() + .map(|(addr, _)| Some(addr.table_name.as_ref())) + .collect::(); + let row_counts = chunks + .iter() + .map(|(_, w)| Some(w.row_count as u64)) + .collect::(); + let time_of_first_write = chunks + .iter() + .map(|(_, w)| Some(w.time_of_first_write.timestamp_nanos())) + .collect::(); + let time_of_last_write = chunks + .iter() + .map(|(_, w)| Some(w.time_of_last_write.timestamp_nanos())) + .collect::(); + let min_timestamp = chunks + .iter() + .map(|(_, w)| Some(w.min_timestamp.timestamp_nanos())) + .collect::(); + let max_timestamp = chunks + .iter() + .map(|(_, w)| Some(w.max_timestamp.timestamp_nanos())) + .collect::(); + + RecordBatch::try_new( + schema, + vec![ + Arc::new(partition_key), + Arc::new(table_name), + Arc::new(row_counts), + Arc::new(time_of_first_write), + Arc::new(time_of_last_write), + Arc::new(min_timestamp), + Arc::new(max_timestamp), + ], + ) +} + +#[cfg(test)] +mod tests { + use chrono::{TimeZone, Utc}; + + use 
arrow_util::assert_batches_eq; + + use super::*; + + #[test] + fn test_from_write_summaries() { + let addr = PartitionAddr { + db_name: Arc::from("db"), + table_name: Arc::from("table"), + partition_key: Arc::from("partition"), + }; + + let summaries = vec![ + ( + addr.clone(), + WriteSummary { + time_of_first_write: Utc.timestamp_nanos(0), + time_of_last_write: Utc.timestamp_nanos(20), + min_timestamp: Utc.timestamp_nanos(50), + max_timestamp: Utc.timestamp_nanos(60), + row_count: 320, + }, + ), + ( + addr, + WriteSummary { + time_of_first_write: Utc.timestamp_nanos(6), + time_of_last_write: Utc.timestamp_nanos(21), + min_timestamp: Utc.timestamp_nanos(1), + max_timestamp: Utc.timestamp_nanos(2), + row_count: 2, + }, + ), + ]; + + let expected = vec![ + "+---------------+------------+-----------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+", + "| partition_key | table_name | row_count | time_of_first_write | time_of_last_write | min_timestamp | max_timestamp |", + "+---------------+------------+-----------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+", + "| partition | table | 320 | 1970-01-01 00:00:00 | 1970-01-01 00:00:00.000000020 | 1970-01-01 00:00:00.000000050 | 1970-01-01 00:00:00.000000060 |", + "| partition | table | 2 | 1970-01-01 00:00:00.000000006 | 1970-01-01 00:00:00.000000021 | 1970-01-01 00:00:00.000000001 | 1970-01-01 00:00:00.000000002 |", + "+---------------+------------+-----------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+", + ]; + + let schema = persistence_windows_schema(); + let batch = from_write_summaries(schema, summaries).unwrap(); + assert_batches_eq!(&expected, &[batch]); + } +} From 61da0fe4dfe218c24da92395937323c710b18e85 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Tue, 20 Jul 2021 17:38:28 +0100 Subject: [PATCH 15/27] fix: update last_instant when rotating into persistable window (#2067) --- .../src/persistence_windows.rs | 37 ++++++++++++++++++- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/persistence_windows/src/persistence_windows.rs b/persistence_windows/src/persistence_windows.rs index 5b66a593f8..957d034d06 100644 --- a/persistence_windows/src/persistence_windows.rs +++ b/persistence_windows/src/persistence_windows.rs @@ -499,6 +499,10 @@ impl Window { /// Add one window to another. Used to collapse closed windows into persisted. 
fn add_window(&mut self, other: Self) { + assert!(self.last_instant <= other.created_at); + assert!(self.last_instant <= other.last_instant); + + self.last_instant = other.last_instant; self.row_count += other.row_count; if self.min_time > other.min_time { self.min_time = other.min_time; @@ -1327,7 +1331,8 @@ mod tests { #[test] fn test_summaries() { - let mut w = make_windows(Duration::from_secs(100)); + let late_arrival_period = Duration::from_secs(100); + let mut w = make_windows(late_arrival_period); let instant = w.created_at_instant; // Window 1 @@ -1407,6 +1412,34 @@ mod tests { row_count: 8 }, ] - ) + ); + + // Rotate first and second windows into persistable + w.rotate(instant + late_arrival_period + DEFAULT_CLOSED_WINDOW_PERIOD * 2); + + let summaries: Vec<_> = w.summaries().collect(); + + assert_eq!(summaries.len(), 2); + assert_eq!( + summaries, + vec![ + WriteSummary { + time_of_first_write: w.created_at_time + chrono::Duration::milliseconds(1), + time_of_last_write: w.created_at_time + + closed_duration + + chrono::Duration::milliseconds(1), + min_timestamp: Utc.timestamp_nanos(1), + max_timestamp: Utc.timestamp_nanos(340), + row_count: 24 + }, + WriteSummary { + time_of_first_write: w.created_at_time + closed_duration * 3, + time_of_last_write: w.created_at_time + closed_duration * 3, + min_timestamp: Utc.timestamp_nanos(3), + max_timestamp: Utc.timestamp_nanos(4), + row_count: 8 + }, + ] + ); } } From 297e0590859e90311b75b8095bd0556ee532bf65 Mon Sep 17 00:00:00 2001 From: Paul Dix Date: Mon, 19 Jul 2021 17:52:40 -0400 Subject: [PATCH 16/27] feat: add parquet cache size setting to database rules --- data_types/src/database_rules.rs | 4 ++++ .../iox/management/v1/database_rules.proto | 7 +++++++ generated_types/src/database_rules/lifecycle.rs | 16 ++++++++++++++++ src/commands/database.rs | 6 ++++++ 4 files changed, 33 insertions(+) diff --git a/data_types/src/database_rules.rs b/data_types/src/database_rules.rs index 6cff31a510..ddde203c93 100644 --- a/data_types/src/database_rules.rs +++ b/data_types/src/database_rules.rs @@ -166,6 +166,9 @@ pub struct LifecycleRules { /// Maximum number of rows to buffer in a MUB chunk before compacting it pub mub_row_threshold: NonZeroUsize, + + /// Use up to this amount of space in bytes for caching Parquet files + pub parquet_cache_limit: Option, } impl LifecycleRules { @@ -195,6 +198,7 @@ impl Default for LifecycleRules { persist_age_threshold_seconds: NonZeroU32::new(DEFAULT_PERSIST_AGE_THRESHOLD_SECONDS) .unwrap(), mub_row_threshold: NonZeroUsize::new(DEFAULT_MUB_ROW_THRESHOLD).unwrap(), + parquet_cache_limit: None, } } } diff --git a/generated_types/protos/influxdata/iox/management/v1/database_rules.proto b/generated_types/protos/influxdata/iox/management/v1/database_rules.proto index 530c95bca6..68d9cbf6f9 100644 --- a/generated_types/protos/influxdata/iox/management/v1/database_rules.proto +++ b/generated_types/protos/influxdata/iox/management/v1/database_rules.proto @@ -82,6 +82,13 @@ message LifecycleRules { // If 0, compactions are limited to the default number. 
// See data_types::database_rules::DEFAULT_MAX_ACTIVE_COMPACTIONS uint32 max_active_compactions = 16; + + // Use up to this amount of space in bytes for caching Parquet files + ParquetCacheLimit parquet_cache_limit = 17; +} + +message ParquetCacheLimit { + uint64 value = 1; } message DatabaseRules { diff --git a/generated_types/src/database_rules/lifecycle.rs b/generated_types/src/database_rules/lifecycle.rs index b9612bf1b6..9a62c54df6 100644 --- a/generated_types/src/database_rules/lifecycle.rs +++ b/generated_types/src/database_rules/lifecycle.rs @@ -10,6 +10,7 @@ use data_types::database_rules::{ use crate::google::FieldViolation; use crate::influxdata::iox::management::v1 as management; +use crate::influxdata::iox::management::v1::ParquetCacheLimit; impl From for management::LifecycleRules { fn from(config: LifecycleRules) -> Self { @@ -35,6 +36,9 @@ impl From for management::LifecycleRules { persist_row_threshold: config.persist_row_threshold.get() as u64, persist_age_threshold_seconds: config.persist_age_threshold_seconds.get(), mub_row_threshold: config.mub_row_threshold.get() as u64, + parquet_cache_limit: config.parquet_cache_limit.map(|x| ParquetCacheLimit { + value: x.get() as u64, + }), } } } @@ -43,6 +47,11 @@ impl TryFrom for LifecycleRules { type Error = FieldViolation; fn try_from(proto: management::LifecycleRules) -> Result { + let parquet_cache_limit = match proto.parquet_cache_limit { + Some(l) => (l.value as usize).try_into().ok(), + None => None, + }; + Ok(Self { buffer_size_soft: (proto.buffer_size_soft as usize).try_into().ok(), buffer_size_hard: (proto.buffer_size_hard as usize).try_into().ok(), @@ -69,6 +78,7 @@ impl TryFrom for LifecycleRules { .unwrap_or_else(|| NonZeroU32::new(DEFAULT_PERSIST_AGE_THRESHOLD_SECONDS).unwrap()), mub_row_threshold: NonZeroUsize::new(proto.mub_row_threshold as usize) .unwrap_or_else(|| NonZeroUsize::new(DEFAULT_MUB_ROW_THRESHOLD).unwrap()), + parquet_cache_limit, }) } } @@ -93,6 +103,7 @@ mod tests { persist_row_threshold: 57, persist_age_threshold_seconds: 23, mub_row_threshold: 3454, + parquet_cache_limit: Some(ParquetCacheLimit { value: 10 }), }; let config: LifecycleRules = protobuf.clone().try_into().unwrap(); @@ -125,6 +136,11 @@ mod tests { protobuf.persist_age_threshold_seconds ); assert_eq!(back.mub_row_threshold, protobuf.mub_row_threshold); + assert_eq!( + config.parquet_cache_limit.unwrap().get(), + protobuf.parquet_cache_limit.as_ref().unwrap().value as usize + ); + assert_eq!(back.parquet_cache_limit, protobuf.parquet_cache_limit); } #[test] diff --git a/src/commands/database.rs b/src/commands/database.rs index d7d4ddf81d..77f25471a5 100644 --- a/src/commands/database.rs +++ b/src/commands/database.rs @@ -13,6 +13,7 @@ use influxdb_iox_client::{ }, write::{self, WriteError}, }; +use std::num::NonZeroUsize; mod catalog; mod chunk; @@ -119,6 +120,10 @@ struct Create { /// Maximum number of rows to buffer in a MUB chunk before compacting it #[structopt(long, default_value = "100000")] mub_row_threshold: u64, + + /// Use up to this amount of space in bytes for caching Parquet files + #[structopt(long, parse(try_from_str))] + pub parquet_cache_limit: Option, } /// Get list of databases @@ -193,6 +198,7 @@ pub async fn command(url: String, config: Config) -> Result<()> { persist_row_threshold: command.persist_row_threshold, persist_age_threshold_seconds: command.persist_age_threshold_seconds, mub_row_threshold: command.mub_row_threshold, + parquet_cache_limit: command.parquet_cache_limit.map(|l| ParquetCacheLimit{value: 
l.get() as u64}), }), // Default to hourly partitions From a4704dd165a816aa5beb9c78a1cb5ea6a3684981 Mon Sep 17 00:00:00 2001 From: Paul Dix Date: Tue, 20 Jul 2021 15:40:50 -0400 Subject: [PATCH 17/27] chore: update parquet_cache_limit to u64 and 0 for default --- data_types/src/database_rules.rs | 5 +++-- .../iox/management/v1/database_rules.proto | 9 +++------ .../src/database_rules/lifecycle.rs | 19 +++++++------------ src/commands/database.rs | 10 +++++----- 4 files changed, 18 insertions(+), 25 deletions(-) diff --git a/data_types/src/database_rules.rs b/data_types/src/database_rules.rs index ddde203c93..86a71be778 100644 --- a/data_types/src/database_rules.rs +++ b/data_types/src/database_rules.rs @@ -167,8 +167,9 @@ pub struct LifecycleRules { /// Maximum number of rows to buffer in a MUB chunk before compacting it pub mub_row_threshold: NonZeroUsize, - /// Use up to this amount of space in bytes for caching Parquet files - pub parquet_cache_limit: Option, + /// Use up to this amount of space in bytes for caching Parquet files. None + /// will disable Parquet file caching. + pub parquet_cache_limit: Option, } impl LifecycleRules { diff --git a/generated_types/protos/influxdata/iox/management/v1/database_rules.proto b/generated_types/protos/influxdata/iox/management/v1/database_rules.proto index 68d9cbf6f9..b1ad761dbe 100644 --- a/generated_types/protos/influxdata/iox/management/v1/database_rules.proto +++ b/generated_types/protos/influxdata/iox/management/v1/database_rules.proto @@ -83,12 +83,9 @@ message LifecycleRules { // See data_types::database_rules::DEFAULT_MAX_ACTIVE_COMPACTIONS uint32 max_active_compactions = 16; - // Use up to this amount of space in bytes for caching Parquet files - ParquetCacheLimit parquet_cache_limit = 17; -} - -message ParquetCacheLimit { - uint64 value = 1; + // Use up to this amount of space in bytes for caching Parquet files. 
+ // A value of 0 disables Parquet caching + uint64 parquet_cache_limit = 17; } message DatabaseRules { diff --git a/generated_types/src/database_rules/lifecycle.rs b/generated_types/src/database_rules/lifecycle.rs index 9a62c54df6..ab71e38de5 100644 --- a/generated_types/src/database_rules/lifecycle.rs +++ b/generated_types/src/database_rules/lifecycle.rs @@ -10,7 +10,6 @@ use data_types::database_rules::{ use crate::google::FieldViolation; use crate::influxdata::iox::management::v1 as management; -use crate::influxdata::iox::management::v1::ParquetCacheLimit; impl From for management::LifecycleRules { fn from(config: LifecycleRules) -> Self { @@ -36,9 +35,10 @@ impl From for management::LifecycleRules { persist_row_threshold: config.persist_row_threshold.get() as u64, persist_age_threshold_seconds: config.persist_age_threshold_seconds.get(), mub_row_threshold: config.mub_row_threshold.get() as u64, - parquet_cache_limit: config.parquet_cache_limit.map(|x| ParquetCacheLimit { - value: x.get() as u64, - }), + parquet_cache_limit: config + .parquet_cache_limit + .map(|v| v.get()) + .unwrap_or_default(), } } } @@ -47,11 +47,6 @@ impl TryFrom for LifecycleRules { type Error = FieldViolation; fn try_from(proto: management::LifecycleRules) -> Result { - let parquet_cache_limit = match proto.parquet_cache_limit { - Some(l) => (l.value as usize).try_into().ok(), - None => None, - }; - Ok(Self { buffer_size_soft: (proto.buffer_size_soft as usize).try_into().ok(), buffer_size_hard: (proto.buffer_size_hard as usize).try_into().ok(), @@ -78,7 +73,7 @@ impl TryFrom for LifecycleRules { .unwrap_or_else(|| NonZeroU32::new(DEFAULT_PERSIST_AGE_THRESHOLD_SECONDS).unwrap()), mub_row_threshold: NonZeroUsize::new(proto.mub_row_threshold as usize) .unwrap_or_else(|| NonZeroUsize::new(DEFAULT_MUB_ROW_THRESHOLD).unwrap()), - parquet_cache_limit, + parquet_cache_limit: NonZeroU64::new(proto.parquet_cache_limit), }) } } @@ -103,7 +98,7 @@ mod tests { persist_row_threshold: 57, persist_age_threshold_seconds: 23, mub_row_threshold: 3454, - parquet_cache_limit: Some(ParquetCacheLimit { value: 10 }), + parquet_cache_limit: 10, }; let config: LifecycleRules = protobuf.clone().try_into().unwrap(); @@ -138,7 +133,7 @@ mod tests { assert_eq!(back.mub_row_threshold, protobuf.mub_row_threshold); assert_eq!( config.parquet_cache_limit.unwrap().get(), - protobuf.parquet_cache_limit.as_ref().unwrap().value as usize + protobuf.parquet_cache_limit ); assert_eq!(back.parquet_cache_limit, protobuf.parquet_cache_limit); } diff --git a/src/commands/database.rs b/src/commands/database.rs index 77f25471a5..3ff2c2bbf9 100644 --- a/src/commands/database.rs +++ b/src/commands/database.rs @@ -13,7 +13,6 @@ use influxdb_iox_client::{ }, write::{self, WriteError}, }; -use std::num::NonZeroUsize; mod catalog; mod chunk; @@ -121,9 +120,10 @@ struct Create { #[structopt(long, default_value = "100000")] mub_row_threshold: u64, - /// Use up to this amount of space in bytes for caching Parquet files - #[structopt(long, parse(try_from_str))] - pub parquet_cache_limit: Option, + /// Use up to this amount of space in bytes for caching Parquet files. A + /// value of zero disables Parquet file caching. 
+ #[structopt(long, default_value = "0")] + parquet_cache_limit: u64, } /// Get list of databases @@ -198,7 +198,7 @@ pub async fn command(url: String, config: Config) -> Result<()> { persist_row_threshold: command.persist_row_threshold, persist_age_threshold_seconds: command.persist_age_threshold_seconds, mub_row_threshold: command.mub_row_threshold, - parquet_cache_limit: command.parquet_cache_limit.map(|l| ParquetCacheLimit{value: l.get() as u64}), + parquet_cache_limit: command.parquet_cache_limit, }), // Default to hourly partitions From 387667330a4822f7af21c05c4e08fbb6be875def Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 21 Jul 2021 04:27:03 -0400 Subject: [PATCH 18/27] chore: Update datafusion deps (#2073) * chore: Update datafusion deps * fix: update tests --- Cargo.lock | 18 +- datafusion/Cargo.toml | 2 +- query_tests/cases/in/pushdown.expected | 288 ++++++++++++------------- 3 files changed, 154 insertions(+), 154 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ec611883bf..cbc1081590 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -769,9 +769,9 @@ dependencies = [ [[package]] name = "crypto-mac" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4857fd85a0c34b3c3297875b747c1e02e06b6a0ea32dd892d8192b9ce0813ea6" +checksum = "bff07008ec701e8028e2ceb8f83f0e4274ee62bd2dbdc4fefff2e9a91824081a" dependencies = [ "generic-array", "subtle", @@ -843,7 +843,7 @@ dependencies = [ [[package]] name = "datafusion" version = "4.0.0-SNAPSHOT" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=3fb600df48ab1e53903b1a9bb12ebde33ad0856b#3fb600df48ab1e53903b1a9bb12ebde33ad0856b" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=30693df8961dca300306dfd0c8fca130375b50b3#30693df8961dca300306dfd0c8fca130375b50b3" dependencies = [ "ahash 0.7.4", "arrow", @@ -4330,9 +4330,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.2.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b5220f05bb7de7f3f53c7c065e1199b3172696fe2db9f9c4d8ad9b4ee74c342" +checksum = "4ac2e1d4bd0f75279cfd5a076e0d578bbf02c22b7c39e766c437dd49b3ec43e0" dependencies = [ "tinyvec_macros", ] @@ -4345,9 +4345,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "1.8.1" +version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98c8b05dc14c75ea83d63dd391100353789f5f24b8b3866542a5e85c8be8e985" +checksum = "c2602b8af3767c285202012822834005f596c811042315fa7e9f5b12b2a43207" dependencies = [ "autocfg", "bytes", @@ -4984,9 +4984,9 @@ checksum = "b07db065a5cf61a7e4ba64f29e67db906fb1787316516c4e6e5ff0fea1efcd8a" [[package]] name = "zeroize" -version = "1.4.0" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eeafe61337cb2c879d328b74aa6cd9d794592c82da6be559fdf11493f02a2d18" +checksum = "377db0846015f7ae377174787dd452e1c5f5a9050bc6f954911d01f116daa0cd" [[package]] name = "zstd" diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index f3a735d307..f969251e6b 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -9,4 +9,4 @@ description = "Re-exports datafusion at a specific version" # Rename to workaround doctest bug # Turn off optional datafusion features (function packages) -upstream = { git = "https://github.com/apache/arrow-datafusion.git", rev="3fb600df48ab1e53903b1a9bb12ebde33ad0856b", default-features = false, 
package = "datafusion" } +upstream = { git = "https://github.com/apache/arrow-datafusion.git", rev="30693df8961dca300306dfd0c8fca130375b50b3", default-features = false, package = "datafusion" } diff --git a/query_tests/cases/in/pushdown.expected b/query_tests/cases/in/pushdown.expected index dd58342994..e4e84b8b9e 100644 --- a/query_tests/cases/in/pushdown.expected +++ b/query_tests/cases/in/pushdown.expected @@ -10,158 +10,158 @@ | | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | +---------------+---------------------------------------------------------------------------------------------+ -- SQL: EXPLAIN SELECT * from restaurant where count > 200; -+---------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | CoalesceBatchesExec: target_batch_size=500 | -| | FilterExec: CAST(count@0 AS Int64) > 200 | -| | RepartitionExec: partitioning=RoundRobinBatch(4) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+---------------+---------------------------------------------------------------------------------------------+ ++---------------+--------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+--------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200)] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200)] | ++---------------+--------------------------------------------------------------------------------------------------------------+ -- SQL: EXPLAIN SELECT * from restaurant where count > 200.0; -+---------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Float64(200) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | CoalesceBatchesExec: target_batch_size=500 | -| | FilterExec: CAST(count@0 AS Float64) > 200 | -| | RepartitionExec: partitioning=RoundRobinBatch(4) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | 
-+---------------+---------------------------------------------------------------------------------------------+ ++---------------+----------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+----------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Float64(200) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Float64(200)] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Float64) > 200 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Float64(200)] | ++---------------+----------------------------------------------------------------------------------------------------------------+ -- SQL: EXPLAIN SELECT * from restaurant where system > 4.0; -+---------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(4) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | CoalesceBatchesExec: target_batch_size=500 | -| | FilterExec: system@1 > 4 | -| | RepartitionExec: partitioning=RoundRobinBatch(4) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+---------------+---------------------------------------------------------------------------------------------+ ++---------------+---------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(4) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(4)] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 4 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(4)] | ++---------------+---------------------------------------------------------------------------------------------------------------+ -- SQL: EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury'; -+---------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, 
#restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | CoalesceBatchesExec: target_batch_size=500 | -| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury | -| | RepartitionExec: partitioning=RoundRobinBatch(4) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+---------------+---------------------------------------------------------------------------------------------+ ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200), #restaurant.town NotEq Utf8("tewsbury")] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200), #town NotEq Utf8("tewsbury")] | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------+ -- SQL: EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence'); -+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | CoalesceBatchesExec: target_batch_size=500 | -| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence | -| | RepartitionExec: partitioning=RoundRobinBatch(4) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 
++---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200), #restaurant.town NotEq Utf8("tewsbury"), #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence")] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200), #town NotEq Utf8("tewsbury")] | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -- SQL: EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000; -+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | CoalesceBatchesExec: target_batch_size=500 | -| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence AND CAST(count@0 AS Int64) < 40000 | -| | RepartitionExec: partitioning=RoundRobinBatch(4) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 
++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200), #restaurant.town NotEq Utf8("tewsbury"), #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence"), #restaurant.count Lt Int64(40000)] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence AND CAST(count@0 AS Int64) < 40000 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200), #town NotEq Utf8("tewsbury"), #count Lt Int64(40000)] | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -- SQL: EXPLAIN SELECT * from restaurant where count > 200 and count < 40000; -+---------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | CoalesceBatchesExec: target_batch_size=500 | -| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(count@0 AS Int64) < 40000 | -| | RepartitionExec: partitioning=RoundRobinBatch(4) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+---------------+---------------------------------------------------------------------------------------------+ ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) | +| | 
TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200), #restaurant.count Lt Int64(40000)] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(count@0 AS Int64) < 40000 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200), #count Lt Int64(40000)] | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ -- SQL: EXPLAIN SELECT * from restaurant where system > 4.0 and system < 7.0; -+---------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | CoalesceBatchesExec: target_batch_size=500 | -| | FilterExec: system@1 > 4 AND system@1 < 7 | -| | RepartitionExec: partitioning=RoundRobinBatch(4) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+---------------+---------------------------------------------------------------------------------------------+ ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(4), #restaurant.system Lt Float64(7)] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 4 AND system@1 < 7 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(4), #system Lt Float64(7)] | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ -- SQL: EXPLAIN SELECT * from restaurant where system > 5.0 and system < 7.0; -+---------------+---------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+---------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| 
physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | CoalesceBatchesExec: target_batch_size=500 | -| | FilterExec: system@1 > 5 AND system@1 < 7 | -| | RepartitionExec: partitioning=RoundRobinBatch(4) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+---------------+---------------------------------------------------------------------------------------------+ ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(5), #restaurant.system Lt Float64(7)] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 5 AND system@1 < 7 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(5), #system Lt Float64(7)] | ++---------------+---------------------------------------------------------------------------------------------------------------------------------------+ -- SQL: EXPLAIN SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system; -+---------------+-----------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+-----------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | CoalesceBatchesExec: target_batch_size=500 | -| | FilterExec: system@1 > 5 AND CAST(town@3 AS Utf8) != tewsbury AND 7 > system@1 | -| | RepartitionExec: partitioning=RoundRobinBatch(4) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+---------------+-----------------------------------------------------------------------------------------------------------------------------+ ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system | 
+| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(5), #restaurant.town NotEq Utf8("tewsbury"), Float64(7) Gt #restaurant.system] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 5 AND CAST(town@3 AS Utf8) != tewsbury AND 7 > system@1 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(5), #town NotEq Utf8("tewsbury"), Float64(7) Gt #system] | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -- SQL: EXPLAIN SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading'); -+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | CoalesceBatchesExec: target_batch_size=500 | -| | FilterExec: system@1 > 5 AND tewsbury != CAST(town@3 AS Utf8) AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading | -| | RepartitionExec: partitioning=RoundRobinBatch(4) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(5), Utf8("tewsbury") NotEq #restaurant.town, #restaurant.system Lt Float64(7), #restaurant.count Eq 
Int64(632) Or #restaurant.town Eq Utf8("reading")] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: system@1 > 5 AND tewsbury != CAST(town@3 AS Utf8) AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(5), Utf8("tewsbury") NotEq #town, #system Lt Float64(7)] | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -- SQL: EXPLAIN SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00'); -+---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | -| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt totimestamp(Utf8("1970-01-01T00:00:00.000000130+00:00")) | -| | TableScan: restaurant projection=Some([0, 1, 2, 3]) | -| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | -| | CoalesceBatchesExec: target_batch_size=500 | -| | FilterExec: 5 < system@1 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading AND time@2 > totimestamp(1970-01-01T00:00:00.000000130+00:00) | -| | RepartitionExec: partitioning=RoundRobinBatch(4) | -| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate | -+---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ ++---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | 
++---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town | +| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt TimestampNanosecond(130) | +| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[Float64(5) Lt #restaurant.system, #restaurant.town NotEq Utf8("tewsbury"), #restaurant.system Lt Float64(7), #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading"), #restaurant.time Gt TimestampNanosecond(130)] | +| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] | +| | CoalesceBatchesExec: target_batch_size=500 | +| | FilterExec: 5 < system@1 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading AND time@2 > 130 | +| | RepartitionExec: partitioning=RoundRobinBatch(4) | +| | IOxReadFilterNode: table_name=restaurant, chunks=0 predicate=Predicate exprs: [Float64(5) Lt #system, #town NotEq Utf8("tewsbury"), #system Lt Float64(7), #time Gt TimestampNanosecond(130)] | ++---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ From 5df88c70aa91709d35dc820af890fcbf10be0d19 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 21 Jul 2021 10:34:08 +0200 Subject: [PATCH 19/27] feat: add ability to fetch watermarks from write buffer --- write_buffer/src/core.rs | 191 +++++++++++++++++++++++++++++++++----- write_buffer/src/kafka.rs | 75 +++++++++++---- write_buffer/src/mock.rs | 42 ++++++++- 3 files changed, 264 insertions(+), 44 deletions(-) diff --git a/write_buffer/src/core.rs b/write_buffer/src/core.rs index fdefc76746..f604b80862 100644 --- a/write_buffer/src/core.rs +++ b/write_buffer/src/core.rs @@ -1,6 +1,8 @@ +use std::fmt::Debug; + use async_trait::async_trait; use entry::{Entry, Sequence, SequencedEntry}; -use futures::stream::BoxStream; +use futures::{future::BoxFuture, stream::BoxStream}; /// Generic boxed error type that is used in this crate. /// @@ -10,7 +12,7 @@ pub type WriteBufferError = Box; /// Writing to a Write Buffer takes an [`Entry`] and returns [`Sequence`] data that facilitates reading /// entries from the Write Buffer at a later time. #[async_trait] -pub trait WriteBufferWriting: Sync + Send + std::fmt::Debug + 'static { +pub trait WriteBufferWriting: Sync + Send + Debug + 'static { /// Send an `Entry` to the write buffer using the specified sequencer ID. /// /// Returns information that can be used to restore entries at a later time. @@ -21,17 +23,42 @@ pub trait WriteBufferWriting: Sync + Send + std::fmt::Debug + 'static { ) -> Result; } +pub type FetchHighWatermarkFut<'a> = BoxFuture<'a, Result>; +pub type FetchHighWatermark<'a> = Box FetchHighWatermarkFut<'a>) + Send + Sync>; + /// Output stream of [`WriteBufferReading`]. 
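[Editorial note — not part of this patch: a minimal usage sketch for the `EntryStream` struct and its `fetch_high_watermark` callback introduced in the hunk below. The function name `watch_sequencers`, the error handling, and the `println!` reporting are illustrative assumptions; only `WriteBufferReading::streams`, the `stream` / `fetch_high_watermark` fields, `SequencedEntry::sequence`, and `Sequence::number` are taken from the crate. The lag calculation mirrors the `watermark - sequence_number - 1` formula used by the ingest metrics added later in this series.]

    use futures::StreamExt;
    use write_buffer::core::{WriteBufferError, WriteBufferReading};

    // Read every sequencer's stream once and report how far behind the
    // sequencer's high watermark each consumed entry is.
    async fn watch_sequencers(
        reader: &mut impl WriteBufferReading,
    ) -> Result<(), WriteBufferError> {
        for (sequencer_id, mut entry_stream) in reader.streams() {
            // The watermark is the next sequence number the sequencer will hand
            // out, so it starts at 0 for an empty sequencer.
            let watermark = (entry_stream.fetch_high_watermark)().await?;

            while let Some(sequenced_entry) = entry_stream.stream.next().await {
                let sequenced_entry = sequenced_entry?;
                let sequence = sequenced_entry
                    .sequence()
                    .expect("entry from write buffer must be sequenced");

                // Entries still waiting in the sequencer behind the one just read.
                let lag = watermark
                    .saturating_sub(sequence.number)
                    .saturating_sub(1);
                println!(
                    "sequencer {}: consumed {}, lag {}",
                    sequencer_id, sequence.number, lag
                );
            }
        }
        Ok(())
    }

[End of editorial note. Because `streams()` takes `&mut self`, only one such set of streams can exist per reader at a time, as the trait docs below point out.]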
-pub type EntryStream<'a> = BoxStream<'a, Result>; +pub struct EntryStream<'a> { + /// Stream that produces entries. + pub stream: BoxStream<'a, Result>, + + /// Get high watermark (= what we believe is the next sequence number to be added). + /// + /// Can be used to calculate lag. Note that since the watermark is "next sequence ID number to be added", it starts + /// at 0 and after the entry with sequence number 0 is added to the buffer, it is 1. + pub fetch_high_watermark: FetchHighWatermark<'a>, +} + +impl<'a> Debug for EntryStream<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("EntryStream").finish_non_exhaustive() + } +} /// Produce streams (one per sequencer) of [`SequencedEntry`]s. #[async_trait] -pub trait WriteBufferReading: Sync + Send + std::fmt::Debug + 'static { +pub trait WriteBufferReading: Sync + Send + Debug + 'static { /// Returns a stream per sequencer. + /// + /// Note that due to the mutable borrow, it is not possible to have multiple streams from the same + /// [`WriteBufferReading`] instance at the same time. If all streams are dropped and requested again, the last + /// offsets of the old streams will be the start offsets for the new streams. If you want to prevent that either + /// create a new [`WriteBufferReading`] or use [`seek`](Self::seek). fn streams(&mut self) -> Vec<(u32, EntryStream<'_>)>; /// Seek given sequencer to given sequence number. The next output of related streams will be an entry with at least /// the given sequence number (the actual sequence number might be skipped due to "holes" in the stream). + /// + /// Note that due to the mutable borrow, it is not possible to seek while streams exists. async fn seek( &mut self, sequencer_id: u32, @@ -68,9 +95,11 @@ pub mod test_utils { T: TestAdapter, { test_single_stream_io(&adapter).await; + test_multi_stream_io(&adapter).await; test_multi_sequencer_io(&adapter).await; test_multi_writer_multi_reader(&adapter).await; test_seek(&adapter).await; + test_watermark(&adapter).await; } async fn test_single_stream_io(adapter: &T) @@ -94,23 +123,90 @@ pub mod test_utils { let mut cx = futures::task::Context::from_waker(&waker); // empty stream is pending - assert!(stream.poll_next_unpin(&mut cx).is_pending()); + assert!(stream.stream.poll_next_unpin(&mut cx).is_pending()); // adding content allows us to get results writer.store_entry(&entry_1, sequencer_id).await.unwrap(); - assert_eq!(stream.next().await.unwrap().unwrap().entry(), &entry_1); + assert_eq!( + stream.stream.next().await.unwrap().unwrap().entry(), + &entry_1 + ); // stream is pending again - assert!(stream.poll_next_unpin(&mut cx).is_pending()); + assert!(stream.stream.poll_next_unpin(&mut cx).is_pending()); // adding more data unblocks the stream writer.store_entry(&entry_2, sequencer_id).await.unwrap(); writer.store_entry(&entry_3, sequencer_id).await.unwrap(); - assert_eq!(stream.next().await.unwrap().unwrap().entry(), &entry_2); - assert_eq!(stream.next().await.unwrap().unwrap().entry(), &entry_3); + assert_eq!( + stream.stream.next().await.unwrap().unwrap().entry(), + &entry_2 + ); + assert_eq!( + stream.stream.next().await.unwrap().unwrap().entry(), + &entry_3 + ); // stream is pending again - assert!(stream.poll_next_unpin(&mut cx).is_pending()); + assert!(stream.stream.poll_next_unpin(&mut cx).is_pending()); + } + + async fn test_multi_stream_io(adapter: &T) + where + T: TestAdapter, + { + let context = adapter.new_context(1).await; + + let entry_1 = lp_to_entry("upc user=1 100"); + let 
entry_2 = lp_to_entry("upc user=2 200"); + let entry_3 = lp_to_entry("upc user=3 300"); + + let writer = context.writing(); + let mut reader = context.reading().await; + + let waker = futures::task::noop_waker(); + let mut cx = futures::task::Context::from_waker(&waker); + + writer.store_entry(&entry_1, 0).await.unwrap(); + writer.store_entry(&entry_2, 0).await.unwrap(); + writer.store_entry(&entry_3, 0).await.unwrap(); + + // creating stream, drop stream, re-create it => still starts at first entry + let mut streams = reader.streams(); + assert_eq!(streams.len(), 1); + let (_sequencer_id, stream) = streams.pop().unwrap(); + drop(stream); + drop(streams); + let mut streams = reader.streams(); + assert_eq!(streams.len(), 1); + let (_sequencer_id, mut stream) = streams.pop().unwrap(); + assert_eq!( + stream.stream.next().await.unwrap().unwrap().entry(), + &entry_1 + ); + + // re-creating stream after reading remembers offset + drop(stream); + drop(streams); + let mut streams = reader.streams(); + assert_eq!(streams.len(), 1); + let (_sequencer_id, mut stream) = streams.pop().unwrap(); + assert_eq!( + stream.stream.next().await.unwrap().unwrap().entry(), + &entry_2 + ); + assert_eq!( + stream.stream.next().await.unwrap().unwrap().entry(), + &entry_3 + ); + + // re-creating stream after reading everything makes it pending + drop(stream); + drop(streams); + let mut streams = reader.streams(); + assert_eq!(streams.len(), 1); + let (_sequencer_id, mut stream) = streams.pop().unwrap(); + assert!(stream.stream.poll_next_unpin(&mut cx).is_pending()); } async fn test_multi_sequencer_io(adapter: &T) @@ -136,25 +232,34 @@ pub mod test_utils { let mut cx = futures::task::Context::from_waker(&waker); // empty streams are pending - assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); - assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); + assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending()); + assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending()); // entries arrive at the right target stream writer.store_entry(&entry_1, sequencer_id_1).await.unwrap(); - assert_eq!(stream_1.next().await.unwrap().unwrap().entry(), &entry_1); - assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); + assert_eq!( + stream_1.stream.next().await.unwrap().unwrap().entry(), + &entry_1 + ); + assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending()); writer.store_entry(&entry_2, sequencer_id_2).await.unwrap(); - assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); - assert_eq!(stream_2.next().await.unwrap().unwrap().entry(), &entry_2); + assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending()); + assert_eq!( + stream_2.stream.next().await.unwrap().unwrap().entry(), + &entry_2 + ); writer.store_entry(&entry_3, sequencer_id_1).await.unwrap(); - assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); - assert_eq!(stream_1.next().await.unwrap().unwrap().entry(), &entry_3); + assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending()); + assert_eq!( + stream_1.stream.next().await.unwrap().unwrap().entry(), + &entry_3 + ); // streams are pending again - assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); - assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); + assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending()); + assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending()); } async fn test_multi_writer_multi_reader(adapter: &T) @@ -239,8 +344,8 @@ pub mod test_utils { assert_eq!(streams.len(), 2); let (_sequencer_id, mut stream_1) = streams.pop().unwrap(); let 
(_sequencer_id, mut stream_2) = streams.pop().unwrap(); - assert!(stream_1.poll_next_unpin(&mut cx).is_pending()); - assert!(stream_2.poll_next_unpin(&mut cx).is_pending()); + assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending()); + assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending()); drop(stream_1); drop(stream_2); drop(streams); @@ -249,6 +354,47 @@ pub mod test_utils { reader_1.seek(0, 42).await.unwrap(); } + async fn test_watermark(adapter: &T) + where + T: TestAdapter, + { + let context = adapter.new_context(2).await; + + let entry_east_1 = lp_to_entry("upc,region=east user=1 100"); + let entry_east_2 = lp_to_entry("upc,region=east user=2 200"); + let entry_west_1 = lp_to_entry("upc,region=west user=1 200"); + + let writer = context.writing(); + let mut reader = context.reading().await; + + let mut streams = reader.streams(); + assert_eq!(streams.len(), 2); + let (sequencer_id_1, stream_1) = streams.pop().unwrap(); + let (sequencer_id_2, stream_2) = streams.pop().unwrap(); + + // start at watermark 0 + assert_eq!((stream_1.fetch_high_watermark)().await.unwrap(), 0); + assert_eq!((stream_2.fetch_high_watermark)().await.unwrap(), 0); + + // high water mark moves + writer + .store_entry(&entry_east_1, sequencer_id_1) + .await + .unwrap(); + let mark_1 = writer + .store_entry(&entry_east_2, sequencer_id_1) + .await + .unwrap() + .number; + let mark_2 = writer + .store_entry(&entry_west_1, sequencer_id_2) + .await + .unwrap() + .number; + assert_eq!((stream_1.fetch_high_watermark)().await.unwrap(), mark_1 + 1); + assert_eq!((stream_2.fetch_high_watermark)().await.unwrap(), mark_2 + 1); + } + async fn assert_reader_content(reader: &mut R, expected: &[(u32, &[&Entry])]) where R: WriteBufferReading, @@ -264,6 +410,7 @@ pub mod test_utils { // we need to limit the stream to `expected.len()` elements, otherwise it might be pending forever let mut results: Vec<_> = actual_stream + .stream .take(expected_entries.len()) .try_collect() .await diff --git a/write_buffer/src/kafka.rs b/write_buffer/src/kafka.rs index a32769ae2c..c786f93066 100644 --- a/write_buffer/src/kafka.rs +++ b/write_buffer/src/kafka.rs @@ -8,7 +8,7 @@ use std::{ use async_trait::async_trait; use data_types::server_id::ServerId; use entry::{Entry, Sequence, SequencedEntry}; -use futures::StreamExt; +use futures::{FutureExt, StreamExt}; use observability_deps::tracing::{debug, info}; use rdkafka::{ consumer::{BaseConsumer, Consumer, StreamConsumer}, @@ -18,7 +18,10 @@ use rdkafka::{ ClientConfig, Message, Offset, TopicPartitionList, }; -use crate::core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting}; +use crate::core::{ + EntryStream, FetchHighWatermark, FetchHighWatermarkFut, WriteBufferError, WriteBufferReading, + WriteBufferWriting, +}; pub struct KafkaBufferProducer { conn: String, @@ -112,25 +115,59 @@ impl std::fmt::Debug for KafkaBufferConsumer { #[async_trait] impl WriteBufferReading for KafkaBufferConsumer { fn streams(&mut self) -> Vec<(u32, EntryStream<'_>)> { - self.consumers - .iter() - .map(|(sequencer_id, consumer)| { - let stream = consumer - .stream() - .map(move |message| { - let message = message?; - let entry = Entry::try_from(message.payload().unwrap().to_vec())?; - let sequence = Sequence { - id: message.partition().try_into()?, - number: message.offset().try_into()?, - }; + let mut streams = vec![]; - Ok(SequencedEntry::new_from_sequence(sequence, entry)?) 
+ for (sequencer_id, consumer) in &self.consumers { + let sequencer_id = *sequencer_id; + let consumer_cloned = Arc::clone(consumer); + let database_name = self.database_name.clone(); + + let stream = consumer + .stream() + .map(move |message| { + let message = message?; + let entry = Entry::try_from(message.payload().unwrap().to_vec())?; + let sequence = Sequence { + id: message.partition().try_into()?, + number: message.offset().try_into()?, + }; + + Ok(SequencedEntry::new_from_sequence(sequence, entry)?) + }) + .boxed(); + + let fetch_high_watermark = move || { + let consumer_cloned = Arc::clone(&consumer_cloned); + let database_name = database_name.clone(); + + let fut = async move { + let (_low, high) = tokio::task::spawn_blocking(move || { + consumer_cloned.fetch_watermarks( + &database_name, + sequencer_id as i32, + Duration::from_secs(60), + ) }) - .boxed(); - (*sequencer_id, stream) - }) - .collect() + .await + .expect("subtask failed")?; + + Ok(high as u64) + }; + + fut.boxed() as FetchHighWatermarkFut<'_> + }; + let fetch_high_watermark = Box::new(fetch_high_watermark) as FetchHighWatermark<'_>; + + streams.push(( + sequencer_id, + EntryStream { + stream, + fetch_high_watermark, + }, + )); + } + + streams } async fn seek( diff --git a/write_buffer/src/mock.rs b/write_buffer/src/mock.rs index 37659ba05b..a67000633d 100644 --- a/write_buffer/src/mock.rs +++ b/write_buffer/src/mock.rs @@ -2,10 +2,13 @@ use std::{collections::BTreeMap, sync::Arc, task::Poll}; use async_trait::async_trait; use entry::{Entry, Sequence, SequencedEntry}; -use futures::{stream, StreamExt}; +use futures::{stream, FutureExt, StreamExt}; use parking_lot::Mutex; -use crate::core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting}; +use crate::core::{ + EntryStream, FetchHighWatermark, FetchHighWatermarkFut, WriteBufferError, WriteBufferReading, + WriteBufferWriting, +}; type EntryResVec = Vec>; @@ -244,7 +247,40 @@ impl WriteBufferReading for MockBufferForReading { Poll::Pending }) .boxed(); - streams.push((sequencer_id, stream)); + + let shared_state = self.shared_state.clone(); + + let fetch_high_watermark = move || { + let shared_state = shared_state.clone(); + + let fut = async move { + let entries = shared_state.entries.lock(); + let entry_vec = entries.get(&sequencer_id).unwrap(); + let watermark = entry_vec + .iter() + .filter_map(|entry_res| { + entry_res + .as_ref() + .ok() + .map(|entry| entry.sequence().unwrap().number) + }) + .max() + .map(|n| n + 1) + .unwrap_or(0); + + Ok(watermark) + }; + fut.boxed() as FetchHighWatermarkFut<'_> + }; + let fetch_high_watermark = Box::new(fetch_high_watermark) as FetchHighWatermark<'_>; + + streams.push(( + sequencer_id, + EntryStream { + stream, + fetch_high_watermark, + }, + )); } streams From fb931bb1ca5116a8186e922fc7044d2a95e6de5e Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 21 Jul 2021 11:59:38 +0200 Subject: [PATCH 20/27] feat: write buffer ingestion metrics --- server/src/db.rs | 308 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 277 insertions(+), 31 deletions(-) diff --git a/server/src/db.rs b/server/src/db.rs index f94eb4b28c..4a0aa076b8 100644 --- a/server/src/db.rs +++ b/server/src/db.rs @@ -50,7 +50,7 @@ use std::{ time::{Duration, Instant}, }; use write_buffer::config::WriteBufferConfig; -use write_buffer::core::WriteBufferError; +use write_buffer::core::{FetchHighWatermark, WriteBufferError}; pub mod access; pub mod catalog; @@ -144,6 +144,94 @@ pub enum Error { pub type Result = 
std::result::Result; +/// Metrics for data ingest. +#[derive(Debug)] +struct IngestMetrics { + /// Metrics domain + domain: Arc, +} + +impl IngestMetrics { + fn new(domain: Arc) -> Self { + Self { domain } + } + + fn new_sequencer_metrics(&self, sequencer_id: u32) -> SequencerMetrics { + let labels = vec![KeyValue::new("sequencer_id", sequencer_id.to_string())]; + + let red = self + .domain + .register_red_metric_with_labels(Some("write_buffer"), labels.clone()); + let bytes_read = self.domain.register_counter_metric_with_labels( + "read", + Some("bytes"), + "Bytes read from sequencer", + labels.clone(), + ); + let watermark_iox = self.domain.register_gauge_metric_with_labels( + "watermark_iox", + None, + "High watermark of IOx (aka next sequence number that will be ingested)", + &labels, + ); + let watermark_sequencer = self.domain.register_gauge_metric_with_labels( + "watermark_sequencer", + None, + "High watermark of the sequencer (aka next sequence number that will be added)", + &labels, + ); + let last_min_ts = self.domain.register_gauge_metric_with_labels( + "last_min_ts", + None, + "Minimum unix timestamp of last write as unix timestamp in nanoseconds", + &labels, + ); + let last_max_ts = self.domain.register_gauge_metric_with_labels( + "last_max_ts", + None, + "Maximum unix timestamp of last write as unix timestamp in nanoseconds", + &labels, + ); + + SequencerMetrics { + red, + bytes_read, + watermark_iox, + watermark_sequencer, + last_min_ts, + last_max_ts, + } + } +} + +/// Metrics for a single sequencer. +#[derive(Debug)] +struct SequencerMetrics { + /// Metrics for tracking ingest. + red: metrics::RedMetric, + + /// Bytes read from sequencer. + /// + /// This metrics is independent of the success / error state of the entries. + bytes_read: metrics::Counter, + + /// Watermark of ingested data. + /// + /// This represents the next sequence number that will be ingested. + watermark_iox: metrics::Gauge, + + /// Watermark of to-be-ingested data. + /// + /// This represents the next sequence number that will be added to the sequencer. + watermark_sequencer: metrics::Gauge, + + /// Minimum unix timestamp of last write as unix timestamp in nanoseconds. + last_min_ts: metrics::Gauge, + + /// Maximum unix timestamp of last write as unix timestamp in nanoseconds. + last_max_ts: metrics::Gauge, +} + /// This is the main IOx Database object. 
It is the root object of any /// specific InfluxDB IOx instance /// @@ -248,8 +336,8 @@ pub struct Db { /// Metric labels metric_labels: Vec, - /// Metrics for tracking the number of errors that occur while ingesting data - ingest_errors: metrics::Counter, + /// Ingest metrics + ingest_metrics: IngestMetrics, /// Optionally connect to a write buffer for either buffering writes or reading buffered writes write_buffer: Option, @@ -286,8 +374,7 @@ impl Db { let ingest_domain = metrics_registry.register_domain_with_labels("ingest", metric_labels.clone()); - let ingest_errors = - ingest_domain.register_counter_metric("errors", None, "Number of errors during ingest"); + let ingest_metrics = IngestMetrics::new(Arc::new(ingest_domain)); let catalog = Arc::new(database_to_commit.catalog); @@ -316,7 +403,7 @@ impl Db { worker_iterations_lifecycle: AtomicUsize::new(0), worker_iterations_cleanup: AtomicUsize::new(0), metric_labels, - ingest_errors, + ingest_metrics, write_buffer: database_to_commit.write_buffer, cleanup_lock: Default::default(), } @@ -687,8 +774,13 @@ impl Db { .try_lock() .expect("no streams should exist at this point"); let mut futures = vec![]; - for (_sequencer_id, stream) in write_buffer.streams() { - let fut = self.stream_in_sequenced_entries(stream); + for (sequencer_id, stream) in write_buffer.streams() { + let metrics = self.ingest_metrics.new_sequencer_metrics(sequencer_id); + let fut = self.stream_in_sequenced_entries( + stream.stream, + stream.fetch_high_watermark, + metrics, + ); futures.push(fut); } @@ -705,32 +797,116 @@ impl Db { /// This is used to take entries from a `Stream` and put them in the mutable buffer, such as /// streaming entries from a write buffer. - async fn stream_in_sequenced_entries( - &self, - stream: BoxStream<'_, Result>, + async fn stream_in_sequenced_entries<'a>( + &'a self, + mut stream: BoxStream<'a, Result>, + f_mark: FetchHighWatermark<'a>, + mut metrics: SequencerMetrics, ) { - stream - .for_each(|sequenced_entry_result| async { - let sequenced_entry = match sequenced_entry_result { - Ok(sequenced_entry) => sequenced_entry, - Err(e) => { - debug!(?e, "Error converting write buffer data to SequencedEntry"); - self.ingest_errors.add(1); - return; - } - }; + let mut last_watermark_update: Option = None; - let sequenced_entry = Arc::new(sequenced_entry); + while let Some(sequenced_entry_result) = stream.next().await { + let red_observation = metrics.red.observation(); - if let Err(e) = self.store_sequenced_entry(sequenced_entry) { + // get entry from sequencer + let sequenced_entry = match sequenced_entry_result { + Ok(sequenced_entry) => sequenced_entry, + Err(e) => { + debug!(?e, "Error converting write buffer data to SequencedEntry"); + red_observation.client_error(); + continue; + } + }; + let sequenced_entry = Arc::new(sequenced_entry); + + // store entry + match self.store_sequenced_entry(Arc::clone(&sequenced_entry)) { + Ok(_) => { + red_observation.ok(); + } + Err(e) => { debug!( ?e, "Error storing SequencedEntry from write buffer in database" ); - self.ingest_errors.add(1); + red_observation.error(); } - }) - .await + } + + // update: + // - bytes read + // - iox watermark + // - min ts + // - max ts + let sequence = sequenced_entry + .sequence() + .expect("entry from write buffer must be sequenced"); + let entry = sequenced_entry.entry(); + metrics + .watermark_iox + .set((sequence.number + 1) as usize, &[]); + metrics.bytes_read.add(entry.data().len() as u64); + if let Some(min_ts) = entry + .partition_writes() + 
.map(|partition_writes| { + partition_writes + .iter() + .filter_map(|partition_write| { + partition_write + .table_batches() + .iter() + .filter_map(|table_batch| table_batch.min_max_time().ok()) + .map(|(min, _max)| min) + .max() + }) + .min() + }) + .flatten() + { + metrics + .last_min_ts + .set(min_ts.timestamp_nanos() as usize, &[]); + } + if let Some(max_ts) = entry + .partition_writes() + .map(|partition_writes| { + partition_writes + .iter() + .filter_map(|partition_write| { + partition_write + .table_batches() + .iter() + .filter_map(|table_batch| table_batch.min_max_time().ok()) + .map(|(_min, max)| max) + .max() + }) + .max() + }) + .flatten() + { + metrics + .last_max_ts + .set(max_ts.timestamp_nanos() as usize, &[]); + } + + // maybe update sequencer watermark + // We are not updating this watermark every round because asking the sequencer for that watermark can be + // quite expensive. + if last_watermark_update + .map(|ts| ts.elapsed() > Duration::from_secs(60)) + .unwrap_or(true) + { + match f_mark().await { + Ok(watermark) => { + metrics.watermark_sequencer.set(watermark as usize, &[]); + } + Err(e) => { + debug!(%e, "Error while reading sequencer watermark") + } + } + last_watermark_update = Some(Instant::now()); + } + } } async fn cleanup_unreferenced_parquet_files( @@ -1244,13 +1420,13 @@ mod tests { .push_entry(SequencedEntry::new_from_sequence(Sequence::new(0, 0), entry).unwrap()); let write_buffer = MockBufferForReading::new(write_buffer_state); - let db = TestDb::builder() + let test_db = TestDb::builder() .write_buffer(WriteBufferConfig::Reading(Arc::new( tokio::sync::Mutex::new(Box::new(write_buffer) as _), ))) .build() - .await - .db; + .await; + let db = test_db.db; // do: start background task loop let shutdown: CancellationToken = Default::default(); @@ -1279,6 +1455,71 @@ mod tests { tokio::time::sleep(Duration::from_millis(100)).await; } + // check: metrics + // We need to do that BEFORE shutting down the background loop because gauges would be dropped and resetted otherwise + let metrics = test_db.metric_registry; + metrics + .has_metric_family("ingest_write_buffer_requests_total") + .with_labels(&[ + ("db_name", "placeholder"), + ("svr_id", "1"), + ("sequencer_id", "0"), + ("status", "ok"), + ]) + .counter() + .eq(1.0) + .unwrap(); + metrics + .has_metric_family("ingest_read_bytes_total") + .with_labels(&[ + ("db_name", "placeholder"), + ("svr_id", "1"), + ("sequencer_id", "0"), + ]) + .counter() + .eq(256.0) + .unwrap(); + metrics + .has_metric_family("ingest_watermark_iox") + .with_labels(&[ + ("db_name", "placeholder"), + ("svr_id", "1"), + ("sequencer_id", "0"), + ]) + .gauge() + .eq(1.0) + .unwrap(); + metrics + .has_metric_family("ingest_watermark_sequencer") + .with_labels(&[ + ("db_name", "placeholder"), + ("svr_id", "1"), + ("sequencer_id", "0"), + ]) + .gauge() + .eq(1.0) + .unwrap(); + metrics + .has_metric_family("ingest_last_min_ts") + .with_labels(&[ + ("db_name", "placeholder"), + ("svr_id", "1"), + ("sequencer_id", "0"), + ]) + .gauge() + .eq(10.0) + .unwrap(); + metrics + .has_metric_family("ingest_last_max_ts") + .with_labels(&[ + ("db_name", "placeholder"), + ("svr_id", "1"), + ("sequencer_id", "0"), + ]) + .gauge() + .eq(10.0) + .unwrap(); + // do: stop background task loop shutdown.cancel(); join_handle.await.unwrap(); @@ -1325,11 +1566,16 @@ mod tests { // check: after a while the error should be reported in the database's metrics let t_0 = Instant::now(); loop { - let family = metrics.try_has_metric_family("ingest_errors_total"); + 
let family = metrics.try_has_metric_family("ingest_write_buffer_requests_total"); if let Ok(metric) = family { if metric - .with_labels(&[("db_name", "placeholder"), ("svr_id", "1")]) + .with_labels(&[ + ("db_name", "placeholder"), + ("svr_id", "1"), + ("sequencer_id", "0"), + ("status", "client_error"), + ]) .counter() .eq(1.0) .is_ok() From ffe6e62aeef5540ccf4cbca15a04cd92008b7635 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Wed, 21 Jul 2021 12:43:27 +0100 Subject: [PATCH 21/27] feat: add instant to datetime conversion (#2078) * feat: add instant to datetime conversion * chore: review feedback --- Cargo.lock | 1 + data_types/Cargo.toml | 1 + data_types/src/instant.rs | 53 ++++++++++++++++ data_types/src/lib.rs | 3 +- .../src/persistence_windows.rs | 62 +++++++------------ 5 files changed, 78 insertions(+), 42 deletions(-) create mode 100644 data_types/src/instant.rs diff --git a/Cargo.lock b/Cargo.lock index cbc1081590..ec6ccbe445 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -826,6 +826,7 @@ dependencies = [ "influxdb_line_protocol", "num_cpus", "observability_deps", + "once_cell", "percent-encoding", "regex", "serde", diff --git a/data_types/Cargo.toml b/data_types/Cargo.toml index 197174f14e..fd145052f9 100644 --- a/data_types/Cargo.toml +++ b/data_types/Cargo.toml @@ -15,6 +15,7 @@ regex = "1.4" serde = { version = "1.0", features = ["rc", "derive"] } snafu = "0.6" observability_deps = { path = "../observability_deps" } +once_cell = { version = "1.4.0", features = ["parking_lot"] } [dev-dependencies] # In alphabetical order test_helpers = { path = "../test_helpers" } diff --git a/data_types/src/instant.rs b/data_types/src/instant.rs new file mode 100644 index 0000000000..807bcbba49 --- /dev/null +++ b/data_types/src/instant.rs @@ -0,0 +1,53 @@ +use chrono::{DateTime, Utc}; +use once_cell::sync::OnceCell; +use std::time::Instant; + +/// Stores an Instant and DateTime captured as close as possible together +static INSTANCE: OnceCell<(DateTime, Instant)> = OnceCell::new(); + +/// Provides a conversion from Instant to DateTime for display purposes +/// +/// It is an approximation as if the system clock changes, the returned DateTime will not be +/// the same as the DateTime that would have been recorded at the time the Instant was created. +/// +/// The conversion does, however, preserve the monotonic property of Instant, i.e. a larger +/// Instant will have a larger returned DateTime. 
+/// +/// This should ONLY be used for display purposes, the results should not be used to +/// drive logic, nor persisted +pub fn to_approximate_datetime(instant: Instant) -> DateTime { + let (ref_date, ref_instant) = *INSTANCE.get_or_init(|| (Utc::now(), Instant::now())); + + if ref_instant > instant { + ref_date + - chrono::Duration::from_std(ref_instant.duration_since(instant)) + .expect("date overflow") + } else { + ref_date + + chrono::Duration::from_std(instant.duration_since(ref_instant)) + .expect("date overflow") + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_to_datetime() { + // Seed global state + to_approximate_datetime(Instant::now()); + + let (ref_date, ref_instant) = *INSTANCE.get().unwrap(); + + assert_eq!( + to_approximate_datetime(ref_instant + std::time::Duration::from_nanos(78)), + ref_date + chrono::Duration::nanoseconds(78) + ); + + assert_eq!( + to_approximate_datetime(ref_instant - std::time::Duration::from_nanos(23)), + ref_date - chrono::Duration::nanoseconds(23) + ); + } +} diff --git a/data_types/src/lib.rs b/data_types/src/lib.rs index 76d7ca0306..f222aad0ff 100644 --- a/data_types/src/lib.rs +++ b/data_types/src/lib.rs @@ -13,13 +13,14 @@ pub mod chunk_metadata; pub mod consistent_hasher; mod database_name; -pub use database_name::*; pub mod database_rules; pub mod database_state; pub mod error; +pub mod instant; pub mod job; pub mod names; pub mod partition_metadata; pub mod server_id; pub mod timestamp; pub mod write_summary; +pub use database_name::*; diff --git a/persistence_windows/src/persistence_windows.rs b/persistence_windows/src/persistence_windows.rs index 957d034d06..8cb97d0694 100644 --- a/persistence_windows/src/persistence_windows.rs +++ b/persistence_windows/src/persistence_windows.rs @@ -13,6 +13,7 @@ use internal_types::guard::{ReadGuard, ReadLock}; use crate::checkpoint::PartitionCheckpoint; use crate::min_max_sequence::MinMaxSequence; +use data_types::instant::to_approximate_datetime; const DEFAULT_CLOSED_WINDOW_PERIOD: Duration = Duration::from_secs(30); @@ -45,15 +46,8 @@ pub struct PersistenceWindows { late_arrival_period: Duration, closed_window_period: Duration, - /// The datetime this PersistenceWindows was created - /// - /// `PersistenceWindows` internally uses monotonic `Instant`, however, - /// these cannot be rendered. 
To provide a stable rendering of Wall timestamp, - /// a single timestamp is recorded at creation time - created_at_time: DateTime, - /// The instant this PersistenceWindows was created - created_at_instant: Instant, + created_at: Instant, /// The last instant passed to PersistenceWindows::add_range last_instant: Instant, @@ -116,7 +110,6 @@ impl PersistenceWindows { let closed_window_count = late_arrival_seconds / closed_window_seconds; - let created_at_time = Utc::now(); let created_at_instant = Instant::now(); Self { @@ -126,8 +119,7 @@ impl PersistenceWindows { addr, late_arrival_period, closed_window_period, - created_at_time, - created_at_instant, + created_at: created_at_instant, last_instant: created_at_instant, max_sequence_numbers: Default::default(), } @@ -362,25 +354,12 @@ impl PersistenceWindows { /// These are approximate because persistence may partially flush a window, which will /// update the min row timestamp but not the row count pub fn summaries(&self) -> impl Iterator + '_ { - self.windows().map(move |window| { - let window_age = chrono::Duration::from_std( - window.created_at.duration_since(self.created_at_instant), - ) - .expect("duration overflow"); - - let time_of_first_write = self.created_at_time + window_age; - - let window_duration = - chrono::Duration::from_std(window.last_instant.duration_since(window.created_at)) - .expect("duration overflow"); - - WriteSummary { - time_of_first_write, - time_of_last_write: time_of_first_write + window_duration, - min_timestamp: window.min_time, - max_timestamp: window.max_time, - row_count: window.row_count, - } + self.windows().map(move |window| WriteSummary { + time_of_first_write: to_approximate_datetime(window.created_at), + time_of_last_write: to_approximate_datetime(window.last_instant), + min_timestamp: window.min_time, + max_timestamp: window.max_time, + row_count: window.row_count, }) } @@ -1333,7 +1312,8 @@ mod tests { fn test_summaries() { let late_arrival_period = Duration::from_secs(100); let mut w = make_windows(late_arrival_period); - let instant = w.created_at_instant; + let instant = w.created_at; + let created_at_time = to_approximate_datetime(w.created_at); // Window 1 w.add_range( @@ -1387,17 +1367,17 @@ mod tests { summaries, vec![ WriteSummary { - time_of_first_write: w.created_at_time + chrono::Duration::milliseconds(1), - time_of_last_write: w.created_at_time + chrono::Duration::milliseconds(50), + time_of_first_write: created_at_time + chrono::Duration::milliseconds(1), + time_of_last_write: created_at_time + chrono::Duration::milliseconds(50), min_timestamp: Utc.timestamp_nanos(1), max_timestamp: Utc.timestamp_nanos(340), row_count: 21 }, WriteSummary { - time_of_first_write: w.created_at_time + time_of_first_write: created_at_time + closed_duration + chrono::Duration::milliseconds(1), - time_of_last_write: w.created_at_time + time_of_last_write: created_at_time + closed_duration + chrono::Duration::milliseconds(1), min_timestamp: Utc.timestamp_nanos(89), @@ -1405,8 +1385,8 @@ mod tests { row_count: 3 }, WriteSummary { - time_of_first_write: w.created_at_time + closed_duration * 3, - time_of_last_write: w.created_at_time + closed_duration * 3, + time_of_first_write: created_at_time + closed_duration * 3, + time_of_last_write: created_at_time + closed_duration * 3, min_timestamp: Utc.timestamp_nanos(3), max_timestamp: Utc.timestamp_nanos(4), row_count: 8 @@ -1424,8 +1404,8 @@ mod tests { summaries, vec![ WriteSummary { - time_of_first_write: w.created_at_time + chrono::Duration::milliseconds(1), - 
time_of_last_write: w.created_at_time + time_of_first_write: created_at_time + chrono::Duration::milliseconds(1), + time_of_last_write: created_at_time + closed_duration + chrono::Duration::milliseconds(1), min_timestamp: Utc.timestamp_nanos(1), @@ -1433,8 +1413,8 @@ mod tests { row_count: 24 }, WriteSummary { - time_of_first_write: w.created_at_time + closed_duration * 3, - time_of_last_write: w.created_at_time + closed_duration * 3, + time_of_first_write: created_at_time + closed_duration * 3, + time_of_last_write: created_at_time + closed_duration * 3, min_timestamp: Utc.timestamp_nanos(3), max_timestamp: Utc.timestamp_nanos(4), row_count: 8 From 7d597d1d5c7ec5989429d0cc4c0b0beee3ed35ff Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 21 Jul 2021 13:57:53 +0200 Subject: [PATCH 22/27] refactor: make ingest metrics easier to understand --- server/src/db.rs | 116 ++++++++++++++++++++++++++--------------------- 1 file changed, 64 insertions(+), 52 deletions(-) diff --git a/server/src/db.rs b/server/src/db.rs index 4a0aa076b8..8f74114b08 100644 --- a/server/src/db.rs +++ b/server/src/db.rs @@ -168,16 +168,16 @@ impl IngestMetrics { "Bytes read from sequencer", labels.clone(), ); - let watermark_iox = self.domain.register_gauge_metric_with_labels( - "watermark_iox", + let last_sequence_number = self.domain.register_gauge_metric_with_labels( + "last_sequence_number", None, - "High watermark of IOx (aka next sequence number that will be ingested)", + "Last consumed sequence number (e.g. Kafka offset)", &labels, ); - let watermark_sequencer = self.domain.register_gauge_metric_with_labels( - "watermark_sequencer", + let sequence_number_lag = self.domain.register_gauge_metric_with_labels( + "sequence_number_lag", None, - "High watermark of the sequencer (aka next sequence number that will be added)", + "The difference between the last consumed sequence number (e.g. Kafka offset) and the last sequence number available", &labels, ); let last_min_ts = self.domain.register_gauge_metric_with_labels( @@ -196,8 +196,8 @@ impl IngestMetrics { SequencerMetrics { red, bytes_read, - watermark_iox, - watermark_sequencer, + last_sequence_number, + sequence_number_lag, last_min_ts, last_max_ts, } @@ -215,15 +215,12 @@ struct SequencerMetrics { /// This metrics is independent of the success / error state of the entries. bytes_read: metrics::Counter, - /// Watermark of ingested data. - /// - /// This represents the next sequence number that will be ingested. - watermark_iox: metrics::Gauge, + /// Last consumed sequence number (e.g. Kafka offset). + last_sequence_number: metrics::Gauge, - /// Watermark of to-be-ingested data. - /// - /// This represents the next sequence number that will be added to the sequencer. - watermark_sequencer: metrics::Gauge, + /// The difference between the last consumed sequence number (e.g. Kafka offset) and the last sequence number + /// available. + sequence_number_lag: metrics::Gauge, /// Minimum unix timestamp of last write as unix timestamp in nanoseconds. 
last_min_ts: metrics::Gauge, @@ -803,7 +800,8 @@ impl Db { f_mark: FetchHighWatermark<'a>, mut metrics: SequencerMetrics, ) { - let mut last_watermark_update: Option = None; + let mut watermark_last_updated: Option = None; + let mut watermark = 0; while let Some(sequenced_entry_result) = stream.next().await { let red_observation = metrics.red.observation(); @@ -833,19 +831,42 @@ impl Db { } } + // maybe update sequencer watermark + // We are not updating this watermark every round because asking the sequencer for that watermark can be + // quite expensive. + if watermark_last_updated + .map(|ts| ts.elapsed() > Duration::from_secs(60)) + .unwrap_or(true) + { + match f_mark().await { + Ok(w) => { + watermark = w; + } + Err(e) => { + debug!(%e, "Error while reading sequencer watermark") + } + } + watermark_last_updated = Some(Instant::now()); + } + // update: // - bytes read - // - iox watermark + // - last sequence number + // - lag // - min ts // - max ts let sequence = sequenced_entry .sequence() .expect("entry from write buffer must be sequenced"); let entry = sequenced_entry.entry(); - metrics - .watermark_iox - .set((sequence.number + 1) as usize, &[]); metrics.bytes_read.add(entry.data().len() as u64); + metrics + .last_sequence_number + .set(sequence.number as usize, &[]); + metrics.sequence_number_lag.set( + watermark.saturating_sub(sequence.number).saturating_sub(1) as usize, + &[], + ); if let Some(min_ts) = entry .partition_writes() .map(|partition_writes| { @@ -888,24 +909,6 @@ impl Db { .last_max_ts .set(max_ts.timestamp_nanos() as usize, &[]); } - - // maybe update sequencer watermark - // We are not updating this watermark every round because asking the sequencer for that watermark can be - // quite expensive. - if last_watermark_update - .map(|ts| ts.elapsed() > Duration::from_secs(60)) - .unwrap_or(true) - { - match f_mark().await { - Ok(watermark) => { - metrics.watermark_sequencer.set(watermark as usize, &[]); - } - Err(e) => { - debug!(%e, "Error while reading sequencer watermark") - } - } - last_watermark_update = Some(Instant::now()); - } } } @@ -1414,10 +1417,18 @@ mod tests { #[tokio::test] async fn read_from_write_buffer_write_to_mutable_buffer() { - let entry = lp_to_entry("cpu bar=1 10"); let write_buffer_state = MockBufferSharedState::empty_with_n_sequencers(1); - write_buffer_state - .push_entry(SequencedEntry::new_from_sequence(Sequence::new(0, 0), entry).unwrap()); + write_buffer_state.push_entry( + SequencedEntry::new_from_sequence(Sequence::new(0, 0), lp_to_entry("mem foo=1 10")) + .unwrap(), + ); + write_buffer_state.push_entry( + SequencedEntry::new_from_sequence( + Sequence::new(0, 7), + lp_to_entry("cpu bar=2 20\ncpu bar=3 30"), + ) + .unwrap(), + ); let write_buffer = MockBufferForReading::new(write_buffer_state); let test_db = TestDb::builder() @@ -1467,7 +1478,7 @@ mod tests { ("status", "ok"), ]) .counter() - .eq(1.0) + .eq(2.0) .unwrap(); metrics .has_metric_family("ingest_read_bytes_total") @@ -1477,27 +1488,27 @@ mod tests { ("sequencer_id", "0"), ]) .counter() - .eq(256.0) + .eq(528.0) .unwrap(); metrics - .has_metric_family("ingest_watermark_iox") + .has_metric_family("ingest_last_sequence_number") .with_labels(&[ ("db_name", "placeholder"), ("svr_id", "1"), ("sequencer_id", "0"), ]) .gauge() - .eq(1.0) + .eq(7.0) .unwrap(); metrics - .has_metric_family("ingest_watermark_sequencer") + .has_metric_family("ingest_sequence_number_lag") .with_labels(&[ ("db_name", "placeholder"), ("svr_id", "1"), ("sequencer_id", "0"), ]) .gauge() - .eq(1.0) + 
.eq(0.0) .unwrap(); metrics .has_metric_family("ingest_last_min_ts") @@ -1507,7 +1518,7 @@ mod tests { ("sequencer_id", "0"), ]) .gauge() - .eq(10.0) + .eq(20.0) .unwrap(); metrics .has_metric_family("ingest_last_max_ts") @@ -1517,7 +1528,7 @@ mod tests { ("sequencer_id", "0"), ]) .gauge() - .eq(10.0) + .eq(30.0) .unwrap(); // do: stop background task loop @@ -1525,13 +1536,14 @@ mod tests { join_handle.await.unwrap(); // check: the expected results should be there - let batches = run_query(db, "select * from cpu").await; + let batches = run_query(db, "select * from cpu order by time").await; let expected = vec![ "+-----+-------------------------------+", "| bar | time |", "+-----+-------------------------------+", - "| 1 | 1970-01-01 00:00:00.000000010 |", + "| 2 | 1970-01-01 00:00:00.000000020 |", + "| 3 | 1970-01-01 00:00:00.000000030 |", "+-----+-------------------------------+", ]; assert_batches_eq!(expected, &batches); From 4d5f2090306fd737a026d1bb53ee3c347553f408 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 21 Jul 2021 14:59:07 +0200 Subject: [PATCH 23/27] docs: do not repeat unix that often --- server/src/db.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/server/src/db.rs b/server/src/db.rs index 8f74114b08..eff1e589d2 100644 --- a/server/src/db.rs +++ b/server/src/db.rs @@ -183,13 +183,13 @@ impl IngestMetrics { let last_min_ts = self.domain.register_gauge_metric_with_labels( "last_min_ts", None, - "Minimum unix timestamp of last write as unix timestamp in nanoseconds", + "Minimum timestamp of last write as unix timestamp in nanoseconds", &labels, ); let last_max_ts = self.domain.register_gauge_metric_with_labels( "last_max_ts", None, - "Maximum unix timestamp of last write as unix timestamp in nanoseconds", + "Maximum timestamp of last write as unix timestamp in nanoseconds", &labels, ); @@ -222,10 +222,10 @@ struct SequencerMetrics { /// available. sequence_number_lag: metrics::Gauge, - /// Minimum unix timestamp of last write as unix timestamp in nanoseconds. + /// Minimum timestamp of last write as unix timestamp in nanoseconds. last_min_ts: metrics::Gauge, - /// Maximum unix timestamp of last write as unix timestamp in nanoseconds. + /// Maximum timestamp of last write as unix timestamp in nanoseconds. last_max_ts: metrics::Gauge, } From 2f1efcf517f7504d728f6faddbf296bdc0e5ea26 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 21 Jul 2021 15:00:53 +0200 Subject: [PATCH 24/27] docs: clarify difference --- server/src/db.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/server/src/db.rs b/server/src/db.rs index eff1e589d2..949ba5225d 100644 --- a/server/src/db.rs +++ b/server/src/db.rs @@ -177,7 +177,7 @@ impl IngestMetrics { let sequence_number_lag = self.domain.register_gauge_metric_with_labels( "sequence_number_lag", None, - "The difference between the last consumed sequence number (e.g. Kafka offset) and the last sequence number available", + "The difference between the the last sequence number available (e.g. Kafka offset) and (= minus) last consumed sequence number", &labels, ); let last_min_ts = self.domain.register_gauge_metric_with_labels( @@ -218,8 +218,8 @@ struct SequencerMetrics { /// Last consumed sequence number (e.g. Kafka offset). last_sequence_number: metrics::Gauge, - /// The difference between the last consumed sequence number (e.g. Kafka offset) and the last sequence number - /// available. + // The difference between the the last sequence number available (e.g. 
Kafka offset) and (= minus) last consumed + // sequence number. sequence_number_lag: metrics::Gauge, /// Minimum timestamp of last write as unix timestamp in nanoseconds. From fd00206fbbd43cbcff5e743a7ca8fcbb798c733d Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 21 Jul 2021 15:02:48 +0200 Subject: [PATCH 25/27] refactor: increase watermark update frequence to once per 10s --- server/src/db.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/db.rs b/server/src/db.rs index 949ba5225d..216b99effb 100644 --- a/server/src/db.rs +++ b/server/src/db.rs @@ -835,7 +835,7 @@ impl Db { // We are not updating this watermark every round because asking the sequencer for that watermark can be // quite expensive. if watermark_last_updated - .map(|ts| ts.elapsed() > Duration::from_secs(60)) + .map(|ts| ts.elapsed() > Duration::from_secs(10)) .unwrap_or(true) { match f_mark().await { From cddf94653cb741649f347c128e4b6759d485899c Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 21 Jul 2021 15:07:59 +0200 Subject: [PATCH 26/27] refactor: use `write_buffer` subsystem for ingest metrics --- server/src/db.rs | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/server/src/db.rs b/server/src/db.rs index 216b99effb..6dee5446b1 100644 --- a/server/src/db.rs +++ b/server/src/db.rs @@ -144,14 +144,14 @@ pub enum Error { pub type Result = std::result::Result; -/// Metrics for data ingest. +/// Metrics for data ingest via write buffer. #[derive(Debug)] -struct IngestMetrics { +struct WriteBufferIngestMetrics { /// Metrics domain domain: Arc, } -impl IngestMetrics { +impl WriteBufferIngestMetrics { fn new(domain: Arc) -> Self { Self { domain } } @@ -161,7 +161,7 @@ impl IngestMetrics { let red = self .domain - .register_red_metric_with_labels(Some("write_buffer"), labels.clone()); + .register_red_metric_with_labels(Some("ingest"), labels.clone()); let bytes_read = self.domain.register_counter_metric_with_labels( "read", Some("bytes"), @@ -334,7 +334,7 @@ pub struct Db { metric_labels: Vec, /// Ingest metrics - ingest_metrics: IngestMetrics, + ingest_metrics: WriteBufferIngestMetrics, /// Optionally connect to a write buffer for either buffering writes or reading buffered writes write_buffer: Option, @@ -370,8 +370,8 @@ impl Db { let metric_labels = database_to_commit.catalog.metric_labels.clone(); let ingest_domain = - metrics_registry.register_domain_with_labels("ingest", metric_labels.clone()); - let ingest_metrics = IngestMetrics::new(Arc::new(ingest_domain)); + metrics_registry.register_domain_with_labels("write_buffer", metric_labels.clone()); + let ingest_metrics = WriteBufferIngestMetrics::new(Arc::new(ingest_domain)); let catalog = Arc::new(database_to_commit.catalog); @@ -1470,7 +1470,7 @@ mod tests { // We need to do that BEFORE shutting down the background loop because gauges would be dropped and resetted otherwise let metrics = test_db.metric_registry; metrics - .has_metric_family("ingest_write_buffer_requests_total") + .has_metric_family("write_buffer_ingest_requests_total") .with_labels(&[ ("db_name", "placeholder"), ("svr_id", "1"), @@ -1481,7 +1481,7 @@ mod tests { .eq(2.0) .unwrap(); metrics - .has_metric_family("ingest_read_bytes_total") + .has_metric_family("write_buffer_read_bytes_total") .with_labels(&[ ("db_name", "placeholder"), ("svr_id", "1"), @@ -1491,7 +1491,7 @@ mod tests { .eq(528.0) .unwrap(); metrics - .has_metric_family("ingest_last_sequence_number") + 
.has_metric_family("write_buffer_last_sequence_number") .with_labels(&[ ("db_name", "placeholder"), ("svr_id", "1"), @@ -1501,7 +1501,7 @@ mod tests { .eq(7.0) .unwrap(); metrics - .has_metric_family("ingest_sequence_number_lag") + .has_metric_family("write_buffer_sequence_number_lag") .with_labels(&[ ("db_name", "placeholder"), ("svr_id", "1"), @@ -1511,7 +1511,7 @@ mod tests { .eq(0.0) .unwrap(); metrics - .has_metric_family("ingest_last_min_ts") + .has_metric_family("write_buffer_last_min_ts") .with_labels(&[ ("db_name", "placeholder"), ("svr_id", "1"), @@ -1521,7 +1521,7 @@ mod tests { .eq(20.0) .unwrap(); metrics - .has_metric_family("ingest_last_max_ts") + .has_metric_family("write_buffer_last_max_ts") .with_labels(&[ ("db_name", "placeholder"), ("svr_id", "1"), @@ -1578,7 +1578,7 @@ mod tests { // check: after a while the error should be reported in the database's metrics let t_0 = Instant::now(); loop { - let family = metrics.try_has_metric_family("ingest_write_buffer_requests_total"); + let family = metrics.try_has_metric_family("write_buffer_ingest_requests_total"); if let Ok(metric) = family { if metric From 55490c279a0e93f9199f7debf9b8aaf5f837cc57 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 21 Jul 2021 15:21:52 +0200 Subject: [PATCH 27/27] fix: Kafka watermark error for new partitions --- write_buffer/src/kafka.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/write_buffer/src/kafka.rs b/write_buffer/src/kafka.rs index c786f93066..9f971ef9ac 100644 --- a/write_buffer/src/kafka.rs +++ b/write_buffer/src/kafka.rs @@ -14,6 +14,7 @@ use rdkafka::{ consumer::{BaseConsumer, Consumer, StreamConsumer}, error::KafkaError, producer::{FutureProducer, FutureRecord}, + types::RDKafkaErrorCode, util::Timeout, ClientConfig, Message, Offset, TopicPartitionList, }; @@ -141,7 +142,7 @@ impl WriteBufferReading for KafkaBufferConsumer { let database_name = database_name.clone(); let fut = async move { - let (_low, high) = tokio::task::spawn_blocking(move || { + match tokio::task::spawn_blocking(move || { consumer_cloned.fetch_watermarks( &database_name, sequencer_id as i32, @@ -149,9 +150,12 @@ impl WriteBufferReading for KafkaBufferConsumer { ) }) .await - .expect("subtask failed")?; - - Ok(high as u64) + .expect("subtask failed") + { + Ok((_low, high)) => Ok(high as u64), + Err(KafkaError::MetadataFetch(RDKafkaErrorCode::UnknownPartition)) => Ok(0), + Err(e) => Err(Box::new(e) as Box), + } }; fut.boxed() as FetchHighWatermarkFut<'_>