Merge branch 'main' into ntran/dedup_compare_cols_order

pull/24376/head
kodiakhq[bot] 2021-07-21 15:42:30 +00:00 committed by GitHub
commit 18dd108ba6
49 changed files with 3323 additions and 2286 deletions

Cargo.lock generated

@ -769,9 +769,9 @@ dependencies = [
[[package]]
name = "crypto-mac"
version = "0.10.0"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4857fd85a0c34b3c3297875b747c1e02e06b6a0ea32dd892d8192b9ce0813ea6"
checksum = "bff07008ec701e8028e2ceb8f83f0e4274ee62bd2dbdc4fefff2e9a91824081a"
dependencies = [
"generic-array",
"subtle",
@ -826,6 +826,7 @@ dependencies = [
"influxdb_line_protocol",
"num_cpus",
"observability_deps",
"once_cell",
"percent-encoding",
"regex",
"serde",
@ -843,7 +844,7 @@ dependencies = [
[[package]]
name = "datafusion"
version = "4.0.0-SNAPSHOT"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=bd3ee23520a3e6f135891ec32d96fcea7ee2bb55#bd3ee23520a3e6f135891ec32d96fcea7ee2bb55"
source = "git+https://github.com/apache/arrow-datafusion.git?rev=30693df8961dca300306dfd0c8fca130375b50b3#30693df8961dca300306dfd0c8fca130375b50b3"
dependencies = [
"ahash 0.7.4",
"arrow",
@ -4330,9 +4331,9 @@ dependencies = [
[[package]]
name = "tinyvec"
version = "1.2.0"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b5220f05bb7de7f3f53c7c065e1199b3172696fe2db9f9c4d8ad9b4ee74c342"
checksum = "4ac2e1d4bd0f75279cfd5a076e0d578bbf02c22b7c39e766c437dd49b3ec43e0"
dependencies = [
"tinyvec_macros",
]
@ -4345,9 +4346,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
[[package]]
name = "tokio"
version = "1.8.1"
version = "1.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "98c8b05dc14c75ea83d63dd391100353789f5f24b8b3866542a5e85c8be8e985"
checksum = "c2602b8af3767c285202012822834005f596c811042315fa7e9f5b12b2a43207"
dependencies = [
"autocfg",
"bytes",
@ -4984,9 +4985,9 @@ checksum = "b07db065a5cf61a7e4ba64f29e67db906fb1787316516c4e6e5ff0fea1efcd8a"
[[package]]
name = "zeroize"
version = "1.4.0"
version = "1.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eeafe61337cb2c879d328b74aa6cd9d794592c82da6be559fdf11493f02a2d18"
checksum = "377db0846015f7ae377174787dd452e1c5f5a9050bc6f954911d01f116daa0cd"
[[package]]
name = "zstd"


@ -15,6 +15,7 @@ regex = "1.4"
serde = { version = "1.0", features = ["rc", "derive"] }
snafu = "0.6"
observability_deps = { path = "../observability_deps" }
once_cell = { version = "1.4.0", features = ["parking_lot"] }
[dev-dependencies] # In alphabetical order
test_helpers = { path = "../test_helpers" }


@ -166,6 +166,10 @@ pub struct LifecycleRules {
/// Maximum number of rows to buffer in a MUB chunk before compacting it
pub mub_row_threshold: NonZeroUsize,
/// Use up to this amount of space in bytes for caching Parquet files. None
/// will disable Parquet file caching.
pub parquet_cache_limit: Option<NonZeroU64>,
}
impl LifecycleRules {
@ -195,6 +199,7 @@ impl Default for LifecycleRules {
persist_age_threshold_seconds: NonZeroU32::new(DEFAULT_PERSIST_AGE_THRESHOLD_SECONDS)
.unwrap(),
mub_row_threshold: NonZeroUsize::new(DEFAULT_MUB_ROW_THRESHOLD).unwrap(),
parquet_cache_limit: None,
}
}
}
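
As a quick illustration of the new field (a minimal sketch, assuming the data_types::database_rules::LifecycleRules path and relying on the Default impl above for the remaining fields):

use std::num::NonZeroU64;
use data_types::database_rules::LifecycleRules;

fn main() {
    // Hypothetical configuration: cap the Parquet cache at 1 GiB; all other
    // lifecycle rules keep the defaults from `impl Default` above.
    let rules = LifecycleRules {
        parquet_cache_limit: NonZeroU64::new(1024 * 1024 * 1024),
        ..Default::default()
    };

    // `None` (the default) disables Parquet file caching.
    assert_eq!(rules.parquet_cache_limit.map(|v| v.get()), Some(1_073_741_824));
}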

data_types/src/instant.rs Normal file

@ -0,0 +1,53 @@
use chrono::{DateTime, Utc};
use once_cell::sync::OnceCell;
use std::time::Instant;
/// Stores an Instant and a DateTime<Utc> captured as close together as possible
static INSTANCE: OnceCell<(DateTime<Utc>, Instant)> = OnceCell::new();
/// Provides a conversion from Instant to DateTime<Utc> for display purposes
///
/// It is an approximation: if the system clock changes, the returned DateTime will not be
/// the same as the DateTime that would have been recorded at the time the Instant was created.
///
/// The conversion does, however, preserve the monotonic property of Instant, i.e. a larger
/// Instant will have a larger returned DateTime.
///
/// This should ONLY be used for display purposes; the results should not be used to
/// drive logic, nor persisted
pub fn to_approximate_datetime(instant: Instant) -> DateTime<Utc> {
let (ref_date, ref_instant) = *INSTANCE.get_or_init(|| (Utc::now(), Instant::now()));
if ref_instant > instant {
ref_date
- chrono::Duration::from_std(ref_instant.duration_since(instant))
.expect("date overflow")
} else {
ref_date
+ chrono::Duration::from_std(instant.duration_since(ref_instant))
.expect("date overflow")
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_to_datetime() {
// Seed global state
to_approximate_datetime(Instant::now());
let (ref_date, ref_instant) = *INSTANCE.get().unwrap();
assert_eq!(
to_approximate_datetime(ref_instant + std::time::Duration::from_nanos(78)),
ref_date + chrono::Duration::nanoseconds(78)
);
assert_eq!(
to_approximate_datetime(ref_instant - std::time::Duration::from_nanos(23)),
ref_date - chrono::Duration::nanoseconds(23)
);
}
}
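
A minimal usage sketch of the contract described above (assuming the data_types::instant path for the new module): the absolute wall-clock values are approximate, but ordering is preserved.

use std::time::{Duration, Instant};
use data_types::instant::to_approximate_datetime;

fn main() {
    // Display-only conversion: a later Instant always maps to a later (or equal) DateTime.
    let earlier = Instant::now();
    let later = earlier + Duration::from_millis(5);
    assert!(to_approximate_datetime(earlier) <= to_approximate_datetime(later));
}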


@ -13,12 +13,14 @@
pub mod chunk_metadata;
pub mod consistent_hasher;
mod database_name;
pub use database_name::*;
pub mod database_rules;
pub mod database_state;
pub mod error;
pub mod instant;
pub mod job;
pub mod names;
pub mod partition_metadata;
pub mod server_id;
pub mod timestamp;
pub mod write_summary;
pub use database_name::*;


@ -0,0 +1,20 @@
use chrono::{DateTime, Utc};
/// A description of a set of writes
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct WriteSummary {
/// The wall clock timestamp of the first write in this summary
pub time_of_first_write: DateTime<Utc>,
/// The wall clock timestamp of the last write in this summary
pub time_of_last_write: DateTime<Utc>,
/// The minimum row timestamp for data in this summary
pub min_timestamp: DateTime<Utc>,
/// The maximum row timestamp value for data in this summary
pub max_timestamp: DateTime<Utc>,
/// The number of rows in this summary
pub row_count: usize,
}


@ -9,4 +9,4 @@ description = "Re-exports datafusion at a specific version"
# Rename to work around doctest bug
# Turn off optional datafusion features (function packages)
upstream = { git = "https://github.com/apache/arrow-datafusion.git", rev="bd3ee23520a3e6f135891ec32d96fcea7ee2bb55", default-features = false, package = "datafusion" }
upstream = { git = "https://github.com/apache/arrow-datafusion.git", rev="30693df8961dca300306dfd0c8fca130375b50b3", default-features = false, package = "datafusion" }


@ -82,6 +82,10 @@ message LifecycleRules {
// If 0, compactions are limited to the default number.
// See data_types::database_rules::DEFAULT_MAX_ACTIVE_COMPACTIONS
uint32 max_active_compactions = 16;
// Use up to this amount of space in bytes for caching Parquet files.
// A value of 0 disables Parquet caching
uint64 parquet_cache_limit = 17;
}
message DatabaseRules {


@ -35,6 +35,10 @@ impl From<LifecycleRules> for management::LifecycleRules {
persist_row_threshold: config.persist_row_threshold.get() as u64,
persist_age_threshold_seconds: config.persist_age_threshold_seconds.get(),
mub_row_threshold: config.mub_row_threshold.get() as u64,
parquet_cache_limit: config
.parquet_cache_limit
.map(|v| v.get())
.unwrap_or_default(),
}
}
}
@ -69,6 +73,7 @@ impl TryFrom<management::LifecycleRules> for LifecycleRules {
.unwrap_or_else(|| NonZeroU32::new(DEFAULT_PERSIST_AGE_THRESHOLD_SECONDS).unwrap()),
mub_row_threshold: NonZeroUsize::new(proto.mub_row_threshold as usize)
.unwrap_or_else(|| NonZeroUsize::new(DEFAULT_MUB_ROW_THRESHOLD).unwrap()),
parquet_cache_limit: NonZeroU64::new(proto.parquet_cache_limit),
})
}
}
@ -93,6 +98,7 @@ mod tests {
persist_row_threshold: 57,
persist_age_threshold_seconds: 23,
mub_row_threshold: 3454,
parquet_cache_limit: 10,
};
let config: LifecycleRules = protobuf.clone().try_into().unwrap();
@ -125,6 +131,11 @@ mod tests {
protobuf.persist_age_threshold_seconds
);
assert_eq!(back.mub_row_threshold, protobuf.mub_row_threshold);
assert_eq!(
config.parquet_cache_limit.unwrap().get(),
protobuf.parquet_cache_limit
);
assert_eq!(back.parquet_cache_limit, protobuf.parquet_cache_limit);
}
#[test]

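The conversions above lean on 0 being the protobuf default: NonZeroU64::new maps 0 to None, and unwrap_or_default() maps None back to 0. A standard-library-only sketch of that round trip:

use std::num::NonZeroU64;

fn main() {
    // Protobuf -> Rust: 0 ("caching disabled") becomes None with no special case.
    assert_eq!(NonZeroU64::new(0), None);
    assert_eq!(NonZeroU64::new(10).map(|v| v.get()), Some(10));

    // Rust -> protobuf: None becomes 0 again.
    let disabled: Option<NonZeroU64> = None;
    assert_eq!(disabled.map(|v| v.get()).unwrap_or_default(), 0u64);
}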

@ -11,7 +11,7 @@ use arrow::datatypes::{
DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema,
SchemaRef as ArrowSchemaRef, TimeUnit,
};
use snafu::Snafu;
use snafu::{OptionExt, Snafu};
use crate::{
schema::sort::{ColumnSort, SortKey},
@ -395,11 +395,9 @@ impl Schema {
pub fn compute_select_indicies(&self, columns: &[&str]) -> Result<Vec<usize>> {
columns
.iter()
.map(|column_name| {
.map(|&column_name| {
self.find_index_of(column_name)
.ok_or_else(|| Error::ColumnNotFound {
column_name: column_name.to_string(),
})
.context(ColumnNotFound { column_name })
})
.collect()
}
@ -788,12 +786,12 @@ macro_rules! assert_column_eq {
#[cfg(test)]
mod test {
use arrow::compute::SortOptions;
use InfluxColumnType::*;
use InfluxFieldType::*;
use super::{builder::SchemaBuilder, *};
use crate::schema::merge::SchemaMerger;
use crate::schema::sort::SortOptions;
fn make_field(
name: &str,

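The compute_select_indicies change above swaps a manual ok_or_else for snafu's OptionExt::context. A self-contained sketch of that pattern with snafu 0.6 (the error type and lookup below are illustrative, not IOx's actual definitions):

use snafu::{OptionExt, Snafu};

#[derive(Debug, Snafu)]
enum Error {
    #[snafu(display("Column '{}' not found", column_name))]
    ColumnNotFound { column_name: String },
}

// OptionExt::context turns an Option into a Result, building the error from the
// generated context selector; the &str converts into the String field via Into.
fn find_index(columns: &[&str], column_name: &str) -> Result<usize, Error> {
    columns
        .iter()
        .position(|&c| c == column_name)
        .context(ColumnNotFound { column_name })
}

fn main() {
    let columns = ["time", "state", "city"];
    assert!(find_index(&columns, "state").is_ok());
    assert!(find_index(&columns, "area").is_err());
}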

@ -1,5 +1,6 @@
use std::{fmt::Display, str::FromStr};
use arrow::compute::SortOptions;
use indexmap::{map::Iter, IndexMap};
use itertools::Itertools;
use snafu::Snafu;
@ -23,24 +24,6 @@ pub enum Error {
pub type Result<T, E = Error> = std::result::Result<T, E>;
/// Temporary - <https://github.com/apache/arrow-rs/pull/425>
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub struct SortOptions {
/// Whether to sort in descending order
pub descending: bool,
/// Whether to sort nulls first
pub nulls_first: bool,
}
impl Default for SortOptions {
fn default() -> Self {
Self {
descending: false,
nulls_first: true,
}
}
}
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub struct ColumnSort {
/// Position of this column in the sort key


@ -1399,6 +1399,7 @@ mod tests {
let rules = LifecycleRules {
late_arrive_window_seconds: NonZeroU32::new(10).unwrap(),
persist_row_threshold: NonZeroUsize::new(1_000).unwrap(),
max_active_compactions: NonZeroU32::new(10).unwrap(),
..Default::default()
};
@ -1538,6 +1539,7 @@ mod tests {
persist_row_threshold: NonZeroUsize::new(1_000).unwrap(),
late_arrive_window_seconds: NonZeroU32::new(10).unwrap(),
persist_age_threshold_seconds: NonZeroU32::new(10).unwrap(),
max_active_compactions: NonZeroU32::new(10).unwrap(),
..Default::default()
};
let now = Instant::now();


@ -7,12 +7,13 @@ use std::{
use chrono::{DateTime, TimeZone, Utc};
use data_types::partition_metadata::PartitionAddr;
use data_types::{partition_metadata::PartitionAddr, write_summary::WriteSummary};
use entry::Sequence;
use internal_types::guard::{ReadGuard, ReadLock};
use crate::checkpoint::PartitionCheckpoint;
use crate::min_max_sequence::MinMaxSequence;
use data_types::instant::to_approximate_datetime;
const DEFAULT_CLOSED_WINDOW_PERIOD: Duration = Duration::from_secs(30);
@ -45,6 +46,9 @@ pub struct PersistenceWindows {
late_arrival_period: Duration,
closed_window_period: Duration,
/// The instant this PersistenceWindows was created
created_at: Instant,
/// The last instant passed to PersistenceWindows::add_range
last_instant: Instant,
@ -106,6 +110,8 @@ impl PersistenceWindows {
let closed_window_count = late_arrival_seconds / closed_window_seconds;
let created_at_instant = Instant::now();
Self {
persistable: ReadLock::new(None),
closed: VecDeque::with_capacity(closed_window_count as usize),
@ -113,11 +119,18 @@ impl PersistenceWindows {
addr,
late_arrival_period,
closed_window_period,
last_instant: Instant::now(),
created_at: created_at_instant,
last_instant: created_at_instant,
max_sequence_numbers: Default::default(),
}
}
/// Updates the late arrival period of this `PersistenceWindows` instance
pub fn set_late_arrival_period(&mut self, late_arrival_period: Duration) {
self.closed_window_period = late_arrival_period.min(DEFAULT_CLOSED_WINDOW_PERIOD);
self.late_arrival_period = late_arrival_period;
}
/// Updates the windows with the information from a batch of rows from a single sequencer
/// to the same partition. The min and max times are the times on the row data. The `received_at`
/// Instant is when the data was received. Taking it in this function is really just about
@ -165,7 +178,7 @@ impl PersistenceWindows {
self.rotate(received_at);
match self.open.as_mut() {
Some(w) => w.add_range(sequence, row_count, min_time, max_time),
Some(w) => w.add_range(sequence, row_count, min_time, max_time, received_at),
None => {
self.open = Some(Window::new(
received_at,
@ -335,6 +348,21 @@ impl PersistenceWindows {
self.windows().next()
}
/// Returns approximate summaries of the unpersisted writes recorded
/// by this PersistenceWindows instance
///
/// These are approximate because persistence may partially flush a window, which will
/// update the min row timestamp but not the row count
pub fn summaries(&self) -> impl Iterator<Item = WriteSummary> + '_ {
self.windows().map(move |window| WriteSummary {
time_of_first_write: to_approximate_datetime(window.created_at),
time_of_last_write: to_approximate_datetime(window.last_instant),
min_timestamp: window.min_time,
max_timestamp: window.max_time,
row_count: window.row_count,
})
}
/// Returns true if this PersistenceWindows instance is empty
pub fn is_empty(&self) -> bool {
self.minimum_window().is_none()
@ -374,9 +402,14 @@ struct Window {
/// The server time when this window was created. Used to determine how long data in this
/// window has been sitting in memory.
created_at: Instant,
/// The server time of the last write to this window
last_instant: Instant,
/// The number of rows in the window
row_count: usize,
min_time: DateTime<Utc>, // min time value for data in the window
max_time: DateTime<Utc>, // max time value for data in the window
/// min time value for data in the window
min_time: DateTime<Utc>,
/// max time value for data in the window
max_time: DateTime<Utc>,
/// maps sequencer_id to the minimum and maximum sequence numbers seen
sequencer_numbers: BTreeMap<u32, MinMaxSequence>,
}
@ -399,6 +432,7 @@ impl Window {
Self {
created_at,
last_instant: created_at,
row_count,
min_time,
max_time,
@ -414,7 +448,11 @@ impl Window {
row_count: usize,
min_time: DateTime<Utc>,
max_time: DateTime<Utc>,
instant: Instant,
) {
assert!(self.created_at <= instant);
self.last_instant = instant;
self.row_count += row_count;
if self.min_time > min_time {
self.min_time = min_time;
@ -440,6 +478,10 @@ impl Window {
/// Add one window to another. Used to collapse closed windows into persisted.
fn add_window(&mut self, other: Self) {
assert!(self.last_instant <= other.created_at);
assert!(self.last_instant <= other.last_instant);
self.last_instant = other.last_instant;
self.row_count += other.row_count;
if self.min_time > other.min_time {
self.min_time = other.min_time;
@ -1265,4 +1307,119 @@ mod tests {
assert_eq!(w.closed[1].max_time, start + chrono::Duration::seconds(2));
assert_eq!(w.closed[1].row_count, 11);
}
#[test]
fn test_summaries() {
let late_arrival_period = Duration::from_secs(100);
let mut w = make_windows(late_arrival_period);
let instant = w.created_at;
let created_at_time = to_approximate_datetime(w.created_at);
// Window 1
w.add_range(
Some(&Sequence { id: 1, number: 1 }),
11,
Utc.timestamp_nanos(10),
Utc.timestamp_nanos(11),
instant + Duration::from_millis(1),
);
w.add_range(
Some(&Sequence { id: 1, number: 2 }),
4,
Utc.timestamp_nanos(10),
Utc.timestamp_nanos(340),
instant + Duration::from_millis(30),
);
w.add_range(
Some(&Sequence { id: 1, number: 3 }),
6,
Utc.timestamp_nanos(1),
Utc.timestamp_nanos(5),
instant + Duration::from_millis(50),
);
// More than DEFAULT_CLOSED_WINDOW_PERIOD after start of Window 1 => Window 2
w.add_range(
Some(&Sequence { id: 1, number: 4 }),
3,
Utc.timestamp_nanos(89),
Utc.timestamp_nanos(90),
instant + DEFAULT_CLOSED_WINDOW_PERIOD + Duration::from_millis(1),
);
// More than DEFAULT_CLOSED_WINDOW_PERIOD after start of Window 2 => Window 3
w.add_range(
Some(&Sequence { id: 1, number: 5 }),
8,
Utc.timestamp_nanos(3),
Utc.timestamp_nanos(4),
instant + DEFAULT_CLOSED_WINDOW_PERIOD * 3,
);
let closed_duration = chrono::Duration::from_std(DEFAULT_CLOSED_WINDOW_PERIOD).unwrap();
let summaries: Vec<_> = w.summaries().collect();
assert_eq!(summaries.len(), 3);
assert_eq!(
summaries,
vec![
WriteSummary {
time_of_first_write: created_at_time + chrono::Duration::milliseconds(1),
time_of_last_write: created_at_time + chrono::Duration::milliseconds(50),
min_timestamp: Utc.timestamp_nanos(1),
max_timestamp: Utc.timestamp_nanos(340),
row_count: 21
},
WriteSummary {
time_of_first_write: created_at_time
+ closed_duration
+ chrono::Duration::milliseconds(1),
time_of_last_write: created_at_time
+ closed_duration
+ chrono::Duration::milliseconds(1),
min_timestamp: Utc.timestamp_nanos(89),
max_timestamp: Utc.timestamp_nanos(90),
row_count: 3
},
WriteSummary {
time_of_first_write: created_at_time + closed_duration * 3,
time_of_last_write: created_at_time + closed_duration * 3,
min_timestamp: Utc.timestamp_nanos(3),
max_timestamp: Utc.timestamp_nanos(4),
row_count: 8
},
]
);
// Rotate first and second windows into persistable
w.rotate(instant + late_arrival_period + DEFAULT_CLOSED_WINDOW_PERIOD * 2);
let summaries: Vec<_> = w.summaries().collect();
assert_eq!(summaries.len(), 2);
assert_eq!(
summaries,
vec![
WriteSummary {
time_of_first_write: created_at_time + chrono::Duration::milliseconds(1),
time_of_last_write: created_at_time
+ closed_duration
+ chrono::Duration::milliseconds(1),
min_timestamp: Utc.timestamp_nanos(1),
max_timestamp: Utc.timestamp_nanos(340),
row_count: 24
},
WriteSummary {
time_of_first_write: created_at_time + closed_duration * 3,
time_of_last_write: created_at_time + closed_duration * 3,
min_timestamp: Utc.timestamp_nanos(3),
max_timestamp: Utc.timestamp_nanos(4),
row_count: 8
},
]
);
}
}
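
For consumers of the new summaries() API, a hypothetical logging helper (it assumes only the WriteSummary fields added in this PR; callers would pass windows.summaries()):

use data_types::write_summary::WriteSummary;

// Prints one line per unpersisted window, using the approximate write summaries.
fn log_unpersisted_writes(summaries: impl Iterator<Item = WriteSummary>) {
    for s in summaries {
        println!(
            "{} rows, row times [{}, {}], written between {} and {}",
            s.row_count, s.min_timestamp, s.max_timestamp, s.time_of_first_write, s.time_of_last_write,
        );
    }
}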


@ -39,6 +39,7 @@ use crate::plan::{
};
use self::{
context::IOxExecutionConfig,
split::StreamSplitNode,
task::{DedicatedExecutor, Error as ExecutorError},
};
@ -111,6 +112,9 @@ pub struct Executor {
/// Executor for running system/reorganization tasks such as
/// compact
reorg_exec: DedicatedExecutor,
/// The default configuration options with which to create contexts
config: IOxExecutionConfig,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
@ -128,12 +132,25 @@ impl Executor {
let query_exec = DedicatedExecutor::new("IOx Query Executor Thread", num_threads);
let reorg_exec = DedicatedExecutor::new("IOx Reorg Executor Thread", num_threads);
let config = IOxExecutionConfig::new();
Self {
query_exec,
reorg_exec,
config,
}
}
/// returns the config of this executor
pub fn config(&self) -> &IOxExecutionConfig {
&self.config
}
/// returns a mutable reference to this executor's config
pub fn config_mut(&mut self) -> &mut IOxExecutionConfig {
&mut self.config
}
/// Executes this plan on the query pool, and returns the
/// resulting set of strings
pub async fn to_string_set(&self, plan: StringSetPlan) -> Result<StringSetRef> {
@ -289,7 +306,7 @@ impl Executor {
pub fn new_context(&self, executor_type: ExecutorType) -> IOxExecutionContext {
let executor = self.executor(executor_type).clone();
IOxExecutionContext::new(executor)
IOxExecutionContext::new(executor, self.config.clone())
}
/// Return the execution pool of the specified type

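The new config plumbing lets callers tune DataFusion settings before any contexts are created. A minimal sketch, assuming the query::exec paths and an Executor::new(num_threads) constructor as suggested by the diff above:

use query::exec::{Executor, ExecutorType};

fn main() {
    // Assumed constructor; the diff shows DedicatedExecutor::new(name, num_threads)
    // being built inside Executor::new.
    let mut executor = Executor::new(4);

    // Tune DataFusion concurrency via the new mutable config accessor.
    executor.config_mut().set_concurrency(4);

    // Contexts created afterwards pick up the configured options.
    let _ctx = executor.new_context(ExecutorType::Query);
}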

@ -5,6 +5,7 @@ use std::{fmt, sync::Arc};
use arrow::record_batch::RecordBatch;
use datafusion::{
catalog::catalog::CatalogProvider,
execution::context::{ExecutionContextState, QueryPlanner},
logical_plan::{LogicalPlan, UserDefinedLogicalNode},
physical_plan::{
@ -105,6 +106,46 @@ impl ExtensionPlanner for IOxExtensionPlanner {
}
}
/// Configuration for an IOx execution context
#[derive(Clone)]
pub struct IOxExecutionConfig {
/// Configuration options to pass to DataFusion
inner: ExecutionConfig,
}
impl Default for IOxExecutionConfig {
fn default() -> Self {
const BATCH_SIZE: usize = 1000;
// Setup default configuration
let inner = ExecutionConfig::new()
.with_batch_size(BATCH_SIZE)
.create_default_catalog_and_schema(true)
.with_information_schema(true)
.with_default_catalog_and_schema(DEFAULT_CATALOG, DEFAULT_SCHEMA)
.with_query_planner(Arc::new(IOxQueryPlanner {}));
Self { inner }
}
}
impl fmt::Debug for IOxExecutionConfig {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "IOxExecutionConfig ...")
}
}
impl IOxExecutionConfig {
pub fn new() -> Self {
Default::default()
}
/// Set execution concurrency
pub fn set_concurrency(&mut self, concurrency: usize) {
self.inner.concurrency = concurrency;
}
}
/// This is an execution context for planning in IOx. It wraps a
/// DataFusion execution context with the information needed for planning.
///
@ -136,21 +177,8 @@ impl fmt::Debug for IOxExecutionContext {
impl IOxExecutionContext {
/// Create an ExecutionContext suitable for executing DataFusion plans
///
/// The config is created with a default catalog and schema, but this
/// can be overridden at a later date
pub fn new(exec: DedicatedExecutor) -> Self {
const BATCH_SIZE: usize = 1000;
// TBD: Should we be reusing an execution context across all executions?
let config = ExecutionConfig::new()
.with_batch_size(BATCH_SIZE)
.create_default_catalog_and_schema(true)
.with_information_schema(true)
.with_default_catalog_and_schema(DEFAULT_CATALOG, DEFAULT_SCHEMA)
.with_query_planner(Arc::new(IOxQueryPlanner {}));
let inner = ExecutionContext::with_config(config);
pub fn new(exec: DedicatedExecutor, config: IOxExecutionConfig) -> Self {
let inner = ExecutionContext::with_config(config.inner);
Self { inner, exec }
}
@ -160,11 +188,13 @@ impl IOxExecutionContext {
&self.inner
}
/// returns a mutable reference to the inner datafusion execution context
pub fn inner_mut(&mut self) -> &mut ExecutionContext {
&mut self.inner
/// registers a catalog with the inner context
pub fn register_catalog(&mut self, name: impl Into<String>, catalog: Arc<dyn CatalogProvider>) {
self.inner.register_catalog(name, catalog);
}
///
/// Prepare a SQL statement for execution. This assumes that any
/// tables referenced in the SQL have been registered with this context
pub fn prepare_sql(&mut self, sql: &str) -> Result<Arc<dyn ExecutionPlan>> {


@ -268,8 +268,9 @@ struct ScanPlan<C: QueryChunk + 'static> {
#[cfg(test)]
mod test {
use arrow::compute::SortOptions;
use arrow_util::assert_batches_eq;
use internal_types::schema::{merge::SchemaMerger, sort::SortOptions};
use internal_types::schema::merge::SchemaMerger;
use crate::{
exec::{Executor, ExecutorType},


@ -87,7 +87,7 @@ impl SqlQueryPlanner {
executor: &Executor,
) -> Result<Arc<dyn ExecutionPlan>> {
let mut ctx = executor.new_context(ExecutorType::Query);
ctx.inner_mut().register_catalog(DEFAULT_CATALOG, database);
ctx.register_catalog(DEFAULT_CATALOG, database);
ctx.prepare_sql(query).context(Preparing)
}
}


@ -366,21 +366,12 @@ impl RecordBatchDeduplicator {
}
/// Create a new record batch from offset --> len
///
/// <https://github.com/apache/arrow-rs/issues/460> for adding this upstream
fn slice_record_batch(
batch: &RecordBatch,
offset: usize,
len: usize,
) -> ArrowResult<RecordBatch> {
let schema = batch.schema();
let new_columns: Vec<_> = batch
.columns()
.iter()
.map(|old_column| old_column.slice(offset, len))
.collect();
let batch = RecordBatch::try_new(schema, new_columns)?;
let batch = batch.slice(offset, len);
// At time of writing, `concat_batches` concatenates the
// contents of dictionaries as well; Do a post pass to remove the

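The deduplicator change above replaces hand-rolled per-column slicing with arrow's built-in RecordBatch::slice, which returns a zero-copy view over the same buffers. A standalone sketch of that API:

use std::sync::Arc;
use arrow::array::{ArrayRef, Int64Array};
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;

fn main() {
    let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int64, false)]));
    let column: ArrayRef = Arc::new(Int64Array::from(vec![1, 2, 3, 4, 5]));
    let batch = RecordBatch::try_new(schema, vec![column]).unwrap();

    // Zero-copy slice of rows 1..4 (offset = 1, length = 3).
    let sliced = batch.slice(1, 3);
    assert_eq!(sliced.num_rows(), 3);
}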

@ -1,25 +1,27 @@
-- Test Setup: OneMeasurementAllChunksDropped
-- SQL: SELECT * from information_schema.tables;
+---------------+--------------------+---------------+------------+
| table_catalog | table_schema | table_name | table_type |
+---------------+--------------------+---------------+------------+
| public | iox | h2o | BASE TABLE |
| public | system | chunks | BASE TABLE |
| public | system | columns | BASE TABLE |
| public | system | chunk_columns | BASE TABLE |
| public | system | operations | BASE TABLE |
| public | information_schema | tables | VIEW |
| public | information_schema | columns | VIEW |
+---------------+--------------------+---------------+------------+
+---------------+--------------------+---------------------+------------+
| table_catalog | table_schema | table_name | table_type |
+---------------+--------------------+---------------------+------------+
| public | iox | h2o | BASE TABLE |
| public | system | chunks | BASE TABLE |
| public | system | columns | BASE TABLE |
| public | system | chunk_columns | BASE TABLE |
| public | system | operations | BASE TABLE |
| public | system | persistence_windows | BASE TABLE |
| public | information_schema | tables | VIEW |
| public | information_schema | columns | VIEW |
+---------------+--------------------+---------------------+------------+
-- SQL: SHOW TABLES;
+---------------+--------------------+---------------+------------+
| table_catalog | table_schema | table_name | table_type |
+---------------+--------------------+---------------+------------+
| public | iox | h2o | BASE TABLE |
| public | system | chunks | BASE TABLE |
| public | system | columns | BASE TABLE |
| public | system | chunk_columns | BASE TABLE |
| public | system | operations | BASE TABLE |
| public | information_schema | tables | VIEW |
| public | information_schema | columns | VIEW |
+---------------+--------------------+---------------+------------+
+---------------+--------------------+---------------------+------------+
| table_catalog | table_schema | table_name | table_type |
+---------------+--------------------+---------------------+------------+
| public | iox | h2o | BASE TABLE |
| public | system | chunks | BASE TABLE |
| public | system | columns | BASE TABLE |
| public | system | chunk_columns | BASE TABLE |
| public | system | operations | BASE TABLE |
| public | system | persistence_windows | BASE TABLE |
| public | information_schema | tables | VIEW |
| public | information_schema | columns | VIEW |
+---------------+--------------------+---------------------+------------+


@ -1,86 +1,87 @@
-- Test Setup: OneMeasurementThreeChunksWithDuplicates
-- SQL: explain verbose select time, state, city, min_temp, max_temp, area from h2o order by time, state, city;
+-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Sort: #h2o.time ASC NULLS FIRST, #h2o.state ASC NULLS FIRST, #h2o.city ASC NULLS FIRST |
| | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area |
| | TableScan: h2o projection=None |
| logical_plan after projection_push_down | Sort: #h2o.time ASC NULLS FIRST, #h2o.state ASC NULLS FIRST, #h2o.city ASC NULLS FIRST |
| | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area |
| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) |
| logical_plan after simplify_expressions | Sort: #h2o.time ASC NULLS FIRST, #h2o.state ASC NULLS FIRST, #h2o.city ASC NULLS FIRST |
| | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area |
| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) |
| physical_plan | SortExec: [time@0 ASC,state@1 ASC,city@2 ASC] |
| | ProjectionExec: expr=[time@5 as time, state@4 as state, city@1 as city, min_temp@3 as min_temp, max_temp@2 as max_temp, area@0 as area] |
| | ExecutionPlan(PlaceHolder) |
| | DeduplicateExec: [state@4 ASC,city@1 ASC,time@5 ASC] |
| | SortPreservingMergeExec: [state@4 ASC,city@1 ASC,time@5 ASC] |
| | ExecutionPlan(PlaceHolder) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: explain verbose select time, state, city, min_temp, max_temp, area from h2o;
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area |
| | TableScan: h2o projection=None |
| logical_plan after projection_push_down | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area |
| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) |
| logical_plan after simplify_expressions | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area |
| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) |
| physical_plan | ProjectionExec: expr=[time@5 as time, state@4 as state, city@1 as city, min_temp@3 as min_temp, max_temp@2 as max_temp, area@0 as area] |
| | ExecutionPlan(PlaceHolder) |
| | DeduplicateExec: [state@4 ASC,city@1 ASC,time@5 ASC] |
| | SortPreservingMergeExec: [state@4 ASC,city@1 ASC,time@5 ASC] |
| | ExecutionPlan(PlaceHolder) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE select state as name from h2o UNION ALL select city as name from h2o;
+-----------------------------------------+-------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+-------------------------------------------------------------------------------+
| logical_plan | Union |
| | Projection: #h2o.state AS name |
| | TableScan: h2o projection=None |
| | Projection: #h2o.city AS name |
| | TableScan: h2o projection=None |
| logical_plan after projection_push_down | Union |
| | Projection: #h2o.state AS name |
| | TableScan: h2o projection=Some([4]) |
| | Projection: #h2o.city AS name |
| | TableScan: h2o projection=Some([1]) |
| logical_plan after simplify_expressions | Union |
| | Projection: #h2o.state AS name |
| | TableScan: h2o projection=Some([4]) |
| | Projection: #h2o.city AS name |
| | TableScan: h2o projection=Some([1]) |
| physical_plan | ExecutionPlan(PlaceHolder) |
| | ProjectionExec: expr=[state@0 as name] |
| | ExecutionPlan(PlaceHolder) |
| | ProjectionExec: expr=[state@1 as state] |
| | DeduplicateExec: [state@1 ASC,city@0 ASC,time@2 ASC] |
| | SortPreservingMergeExec: [state@1 ASC,city@0 ASC,time@2 ASC] |
| | ExecutionPlan(PlaceHolder) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | ProjectionExec: expr=[city@0 as name] |
| | ExecutionPlan(PlaceHolder) |
| | ProjectionExec: expr=[city@0 as city] |
| | DeduplicateExec: [state@1 ASC,city@0 ASC,time@2 ASC] |
| | SortPreservingMergeExec: [state@1 ASC,city@0 ASC,time@2 ASC] |
| | ExecutionPlan(PlaceHolder) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+-----------------------------------------+-------------------------------------------------------------------------------+
-- SQL: explain select time, state, city, min_temp, max_temp, area from h2o order by time, state, city;
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Sort: #h2o.time ASC NULLS FIRST, #h2o.state ASC NULLS FIRST, #h2o.city ASC NULLS FIRST |
| | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area |
| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) |
| physical_plan | SortExec: [time@0 ASC,state@1 ASC,city@2 ASC] |
| | CoalescePartitionsExec |
| | ProjectionExec: expr=[time@5 as time, state@4 as state, city@1 as city, min_temp@3 as min_temp, max_temp@2 as max_temp, area@0 as area] |
| | ExecutionPlan(PlaceHolder) |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | DeduplicateExec: [state@4 ASC,city@1 ASC,time@5 ASC] |
| | SortPreservingMergeExec: [state@4 ASC,city@1 ASC,time@5 ASC] |
| | ExecutionPlan(PlaceHolder) |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN select time, state, city, min_temp, max_temp, area from h2o;
+---------------+-----------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+-----------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area |
| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) |
| physical_plan | ProjectionExec: expr=[time@5 as time, state@4 as state, city@1 as city, min_temp@3 as min_temp, max_temp@2 as max_temp, area@0 as area] |
| | ExecutionPlan(PlaceHolder) |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | DeduplicateExec: [state@4 ASC,city@1 ASC,time@5 ASC] |
| | SortPreservingMergeExec: [state@4 ASC,city@1 ASC,time@5 ASC] |
| | ExecutionPlan(PlaceHolder) |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+---------------+-----------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN select state as name from h2o UNION ALL select city as name from h2o;
+---------------+-----------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+-----------------------------------------------------------------------------------+
| logical_plan | Union |
| | Projection: #h2o.state AS name |
| | TableScan: h2o projection=Some([4]) |
| | Projection: #h2o.city AS name |
| | TableScan: h2o projection=Some([1]) |
| physical_plan | ExecutionPlan(PlaceHolder) |
| | ProjectionExec: expr=[state@0 as name] |
| | ExecutionPlan(PlaceHolder) |
| | ProjectionExec: expr=[state@1 as state] |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | DeduplicateExec: [state@1 ASC,city@0 ASC,time@2 ASC] |
| | SortPreservingMergeExec: [state@1 ASC,city@0 ASC,time@2 ASC] |
| | ExecutionPlan(PlaceHolder) |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | ProjectionExec: expr=[city@0 as name] |
| | ExecutionPlan(PlaceHolder) |
| | ProjectionExec: expr=[city@0 as city] |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | DeduplicateExec: [state@1 ASC,city@0 ASC,time@2 ASC] |
| | SortPreservingMergeExec: [state@1 ASC,city@0 ASC,time@2 ASC] |
| | ExecutionPlan(PlaceHolder) |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+---------------+-----------------------------------------------------------------------------------+


@ -2,11 +2,11 @@
-- IOX_SETUP: OneMeasurementThreeChunksWithDuplicates
-- Plan with order by
explain verbose select time, state, city, min_temp, max_temp, area from h2o order by time, state, city;
explain select time, state, city, min_temp, max_temp, area from h2o order by time, state, city;
-- plan without order by
explain verbose select time, state, city, min_temp, max_temp, area from h2o;
EXPLAIN select time, state, city, min_temp, max_temp, area from h2o;
-- Union plan
EXPLAIN VERBOSE select state as name from h2o UNION ALL select city as name from h2o;
EXPLAIN select state as name from h2o UNION ALL select city as name from h2o;


@ -1,218 +1,167 @@
-- Test Setup: TwoMeasurementsPredicatePushDown
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant;
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200;
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | FilterExec: CAST(count@0 AS Int64) > 200 |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200.0;
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Float64(200) |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Float64(200) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Float64(200) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | FilterExec: CAST(count@0 AS Float64) > 200 |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 4.0;
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(4) |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(4) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(4) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | FilterExec: system@1 > 4 |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury';
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence');
+-----------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000;
+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence AND CAST(count@0 AS Int64) < 40000 |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and count < 40000;
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(count@0 AS Int64) < 40000 |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 4.0 and system < 7.0;
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | FilterExec: system@1 > 4 AND system@1 < 7 |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and system < 7.0;
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | FilterExec: system@1 > 5 AND system@1 < 7 |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system;
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | FilterExec: system@1 > 5 AND CAST(town@3 AS Utf8) != tewsbury AND 7 > system@1 |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading');
+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | FilterExec: system@1 > 5 AND tewsbury != CAST(town@3 AS Utf8) AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00');
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt totimestamp(Utf8("1970-01-01T00:00:00.000000130+00:00")) |
| | TableScan: restaurant projection=None |
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt totimestamp(Utf8("1970-01-01T00:00:00.000000130+00:00")) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt totimestamp(Utf8("1970-01-01T00:00:00.000000130+00:00")) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | FilterExec: 5 < system@1 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading AND time@2 > totimestamp(1970-01-01T00:00:00.000000130+00:00) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant;
+---------------+---------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
+---------------+---------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant where count > 200;
+---------------+--------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+--------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200)] |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | CoalesceBatchesExec: target_batch_size=500 |
| | FilterExec: CAST(count@0 AS Int64) > 200 |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200)] |
+---------------+--------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant where count > 200.0;
+---------------+----------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+----------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Float64(200) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Float64(200)] |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | CoalesceBatchesExec: target_batch_size=500 |
| | FilterExec: CAST(count@0 AS Float64) > 200 |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Float64(200)] |
+---------------+----------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant where system > 4.0;
+---------------+---------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(4) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(4)] |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | CoalesceBatchesExec: target_batch_size=500 |
| | FilterExec: system@1 > 4 |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(4)] |
+---------------+---------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury';
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200), #restaurant.town NotEq Utf8("tewsbury")] |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | CoalesceBatchesExec: target_batch_size=500 |
| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200), #town NotEq Utf8("tewsbury")] |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence');
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200), #restaurant.town NotEq Utf8("tewsbury"), #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence")] |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | CoalesceBatchesExec: target_batch_size=500 |
| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200), #town NotEq Utf8("tewsbury")] |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000;
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200), #restaurant.town NotEq Utf8("tewsbury"), #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence"), #restaurant.count Lt Int64(40000)] |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | CoalesceBatchesExec: target_batch_size=500 |
| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence AND CAST(count@0 AS Int64) < 40000 |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200), #town NotEq Utf8("tewsbury"), #count Lt Int64(40000)] |
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant where count > 200 and count < 40000;
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200), #restaurant.count Lt Int64(40000)] |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | CoalesceBatchesExec: target_batch_size=500 |
| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(count@0 AS Int64) < 40000 |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200), #count Lt Int64(40000)] |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant where system > 4.0 and system < 7.0;
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(4), #restaurant.system Lt Float64(7)] |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | CoalesceBatchesExec: target_batch_size=500 |
| | FilterExec: system@1 > 4 AND system@1 < 7 |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(4), #system Lt Float64(7)] |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant where system > 5.0 and system < 7.0;
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(5), #restaurant.system Lt Float64(7)] |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | CoalesceBatchesExec: target_batch_size=500 |
| | FilterExec: system@1 > 5 AND system@1 < 7 |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(5), #system Lt Float64(7)] |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system;
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(5), #restaurant.town NotEq Utf8("tewsbury"), Float64(7) Gt #restaurant.system] |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | CoalesceBatchesExec: target_batch_size=500 |
| | FilterExec: system@1 > 5 AND CAST(town@3 AS Utf8) != tewsbury AND 7 > system@1 |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(5), #town NotEq Utf8("tewsbury"), Float64(7) Gt #system] |
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading');
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(5), Utf8("tewsbury") NotEq #restaurant.town, #restaurant.system Lt Float64(7), #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading")] |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | CoalesceBatchesExec: target_batch_size=500 |
| | FilterExec: system@1 > 5 AND tewsbury != CAST(town@3 AS Utf8) AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(5), Utf8("tewsbury") NotEq #town, #system Lt Float64(7)] |
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQL: EXPLAIN SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00');
+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt TimestampNanosecond(130) |
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[Float64(5) Lt #restaurant.system, #restaurant.town NotEq Utf8("tewsbury"), #restaurant.system Lt Float64(7), #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading"), #restaurant.time Gt TimestampNanosecond(130)] |
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
| | CoalesceBatchesExec: target_batch_size=500 |
| | FilterExec: 5 < system@1 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading AND time@2 > 130 |
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
| | IOxReadFilterNode: table_name=restaurant, chunks=0 predicate=Predicate exprs: [Float64(5) Lt #system, #town NotEq Utf8("tewsbury"), #system Lt Float64(7), #time Gt TimestampNanosecond(130)] |
+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

View File

@ -2,44 +2,44 @@
-- IOX_SETUP: TwoMeasurementsPredicatePushDown
-- Test 1: Select everything
EXPLAIN VERBOSE SELECT * from restaurant;
EXPLAIN SELECT * from restaurant;
-- Test 2: One push-down expression: count > 200
-- TODO: Make push-down predicates show up in EXPLAIN VERBOSE. Ticket #1538
EXPLAIN VERBOSE SELECT * from restaurant where count > 200;
EXPLAIN SELECT * from restaurant where count > 200;
-- Test 2.2: One push-down expression: count > 200.0
EXPLAIN VERBOSE SELECT * from restaurant where count > 200.0;
EXPLAIN SELECT * from restaurant where count > 200.0;
-- Test 2.3: One push-down expression: system > 4.0
EXPLAIN VERBOSE SELECT * from restaurant where system > 4.0;
EXPLAIN SELECT * from restaurant where system > 4.0;
-- Test 3: Two push-down expressions: count > 200 and town != 'tewsbury'
EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury';
EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury';
-- Test 4: Still two push-down expressions: count > 200 and town != 'tewsbury'
-- even though the results are different
EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence');
EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence');
-- Test 5: three push-down expressions: count > 200 and town != 'tewsbury' and count < 40000
EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000;
EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000;
-- Test 6: two push-down expressions: count > 200 and count < 40000
EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and count < 40000;
EXPLAIN SELECT * from restaurant where count > 200 and count < 40000;
-- Test 7: two push-down expressions on float: system > 4.0 and system < 7.0
EXPLAIN VERBOSE SELECT * from restaurant where system > 4.0 and system < 7.0;
EXPLAIN SELECT * from restaurant where system > 4.0 and system < 7.0;
-- Test 8: two push-down expressions on float: system > 5.0 and system < 7.0
EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and system < 7.0;
EXPLAIN SELECT * from restaurant where system > 5.0 and system < 7.0;
-- Test 9: three push-down expressions: system > 5.0 and town != 'tewsbury' and system < 7.0
EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system;
EXPLAIN SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system;
-- Test 10: three push-down expressions: system > 5.0 and 'tewsbury' != town and system < 7.0
EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading');
EXPLAIN SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading');
-- Test 11: four push-down expressions: system > 5.0 and town != 'tewsbury' and system < 7.0 and
-- time > to_timestamp('1970-01-01T00:00:00.000000130+00:00') rewritten to time GT INT(130)
EXPLAIN VERBOSE SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00');
EXPLAIN SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00');

View File

@ -4,12 +4,16 @@ mod parse;
mod setup;
use arrow::record_batch::RecordBatch;
use query::{exec::ExecutorType, frontend::sql::SqlQueryPlanner};
use query::{
exec::{Executor, ExecutorType},
frontend::sql::SqlQueryPlanner,
};
use snafu::{OptionExt, ResultExt, Snafu};
use std::{
io::LineWriter,
io::Write,
path::{Path, PathBuf},
sync::Arc,
};
use self::{parse::TestQueries, setup::TestSetup};
@ -261,7 +265,13 @@ impl<W: Write> Runner<W> {
writeln!(self.log, "Running scenario '{}'", scenario_name)?;
writeln!(self.log, "SQL: '{:#?}'", sql)?;
let planner = SqlQueryPlanner::default();
let executor = db.executor();
let num_threads = 1;
let mut executor = Executor::new(num_threads);
// hardcode concurrency in tests as by default it is the
// number of cores, which varies across machines
executor.config_mut().set_concurrency(4);
let executor = Arc::new(executor);
let physical_plan = planner
.query(db, &sql, executor.as_ref())

View File

@ -184,18 +184,19 @@ async fn sql_select_from_information_schema_tables() {
// validate we have access to information schema for listing table
// names
let expected = vec![
"+---------------+--------------------+---------------+------------+",
"| table_catalog | table_schema | table_name | table_type |",
"+---------------+--------------------+---------------+------------+",
"| public | information_schema | columns | VIEW |",
"| public | information_schema | tables | VIEW |",
"| public | iox | h2o | BASE TABLE |",
"| public | iox | o2 | BASE TABLE |",
"| public | system | chunk_columns | BASE TABLE |",
"| public | system | chunks | BASE TABLE |",
"| public | system | columns | BASE TABLE |",
"| public | system | operations | BASE TABLE |",
"+---------------+--------------------+---------------+------------+",
"+---------------+--------------------+---------------------+------------+",
"| table_catalog | table_schema | table_name | table_type |",
"+---------------+--------------------+---------------------+------------+",
"| public | information_schema | columns | VIEW |",
"| public | information_schema | tables | VIEW |",
"| public | iox | h2o | BASE TABLE |",
"| public | iox | o2 | BASE TABLE |",
"| public | system | chunk_columns | BASE TABLE |",
"| public | system | chunks | BASE TABLE |",
"| public | system | columns | BASE TABLE |",
"| public | system | operations | BASE TABLE |",
"| public | system | persistence_windows | BASE TABLE |",
"+---------------+--------------------+---------------------+------------+",
];
run_sql_test_case!(
TwoMeasurementsManyFields {},

View File

@ -8,7 +8,7 @@ use data_types::{
DatabaseName,
};
use metrics::MetricRegistry;
use object_store::{path::ObjectStorePath, ObjectStore};
use object_store::{path::ObjectStorePath, ObjectStore, ObjectStoreApi};
use parquet_file::catalog::PreservedCatalog;
use query::exec::Executor;
use write_buffer::config::WriteBufferConfig;
@ -16,9 +16,13 @@ use write_buffer::config::WriteBufferConfig;
/// This module contains code for managing the configuration of the server.
use crate::{
db::{catalog::Catalog, DatabaseToCommit, Db},
Error, JobRegistry, Result,
DatabaseAlreadyExists, DatabaseNotFound, DatabaseReserved, Error,
InvalidDatabaseStateTransition, JobRegistry, Result, RulesDatabaseNameMismatch,
ServerShuttingDown,
};
use object_store::path::Path;
use observability_deps::tracing::{self, error, info, warn, Instrument};
use snafu::{ensure, OptionExt};
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
@ -34,10 +38,14 @@ pub(crate) const DB_RULES_FILE_NAME: &str = "rules.pb";
/// run to completion if the tokio runtime is dropped
#[derive(Debug)]
pub(crate) struct Config {
shutdown: CancellationToken,
jobs: Arc<JobRegistry>,
state: RwLock<ConfigState>,
object_store: Arc<ObjectStore>,
exec: Arc<Executor>,
server_id: ServerId,
metric_registry: Arc<MetricRegistry>,
shutdown: CancellationToken,
state: RwLock<ConfigState>,
}
pub(crate) enum UpdateError<E> {
@ -55,14 +63,20 @@ impl Config {
/// Create new empty config.
pub(crate) fn new(
jobs: Arc<JobRegistry>,
object_store: Arc<ObjectStore>,
exec: Arc<Executor>,
server_id: ServerId,
metric_registry: Arc<MetricRegistry>,
remote_template: Option<RemoteTemplate>,
) -> Self {
Self {
jobs,
object_store,
exec,
server_id,
metric_registry,
shutdown: Default::default(),
state: RwLock::new(ConfigState::new(remote_template)),
jobs,
metric_registry,
}
}
@ -77,33 +91,20 @@ impl Config {
/// This only works if the database is not yet known. To recover a database out of an uninitialized state, see
/// [`recover_db`](Self::recover_db). To do maintenance work on data linked to the database (e.g. the catalog)
/// without initializing it, see [`block_db`](Self::block_db).
pub(crate) fn create_db(
&self,
object_store: Arc<ObjectStore>,
exec: Arc<Executor>,
server_id: ServerId,
db_name: DatabaseName<'static>,
) -> Result<DatabaseHandle<'_>> {
pub(crate) fn create_db(&self, db_name: DatabaseName<'static>) -> Result<DatabaseHandle<'_>> {
let mut state = self.state.write().expect("mutex poisoned");
if state.reservations.contains(&db_name) {
return Err(Error::DatabaseReserved {
db_name: db_name.to_string(),
});
}
if state.databases.contains_key(&db_name) {
return Err(Error::DatabaseAlreadyExists {
db_name: db_name.to_string(),
});
}
ensure!(
!state.reservations.contains(&db_name),
DatabaseReserved { db_name }
);
ensure!(
!state.databases.contains_key(&db_name),
DatabaseAlreadyExists { db_name }
);
state.reservations.insert(db_name.clone());
Ok(DatabaseHandle {
state: Some(Arc::new(DatabaseState::Known {
object_store,
exec,
server_id,
db_name,
})),
state: Some(Arc::new(DatabaseState::Known { db_name })),
config: &self,
})
}
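The rewrites in this file lean on snafu's generated context selectors (`DatabaseReserved { db_name }`, `DatabaseNotFound { .. }`, and so on) together with `ensure!`, `context`, and `.fail()`. A minimal sketch of that pattern, assuming snafu 0.6-style derives; the names and function below are illustrative and not part of the diff:

use snafu::{ensure, Snafu};

#[derive(Debug, Snafu)]
enum Error {
    #[snafu(display("database ({}) already reserved", db_name))]
    DatabaseReserved { db_name: String },
}

fn reserve(already_reserved: bool, db_name: &str) -> Result<(), Error> {
    // Equivalent to the removed manual pattern:
    //     if already_reserved { return Err(Error::DatabaseReserved { db_name: db_name.to_string() }); }
    ensure!(!already_reserved, DatabaseReserved { db_name });
    Ok(())
}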
@ -115,32 +116,27 @@ impl Config {
/// While the handle is held, no other operations for the given database can be executed.
///
/// This only works if the database is known but is uninitialized. To create a new database that is not yet known,
/// see [`create_db`](Self::create_db). To do maintainance work on data linked to the database (e.g. the catalog)
/// see [`create_db`](Self::create_db). To do maintenance work on data linked to the database (e.g. the catalog)
/// without initializing it, see [`block_db`](Self::block_db).
pub(crate) fn recover_db(&self, db_name: DatabaseName<'static>) -> Result<DatabaseHandle<'_>> {
let mut state = self.state.write().expect("mutex poisoned");
if state.reservations.contains(&db_name) {
return Err(Error::DatabaseReserved {
db_name: db_name.to_string(),
});
}
ensure!(
!state.reservations.contains(&db_name),
DatabaseReserved { db_name }
);
let db_state =
state
.databases
.get(&db_name)
.cloned()
.ok_or_else(|| Error::DatabaseNotFound {
db_name: db_name.to_string(),
})?;
let db_state = state
.databases
.get(&db_name)
.cloned()
.context(DatabaseNotFound { db_name: &db_name })?;
if db_state.is_initialized() {
return Err(Error::DatabaseAlreadyExists {
db_name: db_name.to_string(),
});
}
ensure!(
!db_state.is_initialized(),
DatabaseAlreadyExists { db_name }
);
state.reservations.insert(db_name.clone());
state.reservations.insert(db_name);
Ok(DatabaseHandle {
state: Some(db_state),
config: &self,
@ -159,16 +155,14 @@ impl Config {
db_name: DatabaseName<'static>,
) -> Result<BlockDatabaseGuard<'_>> {
let mut state = self.state.write().expect("mutex poisoned");
if state.reservations.contains(&db_name) {
return Err(Error::DatabaseReserved {
db_name: db_name.to_string(),
});
}
if state.databases.contains_key(&db_name) {
return Err(Error::DatabaseAlreadyExists {
db_name: db_name.to_string(),
});
}
ensure!(
!state.reservations.contains(&db_name),
DatabaseReserved { db_name }
);
ensure!(
!state.databases.contains_key(&db_name),
DatabaseAlreadyExists { db_name }
);
state.reservations.insert(db_name.clone());
Ok(BlockDatabaseGuard {
@ -228,11 +222,9 @@ impl Config {
// TODO: implement for non-initialized databases
let db = self
.db_initialized(db_name)
.ok_or_else(|| Error::DatabaseNotFound {
db_name: db_name.to_string(),
})?;
.context(DatabaseNotFound { db_name })?;
db.update_db_rules(update).map_err(UpdateError::Closure)
db.update_rules(update).map_err(UpdateError::Closure)
}
/// Get all registered remote servers.
@ -311,6 +303,24 @@ impl Config {
pub fn metrics_registry(&self) -> Arc<MetricRegistry> {
Arc::clone(&self.metric_registry)
}
/// Returns the object store of this server
pub fn object_store(&self) -> Arc<ObjectStore> {
Arc::clone(&self.object_store)
}
/// Returns the server id of this server
pub fn server_id(&self) -> ServerId {
self.server_id
}
/// Base location in object store for this server.
pub fn root_path(&self) -> Path {
let id = self.server_id.get();
let mut path = self.object_store.new_path();
path.push_dir(format!("{}", id));
path
}
}
/// Get object store path for the database config under the given root (= path under which the server with the current ID
@ -373,41 +383,14 @@ impl RemoteTemplate {
}
/// Internal representation of the different database states.
///
/// # Shared Data During Transitions
/// The following elements can safely be shared between states because they won't be poisoned by any half-done
/// transition (e.g. starting a transition and then failing due to an IO error):
/// - `object_store`
/// - `exec`
///
/// The following elements can trivially be copied from one state to the next:
/// - `server_id`
/// - `db_name`
///
/// The following elements MUST be copied from one state to the next because partial modifications are not allowed:
/// - `rules`
///
/// Exceptions to the above rules are the following states:
/// - [`Replay`](Self::Replay): replaying twice should (apart from some performance penalties) not do much harm
/// - [`Initialized`](Self::Initialized): the final state is not advanced to anything else
#[derive(Debug)]
#[allow(clippy::large_enum_variant)]
enum DatabaseState {
/// Database is known but nothing is loaded.
Known {
object_store: Arc<ObjectStore>,
exec: Arc<Executor>,
server_id: ServerId,
db_name: DatabaseName<'static>,
},
Known { db_name: DatabaseName<'static> },
/// Rules are loaded
RulesLoaded {
object_store: Arc<ObjectStore>,
exec: Arc<Executor>,
server_id: ServerId,
rules: Arc<DatabaseRules>,
},
RulesLoaded { rules: Arc<DatabaseRules> },
/// Catalog is loaded but data from sequencers / write buffers is not yet replayed.
Replay { db: Arc<Db> },
@ -465,24 +448,6 @@ impl DatabaseState {
}
}
fn object_store(&self) -> Arc<ObjectStore> {
match self {
DatabaseState::Known { object_store, .. } => Arc::clone(object_store),
DatabaseState::RulesLoaded { object_store, .. } => Arc::clone(object_store),
DatabaseState::Replay { db, .. } => Arc::clone(&db.store),
DatabaseState::Initialized { db, .. } => Arc::clone(&db.store),
}
}
fn server_id(&self) -> ServerId {
match self {
DatabaseState::Known { server_id, .. } => *server_id,
DatabaseState::RulesLoaded { server_id, .. } => *server_id,
DatabaseState::Replay { db, .. } => db.server_id,
DatabaseState::Initialized { db, .. } => db.server_id,
}
}
fn rules(&self) -> Option<Arc<DatabaseRules>> {
match self {
DatabaseState::Known { .. } => None,
@ -548,12 +513,12 @@ impl<'a> DatabaseHandle<'a> {
/// Get object store.
pub fn object_store(&self) -> Arc<ObjectStore> {
self.state().object_store()
Arc::clone(&self.config.object_store)
}
/// Get server ID.
pub fn server_id(&self) -> ServerId {
self.state().server_id()
self.config.server_id
}
/// Get metrics registry.
@ -592,32 +557,26 @@ impl<'a> DatabaseHandle<'a> {
/// Advance database state to [`RulesLoaded`](DatabaseStateCode::RulesLoaded).
pub fn advance_rules_loaded(&mut self, rules: DatabaseRules) -> Result<()> {
match self.state().as_ref() {
DatabaseState::Known {
object_store,
exec,
server_id,
db_name,
} => {
if db_name != &rules.name {
return Err(Error::RulesDatabaseNameMismatch {
actual: rules.name.to_string(),
expected: db_name.to_string(),
});
}
DatabaseState::Known { db_name } => {
ensure!(
db_name == &rules.name,
RulesDatabaseNameMismatch {
actual: rules.name,
expected: db_name,
}
);
self.state = Some(Arc::new(DatabaseState::RulesLoaded {
object_store: Arc::clone(&object_store),
exec: Arc::clone(&exec),
server_id: *server_id,
rules: Arc::new(rules),
}));
Ok(())
}
state => Err(Error::InvalidDatabaseStateTransition {
state => InvalidDatabaseStateTransition {
actual: state.code(),
expected: DatabaseStateCode::Known,
}),
}
.fail(),
}
}
@ -629,16 +588,11 @@ impl<'a> DatabaseHandle<'a> {
write_buffer: Option<WriteBufferConfig>,
) -> Result<()> {
match self.state().as_ref() {
DatabaseState::RulesLoaded {
object_store,
exec,
server_id,
rules,
} => {
DatabaseState::RulesLoaded { rules } => {
let database_to_commit = DatabaseToCommit {
server_id: *server_id,
object_store: Arc::clone(&object_store),
exec: Arc::clone(&exec),
server_id: self.config.server_id,
object_store: Arc::clone(&self.config.object_store),
exec: Arc::clone(&self.config.exec),
preserved_catalog,
catalog,
rules: Arc::clone(&rules),
@ -650,10 +604,11 @@ impl<'a> DatabaseHandle<'a> {
Ok(())
}
state => Err(Error::InvalidDatabaseStateTransition {
state => InvalidDatabaseStateTransition {
actual: state.code(),
expected: DatabaseStateCode::RulesLoaded,
}),
}
.fail(),
}
}
@ -663,7 +618,7 @@ impl<'a> DatabaseHandle<'a> {
DatabaseState::Replay { db } => {
if self.config.shutdown.is_cancelled() {
error!("server is shutting down");
return Err(Error::ServerShuttingDown);
return ServerShuttingDown.fail();
}
let shutdown = self.config.shutdown.child_token();
@ -686,10 +641,11 @@ impl<'a> DatabaseHandle<'a> {
Ok(())
}
state => Err(Error::InvalidDatabaseStateTransition {
state => InvalidDatabaseStateTransition {
actual: state.code(),
expected: DatabaseStateCode::Replay,
}),
}
.fail(),
}
}
}
@ -730,40 +686,32 @@ mod test {
use super::*;
use std::num::NonZeroU32;
fn make_config(remote_template: Option<RemoteTemplate>) -> Config {
let store = Arc::new(ObjectStore::new_in_memory());
let server_id = ServerId::try_from(1).unwrap();
let metric_registry = Arc::new(metrics::MetricRegistry::new());
Config::new(
Arc::new(JobRegistry::new()),
Arc::clone(&store),
Arc::new(Executor::new(1)),
server_id,
Arc::clone(&metric_registry),
remote_template,
)
}
#[tokio::test]
async fn create_db() {
// setup
let name = DatabaseName::new("foo").unwrap();
let store = Arc::new(ObjectStore::new_in_memory());
let exec = Arc::new(Executor::new(1));
let server_id = ServerId::try_from(1).unwrap();
let metric_registry = Arc::new(metrics::MetricRegistry::new());
let config = Config::new(
Arc::new(JobRegistry::new()),
Arc::clone(&metric_registry),
None,
);
let config = make_config(None);
let rules = DatabaseRules::new(name.clone());
// getting handle while DB is reserved => fails
{
let _db_reservation = config
.create_db(
Arc::clone(&store),
Arc::clone(&exec),
server_id,
name.clone(),
)
.unwrap();
let _db_reservation = config.create_db(name.clone()).unwrap();
let err = config
.create_db(
Arc::clone(&store),
Arc::clone(&exec),
server_id,
name.clone(),
)
.unwrap_err();
let err = config.create_db(name.clone()).unwrap_err();
assert!(matches!(err, Error::DatabaseReserved { .. }));
let err = config.block_db(name.clone()).unwrap_err();
@ -775,14 +723,7 @@ mod test {
// name in rules must match reserved name
{
let mut db_reservation = config
.create_db(
Arc::clone(&store),
Arc::clone(&exec),
server_id,
DatabaseName::new("bar").unwrap(),
)
.unwrap();
let mut db_reservation = config.create_db(DatabaseName::new("bar").unwrap()).unwrap();
let err = db_reservation
.advance_rules_loaded(rules.clone())
@ -795,14 +736,7 @@ mod test {
// handle.abort just works (aka does not mess up the transaction afterwards)
{
let db_reservation = config
.create_db(
Arc::clone(&store),
Arc::clone(&exec),
server_id,
DatabaseName::new("bar").unwrap(),
)
.unwrap();
let db_reservation = config.create_db(DatabaseName::new("bar").unwrap()).unwrap();
db_reservation.abort();
}
@ -812,21 +746,14 @@ mod test {
// create DB successfully
{
let mut db_reservation = config
.create_db(
Arc::clone(&store),
Arc::clone(&exec),
server_id,
name.clone(),
)
.unwrap();
let mut db_reservation = config.create_db(name.clone()).unwrap();
db_reservation.advance_rules_loaded(rules).unwrap();
let (preserved_catalog, catalog) = load_or_create_preserved_catalog(
&name,
Arc::clone(&store),
server_id,
config.object_store(),
config.server_id(),
config.metrics_registry(),
false,
)
@ -866,14 +793,7 @@ mod test {
assert!(matches!(err, Error::DatabaseAlreadyExists { .. }));
// create DB as second time => fail
let err = config
.create_db(
Arc::clone(&store),
Arc::clone(&exec),
server_id,
name.clone(),
)
.unwrap_err();
let err = config.create_db(name.clone()).unwrap_err();
assert!(matches!(err, Error::DatabaseAlreadyExists { .. }));
// block fully initialized DB => fail
@ -888,40 +808,18 @@ mod test {
async fn recover_db() {
// setup
let name = DatabaseName::new("foo").unwrap();
let store = Arc::new(ObjectStore::new_in_memory());
let exec = Arc::new(Executor::new(1));
let server_id = ServerId::try_from(1).unwrap();
let metric_registry = Arc::new(metrics::MetricRegistry::new());
let config = Config::new(
Arc::new(JobRegistry::new()),
Arc::clone(&metric_registry),
None,
);
let config = make_config(None);
let rules = DatabaseRules::new(name.clone());
// create DB but don't continue with rules loaded (e.g. because the rules file is broken)
{
let db_reservation = config
.create_db(
Arc::clone(&store),
Arc::clone(&exec),
server_id,
name.clone(),
)
.unwrap();
let db_reservation = config.create_db(name.clone()).unwrap();
db_reservation.commit();
}
assert!(config.has_uninitialized_database(&name));
// create DB while it is uninitialized => fail
let err = config
.create_db(
Arc::clone(&store),
Arc::clone(&exec),
server_id,
name.clone(),
)
.unwrap_err();
let err = config.create_db(name.clone()).unwrap_err();
assert!(matches!(err, Error::DatabaseAlreadyExists { .. }));
// recover an unknown DB => fail
@ -935,19 +833,19 @@ mod test {
let mut db_reservation = config.recover_db(name.clone()).unwrap();
assert_eq!(db_reservation.state_code(), DatabaseStateCode::Known);
assert_eq!(db_reservation.db_name(), name);
assert_eq!(db_reservation.server_id(), server_id);
assert_eq!(db_reservation.server_id(), config.server_id());
assert!(db_reservation.rules().is_none());
db_reservation.advance_rules_loaded(rules).unwrap();
assert_eq!(db_reservation.state_code(), DatabaseStateCode::RulesLoaded);
assert_eq!(db_reservation.db_name(), name);
assert_eq!(db_reservation.server_id(), server_id);
assert_eq!(db_reservation.server_id(), config.server_id());
assert!(db_reservation.rules().is_some());
let (preserved_catalog, catalog) = load_or_create_preserved_catalog(
&name,
Arc::clone(&store),
server_id,
config.object_store(),
config.server_id(),
config.metrics_registry(),
false,
)
@ -958,13 +856,13 @@ mod test {
.unwrap();
assert_eq!(db_reservation.state_code(), DatabaseStateCode::Replay);
assert_eq!(db_reservation.db_name(), name);
assert_eq!(db_reservation.server_id(), server_id);
assert_eq!(db_reservation.server_id(), config.server_id());
assert!(db_reservation.rules().is_some());
db_reservation.advance_init().unwrap();
assert_eq!(db_reservation.state_code(), DatabaseStateCode::Initialized);
assert_eq!(db_reservation.db_name(), name);
assert_eq!(db_reservation.server_id(), server_id);
assert_eq!(db_reservation.server_id(), config.server_id());
assert!(db_reservation.rules().is_some());
db_reservation.commit();
@ -978,14 +876,7 @@ mod test {
assert!(matches!(err, Error::DatabaseAlreadyExists { .. }));
// create recovered DB => fail
let err = config
.create_db(
Arc::clone(&store),
Arc::clone(&exec),
server_id,
name.clone(),
)
.unwrap_err();
let err = config.create_db(name.clone()).unwrap_err();
assert!(matches!(err, Error::DatabaseAlreadyExists { .. }));
// block recovered DB => fail
@ -1000,28 +891,13 @@ mod test {
async fn block_db() {
// setup
let name = DatabaseName::new("foo").unwrap();
let store = Arc::new(ObjectStore::new_in_memory());
let exec = Arc::new(Executor::new(1));
let server_id = ServerId::try_from(1).unwrap();
let metric_registry = Arc::new(metrics::MetricRegistry::new());
let config = Config::new(
Arc::new(JobRegistry::new()),
Arc::clone(&metric_registry),
None,
);
let config = make_config(None);
// block DB
let handle = config.block_db(name.clone()).unwrap();
// create while blocked => fail
let err = config
.create_db(
Arc::clone(&store),
Arc::clone(&exec),
server_id,
name.clone(),
)
.unwrap_err();
let err = config.create_db(name.clone()).unwrap_err();
assert!(matches!(err, Error::DatabaseReserved { .. }));
// recover while blocked => fail
@ -1034,14 +910,7 @@ mod test {
// unblock => DB can be created
drop(handle);
config
.create_db(
Arc::clone(&store),
Arc::clone(&exec),
server_id,
name.clone(),
)
.unwrap();
config.create_db(name.clone()).unwrap();
// cleanup
config.drain().await
@ -1051,20 +920,12 @@ mod test {
async fn test_db_drop() {
// setup
let name = DatabaseName::new("foo").unwrap();
let store = Arc::new(ObjectStore::new_in_memory());
let exec = Arc::new(Executor::new(1));
let server_id = ServerId::try_from(1).unwrap();
let metric_registry = Arc::new(metrics::MetricRegistry::new());
let config = Config::new(
Arc::new(JobRegistry::new()),
Arc::clone(&metric_registry),
None,
);
let config = make_config(None);
let rules = DatabaseRules::new(name.clone());
let (preserved_catalog, catalog) = load_or_create_preserved_catalog(
&name,
Arc::clone(&store),
server_id,
config.object_store(),
config.server_id(),
config.metrics_registry(),
false,
)
@ -1072,14 +933,7 @@ mod test {
.unwrap();
// create DB
let mut db_reservation = config
.create_db(
Arc::clone(&store),
Arc::clone(&exec),
server_id,
name.clone(),
)
.unwrap();
let mut db_reservation = config.create_db(name.clone()).unwrap();
db_reservation.advance_rules_loaded(rules).unwrap();
db_reservation
.advance_replay(preserved_catalog, catalog, None)
@ -1126,12 +980,7 @@ mod test {
#[test]
fn resolve_remote() {
let metric_registry = Arc::new(metrics::MetricRegistry::new());
let config = Config::new(
Arc::new(JobRegistry::new()),
Arc::clone(&metric_registry),
Some(RemoteTemplate::new("http://iox-query-{id}:8082")),
);
let config = make_config(Some(RemoteTemplate::new("http://iox-query-{id}:8082")));
let server_id = ServerId::new(NonZeroU32::new(42).unwrap());
let remote = config.resolve_remote(server_id);

View File

@ -50,7 +50,7 @@ use std::{
time::{Duration, Instant},
};
use write_buffer::config::WriteBufferConfig;
use write_buffer::core::WriteBufferError;
use write_buffer::core::{FetchHighWatermark, WriteBufferError};
pub mod access;
pub mod catalog;
@ -144,6 +144,91 @@ pub enum Error {
pub type Result<T, E = Error> = std::result::Result<T, E>;
/// Metrics for data ingest via write buffer.
#[derive(Debug)]
struct WriteBufferIngestMetrics {
/// Metrics domain
domain: Arc<metrics::Domain>,
}
impl WriteBufferIngestMetrics {
fn new(domain: Arc<metrics::Domain>) -> Self {
Self { domain }
}
fn new_sequencer_metrics(&self, sequencer_id: u32) -> SequencerMetrics {
let labels = vec![KeyValue::new("sequencer_id", sequencer_id.to_string())];
let red = self
.domain
.register_red_metric_with_labels(Some("ingest"), labels.clone());
let bytes_read = self.domain.register_counter_metric_with_labels(
"read",
Some("bytes"),
"Bytes read from sequencer",
labels.clone(),
);
let last_sequence_number = self.domain.register_gauge_metric_with_labels(
"last_sequence_number",
None,
"Last consumed sequence number (e.g. Kafka offset)",
&labels,
);
let sequence_number_lag = self.domain.register_gauge_metric_with_labels(
"sequence_number_lag",
None,
"The difference between the the last sequence number available (e.g. Kafka offset) and (= minus) last consumed sequence number",
&labels,
);
let last_min_ts = self.domain.register_gauge_metric_with_labels(
"last_min_ts",
None,
"Minimum timestamp of last write as unix timestamp in nanoseconds",
&labels,
);
let last_max_ts = self.domain.register_gauge_metric_with_labels(
"last_max_ts",
None,
"Maximum timestamp of last write as unix timestamp in nanoseconds",
&labels,
);
SequencerMetrics {
red,
bytes_read,
last_sequence_number,
sequence_number_lag,
last_min_ts,
last_max_ts,
}
}
}
/// Metrics for a single sequencer.
#[derive(Debug)]
struct SequencerMetrics {
/// Metrics for tracking ingest.
red: metrics::RedMetric,
/// Bytes read from sequencer.
///
/// This metric is independent of the success / error state of the entries.
bytes_read: metrics::Counter,
/// Last consumed sequence number (e.g. Kafka offset).
last_sequence_number: metrics::Gauge,
// The difference between the last sequence number available (e.g. Kafka offset) and (= minus) the last consumed
// sequence number.
sequence_number_lag: metrics::Gauge,
/// Minimum timestamp of last write as unix timestamp in nanoseconds.
last_min_ts: metrics::Gauge,
/// Maximum timestamp of last write as unix timestamp in nanoseconds.
last_max_ts: metrics::Gauge,
}
/// This is the main IOx Database object. It is the root object of any
/// specific InfluxDB IOx instance
///
@ -203,10 +288,10 @@ pub type Result<T, E = Error> = std::result::Result<T, E>;
pub struct Db {
rules: RwLock<Arc<DatabaseRules>>,
pub server_id: ServerId, // this is also the Query Server ID
server_id: ServerId, // this is also the Query Server ID
/// Interface to use for persistence
pub store: Arc<ObjectStore>,
store: Arc<ObjectStore>,
/// Executor for running queries
exec: Arc<Executor>,
@ -248,8 +333,8 @@ pub struct Db {
/// Metric labels
metric_labels: Vec<KeyValue>,
/// Metrics for tracking the number of errors that occur while ingesting data
ingest_errors: metrics::Counter,
/// Ingest metrics
ingest_metrics: WriteBufferIngestMetrics,
/// Optionally connect to a write buffer for either buffering writes or reading buffered writes
write_buffer: Option<WriteBufferConfig>,
@ -285,9 +370,8 @@ impl Db {
let metric_labels = database_to_commit.catalog.metric_labels.clone();
let ingest_domain =
metrics_registry.register_domain_with_labels("ingest", metric_labels.clone());
let ingest_errors =
ingest_domain.register_counter_metric("errors", None, "Number of errors during ingest");
metrics_registry.register_domain_with_labels("write_buffer", metric_labels.clone());
let ingest_metrics = WriteBufferIngestMetrics::new(Arc::new(ingest_domain));
let catalog = Arc::new(database_to_commit.catalog);
@ -316,7 +400,7 @@ impl Db {
worker_iterations_lifecycle: AtomicUsize::new(0),
worker_iterations_cleanup: AtomicUsize::new(0),
metric_labels,
ingest_errors,
ingest_metrics,
write_buffer: database_to_commit.write_buffer,
cleanup_lock: Default::default(),
}
@ -333,13 +417,40 @@ impl Db {
}
/// Updates the database rules
pub fn update_db_rules<F, E>(&self, update: F) -> Result<Arc<DatabaseRules>, E>
pub fn update_rules<F, E>(&self, update: F) -> Result<Arc<DatabaseRules>, E>
where
F: FnOnce(DatabaseRules) -> Result<DatabaseRules, E>,
{
let mut rules = self.rules.write();
let new_rules = Arc::new(update(rules.as_ref().clone())?);
*rules = Arc::clone(&new_rules);
let (late_arrive_window_updated, new_rules) = {
let mut rules = self.rules.write();
info!(db_name=%rules.name, "updating rules for database");
let new_rules = Arc::new(update(rules.as_ref().clone())?);
let late_arrive_window_updated = rules.lifecycle_rules.late_arrive_window_seconds
!= new_rules.lifecycle_rules.late_arrive_window_seconds;
*rules = Arc::clone(&new_rules);
(late_arrive_window_updated, new_rules)
};
if late_arrive_window_updated {
// Hold a read lock to prevent concurrent modification and
// use values from re-acquired read guard
let current = self.rules.read();
// Update windows
let partitions = self.catalog.partitions();
for partition in &partitions {
let mut partition = partition.write();
let addr = partition.addr().clone();
if let Some(windows) = partition.persistence_windows_mut() {
info!(partition=%addr, "updating persistence windows");
windows.set_late_arrival_period(Duration::from_secs(
current.lifecycle_rules.late_arrive_window_seconds.get() as u64,
))
}
}
}
Ok(new_rules)
}
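For reference, a minimal caller-side sketch of the renamed `update_rules` API. This is illustrative only: the `db: &Db` handle, the 600-second window, the `NonZeroU32` field type, and the use of `Infallible` are assumptions layered on top of the diff:

use std::{convert::Infallible, num::NonZeroU32};

// Widen the late-arrival window; as implemented above, `update_rules` then
// pushes the new period into every partition's persistence windows.
let new_rules = db
    .update_rules::<_, Infallible>(|mut rules| {
        rules.lifecycle_rules.late_arrive_window_seconds = NonZeroU32::new(600).unwrap();
        Ok(rules)
    })
    .expect("closure cannot fail");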
@ -656,9 +767,17 @@ impl Db {
// streaming from the write buffer loop
async {
if let Some(WriteBufferConfig::Reading(write_buffer)) = &self.write_buffer {
let mut write_buffer = write_buffer
.try_lock()
.expect("no streams should exist at this point");
let mut futures = vec![];
for (_sequencer_id, stream) in write_buffer.streams() {
let fut = self.stream_in_sequenced_entries(stream);
for (sequencer_id, stream) in write_buffer.streams() {
let metrics = self.ingest_metrics.new_sequencer_metrics(sequencer_id);
let fut = self.stream_in_sequenced_entries(
stream.stream,
stream.fetch_high_watermark,
metrics,
);
futures.push(fut);
}
@ -675,32 +794,122 @@ impl Db {
/// This is used to take entries from a `Stream` and put them in the mutable buffer, such as
/// streaming entries from a write buffer.
async fn stream_in_sequenced_entries(
&self,
stream: BoxStream<'_, Result<SequencedEntry, WriteBufferError>>,
async fn stream_in_sequenced_entries<'a>(
&'a self,
mut stream: BoxStream<'a, Result<SequencedEntry, WriteBufferError>>,
f_mark: FetchHighWatermark<'a>,
mut metrics: SequencerMetrics,
) {
stream
.for_each(|sequenced_entry_result| async {
let sequenced_entry = match sequenced_entry_result {
Ok(sequenced_entry) => sequenced_entry,
Err(e) => {
debug!(?e, "Error converting write buffer data to SequencedEntry");
self.ingest_errors.add(1);
return;
}
};
let mut watermark_last_updated: Option<Instant> = None;
let mut watermark = 0;
let sequenced_entry = Arc::new(sequenced_entry);
while let Some(sequenced_entry_result) = stream.next().await {
let red_observation = metrics.red.observation();
if let Err(e) = self.store_sequenced_entry(sequenced_entry) {
// get entry from sequencer
let sequenced_entry = match sequenced_entry_result {
Ok(sequenced_entry) => sequenced_entry,
Err(e) => {
debug!(?e, "Error converting write buffer data to SequencedEntry");
red_observation.client_error();
continue;
}
};
let sequenced_entry = Arc::new(sequenced_entry);
// store entry
match self.store_sequenced_entry(Arc::clone(&sequenced_entry)) {
Ok(_) => {
red_observation.ok();
}
Err(e) => {
debug!(
?e,
"Error storing SequencedEntry from write buffer in database"
);
self.ingest_errors.add(1);
red_observation.error();
}
})
.await
}
// maybe update sequencer watermark
// We are not updating this watermark every round because asking the sequencer for that watermark can be
// quite expensive.
if watermark_last_updated
.map(|ts| ts.elapsed() > Duration::from_secs(10))
.unwrap_or(true)
{
match f_mark().await {
Ok(w) => {
watermark = w;
}
Err(e) => {
debug!(%e, "Error while reading sequencer watermark")
}
}
watermark_last_updated = Some(Instant::now());
}
// update:
// - bytes read
// - last sequence number
// - lag
// - min ts
// - max ts
let sequence = sequenced_entry
.sequence()
.expect("entry from write buffer must be sequenced");
let entry = sequenced_entry.entry();
metrics.bytes_read.add(entry.data().len() as u64);
metrics
.last_sequence_number
.set(sequence.number as usize, &[]);
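// Note: the trailing `saturating_sub(1)` below assumes the watermark reported
// by the sequencer is the *next* sequence number to be assigned, so a reader
// that has consumed everything reports a lag of 0.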
metrics.sequence_number_lag.set(
watermark.saturating_sub(sequence.number).saturating_sub(1) as usize,
&[],
);
if let Some(min_ts) = entry
.partition_writes()
.map(|partition_writes| {
partition_writes
.iter()
.filter_map(|partition_write| {
partition_write
.table_batches()
.iter()
.filter_map(|table_batch| table_batch.min_max_time().ok())
.map(|(min, _max)| min)
.max()
})
.min()
})
.flatten()
{
metrics
.last_min_ts
.set(min_ts.timestamp_nanos() as usize, &[]);
}
if let Some(max_ts) = entry
.partition_writes()
.map(|partition_writes| {
partition_writes
.iter()
.filter_map(|partition_write| {
partition_write
.table_batches()
.iter()
.filter_map(|table_batch| table_batch.min_max_time().ok())
.map(|(_min, max)| max)
.max()
})
.max()
})
.flatten()
{
metrics
.last_max_ts
.set(max_ts.timestamp_nanos() as usize, &[]);
}
}
}
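The throttling pattern above (refresh an expensive value at most every 10 seconds, tracked via an `Option<Instant>`) can be shown in isolation. Below is a self-contained sketch with a hypothetical `fetch_watermark` helper standing in for `f_mark`; it only restates the pattern, not the production code:

    use std::time::{Duration, Instant};

    // Hypothetical stand-in for the expensive sequencer call.
    async fn fetch_watermark() -> Result<u64, String> {
        Ok(42)
    }

    // Refresh `watermark` only if it was never fetched or the last fetch is
    // older than 10 seconds; always record the attempt so errors do not lead
    // to hammering the sequencer.
    async fn maybe_refresh(last: &mut Option<Instant>, watermark: &mut u64) {
        if last
            .map(|ts| ts.elapsed() > Duration::from_secs(10))
            .unwrap_or(true)
        {
            if let Ok(w) = fetch_watermark().await {
                *watermark = w;
            }
            *last = Some(Instant::now());
        }
    }

In the actual loop this state lives in locals of `stream_in_sequenced_entries`; factoring it into a helper here is purely for illustration.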
async fn cleanup_unreferenced_parquet_files(
@ -1208,17 +1417,27 @@ mod tests {
#[tokio::test]
async fn read_from_write_buffer_write_to_mutable_buffer() {
let entry = lp_to_entry("cpu bar=1 10");
let write_buffer_state = MockBufferSharedState::empty_with_n_sequencers(1);
write_buffer_state
.push_entry(SequencedEntry::new_from_sequence(Sequence::new(0, 0), entry).unwrap());
let write_buffer = Arc::new(MockBufferForReading::new(write_buffer_state));
write_buffer_state.push_entry(
SequencedEntry::new_from_sequence(Sequence::new(0, 0), lp_to_entry("mem foo=1 10"))
.unwrap(),
);
write_buffer_state.push_entry(
SequencedEntry::new_from_sequence(
Sequence::new(0, 7),
lp_to_entry("cpu bar=2 20\ncpu bar=3 30"),
)
.unwrap(),
);
let write_buffer = MockBufferForReading::new(write_buffer_state);
let db = TestDb::builder()
.write_buffer(WriteBufferConfig::Reading(Arc::clone(&write_buffer) as _))
let test_db = TestDb::builder()
.write_buffer(WriteBufferConfig::Reading(Arc::new(
tokio::sync::Mutex::new(Box::new(write_buffer) as _),
)))
.build()
.await
.db;
.await;
let db = test_db.db;
// do: start background task loop
let shutdown: CancellationToken = Default::default();
@ -1247,18 +1466,84 @@ mod tests {
tokio::time::sleep(Duration::from_millis(100)).await;
}
// check: metrics
// We need to do that BEFORE shutting down the background loop because the gauges would be dropped and reset otherwise
let metrics = test_db.metric_registry;
metrics
.has_metric_family("write_buffer_ingest_requests_total")
.with_labels(&[
("db_name", "placeholder"),
("svr_id", "1"),
("sequencer_id", "0"),
("status", "ok"),
])
.counter()
.eq(2.0)
.unwrap();
metrics
.has_metric_family("write_buffer_read_bytes_total")
.with_labels(&[
("db_name", "placeholder"),
("svr_id", "1"),
("sequencer_id", "0"),
])
.counter()
.eq(528.0)
.unwrap();
metrics
.has_metric_family("write_buffer_last_sequence_number")
.with_labels(&[
("db_name", "placeholder"),
("svr_id", "1"),
("sequencer_id", "0"),
])
.gauge()
.eq(7.0)
.unwrap();
metrics
.has_metric_family("write_buffer_sequence_number_lag")
.with_labels(&[
("db_name", "placeholder"),
("svr_id", "1"),
("sequencer_id", "0"),
])
.gauge()
.eq(0.0)
.unwrap();
metrics
.has_metric_family("write_buffer_last_min_ts")
.with_labels(&[
("db_name", "placeholder"),
("svr_id", "1"),
("sequencer_id", "0"),
])
.gauge()
.eq(20.0)
.unwrap();
metrics
.has_metric_family("write_buffer_last_max_ts")
.with_labels(&[
("db_name", "placeholder"),
("svr_id", "1"),
("sequencer_id", "0"),
])
.gauge()
.eq(30.0)
.unwrap();
// do: stop background task loop
shutdown.cancel();
join_handle.await.unwrap();
// check: the expected results should be there
let batches = run_query(db, "select * from cpu").await;
let batches = run_query(db, "select * from cpu order by time").await;
let expected = vec![
"+-----+-------------------------------+",
"| bar | time |",
"+-----+-------------------------------+",
"| 1 | 1970-01-01 00:00:00.000000010 |",
"| 2 | 1970-01-01 00:00:00.000000020 |",
"| 3 | 1970-01-01 00:00:00.000000030 |",
"+-----+-------------------------------+",
];
assert_batches_eq!(expected, &batches);
@ -1271,10 +1556,12 @@ mod tests {
String::from("Something bad happened on the way to creating a SequencedEntry").into(),
0,
);
let write_buffer = Arc::new(MockBufferForReading::new(write_buffer_state));
let write_buffer = MockBufferForReading::new(write_buffer_state);
let test_db = TestDb::builder()
.write_buffer(WriteBufferConfig::Reading(Arc::clone(&write_buffer) as _))
.write_buffer(WriteBufferConfig::Reading(Arc::new(
tokio::sync::Mutex::new(Box::new(write_buffer) as _),
)))
.build()
.await;
@ -1291,11 +1578,16 @@ mod tests {
// check: after a while the error should be reported in the database's metrics
let t_0 = Instant::now();
loop {
let family = metrics.try_has_metric_family("ingest_errors_total");
let family = metrics.try_has_metric_family("write_buffer_ingest_requests_total");
if let Ok(metric) = family {
if metric
.with_labels(&[("db_name", "placeholder"), ("svr_id", "1")])
.with_labels(&[
("db_name", "placeholder"),
("svr_id", "1"),
("sequencer_id", "0"),
("status", "client_error"),
])
.counter()
.eq(1.0)
.is_ok()
@ -2259,10 +2551,12 @@ mod tests {
);
write_buffer_state
.push_entry(SequencedEntry::new_from_sequence(Sequence::new(0, 1), entry).unwrap());
let write_buffer = Arc::new(MockBufferForReading::new(write_buffer_state));
let write_buffer = MockBufferForReading::new(write_buffer_state);
let db = TestDb::builder()
.write_buffer(WriteBufferConfig::Reading(Arc::clone(&write_buffer) as _))
.write_buffer(WriteBufferConfig::Reading(Arc::new(
tokio::sync::Mutex::new(Box::new(write_buffer) as _),
)))
.build()
.await
.db;

View File

@ -6,15 +6,16 @@ use hashbrown::{HashMap, HashSet};
use data_types::chunk_metadata::ChunkSummary;
use data_types::chunk_metadata::DetailedChunkSummary;
use data_types::partition_metadata::{PartitionSummary, TableSummary};
use data_types::partition_metadata::{PartitionAddr, PartitionSummary, TableSummary};
use internal_types::schema::Schema;
use snafu::Snafu;
use snafu::{OptionExt, Snafu};
use tracker::{MappedRwLockReadGuard, RwLock, RwLockReadGuard};
use self::chunk::CatalogChunk;
use self::metrics::CatalogMetrics;
use self::partition::Partition;
use self::table::Table;
use data_types::write_summary::WriteSummary;
pub mod chunk;
mod metrics;
@ -135,11 +136,8 @@ impl Catalog {
/// Get a specific table by name, returning `None` if there is no such table
pub fn table(&self, table_name: impl AsRef<str>) -> Result<MappedRwLockReadGuard<'_, Table>> {
let table_name = table_name.as_ref();
RwLockReadGuard::try_map(self.tables.read(), |tables| tables.get(table_name)).map_err(
|_| Error::TableNotFound {
table: table_name.to_string(),
},
)
RwLockReadGuard::try_map(self.tables.read(), |tables| tables.get(table_name))
.map_err(|_| TableNotFound { table: table_name }.build())
}
/// Get a specific partition by name, returning an error if it can't be found
@ -154,9 +152,9 @@ impl Catalog {
self.table(table_name)?
.partition(partition_key)
.cloned()
.ok_or_else(|| Error::PartitionNotFound {
partition: partition_key.to_string(),
table: table_name.to_string(),
.context(PartitionNotFound {
partition: partition_key,
table: table_name,
})
}
@ -174,9 +172,9 @@ impl Catalog {
.read()
.chunk(chunk_id)
.cloned()
.ok_or_else(|| Error::ChunkNotFound {
partition: partition_key.to_string(),
table: table_name.to_string(),
.context(ChunkNotFound {
partition: partition_key,
table: table_name,
chunk_id,
})
}
@ -228,6 +226,23 @@ impl Catalog {
.collect()
}
/// Returns a list of persistence window summaries for each partition
pub fn persistence_summaries(&self) -> Vec<(PartitionAddr, WriteSummary)> {
let mut summaries = Vec::new();
let tables = self.tables.read();
for table in tables.values() {
for partition in table.partitions() {
let partition = partition.read();
if let Some(w) = partition.persistence_windows() {
for summary in w.summaries() {
summaries.push((partition.addr().clone(), summary))
}
}
}
}
summaries
}
pub fn chunk_summaries(&self) -> Vec<ChunkSummary> {
let partition_key = None;
let table_names = TableNameFilter::AllTables;

View File

@ -5,7 +5,7 @@ use std::{
use data_types::partition_metadata;
use partition_metadata::TableSummary;
use snafu::{ResultExt, Snafu};
use snafu::{OptionExt, ResultExt, Snafu};
use datafusion::physical_plan::SendableRecordBatchStream;
use datafusion_util::MemoryStream;
@ -417,7 +417,7 @@ impl QueryChunk for DbChunk {
// column out to get the set of values.
let values = values
.remove(column_name)
.ok_or_else(|| Error::ReadBufferError {
.with_context(|| ReadBufferError {
chunk_id: self.id(),
msg: format!(
"failed to find column_name {:?} in results of tag_values",

View File

@ -7,38 +7,31 @@
//!
//! For example `SELECT * FROM system.chunks`
use std::convert::AsRef;
use std::any::Any;
use std::sync::Arc;
use std::{any::Any, collections::HashMap};
use chrono::{DateTime, Utc};
use arrow::{
array::{
ArrayRef, StringArray, StringBuilder, Time64NanosecondArray, TimestampNanosecondArray,
UInt32Array, UInt32Builder, UInt64Array, UInt64Builder,
},
datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit},
datatypes::{Field, Schema, SchemaRef},
error::Result,
record_batch::RecordBatch,
};
use data_types::{
chunk_metadata::{ChunkSummary, DetailedChunkSummary},
error::ErrorLogger,
job::Job,
partition_metadata::PartitionSummary,
};
use chrono::{DateTime, Utc};
use datafusion::{
catalog::schema::SchemaProvider,
datasource::{datasource::Statistics, TableProvider},
error::{DataFusionError, Result as DataFusionResult},
physical_plan::{memory::MemoryExec, ExecutionPlan},
};
use tracker::TaskTracker;
use crate::JobRegistry;
use super::catalog::Catalog;
use crate::JobRegistry;
use data_types::partition_metadata::TableSummary;
mod chunks;
mod columns;
mod operations;
mod persistence;
// The IOx system schema
pub const SYSTEM_SCHEMA: &str = "system";
@ -47,12 +40,14 @@ const CHUNKS: &str = "chunks";
const COLUMNS: &str = "columns";
const CHUNK_COLUMNS: &str = "chunk_columns";
const OPERATIONS: &str = "operations";
const PERSISTENCE_WINDOWS: &str = "persistence_windows";
pub struct SystemSchemaProvider {
chunks: Arc<dyn TableProvider>,
columns: Arc<dyn TableProvider>,
chunk_columns: Arc<dyn TableProvider>,
operations: Arc<dyn TableProvider>,
persistence_windows: Arc<dyn TableProvider>,
}
impl std::fmt::Debug for SystemSchemaProvider {
@ -67,22 +62,26 @@ impl SystemSchemaProvider {
pub fn new(db_name: impl Into<String>, catalog: Arc<Catalog>, jobs: Arc<JobRegistry>) -> Self {
let db_name = db_name.into();
let chunks = Arc::new(SystemTableProvider {
inner: ChunksTable::new(Arc::clone(&catalog)),
inner: chunks::ChunksTable::new(Arc::clone(&catalog)),
});
let columns = Arc::new(SystemTableProvider {
inner: ColumnsTable::new(Arc::clone(&catalog)),
inner: columns::ColumnsTable::new(Arc::clone(&catalog)),
});
let chunk_columns = Arc::new(SystemTableProvider {
inner: ChunkColumnsTable::new(catalog),
inner: columns::ChunkColumnsTable::new(Arc::clone(&catalog)),
});
let operations = Arc::new(SystemTableProvider {
inner: OperationsTable::new(db_name, jobs),
inner: operations::OperationsTable::new(db_name, jobs),
});
let persistence_windows = Arc::new(SystemTableProvider {
inner: persistence::PersistenceWindowsTable::new(catalog),
});
Self {
chunks,
columns,
chunk_columns,
operations,
persistence_windows,
}
}
}
@ -98,6 +97,7 @@ impl SchemaProvider for SystemSchemaProvider {
COLUMNS.to_string(),
CHUNK_COLUMNS.to_string(),
OPERATIONS.to_string(),
PERSISTENCE_WINDOWS.to_string(),
]
}
@ -107,6 +107,7 @@ impl SchemaProvider for SystemSchemaProvider {
COLUMNS => Some(Arc::clone(&self.columns)),
CHUNK_COLUMNS => Some(Arc::clone(&self.chunk_columns)),
OPERATIONS => Some(Arc::clone(&self.operations)),
PERSISTENCE_WINDOWS => Some(Arc::clone(&self.persistence_windows)),
_ => None,
}
}
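Once wired into the schema provider as above, the new table should be reachable through the same SQL path as the other system tables. A hedged example using the `run_query` test helper seen earlier in this diff (the `db` handle and helper are assumed to be set up as in the write buffer tests):

    let batches = run_query(db, "select * from system.persistence_windows").await;
    // Expected columns, per `persistence_windows_schema()` further down:
    // partition_key, table_name, row_count, time_of_first_write,
    // time_of_last_write, min_timestamp, max_timestamp.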
@ -162,407 +163,6 @@ fn time_to_ts(time: Option<DateTime<Utc>>) -> Option<i64> {
time.map(|ts| ts.timestamp_nanos())
}
/// Implementation of system.chunks table
#[derive(Debug)]
struct ChunksTable {
schema: SchemaRef,
catalog: Arc<Catalog>,
}
impl ChunksTable {
fn new(catalog: Arc<Catalog>) -> Self {
Self {
schema: chunk_summaries_schema(),
catalog,
}
}
}
impl IoxSystemTable for ChunksTable {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}
fn batch(&self) -> Result<RecordBatch> {
from_chunk_summaries(self.schema(), self.catalog.chunk_summaries())
.log_if_error("system.chunks table")
}
}
fn chunk_summaries_schema() -> SchemaRef {
let ts = DataType::Timestamp(TimeUnit::Nanosecond, None);
Arc::new(Schema::new(vec![
Field::new("id", DataType::UInt32, false),
Field::new("partition_key", DataType::Utf8, false),
Field::new("table_name", DataType::Utf8, false),
Field::new("storage", DataType::Utf8, false),
Field::new("lifecycle_action", DataType::Utf8, true),
Field::new("memory_bytes", DataType::UInt64, false),
Field::new("object_store_bytes", DataType::UInt64, false),
Field::new("row_count", DataType::UInt64, false),
Field::new("time_of_first_write", ts.clone(), true),
Field::new("time_of_last_write", ts.clone(), true),
Field::new("time_closed", ts, true),
]))
}
fn from_chunk_summaries(schema: SchemaRef, chunks: Vec<ChunkSummary>) -> Result<RecordBatch> {
let id = chunks.iter().map(|c| Some(c.id)).collect::<UInt32Array>();
let partition_key = chunks
.iter()
.map(|c| Some(c.partition_key.as_ref()))
.collect::<StringArray>();
let table_name = chunks
.iter()
.map(|c| Some(c.table_name.as_ref()))
.collect::<StringArray>();
let storage = chunks
.iter()
.map(|c| Some(c.storage.as_str()))
.collect::<StringArray>();
let lifecycle_action = chunks
.iter()
.map(|c| c.lifecycle_action.map(|a| a.name()))
.collect::<StringArray>();
let memory_bytes = chunks
.iter()
.map(|c| Some(c.memory_bytes as u64))
.collect::<UInt64Array>();
let object_store_bytes = chunks
.iter()
.map(|c| Some(c.object_store_bytes as u64).filter(|&v| v > 0))
.collect::<UInt64Array>();
let row_counts = chunks
.iter()
.map(|c| Some(c.row_count as u64))
.collect::<UInt64Array>();
let time_of_first_write = chunks
.iter()
.map(|c| c.time_of_first_write)
.map(time_to_ts)
.collect::<TimestampNanosecondArray>();
let time_of_last_write = chunks
.iter()
.map(|c| c.time_of_last_write)
.map(time_to_ts)
.collect::<TimestampNanosecondArray>();
let time_closed = chunks
.iter()
.map(|c| c.time_closed)
.map(time_to_ts)
.collect::<TimestampNanosecondArray>();
RecordBatch::try_new(
schema,
vec![
Arc::new(id),
Arc::new(partition_key),
Arc::new(table_name),
Arc::new(storage),
Arc::new(lifecycle_action),
Arc::new(memory_bytes),
Arc::new(object_store_bytes),
Arc::new(row_counts),
Arc::new(time_of_first_write),
Arc::new(time_of_last_write),
Arc::new(time_closed),
],
)
}
/// Implementation of `system.columns` system table
#[derive(Debug)]
struct ColumnsTable {
schema: SchemaRef,
catalog: Arc<Catalog>,
}
impl ColumnsTable {
fn new(catalog: Arc<Catalog>) -> Self {
Self {
schema: partition_summaries_schema(),
catalog,
}
}
}
impl IoxSystemTable for ColumnsTable {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}
fn batch(&self) -> Result<RecordBatch> {
from_partition_summaries(self.schema(), self.catalog.partition_summaries())
.log_if_error("system.columns table")
}
}
fn partition_summaries_schema() -> SchemaRef {
Arc::new(Schema::new(vec![
Field::new("partition_key", DataType::Utf8, false),
Field::new("table_name", DataType::Utf8, false),
Field::new("column_name", DataType::Utf8, false),
Field::new("column_type", DataType::Utf8, false),
Field::new("influxdb_type", DataType::Utf8, true),
]))
}
fn from_partition_summaries(
schema: SchemaRef,
partitions: Vec<PartitionSummary>,
) -> Result<RecordBatch> {
// Assume each partition has roughly 5 tables with 5 columns
let row_estimate = partitions.len() * 25;
let mut partition_key = StringBuilder::new(row_estimate);
let mut table_name = StringBuilder::new(row_estimate);
let mut column_name = StringBuilder::new(row_estimate);
let mut column_type = StringBuilder::new(row_estimate);
let mut influxdb_type = StringBuilder::new(row_estimate);
// Note no rows are produced for partitions with no tables, or
// tables with no columns: There are other tables to list tables
// and columns
for partition in partitions {
let table = partition.table;
for column in table.columns {
partition_key.append_value(&partition.key)?;
table_name.append_value(&table.name)?;
column_name.append_value(&column.name)?;
column_type.append_value(column.type_name())?;
if let Some(t) = &column.influxdb_type {
influxdb_type.append_value(t.as_str())?;
} else {
influxdb_type.append_null()?;
}
}
}
RecordBatch::try_new(
schema,
vec![
Arc::new(partition_key.finish()) as ArrayRef,
Arc::new(table_name.finish()),
Arc::new(column_name.finish()),
Arc::new(column_type.finish()),
Arc::new(influxdb_type.finish()),
],
)
}
/// Implementation of system.column_chunks table
#[derive(Debug)]
struct ChunkColumnsTable {
schema: SchemaRef,
catalog: Arc<Catalog>,
}
impl ChunkColumnsTable {
fn new(catalog: Arc<Catalog>) -> Self {
Self {
schema: chunk_columns_schema(),
catalog,
}
}
}
impl IoxSystemTable for ChunkColumnsTable {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}
fn batch(&self) -> Result<RecordBatch> {
assemble_chunk_columns(self.schema(), self.catalog.detailed_chunk_summaries())
.log_if_error("system.column_chunks table")
}
}
fn chunk_columns_schema() -> SchemaRef {
Arc::new(Schema::new(vec![
Field::new("partition_key", DataType::Utf8, false),
Field::new("chunk_id", DataType::UInt32, false),
Field::new("table_name", DataType::Utf8, false),
Field::new("column_name", DataType::Utf8, false),
Field::new("storage", DataType::Utf8, false),
Field::new("row_count", DataType::UInt64, true),
Field::new("min_value", DataType::Utf8, true),
Field::new("max_value", DataType::Utf8, true),
Field::new("memory_bytes", DataType::UInt64, true),
]))
}
fn assemble_chunk_columns(
schema: SchemaRef,
chunk_summaries: Vec<(Arc<TableSummary>, DetailedChunkSummary)>,
) -> Result<RecordBatch> {
/// Builds an index from column_name -> size
fn make_column_index(summary: &DetailedChunkSummary) -> HashMap<&str, u64> {
summary
.columns
.iter()
.map(|column_summary| {
(
column_summary.name.as_ref(),
column_summary.memory_bytes as u64,
)
})
.collect()
}
// Assume each chunk has roughly 5 columns
let row_estimate = chunk_summaries.len() * 5;
let mut partition_key = StringBuilder::new(row_estimate);
let mut chunk_id = UInt32Builder::new(row_estimate);
let mut table_name = StringBuilder::new(row_estimate);
let mut column_name = StringBuilder::new(row_estimate);
let mut storage = StringBuilder::new(row_estimate);
let mut row_count = UInt64Builder::new(row_estimate);
let mut min_values = StringBuilder::new(row_estimate);
let mut max_values = StringBuilder::new(row_estimate);
let mut memory_bytes = UInt64Builder::new(row_estimate);
// Note no rows are produced for partitions with no chunks, or
// tables with no partitions: There are other tables to list tables
// and columns
for (table_summary, chunk_summary) in chunk_summaries {
let mut column_index = make_column_index(&chunk_summary);
let storage_value = chunk_summary.inner.storage.as_str();
for column in &table_summary.columns {
partition_key.append_value(chunk_summary.inner.partition_key.as_ref())?;
chunk_id.append_value(chunk_summary.inner.id)?;
table_name.append_value(&chunk_summary.inner.table_name)?;
column_name.append_value(&column.name)?;
storage.append_value(storage_value)?;
row_count.append_value(column.count())?;
if let Some(v) = column.stats.min_as_str() {
min_values.append_value(v)?;
} else {
min_values.append(false)?;
}
if let Some(v) = column.stats.max_as_str() {
max_values.append_value(v)?;
} else {
max_values.append(false)?;
}
let size = column_index.remove(column.name.as_str());
memory_bytes.append_option(size)?;
}
}
RecordBatch::try_new(
schema,
vec![
Arc::new(partition_key.finish()) as ArrayRef,
Arc::new(chunk_id.finish()),
Arc::new(table_name.finish()),
Arc::new(column_name.finish()),
Arc::new(storage.finish()),
Arc::new(row_count.finish()),
Arc::new(min_values.finish()),
Arc::new(max_values.finish()),
Arc::new(memory_bytes.finish()),
],
)
}
/// Implementation of system.operations table
#[derive(Debug)]
struct OperationsTable {
schema: SchemaRef,
db_name: String,
jobs: Arc<JobRegistry>,
}
impl OperationsTable {
fn new(db_name: String, jobs: Arc<JobRegistry>) -> Self {
Self {
schema: operations_schema(),
db_name,
jobs,
}
}
}
impl IoxSystemTable for OperationsTable {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}
fn batch(&self) -> Result<RecordBatch> {
from_task_trackers(self.schema(), &self.db_name, self.jobs.tracked())
.log_if_error("system.operations table")
}
}
fn operations_schema() -> SchemaRef {
let ts = DataType::Time64(TimeUnit::Nanosecond);
Arc::new(Schema::new(vec![
Field::new("id", DataType::Utf8, false),
Field::new("status", DataType::Utf8, true),
Field::new("cpu_time_used", ts.clone(), true),
Field::new("wall_time_used", ts, true),
Field::new("partition_key", DataType::Utf8, true),
Field::new("chunk_id", DataType::UInt32, true),
Field::new("description", DataType::Utf8, true),
]))
}
fn from_task_trackers(
schema: SchemaRef,
db_name: &str,
jobs: Vec<TaskTracker<Job>>,
) -> Result<RecordBatch> {
let jobs = jobs
.into_iter()
.filter(|job| job.metadata().db_name() == Some(db_name))
.collect::<Vec<_>>();
let ids = jobs
.iter()
.map(|job| Some(job.id().to_string()))
.collect::<StringArray>();
let statuses = jobs
.iter()
.map(|job| Some(job.get_status().name()))
.collect::<StringArray>();
let cpu_time_used = jobs
.iter()
.map(|job| job.get_status().cpu_nanos().map(|n| n as i64))
.collect::<Time64NanosecondArray>();
let wall_time_used = jobs
.iter()
.map(|job| job.get_status().wall_nanos().map(|n| n as i64))
.collect::<Time64NanosecondArray>();
let partition_keys = jobs
.iter()
.map(|job| job.metadata().partition_key())
.collect::<StringArray>();
let chunk_ids = jobs
.iter()
.map(|job| job.metadata().chunk_id())
.collect::<UInt32Array>();
let descriptions = jobs
.iter()
.map(|job| Some(job.metadata().description()))
.collect::<StringArray>();
RecordBatch::try_new(
schema,
vec![
Arc::new(ids) as ArrayRef,
Arc::new(statuses),
Arc::new(cpu_time_used),
Arc::new(wall_time_used),
Arc::new(partition_keys),
Arc::new(chunk_ids),
Arc::new(descriptions),
],
)
}
/// Creates a DataFusion ExecutionPlan node that scans a single batch
/// of records.
fn scan_batch(
@ -605,141 +205,10 @@ fn scan_batch(
#[cfg(test)]
mod tests {
use super::*;
use arrow::array::{ArrayRef, UInt64Array};
use arrow_util::assert_batches_eq;
use chrono::NaiveDateTime;
use data_types::{
chunk_metadata::{ChunkColumnSummary, ChunkLifecycleAction, ChunkStorage},
partition_metadata::{ColumnSummary, InfluxDbType, StatValues, Statistics, TableSummary},
};
#[test]
fn test_from_chunk_summaries() {
let chunks = vec![
ChunkSummary {
partition_key: Arc::from("p1"),
table_name: Arc::from("table1"),
id: 0,
storage: ChunkStorage::OpenMutableBuffer,
lifecycle_action: None,
memory_bytes: 23754,
object_store_bytes: 0,
row_count: 11,
time_of_first_write: Some(DateTime::from_utc(
NaiveDateTime::from_timestamp(10, 0),
Utc,
)),
time_of_last_write: None,
time_closed: None,
},
ChunkSummary {
partition_key: Arc::from("p1"),
table_name: Arc::from("table1"),
id: 1,
storage: ChunkStorage::OpenMutableBuffer,
lifecycle_action: Some(ChunkLifecycleAction::Persisting),
memory_bytes: 23455,
object_store_bytes: 0,
row_count: 22,
time_of_first_write: None,
time_of_last_write: Some(DateTime::from_utc(
NaiveDateTime::from_timestamp(80, 0),
Utc,
)),
time_closed: None,
},
ChunkSummary {
partition_key: Arc::from("p1"),
table_name: Arc::from("table1"),
id: 2,
storage: ChunkStorage::ObjectStoreOnly,
lifecycle_action: None,
memory_bytes: 1234,
object_store_bytes: 5678,
row_count: 33,
time_of_first_write: Some(DateTime::from_utc(
NaiveDateTime::from_timestamp(100, 0),
Utc,
)),
time_of_last_write: Some(DateTime::from_utc(
NaiveDateTime::from_timestamp(200, 0),
Utc,
)),
time_closed: None,
},
];
let expected = vec![
"+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+",
"| id | partition_key | table_name | storage | lifecycle_action | memory_bytes | object_store_bytes | row_count | time_of_first_write | time_of_last_write | time_closed |",
"+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+",
"| 0 | p1 | table1 | OpenMutableBuffer | | 23754 | | 11 | 1970-01-01 00:00:10 | | |",
"| 1 | p1 | table1 | OpenMutableBuffer | Persisting to Object Storage | 23455 | | 22 | | 1970-01-01 00:01:20 | |",
"| 2 | p1 | table1 | ObjectStoreOnly | | 1234 | 5678 | 33 | 1970-01-01 00:01:40 | 1970-01-01 00:03:20 | |",
"+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+",
];
let schema = chunk_summaries_schema();
let batch = from_chunk_summaries(schema, chunks).unwrap();
assert_batches_eq!(&expected, &[batch]);
}
#[test]
fn test_from_partition_summaries() {
let partitions = vec![
PartitionSummary {
key: "p1".to_string(),
table: TableSummary {
name: "t1".to_string(),
columns: vec![
ColumnSummary {
name: "c1".to_string(),
influxdb_type: Some(InfluxDbType::Tag),
stats: Statistics::I64(StatValues::new_with_value(23)),
},
ColumnSummary {
name: "c2".to_string(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::I64(StatValues::new_with_value(43)),
},
ColumnSummary {
name: "c3".to_string(),
influxdb_type: None,
stats: Statistics::String(StatValues::new_with_value(
"foo".to_string(),
)),
},
ColumnSummary {
name: "time".to_string(),
influxdb_type: Some(InfluxDbType::Timestamp),
stats: Statistics::I64(StatValues::new_with_value(43)),
},
],
},
},
PartitionSummary {
key: "p3".to_string(),
table: TableSummary {
name: "t1".to_string(),
columns: vec![],
},
},
];
let expected = vec![
"+---------------+------------+-------------+-------------+---------------+",
"| partition_key | table_name | column_name | column_type | influxdb_type |",
"+---------------+------------+-------------+-------------+---------------+",
"| p1 | t1 | c1 | I64 | Tag |",
"| p1 | t1 | c2 | I64 | Field |",
"| p1 | t1 | c3 | String | |",
"| p1 | t1 | time | I64 | Timestamp |",
"+---------------+------------+-------------+-------------+---------------+",
];
let batch = from_partition_summaries(partition_summaries_schema(), partitions).unwrap();
assert_batches_eq!(&expected, &[batch]);
}
use super::*;
fn seq_array(start: u64, end: u64) -> ArrayRef {
Arc::new(UInt64Array::from_iter_values(start..end))
@ -820,130 +289,4 @@ mod tests {
err_string
);
}
#[test]
fn test_assemble_chunk_columns() {
let lifecycle_action = None;
let summaries = vec![
(
Arc::new(TableSummary {
name: "t1".to_string(),
columns: vec![
ColumnSummary {
name: "c1".to_string(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::String(StatValues::new(
Some("bar".to_string()),
Some("foo".to_string()),
55,
)),
},
ColumnSummary {
name: "c2".to_string(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::F64(StatValues::new(Some(11.0), Some(43.0), 66)),
},
],
}),
DetailedChunkSummary {
inner: ChunkSummary {
partition_key: "p1".into(),
table_name: "t1".into(),
id: 42,
storage: ChunkStorage::ReadBuffer,
lifecycle_action,
memory_bytes: 23754,
object_store_bytes: 0,
row_count: 11,
time_of_first_write: None,
time_of_last_write: None,
time_closed: None,
},
columns: vec![
ChunkColumnSummary {
name: "c1".into(),
memory_bytes: 11,
},
ChunkColumnSummary {
name: "c2".into(),
memory_bytes: 12,
},
],
},
),
(
Arc::new(TableSummary {
name: "t1".to_string(),
columns: vec![ColumnSummary {
name: "c1".to_string(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::F64(StatValues::new(Some(110.0), Some(430.0), 667)),
}],
}),
DetailedChunkSummary {
inner: ChunkSummary {
partition_key: "p2".into(),
table_name: "t1".into(),
id: 43,
storage: ChunkStorage::OpenMutableBuffer,
lifecycle_action,
memory_bytes: 23754,
object_store_bytes: 0,
row_count: 11,
time_of_first_write: None,
time_of_last_write: None,
time_closed: None,
},
columns: vec![ChunkColumnSummary {
name: "c1".into(),
memory_bytes: 100,
}],
},
),
(
Arc::new(TableSummary {
name: "t2".to_string(),
columns: vec![ColumnSummary {
name: "c3".to_string(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::F64(StatValues::new(Some(-1.0), Some(2.0), 4)),
}],
}),
DetailedChunkSummary {
inner: ChunkSummary {
partition_key: "p2".into(),
table_name: "t2".into(),
id: 44,
storage: ChunkStorage::OpenMutableBuffer,
lifecycle_action,
memory_bytes: 23754,
object_store_bytes: 0,
row_count: 11,
time_of_first_write: None,
time_of_last_write: None,
time_closed: None,
},
columns: vec![ChunkColumnSummary {
name: "c3".into(),
memory_bytes: 200,
}],
},
),
];
let expected = vec![
"+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+",
"| partition_key | chunk_id | table_name | column_name | storage | row_count | min_value | max_value | memory_bytes |",
"+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+",
"| p1 | 42 | t1 | c1 | ReadBuffer | 55 | bar | foo | 11 |",
"| p1 | 42 | t1 | c2 | ReadBuffer | 66 | 11 | 43 | 12 |",
"| p2 | 43 | t1 | c1 | OpenMutableBuffer | 667 | 110 | 430 | 100 |",
"| p2 | 44 | t2 | c3 | OpenMutableBuffer | 4 | -1 | 2 | 200 |",
"+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+",
];
let batch = assemble_chunk_columns(chunk_columns_schema(), summaries).unwrap();
assert_batches_eq!(&expected, &[batch]);
}
}

View File

@ -0,0 +1,201 @@
use std::sync::Arc;
use arrow::array::{StringArray, TimestampNanosecondArray, UInt32Array, UInt64Array};
use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit};
use arrow::error::Result;
use arrow::record_batch::RecordBatch;
use data_types::chunk_metadata::ChunkSummary;
use data_types::error::ErrorLogger;
use crate::db::catalog::Catalog;
use crate::db::system_tables::{time_to_ts, IoxSystemTable};
/// Implementation of system.chunks table
#[derive(Debug)]
pub(super) struct ChunksTable {
schema: SchemaRef,
catalog: Arc<Catalog>,
}
impl ChunksTable {
pub(super) fn new(catalog: Arc<Catalog>) -> Self {
Self {
schema: chunk_summaries_schema(),
catalog,
}
}
}
impl IoxSystemTable for ChunksTable {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}
fn batch(&self) -> Result<RecordBatch> {
from_chunk_summaries(self.schema(), self.catalog.chunk_summaries())
.log_if_error("system.chunks table")
}
}
fn chunk_summaries_schema() -> SchemaRef {
let ts = DataType::Timestamp(TimeUnit::Nanosecond, None);
Arc::new(Schema::new(vec![
Field::new("id", DataType::UInt32, false),
Field::new("partition_key", DataType::Utf8, false),
Field::new("table_name", DataType::Utf8, false),
Field::new("storage", DataType::Utf8, false),
Field::new("lifecycle_action", DataType::Utf8, true),
Field::new("memory_bytes", DataType::UInt64, false),
Field::new("object_store_bytes", DataType::UInt64, false),
Field::new("row_count", DataType::UInt64, false),
Field::new("time_of_first_write", ts.clone(), true),
Field::new("time_of_last_write", ts.clone(), true),
Field::new("time_closed", ts, true),
]))
}
fn from_chunk_summaries(schema: SchemaRef, chunks: Vec<ChunkSummary>) -> Result<RecordBatch> {
let id = chunks.iter().map(|c| Some(c.id)).collect::<UInt32Array>();
let partition_key = chunks
.iter()
.map(|c| Some(c.partition_key.as_ref()))
.collect::<StringArray>();
let table_name = chunks
.iter()
.map(|c| Some(c.table_name.as_ref()))
.collect::<StringArray>();
let storage = chunks
.iter()
.map(|c| Some(c.storage.as_str()))
.collect::<StringArray>();
let lifecycle_action = chunks
.iter()
.map(|c| c.lifecycle_action.map(|a| a.name()))
.collect::<StringArray>();
let memory_bytes = chunks
.iter()
.map(|c| Some(c.memory_bytes as u64))
.collect::<UInt64Array>();
let object_store_bytes = chunks
.iter()
.map(|c| Some(c.object_store_bytes as u64).filter(|&v| v > 0))
.collect::<UInt64Array>();
let row_counts = chunks
.iter()
.map(|c| Some(c.row_count as u64))
.collect::<UInt64Array>();
let time_of_first_write = chunks
.iter()
.map(|c| c.time_of_first_write)
.map(time_to_ts)
.collect::<TimestampNanosecondArray>();
let time_of_last_write = chunks
.iter()
.map(|c| c.time_of_last_write)
.map(time_to_ts)
.collect::<TimestampNanosecondArray>();
let time_closed = chunks
.iter()
.map(|c| c.time_closed)
.map(time_to_ts)
.collect::<TimestampNanosecondArray>();
RecordBatch::try_new(
schema,
vec![
Arc::new(id),
Arc::new(partition_key),
Arc::new(table_name),
Arc::new(storage),
Arc::new(lifecycle_action),
Arc::new(memory_bytes),
Arc::new(object_store_bytes),
Arc::new(row_counts),
Arc::new(time_of_first_write),
Arc::new(time_of_last_write),
Arc::new(time_closed),
],
)
}
#[cfg(test)]
mod tests {
use chrono::{DateTime, NaiveDateTime, Utc};
use arrow_util::assert_batches_eq;
use data_types::chunk_metadata::{ChunkLifecycleAction, ChunkStorage};
use super::*;
#[test]
fn test_from_chunk_summaries() {
let chunks = vec![
ChunkSummary {
partition_key: Arc::from("p1"),
table_name: Arc::from("table1"),
id: 0,
storage: ChunkStorage::OpenMutableBuffer,
lifecycle_action: None,
memory_bytes: 23754,
object_store_bytes: 0,
row_count: 11,
time_of_first_write: Some(DateTime::from_utc(
NaiveDateTime::from_timestamp(10, 0),
Utc,
)),
time_of_last_write: None,
time_closed: None,
},
ChunkSummary {
partition_key: Arc::from("p1"),
table_name: Arc::from("table1"),
id: 1,
storage: ChunkStorage::OpenMutableBuffer,
lifecycle_action: Some(ChunkLifecycleAction::Persisting),
memory_bytes: 23455,
object_store_bytes: 0,
row_count: 22,
time_of_first_write: None,
time_of_last_write: Some(DateTime::from_utc(
NaiveDateTime::from_timestamp(80, 0),
Utc,
)),
time_closed: None,
},
ChunkSummary {
partition_key: Arc::from("p1"),
table_name: Arc::from("table1"),
id: 2,
storage: ChunkStorage::ObjectStoreOnly,
lifecycle_action: None,
memory_bytes: 1234,
object_store_bytes: 5678,
row_count: 33,
time_of_first_write: Some(DateTime::from_utc(
NaiveDateTime::from_timestamp(100, 0),
Utc,
)),
time_of_last_write: Some(DateTime::from_utc(
NaiveDateTime::from_timestamp(200, 0),
Utc,
)),
time_closed: None,
},
];
let expected = vec![
"+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+",
"| id | partition_key | table_name | storage | lifecycle_action | memory_bytes | object_store_bytes | row_count | time_of_first_write | time_of_last_write | time_closed |",
"+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+",
"| 0 | p1 | table1 | OpenMutableBuffer | | 23754 | | 11 | 1970-01-01 00:00:10 | | |",
"| 1 | p1 | table1 | OpenMutableBuffer | Persisting to Object Storage | 23455 | | 22 | | 1970-01-01 00:01:20 | |",
"| 2 | p1 | table1 | ObjectStoreOnly | | 1234 | 5678 | 33 | 1970-01-01 00:01:40 | 1970-01-01 00:03:20 | |",
"+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+",
];
let schema = chunk_summaries_schema();
let batch = from_chunk_summaries(schema, chunks).unwrap();
assert_batches_eq!(&expected, &[batch]);
}
}

View File

@ -0,0 +1,404 @@
use std::collections::HashMap;
use std::sync::Arc;
use arrow::array::{ArrayRef, StringBuilder, UInt32Builder, UInt64Builder};
use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
use arrow::error::Result;
use arrow::record_batch::RecordBatch;
use data_types::chunk_metadata::DetailedChunkSummary;
use data_types::error::ErrorLogger;
use data_types::partition_metadata::{PartitionSummary, TableSummary};
use crate::db::catalog::Catalog;
use crate::db::system_tables::IoxSystemTable;
/// Implementation of `system.columns` system table
#[derive(Debug)]
pub(super) struct ColumnsTable {
schema: SchemaRef,
catalog: Arc<Catalog>,
}
impl ColumnsTable {
pub(super) fn new(catalog: Arc<Catalog>) -> Self {
Self {
schema: partition_summaries_schema(),
catalog,
}
}
}
impl IoxSystemTable for ColumnsTable {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}
fn batch(&self) -> Result<RecordBatch> {
from_partition_summaries(self.schema(), self.catalog.partition_summaries())
.log_if_error("system.columns table")
}
}
fn partition_summaries_schema() -> SchemaRef {
Arc::new(Schema::new(vec![
Field::new("partition_key", DataType::Utf8, false),
Field::new("table_name", DataType::Utf8, false),
Field::new("column_name", DataType::Utf8, false),
Field::new("column_type", DataType::Utf8, false),
Field::new("influxdb_type", DataType::Utf8, true),
]))
}
fn from_partition_summaries(
schema: SchemaRef,
partitions: Vec<PartitionSummary>,
) -> Result<RecordBatch> {
// Assume each partition has roughly 5 tables with 5 columns
let row_estimate = partitions.len() * 25;
let mut partition_key = StringBuilder::new(row_estimate);
let mut table_name = StringBuilder::new(row_estimate);
let mut column_name = StringBuilder::new(row_estimate);
let mut column_type = StringBuilder::new(row_estimate);
let mut influxdb_type = StringBuilder::new(row_estimate);
// Note no rows are produced for partitions with no tables, or
// tables with no columns: There are other tables to list tables
// and columns
for partition in partitions {
let table = partition.table;
for column in table.columns {
partition_key.append_value(&partition.key)?;
table_name.append_value(&table.name)?;
column_name.append_value(&column.name)?;
column_type.append_value(column.type_name())?;
if let Some(t) = &column.influxdb_type {
influxdb_type.append_value(t.as_str())?;
} else {
influxdb_type.append_null()?;
}
}
}
RecordBatch::try_new(
schema,
vec![
Arc::new(partition_key.finish()) as ArrayRef,
Arc::new(table_name.finish()),
Arc::new(column_name.finish()),
Arc::new(column_type.finish()),
Arc::new(influxdb_type.finish()),
],
)
}
/// Implementation of system.column_chunks table
#[derive(Debug)]
pub(super) struct ChunkColumnsTable {
schema: SchemaRef,
catalog: Arc<Catalog>,
}
impl ChunkColumnsTable {
pub(super) fn new(catalog: Arc<Catalog>) -> Self {
Self {
schema: chunk_columns_schema(),
catalog,
}
}
}
impl IoxSystemTable for ChunkColumnsTable {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}
fn batch(&self) -> Result<RecordBatch> {
assemble_chunk_columns(self.schema(), self.catalog.detailed_chunk_summaries())
.log_if_error("system.column_chunks table")
}
}
fn chunk_columns_schema() -> SchemaRef {
Arc::new(Schema::new(vec![
Field::new("partition_key", DataType::Utf8, false),
Field::new("chunk_id", DataType::UInt32, false),
Field::new("table_name", DataType::Utf8, false),
Field::new("column_name", DataType::Utf8, false),
Field::new("storage", DataType::Utf8, false),
Field::new("row_count", DataType::UInt64, true),
Field::new("min_value", DataType::Utf8, true),
Field::new("max_value", DataType::Utf8, true),
Field::new("memory_bytes", DataType::UInt64, true),
]))
}
fn assemble_chunk_columns(
schema: SchemaRef,
chunk_summaries: Vec<(Arc<TableSummary>, DetailedChunkSummary)>,
) -> Result<RecordBatch> {
/// Builds an index from column_name -> size
fn make_column_index(summary: &DetailedChunkSummary) -> HashMap<&str, u64> {
summary
.columns
.iter()
.map(|column_summary| {
(
column_summary.name.as_ref(),
column_summary.memory_bytes as u64,
)
})
.collect()
}
// Assume each chunk has roughly 5 columns
let row_estimate = chunk_summaries.len() * 5;
let mut partition_key = StringBuilder::new(row_estimate);
let mut chunk_id = UInt32Builder::new(row_estimate);
let mut table_name = StringBuilder::new(row_estimate);
let mut column_name = StringBuilder::new(row_estimate);
let mut storage = StringBuilder::new(row_estimate);
let mut row_count = UInt64Builder::new(row_estimate);
let mut min_values = StringBuilder::new(row_estimate);
let mut max_values = StringBuilder::new(row_estimate);
let mut memory_bytes = UInt64Builder::new(row_estimate);
// Note no rows are produced for partitions with no chunks, or
// tables with no partitions: There are other tables to list tables
// and columns
for (table_summary, chunk_summary) in chunk_summaries {
let mut column_index = make_column_index(&chunk_summary);
let storage_value = chunk_summary.inner.storage.as_str();
for column in &table_summary.columns {
partition_key.append_value(chunk_summary.inner.partition_key.as_ref())?;
chunk_id.append_value(chunk_summary.inner.id)?;
table_name.append_value(&chunk_summary.inner.table_name)?;
column_name.append_value(&column.name)?;
storage.append_value(storage_value)?;
row_count.append_value(column.count())?;
if let Some(v) = column.stats.min_as_str() {
min_values.append_value(v)?;
} else {
min_values.append(false)?;
}
if let Some(v) = column.stats.max_as_str() {
max_values.append_value(v)?;
} else {
max_values.append(false)?;
}
let size = column_index.remove(column.name.as_str());
memory_bytes.append_option(size)?;
}
}
RecordBatch::try_new(
schema,
vec![
Arc::new(partition_key.finish()) as ArrayRef,
Arc::new(chunk_id.finish()),
Arc::new(table_name.finish()),
Arc::new(column_name.finish()),
Arc::new(storage.finish()),
Arc::new(row_count.finish()),
Arc::new(min_values.finish()),
Arc::new(max_values.finish()),
Arc::new(memory_bytes.finish()),
],
)
}
#[cfg(test)]
mod tests {
use arrow_util::assert_batches_eq;
use data_types::chunk_metadata::{ChunkColumnSummary, ChunkStorage, ChunkSummary};
use data_types::partition_metadata::{ColumnSummary, InfluxDbType, StatValues, Statistics};
use super::*;
#[test]
fn test_from_partition_summaries() {
let partitions = vec![
PartitionSummary {
key: "p1".to_string(),
table: TableSummary {
name: "t1".to_string(),
columns: vec![
ColumnSummary {
name: "c1".to_string(),
influxdb_type: Some(InfluxDbType::Tag),
stats: Statistics::I64(StatValues::new_with_value(23)),
},
ColumnSummary {
name: "c2".to_string(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::I64(StatValues::new_with_value(43)),
},
ColumnSummary {
name: "c3".to_string(),
influxdb_type: None,
stats: Statistics::String(StatValues::new_with_value(
"foo".to_string(),
)),
},
ColumnSummary {
name: "time".to_string(),
influxdb_type: Some(InfluxDbType::Timestamp),
stats: Statistics::I64(StatValues::new_with_value(43)),
},
],
},
},
PartitionSummary {
key: "p3".to_string(),
table: TableSummary {
name: "t1".to_string(),
columns: vec![],
},
},
];
let expected = vec![
"+---------------+------------+-------------+-------------+---------------+",
"| partition_key | table_name | column_name | column_type | influxdb_type |",
"+---------------+------------+-------------+-------------+---------------+",
"| p1 | t1 | c1 | I64 | Tag |",
"| p1 | t1 | c2 | I64 | Field |",
"| p1 | t1 | c3 | String | |",
"| p1 | t1 | time | I64 | Timestamp |",
"+---------------+------------+-------------+-------------+---------------+",
];
let batch = from_partition_summaries(partition_summaries_schema(), partitions).unwrap();
assert_batches_eq!(&expected, &[batch]);
}
#[test]
fn test_assemble_chunk_columns() {
let lifecycle_action = None;
let summaries = vec![
(
Arc::new(TableSummary {
name: "t1".to_string(),
columns: vec![
ColumnSummary {
name: "c1".to_string(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::String(StatValues::new(
Some("bar".to_string()),
Some("foo".to_string()),
55,
)),
},
ColumnSummary {
name: "c2".to_string(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::F64(StatValues::new(Some(11.0), Some(43.0), 66)),
},
],
}),
DetailedChunkSummary {
inner: ChunkSummary {
partition_key: "p1".into(),
table_name: "t1".into(),
id: 42,
storage: ChunkStorage::ReadBuffer,
lifecycle_action,
memory_bytes: 23754,
object_store_bytes: 0,
row_count: 11,
time_of_first_write: None,
time_of_last_write: None,
time_closed: None,
},
columns: vec![
ChunkColumnSummary {
name: "c1".into(),
memory_bytes: 11,
},
ChunkColumnSummary {
name: "c2".into(),
memory_bytes: 12,
},
],
},
),
(
Arc::new(TableSummary {
name: "t1".to_string(),
columns: vec![ColumnSummary {
name: "c1".to_string(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::F64(StatValues::new(Some(110.0), Some(430.0), 667)),
}],
}),
DetailedChunkSummary {
inner: ChunkSummary {
partition_key: "p2".into(),
table_name: "t1".into(),
id: 43,
storage: ChunkStorage::OpenMutableBuffer,
lifecycle_action,
memory_bytes: 23754,
object_store_bytes: 0,
row_count: 11,
time_of_first_write: None,
time_of_last_write: None,
time_closed: None,
},
columns: vec![ChunkColumnSummary {
name: "c1".into(),
memory_bytes: 100,
}],
},
),
(
Arc::new(TableSummary {
name: "t2".to_string(),
columns: vec![ColumnSummary {
name: "c3".to_string(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::F64(StatValues::new(Some(-1.0), Some(2.0), 4)),
}],
}),
DetailedChunkSummary {
inner: ChunkSummary {
partition_key: "p2".into(),
table_name: "t2".into(),
id: 44,
storage: ChunkStorage::OpenMutableBuffer,
lifecycle_action,
memory_bytes: 23754,
object_store_bytes: 0,
row_count: 11,
time_of_first_write: None,
time_of_last_write: None,
time_closed: None,
},
columns: vec![ChunkColumnSummary {
name: "c3".into(),
memory_bytes: 200,
}],
},
),
];
let expected = vec![
"+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+",
"| partition_key | chunk_id | table_name | column_name | storage | row_count | min_value | max_value | memory_bytes |",
"+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+",
"| p1 | 42 | t1 | c1 | ReadBuffer | 55 | bar | foo | 11 |",
"| p1 | 42 | t1 | c2 | ReadBuffer | 66 | 11 | 43 | 12 |",
"| p2 | 43 | t1 | c1 | OpenMutableBuffer | 667 | 110 | 430 | 100 |",
"| p2 | 44 | t2 | c3 | OpenMutableBuffer | 4 | -1 | 2 | 200 |",
"+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+",
];
let batch = assemble_chunk_columns(chunk_columns_schema(), summaries).unwrap();
assert_batches_eq!(&expected, &[batch]);
}
}

View File

@ -0,0 +1,108 @@
use std::sync::Arc;
use arrow::array::{ArrayRef, StringArray, Time64NanosecondArray, UInt32Array};
use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit};
use arrow::error::Result;
use arrow::record_batch::RecordBatch;
use data_types::error::ErrorLogger;
use data_types::job::Job;
use tracker::TaskTracker;
use crate::db::system_tables::IoxSystemTable;
use crate::JobRegistry;
/// Implementation of system.operations table
#[derive(Debug)]
pub(super) struct OperationsTable {
schema: SchemaRef,
db_name: String,
jobs: Arc<JobRegistry>,
}
impl OperationsTable {
pub(super) fn new(db_name: String, jobs: Arc<JobRegistry>) -> Self {
Self {
schema: operations_schema(),
db_name,
jobs,
}
}
}
impl IoxSystemTable for OperationsTable {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}
fn batch(&self) -> Result<RecordBatch> {
from_task_trackers(self.schema(), &self.db_name, self.jobs.tracked())
.log_if_error("system.operations table")
}
}
fn operations_schema() -> SchemaRef {
let ts = DataType::Time64(TimeUnit::Nanosecond);
Arc::new(Schema::new(vec![
Field::new("id", DataType::Utf8, false),
Field::new("status", DataType::Utf8, true),
Field::new("cpu_time_used", ts.clone(), true),
Field::new("wall_time_used", ts, true),
Field::new("partition_key", DataType::Utf8, true),
Field::new("chunk_id", DataType::UInt32, true),
Field::new("description", DataType::Utf8, true),
]))
}
fn from_task_trackers(
schema: SchemaRef,
db_name: &str,
jobs: Vec<TaskTracker<Job>>,
) -> Result<RecordBatch> {
let jobs = jobs
.into_iter()
.filter(|job| job.metadata().db_name() == Some(db_name))
.collect::<Vec<_>>();
let ids = jobs
.iter()
.map(|job| Some(job.id().to_string()))
.collect::<StringArray>();
let statuses = jobs
.iter()
.map(|job| Some(job.get_status().name()))
.collect::<StringArray>();
let cpu_time_used = jobs
.iter()
.map(|job| job.get_status().cpu_nanos().map(|n| n as i64))
.collect::<Time64NanosecondArray>();
let wall_time_used = jobs
.iter()
.map(|job| job.get_status().wall_nanos().map(|n| n as i64))
.collect::<Time64NanosecondArray>();
let partition_keys = jobs
.iter()
.map(|job| job.metadata().partition_key())
.collect::<StringArray>();
let chunk_ids = jobs
.iter()
.map(|job| job.metadata().chunk_id())
.collect::<UInt32Array>();
let descriptions = jobs
.iter()
.map(|job| Some(job.metadata().description()))
.collect::<StringArray>();
RecordBatch::try_new(
schema,
vec![
Arc::new(ids) as ArrayRef,
Arc::new(statuses),
Arc::new(cpu_time_used),
Arc::new(wall_time_used),
Arc::new(partition_keys),
Arc::new(chunk_ids),
Arc::new(descriptions),
],
)
}

View File

@ -0,0 +1,154 @@
use std::sync::Arc;
use arrow::array::{StringArray, TimestampNanosecondArray, UInt64Array};
use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit};
use arrow::error::Result;
use arrow::record_batch::RecordBatch;
use data_types::error::ErrorLogger;
use data_types::partition_metadata::PartitionAddr;
use data_types::write_summary::WriteSummary;
use crate::db::catalog::Catalog;
use crate::db::system_tables::IoxSystemTable;
/// Implementation of system.persistence_windows table
#[derive(Debug)]
pub(super) struct PersistenceWindowsTable {
schema: SchemaRef,
catalog: Arc<Catalog>,
}
impl PersistenceWindowsTable {
pub(super) fn new(catalog: Arc<Catalog>) -> Self {
Self {
schema: persistence_windows_schema(),
catalog,
}
}
}
impl IoxSystemTable for PersistenceWindowsTable {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}
fn batch(&self) -> Result<RecordBatch> {
from_write_summaries(self.schema(), self.catalog.persistence_summaries())
.log_if_error("system.persistence_windows table")
}
}
fn persistence_windows_schema() -> SchemaRef {
let ts = DataType::Timestamp(TimeUnit::Nanosecond, None);
Arc::new(Schema::new(vec![
Field::new("partition_key", DataType::Utf8, false),
Field::new("table_name", DataType::Utf8, false),
Field::new("row_count", DataType::UInt64, false),
Field::new("time_of_first_write", ts.clone(), false),
Field::new("time_of_last_write", ts.clone(), false),
Field::new("min_timestamp", ts.clone(), false),
Field::new("max_timestamp", ts, false),
]))
}
fn from_write_summaries(
schema: SchemaRef,
chunks: Vec<(PartitionAddr, WriteSummary)>,
) -> Result<RecordBatch> {
let partition_key = chunks
.iter()
.map(|(addr, _)| Some(addr.partition_key.as_ref()))
.collect::<StringArray>();
let table_name = chunks
.iter()
.map(|(addr, _)| Some(addr.table_name.as_ref()))
.collect::<StringArray>();
let row_counts = chunks
.iter()
.map(|(_, w)| Some(w.row_count as u64))
.collect::<UInt64Array>();
let time_of_first_write = chunks
.iter()
.map(|(_, w)| Some(w.time_of_first_write.timestamp_nanos()))
.collect::<TimestampNanosecondArray>();
let time_of_last_write = chunks
.iter()
.map(|(_, w)| Some(w.time_of_last_write.timestamp_nanos()))
.collect::<TimestampNanosecondArray>();
let min_timestamp = chunks
.iter()
.map(|(_, w)| Some(w.min_timestamp.timestamp_nanos()))
.collect::<TimestampNanosecondArray>();
let max_timestamp = chunks
.iter()
.map(|(_, w)| Some(w.max_timestamp.timestamp_nanos()))
.collect::<TimestampNanosecondArray>();
RecordBatch::try_new(
schema,
vec![
Arc::new(partition_key),
Arc::new(table_name),
Arc::new(row_counts),
Arc::new(time_of_first_write),
Arc::new(time_of_last_write),
Arc::new(min_timestamp),
Arc::new(max_timestamp),
],
)
}
#[cfg(test)]
mod tests {
use chrono::{TimeZone, Utc};
use arrow_util::assert_batches_eq;
use super::*;
#[test]
fn test_from_write_summaries() {
let addr = PartitionAddr {
db_name: Arc::from("db"),
table_name: Arc::from("table"),
partition_key: Arc::from("partition"),
};
let summaries = vec![
(
addr.clone(),
WriteSummary {
time_of_first_write: Utc.timestamp_nanos(0),
time_of_last_write: Utc.timestamp_nanos(20),
min_timestamp: Utc.timestamp_nanos(50),
max_timestamp: Utc.timestamp_nanos(60),
row_count: 320,
},
),
(
addr,
WriteSummary {
time_of_first_write: Utc.timestamp_nanos(6),
time_of_last_write: Utc.timestamp_nanos(21),
min_timestamp: Utc.timestamp_nanos(1),
max_timestamp: Utc.timestamp_nanos(2),
row_count: 2,
},
),
];
let expected = vec![
"+---------------+------------+-----------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+",
"| partition_key | table_name | row_count | time_of_first_write | time_of_last_write | min_timestamp | max_timestamp |",
"+---------------+------------+-----------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+",
"| partition | table | 320 | 1970-01-01 00:00:00 | 1970-01-01 00:00:00.000000020 | 1970-01-01 00:00:00.000000050 | 1970-01-01 00:00:00.000000060 |",
"| partition | table | 2 | 1970-01-01 00:00:00.000000006 | 1970-01-01 00:00:00.000000021 | 1970-01-01 00:00:00.000000001 | 1970-01-01 00:00:00.000000002 |",
"+---------------+------------+-----------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+",
];
let schema = persistence_windows_schema();
let batch = from_write_summaries(schema, summaries).unwrap();
assert_batches_eq!(&expected, &[batch]);
}
}

View File

@ -2,29 +2,19 @@
use data_types::{
database_rules::{DatabaseRules, WriteBufferConnection},
database_state::DatabaseStateCode,
server_id::ServerId,
error::ErrorLogger,
DatabaseName,
};
use futures::TryStreamExt;
use generated_types::database_rules::decode_database_rules;
use internal_types::once::OnceNonZeroU32;
use object_store::{
path::{parsed::DirsAndFileName, ObjectStorePath, Path},
ObjectStore, ObjectStoreApi,
};
use observability_deps::tracing::{debug, error, info, warn};
use parking_lot::Mutex;
use observability_deps::tracing::{error, info, warn};
use parquet_file::catalog::PreservedCatalog;
use query::exec::Executor;
use snafu::{OptionExt, ResultExt, Snafu};
use std::{
collections::HashMap,
sync::{
atomic::{AtomicBool, Ordering},
Arc,
},
};
use tokio::sync::Semaphore;
use snafu::{ResultExt, Snafu};
use std::sync::Arc;
use write_buffer::config::WriteBufferConfig;
use crate::{
@ -45,9 +35,6 @@ pub enum Error {
source: generated_types::database_rules::DecodeError,
},
#[snafu(display("id already set"))]
IdAlreadySet { id: ServerId },
#[snafu(display("unable to use server until id is set"))]
IdNotSet,
@ -97,472 +84,254 @@ pub enum Error {
pub type Result<T, E = Error> = std::result::Result<T, E>;
#[derive(Debug, Default)]
pub struct CurrentServerId(OnceNonZeroU32);
/// Loads the database configurations based on the databases in the
/// object store. Any databases already in the config won't be
/// replaced.
///
/// Returns a Vec containing the results of loading the contained databases
pub(crate) async fn initialize_server(
config: Arc<Config>,
wipe_on_error: bool,
) -> Result<Vec<(DatabaseName<'static>, Result<()>)>> {
let root = config.root_path();
impl CurrentServerId {
pub fn set(&self, id: ServerId) -> Result<()> {
let id = id.get();
// get the database names from the object store prefixes
// TODO: update object store to pull back all common prefixes by
// following the next tokens.
let list_result = config
.object_store()
.list_with_delimiter(&root)
.await
.context(StoreError)?;
match self.0.set(id) {
Ok(()) => {
info!(server_id = id, "server ID set");
Ok(())
}
Err(id) => Err(Error::IdAlreadySet {
id: ServerId::new(id),
}),
}
}
let handles: Vec<_> = list_result
.common_prefixes
.into_iter()
.filter_map(|mut path| {
let config = Arc::clone(&config);
let root = root.clone();
path.set_file_name(DB_RULES_FILE_NAME);
let db_name = db_name_from_rules_path(&path)
.log_if_error("invalid database path")
.ok()?;
pub fn get(&self) -> Result<ServerId> {
self.0.get().map(ServerId::new).context(IdNotSet)
}
}
#[derive(Debug)]
pub struct InitStatus {
pub server_id: CurrentServerId,
/// Flags that databases are loaded and server is ready to read/write data.
initialized: AtomicBool,
/// Semaphore that limits the number of jobs that load DBs when the serverID is set.
///
/// Note that this semaphore is more of a "lock" than an arbitrary semaphore. All the other sync structures (mutex,
/// rwlock) require something to be wrapped which we don't have in our case, so we're using a semaphore here. We
/// want exactly 1 background worker to mess with the server init / DB loading, otherwise everything in the critical
/// section (in [`maybe_initialize_server`](Self::maybe_initialize_server)) will break apart. So this semaphore
/// cannot be configured.
initialize_semaphore: Semaphore,
/// Error occurred during generic server init (e.g. listing store content).
error_generic: Mutex<Option<Arc<Error>>>,
/// Errors that occurred during some DB init.
errors_databases: Arc<Mutex<HashMap<String, Arc<Error>>>>,
/// Automatic wipe-on-error recovery
///
/// See <https://github.com/influxdata/influxdb_iox/issues/1522>
pub(crate) wipe_on_error: AtomicBool,
}
impl InitStatus {
/// Create new "not initialized" status.
pub fn new() -> Self {
Self {
server_id: Default::default(),
initialized: AtomicBool::new(false),
// Always set semaphore permits to `1`, see design comments in `Server::initialize_semaphore`.
initialize_semaphore: Semaphore::new(1),
error_generic: Default::default(),
errors_databases: Default::default(),
wipe_on_error: AtomicBool::new(true),
}
}
/// Base location in object store for this writer.
pub fn root_path(&self, store: &ObjectStore) -> Result<Path> {
let id = self.server_id.get()?;
let mut path = store.new_path();
path.push_dir(format!("{}", id));
Ok(path)
}
/// Check if server is loaded. Databases are loaded and server is ready to read/write.
pub fn initialized(&self) -> bool {
// Need `Acquire` ordering because IF we see a `true` here, this thread will likely also read data that
// `maybe_initialize_server` wrote before toggling the flag with `Release`. The `Acquire` flag here ensures that
// every data access AFTER the following line will also stay AFTER this line.
self.initialized.load(Ordering::Acquire)
}
/// Error occurred during generic server init (e.g. listing store content).
pub fn error_generic(&self) -> Option<Arc<Error>> {
let guard = self.error_generic.lock();
guard.clone()
}
/// List all databases with errors in sorted order.
pub fn databases_with_errors(&self) -> Vec<String> {
let guard = self.errors_databases.lock();
let mut names: Vec<_> = guard.keys().cloned().collect();
names.sort();
names
}
/// Error that occurred during initialization of a specific database.
pub fn error_database(&self, db_name: &str) -> Option<Arc<Error>> {
let guard = self.errors_databases.lock();
guard.get(db_name).cloned()
}
/// Loads the database configurations based on the databases in the
/// object store. Any databases already in the config won't be
/// replaced.
///
/// This requires the serverID to be set (will be a no-op otherwise).
///
/// It will be a no-op if the configs are already loaded and the server is ready.
pub(crate) async fn maybe_initialize_server(
&self,
store: Arc<ObjectStore>,
config: Arc<Config>,
exec: Arc<Executor>,
) {
let server_id = match self.server_id.get() {
Ok(id) => id,
Err(e) => {
debug!(%e, "cannot initialize server because cannot get serverID");
return;
}
};
let _guard = self
.initialize_semaphore
.acquire()
.await
.expect("semaphore should not be closed");
// Note that we use Acquire-Release ordering for the atomic within the semaphore to ensure that another thread
// that enters this semaphore after we've left actually sees the correct `is_ready` flag.
if self.initialized.load(Ordering::Acquire) {
// already loaded, so do nothing
return;
}
// Check if there was a previous failed attempt
if self.error_generic().is_some() {
return;
}
match self
.maybe_initialize_server_inner(store, config, exec, server_id)
.await
{
Ok(_) => {
// mark as ready (use correct ordering for Acquire-Release)
self.initialized.store(true, Ordering::Release);
info!("loaded databases, server is initalized");
}
Err(e) => {
error!(%e, "error during server init");
let mut guard = self.error_generic.lock();
*guard = Some(Arc::new(e));
}
}
}
async fn maybe_initialize_server_inner(
&self,
store: Arc<ObjectStore>,
config: Arc<Config>,
exec: Arc<Executor>,
server_id: ServerId,
) -> Result<()> {
let root = self.root_path(&store)?;
// get the database names from the object store prefixes
// TODO: update object store to pull back all common prefixes by
// following the next tokens.
let list_result = store.list_with_delimiter(&root).await.context(StoreError)?;
let handles: Vec<_> = list_result
.common_prefixes
.into_iter()
.filter_map(|mut path| {
let store = Arc::clone(&store);
let config = Arc::clone(&config);
let exec = Arc::clone(&exec);
let errors_databases = Arc::clone(&self.errors_databases);
let wipe_on_error = self.wipe_on_error.load(Ordering::Relaxed);
let root = root.clone();
path.set_file_name(DB_RULES_FILE_NAME);
match db_name_from_rules_path(&path) {
Ok(db_name) => {
let handle = tokio::task::spawn(async move {
match Self::initialize_database(
server_id,
store,
config,
exec,
root,
db_name.clone(),
wipe_on_error,
)
.await
{
Ok(()) => {
info!(%db_name, "database initialized");
}
Err(e) => {
error!(%e, %db_name, "cannot load database");
let mut guard = errors_databases.lock();
guard.insert(db_name.to_string(), Arc::new(e));
}
}
});
Some(handle)
}
Err(e) => {
error!(%e, "invalid database path");
None
}
}
Some(async move {
let result =
initialize_database(config, root, db_name.clone(), wipe_on_error).await;
(db_name, result)
})
.collect();
})
.collect();
futures::future::join_all(handles).await;
Ok(futures::future::join_all(handles).await)
}
async fn initialize_database(
config: Arc<Config>,
root: Path,
db_name: DatabaseName<'static>,
wipe_on_error: bool,
) -> Result<()> {
// Reserve name before expensive IO (e.g. loading the preserved catalog)
let mut handle = config
.create_db(db_name)
.map_err(Box::new)
.context(InitDbError)?;
match try_advance_database_init_process_until_complete(&mut handle, &root, wipe_on_error).await
{
Ok(true) => {
// finished init and keep DB
handle.commit();
Ok(())
}
Ok(false) => {
// finished but do not keep DB
handle.abort();
Ok(())
}
Err(e) => {
// encountered some error, still commit intermediate result
handle.commit();
Err(e)
}
}
}
async fn load_database_rules(store: Arc<ObjectStore>, path: Path) -> Result<Option<DatabaseRules>> {
let serialized_rules = loop {
match get_database_config_bytes(&path, &store).await {
Ok(data) => break data,
Err(e) => {
if let Error::NoDatabaseConfigError { location } = &e {
warn!(?location, "{}", e);
return Ok(None);
}
error!(
"error getting database config {:?} from object store: {}",
path, e
);
tokio::time::sleep(tokio::time::Duration::from_secs(STORE_ERROR_PAUSE_SECONDS))
.await;
}
}
};
let rules = decode_database_rules(serialized_rules.freeze())
.context(ErrorDeserializingRulesProtobuf)?;
Ok(Some(rules))
}
pub(crate) async fn wipe_preserved_catalog_and_maybe_recover(
config: Arc<Config>,
db_name: &DatabaseName<'static>,
) -> Result<()> {
let store = config.object_store();
if config.has_uninitialized_database(db_name) {
let mut handle = config
.recover_db(db_name.clone())
.map_err(|e| Arc::new(e) as _)
.context(RecoverDbError)?;
if !((handle.state_code() == DatabaseStateCode::Known)
|| (handle.state_code() == DatabaseStateCode::RulesLoaded))
{
// cannot wipe because init state is already too far
return Err(Error::DbPartiallyInitialized {
db_name: db_name.to_string(),
});
}
// wipe while holding handle so no other init/wipe process can interact with the catalog
PreservedCatalog::wipe(&store, handle.server_id(), db_name)
.await
.map_err(Box::new)
.context(PreservedCatalogWipeError)?;
let root = config.root_path();
let result =
try_advance_database_init_process_until_complete(&mut handle, &root, true).await;
// Commit changes even if failed
handle.commit();
result.map(|_| ())
} else {
let handle = config
.block_db(db_name.clone())
.map_err(|e| Arc::new(e) as _)
.context(RecoverDbError)?;
PreservedCatalog::wipe(&store, config.server_id(), db_name)
.await
.map_err(Box::new)
.context(PreservedCatalogWipeError)?;
drop(handle);
info!(%db_name, "wiped preserved catalog of non-registered database");
Ok(())
}
}
async fn initialize_database(
server_id: ServerId,
store: Arc<ObjectStore>,
config: Arc<Config>,
exec: Arc<Executor>,
root: Path,
db_name: DatabaseName<'static>,
wipe_on_error: bool,
) -> Result<()> {
// Reserve name before expensive IO (e.g. loading the preserved catalog)
let mut handle = config
.create_db(store, exec, server_id, db_name)
.map_err(Box::new)
.context(InitDbError)?;
match Self::try_advance_database_init_process_until_complete(
&mut handle,
&root,
wipe_on_error,
)
.await
{
Ok(true) => {
// finished init and keep DB
handle.commit();
Ok(())
/// Try to make as much progress as possible with DB init.
///
/// Returns an error if there was an error along the way (in which case the handle should still be committed to save
/// the intermediate result). Returns `Ok(true)` if DB init is finished and `Ok(false)` if the DB can be forgotten
/// (e.g. because no rules file is present).
async fn try_advance_database_init_process_until_complete(
handle: &mut DatabaseHandle<'_>,
root: &Path,
wipe_on_error: bool,
) -> Result<bool> {
loop {
match try_advance_database_init_process(handle, root, wipe_on_error).await? {
InitProgress::Unfinished => {}
InitProgress::Done => {
return Ok(true);
}
Ok(false) => {
// finished but do not keep DB
handle.abort();
Ok(())
}
Err(e) => {
// encountered some error, still commit intermediate result
handle.commit();
Err(e)
InitProgress::Forget => {
return Ok(false);
}
}
}
}
async fn load_database_rules(
store: Arc<ObjectStore>,
path: Path,
) -> Result<Option<DatabaseRules>> {
let serialized_rules = loop {
match get_database_config_bytes(&path, &store).await {
Ok(data) => break data,
Err(e) => {
if let Error::NoDatabaseConfigError { location } = &e {
warn!(?location, "{}", e);
return Ok(None);
}
error!(
"error getting database config {:?} from object store: {}",
path, e
);
tokio::time::sleep(tokio::time::Duration::from_secs(STORE_ERROR_PAUSE_SECONDS))
.await;
/// Try to make some progress in the DB init.
async fn try_advance_database_init_process(
handle: &mut DatabaseHandle<'_>,
root: &Path,
wipe_on_error: bool,
) -> Result<InitProgress> {
match handle.state_code() {
DatabaseStateCode::Known => {
// known => load DB rules
let path = object_store_path_for_database_config(root, &handle.db_name());
match load_database_rules(handle.object_store(), path).await? {
Some(rules) => {
handle
.advance_rules_loaded(rules)
.map_err(Box::new)
.context(InitDbError)?;
// there is still more work to do for this DB
Ok(InitProgress::Unfinished)
}
None => {
// no rules file present, advise to forget this DB
Ok(InitProgress::Forget)
}
}
};
let rules = decode_database_rules(serialized_rules.freeze())
.context(ErrorDeserializingRulesProtobuf)?;
Ok(Some(rules))
}
pub(crate) async fn wipe_preserved_catalog_and_maybe_recover(
&self,
store: Arc<ObjectStore>,
config: Arc<Config>,
server_id: ServerId,
db_name: DatabaseName<'static>,
) -> Result<()> {
if config.has_uninitialized_database(&db_name) {
let mut handle = config
.recover_db(db_name.clone())
.map_err(|e| Arc::new(e) as _)
.context(RecoverDbError)?;
if !((handle.state_code() == DatabaseStateCode::Known)
|| (handle.state_code() == DatabaseStateCode::RulesLoaded))
{
// cannot wipe because init state is already too far
return Err(Error::DbPartiallyInitialized {
db_name: db_name.to_string(),
});
}
// wipe while holding handle so no other init/wipe process can interact with the catalog
PreservedCatalog::wipe(&store, handle.server_id(), &db_name)
.await
.map_err(Box::new)
.context(PreservedCatalogWipeError)?;
let root = self.root_path(&store)?;
let wipe_on_error = self.wipe_on_error.load(Ordering::Relaxed);
match Self::try_advance_database_init_process_until_complete(
&mut handle,
&root,
}
DatabaseStateCode::RulesLoaded => {
// rules already loaded => continue with loading preserved catalog
let (preserved_catalog, catalog) = load_or_create_preserved_catalog(
&handle.db_name(),
handle.object_store(),
handle.server_id(),
handle.metrics_registry(),
wipe_on_error,
)
.await
{
Ok(_) => {
// yeah, recovered DB
handle.commit();
.map_err(|e| Box::new(e) as _)
.context(CatalogLoadError)?;
let mut guard = self.errors_databases.lock();
guard.remove(&db_name.to_string());
info!(%db_name, "wiped preserved catalog of registered database and recovered");
Ok(())
}
Err(e) => {
// could not recover, but still keep new result
handle.commit();
let mut guard = self.errors_databases.lock();
let e = Arc::new(e);
guard.insert(db_name.to_string(), Arc::clone(&e));
warn!(%db_name, %e, "wiped preserved catalog of registered database but still cannot recover");
Err(Error::RecoverDbError { source: e })
}
}
} else {
let handle = config
.block_db(db_name.clone())
.map_err(|e| Arc::new(e) as _)
.context(RecoverDbError)?;
PreservedCatalog::wipe(&store, server_id, &db_name)
let rules = handle
.rules()
.expect("in this state rules should be loaded");
let write_buffer = WriteBufferConfig::new(handle.server_id(), &rules)
.await
.context(CreateWriteBuffer {
config: rules.write_buffer_connection.clone(),
})?;
info!(write_buffer_enabled=?write_buffer.is_some(), db_name=rules.db_name(), "write buffer config");
handle
.advance_replay(preserved_catalog, catalog, write_buffer)
.map_err(Box::new)
.context(PreservedCatalogWipeError)?;
.context(InitDbError)?;
drop(handle);
info!(%db_name, "wiped preserved catalog of non-registered database");
Ok(())
// there is still more work to do for this DB
Ok(InitProgress::Unfinished)
}
}
DatabaseStateCode::Replay => {
let db = handle
.db_any_state()
.expect("DB should be available in this state");
db.perform_replay().await;
/// Try to make as much progress as possible with DB init.
///
/// Returns an error if there was an error along the way (in which case the handle should still be committed to save
/// the intermediate result). Returns `Ok(true)` if DB init is finished and `Ok(false)` if the DB can be forgotten
/// (e.g. because no rules file is present).
async fn try_advance_database_init_process_until_complete(
handle: &mut DatabaseHandle<'_>,
root: &Path,
wipe_on_error: bool,
) -> Result<bool> {
loop {
match Self::try_advance_database_init_process(handle, root, wipe_on_error).await? {
InitProgress::Unfinished => {}
InitProgress::Done => {
return Ok(true);
}
InitProgress::Forget => {
return Ok(false);
}
}
handle
.advance_init()
.map_err(Box::new)
.context(InitDbError)?;
// there is still more work to do for this DB
Ok(InitProgress::Unfinished)
}
}
/// Try to make some progress in the DB init.
async fn try_advance_database_init_process(
handle: &mut DatabaseHandle<'_>,
root: &Path,
wipe_on_error: bool,
) -> Result<InitProgress> {
match handle.state_code() {
DatabaseStateCode::Known => {
// known => load DB rules
let path = object_store_path_for_database_config(root, &handle.db_name());
match Self::load_database_rules(handle.object_store(), path).await? {
Some(rules) => {
handle
.advance_rules_loaded(rules)
.map_err(Box::new)
.context(InitDbError)?;
// there is still more work to do for this DB
Ok(InitProgress::Unfinished)
}
None => {
// no rules file present, advise to forget this DB
Ok(InitProgress::Forget)
}
}
}
DatabaseStateCode::RulesLoaded => {
// rules already loaded => continue with loading preserved catalog
let (preserved_catalog, catalog) = load_or_create_preserved_catalog(
&handle.db_name(),
handle.object_store(),
handle.server_id(),
handle.metrics_registry(),
wipe_on_error,
)
.await
.map_err(|e| Box::new(e) as _)
.context(CatalogLoadError)?;
let rules = handle
.rules()
.expect("in this state rules should be loaded");
let write_buffer = WriteBufferConfig::new(handle.server_id(), &rules)
.await
.context(CreateWriteBuffer {
config: rules.write_buffer_connection.clone(),
})?;
info!(write_buffer_enabled=?write_buffer.is_some(), db_name=rules.db_name(), "write buffer config");
handle
.advance_replay(preserved_catalog, catalog, write_buffer)
.map_err(Box::new)
.context(InitDbError)?;
// there is still more work to do for this DB
Ok(InitProgress::Unfinished)
}
DatabaseStateCode::Replay => {
let db = handle
.db_any_state()
.expect("DB should be available in this state");
db.perform_replay().await;
handle
.advance_init()
.map_err(Box::new)
.context(InitDbError)?;
// there is still more work to do for this DB
Ok(InitProgress::Unfinished)
}
DatabaseStateCode::Initialized => {
// database fully initialized => nothing to do
Ok(InitProgress::Done)
}
DatabaseStateCode::Initialized => {
// database fully initialized => nothing to do
Ok(InitProgress::Done)
}
}
}


@ -74,9 +74,8 @@ use std::sync::Arc;
use async_trait::async_trait;
use bytes::BytesMut;
use db::load::create_preserved_catalog;
use init::InitStatus;
use observability_deps::tracing::{debug, info, warn};
use parking_lot::Mutex;
use observability_deps::tracing::{debug, error, info, warn};
use parking_lot::{Mutex, RwLockUpgradableReadGuard};
use snafu::{OptionExt, ResultExt, Snafu};
use data_types::{
@ -93,6 +92,7 @@ use generated_types::influxdata::transfer::column::v1 as pb;
use influxdb_line_protocol::ParsedLine;
use metrics::{KeyValue, MetricObserverBuilder, MetricRegistry};
use object_store::{ObjectStore, ObjectStoreApi};
use parking_lot::RwLock;
use query::{exec::Executor, DatabaseStore};
use tracker::{TaskId, TaskRegistration, TaskRegistryWithHistory, TaskTracker, TrackedFutureExt};
use write_buffer::config::WriteBufferConfig;
@ -220,11 +220,11 @@ pub enum Error {
#[snafu(display("cannot create preserved catalog: {}", source))]
CannotCreatePreservedCatalog { source: DatabaseError },
#[snafu(display("cannot set id: {}", source))]
SetIdError { source: crate::init::Error },
#[snafu(display("id already set"))]
IdAlreadySet,
#[snafu(display("cannot get id: {}", source))]
GetIdError { source: crate::init::Error },
#[snafu(display("id not set"))]
IdNotSet,
#[snafu(display(
"cannot create write buffer with config: {:?}, error: {}",
@ -297,6 +297,8 @@ pub struct ServerConfig {
metric_registry: Arc<MetricRegistry>,
remote_template: Option<RemoteTemplate>,
wipe_catalog_on_error: bool,
}
impl ServerConfig {
@ -311,6 +313,7 @@ impl ServerConfig {
object_store,
metric_registry,
remote_template,
wipe_catalog_on_error: true,
}
}
@ -414,7 +417,6 @@ impl ServerMetrics {
/// of these structs, which keeps track of all replication and query rules.
#[derive(Debug)]
pub struct Server<M: ConnectionManager> {
config: Arc<Config>,
connection_manager: Arc<M>,
pub store: Arc<ObjectStore>,
exec: Arc<Executor>,
@ -426,7 +428,50 @@ pub struct Server<M: ConnectionManager> {
/// and populates the endpoint with this data.
pub registry: Arc<metrics::MetricRegistry>,
init_status: Arc<InitStatus>,
/// The state machine for server startup
stage: Arc<RwLock<ServerStage>>,
}
/// The stage of the server in the startup process
///
/// The progression is linear: Startup -> InitReady -> Initializing -> Initialized
/// with the sole exception that on failure Initializing -> InitReady
///
/// Errors encountered on server init will be retried; however, errors encountered
/// during database init will require operator intervention
///
/// These errors are exposed via `Server::error_generic` and `Server::error_database` respectively
///
/// They do not impact the state machine's progression, but instead are exposed to the
/// gRPC management API to allow an operator to assess the state of the system
#[derive(Debug)]
enum ServerStage {
/// Server has started but doesn't have a server id yet
Startup {
remote_template: Option<RemoteTemplate>,
wipe_catalog_on_error: bool,
},
/// Server can be initialized
InitReady {
wipe_catalog_on_error: bool,
config: Arc<Config>,
last_error: Option<Arc<init::Error>>,
},
/// Server has a server id, has started loading
Initializing {
wipe_catalog_on_error: bool,
config: Arc<Config>,
last_error: Option<Arc<init::Error>>,
},
/// Server has finished initializing, possibly with errors
Initialized {
config: Arc<Config>,
/// Errors that occurred during some DB init.
database_errors: HashMap<String, Arc<init::Error>>,
},
}
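An illustrative sketch (not part of the diff) of the stage progression described in the doc comment above, written against a simplified copy of the states; in the real code these transitions are driven by `set_id` and `maybe_initialize_server`.

// Sketch only: the legal ServerStage transitions, ignoring the per-stage payloads.
#[derive(Debug, PartialEq)]
enum Stage {
    Startup,
    InitReady,
    Initializing,
    Initialized,
}

fn next(stage: Stage, init_succeeded: bool) -> Stage {
    match stage {
        Stage::Startup => Stage::InitReady,       // set_id()
        Stage::InitReady => Stage::Initializing,  // maybe_initialize_server() begins
        Stage::Initializing if init_succeeded => Stage::Initialized,
        Stage::Initializing => Stage::InitReady,  // init failed and can be retried
        Stage::Initialized => Stage::Initialized, // terminal
    }
}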
#[derive(Debug)]
@ -454,22 +499,23 @@ where
// to test the metrics provide a different registry to the `ServerConfig`.
metric_registry,
remote_template,
wipe_catalog_on_error,
} = config;
let num_worker_threads = num_worker_threads.unwrap_or_else(num_cpus::get);
let exec = Arc::new(Executor::new(num_worker_threads));
Self {
config: Arc::new(Config::new(
Arc::clone(&jobs),
Arc::clone(&metric_registry),
remote_template,
)),
store: object_store,
connection_manager: Arc::new(connection_manager),
exec: Arc::new(Executor::new(num_worker_threads)),
exec,
jobs,
metrics: Arc::new(ServerMetrics::new(Arc::clone(&metric_registry))),
registry: Arc::clone(&metric_registry),
init_status: Arc::new(InitStatus::new()),
stage: Arc::new(RwLock::new(ServerStage::Startup {
remote_template,
wipe_catalog_on_error,
})),
}
}
@ -478,68 +524,112 @@ where
///
/// A valid server ID Must be non-zero.
pub fn set_id(&self, id: ServerId) -> Result<()> {
self.init_status.server_id.set(id).context(SetIdError)
}
let mut stage = self.stage.write();
match &mut *stage {
ServerStage::Startup {
remote_template,
wipe_catalog_on_error,
} => {
let remote_template = remote_template.take();
/// Returns the current server ID, or an error if not yet set.
pub fn require_id(&self) -> Result<ServerId> {
self.init_status.server_id.get().context(GetIdError)
*stage = ServerStage::InitReady {
wipe_catalog_on_error: *wipe_catalog_on_error,
config: Arc::new(Config::new(
Arc::clone(&self.jobs),
Arc::clone(&self.store),
Arc::clone(&self.exec),
id,
Arc::clone(&self.registry),
remote_template,
)),
last_error: None,
};
Ok(())
}
_ => Err(Error::IdAlreadySet),
}
}
/// Check if server is loaded. Databases are loaded and server is ready to read/write.
pub fn initialized(&self) -> bool {
self.init_status.initialized()
matches!(&*self.stage.read(), ServerStage::Initialized { .. })
}
/// Require that server is loaded. Databases are loaded and server is ready to read/write.
fn require_initialized(&self) -> Result<Arc<Config>> {
match &*self.stage.read() {
ServerStage::Startup { .. } => Err(Error::IdNotSet),
ServerStage::InitReady { config, .. } | ServerStage::Initializing { config, .. } => {
Err(Error::ServerNotInitialized {
server_id: config.server_id(),
})
}
ServerStage::Initialized { config, .. } => Ok(Arc::clone(&config)),
}
}
/// Returns the config for this server if server id has been set
fn config(&self) -> Result<Arc<Config>> {
let stage = self.stage.read();
match &*stage {
ServerStage::Startup { .. } => Err(Error::IdNotSet),
ServerStage::InitReady { config, .. }
| ServerStage::Initializing { config, .. }
| ServerStage::Initialized { config, .. } => Ok(Arc::clone(&config)),
}
}
/// Returns the server id for this server if set
pub fn server_id(&self) -> Option<ServerId> {
self.config().map(|x| x.server_id()).ok()
}
/// Error occurred during generic server init (e.g. listing store content).
pub fn error_generic(&self) -> Option<Arc<crate::init::Error>> {
self.init_status.error_generic()
let stage = self.stage.read();
match &*stage {
ServerStage::InitReady { last_error, .. } => last_error.clone(),
ServerStage::Initializing { last_error, .. } => last_error.clone(),
_ => None,
}
}
/// List all databases with errors in sorted order.
pub fn databases_with_errors(&self) -> Vec<String> {
self.init_status.databases_with_errors()
let stage = self.stage.read();
match &*stage {
ServerStage::Initialized {
database_errors, ..
} => database_errors.keys().cloned().collect(),
_ => Default::default(),
}
}
/// Error that occurred during initialization of a specific database.
pub fn error_database(&self, db_name: &str) -> Option<Arc<crate::init::Error>> {
self.init_status.error_database(db_name)
let stage = self.stage.read();
match &*stage {
ServerStage::Initialized {
database_errors, ..
} => database_errors.get(db_name).cloned(),
_ => None,
}
}
/// Current database init state.
pub fn database_state(&self, name: &str) -> Option<DatabaseStateCode> {
if let Ok(name) = DatabaseName::new(name) {
self.config.db_state(&name)
} else {
None
}
}
/// Require that server is loaded. Databases are loaded and server is ready to read/write.
fn require_initialized(&self) -> Result<ServerId> {
// since a server ID is the pre-requirement for init, check this first
let server_id = self.require_id()?;
// ordering here isn't that important since this method is not used to check-and-modify the flag
if self.initialized() {
Ok(server_id)
} else {
Err(Error::ServerNotInitialized { server_id })
}
let db_name = DatabaseName::new(name).ok()?;
let config = self.config().ok()?;
config.db_state(&db_name)
}
/// Tells the server the set of rules for a database.
pub async fn create_database(&self, rules: DatabaseRules) -> Result<()> {
// Return an error if this server is not yet ready
let server_id = self.require_initialized()?;
let config = self.require_initialized()?;
// Reserve name before expensive IO (e.g. loading the preserved catalog)
let mut db_reservation = self.config.create_db(
Arc::clone(&self.store),
Arc::clone(&self.exec),
server_id,
rules.name.clone(),
)?;
let mut db_reservation = config.create_db(rules.name.clone())?;
// register rules
db_reservation.advance_rules_loaded(rules.clone())?;
@ -548,14 +638,14 @@ where
let (preserved_catalog, catalog) = create_preserved_catalog(
rules.db_name(),
Arc::clone(&self.store),
server_id,
self.config.metrics_registry(),
config.server_id(),
config.metrics_registry(),
)
.await
.map_err(|e| Box::new(e) as _)
.context(CannotCreatePreservedCatalog)?;
let write_buffer = WriteBufferConfig::new(server_id, &rules)
let write_buffer = WriteBufferConfig::new(config.server_id(), &rules)
.await
.map_err(|e| Error::CreatingWriteBuffer {
config: rules.write_buffer_connection.clone(),
@ -575,13 +665,8 @@ where
}
pub async fn persist_database_rules<'a>(&self, rules: DatabaseRules) -> Result<()> {
let location = object_store_path_for_database_config(
&self
.init_status
.root_path(&self.store)
.context(GetIdError)?,
&rules.name,
);
let config = self.config()?;
let location = object_store_path_for_database_config(&config.root_path(), &rules.name);
let mut data = BytesMut::new();
encode_database_rules(rules, &mut data).context(ErrorSerializingRulesProtobuf)?;
@ -604,15 +689,62 @@ where
/// object store. Any databases already in the config won't be
/// replaced.
///
/// This requires the serverID to be set. It will be a no-op if the configs are already loaded and the server is ready.
/// This requires the serverID to be set.
///
/// It will be a no-op if the configs are already loaded and the server is ready.
pub async fn maybe_initialize_server(&self) {
self.init_status
.maybe_initialize_server(
Arc::clone(&self.store),
Arc::clone(&self.config),
Arc::clone(&self.exec),
)
.await;
// Explicit scope to help async generator
let (wipe_catalog_on_error, config) = {
let state = self.stage.upgradable_read();
match &*state {
ServerStage::InitReady {
wipe_catalog_on_error,
config,
last_error,
} => {
let config = Arc::clone(config);
let last_error = last_error.clone();
let wipe_catalog_on_error = *wipe_catalog_on_error;
// Mark the server as initializing and drop lock
let mut state = RwLockUpgradableReadGuard::upgrade(state);
*state = ServerStage::Initializing {
config: Arc::clone(&config),
wipe_catalog_on_error,
last_error,
};
(wipe_catalog_on_error, config)
}
_ => return,
}
};
let init_result = init::initialize_server(Arc::clone(&config), wipe_catalog_on_error).await;
let new_stage = match init_result {
// Success -> move to next stage
Ok(results) => {
info!(server_id=%config.server_id(), "server initialized");
ServerStage::Initialized {
config,
database_errors: results
.into_iter()
.filter_map(|(name, res)| Some((name.to_string(), Arc::new(res.err()?))))
.collect(),
}
}
// Error -> return to InitReady
Err(err) => {
error!(%err, "error during server init");
ServerStage::InitReady {
wipe_catalog_on_error,
config,
last_error: Some(Arc::new(err)),
}
}
};
*self.stage.write() = new_stage;
}
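The method above relies on parking_lot's upgradable read lock: readers can keep inspecting the stage while it decides whether to transition, and the write lock is held only for the brief moment of the state change. A minimal standalone sketch of that pattern, assuming nothing beyond the parking_lot crate:

use parking_lot::{RwLock, RwLockUpgradableReadGuard};

fn bump_if_even(counter: &RwLock<u64>) {
    // Plain readers are not blocked while this upgradable read guard is held.
    let guard = counter.upgradable_read();
    if *guard % 2 == 0 {
        // Upgrade atomically to a write lock; no other writer can slip in between
        // the check above and the mutation below.
        let mut write = RwLockUpgradableReadGuard::upgrade(guard);
        *write += 1;
    }
}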
pub async fn write_pb(&self, database_batch: pb::DatabaseBatch) -> Result<()> {
@ -640,11 +772,10 @@ where
default_time: i64,
) -> Result<()> {
// Return an error if this server is not yet ready
self.require_initialized()?;
let config = self.require_initialized()?;
let db_name = DatabaseName::new(db_name).context(InvalidDatabaseName)?;
let db = self
.config
let db = config
.db_initialized(&db_name)
.context(DatabaseNotFound { db_name: &*db_name })?;
@ -744,9 +875,12 @@ where
node_group: &[ServerId],
entry: Entry,
) -> Result<()> {
// Return an error if this server is not yet ready
let config = self.config()?;
let addrs: Vec<_> = node_group
.iter()
.filter_map(|&node| self.config.resolve_remote(node))
.filter_map(|&node| config.resolve_remote(node))
.collect();
if addrs.is_empty() {
return NoRemoteConfigured { node_group }.fail();
@ -775,11 +909,10 @@ where
pub async fn write_entry(&self, db_name: &str, entry_bytes: Vec<u8>) -> Result<()> {
// Return an error if this server is not yet ready
self.require_initialized()?;
let config = self.require_initialized()?;
let db_name = DatabaseName::new(db_name).context(InvalidDatabaseName)?;
let db = self
.config
let db = config
.db_initialized(&db_name)
.context(DatabaseNotFound { db_name: &*db_name })?;
@ -825,11 +958,11 @@ where
}
pub fn db(&self, name: &DatabaseName<'_>) -> Option<Arc<Db>> {
self.config.db_initialized(name)
self.config().ok()?.db_initialized(name)
}
pub fn db_rules(&self, name: &DatabaseName<'_>) -> Option<Arc<DatabaseRules>> {
self.config.db_initialized(name).map(|d| d.rules())
self.db(name).map(|d| d.rules())
}
// Update database rules and save on success.
@ -841,8 +974,8 @@ where
where
F: FnOnce(DatabaseRules) -> Result<DatabaseRules, E> + Send,
{
let rules = self
.config
let config = self.config()?;
let rules = config
.update_db_rules(db_name, update)
.map_err(|e| match e {
crate::config::UpdateError::Closure(e) => UpdateError::Closure(e),
@ -854,16 +987,23 @@ where
Ok(rules)
}
pub fn remotes_sorted(&self) -> Vec<(ServerId, String)> {
self.config.remotes_sorted()
pub fn remotes_sorted(&self) -> Result<Vec<(ServerId, String)>> {
// TODO: Should these be on ConnectionManager and not Config
let config = self.config()?;
Ok(config.remotes_sorted())
}
pub fn update_remote(&self, id: ServerId, addr: GRpcConnectionString) {
self.config.update_remote(id, addr)
pub fn update_remote(&self, id: ServerId, addr: GRpcConnectionString) -> Result<()> {
// TODO: Should these be on ConnectionManager and not Config
let config = self.config()?;
config.update_remote(id, addr);
Ok(())
}
pub fn delete_remote(&self, id: ServerId) -> Option<GRpcConnectionString> {
self.config.delete_remote(id)
pub fn delete_remote(&self, id: ServerId) -> Result<Option<GRpcConnectionString>> {
// TODO: Should these be on ConnectionManager and not Config
let config = self.config()?;
Ok(config.delete_remote(id))
}
pub fn spawn_dummy_job(&self, nanos: Vec<u64>) -> TaskTracker<Job> {
@ -893,14 +1033,15 @@ where
partition_key: impl Into<String>,
chunk_id: u32,
) -> Result<TaskTracker<Job>> {
let config = self.require_initialized()?;
let db_name = db_name.to_string();
let name = DatabaseName::new(&db_name).context(InvalidDatabaseName)?;
let partition_key = partition_key.into();
let table_name = table_name.into();
let db = self
.config
let db = config
.db_initialized(&name)
.context(DatabaseNotFound { db_name: &db_name })?;
@ -921,25 +1062,62 @@ where
/// DB jobs and this command.
pub fn wipe_preserved_catalog(
&self,
db_name: DatabaseName<'static>,
db_name: &DatabaseName<'static>,
) -> Result<TaskTracker<Job>> {
if self.config.db_initialized(&db_name).is_some() {
return Err(Error::DatabaseAlreadyExists {
db_name: db_name.to_string(),
});
}
// Can only wipe the catalog of a database that failed to initialize
let config = match &*self.stage.read() {
ServerStage::Initialized {
config,
database_errors,
} => {
if config.db_initialized(db_name).is_some() {
return Err(Error::DatabaseAlreadyExists {
db_name: db_name.to_string(),
});
}
if !database_errors.contains_key(db_name.as_str()) {
// TODO: Should this be an error? Some end-to-end tests assume it is non-fatal
warn!(%db_name, "wiping database not present at startup");
}
Arc::clone(config)
}
ServerStage::Startup { .. } => return Err(Error::IdNotSet),
ServerStage::Initializing { config, .. } | ServerStage::InitReady { config, .. } => {
return Err(Error::ServerNotInitialized {
server_id: config.server_id(),
})
}
};
let (tracker, registration) = self.jobs.register(Job::WipePreservedCatalog {
db_name: db_name.to_string(),
});
let object_store = Arc::clone(&self.store);
let config = Arc::clone(&self.config);
let server_id = self.require_id()?;
let init_status = Arc::clone(&self.init_status);
let state = Arc::clone(&self.stage);
let db_name = db_name.clone();
let task = async move {
init_status
.wipe_preserved_catalog_and_maybe_recover(object_store, config, server_id, db_name)
.await
let result = init::wipe_preserved_catalog_and_maybe_recover(config, &db_name).await;
match &mut *state.write() {
ServerStage::Initialized {
database_errors, ..
} => match result {
Ok(_) => {
info!(%db_name, "wiped preserved catalog of registered database and recovered");
database_errors.remove(db_name.as_str());
Ok(())
}
Err(e) => {
warn!(%db_name, %e, "wiped preserved catalog of registered database but still cannot recover");
let e = Arc::new(e);
database_errors.insert(db_name.to_string(), Arc::clone(&e));
Err(e)
}
},
_ => unreachable!("server cannot become uninitialized"),
}
};
tokio::spawn(task.track(registration));
@ -973,7 +1151,9 @@ where
}
info!("shutting down background workers");
self.config.drain().await;
if let Ok(config) = self.config() {
config.drain().await;
}
info!("draining tracker registry");
@ -999,11 +1179,15 @@ where
type Error = Error;
fn db_names_sorted(&self) -> Vec<String> {
self.config
.db_names_sorted()
.iter()
.map(|i| i.clone().into())
.collect()
self.config()
.map(|config| {
config
.db_names_sorted()
.iter()
.map(ToString::to_string)
.collect()
})
.unwrap_or_default()
}
fn db(&self, name: &str) -> Option<Arc<Self::Database>> {
@ -1214,25 +1398,15 @@ mod tests {
let manager = TestConnectionManager::new();
let server = Server::new(manager, config());
let resp = server.require_id().unwrap_err();
assert!(matches!(
resp,
Error::GetIdError {
source: crate::init::Error::IdNotSet
}
));
let resp = server.config().unwrap_err();
assert!(matches!(resp, Error::IdNotSet));
let lines = parsed_lines("cpu foo=1 10");
let resp = server
.write_lines("foo", &lines, ARBITRARY_DEFAULT_TIME)
.await
.unwrap_err();
assert!(matches!(
resp,
Error::GetIdError {
source: crate::init::Error::IdNotSet
}
));
assert!(matches!(resp, Error::IdNotSet));
}
#[tokio::test]
@ -1559,7 +1733,7 @@ mod tests {
let remote_ids = vec![bad_remote_id, good_remote_id_1, good_remote_id_2];
let db = server.db(&db_name).unwrap();
db.update_db_rules(|mut rules| {
db.update_rules(|mut rules| {
let shard_config = ShardConfig {
hash_ring: Some(HashRing {
shards: vec![TEST_SHARD_ID].into(),
@ -1589,7 +1763,9 @@ mod tests {
);
// one remote is configured but it's down and we'll get connection error
server.update_remote(bad_remote_id, BAD_REMOTE_ADDR.into());
server
.update_remote(bad_remote_id, BAD_REMOTE_ADDR.into())
.unwrap();
let err = server
.write_lines(&db_name, &lines, ARBITRARY_DEFAULT_TIME)
.await
@ -1606,8 +1782,12 @@ mod tests {
// We configure the address for the other remote, this time connection will succeed
// despite the bad remote failing to connect.
server.update_remote(good_remote_id_1, GOOD_REMOTE_ADDR_1.into());
server.update_remote(good_remote_id_2, GOOD_REMOTE_ADDR_2.into());
server
.update_remote(good_remote_id_1, GOOD_REMOTE_ADDR_1.into())
.unwrap();
server
.update_remote(good_remote_id_2, GOOD_REMOTE_ADDR_2.into())
.unwrap();
// Remotes are tried in random order, so we need to repeat the test a few times to have a reasonable
// probability both the remotes will get hit.
@ -1796,7 +1976,7 @@ mod tests {
let db_name = DatabaseName::new("foo").unwrap();
let db = server.db(&db_name).unwrap();
let rules = db
.update_db_rules(|mut rules| {
.update_rules(|mut rules| {
rules.lifecycle_rules.buffer_size_hard =
Some(std::num::NonZeroUsize::new(10).unwrap());
Ok::<_, Infallible>(rules)
@ -1844,12 +2024,7 @@ mod tests {
let err = create_simple_database(&server, "bananas")
.await
.unwrap_err();
assert!(matches!(
err,
Error::GetIdError {
source: crate::init::Error::IdNotSet
}
));
assert!(matches!(err, Error::IdNotSet));
server.set_id(ServerId::try_from(1).unwrap()).unwrap();
// do NOT call `server.maybe_load_database_configs` so DBs are not loaded and server is not ready
@ -1873,7 +2048,7 @@ mod tests {
let t_0 = Instant::now();
loop {
if server.require_initialized().is_ok() {
if server.config().is_ok() {
break;
}
assert!(t_0.elapsed() < Duration::from_secs(10));
@ -1916,9 +2091,12 @@ mod tests {
create_simple_database(&server, "foo")
.await
.expect("failed to create database");
let root = server.init_status.root_path(&store).unwrap();
server.config.drain().await;
let config = server.require_initialized().unwrap();
let root = config.root_path();
config.drain().await;
drop(server);
drop(config);
// tamper store
let path = object_store_path_for_database_config(&root, &DatabaseName::new("bar").unwrap());
@ -2003,18 +2181,24 @@ mod tests {
let server = Server::new(manager, config);
server.set_id(server_id).unwrap();
server.maybe_initialize_server().await;
create_simple_database(&server, db_name_existing.clone())
.await
.expect("failed to create database");
create_simple_database(&server, db_name_rules_broken.clone())
.await
.expect("failed to create database");
create_simple_database(&server, db_name_catalog_broken.clone())
.await
.expect("failed to create database");
let root = server.init_status.root_path(&store).unwrap();
server.config.drain().await;
let config = server.require_initialized().unwrap();
let root = config.root_path();
config.drain().await;
drop(server);
drop(config);
// tamper store to break one database
let path = object_store_path_for_database_config(&root, &db_name_rules_broken);
@ -2045,22 +2229,18 @@ mod tests {
let store = Arc::try_unwrap(store).unwrap();
store.get(&path).await.unwrap();
let manager = TestConnectionManager::new();
let config = config_with_store(store);
let server = Server::new(manager, config);
// need to disable auto-wipe for this test
server
.init_status
.wipe_on_error
.store(false, std::sync::atomic::Ordering::Relaxed);
let mut config = config_with_store(store);
config.wipe_catalog_on_error = false;
let server = Server::new(manager, config);
// cannot wipe if server ID is not set
assert_eq!(
server
.wipe_preserved_catalog(db_name_non_existing.clone())
.wipe_preserved_catalog(&db_name_non_existing)
.unwrap_err()
.to_string(),
"cannot get id: unable to use server until id is set"
"id not set"
);
server.set_id(ServerId::try_from(1).unwrap()).unwrap();
@ -2069,31 +2249,29 @@ mod tests {
// 1. cannot wipe if DB exists
assert_eq!(
server
.wipe_preserved_catalog(db_name_existing.clone())
.wipe_preserved_catalog(&db_name_existing)
.unwrap_err()
.to_string(),
"database already exists: db_existing"
);
assert!(PreservedCatalog::exists(
&server.store,
server.require_id().unwrap(),
&db_name_existing.to_string()
)
.await
.unwrap());
assert!(
PreservedCatalog::exists(&server.store, server_id, db_name_existing.as_str())
.await
.unwrap()
);
// 2. wiping a non-existing DB just works, but won't bring DB into existence
assert!(server.error_database(&db_name_non_existing).is_none());
PreservedCatalog::new_empty::<TestCatalogState>(
Arc::clone(&server.store),
server.require_id().unwrap(),
server_id,
db_name_non_existing.to_string(),
(),
)
.await
.unwrap();
let tracker = server
.wipe_preserved_catalog(db_name_non_existing.clone())
.wipe_preserved_catalog(&db_name_non_existing)
.unwrap();
let metadata = tracker.metadata();
let expected_metadata = Job::WipePreservedCatalog {
@ -2103,7 +2281,7 @@ mod tests {
tracker.join().await;
assert!(!PreservedCatalog::exists(
&server.store,
server.require_id().unwrap(),
server_id,
&db_name_non_existing.to_string()
)
.await
@ -2114,7 +2292,7 @@ mod tests {
// 3. wipe DB with broken rules file, this won't bring DB back to life
assert!(server.error_database(&db_name_rules_broken).is_some());
let tracker = server
.wipe_preserved_catalog(db_name_rules_broken.clone())
.wipe_preserved_catalog(&db_name_rules_broken)
.unwrap();
let metadata = tracker.metadata();
let expected_metadata = Job::WipePreservedCatalog {
@ -2124,7 +2302,7 @@ mod tests {
tracker.join().await;
assert!(!PreservedCatalog::exists(
&server.store,
server.require_id().unwrap(),
server_id,
&db_name_rules_broken.to_string()
)
.await
@ -2135,7 +2313,7 @@ mod tests {
// 4. wipe DB with broken catalog, this will bring the DB back to life
assert!(server.error_database(&db_name_catalog_broken).is_some());
let tracker = server
.wipe_preserved_catalog(db_name_catalog_broken.clone())
.wipe_preserved_catalog(&db_name_catalog_broken)
.unwrap();
let metadata = tracker.metadata();
let expected_metadata = Job::WipePreservedCatalog {
@ -2145,7 +2323,7 @@ mod tests {
tracker.join().await;
assert!(PreservedCatalog::exists(
&server.store,
server.require_id().unwrap(),
server_id,
&db_name_catalog_broken.to_string()
)
.await
@ -2166,18 +2344,16 @@ mod tests {
.unwrap();
assert_eq!(
server
.wipe_preserved_catalog(db_name_created.clone())
.wipe_preserved_catalog(&db_name_created)
.unwrap_err()
.to_string(),
"database already exists: db_created"
);
assert!(PreservedCatalog::exists(
&server.store,
server.require_id().unwrap(),
&db_name_created.to_string()
)
.await
.unwrap());
assert!(
PreservedCatalog::exists(&server.store, server_id, &db_name_created.to_string())
.await
.unwrap()
);
}
#[tokio::test]


@ -119,6 +119,11 @@ struct Create {
/// Maximum number of rows to buffer in a MUB chunk before compacting it
#[structopt(long, default_value = "100000")]
mub_row_threshold: u64,
/// Use up to this amount of space in bytes for caching Parquet files. A
/// value of zero disables Parquet file caching.
#[structopt(long, default_value = "0")]
parquet_cache_limit: u64,
}
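The CLI keeps the limit as a plain u64 where 0 means "disabled", while the in-memory lifecycle rules model it as Option<NonZeroU64> with None disabling the cache. A hedged sketch of that mapping (the helper name is hypothetical; the actual conversion lives in the rules conversion code, not in this file):

use std::num::NonZeroU64;

// Hypothetical helper: a zero sentinel from the CLI/protobuf side becomes None.
fn parquet_cache_limit_from_flag(raw: u64) -> Option<NonZeroU64> {
    NonZeroU64::new(raw)
}

// parquet_cache_limit_from_flag(0) == None                         (caching disabled)
// parquet_cache_limit_from_flag(10 * 1024 * 1024) == NonZeroU64::new(10 * 1024 * 1024)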
/// Get list of databases
@ -193,6 +198,7 @@ pub async fn command(url: String, config: Config) -> Result<()> {
persist_row_threshold: command.persist_row_threshold,
persist_age_threshold_seconds: command.persist_age_threshold_seconds,
mub_row_threshold: command.mub_row_threshold,
parquet_cache_limit: command.parquet_cache_limit,
}),
// Default to hourly partitions


@ -231,6 +231,14 @@ Possible values (case insensitive):
default_value = "serving"
)]
pub initial_serving_state: ServingReadinessState,
/// Maximum size of HTTP requests.
#[structopt(
long = "--max-http-request-size",
env = "INFLUXDB_IOX_MAX_HTTP_REQUEST_SIZE",
default_value = "10485760" // 10 MiB
)]
pub max_http_request_size: usize,
}
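For reference, the default of 10485760 bytes is 10 * 1024 * 1024 (10 MiB), the same value as the MAX_SIZE constant that was hard-coded in the HTTP module shown further down in this diff.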
pub async fn command(config: Config) -> Result<()> {


@ -195,7 +195,15 @@ pub async fn main(config: Config) -> Result<()> {
let bind_addr = config.http_bind_address;
let addr = AddrIncoming::bind(&bind_addr).context(StartListeningHttp { bind_addr })?;
let http_server = http::serve(addr, Arc::clone(&app_server), frontend_shutdown.clone()).fuse();
let max_http_request_size = config.max_http_request_size;
let http_server = http::serve(
addr,
Arc::clone(&app_server),
frontend_shutdown.clone(),
max_http_request_size,
)
.fuse();
info!(bind_address=?bind_addr, "HTTP server listening");
info!(git_hash, "InfluxDB IOx server ready");


@ -342,12 +342,26 @@ impl ApplicationError {
}
}
const MAX_SIZE: usize = 10_485_760; // max write request size of 10MB
fn router<M>(server: Arc<AppServer<M>>) -> Router<Body, ApplicationError>
struct Server<M>
where
M: ConnectionManager + Send + Sync + Debug + 'static,
{
app_server: Arc<AppServer<M>>,
max_request_size: usize,
}
fn router<M>(
app_server: Arc<AppServer<M>>,
max_request_size: usize,
) -> Router<Body, ApplicationError>
where
M: ConnectionManager + Send + Sync + Debug + 'static,
{
let server = Server {
app_server,
max_request_size,
};
// Create a router and specify the the handlers.
Router::builder()
.data(server)
@ -408,7 +422,7 @@ struct WriteInfo {
/// Parse the request's body into raw bytes, applying size limits and
/// content encoding as needed.
async fn parse_body(req: hyper::Request<Body>) -> Result<Bytes, ApplicationError> {
async fn parse_body(req: hyper::Request<Body>, max_size: usize) -> Result<Bytes, ApplicationError> {
// clippy says the const needs to be assigned to a local variable:
// error: a `const` item with interior mutability should not be borrowed
let header_name = CONTENT_ENCODING;
@ -431,9 +445,9 @@ async fn parse_body(req: hyper::Request<Body>) -> Result<Bytes, ApplicationError
while let Some(chunk) = payload.next().await {
let chunk = chunk.context(ClientHangup)?;
// limit max size of in-memory payload
if (body.len() + chunk.len()) > MAX_SIZE {
if (body.len() + chunk.len()) > max_size {
return Err(ApplicationError::RequestSizeExceeded {
max_body_size: MAX_SIZE,
max_body_size: max_size,
});
}
body.extend_from_slice(&chunk);
@ -445,9 +459,9 @@ async fn parse_body(req: hyper::Request<Body>) -> Result<Bytes, ApplicationError
use std::io::Read;
let decoder = flate2::read::GzDecoder::new(&body[..]);
// Read at most MAX_SIZE bytes to prevent a decompression bomb based
// Read at most max_size bytes to prevent a decompression bomb based
// DoS.
let mut decoder = decoder.take(MAX_SIZE as u64);
let mut decoder = decoder.take(max_size as u64);
let mut decoded_data = Vec::new();
decoder
.read_to_end(&mut decoded_data)
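A standalone sketch of the capping pattern used above, assuming only the flate2 crate (not taken from this diff): Read::take bounds how many decoded bytes can ever be produced, so a tiny compressed body cannot balloon into an unbounded allocation. A production version would additionally detect whether the cap was actually hit.

use std::io::Read;

fn decode_gzip_capped(compressed: &[u8], max_size: usize) -> std::io::Result<Vec<u8>> {
    let decoder = flate2::read::GzDecoder::new(compressed);
    // Never yield more than max_size decoded bytes, whatever the compressed size claims.
    let mut decoder = decoder.take(max_size as u64);
    let mut decoded = Vec::new();
    decoder.read_to_end(&mut decoded)?;
    Ok(decoded)
}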
@ -464,7 +478,12 @@ where
M: ConnectionManager + Send + Sync + Debug + 'static,
{
let path = req.uri().path().to_string();
let server = Arc::clone(&req.data::<Arc<AppServer<M>>>().expect("server state"));
let Server {
app_server: server,
max_request_size,
} = req.data::<Server<M>>().expect("server state");
let max_request_size = *max_request_size;
let server = Arc::clone(&server);
// TODO(edd): figure out best way of catching all errors in this observation.
let obs = server.metrics.http_requests.observation(); // instrument request
@ -481,7 +500,7 @@ where
let db_name = org_and_bucket_to_database(&write_info.org, &write_info.bucket)
.context(BucketMappingError)?;
let body = parse_body(req).await?;
let body = parse_body(req, max_request_size).await?;
let body = str::from_utf8(&body).context(ReadingBodyAsUtf8)?;
@ -595,7 +614,7 @@ async fn query<M: ConnectionManager + Send + Sync + Debug + 'static>(
req: Request<Body>,
) -> Result<Response<Body>, ApplicationError> {
let path = req.uri().path().to_string();
let server = Arc::clone(&req.data::<Arc<AppServer<M>>>().expect("server state"));
let server = Arc::clone(&req.data::<Server<M>>().expect("server state").app_server);
// TODO(edd): figure out best way of catching all errors in this observation.
let obs = server.metrics.http_requests.observation(); // instrument request
@ -661,7 +680,7 @@ async fn query<M: ConnectionManager + Send + Sync + Debug + 'static>(
async fn health<M: ConnectionManager + Send + Sync + Debug + 'static>(
req: Request<Body>,
) -> Result<Response<Body>, ApplicationError> {
let server = Arc::clone(&req.data::<Arc<AppServer<M>>>().expect("server state"));
let server = Arc::clone(&req.data::<Server<M>>().expect("server state").app_server);
let path = req.uri().path().to_string();
server
.metrics
@ -677,7 +696,7 @@ async fn health<M: ConnectionManager + Send + Sync + Debug + 'static>(
async fn handle_metrics<M: ConnectionManager + Send + Sync + Debug + 'static>(
req: Request<Body>,
) -> Result<Response<Body>, ApplicationError> {
let server = Arc::clone(&req.data::<Arc<AppServer<M>>>().expect("server state"));
let server = Arc::clone(&req.data::<Server<M>>().expect("server state").app_server);
let path = req.uri().path().to_string();
server
.metrics
@ -700,7 +719,7 @@ async fn list_partitions<M: ConnectionManager + Send + Sync + Debug + 'static>(
) -> Result<Response<Body>, ApplicationError> {
let path = req.uri().path().to_string();
let server = Arc::clone(&req.data::<Arc<AppServer<M>>>().expect("server state"));
let server = Arc::clone(&req.data::<Server<M>>().expect("server state").app_server);
// TODO - catch error conditions
let obs = server.metrics.http_requests.observation();
@ -841,11 +860,12 @@ pub async fn serve<M>(
addr: AddrIncoming,
server: Arc<AppServer<M>>,
shutdown: CancellationToken,
max_request_size: usize,
) -> Result<(), hyper::Error>
where
M: ConnectionManager + Send + Sync + Debug + 'static,
{
let router = router(server);
let router = router(server, max_request_size);
let service = RouterService::new(router).unwrap();
hyper::Server::builder(addr)
@ -1234,6 +1254,8 @@ mod tests {
.await;
}
const TEST_MAX_REQUEST_SIZE: usize = 1024 * 1024;
#[tokio::test]
async fn client_hangup_during_parse() {
#[derive(Debug, Snafu)]
@ -1253,7 +1275,9 @@ mod tests {
.body(body)
.unwrap();
let parse_result = parse_body(request).await.unwrap_err();
let parse_result = parse_body(request, TEST_MAX_REQUEST_SIZE)
.await
.unwrap_err();
assert_eq!(
parse_result.to_string(),
"Client hung up while sending body: error reading a body from connection: Blarg Error"
@ -1334,7 +1358,12 @@ mod tests {
let addr = AddrIncoming::bind(&bind_addr).expect("failed to bind server");
let server_url = format!("http://{}", addr.local_addr());
tokio::task::spawn(serve(addr, server, CancellationToken::new()));
tokio::task::spawn(serve(
addr,
server,
CancellationToken::new(),
TEST_MAX_REQUEST_SIZE,
));
println!("Started server at {}", server_url);
server_url
}


@ -8,7 +8,7 @@ pub fn default_server_error_handler(error: server::Error) -> tonic::Status {
use server::Error;
match error {
Error::GetIdError { .. } => PreconditionViolation {
Error::IdNotSet => PreconditionViolation {
category: "Writer ID".to_string(),
subject: "influxdata.com/iox".to_string(),
description: "Writer ID must be set".to_string(),


@ -56,7 +56,7 @@ where
&self,
_: Request<GetServerIdRequest>,
) -> Result<Response<GetServerIdResponse>, Status> {
match self.server.require_id().ok() {
match self.server.server_id() {
Some(id) => Ok(Response::new(GetServerIdResponse { id: id.get_u32() })),
None => return Err(NotFound::default().into()),
}
@ -71,7 +71,7 @@ where
match self.server.set_id(id) {
Ok(_) => Ok(Response::new(UpdateServerIdResponse {})),
Err(e @ Error::SetIdError { .. }) => {
Err(e @ Error::IdAlreadySet) => {
return Err(FieldViolation {
field: "id".to_string(),
description: e.to_string(),
@ -199,15 +199,18 @@ where
&self,
_: Request<ListRemotesRequest>,
) -> Result<Response<ListRemotesResponse>, Status> {
let remotes = self
.server
.remotes_sorted()
.into_iter()
.map(|(id, connection_string)| Remote {
id: id.get_u32(),
connection_string,
})
.collect();
let result = self.server.remotes_sorted();
let remotes = match result {
Ok(remotes) => remotes
.into_iter()
.map(|(id, connection_string)| Remote {
id: id.get_u32(),
connection_string,
})
.collect(),
Err(e) => return Err(default_server_error_handler(e)),
};
Ok(Response::new(ListRemotesResponse { remotes }))
}
@ -221,8 +224,16 @@ where
.ok_or_else(|| FieldViolation::required("remote"))?;
let remote_id = ServerId::try_from(remote.id)
.map_err(|_| FieldViolation::required("id").scope("remote"))?;
self.server
let result = self
.server
.update_remote(remote_id, remote.connection_string);
match result {
Ok(_) => {}
Err(e) => return Err(default_server_error_handler(e)),
}
Ok(Response::new(UpdateRemoteResponse {}))
}
@ -233,9 +244,12 @@ where
let request = request.into_inner();
let remote_id =
ServerId::try_from(request.id).map_err(|_| FieldViolation::required("id"))?;
self.server
.delete_remote(remote_id)
.ok_or_else(NotFound::default)?;
match self.server.delete_remote(remote_id) {
Ok(Some(_)) => {}
Ok(None) => return Err(NotFound::default().into()),
Err(e) => return Err(default_server_error_handler(e)),
}
Ok(Response::new(DeleteRemoteResponse {}))
}
@ -455,7 +469,7 @@ where
let tracker = self
.server
.wipe_preserved_catalog(db_name)
.wipe_preserved_catalog(&db_name)
.map_err(|e| match e {
Error::DatabaseAlreadyExists { db_name } => AlreadyExists {
resource_type: "database".to_string(),


@ -65,6 +65,8 @@ async fn test_list_update_remotes() {
const TEST_REMOTE_ADDR_2: &str = "4.3.2.1:4321";
const TEST_REMOTE_ADDR_2_UPDATED: &str = "40.30.20.10:4321";
client.update_server_id(123).await.unwrap();
let res = client.list_remotes().await.expect("list remotes failed");
assert_eq!(res.len(), 0);


@ -244,6 +244,18 @@ async fn test_list_chunks_error() {
async fn test_remotes() {
let server_fixture = ServerFixture::create_single_use().await;
let addr = server_fixture.grpc_base();
Command::cargo_bin("influxdb_iox")
.unwrap()
.arg("server")
.arg("set")
.arg("32")
.arg("--host")
.arg(addr)
.assert()
.success()
.stdout(predicate::str::contains("Ok"));
Command::cargo_bin("influxdb_iox")
.unwrap()
.arg("server")


@ -49,16 +49,43 @@ async fn test_chunk_is_persisted_automatically() {
assert_eq!(chunks[0].row_count, 1_000);
}
async fn write_data(
write_client: &mut influxdb_iox_client::write::Client,
db_name: &str,
num_payloads: u64,
num_duplicates: u64,
payload_size: u64,
) {
let payloads: Vec<_> = (0..num_payloads)
.map(|x| {
(0..payload_size)
.map(|i| format!("data,tag{}=val{} x={} {}", x, i, i * 10, i))
.join("\n")
})
.collect();
for payload in &payloads {
// Writing the same data multiple times should be compacted away
for _ in 0..=num_duplicates {
let num_lines_written = write_client
.write(db_name, payload)
.await
.expect("successful write");
assert_eq!(num_lines_written, payload_size as usize);
}
}
}
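Worked through with the values used by test_full_lifecycle below (num_payloads = 10, num_duplicates = 1, payload_size = 1_000): each payload is written 1 + num_duplicates = 2 times, so 10 * 2 * 1_000 = 20_000 lines go in, but compaction should leave only the 10 * 1_000 = 10_000 distinct rows in the persisted chunk.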
#[tokio::test]
async fn test_full_lifecycle() {
let fixture = ServerFixture::create_shared().await;
let mut write_client = fixture.write_client();
let num_payloads = 10;
let num_duplicates = 2;
let num_duplicates = 1;
let payload_size = 1_000;
let total_rows = num_payloads * num_duplicates * payload_size;
let total_rows = num_payloads * (1 + num_duplicates) * payload_size;
let db_name = rand_name();
DatabaseBuilder::new(db_name.clone())
@ -73,24 +100,14 @@ async fn test_full_lifecycle() {
.build(fixture.grpc_channel())
.await;
let payloads: Vec<_> = (0..num_payloads)
.map(|x| {
(0..payload_size)
.map(|i| format!("data,tag{}=val{} x={} {}", x, i, i * 10, i))
.join("\n")
})
.collect();
for payload in &payloads {
// Writing the same data multiple times should be compacted away
for _ in 0..num_duplicates {
let num_lines_written = write_client
.write(&db_name, payload)
.await
.expect("successful write");
assert_eq!(num_lines_written, payload_size as usize);
}
}
write_data(
&mut write_client,
&db_name,
num_payloads,
num_duplicates,
payload_size,
)
.await;
wait_for_exact_chunk_states(
&fixture,
@ -123,6 +140,58 @@ async fn test_full_lifecycle() {
assert_eq!(chunks[0].row_count, (num_payloads * payload_size) as usize)
}
#[tokio::test]
async fn test_update_late_arrival() {
let fixture = ServerFixture::create_shared().await;
let mut write_client = fixture.write_client();
let payload_size = 100;
let db_name = rand_name();
DatabaseBuilder::new(db_name.clone())
.persist(true)
// Don't close MUB automatically
.mub_row_threshold(payload_size * 2)
.persist_row_threshold(payload_size)
.persist_age_threshold_seconds(1000)
// Initially set to a large value
.late_arrive_window_seconds(1000)
.build(fixture.grpc_channel())
.await;
write_data(&mut write_client, &db_name, 1, 0, payload_size).await;
let mut management = fixture.management_client();
let chunks = management.list_chunks(&db_name).await.unwrap();
assert_eq!(chunks.len(), 1);
assert_eq!(
chunks[0].storage,
influxdb_iox_client::management::generated_types::ChunkStorage::OpenMutableBuffer as i32
);
let mut rules = management.get_database(&db_name).await.unwrap();
rules
.lifecycle_rules
.as_mut()
.unwrap()
.late_arrive_window_seconds = 1;
fixture
.management_client()
.update_database(rules)
.await
.unwrap();
wait_for_exact_chunk_states(
&fixture,
&db_name,
vec![ChunkStorage::ReadBufferAndObjectStore],
std::time::Duration::from_secs(5),
)
.await;
}
#[tokio::test]
async fn test_query_chunk_after_restart() {
// fixtures

View File

@ -13,7 +13,7 @@ use crate::{
#[derive(Debug)]
pub enum WriteBufferConfig {
Writing(Arc<dyn WriteBufferWriting>),
Reading(Arc<dyn WriteBufferReading>),
Reading(Arc<tokio::sync::Mutex<Box<dyn WriteBufferReading>>>),
}
impl WriteBufferConfig {
@ -36,7 +36,9 @@ impl WriteBufferConfig {
Some(WriteBufferConnection::Reading(conn)) => {
let kafka_buffer = KafkaBufferConsumer::new(conn, server_id, name).await?;
Ok(Some(Self::Reading(Arc::new(kafka_buffer) as _)))
Ok(Some(Self::Reading(Arc::new(tokio::sync::Mutex::new(
Box::new(kafka_buffer) as _,
)))))
}
None => Ok(None),
}
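// Illustrative sketch (editor's assumption, not code from this change): the reading side
// is now behind a `tokio::sync::Mutex` because `streams()` and `seek()` take `&mut self`,
// so a consumer task has to lock it first. Assumes the `WriteBufferReading` trait is in
// scope for the method call.
async fn example_consume(config: &WriteBufferConfig) {
    if let WriteBufferConfig::Reading(reading) = config {
        // The guard must be held for as long as the returned streams are used.
        let mut write_buffer = reading.lock().await;
        let _streams = write_buffer.streams();
        // ... hand the streams to the ingest loop while the guard is held ...
    }
}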

View File

@ -1,6 +1,8 @@
use std::fmt::Debug;
use async_trait::async_trait;
use entry::{Entry, Sequence, SequencedEntry};
use futures::stream::BoxStream;
use futures::{future::BoxFuture, stream::BoxStream};
/// Generic boxed error type that is used in this crate.
///
@ -10,7 +12,7 @@ pub type WriteBufferError = Box<dyn std::error::Error + Sync + Send>;
/// Writing to a Write Buffer takes an [`Entry`] and returns [`Sequence`] data that facilitates reading
/// entries from the Write Buffer at a later time.
#[async_trait]
pub trait WriteBufferWriting: Sync + Send + std::fmt::Debug + 'static {
pub trait WriteBufferWriting: Sync + Send + Debug + 'static {
/// Send an `Entry` to the write buffer using the specified sequencer ID.
///
/// Returns information that can be used to restore entries at a later time.
@ -21,16 +23,47 @@ pub trait WriteBufferWriting: Sync + Send + std::fmt::Debug + 'static {
) -> Result<Sequence, WriteBufferError>;
}
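// Illustrative sketch (editor's assumption, not code from this change): how a producer
// side might use `WriteBufferWriting` and the returned `Sequence`; assumes some `writer`
// implementing the trait and a pre-built `Entry`.
async fn example_store(
    writer: &impl WriteBufferWriting,
    entry: &Entry,
) -> Result<Sequence, WriteBufferError> {
    // Sequencer ID 0 is just an example; `Sequence.id` is the sequencer the entry landed
    // on and `Sequence.number` the offset a reader can later `seek` to.
    writer.store_entry(entry, 0).await
}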
pub type FetchHighWatermarkFut<'a> = BoxFuture<'a, Result<u64, WriteBufferError>>;
pub type FetchHighWatermark<'a> = Box<dyn (Fn() -> FetchHighWatermarkFut<'a>) + Send + Sync>;
/// Output stream of [`WriteBufferReading`].
pub type EntryStream<'a> = BoxStream<'a, Result<SequencedEntry, WriteBufferError>>;
pub struct EntryStream<'a> {
/// Stream that produces entries.
pub stream: BoxStream<'a, Result<SequencedEntry, WriteBufferError>>,
/// Get high watermark (= what we believe is the next sequence number to be added).
///
/// Can be used to calculate lag. Note that since the watermark is the "next sequence number to be added", it
/// starts at 0; after the entry with sequence number 0 is added to the buffer, it is 1.
pub fetch_high_watermark: FetchHighWatermark<'a>,
}
impl<'a> Debug for EntryStream<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("EntryStream").finish_non_exhaustive()
}
}
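// Illustrative sketch (editor's assumption, not code from this change): using
// `fetch_high_watermark` to compute consumer lag for one sequencer, given the sequence
// number of the last entry that was processed (if any).
async fn example_lag(
    entry_stream: &EntryStream<'_>,
    last_processed: Option<u64>,
) -> Result<u64, WriteBufferError> {
    // The watermark is the next sequence number to be added, so a fully caught-up
    // consumer has processed everything up to `watermark - 1`.
    let watermark = (entry_stream.fetch_high_watermark)().await?;
    let next_to_process = last_processed.map(|n| n + 1).unwrap_or(0);
    Ok(watermark.saturating_sub(next_to_process))
}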
/// Produce streams (one per sequencer) of [`SequencedEntry`]s.
pub trait WriteBufferReading: Sync + Send + std::fmt::Debug + 'static {
#[async_trait]
pub trait WriteBufferReading: Sync + Send + Debug + 'static {
/// Returns a stream per sequencer.
fn streams<'life0, 'async_trait>(&'life0 self) -> Vec<(u32, EntryStream<'async_trait>)>
where
'life0: 'async_trait,
Self: 'async_trait;
///
/// Note that due to the mutable borrow, it is not possible to have multiple streams from the same
/// [`WriteBufferReading`] instance at the same time. If all streams are dropped and requested again, the last
/// offsets of the old streams will be the start offsets for the new streams. To prevent that, either create a new
/// [`WriteBufferReading`] or use [`seek`](Self::seek).
fn streams(&mut self) -> Vec<(u32, EntryStream<'_>)>;
/// Seek the given sequencer to the given sequence number. The next output of the related stream will be an entry
/// with at least the given sequence number (the exact sequence number might be skipped due to "holes" in the stream).
///
/// Note that due to the mutable borrow, it is not possible to seek while streams exist.
async fn seek(
&mut self,
sequencer_id: u32,
sequence_number: u64,
) -> Result<(), WriteBufferError>;
}
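// Illustrative sketch (editor's assumption, not code from this change): replaying a
// single sequencer from a known offset. Assumes a concrete reader such as
// `KafkaBufferConsumer` or `MockBufferForReading` and that `futures::StreamExt` is in
// scope for `.next()`.
async fn example_replay(
    reader: &mut impl WriteBufferReading,
    sequencer_id: u32,
    start: u64,
) -> Result<(), WriteBufferError> {
    // Seeking is only possible while no streams are borrowed from `reader`.
    reader.seek(sequencer_id, start).await?;
    for (id, mut entry_stream) in reader.streams() {
        if id != sequencer_id {
            continue;
        }
        // Consume a single entry; a real consumer would loop until caught up.
        if let Some(sequenced_entry) = entry_stream.stream.next().await {
            let _entry = sequenced_entry?;
        }
    }
    Ok(())
}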
pub mod test_utils {
@ -65,6 +98,8 @@ pub mod test_utils {
test_multi_stream_io(&adapter).await;
test_multi_sequencer_io(&adapter).await;
test_multi_writer_multi_reader(&adapter).await;
test_seek(&adapter).await;
test_watermark(&adapter).await;
}
async fn test_single_stream_io<T>(adapter: &T)
@ -78,7 +113,7 @@ pub mod test_utils {
let entry_3 = lp_to_entry("upc user=3 300");
let writer = context.writing();
let reader = context.reading().await;
let mut reader = context.reading().await;
let mut streams = reader.streams();
assert_eq!(streams.len(), 1);
@ -88,67 +123,32 @@ pub mod test_utils {
let mut cx = futures::task::Context::from_waker(&waker);
// empty stream is pending
assert!(stream.poll_next_unpin(&mut cx).is_pending());
assert!(stream.stream.poll_next_unpin(&mut cx).is_pending());
// adding content allows us to get results
writer.store_entry(&entry_1, sequencer_id).await.unwrap();
assert_eq!(stream.next().await.unwrap().unwrap().entry(), &entry_1);
assert_eq!(
stream.stream.next().await.unwrap().unwrap().entry(),
&entry_1
);
// stream is pending again
assert!(stream.poll_next_unpin(&mut cx).is_pending());
assert!(stream.stream.poll_next_unpin(&mut cx).is_pending());
// adding more data unblocks the stream
writer.store_entry(&entry_2, sequencer_id).await.unwrap();
writer.store_entry(&entry_3, sequencer_id).await.unwrap();
assert_eq!(stream.next().await.unwrap().unwrap().entry(), &entry_2);
assert_eq!(stream.next().await.unwrap().unwrap().entry(), &entry_3);
assert_eq!(
stream.stream.next().await.unwrap().unwrap().entry(),
&entry_2
);
assert_eq!(
stream.stream.next().await.unwrap().unwrap().entry(),
&entry_3
);
// stream is pending again
assert!(stream.poll_next_unpin(&mut cx).is_pending());
}
async fn test_multi_sequencer_io<T>(adapter: &T)
where
T: TestAdapter,
{
let context = adapter.new_context(2).await;
let entry_1 = lp_to_entry("upc user=1 100");
let entry_2 = lp_to_entry("upc user=2 200");
let entry_3 = lp_to_entry("upc user=3 300");
let writer = context.writing();
let reader = context.reading().await;
let mut streams = reader.streams();
assert_eq!(streams.len(), 2);
let (sequencer_id_1, mut stream_1) = streams.pop().unwrap();
let (sequencer_id_2, mut stream_2) = streams.pop().unwrap();
assert_ne!(sequencer_id_1, sequencer_id_2);
let waker = futures::task::noop_waker();
let mut cx = futures::task::Context::from_waker(&waker);
// empty streams are pending
assert!(stream_1.poll_next_unpin(&mut cx).is_pending());
assert!(stream_2.poll_next_unpin(&mut cx).is_pending());
// entries arrive at the right target stream
writer.store_entry(&entry_1, sequencer_id_1).await.unwrap();
assert_eq!(stream_1.next().await.unwrap().unwrap().entry(), &entry_1);
assert!(stream_2.poll_next_unpin(&mut cx).is_pending());
writer.store_entry(&entry_2, sequencer_id_2).await.unwrap();
assert!(stream_1.poll_next_unpin(&mut cx).is_pending());
assert_eq!(stream_2.next().await.unwrap().unwrap().entry(), &entry_2);
writer.store_entry(&entry_3, sequencer_id_1).await.unwrap();
assert!(stream_2.poll_next_unpin(&mut cx).is_pending());
assert_eq!(stream_1.next().await.unwrap().unwrap().entry(), &entry_3);
// streams are pending again
assert!(stream_1.poll_next_unpin(&mut cx).is_pending());
assert!(stream_2.poll_next_unpin(&mut cx).is_pending());
assert!(stream.stream.poll_next_unpin(&mut cx).is_pending());
}
async fn test_multi_stream_io<T>(adapter: &T)
@ -162,34 +162,104 @@ pub mod test_utils {
let entry_3 = lp_to_entry("upc user=3 300");
let writer = context.writing();
let reader = context.reading().await;
let mut streams_1 = reader.streams();
let mut streams_2 = reader.streams();
assert_eq!(streams_1.len(), 1);
assert_eq!(streams_2.len(), 1);
let (sequencer_id_1, mut stream_1) = streams_1.pop().unwrap();
let (sequencer_id_2, mut stream_2) = streams_2.pop().unwrap();
assert_eq!(sequencer_id_1, sequencer_id_2);
let mut reader = context.reading().await;
let waker = futures::task::noop_waker();
let mut cx = futures::task::Context::from_waker(&waker);
// empty streams are pending
assert!(stream_1.poll_next_unpin(&mut cx).is_pending());
assert!(stream_2.poll_next_unpin(&mut cx).is_pending());
writer.store_entry(&entry_1, 0).await.unwrap();
writer.store_entry(&entry_2, 0).await.unwrap();
writer.store_entry(&entry_3, 0).await.unwrap();
// streams poll from same source
// create a stream, drop it, re-create it => still starts at the first entry
let mut streams = reader.streams();
assert_eq!(streams.len(), 1);
let (_sequencer_id, stream) = streams.pop().unwrap();
drop(stream);
drop(streams);
let mut streams = reader.streams();
assert_eq!(streams.len(), 1);
let (_sequencer_id, mut stream) = streams.pop().unwrap();
assert_eq!(
stream.stream.next().await.unwrap().unwrap().entry(),
&entry_1
);
// re-creating stream after reading remembers offset
drop(stream);
drop(streams);
let mut streams = reader.streams();
assert_eq!(streams.len(), 1);
let (_sequencer_id, mut stream) = streams.pop().unwrap();
assert_eq!(
stream.stream.next().await.unwrap().unwrap().entry(),
&entry_2
);
assert_eq!(
stream.stream.next().await.unwrap().unwrap().entry(),
&entry_3
);
// re-creating stream after reading everything makes it pending
drop(stream);
drop(streams);
let mut streams = reader.streams();
assert_eq!(streams.len(), 1);
let (_sequencer_id, mut stream) = streams.pop().unwrap();
assert!(stream.stream.poll_next_unpin(&mut cx).is_pending());
}
async fn test_multi_sequencer_io<T>(adapter: &T)
where
T: TestAdapter,
{
let context = adapter.new_context(2).await;
let entry_1 = lp_to_entry("upc user=1 100");
let entry_2 = lp_to_entry("upc user=2 200");
let entry_3 = lp_to_entry("upc user=3 300");
let writer = context.writing();
let mut reader = context.reading().await;
let mut streams = reader.streams();
assert_eq!(streams.len(), 2);
let (sequencer_id_1, mut stream_1) = streams.pop().unwrap();
let (sequencer_id_2, mut stream_2) = streams.pop().unwrap();
assert_ne!(sequencer_id_1, sequencer_id_2);
let waker = futures::task::noop_waker();
let mut cx = futures::task::Context::from_waker(&waker);
// empty streams are pending
assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending());
assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending());
// entries arrive at the right target stream
writer.store_entry(&entry_1, sequencer_id_1).await.unwrap();
writer.store_entry(&entry_2, sequencer_id_1).await.unwrap();
writer.store_entry(&entry_3, sequencer_id_1).await.unwrap();
assert_eq!(stream_1.next().await.unwrap().unwrap().entry(), &entry_1);
assert_eq!(stream_2.next().await.unwrap().unwrap().entry(), &entry_2);
assert_eq!(stream_1.next().await.unwrap().unwrap().entry(), &entry_3);
assert_eq!(
stream_1.stream.next().await.unwrap().unwrap().entry(),
&entry_1
);
assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending());
// both streams are pending again
assert!(stream_1.poll_next_unpin(&mut cx).is_pending());
assert!(stream_2.poll_next_unpin(&mut cx).is_pending());
writer.store_entry(&entry_2, sequencer_id_2).await.unwrap();
assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending());
assert_eq!(
stream_2.stream.next().await.unwrap().unwrap().entry(),
&entry_2
);
writer.store_entry(&entry_3, sequencer_id_1).await.unwrap();
assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending());
assert_eq!(
stream_1.stream.next().await.unwrap().unwrap().entry(),
&entry_3
);
// streams are pending again
assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending());
assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending());
}
async fn test_multi_writer_multi_reader<T>(adapter: &T)
@ -204,8 +274,8 @@ pub mod test_utils {
let writer_1 = context.writing();
let writer_2 = context.writing();
let reader_1 = context.reading().await;
let reader_2 = context.reading().await;
let mut reader_1 = context.reading().await;
let mut reader_2 = context.reading().await;
// TODO: do not hard-code sequencer IDs here but provide a proper interface
writer_1.store_entry(&entry_east_1, 0).await.unwrap();
@ -213,18 +283,119 @@ pub mod test_utils {
writer_2.store_entry(&entry_east_2, 0).await.unwrap();
assert_reader_content(
reader_1,
&mut reader_1,
&[(0, &[&entry_east_1, &entry_east_2]), (1, &[&entry_west_1])],
)
.await;
assert_reader_content(
reader_2,
&mut reader_2,
&[(0, &[&entry_east_1, &entry_east_2]), (1, &[&entry_west_1])],
)
.await;
}
async fn assert_reader_content<R>(reader: R, expected: &[(u32, &[&Entry])])
async fn test_seek<T>(adapter: &T)
where
T: TestAdapter,
{
let context = adapter.new_context(2).await;
let waker = futures::task::noop_waker();
let mut cx = futures::task::Context::from_waker(&waker);
let entry_east_1 = lp_to_entry("upc,region=east user=1 100");
let entry_east_2 = lp_to_entry("upc,region=east user=2 200");
let entry_east_3 = lp_to_entry("upc,region=east user=3 300");
let entry_west_1 = lp_to_entry("upc,region=west user=1 200");
let writer = context.writing();
let _sequence_number_east_1 = writer.store_entry(&entry_east_1, 0).await.unwrap().number;
let sequence_number_east_2 = writer.store_entry(&entry_east_2, 0).await.unwrap().number;
let _sequence_number_west_1 = writer.store_entry(&entry_west_1, 1).await.unwrap().number;
let mut reader_1 = context.reading().await;
let mut reader_2 = context.reading().await;
// forward seek
reader_1.seek(0, sequence_number_east_2).await.unwrap();
assert_reader_content(
&mut reader_1,
&[(0, &[&entry_east_2]), (1, &[&entry_west_1])],
)
.await;
assert_reader_content(
&mut reader_2,
&[(0, &[&entry_east_1, &entry_east_2]), (1, &[&entry_west_1])],
)
.await;
// backward seek
reader_1.seek(0, 0).await.unwrap();
assert_reader_content(
&mut reader_1,
&[(0, &[&entry_east_1, &entry_east_2]), (1, &[])],
)
.await;
// seek past the end and then add data
reader_1.seek(0, 1_000_000).await.unwrap();
let _sequence_number_east_3 = writer.store_entry(&entry_east_3, 0).await.unwrap().number;
let mut streams = reader_1.streams();
assert_eq!(streams.len(), 2);
let (_sequencer_id, mut stream_1) = streams.pop().unwrap();
let (_sequencer_id, mut stream_2) = streams.pop().unwrap();
assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending());
assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending());
drop(stream_1);
drop(stream_2);
drop(streams);
// seeking unknown sequencer is NOT an error
reader_1.seek(0, 42).await.unwrap();
}
async fn test_watermark<T>(adapter: &T)
where
T: TestAdapter,
{
let context = adapter.new_context(2).await;
let entry_east_1 = lp_to_entry("upc,region=east user=1 100");
let entry_east_2 = lp_to_entry("upc,region=east user=2 200");
let entry_west_1 = lp_to_entry("upc,region=west user=1 200");
let writer = context.writing();
let mut reader = context.reading().await;
let mut streams = reader.streams();
assert_eq!(streams.len(), 2);
let (sequencer_id_1, stream_1) = streams.pop().unwrap();
let (sequencer_id_2, stream_2) = streams.pop().unwrap();
// start at watermark 0
assert_eq!((stream_1.fetch_high_watermark)().await.unwrap(), 0);
assert_eq!((stream_2.fetch_high_watermark)().await.unwrap(), 0);
// the high watermark moves as entries are written
writer
.store_entry(&entry_east_1, sequencer_id_1)
.await
.unwrap();
let mark_1 = writer
.store_entry(&entry_east_2, sequencer_id_1)
.await
.unwrap()
.number;
let mark_2 = writer
.store_entry(&entry_west_1, sequencer_id_2)
.await
.unwrap()
.number;
assert_eq!((stream_1.fetch_high_watermark)().await.unwrap(), mark_1 + 1);
assert_eq!((stream_2.fetch_high_watermark)().await.unwrap(), mark_2 + 1);
}
async fn assert_reader_content<R>(reader: &mut R, expected: &[(u32, &[&Entry])])
where
R: WriteBufferReading,
{
@ -239,6 +410,7 @@ pub mod test_utils {
// we need to limit the stream to `expected_entries.len()` elements, otherwise it might be pending forever
let mut results: Vec<_> = actual_stream
.stream
.take(expected_entries.len())
.try_collect()
.await

View File

@ -1,22 +1,28 @@
use std::{
collections::BTreeMap,
convert::{TryFrom, TryInto},
sync::Arc,
time::Duration,
};
use async_trait::async_trait;
use data_types::server_id::ServerId;
use entry::{Entry, Sequence, SequencedEntry};
use futures::StreamExt;
use futures::{FutureExt, StreamExt};
use observability_deps::tracing::{debug, info};
use rdkafka::{
consumer::{BaseConsumer, Consumer, StreamConsumer},
error::KafkaError,
producer::{FutureProducer, FutureRecord},
types::RDKafkaErrorCode,
util::Timeout,
ClientConfig, Message, TopicPartitionList,
ClientConfig, Message, Offset, TopicPartitionList,
};
use crate::core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting};
use crate::core::{
EntryStream, FetchHighWatermark, FetchHighWatermarkFut, WriteBufferError, WriteBufferReading,
WriteBufferWriting,
};
pub struct KafkaBufferProducer {
conn: String,
@ -77,8 +83,8 @@ impl KafkaBufferProducer {
let mut cfg = ClientConfig::new();
cfg.set("bootstrap.servers", &conn);
cfg.set("message.timeout.ms", "5000");
cfg.set("message.max.bytes", "10000000");
cfg.set("queue.buffering.max.kbytes", "10485760");
cfg.set("message.max.bytes", "31457280");
cfg.set("queue.buffering.max.kbytes", "31457280");
cfg.set("request.required.acks", "all"); // equivalent to acks=-1
let producer: FutureProducer = cfg.create()?;
@ -94,7 +100,7 @@ impl KafkaBufferProducer {
pub struct KafkaBufferConsumer {
conn: String,
database_name: String,
consumers: Vec<(u32, StreamConsumer)>,
consumers: BTreeMap<u32, Arc<StreamConsumer>>,
}
// Needed because rdkafka's StreamConsumer doesn't impl Debug
@ -107,31 +113,94 @@ impl std::fmt::Debug for KafkaBufferConsumer {
}
}
#[async_trait]
impl WriteBufferReading for KafkaBufferConsumer {
fn streams<'life0, 'async_trait>(&'life0 self) -> Vec<(u32, EntryStream<'async_trait>)>
where
'life0: 'async_trait,
Self: 'async_trait,
{
self.consumers
.iter()
.map(|(sequencer_id, consumer)| {
let stream = consumer
.stream()
.map(|message| {
let message = message?;
let entry = Entry::try_from(message.payload().unwrap().to_vec())?;
let sequence = Sequence {
id: message.partition().try_into()?,
number: message.offset().try_into()?,
};
fn streams(&mut self) -> Vec<(u32, EntryStream<'_>)> {
let mut streams = vec![];
Ok(SequencedEntry::new_from_sequence(sequence, entry)?)
for (sequencer_id, consumer) in &self.consumers {
let sequencer_id = *sequencer_id;
let consumer_cloned = Arc::clone(consumer);
let database_name = self.database_name.clone();
let stream = consumer
.stream()
.map(move |message| {
let message = message?;
let entry = Entry::try_from(message.payload().unwrap().to_vec())?;
let sequence = Sequence {
id: message.partition().try_into()?,
number: message.offset().try_into()?,
};
Ok(SequencedEntry::new_from_sequence(sequence, entry)?)
})
.boxed();
let fetch_high_watermark = move || {
let consumer_cloned = Arc::clone(&consumer_cloned);
let database_name = database_name.clone();
let fut = async move {
match tokio::task::spawn_blocking(move || {
consumer_cloned.fetch_watermarks(
&database_name,
sequencer_id as i32,
Duration::from_secs(60),
)
})
.boxed();
(*sequencer_id, stream)
.await
.expect("subtask failed")
{
Ok((_low, high)) => Ok(high as u64),
Err(KafkaError::MetadataFetch(RDKafkaErrorCode::UnknownPartition)) => Ok(0),
Err(e) => Err(Box::new(e) as Box<dyn std::error::Error + Send + Sync>),
}
};
fut.boxed() as FetchHighWatermarkFut<'_>
};
let fetch_high_watermark = Box::new(fetch_high_watermark) as FetchHighWatermark<'_>;
streams.push((
sequencer_id,
EntryStream {
stream,
fetch_high_watermark,
},
));
}
streams
}
async fn seek(
&mut self,
sequencer_id: u32,
sequence_number: u64,
) -> Result<(), WriteBufferError> {
if let Some(consumer) = self.consumers.get(&sequencer_id) {
let consumer = Arc::clone(consumer);
let database_name = self.database_name.clone();
let offset = if sequence_number > 0 {
Offset::Offset(sequence_number as i64)
} else {
Offset::Beginning
};
tokio::task::spawn_blocking(move || {
consumer.seek(
&database_name,
sequencer_id as i32,
offset,
Duration::from_secs(60),
)
})
.collect()
.await
.expect("subtask failed")?;
}
Ok(())
}
}
@ -169,11 +238,21 @@ impl KafkaBufferConsumer {
let mut assignment = TopicPartitionList::new();
assignment.add_partition(&database_name, partition as i32);
consumer.assign(&assignment)?;
Ok((partition, consumer))
// We must set the offset to `Beginning` here to avoid the following error during seek:
// KafkaError (Seek error: Local: Erroneous state)
//
// Also see:
// - https://github.com/Blizzard/node-rdkafka/issues/237
// - https://github.com/confluentinc/confluent-kafka-go/issues/121#issuecomment-362308376
assignment
.set_partition_offset(&database_name, partition as i32, Offset::Beginning)
.expect("partition was set just before");
consumer.assign(&assignment)?;
Ok((partition, Arc::new(consumer)))
})
.collect::<Result<Vec<(u32, StreamConsumer)>, KafkaError>>()?;
.collect::<Result<BTreeMap<u32, Arc<StreamConsumer>>, KafkaError>>()?;
Ok(Self {
conn,

View File

@ -2,10 +2,13 @@ use std::{collections::BTreeMap, sync::Arc, task::Poll};
use async_trait::async_trait;
use entry::{Entry, Sequence, SequencedEntry};
use futures::{stream, StreamExt};
use futures::{stream, FutureExt, StreamExt};
use parking_lot::Mutex;
use crate::core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting};
use crate::core::{
EntryStream, FetchHighWatermark, FetchHighWatermarkFut, WriteBufferError, WriteBufferReading,
WriteBufferWriting,
};
type EntryResVec = Vec<Result<SequencedEntry, WriteBufferError>>;
@ -153,21 +156,38 @@ impl WriteBufferWriting for MockBufferForWritingThatAlwaysErrors {
}
}
/// Sequencer-specific playback state
struct PlaybackState {
/// Index within the entry vector.
vector_index: usize,
/// Offset within the sequencer IDs.
offset: u64,
}
pub struct MockBufferForReading {
state: MockBufferSharedState,
positions: Arc<Mutex<BTreeMap<u32, usize>>>,
shared_state: MockBufferSharedState,
playback_states: Arc<Mutex<BTreeMap<u32, PlaybackState>>>,
}
impl MockBufferForReading {
pub fn new(state: MockBufferSharedState) -> Self {
let n_sequencers = state.entries.lock().len() as u32;
let positions: BTreeMap<_, _> = (0..n_sequencers)
.map(|sequencer_id| (sequencer_id, 0))
let playback_states: BTreeMap<_, _> = (0..n_sequencers)
.map(|sequencer_id| {
(
sequencer_id,
PlaybackState {
vector_index: 0,
offset: 0,
},
)
})
.collect();
Self {
state,
positions: Arc::new(Mutex::new(positions)),
shared_state: state,
playback_states: Arc::new(Mutex::new(playback_states)),
}
}
}
@ -178,46 +198,110 @@ impl std::fmt::Debug for MockBufferForReading {
}
}
#[async_trait]
impl WriteBufferReading for MockBufferForReading {
fn streams<'life0, 'async_trait>(&'life0 self) -> Vec<(u32, EntryStream<'async_trait>)>
where
'life0: 'async_trait,
Self: 'async_trait,
{
fn streams(&mut self) -> Vec<(u32, EntryStream<'_>)> {
let sequencer_ids: Vec<_> = {
let positions = self.positions.lock();
positions.keys().copied().collect()
let playback_states = self.playback_states.lock();
playback_states.keys().copied().collect()
};
let mut streams = vec![];
for sequencer_id in sequencer_ids {
let state = self.state.clone();
let positions = Arc::clone(&self.positions);
let shared_state = self.shared_state.clone();
let playback_states = Arc::clone(&self.playback_states);
let stream = stream::poll_fn(move |_ctx| {
let entries = state.entries.lock();
let mut positions = positions.lock();
let entries = shared_state.entries.lock();
let mut playback_states = playback_states.lock();
let entry_vec = entries.get(&sequencer_id).unwrap();
let position = positions.get_mut(&sequencer_id).unwrap();
let playback_state = playback_states.get_mut(&sequencer_id).unwrap();
if entry_vec.len() > *position {
let entry = match &entry_vec[*position] {
Ok(entry) => Ok(entry.clone()),
Err(e) => Err(e.to_string().into()),
};
*position += 1;
return Poll::Ready(Some(entry));
while entry_vec.len() > playback_state.vector_index {
let entry_result = &entry_vec[playback_state.vector_index];
// consume entry
playback_state.vector_index += 1;
match entry_result {
Ok(entry) => {
// found an entry => need to check if it is within the offset
let sequence = entry.sequence().unwrap();
if sequence.number >= playback_state.offset {
// within offset => return entry to caller
return Poll::Ready(Some(Ok(entry.clone())));
} else {
// offset is larger than the current entry => ignore entry and try next
continue;
}
}
Err(e) => {
// found an error => return it to the caller
return Poll::Ready(Some(Err(e.to_string().into())));
}
}
}
// we are at the end of the recorded entries => report pending
Poll::Pending
})
.boxed();
streams.push((sequencer_id, stream));
let shared_state = self.shared_state.clone();
let fetch_high_watermark = move || {
let shared_state = shared_state.clone();
let fut = async move {
let entries = shared_state.entries.lock();
let entry_vec = entries.get(&sequencer_id).unwrap();
let watermark = entry_vec
.iter()
.filter_map(|entry_res| {
entry_res
.as_ref()
.ok()
.map(|entry| entry.sequence().unwrap().number)
})
.max()
.map(|n| n + 1)
.unwrap_or(0);
Ok(watermark)
};
fut.boxed() as FetchHighWatermarkFut<'_>
};
let fetch_high_watermark = Box::new(fetch_high_watermark) as FetchHighWatermark<'_>;
streams.push((
sequencer_id,
EntryStream {
stream,
fetch_high_watermark,
},
));
}
streams
}
async fn seek(
&mut self,
sequencer_id: u32,
sequence_number: u64,
) -> Result<(), WriteBufferError> {
let mut playback_states = self.playback_states.lock();
if let Some(playback_state) = playback_states.get_mut(&sequencer_id) {
playback_state.offset = sequence_number;
// reset position to start since seeking might go backwards
playback_state.vector_index = 0;
}
Ok(())
}
}
#[cfg(test)]