Merge branch 'main' into ntran/dedup_compare_cols_order
commit 18dd108ba6
@@ -769,9 +769,9 @@ dependencies = [

 [[package]]
 name = "crypto-mac"
-version = "0.10.0"
+version = "0.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4857fd85a0c34b3c3297875b747c1e02e06b6a0ea32dd892d8192b9ce0813ea6"
+checksum = "bff07008ec701e8028e2ceb8f83f0e4274ee62bd2dbdc4fefff2e9a91824081a"
 dependencies = [
  "generic-array",
  "subtle",

@@ -826,6 +826,7 @@ dependencies = [
  "influxdb_line_protocol",
  "num_cpus",
  "observability_deps",
+ "once_cell",
  "percent-encoding",
  "regex",
  "serde",

@@ -843,7 +844,7 @@ dependencies = [
 [[package]]
 name = "datafusion"
 version = "4.0.0-SNAPSHOT"
-source = "git+https://github.com/apache/arrow-datafusion.git?rev=bd3ee23520a3e6f135891ec32d96fcea7ee2bb55#bd3ee23520a3e6f135891ec32d96fcea7ee2bb55"
+source = "git+https://github.com/apache/arrow-datafusion.git?rev=30693df8961dca300306dfd0c8fca130375b50b3#30693df8961dca300306dfd0c8fca130375b50b3"
 dependencies = [
  "ahash 0.7.4",
  "arrow",

@@ -4330,9 +4331,9 @@ dependencies = [

 [[package]]
 name = "tinyvec"
-version = "1.2.0"
+version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b5220f05bb7de7f3f53c7c065e1199b3172696fe2db9f9c4d8ad9b4ee74c342"
+checksum = "4ac2e1d4bd0f75279cfd5a076e0d578bbf02c22b7c39e766c437dd49b3ec43e0"
 dependencies = [
  "tinyvec_macros",
 ]

@@ -4345,9 +4346,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"

 [[package]]
 name = "tokio"
-version = "1.8.1"
+version = "1.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "98c8b05dc14c75ea83d63dd391100353789f5f24b8b3866542a5e85c8be8e985"
+checksum = "c2602b8af3767c285202012822834005f596c811042315fa7e9f5b12b2a43207"
 dependencies = [
  "autocfg",
  "bytes",

@@ -4984,9 +4985,9 @@ checksum = "b07db065a5cf61a7e4ba64f29e67db906fb1787316516c4e6e5ff0fea1efcd8a"

 [[package]]
 name = "zeroize"
-version = "1.4.0"
+version = "1.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eeafe61337cb2c879d328b74aa6cd9d794592c82da6be559fdf11493f02a2d18"
+checksum = "377db0846015f7ae377174787dd452e1c5f5a9050bc6f954911d01f116daa0cd"

 [[package]]
 name = "zstd"

@@ -15,6 +15,7 @@ regex = "1.4"
 serde = { version = "1.0", features = ["rc", "derive"] }
 snafu = "0.6"
 observability_deps = { path = "../observability_deps" }
+once_cell = { version = "1.4.0", features = ["parking_lot"] }

 [dev-dependencies] # In alphabetical order
 test_helpers = { path = "../test_helpers" }

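The once_cell dependency added here backs the new data_types::instant module further down. A standalone sketch of the get_or_init pattern that module relies on (illustrative only, not part of the diff):

// Minimal once_cell example: the closure runs at most once, later calls
// return the cached value.
use once_cell::sync::OnceCell;

static CONFIG: OnceCell<String> = OnceCell::new();

fn main() {
    let value = CONFIG.get_or_init(|| "initialized exactly once".to_string());
    println!("{}", value);
    assert_eq!(CONFIG.get(), Some(value));
}
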
@@ -166,6 +166,10 @@ pub struct LifecycleRules {

     /// Maximum number of rows to buffer in a MUB chunk before compacting it
     pub mub_row_threshold: NonZeroUsize,
+
+    /// Use up to this amount of space in bytes for caching Parquet files. None
+    /// will disable Parquet file caching.
+    pub parquet_cache_limit: Option<NonZeroU64>,
 }

 impl LifecycleRules {

@@ -195,6 +199,7 @@ impl Default for LifecycleRules {
             persist_age_threshold_seconds: NonZeroU32::new(DEFAULT_PERSIST_AGE_THRESHOLD_SECONDS)
                 .unwrap(),
             mub_row_threshold: NonZeroUsize::new(DEFAULT_MUB_ROW_THRESHOLD).unwrap(),
+            parquet_cache_limit: None,
         }
     }
 }

@@ -0,0 +1,53 @@
+use chrono::{DateTime, Utc};
+use once_cell::sync::OnceCell;
+use std::time::Instant;
+
+/// Stores an Instant and DateTime<Utc> captured as close as possible together
+static INSTANCE: OnceCell<(DateTime<Utc>, Instant)> = OnceCell::new();
+
+/// Provides a conversion from Instant to DateTime<Utc> for display purposes
+///
+/// It is an approximation as if the system clock changes, the returned DateTime will not be
+/// the same as the DateTime that would have been recorded at the time the Instant was created.
+///
+/// The conversion does, however, preserve the monotonic property of Instant, i.e. a larger
+/// Instant will have a larger returned DateTime.
+///
+/// This should ONLY be used for display purposes, the results should not be used to
+/// drive logic, nor persisted
+pub fn to_approximate_datetime(instant: Instant) -> DateTime<Utc> {
+    let (ref_date, ref_instant) = *INSTANCE.get_or_init(|| (Utc::now(), Instant::now()));
+
+    if ref_instant > instant {
+        ref_date
+            - chrono::Duration::from_std(ref_instant.duration_since(instant))
+                .expect("date overflow")
+    } else {
+        ref_date
+            + chrono::Duration::from_std(instant.duration_since(ref_instant))
+                .expect("date overflow")
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_to_datetime() {
+        // Seed global state
+        to_approximate_datetime(Instant::now());
+
+        let (ref_date, ref_instant) = *INSTANCE.get().unwrap();
+
+        assert_eq!(
+            to_approximate_datetime(ref_instant + std::time::Duration::from_nanos(78)),
+            ref_date + chrono::Duration::nanoseconds(78)
+        );
+
+        assert_eq!(
+            to_approximate_datetime(ref_instant - std::time::Duration::from_nanos(23)),
+            ref_date - chrono::Duration::nanoseconds(23)
+        );
+    }
+}

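The helper above is strictly for displaying server-internal Instants as wall-clock times. A hypothetical caller (not part of the diff; assumes the data_types crate is a dependency, and the five-second offset is invented) could look like:

// Display-only conversion of a stored Instant.
use std::time::{Duration, Instant};

use data_types::instant::to_approximate_datetime;

fn main() {
    let written_at = Instant::now() - Duration::from_secs(5);
    // Approximate wall-clock time of the write; fine for display, never for logic.
    println!("last write at approximately {}", to_approximate_datetime(written_at));
}
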
@@ -13,12 +13,14 @@
 pub mod chunk_metadata;
 pub mod consistent_hasher;
 mod database_name;
-pub use database_name::*;
 pub mod database_rules;
 pub mod database_state;
 pub mod error;
+pub mod instant;
 pub mod job;
 pub mod names;
 pub mod partition_metadata;
 pub mod server_id;
 pub mod timestamp;
+pub mod write_summary;
+pub use database_name::*;

@@ -0,0 +1,20 @@
+use chrono::{DateTime, Utc};
+
+/// A description of a set of writes
+#[derive(Debug, Clone, Eq, PartialEq)]
+pub struct WriteSummary {
+    /// The wall clock timestamp of the first write in this summary
+    pub time_of_first_write: DateTime<Utc>,
+
+    /// The wall clock timestamp of the last write in this summary
+    pub time_of_last_write: DateTime<Utc>,
+
+    /// The minimum row timestamp for data in this summary
+    pub min_timestamp: DateTime<Utc>,
+
+    /// The maximum row timestamp value for data in this summary
+    pub max_timestamp: DateTime<Utc>,
+
+    /// The number of rows in this summary
+    pub row_count: usize,
+}

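WriteSummary is a plain value type. A hypothetical construction, with values invented purely for illustration:

// Illustrative only: building a WriteSummary by hand.
use chrono::{TimeZone, Utc};
use data_types::write_summary::WriteSummary;

fn main() {
    let summary = WriteSummary {
        time_of_first_write: Utc.timestamp(1_626_000_000, 0),
        time_of_last_write: Utc.timestamp(1_626_000_060, 0),
        min_timestamp: Utc.timestamp_nanos(10),
        max_timestamp: Utc.timestamp_nanos(340),
        row_count: 21,
    };
    assert!(summary.time_of_first_write <= summary.time_of_last_write);
}
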
@@ -9,4 +9,4 @@ description = "Re-exports datafusion at a specific version"

 # Rename to workaround doctest bug
 # Turn off optional datafusion features (function packages)
-upstream = { git = "https://github.com/apache/arrow-datafusion.git", rev="bd3ee23520a3e6f135891ec32d96fcea7ee2bb55", default-features = false, package = "datafusion" }
+upstream = { git = "https://github.com/apache/arrow-datafusion.git", rev="30693df8961dca300306dfd0c8fca130375b50b3", default-features = false, package = "datafusion" }

@@ -82,6 +82,10 @@ message LifecycleRules {
   // If 0, compactions are limited to the default number.
   // See data_types::database_rules::DEFAULT_MAX_ACTIVE_COMPACTIONS
   uint32 max_active_compactions = 16;
+
+  // Use up to this amount of space in bytes for caching Parquet files.
+  // A value of 0 disables Parquet caching
+  uint64 parquet_cache_limit = 17;
 }

 message DatabaseRules {

@@ -35,6 +35,10 @@ impl From<LifecycleRules> for management::LifecycleRules {
             persist_row_threshold: config.persist_row_threshold.get() as u64,
             persist_age_threshold_seconds: config.persist_age_threshold_seconds.get(),
             mub_row_threshold: config.mub_row_threshold.get() as u64,
+            parquet_cache_limit: config
+                .parquet_cache_limit
+                .map(|v| v.get())
+                .unwrap_or_default(),
         }
     }
 }

@@ -69,6 +73,7 @@ impl TryFrom<management::LifecycleRules> for LifecycleRules {
                 .unwrap_or_else(|| NonZeroU32::new(DEFAULT_PERSIST_AGE_THRESHOLD_SECONDS).unwrap()),
             mub_row_threshold: NonZeroUsize::new(proto.mub_row_threshold as usize)
                 .unwrap_or_else(|| NonZeroUsize::new(DEFAULT_MUB_ROW_THRESHOLD).unwrap()),
+            parquet_cache_limit: NonZeroU64::new(proto.parquet_cache_limit),
         })
     }
 }

@@ -93,6 +98,7 @@ mod tests {
             persist_row_threshold: 57,
             persist_age_threshold_seconds: 23,
             mub_row_threshold: 3454,
+            parquet_cache_limit: 10,
         };

         let config: LifecycleRules = protobuf.clone().try_into().unwrap();

@@ -125,6 +131,11 @@ mod tests {
             protobuf.persist_age_threshold_seconds
         );
         assert_eq!(back.mub_row_threshold, protobuf.mub_row_threshold);
+        assert_eq!(
+            config.parquet_cache_limit.unwrap().get(),
+            protobuf.parquet_cache_limit
+        );
+        assert_eq!(back.parquet_cache_limit, protobuf.parquet_cache_limit);
     }

     #[test]

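The proto field uses 0 to mean "caching disabled", and the conversion above maps that straight onto Option<NonZeroU64>, since NonZeroU64::new(0) is None. A small standalone check of that round trip (illustrative, not taken from the diff):

// Round-tripping the "0 means disabled" convention used for parquet_cache_limit.
use std::num::NonZeroU64;

fn main() {
    // proto -> rules: 0 becomes None, any other value becomes Some(limit)
    assert_eq!(NonZeroU64::new(0), None);
    assert_eq!(NonZeroU64::new(10).map(|v| v.get()), Some(10));

    // rules -> proto: None becomes 0 again via unwrap_or_default()
    let disabled: Option<NonZeroU64> = None;
    assert_eq!(disabled.map(|v| v.get()).unwrap_or_default(), 0);
}
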
@@ -11,7 +11,7 @@ use arrow::datatypes::{
     DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema,
     SchemaRef as ArrowSchemaRef, TimeUnit,
 };
-use snafu::Snafu;
+use snafu::{OptionExt, Snafu};

 use crate::{
     schema::sort::{ColumnSort, SortKey},

@@ -395,11 +395,9 @@ impl Schema {
     pub fn compute_select_indicies(&self, columns: &[&str]) -> Result<Vec<usize>> {
         columns
             .iter()
-            .map(|column_name| {
+            .map(|&column_name| {
                 self.find_index_of(column_name)
-                    .ok_or_else(|| Error::ColumnNotFound {
-                        column_name: column_name.to_string(),
-                    })
+                    .context(ColumnNotFound { column_name })
             })
             .collect()
     }

@@ -788,12 +786,12 @@ macro_rules! assert_column_eq {

 #[cfg(test)]
 mod test {
+    use arrow::compute::SortOptions;
     use InfluxColumnType::*;
     use InfluxFieldType::*;

     use super::{builder::SchemaBuilder, *};
     use crate::schema::merge::SchemaMerger;
-    use crate::schema::sort::SortOptions;

     fn make_field(
         name: &str,

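The new .context(ColumnNotFound { column_name }) call uses snafu's OptionExt to turn a None into the matching error variant instead of the hand-written ok_or_else. A self-contained sketch of that pattern with snafu 0.6 (error type and helper names invented for illustration):

// Minimal snafu 0.6 OptionExt example mirroring the pattern used above.
use snafu::{OptionExt, Snafu};

#[derive(Debug, Snafu)]
enum Error {
    #[snafu(display("Column not found: {}", column_name))]
    ColumnNotFound { column_name: String },
}

fn find_index_of(columns: &[&str], column_name: &str) -> Result<usize, Error> {
    columns
        .iter()
        .position(|&c| c == column_name)
        // The context selector converts the &str into the String field for us.
        .context(ColumnNotFound { column_name })
}

fn main() {
    assert_eq!(find_index_of(&["a", "b"], "b").unwrap(), 1);
    assert!(find_index_of(&["a", "b"], "c").is_err());
}
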
@@ -1,5 +1,6 @@
 use std::{fmt::Display, str::FromStr};

+use arrow::compute::SortOptions;
 use indexmap::{map::Iter, IndexMap};
 use itertools::Itertools;
 use snafu::Snafu;

@@ -23,24 +24,6 @@ pub enum Error {

 pub type Result<T, E = Error> = std::result::Result<T, E>;

-/// Temporary - <https://github.com/apache/arrow-rs/pull/425>
-#[derive(Debug, Clone, Copy, Eq, PartialEq)]
-pub struct SortOptions {
-    /// Whether to sort in descending order
-    pub descending: bool,
-    /// Whether to sort nulls first
-    pub nulls_first: bool,
-}
-
-impl Default for SortOptions {
-    fn default() -> Self {
-        Self {
-            descending: false,
-            nulls_first: true,
-        }
-    }
-}
-
 #[derive(Debug, Clone, Copy, Eq, PartialEq)]
 pub struct ColumnSort {
     /// Position of this column in the sort key

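The struct removed here was a stop-gap until arrow-rs exposed SortOptions (see the linked PR); the arrow type has the same shape and, as far as I know, the same defaults, which a quick check can confirm:

// Sanity check that arrow::compute::SortOptions defaults match the removed
// temporary struct (descending: false, nulls_first: true).
use arrow::compute::SortOptions;

fn main() {
    let opts = SortOptions::default();
    assert!(!opts.descending);
    assert!(opts.nulls_first);
}
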
@@ -1399,6 +1399,7 @@ mod tests {
         let rules = LifecycleRules {
             late_arrive_window_seconds: NonZeroU32::new(10).unwrap(),
             persist_row_threshold: NonZeroUsize::new(1_000).unwrap(),
+            max_active_compactions: NonZeroU32::new(10).unwrap(),
             ..Default::default()
         };

@@ -1538,6 +1539,7 @@ mod tests {
             persist_row_threshold: NonZeroUsize::new(1_000).unwrap(),
             late_arrive_window_seconds: NonZeroU32::new(10).unwrap(),
             persist_age_threshold_seconds: NonZeroU32::new(10).unwrap(),
+            max_active_compactions: NonZeroU32::new(10).unwrap(),
             ..Default::default()
         };
         let now = Instant::now();

@@ -7,12 +7,13 @@ use std::{

 use chrono::{DateTime, TimeZone, Utc};

-use data_types::partition_metadata::PartitionAddr;
+use data_types::{partition_metadata::PartitionAddr, write_summary::WriteSummary};
 use entry::Sequence;
 use internal_types::guard::{ReadGuard, ReadLock};

 use crate::checkpoint::PartitionCheckpoint;
 use crate::min_max_sequence::MinMaxSequence;
+use data_types::instant::to_approximate_datetime;

 const DEFAULT_CLOSED_WINDOW_PERIOD: Duration = Duration::from_secs(30);

@@ -45,6 +46,9 @@ pub struct PersistenceWindows {
     late_arrival_period: Duration,
     closed_window_period: Duration,

+    /// The instant this PersistenceWindows was created
+    created_at: Instant,
+
     /// The last instant passed to PersistenceWindows::add_range
     last_instant: Instant,

@@ -106,6 +110,8 @@ impl PersistenceWindows {

         let closed_window_count = late_arrival_seconds / closed_window_seconds;

+        let created_at_instant = Instant::now();
+
         Self {
             persistable: ReadLock::new(None),
             closed: VecDeque::with_capacity(closed_window_count as usize),

@@ -113,11 +119,18 @@ impl PersistenceWindows {
             addr,
             late_arrival_period,
             closed_window_period,
-            last_instant: Instant::now(),
+            created_at: created_at_instant,
+            last_instant: created_at_instant,
             max_sequence_numbers: Default::default(),
         }
     }

+    /// Updates the late arrival period of this `PersistenceWindows` instance
+    pub fn set_late_arrival_period(&mut self, late_arrival_period: Duration) {
+        self.closed_window_period = late_arrival_period.min(DEFAULT_CLOSED_WINDOW_PERIOD);
+        self.late_arrival_period = late_arrival_period;
+    }
+
     /// Updates the windows with the information from a batch of rows from a single sequencer
     /// to the same partition. The min and max times are the times on the row data. The `received_at`
     /// Instant is when the data was received. Taking it in this function is really just about

@@ -165,7 +178,7 @@ impl PersistenceWindows {
         self.rotate(received_at);

         match self.open.as_mut() {
-            Some(w) => w.add_range(sequence, row_count, min_time, max_time),
+            Some(w) => w.add_range(sequence, row_count, min_time, max_time, received_at),
             None => {
                 self.open = Some(Window::new(
                     received_at,

@@ -335,6 +348,21 @@ impl PersistenceWindows {
         self.windows().next()
     }

+    /// Returns approximate summaries of the unpersisted writes
+    /// recorded by this PersistenceWindows instance
+    ///
+    /// These are approximate because persistence may partially flush a window, which will
+    /// update the min row timestamp but not the row count
+    pub fn summaries(&self) -> impl Iterator<Item = WriteSummary> + '_ {
+        self.windows().map(move |window| WriteSummary {
+            time_of_first_write: to_approximate_datetime(window.created_at),
+            time_of_last_write: to_approximate_datetime(window.last_instant),
+            min_timestamp: window.min_time,
+            max_timestamp: window.max_time,
+            row_count: window.row_count,
+        })
+    }
+
     /// Returns true if this PersistenceWindows instance is empty
     pub fn is_empty(&self) -> bool {
         self.minimum_window().is_none()

@@ -374,9 +402,14 @@ struct Window {
     /// The server time when this window was created. Used to determine how long data in this
     /// window has been sitting in memory.
     created_at: Instant,
+    /// The server time of the last write to this window
+    last_instant: Instant,
+    /// The number of rows in the window
     row_count: usize,
-    min_time: DateTime<Utc>, // min time value for data in the window
-    max_time: DateTime<Utc>, // max time value for data in the window
+    /// min time value for data in the window
+    min_time: DateTime<Utc>,
+    /// max time value for data in the window
+    max_time: DateTime<Utc>,
     /// maps sequencer_id to the minimum and maximum sequence numbers seen
     sequencer_numbers: BTreeMap<u32, MinMaxSequence>,
 }

@@ -399,6 +432,7 @@ impl Window {

         Self {
             created_at,
+            last_instant: created_at,
             row_count,
             min_time,
             max_time,

@@ -414,7 +448,11 @@ impl Window {
         row_count: usize,
         min_time: DateTime<Utc>,
         max_time: DateTime<Utc>,
+        instant: Instant,
     ) {
+        assert!(self.created_at <= instant);
+        self.last_instant = instant;
+
         self.row_count += row_count;
         if self.min_time > min_time {
             self.min_time = min_time;

@@ -440,6 +478,10 @@ impl Window {

     /// Add one window to another. Used to collapse closed windows into persisted.
     fn add_window(&mut self, other: Self) {
+        assert!(self.last_instant <= other.created_at);
+        assert!(self.last_instant <= other.last_instant);
+
+        self.last_instant = other.last_instant;
         self.row_count += other.row_count;
         if self.min_time > other.min_time {
             self.min_time = other.min_time;
@@ -1265,4 +1307,119 @@ mod tests {
         assert_eq!(w.closed[1].max_time, start + chrono::Duration::seconds(2));
         assert_eq!(w.closed[1].row_count, 11);
     }
+
+    #[test]
+    fn test_summaries() {
+        let late_arrival_period = Duration::from_secs(100);
+        let mut w = make_windows(late_arrival_period);
+        let instant = w.created_at;
+        let created_at_time = to_approximate_datetime(w.created_at);
+
+        // Window 1
+        w.add_range(
+            Some(&Sequence { id: 1, number: 1 }),
+            11,
+            Utc.timestamp_nanos(10),
+            Utc.timestamp_nanos(11),
+            instant + Duration::from_millis(1),
+        );
+
+        w.add_range(
+            Some(&Sequence { id: 1, number: 2 }),
+            4,
+            Utc.timestamp_nanos(10),
+            Utc.timestamp_nanos(340),
+            instant + Duration::from_millis(30),
+        );
+
+        w.add_range(
+            Some(&Sequence { id: 1, number: 3 }),
+            6,
+            Utc.timestamp_nanos(1),
+            Utc.timestamp_nanos(5),
+            instant + Duration::from_millis(50),
+        );
+
+        // More than DEFAULT_CLOSED_WINDOW_PERIOD after start of Window 1 => Window 2
+        w.add_range(
+            Some(&Sequence { id: 1, number: 4 }),
+            3,
+            Utc.timestamp_nanos(89),
+            Utc.timestamp_nanos(90),
+            instant + DEFAULT_CLOSED_WINDOW_PERIOD + Duration::from_millis(1),
+        );
+
+        // More than DEFAULT_CLOSED_WINDOW_PERIOD after start of Window 2 => Window 3
+        w.add_range(
+            Some(&Sequence { id: 1, number: 5 }),
+            8,
+            Utc.timestamp_nanos(3),
+            Utc.timestamp_nanos(4),
+            instant + DEFAULT_CLOSED_WINDOW_PERIOD * 3,
+        );
+
+        let closed_duration = chrono::Duration::from_std(DEFAULT_CLOSED_WINDOW_PERIOD).unwrap();
+
+        let summaries: Vec<_> = w.summaries().collect();
+
+        assert_eq!(summaries.len(), 3);
+        assert_eq!(
+            summaries,
+            vec![
+                WriteSummary {
+                    time_of_first_write: created_at_time + chrono::Duration::milliseconds(1),
+                    time_of_last_write: created_at_time + chrono::Duration::milliseconds(50),
+                    min_timestamp: Utc.timestamp_nanos(1),
+                    max_timestamp: Utc.timestamp_nanos(340),
+                    row_count: 21
+                },
+                WriteSummary {
+                    time_of_first_write: created_at_time
+                        + closed_duration
+                        + chrono::Duration::milliseconds(1),
+                    time_of_last_write: created_at_time
+                        + closed_duration
+                        + chrono::Duration::milliseconds(1),
+                    min_timestamp: Utc.timestamp_nanos(89),
+                    max_timestamp: Utc.timestamp_nanos(90),
+                    row_count: 3
+                },
+                WriteSummary {
+                    time_of_first_write: created_at_time + closed_duration * 3,
+                    time_of_last_write: created_at_time + closed_duration * 3,
+                    min_timestamp: Utc.timestamp_nanos(3),
+                    max_timestamp: Utc.timestamp_nanos(4),
+                    row_count: 8
+                },
+            ]
+        );
+
+        // Rotate first and second windows into persistable
+        w.rotate(instant + late_arrival_period + DEFAULT_CLOSED_WINDOW_PERIOD * 2);
+
+        let summaries: Vec<_> = w.summaries().collect();
+
+        assert_eq!(summaries.len(), 2);
+        assert_eq!(
+            summaries,
+            vec![
+                WriteSummary {
+                    time_of_first_write: created_at_time + chrono::Duration::milliseconds(1),
+                    time_of_last_write: created_at_time
+                        + closed_duration
+                        + chrono::Duration::milliseconds(1),
+                    min_timestamp: Utc.timestamp_nanos(1),
+                    max_timestamp: Utc.timestamp_nanos(340),
+                    row_count: 24
+                },
+                WriteSummary {
+                    time_of_first_write: created_at_time + closed_duration * 3,
+                    time_of_last_write: created_at_time + closed_duration * 3,
+                    min_timestamp: Utc.timestamp_nanos(3),
+                    max_timestamp: Utc.timestamp_nanos(4),
+                    row_count: 8
+                },
+            ]
+        );
+    }
 }

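Downstream code (for example the new system.persistence_windows table visible in the expected-output changes further down) only needs to iterate these summaries. A hypothetical helper over them, not part of the diff:

// Collapse a set of WriteSummary values into a single unpersisted row count.
use data_types::write_summary::WriteSummary;

fn total_unpersisted_rows(summaries: impl Iterator<Item = WriteSummary>) -> usize {
    summaries.map(|s| s.row_count).sum()
}
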
@@ -39,6 +39,7 @@ use crate::plan::{
 };

 use self::{
+    context::IOxExecutionConfig,
     split::StreamSplitNode,
     task::{DedicatedExecutor, Error as ExecutorError},
 };

@@ -111,6 +112,9 @@ pub struct Executor {
     /// Executor for running system/reorganization tasks such as
     /// compact
     reorg_exec: DedicatedExecutor,
+
+    /// The default configuration options with which to create contexts
+    config: IOxExecutionConfig,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq)]

@@ -128,12 +132,25 @@ impl Executor {
         let query_exec = DedicatedExecutor::new("IOx Query Executor Thread", num_threads);
         let reorg_exec = DedicatedExecutor::new("IOx Reorg Executor Thread", num_threads);

+        let config = IOxExecutionConfig::new();
+
         Self {
             query_exec,
             reorg_exec,
+            config,
         }
     }

+    /// returns the config of this executor
+    pub fn config(&self) -> &IOxExecutionConfig {
+        &self.config
+    }
+
+    /// returns a mutable reference to this executor's config
+    pub fn config_mut(&mut self) -> &mut IOxExecutionConfig {
+        &mut self.config
+    }
+
     /// Executes this plan on the query pool, and returns the
     /// resulting set of strings
     pub async fn to_string_set(&self, plan: StringSetPlan) -> Result<StringSetRef> {

@@ -289,7 +306,7 @@ impl Executor {
     pub fn new_context(&self, executor_type: ExecutorType) -> IOxExecutionContext {
         let executor = self.executor(executor_type).clone();

-        IOxExecutionContext::new(executor)
+        IOxExecutionContext::new(executor, self.config.clone())
     }

     /// Return the execution pool of the specified type

@@ -5,6 +5,7 @@ use std::{fmt, sync::Arc};

 use arrow::record_batch::RecordBatch;
 use datafusion::{
+    catalog::catalog::CatalogProvider,
     execution::context::{ExecutionContextState, QueryPlanner},
     logical_plan::{LogicalPlan, UserDefinedLogicalNode},
     physical_plan::{

@@ -105,6 +106,46 @@ impl ExtensionPlanner for IOxExtensionPlanner {
     }
 }

+// Configuration for an IOx execution context
+#[derive(Clone)]
+pub struct IOxExecutionConfig {
+    /// Configuration options to pass to DataFusion
+    inner: ExecutionConfig,
+}
+
+impl Default for IOxExecutionConfig {
+    fn default() -> Self {
+        const BATCH_SIZE: usize = 1000;
+
+        // Setup default configuration
+        let inner = ExecutionConfig::new()
+            .with_batch_size(BATCH_SIZE)
+            .create_default_catalog_and_schema(true)
+            .with_information_schema(true)
+            .with_default_catalog_and_schema(DEFAULT_CATALOG, DEFAULT_SCHEMA)
+            .with_query_planner(Arc::new(IOxQueryPlanner {}));
+
+        Self { inner }
+    }
+}
+
+impl fmt::Debug for IOxExecutionConfig {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "IOxExecutionConfig ...")
+    }
+}
+
+impl IOxExecutionConfig {
+    pub fn new() -> Self {
+        Default::default()
+    }
+
+    /// Set execution concurrency
+    pub fn set_concurrency(&mut self, concurrency: usize) {
+        self.inner.concurrency = concurrency;
+    }
+}
+
 /// This is an execution context for planning in IOx. It wraps a
 /// DataFusion execution context with the information needed for planning.
 ///

@@ -136,21 +177,8 @@ impl fmt::Debug for IOxExecutionContext {

 impl IOxExecutionContext {
     /// Create an ExecutionContext suitable for executing DataFusion plans
-    ///
-    /// The config is created with a default catalog and schema, but this
-    /// can be overridden at a later date
-    pub fn new(exec: DedicatedExecutor) -> Self {
-        const BATCH_SIZE: usize = 1000;
-
-        // TBD: Should we be reusing an execution context across all executions?
-        let config = ExecutionConfig::new()
-            .with_batch_size(BATCH_SIZE)
-            .create_default_catalog_and_schema(true)
-            .with_information_schema(true)
-            .with_default_catalog_and_schema(DEFAULT_CATALOG, DEFAULT_SCHEMA)
-            .with_query_planner(Arc::new(IOxQueryPlanner {}));
-
-        let inner = ExecutionContext::with_config(config);
+    pub fn new(exec: DedicatedExecutor, config: IOxExecutionConfig) -> Self {
+        let inner = ExecutionContext::with_config(config.inner);

         Self { inner, exec }
     }

@@ -160,11 +188,13 @@ impl IOxExecutionContext {
         &self.inner
     }

-    /// returns a mutable reference to the inner datafusion execution context
-    pub fn inner_mut(&mut self) -> &mut ExecutionContext {
-        &mut self.inner
+    /// registers a catalog with the inner context
+    pub fn register_catalog(&mut self, name: impl Into<String>, catalog: Arc<dyn CatalogProvider>) {
+        self.inner.register_catalog(name, catalog);
     }

     /// Prepare a SQL statement for execution. This assumes that any
     /// tables referenced in the SQL have been registered with this context
     pub fn prepare_sql(&mut self, sql: &str) -> Result<Arc<dyn ExecutionPlan>> {

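Taken together, the Executor now owns an IOxExecutionConfig and hands a clone to every new context. A hypothetical caller tuning concurrency before planning (names from this diff, wiring and the thread counts invented for illustration):

// Sketch only: adjust the shared execution config, then create contexts from it.
use query::exec::{Executor, ExecutorType};

fn build_executor() -> Executor {
    let mut executor = Executor::new(4);
    // Every context created afterwards picks up this concurrency setting.
    executor.config_mut().set_concurrency(8);
    executor
}

fn main() {
    let executor = build_executor();
    let _ctx = executor.new_context(ExecutorType::Query);
}
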
@@ -268,8 +268,9 @@ struct ScanPlan<C: QueryChunk + 'static> {

 #[cfg(test)]
 mod test {
+    use arrow::compute::SortOptions;
     use arrow_util::assert_batches_eq;
-    use internal_types::schema::{merge::SchemaMerger, sort::SortOptions};
+    use internal_types::schema::merge::SchemaMerger;

     use crate::{
         exec::{Executor, ExecutorType},

@@ -87,7 +87,7 @@ impl SqlQueryPlanner {
         executor: &Executor,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let mut ctx = executor.new_context(ExecutorType::Query);
-        ctx.inner_mut().register_catalog(DEFAULT_CATALOG, database);
+        ctx.register_catalog(DEFAULT_CATALOG, database);
         ctx.prepare_sql(query).context(Preparing)
     }
 }

@@ -366,21 +366,12 @@ impl RecordBatchDeduplicator {
     }

     /// Create a new record batch from offset --> len
-    ///
-    /// <https://github.com/apache/arrow-rs/issues/460> for adding this upstream
     fn slice_record_batch(
         batch: &RecordBatch,
         offset: usize,
         len: usize,
     ) -> ArrowResult<RecordBatch> {
-        let schema = batch.schema();
-        let new_columns: Vec<_> = batch
-            .columns()
-            .iter()
-            .map(|old_column| old_column.slice(offset, len))
-            .collect();
-
-        let batch = RecordBatch::try_new(schema, new_columns)?;
+        let batch = batch.slice(offset, len);

         // At time of writing, `concat_batches` concatenates the
         // contents of dictionaries as well; Do a post pass to remove the

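The manual per-column slicing is replaced by arrow's RecordBatch::slice, which is zero-copy. A minimal standalone illustration of that API (schema and values invented for illustration):

// RecordBatch::slice keeps the same schema and shares the underlying buffers.
use std::sync::Arc;

use arrow::array::Int32Array;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;

fn main() -> arrow::error::Result<()> {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    let batch = RecordBatch::try_new(
        schema,
        vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]))],
    )?;
    let sliced = batch.slice(1, 3); // rows 2..=4, no data copied
    assert_eq!(sliced.num_rows(), 3);
    Ok(())
}
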
@@ -1,25 +1,27 @@
 -- Test Setup: OneMeasurementAllChunksDropped
 -- SQL: SELECT * from information_schema.tables;
-+---------------+--------------------+---------------+------------+
-| table_catalog | table_schema | table_name | table_type |
-+---------------+--------------------+---------------+------------+
-| public | iox | h2o | BASE TABLE |
-| public | system | chunks | BASE TABLE |
-| public | system | columns | BASE TABLE |
-| public | system | chunk_columns | BASE TABLE |
-| public | system | operations | BASE TABLE |
-| public | information_schema | tables | VIEW |
-| public | information_schema | columns | VIEW |
-+---------------+--------------------+---------------+------------+
++---------------+--------------------+---------------------+------------+
+| table_catalog | table_schema | table_name | table_type |
++---------------+--------------------+---------------------+------------+
+| public | iox | h2o | BASE TABLE |
+| public | system | chunks | BASE TABLE |
+| public | system | columns | BASE TABLE |
+| public | system | chunk_columns | BASE TABLE |
+| public | system | operations | BASE TABLE |
+| public | system | persistence_windows | BASE TABLE |
+| public | information_schema | tables | VIEW |
+| public | information_schema | columns | VIEW |
++---------------+--------------------+---------------------+------------+
 -- SQL: SHOW TABLES;
-+---------------+--------------------+---------------+------------+
-| table_catalog | table_schema | table_name | table_type |
-+---------------+--------------------+---------------+------------+
-| public | iox | h2o | BASE TABLE |
-| public | system | chunks | BASE TABLE |
-| public | system | columns | BASE TABLE |
-| public | system | chunk_columns | BASE TABLE |
-| public | system | operations | BASE TABLE |
-| public | information_schema | tables | VIEW |
-| public | information_schema | columns | VIEW |
-+---------------+--------------------+---------------+------------+
++---------------+--------------------+---------------------+------------+
+| table_catalog | table_schema | table_name | table_type |
++---------------+--------------------+---------------------+------------+
+| public | iox | h2o | BASE TABLE |
+| public | system | chunks | BASE TABLE |
+| public | system | columns | BASE TABLE |
+| public | system | chunk_columns | BASE TABLE |
+| public | system | operations | BASE TABLE |
+| public | system | persistence_windows | BASE TABLE |
+| public | information_schema | tables | VIEW |
+| public | information_schema | columns | VIEW |
++---------------+--------------------+---------------------+------------+

@@ -1,86 +1,87 @@
 -- Test Setup: OneMeasurementThreeChunksWithDuplicates
--- SQL: explain verbose select time, state, city, min_temp, max_temp, area from h2o order by time, state, city;
-+-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
-| plan_type | plan |
-+-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
-| logical_plan | Sort: #h2o.time ASC NULLS FIRST, #h2o.state ASC NULLS FIRST, #h2o.city ASC NULLS FIRST |
-| | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area |
-| | TableScan: h2o projection=None |
-| logical_plan after projection_push_down | Sort: #h2o.time ASC NULLS FIRST, #h2o.state ASC NULLS FIRST, #h2o.city ASC NULLS FIRST |
-| | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area |
-| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) |
-| logical_plan after simplify_expressions | Sort: #h2o.time ASC NULLS FIRST, #h2o.state ASC NULLS FIRST, #h2o.city ASC NULLS FIRST |
-| | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area |
-| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) |
-| physical_plan | SortExec: [time@0 ASC,state@1 ASC,city@2 ASC] |
-| | ProjectionExec: expr=[time@5 as time, state@4 as state, city@1 as city, min_temp@3 as min_temp, max_temp@2 as max_temp, area@0 as area] |
-| | ExecutionPlan(PlaceHolder) |
-| | DeduplicateExec: [state@4 ASC,city@1 ASC,time@5 ASC] |
-| | SortPreservingMergeExec: [state@4 ASC,city@1 ASC,time@5 ASC] |
-| | ExecutionPlan(PlaceHolder) |
-| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
-| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
-| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
-| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
-+-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
--- SQL: explain verbose select time, state, city, min_temp, max_temp, area from h2o;
-+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+
-| plan_type | plan |
-+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+
-| logical_plan | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area |
-| | TableScan: h2o projection=None |
-| logical_plan after projection_push_down | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area |
-| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) |
-| logical_plan after simplify_expressions | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area |
-| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) |
-| physical_plan | ProjectionExec: expr=[time@5 as time, state@4 as state, city@1 as city, min_temp@3 as min_temp, max_temp@2 as max_temp, area@0 as area] |
-| | ExecutionPlan(PlaceHolder) |
-| | DeduplicateExec: [state@4 ASC,city@1 ASC,time@5 ASC] |
-| | SortPreservingMergeExec: [state@4 ASC,city@1 ASC,time@5 ASC] |
-| | ExecutionPlan(PlaceHolder) |
-| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
-| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
-| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
-| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
-+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+
--- SQL: EXPLAIN VERBOSE select state as name from h2o UNION ALL select city as name from h2o;
-+-----------------------------------------+-------------------------------------------------------------------------------+
-| plan_type | plan |
-+-----------------------------------------+-------------------------------------------------------------------------------+
-| logical_plan | Union |
-| | Projection: #h2o.state AS name |
-| | TableScan: h2o projection=None |
-| | Projection: #h2o.city AS name |
-| | TableScan: h2o projection=None |
-| logical_plan after projection_push_down | Union |
-| | Projection: #h2o.state AS name |
-| | TableScan: h2o projection=Some([4]) |
-| | Projection: #h2o.city AS name |
-| | TableScan: h2o projection=Some([1]) |
-| logical_plan after simplify_expressions | Union |
-| | Projection: #h2o.state AS name |
-| | TableScan: h2o projection=Some([4]) |
-| | Projection: #h2o.city AS name |
-| | TableScan: h2o projection=Some([1]) |
-| physical_plan | ExecutionPlan(PlaceHolder) |
-| | ProjectionExec: expr=[state@0 as name] |
-| | ExecutionPlan(PlaceHolder) |
-| | ProjectionExec: expr=[state@1 as state] |
-| | DeduplicateExec: [state@1 ASC,city@0 ASC,time@2 ASC] |
-| | SortPreservingMergeExec: [state@1 ASC,city@0 ASC,time@2 ASC] |
-| | ExecutionPlan(PlaceHolder) |
-| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
-| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
-| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
-| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
-| | ProjectionExec: expr=[city@0 as name] |
-| | ExecutionPlan(PlaceHolder) |
-| | ProjectionExec: expr=[city@0 as city] |
-| | DeduplicateExec: [state@1 ASC,city@0 ASC,time@2 ASC] |
-| | SortPreservingMergeExec: [state@1 ASC,city@0 ASC,time@2 ASC] |
-| | ExecutionPlan(PlaceHolder) |
-| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
-| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
-| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
-| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
-+-----------------------------------------+-------------------------------------------------------------------------------+
+-- SQL: explain select time, state, city, min_temp, max_temp, area from h2o order by time, state, city;
++---------------+---------------------------------------------------------------------------------------------------------------------------------------------+
+| plan_type | plan |
++---------------+---------------------------------------------------------------------------------------------------------------------------------------------+
+| logical_plan | Sort: #h2o.time ASC NULLS FIRST, #h2o.state ASC NULLS FIRST, #h2o.city ASC NULLS FIRST |
+| | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area |
+| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) |
+| physical_plan | SortExec: [time@0 ASC,state@1 ASC,city@2 ASC] |
+| | CoalescePartitionsExec |
+| | ProjectionExec: expr=[time@5 as time, state@4 as state, city@1 as city, min_temp@3 as min_temp, max_temp@2 as max_temp, area@0 as area] |
+| | ExecutionPlan(PlaceHolder) |
+| | RepartitionExec: partitioning=RoundRobinBatch(4) |
+| | DeduplicateExec: [state@4 ASC,city@1 ASC,time@5 ASC] |
+| | SortPreservingMergeExec: [state@4 ASC,city@1 ASC,time@5 ASC] |
+| | ExecutionPlan(PlaceHolder) |
+| | RepartitionExec: partitioning=RoundRobinBatch(4) |
+| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+| | RepartitionExec: partitioning=RoundRobinBatch(4) |
+| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+| | RepartitionExec: partitioning=RoundRobinBatch(4) |
+| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+| | RepartitionExec: partitioning=RoundRobinBatch(4) |
+| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
++---------------+---------------------------------------------------------------------------------------------------------------------------------------------+
+-- SQL: EXPLAIN select time, state, city, min_temp, max_temp, area from h2o;
++---------------+-----------------------------------------------------------------------------------------------------------------------------------------+
+| plan_type | plan |
++---------------+-----------------------------------------------------------------------------------------------------------------------------------------+
+| logical_plan | Projection: #h2o.time, #h2o.state, #h2o.city, #h2o.min_temp, #h2o.max_temp, #h2o.area |
+| | TableScan: h2o projection=Some([0, 1, 2, 3, 4, 5]) |
+| physical_plan | ProjectionExec: expr=[time@5 as time, state@4 as state, city@1 as city, min_temp@3 as min_temp, max_temp@2 as max_temp, area@0 as area] |
+| | ExecutionPlan(PlaceHolder) |
+| | RepartitionExec: partitioning=RoundRobinBatch(4) |
+| | DeduplicateExec: [state@4 ASC,city@1 ASC,time@5 ASC] |
+| | SortPreservingMergeExec: [state@4 ASC,city@1 ASC,time@5 ASC] |
+| | ExecutionPlan(PlaceHolder) |
+| | RepartitionExec: partitioning=RoundRobinBatch(4) |
+| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+| | RepartitionExec: partitioning=RoundRobinBatch(4) |
+| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+| | RepartitionExec: partitioning=RoundRobinBatch(4) |
+| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+| | RepartitionExec: partitioning=RoundRobinBatch(4) |
+| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
++---------------+-----------------------------------------------------------------------------------------------------------------------------------------+
+-- SQL: EXPLAIN select state as name from h2o UNION ALL select city as name from h2o;
++---------------+-----------------------------------------------------------------------------------+
+| plan_type | plan |
++---------------+-----------------------------------------------------------------------------------+
+| logical_plan | Union |
+| | Projection: #h2o.state AS name |
+| | TableScan: h2o projection=Some([4]) |
+| | Projection: #h2o.city AS name |
+| | TableScan: h2o projection=Some([1]) |
+| physical_plan | ExecutionPlan(PlaceHolder) |
+| | ProjectionExec: expr=[state@0 as name] |
+| | ExecutionPlan(PlaceHolder) |
+| | ProjectionExec: expr=[state@1 as state] |
+| | RepartitionExec: partitioning=RoundRobinBatch(4) |
+| | DeduplicateExec: [state@1 ASC,city@0 ASC,time@2 ASC] |
+| | SortPreservingMergeExec: [state@1 ASC,city@0 ASC,time@2 ASC] |
+| | ExecutionPlan(PlaceHolder) |
+| | RepartitionExec: partitioning=RoundRobinBatch(4) |
+| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+| | RepartitionExec: partitioning=RoundRobinBatch(4) |
+| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+| | RepartitionExec: partitioning=RoundRobinBatch(4) |
+| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+| | RepartitionExec: partitioning=RoundRobinBatch(4) |
+| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+| | ProjectionExec: expr=[city@0 as name] |
+| | ExecutionPlan(PlaceHolder) |
+| | ProjectionExec: expr=[city@0 as city] |
+| | RepartitionExec: partitioning=RoundRobinBatch(4) |
+| | DeduplicateExec: [state@1 ASC,city@0 ASC,time@2 ASC] |
+| | SortPreservingMergeExec: [state@1 ASC,city@0 ASC,time@2 ASC] |
+| | ExecutionPlan(PlaceHolder) |
+| | RepartitionExec: partitioning=RoundRobinBatch(4) |
+| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+| | RepartitionExec: partitioning=RoundRobinBatch(4) |
+| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+| | RepartitionExec: partitioning=RoundRobinBatch(4) |
+| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
+| | RepartitionExec: partitioning=RoundRobinBatch(4) |
+| | IOxReadFilterNode: table_name=h2o, chunks=1 predicate=Predicate |
++---------------+-----------------------------------------------------------------------------------+

@@ -2,11 +2,11 @@
 -- IOX_SETUP: OneMeasurementThreeChunksWithDuplicates

 -- Plan with order by
-explain verbose select time, state, city, min_temp, max_temp, area from h2o order by time, state, city;
+explain select time, state, city, min_temp, max_temp, area from h2o order by time, state, city;


 -- plan without order by
-explain verbose select time, state, city, min_temp, max_temp, area from h2o;
+EXPLAIN select time, state, city, min_temp, max_temp, area from h2o;

 -- Union plan
-EXPLAIN VERBOSE select state as name from h2o UNION ALL select city as name from h2o;
+EXPLAIN select state as name from h2o UNION ALL select city as name from h2o;

@@ -1,218 +1,167 @@
 -- Test Setup: TwoMeasurementsPredicatePushDown
--- SQL: EXPLAIN VERBOSE SELECT * from restaurant;
-+-----------------------------------------+---------------------------------------------------------------------------------------------+
-| plan_type | plan |
-+-----------------------------------------+---------------------------------------------------------------------------------------------+
-| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | TableScan: restaurant projection=None |
-| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
-| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
-| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
-| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
-+-----------------------------------------+---------------------------------------------------------------------------------------------+
--- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200;
-+-----------------------------------------+---------------------------------------------------------------------------------------------+
-| plan_type | plan |
-+-----------------------------------------+---------------------------------------------------------------------------------------------+
-| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | Filter: #restaurant.count Gt Int64(200) |
-| | TableScan: restaurant projection=None |
-| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | Filter: #restaurant.count Gt Int64(200) |
-| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
-| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | Filter: #restaurant.count Gt Int64(200) |
-| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
-| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
-| | FilterExec: CAST(count@0 AS Int64) > 200 |
-| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
-+-----------------------------------------+---------------------------------------------------------------------------------------------+
--- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200.0;
-+-----------------------------------------+---------------------------------------------------------------------------------------------+
-| plan_type | plan |
-+-----------------------------------------+---------------------------------------------------------------------------------------------+
-| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | Filter: #restaurant.count Gt Float64(200) |
-| | TableScan: restaurant projection=None |
-| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | Filter: #restaurant.count Gt Float64(200) |
-| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
-| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | Filter: #restaurant.count Gt Float64(200) |
-| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
-| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
-| | FilterExec: CAST(count@0 AS Float64) > 200 |
-| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
-+-----------------------------------------+---------------------------------------------------------------------------------------------+
--- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 4.0;
-+-----------------------------------------+---------------------------------------------------------------------------------------------+
-| plan_type | plan |
-+-----------------------------------------+---------------------------------------------------------------------------------------------+
-| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | Filter: #restaurant.system Gt Float64(4) |
-| | TableScan: restaurant projection=None |
-| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | Filter: #restaurant.system Gt Float64(4) |
-| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
-| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | Filter: #restaurant.system Gt Float64(4) |
-| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
-| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
-| | FilterExec: system@1 > 4 |
-| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
-+-----------------------------------------+---------------------------------------------------------------------------------------------+
--- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury';
-+-----------------------------------------+---------------------------------------------------------------------------------------------+
-| plan_type | plan |
-+-----------------------------------------+---------------------------------------------------------------------------------------------+
-| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") |
-| | TableScan: restaurant projection=None |
-| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") |
-| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
-| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") |
-| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
-| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
-| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury |
-| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
-+-----------------------------------------+---------------------------------------------------------------------------------------------+
--- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence');
-+-----------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| plan_type | plan |
-+-----------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") |
-| | TableScan: restaurant projection=None |
-| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") |
-| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
-| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") |
-| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
-| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
-| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence |
-| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
-+-----------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------+
--- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000;
-+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| plan_type | plan |
-+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) |
-| | TableScan: restaurant projection=None |
-| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) |
-| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
-| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) |
-| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
-| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
-| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence AND CAST(count@0 AS Int64) < 40000 |
-| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
-+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
--- SQL: EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and count < 40000;
-+-----------------------------------------+---------------------------------------------------------------------------------------------+
-| plan_type | plan |
-+-----------------------------------------+---------------------------------------------------------------------------------------------+
-| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) |
-| | TableScan: restaurant projection=None |
-| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) |
-| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
-| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) |
-| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
-| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
-| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(count@0 AS Int64) < 40000 |
-| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
-+-----------------------------------------+---------------------------------------------------------------------------------------------+
--- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 4.0 and system < 7.0;
-+-----------------------------------------+---------------------------------------------------------------------------------------------+
-| plan_type | plan |
-+-----------------------------------------+---------------------------------------------------------------------------------------------+
-| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
-| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) |
-| | TableScan: restaurant projection=None |
-| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) |
|
||||
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
|
||||
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) |
|
||||
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
|
||||
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
|
||||
| | FilterExec: system@1 > 4 AND system@1 < 7 |
|
||||
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
|
||||
+-----------------------------------------+---------------------------------------------------------------------------------------------+
|
||||
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and system < 7.0;
|
||||
+-----------------------------------------+---------------------------------------------------------------------------------------------+
|
||||
| plan_type | plan |
|
||||
+-----------------------------------------+---------------------------------------------------------------------------------------------+
|
||||
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) |
|
||||
| | TableScan: restaurant projection=None |
|
||||
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) |
|
||||
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
|
||||
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) |
|
||||
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
|
||||
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
|
||||
| | FilterExec: system@1 > 5 AND system@1 < 7 |
|
||||
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
|
||||
+-----------------------------------------+---------------------------------------------------------------------------------------------+
|
||||
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system;
|
||||
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||
| plan_type | plan |
|
||||
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system |
|
||||
| | TableScan: restaurant projection=None |
|
||||
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system |
|
||||
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
|
||||
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system |
|
||||
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
|
||||
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
|
||||
| | FilterExec: system@1 > 5 AND CAST(town@3 AS Utf8) != tewsbury AND 7 > system@1 |
|
||||
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
|
||||
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
|
||||
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading');
|
||||
+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| plan_type | plan |
|
||||
+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") |
|
||||
| | TableScan: restaurant projection=None |
|
||||
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") |
|
||||
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
|
||||
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") |
|
||||
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
|
||||
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
|
||||
| | FilterExec: system@1 > 5 AND tewsbury != CAST(town@3 AS Utf8) AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading |
|
||||
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
|
||||
+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
-- SQL: EXPLAIN VERBOSE SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00');
|
||||
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| plan_type | plan |
|
||||
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt totimestamp(Utf8("1970-01-01T00:00:00.000000130+00:00")) |
|
||||
| | TableScan: restaurant projection=None |
|
||||
| logical_plan after projection_push_down | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt totimestamp(Utf8("1970-01-01T00:00:00.000000130+00:00")) |
|
||||
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
|
||||
| logical_plan after simplify_expressions | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt totimestamp(Utf8("1970-01-01T00:00:00.000000130+00:00")) |
|
||||
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
|
||||
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
|
||||
| | FilterExec: 5 < system@1 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading AND time@2 > totimestamp(1970-01-01T00:00:00.000000130+00:00) |
|
||||
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
|
||||
+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
-- SQL: EXPLAIN SELECT * from restaurant;
|
||||
+---------------+---------------------------------------------------------------------------------------------+
|
||||
| plan_type | plan |
|
||||
+---------------+---------------------------------------------------------------------------------------------+
|
||||
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | TableScan: restaurant projection=Some([0, 1, 2, 3]) |
|
||||
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
|
||||
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
|
||||
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate |
|
||||
+---------------+---------------------------------------------------------------------------------------------+
|
||||
-- SQL: EXPLAIN SELECT * from restaurant where count > 200;
|
||||
+---------------+--------------------------------------------------------------------------------------------------------------+
|
||||
| plan_type | plan |
|
||||
+---------------+--------------------------------------------------------------------------------------------------------------+
|
||||
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: #restaurant.count Gt Int64(200) |
|
||||
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200)] |
|
||||
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
|
||||
| | CoalesceBatchesExec: target_batch_size=500 |
|
||||
| | FilterExec: CAST(count@0 AS Int64) > 200 |
|
||||
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
|
||||
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200)] |
|
||||
+---------------+--------------------------------------------------------------------------------------------------------------+
|
||||
-- SQL: EXPLAIN SELECT * from restaurant where count > 200.0;
|
||||
+---------------+----------------------------------------------------------------------------------------------------------------+
|
||||
| plan_type | plan |
|
||||
+---------------+----------------------------------------------------------------------------------------------------------------+
|
||||
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: #restaurant.count Gt Float64(200) |
|
||||
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Float64(200)] |
|
||||
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
|
||||
| | CoalesceBatchesExec: target_batch_size=500 |
|
||||
| | FilterExec: CAST(count@0 AS Float64) > 200 |
|
||||
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
|
||||
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Float64(200)] |
|
||||
+---------------+----------------------------------------------------------------------------------------------------------------+
|
||||
-- SQL: EXPLAIN SELECT * from restaurant where system > 4.0;
|
||||
+---------------+---------------------------------------------------------------------------------------------------------------+
|
||||
| plan_type | plan |
|
||||
+---------------+---------------------------------------------------------------------------------------------------------------+
|
||||
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: #restaurant.system Gt Float64(4) |
|
||||
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(4)] |
|
||||
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
|
||||
| | CoalesceBatchesExec: target_batch_size=500 |
|
||||
| | FilterExec: system@1 > 4 |
|
||||
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
|
||||
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(4)] |
|
||||
+---------------+---------------------------------------------------------------------------------------------------------------+
|
||||
-- SQL: EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury';
|
||||
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| plan_type | plan |
|
||||
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") |
|
||||
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200), #restaurant.town NotEq Utf8("tewsbury")] |
|
||||
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
|
||||
| | CoalesceBatchesExec: target_batch_size=500 |
|
||||
| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury |
|
||||
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
|
||||
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200), #town NotEq Utf8("tewsbury")] |
|
||||
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
-- SQL: EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence');
|
||||
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| plan_type | plan |
|
||||
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") |
|
||||
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200), #restaurant.town NotEq Utf8("tewsbury"), #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence")] |
|
||||
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
|
||||
| | CoalesceBatchesExec: target_batch_size=500 |
|
||||
| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence |
|
||||
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
|
||||
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200), #town NotEq Utf8("tewsbury")] |
|
||||
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
-- SQL: EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000;
|
||||
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| plan_type | plan |
|
||||
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence") And #restaurant.count Lt Int64(40000) |
|
||||
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200), #restaurant.town NotEq Utf8("tewsbury"), #restaurant.system Eq Int64(5) Or #restaurant.town Eq Utf8("lawrence"), #restaurant.count Lt Int64(40000)] |
|
||||
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
|
||||
| | CoalesceBatchesExec: target_batch_size=500 |
|
||||
| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 = CAST(5 AS Float64) OR CAST(town@3 AS Utf8) = lawrence AND CAST(count@0 AS Int64) < 40000 |
|
||||
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
|
||||
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200), #town NotEq Utf8("tewsbury"), #count Lt Int64(40000)] |
|
||||
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
-- SQL: EXPLAIN SELECT * from restaurant where count > 200 and count < 40000;
|
||||
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| plan_type | plan |
|
||||
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: #restaurant.count Gt Int64(200) And #restaurant.count Lt Int64(40000) |
|
||||
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.count Gt Int64(200), #restaurant.count Lt Int64(40000)] |
|
||||
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
|
||||
| | CoalesceBatchesExec: target_batch_size=500 |
|
||||
| | FilterExec: CAST(count@0 AS Int64) > 200 AND CAST(count@0 AS Int64) < 40000 |
|
||||
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
|
||||
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#count Gt Int64(200), #count Lt Int64(40000)] |
|
||||
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
-- SQL: EXPLAIN SELECT * from restaurant where system > 4.0 and system < 7.0;
|
||||
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| plan_type | plan |
|
||||
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: #restaurant.system Gt Float64(4) And #restaurant.system Lt Float64(7) |
|
||||
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(4), #restaurant.system Lt Float64(7)] |
|
||||
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
|
||||
| | CoalesceBatchesExec: target_batch_size=500 |
|
||||
| | FilterExec: system@1 > 4 AND system@1 < 7 |
|
||||
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
|
||||
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(4), #system Lt Float64(7)] |
|
||||
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
-- SQL: EXPLAIN SELECT * from restaurant where system > 5.0 and system < 7.0;
|
||||
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| plan_type | plan |
|
||||
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: #restaurant.system Gt Float64(5) And #restaurant.system Lt Float64(7) |
|
||||
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(5), #restaurant.system Lt Float64(7)] |
|
||||
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
|
||||
| | CoalesceBatchesExec: target_batch_size=500 |
|
||||
| | FilterExec: system@1 > 5 AND system@1 < 7 |
|
||||
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
|
||||
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(5), #system Lt Float64(7)] |
|
||||
+---------------+---------------------------------------------------------------------------------------------------------------------------------------+
|
||||
-- SQL: EXPLAIN SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system;
|
||||
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| plan_type | plan |
|
||||
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: #restaurant.system Gt Float64(5) And #restaurant.town NotEq Utf8("tewsbury") And Float64(7) Gt #restaurant.system |
|
||||
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(5), #restaurant.town NotEq Utf8("tewsbury"), Float64(7) Gt #restaurant.system] |
|
||||
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
|
||||
| | CoalesceBatchesExec: target_batch_size=500 |
|
||||
| | FilterExec: system@1 > 5 AND CAST(town@3 AS Utf8) != tewsbury AND 7 > system@1 |
|
||||
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
|
||||
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(5), #town NotEq Utf8("tewsbury"), Float64(7) Gt #system] |
|
||||
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
-- SQL: EXPLAIN SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading');
|
||||
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| plan_type | plan |
|
||||
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: #restaurant.system Gt Float64(5) And Utf8("tewsbury") NotEq #restaurant.town And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") |
|
||||
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[#restaurant.system Gt Float64(5), Utf8("tewsbury") NotEq #restaurant.town, #restaurant.system Lt Float64(7), #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading")] |
|
||||
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
|
||||
| | CoalesceBatchesExec: target_batch_size=500 |
|
||||
| | FilterExec: system@1 > 5 AND tewsbury != CAST(town@3 AS Utf8) AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading |
|
||||
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
|
||||
| | IOxReadFilterNode: table_name=restaurant, chunks=1 predicate=Predicate exprs: [#system Gt Float64(5), Utf8("tewsbury") NotEq #town, #system Lt Float64(7)] |
|
||||
+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
-- SQL: EXPLAIN SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00');
|
||||
+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| plan_type | plan |
|
||||
+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| logical_plan | Projection: #restaurant.count, #restaurant.system, #restaurant.time, #restaurant.town |
|
||||
| | Filter: Float64(5) Lt #restaurant.system And #restaurant.town NotEq Utf8("tewsbury") And #restaurant.system Lt Float64(7) And #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading") And #restaurant.time Gt TimestampNanosecond(130) |
|
||||
| | TableScan: restaurant projection=Some([0, 1, 2, 3]), filters=[Float64(5) Lt #restaurant.system, #restaurant.town NotEq Utf8("tewsbury"), #restaurant.system Lt Float64(7), #restaurant.count Eq Int64(632) Or #restaurant.town Eq Utf8("reading"), #restaurant.time Gt TimestampNanosecond(130)] |
|
||||
| physical_plan | ProjectionExec: expr=[count@0 as count, system@1 as system, time@2 as time, town@3 as town] |
|
||||
| | CoalesceBatchesExec: target_batch_size=500 |
|
||||
| | FilterExec: 5 < system@1 AND CAST(town@3 AS Utf8) != tewsbury AND system@1 < 7 AND CAST(count@0 AS Int64) = 632 OR CAST(town@3 AS Utf8) = reading AND time@2 > 130 |
|
||||
| | RepartitionExec: partitioning=RoundRobinBatch(4) |
|
||||
| | IOxReadFilterNode: table_name=restaurant, chunks=0 predicate=Predicate exprs: [Float64(5) Lt #system, #town NotEq Utf8("tewsbury"), #system Lt Float64(7), #time Gt TimestampNanosecond(130)] |
|
||||
+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
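One detail worth calling out in the last plan above: the `to_timestamp('1970-01-01T00:00:00.000000130+00:00')` literal is constant-folded into `TimestampNanosecond(130)`, i.e. 130 nanoseconds after the Unix epoch, which is why both the pushed-down predicate and the `FilterExec` compare `time` against the bare integer 130. A quick way to check that arithmetic is sketched below; it assumes the `chrono` crate and is illustrative only, not part of this diff.

```rust
use chrono::DateTime;

fn main() {
    // Parse the same RFC 3339 literal used in Test 11 and confirm it is
    // 130 ns after the Unix epoch, matching TimestampNanosecond(130) in the plan.
    let ts = DateTime::parse_from_rfc3339("1970-01-01T00:00:00.000000130+00:00").unwrap();
    assert_eq!(ts.timestamp_nanos(), 130);
    println!("{} ns since epoch", ts.timestamp_nanos());
}
```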
|
|
|
@ -2,44 +2,44 @@
|
|||
-- IOX_SETUP: TwoMeasurementsPredicatePushDown

-- Test 1: Select everything
EXPLAIN VERBOSE SELECT * from restaurant;
EXPLAIN SELECT * from restaurant;

-- Test 2: One push-down expression: count > 200
-- TODO: Make push-down predicates show up in explain verbose. Ticket #1538
EXPLAIN VERBOSE SELECT * from restaurant where count > 200;
EXPLAIN SELECT * from restaurant where count > 200;

-- Test 2.2: One push-down expression: count > 200.0
EXPLAIN VERBOSE SELECT * from restaurant where count > 200.0;
EXPLAIN SELECT * from restaurant where count > 200.0;

-- Test 2.3: One push-down expression: system > 4.0
EXPLAIN VERBOSE SELECT * from restaurant where system > 4.0;
EXPLAIN SELECT * from restaurant where system > 4.0;


-- Test 3: Two push-down expressions: count > 200 and town != 'tewsbury'
EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury';
EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury';

-- Test 4: Still two push-down expressions: count > 200 and town != 'tewsbury',
-- even though the results are different
EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence');
EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence');

-- Test 5: Three push-down expressions: count > 200 and town != 'tewsbury' and count < 40000
EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000;
EXPLAIN SELECT * from restaurant where count > 200 and town != 'tewsbury' and (system =5 or town = 'lawrence') and count < 40000;

-- Test 6: Two push-down expressions: count > 200 and count < 40000
EXPLAIN VERBOSE SELECT * from restaurant where count > 200 and count < 40000;
EXPLAIN SELECT * from restaurant where count > 200 and count < 40000;

-- Test 7: Two push-down expressions on float: system > 4.0 and system < 7.0
EXPLAIN VERBOSE SELECT * from restaurant where system > 4.0 and system < 7.0;
EXPLAIN SELECT * from restaurant where system > 4.0 and system < 7.0;

-- Test 8: Two push-down expressions on float: system > 5.0 and system < 7.0
EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and system < 7.0;
EXPLAIN SELECT * from restaurant where system > 5.0 and system < 7.0;

-- Test 9: Three push-down expressions: system > 5.0 and town != 'tewsbury' and system < 7.0
EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system;
EXPLAIN SELECT * from restaurant where system > 5.0 and town != 'tewsbury' and 7.0 > system;

-- Test 10: Three push-down expressions: system > 5.0 and town != 'tewsbury' and system < 7.0
EXPLAIN VERBOSE SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading');
EXPLAIN SELECT * from restaurant where system > 5.0 and 'tewsbury' != town and system < 7.0 and (count = 632 or town = 'reading');

-- Test 11: Four push-down expressions: system > 5.0 and town != 'tewsbury' and system < 7.0 and
-- time > to_timestamp('1970-01-01T00:00:00.000000130+00:00') rewritten to time GT INT(130)
EXPLAIN VERBOSE SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00');
EXPLAIN SELECT * from restaurant where 5.0 < system and town != 'tewsbury' and system < 7.0 and (count = 632 or town = 'reading') and time > to_timestamp('1970-01-01T00:00:00.000000130+00:00');
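For readers tracing how these cases end up in the `.expected` output above: each conjunct that survives pushdown is handed to the IOx table provider as a DataFusion logical `Expr`, which is roughly what the `IOxReadFilterNode ... exprs: [...]` lines print. Below is a minimal sketch of building the Test 6 predicate with DataFusion's expression builders; it assumes the `datafusion` 4.x `logical_plan` API pinned in this workspace and is illustrative, not code from this PR.

```rust
use datafusion::logical_plan::{col, lit, Expr};

/// Builds the two separately pushed-down conjuncts from Test 6:
/// `count > 200 and count < 40000`.
fn test6_pushdown_exprs() -> Vec<Expr> {
    vec![
        col("count").gt(lit(200_i64)),
        col("count").lt(lit(40000_i64)),
    ]
}

fn main() {
    // Debug-printing roughly mirrors the `#count Gt Int64(200)` notation in the plans.
    for expr in test6_pushdown_exprs() {
        println!("{:?}", expr);
    }
}
```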
|
|
|
@ -4,12 +4,16 @@ mod parse;
|
|||
mod setup;

use arrow::record_batch::RecordBatch;
use query::{exec::ExecutorType, frontend::sql::SqlQueryPlanner};
use query::{
    exec::{Executor, ExecutorType},
    frontend::sql::SqlQueryPlanner,
};
use snafu::{OptionExt, ResultExt, Snafu};
use std::{
    io::LineWriter,
    io::Write,
    path::{Path, PathBuf},
    sync::Arc,
};

use self::{parse::TestQueries, setup::TestSetup};
|
||||
|
@ -261,7 +265,13 @@ impl<W: Write> Runner<W> {
|
|||
writeln!(self.log, "Running scenario '{}'", scenario_name)?;
|
||||
writeln!(self.log, "SQL: '{:#?}'", sql)?;
|
||||
let planner = SqlQueryPlanner::default();
|
||||
let executor = db.executor();
|
||||
let num_threads = 1;
|
||||
let mut executor = Executor::new(num_threads);
|
||||
|
||||
// hardcode concurrency in tests as by default is is the
|
||||
// number of cores, which varies across machines
|
||||
executor.config_mut().set_concurrency(4);
|
||||
let executor = Arc::new(executor);
|
||||
|
||||
let physical_plan = planner
|
||||
.query(db, &sql, executor.as_ref())
|
||||
|
|
|
@ -184,18 +184,19 @@ async fn sql_select_from_information_schema_tables() {
|
|||
// validate we have access to information schema for listing table
|
||||
// names
|
||||
let expected = vec![
|
||||
"+---------------+--------------------+---------------+------------+",
|
||||
"| table_catalog | table_schema | table_name | table_type |",
|
||||
"+---------------+--------------------+---------------+------------+",
|
||||
"| public | information_schema | columns | VIEW |",
|
||||
"| public | information_schema | tables | VIEW |",
|
||||
"| public | iox | h2o | BASE TABLE |",
|
||||
"| public | iox | o2 | BASE TABLE |",
|
||||
"| public | system | chunk_columns | BASE TABLE |",
|
||||
"| public | system | chunks | BASE TABLE |",
|
||||
"| public | system | columns | BASE TABLE |",
|
||||
"| public | system | operations | BASE TABLE |",
|
||||
"+---------------+--------------------+---------------+------------+",
|
||||
"+---------------+--------------------+---------------------+------------+",
|
||||
"| table_catalog | table_schema | table_name | table_type |",
|
||||
"+---------------+--------------------+---------------------+------------+",
|
||||
"| public | information_schema | columns | VIEW |",
|
||||
"| public | information_schema | tables | VIEW |",
|
||||
"| public | iox | h2o | BASE TABLE |",
|
||||
"| public | iox | o2 | BASE TABLE |",
|
||||
"| public | system | chunk_columns | BASE TABLE |",
|
||||
"| public | system | chunks | BASE TABLE |",
|
||||
"| public | system | columns | BASE TABLE |",
|
||||
"| public | system | operations | BASE TABLE |",
|
||||
"| public | system | persistence_windows | BASE TABLE |",
|
||||
"+---------------+--------------------+---------------------+------------+",
|
||||
];
|
||||
run_sql_test_case!(
|
||||
TwoMeasurementsManyFields {},
|
||||
|
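The expected vector in the hunk above is diffed line-by-line against the pretty-printed query result, which is why adding the `persistence_windows` system table also widens the `table_name` column in every border row. A hedged sketch of the underlying formatting follows, using the plain `arrow` API rather than IOx's test macro; the data is made up for illustration.

```rust
use std::sync::Arc;

use arrow::{
    array::{ArrayRef, StringArray},
    datatypes::{DataType, Field, Schema},
    record_batch::RecordBatch,
    util::pretty::pretty_format_batches,
};

fn main() {
    // A stand-in result with a single column; the real tests format the batches
    // returned by the SQL frontend and compare them against the expected vec.
    let schema = Arc::new(Schema::new(vec![Field::new(
        "table_name",
        DataType::Utf8,
        false,
    )]));
    let table_names: ArrayRef =
        Arc::new(StringArray::from(vec!["chunks", "persistence_windows"]));
    let batch = RecordBatch::try_new(schema, vec![table_names]).unwrap();

    // The column is sized to its widest value, so a longer table name changes
    // every row of the expected output, exactly as in the diff above.
    println!("{}", pretty_format_batches(&[batch]).unwrap());
}
```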
|
|
@ -8,7 +8,7 @@ use data_types::{
|
|||
DatabaseName,
|
||||
};
|
||||
use metrics::MetricRegistry;
|
||||
use object_store::{path::ObjectStorePath, ObjectStore};
|
||||
use object_store::{path::ObjectStorePath, ObjectStore, ObjectStoreApi};
|
||||
use parquet_file::catalog::PreservedCatalog;
|
||||
use query::exec::Executor;
|
||||
use write_buffer::config::WriteBufferConfig;
|
||||
|
@ -16,9 +16,13 @@ use write_buffer::config::WriteBufferConfig;
|
|||
/// This module contains code for managing the configuration of the server.
|
||||
use crate::{
|
||||
db::{catalog::Catalog, DatabaseToCommit, Db},
|
||||
Error, JobRegistry, Result,
|
||||
DatabaseAlreadyExists, DatabaseNotFound, DatabaseReserved, Error,
|
||||
InvalidDatabaseStateTransition, JobRegistry, Result, RulesDatabaseNameMismatch,
|
||||
ServerShuttingDown,
|
||||
};
|
||||
use object_store::path::Path;
|
||||
use observability_deps::tracing::{self, error, info, warn, Instrument};
|
||||
use snafu::{ensure, OptionExt};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
|
@ -34,10 +38,14 @@ pub(crate) const DB_RULES_FILE_NAME: &str = "rules.pb";
|
|||
/// run to completion if the tokio runtime is dropped
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct Config {
|
||||
shutdown: CancellationToken,
|
||||
jobs: Arc<JobRegistry>,
|
||||
state: RwLock<ConfigState>,
|
||||
object_store: Arc<ObjectStore>,
|
||||
exec: Arc<Executor>,
|
||||
server_id: ServerId,
|
||||
metric_registry: Arc<MetricRegistry>,
|
||||
|
||||
shutdown: CancellationToken,
|
||||
state: RwLock<ConfigState>,
|
||||
}
|
||||
|
||||
pub(crate) enum UpdateError<E> {
|
||||
|
@ -55,14 +63,20 @@ impl Config {
|
|||
/// Create new empty config.
|
||||
pub(crate) fn new(
|
||||
jobs: Arc<JobRegistry>,
|
||||
object_store: Arc<ObjectStore>,
|
||||
exec: Arc<Executor>,
|
||||
server_id: ServerId,
|
||||
metric_registry: Arc<MetricRegistry>,
|
||||
remote_template: Option<RemoteTemplate>,
|
||||
) -> Self {
|
||||
Self {
|
||||
jobs,
|
||||
object_store,
|
||||
exec,
|
||||
server_id,
|
||||
metric_registry,
|
||||
shutdown: Default::default(),
|
||||
state: RwLock::new(ConfigState::new(remote_template)),
|
||||
jobs,
|
||||
metric_registry,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -77,33 +91,20 @@ impl Config {
|
|||
    /// This only works if the database is not yet known. To recover a database out of an uninitialized state, see
    /// [`recover_db`](Self::recover_db). To do maintenance work on data linked to the database (e.g. the catalog)
    /// without initializing it, see [`block_db`](Self::block_db).
    pub(crate) fn create_db(
        &self,
        object_store: Arc<ObjectStore>,
        exec: Arc<Executor>,
        server_id: ServerId,
        db_name: DatabaseName<'static>,
    ) -> Result<DatabaseHandle<'_>> {
    pub(crate) fn create_db(&self, db_name: DatabaseName<'static>) -> Result<DatabaseHandle<'_>> {
        let mut state = self.state.write().expect("mutex poisoned");
        if state.reservations.contains(&db_name) {
            return Err(Error::DatabaseReserved {
                db_name: db_name.to_string(),
            });
        }
        if state.databases.contains_key(&db_name) {
            return Err(Error::DatabaseAlreadyExists {
                db_name: db_name.to_string(),
            });
        }
        ensure!(
            !state.reservations.contains(&db_name),
            DatabaseReserved { db_name }
        );
        ensure!(
            !state.databases.contains_key(&db_name),
            DatabaseAlreadyExists { db_name }
        );

        state.reservations.insert(db_name.clone());
        Ok(DatabaseHandle {
            state: Some(Arc::new(DatabaseState::Known {
                object_store,
                exec,
                server_id,
                db_name,
            })),
            state: Some(Arc::new(DatabaseState::Known { db_name })),
            config: &self,
        })
    }
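The rewrite in this hunk (and in the ones below) swaps hand-written `return Err(Error::...)` blocks for snafu's `ensure!` macro and generated context selectors. A self-contained sketch of that pattern follows, using a throwaway error type rather than IOx's real `Error` enum; snafu 0.6 (the version in this workspace) is assumed.

```rust
use snafu::{ensure, OptionExt, Snafu};

#[derive(Debug, Snafu)]
enum Error {
    #[snafu(display("database ({}) already reserved", db_name))]
    DatabaseReserved { db_name: String },

    #[snafu(display("database ({}) not found", db_name))]
    DatabaseNotFound { db_name: String },
}

type Result<T, E = Error> = std::result::Result<T, E>;

fn reserve(reservations: &[&str], db_name: &str) -> Result<()> {
    // `ensure!` early-returns through the context selector when the condition
    // is false, replacing the explicit `if ... { return Err(...) }` blocks.
    ensure!(
        !reservations.contains(&db_name),
        DatabaseReserved { db_name }
    );
    Ok(())
}

fn lookup<'a>(databases: &'a [&'a str], db_name: &str) -> Result<&'a str> {
    // `.context(...)` turns the `None` case into the selected error variant,
    // mirroring the `.ok_or_else(|| Error::DatabaseNotFound { .. })` it replaces.
    databases
        .iter()
        .find(|&&name| name == db_name)
        .copied()
        .context(DatabaseNotFound { db_name })
}

fn main() {
    assert!(reserve(&["foo"], "foo").is_err());
    assert!(reserve(&["foo"], "bar").is_ok());
    println!("{}", lookup(&["foo"], "bar").unwrap_err());
}
```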
|
||||
|
@ -115,32 +116,27 @@ impl Config {
|
|||
/// While the handle is held, no other operations for the given database can be executed.
|
||||
///
|
||||
/// This only works if the database is known but is uninitialized. To create a new database that is not yet known,
|
||||
/// see [`create_db`](Self::create_db). To do maintainance work on data linked to the database (e.g. the catalog)
|
||||
/// see [`create_db`](Self::create_db). To do maintenance work on data linked to the database (e.g. the catalog)
|
||||
/// without initializing it, see [`block_db`](Self::block_db).
|
||||
pub(crate) fn recover_db(&self, db_name: DatabaseName<'static>) -> Result<DatabaseHandle<'_>> {
|
||||
let mut state = self.state.write().expect("mutex poisoned");
|
||||
if state.reservations.contains(&db_name) {
|
||||
return Err(Error::DatabaseReserved {
|
||||
db_name: db_name.to_string(),
|
||||
});
|
||||
}
|
||||
ensure!(
|
||||
!state.reservations.contains(&db_name),
|
||||
DatabaseReserved { db_name }
|
||||
);
|
||||
|
||||
let db_state =
|
||||
state
|
||||
.databases
|
||||
.get(&db_name)
|
||||
.cloned()
|
||||
.ok_or_else(|| Error::DatabaseNotFound {
|
||||
db_name: db_name.to_string(),
|
||||
})?;
|
||||
let db_state = state
|
||||
.databases
|
||||
.get(&db_name)
|
||||
.cloned()
|
||||
.context(DatabaseNotFound { db_name: &db_name })?;
|
||||
|
||||
if db_state.is_initialized() {
|
||||
return Err(Error::DatabaseAlreadyExists {
|
||||
db_name: db_name.to_string(),
|
||||
});
|
||||
}
|
||||
ensure!(
|
||||
!db_state.is_initialized(),
|
||||
DatabaseAlreadyExists { db_name }
|
||||
);
|
||||
|
||||
state.reservations.insert(db_name.clone());
|
||||
state.reservations.insert(db_name);
|
||||
Ok(DatabaseHandle {
|
||||
state: Some(db_state),
|
||||
config: &self,
|
||||
|
@ -159,16 +155,14 @@ impl Config {
|
|||
db_name: DatabaseName<'static>,
|
||||
) -> Result<BlockDatabaseGuard<'_>> {
|
||||
let mut state = self.state.write().expect("mutex poisoned");
|
||||
if state.reservations.contains(&db_name) {
|
||||
return Err(Error::DatabaseReserved {
|
||||
db_name: db_name.to_string(),
|
||||
});
|
||||
}
|
||||
if state.databases.contains_key(&db_name) {
|
||||
return Err(Error::DatabaseAlreadyExists {
|
||||
db_name: db_name.to_string(),
|
||||
});
|
||||
}
|
||||
ensure!(
|
||||
!state.reservations.contains(&db_name),
|
||||
DatabaseReserved { db_name }
|
||||
);
|
||||
ensure!(
|
||||
!state.databases.contains_key(&db_name),
|
||||
DatabaseAlreadyExists { db_name }
|
||||
);
|
||||
|
||||
state.reservations.insert(db_name.clone());
|
||||
Ok(BlockDatabaseGuard {
|
||||
|
@ -228,11 +222,9 @@ impl Config {
|
|||
// TODO: implement for non-initialized databases
|
||||
let db = self
|
||||
.db_initialized(db_name)
|
||||
.ok_or_else(|| Error::DatabaseNotFound {
|
||||
db_name: db_name.to_string(),
|
||||
})?;
|
||||
.context(DatabaseNotFound { db_name })?;
|
||||
|
||||
db.update_db_rules(update).map_err(UpdateError::Closure)
|
||||
db.update_rules(update).map_err(UpdateError::Closure)
|
||||
}
|
||||
|
||||
/// Get all registered remote servers.
|
||||
|
@ -311,6 +303,24 @@ impl Config {
|
|||
pub fn metrics_registry(&self) -> Arc<MetricRegistry> {
|
||||
Arc::clone(&self.metric_registry)
|
||||
}
|
||||
|
||||
/// Returns the object store of this server
|
||||
pub fn object_store(&self) -> Arc<ObjectStore> {
|
||||
Arc::clone(&self.object_store)
|
||||
}
|
||||
|
||||
/// Returns the server id of this server
|
||||
pub fn server_id(&self) -> ServerId {
|
||||
self.server_id
|
||||
}
|
||||
|
||||
/// Base location in object store for this server.
|
||||
pub fn root_path(&self) -> Path {
|
||||
let id = self.server_id.get();
|
||||
let mut path = self.object_store.new_path();
|
||||
path.push_dir(format!("{}", id));
|
||||
path
|
||||
}
|
||||
}
|
||||
|
||||
/// Get object store path for the database config under the given root (= path under with the server with the current ID
|
||||
|
@ -373,41 +383,14 @@ impl RemoteTemplate {
|
|||
}
|
||||
|
||||
/// Internal representation of the different database states.
|
||||
///
|
||||
/// # Shared Data During Transitions
|
||||
/// The following elements can safely be shared between states because they won't be poisoned by any half-done
|
||||
/// transition (e.g. starting a transition and then failing due to an IO error):
|
||||
/// - `object_store`
|
||||
/// - `exec`
|
||||
///
|
||||
/// The following elements can trivially be copied from one state to the next:
|
||||
/// - `server_id`
|
||||
/// - `db_name`
|
||||
///
|
||||
/// The following elements MUST be copied from one state to the next because partial modifications are not allowed:
|
||||
/// - `rules`
|
||||
///
|
||||
/// Exceptions to the above rules are the following states:
|
||||
/// - [`Replay`](Self::Replay): replaying twice should (apart from some performance penalties) not do much harm
|
||||
/// - [`Initialized`](Self::Initialized): the final state is not advanced to anything else
|
||||
#[derive(Debug)]
|
||||
#[allow(clippy::large_enum_variant)]
|
||||
enum DatabaseState {
|
||||
/// Database is known but nothing is loaded.
|
||||
Known {
|
||||
object_store: Arc<ObjectStore>,
|
||||
exec: Arc<Executor>,
|
||||
server_id: ServerId,
|
||||
db_name: DatabaseName<'static>,
|
||||
},
|
||||
Known { db_name: DatabaseName<'static> },
|
||||
|
||||
/// Rules are loaded
|
||||
RulesLoaded {
|
||||
object_store: Arc<ObjectStore>,
|
||||
exec: Arc<Executor>,
|
||||
server_id: ServerId,
|
||||
rules: Arc<DatabaseRules>,
|
||||
},
|
||||
RulesLoaded { rules: Arc<DatabaseRules> },
|
||||
|
||||
/// Catalog is loaded but data from sequencers / write buffers is not yet replayed.
|
||||
Replay { db: Arc<Db> },
|
||||
|
@ -465,24 +448,6 @@ impl DatabaseState {
|
|||
}
|
||||
}
|
||||
|
||||
fn object_store(&self) -> Arc<ObjectStore> {
|
||||
match self {
|
||||
DatabaseState::Known { object_store, .. } => Arc::clone(object_store),
|
||||
DatabaseState::RulesLoaded { object_store, .. } => Arc::clone(object_store),
|
||||
DatabaseState::Replay { db, .. } => Arc::clone(&db.store),
|
||||
DatabaseState::Initialized { db, .. } => Arc::clone(&db.store),
|
||||
}
|
||||
}
|
||||
|
||||
fn server_id(&self) -> ServerId {
|
||||
match self {
|
||||
DatabaseState::Known { server_id, .. } => *server_id,
|
||||
DatabaseState::RulesLoaded { server_id, .. } => *server_id,
|
||||
DatabaseState::Replay { db, .. } => db.server_id,
|
||||
DatabaseState::Initialized { db, .. } => db.server_id,
|
||||
}
|
||||
}
|
||||
|
||||
fn rules(&self) -> Option<Arc<DatabaseRules>> {
|
||||
match self {
|
||||
DatabaseState::Known { .. } => None,
|
||||
|
@ -548,12 +513,12 @@ impl<'a> DatabaseHandle<'a> {
|
|||
|
||||
/// Get object store.
|
||||
pub fn object_store(&self) -> Arc<ObjectStore> {
|
||||
self.state().object_store()
|
||||
Arc::clone(&self.config.object_store)
|
||||
}
|
||||
|
||||
/// Get server ID.
|
||||
pub fn server_id(&self) -> ServerId {
|
||||
self.state().server_id()
|
||||
self.config.server_id
|
||||
}
|
||||
|
||||
/// Get metrics registry.
|
||||
|
@ -592,32 +557,26 @@ impl<'a> DatabaseHandle<'a> {
|
|||
/// Advance database state to [`RulesLoaded`](DatabaseStateCode::RulesLoaded).
|
||||
pub fn advance_rules_loaded(&mut self, rules: DatabaseRules) -> Result<()> {
|
||||
match self.state().as_ref() {
|
||||
DatabaseState::Known {
|
||||
object_store,
|
||||
exec,
|
||||
server_id,
|
||||
db_name,
|
||||
} => {
|
||||
if db_name != &rules.name {
|
||||
return Err(Error::RulesDatabaseNameMismatch {
|
||||
actual: rules.name.to_string(),
|
||||
expected: db_name.to_string(),
|
||||
});
|
||||
}
|
||||
DatabaseState::Known { db_name } => {
|
||||
ensure!(
|
||||
db_name == &rules.name,
|
||||
RulesDatabaseNameMismatch {
|
||||
actual: rules.name,
|
||||
expected: db_name,
|
||||
}
|
||||
);
|
||||
|
||||
self.state = Some(Arc::new(DatabaseState::RulesLoaded {
|
||||
object_store: Arc::clone(&object_store),
|
||||
exec: Arc::clone(&exec),
|
||||
server_id: *server_id,
|
||||
rules: Arc::new(rules),
|
||||
}));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
state => Err(Error::InvalidDatabaseStateTransition {
|
||||
state => InvalidDatabaseStateTransition {
|
||||
actual: state.code(),
|
||||
expected: DatabaseStateCode::Known,
|
||||
}),
|
||||
}
|
||||
.fail(),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -629,16 +588,11 @@ impl<'a> DatabaseHandle<'a> {
|
|||
write_buffer: Option<WriteBufferConfig>,
|
||||
) -> Result<()> {
|
||||
match self.state().as_ref() {
|
||||
DatabaseState::RulesLoaded {
|
||||
object_store,
|
||||
exec,
|
||||
server_id,
|
||||
rules,
|
||||
} => {
|
||||
DatabaseState::RulesLoaded { rules } => {
|
||||
let database_to_commit = DatabaseToCommit {
|
||||
server_id: *server_id,
|
||||
object_store: Arc::clone(&object_store),
|
||||
exec: Arc::clone(&exec),
|
||||
server_id: self.config.server_id,
|
||||
object_store: Arc::clone(&self.config.object_store),
|
||||
exec: Arc::clone(&self.config.exec),
|
||||
preserved_catalog,
|
||||
catalog,
|
||||
rules: Arc::clone(&rules),
|
||||
|
@ -650,10 +604,11 @@ impl<'a> DatabaseHandle<'a> {
|
|||
|
||||
Ok(())
|
||||
}
|
||||
state => Err(Error::InvalidDatabaseStateTransition {
|
||||
state => InvalidDatabaseStateTransition {
|
||||
actual: state.code(),
|
||||
expected: DatabaseStateCode::RulesLoaded,
|
||||
}),
|
||||
}
|
||||
.fail(),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -663,7 +618,7 @@ impl<'a> DatabaseHandle<'a> {
|
|||
DatabaseState::Replay { db } => {
|
||||
if self.config.shutdown.is_cancelled() {
|
||||
error!("server is shutting down");
|
||||
return Err(Error::ServerShuttingDown);
|
||||
return ServerShuttingDown.fail();
|
||||
}
|
||||
|
||||
let shutdown = self.config.shutdown.child_token();
|
||||
|
@ -686,10 +641,11 @@ impl<'a> DatabaseHandle<'a> {
|
|||
|
||||
Ok(())
|
||||
}
|
||||
state => Err(Error::InvalidDatabaseStateTransition {
|
||||
state => InvalidDatabaseStateTransition {
|
||||
actual: state.code(),
|
||||
expected: DatabaseStateCode::Replay,
|
||||
}),
|
||||
}
|
||||
.fail(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -730,40 +686,32 @@ mod test {
|
|||
use super::*;
|
||||
use std::num::NonZeroU32;
|
||||
|
||||
fn make_config(remote_template: Option<RemoteTemplate>) -> Config {
|
||||
let store = Arc::new(ObjectStore::new_in_memory());
|
||||
let server_id = ServerId::try_from(1).unwrap();
|
||||
let metric_registry = Arc::new(metrics::MetricRegistry::new());
|
||||
Config::new(
|
||||
Arc::new(JobRegistry::new()),
|
||||
Arc::clone(&store),
|
||||
Arc::new(Executor::new(1)),
|
||||
server_id,
|
||||
Arc::clone(&metric_registry),
|
||||
remote_template,
|
||||
)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn create_db() {
|
||||
// setup
|
||||
let name = DatabaseName::new("foo").unwrap();
|
||||
let store = Arc::new(ObjectStore::new_in_memory());
|
||||
let exec = Arc::new(Executor::new(1));
|
||||
let server_id = ServerId::try_from(1).unwrap();
|
||||
let metric_registry = Arc::new(metrics::MetricRegistry::new());
|
||||
let config = Config::new(
|
||||
Arc::new(JobRegistry::new()),
|
||||
Arc::clone(&metric_registry),
|
||||
None,
|
||||
);
|
||||
let config = make_config(None);
|
||||
let rules = DatabaseRules::new(name.clone());
|
||||
|
||||
// getting handle while DB is reserved => fails
|
||||
{
|
||||
let _db_reservation = config
|
||||
.create_db(
|
||||
Arc::clone(&store),
|
||||
Arc::clone(&exec),
|
||||
server_id,
|
||||
name.clone(),
|
||||
)
|
||||
.unwrap();
|
||||
let _db_reservation = config.create_db(name.clone()).unwrap();
|
||||
|
||||
let err = config
|
||||
.create_db(
|
||||
Arc::clone(&store),
|
||||
Arc::clone(&exec),
|
||||
server_id,
|
||||
name.clone(),
|
||||
)
|
||||
.unwrap_err();
|
||||
let err = config.create_db(name.clone()).unwrap_err();
|
||||
assert!(matches!(err, Error::DatabaseReserved { .. }));
|
||||
|
||||
let err = config.block_db(name.clone()).unwrap_err();
|
||||
|
@ -775,14 +723,7 @@ mod test {
|
|||
|
||||
// name in rules must match reserved name
|
||||
{
|
||||
let mut db_reservation = config
|
||||
.create_db(
|
||||
Arc::clone(&store),
|
||||
Arc::clone(&exec),
|
||||
server_id,
|
||||
DatabaseName::new("bar").unwrap(),
|
||||
)
|
||||
.unwrap();
|
||||
let mut db_reservation = config.create_db(DatabaseName::new("bar").unwrap()).unwrap();
|
||||
|
||||
let err = db_reservation
|
||||
.advance_rules_loaded(rules.clone())
|
||||
|
@ -795,14 +736,7 @@ mod test {
|
|||
|
||||
// handle.abort just works (aka does not mess up the transaction afterwards)
|
||||
{
|
||||
let db_reservation = config
|
||||
.create_db(
|
||||
Arc::clone(&store),
|
||||
Arc::clone(&exec),
|
||||
server_id,
|
||||
DatabaseName::new("bar").unwrap(),
|
||||
)
|
||||
.unwrap();
|
||||
let db_reservation = config.create_db(DatabaseName::new("bar").unwrap()).unwrap();
|
||||
|
||||
db_reservation.abort();
|
||||
}
|
||||
|
@ -812,21 +746,14 @@ mod test {
|
|||
|
||||
// create DB successfully
|
||||
{
|
||||
let mut db_reservation = config
|
||||
.create_db(
|
||||
Arc::clone(&store),
|
||||
Arc::clone(&exec),
|
||||
server_id,
|
||||
name.clone(),
|
||||
)
|
||||
.unwrap();
|
||||
let mut db_reservation = config.create_db(name.clone()).unwrap();
|
||||
|
||||
db_reservation.advance_rules_loaded(rules).unwrap();
|
||||
|
||||
let (preserved_catalog, catalog) = load_or_create_preserved_catalog(
|
||||
&name,
|
||||
Arc::clone(&store),
|
||||
server_id,
|
||||
config.object_store(),
|
||||
config.server_id(),
|
||||
config.metrics_registry(),
|
||||
false,
|
||||
)
|
||||
|
@ -866,14 +793,7 @@ mod test {
|
|||
assert!(matches!(err, Error::DatabaseAlreadyExists { .. }));
|
||||
|
||||
// create DB a second time => fail
|
||||
let err = config
|
||||
.create_db(
|
||||
Arc::clone(&store),
|
||||
Arc::clone(&exec),
|
||||
server_id,
|
||||
name.clone(),
|
||||
)
|
||||
.unwrap_err();
|
||||
let err = config.create_db(name.clone()).unwrap_err();
|
||||
assert!(matches!(err, Error::DatabaseAlreadyExists { .. }));
|
||||
|
||||
// block fully initialized DB => fail
|
||||
|
@ -888,40 +808,18 @@ mod test {
|
|||
async fn recover_db() {
|
||||
// setup
|
||||
let name = DatabaseName::new("foo").unwrap();
|
||||
let store = Arc::new(ObjectStore::new_in_memory());
|
||||
let exec = Arc::new(Executor::new(1));
|
||||
let server_id = ServerId::try_from(1).unwrap();
|
||||
let metric_registry = Arc::new(metrics::MetricRegistry::new());
|
||||
let config = Config::new(
|
||||
Arc::new(JobRegistry::new()),
|
||||
Arc::clone(&metric_registry),
|
||||
None,
|
||||
);
|
||||
let config = make_config(None);
|
||||
let rules = DatabaseRules::new(name.clone());
|
||||
|
||||
// create DB but don't continue with rules loaded (e.g. because the rules file is broken)
|
||||
{
|
||||
let db_reservation = config
|
||||
.create_db(
|
||||
Arc::clone(&store),
|
||||
Arc::clone(&exec),
|
||||
server_id,
|
||||
name.clone(),
|
||||
)
|
||||
.unwrap();
|
||||
let db_reservation = config.create_db(name.clone()).unwrap();
|
||||
db_reservation.commit();
|
||||
}
|
||||
assert!(config.has_uninitialized_database(&name));
|
||||
|
||||
// create DB while it is uninitialized => fail
|
||||
let err = config
|
||||
.create_db(
|
||||
Arc::clone(&store),
|
||||
Arc::clone(&exec),
|
||||
server_id,
|
||||
name.clone(),
|
||||
)
|
||||
.unwrap_err();
|
||||
let err = config.create_db(name.clone()).unwrap_err();
|
||||
assert!(matches!(err, Error::DatabaseAlreadyExists { .. }));
|
||||
|
||||
// recover an unknown DB => fail
|
||||
|
@ -935,19 +833,19 @@ mod test {
|
|||
let mut db_reservation = config.recover_db(name.clone()).unwrap();
|
||||
assert_eq!(db_reservation.state_code(), DatabaseStateCode::Known);
|
||||
assert_eq!(db_reservation.db_name(), name);
|
||||
assert_eq!(db_reservation.server_id(), server_id);
|
||||
assert_eq!(db_reservation.server_id(), config.server_id());
|
||||
assert!(db_reservation.rules().is_none());
|
||||
|
||||
db_reservation.advance_rules_loaded(rules).unwrap();
|
||||
assert_eq!(db_reservation.state_code(), DatabaseStateCode::RulesLoaded);
|
||||
assert_eq!(db_reservation.db_name(), name);
|
||||
assert_eq!(db_reservation.server_id(), server_id);
|
||||
assert_eq!(db_reservation.server_id(), config.server_id());
|
||||
assert!(db_reservation.rules().is_some());
|
||||
|
||||
let (preserved_catalog, catalog) = load_or_create_preserved_catalog(
|
||||
&name,
|
||||
Arc::clone(&store),
|
||||
server_id,
|
||||
config.object_store(),
|
||||
config.server_id(),
|
||||
config.metrics_registry(),
|
||||
false,
|
||||
)
|
||||
|
@ -958,13 +856,13 @@ mod test {
|
|||
.unwrap();
|
||||
assert_eq!(db_reservation.state_code(), DatabaseStateCode::Replay);
|
||||
assert_eq!(db_reservation.db_name(), name);
|
||||
assert_eq!(db_reservation.server_id(), server_id);
|
||||
assert_eq!(db_reservation.server_id(), config.server_id());
|
||||
assert!(db_reservation.rules().is_some());
|
||||
|
||||
db_reservation.advance_init().unwrap();
|
||||
assert_eq!(db_reservation.state_code(), DatabaseStateCode::Initialized);
|
||||
assert_eq!(db_reservation.db_name(), name);
|
||||
assert_eq!(db_reservation.server_id(), server_id);
|
||||
assert_eq!(db_reservation.server_id(), config.server_id());
|
||||
assert!(db_reservation.rules().is_some());
|
||||
|
||||
db_reservation.commit();
|
||||
|
@ -978,14 +876,7 @@ mod test {
|
|||
assert!(matches!(err, Error::DatabaseAlreadyExists { .. }));
|
||||
|
||||
// create recovered DB => fail
|
||||
let err = config
|
||||
.create_db(
|
||||
Arc::clone(&store),
|
||||
Arc::clone(&exec),
|
||||
server_id,
|
||||
name.clone(),
|
||||
)
|
||||
.unwrap_err();
|
||||
let err = config.create_db(name.clone()).unwrap_err();
|
||||
assert!(matches!(err, Error::DatabaseAlreadyExists { .. }));
|
||||
|
||||
// block recovered DB => fail
|
||||
|
@ -1000,28 +891,13 @@ mod test {
|
|||
async fn block_db() {
|
||||
// setup
|
||||
let name = DatabaseName::new("foo").unwrap();
|
||||
let store = Arc::new(ObjectStore::new_in_memory());
|
||||
let exec = Arc::new(Executor::new(1));
|
||||
let server_id = ServerId::try_from(1).unwrap();
|
||||
let metric_registry = Arc::new(metrics::MetricRegistry::new());
|
||||
let config = Config::new(
|
||||
Arc::new(JobRegistry::new()),
|
||||
Arc::clone(&metric_registry),
|
||||
None,
|
||||
);
|
||||
let config = make_config(None);
|
||||
|
||||
// block DB
|
||||
let handle = config.block_db(name.clone()).unwrap();
|
||||
|
||||
// create while blocked => fail
|
||||
let err = config
|
||||
.create_db(
|
||||
Arc::clone(&store),
|
||||
Arc::clone(&exec),
|
||||
server_id,
|
||||
name.clone(),
|
||||
)
|
||||
.unwrap_err();
|
||||
let err = config.create_db(name.clone()).unwrap_err();
|
||||
assert!(matches!(err, Error::DatabaseReserved { .. }));
|
||||
|
||||
// recover while blocked => fail
|
||||
|
@ -1034,14 +910,7 @@ mod test {
|
|||
|
||||
// unblock => DB can be created
|
||||
drop(handle);
|
||||
config
|
||||
.create_db(
|
||||
Arc::clone(&store),
|
||||
Arc::clone(&exec),
|
||||
server_id,
|
||||
name.clone(),
|
||||
)
|
||||
.unwrap();
|
||||
config.create_db(name.clone()).unwrap();
|
||||
|
||||
// cleanup
|
||||
config.drain().await
|
||||
|
@ -1051,20 +920,12 @@ mod test {
|
|||
async fn test_db_drop() {
|
||||
// setup
|
||||
let name = DatabaseName::new("foo").unwrap();
|
||||
let store = Arc::new(ObjectStore::new_in_memory());
|
||||
let exec = Arc::new(Executor::new(1));
|
||||
let server_id = ServerId::try_from(1).unwrap();
|
||||
let metric_registry = Arc::new(metrics::MetricRegistry::new());
|
||||
let config = Config::new(
|
||||
Arc::new(JobRegistry::new()),
|
||||
Arc::clone(&metric_registry),
|
||||
None,
|
||||
);
|
||||
let config = make_config(None);
|
||||
let rules = DatabaseRules::new(name.clone());
|
||||
let (preserved_catalog, catalog) = load_or_create_preserved_catalog(
|
||||
&name,
|
||||
Arc::clone(&store),
|
||||
server_id,
|
||||
config.object_store(),
|
||||
config.server_id(),
|
||||
config.metrics_registry(),
|
||||
false,
|
||||
)
|
||||
|
@ -1072,14 +933,7 @@ mod test {
|
|||
.unwrap();
|
||||
|
||||
// create DB
|
||||
let mut db_reservation = config
|
||||
.create_db(
|
||||
Arc::clone(&store),
|
||||
Arc::clone(&exec),
|
||||
server_id,
|
||||
name.clone(),
|
||||
)
|
||||
.unwrap();
|
||||
let mut db_reservation = config.create_db(name.clone()).unwrap();
|
||||
db_reservation.advance_rules_loaded(rules).unwrap();
|
||||
db_reservation
|
||||
.advance_replay(preserved_catalog, catalog, None)
|
||||
|
@ -1126,12 +980,7 @@ mod test {
|
|||
|
||||
#[test]
|
||||
fn resolve_remote() {
|
||||
let metric_registry = Arc::new(metrics::MetricRegistry::new());
|
||||
let config = Config::new(
|
||||
Arc::new(JobRegistry::new()),
|
||||
Arc::clone(&metric_registry),
|
||||
Some(RemoteTemplate::new("http://iox-query-{id}:8082")),
|
||||
);
|
||||
let config = make_config(Some(RemoteTemplate::new("http://iox-query-{id}:8082")));
|
||||
|
||||
let server_id = ServerId::new(NonZeroU32::new(42).unwrap());
|
||||
let remote = config.resolve_remote(server_id);
|
||||
|
|
server/src/db.rs
|
@ -50,7 +50,7 @@ use std::{
|
|||
time::{Duration, Instant},
|
||||
};
|
||||
use write_buffer::config::WriteBufferConfig;
|
||||
use write_buffer::core::WriteBufferError;
|
||||
use write_buffer::core::{FetchHighWatermark, WriteBufferError};
|
||||
|
||||
pub mod access;
|
||||
pub mod catalog;
|
||||
|
@ -144,6 +144,91 @@ pub enum Error {
|
|||
|
||||
pub type Result<T, E = Error> = std::result::Result<T, E>;
|
||||
|
||||
/// Metrics for data ingest via write buffer.
|
||||
#[derive(Debug)]
|
||||
struct WriteBufferIngestMetrics {
|
||||
/// Metrics domain
|
||||
domain: Arc<metrics::Domain>,
|
||||
}
|
||||
|
||||
impl WriteBufferIngestMetrics {
|
||||
fn new(domain: Arc<metrics::Domain>) -> Self {
|
||||
Self { domain }
|
||||
}
|
||||
|
||||
fn new_sequencer_metrics(&self, sequencer_id: u32) -> SequencerMetrics {
|
||||
let labels = vec![KeyValue::new("sequencer_id", sequencer_id.to_string())];
|
||||
|
||||
let red = self
|
||||
.domain
|
||||
.register_red_metric_with_labels(Some("ingest"), labels.clone());
|
||||
let bytes_read = self.domain.register_counter_metric_with_labels(
|
||||
"read",
|
||||
Some("bytes"),
|
||||
"Bytes read from sequencer",
|
||||
labels.clone(),
|
||||
);
|
||||
let last_sequence_number = self.domain.register_gauge_metric_with_labels(
|
||||
"last_sequence_number",
|
||||
None,
|
||||
"Last consumed sequence number (e.g. Kafka offset)",
|
||||
&labels,
|
||||
);
|
||||
let sequence_number_lag = self.domain.register_gauge_metric_with_labels(
|
||||
"sequence_number_lag",
|
||||
None,
|
||||
"The difference between the the last sequence number available (e.g. Kafka offset) and (= minus) last consumed sequence number",
|
||||
&labels,
|
||||
);
|
||||
let last_min_ts = self.domain.register_gauge_metric_with_labels(
|
||||
"last_min_ts",
|
||||
None,
|
||||
"Minimum timestamp of last write as unix timestamp in nanoseconds",
|
||||
&labels,
|
||||
);
|
||||
let last_max_ts = self.domain.register_gauge_metric_with_labels(
|
||||
"last_max_ts",
|
||||
None,
|
||||
"Maximum timestamp of last write as unix timestamp in nanoseconds",
|
||||
&labels,
|
||||
);
|
||||
|
||||
SequencerMetrics {
|
||||
red,
|
||||
bytes_read,
|
||||
last_sequence_number,
|
||||
sequence_number_lag,
|
||||
last_min_ts,
|
||||
last_max_ts,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Metrics for a single sequencer.
|
||||
#[derive(Debug)]
|
||||
struct SequencerMetrics {
|
||||
/// Metrics for tracking ingest.
|
||||
red: metrics::RedMetric,
|
||||
|
||||
/// Bytes read from sequencer.
|
||||
///
|
||||
/// This metric is independent of the success / error state of the entries.
|
||||
bytes_read: metrics::Counter,
|
||||
|
||||
/// Last consumed sequence number (e.g. Kafka offset).
|
||||
last_sequence_number: metrics::Gauge,
|
||||
|
||||
/// The difference between the last available sequence number (e.g. Kafka offset) and the
/// last consumed sequence number.
|
||||
sequence_number_lag: metrics::Gauge,
|
||||
|
||||
/// Minimum timestamp of last write as unix timestamp in nanoseconds.
|
||||
last_min_ts: metrics::Gauge,
|
||||
|
||||
/// Maximum timestamp of last write as unix timestamp in nanoseconds.
|
||||
last_max_ts: metrics::Gauge,
|
||||
}
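For orientation, a short fragment showing how these types are expected to be wired together; it mirrors the `Db::new` changes further down and the test assertions near the end of this file, but the registry and label values here are placeholders, not taken from this diff:

// Hedged sketch (fragment, assumes the same imports as server/src/db.rs):
// one metrics domain named "write_buffer", one SequencerMetrics per sequencer.
let registry = Arc::new(metrics::MetricRegistry::new());
let metric_labels = vec![
    KeyValue::new("db_name", "placeholder"),
    KeyValue::new("svr_id", "1"),
];
let ingest_domain =
    registry.register_domain_with_labels("write_buffer", metric_labels.clone());
let ingest_metrics = WriteBufferIngestMetrics::new(Arc::new(ingest_domain));
let sequencer_metrics = ingest_metrics.new_sequencer_metrics(0);

With the `write_buffer` domain, the registered families surface as `write_buffer_ingest_requests_total`, `write_buffer_read_bytes_total`, `write_buffer_last_sequence_number`, `write_buffer_sequence_number_lag`, `write_buffer_last_min_ts` and `write_buffer_last_max_ts`, which is what the updated tests below assert.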
/// This is the main IOx Database object. It is the root object of any
|
||||
/// specific InfluxDB IOx instance
|
||||
///
|
||||
|
@ -203,10 +288,10 @@ pub type Result<T, E = Error> = std::result::Result<T, E>;
|
|||
pub struct Db {
|
||||
rules: RwLock<Arc<DatabaseRules>>,
|
||||
|
||||
pub server_id: ServerId, // this is also the Query Server ID
|
||||
server_id: ServerId, // this is also the Query Server ID
|
||||
|
||||
/// Interface to use for persistence
|
||||
pub store: Arc<ObjectStore>,
|
||||
store: Arc<ObjectStore>,
|
||||
|
||||
/// Executor for running queries
|
||||
exec: Arc<Executor>,
|
||||
|
@ -248,8 +333,8 @@ pub struct Db {
|
|||
/// Metric labels
|
||||
metric_labels: Vec<KeyValue>,
|
||||
|
||||
/// Metrics for tracking the number of errors that occur while ingesting data
|
||||
ingest_errors: metrics::Counter,
|
||||
/// Ingest metrics
|
||||
ingest_metrics: WriteBufferIngestMetrics,
|
||||
|
||||
/// Optionally connect to a write buffer for either buffering writes or reading buffered writes
|
||||
write_buffer: Option<WriteBufferConfig>,
|
||||
|
@ -285,9 +370,8 @@ impl Db {
|
|||
let metric_labels = database_to_commit.catalog.metric_labels.clone();
|
||||
|
||||
let ingest_domain =
|
||||
metrics_registry.register_domain_with_labels("ingest", metric_labels.clone());
|
||||
let ingest_errors =
|
||||
ingest_domain.register_counter_metric("errors", None, "Number of errors during ingest");
|
||||
metrics_registry.register_domain_with_labels("write_buffer", metric_labels.clone());
|
||||
let ingest_metrics = WriteBufferIngestMetrics::new(Arc::new(ingest_domain));
|
||||
|
||||
let catalog = Arc::new(database_to_commit.catalog);
|
||||
|
||||
|
@ -316,7 +400,7 @@ impl Db {
|
|||
worker_iterations_lifecycle: AtomicUsize::new(0),
|
||||
worker_iterations_cleanup: AtomicUsize::new(0),
|
||||
metric_labels,
|
||||
ingest_errors,
|
||||
ingest_metrics,
|
||||
write_buffer: database_to_commit.write_buffer,
|
||||
cleanup_lock: Default::default(),
|
||||
}
|
||||
|
@ -333,13 +417,40 @@ impl Db {
|
|||
}
|
||||
|
||||
/// Updates the database rules
|
||||
pub fn update_db_rules<F, E>(&self, update: F) -> Result<Arc<DatabaseRules>, E>
|
||||
pub fn update_rules<F, E>(&self, update: F) -> Result<Arc<DatabaseRules>, E>
|
||||
where
|
||||
F: FnOnce(DatabaseRules) -> Result<DatabaseRules, E>,
|
||||
{
|
||||
let mut rules = self.rules.write();
|
||||
let new_rules = Arc::new(update(rules.as_ref().clone())?);
|
||||
*rules = Arc::clone(&new_rules);
|
||||
let (late_arrive_window_updated, new_rules) = {
|
||||
let mut rules = self.rules.write();
|
||||
info!(db_name=%rules.name, "updating rules for database");
|
||||
let new_rules = Arc::new(update(rules.as_ref().clone())?);
|
||||
let late_arrive_window_updated = rules.lifecycle_rules.late_arrive_window_seconds
|
||||
!= new_rules.lifecycle_rules.late_arrive_window_seconds;
|
||||
|
||||
*rules = Arc::clone(&new_rules);
|
||||
(late_arrive_window_updated, new_rules)
|
||||
};
|
||||
|
||||
if late_arrive_window_updated {
|
||||
// Hold a read lock to prevent concurrent modification and
|
||||
// use values from re-acquired read guard
|
||||
let current = self.rules.read();
|
||||
|
||||
// Update windows
|
||||
let partitions = self.catalog.partitions();
|
||||
for partition in &partitions {
|
||||
let mut partition = partition.write();
|
||||
let addr = partition.addr().clone();
|
||||
if let Some(windows) = partition.persistence_windows_mut() {
|
||||
info!(partition=%addr, "updating persistence windows");
|
||||
windows.set_late_arrival_period(Duration::from_secs(
|
||||
current.lifecycle_rules.late_arrive_window_seconds.get() as u64,
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(new_rules)
|
||||
}
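A hypothetical caller-side sketch of the renamed `update_rules`; the database handle `db` and the 600-second value are illustrative, and the `NonZeroU32` type for `late_arrive_window_seconds` is an assumption based on the `.get() as u64` cast above:

use std::convert::Infallible;
use std::num::NonZeroU32;

// Widen the late-arrival window to 10 minutes; because the value changed,
// update_rules also pushes the new period into every partition's
// persistence windows before returning.
let new_rules = db
    .update_rules(|mut rules| {
        rules.lifecycle_rules.late_arrive_window_seconds = NonZeroU32::new(600).unwrap();
        Ok::<_, Infallible>(rules)
    })
    .unwrap();
assert_eq!(new_rules.lifecycle_rules.late_arrive_window_seconds.get(), 600);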
@ -656,9 +767,17 @@ impl Db {
|
|||
// streaming from the write buffer loop
|
||||
async {
|
||||
if let Some(WriteBufferConfig::Reading(write_buffer)) = &self.write_buffer {
|
||||
let mut write_buffer = write_buffer
|
||||
.try_lock()
|
||||
.expect("no streams should exist at this point");
|
||||
let mut futures = vec![];
|
||||
for (_sequencer_id, stream) in write_buffer.streams() {
|
||||
let fut = self.stream_in_sequenced_entries(stream);
|
||||
for (sequencer_id, stream) in write_buffer.streams() {
|
||||
let metrics = self.ingest_metrics.new_sequencer_metrics(sequencer_id);
|
||||
let fut = self.stream_in_sequenced_entries(
|
||||
stream.stream,
|
||||
stream.fetch_high_watermark,
|
||||
metrics,
|
||||
);
|
||||
futures.push(fut);
|
||||
}
|
||||
|
||||
|
@ -675,32 +794,122 @@ impl Db {
|
|||
|
||||
/// This is used to take entries from a `Stream` and put them in the mutable buffer, such as
|
||||
/// streaming entries from a write buffer.
|
||||
async fn stream_in_sequenced_entries(
|
||||
&self,
|
||||
stream: BoxStream<'_, Result<SequencedEntry, WriteBufferError>>,
|
||||
async fn stream_in_sequenced_entries<'a>(
|
||||
&'a self,
|
||||
mut stream: BoxStream<'a, Result<SequencedEntry, WriteBufferError>>,
|
||||
f_mark: FetchHighWatermark<'a>,
|
||||
mut metrics: SequencerMetrics,
|
||||
) {
|
||||
stream
|
||||
.for_each(|sequenced_entry_result| async {
|
||||
let sequenced_entry = match sequenced_entry_result {
|
||||
Ok(sequenced_entry) => sequenced_entry,
|
||||
Err(e) => {
|
||||
debug!(?e, "Error converting write buffer data to SequencedEntry");
|
||||
self.ingest_errors.add(1);
|
||||
return;
|
||||
}
|
||||
};
|
||||
let mut watermark_last_updated: Option<Instant> = None;
|
||||
let mut watermark = 0;
|
||||
|
||||
let sequenced_entry = Arc::new(sequenced_entry);
|
||||
while let Some(sequenced_entry_result) = stream.next().await {
|
||||
let red_observation = metrics.red.observation();
|
||||
|
||||
if let Err(e) = self.store_sequenced_entry(sequenced_entry) {
|
||||
// get entry from sequencer
|
||||
let sequenced_entry = match sequenced_entry_result {
|
||||
Ok(sequenced_entry) => sequenced_entry,
|
||||
Err(e) => {
|
||||
debug!(?e, "Error converting write buffer data to SequencedEntry");
|
||||
red_observation.client_error();
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let sequenced_entry = Arc::new(sequenced_entry);
|
||||
|
||||
// store entry
|
||||
match self.store_sequenced_entry(Arc::clone(&sequenced_entry)) {
|
||||
Ok(_) => {
|
||||
red_observation.ok();
|
||||
}
|
||||
Err(e) => {
|
||||
debug!(
|
||||
?e,
|
||||
"Error storing SequencedEntry from write buffer in database"
|
||||
);
|
||||
self.ingest_errors.add(1);
|
||||
red_observation.error();
|
||||
}
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
// maybe update sequencer watermark
|
||||
// We are not updating this watermark every round because asking the sequencer for that watermark can be
|
||||
// quite expensive.
|
||||
if watermark_last_updated
|
||||
.map(|ts| ts.elapsed() > Duration::from_secs(10))
|
||||
.unwrap_or(true)
|
||||
{
|
||||
match f_mark().await {
|
||||
Ok(w) => {
|
||||
watermark = w;
|
||||
}
|
||||
Err(e) => {
|
||||
debug!(%e, "Error while reading sequencer watermark")
|
||||
}
|
||||
}
|
||||
watermark_last_updated = Some(Instant::now());
|
||||
}
|
||||
|
||||
// update:
|
||||
// - bytes read
|
||||
// - last sequence number
|
||||
// - lag
|
||||
// - min ts
|
||||
// - max ts
|
||||
let sequence = sequenced_entry
|
||||
.sequence()
|
||||
.expect("entry from write buffer must be sequenced");
|
||||
let entry = sequenced_entry.entry();
|
||||
metrics.bytes_read.add(entry.data().len() as u64);
|
||||
metrics
|
||||
.last_sequence_number
|
||||
.set(sequence.number as usize, &[]);
|
||||
metrics.sequence_number_lag.set(
|
||||
watermark.saturating_sub(sequence.number).saturating_sub(1) as usize,
|
||||
&[],
|
||||
);
|
||||
if let Some(min_ts) = entry
|
||||
.partition_writes()
|
||||
.map(|partition_writes| {
|
||||
partition_writes
|
||||
.iter()
|
||||
.filter_map(|partition_write| {
|
||||
partition_write
|
||||
.table_batches()
|
||||
.iter()
|
||||
.filter_map(|table_batch| table_batch.min_max_time().ok())
|
||||
.map(|(min, _max)| min)
|
||||
.max()
|
||||
})
|
||||
.min()
|
||||
})
|
||||
.flatten()
|
||||
{
|
||||
metrics
|
||||
.last_min_ts
|
||||
.set(min_ts.timestamp_nanos() as usize, &[]);
|
||||
}
|
||||
if let Some(max_ts) = entry
|
||||
.partition_writes()
|
||||
.map(|partition_writes| {
|
||||
partition_writes
|
||||
.iter()
|
||||
.filter_map(|partition_write| {
|
||||
partition_write
|
||||
.table_batches()
|
||||
.iter()
|
||||
.filter_map(|table_batch| table_batch.min_max_time().ok())
|
||||
.map(|(_min, max)| max)
|
||||
.max()
|
||||
})
|
||||
.max()
|
||||
})
|
||||
.flatten()
|
||||
{
|
||||
metrics
|
||||
.last_max_ts
|
||||
.set(max_ts.timestamp_nanos() as usize, &[]);
|
||||
}
|
||||
}
|
||||
}
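A worked example of the lag computation above, using the numbers from the updated read_from_write_buffer test below (entries at offsets 0 and 7, offset 7 just consumed, and the mock sequencer assumed to report a high watermark of 8, i.e. one past the last written offset):

let watermark: u64 = 8;
let last_consumed: u64 = 7;
// Same formula as the sequence_number_lag update above.
let lag = watermark.saturating_sub(last_consumed).saturating_sub(1);
assert_eq!(lag, 0); // matches the write_buffer_sequence_number_lag assertion of 0.0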
async fn cleanup_unreferenced_parquet_files(
|
||||
|
@ -1208,17 +1417,27 @@ mod tests {
|
|||
|
||||
#[tokio::test]
|
||||
async fn read_from_write_buffer_write_to_mutable_buffer() {
|
||||
let entry = lp_to_entry("cpu bar=1 10");
|
||||
let write_buffer_state = MockBufferSharedState::empty_with_n_sequencers(1);
|
||||
write_buffer_state
|
||||
.push_entry(SequencedEntry::new_from_sequence(Sequence::new(0, 0), entry).unwrap());
|
||||
let write_buffer = Arc::new(MockBufferForReading::new(write_buffer_state));
|
||||
write_buffer_state.push_entry(
|
||||
SequencedEntry::new_from_sequence(Sequence::new(0, 0), lp_to_entry("mem foo=1 10"))
|
||||
.unwrap(),
|
||||
);
|
||||
write_buffer_state.push_entry(
|
||||
SequencedEntry::new_from_sequence(
|
||||
Sequence::new(0, 7),
|
||||
lp_to_entry("cpu bar=2 20\ncpu bar=3 30"),
|
||||
)
|
||||
.unwrap(),
|
||||
);
|
||||
let write_buffer = MockBufferForReading::new(write_buffer_state);
|
||||
|
||||
let db = TestDb::builder()
|
||||
.write_buffer(WriteBufferConfig::Reading(Arc::clone(&write_buffer) as _))
|
||||
let test_db = TestDb::builder()
|
||||
.write_buffer(WriteBufferConfig::Reading(Arc::new(
|
||||
tokio::sync::Mutex::new(Box::new(write_buffer) as _),
|
||||
)))
|
||||
.build()
|
||||
.await
|
||||
.db;
|
||||
.await;
|
||||
let db = test_db.db;
|
||||
|
||||
// do: start background task loop
|
||||
let shutdown: CancellationToken = Default::default();
|
||||
|
@ -1247,18 +1466,84 @@ mod tests {
|
|||
tokio::time::sleep(Duration::from_millis(100)).await;
|
||||
}
|
||||
|
||||
// check: metrics
|
||||
// We need to do that BEFORE shutting down the background loop because gauges would be dropped and reset otherwise
|
||||
let metrics = test_db.metric_registry;
|
||||
metrics
|
||||
.has_metric_family("write_buffer_ingest_requests_total")
|
||||
.with_labels(&[
|
||||
("db_name", "placeholder"),
|
||||
("svr_id", "1"),
|
||||
("sequencer_id", "0"),
|
||||
("status", "ok"),
|
||||
])
|
||||
.counter()
|
||||
.eq(2.0)
|
||||
.unwrap();
|
||||
metrics
|
||||
.has_metric_family("write_buffer_read_bytes_total")
|
||||
.with_labels(&[
|
||||
("db_name", "placeholder"),
|
||||
("svr_id", "1"),
|
||||
("sequencer_id", "0"),
|
||||
])
|
||||
.counter()
|
||||
.eq(528.0)
|
||||
.unwrap();
|
||||
metrics
|
||||
.has_metric_family("write_buffer_last_sequence_number")
|
||||
.with_labels(&[
|
||||
("db_name", "placeholder"),
|
||||
("svr_id", "1"),
|
||||
("sequencer_id", "0"),
|
||||
])
|
||||
.gauge()
|
||||
.eq(7.0)
|
||||
.unwrap();
|
||||
metrics
|
||||
.has_metric_family("write_buffer_sequence_number_lag")
|
||||
.with_labels(&[
|
||||
("db_name", "placeholder"),
|
||||
("svr_id", "1"),
|
||||
("sequencer_id", "0"),
|
||||
])
|
||||
.gauge()
|
||||
.eq(0.0)
|
||||
.unwrap();
|
||||
metrics
|
||||
.has_metric_family("write_buffer_last_min_ts")
|
||||
.with_labels(&[
|
||||
("db_name", "placeholder"),
|
||||
("svr_id", "1"),
|
||||
("sequencer_id", "0"),
|
||||
])
|
||||
.gauge()
|
||||
.eq(20.0)
|
||||
.unwrap();
|
||||
metrics
|
||||
.has_metric_family("write_buffer_last_max_ts")
|
||||
.with_labels(&[
|
||||
("db_name", "placeholder"),
|
||||
("svr_id", "1"),
|
||||
("sequencer_id", "0"),
|
||||
])
|
||||
.gauge()
|
||||
.eq(30.0)
|
||||
.unwrap();
|
||||
|
||||
// do: stop background task loop
|
||||
shutdown.cancel();
|
||||
join_handle.await.unwrap();
|
||||
|
||||
// check: the expected results should be there
|
||||
let batches = run_query(db, "select * from cpu").await;
|
||||
let batches = run_query(db, "select * from cpu order by time").await;
|
||||
|
||||
let expected = vec![
|
||||
"+-----+-------------------------------+",
|
||||
"| bar | time |",
|
||||
"+-----+-------------------------------+",
|
||||
"| 1 | 1970-01-01 00:00:00.000000010 |",
|
||||
"| 2 | 1970-01-01 00:00:00.000000020 |",
|
||||
"| 3 | 1970-01-01 00:00:00.000000030 |",
|
||||
"+-----+-------------------------------+",
|
||||
];
|
||||
assert_batches_eq!(expected, &batches);
|
||||
|
@ -1271,10 +1556,12 @@ mod tests {
|
|||
String::from("Something bad happened on the way to creating a SequencedEntry").into(),
|
||||
0,
|
||||
);
|
||||
let write_buffer = Arc::new(MockBufferForReading::new(write_buffer_state));
|
||||
let write_buffer = MockBufferForReading::new(write_buffer_state);
|
||||
|
||||
let test_db = TestDb::builder()
|
||||
.write_buffer(WriteBufferConfig::Reading(Arc::clone(&write_buffer) as _))
|
||||
.write_buffer(WriteBufferConfig::Reading(Arc::new(
|
||||
tokio::sync::Mutex::new(Box::new(write_buffer) as _),
|
||||
)))
|
||||
.build()
|
||||
.await;
|
||||
|
||||
|
@ -1291,11 +1578,16 @@ mod tests {
|
|||
// check: after a while the error should be reported in the database's metrics
|
||||
let t_0 = Instant::now();
|
||||
loop {
|
||||
let family = metrics.try_has_metric_family("ingest_errors_total");
|
||||
let family = metrics.try_has_metric_family("write_buffer_ingest_requests_total");
|
||||
|
||||
if let Ok(metric) = family {
|
||||
if metric
|
||||
.with_labels(&[("db_name", "placeholder"), ("svr_id", "1")])
|
||||
.with_labels(&[
|
||||
("db_name", "placeholder"),
|
||||
("svr_id", "1"),
|
||||
("sequencer_id", "0"),
|
||||
("status", "client_error"),
|
||||
])
|
||||
.counter()
|
||||
.eq(1.0)
|
||||
.is_ok()
|
||||
|
@ -2259,10 +2551,12 @@ mod tests {
|
|||
);
|
||||
write_buffer_state
|
||||
.push_entry(SequencedEntry::new_from_sequence(Sequence::new(0, 1), entry).unwrap());
|
||||
let write_buffer = Arc::new(MockBufferForReading::new(write_buffer_state));
|
||||
let write_buffer = MockBufferForReading::new(write_buffer_state);
|
||||
|
||||
let db = TestDb::builder()
|
||||
.write_buffer(WriteBufferConfig::Reading(Arc::clone(&write_buffer) as _))
|
||||
.write_buffer(WriteBufferConfig::Reading(Arc::new(
|
||||
tokio::sync::Mutex::new(Box::new(write_buffer) as _),
|
||||
)))
|
||||
.build()
|
||||
.await
|
||||
.db;
@ -6,15 +6,16 @@ use hashbrown::{HashMap, HashSet};
|
|||
|
||||
use data_types::chunk_metadata::ChunkSummary;
|
||||
use data_types::chunk_metadata::DetailedChunkSummary;
|
||||
use data_types::partition_metadata::{PartitionSummary, TableSummary};
|
||||
use data_types::partition_metadata::{PartitionAddr, PartitionSummary, TableSummary};
|
||||
use internal_types::schema::Schema;
|
||||
use snafu::Snafu;
|
||||
use snafu::{OptionExt, Snafu};
|
||||
use tracker::{MappedRwLockReadGuard, RwLock, RwLockReadGuard};
|
||||
|
||||
use self::chunk::CatalogChunk;
|
||||
use self::metrics::CatalogMetrics;
|
||||
use self::partition::Partition;
|
||||
use self::table::Table;
|
||||
use data_types::write_summary::WriteSummary;
|
||||
|
||||
pub mod chunk;
|
||||
mod metrics;
|
||||
|
@ -135,11 +136,8 @@ impl Catalog {
|
|||
/// Get a specific table by name, returning `None` if there is no such table
|
||||
pub fn table(&self, table_name: impl AsRef<str>) -> Result<MappedRwLockReadGuard<'_, Table>> {
|
||||
let table_name = table_name.as_ref();
|
||||
RwLockReadGuard::try_map(self.tables.read(), |tables| tables.get(table_name)).map_err(
|
||||
|_| Error::TableNotFound {
|
||||
table: table_name.to_string(),
|
||||
},
|
||||
)
|
||||
RwLockReadGuard::try_map(self.tables.read(), |tables| tables.get(table_name))
|
||||
.map_err(|_| TableNotFound { table: table_name }.build())
|
||||
}
|
||||
|
||||
/// Get a specific partition by name, returning an error if it can't be found
|
||||
|
@ -154,9 +152,9 @@ impl Catalog {
|
|||
self.table(table_name)?
|
||||
.partition(partition_key)
|
||||
.cloned()
|
||||
.ok_or_else(|| Error::PartitionNotFound {
|
||||
partition: partition_key.to_string(),
|
||||
table: table_name.to_string(),
|
||||
.context(PartitionNotFound {
|
||||
partition: partition_key,
|
||||
table: table_name,
|
||||
})
|
||||
}
|
||||
|
||||
|
@ -174,9 +172,9 @@ impl Catalog {
|
|||
.read()
|
||||
.chunk(chunk_id)
|
||||
.cloned()
|
||||
.ok_or_else(|| Error::ChunkNotFound {
|
||||
partition: partition_key.to_string(),
|
||||
table: table_name.to_string(),
|
||||
.context(ChunkNotFound {
|
||||
partition: partition_key,
|
||||
table: table_name,
|
||||
chunk_id,
|
||||
})
|
||||
}
|
||||
|
@ -228,6 +226,23 @@ impl Catalog {
|
|||
.collect()
|
||||
}
|
||||
|
||||
/// Returns a list of persistence window summaries for each partition
|
||||
pub fn persistence_summaries(&self) -> Vec<(PartitionAddr, WriteSummary)> {
|
||||
let mut summaries = Vec::new();
|
||||
let tables = self.tables.read();
|
||||
for table in tables.values() {
|
||||
for partition in table.partitions() {
|
||||
let partition = partition.read();
|
||||
if let Some(w) = partition.persistence_windows() {
|
||||
for summary in w.summaries() {
|
||||
summaries.push((partition.addr().clone(), summary))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
summaries
|
||||
}
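A hypothetical consumer of the new `persistence_summaries` method (assumes a `catalog: &Catalog` in scope and that `PartitionAddr` and `WriteSummary` implement `Debug`):

// Hedged sketch: list outstanding persistence work per partition.
for (addr, summary) in catalog.persistence_summaries() {
    println!("partition {:?}: {:?}", addr, summary);
}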
pub fn chunk_summaries(&self) -> Vec<ChunkSummary> {
|
||||
let partition_key = None;
|
||||
let table_names = TableNameFilter::AllTables;
|
||||
|
|
|
@ -5,7 +5,7 @@ use std::{
|
|||
|
||||
use data_types::partition_metadata;
|
||||
use partition_metadata::TableSummary;
|
||||
use snafu::{ResultExt, Snafu};
|
||||
use snafu::{OptionExt, ResultExt, Snafu};
|
||||
|
||||
use datafusion::physical_plan::SendableRecordBatchStream;
|
||||
use datafusion_util::MemoryStream;
|
||||
|
@ -417,7 +417,7 @@ impl QueryChunk for DbChunk {
|
|||
// column out to get the set of values.
|
||||
let values = values
|
||||
.remove(column_name)
|
||||
.ok_or_else(|| Error::ReadBufferError {
|
||||
.with_context(|| ReadBufferError {
|
||||
chunk_id: self.id(),
|
||||
msg: format!(
|
||||
"failed to find column_name {:?} in results of tag_values",
@ -7,38 +7,31 @@
|
|||
//!
|
||||
//! For example `SELECT * FROM system.chunks`
|
||||
|
||||
use std::convert::AsRef;
|
||||
use std::any::Any;
|
||||
use std::sync::Arc;
|
||||
use std::{any::Any, collections::HashMap};
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
|
||||
use arrow::{
|
||||
array::{
|
||||
ArrayRef, StringArray, StringBuilder, Time64NanosecondArray, TimestampNanosecondArray,
|
||||
UInt32Array, UInt32Builder, UInt64Array, UInt64Builder,
|
||||
},
|
||||
datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit},
|
||||
datatypes::{Field, Schema, SchemaRef},
|
||||
error::Result,
|
||||
record_batch::RecordBatch,
|
||||
};
|
||||
use data_types::{
|
||||
chunk_metadata::{ChunkSummary, DetailedChunkSummary},
|
||||
error::ErrorLogger,
|
||||
job::Job,
|
||||
partition_metadata::PartitionSummary,
|
||||
};
|
||||
use chrono::{DateTime, Utc};
|
||||
|
||||
use datafusion::{
|
||||
catalog::schema::SchemaProvider,
|
||||
datasource::{datasource::Statistics, TableProvider},
|
||||
error::{DataFusionError, Result as DataFusionResult},
|
||||
physical_plan::{memory::MemoryExec, ExecutionPlan},
|
||||
};
|
||||
use tracker::TaskTracker;
|
||||
|
||||
use crate::JobRegistry;
|
||||
|
||||
use super::catalog::Catalog;
|
||||
use crate::JobRegistry;
|
||||
use data_types::partition_metadata::TableSummary;
|
||||
|
||||
mod chunks;
|
||||
mod columns;
|
||||
mod operations;
|
||||
mod persistence;
|
||||
|
||||
// The IOx system schema
|
||||
pub const SYSTEM_SCHEMA: &str = "system";
|
||||
|
@ -47,12 +40,14 @@ const CHUNKS: &str = "chunks";
|
|||
const COLUMNS: &str = "columns";
|
||||
const CHUNK_COLUMNS: &str = "chunk_columns";
|
||||
const OPERATIONS: &str = "operations";
|
||||
const PERSISTENCE_WINDOWS: &str = "persistence_windows";
|
||||
|
||||
pub struct SystemSchemaProvider {
|
||||
chunks: Arc<dyn TableProvider>,
|
||||
columns: Arc<dyn TableProvider>,
|
||||
chunk_columns: Arc<dyn TableProvider>,
|
||||
operations: Arc<dyn TableProvider>,
|
||||
persistence_windows: Arc<dyn TableProvider>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for SystemSchemaProvider {
|
||||
|
@ -67,22 +62,26 @@ impl SystemSchemaProvider {
|
|||
pub fn new(db_name: impl Into<String>, catalog: Arc<Catalog>, jobs: Arc<JobRegistry>) -> Self {
|
||||
let db_name = db_name.into();
|
||||
let chunks = Arc::new(SystemTableProvider {
|
||||
inner: ChunksTable::new(Arc::clone(&catalog)),
|
||||
inner: chunks::ChunksTable::new(Arc::clone(&catalog)),
|
||||
});
|
||||
let columns = Arc::new(SystemTableProvider {
|
||||
inner: ColumnsTable::new(Arc::clone(&catalog)),
|
||||
inner: columns::ColumnsTable::new(Arc::clone(&catalog)),
|
||||
});
|
||||
let chunk_columns = Arc::new(SystemTableProvider {
|
||||
inner: ChunkColumnsTable::new(catalog),
|
||||
inner: columns::ChunkColumnsTable::new(Arc::clone(&catalog)),
|
||||
});
|
||||
let operations = Arc::new(SystemTableProvider {
|
||||
inner: OperationsTable::new(db_name, jobs),
|
||||
inner: operations::OperationsTable::new(db_name, jobs),
|
||||
});
|
||||
let persistence_windows = Arc::new(SystemTableProvider {
|
||||
inner: persistence::PersistenceWindowsTable::new(catalog),
|
||||
});
|
||||
Self {
|
||||
chunks,
|
||||
columns,
|
||||
chunk_columns,
|
||||
operations,
|
||||
persistence_windows,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -98,6 +97,7 @@ impl SchemaProvider for SystemSchemaProvider {
|
|||
COLUMNS.to_string(),
|
||||
CHUNK_COLUMNS.to_string(),
|
||||
OPERATIONS.to_string(),
|
||||
PERSISTENCE_WINDOWS.to_string(),
|
||||
]
|
||||
}
|
||||
|
||||
|
@ -107,6 +107,7 @@ impl SchemaProvider for SystemSchemaProvider {
|
|||
COLUMNS => Some(Arc::clone(&self.columns)),
|
||||
CHUNK_COLUMNS => Some(Arc::clone(&self.chunk_columns)),
|
||||
OPERATIONS => Some(Arc::clone(&self.operations)),
|
||||
PERSISTENCE_WINDOWS => Some(Arc::clone(&self.persistence_windows)),
|
||||
_ => None,
|
||||
}
|
||||
}
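With `PERSISTENCE_WINDOWS` registered in both `table_names()` and `table()`, the new system table is reachable the same way as the existing ones, e.g. `SELECT * FROM system.persistence_windows` (mirroring the `SELECT * FROM system.chunks` example in the module docs); it is backed by the `persistence::PersistenceWindowsTable` constructed from the catalog above.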
@ -162,407 +163,6 @@ fn time_to_ts(time: Option<DateTime<Utc>>) -> Option<i64> {
|
|||
time.map(|ts| ts.timestamp_nanos())
|
||||
}
|
||||
|
||||
/// Implementation of system.chunks table
|
||||
#[derive(Debug)]
|
||||
struct ChunksTable {
|
||||
schema: SchemaRef,
|
||||
catalog: Arc<Catalog>,
|
||||
}
|
||||
|
||||
impl ChunksTable {
|
||||
fn new(catalog: Arc<Catalog>) -> Self {
|
||||
Self {
|
||||
schema: chunk_summaries_schema(),
|
||||
catalog,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IoxSystemTable for ChunksTable {
|
||||
fn schema(&self) -> SchemaRef {
|
||||
Arc::clone(&self.schema)
|
||||
}
|
||||
|
||||
fn batch(&self) -> Result<RecordBatch> {
|
||||
from_chunk_summaries(self.schema(), self.catalog.chunk_summaries())
|
||||
.log_if_error("system.chunks table")
|
||||
}
|
||||
}
|
||||
|
||||
fn chunk_summaries_schema() -> SchemaRef {
|
||||
let ts = DataType::Timestamp(TimeUnit::Nanosecond, None);
|
||||
Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::UInt32, false),
|
||||
Field::new("partition_key", DataType::Utf8, false),
|
||||
Field::new("table_name", DataType::Utf8, false),
|
||||
Field::new("storage", DataType::Utf8, false),
|
||||
Field::new("lifecycle_action", DataType::Utf8, true),
|
||||
Field::new("memory_bytes", DataType::UInt64, false),
|
||||
Field::new("object_store_bytes", DataType::UInt64, false),
|
||||
Field::new("row_count", DataType::UInt64, false),
|
||||
Field::new("time_of_first_write", ts.clone(), true),
|
||||
Field::new("time_of_last_write", ts.clone(), true),
|
||||
Field::new("time_closed", ts, true),
|
||||
]))
|
||||
}
|
||||
|
||||
fn from_chunk_summaries(schema: SchemaRef, chunks: Vec<ChunkSummary>) -> Result<RecordBatch> {
|
||||
let id = chunks.iter().map(|c| Some(c.id)).collect::<UInt32Array>();
|
||||
let partition_key = chunks
|
||||
.iter()
|
||||
.map(|c| Some(c.partition_key.as_ref()))
|
||||
.collect::<StringArray>();
|
||||
let table_name = chunks
|
||||
.iter()
|
||||
.map(|c| Some(c.table_name.as_ref()))
|
||||
.collect::<StringArray>();
|
||||
let storage = chunks
|
||||
.iter()
|
||||
.map(|c| Some(c.storage.as_str()))
|
||||
.collect::<StringArray>();
|
||||
let lifecycle_action = chunks
|
||||
.iter()
|
||||
.map(|c| c.lifecycle_action.map(|a| a.name()))
|
||||
.collect::<StringArray>();
|
||||
let memory_bytes = chunks
|
||||
.iter()
|
||||
.map(|c| Some(c.memory_bytes as u64))
|
||||
.collect::<UInt64Array>();
|
||||
let object_store_bytes = chunks
|
||||
.iter()
|
||||
.map(|c| Some(c.object_store_bytes as u64).filter(|&v| v > 0))
|
||||
.collect::<UInt64Array>();
|
||||
let row_counts = chunks
|
||||
.iter()
|
||||
.map(|c| Some(c.row_count as u64))
|
||||
.collect::<UInt64Array>();
|
||||
let time_of_first_write = chunks
|
||||
.iter()
|
||||
.map(|c| c.time_of_first_write)
|
||||
.map(time_to_ts)
|
||||
.collect::<TimestampNanosecondArray>();
|
||||
let time_of_last_write = chunks
|
||||
.iter()
|
||||
.map(|c| c.time_of_last_write)
|
||||
.map(time_to_ts)
|
||||
.collect::<TimestampNanosecondArray>();
|
||||
let time_closed = chunks
|
||||
.iter()
|
||||
.map(|c| c.time_closed)
|
||||
.map(time_to_ts)
|
||||
.collect::<TimestampNanosecondArray>();
|
||||
|
||||
RecordBatch::try_new(
|
||||
schema,
|
||||
vec![
|
||||
Arc::new(id),
|
||||
Arc::new(partition_key),
|
||||
Arc::new(table_name),
|
||||
Arc::new(storage),
|
||||
Arc::new(lifecycle_action),
|
||||
Arc::new(memory_bytes),
|
||||
Arc::new(object_store_bytes),
|
||||
Arc::new(row_counts),
|
||||
Arc::new(time_of_first_write),
|
||||
Arc::new(time_of_last_write),
|
||||
Arc::new(time_closed),
|
||||
],
|
||||
)
|
||||
}
|
||||
|
||||
/// Implementation of `system.columns` system table
|
||||
#[derive(Debug)]
|
||||
struct ColumnsTable {
|
||||
schema: SchemaRef,
|
||||
catalog: Arc<Catalog>,
|
||||
}
|
||||
|
||||
impl ColumnsTable {
|
||||
fn new(catalog: Arc<Catalog>) -> Self {
|
||||
Self {
|
||||
schema: partition_summaries_schema(),
|
||||
catalog,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IoxSystemTable for ColumnsTable {
|
||||
fn schema(&self) -> SchemaRef {
|
||||
Arc::clone(&self.schema)
|
||||
}
|
||||
fn batch(&self) -> Result<RecordBatch> {
|
||||
from_partition_summaries(self.schema(), self.catalog.partition_summaries())
|
||||
.log_if_error("system.columns table")
|
||||
}
|
||||
}
|
||||
|
||||
fn partition_summaries_schema() -> SchemaRef {
|
||||
Arc::new(Schema::new(vec![
|
||||
Field::new("partition_key", DataType::Utf8, false),
|
||||
Field::new("table_name", DataType::Utf8, false),
|
||||
Field::new("column_name", DataType::Utf8, false),
|
||||
Field::new("column_type", DataType::Utf8, false),
|
||||
Field::new("influxdb_type", DataType::Utf8, true),
|
||||
]))
|
||||
}
|
||||
|
||||
fn from_partition_summaries(
|
||||
schema: SchemaRef,
|
||||
partitions: Vec<PartitionSummary>,
|
||||
) -> Result<RecordBatch> {
|
||||
// Assume each partition has roughly 5 tables with 5 columns
|
||||
let row_estimate = partitions.len() * 25;
|
||||
|
||||
let mut partition_key = StringBuilder::new(row_estimate);
|
||||
let mut table_name = StringBuilder::new(row_estimate);
|
||||
let mut column_name = StringBuilder::new(row_estimate);
|
||||
let mut column_type = StringBuilder::new(row_estimate);
|
||||
let mut influxdb_type = StringBuilder::new(row_estimate);
|
||||
|
||||
// Note no rows are produced for partitions with no tables, or
|
||||
// tables with no columns: There are other tables to list tables
|
||||
// and columns
|
||||
for partition in partitions {
|
||||
let table = partition.table;
|
||||
for column in table.columns {
|
||||
partition_key.append_value(&partition.key)?;
|
||||
table_name.append_value(&table.name)?;
|
||||
column_name.append_value(&column.name)?;
|
||||
column_type.append_value(column.type_name())?;
|
||||
if let Some(t) = &column.influxdb_type {
|
||||
influxdb_type.append_value(t.as_str())?;
|
||||
} else {
|
||||
influxdb_type.append_null()?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
RecordBatch::try_new(
|
||||
schema,
|
||||
vec![
|
||||
Arc::new(partition_key.finish()) as ArrayRef,
|
||||
Arc::new(table_name.finish()),
|
||||
Arc::new(column_name.finish()),
|
||||
Arc::new(column_type.finish()),
|
||||
Arc::new(influxdb_type.finish()),
|
||||
],
|
||||
)
|
||||
}
|
||||
|
||||
/// Implementation of system.column_chunks table
|
||||
#[derive(Debug)]
|
||||
struct ChunkColumnsTable {
|
||||
schema: SchemaRef,
|
||||
catalog: Arc<Catalog>,
|
||||
}
|
||||
|
||||
impl ChunkColumnsTable {
|
||||
fn new(catalog: Arc<Catalog>) -> Self {
|
||||
Self {
|
||||
schema: chunk_columns_schema(),
|
||||
catalog,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IoxSystemTable for ChunkColumnsTable {
|
||||
fn schema(&self) -> SchemaRef {
|
||||
Arc::clone(&self.schema)
|
||||
}
|
||||
|
||||
fn batch(&self) -> Result<RecordBatch> {
|
||||
assemble_chunk_columns(self.schema(), self.catalog.detailed_chunk_summaries())
|
||||
.log_if_error("system.column_chunks table")
|
||||
}
|
||||
}
|
||||
|
||||
fn chunk_columns_schema() -> SchemaRef {
|
||||
Arc::new(Schema::new(vec![
|
||||
Field::new("partition_key", DataType::Utf8, false),
|
||||
Field::new("chunk_id", DataType::UInt32, false),
|
||||
Field::new("table_name", DataType::Utf8, false),
|
||||
Field::new("column_name", DataType::Utf8, false),
|
||||
Field::new("storage", DataType::Utf8, false),
|
||||
Field::new("row_count", DataType::UInt64, true),
|
||||
Field::new("min_value", DataType::Utf8, true),
|
||||
Field::new("max_value", DataType::Utf8, true),
|
||||
Field::new("memory_bytes", DataType::UInt64, true),
|
||||
]))
|
||||
}
|
||||
|
||||
fn assemble_chunk_columns(
|
||||
schema: SchemaRef,
|
||||
chunk_summaries: Vec<(Arc<TableSummary>, DetailedChunkSummary)>,
|
||||
) -> Result<RecordBatch> {
|
||||
/// Builds an index from column_name -> size
|
||||
fn make_column_index(summary: &DetailedChunkSummary) -> HashMap<&str, u64> {
|
||||
summary
|
||||
.columns
|
||||
.iter()
|
||||
.map(|column_summary| {
|
||||
(
|
||||
column_summary.name.as_ref(),
|
||||
column_summary.memory_bytes as u64,
|
||||
)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
// Assume each chunk has roughly 5 columns
|
||||
let row_estimate = chunk_summaries.len() * 5;
|
||||
|
||||
let mut partition_key = StringBuilder::new(row_estimate);
|
||||
let mut chunk_id = UInt32Builder::new(row_estimate);
|
||||
let mut table_name = StringBuilder::new(row_estimate);
|
||||
let mut column_name = StringBuilder::new(row_estimate);
|
||||
let mut storage = StringBuilder::new(row_estimate);
|
||||
let mut row_count = UInt64Builder::new(row_estimate);
|
||||
let mut min_values = StringBuilder::new(row_estimate);
|
||||
let mut max_values = StringBuilder::new(row_estimate);
|
||||
let mut memory_bytes = UInt64Builder::new(row_estimate);
|
||||
|
||||
// Note no rows are produced for partitions with no chunks, or
|
||||
// tables with no partitions: There are other tables to list tables
|
||||
// and columns
|
||||
for (table_summary, chunk_summary) in chunk_summaries {
|
||||
let mut column_index = make_column_index(&chunk_summary);
|
||||
let storage_value = chunk_summary.inner.storage.as_str();
|
||||
|
||||
for column in &table_summary.columns {
|
||||
partition_key.append_value(chunk_summary.inner.partition_key.as_ref())?;
|
||||
chunk_id.append_value(chunk_summary.inner.id)?;
|
||||
table_name.append_value(&chunk_summary.inner.table_name)?;
|
||||
column_name.append_value(&column.name)?;
|
||||
storage.append_value(storage_value)?;
|
||||
row_count.append_value(column.count())?;
|
||||
if let Some(v) = column.stats.min_as_str() {
|
||||
min_values.append_value(v)?;
|
||||
} else {
|
||||
min_values.append(false)?;
|
||||
}
|
||||
if let Some(v) = column.stats.max_as_str() {
|
||||
max_values.append_value(v)?;
|
||||
} else {
|
||||
max_values.append(false)?;
|
||||
}
|
||||
|
||||
let size = column_index.remove(column.name.as_str());
|
||||
|
||||
memory_bytes.append_option(size)?;
|
||||
}
|
||||
}
|
||||
|
||||
RecordBatch::try_new(
|
||||
schema,
|
||||
vec![
|
||||
Arc::new(partition_key.finish()) as ArrayRef,
|
||||
Arc::new(chunk_id.finish()),
|
||||
Arc::new(table_name.finish()),
|
||||
Arc::new(column_name.finish()),
|
||||
Arc::new(storage.finish()),
|
||||
Arc::new(row_count.finish()),
|
||||
Arc::new(min_values.finish()),
|
||||
Arc::new(max_values.finish()),
|
||||
Arc::new(memory_bytes.finish()),
|
||||
],
|
||||
)
|
||||
}
|
||||
|
||||
/// Implementation of system.operations table
|
||||
#[derive(Debug)]
|
||||
struct OperationsTable {
|
||||
schema: SchemaRef,
|
||||
db_name: String,
|
||||
jobs: Arc<JobRegistry>,
|
||||
}
|
||||
|
||||
impl OperationsTable {
|
||||
fn new(db_name: String, jobs: Arc<JobRegistry>) -> Self {
|
||||
Self {
|
||||
schema: operations_schema(),
|
||||
db_name,
|
||||
jobs,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IoxSystemTable for OperationsTable {
|
||||
fn schema(&self) -> SchemaRef {
|
||||
Arc::clone(&self.schema)
|
||||
}
|
||||
|
||||
fn batch(&self) -> Result<RecordBatch> {
|
||||
from_task_trackers(self.schema(), &self.db_name, self.jobs.tracked())
|
||||
.log_if_error("system.operations table")
|
||||
}
|
||||
}
|
||||
|
||||
fn operations_schema() -> SchemaRef {
|
||||
let ts = DataType::Time64(TimeUnit::Nanosecond);
|
||||
Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Utf8, false),
|
||||
Field::new("status", DataType::Utf8, true),
|
||||
Field::new("cpu_time_used", ts.clone(), true),
|
||||
Field::new("wall_time_used", ts, true),
|
||||
Field::new("partition_key", DataType::Utf8, true),
|
||||
Field::new("chunk_id", DataType::UInt32, true),
|
||||
Field::new("description", DataType::Utf8, true),
|
||||
]))
|
||||
}
|
||||
|
||||
fn from_task_trackers(
|
||||
schema: SchemaRef,
|
||||
db_name: &str,
|
||||
jobs: Vec<TaskTracker<Job>>,
|
||||
) -> Result<RecordBatch> {
|
||||
let jobs = jobs
|
||||
.into_iter()
|
||||
.filter(|job| job.metadata().db_name() == Some(db_name))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let ids = jobs
|
||||
.iter()
|
||||
.map(|job| Some(job.id().to_string()))
|
||||
.collect::<StringArray>();
|
||||
let statuses = jobs
|
||||
.iter()
|
||||
.map(|job| Some(job.get_status().name()))
|
||||
.collect::<StringArray>();
|
||||
let cpu_time_used = jobs
|
||||
.iter()
|
||||
.map(|job| job.get_status().cpu_nanos().map(|n| n as i64))
|
||||
.collect::<Time64NanosecondArray>();
|
||||
let wall_time_used = jobs
|
||||
.iter()
|
||||
.map(|job| job.get_status().wall_nanos().map(|n| n as i64))
|
||||
.collect::<Time64NanosecondArray>();
|
||||
let partition_keys = jobs
|
||||
.iter()
|
||||
.map(|job| job.metadata().partition_key())
|
||||
.collect::<StringArray>();
|
||||
let chunk_ids = jobs
|
||||
.iter()
|
||||
.map(|job| job.metadata().chunk_id())
|
||||
.collect::<UInt32Array>();
|
||||
let descriptions = jobs
|
||||
.iter()
|
||||
.map(|job| Some(job.metadata().description()))
|
||||
.collect::<StringArray>();
|
||||
|
||||
RecordBatch::try_new(
|
||||
schema,
|
||||
vec![
|
||||
Arc::new(ids) as ArrayRef,
|
||||
Arc::new(statuses),
|
||||
Arc::new(cpu_time_used),
|
||||
Arc::new(wall_time_used),
|
||||
Arc::new(partition_keys),
|
||||
Arc::new(chunk_ids),
|
||||
Arc::new(descriptions),
|
||||
],
|
||||
)
|
||||
}
|
||||
|
||||
/// Creates a DataFusion ExecutionPlan node that scans a single batch
|
||||
/// of records.
|
||||
fn scan_batch(
|
||||
|
@ -605,141 +205,10 @@ fn scan_batch(
|
|||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use arrow::array::{ArrayRef, UInt64Array};
|
||||
use arrow_util::assert_batches_eq;
|
||||
use chrono::NaiveDateTime;
|
||||
use data_types::{
|
||||
chunk_metadata::{ChunkColumnSummary, ChunkLifecycleAction, ChunkStorage},
|
||||
partition_metadata::{ColumnSummary, InfluxDbType, StatValues, Statistics, TableSummary},
|
||||
};
|
||||
|
||||
#[test]
|
||||
fn test_from_chunk_summaries() {
|
||||
let chunks = vec![
|
||||
ChunkSummary {
|
||||
partition_key: Arc::from("p1"),
|
||||
table_name: Arc::from("table1"),
|
||||
id: 0,
|
||||
storage: ChunkStorage::OpenMutableBuffer,
|
||||
lifecycle_action: None,
|
||||
memory_bytes: 23754,
|
||||
object_store_bytes: 0,
|
||||
row_count: 11,
|
||||
time_of_first_write: Some(DateTime::from_utc(
|
||||
NaiveDateTime::from_timestamp(10, 0),
|
||||
Utc,
|
||||
)),
|
||||
time_of_last_write: None,
|
||||
time_closed: None,
|
||||
},
|
||||
ChunkSummary {
|
||||
partition_key: Arc::from("p1"),
|
||||
table_name: Arc::from("table1"),
|
||||
id: 1,
|
||||
storage: ChunkStorage::OpenMutableBuffer,
|
||||
lifecycle_action: Some(ChunkLifecycleAction::Persisting),
|
||||
memory_bytes: 23455,
|
||||
object_store_bytes: 0,
|
||||
row_count: 22,
|
||||
time_of_first_write: None,
|
||||
time_of_last_write: Some(DateTime::from_utc(
|
||||
NaiveDateTime::from_timestamp(80, 0),
|
||||
Utc,
|
||||
)),
|
||||
time_closed: None,
|
||||
},
|
||||
ChunkSummary {
|
||||
partition_key: Arc::from("p1"),
|
||||
table_name: Arc::from("table1"),
|
||||
id: 2,
|
||||
storage: ChunkStorage::ObjectStoreOnly,
|
||||
lifecycle_action: None,
|
||||
memory_bytes: 1234,
|
||||
object_store_bytes: 5678,
|
||||
row_count: 33,
|
||||
time_of_first_write: Some(DateTime::from_utc(
|
||||
NaiveDateTime::from_timestamp(100, 0),
|
||||
Utc,
|
||||
)),
|
||||
time_of_last_write: Some(DateTime::from_utc(
|
||||
NaiveDateTime::from_timestamp(200, 0),
|
||||
Utc,
|
||||
)),
|
||||
time_closed: None,
|
||||
},
|
||||
];
|
||||
|
||||
let expected = vec![
|
||||
"+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+",
|
||||
"| id | partition_key | table_name | storage | lifecycle_action | memory_bytes | object_store_bytes | row_count | time_of_first_write | time_of_last_write | time_closed |",
|
||||
"+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+",
|
||||
"| 0 | p1 | table1 | OpenMutableBuffer | | 23754 | | 11 | 1970-01-01 00:00:10 | | |",
|
||||
"| 1 | p1 | table1 | OpenMutableBuffer | Persisting to Object Storage | 23455 | | 22 | | 1970-01-01 00:01:20 | |",
|
||||
"| 2 | p1 | table1 | ObjectStoreOnly | | 1234 | 5678 | 33 | 1970-01-01 00:01:40 | 1970-01-01 00:03:20 | |",
|
||||
"+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+",
|
||||
];
|
||||
|
||||
let schema = chunk_summaries_schema();
|
||||
let batch = from_chunk_summaries(schema, chunks).unwrap();
|
||||
assert_batches_eq!(&expected, &[batch]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_from_partition_summaries() {
|
||||
let partitions = vec![
|
||||
PartitionSummary {
|
||||
key: "p1".to_string(),
|
||||
table: TableSummary {
|
||||
name: "t1".to_string(),
|
||||
columns: vec![
|
||||
ColumnSummary {
|
||||
name: "c1".to_string(),
|
||||
influxdb_type: Some(InfluxDbType::Tag),
|
||||
stats: Statistics::I64(StatValues::new_with_value(23)),
|
||||
},
|
||||
ColumnSummary {
|
||||
name: "c2".to_string(),
|
||||
influxdb_type: Some(InfluxDbType::Field),
|
||||
stats: Statistics::I64(StatValues::new_with_value(43)),
|
||||
},
|
||||
ColumnSummary {
|
||||
name: "c3".to_string(),
|
||||
influxdb_type: None,
|
||||
stats: Statistics::String(StatValues::new_with_value(
|
||||
"foo".to_string(),
|
||||
)),
|
||||
},
|
||||
ColumnSummary {
|
||||
name: "time".to_string(),
|
||||
influxdb_type: Some(InfluxDbType::Timestamp),
|
||||
stats: Statistics::I64(StatValues::new_with_value(43)),
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
PartitionSummary {
|
||||
key: "p3".to_string(),
|
||||
table: TableSummary {
|
||||
name: "t1".to_string(),
|
||||
columns: vec![],
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
let expected = vec![
|
||||
"+---------------+------------+-------------+-------------+---------------+",
|
||||
"| partition_key | table_name | column_name | column_type | influxdb_type |",
|
||||
"+---------------+------------+-------------+-------------+---------------+",
|
||||
"| p1 | t1 | c1 | I64 | Tag |",
|
||||
"| p1 | t1 | c2 | I64 | Field |",
|
||||
"| p1 | t1 | c3 | String | |",
|
||||
"| p1 | t1 | time | I64 | Timestamp |",
|
||||
"+---------------+------------+-------------+-------------+---------------+",
|
||||
];
|
||||
|
||||
let batch = from_partition_summaries(partition_summaries_schema(), partitions).unwrap();
|
||||
assert_batches_eq!(&expected, &[batch]);
|
||||
}
|
||||
use super::*;
|
||||
|
||||
fn seq_array(start: u64, end: u64) -> ArrayRef {
|
||||
Arc::new(UInt64Array::from_iter_values(start..end))
|
||||
|
@ -820,130 +289,4 @@ mod tests {
|
|||
err_string
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_assemble_chunk_columns() {
|
||||
let lifecycle_action = None;
|
||||
|
||||
let summaries = vec![
|
||||
(
|
||||
Arc::new(TableSummary {
|
||||
name: "t1".to_string(),
|
||||
columns: vec![
|
||||
ColumnSummary {
|
||||
name: "c1".to_string(),
|
||||
influxdb_type: Some(InfluxDbType::Field),
|
||||
stats: Statistics::String(StatValues::new(
|
||||
Some("bar".to_string()),
|
||||
Some("foo".to_string()),
|
||||
55,
|
||||
)),
|
||||
},
|
||||
ColumnSummary {
|
||||
name: "c2".to_string(),
|
||||
influxdb_type: Some(InfluxDbType::Field),
|
||||
stats: Statistics::F64(StatValues::new(Some(11.0), Some(43.0), 66)),
|
||||
},
|
||||
],
|
||||
}),
|
||||
DetailedChunkSummary {
|
||||
inner: ChunkSummary {
|
||||
partition_key: "p1".into(),
|
||||
table_name: "t1".into(),
|
||||
id: 42,
|
||||
storage: ChunkStorage::ReadBuffer,
|
||||
lifecycle_action,
|
||||
memory_bytes: 23754,
|
||||
object_store_bytes: 0,
|
||||
row_count: 11,
|
||||
time_of_first_write: None,
|
||||
time_of_last_write: None,
|
||||
time_closed: None,
|
||||
},
|
||||
columns: vec![
|
||||
ChunkColumnSummary {
|
||||
name: "c1".into(),
|
||||
memory_bytes: 11,
|
||||
},
|
||||
ChunkColumnSummary {
|
||||
name: "c2".into(),
|
||||
memory_bytes: 12,
|
||||
},
|
||||
],
|
||||
},
|
||||
),
|
||||
(
|
||||
Arc::new(TableSummary {
|
||||
name: "t1".to_string(),
|
||||
columns: vec![ColumnSummary {
|
||||
name: "c1".to_string(),
|
||||
influxdb_type: Some(InfluxDbType::Field),
|
||||
stats: Statistics::F64(StatValues::new(Some(110.0), Some(430.0), 667)),
|
||||
}],
|
||||
}),
|
||||
DetailedChunkSummary {
|
||||
inner: ChunkSummary {
|
||||
partition_key: "p2".into(),
|
||||
table_name: "t1".into(),
|
||||
id: 43,
|
||||
storage: ChunkStorage::OpenMutableBuffer,
|
||||
lifecycle_action,
|
||||
memory_bytes: 23754,
|
||||
object_store_bytes: 0,
|
||||
row_count: 11,
|
||||
time_of_first_write: None,
|
||||
time_of_last_write: None,
|
||||
time_closed: None,
|
||||
},
|
||||
columns: vec![ChunkColumnSummary {
|
||||
name: "c1".into(),
|
||||
memory_bytes: 100,
|
||||
}],
|
||||
},
|
||||
),
|
||||
(
|
||||
Arc::new(TableSummary {
|
||||
name: "t2".to_string(),
|
||||
columns: vec![ColumnSummary {
|
||||
name: "c3".to_string(),
|
||||
influxdb_type: Some(InfluxDbType::Field),
|
||||
stats: Statistics::F64(StatValues::new(Some(-1.0), Some(2.0), 4)),
|
||||
}],
|
||||
}),
|
||||
DetailedChunkSummary {
|
||||
inner: ChunkSummary {
|
||||
partition_key: "p2".into(),
|
||||
table_name: "t2".into(),
|
||||
id: 44,
|
||||
storage: ChunkStorage::OpenMutableBuffer,
|
||||
lifecycle_action,
|
||||
memory_bytes: 23754,
|
||||
object_store_bytes: 0,
|
||||
row_count: 11,
|
||||
time_of_first_write: None,
|
||||
time_of_last_write: None,
|
||||
time_closed: None,
|
||||
},
|
||||
columns: vec![ChunkColumnSummary {
|
||||
name: "c3".into(),
|
||||
memory_bytes: 200,
|
||||
}],
|
||||
},
|
||||
),
|
||||
];
|
||||
|
||||
let expected = vec![
|
||||
"+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+",
|
||||
"| partition_key | chunk_id | table_name | column_name | storage | row_count | min_value | max_value | memory_bytes |",
|
||||
"+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+",
|
||||
"| p1 | 42 | t1 | c1 | ReadBuffer | 55 | bar | foo | 11 |",
|
||||
"| p1 | 42 | t1 | c2 | ReadBuffer | 66 | 11 | 43 | 12 |",
|
||||
"| p2 | 43 | t1 | c1 | OpenMutableBuffer | 667 | 110 | 430 | 100 |",
|
||||
"| p2 | 44 | t2 | c3 | OpenMutableBuffer | 4 | -1 | 2 | 200 |",
|
||||
"+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+",
|
||||
];
|
||||
|
||||
let batch = assemble_chunk_columns(chunk_columns_schema(), summaries).unwrap();
|
||||
assert_batches_eq!(&expected, &[batch]);
|
||||
}
|
||||
}
@ -0,0 +1,201 @@
|
|||
use std::sync::Arc;
|
||||
|
||||
use arrow::array::{StringArray, TimestampNanosecondArray, UInt32Array, UInt64Array};
|
||||
use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit};
|
||||
use arrow::error::Result;
|
||||
use arrow::record_batch::RecordBatch;
|
||||
|
||||
use data_types::chunk_metadata::ChunkSummary;
|
||||
use data_types::error::ErrorLogger;
|
||||
|
||||
use crate::db::catalog::Catalog;
|
||||
use crate::db::system_tables::{time_to_ts, IoxSystemTable};
|
||||
|
||||
/// Implementation of system.chunks table
|
||||
#[derive(Debug)]
|
||||
pub(super) struct ChunksTable {
|
||||
schema: SchemaRef,
|
||||
catalog: Arc<Catalog>,
|
||||
}
|
||||
|
||||
impl ChunksTable {
|
||||
pub(super) fn new(catalog: Arc<Catalog>) -> Self {
|
||||
Self {
|
||||
schema: chunk_summaries_schema(),
|
||||
catalog,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IoxSystemTable for ChunksTable {
|
||||
fn schema(&self) -> SchemaRef {
|
||||
Arc::clone(&self.schema)
|
||||
}
|
||||
|
||||
fn batch(&self) -> Result<RecordBatch> {
|
||||
from_chunk_summaries(self.schema(), self.catalog.chunk_summaries())
|
||||
.log_if_error("system.chunks table")
|
||||
}
|
||||
}
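The `IoxSystemTable` trait used above is defined elsewhere in `db::system_tables`; from the way `ChunksTable` implements it, it appears to expose a schema plus an on-demand `RecordBatch`. A minimal self-contained sketch of that shape, with a hypothetical trait name and a toy one-column table (not the real trait definition), could look like:

    use std::sync::Arc;

    use arrow::array::UInt64Array;
    use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
    use arrow::error::Result;
    use arrow::record_batch::RecordBatch;

    /// Simplified stand-in for the IoxSystemTable trait as inferred from this diff.
    trait SystemTableSketch {
        fn schema(&self) -> SchemaRef;
        fn batch(&self) -> Result<RecordBatch>;
    }

    /// Toy table with a single constant column, to show the schema()/batch() call pattern.
    struct OneColumnTable {
        schema: SchemaRef,
    }

    impl SystemTableSketch for OneColumnTable {
        fn schema(&self) -> SchemaRef {
            Arc::clone(&self.schema)
        }

        fn batch(&self) -> Result<RecordBatch> {
            let values = UInt64Array::from(vec![1_u64, 2, 3]);
            RecordBatch::try_new(self.schema(), vec![Arc::new(values)])
        }
    }

    fn main() -> Result<()> {
        let table = OneColumnTable {
            schema: Arc::new(Schema::new(vec![Field::new("n", DataType::UInt64, false)])),
        };
        assert_eq!(table.batch()?.num_rows(), 3);
        Ok(())
    }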
|
||||
|
||||
fn chunk_summaries_schema() -> SchemaRef {
|
||||
let ts = DataType::Timestamp(TimeUnit::Nanosecond, None);
|
||||
Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::UInt32, false),
|
||||
Field::new("partition_key", DataType::Utf8, false),
|
||||
Field::new("table_name", DataType::Utf8, false),
|
||||
Field::new("storage", DataType::Utf8, false),
|
||||
Field::new("lifecycle_action", DataType::Utf8, true),
|
||||
Field::new("memory_bytes", DataType::UInt64, false),
|
||||
Field::new("object_store_bytes", DataType::UInt64, false),
|
||||
Field::new("row_count", DataType::UInt64, false),
|
||||
Field::new("time_of_first_write", ts.clone(), true),
|
||||
Field::new("time_of_last_write", ts.clone(), true),
|
||||
Field::new("time_closed", ts, true),
|
||||
]))
|
||||
}
|
||||
|
||||
fn from_chunk_summaries(schema: SchemaRef, chunks: Vec<ChunkSummary>) -> Result<RecordBatch> {
|
||||
let id = chunks.iter().map(|c| Some(c.id)).collect::<UInt32Array>();
|
||||
let partition_key = chunks
|
||||
.iter()
|
||||
.map(|c| Some(c.partition_key.as_ref()))
|
||||
.collect::<StringArray>();
|
||||
let table_name = chunks
|
||||
.iter()
|
||||
.map(|c| Some(c.table_name.as_ref()))
|
||||
.collect::<StringArray>();
|
||||
let storage = chunks
|
||||
.iter()
|
||||
.map(|c| Some(c.storage.as_str()))
|
||||
.collect::<StringArray>();
|
||||
let lifecycle_action = chunks
|
||||
.iter()
|
||||
.map(|c| c.lifecycle_action.map(|a| a.name()))
|
||||
.collect::<StringArray>();
|
||||
let memory_bytes = chunks
|
||||
.iter()
|
||||
.map(|c| Some(c.memory_bytes as u64))
|
||||
.collect::<UInt64Array>();
|
||||
let object_store_bytes = chunks
|
||||
.iter()
|
||||
.map(|c| Some(c.object_store_bytes as u64).filter(|&v| v > 0))
|
||||
.collect::<UInt64Array>();
|
||||
let row_counts = chunks
|
||||
.iter()
|
||||
.map(|c| Some(c.row_count as u64))
|
||||
.collect::<UInt64Array>();
|
||||
let time_of_first_write = chunks
|
||||
.iter()
|
||||
.map(|c| c.time_of_first_write)
|
||||
.map(time_to_ts)
|
||||
.collect::<TimestampNanosecondArray>();
|
||||
let time_of_last_write = chunks
|
||||
.iter()
|
||||
.map(|c| c.time_of_last_write)
|
||||
.map(time_to_ts)
|
||||
.collect::<TimestampNanosecondArray>();
|
||||
let time_closed = chunks
|
||||
.iter()
|
||||
.map(|c| c.time_closed)
|
||||
.map(time_to_ts)
|
||||
.collect::<TimestampNanosecondArray>();
|
||||
|
||||
RecordBatch::try_new(
|
||||
schema,
|
||||
vec![
|
||||
Arc::new(id),
|
||||
Arc::new(partition_key),
|
||||
Arc::new(table_name),
|
||||
Arc::new(storage),
|
||||
Arc::new(lifecycle_action),
|
||||
Arc::new(memory_bytes),
|
||||
Arc::new(object_store_bytes),
|
||||
Arc::new(row_counts),
|
||||
Arc::new(time_of_first_write),
|
||||
Arc::new(time_of_last_write),
|
||||
Arc::new(time_closed),
|
||||
],
|
||||
)
|
||||
}
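`from_chunk_summaries` builds every column by mapping over the same `Vec<ChunkSummary>` and collecting into a typed Arrow array before assembling the `RecordBatch`. A stripped-down sketch of that pattern with an invented two-field row type (names and values are illustrative only):

    use std::sync::Arc;

    use arrow::array::{StringArray, UInt32Array};
    use arrow::datatypes::{DataType, Field, Schema};
    use arrow::error::Result;
    use arrow::record_batch::RecordBatch;

    struct Row {
        key: String,
        id: u32,
    }

    fn rows_to_batch(rows: &[Row]) -> Result<RecordBatch> {
        // One pass per column: collect an iterator of Option<T> into the
        // matching Arrow array type, exactly as from_chunk_summaries does.
        let keys = rows
            .iter()
            .map(|r| Some(r.key.as_str()))
            .collect::<StringArray>();
        let ids = rows.iter().map(|r| Some(r.id)).collect::<UInt32Array>();

        let schema = Arc::new(Schema::new(vec![
            Field::new("partition_key", DataType::Utf8, false),
            Field::new("id", DataType::UInt32, false),
        ]));

        RecordBatch::try_new(schema, vec![Arc::new(keys), Arc::new(ids)])
    }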
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use chrono::{DateTime, NaiveDateTime, Utc};
|
||||
|
||||
use arrow_util::assert_batches_eq;
|
||||
use data_types::chunk_metadata::{ChunkLifecycleAction, ChunkStorage};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_from_chunk_summaries() {
|
||||
let chunks = vec![
|
||||
ChunkSummary {
|
||||
partition_key: Arc::from("p1"),
|
||||
table_name: Arc::from("table1"),
|
||||
id: 0,
|
||||
storage: ChunkStorage::OpenMutableBuffer,
|
||||
lifecycle_action: None,
|
||||
memory_bytes: 23754,
|
||||
object_store_bytes: 0,
|
||||
row_count: 11,
|
||||
time_of_first_write: Some(DateTime::from_utc(
|
||||
NaiveDateTime::from_timestamp(10, 0),
|
||||
Utc,
|
||||
)),
|
||||
time_of_last_write: None,
|
||||
time_closed: None,
|
||||
},
|
||||
ChunkSummary {
|
||||
partition_key: Arc::from("p1"),
|
||||
table_name: Arc::from("table1"),
|
||||
id: 1,
|
||||
storage: ChunkStorage::OpenMutableBuffer,
|
||||
lifecycle_action: Some(ChunkLifecycleAction::Persisting),
|
||||
memory_bytes: 23455,
|
||||
object_store_bytes: 0,
|
||||
row_count: 22,
|
||||
time_of_first_write: None,
|
||||
time_of_last_write: Some(DateTime::from_utc(
|
||||
NaiveDateTime::from_timestamp(80, 0),
|
||||
Utc,
|
||||
)),
|
||||
time_closed: None,
|
||||
},
|
||||
ChunkSummary {
|
||||
partition_key: Arc::from("p1"),
|
||||
table_name: Arc::from("table1"),
|
||||
id: 2,
|
||||
storage: ChunkStorage::ObjectStoreOnly,
|
||||
lifecycle_action: None,
|
||||
memory_bytes: 1234,
|
||||
object_store_bytes: 5678,
|
||||
row_count: 33,
|
||||
time_of_first_write: Some(DateTime::from_utc(
|
||||
NaiveDateTime::from_timestamp(100, 0),
|
||||
Utc,
|
||||
)),
|
||||
time_of_last_write: Some(DateTime::from_utc(
|
||||
NaiveDateTime::from_timestamp(200, 0),
|
||||
Utc,
|
||||
)),
|
||||
time_closed: None,
|
||||
},
|
||||
];
|
||||
|
||||
let expected = vec![
|
||||
"+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+",
|
||||
"| id | partition_key | table_name | storage | lifecycle_action | memory_bytes | object_store_bytes | row_count | time_of_first_write | time_of_last_write | time_closed |",
|
||||
"+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+",
|
||||
"| 0 | p1 | table1 | OpenMutableBuffer | | 23754 | | 11 | 1970-01-01 00:00:10 | | |",
|
||||
"| 1 | p1 | table1 | OpenMutableBuffer | Persisting to Object Storage | 23455 | | 22 | | 1970-01-01 00:01:20 | |",
|
||||
"| 2 | p1 | table1 | ObjectStoreOnly | | 1234 | 5678 | 33 | 1970-01-01 00:01:40 | 1970-01-01 00:03:20 | |",
|
||||
"+----+---------------+------------+-------------------+------------------------------+--------------+--------------------+-----------+---------------------+---------------------+-------------+",
|
||||
];
|
||||
|
||||
let schema = chunk_summaries_schema();
|
||||
let batch = from_chunk_summaries(schema, chunks).unwrap();
|
||||
assert_batches_eq!(&expected, &[batch]);
|
||||
}
|
||||
}
@ -0,0 +1,404 @@
|
|||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow::array::{ArrayRef, StringBuilder, UInt32Builder, UInt64Builder};
|
||||
use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
|
||||
use arrow::error::Result;
|
||||
use arrow::record_batch::RecordBatch;
|
||||
|
||||
use data_types::chunk_metadata::DetailedChunkSummary;
|
||||
use data_types::error::ErrorLogger;
|
||||
use data_types::partition_metadata::{PartitionSummary, TableSummary};
|
||||
|
||||
use crate::db::catalog::Catalog;
|
||||
use crate::db::system_tables::IoxSystemTable;
|
||||
|
||||
/// Implementation of `system.columns` system table
|
||||
#[derive(Debug)]
|
||||
pub(super) struct ColumnsTable {
|
||||
schema: SchemaRef,
|
||||
catalog: Arc<Catalog>,
|
||||
}
|
||||
|
||||
impl ColumnsTable {
|
||||
pub(super) fn new(catalog: Arc<Catalog>) -> Self {
|
||||
Self {
|
||||
schema: partition_summaries_schema(),
|
||||
catalog,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IoxSystemTable for ColumnsTable {
|
||||
fn schema(&self) -> SchemaRef {
|
||||
Arc::clone(&self.schema)
|
||||
}
|
||||
fn batch(&self) -> Result<RecordBatch> {
|
||||
from_partition_summaries(self.schema(), self.catalog.partition_summaries())
|
||||
.log_if_error("system.columns table")
|
||||
}
|
||||
}
|
||||
|
||||
fn partition_summaries_schema() -> SchemaRef {
|
||||
Arc::new(Schema::new(vec![
|
||||
Field::new("partition_key", DataType::Utf8, false),
|
||||
Field::new("table_name", DataType::Utf8, false),
|
||||
Field::new("column_name", DataType::Utf8, false),
|
||||
Field::new("column_type", DataType::Utf8, false),
|
||||
Field::new("influxdb_type", DataType::Utf8, true),
|
||||
]))
|
||||
}
|
||||
|
||||
fn from_partition_summaries(
|
||||
schema: SchemaRef,
|
||||
partitions: Vec<PartitionSummary>,
|
||||
) -> Result<RecordBatch> {
|
||||
// Assume each partition has roughly 5 tables with 5 columns
|
||||
let row_estimate = partitions.len() * 25;
|
||||
|
||||
let mut partition_key = StringBuilder::new(row_estimate);
|
||||
let mut table_name = StringBuilder::new(row_estimate);
|
||||
let mut column_name = StringBuilder::new(row_estimate);
|
||||
let mut column_type = StringBuilder::new(row_estimate);
|
||||
let mut influxdb_type = StringBuilder::new(row_estimate);
|
||||
|
||||
// Note that no rows are produced for partitions with no tables, or
// tables with no columns: there are other system tables that list
// tables and columns
|
||||
for partition in partitions {
|
||||
let table = partition.table;
|
||||
for column in table.columns {
|
||||
partition_key.append_value(&partition.key)?;
|
||||
table_name.append_value(&table.name)?;
|
||||
column_name.append_value(&column.name)?;
|
||||
column_type.append_value(column.type_name())?;
|
||||
if let Some(t) = &column.influxdb_type {
|
||||
influxdb_type.append_value(t.as_str())?;
|
||||
} else {
|
||||
influxdb_type.append_null()?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
RecordBatch::try_new(
|
||||
schema,
|
||||
vec![
|
||||
Arc::new(partition_key.finish()) as ArrayRef,
|
||||
Arc::new(table_name.finish()),
|
||||
Arc::new(column_name.finish()),
|
||||
Arc::new(column_type.finish()),
|
||||
Arc::new(influxdb_type.finish()),
|
||||
],
|
||||
)
|
||||
}
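`from_partition_summaries` uses the builder API instead of iterator collection because `influxdb_type` is nullable. The append_value / append_null / finish sequence on its own, with an arbitrary capacity hint, looks roughly like this:

    use std::sync::Arc;

    use arrow::array::{ArrayRef, StringBuilder};
    use arrow::error::Result;

    /// Build a nullable Utf8 column from optional values, as done above for influxdb_type.
    fn optional_strings(values: &[Option<&str>]) -> Result<ArrayRef> {
        let mut builder = StringBuilder::new(values.len());
        for value in values {
            match value {
                Some(v) => builder.append_value(v)?,
                None => builder.append_null()?,
            }
        }
        Ok(Arc::new(builder.finish()) as ArrayRef)
    }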
|
||||
|
||||
/// Implementation of system.column_chunks table
|
||||
#[derive(Debug)]
|
||||
pub(super) struct ChunkColumnsTable {
|
||||
schema: SchemaRef,
|
||||
catalog: Arc<Catalog>,
|
||||
}
|
||||
|
||||
impl ChunkColumnsTable {
|
||||
pub(super) fn new(catalog: Arc<Catalog>) -> Self {
|
||||
Self {
|
||||
schema: chunk_columns_schema(),
|
||||
catalog,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IoxSystemTable for ChunkColumnsTable {
|
||||
fn schema(&self) -> SchemaRef {
|
||||
Arc::clone(&self.schema)
|
||||
}
|
||||
|
||||
fn batch(&self) -> Result<RecordBatch> {
|
||||
assemble_chunk_columns(self.schema(), self.catalog.detailed_chunk_summaries())
|
||||
.log_if_error("system.column_chunks table")
|
||||
}
|
||||
}
|
||||
|
||||
fn chunk_columns_schema() -> SchemaRef {
|
||||
Arc::new(Schema::new(vec![
|
||||
Field::new("partition_key", DataType::Utf8, false),
|
||||
Field::new("chunk_id", DataType::UInt32, false),
|
||||
Field::new("table_name", DataType::Utf8, false),
|
||||
Field::new("column_name", DataType::Utf8, false),
|
||||
Field::new("storage", DataType::Utf8, false),
|
||||
Field::new("row_count", DataType::UInt64, true),
|
||||
Field::new("min_value", DataType::Utf8, true),
|
||||
Field::new("max_value", DataType::Utf8, true),
|
||||
Field::new("memory_bytes", DataType::UInt64, true),
|
||||
]))
|
||||
}
|
||||
|
||||
fn assemble_chunk_columns(
|
||||
schema: SchemaRef,
|
||||
chunk_summaries: Vec<(Arc<TableSummary>, DetailedChunkSummary)>,
|
||||
) -> Result<RecordBatch> {
|
||||
/// Builds an index from column_name -> size
|
||||
fn make_column_index(summary: &DetailedChunkSummary) -> HashMap<&str, u64> {
|
||||
summary
|
||||
.columns
|
||||
.iter()
|
||||
.map(|column_summary| {
|
||||
(
|
||||
column_summary.name.as_ref(),
|
||||
column_summary.memory_bytes as u64,
|
||||
)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
// Assume each chunk has roughly 5 columns
|
||||
let row_estimate = chunk_summaries.len() * 5;
|
||||
|
||||
let mut partition_key = StringBuilder::new(row_estimate);
|
||||
let mut chunk_id = UInt32Builder::new(row_estimate);
|
||||
let mut table_name = StringBuilder::new(row_estimate);
|
||||
let mut column_name = StringBuilder::new(row_estimate);
|
||||
let mut storage = StringBuilder::new(row_estimate);
|
||||
let mut row_count = UInt64Builder::new(row_estimate);
|
||||
let mut min_values = StringBuilder::new(row_estimate);
|
||||
let mut max_values = StringBuilder::new(row_estimate);
|
||||
let mut memory_bytes = UInt64Builder::new(row_estimate);
|
||||
|
||||
// Note no rows are produced for partitions with no chunks, or
|
||||
// tables with no partitions: There are other tables to list tables
|
||||
// and columns
|
||||
for (table_summary, chunk_summary) in chunk_summaries {
|
||||
let mut column_index = make_column_index(&chunk_summary);
|
||||
let storage_value = chunk_summary.inner.storage.as_str();
|
||||
|
||||
for column in &table_summary.columns {
|
||||
partition_key.append_value(chunk_summary.inner.partition_key.as_ref())?;
|
||||
chunk_id.append_value(chunk_summary.inner.id)?;
|
||||
table_name.append_value(&chunk_summary.inner.table_name)?;
|
||||
column_name.append_value(&column.name)?;
|
||||
storage.append_value(storage_value)?;
|
||||
row_count.append_value(column.count())?;
|
||||
if let Some(v) = column.stats.min_as_str() {
|
||||
min_values.append_value(v)?;
|
||||
} else {
|
||||
min_values.append(false)?;
|
||||
}
|
||||
if let Some(v) = column.stats.max_as_str() {
|
||||
max_values.append_value(v)?;
|
||||
} else {
|
||||
max_values.append(false)?;
|
||||
}
|
||||
|
||||
let size = column_index.remove(column.name.as_str());
|
||||
|
||||
memory_bytes.append_option(size)?;
|
||||
}
|
||||
}
|
||||
|
||||
RecordBatch::try_new(
|
||||
schema,
|
||||
vec![
|
||||
Arc::new(partition_key.finish()) as ArrayRef,
|
||||
Arc::new(chunk_id.finish()),
|
||||
Arc::new(table_name.finish()),
|
||||
Arc::new(column_name.finish()),
|
||||
Arc::new(storage.finish()),
|
||||
Arc::new(row_count.finish()),
|
||||
Arc::new(min_values.finish()),
|
||||
Arc::new(max_values.finish()),
|
||||
Arc::new(memory_bytes.finish()),
|
||||
],
|
||||
)
|
||||
}
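`assemble_chunk_columns` joins the per-table column summaries against the per-chunk size index by column name, consuming each index entry as it is used. The lookup logic in isolation needs only the standard library; the sizes below are made up:

    use std::collections::HashMap;

    /// Mirror of the make_column_index + remove lookup above: build a
    /// name -> size index once per chunk, then consume entries as the
    /// columns are emitted so each size is used at most once.
    fn sizes_in_order<'a>(column_names: &[&'a str], sizes: &[(&'a str, u64)]) -> Vec<Option<u64>> {
        let mut index: HashMap<&str, u64> = sizes.iter().copied().collect();
        column_names.iter().map(|name| index.remove(name)).collect()
    }

    #[test]
    fn lookup_matches_by_name() {
        let sizes = [("c1", 11), ("c2", 12)];
        assert_eq!(
            sizes_in_order(&["c1", "c2", "c3"], &sizes),
            vec![Some(11), Some(12), None]
        );
    }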
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use arrow_util::assert_batches_eq;
|
||||
use data_types::chunk_metadata::{ChunkColumnSummary, ChunkStorage, ChunkSummary};
|
||||
use data_types::partition_metadata::{ColumnSummary, InfluxDbType, StatValues, Statistics};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_from_partition_summaries() {
|
||||
let partitions = vec![
|
||||
PartitionSummary {
|
||||
key: "p1".to_string(),
|
||||
table: TableSummary {
|
||||
name: "t1".to_string(),
|
||||
columns: vec![
|
||||
ColumnSummary {
|
||||
name: "c1".to_string(),
|
||||
influxdb_type: Some(InfluxDbType::Tag),
|
||||
stats: Statistics::I64(StatValues::new_with_value(23)),
|
||||
},
|
||||
ColumnSummary {
|
||||
name: "c2".to_string(),
|
||||
influxdb_type: Some(InfluxDbType::Field),
|
||||
stats: Statistics::I64(StatValues::new_with_value(43)),
|
||||
},
|
||||
ColumnSummary {
|
||||
name: "c3".to_string(),
|
||||
influxdb_type: None,
|
||||
stats: Statistics::String(StatValues::new_with_value(
|
||||
"foo".to_string(),
|
||||
)),
|
||||
},
|
||||
ColumnSummary {
|
||||
name: "time".to_string(),
|
||||
influxdb_type: Some(InfluxDbType::Timestamp),
|
||||
stats: Statistics::I64(StatValues::new_with_value(43)),
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
PartitionSummary {
|
||||
key: "p3".to_string(),
|
||||
table: TableSummary {
|
||||
name: "t1".to_string(),
|
||||
columns: vec![],
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
let expected = vec![
|
||||
"+---------------+------------+-------------+-------------+---------------+",
|
||||
"| partition_key | table_name | column_name | column_type | influxdb_type |",
|
||||
"+---------------+------------+-------------+-------------+---------------+",
|
||||
"| p1 | t1 | c1 | I64 | Tag |",
|
||||
"| p1 | t1 | c2 | I64 | Field |",
|
||||
"| p1 | t1 | c3 | String | |",
|
||||
"| p1 | t1 | time | I64 | Timestamp |",
|
||||
"+---------------+------------+-------------+-------------+---------------+",
|
||||
];
|
||||
|
||||
let batch = from_partition_summaries(partition_summaries_schema(), partitions).unwrap();
|
||||
assert_batches_eq!(&expected, &[batch]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_assemble_chunk_columns() {
|
||||
let lifecycle_action = None;
|
||||
|
||||
let summaries = vec![
|
||||
(
|
||||
Arc::new(TableSummary {
|
||||
name: "t1".to_string(),
|
||||
columns: vec![
|
||||
ColumnSummary {
|
||||
name: "c1".to_string(),
|
||||
influxdb_type: Some(InfluxDbType::Field),
|
||||
stats: Statistics::String(StatValues::new(
|
||||
Some("bar".to_string()),
|
||||
Some("foo".to_string()),
|
||||
55,
|
||||
)),
|
||||
},
|
||||
ColumnSummary {
|
||||
name: "c2".to_string(),
|
||||
influxdb_type: Some(InfluxDbType::Field),
|
||||
stats: Statistics::F64(StatValues::new(Some(11.0), Some(43.0), 66)),
|
||||
},
|
||||
],
|
||||
}),
|
||||
DetailedChunkSummary {
|
||||
inner: ChunkSummary {
|
||||
partition_key: "p1".into(),
|
||||
table_name: "t1".into(),
|
||||
id: 42,
|
||||
storage: ChunkStorage::ReadBuffer,
|
||||
lifecycle_action,
|
||||
memory_bytes: 23754,
|
||||
object_store_bytes: 0,
|
||||
row_count: 11,
|
||||
time_of_first_write: None,
|
||||
time_of_last_write: None,
|
||||
time_closed: None,
|
||||
},
|
||||
columns: vec![
|
||||
ChunkColumnSummary {
|
||||
name: "c1".into(),
|
||||
memory_bytes: 11,
|
||||
},
|
||||
ChunkColumnSummary {
|
||||
name: "c2".into(),
|
||||
memory_bytes: 12,
|
||||
},
|
||||
],
|
||||
},
|
||||
),
|
||||
(
|
||||
Arc::new(TableSummary {
|
||||
name: "t1".to_string(),
|
||||
columns: vec![ColumnSummary {
|
||||
name: "c1".to_string(),
|
||||
influxdb_type: Some(InfluxDbType::Field),
|
||||
stats: Statistics::F64(StatValues::new(Some(110.0), Some(430.0), 667)),
|
||||
}],
|
||||
}),
|
||||
DetailedChunkSummary {
|
||||
inner: ChunkSummary {
|
||||
partition_key: "p2".into(),
|
||||
table_name: "t1".into(),
|
||||
id: 43,
|
||||
storage: ChunkStorage::OpenMutableBuffer,
|
||||
lifecycle_action,
|
||||
memory_bytes: 23754,
|
||||
object_store_bytes: 0,
|
||||
row_count: 11,
|
||||
time_of_first_write: None,
|
||||
time_of_last_write: None,
|
||||
time_closed: None,
|
||||
},
|
||||
columns: vec![ChunkColumnSummary {
|
||||
name: "c1".into(),
|
||||
memory_bytes: 100,
|
||||
}],
|
||||
},
|
||||
),
|
||||
(
|
||||
Arc::new(TableSummary {
|
||||
name: "t2".to_string(),
|
||||
columns: vec![ColumnSummary {
|
||||
name: "c3".to_string(),
|
||||
influxdb_type: Some(InfluxDbType::Field),
|
||||
stats: Statistics::F64(StatValues::new(Some(-1.0), Some(2.0), 4)),
|
||||
}],
|
||||
}),
|
||||
DetailedChunkSummary {
|
||||
inner: ChunkSummary {
|
||||
partition_key: "p2".into(),
|
||||
table_name: "t2".into(),
|
||||
id: 44,
|
||||
storage: ChunkStorage::OpenMutableBuffer,
|
||||
lifecycle_action,
|
||||
memory_bytes: 23754,
|
||||
object_store_bytes: 0,
|
||||
row_count: 11,
|
||||
time_of_first_write: None,
|
||||
time_of_last_write: None,
|
||||
time_closed: None,
|
||||
},
|
||||
columns: vec![ChunkColumnSummary {
|
||||
name: "c3".into(),
|
||||
memory_bytes: 200,
|
||||
}],
|
||||
},
|
||||
),
|
||||
];
|
||||
|
||||
let expected = vec![
|
||||
"+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+",
|
||||
"| partition_key | chunk_id | table_name | column_name | storage | row_count | min_value | max_value | memory_bytes |",
|
||||
"+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+",
|
||||
"| p1 | 42 | t1 | c1 | ReadBuffer | 55 | bar | foo | 11 |",
|
||||
"| p1 | 42 | t1 | c2 | ReadBuffer | 66 | 11 | 43 | 12 |",
|
||||
"| p2 | 43 | t1 | c1 | OpenMutableBuffer | 667 | 110 | 430 | 100 |",
|
||||
"| p2 | 44 | t2 | c3 | OpenMutableBuffer | 4 | -1 | 2 | 200 |",
|
||||
"+---------------+----------+------------+-------------+-------------------+-----------+-----------+-----------+--------------+",
|
||||
];
|
||||
|
||||
let batch = assemble_chunk_columns(chunk_columns_schema(), summaries).unwrap();
|
||||
assert_batches_eq!(&expected, &[batch]);
|
||||
}
|
||||
}
@ -0,0 +1,108 @@
|
|||
use std::sync::Arc;
|
||||
|
||||
use arrow::array::{ArrayRef, StringArray, Time64NanosecondArray, UInt32Array};
|
||||
use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit};
|
||||
use arrow::error::Result;
|
||||
use arrow::record_batch::RecordBatch;
|
||||
|
||||
use data_types::error::ErrorLogger;
|
||||
use data_types::job::Job;
|
||||
use tracker::TaskTracker;
|
||||
|
||||
use crate::db::system_tables::IoxSystemTable;
|
||||
use crate::JobRegistry;
|
||||
|
||||
/// Implementation of system.operations table
|
||||
#[derive(Debug)]
|
||||
pub(super) struct OperationsTable {
|
||||
schema: SchemaRef,
|
||||
db_name: String,
|
||||
jobs: Arc<JobRegistry>,
|
||||
}
|
||||
|
||||
impl OperationsTable {
|
||||
pub(super) fn new(db_name: String, jobs: Arc<JobRegistry>) -> Self {
|
||||
Self {
|
||||
schema: operations_schema(),
|
||||
db_name,
|
||||
jobs,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IoxSystemTable for OperationsTable {
|
||||
fn schema(&self) -> SchemaRef {
|
||||
Arc::clone(&self.schema)
|
||||
}
|
||||
|
||||
fn batch(&self) -> Result<RecordBatch> {
|
||||
from_task_trackers(self.schema(), &self.db_name, self.jobs.tracked())
|
||||
.log_if_error("system.operations table")
|
||||
}
|
||||
}
|
||||
|
||||
fn operations_schema() -> SchemaRef {
|
||||
let ts = DataType::Time64(TimeUnit::Nanosecond);
|
||||
Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Utf8, false),
|
||||
Field::new("status", DataType::Utf8, true),
|
||||
Field::new("cpu_time_used", ts.clone(), true),
|
||||
Field::new("wall_time_used", ts, true),
|
||||
Field::new("partition_key", DataType::Utf8, true),
|
||||
Field::new("chunk_id", DataType::UInt32, true),
|
||||
Field::new("description", DataType::Utf8, true),
|
||||
]))
|
||||
}
|
||||
|
||||
fn from_task_trackers(
|
||||
schema: SchemaRef,
|
||||
db_name: &str,
|
||||
jobs: Vec<TaskTracker<Job>>,
|
||||
) -> Result<RecordBatch> {
|
||||
let jobs = jobs
|
||||
.into_iter()
|
||||
.filter(|job| job.metadata().db_name() == Some(db_name))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let ids = jobs
|
||||
.iter()
|
||||
.map(|job| Some(job.id().to_string()))
|
||||
.collect::<StringArray>();
|
||||
let statuses = jobs
|
||||
.iter()
|
||||
.map(|job| Some(job.get_status().name()))
|
||||
.collect::<StringArray>();
|
||||
let cpu_time_used = jobs
|
||||
.iter()
|
||||
.map(|job| job.get_status().cpu_nanos().map(|n| n as i64))
|
||||
.collect::<Time64NanosecondArray>();
|
||||
let wall_time_used = jobs
|
||||
.iter()
|
||||
.map(|job| job.get_status().wall_nanos().map(|n| n as i64))
|
||||
.collect::<Time64NanosecondArray>();
|
||||
let partition_keys = jobs
|
||||
.iter()
|
||||
.map(|job| job.metadata().partition_key())
|
||||
.collect::<StringArray>();
|
||||
let chunk_ids = jobs
|
||||
.iter()
|
||||
.map(|job| job.metadata().chunk_id())
|
||||
.collect::<UInt32Array>();
|
||||
let descriptions = jobs
|
||||
.iter()
|
||||
.map(|job| Some(job.metadata().description()))
|
||||
.collect::<StringArray>();
|
||||
|
||||
RecordBatch::try_new(
|
||||
schema,
|
||||
vec![
|
||||
Arc::new(ids) as ArrayRef,
|
||||
Arc::new(statuses),
|
||||
Arc::new(cpu_time_used),
|
||||
Arc::new(wall_time_used),
|
||||
Arc::new(partition_keys),
|
||||
Arc::new(chunk_ids),
|
||||
Arc::new(descriptions),
|
||||
],
|
||||
)
|
||||
}
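The operations table stores optional CPU and wall durations as `Time64(Nanosecond)` values. Isolated from the job registry, that column is just an iterator of `Option<i64>` nanoseconds; the helper below is a hypothetical reduction of the cpu_time_used / wall_time_used collection step:

    use arrow::array::Time64NanosecondArray;

    /// Collect optional nanosecond durations the way from_task_trackers does.
    fn durations_column(nanos: &[Option<u64>]) -> Time64NanosecondArray {
        nanos
            .iter()
            .map(|n| n.map(|v| v as i64))
            .collect::<Time64NanosecondArray>()
    }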
@ -0,0 +1,154 @@
|
|||
use std::sync::Arc;
|
||||
|
||||
use arrow::array::{StringArray, TimestampNanosecondArray, UInt64Array};
|
||||
use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit};
|
||||
use arrow::error::Result;
|
||||
use arrow::record_batch::RecordBatch;
|
||||
|
||||
use data_types::error::ErrorLogger;
|
||||
use data_types::partition_metadata::PartitionAddr;
|
||||
use data_types::write_summary::WriteSummary;
|
||||
|
||||
use crate::db::catalog::Catalog;
|
||||
use crate::db::system_tables::IoxSystemTable;
|
||||
|
||||
/// Implementation of system.persistence_windows table
|
||||
#[derive(Debug)]
|
||||
pub(super) struct PersistenceWindowsTable {
|
||||
schema: SchemaRef,
|
||||
catalog: Arc<Catalog>,
|
||||
}
|
||||
|
||||
impl PersistenceWindowsTable {
|
||||
pub(super) fn new(catalog: Arc<Catalog>) -> Self {
|
||||
Self {
|
||||
schema: persistence_windows_schema(),
|
||||
catalog,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IoxSystemTable for PersistenceWindowsTable {
|
||||
fn schema(&self) -> SchemaRef {
|
||||
Arc::clone(&self.schema)
|
||||
}
|
||||
|
||||
fn batch(&self) -> Result<RecordBatch> {
|
||||
from_write_summaries(self.schema(), self.catalog.persistence_summaries())
|
||||
.log_if_error("system.persistence_windows table")
|
||||
}
|
||||
}
|
||||
|
||||
fn persistence_windows_schema() -> SchemaRef {
|
||||
let ts = DataType::Timestamp(TimeUnit::Nanosecond, None);
|
||||
Arc::new(Schema::new(vec![
|
||||
Field::new("partition_key", DataType::Utf8, false),
|
||||
Field::new("table_name", DataType::Utf8, false),
|
||||
Field::new("row_count", DataType::UInt64, false),
|
||||
Field::new("time_of_first_write", ts.clone(), false),
|
||||
Field::new("time_of_last_write", ts.clone(), false),
|
||||
Field::new("min_timestamp", ts.clone(), false),
|
||||
Field::new("max_timestamp", ts, false),
|
||||
]))
|
||||
}
|
||||
|
||||
fn from_write_summaries(
|
||||
schema: SchemaRef,
|
||||
chunks: Vec<(PartitionAddr, WriteSummary)>,
|
||||
) -> Result<RecordBatch> {
|
||||
let partition_key = chunks
|
||||
.iter()
|
||||
.map(|(addr, _)| Some(addr.partition_key.as_ref()))
|
||||
.collect::<StringArray>();
|
||||
let table_name = chunks
|
||||
.iter()
|
||||
.map(|(addr, _)| Some(addr.table_name.as_ref()))
|
||||
.collect::<StringArray>();
|
||||
let row_counts = chunks
|
||||
.iter()
|
||||
.map(|(_, w)| Some(w.row_count as u64))
|
||||
.collect::<UInt64Array>();
|
||||
let time_of_first_write = chunks
|
||||
.iter()
|
||||
.map(|(_, w)| Some(w.time_of_first_write.timestamp_nanos()))
|
||||
.collect::<TimestampNanosecondArray>();
|
||||
let time_of_last_write = chunks
|
||||
.iter()
|
||||
.map(|(_, w)| Some(w.time_of_last_write.timestamp_nanos()))
|
||||
.collect::<TimestampNanosecondArray>();
|
||||
let min_timestamp = chunks
|
||||
.iter()
|
||||
.map(|(_, w)| Some(w.min_timestamp.timestamp_nanos()))
|
||||
.collect::<TimestampNanosecondArray>();
|
||||
let max_timestamp = chunks
|
||||
.iter()
|
||||
.map(|(_, w)| Some(w.max_timestamp.timestamp_nanos()))
|
||||
.collect::<TimestampNanosecondArray>();
|
||||
|
||||
RecordBatch::try_new(
|
||||
schema,
|
||||
vec![
|
||||
Arc::new(partition_key),
|
||||
Arc::new(table_name),
|
||||
Arc::new(row_counts),
|
||||
Arc::new(time_of_first_write),
|
||||
Arc::new(time_of_last_write),
|
||||
Arc::new(min_timestamp),
|
||||
Arc::new(max_timestamp),
|
||||
],
|
||||
)
|
||||
}
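The persistence_windows columns are chrono `DateTime<Utc>` values converted with `timestamp_nanos()`. In isolation the conversion is a one-liner; the fixed timestamps in `main` are only for illustration:

    use arrow::array::TimestampNanosecondArray;
    use chrono::{DateTime, TimeZone, Utc};

    /// Convert chrono timestamps into the nanosecond values stored in the
    /// system.persistence_windows columns.
    fn to_timestamp_column(times: &[DateTime<Utc>]) -> TimestampNanosecondArray {
        times
            .iter()
            .map(|t| Some(t.timestamp_nanos()))
            .collect::<TimestampNanosecondArray>()
    }

    fn main() {
        let col = to_timestamp_column(&[Utc.timestamp_nanos(0), Utc.timestamp_nanos(20)]);
        assert_eq!(col.value(1), 20);
    }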
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use chrono::{TimeZone, Utc};
|
||||
|
||||
use arrow_util::assert_batches_eq;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_from_write_summaries() {
|
||||
let addr = PartitionAddr {
|
||||
db_name: Arc::from("db"),
|
||||
table_name: Arc::from("table"),
|
||||
partition_key: Arc::from("partition"),
|
||||
};
|
||||
|
||||
let summaries = vec![
|
||||
(
|
||||
addr.clone(),
|
||||
WriteSummary {
|
||||
time_of_first_write: Utc.timestamp_nanos(0),
|
||||
time_of_last_write: Utc.timestamp_nanos(20),
|
||||
min_timestamp: Utc.timestamp_nanos(50),
|
||||
max_timestamp: Utc.timestamp_nanos(60),
|
||||
row_count: 320,
|
||||
},
|
||||
),
|
||||
(
|
||||
addr,
|
||||
WriteSummary {
|
||||
time_of_first_write: Utc.timestamp_nanos(6),
|
||||
time_of_last_write: Utc.timestamp_nanos(21),
|
||||
min_timestamp: Utc.timestamp_nanos(1),
|
||||
max_timestamp: Utc.timestamp_nanos(2),
|
||||
row_count: 2,
|
||||
},
|
||||
),
|
||||
];
|
||||
|
||||
let expected = vec![
|
||||
"+---------------+------------+-----------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+",
|
||||
"| partition_key | table_name | row_count | time_of_first_write | time_of_last_write | min_timestamp | max_timestamp |",
|
||||
"+---------------+------------+-----------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+",
|
||||
"| partition | table | 320 | 1970-01-01 00:00:00 | 1970-01-01 00:00:00.000000020 | 1970-01-01 00:00:00.000000050 | 1970-01-01 00:00:00.000000060 |",
|
||||
"| partition | table | 2 | 1970-01-01 00:00:00.000000006 | 1970-01-01 00:00:00.000000021 | 1970-01-01 00:00:00.000000001 | 1970-01-01 00:00:00.000000002 |",
|
||||
"+---------------+------------+-----------+-------------------------------+-------------------------------+-------------------------------+-------------------------------+",
|
||||
];
|
||||
|
||||
let schema = persistence_windows_schema();
|
||||
let batch = from_write_summaries(schema, summaries).unwrap();
|
||||
assert_batches_eq!(&expected, &[batch]);
|
||||
}
|
||||
}
@ -2,29 +2,19 @@
|
|||
use data_types::{
|
||||
database_rules::{DatabaseRules, WriteBufferConnection},
|
||||
database_state::DatabaseStateCode,
|
||||
server_id::ServerId,
|
||||
error::ErrorLogger,
|
||||
DatabaseName,
|
||||
};
|
||||
use futures::TryStreamExt;
|
||||
use generated_types::database_rules::decode_database_rules;
|
||||
use internal_types::once::OnceNonZeroU32;
|
||||
use object_store::{
|
||||
path::{parsed::DirsAndFileName, ObjectStorePath, Path},
|
||||
ObjectStore, ObjectStoreApi,
|
||||
};
|
||||
use observability_deps::tracing::{debug, error, info, warn};
|
||||
use parking_lot::Mutex;
|
||||
use observability_deps::tracing::{error, info, warn};
|
||||
use parquet_file::catalog::PreservedCatalog;
|
||||
use query::exec::Executor;
|
||||
use snafu::{OptionExt, ResultExt, Snafu};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
sync::{
|
||||
atomic::{AtomicBool, Ordering},
|
||||
Arc,
|
||||
},
|
||||
};
|
||||
use tokio::sync::Semaphore;
|
||||
use snafu::{ResultExt, Snafu};
|
||||
use std::sync::Arc;
|
||||
use write_buffer::config::WriteBufferConfig;
|
||||
|
||||
use crate::{
|
||||
|
@ -45,9 +35,6 @@ pub enum Error {
|
|||
source: generated_types::database_rules::DecodeError,
|
||||
},
|
||||
|
||||
#[snafu(display("id already set"))]
|
||||
IdAlreadySet { id: ServerId },
|
||||
|
||||
#[snafu(display("unable to use server until id is set"))]
|
||||
IdNotSet,
|
||||
|
||||
|
@ -97,472 +84,254 @@ pub enum Error {
|
|||
|
||||
pub type Result<T, E = Error> = std::result::Result<T, E>;
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct CurrentServerId(OnceNonZeroU32);
|
||||
/// Loads the database configurations based on the databases in the
/// object store. Databases that are already in the config won't be
/// replaced.
|
||||
///
|
||||
/// Returns a Vec containing the results of loading the contained databases
|
||||
pub(crate) async fn initialize_server(
|
||||
config: Arc<Config>,
|
||||
wipe_on_error: bool,
|
||||
) -> Result<Vec<(DatabaseName<'static>, Result<()>)>> {
|
||||
let root = config.root_path();
|
||||
|
||||
impl CurrentServerId {
|
||||
pub fn set(&self, id: ServerId) -> Result<()> {
|
||||
let id = id.get();
|
||||
// get the database names from the object store prefixes
|
||||
// TODO: update object store to pull back all common prefixes by
|
||||
// following the next tokens.
|
||||
let list_result = config
|
||||
.object_store()
|
||||
.list_with_delimiter(&root)
|
||||
.await
|
||||
.context(StoreError)?;
|
||||
|
||||
match self.0.set(id) {
|
||||
Ok(()) => {
|
||||
info!(server_id = id, "server ID set");
|
||||
Ok(())
|
||||
}
|
||||
Err(id) => Err(Error::IdAlreadySet {
|
||||
id: ServerId::new(id),
|
||||
}),
|
||||
}
|
||||
}
|
||||
let handles: Vec<_> = list_result
|
||||
.common_prefixes
|
||||
.into_iter()
|
||||
.filter_map(|mut path| {
|
||||
let config = Arc::clone(&config);
|
||||
let root = root.clone();
|
||||
path.set_file_name(DB_RULES_FILE_NAME);
|
||||
let db_name = db_name_from_rules_path(&path)
|
||||
.log_if_error("invalid database path")
|
||||
.ok()?;
|
||||
|
||||
pub fn get(&self) -> Result<ServerId> {
|
||||
self.0.get().map(ServerId::new).context(IdNotSet)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct InitStatus {
|
||||
pub server_id: CurrentServerId,
|
||||
|
||||
/// Flags that databases are loaded and server is ready to read/write data.
|
||||
initialized: AtomicBool,
|
||||
|
||||
/// Semaphore that limits the number of jobs that load DBs when the serverID is set.
|
||||
///
|
||||
/// Note that this semaphore is more of a "lock" than an arbitrary semaphore. All the other sync structures (mutex,
|
||||
/// rwlock) require something to be wrapped which we don't have in our case, so we're using a semaphore here. We
|
||||
/// want exactly 1 background worker to mess with the server init / DB loading, otherwise everything in the critical
|
||||
/// section (in [`maybe_initialize_server`](Self::maybe_initialize_server)) will break apart. So this semaphore
|
||||
/// cannot be configured.
|
||||
initialize_semaphore: Semaphore,
|
||||
|
||||
/// Error occurred during generic server init (e.g. listing store content).
|
||||
error_generic: Mutex<Option<Arc<Error>>>,
|
||||
|
||||
/// Errors that occurred during some DB init.
|
||||
errors_databases: Arc<Mutex<HashMap<String, Arc<Error>>>>,
|
||||
|
||||
/// Automatic wipe-on-error recovery
|
||||
///
|
||||
/// See <https://github.com/influxdata/influxdb_iox/issues/1522>
|
||||
pub(crate) wipe_on_error: AtomicBool,
|
||||
}
|
||||
|
||||
impl InitStatus {
|
||||
/// Create new "not initialized" status.
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
server_id: Default::default(),
|
||||
initialized: AtomicBool::new(false),
|
||||
// Always set semaphore permits to `1`, see design comments in `Server::initialize_semaphore`.
|
||||
initialize_semaphore: Semaphore::new(1),
|
||||
error_generic: Default::default(),
|
||||
errors_databases: Default::default(),
|
||||
wipe_on_error: AtomicBool::new(true),
|
||||
}
|
||||
}
|
||||
|
||||
/// Base location in object store for this writer.
|
||||
pub fn root_path(&self, store: &ObjectStore) -> Result<Path> {
|
||||
let id = self.server_id.get()?;
|
||||
|
||||
let mut path = store.new_path();
|
||||
path.push_dir(format!("{}", id));
|
||||
Ok(path)
|
||||
}
|
||||
|
||||
/// Check if server is loaded. Databases are loaded and server is ready to read/write.
|
||||
pub fn initialized(&self) -> bool {
|
||||
// Need `Acquire` ordering because if we read a `true` here, this thread will likely also read data that
// `maybe_initialize_server` wrote before toggling the flag with `Release`. The `Acquire` ordering here ensures that
// every data access AFTER the following line will also stay AFTER this line.
|
||||
self.initialized.load(Ordering::Acquire)
|
||||
}
|
||||
|
||||
/// Error occurred during generic server init (e.g. listing store content).
|
||||
pub fn error_generic(&self) -> Option<Arc<Error>> {
|
||||
let guard = self.error_generic.lock();
|
||||
guard.clone()
|
||||
}
|
||||
|
||||
/// List all databases with errors in sorted order.
|
||||
pub fn databases_with_errors(&self) -> Vec<String> {
|
||||
let guard = self.errors_databases.lock();
|
||||
let mut names: Vec<_> = guard.keys().cloned().collect();
|
||||
names.sort();
|
||||
names
|
||||
}
|
||||
|
||||
/// Error that occurred during initialization of a specific database.
|
||||
pub fn error_database(&self, db_name: &str) -> Option<Arc<Error>> {
|
||||
let guard = self.errors_databases.lock();
|
||||
guard.get(db_name).cloned()
|
||||
}
|
||||
|
||||
/// Loads the database configurations based on the databases in the
/// object store. Databases that are already in the config won't be
/// replaced.
|
||||
///
|
||||
/// This requires the serverID to be set (will be a no-op otherwise).
|
||||
///
|
||||
/// It will be a no-op if the configs are already loaded and the server is ready.
|
||||
pub(crate) async fn maybe_initialize_server(
|
||||
&self,
|
||||
store: Arc<ObjectStore>,
|
||||
config: Arc<Config>,
|
||||
exec: Arc<Executor>,
|
||||
) {
|
||||
let server_id = match self.server_id.get() {
|
||||
Ok(id) => id,
|
||||
Err(e) => {
|
||||
debug!(%e, "cannot initialize server because cannot get serverID");
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
let _guard = self
|
||||
.initialize_semaphore
|
||||
.acquire()
|
||||
.await
|
||||
.expect("semaphore should not be closed");
|
||||
|
||||
// Note that we use Acquire-Release ordering for the atomic within the semaphore to ensure that another thread
|
||||
// that enters this semaphore after we've left actually sees the correct `is_ready` flag.
|
||||
if self.initialized.load(Ordering::Acquire) {
|
||||
// already loaded, so do nothing
|
||||
return;
|
||||
}
|
||||
|
||||
// Check if there was a previous failed attempt
|
||||
if self.error_generic().is_some() {
|
||||
return;
|
||||
}
|
||||
|
||||
match self
|
||||
.maybe_initialize_server_inner(store, config, exec, server_id)
|
||||
.await
|
||||
{
|
||||
Ok(_) => {
|
||||
// mark as ready (use correct ordering for Acquire-Release)
|
||||
self.initialized.store(true, Ordering::Release);
|
||||
info!("loaded databases, server is initalized");
|
||||
}
|
||||
Err(e) => {
|
||||
error!(%e, "error during server init");
|
||||
let mut guard = self.error_generic.lock();
|
||||
*guard = Some(Arc::new(e));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn maybe_initialize_server_inner(
|
||||
&self,
|
||||
store: Arc<ObjectStore>,
|
||||
config: Arc<Config>,
|
||||
exec: Arc<Executor>,
|
||||
server_id: ServerId,
|
||||
) -> Result<()> {
|
||||
let root = self.root_path(&store)?;
|
||||
|
||||
// get the database names from the object store prefixes
|
||||
// TODO: update object store to pull back all common prefixes by
|
||||
// following the next tokens.
|
||||
let list_result = store.list_with_delimiter(&root).await.context(StoreError)?;
|
||||
|
||||
let handles: Vec<_> = list_result
|
||||
.common_prefixes
|
||||
.into_iter()
|
||||
.filter_map(|mut path| {
|
||||
let store = Arc::clone(&store);
|
||||
let config = Arc::clone(&config);
|
||||
let exec = Arc::clone(&exec);
|
||||
let errors_databases = Arc::clone(&self.errors_databases);
|
||||
let wipe_on_error = self.wipe_on_error.load(Ordering::Relaxed);
|
||||
let root = root.clone();
|
||||
|
||||
path.set_file_name(DB_RULES_FILE_NAME);
|
||||
|
||||
match db_name_from_rules_path(&path) {
|
||||
Ok(db_name) => {
|
||||
let handle = tokio::task::spawn(async move {
|
||||
match Self::initialize_database(
|
||||
server_id,
|
||||
store,
|
||||
config,
|
||||
exec,
|
||||
root,
|
||||
db_name.clone(),
|
||||
wipe_on_error,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(()) => {
|
||||
info!(%db_name, "database initialized");
|
||||
}
|
||||
Err(e) => {
|
||||
error!(%e, %db_name, "cannot load database");
|
||||
let mut guard = errors_databases.lock();
|
||||
guard.insert(db_name.to_string(), Arc::new(e));
|
||||
}
|
||||
}
|
||||
});
|
||||
Some(handle)
|
||||
}
|
||||
Err(e) => {
|
||||
error!(%e, "invalid database path");
|
||||
None
|
||||
}
|
||||
}
|
||||
Some(async move {
|
||||
let result =
|
||||
initialize_database(config, root, db_name.clone(), wipe_on_error).await;
|
||||
(db_name, result)
|
||||
})
|
||||
.collect();
|
||||
})
|
||||
.collect();
|
||||
|
||||
futures::future::join_all(handles).await;
|
||||
Ok(futures::future::join_all(handles).await)
|
||||
}
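The new `initialize_server` fans out one future per discovered database and waits for all of them with `futures::future::join_all`, keeping each database name paired with its result. The concurrency shape on its own, with placeholder names and a trivially successful init routine, might be sketched as:

    use futures::future::join_all;

    /// Stand-in for a per-database init routine; always succeeds here.
    async fn init_one(name: String) -> (String, Result<(), String>) {
        (name, Ok(()))
    }

    /// Run all inits concurrently and collect (name, result) pairs,
    /// in the spirit of initialize_server above.
    async fn init_all(names: Vec<String>) -> Vec<(String, Result<(), String>)> {
        let futures: Vec<_> = names.into_iter().map(init_one).collect();
        join_all(futures).await
    }

    fn main() {
        let results = futures::executor::block_on(init_all(vec!["db1".into(), "db2".into()]));
        assert!(results.iter().all(|(_, result)| result.is_ok()));
    }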
|
||||
|
||||
async fn initialize_database(
|
||||
config: Arc<Config>,
|
||||
root: Path,
|
||||
db_name: DatabaseName<'static>,
|
||||
wipe_on_error: bool,
|
||||
) -> Result<()> {
|
||||
// Reserve name before expensive IO (e.g. loading the preserved catalog)
|
||||
let mut handle = config
|
||||
.create_db(db_name)
|
||||
.map_err(Box::new)
|
||||
.context(InitDbError)?;
|
||||
|
||||
match try_advance_database_init_process_until_complete(&mut handle, &root, wipe_on_error).await
|
||||
{
|
||||
Ok(true) => {
|
||||
// finished init and keep DB
|
||||
handle.commit();
|
||||
Ok(())
|
||||
}
|
||||
Ok(false) => {
|
||||
// finished but do not keep DB
|
||||
handle.abort();
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
// encountered some error, still commit intermediate result
|
||||
handle.commit();
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn load_database_rules(store: Arc<ObjectStore>, path: Path) -> Result<Option<DatabaseRules>> {
|
||||
let serialized_rules = loop {
|
||||
match get_database_config_bytes(&path, &store).await {
|
||||
Ok(data) => break data,
|
||||
Err(e) => {
|
||||
if let Error::NoDatabaseConfigError { location } = &e {
|
||||
warn!(?location, "{}", e);
|
||||
return Ok(None);
|
||||
}
|
||||
error!(
|
||||
"error getting database config {:?} from object store: {}",
|
||||
path, e
|
||||
);
|
||||
tokio::time::sleep(tokio::time::Duration::from_secs(STORE_ERROR_PAUSE_SECONDS))
|
||||
.await;
|
||||
}
|
||||
}
|
||||
};
|
||||
let rules = decode_database_rules(serialized_rules.freeze())
|
||||
.context(ErrorDeserializingRulesProtobuf)?;
|
||||
|
||||
Ok(Some(rules))
|
||||
}
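`load_database_rules` keeps retrying transient object-store failures with a fixed pause and returns `Ok(None)` when no config file exists at all. A simplified synchronous sketch of that retry shape (the error type, pause length, and fetch closure are placeholders, not the real object-store API):

    use std::thread::sleep;
    use std::time::Duration;

    enum FetchError {
        /// No config present at all: treated as "nothing to load", not a failure.
        Missing,
        /// Transient failure worth retrying after a pause.
        Transient,
    }

    /// Retry a fetch until it either succeeds or reports that the object is
    /// simply absent, mirroring the loop in load_database_rules.
    fn fetch_with_retry(mut fetch: impl FnMut() -> Result<Vec<u8>, FetchError>) -> Option<Vec<u8>> {
        loop {
            match fetch() {
                Ok(bytes) => return Some(bytes),
                Err(FetchError::Missing) => return None,
                Err(FetchError::Transient) => sleep(Duration::from_millis(100)),
            }
        }
    }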
|
||||
|
||||
pub(crate) async fn wipe_preserved_catalog_and_maybe_recover(
|
||||
config: Arc<Config>,
|
||||
db_name: &DatabaseName<'static>,
|
||||
) -> Result<()> {
|
||||
let store = config.object_store();
|
||||
|
||||
if config.has_uninitialized_database(db_name) {
|
||||
let mut handle = config
|
||||
.recover_db(db_name.clone())
|
||||
.map_err(|e| Arc::new(e) as _)
|
||||
.context(RecoverDbError)?;
|
||||
|
||||
if !((handle.state_code() == DatabaseStateCode::Known)
|
||||
|| (handle.state_code() == DatabaseStateCode::RulesLoaded))
|
||||
{
|
||||
// cannot wipe because init state is already too far
|
||||
return Err(Error::DbPartiallyInitialized {
|
||||
db_name: db_name.to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
// wipe while holding handle so no other init/wipe process can interact with the catalog
|
||||
PreservedCatalog::wipe(&store, handle.server_id(), db_name)
|
||||
.await
|
||||
.map_err(Box::new)
|
||||
.context(PreservedCatalogWipeError)?;
|
||||
|
||||
let root = config.root_path();
|
||||
|
||||
let result =
|
||||
try_advance_database_init_process_until_complete(&mut handle, &root, true).await;
|
||||
|
||||
// Commit changes even if failed
|
||||
handle.commit();
|
||||
result.map(|_| ())
|
||||
} else {
|
||||
let handle = config
|
||||
.block_db(db_name.clone())
|
||||
.map_err(|e| Arc::new(e) as _)
|
||||
.context(RecoverDbError)?;
|
||||
|
||||
PreservedCatalog::wipe(&store, config.server_id(), db_name)
|
||||
.await
|
||||
.map_err(Box::new)
|
||||
.context(PreservedCatalogWipeError)?;
|
||||
|
||||
drop(handle);
|
||||
|
||||
info!(%db_name, "wiped preserved catalog of non-registered database");
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
async fn initialize_database(
|
||||
server_id: ServerId,
|
||||
store: Arc<ObjectStore>,
|
||||
config: Arc<Config>,
|
||||
exec: Arc<Executor>,
|
||||
root: Path,
|
||||
db_name: DatabaseName<'static>,
|
||||
wipe_on_error: bool,
|
||||
) -> Result<()> {
|
||||
// Reserve name before expensive IO (e.g. loading the preserved catalog)
|
||||
let mut handle = config
|
||||
.create_db(store, exec, server_id, db_name)
|
||||
.map_err(Box::new)
|
||||
.context(InitDbError)?;
|
||||
|
||||
match Self::try_advance_database_init_process_until_complete(
|
||||
&mut handle,
|
||||
&root,
|
||||
wipe_on_error,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(true) => {
|
||||
// finished init and keep DB
|
||||
handle.commit();
|
||||
Ok(())
|
||||
/// Try to make as much progress as possible with DB init.
///
/// Returns an error if there was an error along the way (in which case the handle should still be committed to save
/// the intermediate result). Returns `Ok(true)` if DB init is finished and `Ok(false)` if the DB can be forgotten
/// (e.g. because no rules file is present).
|
||||
async fn try_advance_database_init_process_until_complete(
|
||||
handle: &mut DatabaseHandle<'_>,
|
||||
root: &Path,
|
||||
wipe_on_error: bool,
|
||||
) -> Result<bool> {
|
||||
loop {
|
||||
match try_advance_database_init_process(handle, root, wipe_on_error).await? {
|
||||
InitProgress::Unfinished => {}
|
||||
InitProgress::Done => {
|
||||
return Ok(true);
|
||||
}
|
||||
Ok(false) => {
|
||||
// finished but do not keep DB
|
||||
handle.abort();
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
// encountered some error, still commit intermediate result
|
||||
handle.commit();
|
||||
Err(e)
|
||||
InitProgress::Forget => {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn load_database_rules(
|
||||
store: Arc<ObjectStore>,
|
||||
path: Path,
|
||||
) -> Result<Option<DatabaseRules>> {
|
||||
let serialized_rules = loop {
|
||||
match get_database_config_bytes(&path, &store).await {
|
||||
Ok(data) => break data,
|
||||
Err(e) => {
|
||||
if let Error::NoDatabaseConfigError { location } = &e {
|
||||
warn!(?location, "{}", e);
|
||||
return Ok(None);
|
||||
}
|
||||
error!(
|
||||
"error getting database config {:?} from object store: {}",
|
||||
path, e
|
||||
);
|
||||
tokio::time::sleep(tokio::time::Duration::from_secs(STORE_ERROR_PAUSE_SECONDS))
|
||||
.await;
|
||||
/// Try to make some progress in the DB init.
|
||||
async fn try_advance_database_init_process(
|
||||
handle: &mut DatabaseHandle<'_>,
|
||||
root: &Path,
|
||||
wipe_on_error: bool,
|
||||
) -> Result<InitProgress> {
|
||||
match handle.state_code() {
|
||||
DatabaseStateCode::Known => {
|
||||
// known => load DB rules
|
||||
let path = object_store_path_for_database_config(root, &handle.db_name());
|
||||
match load_database_rules(handle.object_store(), path).await? {
|
||||
Some(rules) => {
|
||||
handle
|
||||
.advance_rules_loaded(rules)
|
||||
.map_err(Box::new)
|
||||
.context(InitDbError)?;
|
||||
|
||||
// there is still more work to do for this DB
|
||||
Ok(InitProgress::Unfinished)
|
||||
}
|
||||
None => {
|
||||
// no rules file present, advise to forget this DB
|
||||
Ok(InitProgress::Forget)
|
||||
}
|
||||
}
|
||||
};
|
||||
let rules = decode_database_rules(serialized_rules.freeze())
|
||||
.context(ErrorDeserializingRulesProtobuf)?;
|
||||
|
||||
Ok(Some(rules))
|
||||
}
|
||||
|
||||
pub(crate) async fn wipe_preserved_catalog_and_maybe_recover(
|
||||
&self,
|
||||
store: Arc<ObjectStore>,
|
||||
config: Arc<Config>,
|
||||
server_id: ServerId,
|
||||
db_name: DatabaseName<'static>,
|
||||
) -> Result<()> {
|
||||
if config.has_uninitialized_database(&db_name) {
|
||||
let mut handle = config
|
||||
.recover_db(db_name.clone())
|
||||
.map_err(|e| Arc::new(e) as _)
|
||||
.context(RecoverDbError)?;
|
||||
|
||||
if !((handle.state_code() == DatabaseStateCode::Known)
|
||||
|| (handle.state_code() == DatabaseStateCode::RulesLoaded))
|
||||
{
|
||||
// cannot wipe because init state is already too far
|
||||
return Err(Error::DbPartiallyInitialized {
|
||||
db_name: db_name.to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
// wipe while holding handle so no other init/wipe process can interact with the catalog
|
||||
PreservedCatalog::wipe(&store, handle.server_id(), &db_name)
|
||||
.await
|
||||
.map_err(Box::new)
|
||||
.context(PreservedCatalogWipeError)?;
|
||||
|
||||
let root = self.root_path(&store)?;
|
||||
let wipe_on_error = self.wipe_on_error.load(Ordering::Relaxed);
|
||||
match Self::try_advance_database_init_process_until_complete(
|
||||
&mut handle,
|
||||
&root,
|
||||
}
|
||||
DatabaseStateCode::RulesLoaded => {
|
||||
// rules already loaded => continue with loading preserved catalog
|
||||
let (preserved_catalog, catalog) = load_or_create_preserved_catalog(
|
||||
&handle.db_name(),
|
||||
handle.object_store(),
|
||||
handle.server_id(),
|
||||
handle.metrics_registry(),
|
||||
wipe_on_error,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(_) => {
|
||||
// yeah, recovered DB
|
||||
handle.commit();
|
||||
.map_err(|e| Box::new(e) as _)
|
||||
.context(CatalogLoadError)?;
|
||||
|
||||
let mut guard = self.errors_databases.lock();
|
||||
guard.remove(&db_name.to_string());
|
||||
|
||||
info!(%db_name, "wiped preserved catalog of registered database and recovered");
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
// could not recover, but still keep new result
|
||||
handle.commit();
|
||||
|
||||
let mut guard = self.errors_databases.lock();
|
||||
let e = Arc::new(e);
|
||||
guard.insert(db_name.to_string(), Arc::clone(&e));
|
||||
|
||||
warn!(%db_name, %e, "wiped preserved catalog of registered database but still cannot recover");
|
||||
Err(Error::RecoverDbError { source: e })
|
||||
}
|
||||
}
|
||||
} else {
|
||||
let handle = config
|
||||
.block_db(db_name.clone())
|
||||
.map_err(|e| Arc::new(e) as _)
|
||||
.context(RecoverDbError)?;
|
||||
|
||||
PreservedCatalog::wipe(&store, server_id, &db_name)
|
||||
let rules = handle
|
||||
.rules()
|
||||
.expect("in this state rules should be loaded");
|
||||
let write_buffer = WriteBufferConfig::new(handle.server_id(), &rules)
|
||||
.await
|
||||
.context(CreateWriteBuffer {
|
||||
config: rules.write_buffer_connection.clone(),
|
||||
})?;
|
||||
info!(write_buffer_enabled=?write_buffer.is_some(), db_name=rules.db_name(), "write buffer config");
|
||||
|
||||
handle
|
||||
.advance_replay(preserved_catalog, catalog, write_buffer)
|
||||
.map_err(Box::new)
|
||||
.context(PreservedCatalogWipeError)?;
|
||||
.context(InitDbError)?;
|
||||
|
||||
drop(handle);
|
||||
|
||||
info!(%db_name, "wiped preserved catalog of non-registered database");
|
||||
Ok(())
|
||||
// there is still more work to do for this DB
|
||||
Ok(InitProgress::Unfinished)
|
||||
}
|
||||
}
|
||||
DatabaseStateCode::Replay => {
|
||||
let db = handle
|
||||
.db_any_state()
|
||||
.expect("DB should be available in this state");
|
||||
db.perform_replay().await;
|
||||
|
||||
/// Try to make as much progress as possible with DB init.
///
/// Returns an error if there was an error along the way (in which case the handle should still be committed to save
/// the intermediate result). Returns `Ok(true)` if DB init is finished and `Ok(false)` if the DB can be forgotten
/// (e.g. because no rules file is present).
|
||||
async fn try_advance_database_init_process_until_complete(
|
||||
handle: &mut DatabaseHandle<'_>,
|
||||
root: &Path,
|
||||
wipe_on_error: bool,
|
||||
) -> Result<bool> {
|
||||
loop {
|
||||
match Self::try_advance_database_init_process(handle, root, wipe_on_error).await? {
|
||||
InitProgress::Unfinished => {}
|
||||
InitProgress::Done => {
|
||||
return Ok(true);
|
||||
}
|
||||
InitProgress::Forget => {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
handle
|
||||
.advance_init()
|
||||
.map_err(Box::new)
|
||||
.context(InitDbError)?;
|
||||
|
||||
// there is still more work to do for this DB
|
||||
Ok(InitProgress::Unfinished)
|
||||
}
|
||||
}
|
||||
|
||||
/// Try to make some progress in the DB init.
|
||||
async fn try_advance_database_init_process(
|
||||
handle: &mut DatabaseHandle<'_>,
|
||||
root: &Path,
|
||||
wipe_on_error: bool,
|
||||
) -> Result<InitProgress> {
|
||||
match handle.state_code() {
|
||||
DatabaseStateCode::Known => {
|
||||
// known => load DB rules
|
||||
let path = object_store_path_for_database_config(root, &handle.db_name());
|
||||
match Self::load_database_rules(handle.object_store(), path).await? {
|
||||
Some(rules) => {
|
||||
handle
|
||||
.advance_rules_loaded(rules)
|
||||
.map_err(Box::new)
|
||||
.context(InitDbError)?;
|
||||
|
||||
// there is still more work to do for this DB
|
||||
Ok(InitProgress::Unfinished)
|
||||
}
|
||||
None => {
|
||||
// no rules file present, advise to forget this DB
|
||||
Ok(InitProgress::Forget)
|
||||
}
|
||||
}
|
||||
}
|
||||
DatabaseStateCode::RulesLoaded => {
|
||||
// rules already loaded => continue with loading preserved catalog
|
||||
let (preserved_catalog, catalog) = load_or_create_preserved_catalog(
|
||||
&handle.db_name(),
|
||||
handle.object_store(),
|
||||
handle.server_id(),
|
||||
handle.metrics_registry(),
|
||||
wipe_on_error,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| Box::new(e) as _)
|
||||
.context(CatalogLoadError)?;
|
||||
|
||||
let rules = handle
|
||||
.rules()
|
||||
.expect("in this state rules should be loaded");
|
||||
let write_buffer = WriteBufferConfig::new(handle.server_id(), &rules)
|
||||
.await
|
||||
.context(CreateWriteBuffer {
|
||||
config: rules.write_buffer_connection.clone(),
|
||||
})?;
|
||||
info!(write_buffer_enabled=?write_buffer.is_some(), db_name=rules.db_name(), "write buffer config");
|
||||
|
||||
handle
|
||||
.advance_replay(preserved_catalog, catalog, write_buffer)
|
||||
.map_err(Box::new)
|
||||
.context(InitDbError)?;
|
||||
|
||||
// there is still more work to do for this DB
|
||||
Ok(InitProgress::Unfinished)
|
||||
}
|
||||
DatabaseStateCode::Replay => {
|
||||
let db = handle
|
||||
.db_any_state()
|
||||
.expect("DB should be available in this state");
|
||||
db.perform_replay().await;
|
||||
|
||||
handle
|
||||
.advance_init()
|
||||
.map_err(Box::new)
|
||||
.context(InitDbError)?;
|
||||
|
||||
// there is still more work to do for this DB
|
||||
Ok(InitProgress::Unfinished)
|
||||
}
|
||||
DatabaseStateCode::Initialized => {
|
||||
// database fully initialized => nothing to do
|
||||
Ok(InitProgress::Done)
|
||||
}
|
||||
DatabaseStateCode::Initialized => {
|
||||
// database fully initialized => nothing to do
|
||||
Ok(InitProgress::Done)
|
||||
}
|
||||
}
|
||||
}
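Both copies of `try_advance_database_init_process_until_complete` in this hunk drive the same loop: keep asking for one more step until the state machine reports that the database is done or should be forgotten. Reduced to its control flow, with a stand-in `InitProgress` enum and a generic step closure, the driver is:

    /// Stand-in for the InitProgress enum used by the init state machine.
    enum InitProgress {
        Unfinished,
        Done,
        Forget,
    }

    /// Drive a step function until it reports completion. Ok(true) keeps the
    /// database, Ok(false) means it should be forgotten, matching the contract
    /// documented on try_advance_database_init_process_until_complete.
    fn advance_until_complete(
        mut step: impl FnMut() -> Result<InitProgress, String>,
    ) -> Result<bool, String> {
        loop {
            match step()? {
                InitProgress::Unfinished => {}
                InitProgress::Done => return Ok(true),
                InitProgress::Forget => return Ok(false),
            }
        }
    }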
@ -74,9 +74,8 @@ use std::sync::Arc;
|
|||
use async_trait::async_trait;
|
||||
use bytes::BytesMut;
|
||||
use db::load::create_preserved_catalog;
|
||||
use init::InitStatus;
|
||||
use observability_deps::tracing::{debug, info, warn};
|
||||
use parking_lot::Mutex;
|
||||
use observability_deps::tracing::{debug, error, info, warn};
|
||||
use parking_lot::{Mutex, RwLockUpgradableReadGuard};
|
||||
use snafu::{OptionExt, ResultExt, Snafu};
|
||||
|
||||
use data_types::{
|
||||
|
@ -93,6 +92,7 @@ use generated_types::influxdata::transfer::column::v1 as pb;
|
|||
use influxdb_line_protocol::ParsedLine;
|
||||
use metrics::{KeyValue, MetricObserverBuilder, MetricRegistry};
|
||||
use object_store::{ObjectStore, ObjectStoreApi};
|
||||
use parking_lot::RwLock;
|
||||
use query::{exec::Executor, DatabaseStore};
|
||||
use tracker::{TaskId, TaskRegistration, TaskRegistryWithHistory, TaskTracker, TrackedFutureExt};
|
||||
use write_buffer::config::WriteBufferConfig;
|
||||
|
@ -220,11 +220,11 @@ pub enum Error {
|
|||
#[snafu(display("cannot create preserved catalog: {}", source))]
|
||||
CannotCreatePreservedCatalog { source: DatabaseError },
|
||||
|
||||
#[snafu(display("cannot set id: {}", source))]
|
||||
SetIdError { source: crate::init::Error },
|
||||
#[snafu(display("id already set"))]
|
||||
IdAlreadySet,
|
||||
|
||||
#[snafu(display("cannot get id: {}", source))]
|
||||
GetIdError { source: crate::init::Error },
|
||||
#[snafu(display("id not set"))]
|
||||
IdNotSet,
|
||||
|
||||
#[snafu(display(
|
||||
"cannot create write buffer with config: {:?}, error: {}",
|
||||
|
@ -297,6 +297,8 @@ pub struct ServerConfig {
|
|||
metric_registry: Arc<MetricRegistry>,
|
||||
|
||||
remote_template: Option<RemoteTemplate>,
|
||||
|
||||
wipe_catalog_on_error: bool,
|
||||
}
|
||||
|
||||
impl ServerConfig {
|
||||
|
@ -311,6 +313,7 @@ impl ServerConfig {
|
|||
object_store,
|
||||
metric_registry,
|
||||
remote_template,
|
||||
wipe_catalog_on_error: true,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -414,7 +417,6 @@ impl ServerMetrics {
|
|||
/// of these structs, which keeps track of all replication and query rules.
|
||||
#[derive(Debug)]
|
||||
pub struct Server<M: ConnectionManager> {
|
||||
config: Arc<Config>,
|
||||
connection_manager: Arc<M>,
|
||||
pub store: Arc<ObjectStore>,
|
||||
exec: Arc<Executor>,
|
||||
|
@ -426,7 +428,50 @@ pub struct Server<M: ConnectionManager> {
|
|||
/// and populates the endpoint with this data.
|
||||
pub registry: Arc<metrics::MetricRegistry>,
|
||||
|
||||
init_status: Arc<InitStatus>,
|
||||
/// The state machine for server startup
|
||||
stage: Arc<RwLock<ServerStage>>,
|
||||
}
|
||||
|
||||
/// The stage of the server in the startup process
|
||||
///
|
||||
/// The progression is linear: Startup -> InitReady -> Initializing -> Initialized,
|
||||
/// with the sole exception that on failure Initializing -> InitReady
|
||||
///
|
||||
/// Errors encountered on server init will be retried; however, errors encountered
|
||||
/// during database init will require operator intervention
|
||||
///
|
||||
/// These errors are exposed via `Server::error_generic` and `Server::error_database` respectively
|
||||
///
|
||||
/// They do not impact the state machine's progression, but instead are exposed to the
|
||||
/// gRPC management API to allow an operator to assess the state of the system
|
||||
#[derive(Debug)]
|
||||
enum ServerStage {
|
||||
/// Server has started but doesn't have a server id yet
|
||||
Startup {
|
||||
remote_template: Option<RemoteTemplate>,
|
||||
wipe_catalog_on_error: bool,
|
||||
},
|
||||
|
||||
/// Server can be initialized
|
||||
InitReady {
|
||||
wipe_catalog_on_error: bool,
|
||||
config: Arc<Config>,
|
||||
last_error: Option<Arc<init::Error>>,
|
||||
},
|
||||
|
||||
/// Server has a server id, has started loading
|
||||
Initializing {
|
||||
wipe_catalog_on_error: bool,
|
||||
config: Arc<Config>,
|
||||
last_error: Option<Arc<init::Error>>,
|
||||
},
|
||||
|
||||
/// Server has finished initializing, possibly with errors
|
||||
Initialized {
|
||||
config: Arc<Config>,
|
||||
/// Errors that occurred during some DB init.
|
||||
database_errors: HashMap<String, Arc<init::Error>>,
|
||||
},
|
||||
}
|
||||
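Editor's aside (not part of the diff): a minimal, self-contained sketch of the linear progression described in the comment above, with the per-stage payloads elided; the `Stage` enum and `advance` helper here are hypothetical names used only for illustration.

// Hypothetical sketch of the ServerStage progression (payloads elided).
#[derive(Debug, Clone, Copy, PartialEq)]
enum Stage {
    Startup,
    InitReady,
    Initializing,
    Initialized,
}

fn advance(stage: Stage, init_failed: bool) -> Stage {
    match stage {
        Stage::Startup => Stage::InitReady,      // server id was set
        Stage::InitReady => Stage::Initializing, // init has started
        // the sole backwards edge: a failed init returns to InitReady for retry
        Stage::Initializing if init_failed => Stage::InitReady,
        Stage::Initializing => Stage::Initialized,
        Stage::Initialized => Stage::Initialized, // terminal state
    }
}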
|
||||
#[derive(Debug)]
|
||||
|
@ -454,22 +499,23 @@ where
|
|||
// to test the metrics provide a different registry to the `ServerConfig`.
|
||||
metric_registry,
|
||||
remote_template,
|
||||
wipe_catalog_on_error,
|
||||
} = config;
|
||||
|
||||
let num_worker_threads = num_worker_threads.unwrap_or_else(num_cpus::get);
|
||||
let exec = Arc::new(Executor::new(num_worker_threads));
|
||||
|
||||
Self {
|
||||
config: Arc::new(Config::new(
|
||||
Arc::clone(&jobs),
|
||||
Arc::clone(&metric_registry),
|
||||
remote_template,
|
||||
)),
|
||||
store: object_store,
|
||||
connection_manager: Arc::new(connection_manager),
|
||||
exec: Arc::new(Executor::new(num_worker_threads)),
|
||||
exec,
|
||||
jobs,
|
||||
metrics: Arc::new(ServerMetrics::new(Arc::clone(&metric_registry))),
|
||||
registry: Arc::clone(&metric_registry),
|
||||
init_status: Arc::new(InitStatus::new()),
|
||||
stage: Arc::new(RwLock::new(ServerStage::Startup {
|
||||
remote_template,
|
||||
wipe_catalog_on_error,
|
||||
})),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -478,68 +524,112 @@ where
|
|||
///
|
||||
/// A valid server ID must be non-zero.
|
||||
pub fn set_id(&self, id: ServerId) -> Result<()> {
|
||||
self.init_status.server_id.set(id).context(SetIdError)
|
||||
}
|
||||
let mut stage = self.stage.write();
|
||||
match &mut *stage {
|
||||
ServerStage::Startup {
|
||||
remote_template,
|
||||
wipe_catalog_on_error,
|
||||
} => {
|
||||
let remote_template = remote_template.take();
|
||||
|
||||
/// Returns the current server ID, or an error if not yet set.
|
||||
pub fn require_id(&self) -> Result<ServerId> {
|
||||
self.init_status.server_id.get().context(GetIdError)
|
||||
*stage = ServerStage::InitReady {
|
||||
wipe_catalog_on_error: *wipe_catalog_on_error,
|
||||
config: Arc::new(Config::new(
|
||||
Arc::clone(&self.jobs),
|
||||
Arc::clone(&self.store),
|
||||
Arc::clone(&self.exec),
|
||||
id,
|
||||
Arc::clone(&self.registry),
|
||||
remote_template,
|
||||
)),
|
||||
last_error: None,
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
_ => Err(Error::IdAlreadySet),
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if server is loaded. Databases are loaded and server is ready to read/write.
|
||||
pub fn initialized(&self) -> bool {
|
||||
self.init_status.initialized()
|
||||
matches!(&*self.stage.read(), ServerStage::Initialized { .. })
|
||||
}
|
||||
|
||||
/// Require that server is loaded. Databases are loaded and server is ready to read/write.
|
||||
fn require_initialized(&self) -> Result<Arc<Config>> {
|
||||
match &*self.stage.read() {
|
||||
ServerStage::Startup { .. } => Err(Error::IdNotSet),
|
||||
ServerStage::InitReady { config, .. } | ServerStage::Initializing { config, .. } => {
|
||||
Err(Error::ServerNotInitialized {
|
||||
server_id: config.server_id(),
|
||||
})
|
||||
}
|
||||
ServerStage::Initialized { config, .. } => Ok(Arc::clone(&config)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the config for this server if server id has been set
|
||||
fn config(&self) -> Result<Arc<Config>> {
|
||||
let stage = self.stage.read();
|
||||
match &*stage {
|
||||
ServerStage::Startup { .. } => Err(Error::IdNotSet),
|
||||
ServerStage::InitReady { config, .. }
|
||||
| ServerStage::Initializing { config, .. }
|
||||
| ServerStage::Initialized { config, .. } => Ok(Arc::clone(&config)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the server id for this server if set
|
||||
pub fn server_id(&self) -> Option<ServerId> {
|
||||
self.config().map(|x| x.server_id()).ok()
|
||||
}
|
||||
|
||||
/// Error occurred during generic server init (e.g. listing store content).
|
||||
pub fn error_generic(&self) -> Option<Arc<crate::init::Error>> {
|
||||
self.init_status.error_generic()
|
||||
let stage = self.stage.read();
|
||||
match &*stage {
|
||||
ServerStage::InitReady { last_error, .. } => last_error.clone(),
|
||||
ServerStage::Initializing { last_error, .. } => last_error.clone(),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// List all databases with errors in sorted order.
|
||||
pub fn databases_with_errors(&self) -> Vec<String> {
|
||||
self.init_status.databases_with_errors()
|
||||
let stage = self.stage.read();
|
||||
match &*stage {
|
||||
ServerStage::Initialized {
|
||||
database_errors, ..
|
||||
} => database_errors.keys().cloned().collect(),
|
||||
_ => Default::default(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Error that occurred during initialization of a specific database.
|
||||
pub fn error_database(&self, db_name: &str) -> Option<Arc<crate::init::Error>> {
|
||||
self.init_status.error_database(db_name)
|
||||
let stage = self.stage.read();
|
||||
match &*stage {
|
||||
ServerStage::Initialized {
|
||||
database_errors, ..
|
||||
} => database_errors.get(db_name).cloned(),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Current database init state.
|
||||
pub fn database_state(&self, name: &str) -> Option<DatabaseStateCode> {
|
||||
if let Ok(name) = DatabaseName::new(name) {
|
||||
self.config.db_state(&name)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Require that server is loaded. Databases are loaded and server is ready to read/write.
|
||||
fn require_initialized(&self) -> Result<ServerId> {
|
||||
// since a server ID is the pre-requirement for init, check this first
|
||||
let server_id = self.require_id()?;
|
||||
|
||||
// ordering here isn't that important since this method is not used to check-and-modify the flag
|
||||
if self.initialized() {
|
||||
Ok(server_id)
|
||||
} else {
|
||||
Err(Error::ServerNotInitialized { server_id })
|
||||
}
|
||||
let db_name = DatabaseName::new(name).ok()?;
|
||||
let config = self.config().ok()?;
|
||||
config.db_state(&db_name)
|
||||
}
|
||||
|
||||
/// Tells the server the set of rules for a database.
|
||||
pub async fn create_database(&self, rules: DatabaseRules) -> Result<()> {
|
||||
// Return an error if this server is not yet ready
|
||||
let server_id = self.require_initialized()?;
|
||||
let config = self.require_initialized()?;
|
||||
|
||||
// Reserve name before expensive IO (e.g. loading the preserved catalog)
|
||||
let mut db_reservation = self.config.create_db(
|
||||
Arc::clone(&self.store),
|
||||
Arc::clone(&self.exec),
|
||||
server_id,
|
||||
rules.name.clone(),
|
||||
)?;
|
||||
let mut db_reservation = config.create_db(rules.name.clone())?;
|
||||
|
||||
// register rules
|
||||
db_reservation.advance_rules_loaded(rules.clone())?;
|
||||
|
@ -548,14 +638,14 @@ where
|
|||
let (preserved_catalog, catalog) = create_preserved_catalog(
|
||||
rules.db_name(),
|
||||
Arc::clone(&self.store),
|
||||
server_id,
|
||||
self.config.metrics_registry(),
|
||||
config.server_id(),
|
||||
config.metrics_registry(),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| Box::new(e) as _)
|
||||
.context(CannotCreatePreservedCatalog)?;
|
||||
|
||||
let write_buffer = WriteBufferConfig::new(server_id, &rules)
|
||||
let write_buffer = WriteBufferConfig::new(config.server_id(), &rules)
|
||||
.await
|
||||
.map_err(|e| Error::CreatingWriteBuffer {
|
||||
config: rules.write_buffer_connection.clone(),
|
||||
|
@ -575,13 +665,8 @@ where
|
|||
}
|
||||
|
||||
pub async fn persist_database_rules<'a>(&self, rules: DatabaseRules) -> Result<()> {
|
||||
let location = object_store_path_for_database_config(
|
||||
&self
|
||||
.init_status
|
||||
.root_path(&self.store)
|
||||
.context(GetIdError)?,
|
||||
&rules.name,
|
||||
);
|
||||
let config = self.config()?;
|
||||
let location = object_store_path_for_database_config(&config.root_path(), &rules.name);
|
||||
|
||||
let mut data = BytesMut::new();
|
||||
encode_database_rules(rules, &mut data).context(ErrorSerializingRulesProtobuf)?;
|
||||
|
@ -604,15 +689,62 @@ where
|
|||
/// object store. Any databases in the config already won't be
|
||||
/// replaced.
|
||||
///
|
||||
/// This requires the serverID to be set. It will be a no-op if the configs are already loaded and the server is ready.
|
||||
/// This requires the serverID to be set.
|
||||
///
|
||||
/// It will be a no-op if the configs are already loaded and the server is ready.
|
||||
pub async fn maybe_initialize_server(&self) {
|
||||
self.init_status
|
||||
.maybe_initialize_server(
|
||||
Arc::clone(&self.store),
|
||||
Arc::clone(&self.config),
|
||||
Arc::clone(&self.exec),
|
||||
)
|
||||
.await;
|
||||
// Explicit scope to help async generator
|
||||
let (wipe_catalog_on_error, config) = {
|
||||
let state = self.stage.upgradable_read();
|
||||
match &*state {
|
||||
ServerStage::InitReady {
|
||||
wipe_catalog_on_error,
|
||||
config,
|
||||
last_error,
|
||||
} => {
|
||||
let config = Arc::clone(config);
|
||||
let last_error = last_error.clone();
|
||||
let wipe_catalog_on_error = *wipe_catalog_on_error;
|
||||
|
||||
// Mark the server as initializing and drop lock
|
||||
|
||||
let mut state = RwLockUpgradableReadGuard::upgrade(state);
|
||||
*state = ServerStage::Initializing {
|
||||
config: Arc::clone(&config),
|
||||
wipe_catalog_on_error,
|
||||
last_error,
|
||||
};
|
||||
(wipe_catalog_on_error, config)
|
||||
}
|
||||
_ => return,
|
||||
}
|
||||
};
|
||||
|
||||
let init_result = init::initialize_server(Arc::clone(&config), wipe_catalog_on_error).await;
|
||||
let new_stage = match init_result {
|
||||
// Success -> move to next stage
|
||||
Ok(results) => {
|
||||
info!(server_id=%config.server_id(), "server initialized");
|
||||
ServerStage::Initialized {
|
||||
config,
|
||||
database_errors: results
|
||||
.into_iter()
|
||||
.filter_map(|(name, res)| Some((name.to_string(), Arc::new(res.err()?))))
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
// Error -> return to InitReady
|
||||
Err(err) => {
|
||||
error!(%err, "error during server init");
|
||||
ServerStage::InitReady {
|
||||
wipe_catalog_on_error,
|
||||
config,
|
||||
last_error: Some(Arc::new(err)),
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
*self.stage.write() = new_stage;
|
||||
}
|
||||
|
||||
pub async fn write_pb(&self, database_batch: pb::DatabaseBatch) -> Result<()> {
|
||||
|
@ -640,11 +772,10 @@ where
|
|||
default_time: i64,
|
||||
) -> Result<()> {
|
||||
// Return an error if this server is not yet ready
|
||||
self.require_initialized()?;
|
||||
let config = self.require_initialized()?;
|
||||
|
||||
let db_name = DatabaseName::new(db_name).context(InvalidDatabaseName)?;
|
||||
let db = self
|
||||
.config
|
||||
let db = config
|
||||
.db_initialized(&db_name)
|
||||
.context(DatabaseNotFound { db_name: &*db_name })?;
|
||||
|
||||
|
@ -744,9 +875,12 @@ where
|
|||
node_group: &[ServerId],
|
||||
entry: Entry,
|
||||
) -> Result<()> {
|
||||
// Return an error if this server is not yet ready
|
||||
let config = self.config()?;
|
||||
|
||||
let addrs: Vec<_> = node_group
|
||||
.iter()
|
||||
.filter_map(|&node| self.config.resolve_remote(node))
|
||||
.filter_map(|&node| config.resolve_remote(node))
|
||||
.collect();
|
||||
if addrs.is_empty() {
|
||||
return NoRemoteConfigured { node_group }.fail();
|
||||
|
@ -775,11 +909,10 @@ where
|
|||
|
||||
pub async fn write_entry(&self, db_name: &str, entry_bytes: Vec<u8>) -> Result<()> {
|
||||
// Return an error if this server is not yet ready
|
||||
self.require_initialized()?;
|
||||
let config = self.require_initialized()?;
|
||||
|
||||
let db_name = DatabaseName::new(db_name).context(InvalidDatabaseName)?;
|
||||
let db = self
|
||||
.config
|
||||
let db = config
|
||||
.db_initialized(&db_name)
|
||||
.context(DatabaseNotFound { db_name: &*db_name })?;
|
||||
|
||||
|
@ -825,11 +958,11 @@ where
|
|||
}
|
||||
|
||||
pub fn db(&self, name: &DatabaseName<'_>) -> Option<Arc<Db>> {
|
||||
self.config.db_initialized(name)
|
||||
self.config().ok()?.db_initialized(name)
|
||||
}
|
||||
|
||||
pub fn db_rules(&self, name: &DatabaseName<'_>) -> Option<Arc<DatabaseRules>> {
|
||||
self.config.db_initialized(name).map(|d| d.rules())
|
||||
self.db(name).map(|d| d.rules())
|
||||
}
|
||||
|
||||
// Update database rules and save on success.
|
||||
|
@ -841,8 +974,8 @@ where
|
|||
where
|
||||
F: FnOnce(DatabaseRules) -> Result<DatabaseRules, E> + Send,
|
||||
{
|
||||
let rules = self
|
||||
.config
|
||||
let config = self.config()?;
|
||||
let rules = config
|
||||
.update_db_rules(db_name, update)
|
||||
.map_err(|e| match e {
|
||||
crate::config::UpdateError::Closure(e) => UpdateError::Closure(e),
|
||||
|
@ -854,16 +987,23 @@ where
|
|||
Ok(rules)
|
||||
}
|
||||
|
||||
pub fn remotes_sorted(&self) -> Vec<(ServerId, String)> {
|
||||
self.config.remotes_sorted()
|
||||
pub fn remotes_sorted(&self) -> Result<Vec<(ServerId, String)>> {
|
||||
// TODO: Should these be on ConnectionManager and not Config
|
||||
let config = self.config()?;
|
||||
Ok(config.remotes_sorted())
|
||||
}
|
||||
|
||||
pub fn update_remote(&self, id: ServerId, addr: GRpcConnectionString) {
|
||||
self.config.update_remote(id, addr)
|
||||
pub fn update_remote(&self, id: ServerId, addr: GRpcConnectionString) -> Result<()> {
|
||||
// TODO: Should these be on ConnectionManager and not Config
|
||||
let config = self.config()?;
|
||||
config.update_remote(id, addr);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn delete_remote(&self, id: ServerId) -> Option<GRpcConnectionString> {
|
||||
self.config.delete_remote(id)
|
||||
pub fn delete_remote(&self, id: ServerId) -> Result<Option<GRpcConnectionString>> {
|
||||
// TODO: Should these be on ConnectionManager and not Config
|
||||
let config = self.config()?;
|
||||
Ok(config.delete_remote(id))
|
||||
}
|
||||
|
||||
pub fn spawn_dummy_job(&self, nanos: Vec<u64>) -> TaskTracker<Job> {
|
||||
|
@ -893,14 +1033,15 @@ where
|
|||
partition_key: impl Into<String>,
|
||||
chunk_id: u32,
|
||||
) -> Result<TaskTracker<Job>> {
|
||||
let config = self.require_initialized()?;
|
||||
|
||||
let db_name = db_name.to_string();
|
||||
let name = DatabaseName::new(&db_name).context(InvalidDatabaseName)?;
|
||||
|
||||
let partition_key = partition_key.into();
|
||||
let table_name = table_name.into();
|
||||
|
||||
let db = self
|
||||
.config
|
||||
let db = config
|
||||
.db_initialized(&name)
|
||||
.context(DatabaseNotFound { db_name: &db_name })?;
|
||||
|
||||
|
@ -921,25 +1062,62 @@ where
|
|||
/// DB jobs and this command.
|
||||
pub fn wipe_preserved_catalog(
|
||||
&self,
|
||||
db_name: DatabaseName<'static>,
|
||||
db_name: &DatabaseName<'static>,
|
||||
) -> Result<TaskTracker<Job>> {
|
||||
if self.config.db_initialized(&db_name).is_some() {
|
||||
return Err(Error::DatabaseAlreadyExists {
|
||||
db_name: db_name.to_string(),
|
||||
});
|
||||
}
|
||||
// Can only wipe the catalog of a database that failed to initialize
|
||||
let config = match &*self.stage.read() {
|
||||
ServerStage::Initialized {
|
||||
config,
|
||||
database_errors,
|
||||
} => {
|
||||
if config.db_initialized(db_name).is_some() {
|
||||
return Err(Error::DatabaseAlreadyExists {
|
||||
db_name: db_name.to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
if !database_errors.contains_key(db_name.as_str()) {
|
||||
// TODO: Should this be an error? Some end-to-end tests assume it is non-fatal
|
||||
warn!(%db_name, "wiping database not present at startup");
|
||||
}
|
||||
Arc::clone(config)
|
||||
}
|
||||
ServerStage::Startup { .. } => return Err(Error::IdNotSet),
|
||||
ServerStage::Initializing { config, .. } | ServerStage::InitReady { config, .. } => {
|
||||
return Err(Error::ServerNotInitialized {
|
||||
server_id: config.server_id(),
|
||||
})
|
||||
}
|
||||
};
|
||||
|
||||
let (tracker, registration) = self.jobs.register(Job::WipePreservedCatalog {
|
||||
db_name: db_name.to_string(),
|
||||
});
|
||||
let object_store = Arc::clone(&self.store);
|
||||
let config = Arc::clone(&self.config);
|
||||
let server_id = self.require_id()?;
|
||||
let init_status = Arc::clone(&self.init_status);
|
||||
|
||||
let state = Arc::clone(&self.stage);
|
||||
let db_name = db_name.clone();
|
||||
|
||||
let task = async move {
|
||||
init_status
|
||||
.wipe_preserved_catalog_and_maybe_recover(object_store, config, server_id, db_name)
|
||||
.await
|
||||
let result = init::wipe_preserved_catalog_and_maybe_recover(config, &db_name).await;
|
||||
|
||||
match &mut *state.write() {
|
||||
ServerStage::Initialized {
|
||||
database_errors, ..
|
||||
} => match result {
|
||||
Ok(_) => {
|
||||
info!(%db_name, "wiped preserved catalog of registered database and recovered");
|
||||
database_errors.remove(db_name.as_str());
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(%db_name, %e, "wiped preserved catalog of registered database but still cannot recover");
|
||||
let e = Arc::new(e);
|
||||
database_errors.insert(db_name.to_string(), Arc::clone(&e));
|
||||
Err(e)
|
||||
}
|
||||
},
|
||||
_ => unreachable!("server cannot become uninitialized"),
|
||||
}
|
||||
};
|
||||
tokio::spawn(task.track(registration));
|
||||
|
||||
|
@ -973,7 +1151,9 @@ where
|
|||
}
|
||||
|
||||
info!("shutting down background workers");
|
||||
self.config.drain().await;
|
||||
if let Ok(config) = self.config() {
|
||||
config.drain().await;
|
||||
}
|
||||
|
||||
info!("draining tracker registry");
|
||||
|
||||
|
@ -999,11 +1179,15 @@ where
|
|||
type Error = Error;
|
||||
|
||||
fn db_names_sorted(&self) -> Vec<String> {
|
||||
self.config
|
||||
.db_names_sorted()
|
||||
.iter()
|
||||
.map(|i| i.clone().into())
|
||||
.collect()
|
||||
self.config()
|
||||
.map(|config| {
|
||||
config
|
||||
.db_names_sorted()
|
||||
.iter()
|
||||
.map(ToString::to_string)
|
||||
.collect()
|
||||
})
|
||||
.unwrap_or_default()
|
||||
}
|
||||
|
||||
fn db(&self, name: &str) -> Option<Arc<Self::Database>> {
|
||||
|
@ -1214,25 +1398,15 @@ mod tests {
|
|||
let manager = TestConnectionManager::new();
|
||||
let server = Server::new(manager, config());
|
||||
|
||||
let resp = server.require_id().unwrap_err();
|
||||
assert!(matches!(
|
||||
resp,
|
||||
Error::GetIdError {
|
||||
source: crate::init::Error::IdNotSet
|
||||
}
|
||||
));
|
||||
let resp = server.config().unwrap_err();
|
||||
assert!(matches!(resp, Error::IdNotSet));
|
||||
|
||||
let lines = parsed_lines("cpu foo=1 10");
|
||||
let resp = server
|
||||
.write_lines("foo", &lines, ARBITRARY_DEFAULT_TIME)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert!(matches!(
|
||||
resp,
|
||||
Error::GetIdError {
|
||||
source: crate::init::Error::IdNotSet
|
||||
}
|
||||
));
|
||||
assert!(matches!(resp, Error::IdNotSet));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
@ -1559,7 +1733,7 @@ mod tests {
|
|||
|
||||
let remote_ids = vec![bad_remote_id, good_remote_id_1, good_remote_id_2];
|
||||
let db = server.db(&db_name).unwrap();
|
||||
db.update_db_rules(|mut rules| {
|
||||
db.update_rules(|mut rules| {
|
||||
let shard_config = ShardConfig {
|
||||
hash_ring: Some(HashRing {
|
||||
shards: vec![TEST_SHARD_ID].into(),
|
||||
|
@ -1589,7 +1763,9 @@ mod tests {
|
|||
);
|
||||
|
||||
// one remote is configured but it's down and we'll get connection error
|
||||
server.update_remote(bad_remote_id, BAD_REMOTE_ADDR.into());
|
||||
server
|
||||
.update_remote(bad_remote_id, BAD_REMOTE_ADDR.into())
|
||||
.unwrap();
|
||||
let err = server
|
||||
.write_lines(&db_name, &lines, ARBITRARY_DEFAULT_TIME)
|
||||
.await
|
||||
|
@ -1606,8 +1782,12 @@ mod tests {
|
|||
|
||||
// We configure the address for the other remote, this time connection will succeed
|
||||
// despite the bad remote failing to connect.
|
||||
server.update_remote(good_remote_id_1, GOOD_REMOTE_ADDR_1.into());
|
||||
server.update_remote(good_remote_id_2, GOOD_REMOTE_ADDR_2.into());
|
||||
server
|
||||
.update_remote(good_remote_id_1, GOOD_REMOTE_ADDR_1.into())
|
||||
.unwrap();
|
||||
server
|
||||
.update_remote(good_remote_id_2, GOOD_REMOTE_ADDR_2.into())
|
||||
.unwrap();
|
||||
|
||||
// Remotes are tried in random order, so we need to repeat the test a few times to have a reasonable
|
||||
// probability both the remotes will get hit.
|
||||
|
@ -1796,7 +1976,7 @@ mod tests {
|
|||
let db_name = DatabaseName::new("foo").unwrap();
|
||||
let db = server.db(&db_name).unwrap();
|
||||
let rules = db
|
||||
.update_db_rules(|mut rules| {
|
||||
.update_rules(|mut rules| {
|
||||
rules.lifecycle_rules.buffer_size_hard =
|
||||
Some(std::num::NonZeroUsize::new(10).unwrap());
|
||||
Ok::<_, Infallible>(rules)
|
||||
|
@ -1844,12 +2024,7 @@ mod tests {
|
|||
let err = create_simple_database(&server, "bananas")
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert!(matches!(
|
||||
err,
|
||||
Error::GetIdError {
|
||||
source: crate::init::Error::IdNotSet
|
||||
}
|
||||
));
|
||||
assert!(matches!(err, Error::IdNotSet));
|
||||
|
||||
server.set_id(ServerId::try_from(1).unwrap()).unwrap();
|
||||
// do NOT call `server.maybe_load_database_configs` so DBs are not loaded and server is not ready
|
||||
|
@ -1873,7 +2048,7 @@ mod tests {
|
|||
|
||||
let t_0 = Instant::now();
|
||||
loop {
|
||||
if server.require_initialized().is_ok() {
|
||||
if server.config().is_ok() {
|
||||
break;
|
||||
}
|
||||
assert!(t_0.elapsed() < Duration::from_secs(10));
|
||||
|
@ -1916,9 +2091,12 @@ mod tests {
|
|||
create_simple_database(&server, "foo")
|
||||
.await
|
||||
.expect("failed to create database");
|
||||
let root = server.init_status.root_path(&store).unwrap();
|
||||
server.config.drain().await;
|
||||
|
||||
let config = server.require_initialized().unwrap();
|
||||
let root = config.root_path();
|
||||
config.drain().await;
|
||||
drop(server);
|
||||
drop(config);
|
||||
|
||||
// tamper store
|
||||
let path = object_store_path_for_database_config(&root, &DatabaseName::new("bar").unwrap());
|
||||
|
@ -2003,18 +2181,24 @@ mod tests {
|
|||
let server = Server::new(manager, config);
|
||||
server.set_id(server_id).unwrap();
|
||||
server.maybe_initialize_server().await;
|
||||
|
||||
create_simple_database(&server, db_name_existing.clone())
|
||||
.await
|
||||
.expect("failed to create database");
|
||||
|
||||
create_simple_database(&server, db_name_rules_broken.clone())
|
||||
.await
|
||||
.expect("failed to create database");
|
||||
|
||||
create_simple_database(&server, db_name_catalog_broken.clone())
|
||||
.await
|
||||
.expect("failed to create database");
|
||||
let root = server.init_status.root_path(&store).unwrap();
|
||||
server.config.drain().await;
|
||||
|
||||
let config = server.require_initialized().unwrap();
|
||||
let root = config.root_path();
|
||||
config.drain().await;
|
||||
drop(server);
|
||||
drop(config);
|
||||
|
||||
// tamper store to break one database
|
||||
let path = object_store_path_for_database_config(&root, &db_name_rules_broken);
|
||||
|
@ -2045,22 +2229,18 @@ mod tests {
|
|||
let store = Arc::try_unwrap(store).unwrap();
|
||||
store.get(&path).await.unwrap();
|
||||
let manager = TestConnectionManager::new();
|
||||
let config = config_with_store(store);
|
||||
let server = Server::new(manager, config);
|
||||
|
||||
// need to disable auto-wipe for this test
|
||||
server
|
||||
.init_status
|
||||
.wipe_on_error
|
||||
.store(false, std::sync::atomic::Ordering::Relaxed);
|
||||
let mut config = config_with_store(store);
|
||||
config.wipe_catalog_on_error = false;
|
||||
let server = Server::new(manager, config);
|
||||
|
||||
// cannot wipe if server ID is not set
|
||||
assert_eq!(
|
||||
server
|
||||
.wipe_preserved_catalog(db_name_non_existing.clone())
|
||||
.wipe_preserved_catalog(&db_name_non_existing)
|
||||
.unwrap_err()
|
||||
.to_string(),
|
||||
"cannot get id: unable to use server until id is set"
|
||||
"id not set"
|
||||
);
|
||||
|
||||
server.set_id(ServerId::try_from(1).unwrap()).unwrap();
|
||||
|
@ -2069,31 +2249,29 @@ mod tests {
|
|||
// 1. cannot wipe if DB exists
|
||||
assert_eq!(
|
||||
server
|
||||
.wipe_preserved_catalog(db_name_existing.clone())
|
||||
.wipe_preserved_catalog(&db_name_existing)
|
||||
.unwrap_err()
|
||||
.to_string(),
|
||||
"database already exists: db_existing"
|
||||
);
|
||||
assert!(PreservedCatalog::exists(
|
||||
&server.store,
|
||||
server.require_id().unwrap(),
|
||||
&db_name_existing.to_string()
|
||||
)
|
||||
.await
|
||||
.unwrap());
|
||||
assert!(
|
||||
PreservedCatalog::exists(&server.store, server_id, db_name_existing.as_str())
|
||||
.await
|
||||
.unwrap()
|
||||
);
|
||||
|
||||
// 2. wiping a non-existing DB just works, but won't bring DB into existence
|
||||
assert!(server.error_database(&db_name_non_existing).is_none());
|
||||
PreservedCatalog::new_empty::<TestCatalogState>(
|
||||
Arc::clone(&server.store),
|
||||
server.require_id().unwrap(),
|
||||
server_id,
|
||||
db_name_non_existing.to_string(),
|
||||
(),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let tracker = server
|
||||
.wipe_preserved_catalog(db_name_non_existing.clone())
|
||||
.wipe_preserved_catalog(&db_name_non_existing)
|
||||
.unwrap();
|
||||
let metadata = tracker.metadata();
|
||||
let expected_metadata = Job::WipePreservedCatalog {
|
||||
|
@ -2103,7 +2281,7 @@ mod tests {
|
|||
tracker.join().await;
|
||||
assert!(!PreservedCatalog::exists(
|
||||
&server.store,
|
||||
server.require_id().unwrap(),
|
||||
server_id,
|
||||
&db_name_non_existing.to_string()
|
||||
)
|
||||
.await
|
||||
|
@ -2114,7 +2292,7 @@ mod tests {
|
|||
// 3. wipe DB with broken rules file, this won't bring DB back to life
|
||||
assert!(server.error_database(&db_name_rules_broken).is_some());
|
||||
let tracker = server
|
||||
.wipe_preserved_catalog(db_name_rules_broken.clone())
|
||||
.wipe_preserved_catalog(&db_name_rules_broken)
|
||||
.unwrap();
|
||||
let metadata = tracker.metadata();
|
||||
let expected_metadata = Job::WipePreservedCatalog {
|
||||
|
@ -2124,7 +2302,7 @@ mod tests {
|
|||
tracker.join().await;
|
||||
assert!(!PreservedCatalog::exists(
|
||||
&server.store,
|
||||
server.require_id().unwrap(),
|
||||
server_id,
|
||||
&db_name_rules_broken.to_string()
|
||||
)
|
||||
.await
|
||||
|
@ -2135,7 +2313,7 @@ mod tests {
|
|||
// 4. wipe DB with broken catalog, this will bring the DB back to life
|
||||
assert!(server.error_database(&db_name_catalog_broken).is_some());
|
||||
let tracker = server
|
||||
.wipe_preserved_catalog(db_name_catalog_broken.clone())
|
||||
.wipe_preserved_catalog(&db_name_catalog_broken)
|
||||
.unwrap();
|
||||
let metadata = tracker.metadata();
|
||||
let expected_metadata = Job::WipePreservedCatalog {
|
||||
|
@ -2145,7 +2323,7 @@ mod tests {
|
|||
tracker.join().await;
|
||||
assert!(PreservedCatalog::exists(
|
||||
&server.store,
|
||||
server.require_id().unwrap(),
|
||||
server_id,
|
||||
&db_name_catalog_broken.to_string()
|
||||
)
|
||||
.await
|
||||
|
@ -2166,18 +2344,16 @@ mod tests {
|
|||
.unwrap();
|
||||
assert_eq!(
|
||||
server
|
||||
.wipe_preserved_catalog(db_name_created.clone())
|
||||
.wipe_preserved_catalog(&db_name_created)
|
||||
.unwrap_err()
|
||||
.to_string(),
|
||||
"database already exists: db_created"
|
||||
);
|
||||
assert!(PreservedCatalog::exists(
|
||||
&server.store,
|
||||
server.require_id().unwrap(),
|
||||
&db_name_created.to_string()
|
||||
)
|
||||
.await
|
||||
.unwrap());
|
||||
assert!(
|
||||
PreservedCatalog::exists(&server.store, server_id, &db_name_created.to_string())
|
||||
.await
|
||||
.unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
|
|
@ -119,6 +119,11 @@ struct Create {
|
|||
/// Maximum number of rows to buffer in a MUB chunk before compacting it
|
||||
#[structopt(long, default_value = "100000")]
|
||||
mub_row_threshold: u64,
|
||||
|
||||
/// Use up to this amount of space in bytes for caching Parquet files. A
|
||||
/// value of zero disables Parquet file caching.
|
||||
#[structopt(long, default_value = "0")]
|
||||
parquet_cache_limit: u64,
|
||||
}
|
||||
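Editor's aside (not part of the diff): on the CLI the limit is a plain `u64` where, per the help text above, `0` means caching is disabled, while the server-side lifecycle rules represent the limit as an optional non-zero value. A sketch of how the two conventions can map onto each other, assuming `Option<NonZeroU64>` on the receiving side; the helper name is hypothetical.

use std::num::NonZeroU64;

// Map the CLI's "0 means disabled" convention onto an optional limit.
fn parquet_cache_limit_from_cli(raw: u64) -> Option<NonZeroU64> {
    NonZeroU64::new(raw) // 0 -> None (caching disabled), otherwise Some(limit)
}

fn main() {
    assert_eq!(parquet_cache_limit_from_cli(0), None);
    assert!(parquet_cache_limit_from_cli(1_073_741_824).is_some()); // 1 GiB
}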
|
||||
/// Get list of databases
|
||||
|
@ -193,6 +198,7 @@ pub async fn command(url: String, config: Config) -> Result<()> {
|
|||
persist_row_threshold: command.persist_row_threshold,
|
||||
persist_age_threshold_seconds: command.persist_age_threshold_seconds,
|
||||
mub_row_threshold: command.mub_row_threshold,
|
||||
parquet_cache_limit: command.parquet_cache_limit,
|
||||
}),
|
||||
|
||||
// Default to hourly partitions
|
||||
|
|
|
@ -231,6 +231,14 @@ Possible values (case insensitive):
|
|||
default_value = "serving"
|
||||
)]
|
||||
pub initial_serving_state: ServingReadinessState,
|
||||
|
||||
/// Maximum size of HTTP requests.
|
||||
#[structopt(
|
||||
long = "--max-http-request-size",
|
||||
env = "INFLUXDB_IOX_MAX_HTTP_REQUEST_SIZE",
|
||||
default_value = "10485760" // 10 MiB
|
||||
)]
|
||||
pub max_http_request_size: usize,
|
||||
}
|
||||
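Editor's aside: the default value spelled out above is exactly 10 MiB; a trivial sanity check (the constant name is illustrative only).

// Sanity check that the structopt default above is exactly 10 MiB.
const DEFAULT_MAX_HTTP_REQUEST_SIZE: usize = 10 * 1024 * 1024; // = 10_485_760

fn main() {
    assert_eq!(DEFAULT_MAX_HTTP_REQUEST_SIZE, 10_485_760);
}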
|
||||
pub async fn command(config: Config) -> Result<()> {
|
||||
|
|
|
@ -195,7 +195,15 @@ pub async fn main(config: Config) -> Result<()> {
|
|||
let bind_addr = config.http_bind_address;
|
||||
let addr = AddrIncoming::bind(&bind_addr).context(StartListeningHttp { bind_addr })?;
|
||||
|
||||
let http_server = http::serve(addr, Arc::clone(&app_server), frontend_shutdown.clone()).fuse();
|
||||
let max_http_request_size = config.max_http_request_size;
|
||||
|
||||
let http_server = http::serve(
|
||||
addr,
|
||||
Arc::clone(&app_server),
|
||||
frontend_shutdown.clone(),
|
||||
max_http_request_size,
|
||||
)
|
||||
.fuse();
|
||||
info!(bind_address=?bind_addr, "HTTP server listening");
|
||||
|
||||
info!(git_hash, "InfluxDB IOx server ready");
|
||||
|
|
|
@ -342,12 +342,26 @@ impl ApplicationError {
|
|||
}
|
||||
}
|
||||
|
||||
const MAX_SIZE: usize = 10_485_760; // max write request size of 10MB
|
||||
|
||||
fn router<M>(server: Arc<AppServer<M>>) -> Router<Body, ApplicationError>
|
||||
struct Server<M>
|
||||
where
|
||||
M: ConnectionManager + Send + Sync + Debug + 'static,
|
||||
{
|
||||
app_server: Arc<AppServer<M>>,
|
||||
max_request_size: usize,
|
||||
}
|
||||
|
||||
fn router<M>(
|
||||
app_server: Arc<AppServer<M>>,
|
||||
max_request_size: usize,
|
||||
) -> Router<Body, ApplicationError>
|
||||
where
|
||||
M: ConnectionManager + Send + Sync + Debug + 'static,
|
||||
{
|
||||
let server = Server {
|
||||
app_server,
|
||||
max_request_size,
|
||||
};
|
||||
|
||||
// Create a router and specify the handlers.
|
||||
Router::builder()
|
||||
.data(server)
|
||||
|
@ -408,7 +422,7 @@ struct WriteInfo {
|
|||
|
||||
/// Parse the request's body into raw bytes, applying size limits and
|
||||
/// content encoding as needed.
|
||||
async fn parse_body(req: hyper::Request<Body>) -> Result<Bytes, ApplicationError> {
|
||||
async fn parse_body(req: hyper::Request<Body>, max_size: usize) -> Result<Bytes, ApplicationError> {
|
||||
// clippy says the const needs to be assigned to a local variable:
|
||||
// error: a `const` item with interior mutability should not be borrowed
|
||||
let header_name = CONTENT_ENCODING;
|
||||
|
@ -431,9 +445,9 @@ async fn parse_body(req: hyper::Request<Body>) -> Result<Bytes, ApplicationError
|
|||
while let Some(chunk) = payload.next().await {
|
||||
let chunk = chunk.context(ClientHangup)?;
|
||||
// limit max size of in-memory payload
|
||||
if (body.len() + chunk.len()) > MAX_SIZE {
|
||||
if (body.len() + chunk.len()) > max_size {
|
||||
return Err(ApplicationError::RequestSizeExceeded {
|
||||
max_body_size: MAX_SIZE,
|
||||
max_body_size: max_size,
|
||||
});
|
||||
}
|
||||
body.extend_from_slice(&chunk);
|
||||
|
@ -445,9 +459,9 @@ async fn parse_body(req: hyper::Request<Body>) -> Result<Bytes, ApplicationError
|
|||
use std::io::Read;
|
||||
let decoder = flate2::read::GzDecoder::new(&body[..]);
|
||||
|
||||
// Read at most MAX_SIZE bytes to prevent a decompression bomb based
|
||||
// Read at most max_size bytes to prevent a decompression bomb based
|
||||
// DoS.
|
||||
let mut decoder = decoder.take(MAX_SIZE as u64);
|
||||
let mut decoder = decoder.take(max_size as u64);
|
||||
let mut decoded_data = Vec::new();
|
||||
decoder
|
||||
.read_to_end(&mut decoded_data)
|
||||
|
@ -464,7 +478,12 @@ where
|
|||
M: ConnectionManager + Send + Sync + Debug + 'static,
|
||||
{
|
||||
let path = req.uri().path().to_string();
|
||||
let server = Arc::clone(&req.data::<Arc<AppServer<M>>>().expect("server state"));
|
||||
let Server {
|
||||
app_server: server,
|
||||
max_request_size,
|
||||
} = req.data::<Server<M>>().expect("server state");
|
||||
let max_request_size = *max_request_size;
|
||||
let server = Arc::clone(&server);
|
||||
|
||||
// TODO(edd): figure out best way of catching all errors in this observation.
|
||||
let obs = server.metrics.http_requests.observation(); // instrument request
|
||||
|
@ -481,7 +500,7 @@ where
|
|||
let db_name = org_and_bucket_to_database(&write_info.org, &write_info.bucket)
|
||||
.context(BucketMappingError)?;
|
||||
|
||||
let body = parse_body(req).await?;
|
||||
let body = parse_body(req, max_request_size).await?;
|
||||
|
||||
let body = str::from_utf8(&body).context(ReadingBodyAsUtf8)?;
|
||||
|
||||
|
@ -595,7 +614,7 @@ async fn query<M: ConnectionManager + Send + Sync + Debug + 'static>(
|
|||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApplicationError> {
|
||||
let path = req.uri().path().to_string();
|
||||
let server = Arc::clone(&req.data::<Arc<AppServer<M>>>().expect("server state"));
|
||||
let server = Arc::clone(&req.data::<Server<M>>().expect("server state").app_server);
|
||||
|
||||
// TODO(edd): figure out best way of catching all errors in this observation.
|
||||
let obs = server.metrics.http_requests.observation(); // instrument request
|
||||
|
@ -661,7 +680,7 @@ async fn query<M: ConnectionManager + Send + Sync + Debug + 'static>(
|
|||
async fn health<M: ConnectionManager + Send + Sync + Debug + 'static>(
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApplicationError> {
|
||||
let server = Arc::clone(&req.data::<Arc<AppServer<M>>>().expect("server state"));
|
||||
let server = Arc::clone(&req.data::<Server<M>>().expect("server state").app_server);
|
||||
let path = req.uri().path().to_string();
|
||||
server
|
||||
.metrics
|
||||
|
@ -677,7 +696,7 @@ async fn health<M: ConnectionManager + Send + Sync + Debug + 'static>(
|
|||
async fn handle_metrics<M: ConnectionManager + Send + Sync + Debug + 'static>(
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApplicationError> {
|
||||
let server = Arc::clone(&req.data::<Arc<AppServer<M>>>().expect("server state"));
|
||||
let server = Arc::clone(&req.data::<Server<M>>().expect("server state").app_server);
|
||||
let path = req.uri().path().to_string();
|
||||
server
|
||||
.metrics
|
||||
|
@ -700,7 +719,7 @@ async fn list_partitions<M: ConnectionManager + Send + Sync + Debug + 'static>(
|
|||
) -> Result<Response<Body>, ApplicationError> {
|
||||
let path = req.uri().path().to_string();
|
||||
|
||||
let server = Arc::clone(&req.data::<Arc<AppServer<M>>>().expect("server state"));
|
||||
let server = Arc::clone(&req.data::<Server<M>>().expect("server state").app_server);
|
||||
|
||||
// TODO - catch error conditions
|
||||
let obs = server.metrics.http_requests.observation();
|
||||
|
@ -841,11 +860,12 @@ pub async fn serve<M>(
|
|||
addr: AddrIncoming,
|
||||
server: Arc<AppServer<M>>,
|
||||
shutdown: CancellationToken,
|
||||
max_request_size: usize,
|
||||
) -> Result<(), hyper::Error>
|
||||
where
|
||||
M: ConnectionManager + Send + Sync + Debug + 'static,
|
||||
{
|
||||
let router = router(server);
|
||||
let router = router(server, max_request_size);
|
||||
let service = RouterService::new(router).unwrap();
|
||||
|
||||
hyper::Server::builder(addr)
|
||||
|
@ -1234,6 +1254,8 @@ mod tests {
|
|||
.await;
|
||||
}
|
||||
|
||||
const TEST_MAX_REQUEST_SIZE: usize = 1024 * 1024;
|
||||
|
||||
#[tokio::test]
|
||||
async fn client_hangup_during_parse() {
|
||||
#[derive(Debug, Snafu)]
|
||||
|
@ -1253,7 +1275,9 @@ mod tests {
|
|||
.body(body)
|
||||
.unwrap();
|
||||
|
||||
let parse_result = parse_body(request).await.unwrap_err();
|
||||
let parse_result = parse_body(request, TEST_MAX_REQUEST_SIZE)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert_eq!(
|
||||
parse_result.to_string(),
|
||||
"Client hung up while sending body: error reading a body from connection: Blarg Error"
|
||||
|
@ -1334,7 +1358,12 @@ mod tests {
|
|||
let addr = AddrIncoming::bind(&bind_addr).expect("failed to bind server");
|
||||
let server_url = format!("http://{}", addr.local_addr());
|
||||
|
||||
tokio::task::spawn(serve(addr, server, CancellationToken::new()));
|
||||
tokio::task::spawn(serve(
|
||||
addr,
|
||||
server,
|
||||
CancellationToken::new(),
|
||||
TEST_MAX_REQUEST_SIZE,
|
||||
));
|
||||
println!("Started server at {}", server_url);
|
||||
server_url
|
||||
}
|
||||
|
|
|
@ -8,7 +8,7 @@ pub fn default_server_error_handler(error: server::Error) -> tonic::Status {
|
|||
use server::Error;
|
||||
|
||||
match error {
|
||||
Error::GetIdError { .. } => PreconditionViolation {
|
||||
Error::IdNotSet => PreconditionViolation {
|
||||
category: "Writer ID".to_string(),
|
||||
subject: "influxdata.com/iox".to_string(),
|
||||
description: "Writer ID must be set".to_string(),
|
||||
|
|
|
@ -56,7 +56,7 @@ where
|
|||
&self,
|
||||
_: Request<GetServerIdRequest>,
|
||||
) -> Result<Response<GetServerIdResponse>, Status> {
|
||||
match self.server.require_id().ok() {
|
||||
match self.server.server_id() {
|
||||
Some(id) => Ok(Response::new(GetServerIdResponse { id: id.get_u32() })),
|
||||
None => return Err(NotFound::default().into()),
|
||||
}
|
||||
|
@ -71,7 +71,7 @@ where
|
|||
|
||||
match self.server.set_id(id) {
|
||||
Ok(_) => Ok(Response::new(UpdateServerIdResponse {})),
|
||||
Err(e @ Error::SetIdError { .. }) => {
|
||||
Err(e @ Error::IdAlreadySet) => {
|
||||
return Err(FieldViolation {
|
||||
field: "id".to_string(),
|
||||
description: e.to_string(),
|
||||
|
@ -199,15 +199,18 @@ where
|
|||
&self,
|
||||
_: Request<ListRemotesRequest>,
|
||||
) -> Result<Response<ListRemotesResponse>, Status> {
|
||||
let remotes = self
|
||||
.server
|
||||
.remotes_sorted()
|
||||
.into_iter()
|
||||
.map(|(id, connection_string)| Remote {
|
||||
id: id.get_u32(),
|
||||
connection_string,
|
||||
})
|
||||
.collect();
|
||||
let result = self.server.remotes_sorted();
|
||||
let remotes = match result {
|
||||
Ok(remotes) => remotes
|
||||
.into_iter()
|
||||
.map(|(id, connection_string)| Remote {
|
||||
id: id.get_u32(),
|
||||
connection_string,
|
||||
})
|
||||
.collect(),
|
||||
Err(e) => return Err(default_server_error_handler(e)),
|
||||
};
|
||||
|
||||
Ok(Response::new(ListRemotesResponse { remotes }))
|
||||
}
|
||||
|
||||
|
@ -221,8 +224,16 @@ where
|
|||
.ok_or_else(|| FieldViolation::required("remote"))?;
|
||||
let remote_id = ServerId::try_from(remote.id)
|
||||
.map_err(|_| FieldViolation::required("id").scope("remote"))?;
|
||||
self.server
|
||||
|
||||
let result = self
|
||||
.server
|
||||
.update_remote(remote_id, remote.connection_string);
|
||||
|
||||
match result {
|
||||
Ok(_) => {}
|
||||
Err(e) => return Err(default_server_error_handler(e)),
|
||||
}
|
||||
|
||||
Ok(Response::new(UpdateRemoteResponse {}))
|
||||
}
|
||||
|
||||
|
@ -233,9 +244,12 @@ where
|
|||
let request = request.into_inner();
|
||||
let remote_id =
|
||||
ServerId::try_from(request.id).map_err(|_| FieldViolation::required("id"))?;
|
||||
self.server
|
||||
.delete_remote(remote_id)
|
||||
.ok_or_else(NotFound::default)?;
|
||||
|
||||
match self.server.delete_remote(remote_id) {
|
||||
Ok(Some(_)) => {}
|
||||
Ok(None) => return Err(NotFound::default().into()),
|
||||
Err(e) => return Err(default_server_error_handler(e)),
|
||||
}
|
||||
|
||||
Ok(Response::new(DeleteRemoteResponse {}))
|
||||
}
|
||||
|
@ -455,7 +469,7 @@ where
|
|||
|
||||
let tracker = self
|
||||
.server
|
||||
.wipe_preserved_catalog(db_name)
|
||||
.wipe_preserved_catalog(&db_name)
|
||||
.map_err(|e| match e {
|
||||
Error::DatabaseAlreadyExists { db_name } => AlreadyExists {
|
||||
resource_type: "database".to_string(),
|
||||
|
|
|
@ -65,6 +65,8 @@ async fn test_list_update_remotes() {
|
|||
const TEST_REMOTE_ADDR_2: &str = "4.3.2.1:4321";
|
||||
const TEST_REMOTE_ADDR_2_UPDATED: &str = "40.30.20.10:4321";
|
||||
|
||||
client.update_server_id(123).await.unwrap();
|
||||
|
||||
let res = client.list_remotes().await.expect("list remotes failed");
|
||||
assert_eq!(res.len(), 0);
|
||||
|
||||
|
|
|
@ -244,6 +244,18 @@ async fn test_list_chunks_error() {
|
|||
async fn test_remotes() {
|
||||
let server_fixture = ServerFixture::create_single_use().await;
|
||||
let addr = server_fixture.grpc_base();
|
||||
|
||||
Command::cargo_bin("influxdb_iox")
|
||||
.unwrap()
|
||||
.arg("server")
|
||||
.arg("set")
|
||||
.arg("32")
|
||||
.arg("--host")
|
||||
.arg(addr)
|
||||
.assert()
|
||||
.success()
|
||||
.stdout(predicate::str::contains("Ok"));
|
||||
|
||||
Command::cargo_bin("influxdb_iox")
|
||||
.unwrap()
|
||||
.arg("server")
|
||||
|
|
|
@ -49,16 +49,43 @@ async fn test_chunk_is_persisted_automatically() {
|
|||
assert_eq!(chunks[0].row_count, 1_000);
|
||||
}
|
||||
|
||||
async fn write_data(
|
||||
write_client: &mut influxdb_iox_client::write::Client,
|
||||
db_name: &str,
|
||||
num_payloads: u64,
|
||||
num_duplicates: u64,
|
||||
payload_size: u64,
|
||||
) {
|
||||
let payloads: Vec<_> = (0..num_payloads)
|
||||
.map(|x| {
|
||||
(0..payload_size)
|
||||
.map(|i| format!("data,tag{}=val{} x={} {}", x, i, i * 10, i))
|
||||
.join("\n")
|
||||
})
|
||||
.collect();
|
||||
|
||||
for payload in &payloads {
|
||||
// Writing the same data multiple times should be compacted away
|
||||
for _ in 0..=num_duplicates {
|
||||
let num_lines_written = write_client
|
||||
.write(db_name, payload)
|
||||
.await
|
||||
.expect("successful write");
|
||||
assert_eq!(num_lines_written, payload_size as usize);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_full_lifecycle() {
|
||||
let fixture = ServerFixture::create_shared().await;
|
||||
let mut write_client = fixture.write_client();
|
||||
|
||||
let num_payloads = 10;
|
||||
let num_duplicates = 2;
|
||||
let num_duplicates = 1;
|
||||
let payload_size = 1_000;
|
||||
|
||||
let total_rows = num_payloads * num_duplicates * payload_size;
|
||||
let total_rows = num_payloads * (1 + num_duplicates) * payload_size;
|
||||
|
||||
let db_name = rand_name();
|
||||
DatabaseBuilder::new(db_name.clone())
|
||||
|
@ -73,24 +100,14 @@ async fn test_full_lifecycle() {
|
|||
.build(fixture.grpc_channel())
|
||||
.await;
|
||||
|
||||
let payloads: Vec<_> = (0..num_payloads)
|
||||
.map(|x| {
|
||||
(0..payload_size)
|
||||
.map(|i| format!("data,tag{}=val{} x={} {}", x, i, i * 10, i))
|
||||
.join("\n")
|
||||
})
|
||||
.collect();
|
||||
|
||||
for payload in &payloads {
|
||||
// Writing the same data multiple times should be compacted away
|
||||
for _ in 0..num_duplicates {
|
||||
let num_lines_written = write_client
|
||||
.write(&db_name, payload)
|
||||
.await
|
||||
.expect("successful write");
|
||||
assert_eq!(num_lines_written, payload_size as usize);
|
||||
}
|
||||
}
|
||||
write_data(
|
||||
&mut write_client,
|
||||
&db_name,
|
||||
num_payloads,
|
||||
num_duplicates,
|
||||
payload_size,
|
||||
)
|
||||
.await;
|
||||
|
||||
wait_for_exact_chunk_states(
|
||||
&fixture,
|
||||
|
@ -123,6 +140,58 @@ async fn test_full_lifecycle() {
|
|||
assert_eq!(chunks[0].row_count, (num_payloads * payload_size) as usize)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_update_late_arrival() {
|
||||
let fixture = ServerFixture::create_shared().await;
|
||||
let mut write_client = fixture.write_client();
|
||||
|
||||
let payload_size = 100;
|
||||
|
||||
let db_name = rand_name();
|
||||
DatabaseBuilder::new(db_name.clone())
|
||||
.persist(true)
|
||||
// Don't close MUB automatically
|
||||
.mub_row_threshold(payload_size * 2)
|
||||
.persist_row_threshold(payload_size)
|
||||
.persist_age_threshold_seconds(1000)
|
||||
// Initially set to be a large value
|
||||
.late_arrive_window_seconds(1000)
|
||||
.build(fixture.grpc_channel())
|
||||
.await;
|
||||
|
||||
write_data(&mut write_client, &db_name, 1, 0, payload_size).await;
|
||||
|
||||
let mut management = fixture.management_client();
|
||||
|
||||
let chunks = management.list_chunks(&db_name).await.unwrap();
|
||||
assert_eq!(chunks.len(), 1);
|
||||
assert_eq!(
|
||||
chunks[0].storage,
|
||||
influxdb_iox_client::management::generated_types::ChunkStorage::OpenMutableBuffer as i32
|
||||
);
|
||||
|
||||
let mut rules = management.get_database(&db_name).await.unwrap();
|
||||
rules
|
||||
.lifecycle_rules
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.late_arrive_window_seconds = 1;
|
||||
|
||||
fixture
|
||||
.management_client()
|
||||
.update_database(rules)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
wait_for_exact_chunk_states(
|
||||
&fixture,
|
||||
&db_name,
|
||||
vec![ChunkStorage::ReadBufferAndObjectStore],
|
||||
std::time::Duration::from_secs(5),
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_query_chunk_after_restart() {
|
||||
// fixtures
|
||||
|
|
|
@ -13,7 +13,7 @@ use crate::{
|
|||
#[derive(Debug)]
|
||||
pub enum WriteBufferConfig {
|
||||
Writing(Arc<dyn WriteBufferWriting>),
|
||||
Reading(Arc<dyn WriteBufferReading>),
|
||||
Reading(Arc<tokio::sync::Mutex<Box<dyn WriteBufferReading>>>),
|
||||
}
|
||||
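Editor's aside (a sketch under the assumption that `WriteBufferReading::streams` now takes `&mut self`, as shown later in this diff, and that `Arc`, `WriteBufferReading`, and `WriteBufferError` are in scope as in this module): wrapping the reading half in `Arc<tokio::sync::Mutex<..>>` lets shared owners obtain the mutable access the trait requires. The `drain_once` helper below is hypothetical, not part of the PR.

// Hypothetical consumer of the reading half.
async fn drain_once(
    reading: &Arc<tokio::sync::Mutex<Box<dyn WriteBufferReading>>>,
) -> Result<(), WriteBufferError> {
    // The lock guard yields &mut Box<dyn WriteBufferReading>, which is what
    // streams() needs; it stays held for as long as the returned streams live.
    let mut guard = reading.lock().await;
    for (_sequencer_id, _entry_stream) in guard.streams() {
        // read from _entry_stream.stream here
    }
    Ok(())
}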
|
||||
impl WriteBufferConfig {
|
||||
|
@ -36,7 +36,9 @@ impl WriteBufferConfig {
|
|||
Some(WriteBufferConnection::Reading(conn)) => {
|
||||
let kafka_buffer = KafkaBufferConsumer::new(conn, server_id, name).await?;
|
||||
|
||||
Ok(Some(Self::Reading(Arc::new(kafka_buffer) as _)))
|
||||
Ok(Some(Self::Reading(Arc::new(tokio::sync::Mutex::new(
|
||||
Box::new(kafka_buffer) as _,
|
||||
)))))
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
use std::fmt::Debug;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use entry::{Entry, Sequence, SequencedEntry};
|
||||
use futures::stream::BoxStream;
|
||||
use futures::{future::BoxFuture, stream::BoxStream};
|
||||
|
||||
/// Generic boxed error type that is used in this crate.
|
||||
///
|
||||
|
@ -10,7 +12,7 @@ pub type WriteBufferError = Box<dyn std::error::Error + Sync + Send>;
|
|||
/// Writing to a Write Buffer takes an [`Entry`] and returns [`Sequence`] data that facilitates reading
|
||||
/// entries from the Write Buffer at a later time.
|
||||
#[async_trait]
|
||||
pub trait WriteBufferWriting: Sync + Send + std::fmt::Debug + 'static {
|
||||
pub trait WriteBufferWriting: Sync + Send + Debug + 'static {
|
||||
/// Send an `Entry` to the write buffer using the specified sequencer ID.
|
||||
///
|
||||
/// Returns information that can be used to restore entries at a later time.
|
||||
|
@ -21,16 +23,47 @@ pub trait WriteBufferWriting: Sync + Send + std::fmt::Debug + 'static {
|
|||
) -> Result<Sequence, WriteBufferError>;
|
||||
}
|
||||
|
||||
pub type FetchHighWatermarkFut<'a> = BoxFuture<'a, Result<u64, WriteBufferError>>;
|
||||
pub type FetchHighWatermark<'a> = Box<dyn (Fn() -> FetchHighWatermarkFut<'a>) + Send + Sync>;
|
||||
|
||||
/// Output stream of [`WriteBufferReading`].
|
||||
pub type EntryStream<'a> = BoxStream<'a, Result<SequencedEntry, WriteBufferError>>;
|
||||
pub struct EntryStream<'a> {
|
||||
/// Stream that produces entries.
|
||||
pub stream: BoxStream<'a, Result<SequencedEntry, WriteBufferError>>,
|
||||
|
||||
/// Get high watermark (= what we believe is the next sequence number to be added).
|
||||
///
|
||||
/// Can be used to calculate lag. Note that since the watermark is "the next sequence number to be added", it starts
|
||||
/// at 0 and after the entry with sequence number 0 is added to the buffer, it is 1.
|
||||
pub fetch_high_watermark: FetchHighWatermark<'a>,
|
||||
}
|
||||
|
||||
impl<'a> Debug for EntryStream<'a> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("EntryStream").finish_non_exhaustive()
|
||||
}
|
||||
}
|
||||
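Editor's aside (a sketch assuming the `EntryStream` and `WriteBufferError` definitions above; `lag` and `last_seen` are hypothetical names, not part of the PR): the watermark can be turned into a lag estimate roughly like this.

// Lag = how many sequence numbers lie between what we have read and the
// watermark ("the next sequence number to be added").
async fn lag(stream: &EntryStream<'_>, last_seen: Option<u64>) -> Result<u64, WriteBufferError> {
    let watermark = (stream.fetch_high_watermark)().await?;
    Ok(match last_seen {
        // we have read up to `seq`, so `seq + 1` entries are behind us
        Some(seq) => watermark.saturating_sub(seq.saturating_add(1)),
        // nothing read yet: everything up to the watermark is outstanding
        None => watermark,
    })
}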
|
||||
/// Produce streams (one per sequencer) of [`SequencedEntry`]s.
|
||||
pub trait WriteBufferReading: Sync + Send + std::fmt::Debug + 'static {
|
||||
#[async_trait]
|
||||
pub trait WriteBufferReading: Sync + Send + Debug + 'static {
|
||||
/// Returns a stream per sequencer.
|
||||
fn streams<'life0, 'async_trait>(&'life0 self) -> Vec<(u32, EntryStream<'async_trait>)>
|
||||
where
|
||||
'life0: 'async_trait,
|
||||
Self: 'async_trait;
|
||||
///
|
||||
/// Note that due to the mutable borrow, it is not possible to have multiple streams from the same
|
||||
/// [`WriteBufferReading`] instance at the same time. If all streams are dropped and requested again, the last
|
||||
/// offsets of the old streams will be the start offsets for the new streams. If you want to prevent that, either
|
||||
/// create a new [`WriteBufferReading`] or use [`seek`](Self::seek).
|
||||
fn streams(&mut self) -> Vec<(u32, EntryStream<'_>)>;
|
||||
|
||||
/// Seek given sequencer to given sequence number. The next output of related streams will be an entry with at least
|
||||
/// the given sequence number (the actual sequence number might be skipped due to "holes" in the stream).
|
||||
///
|
||||
/// Note that due to the mutable borrow, it is not possible to seek while streams exists.
|
||||
async fn seek(
|
||||
&mut self,
|
||||
sequencer_id: u32,
|
||||
sequence_number: u64,
|
||||
) -> Result<(), WriteBufferError>;
|
||||
}
|
||||
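Editor's aside (a sketch against the trait as defined above; `replay_from` is a hypothetical helper, not part of the PR): seeking a sequencer and then consuming its stream looks roughly like this.

use futures::StreamExt;

async fn replay_from(
    reader: &mut dyn WriteBufferReading,
    sequencer_id: u32,
    start: u64,
) -> Result<(), WriteBufferError> {
    // Seek first: streams must not exist while seeking (mutable borrow).
    reader.seek(sequencer_id, start).await?;

    for (id, mut entry_stream) in reader.streams() {
        if id != sequencer_id {
            continue;
        }
        // Read one entry; real code would loop until caught up.
        if let Some(sequenced) = entry_stream.stream.next().await {
            let _entry = sequenced?;
        }
    }
    Ok(())
}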
|
||||
pub mod test_utils {
|
||||
|
@ -65,6 +98,8 @@ pub mod test_utils {
|
|||
test_multi_stream_io(&adapter).await;
|
||||
test_multi_sequencer_io(&adapter).await;
|
||||
test_multi_writer_multi_reader(&adapter).await;
|
||||
test_seek(&adapter).await;
|
||||
test_watermark(&adapter).await;
|
||||
}
|
||||
|
||||
async fn test_single_stream_io<T>(adapter: &T)
|
||||
|
@ -78,7 +113,7 @@ pub mod test_utils {
|
|||
let entry_3 = lp_to_entry("upc user=3 300");
|
||||
|
||||
let writer = context.writing();
|
||||
let reader = context.reading().await;
|
||||
let mut reader = context.reading().await;
|
||||
|
||||
let mut streams = reader.streams();
|
||||
assert_eq!(streams.len(), 1);
|
||||
|
@ -88,67 +123,32 @@ pub mod test_utils {
|
|||
let mut cx = futures::task::Context::from_waker(&waker);
|
||||
|
||||
// empty stream is pending
|
||||
assert!(stream.poll_next_unpin(&mut cx).is_pending());
|
||||
assert!(stream.stream.poll_next_unpin(&mut cx).is_pending());
|
||||
|
||||
// adding content allows us to get results
|
||||
writer.store_entry(&entry_1, sequencer_id).await.unwrap();
|
||||
assert_eq!(stream.next().await.unwrap().unwrap().entry(), &entry_1);
|
||||
assert_eq!(
|
||||
stream.stream.next().await.unwrap().unwrap().entry(),
|
||||
&entry_1
|
||||
);
|
||||
|
||||
// stream is pending again
|
||||
assert!(stream.poll_next_unpin(&mut cx).is_pending());
|
||||
assert!(stream.stream.poll_next_unpin(&mut cx).is_pending());
|
||||
|
||||
// adding more data unblocks the stream
|
||||
writer.store_entry(&entry_2, sequencer_id).await.unwrap();
|
||||
writer.store_entry(&entry_3, sequencer_id).await.unwrap();
|
||||
assert_eq!(stream.next().await.unwrap().unwrap().entry(), &entry_2);
|
||||
assert_eq!(stream.next().await.unwrap().unwrap().entry(), &entry_3);
|
||||
assert_eq!(
|
||||
stream.stream.next().await.unwrap().unwrap().entry(),
|
||||
&entry_2
|
||||
);
|
||||
assert_eq!(
|
||||
stream.stream.next().await.unwrap().unwrap().entry(),
|
||||
&entry_3
|
||||
);
|
||||
|
||||
// stream is pending again
|
||||
assert!(stream.poll_next_unpin(&mut cx).is_pending());
|
||||
}
|
||||
|
||||
async fn test_multi_sequencer_io<T>(adapter: &T)
|
||||
where
|
||||
T: TestAdapter,
|
||||
{
|
||||
let context = adapter.new_context(2).await;
|
||||
|
||||
let entry_1 = lp_to_entry("upc user=1 100");
|
||||
let entry_2 = lp_to_entry("upc user=2 200");
|
||||
let entry_3 = lp_to_entry("upc user=3 300");
|
||||
|
||||
let writer = context.writing();
|
||||
let reader = context.reading().await;
|
||||
|
||||
let mut streams = reader.streams();
|
||||
assert_eq!(streams.len(), 2);
|
||||
let (sequencer_id_1, mut stream_1) = streams.pop().unwrap();
|
||||
let (sequencer_id_2, mut stream_2) = streams.pop().unwrap();
|
||||
assert_ne!(sequencer_id_1, sequencer_id_2);
|
||||
|
||||
let waker = futures::task::noop_waker();
|
||||
let mut cx = futures::task::Context::from_waker(&waker);
|
||||
|
||||
// empty streams are pending
|
||||
assert!(stream_1.poll_next_unpin(&mut cx).is_pending());
|
||||
assert!(stream_2.poll_next_unpin(&mut cx).is_pending());
|
||||
|
||||
// entries arrive at the right target stream
|
||||
writer.store_entry(&entry_1, sequencer_id_1).await.unwrap();
|
||||
assert_eq!(stream_1.next().await.unwrap().unwrap().entry(), &entry_1);
|
||||
assert!(stream_2.poll_next_unpin(&mut cx).is_pending());
|
||||
|
||||
writer.store_entry(&entry_2, sequencer_id_2).await.unwrap();
|
||||
assert!(stream_1.poll_next_unpin(&mut cx).is_pending());
|
||||
assert_eq!(stream_2.next().await.unwrap().unwrap().entry(), &entry_2);
|
||||
|
||||
writer.store_entry(&entry_3, sequencer_id_1).await.unwrap();
|
||||
assert!(stream_2.poll_next_unpin(&mut cx).is_pending());
|
||||
assert_eq!(stream_1.next().await.unwrap().unwrap().entry(), &entry_3);
|
||||
|
||||
// streams are pending again
|
||||
assert!(stream_1.poll_next_unpin(&mut cx).is_pending());
|
||||
assert!(stream_2.poll_next_unpin(&mut cx).is_pending());
|
||||
assert!(stream.stream.poll_next_unpin(&mut cx).is_pending());
|
||||
}
|
||||
|
||||
async fn test_multi_stream_io<T>(adapter: &T)
|
||||
|
@ -162,34 +162,104 @@ pub mod test_utils {
|
|||
let entry_3 = lp_to_entry("upc user=3 300");
|
||||
|
||||
let writer = context.writing();
|
||||
let reader = context.reading().await;
|
||||
|
||||
let mut streams_1 = reader.streams();
|
||||
let mut streams_2 = reader.streams();
|
||||
assert_eq!(streams_1.len(), 1);
|
||||
assert_eq!(streams_2.len(), 1);
|
||||
let (sequencer_id_1, mut stream_1) = streams_1.pop().unwrap();
|
||||
let (sequencer_id_2, mut stream_2) = streams_2.pop().unwrap();
|
||||
assert_eq!(sequencer_id_1, sequencer_id_2);
|
||||
let mut reader = context.reading().await;
|
||||
|
||||
let waker = futures::task::noop_waker();
|
||||
let mut cx = futures::task::Context::from_waker(&waker);
|
||||
|
||||
// empty streams are pending
|
||||
assert!(stream_1.poll_next_unpin(&mut cx).is_pending());
|
||||
assert!(stream_2.poll_next_unpin(&mut cx).is_pending());
|
||||
writer.store_entry(&entry_1, 0).await.unwrap();
|
||||
writer.store_entry(&entry_2, 0).await.unwrap();
|
||||
writer.store_entry(&entry_3, 0).await.unwrap();
|
||||
|
||||
// streams poll from same source
|
||||
// creating stream, drop stream, re-create it => still starts at first entry
|
||||
let mut streams = reader.streams();
|
||||
assert_eq!(streams.len(), 1);
|
||||
let (_sequencer_id, stream) = streams.pop().unwrap();
|
||||
drop(stream);
|
||||
drop(streams);
|
||||
let mut streams = reader.streams();
|
||||
assert_eq!(streams.len(), 1);
|
||||
let (_sequencer_id, mut stream) = streams.pop().unwrap();
|
||||
assert_eq!(
|
||||
stream.stream.next().await.unwrap().unwrap().entry(),
|
||||
&entry_1
|
||||
);
|
||||
|
||||
// re-creating stream after reading remembers offset
|
||||
drop(stream);
|
||||
drop(streams);
|
||||
let mut streams = reader.streams();
|
||||
assert_eq!(streams.len(), 1);
|
||||
let (_sequencer_id, mut stream) = streams.pop().unwrap();
|
||||
assert_eq!(
|
||||
stream.stream.next().await.unwrap().unwrap().entry(),
|
||||
&entry_2
|
||||
);
|
||||
assert_eq!(
|
||||
stream.stream.next().await.unwrap().unwrap().entry(),
|
||||
&entry_3
|
||||
);
|
||||
|
||||
// re-creating stream after reading everything makes it pending
|
||||
drop(stream);
|
||||
drop(streams);
|
||||
let mut streams = reader.streams();
|
||||
assert_eq!(streams.len(), 1);
|
||||
let (_sequencer_id, mut stream) = streams.pop().unwrap();
|
||||
assert!(stream.stream.poll_next_unpin(&mut cx).is_pending());
|
||||
}
|
||||
|
||||
async fn test_multi_sequencer_io<T>(adapter: &T)
|
||||
where
|
||||
T: TestAdapter,
|
||||
{
|
||||
let context = adapter.new_context(2).await;
|
||||
|
||||
let entry_1 = lp_to_entry("upc user=1 100");
|
||||
let entry_2 = lp_to_entry("upc user=2 200");
|
||||
let entry_3 = lp_to_entry("upc user=3 300");
|
||||
|
||||
let writer = context.writing();
|
||||
let mut reader = context.reading().await;
|
||||
|
||||
let mut streams = reader.streams();
|
||||
assert_eq!(streams.len(), 2);
|
||||
let (sequencer_id_1, mut stream_1) = streams.pop().unwrap();
|
||||
let (sequencer_id_2, mut stream_2) = streams.pop().unwrap();
|
||||
assert_ne!(sequencer_id_1, sequencer_id_2);
|
||||
|
||||
let waker = futures::task::noop_waker();
|
||||
let mut cx = futures::task::Context::from_waker(&waker);
|
||||
|
||||
// empty streams are pending
|
||||
assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending());
|
||||
assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending());
|
||||
|
||||
// entries arrive at the right target stream
|
||||
writer.store_entry(&entry_1, sequencer_id_1).await.unwrap();
|
||||
writer.store_entry(&entry_2, sequencer_id_1).await.unwrap();
|
||||
writer.store_entry(&entry_3, sequencer_id_1).await.unwrap();
|
||||
assert_eq!(stream_1.next().await.unwrap().unwrap().entry(), &entry_1);
|
||||
assert_eq!(stream_2.next().await.unwrap().unwrap().entry(), &entry_2);
|
||||
assert_eq!(stream_1.next().await.unwrap().unwrap().entry(), &entry_3);
|
||||
assert_eq!(
|
||||
stream_1.stream.next().await.unwrap().unwrap().entry(),
|
||||
&entry_1
|
||||
);
|
||||
assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending());
|
||||
|
||||
// both streams are pending again
|
||||
assert!(stream_1.poll_next_unpin(&mut cx).is_pending());
|
||||
assert!(stream_2.poll_next_unpin(&mut cx).is_pending());
|
||||
writer.store_entry(&entry_2, sequencer_id_2).await.unwrap();
|
||||
assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending());
|
||||
assert_eq!(
|
||||
stream_2.stream.next().await.unwrap().unwrap().entry(),
|
||||
&entry_2
|
||||
);
|
||||
|
||||
writer.store_entry(&entry_3, sequencer_id_1).await.unwrap();
|
||||
assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending());
|
||||
assert_eq!(
|
||||
stream_1.stream.next().await.unwrap().unwrap().entry(),
|
||||
&entry_3
|
||||
);
|
||||
|
||||
// streams are pending again
|
||||
assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending());
|
||||
assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending());
|
||||
}
|
||||
|
||||
async fn test_multi_writer_multi_reader<T>(adapter: &T)
|
||||
|
@ -204,8 +274,8 @@ pub mod test_utils {
|
|||
|
||||
let writer_1 = context.writing();
|
||||
let writer_2 = context.writing();
|
||||
let reader_1 = context.reading().await;
|
||||
let reader_2 = context.reading().await;
|
||||
let mut reader_1 = context.reading().await;
|
||||
let mut reader_2 = context.reading().await;
|
||||
|
||||
// TODO: do not hard-code sequencer IDs here but provide a proper interface
|
||||
writer_1.store_entry(&entry_east_1, 0).await.unwrap();
|
||||
|
@ -213,18 +283,119 @@ pub mod test_utils {
|
|||
writer_2.store_entry(&entry_east_2, 0).await.unwrap();
|
||||
|
||||
assert_reader_content(
|
||||
reader_1,
|
||||
&mut reader_1,
|
||||
&[(0, &[&entry_east_1, &entry_east_2]), (1, &[&entry_west_1])],
|
||||
)
|
||||
.await;
|
||||
assert_reader_content(
|
||||
reader_2,
|
||||
&mut reader_2,
|
||||
&[(0, &[&entry_east_1, &entry_east_2]), (1, &[&entry_west_1])],
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
async fn assert_reader_content<R>(reader: R, expected: &[(u32, &[&Entry])])
|
||||
async fn test_seek<T>(adapter: &T)
|
||||
where
|
||||
T: TestAdapter,
|
||||
{
|
||||
let context = adapter.new_context(2).await;
|
||||
|
||||
let waker = futures::task::noop_waker();
|
||||
let mut cx = futures::task::Context::from_waker(&waker);
|
||||
|
||||
let entry_east_1 = lp_to_entry("upc,region=east user=1 100");
|
||||
let entry_east_2 = lp_to_entry("upc,region=east user=2 200");
|
||||
let entry_east_3 = lp_to_entry("upc,region=east user=3 300");
|
||||
let entry_west_1 = lp_to_entry("upc,region=west user=1 200");
|
||||
|
||||
let writer = context.writing();
|
||||
let _sequence_number_east_1 = writer.store_entry(&entry_east_1, 0).await.unwrap().number;
|
||||
let sequence_number_east_2 = writer.store_entry(&entry_east_2, 0).await.unwrap().number;
|
||||
let _sequence_number_west_1 = writer.store_entry(&entry_west_1, 1).await.unwrap().number;
|
||||
|
||||
let mut reader_1 = context.reading().await;
|
||||
let mut reader_2 = context.reading().await;
|
||||
|
||||
// forward seek
|
||||
reader_1.seek(0, sequence_number_east_2).await.unwrap();
|
||||
assert_reader_content(
|
||||
&mut reader_1,
|
||||
&[(0, &[&entry_east_2]), (1, &[&entry_west_1])],
|
||||
)
|
||||
.await;
|
||||
assert_reader_content(
|
||||
&mut reader_2,
|
||||
&[(0, &[&entry_east_1, &entry_east_2]), (1, &[&entry_west_1])],
|
||||
)
|
||||
.await;
|
||||
|
||||
// backward seek
|
||||
reader_1.seek(0, 0).await.unwrap();
|
||||
assert_reader_content(
|
||||
&mut reader_1,
|
||||
&[(0, &[&entry_east_1, &entry_east_2]), (1, &[])],
|
||||
)
|
||||
.await;
|
||||
|
||||
// seek far past the end of the existing data, then write more data
reader_1.seek(0, 1_000_000).await.unwrap();
let _sequence_number_east_3 = writer.store_entry(&entry_east_3, 0).await.unwrap().number;
let mut streams = reader_1.streams();
assert_eq!(streams.len(), 2);
let (_sequencer_id, mut stream_1) = streams.pop().unwrap();
let (_sequencer_id, mut stream_2) = streams.pop().unwrap();
assert!(stream_1.stream.poll_next_unpin(&mut cx).is_pending());
assert!(stream_2.stream.poll_next_unpin(&mut cx).is_pending());
drop(stream_1);
drop(stream_2);
drop(streams);

// seeking unknown sequencer is NOT an error
reader_1.seek(0, 42).await.unwrap();
}
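
Side note for reviewers: the seek contract exercised by test_seek above boils down to a few rules. This is a minimal sketch against the `WriteBufferReading` trait introduced in this PR, not code from the diff; the helper name `seek_examples` and the `write_buffer::core` import path are assumptions.

    use write_buffer::core::{WriteBufferError, WriteBufferReading};

    // Illustrative sketch only, not part of this PR.
    async fn seek_examples<R: WriteBufferReading>(reader: &mut R) -> Result<(), WriteBufferError> {
        // forward seek: entries below sequence number 10 on sequencer 0 are skipped
        reader.seek(0, 10).await?;

        // backward seek: replay sequencer 0 from the very beginning
        reader.seek(0, 0).await?;

        // seeking far past the end is fine; the stream just stays pending until
        // entries at or above that sequence number arrive
        reader.seek(0, 1_000_000).await?;

        // seeking a sequencer ID that does not exist is a no-op, not an error
        reader.seek(9_999, 0).await?;

        Ok(())
    }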

async fn test_watermark<T>(adapter: &T)
where
T: TestAdapter,
{
let context = adapter.new_context(2).await;

let entry_east_1 = lp_to_entry("upc,region=east user=1 100");
let entry_east_2 = lp_to_entry("upc,region=east user=2 200");
let entry_west_1 = lp_to_entry("upc,region=west user=1 200");

let writer = context.writing();
let mut reader = context.reading().await;

let mut streams = reader.streams();
assert_eq!(streams.len(), 2);
let (sequencer_id_1, stream_1) = streams.pop().unwrap();
let (sequencer_id_2, stream_2) = streams.pop().unwrap();

// start at watermark 0
assert_eq!((stream_1.fetch_high_watermark)().await.unwrap(), 0);
assert_eq!((stream_2.fetch_high_watermark)().await.unwrap(), 0);

// high water mark moves
writer
.store_entry(&entry_east_1, sequencer_id_1)
.await
.unwrap();
let mark_1 = writer
.store_entry(&entry_east_2, sequencer_id_1)
.await
.unwrap()
.number;
let mark_2 = writer
.store_entry(&entry_west_1, sequencer_id_2)
.await
.unwrap()
.number;
assert_eq!((stream_1.fetch_high_watermark)().await.unwrap(), mark_1 + 1);
assert_eq!((stream_2.fetch_high_watermark)().await.unwrap(), mark_2 + 1);
}

async fn assert_reader_content<R>(reader: &mut R, expected: &[(u32, &[&Entry])])
where
R: WriteBufferReading,
{

@ -239,6 +410,7 @@ pub mod test_utils {

// we need to limit the stream to `expected.len()` elements, otherwise it might be pending forever
let mut results: Vec<_> = actual_stream
.stream
.take(expected_entries.len())
.try_collect()
.await
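
Side note for reviewers: a typical consumer of the new streams()/fetch_high_watermark surface reads what is currently buffered and then stops, since the per-sequencer streams never terminate on their own. A minimal sketch, assuming the types from this PR under a `write_buffer::core` path; the helper name `drain_existing` is made up for illustration.

    use futures::StreamExt;
    use write_buffer::core::WriteBufferReading;

    // Illustrative sketch only, not part of this PR.
    async fn drain_existing<R: WriteBufferReading>(reader: &mut R) {
        for (sequencer_id, mut entry_stream) in reader.streams() {
            // high watermark convention: last stored sequence number + 1, or 0 when empty
            let watermark = (entry_stream.fetch_high_watermark)().await.unwrap();
            if watermark == 0 {
                continue;
            }

            while let Some(sequenced_entry) = entry_stream.stream.next().await {
                let sequenced_entry = sequenced_entry.unwrap();
                let number = sequenced_entry.sequence().unwrap().number;
                println!("sequencer {}: replayed entry {}", sequencer_id, number);

                // the stream never ends, it only turns pending; stop once caught up
                if number + 1 >= watermark {
                    break;
                }
            }
        }
    }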
@ -1,22 +1,28 @@
use std::{
collections::BTreeMap,
convert::{TryFrom, TryInto},
sync::Arc,
time::Duration,
};

use async_trait::async_trait;
use data_types::server_id::ServerId;
use entry::{Entry, Sequence, SequencedEntry};
use futures::StreamExt;
use futures::{FutureExt, StreamExt};
use observability_deps::tracing::{debug, info};
use rdkafka::{
consumer::{BaseConsumer, Consumer, StreamConsumer},
error::KafkaError,
producer::{FutureProducer, FutureRecord},
types::RDKafkaErrorCode,
util::Timeout,
ClientConfig, Message, TopicPartitionList,
ClientConfig, Message, Offset, TopicPartitionList,
};

use crate::core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting};
use crate::core::{
EntryStream, FetchHighWatermark, FetchHighWatermarkFut, WriteBufferError, WriteBufferReading,
WriteBufferWriting,
};

pub struct KafkaBufferProducer {
conn: String,

@ -77,8 +83,8 @@ impl KafkaBufferProducer {
let mut cfg = ClientConfig::new();
cfg.set("bootstrap.servers", &conn);
cfg.set("message.timeout.ms", "5000");
cfg.set("message.max.bytes", "10000000");
cfg.set("queue.buffering.max.kbytes", "10485760");
cfg.set("message.max.bytes", "31457280");
cfg.set("queue.buffering.max.kbytes", "31457280");
cfg.set("request.required.acks", "all"); // equivalent to acks=-1

let producer: FutureProducer = cfg.create()?;

@ -94,7 +100,7 @@ impl KafkaBufferProducer {
pub struct KafkaBufferConsumer {
conn: String,
database_name: String,
consumers: Vec<(u32, StreamConsumer)>,
consumers: BTreeMap<u32, Arc<StreamConsumer>>,
}

// Needed because rdkafka's StreamConsumer doesn't impl Debug

@ -107,31 +113,94 @@ impl std::fmt::Debug for KafkaBufferConsumer {
}
}

#[async_trait]
impl WriteBufferReading for KafkaBufferConsumer {
fn streams<'life0, 'async_trait>(&'life0 self) -> Vec<(u32, EntryStream<'async_trait>)>
where
'life0: 'async_trait,
Self: 'async_trait,
{
self.consumers
.iter()
.map(|(sequencer_id, consumer)| {
let stream = consumer
.stream()
.map(|message| {
let message = message?;
let entry = Entry::try_from(message.payload().unwrap().to_vec())?;
let sequence = Sequence {
id: message.partition().try_into()?,
number: message.offset().try_into()?,
};
fn streams(&mut self) -> Vec<(u32, EntryStream<'_>)> {
let mut streams = vec![];

Ok(SequencedEntry::new_from_sequence(sequence, entry)?)
for (sequencer_id, consumer) in &self.consumers {
let sequencer_id = *sequencer_id;
let consumer_cloned = Arc::clone(consumer);
let database_name = self.database_name.clone();

let stream = consumer
.stream()
.map(move |message| {
let message = message?;
let entry = Entry::try_from(message.payload().unwrap().to_vec())?;
let sequence = Sequence {
id: message.partition().try_into()?,
number: message.offset().try_into()?,
};

Ok(SequencedEntry::new_from_sequence(sequence, entry)?)
})
.boxed();

let fetch_high_watermark = move || {
let consumer_cloned = Arc::clone(&consumer_cloned);
let database_name = database_name.clone();

let fut = async move {
match tokio::task::spawn_blocking(move || {
consumer_cloned.fetch_watermarks(
&database_name,
sequencer_id as i32,
Duration::from_secs(60),
)
})
.boxed();
(*sequencer_id, stream)
.await
.expect("subtask failed")
{
Ok((_low, high)) => Ok(high as u64),
Err(KafkaError::MetadataFetch(RDKafkaErrorCode::UnknownPartition)) => Ok(0),
Err(e) => Err(Box::new(e) as Box<dyn std::error::Error + Send + Sync>),
}
};

fut.boxed() as FetchHighWatermarkFut<'_>
};
let fetch_high_watermark = Box::new(fetch_high_watermark) as FetchHighWatermark<'_>;

streams.push((
sequencer_id,
EntryStream {
stream,
fetch_high_watermark,
},
));
}

streams
}

async fn seek(
&mut self,
sequencer_id: u32,
sequence_number: u64,
) -> Result<(), WriteBufferError> {
if let Some(consumer) = self.consumers.get(&sequencer_id) {
let consumer = Arc::clone(consumer);
let database_name = self.database_name.clone();
let offset = if sequence_number > 0 {
Offset::Offset(sequence_number as i64)
} else {
Offset::Beginning
};

tokio::task::spawn_blocking(move || {
consumer.seek(
&database_name,
sequencer_id as i32,
offset,
Duration::from_secs(60),
)
})
.collect()
.await
.expect("subtask failed")?;
}

Ok(())
}
}
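
Side note for reviewers: fetch_watermarks and seek are blocking rdkafka calls, which is why the implementation above wraps them in tokio::task::spawn_blocking instead of awaiting them on an async worker thread. The pattern in isolation looks roughly like this; the helper name `high_watermark` and the fixed 60 s timeout are illustrative assumptions, and the real implementation additionally maps UnknownPartition to a watermark of 0.

    use std::{sync::Arc, time::Duration};

    use rdkafka::consumer::{Consumer, StreamConsumer};

    // Illustrative sketch only, not part of this PR.
    async fn high_watermark(
        consumer: Arc<StreamConsumer>,
        topic: String,
        partition: i32,
    ) -> Result<u64, Box<dyn std::error::Error + Send + Sync>> {
        let (_low, high) = tokio::task::spawn_blocking(move || {
            // blocking network round-trip; keep it off the async reactor
            consumer.fetch_watermarks(&topic, partition, Duration::from_secs(60))
        })
        .await
        .expect("subtask failed")?;

        Ok(high as u64)
    }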

@ -169,11 +238,21 @@ impl KafkaBufferConsumer {

let mut assignment = TopicPartitionList::new();
assignment.add_partition(&database_name, partition as i32);
consumer.assign(&assignment)?;
Ok((partition, consumer))
// We must set the offset to `Beginning` here to avoid the following error during seek:
// KafkaError (Seek error: Local: Erroneous state)
//
// Also see:
// - https://github.com/Blizzard/node-rdkafka/issues/237
// - https://github.com/confluentinc/confluent-kafka-go/issues/121#issuecomment-362308376
assignment
.set_partition_offset(&database_name, partition as i32, Offset::Beginning)
.expect("partition was set just before");

consumer.assign(&assignment)?;
Ok((partition, Arc::new(consumer)))
})
.collect::<Result<Vec<(u32, StreamConsumer)>, KafkaError>>()?;
.collect::<Result<BTreeMap<u32, Arc<StreamConsumer>>, KafkaError>>()?;

Ok(Self {
conn,

@ -2,10 +2,13 @@ use std::{collections::BTreeMap, sync::Arc, task::Poll};

use async_trait::async_trait;
use entry::{Entry, Sequence, SequencedEntry};
use futures::{stream, StreamExt};
use futures::{stream, FutureExt, StreamExt};
use parking_lot::Mutex;

use crate::core::{EntryStream, WriteBufferError, WriteBufferReading, WriteBufferWriting};
use crate::core::{
EntryStream, FetchHighWatermark, FetchHighWatermarkFut, WriteBufferError, WriteBufferReading,
WriteBufferWriting,
};

type EntryResVec = Vec<Result<SequencedEntry, WriteBufferError>>;

@ -153,21 +156,38 @@ impl WriteBufferWriting for MockBufferForWritingThatAlwaysErrors {
}
}

/// Sequencer-specific playback state
struct PlaybackState {
/// Index within the entry vector.
vector_index: usize,

/// Offset within the sequencer IDs.
offset: u64,
}

pub struct MockBufferForReading {
state: MockBufferSharedState,
positions: Arc<Mutex<BTreeMap<u32, usize>>>,
shared_state: MockBufferSharedState,
playback_states: Arc<Mutex<BTreeMap<u32, PlaybackState>>>,
}

impl MockBufferForReading {
pub fn new(state: MockBufferSharedState) -> Self {
let n_sequencers = state.entries.lock().len() as u32;
let positions: BTreeMap<_, _> = (0..n_sequencers)
.map(|sequencer_id| (sequencer_id, 0))
let playback_states: BTreeMap<_, _> = (0..n_sequencers)
.map(|sequencer_id| {
(
sequencer_id,
PlaybackState {
vector_index: 0,
offset: 0,
},
)
})
.collect();

Self {
state,
positions: Arc::new(Mutex::new(positions)),
shared_state: state,
playback_states: Arc::new(Mutex::new(playback_states)),
}
}
}

@ -178,46 +198,110 @@ impl std::fmt::Debug for MockBufferForReading {
}
}

#[async_trait]
impl WriteBufferReading for MockBufferForReading {
fn streams<'life0, 'async_trait>(&'life0 self) -> Vec<(u32, EntryStream<'async_trait>)>
where
'life0: 'async_trait,
Self: 'async_trait,
{
fn streams(&mut self) -> Vec<(u32, EntryStream<'_>)> {
let sequencer_ids: Vec<_> = {
let positions = self.positions.lock();
positions.keys().copied().collect()
let playback_states = self.playback_states.lock();
playback_states.keys().copied().collect()
};

let mut streams = vec![];
for sequencer_id in sequencer_ids {
let state = self.state.clone();
let positions = Arc::clone(&self.positions);
let shared_state = self.shared_state.clone();
let playback_states = Arc::clone(&self.playback_states);

let stream = stream::poll_fn(move |_ctx| {
let entries = state.entries.lock();
let mut positions = positions.lock();
let entries = shared_state.entries.lock();
let mut playback_states = playback_states.lock();

let entry_vec = entries.get(&sequencer_id).unwrap();
let position = positions.get_mut(&sequencer_id).unwrap();
let playback_state = playback_states.get_mut(&sequencer_id).unwrap();

if entry_vec.len() > *position {
let entry = match &entry_vec[*position] {
Ok(entry) => Ok(entry.clone()),
Err(e) => Err(e.to_string().into()),
};
*position += 1;
return Poll::Ready(Some(entry));
while entry_vec.len() > playback_state.vector_index {
let entry_result = &entry_vec[playback_state.vector_index];

// consume entry
playback_state.vector_index += 1;

match entry_result {
Ok(entry) => {
// found an entry => need to check if it is within the offset
let sequence = entry.sequence().unwrap();
if sequence.number >= playback_state.offset {
// within offset => return entry to caller
return Poll::Ready(Some(Ok(entry.clone())));
} else {
// offset is larger than the current entry => ignore entry and try next
continue;
}
}
Err(e) => {
// found an error => return entry to caller
return Poll::Ready(Some(Err(e.to_string().into())));
}
}
}

// we are at the end of the recorded entries => report pending
Poll::Pending
})
.boxed();
streams.push((sequencer_id, stream));

let shared_state = self.shared_state.clone();

let fetch_high_watermark = move || {
let shared_state = shared_state.clone();

let fut = async move {
let entries = shared_state.entries.lock();
let entry_vec = entries.get(&sequencer_id).unwrap();
let watermark = entry_vec
.iter()
.filter_map(|entry_res| {
entry_res
.as_ref()
.ok()
.map(|entry| entry.sequence().unwrap().number)
})
.max()
.map(|n| n + 1)
.unwrap_or(0);

Ok(watermark)
};
fut.boxed() as FetchHighWatermarkFut<'_>
};
let fetch_high_watermark = Box::new(fetch_high_watermark) as FetchHighWatermark<'_>;

streams.push((
sequencer_id,
EntryStream {
stream,
fetch_high_watermark,
},
));
}

streams
}

async fn seek(
&mut self,
sequencer_id: u32,
sequence_number: u64,
) -> Result<(), WriteBufferError> {
let mut playback_states = self.playback_states.lock();

if let Some(playback_state) = playback_states.get_mut(&sequencer_id) {
playback_state.offset = sequence_number;

// reset position to start since seeking might go backwards
playback_state.vector_index = 0;
}

Ok(())
}
}
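
Side note for reviewers: the mock's seek model can be summarized independently of the stream machinery. Seeking only records the target offset and rewinds the vector index; the playback loop then skips everything below that offset. A stripped-down sketch of that idea, illustrative only, with made-up names:

    /// Replay recorded sequence numbers, honoring a seek offset.
    fn replay(recorded: &[u64], offset: u64) -> Vec<u64> {
        recorded
            .iter()
            .copied()
            // the playback loop skips entries below the seek offset
            .filter(|number| *number >= offset)
            .collect()
    }

    fn main() {
        // after seeking to 2, only sequence numbers >= 2 are replayed
        assert_eq!(replay(&[0, 1, 2, 3], 2), vec![2, 3]);
        // a backward seek to 0 replays everything
        assert_eq!(replay(&[0, 1, 2, 3], 0), vec![0, 1, 2, 3]);
    }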

#[cfg(test)]