Merge branch 'main' into feature-label

pull/24376/head
kodiakhq[bot] 2021-04-14 17:23:17 +00:00 committed by GitHub
commit 7247467225
82 changed files with 3783 additions and 2836 deletions

Cargo.lock generated
View File

@ -111,7 +111,7 @@ checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b"
[[package]]
name = "arrow"
version = "4.0.0-SNAPSHOT"
source = "git+https://github.com/apache/arrow.git?rev=e69478a890b1e4eee49b540b69b2711d170a0433#e69478a890b1e4eee49b540b69b2711d170a0433"
source = "git+https://github.com/apache/arrow.git?rev=00a443629c00079ea03c0b9f415d74669d2759a7#00a443629c00079ea03c0b9f415d74669d2759a7"
dependencies = [
"cfg_aliases",
"chrono",
@ -134,7 +134,7 @@ dependencies = [
[[package]]
name = "arrow-flight"
version = "4.0.0-SNAPSHOT"
source = "git+https://github.com/apache/arrow.git?rev=e69478a890b1e4eee49b540b69b2711d170a0433#e69478a890b1e4eee49b540b69b2711d170a0433"
source = "git+https://github.com/apache/arrow.git?rev=00a443629c00079ea03c0b9f415d74669d2759a7#00a443629c00079ea03c0b9f415d74669d2759a7"
dependencies = [
"arrow",
"bytes",
@ -429,9 +429,9 @@ dependencies = [
[[package]]
name = "cast"
version = "0.2.3"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b9434b9a5aa1450faa3f9cb14ea0e8c53bb5d2b3c1bfd1ab4fc03e9f33fbfb0"
checksum = "cc38c385bfd7e444464011bb24820f40dd1c76bcdfa1b78611cb7c2e5cafab75"
dependencies = [
"rustc_version",
]
@ -488,9 +488,9 @@ dependencies = [
[[package]]
name = "clang-sys"
version = "1.1.1"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f54d78e30b388d4815220c8dd03fea5656b6c6d32adb59e89061552a102f8da1"
checksum = "853eda514c284c2287f4bf20ae614f8781f40a81d32ecda6e91449304dfe077c"
dependencies = [
"glob",
"libc",
@ -599,6 +599,7 @@ dependencies = [
"clap",
"criterion-plot",
"csv",
"futures",
"itertools 0.10.0",
"lazy_static",
"num-traits",
@ -611,6 +612,7 @@ dependencies = [
"serde_derive",
"serde_json",
"tinytemplate",
"tokio",
"walkdir",
]
@ -662,9 +664,9 @@ dependencies = [
[[package]]
name = "crossbeam-channel"
version = "0.5.0"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dca26ee1f8d361640700bde38b2c37d8c22b3ce2d360e1fc1c74ea4b0aa7d775"
checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4"
dependencies = [
"cfg-if 1.0.0",
"crossbeam-utils",
@ -787,7 +789,7 @@ dependencies = [
[[package]]
name = "datafusion"
version = "4.0.0-SNAPSHOT"
source = "git+https://github.com/apache/arrow.git?rev=e69478a890b1e4eee49b540b69b2711d170a0433#e69478a890b1e4eee49b540b69b2711d170a0433"
source = "git+https://github.com/apache/arrow.git?rev=00a443629c00079ea03c0b9f415d74669d2759a7#00a443629c00079ea03c0b9f415d74669d2759a7"
dependencies = [
"ahash 0.7.2",
"arrow",
@ -1044,9 +1046,9 @@ checksum = "2022715d62ab30faffd124d40b76f4134a550a87792276512b18d63272333394"
[[package]]
name = "futures"
version = "0.3.13"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f55667319111d593ba876406af7c409c0ebb44dc4be6132a783ccf163ea14c1"
checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253"
dependencies = [
"futures-channel",
"futures-core",
@ -1059,9 +1061,9 @@ dependencies = [
[[package]]
name = "futures-channel"
version = "0.3.13"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8c2dd2df839b57db9ab69c2c9d8f3e8c81984781937fe2807dc6dcf3b2ad2939"
checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25"
dependencies = [
"futures-core",
"futures-sink",
@ -1069,15 +1071,15 @@ dependencies = [
[[package]]
name = "futures-core"
version = "0.3.13"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "15496a72fabf0e62bdc3df11a59a3787429221dd0710ba8ef163d6f7a9112c94"
checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815"
[[package]]
name = "futures-executor"
version = "0.3.13"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "891a4b7b96d84d5940084b2a37632dd65deeae662c114ceaa2c879629c9c0ad1"
checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d"
dependencies = [
"futures-core",
"futures-task",
@ -1086,15 +1088,15 @@ dependencies = [
[[package]]
name = "futures-io"
version = "0.3.13"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d71c2c65c57704c32f5241c1223167c2c3294fd34ac020c807ddbe6db287ba59"
checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04"
[[package]]
name = "futures-macro"
version = "0.3.13"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea405816a5139fb39af82c2beb921d52143f556038378d6db21183a5c37fbfb7"
checksum = "668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b"
dependencies = [
"proc-macro-hack",
"proc-macro2",
@ -1104,21 +1106,21 @@ dependencies = [
[[package]]
name = "futures-sink"
version = "0.3.13"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85754d98985841b7d4f5e8e6fbfa4a4ac847916893ec511a2917ccd8525b8bb3"
checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23"
[[package]]
name = "futures-task"
version = "0.3.13"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa189ef211c15ee602667a6fcfe1c1fd9e07d42250d2156382820fba33c9df80"
checksum = "ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc"
[[package]]
name = "futures-test"
version = "0.3.13"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1fe5e51002528907757d5f1648101086f7197f792112db43ba23b06b09e6bce"
checksum = "e77baeade98824bc928c21b8ad39918b9d8a06745ebdb6e2c93fb7673fb7968d"
dependencies = [
"futures-core",
"futures-executor",
@ -1132,9 +1134,9 @@ dependencies = [
[[package]]
name = "futures-util"
version = "0.3.13"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1812c7ab8aedf8d6f2701a43e1243acdbcc2b36ab26e2ad421eb99ac963d96d1"
checksum = "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025"
dependencies = [
"futures-channel",
"futures-core",
@ -1885,6 +1887,7 @@ dependencies = [
"influxdb_line_protocol",
"internal_types",
"observability_deps",
"parking_lot",
"snafu",
"string-interner",
"test_helpers",
@ -2298,7 +2301,7 @@ dependencies = [
[[package]]
name = "parquet"
version = "4.0.0-SNAPSHOT"
source = "git+https://github.com/apache/arrow.git?rev=e69478a890b1e4eee49b540b69b2711d170a0433#e69478a890b1e4eee49b540b69b2711d170a0433"
source = "git+https://github.com/apache/arrow.git?rev=00a443629c00079ea03c0b9f415d74669d2759a7#00a443629c00079ea03c0b9f415d74669d2759a7"
dependencies = [
"arrow",
"base64 0.12.3",
@ -2331,6 +2334,7 @@ dependencies = [
"bytes",
"data_types",
"futures",
"internal_types",
"object_store",
"parking_lot",
"snafu",
@ -2646,6 +2650,7 @@ dependencies = [
"futures",
"influxdb_line_protocol",
"internal_types",
"libc",
"observability_deps",
"parking_lot",
"snafu",
@ -2880,9 +2885,9 @@ dependencies = [
[[package]]
name = "reqwest"
version = "0.11.2"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf12057f289428dbf5c591c74bf10392e4a8003f993405a902f20117019022d4"
checksum = "2296f2fac53979e8ccbc4a1136b25dcefd37be9ed7e4a1f6b05a6029c84ff124"
dependencies = [
"base64 0.13.0",
"bytes",
@ -3117,9 +3122,9 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]]
name = "sct"
version = "0.6.0"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3042af939fca8c3453b7af0f1c66e533a15a86169e39de2657310ade8f98d3c"
checksum = "b362b83898e0e69f38515b82ee15aa80636befe47c3b6d3d89a911e78fc228ce"
dependencies = [
"ring",
"untrusted",
@ -3260,13 +3265,16 @@ dependencies = [
"bytes",
"chrono",
"crc32fast",
"criterion",
"data_types",
"flatbuffers",
"flate2",
"futures",
"generated_types",
"influxdb_line_protocol",
"internal_types",
"mutable_buffer",
"num_cpus",
"object_store",
"observability_deps",
"parking_lot",
@ -3277,6 +3285,7 @@ dependencies = [
"serde_json",
"snafu",
"snap",
"tempfile",
"test_helpers",
"tokio",
"tokio-util",
@ -3756,9 +3765,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
[[package]]
name = "tokio"
version = "1.4.0"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "134af885d758d645f0f0505c9a8b3f9bf8a348fd822e112ab5248138348f1722"
checksum = "83f0c8e7c0addab50b663055baf787d0af7f413a46e6e7fb9559a4e4db7137a5"
dependencies = [
"autocfg",
"bytes",
@ -3819,9 +3828,9 @@ dependencies = [
[[package]]
name = "tokio-util"
version = "0.6.5"
version = "0.6.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5143d049e85af7fbc36f5454d990e62c2df705b3589f123b71f441b6b59f443f"
checksum = "940a12c99365c31ea8dd9ba04ec1be183ffe4920102bb7122c2f515437601e8e"
dependencies = [
"bytes",
"futures-core",

View File

@ -71,7 +71,7 @@ csv = "1.1"
dirs = "3.0.1"
dotenv = "0.15.0"
flate2 = "1.0"
futures = "0.3.1"
futures = "0.3"
http = "0.2.0"
hyper = "0.14"
once_cell = { version = "1.4.0", features = ["parking_lot"] }

View File

@ -8,14 +8,14 @@ description = "Apache Arrow / Parquet / DataFusion dependencies for InfluxDB IOx
[dependencies] # In alphabetical order
# We are using development version of arrow/parquet/datafusion and the dependencies are at the same rev
# The version can be found here: https://github.com/apache/arrow/commit/e69478a890b1e4eee49b540b69b2711d170a0433
# The version can be found here: https://github.com/apache/arrow/commit/00a443629c00079ea03c0b9f415d74669d2759a7
#
arrow = { git = "https://github.com/apache/arrow.git", rev = "e69478a890b1e4eee49b540b69b2711d170a0433" , features = ["simd"] }
arrow-flight = { git = "https://github.com/apache/arrow.git", rev = "e69478a890b1e4eee49b540b69b2711d170a0433" }
arrow = { git = "https://github.com/apache/arrow.git", rev = "00a443629c00079ea03c0b9f415d74669d2759a7" , features = ["simd"] }
arrow-flight = { git = "https://github.com/apache/arrow.git", rev = "00a443629c00079ea03c0b9f415d74669d2759a7" }
# Turn off optional datafusion features (function packages)
datafusion = { git = "https://github.com/apache/arrow.git", rev = "e69478a890b1e4eee49b540b69b2711d170a0433", default-features = false }
datafusion = { git = "https://github.com/apache/arrow.git", rev = "00a443629c00079ea03c0b9f415d74669d2759a7", default-features = false }
# Turn off the "arrow" feature; it currently has a bug that causes the crate to rebuild every time
# and we're not currently using it anyway
parquet = { git = "https://github.com/apache/arrow.git", rev = "e69478a890b1e4eee49b540b69b2711d170a0433", default-features = false, features = ["snap", "brotli", "flate2", "lz4", "zstd"] }
parquet = { git = "https://github.com/apache/arrow.git", rev = "00a443629c00079ea03c0b9f415d74669d2759a7", default-features = false, features = ["snap", "brotli", "flate2", "lz4", "zstd"] }

View File

@ -29,6 +29,46 @@ macro_rules! assert_table_eq {
};
}
/// Compares formatted output of a record batch with an expected
/// vector of strings in a way that order does not matter.
/// This is a macro so errors appear on the correct line
///
/// Designed so that failure output can be directly copy/pasted
/// into the test code as expected results.
///
/// Expects to be called about like this:
///
/// `assert_batches_sorted_eq!(expected_lines: &[&str], batches: &[RecordBatch])`
#[macro_export]
macro_rules! assert_batches_sorted_eq {
($EXPECTED_LINES: expr, $CHUNKS: expr) => {
let mut expected_lines: Vec<String> = $EXPECTED_LINES.iter().map(|&s| s.into()).collect();
// sort except for header + footer
let num_lines = expected_lines.len();
if num_lines > 3 {
expected_lines.as_mut_slice()[2..num_lines - 1].sort_unstable()
}
let formatted = arrow_deps::arrow::util::pretty::pretty_format_batches($CHUNKS).unwrap();
// fix for windows: \r\n --> \n
let mut actual_lines: Vec<&str> = formatted.trim().lines().collect();
// sort except for header + footer
let num_lines = actual_lines.len();
if num_lines > 3 {
actual_lines.as_mut_slice()[2..num_lines - 1].sort_unstable()
}
assert_eq!(
expected_lines, actual_lines,
"\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n",
expected_lines, actual_lines
);
};
}
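For context, a hedged usage sketch of the macro above; `make_record_batches` and the table layout are illustrative stand-ins, not part of this change:
// Hedged sketch: `make_record_batches()` stands in for whatever produces the
// batches under test; the column names and values are made up.
#[test]
fn contents_equal_ignoring_row_order() {
    let batches: Vec<RecordBatch> = make_record_batches();
    let expected = vec![
        "+------+-------+",
        "| host | usage |",
        "+------+-------+",
        "| a    | 0.5   |",
        "| b    | 0.9   |",
        "+------+-------+",
    ];
    // Only the header and footer lines keep their position; the data rows on
    // both sides are sorted before comparison.
    assert_batches_sorted_eq!(expected, &batches);
}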
// sort a record batch by all columns (to provide a stable output order for test
// comparison)
pub fn sort_record_batch(batch: RecordBatch) -> RecordBatch {

View File

@ -21,8 +21,11 @@ pub enum ChunkStorage {
/// The chunk is in the Read Buffer (where it can not be mutated)
ReadBuffer,
/// The chunk is both in ReadBuffer and Object Store
ReadBufferAndObjectStore,
/// The chunk is stored in Object Storage (where it can not be mutated)
ObjectStore,
ObjectStoreOnly,
}
impl ChunkStorage {
@ -32,7 +35,8 @@ impl ChunkStorage {
Self::OpenMutableBuffer => "OpenMutableBuffer",
Self::ClosedMutableBuffer => "ClosedMutableBuffer",
Self::ReadBuffer => "ReadBuffer",
Self::ObjectStore => "ObjectStore",
Self::ReadBufferAndObjectStore => "ReadBufferAndObjectStore",
Self::ObjectStoreOnly => "ObjectStoreOnly",
}
}
}
@ -134,7 +138,8 @@ impl From<ChunkStorage> for management::ChunkStorage {
ChunkStorage::OpenMutableBuffer => Self::OpenMutableBuffer,
ChunkStorage::ClosedMutableBuffer => Self::ClosedMutableBuffer,
ChunkStorage::ReadBuffer => Self::ReadBuffer,
ChunkStorage::ObjectStore => Self::ObjectStore,
ChunkStorage::ReadBufferAndObjectStore => Self::ReadBufferAndObjectStore,
ChunkStorage::ObjectStoreOnly => Self::ObjectStoreOnly,
}
}
}
@ -204,7 +209,10 @@ impl TryFrom<management::ChunkStorage> for ChunkStorage {
management::ChunkStorage::OpenMutableBuffer => Ok(Self::OpenMutableBuffer),
management::ChunkStorage::ClosedMutableBuffer => Ok(Self::ClosedMutableBuffer),
management::ChunkStorage::ReadBuffer => Ok(Self::ReadBuffer),
management::ChunkStorage::ObjectStore => Ok(Self::ObjectStore),
management::ChunkStorage::ReadBufferAndObjectStore => {
Ok(Self::ReadBufferAndObjectStore)
}
management::ChunkStorage::ObjectStoreOnly => Ok(Self::ObjectStoreOnly),
management::ChunkStorage::Unspecified => Err(FieldViolation::required("")),
}
}
@ -220,7 +228,7 @@ mod test {
partition_key: "foo".to_string(),
id: 42,
estimated_bytes: 1234,
storage: management::ChunkStorage::ObjectStore.into(),
storage: management::ChunkStorage::ObjectStoreOnly.into(),
time_of_first_write: None,
time_of_last_write: None,
time_closing: None,
@ -231,7 +239,7 @@ mod test {
partition_key: Arc::new("foo".to_string()),
id: 42,
estimated_bytes: 1234,
storage: ChunkStorage::ObjectStore,
storage: ChunkStorage::ObjectStoreOnly,
time_of_first_write: None,
time_of_last_write: None,
time_closing: None,
@ -250,7 +258,7 @@ mod test {
partition_key: Arc::new("foo".to_string()),
id: 42,
estimated_bytes: 1234,
storage: ChunkStorage::ObjectStore,
storage: ChunkStorage::ObjectStoreOnly,
time_of_first_write: None,
time_of_last_write: None,
time_closing: None,
@ -262,7 +270,7 @@ mod test {
partition_key: "foo".to_string(),
id: 42,
estimated_bytes: 1234,
storage: management::ChunkStorage::ObjectStore.into(),
storage: management::ChunkStorage::ObjectStoreOnly.into(),
time_of_first_write: None,
time_of_last_write: None,
time_closing: None,

View File

@ -181,7 +181,7 @@ pub struct LifecycleRules {
pub mutable_size_threshold: Option<NonZeroUsize>,
/// Once the total amount of buffered data in memory reaches this size start
/// dropping data from memory based on the drop_order
/// dropping data from memory based on the [`sort_order`](Self::sort_order)
pub buffer_size_soft: Option<NonZeroUsize>,
/// Once the amount of data in memory reaches this size start
@ -199,6 +199,9 @@ pub struct LifecycleRules {
/// Allow dropping data that has not been persisted to object storage
pub drop_non_persisted: bool,
/// Persists chunks to object storage.
pub persist: bool,
/// Do not allow writing new data to this database
pub immutable: bool,
}
@ -228,6 +231,7 @@ impl From<LifecycleRules> for management::LifecycleRules {
.unwrap_or_default(),
sort_order: Some(config.sort_order.into()),
drop_non_persisted: config.drop_non_persisted,
persist: config.persist,
immutable: config.immutable,
}
}
@ -245,6 +249,7 @@ impl TryFrom<management::LifecycleRules> for LifecycleRules {
buffer_size_hard: (proto.buffer_size_hard as usize).try_into().ok(),
sort_order: proto.sort_order.optional("sort_order")?.unwrap_or_default(),
drop_non_persisted: proto.drop_non_persisted,
persist: proto.persist,
immutable: proto.immutable,
})
}
@ -743,6 +748,7 @@ impl TryFrom<management::partition_template::Part> for TemplatePart {
/// ShardId maps to a nodegroup that holds the shard.
pub type ShardId = u16;
pub const NO_SHARD_CONFIG: Option<&ShardConfig> = None;
/// Assigns a given line to a specific shard id.
pub trait Sharder {
@ -776,6 +782,12 @@ pub struct ShardConfig {
pub ignore_errors: bool,
}
impl Sharder for ShardConfig {
fn shard(&self, _line: &ParsedLine<'_>) -> Result<ShardId, Error> {
todo!("mkm to implement as part of #916");
}
}
/// Maps a matcher with specific target group. If the line/row matches
/// it should be sent to the group.
#[derive(Debug, Eq, PartialEq, Clone, Default)]
@ -1281,6 +1293,7 @@ mod tests {
buffer_size_hard: 232,
sort_order: None,
drop_non_persisted: true,
persist: true,
immutable: true,
};
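For context, a hedged sketch of round-tripping the new `persist` flag through the protobuf form; it assumes `management::LifecycleRules` derives `Default`, as prost-generated types normally do:
use std::convert::TryFrom;

// Sketch only: exercises the conversions shown above for the new field.
fn persist_flag_round_trip() {
    let proto = management::LifecycleRules {
        persist: true,
        drop_non_persisted: true,
        ..Default::default()
    };

    // Proto -> domain type keeps the flag...
    let rules = LifecycleRules::try_from(proto).expect("valid lifecycle rules");
    assert!(rules.persist);
    assert!(rules.drop_non_persisted);

    // ...and converting back preserves it as well.
    let back = management::LifecycleRules::from(rules);
    assert!(back.persist);
}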

View File

@ -27,6 +27,13 @@ pub enum Job {
partition_key: String,
chunk_id: u32,
},
/// Write a chunk from read buffer to object store
WriteChunk {
db_name: String,
partition_key: String,
chunk_id: u32,
},
}
impl From<Job> for management::operation_metadata::Job {
@ -49,6 +56,15 @@ impl From<Job> for management::operation_metadata::Job {
partition_key,
chunk_id,
}),
Job::WriteChunk {
db_name,
partition_key,
chunk_id,
} => Self::WriteChunk(management::WriteChunk {
db_name,
partition_key,
chunk_id,
}),
}
}
}
@ -74,6 +90,15 @@ impl From<management::operation_metadata::Job> for Job {
partition_key,
chunk_id,
},
Job::WriteChunk(management::WriteChunk {
db_name,
partition_key,
chunk_id,
}) => Self::WriteChunk {
db_name,
partition_key,
chunk_id,
},
}
}
}

View File

@ -12,6 +12,7 @@ pub struct TimestampRange {
impl TimestampRange {
pub fn new(start: i64, end: i64) -> Self {
debug_assert!(end > start);
Self { start, end }
}
@ -26,6 +27,12 @@ impl TimestampRange {
pub fn contains_opt(&self, v: Option<i64>) -> bool {
Some(true) == v.map(|ts| self.contains(ts))
}
#[inline]
/// Returns true if this range is disjoint from (does not overlap) the provided range
pub fn disjoint(&self, other: &Self) -> bool {
self.end <= other.start || self.start >= other.end
}
}
#[cfg(test)]
@ -55,4 +62,18 @@ mod tests {
assert!(!range.contains_opt(None));
}
#[test]
fn test_disjoint() {
let r1 = TimestampRange::new(100, 200);
let r2 = TimestampRange::new(200, 300);
let r3 = TimestampRange::new(150, 250);
assert!(r1.disjoint(&r2));
assert!(r2.disjoint(&r1));
assert!(!r1.disjoint(&r3));
assert!(!r3.disjoint(&r1));
assert!(!r2.disjoint(&r3));
assert!(!r3.disjoint(&r2));
}
}
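For context, a short hedged sketch of the kind of pruning `disjoint` enables; `ChunkMeta` is a hypothetical struct, not part of this change:
// Hypothetical: skip chunks whose time range cannot overlap the query range.
struct ChunkMeta {
    time_range: TimestampRange,
}

fn chunks_overlapping<'a>(
    query: &TimestampRange,
    chunks: &'a [ChunkMeta],
) -> Vec<&'a ChunkMeta> {
    chunks
        .iter()
        .filter(|c| !c.time_range.disjoint(query))
        .collect()
}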

View File

@ -6,7 +6,7 @@ As discussed on https://github.com/influxdata/influxdb_iox/pull/221 and https://
1. Use only async I/O via `tokio` for socket communication. It is ok to use either blocking (e.g. `std::fs::File`) or async APIs (e.g. `tokio::fs::File`) for local File I/O.
2. All CPU bound tasks should be scheduled on the separate application level `thread_pool` not with `tokio::task::spawn` nor `tokio::task::spawn_blocking` nor a new threadpool.
2. All CPU bound tasks should be scheduled on the separate application level `thread_pool` (which can be another tokio executor but should be separate from the executor that handles I/O).
We will work, over time, to migrate the rest of the codebase to use these patterns.
@ -41,11 +41,11 @@ It is ok to use either blocking (e.g. `std::fs::File`) or async APIs for local
This can not always be done (e.g. with a library such as parquet writer which is not `async`). In such cases, using `tokio::task::spawn_blocking` should be used to perform the file I/O.
### All CPU heavy work should be done on the single app level worker pool, separate from the tokio runtime
### All CPU heavy work should be done on the single app level worker pool, separate from the tokio runtime handling IO
**What**: All CPU heavy work should be done on the single app level worker pool. We provide a `thread_pool` interface that interacts nicely with async tasks (e.g. that allows an async task to `await` for a CPU heavy task to complete).
**What**: All CPU heavy work should be done on the app level worker pool. We provide a `thread_pool` interface that interacts nicely with async tasks (e.g. that allows an async task to `await` for a CPU heavy task to complete).
**Rationale**: A single app level worker pool gives us a single place to control work priority, eventually, so that tasks such as compaction of large data files can have lower precedence than incoming queries. By using a different pool than the tokio runtime, with a limited number of threads, we avoid over-saturating the CPU with OS threads and thereby starving the limited number of tokio I/O threads. A separate, single app level pool also limits the number of underlying OS CPU threads which are spawned, even under heavy load, keeping thread context switching overhead low.
**Rationale**: A single app level worker pool gives us a single place to control work priority, eventually, so that tasks such as compaction of large data files can have lower precedence than incoming queries. By using a different pool than the main tokio runtime, with a limited number of threads, we avoid over-saturating the CPU with OS threads and thereby starving the limited number of tokio I/O threads. A separate, single app level pool also limits the number of underlying OS CPU threads which are spawned, even under heavy load, keeping thread context switching overhead low.
There will, of course, always be a judgment call to be made about where "CPU bound work" starts and "work acceptable for I/O processing" ends. A reasonable rule of thumb is that if a job will *always* complete in less than 100ms, it is probably fine for an I/O thread. This number may be revised as we tune the system.
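A minimal sketch of this pattern, assuming a second dedicated tokio runtime stands in for the app-level `thread_pool` (the real interface may differ):
use tokio::runtime::Builder;

fn main() {
    // Dedicated runtime for CPU-bound work, with a bounded thread count.
    let cpu_pool = Builder::new_multi_thread()
        .worker_threads(4)
        .thread_name("cpu-worker")
        .build()
        .expect("failed to build CPU pool");

    // Runtime that handles network / async I/O.
    let io_runtime = Builder::new_multi_thread()
        .enable_all()
        .build()
        .expect("failed to build I/O runtime");

    io_runtime.block_on(async {
        // Hand the heavy computation to the CPU pool and await its completion
        // without blocking an I/O worker thread.
        let handle = cpu_pool.spawn(async { expensive_compute(42) });
        let result = handle.await.expect("CPU task panicked");
        println!("result = {}", result);
    });
}

// Stand-in for a CPU-heavy job such as compaction or parquet encoding.
fn expensive_compute(n: u64) -> u64 {
    (0..10_000_000u64).fold(n, |acc, x| acc.wrapping_add(x))
}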

View File

@ -5,3 +5,6 @@ When updating the version of the [flatbuffers](https://crates.io/crates/flatbuff
To update the generated code, edit `generated_types/regenerate-flatbuffers.sh` and set the `FB_COMMIT` variable at the top of the file to the commit SHA of the same commit in the [flatbuffers repository](https://github.com/google/flatbuffers) where the `flatbuffers` Rust crate version was updated. This ensures we'll be [using the same version of `flatc` that the crate was tested with](https://github.com/google/flatbuffers/issues/6199#issuecomment-714562121).
Then run the `generated_types/regenerate-flatbuffers.sh` script and check in any changes. Verify that the whole project builds.
`generated_types/regenerate-flatbuffers.sh` will build `flatc` from source if it cannot be found.
To do that, your system will need `bazel`; you can likely install it with your favourite package manager.

View File

@ -9,7 +9,7 @@ bytes = { version = "1.0", features = ["serde"] }
# See docs/regenerating_flatbuffers.md about updating generated code when updating the
# version of the flatbuffers crate
flatbuffers = "0.8"
futures = "0.3.1"
futures = "0.3"
prost = "0.7"
prost-types = "0.7"
tonic = "0.4"

View File

@ -17,8 +17,11 @@ enum ChunkStorage {
// The chunk is in the Read Buffer (where it can not be mutated)
CHUNK_STORAGE_READ_BUFFER = 3;
// The chunk is in the Read Buffer and Object Store
CHUNK_STORAGE_READ_BUFFER_AND_OBJECT_STORE = 4;
// The chunk is stored in Object Storage (where it can not be mutated)
CHUNK_STORAGE_OBJECT_STORE = 4;
CHUNK_STORAGE_OBJECT_STORE_ONLY = 5;
}
// `Chunk` represents part of a partition of data in a database.

View File

@ -151,6 +151,9 @@ message LifecycleRules {
// Allow dropping data that has not been persisted to object storage
bool drop_non_persisted = 7;
// Persists chunks to object storage.
bool persist = 9;
// Do not allow writing new data to this database
bool immutable = 8;
}

View File

@ -19,6 +19,7 @@ message OperationMetadata {
Dummy dummy = 5;
PersistSegment persist_segment = 6;
CloseChunk close_chunk = 7;
WriteChunk write_chunk = 8;
}
}
@ -45,3 +46,15 @@ message CloseChunk {
// chunk_id
uint32 chunk_id = 3;
}
// Write a chunk from read buffer to object store
message WriteChunk {
// name of the database
string db_name = 1;
// partition key
string partition_key = 2;
// chunk_id
uint32 chunk_id = 3;
}

View File

@ -4,7 +4,12 @@ package influxdata.iox.write.v1;
service WriteService {
// write data into a specific Database
rpc Write(WriteRequest) returns (WriteResponse);
rpc Write(WriteRequest) returns (WriteResponse) {
option deprecated = true;
};
// write an entry into a Database
rpc WriteEntry(WriteEntryRequest) returns (WriteEntryResponse);
}
message WriteRequest {
@ -21,3 +26,17 @@ message WriteResponse {
// how many lines were parsed and written into the database
uint64 lines_written = 1;
}
message WriteEntryRequest {
// name of database into which to write
string db_name = 1;
// entry, in serialized flatbuffers [Entry] format
//
// [Entry](https://github.com/influxdata/influxdb_iox/blob/main/generated_types/protos/influxdata/iox/write/v1/entry.fbs)
bytes entry = 2;
}
message WriteEntryResponse {
}
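For context, a hypothetical client call to the new rpc, assuming the standard prost/tonic module layout inside `generated_types`; the URL, database name, and error handling are illustrative only:
use generated_types::influxdata::iox::write::v1::{
    write_service_client::WriteServiceClient, WriteEntryRequest,
};

async fn write_entry_example(entry_bytes: Vec<u8>) -> Result<(), Box<dyn std::error::Error>> {
    let mut client = WriteServiceClient::connect("http://127.0.0.1:8082").await?;

    // `entry` carries the serialized flatbuffers Entry described above.
    let response = client
        .write_entry(WriteEntryRequest {
            db_name: "my_db".to_string(),
            entry: entry_bytes,
        })
        .await?;

    println!("write_entry response: {:?}", response.into_inner());
    Ok(())
}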

View File

@ -1,11 +1,29 @@
#!/bin/bash -e
# The commit where the Rust `flatbuffers` crate version was changed to the version in `Cargo.lock`
# Update this, rerun this script, and check in the changes in the generated code when the
# `flatbuffers` crate version is updated.
# Instructions
#
# If you have changed some `*.fbs` files:
#
# - Run this script to regenerate the corresponding Rust code.
# - Run `cargo test` to make sure everything works as you would expect.
# - Check in the changes to the generated code along with your changes to the `*.fbs` files.
# - You should not need to edit this script.
#
# If you are updating the version of the `flatbuffers` crate in `Cargo.lock`:
#
# - The `flatbuffers` crate gets developed in sync with the `flatc` compiler in the same repo,
# so when updating the `flatbuffers` crate we also need to update the `flatc` compiler we're
# using.
# - Go to https://github.com/google/flatbuffers/blame/master/rust/flatbuffers/Cargo.toml and find
# the commit SHA where the `version` metadata was updated to the version of the `flatbuffers`
# crate we now want to have in our `Cargo.lock`.
# - Put that commit SHA in this variable:
FB_COMMIT="86401e078d0746d2381735415f8c2dfe849f3f52"
# - Run this script to regenerate the corresponding Rust code.
# - Run `cargo test` to make sure everything works as you would expect.
# - Check in the changes to the generated code along with your changes to the `Cargo.lock` file and
# this script.
# Change to the generated_types crate directory, where this script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
pushd $DIR

View File

@ -6,7 +6,7 @@ edition = "2018"
[dependencies] # In alphabetical order
bytes = { version = "1.0", default-features = false }
futures = { version = "0.3.5", default-features = false }
futures = { version = "0.3", default-features = false }
reqwest = { version = "0.11", features = ["stream", "json"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.44"

View File

@ -1,8 +1,7 @@
use chrono::{DateTime, Utc};
use criterion::{criterion_group, criterion_main, Criterion};
use data_types::database_rules::{Error as DataError, Partitioner, Sharder};
use influxdb_line_protocol::ParsedLine;
use internal_types::entry::{lines_to_sharded_entries, SequencedEntry};
use data_types::database_rules::ShardConfig;
use internal_types::entry::test_helpers::partitioner;
use internal_types::entry::{lines_to_sharded_entries, ClockValue, SequencedEntry};
static LINES: &str = include_str!("../../tests/fixtures/lineproto/prometheus.lp");
@ -12,7 +11,8 @@ fn sequenced_entry(c: &mut Criterion) {
let lines = influxdb_line_protocol::parse_lines(LINES)
.collect::<Result<Vec<_>, _>>()
.unwrap();
let sharded_entries = lines_to_sharded_entries(&lines, &sharder(1), &partitioner(1)).unwrap();
let shard_config: Option<&ShardConfig> = None;
let sharded_entries = lines_to_sharded_entries(&lines, shard_config, &partitioner(1)).unwrap();
let entry = &sharded_entries.first().unwrap().entry;
let data = entry.data();
assert_eq!(
@ -28,10 +28,13 @@ fn sequenced_entry(c: &mut Criterion) {
554
);
let clock_value = ClockValue::new(23);
group.bench_function("new_from_entry_bytes", |b| {
b.iter(|| {
let sequenced_entry = SequencedEntry::new_from_entry_bytes(23, 2, data).unwrap();
assert_eq!(sequenced_entry.clock_value(), 23);
let sequenced_entry =
SequencedEntry::new_from_entry_bytes(clock_value, 2, data).unwrap();
assert_eq!(sequenced_entry.clock_value(), clock_value);
assert_eq!(sequenced_entry.writer_id(), 2);
})
});
@ -42,50 +45,3 @@ fn sequenced_entry(c: &mut Criterion) {
criterion_group!(benches, sequenced_entry);
criterion_main!(benches);
fn sharder(count: u16) -> TestSharder {
TestSharder {
count,
n: std::cell::RefCell::new(0),
}
}
// For each line passed to shard returns a shard id from [0, count) in order
struct TestSharder {
count: u16,
n: std::cell::RefCell<u16>,
}
impl Sharder for TestSharder {
fn shard(&self, _line: &ParsedLine<'_>) -> Result<u16, DataError> {
let n = *self.n.borrow();
self.n.replace(n + 1);
Ok(n % self.count)
}
}
fn partitioner(count: u8) -> TestPartitioner {
TestPartitioner {
count,
n: std::cell::RefCell::new(0),
}
}
// For each line passed to partition_key returns a key with a number from [0,
// count)
struct TestPartitioner {
count: u8,
n: std::cell::RefCell<u8>,
}
impl Partitioner for TestPartitioner {
fn partition_key(
&self,
_line: &ParsedLine<'_>,
_default_time: &DateTime<Utc>,
) -> data_types::database_rules::Result<String> {
let n = *self.n.borrow();
self.n.replace(n + 1);
Ok(format!("key_{}", n % self.count))
}
}

View File

@ -2,7 +2,7 @@
//! from line protocol and the `DatabaseRules` configuration.
use crate::schema::TIME_COLUMN_NAME;
use data_types::database_rules::{Error as DataError, Partitioner, ShardId, Sharder};
use data_types::database_rules::{Error as DataError, Partitioner, ShardId, Sharder, WriterId};
use generated_types::entry as entry_fb;
use influxdb_line_protocol::{FieldValue, ParsedLine};
@ -56,14 +56,17 @@ type ColumnResult<T, E = ColumnError> = std::result::Result<T, E>;
/// underlying flatbuffers bytes generated.
pub fn lines_to_sharded_entries(
lines: &[ParsedLine<'_>],
sharder: &impl Sharder,
sharder: Option<&impl Sharder>,
partitioner: &impl Partitioner,
) -> Result<Vec<ShardedEntry>> {
let default_time = Utc::now();
let mut sharded_lines = BTreeMap::new();
for line in lines {
let shard_id = sharder.shard(line).context(GeneratingShardId)?;
let shard_id = match &sharder {
Some(s) => Some(s.shard(line).context(GeneratingShardId)?),
None => None,
};
let partition_key = partitioner
.partition_key(line, &default_time)
.context(GeneratingPartitionKey)?;
@ -90,7 +93,7 @@ pub fn lines_to_sharded_entries(
}
fn build_sharded_entry(
shard_id: ShardId,
shard_id: Option<ShardId>,
partitions: BTreeMap<String, BTreeMap<&str, Vec<&ParsedLine<'_>>>>,
default_time: &DateTime<Utc>,
) -> Result<ShardedEntry> {
@ -277,10 +280,12 @@ fn build_table_write_batch<'a>(
))
}
/// Holds a shard id to the associated entry
/// Holds an optional shard id and the associated entry. If there is no ShardId, then
/// everything goes to the same place. This means a single entry will be
/// generated from a batch of line protocol.
#[derive(Debug)]
pub struct ShardedEntry {
pub shard_id: ShardId,
pub shard_id: Option<ShardId>,
pub entry: Entry,
}
@ -337,8 +342,10 @@ pub struct PartitionWrite<'a> {
}
impl<'a> PartitionWrite<'a> {
pub fn key(&self) -> Option<&str> {
self.fb.key()
pub fn key(&self) -> &str {
self.fb
.key()
.expect("key must be present in the flatbuffer PartitionWrite")
}
pub fn table_batches(&self) -> Vec<TableBatch<'_>> {
@ -360,8 +367,10 @@ pub struct TableBatch<'a> {
}
impl<'a> TableBatch<'a> {
pub fn name(&self) -> Option<&str> {
self.fb.name()
pub fn name(&self) -> &str {
self.fb
.name()
.expect("name must be present in flatbuffers TableWriteBatch")
}
pub fn columns(&self) -> Vec<Column<'_>> {
@ -420,18 +429,32 @@ impl<'a> TableBatch<'a> {
#[derive(Debug)]
pub struct Column<'a> {
fb: entry_fb::Column<'a>,
row_count: usize,
pub row_count: usize,
}
impl<'a> Column<'a> {
pub fn name(&self) -> Option<&str> {
self.fb.name()
pub fn name(&self) -> &str {
self.fb
.name()
.expect("name must be present in flatbuffers Column")
}
pub fn logical_type(&self) -> entry_fb::LogicalColumnType {
self.fb.logical_column_type()
}
pub fn is_tag(&self) -> bool {
self.fb.logical_column_type() == entry_fb::LogicalColumnType::Tag
}
pub fn is_field(&self) -> bool {
self.fb.logical_column_type() == entry_fb::LogicalColumnType::Field
}
pub fn is_time(&self) -> bool {
self.fb.logical_column_type() == entry_fb::LogicalColumnType::Time
}
pub fn values(&self) -> TypedValuesIterator<'a> {
match self.fb.values_type() {
entry_fb::ColumnValues::BoolValues => TypedValuesIterator::Bool(BoolIterator {
@ -554,12 +577,22 @@ impl<'a> TypedValuesIterator<'a> {
_ => None,
}
}
pub fn type_description(&self) -> &str {
match self {
Self::Bool(_) => "bool",
Self::I64(_) => "i64",
Self::F64(_) => "f64",
Self::U64(_) => "u64",
Self::String(_) => "String",
}
}
}
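For context, a hedged sketch that uses the new column helpers to classify a `TableBatch`'s columns (assumes the surrounding module's types are in scope):
// Hedged sketch: print a one-line description of each column in a batch.
fn describe_columns(batch: &TableBatch<'_>) {
    for col in batch.columns() {
        let kind = if col.is_tag() {
            "tag"
        } else if col.is_time() {
            "time"
        } else {
            "field"
        };
        println!(
            "{} ({}, {} rows, values: {})",
            col.name(),
            kind,
            col.row_count,
            col.values().type_description()
        );
    }
}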
/// Iterator over the flatbuffers BoolValues
#[derive(Debug)]
pub struct BoolIterator<'a> {
row_count: usize,
pub row_count: usize,
position: usize,
null_mask: Option<&'a [u8]>,
values: &'a [bool],
@ -589,7 +622,7 @@ impl<'a> Iterator for BoolIterator<'a> {
/// Iterator over the flatbuffers I64Values, F64Values, and U64Values.
#[derive(Debug)]
pub struct ValIterator<'a, T: Follow<'a> + Follow<'a, Inner = T>> {
row_count: usize,
pub row_count: usize,
position: usize,
null_mask: Option<&'a [u8]>,
values_iter: VectorIter<'a, T>,
@ -615,7 +648,7 @@ impl<'a, T: Follow<'a> + Follow<'a, Inner = T>> Iterator for ValIterator<'a, T>
/// Iterator over the flatbuffers StringValues
#[derive(Debug)]
pub struct StringIterator<'a> {
row_count: usize,
pub row_count: usize,
position: usize,
null_mask: Option<&'a [u8]>,
values: VectorIter<'a, ForwardsUOffset<&'a str>>,
@ -1087,6 +1120,19 @@ enum ColumnRaw<'a> {
Bool(Vec<bool>),
}
#[derive(Debug, PartialOrd, PartialEq, Copy, Clone)]
pub struct ClockValue(u64);
impl ClockValue {
pub fn get(&self) -> u64 {
self.0
}
pub fn new(v: u64) -> Self {
Self { 0: v }
}
}
#[self_referencing]
#[derive(Debug)]
pub struct SequencedEntry {
@ -1101,7 +1147,7 @@ pub struct SequencedEntry {
impl SequencedEntry {
pub fn new_from_entry_bytes(
clock_value: u64,
clock_value: ClockValue,
writer_id: u32,
entry_bytes: &[u8],
) -> Result<Self> {
@ -1118,7 +1164,7 @@ impl SequencedEntry {
let sequenced_entry = entry_fb::SequencedEntry::create(
&mut fbb,
&entry_fb::SequencedEntryArgs {
clock_value,
clock_value: clock_value.get(),
writer_id,
entry_bytes: Some(entry_bytes),
},
@ -1151,11 +1197,11 @@ impl SequencedEntry {
}
}
pub fn clock_value(&self) -> u64 {
self.fb().clock_value()
pub fn clock_value(&self) -> ClockValue {
ClockValue::new(self.fb().clock_value())
}
pub fn writer_id(&self) -> u32 {
pub fn writer_id(&self) -> WriterId {
self.fb().writer_id()
}
}
@ -1180,10 +1226,133 @@ impl TryFrom<Vec<u8>> for SequencedEntry {
}
}
pub mod test_helpers {
use super::*;
use chrono::TimeZone;
use influxdb_line_protocol::parse_lines;
// An appropriate maximum size for batches of LP to be written into IOx. Using
// test fixtures containing more than this many lines of LP will result in them
// being written as multiple writes.
const LP_BATCH_SIZE: usize = 10000;
/// Converts the line protocol to a single `Entry` with a single shard and
/// a single partition.
pub fn lp_to_entry(lp: &str) -> Entry {
let lines: Vec<_> = parse_lines(&lp).map(|l| l.unwrap()).collect();
lines_to_sharded_entries(&lines, sharder(1).as_ref(), &hour_partitioner())
.unwrap()
.pop()
.unwrap()
.entry
}
/// Converts the line protocol to a collection of `Entry` with a single
/// shard and a single partition, which is useful for testing when `lp` is
/// large. Batches are sized according to LP_BATCH_SIZE.
pub fn lp_to_entries(lp: &str) -> Vec<Entry> {
let lines: Vec<_> = parse_lines(&lp).map(|l| l.unwrap()).collect();
lines
.chunks(LP_BATCH_SIZE)
.map(|batch| {
lines_to_sharded_entries(batch, sharder(1).as_ref(), &hour_partitioner())
.unwrap()
.pop()
.unwrap()
.entry
})
.collect::<Vec<_>>()
}
/// Returns a test sharder that will assign shard ids from [0, count)
/// incrementing for each line.
pub fn sharder(count: u16) -> Option<TestSharder> {
Some(TestSharder {
count,
n: std::cell::RefCell::new(0),
})
}
// For each line passed to shard returns a shard id from [0, count) in order
#[derive(Debug)]
pub struct TestSharder {
count: u16,
n: std::cell::RefCell<u16>,
}
impl Sharder for TestSharder {
fn shard(&self, _line: &ParsedLine<'_>) -> Result<u16, DataError> {
let n = *self.n.borrow();
self.n.replace(n + 1);
Ok(n % self.count)
}
}
/// Returns a test partitioner that will partition data by the hour
pub fn hour_partitioner() -> HourPartitioner {
HourPartitioner {}
}
/// Returns a test partitioner that will assign partition keys in the form
/// key_# where # is replaced by a number `[0, count)` incrementing for
/// each line.
pub fn partitioner(count: u8) -> TestPartitioner {
TestPartitioner {
count,
n: std::cell::RefCell::new(0),
}
}
// For each line passed to partition_key returns a key with a number from
// `[0, count)`
#[derive(Debug)]
pub struct TestPartitioner {
count: u8,
n: std::cell::RefCell<u8>,
}
impl Partitioner for TestPartitioner {
fn partition_key(
&self,
_line: &ParsedLine<'_>,
_default_time: &DateTime<Utc>,
) -> data_types::database_rules::Result<String> {
let n = *self.n.borrow();
self.n.replace(n + 1);
Ok(format!("key_{}", n % self.count))
}
}
// Partitions by the hour
#[derive(Debug)]
pub struct HourPartitioner {}
impl Partitioner for HourPartitioner {
fn partition_key(
&self,
line: &ParsedLine<'_>,
default_time: &DateTime<Utc>,
) -> data_types::database_rules::Result<String> {
const HOUR_FORMAT: &str = "%Y-%m-%dT%H";
let key = match line.timestamp {
Some(t) => Utc.timestamp_nanos(t).format(HOUR_FORMAT),
None => default_time.format(HOUR_FORMAT),
}
.to_string();
Ok(key)
}
}
}
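For context, a small hedged sketch of using these helpers from a test; the line protocol content is illustrative:
use internal_types::entry::test_helpers::{lp_to_entries, lp_to_entry};

#[test]
fn lp_helpers_sketch() {
    // One line of LP becomes a single Entry with one hour-based partition.
    let entry = lp_to_entry("cpu,host=a usage=0.5 10");
    let writes = entry.partition_writes().unwrap();
    assert_eq!(writes.len(), 1);

    // Small fixtures fit in one batch; larger ones split at LP_BATCH_SIZE lines.
    let entries = lp_to_entries("cpu,host=a usage=0.5 10\nmem,host=a used=1i 10");
    assert_eq!(entries.len(), 1);
}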
#[cfg(test)]
mod tests {
use super::test_helpers::*;
use super::*;
use data_types::database_rules::NO_SHARD_CONFIG;
use influxdb_line_protocol::parse_lines;
#[test]
@ -1197,11 +1366,28 @@ mod tests {
let lines: Vec<_> = parse_lines(&lp).map(|l| l.unwrap()).collect();
let sharded_entries =
lines_to_sharded_entries(&lines, &sharder(2), &partitioner(1)).unwrap();
lines_to_sharded_entries(&lines, sharder(2).as_ref(), &partitioner(1)).unwrap();
assert_eq!(sharded_entries.len(), 2);
assert_eq!(sharded_entries[0].shard_id, 0);
assert_eq!(sharded_entries[1].shard_id, 1);
assert_eq!(sharded_entries[0].shard_id, Some(0));
assert_eq!(sharded_entries[1].shard_id, Some(1));
}
#[test]
fn no_shard_config() {
let lp = vec![
"cpu,host=a,region=west user=23.1,system=66.1 123",
"mem,host=a,region=west used=23432 123",
"foo bar=true 21",
]
.join("\n");
let lines: Vec<_> = parse_lines(&lp).map(|l| l.unwrap()).collect();
let sharded_entries =
lines_to_sharded_entries(&lines, NO_SHARD_CONFIG, &partitioner(1)).unwrap();
assert_eq!(sharded_entries.len(), 1);
assert_eq!(sharded_entries[0].shard_id, None);
}
#[test]
@ -1215,12 +1401,12 @@ mod tests {
let lines: Vec<_> = parse_lines(&lp).map(|l| l.unwrap()).collect();
let sharded_entries =
lines_to_sharded_entries(&lines, &sharder(1), &partitioner(2)).unwrap();
lines_to_sharded_entries(&lines, sharder(1).as_ref(), &partitioner(2)).unwrap();
let partition_writes = sharded_entries[0].entry.partition_writes().unwrap();
assert_eq!(partition_writes.len(), 2);
assert_eq!(partition_writes[0].key().unwrap(), "key_0");
assert_eq!(partition_writes[1].key().unwrap(), "key_1");
assert_eq!(partition_writes[0].key(), "key_0");
assert_eq!(partition_writes[1].key(), "key_1");
}
#[test]
@ -1236,15 +1422,15 @@ mod tests {
let lines: Vec<_> = parse_lines(&lp).map(|l| l.unwrap()).collect();
let sharded_entries =
lines_to_sharded_entries(&lines, &sharder(1), &partitioner(1)).unwrap();
lines_to_sharded_entries(&lines, sharder(1).as_ref(), &partitioner(1)).unwrap();
let partition_writes = sharded_entries[0].entry.partition_writes().unwrap();
let table_batches = partition_writes[0].table_batches();
assert_eq!(table_batches.len(), 3);
assert_eq!(table_batches[0].name().unwrap(), "cpu");
assert_eq!(table_batches[1].name().unwrap(), "disk");
assert_eq!(table_batches[2].name().unwrap(), "mem");
assert_eq!(table_batches[0].name(), "cpu");
assert_eq!(table_batches[1].name(), "disk");
assert_eq!(table_batches[2].name(), "mem");
}
#[test]
@ -1253,7 +1439,7 @@ mod tests {
let lines: Vec<_> = parse_lines(&lp).map(|l| l.unwrap()).collect();
let sharded_entries =
lines_to_sharded_entries(&lines, &sharder(1), &partitioner(1)).unwrap();
lines_to_sharded_entries(&lines, sharder(1).as_ref(), &partitioner(1)).unwrap();
let partition_writes = sharded_entries[0].entry.partition_writes().unwrap();
let table_batches = partition_writes[0].table_batches();
@ -1263,22 +1449,22 @@ mod tests {
assert_eq!(columns.len(), 5);
assert_eq!(columns[0].name().unwrap(), "host");
assert_eq!(columns[0].name(), "host");
assert_eq!(columns[0].logical_type(), entry_fb::LogicalColumnType::Tag);
assert_eq!(columns[1].name().unwrap(), "region");
assert_eq!(columns[1].name(), "region");
assert_eq!(columns[1].logical_type(), entry_fb::LogicalColumnType::Tag);
assert_eq!(columns[2].name().unwrap(), "time");
assert_eq!(columns[2].name(), "time");
assert_eq!(columns[2].logical_type(), entry_fb::LogicalColumnType::Time);
assert_eq!(columns[3].name().unwrap(), "val");
assert_eq!(columns[3].name(), "val");
assert_eq!(
columns[3].logical_type(),
entry_fb::LogicalColumnType::Field
);
assert_eq!(columns[4].name().unwrap(), "val2");
assert_eq!(columns[4].name(), "val2");
assert_eq!(
columns[4].logical_type(),
entry_fb::LogicalColumnType::Field
@ -1295,7 +1481,7 @@ mod tests {
let lines: Vec<_> = parse_lines(&lp).map(|l| l.unwrap()).collect();
let sharded_entries =
lines_to_sharded_entries(&lines, &sharder(1), &partitioner(1)).unwrap();
lines_to_sharded_entries(&lines, sharder(1).as_ref(), &partitioner(1)).unwrap();
let partition_writes = sharded_entries
.first()
@ -1312,17 +1498,17 @@ mod tests {
assert_eq!(columns.len(), 7);
let col = columns.get(0).unwrap();
assert_eq!(col.name().unwrap(), "bval");
assert_eq!(col.name(), "bval");
let values = col.values().bool_values().unwrap();
assert_eq!(&values, &[Some(true), Some(false)]);
let col = columns.get(1).unwrap();
assert_eq!(col.name().unwrap(), "fval");
assert_eq!(col.name(), "fval");
let values = col.values().f64_values().unwrap();
assert_eq!(&values, &[Some(1.2), Some(2.2)]);
let col = columns.get(2).unwrap();
assert_eq!(col.name().unwrap(), "host");
assert_eq!(col.name(), "host");
let values = match col.values() {
TypedValuesIterator::String(v) => v,
_ => panic!("wrong type"),
@ -1331,12 +1517,12 @@ mod tests {
assert_eq!(&values, &[Some("a"), Some("b")]);
let col = columns.get(3).unwrap();
assert_eq!(col.name().unwrap(), "ival");
assert_eq!(col.name(), "ival");
let values = col.values().i64_values().unwrap();
assert_eq!(&values, &[Some(23), Some(22)]);
let col = columns.get(4).unwrap();
assert_eq!(col.name().unwrap(), "sval");
assert_eq!(col.name(), "sval");
let values = match col.values() {
TypedValuesIterator::String(v) => v,
_ => panic!("wrong type"),
@ -1345,12 +1531,12 @@ mod tests {
assert_eq!(&values, &[Some("hi"), Some("world")]);
let col = columns.get(5).unwrap();
assert_eq!(col.name().unwrap(), TIME_COLUMN_NAME);
assert_eq!(col.name(), TIME_COLUMN_NAME);
let values = col.values().i64_values().unwrap();
assert_eq!(&values, &[Some(1), Some(2)]);
let col = columns.get(6).unwrap();
assert_eq!(col.name().unwrap(), "uval");
assert_eq!(col.name(), "uval");
let values = col.values().u64_values().unwrap();
assert_eq!(&values, &[Some(7), Some(1)]);
}
@ -1366,7 +1552,7 @@ mod tests {
let lines: Vec<_> = parse_lines(&lp).map(|l| l.unwrap()).collect();
let sharded_entries =
lines_to_sharded_entries(&lines, &sharder(1), &partitioner(1)).unwrap();
lines_to_sharded_entries(&lines, sharder(1).as_ref(), &partitioner(1)).unwrap();
let partition_writes = sharded_entries
.first()
@ -1383,13 +1569,13 @@ mod tests {
assert_eq!(columns.len(), 7);
let col = columns.get(0).unwrap();
assert_eq!(col.name().unwrap(), "bool");
assert_eq!(col.name(), "bool");
assert_eq!(col.logical_type(), entry_fb::LogicalColumnType::Field);
let values = col.values().bool_values().unwrap();
assert_eq!(&values, &[None, None, Some(true)]);
let col = columns.get(1).unwrap();
assert_eq!(col.name().unwrap(), "host");
assert_eq!(col.name(), "host");
assert_eq!(col.logical_type(), entry_fb::LogicalColumnType::Tag);
let values = match col.values() {
TypedValuesIterator::String(v) => v,
@ -1399,7 +1585,7 @@ mod tests {
assert_eq!(&values, &[Some("a"), Some("a"), None]);
let col = columns.get(2).unwrap();
assert_eq!(col.name().unwrap(), "region");
assert_eq!(col.name(), "region");
assert_eq!(col.logical_type(), entry_fb::LogicalColumnType::Tag);
let values = match col.values() {
TypedValuesIterator::String(v) => v,
@ -1409,7 +1595,7 @@ mod tests {
assert_eq!(&values, &[None, Some("west"), None]);
let col = columns.get(3).unwrap();
assert_eq!(col.name().unwrap(), "string");
assert_eq!(col.name(), "string");
assert_eq!(col.logical_type(), entry_fb::LogicalColumnType::Field);
let values = match col.values() {
TypedValuesIterator::String(v) => v,
@ -1419,19 +1605,19 @@ mod tests {
assert_eq!(&values, &[None, None, Some("hello")]);
let col = columns.get(4).unwrap();
assert_eq!(col.name().unwrap(), TIME_COLUMN_NAME);
assert_eq!(col.name(), TIME_COLUMN_NAME);
assert_eq!(col.logical_type(), entry_fb::LogicalColumnType::Time);
let values = col.values().i64_values().unwrap();
assert_eq!(&values, &[Some(983), Some(2343), Some(222)]);
let col = columns.get(5).unwrap();
assert_eq!(col.name().unwrap(), "val");
assert_eq!(col.name(), "val");
assert_eq!(col.logical_type(), entry_fb::LogicalColumnType::Field);
let values = col.values().i64_values().unwrap();
assert_eq!(&values, &[Some(23), None, Some(21)]);
let col = columns.get(6).unwrap();
assert_eq!(col.name().unwrap(), "val2");
assert_eq!(col.name(), "val2");
assert_eq!(col.logical_type(), entry_fb::LogicalColumnType::Field);
let values = col.values().f64_values().unwrap();
assert_eq!(&values, &[None, Some(23.2), None]);
@ -1491,7 +1677,7 @@ mod tests {
let lines: Vec<_> = parse_lines(&lp).map(|l| l.unwrap()).collect();
let sharded_entries =
lines_to_sharded_entries(&lines, &sharder(1), &partitioner(1)).unwrap();
lines_to_sharded_entries(&lines, sharder(1).as_ref(), &partitioner(1)).unwrap();
let partition_writes = sharded_entries
.first()
.unwrap()
@ -1504,7 +1690,7 @@ mod tests {
assert_eq!(batch.row_count(), 1);
let col = columns.get(1).unwrap();
assert_eq!(col.name().unwrap(), "val");
assert_eq!(col.name(), "val");
let values = col.values().i64_values().unwrap();
assert_eq!(&values, &[Some(1)]);
@ -1522,7 +1708,7 @@ mod tests {
let lines: Vec<_> = parse_lines(&lp).map(|l| l.unwrap()).collect();
let sharded_entries =
lines_to_sharded_entries(&lines, &sharder(1), &partitioner(1)).unwrap();
lines_to_sharded_entries(&lines, sharder(1).as_ref(), &partitioner(1)).unwrap();
let partition_writes = sharded_entries
.first()
.unwrap()
@ -1535,7 +1721,7 @@ mod tests {
assert_eq!(batch.row_count(), 8);
let col = columns.get(1).unwrap();
assert_eq!(col.name().unwrap(), "val");
assert_eq!(col.name(), "val");
let values = col.values().i64_values().unwrap();
assert_eq!(
&values,
@ -1566,7 +1752,7 @@ mod tests {
let lines: Vec<_> = parse_lines(&lp).map(|l| l.unwrap()).collect();
let sharded_entries =
lines_to_sharded_entries(&lines, &sharder(1), &partitioner(1)).unwrap();
lines_to_sharded_entries(&lines, sharder(1).as_ref(), &partitioner(1)).unwrap();
let partition_writes = sharded_entries
.first()
.unwrap()
@ -1579,7 +1765,7 @@ mod tests {
assert_eq!(batch.row_count(), 9);
let col = columns.get(1).unwrap();
assert_eq!(col.name().unwrap(), "val");
assert_eq!(col.name(), "val");
let values = col.values().i64_values().unwrap();
assert_eq!(
&values,
@ -1605,7 +1791,7 @@ mod tests {
let t = Utc::now().timestamp_nanos();
let sharded_entries =
lines_to_sharded_entries(&lines, &sharder(1), &partitioner(1)).unwrap();
lines_to_sharded_entries(&lines, sharder(1).as_ref(), &partitioner(1)).unwrap();
let partition_writes = sharded_entries
.first()
@ -1618,7 +1804,7 @@ mod tests {
let columns = batch.columns();
let col = columns.get(0).unwrap();
assert_eq!(col.name().unwrap(), TIME_COLUMN_NAME);
assert_eq!(col.name(), TIME_COLUMN_NAME);
let values = col.values().i64_values().unwrap();
assert!(values[0].unwrap() > t);
assert_eq!(values[1], Some(123));
@ -1629,7 +1815,8 @@ mod tests {
let lp = vec!["a val=1i 1", "a val=2.1 123"].join("\n");
let lines: Vec<_> = parse_lines(&lp).map(|l| l.unwrap()).collect();
let sharded_entries = lines_to_sharded_entries(&lines, &sharder(1), &partitioner(1));
let sharded_entries =
lines_to_sharded_entries(&lines, sharder(1).as_ref(), &partitioner(1));
assert!(sharded_entries.is_err());
}
@ -1639,7 +1826,8 @@ mod tests {
let lp = vec!["a,host=a val=1i 1", "a host=\"b\" 123"].join("\n");
let lines: Vec<_> = parse_lines(&lp).map(|l| l.unwrap()).collect();
let sharded_entries = lines_to_sharded_entries(&lines, &sharder(1), &partitioner(1));
let sharded_entries =
lines_to_sharded_entries(&lines, sharder(1).as_ref(), &partitioner(1));
assert!(sharded_entries.is_err());
}
@ -1655,11 +1843,13 @@ mod tests {
let lines: Vec<_> = parse_lines(&lp).map(|l| l.unwrap()).collect();
let sharded_entries =
lines_to_sharded_entries(&lines, &sharder(1), &partitioner(1)).unwrap();
lines_to_sharded_entries(&lines, sharder(1).as_ref(), &partitioner(1)).unwrap();
let entry_bytes = sharded_entries.first().unwrap().entry.data();
let sequenced_entry = SequencedEntry::new_from_entry_bytes(23, 2, entry_bytes).unwrap();
assert_eq!(sequenced_entry.clock_value(), 23);
let clock_value = ClockValue::new(23);
let sequenced_entry =
SequencedEntry::new_from_entry_bytes(clock_value, 2, entry_bytes).unwrap();
assert_eq!(sequenced_entry.clock_value(), clock_value);
assert_eq!(sequenced_entry.writer_id(), 2);
let partition_writes = sequenced_entry.partition_writes().unwrap();
@ -1672,13 +1862,13 @@ mod tests {
assert_eq!(columns.len(), 7);
let col = columns.get(0).unwrap();
assert_eq!(col.name().unwrap(), "bool");
assert_eq!(col.name(), "bool");
assert_eq!(col.logical_type(), entry_fb::LogicalColumnType::Field);
let values = col.values().bool_values().unwrap();
assert_eq!(&values, &[None, None, Some(true)]);
let col = columns.get(1).unwrap();
assert_eq!(col.name().unwrap(), "host");
assert_eq!(col.name(), "host");
assert_eq!(col.logical_type(), entry_fb::LogicalColumnType::Tag);
let values = match col.values() {
TypedValuesIterator::String(v) => v,
@ -1688,7 +1878,7 @@ mod tests {
assert_eq!(&values, &[Some("a"), Some("a"), None]);
let col = columns.get(2).unwrap();
assert_eq!(col.name().unwrap(), "region");
assert_eq!(col.name(), "region");
assert_eq!(col.logical_type(), entry_fb::LogicalColumnType::Tag);
let values = match col.values() {
TypedValuesIterator::String(v) => v,
@ -1698,7 +1888,7 @@ mod tests {
assert_eq!(&values, &[None, Some("west"), None]);
let col = columns.get(3).unwrap();
assert_eq!(col.name().unwrap(), "string");
assert_eq!(col.name(), "string");
assert_eq!(col.logical_type(), entry_fb::LogicalColumnType::Field);
let values = match col.values() {
TypedValuesIterator::String(v) => v,
@ -1708,68 +1898,21 @@ mod tests {
assert_eq!(&values, &[None, None, Some("hello")]);
let col = columns.get(4).unwrap();
assert_eq!(col.name().unwrap(), TIME_COLUMN_NAME);
assert_eq!(col.name(), TIME_COLUMN_NAME);
assert_eq!(col.logical_type(), entry_fb::LogicalColumnType::Time);
let values = col.values().i64_values().unwrap();
assert_eq!(&values, &[Some(983), Some(2343), Some(222)]);
let col = columns.get(5).unwrap();
assert_eq!(col.name().unwrap(), "val");
assert_eq!(col.name(), "val");
assert_eq!(col.logical_type(), entry_fb::LogicalColumnType::Field);
let values = col.values().i64_values().unwrap();
assert_eq!(&values, &[Some(23), None, Some(21)]);
let col = columns.get(6).unwrap();
assert_eq!(col.name().unwrap(), "val2");
assert_eq!(col.name(), "val2");
assert_eq!(col.logical_type(), entry_fb::LogicalColumnType::Field);
let values = col.values().f64_values().unwrap();
assert_eq!(&values, &[None, Some(23.2), None]);
}
fn sharder(count: u16) -> TestSharder {
TestSharder {
count,
n: std::cell::RefCell::new(0),
}
}
// For each line passed to shard returns a shard id from [0, count) in order
struct TestSharder {
count: u16,
n: std::cell::RefCell<u16>,
}
impl Sharder for TestSharder {
fn shard(&self, _line: &ParsedLine<'_>) -> Result<u16, DataError> {
let n = *self.n.borrow();
self.n.replace(n + 1);
Ok(n % self.count)
}
}
fn partitioner(count: u8) -> TestPartitioner {
TestPartitioner {
count,
n: std::cell::RefCell::new(0),
}
}
// For each line passed to partition_key returns a key with a number from [0,
// count)
struct TestPartitioner {
count: u8,
n: std::cell::RefCell<u8>,
}
impl Partitioner for TestPartitioner {
fn partition_key(
&self,
_line: &ParsedLine<'_>,
_default_time: &DateTime<Utc>,
) -> data_types::database_rules::Result<String> {
let n = *self.n.borrow();
self.n.replace(n + 1);
Ok(format!("key_{}", n % self.count))
}
}
}

View File

@ -94,11 +94,8 @@ pub enum Error {
source: arrow_deps::arrow::error::ArrowError,
},
#[snafu(display("Schema Selection error while selecting '{}': {}", column_name, source))]
SelectingColumns {
column_name: String,
source: arrow_deps::arrow::error::ArrowError,
},
#[snafu(display("Column not found '{}'", column_name))]
ColumnNotFound { column_name: String },
}
fn nullable_to_str(nullability: bool) -> &'static str {
@ -470,6 +467,44 @@ impl Schema {
}
}
}
/// Returns the field indexes for a given selection
///
/// Returns an error if a corresponding column isn't found
pub fn select(&self, columns: &[&str]) -> Result<Vec<usize>> {
columns
.iter()
.map(|column_name| {
self.find_index_of(column_name)
.ok_or_else(|| Error::ColumnNotFound {
column_name: column_name.to_string(),
})
})
.collect()
}
/// Returns the schema for a given projection (a set of column indexes)
pub fn project(&self, projection: &[usize]) -> Self {
let mut metadata = HashMap::with_capacity(projection.len() + 1);
let mut fields = Vec::with_capacity(projection.len());
let current_metadata = self.inner.metadata();
for idx in projection {
let (_, field) = self.field(*idx);
fields.push(field.clone());
if let Some(value) = current_metadata.get(field.name()) {
metadata.insert(field.name().clone(), value.clone());
}
}
if let Some(measurement) = current_metadata.get(MEASUREMENT_METADATA_KEY).cloned() {
metadata.insert(MEASUREMENT_METADATA_KEY.to_string(), measurement);
}
Self {
inner: Arc::new(ArrowSchema::new_with_metadata(fields, metadata)),
}
}
}
/// Valid types for InfluxDB data model, as defined in [the documentation]
@ -1180,4 +1215,58 @@ mod test {
expected_schema, sorted_schema
);
}
#[test]
fn test_select() {
let schema1 = SchemaBuilder::new()
.influx_field("the_field", String)
.tag("the_tag")
.timestamp()
.measurement("the_measurement")
.build()
.unwrap();
let projection = schema1.select(&[TIME_COLUMN_NAME]).unwrap();
let schema2 = schema1.project(&projection);
let schema3 = Schema::try_from_arrow(Arc::clone(&schema2.inner)).unwrap();
assert_eq!(schema1.measurement(), schema2.measurement());
assert_eq!(schema1.measurement(), schema3.measurement());
assert_eq!(schema1.len(), 3);
assert_eq!(schema2.len(), 1);
assert_eq!(schema3.len(), 1);
assert_eq!(schema1.inner.fields().len(), 3);
assert_eq!(schema2.inner.fields().len(), 1);
assert_eq!(schema3.inner.fields().len(), 1);
let get_type = |x: &Schema, field: &str| -> InfluxColumnType {
let idx = x.find_index_of(field).unwrap();
x.field(idx).0.unwrap()
};
assert_eq!(
get_type(&schema1, TIME_COLUMN_NAME),
InfluxColumnType::Timestamp
);
assert_eq!(
get_type(&schema2, TIME_COLUMN_NAME),
InfluxColumnType::Timestamp
);
assert_eq!(get_type(&schema1, "the_tag"), InfluxColumnType::Tag);
assert_eq!(
get_type(&schema1, "the_field"),
InfluxColumnType::Field(InfluxFieldType::String)
);
assert_eq!(
get_type(&schema2, TIME_COLUMN_NAME),
InfluxColumnType::Timestamp
);
assert_eq!(
get_type(&schema3, TIME_COLUMN_NAME),
InfluxColumnType::Timestamp
);
}
}

View File

@ -24,6 +24,7 @@ generated_types = { path = "../generated_types" }
influxdb_line_protocol = { path = "../influxdb_line_protocol" }
internal_types = { path = "../internal_types" }
observability_deps = { path = "../observability_deps" }
parking_lot = "0.11.1"
snafu = "0.6.2"
string-interner = "0.12.2"
tokio = { version = "1.0", features = ["macros"] }

View File

@ -1,21 +1,26 @@
//! Represents a Chunk of data (a collection of tables and their data within
//! some chunk) in the mutable store.
use arrow_deps::{arrow::record_batch::RecordBatch, datafusion::logical_plan::Expr};
use generated_types::wal as wb;
use std::collections::{BTreeSet, HashMap};
use std::sync::Arc;
use data_types::partition_metadata::TableSummary;
use internal_types::{schema::Schema, selection::Selection};
use snafu::{OptionExt, ResultExt, Snafu};
use arrow_deps::arrow::record_batch::RecordBatch;
use data_types::{database_rules::WriterId, partition_metadata::TableSummary};
use internal_types::{
entry::{ClockValue, TableBatch},
selection::Selection,
};
use tracker::{MemRegistry, MemTracker};
use crate::chunk::snapshot::ChunkSnapshot;
use crate::{
column::Column,
dictionary::{Dictionary, Error as DictionaryError},
pred::{ChunkPredicate, ChunkPredicateBuilder},
dictionary::{Dictionary, Error as DictionaryError, DID},
table::Table,
};
use snafu::{OptionExt, ResultExt, Snafu};
use tracker::{MemRegistry, MemTracker};
use parking_lot::Mutex;
pub mod snapshot;
#[derive(Debug, Snafu)]
pub enum Error {
@ -31,57 +36,12 @@ pub enum Error {
source: crate::table::Error,
},
#[snafu(display("Error checking predicate in table {}: {}", table_id, source))]
PredicateCheck {
table_id: u32,
source: crate::table::Error,
},
#[snafu(display("Error checking predicate in table '{}': {}", table_name, source))]
NamedTablePredicateCheck {
table_name: String,
source: crate::table::Error,
},
#[snafu(display(
"Unsupported predicate when mutable buffer table names. Found a general expression: {:?}",
exprs
))]
PredicateNotYetSupported { exprs: Vec<Expr> },
#[snafu(display("Table ID {} not found in dictionary of chunk {}", table_id, chunk))]
TableIdNotFoundInDictionary {
table_id: u32,
chunk: u64,
source: DictionaryError,
},
#[snafu(display(
"Internal error: table {} not found in dictionary of chunk {}",
table_name,
chunk_id
))]
InternalTableNotFoundInDictionary { table_name: String, chunk_id: u32 },
#[snafu(display("Table {} not found in chunk {}", table, chunk))]
TableNotFoundInChunk { table: u32, chunk: u64 },
#[snafu(display("Table '{}' not found in chunk {}", table_name, chunk_id))]
NamedTableNotFoundInChunk { table_name: String, chunk_id: u64 },
#[snafu(display("Attempt to write table batch without a name"))]
TableWriteWithoutName,
#[snafu(display("Value ID {} not found in dictionary of chunk {}", value_id, chunk_id))]
InternalColumnValueIdNotFoundInDictionary {
value_id: u32,
chunk_id: u64,
source: DictionaryError,
},
TableNotFoundInChunk { table: DID, chunk: u64 },
#[snafu(display("Column ID {} not found in dictionary of chunk {}", column_id, chunk))]
ColumnIdNotFoundInDictionary {
column_id: u32,
column_id: DID,
chunk: u64,
source: DictionaryError,
},
@ -96,12 +56,6 @@ pub enum Error {
chunk_id: u64,
source: DictionaryError,
},
#[snafu(display(
"Column '{}' is not a string tag column and thus can not list values",
column_name
))]
UnsupportedColumnTypeForListingValues { column_name: String },
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
@ -109,34 +63,25 @@ pub type Result<T, E = Error> = std::result::Result<T, E>;
#[derive(Debug)]
pub struct Chunk {
/// The id for this chunk
pub id: u32,
id: u32,
/// `dictionary` maps &str -> u32. The u32s are used in place of String or
/// `dictionary` maps &str -> DID. The DIDs are used in place of String or
/// str to avoid slow string operations. The same dictionary is used for
/// table names, tag names, tag values, and column names.
// TODO: intern string field values too?
pub dictionary: Dictionary,
dictionary: Dictionary,
/// map of the dictionary ID for the table name to the table
pub tables: HashMap<u32, Table>,
tables: HashMap<DID, Table>,
/// keep track of memory used by chunk
tracker: MemTracker,
}
impl Clone for Chunk {
fn clone(&self) -> Self {
// TODO: The performance of this is not great - (#635)
let mut ret = Self {
id: self.id,
dictionary: self.dictionary.clone(),
tables: self.tables.clone(),
tracker: self.tracker.clone_empty(),
};
ret.tracker.set_bytes(ret.size());
ret
}
/// Cached chunk snapshot
///
/// Note: This is a mutex to allow mutation within
/// `Chunk::snapshot()` which only takes an immutable borrow
snapshot: Mutex<Option<Arc<ChunkSnapshot>>>,
}
impl Chunk {
@ -146,38 +91,41 @@ impl Chunk {
dictionary: Dictionary::new(),
tables: HashMap::new(),
tracker: memory_registry.register(),
snapshot: Mutex::new(None),
};
chunk.tracker.set_bytes(chunk.size());
chunk
}
pub fn write_entry(&mut self, entry: &wb::WriteBufferEntry<'_>) -> Result<()> {
if let Some(table_batches) = entry.table_batches() {
for batch in table_batches {
self.write_table_batch(&batch)?;
}
}
pub fn write_table_batches(
&mut self,
clock_value: ClockValue,
writer_id: WriterId,
batches: &[TableBatch<'_>],
) -> Result<()> {
for batch in batches {
let table_name = batch.name();
let table_id = self.dictionary.lookup_value_or_insert(table_name);
self.tracker.set_bytes(self.size());
let table = self
.tables
.entry(table_id)
.or_insert_with(|| Table::new(table_id));
Ok(())
}
fn write_table_batch(&mut self, batch: &wb::TableWriteBatch<'_>) -> Result<()> {
let table_name = batch.name().context(TableWriteWithoutName)?;
let table_id = self.dictionary.lookup_value_or_insert(table_name);
let table = self
.tables
.entry(table_id)
.or_insert_with(|| Table::new(table_id));
if let Some(rows) = batch.rows() {
let columns = batch.columns();
table
.append_rows(&mut self.dictionary, &rows)
.write_columns(&mut self.dictionary, clock_value, writer_id, columns)
.context(TableWrite { table_name })?;
}
// Invalidate chunk snapshot
*self
.snapshot
.try_lock()
.expect("concurrent readers/writers to MBChunk") = None;
self.tracker.set_bytes(self.size());
Ok(())
}
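// Editor's sketch (not part of this diff): the shape of the write path above
// with simplified stand-in types: look the table up by its dictionary id,
// creating it on first write, then invalidate the cached snapshot so the next
// reader rebuilds it from the new data.
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

struct TableStub {
    rows: usize,
}

struct ChunkStub {
    tables: HashMap<u32, TableStub>,
    // Stand-in for Mutex<Option<Arc<ChunkSnapshot>>>
    snapshot: Mutex<Option<Arc<Vec<String>>>>,
}

impl ChunkStub {
    fn write(&mut self, table_id: u32, rows: usize) {
        let table = self
            .tables
            .entry(table_id)
            .or_insert_with(|| TableStub { rows: 0 });
        table.rows += rows;
        // Invalidate the cached snapshot; it is rebuilt lazily on the next read.
        *self.snapshot.lock().unwrap() = None;
    }
}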
@ -191,212 +139,17 @@ impl Chunk {
}
}
/// Return the names of all the tables in this chunk that match the
/// chunk predicate
pub fn table_names(&self, chunk_predicate: &ChunkPredicate) -> Result<Vec<&str>> {
// we don't support arbitrary expressions in chunk predicate yet
if !chunk_predicate.chunk_exprs.is_empty() {
return PredicateNotYetSupported {
exprs: chunk_predicate.chunk_exprs.clone(),
}
.fail();
/// Returns a queryable snapshot of this chunk
pub fn snapshot(&self) -> Arc<ChunkSnapshot> {
let mut guard = self.snapshot.lock();
if let Some(snapshot) = &*guard {
return Arc::clone(snapshot);
}
self.tables
.iter()
.filter_map(|(&table_id, table)| {
// could match is good enough for this metadata query
match table.could_match_predicate(chunk_predicate) {
Ok(true) => Some(self.dictionary.lookup_id(table_id).context(
TableIdNotFoundInDictionary {
table_id,
chunk: self.id,
},
)),
Ok(false) => None,
Err(e) => Some(Err(e).context(PredicateCheck { table_id })),
}
})
.collect()
}
/// If the column names that match the predicate can be found
/// from the predicate entirely using metadata, return those
/// strings.
///
/// If the predicate cannot be evaluated entirely with
/// metadata, return `Ok(None)`.
pub fn column_names(
&self,
table_name: &str,
chunk_predicate: &ChunkPredicate,
selection: Selection<'_>,
) -> Result<Option<BTreeSet<String>>> {
// No support for general purpose expressions
if !chunk_predicate.chunk_exprs.is_empty() {
return Ok(None);
}
let table_name_id = self.table_name_id(table_name)?;
let mut chunk_column_ids = BTreeSet::new();
// Is this table in the chunk?
if let Some(table) = self.tables.get(&table_name_id) {
for (&column_id, column) in &table.columns {
let column_matches_predicate = table
.column_matches_predicate(&column, chunk_predicate)
.context(NamedTableError { table_name })?;
if column_matches_predicate {
chunk_column_ids.insert(column_id);
}
}
}
// Only return subset of these selection_cols if not all_cols
let mut all_cols = true;
let selection_cols = match selection {
Selection::All => &[""],
Selection::Some(cols) => {
all_cols = false;
cols
}
};
let mut column_names = BTreeSet::new();
for &column_id in &chunk_column_ids {
let column_name =
self.dictionary
.lookup_id(column_id)
.context(ColumnIdNotFoundInDictionary {
column_id,
chunk: self.id,
})?;
if !column_names.contains(column_name)
&& (all_cols || selection_cols.contains(&column_name))
{
// only use columns in selection_cols
column_names.insert(column_name.to_string());
}
}
Ok(Some(column_names))
}
/// Return the id of the table in the chunk's dictionary
fn table_name_id(&self, table_name: &str) -> Result<u32> {
self.dictionary
.id(table_name)
.context(InternalTableNotFoundInDictionary {
table_name,
chunk_id: self.id(),
})
}
/// Returns the strings of the specified Tag column that satisfy
/// the predicate, if they can be determined entirely using metadata.
///
/// If the predicate cannot be evaluated entirely with metadata,
/// return `Ok(None)`.
pub fn tag_column_values(
&self,
table_name: &str,
column_name: &str,
chunk_predicate: &ChunkPredicate,
) -> Result<Option<BTreeSet<String>>> {
// No support for general purpose expressions
if !chunk_predicate.chunk_exprs.is_empty() {
return Ok(None);
}
let chunk_id = self.id();
let table_name_id = self.table_name_id(table_name)?;
// Is this table even in the chunk?
let table = self
.tables
.get(&table_name_id)
.context(NamedTableNotFoundInChunk {
table_name,
chunk_id,
})?;
// See if we can rule out the table entirely using metadata
let could_match = table
.could_match_predicate(chunk_predicate)
.context(NamedTablePredicateCheck { table_name })?;
if !could_match {
// No columns could match, return empty set
return Ok(Default::default());
}
let column_id =
self.dictionary
.lookup_value(column_name)
.context(ColumnNameNotFoundInDictionary {
column_name,
chunk_id,
})?;
let column = table
.column(column_id)
.context(NamedTableError { table_name })?;
if let Column::Tag(column, _) = column {
// if we have a timestamp predicate, find all values
// where the timestamp is within range. Otherwise take
// all values.
// Collect matching ids into BTreeSet to deduplicate on
// ids *before* looking up Strings
let column_value_ids: BTreeSet<u32> = match chunk_predicate.range {
None => {
// take all non-null values
column.iter().filter_map(|&s| s).collect()
}
Some(range) => {
// filter out all values that don't match the timestamp
let time_column = table
.column_i64(chunk_predicate.time_column_id)
.context(NamedTableError { table_name })?;
column
.iter()
.zip(time_column.iter())
.filter_map(|(&column_value_id, &timestamp_value)| {
if range.contains_opt(timestamp_value) {
column_value_id
} else {
None
}
})
.collect()
}
};
// convert all the (deduplicated) ids to Strings
let column_values = column_value_ids
.into_iter()
.map(|value_id| {
let value = self.dictionary.lookup_id(value_id).context(
InternalColumnValueIdNotFoundInDictionary { value_id, chunk_id },
)?;
Ok(value.to_string())
})
.collect::<Result<BTreeSet<String>>>()?;
Ok(Some(column_values))
} else {
UnsupportedColumnTypeForListingValues { column_name }.fail()
}
}
/// Return a builder suitable to create predicates for this Chunk
pub fn predicate_builder(&self) -> Result<ChunkPredicateBuilder<'_>, crate::pred::Error> {
ChunkPredicateBuilder::new(&self.dictionary)
// TODO: Incremental snapshot generation
let snapshot = Arc::new(ChunkSnapshot::new(self));
*guard = Some(Arc::clone(&snapshot));
snapshot
}
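// Editor's sketch (not part of this diff): the lazy snapshot caching pattern
// used by Chunk::snapshot(), reduced to a generic cache. std::sync::Mutex is
// used here to keep the example self-contained; the chunk itself uses parking_lot.
use std::sync::{Arc, Mutex};

struct SnapshotCache<T> {
    slot: Mutex<Option<Arc<T>>>,
}

impl<T> SnapshotCache<T> {
    fn get_or_build(&self, build: impl FnOnce() -> T) -> Arc<T> {
        let mut guard = self.slot.lock().unwrap();
        if let Some(cached) = &*guard {
            // Fast path: hand out the shared snapshot.
            return Arc::clone(cached);
        }
        // Slow path: build once, cache, and return it.
        let fresh = Arc::new(build());
        *guard = Some(Arc::clone(&fresh));
        fresh
    }
}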
/// returns true if there is no data in this chunk
@ -420,7 +173,7 @@ impl Chunk {
if let Some(table) = self.table(table_name)? {
dst.push(
table
.to_arrow(&self, selection)
.to_arrow(&self.dictionary, selection)
.context(NamedTableError { table_name })?,
);
}
@ -439,7 +192,7 @@ impl Chunk {
TableSummary {
name: name.to_string(),
columns: table.stats(&self),
columns: table.stats(&self.dictionary),
}
})
.collect()
@ -459,21 +212,6 @@ impl Chunk {
Ok(table)
}
/// Return Schema for the specified table / columns
pub fn table_schema(&self, table_name: &str, selection: Selection<'_>) -> Result<Schema> {
let table = self
.table(table_name)?
// Option --> Result
.context(NamedTableNotFoundInChunk {
table_name,
chunk_id: self.id(),
})?;
table
.schema(self, selection)
.context(NamedTableError { table_name })
}
/// Return the approximate memory size of the chunk, in bytes including the
/// dictionary, tables, and their rows.
pub fn size(&self) -> usize {
@ -486,3 +224,155 @@ impl Chunk {
matches!(self.table(table_name), Ok(Some(_)))
}
}
pub mod test_helpers {
use super::*;
use internal_types::entry::test_helpers::lp_to_entry;
/// A helper that will write line protocol string to the passed in Chunk.
/// All data will be under a single partition with a clock value and
/// writer id of 0.
pub fn write_lp_to_chunk(lp: &str, chunk: &mut Chunk) -> Result<()> {
let entry = lp_to_entry(lp);
for w in entry.partition_writes().unwrap() {
chunk.write_table_batches(ClockValue::new(0), 0, &w.table_batches())?;
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::test_helpers::write_lp_to_chunk;
use super::*;
use arrow_deps::arrow::util::pretty::pretty_format_batches;
#[test]
fn writes_table_batches() {
let mr = MemRegistry::new();
let mut chunk = Chunk::new(1, &mr);
let lp = vec![
"cpu,host=a val=23 1",
"cpu,host=b val=2 1",
"mem,host=a val=23432i 1",
]
.join("\n");
write_lp_to_chunk(&lp, &mut chunk).unwrap();
assert_table(
&chunk,
"cpu",
&[
"+------+------+-----+",
"| host | time | val |",
"+------+------+-----+",
"| a | 1 | 23 |",
"| b | 1 | 2 |",
"+------+------+-----+\n",
],
);
assert_table(
&chunk,
"mem",
&[
"+------+------+-------+",
"| host | time | val |",
"+------+------+-------+",
"| a | 1 | 23432 |",
"+------+------+-------+\n",
],
);
let lp = vec![
"cpu,host=c val=11 1",
"mem sval=\"hi\" 2",
"disk val=true 1",
]
.join("\n");
write_lp_to_chunk(&lp, &mut chunk).unwrap();
assert_table(
&chunk,
"cpu",
&[
"+------+------+-----+",
"| host | time | val |",
"+------+------+-----+",
"| a | 1 | 23 |",
"| b | 1 | 2 |",
"| c | 1 | 11 |",
"+------+------+-----+\n",
],
);
assert_table(
&chunk,
"disk",
&[
"+------+------+",
"| time | val |",
"+------+------+",
"| 1 | true |",
"+------+------+\n",
],
);
assert_table(
&chunk,
"mem",
&[
"+------+------+------+-------+",
"| host | sval | time | val |",
"+------+------+------+-------+",
"| a | | 1 | 23432 |",
"| | hi | 2 | |",
"+------+------+------+-------+\n",
],
);
}
#[test]
fn test_snapshot() {
let mr = MemRegistry::new();
let mut chunk = Chunk::new(1, &mr);
let lp = vec![
"cpu,host=a val=23 1",
"cpu,host=b val=2 1",
"mem,host=a val=23432i 1",
]
.join("\n");
write_lp_to_chunk(&lp, &mut chunk).unwrap();
let s1 = chunk.snapshot();
let s2 = chunk.snapshot();
write_lp_to_chunk(&lp, &mut chunk).unwrap();
let s3 = chunk.snapshot();
let s4 = chunk.snapshot();
assert_eq!(Arc::as_ptr(&s1), Arc::as_ptr(&s2));
assert_ne!(Arc::as_ptr(&s1), Arc::as_ptr(&s3));
assert_eq!(Arc::as_ptr(&s3), Arc::as_ptr(&s4));
}
fn assert_table(chunk: &Chunk, table: &str, data: &[&str]) {
let mut batches = vec![];
chunk
.table_to_arrow(&mut batches, table, Selection::All)
.unwrap();
let res = pretty_format_batches(&batches).unwrap();
let data = data.join("\n");
assert_eq!(
res, data,
"\n{} table results not as expected:\nEXPECTED:\n{}\nRECEIVED:\n{}",
table, data, res
);
}
}

View File

@ -0,0 +1,181 @@
use std::collections::{BTreeSet, HashMap};
use std::sync::Arc;
use arrow_deps::arrow::record_batch::RecordBatch;
use data_types::timestamp::TimestampRange;
use internal_types::schema::{Schema, TIME_COLUMN_NAME};
use internal_types::selection::Selection;
use snafu::{OptionExt, ResultExt, Snafu};
use super::Chunk;
#[derive(Debug, Snafu)]
pub enum Error {
#[snafu(display("Table not found: {}", table_name))]
TableNotFound { table_name: String },
#[snafu(display("Failed to select columns: {}", source))]
SelectColumns {
source: internal_types::schema::Error,
},
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
/// A queryable snapshot of a mutable buffer chunk
#[derive(Debug)]
pub struct ChunkSnapshot {
/// The ID of the chunk this is a snapshot of
chunk_id: u32,
/// Maps table name to `TableSnapshot`
records: HashMap<String, TableSnapshot>,
// TODO: Memory tracking
}
#[derive(Debug)]
struct TableSnapshot {
schema: Schema,
batch: RecordBatch,
timestamp_range: Option<TimestampRange>,
}
impl TableSnapshot {
fn matches_predicate(&self, timestamp_range: &Option<TimestampRange>) -> bool {
match (self.timestamp_range, timestamp_range) {
(Some(a), Some(b)) => !a.disjoint(b),
// If this chunk doesn't have a time column it can't match the predicate
(None, Some(_)) => false,
(_, None) => true,
}
}
}
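// Editor's sketch (not part of this diff): the half-open range check behind
// matches_predicate, with a local stand-in for data_types' TimestampRange
// (whose `end` is exclusive).
#[derive(Clone, Copy)]
struct TsRange {
    start: i64,
    end: i64, // exclusive
}

impl TsRange {
    fn disjoint(&self, other: &Self) -> bool {
        self.end <= other.start || other.end <= self.start
    }
}

fn table_matches(table_range: Option<TsRange>, predicate: Option<TsRange>) -> bool {
    match (table_range, predicate) {
        (Some(a), Some(b)) => !a.disjoint(&b),
        // A table without a time column can't satisfy a time predicate.
        (None, Some(_)) => false,
        // No time predicate: every table matches.
        (_, None) => true,
    }
}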
impl ChunkSnapshot {
pub fn new(chunk: &Chunk) -> Self {
let mut records: HashMap<String, TableSnapshot> = Default::default();
for (id, table) in &chunk.tables {
let schema = table.schema(&chunk.dictionary, Selection::All).unwrap();
let batch = table.to_arrow(&chunk.dictionary, Selection::All).unwrap();
let name = chunk.dictionary.lookup_id(*id).unwrap();
let timestamp_range = chunk
.dictionary
.lookup_value(TIME_COLUMN_NAME)
.ok()
.and_then(|column_id| {
table.column(column_id).ok().and_then(|column| {
// TimestampRange has an exclusive upper bound
column
.get_i64_stats()
.map(|x| TimestampRange::new(x.min, x.max + 1))
})
});
records.insert(
name.to_string(),
TableSnapshot {
batch,
schema,
timestamp_range,
},
);
}
Self {
chunk_id: chunk.id,
records,
}
}
/// return the ID of the chunk this is a snapshot of
pub fn chunk_id(&self) -> u32 {
self.chunk_id
}
/// returns true if there is no data in this snapshot
pub fn is_empty(&self) -> bool {
self.records.is_empty()
}
/// Return true if this snapshot has the specified table name
pub fn has_table(&self, table_name: &str) -> bool {
self.records.get(table_name).is_some()
}
/// Return Schema for the specified table / columns
pub fn table_schema(&self, table_name: &str, selection: Selection<'_>) -> Result<Schema> {
let table = self
.records
.get(table_name)
.context(TableNotFound { table_name })?;
Ok(match selection {
Selection::All => table.schema.clone(),
Selection::Some(columns) => {
let columns = table.schema.select(columns).context(SelectColumns)?;
table.schema.project(&columns)
}
})
}
/// Returns a list of tables with writes matching the given timestamp_range
pub fn table_names(
&self,
timestamp_range: Option<TimestampRange>,
) -> impl Iterator<Item = &String> + '_ {
self.records
.iter()
.flat_map(move |(table_name, table_snapshot)| {
match table_snapshot.matches_predicate(&timestamp_range) {
true => Some(table_name),
false => None,
}
})
}
/// Returns a RecordBatch with the given selection
pub fn read_filter(&self, table_name: &str, selection: Selection<'_>) -> Result<RecordBatch> {
let table = self
.records
.get(table_name)
.context(TableNotFound { table_name })?;
Ok(match selection {
Selection::All => table.batch.clone(),
Selection::Some(columns) => {
let projection = table.schema.select(columns).context(SelectColumns)?;
let schema = table.schema.project(&projection).into();
let columns = projection
.into_iter()
.map(|x| Arc::clone(table.batch.column(x)))
.collect();
RecordBatch::try_new(schema, columns).expect("failed to project record batch")
}
})
}
/// Returns a given selection of column names from a table
pub fn column_names(
&self,
table_name: &str,
selection: Selection<'_>,
) -> Option<BTreeSet<String>> {
let table = self.records.get(table_name)?;
let fields = table.schema.inner().fields().iter();
Some(match selection {
Selection::Some(cols) => fields
.filter_map(|x| {
if cols.contains(&x.name().as_str()) {
Some(x.name().clone())
} else {
None
}
})
.collect(),
Selection::All => fields.map(|x| x.name().clone()).collect(),
})
}
}

View File

@ -1,10 +1,9 @@
use generated_types::wal as wb;
use snafu::Snafu;
use crate::dictionary::Dictionary;
use arrow_deps::arrow::datatypes::DataType as ArrowDataType;
use crate::dictionary::{Dictionary, DID};
use data_types::partition_metadata::StatValues;
use internal_types::data::type_description;
use generated_types::entry::LogicalColumnType;
use internal_types::entry::TypedValuesIterator;
use std::mem;
@ -37,80 +36,276 @@ pub enum Column {
U64(Vec<Option<u64>>, StatValues<u64>),
String(Vec<Option<String>>, StatValues<String>),
Bool(Vec<Option<bool>>, StatValues<bool>),
Tag(Vec<Option<u32>>, StatValues<String>),
Tag(Vec<Option<DID>>, StatValues<String>),
}
impl Column {
pub fn with_value(
/// Initializes a new column from the typed values of a column in a table
/// write batch on an Entry. Initializes the stats with the first
/// non-null value and updates them with any other non-null values included.
pub fn new_from_typed_values(
dictionary: &mut Dictionary,
capacity: usize,
value: wb::Value<'_>,
) -> Result<Self> {
Ok(match value.value_type() {
wb::ColumnValue::F64Value => {
let val = value
.value_as_f64value()
.expect("f64 value should be present")
.value();
let mut vals = vec![None; capacity];
vals.push(Some(val));
Self::F64(vals, StatValues::new(val))
}
wb::ColumnValue::I64Value => {
let val = value
.value_as_i64value()
.expect("i64 value should be present")
.value();
let mut vals = vec![None; capacity];
vals.push(Some(val));
Self::I64(vals, StatValues::new(val))
}
wb::ColumnValue::U64Value => {
let val = value
.value_as_u64value()
.expect("u64 value should be present")
.value();
let mut vals = vec![None; capacity];
vals.push(Some(val));
Self::U64(vals, StatValues::new(val))
}
wb::ColumnValue::StringValue => {
let val = value
.value_as_string_value()
.expect("string value should be present")
.value()
.expect("string must be present");
let mut vals = vec![None; capacity];
vals.push(Some(val.to_string()));
Self::String(vals, StatValues::new(val.to_string()))
}
wb::ColumnValue::BoolValue => {
let val = value
.value_as_bool_value()
.expect("bool value should be present")
.value();
let mut vals = vec![None; capacity];
vals.push(Some(val));
Self::Bool(vals, StatValues::new(val))
}
wb::ColumnValue::TagValue => {
let val = value
.value_as_tag_value()
.expect("tag value should be present")
.value()
.expect("tag value must have string value");
let mut vals = vec![None; capacity];
let id = dictionary.lookup_value_or_insert(val);
vals.push(Some(id));
Self::Tag(vals, StatValues::new(val.to_string()))
}
_ => {
return UnknownColumnType {
inserted_value_type: type_description(value.value_type()),
row_count: usize,
logical_type: LogicalColumnType,
values: TypedValuesIterator<'_>,
) -> Self {
match values {
TypedValuesIterator::String(vals) => match logical_type {
LogicalColumnType::Tag => {
let mut tag_values = vec![None; row_count];
let mut stats: Option<StatValues<String>> = None;
let mut added_tag_values: Vec<_> = vals
.map(|tag| {
tag.map(|tag| {
match stats.as_mut() {
Some(s) => StatValues::update_string(s, tag),
None => {
stats = Some(StatValues::new(tag.to_string()));
}
}
dictionary.lookup_value_or_insert(tag)
})
})
.collect();
tag_values.append(&mut added_tag_values);
Self::Tag(
tag_values,
stats.expect("can't insert tag column with no values"),
)
}
.fail()
LogicalColumnType::Field => {
let mut values = vec![None; row_count];
let mut stats: Option<StatValues<String>> = None;
for value in vals {
match value {
Some(v) => {
match stats.as_mut() {
Some(s) => StatValues::update_string(s, v),
None => stats = Some(StatValues::new(v.to_string())),
}
values.push(Some(v.to_string()));
}
None => values.push(None),
}
}
Self::String(
values,
stats.expect("can't insert string column with no values"),
)
}
_ => panic!("unsupported!"),
},
TypedValuesIterator::I64(vals) => {
let mut values = vec![None; row_count];
let mut stats: Option<StatValues<i64>> = None;
for v in vals {
if let Some(val) = v {
match stats.as_mut() {
Some(s) => s.update(val),
None => stats = Some(StatValues::new(val)),
}
}
values.push(v);
}
Self::I64(
values,
stats.expect("can't insert i64 column with no values"),
)
}
})
TypedValuesIterator::F64(vals) => {
let mut values = vec![None; row_count];
let mut stats: Option<StatValues<f64>> = None;
for v in vals {
if let Some(val) = v {
match stats.as_mut() {
Some(s) => s.update(val),
None => stats = Some(StatValues::new(val)),
}
}
values.push(v);
}
Self::F64(
values,
stats.expect("can't insert f64 column with no values"),
)
}
TypedValuesIterator::U64(vals) => {
let mut values = vec![None; row_count];
let mut stats: Option<StatValues<u64>> = None;
for v in vals {
if let Some(val) = v {
match stats.as_mut() {
Some(s) => s.update(val),
None => stats = Some(StatValues::new(val)),
}
}
values.push(v);
}
Self::U64(
values,
stats.expect("can't insert u64 column with no values"),
)
}
TypedValuesIterator::Bool(vals) => {
let mut values = vec![None; row_count];
let mut stats: Option<StatValues<bool>> = None;
for v in vals {
if let Some(val) = v {
match stats.as_mut() {
Some(s) => s.update(val),
None => stats = Some(StatValues::new(val)),
}
}
values.push(v);
}
Self::Bool(
values,
stats.expect("can't insert bool column with no values"),
)
}
}
}
/// Pushes the typed values of a column from a table write batch on an Entry.
/// Updates statistics for any non-null values.
pub fn push_typed_values(
&mut self,
dictionary: &mut Dictionary,
logical_type: LogicalColumnType,
values: TypedValuesIterator<'_>,
) -> Result<()> {
match (self, values) {
(Self::Bool(col, stats), TypedValuesIterator::Bool(values)) => {
for val in values {
if let Some(v) = val {
stats.update(v)
};
col.push(val);
}
}
(Self::I64(col, stats), TypedValuesIterator::I64(values)) => {
for val in values {
if let Some(v) = val {
stats.update(v)
};
col.push(val);
}
}
(Self::F64(col, stats), TypedValuesIterator::F64(values)) => {
for val in values {
if let Some(v) = val {
stats.update(v)
};
col.push(val);
}
}
(Self::U64(col, stats), TypedValuesIterator::U64(values)) => {
for val in values {
if let Some(v) = val {
stats.update(v)
};
col.push(val);
}
}
(Self::String(col, stats), TypedValuesIterator::String(values)) => {
if logical_type != LogicalColumnType::Field {
TypeMismatch {
existing_column_type: "String",
inserted_value_type: "tag",
}
.fail()?;
}
for val in values {
match val {
Some(v) => {
StatValues::update_string(stats, v);
col.push(Some(v.to_string()));
}
None => col.push(None),
}
}
}
(Self::Tag(col, stats), TypedValuesIterator::String(values)) => {
if logical_type != LogicalColumnType::Tag {
TypeMismatch {
existing_column_type: "tag",
inserted_value_type: "String",
}
.fail()?;
}
for val in values {
match val {
Some(v) => {
StatValues::update_string(stats, v);
let id = dictionary.lookup_value_or_insert(v);
col.push(Some(id));
}
None => col.push(None),
}
}
}
(existing, values) => TypeMismatch {
existing_column_type: existing.type_description(),
inserted_value_type: values.type_description(),
}
.fail()?,
}
Ok(())
}
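// Editor's sketch (not part of this diff): the statistics bookkeeping that
// new_from_typed_values and push_typed_values share. Stats start from the
// first non-null value and fold in every later non-null value; MinMax is a
// stand-in for data_types' StatValues.
#[derive(Debug, PartialEq)]
struct MinMax {
    min: i64,
    max: i64,
    count: u64,
}

fn fold_stats(values: &[Option<i64>]) -> Option<MinMax> {
    let mut stats: Option<MinMax> = None;
    for v in values.iter().flatten() {
        match stats.as_mut() {
            Some(s) => {
                s.min = s.min.min(*v);
                s.max = s.max.max(*v);
                s.count += 1;
            }
            None => stats = Some(MinMax { min: *v, max: *v, count: 1 }),
        }
    }
    stats
}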
/// Pushes None values onto the column until its len is equal to that passed
/// in
pub fn push_nulls_to_len(&mut self, len: usize) {
match self {
Self::Tag(vals, _) => {
if len > vals.len() {
vals.resize(len, None);
}
}
Self::I64(vals, _) => {
if len > vals.len() {
vals.resize(len, None);
}
}
Self::F64(vals, _) => {
if len > vals.len() {
vals.resize(len, None);
}
}
Self::U64(vals, _) => {
if len > vals.len() {
vals.resize(len, None);
}
}
Self::Bool(vals, _) => {
if len > vals.len() {
vals.resize(len, None);
}
}
Self::String(vals, _) => {
if len > vals.len() {
vals.resize(len, None);
}
}
}
}
pub fn len(&self) -> usize {
@ -124,10 +319,6 @@ impl Column {
}
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
pub fn type_description(&self) -> &'static str {
match self {
Self::F64(_, _) => "f64",
@ -139,167 +330,10 @@ impl Column {
}
}
/// Return the arrow DataType for this column
pub fn data_type(&self) -> ArrowDataType {
pub fn get_i64_stats(&self) -> Option<StatValues<i64>> {
match self {
Self::F64(..) => ArrowDataType::Float64,
Self::I64(..) => ArrowDataType::Int64,
Self::U64(..) => ArrowDataType::UInt64,
Self::String(..) => ArrowDataType::Utf8,
Self::Bool(..) => ArrowDataType::Boolean,
Self::Tag(..) => ArrowDataType::Utf8,
}
}
pub fn push(&mut self, dictionary: &mut Dictionary, value: &wb::Value<'_>) -> Result<()> {
let inserted = match self {
Self::Tag(vals, stats) => match value.value_as_tag_value() {
Some(tag) => {
let tag_value = tag.value().expect("tag must have string value");
let id = dictionary.lookup_value_or_insert(tag_value);
vals.push(Some(id));
StatValues::update_string(stats, tag_value);
true
}
None => false,
},
Self::String(vals, stats) => match value.value_as_string_value() {
Some(str_val) => {
let str_val = str_val.value().expect("string must have value");
vals.push(Some(str_val.to_string()));
StatValues::update_string(stats, str_val);
true
}
None => false,
},
Self::Bool(vals, stats) => match value.value_as_bool_value() {
Some(bool_val) => {
let bool_val = bool_val.value();
vals.push(Some(bool_val));
stats.update(bool_val);
true
}
None => false,
},
Self::I64(vals, stats) => match value.value_as_i64value() {
Some(i64_val) => {
let i64_val = i64_val.value();
vals.push(Some(i64_val));
stats.update(i64_val);
true
}
None => false,
},
Self::U64(vals, stats) => match value.value_as_u64value() {
Some(u64_val) => {
let u64_val = u64_val.value();
vals.push(Some(u64_val));
stats.update(u64_val);
true
}
None => false,
},
Self::F64(vals, stats) => match value.value_as_f64value() {
Some(f64_val) => {
let f64_val = f64_val.value();
vals.push(Some(f64_val));
stats.update(f64_val);
true
}
None => false,
},
};
if inserted {
Ok(())
} else {
TypeMismatch {
existing_column_type: self.type_description(),
inserted_value_type: type_description(value.value_type()),
}
.fail()
}
}
// push_none_if_len_equal will add a None value to the end of the Vec of values
// if the length is equal to the passed in value. This is used to ensure
// columns are all the same length.
pub fn push_none_if_len_equal(&mut self, len: usize) {
match self {
Self::F64(v, _) => {
if v.len() == len {
v.push(None);
}
}
Self::I64(v, _) => {
if v.len() == len {
v.push(None);
}
}
Self::U64(v, _) => {
if v.len() == len {
v.push(None);
}
}
Self::String(v, _) => {
if v.len() == len {
v.push(None);
}
}
Self::Bool(v, _) => {
if v.len() == len {
v.push(None);
}
}
Self::Tag(v, _) => {
if v.len() == len {
v.push(None);
}
}
}
}
/// Returns true if any rows are within the range [start,
/// end). Inclusive of `start`, exclusive of `end`
pub fn has_i64_range(&self, start: i64, end: i64) -> Result<bool> {
match self {
Self::I64(_, stats) => {
if stats.max < start || stats.min >= end {
Ok(false)
} else {
Ok(true)
}
}
_ => InternalTypeMismatchForTimePredicate {}.fail(),
}
}
/// Return true of this column's type is a Tag
pub fn is_tag(&self) -> bool {
matches!(self, Self::Tag(..))
}
/// Returns true if there exists at least one row index i where
/// self[i] is within the range [start, end). Inclusive
/// of `start`, exclusive of `end`, and where column[i] is non-null
pub fn has_non_null_i64_range<T>(
&self,
column: &[Option<T>],
start: i64,
end: i64,
) -> Result<bool> {
match self {
Self::I64(v, _) => {
for (index, val) in v.iter().enumerate() {
if let Some(val) = val {
if start <= *val && *val < end && column[index].is_some() {
return Ok(true);
}
}
}
Ok(false)
}
_ => InternalTypeMismatchForTimePredicate {}.fail(),
Self::I64(_, values) => Some(values.clone()),
_ => None,
}
}
@ -322,7 +356,7 @@ impl Column {
mem::size_of::<Option<bool>>() * v.len() + mem::size_of_val(&stats)
}
Self::Tag(v, stats) => {
mem::size_of::<Option<u32>>() * v.len() + mem::size_of_val(&stats)
mem::size_of::<Option<DID>>() * v.len() + mem::size_of_val(&stats)
}
Self::String(v, stats) => {
let string_bytes_size = v
@ -334,89 +368,3 @@ impl Column {
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_has_i64_range() {
let mut stats = StatValues::new(1);
stats.update(2);
let col = Column::I64(vec![Some(1), None, Some(2)], stats.clone());
assert!(!col.has_i64_range(-1, 0).unwrap());
assert!(!col.has_i64_range(0, 1).unwrap());
assert!(col.has_i64_range(1, 2).unwrap());
assert!(col.has_i64_range(2, 3).unwrap());
assert!(!col.has_i64_range(3, 4).unwrap());
let col = Column::I64(vec![Some(2), None, Some(1)], stats);
assert!(!col.has_i64_range(-1, 0).unwrap());
assert!(!col.has_i64_range(0, 1).unwrap());
assert!(col.has_i64_range(1, 2).unwrap());
assert!(col.has_i64_range(2, 3).unwrap());
assert!(!col.has_i64_range(3, 4).unwrap());
}
#[test]
fn test_has_i64_range_does_not_panic() {
// providing the wrong column type should get an internal error, not a panic
let col = Column::F64(vec![Some(1.2)], StatValues::new(1.2));
let res = col.has_i64_range(-1, 0);
assert!(res.is_err());
let res_string = format!("{:?}", res);
let expected = "InternalTypeMismatchForTimePredicate";
assert!(
res_string.contains(expected),
"Did not find expected text '{}' in '{}'",
expected,
res_string
);
}
#[test]
fn test_has_non_null_i64_range_() {
let none_col: Vec<Option<u32>> = vec![None, None, None];
let some_col: Vec<Option<u32>> = vec![Some(0), Some(0), Some(0)];
let mut stats = StatValues::new(1);
stats.update(2);
let col = Column::I64(vec![Some(1), None, Some(2)], stats);
assert!(!col.has_non_null_i64_range(&some_col, -1, 0).unwrap());
assert!(!col.has_non_null_i64_range(&some_col, 0, 1).unwrap());
assert!(col.has_non_null_i64_range(&some_col, 1, 2).unwrap());
assert!(col.has_non_null_i64_range(&some_col, 2, 3).unwrap());
assert!(!col.has_non_null_i64_range(&some_col, 3, 4).unwrap());
assert!(!col.has_non_null_i64_range(&none_col, -1, 0).unwrap());
assert!(!col.has_non_null_i64_range(&none_col, 0, 1).unwrap());
assert!(!col.has_non_null_i64_range(&none_col, 1, 2).unwrap());
assert!(!col.has_non_null_i64_range(&none_col, 2, 3).unwrap());
assert!(!col.has_non_null_i64_range(&none_col, 3, 4).unwrap());
}
#[test]
fn column_size() {
let i64col = Column::I64(vec![Some(1), Some(1)], StatValues::new(1));
assert_eq!(40, i64col.size());
let f64col = Column::F64(vec![Some(1.1), Some(1.1), Some(1.1)], StatValues::new(1.1));
assert_eq!(56, f64col.size());
let boolcol = Column::Bool(vec![Some(true)], StatValues::new(true));
assert_eq!(9, boolcol.size());
let tagcol = Column::Tag(
vec![Some(1), Some(1), Some(1), Some(1)],
StatValues::new("foo".to_string()),
);
assert_eq!(40, tagcol.size());
let stringcol = Column::String(
vec![Some("foo".to_string()), Some("hello world".to_string())],
StatValues::new("foo".to_string()),
);
assert_eq!(70, stringcol.size());
}
}

View File

@ -8,7 +8,7 @@ use string_interner::{
#[derive(Debug, Snafu)]
pub enum Error {
#[snafu(display("Dictionary lookup error on id {}", id))]
DictionaryIdLookupError { id: u32 },
DictionaryIdLookupError { id: DID },
#[snafu(display("Dictionary lookup error for value {}", value))]
DictionaryValueLookupError { value: String },
@ -16,6 +16,30 @@ pub enum Error {
pub type Result<T, E = Error> = std::result::Result<T, E>;
/// A "dictionary ID" (DID) is a compact numeric representation of an interned
/// string in the dictionary. The same string always maps to the same DID. DIDs can
/// be compared, hashed and cheaply copied around, just like small integers.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub struct DID(DefaultSymbol);
impl DID {
fn new(s: DefaultSymbol) -> Self {
Self(s)
}
}
impl From<DID> for DefaultSymbol {
fn from(id: DID) -> Self {
id.0
}
}
impl std::fmt::Display for DID {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0.to_usize())
}
}
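// Editor's sketch (not part of this diff): the idea behind DID with a toy
// interner. Ids are small, copyable handles that resolve back to the interned
// string; the real Dictionary delegates this to the string-interner crate.
use std::collections::HashMap;

#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
struct ToyId(u32);

#[derive(Default)]
struct ToyDictionary {
    by_value: HashMap<String, ToyId>,
    by_id: Vec<String>,
}

impl ToyDictionary {
    fn lookup_value_or_insert(&mut self, value: &str) -> ToyId {
        if let Some(id) = self.by_value.get(value) {
            return *id;
        }
        let id = ToyId(self.by_id.len() as u32);
        self.by_id.push(value.to_string());
        self.by_value.insert(value.to_string(), id);
        id
    }

    fn lookup_id(&self, id: ToyId) -> Option<&str> {
        self.by_id.get(id.0 as usize).map(|s| s.as_str())
    }
}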
#[derive(Debug, Clone)]
pub struct Dictionary {
interner: StringInterner<DefaultSymbol, StringBackend<DefaultSymbol>, DefaultHashBuilder>,
@ -39,43 +63,37 @@ impl Dictionary {
/// Returns the id corresponding to value, adding an entry for the
/// id if it is not yet present in the dictionary.
pub fn lookup_value_or_insert(&mut self, value: &str) -> u32 {
pub fn lookup_value_or_insert(&mut self, value: &str) -> DID {
self.id(value).unwrap_or_else(|| {
self.size += value.len();
self.size += std::mem::size_of::<u32>();
symbol_to_u32(self.interner.get_or_intern(value))
DID::new(self.interner.get_or_intern(value))
})
}
/// Returns the ID in self.dictionary that corresponds to `value`, if any.
/// Returns an error if no such value is found. Does not add the value
/// to the dictionary.
pub fn lookup_value(&self, value: &str) -> Result<u32> {
pub fn lookup_value(&self, value: &str) -> Result<DID> {
self.id(value).context(DictionaryValueLookupError { value })
}
/// Returns the ID in self.dictionary that corresponds to `value`,
/// if any. No error is returned to avoid an allocation when no value is
/// present
pub fn id(&self, value: &str) -> Option<u32> {
self.interner.get(value).map(symbol_to_u32)
pub fn id(&self, value: &str) -> Option<DID> {
self.interner.get(value).map(DID::new)
}
/// Returns the str in self.dictionary that corresponds to `id`,
/// if any. Returns an error if no such id is found
pub fn lookup_id(&self, id: u32) -> Result<&str> {
let symbol =
Symbol::try_from_usize(id as usize).expect("to be able to convert u32 to symbol");
pub fn lookup_id(&self, id: DID) -> Result<&str> {
self.interner
.resolve(symbol)
.resolve(id.into())
.context(DictionaryIdLookupError { id })
}
}
fn symbol_to_u32(sym: DefaultSymbol) -> u32 {
sym.to_usize() as u32
}
#[cfg(test)]
mod test {
use crate::dictionary::Dictionary;

View File

@ -60,5 +60,4 @@
pub mod chunk;
mod column;
mod dictionary;
pub mod pred;
mod table;

View File

@ -1,298 +0,0 @@
use std::collections::{BTreeSet, HashSet};
use crate::dictionary::{Dictionary, Error as DictionaryError};
use arrow_deps::{
datafusion::{
error::{DataFusionError, Result as DatafusionResult},
logical_plan::{Expr, ExpressionVisitor, Operator, Recursion},
optimizer::utils::expr_to_column_names,
},
util::{make_range_expr, AndExprBuilder},
};
use data_types::timestamp::TimestampRange;
use internal_types::schema::TIME_COLUMN_NAME;
//use snafu::{OptionExt, ResultExt, Snafu};
use snafu::{ensure, ResultExt, Snafu};
#[derive(Debug, Snafu)]
pub enum Error {
#[snafu(display("Error writing table '{}': {}", table_name, source))]
TableWrite {
table_name: String,
source: crate::table::Error,
},
#[snafu(display("Time Column was not not found in dictionary: {}", source))]
TimeColumnNotFound { source: DictionaryError },
#[snafu(display("Unsupported predicate. Mutable buffer does not support: {}", source))]
UnsupportedPredicate { source: DataFusionError },
#[snafu(display(
"Internal error visiting expressions in ChunkPredicateBuilder: {}",
source
))]
InternalVisitingExpressions { source: DataFusionError },
#[snafu(display("table_names has already been specified in ChunkPredicateBuilder"))]
TableNamesAlreadySet {},
#[snafu(display("field_names has already been specified in ChunkPredicateBuilder"))]
FieldNamesAlreadySet {},
#[snafu(display("range has already been specified in ChunkPredicateBuilder"))]
RangeAlreadySet {},
#[snafu(display("exprs has already been specified in ChunkPredicateBuilder"))]
ExprsAlreadySet {},
#[snafu(display("required_columns has already been specified in ChunkPredicateBuilder"))]
RequiredColumnsAlreadySet {},
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
/// Describes the result of translating a set of strings into
/// chunk specific ids
#[derive(Debug, PartialEq, Eq)]
pub enum ChunkIdSet {
/// At least one of the strings was not present in the chunks'
/// dictionary.
///
/// This is important when testing for the presence of all ids in
/// a set, as we know they can not all be present
AtLeastOneMissing,
/// All strings existed in this chunk's dictionary
Present(BTreeSet<u32>),
}
/// a 'Compiled' set of predicates / filters that can be evaluated on
/// this chunk (where strings have been translated to chunk
/// specific u32 ids)
#[derive(Debug, Default)]
pub struct ChunkPredicate {
/// If present, restrict the request to just those tables whose
/// names are in table_names. If present but empty, means there
/// was a predicate but no tables named that way exist in the
/// chunk (so no table can pass)
pub table_name_predicate: Option<BTreeSet<u32>>,
/// Optional column restriction. If present, further
/// restrict any field columns returned to only those named, and
/// skip tables entirely when querying metadata that do not have
/// *any* of the fields
pub field_name_predicate: Option<BTreeSet<u32>>,
/// General DataFusion expressions (arbitrary predicates) applied
/// as a filter using logical conjunction (aka are 'AND'ed
/// together). Only rows that evaluate to TRUE for all these
/// expressions should be returned.
///
/// TODO these exprs should eventually be removed (when they are
/// all handled one layer up in the query layer)
pub chunk_exprs: Vec<Expr>,
/// If Some, then the table must contain all columns specified
/// to pass the predicate
pub required_columns: Option<ChunkIdSet>,
/// The id of the "time" column in this chunk
pub time_column_id: u32,
/// Timestamp range: only rows within this range should be considered
pub range: Option<TimestampRange>,
}
impl ChunkPredicate {
/// Creates and adds a DataFusion predicate representing the
/// combination of predicate and timestamp.
pub fn filter_expr(&self) -> Option<Expr> {
// build up a list of expressions
let mut builder =
AndExprBuilder::default().append_opt(self.make_timestamp_predicate_expr());
for expr in &self.chunk_exprs {
builder = builder.append_expr(expr.clone());
}
builder.build()
}
/// For plans which select a subset of fields, returns true if
/// the field should be included in the results
pub fn should_include_field(&self, field_id: u32) -> bool {
match &self.field_name_predicate {
None => true,
Some(field_restriction) => field_restriction.contains(&field_id),
}
}
/// Return true if this column is the time column
pub fn is_time_column(&self, id: u32) -> bool {
self.time_column_id == id
}
/// Creates a DataFusion predicate for applying a timestamp range:
///
/// `range.start <= time and time < range.end`
fn make_timestamp_predicate_expr(&self) -> Option<Expr> {
self.range
.map(|range| make_range_expr(range.start, range.end, TIME_COLUMN_NAME))
}
}
/// Builds ChunkPredicates
#[derive(Debug)]
pub struct ChunkPredicateBuilder<'a> {
inner: ChunkPredicate,
dictionary: &'a Dictionary,
}
impl<'a> ChunkPredicateBuilder<'a> {
pub fn new(dictionary: &'a Dictionary) -> Result<Self> {
let time_column_id = dictionary
.lookup_value(TIME_COLUMN_NAME)
.context(TimeColumnNotFound)?;
let inner = ChunkPredicate {
time_column_id,
..Default::default()
};
Ok(Self { inner, dictionary })
}
/// Set table_name_predicate so only tables in `names` are returned
pub fn table_names(mut self, names: Option<&BTreeSet<String>>) -> Result<Self> {
ensure!(
self.inner.table_name_predicate.is_none(),
TableNamesAlreadySet
);
self.inner.table_name_predicate = self.compile_string_list(names);
Ok(self)
}
/// Set field_name_predicate so only tables in `names` are returned
pub fn field_names(mut self, names: Option<&BTreeSet<String>>) -> Result<Self> {
ensure!(
self.inner.field_name_predicate.is_none(),
FieldNamesAlreadySet
);
self.inner.field_name_predicate = self.compile_string_list(names);
Ok(self)
}
pub fn range(mut self, range: Option<TimestampRange>) -> Result<Self> {
ensure!(self.inner.range.is_none(), RangeAlreadySet);
self.inner.range = range;
Ok(self)
}
/// Set the general purpose predicates
pub fn exprs(mut self, chunk_exprs: Vec<Expr>) -> Result<Self> {
// In order to evaluate expressions in the table, all columns
// referenced in the expression must appear (I think, not sure
// about NOT, etc so panic if we see one of those);
let mut visitor = SupportVisitor {};
let mut predicate_columns: HashSet<String> = HashSet::new();
for expr in &chunk_exprs {
visitor = expr.accept(visitor).context(UnsupportedPredicate)?;
expr_to_column_names(&expr, &mut predicate_columns)
.context(InternalVisitingExpressions)?;
}
ensure!(self.inner.chunk_exprs.is_empty(), ExprsAlreadySet);
self.inner.chunk_exprs = chunk_exprs;
// if there are any column references in the expression, ensure they appear in
// any table
if !predicate_columns.is_empty() {
ensure!(
self.inner.required_columns.is_none(),
RequiredColumnsAlreadySet
);
self.inner.required_columns = Some(self.make_chunk_ids(predicate_columns.iter()));
}
Ok(self)
}
/// Return the created chunk predicate, consuming self
pub fn build(self) -> ChunkPredicate {
self.inner
}
/// Converts a Set of strings into a set of ids in terms of this
/// Chunk's dictionary.
///
/// If there are no matching Strings in the chunks dictionary,
/// those strings are ignored and a (potentially empty) set is
/// returned.
fn compile_string_list(&self, names: Option<&BTreeSet<String>>) -> Option<BTreeSet<u32>> {
names.map(|names| {
names
.iter()
.filter_map(|name| self.dictionary.id(name))
.collect::<BTreeSet<_>>()
})
}
/// Translate a set of strings into a set of ids from the dictionary of this
/// chunk
pub fn make_chunk_ids<'b, I>(&self, predicate_columns: I) -> ChunkIdSet
where
I: Iterator<Item = &'b String>,
{
let mut symbols = BTreeSet::new();
for column_name in predicate_columns {
if let Some(column_id) = self.dictionary.id(column_name) {
symbols.insert(column_id);
} else {
return ChunkIdSet::AtLeastOneMissing;
}
}
ChunkIdSet::Present(symbols)
}
}
/// Used to figure out if we know how to deal with this kind of
/// predicate in the write buffer
struct SupportVisitor {}
impl ExpressionVisitor for SupportVisitor {
fn pre_visit(self, expr: &Expr) -> DatafusionResult<Recursion<Self>> {
match expr {
Expr::Literal(..) => Ok(Recursion::Continue(self)),
Expr::Column(..) => Ok(Recursion::Continue(self)),
Expr::BinaryExpr { op, .. } => {
match op {
Operator::Eq
| Operator::Lt
| Operator::LtEq
| Operator::Gt
| Operator::GtEq
| Operator::Plus
| Operator::Minus
| Operator::Multiply
| Operator::Divide
| Operator::And
| Operator::Or => Ok(Recursion::Continue(self)),
// Unsupported (need to think about ramifications)
Operator::NotEq | Operator::Modulus | Operator::Like | Operator::NotLike => {
Err(DataFusionError::NotImplemented(format!(
"Operator {:?} not yet supported in IOx MutableBuffer",
op
)))
}
}
}
_ => Err(DataFusionError::NotImplemented(format!(
"Unsupported expression in mutable_buffer database: {:?}",
expr
))),
}
}
}

View File

@ -1,19 +1,16 @@
use generated_types::wal as wb;
use std::{
collections::{BTreeMap, BTreeSet},
sync::Arc,
};
use std::{cmp, collections::BTreeMap, sync::Arc};
use crate::{
chunk::Chunk,
column,
column::Column,
dictionary::{Dictionary, Error as DictionaryError},
pred::{ChunkIdSet, ChunkPredicate},
dictionary::{Dictionary, Error as DictionaryError, DID},
};
use data_types::{
database_rules::WriterId,
partition_metadata::{ColumnSummary, Statistics},
};
use data_types::partition_metadata::{ColumnSummary, Statistics};
use internal_types::{
entry::{self, ClockValue},
schema::{builder::SchemaBuilder, Schema, TIME_COLUMN_NAME},
selection::Selection,
};
@ -33,12 +30,8 @@ use arrow_deps::{
#[derive(Debug, Snafu)]
pub enum Error {
#[snafu(display("Tag value ID {} not found in dictionary of chunk {}", value, chunk))]
TagValueIdNotFoundInDictionary {
value: u32,
chunk: u64,
source: DictionaryError,
},
#[snafu(display("Tag value ID {} not found in dictionary of chunk", value))]
TagValueIdNotFoundInDictionary { value: DID, source: DictionaryError },
#[snafu(display("Column error on column {}: {}", column, source))]
ColumnError {
@ -53,7 +46,7 @@ pub enum Error {
actual_column_type
))]
InternalColumnTypeMismatch {
column_id: u32,
column_id: DID,
expected_column_type: String,
actual_column_type: String,
},
@ -61,21 +54,12 @@ pub enum Error {
#[snafu(display("Internal error: unexpected aggregate request for None aggregate",))]
InternalUnexpectedNoneAggregate {},
#[snafu(display(
"Column name '{}' not found in dictionary of chunk {}",
column_name,
chunk
))]
ColumnNameNotFoundInDictionary { column_name: String, chunk: u64 },
#[snafu(display("Column name '{}' not found in dictionary of chunk", column_name,))]
ColumnNameNotFoundInDictionary { column_name: String },
#[snafu(display(
"Internal: Column id '{}' not found in dictionary of chunk {}",
column_id,
chunk
))]
#[snafu(display("Internal: Column id '{}' not found in dictionary", column_id,))]
ColumnIdNotFoundInDictionary {
column_id: u32,
chunk: u64,
column_id: DID,
source: DictionaryError,
},
@ -92,22 +76,22 @@ pub enum Error {
column_name,
column_id
))]
InternalNoColumnInIndex { column_name: String, column_id: u32 },
InternalNoColumnInIndex { column_name: String, column_id: DID },
#[snafu(display("Error creating column from wal for column {}: {}", column, source))]
CreatingFromWal {
column: u32,
column: DID,
source: crate::column::Error,
},
#[snafu(display("Error evaluating column predicate for column {}: {}", column, source))]
ColumnPredicateEvaluation {
column: u32,
column: DID,
source: crate::column::Error,
},
#[snafu(display("Row insert to table {} missing column name", table))]
ColumnNameNotInRow { table: u32 },
ColumnNameNotInRow { table: DID },
#[snafu(display(
"Group column '{}' not found in tag columns: {}",
@ -123,68 +107,27 @@ pub enum Error {
DuplicateGroupColumn { column_name: String },
#[snafu(display("Column {} not found in table {}", id, table_id))]
ColumnIdNotFound { id: u32, table_id: u32 },
ColumnIdNotFound { id: DID, table_id: DID },
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
#[derive(Debug, Clone)]
pub struct Table {
/// Name of the table as a u32 in the chunk dictionary
pub id: u32,
/// Name of the table as a DID in the chunk dictionary
pub id: DID,
/// Map of column id from the chunk dictionary to the column
pub columns: BTreeMap<u32, Column>,
pub columns: BTreeMap<DID, Column>,
}
impl Table {
pub fn new(id: u32) -> Self {
pub fn new(id: DID) -> Self {
Self {
id,
columns: BTreeMap::new(),
}
}
fn append_row(
&mut self,
dictionary: &mut Dictionary,
values: &flatbuffers::Vector<'_, flatbuffers::ForwardsUOffset<wb::Value<'_>>>,
) -> Result<()> {
let row_count = self.row_count();
// insert new columns and validate existing ones
for value in values {
let column_name = value
.column()
.context(ColumnNameNotInRow { table: self.id })?;
let column_id = dictionary.lookup_value_or_insert(column_name);
let column = match self.columns.get_mut(&column_id) {
Some(col) => col,
None => {
// Add the column and make all values for existing rows None
self.columns.insert(
column_id,
Column::with_value(dictionary, row_count, value)
.context(CreatingFromWal { column: column_id })?,
);
continue;
}
};
column.push(dictionary, &value).context(ColumnError {
column: column_name,
})?;
}
// make sure all the columns are of the same length
for col in self.columns.values_mut() {
col.push_none_if_len_equal(row_count);
}
Ok(())
}
pub fn row_count(&self) -> usize {
self.columns
.values()
@ -201,55 +144,124 @@ impl Table {
}
/// Returns a reference to the specified column
pub(crate) fn column(&self, column_id: u32) -> Result<&Column> {
pub(crate) fn column(&self, column_id: DID) -> Result<&Column> {
self.columns.get(&column_id).context(ColumnIdNotFound {
id: column_id,
table_id: self.id,
})
}
/// Returns a reference to the specified column as a slice of
/// i64s. Errors if the type is not i64
pub fn column_i64(&self, column_id: u32) -> Result<&[Option<i64>]> {
let column = self.column(column_id)?;
match column {
Column::I64(vals, _) => Ok(vals),
_ => InternalColumnTypeMismatch {
column_id,
expected_column_type: "i64",
actual_column_type: column.type_description(),
}
.fail(),
}
}
pub fn append_rows(
/// Validates the schema of the passed in columns, then adds their values to
/// the associated columns in the table and updates summary statistics.
pub fn write_columns(
&mut self,
dictionary: &mut Dictionary,
rows: &flatbuffers::Vector<'_, flatbuffers::ForwardsUOffset<wb::Row<'_>>>,
_clock_value: ClockValue,
_writer_id: WriterId,
columns: Vec<entry::Column<'_>>,
) -> Result<()> {
for row in rows {
if let Some(values) = row.values() {
self.append_row(dictionary, &values)?;
// get the column ids and validate schema for those that already exist
let columns_with_inserts = columns
.into_iter()
.map(|insert_column| {
let column_id = dictionary.lookup_value_or_insert(insert_column.name());
let values = insert_column.values();
if let Some(c) = self.columns.get(&column_id) {
match (&values, c) {
(entry::TypedValuesIterator::Bool(_), Column::Bool(_, _)) => (),
(entry::TypedValuesIterator::U64(_), Column::U64(_, _)) => (),
(entry::TypedValuesIterator::F64(_), Column::F64(_, _)) => (),
(entry::TypedValuesIterator::I64(_), Column::I64(_, _)) => (),
(entry::TypedValuesIterator::String(_), Column::String(_, _)) => {
if !insert_column.is_field() {
InternalColumnTypeMismatch {
column_id,
expected_column_type: c.type_description(),
actual_column_type: values.type_description(),
}
.fail()?
};
}
(entry::TypedValuesIterator::String(_), Column::Tag(_, _)) => {
if !insert_column.is_tag() {
InternalColumnTypeMismatch {
column_id,
expected_column_type: c.type_description(),
actual_column_type: values.type_description(),
}
.fail()?
};
}
_ => InternalColumnTypeMismatch {
column_id,
expected_column_type: c.type_description(),
actual_column_type: values.type_description(),
}
.fail()?,
}
}
Ok((column_id, insert_column.logical_type(), values))
})
.collect::<Result<Vec<_>>>()?;
let row_count_before_insert = self.row_count();
for (column_id, logical_type, values) in columns_with_inserts.into_iter() {
match self.columns.get_mut(&column_id) {
Some(c) => c
.push_typed_values(dictionary, logical_type, values)
.with_context(|| {
let column = dictionary
.lookup_id(column_id)
.expect("column name must be present in dictionary");
ColumnError { column }
})?,
None => {
self.columns.insert(
column_id,
Column::new_from_typed_values(
dictionary,
row_count_before_insert,
logical_type,
values,
),
);
}
}
}
// ensure all columns have the same number of rows as the one with the most.
// This adds nulls to the columns that weren't included in this write
let max_row_count = self
.columns
.values()
.fold(row_count_before_insert, |max, col| cmp::max(max, col.len()));
for c in self.columns.values_mut() {
c.push_nulls_to_len(max_row_count);
}
Ok(())
}
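// Editor's sketch (not part of this diff): the row-alignment step at the end
// of write_columns. Columns that were absent from a write are padded with
// nulls up to the longest column so every column keeps the same row count.
fn pad_columns_to_max(columns: &mut [Vec<Option<i64>>], row_count_before_insert: usize) {
    let max_rows = columns
        .iter()
        .fold(row_count_before_insert, |max, col| max.max(col.len()));
    for col in columns.iter_mut() {
        if max_rows > col.len() {
            col.resize(max_rows, None);
        }
    }
}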
/// Returns the column selection for all the columns in this table, ordered
/// by column id
fn all_columns_selection<'a>(&self, chunk: &'a Chunk) -> Result<TableColSelection<'a>> {
fn all_columns_selection<'a>(
&self,
dictionary: &'a Dictionary,
) -> Result<TableColSelection<'a>> {
let cols = self
.columns
.iter()
.map(|(column_id, _)| {
let column_name = chunk.dictionary.lookup_id(*column_id).context(
ColumnIdNotFoundInDictionary {
column_id: *column_id,
chunk: chunk.id,
},
)?;
let column_name =
dictionary
.lookup_id(*column_id)
.context(ColumnIdNotFoundInDictionary {
column_id: *column_id,
})?;
Ok(ColSelection {
column_name,
column_id: *column_id,
@ -266,45 +278,45 @@ impl Table {
/// Returns a column selection for just the specified columns
fn specific_columns_selection<'a>(
&self,
chunk: &'a Chunk,
dictionary: &'a Dictionary,
columns: &'a [&'a str],
) -> Result<TableColSelection<'a>> {
let cols =
columns
.iter()
.map(|&column_name| {
let column_id = chunk.dictionary.id(column_name).context(
ColumnNameNotFoundInDictionary {
column_name,
chunk: chunk.id,
},
)?;
let cols = columns
.iter()
.map(|&column_name| {
let column_id = dictionary
.id(column_name)
.context(ColumnNameNotFoundInDictionary { column_name })?;
Ok(ColSelection {
column_name,
column_id,
})
Ok(ColSelection {
column_name,
column_id,
})
.collect::<Result<_>>()?;
})
.collect::<Result<_>>()?;
Ok(TableColSelection { cols })
}
/// Converts this table to an arrow record batch.
pub fn to_arrow(&self, chunk: &Chunk, selection: Selection<'_>) -> Result<RecordBatch> {
pub fn to_arrow(
&self,
dictionary: &Dictionary,
selection: Selection<'_>,
) -> Result<RecordBatch> {
// translate chunk selection into name/indexes:
let selection = match selection {
Selection::All => self.all_columns_selection(chunk),
Selection::Some(cols) => self.specific_columns_selection(chunk, cols),
Selection::All => self.all_columns_selection(dictionary),
Selection::Some(cols) => self.specific_columns_selection(dictionary, cols),
}?;
self.to_arrow_impl(chunk, &selection)
self.to_arrow_impl(dictionary, &selection)
}
pub fn schema(&self, chunk: &Chunk, selection: Selection<'_>) -> Result<Schema> {
pub fn schema(&self, dictionary: &Dictionary, selection: Selection<'_>) -> Result<Schema> {
// translate chunk selection into name/indexes:
let selection = match selection {
Selection::All => self.all_columns_selection(chunk),
Selection::Some(cols) => self.specific_columns_selection(chunk, cols),
Selection::All => self.all_columns_selection(dictionary),
Selection::Some(cols) => self.specific_columns_selection(dictionary, cols),
}?;
self.schema_impl(&selection)
}
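// Editor's sketch (not part of this diff): translating a selection into
// (name, id) pairs, mirroring all_columns_selection / specific_columns_selection.
// The lookup table and ColSel enum are stand-ins for the dictionary and the
// real Selection type.
enum ColSel<'a> {
    All,
    Some(&'a [&'a str]),
}

fn resolve<'a>(
    all: &'a [(&'a str, u32)],
    selection: ColSel<'a>,
) -> Result<Vec<(&'a str, u32)>, String> {
    match selection {
        ColSel::All => Ok(all.to_vec()),
        ColSel::Some(cols) => cols
            .iter()
            .map(|&name| {
                all.iter()
                    .find(|(n, _)| *n == name)
                    .copied()
                    .ok_or_else(|| format!("Column name '{}' not found in dictionary", name))
            })
            .collect(),
    }
}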
@ -341,7 +353,7 @@ impl Table {
/// requested columns with index are tuples of column_name, column_index
fn to_arrow_impl(
&self,
chunk: &Chunk,
dictionary: &Dictionary,
selection: &TableColSelection<'_>,
) -> Result<RecordBatch> {
let mut columns = Vec::with_capacity(selection.cols.len());
@ -370,12 +382,9 @@ impl Table {
match v {
None => builder.append_null(),
Some(value_id) => {
let tag_value = chunk.dictionary.lookup_id(*value_id).context(
TagValueIdNotFoundInDictionary {
value: *value_id,
chunk: chunk.id,
},
)?;
let tag_value = dictionary
.lookup_id(*value_id)
.context(TagValueIdNotFoundInDictionary { value: *value_id })?;
builder.append_value(tag_value)
}
}
@ -430,124 +439,11 @@ impl Table {
RecordBatch::try_new(schema, columns).context(ArrowError {})
}
/// returns true if any row in this table could possibly match the
/// predicate. true does not mean any rows will *actually* match,
/// just that the entire table cannot be ruled out.
///
/// false means that no rows in this table could possibly match
pub fn could_match_predicate(&self, chunk_predicate: &ChunkPredicate) -> Result<bool> {
Ok(
self.matches_column_name_predicate(chunk_predicate.field_name_predicate.as_ref())
&& self.matches_table_name_predicate(chunk_predicate.table_name_predicate.as_ref())
&& self.matches_timestamp_predicate(chunk_predicate)?
&& self.has_columns(chunk_predicate.required_columns.as_ref()),
)
}
/// Returns true if the table contains any of the field columns
/// requested or there are no specific fields requested.
fn matches_column_name_predicate(&self, column_selection: Option<&BTreeSet<u32>>) -> bool {
match column_selection {
Some(column_selection) => {
for column_id in column_selection {
if let Some(column) = self.columns.get(column_id) {
if !column.is_tag() {
return true;
}
}
}
// selection only had tag columns
false
}
None => true, // no specific selection
}
}
fn matches_table_name_predicate(&self, table_name_predicate: Option<&BTreeSet<u32>>) -> bool {
match table_name_predicate {
Some(table_name_predicate) => table_name_predicate.contains(&self.id),
None => true, // no table predicate
}
}
/// returns true if there are any timestamps in this table that
/// fall within the timestamp range
fn matches_timestamp_predicate(&self, chunk_predicate: &ChunkPredicate) -> Result<bool> {
match &chunk_predicate.range {
None => Ok(true),
Some(range) => {
let time_column_id = chunk_predicate.time_column_id;
let time_column = self.column(time_column_id)?;
time_column.has_i64_range(range.start, range.end).context(
ColumnPredicateEvaluation {
column: time_column_id,
},
)
}
}
}
/// returns true if no columns are specified, or the table has all
/// columns specified
fn has_columns(&self, columns: Option<&ChunkIdSet>) -> bool {
if let Some(columns) = columns {
match columns {
ChunkIdSet::AtLeastOneMissing => return false,
ChunkIdSet::Present(symbols) => {
for symbol in symbols {
if !self.columns.contains_key(symbol) {
return false;
}
}
}
}
}
true
}
/// returns true if there are any rows in column that are non-null
/// and within the timestamp range specified by pred
pub(crate) fn column_matches_predicate(
&self,
column: &Column,
chunk_predicate: &ChunkPredicate,
) -> Result<bool> {
match column {
Column::F64(v, _) => self.column_value_matches_predicate(v, chunk_predicate),
Column::I64(v, _) => self.column_value_matches_predicate(v, chunk_predicate),
Column::U64(v, _) => self.column_value_matches_predicate(v, chunk_predicate),
Column::String(v, _) => self.column_value_matches_predicate(v, chunk_predicate),
Column::Bool(v, _) => self.column_value_matches_predicate(v, chunk_predicate),
Column::Tag(v, _) => self.column_value_matches_predicate(v, chunk_predicate),
}
}
fn column_value_matches_predicate<T>(
&self,
column_value: &[Option<T>],
chunk_predicate: &ChunkPredicate,
) -> Result<bool> {
match chunk_predicate.range {
None => Ok(true),
Some(range) => {
let time_column_id = chunk_predicate.time_column_id;
let time_column = self.column(time_column_id)?;
time_column
.has_non_null_i64_range(column_value, range.start, range.end)
.context(ColumnPredicateEvaluation {
column: time_column_id,
})
}
}
}
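The range check above boils down to a row-wise scan over a value column and the parallel time column; a minimal stand-alone sketch (half-open range, plain slices instead of the buffer's Column type) is:
/// Returns true if any row has a non-null value and a timestamp in [start, end).
fn has_non_null_in_range<T>(values: &[Option<T>], timestamps: &[i64], start: i64, end: i64) -> bool {
    values
        .iter()
        .zip(timestamps)
        .any(|(v, &ts)| v.is_some() && ts >= start && ts < end)
}

fn main() {
    let temps = vec![Some(70.4), None, Some(72.4)];
    let times = vec![100, 200, 250];
    assert!(has_non_null_in_range(&temps, &times, 240, 300));
    // The only row in [150, 240) has a null value, so it does not count.
    assert!(!has_non_null_in_range(&temps, &times, 150, 240));
    println!("range checks passed");
}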
pub fn stats(&self, chunk: &Chunk) -> Vec<ColumnSummary> {
pub fn stats(&self, dictionary: &Dictionary) -> Vec<ColumnSummary> {
self.columns
.iter()
.map(|(column_id, c)| {
let column_name = chunk
.dictionary
let column_name = dictionary
.lookup_id(*column_id)
.expect("column name in dictionary");
@ -572,7 +468,7 @@ impl Table {
struct ColSelection<'a> {
column_name: &'a str,
column_id: u32,
column_id: DID,
}
/// Represents a set of column_name, column_index pairs
@ -591,61 +487,13 @@ impl<'a> TableColSelection<'a> {
#[cfg(test)]
mod tests {
use influxdb_line_protocol::{parse_lines, ParsedLine};
use internal_types::data::split_lines_into_write_entry_partitions;
use internal_types::entry::test_helpers::lp_to_entry;
use super::*;
use tracker::MemRegistry;
#[test]
fn test_has_columns() {
let registry = Arc::new(MemRegistry::new());
let mut chunk = Chunk::new(42, registry.as_ref());
let dictionary = &mut chunk.dictionary;
let mut table = Table::new(dictionary.lookup_value_or_insert("table_name"));
let lp_lines = vec![
"h2o,state=MA,city=Boston temp=70.4 100",
"h2o,state=MA,city=Boston temp=72.4 250",
];
write_lines_to_table(&mut table, dictionary, lp_lines);
let state_symbol = dictionary.id("state").unwrap();
let new_symbol = dictionary.lookup_value_or_insert("not_a_columns");
assert!(table.has_columns(None));
let pred = ChunkIdSet::AtLeastOneMissing;
assert!(!table.has_columns(Some(&pred)));
let set = BTreeSet::<u32>::new();
let pred = ChunkIdSet::Present(set);
assert!(table.has_columns(Some(&pred)));
let mut set = BTreeSet::new();
set.insert(state_symbol);
let pred = ChunkIdSet::Present(set);
assert!(table.has_columns(Some(&pred)));
let mut set = BTreeSet::new();
set.insert(new_symbol);
let pred = ChunkIdSet::Present(set);
assert!(!table.has_columns(Some(&pred)));
let mut set = BTreeSet::new();
set.insert(state_symbol);
set.insert(new_symbol);
let pred = ChunkIdSet::Present(set);
assert!(!table.has_columns(Some(&pred)));
}
#[test]
fn table_size() {
let registry = Arc::new(MemRegistry::new());
let mut chunk = Chunk::new(42, registry.as_ref());
let dictionary = &mut chunk.dictionary;
let mut dictionary = Dictionary::new();
let mut table = Table::new(dictionary.lookup_value_or_insert("table_name"));
let lp_lines = vec![
@ -653,111 +501,31 @@ mod tests {
"h2o,state=MA,city=Boston temp=72.4 250",
];
write_lines_to_table(&mut table, dictionary, lp_lines.clone());
assert_eq!(128, table.size());
write_lines_to_table(&mut table, &mut dictionary, lp_lines.clone());
assert_eq!(112, table.size());
// doesn't double because of the stats overhead
write_lines_to_table(&mut table, dictionary, lp_lines.clone());
assert_eq!(224, table.size());
write_lines_to_table(&mut table, &mut dictionary, lp_lines.clone());
assert_eq!(192, table.size());
// now make sure it increased by the same amount minus stats overhead
write_lines_to_table(&mut table, dictionary, lp_lines);
assert_eq!(320, table.size());
}
#[test]
fn test_matches_table_name_predicate() {
let registry = Arc::new(MemRegistry::new());
let mut chunk = Chunk::new(42, registry.as_ref());
let dictionary = &mut chunk.dictionary;
let mut table = Table::new(dictionary.lookup_value_or_insert("h2o"));
let lp_lines = vec![
"h2o,state=MA,city=Boston temp=70.4 100",
"h2o,state=MA,city=Boston temp=72.4 250",
];
write_lines_to_table(&mut table, dictionary, lp_lines);
let h2o_symbol = dictionary.id("h2o").unwrap();
assert!(table.matches_table_name_predicate(None));
let set = BTreeSet::new();
assert!(!table.matches_table_name_predicate(Some(&set)));
let mut set = BTreeSet::new();
set.insert(h2o_symbol);
assert!(table.matches_table_name_predicate(Some(&set)));
// Some symbol that is not the same as h2o_symbol
assert_ne!(37377, h2o_symbol);
let mut set = BTreeSet::new();
set.insert(37377);
assert!(!table.matches_table_name_predicate(Some(&set)));
}
#[test]
fn test_matches_column_name_predicate() {
let registry = Arc::new(MemRegistry::new());
let mut chunk = Chunk::new(42, registry.as_ref());
let dictionary = &mut chunk.dictionary;
let mut table = Table::new(dictionary.lookup_value_or_insert("h2o"));
let lp_lines = vec![
"h2o,state=MA,city=Boston temp=70.4,awesomeness=1000 100",
"h2o,state=MA,city=Boston temp=72.4,awesomeness=2000 250",
];
write_lines_to_table(&mut table, dictionary, lp_lines);
let state_symbol = dictionary.id("state").unwrap();
let temp_symbol = dictionary.id("temp").unwrap();
let awesomeness_symbol = dictionary.id("awesomeness").unwrap();
assert!(table.matches_column_name_predicate(None));
let set = BTreeSet::new();
assert!(!table.matches_column_name_predicate(Some(&set)));
// tag columns should not count
let mut set = BTreeSet::new();
set.insert(state_symbol);
assert!(!table.matches_column_name_predicate(Some(&set)));
let mut set = BTreeSet::new();
set.insert(temp_symbol);
assert!(table.matches_column_name_predicate(Some(&set)));
let mut set = BTreeSet::new();
set.insert(temp_symbol);
set.insert(awesomeness_symbol);
assert!(table.matches_column_name_predicate(Some(&set)));
let mut set = BTreeSet::new();
set.insert(temp_symbol);
set.insert(awesomeness_symbol);
set.insert(1337); // some other symbol, but that is ok
assert!(table.matches_column_name_predicate(Some(&set)));
let mut set = BTreeSet::new();
set.insert(1337);
assert!(!table.matches_column_name_predicate(Some(&set)));
write_lines_to_table(&mut table, &mut dictionary, lp_lines);
assert_eq!(272, table.size());
}
#[test]
fn test_to_arrow_schema_all() {
let registry = Arc::new(MemRegistry::new());
let mut chunk = Chunk::new(42, registry.as_ref());
let dictionary = &mut chunk.dictionary;
let mut dictionary = Dictionary::new();
let mut table = Table::new(dictionary.lookup_value_or_insert("table_name"));
let lp_lines = vec![
"h2o,state=MA,city=Boston float_field=70.4,int_field=8i,uint_field=42u,bool_field=t,string_field=\"foo\" 100",
];
write_lines_to_table(&mut table, dictionary, lp_lines);
write_lines_to_table(&mut table, &mut dictionary, lp_lines);
let selection = Selection::All;
let actual_schema = table.schema(&chunk, selection).unwrap();
let actual_schema = table.schema(&dictionary, selection).unwrap();
let expected_schema = SchemaBuilder::new()
.field("bool_field", ArrowDataType::Boolean)
.tag("city")
@ -779,17 +547,15 @@ mod tests {
#[test]
fn test_to_arrow_schema_subset() {
let registry = Arc::new(MemRegistry::new());
let mut chunk = Chunk::new(42, registry.as_ref());
let dictionary = &mut chunk.dictionary;
let mut dictionary = Dictionary::new();
let mut table = Table::new(dictionary.lookup_value_or_insert("table_name"));
let lp_lines = vec!["h2o,state=MA,city=Boston float_field=70.4 100"];
write_lines_to_table(&mut table, dictionary, lp_lines);
write_lines_to_table(&mut table, &mut dictionary, lp_lines);
let selection = Selection::Some(&["float_field"]);
let actual_schema = table.schema(&chunk, selection).unwrap();
let actual_schema = table.schema(&dictionary, selection).unwrap();
let expected_schema = SchemaBuilder::new()
.field("float_field", ArrowDataType::Float64)
.build()
@ -802,29 +568,172 @@ mod tests {
);
}
#[test]
fn write_columns_validates_schema() {
let mut dictionary = Dictionary::new();
let mut table = Table::new(dictionary.lookup_value_or_insert("foo"));
let lp = "foo,t1=asdf iv=1i,uv=1u,fv=1.0,bv=true,sv=\"hi\" 1";
let entry = lp_to_entry(&lp);
table
.write_columns(
&mut dictionary,
ClockValue::new(0),
0,
entry
.partition_writes()
.unwrap()
.first()
.unwrap()
.table_batches()
.first()
.unwrap()
.columns(),
)
.unwrap();
let lp = "foo t1=\"string\" 1";
let entry = lp_to_entry(&lp);
let response = table
.write_columns(
&mut dictionary,
ClockValue::new(0),
0,
entry
.partition_writes()
.unwrap()
.first()
.unwrap()
.table_batches()
.first()
.unwrap()
.columns(),
)
.err()
.unwrap();
assert!(
matches!(
&response,
Error::InternalColumnTypeMismatch {
expected_column_type,
actual_column_type,
..
} if expected_column_type == "tag" && actual_column_type == "String"),
format!("didn't match returned error: {:?}", response)
);
let lp = "foo iv=1u 1";
let entry = lp_to_entry(&lp);
let response = table
.write_columns(
&mut dictionary,
ClockValue::new(0),
0,
entry
.partition_writes()
.unwrap()
.first()
.unwrap()
.table_batches()
.first()
.unwrap()
.columns(),
)
.err()
.unwrap();
assert!(
matches!(&response, Error::InternalColumnTypeMismatch {expected_column_type, actual_column_type, ..} if expected_column_type == "i64" && actual_column_type == "u64"),
format!("didn't match returned error: {:?}", response)
);
let lp = "foo fv=1i 1";
let entry = lp_to_entry(&lp);
let response = table
.write_columns(
&mut dictionary,
ClockValue::new(0),
0,
entry
.partition_writes()
.unwrap()
.first()
.unwrap()
.table_batches()
.first()
.unwrap()
.columns(),
)
.err()
.unwrap();
assert!(
matches!(&response, Error::InternalColumnTypeMismatch {expected_column_type, actual_column_type, ..} if expected_column_type == "f64" && actual_column_type == "i64"),
format!("didn't match returned error: {:?}", response)
);
let lp = "foo bv=1 1";
let entry = lp_to_entry(&lp);
let response = table
.write_columns(
&mut dictionary,
ClockValue::new(0),
0,
entry
.partition_writes()
.unwrap()
.first()
.unwrap()
.table_batches()
.first()
.unwrap()
.columns(),
)
.err()
.unwrap();
assert!(
matches!(&response, Error::InternalColumnTypeMismatch {expected_column_type, actual_column_type, ..} if expected_column_type == "bool" && actual_column_type == "f64"),
format!("didn't match returned error: {:?}", response)
);
let lp = "foo sv=true 1";
let entry = lp_to_entry(&lp);
let response = table
.write_columns(
&mut dictionary,
ClockValue::new(0),
0,
entry
.partition_writes()
.unwrap()
.first()
.unwrap()
.table_batches()
.first()
.unwrap()
.columns(),
)
.err()
.unwrap();
assert!(
matches!(&response, Error::InternalColumnTypeMismatch {expected_column_type, actual_column_type, ..} if expected_column_type == "String" && actual_column_type == "bool"),
format!("didn't match returned error: {:?}", response)
);
}
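What these assertions exercise is a per-column type check on write; a stand-alone sketch of that rule (the enum and function names here are illustrative, not the mutable buffer's API) is:
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq)]
enum ColumnType {
    Tag,
    I64,
    U64,
    F64,
    Bool,
    String,
}

#[derive(Debug)]
struct ColumnTypeMismatch {
    expected: ColumnType,
    actual: ColumnType,
}

/// A column keeps the type it was created with; later writes must agree.
fn check_write(existing: ColumnType, incoming: ColumnType) -> Result<(), ColumnTypeMismatch> {
    if existing == incoming {
        Ok(())
    } else {
        Err(ColumnTypeMismatch {
            expected: existing,
            actual: incoming,
        })
    }
}

fn main() {
    // "iv" was first written as an i64 field; a later u64 write is rejected.
    assert!(check_write(ColumnType::I64, ColumnType::I64).is_ok());
    let err = check_write(ColumnType::I64, ColumnType::U64).unwrap_err();
    println!("rejected write: {:?}", err);
}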
/// Insert the line protocol lines in `lp_lines` into this table
fn write_lines_to_table(table: &mut Table, dictionary: &mut Dictionary, lp_lines: Vec<&str>) {
let lp_data = lp_lines.join("\n");
let entry = lp_to_entry(&lp_data);
let lines: Vec<_> = parse_lines(&lp_data).map(|l| l.unwrap()).collect();
let data = split_lines_into_write_entry_partitions(chunk_key_func, &lines);
let batch = flatbuffers::root::<wb::WriteBufferBatch<'_>>(&data).unwrap();
let entries = batch.entries().expect("at least one entry");
for entry in entries {
let table_batches = entry.table_batches().expect("there were table batches");
for batch in table_batches {
let rows = batch.rows().expect("Had rows in the batch");
table
.append_rows(dictionary, &rows)
.expect("Appended the row");
}
for batch in entry
.partition_writes()
.unwrap()
.first()
.unwrap()
.table_batches()
{
table
.write_columns(dictionary, ClockValue::new(0), 0, batch.columns())
.unwrap();
}
}
fn chunk_key_func(_: &ParsedLine<'_>) -> String {
String::from("the_chunk_key")
}
}

View File

@ -14,7 +14,7 @@ bytes = "1.0"
chrono = "0.4"
# Google Cloud Storage integration
cloud-storage = "0.9.0"
futures = "0.3.5"
futures = "0.3"
itertools = "0.9.0"
percent-encoding = "2.1"
# rusoto crates are for Amazon S3 integration

View File

@ -9,6 +9,7 @@ arrow_deps = { path = "../arrow_deps" }
bytes = "1.0"
data_types = { path = "../data_types" }
futures = "0.3.7"
internal_types = {path = "../internal_types"}
object_store = {path = "../object_store"}
parking_lot = "0.11.1"
snafu = "0.6"

View File

@ -1,22 +1,44 @@
use snafu::{OptionExt, ResultExt, Snafu};
use std::collections::BTreeSet;
use crate::table::Table;
use data_types::partition_metadata::TableSummary;
use data_types::{partition_metadata::TableSummary, timestamp::TimestampRange};
use internal_types::{schema::Schema, selection::Selection};
use object_store::path::Path;
use tracker::{MemRegistry, MemTracker};
use std::mem;
#[derive(Debug, Snafu)]
pub enum Error {
#[snafu(display("Error writing table '{}': {}", table_name, source))]
TableWrite {
table_name: String,
source: crate::table::Error,
},
#[snafu(display("Table Error in '{}': {}", table_name, source))]
NamedTableError {
table_name: String,
source: crate::table::Error,
},
#[snafu(display("Table '{}' not found in chunk {}", table_name, chunk_id))]
NamedTableNotFoundInChunk { table_name: String, chunk_id: u64 },
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
#[derive(Debug)]
pub struct Chunk {
/// Partition this chunk belongs to
pub partition_key: String,
partition_key: String,
/// The id for this chunk
pub id: u32,
id: u32,
/// Tables of this chunk
pub tables: Vec<Table>,
tables: Vec<Table>,
/// Track memory used by this chunk
memory_tracker: MemTracker,
@ -34,9 +56,36 @@ impl Chunk {
chunk
}
/// Return the chunk id
pub fn id(&self) -> u32 {
self.id
}
/// Return the chunk's partition key
pub fn partition_key(&self) -> &str {
self.partition_key.as_ref()
}
/// Return all paths of this chunk's tables
pub fn all_paths(&self) -> Vec<Path> {
self.tables.iter().map(|t| t.path()).collect()
}
/// Returns a vec of the summary statistics of the tables in this chunk
pub fn table_summaries(&self) -> Vec<TableSummary> {
self.tables.iter().map(|t| t.table_summary()).collect()
}
/// Add a chunk's table and its summary
pub fn add_table(&mut self, table_summary: TableSummary, file_location: Path) {
self.tables.push(Table::new(table_summary, file_location));
pub fn add_table(
&mut self,
table_summary: TableSummary,
file_location: Path,
schema: Schema,
range: Option<TimestampRange>,
) {
self.tables
.push(Table::new(table_summary, file_location, schema, range));
}
/// Return true if this chunk includes the given table
@ -62,4 +111,33 @@ impl Chunk {
size + self.partition_key.len() + mem::size_of::<u32>() + mem::size_of::<Self>()
}
/// Return Schema for the specified table / columns
pub fn table_schema(&self, table_name: &str, selection: Selection<'_>) -> Result<Schema> {
let table = self
.tables
.iter()
.find(|t| t.has_table(table_name))
.context(NamedTableNotFoundInChunk {
table_name,
chunk_id: self.id(),
})?;
table
.schema(selection)
.context(NamedTableError { table_name })
}
pub fn table_names(
&self,
timestamp_range: Option<TimestampRange>,
) -> impl Iterator<Item = String> + '_ {
self.tables.iter().flat_map(move |t| {
if t.matches_predicate(&timestamp_range) {
Some(t.name())
} else {
None
}
})
}
}
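A small stand-alone sketch of the table_names filter above: flat_map over an Option lets each table contribute either its name or nothing, depending on whether its time range matches (the bool field here is a stand-in for matches_predicate).
struct Table {
    name: String,
    in_range: bool, // stand-in for matches_predicate(&timestamp_range)
}

fn table_names(tables: &[Table]) -> impl Iterator<Item = String> + '_ {
    tables
        .iter()
        .flat_map(|t| if t.in_range { Some(t.name.clone()) } else { None })
}

fn main() {
    let tables = vec![
        Table { name: "h2o".to_string(), in_range: true },
        Table { name: "cpu".to_string(), in_range: false },
    ];
    let names: Vec<_> = table_names(&tables).collect();
    assert_eq!(names, vec!["h2o".to_string()]);
    println!("{:?}", names);
}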

View File

@ -1,28 +1,57 @@
use data_types::partition_metadata::TableSummary;
use snafu::{ResultExt, Snafu};
use std::mem;
use data_types::{partition_metadata::TableSummary, timestamp::TimestampRange};
use internal_types::{schema::Schema, selection::Selection};
use object_store::path::Path;
use std::mem;
#[derive(Debug, Snafu)]
pub enum Error {
#[snafu(display("Failed to select columns: {}", source))]
SelectColumns {
source: internal_types::schema::Error,
},
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
/// Table that belongs to a chunk persisted in a parquet file in object store
#[derive(Debug, Clone)]
pub struct Table {
/// Meta data of the table
pub table_summary: TableSummary,
table_summary: TableSummary,
/// Path in the object store. Format:
/// <writer id>/<database>/data/<partition key>/<chunk
/// id>/<tablename>.parquet
pub object_store_path: Path,
object_store_path: Path,
/// Schema that goes with this table's parquet file
table_schema: Schema,
/// Timestamp range of this table's parquet file
timestamp_range: Option<TimestampRange>,
}
impl Table {
pub fn new(meta: TableSummary, path: Path) -> Self {
pub fn new(
meta: TableSummary,
path: Path,
schema: Schema,
range: Option<TimestampRange>,
) -> Self {
Self {
table_summary: meta,
object_store_path: path,
table_schema: schema,
timestamp_range: range,
}
}
pub fn table_summary(&self) -> TableSummary {
self.table_summary.clone()
}
pub fn has_table(&self, table_name: &str) -> bool {
self.table_summary.has_table(table_name)
}
@ -32,10 +61,36 @@ impl Table {
mem::size_of::<Self>()
+ self.table_summary.size()
+ mem::size_of_val(&self.object_store_path)
+ mem::size_of_val(&self.table_schema)
}
/// Return name of this table
pub fn name(&self) -> String {
self.table_summary.name.clone()
}
/// Return the object store path of this table
pub fn path(&self) -> Path {
self.object_store_path.clone()
}
/// Return the schema of this table for the specified column selection
pub fn schema(&self, selection: Selection<'_>) -> Result<Schema> {
Ok(match selection {
Selection::All => self.table_schema.clone(),
Selection::Some(columns) => {
let columns = self.table_schema.select(columns).context(SelectColumns)?;
self.table_schema.project(&columns)
}
})
}
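A dependency-free sketch of the same Selection logic, with the schema modeled as a Vec of (column name, type name) pairs rather than internal_types::schema::Schema (the project function is illustrative, not the crate's API):
#[derive(Debug)]
enum Selection<'a> {
    All,
    Some(&'a [&'a str]),
}

type Schema = Vec<(String, String)>; // (column name, arrow type name)

fn project(schema: &Schema, selection: Selection<'_>) -> Result<Schema, String> {
    match selection {
        Selection::All => Ok(schema.clone()),
        Selection::Some(columns) => columns
            .iter()
            .map(|&name| {
                schema
                    .iter()
                    .find(|(n, _)| n.as_str() == name)
                    .cloned()
                    .ok_or_else(|| format!("column '{}' not in schema", name))
            })
            .collect(),
    }
}

fn main() {
    let schema: Schema = vec![
        ("state".to_string(), "Dictionary".to_string()),
        ("temp".to_string(), "Float64".to_string()),
        ("time".to_string(), "Timestamp".to_string()),
    ];
    let projected = project(&schema, Selection::Some(&["temp"])).unwrap();
    assert_eq!(projected.len(), 1);
    println!("{:?}", projected);
    assert!(project(&schema, Selection::Some(&["nope"])).is_err());
}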
pub fn matches_predicate(&self, timestamp_range: &Option<TimestampRange>) -> bool {
match (self.timestamp_range, timestamp_range) {
(Some(a), Some(b)) => !a.disjoint(b),
(None, Some(_)) => false, // if this chunk doesn't have a time column it can't match the predicate
(_, None) => true,
}
}
}
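The pruning rule in matches_predicate reads as a small truth table; a stand-alone sketch with a simple inclusive-start/exclusive-end range (the real TimestampRange lives in data_types and already provides disjoint) looks like:
#[derive(Clone, Copy, Debug)]
struct TimestampRange {
    start: i64, // inclusive
    end: i64,   // exclusive
}

impl TimestampRange {
    fn disjoint(&self, other: &Self) -> bool {
        self.end <= other.start || other.end <= self.start
    }
}

/// table_range: time range covered by the table's parquet file, if it has a time column.
/// query_range: time range requested by the predicate, if any.
fn matches_predicate(
    table_range: Option<TimestampRange>,
    query_range: Option<TimestampRange>,
) -> bool {
    match (table_range, query_range) {
        (Some(a), Some(b)) => !a.disjoint(&b),
        // No time column in the table: it cannot satisfy a time-bounded predicate.
        (None, Some(_)) => false,
        // No time bound in the predicate: every table matches.
        (_, None) => true,
    }
}

fn main() {
    let table = Some(TimestampRange { start: 100, end: 300 });
    assert!(matches_predicate(table, Some(TimestampRange { start: 250, end: 400 })));
    assert!(!matches_predicate(table, Some(TimestampRange { start: 300, end: 400 })));
    assert!(matches_predicate(table, None));
    assert!(!matches_predicate(None, Some(TimestampRange { start: 0, end: 10 })));
}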

View File

@ -19,7 +19,7 @@ async-trait = "0.1"
chrono = "0.4"
croaring = "0.4.5"
data_types = { path = "../data_types" }
futures = "0.3.7"
futures = "0.3"
influxdb_line_protocol = { path = "../influxdb_line_protocol" }
internal_types = { path = "../internal_types" }
parking_lot = "0.11.1"
@ -29,5 +29,9 @@ tokio = { version = "1.0", features = ["macros"] }
tokio-stream = "0.1.2"
observability_deps = { path = "../observability_deps" }
# use libc on unix like platforms to set worker priority in DedicatedExecutor
[target."cfg(unix)".dependencies.libc]
version = "0.2"
[dev-dependencies] # In alphabetical order
test_helpers = { path = "../test_helpers" }

View File

@ -8,13 +8,14 @@ pub mod fieldlist;
mod schema_pivot;
pub mod seriesset;
pub mod stringset;
mod task;
pub use context::{DEFAULT_CATALOG, DEFAULT_SCHEMA};
use std::sync::Arc;
use arrow_deps::{
arrow::record_batch::RecordBatch,
datafusion::{self, logical_plan::LogicalPlan},
datafusion::{self, logical_plan::LogicalPlan, physical_plan::ExecutionPlan},
};
use counters::ExecutionCounters;
@ -34,6 +35,8 @@ use crate::plan::{
stringset::StringSetPlan,
};
use self::task::{DedicatedExecutor, Error as ExecutorError};
#[derive(Debug, Snafu)]
pub enum Error {
#[snafu(display("Plan Execution Error: {}", source))]
@ -84,21 +87,29 @@ pub enum Error {
},
#[snafu(display("Joining execution task: {}", source))]
JoinError { source: tokio::task::JoinError },
JoinError { source: ExecutorError },
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
/// Handles executing plans, and marshalling the results into rust
/// Handles executing DataFusion plans, and marshalling the results into rust
/// native structures.
#[derive(Debug, Default)]
#[derive(Debug)]
pub struct Executor {
counters: Arc<ExecutionCounters>,
exec: DedicatedExecutor,
}
impl Executor {
pub fn new() -> Self {
Self::default()
/// Creates a new executor with a single dedicated thread pool of
/// `num_threads` threads
pub fn new(num_threads: usize) -> Self {
let exec = DedicatedExecutor::new("IOx Executor Thread", num_threads);
Self {
exec,
counters: Arc::new(ExecutionCounters::default()),
}
}
/// Executes this plan and returns the resulting set of strings
@ -148,7 +159,7 @@ impl Executor {
let (plan_tx, plan_rx) = mpsc::channel(1);
rx_channels.push(plan_rx);
tokio::task::spawn(async move {
self.exec.spawn(async move {
let SeriesSetPlan {
table_name,
plan,
@ -161,7 +172,6 @@ impl Executor {
let physical_plan = ctx
.prepare_plan(&plan)
.await
.context(DataFusionPhysicalPlanning)?;
let it = ctx
@ -212,13 +222,10 @@ impl Executor {
let handles = plans
.into_iter()
.map(|plan| {
let counters = Arc::clone(&self.counters);
tokio::task::spawn(async move {
let ctx = IOxExecutionContext::new(counters);
let ctx = self.new_context();
self.exec.spawn(async move {
let physical_plan = ctx
.prepare_plan(&plan)
.await
.context(DataFusionPhysicalPlanning)?;
// TODO: avoid this buffering
@ -250,9 +257,18 @@ impl Executor {
self.run_logical_plans(vec![plan]).await
}
/// Executes the logical plan using DataFusion on a separate
/// thread pool and produces RecordBatches
pub async fn collect(&self, physical_plan: Arc<dyn ExecutionPlan>) -> Result<Vec<RecordBatch>> {
self.new_context()
.collect(physical_plan)
.await
.context(DataFusionExecution)
}
/// Create a new execution context, suitable for executing a new query
pub fn new_context(&self) -> IOxExecutionContext {
IOxExecutionContext::new(Arc::clone(&self.counters))
IOxExecutionContext::new(self.exec.clone(), Arc::clone(&self.counters))
}
/// plans and runs the plans in parallel and collects the results
@ -262,11 +278,10 @@ impl Executor {
.into_iter()
.map(|plan| {
let ctx = self.new_context();
// TODO run these on some executor other than the main tokio pool
tokio::task::spawn(async move {
self.exec.spawn(async move {
let physical_plan = ctx
.prepare_plan(&plan)
.await
.context(DataFusionPhysicalPlanning)?;
// TODO: avoid this buffering
@ -327,7 +342,7 @@ mod tests {
let expected_strings = to_set(&["Foo", "Bar"]);
let plan = StringSetPlan::Known(Arc::clone(&expected_strings));
let executor = Executor::default();
let executor = Executor::new(1);
let result_strings = executor.to_string_set(plan).await.unwrap();
assert_eq!(result_strings, expected_strings);
}
@ -339,7 +354,7 @@ mod tests {
let scan = make_plan(schema, vec![]);
let plan: StringSetPlan = vec![scan].into();
let executor = Executor::new();
let executor = Executor::new(1);
let results = executor.to_string_set(plan).await.unwrap();
assert_eq!(results, StringSetRef::new(StringSet::new()));
@ -355,7 +370,7 @@ mod tests {
let scan = make_plan(schema, vec![batch]);
let plan: StringSetPlan = vec![scan].into();
let executor = Executor::new();
let executor = Executor::new(1);
let results = executor.to_string_set(plan).await.unwrap();
assert_eq!(results, to_set(&["foo", "bar", "baz"]));
@ -374,7 +389,7 @@ mod tests {
let scan = make_plan(schema, vec![batch1, batch2]);
let plan: StringSetPlan = vec![scan].into();
let executor = Executor::new();
let executor = Executor::new(1);
let results = executor.to_string_set(plan).await.unwrap();
assert_eq!(results, to_set(&["foo", "bar", "baz"]));
@ -397,7 +412,7 @@ mod tests {
let plan: StringSetPlan = vec![scan1, scan2].into();
let executor = Executor::new();
let executor = Executor::new(1);
let results = executor.to_string_set(plan).await.unwrap();
assert_eq!(results, to_set(&["foo", "bar", "baz"]));
@ -417,7 +432,7 @@ mod tests {
let scan = make_plan(schema, vec![batch]);
let plan: StringSetPlan = vec![scan].into();
let executor = Executor::new();
let executor = Executor::new(1);
let results = executor.to_string_set(plan).await;
let actual_error = match results {
@ -443,7 +458,7 @@ mod tests {
let scan = make_plan(schema, vec![batch]);
let plan: StringSetPlan = vec![scan].into();
let executor = Executor::new();
let executor = Executor::new(1);
let results = executor.to_string_set(plan).await;
let actual_error = match results {
@ -481,7 +496,7 @@ mod tests {
let pivot = make_schema_pivot(scan);
let plan = vec![pivot].into();
let executor = Executor::new();
let executor = Executor::new(1);
let results = executor.to_string_set(plan).await.expect("Executed plan");
assert_eq!(results, to_set(&["f1", "f2"]));

View File

@ -25,7 +25,7 @@ use observability_deps::tracing::debug;
// Reuse DataFusion error and Result types for this module
pub use arrow_deps::datafusion::error::{DataFusionError as Error, Result};
use super::counters::ExecutionCounters;
use super::{counters::ExecutionCounters, task::DedicatedExecutor};
// The default catalog name - this impacts what SQL queries use if not specified
pub const DEFAULT_CATALOG: &str = "public";
@ -77,15 +77,27 @@ impl ExtensionPlanner for IOxExtensionPlanner {
}
}
/// This is an execution context for planning in IOx.
/// It wraps a DataFusion execution context and includes
/// statistical counters.
/// This is an execution context for planning in IOx. It wraps a
/// DataFusion execution context and includes statistical counters and
/// a dedicated thread pool.
///
/// Eventually we envision this as also managing resources
/// and providing visibility into what plans are running
/// Methods on this struct should be preferred to using the raw
/// DataFusion functions (such as `collect`) directly.
///
/// Eventually we envision this also managing additional resource
/// types such as Memory and providing visibility into what plans are
/// running
pub struct IOxExecutionContext {
counters: Arc<ExecutionCounters>,
inner: ExecutionContext,
/// Dedicated executor for query execution.
///
/// DataFusion plans are "CPU" bound and thus can consume tokio
/// executors threads for extended periods of time. We use a
/// dedicated tokio runtime to run them so that other requests
/// can be handled.
exec: DedicatedExecutor,
}
impl fmt::Debug for IOxExecutionContext {
@ -102,7 +114,7 @@ impl IOxExecutionContext {
///
/// The config is created with a default catalog and schema, but this
/// can be overridden at a later date
pub fn new(counters: Arc<ExecutionCounters>) -> Self {
pub fn new(exec: DedicatedExecutor, counters: Arc<ExecutionCounters>) -> Self {
const BATCH_SIZE: usize = 1000;
// TBD: Should we be reusing an execution context across all executions?
@ -115,7 +127,11 @@ impl IOxExecutionContext {
let inner = ExecutionContext::with_config(config);
Self { counters, inner }
Self {
exec,
counters,
inner,
}
}
/// returns a reference to the inner datafusion execution context
@ -130,13 +146,13 @@ impl IOxExecutionContext {
/// Prepare a SQL statement for execution. This assumes that any
/// tables referenced in the SQL have been registered with this context
pub async fn prepare_sql(&mut self, sql: &str) -> Result<Arc<dyn ExecutionPlan>> {
pub fn prepare_sql(&mut self, sql: &str) -> Result<Arc<dyn ExecutionPlan>> {
let logical_plan = self.inner.sql(sql)?.to_logical_plan();
self.prepare_plan(&logical_plan).await
self.prepare_plan(&logical_plan)
}
/// Prepare (optimize + plan) a pre-created logical plan for execution
pub async fn prepare_plan(&self, plan: &LogicalPlan) -> Result<Arc<dyn ExecutionPlan>> {
pub fn prepare_plan(&self, plan: &LogicalPlan) -> Result<Arc<dyn ExecutionPlan>> {
debug!(
"Creating plan: Initial plan\n----\n{}\n{}\n----",
plan.display_indent_schema(),
@ -154,13 +170,16 @@ impl IOxExecutionContext {
self.inner.create_physical_plan(&plan)
}
/// Executes the logical plan using DataFusion and produces RecordBatches
/// Executes the logical plan using DataFusion on a separate
/// thread pool and produces RecordBatches
pub async fn collect(&self, physical_plan: Arc<dyn ExecutionPlan>) -> Result<Vec<RecordBatch>> {
self.counters.inc_plans_run();
debug!("Running plan, physical:\n{:?}", physical_plan);
collect(physical_plan).await
self.exec.spawn(collect(physical_plan)).await.map_err(|e| {
Error::Execution(format!("Error running IOxExecutionContext::collect: {}", e))
})?
}
/// Executes the physical plan and produces a RecordBatchStream to stream
@ -169,14 +188,21 @@ impl IOxExecutionContext {
&self,
physical_plan: Arc<dyn ExecutionPlan>,
) -> Result<SendableRecordBatchStream> {
if physical_plan.output_partitioning().partition_count() <= 1 {
physical_plan.execute(0).await
} else {
// merge into a single partition
let plan = MergeExec::new(physical_plan);
// MergeExec must produce a single partition
assert_eq!(1, plan.output_partitioning().partition_count());
plan.execute(0).await
}
self.exec
.spawn(async move {
if physical_plan.output_partitioning().partition_count() <= 1 {
physical_plan.execute(0).await
} else {
// merge into a single partition
let plan = MergeExec::new(physical_plan);
// MergeExec must produce a single partition
assert_eq!(1, plan.output_partitioning().partition_count());
plan.execute(0).await
}
})
.await
.map_err(|e| {
Error::Execution(format!("Error running IOxExecutionContext::execute: {}", e))
})?
}
}
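Both collect and execute_stream above hand the actual DataFusion work to the DedicatedExecutor defined in the new task module below. A hypothetical, self-contained reduction of that pattern (assuming only the tokio crate; spawn_on_dedicated_runtime is an illustrative name, not IOx API): CPU-bound work runs on a runtime owned by a separate OS thread, and the caller's runtime just awaits a oneshot receiver.
use tokio::sync::oneshot;

/// Run a CPU-heavy future on its own single-threaded runtime, owned by a
/// dedicated OS thread, and hand the result back over a oneshot channel.
fn spawn_on_dedicated_runtime<F, T>(fut: F) -> oneshot::Receiver<T>
where
    F: std::future::Future<Output = T> + Send + 'static,
    T: Send + 'static,
{
    let (tx, rx) = oneshot::channel();
    std::thread::spawn(move || {
        let rt = tokio::runtime::Builder::new_current_thread()
            .enable_all()
            .build()
            .expect("creating dedicated runtime");
        let out = rt.block_on(fut);
        let _ = tx.send(out); // receiver may have been dropped; ignore
    });
    rx
}

#[tokio::main]
async fn main() {
    // Stand-in for `collect(physical_plan)`: something CPU bound.
    let rx = spawn_on_dedicated_runtime(async { (0..1_000_000u64).sum::<u64>() });
    let sum = rx.await.expect("dedicated task completed");
    println!("sum = {}", sum);
}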

344
query/src/exec/task.rs Normal file
View File

@ -0,0 +1,344 @@
//! This module contains a dedicated thread pool for running "cpu
//! intensive" workloads such as DataFusion plans
use parking_lot::Mutex;
use std::{pin::Pin, sync::Arc};
use tokio::sync::oneshot::Receiver;
use futures::Future;
use observability_deps::tracing::warn;
/// The type of thing that the dedicated executor runs
type Task = Pin<Box<dyn Future<Output = ()> + Send>>;
/// The type of error that is returned from tasks in this module
pub type Error = tokio::sync::oneshot::error::RecvError;
/// Runs futures (and any `tasks` that are `tokio::task::spawned` by
/// them) on a separate tokio Executor
#[derive(Clone)]
pub struct DedicatedExecutor {
state: Arc<Mutex<State>>,
}
/// Runs futures (and any `tasks` that are `tokio::task::spawned` by
/// them) on a separate tokio Executor
struct State {
/// Channel for requests -- the dedicated executor takes requests
/// from here and runs them.
requests: Option<std::sync::mpsc::Sender<Task>>,
/// The thread that is doing the work
thread: Option<std::thread::JoinHandle<()>>,
}
/// The default worker priority (value passed to `libc::setpriority`).
const WORKER_PRIORITY: i32 = 10;
impl std::fmt::Debug for DedicatedExecutor {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// Avoid taking the mutex in debug formatting
write!(f, "DedicatedExecutor")
}
}
impl DedicatedExecutor {
/// Creates a new `DedicatedExecutor` with a dedicated tokio
/// executor that is separate from the threadpool created via
/// `[tokio::main]` or similar.
///
/// The worker thread priority is set to low so that such tasks do
/// not starve other more important tasks (such as answering health checks)
///
/// Follows the example from Stack Overflow and spawns a new
/// thread to install a Tokio runtime "context"
/// https://stackoverflow.com/questions/62536566
///
/// If you try to do this from an async context you see something like
/// thread 'plan::stringset::tests::test_builder_plan' panicked at 'Cannot
/// drop a runtime in a context where blocking is not allowed. This
/// happens when a runtime is dropped from within an asynchronous
/// context.', .../tokio-1.4.0/src/runtime/blocking/shutdown.rs:51:21
pub fn new(thread_name: &str, num_threads: usize) -> Self {
let thread_name = thread_name.to_string();
let (tx, rx) = std::sync::mpsc::channel();
let thread = std::thread::spawn(move || {
let runtime = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.thread_name(&thread_name)
.worker_threads(num_threads)
.on_thread_start(move || set_current_thread_priority(WORKER_PRIORITY))
.build()
.expect("Creating tokio runtime");
// By entering the context, all calls to `tokio::spawn` go
// to this executor
let _guard = runtime.enter();
while let Ok(request) = rx.recv() {
// TODO track the outstanding tasks
tokio::task::spawn(request);
}
});
let state = State {
requests: Some(tx),
thread: Some(thread),
};
Self {
state: Arc::new(Mutex::new(state)),
}
}
/// Runs the specified Future (and any tasks it spawns) on the
/// `DedicatedExecutor`.
///
/// Currently all tasks are added to the tokio executor
/// immediately and compete for the threadpool's resources.
pub fn spawn<T>(&self, task: T) -> Receiver<T::Output>
where
T: Future + Send + 'static,
T::Output: Send + 'static,
{
let (tx, rx) = tokio::sync::oneshot::channel();
let job = Box::pin(async move {
let task_output = task.await;
if tx.send(task_output).is_err() {
warn!("Spawned task output ignored: receiver dropped")
}
});
let mut state = self.state.lock();
if let Some(requests) = &mut state.requests {
// would fail if someone has started shutdown
requests.send(job).ok();
} else {
warn!("tried to schedule task on an executor that was shutdown");
}
rx
}
/// signals shutdown of this executor and any Clones
pub fn shutdown(&self) {
// hang up the channel which will cause the dedicated thread
// to quit
let mut state = self.state.lock();
state.requests = None;
}
/// Stops all subsequent task executions, and waits for the worker
/// thread to complete. Note this will shut down all clones of this
/// `DedicatedExecutor` as well.
///
/// Only the first call to `join` will actually wait for the
/// executing thread to complete. All other calls to join will
/// complete immediately.
pub fn join(&self) {
self.shutdown();
// take the thread out when mutex is held
let thread = {
let mut state = self.state.lock();
state.thread.take()
};
// wait for completion while not holding the mutex to avoid
// deadlocks
if let Some(thread) = thread {
thread.join().ok();
}
}
}
#[cfg(unix)]
fn set_current_thread_priority(prio: i32) {
// on linux setpriority sets the current thread's priority
// (as opposed to the current process).
unsafe { libc::setpriority(0, 0, prio) };
}
#[cfg(not(unix))]
fn set_current_thread_priority(prio: i32) {
warn!("Setting worker thread priority not supported on this platform");
}
#[cfg(test)]
mod tests {
use super::*;
use std::sync::{Arc, Barrier};
#[cfg(unix)]
fn get_current_thread_priority() -> i32 {
// on linux setpriority sets the current thread's priority
// (as opposed to the current process).
unsafe { libc::getpriority(0, 0) }
}
#[cfg(not(unix))]
fn get_current_thread_priority() -> i32 {
WORKER_PRIORITY
}
#[tokio::test]
async fn basic() {
let barrier = Arc::new(Barrier::new(2));
let exec = DedicatedExecutor::new("Test DedicatedExecutor", 1);
let dedicated_task = exec.spawn(do_work(42, Arc::clone(&barrier)));
// Note the dedicated task will never complete if it runs on
// the main tokio thread (as this test is not using the
// 'multithreaded' version of the executor and the call to
// barrier.wait actually blocks the tokio thread)
barrier.wait();
// should be able to get the result
assert_eq!(dedicated_task.await.unwrap(), 42);
}
#[tokio::test]
async fn basic_clone() {
let barrier = Arc::new(Barrier::new(2));
let exec = DedicatedExecutor::new("Test DedicatedExecutor", 1);
// Run task on clone should work fine
let dedicated_task = exec.clone().spawn(do_work(42, Arc::clone(&barrier)));
barrier.wait();
assert_eq!(dedicated_task.await.unwrap(), 42);
}
#[tokio::test]
async fn multi_task() {
let barrier = Arc::new(Barrier::new(3));
// make an executor with two threads
let exec = DedicatedExecutor::new("Test DedicatedExecutor", 2);
let dedicated_task1 = exec.spawn(do_work(11, Arc::clone(&barrier)));
let dedicated_task2 = exec.spawn(do_work(42, Arc::clone(&barrier)));
// block main thread until completion of other two tasks
barrier.wait();
// should be able to get the result
assert_eq!(dedicated_task1.await.unwrap(), 11);
assert_eq!(dedicated_task2.await.unwrap(), 42);
exec.join();
}
#[tokio::test]
async fn worker_priority() {
let exec = DedicatedExecutor::new("Test DedicatedExecutor", 2);
let dedicated_task = exec.spawn(async move { get_current_thread_priority() });
assert_eq!(dedicated_task.await.unwrap(), WORKER_PRIORITY);
}
#[tokio::test]
async fn tokio_spawn() {
let exec = DedicatedExecutor::new("Test DedicatedExecutor", 2);
// spawn a task that spawns to other tasks and ensure they run on the dedicated
// executor
let dedicated_task = exec.spawn(async move {
// spawn separate tasks
let t1 = tokio::task::spawn(async {
assert_eq!(
std::thread::current().name(),
Some("Test DedicatedExecutor")
);
25usize
});
t1.await.unwrap()
});
// Validate the inner task ran to completion (aka it did not panic)
assert_eq!(dedicated_task.await.unwrap(), 25);
}
#[tokio::test]
async fn panic_on_executor() {
let exec = DedicatedExecutor::new("Test DedicatedExecutor", 1);
let dedicated_task = exec.spawn(async move {
if true {
panic!("At the disco, on the dedicated task scheduler");
} else {
42
}
});
// should not be able to get the result
dedicated_task.await.unwrap_err();
}
#[tokio::test]
async fn executor_shutdown_while_task_running() {
let barrier = Arc::new(Barrier::new(2));
let exec = DedicatedExecutor::new("Test DedicatedExecutor", 1);
let dedicated_task = exec.spawn(do_work(42, Arc::clone(&barrier)));
exec.shutdown();
// block main thread until completion of the outstanding task
barrier.wait();
// task should complete successfully
assert_eq!(dedicated_task.await.unwrap(), 42);
}
#[tokio::test]
async fn executor_submit_task_after_shutdown() {
let exec = DedicatedExecutor::new("Test DedicatedExecutor", 1);
// Simulate trying to submit tasks once executor has shutdown
exec.shutdown();
let dedicated_task = exec.spawn(async { 11 });
// task should complete, but return an error
dedicated_task.await.unwrap_err();
}
#[tokio::test]
async fn executor_submit_task_after_clone_shutdown() {
let exec = DedicatedExecutor::new("Test DedicatedExecutor", 1);
// shutdown the clone (but not the exec)
exec.clone().join();
// Simulate trying to submit tasks once executor has shutdown
let dedicated_task = exec.spawn(async { 11 });
// task should complete, but return an error
dedicated_task.await.unwrap_err();
}
#[tokio::test]
async fn executor_join() {
let exec = DedicatedExecutor::new("Test DedicatedExecutor", 1);
// test it doesn't hang
exec.join()
}
#[tokio::test]
#[allow(clippy::redundant_clone)]
async fn executor_clone_join() {
let exec = DedicatedExecutor::new("Test DedicatedExecutor", 1);
// test it doesn't hang
exec.clone().join();
exec.clone().join();
exec.join();
}
/// Wait for the barrier and then return `result`
async fn do_work(result: usize, barrier: Arc<Barrier>) -> usize {
barrier.wait();
result
}
}

View File

@ -195,13 +195,13 @@ impl InfluxRPCPlanner {
/// Returns a plan that lists the names of tables in this
/// database that have at least one row that matches the
/// conditions listed on `predicate`
pub async fn table_names<D>(&self, database: &D, predicate: Predicate) -> Result<StringSetPlan>
pub fn table_names<D>(&self, database: &D, predicate: Predicate) -> Result<StringSetPlan>
where
D: Database + 'static,
{
let mut builder = StringSetPlanBuilder::new();
for chunk in self.filtered_chunks(database, &predicate).await? {
for chunk in self.filtered_chunks(database, &predicate)? {
let new_table_names = chunk
.table_names(&predicate, builder.known_strings())
.map_err(|e| Box::new(e) as _)
@ -227,7 +227,7 @@ impl InfluxRPCPlanner {
/// columns (as defined in the InfluxDB Data model) names in this
/// database that have more than zero rows which pass the
/// conditions specified by `predicate`.
pub async fn tag_keys<D>(&self, database: &D, predicate: Predicate) -> Result<StringSetPlan>
pub fn tag_keys<D>(&self, database: &D, predicate: Predicate) -> Result<StringSetPlan>
where
D: Database + 'static,
{
@ -246,9 +246,9 @@ impl InfluxRPCPlanner {
let mut need_full_plans = BTreeMap::new();
let mut known_columns = BTreeSet::new();
for chunk in self.filtered_chunks(database, &predicate).await? {
for chunk in self.filtered_chunks(database, &predicate)? {
// try and get the table names that have rows that match the predicate
let table_names = self.chunk_table_names(chunk.as_ref(), &predicate).await?;
let table_names = self.chunk_table_names(chunk.as_ref(), &predicate)?;
for table_name in table_names {
debug!(
@ -308,7 +308,7 @@ impl InfluxRPCPlanner {
// were already known to have data (based on the contents of known_columns)
for (table_name, chunks) in need_full_plans.into_iter() {
let plan = self.tag_keys_plan(&table_name, &predicate, chunks).await?;
let plan = self.tag_keys_plan(&table_name, &predicate, chunks)?;
if let Some(plan) = plan {
builder = builder.append(plan)
@ -326,7 +326,7 @@ impl InfluxRPCPlanner {
/// Returns a plan which finds the distinct, non-null tag values
/// in the specified `tag_name` column of this database which pass
/// the conditions specified by `predicate`.
pub async fn tag_values<D>(
pub fn tag_values<D>(
&self,
database: &D,
tag_name: &str,
@ -351,8 +351,8 @@ impl InfluxRPCPlanner {
let mut need_full_plans = BTreeMap::new();
let mut known_values = BTreeSet::new();
for chunk in self.filtered_chunks(database, &predicate).await? {
let table_names = self.chunk_table_names(chunk.as_ref(), &predicate).await?;
for chunk in self.filtered_chunks(database, &predicate)? {
let table_names = self.chunk_table_names(chunk.as_ref(), &predicate)?;
for table_name in table_names {
debug!(
@ -426,9 +426,7 @@ impl InfluxRPCPlanner {
// time in `known_columns`, and some tables in chunks that we
// need to run a plan to find what values pass the predicate.
for (table_name, chunks) in need_full_plans.into_iter() {
let scan_and_filter = self
.scan_and_filter(&table_name, &predicate, chunks)
.await?;
let scan_and_filter = self.scan_and_filter(&table_name, &predicate, chunks)?;
// if we have any data to scan, make a plan!
if let Some(TableScanAndFilter {
@ -471,11 +469,7 @@ impl InfluxRPCPlanner {
/// datatypes (as defined in the data written via `write_lines`),
/// and which have more than zero rows which pass the conditions
/// specified by `predicate`.
pub async fn field_columns<D>(
&self,
database: &D,
predicate: Predicate,
) -> Result<FieldListPlan>
pub fn field_columns<D>(&self, database: &D, predicate: Predicate) -> Result<FieldListPlan>
where
D: Database + 'static,
{
@ -488,15 +482,12 @@ impl InfluxRPCPlanner {
// values and stops the plan executing once it has them
// map table -> Vec<Arc<Chunk>>
let chunks = self.filtered_chunks(database, &predicate).await?;
let table_chunks = self.group_chunks_by_table(&predicate, chunks).await?;
let chunks = self.filtered_chunks(database, &predicate)?;
let table_chunks = self.group_chunks_by_table(&predicate, chunks)?;
let mut field_list_plan = FieldListPlan::new();
for (table_name, chunks) in table_chunks {
if let Some(plan) = self
.field_columns_plan(&table_name, &predicate, chunks)
.await?
{
if let Some(plan) = self.field_columns_plan(&table_name, &predicate, chunks)? {
field_list_plan = field_list_plan.append(plan);
}
}
@ -523,7 +514,7 @@ impl InfluxRPCPlanner {
/// rows for a particular series (groups where all tags are the
/// same) occur together in the plan
pub async fn read_filter<D>(&self, database: &D, predicate: Predicate) -> Result<SeriesSetPlans>
pub fn read_filter<D>(&self, database: &D, predicate: Predicate) -> Result<SeriesSetPlans>
where
D: Database + 'static,
{
@ -531,17 +522,15 @@ impl InfluxRPCPlanner {
// group tables by chunk, pruning if possible
// key is table name, values are chunks
let chunks = self.filtered_chunks(database, &predicate).await?;
let table_chunks = self.group_chunks_by_table(&predicate, chunks).await?;
let chunks = self.filtered_chunks(database, &predicate)?;
let table_chunks = self.group_chunks_by_table(&predicate, chunks)?;
// now, build up plans for each table
let mut ss_plans = Vec::with_capacity(table_chunks.len());
for (table_name, chunks) in table_chunks {
let prefix_columns: Option<&[&str]> = None;
let ss_plan = self
.read_filter_plan(table_name, prefix_columns, &predicate, chunks)
.await?;
let ss_plan = self.read_filter_plan(table_name, prefix_columns, &predicate, chunks)?;
// If we have to do real work, add it to the list of plans
if let Some(ss_plan) = ss_plan {
ss_plans.push(ss_plan);
@ -555,7 +544,7 @@ impl InfluxRPCPlanner {
/// with rows grouped by an aggregate function. Note that we still
/// group by all tags (so group within series) and the
/// group_columns define the order of the result
pub async fn read_group<D>(
pub fn read_group<D>(
&self,
database: &D,
predicate: Predicate,
@ -568,8 +557,8 @@ impl InfluxRPCPlanner {
debug!(predicate=?predicate, agg=?agg, "planning read_group");
// group tables by chunk, pruning if possible
let chunks = self.filtered_chunks(database, &predicate).await?;
let table_chunks = self.group_chunks_by_table(&predicate, chunks).await?;
let chunks = self.filtered_chunks(database, &predicate)?;
let table_chunks = self.group_chunks_by_table(&predicate, chunks)?;
let num_prefix_tag_group_columns = group_columns.len();
// now, build up plans for each table
@ -577,13 +566,9 @@ impl InfluxRPCPlanner {
for (table_name, chunks) in table_chunks {
let ss_plan = match agg {
Aggregate::None => {
self.read_filter_plan(table_name, Some(group_columns), &predicate, chunks)
.await?
}
_ => {
self.read_group_plan(table_name, &predicate, agg, group_columns, chunks)
.await?
self.read_filter_plan(table_name, Some(group_columns), &predicate, chunks)?
}
_ => self.read_group_plan(table_name, &predicate, agg, group_columns, chunks)?,
};
// If we have to do real work, add it to the list of plans
@ -598,7 +583,7 @@ impl InfluxRPCPlanner {
/// Creates a GroupedSeriesSet plan that produces an output table with rows
/// that are grouped by window definitions
pub async fn read_window_aggregate<D>(
pub fn read_window_aggregate<D>(
&self,
database: &D,
predicate: Predicate,
@ -612,15 +597,14 @@ impl InfluxRPCPlanner {
debug!(predicate=?predicate, "planning read_window_aggregate");
// group tables by chunk, pruning if possible
let chunks = self.filtered_chunks(database, &predicate).await?;
let table_chunks = self.group_chunks_by_table(&predicate, chunks).await?;
let chunks = self.filtered_chunks(database, &predicate)?;
let table_chunks = self.group_chunks_by_table(&predicate, chunks)?;
// now, build up plans for each table
let mut ss_plans = Vec::with_capacity(table_chunks.len());
for (table_name, chunks) in table_chunks {
let ss_plan = self
.read_window_aggregate_plan(table_name, &predicate, agg, &every, &offset, chunks)
.await?;
.read_window_aggregate_plan(table_name, &predicate, agg, &every, &offset, chunks)?;
// If we have to do real work, add it to the list of plans
if let Some(ss_plan) = ss_plan {
ss_plans.push(ss_plan);
@ -631,7 +615,7 @@ impl InfluxRPCPlanner {
}
/// Creates a map of table_name --> Chunks that have that table
async fn group_chunks_by_table<C>(
fn group_chunks_by_table<C>(
&self,
predicate: &Predicate,
chunks: Vec<Arc<C>>,
@ -641,7 +625,7 @@ impl InfluxRPCPlanner {
{
let mut table_chunks = BTreeMap::new();
for chunk in chunks {
let table_names = self.chunk_table_names(chunk.as_ref(), &predicate).await?;
let table_names = self.chunk_table_names(chunk.as_ref(), &predicate)?;
for table_name in table_names {
table_chunks
.entry(table_name)
@ -653,11 +637,7 @@ impl InfluxRPCPlanner {
}
/// Find all the table names in the specified chunk that pass the predicate
async fn chunk_table_names<C>(
&self,
chunk: &C,
predicate: &Predicate,
) -> Result<BTreeSet<String>>
fn chunk_table_names<C>(&self, chunk: &C, predicate: &Predicate) -> Result<BTreeSet<String>>
where
C: PartitionChunk + 'static,
{
@ -705,7 +685,7 @@ impl InfluxRPCPlanner {
/// Filter(predicate)
/// TableScan (of chunks)
/// ```
async fn tag_keys_plan<C>(
fn tag_keys_plan<C>(
&self,
table_name: &str,
predicate: &Predicate,
@ -714,7 +694,7 @@ impl InfluxRPCPlanner {
where
C: PartitionChunk + 'static,
{
let scan_and_filter = self.scan_and_filter(table_name, predicate, chunks).await?;
let scan_and_filter = self.scan_and_filter(table_name, predicate, chunks)?;
let TableScanAndFilter {
plan_builder,
@ -767,7 +747,7 @@ impl InfluxRPCPlanner {
/// Filter(predicate) [optional]
/// Scan
/// ```
async fn field_columns_plan<C>(
fn field_columns_plan<C>(
&self,
table_name: &str,
predicate: &Predicate,
@ -776,7 +756,7 @@ impl InfluxRPCPlanner {
where
C: PartitionChunk + 'static,
{
let scan_and_filter = self.scan_and_filter(table_name, predicate, chunks).await?;
let scan_and_filter = self.scan_and_filter(table_name, predicate, chunks)?;
let TableScanAndFilter {
plan_builder,
schema,
@ -817,7 +797,7 @@ impl InfluxRPCPlanner {
/// Order by (tag_columns, timestamp_column)
/// Filter(predicate)
/// Scan
async fn read_filter_plan<C>(
fn read_filter_plan<C>(
&self,
table_name: impl Into<String>,
prefix_columns: Option<&[impl AsRef<str>]>,
@ -828,7 +808,7 @@ impl InfluxRPCPlanner {
C: PartitionChunk + 'static,
{
let table_name = table_name.into();
let scan_and_filter = self.scan_and_filter(&table_name, predicate, chunks).await?;
let scan_and_filter = self.scan_and_filter(&table_name, predicate, chunks)?;
let TableScanAndFilter {
plan_builder,
@ -937,7 +917,7 @@ impl InfluxRPCPlanner {
/// GroupBy(gby cols, aggs, time cols)
/// Filter(predicate)
/// Scan
pub async fn read_group_plan<C>(
pub fn read_group_plan<C>(
&self,
table_name: impl Into<String>,
predicate: &Predicate,
@ -949,7 +929,7 @@ impl InfluxRPCPlanner {
C: PartitionChunk + 'static,
{
let table_name = table_name.into();
let scan_and_filter = self.scan_and_filter(&table_name, predicate, chunks).await?;
let scan_and_filter = self.scan_and_filter(&table_name, predicate, chunks)?;
let TableScanAndFilter {
plan_builder,
@ -1027,7 +1007,7 @@ impl InfluxRPCPlanner {
/// GroupBy(gby: tag columns, window_function; agg: aggregate(field)
/// Filter(predicate)
/// Scan
pub async fn read_window_aggregate_plan<C>(
pub fn read_window_aggregate_plan<C>(
&self,
table_name: impl Into<String>,
predicate: &Predicate,
@ -1040,7 +1020,7 @@ impl InfluxRPCPlanner {
C: PartitionChunk + 'static,
{
let table_name = table_name.into();
let scan_and_filter = self.scan_and_filter(&table_name, predicate, chunks).await?;
let scan_and_filter = self.scan_and_filter(&table_name, predicate, chunks)?;
let TableScanAndFilter {
plan_builder,
@ -1114,7 +1094,7 @@ impl InfluxRPCPlanner {
/// Filter(predicate) [optional]
/// Scan
/// ```
async fn scan_and_filter<C>(
fn scan_and_filter<C>(
&self,
table_name: &str,
predicate: &Predicate,
@ -1190,7 +1170,7 @@ impl InfluxRPCPlanner {
/// Returns a list of chunks across all partitions which may
/// contain data that pass the predicate
async fn filtered_chunks<D>(
fn filtered_chunks<D>(
&self,
database: &D,
predicate: &Predicate,

View File

@ -84,7 +84,7 @@ impl SQLQueryPlanner {
/// Plan a SQL query against the data in `database`, and return a
/// DataFusion physical execution plan. The plan can then be
/// executed using `executor` in a streaming fashion.
pub async fn query<D: CatalogProvider + 'static>(
pub fn query<D: CatalogProvider + 'static>(
&self,
database: Arc<D>,
query: &str,
@ -92,6 +92,6 @@ impl SQLQueryPlanner {
) -> Result<Arc<dyn ExecutionPlan>> {
let mut ctx = executor.new_context();
ctx.inner_mut().register_catalog(DEFAULT_CATALOG, database);
ctx.prepare_sql(query).await.context(Preparing)
ctx.prepare_sql(query).context(Preparing)
}
}

View File

@ -10,7 +10,7 @@ use arrow_deps::datafusion::physical_plan::SendableRecordBatchStream;
use async_trait::async_trait;
use data_types::chunk::ChunkSummary;
use exec::{stringset::StringSet, Executor};
use internal_types::{data::ReplicatedWrite, schema::Schema, selection::Selection};
use internal_types::{schema::Schema, selection::Selection};
use std::{fmt::Debug, sync::Arc};
@ -39,9 +39,6 @@ pub trait Database: Debug + Send + Sync {
type Error: std::error::Error + Send + Sync + 'static;
type Chunk: PartitionChunk;
/// Stores the replicated write into the database.
fn store_replicated_write(&self, write: &ReplicatedWrite) -> Result<(), Self::Error>;
/// Return the partition keys for data in this DB
fn partition_keys(&self) -> Result<Vec<String>, Self::Error>;

View File

@ -211,7 +211,7 @@ mod tests {
let expected_ss = to_string_set(&["foo", "bar", "baz", "from_a_plan"]).into();
assert!(matches!(plan, StringSetPlan::Plan(_)));
let executor = Executor::new();
let executor = Executor::new(1);
let ss = executor.to_string_set(plan).await.unwrap();
assert_eq!(ss, expected_ss);
}

View File

@ -18,10 +18,7 @@ use crate::{
Database, DatabaseStore, PartitionChunk, Predicate,
};
use data_types::database_rules::{PartitionTemplate, TemplatePart};
use influxdb_line_protocol::{parse_lines, ParsedLine};
use internal_types::{
data::{lines_to_replicated_write, ReplicatedWrite},
schema::{
builder::{SchemaBuilder, SchemaMerger},
Schema,
@ -30,10 +27,8 @@ use internal_types::{
};
use async_trait::async_trait;
use chrono::{DateTime, Utc};
use data_types::database_rules::Partitioner;
use parking_lot::Mutex;
use snafu::{OptionExt, ResultExt, Snafu};
use snafu::{OptionExt, Snafu};
use std::{collections::BTreeMap, sync::Arc};
#[derive(Debug, Default)]
@ -43,12 +38,6 @@ pub struct TestDatabase {
/// Value is map of chunk_id to chunk
partitions: Mutex<BTreeMap<String, BTreeMap<u32, Arc<TestChunk>>>>,
/// Lines which have been written to this database, in order
saved_lines: Mutex<Vec<String>>,
/// Replicated writes which have been written to this database, in order
replicated_writes: Mutex<Vec<ReplicatedWrite>>,
/// `column_names` to return upon next request
column_names: Arc<Mutex<Option<StringSetRef>>>,
}
@ -74,33 +63,6 @@ impl TestDatabase {
Self::default()
}
/// Get all lines written to this database
pub fn get_lines(&self) -> Vec<String> {
self.saved_lines.lock().clone()
}
/// Get all replicated writes to this database
pub fn get_writes(&self) -> Vec<ReplicatedWrite> {
self.replicated_writes.lock().clone()
}
/// Parse line protocol and add it as new lines to this
/// database
pub async fn add_lp_string(&self, lp_data: &str) {
let parsed_lines = parse_lines(&lp_data)
.collect::<Result<Vec<_>, _>>()
.unwrap_or_else(|_| panic!("parsing line protocol: {}", lp_data));
let mut writer = TestLPWriter::default();
writer.write_lines(self, &parsed_lines).unwrap();
// Writes parsed lines into this database
let mut saved_lines = self.saved_lines.lock();
for line in parsed_lines {
saved_lines.push(line.to_string())
}
}
/// Add a test chunk to the database
pub fn add_chunk(&self, partition_key: &str, chunk: Arc<TestChunk>) {
let mut partitions = self.partitions.lock();
@ -132,12 +94,6 @@ impl Database for TestDatabase {
type Error = TestError;
type Chunk = TestChunk;
/// Adds the replicated write to this database
fn store_replicated_write(&self, write: &ReplicatedWrite) -> Result<(), Self::Error> {
self.replicated_writes.lock().push(write.clone());
Ok(())
}
/// Return the partition keys for data in this DB
fn partition_keys(&self) -> Result<Vec<String>, Self::Error> {
let partitions = self.partitions.lock();
@ -448,22 +404,13 @@ impl TestDatabaseStore {
pub fn new() -> Self {
Self::default()
}
/// Parse line protocol and add it as new lines to the `db_name` database
pub async fn add_lp_string(&self, db_name: &str, lp_data: &str) {
self.db_or_create(db_name)
.await
.expect("db_or_create succeeds")
.add_lp_string(lp_data)
.await
}
}
impl Default for TestDatabaseStore {
fn default() -> Self {
Self {
databases: Mutex::new(BTreeMap::new()),
executor: Arc::new(Executor::new()),
executor: Arc::new(Executor::new(1)),
}
}
}
@ -505,91 +452,3 @@ impl DatabaseStore for TestDatabaseStore {
Arc::clone(&self.executor)
}
}
/// Helper for writing line protocol data directly into test databases
/// (handles creating sequence numbers and writer ids)
#[derive(Debug, Default)]
pub struct TestLPWriter {
pub writer_id: u32,
sequence_number: u64,
}
impl TestLPWriter {
// writes data in LineProtocol format into a database
pub fn write_lines<D: Database>(
&mut self,
database: &D,
lines: &[ParsedLine<'_>],
) -> Result<()> {
// partitions data in hourly segments
let partition_template = PartitionTemplate {
parts: vec![TemplatePart::TimeFormat("%Y-%m-%dT%H".to_string())],
};
let write = lines_to_replicated_write(
self.writer_id,
self.sequence_number,
&lines,
&partition_template,
);
self.sequence_number += 1;
database
.store_replicated_write(&write)
.map_err(|e| TestError::DatabaseWrite {
source: Box::new(e),
})
}
/// Writes line protocol formatted data in lp_data to `database`
pub fn write_lp_string<D: Database>(&mut self, database: &D, lp_data: &str) -> Result<()> {
let lines = parse_lines(lp_data)
.collect::<Result<Vec<_>, _>>()
.map_err(|e| Box::new(e) as _)
.context(DatabaseWrite)?;
self.write_lines(database, &lines)
}
/// Writes line protocol formatted data to database and partition
pub fn write_lp_to_partition<D: Database>(
&mut self,
database: &D,
lp_data: &str,
partition_key: impl Into<String>,
) {
let lines = parse_lines(lp_data).collect::<Result<Vec<_>, _>>().unwrap();
self.write_lines_to_partition(database, partition_key, &lines)
}
/// Writes lines to the given partition
pub fn write_lines_to_partition<D: Database>(
&mut self,
database: &D,
partition_key: impl Into<String>,
lines: &[ParsedLine<'_>],
) {
let partitioner = TestPartitioner {
key: partition_key.into(),
};
let write =
lines_to_replicated_write(self.writer_id, self.sequence_number, &lines, &partitioner);
self.sequence_number += 1;
database.store_replicated_write(&write).unwrap();
}
}
// Outputs a set partition key for testing. Used for parsing line protocol into
// ReplicatedWrite and setting an explicit partition key for all writes therein.
struct TestPartitioner {
key: String,
}
impl Partitioner for TestPartitioner {
fn partition_key(
&self,
_line: &ParsedLine<'_>,
_default_time: &DateTime<Utc>,
) -> data_types::database_rules::Result<String> {
Ok(self.key.clone())
}
}

View File

@ -376,6 +376,29 @@ impl Chunk {
.collect()
}
/// A helper method for determining the time-range associated with the
/// specified table.
///
/// A table's schema need not contain a column representing the time;
/// however, any table that represents data using the InfluxDB model does
/// contain a column holding the timestamp associated with each row.
///
/// `table_time_range` returns the min and max values for that column if
/// the table uses the InfluxDB data model; otherwise it returns `None`.
/// An error is returned if the table does not exist.
pub fn table_time_range(&self, table_name: &str) -> Result<Option<(i64, i64)>> {
// read lock on chunk.
let chunk_data = self.chunk_data.read().unwrap();
let table = chunk_data
.data
.get(table_name)
.context(TableNotFound { table_name })?;
Ok(table.time_range())
}
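
// For illustration only (not part of this diff): a minimal sketch of how the new
// helper might be consumed, assuming `chunk` is a populated read buffer `Chunk`
// that already holds a table named "cpu"; building the chunk is out of scope here.
//
// fn print_cpu_time_range(chunk: &Chunk) -> Result<()> {
//     match chunk.table_time_range("cpu")? {
//         // The table uses the InfluxDB data model and has a timestamp column.
//         Some((min, max)) => println!("cpu time range: [{}, {}]", min, max),
//         // The table exists but has no InfluxDB timestamp column.
//         None => println!("cpu has no timestamp column"),
//     }
//     Ok(())
// }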
/// Returns a schema object for a `read_filter` operation using the provided
/// column selection. An error is returned if the specified columns do not
/// exist.

View File

@ -13,12 +13,15 @@ use snafu::{ensure, Snafu};
use crate::row_group::{self, ColumnName, Predicate, RowGroup};
use crate::schema::{AggregateType, ColumnType, LogicalDataType, ResultSchema};
use crate::value::Value;
use crate::value::{OwnedValue, Scalar, Value};
#[derive(Debug, Snafu)]
pub enum Error {
#[snafu(display("cannot drop last row group in table; drop table"))]
EmptyTableError {},
#[snafu(display("table does not have InfluxDB timestamp column"))]
NoTimestampColumnError {},
#[snafu(display("unsupported column operation on {}: {}", column_name, msg))]
UnsupportedColumnOperation { msg: String, column_name: String },
}
@ -151,9 +154,38 @@ impl Table {
self.table_data.read().unwrap().meta.to_summary(&self.name)
}
/// The time range of all row groups within this table.
/// Returns the column range associated with an InfluxDB Timestamp column
/// or None if the table's schema does not have such a column.
pub fn time_range(&self) -> Option<(i64, i64)> {
self.table_data.read().unwrap().meta.time_range
let table_data = self.table_data.read().unwrap();
let time_column = table_data
.meta
.columns
.values()
.filter(|cm| matches!(cm.typ, crate::schema::ColumnType::Timestamp(_)))
.collect::<Vec<_>>();
if time_column.is_empty() {
return None;
}
assert_eq!(time_column.len(), 1); // can only be one timestamp column.
let range = &time_column[0].range;
let (min, max) = match (&range.0, &range.1) {
(OwnedValue::Scalar(Scalar::I64(min)), OwnedValue::Scalar(Scalar::I64(max))) => {
(min, max)
}
(min, max) => {
panic!(
"invalid range type for timestamp column: ({:?}, {:?})",
min, max
);
}
};
Some((*min, *max))
}
// Helper function used in tests.
@ -612,7 +644,6 @@ impl MetaData {
}
pub fn to_summary(&self, table_name: impl Into<String>) -> TableSummary {
use crate::value::{OwnedValue, Scalar};
use data_types::partition_metadata::{ColumnSummary, StatValues, Statistics};
let columns = self
.columns
@ -1435,4 +1466,20 @@ west,host-b,100
vec!["time".to_owned()],
);
}
#[test]
fn time_range() {
// Build a row group.
let mut columns = vec![];
let tc = ColumnType::Time(Column::from(&[-29_i64, -100, 3, 2][..]));
columns.push((row_group::TIME_COLUMN_NAME.to_string(), tc));
let rc = ColumnType::Tag(Column::from(&["west", "south", "north", "west"][..]));
columns.push(("region".to_string(), rc));
let rg = RowGroup::new(4, columns);
let table = Table::new("cpu".to_owned(), rg);
assert_eq!(table.time_range().unwrap(), (-100, 3));
}
}

View File

@ -14,11 +14,12 @@ data_types = { path = "../data_types" }
# See docs/regenerating_flatbuffers.md about updating generated code when updating the
# version of the flatbuffers crate
flatbuffers = "0.8"
futures = "0.3.7"
futures = "0.3"
generated_types = { path = "../generated_types" }
influxdb_line_protocol = { path = "../influxdb_line_protocol" }
internal_types = { path = "../internal_types" }
mutable_buffer = { path = "../mutable_buffer" }
num_cpus = "1.13.0"
object_store = { path = "../object_store" }
observability_deps = { path = "../observability_deps" }
parking_lot = "0.11.1"
@ -35,4 +36,12 @@ tracker = { path = "../tracker" }
uuid = { version = "0.8", features = ["serde", "v4"] }
[dev-dependencies] # In alphabetical order
criterion = { version = "0.3.4", features = ["async_tokio"] }
flate2 = "1.0.20"
tempfile = "3.1.0"
test_helpers = { path = "../test_helpers" }
[[bench]]
name = "influxrpc"
harness = false

View File

@ -0,0 +1,8 @@
mod tag_values;
use criterion::{criterion_group, criterion_main};
use tag_values::benchmark_tag_values;
criterion_group!(benches, benchmark_tag_values);
criterion_main!(benches);

View File

@ -0,0 +1,122 @@
use std::io::Read;
use arrow_deps::datafusion::{logical_plan::Expr, scalar::ScalarValue};
use criterion::{BenchmarkId, Criterion};
// These benchmarks drive async code through a Tokio runtime, using
// Criterion's `async_tokio` support (see `b.to_async(...)` below).
use flate2::read::GzDecoder;
use tokio::runtime::Runtime;
use query::frontend::influxrpc::InfluxRPCPlanner;
use query::predicate::PredicateBuilder;
use query::{exec::Executor, predicate::Predicate};
use server::{benchmarks::scenarios::DBScenario, db::Db};
// Uses the `query_tests` module to generate some chunk scenarios, specifically
// the scenarios where there are:
//
// - a single open mutable buffer chunk;
// - a closed mutable buffer chunk and another open one;
// - an open mutable buffer chunk and a closed read buffer chunk;
// - two closed read buffer chunks.
//
// The chunks are all fed the *same* line protocol, so these benchmarks are
// useful for assessing the differences in performance between querying the
// chunks held in different execution engines.
//
// These benchmarks use a set of line protocol synthetically generated with
// `inch`. Each point is a new series containing three tag keys. Those tag keys
// are:
//
// - tag0, cardinality 10.
// - tag1, cardinality 100.
// - tag2, cardinality 1,000.
//
// The timespan of the points in the line protocol is around 1m of wall-clock
// time.
async fn setup_scenarios() -> Vec<DBScenario> {
let raw = include_bytes!("../../tests/fixtures/lineproto/tag_values.lp.gz");
let mut gz = GzDecoder::new(&raw[..]);
let mut lp = String::new();
gz.read_to_string(&mut lp).unwrap();
let db =
server::benchmarks::scenarios::make_two_chunk_scenarios("2021-04-12T17", &lp, &lp).await;
db
}
// Run all benchmarks for `tag_values`.
pub fn benchmark_tag_values(c: &mut Criterion) {
let scenarios = Runtime::new().unwrap().block_on(setup_scenarios());
execute_benchmark_group(c, scenarios.as_slice());
}
// Runs an async criterion benchmark against the provided scenarios and
// predicate.
fn execute_benchmark_group(c: &mut Criterion, scenarios: &[DBScenario]) {
let planner = InfluxRPCPlanner::new();
let predicates = vec![
(PredicateBuilder::default().build(), "no_pred"),
(
PredicateBuilder::default()
.add_expr(
Expr::Column("tag2".to_owned()).eq(Expr::Literal(ScalarValue::Utf8(Some(
"value321".to_owned(),
)))),
)
.build(),
"with_pred",
),
];
// these tags have different cardinalities: 10, 100, 1000.
let tag_keys = &["tag0", "tag1", "tag2"];
for scenario in scenarios {
let DBScenario { scenario_name, db } = scenario;
let mut group = c.benchmark_group(scenario_name);
for (predicate, pred_name) in &predicates {
for tag_key in tag_keys {
group.bench_with_input(
BenchmarkId::from_parameter(format!("{}/{}", tag_key, pred_name)),
tag_key,
|b, &tag_key| {
let executor = db.executor();
b.to_async(Runtime::new().unwrap()).iter(|| {
run_tag_values_query(
&planner,
executor.as_ref(),
db,
tag_key,
predicate.clone(),
)
});
},
);
}
}
group.finish();
}
}
// Plans and runs a tag_values query.
async fn run_tag_values_query(
planner: &InfluxRPCPlanner,
executor: &Executor,
db: &Db,
tag_key: &str,
predicate: Predicate,
) {
let plan = planner
.tag_values(db, &tag_key, predicate)
.expect("built plan successfully");
let names = executor.to_string_set(plan).await.expect(
"converted plan to strings successfully",
);
assert!(!names.is_empty());
}

View File

@ -9,6 +9,7 @@ use data_types::{
DatabaseName,
};
use object_store::{path::ObjectStorePath, ObjectStore};
use query::exec::Executor;
/// This module contains code for managing the configuration of the server.
use crate::{db::Db, Error, JobRegistry, Result};
@ -114,7 +115,13 @@ impl Config {
state.remotes.remove(&id)
}
fn commit(&self, rules: DatabaseRules, server_id: NonZeroU32, object_store: Arc<ObjectStore>) {
fn commit(
&self,
rules: DatabaseRules,
server_id: NonZeroU32,
object_store: Arc<ObjectStore>,
exec: Arc<Executor>,
) {
let mut state = self.state.write().expect("mutex poisoned");
let name = state
.reservations
@ -131,6 +138,7 @@ impl Config {
rules,
server_id,
object_store,
exec,
wal_buffer,
Arc::clone(&self.jobs),
));
@ -253,9 +261,14 @@ pub(crate) struct CreateDatabaseHandle<'a> {
}
impl<'a> CreateDatabaseHandle<'a> {
pub(crate) fn commit(mut self, server_id: NonZeroU32, object_store: Arc<ObjectStore>) {
pub(crate) fn commit(
mut self,
server_id: NonZeroU32,
object_store: Arc<ObjectStore>,
exec: Arc<Executor>,
) {
self.config
.commit(self.rules.take().unwrap(), server_id, object_store)
.commit(self.rules.take().unwrap(), server_id, object_store, exec)
}
pub(crate) fn rules(&self) -> &DatabaseRules {
@ -292,7 +305,8 @@ mod test {
let db_reservation = config.create_db(rules).unwrap();
let server_id = NonZeroU32::new(1).unwrap();
let store = Arc::new(ObjectStore::new_in_memory(InMemory::new()));
db_reservation.commit(server_id, store);
let exec = Arc::new(Executor::new(1));
db_reservation.commit(server_id, store, exec);
assert!(config.db(&name).is_some());
assert_eq!(config.db_names_sorted(), vec![name.clone()]);
@ -318,7 +332,8 @@ mod test {
let db_reservation = config.create_db(rules).unwrap();
let server_id = NonZeroU32::new(1).unwrap();
let store = Arc::new(ObjectStore::new_in_memory(InMemory::new()));
db_reservation.commit(server_id, store);
let exec = Arc::new(Executor::new(1));
db_reservation.commit(server_id, store, exec);
let token = config
.state

View File

@ -3,6 +3,7 @@
use std::any::Any;
use std::{
convert::TryInto,
num::NonZeroU32,
sync::{
atomic::{AtomicU64, AtomicUsize, Ordering},
@ -15,20 +16,24 @@ use observability_deps::tracing::{debug, info};
use parking_lot::{Mutex, RwLock};
use snafu::{ensure, OptionExt, ResultExt, Snafu};
use arrow_deps::datafusion::{
catalog::{catalog::CatalogProvider, schema::SchemaProvider},
physical_plan::SendableRecordBatchStream,
use arrow_deps::{
arrow::datatypes::SchemaRef as ArrowSchemaRef,
datafusion::{
catalog::{catalog::CatalogProvider, schema::SchemaProvider},
physical_plan::SendableRecordBatchStream,
},
};
use catalog::{chunk::ChunkState, Catalog};
pub(crate) use chunk::DBChunk;
use data_types::{
chunk::ChunkSummary, database_rules::DatabaseRules, partition_metadata::PartitionSummary,
timestamp::TimestampRange,
};
use internal_types::{data::ReplicatedWrite, selection::Selection};
use internal_types::selection::Selection;
use object_store::ObjectStore;
use parquet_file::{chunk::Chunk, storage::Storage};
use query::{Database, DEFAULT_SCHEMA};
use query::{exec::Executor, Database, DEFAULT_SCHEMA};
use read_buffer::Chunk as ReadBufferChunk;
use tracker::{MemRegistry, TaskTracker, TrackedFutureExt};
@ -36,6 +41,7 @@ use super::{buffer::Buffer, JobRegistry};
use data_types::job::Job;
use data_types::partition_metadata::TableSummary;
use internal_types::entry::{self, ClockValue, Entry, SequencedEntry};
use lifecycle::LifecycleManager;
use system_tables::{SystemSchemaProvider, SYSTEM_SCHEMA};
@ -114,6 +120,18 @@ pub enum Error {
chunk_id: u32,
},
#[snafu(display("Read Buffer Schema Error in chunk {}: {}", chunk_id, source))]
ReadBufferChunkSchemaError {
source: read_buffer::Error,
chunk_id: u32,
},
#[snafu(display("Read Buffer Timestamp Error in chunk {}: {}", chunk_id, source))]
ReadBufferChunkTimestampError {
chunk_id: u32,
source: read_buffer::Error,
},
#[snafu(display("Error writing to object store: {}", source))]
WritingToObjectStore {
source: parquet_file::storage::Error,
@ -131,6 +149,14 @@ pub enum Error {
chunk_id: u32,
source: mutable_buffer::chunk::Error,
},
#[snafu(display("Error building sequenced entry: {}", source))]
SequencedEntryError { source: entry::Error },
#[snafu(display("Error converting the schema: {}", source))]
SchemaConversion {
source: internal_types::schema::Error,
},
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
@ -197,8 +223,12 @@ pub struct Db {
pub server_id: NonZeroU32, // this is also the Query Server ID
/// Interface to use for persistence
pub store: Arc<ObjectStore>,
/// Executor for running queries
exec: Arc<Executor>,
/// The catalog holds chunks of data under partitions for the database.
/// The underlying chunks may be backed by different execution engines
/// depending on their stage in the data lifecycle. Currently there are
@ -245,6 +275,7 @@ impl Db {
rules: DatabaseRules,
server_id: NonZeroU32,
object_store: Arc<ObjectStore>,
exec: Arc<Executor>,
wal_buffer: Option<Buffer>,
jobs: Arc<JobRegistry>,
) -> Self {
@ -258,6 +289,7 @@ impl Db {
rules,
server_id,
store,
exec,
catalog,
wal_buffer,
jobs,
@ -268,6 +300,11 @@ impl Db {
}
}
/// Return a handle to the executor used to run queries
pub fn executor(&self) -> Arc<Executor> {
Arc::clone(&self.exec)
}
/// Rolls over the active chunk in the database's specified
/// partition. Returns the previously open (now closed) Chunk
pub async fn rollover_partition(&self, partition_key: &str) -> Result<Arc<DBChunk>> {
@ -421,7 +458,7 @@ impl Db {
Ok(DBChunk::snapshot(&chunk))
}
pub async fn load_chunk_to_object_store(
pub async fn write_chunk_to_object_store(
&self,
partition_key: &str,
chunk_id: u32,
@ -480,17 +517,19 @@ impl Db {
let predicate = read_buffer::Predicate::default();
// Get RecordBatchStream of data from the read buffer chunk
// TODO: When we have the rb_chunk, the following code will be replaced with
// one line: `let stream = rb_chunk.read_filter()`
let read_results = rb_chunk
.read_filter(stats.name.as_str(), predicate, Selection::All)
.context(ReadBufferChunkError { chunk_id })?;
let schema = rb_chunk
let arrow_schema: ArrowSchemaRef = rb_chunk
.read_filter_table_schema(stats.name.as_str(), Selection::All)
.context(ReadBufferChunkError { chunk_id })?
.context(ReadBufferChunkSchemaError { chunk_id })?
.into();
let stream: SendableRecordBatchStream =
Box::pin(streams::ReadFilterResultsStream::new(read_results, schema));
let time_range = rb_chunk
.table_time_range(stats.name.as_str())
.context(ReadBufferChunkTimestampError { chunk_id })?;
let stream: SendableRecordBatchStream = Box::pin(
streams::ReadFilterResultsStream::new(read_results, Arc::clone(&arrow_schema)),
);
// Write this table data into the object store
let path = storage
@ -504,7 +543,20 @@ impl Db {
.context(WritingToObjectStore)?;
// Now add the saved info into the parquet_chunk
parquet_chunk.add_table(stats, path);
let schema = Arc::clone(&arrow_schema)
.try_into()
.context(SchemaConversion)?;
let table_time_range = match time_range {
None => None,
Some((start, end)) => {
if start < end {
Some(TimestampRange::new(start, end))
} else {
None
}
}
};
parquet_chunk.add_table(stats, path, schema, table_time_range);
}
// Relock the chunk again (nothing else should have been able
@ -524,7 +576,8 @@ impl Db {
Ok(DBChunk::snapshot(&chunk))
}
/// Spawns a task to perform load_chunk_to_read_buffer
/// Spawns a task to perform
/// [`load_chunk_to_read_buffer`](Self::load_chunk_to_read_buffer)
pub fn load_chunk_to_read_buffer_in_background(
self: &Arc<Self>,
partition_key: String,
@ -558,6 +611,41 @@ impl Db {
tracker
}
/// Spawns a task to perform
/// [`write_chunk_to_object_store`](Self::write_chunk_to_object_store)
pub fn write_chunk_to_object_store_in_background(
self: &Arc<Self>,
partition_key: String,
chunk_id: u32,
) -> TaskTracker<Job> {
let name = self.rules.read().name.clone();
let (tracker, registration) = self.jobs.register(Job::WriteChunk {
db_name: name.to_string(),
partition_key: partition_key.clone(),
chunk_id,
});
let captured = Arc::clone(&self);
let task = async move {
debug!(%name, %partition_key, %chunk_id, "background task writing chunk to object store");
let result = captured
.write_chunk_to_object_store(&partition_key, chunk_id)
.await;
if let Err(e) = result {
info!(?e, %name, %partition_key, %chunk_id, "background task error writing chunk to object store");
return Err(e);
}
debug!(%name, %partition_key, %chunk_id, "background task completed writing chunk to object store");
Ok(())
};
tokio::spawn(task.track(registration));
tracker
}
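
// For illustration only (not part of this diff): a small sketch of how a caller
// might start the background write and wait on the returned tracker, following the
// same polling pattern the write_chunk_to_object_store_in_background test uses
// later in this diff. `db` is assumed to be an Arc<Db> whose chunk has already
// been moved to the read buffer.
//
// async fn persist_chunk_and_wait(db: &Arc<Db>, partition_key: &str, chunk_id: u32) {
//     let tracker =
//         db.write_chunk_to_object_store_in_background(partition_key.to_string(), chunk_id);
//     // Poll the TaskTracker<Job> until the background task reports completion.
//     while !tracker.is_complete() {
//         tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
//     }
// }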
/// Returns the next write sequence number
pub fn next_sequence(&self) -> u64 {
self.sequence.fetch_add(1, Ordering::SeqCst)
@ -624,6 +712,79 @@ impl Db {
info!("finished background worker");
}
/// Stores an entry based on the configuration. The Entry will first be
/// converted into a Sequenced Entry with the logical clock assigned
/// from the database. If the write buffer is configured, the sequenced
/// entry is written into the buffer and replicated based on the
/// configured rules. If the mutable buffer is configured, the sequenced
/// entry is then written into the mutable buffer.
pub fn store_entry(&self, entry: Entry) -> Result<()> {
// TODO: build this based on either this or on the write buffer, if configured
let sequenced_entry = SequencedEntry::new_from_entry_bytes(
ClockValue::new(self.next_sequence()),
self.server_id.get(),
entry.data(),
)
.context(SequencedEntryError)?;
if self.rules.read().wal_buffer_config.is_some() {
todo!("route to the Write Buffer. TODO: carols10cents #1157")
}
self.store_sequenced_entry(sequenced_entry)
}
pub fn store_sequenced_entry(&self, sequenced_entry: SequencedEntry) -> Result<()> {
let rules = self.rules.read();
let mutable_size_threshold = rules.lifecycle_rules.mutable_size_threshold;
if rules.lifecycle_rules.immutable {
return DatabaseNotWriteable {}.fail();
}
std::mem::drop(rules);
// TODO: Direct writes to closing chunks
if let Some(partitioned_writes) = sequenced_entry.partition_writes() {
for write in partitioned_writes {
let partition_key = write.key();
let partition = self.catalog.get_or_create_partition(partition_key);
let mut partition = partition.write();
partition.update_last_write_at();
let chunk = partition.open_chunk().unwrap_or_else(|| {
partition.create_open_chunk(self.memory_registries.mutable_buffer.as_ref())
});
let mut chunk = chunk.write();
chunk.record_write();
let chunk_id = chunk.id();
let mb_chunk = chunk.mutable_buffer().expect("cannot mutate open chunk");
mb_chunk
.write_table_batches(
sequenced_entry.clock_value(),
sequenced_entry.writer_id(),
&write.table_batches(),
)
.context(WriteEntry {
partition_key,
chunk_id,
})?;
let size = mb_chunk.size();
if let Some(threshold) = mutable_size_threshold {
if size > threshold.get() {
chunk.set_closing().expect("cannot close open chunk")
}
}
}
}
Ok(())
}
}
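
// For reference (not part of this diff): a minimal sketch of the entry-based write
// path that the `write_lp` test helper below wraps, using the same `lp_to_entry`
// helper the unit tests import from internal_types.
//
// use internal_types::entry::test_helpers::lp_to_entry;
//
// fn store_one_line(db: &Db) -> Result<()> {
//     // Convert a single line of line protocol into an Entry ...
//     let entry = lp_to_entry("cpu bar=1 10");
//     // ... then store it; the Db assigns the clock value and writer id and writes
//     // into the open mutable buffer chunk (or rejects the write if the lifecycle
//     // rules mark the database immutable).
//     db.store_entry(entry)
// }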
#[async_trait]
@ -652,54 +813,6 @@ impl Database for Db {
.collect()
}
fn store_replicated_write(&self, write: &ReplicatedWrite) -> Result<(), Self::Error> {
let rules = self.rules.read();
let mutable_size_threshold = rules.lifecycle_rules.mutable_size_threshold;
if rules.lifecycle_rules.immutable {
return DatabaseNotWriteable {}.fail();
}
std::mem::drop(rules);
let entries = match write.write_buffer_batch().and_then(|batch| batch.entries()) {
Some(entries) => entries,
None => return Ok(()),
};
// TODO: Direct writes to closing chunks
for entry in entries.into_iter() {
if let Some(partition_key) = entry.partition_key() {
let partition = self.catalog.get_or_create_partition(partition_key);
let mut partition = partition.write();
partition.update_last_write_at();
let chunk = partition.open_chunk().unwrap_or_else(|| {
partition.create_open_chunk(self.memory_registries.mutable_buffer.as_ref())
});
let mut chunk = chunk.write();
chunk.record_write();
let chunk_id = chunk.id();
let mb_chunk = chunk.mutable_buffer().expect("cannot mutate open chunk");
mb_chunk.write_entry(&entry).context(WriteEntry {
partition_key,
chunk_id,
})?;
let size = mb_chunk.size();
if let Some(threshold) = mutable_size_threshold {
if size > threshold.get() {
chunk.set_closing().expect("cannot close open chunk")
}
}
}
}
Ok(())
}
fn partition_keys(&self) -> Result<Vec<String>, Self::Error> {
Ok(self.catalog.partition_keys())
}
@ -731,10 +844,25 @@ impl CatalogProvider for Db {
}
}
pub mod test_helpers {
use super::*;
use internal_types::entry::test_helpers::lp_to_entries;
pub fn write_lp(db: &Db, lp: &str) {
let entries = lp_to_entries(lp);
for entry in entries {
db.store_entry(entry).unwrap();
}
}
}
#[cfg(test)]
mod tests {
use crate::query_tests::utils::{make_database, make_db};
use ::test_helpers::assert_contains;
use arrow_deps::{
arrow::record_batch::RecordBatch, assert_table_eq, datafusion::physical_plan::collect,
arrow::record_batch::RecordBatch, assert_batches_sorted_eq, assert_table_eq,
datafusion::execution::context,
};
use chrono::Utc;
use data_types::{
@ -742,23 +870,32 @@ mod tests {
database_rules::{Order, Sort, SortOrder},
partition_metadata::{ColumnSummary, StatValues, Statistics, TableSummary},
};
use query::{
exec::Executor, frontend::sql::SQLQueryPlanner, test::TestLPWriter, PartitionChunk,
use object_store::{
disk::File, path::ObjectStorePath, path::Path, ObjectStore, ObjectStoreApi,
};
use test_helpers::assert_contains;
use crate::query_tests::utils::make_db;
use query::{frontend::sql::SQLQueryPlanner, PartitionChunk};
use super::*;
use futures::stream;
use futures::{StreamExt, TryStreamExt};
use std::iter::Iterator;
use super::test_helpers::write_lp;
use internal_types::entry::test_helpers::lp_to_entry;
use std::num::NonZeroUsize;
use std::str;
use tempfile::TempDir;
type Error = Box<dyn std::error::Error + Send + Sync + 'static>;
type Result<T, E = Error> = std::result::Result<T, E>;
#[tokio::test]
async fn write_no_mutable_buffer() {
// Validate that writes are rejected if there is no mutable buffer
let db = make_db();
let mut writer = TestLPWriter::default();
db.rules.write().lifecycle_rules.immutable = true;
let res = writer.write_lp_string(&db, "cpu bar=1 10");
let entry = lp_to_entry("cpu bar=1 10");
let res = db.store_entry(entry);
assert_contains!(
res.unwrap_err().to_string(),
"Cannot write to this database: no mutable buffer configured"
@ -768,8 +905,7 @@ mod tests {
#[tokio::test]
async fn read_write() {
let db = Arc::new(make_db());
let mut writer = TestLPWriter::default();
writer.write_lp_string(db.as_ref(), "cpu bar=1 10").unwrap();
write_lp(db.as_ref(), "cpu bar=1 10");
let batches = run_query(db, "select * from cpu").await;
@ -786,9 +922,7 @@ mod tests {
#[tokio::test]
async fn write_with_rollover() {
let db = Arc::new(make_db());
let mut writer = TestLPWriter::default();
//writer.write_lp_string(db.as_ref(), "cpu bar=1 10").unwrap();
writer.write_lp_string(db.as_ref(), "cpu bar=1 10").unwrap();
write_lp(db.as_ref(), "cpu bar=1 10");
assert_eq!(vec!["1970-01-01T00"], db.partition_keys().unwrap());
let mb_chunk = db.rollover_partition("1970-01-01T00").await.unwrap();
@ -802,10 +936,10 @@ mod tests {
"+-----+------+",
];
let batches = run_query(Arc::clone(&db), "select * from cpu").await;
assert_table_eq!(expected, &batches);
assert_batches_sorted_eq!(expected, &batches);
// add new data
writer.write_lp_string(db.as_ref(), "cpu bar=2 20").unwrap();
write_lp(db.as_ref(), "cpu bar=2 20");
let expected = vec![
"+-----+------+",
"| bar | time |",
@ -815,20 +949,19 @@ mod tests {
"+-----+------+",
];
let batches = run_query(Arc::clone(&db), "select * from cpu").await;
assert_table_eq!(&expected, &batches);
assert_batches_sorted_eq!(&expected, &batches);
// And expect that we still get the same thing when data is rolled over again
let chunk = db.rollover_partition("1970-01-01T00").await.unwrap();
assert_eq!(chunk.id(), 1);
let batches = run_query(db, "select * from cpu").await;
assert_table_eq!(&expected, &batches);
assert_batches_sorted_eq!(&expected, &batches);
}
#[tokio::test]
async fn write_with_missing_tags_are_null() {
let db = Arc::new(make_db());
let mut writer = TestLPWriter::default();
// Note the `region` tag is introduced in the second line, so
// the values in prior rows for the region column are
// null. Likewise the `core` tag is introduced in the third
@ -839,9 +972,7 @@ mod tests {
"cpu,core=one user=10.0 11",
];
writer
.write_lp_string(db.as_ref(), &lines.join("\n"))
.unwrap();
write_lp(db.as_ref(), &lines.join("\n"));
assert_eq!(vec!["1970-01-01T00"], db.partition_keys().unwrap());
let mb_chunk = db.rollover_partition("1970-01-01T00").await.unwrap();
@ -864,12 +995,11 @@ mod tests {
async fn read_from_read_buffer() {
// Test that data can be loaded into the ReadBuffer
let db = Arc::new(make_db());
let mut writer = TestLPWriter::default();
writer.write_lp_string(db.as_ref(), "cpu bar=1 10").unwrap();
writer.write_lp_string(db.as_ref(), "cpu bar=2 20").unwrap();
write_lp(db.as_ref(), "cpu bar=1 10");
write_lp(db.as_ref(), "cpu bar=2 20");
let partition_key = "1970-01-01T00";
let mb_chunk = db.rollover_partition("1970-01-01T00").await.unwrap();
let mb_chunk = db.rollover_partition(partition_key).await.unwrap();
let rb_chunk = db
.load_chunk_to_read_buffer(partition_key, mb_chunk.id())
.await
@ -909,14 +1039,221 @@ mod tests {
// cpu").await; assert_table_eq!(expected, &batches);
}
async fn flatten_list_stream(
storage: Arc<ObjectStore>,
prefix: Option<&Path>,
) -> Result<Vec<Path>> {
storage
.list(prefix)
.await?
.map_ok(|v| stream::iter(v).map(Ok))
.try_flatten()
.try_collect()
.await
}
#[tokio::test]
async fn write_one_chunk_one_table_to_parquet_file() {
// Test that data can be written into parquet files
// Create an object store with a specified location in a local disk
let root = TempDir::new().unwrap();
let object_store = Arc::new(ObjectStore::new_file(File::new(root.path())));
// Create a DB given a server id, an object store and a db name
let server_id: NonZeroU32 = NonZeroU32::new(10).unwrap();
let db_name = "parquet_test_db";
let db = Arc::new(make_database(server_id, Arc::clone(&object_store), db_name));
// Write some line protocols in Mutable buffer of the DB
write_lp(db.as_ref(), "cpu bar=1 10");
write_lp(db.as_ref(), "cpu bar=2 20");
// Now mark the MB chunk closed
let partition_key = "1970-01-01T00";
let mb_chunk = db.rollover_partition("1970-01-01T00").await.unwrap();
// Move that MB chunk to RB chunk and drop it from MB
let rb_chunk = db
.load_chunk_to_read_buffer(partition_key, mb_chunk.id())
.await
.unwrap();
// Write the RB chunk to Object Store but keep it in RB
let pq_chunk = db
.write_chunk_to_object_store(partition_key, mb_chunk.id())
.await
.unwrap();
// it should be the same chunk!
assert_eq!(mb_chunk.id(), rb_chunk.id());
assert_eq!(mb_chunk.id(), pq_chunk.id());
// we should have chunks in the mutable buffer, read buffer, and object store
// (Note the currently open chunk is not listed)
assert_eq!(mutable_chunk_ids(&db, partition_key), vec![1]);
assert_eq!(read_buffer_chunk_ids(&db, partition_key), vec![0]);
assert_eq!(read_parquet_file_chunk_ids(&db, partition_key), vec![0]);
// Verify the data written to the parquet file in the object store
// First, there must be exactly one object store path in the catalog
let paths = pq_chunk.object_store_paths();
assert_eq!(paths.len(), 1);
// Check that the path must exist in the object store
let path_list = flatten_list_stream(Arc::clone(&object_store), Some(&paths[0]))
.await
.unwrap();
println!("path_list: {:#?}", path_list);
assert_eq!(path_list.len(), 1);
assert_eq!(path_list, paths.clone());
// Get full string path
let root_path = format!("{:?}", root.path());
let root_path = root_path.trim_matches('"');
let path = format!("{}/{}", root_path, paths[0].display());
println!("path: {}", path);
// Create External table of this parquet file to get its content in a human
// readable form
// Note: We do not care about escaping quotes here because it is just a test
let sql = format!(
"CREATE EXTERNAL TABLE parquet_table STORED AS PARQUET LOCATION '{}'",
path
);
let mut ctx = context::ExecutionContext::new();
let df = ctx.sql(&sql).unwrap();
df.collect().await.unwrap();
// Select data from that table
let sql = "SELECT * FROM parquet_table";
let content = ctx.sql(&sql).unwrap().collect().await.unwrap();
println!("Content: {:?}", content);
let expected = vec![
"+-----+------+",
"| bar | time |",
"+-----+------+",
"| 1 | 10 |",
"| 2 | 20 |",
"+-----+------+",
];
assert_table_eq!(expected, &content);
}
#[tokio::test]
async fn write_one_chunk_many_tables_to_parquet_files() {
// Test that data can be written into parquet files
// Create an object store with a specified location in a local disk
let root = TempDir::new().unwrap();
let object_store = Arc::new(ObjectStore::new_file(File::new(root.path())));
// Create a DB given a server id, an object store and a db name
let server_id: NonZeroU32 = NonZeroU32::new(10).unwrap();
let db_name = "parquet_test_db";
let db = Arc::new(make_database(server_id, Arc::clone(&object_store), db_name));
// Write some line protocols in Mutable buffer of the DB
write_lp(db.as_ref(), "cpu bar=1 10");
write_lp(db.as_ref(), "disk ops=1 20");
write_lp(db.as_ref(), "cpu bar=2 20");
// Now mark the MB chunk closed
let partition_key = "1970-01-01T00";
let mb_chunk = db.rollover_partition("1970-01-01T00").await.unwrap();
// Move that MB chunk to RB chunk and drop it from MB
let rb_chunk = db
.load_chunk_to_read_buffer(partition_key, mb_chunk.id())
.await
.unwrap();
// Write the RB chunk to Object Store but keep it in RB
let pq_chunk = db
.write_chunk_to_object_store(partition_key, mb_chunk.id())
.await
.unwrap();
// it should be the same chunk!
assert_eq!(mb_chunk.id(), rb_chunk.id());
assert_eq!(mb_chunk.id(), pq_chunk.id());
// we should have chunks in the mutable buffer, read buffer, and object store
// (Note the currently open chunk is not listed)
assert_eq!(mutable_chunk_ids(&db, partition_key), vec![1]);
assert_eq!(read_buffer_chunk_ids(&db, partition_key), vec![0]);
assert_eq!(read_parquet_file_chunk_ids(&db, partition_key), vec![0]);
// Verify the data written to the parquet files in the object store
// First, there must be 2 object store paths in the catalog,
// one for each of the 2 files
let paths = pq_chunk.object_store_paths();
assert_eq!(paths.len(), 2);
// Check that the path must exist in the object store
let prefix = object_store.new_path();
let path_list = flatten_list_stream(Arc::clone(&object_store), Some(&prefix))
.await
.unwrap();
println!("path_list: {:#?}", path_list);
assert_eq!(path_list.len(), 2);
// Check the content of each path
//
// Root path
let root_path = format!("{:?}", root.path());
let root_path = root_path.trim_matches('"');
for path in path_list {
// Get full string path
let path_string = format!("{}/{}", root_path, path.display());
println!("path: {}", path_string);
// Create External table of this parquet file to get its content in a human
// readable form
// Note: We do not care about escaping quotes here because it is just a test
let sql = format!(
"CREATE EXTERNAL TABLE parquet_table STORED AS PARQUET LOCATION '{}'",
path_string
);
let mut ctx = context::ExecutionContext::new();
let df = ctx.sql(&sql).unwrap();
df.collect().await.unwrap();
// Select data from that table
let sql = "SELECT * FROM parquet_table";
let content = ctx.sql(&sql).unwrap().collect().await.unwrap();
println!("Content: {:?}", content);
let expected = if path_string.contains("cpu") {
// file name: cpu.parquet
vec![
"+-----+------+",
"| bar | time |",
"+-----+------+",
"| 1 | 10 |",
"| 2 | 20 |",
"+-----+------+",
]
} else {
// file name: disk.parquet
vec![
"+-----+------+",
"| ops | time |",
"+-----+------+",
"| 1 | 20 |",
"+-----+------+",
]
};
assert_table_eq!(expected, &content);
}
}
#[tokio::test]
async fn write_updates_last_write_at() {
let db = make_db();
let before_create = Utc::now();
let partition_key = "1970-01-01T00";
let mut writer = TestLPWriter::default();
writer.write_lp_string(&db, "cpu bar=1 10").unwrap();
write_lp(&db, "cpu bar=1 10");
let after_write = Utc::now();
let last_write_prev = {
@ -929,7 +1266,7 @@ mod tests {
partition.last_write_at()
};
writer.write_lp_string(&db, "cpu bar=1 20").unwrap();
write_lp(&db, "cpu bar=1 20");
{
let partition = db.catalog.valid_partition(partition_key).unwrap();
let partition = partition.read();
@ -943,8 +1280,7 @@ mod tests {
let db = make_db();
// Given data loaded into two chunks
let mut writer = TestLPWriter::default();
writer.write_lp_string(&db, "cpu bar=1 10").unwrap();
write_lp(&db, "cpu bar=1 10");
let after_data_load = Utc::now();
// When the chunk is rolled over
@ -977,9 +1313,8 @@ mod tests {
db.rules.write().lifecycle_rules.mutable_size_threshold =
Some(NonZeroUsize::new(2).unwrap());
let mut writer = TestLPWriter::default();
writer.write_lp_string(&db, "cpu bar=1 10").unwrap();
writer.write_lp_string(&db, "cpu bar=1 20").unwrap();
write_lp(&db, "cpu bar=1 10");
write_lp(&db, "cpu bar=1 20");
let partitions = db.catalog.partition_keys();
assert_eq!(partitions.len(), 1);
@ -996,15 +1331,10 @@ mod tests {
#[tokio::test]
async fn chunks_sorted_by_times() {
let db = make_db();
let mut writer = TestLPWriter::default();
writer.write_lp_string(&db, "cpu val=1 1").unwrap();
writer
.write_lp_string(&db, "mem val=2 400000000000001")
.unwrap();
writer.write_lp_string(&db, "cpu val=1 2").unwrap();
writer
.write_lp_string(&db, "mem val=2 400000000000002")
.unwrap();
write_lp(&db, "cpu val=1 1");
write_lp(&db, "mem val=2 400000000000001");
write_lp(&db, "cpu val=1 2");
write_lp(&db, "mem val=2 400000000000002");
let sort_rules = SortOrder {
order: Order::Desc,
@ -1035,9 +1365,9 @@ mod tests {
// Test that chunk id listing is hooked up
let db = make_db();
let partition_key = "1970-01-01T00";
let mut writer = TestLPWriter::default();
writer.write_lp_string(&db, "cpu bar=1 10").unwrap();
writer.write_lp_string(&db, "cpu bar=1 20").unwrap();
write_lp(&db, "cpu bar=1 10");
write_lp(&db, "cpu bar=1 20");
assert_eq!(mutable_chunk_ids(&db, partition_key), vec![0]);
assert_eq!(
@ -1051,13 +1381,13 @@ mod tests {
// add a new chunk in mutable buffer, and move chunk1 (but
// not chunk 0) to read buffer
writer.write_lp_string(&db, "cpu bar=1 30").unwrap();
write_lp(&db, "cpu bar=1 30");
let mb_chunk = db.rollover_partition("1970-01-01T00").await.unwrap();
db.load_chunk_to_read_buffer(partition_key, mb_chunk.id())
.await
.unwrap();
writer.write_lp_string(&db, "cpu bar=1 40").unwrap();
write_lp(&db, "cpu bar=1 40");
assert_eq!(mutable_chunk_ids(&db, partition_key), vec![0, 2]);
assert_eq!(read_buffer_chunk_ids(&db, partition_key), vec![1]);
@ -1086,15 +1416,12 @@ mod tests {
async fn partition_chunk_summaries() {
// Test that chunk id listing is hooked up
let db = make_db();
let mut writer = TestLPWriter::default();
writer.write_lp_string(&db, "cpu bar=1 1").unwrap();
write_lp(&db, "cpu bar=1 1");
db.rollover_partition("1970-01-01T00").await.unwrap();
// write into a separate partition
writer
.write_lp_string(&db, "cpu bar=1,baz2,frob=3 400000000000000")
.unwrap();
write_lp(&db, "cpu bar=1,baz2,frob=3 400000000000000");
print!("Partitions: {:?}", db.partition_keys().unwrap());
@ -1131,11 +1458,10 @@ mod tests {
#[tokio::test]
async fn partition_chunk_summaries_timestamp() {
let db = make_db();
let mut writer = TestLPWriter::default();
let start = Utc::now();
writer.write_lp_string(&db, "cpu bar=1 1").unwrap();
write_lp(&db, "cpu bar=1 1");
let after_first_write = Utc::now();
writer.write_lp_string(&db, "cpu bar=2 2").unwrap();
write_lp(&db, "cpu bar=2 2");
db.rollover_partition("1970-01-01T00").await.unwrap();
let after_close = Utc::now();
@ -1183,17 +1509,13 @@ mod tests {
async fn chunk_summaries() {
// Test that chunk id listing is hooked up
let db = make_db();
let mut writer = TestLPWriter::default();
// get three chunks: one open, one closed in mb and one closed in rb
writer.write_lp_string(&db, "cpu bar=1 1").unwrap();
write_lp(&db, "cpu bar=1 1");
db.rollover_partition("1970-01-01T00").await.unwrap();
writer.write_lp_string(&db, "cpu bar=1,baz=2 2").unwrap();
writer
.write_lp_string(&db, "cpu bar=1,baz=2,frob=3 400000000000000")
.unwrap();
write_lp(&db, "cpu bar=1,baz=2 2");
write_lp(&db, "cpu bar=1,baz=2,frob=3 400000000000000");
print!("Partitions: {:?}", db.partition_keys().unwrap());
@ -1204,9 +1526,7 @@ mod tests {
print!("Partitions2: {:?}", db.partition_keys().unwrap());
db.rollover_partition("1970-01-05T15").await.unwrap();
writer
.write_lp_string(&db, "cpu bar=1,baz=3,blargh=3 400000000000000")
.unwrap();
write_lp(&db, "cpu bar=1,baz=3,blargh=3 400000000000000");
fn to_arc(s: &str) -> Arc<String> {
Arc::new(s.to_string())
@ -1256,12 +1576,11 @@ mod tests {
async fn partition_summaries() {
// Test that chunk id listing is hooked up
let db = make_db();
let mut writer = TestLPWriter::default();
writer.write_lp_string(&db, "cpu bar=1 1").unwrap();
write_lp(&db, "cpu bar=1 1");
let chunk_id = db.rollover_partition("1970-01-01T00").await.unwrap().id();
writer.write_lp_string(&db, "cpu bar=2,baz=3.0 2").unwrap();
writer.write_lp_string(&db, "mem foo=1 1").unwrap();
write_lp(&db, "cpu bar=2,baz=3.0 2");
write_lp(&db, "mem foo=1 1");
// load a chunk to the read buffer
db.load_chunk_to_read_buffer("1970-01-01T00", chunk_id)
@ -1269,12 +1588,8 @@ mod tests {
.unwrap();
// write into a separate partition
writer
.write_lp_string(&db, "cpu bar=1 400000000000000")
.unwrap();
writer
.write_lp_string(&db, "mem frob=3 400000000000001")
.unwrap();
write_lp(&db, "cpu bar=1 400000000000000");
write_lp(&db, "mem frob=3 400000000000001");
print!("Partitions: {:?}", db.partition_keys().unwrap());
@ -1398,11 +1713,11 @@ mod tests {
// run a sql query against the database, returning the results as record batches
async fn run_query(db: Arc<Db>, query: &str) -> Vec<RecordBatch> {
let planner = SQLQueryPlanner::default();
let executor = Executor::new();
let executor = db.executor();
let physical_plan = planner.query(db, query, &executor).await.unwrap();
let physical_plan = planner.query(db, query, &executor).unwrap();
collect(physical_plan).await.unwrap()
executor.collect(physical_plan).await.unwrap()
}
fn mutable_chunk_ids(db: &Db, partition_key: &str) -> Vec<u32> {
@ -1426,10 +1741,62 @@ mod tests {
.into_iter()
.filter_map(|chunk| match chunk.storage {
ChunkStorage::ReadBuffer => Some(chunk.id),
ChunkStorage::ReadBufferAndObjectStore => Some(chunk.id),
_ => None,
})
.collect();
chunk_ids.sort_unstable();
chunk_ids
}
fn read_parquet_file_chunk_ids(db: &Db, partition_key: &str) -> Vec<u32> {
let mut chunk_ids: Vec<u32> = db
.partition_chunk_summaries(partition_key)
.into_iter()
.filter_map(|chunk| match chunk.storage {
ChunkStorage::ReadBufferAndObjectStore => Some(chunk.id),
ChunkStorage::ObjectStoreOnly => Some(chunk.id),
_ => None,
})
.collect();
chunk_ids.sort_unstable();
chunk_ids
}
#[tokio::test]
async fn write_chunk_to_object_store_in_background() {
// Test that data can be written to object store using a background task
let db = Arc::new(make_db());
// create MB partition
write_lp(db.as_ref(), "cpu bar=1 10");
write_lp(db.as_ref(), "cpu bar=2 20");
// MB => RB
let partition_key = "1970-01-01T00";
let mb_chunk = db.rollover_partition(partition_key).await.unwrap();
let rb_chunk = db
.load_chunk_to_read_buffer(partition_key, mb_chunk.id())
.await
.unwrap();
assert_eq!(mb_chunk.id(), rb_chunk.id());
// RB => OS
let task =
db.write_chunk_to_object_store_in_background(partition_key.to_string(), rb_chunk.id());
let t_start = std::time::Instant::now();
while !task.is_complete() {
tokio::time::sleep(tokio::time::Duration::from_secs(1)).await;
assert!(
std::time::Instant::now() - t_start < std::time::Duration::from_secs(10),
"task deadline exceeded"
);
}
// we should have chunks in the mutable buffer, read buffer, and object store
// (Note the currently open chunk is not listed)
assert_eq!(mutable_chunk_ids(&db, partition_key), vec![1]);
assert_eq!(read_buffer_chunk_ids(&db, partition_key), vec![0]);
assert_eq!(read_parquet_file_chunk_ids(&db, partition_key), vec![0]);
}
}

View File

@ -171,8 +171,12 @@ impl Chunk {
ChunkState::Closing(chunk) => (chunk.size(), ChunkStorage::ClosedMutableBuffer),
ChunkState::Moving(chunk) => (chunk.size(), ChunkStorage::ClosedMutableBuffer),
ChunkState::Moved(chunk) => (chunk.size(), ChunkStorage::ReadBuffer),
ChunkState::WritingToObjectStore(chunk) => (chunk.size(), ChunkStorage::ObjectStore),
ChunkState::WrittenToObjectStore(chunk, _) => (chunk.size(), ChunkStorage::ObjectStore),
ChunkState::WritingToObjectStore(chunk) => {
(chunk.size(), ChunkStorage::ReadBufferAndObjectStore)
}
ChunkState::WrittenToObjectStore(chunk, _) => {
(chunk.size(), ChunkStorage::ReadBufferAndObjectStore)
}
};
ChunkSummary {

View File

@ -1,24 +1,28 @@
use arrow_deps::datafusion::physical_plan::SendableRecordBatchStream;
use internal_types::{schema::Schema, selection::Selection};
use mutable_buffer::chunk::Chunk as MBChunk;
use mutable_buffer::chunk::snapshot::ChunkSnapshot;
use object_store::path::Path;
use observability_deps::tracing::debug;
use parquet_file::chunk::Chunk as ParquetChunk;
use query::{exec::stringset::StringSet, predicate::Predicate, PartitionChunk};
use read_buffer::Chunk as ReadBufferChunk;
use snafu::{ResultExt, Snafu};
use std::{collections::BTreeSet, sync::Arc};
use std::{
collections::{BTreeMap, BTreeSet},
sync::Arc,
};
use super::{
pred::{to_mutable_buffer_predicate, to_read_buffer_predicate},
streams::{MutableBufferChunkStream, ReadFilterResultsStream},
pred::to_read_buffer_predicate,
streams::{MemoryStream, ReadFilterResultsStream},
};
#[derive(Debug, Snafu)]
pub enum Error {
#[snafu(display("Mutable Buffer Chunk Error: {}", source))]
MutableBufferChunk {
source: mutable_buffer::chunk::Error,
source: mutable_buffer::chunk::snapshot::Error,
},
#[snafu(display("Read Buffer Error in chunk {}: {}", chunk_id, source))]
@ -27,6 +31,15 @@ pub enum Error {
chunk_id: u32,
},
#[snafu(display("Read Buffer Error in chunk {}: {}", chunk_id, msg))]
ReadBufferError { chunk_id: u32, msg: String },
#[snafu(display("Parquet File Error in chunk {}: {}", chunk_id, source))]
ParquetFileChunkError {
source: parquet_file::chunk::Error,
chunk_id: u32,
},
#[snafu(display("Internal error restricting schema: {}", source))]
InternalSelectingSchema {
source: internal_types::schema::Error,
@ -58,10 +71,7 @@ pub type Result<T, E = Error> = std::result::Result<T, E>;
#[derive(Debug)]
pub enum DBChunk {
MutableBuffer {
chunk: Arc<MBChunk>,
partition_key: Arc<String>,
/// is this chunk open for writing?
open: bool,
chunk: Arc<ChunkSnapshot>,
},
ReadBuffer {
chunk: Arc<ReadBufferChunk>,
@ -83,36 +93,12 @@ impl DBChunk {
ChunkState::Invalid => {
panic!("Invalid internal state");
}
ChunkState::Open(chunk) => {
// TODO the performance if cloning the chunk is terrible
// Proper performance is tracked in
// https://github.com/influxdata/influxdb_iox/issues/635
let chunk = Arc::new(chunk.clone());
Self::MutableBuffer {
chunk,
partition_key,
open: true,
}
}
ChunkState::Closing(chunk) => {
// TODO the performance if cloning the chunk is terrible
// Proper performance is tracked in
// https://github.com/influxdata/influxdb_iox/issues/635
let chunk = Arc::new(chunk.clone());
Self::MutableBuffer {
chunk,
partition_key,
open: false,
}
}
ChunkState::Moving(chunk) => {
let chunk = Arc::clone(chunk);
Self::MutableBuffer {
chunk,
partition_key,
open: false,
}
}
ChunkState::Open(chunk) | ChunkState::Closing(chunk) => Self::MutableBuffer {
chunk: chunk.snapshot(),
},
ChunkState::Moving(chunk) => Self::MutableBuffer {
chunk: chunk.snapshot(),
},
ChunkState::Moved(chunk) => Self::ReadBuffer {
chunk: Arc::clone(chunk),
partition_key,
@ -128,6 +114,14 @@ impl DBChunk {
};
Arc::new(db_chunk)
}
/// Return object store paths
pub fn object_store_paths(&self) -> Vec<Path> {
match self {
Self::ParquetFile { chunk } => chunk.all_paths(),
_ => vec![],
}
}
}
impl PartitionChunk for DBChunk {
@ -135,15 +129,17 @@ impl PartitionChunk for DBChunk {
fn id(&self) -> u32 {
match self {
Self::MutableBuffer { chunk, .. } => chunk.id(),
Self::MutableBuffer { chunk, .. } => chunk.chunk_id(),
Self::ReadBuffer { chunk, .. } => chunk.id(),
Self::ParquetFile { .. } => unimplemented!("parquet file not implemented"),
Self::ParquetFile { chunk, .. } => chunk.id(),
}
}
fn all_table_names(&self, known_tables: &mut StringSet) {
match self {
Self::MutableBuffer { chunk, .. } => chunk.all_table_names(known_tables),
Self::MutableBuffer { chunk, .. } => {
known_tables.extend(chunk.table_names(None).cloned())
}
Self::ReadBuffer { chunk, .. } => {
// TODO - align APIs so they behave in the same way...
let rb_names = chunk.all_table_names(known_tables);
@ -151,42 +147,22 @@ impl PartitionChunk for DBChunk {
known_tables.insert(name);
}
}
Self::ParquetFile { .. } => unimplemented!("parquet file not implemented"),
Self::ParquetFile { chunk, .. } => chunk.all_table_names(known_tables),
}
}
fn table_names(
&self,
predicate: &Predicate,
_known_tables: &StringSet,
_known_tables: &StringSet, // TODO: Should this be being used?
) -> Result<Option<StringSet>, Self::Error> {
let names = match self {
Self::MutableBuffer { chunk, .. } => {
if chunk.is_empty() {
Some(StringSet::new())
} else {
let chunk_predicate = match to_mutable_buffer_predicate(chunk, predicate) {
Ok(chunk_predicate) => chunk_predicate,
Err(e) => {
debug!(?predicate, %e, "mutable buffer predicate not supported for table_names, falling back");
return Ok(None);
}
};
// we don't support arbitrary expressions in chunk predicate yet
if !chunk_predicate.chunk_exprs.is_empty() {
None
} else {
let names = chunk
.table_names(&chunk_predicate)
.context(MutableBufferChunk)?
.into_iter()
.map(|s| s.to_string())
.collect::<StringSet>();
Some(names)
}
if predicate.has_exprs() {
// TODO: Support more predicates
return Ok(None);
}
chunk.table_names(predicate.range).cloned().collect()
}
Self::ReadBuffer { chunk, .. } => {
// If not supported, ReadBuffer can't answer with
@ -199,26 +175,19 @@ impl PartitionChunk for DBChunk {
}
};
Some(chunk.table_names(&rb_predicate, &BTreeSet::new()))
}
Self::ParquetFile { .. } => {
unimplemented!("parquet file not implemented")
chunk.table_names(&rb_predicate, &BTreeSet::new())
}
Self::ParquetFile { chunk, .. } => chunk.table_names(predicate.range).collect(),
};
// Prune out tables that should not be
// present (based on additional table restrictions of the Predicate)
//
// This is needed because at time of writing, the ReadBuffer's
// table_names implementation doesn't include any way to
// further restrict the tables to a known set of tables
let names = names.map(|names| {
Ok(Some(
names
.into_iter()
.filter(|table_name| predicate.should_include_table(table_name))
.collect()
});
Ok(names)
.collect(),
))
}
fn table_schema(
@ -253,8 +222,12 @@ impl PartitionChunk for DBChunk {
Ok(schema)
}
Self::ParquetFile { .. } => {
unimplemented!("parquet file not implemented for table schema")
Self::ParquetFile { chunk, .. } => {
chunk
.table_schema(table_name, selection)
.context(ParquetFileChunkError {
chunk_id: chunk.id(),
})
}
}
}
@ -263,9 +236,7 @@ impl PartitionChunk for DBChunk {
match self {
Self::MutableBuffer { chunk, .. } => chunk.has_table(table_name),
Self::ReadBuffer { chunk, .. } => chunk.has_table(table_name),
Self::ParquetFile { .. } => {
unimplemented!("parquet file not implemented for has_table")
}
Self::ParquetFile { chunk, .. } => chunk.has_table(table_name),
}
}
@ -277,22 +248,17 @@ impl PartitionChunk for DBChunk {
) -> Result<SendableRecordBatchStream, Self::Error> {
match self {
Self::MutableBuffer { chunk, .. } => {
// Note MutableBuffer doesn't support predicate
// pushdown (other than pruning out the entire chunk
// via `might_pass_predicate`)
if !predicate.is_empty() {
return InternalPredicateNotSupported {
predicate: predicate.clone(),
}
.fail();
}
let schema: Schema = self.table_schema(table_name, selection)?;
let batch = chunk
.read_filter(table_name, selection)
.context(MutableBufferChunk)?;
Ok(Box::pin(MutableBufferChunkStream::new(
Arc::clone(&chunk),
schema.as_arrow(),
table_name,
)))
Ok(Box::pin(MemoryStream::new(batch)))
}
Self::ReadBuffer { chunk, .. } => {
// Error converting to a rb_predicate needs to fail
@ -354,17 +320,11 @@ impl PartitionChunk for DBChunk {
) -> Result<Option<StringSet>, Self::Error> {
match self {
Self::MutableBuffer { chunk, .. } => {
let chunk_predicate = match to_mutable_buffer_predicate(chunk, predicate) {
Ok(chunk_predicate) => chunk_predicate,
Err(e) => {
debug!(?predicate, %e, "mutable buffer predicate not supported for column_names, falling back");
return Ok(None);
}
};
chunk
.column_names(table_name, &chunk_predicate, columns)
.context(MutableBufferChunk)
if !predicate.is_empty() {
// TODO: Support predicates
return Ok(None);
}
Ok(chunk.column_names(table_name, columns))
}
Self::ReadBuffer { chunk, .. } => {
let rb_predicate = match to_read_buffer_predicate(&predicate) {
@ -396,32 +356,47 @@ impl PartitionChunk for DBChunk {
predicate: &Predicate,
) -> Result<Option<StringSet>, Self::Error> {
match self {
Self::MutableBuffer { chunk, .. } => {
use mutable_buffer::chunk::Error::UnsupportedColumnTypeForListingValues;
let chunk_predicate = match to_mutable_buffer_predicate(chunk, predicate) {
Ok(chunk_predicate) => chunk_predicate,
Self::MutableBuffer { .. } => {
// There is no advantage to manually implementing this
// vs just letting DataFusion do its thing
Ok(None)
}
Self::ReadBuffer { chunk, .. } => {
let rb_predicate = match to_read_buffer_predicate(predicate) {
Ok(rb_predicate) => rb_predicate,
Err(e) => {
debug!(?predicate, %e, "mutable buffer predicate not supported for column_values, falling back");
debug!(?predicate, %e, "read buffer predicate not supported for column_values, falling back");
return Ok(None);
}
};
let values = chunk.tag_column_values(table_name, column_name, &chunk_predicate);
let mut values = chunk
.column_values(
table_name,
rb_predicate,
Selection::Some(&[column_name]),
BTreeMap::new(),
)
.context(ReadBufferChunkError {
chunk_id: chunk.id(),
})?;
// if the mutable buffer doesn't support getting
// values for this kind of column, report back None
if let Err(UnsupportedColumnTypeForListingValues { .. }) = values {
Ok(None)
} else {
values.context(MutableBufferChunk)
}
}
Self::ReadBuffer { .. } => {
// TODO hook up read buffer API here when ready. Until
// now, fallback to using a full plan
// https://github.com/influxdata/influxdb_iox/issues/857
Ok(None)
// The InfluxRPC frontend only supports getting column values
// for one column at a time (this is a restriction on the Influx
// Read gRPC API too). However, the Read Buffer supports multiple
// columns and returns a map, so we just need to pull the requested
// column out to get the set of values.
let values = values
.remove(column_name)
.ok_or_else(|| Error::ReadBufferError {
chunk_id: chunk.id(),
msg: format!(
"failed to find column_name {:?} in results of tag_values",
column_name
),
})?;
Ok(Some(values))
}
Self::ParquetFile { .. } => {
unimplemented!("parquet file not implemented for column_values")

View File

@ -20,6 +20,7 @@ pub struct LifecycleManager {
db: Arc<Db>,
db_name: String,
move_task: Option<TaskTracker<Job>>,
write_task: Option<TaskTracker<Job>>,
}
impl LifecycleManager {
@ -30,6 +31,7 @@ impl LifecycleManager {
db,
db_name,
move_task: None,
write_task: None,
}
}
@ -65,9 +67,15 @@ trait ChunkMover {
/// Returns a boolean indicating if a move is in progress
fn is_move_active(&self) -> bool;
/// Returns a boolean indicating if a write is in progress
fn is_write_active(&self) -> bool;
/// Starts an operation to move a chunk to the read buffer
fn move_to_read_buffer(&mut self, partition_key: String, chunk_id: u32);
/// Starts an operation to write a chunk to the object store
fn write_to_object_store(&mut self, partition_key: String, chunk_id: u32);
/// Drops a chunk from the database
fn drop_chunk(&mut self, partition_key: String, chunk_id: u32);
@ -78,10 +86,11 @@ trait ChunkMover {
let mut buffer_size = 0;
// Only want to start a new move task if there isn't one already in-flight
// Only want to start a new move/write task if there isn't one already in-flight
//
// Note: This does not take into account manually triggered tasks
let mut move_active = self.is_move_active();
let mut write_active = self.is_write_active();
// Iterate through the chunks to determine
// - total memory consumption
@ -90,33 +99,44 @@ trait ChunkMover {
// TODO: Track size globally to avoid iterating through all chunks (#1100)
for chunk in &chunks {
let chunk_guard = chunk.upgradable_read();
buffer_size += Self::chunk_size(&*chunk_guard);
if !move_active && can_move(&rules, &*chunk_guard, now) {
match chunk_guard.state() {
ChunkState::Open(_) => {
let mut chunk_guard = RwLockUpgradableReadGuard::upgrade(chunk_guard);
chunk_guard.set_closing().expect("cannot close open chunk");
let would_move = !move_active && can_move(&rules, &*chunk_guard, now);
let would_write = !write_active && rules.persist;
let partition_key = chunk_guard.key().to_string();
let chunk_id = chunk_guard.id();
match chunk_guard.state() {
ChunkState::Open(_) if would_move => {
let mut chunk_guard = RwLockUpgradableReadGuard::upgrade(chunk_guard);
chunk_guard.set_closing().expect("cannot close open chunk");
std::mem::drop(chunk_guard);
let partition_key = chunk_guard.key().to_string();
let chunk_id = chunk_guard.id();
move_active = true;
self.move_to_read_buffer(partition_key, chunk_id);
}
ChunkState::Closing(_) => {
let partition_key = chunk_guard.key().to_string();
let chunk_id = chunk_guard.id();
std::mem::drop(chunk_guard);
std::mem::drop(chunk_guard);
move_active = true;
self.move_to_read_buffer(partition_key, chunk_id);
}
_ => {}
move_active = true;
self.move_to_read_buffer(partition_key, chunk_id);
}
ChunkState::Closing(_) if would_move => {
let partition_key = chunk_guard.key().to_string();
let chunk_id = chunk_guard.id();
std::mem::drop(chunk_guard);
move_active = true;
self.move_to_read_buffer(partition_key, chunk_id);
}
ChunkState::Moved(_) if would_write => {
let partition_key = chunk_guard.key().to_string();
let chunk_id = chunk_guard.id();
std::mem::drop(chunk_guard);
write_active = true;
self.write_to_object_store(partition_key, chunk_id);
}
_ => {}
}
// TODO: Find and recover cancelled move jobs (#1099)
@ -129,8 +149,9 @@ trait ChunkMover {
match chunks.next() {
Some(chunk) => {
let chunk_guard = chunk.read();
if rules.drop_non_persisted
|| matches!(chunk_guard.state(), ChunkState::Moved(_))
if (rules.drop_non_persisted
&& matches!(chunk_guard.state(), ChunkState::Moved(_)))
|| matches!(chunk_guard.state(), ChunkState::WrittenToObjectStore(_, _))
{
let partition_key = chunk_guard.key().to_string();
let chunk_id = chunk_guard.id();
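For readability, the drop-eligibility rule applied above (and exercised by the tests further down) can be restated as a small helper. This is an illustrative sketch only, not part of the change; `can_drop` is a hypothetical name and the `ChunkState` enum is assumed from this module.
fn can_drop(state: &ChunkState, drop_non_persisted: bool) -> bool {
    match state {
        // chunks already persisted to object storage may always be dropped
        ChunkState::WrittenToObjectStore(..) => true,
        // chunks only in the read buffer may be dropped if the user opted in
        ChunkState::Moved(_) => drop_non_persisted,
        // open/closing/moving/writing chunks are never dropped here
        _ => false,
    }
}
Written chunks are always safe to drop because their data is persisted; moved chunks live only in the read buffer, so dropping them is opt-in via `drop_non_persisted`.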
@ -169,6 +190,13 @@ impl ChunkMover for LifecycleManager {
.unwrap_or(false)
}
fn is_write_active(&self) -> bool {
self.write_task
.as_ref()
.map(|x| !x.is_complete())
.unwrap_or(false)
}
fn move_to_read_buffer(&mut self, partition_key: String, chunk_id: u32) {
info!(%partition_key, %chunk_id, "moving chunk to read buffer");
self.move_task = Some(
@ -177,6 +205,14 @@ impl ChunkMover for LifecycleManager {
)
}
fn write_to_object_store(&mut self, partition_key: String, chunk_id: u32) {
info!(%partition_key, %chunk_id, "write chunk to object store");
self.write_task = Some(
self.db
.write_chunk_to_object_store_in_background(partition_key, chunk_id),
)
}
fn drop_chunk(&mut self, partition_key: String, chunk_id: u32) {
info!(%partition_key, %chunk_id, "dropping chunk");
let _ = self
@ -251,9 +287,57 @@ mod tests {
chunk
}
/// Transitions a new ("open") chunk into the "moving" state.
fn transition_to_moving(mut chunk: Chunk) -> Chunk {
chunk.set_closing().unwrap();
chunk.set_moving().unwrap();
chunk
}
/// Transitions a new ("open") chunk into the "moved" state.
fn transition_to_moved(mut chunk: Chunk, rb: &Arc<read_buffer::Chunk>) -> Chunk {
chunk = transition_to_moving(chunk);
chunk.set_moved(Arc::clone(&rb)).unwrap();
chunk
}
/// Transitions a new ("open") chunk into the "writing to object store"
/// state.
fn transition_to_writing_to_object_store(
mut chunk: Chunk,
rb: &Arc<read_buffer::Chunk>,
) -> Chunk {
chunk = transition_to_moved(chunk, rb);
chunk.set_writing_to_object_store().unwrap();
chunk
}
/// Transitions a new ("open") chunk into the "written to object store"
/// state.
fn transition_to_written_to_object_store(
mut chunk: Chunk,
rb: &Arc<read_buffer::Chunk>,
) -> Chunk {
chunk = transition_to_writing_to_object_store(chunk, rb);
let parquet_chunk = new_parquet_chunk(&chunk);
chunk
.set_written_to_object_store(Arc::new(parquet_chunk))
.unwrap();
chunk
}
fn new_parquet_chunk(chunk: &Chunk) -> parquet_file::chunk::Chunk {
parquet_file::chunk::Chunk::new(
chunk.key().to_string(),
chunk.id(),
&tracker::MemRegistry::new(),
)
}
#[derive(Debug, Eq, PartialEq)]
enum MoverEvents {
Move(u32),
Write(u32),
Drop(u32),
}
@ -262,6 +346,7 @@ mod tests {
struct DummyMover {
rules: LifecycleRules,
move_active: bool,
write_active: bool,
chunks: Vec<Arc<RwLock<Chunk>>>,
events: Vec<MoverEvents>,
}
@ -275,6 +360,7 @@ mod tests {
.map(|x| Arc::new(RwLock::new(x)))
.collect(),
move_active: false,
write_active: false,
events: vec![],
}
}
@ -298,6 +384,10 @@ mod tests {
self.move_active
}
fn is_write_active(&self) -> bool {
self.write_active
}
fn move_to_read_buffer(&mut self, _: String, chunk_id: u32) {
let chunk = self
.chunks
@ -308,7 +398,22 @@ mod tests {
self.events.push(MoverEvents::Move(chunk_id))
}
fn write_to_object_store(&mut self, _partition_key: String, chunk_id: u32) {
let chunk = self
.chunks
.iter()
.find(|x| x.read().id() == chunk_id)
.unwrap();
chunk.write().set_writing_to_object_store().unwrap();
self.events.push(MoverEvents::Write(chunk_id))
}
fn drop_chunk(&mut self, _: String, chunk_id: u32) {
self.chunks = self
.chunks
.drain(..)
.filter(|x| x.read().id() != chunk_id)
.collect();
self.events.push(MoverEvents::Drop(chunk_id))
}
@ -467,7 +572,56 @@ mod tests {
}
#[test]
fn test_buffer_size_soft() {
fn test_buffer_size_soft_drop_non_persisted() {
// test that chunk mover only drops moved and written chunks
// IMPORTANT: the lifecycle rules have the default `persist` flag (false) so NO
// "write" events will be triggered
let rules = LifecycleRules {
buffer_size_soft: Some(NonZeroUsize::new(5).unwrap()),
drop_non_persisted: true,
..Default::default()
};
let rb = Arc::new(read_buffer::Chunk::new_with_memory_tracker(
22,
&tracker::MemRegistry::new(),
));
let chunks = vec![new_chunk(0, Some(0), Some(0))];
let mut mover = DummyMover::new(rules.clone(), chunks);
mover.check_for_work(from_secs(10));
assert_eq!(mover.events, vec![]);
let chunks = vec![
// two "open" chunks => they must not be dropped (yet)
new_chunk(0, Some(0), Some(0)),
new_chunk(1, Some(0), Some(0)),
// "moved" chunk => can be dropped because `drop_non_persistent=true`
transition_to_moved(new_chunk(2, Some(0), Some(0)), &rb),
// "writing" chunk => cannot be drop while write is in-progess
transition_to_writing_to_object_store(new_chunk(3, Some(0), Some(0)), &rb),
// "written" chunk => can be dropped
transition_to_written_to_object_store(new_chunk(4, Some(0), Some(0)), &rb),
];
let mut mover = DummyMover::new(rules, chunks);
mover.check_for_work(from_secs(10));
assert_eq!(
mover.events,
vec![MoverEvents::Drop(2), MoverEvents::Drop(4)]
);
}
#[test]
fn test_buffer_size_soft_dont_drop_non_persisted() {
// test that chunk mover only drops written chunks
// IMPORTANT: the lifecycle rules have the default `persist` flag (false) so NO
// "write" events will be triggered
let rules = LifecycleRules {
buffer_size_soft: Some(NonZeroUsize::new(5).unwrap()),
..Default::default()
@ -485,21 +639,27 @@ mod tests {
mover.check_for_work(from_secs(10));
assert_eq!(mover.events, vec![]);
let mut chunks = vec![
let chunks = vec![
// two "open" chunks => they must not be dropped (yet)
new_chunk(0, Some(0), Some(0)),
new_chunk(1, Some(0), Some(0)),
new_chunk(2, Some(0), Some(0)),
// "moved" chunk => cannot be dropped because `drop_non_persistent=false`
transition_to_moved(new_chunk(2, Some(0), Some(0)), &rb),
// "writing" chunk => cannot be drop while write is in-progess
transition_to_writing_to_object_store(new_chunk(3, Some(0), Some(0)), &rb),
// "written" chunk => can be dropped
transition_to_written_to_object_store(new_chunk(4, Some(0), Some(0)), &rb),
];
chunks[2].set_closing().unwrap();
chunks[2].set_moving().unwrap();
chunks[2].set_moved(Arc::clone(&rb)).unwrap();
let mut mover = DummyMover::new(rules, chunks);
mover.check_for_work(from_secs(10));
assert_eq!(mover.events, vec![MoverEvents::Drop(2)]);
assert_eq!(mover.events, vec![MoverEvents::Drop(4)]);
}
#[test]
fn test_buffer_size_soft_no_op() {
// check that we don't drop anything if there is nothing to drop
let rules = LifecycleRules {
buffer_size_soft: Some(NonZeroUsize::new(40).unwrap()),
..Default::default()
@ -512,4 +672,33 @@ mod tests {
mover.check_for_work(from_secs(10));
assert_eq!(mover.events, vec![]);
}
#[test]
fn test_persist() {
let rules = LifecycleRules {
mutable_linger_seconds: Some(NonZeroU32::new(10).unwrap()),
persist: true,
..Default::default()
};
let rb = Arc::new(read_buffer::Chunk::new_with_memory_tracker(
22,
&tracker::MemRegistry::new(),
));
let chunks = vec![
// still moving => cannot write
transition_to_moving(new_chunk(0, Some(0), Some(0))),
// moved => write to object store
transition_to_moved(new_chunk(1, Some(0), Some(0)), &rb),
// moved, but there will already be a write in progress (previous chunk) => don't write
transition_to_moved(new_chunk(2, Some(0), Some(0)), &rb),
];
let mut mover = DummyMover::new(rules, chunks);
mover.check_for_work(from_secs(0));
assert_eq!(mover.events, vec![MoverEvents::Write(1)]);
}
}

View File

@ -3,7 +3,6 @@
use std::convert::TryFrom;
use mutable_buffer::{chunk::Chunk, pred::ChunkPredicate};
use query::predicate::Predicate;
use snafu::Snafu;
@ -11,15 +10,6 @@ use snafu::Snafu;
pub enum Error {
#[snafu(display("Error translating predicate: {}", msg))]
ReadBufferPredicate { msg: String, pred: Predicate },
#[snafu(display("Error building predicate for mutable buffer: {}", source))]
MutableBufferPredicate { source: mutable_buffer::pred::Error },
}
impl From<mutable_buffer::pred::Error> for Error {
fn from(source: mutable_buffer::pred::Error) -> Self {
Self::MutableBufferPredicate { source }
}
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
@ -52,25 +42,6 @@ pub fn to_read_buffer_predicate(predicate: &Predicate) -> Result<read_buffer::Pr
}
}
/// Converts a [`query::Predicate`] into [`ChunkPredicate`],
/// suitable for evaluating on the MutableBuffer.
pub fn to_mutable_buffer_predicate(
chunk: impl AsRef<Chunk>,
predicate: &Predicate,
) -> Result<ChunkPredicate> {
let predicate = chunk
.as_ref()
.predicate_builder()?
.table_names(predicate.table_names.as_ref())?
.field_names(predicate.field_columns.as_ref())?
.range(predicate.range)?
// it would be nice to avoid cloning all the exprs here.
.exprs(predicate.exprs.clone())?
.build();
Ok(predicate)
}
#[cfg(test)]
pub mod test {
use super::*;
@ -196,7 +167,6 @@ pub mod test {
Error::ReadBufferPredicate { msg, pred: _ } => {
assert_eq!(msg, exp.to_owned());
}
_ => panic!("Unexpected error type"),
}
}
}

View File

@ -1,15 +1,9 @@
//! Adapter streams for different Chunk types that implement the interface
//! needed by DataFusion
use arrow_deps::{
arrow::{
datatypes::SchemaRef,
error::{ArrowError, Result as ArrowResult},
record_batch::RecordBatch,
},
arrow::{datatypes::SchemaRef, error::Result as ArrowResult, record_batch::RecordBatch},
datafusion::physical_plan::RecordBatchStream,
};
use internal_types::selection::Selection;
use mutable_buffer::chunk::Chunk as MBChunk;
use read_buffer::ReadFilterResults;
use std::{
@ -17,99 +11,6 @@ use std::{
task::{Context, Poll},
};
use snafu::{ResultExt, Snafu};
#[derive(Debug, Snafu)]
pub enum Error {
#[snafu(display(
"Error getting data for table '{}' chunk {}: {}",
table_name,
chunk_id,
source
))]
GettingTableData {
table_name: String,
chunk_id: u32,
source: mutable_buffer::chunk::Error,
},
}
/// Adapter which will produce record batches from a mutable buffer
/// chunk on demand
pub(crate) struct MutableBufferChunkStream {
/// Requested output schema (includes selection)
schema: SchemaRef,
chunk: Arc<MBChunk>,
table_name: Arc<String>,
/// Vector of record batches to send in reverse order (send data[len-1]
/// next) Is None until the first call to poll_next
data: Option<Vec<RecordBatch>>,
}
impl MutableBufferChunkStream {
pub fn new(chunk: Arc<MBChunk>, schema: SchemaRef, table_name: impl Into<String>) -> Self {
Self {
chunk,
schema,
table_name: Arc::new(table_name.into()),
data: None,
}
}
// gets the next batch, as needed
fn next_batch(&mut self) -> ArrowResult<Option<RecordBatch>> {
if self.data.is_none() {
// Want all the columns in the schema. Note we don't
// use `Selection::All` here because the mutable buffer chunk would interpret it
// as "all columns in the table in that chunk" rather than
// all columns this query needs
let selected_cols = self
.schema
.fields()
.iter()
.map(|f| f.name() as &str)
.collect::<Vec<_>>();
let selection = Selection::Some(&selected_cols);
let mut data = Vec::new();
self.chunk
.table_to_arrow(&mut data, self.table_name.as_ref(), selection)
.context(GettingTableData {
table_name: self.table_name.as_ref(),
chunk_id: self.chunk.id(),
})
.map_err(|e| ArrowError::ExternalError(Box::new(e)))?;
// reverse the array so we can pop off the back
data.reverse();
self.data = Some(data);
}
// self.data was set to Some above
Ok(self.data.as_mut().unwrap().pop())
}
}
impl RecordBatchStream for MutableBufferChunkStream {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}
}
impl futures::Stream for MutableBufferChunkStream {
type Item = ArrowResult<RecordBatch>;
fn poll_next(
mut self: std::pin::Pin<&mut Self>,
_: &mut Context<'_>,
) -> Poll<Option<Self::Item>> {
Poll::Ready(self.next_batch().transpose())
}
// TODO is there a useful size_hint to pass?
}
/// Adapter which will take a ReadFilterResults and make it an async stream
pub struct ReadFilterResultsStream {
read_results: ReadFilterResults,
@ -143,3 +44,42 @@ impl futures::Stream for ReadFilterResultsStream {
// TODO is there a useful size_hint to pass?
}
/// A RecordBatchStream created from a single RecordBatch
///
/// Unfortunately datafusion's MemoryStream is crate-local
#[derive(Debug)]
pub(crate) struct MemoryStream {
schema: SchemaRef,
batch: Option<RecordBatch>,
}
impl MemoryStream {
pub fn new(batch: RecordBatch) -> Self {
Self {
schema: batch.schema(),
batch: Some(batch),
}
}
}
impl RecordBatchStream for MemoryStream {
fn schema(&self) -> SchemaRef {
Arc::clone(&self.schema)
}
}
impl futures::Stream for MemoryStream {
type Item = ArrowResult<RecordBatch>;
fn poll_next(
mut self: std::pin::Pin<&mut Self>,
_: &mut Context<'_>,
) -> Poll<Option<Self::Item>> {
Poll::Ready(self.batch.take().map(Ok))
}
fn size_hint(&self) -> (usize, Option<usize>) {
(1, Some(1))
}
}
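A minimal usage sketch, not part of the diff, showing how the new `MemoryStream` can be drained with `futures::StreamExt` to recover its single batch; the `drain_one` helper is hypothetical and assumes the surrounding module's imports.
use futures::StreamExt;

// Hypothetical helper: pull the one batch back out of a MemoryStream.
async fn drain_one(batch: RecordBatch) -> ArrowResult<Vec<RecordBatch>> {
    let mut stream = MemoryStream::new(batch);
    let mut out = Vec::new();
    while let Some(b) = stream.next().await {
        out.push(b?);
    }
    Ok(out)
}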

View File

@ -67,6 +67,7 @@
clippy::clone_on_ref_ptr
)]
use std::convert::TryInto;
use std::sync::Arc;
use async_trait::async_trait;
@ -83,11 +84,11 @@ use data_types::{
};
use influxdb_line_protocol::ParsedLine;
use internal_types::{
data::{lines_to_replicated_write, ReplicatedWrite},
entry::{self, lines_to_sharded_entries, Entry},
once::OnceNonZeroU32,
};
use object_store::{path::ObjectStorePath, ObjectStore, ObjectStoreApi};
use query::{exec::Executor, Database, DatabaseStore};
use query::{exec::Executor, DatabaseStore};
use tracker::{TaskId, TaskRegistration, TaskRegistryWithHistory, TaskTracker, TrackedFutureExt};
use futures::{pin_mut, FutureExt};
@ -98,15 +99,20 @@ use crate::{
},
db::Db,
};
use internal_types::entry::SequencedEntry;
use std::num::NonZeroU32;
pub mod buffer;
mod config;
pub mod db;
mod query_tests;
pub mod snapshot;
#[cfg(test)]
mod query_tests;
// This module exposes `query_tests` outside of the crate so that it may be used
// in benchmarks. Do not import this module for non-benchmark purposes!
pub mod benchmarks {
pub use crate::query_tests::*;
}
type DatabaseError = Box<dyn std::error::Error + Send + Sync + 'static>;
@ -147,6 +153,12 @@ pub enum Error {
DatabaseAlreadyExists { db_name: String },
#[snafu(display("error appending to wal buffer: {}", source))]
WalError { source: buffer::Error },
#[snafu(display("error converting line protocol to flatbuffers: {}", source))]
LineConversion { source: entry::Error },
#[snafu(display("error decoding entry flatbuffers: {}", source))]
DecodingEntry {
source: flatbuffers::InvalidFlatbuffer,
},
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
@ -179,6 +191,38 @@ impl JobRegistry {
const STORE_ERROR_PAUSE_SECONDS: u64 = 100;
/// Used to configure a server instance
#[derive(Debug)]
pub struct ServerConfig {
// number of executor worker threads. If not specified, defaults
// to number of cores on the system.
num_worker_threads: Option<usize>,
/// The `ObjectStore` instance to use for persistence
object_store: Arc<ObjectStore>,
}
impl ServerConfig {
/// Create a new config using the specified store
pub fn new(object_store: Arc<ObjectStore>) -> Self {
Self {
num_worker_threads: None,
object_store,
}
}
/// Use `num` worker threads for running queries
pub fn with_num_worker_threads(mut self, num: usize) -> Self {
self.num_worker_threads = Some(num);
self
}
/// Return a handle to the object store in this configuration
pub fn store(&self) -> Arc<ObjectStore> {
Arc::clone(&self.object_store)
}
}
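A minimal usage sketch of the new `ServerConfig` builder, not part of the change; it mirrors the `config()` test helpers later in this diff, and the in-memory store plus the worker-thread count of 4 are illustrative assumptions.
use object_store::{memory::InMemory, ObjectStore};
use std::sync::Arc;

fn example_config() -> ServerConfig {
    // In-memory store for illustration; a real deployment would configure a
    // persistent object store backend instead.
    let store = Arc::new(ObjectStore::new_in_memory(InMemory::new()));
    ServerConfig::new(store).with_num_worker_threads(4)
}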
/// `Server` is the container struct for how servers store data internally, as
/// well as how they communicate with other servers. Each server will have one
/// of these structs, which keeps track of all replication and query rules.
@ -188,7 +232,7 @@ pub struct Server<M: ConnectionManager> {
config: Arc<Config>,
connection_manager: Arc<M>,
pub store: Arc<ObjectStore>,
executor: Arc<Executor>,
exec: Arc<Executor>,
jobs: Arc<JobRegistry>,
}
@ -205,15 +249,21 @@ impl<E> From<Error> for UpdateError<E> {
}
impl<M: ConnectionManager> Server<M> {
pub fn new(connection_manager: M, store: Arc<ObjectStore>) -> Self {
pub fn new(connection_manager: M, config: ServerConfig) -> Self {
let jobs = Arc::new(JobRegistry::new());
let ServerConfig {
num_worker_threads,
object_store,
} = config;
let num_worker_threads = num_worker_threads.unwrap_or_else(num_cpus::get);
Self {
id: Default::default(),
config: Arc::new(Config::new(Arc::clone(&jobs))),
store,
store: object_store,
connection_manager: Arc::new(connection_manager),
executor: Arc::new(Executor::new()),
exec: Arc::new(Executor::new(num_worker_threads)),
jobs,
}
}
@ -232,12 +282,7 @@ impl<M: ConnectionManager> Server<M> {
}
/// Tells the server the set of rules for a database.
pub async fn create_database(
&self,
rules: DatabaseRules,
server_id: NonZeroU32,
object_store: Arc<ObjectStore>,
) -> Result<()> {
pub async fn create_database(&self, rules: DatabaseRules, server_id: NonZeroU32) -> Result<()> {
// Return an error if this server hasn't yet been setup with an id
self.require_id()?;
let db_reservation = self.config.create_db(rules)?;
@ -245,7 +290,7 @@ impl<M: ConnectionManager> Server<M> {
self.persist_database_rules(db_reservation.rules().clone())
.await?;
db_reservation.commit(server_id, object_store);
db_reservation.commit(server_id, Arc::clone(&self.store), Arc::clone(&self.exec));
Ok(())
}
@ -300,6 +345,7 @@ impl<M: ConnectionManager> Server<M> {
.map(|mut path| {
let store = Arc::clone(&self.store);
let config = Arc::clone(&self.config);
let exec = Arc::clone(&self.exec);
path.set_file_name(DB_RULES_FILE_NAME);
@ -325,7 +371,7 @@ impl<M: ConnectionManager> Server<M> {
}
Ok(rules) => match config.create_db(rules) {
Err(e) => error!("error adding database to config: {}", e),
Ok(handle) => handle.commit(server_id, store),
Ok(handle) => handle.commit(server_id, store, exec),
},
}
})
@ -337,12 +383,12 @@ impl<M: ConnectionManager> Server<M> {
Ok(())
}
/// `write_lines` takes in raw line protocol and converts it to a
/// `ReplicatedWrite`, which is then replicated to other servers based
/// on the configuration of the `db`. This is step #1 from the crate
/// level documentation.
/// `write_lines` takes in raw line protocol and converts it to a collection
/// of `ShardedEntry` values, which are then sent to other IOx servers based
/// on the `ShardConfig`, or to the local database for buffering in the
/// WriteBuffer and/or the MutableBuffer if configured.
pub async fn write_lines(&self, db_name: &str, lines: &[ParsedLine<'_>]) -> Result<()> {
let id = self.require_id()?.get();
self.require_id()?;
let db_name = DatabaseName::new(db_name).context(InvalidDatabaseName)?;
let db = self
@ -350,62 +396,52 @@ impl<M: ConnectionManager> Server<M> {
.db(&db_name)
.context(DatabaseNotFound { db_name: &*db_name })?;
let sequence = db.next_sequence();
let write = lines_to_replicated_write(id, sequence, lines, &*db.rules.read());
let sharded_entries = lines_to_sharded_entries(
lines,
db.rules.read().shard_config.as_ref(),
&*db.rules.read(),
)
.context(LineConversion)?;
self.handle_replicated_write(&db_name, &db, write).await?;
for e in sharded_entries {
// TODO: handle sending to shards based on ShardConfig
self.handle_write_entry(&db, e.entry).await?;
}
Ok(())
}
pub async fn handle_replicated_write(
pub async fn write_entry(&self, db_name: &str, entry_bytes: Vec<u8>) -> Result<()> {
self.require_id()?;
let db_name = DatabaseName::new(db_name).context(InvalidDatabaseName)?;
let db = self
.config
.db(&db_name)
.context(DatabaseNotFound { db_name: &*db_name })?;
let entry = entry_bytes.try_into().context(DecodingEntry)?;
self.handle_write_entry(&db, entry).await
}
pub async fn handle_write_entry(&self, db: &Db, entry: Entry) -> Result<()> {
db.store_entry(entry)
.map_err(|e| Error::UnknownDatabaseError {
source: Box::new(e),
})?;
Ok(())
}
pub async fn handle_sequenced_entry(
&self,
db_name: &DatabaseName<'_>,
db: &Db,
write: ReplicatedWrite,
sequenced_entry: SequencedEntry,
) -> Result<()> {
match db.store_replicated_write(&write) {
Err(db::Error::DatabaseNotWriteable {}) | Ok(_) => {}
Err(e) => {
return Err(Error::UnknownDatabaseError {
source: Box::new(e),
})
}
}
let write = Arc::new(write);
if let Some(wal_buffer) = &db.wal_buffer {
let persist;
let segment = {
let mut wal_buffer = wal_buffer.lock();
persist = wal_buffer.persist;
// TODO: address this issue?
// the mutable buffer and the wal buffer have different locking mechanisms,
// which means that it's possible for a mutable buffer write to
// succeed while a WAL buffer write fails, which would then
// return an error. A single lock is probably undesirable, but
// we need to figure out what semantics we want.
wal_buffer.append(Arc::clone(&write)).context(WalError)?
};
if let Some(segment) = segment {
if persist {
let writer_id = self.require_id()?.get();
let store = Arc::clone(&self.store);
let (_, tracker) = self.jobs.register(Job::PersistSegment {
writer_id,
segment_id: segment.id,
});
segment
.persist_bytes_in_background(tracker, writer_id, db_name, store)
.context(WalError)?;
}
}
}
db.store_sequenced_entry(sequenced_entry)
.map_err(|e| Error::UnknownDatabaseError {
source: Box::new(e),
})?;
Ok(())
}
@ -574,12 +610,8 @@ where
let db = match self.db(&db_name) {
Some(db) => db,
None => {
self.create_database(
DatabaseRules::new(db_name.clone()),
self.require_id()?,
Arc::clone(&self.store),
)
.await?;
self.create_database(DatabaseRules::new(db_name.clone()), self.require_id()?)
.await?;
self.db(&db_name).expect("db not inserted")
}
};
@ -587,8 +619,9 @@ where
Ok(db)
}
/// Return a handle to the query executor
fn executor(&self) -> Arc<Executor> {
Arc::clone(&self.executor)
Arc::clone(&self.exec)
}
}
@ -610,12 +643,17 @@ pub trait ConnectionManager {
pub trait RemoteServer {
type Error: std::error::Error + Send + Sync + 'static;
/// Sends a replicated write to a remote server. This is step #2 from the
/// diagram.
async fn replicate(
/// Sends an Entry to the remote server. An IOx server acting as a
/// router/sharder will call this method to send entries to remotes.
async fn write_entry(&self, db: &str, entry: Entry) -> Result<(), Self::Error>;
/// Sends a SequencedEntry to the remote server. An IOx server acting as a
/// write buffer will call this method to replicate to other write
/// buffer servers or to send data to downstream subscribers.
async fn write_sequenced_entry(
&self,
db: &str,
replicated_write: &ReplicatedWrite,
sequenced_entry: SequencedEntry,
) -> Result<(), Self::Error>;
}
@ -643,10 +681,19 @@ pub struct RemoteServerImpl {}
impl RemoteServer for RemoteServerImpl {
type Error = Error;
async fn replicate(
/// Sends an Entry to the remote server. An IOx server acting as a
/// router/sharder will call this method to send entries to remotes.
async fn write_entry(&self, _db: &str, _entry: Entry) -> Result<(), Self::Error> {
unimplemented!()
}
/// Sends a SequencedEntry to the remote server. An IOx server acting as a
/// write buffer will call this method to replicate to other write
/// buffer servers or to send data to downstream subscribers.
async fn write_sequenced_entry(
&self,
_db: &str,
_replicated_write: &ReplicatedWrite,
_sequenced_entry: SequencedEntry,
) -> Result<(), Self::Error> {
unimplemented!()
}
@ -675,28 +722,27 @@ mod tests {
use async_trait::async_trait;
use futures::TryStreamExt;
use parking_lot::Mutex;
use snafu::Snafu;
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use arrow_deps::{assert_table_eq, datafusion::physical_plan::collect};
use data_types::database_rules::{
PartitionTemplate, TemplatePart, WalBufferConfig, WalBufferRollover,
};
use arrow_deps::assert_table_eq;
use data_types::database_rules::{PartitionTemplate, TemplatePart, NO_SHARD_CONFIG};
use influxdb_line_protocol::parse_lines;
use object_store::{memory::InMemory, path::ObjectStorePath};
use query::{frontend::sql::SQLQueryPlanner, Database};
use crate::buffer::Segment;
use super::*;
fn config() -> ServerConfig {
ServerConfig::new(Arc::new(ObjectStore::new_in_memory(InMemory::new())))
.with_num_worker_threads(1)
}
#[tokio::test]
async fn server_api_calls_return_error_with_no_id_set() {
let manager = TestConnectionManager::new();
let store = Arc::new(ObjectStore::new_in_memory(InMemory::new()));
let server = Server::new(manager, store);
let server = Server::new(manager, config());
let resp = server.require_id().unwrap_err();
assert!(matches!(resp, Error::IdNotSet));
@ -709,8 +755,9 @@ mod tests {
#[tokio::test]
async fn create_database_persists_rules() {
let manager = TestConnectionManager::new();
let store = Arc::new(ObjectStore::new_in_memory(InMemory::new()));
let server = Server::new(manager, Arc::clone(&store));
let config = config();
let store = config.store();
let server = Server::new(manager, config);
server.set_id(NonZeroU32::new(1).unwrap()).unwrap();
let name = DatabaseName::new("bananas").unwrap();
@ -727,11 +774,7 @@ mod tests {
// Create a database
server
.create_database(
rules.clone(),
server.require_id().unwrap(),
Arc::clone(&server.store),
)
.create_database(rules.clone(), server.require_id().unwrap())
.await
.expect("failed to create database");
@ -759,7 +802,6 @@ mod tests {
.create_database(
DatabaseRules::new(db2.clone()),
server.require_id().unwrap(),
Arc::clone(&server.store),
)
.await
.expect("failed to create 2nd db");
@ -767,7 +809,8 @@ mod tests {
store.list_with_delimiter(&store.new_path()).await.unwrap();
let manager = TestConnectionManager::new();
let server2 = Server::new(manager, store);
let config2 = ServerConfig::new(store).with_num_worker_threads(1);
let server2 = Server::new(manager, config2);
server2.set_id(NonZeroU32::new(1).unwrap()).unwrap();
server2.load_database_configs().await.unwrap();
@ -780,8 +823,7 @@ mod tests {
// Covers #643
let manager = TestConnectionManager::new();
let store = Arc::new(ObjectStore::new_in_memory(InMemory::new()));
let server = Server::new(manager, store);
let server = Server::new(manager, config());
server.set_id(NonZeroU32::new(1).unwrap()).unwrap();
let name = DatabaseName::new("bananas").unwrap();
@ -791,7 +833,6 @@ mod tests {
.create_database(
DatabaseRules::new(name.clone()),
server.require_id().unwrap(),
Arc::clone(&server.store),
)
.await
.expect("failed to create database");
@ -801,7 +842,6 @@ mod tests {
.create_database(
DatabaseRules::new(name.clone()),
server.require_id().unwrap(),
Arc::clone(&server.store),
)
.await
.unwrap_err();
@ -814,8 +854,7 @@ mod tests {
#[tokio::test]
async fn db_names_sorted() {
let manager = TestConnectionManager::new();
let store = Arc::new(ObjectStore::new_in_memory(InMemory::new()));
let server = Server::new(manager, store);
let server = Server::new(manager, config());
server.set_id(NonZeroU32::new(1).unwrap()).unwrap();
let names = vec!["bar", "baz"];
@ -823,11 +862,7 @@ mod tests {
for name in &names {
let name = DatabaseName::new(name.to_string()).unwrap();
server
.create_database(
DatabaseRules::new(name),
server.require_id().unwrap(),
Arc::clone(&server.store),
)
.create_database(DatabaseRules::new(name), server.require_id().unwrap())
.await
.expect("failed to create database");
}
@ -839,17 +874,12 @@ mod tests {
#[tokio::test]
async fn writes_local() {
let manager = TestConnectionManager::new();
let store = Arc::new(ObjectStore::new_in_memory(InMemory::new()));
let server = Server::new(manager, store);
let server = Server::new(manager, config());
server.set_id(NonZeroU32::new(1).unwrap()).unwrap();
let name = DatabaseName::new("foo".to_string()).unwrap();
server
.create_database(
DatabaseRules::new(name),
server.require_id().unwrap(),
Arc::clone(&server.store),
)
.create_database(DatabaseRules::new(name), server.require_id().unwrap())
.await
.unwrap();
@ -864,10 +894,52 @@ mod tests {
let executor = server.executor();
let physical_plan = planner
.query(db, "select * from cpu", executor.as_ref())
.unwrap();
let batches = executor.collect(physical_plan).await.unwrap();
let expected = vec![
"+-----+------+",
"| bar | time |",
"+-----+------+",
"| 1 | 10 |",
"+-----+------+",
];
assert_table_eq!(expected, &batches);
}
#[tokio::test]
async fn write_entry_local() {
let manager = TestConnectionManager::new();
let server = Server::new(manager, config());
server.set_id(NonZeroU32::new(1).unwrap()).unwrap();
let name = DatabaseName::new("foo".to_string()).unwrap();
server
.create_database(DatabaseRules::new(name), server.require_id().unwrap())
.await
.unwrap();
let batches = collect(physical_plan).await.unwrap();
let db_name = DatabaseName::new("foo").unwrap();
let db = server.db(&db_name).unwrap();
let line = "cpu bar=1 10";
let lines: Vec<_> = parse_lines(line).map(|l| l.unwrap()).collect();
let sharded_entries = lines_to_sharded_entries(&lines, NO_SHARD_CONFIG, &*db.rules.read())
.expect("sharded entries");
let entry = &sharded_entries[0].entry;
server
.write_entry("foo", entry.data().into())
.await
.expect("write entry");
let planner = SQLQueryPlanner::default();
let executor = server.executor();
let physical_plan = planner
.query(db, "select * from cpu", executor.as_ref())
.unwrap();
let batches = executor.collect(physical_plan).await.unwrap();
let expected = vec![
"+-----+------+",
"| bar | time |",
@ -882,8 +954,7 @@ mod tests {
async fn close_chunk() {
test_helpers::maybe_start_logging();
let manager = TestConnectionManager::new();
let store = Arc::new(ObjectStore::new_in_memory(InMemory::new()));
let server = Arc::new(Server::new(manager, store));
let server = Arc::new(Server::new(manager, config()));
let cancel_token = CancellationToken::new();
let background_handle = spawn_worker(Arc::clone(&server), cancel_token.clone());
@ -895,7 +966,6 @@ mod tests {
.create_database(
DatabaseRules::new(db_name.clone()),
server.require_id().unwrap(),
Arc::clone(&server.store),
)
.await
.unwrap();
@ -945,71 +1015,10 @@ mod tests {
let _ = background_handle.await;
}
#[tokio::test]
async fn segment_persisted_on_rollover() {
let manager = TestConnectionManager::new();
let store = Arc::new(ObjectStore::new_in_memory(InMemory::new()));
let server = Server::new(manager, Arc::clone(&store));
server.set_id(NonZeroU32::new(1).unwrap()).unwrap();
let db_name = DatabaseName::new("my_db").unwrap();
let rules = DatabaseRules {
name: db_name.clone(),
partition_template: Default::default(),
wal_buffer_config: Some(WalBufferConfig {
buffer_size: 500,
segment_size: 10,
buffer_rollover: WalBufferRollover::ReturnError,
store_segments: true,
close_segment_after: None,
}),
lifecycle_rules: Default::default(),
shard_config: None,
};
server
.create_database(
rules,
server.require_id().unwrap(),
Arc::clone(&server.store),
)
.await
.unwrap();
let lines = parsed_lines("disk,host=a used=10.1 12");
server.write_lines(db_name.as_str(), &lines).await.unwrap();
// write lines should have caused a segment rollover and persist, wait
tokio::task::yield_now().await;
let mut path = store.new_path();
path.push_all_dirs(&["1", "my_db", "wal", "000", "000"]);
path.set_file_name("001.segment");
let data = store
.get(&path)
.await
.unwrap()
.map_ok(|b| bytes::BytesMut::from(&b[..]))
.try_concat()
.await
.unwrap();
let segment = Segment::from_file_bytes(&data).unwrap();
assert_eq!(segment.writes.len(), 1);
let write = r#"
writer:1, sequence:1, checksum:2741956553
partition_key:
table:disk
host:a used:10.1 time:12
"#;
assert_eq!(segment.writes[0].to_string(), write);
}
#[tokio::test]
async fn background_task_cleans_jobs() {
let manager = TestConnectionManager::new();
let store = Arc::new(ObjectStore::new_in_memory(InMemory::new()));
let server = Arc::new(Server::new(manager, store));
let server = Arc::new(Server::new(manager, config()));
let cancel_token = CancellationToken::new();
let background_handle = spawn_worker(Arc::clone(&server), cancel_token.clone());
@ -1057,24 +1066,22 @@ partition_key:
}
#[derive(Debug, Default)]
struct TestRemoteServer {
writes: Mutex<BTreeMap<String, Vec<ReplicatedWrite>>>,
}
struct TestRemoteServer {}
#[async_trait]
impl RemoteServer for TestRemoteServer {
type Error = TestClusterError;
async fn replicate(
&self,
db: &str,
replicated_write: &ReplicatedWrite,
) -> Result<(), Self::Error> {
let mut writes = self.writes.lock();
let entries = writes.entry(db.to_string()).or_insert_with(Vec::new);
entries.push(replicated_write.clone());
async fn write_entry(&self, _db: &str, _entry: Entry) -> Result<(), Self::Error> {
unimplemented!()
}
Ok(())
async fn write_sequenced_entry(
&self,
_db: &str,
_sequenced_entry: SequencedEntry,
) -> Result<(), Self::Error> {
unimplemented!()
}
}

View File

@ -1,3 +1,4 @@
#![allow(unused_imports, dead_code, unused_macros)]
pub mod field_columns;
pub mod read_filter;
pub mod read_group;

View File

@ -4,10 +4,7 @@ use arrow_deps::{
datafusion::logical_plan::{col, lit},
};
use query::{
exec::{
fieldlist::{Field, FieldList},
Executor,
},
exec::fieldlist::{Field, FieldList},
frontend::influxrpc::InfluxRPCPlanner,
predicate::PredicateBuilder,
};
@ -31,11 +28,10 @@ macro_rules! run_field_columns_test_case {
println!("Running scenario '{}'", scenario_name);
println!("Predicate: '{:#?}'", predicate);
let planner = InfluxRPCPlanner::new();
let executor = Executor::new();
let executor = db.executor();
let plan = planner
.field_columns(&db, predicate.clone())
.await
.expect("built plan successfully");
let fields = executor
.to_field_list(plan)
@ -133,11 +129,9 @@ async fn test_field_name_plan() {
println!("Running scenario '{}'", scenario_name);
println!("Predicate: '{:#?}'", predicate);
let planner = InfluxRPCPlanner::new();
let executor = Executor::new();
let plan = planner
.field_columns(&db, predicate.clone())
.await
.expect("built plan successfully");
let mut plans = plan.plans;
@ -146,7 +140,8 @@ async fn test_field_name_plan() {
// run the created plan directly, ensuring the output is as
// expected (specifically that the column ordering is correct)
let results = executor
let results = db
.executor()
.run_logical_plan(plan)
.await
.expect("ok running plan");

View File

@ -4,11 +4,11 @@ use crate::query_tests::scenarios::*;
use arrow_deps::datafusion::logical_plan::{col, lit};
use async_trait::async_trait;
use query::{
exec::Executor,
frontend::influxrpc::InfluxRPCPlanner,
predicate::{Predicate, PredicateBuilder, EMPTY_PREDICATE},
};
#[derive(Debug)]
pub struct TwoMeasurementsMultiSeries {}
#[async_trait]
impl DBSetup for TwoMeasurementsMultiSeries {
@ -46,14 +46,12 @@ macro_rules! run_read_filter_test_case {
println!("Running scenario '{}'", scenario_name);
println!("Predicate: '{:#?}'", predicate);
let planner = InfluxRPCPlanner::new();
let executor = Executor::new();
let plan = planner
.read_filter(&db, predicate.clone())
.await
.expect("built plan successfully");
let string_results = run_series_set_plan(executor, plan).await;
let string_results = run_series_set_plan(db.executor(), plan).await;
assert_eq!(
expected_results, string_results,
@ -310,6 +308,7 @@ async fn test_read_filter_data_pred_unsupported_in_scan() {
run_read_filter_test_case!(TwoMeasurementsMultiSeries {}, predicate, expected_results);
}
#[derive(Debug)]
pub struct MeasurementsSortableTags {}
#[async_trait]
impl DBSetup for MeasurementsSortableTags {

View File

@ -4,7 +4,6 @@ use crate::query_tests::scenarios::*;
use arrow_deps::{arrow::util::pretty::pretty_format_batches, datafusion::prelude::*};
use async_trait::async_trait;
use query::{
exec::Executor,
frontend::influxrpc::InfluxRPCPlanner,
group_by::Aggregate,
predicate::{Predicate, PredicateBuilder},
@ -26,11 +25,9 @@ macro_rules! run_read_group_test_case {
println!("Running scenario '{}'", scenario_name);
println!("Predicate: '{:#?}'", predicate);
let planner = InfluxRPCPlanner::new();
let executor = Executor::new();
let plans = planner
.read_group(&db, predicate.clone(), agg, &group_columns)
.await
.expect("built plan successfully");
let plans = plans.into_inner();
@ -46,7 +43,8 @@ macro_rules! run_read_group_test_case {
let mut string_results = vec![];
for plan in plans.into_iter() {
let batches = executor
let batches = db
.executor()
.run_logical_plan(plan.plan)
.await
.expect("ok running plan");

View File

@ -1,14 +1,15 @@
//! Tests for the Influx gRPC queries
use crate::query_tests::{scenarios::*, utils::make_db};
use crate::{
db::test_helpers::write_lp,
query_tests::{scenarios::*, utils::make_db},
};
use arrow_deps::{arrow::util::pretty::pretty_format_batches, datafusion::prelude::*};
use async_trait::async_trait;
use query::{
exec::Executor,
frontend::influxrpc::InfluxRPCPlanner,
group_by::{Aggregate, WindowDuration},
predicate::{Predicate, PredicateBuilder},
test::TestLPWriter,
};
/// runs read_window_aggregate(predicate) and compares it to the expected
@ -28,18 +29,17 @@ macro_rules! run_read_window_aggregate_test_case {
println!("Running scenario '{}'", scenario_name);
println!("Predicate: '{:#?}'", predicate);
let planner = InfluxRPCPlanner::new();
let executor = Executor::new();
let plans = planner
.read_window_aggregate(&db, predicate.clone(), agg, every.clone(), offset.clone())
.await
.expect("built plan successfully");
let plans = plans.into_inner();
let mut string_results = vec![];
for plan in plans.into_iter() {
let batches = executor
let batches = db
.executor()
.run_logical_plan(plan.plan)
.await
.expect("ok running plan");
@ -162,18 +162,16 @@ impl DBSetup for MeasurementForWindowAggregateMonths {
// "2020-04-02T00"]
let db = make_db();
let mut writer = TestLPWriter::default();
let data = lp_lines.join("\n");
writer.write_lp_string(&db, &data).unwrap();
write_lp(&db, &data);
let scenario1 = DBScenario {
scenario_name: "Data in 4 partitions, open chunks of mutable buffer".into(),
db,
};
let db = make_db();
let mut writer = TestLPWriter::default();
let data = lp_lines.join("\n");
writer.write_lp_string(&db, &data).unwrap();
write_lp(&db, &data);
db.rollover_partition("2020-03-01T00").await.unwrap();
db.rollover_partition("2020-03-02T00").await.unwrap();
let scenario2 = DBScenario {
@ -184,9 +182,8 @@ impl DBSetup for MeasurementForWindowAggregateMonths {
};
let db = make_db();
let mut writer = TestLPWriter::default();
let data = lp_lines.join("\n");
writer.write_lp_string(&db, &data).unwrap();
write_lp(&db, &data);
rollover_and_load(&db, "2020-03-01T00").await;
rollover_and_load(&db, "2020-03-02T00").await;
rollover_and_load(&db, "2020-04-01T00").await;

View File

@ -1,9 +1,6 @@
//! Tests for the Influx gRPC queries
use query::{
exec::{
stringset::{IntoStringSet, StringSetRef},
Executor,
},
exec::stringset::{IntoStringSet, StringSetRef},
frontend::influxrpc::InfluxRPCPlanner,
predicate::{Predicate, PredicateBuilder, EMPTY_PREDICATE},
};
@ -23,13 +20,12 @@ macro_rules! run_table_names_test_case {
println!("Running scenario '{}'", scenario_name);
println!("Predicate: '{:#?}'", predicate);
let planner = InfluxRPCPlanner::new();
let executor = Executor::new();
let plan = planner
.table_names(&db, predicate.clone())
.await
.expect("built plan successfully");
let names = executor
let names = db
.executor()
.to_string_set(plan)
.await
.expect("converted plan to strings successfully");

View File

@ -1,9 +1,6 @@
use arrow_deps::datafusion::logical_plan::{col, lit};
use query::{
exec::{
stringset::{IntoStringSet, StringSetRef},
Executor,
},
exec::stringset::{IntoStringSet, StringSetRef},
frontend::influxrpc::InfluxRPCPlanner,
predicate::PredicateBuilder,
};
@ -27,13 +24,12 @@ macro_rules! run_tag_keys_test_case {
println!("Running scenario '{}'", scenario_name);
println!("Predicate: '{:#?}'", predicate);
let planner = InfluxRPCPlanner::new();
let executor = Executor::new();
let plan = planner
.tag_keys(&db, predicate.clone())
.await
.expect("built plan successfully");
let names = executor
let names = db
.executor()
.to_string_set(plan)
.await
.expect("converted plan to strings successfully");

View File

@ -1,9 +1,6 @@
use arrow_deps::datafusion::logical_plan::{col, lit};
use query::{
exec::{
stringset::{IntoStringSet, StringSetRef},
Executor,
},
exec::stringset::{IntoStringSet, StringSetRef},
frontend::influxrpc::InfluxRPCPlanner,
predicate::PredicateBuilder,
};
@ -25,13 +22,12 @@ macro_rules! run_tag_values_test_case {
println!("Running scenario '{}'", scenario_name);
println!("Predicate: '{:#?}'", predicate);
let planner = InfluxRPCPlanner::new();
let executor = Executor::new();
let plan = planner
.tag_values(&db, &tag_name, predicate.clone())
.await
.expect("built plan successfully");
let names = executor
let names = db
.executor()
.to_string_set(plan)
.await
.expect("converted plan to strings successfully");
@ -239,7 +235,7 @@ async fn list_tag_values_field_col() {
// Test: temp is a field, not a tag
let tag_name = "temp";
let plan_result = planner.tag_values(&db, &tag_name, predicate.clone()).await;
let plan_result = planner.tag_values(&db, &tag_name, predicate.clone());
assert_eq!(
plan_result.unwrap_err().to_string(),

View File

@ -51,7 +51,7 @@ pub fn dump_series_set(s: SeriesSet) -> Vec<String> {
}
/// Run a series set plan to completion and produce a Vec<String> representation
pub async fn run_series_set_plan(executor: Executor, plans: SeriesSetPlans) -> Vec<String> {
pub async fn run_series_set_plan(executor: Arc<Executor>, plans: SeriesSetPlans) -> Vec<String> {
// Use a channel sufficiently large to buffer the series
let (tx, mut rx) = mpsc::channel(100);
executor

View File

@ -1,14 +1,16 @@
//! This module contains testing scenarios for Db
use query::{test::TestLPWriter, PartitionChunk};
#[allow(unused_imports, dead_code, unused_macros)]
use query::PartitionChunk;
use async_trait::async_trait;
use crate::db::Db;
use crate::db::{test_helpers::write_lp, Db};
use super::utils::{count_mutable_buffer_chunks, count_read_buffer_chunks, make_db};
/// Holds a database and a description of how its data was configured
#[derive(Debug)]
pub struct DBScenario {
pub scenario_name: String,
pub db: Db,
@ -22,6 +24,7 @@ pub trait DBSetup {
}
/// No data
#[derive(Debug)]
pub struct NoData {}
#[async_trait]
impl DBSetup for NoData {
@ -47,8 +50,7 @@ impl DBSetup for NoData {
let db = make_db();
let data = "cpu,region=west user=23.2 100";
let mut writer = TestLPWriter::default();
writer.write_lp_string(&db, data).unwrap();
write_lp(&db, data);
// move data out of open chunk
assert_eq!(db.rollover_partition(partition_key).await.unwrap().id(), 0);
@ -77,6 +79,7 @@ impl DBSetup for NoData {
}
/// Two measurements data in a single mutable buffer chunk
#[derive(Debug)]
pub struct TwoMeasurements {}
#[async_trait]
impl DBSetup for TwoMeasurements {
@ -92,6 +95,7 @@ impl DBSetup for TwoMeasurements {
}
}
#[derive(Debug)]
pub struct TwoMeasurementsUnsignedType {}
#[async_trait]
impl DBSetup for TwoMeasurementsUnsignedType {
@ -110,6 +114,7 @@ impl DBSetup for TwoMeasurementsUnsignedType {
/// Single measurement that has several different chunks with
/// different (but compatible) schema
#[derive(Debug)]
pub struct MultiChunkSchemaMerge {}
#[async_trait]
impl DBSetup for MultiChunkSchemaMerge {
@ -129,6 +134,7 @@ impl DBSetup for MultiChunkSchemaMerge {
}
/// Two measurements data with many null values
#[derive(Debug)]
pub struct TwoMeasurementsManyNulls {}
#[async_trait]
impl DBSetup for TwoMeasurementsManyNulls {
@ -150,6 +156,7 @@ impl DBSetup for TwoMeasurementsManyNulls {
}
}
#[derive(Debug)]
pub struct TwoMeasurementsManyFields {}
#[async_trait]
impl DBSetup for TwoMeasurementsManyFields {
@ -169,12 +176,12 @@ impl DBSetup for TwoMeasurementsManyFields {
}
}
#[derive(Debug)]
pub struct TwoMeasurementsManyFieldsOneChunk {}
#[async_trait]
impl DBSetup for TwoMeasurementsManyFieldsOneChunk {
async fn make(&self) -> Vec<DBScenario> {
let db = make_db();
let mut writer = TestLPWriter::default();
let lp_lines = vec![
"h2o,state=MA,city=Boston temp=70.4 50",
@ -184,7 +191,7 @@ impl DBSetup for TwoMeasurementsManyFieldsOneChunk {
"o2,state=CA temp=79.0 300",
];
writer.write_lp_string(&db, &lp_lines.join("\n")).unwrap();
write_lp(&db, &lp_lines.join("\n"));
vec![DBScenario {
scenario_name: "Data in open chunk of mutable buffer".into(),
db,
@ -192,6 +199,7 @@ impl DBSetup for TwoMeasurementsManyFieldsOneChunk {
}
}
#[derive(Debug)]
pub struct OneMeasurementManyFields {}
#[async_trait]
impl DBSetup for OneMeasurementManyFields {
@ -212,6 +220,7 @@ impl DBSetup for OneMeasurementManyFields {
}
/// This data (from end to end test)
#[derive(Debug)]
pub struct EndToEndTest {}
#[async_trait]
impl DBSetup for EndToEndTest {
@ -231,9 +240,7 @@ impl DBSetup for EndToEndTest {
let lp_data = lp_lines.join("\n");
let db = make_db();
let mut writer = TestLPWriter::default();
let res = writer.write_lp_string(&db, &lp_data);
assert!(res.is_ok(), "Error: {}", res.unwrap_err());
write_lp(&db, &lp_data);
let scenario1 = DBScenario {
scenario_name: "Data in open chunk of mutable buffer".into(),
@ -251,16 +258,14 @@ impl DBSetup for EndToEndTest {
/// Data in one only read buffer chunk
pub(crate) async fn make_one_chunk_scenarios(partition_key: &str, data: &str) -> Vec<DBScenario> {
let db = make_db();
let mut writer = TestLPWriter::default();
writer.write_lp_string(&db, data).unwrap();
write_lp(&db, data);
let scenario1 = DBScenario {
scenario_name: "Data in open chunk of mutable buffer".into(),
db,
};
let db = make_db();
let mut writer = TestLPWriter::default();
writer.write_lp_string(&db, data).unwrap();
write_lp(&db, data);
db.rollover_partition(partition_key).await.unwrap();
let scenario2 = DBScenario {
scenario_name: "Data in closed chunk of mutable buffer".into(),
@ -268,8 +273,7 @@ pub(crate) async fn make_one_chunk_scenarios(partition_key: &str, data: &str) ->
};
let db = make_db();
let mut writer = TestLPWriter::default();
writer.write_lp_string(&db, data).unwrap();
write_lp(&db, data);
db.rollover_partition(partition_key).await.unwrap();
db.load_chunk_to_read_buffer(partition_key, 0)
.await
@ -294,9 +298,8 @@ pub async fn make_two_chunk_scenarios(
data2: &str,
) -> Vec<DBScenario> {
let db = make_db();
let mut writer = TestLPWriter::default();
writer.write_lp_string(&db, data1).unwrap();
writer.write_lp_string(&db, data2).unwrap();
write_lp(&db, data1);
write_lp(&db, data2);
let scenario1 = DBScenario {
scenario_name: "Data in single open chunk of mutable buffer".into(),
db,
@ -304,10 +307,9 @@ pub async fn make_two_chunk_scenarios(
// spread across 2 mutable buffer chunks
let db = make_db();
let mut writer = TestLPWriter::default();
writer.write_lp_string(&db, data1).unwrap();
write_lp(&db, data1);
db.rollover_partition(partition_key).await.unwrap();
writer.write_lp_string(&db, data2).unwrap();
write_lp(&db, data2);
let scenario2 = DBScenario {
scenario_name: "Data in one open chunk and one closed chunk of mutable buffer".into(),
db,
@ -315,13 +317,12 @@ pub async fn make_two_chunk_scenarios(
// spread across 1 mutable buffer, 1 read buffer chunks
let db = make_db();
let mut writer = TestLPWriter::default();
writer.write_lp_string(&db, data1).unwrap();
write_lp(&db, data1);
db.rollover_partition(partition_key).await.unwrap();
db.load_chunk_to_read_buffer(partition_key, 0)
.await
.unwrap();
writer.write_lp_string(&db, data2).unwrap();
write_lp(&db, data2);
let scenario3 = DBScenario {
scenario_name: "Data in open chunk of mutable buffer, and one chunk of read buffer".into(),
db,
@ -329,10 +330,9 @@ pub async fn make_two_chunk_scenarios(
// in 2 read buffer chunks
let db = make_db();
let mut writer = TestLPWriter::default();
writer.write_lp_string(&db, data1).unwrap();
write_lp(&db, data1);
db.rollover_partition(partition_key).await.unwrap();
writer.write_lp_string(&db, data2).unwrap();
write_lp(&db, data2);
db.rollover_partition(partition_key).await.unwrap();
db.load_chunk_to_read_buffer(partition_key, 0)

View File

@ -3,11 +3,11 @@
//! wired all the pieces together (as well as ensure any particularly
//! important SQL does not regress)
#![allow(unused_imports, dead_code, unused_macros)]
use super::scenarios::*;
use arrow_deps::{
arrow::record_batch::RecordBatch, assert_table_eq, datafusion::physical_plan::collect,
};
use query::{exec::Executor, frontend::sql::SQLQueryPlanner};
use arrow_deps::{arrow::record_batch::RecordBatch, assert_batches_sorted_eq};
use query::frontend::sql::SQLQueryPlanner;
use std::sync::Arc;
/// runs table_names(predicate) and compares it to the expected
@ -25,16 +25,16 @@ macro_rules! run_sql_test_case {
println!("Running scenario '{}'", scenario_name);
println!("SQL: '{:#?}'", sql);
let planner = SQLQueryPlanner::default();
let executor = Executor::new();
let executor = db.executor();
let physical_plan = planner
.query(db, &sql, &executor)
.await
.query(db, &sql, executor.as_ref())
.expect("built plan successfully");
let results: Vec<RecordBatch> = collect(physical_plan).await.expect("Running plan");
let results: Vec<RecordBatch> =
executor.collect(physical_plan).await.expect("Running plan");
assert_table_eq!($EXPECTED_LINES, &results);
assert_batches_sorted_eq!($EXPECTED_LINES, &results);
}
};
}
@ -278,7 +278,7 @@ async fn sql_select_from_system_tables() {
"+----+---------------+-------------------+-----------------+",
"| id | partition_key | storage | estimated_bytes |",
"+----+---------------+-------------------+-----------------+",
"| 0 | 1970-01-01T00 | OpenMutableBuffer | 493 |",
"| 0 | 1970-01-01T00 | OpenMutableBuffer | 453 |",
"+----+---------------+-------------------+-----------------+",
];
run_sql_test_case!(
@ -291,13 +291,13 @@ async fn sql_select_from_system_tables() {
"+---------------+------------+-------------+-------+",
"| partition_key | table_name | column_name | count |",
"+---------------+------------+-------------+-------+",
"| 1970-01-01T00 | h2o | state | 3 |",
"| 1970-01-01T00 | h2o | city | 3 |",
"| 1970-01-01T00 | h2o | other_temp | 2 |",
"| 1970-01-01T00 | h2o | state | 3 |",
"| 1970-01-01T00 | h2o | temp | 1 |",
"| 1970-01-01T00 | h2o | time | 3 |",
"| 1970-01-01T00 | h2o | other_temp | 2 |",
"| 1970-01-01T00 | o2 | state | 2 |",
"| 1970-01-01T00 | o2 | city | 1 |",
"| 1970-01-01T00 | o2 | state | 2 |",
"| 1970-01-01T00 | o2 | temp | 2 |",
"| 1970-01-01T00 | o2 | time | 2 |",
"| 1970-01-01T00 | o2 | reading | 1 |",

View File

@ -1,5 +1,7 @@
//! Tests for the table_names implementation
#![allow(unused_imports, dead_code, unused_macros)]
use arrow_deps::arrow::datatypes::DataType;
use internal_types::{schema::builder::SchemaBuilder, selection::Selection};
use query::{Database, PartitionChunk};

View File

@ -4,7 +4,7 @@ use data_types::{
DatabaseName,
};
use object_store::{memory::InMemory, ObjectStore};
use query::Database;
use query::{exec::Executor, Database};
use crate::{db::Db, JobRegistry};
use std::{num::NonZeroU32, sync::Arc};
@ -13,11 +13,25 @@ use std::{num::NonZeroU32, sync::Arc};
pub fn make_db() -> Db {
let server_id: NonZeroU32 = NonZeroU32::new(1).unwrap();
let object_store = Arc::new(ObjectStore::new_in_memory(InMemory::new()));
let exec = Arc::new(Executor::new(1));
Db::new(
DatabaseRules::new(DatabaseName::new("placeholder").unwrap()),
server_id,
object_store,
exec,
None, // wal buffer
Arc::new(JobRegistry::new()),
)
}
pub fn make_database(server_id: NonZeroU32, object_store: Arc<ObjectStore>, db_name: &str) -> Db {
let exec = Arc::new(Executor::new(1));
Db::new(
DatabaseRules::new(DatabaseName::new(db_name.to_string()).unwrap()),
server_id,
object_store,
exec,
None, // wal buffer
Arc::new(JobRegistry::new()),
)

View File

@ -273,12 +273,13 @@ mod tests {
};
use super::*;
use crate::db::test_helpers::write_lp;
use data_types::database_rules::DatabaseRules;
use data_types::DatabaseName;
use futures::TryStreamExt;
use mutable_buffer::chunk::Chunk as ChunkWB;
use object_store::memory::InMemory;
use query::{test::TestLPWriter, Database};
use query::{exec::Executor, Database};
use tracker::MemRegistry;
#[tokio::test]
@ -291,8 +292,7 @@ mem,host=A,region=west used=45 1
"#;
let db = make_db();
let mut writer = TestLPWriter::default();
writer.write_lp_string(&db, &lp).unwrap();
write_lp(&db, &lp);
let store = Arc::new(ObjectStore::new_in_memory(InMemory::new()));
let (tx, rx) = tokio::sync::oneshot::channel();
@ -354,9 +354,7 @@ mem,host=A,region=west used=45 1
let registry = MemRegistry::new();
let store = Arc::new(ObjectStore::new_in_memory(InMemory::new()));
let chunk = Arc::new(DBChunk::MutableBuffer {
chunk: Arc::new(ChunkWB::new(11, &registry)),
partition_key: Arc::new("key".to_string()),
open: false,
chunk: ChunkWB::new(11, &registry).snapshot(),
});
let mut metadata_path = store.new_path();
metadata_path.push_dir("meta");
@ -393,11 +391,13 @@ mem,host=A,region=west used=45 1
pub fn make_db() -> Db {
let object_store = Arc::new(ObjectStore::new_in_memory(InMemory::new()));
let server_id = std::num::NonZeroU32::new(1).unwrap();
let exec = Arc::new(Executor::new(1));
Db::new(
DatabaseRules::new(DatabaseName::new("placeholder").unwrap()),
server_id,
object_store,
exec,
None, // wal buffer
Arc::new(JobRegistry::new()),
)

View File

@ -105,6 +105,10 @@ struct Create {
#[structopt(long)]
drop_non_persisted: bool,
/// Persists chunks to object storage.
#[structopt(long)]
persist: bool,
/// Do not allow writing new data to this database
#[structopt(long)]
immutable: bool,
@ -173,6 +177,7 @@ pub async fn command(url: String, config: Config) -> Result<()> {
buffer_size_hard: command.buffer_size_hard as _,
sort_order: None, // Server-side default
drop_non_persisted: command.drop_non_persisted,
persist: command.persist,
immutable: command.immutable,
}),

View File

@ -106,6 +106,16 @@ pub struct Config {
#[structopt(long = "--data-dir", env = "INFLUXDB_IOX_DB_DIR")]
pub database_directory: Option<PathBuf>,
/// The number of threads to use for the query worker pool.
///
/// IOx uses `--num-threads` threads for handling API requests and
/// will use a dedicated thread pool with `--num-worker-threads`
/// for running queries.
///
/// If not specified, defaults to the number of cores on the system
#[structopt(long = "--num-worker-threads", env = "INFLUXDB_IOX_NUM_WORKER_THREADS")]
pub num_worker_threads: Option<usize>,
#[structopt(
long = "--object-store",
env = "INFLUXDB_IOX_OBJECT_STORE",

View File

@ -10,7 +10,10 @@ use object_store::{
};
use observability_deps::tracing::{self, error, info, warn, Instrument};
use panic_logging::SendPanicsToTracing;
use server::{ConnectionManagerImpl as ConnectionManager, Server as AppServer};
use server::{
ConnectionManagerImpl as ConnectionManager, Server as AppServer,
ServerConfig as AppServerConfig,
};
use snafu::{ResultExt, Snafu};
use std::{convert::TryFrom, fs, net::SocketAddr, path::PathBuf, sync::Arc};
@ -124,9 +127,20 @@ pub async fn main(logging_level: LoggingLevel, config: Config) -> Result<()> {
let object_store = ObjectStore::try_from(&config)?;
let object_storage = Arc::new(object_store);
let server_config = AppServerConfig::new(object_storage);
let server_config = if let Some(n) = config.num_worker_threads {
info!(
num_worker_threads = n,
"Using specified number of worker threads"
);
server_config.with_num_worker_threads(n)
} else {
server_config
};
let connection_manager = ConnectionManager {};
let app_server = Arc::new(AppServer::new(connection_manager, object_storage));
let app_server = Arc::new(AppServer::new(connection_manager, server_config));
// if this ID isn't set the server won't be usable until this is set via an API
// call

View File

@ -12,7 +12,6 @@
// Influx crates
use super::super::commands::metrics;
use arrow_deps::datafusion::physical_plan::collect;
use data_types::{
http::WalMetadataQuery,
names::{org_and_bucket_to_database, OrgBucketMappingError},
@ -32,7 +31,7 @@ use http::header::{CONTENT_ENCODING, CONTENT_TYPE};
use hyper::{Body, Method, Request, Response, StatusCode};
use observability_deps::{
opentelemetry::KeyValue,
tracing::{self, debug, error, info},
tracing::{self, debug, error},
};
use routerify::{prelude::*, Middleware, RequestInfo, Router, RouterError, RouterService};
use serde::Deserialize;
@ -312,11 +311,11 @@ where
Router::builder()
.data(server)
.middleware(Middleware::pre(|req| async move {
info!(request = ?req, "Processing request");
debug!(request = ?req, "Processing request");
Ok(req)
}))
.middleware(Middleware::post(|res| async move {
info!(response = ?res, "Successfully processed request");
debug!(response = ?res, "Successfully processed request");
Ok(res)
})) // this endpoint is for API backward compatibility with InfluxDB 2.x
.post("/api/v2/write", write::<M>)
@ -523,12 +522,12 @@ async fn query<M: ConnectionManager + Send + Sync + Debug + 'static>(
let physical_plan = planner
.query(db, &q, executor.as_ref())
.await
.context(PlanningSQLQuery { query: &q })?;
// TODO: stream read results out rather than rendering the
// whole thing in mem
let batches = collect(physical_plan)
let batches = executor
.collect(physical_plan)
.await
.map_err(|e| Box::new(e) as _)
.context(Query { db_name })?;
@ -733,27 +732,24 @@ mod tests {
use std::net::{IpAddr, Ipv4Addr, SocketAddr};
use arrow_deps::{arrow::record_batch::RecordBatch, assert_table_eq};
use query::exec::Executor;
use reqwest::{Client, Response};
use data_types::{
database_rules::{DatabaseRules, WalBufferConfig, WalBufferRollover},
wal::WriterSummary,
DatabaseName,
};
use data_types::{database_rules::DatabaseRules, DatabaseName};
use object_store::{memory::InMemory, ObjectStore};
use serde::de::DeserializeOwned;
use server::{db::Db, ConnectionManagerImpl};
use server::{db::Db, ConnectionManagerImpl, ServerConfig as AppServerConfig};
use std::num::NonZeroU32;
use test_helpers::assert_contains;
fn config() -> AppServerConfig {
AppServerConfig::new(Arc::new(ObjectStore::new_in_memory(InMemory::new())))
.with_num_worker_threads(1)
}
#[tokio::test]
async fn test_health() {
let test_storage = Arc::new(AppServer::new(
ConnectionManagerImpl {},
Arc::new(ObjectStore::new_in_memory(InMemory::new())),
));
let server_url = test_server(Arc::clone(&test_storage));
let app_server = Arc::new(AppServer::new(ConnectionManagerImpl {}, config()));
let server_url = test_server(Arc::clone(&app_server));
let client = Client::new();
let response = client.get(&format!("{}/health", server_url)).send().await;
@ -764,20 +760,16 @@ mod tests {
#[tokio::test]
async fn test_write() {
let test_storage = Arc::new(AppServer::new(
ConnectionManagerImpl {},
Arc::new(ObjectStore::new_in_memory(InMemory::new())),
));
test_storage.set_id(NonZeroU32::new(1).unwrap()).unwrap();
test_storage
let app_server = Arc::new(AppServer::new(ConnectionManagerImpl {}, config()));
app_server.set_id(NonZeroU32::new(1).unwrap()).unwrap();
app_server
.create_database(
DatabaseRules::new(DatabaseName::new("MyOrg_MyBucket").unwrap()),
test_storage.require_id().unwrap(),
Arc::clone(&test_storage.store),
app_server.require_id().unwrap(),
)
.await
.unwrap();
let server_url = test_server(Arc::clone(&test_storage));
let server_url = test_server(Arc::clone(&app_server));
let client = Client::new();
@ -798,7 +790,7 @@ mod tests {
check_response("write", response, StatusCode::NO_CONTENT, "").await;
// Check that the data got into the right bucket
let test_db = test_storage
let test_db = app_server
.db(&DatabaseName::new("MyOrg_MyBucket").unwrap())
.expect("Database exists");
@ -816,20 +808,16 @@ mod tests {
#[tokio::test]
async fn test_write_metrics() {
metrics::init_metrics_for_test();
let test_storage = Arc::new(AppServer::new(
ConnectionManagerImpl {},
Arc::new(ObjectStore::new_in_memory(InMemory::new())),
));
test_storage.set_id(NonZeroU32::new(1).unwrap()).unwrap();
test_storage
let app_server = Arc::new(AppServer::new(ConnectionManagerImpl {}, config()));
app_server.set_id(NonZeroU32::new(1).unwrap()).unwrap();
app_server
.create_database(
DatabaseRules::new(DatabaseName::new("MetricsOrg_MetricsBucket").unwrap()),
test_storage.require_id().unwrap(),
Arc::clone(&test_storage.store),
app_server.require_id().unwrap(),
)
.await
.unwrap();
let server_url = test_server(Arc::clone(&test_storage));
let server_url = test_server(Arc::clone(&app_server));
let client = Client::new();
@ -878,20 +866,16 @@ mod tests {
/// Returns a client for communicating with the server, and the server
/// endpoint.
async fn setup_test_data() -> (Client, String) {
let test_storage: Arc<AppServer<ConnectionManagerImpl>> = Arc::new(AppServer::new(
ConnectionManagerImpl {},
Arc::new(ObjectStore::new_in_memory(InMemory::new())),
));
test_storage.set_id(NonZeroU32::new(1).unwrap()).unwrap();
test_storage
let app_server = Arc::new(AppServer::new(ConnectionManagerImpl {}, config()));
app_server.set_id(NonZeroU32::new(1).unwrap()).unwrap();
app_server
.create_database(
DatabaseRules::new(DatabaseName::new("MyOrg_MyBucket").unwrap()),
test_storage.require_id().unwrap(),
Arc::clone(&test_storage.store),
app_server.require_id().unwrap(),
)
.await
.unwrap();
let server_url = test_server(Arc::clone(&test_storage));
let server_url = test_server(Arc::clone(&app_server));
let client = Client::new();
@ -1015,20 +999,16 @@ mod tests {
#[tokio::test]
async fn test_gzip_write() {
let test_storage = Arc::new(AppServer::new(
ConnectionManagerImpl {},
Arc::new(ObjectStore::new_in_memory(InMemory::new())),
));
test_storage.set_id(NonZeroU32::new(1).unwrap()).unwrap();
test_storage
let app_server = Arc::new(AppServer::new(ConnectionManagerImpl {}, config()));
app_server.set_id(NonZeroU32::new(1).unwrap()).unwrap();
app_server
.create_database(
DatabaseRules::new(DatabaseName::new("MyOrg_MyBucket").unwrap()),
test_storage.require_id().unwrap(),
Arc::clone(&test_storage.store),
app_server.require_id().unwrap(),
)
.await
.unwrap();
let server_url = test_server(Arc::clone(&test_storage));
let server_url = test_server(Arc::clone(&app_server));
let client = Client::new();
let lp_data = "h2o_temperature,location=santa_monica,state=CA surface_degrees=65.2,bottom_degrees=50.4 1568756160";
@ -1049,7 +1029,7 @@ mod tests {
check_response("gzip_write", response, StatusCode::NO_CONTENT, "").await;
// Check that the data got into the right bucket
let test_db = test_storage
let test_db = app_server
.db(&DatabaseName::new("MyOrg_MyBucket").unwrap())
.expect("Database exists");
@ -1067,20 +1047,16 @@ mod tests {
#[tokio::test]
async fn write_to_invalid_database() {
let test_storage = Arc::new(AppServer::new(
ConnectionManagerImpl {},
Arc::new(ObjectStore::new_in_memory(InMemory::new())),
));
test_storage.set_id(NonZeroU32::new(1).unwrap()).unwrap();
test_storage
let app_server = Arc::new(AppServer::new(ConnectionManagerImpl {}, config()));
app_server.set_id(NonZeroU32::new(1).unwrap()).unwrap();
app_server
.create_database(
DatabaseRules::new(DatabaseName::new("MyOrg_MyBucket").unwrap()),
test_storage.require_id().unwrap(),
Arc::clone(&test_storage.store),
app_server.require_id().unwrap(),
)
.await
.unwrap();
let server_url = test_server(Arc::clone(&test_storage));
let server_url = test_server(Arc::clone(&app_server));
let client = Client::new();
@ -1103,115 +1079,6 @@ mod tests {
.await;
}
#[tokio::test]
async fn get_wal_meta() {
let server = Arc::new(AppServer::new(
ConnectionManagerImpl {},
Arc::new(ObjectStore::new_in_memory(InMemory::new())),
));
server.set_id(NonZeroU32::new(1).unwrap()).unwrap();
let server_url = test_server(Arc::clone(&server));
let database_name = "foo_bar";
let rules = DatabaseRules {
name: DatabaseName::new(database_name).unwrap(),
partition_template: Default::default(),
wal_buffer_config: Some(WalBufferConfig {
buffer_size: 500,
segment_size: 10,
buffer_rollover: WalBufferRollover::ReturnError,
store_segments: true,
close_segment_after: None,
}),
lifecycle_rules: Default::default(),
shard_config: None,
};
server
.create_database(
rules,
server.require_id().unwrap(),
Arc::clone(&server.store),
)
.await
.unwrap();
let base_url = format!(
"{}/iox/api/v1/databases/{}/wal/meta",
server_url, database_name
);
let client = Client::new();
let r1: WalMetadataResponse = check_json_response(&client, &base_url, StatusCode::OK).await;
let lines: std::result::Result<Vec<_>, _> = influxdb_line_protocol::parse_lines(
"cpu,host=A,region=west usage_system=64i 1590488773254420000",
)
.collect();
server
.write_lines(database_name, &lines.unwrap())
.await
.unwrap();
let r2: WalMetadataResponse = check_json_response(&client, &base_url, StatusCode::OK).await;
let limit_1 = serde_urlencoded::to_string(&WalMetadataQuery {
limit: Some(1),
newer_than: None,
offset: None,
})
.unwrap();
let limit_url = format!("{}?{}", base_url, limit_1);
let r3: WalMetadataResponse =
check_json_response(&client, &limit_url, StatusCode::OK).await;
let limit_future = serde_urlencoded::to_string(&WalMetadataQuery {
limit: None,
offset: None,
newer_than: Some(chrono::Utc::now() + chrono::Duration::seconds(5)),
})
.unwrap();
let future_url = format!("{}?{}", base_url, limit_future);
let r4: WalMetadataResponse =
check_json_response(&client, &future_url, StatusCode::OK).await;
// No data written yet - expect no results
assert_eq!(r1.segments.len(), 1);
assert_eq!(r1.segments[0].size, 0);
assert_eq!(r1.segments[0].writers.len(), 0);
// The WAL segment size is less than the line size
// We therefore expect an open and a closed segment in that order
// With the closed segment containing the written data
// And the open segment containing no data
assert_eq!(r2.segments.len(), 2);
assert_eq!(r2.segments[0].size, 0);
assert!(r2.segments[0].created_at >= r2.segments[1].created_at);
assert!(r2.segments[1].persisted.is_none());
assert_eq!(r2.segments[1].size, 368);
assert_eq!(r2.segments[1].writers.len(), 1);
assert_eq!(
r2.segments[1].writers.values().next().unwrap(),
&WriterSummary {
start_sequence: 1,
end_sequence: 1,
missing_sequence: false
}
);
// Query limited to a single segment - expect only the most recent segment
assert_eq!(r3.segments.len(), 1);
assert_eq!(r3.segments[0], r2.segments[0]);
// Requesting segments from future - expect no results
assert_eq!(r4.segments.len(), 0);
}
fn get_content_type(response: &Result<Response, reqwest::Error>) -> String {
if let Ok(response) = response {
response
@ -1250,6 +1117,7 @@ mod tests {
}
}
#[allow(dead_code)]
async fn check_json_response<T: DeserializeOwned + Eq + Debug>(
client: &Client,
url: &str,
@ -1291,9 +1159,9 @@ mod tests {
/// Run the specified SQL query and return formatted results as a string
async fn run_query(db: Arc<Db>, query: &str) -> Vec<RecordBatch> {
let planner = SQLQueryPlanner::default();
let executor = Executor::new();
let physical_plan = planner.query(db, query, &executor).await.unwrap();
let executor = db.executor();
let physical_plan = planner.query(db, query, executor.as_ref()).unwrap();
collect(physical_plan).await.unwrap()
executor.collect(physical_plan).await.unwrap()
}
}
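`run_query` above now plans the SQL and then hands the physical plan to the database's executor rather than calling DataFusion's free `collect`, so execution runs on the dedicated worker pool. A schematic sketch of that split, using hypothetical stand-in types rather than IOx's actual `SQLQueryPlanner` / `Executor` signatures:

use std::sync::Arc;

struct PhysicalPlan;

struct Planner;
impl Planner {
    // Planning is comparatively cheap and stays on the calling task.
    fn query(&self, _sql: &str) -> Result<Arc<PhysicalPlan>, String> {
        Ok(Arc::new(PhysicalPlan))
    }
}

struct Executor;
impl Executor {
    // Execution is driven by the executor, which owns the query thread pool.
    async fn collect(&self, _plan: Arc<PhysicalPlan>) -> Result<Vec<String>, String> {
        Ok(vec![])
    }
}

async fn run_query(planner: &Planner, executor: &Executor, sql: &str) -> Result<Vec<String>, String> {
    let plan = planner.query(sql)?;
    executor.collect(plan).await
}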

View File

@ -23,6 +23,11 @@ pub fn default_server_error_handler(error: server::Error) -> tonic::Status {
description: source.to_string(),
}
.into(),
Error::DecodingEntry { source } => FieldViolation {
field: "entry".into(),
description: source.to_string(),
}
.into(),
error => {
error!(?error, "Unexpected error");
InternalError {}.into()
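The new arm maps an entry-decoding failure to a field-level violation instead of the catch-all internal error. A rough equivalent using plain `tonic::Status`; the `ServerError` enum here is a stand-in for the real `server::Error`:

use tonic::Status;

#[derive(Debug)]
enum ServerError {
    DecodingEntry { source: String },
    Other(String),
}

fn error_to_status(error: ServerError) -> Status {
    match error {
        ServerError::DecodingEntry { source } => {
            // Field-level problem: tell the caller which field was bad.
            Status::invalid_argument(format!("entry: {}", source))
        }
        error => {
            // Anything unexpected is logged and surfaced as INTERNAL.
            eprintln!("Unexpected error: {:?}", error);
            Status::internal("internal error")
        }
    }
}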

View File

@ -1,3 +1,4 @@
//! Implements the native gRPC IOx query API using Arrow Flight
use std::{pin::Pin, sync::Arc};
use futures::Stream;
@ -19,7 +20,6 @@ use arrow_deps::{
Action, ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo,
HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, Ticket,
},
datafusion::physical_plan::collect,
};
use data_types::{DatabaseName, DatabaseNameError};
use query::{frontend::sql::SQLQueryPlanner, DatabaseStore};
@ -157,15 +157,17 @@ where
let planner = SQLQueryPlanner::default();
let executor = self.server.executor();
let physical_plan = planner
.query(db, &read_info.sql_query, &executor)
.await
.context(PlanningSQLQuery {
query: &read_info.sql_query,
})?;
let physical_plan =
planner
.query(db, &read_info.sql_query, &executor)
.context(PlanningSQLQuery {
query: &read_info.sql_query,
})?;
// execute the query
let results = collect(Arc::clone(&physical_plan))
let results = executor
.new_context()
.collect(Arc::clone(&physical_plan))
.await
.map_err(|e| Box::new(e) as _)
.context(Query {

View File

@ -126,13 +126,8 @@ where
Some(id) => id,
None => return Err(NotFound::default().into()),
};
let object_store = Arc::clone(&self.server.store);
match self
.server
.create_database(rules, server_id, object_store)
.await
{
match self.server.create_database(rules, server_id).await {
Ok(_) => Ok(Response::new(CreateDatabaseResponse {})),
Err(Error::DatabaseAlreadyExists { db_name }) => {
return Err(AlreadyExists {

View File

@ -714,7 +714,6 @@ where
let plan = planner
.table_names(db.as_ref(), predicate)
.await
.map_err(|e| Box::new(e) as _)
.context(ListingTables { db_name })?;
let executor = db_store.executor();
@ -765,7 +764,6 @@ where
let tag_key_plan = planner
.tag_keys(db.as_ref(), predicate)
.await
.map_err(|e| Box::new(e) as _)
.context(ListingColumns {
db_name: db_name.as_str(),
@ -825,7 +823,6 @@ where
let tag_value_plan = planner
.tag_values(db.as_ref(), tag_name, predicate)
.await
.map_err(|e| Box::new(e) as _)
.context(ListingTagValues { db_name, tag_name })?;
@ -882,7 +879,6 @@ where
let series_plan = planner
.read_filter(db.as_ref(), predicate)
.await
.map_err(|e| Box::new(e) as _)
.context(PlanningFilteringSeries { db_name })?;
@ -968,14 +964,10 @@ where
let grouped_series_set_plan = match gby_agg {
GroupByAndAggregate::Columns { agg, group_columns } => {
planner
.read_group(db.as_ref(), predicate, agg, &group_columns)
.await
planner.read_group(db.as_ref(), predicate, agg, &group_columns)
}
GroupByAndAggregate::Window { agg, every, offset } => {
planner
.read_window_aggregate(db.as_ref(), predicate, agg, every, offset)
.await
planner.read_window_aggregate(db.as_ref(), predicate, agg, every, offset)
}
};
let grouped_series_set_plan = grouped_series_set_plan
@ -1039,7 +1031,6 @@ where
let field_list_plan = planner
.field_columns(db.as_ref(), predicate)
.await
.map_err(|e| Box::new(e) as _)
.context(ListingFields { db_name })?;

View File

@ -47,6 +47,23 @@ where
let lines_written = lp_line_count as u64;
Ok(Response::new(WriteResponse { lines_written }))
}
async fn write_entry(
&self,
request: tonic::Request<WriteEntryRequest>,
) -> Result<tonic::Response<WriteEntryResponse>, tonic::Status> {
let request = request.into_inner();
if request.entry.is_empty() {
return Err(FieldViolation::required("entry").into());
}
self.server
.write_entry(&request.db_name, request.entry)
.await
.map_err(default_server_error_handler)?;
Ok(Response::new(WriteEntryResponse {}))
}
}
/// Instantiate the write service
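The new `write_entry` handler rejects an empty `entry` before forwarding anything to the server. A minimal sketch of that validation pattern in plain tonic, with hypothetical stand-ins for the generated `WriteEntryRequest` / `WriteEntryResponse` messages:

use tonic::{Request, Response, Status};

struct WriteEntryRequest {
    db_name: String,
    entry: Vec<u8>,
}
struct WriteEntryResponse {}

async fn write_entry(
    request: Request<WriteEntryRequest>,
) -> Result<Response<WriteEntryResponse>, Status> {
    let request = request.into_inner();
    if request.entry.is_empty() {
        // Same intent as FieldViolation::required("entry"): fail fast with a
        // field-level error instead of handing bad input to the server.
        return Err(Status::invalid_argument("entry is required"));
    }
    // ... pass request.db_name and request.entry on to the server here ...
    let _ = request.db_name;
    Ok(Response::new(WriteEntryResponse {}))
}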

View File

@ -277,7 +277,7 @@ async fn test_chunk_get() {
partition_key: "cpu".into(),
id: 0,
storage: ChunkStorage::OpenMutableBuffer as i32,
estimated_bytes: 145,
estimated_bytes: 137,
time_of_first_write: None,
time_of_last_write: None,
time_closing: None,
@ -286,7 +286,7 @@ async fn test_chunk_get() {
partition_key: "disk".into(),
id: 0,
storage: ChunkStorage::OpenMutableBuffer as i32,
estimated_bytes: 107,
estimated_bytes: 103,
time_of_first_write: None,
time_of_last_write: None,
time_closing: None,
@ -452,7 +452,7 @@ async fn test_list_partition_chunks() {
partition_key: "cpu".into(),
id: 0,
storage: ChunkStorage::OpenMutableBuffer as i32,
estimated_bytes: 145,
estimated_bytes: 137,
time_of_first_write: None,
time_of_last_write: None,
time_closing: None,

View File

@ -191,7 +191,7 @@ async fn test_get_chunks() {
.and(predicate::str::contains(
r#""storage": "OpenMutableBuffer","#,
))
.and(predicate::str::contains(r#""estimated_bytes": 145"#))
.and(predicate::str::contains(r#""estimated_bytes": 137"#))
// Check for a non-empty timestamp, such as
// "time_of_first_write": "2021-03-30T17:11:10.723866Z",
.and(predicate::str::contains(r#""time_of_first_write": "20"#));

Binary file not shown.

View File

@ -7,7 +7,7 @@ description = "Utilities for tracking resource utilisation within IOx"
[dependencies]
futures = "0.3.7"
futures = "0.3"
hashbrown = "0.9.1"
observability_deps = { path = "../observability_deps" }
pin-project = "1.0"

View File

@ -7,7 +7,7 @@ edition = "2018"
[dependencies] # In alphabetical order
byteorder = "1.3.4"
crc32fast = "1.2.0"
futures = "0.3.4"
futures = "0.3"
itertools = "0.9.0"
once_cell = { version = "1.4.0", features = ["parking_lot"] }
regex = "1.3.7"