Merge branch 'main' into cn/remove-obsolete-docs-infra

pull/24376/head
kodiakhq[bot] 2023-04-14 17:14:45 +00:00 committed by GitHub
commit bc3b69ef3f
179 changed files with 5466 additions and 7991 deletions

Cargo.lock (generated)

File diff suppressed because it is too large.


@ -81,7 +81,6 @@ members = [
"trogging",
"wal",
"workspace-hack",
"write_summary",
]
default-members = ["influxdb_iox"]
@ -115,12 +114,18 @@ edition = "2021"
license = "MIT OR Apache-2.0"
[workspace.dependencies]
arrow = { version = "36.0.0" }
arrow-flight = { version = "36.0.0" }
datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev="b87871fdd1f4ce64201eb1f7c79a0547627f37e9", default-features = false }
datafusion-proto = { git = "https://github.com/apache/arrow-datafusion.git", rev="b87871fdd1f4ce64201eb1f7c79a0547627f37e9" }
arrow = { version = "37.0.0" }
arrow-flight = { version = "37.0.0" }
datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev="6e819d6c2b9280198c67fa16df3e54c79ce46ca2", default-features = false }
datafusion-proto = { git = "https://github.com/apache/arrow-datafusion.git", rev="6e819d6c2b9280198c67fa16df3e54c79ce46ca2" }
hashbrown = { version = "0.13.2" }
parquet = { version = "36.0.0" }
parquet = { version = "37.0.0" }
tonic = { version = "0.9.1", features = ["tls", "tls-webpki-roots"] }
tonic-build = { version = "0.9.1" }
tonic-health = { version = "0.9.1" }
tonic-reflection = { version = "0.9.1" }
# This profile optimizes for runtime performance and small binary size at the expense of longer
# build times. It's most suitable for final release builds.


@ -36,20 +36,17 @@ RUN \
du -cshx /usr/local/rustup /usr/local/cargo/registry /usr/local/cargo/git /influxdb_iox/target
FROM debian:bullseye-slim
RUN apt update \
&& apt install --yes ca-certificates gettext-base libssl1.1 --no-install-recommends \
&& rm -rf /var/lib/{apt,dpkg,cache,log}
RUN groupadd --gid 1500 iox \
&& rm -rf /var/lib/{apt,dpkg,cache,log} \
&& groupadd --gid 1500 iox \
&& useradd --uid 1500 --gid iox --shell /bin/bash --create-home iox
USER iox
RUN mkdir ~/.influxdb_iox
RUN ls -la ~/.influxdb_iox
ARG PACKAGE=influxdb_iox
ENV PACKAGE=$PACKAGE
@ -57,7 +54,6 @@ ENV PACKAGE=$PACKAGE
COPY --from=build "/root/$PACKAGE" "/usr/bin/$PACKAGE"
COPY docker/entrypoint.sh /usr/bin/entrypoint.sh
EXPOSE 8080 8082
ENTRYPOINT ["/usr/bin/entrypoint.sh"]


@ -153,7 +153,7 @@ impl StringDictionary<i32> {
))
.len(keys.len())
.add_buffer(keys.collect())
.add_child_data(self.storage.to_arrow(dictionary_nulls).data().clone())
.add_child_data(self.storage.to_arrow(dictionary_nulls).into_data())
.nulls(nulls)
// TODO consider skipping the validation checks by using
// `build_unchecked()`
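
The `.data().clone()` to `.into_data()` change above follows the newer arrow API used after the 36 to 37 bump in this commit, where arrays expose `to_data()` (borrowing) and `into_data()` (consuming) instead of cloning through the old `data()` accessor. A minimal, illustrative sketch of the two calls (not part of the diff):

```rust
use arrow::array::{Array, ArrayData, StringArray};

fn main() {
    let array = StringArray::from(vec![Some("a"), None, Some("b")]);

    // Borrowing conversion: `array` stays usable afterwards.
    let borrowed: ArrayData = array.to_data();
    assert_eq!(borrowed.len(), 3);

    // Consuming conversion: takes ownership, no extra clone of the buffers.
    let owned: ArrayData = array.into_data();
    assert_eq!(owned.len(), 3);
}
```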


@ -1,22 +1,24 @@
use std::sync::Arc;
use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef};
/// Prepare an arrow Schema for transport over the Arrow Flight protocol
///
/// Converts dictionary types to underlying types due to <https://github.com/apache/arrow-rs/issues/3389>
pub fn prepare_schema_for_flight(schema: SchemaRef) -> SchemaRef {
let fields = schema
let fields: Fields = schema
.fields()
.iter()
.map(|field| match field.data_type() {
DataType::Dictionary(_, value_type) => Field::new(
field.name(),
value_type.as_ref().clone(),
field.is_nullable(),
)
.with_metadata(field.metadata().clone()),
_ => field.clone(),
DataType::Dictionary(_, value_type) => Arc::new(
Field::new(
field.name(),
value_type.as_ref().clone(),
field.is_nullable(),
)
.with_metadata(field.metadata().clone()),
),
_ => Arc::clone(field),
})
.collect();
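
A hypothetical usage sketch of the function above (assuming `prepare_schema_for_flight` is in scope; not part of the diff): a dictionary-encoded field is flattened to its value type before the schema is sent over Flight, while other fields pass through untouched.

```rust
use std::sync::Arc;
use arrow::datatypes::{DataType, Field, Schema};

fn main() {
    let schema = Arc::new(Schema::new(vec![
        Field::new(
            "tag",
            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
            true,
        ),
        Field::new("value", DataType::Float64, true),
    ]));

    // Dictionary fields come back as their value type; others are unchanged.
    let flight_schema = prepare_schema_for_flight(schema);
    assert_eq!(flight_schema.field(0).data_type(), &DataType::Utf8);
    assert_eq!(flight_schema.field(1).data_type(), &DataType::Float64);
}
```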


@ -288,9 +288,9 @@ mod tests {
Box::new(DataType::Utf8),
))
.len(keys.len())
.add_buffer(keys.data().buffers()[0].clone())
.add_buffer(keys.to_data().buffers()[0].clone())
.nulls(keys.nulls().cloned())
.add_child_data(values.data().clone())
.add_child_data(values.into_data())
.build()
.unwrap();


@ -193,7 +193,7 @@ pub fn equalize_batch_schemas(batches: Vec<RecordBatch>) -> Result<Vec<RecordBat
/// `32/51/216/13452/1d325760-2b20-48de-ab48-2267b034133d.parquet`
///
/// matches `1d325760-2b20-48de-ab48-2267b034133d`
static REGEX_UUID: Lazy<Regex> = Lazy::new(|| {
pub static REGEX_UUID: Lazy<Regex> = Lazy::new(|| {
Regex::new("[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}").expect("UUID regex")
});
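
As a small, self-contained illustration of how a UUID regex like `REGEX_UUID` above can normalize object-store paths in snapshot output (the placeholder text is assumed, mirroring the example path in the doc comment):

```rust
use once_cell::sync::Lazy;
use regex::Regex;

static REGEX_UUID: Lazy<Regex> = Lazy::new(|| {
    Regex::new("[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}").expect("UUID regex")
});

fn main() {
    let path = "32/51/216/13452/1d325760-2b20-48de-ab48-2267b034133d.parquet";
    // Replace the volatile UUID with a stable placeholder so snapshots compare equal.
    let normalized = REGEX_UUID.replace_all(path, "<uuid>");
    assert_eq!(normalized, "32/51/216/13452/<uuid>.parquet");
}
```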
@ -249,6 +249,11 @@ fn normalize_for_variable_width(s: Cow<'_, str>) -> String {
REGEX_COL.replace_all(&s, " |").to_string()
}
pub fn strip_table_lines(s: Cow<'_, str>) -> String {
let s = REGEX_LINESEP.replace_all(&s, "----------");
REGEX_COL.replace_all(&s, "").to_string()
}
fn normalize_time_ops(s: &str) -> String {
REGEX_TIME_OP
.replace_all(s, |c: &Captures<'_>| {
@ -276,6 +281,9 @@ pub struct Normalizer {
/// if true, normalize filter predicates for explain plans
/// `FilterExec: <REDACTED>`
pub normalized_filters: bool,
/// if `true`, render tables without borders.
pub no_table_borders: bool,
}
impl Normalizer {
@ -403,5 +411,8 @@ impl Normalizer {
if self.normalized_filters {
output.push("-- Results After Normalizing Filters".into())
}
if self.no_table_borders {
output.push("-- Results After No Table Borders".into())
}
}
}


@ -16,5 +16,4 @@ workspace-hack = { version = "0.1", path = "../workspace-hack" }
# crates.io dependencies in alphabetical order.
async-trait = "0.1"
snafu = "0.7"
tonic = "0.8"
tonic = { workspace = true }


@ -18,7 +18,7 @@ metric = { path = "../metric" }
object_store = "0.5.6"
observability_deps = { path = "../observability_deps" }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.95"
serde_json = "1.0.96"
snafu = "0.7"
tempfile = "3.5.0"
trace = { path = "../trace" }


@ -1,37 +0,0 @@
//! CLI config for the ingest_replica
use crate::ingester_address::IngesterAddress;
/// CLI config for the ingest_replica
#[derive(Debug, Clone, clap::Parser)]
#[allow(missing_copy_implementations)]
pub struct IngestReplicaConfig {
/// gRPC address for the replica to talk with the ingesters. For
/// example:
///
/// "http://127.0.0.1:8083"
///
/// or
///
/// "http://10.10.10.1:8083,http://10.10.10.2:8083"
///
/// for multiple addresses.
#[clap(
long = "ingester-addresses",
env = "INFLUXDB_IOX_INGESTER_ADDRESSES",
required = true,
num_args=1..,
value_delimiter = ','
)]
pub ingester_addresses: Vec<IngesterAddress>,
/// Sets how many queries the replica will handle simultaneously before
/// rejecting further incoming requests.
#[clap(
long = "concurrent-query-limit",
env = "INFLUXDB_IOX_CONCURRENT_QUERY_LIMIT",
default_value = "200",
action
)]
pub concurrent_query_limit: usize,
}


@ -16,7 +16,6 @@ pub mod authz;
pub mod catalog_dsn;
pub mod compactor2;
pub mod garbage_collector;
pub mod ingest_replica;
pub mod ingester2;
pub mod ingester_address;
pub mod object_store;


@ -1,50 +1,7 @@
//! Querier-related configs.
use crate::ingester_address::IngesterAddress;
use data_types::{IngesterMapping, ShardIndex};
use serde::Deserialize;
use snafu::{ResultExt, Snafu};
use std::{
collections::HashMap, fs, io, num::NonZeroUsize, path::PathBuf, str::FromStr, sync::Arc,
};
#[derive(Debug, Snafu)]
#[allow(missing_docs)]
pub enum Error {
#[snafu(display("Could not read shard to ingester file `{}`: {source}", file.display()))]
ShardToIngesterFileReading { source: io::Error, file: PathBuf },
#[snafu(display("Could not deserialize JSON from ingester config: {source}"))]
ShardToIngesterDeserializing { source: serde_json::Error },
#[snafu(display(
"Specifying `\"ignoreAll\": true` requires that both the `ingesters` and \
`shards` configurations are empty. `ingesters`: `{:#?}`, `shards`: `{:#?}`",
ingesters,
shards,
))]
IgnoreAllRequiresEmptyConfig {
ingesters: HashMap<Arc<str>, Arc<IngesterConfig>>,
shards: HashMap<ShardIndex, ShardConfig>,
},
#[snafu(display(
"Ingester `{name}` must either set the `addr` to a non-empty value or set `ignore` to true"
))]
IngesterAddrRequired { name: Arc<str> },
#[snafu(display(
"Could not find ingester `{name}` specified for shard index `{shard_index}`"
))]
IngesterNotFound {
shard_index: ShardIndex,
name: Arc<str>,
},
#[snafu(context(false))]
IngesterAddress {
source: crate::ingester_address::Error,
},
}
use std::num::NonZeroUsize;
/// CLI config for querier configuration
#[derive(Debug, Clone, PartialEq, Eq, clap::Parser)]
@ -71,144 +28,6 @@ pub struct QuerierConfig {
)]
pub exec_mem_pool_bytes: usize,
/// Path to a JSON file containing a Shard index to ingesters gRPC mapping. For example:
///
/// ```json
/// {
/// // Flag to ignore all ingesters and only query persisted data. Useful for development
/// // or creating "cold data only" clusters.
/// //
/// // If this is set to `true`, having non-empty `ingesters` or `shards` is a startup
/// // error.
/// //
/// // default: false
/// "ignoreAll": false,
///
/// // Mapping of ingester name to config.
/// //
/// // default: {}
/// "ingesters": {
/// "i1": {
/// // Ingester address as URL.
/// //
/// // If this is `null` but `ignore` is false, it is an error.
/// //
/// // default: null
/// "addr": "http://ingester-1:1234"
/// },
/// "i2": {
/// // Flag to ignore this ingester at query time and not contact it.
/// //
/// // default: false
/// "ignore": true
/// }
/// },
///
/// // Mapping of shard indexes (as strings) to ingester names. Queries to shards that do
/// // not appear in this mapping will return an error. Using an ingester name in the
/// // `shards` mapping that does not appear in the `ingesters` mapping is a startup error.
/// //
/// // default: {}
/// "shards": {
/// "1": {
/// // Name of an ingester from the `ingester` mapping.
/// //
/// // If this is `null`, queries to this shard will error.
/// //
/// // default: null
/// "ingester": "i1"
/// },
/// "2": {
/// "ingester": "i1"
/// },
/// "3": {
/// "ingester": "i2"
/// },
/// "5": {
/// // Flag to not fetch data from any ingester for queries to this shard.
/// //
/// // default: false
/// "ignore": true
/// }
/// }
/// }
/// ```
#[clap(
long = "shard-to-ingesters-file",
env = "INFLUXDB_IOX_SHARD_TO_INGESTERS_FILE",
action
)]
pub shard_to_ingesters_file: Option<PathBuf>,
/// JSON containing a Shard index to ingesters gRPC mapping. For example:
///
/// ```json
/// {
/// // Flag to ignore all ingesters and only query persisted data. Useful for development
/// // or creating "cold data only" clusters.
/// //
/// // If this is set to `true`, having non-empty `ingesters` or `shards` is a startup
/// // error.
/// //
/// // default: false
/// "ignoreAll": false,
///
/// // Mapping of ingester name to config.
/// //
/// // default: {}
/// "ingesters": {
/// "i1": {
/// // Ingester address as URL.
/// //
/// // If this is `null` but `ignore` is false, it is an error.
/// //
/// // default: null
/// "addr": "http://ingester-1:1234"
/// },
/// "i2": {
/// // Flag to ignore this ingester at query time and not contact it.
/// //
/// // default: false
/// "ignore": true
/// }
/// },
///
/// // Mapping of shard indexes (as strings) to ingester names. Queries to shards that do
/// // not appear in this mapping will return an error. Using an ingester name in the
/// // `shards` mapping that does not appear in the `ingesters` mapping is a startup error.
/// //
/// // default: {}
/// "shards": {
/// "1": {
/// // Name of an ingester from the `ingester` mapping.
/// //
/// // If this is `null`, queries to this shard will error.
/// //
/// // default: null
/// "ingester": "i1"
/// },
/// "2": {
/// "ingester": "i1"
/// },
/// "3": {
/// "ingester": "i2"
/// },
/// "5": {
/// // Flag to not fetch data from any ingester for queries to this shard.
/// //
/// // default: false
/// "ignore": true
/// }
/// }
/// }
/// ```
#[clap(
long = "shard-to-ingesters",
env = "INFLUXDB_IOX_SHARD_TO_INGESTERS",
action
)]
pub shard_to_ingesters: Option<String>,
/// gRPC address for the router to talk with the ingesters. For
/// example:
///
@ -219,8 +38,14 @@ pub struct QuerierConfig {
/// "http://10.10.10.1:8083,http://10.10.10.2:8083"
///
/// for multiple addresses.
#[clap(long = "ingester-addresses", env = "INFLUXDB_IOX_INGESTER_ADDRESSES", num_args=1.., value_delimiter = ',')]
pub ingester_addresses: Vec<String>,
#[clap(
long = "ingester-addresses",
env = "INFLUXDB_IOX_INGESTER_ADDRESSES",
required = false,
num_args = 0..,
value_delimiter = ','
)]
pub ingester_addresses: Vec<IngesterAddress>,
/// Size of the RAM cache used to store catalog metadata information in bytes.
#[clap(
@ -256,11 +81,12 @@ pub struct QuerierConfig {
/// returning results that do not include unpersisted data and enter "circuit breaker mode"
/// to avoid continually retrying the failing connection on subsequent queries.
///
/// If circuits are open, the querier will NOT contact the ingester and no unpersisted data will be presented to the user.
/// If circuits are open, the querier will NOT contact the ingester and no unpersisted data
/// will be presented to the user.
///
/// Circuits will switch to "half open" after some jittered timeout and the querier will try to use the ingester in
/// question again. If this succeeds, we are back to normal, otherwise it will back off exponentially before trying
/// again (and again ...).
/// Circuits will switch to "half open" after some jittered timeout and the querier will try to
/// use the ingester in question again. If this succeeds, we are back to normal, otherwise it
/// will back off exponentially before trying again (and again ...).
///
/// In a production environment the `ingester_circuit_state` metric should be monitored.
#[clap(
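
The circuit-breaker behaviour described in the doc comment above boils down to a closed / open / half-open state machine. The sketch below is purely illustrative; it is not the querier's implementation and omits the jitter and the `ingester_circuit_state` metric mentioned above.

```rust
use std::time::{Duration, Instant};

enum CircuitState {
    /// Ingester is healthy; requests flow normally.
    Closed,
    /// Ingester is considered down; skip it until `until`.
    Open { until: Instant, backoff: Duration },
    /// Back-off elapsed; the next request is a probe.
    HalfOpen { backoff: Duration },
}

struct CircuitBreaker {
    state: CircuitState,
    base_backoff: Duration,
}

impl CircuitBreaker {
    fn new(base_backoff: Duration) -> Self {
        Self {
            state: CircuitState::Closed,
            base_backoff,
        }
    }

    /// Should the querier contact the ingester for this query?
    fn allow_request(&mut self) -> bool {
        match self.state {
            CircuitState::Closed | CircuitState::HalfOpen { .. } => true,
            CircuitState::Open { until, backoff } => {
                if Instant::now() >= until {
                    // Timeout elapsed: let one probe through ("half open").
                    self.state = CircuitState::HalfOpen { backoff };
                    true
                } else {
                    // Circuit open: serve persisted data only.
                    false
                }
            }
        }
    }

    fn record_success(&mut self) {
        self.state = CircuitState::Closed;
    }

    fn record_failure(&mut self) {
        // Exponential back-off on repeated failures; a real implementation
        // would also add jitter before re-probing.
        let backoff = match self.state {
            CircuitState::HalfOpen { backoff } => backoff * 2,
            _ => self.base_backoff,
        };
        self.state = CircuitState::Open {
            until: Instant::now() + backoff,
            backoff,
        };
    }
}

fn main() {
    let mut breaker = CircuitBreaker::new(Duration::from_millis(100));
    assert!(breaker.allow_request());
    breaker.record_failure();
    assert!(!breaker.allow_request()); // open: ingester skipped
    breaker.record_success();
    assert!(breaker.allow_request()); // closed again
}
```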
@ -279,46 +105,6 @@ impl QuerierConfig {
self.num_query_threads
}
/// Return the querier config's ingester addresses. If `--shard-to-ingesters-file` is used to
/// specify a JSON file containing shard to ingester address mappings, this returns `Err` if
/// there are any problems reading, deserializing, or interpreting the file.
// When we have switched to using the RPC write path only, this method can be changed to be
// infallible as clap will handle failure to parse the list of strings.
//
// Switching into the RPC write path mode requires *both* the `INFLUXDB_IOX_RPC_MODE`
// environment variable to be specified *and* `--ingester-addresses` to be set in order to
// switch. Setting `INFLUXDB_IOX_RPC_MODE` and shard-to-ingesters mapping, or not setting
// `INFLUXDB_IOX_RPC_MODE` and setting ingester addresses, will panic.
pub fn ingester_addresses(&self) -> Result<IngesterAddresses, Error> {
if let Some(file) = &self.shard_to_ingesters_file {
let contents =
fs::read_to_string(file).context(ShardToIngesterFileReadingSnafu { file })?;
let map = deserialize_shard_ingester_map(&contents)?;
if map.is_empty() {
Ok(IngesterAddresses::None)
} else {
Ok(IngesterAddresses::ByShardIndex(map))
}
} else if let Some(contents) = &self.shard_to_ingesters {
let map = deserialize_shard_ingester_map(contents)?;
if map.is_empty() {
Ok(IngesterAddresses::None)
} else {
Ok(IngesterAddresses::ByShardIndex(map))
}
} else if !self.ingester_addresses.is_empty() {
Ok(IngesterAddresses::List(
self.ingester_addresses
.iter()
.map(|addr| IngesterAddress::from_str(addr))
.collect::<Result<Vec<_>, _>>()?,
))
} else {
Ok(IngesterAddresses::None)
}
}
/// Size of the RAM cache pool for metadata in bytes.
pub fn ram_pool_metadata_bytes(&self) -> usize {
self.ram_pool_metadata_bytes
@ -335,131 +121,18 @@ impl QuerierConfig {
}
}
fn deserialize_shard_ingester_map(
contents: &str,
) -> Result<HashMap<ShardIndex, IngesterMapping>, Error> {
let ingesters_config: IngestersConfig =
serde_json::from_str(contents).context(ShardToIngesterDeserializingSnafu)?;
if ingesters_config.ignore_all
&& (!ingesters_config.ingesters.is_empty() || !ingesters_config.shards.is_empty())
{
return IgnoreAllRequiresEmptyConfigSnafu {
ingesters: ingesters_config.ingesters,
shards: ingesters_config.shards,
}
.fail();
}
let mut ingester_mapping_by_name = HashMap::new();
for (name, config) in &ingesters_config.ingesters {
match (config.ignore, config.addr.as_ref()) {
(true, _) => {
ingester_mapping_by_name.insert(name, IngesterMapping::Ignore);
}
(false, None) => {
return IngesterAddrRequiredSnafu {
name: Arc::clone(name),
}
.fail();
}
(false, Some(addr)) if addr.is_empty() => {
return IngesterAddrRequiredSnafu {
name: Arc::clone(name),
}
.fail();
}
(false, Some(addr)) => {
ingester_mapping_by_name.insert(name, IngesterMapping::Addr(Arc::clone(addr)));
}
}
}
let mut map = HashMap::new();
for (shard_index, shard_config) in ingesters_config.shards {
if shard_config.ignore {
map.insert(shard_index, IngesterMapping::Ignore);
continue;
}
match shard_config.ingester {
Some(ingester) => match ingester_mapping_by_name.get(&ingester) {
Some(ingester_mapping) => {
map.insert(shard_index, ingester_mapping.clone());
}
None => {
return IngesterNotFoundSnafu {
name: Arc::clone(&ingester),
shard_index,
}
.fail();
}
},
None => {
map.insert(shard_index, IngesterMapping::NotMapped);
}
}
}
Ok(map)
}
/// Ingester addresses.
#[derive(Debug, PartialEq, Eq)]
pub enum IngesterAddresses {
/// A mapping from shard index to ingesters.
ByShardIndex(HashMap<ShardIndex, IngesterMapping>),
/// A list of ingester2 addresses.
List(Vec<IngesterAddress>),
/// No connections, meaning only persisted data should be used.
None,
}
#[derive(Debug, Deserialize, Default)]
#[serde(rename_all = "camelCase")]
struct IngestersConfig {
#[serde(default)]
ignore_all: bool,
#[serde(default)]
ingesters: HashMap<Arc<str>, Arc<IngesterConfig>>,
#[serde(default)]
shards: HashMap<ShardIndex, ShardConfig>,
}
/// Ingester config.
#[derive(Debug, Deserialize)]
pub struct IngesterConfig {
addr: Option<Arc<str>>,
#[serde(default)]
ignore: bool,
}
/// Shard config.
#[derive(Debug, Deserialize)]
pub struct ShardConfig {
ingester: Option<Arc<str>>,
#[serde(default)]
ignore: bool,
}
#[cfg(test)]
mod tests {
use super::*;
use clap::Parser;
use test_helpers::assert_error;
use test_helpers::assert_contains;
#[test]
fn test_default() {
let actual = QuerierConfig::try_parse_from(["my_binary"]).unwrap();
assert_eq!(actual.num_query_threads(), None);
assert!(matches!(
actual.ingester_addresses().unwrap(),
IngesterAddresses::None,
));
assert!(actual.ingester_addresses.is_empty());
}
#[test]
@ -471,26 +144,25 @@ mod tests {
actual.num_query_threads(),
Some(NonZeroUsize::new(42).unwrap())
);
assert!(matches!(
actual.ingester_addresses().unwrap(),
IngesterAddresses::None,
));
}
#[test]
fn test_ingester_addresses_list() {
let actual = QuerierConfig::try_parse_from([
let querier = QuerierConfig::try_parse_from([
"my_binary",
"--ingester-addresses",
"http://ingester-0:8082,http://ingester-1:8082",
])
.unwrap();
let expected = IngesterAddresses::List(vec![
IngesterAddress::from_str("http://ingester-0:8082").unwrap(),
IngesterAddress::from_str("http://ingester-1:8082").unwrap(),
]);
assert_eq!(actual.ingester_addresses().unwrap(), expected);
let actual: Vec<_> = querier
.ingester_addresses
.iter()
.map(ToString::to_string)
.collect();
let expected = vec!["http://ingester-0:8082/", "http://ingester-1:8082/"];
assert_eq!(actual, expected);
}
#[test]
@ -500,285 +172,15 @@ mod tests {
"--ingester-addresses",
"\\ingester-0:8082",
])
.unwrap()
.ingester_addresses();
assert_error!(actual, Error::IngesterAddress { .. });
}
.unwrap_err()
.to_string();
#[test]
fn supply_json_value() {
let actual = QuerierConfig::try_parse_from([
"my_binary",
"--shard-to-ingesters",
r#"{
"ignoreAll": false,
"ingesters": {
"i1": {
"addr": "http://ingester-1:1234"
},
"i2": {
"ignore": true
},
"i3": {
"ignore": true,
"addr": "http://ingester-2:2345"
}
},
"shards": {
"1": {
"ingester": "i1"
},
"2": {
"ingester": "i2"
},
"5": {
"ignore": true
}
}
}"#,
])
.unwrap();
let expected = IngesterAddresses::ByShardIndex(
[
(
ShardIndex::new(1),
IngesterMapping::Addr("http://ingester-1:1234".into()),
),
(ShardIndex::new(2), IngesterMapping::Ignore),
(ShardIndex::new(5), IngesterMapping::Ignore),
]
.into_iter()
.collect(),
assert_contains!(
actual,
"error: \
invalid value '\\ingester-0:8082' \
for '--ingester-addresses [<INGESTER_ADDRESSES>...]': \
Invalid: invalid uri character"
);
assert_eq!(actual.ingester_addresses().unwrap(), expected);
}
#[test]
fn successful_deserialization() {
let contents = r#"{
"ignoreAll": false,
"ingesters": {
"i1": {
"addr": "http://ingester-1:1234"
},
"i2": {
"ignore": true
},
"i3": {
"ignore": true,
"addr": "http://ingester-2:2345"
}
},
"shards": {
"1": {
"ingester": "i1"
},
"2": {
"ingester": "i2"
},
"3": {
"ingester": "i1",
"ignore": true
},
"5": {
"ignore": true
}
}
}"#;
let map = deserialize_shard_ingester_map(contents).unwrap();
let expected = [
(
ShardIndex::new(1),
IngesterMapping::Addr("http://ingester-1:1234".into()),
),
(ShardIndex::new(2), IngesterMapping::Ignore),
(ShardIndex::new(3), IngesterMapping::Ignore),
(ShardIndex::new(5), IngesterMapping::Ignore),
]
.into_iter()
.collect();
assert_eq!(map, expected);
}
#[test]
fn unsuccessful_deserialization() {
let map = deserialize_shard_ingester_map("");
assert_error!(map, Error::ShardToIngesterDeserializing { .. });
}
#[test]
fn ignore_all_requires_empty_maps() {
let expected = HashMap::new();
let map = deserialize_shard_ingester_map(
r#"{
"ignoreAll": true
}"#,
);
assert_eq!(map.unwrap(), expected);
let map = deserialize_shard_ingester_map(
r#"{
"ignoreAll": true,
"ingesters": {},
"shards": {}
}"#,
);
assert_eq!(map.unwrap(), expected);
let map = deserialize_shard_ingester_map(
r#"{
"ignoreAll": true,
"ingesters": {
"i1": {
"addr": "http://ingester-1:1234"
}
},
"shards": {}
}"#,
);
assert_error!(map, Error::IgnoreAllRequiresEmptyConfig { .. });
let map = deserialize_shard_ingester_map(
r#"{
"ignoreAll": true,
"ingesters": {},
"shards": {
"1": {
"ingester": "i1"
}
}
}"#,
);
assert_error!(map, Error::IgnoreAllRequiresEmptyConfig { .. });
let map = deserialize_shard_ingester_map(
r#"{
"ignoreAll": true,
"ingesters": {
"i1": {
"addr": "http://ingester-1:1234"
}
},
"shards": {
"1": {
"ingester": "i1"
}
}
}"#,
);
assert_error!(map, Error::IgnoreAllRequiresEmptyConfig { .. });
}
#[test]
fn ingester_addr_must_be_specified_if_not_ignored() {
let map = deserialize_shard_ingester_map(
r#"{
"ingesters": {
"i1": {}
}
}"#,
);
assert_error!(map, Error::IngesterAddrRequired { ref name } if name.as_ref() == "i1");
let map = deserialize_shard_ingester_map(
r#"{
"ingesters": {
"i1": {
"addr": ""
}
}
}"#,
);
assert_error!(map, Error::IngesterAddrRequired { ref name } if name.as_ref() == "i1");
}
#[test]
fn ingester_must_be_found() {
let map = deserialize_shard_ingester_map(
r#"{
"ingesters": {},
"shards": {
"1": {
"ingester": "i1"
}
}
}"#,
);
assert_error!(
map,
Error::IngesterNotFound { shard_index, ref name }
if shard_index.get() == 1 && name.as_ref() == "i1"
);
let map = deserialize_shard_ingester_map(
r#"{
"ingesters": {},
"shards": {
"1": {
"ingester": ""
}
}
}"#,
);
assert_error!(
map,
Error::IngesterNotFound { shard_index, ref name }
if shard_index.get() == 1 && name.as_ref() == ""
);
}
#[test]
fn shard_to_ingester_varieties() {
let map = deserialize_shard_ingester_map(
r#"{
"ingesters": {
"i1": {
"addr": "http://ingester-1:1234"
}
},
"shards": {
"1": {
"ingester": "i1"
},
"2": {},
"3": {
"ingester": null
},
"4": {
"ignore": true
},
"5": {
"ignore": true,
"ingester": "i1"
},
"6": {
"ignore": true,
"ingester": null
}
}
}"#,
);
let expected = [
(
ShardIndex::new(1),
IngesterMapping::Addr("http://ingester-1:1234".into()),
),
(ShardIndex::new(2), IngesterMapping::NotMapped),
(ShardIndex::new(3), IngesterMapping::NotMapped),
(ShardIndex::new(4), IngesterMapping::Ignore),
(ShardIndex::new(5), IngesterMapping::Ignore),
(ShardIndex::new(6), IngesterMapping::Ignore),
]
.into_iter()
.collect();
assert_eq!(map.unwrap(), expected);
}
}


@ -10,7 +10,7 @@ license.workspace = true
http = "0.2.9"
reqwest = { version = "0.11", default-features = false, features = ["stream", "rustls-tls"] }
thiserror = "1.0.40"
tonic = { version = "0.8", features = ["tls", "tls-webpki-roots"] }
tonic = { workspace = true }
tower = "0.4"
workspace-hack = { version = "0.1", path = "../workspace-hack" }


@ -358,6 +358,16 @@ async fn execute_plan(
// Adjust concurrency based on the column count in the partition.
let permits = compute_permits(job_semaphore.total_permits(), partition_info.column_count());
info!(
partition_id = partition_info.partition_id.get(),
jobs_running = job_semaphore.holders_acquired(),
jobs_pending = job_semaphore.holders_pending(),
permits_needed = permits,
permits_acquired = job_semaphore.permits_acquired(),
permits_pending = job_semaphore.permits_pending(),
"requesting job semaphore",
);
// draw semaphore BEFORE creating the DataFusion plan and drop it directly AFTER finishing the
// DataFusion computation (but BEFORE doing any additional external IO).
//
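
The surrounding code (only partially visible in this hunk) sizes the number of permits by the partition's column count and acquires them from a shared job semaphore before building the DataFusion plan. A rough illustration of that pattern with `tokio::sync::Semaphore` is sketched below; the permit heuristic and function names are stand-ins, not IOx's actual `compute_permits` or semaphore wrapper.

```rust
use std::sync::Arc;
use tokio::sync::Semaphore;

/// Hypothetical heuristic: wider partitions get more permits, never more
/// than the semaphore holds and never fewer than one.
fn permits_for(total_permits: usize, column_count: usize) -> u32 {
    column_count.max(1).min(total_permits.max(1)) as u32
}

async fn execute_plan_with_permits(job_semaphore: Arc<Semaphore>, column_count: usize) {
    let permits = permits_for(job_semaphore.available_permits(), column_count);

    // Acquire BEFORE building the DataFusion plan so memory-hungry plans
    // cannot pile up while waiting for capacity.
    let _permit = job_semaphore
        .acquire_many(permits)
        .await
        .expect("semaphore not closed");

    // ... build and execute the plan here ...

    // `_permit` is dropped at the end of this scope, i.e. AFTER the
    // computation but BEFORE any further external IO in the caller.
}

#[tokio::main]
async fn main() {
    let semaphore = Arc::new(Semaphore::new(10));
    execute_plan_with_permits(Arc::clone(&semaphore), 4).await;
}
```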


@ -270,19 +270,6 @@ impl std::str::FromStr for ShardIndex {
}
}
/// Potential configurations of ingester connections for the querier to associate with a shard.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum IngesterMapping {
/// Deliberately not mapping this shard to an ingester. If the querier gets a query for
/// this shard, it should return an error.
NotMapped,
/// Deliberately not contacting ingesters for this shard. If the querier gets a query for
/// this shard, it should only return persisted data.
Ignore,
/// The address of the ingester to contact for this shard.
Addr(Arc<str>),
}
/// Unique ID for a `Partition`
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::Type, sqlx::FromRow)]
#[sqlx(transparent)]
@ -2300,20 +2287,6 @@ impl TimestampMinMax {
}
}
/// Specifies the status of data in the ingestion process.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ShardWriteStatus {
/// Nothing is known about this write (e.g. it refers to a shard for which we have no
/// information)
ShardUnknown,
/// The data has not yet been processed by the ingester, and thus is unreadable
Durable,
/// The data is readable, but not yet persisted
Readable,
/// The data is both readable and persisted to parquet
Persisted,
}
#[cfg(test)]
mod tests {
use std::borrow::Cow;


@ -12,7 +12,6 @@
pub mod config;
pub mod sender;
pub mod sort_exprs;
pub mod watch;
use std::sync::Arc;
@ -20,7 +19,7 @@ use std::task::{Context, Poll};
use datafusion::arrow::array::BooleanArray;
use datafusion::arrow::compute::filter_record_batch;
use datafusion::arrow::datatypes::DataType;
use datafusion::arrow::datatypes::{DataType, Fields};
use datafusion::common::{DataFusionError, ToDFSchema};
use datafusion::datasource::MemTable;
use datafusion::execution::context::TaskContext;
@ -354,12 +353,12 @@ pub fn nullable_schema(schema: SchemaRef) -> SchemaRef {
schema
} else {
// make a new schema with all nullable fields
let new_fields = schema
let new_fields: Fields = schema
.fields()
.iter()
.map(|f| {
// make a copy of the field, but allow it to be nullable
f.clone().with_nullable(true)
f.as_ref().clone().with_nullable(true)
})
.collect();
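
A hypothetical usage sketch of `nullable_schema` (assuming the function above is in scope; not part of the diff): every field of the returned schema is nullable, regardless of the input.

```rust
use std::sync::Arc;
use arrow::datatypes::{DataType, Field, Schema};

fn main() {
    let schema = Arc::new(Schema::new(vec![
        Field::new("time", DataType::Int64, false), // non-nullable on input
        Field::new("value", DataType::Float64, true),
    ]));

    let relaxed = nullable_schema(schema);
    assert!(relaxed.fields().iter().all(|f| f.is_nullable()));
}
```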


@ -1,52 +0,0 @@
use datafusion::{
arrow::compute::SortOptions,
physical_expr::{PhysicalSortExpr, PhysicalSortRequirement},
};
/// Structure to build [`PhysicalSortRequirement`]s for ExecutionPlans.
///
/// Replace with `PhysicalSortExpr::from_sort_exprs` when
/// <https://github.com/apache/arrow-datafusion/pull/5863> is merged
/// upstream.
pub fn requirements_from_sort_exprs<'a>(
exprs: impl IntoIterator<Item = &'a PhysicalSortExpr>,
) -> Vec<PhysicalSortRequirement> {
exprs
.into_iter()
.cloned()
.map(PhysicalSortRequirement::from)
.collect()
}
/// Converts the `PhysicalSortRequirement` to `PhysicalSortExpr`.
/// If required ordering is `None` for an entry, the default
/// ordering `ASC, NULLS LAST` is used.
///
/// The default is picked to be consistent with
/// PostgreSQL: <https://www.postgresql.org/docs/current/queries-order.html>
///
/// Replace with `PhysicalSortExpr::from` when
/// <https://github.com/apache/arrow-datafusion/pull/5863> is merged
/// upstream.
pub fn into_sort_expr(requirement: PhysicalSortRequirement) -> PhysicalSortExpr {
let PhysicalSortRequirement { expr, options } = requirement;
let options = options.unwrap_or(SortOptions {
descending: false,
nulls_first: false,
});
PhysicalSortExpr { expr, options }
}
/// This function converts `PhysicalSortRequirement` to `PhysicalSortExpr`
/// for each entry in the input. If required ordering is None for an entry
/// default ordering `ASC, NULLS LAST` if given.
///
/// replace with PhysicalSortExpr::to_sort_exprs when
/// <https://github.com/apache/arrow-datafusion/pull/5863> is merged
/// upstream.
pub fn requirements_to_sort_exprs(
required: impl IntoIterator<Item = PhysicalSortRequirement>,
) -> Vec<PhysicalSortExpr> {
required.into_iter().map(into_sort_expr).collect()
}


@ -2,19 +2,19 @@
InfluxDB IOx supports running SQL queries via [Apache Arrow Flight SQL](https://arrow.apache.org/docs/format/FlightSql.html)
You can use either a native FlightSQL client or the JDBC / ODBC Flight SQL drivers.
## JDBC:
To use the JDBC driver with IOx:
1. Download the driver by following the link from [Maven](https://mvnrepository.com/artifact/org.apache.arrow/flight-sql/10.0.1) or [Dremio](https://www.dremio.com/drivers/jdbc/)
2. Use a JDBC connection of the format: `jdbc:arrow-flight-sql://hostname:port?useEncryption=false&iox-namespace-name=NAME`.
2. Use a JDBC connection of the format: `jdbc:arrow-flight-sql://hostname:port?useEncryption=false&database=NAME`
`hostname:port` is the host / port on which the IOx query gRPC API is running (default port is 8082), and `NAME` is the namespace name (for example, `26f7e5a4b7be365b_917b97a92e883afc`)
`hostname:port` is the host / port on which the IOx query gRPC API is running (default port is 8082), and `NAME` is the database name (for example, `26f7e5a4b7be365b_917b97a92e883afc`)
An example JDBC URL is:
```
jdbc:arrow-flight-sql://localhost:8082?useEncryption=false&iox-namespace-name=26f7e5a4b7be365b_917b97a92e883afc
jdbc:arrow-flight-sql://localhost:8082?useEncryption=false&database=26f7e5a4b7be365b_917b97a92e883afc
```


@ -20,5 +20,5 @@ snafu = "0.7"
once_cell = { version = "1", default-features = false }
prost = "0.11"
tokio = { version = "1.27", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] }
tonic = "0.8"
tonic = { workspace = true }
workspace-hack = { version = "0.1", path = "../workspace-hack" }


@ -4,8 +4,9 @@ use std::fmt::Display;
use arrow_flight::sql::{
ActionClosePreparedStatementRequest, ActionCreatePreparedStatementRequest, Any,
CommandGetCatalogs, CommandGetDbSchemas, CommandGetPrimaryKeys, CommandGetSqlInfo,
CommandGetTableTypes, CommandGetTables, CommandPreparedStatementQuery, CommandStatementQuery,
CommandGetCatalogs, CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys,
CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, CommandGetTableTypes,
CommandGetTables, CommandPreparedStatementQuery, CommandStatementQuery,
};
use bytes::Bytes;
use prost::Message;
@ -75,9 +76,20 @@ pub enum FlightSQLCommand {
CommandGetSqlInfo(CommandGetSqlInfo),
/// Get a list of the available catalogs. See [`CommandGetCatalogs`] for details.
CommandGetCatalogs(CommandGetCatalogs),
/// Get a description of the foreign key columns in the given foreign key table
/// that reference the primary key or the columns representing a unique constraint
/// of the parent table (could be the same or a different table).
/// See [`CommandGetCrossReference`] for details.
CommandGetCrossReference(CommandGetCrossReference),
/// Get a list of the available schemas. See [`CommandGetDbSchemas`]
/// for details and how to interpret the parameters.
CommandGetDbSchemas(CommandGetDbSchemas),
/// Get a description of the foreign key columns that reference the given
/// table's primary key columns (the foreign keys exported by a table) of a table.
/// See [`CommandGetExportedKeys`] for details.
CommandGetExportedKeys(CommandGetExportedKeys),
/// Get the foreign keys of a table. See [`CommandGetImportedKeys`] for details.
CommandGetImportedKeys(CommandGetImportedKeys),
/// Get a list of primary keys. See [`CommandGetPrimaryKeys`] for details.
CommandGetPrimaryKeys(CommandGetPrimaryKeys),
/// Get a list of the available tables
@ -101,6 +113,37 @@ impl Display for FlightSQLCommand {
write!(f, "CommandGetSqlInfo(...)")
}
Self::CommandGetCatalogs(CommandGetCatalogs {}) => write!(f, "CommandGetCatalogs"),
Self::CommandGetCrossReference(CommandGetCrossReference {
pk_catalog,
pk_db_schema,
pk_table,
fk_catalog,
fk_db_schema,
fk_table,
}) => {
write!(
f,
"CommandGetCrossReference(
pk_catalog={},
pk_db_schema={},
pk_table={},
fk_catalog={},
fk_db_schema={},
fk_table={}",
pk_catalog.as_ref().map(|c| c.as_str()).unwrap_or("<NONE>"),
pk_db_schema
.as_ref()
.map(|c| c.as_str())
.unwrap_or("<NONE>"),
pk_table,
fk_catalog.as_ref().map(|c| c.as_str()).unwrap_or("<NONE>"),
fk_db_schema
.as_ref()
.map(|c| c.as_str())
.unwrap_or("<NONE>"),
fk_table,
)
}
Self::CommandGetDbSchemas(CommandGetDbSchemas {
catalog,
db_schema_filter_pattern,
@ -115,6 +158,32 @@ impl Display for FlightSQLCommand {
.unwrap_or("<NONE>")
)
}
Self::CommandGetExportedKeys(CommandGetExportedKeys {
catalog,
db_schema,
table,
}) => {
write!(
f,
"CommandGetExportedKeys(catalog={}, db_schema={}, table={})",
catalog.as_ref().map(|c| c.as_str()).unwrap_or("<NONE>"),
db_schema.as_ref().map(|c| c.as_str()).unwrap_or("<NONE>"),
table
)
}
Self::CommandGetImportedKeys(CommandGetImportedKeys {
catalog,
db_schema,
table,
}) => {
write!(
f,
"CommandGetImportedKeys(catalog={}, db_schema={}, table={})",
catalog.as_ref().map(|c| c.as_str()).unwrap_or("<NONE>"),
db_schema.as_ref().map(|c| c.as_str()).unwrap_or("<NONE>"),
table
)
}
Self::CommandGetPrimaryKeys(CommandGetPrimaryKeys {
catalog,
db_schema,
@ -186,8 +255,14 @@ impl FlightSQLCommand {
Ok(Self::CommandGetSqlInfo(decoded_cmd))
} else if let Some(decoded_cmd) = Any::unpack::<CommandGetCatalogs>(&msg)? {
Ok(Self::CommandGetCatalogs(decoded_cmd))
} else if let Some(decoded_cmd) = Any::unpack::<CommandGetCrossReference>(&msg)? {
Ok(Self::CommandGetCrossReference(decoded_cmd))
} else if let Some(decoded_cmd) = Any::unpack::<CommandGetDbSchemas>(&msg)? {
Ok(Self::CommandGetDbSchemas(decoded_cmd))
} else if let Some(decoded_cmd) = Any::unpack::<CommandGetExportedKeys>(&msg)? {
Ok(Self::CommandGetExportedKeys(decoded_cmd))
} else if let Some(decoded_cmd) = Any::unpack::<CommandGetImportedKeys>(&msg)? {
Ok(Self::CommandGetImportedKeys(decoded_cmd))
} else if let Some(decode_cmd) = Any::unpack::<CommandGetPrimaryKeys>(&msg)? {
Ok(Self::CommandGetPrimaryKeys(decode_cmd))
} else if let Some(decode_cmd) = Any::unpack::<CommandGetTables>(&msg)? {
@ -226,7 +301,10 @@ impl FlightSQLCommand {
}
FlightSQLCommand::CommandGetSqlInfo(cmd) => Any::pack(&cmd),
FlightSQLCommand::CommandGetCatalogs(cmd) => Any::pack(&cmd),
FlightSQLCommand::CommandGetCrossReference(cmd) => Any::pack(&cmd),
FlightSQLCommand::CommandGetDbSchemas(cmd) => Any::pack(&cmd),
FlightSQLCommand::CommandGetExportedKeys(cmd) => Any::pack(&cmd),
FlightSQLCommand::CommandGetImportedKeys(cmd) => Any::pack(&cmd),
FlightSQLCommand::CommandGetPrimaryKeys(cmd) => Any::pack(&cmd),
FlightSQLCommand::CommandGetTables(cmd) => Any::pack(&cmd),
FlightSQLCommand::CommandGetTableTypes(cmd) => Any::pack(&cmd),


@ -11,8 +11,9 @@ use arrow::{
use arrow_flight::{
sql::{
ActionCreatePreparedStatementRequest, ActionCreatePreparedStatementResult, Any,
CommandGetCatalogs, CommandGetDbSchemas, CommandGetPrimaryKeys, CommandGetSqlInfo,
CommandGetTableTypes, CommandGetTables, CommandStatementQuery,
CommandGetCatalogs, CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys,
CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, CommandGetTableTypes,
CommandGetTables, CommandStatementQuery,
},
IpcMessage, SchemaAsIpc,
};
@ -64,9 +65,18 @@ impl FlightSQLPlanner {
FlightSQLCommand::CommandGetCatalogs(CommandGetCatalogs {}) => {
encode_schema(get_catalogs_schema())
}
FlightSQLCommand::CommandGetCrossReference(CommandGetCrossReference { .. }) => {
encode_schema(&GET_CROSS_REFERENCE_SCHEMA)
}
FlightSQLCommand::CommandGetDbSchemas(CommandGetDbSchemas { .. }) => {
encode_schema(get_db_schemas_schema().as_ref())
}
FlightSQLCommand::CommandGetExportedKeys(CommandGetExportedKeys { .. }) => {
encode_schema(&GET_EXPORTED_KEYS_SCHEMA)
}
FlightSQLCommand::CommandGetImportedKeys(CommandGetImportedKeys { .. }) => {
encode_schema(&GET_IMPORTED_KEYS_SCHEMA)
}
FlightSQLCommand::CommandGetPrimaryKeys(CommandGetPrimaryKeys { .. }) => {
encode_schema(&GET_PRIMARY_KEYS_SCHEMA)
}
@ -115,6 +125,35 @@ impl FlightSQLPlanner {
let plan = plan_get_catalogs(ctx).await?;
Ok(ctx.create_physical_plan(&plan).await?)
}
FlightSQLCommand::CommandGetCrossReference(CommandGetCrossReference {
pk_catalog,
pk_db_schema,
pk_table,
fk_catalog,
fk_db_schema,
fk_table,
}) => {
debug!(
?pk_catalog,
?pk_db_schema,
?pk_table,
?fk_catalog,
?fk_db_schema,
?fk_table,
"Planning CommandGetCrossReference query"
);
let plan = plan_get_cross_reference(
ctx,
pk_catalog,
pk_db_schema,
pk_table,
fk_catalog,
fk_db_schema,
fk_table,
)
.await?;
Ok(ctx.create_physical_plan(&plan).await?)
}
FlightSQLCommand::CommandGetDbSchemas(CommandGetDbSchemas {
catalog,
db_schema_filter_pattern,
@ -127,6 +166,34 @@ impl FlightSQLPlanner {
let plan = plan_get_db_schemas(ctx, catalog, db_schema_filter_pattern).await?;
Ok(ctx.create_physical_plan(&plan).await?)
}
FlightSQLCommand::CommandGetExportedKeys(CommandGetExportedKeys {
catalog,
db_schema,
table,
}) => {
debug!(
?catalog,
?db_schema,
?table,
"Planning GetExportedKeys query"
);
let plan = plan_get_exported_keys(ctx, catalog, db_schema, table).await?;
Ok(ctx.create_physical_plan(&plan).await?)
}
FlightSQLCommand::CommandGetImportedKeys(CommandGetImportedKeys {
catalog,
db_schema,
table,
}) => {
debug!(
?catalog,
?db_schema,
?table,
"Planning CommandGetImportedKeys query"
);
let plan = plan_get_imported_keys(ctx, catalog, db_schema, table).await?;
Ok(ctx.create_physical_plan(&plan).await?)
}
FlightSQLCommand::CommandGetPrimaryKeys(CommandGetPrimaryKeys {
catalog,
db_schema,
@ -272,6 +339,19 @@ async fn plan_get_catalogs(ctx: &IOxSessionContext) -> Result<LogicalPlan> {
Ok(ctx.batch_to_logical_plan(get_catalogs(ctx.inner())?)?)
}
async fn plan_get_cross_reference(
ctx: &IOxSessionContext,
_pk_catalog: Option<String>,
_pk_db_schema: Option<String>,
_pk_table: String,
_fk_catalog: Option<String>,
_fk_db_schema: Option<String>,
_fk_table: String,
) -> Result<LogicalPlan> {
let batch = RecordBatch::new_empty(Arc::clone(&GET_CROSS_REFERENCE_SCHEMA));
Ok(ctx.batch_to_logical_plan(batch)?)
}
async fn plan_get_db_schemas(
ctx: &IOxSessionContext,
catalog: Option<String>,
@ -281,6 +361,26 @@ async fn plan_get_db_schemas(
Ok(ctx.batch_to_logical_plan(batch)?)
}
async fn plan_get_exported_keys(
ctx: &IOxSessionContext,
_catalog: Option<String>,
_db_schema: Option<String>,
_table: String,
) -> Result<LogicalPlan> {
let batch = RecordBatch::new_empty(Arc::clone(&GET_EXPORTED_KEYS_SCHEMA));
Ok(ctx.batch_to_logical_plan(batch)?)
}
async fn plan_get_imported_keys(
ctx: &IOxSessionContext,
_catalog: Option<String>,
_db_schema: Option<String>,
_table: String,
) -> Result<LogicalPlan> {
let batch = RecordBatch::new_empty(Arc::clone(&GET_IMPORTED_KEYS_SCHEMA));
Ok(ctx.batch_to_logical_plan(batch)?)
}
async fn plan_get_primary_keys(
ctx: &IOxSessionContext,
_catalog: Option<String>,
@ -333,6 +433,68 @@ static TABLE_TYPES_RECORD_BATCH: Lazy<RecordBatch> = Lazy::new(|| {
RecordBatch::try_new(Arc::clone(&GET_TABLE_TYPE_SCHEMA), vec![table_type]).unwrap()
});
/// The returned data should be ordered by pk_catalog_name, pk_db_schema_name,
/// pk_table_name, pk_key_name, then key_sequence.
/// update_rule and delete_rule return a byte that is equivalent to actions:
/// - 0 = CASCADE
/// - 1 = RESTRICT
/// - 2 = SET NULL
/// - 3 = NO ACTION
/// - 4 = SET DEFAULT
static GET_CROSS_REFERENCE_SCHEMA: Lazy<SchemaRef> = Lazy::new(|| {
Arc::new(Schema::new(vec![
Field::new("pk_catalog_name", DataType::Utf8, false),
Field::new("pk_db_schema_name", DataType::Utf8, false),
Field::new("pk_table_name", DataType::Utf8, false),
Field::new("pk_column_name", DataType::Utf8, false),
Field::new("fk_catalog_name", DataType::Utf8, false),
Field::new("fk_db_schema_name", DataType::Utf8, false),
Field::new("fk_table_name", DataType::Utf8, false),
Field::new("fk_column_name", DataType::Utf8, false),
Field::new("key_sequence", DataType::Int32, false),
Field::new("fk_key_name", DataType::Utf8, false),
Field::new("pk_key_name", DataType::Utf8, false),
Field::new("update_rule", DataType::UInt8, false),
Field::new("delete_rule", DataType::UInt8, false),
]))
});
static GET_EXPORTED_KEYS_SCHEMA: Lazy<SchemaRef> = Lazy::new(|| {
Arc::new(Schema::new(vec![
Field::new("pk_catalog_name", DataType::Utf8, false),
Field::new("pk_db_schema_name", DataType::Utf8, false),
Field::new("pk_table_name", DataType::Utf8, false),
Field::new("pk_column_name", DataType::Utf8, false),
Field::new("fk_catalog_name", DataType::Utf8, false),
Field::new("fk_db_schema_name", DataType::Utf8, false),
Field::new("fk_table_name", DataType::Utf8, false),
Field::new("fk_column_name", DataType::Utf8, false),
Field::new("key_sequence", DataType::Int32, false),
Field::new("fk_key_name", DataType::Utf8, false),
Field::new("pk_key_name", DataType::Utf8, false),
Field::new("update_rule", DataType::UInt8, false),
Field::new("delete_rule", DataType::UInt8, false),
]))
});
static GET_IMPORTED_KEYS_SCHEMA: Lazy<SchemaRef> = Lazy::new(|| {
Arc::new(Schema::new(vec![
Field::new("pk_catalog_name", DataType::Utf8, false),
Field::new("pk_db_schema_name", DataType::Utf8, false),
Field::new("pk_table_name", DataType::Utf8, false),
Field::new("pk_column_name", DataType::Utf8, false),
Field::new("fk_catalog_name", DataType::Utf8, false),
Field::new("fk_db_schema_name", DataType::Utf8, false),
Field::new("fk_table_name", DataType::Utf8, false),
Field::new("fk_column_name", DataType::Utf8, false),
Field::new("key_sequence", DataType::Int32, false),
Field::new("fk_key_name", DataType::Utf8, false),
Field::new("pk_key_name", DataType::Utf8, false),
Field::new("update_rule", DataType::UInt8, false),
Field::new("delete_rule", DataType::UInt8, false),
]))
});
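
As a small illustration of the `update_rule` / `delete_rule` byte encoding documented above and shared by the three schemas just defined, a hypothetical decoding helper (not part of the diff) could look like this:

```rust
/// Map the `update_rule` / `delete_rule` byte from the cross-reference,
/// exported-keys, and imported-keys result sets to its action name.
fn referential_action(rule: u8) -> &'static str {
    match rule {
        0 => "CASCADE",
        1 => "RESTRICT",
        2 => "SET NULL",
        3 => "NO ACTION",
        4 => "SET DEFAULT",
        _ => "UNKNOWN",
    }
}

fn main() {
    assert_eq!(referential_action(0), "CASCADE");
    assert_eq!(referential_action(3), "NO ACTION");
}
```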
static GET_PRIMARY_KEYS_SCHEMA: Lazy<SchemaRef> = Lazy::new(|| {
Arc::new(Schema::new(vec![
Field::new("catalog_name", DataType::Utf8, false),


@ -1,9 +1,11 @@
use std::sync::Arc;
use arrow::{
array::{
Array, ArrayBuilder, ArrayData, BooleanBuilder, Int32Builder, Int64Builder, Int8Builder,
ListBuilder, StringBuilder, UnionArray,
},
datatypes::{DataType, Field, UnionMode},
datatypes::{DataType, Field, UnionFields, UnionMode},
};
use arrow_flight::sql::SqlInfo;
use once_cell::sync::Lazy;
@ -118,7 +120,7 @@ static UNION_TYPE: Lazy<DataType> = Lazy::new(|| {
// treat list as nullable b/c that is what the builders make
Field::new(
"string_list",
DataType::List(Box::new(Field::new("item", DataType::Utf8, true))),
DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))),
true,
),
];
@ -127,7 +129,7 @@ static UNION_TYPE: Lazy<DataType> = Lazy::new(|| {
// assume they go from 0 .. num_fields
let type_ids: Vec<i8> = (0..fields.len()).map(|v| v as i8).collect();
DataType::Union(fields, type_ids, UnionMode::Dense)
DataType::Union(UnionFields::new(type_ids, fields), UnionMode::Dense)
});
impl SqlInfoUnionBuilder {


@ -19,11 +19,11 @@ prost = "0.11"
query_functions = { path = "../query_functions" }
serde = { version = "1.0", features = ["derive"] }
snafu = "0.7"
tonic = "0.8"
tonic = { workspace = true }
workspace-hack = { version = "0.1", path = "../workspace-hack" }
[build-dependencies] # In alphabetical order
tonic-build = "0.8"
tonic-build = { workspace = true }
prost-build = "0.11"
pbjson-build = "0.5"


@ -47,7 +47,6 @@ fn generate_grpc_types(root: &Path) -> Result<()> {
let sharder_path = root.join("influxdata/iox/sharder/v1");
let wal_path = root.join("influxdata/iox/wal/v1");
let write_buffer_path = root.join("influxdata/iox/write_buffer/v1");
let write_summary_path = root.join("influxdata/iox/write_summary/v1");
let storage_path = root.join("influxdata/platform/storage");
let storage_errors_path = root.join("influxdata/platform/errors");
@ -59,7 +58,6 @@ fn generate_grpc_types(root: &Path) -> Result<()> {
delete_path.join("service.proto"),
ingester_path.join("parquet_metadata.proto"),
ingester_path.join("query.proto"),
ingester_path.join("write_info.proto"),
ingester_path.join("write.proto"),
ingester_path.join("replication.proto"),
ingester_path.join("persist.proto"),
@ -76,7 +74,6 @@ fn generate_grpc_types(root: &Path) -> Result<()> {
sharder_path.join("sharder.proto"),
wal_path.join("wal.proto"),
write_buffer_path.join("write_buffer.proto"),
write_summary_path.join("write_summary.proto"),
storage_path.join("predicate.proto"),
storage_path.join("service.proto"),
storage_path.join("source.proto"),


@ -71,20 +71,14 @@ message IngesterQueryResponseMetadata {
reserved 6;
// Partition id for this batch.
//
// This field is currently NOT used by the ingester but will be soon.
int64 partition_id = 7;
// Optional partition status.
//
// If this is given, then no schema and no batch will be part of this FlightData object.
//
// This field is currently NOT used by the ingester but will be soon.
PartitionStatus status = 8;
// UUID of this ingester instance.
//
// This field is currently NOT used by the ingester but will be soon.
string ingester_uuid = 9;
// Number of Parquet files that have been persisted to object storage for this partition.


@ -1,57 +0,0 @@
syntax = "proto3";
package influxdata.iox.ingester.v1;
option go_package = "github.com/influxdata/iox/ingester/v1";
// NOTE: This is an ALPHA / Internal API that is used as part of the
// end to end tests.
//
// A public API is tracked here:
// <https://github.com/influxdata/influxdb_iox/issues/4354>
service WriteInfoService {
// Get information about a particular write
rpc GetWriteInfo(GetWriteInfoRequest) returns (GetWriteInfoResponse);
}
message GetWriteInfoRequest {
// The write token returned from a write that was written to one or
// more shards
string write_token = 1;
}
message GetWriteInfoResponse {
// Renamed from kafka_partition_infos to shard_infos
reserved 3;
reserved "kafka_partition_infos";
// Information for all shards in this write
repeated ShardInfo shard_infos = 4;
}
// Status of a part of a write in a particular shard
message ShardInfo {
// Unique shard index
int32 shard_index = 1;
// the status of the data for this shard
ShardStatus status = 2;
}
// the state
enum ShardStatus {
// Unspecified status, will result in an error.
SHARD_STATUS_UNSPECIFIED = 0;
// The ingester has not yet processed data in this write
SHARD_STATUS_DURABLE = 1;
// The ingester has processed the data in this write and it is
// readable (will be included in a query response)?
SHARD_STATUS_READABLE = 2;
// The ingester has processed the data in this write and it is both
// readable and completely persisted to parquet files.
SHARD_STATUS_PERSISTED = 3;
// The ingester does not have information about this shard
SHARD_STATUS_UNKNOWN = 4;
}


@ -1,24 +0,0 @@
syntax = "proto3";
package influxdata.iox.write_summary.v1;
option go_package = "github.com/influxdata/iox/write_summary/v1";
// Represents a single logical write that was partitioned and sharded
// into multiple pieces in multiple shards (kafka partitions)
message WriteSummary {
// Renamed from sequencers to shards
reserved 1;
reserved "sequencers";
// per shard index (kafka partition) information
repeated ShardWrite shards = 2;
}
// Per shard (kafka partition) information about what sequence
// numbers contain part of a write
message ShardWrite {
// Unique shard index (kafka partition).
int32 shard_index = 1;
// Which sequence numbers for this shard had data
repeated int64 sequence_numbers = 2;
}


@ -196,19 +196,6 @@ pub mod influxdata {
));
}
}
pub mod write_summary {
pub mod v1 {
include!(concat!(
env!("OUT_DIR"),
"/influxdata.iox.write_summary.v1.rs"
));
include!(concat!(
env!("OUT_DIR"),
"/influxdata.iox.write_summary.v1.serde.rs"
));
}
}
}
pub mod pbdata {
@ -281,8 +268,6 @@ pub mod compactor;
pub mod delete_predicate;
#[cfg(any(feature = "data_types_conversions", test))]
pub mod ingester;
#[cfg(any(feature = "data_types_conversions", test))]
pub mod write_info;
pub use prost::{DecodeError, EncodeError};


@ -1,155 +0,0 @@
use crate::influxdata::iox::ingester::v1 as proto;
use data_types::ShardWriteStatus;
use std::collections::HashMap;
impl From<ShardWriteStatus> for proto::ShardStatus {
fn from(status: ShardWriteStatus) -> Self {
match status {
ShardWriteStatus::ShardUnknown => Self::Unknown,
ShardWriteStatus::Durable => Self::Durable,
ShardWriteStatus::Readable => Self::Readable,
ShardWriteStatus::Persisted => Self::Persisted,
}
}
}
impl proto::ShardStatus {
/// Convert the status to a number such that higher numbers are later in the data lifecycle.
/// For use in merging multiple write status gRPC responses into one response.
fn status_order(&self) -> u8 {
match self {
Self::Unspecified => panic!("Unspecified status"),
Self::Unknown => 0,
Self::Durable => 1,
Self::Readable => 2,
Self::Persisted => 3,
}
}
}
impl proto::ShardInfo {
fn merge(&mut self, other: &Self) {
let self_status = self.status();
let other_status = other.status();
let new_status = match self_status.status_order().cmp(&other_status.status_order()) {
std::cmp::Ordering::Less => other_status,
std::cmp::Ordering::Equal => self_status,
std::cmp::Ordering::Greater => self_status,
};
self.set_status(new_status);
}
}
/// "Merges" the partition information for write info responses so that the "most recent"
/// information is returned.
pub fn merge_responses(
responses: impl IntoIterator<Item = proto::GetWriteInfoResponse>,
) -> proto::GetWriteInfoResponse {
// Map shard index to status
let mut shard_infos: HashMap<_, proto::ShardInfo> = HashMap::new();
responses
.into_iter()
.flat_map(|res| res.shard_infos.into_iter())
.for_each(|info| {
shard_infos
.entry(info.shard_index)
.and_modify(|existing_info| existing_info.merge(&info))
.or_insert(info);
});
let shard_infos = shard_infos.into_values().collect();
proto::GetWriteInfoResponse { shard_infos }
}
#[cfg(test)]
mod tests {
use super::*;
use proto::{ShardInfo, ShardStatus};
#[test]
fn test_merge() {
#[derive(Debug)]
struct Test<'a> {
left: &'a ShardInfo,
right: &'a ShardInfo,
expected: &'a ShardInfo,
}
let durable = ShardInfo {
shard_index: 1,
status: ShardStatus::Durable.into(),
};
let readable = ShardInfo {
shard_index: 1,
status: ShardStatus::Readable.into(),
};
let persisted = ShardInfo {
shard_index: 1,
status: ShardStatus::Persisted.into(),
};
let unknown = ShardInfo {
shard_index: 1,
status: ShardStatus::Unknown.into(),
};
let tests = vec![
Test {
left: &unknown,
right: &unknown,
expected: &unknown,
},
Test {
left: &unknown,
right: &durable,
expected: &durable,
},
Test {
left: &unknown,
right: &readable,
expected: &readable,
},
Test {
left: &durable,
right: &unknown,
expected: &durable,
},
Test {
left: &readable,
right: &readable,
expected: &readable,
},
Test {
left: &durable,
right: &durable,
expected: &durable,
},
Test {
left: &readable,
right: &durable,
expected: &readable,
},
Test {
left: &persisted,
right: &durable,
expected: &persisted,
},
];
for test in tests {
let mut output = test.left.clone();
output.merge(test.right);
assert_eq!(
&output, test.expected,
"Mismatch\n\nOutput:\n{output:#?}\n\nTest:\n{test:#?}"
);
}
}
}


@ -7,10 +7,10 @@ license.workspace = true
[dependencies]
prost = "0.11"
prost-types = { version = "0.11.7", features = ["std"] }
tonic = "0.8"
prost-types = { version = "0.11.9", features = ["std"] }
tonic = { workspace = true }
workspace-hack = { version = "0.1", path = "../workspace-hack" }
[build-dependencies]
prost-build = "0.11"
tonic-build = "0.8"
tonic-build = { workspace = true }


@ -7,10 +7,10 @@ license.workspace = true
[dependencies]
prost = "0.11"
prost-types = { version = "0.11.7", features = ["std"] }
tonic = "0.8"
prost-types = { version = "0.11.9", features = ["std"] }
tonic = { workspace = true }
workspace-hack = { version = "0.1", path = "../workspace-hack" }
[build-dependencies]
prost-build = "0.11"
tonic-build = "0.8"
tonic-build = { workspace = true }


@ -16,7 +16,7 @@ hyper = "0.14"
pin-project = "1.0"
prost = "0.11"
tokio = {version = "1", features = [ "rt" ]}
tonic = "0.8"
tonic = { workspace = true }
tower = "0.4"
grpc-binary-logger-proto = { path = "../grpc-binary-logger-proto" }
workspace-hack = { version = "0.1", path = "../workspace-hack" }
@ -28,4 +28,4 @@ assert_matches = "1"
[build-dependencies]
prost-build = "0.11"
tonic-build = "0.8"
tonic-build = { workspace = true }


@ -15,10 +15,10 @@ iox_catalog = { path = "../iox_catalog" }
object_store = { version = "0.5.6", features = ["aws"] }
schema = { path = "../schema" }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.95"
serde_json = "1.0.96"
thiserror = "1.0.40"
tokio = { version = "1.27" }
tonic = { version = "0.8" }
tonic = { workspace = true }
workspace-hack = { version = "0.1", path = "../workspace-hack" }
[dev-dependencies]


@ -10,7 +10,7 @@ bytes = "1.4"
futures = { version = "0.3", default-features = false }
reqwest = { version = "0.11", default-features = false, features = ["stream", "json", "rustls-tls"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.95"
serde_json = "1.0.96"
snafu = "0.7"
url = "2.3.1"
uuid = { version = "1", features = ["v4"] }


@ -22,7 +22,6 @@ influxrpc_parser = { path = "../influxrpc_parser"}
iox_catalog = { path = "../iox_catalog" }
ioxd_common = { path = "../ioxd_common"}
ioxd_compactor2 = { path = "../ioxd_compactor2"}
ioxd_ingest_replica = { path = "../ioxd_ingest_replica" }
ioxd_ingester2 = { path = "../ioxd_ingester2"}
ioxd_garbage_collector = { path = "../ioxd_garbage_collector" }
ioxd_querier = { path = "../ioxd_querier"}
@ -64,7 +63,7 @@ libc = { version = "0.2" }
num_cpus = "1.15.0"
once_cell = { version = "1.17", features = ["parking_lot"] }
rustyline = { version = "11.0", default-features = false, features = ["with-file-history"]}
serde_json = "1.0.95"
serde_json = "1.0.96"
snafu = "0.7"
tempfile = "3.5.0"
thiserror = "1.0.40"
@ -72,7 +71,7 @@ tikv-jemalloc-ctl = { version = "0.5.0", optional = true }
tokio = { version = "1.27", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time", "io-std"] }
tokio-stream = { version = "0.1", features = ["net"] }
tokio-util = { version = "0.7.7", features = ["compat"] }
tonic = "0.8"
tonic = { workspace = true }
uuid = { version = "1", features = ["v4"] }
# jemalloc-sys with unprefixed_malloc_on_supported_platforms feature and heappy are mutually exclusive
tikv-jemalloc-sys = { version = "0.5.3", optional = true, features = ["unprefixed_malloc_on_supported_platforms"] }
@ -81,11 +80,11 @@ workspace-hack = { version = "0.1", path = "../workspace-hack" }
[dev-dependencies]
# In alphabetical order
arrow_util = { path = "../arrow_util" }
assert_cmd = "2.0.10"
assert_cmd = "2.0.11"
assert_matches = "1.5"
async-trait = "0.1"
predicate = { path = "../predicate" }
predicates = "3.0.2"
predicates = "3.0.3"
serde = "1.0.159"
test_helpers = { path = "../test_helpers", features = ["future_timeout"] }
test_helpers_end_to_end = { path = "../test_helpers_end_to_end" }


@ -1,7 +1,7 @@
use arrow::record_batch::RecordBatch;
use clap::ValueEnum;
use futures::TryStreamExt;
use influxdb_iox_client::format::influxql::write_columnar;
use influxdb_iox_client::format::influxql::{write_columnar, Options};
use influxdb_iox_client::{connection::Connection, flight, format::QueryOutputFormat};
use thiserror::Error;
@ -105,7 +105,7 @@ pub async fn command(connection: Connection, config: Config) -> Result<()> {
match (query_lang, &format) {
(QueryLanguage::InfluxQL, OutputFormat::Pretty) => {
write_columnar(std::io::stdout(), &batches)?
write_columnar(std::io::stdout(), &batches, Options::default())?
}
_ => {
let format: QueryOutputFormat = format.into();


@ -11,7 +11,7 @@ use clap_blocks::{
ingester2::Ingester2Config,
ingester_address::IngesterAddress,
object_store::{make_object_store, ObjectStoreConfig},
querier::{IngesterAddresses, QuerierConfig},
querier::QuerierConfig,
router2::Router2Config,
run_config::RunConfig,
socket_addr::SocketAddr,
@ -425,6 +425,9 @@ impl Config {
CatalogDsnConfig::new_sqlite(local_catalog_path)
};
let ingester_addresses =
vec![IngesterAddress::from_str(&ingester_grpc_bind_address.to_string()).unwrap()];
let router_run_config = RunConfig::new(
logging_config,
tracing_config,
@ -458,10 +461,7 @@ impl Config {
let router_config = Router2Config {
query_pool_name: QUERY_POOL_NAME.to_string(),
http_request_limit: 1_000,
ingester_addresses: vec![IngesterAddress::from_str(
&ingester_grpc_bind_address.to_string(),
)
.unwrap()],
ingester_addresses: ingester_addresses.clone(),
new_namespace_retention_hours: None, // infinite retention
namespace_autocreation_enabled: true,
partition_key_pattern: "%Y-%m-%d".to_string(),
@ -498,10 +498,8 @@ impl Config {
};
let querier_config = QuerierConfig {
num_query_threads: None, // will be ignored
shard_to_ingesters_file: None, // will be ignored
shard_to_ingesters: None, // will be ignored
ingester_addresses: vec![ingester_grpc_bind_address.to_string()], // will be ignored
num_query_threads: None, // will be ignored
ingester_addresses,
ram_pool_metadata_bytes: querier_ram_pool_metadata_bytes,
ram_pool_data_bytes: querier_ram_pool_data_bytes,
max_concurrent_queries: querier_max_concurrent_queries,
@ -660,12 +658,7 @@ pub async fn command(config: Config) -> Result<()> {
)
.await;
let ingester_addresses = IngesterAddresses::List(vec![IngesterAddress::from_str(
&ingester_run_config.grpc_bind_address.to_string(),
)
.unwrap()]);
info!(?ingester_addresses, "starting querier");
info!(ingester_addresses = ?querier_config.ingester_addresses, "starting querier");
let querier = create_querier_server_type(QuerierServerTypeArgs {
common_state: &common_state,
metric_registry: Arc::clone(&metrics),
@ -673,9 +666,7 @@ pub async fn command(config: Config) -> Result<()> {
object_store,
exec,
time_provider,
ingester_addresses,
querier_config,
rpc_write: true,
authz: authz.as_ref().map(Arc::clone),
})
.await?;

View File

@ -1,106 +0,0 @@
//! Command line options for running an ingester that a router talks to using the RPC write path.
use super::main;
use crate::process_info::{setup_metric_registry, USIZE_MAX};
use clap_blocks::{
catalog_dsn::CatalogDsnConfig, ingest_replica::IngestReplicaConfig, run_config::RunConfig,
};
use iox_query::exec::Executor;
use ioxd_common::{
server_type::{CommonServerState, CommonServerStateError},
Service,
};
use ioxd_ingest_replica::create_ingest_replica_server_type;
use observability_deps::tracing::*;
use std::{num::NonZeroUsize, sync::Arc};
use thiserror::Error;
#[derive(Debug, Error)]
pub enum Error {
#[error("run: {0}")]
Run(#[from] main::Error),
#[error("invalid config: {0}")]
InvalidConfig(#[from] CommonServerStateError),
#[error("error initializing ingest_replica: {0}")]
IngestReplica(#[from] ioxd_ingest_replica::Error),
#[error("catalog DSN error: {0}")]
CatalogDsn(#[from] clap_blocks::catalog_dsn::Error),
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
#[derive(Debug, clap::Parser)]
#[clap(
name = "run",
about = "Runs in ingest replica mode",
long_about = "Run the IOx ingest_replica server.\n\nThe configuration options below can be \
set either with the command line flags or with the specified environment \
variable. If there is a file named '.env' in the current working directory, \
it is sourced before loading the configuration.
Configuration is loaded from the following sources (highest precedence first):
- command line arguments
- user set environment variables
- .env file contents
- pre-configured default values"
)]
pub struct Config {
#[clap(flatten)]
pub(crate) run_config: RunConfig,
#[clap(flatten)]
pub(crate) catalog_dsn: CatalogDsnConfig,
#[clap(flatten)]
pub(crate) ingest_replica_config: IngestReplicaConfig,
/// Specify the size of the thread-pool for query execution, and the
/// separate compaction thread-pool.
#[clap(
long = "exec-thread-count",
env = "INFLUXDB_IOX_EXEC_THREAD_COUNT",
default_value = "4",
action
)]
pub exec_thread_count: NonZeroUsize,
/// Size of memory pool used during query exec, in bytes.
#[clap(
long = "exec-mem-pool-bytes",
env = "INFLUXDB_IOX_EXEC_MEM_POOL_BYTES",
default_value = &USIZE_MAX[..],
action
)]
exec_mem_pool_bytes: usize,
}
pub async fn command(config: Config) -> Result<()> {
let common_state = CommonServerState::from_config(config.run_config.clone())?;
let metric_registry = setup_metric_registry();
let catalog = config
.catalog_dsn
.get_catalog("ingester", Arc::clone(&metric_registry))
.await?;
let exec = Arc::new(Executor::new(
config.exec_thread_count,
config.exec_mem_pool_bytes,
));
let server_type = create_ingest_replica_server_type(
&common_state,
catalog,
Arc::clone(&metric_registry),
&config.ingest_replica_config,
exec,
)
.await?;
info!("starting ingester2");
let services = vec![Service::create(server_type, common_state.run_config())];
Ok(main::main(common_state, services, metric_registry).await?)
}

View File

@ -4,7 +4,6 @@ use trogging::cli::LoggingConfig;
pub(crate) mod all_in_one;
mod compactor2;
mod garbage_collector;
mod ingest_replica;
mod ingester2;
mod main;
mod querier;
@ -29,9 +28,6 @@ pub enum Error {
#[snafu(display("Error in ingester2 subcommand: {}", source))]
Ingester2Error { source: ingester2::Error },
#[snafu(display("Error in ingest_replica subcommand: {}", source))]
IngestReplicaError { source: ingest_replica::Error },
#[snafu(display("Error in all in one subcommand: {}", source))]
AllInOneError { source: all_in_one::Error },
@ -60,7 +56,6 @@ impl Config {
Some(Command::Querier(config)) => config.run_config.logging_config(),
Some(Command::Router2(config)) => config.run_config.logging_config(),
Some(Command::Ingester2(config)) => config.run_config.logging_config(),
Some(Command::IngestReplica(config)) => config.run_config.logging_config(),
Some(Command::AllInOne(config)) => &config.logging_config,
Some(Command::Test(config)) => config.run_config.logging_config(),
}
@ -81,9 +76,6 @@ enum Command {
/// Run the server in ingester2 mode
Ingester2(ingester2::Config),
/// Run the server in ingest_replica mode
IngestReplica(ingest_replica::Config),
/// Run the server in "all in one" mode (Default)
AllInOne(all_in_one::Config),
@ -110,9 +102,6 @@ pub async fn command(config: Config) -> Result<()> {
Some(Command::Ingester2(config)) => {
ingester2::command(config).await.context(Ingester2Snafu)
}
Some(Command::IngestReplica(config)) => ingest_replica::command(config)
.await
.context(IngestReplicaSnafu),
Some(Command::AllInOne(config)) => all_in_one::command(config).await.context(AllInOneSnafu),
Some(Command::Test(config)) => test::command(config).await.context(TestSnafu),
}

View File

@ -29,9 +29,6 @@ pub enum Error {
#[error("Invalid config: {0}")]
InvalidConfigCommon(#[from] CommonServerStateError),
#[error("Invalid config: {0}")]
InvalidConfigIngester(#[from] clap_blocks::querier::Error),
#[error("Catalog error: {0}")]
Catalog(#[from] iox_catalog::interface::Error),
@ -120,7 +117,7 @@ pub async fn command(config: Config) -> Result<(), Error> {
info!("using the write buffer path");
}
let ingester_addresses = config.querier_config.ingester_addresses()?;
let ingester_addresses = &config.querier_config.ingester_addresses;
info!(?ingester_addresses, "using ingester addresses");
let exec = Arc::new(Executor::new(
@ -135,9 +132,7 @@ pub async fn command(config: Config) -> Result<(), Error> {
object_store,
exec,
time_provider,
ingester_addresses,
querier_config: config.querier_config,
rpc_write,
authz: authz.as_ref().map(Arc::clone),
})
.await?;

View File

@ -2,7 +2,7 @@ use std::{collections::HashMap, path::PathBuf, sync::Arc};
use arrow::{
array::as_generic_binary_array,
datatypes::{DataType, Schema, SchemaRef, TimeUnit},
datatypes::{DataType, Fields, Schema, SchemaRef, TimeUnit},
record_batch::RecordBatch,
};
use arrow_flight::{
@ -339,6 +339,64 @@ async fn flightsql_get_catalogs_matches_information_schema() {
.await
}
#[tokio::test]
async fn flightsql_get_cross_reference() {
test_helpers::maybe_start_logging();
let database_url = maybe_skip_integration!();
let primary_table_name = "primary_table";
let foreign_table_name = "foreign_table";
// Set up the cluster ====================================
let mut cluster = MiniCluster::create_shared2(database_url).await;
StepTest::new(
&mut cluster,
vec![
Step::WriteLineProtocol(format!(
"{primary_table_name},tag1=A,tag2=B val=42i 123456\n\
{primary_table_name},tag1=A,tag2=C val=43i 123457\n
{foreign_table_name},tag1=B,tag2=D val=42i 123456\n\
{foreign_table_name},tag1=C,tag2=F val=43i 123457"
)),
Step::Custom(Box::new(move |state: &mut StepTestState| {
async move {
let mut client = flightsql_client(state.cluster());
let pk_catalog: Option<String> = None;
let pk_db_schema: Option<String> = None;
let fk_catalog: Option<String> = None;
let fk_db_schema: Option<String> = None;
let stream = client
.get_cross_reference(
pk_catalog,
pk_db_schema,
primary_table_name.to_string(),
fk_catalog,
fk_db_schema,
foreign_table_name.to_string(),
)
.await
.unwrap();
let batches = collect_stream(stream).await;
insta::assert_yaml_snapshot!(
batches_to_sorted_lines(&batches),
@r###"
---
- ++
- ++
"###
);
}
.boxed()
})),
],
)
.run()
.await
}
#[tokio::test]
async fn flightsql_get_tables() {
test_helpers::maybe_start_logging();
@ -938,6 +996,98 @@ async fn flightsql_get_db_schema_matches_information_schema() {
.await
}
#[tokio::test]
async fn flightsql_get_exported_keys() {
test_helpers::maybe_start_logging();
let database_url = maybe_skip_integration!();
let table_name = "the_table";
// Set up the cluster ====================================
let mut cluster = MiniCluster::create_shared2(database_url).await;
StepTest::new(
&mut cluster,
vec![
Step::WriteLineProtocol(format!(
"{table_name},tag1=A,tag2=B val=42i 123456\n\
{table_name},tag1=A,tag2=C val=43i 123457"
)),
Step::Custom(Box::new(move |state: &mut StepTestState| {
async move {
let mut client = flightsql_client(state.cluster());
let catalog: Option<String> = None;
let db_schema: Option<String> = None;
let stream = client
.get_exported_keys(catalog, db_schema, table_name.to_string())
.await
.unwrap();
let batches = collect_stream(stream).await;
insta::assert_yaml_snapshot!(
batches_to_sorted_lines(&batches),
@r###"
---
- ++
- ++
"###
);
}
.boxed()
})),
],
)
.run()
.await
}
#[tokio::test]
async fn flightsql_get_imported_keys() {
test_helpers::maybe_start_logging();
let database_url = maybe_skip_integration!();
let table_name = "the_table";
// Set up the cluster ====================================
let mut cluster = MiniCluster::create_shared2(database_url).await;
StepTest::new(
&mut cluster,
vec![
Step::WriteLineProtocol(format!(
"{table_name},tag1=A,tag2=B val=42i 123456\n\
{table_name},tag1=A,tag2=C val=43i 123457"
)),
Step::Custom(Box::new(move |state: &mut StepTestState| {
async move {
let mut client = flightsql_client(state.cluster());
let catalog: Option<String> = None;
let db_schema: Option<String> = None;
let stream = client
.get_imported_keys(catalog, db_schema, table_name.to_string())
.await
.unwrap();
let batches = collect_stream(stream).await;
insta::assert_yaml_snapshot!(
batches_to_sorted_lines(&batches),
@r###"
---
- ++
- ++
"###
);
}
.boxed()
})),
],
)
.run()
.await
}
#[tokio::test]
async fn flightsql_get_primary_keys() {
test_helpers::maybe_start_logging();
@ -1254,10 +1404,10 @@ async fn assert_schema(client: &mut FlightClient, cmd: Any) {
}
fn strip_metadata(schema: &Schema) -> SchemaRef {
let stripped_fields: Vec<_> = schema
let stripped_fields: Fields = schema
.fields()
.iter()
.map(|f| f.clone().with_metadata(HashMap::new()))
.map(|f| f.as_ref().clone().with_metadata(HashMap::new()))
.collect();
Arc::new(Schema::new(stripped_fields))
@ -1357,8 +1507,149 @@ async fn authz() {
authz.close().await;
}
/// Ensure that the FlightSQL API supports the following gRPC header names,
/// in addition to the existing `iox-namespace-name`:
/// 1. database
/// 2. bucket
/// 3. bucket-name
#[tokio::test]
async fn flightsql_client_header_same_database() {
test_helpers::maybe_start_logging();
let database_url = maybe_skip_integration!();
let table_name = "the_table";
// Set up the cluster ====================================
let mut cluster = MiniCluster::create_shared2(database_url).await;
StepTest::new(
&mut cluster,
vec![
Step::WriteLineProtocol(format!(
"{table_name},tag1=A,tag2=B val=42i 123456\n\
{table_name},tag1=A,tag2=C val=43i 123457"
)),
Step::Custom(Box::new(move |state: &mut StepTestState| {
async move {
let mut client = flightsql_client_helper(state.cluster(), "iox-namespace-name");
for header_name in &["database", "bucket", "bucket-name"] {
// different header names with the same database name
client
.add_header(header_name, state.cluster().namespace())
.unwrap();
}
let stream = client.get_table_types().await.unwrap();
let batches = collect_stream(stream).await;
insta::assert_yaml_snapshot!(
batches_to_sorted_lines(&batches),
@r###"
---
- +------------+
- "| table_type |"
- +------------+
- "| BASE TABLE |"
- "| VIEW |"
- +------------+
"###
);
}
.boxed()
})),
],
)
.run()
.await
}
#[tokio::test]
async fn flightsql_client_header_different_database() {
test_helpers::maybe_start_logging();
let database_url = maybe_skip_integration!();
let table_name = "the_table";
// Set up the cluster ====================================
let mut cluster = MiniCluster::create_shared2(database_url).await;
StepTest::new(
&mut cluster,
vec![
Step::WriteLineProtocol(format!(
"{table_name},tag1=A,tag2=B val=42i 123456\n\
{table_name},tag1=A,tag2=C val=43i 123457"
)),
Step::Custom(Box::new(move |state: &mut StepTestState| {
async move {
let mut client = flightsql_client_helper(state.cluster(), "database");
client
.add_header("bucket", "different_database_name")
.unwrap();
let err = client.get_table_types().await.unwrap_err();
assert_matches!(err, FlightError::Tonic(status) => {
assert_eq!(status.code(), tonic::Code::InvalidArgument);
assert_contains!(status.message(), "More than one headers are found in request");
}
);
}
.boxed()
})),
],
)
.run()
.await
}
#[tokio::test]
async fn flightsql_client_header_no_database() {
test_helpers::maybe_start_logging();
let database_url = maybe_skip_integration!();
let table_name = "the_table";
// Set up the cluster ====================================
let mut cluster = MiniCluster::create_shared2(database_url).await;
StepTest::new(
&mut cluster,
vec![
Step::WriteLineProtocol(format!(
"{table_name},tag1=A,tag2=B val=42i 123456\n\
{table_name},tag1=A,tag2=C val=43i 123457"
)),
Step::Custom(Box::new(move |state: &mut StepTestState| {
async move {
let connection = state.cluster().querier().querier_grpc_connection();
let (channel, _headers) = connection.into_grpc_connection().into_parts();
let mut client = FlightSqlClient::new(channel);
let err = client.get_table_types().await.unwrap_err();
assert_matches!(err, FlightError::Tonic(status) => {
assert_eq!(status.code(), tonic::Code::InvalidArgument);
assert_contains!(status.message(), "no 'database' header in request");
}
);
}
.boxed()
})),
],
)
.run()
.await
}
/// Return a [`FlightSqlClient`] configured for use
fn flightsql_client(cluster: &MiniCluster) -> FlightSqlClient {
flightsql_client_helper(cluster, "database")
}
/// Helper for `flightsql_client` that returns a [`FlightSqlClient`] configured to send the namespace under the given header name
fn flightsql_client_helper(cluster: &MiniCluster, header_name: &str) -> FlightSqlClient {
let connection = cluster.querier().querier_grpc_connection();
let (channel, _headers) = connection.into_grpc_connection().into_parts();
@ -1366,7 +1657,7 @@ fn flightsql_client(cluster: &MiniCluster) -> FlightSqlClient {
// Add namespace to client headers until it is fully supported by FlightSQL
let namespace = cluster.namespace();
client.add_header("iox-namespace-name", namespace).unwrap();
client.add_header(header_name, namespace).unwrap();
client
}

View File

@ -28,6 +28,13 @@ async fn influxql_returns_error() {
"Error while planning query: This feature is not implemented: SHOW TAG KEYS"
.into(),
},
Step::InfluxQLExpectingError {
query: "SHOW TAG KEYYYYYES".into(),
expected_error_code: tonic::Code::InvalidArgument,
expected_message:
"Error while planning query: Error during planning: invalid SHOW TAG statement, expected KEYS or VALUES at pos 9"
.into(),
},
],
)
.run()

View File

@ -40,58 +40,6 @@ fn ingester2_errors_without_mode_env_var() {
));
}
#[test]
fn querier_errors_with_mode_env_var_and_shard_to_ingester_mapping() {
let shard_to_ingesters_json = r#"{
"ingesters": {
"i1": {
"addr": "arbitrary"
}
},
"shards": {
"0": {
"ingester": "i1"
}
}
}"#;
Command::cargo_bin("influxdb_iox")
.unwrap()
.env_clear()
.env("INFLUXDB_IOX_RPC_MODE", "2")
.arg("run")
.arg("querier")
.arg("--shard-to-ingesters")
.arg(shard_to_ingesters_json)
.arg("--catalog")
.arg("memory")
.timeout(Duration::from_secs(2))
.assert()
.failure()
.stderr(predicate::str::contains(
"`INFLUXDB_IOX_RPC_MODE` is set but shard to ingester mappings were provided",
));
}
#[test]
fn querier_errors_without_mode_env_var_and_ingester_addresses() {
Command::cargo_bin("influxdb_iox")
.unwrap()
.env_clear()
.arg("run")
.arg("querier")
.arg("--ingester-addresses")
.arg("http://arbitrary:8082")
.arg("--catalog")
.arg("memory")
.timeout(Duration::from_secs(2))
.assert()
.failure()
.stderr(predicate::str::contains(
"`INFLUXDB_IOX_RPC_MODE` is unset but ingester addresses were provided",
));
}
#[test]
fn querier_without_ingesters_without_mode_env_var_uses_write_buffer() {
Command::cargo_bin("influxdb_iox")

View File

@ -127,11 +127,27 @@ public class Main {
System.out.println("**************");
print_result_set(md.getCatalogs());
System.out.println("**************");
System.out.println("CrossReference");
System.out.println("**************");
print_result_set(md.getCrossReference(null, null, "system", null, null, "iox"));
System.out.println("**************");
System.out.println("Schemas:");
System.out.println("**************");
print_result_set(md.getSchemas());
System.out.println("**************");
System.out.println("ExportedKeys");
System.out.println("**************");
print_result_set(md.getExportedKeys(null, null, "system"));
System.out.println("**************");
System.out.println("ImportedKeys");
System.out.println("**************");
print_result_set(md.getImportedKeys(null, null, "system"));
System.out.println("**************");
System.out.println("PrimaryKeys:");
System.out.println("**************");

View File

@ -10,14 +10,15 @@ influxdb_iox -v
## Run the JDBC test
To run the JDBC test program, specify the target namespace in the JDBC URL:
To run the JDBC test program, specify the target database in the JDBC URL:
```shell
# run the jdbc client driver program, downloading the JDBC driver if needed
./jdbc_client "jdbc:arrow-flight-sql://localhost:8082?useEncryption=false&iox-namespace-name=26f7e5a4b7be365b_917b97a92e883afc" query 'select * from cpu'
./jdbc_client "jdbc:arrow-flight-sql://localhost:8082?useEncryption=false&database=26f7e5a4b7be365b_917b97a92e883afc" query 'select * from cpu'
```
# Cleanup:
Clean up any intermediate files (like the JDBC driver)
```shell

View File

@ -1,11 +1,15 @@
-- Gap-filling tests
-- IOX_SETUP: OneMeasurementTwoSeries
-- Input data
-- region=a 2000-05-05T12:20:00Z
-- region=a 2000-05-05T12:40:00Z
-- region=b 2000-05-05T12:31:00Z
-- region=b 2000-05-05T12:39:00Z
-- Input data (by region, time)
SELECT *
FROM cpu
ORDER BY REGION, TIME;
-- Input data (by time)
SELECT *
FROM cpu
ORDER BY TIME;
-- IOX_COMPARE: uuid
EXPLAIN SELECT
@ -75,3 +79,13 @@ from cpu
where time between timestamp '2000-05-05T12:19:00Z' and timestamp '2000-05-05T12:40:00Z'
group by minute;
-- cpu.idle has a null value at 12:31. Interpolation should still occur,
-- overwriting the null value.
SELECT
date_bin_gapfill(interval '4 minutes', time, timestamp '1970-01-01T00:00:00Z') as four_minute,
interpolate(min(cpu.idle)),
interpolate(min(cpu."user"))
from cpu
where time between timestamp '2000-05-05T12:19:00Z' and timestamp '2000-05-05T12:40:00Z'
group by four_minute;

View File

@ -1,21 +1,39 @@
-- Test Setup: OneMeasurementTwoSeries
-- SQL: SELECT * FROM cpu ORDER BY REGION, TIME;
+------+--------+----------------------+------+
| idle | region | time | user |
+------+--------+----------------------+------+
| 70.0 | a | 2000-05-05T12:20:00Z | 23.2 |
| | a | 2000-05-05T12:40:00Z | 21.0 |
| | b | 2000-05-05T12:31:00Z | 25.2 |
| 60.0 | b | 2000-05-05T12:39:00Z | 28.9 |
+------+--------+----------------------+------+
-- SQL: SELECT * FROM cpu ORDER BY TIME;
+------+--------+----------------------+------+
| idle | region | time | user |
+------+--------+----------------------+------+
| 70.0 | a | 2000-05-05T12:20:00Z | 23.2 |
| | b | 2000-05-05T12:31:00Z | 25.2 |
| 60.0 | b | 2000-05-05T12:39:00Z | 28.9 |
| | a | 2000-05-05T12:40:00Z | 21.0 |
+------+--------+----------------------+------+
-- SQL: EXPLAIN SELECT date_bin_gapfill(interval '10 minute', time, timestamp '1970-01-01T00:00:00Z') as minute, count(cpu.user) from cpu where time between timestamp '2000-05-05T12:00:00Z' and timestamp '2000-05-05T12:59:00Z' group by minute;
-- Results After Normalizing UUIDs
----------
| plan_type | plan |
----------
| logical_plan | Projection: date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z")) AS minute, COUNT(cpu.user) |
| | GapFill: groupBy=[[date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))]], aggr=[[COUNT(cpu.user)]], time_column=date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z")), stride=IntervalDayTime("600000"), range=Included(TimestampNanosecond(957528000000000000, None))..Included(TimestampNanosecond(957531540000000000, None)) |
| | Aggregate: groupBy=[[datebin(IntervalDayTime("600000"), cpu.time, TimestampNanosecond(0, None)) AS date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))]], aggr=[[COUNT(cpu.user)]] |
| logical_plan | Projection: date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z")) AS minute, COUNT(cpu.user) |
| | GapFill: groupBy=[[date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))]], aggr=[[COUNT(cpu.user)]], time_column=date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z")), stride=IntervalMonthDayNano("600000000000"), range=Included(TimestampNanosecond(957528000000000000, None))..Included(TimestampNanosecond(957531540000000000, None)) |
| | Aggregate: groupBy=[[datebin(IntervalMonthDayNano("600000000000"), cpu.time, TimestampNanosecond(0, None)) AS date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))]], aggr=[[COUNT(cpu.user)]] |
| | TableScan: cpu projection=[time, user], full_filters=[cpu.time >= TimestampNanosecond(957528000000000000, None), cpu.time <= TimestampNanosecond(957531540000000000, None)] |
| physical_plan | ProjectionExec: expr=[date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@0 as minute, COUNT(cpu.user)@1 as COUNT(cpu.user)] |
| | GapFillExec: group_expr=[date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@0], aggr_expr=[COUNT(cpu.user)@1], stride=600000, time_range=Included("957528000000000000")..Included("957531540000000000") |
| | SortPreservingMergeExec: [date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@0 ASC] |
| | SortExec: expr=[date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@0 ASC] |
| | AggregateExec: mode=FinalPartitioned, gby=[date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@0 as date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))], aggr=[COUNT(cpu.user)] |
| physical_plan | ProjectionExec: expr=[date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@0 as minute, COUNT(cpu.user)@1 as COUNT(cpu.user)] |
| | GapFillExec: group_expr=[date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@0], aggr_expr=[COUNT(cpu.user)@1], stride=600000000000, time_range=Included("957528000000000000")..Included("957531540000000000") |
| | SortPreservingMergeExec: [date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@0 ASC] |
| | SortExec: expr=[date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@0 ASC] |
| | AggregateExec: mode=FinalPartitioned, gby=[date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@0 as date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))], aggr=[COUNT(cpu.user)] |
| | CoalesceBatchesExec: target_batch_size=8192 |
| | RepartitionExec: partitioning=Hash([Column { name: "date_bin_gapfill(IntervalDayTime(\"600000\"),cpu.time,Utf8(\"1970-01-01T00:00:00Z\"))", index: 0 }], 4), input_partitions=4 |
| | AggregateExec: mode=Partial, gby=[datebin(600000, time@0, 0) as date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))], aggr=[COUNT(cpu.user)] |
| | RepartitionExec: partitioning=Hash([Column { name: "date_bin_gapfill(IntervalMonthDayNano(\"600000000000\"),cpu.time,Utf8(\"1970-01-01T00:00:00Z\"))", index: 0 }], 4), input_partitions=4 |
| | AggregateExec: mode=Partial, gby=[datebin(600000000000, time@0, 0) as date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))], aggr=[COUNT(cpu.user)] |
| | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 |
| | CoalesceBatchesExec: target_batch_size=8192 |
| | FilterExec: time@0 >= 957528000000000000 AND time@0 <= 957531540000000000 |
@ -85,18 +103,18 @@
----------
| plan_type | plan |
----------
| logical_plan | Projection: cpu.region, date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z")) AS minute, AVG(cpu.user) |
| | GapFill: groupBy=[[cpu.region, date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))]], aggr=[[LOCF(AVG(cpu.user))]], time_column=date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z")), stride=IntervalDayTime("600000"), range=Included(TimestampNanosecond(957528000000000000, None))..Included(TimestampNanosecond(957531540000000000, None)) |
| | Aggregate: groupBy=[[cpu.region, datebin(IntervalDayTime("600000"), cpu.time, TimestampNanosecond(0, None)) AS date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))]], aggr=[[AVG(cpu.user)]] |
| logical_plan | Projection: cpu.region, date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z")) AS minute, AVG(cpu.user) |
| | GapFill: groupBy=[[cpu.region, date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))]], aggr=[[LOCF(AVG(cpu.user))]], time_column=date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z")), stride=IntervalMonthDayNano("600000000000"), range=Included(TimestampNanosecond(957528000000000000, None))..Included(TimestampNanosecond(957531540000000000, None)) |
| | Aggregate: groupBy=[[cpu.region, datebin(IntervalMonthDayNano("600000000000"), cpu.time, TimestampNanosecond(0, None)) AS date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))]], aggr=[[AVG(cpu.user)]] |
| | TableScan: cpu projection=[region, time, user], full_filters=[cpu.time >= TimestampNanosecond(957528000000000000, None), cpu.time <= TimestampNanosecond(957531540000000000, None)] |
| physical_plan | ProjectionExec: expr=[region@0 as region, date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@1 as minute, AVG(cpu.user)@2 as AVG(cpu.user)] |
| | GapFillExec: group_expr=[region@0, date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@1], aggr_expr=[LOCF(AVG(cpu.user)@2)], stride=600000, time_range=Included("957528000000000000")..Included("957531540000000000") |
| | SortPreservingMergeExec: [region@0 ASC,date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@1 ASC] |
| | SortExec: expr=[region@0 ASC,date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@1 ASC] |
| | AggregateExec: mode=FinalPartitioned, gby=[region@0 as region, date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@1 as date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))], aggr=[AVG(cpu.user)] |
| physical_plan | ProjectionExec: expr=[region@0 as region, date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@1 as minute, AVG(cpu.user)@2 as AVG(cpu.user)] |
| | GapFillExec: group_expr=[region@0, date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@1], aggr_expr=[LOCF(AVG(cpu.user)@2)], stride=600000000000, time_range=Included("957528000000000000")..Included("957531540000000000") |
| | SortPreservingMergeExec: [region@0 ASC,date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@1 ASC] |
| | SortExec: expr=[region@0 ASC,date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@1 ASC] |
| | AggregateExec: mode=FinalPartitioned, gby=[region@0 as region, date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))@1 as date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))], aggr=[AVG(cpu.user)] |
| | CoalesceBatchesExec: target_batch_size=8192 |
| | RepartitionExec: partitioning=Hash([Column { name: "region", index: 0 }, Column { name: "date_bin_gapfill(IntervalDayTime(\"600000\"),cpu.time,Utf8(\"1970-01-01T00:00:00Z\"))", index: 1 }], 4), input_partitions=4 |
| | AggregateExec: mode=Partial, gby=[region@0 as region, datebin(600000, time@1, 0) as date_bin_gapfill(IntervalDayTime("600000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))], aggr=[AVG(cpu.user)] |
| | RepartitionExec: partitioning=Hash([Column { name: "region", index: 0 }, Column { name: "date_bin_gapfill(IntervalMonthDayNano(\"600000000000\"),cpu.time,Utf8(\"1970-01-01T00:00:00Z\"))", index: 1 }], 4), input_partitions=4 |
| | AggregateExec: mode=Partial, gby=[region@0 as region, datebin(600000000000, time@1, 0) as date_bin_gapfill(IntervalMonthDayNano("600000000000"),cpu.time,Utf8("1970-01-01T00:00:00Z"))], aggr=[AVG(cpu.user)] |
| | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 |
| | CoalesceBatchesExec: target_batch_size=8192 |
| | FilterExec: time@1 >= 957528000000000000 AND time@1 <= 957531540000000000 |
@ -152,4 +170,16 @@
| 2000-05-05T12:38:00Z | 70.0 |
| 2000-05-05T12:39:00Z | 60.0 |
| 2000-05-05T12:40:00Z | 60.0 |
+----------------------+---------------+
+----------------------+---------------+
-- SQL: SELECT date_bin_gapfill(interval '4 minutes', time, timestamp '1970-01-01T00:00:00Z') as four_minute, interpolate(min(cpu.idle)), interpolate(min(cpu."user")) from cpu where time between timestamp '2000-05-05T12:19:00Z' and timestamp '2000-05-05T12:40:00Z' group by four_minute;
+----------------------+---------------+---------------+
| four_minute | MIN(cpu.idle) | MIN(cpu.user) |
+----------------------+---------------+---------------+
| 2000-05-05T12:16:00Z | | |
| 2000-05-05T12:20:00Z | 70.0 | 23.2 |
| 2000-05-05T12:24:00Z | 67.5 | 24.2 |
| 2000-05-05T12:28:00Z | 65.0 | 25.2 |
| 2000-05-05T12:32:00Z | 62.5 | 27.05 |
| 2000-05-05T12:36:00Z | 60.0 | 28.9 |
| 2000-05-05T12:40:00Z | | 21.0 |
+----------------------+---------------+---------------+

View File

@ -1,104 +1,204 @@
-- Test Setup: InfluxQLSelectSupport
-- InfluxQL: SHOW FIELD KEYS;
+------------------+--------------+-----------+
| iox::measurement | fieldKey | fieldType |
+------------------+--------------+-----------+
| cpu | usage_idle | float |
| cpu | usage_system | float |
| disk | bytes_free | integer |
| disk | bytes_used | integer |
| m0 | f64 | float |
| m0 | i64 | integer |
| m0 | str | string |
| m1 | f64 | float |
| m1 | i64 | integer |
| m1 | str | string |
| m2 | f64 | float |
| m3 | u64 | unsigned |
+------------------+--------------+-----------+
name: cpu
+--------------+-----------+
| fieldKey | fieldType |
+--------------+-----------+
| usage_idle | float |
| usage_system | float |
+--------------+-----------+
name: disk
+------------+-----------+
| fieldKey | fieldType |
+------------+-----------+
| bytes_free | integer |
| bytes_used | integer |
+------------+-----------+
name: m0
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| f64 | float |
| i64 | integer |
| str | string |
+----------+-----------+
name: m1
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| f64 | float |
| i64 | integer |
| str | string |
+----------+-----------+
name: m2
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| f64 | float |
+----------+-----------+
name: m3
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| u64 | unsigned |
+----------+-----------+
-- InfluxQL: SHOW FIELD KEYS LIMIT 2;
+------------------+--------------+-----------+
| iox::measurement | fieldKey | fieldType |
+------------------+--------------+-----------+
| cpu | usage_idle | float |
| cpu | usage_system | float |
| disk | bytes_free | integer |
| disk | bytes_used | integer |
| m0 | f64 | float |
| m0 | i64 | integer |
| m1 | f64 | float |
| m1 | i64 | integer |
| m2 | f64 | float |
| m3 | u64 | unsigned |
+------------------+--------------+-----------+
name: cpu
+--------------+-----------+
| fieldKey | fieldType |
+--------------+-----------+
| usage_idle | float |
| usage_system | float |
+--------------+-----------+
name: disk
+------------+-----------+
| fieldKey | fieldType |
+------------+-----------+
| bytes_free | integer |
| bytes_used | integer |
+------------+-----------+
name: m0
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| f64 | float |
| i64 | integer |
+----------+-----------+
name: m1
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| f64 | float |
| i64 | integer |
+----------+-----------+
name: m2
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| f64 | float |
+----------+-----------+
name: m3
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| u64 | unsigned |
+----------+-----------+
-- InfluxQL: SHOW FIELD KEYS OFFSET 1;
+------------------+--------------+-----------+
| iox::measurement | fieldKey | fieldType |
+------------------+--------------+-----------+
| cpu | usage_system | float |
| disk | bytes_used | integer |
| m0 | i64 | integer |
| m0 | str | string |
| m1 | i64 | integer |
| m1 | str | string |
+------------------+--------------+-----------+
name: cpu
+--------------+-----------+
| fieldKey | fieldType |
+--------------+-----------+
| usage_system | float |
+--------------+-----------+
name: disk
+------------+-----------+
| fieldKey | fieldType |
+------------+-----------+
| bytes_used | integer |
+------------+-----------+
name: m0
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| i64 | integer |
| str | string |
+----------+-----------+
name: m1
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| i64 | integer |
| str | string |
+----------+-----------+
-- InfluxQL: SHOW FIELD KEYS LIMIT 1 OFFSET 2;
+------------------+----------+-----------+
| iox::measurement | fieldKey | fieldType |
+------------------+----------+-----------+
| m0 | str | string |
| m1 | str | string |
+------------------+----------+-----------+
name: m0
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| str | string |
+----------+-----------+
name: m1
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| str | string |
+----------+-----------+
-- InfluxQL: SHOW FIELD KEYS FROM cpu;
+------------------+--------------+-----------+
| iox::measurement | fieldKey | fieldType |
+------------------+--------------+-----------+
| cpu | usage_idle | float |
| cpu | usage_system | float |
+------------------+--------------+-----------+
name: cpu
+--------------+-----------+
| fieldKey | fieldType |
+--------------+-----------+
| usage_idle | float |
| usage_system | float |
+--------------+-----------+
-- InfluxQL: SHOW FIELD KEYS FROM disk,cpu,disk;
+------------------+--------------+-----------+
| iox::measurement | fieldKey | fieldType |
+------------------+--------------+-----------+
| cpu | usage_idle | float |
| cpu | usage_system | float |
| disk | bytes_free | integer |
| disk | bytes_used | integer |
+------------------+--------------+-----------+
name: cpu
+--------------+-----------+
| fieldKey | fieldType |
+--------------+-----------+
| usage_idle | float |
| usage_system | float |
+--------------+-----------+
name: disk
+------------+-----------+
| fieldKey | fieldType |
+------------+-----------+
| bytes_free | integer |
| bytes_used | integer |
+------------+-----------+
-- InfluxQL: SHOW FIELD KEYS FROM cpu,disk,cpu;
+------------------+--------------+-----------+
| iox::measurement | fieldKey | fieldType |
+------------------+--------------+-----------+
| cpu | usage_idle | float |
| cpu | usage_system | float |
| disk | bytes_free | integer |
| disk | bytes_used | integer |
+------------------+--------------+-----------+
name: cpu
+--------------+-----------+
| fieldKey | fieldType |
+--------------+-----------+
| usage_idle | float |
| usage_system | float |
+--------------+-----------+
name: disk
+------------+-----------+
| fieldKey | fieldType |
+------------+-----------+
| bytes_free | integer |
| bytes_used | integer |
+------------+-----------+
-- InfluxQL: SHOW FIELD KEYS FROM /m.*/;
+------------------+----------+-----------+
| iox::measurement | fieldKey | fieldType |
+------------------+----------+-----------+
| m0 | f64 | float |
| m0 | i64 | integer |
| m0 | str | string |
| m1 | f64 | float |
| m1 | i64 | integer |
| m1 | str | string |
| m2 | f64 | float |
| m3 | u64 | unsigned |
+------------------+----------+-----------+
name: m0
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| f64 | float |
| i64 | integer |
| str | string |
+----------+-----------+
name: m1
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| f64 | float |
| i64 | integer |
| str | string |
+----------+-----------+
name: m2
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| f64 | float |
+----------+-----------+
name: m3
+----------+-----------+
| fieldKey | fieldType |
+----------+-----------+
| u64 | unsigned |
+----------+-----------+
-- InfluxQL: SHOW FIELD KEYS FROM /d\isk/;
+------------------+------------+-----------+
| iox::measurement | fieldKey | fieldType |
+------------------+------------+-----------+
| disk | bytes_free | integer |
| disk | bytes_used | integer |
+------------------+------------+-----------+
name: disk
+------------+-----------+
| fieldKey | fieldType |
+------------+-----------+
| bytes_free | integer |
| bytes_used | integer |
+------------+-----------+
-- InfluxQL: SHOW FIELD KEYS FROM does_not_exist;
+------------------+----------+-----------+
| iox::measurement | fieldKey | fieldType |
+------------------+----------+-----------+
+------------------+----------+-----------+
-- InfluxQL: SHOW FIELD KEYS ON my_db;
Error while planning query: This feature is not implemented: SHOW FIELD KEYS ON <database>
-- InfluxQL: SHOW FIELD KEYS FROM x.my_db;

View File

@ -7,49 +7,39 @@
-- Validates expected data is returned
-- Projection wildcard, all tags and fields
-- IOX_COMPARE: sorted
SELECT * FROM m0;
-- No matching measurement
SELECT * FROM non_existent;
-- Projection wildcard, only tags
-- IOX_COMPARE: sorted
SELECT *::tag, f64 FROM m0;
-- Projection wildcard, only fields
-- IOX_COMPARE: sorted
SELECT *::field FROM m0;
-- Projection regex, mixture of tags and fields
-- IOX_COMPARE: sorted
SELECT /64|tag0/ FROM m0;
-- Projection specific tags and fields
-- IOX_COMPARE: sorted
SELECT f64, tag0 FROM m0;
-- Explicitly select time column
-- IOX_COMPARE: sorted
SELECT f64, tag0, time FROM m0;
-- arithmetic operators
-- IOX_COMPARE: sorted
SELECT f64, f64 * 2, i64, i64 + i64 FROM m0;
-- bitwise operators
-- IOX_COMPARE: sorted
SELECT i64, i64 & 1 FROM m0;
-- Automatic type coercion integer → float
-- IOX_COMPARE: sorted
SELECT f64 + i64 FROM m0;
-- Type cast postfix operator
SELECT f64, f64::integer FROM m0;
-- Column alias behaviour
-- IOX_COMPARE: sorted
SELECT f64 AS f64_2, f64, f64, f64 FROM m0 LIMIT 1;
--
@ -57,55 +47,45 @@ SELECT f64 AS f64_2, f64, f64, f64 FROM m0 LIMIT 1;
--
-- Single tag
-- IOX_COMPARE: sorted
SELECT tag0, f64 FROM m0 WHERE tag0 = 'val00';
-- IOX_COMPARE: sorted
SELECT tag0, f64 FROM m0 WHERE tag0 =~ /^val0(1|2)/;
-- Conjunction (AND)
-- IOX_COMPARE: sorted
SELECT /tag(0|1)/, f64 FROM m0 WHERE tag0 = 'val00' AND tag1 = 'val10';
-- Disjunction (OR)
-- IOX_COMPARE: sorted
SELECT /tag(0|1)/, f64 FROM m0 WHERE tag0 = 'val00' OR tag1 = 'val10';
-- arithmetic
-- IOX_COMPARE: sorted
SELECT f64 FROM m0 WHERE f64 > 10 + 10;
-- bitwise
-- IOX_COMPARE: sorted
SELECT i64 FROM m0 WHERE i64 & 1 = 0;
-- time bounds
-- timestamp format %Y-%M-%D
-- IOX_COMPARE: sorted
SELECT i64 FROM m0 WHERE time > '2022-10-31';
-- timestamp format %Y-%M-%D %h:%m:%s
-- IOX_COMPARE: sorted
SELECT i64 FROM m0 WHERE time > '2022-10-31 02:00:10';
-- now() and duration
-- NOTE: 100000d is > 270 years, so this test should be ok for a while.
-- However, if this test is still in use in 270 years and it starts failing,
-- try increasing the number of days 😂
-- IOX_COMPARE: sorted
SELECT i64 FROM m0 WHERE time > now() - 100000d;
-- NOT NULL test
-- WHERE tag1 != '' is equivalent to tag1 IS NOT NULL
-- TODO(sgc): This is working, but likely by accident
-- IOX_COMPARE: sorted
SELECT tag1, f64 FROM m0 WHERE tag1 != '';
-- NULL test
-- WHERE tag1 = '' is equivalent to tag1 IS NULL
-- TODO(sgc): Not working, as expected
-- -- IOX_COMPARE: sorted
--
-- SELECT tag1, f64 FROM m0 WHERE tag1 = '';
--
@ -292,6 +272,9 @@ SELECT usage_idle, bytes_free, device, cpu FROM cpu, disk GROUP BY device, cpu;
SELECT COUNT(f64), SUM(f64), stddev(f64) FROM m0;
SELECT COUNT(f64), SUM(f64), stddev(f64) FROM m0, m1;
SELECT COUNT(f64), SUM(f64), stddev(f64) FROM m0 GROUP BY tag0;
-- IOX_COMPARE: no_borders
EXPLAIN SELECT COUNT(f64), SUM(f64), stddev(f64) FROM m0, m1 GROUP BY tag0;
-- TODO(sgc): `sorted` is a workaround for https://github.com/influxdata/influxdb_iox/issues/7513
-- IOX_COMPARE: sorted
SELECT COUNT(f64), SUM(f64), stddev(f64) FROM m0, m1 GROUP BY tag0;
SELECT COUNT(f64), SUM(f64), stddev(f64) FROM m0 GROUP BY tag0, non_existent;
@ -304,6 +287,7 @@ SELECT COUNT(f64) as the_count, SUM(f64) + SUM(non_existent) as foo FROM m0;
-- measurements with different schema
SELECT MEAN(usage_idle), MEAN(bytes_free) FROM cpu, disk;
-- TODO(sgc): `sorted` is a workaround for https://github.com/influxdata/influxdb_iox/issues/7513
-- IOX_COMPARE: sorted
SELECT MEAN(usage_idle), MEAN(bytes_free) FROM cpu, disk GROUP BY TIME(10s) FILL(none);
@ -327,6 +311,7 @@ SELECT COUNT(f64), SUM(f64) FROM m0 GROUP BY TIME(30s) FILL(none);
SELECT COUNT(f64), SUM(f64) FROM m0 GROUP BY TIME(30s, 1s) FILL(none);
SELECT COUNT(usage_idle), COUNT(bytes_free) FROM cpu, disk;
-- TODO(sgc): `sorted` is a workaround for https://github.com/influxdata/influxdb_iox/issues/7513
-- IOX_COMPARE: sorted
SELECT COUNT(usage_idle), COUNT(bytes_free) FROM cpu, disk GROUP BY TIME(1s) FILL(none);
SELECT COUNT(usage_idle), COUNT(bytes_free) FROM cpu, disk GROUP BY cpu;

View File

@ -10,7 +10,7 @@ async fn schema_merge_nonexistent_column() {
setup_name: "MultiChunkSchemaMerge",
sql: "SELECT * from cpu where foo = 8",
expected_error_code: tonic::Code::InvalidArgument,
expected_message: r#"Error while planning query: Schema error: No field named "foo". Valid fields are "cpu"."host", "cpu"."region", "cpu"."system", "cpu"."time", "cpu"."user"."#,
expected_message: r#"Error while planning query: Schema error: No field named foo. Valid fields are cpu.host, cpu.region, cpu.system, cpu.time, cpu.user."#,
}
.run()
.await;

View File

@ -24,11 +24,11 @@ prost = "0.11"
rand = "0.8.3"
reqwest = { version = "0.11", default-features = false, features = ["stream", "rustls-tls"] }
schema = { path = "../schema" }
serde_json = "1.0.95"
serde_json = "1.0.96"
tokio = { version = "1.27", features = ["macros", "parking_lot", "rt-multi-thread"] }
tokio-stream = "0.1.12"
thiserror = "1.0.40"
tonic = { version = "0.8" }
tonic = { workspace = true }
[dev-dependencies]
insta = { version = "1" }

View File

@ -36,8 +36,5 @@ pub mod store;
/// Client for testing purposes.
pub mod test;
/// Client for fetching write info
pub mod write_info;
/// Client for write API
pub mod write;

View File

@ -29,9 +29,9 @@ use arrow_flight::{
error::{FlightError, Result},
sql::{
ActionCreatePreparedStatementRequest, ActionCreatePreparedStatementResult, Any,
CommandGetCatalogs, CommandGetDbSchemas, CommandGetPrimaryKeys, CommandGetSqlInfo,
CommandGetTableTypes, CommandGetTables, CommandPreparedStatementQuery,
CommandStatementQuery, ProstMessageExt,
CommandGetCatalogs, CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys,
CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, CommandGetTableTypes,
CommandGetTables, CommandPreparedStatementQuery, CommandStatementQuery, ProstMessageExt,
},
Action, FlightClient, FlightDescriptor, FlightInfo, IpcMessage, Ticket,
};
@ -153,6 +153,56 @@ impl FlightSqlClient {
self.do_get_with_cmd(msg.as_any()).await
}
/// List a description of the foreign key columns in the given foreign key table that
/// reference the primary key or the columns representing a unique constraint of the
/// parent table (could be the same or a different table) on this server using a
/// [`CommandGetCrossReference`] message.
///
/// # Parameters
///
/// Definition from <https://github.com/apache/arrow/blob/f0c8229f5a09fe53186df171d518430243ddf112/format/FlightSql.proto#L1405-L1477>
///
/// pk_catalog: The catalog name where the parent table is.
/// An empty string retrieves those without a catalog.
/// If omitted the catalog name should not be used to narrow the search.
///
/// pk_db_schema: The Schema name where the parent table is.
/// An empty string retrieves those without a schema.
/// If omitted the schema name should not be used to narrow the search.
///
/// pk_table: The parent table name. It cannot be null.
///
/// fk_catalog: The catalog name where the foreign table is.
/// An empty string retrieves those without a catalog.
/// If omitted the catalog name should not be used to narrow the search.
///
/// fk_db_schema: The schema name where the foreign table is.
/// An empty string retrieves those without a schema.
/// If omitted the schema name should not be used to narrow the search.
///
/// fk_table: The foreign table name. It cannot be null.
///
/// This implementation does not support alternate endpoints
pub async fn get_cross_reference(
&mut self,
pk_catalog: Option<impl Into<String> + Send>,
pk_db_schema: Option<impl Into<String> + Send>,
pk_table: String,
fk_catalog: Option<impl Into<String> + Send>,
fk_db_schema: Option<impl Into<String> + Send>,
fk_table: String,
) -> Result<FlightRecordBatchStream> {
let msg = CommandGetCrossReference {
pk_catalog: pk_catalog.map(|s| s.into()),
pk_db_schema: pk_db_schema.map(|s| s.into()),
pk_table,
fk_catalog: fk_catalog.map(|s| s.into()),
fk_db_schema: fk_db_schema.map(|s| s.into()),
fk_table,
};
self.do_get_with_cmd(msg.as_any()).await
}
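// Hypothetical usage sketch (not part of this diff): fetch the cross-reference
// metadata between two tables, leaving every catalog/schema filter unset so the
// search is not narrowed. `client` is assumed to be an already-connected
// `FlightSqlClient`; the table names are placeholders.
async fn cross_reference_example(
    client: &mut FlightSqlClient,
) -> Result<FlightRecordBatchStream> {
    client
        .get_cross_reference(
            None::<String>, // pk_catalog
            None::<String>, // pk_db_schema
            "primary_table".to_string(),
            None::<String>, // fk_catalog
            None::<String>, // fk_db_schema
            "foreign_table".to_string(),
        )
        .await
}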
/// List the schemas on this server
///
/// # Parameters
@ -182,6 +232,71 @@ impl FlightSqlClient {
self.do_get_with_cmd(msg.as_any()).await
}
/// List a description of the foreign key columns that reference the given
/// table's primary key columns (the foreign keys exported by a table) of a
/// table on this server using a [`CommandGetExportedKeys`] message.
///
/// # Parameters
///
/// Definition from <https://github.com/apache/arrow/blob/0434ab65075ecd1d2ab9245bcd7ec6038934ed29/format/FlightSql.proto#L1307-L1352>
///
/// catalog: Specifies the catalog to search for the foreign key table.
/// An empty string retrieves those without a catalog.
/// If omitted the catalog name should not be used to narrow the search.
///
/// db_schema: Specifies the schema to search for the foreign key table.
/// An empty string retrieves those without a schema.
/// If omitted the schema name should not be used to narrow the search.
///
/// table: Specifies the foreign key table to get the foreign keys for.
///
/// This implementation does not support alternate endpoints
pub async fn get_exported_keys(
&mut self,
catalog: Option<impl Into<String> + Send>,
db_schema: Option<impl Into<String> + Send>,
table: String,
) -> Result<FlightRecordBatchStream> {
let msg = CommandGetExportedKeys {
catalog: catalog.map(|s| s.into()),
db_schema: db_schema.map(|s| s.into()),
table,
};
self.do_get_with_cmd(msg.as_any()).await
}
/// List the foreign keys of a table on this server using a
/// [`CommandGetImportedKeys`] message.
///
/// # Parameters
///
/// Definition from <https://github.com/apache/arrow/blob/196222dbd543d6931f4a1432845add97be0db802/format/FlightSql.proto#L1354-L1403>
///
/// catalog: Specifies the catalog to search for the primary key table.
/// An empty string retrieves those without a catalog.
/// If omitted the catalog name should not be used to narrow the search.
///
/// db_schema: Specifies the schema to search for the primary key table.
/// An empty string retrieves those without a schema.
/// If omitted the schema name should not be used to narrow the search.
///
/// table: Specifies the primary key table to get the foreign keys for.
///
/// This implementation does not support alternate endpoints
pub async fn get_imported_keys(
&mut self,
catalog: Option<impl Into<String> + Send>,
db_schema: Option<impl Into<String> + Send>,
table: String,
) -> Result<FlightRecordBatchStream> {
let msg = CommandGetImportedKeys {
catalog: catalog.map(|s| s.into()),
db_schema: db_schema.map(|s| s.into()),
table,
};
self.do_get_with_cmd(msg.as_any()).await
}
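// Hypothetical usage sketch (not part of this diff): `get_exported_keys` and
// `get_imported_keys` take the same arguments and differ only in which side of
// the foreign-key relationship the named table sits on. `client` is assumed to
// be an already-connected `FlightSqlClient`; "the_table" is a placeholder name.
async fn foreign_key_examples(client: &mut FlightSqlClient) -> Result<()> {
    // Keys exported by "the_table", i.e. foreign keys in other tables that
    // reference its primary key.
    let _exported = client
        .get_exported_keys(None::<String>, None::<String>, "the_table".to_string())
        .await?;
    // Foreign keys declared on "the_table" itself.
    let _imported = client
        .get_imported_keys(None::<String>, None::<String>, "the_table".to_string())
        .await?;
    Ok(())
}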
/// List the primary keys on this server using a [`CommandGetPrimaryKeys`] message.
///
/// # Parameters

View File

@ -1,52 +0,0 @@
use client_util::connection::GrpcConnection;
use self::generated_types::{write_info_service_client::WriteInfoServiceClient, *};
use crate::connection::Connection;
use crate::error::Error;
/// Re-export generated_types
pub mod generated_types {
pub use generated_types::influxdata::iox::ingester::v1::{
write_info_service_client, write_info_service_server, GetWriteInfoRequest,
GetWriteInfoResponse, ShardInfo, ShardStatus,
};
pub use generated_types::write_info::merge_responses;
}
/// A basic client for fetching information about write tokens from a
/// single ingester.
///
/// NOTE: This is an ALPHA / Internal API that is used as part of the
/// end to end tests.
///
/// A public API is tracked here:
/// <https://github.com/influxdata/influxdb_iox/issues/4354>
#[derive(Debug, Clone)]
pub struct Client {
inner: WriteInfoServiceClient<GrpcConnection>,
}
impl Client {
/// Creates a new client with the provided connection
pub fn new(connection: Connection) -> Self {
Self {
inner: WriteInfoServiceClient::new(connection.into_grpc_connection()),
}
}
/// Get the write information for a write token
pub async fn get_write_info(
&mut self,
write_token: &str,
) -> Result<GetWriteInfoResponse, Error> {
let response = self
.inner
.get_write_info(GetWriteInfoRequest {
write_token: write_token.to_string(),
})
.await?;
Ok(response.into_inner())
}
}

View File

@ -1,4 +1,5 @@
use arrow::array::StringArray;
use arrow::array::{Array, ArrayData, StringArray};
use arrow::datatypes::DataType;
use arrow::error::ArrowError;
use arrow::record_batch::RecordBatch;
use arrow::util::display::ArrayFormatter;
@ -29,9 +30,38 @@ pub enum Error {
}
type Result<T, E = Error> = std::result::Result<T, E>;
/// Options for controlling how table borders are rendered.
#[derive(Debug, Default, Clone, Copy)]
pub enum TableBorders {
/// Use ASCII characters.
#[default]
Ascii,
/// Use UNICODE box-drawing characters.
Unicode,
/// Do not render borders.
None,
}
/// Options for the [`write_columnar`] function.
#[derive(Debug, Default)]
pub struct Options {
/// Specify how borders should be rendered.
pub borders: TableBorders,
}
impl Options {
fn table_preset(&self) -> &'static str {
match self.borders {
TableBorders::Ascii => "||--+-++| ++++++",
TableBorders::Unicode => comfy_table::presets::UTF8_FULL,
TableBorders::None => comfy_table::presets::NOTHING,
}
}
}
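// Hypothetical usage sketch (not part of this diff): render InfluxQL query results
// without table borders, e.g. when comparing output where border characters only
// add noise. `batches` is assumed to hold the RecordBatches returned for an
// InfluxQL query; `write_columnar`, `Options` and `TableBorders` are the items
// defined in this module.
fn print_borderless(batches: &[RecordBatch]) -> Result<()> {
    let opts = Options {
        borders: TableBorders::None,
    };
    // Write the formatted measurements to stdout rather than an in-memory buffer.
    write_columnar(std::io::stdout(), batches, opts)
}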
/// Write the record batches in a columnar format.
pub fn write_columnar(mut w: impl Write, batches: &[RecordBatch]) -> Result<()> {
let options = arrow::util::display::FormatOptions::default().with_display_error(true);
pub fn write_columnar(mut w: impl Write, batches: &[RecordBatch], options: Options) -> Result<()> {
let arrow_opts = arrow::util::display::FormatOptions::default().with_display_error(true);
let Some(schema) = batches.first().map(|b|b.schema()) else { return Ok(()) };
let md = schema
@ -68,7 +98,7 @@ pub fn write_columnar(mut w: impl Write, batches: &[RecordBatch]) -> Result<()>
let new_table = || {
let mut table = Table::new();
table.load_preset("||--+-++| ++++++");
table.load_preset(options.table_preset());
table.set_header(header.clone());
table
};
@ -78,7 +108,9 @@ pub fn write_columnar(mut w: impl Write, batches: &[RecordBatch]) -> Result<()>
for batch in batches {
let cols = col_indexes
.iter()
.map(|idx| ArrayFormatter::try_new(batch.column(*idx), &options).map_err(Error::Arrow))
.map(|idx| {
ArrayFormatter::try_new(batch.column(*idx), &arrow_opts).map_err(Error::Arrow)
})
.collect::<Result<Vec<_>>>()?;
let measurement = batch
@ -87,6 +119,10 @@ pub fn write_columnar(mut w: impl Write, batches: &[RecordBatch]) -> Result<()>
.downcast_ref::<StringArray>()
.expect("expected measurement column to be a StringArray");
// create an empty string array for any tag columns that are NULL
let empty: StringArray =
StringArray::from(ArrayData::new_null(&DataType::Utf8, measurement.len()));
let tag_vals = tag_key_indexes
.iter()
.map(|idx| {
@ -94,7 +130,7 @@ pub fn write_columnar(mut w: impl Write, batches: &[RecordBatch]) -> Result<()>
.column(*idx)
.as_any()
.downcast_ref::<StringArray>()
.expect("expected tag column to be a StringArray")
.unwrap_or(&empty)
})
.collect::<Vec<_>>();
@ -160,7 +196,7 @@ pub fn write_columnar(mut w: impl Write, batches: &[RecordBatch]) -> Result<()>
#[cfg(test)]
mod test {
use crate::format::influxql::write_columnar;
use crate::format::influxql::{write_columnar, Options};
use arrow::array::{ArrayRef, Float64Array, Int64Array, StringArray, TimestampNanosecondArray};
use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
use arrow::record_batch::RecordBatch;
@ -241,7 +277,7 @@ mod test {
tag_key_columns: vec![],
});
let mut s = Vec::<u8>::new();
write_columnar(&mut s, &rb).unwrap();
write_columnar(&mut s, &rb, Options::default()).unwrap();
let res = String::from_utf8(s).unwrap();
insta::assert_snapshot!(res, @r###"
name: cpu
@ -271,7 +307,7 @@ mod test {
}],
});
let mut s = Vec::<u8>::new();
write_columnar(&mut s, &rb).unwrap();
write_columnar(&mut s, &rb, Options::default()).unwrap();
let res = String::from_utf8(s).unwrap();
insta::assert_snapshot!(res, @r###"
name: cpu
@ -309,7 +345,7 @@ mod test {
}],
});
let mut s = Vec::<u8>::new();
write_columnar(&mut s, &rb).unwrap();
write_columnar(&mut s, &rb, Options::default()).unwrap();
let res = String::from_utf8(s).unwrap();
insta::assert_snapshot!(res, @r###"
name: cpu
@ -354,7 +390,7 @@ mod test {
],
});
let mut s = Vec::<u8>::new();
write_columnar(&mut s, &rb).unwrap();
write_columnar(&mut s, &rb, Options::default()).unwrap();
let res = String::from_utf8(s).unwrap();
insta::assert_snapshot!(res, @r###"
name: cpu

View File

@ -9,7 +9,7 @@ license.workspace = true
client_util = { path = "../client_util" }
generated_types = { path = "../generated_types", default-features=false, features=["data_types"] }
prost = "0.11"
tonic = { version = "0.8" }
tonic = { workspace = true }
futures-util = { version = "0.3" }
observability_deps = { path = "../observability_deps"}
workspace-hack = { version = "0.1", path = "../workspace-hack" }

View File

@ -1,57 +0,0 @@
[package]
name = "ingest_replica"
version.workspace = true
authors.workspace = true
edition.workspace = true
license.workspace = true
[dependencies]
arrow = { workspace = true, features = ["prettyprint"] }
arrow-flight = { workspace = true }
arrow_util = { version = "0.1.0", path = "../arrow_util" }
async-channel = "1.8.0"
async-trait = "0.1.60"
backoff = { version = "0.1.0", path = "../backoff" }
bytes = "1.3.0"
crossbeam-utils = "0.8.14"
data_types = { version = "0.1.0", path = "../data_types" }
datafusion.workspace = true
datafusion_util = { path = "../datafusion_util" }
flatbuffers = "23.1.21"
futures = "0.3.25"
generated_types = { version = "0.1.0", path = "../generated_types" }
hashbrown.workspace = true
iox_catalog = { version = "0.1.0", path = "../iox_catalog" }
iox_query = { version = "0.1.0", path = "../iox_query" }
iox_time = { path = "../iox_time" }
metric = { version = "0.1.0", path = "../metric" }
mutable_batch = { version = "0.1.0", path = "../mutable_batch" }
mutable_batch_pb = { version = "0.1.0", path = "../mutable_batch_pb" }
object_store = "0.5.2"
observability_deps = { version = "0.1.0", path = "../observability_deps" }
once_cell = "1.17"
parking_lot = "0.12.1"
parquet_file = { version = "0.1.0", path = "../parquet_file" }
pin-project = "1.0.12"
predicate = { version = "0.1.0", path = "../predicate" }
prost = { version = "0.11.2", default-features = false, features = ["std"] }
rand = "0.8.5"
schema = { version = "0.1.0", path = "../schema" }
service_grpc_catalog = { version = "0.1.0", path = "../service_grpc_catalog" }
thiserror = "1.0.38"
test_helpers = { path = "../test_helpers", features = ["future_timeout"], optional = true }
tokio = { version = "1.22", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] }
tonic = "0.8.3"
trace = { version = "0.1.0", path = "../trace" }
uuid = "1.2.2"
workspace-hack = { version = "0.1", path = "../workspace-hack" }
[dev-dependencies]
assert_matches = "1.5.0"
criterion = { version = "0.4", default-features = false, features = ["async_tokio"]}
datafusion_util = { path = "../datafusion_util" }
lazy_static = "1.4.0"
mutable_batch_lp = { path = "../mutable_batch_lp" }
paste = "1.0.11"
tempfile = "3.3.0"
test_helpers = { path = "../test_helpers", features = ["future_timeout"] }

View File

@ -1,93 +0,0 @@
//! In-memory queryable buffer of data sent from one or more ingesters. It evicts data from the
//! buffer when persist requests are sent in.
use crate::{
cache::SchemaCache,
query::{response::QueryResponse, QueryError, QueryExec},
BufferError, ReplicationBuffer, TableIdToMutableBatch,
};
use async_trait::async_trait;
use data_types::{
sequence_number_set::SequenceNumberSet, NamespaceId, PartitionId, SequenceNumber, TableId,
};
use iox_query::exec::Executor;
use std::sync::Arc;
use trace::span::Span;
use uuid::Uuid;
#[derive(Debug)]
pub(crate) struct Buffer {
_schema_cache: Arc<SchemaCache>,
_exec: Arc<Executor>,
}
impl Buffer {
pub(crate) fn new(_schema_cache: Arc<SchemaCache>, _exec: Arc<Executor>) -> Self {
Self {
_schema_cache,
_exec,
}
}
pub(crate) async fn apply_write(
&self,
_namespace_id: NamespaceId,
_table_batches: TableIdToMutableBatch,
_ingester_id: Uuid,
_sequence_number: SequenceNumber,
) -> Result<(), BufferError> {
panic!("unimplemented")
}
}
#[async_trait]
impl ReplicationBuffer for Buffer {
async fn apply_write(
&self,
namespace_id: NamespaceId,
table_batches: TableIdToMutableBatch,
ingester_id: Uuid,
sequence_number: SequenceNumber,
) -> Result<(), BufferError> {
self.apply_write(namespace_id, table_batches, ingester_id, sequence_number)
.await
}
async fn apply_persist(
&self,
_ingester_id: Uuid,
_namespace_id: NamespaceId,
_table_id: TableId,
_partition_id: PartitionId,
_sequence_set: SequenceNumberSet,
) -> Result<(), BufferError> {
panic!("unimplemented")
}
async fn append_partition_buffer(
&self,
_ingester_id: Uuid,
_namespace_id: NamespaceId,
_table_id: TableId,
_partition_id: PartitionId,
_sequence_set: SequenceNumberSet,
_table_batches: TableIdToMutableBatch,
) -> Result<(), BufferError> {
panic!("unimplemented")
}
}
#[async_trait]
impl QueryExec for Buffer {
type Response = QueryResponse;
async fn query_exec(
&self,
_namespace_id: NamespaceId,
_table_id: TableId,
_columns: Vec<String>,
_span: Option<Span>,
) -> Result<Self::Response, QueryError> {
panic!("unimplemented");
}
}

View File

@ -1,250 +0,0 @@
//! A cache of table schemas and partition sort keys for use with the buffer to answer Flight
//! requests.
use data_types::{NamespaceId, PartitionId, PartitionKey, ShardId, TableId, TableSchema};
use iox_catalog::interface::{
get_table_schema_by_id, list_schemas, Catalog, Error as CatalogError,
};
use parking_lot::RwLock;
use std::{collections::BTreeMap, ops::DerefMut, sync::Arc};
use thiserror::Error;
/// Errors that occur during the use of the cache.
#[derive(Debug, Error)]
pub enum CacheError {
#[error("namespace {id:?} not found")]
NamespaceNotFound { id: NamespaceId },
#[error("table {id:?} not found")]
TableNotFound { id: TableId },
#[error("partition for table {table_id:?} and partition key {partition_key:?} not found")]
PartitionNotFound {
table_id: TableId,
partition_key: PartitionKey,
},
#[error("catalog error: {0}")]
Catalog(#[from] CatalogError),
}
#[derive(Debug)]
pub(crate) struct SchemaCache {
state: RwLock<State>,
catalog: Arc<dyn Catalog>,
transition_shard_id: ShardId,
}
#[derive(Debug, Default)]
struct State {
partition_ids: BTreeMap<(TableId, PartitionKey), PartitionId>,
table_schemas: BTreeMap<TableId, Arc<TableSchema>>,
}
const RECENT_PARTITION_COUNT_TO_WARM: usize = 40000;
impl SchemaCache {
pub async fn warm(&self) -> Result<(), CacheError> {
let namespaces = list_schemas(&*self.catalog).await?.collect::<Vec<_>>();
let partitions = self
.catalog
.repositories()
.await
.partitions()
.most_recent_n(RECENT_PARTITION_COUNT_TO_WARM)
.await?;
let mut state = self.state.write();
for (_namespace, schema) in namespaces {
for (_table_name, table_schema) in schema.tables {
state
.table_schemas
.insert(table_schema.id, Arc::new(table_schema));
}
}
for partition in partitions {
state
.partition_ids
.insert((partition.table_id, partition.partition_key), partition.id);
}
Ok(())
}
pub fn new(catalog: Arc<dyn Catalog>, transition_shard_id: ShardId) -> Self {
Self {
catalog,
state: Default::default(),
transition_shard_id,
}
}
pub async fn get_table_schema(
&self,
table_id: TableId,
) -> Result<Arc<TableSchema>, CacheError> {
match self.get_table_schema_from_cache(&table_id) {
Some(t) => Ok(t),
None => {
let table_schema = {
let mut repos = self.catalog.repositories().await;
get_table_schema_by_id(table_id, repos.deref_mut()).await?
};
let table_schema = Arc::new(table_schema);
let mut s = self.state.write();
s.table_schemas.insert(table_id, Arc::clone(&table_schema));
Ok(table_schema)
}
}
}
fn get_table_schema_from_cache(&self, table_id: &TableId) -> Option<Arc<TableSchema>> {
let s = self.state.read();
s.table_schemas.get(table_id).cloned()
}
pub async fn get_table_schema_from_catalog(
&self,
table_id: TableId,
) -> Result<Arc<TableSchema>, CacheError> {
let table_schema = {
let mut repos = self.catalog.repositories().await;
get_table_schema_by_id(table_id, repos.deref_mut()).await?
};
let table_schema = Arc::new(table_schema);
let mut s = self.state.write();
s.table_schemas.insert(table_id, Arc::clone(&table_schema));
Ok(table_schema)
}
pub async fn get_partition_id(
&self,
table_id: TableId,
partition_key: PartitionKey,
) -> Result<PartitionId, CacheError> {
let id = match self.get_partition_id_from_cache(table_id, partition_key.clone()) {
Some(k) => k,
None => {
let partition = self
.catalog
.repositories()
.await
.partitions()
.create_or_get(partition_key.clone(), self.transition_shard_id, table_id)
.await?;
let mut s = self.state.write();
s.partition_ids
.insert((table_id, partition_key), partition.id);
partition.id
}
};
Ok(id)
}
fn get_partition_id_from_cache(
&self,
table_id: TableId,
partition_key: PartitionKey,
) -> Option<PartitionId> {
let s = self.state.read();
s.partition_ids.get(&(table_id, partition_key)).cloned()
}
}
#[cfg(test)]
mod tests {
use super::*;
use data_types::{ColumnType, Namespace, Partition, Table};
use iox_catalog::create_or_get_default_records;
use iox_catalog::mem::MemCatalog;
use metric::Registry;
const NAMESPACE_NAME: &str = "foo";
const TABLE_NAME: &str = "bar";
const COLUMN_NAME: &str = "time";
const PARTITION_KEY: &str = "2023-01-08";
#[tokio::test]
async fn warms_cache() {
let (catalog, shard_id, _namespace, table, partition) = get_test_data().await;
let cache = SchemaCache::new(catalog, shard_id);
assert!(cache.get_table_schema_from_cache(&table.id).is_none());
assert!(cache
.get_partition_id_from_cache(table.id, partition.partition_key.clone())
.is_none());
cache.warm().await.unwrap();
assert_eq!(
cache.get_table_schema_from_cache(&table.id).unwrap().id,
table.id
);
assert_eq!(
cache
.get_partition_id_from_cache(table.id, partition.partition_key)
.unwrap(),
partition.id
);
}
#[tokio::test]
async fn gets_table_schema_and_partition_id_from_catalog_if_not_in_cache() {
let (catalog, shard_id, _namespace, table, partition) = get_test_data().await;
let cache = SchemaCache::new(catalog, shard_id);
assert!(cache.get_table_schema_from_cache(&table.id).is_none());
assert!(cache
.get_partition_id_from_cache(table.id, partition.partition_key.clone())
.is_none());
assert_eq!(cache.get_table_schema(table.id).await.unwrap().id, table.id);
assert_eq!(
cache
.get_partition_id(table.id, partition.partition_key)
.await
.unwrap(),
partition.id
);
}
async fn get_test_data() -> (Arc<dyn Catalog>, ShardId, Namespace, Table, Partition) {
let catalog = MemCatalog::new(Arc::new(Registry::new()));
let mut txn = catalog.start_transaction().await.unwrap();
let (topic, query_pool, shards) = create_or_get_default_records(1, txn.deref_mut())
.await
.unwrap();
let shard_id = *shards.keys().next().unwrap();
let namespace = txn
.namespaces()
.create(NAMESPACE_NAME, None, topic.id, query_pool.id)
.await
.unwrap();
let table = txn
.tables()
.create_or_get(TABLE_NAME, namespace.id)
.await
.unwrap();
let _ = txn
.columns()
.create_or_get(COLUMN_NAME, table.id, ColumnType::Time)
.await
.unwrap();
let partition = txn
.partitions()
.create_or_get(PARTITION_KEY.into(), shard_id, table.id)
.await
.unwrap();
txn.commit().await.unwrap();
(Arc::new(catalog), shard_id, namespace, table, partition)
}
}
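A hedged usage sketch of the cache API above; the `lookup` function and its arguments are placeholders, and errors are simply propagated with `?`:
async fn lookup(
    catalog: Arc<dyn Catalog>,
    shard_id: ShardId,
    table_id: TableId,
    partition_key: PartitionKey,
) -> Result<(Arc<TableSchema>, PartitionId), CacheError> {
    // Build the cache, pre-warm it from the catalog, then serve lookups that
    // fall back to the catalog on a miss.
    let cache = SchemaCache::new(catalog, shard_id);
    cache.warm().await?;
    let schema = cache.get_table_schema(table_id).await?;
    let partition_id = cache.get_partition_id(table_id, partition_key).await?;
    Ok((schema, partition_id))
}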

View File

@ -1,67 +0,0 @@
mod query;
mod replication;
use std::{fmt::Debug, sync::Arc};
use arrow_flight::flight_service_server::FlightServiceServer;
use generated_types::influxdata::iox::ingester::v1::replication_service_server::ReplicationServiceServer;
use crate::ReplicationBuffer;
use crate::{
query::{response::QueryResponse, QueryExec},
IngestReplicaRpcInterface,
};
use self::replication::ReplicationServer;
/// This type is responsible for injecting internal dependencies that SHOULD NOT
/// leak outside of the ingester crate into public gRPC handlers.
///
/// Configuration and external dependencies SHOULD be injected through the
/// respective gRPC handler constructor method.
#[derive(Debug)]
pub(crate) struct GrpcDelegate<B> {
buffer: Arc<B>,
metrics: Arc<metric::Registry>,
}
impl<B> GrpcDelegate<B>
where
B: ReplicationBuffer + QueryExec<Response = QueryResponse> + 'static,
{
/// Initialise a new [`GrpcDelegate`].
pub(crate) fn new(buffer: Arc<B>, metrics: Arc<metric::Registry>) -> Self {
Self { buffer, metrics }
}
}
/// Implement the type-erasure trait to hide internal types from crate-external
/// callers.
impl<B> IngestReplicaRpcInterface for GrpcDelegate<B>
where
B: ReplicationBuffer + QueryExec<Response = QueryResponse> + 'static,
{
type ReplicationHandler = ReplicationServer<B>;
type FlightHandler = query::FlightService<Arc<B>>;
/// Return a [`ReplicationService`] gRPC implementation.
///
/// [`ReplicationService`]: generated_types::influxdata::iox::ingester::v1::replication_service_server::ReplicationService
fn replication_service(&self) -> ReplicationServiceServer<Self::ReplicationHandler> {
ReplicationServiceServer::new(ReplicationServer::new(Arc::clone(&self.buffer)))
}
/// Return an Arrow [`FlightService`] gRPC implementation.
///
/// [`FlightService`]: arrow_flight::flight_service_server::FlightService
fn query_service(
&self,
max_simultaneous_requests: usize,
) -> FlightServiceServer<Self::FlightHandler> {
FlightServiceServer::new(query::FlightService::new(
Arc::clone(&self.buffer),
max_simultaneous_requests,
&self.metrics,
))
}
}

View File

@ -1,363 +0,0 @@
use std::pin::Pin;
use arrow_flight::{
encode::FlightDataEncoderBuilder, error::FlightError,
flight_service_server::FlightService as Flight, Action, ActionType, Criteria, Empty,
FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, HandshakeResponse, IpcMessage,
PutResult, SchemaResult, Ticket,
};
use data_types::{NamespaceId, PartitionId, TableId};
use flatbuffers::FlatBufferBuilder;
use futures::{stream::BoxStream, Stream, StreamExt, TryStreamExt};
use generated_types::influxdata::iox::ingester::v1::{self as proto, PartitionStatus};
use metric::U64Counter;
use observability_deps::tracing::*;
use prost::Message;
use thiserror::Error;
use tokio::sync::{Semaphore, TryAcquireError};
use tonic::{Request, Response, Streaming};
use trace::{ctx::SpanContext, span::SpanExt};
use uuid::Uuid;
use crate::query::{response::QueryResponse, QueryError, QueryExec};
/// Error states for the query RPC handler.
///
/// Note that this DOES NOT include any query-time error states - those are
/// mapped directly from the [`QueryError`] itself.
///
/// Note that this isn't strictly necessary as the [`FlightService`] trait
/// expects a [`tonic::Status`] error value, but by defining the errors here
/// they serve as documentation of the potential error states (which are then
/// converted into [`tonic::Status`] for the handler).
#[derive(Debug, Error)]
enum Error {
/// The payload within the Flight ticket cannot be deserialised into a
/// [`proto::IngesterQueryRequest`].
#[error("invalid flight ticket: {0}")]
InvalidTicket(#[from] prost::DecodeError),
/// The number of simultaneous queries being executed has been reached.
#[error("simultaneous query limit exceeded")]
RequestLimit,
}
/// Map a query-execution error into a [`tonic::Status`].
impl From<QueryError> for tonic::Status {
fn from(e: QueryError) -> Self {
use tonic::Code;
let code = match e {
QueryError::TableNotFound(_, _) | QueryError::NamespaceNotFound(_) => Code::NotFound,
};
Self::new(code, e.to_string())
}
}
/// Map a gRPC handler error to a [`tonic::Status`].
impl From<Error> for tonic::Status {
fn from(e: Error) -> Self {
use tonic::Code;
let code = match e {
Error::InvalidTicket(_) => {
debug!(error=%e, "invalid flight query ticket");
Code::InvalidArgument
}
Error::RequestLimit => {
warn!("simultaneous query limit exceeded");
Code::ResourceExhausted
}
};
Self::new(code, e.to_string())
}
}
/// Concrete implementation of the gRPC Arrow Flight Service API
#[derive(Debug)]
pub(crate) struct FlightService<Q> {
query_handler: Q,
/// A request limiter to restrict the number of simultaneous requests this
/// ingester services.
///
/// This allows the ingester to drop a portion of requests when experiencing
/// an unusual flood of requests
request_sem: Semaphore,
/// Number of queries rejected due to lack of available `request_sem`
/// permit.
query_request_limit_rejected: U64Counter,
ingester_uuid: Uuid,
}
impl<Q> FlightService<Q> {
pub(super) fn new(
query_handler: Q,
max_simultaneous_requests: usize,
metrics: &metric::Registry,
) -> Self {
let query_request_limit_rejected = metrics
.register_metric::<U64Counter>(
"query_request_limit_rejected",
"number of query requests rejected due to exceeding parallel request limit",
)
.recorder(&[]);
Self {
query_handler,
request_sem: Semaphore::new(max_simultaneous_requests),
query_request_limit_rejected,
ingester_uuid: Uuid::new_v4(),
}
}
}
type TonicStream<T> = Pin<Box<dyn Stream<Item = Result<T, tonic::Status>> + Send + 'static>>;
#[tonic::async_trait]
impl<Q> Flight for FlightService<Q>
where
Q: QueryExec<Response = QueryResponse> + 'static,
{
type HandshakeStream = TonicStream<HandshakeResponse>;
type ListFlightsStream = TonicStream<FlightInfo>;
type DoGetStream = TonicStream<FlightData>;
type DoPutStream = TonicStream<PutResult>;
type DoActionStream = TonicStream<arrow_flight::Result>;
type ListActionsStream = TonicStream<ActionType>;
type DoExchangeStream = TonicStream<FlightData>;
async fn get_schema(
&self,
_request: Request<FlightDescriptor>,
) -> Result<Response<SchemaResult>, tonic::Status> {
Err(tonic::Status::unimplemented("Not yet implemented"))
}
async fn do_get(
&self,
request: Request<Ticket>,
) -> Result<Response<Self::DoGetStream>, tonic::Status> {
let span_ctx: Option<SpanContext> = request.extensions().get().cloned();
// Acquire and hold a permit for the duration of this request, or return
// an error if the existing requests have already exhausted the
// allocation.
//
// Our goal is to limit the number of concurrently executing queries as
// a rough way of ensuring we don't explode memory by trying to do too
// much at the same time.
let _permit = match self.request_sem.try_acquire() {
Ok(p) => p,
Err(TryAcquireError::NoPermits) => {
warn!("simultaneous request limit exceeded - dropping query request");
self.query_request_limit_rejected.inc(1);
return Err(Error::RequestLimit)?;
}
Err(e) => panic!("request limiter error: {e}"),
};
let ticket = request.into_inner();
let request = proto::IngesterQueryRequest::decode(&*ticket.ticket).map_err(Error::from)?;
// Extract the namespace/table identifiers
let namespace_id = NamespaceId::new(request.namespace_id);
let table_id = TableId::new(request.table_id);
// Predicate pushdown is part of the API, but not implemented.
if let Some(p) = request.predicate {
warn!(predicate=?p, "ignoring query predicate (unsupported)");
}
let response = self
.query_handler
.query_exec(
namespace_id,
table_id,
request.columns,
span_ctx.child_span("ingester query"),
)
.await?;
let output = encode_response(response, self.ingester_uuid).map_err(tonic::Status::from);
Ok(Response::new(Box::pin(output) as Self::DoGetStream))
}
async fn handshake(
&self,
request: Request<Streaming<HandshakeRequest>>,
) -> Result<Response<Self::HandshakeStream>, tonic::Status> {
let request = request.into_inner().message().await?.unwrap();
let response = HandshakeResponse {
protocol_version: request.protocol_version,
payload: request.payload,
};
let output = futures::stream::iter(std::iter::once(Ok(response)));
Ok(Response::new(Box::pin(output) as Self::HandshakeStream))
}
async fn list_flights(
&self,
_request: Request<Criteria>,
) -> Result<Response<Self::ListFlightsStream>, tonic::Status> {
Err(tonic::Status::unimplemented("Not yet implemented"))
}
async fn get_flight_info(
&self,
_request: Request<FlightDescriptor>,
) -> Result<Response<FlightInfo>, tonic::Status> {
Err(tonic::Status::unimplemented("Not yet implemented"))
}
async fn do_put(
&self,
_request: Request<Streaming<FlightData>>,
) -> Result<Response<Self::DoPutStream>, tonic::Status> {
Err(tonic::Status::unimplemented("Not yet implemented"))
}
async fn do_action(
&self,
_request: Request<Action>,
) -> Result<Response<Self::DoActionStream>, tonic::Status> {
Err(tonic::Status::unimplemented("Not yet implemented"))
}
async fn list_actions(
&self,
_request: Request<Empty>,
) -> Result<Response<Self::ListActionsStream>, tonic::Status> {
Err(tonic::Status::unimplemented("Not yet implemented"))
}
async fn do_exchange(
&self,
_request: Request<Streaming<FlightData>>,
) -> Result<Response<Self::DoExchangeStream>, tonic::Status> {
Err(tonic::Status::unimplemented("Not yet implemented"))
}
}
/// Encode the partition information as a None flight data with metadata
fn encode_partition(
// Partition ID.
partition_id: PartitionId,
// Partition persistence status.
status: PartitionStatus,
// Count of persisted Parquet files
completed_persistence_count: u64,
ingester_uuid: Uuid,
) -> std::result::Result<FlightData, FlightError> {
let mut bytes = bytes::BytesMut::new();
let app_metadata = proto::IngesterQueryResponseMetadata {
partition_id: partition_id.get(),
status: Some(proto::PartitionStatus {
parquet_max_sequence_number: status.parquet_max_sequence_number,
}),
ingester_uuid: ingester_uuid.to_string(),
completed_persistence_count,
};
prost::Message::encode(&app_metadata, &mut bytes)
.map_err(|e| FlightError::from_external_error(Box::new(e)))?;
Ok(FlightData::new(
None,
IpcMessage(build_none_flight_msg().into()),
bytes.to_vec(),
vec![],
))
}
fn build_none_flight_msg() -> Vec<u8> {
let mut fbb = FlatBufferBuilder::new();
let mut message = arrow::ipc::MessageBuilder::new(&mut fbb);
message.add_version(arrow::ipc::MetadataVersion::V5);
message.add_header_type(arrow::ipc::MessageHeader::NONE);
message.add_bodyLength(0);
let data = message.finish();
fbb.finish(data, None);
fbb.finished_data().to_vec()
}
/// Converts a QueryResponse into a stream of Arrow Flight [`FlightData`] response frames.
fn encode_response(
response: QueryResponse,
ingester_uuid: Uuid,
) -> BoxStream<'static, std::result::Result<FlightData, FlightError>> {
response
.into_partition_stream()
.flat_map(move |partition| {
let partition_id = partition.id();
let completed_persistence_count = partition.completed_persistence_count();
let head = futures::stream::once(async move {
encode_partition(
partition_id,
PartitionStatus {
parquet_max_sequence_number: None,
},
completed_persistence_count,
ingester_uuid,
)
});
match partition.into_record_batch_stream() {
Some(stream) => {
let stream = stream.map_err(|e| FlightError::ExternalError(Box::new(e)));
let tail = FlightDataEncoderBuilder::new().build(stream);
head.chain(tail).boxed()
}
None => head.boxed(),
}
})
.boxed()
}
#[cfg(test)]
mod tests {
use bytes::Bytes;
use tonic::Code;
use crate::query::mock_query_exec::MockQueryExec;
use super::*;
#[tokio::test]
async fn limits_concurrent_queries() {
let mut flight =
FlightService::new(MockQueryExec::default(), 100, &metric::Registry::default());
let req = tonic::Request::new(Ticket {
ticket: Bytes::new(),
});
match flight.do_get(req).await {
Ok(_) => panic!("expected error because of invalid ticket"),
Err(s) => {
assert_eq!(s.code(), Code::NotFound); // Mock response value
}
}
flight.request_sem = Semaphore::new(0);
let req = tonic::Request::new(Ticket {
ticket: Bytes::new(),
});
match flight.do_get(req).await {
Ok(_) => panic!("expected error because of request limit"),
Err(s) => {
assert_eq!(s.code(), Code::ResourceExhausted);
}
}
}
}
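As an illustrative sketch of the admission pattern described in the do_get() comments above (the function name and the error message are placeholders, not part of this code):
use tokio::sync::{Semaphore, SemaphorePermit, TryAcquireError};

// Try to admit one more in-flight query, failing fast with RESOURCE_EXHAUSTED
// once all permits are taken. The permit must be held for the request's lifetime.
fn try_admit(sem: &Semaphore) -> Result<SemaphorePermit<'_>, tonic::Status> {
    match sem.try_acquire() {
        Ok(permit) => Ok(permit),
        Err(TryAcquireError::NoPermits) => Err(tonic::Status::resource_exhausted(
            "simultaneous query limit exceeded",
        )),
        Err(e) => panic!("request limiter error: {e}"),
    }
}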

View File

@ -1,223 +0,0 @@
use std::sync::Arc;
use data_types::sequence_number_set::SequenceNumberSet;
use data_types::{NamespaceId, PartitionId, PartitionKey, SequenceNumber, TableId};
use generated_types::influxdata::iox::ingester::v1::{
self as proto, replication_service_server::ReplicationService,
};
use mutable_batch::writer;
use mutable_batch_pb::decode::decode_database_batch;
use observability_deps::tracing::*;
use thiserror::Error;
use tonic::{Code, Request, Response};
use uuid::Uuid;
use crate::{BufferError, ReplicationBuffer};
/// A list of error states when handling a ReplicationService request.
#[derive(Debug, Error)]
enum ReplicationError {
/// The replication request did not contain a write payload.
#[error("replication request does not contain a payload")]
NoPayload,
/// The replication payload contains no tables.
#[error("replication request does not contain any table data")]
NoTables,
/// The replication request didn't contain an ingester id
#[error("replication request does not contain an ingester id")]
NoIngesterId,
/// The replication request had an invalid sequence number set
#[error("replication request to persist contained invalid sequence number set {0}")]
InvalidSequenceNumberSet(String),
/// Ingester ID not a valid UUID
#[error("replication request does not contain valid ingester uuid")]
InvalidIngesterId(#[from] uuid::Error),
/// The serialised write payload could not be read.
#[error(transparent)]
Decode(mutable_batch_pb::decode::Error),
/// An error buffering the write or persist
#[error("error buffering replciation request: {0}")]
Buffer(#[from] BufferError),
}
impl From<ReplicationError> for tonic::Status {
fn from(e: ReplicationError) -> Self {
let code = match e {
ReplicationError::Decode(_)
| ReplicationError::NoPayload
| ReplicationError::NoTables
| ReplicationError::NoIngesterId
| ReplicationError::InvalidIngesterId(_)
| ReplicationError::InvalidSequenceNumberSet(_) => Code::InvalidArgument,
ReplicationError::Buffer(_) => Code::Internal,
};
Self::new(code, e.to_string())
}
}
/// Convert a [`BufferError`] returned by the configured [`ReplicationBuffer`] to a
/// [`tonic::Status`].
impl From<BufferError> for tonic::Status {
fn from(e: BufferError) -> Self {
match e {
BufferError::MutableBatch(e) => map_write_error(e),
}
}
}
/// Map a [`mutable_batch::Error`] to a [`tonic::Status`].
///
/// This method takes care to enumerate all possible error states, so that new
/// error additions cause a compilation failure, and therefore require the new
/// error to be explicitly mapped to a gRPC status code.
fn map_write_error(e: mutable_batch::Error) -> tonic::Status {
use tonic::Status;
match e {
mutable_batch::Error::ColumnError { .. }
| mutable_batch::Error::ArrowError { .. }
| mutable_batch::Error::InternalSchema { .. }
| mutable_batch::Error::ColumnNotFound { .. }
| mutable_batch::Error::WriterError {
source: writer::Error::KeyNotFound { .. } | writer::Error::InsufficientValues { .. },
} => Status::internal(e.to_string()),
mutable_batch::Error::WriterError {
source: writer::Error::TypeMismatch { .. },
} => {
// While a schema type conflict is ultimately a user error, if it
// reaches the ingester it should have already passed through schema
// validation in the router, and as such it is an internal system
// failure.
Status::internal(e.to_string())
}
}
}
/// A gRPC [`ReplicationService`] handler.
///
/// This handler accepts writes from an upstream, and applies them to the
/// provided [`ReplicationBuffer`].
pub(crate) struct ReplicationServer<B: ReplicationBuffer + 'static> {
buffer: Arc<B>,
}
impl<B: ReplicationBuffer + 'static> ReplicationServer<B> {
/// Instantiate a new [`ReplicationServer`]
pub(crate) fn new(buffer: Arc<B>) -> Self {
Self { buffer }
}
}
#[tonic::async_trait]
impl<B: ReplicationBuffer + 'static> ReplicationService for ReplicationServer<B> {
/// Handle an RPC write request.
async fn replicate(
&self,
request: Request<proto::ReplicateRequest>,
) -> Result<Response<proto::ReplicateResponse>, tonic::Status> {
// Extract the remote address for debugging.
let remote_addr = request
.remote_addr()
.map(|v| v.to_string())
.unwrap_or_else(|| "<unknown>".to_string());
let request = request.into_inner();
let ingester_id =
Uuid::parse_str(&request.ingester_uuid).map_err(ReplicationError::InvalidIngesterId)?;
// Extract the database batch payload
let payload = request.payload.ok_or(ReplicationError::NoPayload)?;
let batches = decode_database_batch(&payload).map_err(ReplicationError::Decode)?;
let num_tables = batches.len();
let sequence_number = SequenceNumber::new(request.sequence_number);
let namespace_id = NamespaceId::new(payload.database_id);
let partition_key = PartitionKey::from(payload.partition_key);
if num_tables == 0 {
return Err(ReplicationError::NoTables)?;
}
trace!(
remote_addr,
%ingester_id,
?sequence_number,
num_tables,
%namespace_id,
%partition_key,
"received replicate write"
);
match self
.buffer
.apply_write(namespace_id, batches, ingester_id, sequence_number)
.await
{
Ok(()) => {}
Err(e) => {
error!(error=%e, "failed to write into buffer");
return Err(ReplicationError::Buffer(e))?;
}
}
Ok(Response::new(proto::ReplicateResponse {}))
}
async fn persist_complete(
&self,
request: Request<proto::PersistCompleteRequest>,
) -> Result<Response<proto::PersistCompleteResponse>, tonic::Status> {
// Extract the remote address for debugging.
let remote_addr = request
.remote_addr()
.map(|v| v.to_string())
.unwrap_or_else(|| "<unknown>".to_string());
let request = request.into_inner();
let ingester_id =
Uuid::parse_str(&request.ingester_uuid).map_err(ReplicationError::InvalidIngesterId)?;
let namespace_id = NamespaceId::new(request.namespace_id);
let table_id = TableId::new(request.table_id);
let partition_id = PartitionId::new(request.partition_id);
let sequence_set =
SequenceNumberSet::try_from(request.croaring_sequence_number_bitmap.as_ref())
.map_err(ReplicationError::InvalidSequenceNumberSet)?;
trace!(
remote_addr,
?ingester_id,
?namespace_id,
?table_id,
?partition_id,
);
match self
.buffer
.apply_persist(
ingester_id,
namespace_id,
table_id,
partition_id,
sequence_set,
)
.await
{
Ok(()) => {}
Err(e) => {
error!(error=%e, "failed to apply persist to buffer");
return Err(ReplicationError::Buffer(e))?;
}
}
Ok(Response::new(proto::PersistCompleteResponse {}))
}
}
#[cfg(test)]
mod tests {}

View File

@ -1,169 +0,0 @@
//! IOx Ingest Replica implementation
//!
//! The Ingest Replica serves as an in-memory queryable buffer of data from one or more ingesters
//! that are persisting data. It provides horizontal scalability of query workloads on the data in
//! ingesters that has yet to be persisted to Parquet files. It also ensures that the write path
//! and the query path have failure isolation so that an outage in one won't create an outage in
//! the other.
#![allow(dead_code)] // Until ingest_replica is used.
#![deny(rustdoc::broken_intra_doc_links, rust_2018_idioms)]
#![warn(
clippy::clone_on_ref_ptr,
clippy::dbg_macro,
clippy::explicit_iter_loop,
clippy::future_not_send,
clippy::todo,
clippy::use_self,
missing_copy_implementations,
missing_debug_implementations,
missing_docs
)]
mod buffer;
mod cache;
mod grpc;
mod query;
mod query_adaptor;
use crate::cache::CacheError;
use crate::{buffer::Buffer, cache::SchemaCache, grpc::GrpcDelegate};
use arrow_flight::flight_service_server::{FlightService, FlightServiceServer};
use async_trait::async_trait;
use data_types::sequence_number_set::SequenceNumberSet;
use data_types::{NamespaceId, PartitionId, SequenceNumber, TableId, TRANSITION_SHARD_INDEX};
use generated_types::influxdata::iox::ingester::v1::replication_service_server::{
ReplicationService, ReplicationServiceServer,
};
use hashbrown::HashMap;
use iox_catalog::interface::Catalog;
use iox_query::exec::Executor;
use mutable_batch::MutableBatch;
use std::sync::Arc;
use thiserror::Error;
use uuid::Uuid;
/// An error returned by the `ReplicationBuffer`.
#[derive(Debug, Error)]
pub enum BufferError {
/// An error from the mutable batch sent to a buffer.
#[error("mutable batch error: {0}")]
MutableBatch(#[from] mutable_batch::Error),
}
/// Acquire opaque handles to the IngestReplica RPC service implementations.
///
/// This trait serves as the public crate API boundary - callers external to the
/// IngestReplica crate utilise this abstraction to acquire type erased handles to
/// the RPC service implementations, hiding internal IngestReplica implementation
/// details & types.
///
/// Callers can mock out this trait or decorate the returned implementation in
/// order to simulate or modify the behaviour of an ingest_replica in their own tests.
pub trait IngestReplicaRpcInterface: Send + Sync + std::fmt::Debug {
/// The type of the [`ReplicationService`] implementation.
type ReplicationHandler: ReplicationService;
/// The type of the [`FlightService`] implementation.
type FlightHandler: FlightService;
/// Acquire an opaque handle to the IngestReplica's [`ReplicationService`] RPC
/// handler implementation.
fn replication_service(&self) -> ReplicationServiceServer<Self::ReplicationHandler>;
/// Acquire an opaque handle to the Ingester's Arrow Flight
/// [`FlightService`] RPC handler implementation, allowing at most
/// `max_simultaneous_requests` queries to be running at any one time.
fn query_service(
&self,
max_simultaneous_requests: usize,
) -> FlightServiceServer<Self::FlightHandler>;
}
/// Alias for the `TableId` to `MutableBatch` hashmap of data received in write and partition
/// buffer requests.
pub(crate) type TableIdToMutableBatch = HashMap<i64, MutableBatch>;
/// ReplicationBuffer can receive data from the replication protocol to get buffers of partition
/// data, individual write requests, and persistence notification to evict data from the buffer.
#[async_trait]
pub(crate) trait ReplicationBuffer: Send + Sync {
/// Apply an individual write request to the buffer. Can write many rows into many partitions.
async fn apply_write(
&self,
namespace_id: NamespaceId,
table_batches: TableIdToMutableBatch,
ingester_id: Uuid,
sequence_number: SequenceNumber,
) -> Result<(), BufferError>;
/// Apply a persist operation to the buffer, which should clear out the data from the given
/// partition.
async fn apply_persist(
&self,
ingester_id: Uuid,
namespace_id: NamespaceId,
table_id: TableId,
partition_id: PartitionId,
sequence_set: SequenceNumberSet,
) -> Result<(), BufferError>;
/// Append an entire partition buffer to the buffer. It should be able to evict this entire
/// buffer in one operation when it later receives a persist operation that has a SequenceSet
/// that is a superset of the one sent here.
async fn append_partition_buffer(
&self,
ingester_id: Uuid,
namespace_id: NamespaceId,
table_id: TableId,
partition_id: PartitionId,
sequence_set: SequenceNumberSet,
table_batches: TableIdToMutableBatch,
) -> Result<(), BufferError>;
}
/// Errors that occur during initialisation of an `ingest_replica` instance.
#[derive(Debug, Error)]
pub enum InitError {
/// An error occurred trying to warm the schema cache
#[error("failed to pre-warm schema cache: {0}")]
WarmCache(#[from] CacheError),
}
/// Initialise a new `ingest_replica` instance, returning the gRPC service handler
/// implementations to be bound by the caller.
#[allow(clippy::too_many_arguments)]
pub async fn new(
catalog: Arc<dyn Catalog>,
_ingesters: Vec<String>,
exec: Arc<Executor>,
metrics: Arc<metric::Registry>,
) -> Result<impl IngestReplicaRpcInterface, InitError> {
// Create the transition shard.
let mut txn = catalog
.start_transaction()
.await
.expect("start transaction");
let topic = txn
.topics()
.create_or_get("iox-shared")
.await
.expect("get topic");
let transition_shard = txn
.shards()
.create_or_get(&topic, TRANSITION_SHARD_INDEX)
.await
.expect("create transition shard");
txn.commit().await.expect("commit transition shard");
let schema_cache = Arc::new(SchemaCache::new(Arc::clone(&catalog), transition_shard.id));
schema_cache.warm().await?;
let buffer = Arc::new(Buffer::new(schema_cache, exec));
// TODO: connect to the remote ingesters and subscribe to their data, receiving the
// PartitionBufferResponses into the buffer. Note that the ReplicationService in this
// GrpcDelegate must be running before the requests are sent as the ingester will
// immediately start sending replicate requests.
Ok(GrpcDelegate::new(Arc::clone(&buffer), metrics))
}
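A rough sketch of how the handler returned by new() above might be bound to a gRPC endpoint, assuming a tonic transport `Server`; the `serve` function, bind address, and the request limit of 100 are placeholders:
async fn serve(
    grpc: impl IngestReplicaRpcInterface,
    bind_addr: std::net::SocketAddr,
) -> Result<(), tonic::transport::Error> {
    // Mount both RPC services on a single tonic server; 100 is a placeholder
    // for the maximum number of simultaneous Flight queries.
    tonic::transport::Server::builder()
        .add_service(grpc.replication_service())
        .add_service(grpc.query_service(100))
        .serve(bind_addr)
        .await
}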

View File

@ -1,156 +0,0 @@
use async_trait::async_trait;
use data_types::{NamespaceId, TableId};
use iox_time::{SystemProvider, TimeProvider};
use metric::{DurationHistogram, Metric};
use trace::span::Span;
use super::QueryExec;
use crate::query::QueryError;
/// An instrumentation decorator over a [`QueryExec`] implementation.
///
/// This wrapper captures the latency distribution of the decorated
/// [`QueryExec::query_exec()`] call, faceted by success/error result.
#[derive(Debug)]
pub(crate) struct QueryExecInstrumentation<T, P = SystemProvider> {
inner: T,
time_provider: P,
/// Query execution duration distribution for successes.
query_duration_success: DurationHistogram,
/// Query execution duration distribution for "not found" errors
query_duration_error_not_found: DurationHistogram,
}
impl<T> QueryExecInstrumentation<T> {
pub(crate) fn new(inner: T, metrics: &metric::Registry) -> Self {
// Record query duration metrics, broken down by query execution result
let query_duration: Metric<DurationHistogram> = metrics.register_metric(
"ingester_flight_query_duration",
"flight request query execution duration",
);
let query_duration_success = query_duration.recorder(&[("result", "success")]);
let query_duration_error_not_found =
query_duration.recorder(&[("result", "error"), ("reason", "not_found")]);
Self {
inner,
time_provider: Default::default(),
query_duration_success,
query_duration_error_not_found,
}
}
}
#[async_trait]
impl<T, P> QueryExec for QueryExecInstrumentation<T, P>
where
T: QueryExec,
P: TimeProvider,
{
type Response = T::Response;
#[inline(always)]
async fn query_exec(
&self,
namespace_id: NamespaceId,
table_id: TableId,
columns: Vec<String>,
span: Option<Span>,
) -> Result<Self::Response, QueryError> {
let t = self.time_provider.now();
let res = self
.inner
.query_exec(namespace_id, table_id, columns, span)
.await;
if let Some(delta) = self.time_provider.now().checked_duration_since(t) {
match &res {
Ok(_) => self.query_duration_success.record(delta),
Err(QueryError::TableNotFound { .. } | QueryError::NamespaceNotFound { .. }) => {
self.query_duration_error_not_found.record(delta)
}
};
}
res
}
}
#[cfg(test)]
mod tests {
use assert_matches::assert_matches;
use metric::Attributes;
use super::*;
use crate::query::{
mock_query_exec::MockQueryExec,
response::{PartitionStream, QueryResponse},
};
macro_rules! test_metric {
(
$name:ident,
inner = $inner:expr,
want_metric_attr = $want_metric_attr:expr,
want_ret = $($want_ret:tt)+
) => {
paste::paste! {
#[tokio::test]
async fn [<test_metric_ $name>]() {
let metrics = metric::Registry::default();
let decorator = QueryExecInstrumentation::new($inner, &metrics);
// Call the decorator and assert the return value
let got = decorator
.query_exec(NamespaceId::new(42), TableId::new(24), vec![], None)
.await;
assert_matches!(got, $($want_ret)+);
// Validate the histogram with the specified attributes saw
// an observation
let histogram = metrics
.get_instrument::<Metric<DurationHistogram>>("ingester_flight_query_duration")
.expect("failed to find metric")
.get_observer(&Attributes::from(&$want_metric_attr))
.expect("failed to find attributes")
.fetch();
assert_eq!(histogram.sample_count(), 1);
}
}
};
}
test_metric!(
ok,
inner = {
let stream: PartitionStream = PartitionStream::new(futures::stream::iter([]));
MockQueryExec::default().with_result(Ok(QueryResponse::new(stream)))
},
want_metric_attr = [("result", "success")],
want_ret = Ok(_)
);
test_metric!(
namespace_not_found,
inner = MockQueryExec::default()
.with_result(Err(QueryError::NamespaceNotFound(NamespaceId::new(42)))),
want_metric_attr = [("result", "error"), ("reason", "not_found")],
want_ret = Err(QueryError::NamespaceNotFound(ns)) => {
assert_eq!(ns, NamespaceId::new(42));
}
);
test_metric!(
table_not_found,
inner = MockQueryExec::default()
.with_result(Err(QueryError::TableNotFound(NamespaceId::new(42), TableId::new(24)))),
want_metric_attr = [("result", "error"), ("reason", "not_found")],
want_ret = Err(QueryError::TableNotFound(ns, t)) => {
assert_eq!(ns, NamespaceId::new(42));
assert_eq!(t, TableId::new(24));
}
);
}

View File

@ -1,36 +0,0 @@
use async_trait::async_trait;
use data_types::{NamespaceId, TableId};
use parking_lot::Mutex;
use trace::span::Span;
use super::{response::QueryResponse, QueryError, QueryExec};
#[derive(Debug, Default)]
pub(crate) struct MockQueryExec {
response: Mutex<Option<Result<QueryResponse, QueryError>>>,
}
impl MockQueryExec {
pub(crate) fn with_result(self, r: Result<QueryResponse, QueryError>) -> Self {
*self.response.lock() = Some(r);
self
}
}
#[async_trait]
impl QueryExec for MockQueryExec {
type Response = QueryResponse;
async fn query_exec(
&self,
_namespace_id: NamespaceId,
_table_id: TableId,
_columns: Vec<String>,
_span: Option<Span>,
) -> Result<Self::Response, QueryError> {
self.response
.lock()
.take()
.unwrap_or(Err(QueryError::NamespaceNotFound(NamespaceId::new(42))))
}
}

View File

@ -1,14 +0,0 @@
//! Query execution abstraction & types.
mod r#trait;
pub(crate) use r#trait::*;
// Response types
pub(crate) mod partition_response;
pub(crate) mod response;
pub(crate) mod instrumentation;
pub(crate) mod tracing;
#[cfg(test)]
pub(crate) mod mock_query_exec;

View File

@ -1,63 +0,0 @@
//! The per-partition data nested in a query [`QueryResponse`].
//!
//! [`QueryResponse`]: super::response::QueryResponse
use data_types::PartitionId;
use datafusion::physical_plan::SendableRecordBatchStream;
/// Response data for a single partition.
pub(crate) struct PartitionResponse {
/// Stream of snapshots.
batches: Option<SendableRecordBatchStream>,
/// Partition ID.
id: PartitionId,
/// Count of persisted Parquet files for this partition by this ingester instance.
completed_persistence_count: u64,
}
impl std::fmt::Debug for PartitionResponse {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("PartitionResponse")
.field(
"batches",
&match self.batches {
Some(_) => "<SNAPSHOT STREAM>",
None => "<NO DATA>,",
},
)
.field("partition_id", &self.id)
.field(
"completed_persistence_count",
&self.completed_persistence_count,
)
.finish()
}
}
impl PartitionResponse {
pub(crate) fn new(
data: Option<SendableRecordBatchStream>,
id: PartitionId,
completed_persistence_count: u64,
) -> Self {
Self {
batches: data,
id,
completed_persistence_count,
}
}
pub(crate) fn id(&self) -> PartitionId {
self.id
}
pub(crate) fn completed_persistence_count(&self) -> u64 {
self.completed_persistence_count
}
pub(crate) fn into_record_batch_stream(self) -> Option<SendableRecordBatchStream> {
self.batches
}
}

View File

@ -1,60 +0,0 @@
//! The response type returned from a query [`QueryExec::query_exec()`] call.
//!
//! [`QueryExec::query_exec()`]: super::QueryExec::query_exec()
use std::{future, pin::Pin};
use arrow::record_batch::RecordBatch;
use datafusion::common::DataFusionError;
use futures::{Stream, StreamExt};
use super::partition_response::PartitionResponse;
/// Stream of partitions in this response.
pub(crate) struct PartitionStream(Pin<Box<dyn Stream<Item = PartitionResponse> + Send>>);
impl std::fmt::Debug for PartitionStream {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_tuple("PartitionStream").finish()
}
}
impl PartitionStream {
pub(crate) fn new<T>(s: T) -> Self
where
T: Stream<Item = PartitionResponse> + Send + 'static,
{
Self(s.boxed())
}
}
/// A response stream wrapper for ingester query requests.
///
/// The data structure is constructed to allow lazy/streaming/pull-based data
/// sourcing.
#[derive(Debug)]
pub(crate) struct QueryResponse {
/// Stream of partitions.
partitions: PartitionStream,
}
impl QueryResponse {
/// Make a response
pub(crate) fn new(partitions: PartitionStream) -> Self {
Self { partitions }
}
/// Return the stream of [`PartitionResponse`].
pub(crate) fn into_partition_stream(self) -> impl Stream<Item = PartitionResponse> {
self.partitions.0
}
/// Reduce the [`QueryResponse`] to a stream of [`RecordBatch`].
pub(crate) fn into_record_batches(
self,
) -> impl Stream<Item = Result<RecordBatch, DataFusionError>> {
self.into_partition_stream()
.filter_map(|partition| future::ready(partition.into_record_batch_stream()))
.flatten()
}
}
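For illustration, a small sketch of draining a QueryResponse into memory via the helper above (the function name is a placeholder):
use futures::TryStreamExt;

// Flatten each partition's stream and collect every RecordBatch,
// propagating the first DataFusion error encountered.
async fn collect_batches(response: QueryResponse) -> Result<Vec<RecordBatch>, DataFusionError> {
    response.into_record_batches().try_collect().await
}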

View File

@ -1,148 +0,0 @@
use std::borrow::Cow;
use async_trait::async_trait;
use data_types::{NamespaceId, TableId};
use trace::span::{Span, SpanRecorder};
use super::QueryExec;
use crate::query::QueryError;
/// A tracing decorator over a [`QueryExec`] implementation.
///
/// This wrapper emits child tracing spans covering the execution of the inner
/// [`QueryExec::query_exec()`] call.
///
/// Constructing this decorator is cheap.
#[derive(Debug)]
pub(crate) struct QueryExecTracing<T> {
inner: T,
name: Cow<'static, str>,
}
impl<T> QueryExecTracing<T> {
pub(crate) fn new(inner: T, name: impl Into<Cow<'static, str>>) -> Self {
Self {
inner,
name: name.into(),
}
}
}
#[async_trait]
impl<T> QueryExec for QueryExecTracing<T>
where
T: QueryExec,
{
type Response = T::Response;
#[inline(always)]
async fn query_exec(
&self,
namespace_id: NamespaceId,
table_id: TableId,
columns: Vec<String>,
span: Option<Span>,
) -> Result<Self::Response, QueryError> {
let span = span.map(|s| s.child(self.name.clone()));
let mut recorder = SpanRecorder::new(span.clone());
match self
.inner
.query_exec(namespace_id, table_id, columns, span)
.await
{
Ok(v) => {
recorder.ok("query_exec complete");
Ok(v)
}
Err(e) => {
recorder.error(e.to_string());
Err(e)
}
}
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use assert_matches::assert_matches;
use trace::{ctx::SpanContext, span::SpanStatus, RingBufferTraceCollector, TraceCollector};
use crate::query::{
mock_query_exec::MockQueryExec,
response::{PartitionStream, QueryResponse},
};
use super::*;
#[track_caller]
fn assert_trace(name: impl Into<String>, status: SpanStatus, traces: &dyn TraceCollector) {
let traces = traces
.as_any()
.downcast_ref::<RingBufferTraceCollector>()
.expect("unexpected collector impl");
let name = name.into();
let span = traces
.spans()
.into_iter()
.find(|s| s.name == name)
.unwrap_or_else(|| panic!("tracing span {name} not found"));
assert_eq!(
span.status, status,
"span status does not match expected value"
);
}
#[tokio::test]
async fn test_ok() {
let stream: PartitionStream = PartitionStream::new(futures::stream::iter([]));
let mock = MockQueryExec::default().with_result(Ok(QueryResponse::new(stream)));
let traces: Arc<dyn TraceCollector> = Arc::new(RingBufferTraceCollector::new(5));
let span = SpanContext::new(Arc::clone(&traces));
// Drive the trace wrapper
let _ = QueryExecTracing::new(mock, "bananas")
.query_exec(
NamespaceId::new(42),
TableId::new(24),
vec![],
Some(span.child("root span")),
)
.await
.expect("wrapper should not modify result");
// Assert the trace showed up.
assert_trace("bananas", SpanStatus::Ok, &*traces);
}
#[tokio::test]
async fn test_err() {
let mock = MockQueryExec::default()
.with_result(Err(QueryError::NamespaceNotFound(NamespaceId::new(42))));
let traces: Arc<dyn TraceCollector> = Arc::new(RingBufferTraceCollector::new(5));
let span = SpanContext::new(Arc::clone(&traces));
// Drive the trace wrapper
let got = QueryExecTracing::new(mock, "bananas")
.query_exec(
NamespaceId::new(42),
TableId::new(24),
vec![],
Some(span.child("root span")),
)
.await
.expect_err("wrapper should not modify result");
assert_matches!(got, QueryError::NamespaceNotFound(ns) => {
assert_eq!(ns, NamespaceId::new(42));
});
// Assert the trace showed up.
assert_trace("bananas", SpanStatus::Err, &*traces);
}
}
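To make the decorator layering concrete, a hedged sketch of wrapping an inner QueryExec in the tracing and instrumentation decorators before issuing a query; all names are placeholders and the wiring is illustrative only:
async fn layered_query<Q>(
    inner: Q,
    metrics: &metric::Registry,
    namespace_id: NamespaceId,
    table_id: TableId,
) -> Result<Q::Response, QueryError>
where
    Q: QueryExec,
{
    // Tracing sits closest to the inner implementation; instrumentation then
    // measures the whole call, including the span bookkeeping.
    let layered =
        QueryExecInstrumentation::new(QueryExecTracing::new(inner, "buffer query"), metrics);
    layered.query_exec(namespace_id, table_id, vec![], None).await
}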

View File

@ -1,49 +0,0 @@
use std::{fmt::Debug, ops::Deref, sync::Arc};
use async_trait::async_trait;
use data_types::{NamespaceId, TableId};
use thiserror::Error;
use trace::span::Span;
#[derive(Debug, Error)]
#[allow(missing_copy_implementations)]
pub(crate) enum QueryError {
#[error("namespace id {0} not found")]
NamespaceNotFound(NamespaceId),
#[error("table id {1} not found in namespace id {0}")]
TableNotFound(NamespaceId, TableId),
}
#[async_trait]
pub(crate) trait QueryExec: Send + Sync + Debug {
type Response: Send + Debug;
async fn query_exec(
&self,
namespace_id: NamespaceId,
table_id: TableId,
columns: Vec<String>,
span: Option<Span>,
) -> Result<Self::Response, QueryError>;
}
#[async_trait]
impl<T> QueryExec for Arc<T>
where
T: QueryExec,
{
type Response = T::Response;
async fn query_exec(
&self,
namespace_id: NamespaceId,
table_id: TableId,
columns: Vec<String>,
span: Option<Span>,
) -> Result<Self::Response, QueryError> {
self.deref()
.query_exec(namespace_id, table_id, columns, span)
.await
}
}

View File

@ -1,208 +0,0 @@
//! An adaptor over a set of [`RecordBatch`] allowing them to be used as an IOx
//! [`QueryChunk`].
use std::{any::Any, sync::Arc};
use arrow::record_batch::RecordBatch;
use arrow_util::util::ensure_schema;
use data_types::{ChunkId, ChunkOrder, DeletePredicate, PartitionId, TableSummary};
use datafusion::error::DataFusionError;
use iox_query::{
exec::{stringset::StringSet, IOxSessionContext},
util::{compute_timenanosecond_min_max, create_basic_summary},
QueryChunk, QueryChunkData, QueryChunkMeta,
};
use once_cell::sync::OnceCell;
use predicate::Predicate;
use schema::{merge::merge_record_batch_schemas, sort::SortKey, Projection, Schema};
/// A queryable wrapper over a set of ordered [`RecordBatch`]
///
/// It is an invariant that a [`QueryAdaptor`] MUST always contain at least one
/// row. This frees the caller from having to reason about empty [`QueryAdaptor`]
/// instances yielding empty [`RecordBatch`].
#[derive(Debug, PartialEq, Clone)]
pub struct QueryAdaptor {
/// The snapshot data from a partition.
///
/// This MUST be non-pub(crate) / closed for modification / immutable to support
/// interning the merged schema in [`Self::schema()`].
data: Vec<Arc<RecordBatch>>,
/// The catalog ID of the partition this data is part of.
partition_id: PartitionId,
/// Chunk ID.
id: ChunkId,
/// An interned schema for all [`RecordBatch`] in data.
schema: OnceCell<Arc<Schema>>,
/// An interned table summary.
summary: OnceCell<Arc<TableSummary>>,
}
impl QueryAdaptor {
/// Construct a [`QueryAdaptor`].
///
/// # Panics
///
/// This constructor panics if `data` contains no [`RecordBatch`], or all
/// [`RecordBatch`] are empty.
pub(crate) fn new(partition_id: PartitionId, data: Vec<Arc<RecordBatch>>) -> Self {
// There must always be at least one record batch and one row.
//
// This upholds an invariant that simplifies dealing with empty
// partitions - if there is a QueryAdaptor, it contains data.
assert!(data.iter().map(|b| b.num_rows()).sum::<usize>() > 0);
Self {
data,
partition_id,
// To aid debugging and stay consistent with the ChunkId created in the Compactor, use a Uuid here.
// The UUID is drawn during chunk generation so that it is stable for the whole query process.
id: ChunkId::new(),
schema: OnceCell::default(),
summary: OnceCell::default(),
}
}
pub(crate) fn project_selection(&self, selection: Projection<'_>) -> Vec<RecordBatch> {
// Project the column selection across all RecordBatch
self.data
.iter()
.map(|data| {
let batch = data.as_ref();
let schema = batch.schema();
// Apply selection to in-memory batch
match selection {
Projection::All => batch.clone(),
Projection::Some(columns) => {
let projection = columns
.iter()
.flat_map(|&column_name| {
// ignore non-existing columns
schema.index_of(column_name).ok()
})
.collect::<Vec<_>>();
batch.project(&projection).expect("bug in projection")
}
}
})
.collect()
}
/// Returns the [`RecordBatch`] instances in this [`QueryAdaptor`].
pub(crate) fn record_batches(&self) -> &[Arc<RecordBatch>] {
self.data.as_ref()
}
/// Returns the partition ID from which the data in this [`QueryAdaptor`] was
/// sourced.
pub(crate) fn partition_id(&self) -> PartitionId {
self.partition_id
}
}
impl QueryChunkMeta for QueryAdaptor {
fn summary(&self) -> Arc<TableSummary> {
Arc::clone(self.summary.get_or_init(|| {
let ts_min_max = compute_timenanosecond_min_max(self.data.iter().map(|b| b.as_ref()))
.expect("Should have time range");
Arc::new(create_basic_summary(
self.data.iter().map(|b| b.num_rows()).sum::<usize>() as u64,
self.schema(),
ts_min_max,
))
}))
}
fn schema(&self) -> &Schema {
self.schema
.get_or_init(|| merge_record_batch_schemas(&self.data).into())
.as_ref()
}
fn partition_sort_key(&self) -> Option<&SortKey> {
None // Ingester data has not persisted yet and should not be attached to any partition
}
fn partition_id(&self) -> PartitionId {
self.partition_id
}
fn sort_key(&self) -> Option<&SortKey> {
None // Ingester data is not sorted
}
fn delete_predicates(&self) -> &[Arc<DeletePredicate>] {
&[]
}
}
impl QueryChunk for QueryAdaptor {
fn id(&self) -> ChunkId {
self.id
}
/// Returns true if the chunk may contain a duplicate "primary key" within
/// itself
fn may_contain_pk_duplicates(&self) -> bool {
// always true because the rows across record batches have not been
// de-duplicated.
true
}
/// Returns a set of Strings with column names from the specified
/// table that have at least one row that matches `predicate`, if
/// the predicate can be evaluated entirely on the metadata of
/// this Chunk. Returns `None` otherwise
fn column_names(
&self,
_ctx: IOxSessionContext,
_predicate: &Predicate,
_columns: Projection<'_>,
) -> Result<Option<StringSet>, DataFusionError> {
Ok(None)
}
/// Return a set of Strings containing the distinct values in the
/// specified columns, if the predicate can be evaluated entirely
/// on the metadata of this Chunk. Returns `None` otherwise.
///
/// The requested columns must all have String type.
fn column_values(
&self,
_ctx: IOxSessionContext,
_column_name: &str,
_predicate: &Predicate,
) -> Result<Option<StringSet>, DataFusionError> {
Ok(None)
}
fn data(&self) -> QueryChunkData {
let schema = self.schema().as_arrow();
QueryChunkData::RecordBatches(
self.data
.iter()
.map(|b| ensure_schema(&schema, b).expect("schema handling broken"))
.collect(),
)
}
/// Returns chunk type
fn chunk_type(&self) -> &str {
"QueryAdaptor"
}
fn order(&self) -> ChunkOrder {
unimplemented!()
}
fn as_any(&self) -> &dyn Any {
self
}
}
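A brief, hedged sketch of using project_selection() above to pull a single column out of the buffered batches (the function and column names are placeholders):
// Project only the "time" column out of every buffered RecordBatch; columns
// missing from a batch's schema are simply skipped by project_selection().
fn time_only(adaptor: &QueryAdaptor) -> Vec<RecordBatch> {
    adaptor.project_selection(Projection::Some(&["time"]))
}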

View File

@ -35,7 +35,7 @@ parking_lot = "0.12.1"
parquet_file = { version = "0.1.0", path = "../parquet_file" }
pin-project = "1.0.12"
predicate = { version = "0.1.0", path = "../predicate" }
prost = { version = "0.11.6", default-features = false, features = ["std"] }
prost = { version = "0.11.9", default-features = false, features = ["std"] }
rand = "0.8.5"
schema = { version = "0.1.0", path = "../schema" }
service_grpc_catalog = { version = "0.1.0", path = "../service_grpc_catalog" }
@ -44,7 +44,7 @@ test_helpers = { path = "../test_helpers", features = ["future_timeout"], option
thiserror = "1.0.40"
tokio = { version = "1.27", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] }
tokio-util = "0.7.7"
tonic = "0.8.3"
tonic = { workspace = true }
trace = { version = "0.1.0", path = "../trace" }
uuid = "1.3.1"
wal = { version = "0.1.0", path = "../wal" }

View File

@ -167,7 +167,7 @@ where
table_id: TableId,
table_name: Arc<DeferredLoad<TableName>>,
transition_shard_id: ShardId,
) -> PartitionData {
) -> Arc<Mutex<PartitionData>> {
// Use the cached PartitionKey instead of the caller's partition_key,
// preferring to reuse the already-shared Arc<str> in the cache.
@ -188,7 +188,7 @@ where
// Use the returned partition key instead of the callers - this
// allows the backing str memory to be reused across all partitions
// using the same key!
return PartitionData::new(
return Arc::new(Mutex::new(PartitionData::new(
partition_id,
key,
namespace_id,
@ -197,7 +197,7 @@ where
table_name,
SortKeyState::Deferred(Arc::new(sort_key_resolver)),
transition_shard_id,
);
)));
}
debug!(%table_id, %partition_key, "partition cache miss");
@ -218,6 +218,9 @@ where
#[cfg(test)]
mod tests {
// Harmless in tests - saves a bunch of extra vars.
#![allow(clippy::await_holding_lock)]
use data_types::ShardId;
use iox_catalog::mem::MemCatalog;
@ -282,10 +285,10 @@ mod tests {
)
.await;
assert_eq!(got.partition_id(), PARTITION_ID);
assert_eq!(got.table_id(), TABLE_ID);
assert_eq!(&**got.table_name().get().await, TABLE_NAME);
assert_eq!(&**got.namespace_name().get().await, NAMESPACE_NAME);
assert_eq!(got.lock().partition_id(), PARTITION_ID);
assert_eq!(got.lock().table_id(), TABLE_ID);
assert_eq!(&**got.lock().table_name().get().await, TABLE_NAME);
assert_eq!(&**got.lock().namespace_name().get().await, NAMESPACE_NAME);
assert!(cache.inner.is_empty());
}
@ -322,11 +325,14 @@ mod tests {
)
.await;
assert_eq!(got.partition_id(), PARTITION_ID);
assert_eq!(got.table_id(), TABLE_ID);
assert_eq!(&**got.table_name().get().await, TABLE_NAME);
assert_eq!(&**got.namespace_name().get().await, NAMESPACE_NAME);
assert_eq!(*got.partition_key(), PartitionKey::from(PARTITION_KEY));
assert_eq!(got.lock().partition_id(), PARTITION_ID);
assert_eq!(got.lock().table_id(), TABLE_ID);
assert_eq!(&**got.lock().table_name().get().await, TABLE_NAME);
assert_eq!(&**got.lock().namespace_name().get().await, NAMESPACE_NAME);
assert_eq!(
*got.lock().partition_key(),
PartitionKey::from(PARTITION_KEY)
);
// The cache should have been cleaned up as it was consumed.
assert!(cache.entries.lock().is_empty());
@ -334,10 +340,10 @@ mod tests {
// Assert the partition key from the cache was used for the lifetime of
// the partition, so that it is shared with the cache + other partitions
// that share the same partition key across all tables.
assert!(got.partition_key().ptr_eq(&stored_partition_key));
assert!(got.lock().partition_key().ptr_eq(&stored_partition_key));
// It does not use the short-lived caller's partition key (derived from
// the DML op it is processing).
assert!(!got.partition_key().ptr_eq(&callers_partition_key));
assert!(!got.lock().partition_key().ptr_eq(&callers_partition_key));
}
#[tokio::test]
@ -385,9 +391,9 @@ mod tests {
)
.await;
assert_eq!(got.partition_id(), other_key_id);
assert_eq!(got.table_id(), TABLE_ID);
assert_eq!(&**got.table_name().get().await, TABLE_NAME);
assert_eq!(got.lock().partition_id(), other_key_id);
assert_eq!(got.lock().table_id(), TABLE_ID);
assert_eq!(&**got.lock().table_name().get().await, TABLE_NAME);
}
#[tokio::test]
@ -434,8 +440,8 @@ mod tests {
)
.await;
assert_eq!(got.partition_id(), PARTITION_ID);
assert_eq!(got.table_id(), other_table);
assert_eq!(&**got.table_name().get().await, TABLE_NAME);
assert_eq!(got.lock().partition_id(), PARTITION_ID);
assert_eq!(got.lock().table_id(), other_table);
assert_eq!(&**got.lock().table_name().get().await, TABLE_NAME);
}
}

View File

@ -8,6 +8,7 @@ use backoff::{Backoff, BackoffConfig};
use data_types::{NamespaceId, Partition, PartitionKey, ShardId, TableId};
use iox_catalog::interface::Catalog;
use observability_deps::tracing::debug;
use parking_lot::Mutex;
use super::r#trait::PartitionProvider;
use crate::{
@ -63,7 +64,7 @@ impl PartitionProvider for CatalogPartitionResolver {
table_id: TableId,
table_name: Arc<DeferredLoad<TableName>>,
transition_shard_id: ShardId,
) -> PartitionData {
) -> Arc<Mutex<PartitionData>> {
debug!(
%partition_key,
%table_id,
@ -78,7 +79,7 @@ impl PartitionProvider for CatalogPartitionResolver {
.await
.expect("retry forever");
PartitionData::new(
Arc::new(Mutex::new(PartitionData::new(
p.id,
// Use the caller's partition key instance, as it MAY be shared with
// other instances, but the instance returned from the catalog
@ -90,12 +91,15 @@ impl PartitionProvider for CatalogPartitionResolver {
table_name,
SortKeyState::Provided(p.sort_key()),
transition_shard_id,
)
)))
}
}
#[cfg(test)]
mod tests {
// Harmless in tests - saves a bunch of extra vars.
#![allow(clippy::await_holding_lock)]
use std::{sync::Arc, time::Duration};
use assert_matches::assert_matches;
@ -157,18 +161,18 @@ mod tests {
.await;
// Ensure the table name is available.
let _ = got.table_name().get().await;
let _ = got.lock().table_name().get().await;
assert_eq!(got.namespace_id(), namespace_id);
assert_eq!(got.table_name().to_string(), table_name.to_string());
assert_matches!(got.sort_key(), SortKeyState::Provided(None));
assert!(got.partition_key.ptr_eq(&callers_partition_key));
assert_eq!(got.lock().namespace_id(), namespace_id);
assert_eq!(got.lock().table_name().to_string(), table_name.to_string());
assert_matches!(got.lock().sort_key(), SortKeyState::Provided(None));
assert!(got.lock().partition_key.ptr_eq(&callers_partition_key));
let got = catalog
.repositories()
.await
.partitions()
.get_by_id(got.partition_id)
.get_by_id(got.lock().partition_id)
.await
.unwrap()
.expect("partition not created");

View File

@ -0,0 +1,423 @@
use std::{
pin::Pin,
sync::{
atomic::{AtomicBool, Ordering},
Arc,
},
};
use arrow::compute::kernels::partition;
use async_trait::async_trait;
use data_types::{NamespaceId, PartitionKey, ShardId, TableId};
use futures::{future::Shared, FutureExt};
use hashbrown::{hash_map::Entry, HashMap};
use parking_lot::Mutex;
use crate::{
buffer_tree::{namespace::NamespaceName, partition::PartitionData, table::TableName},
deferred_load::DeferredLoad,
};
use super::PartitionProvider;
/// A helper alias for a boxed, dynamically dispatched future that resolves to an
/// Arc/Mutex-wrapped [`PartitionData`].
type BoxedResolveFuture =
Pin<Box<dyn std::future::Future<Output = Arc<Mutex<PartitionData>>> + Send>>;
/// A compound key of `(namespace, table, partition_key)` which uniquely
/// identifies a single partition.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct Key {
namespace_id: NamespaceId,
table_id: TableId,
partition_key: PartitionKey,
}
/// The state of the resolver.
///
/// The [`Shared`] requires more space than the simple ref-pointer to the
/// [`PartitionData`], so resolving callers replace the shared handle with the
/// resolved result where possible.
#[derive(Debug)]
enum State {
/// A resolve task is ongoing, and the caller can await the [`Shared`]
/// future to obtain the result.
///
/// If the atomic bool is false, no thread is changing this [`State`] to
/// [`State::Resolved`] for the resolved partition. If true, a thread is in
/// the process of setting (or already has set) the state to
/// [`State::Resolved`].
Resolving(Shared<BoxedResolveFuture>, Arc<AtomicBool>),
/// A prior call resolved this partition.
Resolved(Arc<Mutex<PartitionData>>),
}
/// A coalescing [`PartitionProvider`] reducing N partition fetch requests into
/// a single call to `T` on a per-partition basis.
///
/// This type solves a concurrency problem, where a series of concurrent cache
/// misses "above" this type causes a series of concurrent lookups against the
/// inner resolver "below" this type for a single partition. This is wasteful,
/// as only one result is retained by the callers (a single [`PartitionData`] is
/// used to reference a partition of data).
///
/// This type is typically used to coalesce requests against the
/// [`CatalogPartitionResolver`]:
///
/// ```text
/// ┌─────────────────────────────┐
/// │ Cache │
/// └─────────────────────────────┘
/// │ │ │
/// ▼ ▼ ▼
/// ┌─────────────────────────────┐
/// │ CoalescePartitionResolver │
/// └─────────────────────────────┘
/// │
/// ▼
/// ┌─────────────────────────────┐
/// │ CatalogPartitionResolver │
/// └─────────────────────────────┘
/// ```
///
/// Imagine the following concurrent requests without this type:
///
/// * T1: check cache for partition A, miss
/// * T2: check cache for partition A, miss
/// * T1: inner.get_partition(A)
/// * T2: inner.get_partition(A)
/// * T1: cache put partition A
/// * T2: cache put partition A
///
/// With this type, the concurrent requests for a single partition (A) are
/// coalesced into a single request against the inner resolver:
///
/// * T1: check cache for partition A, miss
/// * T2: check cache for partition A, miss
/// * T1: CoalescePartitionResolver::get_partition(A)
/// * T2: CoalescePartitionResolver::get_partition(A)
/// * inner.get_partition() **(a single call to inner is made)**
/// * T1: cache put partition A
/// * T2: cache put partition A
///
/// # Memory Overhead
///
/// This type makes a best-effort attempt to minimise the memory overhead of
/// memoising partition fetches. Callers drop the intermediate resolving state
/// upon success, leaving only a ref-counted pointer to the shared
/// [`PartitionData`] (a single [`Arc`] ref overhead).
///
/// # Cancellation Safety
///
/// This type is cancellation safe - calls to
/// [`CoalescePartitionResolver::get_partition()`] are safe to abort at any
/// point.
///
/// [`CatalogPartitionResolver`]: super::CatalogPartitionResolver
#[derive(Debug)]
pub struct CoalescePartitionResolver<T> {
/// The inner resolver the actual partition fetch is delegated to.
inner: Arc<T>,
/// A map of handles to ongoing resolve futures.
ongoing: Mutex<HashMap<Key, State>>,
}
impl<T> CoalescePartitionResolver<T> {
pub fn new(inner: Arc<T>) -> Self {
Self {
inner,
ongoing: Mutex::new(HashMap::default()),
}
}
}
#[async_trait]
impl<T> PartitionProvider for CoalescePartitionResolver<T>
where
T: PartitionProvider + 'static,
{
async fn get_partition(
&self,
partition_key: PartitionKey,
namespace_id: NamespaceId,
namespace_name: Arc<DeferredLoad<NamespaceName>>,
table_id: TableId,
table_name: Arc<DeferredLoad<TableName>>,
transition_shard_id: ShardId,
) -> Arc<Mutex<PartitionData>> {
let key = Key {
namespace_id,
table_id,
partition_key: partition_key.clone(), // Ref-counted anyway!
};
// Check if there's an ongoing (or recently completed) resolve.
let (shared, done) = match self.ongoing.lock().entry(key.clone()) {
Entry::Occupied(v) => match v.get() {
State::Resolving(fut, done) => (fut.clone(), Arc::clone(done)),
State::Resolved(v) => return Arc::clone(v),
},
Entry::Vacant(v) => {
// Spawn a future to resolve the partition, and retain a handle
// to it.
let inner = Arc::clone(&self.inner);
let fut: BoxedResolveFuture = Box::pin(async move {
inner
.get_partition(
partition_key,
namespace_id,
namespace_name,
table_id,
table_name,
transition_shard_id,
)
.await
});
// Make the future poll-able by many callers, all of which
// resolve to the same output PartitionData instance.
let fut = fut.shared();
let done = Arc::new(AtomicBool::new(false));
// Allow future callers to obtain this shared handle, instead of
// resolving the partition themselves.
v.insert(State::Resolving(fut.clone(), Arc::clone(&done)));
(fut, done)
}
};
// Wait for the resolve to complete.
//
// If this caller future is dropped before this resolve future
// completes, then it remains unpolled until the next caller obtains a
// shared handle and continues the process.
let res = shared.await;
// As an optimisation, select exactly one thread to acquire the lock and
// change the state instead of every caller trying to set the state to
// "resolved", which involves contending on the lock for all concurrent
// callers for all concurrent partition fetches.
//
// Any caller that has been awaiting the shared future above is a
// candidate to perform this state change, but only one thread will
// attempt to. In the presence of aborted callers waiting on the shared
// future, each caller that completes the await will attempt to change state
// (cancellation safe).
if done
.compare_exchange(false, true, Ordering::AcqRel, Ordering::Relaxed)
.is_ok()
{
// This task should drop the Shared, swapping it for the resolved
// state.
//
// This thread SHOULD NOT fail to perform this action as no other
// thread will attempt it now that the bool has been toggled.
let old = self
.ongoing
.lock()
.insert(key, State::Resolved(Arc::clone(&res)));
// Invariant: the resolve future must exist in the map, and the
// state may only be changed by the thread that won the CAS.
assert!(matches!(old, Some(State::Resolving(..))));
}
res
}
}
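
The coalescing technique above can be reduced to a small standalone sketch: a map of `Shared` futures keyed by the lookup key, where the first caller installs the future and every later caller clones and awaits the same handle. The sketch below is illustrative only — `Coalescer` and `slow_lookup` are made-up names, it uses `std::collections::HashMap` rather than hashbrown, and it omits the CAS-guarded swap to `State::Resolved` that the real resolver performs to reclaim memory. It assumes the `futures`, `parking_lot` and `tokio` (rt, macros, time) crates.

```rust
use std::{collections::HashMap, sync::Arc, time::Duration};

use futures::{
    future::{BoxFuture, Shared},
    FutureExt,
};
use parking_lot::Mutex;

/// One shared, cloneable handle per in-flight lookup.
type SharedLookup = Shared<BoxFuture<'static, Arc<String>>>;

#[derive(Default)]
struct Coalescer {
    ongoing: Mutex<HashMap<String, SharedLookup>>,
}

impl Coalescer {
    /// All concurrent callers for the same key share a single `slow_lookup`
    /// call and receive the same ref-counted result.
    async fn get(&self, key: String) -> Arc<String> {
        let fut = {
            let mut map = self.ongoing.lock();
            map.entry(key.clone())
                .or_insert_with(|| slow_lookup(key).boxed().shared())
                .clone()
        };
        // The lock is released before awaiting the shared future.
        fut.await
    }
}

/// Stand-in for an expensive catalog round trip.
async fn slow_lookup(key: String) -> Arc<String> {
    tokio::time::sleep(Duration::from_millis(10)).await;
    Arc::new(format!("partition for {key}"))
}

#[tokio::main]
async fn main() {
    let c = Coalescer::default();
    let (a, b) = tokio::join!(c.get("bananas".into()), c.get("bananas".into()));
    // Both callers observe the same ref-counted instance.
    assert!(Arc::ptr_eq(&a, &b));
}
```
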
#[cfg(test)]
mod tests {
use std::{
future,
sync::Arc,
task::{Context, Poll},
time::Duration,
};
use assert_matches::assert_matches;
use data_types::{PartitionId, TRANSITION_SHARD_ID};
use futures::{stream::FuturesUnordered, StreamExt};
use test_helpers::timeout::FutureTimeout;
use crate::buffer_tree::partition::{resolver::mock::MockPartitionProvider, SortKeyState};
use super::*;
const PARTITION_KEY: &str = "bananas";
#[tokio::test]
async fn test_coalesce() {
const MAX_TASKS: usize = 50;
let namespace_id = NamespaceId::new(1234);
let namespace_name = Arc::new(DeferredLoad::new(Duration::from_secs(1), async {
NamespaceName::from("ns-platanos")
}));
let table_id = TableId::new(24);
let table_name = Arc::new(DeferredLoad::new(Duration::from_secs(1), async {
TableName::from("platanos")
}));
let partition = PartitionId::new(4242);
let data = PartitionData::new(
partition,
PartitionKey::from(PARTITION_KEY),
namespace_id,
Arc::clone(&namespace_name),
table_id,
Arc::clone(&table_name),
SortKeyState::Provided(None),
TRANSITION_SHARD_ID,
);
// Add a single instance of the partition - if more than one call is
// made, this will cause a panic.
let inner = Arc::new(MockPartitionProvider::default().with_partition(data));
let layer = Arc::new(CoalescePartitionResolver::new(Arc::clone(&inner)));
let results = (0..MAX_TASKS)
.map(|_| {
let namespace_name = Arc::clone(&namespace_name);
let table_name = Arc::clone(&table_name);
layer.get_partition(
PartitionKey::from(PARTITION_KEY),
namespace_id,
namespace_name,
table_id,
table_name,
TRANSITION_SHARD_ID,
)
})
.collect::<FuturesUnordered<_>>()
.collect::<Vec<_>>()
.await;
// All the resulting instances of PartitionData MUST be the same
// ref-counted instance.
results.as_slice().windows(2).for_each(|v| {
assert!(Arc::ptr_eq(&v[0], &v[1]));
});
// The state should have been set to "resolved" to reclaim memory
assert_matches!(
layer.ongoing.lock().values().next(),
Some(State::Resolved(..))
);
}
// A resolver that blocks forever when resolving PARTITION_KEY but instantly
// finishes all others.
#[derive(Debug)]
struct BlockingResolver {
p: Arc<Mutex<PartitionData>>,
}
impl PartitionProvider for BlockingResolver {
fn get_partition<'life0, 'async_trait>(
&'life0 self,
partition_key: PartitionKey,
_namespace_id: NamespaceId,
_namespace_name: Arc<DeferredLoad<NamespaceName>>,
_table_id: TableId,
_table_name: Arc<DeferredLoad<TableName>>,
_transition_shard_id: ShardId,
) -> core::pin::Pin<
Box<
dyn core::future::Future<Output = Arc<Mutex<PartitionData>>>
+ core::marker::Send
+ 'async_trait,
>,
>
where
'life0: 'async_trait,
Self: 'async_trait,
{
if partition_key == PartitionKey::from(PARTITION_KEY) {
return future::pending().boxed();
}
future::ready(Arc::clone(&self.p)).boxed()
}
}
#[tokio::test]
async fn test_disjoint_parallelised() {
use futures::Future;
let namespace_id = NamespaceId::new(1234);
let namespace_name = Arc::new(DeferredLoad::new(Duration::from_secs(1), async {
NamespaceName::from("ns-platanos")
}));
let table_id = TableId::new(24);
let table_name = Arc::new(DeferredLoad::new(Duration::from_secs(1), async {
TableName::from("platanos")
}));
let partition = PartitionId::new(4242);
let data = PartitionData::new(
partition,
PartitionKey::from(PARTITION_KEY),
namespace_id,
Arc::clone(&namespace_name),
table_id,
Arc::clone(&table_name),
SortKeyState::Provided(None),
TRANSITION_SHARD_ID,
);
// Add a single instance of the partition - if more than one call is
// made to the mock, it will panic.
let inner = Arc::new(BlockingResolver {
p: Arc::new(Mutex::new(data)),
});
let layer = Arc::new(CoalescePartitionResolver::new(inner));
// The following two partitions are for the same (blocked) partition and
// neither resolve.
let pa_1 = layer.get_partition(
PartitionKey::from(PARTITION_KEY),
namespace_id,
Arc::clone(&namespace_name),
table_id,
Arc::clone(&table_name),
TRANSITION_SHARD_ID,
);
let pa_2 = layer.get_partition(
PartitionKey::from(PARTITION_KEY),
namespace_id,
Arc::clone(&namespace_name),
table_id,
Arc::clone(&table_name),
TRANSITION_SHARD_ID,
);
let waker = futures::task::noop_waker();
let mut cx = Context::from_waker(&waker);
futures::pin_mut!(pa_1);
futures::pin_mut!(pa_2);
// Neither make progress
assert_matches!(Pin::new(&mut pa_1).poll(&mut cx), Poll::Pending);
assert_matches!(Pin::new(&mut pa_2).poll(&mut cx), Poll::Pending);
// But a non-blocked partition is resolved without issue.
let _ = layer
.get_partition(
PartitionKey::from("platanos"),
namespace_id,
namespace_name,
table_id,
table_name,
TRANSITION_SHARD_ID,
)
.with_timeout_panic(Duration::from_secs(5))
.await;
// While the original requests are still blocked.
assert_matches!(Pin::new(&mut pa_1).poll(&mut cx), Poll::Pending);
assert_matches!(Pin::new(&mut pa_2).poll(&mut cx), Poll::Pending);
}
}

View File

@ -55,7 +55,7 @@ impl PartitionProvider for MockPartitionProvider {
table_id: TableId,
table_name: Arc<DeferredLoad<TableName>>,
_transition_shard_id: ShardId,
) -> PartitionData {
) -> Arc<Mutex<PartitionData>> {
let p = self
.partitions
.lock()
@ -67,6 +67,6 @@ impl PartitionProvider for MockPartitionProvider {
assert_eq!(p.namespace_id(), namespace_id);
assert_eq!(p.namespace_name().to_string(), namespace_name.to_string());
assert_eq!(p.table_name().to_string(), table_name.to_string());
p
Arc::new(Mutex::new(p))
}
}

View File

@ -16,5 +16,8 @@ pub(crate) use catalog::*;
mod sort_key;
pub(crate) use sort_key::*;
mod coalesce;
pub(crate) use coalesce::*;
#[cfg(test)]
pub(crate) mod mock;

View File

@ -2,6 +2,7 @@ use std::{fmt::Debug, sync::Arc};
use async_trait::async_trait;
use data_types::{NamespaceId, PartitionKey, ShardId, TableId};
use parking_lot::Mutex;
use crate::{
buffer_tree::{namespace::NamespaceName, partition::PartitionData, table::TableName},
@ -25,7 +26,7 @@ pub(crate) trait PartitionProvider: Send + Sync + Debug {
table_id: TableId,
table_name: Arc<DeferredLoad<TableName>>,
transition_shard_id: ShardId,
) -> PartitionData;
) -> Arc<Mutex<PartitionData>>;
}
#[async_trait]
@ -41,7 +42,7 @@ where
table_id: TableId,
table_name: Arc<DeferredLoad<TableName>>,
transition_shard_id: ShardId,
) -> PartitionData {
) -> Arc<Mutex<PartitionData>> {
(**self)
.get_partition(
partition_key,
@ -101,9 +102,12 @@ mod tests {
TRANSITION_SHARD_ID,
)
.await;
assert_eq!(got.partition_id(), partition);
assert_eq!(got.namespace_id(), namespace_id);
assert_eq!(got.namespace_name().to_string(), namespace_name.to_string());
assert_eq!(got.table_name().to_string(), table_name.to_string());
assert_eq!(got.lock().partition_id(), partition);
assert_eq!(got.lock().namespace_id(), namespace_id);
assert_eq!(
got.lock().namespace_name().to_string(),
namespace_name.to_string()
);
assert_eq!(got.lock().table_name().to_string(), table_name.to_string());
}
}

View File

@ -183,8 +183,7 @@ where
//
// This MAY return a different instance than `p` if another
// thread has already initialised the partition.
self.partition_data
.get_or_insert_with(&partition_key, || Arc::new(Mutex::new(p)))
self.partition_data.get_or_insert_with(&partition_key, || p)
}
};
@ -223,8 +222,9 @@ where
);
// Gather the partition data from all of the partitions in this table.
let span = SpanRecorder::new(span);
let partitions = self.partitions().into_iter().map(move |p| {
let mut span = SpanRecorder::new(span.clone().map(|s| s.child("partition read")));
let mut span = span.child("partition read");
let (id, completed_persistence_count, data) = {
let mut p = p.lock();

View File

@ -26,7 +26,9 @@ use wal::Wal;
use crate::{
buffer_tree::{
namespace::name_resolver::{NamespaceNameProvider, NamespaceNameResolver},
partition::resolver::{CatalogPartitionResolver, PartitionCache, PartitionProvider},
partition::resolver::{
CatalogPartitionResolver, CoalescePartitionResolver, PartitionCache, PartitionProvider,
},
table::name_resolver::{TableNameProvider, TableNameResolver},
BufferTree,
},
@ -281,8 +283,10 @@ where
.await
.map_err(InitError::PreWarmPartitions)?;
// Build the partition provider, wrapped in the partition cache.
// Build the partition provider, wrapped in the partition cache and request
// coalescer.
let partition_provider = CatalogPartitionResolver::new(Arc::clone(&catalog));
let partition_provider = CoalescePartitionResolver::new(Arc::new(partition_provider));
let partition_provider = PartitionCache::new(
partition_provider,
recent_partitions,

View File

@ -43,12 +43,11 @@ where
columns: Vec<String>,
span: Option<Span>,
) -> Result<Self::Response, QueryError> {
let span = span.map(|s| s.child(self.name.clone()));
let mut recorder = SpanRecorder::new(span.clone());
let mut recorder = SpanRecorder::new(span).child(self.name.clone());
match self
.inner
.query_exec(namespace_id, table_id, columns, span)
.query_exec(namespace_id, table_id, columns, recorder.span().cloned())
.await
{
Ok(v) => {
@ -89,7 +88,7 @@ mod tests {
.spans()
.into_iter()
.find(|s| s.name == name)
.unwrap_or_else(|| panic!("tracing span {name} not found"));
.unwrap_or_else(|| panic!("tracing span {name} not found in\n{traces:#?}"));
assert_eq!(
span.status, status,

View File

@ -146,6 +146,7 @@ where
request: Request<Ticket>,
) -> Result<Response<Self::DoGetStream>, tonic::Status> {
let span_ctx: Option<SpanContext> = request.extensions().get().cloned();
let span = span_ctx.child_span("ingester query");
// Acquire and hold a permit for the duration of this request, or return
// an error if the existing requests have already exhausted the
@ -178,12 +179,7 @@ where
let response = match self
.query_handler
.query_exec(
namespace_id,
table_id,
request.columns,
span_ctx.child_span("ingester query"),
)
.query_exec(namespace_id, table_id, request.columns, span)
.await
{
Ok(v) => v,

View File

@ -25,11 +25,11 @@ mutable_batch_pb = { version = "0.1.0", path = "../mutable_batch_pb" }
object_store = "0.5.6"
observability_deps = { version = "0.1.0", path = "../observability_deps" }
parquet_file = { version = "0.1.0", path = "../parquet_file" }
prost = { version = "0.11.6", default-features = false, features = ["std"] }
prost = { version = "0.11.9", default-features = false, features = ["std"] }
tempfile = { version = "3.5.0" }
test_helpers = { path = "../test_helpers", features = ["future_timeout"] }
tokio = { version = "1.27", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] }
tokio-util = "0.7.7"
tonic = "0.8.3"
tonic = { workspace = true }
wal = { version = "0.1.0", path = "../wal" }
workspace-hack = { version = "0.1", path = "../workspace-hack" }

View File

@ -24,7 +24,7 @@ rand = { version = "0.8.3", features = ["small_rng"] }
regex = "1.7"
schema = { path = "../schema" }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.95"
serde_json = "1.0.96"
snafu = "0.7"
tokio = { version = "1.27", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] }
toml = "0.7.3"

View File

@ -29,6 +29,7 @@ indexmap = { version = "1.9", features = ["std"] }
itertools = "0.10.5"
object_store = "0.5.6"
observability_deps = { path = "../observability_deps" }
once_cell = "1"
parking_lot = "0.12"
parquet_file = { path = "../parquet_file" }
query_functions = { path = "../query_functions"}

View File

@ -45,16 +45,19 @@ use super::{params::GapFillParams, FillStrategy};
/// │ ╟────╫───┼───┼─────────────╫───┼───┼─────────────╢
/// │ 2 ║ ║ │ │ ║ │ │ ║
/// │ ╟────╫───┼───┼─────────────╫───┼───┼─────────────╢
/// │ 3 ║ ║ │ │ ║ │ │ ║
/// │ . . .
/// output_batch_size . . .
/// │ . . .
/// │ ╟────╫───┼───┼─────────────╫───┼───┼─────────────╢
/// │ n - 1 ║ ║ │ │ ║ │ │ ║
/// │ ╟────╫───┼───┼─────────────╫───┼───┼─────────────╢
/// ┴──── n ║ ║ │ │ ║ │ │ ║
/// ╟────╫───┼───┼─────────────╫───┼───┼─────────────╢
/// trailing row n + 1 ║ ║ │ │ ║ │ │ ║
/// ╙────╨───┴───┴─────────────╨───┴───┴─────────────╜
/// trailing row(s) n + 1 ║ ║ │ │ ║ │ │ ║
/// ╟────╫───┼───┼─────────────╫───┼───┼─────────────╢
/// . . .
/// . . .
/// . . .
/// ```
///
/// Just before generating output, the cursor will generally point at offset 1
@ -69,13 +72,19 @@ use super::{params::GapFillParams, FillStrategy};
/// (using the [`take`](take::take) kernel) when we are generating trailing gaps, i.e.,
/// when all of the input rows have been output for a series in the previous batch,
/// but there still remains missing rows to produce at the end.
/// - Having one additional _trailing row_ at the end ensures that `GapFiller` can
/// - Having at least one additional _trailing row_ at the end ensures that `GapFiller` can
/// infer whether there are trailing gaps to produce at the beginning of the
/// next batch, since it can discover if the last row starts a new series.
/// - If there are columns that have a fill strategy of [`LinearInterpolate`], then more
/// trailing rows may be necessary to find the next non-null value for the column.
///
/// [`LinearInterpolate`]: FillStrategy::LinearInterpolate
#[derive(Debug)]
pub(super) struct GapFiller {
/// The static parameters of gap-filling: time range start, end and the stride.
params: GapFillParams,
/// The number of rows to produce in each output batch.
batch_size: usize,
/// The current state of gap-filling, including the next timestamp,
/// the offset of the next input row, and remaining space in output batch.
cursor: Cursor,
@ -83,9 +92,25 @@ pub(super) struct GapFiller {
impl GapFiller {
/// Initialize a [GapFiller] at the beginning of an input record batch.
pub fn new(params: GapFillParams) -> Self {
pub fn new(params: GapFillParams, batch_size: usize) -> Self {
let cursor = Cursor::new(&params);
Self { params, cursor }
Self {
params,
batch_size,
cursor,
}
}
/// Given that the cursor points at the input row that will be
/// the first row in the next output batch, return the offset
/// of the last input row that could possibly be in the output.
///
/// This offset is used by [`BufferedInput`] to determine how many
/// rows need to be buffered.
///
/// [`BufferedInput`]: super::BufferedInput
pub(super) fn last_output_row_offset(&self) -> usize {
self.cursor.next_input_offset + self.batch_size - 1
}
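
A small worked instance of this arithmetic, with assumed values that mirror the `batch_size = 3` used by the buffering tests later in this diff: with the cursor at input offset 0, the last row that can land in the next output batch is offset 2, and the buffering logic waits for at least 4 rows (one extra to detect a series change or trailing gaps), or more if interpolation needs a later non-null value.

```rust
fn main() {
    // Assumed values, mirroring the tests: cursor at the start of the
    // input, output batches of 3 rows.
    let next_input_offset = 0usize;
    let batch_size = 3usize;

    // GapFiller::last_output_row_offset()
    let last_output_row_offset = next_input_offset + batch_size - 1;
    assert_eq!(last_output_row_offset, 2);

    // BufferedInput::need_more(): everything up to and including that row,
    // plus one more row, must be buffered before producing output.
    let min_buffered_rows = last_output_row_offset + 2;
    assert_eq!(min_buffered_rows, 4);
}
```
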
/// Returns true if there are no more output rows to produce given
@ -100,14 +125,13 @@ impl GapFiller {
/// schema at member `0`.
pub fn build_gapfilled_output(
&mut self,
batch_size: usize,
schema: SchemaRef,
input_time_array: (usize, &TimestampNanosecondArray),
group_arrays: &[(usize, ArrayRef)],
aggr_arrays: &[(usize, ArrayRef)],
) -> Result<RecordBatch> {
let series_ends = self.plan_output_batch(batch_size, input_time_array.1, group_arrays)?;
self.cursor.remaining_output_batch_size = batch_size;
let series_ends = self.plan_output_batch(input_time_array.1, group_arrays)?;
self.cursor.remaining_output_batch_size = self.batch_size;
self.build_output(
schema,
input_time_array,
@ -139,7 +163,6 @@ impl GapFiller {
/// to partition input rows into series.
fn plan_output_batch(
&mut self,
batch_size: usize,
input_time_array: &TimestampNanosecondArray,
group_arr: &[(usize, ArrayRef)],
) -> Result<Vec<usize>> {
@ -165,7 +188,7 @@ impl GapFiller {
let start_offset = cursor.next_input_offset;
assert!(start_offset <= 1, "input is sliced after it is consumed");
while output_row_count < batch_size {
while output_row_count < self.batch_size {
match ranges.next() {
Some(Range { end, .. }) => {
assert!(

View File

@ -90,7 +90,6 @@ impl Cursor {
.map(|seg| Segment::<T::Native>::try_from(seg.clone()))
.transpose()?;
let mut builder = InterpolateBuilder {
params,
values: Vec::with_capacity(self.remaining_output_batch_size),
segment,
input_time_array,
@ -173,7 +172,6 @@ impl_from_segment_scalar_value!(f64);
/// Implements [`VecBuilder`] for build aggregate columns whose gaps
/// are being filled using linear interpolation.
pub(super) struct InterpolateBuilder<'a, T: ArrowPrimitiveType> {
pub params: &'a GapFillParams,
pub values: Vec<Option<T::Native>>,
pub segment: Option<Segment<T::Native>>,
pub input_time_array: &'a TimestampNanosecondArray,
@ -193,27 +191,25 @@ where
offset,
series_end_offset,
} => {
// If
// we are not at the last point
// and the distance to the next point is greater than the stride
// and both this point and the next are not null
// then create a segment that will be used to fill in the missing rows.
if offset + 1 < series_end_offset
&& self.input_time_array.value(offset + 1) > ts + self.params.stride
&& self.input_aggr_array.is_valid(offset)
&& self.input_aggr_array.is_valid(offset + 1)
{
self.segment = Some(Segment {
if self.input_aggr_array.is_valid(offset) {
let end_offset = self.find_end_offset(offset, series_end_offset);
// Find the next non-null value in this column for the series.
// If there is one, start a new segment at the current value.
self.segment = end_offset.map(|end_offset| Segment {
start_point: (ts, self.input_aggr_array.value(offset)),
end_point: (
self.input_time_array.value(offset + 1),
self.input_aggr_array.value(offset + 1),
self.input_time_array.value(end_offset),
self.input_aggr_array.value(end_offset),
),
})
});
self.copy_point(offset);
} else {
self.segment = None;
self.values.push(
self.segment
.as_ref()
.map(|seg| T::Native::interpolate(seg, ts)),
);
}
self.copy_point(offset);
}
RowStatus::Missing { ts, .. } => self.values.push(
self.segment
@ -243,6 +239,17 @@ where
.then_some(self.input_aggr_array.value(offset));
self.values.push(v)
}
/// Scan forward to find the endpoint for a segment that starts at `start_offset`.
/// Skip over any null values.
///
/// We are guaranteed to have buffered enough input to find the next non-null point for this series,
/// if there is one, by the logic in [`BufferedInput`].
///
/// [`BufferedInput`]: super::super::buffered_input::BufferedInput
fn find_end_offset(&self, start_offset: usize, series_end_offset: usize) -> Option<usize> {
((start_offset + 1)..series_end_offset).find(|&i| self.input_aggr_array.is_valid(i))
}
}
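
The arithmetic that a `Segment` stands for can be checked in isolation. The sketch below is a hand-rolled, truncating integer variant (`interpolate_i64` is an illustrative name, not the trait method defined below); with the points used in the snapshot tests that follow — value 200 at t = 1400 and value 1000 at t = 1700 — the gap rows at 1500 and 1600 come out as 466 and 733, matching the updated expected output.

```rust
/// Truncating integer interpolation between two known (timestamp, value) points.
fn interpolate_i64(start: (i64, i64), end: (i64, i64), ts: i64) -> i64 {
    let (t0, v0) = start;
    let (t1, v1) = end;
    // Widen to i128 so the intermediate product cannot overflow.
    (v0 as i128 + (v1 - v0) as i128 * (ts - t0) as i128 / (t1 - t0) as i128) as i64
}

fn main() {
    let start = (1400, 200);
    let end = (1700, 1000);
    assert_eq!(interpolate_i64(start, end, 1500), 466);
    assert_eq!(interpolate_i64(start, end, 1600), 733);
}
```
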
/// A trait for the native numeric types that can be interpolated
@ -375,8 +382,8 @@ mod test {
- "| 1970-01-01T00:00:00.000001200Z | 133 |"
- "| 1970-01-01T00:00:00.000001300Z | 166 |"
- "| 1970-01-01T00:00:00.000001400Z | 200 |"
- "| 1970-01-01T00:00:00.000001500Z | |"
- "| 1970-01-01T00:00:00.000001600Z | |"
- "| 1970-01-01T00:00:00.000001500Z | 466 |"
- "| 1970-01-01T00:00:00.000001600Z | 733 |"
- "| 1970-01-01T00:00:00.000001700Z | 1000 |"
- "| 1970-01-01T00:00:00.000001800Z | 500 |"
- "| 1970-01-01T00:00:00.000001900Z | 0 |"
@ -447,8 +454,8 @@ mod test {
- "| 1970-01-01T00:00:00.000001200Z | 133 |"
- "| 1970-01-01T00:00:00.000001300Z | 166 |"
- "| 1970-01-01T00:00:00.000001400Z | 200 |"
- "| 1970-01-01T00:00:00.000001500Z | |"
- "| 1970-01-01T00:00:00.000001600Z | |"
- "| 1970-01-01T00:00:00.000001500Z | 466 |"
- "| 1970-01-01T00:00:00.000001600Z | 733 |"
- "| 1970-01-01T00:00:00.000001700Z | 1000 |"
- "| 1970-01-01T00:00:00.000001800Z | 500 |"
- "| 1970-01-01T00:00:00.000001900Z | 0 |"
@ -519,8 +526,8 @@ mod test {
- "| 1970-01-01T00:00:00.000001200Z | 200.0 |"
- "| 1970-01-01T00:00:00.000001300Z | 300.0 |"
- "| 1970-01-01T00:00:00.000001400Z | 400.0 |"
- "| 1970-01-01T00:00:00.000001500Z | |"
- "| 1970-01-01T00:00:00.000001600Z | |"
- "| 1970-01-01T00:00:00.000001500Z | 600.0 |"
- "| 1970-01-01T00:00:00.000001600Z | 800.0 |"
- "| 1970-01-01T00:00:00.000001700Z | 1000.0 |"
- "| 1970-01-01T00:00:00.000001800Z | 500.0 |"
- "| 1970-01-01T00:00:00.000001900Z | 0.0 |"

View File

@ -0,0 +1,405 @@
//! Logic for buffering record batches for gap filling.
use std::sync::Arc;
use arrow::{
array::ArrayRef,
record_batch::RecordBatch,
row::{RowConverter, Rows, SortField},
};
use datafusion::error::{DataFusionError, Result};
use hashbrown::HashSet;
use super::{params::GapFillParams, FillStrategy};
/// Encapsulate the logic around how to buffer input records.
///
/// If there are no columns with [`FillStrategy::LinearInterpolate`], then
/// we need to buffer up to the last input row that might appear in the output, plus
/// one additional row.
///
/// However, if there are columns filled via interpolation, then we need
/// to ensure that we read ahead far enough to a non-null value, or a change
/// of group columns, in the columns being interpolated.
///
/// [`FillStrategy::LinearInterpolate`]: super::FillStrategy::LinearInterpolate
/// [`GapFillStream`]: super::stream::GapFillStream
pub(super) struct BufferedInput {
/// Indexes of group columns in the schema (not including time).
group_cols: Vec<usize>,
/// Indexes of aggregate columns filled via interpolation.
interpolate_cols: Vec<usize>,
/// Buffered records from the input stream.
batches: Vec<RecordBatch>,
/// When gap filling with interpolated values, this row converter
/// is used to compare rows to see if group columns have changed.
row_converter: Option<RowConverter>,
/// When gap filling with interpolated values, cache a row-oriented
/// representation of the last row that may appear in the output so
/// it doesn't need to be computed more than once.
last_output_row: Option<Rows>,
}
impl BufferedInput {
pub(super) fn new(params: &GapFillParams, group_cols: Vec<usize>) -> Self {
let interpolate_cols = params
.fill_strategy
.iter()
.filter_map(|(col_offset, fs)| {
(fs == &FillStrategy::LinearInterpolate).then_some(*col_offset)
})
.collect::<Vec<usize>>();
Self {
group_cols,
interpolate_cols,
batches: vec![],
row_converter: None,
last_output_row: None,
}
}
/// Add a new batch of buffered records from the input stream.
pub(super) fn push(&mut self, batch: RecordBatch) {
self.batches.push(batch);
}
/// Transfer ownership of the buffered record batches to the caller for
/// processing.
pub(super) fn take(&mut self) -> Vec<RecordBatch> {
self.last_output_row = None;
std::mem::take(&mut self.batches)
}
/// Determine if we need more input before we start processing.
pub(super) fn need_more(&mut self, last_output_row_offset: usize) -> Result<bool> {
let record_count: usize = self.batches.iter().map(|rb| rb.num_rows()).sum();
// The minimum number of rows needed is the number of rows up to and including
// the last row that may appear in the output, plus one more row.
let min_needed = last_output_row_offset + 2;
if record_count < min_needed {
return Ok(true);
} else if self.interpolate_cols.is_empty() {
return Ok(false);
}
// Check to see if the last row that might appear in the output
// has different group column values than the last buffered row.
// If they are different, then we have enough input to start.
let (last_output_batch_offset, last_output_row_offset) = self
.find_row_idx(last_output_row_offset)
.expect("checked record count");
if self.group_columns_changed((last_output_batch_offset, last_output_row_offset))? {
return Ok(false);
}
// Now check if there are non-null values in the columns being interpolated.
// We skip over the batches that come before the one that contains the last
// possible output row. We start with the last buffered batch, so we can avoid
// having to slice unless necessary.
let mut cols_that_need_more =
HashSet::<usize>::from_iter(self.interpolate_cols.iter().cloned());
let mut to_remove = vec![];
for (i, batch) in self
.batches
.iter()
.enumerate()
.skip(last_output_batch_offset)
.rev()
{
for col_offset in cols_that_need_more.clone() {
// If this is the batch containing the last possible output row, slice the
// array so we are just looking at that value and the ones after.
let array = batch.column(col_offset);
let array = if i == last_output_batch_offset {
let length = array.len() - last_output_row_offset;
batch
.column(col_offset)
.slice(last_output_row_offset, length)
} else {
Arc::clone(array)
};
if array.null_count() < array.len() {
to_remove.push(col_offset);
}
}
to_remove.drain(..).for_each(|c| {
cols_that_need_more.remove(&c);
});
if cols_that_need_more.is_empty() {
break;
}
}
Ok(!cols_that_need_more.is_empty())
}
/// Check to see if the group column values have changed between the last row
/// that may be in the output and the last buffered input row.
///
/// This method uses the row-oriented representation of Arrow data from [`arrow::row`] to
/// compare rows in different record batches.
///
/// [`arrow::row`]: https://docs.rs/arrow-row/36.0.0/arrow_row/index.html
fn group_columns_changed(&mut self, last_output_row_idx: (usize, usize)) -> Result<bool> {
if self.group_cols.is_empty() {
return Ok(false);
}
let last_buffered_row_idx = self.last_buffered_row_idx();
if last_output_row_idx == last_buffered_row_idx {
// the output row is also the last buffered row,
// so there is nothing to compare.
return Ok(false);
}
let last_input_rows = self.convert_row(self.last_buffered_row_idx())?;
let last_row_in_output = self.last_output_row(last_output_row_idx)?;
Ok(last_row_in_output.row(0) != last_input_rows.row(0))
}
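
A minimal sketch of the row-format comparison used here, stripped of the buffering machinery. The arrays are made up for illustration and a single Utf8 group column is assumed; it relies only on the `arrow::row` API already referenced above.

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, StringArray};
use arrow::datatypes::DataType;
use arrow::error::ArrowError;
use arrow::row::{RowConverter, SortField};

fn main() -> Result<(), ArrowError> {
    // One converter per set of group columns; it must match their data types.
    let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)])?;

    // Single-row slices standing in for the "last possible output row" and
    // the "last buffered row", taken from two different record batches.
    let last_output: ArrayRef = Arc::new(StringArray::from(vec!["b"]));
    let last_buffered: ArrayRef = Arc::new(StringArray::from(vec!["c"]));

    let a = converter.convert_columns(&[last_output])?;
    let b = converter.convert_columns(&[last_buffered])?;

    // Different group values => the series changes before the end of the
    // buffered input, so no further read-ahead is required.
    assert!(a.row(0) != b.row(0));
    Ok(())
}
```
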
/// Get a row converter for comparing records. Keep it in [`Self::row_converter`]
/// to avoid creating it multiple times.
fn get_row_converter(&mut self) -> Result<&mut RowConverter> {
if self.row_converter.is_none() {
let batch = self.batches.first().expect("at least one batch");
let sort_fields = self
.group_cols
.iter()
.map(|c| SortField::new(batch.column(*c).data_type().clone()))
.collect();
let row_converter =
RowConverter::new(sort_fields).map_err(DataFusionError::ArrowError)?;
self.row_converter = Some(row_converter);
}
Ok(self.row_converter.as_mut().expect("cannot be none"))
}
/// Convert a row to row-oriented format for easy comparison.
fn convert_row(&mut self, row_idxs: (usize, usize)) -> Result<Rows> {
let batch = &self.batches[row_idxs.0];
let columns: Vec<ArrayRef> = self
.group_cols
.iter()
.map(|col_idx| batch.column(*col_idx).slice(row_idxs.1, 1))
.collect();
self.get_row_converter()?
.convert_columns(&columns)
.map_err(DataFusionError::ArrowError)
}
/// Returns the row-oriented representation of the last buffered row that may appear in the next
/// output batch. Since this row may be needed more than once, it is cached in `self`
/// to avoid recomputing it.
fn last_output_row(&mut self, idxs: (usize, usize)) -> Result<&Rows> {
if self.last_output_row.is_none() {
let rows = self.convert_row(idxs)?;
self.last_output_row = Some(rows);
}
Ok(self.last_output_row.as_ref().expect("cannot be none"))
}
/// Return the `(batch_idx, row_idx)` of the last buffered row.
fn last_buffered_row_idx(&self) -> (usize, usize) {
let last_batch_len = self.batches.last().unwrap().num_rows();
(self.batches.len() - 1, last_batch_len - 1)
}
/// Return the `(batch_idx, row_idx)` of the `nth` row.
fn find_row_idx(&self, mut nth: usize) -> Option<(usize, usize)> {
let mut idx = None;
for (i, batch) in self.batches.iter().enumerate() {
if nth >= batch.num_rows() {
nth -= batch.num_rows()
} else {
idx = Some((i, nth));
break;
}
}
idx
}
}
#[cfg(test)]
mod tests {
use std::collections::VecDeque;
use arrow_util::test_util::batches_to_lines;
use super::*;
use crate::exec::gapfill::exec_tests::TestRecords;
fn test_records(batch_size: usize) -> VecDeque<RecordBatch> {
let records = TestRecords {
group_cols: vec![
std::iter::repeat(Some("a")).take(12).collect(),
std::iter::repeat(Some("b"))
.take(6)
.chain(std::iter::repeat(Some("c")).take(6))
.collect(),
],
time_col: (0..12).map(|i| Some(1000 + i * 5)).take(12).collect(),
agg_cols: vec![
vec![
Some(1),
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
Some(10),
],
vec![
Some(2),
None,
None,
None,
None,
None,
None,
None,
Some(20),
None,
None,
None,
],
(0..12).map(Some).collect(),
],
input_batch_size: batch_size,
};
TryInto::<Vec<RecordBatch>>::try_into(records)
.unwrap()
.into()
}
fn test_params() -> GapFillParams {
GapFillParams {
stride: 50_000_000,
first_ts: Some(1_000_000_000),
last_ts: 1_055_000_000,
fill_strategy: [
(3, FillStrategy::LinearInterpolate),
(4, FillStrategy::LinearInterpolate),
]
.into(),
}
}
// This test is just here so it's clear what the
// test data is
#[test]
fn test_test_records() {
let batch = test_records(1000).pop_front().unwrap();
let actual = batches_to_lines(&[batch]);
insta::assert_yaml_snapshot!(actual, @r###"
---
- +----+----+--------------------------+----+----+----+
- "| g0 | g1 | time | a0 | a1 | a2 |"
- +----+----+--------------------------+----+----+----+
- "| a | b | 1970-01-01T00:00:01Z | 1 | 2 | 0 |"
- "| a | b | 1970-01-01T00:00:01.005Z | | | 1 |"
- "| a | b | 1970-01-01T00:00:01.010Z | | | 2 |"
- "| a | b | 1970-01-01T00:00:01.015Z | | | 3 |"
- "| a | b | 1970-01-01T00:00:01.020Z | | | 4 |"
- "| a | b | 1970-01-01T00:00:01.025Z | | | 5 |"
- "| a | c | 1970-01-01T00:00:01.030Z | | | 6 |"
- "| a | c | 1970-01-01T00:00:01.035Z | | | 7 |"
- "| a | c | 1970-01-01T00:00:01.040Z | | 20 | 8 |"
- "| a | c | 1970-01-01T00:00:01.045Z | | | 9 |"
- "| a | c | 1970-01-01T00:00:01.050Z | | | 10 |"
- "| a | c | 1970-01-01T00:00:01.055Z | 10 | | 11 |"
- +----+----+--------------------------+----+----+----+
"###);
}
#[test]
fn no_group_no_interpolate() {
let batch_size = 3;
let mut params = test_params();
params.fill_strategy = [].into();
let mut buffered_input = BufferedInput::new(&params, vec![]);
let mut batches = test_records(batch_size);
// There are no rows buffered yet, which is less than the batch size,
// so it needs more.
assert!(buffered_input.need_more(batch_size - 1).unwrap());
// There are now 3 rows, still less than batch_size + 1,
// so it needs more.
buffered_input.push(batches.pop_front().unwrap());
assert!(buffered_input.need_more(batch_size - 1).unwrap());
// We now have batch_size * 2 records, which is enough.
buffered_input.push(batches.pop_front().unwrap());
assert!(!buffered_input.need_more(batch_size - 1).unwrap());
}
#[test]
fn no_group() {
let batch_size = 3;
let params = test_params();
let mut buffered_input = BufferedInput::new(&params, vec![]);
let mut batches = test_records(batch_size);
// There are no rows buffered yet, which is less than the batch size,
// so it needs more.
assert!(buffered_input.need_more(batch_size - 1).unwrap());
// There are now 3 rows, still less than batch_size + 1,
// so it needs more.
buffered_input.push(batches.pop_front().unwrap());
assert!(buffered_input.need_more(batch_size - 1).unwrap());
// There are now 6 rows; if we were not interpolating,
// this would be enough.
buffered_input.push(batches.pop_front().unwrap());
// Since we are interpolating and there are no non-null values
// at offset 5, more input is needed.
assert!(buffered_input.need_more(batch_size - 1).unwrap());
// Push more rows, now totaling 9.
buffered_input.push(batches.pop_front().unwrap());
assert!(buffered_input.need_more(batch_size - 1).unwrap());
// Column `a1` has a non-null value at offset 8.
// If that were the only column being interpolated, we would have enough.
// 12 rows, with non-null values in both columns being interpolated.
buffered_input.push(batches.pop_front().unwrap());
assert!(!buffered_input.need_more(batch_size - 1).unwrap());
}
#[test]
fn with_group() {
let params = test_params();
let group_cols = vec![0, 1];
let mut buffered_input = BufferedInput::new(&params, group_cols);
let batch_size = 3;
let mut batches = test_records(batch_size);
// no rows
assert!(buffered_input.need_more(batch_size - 1).unwrap());
// 3 rows
buffered_input.push(batches.pop_front().unwrap());
assert!(buffered_input.need_more(batch_size - 1).unwrap());
// 6 rows
buffered_input.push(batches.pop_front().unwrap());
assert!(buffered_input.need_more(batch_size - 1).unwrap());
// 9 rows (series changes here)
buffered_input.push(batches.pop_front().unwrap());
assert!(!buffered_input.need_more(batch_size - 1).unwrap());
}
}

View File

@ -775,6 +775,7 @@ fn test_gapfill_fill_interpolate() {
Some("b"),
Some("b"),
Some("b"),
Some("b"),
]],
time_col: vec![
None,
@ -788,7 +789,7 @@ fn test_gapfill_fill_interpolate() {
// --- new series
None,
Some(975),
// 1000
Some(1000),
Some(1025),
// 1050
Some(1075),
@ -807,7 +808,7 @@ fn test_gapfill_fill_interpolate() {
// --- new series
Some(-10),
Some(1100), // 975
// 1200 1000
None, // 1200 1000 (this null value will be filled)
Some(1300), // 1025
// 1325 1050
Some(1350), // 1075
@ -979,13 +980,13 @@ fn assert_batch_count(actual_batches: &[RecordBatch], batch_size: usize) {
type ExprVec = Vec<Arc<dyn PhysicalExpr>>;
struct TestRecords {
group_cols: Vec<Vec<Option<&'static str>>>,
pub(super) struct TestRecords {
pub group_cols: Vec<Vec<Option<&'static str>>>,
// Stored as milliseconds since intervals use millis,
// to let test cases be consistent and easier to read.
time_col: Vec<Option<i64>>,
agg_cols: Vec<Vec<Option<i64>>>,
input_batch_size: usize,
pub time_col: Vec<Option<i64>>,
pub agg_cols: Vec<Vec<Option<i64>>>,
pub input_batch_size: usize,
}
impl TestRecords {
@ -1174,14 +1175,16 @@ fn phys_fill_strategies(
fn get_params_ms_with_fill_strategy(
batch: &TestRecords,
stride: i64,
stride_ms: i64,
start: Option<i64>,
end: i64,
fill_strategy: FillStrategy,
) -> GapFillExecParams {
// stride is in ms
let stride = ScalarValue::new_interval_mdn(0, 0, stride_ms * 1_000_000);
GapFillExecParams {
// interval day time is milliseconds in the low 32-bit word
stride: phys_lit(ScalarValue::IntervalDayTime(Some(stride))), // milliseconds
stride: phys_lit(stride),
time_column: Column::new("t", batch.group_cols.len()),
origin: phys_lit(ScalarValue::TimestampNanosecond(Some(0), None)),
// timestamps are nanos, so scale them accordingly

View File

@ -2,6 +2,7 @@
//! a gap-filling extension to DataFusion
mod algo;
mod buffered_input;
#[cfg(test)]
mod exec_tests;
mod params;
@ -31,7 +32,6 @@ use datafusion::{
},
prelude::Expr,
};
use datafusion_util::sort_exprs::requirements_from_sort_exprs;
use self::stream::GapFillStream;
@ -475,7 +475,9 @@ impl ExecutionPlan for GapFillExec {
}
fn required_input_ordering(&self) -> Vec<Option<Vec<PhysicalSortRequirement>>> {
vec![Some(requirements_from_sort_exprs(&self.sort_expr))]
vec![Some(PhysicalSortRequirement::from_sort_exprs(
&self.sort_expr,
))]
}
fn maintains_input_order(&self) -> Vec<bool> {
@ -740,11 +742,11 @@ mod test {
explain,
@r###"
---
- " ProjectionExec: expr=[date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0 as minute, AVG(temps.temp)@1 as AVG(temps.temp)]"
- " GapFillExec: group_expr=[date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0], aggr_expr=[AVG(temps.temp)@1], stride=60000, time_range=Included(\"315532800000000000\")..Excluded(\"347155200000000000\")"
- " SortExec: expr=[date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0 ASC]"
- " AggregateExec: mode=Final, gby=[date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0 as date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))], aggr=[AVG(temps.temp)]"
- " AggregateExec: mode=Partial, gby=[datebin(60000, time@0, 0) as date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))], aggr=[AVG(temps.temp)]"
- " ProjectionExec: expr=[date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0 as minute, AVG(temps.temp)@1 as AVG(temps.temp)]"
- " GapFillExec: group_expr=[date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0], aggr_expr=[AVG(temps.temp)@1], stride=60000000000, time_range=Included(\"315532800000000000\")..Excluded(\"347155200000000000\")"
- " SortExec: expr=[date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0 ASC]"
- " AggregateExec: mode=Final, gby=[date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0 as date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))], aggr=[AVG(temps.temp)]"
- " AggregateExec: mode=Partial, gby=[datebin(60000000000, time@0, 0) as date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))], aggr=[AVG(temps.temp)]"
- " EmptyExec: produce_one_row=false"
"###
);
@ -770,11 +772,11 @@ mod test {
explain,
@r###"
---
- " ProjectionExec: expr=[loc@0 as loc, date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1 as minute, concat(Utf8(\"zz\"),temps.loc)@2 as loczz, AVG(temps.temp)@3 as AVG(temps.temp)]"
- " GapFillExec: group_expr=[loc@0, date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1, concat(Utf8(\"zz\"),temps.loc)@2], aggr_expr=[AVG(temps.temp)@3], stride=60000, time_range=Included(\"315532800000000000\")..Excluded(\"347155200000000000\")"
- " SortExec: expr=[loc@0 ASC,concat(Utf8(\"zz\"),temps.loc)@2 ASC,date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1 ASC]"
- " AggregateExec: mode=Final, gby=[loc@0 as loc, date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1 as date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\")), concat(Utf8(\"zz\"),temps.loc)@2 as concat(Utf8(\"zz\"),temps.loc)], aggr=[AVG(temps.temp)]"
- " AggregateExec: mode=Partial, gby=[loc@1 as loc, datebin(60000, time@0, 0) as date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\")), concat(zz, loc@1) as concat(Utf8(\"zz\"),temps.loc)], aggr=[AVG(temps.temp)]"
- " ProjectionExec: expr=[loc@0 as loc, date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1 as minute, concat(Utf8(\"zz\"),temps.loc)@2 as loczz, AVG(temps.temp)@3 as AVG(temps.temp)]"
- " GapFillExec: group_expr=[loc@0, date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1, concat(Utf8(\"zz\"),temps.loc)@2], aggr_expr=[AVG(temps.temp)@3], stride=60000000000, time_range=Included(\"315532800000000000\")..Excluded(\"347155200000000000\")"
- " SortExec: expr=[loc@0 ASC,concat(Utf8(\"zz\"),temps.loc)@2 ASC,date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1 ASC]"
- " AggregateExec: mode=Final, gby=[loc@0 as loc, date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1 as date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\")), concat(Utf8(\"zz\"),temps.loc)@2 as concat(Utf8(\"zz\"),temps.loc)], aggr=[AVG(temps.temp)]"
- " AggregateExec: mode=Partial, gby=[loc@1 as loc, datebin(60000000000, time@0, 0) as date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\")), concat(zz, loc@1) as concat(Utf8(\"zz\"),temps.loc)], aggr=[AVG(temps.temp)]"
- " EmptyExec: produce_one_row=false"
"###
);

View File

@ -2,7 +2,7 @@
use std::ops::Bound;
use arrow::{
datatypes::{IntervalDayTimeType, SchemaRef},
datatypes::{IntervalMonthDayNanoType, SchemaRef},
record_batch::RecordBatch,
};
use chrono::Duration;
@ -133,10 +133,17 @@ fn extract_timestamp_nanos(cv: &ColumnarValue) -> Result<i64> {
fn extract_interval_nanos(cv: &ColumnarValue) -> Result<i64> {
match cv {
ColumnarValue::Scalar(ScalarValue::IntervalDayTime(Some(v))) => {
let (days, ms) = IntervalDayTimeType::to_parts(*v);
ColumnarValue::Scalar(ScalarValue::IntervalMonthDayNano(Some(v))) => {
let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(*v);
if months != 0 {
return Err(DataFusionError::Execution(
"gap filling does not support month intervals".to_string(),
));
}
let nanos =
(Duration::days(days as i64) + Duration::milliseconds(ms as i64)).num_nanoseconds();
(Duration::days(days as i64) + Duration::nanoseconds(nanos)).num_nanoseconds();
nanos.ok_or_else(|| {
DataFusionError::Execution("gap filling argument is too large".to_string())
})
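
A minimal standalone check of this extraction, using only the arrow and chrono APIs referenced above; the interval literal (0 months, 1 day, 500 ns) is arbitrary.

```rust
use arrow::datatypes::IntervalMonthDayNanoType;
use chrono::Duration;

fn main() {
    // 0 months, 1 day, 500 ns packed into the month-day-nano representation.
    let v = IntervalMonthDayNanoType::make_value(0, 1, 500);
    let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(v);

    // Month components are rejected above because a month has no fixed
    // length in nanoseconds.
    assert_eq!(months, 0);

    let total = (Duration::days(days as i64) + Duration::nanoseconds(nanos))
        .num_nanoseconds()
        .expect("fits in i64");
    assert_eq!(total, 86_400_000_000_000 + 500);
}
```
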
@ -261,9 +268,7 @@ mod tests {
}
fn interval(ns: i64) -> Arc<dyn PhysicalExpr> {
Arc::new(Literal::new(ScalarValue::IntervalDayTime(Some(
ns / 1_000_000,
))))
Arc::new(Literal::new(ScalarValue::new_interval_mdn(0, 0, ns)))
}
fn timestamp(ns: i64) -> Arc<dyn PhysicalExpr> {

View File

@ -22,9 +22,16 @@ use datafusion::{
};
use futures::{ready, Stream, StreamExt};
use super::{algo::GapFiller, params::GapFillParams, GapFillExec};
use super::{algo::GapFiller, buffered_input::BufferedInput, params::GapFillParams, GapFillExec};
/// An implementation of a gap-filling operator that uses the [Stream] trait.
///
/// This type takes responsibility for:
/// - Reading input record batches
/// - Accounting for memory
/// - Extracting arrays for processing by [`GapFiller`]
/// - Recording metrics
/// - Sending record batches to the next operator (by implementing [`Self::poll_next`])
#[allow(dead_code)]
pub(super) struct GapFillStream {
/// The schema of the input and output.
@ -38,12 +45,10 @@ pub(super) struct GapFillStream {
group_expr: Vec<Arc<dyn PhysicalExpr>>,
/// The aggregate columns from the select list of the original query.
aggr_expr: Vec<Arc<dyn PhysicalExpr>>,
/// The number of rows to produce in each output batch.
batch_size: usize,
/// The producer of the input record batches.
input: SendableRecordBatchStream,
/// Input that has been read from the input stream.
buffered_input_batches: Vec<RecordBatch>,
buffered_input: BufferedInput,
/// The thing that does the gap filling.
gap_filler: GapFiller,
/// This is true as long as there are more input record batches to read from `input`.
@ -83,16 +88,19 @@ impl GapFillStream {
.collect::<Vec<_>>();
let aggr_expr = aggr_expr.to_owned();
let time_expr = group_expr.split_off(group_expr.len() - 1).pop().unwrap();
let group_cols = group_expr.iter().map(expr_to_index).collect::<Vec<_>>();
let params = GapFillParams::try_new(Arc::clone(&schema), params)?;
let gap_filler = GapFiller::new(params);
let buffered_input = BufferedInput::new(&params, group_cols);
let gap_filler = GapFiller::new(params, batch_size);
Ok(Self {
schema,
time_expr,
group_expr,
aggr_expr,
batch_size,
input,
buffered_input_batches: vec![],
buffered_input,
gap_filler,
more_input: true,
reservation,
@ -112,28 +120,17 @@ impl Stream for GapFillStream {
/// Produces a gap-filled record batch from its input stream.
///
/// This method starts off by reading input until it has buffered `batch_size` + 2 rows,
/// or until there is no more input. Having at least `batch_size` rows ensures that we
/// can produce at least one full output batch. We need two additional rows so that we have
/// 1) an input row that corresponds to the row before the current output batch. This is
/// needed for the case where we are producing trailing gaps, and we need to use the
/// `take` kernel to build the group columns. There must be at least one row from the
/// corresponding series in the input to take from.
/// 2) an input row that corresponds to the next input row that will be read after the
/// current output batch. This tells us if we have processed all of our input for a series
/// but may be in "trailing gaps" mode.
///
/// Once input rows have been buffered, it will produce a gap-filled [RecordBatch] with `self.batch_size`
/// rows (or fewer, if there is no more input).
/// For details on implementation, see [`GapFiller`].
fn poll_next(
mut self: Pin<&mut Self>,
cx: &mut Context<'_>,
) -> Poll<Option<Result<RecordBatch>>> {
while self.more_input && self.buffered_input_row_count() < self.batch_size + 2 {
let last_output_row_offset = self.gap_filler.last_output_row_offset();
while self.more_input && self.buffered_input.need_more(last_output_row_offset)? {
match ready!(self.input.poll_next_unpin(cx)) {
Some(Ok(batch)) => {
self.reservation.try_grow(batch.get_array_memory_size())?;
self.buffered_input_batches.push(batch);
self.buffered_input.push(batch);
}
Some(Err(e)) => {
return Poll::Ready(Some(Err(e)));
@ -162,8 +159,7 @@ impl Stream for GapFillStream {
match self.process(input_batch) {
Ok((output_batch, remaining_input_batch)) => {
self.buffered_input_batches.push(remaining_input_batch);
assert_eq!(1, self.buffered_input_batches.len());
self.buffered_input.push(remaining_input_batch);
self.reservation
.shrink(output_batch.get_array_memory_size());
@ -175,30 +171,21 @@ impl Stream for GapFillStream {
}
impl GapFillStream {
/// Count of input rows that are currently buffered.
fn buffered_input_row_count(&self) -> usize {
self.buffered_input_batches
.iter()
.map(|rb| rb.num_rows())
.sum()
}
/// If any buffered input batches are present, concatenates them all together
/// and returns an owned batch to the caller, leaving `self.buffered_input_batches` empty.
fn take_buffered_input(&mut self) -> Result<Option<RecordBatch>> {
if self.buffered_input_batches.is_empty() {
let batches = self.buffered_input.take();
if batches.is_empty() {
return Ok(None);
}
let mut v = vec![];
std::mem::swap(&mut v, &mut self.buffered_input_batches);
let old_size = v.iter().map(|rb| rb.get_array_memory_size()).sum();
let old_size = batches.iter().map(|rb| rb.get_array_memory_size()).sum();
let mut batch = arrow::compute::concat_batches(&self.schema, &v)
let mut batch = arrow::compute::concat_batches(&self.schema, &batches)
.map_err(DataFusionError::ArrowError)?;
self.reservation.try_grow(batch.get_array_memory_size())?;
if v.len() > 1 {
if batches.len() > 1 {
// Optimize the dictionaries. The output of this operator uses the take kernel to produce
// its output. Since the input batches will usually be smaller than the output, it should
// be less work to optimize here vs optimizing the output.
@ -234,7 +221,6 @@ impl GapFillStream {
let output_batch = self
.gap_filler
.build_gapfilled_output(
self.batch_size,
Arc::clone(&self.schema),
input_time_array,
&group_arrays,

View File

@ -4,7 +4,7 @@
use arrow::{
self,
array::{Array, BooleanArray, DictionaryArray, StringArray},
array::{downcast_array, Array, BooleanArray, DictionaryArray, StringArray},
compute,
datatypes::{DataType, Int32Type, SchemaRef},
record_batch::RecordBatch,
@ -188,9 +188,7 @@ impl SeriesSetConverter {
])
.expect("concat");
// until https://github.com/apache/arrow-rs/issues/2901 is done, use a workaround
// to get a `BooleanArray`
BooleanArray::from(arr.data().clone())
downcast_array(&arr)
}
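
For reference, a minimal sketch of what `downcast_array` does in this context: it clones the underlying `ArrayData` of a dynamically typed array into a concrete array type, replacing the old `BooleanArray::from(arr.data().clone())` workaround. The array contents below are made up.

```rust
use std::sync::Arc;

use arrow::array::{downcast_array, ArrayRef, BooleanArray};

fn main() {
    let dynamic: ArrayRef = Arc::new(BooleanArray::from(vec![true, false, true]));

    // Clone the dynamically typed array's data into a strongly typed
    // BooleanArray; this panics if the data type does not match.
    let concrete: BooleanArray = downcast_array(dynamic.as_ref());
    assert_eq!(concrete.len(), 3);
    assert!(concrete.value(0));
}
```
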
/// Creates (column_name, column_value) pairs for each column

View File

@ -73,9 +73,7 @@ use datafusion::{
scalar::ScalarValue,
};
use datafusion_util::{
sort_exprs::requirements_from_sort_exprs, watch::WatchedTask, AdapterStream,
};
use datafusion_util::{watch::WatchedTask, AdapterStream};
use futures::StreamExt;
use observability_deps::tracing::*;
use parking_lot::Mutex;
@ -215,7 +213,7 @@ impl ExecutionPlan for StreamSplitExec {
let requirement = self
.input
.output_ordering()
.map(requirements_from_sort_exprs);
.map(PhysicalSortRequirement::from_sort_exprs);
vec![requirement]
}

View File

@ -20,6 +20,7 @@ use datafusion::{error::DataFusionError, prelude::SessionContext};
use exec::{stringset::StringSet, IOxSessionContext};
use hashbrown::HashMap;
use observability_deps::tracing::{debug, trace};
use once_cell::sync::Lazy;
use parquet_file::storage::ParquetExecInput;
use predicate::{rpc_predicate::QueryNamespaceMeta, Predicate, PredicateMatch};
use schema::{
@ -45,9 +46,12 @@ pub use query_functions::group_by::{Aggregate, WindowDuration};
/// The name of the virtual column that represents the chunk order.
pub const CHUNK_ORDER_COLUMN_NAME: &str = "__chunk_order";
static CHUNK_ORDER_FIELD: Lazy<Arc<Field>> =
Lazy::new(|| Arc::new(Field::new(CHUNK_ORDER_COLUMN_NAME, DataType::Int64, false)));
/// Generate [`Field`] for [chunk order column](CHUNK_ORDER_COLUMN_NAME).
pub fn chunk_order_field() -> Field {
Field::new(CHUNK_ORDER_COLUMN_NAME, DataType::Int64, false)
pub fn chunk_order_field() -> Arc<Field> {
Arc::clone(&CHUNK_ORDER_FIELD)
}
/// Trait for an object (designed to be a Chunk) which can provide

View File

@ -14,7 +14,7 @@ use datafusion::{
optimizer::{optimizer::ApplyOrder, OptimizerConfig, OptimizerRule},
prelude::{col, Expr},
};
use query_functions::gapfill::{DATE_BIN_GAPFILL_UDF_NAME, LOCF_UDF_NAME};
use query_functions::gapfill::{DATE_BIN_GAPFILL_UDF_NAME, INTERPOLATE_UDF_NAME, LOCF_UDF_NAME};
use std::{
collections::HashSet,
ops::{Bound, Range},
@ -349,6 +349,14 @@ impl TreeNodeRewriter for DateBinGapfillRewriter {
}
}
fn udf_to_fill_strategy(name: &str) -> Option<FillStrategy> {
match name {
LOCF_UDF_NAME => Some(FillStrategy::PrevNullAsMissing),
INTERPOLATE_UDF_NAME => Some(FillStrategy::LinearInterpolate),
_ => None,
}
}
fn handle_projection(proj: &Projection) -> Result<Option<LogicalPlan>> {
let Projection {
input,
@ -365,12 +373,16 @@ fn handle_projection(proj: &Projection) -> Result<Option<LogicalPlan>> {
return Ok(None)
};
let fill_cols: Vec<(&Expr, FillStrategy)> = proj_exprs
let fill_cols: Vec<(&Expr, FillStrategy, &str)> = proj_exprs
.iter()
.filter_map(|e| match e {
Expr::ScalarUDF { fun, args } if fun.name == LOCF_UDF_NAME => {
let col = &args[0];
Some((col, FillStrategy::PrevNullAsMissing))
Expr::ScalarUDF { fun, args } => {
if let Some(strategy) = udf_to_fill_strategy(&fun.name) {
let col = &args[0];
Some((col, strategy, fun.name.as_str()))
} else {
None
}
}
_ => None,
})
@ -383,12 +395,12 @@ fn handle_projection(proj: &Projection) -> Result<Option<LogicalPlan>> {
// Clone the existing GapFill node, then modify it in place
// to reflect the new fill strategy.
let mut new_gapfill = child_gapfill.clone();
for (e, col) in fill_cols {
if new_gapfill.replace_fill_strategy(e, col).is_none() {
// There was a gap filling function called on an aggregate column.
return Err(DataFusionError::Plan(
"LOCF must be called on an aggregate column in a gap-filling query".to_string(),
));
for (e, fs, fn_name) in fill_cols {
if new_gapfill.replace_fill_strategy(e, fs).is_none() {
// There was a gap filling function called on a non-aggregate column.
return Err(DataFusionError::Plan(format!(
"{fn_name} must be called on an aggregate column in a gap-filling query"
)));
}
}
@ -397,7 +409,9 @@ fn handle_projection(proj: &Projection) -> Result<Option<LogicalPlan>> {
.iter()
.cloned()
.map(|e| match e {
Expr::ScalarUDF { fun, mut args } if fun.name == LOCF_UDF_NAME => args.remove(0),
Expr::ScalarUDF { fun, mut args } if udf_to_fill_strategy(&fun.name).is_some() => {
args.remove(0)
}
_ => e,
})
.collect();
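
In other words, once the GapFill node has recorded a column's fill strategy, the projection drops the wrapping UDF call and keeps only its argument. A small sketch of that rewrite (illustrative, not from this commit; `is_fill_udf` stands in for `udf_to_fill_strategy(..).is_some()`):

use datafusion::logical_expr::Expr;

/// Replace `locf(col)` / `interpolate(col)` with the bare `col`; leave everything
/// else untouched.
fn unwrap_fill_udf(e: Expr, is_fill_udf: impl Fn(&str) -> bool) -> Expr {
    match e {
        Expr::ScalarUDF { fun, mut args } if is_fill_udf(fun.name.as_str()) => args.remove(0),
        other => other,
    }
}
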
@ -433,16 +447,19 @@ fn check_node(node: &LogicalPlan) -> Result<()> {
node.expressions().iter().try_for_each(|expr| {
let dbg_count = count_udf(expr, DATE_BIN_GAPFILL_UDF_NAME)?;
if dbg_count > 0 {
Err(DataFusionError::Plan(format!(
return Err(DataFusionError::Plan(format!(
"{DATE_BIN_GAPFILL_UDF_NAME} may only be used as a GROUP BY expression"
)))
} else if count_udf(expr, LOCF_UDF_NAME)? > 0 {
Err(DataFusionError::Plan(format!(
"{LOCF_UDF_NAME} may only be used in the SELECT list of a gap-filling query"
)))
} else {
Ok(())
)));
}
for fn_name in [LOCF_UDF_NAME, INTERPOLATE_UDF_NAME] {
if count_udf(expr, fn_name)? > 0 {
return Err(DataFusionError::Plan(format!(
"{fn_name} may only be used in the SELECT list of a gap-filling query"
)));
}
}
Ok(())
})
}
@ -459,7 +476,9 @@ mod test {
use datafusion::optimizer::OptimizerContext;
use datafusion::prelude::{avg, case, col, lit, lit_timestamp_nano, min, Expr};
use datafusion::scalar::ScalarValue;
use query_functions::gapfill::{DATE_BIN_GAPFILL_UDF_NAME, LOCF_UDF_NAME};
use query_functions::gapfill::{
DATE_BIN_GAPFILL_UDF_NAME, INTERPOLATE_UDF_NAME, LOCF_UDF_NAME,
};
fn table_scan() -> Result<LogicalPlan> {
let schema = Schema::new(vec![
@ -497,6 +516,13 @@ mod test {
})
}
fn interpolate(arg: Expr) -> Result<Expr> {
Ok(Expr::ScalarUDF {
fun: query_functions::registry().udf(INTERPOLATE_UDF_NAME)?,
args: vec![arg],
})
}
fn optimize(plan: &LogicalPlan) -> Result<Option<LogicalPlan>> {
let optimizer = Optimizer::with_rules(vec![Arc::new(HandleGapFill::default())]);
optimizer.optimize_recursively(
@ -581,6 +607,20 @@ mod test {
Ok(())
}
/// calling INTERPOLATE in a WHERE predicate is not valid
#[test]
fn misplaced_interpolate_err() -> Result<()> {
// interpolate used in a filter should produce an error
let scan = table_scan()?;
let plan = LogicalPlanBuilder::from(scan)
.filter(interpolate(col("temp"))?.gt(lit(100.0)))?
.build()?;
assert_optimizer_err(
&plan,
"Error during planning: interpolate may only be used in the SELECT list of a gap-filling query",
);
Ok(())
}
/// calling LOCF on the SELECT list but not on an aggregate column is not valid.
#[test]
fn misplaced_locf_non_agg_err() -> Result<()> {
@ -607,7 +647,7 @@ mod test {
.build()?;
assert_optimizer_err(
&plan,
"LOCF must be called on an aggregate column in a gap-filling query",
"locf must be called on an aggregate column in a gap-filling query",
);
Ok(())
}
@ -852,4 +892,37 @@ mod test {
assert_optimized_plan_eq(&plan, &expected)?;
Ok(())
}
#[test]
fn with_interpolate() -> Result<()> {
let dbg_args = "IntervalDayTime(\"60000\"),temps.time,TimestampNanosecond(0, None)";
let plan = LogicalPlanBuilder::from(table_scan()?)
.filter(
col("time")
.gt_eq(lit_timestamp_nano(1000))
.and(col("time").lt(lit_timestamp_nano(2000))),
)?
.aggregate(
vec![date_bin_gapfill(
lit(ScalarValue::IntervalDayTime(Some(60_000))),
col("time"),
)?],
vec![avg(col("temp")), min(col("temp"))],
)?
.project(vec![
col(format!("date_bin_gapfill({dbg_args})")),
interpolate(col("AVG(temps.temp)"))?,
interpolate(col("MIN(temps.temp)"))?,
])?
.build()?;
let expected = format!(
"Projection: date_bin_gapfill({dbg_args}), AVG(temps.temp), MIN(temps.temp)\
\n GapFill: groupBy=[[date_bin_gapfill({dbg_args})]], aggr=[[INTERPOLATE(AVG(temps.temp)), INTERPOLATE(MIN(temps.temp))]], time_column=date_bin_gapfill({dbg_args}), stride=IntervalDayTime(\"60000\"), range=Included(TimestampNanosecond(1000, None))..Excluded(TimestampNanosecond(2000, None))\
\n Aggregate: groupBy=[[datebin(IntervalDayTime(\"60000\"), temps.time, TimestampNanosecond(0, None))]], aggr=[[AVG(temps.temp), MIN(temps.temp)]]\
\n Filter: temps.time >= TimestampNanosecond(1000, None) AND temps.time < TimestampNanosecond(2000, None)\
\n TableScan: temps");
assert_optimized_plan_eq(&plan, &expected)?;
Ok(())
}
}


@ -1,6 +1,6 @@
use std::sync::Arc;
use arrow::datatypes::Schema as ArrowSchema;
use arrow::datatypes::{Fields, Schema as ArrowSchema};
use datafusion::physical_plan::ExecutionPlan;
use schema::Schema;
@ -40,7 +40,7 @@ fn dedup_plan_impl(
.iter()
.cloned()
.chain(std::iter::once(chunk_order_field()))
.collect(),
.collect::<Fields>(),
))
} else {
schema.as_arrow()
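
With arrow 37 a schema's field list is the dedicated `Fields` type (of `Arc<Field>` entries) rather than a `Vec<Field>`, so appending the chunk-order column means collecting into `Fields`. A minimal sketch (illustrative, not from this commit; the helper name is an assumption):

use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Fields, Schema};

/// Append one extra column to an existing schema.
fn schema_with_chunk_order(schema: &Schema) -> Schema {
    let extra = Arc::new(Field::new("__chunk_order", DataType::Int64, false));
    let fields: Fields = schema
        .fields()
        .iter()
        .cloned()
        .chain(std::iter::once(extra))
        .collect();
    Schema::new(fields)
}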


@ -169,12 +169,14 @@ impl PhysicalOptimizerRule for ProjectionPushdown {
&column_names,
Arc::clone(child_sort.input()),
|plan| {
Ok(Arc::new(SortExec::new_with_partitioning(
reassign_sort_exprs_columns(child_sort.expr(), &plan.schema())?,
plan,
child_sort.preserve_partitioning(),
child_sort.fetch(),
)))
Ok(Arc::new(
SortExec::new(
reassign_sort_exprs_columns(child_sort.expr(), &plan.schema())?,
plan,
)
.with_preserve_partitioning(child_sort.preserve_partitioning())
.with_fetch(child_sort.fetch()),
))
},
)?;
@ -930,7 +932,7 @@ mod tests {
ProjectionExec::try_new(
vec![(expr_col("tag1", &schema), String::from("tag1"))],
Arc::new(
SortExec::try_new(
SortExec::new(
vec![PhysicalSortExpr {
expr: expr_col("tag2", &schema),
options: SortOptions {
@ -939,9 +941,8 @@ mod tests {
},
}],
Arc::new(TestExec::new(schema)),
Some(42),
)
.unwrap(),
.with_fetch(Some(42)),
),
)
.unwrap(),
@ -971,18 +972,20 @@ mod tests {
let plan = Arc::new(
ProjectionExec::try_new(
vec![(expr_col("tag1", &schema), String::from("tag1"))],
Arc::new(SortExec::new_with_partitioning(
vec![PhysicalSortExpr {
expr: expr_col("tag2", &schema),
options: SortOptions {
descending: true,
..Default::default()
},
}],
Arc::new(TestExec::new_with_partitions(schema, 2)),
true,
Some(42),
)),
Arc::new(
SortExec::new(
vec![PhysicalSortExpr {
expr: expr_col("tag2", &schema),
options: SortOptions {
descending: true,
..Default::default()
},
}],
Arc::new(TestExec::new_with_partitions(schema, 2)),
)
.with_preserve_partitioning(true)
.with_fetch(Some(42)),
),
)
.unwrap(),
);
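
The SortExec changes reflect DataFusion's move from constructor variants (`try_new`, `new_with_partitioning`) to a builder style where optional settings are chained. A minimal sketch (illustrative, not from this commit; the helper name, the "time" column, and the import paths are assumptions for this DataFusion revision):

use std::sync::Arc;

use arrow::compute::SortOptions;
use datafusion::error::Result;
use datafusion::physical_expr::PhysicalSortExpr;
use datafusion::physical_plan::{expressions::col, sorts::sort::SortExec, ExecutionPlan};

/// Sort `input` by "time" descending, keep its partitioning, and fetch at most 42 rows.
fn sorted_top_42(input: Arc<dyn ExecutionPlan>) -> Result<Arc<dyn ExecutionPlan>> {
    let schema = input.schema();
    let sort_exprs = vec![PhysicalSortExpr {
        expr: col("time", &schema)?,
        options: SortOptions {
            descending: true,
            ..Default::default()
        },
    }];
    Ok(Arc::new(
        SortExec::new(sort_exprs, input)
            .with_preserve_partitioning(true)
            .with_fetch(Some(42)),
    ))
}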

Some files were not shown because too many files have changed in this diff.