570 lines
20 KiB
Rust
570 lines
20 KiB
Rust
//! This module contains plumbing to connect InfluxDB IOx extensions to
|
|
//! DataFusion
|
|
|
|
use async_trait::async_trait;
|
|
use std::{convert::TryInto, fmt, sync::Arc};
|
|
|
|
use arrow::record_batch::RecordBatch;
|
|
|
|
use datafusion::{
|
|
catalog::catalog::CatalogProvider,
|
|
execution::context::{ExecutionContextState, QueryPlanner},
|
|
execution::{DiskManager, MemoryManager},
|
|
logical_plan::{LogicalPlan, UserDefinedLogicalNode},
|
|
physical_plan::{
|
|
coalesce_partitions::CoalescePartitionsExec,
|
|
displayable,
|
|
planner::{DefaultPhysicalPlanner, ExtensionPlanner},
|
|
ExecutionPlan, PhysicalPlanner, SendableRecordBatchStream,
|
|
},
|
|
prelude::*,
|
|
};
|
|
use futures::TryStreamExt;
|
|
use observability_deps::tracing::{debug, trace};
|
|
use trace::{ctx::SpanContext, span::SpanRecorder};
|
|
|
|
use crate::exec::{
|
|
fieldlist::{FieldList, IntoFieldList},
|
|
non_null_checker::NonNullCheckerExec,
|
|
query_tracing::TracedStream,
|
|
schema_pivot::{SchemaPivotExec, SchemaPivotNode},
|
|
seriesset::{
|
|
converter::{GroupGenerator, SeriesSetConverter},
|
|
series::Series,
|
|
},
|
|
split::StreamSplitExec,
|
|
stringset::{IntoStringSet, StringSetRef},
|
|
};
|
|
|
|
use crate::plan::{
|
|
fieldlist::FieldListPlan,
|
|
seriesset::{SeriesSetPlan, SeriesSetPlans},
|
|
stringset::StringSetPlan,
|
|
};
|
|
|
|
// Reuse DataFusion error and Result types for this module
|
|
pub use datafusion::error::{DataFusionError as Error, Result};
|
|
|
|
use super::{
|
|
non_null_checker::NonNullCheckerNode, seriesset::series::Either, split::StreamSplitNode,
|
|
task::DedicatedExecutor,
|
|
};
|
|
|
|
/// The default catalog name - this impacts what SQL queries use if not specified
pub const DEFAULT_CATALOG: &str = "public";

/// The default schema name - this impacts what SQL queries use if not specified
pub const DEFAULT_SCHEMA: &str = "iox";
|
|
|
|
/// This structure implements the DataFusion notion of "query planner"
/// and is needed to create plans with the IOx extension nodes.
///
/// Stateless marker type; it is registered on the `ExecutionConfig` via
/// `with_query_planner` (see `IOxExecutionConfig::new`).
struct IOxQueryPlanner {}
|
|
|
|
#[async_trait]
|
|
impl QueryPlanner for IOxQueryPlanner {
|
|
/// Given a `LogicalPlan` created from above, create an
|
|
/// `ExecutionPlan` suitable for execution
|
|
async fn create_physical_plan(
|
|
&self,
|
|
logical_plan: &LogicalPlan,
|
|
ctx_state: &ExecutionContextState,
|
|
) -> Result<Arc<dyn ExecutionPlan>> {
|
|
// Teach the default physical planner how to plan SchemaPivot
|
|
// and StreamSplit nodes.
|
|
let physical_planner =
|
|
DefaultPhysicalPlanner::with_extension_planners(vec![Arc::new(IOxExtensionPlanner {})]);
|
|
// Delegate most work of physical planning to the default physical planner
|
|
physical_planner
|
|
.create_physical_plan(logical_plan, ctx_state)
|
|
.await
|
|
}
|
|
}
|
|
|
|
/// Physical planner for InfluxDB IOx extension plans
///
/// Stateless marker type; it is handed to
/// `DefaultPhysicalPlanner::with_extension_planners` so DataFusion can
/// lower the IOx-specific logical nodes into physical operators.
struct IOxExtensionPlanner {}
|
|
|
|
impl ExtensionPlanner for IOxExtensionPlanner {
|
|
/// Create a physical plan for an extension node
|
|
fn plan_extension(
|
|
&self,
|
|
planner: &dyn PhysicalPlanner,
|
|
node: &dyn UserDefinedLogicalNode,
|
|
logical_inputs: &[&LogicalPlan],
|
|
physical_inputs: &[Arc<dyn ExecutionPlan>],
|
|
ctx_state: &ExecutionContextState,
|
|
) -> Result<Option<Arc<dyn ExecutionPlan>>> {
|
|
let any = node.as_any();
|
|
let plan = if let Some(schema_pivot) = any.downcast_ref::<SchemaPivotNode>() {
|
|
assert_eq!(physical_inputs.len(), 1, "Inconsistent number of inputs");
|
|
Some(Arc::new(SchemaPivotExec::new(
|
|
Arc::clone(&physical_inputs[0]),
|
|
schema_pivot.schema().as_ref().clone().into(),
|
|
)) as Arc<dyn ExecutionPlan>)
|
|
} else if let Some(non_null_checker) = any.downcast_ref::<NonNullCheckerNode>() {
|
|
assert_eq!(physical_inputs.len(), 1, "Inconsistent number of inputs");
|
|
Some(Arc::new(NonNullCheckerExec::new(
|
|
Arc::clone(&physical_inputs[0]),
|
|
non_null_checker.schema().as_ref().clone().into(),
|
|
non_null_checker.value(),
|
|
)) as Arc<dyn ExecutionPlan>)
|
|
} else if let Some(stream_split) = any.downcast_ref::<StreamSplitNode>() {
|
|
assert_eq!(
|
|
logical_inputs.len(),
|
|
1,
|
|
"Inconsistent number of logical inputs"
|
|
);
|
|
assert_eq!(
|
|
physical_inputs.len(),
|
|
1,
|
|
"Inconsistent number of physical inputs"
|
|
);
|
|
|
|
let split_expr = planner.create_physical_expr(
|
|
stream_split.split_expr(),
|
|
logical_inputs[0].schema(),
|
|
&physical_inputs[0].schema(),
|
|
ctx_state,
|
|
)?;
|
|
|
|
Some(Arc::new(StreamSplitExec::new(
|
|
Arc::clone(&physical_inputs[0]),
|
|
split_expr,
|
|
)) as Arc<dyn ExecutionPlan>)
|
|
} else {
|
|
None
|
|
};
|
|
Ok(plan)
|
|
}
|
|
}
|
|
|
|
/// Configuration for an IOx execution context
///
/// Created from an Executor. Configured via the builder-style `with_*`
/// methods and consumed by `build` to produce an `IOxExecutionContext`.
#[derive(Clone)]
pub struct IOxExecutionConfig {
    /// Executor to run on
    exec: DedicatedExecutor,

    /// DataFusion configuration
    execution_config: ExecutionConfig,

    /// Default catalog, registered under `DEFAULT_CATALOG` at build time
    /// (if set)
    default_catalog: Option<Arc<dyn CatalogProvider>>,

    /// Span context from which to create spans for this query
    span_ctx: Option<SpanContext>,
}
|
|
|
|
impl fmt::Debug for IOxExecutionConfig {
|
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
write!(f, "IOxExecutionConfig ...")
|
|
}
|
|
}
|
|
|
|
const BATCH_SIZE: usize = 1000;
|
|
|
|
impl IOxExecutionConfig {
|
|
pub(super) fn new(exec: DedicatedExecutor) -> Self {
|
|
let execution_config = ExecutionConfig::new()
|
|
.with_batch_size(BATCH_SIZE)
|
|
.create_default_catalog_and_schema(true)
|
|
.with_information_schema(true)
|
|
.with_default_catalog_and_schema(DEFAULT_CATALOG, DEFAULT_SCHEMA)
|
|
.with_query_planner(Arc::new(IOxQueryPlanner {}));
|
|
|
|
Self {
|
|
exec,
|
|
execution_config,
|
|
default_catalog: None,
|
|
span_ctx: None,
|
|
}
|
|
}
|
|
|
|
/// Set execution concurrency
|
|
pub fn with_target_partitions(mut self, target_partitions: usize) -> Self {
|
|
self.execution_config = self
|
|
.execution_config
|
|
.with_target_partitions(target_partitions);
|
|
self
|
|
}
|
|
|
|
/// Set the [MemoryManager]
|
|
pub fn with_memory_manager(mut self, memory_manager: Arc<MemoryManager>) -> Self {
|
|
self.execution_config = self
|
|
.execution_config
|
|
.with_existing_memory_manager(memory_manager);
|
|
self
|
|
}
|
|
|
|
/// Set the [DiskManager]
|
|
pub fn with_disk_manager(mut self, disk_manager: Arc<DiskManager>) -> Self {
|
|
self.execution_config = self
|
|
.execution_config
|
|
.with_existing_disk_manager(disk_manager);
|
|
self
|
|
}
|
|
|
|
/// Set the default catalog provider
|
|
pub fn with_default_catalog(self, catalog: Arc<dyn CatalogProvider>) -> Self {
|
|
Self {
|
|
default_catalog: Some(catalog),
|
|
..self
|
|
}
|
|
}
|
|
|
|
/// Set the span context from which to create distributed tracing spans for this query
|
|
pub fn with_span_context(self, span_ctx: Option<SpanContext>) -> Self {
|
|
Self { span_ctx, ..self }
|
|
}
|
|
|
|
/// Create an ExecutionContext suitable for executing DataFusion plans
|
|
pub fn build(self) -> IOxExecutionContext {
|
|
let inner = ExecutionContext::with_config(self.execution_config);
|
|
|
|
if let Some(default_catalog) = self.default_catalog {
|
|
inner.register_catalog(DEFAULT_CATALOG, default_catalog);
|
|
}
|
|
|
|
let maybe_span = self.span_ctx.map(|ctx| ctx.child("Query Execution"));
|
|
|
|
IOxExecutionContext {
|
|
inner,
|
|
exec: self.exec,
|
|
recorder: SpanRecorder::new(maybe_span),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// This is an execution context for planning in IOx. It wraps a
/// DataFusion execution context with the information needed for planning.
///
/// Methods on this struct should be preferred to using the raw
/// DataFusion functions (such as `collect`) directly.
///
/// Eventually we envision this also managing additional resource
/// types such as Memory and providing visibility into what plans are
/// running
///
/// An IOxExecutionContext is created directly from an Executor, or from
/// an IOxExecutionConfig created by an Executor
pub struct IOxExecutionContext {
    /// Wrapped DataFusion execution context, used for logical planning,
    /// optimization and physical planning
    inner: ExecutionContext,

    /// Dedicated executor for query execution.
    ///
    /// DataFusion plans are "CPU" bound and thus can consume tokio
    /// executors threads for extended periods of time. We use a
    /// dedicated tokio runtime to run them so that other requests
    /// can be handled.
    exec: DedicatedExecutor,

    /// Span recorder (built from the span context, if any) used to emit
    /// distributed tracing spans for this query
    recorder: SpanRecorder,
}
|
|
|
|
impl fmt::Debug for IOxExecutionContext {
|
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
f.debug_struct("IOxExecutionContext")
|
|
.field("inner", &"<DataFusion ExecutionContext>")
|
|
.finish()
|
|
}
|
|
}
|
|
|
|
impl IOxExecutionContext {
    /// returns a reference to the inner datafusion execution context
    pub fn inner(&self) -> &ExecutionContext {
        &self.inner
    }

    /// Prepare a SQL statement for execution. This assumes that any
    /// tables referenced in the SQL have been registered with this context.
    ///
    /// Planning happens under a "prepare_sql" child span; physical
    /// planning is delegated to [`Self::prepare_plan`].
    pub async fn prepare_sql(&self, sql: &str) -> Result<Arc<dyn ExecutionPlan>> {
        let ctx = self.child_ctx("prepare_sql");
        debug!(text=%sql, "planning SQL query");
        let logical_plan = ctx.inner.create_logical_plan(sql)?;
        debug!(plan=%logical_plan.display_graphviz(), "logical plan");
        ctx.prepare_plan(&logical_plan).await
    }

    /// Prepare (optimize + plan) a pre-created logical plan for execution
    ///
    /// Records "optimized plan" / "plan to run" span events between the
    /// optimization and physical planning steps.
    pub async fn prepare_plan(&self, plan: &LogicalPlan) -> Result<Arc<dyn ExecutionPlan>> {
        // `mut` because recording span events below mutates the recorder
        let mut ctx = self.child_ctx("prepare_plan");
        debug!(text=%plan.display_indent_schema(), "prepare_plan: initial plan");

        let plan = ctx.inner.optimize(plan)?;

        ctx.recorder.event("optimized plan");
        trace!(text=%plan.display_indent_schema(), graphviz=%plan.display_graphviz(), "optimized plan");

        let physical_plan = ctx.inner.create_physical_plan(&plan).await?;

        ctx.recorder.event("plan to run");
        debug!(text=%displayable(physical_plan.as_ref()).indent(), "prepare_plan: plan to run");
        Ok(physical_plan)
    }

    /// Executes the logical plan using DataFusion on a separate
    /// thread pool and produces RecordBatches
    pub async fn collect(&self, physical_plan: Arc<dyn ExecutionPlan>) -> Result<Vec<RecordBatch>> {
        debug!(
            "Running plan, physical:\n{}",
            displayable(physical_plan.as_ref()).indent()
        );
        let ctx = self.child_ctx("collect");
        let stream = ctx.execute_stream(physical_plan).await?;

        // Drain the stream on the dedicated executor so the CPU-bound
        // work does not run on the caller's runtime
        ctx.run(
            stream
                .err_into() // convert to DataFusionError
                .try_collect(),
        )
        .await
    }

    /// Executes the physical plan and produces a
    /// `SendableRecordBatchStream` to stream over the result that
    /// iterates over the results. The creation of the stream is
    /// performed in a separate thread pool.
    pub async fn execute_stream(
        &self,
        physical_plan: Arc<dyn ExecutionPlan>,
    ) -> Result<SendableRecordBatchStream> {
        match physical_plan.output_partitioning().partition_count() {
            // a plan always produces at least one partition
            0 => unreachable!(),
            1 => self.execute_stream_partitioned(physical_plan, 0).await,
            _ => {
                // Merge into a single partition
                self.execute_stream_partitioned(
                    Arc::new(CoalescePartitionsExec::new(physical_plan)),
                    0,
                )
                .await
            }
        }
    }

    /// Executes a single partition of a physical plan and produces a
    /// `SendableRecordBatchStream` to stream over the result that
    /// iterates over the results. The creation of the stream is
    /// performed in a separate thread pool.
    pub async fn execute_stream_partitioned(
        &self,
        physical_plan: Arc<dyn ExecutionPlan>,
        partition: usize,
    ) -> Result<SendableRecordBatchStream> {
        let span = self
            .recorder
            .span()
            .map(|span| span.child("execute_stream_partitioned"));

        let runtime = self.inner.runtime_env();

        self.run(async move {
            let stream = physical_plan.execute(partition, runtime).await?;
            // Wrap the stream so its progress is recorded on the span
            let stream = TracedStream::new(stream, span, physical_plan);
            Ok(Box::pin(stream) as _)
        })
        .await
    }

    /// Executes the SeriesSetPlans on the query executor, in
    /// parallel, producing series or groups
    ///
    /// TODO make this streaming rather than buffering the results
    pub async fn to_series_and_groups(
        &self,
        series_set_plans: SeriesSetPlans,
    ) -> Result<Vec<Either>> {
        let SeriesSetPlans {
            mut plans,
            group_columns,
        } = series_set_plans;

        if plans.is_empty() {
            return Ok(vec![]);
        }

        // sort plans by table (measurement) name
        plans.sort_by(|a, b| a.table_name.cmp(&b.table_name));

        // Run the plans in parallel
        let handles = plans
            .into_iter()
            .map(|plan| {
                let ctx = self.child_ctx("to_series_set");
                self.run(async move {
                    let SeriesSetPlan {
                        table_name,
                        plan,
                        tag_columns,
                        field_columns,
                    } = plan;

                    let tag_columns = Arc::new(tag_columns);

                    let physical_plan = ctx.prepare_plan(&plan).await?;

                    let it = ctx.execute_stream(physical_plan).await?;

                    SeriesSetConverter::default()
                        .convert(table_name, tag_columns, field_columns, it)
                        .await
                        .map_err(|e| {
                            Error::Execution(format!(
                                "Error executing series set conversion: {}",
                                e
                            ))
                        })
                })
            })
            .collect::<Vec<_>>();

        // join_all ensures that the results are consumed in the same order they
        // were spawned maintaining the guarantee to return results ordered
        // by table name and plan sort order.
        let all_series_sets = futures::future::try_join_all(handles).await?;

        // convert to series sets
        let mut data: Vec<Series> = vec![];
        for series_sets in all_series_sets {
            for series_set in series_sets {
                // If all timestamps of returned columns are nulls,
                // there must be no data. We need to check this because
                // aggregate (e.g. count, min, max) returns one row that are
                // all null (even the values of aggregate) for min, max and 0 for count.
                // For influx read_group's series and group, we do not want to return 0
                // for count either.
                if series_set.is_timestamp_all_null() {
                    continue;
                }

                let series: Vec<Series> = series_set
                    .try_into()
                    .map_err(|e| Error::Execution(format!("Error converting to series: {}", e)))?;
                data.extend(series);
            }
        }

        // If we have group columns, sort the results, and create the
        // appropriate groups
        if let Some(group_columns) = group_columns {
            let grouper = GroupGenerator::new(group_columns);
            grouper
                .group(data)
                .map_err(|e| Error::Execution(format!("Error forming groups: {}", e)))
        } else {
            let data = data.into_iter().map(|series| series.into()).collect();
            Ok(data)
        }
    }

    /// Executes `plan` and return the resulting FieldList on the query executor
    ///
    /// Each sub-plan runs in parallel on the dedicated executor; the
    /// individual field lists are then combined.
    pub async fn to_field_list(&self, plan: FieldListPlan) -> Result<FieldList> {
        let FieldListPlan { plans } = plan;

        // Run the plans in parallel
        let handles = plans
            .into_iter()
            .map(|plan| {
                let ctx = self.child_ctx("to_field_list");
                self.run(async move {
                    let physical_plan = ctx.prepare_plan(&plan).await?;

                    // TODO: avoid this buffering
                    let field_list =
                        ctx.collect(physical_plan)
                            .await?
                            .into_fieldlist()
                            .map_err(|e| {
                                Error::Execution(format!("Error converting to field list: {}", e))
                            })?;

                    Ok(field_list)
                })
            })
            .collect::<Vec<_>>();

        // collect them all up and combine them
        let mut results = Vec::new();
        for join_handle in handles {
            let fieldlist = join_handle.await?;

            results.push(fieldlist);
        }

        // TODO: Stream this
        results
            .into_fieldlist()
            .map_err(|e| Error::Execution(format!("Error converting to field list: {}", e)))
    }

    /// Executes this plan on the query pool, and returns the
    /// resulting set of strings
    ///
    /// A `Known` plan already carries its result and needs no execution.
    pub async fn to_string_set(&self, plan: StringSetPlan) -> Result<StringSetRef> {
        let ctx = self.child_ctx("to_string_set");
        match plan {
            StringSetPlan::Known(ss) => Ok(ss),
            StringSetPlan::Plan(plans) => ctx
                .run_logical_plans(plans)
                .await?
                .into_stringset()
                .map_err(|e| Error::Execution(format!("Error converting to stringset: {}", e))),
        }
    }

    /// Run the plan and return a record batch reader for reading the results
    pub async fn run_logical_plan(&self, plan: LogicalPlan) -> Result<Vec<RecordBatch>> {
        self.run_logical_plans(vec![plan]).await
    }

    /// plans and runs the plans in parallel and collects the results
    /// run each plan in parallel and collect the results
    async fn run_logical_plans(&self, plans: Vec<LogicalPlan>) -> Result<Vec<RecordBatch>> {
        let value_futures = plans
            .into_iter()
            .map(|plan| {
                let ctx = self.child_ctx("run_logical_plans");
                self.run(async move {
                    let physical_plan = ctx.prepare_plan(&plan).await?;

                    // TODO: avoid this buffering
                    ctx.collect(physical_plan).await
                })
            })
            .collect::<Vec<_>>();

        // now, wait for all the values to resolve and collect them together
        let mut results = Vec::new();
        for join_handle in value_futures {
            let mut plan_result = join_handle.await?;
            results.append(&mut plan_result);
        }
        Ok(results)
    }

    /// Runs the provided future using this execution context
    ///
    /// The future is spawned on the dedicated executor; a join failure
    /// (e.g. the task panicked or was cancelled) is surfaced as an
    /// `Error::Execution`.
    pub async fn run<Fut, T>(&self, fut: Fut) -> Result<T>
    where
        Fut: std::future::Future<Output = Result<T>> + Send + 'static,
        T: Send + 'static,
    {
        self.exec
            .spawn(fut)
            .await
            .unwrap_or_else(|e| Err(Error::Execution(format!("Join Error: {}", e))))
    }

    /// Returns a IOxExecutionContext with a SpanRecorder that is a child of the current
    pub fn child_ctx(&self, name: &'static str) -> Self {
        Self {
            inner: self.inner.clone(),
            exec: self.exec.clone(),
            recorder: self.recorder.child(name),
        }
    }

    /// Number of currently active tasks.
    pub fn tasks(&self) -> usize {
        self.exec.tasks()
    }
}
|