556 lines
20 KiB
Rust
556 lines
20 KiB
Rust
//! Handle all requests from Querier
|
|
|
|
use std::{pin::Pin, sync::Arc};
|
|
|
|
use arrow::{error::ArrowError, record_batch::RecordBatch};
|
|
use arrow_flight::{
|
|
decode::{DecodedPayload, FlightDataDecoder},
|
|
encode::FlightDataEncoderBuilder,
|
|
error::FlightError,
|
|
FlightData, IpcMessage,
|
|
};
|
|
use arrow_util::test_util::equalize_batch_schemas;
|
|
use data_types::{NamespaceId, PartitionId, SequenceNumber, TableId};
|
|
use datafusion::physical_plan::SendableRecordBatchStream;
|
|
use datafusion_util::MemoryStream;
|
|
use flatbuffers::FlatBufferBuilder;
|
|
use futures::{stream::BoxStream, Stream, StreamExt, TryStreamExt};
|
|
use generated_types::ingester::IngesterQueryRequest;
|
|
use observability_deps::tracing::*;
|
|
use schema::Projection;
|
|
use snafu::{ensure, Snafu};
|
|
use trace::span::{Span, SpanRecorder};
|
|
|
|
use crate::data::IngesterData;
|
|
|
|
/// Number of table data read locks that shall be acquired in parallel
///
/// Bounds the `buffer_unordered` concurrency used when reading partition data
/// in `prepare_data_to_querier`, so we never hold more than this many table
/// data futures in flight at once.
const CONCURRENT_TABLE_DATA_LOCKS: usize = 10;
|
|
|
|
/// Errors that can occur while preparing query response data for the querier.
#[derive(Debug, Snafu)]
#[allow(missing_copy_implementations, missing_docs)]
pub enum Error {
    /// The requested namespace is not known to any shard of this ingester.
    #[snafu(display("No Namespace Data found for the given namespace ID {}", namespace_id,))]
    NamespaceNotFound { namespace_id: NamespaceId },

    /// The namespace exists but the requested table is not buffered in it.
    #[snafu(display(
        "No Table Data found for the given namespace ID {}, table ID {}",
        namespace_id,
        table_id
    ))]
    TableNotFound {
        namespace_id: NamespaceId,
        table_id: TableId,
    },

    /// Too many concurrent query requests are in flight.
    #[snafu(display("Concurrent query request limit exceeded"))]
    RequestLimit,
}
|
|
|
|
/// A specialized `Error` for Ingester's Query errors
pub type Result<T, E = Error> = std::result::Result<T, E>;

/// Stream of snapshots.
///
/// Every snapshot is a dedicated [`SendableRecordBatchStream`].
/// Item errors surface as [`ArrowError`] so they can be forwarded over the
/// Flight protocol without extra conversion.
pub(crate) type SnapshotStream =
    Pin<Box<dyn Stream<Item = Result<SendableRecordBatchStream, ArrowError>> + Send>>;
|
|
|
|
/// Status of a partition that has unpersisted data.
///
/// Note that this structure is specific to a partition (which itself is bound to a table and
/// shard)!
#[derive(Debug, Clone, PartialEq, Eq)]
#[allow(missing_copy_implementations)]
pub struct PartitionStatus {
    /// Max sequence number persisted to parquet for this partition, if any
    /// persist operation has happened yet (`None` otherwise).
    pub parquet_max_sequence_number: Option<SequenceNumber>,
}
|
|
|
|
/// Response data for a single partition.
pub(crate) struct IngesterQueryPartition {
    /// Stream of snapshots.
    snapshots: SnapshotStream,

    /// Partition ID.
    id: PartitionId,

    /// Partition persistence status.
    status: PartitionStatus,
}
|
|
|
|
impl std::fmt::Debug for IngesterQueryPartition {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
f.debug_struct("IngesterQueryPartition")
|
|
.field("snapshots", &"<SNAPSHOT STREAM>")
|
|
.field("id", &self.id)
|
|
.field("status", &self.status)
|
|
.finish()
|
|
}
|
|
}
|
|
|
|
impl IngesterQueryPartition {
|
|
pub(crate) fn new(snapshots: SnapshotStream, id: PartitionId, status: PartitionStatus) -> Self {
|
|
Self {
|
|
snapshots,
|
|
id,
|
|
status,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Stream of partitions in this response.
///
/// Item errors are [`ArrowError`]s so partition-level failures can be
/// forwarded to the querier alongside data.
pub(crate) type IngesterQueryPartitionStream =
    Pin<Box<dyn Stream<Item = Result<IngesterQueryPartition, ArrowError>> + Send>>;
|
|
|
|
/// Response streams for querier<>ingester requests.
///
/// The data structure is constructed to allow lazy/streaming data generation. For easier
/// consumption according to the wire protocol, use the [`flatten`](Self::flatten) method.
pub struct IngesterQueryResponse {
    /// Stream of partitions.
    partitions: IngesterQueryPartitionStream,
}
|
|
|
|
impl std::fmt::Debug for IngesterQueryResponse {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
f.debug_struct("IngesterQueryResponse")
|
|
.field("partitions", &"<PARTITION STREAM>")
|
|
.finish()
|
|
}
|
|
}
|
|
|
|
/// Encode the partition-level metadata into a single [`FlightData`] message.
///
/// The partition ID and persistence status are serialized as protobuf
/// app-metadata; the IPC header is a `NONE` message (see
/// [`build_none_flight_msg`]) because this frame carries no Arrow payload.
///
/// # Errors
///
/// Returns a [`FlightError`] if the protobuf encoding fails.
fn encode_partition(
    // Partition ID.
    partition_id: PartitionId,
    // Partition persistence status.
    status: PartitionStatus,
) -> std::result::Result<FlightData, FlightError> {
    use generated_types::influxdata::iox::ingester::v1 as proto;
    let mut bytes = bytes::BytesMut::new();
    let app_metadata = proto::IngesterQueryResponseMetadata {
        partition_id: partition_id.get(),
        status: Some(proto::PartitionStatus {
            parquet_max_sequence_number: status.parquet_max_sequence_number.map(|x| x.get()),
        }),
        // These fields are only used in ingester2.
        ingester_uuid: String::new(),
        completed_persistence_count: 0,
    };
    // Serialize the protobuf metadata into `bytes`.
    prost::Message::encode(&app_metadata, &mut bytes)
        .map_err(|e| FlightError::from_external_error(Box::new(e)))?;

    // Flight frame layout: no descriptor, NONE IPC header, protobuf
    // app-metadata, empty data body.
    Ok(FlightData::new(
        None,
        IpcMessage(build_none_flight_msg().into()),
        bytes.to_vec(),
        vec![],
    ))
}
|
|
|
|
/// Build the flatbuffers bytes of an Arrow IPC message with header type
/// `NONE` and a zero-length body.
///
/// Used by [`encode_partition`] as the IPC header for metadata-only Flight
/// frames that carry no record batch or schema.
fn build_none_flight_msg() -> Vec<u8> {
    let mut fbb = FlatBufferBuilder::new();

    let mut message = arrow::ipc::MessageBuilder::new(&mut fbb);
    message.add_version(arrow::ipc::MetadataVersion::V5);
    message.add_header_type(arrow::ipc::MessageHeader::NONE);
    message.add_bodyLength(0);

    let data = message.finish();
    fbb.finish(data, None);

    fbb.finished_data().to_vec()
}
|
|
|
|
impl IngesterQueryResponse {
    /// Make a response from a stream of partitions.
    pub(crate) fn new(partitions: IngesterQueryPartitionStream) -> Self {
        Self { partitions }
    }

    /// Flattens the stream of IngesterPartitions into a stream of FlightData
    ///
    /// For every partition, a metadata-only frame (see [`encode_partition`])
    /// is emitted first, followed by the encoded record batches of each of
    /// its snapshots. Errors at the partition or snapshot level are surfaced
    /// in-stream as [`FlightError`] items rather than terminating the stream
    /// construction.
    pub fn flatten(self) -> BoxStream<'static, std::result::Result<FlightData, FlightError>> {
        self.partitions
            .flat_map(|partition_res| match partition_res {
                Ok(partition) => {
                    // Frame 1: partition metadata header.
                    let head = futures::stream::once(async move {
                        encode_partition(partition.id, partition.status)
                    });

                    // Frames 2..n: every snapshot encoded as Flight data
                    // (schema + record batches).
                    let tail = partition
                        .snapshots
                        .flat_map(|snapshot_res| match snapshot_res {
                            Ok(snapshot) => {
                                let snapshot =
                                    snapshot.map_err(|e| FlightError::ExternalError(Box::new(e)));

                                FlightDataEncoderBuilder::new().build(snapshot).boxed()
                            }
                            Err(e) => {
                                futures::stream::once(async { Err(FlightError::Arrow(e)) }).boxed()
                            }
                        });

                    head.chain(tail).boxed()
                }
                Err(e) => futures::stream::once(async { Err(FlightError::Arrow(e)) }).boxed(),
            })
            .boxed()
    }

    /// Convert [`IngesterQueryResponse`] to a set of [`RecordBatch`]es.
    ///
    /// If the response contains multiple snapshots, this will merge the schemas into a single one
    /// and create NULL-columns for snapshots that miss columns.
    ///
    /// # Panic
    ///
    /// Panics if there are no batches returned at all. Also panics if the snapshot-scoped schemas
    /// do not line up with the snapshot-scoped record batches.
    pub async fn into_record_batches(self) -> Vec<RecordBatch> {
        // Schema of the snapshot currently being decoded; set by the most
        // recent `Schema` payload.
        let mut snapshot_schema = None;
        let mut batches = vec![];

        let mut stream = FlightDataDecoder::new(self.flatten());

        while let Some(msg) = stream.try_next().await.unwrap() {
            match msg.payload {
                // Partition metadata frames carry no Arrow payload; skip.
                DecodedPayload::None => {}
                DecodedPayload::RecordBatch(batch) => {
                    // Panics if a batch arrives before any schema, or does
                    // not match the schema of its snapshot.
                    let last_schema = snapshot_schema.as_ref().unwrap();
                    assert_eq!(&batch.schema(), last_schema);
                    batches.push(batch);
                }
                DecodedPayload::Schema(schema) => {
                    snapshot_schema = Some(schema);
                }
            }
        }

        assert!(!batches.is_empty());

        equalize_batch_schemas(batches).unwrap()
    }
}
|
|
|
|
/// Element within the flat wire protocol.
///
/// Messages appear in the order: a [`StartPartition`](Self::StartPartition),
/// then per snapshot a [`StartSnapshot`](Self::StartSnapshot) followed by its
/// [`RecordBatch`](Self::RecordBatch) messages.
#[derive(Debug, PartialEq)]
pub enum FlatIngesterQueryResponse {
    /// Start a new partition.
    StartPartition {
        /// Partition ID.
        partition_id: PartitionId,

        /// Partition persistence status.
        status: PartitionStatus,
    },

    /// Start a new snapshot.
    ///
    /// The snapshot belongs to the partition of the last [`StartPartition`](Self::StartPartition)
    /// message.
    StartSnapshot {
        /// Snapshot schema.
        schema: Arc<arrow::datatypes::Schema>,
    },

    /// Add a record batch to the snapshot that was announced by the last
    /// [`StartSnapshot`](Self::StartSnapshot) message.
    RecordBatch {
        /// Record batch.
        batch: RecordBatch,
    },
}
|
|
|
|
/// Return data to send as a response back to the Querier per its request
///
/// Scans all shards for the requested namespace/table, collects unpersisted
/// partition data (applying the request's column projection if present), and
/// wraps it in a lazily-evaluated [`IngesterQueryResponse`].
///
/// # Errors
///
/// - [`Error::NamespaceNotFound`] if no shard knows the namespace.
/// - [`Error::TableNotFound`] if the namespace exists but the table does not.
pub async fn prepare_data_to_querier(
    ingest_data: &Arc<IngesterData>,
    request: &Arc<IngesterQueryRequest>,
    span: Option<Span>,
) -> Result<IngesterQueryResponse> {
    debug!(?request, "prepare_data_to_querier");

    let mut span_recorder = SpanRecorder::new(span);

    // Table data handles found across all shards; a table may live on
    // multiple shards, hence a Vec.
    let mut table_refs = vec![];
    let mut found_namespace = false;

    for (shard_id, shard_data) in ingest_data.shards() {
        let namespace_data = match shard_data.namespace(request.namespace_id) {
            Some(namespace_data) => {
                trace!(
                    shard_id=%shard_id.get(),
                    namespace_id=%request.namespace_id,
                    "found namespace"
                );
                found_namespace = true;
                namespace_data
            }
            None => {
                continue;
            }
        };

        if let Some(table_data) = namespace_data.table(request.table_id) {
            trace!(
                shard_id=%shard_id.get(),
                namespace_id=%request.namespace_id,
                table_id=%request.table_id,
                "found table"
            );
            table_refs.push(table_data);
        }
    }

    // Distinguish "namespace unknown" from "namespace known but table
    // missing" so the querier gets a precise error.
    ensure!(
        found_namespace,
        NamespaceNotFoundSnafu {
            namespace_id: request.namespace_id,
        },
    );

    ensure!(
        !table_refs.is_empty(),
        TableNotFoundSnafu {
            namespace_id: request.namespace_id,
            table_id: request.table_id
        },
    );

    // acquire locks and read table data in parallel
    let unpersisted_partitions: Vec<_> = futures::stream::iter(table_refs)
        .map(|table_data| async move {
            table_data
                .partitions()
                .into_iter()
                .map(|p| {
                    // Hold the partition lock only long enough to snapshot
                    // the data we need.
                    let mut p = p.lock();
                    (
                        p.partition_id(),
                        p.get_query_data(),
                        p.max_persisted_sequence_number(),
                    )
                })
                .collect::<Vec<_>>()
        })
        // Note: the order doesn't matter
        .buffer_unordered(CONCURRENT_TABLE_DATA_LOCKS)
        .concat()
        .await;

    // Clone the Arc so the closure below can own a reference to the request.
    let request = Arc::clone(request);
    let partitions = futures::stream::iter(unpersisted_partitions.into_iter().map(
        move |(partition_id, data, max_persisted_sequence_number)| {
            let snapshots = match data {
                // No unpersisted data for this partition: empty snapshot stream.
                None => Box::pin(futures::stream::empty()) as SnapshotStream,

                Some(batch) => {
                    assert_eq!(partition_id, batch.partition_id());

                    // Project the data if necessary
                    let columns = request
                        .columns
                        .iter()
                        .map(String::as_str)
                        .collect::<Vec<_>>();
                    let selection = if columns.is_empty() {
                        Projection::All
                    } else {
                        Projection::Some(columns.as_ref())
                    };

                    let snapshots = batch.project_selection(selection).into_iter().map(|batch| {
                        // Create a stream from the batch.
                        Ok(Box::pin(MemoryStream::new(vec![batch])) as SendableRecordBatchStream)
                    });

                    Box::pin(futures::stream::iter(snapshots)) as SnapshotStream
                }
            };

            // NOTE: the partition persist watermark MUST always be provided to
            // the querier for any partition that has performed (or is aware of)
            // a persist operation.
            //
            // This allows the querier to use the per-partition persist marker
            // when planning queries.
            Ok(IngesterQueryPartition::new(
                snapshots,
                partition_id,
                PartitionStatus {
                    parquet_max_sequence_number: max_persisted_sequence_number,
                },
            ))
        },
    ));

    span_recorder.ok("done");

    Ok(IngesterQueryResponse::new(Box::pin(partitions)))
}
|
|
|
|
#[cfg(test)]
mod tests {

    use arrow_util::assert_batches_sorted_eq;
    use assert_matches::assert_matches;
    use datafusion::prelude::{col, lit};
    use predicate::Predicate;

    use super::*;
    use crate::test_util::make_ingester_data;

    #[tokio::test]
    async fn test_prepare_data_to_querier() {
        test_helpers::maybe_start_logging();

        // Build one scenario per partition layout (single- and two-partition);
        // all scenarios must share the same namespace and table IDs.
        let mut table_id = None;
        let mut ns_id = None;
        let mut scenarios = vec![];
        for two_partitions in [false, true] {
            let (scenario, ns, table) = make_ingester_data(two_partitions).await;

            // Every scenario must report the same table/namespace IDs.
            let old = *table_id.get_or_insert(table);
            assert_eq!(old, table);
            let old = *ns_id.get_or_insert(ns);
            assert_eq!(old, ns);

            scenarios.push(Arc::new(scenario));
        }
        let table_id = table_id.unwrap();
        let ns_id = ns_id.unwrap();

        // read data from all scenarios without any filters
        let request = Arc::new(IngesterQueryRequest::new(ns_id, table_id, vec![], None));
        let expected = vec![
            "+------------+-----+------+--------------------------------+",
            "| city       | day | temp | time                           |",
            "+------------+-----+------+--------------------------------+",
            "| Andover    | mon |      | 1970-01-01T00:00:00.000000046Z |", // in group 1 - seq_num: 2
            "| Andover    | tue | 56.0 | 1970-01-01T00:00:00.000000030Z |", // in group 2 - seq_num: 3
            "| Boston     | mon |      | 1970-01-01T00:00:00.000000038Z |", // in group 1 - seq_num: 1
            "| Boston     | sun | 60.0 | 1970-01-01T00:00:00.000000036Z |", // in group 3 - seq_num: 5
            "| Medford    | sun | 55.0 | 1970-01-01T00:00:00.000000022Z |", // in group 4 - seq_num: 7
            "| Medford    | wed |      | 1970-01-01T00:00:00.000000026Z |", // in group 2 - seq_num: 4
            "| Reading    | mon | 58.0 | 1970-01-01T00:00:00.000000040Z |", // in group 4 - seq_num: 8
            "| Wilmington | mon |      | 1970-01-01T00:00:00.000000035Z |", // in group 3 - seq_num: 6
            "+------------+-----+------+--------------------------------+",
        ];
        for scenario in &scenarios {
            let result = prepare_data_to_querier(scenario, &request, None)
                .await
                .unwrap()
                .into_record_batches()
                .await;
            assert_batches_sorted_eq!(&expected, &result);
        }

        // read data from all scenarios and filter out column day
        // (unknown columns in the projection are silently ignored)
        let request = Arc::new(IngesterQueryRequest::new(
            ns_id,
            table_id,
            vec![
                "city".to_string(),
                "temp".to_string(),
                "time".to_string(),
                "a_column_that_does_not_exist".to_string(),
            ],
            None,
        ));
        let expected = vec![
            "+------------+------+--------------------------------+",
            "| city       | temp | time                           |",
            "+------------+------+--------------------------------+",
            "| Andover    |      | 1970-01-01T00:00:00.000000046Z |",
            "| Andover    | 56.0 | 1970-01-01T00:00:00.000000030Z |",
            "| Boston     |      | 1970-01-01T00:00:00.000000038Z |",
            "| Boston     | 60.0 | 1970-01-01T00:00:00.000000036Z |",
            "| Medford    |      | 1970-01-01T00:00:00.000000026Z |",
            "| Medford    | 55.0 | 1970-01-01T00:00:00.000000022Z |",
            "| Reading    | 58.0 | 1970-01-01T00:00:00.000000040Z |",
            "| Wilmington |      | 1970-01-01T00:00:00.000000035Z |",
            "+------------+------+--------------------------------+",
        ];
        for scenario in &scenarios {
            let result = prepare_data_to_querier(scenario, &request, None)
                .await
                .unwrap()
                .into_record_batches()
                .await;
            assert_batches_sorted_eq!(&expected, &result);
        }

        // read data from all scenarios, filter out column day, city Medford, time outside range [0, 42)
        let expr = col("city").not_eq(lit("Medford"));
        let pred = Predicate::default().with_expr(expr).with_range(0, 42);
        let request = Arc::new(IngesterQueryRequest::new(
            ns_id,
            table_id,
            vec!["city".to_string(), "temp".to_string(), "time".to_string()],
            Some(pred),
        ));
        // predicates and de-dup are NOT applied!, otherwise this would look like this:
        // let expected = vec![
        //     "+------------+------+--------------------------------+",
        //     "| city       | temp | time                           |",
        //     "+------------+------+--------------------------------+",
        //     "| Andover    | 56   | 1970-01-01T00:00:00.000000030Z |",
        //     "| Boston     |      | 1970-01-01T00:00:00.000000038Z |",
        //     "| Boston     | 60   | 1970-01-01T00:00:00.000000036Z |",
        //     "| Reading    | 58   | 1970-01-01T00:00:00.000000040Z |",
        //     "| Wilmington |      | 1970-01-01T00:00:00.000000035Z |",
        //     "+------------+------+--------------------------------+",
        // ];
        let expected = vec![
            "+------------+------+--------------------------------+",
            "| city       | temp | time                           |",
            "+------------+------+--------------------------------+",
            "| Andover    |      | 1970-01-01T00:00:00.000000046Z |",
            "| Andover    | 56.0 | 1970-01-01T00:00:00.000000030Z |",
            "| Boston     |      | 1970-01-01T00:00:00.000000038Z |",
            "| Boston     | 60.0 | 1970-01-01T00:00:00.000000036Z |",
            "| Medford    |      | 1970-01-01T00:00:00.000000026Z |",
            "| Medford    | 55.0 | 1970-01-01T00:00:00.000000022Z |",
            "| Reading    | 58.0 | 1970-01-01T00:00:00.000000040Z |",
            "| Wilmington |      | 1970-01-01T00:00:00.000000035Z |",
            "+------------+------+--------------------------------+",
        ];
        for scenario in &scenarios {
            let result = prepare_data_to_querier(scenario, &request, None)
                .await
                .unwrap()
                .into_record_batches()
                .await;
            assert_batches_sorted_eq!(&expected, &result);
        }

        // test "table not found" handling
        let request = Arc::new(IngesterQueryRequest::new(
            ns_id,
            TableId::new(i64::MAX),
            vec![],
            None,
        ));
        for scenario in &scenarios {
            let err = prepare_data_to_querier(scenario, &request, None)
                .await
                .unwrap_err();
            assert_matches!(err, Error::TableNotFound { .. });
        }

        // test "namespace not found" handling
        let request = Arc::new(IngesterQueryRequest::new(
            NamespaceId::new(i64::MAX),
            table_id,
            vec![],
            None,
        ));
        for scenario in &scenarios {
            let err = prepare_data_to_querier(scenario, &request, None)
                .await
                .unwrap_err();
            assert_matches!(err, Error::NamespaceNotFound { .. });
        }
    }
}
|