influxdb/compactor/src/query.rs

378 lines
13 KiB
Rust

//! Queryable Compactor Data
use data_types::{
ChunkId, ChunkOrder, CompactionLevel, DeletePredicate, PartitionId, SequenceNumber,
TableSummary, Timestamp, TimestampMinMax, Tombstone,
};
use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream};
use iox_query::{
exec::{stringset::StringSet, IOxSessionContext},
QueryChunk, QueryChunkMeta,
};
use observability_deps::tracing::trace;
use parquet_file::chunk::ParquetChunk;
use predicate::{delete_predicate::tombstones_to_delete_predicates, Predicate};
use schema::{merge::SchemaMerger, selection::Selection, sort::SortKey, Schema};
use snafu::{ResultExt, Snafu};
use std::{any::Any, sync::Arc};
use uuid::Uuid;
#[derive(Debug, Snafu)]
#[allow(missing_copy_implementations, missing_docs)]
pub enum Error {
    /// Reading the parquet file data from object store failed.
    #[snafu(display("Failed to read parquet: {}", source))]
    ReadParquet {
        source: parquet_file::storage::ReadError,
    },

    /// Decoding the IOx metadata embedded in the parquet file failed.
    #[snafu(display(
        "Error reading IOx Metadata from Parquet IoxParquetMetadata: {}",
        source
    ))]
    ReadParquetMeta {
        source: parquet_file::storage::ReadError,
    },
}
/// A specialized `Error` for Compactor's query errors
pub type Result<T, E = Error> = std::result::Result<T, E>;
/// QueryableParquetChunk that implements QueryChunk and QueryMetaChunk for building query plan
#[derive(Debug, Clone)]
pub struct QueryableParquetChunk {
    // data of the parquet file
    data: Arc<ParquetChunk>,
    // converted from tombstones
    delete_predicates: Vec<Arc<DeletePredicate>>,
    // needed to build query plan
    table_name: String,
    // partition this parquet file belongs to
    partition_id: PartitionId,
    // highest sequence number persisted into this file; used for chunk ordering
    max_sequence_number: SequenceNumber,
    // min timestamp of rows in this file
    min_time: Timestamp,
    // max timestamp of rows in this file
    max_time: Timestamp,
    // sort key of this chunk's data, if sorted
    sort_key: Option<SortKey>,
    // sort key of the partition this chunk belongs to, if any
    partition_sort_key: Option<SortKey>,
    // compaction level the backing parquet file currently has
    compaction_level: CompactionLevel,
    /// The compaction level that this operation will be when finished. Chunks from files that have
    /// the same level as this should get chunk order 0 so that files at a lower compaction level
    /// (and thus created later) should have priority in deduplication.
    ///
    /// That is:
    ///
    /// * When compacting L0 + L1, the target level is L1. L0 files should have priority, so all L1
    ///   files should have chunk order 0 to be sorted first.
    /// * When compacting L1 + L2, the target level is L2. L1 files should have priority, so all L2
    ///   files should have chunk order 0 to be sorted first.
    target_level: CompactionLevel,
}
impl QueryableParquetChunk {
    /// Build a new [`QueryableParquetChunk`] over the given parquet data.
    ///
    /// The provided `deletes` tombstones are converted into delete predicates
    /// eagerly so query planning can consume them directly.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        table_name: impl Into<String>,
        partition_id: PartitionId,
        data: Arc<ParquetChunk>,
        deletes: &[Tombstone],
        max_sequence_number: SequenceNumber,
        min_time: Timestamp,
        max_time: Timestamp,
        sort_key: Option<SortKey>,
        partition_sort_key: Option<SortKey>,
        compaction_level: CompactionLevel,
        target_level: CompactionLevel,
    ) -> Self {
        Self {
            data,
            delete_predicates: tombstones_to_delete_predicates(deletes),
            table_name: table_name.into(),
            partition_id,
            max_sequence_number,
            min_time,
            max_time,
            sort_key,
            partition_sort_key,
            compaction_level,
            target_level,
        }
    }

    /// Compute the union schema of all the given chunks.
    ///
    /// Panics if any pair of chunk schemas is incompatible.
    pub fn merge_schemas(chunks: &[Arc<dyn QueryChunk>]) -> Arc<Schema> {
        chunks
            .iter()
            .fold(SchemaMerger::new(), |merged, chunk| {
                merged.merge(&chunk.schema()).expect("schemas compatible")
            })
            .build()
    }

    /// The highest sequence number persisted into this chunk's parquet file.
    pub fn max_sequence_number(&self) -> SequenceNumber {
        self.max_sequence_number
    }

    /// Minimum timestamp of this chunk's data as a raw `i64`.
    pub fn min_time(&self) -> i64 {
        self.min_time.get()
    }

    /// Maximum timestamp of this chunk's data as a raw `i64`.
    pub fn max_time(&self) -> i64 {
        self.max_time.get()
    }

    /// Object store UUID identifying the backing parquet file.
    pub fn object_store_id(&self) -> Uuid {
        self.data.object_store_id()
    }
}
impl QueryChunkMeta for QueryableParquetChunk {
    fn summary(&self) -> Option<Arc<TableSummary>> {
        // No table summary is kept for compactor chunks.
        None
    }

    fn schema(&self) -> Arc<Schema> {
        self.data.schema()
    }

    fn partition_id(&self) -> PartitionId {
        self.partition_id
    }

    fn partition_sort_key(&self) -> Option<&SortKey> {
        self.partition_sort_key.as_ref()
    }

    fn sort_key(&self) -> Option<&SortKey> {
        self.sort_key.as_ref()
    }

    fn delete_predicates(&self) -> &[Arc<DeletePredicate>] {
        &self.delete_predicates
    }

    fn timestamp_min_max(&self) -> Option<TimestampMinMax> {
        // The time range is always known for a parquet-backed chunk.
        Some(TimestampMinMax {
            min: self.min_time.get(),
            max: self.max_time.get(),
        })
    }
}
impl QueryChunk for QueryableParquetChunk {
    // This function is needed to distinguish the ParquetChunks further if they happen to have the
    // same creation order.
    // Ref: chunks.sort_unstable_by_key(|c| (c.order(), c.id())); in provider.rs
    // Note: The order of this QueryableParquetChunk is the parquet file's min_sequence_number which
    // will be the same for parquet files of splitted compacted data.
    //
    // This function returns the parquet file's min_time which will be always different for the
    // parquet files of same order/min_sequence_number and is good to order the parquet file
    fn id(&self) -> ChunkId {
        // When we need the order to split overlapped chunks, the ChunkOrder is already different.
        // ChunkId is used as tiebreaker does not matter much, so use the object store id
        self.object_store_id().into()
    }

    /// Returns the name of the table stored in this chunk
    fn table_name(&self) -> &str {
        &self.table_name
    }

    /// Returns true if the chunk may contain a duplicate "primary
    /// key" within itself
    fn may_contain_pk_duplicates(&self) -> bool {
        // data within this parquet chunk was deduplicated
        false
    }

    /// Returns a set of Strings with column names from the specified
    /// table that have at least one row that matches `predicate`, if
    /// the predicate can be evaluated entirely on the metadata of
    /// this Chunk. Returns `None` otherwise
    fn column_names(
        &self,
        _ctx: IOxSessionContext,
        _predicate: &Predicate,
        _columns: Selection<'_>,
    ) -> Result<Option<StringSet>, DataFusionError> {
        // Metadata-only evaluation is not supported by this chunk type.
        Ok(None)
    }

    /// Return a set of Strings containing the distinct values in the
    /// specified columns. If the predicate can be evaluated entirely
    /// on the metadata of this Chunk. Returns `None` otherwise
    ///
    /// The requested columns must all have String type.
    fn column_values(
        &self,
        _ctx: IOxSessionContext,
        _column_name: &str,
        _predicate: &Predicate,
    ) -> Result<Option<StringSet>, DataFusionError> {
        // Metadata-only evaluation is not supported by this chunk type.
        Ok(None)
    }

    /// Provides access to raw `QueryChunk` data as an
    /// asynchronous stream of `RecordBatch`es filtered by a *required*
    /// predicate. Note that not all chunks can evaluate all types of
    /// predicates and this function will return an error
    /// if requested to evaluate with a predicate that is not supported
    ///
    /// This is the analog of the `TableProvider` in DataFusion
    ///
    /// The reason we can't simply use the `TableProvider` trait
    /// directly is that the data for a particular Table lives in
    /// several chunks within a partition, so there needs to be an
    /// implementation of `TableProvider` that stitches together the
    /// streams from several different `QueryChunk`s.
    fn read_filter(
        &self,
        mut ctx: IOxSessionContext,
        predicate: &Predicate,
        selection: Selection<'_>,
    ) -> Result<SendableRecordBatchStream, DataFusionError> {
        ctx.set_metadata("storage", "compactor");
        // `to_string` over `format!("{}", ..)`: same Display output, idiomatic form
        ctx.set_metadata("projection", selection.to_string());
        trace!(?selection, "selection");
        self.data
            .read_filter(predicate, selection)
            .context(ReadParquetSnafu)
            .map_err(|e| DataFusionError::External(Box::new(e)))
    }

    /// Returns chunk type
    fn chunk_type(&self) -> &str {
        "QueryableParquetChunk"
    }

    // Order of the chunk so they can be deduplicated correctly
    fn order(&self) -> ChunkOrder {
        use CompactionLevel::*;
        match (self.target_level, self.compaction_level) {
            // Files of the same level as what they're being compacting into were created earlier,
            // so they should be sorted first so that files created later that haven't yet been
            // compacted to this level will have priority when resolving duplicates.
            (FileNonOverlapped, FileNonOverlapped) => ChunkOrder::new(0),
            (Final, Final) => ChunkOrder::new(0),

            // Files that haven't yet been compacted to the target level were created later and
            // should be sorted based on their max sequence number.
            (FileNonOverlapped, Initial) => ChunkOrder::new(self.max_sequence_number.get()),
            (Final, FileNonOverlapped) => ChunkOrder::new(self.max_sequence_number.get()),

            // These combinations of target compaction level and file compaction level are
            // invalid in this context given the current compaction algorithm.
            (Initial, _) => panic!("Can't compact into CompactionLevel::Initial"),
            (FileNonOverlapped, Final) => panic!(
                "Can't compact CompactionLevel::Final into CompactionLevel::FileNonOverlapped"
            ),
            (Final, Initial) => {
                panic!("Can't compact CompactionLevel::Initial into CompactionLevel::Final")
            }
        }
    }

    fn as_any(&self) -> &dyn Any {
        self
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use data_types::ColumnType;
    use iox_tests::util::{TestCatalog, TestParquetFileBuilder};
    use parquet_file::storage::ParquetStorage;

    /// Build a [`QueryableParquetChunk`] backed by a single one-row parquet
    /// file written through the test catalog, with the given file compaction
    /// level, target compaction level, and max sequence number.
    async fn test_setup(
        compaction_level: CompactionLevel,
        target_level: CompactionLevel,
        max_sequence_number: i64,
    ) -> QueryableParquetChunk {
        // Minimal catalog scaffolding: one namespace/shard/table/partition.
        let catalog = TestCatalog::new();
        let ns = catalog.create_namespace("ns").await;
        let shard = ns.create_shard(1).await;
        let table = ns.create_table("table").await;
        table.create_column("field_int", ColumnType::I64).await;
        table.create_column("tag1", ColumnType::Tag).await;
        table.create_column("time", ColumnType::Time).await;
        let partition = table
            .with_shard(&shard)
            .create_partition("2022-07-13")
            .await;

        // Single line-protocol row is enough; the tests only inspect ordering
        // metadata, not the parquet contents.
        let lp = vec!["table,tag1=WA field_int=1000i 8000"].join("\n");
        let builder = TestParquetFileBuilder::default()
            .with_line_protocol(&lp)
            .with_compaction_level(compaction_level)
            .with_max_seq(max_sequence_number);
        let file = partition.create_parquet_file(builder).await;
        let parquet_file = Arc::new(file.parquet_file);

        let parquet_chunk = Arc::new(ParquetChunk::new(
            Arc::clone(&parquet_file),
            Arc::new(table.schema().await),
            ParquetStorage::new(Arc::clone(&catalog.object_store)),
        ));

        // No tombstones and no sort keys; the chunk mirrors the file's metadata.
        QueryableParquetChunk::new(
            "table",
            partition.partition.id,
            parquet_chunk,
            &[],
            parquet_file.max_sequence_number,
            parquet_file.min_time,
            parquet_file.max_time,
            None,
            None,
            parquet_file.compaction_level,
            target_level,
        )
    }

    // L0 file compacted toward L1: order follows the max sequence number.
    #[tokio::test]
    async fn chunk_order_is_max_seq_when_compaction_level_0_and_target_level_1() {
        let chunk = test_setup(
            CompactionLevel::Initial,
            CompactionLevel::FileNonOverlapped,
            2,
        )
        .await;

        assert_eq!(chunk.order(), ChunkOrder::new(2));
    }

    // L1 file compacted toward L1: file is at target level, so order is 0.
    #[tokio::test]
    async fn chunk_order_is_0_when_compaction_level_1_and_target_level_1() {
        let chunk = test_setup(
            CompactionLevel::FileNonOverlapped,
            CompactionLevel::FileNonOverlapped,
            2,
        )
        .await;

        assert_eq!(chunk.order(), ChunkOrder::new(0));
    }

    // L1 file compacted toward L2: order follows the max sequence number.
    #[tokio::test]
    async fn chunk_order_is_max_seq_when_compaction_level_1_and_target_level_2() {
        let chunk = test_setup(
            CompactionLevel::FileNonOverlapped,
            CompactionLevel::Final,
            2,
        )
        .await;

        assert_eq!(chunk.order(), ChunkOrder::new(2));
    }

    // L2 file compacted toward L2: file is at target level, so order is 0.
    #[tokio::test]
    async fn chunk_order_is_0_when_compaction_level_2_and_target_level_2() {
        let chunk = test_setup(CompactionLevel::Final, CompactionLevel::Final, 2).await;

        assert_eq!(chunk.order(), ChunkOrder::new(0));
    }
}