influxdb/db/src/chunk.rs

use super::{
catalog::chunk::ChunkMetadata, pred::to_read_buffer_predicate, streams::ReadFilterResultsStream,
};
use data_types::chunk_metadata::ChunkAddr;
use data_types::{
chunk_metadata::{ChunkId, ChunkOrder},
delete_predicate::DeletePredicate,
partition_metadata,
};
use datafusion::physical_plan::SendableRecordBatchStream;
use datafusion_util::MemoryStream;
use internal_types::access::AccessRecorder;
use iox_object_store::ParquetFilePath;
use mutable_buffer::snapshot::ChunkSnapshot;
use observability_deps::tracing::debug;
use parquet_file::chunk::ParquetChunk;
use partition_metadata::TableSummary;
use predicate::{Predicate, PredicateMatch};
use query::exec::IOxSessionContext;
use query::QueryChunkError;
use query::{exec::stringset::StringSet, QueryChunk, QueryChunkMeta};
use read_buffer::RBChunk;
use schema::InfluxColumnType;
use schema::{selection::Selection, sort::SortKey, Schema};
use snafu::{OptionExt, ResultExt, Snafu};
use std::{
collections::{BTreeMap, BTreeSet},
sync::Arc,
};
use time::Time;

#[allow(clippy::enum_variant_names)]
#[derive(Debug, Snafu)]
pub enum Error {
#[snafu(display("Mutable Buffer Chunk Error: {}", source))]
MutableBufferChunk {
source: mutable_buffer::snapshot::Error,
},
#[snafu(display("Read Buffer Error in chunk {}: {}", chunk_id, source))]
ReadBufferChunkError {
source: read_buffer::Error,
chunk_id: ChunkId,
},
#[snafu(display("Read Buffer Error in chunk {}: {}", chunk_id, msg))]
ReadBufferError { chunk_id: ChunkId, msg: String },
#[snafu(display("Parquet File Error in chunk {}: {}", chunk_id, source))]
ParquetFileChunkError {
source: parquet_file::chunk::Error,
chunk_id: ChunkId,
},
#[snafu(display("Internal error restricting schema: {}", source))]
InternalSelectingSchema { source: schema::Error },
#[snafu(display("Predicate conversion error: {}", source))]
PredicateConversion { source: super::pred::Error },
#[snafu(display(
"Internal error: mutable buffer does not support predicate pushdown, but got: {:?}",
predicate
))]
InternalPredicateNotSupported { predicate: Predicate },
#[snafu(display("internal error creating plan: {}", source))]
InternalPlanCreation {
source: datafusion::error::DataFusionError,
},
#[snafu(display("arrow conversion error: {}", source))]
ArrowConversion { source: arrow::error::ArrowError },
}

pub type Result<T, E = Error> = std::result::Result<T, E>;

/// An IOx `DbChunk` wraps data living in one of three places:
/// the Mutable Buffer, the Read Buffer, or a Parquet file.
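///
/// A hypothetical usage sketch; the `catalog_chunk` variable and its
/// setup are assumed rather than shown:
///
/// ```ignore
/// // Take an immutable snapshot of a catalog chunk for query execution.
/// let db_chunk = DbChunk::snapshot(&catalog_chunk);
///
/// // If the chunk is backed by a parquet file, its path is available.
/// if let Some(path) = db_chunk.object_store_path() {
///     println!("persisted at {:?}", path);
/// }
/// ```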
#[derive(Debug)]
pub struct DbChunk {
addr: ChunkAddr,
access_recorder: AccessRecorder,
state: State,
meta: Arc<ChunkMetadata>,
time_of_first_write: Time,
time_of_last_write: Time,
order: ChunkOrder,
}

#[derive(Debug)]
enum State {
MutableBuffer { chunk: Arc<ChunkSnapshot> },
ReadBuffer { chunk: Arc<RBChunk> },
ParquetFile { chunk: Arc<ParquetChunk> },
}

impl State {
fn state_name(&self) -> &'static str {
match self {
State::MutableBuffer { .. } => "Mutable Buffer",
State::ReadBuffer { .. } => "Read Buffer",
State::ParquetFile { .. } => "Parquet",
}
}
}

impl DbChunk {
/// Create a `DbChunk` snapshot of the given catalog chunk.
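///
/// The mapping from catalog stage to snapshot state: an `Open` chunk
/// snapshots its mutable buffer; a `Frozen` chunk reuses whichever
/// frozen representation it holds; a `Persisted` chunk prefers its Read
/// Buffer representation when one is loaded and falls back to the
/// Parquet file otherwise.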
pub fn snapshot(chunk: &super::catalog::chunk::CatalogChunk) -> Arc<Self> {
let addr = chunk.addr().clone();
use super::catalog::chunk::{ChunkStage, ChunkStageFrozenRepr};
let (state, meta) = match chunk.stage() {
ChunkStage::Open {
mb_chunk,
time_of_first_write,
time_of_last_write,
} => {
let (snapshot, just_cached) = mb_chunk.snapshot();
// the snapshot may have just been created and cached, in which case the
// chunk metrics need to be updated
if just_cached {
chunk.update_metrics();
}
let state = State::MutableBuffer {
chunk: Arc::clone(&snapshot),
};
let meta = ChunkMetadata {
table_summary: Arc::new(mb_chunk.table_summary()),
schema: snapshot.full_schema(),
delete_predicates: vec![], // an open chunk has no delete predicates
time_of_first_write: *time_of_first_write,
time_of_last_write: *time_of_last_write,
sort_key: None,
};
(state, Arc::new(meta))
}
ChunkStage::Frozen {
representation,
meta,
..
} => {
let state = match &representation {
ChunkStageFrozenRepr::MutableBufferSnapshot(snapshot) => State::MutableBuffer {
chunk: Arc::clone(snapshot),
},
ChunkStageFrozenRepr::ReadBuffer(repr) => State::ReadBuffer {
chunk: Arc::clone(repr),
},
};
(state, Arc::clone(meta))
}
ChunkStage::Persisted {
parquet,
read_buffer,
meta,
..
} => {
let state = if let Some(read_buffer) = &read_buffer {
State::ReadBuffer {
chunk: Arc::clone(read_buffer),
}
} else {
State::ParquetFile {
chunk: Arc::clone(parquet),
}
};
(state, Arc::clone(meta))
}
};
Arc::new(Self {
addr,
access_recorder: chunk.access_recorder().clone(),
state,
meta,
time_of_first_write: chunk.time_of_first_write(),
time_of_last_write: chunk.time_of_last_write(),
order: chunk.order(),
})
}

/// Return a snapshot of this chunk's Parquet representation.
///
/// Unlike [`Self::snapshot`], which prefers the Read Buffer
/// representation of a `Persisted` chunk whenever one is loaded, this
/// always returns the `ParquetFile` state.
///
/// # Panics
///
/// Panics if the chunk's stage is not `Persisted`.
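///
/// A hypothetical call site (this pattern appears in the tests below):
///
/// ```ignore
/// assert!(matches!(chunk.stage(), ChunkStage::Persisted { .. }));
/// let parquet_view = DbChunk::parquet_file_snapshot(&chunk);
/// ```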
pub fn parquet_file_snapshot(chunk: &super::catalog::chunk::CatalogChunk) -> Arc<Self> {
use super::catalog::chunk::ChunkStage;
let (state, meta) = match chunk.stage() {
ChunkStage::Persisted { parquet, meta, .. } => {
let chunk = Arc::clone(parquet);
let state = State::ParquetFile { chunk };
(state, Arc::clone(meta))
}
_ => {
panic!("Internal error: This chunk's stage is not Persisted");
}
};
Arc::new(Self {
addr: chunk.addr().clone(),
meta,
state,
access_recorder: chunk.access_recorder().clone(),
time_of_first_write: chunk.time_of_first_write(),
time_of_last_write: chunk.time_of_last_write(),
order: chunk.order(),
})
}

/// Return the Path in ObjectStorage where this chunk is
/// persisted, if any
pub fn object_store_path(&self) -> Option<&ParquetFilePath> {
match &self.state {
State::ParquetFile { chunk } => Some(chunk.path()),
_ => None,
}
}

/// Returns the contained `ParquetChunk`, if this chunk is stored as parquet
pub fn parquet_chunk(&self) -> Option<&Arc<ParquetChunk>> {
match &self.state {
State::ParquetFile { chunk } => Some(chunk),
_ => None,
}
}

/// Return the address of this chunk
pub fn addr(&self) -> &ChunkAddr {
&self.addr
}

/// Return the name of the table in this chunk
pub fn table_name(&self) -> &Arc<str> {
&self.addr.table_name
}

pub fn time_of_first_write(&self) -> Time {
self.time_of_first_write
}

pub fn time_of_last_write(&self) -> Time {
self.time_of_last_write
}

/// Convert delete predicates into Read Buffer predicates, to be pushed
/// down to the Read Buffer in negated form.
///
/// NOTE: a valid Read Buffer predicate is not guaranteed to be
/// applicable to an arbitrary Read Buffer chunk, because applicability
/// depends on the schema of the chunk. Callers should validate
/// predicates against the chunks they will be executed against using
/// `read_buffer::Chunk::validate_predicate`.
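///
/// A minimal sketch of that validation step, assuming `delete_predicates:
/// &[Arc<Predicate>]` and an `rb_chunk: &RBChunk` the predicates will run
/// against:
///
/// ```ignore
/// let rub_preds = DbChunk::to_rub_negated_predicates(&delete_predicates)?;
/// let applicable: Vec<_> = rub_preds
///     .into_iter()
///     // drop any predicate this particular chunk cannot evaluate
///     .filter_map(|p| rb_chunk.validate_predicate(p).ok())
///     .collect();
/// ```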
fn to_rub_negated_predicates(
delete_predicates: &[Arc<Predicate>],
) -> Result<Vec<read_buffer::Predicate>> {
let mut rub_preds: Vec<read_buffer::Predicate> = vec![];
for pred in delete_predicates {
let rub_pred = to_read_buffer_predicate(pred).context(PredicateConversionSnafu)?;
rub_preds.push(rub_pred);
}
debug!(?rub_preds, "RUB delete predicates");
Ok(rub_preds)
}

/// Return true if any of the fields selected by `predicate` may contain
/// at least one null value. Returns false ONLY when the statistics show
/// that every field passing `predicate` is entirely non-null.
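///
/// For example, if `predicate` includes field `bar` and the table
/// summary reports a non-zero `null_count` for `bar` (or carries no
/// statistics for it at all), this returns true, and the caller must
/// downgrade a metadata match to `PredicateMatch::Unknown` rather than
/// claim a non-null field value.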
fn fields_have_nulls(&self, predicate: &Predicate) -> bool {
self.meta.schema.iter().any(|(influx_column_type, field)| {
if matches!(influx_column_type, Some(InfluxColumnType::Field(_)))
&& predicate.should_include_field(field.name())
{
match self
.meta
.table_summary
.column(field.name())
.and_then(|column_summary| column_summary.null_count())
{
Some(null_count) => {
// only if this is false can we return false
null_count > 0
}
None => {
// don't know the stats for this column, so assume there can be nulls
true
}
}
} else {
// not a field column
false
}
})
}
}

impl QueryChunk for DbChunk {
fn id(&self) -> ChunkId {
self.addr.chunk_id
}

fn addr(&self) -> ChunkAddr {
self.addr.clone()
}

fn table_name(&self) -> &str {
self.addr.table_name.as_ref()
}

fn may_contain_pk_duplicates(&self) -> bool {
// Assume that only data in the MUB can contain duplicates
// within itself, as it holds the raw incoming stream of writes.
//
// All other chunk types are deduplicated by the reorganization
// plan run as part of their creation.
matches!(self.state, State::MutableBuffer { .. })
}

fn apply_predicate_to_metadata(
&self,
predicate: &Predicate,
) -> Result<PredicateMatch, QueryChunkError> {
let pred_result = match &self.state {
State::MutableBuffer { chunk, .. } => {
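// A predicate without general expressions can be evaluated against
// chunk metadata alone: if the chunk holds no data in the
// predicate's time range, the result is provably empty.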
if predicate.has_exprs() || chunk.has_timerange(&predicate.range) {
// TODO: some more work is needed to figure out whether we
// definitely have / do not have a result
PredicateMatch::Unknown
} else {
PredicateMatch::Zero
}
}
State::ReadBuffer { chunk, .. } => {
// If the predicate is not supported by the Read Buffer then
// we can't determine whether it can be answered from
// meta-data alone. A future improvement could be to apply this
// logic to chunk meta-data without involving the backing
// execution engine.
let rb_predicate = match to_read_buffer_predicate(predicate) {
Ok(rb_predicate) => rb_predicate,
Err(e) => {
debug!(?predicate, %e, "Cannot push down predicate to RUB, will fully scan");
return Ok(PredicateMatch::Unknown);
}
};
// TODO: currently this will provide an exact answer, which may
// be expensive in pathological cases. It might make more
// sense to implement logic that works to rule out chunks based
// on meta-data only. This should be possible without needing to
// know the execution engine the chunk is held in.
if chunk.satisfies_predicate(&rb_predicate) {
// if any of the fields referred to in the predicate
// contain nulls, we can't know without more work whether
// the rows that matched carried non-null values
if self.fields_have_nulls(predicate) {
PredicateMatch::Unknown
} else {
PredicateMatch::AtLeastOneNonNullField
}
} else {
PredicateMatch::Zero
}
}
State::ParquetFile { chunk, .. } => {
if predicate.has_exprs() || chunk.has_timerange(predicate.range.as_ref()) {
PredicateMatch::Unknown
} else {
PredicateMatch::Zero
}
}
};
Ok(pred_result)
}

fn read_filter(
&self,
mut ctx: IOxSessionContext,
predicate: &Predicate,
selection: Selection<'_>,
) -> Result<SendableRecordBatchStream, QueryChunkError> {
// The predicate is not required for correctness; it is only pushed
// down, where possible, as a performance optimization
debug!(table=?self.table_name(), chunk_id=%self.addr().chunk_id, ?predicate, ?selection, "read_filter called");
self.access_recorder.record_access();
let delete_predicates: Vec<_> = self
.meta
.delete_predicates
.iter()
.map(|pred| Arc::new(pred.as_ref().clone().into()))
.collect();
ctx.set_metadata("delete_predicates", delete_predicates.len() as i64);
// merge the negated delete predicates into the select predicate
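// (conceptually, a select predicate `P` combined with delete predicates
// `D1..Dn` evaluates as `P AND NOT D1 AND ... AND NOT Dn`; the exact
// expression form is built by `merge_delete_predicates`)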
let mut pred_with_deleted_exprs = predicate.clone();
pred_with_deleted_exprs.merge_delete_predicates(&delete_predicates);
debug!(?pred_with_deleted_exprs, "Merged negated predicate");
ctx.set_metadata("storage", self.state.state_name());
ctx.set_metadata("projection", format!("{}", selection));
match &self.state {
State::MutableBuffer { chunk, .. } => {
let batch = chunk
.read_filter(selection)
.context(MutableBufferChunkSnafu)?;
Ok(Box::pin(MemoryStream::new(vec![batch])))
}
State::ReadBuffer { chunk, .. } => {
// Only apply pushdownable predicates
let rb_predicate = chunk
// A predicate unsupported by the Read Buffer or against
// this chunk's schema is replaced with a default empty
// predicate.
.validate_predicate(to_read_buffer_predicate(predicate).unwrap_or_default())
.unwrap_or_default();
debug!(?rb_predicate, "RUB predicate");
ctx.set_metadata("predicate", format!("{}", &rb_predicate));
// convert all delete predicates into the RUB's negated representation
let negated_delete_exprs = Self::to_rub_negated_predicates(&delete_predicates)?
.into_iter()
// Any delete predicates unsupported by the Read Buffer will be elided.
.filter_map(|p| chunk.validate_predicate(p).ok())
.collect::<Vec<_>>();
debug!(
?negated_delete_exprs,
"Negated Predicate pushed down to RUB"
);
let read_results = chunk
.read_filter(rb_predicate, selection, negated_delete_exprs)
.context(ReadBufferChunkSnafu {
chunk_id: self.id(),
})?;
let schema =
chunk
.read_filter_table_schema(selection)
.context(ReadBufferChunkSnafu {
chunk_id: self.id(),
})?;
Ok(Box::pin(ReadFilterResultsStream::new(
ctx,
read_results,
schema.into(),
)))
}
State::ParquetFile { chunk, .. } => {
ctx.set_metadata("predicate", format!("{}", &pred_with_deleted_exprs));
chunk
.read_filter(&pred_with_deleted_exprs, selection)
.context(ParquetFileChunkSnafu {
chunk_id: self.id(),
})
.map_err(|e| Box::new(e) as _)
}
}
}

fn column_names(
&self,
mut ctx: IOxSessionContext,
predicate: &Predicate,
columns: Selection<'_>,
) -> Result<Option<StringSet>, QueryChunkError> {
ctx.set_metadata("storage", self.state.state_name());
ctx.set_metadata("projection", format!("{}", columns));
ctx.set_metadata("predicate", format!("{}", &predicate));
match &self.state {
State::MutableBuffer { chunk, .. } => {
if !predicate.is_empty() {
// TODO: Support predicates
return Ok(None);
}
self.access_recorder.record_access();
Ok(chunk.column_names(columns))
}
State::ReadBuffer { chunk, .. } => {
let rb_predicate = match to_read_buffer_predicate(predicate) {
Ok(rb_predicate) => rb_predicate,
Err(e) => {
debug!(?predicate, %e, "read buffer predicate not supported for column_names, falling back");
return Ok(None);
}
};
ctx.set_metadata("rb_predicate", format!("{}", &rb_predicate));
self.access_recorder.record_access();
// TODO(edd): wire up delete predicates to be pushed down to
// the read buffer.
let names = chunk
.column_names(rb_predicate, vec![], columns, BTreeSet::new())
.context(ReadBufferChunkSnafu {
chunk_id: self.id(),
})?;
ctx.set_metadata("output_values", names.len() as i64);
Ok(Some(names))
}
State::ParquetFile { chunk, .. } => {
if !predicate.is_empty() {
// TODO: Support predicates
return Ok(None);
}
self.access_recorder.record_access();
Ok(chunk.column_names(columns))
}
}
}

fn column_values(
&self,
mut ctx: IOxSessionContext,
column_name: &str,
predicate: &Predicate,
) -> Result<Option<StringSet>, QueryChunkError> {
ctx.set_metadata("storage", self.state.state_name());
ctx.set_metadata("column_name", column_name.to_string());
ctx.set_metadata("predicate", format!("{}", &predicate));
match &self.state {
State::MutableBuffer { .. } => {
// There is no advantage to manually implementing this
// vs just letting DataFusion do its thing
Ok(None)
}
State::ReadBuffer { chunk, .. } => {
let rb_predicate = match to_read_buffer_predicate(predicate) {
Ok(rb_predicate) => rb_predicate,
Err(e) => {
debug!(?predicate, %e, "read buffer predicate not supported for column_values, falling back");
return Ok(None);
}
};
ctx.set_metadata("rb_predicate", format!("{}", &rb_predicate));
self.access_recorder.record_access();
let mut values = chunk
.column_values(
rb_predicate,
Selection::Some(&[column_name]),
BTreeMap::new(),
)
.context(ReadBufferChunkSnafu {
chunk_id: self.id(),
})?;
// The InfluxRPC frontend only supports getting column values
// for one column at a time (this is a restriction of the Influx
// Read gRPC API too). However, the Read Buffer supports multiple
// columns and returns a map - we just need to pull the requested
// column out to get its set of values.
let values = values
.remove(column_name)
.with_context(|| ReadBufferSnafu {
chunk_id: self.id(),
msg: format!(
"failed to find column_name {:?} in results of tag_values",
column_name
),
})?;
ctx.set_metadata("output_values", values.len() as i64);
Ok(Some(values))
}
State::ParquetFile { .. } => {
// Since DataFusion can read Parquet, there is no advantage to
// manually implementing this vs just letting DataFusion do its thing
Ok(None)
}
}
}

fn chunk_type(&self) -> &str {
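// Short storage-class labels: MUB = Mutable Buffer, RUB = Read Buffer,
// OS = Object Store (parquet file).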
match &self.state {
State::MutableBuffer { .. } => "MUB",
State::ReadBuffer { .. } => "RUB",
State::ParquetFile { .. } => "OS",
}
}

fn order(&self) -> ChunkOrder {
self.order
}
}

impl QueryChunkMeta for DbChunk {
fn summary(&self) -> Option<&TableSummary> {
Some(self.meta.table_summary.as_ref())
}

fn schema(&self) -> Arc<Schema> {
Arc::clone(&self.meta.schema)
}

fn sort_key(&self) -> Option<&SortKey> {
self.meta.sort_key.as_ref()
}

// Return a reference to the chunk's delete predicates
fn delete_predicates(&self) -> &[Arc<DeletePredicate>] {
let pred = &self.meta.delete_predicates;
debug!(?pred, "Delete predicate in DbChunk");
pred
}
}

#[cfg(test)]
mod tests {
use super::*;
use crate::{
catalog::chunk::{CatalogChunk, ChunkStage},
test_helpers::write_lp,
utils::make_db_time,
};
use data_types::chunk_metadata::ChunkStorage;
use std::time::Duration;
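
/// Drive a snapshot of `chunk` through `read_filter`, `column_names`,
/// and `column_values`, asserting that each successful query (and only a
/// successful query) bumps the chunk's access metrics with the mocked
/// time of the call, and that taking the snapshot itself records no
/// access.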
async fn test_chunk_access(chunk: &CatalogChunk, time: Arc<time::MockProvider>) {
let m1 = chunk.access_recorder().get_metrics();
let snapshot = DbChunk::snapshot(chunk);
let m2 = chunk.access_recorder().get_metrics();
let t1 = time.inc(Duration::from_secs(1));
snapshot
.read_filter(
IOxSessionContext::default(),
&Default::default(),
Selection::All,
)
.unwrap();
let m3 = chunk.access_recorder().get_metrics();
let t2 = time.inc(Duration::from_secs(1));
let column_names = snapshot
.column_names(
IOxSessionContext::default(),
&Default::default(),
Selection::All,
)
.unwrap()
.is_some();
let m4 = chunk.access_recorder().get_metrics();
let t3 = time.inc(Duration::from_secs(1));
let column_values = snapshot
.column_values(IOxSessionContext::default(), "tag", &Default::default())
.unwrap()
.is_some();
let m5 = chunk.access_recorder().get_metrics();
// Snapshot shouldn't count as an access
assert_eq!(m1, m2);
// Query should count as an access
assert_eq!(m2.count + 1, m3.count);
assert!(m2.last_access < m3.last_access);
assert_eq!(m3.last_access, t1);
// If column_names succeeded, it should have recorded an access
match column_names {
true => {
assert_eq!(m3.count + 1, m4.count);
assert_eq!(m4.last_access, t2);
}
false => {
assert_eq!(m3, m4);
}
}
// If column_values succeeded, it should have recorded an access
match column_values {
true => {
assert_eq!(m4.count + 1, m5.count);
assert!(m4.last_access < m5.last_access);
assert_eq!(m5.last_access, t3);
}
false => {
assert_eq!(m4, m5);
}
}
}

#[tokio::test]
async fn mub_records_access() {
let (db, time) = make_db_time().await;
write_lp(&db, "cpu,tag=1 bar=1 1");
let chunks = db.catalog.chunks();
assert_eq!(chunks.len(), 1);
let chunk = chunks.into_iter().next().unwrap();
let chunk = chunk.read();
assert_eq!(chunk.storage().1, ChunkStorage::OpenMutableBuffer);
test_chunk_access(&chunk, time).await;
}

#[tokio::test]
async fn rub_records_access() {
let (db, time) = make_db_time().await;
write_lp(&db, "cpu,tag=1 bar=1 1");
db.compact_partition("cpu", "1970-01-01T00").await.unwrap();
let chunks = db.catalog.chunks();
assert_eq!(chunks.len(), 1);
let chunk = chunks.into_iter().next().unwrap();
let chunk = chunk.read();
assert_eq!(chunk.storage().1, ChunkStorage::ReadBuffer);
test_chunk_access(&chunk, time).await
}

#[tokio::test]
async fn parquet_records_access() {
let (db, time) = make_db_time().await;
let t0 = time.inc(Duration::from_secs(324));
write_lp(&db, "cpu,tag=1 bar=1 1");
let id = db
.persist_partition("cpu", "1970-01-01T00", true)
.await
.unwrap()
.unwrap()
.id();
db.unload_read_buffer("cpu", "1970-01-01T00", id).unwrap();
let chunks = db.catalog.chunks();
assert_eq!(chunks.len(), 1);
let chunk = chunks.into_iter().next().unwrap();
let chunk = chunk.read();
assert_eq!(chunk.storage().1, ChunkStorage::ObjectStoreOnly);
let first_write = chunk.time_of_first_write();
let last_write = chunk.time_of_last_write();
assert_eq!(first_write, t0);
assert_eq!(last_write, t0);
test_chunk_access(&chunk, time).await
}

#[tokio::test]
async fn parquet_snapshot() {
let (db, time) = make_db_time().await;
let w0 = time.inc(Duration::from_secs(10));
write_lp(&db, "cpu,tag=1 bar=1 1");
let w1 = time.inc(Duration::from_secs(10));
write_lp(&db, "cpu,tag=2 bar=2 2");
db.persist_partition("cpu", "1970-01-01T00", true)
.await
.unwrap();
let chunks = db.catalog.chunks();
assert_eq!(chunks.len(), 1);
let chunk = chunks.into_iter().next().unwrap();
let chunk = chunk.read();
assert!(matches!(chunk.stage(), ChunkStage::Persisted { .. }));
let snapshot = DbChunk::parquet_file_snapshot(&chunk);
let first_write = snapshot.time_of_first_write();
let last_write = snapshot.time_of_last_write();
assert_eq!(w0, first_write);
assert_eq!(w1, last_write);
}
}