feat: expr analyzer for buffer to filter table chunks

Trevor Hilton 2025-01-18 17:56:13 -05:00
parent 1d8d3d66fc
commit daa3fe700a
5 changed files with 593 additions and 358 deletions

@ -9,14 +9,28 @@ pub mod paths;
pub mod persister;
pub mod write_buffer;
use anyhow::Context;
use async_trait::async_trait;
use data_types::{NamespaceName, TimestampMinMax};
use datafusion::{catalog::Session, error::DataFusionError, prelude::Expr};
use datafusion::{
catalog::Session,
common::{Column, DFSchema},
error::DataFusionError,
execution::context::ExecutionProps,
physical_expr::{
analyze, create_physical_expr,
utils::{Guarantee, LiteralGuarantee},
AnalysisContext, ExprBoundaries,
},
prelude::Expr,
scalar::ScalarValue,
};
use hashbrown::{HashMap, HashSet};
use influxdb3_cache::{
distinct_cache::{CreateDistinctCacheArgs, DistinctCacheProvider},
last_cache::LastCacheProvider,
};
use influxdb3_catalog::catalog::{Catalog, CatalogSequenceNumber, DatabaseSchema};
use influxdb3_catalog::catalog::{Catalog, CatalogSequenceNumber, DatabaseSchema, TableDefinition};
use influxdb3_id::{ColumnId, DbId, ParquetFileId, SerdeVecMap, TableId};
use influxdb3_wal::{
DistinctCacheDefinition, LastCacheDefinition, SnapshotSequenceNumber, Wal,
@ -24,6 +38,8 @@ use influxdb3_wal::{
};
use iox_query::QueryChunk;
use iox_time::Time;
use observability_deps::tracing::{debug, info, warn};
use schema::{InfluxColumnType, TIME_COLUMN_NAME};
use serde::{Deserialize, Serialize};
use std::{fmt::Debug, sync::Arc, time::Duration};
use thiserror::Error;
@ -41,6 +57,9 @@ pub enum Error {
#[error("persister error: {0}")]
Persister(#[from] persister::Error),
#[error(transparent)]
Anyhow(#[from] anyhow::Error),
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
@ -90,7 +109,16 @@ pub trait Bufferer: Debug + Send + Sync + 'static {
fn wal(&self) -> Arc<dyn Wal>;
/// Returns the parquet files for a given database and table
fn parquet_files(&self, db_id: DbId, table_id: TableId) -> Vec<ParquetFile>;
fn parquet_files(&self, db_id: DbId, table_id: TableId) -> Vec<ParquetFile> {
self.parquet_files_filtered(db_id, table_id, &BufferFilter::default())
}
fn parquet_files_filtered(
&self,
db_id: DbId,
table_id: TableId,
filter: &BufferFilter,
) -> Vec<ParquetFile>;
/// A channel to watch for when new persisted snapshots are created
fn watch_persisted_snapshots(&self) -> tokio::sync::watch::Receiver<Option<PersistedSnapshot>>;
@ -462,6 +490,153 @@ pub(crate) mod test_help {
}
}
#[derive(Debug, Default)]
pub struct BufferFilter {
time_lower_bound: Option<i64>,
time_upper_bound: Option<i64>,
guarantees: HashMap<ColumnId, BufferGuarantee>,
}
#[derive(Debug)]
pub struct BufferGuarantee {
pub guarantee: Guarantee,
pub literals: HashSet<Arc<str>>,
}
impl BufferFilter {
pub fn generate(table_def: &Arc<TableDefinition>, exprs: &[Expr]) -> Result<Self> {
let mut time_lower_bound = None;
let mut time_upper_bound = None;
let arrow_schema = table_def.schema.as_arrow();
let mut guarantees = HashMap::new();
let df_schema = DFSchema::try_from(Arc::clone(&arrow_schema))
.context("table schema was not able to convert to datafusion schema")?;
let props = ExecutionProps::new();
info!(?exprs, "analyzing expressions");
for expr in exprs.iter().filter(|e| {
matches!(
e,
Expr::BinaryExpr(_) | Expr::Not(_) | Expr::Between(_) | Expr::InList(_)
)
}) {
let Ok(physical_expr) = create_physical_expr(expr, &df_schema, &props) else {
continue;
};
if expr
.column_refs()
.contains(&Column::new_unqualified(TIME_COLUMN_NAME))
{
debug!(">>> has time col expr");
let time_col_index = arrow_schema
.fields()
.iter()
.position(|f| f.name() == TIME_COLUMN_NAME)
.expect("table should have a time column");
// Determine time bounds, if provided:
let boundaries = ExprBoundaries::try_new_unbounded(&arrow_schema)
.context("unable to create unbounded expr boundaries on incoming expression")?;
let analysis = analyze(
&physical_expr,
AnalysisContext::new(boundaries),
&arrow_schema,
)
.inspect_err(|error| {
warn!(?physical_expr, ?arrow_schema, ?error, "failed to analyze")
})
.context("unable to analyze provided filters")?;
// Set the time boundaries from the analyzed expression if they have not already
// been set. If they have already been set, remove the bounds, because it is not
// clear how to evaluate multiple intervals.
if let Some(ExprBoundaries { interval, .. }) =
analysis.boundaries.get(time_col_index)
{
debug!(?interval, "time column interval from analysis");
if let ScalarValue::TimestampNanosecond(Some(lower), _) = interval.lower() {
if time_lower_bound.take().is_none() {
time_lower_bound.replace(*lower);
}
}
if let ScalarValue::TimestampNanosecond(Some(upper), _) = interval.upper() {
if time_upper_bound.take().is_none() {
time_upper_bound.replace(*upper);
}
}
}
}
// Determine any literal guarantees made on tag columns:
let literal_guarantees = LiteralGuarantee::analyze(&physical_expr);
for LiteralGuarantee {
column,
guarantee,
literals,
} in literal_guarantees
{
// NOTE: only string literals on tag columns are retained
// for matching against the buffer index:
let Some((column_id, InfluxColumnType::Tag)) = table_def
.column_definition(column.name())
.map(|def| (def.id, def.data_type))
else {
continue;
};
let literals = literals
.into_iter()
.filter_map(|l| match l {
ScalarValue::Utf8(Some(s)) | ScalarValue::Utf8View(Some(s)) => {
Some(Arc::<str>::from(s.as_str()))
}
_ => None,
})
.collect::<HashSet<Arc<str>>>();
guarantees
.entry(column_id)
.and_modify(|e: &mut BufferGuarantee| {
// NOTE: it seems unlikely that there would be
// multiple literal guarantees on a single
// column from the Expr set. But we handle
// that here:
use Guarantee::*;
match (e.guarantee, guarantee) {
(In, In) | (NotIn, NotIn) => {
e.literals = e.literals.union(&literals).cloned().collect()
}
(In, NotIn) => {
e.literals = e.literals.difference(&literals).cloned().collect()
}
(NotIn, In) => {
e.literals = literals.difference(&e.literals).cloned().collect()
}
}
})
.or_insert(BufferGuarantee {
guarantee,
literals,
});
}
}
Ok(Self {
time_lower_bound,
time_upper_bound,
guarantees,
})
}
pub fn test_time_stamp_min_max(&self, min: i64, max: i64) -> bool {
match (self.time_lower_bound, self.time_upper_bound) {
(None, None) => true,
(None, Some(u)) => min <= u,
(Some(l), None) => max >= l,
(Some(l), Some(u)) => min <= u && max >= l,
}
}
pub fn guarantees(&self) -> impl Iterator<Item = (&ColumnId, &BufferGuarantee)> {
self.guarantees.iter()
}
}
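The time-bound check above is a plain interval-overlap test: a chunk whose data spans [min, max] is kept whenever that range overlaps the filter's optional [lower, upper] bounds, and a missing bound is treated as unbounded. A minimal standalone sketch of the same predicate (the `overlaps` helper and the example bounds are hypothetical, not part of this change):

fn overlaps(lower: Option<i64>, upper: Option<i64>, min: i64, max: i64) -> bool {
    // Mirrors `BufferFilter::test_time_stamp_min_max`: open bounds always pass,
    // otherwise the chunk range and the filter range must intersect.
    match (lower, upper) {
        (None, None) => true,
        (None, Some(u)) => min <= u,
        (Some(l), None) => max >= l,
        (Some(l), Some(u)) => min <= u && max >= l,
    }
}

fn main() {
    // filter roughly equivalent to `time >= 100 AND time <= 200` (nanoseconds)
    assert!(overlaps(Some(100), Some(200), 150, 300)); // partial overlap is kept
    assert!(!overlaps(Some(100), Some(200), 201, 400)); // chunk entirely after the window
    assert!(overlaps(Some(100), None, 50, 100)); // touching the lower bound still matches
    assert!(overlaps(None, None, i64::MIN, i64::MAX)); // no bounds keeps everything
}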
#[cfg(test)]
mod tests {
use influxdb3_catalog::catalog::CatalogSequenceNumber;

@ -12,8 +12,8 @@ use crate::write_buffer::queryable_buffer::QueryableBuffer;
use crate::write_buffer::validator::WriteValidator;
use crate::{chunk::ParquetChunk, DatabaseManager};
use crate::{
BufferedWriteRequest, Bufferer, ChunkContainer, DistinctCacheManager, LastCacheManager,
ParquetFile, PersistedSnapshot, Precision, WriteBuffer, WriteLineError,
BufferFilter, BufferedWriteRequest, Bufferer, ChunkContainer, DistinctCacheManager,
LastCacheManager, ParquetFile, PersistedSnapshot, Precision, WriteBuffer, WriteLineError,
};
use async_trait::async_trait;
use data_types::{
@ -319,30 +319,35 @@ impl WriteBufferImpl {
DataFusionError::Execution(format!("database {} not found", database_name))
})?;
let (table_id, table_schema) =
db_schema.table_id_and_schema(table_name).ok_or_else(|| {
DataFusionError::Execution(format!(
"table {} not found in db {}",
table_name, database_name
))
})?;
let table_def = db_schema.table_definition(table_name).ok_or_else(|| {
DataFusionError::Execution(format!(
"table {} not found in db {}",
table_name, database_name
))
})?;
let buffer_filter = BufferFilter::generate(&table_def, filters)
.inspect_err(|error| warn!(?error, "buffer filter generation failed"))
.map_err(|error| DataFusionError::External(Box::new(error)))?;
let mut chunks = self.buffer.get_table_chunks(
Arc::clone(&db_schema),
table_name,
filters,
&buffer_filter,
projection,
ctx,
)?;
let parquet_files = self.persisted_files.get_files(db_schema.id, table_id);
let parquet_files =
self.persisted_files
.get_files(db_schema.id, table_def.table_id, &buffer_filter);
let mut chunk_order = chunks.len() as i64;
for parquet_file in parquet_files {
let parquet_chunk = parquet_chunk_from_file(
&parquet_file,
&table_schema,
&table_def.schema,
self.persister.object_store_url().clone(),
self.persister.object_store(),
chunk_order,
@ -427,8 +432,13 @@ impl Bufferer for WriteBufferImpl {
Arc::clone(&self.wal)
}
fn parquet_files(&self, db_id: DbId, table_id: TableId) -> Vec<ParquetFile> {
self.buffer.persisted_parquet_files(db_id, table_id)
fn parquet_files_filtered(
&self,
db_id: DbId,
table_id: TableId,
filter: &BufferFilter,
) -> Vec<ParquetFile> {
self.buffer.persisted_parquet_files(db_id, table_id, filter)
}
fn watch_persisted_snapshots(&self) -> Receiver<Option<PersistedSnapshot>> {
@ -2092,7 +2102,9 @@ mod tests {
verify_snapshot_count(1, &wbuf.persister).await;
// get the path for the created parquet file:
let persisted_files = wbuf.persisted_files().get_files(db_id, tbl_id);
let persisted_files =
wbuf.persisted_files()
.get_files(db_id, tbl_id, &BufferFilter::default());
assert_eq!(1, persisted_files.len());
let path = ObjPath::from(persisted_files[0].path.as_str());
@ -2198,7 +2210,9 @@ mod tests {
verify_snapshot_count(1, &wbuf.persister).await;
// get the path for the created parquet file:
let persisted_files = wbuf.persisted_files().get_files(db_id, tbl_id);
let persisted_files =
wbuf.persisted_files()
.get_files(db_id, tbl_id, &BufferFilter::default());
assert_eq!(1, persisted_files.len());
let path = ObjPath::from(persisted_files[0].path.as_str());

@ -2,6 +2,7 @@
//! When queries come in they will combine whatever chunks exist from `QueryableBuffer` with
//! the persisted files to get the full set of data to query.
use crate::BufferFilter;
use crate::{ParquetFile, PersistedSnapshot};
use hashbrown::HashMap;
use influxdb3_id::DbId;
@ -47,7 +48,12 @@ impl PersistedFiles {
}
/// Get the list of files for a given database and table, always returned in descending order of min_time
pub fn get_files(&self, db_id: DbId, table_id: TableId) -> Vec<ParquetFile> {
pub fn get_files(
&self,
db_id: DbId,
table_id: TableId,
filter: &BufferFilter,
) -> Vec<ParquetFile> {
let three_days_ago = (self.time_provider.now() - crate::THREE_DAYS).timestamp_nanos();
let mut files = {
let inner = self.inner.read();
@ -58,6 +64,7 @@ impl PersistedFiles {
.cloned()
.unwrap_or_default()
.into_iter()
.filter(|file| filter.test_time_stamp_min_max(file.min_time, file.max_time))
.filter(|file| file.min_time > three_days_ago)
.collect::<Vec<_>>()
};

@ -3,7 +3,7 @@ use crate::paths::ParquetFilePath;
use crate::persister::Persister;
use crate::write_buffer::persisted_files::PersistedFiles;
use crate::write_buffer::table_buffer::TableBuffer;
use crate::{ParquetFile, ParquetFileId, PersistedSnapshot};
use crate::{BufferFilter, ParquetFile, ParquetFileId, PersistedSnapshot};
use anyhow::Context;
use arrow::record_batch::RecordBatch;
use async_trait::async_trait;
@ -13,7 +13,6 @@ use data_types::{
};
use datafusion::catalog::Session;
use datafusion::common::DataFusionError;
use datafusion::logical_expr::Expr;
use datafusion_util::stream_from_batches;
use hashbrown::HashMap;
use influxdb3_cache::distinct_cache::DistinctCacheProvider;
@ -100,7 +99,7 @@ impl QueryableBuffer {
&self,
db_schema: Arc<DatabaseSchema>,
table_name: &str,
filters: &[Expr],
buffer_filter: &BufferFilter,
_projection: Option<&Vec<usize>>,
_ctx: &dyn Session,
) -> Result<Vec<Arc<dyn QueryChunk>>, DataFusionError> {
@ -120,7 +119,7 @@ impl QueryableBuffer {
};
Ok(table_buffer
.partitioned_record_batches(Arc::clone(&table_def), filters)
.partitioned_record_batches(Arc::clone(&table_def), buffer_filter)
.map_err(|e| DataFusionError::Execution(format!("error getting batches {}", e)))?
.into_iter()
.filter(|(_, (ts_min_max, _))| {
@ -416,8 +415,13 @@ impl QueryableBuffer {
receiver
}
pub fn persisted_parquet_files(&self, db_id: DbId, table_id: TableId) -> Vec<ParquetFile> {
self.persisted_files.get_files(db_id, table_id)
pub fn persisted_parquet_files(
&self,
db_id: DbId,
table_id: TableId,
filter: &BufferFilter,
) -> Vec<ParquetFile> {
self.persisted_files.get_files(db_id, table_id, filter)
}
pub fn persisted_snapshot_notify_rx(
@ -877,9 +881,11 @@ mod tests {
// validate we have a single persisted file
let db = catalog.db_schema("testdb").unwrap();
let table = db.table_definition("foo").unwrap();
let files = queryable_buffer
.persisted_files
.get_files(db.id, table.table_id);
let files = queryable_buffer.persisted_files.get_files(
db.id,
table.table_id,
&BufferFilter::default(),
);
assert_eq!(files.len(), 1);
// now force another snapshot, persisting the data to parquet file
@ -908,9 +914,11 @@ mod tests {
.unwrap();
// validate we have two persisted files
let files = queryable_buffer
.persisted_files
.get_files(db.id, table.table_id);
let files = queryable_buffer.persisted_files.get_files(
db.id,
table.table_id,
&BufferFilter::default(),
);
assert_eq!(files.len(), 2);
}
}

@ -8,20 +8,22 @@ use arrow::array::{
use arrow::datatypes::{GenericStringType, Int32Type};
use arrow::record_batch::RecordBatch;
use data_types::TimestampMinMax;
use datafusion::logical_expr::{BinaryExpr, Expr};
use hashbrown::HashMap;
use datafusion::physical_expr::utils::Guarantee;
use hashbrown::{HashMap, HashSet};
use influxdb3_catalog::catalog::TableDefinition;
use influxdb3_id::ColumnId;
use influxdb3_wal::{FieldData, Row};
use observability_deps::tracing::{debug, error};
use observability_deps::tracing::error;
use schema::sort::SortKey;
use schema::{InfluxColumnType, InfluxFieldType, Schema, SchemaBuilder};
use std::collections::btree_map::Entry;
use std::collections::{BTreeMap, HashSet};
use std::collections::BTreeMap;
use std::mem::size_of;
use std::sync::Arc;
use thiserror::Error;
use crate::{BufferFilter, BufferGuarantee};
#[derive(Debug, Error)]
pub enum Error {
#[error("Field not found in table buffer: {0}")]
@ -71,11 +73,13 @@ impl TableBuffer {
pub fn partitioned_record_batches(
&self,
table_def: Arc<TableDefinition>,
filter: &[Expr],
filter: &BufferFilter,
) -> Result<HashMap<i64, (TimestampMinMax, Vec<RecordBatch>)>> {
let mut batches = HashMap::new();
let schema = table_def.schema.as_arrow();
for sc in &self.snapshotting_chunks {
for sc in self.snapshotting_chunks.iter().filter(|sc| {
filter.test_time_stamp_min_max(sc.timestamp_min_max.min, sc.timestamp_min_max.max)
}) {
let cols: std::result::Result<Vec<_>, _> = schema
.fields()
.iter()
@ -95,7 +99,11 @@ impl TableBuffer {
*ts = ts.union(&sc.timestamp_min_max);
v.push(rb);
}
for (t, c) in &self.chunk_time_to_chunks {
for (t, c) in self
.chunk_time_to_chunks
.iter()
.filter(|(_, c)| filter.test_time_stamp_min_max(c.timestamp_min, c.timestamp_max))
{
let ts_min_max = TimestampMinMax::new(c.timestamp_min, c.timestamp_max);
let (ts, v) = batches
.entry(*t)
@ -106,40 +114,6 @@ impl TableBuffer {
Ok(batches)
}
pub fn record_batches(
&self,
table_def: Arc<TableDefinition>,
filter: &[Expr],
) -> Result<Vec<RecordBatch>> {
let mut batches =
Vec::with_capacity(self.snapshotting_chunks.len() + self.chunk_time_to_chunks.len());
let schema = table_def.schema.as_arrow();
for sc in &self.snapshotting_chunks {
let cols: std::result::Result<Vec<_>, _> = schema
.fields()
.iter()
.map(|f| {
let col = sc
.record_batch
.column_by_name(f.name())
.ok_or(Error::FieldNotFound(f.name().to_string()));
col.cloned()
})
.collect();
let cols = cols?;
let rb = RecordBatch::try_new(schema.clone(), cols)?;
batches.push(rb);
}
for c in self.chunk_time_to_chunks.values() {
batches.push(c.record_batch(Arc::clone(&table_def), filter)?)
}
Ok(batches)
}
pub fn timestamp_min_max(&self) -> TimestampMinMax {
let (min, max) = if self.chunk_time_to_chunks.is_empty() {
(0, 0)
@ -265,7 +239,6 @@ impl MutableTableChunk {
self.timestamp_max = self.timestamp_max.max(*v);
let b = self.data.entry(f.id).or_insert_with(|| {
debug!("Creating new timestamp builder");
let mut time_builder = TimestampNanosecondBuilder::new();
// append nulls for all previous rows
time_builder.append_nulls(row_index + self.row_count);
@ -383,7 +356,6 @@ impl MutableTableChunk {
// add nulls for any columns not present
for (name, builder) in &mut self.data {
if !value_added.contains(name) {
debug!("Adding null for column {}", name);
match builder {
Builder::Bool(b) => b.append_null(),
Builder::F64(b) => b.append_null(),
@ -408,18 +380,16 @@ impl MutableTableChunk {
fn record_batch(
&self,
table_def: Arc<TableDefinition>,
filter: &[Expr],
filter: &BufferFilter,
) -> Result<RecordBatch> {
let row_ids = self
.index
.get_rows_from_index_for_filter(Arc::clone(&table_def), filter);
let row_ids = self.index.get_rows_from_index_for_filter(filter);
let schema = table_def.schema.as_arrow();
let mut cols = Vec::with_capacity(schema.fields().len());
for f in schema.fields() {
match row_ids {
Some(row_ids) => {
Some(ref row_ids) => {
let b = table_def
.column_name_to_id(f.name().as_str())
.and_then(|id| self.data.get(&id));
@ -576,7 +546,7 @@ impl std::fmt::Debug for MutableTableChunk {
#[derive(Debug, Clone)]
struct BufferIndex {
// column id -> string value -> row indexes
columns: HashMap<ColumnId, HashMap<String, Vec<usize>>>,
columns: HashMap<ColumnId, HashMap<String, HashSet<usize>>>,
}
impl BufferIndex {
@ -594,34 +564,52 @@ impl BufferIndex {
if let Some(column) = self.columns.get_mut(&column_id) {
column
.entry_ref(value)
.and_modify(|c| c.push(row_index))
.or_insert(vec![row_index]);
.and_modify(|c| {
c.insert(row_index);
})
.or_insert([row_index].into_iter().collect());
}
}
fn get_rows_from_index_for_filter(
&self,
table_def: Arc<TableDefinition>,
filter: &[Expr],
) -> Option<&Vec<usize>> {
for expr in filter {
if let Expr::BinaryExpr(BinaryExpr { left, op, right }) = expr {
if *op == datafusion::logical_expr::Operator::Eq {
if let Expr::Column(c) = left.as_ref() {
if let Expr::Literal(datafusion::scalar::ScalarValue::Utf8(Some(v))) =
right.as_ref()
{
return table_def
.column_name_to_id(c.name())
.and_then(|id| self.columns.get(&id))
.and_then(|m| m.get(v.as_str()));
}
fn get_rows_from_index_for_filter(&self, filter: &BufferFilter) -> Option<HashSet<usize>> {
let mut row_ids = HashSet::new();
for (
col_id,
BufferGuarantee {
guarantee,
literals,
},
) in filter.guarantees()
{
let Some(row_map) = self.columns.get(col_id) else {
continue;
};
match guarantee {
Guarantee::In => {
for literal in literals {
let Some(row) = row_map.get(literal.as_ref()) else {
continue;
};
row_ids = row_ids.union(row).copied().collect();
}
}
Guarantee::NotIn => {
row_ids.extend(row_map.values().flatten().copied());
for literal in literals {
let Some(row) = row_map.get(literal.as_ref()) else {
continue;
};
row_ids = row_ids.difference(row).copied().collect();
}
}
}
}
None
if row_ids.is_empty() {
None
} else {
Some(row_ids.into_iter().collect())
}
}
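For reference, the guarantee handling above reduces to set algebra on one column's portion of the index: an `In` guarantee unions the row sets recorded for each matching literal, while a `NotIn` guarantee starts from every indexed row and subtracts those sets. A self-contained sketch under that reading (the `Guarantee` enum and `rows_for` helper here are illustrative stand-ins, not the crate's or DataFusion's types):

use std::collections::{HashMap, HashSet};

enum Guarantee {
    In,
    NotIn,
}

fn rows_for(
    index: &HashMap<&str, HashSet<usize>>,
    guarantee: Guarantee,
    literals: &[&str],
) -> HashSet<usize> {
    match guarantee {
        // In: union the row sets of every literal that appears in the index
        Guarantee::In => literals
            .iter()
            .filter_map(|l| index.get(l))
            .flatten()
            .copied()
            .collect(),
        // NotIn: start from all indexed rows and remove those matching a literal
        Guarantee::NotIn => {
            let mut rows: HashSet<usize> = index.values().flatten().copied().collect();
            for l in literals {
                if let Some(matched) = index.get(l) {
                    rows = rows.difference(matched).copied().collect();
                }
            }
            rows
        }
    }
}

fn main() {
    // tag value -> row indexes, mirroring the `BufferIndex` layout above
    let index = HashMap::from([
        ("a", HashSet::from([0usize, 2, 4])),
        ("b", HashSet::from([1usize, 3])),
        ("c", HashSet::from([5usize])),
    ]);
    assert_eq!(rows_for(&index, Guarantee::In, &["a", "c"]), HashSet::from([0, 2, 4, 5]));
    assert_eq!(rows_for(&index, Guarantee::NotIn, &["a", "c"]), HashSet::from([1, 3]));
}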
#[allow(dead_code)]
@ -695,7 +683,7 @@ impl Builder {
}
}
fn get_rows(&self, rows: &[usize]) -> ArrayRef {
fn get_rows(&self, rows: &HashSet<usize>) -> ArrayRef {
match self {
Self::Bool(b) => {
let b = b.finish_cloned();
@ -795,82 +783,79 @@ impl Builder {
#[cfg(test)]
mod tests {
use crate::{write_buffer::validator::WriteValidator, Precision};
use super::*;
use arrow_util::{assert_batches_eq, assert_batches_sorted_eq};
use datafusion::common::Column;
use influxdb3_id::TableId;
use influxdb3_wal::Field;
use schema::InfluxFieldType;
use arrow_util::assert_batches_sorted_eq;
use data_types::NamespaceName;
use datafusion::prelude::{col, lit, lit_timestamp_nano, Expr};
use influxdb3_catalog::catalog::{Catalog, DatabaseSchema};
use iox_time::Time;
struct TestWriter {
catalog: Arc<Catalog>,
}
impl TestWriter {
const DB_NAME: &str = "test-db";
fn new() -> Self {
let catalog = Arc::new(Catalog::new("test-node".into(), "test-instance".into()));
Self { catalog }
}
fn write_to_rows(&self, lp: impl AsRef<str>, ingest_time_sec: i64) -> Vec<Row> {
let db = NamespaceName::try_from(Self::DB_NAME).unwrap();
let ingest_time_ns = ingest_time_sec * 1_000_000_000;
let validator =
WriteValidator::initialize(db, Arc::clone(&self.catalog), ingest_time_ns).unwrap();
validator
.v1_parse_lines_and_update_schema(
lp.as_ref(),
false,
Time::from_timestamp_nanos(ingest_time_ns),
Precision::Nanosecond,
)
.map(|r| r.into_inner().to_rows())
.unwrap()
}
fn db_schema(&self) -> Arc<DatabaseSchema> {
self.catalog.db_schema(Self::DB_NAME).unwrap()
}
}
#[test]
fn partitioned_table_buffer_batches() {
let table_def = Arc::new(
TableDefinition::new(
TableId::new(),
"test_table".into(),
vec![
(ColumnId::from(0), "tag".into(), InfluxColumnType::Tag),
(
ColumnId::from(1),
"val".into(),
InfluxColumnType::Field(InfluxFieldType::String),
),
(
ColumnId::from(2),
"time".into(),
InfluxColumnType::Timestamp,
),
],
vec![0.into()],
)
.unwrap(),
);
let mut table_buffer = TableBuffer::new(vec![ColumnId::from(0)], SortKey::empty());
fn test_partitioned_table_buffer_batches() {
let writer = TestWriter::new();
let mut row_batches = Vec::new();
for t in 0..10 {
let offset = t * 10;
let rows = vec![
Row {
time: offset + 1,
fields: vec![
Field {
id: ColumnId::from(0),
value: FieldData::Tag("a".to_string()),
},
Field {
id: ColumnId::from(1),
value: FieldData::String(format!("thing {t}-1")),
},
Field {
id: ColumnId::from(2),
value: FieldData::Timestamp(offset + 1),
},
],
},
Row {
time: offset + 2,
fields: vec![
Field {
id: ColumnId::from(0),
value: FieldData::Tag("b".to_string()),
},
Field {
id: ColumnId::from(1),
value: FieldData::String(format!("thing {t}-2")),
},
Field {
id: ColumnId::from(2),
value: FieldData::Timestamp(offset + 2),
},
],
},
];
let rows = writer.write_to_rows(
format!(
"\
tbl,tag=a val=\"thing {t}-1\" {o1}\n\
tbl,tag=b val=\"thing {t}-2\" {o2}\n\
",
o1 = offset + 1,
o2 = offset + 2,
),
offset,
);
row_batches.push((rows, offset));
}
let table_def = writer.db_schema().table_definition("tbl").unwrap();
let tag_col_id = table_def.column_name_to_id("tag").unwrap();
let mut table_buffer = TableBuffer::new(vec![tag_col_id], SortKey::empty());
for (rows, offset) in row_batches {
table_buffer.buffer_chunk(offset, &rows);
}
let partitioned_batches = table_buffer
.partitioned_record_batches(Arc::clone(&table_def), &[])
.partitioned_record_batches(Arc::clone(&table_def), &BufferFilter::default())
.unwrap();
assert_eq!(10, partitioned_batches.len());
@ -902,209 +887,165 @@ mod tests {
}
#[test]
fn tag_row_index() {
let table_def = Arc::new(
TableDefinition::new(
TableId::new(),
"test_table".into(),
vec![
(ColumnId::from(0), "tag".into(), InfluxColumnType::Tag),
(
ColumnId::from(1),
"value".into(),
InfluxColumnType::Field(InfluxFieldType::Integer),
),
(
ColumnId::from(2),
"time".into(),
InfluxColumnType::Timestamp,
),
],
vec![0.into()],
)
.unwrap(),
fn test_row_index_tag_filtering() {
let writer = TestWriter::new();
let rows = writer.write_to_rows(
"\
tbl,tag=a value=1i 1\n\
tbl,tag=b value=2i 1\n\
tbl,tag=a value=3i 2\n\
tbl,tag=b value=4i 2\n\
tbl,tag=a value=5i 3\n\
tbl,tag=c value=6i 3",
0,
);
let mut table_buffer = TableBuffer::new(vec![ColumnId::from(0)], SortKey::empty());
let rows = vec![
Row {
time: 1,
fields: vec![
Field {
id: ColumnId::from(0),
value: FieldData::Tag("a".to_string()),
},
Field {
id: ColumnId::from(1),
value: FieldData::Integer(1),
},
Field {
id: ColumnId::from(2),
value: FieldData::Timestamp(1),
},
],
},
Row {
time: 2,
fields: vec![
Field {
id: ColumnId::from(0),
value: FieldData::Tag("b".to_string()),
},
Field {
id: ColumnId::from(1),
value: FieldData::Integer(2),
},
Field {
id: ColumnId::from(2),
value: FieldData::Timestamp(2),
},
],
},
Row {
time: 3,
fields: vec![
Field {
id: ColumnId::from(0),
value: FieldData::Tag("a".to_string()),
},
Field {
id: ColumnId::from(1),
value: FieldData::Integer(3),
},
Field {
id: ColumnId::from(2),
value: FieldData::Timestamp(3),
},
],
},
];
let table_def = writer.db_schema().table_definition("tbl").unwrap();
let tag_id = table_def.column_name_to_id("tag").unwrap();
let mut table_buffer = TableBuffer::new(vec![tag_id], SortKey::empty());
table_buffer.buffer_chunk(0, &rows);
let filter = &[Expr::BinaryExpr(BinaryExpr {
left: Box::new(Expr::Column(Column {
relation: None,
name: "tag".to_string(),
})),
op: datafusion::logical_expr::Operator::Eq,
right: Box::new(Expr::Literal(datafusion::scalar::ScalarValue::Utf8(Some(
"a".to_string(),
)))),
})];
let a_rows = table_buffer
.chunk_time_to_chunks
.get(&0)
.unwrap()
.index
.get_rows_from_index_for_filter(Arc::clone(&table_def), filter)
.unwrap();
assert_eq!(a_rows, &[0, 2]);
struct TestCase<'a> {
filter: &'a [Expr],
expected_rows: &'a [usize],
expected_output: &'a [&'a str],
}
let a = table_buffer
.record_batches(Arc::clone(&table_def), filter)
.unwrap();
let expected_a = vec![
"+-----+--------------------------------+-------+",
"| tag | time | value |",
"+-----+--------------------------------+-------+",
"| a | 1970-01-01T00:00:00.000000001Z | 1 |",
"| a | 1970-01-01T00:00:00.000000003Z | 3 |",
"+-----+--------------------------------+-------+",
let test_cases = [
TestCase {
filter: &[col("tag").eq(lit("a"))],
expected_rows: &[0, 2, 4],
expected_output: &[
"+-----+--------------------------------+-------+",
"| tag | time | value |",
"+-----+--------------------------------+-------+",
"| a | 1970-01-01T00:00:00.000000001Z | 1 |",
"| a | 1970-01-01T00:00:00.000000002Z | 3 |",
"| a | 1970-01-01T00:00:00.000000003Z | 5 |",
"+-----+--------------------------------+-------+",
],
},
TestCase {
filter: &[col("tag").eq(lit("b"))],
expected_rows: &[1, 3],
expected_output: &[
"+-----+--------------------------------+-------+",
"| tag | time | value |",
"+-----+--------------------------------+-------+",
"| b | 1970-01-01T00:00:00.000000001Z | 2 |",
"| b | 1970-01-01T00:00:00.000000002Z | 4 |",
"+-----+--------------------------------+-------+",
],
},
TestCase {
filter: &[col("tag").eq(lit("c"))],
expected_rows: &[5],
expected_output: &[
"+-----+--------------------------------+-------+",
"| tag | time | value |",
"+-----+--------------------------------+-------+",
"| c | 1970-01-01T00:00:00.000000003Z | 6 |",
"+-----+--------------------------------+-------+",
],
},
TestCase {
filter: &[col("tag").eq(lit("a")).or(col("tag").eq(lit("c")))],
expected_rows: &[0, 2, 4, 5],
expected_output: &[
"+-----+--------------------------------+-------+",
"| tag | time | value |",
"+-----+--------------------------------+-------+",
"| a | 1970-01-01T00:00:00.000000001Z | 1 |",
"| a | 1970-01-01T00:00:00.000000002Z | 3 |",
"| a | 1970-01-01T00:00:00.000000003Z | 5 |",
"| c | 1970-01-01T00:00:00.000000003Z | 6 |",
"+-----+--------------------------------+-------+",
],
},
TestCase {
filter: &[col("tag").not_eq(lit("a"))],
expected_rows: &[1, 3, 5],
expected_output: &[
"+-----+--------------------------------+-------+",
"| tag | time | value |",
"+-----+--------------------------------+-------+",
"| b | 1970-01-01T00:00:00.000000001Z | 2 |",
"| b | 1970-01-01T00:00:00.000000002Z | 4 |",
"| c | 1970-01-01T00:00:00.000000003Z | 6 |",
"+-----+--------------------------------+-------+",
],
},
TestCase {
filter: &[col("tag").in_list(vec![lit("a"), lit("c")], false)],
expected_rows: &[0, 2, 4, 5],
expected_output: &[
"+-----+--------------------------------+-------+",
"| tag | time | value |",
"+-----+--------------------------------+-------+",
"| a | 1970-01-01T00:00:00.000000001Z | 1 |",
"| a | 1970-01-01T00:00:00.000000002Z | 3 |",
"| a | 1970-01-01T00:00:00.000000003Z | 5 |",
"| c | 1970-01-01T00:00:00.000000003Z | 6 |",
"+-----+--------------------------------+-------+",
],
},
TestCase {
filter: &[col("tag").in_list(vec![lit("a"), lit("c")], true)],
expected_rows: &[1, 3],
expected_output: &[
"+-----+--------------------------------+-------+",
"| tag | time | value |",
"+-----+--------------------------------+-------+",
"| b | 1970-01-01T00:00:00.000000001Z | 2 |",
"| b | 1970-01-01T00:00:00.000000002Z | 4 |",
"+-----+--------------------------------+-------+",
],
},
];
assert_batches_eq!(&expected_a, &a);
let filter = &[Expr::BinaryExpr(BinaryExpr {
left: Box::new(Expr::Column(Column {
relation: None,
name: "tag".to_string(),
})),
op: datafusion::logical_expr::Operator::Eq,
right: Box::new(Expr::Literal(datafusion::scalar::ScalarValue::Utf8(Some(
"b".to_string(),
)))),
})];
let b_rows = table_buffer
.chunk_time_to_chunks
.get(&0)
.unwrap()
.index
.get_rows_from_index_for_filter(Arc::clone(&table_def), filter)
.unwrap();
assert_eq!(b_rows, &[1]);
let b = table_buffer
.record_batches(Arc::clone(&table_def), filter)
.unwrap();
let expected_b = vec![
"+-----+--------------------------------+-------+",
"| tag | time | value |",
"+-----+--------------------------------+-------+",
"| b | 1970-01-01T00:00:00.000000002Z | 2 |",
"+-----+--------------------------------+-------+",
];
assert_batches_eq!(&expected_b, &b);
for t in test_cases {
let filter = BufferFilter::generate(&table_def, t.filter).unwrap();
let rows = table_buffer
.chunk_time_to_chunks
.get(&0)
.unwrap()
.index
.get_rows_from_index_for_filter(&filter)
.unwrap();
assert_eq!(
rows,
HashSet::<usize>::from_iter(t.expected_rows.iter().copied())
);
let batches = table_buffer
.partitioned_record_batches(Arc::clone(&table_def), &filter)
.unwrap()
.into_values()
.flat_map(|(_, batch)| batch.into_iter())
.collect::<Vec<RecordBatch>>();
assert_batches_sorted_eq!(t.expected_output, &batches);
}
}
#[test]
fn computed_size_of_buffer() {
let mut table_buffer = TableBuffer::new(vec![ColumnId::from(0)], SortKey::empty());
fn test_computed_size_of_buffer() {
let writer = TestWriter::new();
let rows = vec![
Row {
time: 1,
fields: vec![
Field {
id: ColumnId::from(0),
value: FieldData::Tag("a".to_string()),
},
Field {
id: ColumnId::from(1),
value: FieldData::Integer(1),
},
Field {
id: ColumnId::from(2),
value: FieldData::Timestamp(1),
},
],
},
Row {
time: 2,
fields: vec![
Field {
id: ColumnId::from(0),
value: FieldData::Tag("b".to_string()),
},
Field {
id: ColumnId::from(1),
value: FieldData::Integer(2),
},
Field {
id: ColumnId::from(2),
value: FieldData::Timestamp(2),
},
],
},
Row {
time: 3,
fields: vec![
Field {
id: ColumnId::from(0),
value: FieldData::Tag("this is a long tag value to store".to_string()),
},
Field {
id: ColumnId::from(1),
value: FieldData::Integer(3),
},
Field {
id: ColumnId::from(2),
value: FieldData::Timestamp(3),
},
],
},
];
let rows = writer.write_to_rows(
"\
tbl,tag=a value=1i 1\n\
tbl,tag=b value=2i 2\n\
tbl,tag=this\\ is\\ a\\ long\\ tag\\ value\\ to\\ store value=3i 3\n\
",
0,
);
let tag_col_id = writer
.db_schema()
.table_definition("tbl")
.and_then(|tbl| tbl.column_name_to_id("tag"))
.unwrap();
let mut table_buffer = TableBuffer::new(vec![tag_col_id], SortKey::empty());
table_buffer.buffer_chunk(0, &rows);
let size = table_buffer.computed_size();
@ -1118,4 +1059,94 @@ mod tests {
assert_eq!(timestamp_min_max.min, 0);
assert_eq!(timestamp_min_max.max, 0);
}
#[test_log::test]
fn test_time_filters() {
let writer = TestWriter::new();
let mut row_batches = Vec::new();
for offset in 0..100 {
let rows = writer.write_to_rows(
format!(
"\
tbl,tag=a val={}\n\
tbl,tag=b val={}\n\
",
offset + 1,
offset + 2
),
offset,
);
row_batches.push((offset, rows));
}
let table_def = writer.db_schema().table_definition("tbl").unwrap();
let tag_col_id = table_def.column_name_to_id("tag").unwrap();
let mut table_buffer = TableBuffer::new(vec![tag_col_id], SortKey::empty());
for (offset, rows) in row_batches {
table_buffer.buffer_chunk(offset, &rows);
}
struct TestCase<'a> {
filter: &'a [Expr],
expected_output: &'a [&'a str],
}
let test_cases = [
TestCase {
filter: &[col("time").gt(lit_timestamp_nano(97_000_000_000i64))],
expected_output: &[
"+-----+----------------------+-------+",
"| tag | time | val |",
"+-----+----------------------+-------+",
"| a | 1970-01-01T00:01:38Z | 99.0 |",
"| a | 1970-01-01T00:01:39Z | 100.0 |",
"| b | 1970-01-01T00:01:38Z | 100.0 |",
"| b | 1970-01-01T00:01:39Z | 101.0 |",
"+-----+----------------------+-------+",
],
},
TestCase {
filter: &[col("time").lt(lit_timestamp_nano(3_000_000_000i64))],
expected_output: &[
"+-----+----------------------+-----+",
"| tag | time | val |",
"+-----+----------------------+-----+",
"| a | 1970-01-01T00:00:00Z | 1.0 |",
"| a | 1970-01-01T00:00:01Z | 2.0 |",
"| a | 1970-01-01T00:00:02Z | 3.0 |",
"| b | 1970-01-01T00:00:00Z | 2.0 |",
"| b | 1970-01-01T00:00:01Z | 3.0 |",
"| b | 1970-01-01T00:00:02Z | 4.0 |",
"+-----+----------------------+-----+",
],
},
TestCase {
filter: &[col("time")
.gt(lit_timestamp_nano(3_000_000_000i64))
.and(col("time").lt(lit_timestamp_nano(6_000_000_000i64)))],
expected_output: &[
"+-----+----------------------+-----+",
"| tag | time | val |",
"+-----+----------------------+-----+",
"| a | 1970-01-01T00:00:04Z | 5.0 |",
"| a | 1970-01-01T00:00:05Z | 6.0 |",
"| b | 1970-01-01T00:00:04Z | 6.0 |",
"| b | 1970-01-01T00:00:05Z | 7.0 |",
"+-----+----------------------+-----+",
],
},
];
for t in test_cases {
let filter = BufferFilter::generate(&table_def, t.filter).unwrap();
let batches = table_buffer
.partitioned_record_batches(Arc::clone(&table_def), &filter)
.unwrap()
.into_values()
.flat_map(|(_, batches)| batches)
.collect::<Vec<RecordBatch>>();
assert_batches_sorted_eq!(t.expected_output, &batches);
}
}
}