2359 lines
86 KiB
Rust
2359 lines
86 KiB
Rust
//! Data for the lifecycle of the Ingester
|
|
|
|
use crate::{
|
|
compact::compact_persisting_batch,
|
|
lifecycle::LifecycleHandle,
|
|
partioning::{Partitioner, PartitionerError},
|
|
persist::persist,
|
|
querier_handler::query,
|
|
};
|
|
use arrow::record_batch::RecordBatch;
|
|
use async_trait::async_trait;
|
|
use backoff::{Backoff, BackoffConfig};
|
|
use data_types::{
|
|
DeletePredicate, KafkaPartition, NamespaceId, PartitionId, PartitionInfo, SequenceNumber,
|
|
SequencerId, TableId, Timestamp, Tombstone,
|
|
};
|
|
use datafusion::physical_plan::SendableRecordBatchStream;
|
|
use dml::DmlOperation;
|
|
use iox_catalog::interface::Catalog;
|
|
use iox_query::exec::Executor;
|
|
use iox_time::SystemProvider;
|
|
use metric::U64Counter;
|
|
use mutable_batch::MutableBatch;
|
|
use object_store::DynObjectStore;
|
|
use observability_deps::tracing::{debug, warn};
|
|
use parking_lot::RwLock;
|
|
use parquet_file::storage::ParquetStorage;
|
|
use predicate::Predicate;
|
|
use schema::{selection::Selection, Schema};
|
|
use snafu::{OptionExt, ResultExt, Snafu};
|
|
use std::{
|
|
collections::{btree_map::Entry, BTreeMap},
|
|
convert::TryFrom,
|
|
sync::Arc,
|
|
};
|
|
use uuid::Uuid;
|
|
use write_summary::SequencerProgress;
|
|
|
|
#[derive(Debug, Snafu)]
|
|
#[allow(missing_copy_implementations, missing_docs)]
|
|
pub enum Error {
|
|
#[snafu(display("Error while reading Topic {}", name))]
|
|
ReadTopic {
|
|
source: iox_catalog::interface::Error,
|
|
name: String,
|
|
},
|
|
|
|
#[snafu(display("Error while reading Kafka Partition id {}", id.get()))]
|
|
ReadSequencer {
|
|
source: iox_catalog::interface::Error,
|
|
id: KafkaPartition,
|
|
},
|
|
|
|
#[snafu(display("Sequencer {} not found in data map", sequencer_id))]
|
|
SequencerNotFound { sequencer_id: SequencerId },
|
|
|
|
#[snafu(display(
|
|
"Sequencer not found for kafka partition {} in data map",
|
|
kafka_partition
|
|
))]
|
|
SequencerForPartitionNotFound { kafka_partition: KafkaPartition },
|
|
|
|
#[snafu(display("Namespace {} not found in catalog", namespace))]
|
|
NamespaceNotFound { namespace: String },
|
|
|
|
#[snafu(display("Table {} not found in buffer", table_name))]
|
|
TableNotFound { table_name: String },
|
|
|
|
#[snafu(display("Table must be specified in delete"))]
|
|
TableNotPresent,
|
|
|
|
#[snafu(display("Error accessing catalog: {}", source))]
|
|
Catalog {
|
|
source: iox_catalog::interface::Error,
|
|
},
|
|
|
|
#[snafu(display("The persisting is in progress. Cannot accept more persisting batch"))]
|
|
PersistingNotEmpty,
|
|
|
|
#[snafu(display("Nothing in the Persisting list to get removed"))]
|
|
PersistingEmpty,
|
|
|
|
#[snafu(display(
|
|
"The given batch does not match any in the Persisting list. \
|
|
Nothing is removed from the Persisting list"
|
|
))]
|
|
PersistingNotMatch,
|
|
|
|
#[snafu(display("Cannot partition data: {}", source))]
|
|
Partitioning { source: PartitionerError },
|
|
|
|
#[snafu(display("Snapshot error: {}", source))]
|
|
Snapshot { source: mutable_batch::Error },
|
|
|
|
#[snafu(display("Error while filtering columns from snapshot: {}", source))]
|
|
FilterColumn { source: arrow::error::ArrowError },
|
|
|
|
#[snafu(display("Partition not found: {}", partition_id))]
|
|
PartitionNotFound { partition_id: PartitionId },
|
|
|
|
#[snafu(display("Error while copying buffer to snapshot: {}", source))]
|
|
BufferToSnapshot { source: mutable_batch::Error },
|
|
|
|
#[snafu(display("Error adding to buffer in mutable batch: {}", source))]
|
|
BufferWrite { source: mutable_batch::Error },
|
|
}
|
|
|
|
/// A specialized `Error` for Ingester Data errors
|
|
pub type Result<T, E = Error> = std::result::Result<T, E>;
|
|
|
|
/// Contains all buffered and cached data for the ingester.
|
|
#[derive(Debug)]
|
|
pub struct IngesterData {
|
|
/// Object store for persistence of parquet files
|
|
store: ParquetStorage,
|
|
|
|
/// The global catalog for schema, parquet files and tombstones
|
|
catalog: Arc<dyn Catalog>,
|
|
|
|
/// This map gets set up on initialization of the ingester so it won't ever be modified.
|
|
/// The content of each SequenceData will get changed when more namespaces and tables
|
|
/// get ingested.
|
|
sequencers: BTreeMap<SequencerId, SequencerData>,
|
|
|
|
/// Partitioner.
|
|
partitioner: Arc<dyn Partitioner>,
|
|
|
|
/// Executor for running queries and compacting and persisting
|
|
exec: Arc<Executor>,
|
|
|
|
/// Backoff config
|
|
backoff_config: BackoffConfig,
|
|
}
|
|
|
|
impl IngesterData {
|
|
/// Create new instance.
|
|
pub fn new(
|
|
object_store: Arc<DynObjectStore>,
|
|
catalog: Arc<dyn Catalog>,
|
|
sequencers: BTreeMap<SequencerId, SequencerData>,
|
|
partitioner: Arc<dyn Partitioner>,
|
|
exec: Arc<Executor>,
|
|
backoff_config: BackoffConfig,
|
|
) -> Self {
|
|
Self {
|
|
store: ParquetStorage::new(object_store),
|
|
catalog,
|
|
sequencers,
|
|
partitioner,
|
|
exec,
|
|
backoff_config,
|
|
}
|
|
}
|
|
|
|
/// Executor for running queries and compacting and persisting
|
|
pub(crate) fn exec(&self) -> &Arc<Executor> {
|
|
&self.exec
|
|
}
|
|
|
|
/// Get sequencer data for specific sequencer.
|
|
#[allow(dead_code)] // Used in tests
|
|
pub(crate) fn sequencer(&self, sequencer_id: SequencerId) -> Option<&SequencerData> {
|
|
self.sequencers.get(&sequencer_id)
|
|
}
|
|
|
|
/// Get iterator over sequencers (ID and data).
|
|
pub(crate) fn sequencers(&self) -> impl Iterator<Item = (&SequencerId, &SequencerData)> {
|
|
self.sequencers.iter()
|
|
}
|
|
|
|
/// Store the write or delete in the in memory buffer. Deletes will
|
|
/// be written into the catalog before getting stored in the buffer.
|
|
/// Any writes that create new IOx partitions will have those records
|
|
/// created in the catalog before putting into the buffer. Writes will
|
|
/// get logged in the lifecycle manager. If it indicates ingest should
|
|
/// be paused, this function will return true.
|
|
pub async fn buffer_operation(
|
|
&self,
|
|
sequencer_id: SequencerId,
|
|
dml_operation: DmlOperation,
|
|
lifecycle_handle: &dyn LifecycleHandle,
|
|
) -> Result<bool> {
|
|
let sequencer_data = self
|
|
.sequencers
|
|
.get(&sequencer_id)
|
|
.context(SequencerNotFoundSnafu { sequencer_id })?;
|
|
sequencer_data
|
|
.buffer_operation(
|
|
dml_operation,
|
|
sequencer_id,
|
|
self.catalog.as_ref(),
|
|
lifecycle_handle,
|
|
self.partitioner.as_ref(),
|
|
&self.exec,
|
|
)
|
|
.await
|
|
}
|
|
|
|
/// Return the ingestion progress for the specified kafka
|
|
/// partitions. Returns an empty `SequencerProgress` for any kafka
|
|
/// partitions that this ingester doesn't know about.
|
|
pub(crate) async fn progresses(
|
|
&self,
|
|
partitions: Vec<KafkaPartition>,
|
|
) -> BTreeMap<KafkaPartition, SequencerProgress> {
|
|
let mut progresses = BTreeMap::new();
|
|
for kafka_partition in partitions {
|
|
let sequencer_data = self
|
|
.sequencers
|
|
.iter()
|
|
.map(|(_, sequencer_data)| sequencer_data)
|
|
.find(|sequencer_data| sequencer_data.kafka_partition == kafka_partition);
|
|
|
|
let progress = match sequencer_data {
|
|
Some(sequencer_data) => sequencer_data.progress().await,
|
|
None => SequencerProgress::new(), // don't know about this sequencer
|
|
};
|
|
|
|
progresses.insert(kafka_partition, progress);
|
|
}
|
|
progresses
|
|
}
|
|
}
|
|
|
|
/// The Persister has a function to persist a given partition ID and to update the
|
|
/// assocated sequencer's `min_unpersisted_sequence_number`.
|
|
#[async_trait]
|
|
pub trait Persister: Send + Sync + 'static {
|
|
/// Persits the partition ID. Will retry forever until it succeeds.
|
|
async fn persist(&self, partition_id: PartitionId);
|
|
|
|
/// Updates the sequencer's `min_unpersisted_sequence_number` in the catalog.
|
|
/// This number represents the minimum that might be unpersisted, which is the
|
|
/// farthest back the ingester would need to read in the write buffer to ensure
|
|
/// that all data would be correctly replayed on startup.
|
|
async fn update_min_unpersisted_sequence_number(
|
|
&self,
|
|
sequencer_id: SequencerId,
|
|
sequence_number: SequenceNumber,
|
|
);
|
|
}
|
|
|
|
#[async_trait]
|
|
impl Persister for IngesterData {
|
|
async fn persist(&self, partition_id: PartitionId) {
|
|
// lookup the partition_info from the catalog
|
|
let partition_info = Backoff::new(&self.backoff_config)
|
|
.retry_all_errors("get partition_info_by_id", || async {
|
|
let mut repos = self.catalog.repositories().await;
|
|
repos.partitions().partition_info_by_id(partition_id).await
|
|
})
|
|
.await
|
|
.expect("retry forever");
|
|
|
|
// lookup the state from the ingester data. If something isn't found, it's unexpected. Crash
|
|
// so someone can take a look.
|
|
let partition_info = partition_info
|
|
.unwrap_or_else(|| panic!("partition {} not found in catalog", partition_id));
|
|
let sequencer_data = self
|
|
.sequencers
|
|
.get(&partition_info.partition.sequencer_id)
|
|
.unwrap_or_else(|| {
|
|
panic!(
|
|
"sequencer state for {} not in ingester data",
|
|
partition_info.partition.sequencer_id
|
|
)
|
|
}); //{
|
|
let namespace = sequencer_data
|
|
.namespace(&partition_info.namespace_name)
|
|
.unwrap_or_else(|| {
|
|
panic!(
|
|
"namespace {} not in sequencer {} state",
|
|
partition_info.namespace_name, partition_info.partition.sequencer_id
|
|
)
|
|
});
|
|
debug!(?partition_info, "Persisting");
|
|
|
|
let persisting_batch = namespace.snapshot_to_persisting(&partition_info).await;
|
|
|
|
if let Some(persisting_batch) = persisting_batch {
|
|
// do the CPU intensive work of compaction, de-duplication and sorting
|
|
let (record_batches, iox_meta, sort_key_update) = match compact_persisting_batch(
|
|
Arc::new(SystemProvider::new()),
|
|
&self.exec,
|
|
namespace.namespace_id.get(),
|
|
&partition_info,
|
|
Arc::clone(&persisting_batch),
|
|
)
|
|
.await
|
|
{
|
|
Err(e) => {
|
|
// this should never error out. if it does, we need to crash hard so
|
|
// someone can take a look.
|
|
panic!("unable to compact persisting batch with error: {:?}", e);
|
|
}
|
|
Ok(Some(r)) => r,
|
|
Ok(None) => {
|
|
warn!("persist called with no data");
|
|
return;
|
|
}
|
|
};
|
|
|
|
// save the compacted data to a parquet file in object storage
|
|
let file_size_and_md = Backoff::new(&self.backoff_config)
|
|
.retry_all_errors("persist to object store", || {
|
|
persist(&iox_meta, record_batches.to_vec(), self.store.clone())
|
|
})
|
|
.await
|
|
.expect("retry forever");
|
|
|
|
if let Some((file_size, md)) = file_size_and_md {
|
|
// Add the parquet file to the catalog until succeed
|
|
let parquet_file = iox_meta.to_parquet_file(partition_id, file_size, &md);
|
|
Backoff::new(&self.backoff_config)
|
|
.retry_all_errors("add parquet file to catalog", || async {
|
|
let mut repos = self.catalog.repositories().await;
|
|
debug!(
|
|
table_name=%iox_meta.table_name,
|
|
"adding parquet file to catalog"
|
|
);
|
|
|
|
repos.parquet_files().create(parquet_file.clone()).await
|
|
})
|
|
.await
|
|
.expect("retry forever");
|
|
}
|
|
|
|
// Update the sort key in the catalog if there are additional columns
|
|
if let Some(new_sort_key) = sort_key_update {
|
|
let sort_key_string = new_sort_key.to_columns();
|
|
Backoff::new(&self.backoff_config)
|
|
.retry_all_errors("update_sort_key", || async {
|
|
let mut repos = self.catalog.repositories().await;
|
|
repos
|
|
.partitions()
|
|
.update_sort_key(partition_id, &sort_key_string)
|
|
.await
|
|
})
|
|
.await
|
|
.expect("retry forever");
|
|
}
|
|
|
|
// and remove the persisted data from memory
|
|
debug!(
|
|
table_name=%partition_info.table_name,
|
|
partition_key=%partition_info.partition.partition_key,
|
|
max_sequence_number=%iox_meta.max_sequence_number.get(),
|
|
"mark_persisted"
|
|
);
|
|
namespace
|
|
.mark_persisted(
|
|
&partition_info.table_name,
|
|
&partition_info.partition.partition_key,
|
|
iox_meta.max_sequence_number,
|
|
)
|
|
.await;
|
|
}
|
|
}
|
|
|
|
async fn update_min_unpersisted_sequence_number(
|
|
&self,
|
|
sequencer_id: SequencerId,
|
|
sequence_number: SequenceNumber,
|
|
) {
|
|
Backoff::new(&self.backoff_config)
|
|
.retry_all_errors("updating min_unpersisted_sequence_number", || async {
|
|
self.catalog
|
|
.repositories()
|
|
.await
|
|
.sequencers()
|
|
.update_min_unpersisted_sequence_number(sequencer_id, sequence_number)
|
|
.await
|
|
})
|
|
.await
|
|
.expect("retry forever")
|
|
}
|
|
}
|
|
|
|
/// Data of a Shard
|
|
#[derive(Debug)]
|
|
pub struct SequencerData {
|
|
/// The kafka partition for this sequencer
|
|
kafka_partition: KafkaPartition,
|
|
|
|
// New namespaces can come in at any time so we need to be able to add new ones
|
|
namespaces: RwLock<BTreeMap<String, Arc<NamespaceData>>>,
|
|
|
|
metrics: Arc<metric::Registry>,
|
|
namespace_count: U64Counter,
|
|
}
|
|
|
|
impl SequencerData {
|
|
/// Initialise a new [`SequencerData`] that emits metrics to `metrics`.
|
|
pub fn new(kafka_partition: KafkaPartition, metrics: Arc<metric::Registry>) -> Self {
|
|
let namespace_count = metrics
|
|
.register_metric::<U64Counter>(
|
|
"ingester_namespaces_total",
|
|
"Number of namespaces known to the ingester",
|
|
)
|
|
.recorder(&[]);
|
|
|
|
Self {
|
|
kafka_partition,
|
|
namespaces: Default::default(),
|
|
metrics,
|
|
namespace_count,
|
|
}
|
|
}
|
|
|
|
/// Initialize new SequncerData with namespace for testing purpose only
|
|
#[cfg(test)]
|
|
pub fn new_for_test(
|
|
kafka_partition: KafkaPartition,
|
|
namespaces: BTreeMap<String, Arc<NamespaceData>>,
|
|
) -> Self {
|
|
Self {
|
|
kafka_partition,
|
|
namespaces: RwLock::new(namespaces),
|
|
metrics: Default::default(),
|
|
namespace_count: Default::default(),
|
|
}
|
|
}
|
|
|
|
/// Store the write or delete in the sequencer. Deletes will
|
|
/// be written into the catalog before getting stored in the buffer.
|
|
/// Any writes that create new IOx partitions will have those records
|
|
/// created in the catalog before putting into the buffer.
|
|
pub async fn buffer_operation(
|
|
&self,
|
|
dml_operation: DmlOperation,
|
|
sequencer_id: SequencerId,
|
|
catalog: &dyn Catalog,
|
|
lifecycle_handle: &dyn LifecycleHandle,
|
|
partitioner: &dyn Partitioner,
|
|
executor: &Executor,
|
|
) -> Result<bool> {
|
|
let namespace_data = match self.namespace(dml_operation.namespace()) {
|
|
Some(d) => d,
|
|
None => {
|
|
self.insert_namespace(dml_operation.namespace(), catalog)
|
|
.await?
|
|
}
|
|
};
|
|
|
|
namespace_data
|
|
.buffer_operation(
|
|
dml_operation,
|
|
sequencer_id,
|
|
catalog,
|
|
lifecycle_handle,
|
|
partitioner,
|
|
executor,
|
|
)
|
|
.await
|
|
}
|
|
|
|
/// Gets the namespace data out of the map
|
|
pub fn namespace(&self, namespace: &str) -> Option<Arc<NamespaceData>> {
|
|
let n = self.namespaces.read();
|
|
n.get(namespace).cloned()
|
|
}
|
|
|
|
/// Retrieves the namespace from the catalog and initializes an empty buffer, or
|
|
/// retrieves the buffer if some other caller gets it first
|
|
async fn insert_namespace(
|
|
&self,
|
|
namespace: &str,
|
|
catalog: &dyn Catalog,
|
|
) -> Result<Arc<NamespaceData>> {
|
|
let mut repos = catalog.repositories().await;
|
|
let namespace = repos
|
|
.namespaces()
|
|
.get_by_name(namespace)
|
|
.await
|
|
.context(CatalogSnafu)?
|
|
.context(NamespaceNotFoundSnafu { namespace })?;
|
|
|
|
let mut n = self.namespaces.write();
|
|
|
|
let data = match n.entry(namespace.name) {
|
|
Entry::Vacant(v) => {
|
|
let v = v.insert(Arc::new(NamespaceData::new(namespace.id, &*self.metrics)));
|
|
self.namespace_count.inc(1);
|
|
Arc::clone(v)
|
|
}
|
|
Entry::Occupied(v) => Arc::clone(v.get()),
|
|
};
|
|
|
|
Ok(data)
|
|
}
|
|
|
|
/// Return the progress of this sequencer
|
|
async fn progress(&self) -> SequencerProgress {
|
|
let namespaces: Vec<_> = self.namespaces.read().values().map(Arc::clone).collect();
|
|
|
|
let mut progress = SequencerProgress::new();
|
|
|
|
for namespace_data in namespaces {
|
|
progress = progress.combine(namespace_data.progress().await);
|
|
}
|
|
progress
|
|
}
|
|
}
|
|
|
|
/// Data of a Namespace that belongs to a given Shard
|
|
#[derive(Debug)]
|
|
pub struct NamespaceData {
|
|
namespace_id: NamespaceId,
|
|
tables: RwLock<BTreeMap<String, Arc<tokio::sync::RwLock<TableData>>>>,
|
|
|
|
table_count: U64Counter,
|
|
}
|
|
|
|
impl NamespaceData {
|
|
/// Initialize new tables with default partition template of daily
|
|
pub fn new(namespace_id: NamespaceId, metrics: &metric::Registry) -> Self {
|
|
let table_count = metrics
|
|
.register_metric::<U64Counter>(
|
|
"ingester_tables_total",
|
|
"Number of tables known to the ingester",
|
|
)
|
|
.recorder(&[]);
|
|
|
|
Self {
|
|
namespace_id,
|
|
tables: Default::default(),
|
|
table_count,
|
|
}
|
|
}
|
|
|
|
/// Initialize new tables with data for testing purpose only
|
|
#[cfg(test)]
|
|
pub(crate) fn new_for_test(
|
|
namespace_id: NamespaceId,
|
|
tables: BTreeMap<String, Arc<tokio::sync::RwLock<TableData>>>,
|
|
) -> Self {
|
|
Self {
|
|
namespace_id,
|
|
tables: RwLock::new(tables),
|
|
table_count: Default::default(),
|
|
}
|
|
}
|
|
|
|
/// Buffer the operation in the cache, adding any new partitions or delete tombstones to the
|
|
/// catalog. Returns true if ingest should be paused due to memory limits set in the passed
|
|
/// lifecycle manager.
|
|
pub async fn buffer_operation(
|
|
&self,
|
|
dml_operation: DmlOperation,
|
|
sequencer_id: SequencerId,
|
|
catalog: &dyn Catalog,
|
|
lifecycle_handle: &dyn LifecycleHandle,
|
|
partitioner: &dyn Partitioner,
|
|
executor: &Executor,
|
|
) -> Result<bool> {
|
|
let sequence_number = dml_operation
|
|
.meta()
|
|
.sequence()
|
|
.expect("must have sequence number")
|
|
.sequence_number;
|
|
let sequence_number = i64::try_from(sequence_number).expect("sequence out of bounds");
|
|
let sequence_number = SequenceNumber::new(sequence_number);
|
|
|
|
match dml_operation {
|
|
DmlOperation::Write(write) => {
|
|
let mut pause_writes = false;
|
|
|
|
for (t, b) in write.into_tables() {
|
|
let table_data = match self.table_data(&t) {
|
|
Some(t) => t,
|
|
None => self.insert_table(sequencer_id, &t, catalog).await?,
|
|
};
|
|
|
|
let mut table_data = table_data.write().await;
|
|
let should_pause = table_data
|
|
.buffer_table_write(
|
|
sequence_number,
|
|
b,
|
|
sequencer_id,
|
|
catalog,
|
|
lifecycle_handle,
|
|
partitioner,
|
|
)
|
|
.await?;
|
|
|
|
pause_writes = pause_writes || should_pause;
|
|
}
|
|
|
|
Ok(pause_writes)
|
|
}
|
|
DmlOperation::Delete(delete) => {
|
|
let table_name = delete.table_name().context(TableNotPresentSnafu)?;
|
|
let table_data = match self.table_data(table_name) {
|
|
Some(t) => t,
|
|
None => self.insert_table(sequencer_id, table_name, catalog).await?,
|
|
};
|
|
|
|
let mut table_data = table_data.write().await;
|
|
|
|
table_data
|
|
.buffer_delete(
|
|
table_name,
|
|
delete.predicate(),
|
|
sequencer_id,
|
|
sequence_number,
|
|
catalog,
|
|
executor,
|
|
)
|
|
.await?;
|
|
|
|
// don't pause writes since deletes don't count towards memory limits
|
|
Ok(false)
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Snapshots the mutable buffer for the partition, which clears it out and moves it over to
|
|
/// snapshots. Then return a vec of the snapshots and the optional persisting batch.
|
|
pub async fn snapshot(
|
|
&self,
|
|
table_name: &str,
|
|
partition_key: &str,
|
|
) -> Option<(Vec<Arc<SnapshotBatch>>, Option<Arc<PersistingBatch>>)> {
|
|
if let Some(t) = self.table_data(table_name) {
|
|
let mut t = t.write().await;
|
|
|
|
return t.partition_data.get_mut(partition_key).map(|p| {
|
|
p.data
|
|
.snapshot()
|
|
.expect("snapshot on mutable batch should never fail");
|
|
(p.data.snapshots.to_vec(), p.data.persisting.clone())
|
|
});
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
/// Snapshots the mutable buffer for the partition, which clears it out and then moves all
|
|
/// snapshots over to a persisting batch, which is returned. If there is no data to snapshot
|
|
/// or persist, None will be returned.
|
|
pub async fn snapshot_to_persisting(
|
|
&self,
|
|
partition_info: &PartitionInfo,
|
|
) -> Option<Arc<PersistingBatch>> {
|
|
if let Some(table_data) = self.table_data(&partition_info.table_name) {
|
|
let mut table_data = table_data.write().await;
|
|
|
|
return table_data
|
|
.partition_data
|
|
.get_mut(&partition_info.partition.partition_key)
|
|
.and_then(|partition_data| {
|
|
partition_data.snapshot_to_persisting_batch(
|
|
partition_info.partition.sequencer_id,
|
|
partition_info.partition.table_id,
|
|
partition_info.partition.id,
|
|
&partition_info.table_name,
|
|
)
|
|
});
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
/// Gets the buffered table data
|
|
pub(crate) fn table_data(
|
|
&self,
|
|
table_name: &str,
|
|
) -> Option<Arc<tokio::sync::RwLock<TableData>>> {
|
|
let t = self.tables.read();
|
|
t.get(table_name).cloned()
|
|
}
|
|
|
|
/// Inserts the table or returns it if it happens to be inserted by some other thread
|
|
async fn insert_table(
|
|
&self,
|
|
sequencer_id: SequencerId,
|
|
table_name: &str,
|
|
catalog: &dyn Catalog,
|
|
) -> Result<Arc<tokio::sync::RwLock<TableData>>> {
|
|
let mut repos = catalog.repositories().await;
|
|
let info = repos
|
|
.tables()
|
|
.get_table_persist_info(sequencer_id, self.namespace_id, table_name)
|
|
.await
|
|
.context(CatalogSnafu)?
|
|
.context(TableNotFoundSnafu { table_name })?;
|
|
|
|
let mut t = self.tables.write();
|
|
|
|
let data = match t.entry(table_name.to_string()) {
|
|
Entry::Vacant(v) => {
|
|
let v = v.insert(Arc::new(tokio::sync::RwLock::new(TableData::new(
|
|
info.table_id,
|
|
info.tombstone_max_sequence_number,
|
|
))));
|
|
self.table_count.inc(1);
|
|
Arc::clone(v)
|
|
}
|
|
Entry::Occupied(v) => Arc::clone(v.get()),
|
|
};
|
|
|
|
Ok(data)
|
|
}
|
|
|
|
/// Walks down the table and partition and clears the persisting batch. The sequence number is
|
|
/// the max_sequence_number for the persisted parquet file, which should be kept in the table
|
|
/// data buffer.
|
|
async fn mark_persisted(
|
|
&self,
|
|
table_name: &str,
|
|
partition_key: &str,
|
|
sequence_number: SequenceNumber,
|
|
) {
|
|
if let Some(t) = self.table_data(table_name) {
|
|
let mut t = t.write().await;
|
|
let partition = t.partition_data.get_mut(partition_key);
|
|
|
|
if let Some(p) = partition {
|
|
p.data.max_persisted_sequence_number = Some(sequence_number);
|
|
p.data.persisting = None;
|
|
// clear the deletes kept for this persisting batch
|
|
p.data.deletes_during_persisting.clear();
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Return progress from this Namespace
|
|
async fn progress(&self) -> SequencerProgress {
|
|
let tables: Vec<_> = self.tables.read().values().map(Arc::clone).collect();
|
|
|
|
let mut progress = SequencerProgress::new();
|
|
for table_data in tables {
|
|
progress = progress.combine(table_data.read().await.progress())
|
|
}
|
|
|
|
progress
|
|
}
|
|
}
|
|
|
|
/// Data of a Table in a given Namesapce that belongs to a given Shard
|
|
#[derive(Debug)]
|
|
pub(crate) struct TableData {
|
|
table_id: TableId,
|
|
// the max sequence number for a tombstone associated with this table
|
|
tombstone_max_sequence_number: Option<SequenceNumber>,
|
|
// Map pf partition key to its data
|
|
partition_data: BTreeMap<String, PartitionData>,
|
|
}
|
|
|
|
impl TableData {
|
|
/// Initialize new table buffer
|
|
pub fn new(table_id: TableId, tombstone_max_sequence_number: Option<SequenceNumber>) -> Self {
|
|
Self {
|
|
table_id,
|
|
tombstone_max_sequence_number,
|
|
partition_data: Default::default(),
|
|
}
|
|
}
|
|
|
|
/// Initialize new table buffer for testing purpose only
|
|
#[cfg(test)]
|
|
pub fn new_for_test(
|
|
table_id: TableId,
|
|
tombstone_max_sequence_number: Option<SequenceNumber>,
|
|
partitions: BTreeMap<String, PartitionData>,
|
|
) -> Self {
|
|
Self {
|
|
table_id,
|
|
tombstone_max_sequence_number,
|
|
partition_data: partitions,
|
|
}
|
|
}
|
|
|
|
/// Return parquet_max_sequence_number
|
|
pub fn parquet_max_sequence_number(&self) -> Option<SequenceNumber> {
|
|
self.partition_data
|
|
.values()
|
|
.map(|p| p.data.max_persisted_sequence_number)
|
|
.max()
|
|
.flatten()
|
|
}
|
|
|
|
/// Return tombstone_max_sequence_number
|
|
#[allow(dead_code)] // Used in tests
|
|
pub fn tombstone_max_sequence_number(&self) -> Option<SequenceNumber> {
|
|
self.tombstone_max_sequence_number
|
|
}
|
|
|
|
// buffers the table write and returns true if the lifecycle manager indicates that
|
|
// ingest should be paused.
|
|
async fn buffer_table_write(
|
|
&mut self,
|
|
sequence_number: SequenceNumber,
|
|
batch: MutableBatch,
|
|
sequencer_id: SequencerId,
|
|
catalog: &dyn Catalog,
|
|
lifecycle_handle: &dyn LifecycleHandle,
|
|
partitioner: &dyn Partitioner,
|
|
) -> Result<bool> {
|
|
let partition_key = partitioner
|
|
.partition_key(&batch)
|
|
.context(PartitioningSnafu)?;
|
|
|
|
let partition_data = match self.partition_data.get_mut(&partition_key) {
|
|
Some(p) => p,
|
|
None => {
|
|
self.insert_partition(&partition_key, sequencer_id, catalog)
|
|
.await?;
|
|
self.partition_data.get_mut(&partition_key).unwrap()
|
|
}
|
|
};
|
|
|
|
// skip the write if it has already been persisted
|
|
if let Some(max) = partition_data.data.max_persisted_sequence_number {
|
|
if max >= sequence_number {
|
|
return Ok(false);
|
|
}
|
|
}
|
|
|
|
let should_pause = lifecycle_handle.log_write(
|
|
partition_data.id,
|
|
sequencer_id,
|
|
sequence_number,
|
|
batch.size(),
|
|
);
|
|
partition_data.buffer_write(sequence_number, batch)?;
|
|
|
|
Ok(should_pause)
|
|
}
|
|
|
|
async fn buffer_delete(
|
|
&mut self,
|
|
table_name: &str,
|
|
predicate: &DeletePredicate,
|
|
sequencer_id: SequencerId,
|
|
sequence_number: SequenceNumber,
|
|
catalog: &dyn Catalog,
|
|
executor: &Executor,
|
|
) -> Result<()> {
|
|
let min_time = Timestamp::new(predicate.range.start());
|
|
let max_time = Timestamp::new(predicate.range.end());
|
|
|
|
let mut repos = catalog.repositories().await;
|
|
let tombstone = repos
|
|
.tombstones()
|
|
.create_or_get(
|
|
self.table_id,
|
|
sequencer_id,
|
|
sequence_number,
|
|
min_time,
|
|
max_time,
|
|
&predicate.expr_sql_string(),
|
|
)
|
|
.await
|
|
.context(CatalogSnafu)?;
|
|
|
|
// remember "persisted" state
|
|
self.tombstone_max_sequence_number = Some(sequence_number);
|
|
|
|
// modify one partition at a time
|
|
for data in self.partition_data.values_mut() {
|
|
data.buffer_tombstone(executor, table_name, tombstone.clone())
|
|
.await;
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
pub fn unpersisted_partition_data(&self) -> Vec<UnpersistedPartitionData> {
|
|
self.partition_data
|
|
.values()
|
|
.map(|p| UnpersistedPartitionData {
|
|
partition_id: p.id,
|
|
non_persisted: p
|
|
.get_non_persisting_data()
|
|
.expect("get_non_persisting should always work"),
|
|
persisting: p.get_persisting_data(),
|
|
partition_status: PartitionStatus {
|
|
parquet_max_sequence_number: p.data.max_persisted_sequence_number,
|
|
tombstone_max_sequence_number: self.tombstone_max_sequence_number,
|
|
},
|
|
})
|
|
.collect()
|
|
}
|
|
|
|
async fn insert_partition(
|
|
&mut self,
|
|
partition_key: &str,
|
|
sequencer_id: SequencerId,
|
|
catalog: &dyn Catalog,
|
|
) -> Result<()> {
|
|
let mut repos = catalog.repositories().await;
|
|
let partition = repos
|
|
.partitions()
|
|
.create_or_get(partition_key, sequencer_id, self.table_id)
|
|
.await
|
|
.context(CatalogSnafu)?;
|
|
|
|
// get info on the persisted parquet files to use later for replay or for snapshot
|
|
// information on query.
|
|
let files = repos
|
|
.parquet_files()
|
|
.list_by_partition_not_to_delete(partition.id)
|
|
.await
|
|
.context(CatalogSnafu)?;
|
|
// for now we just need the max persisted
|
|
let max_persisted_sequence_number = files.iter().map(|p| p.max_sequence_number).max();
|
|
|
|
let mut data = PartitionData::new(partition.id);
|
|
data.data.max_persisted_sequence_number = max_persisted_sequence_number;
|
|
|
|
self.partition_data.insert(partition.partition_key, data);
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Return progress from this Table
|
|
fn progress(&self) -> SequencerProgress {
|
|
let progress = SequencerProgress::new();
|
|
let progress = match self.parquet_max_sequence_number() {
|
|
Some(n) => progress.with_persisted(n),
|
|
None => progress,
|
|
};
|
|
|
|
self.partition_data
|
|
.values()
|
|
.fold(progress, |progress, partition_data| {
|
|
progress.combine(partition_data.progress())
|
|
})
|
|
}
|
|
}
|
|
|
|
/// Read only copy of the unpersisted data for a partition in the ingester for a specific partition.
|
|
#[derive(Debug)]
|
|
pub(crate) struct UnpersistedPartitionData {
|
|
pub partition_id: PartitionId,
|
|
pub non_persisted: Vec<Arc<SnapshotBatch>>,
|
|
pub persisting: Option<QueryableBatch>,
|
|
pub partition_status: PartitionStatus,
|
|
}
|
|
|
|
/// Data of an IOx Partition of a given Table of a Namesapce that belongs to a given Shard
|
|
#[derive(Debug)]
|
|
pub(crate) struct PartitionData {
|
|
id: PartitionId,
|
|
data: DataBuffer,
|
|
}
|
|
|
|
impl PartitionData {
|
|
/// Initialize a new partition data buffer
|
|
pub fn new(id: PartitionId) -> Self {
|
|
Self {
|
|
id,
|
|
data: Default::default(),
|
|
}
|
|
}
|
|
|
|
/// Snapshot anything in the buffer and move all snapshot data into a persisting batch
|
|
pub fn snapshot_to_persisting_batch(
|
|
&mut self,
|
|
sequencer_id: SequencerId,
|
|
table_id: TableId,
|
|
partition_id: PartitionId,
|
|
table_name: &str,
|
|
) -> Option<Arc<PersistingBatch>> {
|
|
self.data
|
|
.snapshot_to_persisting(sequencer_id, table_id, partition_id, table_name)
|
|
}
|
|
|
|
/// Snapshot whatever is in the buffer and return a new vec of the
|
|
/// arc cloned snapshots
|
|
#[allow(dead_code)] // Used in tests
|
|
pub fn snapshot(&mut self) -> Result<Vec<Arc<SnapshotBatch>>> {
|
|
self.data.snapshot().context(SnapshotSnafu)?;
|
|
Ok(self.data.snapshots.to_vec())
|
|
}
|
|
|
|
/// Return non persisting data
|
|
pub fn get_non_persisting_data(&self) -> Result<Vec<Arc<SnapshotBatch>>> {
|
|
self.data.buffer_and_snapshots()
|
|
}
|
|
|
|
/// Return persisting data
|
|
pub fn get_persisting_data(&self) -> Option<QueryableBatch> {
|
|
self.data.get_persisting_data()
|
|
}
|
|
|
|
/// Write the given mb in the buffer
|
|
pub(crate) fn buffer_write(
|
|
&mut self,
|
|
sequencer_number: SequenceNumber,
|
|
mb: MutableBatch,
|
|
) -> Result<()> {
|
|
match &mut self.data.buffer {
|
|
Some(buf) => {
|
|
buf.max_sequence_number = sequencer_number.max(buf.max_sequence_number);
|
|
buf.data.extend_from(&mb).context(BufferWriteSnafu)?;
|
|
}
|
|
None => {
|
|
self.data.buffer = Some(BufferBatch {
|
|
min_sequence_number: sequencer_number,
|
|
max_sequence_number: sequencer_number,
|
|
data: mb,
|
|
})
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Buffers a new tombstone:
|
|
/// . All the data in the `buffer` and `snapshots` will be replaced with one
|
|
/// tombstone-applied snapshot
|
|
/// . The tombstone is only added in the `deletes_during_persisting` if the `persisting`
|
|
/// exists
|
|
pub(crate) async fn buffer_tombstone(
|
|
&mut self,
|
|
executor: &Executor,
|
|
table_name: &str,
|
|
tombstone: Tombstone,
|
|
) {
|
|
self.data.add_tombstone(tombstone.clone());
|
|
|
|
// ----------------------------------------------------------
|
|
// First apply the tombstone on all in-memeory & non-persisting data
|
|
// Make a QueryableBatch for all buffer + snapshots + the given tombstone
|
|
let max_sequencer_number = tombstone.sequence_number;
|
|
let query_batch = match self
|
|
.data
|
|
.snapshot_to_queryable_batch(table_name, Some(tombstone.clone()))
|
|
{
|
|
Some(query_batch) if !query_batch.is_empty() => query_batch,
|
|
_ => {
|
|
// No need to proceed further
|
|
return;
|
|
}
|
|
};
|
|
|
|
let (min_sequencer_number, _) = query_batch.min_max_sequence_numbers();
|
|
assert!(min_sequencer_number <= max_sequencer_number);
|
|
|
|
// Run query on the QueryableBatch to apply the tombstone.
|
|
let stream = match query(
|
|
executor,
|
|
Arc::new(query_batch),
|
|
Predicate::default(),
|
|
Selection::All,
|
|
)
|
|
.await
|
|
{
|
|
Err(e) => {
|
|
// this should never error out. if it does, we need to crash hard so
|
|
// someone can take a look.
|
|
panic!("unable to apply tombstones on snapshots: {:?}", e);
|
|
}
|
|
Ok(stream) => stream,
|
|
};
|
|
let record_batches = match datafusion::physical_plan::common::collect(stream).await {
|
|
Err(e) => {
|
|
// this should never error out. if it does, we need to crash hard so
|
|
// someone can take a look.
|
|
panic!("unable to collect record batches: {:?}", e);
|
|
}
|
|
Ok(batches) => batches,
|
|
};
|
|
|
|
// Merge all result record batches into one record batch
|
|
// and make a snapshot for it
|
|
let snapshot = if !record_batches.is_empty() {
|
|
let record_batch = RecordBatch::concat(&record_batches[0].schema(), &record_batches)
|
|
.unwrap_or_else(|e| {
|
|
panic!("unable to concat record batches: {:?}", e);
|
|
});
|
|
let snapshot = SnapshotBatch {
|
|
min_sequencer_number,
|
|
max_sequencer_number,
|
|
data: Arc::new(record_batch),
|
|
};
|
|
|
|
Some(Arc::new(snapshot))
|
|
} else {
|
|
None
|
|
};
|
|
|
|
// ----------------------------------------------------------
|
|
// Add the tombstone-applied data back in as one snapshot
|
|
if let Some(snapshot) = snapshot {
|
|
self.data.snapshots.push(snapshot);
|
|
}
|
|
}
|
|
|
|
/// Return the progress from this Partition
|
|
fn progress(&self) -> SequencerProgress {
|
|
self.data.progress()
|
|
}
|
|
}
|
|
|
|
/// Data of an IOx partition split into batches
|
|
/// ┌────────────────────────┐ ┌────────────────────────┐ ┌─────────────────────────┐
|
|
/// │ Buffer │ │ Snapshots │ │ Persisting │
|
|
/// │ ┌───────────────────┐ │ │ │ │ │
|
|
/// │ │ ┌───────────────┐│ │ │ ┌───────────────────┐ │ │ ┌───────────────────┐ │
|
|
/// │ │ ┌┴──────────────┐│├─┼────────┼─┼─▶┌───────────────┐│ │ │ │ ┌───────────────┐│ │
|
|
/// │ │┌┴──────────────┐├┘│ │ │ │ ┌┴──────────────┐││ │ │ │ ┌┴──────────────┐││ │
|
|
/// │ ││ BufferBatch ├┘ │ │ │ │┌┴──────────────┐├┘│──┼──────┼─▶│┌┴──────────────┐├┘│ │
|
|
/// │ │└───────────────┘ │ │ ┌───┼─▶│ SnapshotBatch ├┘ │ │ │ ││ SnapshotBatch ├┘ │ │
|
|
/// │ └───────────────────┘ │ │ │ │└───────────────┘ │ │ │ │└───────────────┘ │ │
|
|
/// │ ... │ │ │ └───────────────────┘ │ │ └───────────────────┘ │
|
|
/// │ ┌───────────────────┐ │ │ │ │ │ │
|
|
/// │ │ ┌───────────────┐│ │ │ │ ... │ │ ... │
|
|
/// │ │ ┌┴──────────────┐││ │ │ │ │ │ │
|
|
/// │ │┌┴──────────────┐├┘│─┼────┘ │ ┌───────────────────┐ │ │ ┌───────────────────┐ │
|
|
/// │ ││ BufferBatch ├┘ │ │ │ │ ┌───────────────┐│ │ │ │ ┌───────────────┐│ │
|
|
/// │ │└───────────────┘ │ │ │ │ ┌┴──────────────┐││ │ │ │ ┌┴──────────────┐││ │
|
|
/// │ └───────────────────┘ │ │ │┌┴──────────────┐├┘│──┼──────┼─▶│┌┴──────────────┐├┘│ │
|
|
/// │ │ │ ││ SnapshotBatch ├┘ │ │ │ ││ SnapshotBatch ├┘ │ │
|
|
/// │ ... │ │ │└───────────────┘ │ │ │ │└───────────────┘ │ │
|
|
/// │ │ │ └───────────────────┘ │ │ └───────────────────┘ │
|
|
/// └────────────────────────┘ └────────────────────────┘ └─────────────────────────┘
|
|
#[derive(Debug, Default)]
|
|
struct DataBuffer {
|
|
/// Buffer of incoming writes
|
|
pub(crate) buffer: Option<BufferBatch>,
|
|
|
|
/// The max_persisted_sequence number for any parquet_file in this partition
|
|
pub(crate) max_persisted_sequence_number: Option<SequenceNumber>,
|
|
|
|
/// Buffer of tombstones whose time range may overlap with this partition.
|
|
/// All tombstones were already applied to corresponding snapshots. This list
|
|
/// only keep the ones that come during persisting. The reason
|
|
/// we keep them becasue if a query comes, we need to apply these tombstones
|
|
/// on the persiting data before sending it to the Querier
|
|
/// When the `persiting` is done and removed, this list will get empty, too
|
|
pub(crate) deletes_during_persisting: Vec<Tombstone>,
|
|
|
|
/// Data in `buffer` will be moved to a `snapshot` when one of these happens:
|
|
/// . A background persist is called
|
|
/// . A read request from Querier
|
|
/// The `buffer` will be empty when this happens.
|
|
pub(crate) snapshots: Vec<Arc<SnapshotBatch>>,
|
|
/// When a persist is called, data in `buffer` will be moved to a `snapshot`
|
|
/// and then all `snapshots` will be moved to a `persisting`.
|
|
/// Both `buffer` and 'snaphots` will be empty when this happens.
|
|
pub(crate) persisting: Option<Arc<PersistingBatch>>,
|
|
// Extra Notes:
|
|
// . In MVP, we will only persist a set of snapshots at a time.
|
|
// In later version, multiple perssiting operations may be happenning concurrently but
|
|
// their persisted info must be added into the Catalog in thier data
|
|
// ingesting order.
|
|
// . When a read request comes from a Querier, all data from `snaphots`
|
|
// and `persisting` must be sent to the Querier.
|
|
// . After the `persiting` data is persisted and successfully added
|
|
// into the Catalog, it will be removed from this Data Buffer.
|
|
// This data might be added into an extra cache to serve up to
|
|
// Queriers that may not have loaded the parquet files from object
|
|
// storage yet. But this will be decided after MVP.
|
|
}
|
|
|
|
impl DataBuffer {
|
|
/// Add a new tombstones into the DataBuffer
|
|
pub fn add_tombstone(&mut self, tombstone: Tombstone) {
|
|
// Only keep this tombstone if some data is being persisted
|
|
if self.persisting.is_some() {
|
|
self.deletes_during_persisting.push(tombstone);
|
|
}
|
|
}
|
|
|
|
/// Move `BufferBatch`es to a `SnapshotBatch`.
|
|
pub fn snapshot(&mut self) -> Result<(), mutable_batch::Error> {
|
|
let snapshot = self.copy_buffer_to_snapshot()?;
|
|
if let Some(snapshot) = snapshot {
|
|
self.snapshots.push(snapshot);
|
|
self.buffer = None;
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Returns snapshot of the buffer but keep data in the buffer
|
|
pub fn copy_buffer_to_snapshot(
|
|
&self,
|
|
) -> Result<Option<Arc<SnapshotBatch>>, mutable_batch::Error> {
|
|
if let Some(buf) = &self.buffer {
|
|
return Ok(Some(Arc::new(SnapshotBatch {
|
|
min_sequencer_number: buf.min_sequence_number,
|
|
max_sequencer_number: buf.max_sequence_number,
|
|
data: Arc::new(buf.data.to_arrow(Selection::All)?),
|
|
})));
|
|
}
|
|
|
|
Ok(None)
|
|
}
|
|
|
|
/// Snapshots the buffer and make a QueryableBatch for all the snapshots
|
|
/// Both buffer and snapshots will be empty after this
|
|
pub fn snapshot_to_queryable_batch(
|
|
&mut self,
|
|
table_name: &str,
|
|
tombstone: Option<Tombstone>,
|
|
) -> Option<QueryableBatch> {
|
|
self.snapshot()
|
|
.expect("This mutable batch snapshot error should be impossible.");
|
|
|
|
let mut data = vec![];
|
|
std::mem::swap(&mut data, &mut self.snapshots);
|
|
|
|
let mut tombstones = vec![];
|
|
if let Some(tombstone) = tombstone {
|
|
tombstones.push(tombstone);
|
|
}
|
|
|
|
// only produce batch if there is any data
|
|
if data.is_empty() {
|
|
None
|
|
} else {
|
|
Some(QueryableBatch::new(table_name, data, tombstones))
|
|
}
|
|
}
|
|
|
|
/// Returns all existing snapshots plus data in the buffer
|
|
/// This only read data. Data in the buffer will be kept in the buffer
|
|
pub fn buffer_and_snapshots(&self) -> Result<Vec<Arc<SnapshotBatch>>> {
|
|
// Existing snapshots
|
|
let mut snapshots = self.snapshots.clone();
|
|
|
|
// copy the buffer to a snapshot
|
|
let buffer_snapshot = self
|
|
.copy_buffer_to_snapshot()
|
|
.context(BufferToSnapshotSnafu)?;
|
|
snapshots.extend(buffer_snapshot);
|
|
|
|
Ok(snapshots)
|
|
}
|
|
|
|
/// Snapshots the buffer and moves snapshots over to the `PersistingBatch`.
|
|
///
|
|
/// # Panic
|
|
///
|
|
/// Panics if there is already a persisting batch.
|
|
pub fn snapshot_to_persisting(
|
|
&mut self,
|
|
sequencer_id: SequencerId,
|
|
table_id: TableId,
|
|
partition_id: PartitionId,
|
|
table_name: &str,
|
|
) -> Option<Arc<PersistingBatch>> {
|
|
if self.persisting.is_some() {
|
|
panic!("Unable to snapshot while persisting. This is an unexpected state.")
|
|
}
|
|
|
|
if let Some(queryable_batch) = self.snapshot_to_queryable_batch(table_name, None) {
|
|
let persisting_batch = Arc::new(PersistingBatch {
|
|
sequencer_id,
|
|
table_id,
|
|
partition_id,
|
|
object_store_id: Uuid::new_v4(),
|
|
data: Arc::new(queryable_batch),
|
|
});
|
|
|
|
self.persisting = Some(Arc::clone(&persisting_batch));
|
|
|
|
Some(persisting_batch)
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
|
|
/// Return a QueryableBatch of the persisting batch after applying new tombstones
|
|
pub fn get_persisting_data(&self) -> Option<QueryableBatch> {
|
|
let persisting = match &self.persisting {
|
|
Some(p) => p,
|
|
None => return None,
|
|
};
|
|
|
|
// persisting data
|
|
let mut queryable_batch = (*persisting.data).clone();
|
|
|
|
// Add new tombstones if any
|
|
queryable_batch.add_tombstones(&self.deletes_during_persisting);
|
|
|
|
Some(queryable_batch)
|
|
}
|
|
|
|
/// Return the progress in this DataBuffer
|
|
fn progress(&self) -> SequencerProgress {
|
|
let progress = SequencerProgress::new();
|
|
|
|
let progress = if let Some(buffer) = &self.buffer {
|
|
progress.combine(buffer.progress())
|
|
} else {
|
|
progress
|
|
};
|
|
|
|
let progress = self.snapshots.iter().fold(progress, |progress, snapshot| {
|
|
progress.combine(snapshot.progress())
|
|
});
|
|
|
|
if let Some(persisting) = &self.persisting {
|
|
persisting
|
|
.data
|
|
.data
|
|
.iter()
|
|
.fold(progress, |progress, snapshot| {
|
|
progress.combine(snapshot.progress())
|
|
})
|
|
} else {
|
|
progress
|
|
}
|
|
}
|
|
}
|
|
|
|
/// BufferBatch is a MutableBatch with its ingesting order, sequencer_number, that helps the
|
|
/// ingester keep the batches of data in their ingesting order
|
|
#[derive(Debug)]
|
|
pub struct BufferBatch {
|
|
/// Sequence number of the first write in this batch
|
|
pub(crate) min_sequence_number: SequenceNumber,
|
|
/// Sequence number of the last write in this batch
|
|
pub(crate) max_sequence_number: SequenceNumber,
|
|
/// Ingesting data
|
|
pub(crate) data: MutableBatch,
|
|
}
|
|
|
|
impl BufferBatch {
|
|
/// Return the progress in this DataBuffer
|
|
fn progress(&self) -> SequencerProgress {
|
|
SequencerProgress::new()
|
|
.with_buffered(self.min_sequence_number)
|
|
.with_buffered(self.max_sequence_number)
|
|
}
|
|
}
|
|
|
|
/// SnapshotBatch contains data of many contiguous BufferBatches
|
|
#[derive(Debug, PartialEq)]
|
|
pub struct SnapshotBatch {
|
|
/// Min sequencer number of its combined BufferBatches
|
|
pub(crate) min_sequencer_number: SequenceNumber,
|
|
/// Max sequencer number of its combined BufferBatches
|
|
pub(crate) max_sequencer_number: SequenceNumber,
|
|
/// Data of its combined BufferBatches kept in one RecordBatch
|
|
pub(crate) data: Arc<RecordBatch>,
|
|
}
|
|
|
|
impl SnapshotBatch {
|
|
/// Return only data of the given columns
|
|
pub fn scan(&self, selection: Selection<'_>) -> Result<Option<Arc<RecordBatch>>> {
|
|
Ok(match selection {
|
|
Selection::All => Some(Arc::clone(&self.data)),
|
|
Selection::Some(columns) => {
|
|
let schema = self.data.schema();
|
|
|
|
let indices = columns
|
|
.iter()
|
|
.filter_map(|&column_name| {
|
|
match schema.index_of(column_name) {
|
|
Ok(idx) => Some(idx),
|
|
_ => None, // this batch does not include data of this column_name
|
|
}
|
|
})
|
|
.collect::<Vec<_>>();
|
|
if indices.is_empty() {
|
|
None
|
|
} else {
|
|
Some(Arc::new(
|
|
self.data.project(&indices).context(FilterColumnSnafu {})?,
|
|
))
|
|
}
|
|
}
|
|
})
|
|
}
|
|
|
|
/// Return progress in this data
|
|
fn progress(&self) -> SequencerProgress {
|
|
SequencerProgress::new()
|
|
.with_buffered(self.min_sequencer_number)
|
|
.with_buffered(self.max_sequencer_number)
|
|
}
|
|
}
|
|
|
|
/// PersistingBatch contains all needed info and data for creating
|
|
/// a parquet file for given set of SnapshotBatches
|
|
#[derive(Debug, PartialEq, Clone)]
|
|
pub struct PersistingBatch {
|
|
/// Sequencer id of the data
|
|
pub(crate) sequencer_id: SequencerId,
|
|
|
|
/// Table id of the data
|
|
pub(crate) table_id: TableId,
|
|
|
|
/// Partition Id of the data
|
|
pub(crate) partition_id: PartitionId,
|
|
|
|
/// Id of to-be-created parquet file of this data
|
|
pub(crate) object_store_id: Uuid,
|
|
|
|
/// data
|
|
pub(crate) data: Arc<QueryableBatch>,
|
|
}
|
|
|
|
/// Queryable data used for both query and persistence
|
|
#[derive(Debug, PartialEq, Clone)]
|
|
pub struct QueryableBatch {
|
|
/// data
|
|
pub(crate) data: Vec<Arc<SnapshotBatch>>,
|
|
|
|
/// Delete predicates of the tombstones
|
|
pub(crate) delete_predicates: Vec<Arc<DeletePredicate>>,
|
|
|
|
/// This is needed to return a reference for a trait function
|
|
pub(crate) table_name: String,
|
|
}
|
|
|
|
/// Status of a partition that has unpersisted data.
|
|
///
|
|
/// Note that this structure is specific to a partition (which itself is bound to a table and
|
|
/// sequencer)!
|
|
#[derive(Debug, Clone)]
|
|
#[allow(missing_copy_implementations)]
|
|
pub struct PartitionStatus {
|
|
/// Max sequence number persisted
|
|
pub parquet_max_sequence_number: Option<SequenceNumber>,
|
|
|
|
/// Max sequence number for a tombstone
|
|
pub tombstone_max_sequence_number: Option<SequenceNumber>,
|
|
}
|
|
|
|
/// Response sending to the query service per its request defined in IngesterQueryRequest
|
|
pub struct IngesterQueryResponse {
|
|
/// Stream of RecordBatch results that match the requested query
|
|
pub data: SendableRecordBatchStream,
|
|
|
|
/// The schema of the record batches
|
|
pub schema: Schema,
|
|
|
|
/// Contains status for every partition that has unpersisted data.
|
|
///
|
|
/// If a partition does NOT appear within this map, then either all data was persisted or the
|
|
/// ingester has never seen data for this partition. In either case the querier may just read
|
|
/// all parquet files for the missing partition.
|
|
pub unpersisted_partitions: BTreeMap<PartitionId, PartitionStatus>,
|
|
|
|
/// Map each record batch to a partition ID.
|
|
pub batch_partition_ids: Vec<PartitionId>,
|
|
}
|
|
|
|
impl std::fmt::Debug for IngesterQueryResponse {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
f.debug_struct("IngesterQueryResponse")
|
|
.field("data", &"<RECORDBATCH STREAM>")
|
|
.field("schema", &self.schema)
|
|
.field("unpersisted_partitions", &self.unpersisted_partitions)
|
|
.field("batch_partition_ids", &self.batch_partition_ids)
|
|
.finish()
|
|
}
|
|
}
|
|
|
|
impl IngesterQueryResponse {
|
|
/// Make a response
|
|
pub fn new(
|
|
data: SendableRecordBatchStream,
|
|
schema: Schema,
|
|
unpersisted_partitions: BTreeMap<PartitionId, PartitionStatus>,
|
|
batch_partition_ids: Vec<PartitionId>,
|
|
) -> Self {
|
|
Self {
|
|
data,
|
|
schema,
|
|
unpersisted_partitions,
|
|
batch_partition_ids,
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
use crate::{
|
|
lifecycle::{LifecycleConfig, LifecycleManager},
|
|
partioning::DefaultPartitioner,
|
|
test_util::create_tombstone,
|
|
};
|
|
use arrow_util::assert_batches_sorted_eq;
|
|
use assert_matches::assert_matches;
|
|
use data_types::{
|
|
NamespaceSchema, NonEmptyString, ParquetFileParams, Sequence, TimestampRange,
|
|
};
|
|
use dml::{DmlDelete, DmlMeta, DmlWrite};
|
|
use futures::TryStreamExt;
|
|
use iox_catalog::{
|
|
interface::INITIAL_COMPACTION_LEVEL, mem::MemCatalog, validate_or_insert_schema,
|
|
};
|
|
use iox_time::Time;
|
|
use metric::{MetricObserver, Observation};
|
|
use mutable_batch_lp::{lines_to_batches, test_helpers::lp_to_mutable_batch};
|
|
use object_store::memory::InMemory;
|
|
use std::{ops::DerefMut, time::Duration};
|
|
|
|
#[test]
|
|
fn snapshot_empty_buffer_adds_no_snapshots() {
|
|
let mut data_buffer = DataBuffer::default();
|
|
|
|
data_buffer.snapshot().unwrap();
|
|
|
|
assert!(data_buffer.snapshots.is_empty());
|
|
}
|
|
|
|
#[test]
|
|
fn snapshot_buffer_batch_moves_to_snapshots() {
|
|
let mut data_buffer = DataBuffer::default();
|
|
|
|
let seq_num1 = SequenceNumber::new(1);
|
|
let (_, mutable_batch1) =
|
|
lp_to_mutable_batch(r#"foo,t1=asdf iv=1i,uv=774u,fv=1.0,bv=true,sv="hi" 1"#);
|
|
let buffer_batch1 = BufferBatch {
|
|
min_sequence_number: seq_num1,
|
|
max_sequence_number: seq_num1,
|
|
data: mutable_batch1,
|
|
};
|
|
let record_batch1 = buffer_batch1.data.to_arrow(Selection::All).unwrap();
|
|
data_buffer.buffer = Some(buffer_batch1);
|
|
|
|
data_buffer.snapshot().unwrap();
|
|
|
|
assert!(data_buffer.buffer.is_none());
|
|
assert_eq!(data_buffer.snapshots.len(), 1);
|
|
|
|
let snapshot = &data_buffer.snapshots[0];
|
|
assert_eq!(snapshot.min_sequencer_number, seq_num1);
|
|
assert_eq!(snapshot.max_sequencer_number, seq_num1);
|
|
assert_eq!(&*snapshot.data, &record_batch1);
|
|
}
|
|
|
|
#[test]
|
|
fn snapshot_buffer_different_but_compatible_schemas() {
|
|
let mut partition_data = PartitionData {
|
|
id: PartitionId::new(1),
|
|
data: Default::default(),
|
|
};
|
|
|
|
let seq_num1 = SequenceNumber::new(1);
|
|
// Missing tag `t1`
|
|
let (_, mut mutable_batch1) =
|
|
lp_to_mutable_batch(r#"foo iv=1i,uv=774u,fv=1.0,bv=true,sv="hi" 1"#);
|
|
partition_data
|
|
.buffer_write(seq_num1, mutable_batch1.clone())
|
|
.unwrap();
|
|
|
|
let seq_num2 = SequenceNumber::new(2);
|
|
// Missing field `iv`
|
|
let (_, mutable_batch2) =
|
|
lp_to_mutable_batch(r#"foo,t1=aoeu uv=1u,fv=12.0,bv=false,sv="bye" 10000"#);
|
|
|
|
partition_data
|
|
.buffer_write(seq_num2, mutable_batch2.clone())
|
|
.unwrap();
|
|
partition_data.data.snapshot().unwrap();
|
|
|
|
assert!(partition_data.data.buffer.is_none());
|
|
assert_eq!(partition_data.data.snapshots.len(), 1);
|
|
|
|
let snapshot = &partition_data.data.snapshots[0];
|
|
assert_eq!(snapshot.min_sequencer_number, seq_num1);
|
|
assert_eq!(snapshot.max_sequencer_number, seq_num2);
|
|
|
|
mutable_batch1.extend_from(&mutable_batch2).unwrap();
|
|
let combined_record_batch = mutable_batch1.to_arrow(Selection::All).unwrap();
|
|
assert_eq!(&*snapshot.data, &combined_record_batch);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn buffer_write_updates_lifecycle_manager_indicates_pause() {
|
|
let metrics = Arc::new(metric::Registry::new());
|
|
let catalog: Arc<dyn Catalog> = Arc::new(MemCatalog::new(Arc::clone(&metrics)));
|
|
let mut repos = catalog.repositories().await;
|
|
let kafka_topic = repos.kafka_topics().create_or_get("whatevs").await.unwrap();
|
|
let query_pool = repos.query_pools().create_or_get("whatevs").await.unwrap();
|
|
let kafka_partition = KafkaPartition::new(0);
|
|
let namespace = repos
|
|
.namespaces()
|
|
.create("foo", "inf", kafka_topic.id, query_pool.id)
|
|
.await
|
|
.unwrap();
|
|
let sequencer1 = repos
|
|
.sequencers()
|
|
.create_or_get(&kafka_topic, kafka_partition)
|
|
.await
|
|
.unwrap();
|
|
|
|
let mut sequencers = BTreeMap::new();
|
|
let kafka_partition = KafkaPartition::new(0);
|
|
sequencers.insert(
|
|
sequencer1.id,
|
|
SequencerData::new(kafka_partition, Arc::clone(&metrics)),
|
|
);
|
|
|
|
let object_store: Arc<DynObjectStore> = Arc::new(InMemory::new());
|
|
|
|
let data = Arc::new(IngesterData::new(
|
|
Arc::clone(&object_store),
|
|
Arc::clone(&catalog),
|
|
sequencers,
|
|
Arc::new(DefaultPartitioner::default()),
|
|
Arc::new(Executor::new(1)),
|
|
BackoffConfig::default(),
|
|
));
|
|
|
|
let schema = NamespaceSchema::new(namespace.id, kafka_topic.id, query_pool.id);
|
|
|
|
let ignored_ts = Time::from_timestamp_millis(42);
|
|
|
|
let w1 = DmlWrite::new(
|
|
"foo",
|
|
lines_to_batches("mem foo=1 10", 0).unwrap(),
|
|
DmlMeta::sequenced(Sequence::new(1, 1), ignored_ts, None, 50),
|
|
);
|
|
|
|
let _ = validate_or_insert_schema(w1.tables(), &schema, repos.deref_mut())
|
|
.await
|
|
.unwrap()
|
|
.unwrap();
|
|
|
|
std::mem::drop(repos);
|
|
let pause_size = w1.size() + 1;
|
|
let manager = LifecycleManager::new(
|
|
LifecycleConfig::new(
|
|
pause_size,
|
|
0,
|
|
0,
|
|
Duration::from_secs(1),
|
|
Duration::from_secs(1),
|
|
),
|
|
metrics,
|
|
Arc::new(SystemProvider::new()),
|
|
);
|
|
let should_pause = data
|
|
.buffer_operation(
|
|
sequencer1.id,
|
|
DmlOperation::Write(w1.clone()),
|
|
&manager.handle(),
|
|
)
|
|
.await
|
|
.unwrap();
|
|
assert!(!should_pause);
|
|
let should_pause = data
|
|
.buffer_operation(sequencer1.id, DmlOperation::Write(w1), &manager.handle())
|
|
.await
|
|
.unwrap();
|
|
assert!(should_pause);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn persist() {
|
|
let metrics = Arc::new(metric::Registry::new());
|
|
let catalog: Arc<dyn Catalog> = Arc::new(MemCatalog::new(Arc::clone(&metrics)));
|
|
let mut repos = catalog.repositories().await;
|
|
let kafka_topic = repos.kafka_topics().create_or_get("whatevs").await.unwrap();
|
|
let query_pool = repos.query_pools().create_or_get("whatevs").await.unwrap();
|
|
let kafka_partition = KafkaPartition::new(0);
|
|
let namespace = repos
|
|
.namespaces()
|
|
.create("foo", "inf", kafka_topic.id, query_pool.id)
|
|
.await
|
|
.unwrap();
|
|
let sequencer1 = repos
|
|
.sequencers()
|
|
.create_or_get(&kafka_topic, kafka_partition)
|
|
.await
|
|
.unwrap();
|
|
let sequencer2 = repos
|
|
.sequencers()
|
|
.create_or_get(&kafka_topic, kafka_partition)
|
|
.await
|
|
.unwrap();
|
|
let mut sequencers = BTreeMap::new();
|
|
sequencers.insert(
|
|
sequencer1.id,
|
|
SequencerData::new(sequencer1.kafka_partition, Arc::clone(&metrics)),
|
|
);
|
|
sequencers.insert(
|
|
sequencer2.id,
|
|
SequencerData::new(sequencer2.kafka_partition, Arc::clone(&metrics)),
|
|
);
|
|
|
|
let object_store: Arc<DynObjectStore> = Arc::new(InMemory::new());
|
|
|
|
let data = Arc::new(IngesterData::new(
|
|
Arc::clone(&object_store),
|
|
Arc::clone(&catalog),
|
|
sequencers,
|
|
Arc::new(DefaultPartitioner::default()),
|
|
Arc::new(Executor::new(1)),
|
|
BackoffConfig::default(),
|
|
));
|
|
|
|
let schema = NamespaceSchema::new(namespace.id, kafka_topic.id, query_pool.id);
|
|
|
|
let ignored_ts = Time::from_timestamp_millis(42);
|
|
|
|
let w1 = DmlWrite::new(
|
|
"foo",
|
|
lines_to_batches("mem foo=1 10", 0).unwrap(),
|
|
DmlMeta::sequenced(Sequence::new(1, 1), ignored_ts, None, 50),
|
|
);
|
|
// drop repos so the mem catalog won't deadlock.
|
|
let schema = validate_or_insert_schema(w1.tables(), &schema, repos.deref_mut())
|
|
.await
|
|
.unwrap()
|
|
.unwrap();
|
|
|
|
let w2 = DmlWrite::new(
|
|
"foo",
|
|
lines_to_batches("cpu foo=1 10", 1).unwrap(),
|
|
DmlMeta::sequenced(Sequence::new(2, 1), ignored_ts, None, 50),
|
|
);
|
|
let _ = validate_or_insert_schema(w2.tables(), &schema, repos.deref_mut())
|
|
.await
|
|
.unwrap()
|
|
.unwrap();
|
|
|
|
std::mem::drop(repos);
|
|
let w3 = DmlWrite::new(
|
|
"foo",
|
|
lines_to_batches("mem foo=1 30", 2).unwrap(),
|
|
DmlMeta::sequenced(Sequence::new(1, 2), ignored_ts, None, 50),
|
|
);
|
|
|
|
let manager = LifecycleManager::new(
|
|
LifecycleConfig::new(1, 0, 0, Duration::from_secs(1), Duration::from_secs(1)),
|
|
metrics,
|
|
Arc::new(SystemProvider::new()),
|
|
);
|
|
|
|
data.buffer_operation(sequencer1.id, DmlOperation::Write(w1), &manager.handle())
|
|
.await
|
|
.unwrap();
|
|
data.buffer_operation(sequencer2.id, DmlOperation::Write(w2), &manager.handle())
|
|
.await
|
|
.unwrap();
|
|
data.buffer_operation(sequencer1.id, DmlOperation::Write(w3), &manager.handle())
|
|
.await
|
|
.unwrap();
|
|
|
|
// check progresses
|
|
let progresses = data.progresses(vec![kafka_partition]).await;
|
|
let mut expected_progresses = BTreeMap::new();
|
|
expected_progresses.insert(
|
|
kafka_partition,
|
|
SequencerProgress::new()
|
|
.with_buffered(SequenceNumber::new(1))
|
|
.with_buffered(SequenceNumber::new(2)),
|
|
);
|
|
assert_eq!(progresses, expected_progresses);
|
|
|
|
let sd = data.sequencers.get(&sequencer1.id).unwrap();
|
|
let n = sd.namespace("foo").unwrap();
|
|
let partition_id;
|
|
let table_id;
|
|
{
|
|
let mem_table = n.table_data("mem").unwrap();
|
|
assert!(n.table_data("cpu").is_some());
|
|
let mem_table = mem_table.write().await;
|
|
let p = mem_table.partition_data.get("1970-01-01").unwrap();
|
|
|
|
table_id = mem_table.table_id;
|
|
partition_id = p.id;
|
|
}
|
|
{
|
|
// verify the partition doesn't have a sort key before any data has been persisted
|
|
let mut repos = catalog.repositories().await;
|
|
let partition_info = repos
|
|
.partitions()
|
|
.partition_info_by_id(partition_id)
|
|
.await
|
|
.unwrap()
|
|
.unwrap();
|
|
assert!(partition_info.partition.sort_key.is_none());
|
|
}
|
|
|
|
data.persist(partition_id).await;
|
|
|
|
// verify that a file got put into object store
|
|
let file_paths: Vec<_> = object_store
|
|
.list(None)
|
|
.await
|
|
.unwrap()
|
|
.try_collect()
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(file_paths.len(), 1);
|
|
|
|
let mut repos = catalog.repositories().await;
|
|
// verify it put the record in the catalog
|
|
let parquet_files = repos
|
|
.parquet_files()
|
|
.list_by_sequencer_greater_than(sequencer1.id, SequenceNumber::new(0))
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(parquet_files.len(), 1);
|
|
let pf = parquet_files.first().unwrap();
|
|
assert_eq!(pf.partition_id, partition_id);
|
|
assert_eq!(pf.table_id, table_id);
|
|
assert_eq!(pf.min_time, Timestamp::new(10));
|
|
assert_eq!(pf.max_time, Timestamp::new(30));
|
|
assert_eq!(pf.min_sequence_number, SequenceNumber::new(1));
|
|
assert_eq!(pf.max_sequence_number, SequenceNumber::new(2));
|
|
assert_eq!(pf.sequencer_id, sequencer1.id);
|
|
assert!(pf.to_delete.is_none());
|
|
|
|
// verify it set a sort key on the partition in the catalog
|
|
let partition_info = repos
|
|
.partitions()
|
|
.partition_info_by_id(partition_id)
|
|
.await
|
|
.unwrap()
|
|
.unwrap();
|
|
assert_eq!(partition_info.partition.sort_key.unwrap(), "time");
|
|
|
|
let mem_table = n.table_data("mem").unwrap();
|
|
let mem_table = mem_table.read().await;
|
|
|
|
// verify that the parquet_max_sequence_number got updated
|
|
assert_eq!(
|
|
mem_table.parquet_max_sequence_number(),
|
|
Some(SequenceNumber::new(2))
|
|
);
|
|
|
|
// check progresses after persist
|
|
let progresses = data.progresses(vec![kafka_partition]).await;
|
|
let mut expected_progresses = BTreeMap::new();
|
|
expected_progresses.insert(
|
|
kafka_partition,
|
|
SequencerProgress::new()
|
|
.with_buffered(SequenceNumber::new(1))
|
|
.with_persisted(SequenceNumber::new(2)),
|
|
);
|
|
assert_eq!(progresses, expected_progresses);
|
|
}
|
|
|
|
// Test deletes mixed with writes on a single parittion
|
|
#[tokio::test]
|
|
async fn writes_and_deletes() {
|
|
// Make a partition with empty DataBuffer
|
|
let s_id = 1;
|
|
let t_id = 1;
|
|
let p_id = 1;
|
|
let table_name = "restaurant";
|
|
let mut p = PartitionData::new(PartitionId::new(p_id));
|
|
let exec = Executor::new(1);
|
|
|
|
// ------------------------------------------
|
|
// Fill `buffer`
|
|
// --- seq_num: 1
|
|
let (_, mb) = lp_to_mutable_batch(r#"restaurant,city=Boston day="fri",temp=50 10"#);
|
|
p.buffer_write(SequenceNumber::new(1), mb).unwrap();
|
|
|
|
// --- seq_num: 2
|
|
let (_, mb) = lp_to_mutable_batch(r#"restaurant,city=Andover day="thu",temp=44 15"#);
|
|
|
|
p.buffer_write(SequenceNumber::new(2), mb).unwrap();
|
|
|
|
// verify data
|
|
assert_eq!(
|
|
p.data.buffer.as_ref().unwrap().min_sequence_number,
|
|
SequenceNumber::new(1)
|
|
);
|
|
assert_eq!(
|
|
p.data.buffer.as_ref().unwrap().max_sequence_number,
|
|
SequenceNumber::new(2)
|
|
);
|
|
assert_eq!(p.data.snapshots.len(), 0);
|
|
assert_eq!(p.data.deletes_during_persisting.len(), 0);
|
|
assert_eq!(p.data.persisting, None);
|
|
|
|
// ------------------------------------------
|
|
// Delete
|
|
// --- seq_num: 3
|
|
let ts = create_tombstone(
|
|
1, // tombstone id
|
|
t_id, // table id
|
|
s_id, // sequencer id
|
|
3, // delete's seq_number
|
|
0, // min time of data to get deleted
|
|
20, // max time of data to get deleted
|
|
"day=thu", // delete predicate
|
|
);
|
|
// one row will get deleted, the other is moved to snapshot
|
|
p.buffer_tombstone(&exec, "restaurant", ts).await;
|
|
|
|
// verify data
|
|
assert!(p.data.buffer.is_none()); // always empty after delete
|
|
assert_eq!(p.data.snapshots.len(), 1); // one snpashot if there is data
|
|
assert_eq!(p.data.deletes_during_persisting.len(), 0);
|
|
assert_eq!(p.data.persisting, None);
|
|
// snapshot only has one row since the other one got deleted
|
|
let data = (*p.data.snapshots[0].data).clone();
|
|
let expected = vec![
|
|
"+--------+-----+------+--------------------------------+",
|
|
"| city | day | temp | time |",
|
|
"+--------+-----+------+--------------------------------+",
|
|
"| Boston | fri | 50 | 1970-01-01T00:00:00.000000010Z |",
|
|
"+--------+-----+------+--------------------------------+",
|
|
];
|
|
assert_batches_sorted_eq!(&expected, &[data]);
|
|
assert_eq!(p.data.snapshots[0].min_sequencer_number.get(), 1);
|
|
assert_eq!(p.data.snapshots[0].max_sequencer_number.get(), 3);
|
|
|
|
// ------------------------------------------
|
|
// Fill `buffer`
|
|
// --- seq_num: 4
|
|
let (_, mb) = lp_to_mutable_batch(
|
|
r#"
|
|
restaurant,city=Medford day="sun",temp=55 22
|
|
restaurant,city=Boston day="sun",temp=57 24
|
|
"#,
|
|
);
|
|
p.buffer_write(SequenceNumber::new(4), mb).unwrap();
|
|
|
|
// --- seq_num: 5
|
|
let (_, mb) = lp_to_mutable_batch(r#"restaurant,city=Andover day="tue",temp=56 30"#);
|
|
|
|
p.buffer_write(SequenceNumber::new(5), mb).unwrap();
|
|
|
|
// verify data
|
|
assert_eq!(
|
|
p.data.buffer.as_ref().unwrap().min_sequence_number,
|
|
SequenceNumber::new(4)
|
|
);
|
|
assert_eq!(
|
|
p.data.buffer.as_ref().unwrap().max_sequence_number,
|
|
SequenceNumber::new(5)
|
|
);
|
|
assert_eq!(p.data.snapshots.len(), 1); // existing sanpshot
|
|
assert_eq!(p.data.deletes_during_persisting.len(), 0);
|
|
assert_eq!(p.data.persisting, None);
|
|
|
|
// ------------------------------------------
|
|
// Delete
|
|
// --- seq_num: 6
|
|
let ts = create_tombstone(
|
|
2, // tombstone id
|
|
t_id, // table id
|
|
s_id, // sequencer id
|
|
6, // delete's seq_number
|
|
10, // min time of data to get deleted
|
|
50, // max time of data to get deleted
|
|
"city=Boston", // delete predicate
|
|
);
|
|
// two rows will get deleted, one from existing snapshot, one from the buffer being moved to snpashot
|
|
p.buffer_tombstone(&exec, "restaurant", ts).await;
|
|
|
|
// verify data
|
|
assert!(p.data.buffer.is_none()); // always empty after delete
|
|
assert_eq!(p.data.snapshots.len(), 1); // one snpashot
|
|
assert_eq!(p.data.deletes_during_persisting.len(), 0);
|
|
assert_eq!(p.data.persisting, None);
|
|
// snapshot only has two rows since the other 2 rows with city=Boston have got deleted
|
|
let data = (*p.data.snapshots[0].data).clone();
|
|
let expected = vec![
|
|
"+---------+-----+------+--------------------------------+",
|
|
"| city | day | temp | time |",
|
|
"+---------+-----+------+--------------------------------+",
|
|
"| Andover | tue | 56 | 1970-01-01T00:00:00.000000030Z |",
|
|
"| Medford | sun | 55 | 1970-01-01T00:00:00.000000022Z |",
|
|
"+---------+-----+------+--------------------------------+",
|
|
];
|
|
assert_batches_sorted_eq!(&expected, &[data]);
|
|
assert_eq!(p.data.snapshots[0].min_sequencer_number.get(), 1);
|
|
assert_eq!(p.data.snapshots[0].max_sequencer_number.get(), 6);
|
|
|
|
// ------------------------------------------
|
|
// Persisting
|
|
let p_batch = p
|
|
.snapshot_to_persisting_batch(
|
|
SequencerId::new(s_id),
|
|
TableId::new(t_id),
|
|
PartitionId::new(p_id),
|
|
table_name,
|
|
)
|
|
.unwrap();
|
|
|
|
// verify data
|
|
assert!(p.data.buffer.is_none()); // always empty after issuing persit
|
|
assert_eq!(p.data.snapshots.len(), 0); // always empty after issuing persit
|
|
assert_eq!(p.data.deletes_during_persisting.len(), 0); // deletes not happen yet
|
|
assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch)));
|
|
|
|
// ------------------------------------------
|
|
// Delete
|
|
// --- seq_num: 7
|
|
let ts = create_tombstone(
|
|
3, // tombstone id
|
|
t_id, // table id
|
|
s_id, // sequencer id
|
|
7, // delete's seq_number
|
|
10, // min time of data to get deleted
|
|
50, // max time of data to get deleted
|
|
"temp=55", // delete predicate
|
|
);
|
|
// if a query come while persisting, the row with temp=55 will be deleted before
|
|
// data is sent back to Querier
|
|
p.buffer_tombstone(&exec, "restaurant", ts).await;
|
|
|
|
// verify data
|
|
assert!(p.data.buffer.is_none()); // always empty after delete
|
|
// no snpashots becasue buffer has not data yet and the sanpshot was empty too
|
|
assert_eq!(p.data.snapshots.len(), 0);
|
|
assert_eq!(p.data.deletes_during_persisting.len(), 1); // tombstone added since data is persisting
|
|
assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch)));
|
|
|
|
// ------------------------------------------
|
|
// Fill `buffer`
|
|
// --- seq_num: 8
|
|
let (_, mb) = lp_to_mutable_batch(
|
|
r#"
|
|
restaurant,city=Wilmington day="sun",temp=55 35
|
|
restaurant,city=Boston day="sun",temp=60 36
|
|
restaurant,city=Boston day="sun",temp=62 38
|
|
"#,
|
|
);
|
|
p.buffer_write(SequenceNumber::new(8), mb).unwrap();
|
|
|
|
// verify data
|
|
assert_eq!(
|
|
p.data.buffer.as_ref().unwrap().min_sequence_number,
|
|
SequenceNumber::new(8)
|
|
); // 1 newlly added mutable batch of 3 rows of data
|
|
assert_eq!(p.data.snapshots.len(), 0); // still empty
|
|
assert_eq!(p.data.deletes_during_persisting.len(), 1);
|
|
assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch)));
|
|
|
|
// ------------------------------------------
|
|
// Take snaphot of the `buffer`
|
|
p.snapshot().unwrap();
|
|
// verify data
|
|
assert!(p.data.buffer.is_none()); // empty after snaphot
|
|
assert_eq!(p.data.snapshots.len(), 1); // data moved from buffer
|
|
assert_eq!(p.data.deletes_during_persisting.len(), 1);
|
|
assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch)));
|
|
// snapshot has three rows moved from buffer
|
|
let data = (*p.data.snapshots[0].data).clone();
|
|
let expected = vec![
|
|
"+------------+-----+------+--------------------------------+",
|
|
"| city | day | temp | time |",
|
|
"+------------+-----+------+--------------------------------+",
|
|
"| Wilmington | sun | 55 | 1970-01-01T00:00:00.000000035Z |",
|
|
"| Boston | sun | 60 | 1970-01-01T00:00:00.000000036Z |",
|
|
"| Boston | sun | 62 | 1970-01-01T00:00:00.000000038Z |",
|
|
"+------------+-----+------+--------------------------------+",
|
|
];
|
|
assert_batches_sorted_eq!(&expected, &[data]);
|
|
assert_eq!(p.data.snapshots[0].min_sequencer_number.get(), 8);
|
|
assert_eq!(p.data.snapshots[0].max_sequencer_number.get(), 8);
|
|
|
|
// ------------------------------------------
|
|
// Delete
|
|
// --- seq_num: 9
|
|
let ts = create_tombstone(
|
|
4, // tombstone id
|
|
t_id, // table id
|
|
s_id, // sequencer id
|
|
9, // delete's seq_number
|
|
10, // min time of data to get deleted
|
|
50, // max time of data to get deleted
|
|
"temp=60", // delete predicate
|
|
);
|
|
// the row with temp=60 will be removed from the sanphot
|
|
p.buffer_tombstone(&exec, "restaurant", ts).await;
|
|
|
|
// verify data
|
|
assert!(p.data.buffer.is_none()); // always empty after delete
|
|
assert_eq!(p.data.snapshots.len(), 1); // new snapshot of the existing with delete applied
|
|
assert_eq!(p.data.deletes_during_persisting.len(), 2); // one more tombstone added make it 2
|
|
assert_eq!(p.data.persisting, Some(Arc::clone(&p_batch)));
|
|
// snapshot has only 2 rows becasue the row with tem=60 was removed
|
|
let data = (*p.data.snapshots[0].data).clone();
|
|
let expected = vec![
|
|
"+------------+-----+------+--------------------------------+",
|
|
"| city | day | temp | time |",
|
|
"+------------+-----+------+--------------------------------+",
|
|
"| Wilmington | sun | 55 | 1970-01-01T00:00:00.000000035Z |",
|
|
"| Boston | sun | 62 | 1970-01-01T00:00:00.000000038Z |",
|
|
"+------------+-----+------+--------------------------------+",
|
|
];
|
|
assert_batches_sorted_eq!(&expected, &[data]);
|
|
assert_eq!(p.data.snapshots[0].min_sequencer_number.get(), 8);
|
|
assert_eq!(p.data.snapshots[0].max_sequencer_number.get(), 9);
|
|
|
|
exec.join().await;
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn buffer_operation_ignores_already_persisted_data() {
|
|
let metrics = Arc::new(metric::Registry::new());
|
|
let catalog: Arc<dyn Catalog> = Arc::new(MemCatalog::new(Arc::clone(&metrics)));
|
|
let mut repos = catalog.repositories().await;
|
|
let kafka_topic = repos.kafka_topics().create_or_get("whatevs").await.unwrap();
|
|
let query_pool = repos.query_pools().create_or_get("whatevs").await.unwrap();
|
|
let kafka_partition = KafkaPartition::new(0);
|
|
let namespace = repos
|
|
.namespaces()
|
|
.create("foo", "inf", kafka_topic.id, query_pool.id)
|
|
.await
|
|
.unwrap();
|
|
let sequencer = repos
|
|
.sequencers()
|
|
.create_or_get(&kafka_topic, kafka_partition)
|
|
.await
|
|
.unwrap();
|
|
|
|
let schema = NamespaceSchema::new(namespace.id, kafka_topic.id, query_pool.id);
|
|
|
|
let ignored_ts = Time::from_timestamp_millis(42);
|
|
|
|
let w1 = DmlWrite::new(
|
|
"foo",
|
|
lines_to_batches("mem foo=1 10", 0).unwrap(),
|
|
DmlMeta::sequenced(Sequence::new(1, 1), ignored_ts, None, 50),
|
|
);
|
|
let w2 = DmlWrite::new(
|
|
"foo",
|
|
lines_to_batches("mem foo=1 10", 0).unwrap(),
|
|
DmlMeta::sequenced(Sequence::new(1, 2), ignored_ts, None, 50),
|
|
);
|
|
|
|
let _ = validate_or_insert_schema(w1.tables(), &schema, repos.deref_mut())
|
|
.await
|
|
.unwrap()
|
|
.unwrap();
|
|
|
|
// create some persisted state
|
|
let table = repos
|
|
.tables()
|
|
.create_or_get("mem", namespace.id)
|
|
.await
|
|
.unwrap();
|
|
let partition = repos
|
|
.partitions()
|
|
.create_or_get("1970-01-01", sequencer.id, table.id)
|
|
.await
|
|
.unwrap();
|
|
let partition2 = repos
|
|
.partitions()
|
|
.create_or_get("1970-01-02", sequencer.id, table.id)
|
|
.await
|
|
.unwrap();
|
|
|
|
let parquet_file_params = ParquetFileParams {
|
|
sequencer_id: sequencer.id,
|
|
namespace_id: namespace.id,
|
|
table_id: table.id,
|
|
partition_id: partition.id,
|
|
object_store_id: Uuid::new_v4(),
|
|
min_sequence_number: SequenceNumber::new(0),
|
|
max_sequence_number: SequenceNumber::new(1),
|
|
min_time: Timestamp::new(1),
|
|
max_time: Timestamp::new(1),
|
|
file_size_bytes: 0,
|
|
parquet_metadata: vec![],
|
|
row_count: 0,
|
|
compaction_level: INITIAL_COMPACTION_LEVEL,
|
|
created_at: Timestamp::new(1),
|
|
};
|
|
repos
|
|
.parquet_files()
|
|
.create(parquet_file_params.clone())
|
|
.await
|
|
.unwrap();
|
|
|
|
// now create a parquet file in another partition with a much higher sequence persisted
|
|
// sequence number. We want to make sure that this doesn't cause our write in the other
|
|
// partition to get ignored.
|
|
let other_file_params = ParquetFileParams {
|
|
min_sequence_number: SequenceNumber::new(12),
|
|
max_sequence_number: SequenceNumber::new(15),
|
|
object_store_id: Uuid::new_v4(),
|
|
partition_id: partition2.id,
|
|
..parquet_file_params
|
|
};
|
|
repos
|
|
.parquet_files()
|
|
.create(other_file_params)
|
|
.await
|
|
.unwrap();
|
|
std::mem::drop(repos);
|
|
|
|
let manager = LifecycleManager::new(
|
|
LifecycleConfig::new(1, 0, 0, Duration::from_secs(1), Duration::from_secs(1)),
|
|
Arc::clone(&metrics),
|
|
Arc::new(SystemProvider::new()),
|
|
);
|
|
let partitioner = DefaultPartitioner::default();
|
|
let exec = Executor::new(1);
|
|
|
|
let data = NamespaceData::new(namespace.id, &*metrics);
|
|
|
|
// w1 should be ignored so it shouldn't be present in the buffer
|
|
let should_pause = data
|
|
.buffer_operation(
|
|
DmlOperation::Write(w1),
|
|
sequencer.id,
|
|
catalog.as_ref(),
|
|
&manager.handle(),
|
|
&partitioner,
|
|
&exec,
|
|
)
|
|
.await
|
|
.unwrap();
|
|
{
|
|
let table_data = data.table_data("mem").unwrap();
|
|
let table = table_data.read().await;
|
|
let p = table.partition_data.get("1970-01-01").unwrap();
|
|
assert_eq!(
|
|
p.data.max_persisted_sequence_number,
|
|
Some(SequenceNumber::new(1))
|
|
);
|
|
assert!(p.data.buffer.is_none());
|
|
}
|
|
assert!(!should_pause);
|
|
|
|
// w2 should be in the buffer
|
|
data.buffer_operation(
|
|
DmlOperation::Write(w2),
|
|
sequencer.id,
|
|
catalog.as_ref(),
|
|
&manager.handle(),
|
|
&partitioner,
|
|
&exec,
|
|
)
|
|
.await
|
|
.unwrap();
|
|
|
|
let table_data = data.table_data("mem").unwrap();
|
|
let table = table_data.read().await;
|
|
let partition = table.partition_data.get("1970-01-01").unwrap();
|
|
assert_eq!(
|
|
partition.data.buffer.as_ref().unwrap().min_sequence_number,
|
|
SequenceNumber::new(2)
|
|
);
|
|
|
|
assert_matches!(data.table_count.observe(), Observation::U64Counter(v) => {
|
|
assert_eq!(v, 1, "unexpected table count metric value");
|
|
});
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn buffer_deletes_updates_tombstone_watermark() {
|
|
let metrics = Arc::new(metric::Registry::new());
|
|
let catalog: Arc<dyn Catalog> = Arc::new(MemCatalog::new(Arc::clone(&metrics)));
|
|
let mut repos = catalog.repositories().await;
|
|
let kafka_topic = repos.kafka_topics().create_or_get("whatevs").await.unwrap();
|
|
let query_pool = repos.query_pools().create_or_get("whatevs").await.unwrap();
|
|
let kafka_partition = KafkaPartition::new(0);
|
|
let namespace = repos
|
|
.namespaces()
|
|
.create("foo", "inf", kafka_topic.id, query_pool.id)
|
|
.await
|
|
.unwrap();
|
|
let sequencer1 = repos
|
|
.sequencers()
|
|
.create_or_get(&kafka_topic, kafka_partition)
|
|
.await
|
|
.unwrap();
|
|
|
|
let mut sequencers = BTreeMap::new();
|
|
let kafka_partition = KafkaPartition::new(0);
|
|
sequencers.insert(
|
|
sequencer1.id,
|
|
SequencerData::new(kafka_partition, Arc::clone(&metrics)),
|
|
);
|
|
|
|
let object_store: Arc<DynObjectStore> = Arc::new(InMemory::new());
|
|
|
|
let data = Arc::new(IngesterData::new(
|
|
Arc::clone(&object_store),
|
|
Arc::clone(&catalog),
|
|
sequencers,
|
|
Arc::new(DefaultPartitioner::default()),
|
|
Arc::new(Executor::new(1)),
|
|
BackoffConfig::default(),
|
|
));
|
|
|
|
let schema = NamespaceSchema::new(namespace.id, kafka_topic.id, query_pool.id);
|
|
|
|
let ignored_ts = Time::from_timestamp_millis(42);
|
|
|
|
let w1 = DmlWrite::new(
|
|
"foo",
|
|
lines_to_batches("mem foo=1 10", 0).unwrap(),
|
|
DmlMeta::sequenced(Sequence::new(1, 1), ignored_ts, None, 50),
|
|
);
|
|
|
|
let _ = validate_or_insert_schema(w1.tables(), &schema, repos.deref_mut())
|
|
.await
|
|
.unwrap()
|
|
.unwrap();
|
|
|
|
std::mem::drop(repos);
|
|
let pause_size = w1.size() + 1;
|
|
let manager = LifecycleManager::new(
|
|
LifecycleConfig::new(
|
|
pause_size,
|
|
0,
|
|
0,
|
|
Duration::from_secs(1),
|
|
Duration::from_secs(1),
|
|
),
|
|
metrics,
|
|
Arc::new(SystemProvider::new()),
|
|
);
|
|
data.buffer_operation(
|
|
sequencer1.id,
|
|
DmlOperation::Write(w1.clone()),
|
|
&manager.handle(),
|
|
)
|
|
.await
|
|
.unwrap();
|
|
|
|
assert_eq!(
|
|
data.sequencer(sequencer1.id)
|
|
.unwrap()
|
|
.namespace(&namespace.name)
|
|
.unwrap()
|
|
.table_data("mem")
|
|
.unwrap()
|
|
.read()
|
|
.await
|
|
.tombstone_max_sequence_number(),
|
|
None,
|
|
);
|
|
|
|
let predicate = DeletePredicate {
|
|
range: TimestampRange::new(1, 2),
|
|
exprs: vec![],
|
|
};
|
|
let d1 = DmlDelete::new(
|
|
"foo",
|
|
predicate,
|
|
Some(NonEmptyString::new("mem").unwrap()),
|
|
DmlMeta::sequenced(Sequence::new(1, 2), ignored_ts, None, 1337),
|
|
);
|
|
data.buffer_operation(sequencer1.id, DmlOperation::Delete(d1), &manager.handle())
|
|
.await
|
|
.unwrap();
|
|
|
|
assert_eq!(
|
|
data.sequencer(sequencer1.id)
|
|
.unwrap()
|
|
.namespace(&namespace.name)
|
|
.unwrap()
|
|
.table_data("mem")
|
|
.unwrap()
|
|
.read()
|
|
.await
|
|
.tombstone_max_sequence_number(),
|
|
Some(SequenceNumber::new(2)),
|
|
);
|
|
}
|
|
}
|