feat: Add basic Edge server structure (#24552)

* WIP: basic influxdb3 command and http server

* WIP: write lp, buffer, query out

* WIP: test write & query on influxdb3_server, fix warnings

* WIP: pull write buffer and catalog into separate crate

* WIP: sketch out types used for write: buffer, wal, persister

* WIP: remove a bunch of old IOx stuff and fmt
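
The flow these notes describe (write line protocol in, buffer it, query it back out) looks roughly like this from a client's point of view. This is a hedged sketch only: the port and the `/api/v3/...` endpoint paths are illustrative assumptions, not taken from this diff, and the `reqwest`/`tokio` crates are assumed.

// Hypothetical client flow: write line protocol, then query it back.
// Endpoint paths and port are assumptions, not read from this commit.
#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let client = reqwest::Client::new();
    // Write one line-protocol point (endpoint path is an assumption).
    client
        .post("http://localhost:8181/api/v3/write_lp?db=mydb")
        .body("cpu,host=a usage=0.5 1704067200000000000")
        .send()
        .await?
        .error_for_status()?;
    // Query it back out with SQL (endpoint path is an assumption).
    let body = client
        .get("http://localhost:8181/api/v3/query_sql")
        .query(&[("db", "mydb"), ("q", "select * from cpu")])
        .send()
        .await?
        .text()
        .await?;
    println!("{body}");
    Ok(())
}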
Paul Dix 2024-01-08 11:50:59 -05:00 committed by GitHub
parent acfef87659
commit 5831cf8cee
628 changed files with 3381 additions and 177231 deletions

Cargo.lock (generated; 1207 changed lines): diff suppressed because it is too large.

@@ -2,42 +2,30 @@
# In alphabetical order
members = [
"arrow_util",
"authz",
"backoff",
"cache_system",
"clap_blocks",
"client_util",
"compactor_test_utils",
"compactor",
"compactor_scheduler",
"data_types",
"datafusion_util",
"dml",
"executor",
"flightsql",
"garbage_collector",
"generated_types",
"gossip",
"gossip_compaction",
"gossip_parquet_file",
"gossip_schema",
"grpc-binary-logger-proto",
"grpc-binary-logger-test-proto",
"grpc-binary-logger",
"import_export",
"influxdb_influxql_parser",
"influxdb_iox_client",
"influxdb_iox",
"influxdb_line_protocol",
"influxdb_storage_client",
"influxdb_tsm",
"influxdb2_client",
"influxdb3",
"influxdb3_server",
"influxdb3_write",
"influxrpc_parser",
"ingest_structure",
"ingester_query_grpc",
"ingester_query_client",
"ingester_test_ctx",
"ingester",
"iox_catalog",
"iox_data_generator",
"iox_query_influxql",
@@ -46,11 +34,6 @@ members = [
"iox_tests",
"iox_time",
"ioxd_common",
"ioxd_compactor",
"ioxd_garbage_collector",
"ioxd_ingester",
"ioxd_querier",
"ioxd_router",
"ioxd_test",
"logfmt",
"metric_exporters",
@@ -65,23 +48,14 @@ members = [
"parquet_file",
"parquet_to_line_protocol",
"predicate",
"querier",
"query_functions",
"router",
"schema",
"service_common",
"service_grpc_catalog",
"service_grpc_flight",
"service_grpc_influxrpc",
"service_grpc_namespace",
"service_grpc_object_store",
"service_grpc_schema",
"service_grpc_table",
"service_grpc_testing",
"sharder",
"sqlx-hotswap-pool",
"test_helpers_end_to_end",
"test_helpers",
"tokio_metrics_bridge",
"trace_exporters",
"trace_http",
@@ -92,7 +66,7 @@ members = [
"wal",
"workspace-hack",
]
default-members = ["influxdb_iox"]
default-members = ["influxdb3"]
resolver = "2"


@@ -1,44 +0,0 @@
[package]
name = "compactor"
version.workspace = true
authors.workspace = true
edition.workspace = true
license.workspace = true
[dependencies]
async-trait = "0.1.73"
backoff = { path = "../backoff" }
bytes = "1.5"
chrono = { version = "0.4", default-features = false }
compactor_scheduler = { path = "../compactor_scheduler" }
data_types = { path = "../data_types" }
datafusion = { workspace = true }
futures = "0.3"
generated_types = { version = "0.1.0", path = "../generated_types" }
gossip = { version = "0.1.0", path = "../gossip" }
gossip_compaction = { version = "0.1.0", path = "../gossip_compaction" }
iox_catalog = { path = "../iox_catalog" }
iox_query = { path = "../iox_query" }
iox_time = { path = "../iox_time" }
itertools = "0.11.0"
metric = { path = "../metric" }
object_store = { workspace = true }
observability_deps = { path = "../observability_deps" }
parking_lot = "0.12.1"
parquet_file = { path = "../parquet_file" }
rand = "0.8.3"
schema = { path = "../schema" }
tokio = { version = "1", features = ["macros", "rt", "sync"] }
tokio-util = { version = "0.7.9" }
trace = { version = "0.1.0", path = "../trace" }
tracker = { path = "../tracker" }
uuid = { version = "1", features = ["v4"] }
workspace-hack = { version = "0.1", path = "../workspace-hack" }
[dev-dependencies]
arrow_util = { path = "../arrow_util" }
assert_matches = "1"
compactor_test_utils = { path = "../compactor_test_utils" }
iox_tests = { path = "../iox_tests" }
test_helpers = { path = "../test_helpers" }
insta = { version = "1.32.0", features = ["yaml"] }

File diff suppressed because one or more lines are too long

[Deleted image file, 210 KiB]

@@ -1,121 +0,0 @@
//! Main compactor entry point.
use std::sync::Arc;
use futures::{
future::{BoxFuture, Shared},
FutureExt, TryFutureExt,
};
use generated_types::influxdata::iox::gossip::{v1::CompactionEvent, Topic};
use gossip::{NopDispatcher, TopicInterests};
use observability_deps::tracing::{info, warn};
use tokio::task::{JoinError, JoinHandle};
use tokio_util::sync::CancellationToken;
use tracker::AsyncSemaphoreMetrics;
use crate::{
components::{
hardcoded::hardcoded_components,
report::{log_components, log_config},
},
config::Config,
driver::compact,
};
/// A [`JoinHandle`] that can be cloned
type SharedJoinHandle = Shared<BoxFuture<'static, Result<(), Arc<JoinError>>>>;
/// Convert a [`JoinHandle`] into a [`SharedJoinHandle`].
fn shared_handle(handle: JoinHandle<()>) -> SharedJoinHandle {
handle.map_err(Arc::new).boxed().shared()
}
/// Main compactor driver.
#[derive(Debug)]
pub struct Compactor {
shutdown: CancellationToken,
worker: SharedJoinHandle,
}
impl Compactor {
/// Start compactor.
pub async fn start(config: Config) -> Self {
info!("compactor starting");
log_config(&config);
let shutdown = CancellationToken::new();
let shutdown_captured = shutdown.clone();
let components = hardcoded_components(&config);
log_components(&components);
let semaphore_metrics = Arc::new(AsyncSemaphoreMetrics::new(
&config.metric_registry,
&[("semaphore", "job")],
));
let df_semaphore = Arc::new(semaphore_metrics.new_semaphore(config.df_concurrency.get()));
// Initialise the gossip subsystem, if configured.
let gossip = match config.gossip_bind_address {
Some(bind) => {
// Initialise the gossip subsystem.
let handle = gossip::Builder::<_, Topic>::new(
config.gossip_seeds,
NopDispatcher,
Arc::clone(&config.metric_registry),
)
// Configure the compactor to subscribe to no topics - it
// currently only sends events.
.with_topic_filter(TopicInterests::default())
.bind(bind)
.await
.expect("failed to start gossip reactor");
let event_tx =
gossip_compaction::tx::CompactionEventTx::<CompactionEvent>::new(handle);
Some(Arc::new(event_tx))
}
None => None,
};
let worker = tokio::spawn(async move {
tokio::select! {
_ = shutdown_captured.cancelled() => {}
_ = async {
compact(
config.trace_collector,
config.partition_concurrency,
config.partition_timeout,
Arc::clone(&df_semaphore),
&components,
gossip,
).await;
info!("compactor done");
} => {}
}
});
let worker = shared_handle(worker);
Self { shutdown, worker }
}
/// Trigger shutdown. You should [join](Self::join) afterwards.
pub fn shutdown(&self) {
info!("compactor shutting down");
self.shutdown.cancel();
}
/// Wait until the compactor finishes.
pub async fn join(&self) -> Result<(), Arc<JoinError>> {
self.worker.clone().await
}
}
impl Drop for Compactor {
fn drop(&mut self) {
if self.worker.clone().now_or_never().is_none() {
warn!("Compactor was not shut down properly");
}
}
}
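
The `SharedJoinHandle` alias above exists because a tokio `JoinHandle` can only be awaited once and `JoinError` is not `Clone`; boxing and sharing the future, and wrapping the error in `Arc`, lets both `join()` callers and the `Drop` check observe the same result. A minimal standalone sketch of the same pattern, assuming only the `tokio` and `futures` crates:

use std::sync::Arc;
use futures::{
    future::{BoxFuture, Shared},
    FutureExt, TryFutureExt,
};
use tokio::task::{JoinError, JoinHandle};

// Same alias as in the file above: a cloneable, multi-await join handle.
type SharedJoinHandle = Shared<BoxFuture<'static, Result<(), Arc<JoinError>>>>;

fn shared_handle(handle: JoinHandle<()>) -> SharedJoinHandle {
    handle.map_err(Arc::new).boxed().shared()
}

#[tokio::main]
async fn main() {
    let handle = shared_handle(tokio::spawn(async {}));
    // Both clones resolve to the same result; a bare JoinHandle
    // would allow only a single await.
    handle.clone().await.unwrap();
    handle.await.unwrap();
}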


@@ -1,33 +0,0 @@
use std::fmt::Display;
use super::{ChangedFilesFilter, SavedParquetFileState};
use async_trait::async_trait;
use observability_deps::tracing::info;
#[derive(Debug, Default, Copy, Clone)]
pub struct LoggingChangedFiles {}
impl LoggingChangedFiles {
pub fn new() -> Self {
Self {}
}
}
impl Display for LoggingChangedFiles {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "logging_changed_files")
}
}
#[async_trait]
impl ChangedFilesFilter for LoggingChangedFiles {
async fn apply(&self, old: &SavedParquetFileState, new: &SavedParquetFileState) -> bool {
if old.existing_files_modified(new) {
let modified_ids_and_levels = old.modified_ids_and_levels(new);
info!(?modified_ids_and_levels, "Concurrent modification detected");
}
false // we're ignoring the return value anyway for the moment
}
}


@@ -1,213 +0,0 @@
use std::{
collections::HashSet,
fmt::{Debug, Display},
};
use async_trait::async_trait;
use data_types::{CompactionLevel, ParquetFile, ParquetFileId};
pub mod logging;
/// Returns `true` if the files in the saved state have been changed according to the current state.
#[async_trait]
pub trait ChangedFilesFilter: Debug + Display + Send + Sync {
/// Return `true` if some other process modified the files in `old` such that they don't appear or appear with a
/// different compaction level than `new`, and thus we should stop compacting.
async fn apply(&self, old: &SavedParquetFileState, new: &SavedParquetFileState) -> bool;
}
/// Saved snapshot of a partition's Parquet files' IDs and compaction levels. Save this state at the beginning of a
/// compaction operation, then just before committing ask for the catalog state again. If the ID+compaction level pairs
/// in the initial saved state still appear in the latest catalog state (disregarding any new files that may appear in
/// the latest catalog state) we assume no other compactor instance has compacted the relevant files and this compactor
/// instance should commit its work. If any old ID+compaction level pairs are missing from the latest catalog state
/// (and thus show up in a set difference operation of `old - current`), throw away the work and do not commit as the
/// relevant Parquet files have been changed by some other process while this compactor instance was working.
#[derive(Debug, Clone)]
pub struct SavedParquetFileState {
ids_and_levels: HashSet<(ParquetFileId, CompactionLevel)>,
}
impl<'a, T> From<T> for SavedParquetFileState
where
T: IntoIterator<Item = &'a ParquetFile>,
{
fn from(parquet_files: T) -> Self {
let ids_and_levels = parquet_files
.into_iter()
.map(|pf| (pf.id, pf.compaction_level))
.collect();
Self { ids_and_levels }
}
}
impl SavedParquetFileState {
fn missing<'a>(
&'a self,
new: &'a Self,
) -> impl Iterator<Item = &'a (ParquetFileId, CompactionLevel)> {
let old = self;
old.ids_and_levels.difference(&new.ids_and_levels)
}
pub fn existing_files_modified(&self, new: &Self) -> bool {
let mut missing = self.missing(new);
// If there are any `(ParquetFileId, CompactionLevel)` pairs in `self` that are not present in `new`, that
// means some files were marked to delete or had their compaction level changed by some other process.
missing.next().is_some()
}
pub fn modified_ids_and_levels(&self, new: &Self) -> Vec<(ParquetFileId, CompactionLevel)> {
self.missing(new).cloned().collect()
}
}
#[cfg(test)]
mod tests {
use super::*;
use iox_tests::ParquetFileBuilder;
#[test]
fn saved_state_sorts_by_parquet_file_id() {
let pf_id1_level_0 = ParquetFileBuilder::new(1)
.with_compaction_level(CompactionLevel::Initial)
.build();
let pf_id2_level_2 = ParquetFileBuilder::new(2)
.with_compaction_level(CompactionLevel::Final)
.build();
let pf_id3_level_1 = ParquetFileBuilder::new(3)
.with_compaction_level(CompactionLevel::FileNonOverlapped)
.build();
let saved_state_1 =
SavedParquetFileState::from([&pf_id1_level_0, &pf_id2_level_2, &pf_id3_level_1]);
let saved_state_2 =
SavedParquetFileState::from([&pf_id3_level_1, &pf_id1_level_0, &pf_id2_level_2]);
assert!(!saved_state_1.existing_files_modified(&saved_state_2));
assert!(saved_state_1
.modified_ids_and_levels(&saved_state_2)
.is_empty());
}
#[test]
fn both_empty_parquet_files() {
let saved_state_1 = SavedParquetFileState::from([]);
let saved_state_2 = SavedParquetFileState::from([]);
assert!(!saved_state_1.existing_files_modified(&saved_state_2));
assert!(saved_state_1
.modified_ids_and_levels(&saved_state_2)
.is_empty());
}
#[test]
fn missing_files_indicates_modifications() {
let pf_id1_level_0 = ParquetFileBuilder::new(1)
.with_compaction_level(CompactionLevel::Initial)
.build();
let saved_state_1 = SavedParquetFileState::from([&pf_id1_level_0]);
let saved_state_2 = SavedParquetFileState::from([]);
assert!(saved_state_1.existing_files_modified(&saved_state_2));
assert_eq!(
saved_state_1.modified_ids_and_levels(&saved_state_2),
&[(ParquetFileId::new(1), CompactionLevel::Initial)]
);
}
#[test]
fn disregard_new_files() {
let pf_id1_level_0 = ParquetFileBuilder::new(1)
.with_compaction_level(CompactionLevel::Initial)
.build();
// New files of any level don't affect whether the old saved state is considered modified
let pf_id2_level_2 = ParquetFileBuilder::new(2)
.with_compaction_level(CompactionLevel::Final)
.build();
let pf_id3_level_1 = ParquetFileBuilder::new(3)
.with_compaction_level(CompactionLevel::FileNonOverlapped)
.build();
let pf_id4_level_0 = ParquetFileBuilder::new(4)
.with_compaction_level(CompactionLevel::Initial)
.build();
let saved_state_1 = SavedParquetFileState::from([&pf_id1_level_0]);
let saved_state_2 = SavedParquetFileState::from([&pf_id1_level_0, &pf_id2_level_2]);
assert!(!saved_state_1.existing_files_modified(&saved_state_2));
assert!(saved_state_1
.modified_ids_and_levels(&saved_state_2)
.is_empty());
let saved_state_2 = SavedParquetFileState::from([&pf_id1_level_0, &pf_id3_level_1]);
assert!(!saved_state_1.existing_files_modified(&saved_state_2));
assert!(saved_state_1
.modified_ids_and_levels(&saved_state_2)
.is_empty());
let saved_state_2 = SavedParquetFileState::from([&pf_id1_level_0, &pf_id4_level_0]);
assert!(!saved_state_1.existing_files_modified(&saved_state_2));
assert!(saved_state_1
.modified_ids_and_levels(&saved_state_2)
.is_empty());
let saved_state_2 = SavedParquetFileState::from([
&pf_id1_level_0,
&pf_id2_level_2,
&pf_id4_level_0,
&pf_id4_level_0,
]);
assert!(!saved_state_1.existing_files_modified(&saved_state_2));
assert!(saved_state_1
.modified_ids_and_levels(&saved_state_2)
.is_empty());
}
#[test]
fn changed_compaction_level_indicates_modification() {
let pf_id1_level_0 = ParquetFileBuilder::new(1)
.with_compaction_level(CompactionLevel::Initial)
.build();
let pf_id1_level_1 = ParquetFileBuilder::new(1)
.with_compaction_level(CompactionLevel::FileNonOverlapped)
.build();
let pf_id2_level_2 = ParquetFileBuilder::new(2)
.with_compaction_level(CompactionLevel::Final)
.build();
let saved_state_1 = SavedParquetFileState::from([&pf_id1_level_0, &pf_id2_level_2]);
let saved_state_2 = SavedParquetFileState::from([&pf_id1_level_1, &pf_id2_level_2]);
assert!(saved_state_1.existing_files_modified(&saved_state_2));
assert_eq!(
saved_state_1.modified_ids_and_levels(&saved_state_2),
&[(ParquetFileId::new(1), CompactionLevel::Initial)]
);
}
#[test]
fn same_number_of_files_different_ids_indicates_modification() {
let pf_id1_level_0 = ParquetFileBuilder::new(1)
.with_compaction_level(CompactionLevel::Initial)
.build();
let pf_id2_level_0 = ParquetFileBuilder::new(2)
.with_compaction_level(CompactionLevel::Initial)
.build();
let pf_id3_level_2 = ParquetFileBuilder::new(3)
.with_compaction_level(CompactionLevel::Final)
.build();
let saved_state_1 = SavedParquetFileState::from([&pf_id1_level_0, &pf_id3_level_2]);
let saved_state_2 = SavedParquetFileState::from([&pf_id2_level_0, &pf_id3_level_2]);
assert!(saved_state_1.existing_files_modified(&saved_state_2));
assert_eq!(
saved_state_1.modified_ids_and_levels(&saved_state_2),
&[(ParquetFileId::new(1), CompactionLevel::Initial)]
);
}
}
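
The modification check above boils down to a set difference over `(id, level)` pairs: `old - new` is non-empty exactly when a saved file vanished or changed compaction level, while files that only appear in `new` are ignored. A self-contained sketch, with plain integers standing in for `ParquetFileId` and `CompactionLevel` (hypothetical stand-ins):

use std::collections::HashSet;

// Illustrative stand-ins for ParquetFileId and CompactionLevel.
type FileId = i64;
type Level = u8;

// `old - new` is non-empty iff some saved file vanished or changed level.
fn existing_files_modified(old: &HashSet<(FileId, Level)>, new: &HashSet<(FileId, Level)>) -> bool {
    old.difference(new).next().is_some()
}

fn main() {
    let old: HashSet<_> = [(1, 0), (2, 2)].into_iter().collect();
    // Another compactor promoted file 1 from level 0 to level 1 and added
    // file 3; the new file is disregarded, the changed level is detected.
    let new: HashSet<_> = [(1, 1), (2, 2), (3, 0)].into_iter().collect();
    assert!(existing_files_modified(&old, &new));
}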


@@ -1,46 +0,0 @@
use std::{fmt::Display, sync::Arc};
use async_trait::async_trait;
use backoff::{Backoff, BackoffConfig};
use data_types::{Column, TableId};
use iox_catalog::interface::Catalog;
use super::ColumnsSource;
#[derive(Debug)]
pub struct CatalogColumnsSource {
backoff_config: BackoffConfig,
catalog: Arc<dyn Catalog>,
}
impl CatalogColumnsSource {
pub fn new(backoff_config: BackoffConfig, catalog: Arc<dyn Catalog>) -> Self {
Self {
backoff_config,
catalog,
}
}
}
impl Display for CatalogColumnsSource {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "catalog")
}
}
#[async_trait]
impl ColumnsSource for CatalogColumnsSource {
async fn fetch(&self, table: TableId) -> Vec<Column> {
Backoff::new(&self.backoff_config)
.retry_all_errors("table_of_given_table_id", || async {
self.catalog
.repositories()
.await
.columns()
.list_by_table_id(table)
.await
})
.await
.expect("retry forever")
}
}
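
Because `retry_all_errors` keeps retrying until the catalog call succeeds, the `expect("retry forever")` above can never fire. A minimal sketch of that retry-forever shape using plain `tokio` in place of the internal `backoff` crate (the helper name and delay values are illustrative):

use std::time::Duration;

// Retry the operation with capped exponential backoff; the returned
// future resolves only on success, mirroring retry_all_errors above.
async fn retry_forever<T, E, F, Fut>(mut op: F) -> T
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T, E>>,
{
    let mut delay = Duration::from_millis(10);
    loop {
        match op().await {
            Ok(v) => return v,
            Err(_) => {
                tokio::time::sleep(delay).await;
                delay = (delay * 2).min(Duration::from_secs(1));
            }
        }
    }
}

#[tokio::main]
async fn main() {
    let mut attempts = 0;
    let v = retry_forever(|| {
        attempts += 1;
        let ok = attempts >= 3; // succeed on the third try
        async move { if ok { Ok::<_, ()>(42) } else { Err(()) } }
    })
    .await;
    assert_eq!(v, 42);
}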


@@ -1,71 +0,0 @@
use std::{collections::HashMap, fmt::Display};
use async_trait::async_trait;
use data_types::{Column, TableId};
use super::ColumnsSource;
#[derive(Debug)]
pub struct MockColumnsSource {
tables: HashMap<TableId, Vec<Column>>,
}
impl MockColumnsSource {
#[allow(dead_code)] // not used anywhere
pub fn new(tables: HashMap<TableId, Vec<Column>>) -> Self {
Self { tables }
}
}
impl Display for MockColumnsSource {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "mock")
}
}
#[async_trait]
impl ColumnsSource for MockColumnsSource {
async fn fetch(&self, table: TableId) -> Vec<Column> {
self.tables.get(&table).cloned().unwrap_or_default()
}
}
#[cfg(test)]
mod tests {
use data_types::ColumnType;
use iox_tests::{ColumnBuilder, TableBuilder};
use super::*;
#[test]
fn test_display() {
assert_eq!(
MockColumnsSource::new(HashMap::default()).to_string(),
"mock",
)
}
#[tokio::test]
async fn test_fetch() {
// t1 has one column and t2 has no columns
let t1 = TableBuilder::new(1).with_name("table1").build();
let t1_c1 = ColumnBuilder::new(1, t1.id.get())
.with_name("time")
.with_column_type(ColumnType::Time)
.build();
let t2 = TableBuilder::new(2).with_name("table2").build();
let tables = HashMap::from([(t1.id, vec![t1_c1.clone()]), (t2.id, vec![])]);
let source = MockColumnsSource::new(tables);
// different tables
assert_eq!(source.fetch(t1.id).await, vec![t1_c1.clone()],);
assert_eq!(source.fetch(t2.id).await, vec![]);
// fetching does not drain
assert_eq!(source.fetch(t1.id).await, vec![t1_c1],);
// unknown table => empty result
assert_eq!(source.fetch(TableId::new(3)).await, vec![]);
}
}


@@ -1,15 +0,0 @@
use std::fmt::{Debug, Display};
use async_trait::async_trait;
use data_types::{Column, TableId};
pub mod catalog;
pub mod mock;
#[async_trait]
pub trait ColumnsSource: Debug + Display + Send + Sync {
/// Get Columns for a given table
///
/// This method performs retries.
async fn fetch(&self, table: TableId) -> Vec<Column>;
}


@@ -1,51 +0,0 @@
use std::sync::Arc;
use compactor_scheduler::{
CommitUpdate, CompactionJob, CompactionJobStatus, CompactionJobStatusResponse,
CompactionJobStatusVariant, Scheduler,
};
use data_types::{CompactionLevel, ParquetFile, ParquetFileId, ParquetFileParams};
#[derive(Debug)]
pub struct CommitToScheduler {
scheduler: Arc<dyn Scheduler>,
}
impl CommitToScheduler {
pub fn new(scheduler: Arc<dyn Scheduler>) -> Self {
Self { scheduler }
}
pub async fn commit(
&self,
job: CompactionJob,
delete: &[ParquetFile],
upgrade: &[ParquetFile],
create: &[ParquetFileParams],
target_level: CompactionLevel,
) -> Result<Vec<ParquetFileId>, crate::DynError> {
match self
.scheduler
.update_job_status(CompactionJobStatus {
job: job.clone(),
status: CompactionJobStatusVariant::Update(CommitUpdate::new(
job.partition_id,
delete.into(),
upgrade.into(),
create.into(),
target_level,
)),
})
.await?
{
CompactionJobStatusResponse::CreatedParquetFiles(ids) => Ok(ids),
CompactionJobStatusResponse::Ack => unreachable!("scheduler should not ack"),
}
}
}
impl std::fmt::Display for CommitToScheduler {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "CommitToScheduler")
}
}


@@ -1,161 +0,0 @@
use std::{collections::HashSet, fmt::Display, sync::Arc};
use async_trait::async_trait;
use compactor_scheduler::{
CompactionJob, CompactionJobStatus, CompactionJobStatusResponse, CompactionJobStatusVariant,
ErrorKind as SchedulerErrorKind, Scheduler,
};
use crate::error::{DynError, ErrorKind, ErrorKindExt};
use super::CompactionJobDoneSink;
#[derive(Debug)]
pub struct ErrorKindCompactionJobDoneSinkWrapper<T>
where
T: CompactionJobDoneSink,
{
kind: HashSet<ErrorKind>,
inner: T,
scheduler: Arc<dyn Scheduler>,
}
impl<T> ErrorKindCompactionJobDoneSinkWrapper<T>
where
T: CompactionJobDoneSink,
{
pub fn new(inner: T, kind: HashSet<ErrorKind>, scheduler: Arc<dyn Scheduler>) -> Self {
Self {
kind,
inner,
scheduler,
}
}
}
impl<T> Display for ErrorKindCompactionJobDoneSinkWrapper<T>
where
T: CompactionJobDoneSink,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let mut kinds = self.kind.iter().copied().collect::<Vec<_>>();
kinds.sort();
write!(f, "kind({:?}, {})", kinds, self.inner)
}
}
#[async_trait]
impl<T> CompactionJobDoneSink for ErrorKindCompactionJobDoneSinkWrapper<T>
where
T: CompactionJobDoneSink,
{
async fn record(&self, job: CompactionJob, res: Result<(), DynError>) -> Result<(), DynError> {
match res {
Ok(()) => self.inner.record(job, Ok(())).await,
Err(e) if self.kind.contains(&e.classify()) => {
let scheduler_error = match SchedulerErrorKind::from(e.classify()) {
SchedulerErrorKind::OutOfMemory => SchedulerErrorKind::OutOfMemory,
SchedulerErrorKind::ObjectStore => SchedulerErrorKind::ObjectStore,
SchedulerErrorKind::Timeout => SchedulerErrorKind::Timeout,
SchedulerErrorKind::Unknown(_) => SchedulerErrorKind::Unknown(e.to_string()),
};
match self
.scheduler
.update_job_status(CompactionJobStatus {
job: job.clone(),
status: CompactionJobStatusVariant::Error(scheduler_error),
})
.await?
{
CompactionJobStatusResponse::Ack => {}
CompactionJobStatusResponse::CreatedParquetFiles(_) => {
unreachable!("scheduler should not created parquet files")
}
}
self.inner.record(job, Err(e)).await
}
Err(e) => {
// contract of this abstraction,
// where we do not pass to `self.inner` if not in `self.kind`
Err(e)
}
}
}
}
#[cfg(test)]
mod tests {
use std::{collections::HashMap, sync::Arc};
use compactor_scheduler::create_test_scheduler;
use data_types::PartitionId;
use datafusion::error::DataFusionError;
use iox_tests::TestCatalog;
use iox_time::{MockProvider, Time};
use object_store::Error as ObjectStoreError;
use super::{super::mock::MockCompactionJobDoneSink, *};
#[test]
fn test_display() {
let sink = ErrorKindCompactionJobDoneSinkWrapper::new(
MockCompactionJobDoneSink::new(),
HashSet::from([ErrorKind::ObjectStore, ErrorKind::OutOfMemory]),
create_test_scheduler(
TestCatalog::new().catalog(),
Arc::new(MockProvider::new(Time::MIN)),
None,
),
);
assert_eq!(sink.to_string(), "kind([ObjectStore, OutOfMemory], mock)");
}
#[tokio::test]
async fn test_record() {
let inner = Arc::new(MockCompactionJobDoneSink::new());
let sink = ErrorKindCompactionJobDoneSinkWrapper::new(
Arc::clone(&inner),
HashSet::from([ErrorKind::ObjectStore, ErrorKind::OutOfMemory]),
create_test_scheduler(
TestCatalog::new().catalog(),
Arc::new(MockProvider::new(Time::MIN)),
None,
),
);
let cj_1 = CompactionJob::new(PartitionId::new(1));
let cj_2 = CompactionJob::new(PartitionId::new(2));
let cj_3 = CompactionJob::new(PartitionId::new(3));
let cj_4 = CompactionJob::new(PartitionId::new(4));
sink.record(
cj_1.clone(),
Err(Box::new(ObjectStoreError::NotImplemented)),
)
.await
.expect("record failed");
sink.record(
cj_2.clone(),
Err(Box::new(DataFusionError::ResourcesExhausted(String::from(
"foo",
)))),
)
.await
.expect("record failed");
sink.record(cj_3, Err("foo".into())).await.unwrap_err();
sink.record(cj_4.clone(), Ok(()))
.await
.expect("record failed");
assert_eq!(
inner.results(),
HashMap::from([
(cj_1, Err(String::from("Operation not yet implemented.")),),
(cj_2, Err(String::from("Resources exhausted: foo")),),
(cj_4, Ok(()),),
]),
);
}
}


@@ -1,126 +0,0 @@
use std::fmt::Display;
use async_trait::async_trait;
use compactor_scheduler::CompactionJob;
use observability_deps::tracing::{error, info};
use crate::error::{DynError, ErrorKindExt};
use super::CompactionJobDoneSink;
#[derive(Debug)]
pub struct LoggingCompactionJobDoneSinkWrapper<T>
where
T: CompactionJobDoneSink,
{
inner: T,
}
impl<T> LoggingCompactionJobDoneSinkWrapper<T>
where
T: CompactionJobDoneSink,
{
pub fn new(inner: T) -> Self {
Self { inner }
}
}
impl<T> Display for LoggingCompactionJobDoneSinkWrapper<T>
where
T: CompactionJobDoneSink,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "logging({})", self.inner)
}
}
#[async_trait]
impl<T> CompactionJobDoneSink for LoggingCompactionJobDoneSinkWrapper<T>
where
T: CompactionJobDoneSink,
{
async fn record(&self, job: CompactionJob, res: Result<(), DynError>) -> Result<(), DynError> {
match &res {
Ok(()) => {
info!(
partition_id = job.partition_id.get(),
job_uuid = job.uuid().to_string(),
"Finished compaction job",
);
}
Err(e) => {
// log compactor errors, classified by compactor ErrorKind
error!(
%e,
kind=e.classify().name(),
partition_id = job.partition_id.get(),
job_uuid = job.uuid().to_string(),
"Error while compacting partition",
);
}
}
self.inner.record(job, res).await
}
}
#[cfg(test)]
mod tests {
use std::{collections::HashMap, sync::Arc};
use data_types::PartitionId;
use object_store::Error as ObjectStoreError;
use test_helpers::tracing::TracingCapture;
use super::{super::mock::MockCompactionJobDoneSink, *};
#[test]
fn test_display() {
let sink = LoggingCompactionJobDoneSinkWrapper::new(MockCompactionJobDoneSink::new());
assert_eq!(sink.to_string(), "logging(mock)");
}
#[tokio::test]
async fn test_record() {
let inner = Arc::new(MockCompactionJobDoneSink::new());
let sink = LoggingCompactionJobDoneSinkWrapper::new(Arc::clone(&inner));
let capture = TracingCapture::new();
let cj_1 = CompactionJob::new(PartitionId::new(1));
let cj_2 = CompactionJob::new(PartitionId::new(2));
let cj_3 = CompactionJob::new(PartitionId::new(3));
sink.record(cj_1.clone(), Err("msg 1".into()))
.await
.expect("record failed");
sink.record(cj_2.clone(), Err("msg 2".into()))
.await
.expect("record failed");
sink.record(
cj_1.clone(),
Err(Box::new(ObjectStoreError::NotImplemented)),
)
.await
.expect("record failed");
sink.record(cj_3.clone(), Ok(()))
.await
.expect("record failed");
assert_eq!(
capture.to_string(),
format!("level = ERROR; message = Error while compacting partition; e = msg 1; kind = \"unknown\"; partition_id = 1; job_uuid = {:?}; \n\
level = ERROR; message = Error while compacting partition; e = msg 2; kind = \"unknown\"; partition_id = 2; job_uuid = {:?}; \n\
level = ERROR; message = Error while compacting partition; e = Operation not yet implemented.; kind = \"object_store\"; partition_id = 1; job_uuid = {:?}; \n\
level = INFO; message = Finished compaction job; partition_id = 3; job_uuid = {:?}; ", cj_1.uuid().to_string(), cj_2.uuid().to_string(), cj_1.uuid().to_string(), cj_3.uuid().to_string()),
);
assert_eq!(
inner.results(),
HashMap::from([
(cj_1, Err(String::from("Operation not yet implemented.")),),
(cj_2, Err(String::from("msg 2"))),
(cj_3, Ok(())),
]),
);
}
}


@@ -1,164 +0,0 @@
use std::{collections::HashMap, fmt::Display};
use async_trait::async_trait;
use compactor_scheduler::CompactionJob;
use metric::{Registry, U64Counter};
use crate::error::{DynError, ErrorKind, ErrorKindExt};
use super::CompactionJobDoneSink;
const METRIC_NAME_PARTITION_COMPLETE_COUNT: &str = "iox_compactor_partition_complete_count";
#[derive(Debug)]
pub struct MetricsCompactionJobDoneSinkWrapper<T>
where
T: CompactionJobDoneSink,
{
ok_counter: U64Counter,
error_counter: HashMap<ErrorKind, U64Counter>,
inner: T,
}
impl<T> MetricsCompactionJobDoneSinkWrapper<T>
where
T: CompactionJobDoneSink,
{
pub fn new(inner: T, registry: &Registry) -> Self {
let metric = registry.register_metric::<U64Counter>(
METRIC_NAME_PARTITION_COMPLETE_COUNT,
"Number of completed partitions",
);
let ok_counter = metric.recorder(&[("result", "ok")]);
let error_counter = ErrorKind::variants()
.iter()
.map(|kind| {
(
*kind,
metric.recorder(&[("result", "error"), ("kind", kind.name())]),
)
})
.collect();
Self {
ok_counter,
error_counter,
inner,
}
}
}
impl<T> Display for MetricsCompactionJobDoneSinkWrapper<T>
where
T: CompactionJobDoneSink,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "metrics({})", self.inner)
}
}
#[async_trait]
impl<T> CompactionJobDoneSink for MetricsCompactionJobDoneSinkWrapper<T>
where
T: CompactionJobDoneSink,
{
async fn record(&self, job: CompactionJob, res: Result<(), DynError>) -> Result<(), DynError> {
match &res {
Ok(()) => {
self.ok_counter.inc(1);
}
Err(e) => {
// classify and track counts of compactor ErrorKind
let kind = e.classify();
self.error_counter
.get(&kind)
.expect("all kinds constructed")
.inc(1);
}
}
self.inner.record(job, res).await
}
}
#[cfg(test)]
mod tests {
use std::{collections::HashMap, sync::Arc};
use data_types::PartitionId;
use metric::{assert_counter, Attributes};
use object_store::Error as ObjectStoreError;
use super::{super::mock::MockCompactionJobDoneSink, *};
#[test]
fn test_display() {
let registry = Registry::new();
let sink =
MetricsCompactionJobDoneSinkWrapper::new(MockCompactionJobDoneSink::new(), &registry);
assert_eq!(sink.to_string(), "metrics(mock)");
}
#[tokio::test]
async fn test_record() {
let registry = Registry::new();
let inner = Arc::new(MockCompactionJobDoneSink::new());
let sink = MetricsCompactionJobDoneSinkWrapper::new(Arc::clone(&inner), &registry);
assert_ok_counter(&registry, 0);
assert_error_counter(&registry, "unknown", 0);
assert_error_counter(&registry, "object_store", 0);
let cj_1 = CompactionJob::new(PartitionId::new(1));
let cj_2 = CompactionJob::new(PartitionId::new(2));
let cj_3 = CompactionJob::new(PartitionId::new(3));
sink.record(cj_1.clone(), Err("msg 1".into()))
.await
.expect("record failed");
sink.record(cj_2.clone(), Err("msg 2".into()))
.await
.expect("record failed");
sink.record(
cj_1.clone(),
Err(Box::new(ObjectStoreError::NotImplemented)),
)
.await
.expect("record failed");
sink.record(cj_3.clone(), Ok(()))
.await
.expect("record failed");
assert_ok_counter(&registry, 1);
assert_error_counter(&registry, "unknown", 2);
assert_error_counter(&registry, "object_store", 1);
assert_eq!(
inner.results(),
HashMap::from([
(cj_1, Err(String::from("Operation not yet implemented.")),),
(cj_2, Err(String::from("msg 2"))),
(cj_3, Ok(())),
]),
);
}
fn assert_ok_counter(registry: &Registry, value: u64) {
assert_counter!(
registry,
U64Counter,
METRIC_NAME_PARTITION_COMPLETE_COUNT,
labels = Attributes::from(&[("result", "ok")]),
value = value,
);
}
fn assert_error_counter(registry: &Registry, kind: &'static str, value: u64) {
assert_counter!(
registry,
U64Counter,
METRIC_NAME_PARTITION_COMPLETE_COUNT,
labels = Attributes::from(&[("result", "error"), ("kind", kind)]),
value = value,
);
}
}


@@ -1,88 +0,0 @@
use std::{collections::HashMap, fmt::Display, sync::Mutex};
use async_trait::async_trait;
use compactor_scheduler::CompactionJob;
use super::{CompactionJobDoneSink, DynError};
/// Mock for [`CompactionJobDoneSink`].
#[derive(Debug, Default)]
pub struct MockCompactionJobDoneSink {
last: Mutex<HashMap<CompactionJob, Result<(), String>>>,
}
impl MockCompactionJobDoneSink {
/// Create new mock.
#[allow(dead_code)] // used for testing
pub fn new() -> Self {
Self::default()
}
/// Get the last recorded results.
#[allow(dead_code)] // used for testing
pub fn results(&self) -> HashMap<CompactionJob, Result<(), String>> {
self.last.lock().expect("not poisoned").clone()
}
}
impl Display for MockCompactionJobDoneSink {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "mock")
}
}
#[async_trait]
impl CompactionJobDoneSink for MockCompactionJobDoneSink {
async fn record(&self, job: CompactionJob, res: Result<(), DynError>) -> Result<(), DynError> {
self.last
.lock()
.expect("not poisoned")
.insert(job, res.map_err(|e| e.to_string()));
Ok(())
}
}
#[cfg(test)]
mod tests {
use data_types::PartitionId;
use super::*;
#[test]
fn test_display() {
assert_eq!(MockCompactionJobDoneSink::new().to_string(), "mock",);
}
#[tokio::test]
async fn test_record() {
let sink = MockCompactionJobDoneSink::new();
assert_eq!(sink.results(), HashMap::default(),);
let cj_1 = CompactionJob::new(PartitionId::new(1));
let cj_2 = CompactionJob::new(PartitionId::new(2));
let cj_3 = CompactionJob::new(PartitionId::new(3));
sink.record(cj_1.clone(), Err("msg 1".into()))
.await
.expect("record failed");
sink.record(cj_2.clone(), Err("msg 2".into()))
.await
.expect("record failed");
sink.record(cj_1.clone(), Err("msg 3".into()))
.await
.expect("record failed");
sink.record(cj_3.clone(), Ok(()))
.await
.expect("record failed");
assert_eq!(
sink.results(),
HashMap::from([
(cj_1, Err(String::from("msg 3"))),
(cj_2, Err(String::from("msg 2"))),
(cj_3, Ok(())),
]),
);
}
}


@@ -1,34 +0,0 @@
use std::{
fmt::{Debug, Display},
sync::Arc,
};
use async_trait::async_trait;
use compactor_scheduler::CompactionJob;
use crate::DynError;
pub mod error_kind;
pub mod logging;
pub mod metrics;
pub mod mock;
pub mod outcome;
/// Records "compaction job is done" status for given partition.
#[async_trait]
pub trait CompactionJobDoneSink: Debug + Display + Send + Sync {
/// Record "compaction job is done" status for given partition.
///
/// This method should retry.
async fn record(&self, job: CompactionJob, res: Result<(), DynError>) -> Result<(), DynError>;
}
#[async_trait]
impl<T> CompactionJobDoneSink for Arc<T>
where
T: CompactionJobDoneSink + ?Sized,
{
async fn record(&self, job: CompactionJob, res: Result<(), DynError>) -> Result<(), DynError> {
self.as_ref().record(job, res).await
}
}


@@ -1,42 +0,0 @@
use std::{fmt::Display, sync::Arc};
use async_trait::async_trait;
use compactor_scheduler::{
CompactionJob, CompactionJobEnd, CompactionJobEndVariant, Scheduler, SkipReason,
};
use crate::DynError;
use super::CompactionJobDoneSink;
#[derive(Debug)]
pub struct CompactionJobDoneSinkToScheduler {
scheduler: Arc<dyn Scheduler>,
}
impl CompactionJobDoneSinkToScheduler {
pub fn new(scheduler: Arc<dyn Scheduler>) -> Self {
Self { scheduler }
}
}
impl Display for CompactionJobDoneSinkToScheduler {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "CompactionJobDoneSinkToScheduler")
}
}
#[async_trait]
impl CompactionJobDoneSink for CompactionJobDoneSinkToScheduler {
async fn record(&self, job: CompactionJob, res: Result<(), DynError>) -> Result<(), DynError> {
let end_action = CompactionJobEnd {
job,
end_action: match res {
Ok(_) => CompactionJobEndVariant::Complete,
Err(e) => CompactionJobEndVariant::RequestToSkip(SkipReason(e.to_string())),
},
};
self.scheduler.end_job(end_action).await
}
}


@@ -1,114 +0,0 @@
use std::{collections::VecDeque, fmt::Display, sync::Arc};
use compactor_scheduler::CompactionJob;
use futures::{stream::BoxStream, StreamExt};
use super::super::{
compaction_jobs_source::CompactionJobsSource, partition_files_source::rate_limit::RateLimit,
};
use super::CompactionJobStream;
#[derive(Debug)]
pub struct EndlessCompactionJobStream<T>
where
T: CompactionJobsSource,
{
source: Arc<T>,
limiter: RateLimit,
}
impl<T> EndlessCompactionJobStream<T>
where
T: CompactionJobsSource,
{
pub fn new(source: T) -> Self {
Self {
source: Arc::new(source),
limiter: RateLimit::new(1, 1), // Initial rate is irrelevant, it will be updated before first use.
}
}
}
impl<T> Display for EndlessCompactionJobStream<T>
where
T: CompactionJobsSource,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "endless({})", self.source)
}
}
impl<T> CompactionJobStream for EndlessCompactionJobStream<T>
where
T: CompactionJobsSource,
{
fn stream(&self) -> BoxStream<'_, CompactionJob> {
let source = Arc::clone(&self.source);
// Note: we use a VecDeque as a buffer so we can preserve the order and cheaply remove the first element without
// relocating the entire buffer content.
futures::stream::unfold(VecDeque::new(), move |mut buffer| {
let source = Arc::clone(&source);
async move {
loop {
while let Some(d) = self.limiter.can_proceed() {
// Throttling because either we don't need to go this fast, or we're at risk of hitting the catalog
// too hard, or both.
tokio::time::sleep(d).await;
}
if let Some(p_id) = buffer.pop_front() {
return Some((p_id, buffer));
}
// fetch new data
buffer = VecDeque::from(source.fetch().await);
// update rate limiter so we can complete the batch in 5m, which is plenty fast.
// allow a burst of 25, so after a period of inactivity, up to 25 can go quickly.
let mut rate = buffer.len() / (5 * 60);
if rate < 10 {
// The purpose of this rate limiter is to keep us from hitting the catalog too hard. There is no need to
// slow it down to less than 10/s
rate = 10;
}
self.limiter.update_rps(rate, 25);
}
}
})
.boxed()
}
}
#[cfg(test)]
mod tests {
use data_types::PartitionId;
use super::{super::super::compaction_jobs_source::mock::MockCompactionJobsSource, *};
#[test]
fn test_display() {
let stream = EndlessCompactionJobStream::new(MockCompactionJobsSource::new(vec![]));
assert_eq!(stream.to_string(), "endless(mock)");
}
#[tokio::test]
async fn test_stream() {
let ids = vec![
CompactionJob::new(PartitionId::new(1)),
CompactionJob::new(PartitionId::new(3)),
CompactionJob::new(PartitionId::new(2)),
];
let stream = EndlessCompactionJobStream::new(MockCompactionJobsSource::new(ids.clone()));
// stream is stateless
for _ in 0..2 {
// we need to limit the stream at one point because it is endless
assert_eq!(
stream.stream().take(5).collect::<Vec<_>>().await,
[&ids[..], &ids[..2]].concat(),
);
}
}
}
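
The throttle arithmetic in `stream()` above can be checked in isolation: the target is to drain a fetched batch within five minutes, but never throttle below 10 jobs per second. A small sketch of just that computation:

// Target rate so a fetched batch completes within 5 minutes, floored
// at 10/s; the floor keeps small batches from being slowed to a crawl.
fn target_rate_per_sec(batch_len: usize) -> usize {
    (batch_len / (5 * 60)).max(10)
}

fn main() {
    assert_eq!(target_rate_per_sec(0), 10); // empty batch: floor applies
    assert_eq!(target_rate_per_sec(1_000), 10); // 1000 / 300 = 3, floored to 10
    assert_eq!(target_rate_per_sec(6_000), 20); // 6000 / 300 = 20/s
}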


@@ -1,15 +0,0 @@
use std::fmt::{Debug, Display};
use compactor_scheduler::CompactionJob;
use futures::stream::BoxStream;
pub mod endless;
pub mod once;
/// Source for compaction jobs.
pub trait CompactionJobStream: Debug + Display + Send + Sync {
/// Create new source stream of compaction jobs.
///
/// This stream may be endless.
fn stream(&self) -> BoxStream<'_, CompactionJob>;
}


@@ -1,74 +0,0 @@
use std::{fmt::Display, sync::Arc};
use compactor_scheduler::CompactionJob;
use futures::{stream::BoxStream, StreamExt};
use super::{super::compaction_jobs_source::CompactionJobsSource, CompactionJobStream};
#[derive(Debug)]
pub struct OnceCompactionJobStream<T>
where
T: CompactionJobsSource,
{
source: Arc<T>,
}
impl<T> OnceCompactionJobStream<T>
where
T: CompactionJobsSource,
{
pub fn new(source: T) -> Self {
Self {
source: Arc::new(source),
}
}
}
impl<T> Display for OnceCompactionJobStream<T>
where
T: CompactionJobsSource,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "once({})", self.source)
}
}
impl<T> CompactionJobStream for OnceCompactionJobStream<T>
where
T: CompactionJobsSource,
{
fn stream(&self) -> BoxStream<'_, CompactionJob> {
let source = Arc::clone(&self.source);
futures::stream::once(async move { futures::stream::iter(source.fetch().await) })
.flatten()
.boxed()
}
}
#[cfg(test)]
mod tests {
use data_types::PartitionId;
use super::{super::super::compaction_jobs_source::mock::MockCompactionJobsSource, *};
#[test]
fn test_display() {
let stream = OnceCompactionJobStream::new(MockCompactionJobsSource::new(vec![]));
assert_eq!(stream.to_string(), "once(mock)");
}
#[tokio::test]
async fn test_stream() {
let ids = vec![
CompactionJob::new(PartitionId::new(1)),
CompactionJob::new(PartitionId::new(3)),
CompactionJob::new(PartitionId::new(2)),
];
let stream = OnceCompactionJobStream::new(MockCompactionJobsSource::new(ids.clone()));
// stream is stateless
for _ in 0..2 {
assert_eq!(stream.stream().collect::<Vec<_>>().await, ids,);
}
}
}
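
The body of `stream()` above is the usual once-then-flatten trick: wrap the single async fetch in a one-element stream, then flatten the fetched batch into individual items. A self-contained version with plain integers, assuming only `futures` and `tokio`:

use futures::StreamExt;

#[tokio::main]
async fn main() {
    // One async fetch, flattened into a stream of its items.
    let fetch = async { vec![1, 2, 3] };
    let items: Vec<i32> = futures::stream::once(async move {
        futures::stream::iter(fetch.await)
    })
    .flatten()
    .collect()
    .await;
    assert_eq!(items, vec![1, 2, 3]);
}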


@@ -1,92 +0,0 @@
use std::fmt::Display;
use async_trait::async_trait;
use compactor_scheduler::CompactionJob;
use observability_deps::tracing::info;
use super::CompactionJobsSource;
#[derive(Debug)]
pub struct LoggingCompactionJobsWrapper<T>
where
T: CompactionJobsSource,
{
inner: T,
}
impl<T> LoggingCompactionJobsWrapper<T>
where
T: CompactionJobsSource,
{
pub fn new(inner: T) -> Self {
Self { inner }
}
}
impl<T> Display for LoggingCompactionJobsWrapper<T>
where
T: CompactionJobsSource,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "logging({})", self.inner)
}
}
#[async_trait]
impl<T> CompactionJobsSource for LoggingCompactionJobsWrapper<T>
where
T: CompactionJobsSource,
{
async fn fetch(&self) -> Vec<CompactionJob> {
let jobs = self.inner.fetch().await;
info!(n_jobs = jobs.len(), "Fetch jobs",);
if jobs.is_empty() {
info!("No compaction job found");
}
jobs
}
}
#[cfg(test)]
mod tests {
use data_types::PartitionId;
use test_helpers::tracing::TracingCapture;
use super::{super::mock::MockCompactionJobsSource, *};
#[test]
fn test_display() {
let source = LoggingCompactionJobsWrapper::new(MockCompactionJobsSource::new(vec![]));
assert_eq!(source.to_string(), "logging(mock)",);
}
#[tokio::test]
async fn test_fetch_empty() {
let source = LoggingCompactionJobsWrapper::new(MockCompactionJobsSource::new(vec![]));
let capture = TracingCapture::new();
assert_eq!(source.fetch().await, vec![],);
// logs the normal log message (so it's easy to search for every single call) but also an extra message for the empty case
assert_eq!(
capture.to_string(),
"level = INFO; message = Fetch jobs; n_jobs = 0; \
\nlevel = INFO; message = No compaction job found; ",
);
}
#[tokio::test]
async fn test_fetch_some() {
let cj_1 = CompactionJob::new(PartitionId::new(5));
let cj_2 = CompactionJob::new(PartitionId::new(1));
let cj_3 = CompactionJob::new(PartitionId::new(12));
let jobs = vec![cj_1, cj_2, cj_3];
let source = LoggingCompactionJobsWrapper::new(MockCompactionJobsSource::new(jobs.clone()));
let capture = TracingCapture::new();
assert_eq!(source.fetch().await, jobs,);
// just the ordinary log message, no warning
assert_eq!(
capture.to_string(),
"level = INFO; message = Fetch jobs; n_jobs = 3; ",
);
}
}


@@ -1,126 +0,0 @@
use std::fmt::Display;
use async_trait::async_trait;
use compactor_scheduler::CompactionJob;
use metric::{Registry, U64Counter};
use super::CompactionJobsSource;
const METRIC_NAME_PARTITIONS_FETCH_COUNT: &str = "iox_compactor_partitions_fetch_count";
const METRIC_NAME_PARTITIONS_COUNT: &str = "iox_compactor_partitions_count";
#[derive(Debug)]
pub struct MetricsCompactionJobsSourceWrapper<T>
where
T: CompactionJobsSource,
{
partitions_fetch_counter: U64Counter,
partitions_counter: U64Counter,
inner: T,
}
impl<T> MetricsCompactionJobsSourceWrapper<T>
where
T: CompactionJobsSource,
{
pub fn new(inner: T, registry: &Registry) -> Self {
let partitions_fetch_counter = registry
.register_metric::<U64Counter>(
METRIC_NAME_PARTITIONS_FETCH_COUNT,
"Number of times the compactor fetched fresh partitions",
)
.recorder(&[]);
let partitions_counter = registry
.register_metric::<U64Counter>(
METRIC_NAME_PARTITIONS_COUNT,
"Number of partitions processed by the compactor. This contains the sum over ALL rounds (i.e. the same partition may be counted multiple times).",
)
.recorder(&[]);
Self {
partitions_fetch_counter,
partitions_counter,
inner,
}
}
}
impl<T> Display for MetricsCompactionJobsSourceWrapper<T>
where
T: CompactionJobsSource,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "metrics({})", self.inner)
}
}
#[async_trait]
impl<T> CompactionJobsSource for MetricsCompactionJobsSourceWrapper<T>
where
T: CompactionJobsSource,
{
async fn fetch(&self) -> Vec<CompactionJob> {
let jobs = self.inner.fetch().await;
self.partitions_fetch_counter.inc(1);
self.partitions_counter.inc(jobs.len() as u64);
jobs
}
}
#[cfg(test)]
mod tests {
use data_types::PartitionId;
use metric::assert_counter;
use super::{super::mock::MockCompactionJobsSource, *};
#[test]
fn test_display() {
let registry = Registry::new();
let source = MetricsCompactionJobsSourceWrapper::new(
MockCompactionJobsSource::new(vec![]),
&registry,
);
assert_eq!(source.to_string(), "metrics(mock)",);
}
#[tokio::test]
async fn test_fetch() {
let registry = Registry::new();
let partitions = vec![
CompactionJob::new(PartitionId::new(5)),
CompactionJob::new(PartitionId::new(1)),
CompactionJob::new(PartitionId::new(12)),
];
let source = MetricsCompactionJobsSourceWrapper::new(
MockCompactionJobsSource::new(partitions.clone()),
&registry,
);
assert_fetch_counter(&registry, 0);
assert_partition_counter(&registry, 0);
assert_eq!(source.fetch().await, partitions);
assert_fetch_counter(&registry, 1);
assert_partition_counter(&registry, 3);
}
fn assert_fetch_counter(registry: &Registry, value: u64) {
assert_counter!(
registry,
U64Counter,
METRIC_NAME_PARTITIONS_FETCH_COUNT,
value = value,
);
}
fn assert_partition_counter(registry: &Registry, value: u64) {
assert_counter!(
registry,
U64Counter,
METRIC_NAME_PARTITIONS_COUNT,
value = value,
);
}
}


@@ -1,65 +0,0 @@
use async_trait::async_trait;
use compactor_scheduler::CompactionJob;
use parking_lot::Mutex;
use super::CompactionJobsSource;
/// A mock structure for providing [compaction jobs](CompactionJob).
#[derive(Debug)]
pub struct MockCompactionJobsSource {
compaction_jobs: Mutex<Vec<CompactionJob>>,
}
impl MockCompactionJobsSource {
#[allow(dead_code)]
/// Create a new MockCompactionJobsSource.
pub fn new(jobs: Vec<CompactionJob>) -> Self {
Self {
compaction_jobs: Mutex::new(jobs),
}
}
/// Set CompactionJobs for MockCompactionJobsSource.
#[allow(dead_code)] // not used anywhere
pub fn set(&self, jobs: Vec<CompactionJob>) {
*self.compaction_jobs.lock() = jobs;
}
}
impl std::fmt::Display for MockCompactionJobsSource {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "mock")
}
}
#[async_trait]
impl CompactionJobsSource for MockCompactionJobsSource {
async fn fetch(&self) -> Vec<CompactionJob> {
self.compaction_jobs.lock().clone()
}
}
#[cfg(test)]
mod tests {
use data_types::PartitionId;
use super::*;
#[test]
fn test_display() {
assert_eq!(MockCompactionJobsSource::new(vec![]).to_string(), "mock",);
}
#[tokio::test]
async fn test_fetch() {
let source = MockCompactionJobsSource::new(vec![]);
assert_eq!(source.fetch().await, vec![],);
let cj_1 = CompactionJob::new(PartitionId::new(5));
let cj_2 = CompactionJob::new(PartitionId::new(1));
let cj_3 = CompactionJob::new(PartitionId::new(12));
let parts = vec![cj_1, cj_2, cj_3];
source.set(parts.clone());
assert_eq!(source.fetch().await, parts,);
}
}


@@ -1,38 +0,0 @@
//! Abstractions that provide functionality over a [`CompactionJobsSource`] of compaction jobs.
//!
//! These abstractions are for actions taken in a compactor using the CompactionJobs received from a compactor_scheduler.
pub mod logging;
pub mod metrics;
pub mod mock;
pub mod not_empty;
pub mod randomize_order;
pub mod scheduled;
use std::{
fmt::{Debug, Display},
sync::Arc,
};
use async_trait::async_trait;
use compactor_scheduler::CompactionJob;
/// A source of partitions, noted by [`CompactionJob`](compactor_scheduler::CompactionJob), that may potentially need compacting.
#[async_trait]
pub trait CompactionJobsSource: Debug + Display + Send + Sync {
/// Get compaction jobs. (For now, 1 job equals 1 partition ID).
///
/// This method performs retries.
///
/// This should only perform basic, efficient filtering. It MUST NOT inspect individual parquet files.
async fn fetch(&self) -> Vec<CompactionJob>;
}
#[async_trait]
impl<T> CompactionJobsSource for Arc<T>
where
T: CompactionJobsSource + ?Sized,
{
async fn fetch(&self) -> Vec<CompactionJob> {
self.as_ref().fetch().await
}
}
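
The blanket `impl<T> CompactionJobsSource for Arc<T>` above is what lets an `Arc<dyn CompactionJobsSource>` be used wherever a source is expected. The same idiom in miniature, with a hypothetical `Source` trait and the `async_trait` crate assumed:

use std::sync::Arc;
use async_trait::async_trait;

#[async_trait]
trait Source: Send + Sync {
    async fn fetch(&self) -> Vec<u64>;
}

struct Fixed(Vec<u64>);

#[async_trait]
impl Source for Fixed {
    async fn fetch(&self) -> Vec<u64> {
        self.0.clone()
    }
}

// Blanket impl: Arc<Fixed> and Arc<dyn Source> now implement Source too.
#[async_trait]
impl<T> Source for Arc<T>
where
    T: Source + ?Sized,
{
    async fn fetch(&self) -> Vec<u64> {
        self.as_ref().fetch().await
    }
}

// Generic consumer that only knows about the trait.
async fn drain(s: impl Source) -> Vec<u64> {
    s.fetch().await
}

#[tokio::main]
async fn main() {
    let src: Arc<dyn Source> = Arc::new(Fixed(vec![1, 2]));
    assert_eq!(drain(src).await, vec![1, 2]);
}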


@@ -1,113 +0,0 @@
use std::{fmt::Display, sync::Arc, time::Duration};
use async_trait::async_trait;
use compactor_scheduler::CompactionJob;
use iox_time::TimeProvider;
use super::CompactionJobsSource;
#[derive(Debug)]
pub struct NotEmptyCompactionJobsSourceWrapper<T>
where
T: CompactionJobsSource,
{
inner: T,
throttle: Duration,
time_provider: Arc<dyn TimeProvider>,
}
impl<T> NotEmptyCompactionJobsSourceWrapper<T>
where
T: CompactionJobsSource,
{
pub fn new(inner: T, throttle: Duration, time_provider: Arc<dyn TimeProvider>) -> Self {
Self {
inner,
throttle,
time_provider,
}
}
}
impl<T> Display for NotEmptyCompactionJobsSourceWrapper<T>
where
T: CompactionJobsSource,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "not_empty({})", self.inner)
}
}
#[async_trait]
impl<T> CompactionJobsSource for NotEmptyCompactionJobsSourceWrapper<T>
where
T: CompactionJobsSource,
{
async fn fetch(&self) -> Vec<CompactionJob> {
loop {
let res = self.inner.fetch().await;
if !res.is_empty() {
return res;
}
self.time_provider.sleep(self.throttle).await;
}
}
}
#[cfg(test)]
mod tests {
use compactor_test_utils::AssertFutureExt;
use data_types::PartitionId;
use iox_time::{MockProvider, Time};
use super::{super::mock::MockCompactionJobsSource, *};
#[test]
fn test_display() {
let source = NotEmptyCompactionJobsSourceWrapper::new(
MockCompactionJobsSource::new(vec![]),
Duration::from_secs(1),
Arc::new(MockProvider::new(Time::MIN)),
);
assert_eq!(source.to_string(), "not_empty(mock)",);
}
#[tokio::test]
async fn test_fetch() {
let inner = Arc::new(MockCompactionJobsSource::new(vec![]));
let time_provider = Arc::new(MockProvider::new(Time::MIN));
let source = NotEmptyCompactionJobsSourceWrapper::new(
Arc::clone(&inner),
Duration::from_secs(1),
Arc::clone(&time_provider) as _,
);
// initially pending because there is no data
let mut fut = source.fetch();
fut.assert_pending().await;
// still no data, still pending
time_provider.inc(Duration::from_secs(10));
fut.assert_pending().await;
// insert data but system is still throttled
let p = CompactionJob::new(PartitionId::new(5));
let parts = vec![p];
inner.set(parts.clone());
fut.assert_pending().await;
// still throttled
time_provider.inc(Duration::from_millis(500));
fut.assert_pending().await;
// finally a result
time_provider.inc(Duration::from_millis(500));
let res = fut.poll_timeout().await;
assert_eq!(res, parts);
// not empty, so data arrives immediately
let fut = source.fetch();
let res = fut.poll_timeout().await;
assert_eq!(res, parts);
}
}
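
The wrapper's `fetch` is a poll-and-throttle loop: retry the inner fetch until it returns a non-empty batch, sleeping between attempts. A standalone sketch, with a plain `tokio::time::sleep` standing in for the injectable `TimeProvider`:

use std::time::Duration;

// Loop until the inner fetch yields a non-empty batch, throttling retries.
async fn fetch_not_empty<F, Fut>(mut inner: F, throttle: Duration) -> Vec<u32>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Vec<u32>>,
{
    loop {
        let batch = inner().await;
        if !batch.is_empty() {
            return batch;
        }
        tokio::time::sleep(throttle).await;
    }
}

#[tokio::main]
async fn main() {
    let mut calls = 0;
    let batch = fetch_not_empty(
        || {
            calls += 1;
            let out = if calls >= 3 { vec![5] } else { vec![] };
            async move { out }
        },
        Duration::from_millis(10),
    )
    .await;
    assert_eq!(batch, vec![5]);
}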


@@ -1,117 +0,0 @@
use std::fmt::Display;
use async_trait::async_trait;
use compactor_scheduler::CompactionJob;
use rand::{rngs::StdRng, seq::SliceRandom, SeedableRng};
use super::CompactionJobsSource;
#[derive(Debug)]
pub struct RandomizeOrderCompactionJobsSourcesWrapper<T>
where
T: CompactionJobsSource,
{
inner: T,
seed: u64,
}
impl<T> RandomizeOrderCompactionJobsSourcesWrapper<T>
where
T: CompactionJobsSource,
{
pub fn new(inner: T, seed: u64) -> Self {
Self { inner, seed }
}
}
impl<T> Display for RandomizeOrderCompactionJobsSourcesWrapper<T>
where
T: CompactionJobsSource,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "randomize_order({})", self.inner)
}
}
#[async_trait]
impl<T> CompactionJobsSource for RandomizeOrderCompactionJobsSourcesWrapper<T>
where
T: CompactionJobsSource,
{
async fn fetch(&self) -> Vec<CompactionJob> {
let mut compaction_jobs = self.inner.fetch().await;
let mut rng = StdRng::seed_from_u64(self.seed);
compaction_jobs.shuffle(&mut rng);
compaction_jobs
}
}
#[cfg(test)]
mod tests {
use data_types::PartitionId;
use super::{super::mock::MockCompactionJobsSource, *};
#[test]
fn test_display() {
let source = RandomizeOrderCompactionJobsSourcesWrapper::new(
MockCompactionJobsSource::new(vec![]),
123,
);
assert_eq!(source.to_string(), "randomize_order(mock)",);
}
#[tokio::test]
async fn test_fetch_empty() {
let source = RandomizeOrderCompactionJobsSourcesWrapper::new(
MockCompactionJobsSource::new(vec![]),
123,
);
assert_eq!(source.fetch().await, vec![],);
}
#[tokio::test]
async fn test_fetch_some() {
let cj_1 = CompactionJob::new(PartitionId::new(5));
let cj_2 = CompactionJob::new(PartitionId::new(1));
let cj_3 = CompactionJob::new(PartitionId::new(12));
let compaction_jobs = vec![cj_1.clone(), cj_2.clone(), cj_3.clone()];
// shuffles
let source = RandomizeOrderCompactionJobsSourcesWrapper::new(
MockCompactionJobsSource::new(compaction_jobs.clone()),
123,
);
assert_eq!(
source.fetch().await,
vec![cj_3.clone(), cj_2.clone(), cj_1.clone(),],
);
// is deterministic in same source
for _ in 0..100 {
assert_eq!(
source.fetch().await,
vec![cj_3.clone(), cj_2.clone(), cj_1.clone(),],
);
}
// is deterministic with new source
for _ in 0..100 {
let source = RandomizeOrderCompactionJobsSourcesWrapper::new(
MockCompactionJobsSource::new(compaction_jobs.clone()),
123,
);
assert_eq!(
source.fetch().await,
vec![cj_3.clone(), cj_2.clone(), cj_1.clone(),],
);
}
// different seed => different output
let source = RandomizeOrderCompactionJobsSourcesWrapper::new(
MockCompactionJobsSource::new(compaction_jobs.clone()),
1234,
);
assert_eq!(source.fetch().await, vec![cj_2, cj_3, cj_1,],);
}
}
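
Seeding `StdRng` is what makes the shuffle reproducible across calls and across sources, as the tests above assert. The determinism in isolation (same `rand` API as this crate uses):

use rand::{rngs::StdRng, seq::SliceRandom, SeedableRng};

fn main() {
    // The same seed always produces the same permutation.
    let mut a = vec![1, 2, 3, 4, 5];
    let mut b = a.clone();
    a.shuffle(&mut StdRng::seed_from_u64(123));
    b.shuffle(&mut StdRng::seed_from_u64(123));
    assert_eq!(a, b);
}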


@@ -1,54 +0,0 @@
use std::sync::Arc;
use async_trait::async_trait;
use compactor_scheduler::{CompactionJob, Scheduler};
use super::CompactionJobsSource;
#[derive(Debug)]
pub struct ScheduledCompactionJobsSource {
scheduler: Arc<dyn Scheduler>,
}
impl ScheduledCompactionJobsSource {
pub fn new(scheduler: Arc<dyn Scheduler>) -> Self {
Self { scheduler }
}
}
#[async_trait]
impl CompactionJobsSource for ScheduledCompactionJobsSource {
async fn fetch(&self) -> Vec<CompactionJob> {
self.scheduler.get_jobs().await
}
}
impl std::fmt::Display for ScheduledCompactionJobsSource {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "scheduled_compaction_jobs_source({})", self.scheduler)
}
}
#[cfg(test)]
mod tests {
use compactor_scheduler::create_test_scheduler;
use iox_tests::TestCatalog;
use iox_time::{MockProvider, Time};
use super::*;
#[test]
fn test_display() {
let scheduler = create_test_scheduler(
TestCatalog::new().catalog(),
Arc::new(MockProvider::new(Time::MIN)),
None,
);
let source = ScheduledCompactionJobsSource { scheduler };
assert_eq!(
source.to_string(),
"scheduled_compaction_jobs_source(local_compaction_scheduler)",
);
}
}


@@ -1,75 +0,0 @@
use std::{fmt::Display, sync::Arc};
use datafusion::physical_plan::{
stream::RecordBatchStreamAdapter, ExecutionPlan, SendableRecordBatchStream,
};
use futures::TryStreamExt;
use iox_query::exec::{Executor, ExecutorType};
use super::DataFusionPlanExec;
#[derive(Debug)]
pub struct DedicatedDataFusionPlanExec {
exec: Arc<Executor>,
}
impl DedicatedDataFusionPlanExec {
pub fn new(exec: Arc<Executor>) -> Self {
Self { exec }
}
}
impl Display for DedicatedDataFusionPlanExec {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "dedicated")
}
}
impl DataFusionPlanExec for DedicatedDataFusionPlanExec {
fn exec(&self, plan: Arc<dyn ExecutionPlan>) -> Vec<SendableRecordBatchStream> {
let stream_count = plan.output_partitioning().partition_count();
let schema = plan.schema();
let ctx = self.exec.new_context(ExecutorType::Reorg);
(0..stream_count)
.map(|i| {
let plan = Arc::clone(&plan);
let ctx = ctx.child_ctx("partition");
let stream =
futures::stream::once(
async move { ctx.execute_stream_partitioned(plan, i).await },
)
.try_flatten();
let stream = RecordBatchStreamAdapter::new(Arc::clone(&schema), stream);
Box::pin(stream) as SendableRecordBatchStream
})
.collect()
}
}
#[cfg(test)]
mod tests {
use crate::components::df_planner::panic::PanicPlan;
use super::*;
#[test]
fn test_display() {
let exec = DedicatedDataFusionPlanExec::new(Arc::new(Executor::new_testing()));
assert_eq!(exec.to_string(), "dedicated");
}
#[tokio::test]
async fn test_panic() {
let exec = DedicatedDataFusionPlanExec::new(Arc::new(Executor::new_testing()));
let mut streams = exec.exec(Arc::new(PanicPlan));
assert_eq!(streams.len(), 1);
let stream = streams.pop().unwrap();
let err = stream.try_collect::<Vec<_>>().await.unwrap_err();
assert_eq!(
err.to_string(),
"Join Error (panic)\ncaused by\nExternal error: Panic: foo"
);
}
}


@@ -1,24 +0,0 @@
use std::{
fmt::{Debug, Display},
sync::Arc,
};
use datafusion::physical_plan::{ExecutionPlan, SendableRecordBatchStream};
pub mod dedicated;
pub mod noop;
pub trait DataFusionPlanExec: Debug + Display + Send + Sync {
/// Convert DataFusion [`ExecutionPlan`] to multiple output streams.
///
/// # Stream Polling
/// These streams *must* run in parallel, otherwise a deadlock
/// can occur. Since there is a merge in the plan, in order to make
/// progress on one stream there must be (potential space) on the
/// other streams.
///
/// See:
/// - <https://github.com/influxdata/influxdb_iox/issues/4306>
/// - <https://github.com/influxdata/influxdb_iox/issues/4324>
fn exec(&self, plan: Arc<dyn ExecutionPlan>) -> Vec<SendableRecordBatchStream>;
}
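
One way to honor that poll-in-parallel contract is to merge the per-partition streams with `select_all`, so progress on any stream drives all of them. A sketch with trivial streams, assuming only the `futures` and `tokio` crates:

use futures::stream::{self, StreamExt};

#[tokio::main]
async fn main() {
    // Merge the output streams so all are polled concurrently; draining
    // them one after another is what risks the deadlock described above.
    let streams = vec![stream::iter(0..3), stream::iter(3..6)];
    let merged: Vec<i32> = stream::select_all(streams).collect().await;
    assert_eq!(merged.len(), 6);
}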


@@ -1,38 +0,0 @@
use std::{fmt::Display, sync::Arc};
use datafusion::physical_plan::{
stream::RecordBatchStreamAdapter, ExecutionPlan, SendableRecordBatchStream,
};
use super::DataFusionPlanExec;
/// A plan executor that returns empty streams, i.e. does nothing (for use in testing)
#[derive(Debug, Default)]
pub struct NoopDataFusionPlanExec;
impl NoopDataFusionPlanExec {
pub fn new() -> Self {
Self
}
}
impl Display for NoopDataFusionPlanExec {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "noop")
}
}
impl DataFusionPlanExec for NoopDataFusionPlanExec {
fn exec(&self, plan: Arc<dyn ExecutionPlan>) -> Vec<SendableRecordBatchStream> {
let stream_count = plan.output_partitioning().partition_count();
let schema = plan.schema();
(0..stream_count)
.map(|_| {
let stream = futures::stream::empty();
let stream = RecordBatchStreamAdapter::new(Arc::clone(&schema), stream);
Box::pin(stream) as SendableRecordBatchStream
})
.collect()
}
}


@@ -1,24 +0,0 @@
use std::{
fmt::{Debug, Display},
sync::Arc,
};
use async_trait::async_trait;
use datafusion::{error::DataFusionError, physical_plan::ExecutionPlan};
pub mod panic;
pub mod planner_v1;
mod query_chunk;
use crate::{partition_info::PartitionInfo, plan_ir::PlanIR};
/// Creates an [`ExecutionPlan`] for a [`PlanIR`] that compacts some
/// number of input files together
#[async_trait]
pub trait DataFusionPlanner: Debug + Display + Send + Sync {
async fn plan(
&self,
ir: &PlanIR,
partition: Arc<PartitionInfo>,
) -> Result<Arc<dyn ExecutionPlan>, DataFusionError>;
}

View File

@ -1,142 +0,0 @@
use std::{any::Any, fmt::Display, sync::Arc};
use async_trait::async_trait;
use datafusion::{
arrow::datatypes::SchemaRef,
error::DataFusionError,
execution::context::TaskContext,
physical_expr::PhysicalSortExpr,
physical_plan::{
stream::RecordBatchStreamAdapter, DisplayAs, DisplayFormatType, ExecutionPlan,
Partitioning, SendableRecordBatchStream, Statistics,
},
};
use schema::SchemaBuilder;
use crate::{partition_info::PartitionInfo, plan_ir::PlanIR};
use super::DataFusionPlanner;
/// A planner whose generated plans always panic when executed (for use in testing)
#[derive(Debug, Default, Clone, Copy)]
pub struct PanicDataFusionPlanner;
impl PanicDataFusionPlanner {
/// Create a new planner
pub fn new() -> Self {
Self
}
}
impl Display for PanicDataFusionPlanner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "panic")
}
}
#[async_trait]
impl DataFusionPlanner for PanicDataFusionPlanner {
async fn plan(
&self,
_ir: &PlanIR,
_partition: Arc<PartitionInfo>,
) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
Ok(Arc::new(PanicPlan))
}
}
#[derive(Debug)]
pub struct PanicPlan;
impl ExecutionPlan for PanicPlan {
fn as_any(&self) -> &dyn Any {
self as _
}
fn schema(&self) -> SchemaRef {
SchemaBuilder::new().build().unwrap().as_arrow()
}
fn output_partitioning(&self) -> Partitioning {
Partitioning::UnknownPartitioning(1)
}
fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> {
None
}
fn children(&self) -> Vec<Arc<dyn ExecutionPlan>> {
vec![]
}
fn with_new_children(
self: Arc<Self>,
children: Vec<Arc<dyn ExecutionPlan>>,
) -> datafusion::error::Result<Arc<dyn ExecutionPlan>> {
assert!(children.is_empty());
Ok(self)
}
fn execute(
&self,
partition: usize,
_context: Arc<TaskContext>,
) -> datafusion::error::Result<SendableRecordBatchStream> {
assert_eq!(partition, 0);
let stream = futures::stream::once(async move { panic!("foo") });
let stream = RecordBatchStreamAdapter::new(self.schema(), stream);
Ok(Box::pin(stream))
}
fn statistics(&self) -> Statistics {
unimplemented!()
}
}
impl DisplayAs for PanicPlan {
fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match t {
DisplayFormatType::Default | DisplayFormatType::Verbose => {
write!(f, "PanicPlan")
}
}
}
}
#[cfg(test)]
mod tests {
use data_types::CompactionLevel;
use datafusion::{physical_plan::collect, prelude::SessionContext};
use crate::{file_classification::CompactReason, test_utils::PartitionInfoBuilder};
use super::*;
#[test]
fn test_display() {
assert_eq!(PanicDataFusionPlanner::new().to_string(), "panic");
}
#[tokio::test]
#[should_panic(expected = "foo")]
async fn test_panic() {
let planner = PanicDataFusionPlanner::new();
let partition = Arc::new(PartitionInfoBuilder::new().build());
let plan = planner
.plan(
&PlanIR::Compact {
files: vec![],
target_level: CompactionLevel::Final,
// This reason is arbitrary
reason: CompactReason::ManySmallFiles,
},
partition,
)
.await
.unwrap();
let session_ctx = SessionContext::new();
let task_ctx = Arc::new(TaskContext::from(&session_ctx));
collect(plan, task_ctx).await.ok();
}
}

View File

@ -1,112 +0,0 @@
use std::{fmt::Display, sync::Arc};
use async_trait::async_trait;
use datafusion::{error::DataFusionError, physical_plan::ExecutionPlan};
use iox_query::{
exec::{Executor, ExecutorType},
frontend::reorg::ReorgPlanner,
};
use parquet_file::storage::ParquetStorage;
use crate::{
components::df_planner::query_chunk::{to_query_chunks, QueryableParquetChunk},
partition_info::PartitionInfo,
plan_ir::PlanIR,
};
use super::DataFusionPlanner;
/// Builder for compaction plans.
///
/// This uses the first draft / version of how the compactor splits files / time ranges. There will probably be future
/// implementations (maybe called V2, but maybe it also gets a proper name).
#[derive(Debug)]
pub struct V1DataFusionPlanner {
store: ParquetStorage,
exec: Arc<Executor>,
}
impl V1DataFusionPlanner {
/// Create a new compact plan builder.
pub fn new(store: ParquetStorage, exec: Arc<Executor>) -> Self {
Self { store, exec }
}
}
impl Display for V1DataFusionPlanner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "v1")
}
}
#[async_trait]
impl DataFusionPlanner for V1DataFusionPlanner {
async fn plan(
&self,
ir: &PlanIR,
partition: Arc<PartitionInfo>,
) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
let ctx = self.exec.new_context(ExecutorType::Reorg);
let plan = match ir {
PlanIR::None { .. } => unreachable!("filter out None plans before calling plan"),
PlanIR::Compact { files, .. } => {
let query_chunks = to_query_chunks(files, &partition, self.store.clone());
let merged_schema = QueryableParquetChunk::merge_schemas(&query_chunks);
let sort_key = partition
.sort_key
.as_ref()
.expect("no partition sort key in catalog")
.filter_to(&merged_schema.primary_key(), partition.partition_id.get());
ReorgPlanner::new()
.compact_plan(
Arc::from(partition.table.name.clone()),
&merged_schema,
query_chunks,
sort_key,
)
.map_err(|e| {
DataFusionError::Context(
String::from("planner"),
Box::new(DataFusionError::External(Box::new(e))),
)
})?
}
PlanIR::Split {
files, split_times, ..
} => {
let query_chunks = to_query_chunks(files, &partition, self.store.clone());
let merged_schema = QueryableParquetChunk::merge_schemas(&query_chunks);
let sort_key = partition
.sort_key
.as_ref()
.expect("no partition sort key in catalog")
.filter_to(&merged_schema.primary_key(), partition.partition_id.get());
ReorgPlanner::new()
.split_plan(
Arc::from(partition.table.name.clone()),
&merged_schema,
query_chunks,
sort_key,
split_times.clone(),
)
.map_err(|e| {
DataFusionError::Context(
String::from("planner"),
Box::new(DataFusionError::External(Box::new(e))),
)
})?
}
};
// Build physical compact plan
ctx.create_physical_plan(&plan).await.map_err(|e| {
DataFusionError::Context(
String::from("planner"),
Box::new(DataFusionError::External(Box::new(e))),
)
})
}
}
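A hedged usage sketch tying this planner to the plan executor shown earlier (illustrative only; `store`, `exec`, `ir`, and `partition_info` are assumed to be in scope, inside an async function):

```rust
let planner = V1DataFusionPlanner::new(store, Arc::clone(&exec));
let physical_plan = planner.plan(&ir, Arc::new(partition_info)).await?;
// Hand the physical plan to a DataFusionPlanExec and drain the resulting
// streams in parallel, as the trait's doc comment requires.
let streams = DedicatedDataFusionPlanExec::new(exec).exec(physical_plan);
```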

View File

@ -1,174 +0,0 @@
//! QueryableParquetChunk for building query plan
use std::{any::Any, sync::Arc};
use data_types::{ChunkId, ChunkOrder, TransitionPartitionId};
use datafusion::physical_plan::Statistics;
use iox_query::{util::create_basic_summary, QueryChunk, QueryChunkData};
use observability_deps::tracing::debug;
use parquet_file::{chunk::ParquetChunk, storage::ParquetStorage};
use schema::{merge::SchemaMerger, sort::SortKey, Schema};
use uuid::Uuid;
use crate::{partition_info::PartitionInfo, plan_ir::FileIR};
/// QueryableParquetChunk that implements QueryChunk and QueryMetaChunk for building query plan
#[derive(Debug, Clone)]
pub struct QueryableParquetChunk {
// Data of the parquet file
data: Arc<ParquetChunk>,
partition_id: TransitionPartitionId,
sort_key: Option<SortKey>,
order: ChunkOrder,
stats: Arc<Statistics>,
}
impl QueryableParquetChunk {
/// Initialize a QueryableParquetChunk
pub fn new(
partition_id: TransitionPartitionId,
data: Arc<ParquetChunk>,
sort_key: Option<SortKey>,
order: ChunkOrder,
) -> Self {
let stats = Arc::new(create_basic_summary(
data.rows() as u64,
data.schema(),
Some(data.timestamp_min_max()),
));
Self {
data,
partition_id,
sort_key,
order,
stats,
}
}
/// Merge schema of the given chunks
pub fn merge_schemas(chunks: &[Arc<dyn QueryChunk>]) -> Schema {
let mut merger = SchemaMerger::new();
for chunk in chunks {
merger = merger.merge(chunk.schema()).expect("schemas compatible");
}
merger.build()
}
/// Return the parquet file's object store id
pub fn object_store_id(&self) -> Uuid {
self.data.object_store_id()
}
}
impl QueryChunk for QueryableParquetChunk {
fn stats(&self) -> Arc<Statistics> {
Arc::clone(&self.stats)
}
fn schema(&self) -> &Schema {
self.data.schema()
}
fn partition_id(&self) -> &TransitionPartitionId {
&self.partition_id
}
fn sort_key(&self) -> Option<&SortKey> {
self.sort_key.as_ref()
}
// This function is needed to distinguish the ParquetChunks further if they happen to have the
// same creation order.
// Ref: chunks.sort_unstable_by_key(|c| (c.order(), c.id())); in provider.rs
fn id(&self) -> ChunkId {
// When we need the order to split overlapped chunks, the ChunkOrder is already different.
// ChunkId is only used as a tiebreaker and does not matter much, so use the object store id
self.object_store_id().into()
}
/// Returns true if the chunk may contain a duplicate "primary key" within itself
fn may_contain_pk_duplicates(&self) -> bool {
// Data of a parquet file has no duplicates
false
}
fn data(&self) -> QueryChunkData {
QueryChunkData::Parquet(self.data.parquet_exec_input())
}
/// Returns chunk type
fn chunk_type(&self) -> &str {
"QueryableParquetChunk"
}
// Order of the chunk so they can be deduplicated correctly
fn order(&self) -> ChunkOrder {
self.order
}
fn as_any(&self) -> &dyn Any {
self
}
}
pub fn to_query_chunks(
files: &[FileIR],
partition_info: &PartitionInfo,
store: ParquetStorage,
) -> Vec<Arc<dyn QueryChunk>> {
files
.iter()
.map(|file| {
Arc::new(to_queryable_parquet_chunk(
file,
partition_info,
store.clone(),
)) as _
})
.collect()
}
/// Convert to a QueryableParquetChunk
fn to_queryable_parquet_chunk(
file: &FileIR,
partition_info: &PartitionInfo,
store: ParquetStorage,
) -> QueryableParquetChunk {
let column_id_lookup = partition_info.table_schema.column_id_map();
let selection: Vec<_> = file
.file
.column_set
.iter()
.flat_map(|id| column_id_lookup.get(id).copied())
.collect();
let table_schema: Schema = partition_info
.table_schema
.as_ref()
.columns
.clone()
.try_into()
.expect("table schema is broken");
let schema = table_schema
.select_by_names(&selection)
.expect("schema in-sync");
let pk = schema.primary_key();
let sort_key = partition_info
.sort_key
.as_ref()
.map(|sk| sk.filter_to(&pk, partition_info.partition_id.get()));
let partition_id = partition_info.partition_id();
// Make it debug for it to show up in prod's initial setup
let uuid = file.file.object_store_id;
debug!(
parquet_file_id = file.file.id.get(),
parquet_file_namespace_id = file.file.namespace_id.get(),
parquet_file_table_id = file.file.table_id.get(),
parquet_file_partition_id = %file.file.partition_id,
parquet_file_object_store_id = uuid.to_string().as_str(),
"built parquet chunk from metadata"
);
let parquet_chunk = ParquetChunk::new(Arc::new(file.file.clone()), schema, store);
QueryableParquetChunk::new(partition_id, Arc::new(parquet_chunk), sort_key, file.order)
}

View File

@ -1,21 +0,0 @@
use std::fmt::{Debug, Display};
use data_types::{ParquetFile, TransitionPartitionId};
use crate::round_info::CompactType;
pub mod multiple_branches;
pub trait DivideInitial: Debug + Display + Send + Sync {
/// Divides a group of files that should be compacted into
/// potentially smaller groups called "branches".
///
/// Each branch is compacted together in a single plan, and each
/// compact plan may produce one or more parquet files.
fn divide(
&self,
files: Vec<ParquetFile>,
op: CompactType,
partition: TransitionPartitionId,
) -> (Vec<Vec<ParquetFile>>, Vec<ParquetFile>);
}

View File

@ -1,333 +0,0 @@
use std::fmt::Display;
use data_types::{CompactionLevel, ParquetFile, Timestamp, TransitionPartitionId};
use observability_deps::tracing::warn;
use crate::round_info::CompactType;
use super::DivideInitial;
#[derive(Debug, Default)]
pub struct MultipleBranchesDivideInitial;
impl MultipleBranchesDivideInitial {
pub fn new() -> Self {
Self
}
}
impl Display for MultipleBranchesDivideInitial {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "multiple_branches")
}
}
// TODO(joe): maintain this comment through the next few PRs; see how true the comment is/remains.
// divide is the second of three file list manipulation layers. split has already filtered most of the
// files not relevant to this round of compaction. Now divide will group files into branches which can
// be concurrently operated on. As of this comment, dividing the files into branches is fairly simple,
// except for:
// - ManySmallFiles, which is cluttered from previous challenges when RoundInfo didn't have as much influence.
//   This clutter will either go away, or maybe ManySmallFiles goes away entirely.
// - SimulatedLeadingEdge, which is likely a temporary workaround during the refactoring (i.e. likely to go away).
//
// Over time this function should work towards being a simpler grouping into batches for concurrent operations.
// If that happens (and this layer remains), this should probably be renamed to emphasize its role in branch creation
// rather than the currently ambiguous 'divide', which sounds potentially overlapping with divisions happening in the
// other layers.
impl DivideInitial for MultipleBranchesDivideInitial {
fn divide(
&self,
files: Vec<ParquetFile>,
op: CompactType,
partition: TransitionPartitionId,
) -> (Vec<Vec<ParquetFile>>, Vec<ParquetFile>) {
let mut more_for_later = vec![];
match op {
CompactType::ManySmallFiles {
start_level,
max_num_files_to_group,
max_total_file_size_to_group,
} => {
// Since it's ManySmallFiles, we know the files are L0s, and the total bytes are under our limit.
// We just need to split them up into branches for compaction.
// TODO: it would be nice to pick some good split times, store them in the ManySmallFiles op, and use them consistently across all the branches. That should make the later round more efficient.
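// Worked example (illustrative, mirroring test_divide_num_file below): with
// max_num_files_to_group = 2 and files f1, f2, f3 ordered by max_l0_created_at,
// f1 and f2 form one branch; f3 alone would be a branch of 1, so it goes to
// more_for_later instead.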
let mut branches = Vec::with_capacity(files.len() / max_num_files_to_group);
let files = order_files(files, start_level);
let capacity = files.len();
let mut current_branch = Vec::with_capacity(capacity.min(max_num_files_to_group));
let mut current_branch_size = 0;
for f in files {
if current_branch.len() == max_num_files_to_group
|| current_branch_size + f.file_size_bytes as usize
> max_total_file_size_to_group
{
if current_branch.is_empty() {
warn!(
"Size of a file {} is larger than the max size limit to compact on partition {}.",
f.file_size_bytes,
partition
);
}
if current_branch.len() == 1 {
// Compacting a branch of 1 won't help us reduce the L0 file count. Put it on the ignore list.
more_for_later.push(current_branch.pop().unwrap());
} else if !current_branch.is_empty() {
branches.push(current_branch);
}
current_branch = Vec::with_capacity(capacity);
current_branch_size = 0;
}
current_branch_size += f.file_size_bytes as usize;
current_branch.push(f);
}
// push the last branch
if !current_branch.is_empty() {
if current_branch.len() == 1 {
// Compacting a branch of 1 won't help us reduce the L0 file count. Put it on the ignore list.
more_for_later.push(current_branch.pop().unwrap());
} else {
branches.push(current_branch);
}
}
(branches, more_for_later)
}
CompactType::TargetLevel {
target_level,
max_total_file_size_to_group,
} => {
let start_level = target_level.prev();
let total_bytes: usize = files.iter().map(|f| f.file_size_bytes as usize).sum();
let start_file_cnt = files
.iter()
.filter(|f| f.compaction_level == start_level)
.count();
if start_file_cnt == 0 {
// No files to compact
(vec![], files)
} else if total_bytes < max_total_file_size_to_group {
(vec![files], more_for_later)
} else {
let (mut for_now, rest): (Vec<ParquetFile>, Vec<ParquetFile>) = files
.into_iter()
.partition(|f| f.compaction_level == start_level);
let min_time = for_now.iter().map(|f| f.min_time).min().unwrap();
let max_time = for_now.iter().map(|f| f.max_time).max().unwrap();
let (overlaps, for_later): (Vec<ParquetFile>, Vec<ParquetFile>) = rest
.into_iter()
.partition(|f2| f2.overlaps_time_range(min_time, max_time));
for_now.extend(overlaps);
(vec![for_now], for_later)
}
}
CompactType::SimulatedLeadingEdge {
max_num_files_to_group,
max_total_file_size_to_group,
} => {
// There may be a lot of L0s, but we're going to keep it simple and just look at the first (few).
let start_level = CompactionLevel::Initial;
// Separate start_level_files (L0s) from target_level_files (L1), and `more_for_later` which is all the rest of the files (L2).
// We'll then start with the first L0 and see how many L1s it needs to drag into its compaction, and then look at the next L0
// and see how many L1s it needs, etc. We'll end up with 1 or more L0s, and whatever L1s they need to compact with.
// When we can't add any more without getting "too big", we'll consider that our branch, and all remaining L0s, L1s & L2s
// are returned as `more_for_later`.
let (start_level_files, rest): (Vec<ParquetFile>, Vec<ParquetFile>) = files
.into_iter()
.partition(|f| f.compaction_level == start_level);
let (mut target_level_files, mut more_for_later): (
Vec<ParquetFile>,
Vec<ParquetFile>,
) = rest
.into_iter()
.partition(|f| f.compaction_level == start_level.next());
let mut start_level_files = order_files(start_level_files, start_level);
let mut current_branch = Vec::with_capacity(start_level_files.len());
let mut current_branch_size = 0;
let mut min_time = Timestamp::new(i64::MAX);
let mut max_time = Timestamp::new(0);
// Until we run out of L0s or return early with a branch to compact, keep adding L0s & their overlapping L1s to the current branch.
while !start_level_files.is_empty() {
let f = start_level_files.remove(0);
// Overlaps with this new file alone aren't enough - we must look for overlaps with this + all previously added L0s, because the result of
// compacting them will be that whole time range. Therefore, we must include all L1s overlapping that entire time range.
min_time = min_time.min(f.min_time);
max_time = max_time.max(f.max_time);
let (mut overlaps, remainder): (Vec<ParquetFile>, Vec<ParquetFile>) =
target_level_files
.into_iter()
.partition(|f2| f2.overlaps_time_range(min_time, max_time));
target_level_files = remainder;
if current_branch_size == 0 || // minimum 1 start level file
(current_branch.len() + overlaps.len() < max_num_files_to_group && current_branch_size + f.size() < max_total_file_size_to_group)
{
// This L0 & its overlapping L1s fit in the current branch.
current_branch_size += f.size();
current_branch.push(f);
current_branch.append(&mut overlaps);
} else {
// This L0 & its overlapping L1s would make the current branch too big.
// We're done - what we previously added to the branch will be compacted, everything else goes in more_for_later.
more_for_later.push(f);
more_for_later.append(&mut overlaps);
more_for_later.append(&mut start_level_files);
more_for_later.append(&mut target_level_files);
return (vec![current_branch], more_for_later);
}
}
(vec![current_branch], more_for_later)
}
// RoundSplit already eliminated all the files we don't need to work on.
CompactType::VerticalSplit { .. } => (vec![files], more_for_later),
// Deferred does nothing now, everything is for later
CompactType::Deferred { .. } => (vec![], files),
}
}
}
/// Return the given files, sorted.
/// The order is used to split the files and form the right groups of files to compact
/// and deduplicate correctly into fewer and larger files at the same level
///
/// All given files are in the same given start_level.
/// They will be sorted on their `max_l0_created_at` (then `min_time`) if the start_level is 0,
/// otherwise on their `min_time`
pub fn order_files(files: Vec<ParquetFile>, start_level: CompactionLevel) -> Vec<ParquetFile> {
let mut files = files;
if start_level == CompactionLevel::Initial {
files.sort_by(|a, b| {
if a.max_l0_created_at == b.max_l0_created_at {
a.min_time.cmp(&b.min_time)
} else {
a.max_l0_created_at.cmp(&b.max_l0_created_at)
}
})
} else {
files.sort_by(|a, b| a.min_time.cmp(&b.min_time));
}
files
}
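// Illustrative example (not in the original file): L0 input with
// (max_l0_created_at, min_time) pairs (5, 30), (1, 20), (1, 10) sorts to
// (1, 10), (1, 20), (5, 30) - creation time first, min_time as tiebreaker.
// For L1/L2 input only min_time is considered.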
#[cfg(test)]
mod tests {
use data_types::{CompactionLevel, PartitionId};
use iox_tests::ParquetFileBuilder;
use super::*;
#[test]
fn test_display() {
assert_eq!(
MultipleBranchesDivideInitial::new().to_string(),
"multiple_branches"
);
}
#[test]
fn test_divide_num_file() {
let op = CompactType::ManySmallFiles {
start_level: CompactionLevel::Initial,
max_num_files_to_group: 2,
max_total_file_size_to_group: 100,
};
let divide = MultipleBranchesDivideInitial::new();
// empty input
assert_eq!(
divide.divide(
vec![],
op.clone(),
TransitionPartitionId::Deprecated(PartitionId::new(0))
),
(Vec::<Vec<_>>::new(), Vec::new())
);
// not empty
let f1 = ParquetFileBuilder::new(1)
.with_compaction_level(CompactionLevel::Initial)
.with_max_l0_created_at(1)
.build();
let f2 = ParquetFileBuilder::new(2)
.with_compaction_level(CompactionLevel::Initial)
.with_max_l0_created_at(5)
.build();
let f3 = ParquetFileBuilder::new(3)
.with_compaction_level(CompactionLevel::Initial)
.with_max_l0_created_at(10)
.build();
// files in random order of max_l0_created_at
let files = vec![f2.clone(), f3.clone(), f1.clone()];
let (branches, more_for_later) = divide.divide(
files,
op.clone(),
TransitionPartitionId::Deprecated(PartitionId::new(0)),
);
// output must be grouped and ordered by max_l0_created_at
assert_eq!(branches.len(), 1);
assert_eq!(more_for_later.len(), 1);
assert_eq!(branches[0], vec![f1, f2]);
}
#[test]
fn test_divide_size_limit() {
let op = CompactType::ManySmallFiles {
start_level: CompactionLevel::Initial,
max_num_files_to_group: 10,
max_total_file_size_to_group: 100,
};
let divide = MultipleBranchesDivideInitial::new();
let f1 = ParquetFileBuilder::new(1)
.with_compaction_level(CompactionLevel::Initial)
.with_max_l0_created_at(1)
.with_file_size_bytes(90)
.build();
let f2 = ParquetFileBuilder::new(2)
.with_compaction_level(CompactionLevel::Initial)
.with_max_l0_created_at(5)
.with_file_size_bytes(20)
.build();
let f3 = ParquetFileBuilder::new(3)
.with_compaction_level(CompactionLevel::Initial)
.with_max_l0_created_at(10)
.with_file_size_bytes(30)
.build();
// files in random order of max_l0_created_at
let files = vec![f2.clone(), f3.clone(), f1.clone()];
let (branches, more_for_later) = divide.divide(
files,
op,
TransitionPartitionId::Deprecated(PartitionId::new(0)),
);
// output must be grouped and ordered by max_l0_created_at
assert_eq!(branches.len(), 1);
assert_eq!(more_for_later.len(), 1);
assert_eq!(branches[0], vec![f2, f3]);
}
}

View File

@ -1,63 +0,0 @@
use std::fmt::Display;
use data_types::ParquetFile;
use observability_deps::tracing::info;
use crate::{
file_classification::FileClassification, partition_info::PartitionInfo, round_info::CompactType,
};
use super::FileClassifier;
#[derive(Debug)]
pub struct LoggingFileClassifierWrapper<T>
where
T: FileClassifier,
{
inner: T,
}
impl<T> LoggingFileClassifierWrapper<T>
where
T: FileClassifier,
{
pub fn new(inner: T) -> Self {
Self { inner }
}
}
impl<T> Display for LoggingFileClassifierWrapper<T>
where
T: FileClassifier,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "display({})", self.inner)
}
}
impl<T> FileClassifier for LoggingFileClassifierWrapper<T>
where
T: FileClassifier,
{
fn classify(
&self,
partition_info: &PartitionInfo,
op: &CompactType,
files: Vec<ParquetFile>,
) -> FileClassification {
let classification = self.inner.classify(partition_info, op, files);
info!(
partition_id = partition_info.partition_id.get(),
target_level = %classification.target_level,
op = %op,
files_to_compact = classification.num_files_to_compact(),
files_to_split = classification.num_files_to_split(),
files_to_upgrade = classification.num_files_to_upgrade(),
files_to_keep = classification.num_files_to_keep(),
"file classification"
);
classification
}
}

View File

@ -1,36 +0,0 @@
use std::{
fmt::{Debug, Display},
sync::Arc,
};
use data_types::ParquetFile;
use crate::{
file_classification::FileClassification, partition_info::PartitionInfo, round_info::CompactType,
};
pub mod logging;
pub mod split_based;
pub trait FileClassifier: Debug + Display + Send + Sync {
fn classify(
&self,
partition_info: &PartitionInfo,
op: &CompactType,
files: Vec<ParquetFile>,
) -> FileClassification;
}
impl<T> FileClassifier for Arc<T>
where
T: FileClassifier + ?Sized,
{
fn classify(
&self,
partition_info: &PartitionInfo,
op: &CompactType,
files: Vec<ParquetFile>,
) -> FileClassification {
self.as_ref().classify(partition_info, op, files)
}
}

View File

@ -1,371 +0,0 @@
use std::fmt::Display;
use data_types::{CompactionLevel, ParquetFile, TransitionPartitionId};
use crate::{
components::{
divide_initial::multiple_branches::order_files, files_split::FilesSplit,
split_or_compact::SplitOrCompact,
},
file_classification::{
CompactReason, FileClassification, FileToSplit, FilesForProgress, FilesToSplitOrCompact,
NoneReason, SplitReason,
},
partition_info::PartitionInfo,
round_info::CompactType,
};
use super::FileClassifier;
/// Use [`FilesSplit`] to build a [`FileClassification`].
///
/// Uses the target_level from the `round_info` in the following data flow:
///
/// ```text
/// (files+target_level)-+.......................................
/// | :
/// | :
/// | +................................+
/// | : :
/// | : :
/// V V :
/// [target level split (FT)] :
/// | | :
/// | | :
/// | +------------+ :
/// | | :
/// | | :
/// | +............|...................+
/// | : | :
/// V V | :
/// [non overlap split (FO)] | :
/// | | | :
/// | | | :
/// | +------------+------+ :
/// | | :
/// | | :
/// | +................................+
/// | : | :
/// V V | :
/// [upgrade split (FU)] | :
/// | | | :
/// | | | :
/// | V | :
/// | (files upgrade) | :
/// | | :
/// | +................................+
/// | | |
/// V V |
/// [split or compact (FSC)] |
/// | | |
/// | +-------------------+
/// | |
/// V V
/// (files compact or split) (files keep)
/// ```
#[derive(Debug)]
pub struct SplitBasedFileClassifier<FT, FO, FU, FSC>
where
FT: FilesSplit,
FO: FilesSplit,
FU: FilesSplit,
FSC: SplitOrCompact,
{
target_level_split: FT,
non_overlap_split: FO,
upgrade_split: FU,
split_or_compact: FSC,
}
impl<FT, FO, FU, FSC> SplitBasedFileClassifier<FT, FO, FU, FSC>
where
FT: FilesSplit,
FO: FilesSplit,
FU: FilesSplit,
FSC: SplitOrCompact,
{
pub fn new(
target_level_split: FT,
non_overlap_split: FO,
upgrade_split: FU,
split_or_compact: FSC,
) -> Self {
Self {
target_level_split,
non_overlap_split,
upgrade_split,
split_or_compact,
}
}
}
impl<FT, FO, FU, FSC> Display for SplitBasedFileClassifier<FT, FO, FU, FSC>
where
FT: FilesSplit,
FO: FilesSplit,
FU: FilesSplit,
FSC: SplitOrCompact,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"split_based(target_level_split={}, non_overlap_split={}, upgrade_split={})",
self.target_level_split, self.non_overlap_split, self.upgrade_split,
)
}
}
// TODO(joe): maintain this comment through the next few PRs; see how true the comment is/remains.
// classify is the third of three file list manipulation layers. It is given a list of files
// for a single branch and decides which files should be upgraded, split (with split times) or compacted.
// classification can decide to ignore some files for this round (putting them in files_to_keep), but
// that's generally the result of split|compact decisions. Files not needing to be considered for action in
// this round were already filtered out before we get to classification.
impl<FT, FO, FU, FSC> FileClassifier for SplitBasedFileClassifier<FT, FO, FU, FSC>
where
FT: FilesSplit,
FO: FilesSplit,
FU: FilesSplit,
FSC: SplitOrCompact,
{
fn classify(
&self,
partition_info: &PartitionInfo,
op: &CompactType,
files: Vec<ParquetFile>,
) -> FileClassification {
let files_to_compact = files;
match op {
CompactType::ManySmallFiles {
start_level,
max_num_files_to_group,
max_total_file_size_to_group,
} => file_classification_for_many_files(
partition_info.partition_id(),
*max_total_file_size_to_group,
*max_num_files_to_group,
files_to_compact,
*start_level,
),
CompactType::SimulatedLeadingEdge { .. } => {
// file division already done in round_info_source
FileClassification {
target_level: op.target_level(),
files_to_make_progress_on: FilesForProgress {
upgrade: vec![],
split_or_compact: FilesToSplitOrCompact::Compact(
files_to_compact,
CompactReason::TotalSizeLessThanMaxCompactSize,
),
},
files_to_keep: vec![],
}
}
CompactType::VerticalSplit { split_times } => file_classification_for_vertical_split(
split_times,
files_to_compact,
partition_info.partition_id(),
),
CompactType::TargetLevel { target_level, .. } => {
let partition_id = partition_info.partition_id();
// Split files into files_to_compact, files_to_upgrade, and files_to_keep
//
// Since output of one compaction is used as input of next compaction, all files that are not
// compacted or upgraded are still kept to consider in next round of compaction
// Split actual files to compact from its higher-target-level files
// The higher-target-level files are kept for next round of compaction
let target_level = *target_level;
let (files_to_compact, mut files_to_keep) = self.target_level_split.apply(
files_to_compact,
target_level,
partition_id.clone(),
);
// To have efficient compaction performance, we do not need to compact eligible non-overlapped files
// Find eligible non-overlapped files and keep for next round of compaction
let (files_to_compact, non_overlapping_files) = self.non_overlap_split.apply(
files_to_compact,
target_level,
partition_id.clone(),
);
files_to_keep.extend(non_overlapping_files);
// To have efficient compaction performance, we only need to upgrade (catalog update only) eligible files
let (files_to_compact, files_to_upgrade) =
self.upgrade_split
.apply(files_to_compact, target_level, partition_id);
// See if we need to split start-level files due to over compaction size limit
let (files_to_split_or_compact, other_files) =
self.split_or_compact
.apply(partition_info, files_to_compact, target_level);
files_to_keep.extend(other_files);
let files_to_make_progress_on = FilesForProgress {
upgrade: files_to_upgrade,
split_or_compact: files_to_split_or_compact,
};
FileClassification {
target_level,
files_to_make_progress_on,
files_to_keep,
}
}
CompactType::Deferred {} => FileClassification {
target_level: CompactionLevel::Initial,
files_to_make_progress_on: FilesForProgress {
upgrade: vec![],
split_or_compact: FilesToSplitOrCompact::None(NoneReason::Deferred),
},
files_to_keep: files_to_compact,
},
}
}
}
// ManySmallFiles assumes the L0 files are tiny and aims to do L0->L0 compaction to reduce the number of tiny files.
// With vertical splitting, this only operates on a CompactionRange that's up to the max_compact_size. So while there's
// many files, we know there's not many bytes (in total). Because of this, we can skip anything that's a sizeable portion
// of the max_compact_size, knowing that we can still get the L0 quantity down to max_num_files_to_group.
fn file_classification_for_many_files(
partition: TransitionPartitionId,
max_total_file_size_to_group: usize,
max_num_files_to_group: usize,
files: Vec<ParquetFile>,
target_level: CompactionLevel,
) -> FileClassification {
// Verify all input files are in the target_level
let err_msg = format!(
"all files to compact must be in {target_level} level, but found files in other levels",
);
assert!(
files.iter().all(|f| f.compaction_level == target_level),
"{err_msg}, partition_id {}",
partition,
);
let mut files_to_compact = vec![];
let mut files_to_keep: Vec<ParquetFile> = vec![];
// If we're under the max_total_file_size_to_group, we could unconditionally put all files in files_to_compact.
// But ManySmallFiles always compacts to L0, which means we'll be rewriting them again anyway to get to L1. So if
// there's a larger file or two we can skip in this L0->L0 compaction (while still getting the file count small
// enough), skipping them helps write amplification. So assume tiny files separated by non-tiny files, and we
// need to get down to max_num_files_to_group.
// skip_size is the biggest file we can skip and still be guaranteed to get down to max_num_files_to_group.
let skip_size = max_total_file_size_to_group * 2 / max_num_files_to_group;
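// Worked example (illustrative numbers): with max_total_file_size_to_group = 100
// and max_num_files_to_group = 20, skip_size = 100 * 2 / 20 = 10. Every skipped
// file is >= 10 bytes while the total is <= 100 bytes, so at most 10 files can
// be skipped; the rest compact into a single L0, leaving at most 11 files -
// under the limit of 20.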
// Enforce max_num_files_to_group
let ordered_files = order_files(files, target_level.prev());
let mut chunk_bytes: usize = 0;
let mut chunk: Vec<ParquetFile> = Vec::with_capacity(max_num_files_to_group);
for f in ordered_files {
if !files_to_compact.is_empty() {
// We've already got a batch of files to compact, this can wait.
files_to_keep.push(f);
} else if chunk_bytes + f.file_size_bytes as usize > max_total_file_size_to_group
|| chunk.len() + 1 > max_num_files_to_group
|| f.file_size_bytes >= skip_size as i64
{
// This file will not be included in this compaction.
files_to_keep.push(f);
if chunk.len() > 1 {
// Several files; we'll do an L0->L0 comapction on them.
files_to_compact = chunk.to_vec();
chunk = Vec::with_capacity(max_num_files_to_group);
} else if !chunk.is_empty() {
// Just one file, and we don't want to compact it with 'f', so skip it.
files_to_keep.append(chunk.to_vec().as_mut());
chunk = Vec::with_capacity(max_num_files_to_group);
}
} else {
// This file goes in our draft chunk to compact
chunk_bytes += f.file_size_bytes as usize;
chunk.push(f);
}
}
if !chunk.is_empty() {
assert!(files_to_compact.is_empty(), "we shouldn't accumulate multiple non-contiguous chunks to compact, but we found non-contiguous chunks in compaction job for partition_id={}", partition);
if chunk.len() > 1 {
// The final chunk has several files; compact them.
files_to_compact = chunk.to_vec();
} else if !chunk.is_empty() {
files_to_keep.append(chunk.to_vec().as_mut());
}
}
assert!(
chunk.is_empty() || chunk.len() > 1,
"should not have only 1 chunk, for partition {}",
partition
);
let files_to_make_progress_on = FilesForProgress {
upgrade: vec![],
split_or_compact: FilesToSplitOrCompact::Compact(
files_to_compact,
CompactReason::ManySmallFiles,
),
};
FileClassification {
target_level,
files_to_make_progress_on,
files_to_keep,
}
}
// VerticalSplit splits the given files at the given split_times.
// All files given here must be L0 files overlapping at least one of the split_times.
fn file_classification_for_vertical_split(
split_times: &[i64],
files: Vec<ParquetFile>,
partition: TransitionPartitionId,
) -> FileClassification {
let target_level = CompactionLevel::Initial;
let files_to_keep: Vec<ParquetFile> = vec![];
let mut files_to_split: Vec<FileToSplit> = Vec::with_capacity(files.len());
// Determine the necessary splits for each file.
// split time is the last ns included in the 'left' file in the split. So if the split time matches the max time
// of a file, that file does not need to be split.
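// Example (illustrative): a file spanning [100, 200] with split_times
// [150, 200, 250] is split only at 150; 200 equals the file's max_time (the
// 'left' side would be the whole file) and 250 is outside its range.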
for f in files {
let this_file_splits: Vec<i64> = split_times
.iter()
.filter(|split| split >= &&f.min_time.get() && split < &&f.max_time.get())
.cloned()
.collect();
assert!(
!this_file_splits.is_empty(),
"files not needing split should be filtered out, instead found to-compact file (not to-split) in partition {}", partition
);
let file_to_split = FileToSplit {
file: f,
split_times: this_file_splits,
};
files_to_split.push(file_to_split);
}
let files_to_make_progress_on = FilesForProgress {
upgrade: vec![],
split_or_compact: FilesToSplitOrCompact::Split(files_to_split, SplitReason::VerticalSplit),
};
FileClassification {
target_level,
files_to_make_progress_on,
files_to_keep,
}
}

View File

@ -1,36 +0,0 @@
use std::{fmt::Display, sync::Arc};
use data_types::ParquetFile;
use super::FileFilter;
#[derive(Debug)]
pub struct AndFileFilter {
filters: Vec<Arc<dyn FileFilter>>,
}
impl AndFileFilter {
#[allow(dead_code)]
pub fn new(filters: Vec<Arc<dyn FileFilter>>) -> Self {
Self { filters }
}
}
impl Display for AndFileFilter {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "and([")?;
for (i, sub) in self.filters.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
write!(f, "{sub}")?;
}
write!(f, "])")
}
}
impl FileFilter for AndFileFilter {
fn apply(&self, file: &ParquetFile) -> bool {
self.filters.iter().all(|filter| filter.apply(file))
}
}

View File

@ -1,77 +0,0 @@
use std::{fmt::Display, ops::RangeInclusive};
use data_types::CompactionLevel;
use super::FileFilter;
#[derive(Debug)]
pub struct LevelRangeFileFilter {
range: RangeInclusive<CompactionLevel>,
}
impl LevelRangeFileFilter {
pub fn new(range: RangeInclusive<CompactionLevel>) -> Self {
Self { range }
}
}
impl Display for LevelRangeFileFilter {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"level_range({}..={})",
*self.range.start() as i32,
*self.range.end() as i32
)
}
}
impl FileFilter for LevelRangeFileFilter {
fn apply(&self, file: &data_types::ParquetFile) -> bool {
self.range.contains(&file.compaction_level)
}
}
#[cfg(test)]
mod tests {
use iox_tests::ParquetFileBuilder;
use super::*;
#[test]
fn test_display() {
assert_eq!(
LevelRangeFileFilter::new(
CompactionLevel::Initial..=CompactionLevel::FileNonOverlapped
)
.to_string(),
"level_range(0..=1)"
);
}
#[test]
fn test_apply() {
let filter_a = LevelRangeFileFilter::new(
CompactionLevel::Initial..=CompactionLevel::FileNonOverlapped,
);
let filter_b =
LevelRangeFileFilter::new(CompactionLevel::FileNonOverlapped..=CompactionLevel::Final);
let f0 = ParquetFileBuilder::new(0)
.with_compaction_level(CompactionLevel::Initial)
.build();
let f1 = ParquetFileBuilder::new(0)
.with_compaction_level(CompactionLevel::FileNonOverlapped)
.build();
let f2 = ParquetFileBuilder::new(0)
.with_compaction_level(CompactionLevel::Final)
.build();
assert!(filter_a.apply(&f0));
assert!(filter_a.apply(&f1));
assert!(!filter_a.apply(&f2));
assert!(!filter_b.apply(&f0));
assert!(filter_b.apply(&f1));
assert!(filter_b.apply(&f2));
}
}

View File

@ -1,10 +0,0 @@
use std::fmt::{Debug, Display};
use data_types::ParquetFile;
pub mod and;
pub mod level_range;
pub trait FileFilter: Debug + Display + Send + Sync {
fn apply(&self, file: &ParquetFile) -> bool;
}

View File

@ -1,29 +0,0 @@
use std::fmt::{Debug, Display};
use data_types::{CompactionLevel, ParquetFile, TransitionPartitionId};
pub mod non_overlap_split;
pub mod target_level_split;
pub mod upgrade_split;
pub trait FilesSplit: Debug + Display + Send + Sync {
/// Split provided files into 2 groups of files:
/// (files_to_compact, files_to_keep)
///
/// Only files in files_to_compact are considered for compaction this round
///
/// There will be different split needs:
/// . `[files <= target_level]` and `[files > target_level]`
/// . `[overlapping_files]` and `[non_overlapping_files]`
/// . `[files_to_upgrade]` and `[files_to_compact]`
///
/// Note that for the AllAtOnce version, we do not split anything and compact all files at once.
/// This split is mainly for versions after the naive AllAtOnce. For AllAtOnce, we will
/// create dummy modules that return all files
fn apply(
&self,
files: Vec<ParquetFile>,
target_level: CompactionLevel,
partition: TransitionPartitionId,
) -> (Vec<ParquetFile>, Vec<ParquetFile>);
}

View File

@ -1,592 +0,0 @@
use std::{collections::VecDeque, fmt::Display};
use data_types::{CompactionLevel, ParquetFile, TransitionPartitionId};
use crate::file_group::{split_by_level, FilesTimeRange};
use super::FilesSplit;
#[derive(Debug)]
/// Split files into `[compact_files]` and `[non_overlapping_files]`.
/// To have better and more efficient compaction performance, eligible non-overlapped files
/// should not be compacted.
pub struct NonOverlapSplit {
/// undersized_threshold is the threshold for unnecessarily including & rewriting adjacent
/// small files. This does increase write amplification, so it shouldn't be too high, but
/// it also prevents leaving tiny L1/L2 files that will never be compacted, so it shouldn't
/// be too low.
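/// Example (illustrative numbers): with undersized_threshold = 1 MiB, a
/// 512 KiB target-level file adjacent to the overlapping region is pulled
/// into the compaction, while a 2 MiB neighbour is left alone.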
undersized_threshold: u64,
}
impl NonOverlapSplit {
pub fn new(undersized_threshold: u64) -> Self {
Self {
undersized_threshold,
}
}
}
impl Display for NonOverlapSplit {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "Non-overlapping split for TargetLevel version")
}
}
impl FilesSplit for NonOverlapSplit {
/// Return (`[compact_files]`, `[non_overlapping_files]`) of given files
/// such that after combining all `compact_files` into a new file, the new file will
/// have no overlap with any file in `non_overlapping_files`.
/// The non_overlapping_files must be in the target_level
///
/// Eligible non-overlapping files are files of the target level that do not
/// overlap with the time range of all files in the lower level. All files in the target level
/// are assumed to not overlap with each other and do not need to be checked
///
/// Example:
/// . Input:
/// |--L0.1--| |--L0.2--|
/// |--L1.1--| |--L1.2--| |--L1.3--| |--L1.4--|
///
/// (L1.1, L1.3, L1.4) do not overlap with any L0s but only (L1.1, L1.4) are eligible non-overlapping files.
/// (L1.2, L1.3) must be compacted with L0s to produce the right non-overlapping L1s.
///
/// . Output:
/// . compact_files: [L0.1, L0.2, L1.2, L1.3]
/// . non_overlapping_files: [L1.1, L1.4]
///
/// Algorithm:
/// The non-overlapping files are files from the 2 ends of the target level files that are
/// completely outside the time range of all lower level files
///
/// L0s |--L0.1--| |--L0.2--|
/// ==> L0s' time range: |-------L0's time range --------|
///
/// L1s |--L1.1--| |--L1.2--| |--L1.3--| |--L1.4--|
/// ==> Only L1.1 and L1.4 are completely outside the time range of L0s.
/// So L1.1 and L1.4 are usually not included in compact_files. However, if either of L1.1 or L1.4
/// are small (below undersized_threshold), they will be included in compact_files to avoid leaving
/// tiny L1 files behind. Note that the application of undersized_threshold can only contiguously
/// expand the set of min_time sorted target level files. So if there was a small L1.5 file, we
/// could not skip over a large L1.4 file to include L1.5 in compact_files.
///
fn apply(
&self,
files: Vec<ParquetFile>,
target_level: CompactionLevel,
partition: TransitionPartitionId,
) -> (Vec<ParquetFile>, Vec<ParquetFile>) {
assert_ne!(
target_level,
CompactionLevel::Initial,
"unexpected compaction target_level, should not be L0, partition_id={}",
partition
);
let num_files = files.len();
// Split files into levels
let prev_level = target_level.prev();
let (mut target_level_files, prev_level_files) =
split_by_level(files, target_level, prev_level, partition);
// compute time range of prev_level_files
let prev_level_range = if let Some(r) = FilesTimeRange::try_new(&prev_level_files) {
r
} else {
// No prev_level_files, all target_level_files are non_overlapping_files
return (vec![], target_level_files);
};
// Split target_level_files into 3 parts, those before, during and after prev_level_files.
// Since target level files during the time range of prev_level_files must be compacted,
// they are the start of compact_files.
let mut before: Vec<ParquetFile>;
let mut compact_files: Vec<ParquetFile>;
let mut after: Vec<ParquetFile>;
let mut non_overlapping_files = Vec::with_capacity(num_files);
(before, compact_files, after) =
three_range_split(&mut target_level_files, prev_level_range);
// Closure that checks if a file is under the size threshold and adds it to compact_files
let mut check_undersize_and_add = |file: ParquetFile| -> bool {
let mut under = false;
// Keep the file for compaction if it is under the undersized threshold
if file.file_size_bytes <= self.undersized_threshold as i64 {
under = true;
compact_files.push(file);
} else {
non_overlapping_files.push(file);
}
under
};
// Contiguously add `before` files to the list to compact, so long as they're under the size threshold.
before.sort_by_key(|f| f.min_time);
let mut before = before.into_iter().collect::<VecDeque<_>>();
while let Some(file) = before.pop_back() {
if !check_undersize_and_add(file) {
break;
}
}
// Contiguously add `after` files to the list to compact, so long as they're under the size threshold.
after.sort_by_key(|f| f.min_time);
let mut after = after.into_iter().collect::<VecDeque<_>>();
while let Some(file) = after.pop_front() {
if !check_undersize_and_add(file) {
break;
}
}
compact_files.extend(prev_level_files);
// Add remaining files to non_overlapping_files.
non_overlapping_files.extend(before);
non_overlapping_files.extend(after);
(compact_files, non_overlapping_files)
}
}
// three_range_split splits the files into 3 vectors: before, during, after the specified time range.
pub fn three_range_split(
files: &mut Vec<ParquetFile>,
range: FilesTimeRange,
) -> (Vec<ParquetFile>, Vec<ParquetFile>, Vec<ParquetFile>) {
let num_files = files.len();
let mut before = Vec::with_capacity(num_files);
let mut during = Vec::with_capacity(num_files);
let mut after = Vec::with_capacity(num_files);
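// Draining via pop() consumes `files` back to front, so each output vector
// ends up in reverse input order; callers that care re-sort by min_time
// afterwards (see NonOverlapSplit::apply above).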
while let Some(file) = files.pop() {
if range.contains(&file) {
during.push(file);
} else if range.before(&file) {
before.push(file);
} else {
after.push(file);
}
}
(before, during, after)
}
#[cfg(test)]
mod tests {
use compactor_test_utils::{
create_fake_partition_id, create_l1_files, create_overlapped_files,
create_overlapped_files_2, create_overlapped_files_mix_sizes_1,
create_overlapped_l0_l1_files, create_overlapped_l1_l2_files, format_files,
format_files_split,
};
use super::*;
#[test]
fn test_display() {
assert_eq!(
NonOverlapSplit::new(1024 * 1024).to_string(),
"Non-overlapping split for TargetLevel version"
);
}
#[test]
#[should_panic]
fn test_wrong_target_level() {
let files = create_overlapped_files();
let split = NonOverlapSplit::new(1024 * 1024);
split.apply(files, CompactionLevel::Initial, create_fake_partition_id());
}
#[test]
#[should_panic(
expected = "unexpected compaction level for partition 0, expected CompactionLevel::L1 or CompactionLevel::L0 but got CompactionLevel::L2"
)]
fn test_unexpected_compaction_level_2() {
let files = create_overlapped_files();
let split = NonOverlapSplit::new(1024 * 1024);
// There are L2 files and will panic
split.apply(
files,
CompactionLevel::FileNonOverlapped,
create_fake_partition_id(),
);
}
#[test]
#[should_panic(
expected = "unexpected compaction level for partition 0, expected CompactionLevel::L2 or CompactionLevel::L1 but got CompactionLevel::L0"
)]
fn test_unexpected_compaction_level_0() {
let files = create_overlapped_files();
let split = NonOverlapSplit::new(1024 * 1024);
// There are L0 files and will panic
split.apply(files, CompactionLevel::Final, create_fake_partition_id());
}
#[test]
fn test_apply_empty_files() {
let files = vec![];
let split = NonOverlapSplit::new(1024 * 1024);
let (overlap, non_overlap) = split.apply(
files,
CompactionLevel::FileNonOverlapped,
create_fake_partition_id(),
);
assert_eq!(overlap.len(), 0);
assert_eq!(non_overlap.len(), 0);
}
#[test]
fn test_apply_one_level_empty() {
let files = create_l1_files(1);
let fake_partition_id = create_fake_partition_id();
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L1, all files 1b "
- "L1.13[600,700] 0ns |------L1.13-------|"
- "L1.12[400,500] 0ns |------L1.12-------| "
- "L1.11[250,350] 0ns |------L1.11-------| "
"###
);
let split = NonOverlapSplit::new(0);
// Lower level is empty -> all files will be in non_overlapping_files
let (overlap, non_overlap) = split.apply(
files.clone(),
CompactionLevel::FileNonOverlapped,
fake_partition_id.clone(),
);
assert_eq!(overlap.len(), 0);
assert_eq!(non_overlap.len(), 3);
// target level is empty -> all files will be in compact_files
let (overlap, non_overlap) = split.apply(files, CompactionLevel::Final, fake_partition_id);
assert_eq!(overlap.len(), 3);
assert_eq!(non_overlap.len(), 0);
}
#[test]
fn test_apply_mix_1() {
let files = create_overlapped_l0_l1_files(1);
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0, all files 1b "
- "L0.2[650,750] 180s |---L0.2----| "
- "L0.1[450,620] 120s |--------L0.1---------| "
- "L0.3[800,900] 300s |---L0.3----| "
- "L1, all files 1b "
- "L1.13[600,700] 60s |---L1.13---| "
- "L1.12[400,500] 60s |---L1.12---| "
- "L1.11[250,350] 60s |---L1.11---| "
"###
);
let split = NonOverlapSplit::new(0);
let (overlap, non_overlap) = split.apply(
files,
CompactionLevel::FileNonOverlapped,
create_fake_partition_id(),
);
insta::assert_yaml_snapshot!(
format_files_split("overlap", &overlap, "non_overlap", &non_overlap),
@r###"
---
- overlap
- "L0, all files 1b "
- "L0.2[650,750] 180s |------L0.2------| "
- "L0.1[450,620] 120s |------------L0.1------------| "
- "L0.3[800,900] 300s |------L0.3------|"
- "L1, all files 1b "
- "L1.12[400,500] 60s |-----L1.12------| "
- "L1.13[600,700] 60s |-----L1.13------| "
- non_overlap
- "L1, all files 1b "
- "L1.11[250,350] 60s |-----------------------------------------L1.11------------------------------------------|"
"###
);
}
// |--L2.1--| |--L2.2--|
// |--L1.1--| |--L1.2--| |--L1.3--|
// |--L0.1--| |--L0.2--| |--L0.3--|
#[test]
fn test_apply_mix_2() {
let files = create_overlapped_l1_l2_files(1);
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L1, all files 1b "
- "L1.13[600,700] 0ns |--L1.13---| "
- "L1.12[400,500] 0ns |--L1.12---| "
- "L1.11[250,350] 0ns |--L1.11---| "
- "L2, all files 1b "
- "L2.21[0,100] 0ns |--L2.21---| "
- "L2.22[200,300] 0ns |--L2.22---| "
"###
);
let split = NonOverlapSplit::new(0);
let (overlap, non_overlap) =
split.apply(files, CompactionLevel::Final, create_fake_partition_id());
insta::assert_yaml_snapshot!(
format_files_split("overlap", &overlap, "non_overlap", &non_overlap),
@r###"
---
- overlap
- "L1, all files 1b "
- "L1.13[600,700] 0ns |-----L1.13------|"
- "L1.12[400,500] 0ns |-----L1.12------| "
- "L1.11[250,350] 0ns |-----L1.11------| "
- "L2, all files 1b "
- "L2.22[200,300] 0ns |-----L2.22------| "
- non_overlap
- "L2, all files 1b "
- "L2.21[0,100] 0ns |-----------------------------------------L2.21------------------------------------------|"
"###
);
}
#[test]
fn test_apply_mix_3() {
// Create files with levels and time ranges
// . Input:
// |--L0.1--| |--L0.2--|
// |--L1.1--| |--L1.2--| |--L1.3--| |--L1.4--|
//
// . Output: (overlap, non_overlap) = ( [L0.1, L0.2, L1.2, L1.3] , [L1.1, L1.4] )
let files = create_overlapped_files_2(1);
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0, all files 1b "
- "L0.2[520,550] 0ns |L0.2| "
- "L0.1[250,350] 0ns |---L0.1---| "
- "L1, all files 1b "
- "L1.13[400,500] 0ns |--L1.13---| "
- "L1.12[200,300] 0ns |--L1.12---| "
- "L1.11[0,100] 0ns |--L1.11---| "
- "L1.14[600,700] 0ns |--L1.14---| "
"###
);
let split = NonOverlapSplit::new(0);
let (overlap, non_overlap) = split.apply(
files,
CompactionLevel::FileNonOverlapped,
create_fake_partition_id(),
);
insta::assert_yaml_snapshot!(
format_files_split("overlap", &overlap, "non_overlap", &non_overlap),
@r###"
---
- overlap
- "L0, all files 1b "
- "L0.2[520,550] 0ns |L0.2-| "
- "L0.1[250,350] 0ns |---------L0.1----------| "
- "L1, all files 1b "
- "L1.12[200,300] 0ns |---------L1.12---------| "
- "L1.13[400,500] 0ns |---------L1.13---------| "
- non_overlap
- "L1, all files 1b "
- "L1.11[0,100] 0ns |--L1.11---| "
- "L1.14[600,700] 0ns |--L1.14---| "
"###
);
}
#[test]
fn test_undersized_1() {
// Create files with levels and time ranges, where all L1 files are undersized.
// Input:
// |--L0.1--| |--L0.2--|
// |--L1.1--| |--L1.2--| |--L1.3--| |--L1.4--|
//
// Output: (compact, non_overlap) = ( [L0.1, L0.2, L1.1, L1.2, L1.3, L1.4] , [] )
// Since all the files are below the undersized threshold, they all get compacted, even though some
// don't overlap.
let files = create_overlapped_files_2(10);
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0, all files 10b "
- "L0.2[520,550] 0ns |L0.2| "
- "L0.1[250,350] 0ns |---L0.1---| "
- "L1, all files 10b "
- "L1.13[400,500] 0ns |--L1.13---| "
- "L1.12[200,300] 0ns |--L1.12---| "
- "L1.11[0,100] 0ns |--L1.11---| "
- "L1.14[600,700] 0ns |--L1.14---| "
"###
);
let split = NonOverlapSplit::new(11);
let (compact_files, non_overlap) = split.apply(
files,
CompactionLevel::FileNonOverlapped,
create_fake_partition_id(),
);
insta::assert_yaml_snapshot!(
format_files_split("compact_files", &compact_files, "non_overlap", &non_overlap),
@r###"
---
- compact_files
- "L0, all files 10b "
- "L0.2[520,550] 0ns |L0.2| "
- "L0.1[250,350] 0ns |---L0.1---| "
- "L1, all files 10b "
- "L1.12[200,300] 0ns |--L1.12---| "
- "L1.13[400,500] 0ns |--L1.13---| "
- "L1.11[0,100] 0ns |--L1.11---| "
- "L1.14[600,700] 0ns |--L1.14---| "
- non_overlap
"###
);
}
#[test]
fn test_undersized_2() {
// Create files with levels and time ranges, where some non-overlapping L1 files are undersized.
// . Input:
// |--L0.1--| |-L0.2-|
// |--L1.1--| |--L1.2--| |--L1.3--| |--L1.4--| |--L1.5--| |--L1.6--| |--L1.7--| |--L1.8--|
// undersized: no no yes x x yes no no
// Results:
// L1.4 and L1.5 are compacted regardless of their size because they overlap L0s.
// L1.3 and L1.6 are compacted because they are undersized and contiguous to the overlapping files.
// L1.1, L1.2, L1.7, and L1.8 are not compacted because they are not undersized.
//
let files = create_overlapped_files_mix_sizes_1(10, 20, 30);
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0 "
- "L0.2[820,850] 0ns 10b |L0.2| "
- "L0.1[650,750] 0ns 10b |L0.1| "
- "L1 "
- "L1.13[400,500] 0ns 10b |L1.13| "
- "L1.17[1200,1300] 0ns 30b |L1.17| "
- "L1.12[200,300] 0ns 30b |L1.12| "
- "L1.11[0,100] 0ns 20b |L1.11| "
- "L1.14[600,700] 0ns 20b |L1.14| "
- "L1.15[800,900] 0ns 20b |L1.15| "
- "L1.16[1000,1100] 0ns 10b |L1.16| "
- "L1.18[1400,1500] 0ns 20b |L1.18|"
"###
);
let split = NonOverlapSplit::new(11);
let (compact_files, non_overlap) = split.apply(
files,
CompactionLevel::FileNonOverlapped,
create_fake_partition_id(),
);
insta::assert_yaml_snapshot!(
format_files_split("compact_files", &compact_files, "non_overlap", &non_overlap),
@r###"
---
- compact_files
- "L0 "
- "L0.2[820,850] 0ns 10b |L0.2| "
- "L0.1[650,750] 0ns 10b |---L0.1---| "
- "L1 "
- "L1.15[800,900] 0ns 20b |--L1.15---| "
- "L1.14[600,700] 0ns 20b |--L1.14---| "
- "L1.13[400,500] 0ns 10b |--L1.13---| "
- "L1.16[1000,1100] 0ns 10b |--L1.16---| "
- non_overlap
- "L1 "
- "L1.12[200,300] 0ns 30b |L1.12| "
- "L1.17[1200,1300] 0ns 30b |L1.17| "
- "L1.11[0,100] 0ns 20b |L1.11| "
- "L1.18[1400,1500] 0ns 20b |L1.18|"
"###
);
}
#[test]
fn test_undersized_3() {
// This case is like undersized_2, except that the outermost L1 files (L1.1 & L1.8) are also undersized. But since
// they're separated from the overlapping files by oversized files, they are not compacted.
// Input:
// |--L0.1--| |-L0.2-|
// |--L1.1--| |--L1.2--| |--L1.3--| |--L1.4--| |--L1.5--| |--L1.6--| |--L1.7--| |--L1.8--|
// undersized: yes no yes x x yes no yes
// Results:
// L1.4 and L1.5 are compacted regardless of their size because they overlap.
// L1.3 and L1.6 are compacted because they are undersized and contiguous to the overlapping files.
// L1.2 and L1.7 are not compacted because they are not undersized.
// L1.1 and L1.8 are not compacted because even though they are undersized, they are not contiguous to the
// overlapping files.
//
let files = create_overlapped_files_mix_sizes_1(10, 20, 30);
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0 "
- "L0.2[820,850] 0ns 10b |L0.2| "
- "L0.1[650,750] 0ns 10b |L0.1| "
- "L1 "
- "L1.13[400,500] 0ns 10b |L1.13| "
- "L1.17[1200,1300] 0ns 30b |L1.17| "
- "L1.12[200,300] 0ns 30b |L1.12| "
- "L1.11[0,100] 0ns 20b |L1.11| "
- "L1.14[600,700] 0ns 20b |L1.14| "
- "L1.15[800,900] 0ns 20b |L1.15| "
- "L1.16[1000,1100] 0ns 10b |L1.16| "
- "L1.18[1400,1500] 0ns 20b |L1.18|"
"###
);
let split = NonOverlapSplit::new(21);
let (compact_files, non_overlap) = split.apply(
files,
CompactionLevel::FileNonOverlapped,
create_fake_partition_id(),
);
insta::assert_yaml_snapshot!(
format_files_split("compact_files", &compact_files, "non_overlap", &non_overlap),
@r###"
---
- compact_files
- "L0 "
- "L0.2[820,850] 0ns 10b |L0.2| "
- "L0.1[650,750] 0ns 10b |---L0.1---| "
- "L1 "
- "L1.15[800,900] 0ns 20b |--L1.15---| "
- "L1.14[600,700] 0ns 20b |--L1.14---| "
- "L1.13[400,500] 0ns 10b |--L1.13---| "
- "L1.16[1000,1100] 0ns 10b |--L1.16---| "
- non_overlap
- "L1 "
- "L1.12[200,300] 0ns 30b |L1.12| "
- "L1.17[1200,1300] 0ns 30b |L1.17| "
- "L1.11[0,100] 0ns 20b |L1.11| "
- "L1.18[1400,1500] 0ns 20b |L1.18|"
"###
);
}
}

View File

@ -1,344 +0,0 @@
use std::fmt::Display;
use data_types::{CompactionLevel, ParquetFile, TransitionPartitionId};
use super::FilesSplit;
/// Split given files into 2 groups of files: `[<= target_level]` and `[> target_level]`
#[derive(Debug)]
pub struct TargetLevelSplit {}
impl TargetLevelSplit {
pub fn new() -> Self {
Self {}
}
}
impl Display for TargetLevelSplit {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "Target level split for TargetLevel version")
}
}
impl FilesSplit for TargetLevelSplit {
fn apply(
&self,
files: Vec<ParquetFile>,
target_level: CompactionLevel,
_partition: TransitionPartitionId,
) -> (Vec<ParquetFile>, Vec<ParquetFile>) {
files
.into_iter()
.partition(|f| f.compaction_level <= target_level)
}
}
#[cfg(test)]
mod tests {
use compactor_test_utils::{
create_fake_partition_id, create_l0_files, create_l1_files, create_l2_files,
create_overlapped_files, format_files, format_files_split,
};
use super::*;
#[test]
fn test_display() {
assert_eq!(
TargetLevelSplit::new().to_string(),
"Target level split for TargetLevel version"
);
}
#[test]
fn test_apply_empty_files() {
let files = vec![];
let split = TargetLevelSplit::new();
let (lower, higher) = split.apply(
files,
CompactionLevel::FileNonOverlapped,
create_fake_partition_id(),
);
assert_eq!(lower.len(), 0);
assert_eq!(higher.len(), 0);
}
#[test]
fn test_apply_partial_empty_files_l0() {
let files = create_l0_files(1);
let fake_partition_id = create_fake_partition_id();
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0, all files 1b "
- "L0.2[650,750] 0ns |-------L0.2-------| "
- "L0.1[450,620] 0ns |--------------L0.1--------------| "
- "L0.3[800,900] 0ns |-------L0.3-------|"
"###
);
let split = TargetLevelSplit::new();
let (lower, higher) = split.apply(
files.clone(),
CompactionLevel::Initial,
fake_partition_id.clone(),
);
assert_eq!(lower.len(), 3);
assert_eq!(higher.len(), 0);
let (lower, higher) = split.apply(
files.clone(),
CompactionLevel::FileNonOverlapped,
fake_partition_id.clone(),
);
assert_eq!(lower.len(), 3);
assert_eq!(higher.len(), 0);
let (lower, higher) = split.apply(files, CompactionLevel::Final, fake_partition_id);
assert_eq!(lower.len(), 3);
assert_eq!(higher.len(), 0);
}
#[test]
fn test_apply_partial_empty_files_l1() {
let files = create_l1_files(1);
let fake_partition_id = create_fake_partition_id();
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L1, all files 1b "
- "L1.13[600,700] 0ns |------L1.13-------|"
- "L1.12[400,500] 0ns |------L1.12-------| "
- "L1.11[250,350] 0ns |------L1.11-------| "
"###
);
let split = TargetLevelSplit::new();
let (lower, higher) = split.apply(
files.clone(),
CompactionLevel::Initial,
fake_partition_id.clone(),
);
assert_eq!(lower.len(), 0);
assert_eq!(higher.len(), 3);
let (lower, higher) = split.apply(
files.clone(),
CompactionLevel::FileNonOverlapped,
fake_partition_id.clone(),
);
assert_eq!(lower.len(), 3);
assert_eq!(higher.len(), 0);
//
let (lower, higher) = split.apply(files, CompactionLevel::Final, fake_partition_id);
assert_eq!(lower.len(), 3);
assert_eq!(higher.len(), 0);
}
#[test]
fn test_apply_partial_empty_files_l2() {
let files = create_l2_files();
let fake_partition_id = create_fake_partition_id();
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L2, all files 1b "
- "L2.21[0,100] 0ns |-----------L2.21------------| "
- "L2.22[200,300] 0ns |-----------L2.22------------|"
"###
);
let split = TargetLevelSplit::new();
let (lower, higher) = split.apply(
files.clone(),
CompactionLevel::Initial,
fake_partition_id.clone(),
);
assert_eq!(lower.len(), 0);
assert_eq!(higher.len(), 2);
let (lower, higher) = split.apply(
files.clone(),
CompactionLevel::FileNonOverlapped,
fake_partition_id.clone(),
);
assert_eq!(lower.len(), 0);
assert_eq!(higher.len(), 2);
let (lower, higher) = split.apply(files, CompactionLevel::Final, fake_partition_id);
assert_eq!(lower.len(), 2);
assert_eq!(higher.len(), 0);
}
#[test]
fn test_apply_target_level_0() {
// Test target level Initial
let files = create_overlapped_files();
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0 "
- "L0.2[650,750] 0ns 1b |--L0.2--| "
- "L0.1[450,620] 0ns 1b |-----L0.1------| "
- "L0.3[800,900] 0ns 100b |--L0.3--|"
- "L1 "
- "L1.13[600,700] 0ns 100b |-L1.13--| "
- "L1.12[400,500] 0ns 1b |-L1.12--| "
- "L1.11[250,350] 0ns 1b |-L1.11--| "
- "L2 "
- "L2.21[0,100] 0ns 1b |-L2.21--| "
- "L2.22[200,300] 0ns 1b |-L2.22--| "
"###
);
let split = TargetLevelSplit::new();
let (lower, higher) =
split.apply(files, CompactionLevel::Initial, create_fake_partition_id());
insta::assert_yaml_snapshot!(
format_files_split("lower", &lower, "higher", &higher),
@r###"
---
- lower
- "L0 "
- "L0.2[650,750] 0ns 1b |-------L0.2-------| "
- "L0.1[450,620] 0ns 1b |--------------L0.1--------------| "
- "L0.3[800,900] 0ns 100b |-------L0.3-------|"
- higher
- "L1 "
- "L1.13[600,700] 0ns 100b |--L1.13---| "
- "L1.12[400,500] 0ns 1b |--L1.12---| "
- "L1.11[250,350] 0ns 1b |--L1.11---| "
- "L2 "
- "L2.21[0,100] 0ns 1b |--L2.21---| "
- "L2.22[200,300] 0ns 1b |--L2.22---| "
"###
);
// verify number of files
assert_eq!(lower.len(), 3);
assert_eq!(higher.len(), 5);
// verify compaction level of files
assert!(lower
.iter()
.all(|f| f.compaction_level == CompactionLevel::Initial));
assert!(higher
.iter()
.all(|f| f.compaction_level == CompactionLevel::FileNonOverlapped
|| f.compaction_level == CompactionLevel::Final));
}
#[test]
fn test_apply_target_level_l1() {
// Test target level is FileNonOverlapped
let files = create_overlapped_files();
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0 "
- "L0.2[650,750] 0ns 1b |--L0.2--| "
- "L0.1[450,620] 0ns 1b |-----L0.1------| "
- "L0.3[800,900] 0ns 100b |--L0.3--|"
- "L1 "
- "L1.13[600,700] 0ns 100b |-L1.13--| "
- "L1.12[400,500] 0ns 1b |-L1.12--| "
- "L1.11[250,350] 0ns 1b |-L1.11--| "
- "L2 "
- "L2.21[0,100] 0ns 1b |-L2.21--| "
- "L2.22[200,300] 0ns 1b |-L2.22--| "
"###
);
let split = TargetLevelSplit::new();
let (lower, higher) = split.apply(
files,
CompactionLevel::FileNonOverlapped,
create_fake_partition_id(),
);
insta::assert_yaml_snapshot!(
format_files_split("lower", &lower, "higher", &higher),
@r###"
---
- lower
- "L0 "
- "L0.2[650,750] 0ns 1b |---L0.2----| "
- "L0.1[450,620] 0ns 1b |--------L0.1---------| "
- "L0.3[800,900] 0ns 100b |---L0.3----| "
- "L1 "
- "L1.13[600,700] 0ns 100b |---L1.13---| "
- "L1.12[400,500] 0ns 1b |---L1.12---| "
- "L1.11[250,350] 0ns 1b |---L1.11---| "
- higher
- "L2, all files 1b "
- "L2.21[0,100] 0ns |-----------L2.21------------| "
- "L2.22[200,300] 0ns |-----------L2.22------------|"
"###
);
// verify number of files
assert_eq!(lower.len(), 6);
assert_eq!(higher.len(), 2);
// verify compaction level of files
assert!(lower
.iter()
.all(|f| f.compaction_level == CompactionLevel::Initial
|| f.compaction_level == CompactionLevel::FileNonOverlapped));
assert!(higher
.iter()
.all(|f| f.compaction_level == CompactionLevel::Final));
}
#[test]
    fn test_apply_target_level_l2() {
// Test target level is Final
let files = create_overlapped_files();
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0 "
- "L0.2[650,750] 0ns 1b |--L0.2--| "
- "L0.1[450,620] 0ns 1b |-----L0.1------| "
- "L0.3[800,900] 0ns 100b |--L0.3--|"
- "L1 "
- "L1.13[600,700] 0ns 100b |-L1.13--| "
- "L1.12[400,500] 0ns 1b |-L1.12--| "
- "L1.11[250,350] 0ns 1b |-L1.11--| "
- "L2 "
- "L2.21[0,100] 0ns 1b |-L2.21--| "
- "L2.22[200,300] 0ns 1b |-L2.22--| "
"###
);
let split = TargetLevelSplit::new();
let (lower, higher) =
split.apply(files, CompactionLevel::Final, create_fake_partition_id());
// verify number of files (nothing in higher)
assert_eq!(lower.len(), 8);
assert_eq!(higher.len(), 0);
// verify compaction level of files
assert!(lower
.iter()
.all(|f| f.compaction_level == CompactionLevel::Initial
|| f.compaction_level == CompactionLevel::FileNonOverlapped
|| f.compaction_level == CompactionLevel::Final));
}
}
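Since the split itself is a single stable `partition` on compaction level, the behaviour the tests above verify can be reproduced with a minimal standalone sketch. The simplified `Level` enum and tuple files here are illustrative stand-ins, not the crate's types:

#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
enum Level {
    L0,
    L1,
    L2,
}

// Stable partition: levels <= target on the left, higher levels on the right.
fn target_level_split(
    files: Vec<(u32, Level)>,
    target: Level,
) -> (Vec<(u32, Level)>, Vec<(u32, Level)>) {
    files.into_iter().partition(|(_, level)| *level <= target)
}

fn main() {
    let files = vec![(1, Level::L0), (11, Level::L1), (21, Level::L2)];
    let (lower, higher) = target_level_split(files, Level::L1);
    assert_eq!(lower, vec![(1, Level::L0), (11, Level::L1)]);
    assert_eq!(higher, vec![(21, Level::L2)]);
}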


@@ -1,909 +0,0 @@
use std::fmt::Display;
use data_types::{CompactionLevel, ParquetFile, TransitionPartitionId};
use super::FilesSplit;
use crate::file_group::{overlaps_in_time, split_by_level, FilesTimeRange};
#[derive(Debug)]
/// Split files into `[files_to_compact]` and `[files_to_upgrade]`
/// To have better and efficient compaction performance, eligible upgradable files
/// should not be compacted but only need to update its compaction_level to the target_level
pub struct UpgradeSplit {
// Maximum desired file size (try and avoid compacting files above this size)
max_desired_file_size_bytes: u64,
}
impl UpgradeSplit {
pub fn new(size: u64) -> Self {
Self {
max_desired_file_size_bytes: size,
}
}
}
impl Display for UpgradeSplit {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"Upgrade split for TargetLevel version - Size: {}",
self.max_desired_file_size_bytes
)
}
}
impl FilesSplit for UpgradeSplit {
/// Return (`[files_to_compact]`, `[files_to_upgrade]`) of the given files
    /// so that `files_to_upgrade` does not overlap with any files in the next level
///
    /// The files_to_upgrade must be in level (target_level - 1)
///
    /// Eligible upgradable files are large enough (>= max_desired_file_size/2) files in the level just
    /// below the target level that do not overlap in time range with any files in their own level or in
    /// higher levels. Note: we always have to stick to the invariant that the output files must not overlap
///
/// Example:
/// |--L0.1--| |--L0.2--| |--L0.3--|
/// |--L0.4--| |--L0.5--| |--L0.6--|
/// |--L1.1--| |--L1.2--|
///
    /// . There are 4 L0 files that do not overlap with any L1s or other L0s: [L0.1, L0.3, L0.5, L0.6]. However,
    ///   file L0.3 lies between L0.2 and L0.4 and is not eligible.
    /// . Even if L0.5 is large enough, it sits between L0.6 and the rest. L0.5 is only
    ///   eligible to upgrade if L0.6 is eligible (large enough), too.
///
/// Algorithm:
    /// The upgradable files are the files of level (target_level - 1) that:
    ///  1. Have size >= max_desired_file_size/2
    ///  2. Are completely outside the time range of all higher-level files
    ///  3. Do not overlap with any files in the same level
    ///  4. Do not overlap with the time range of the files that fail the 3 conditions above
    ///     (this covers the case where L0.5 is large but L0.6 is small)
///
fn apply(
&self,
files: Vec<ParquetFile>,
target_level: CompactionLevel,
partition: TransitionPartitionId,
) -> (Vec<ParquetFile>, Vec<ParquetFile>) {
assert_ne!(
target_level,
CompactionLevel::Initial,
"unexpected compaction target_level, should not be L0, partition_id={}",
partition
);
let mut files_to_upgrade = Vec::with_capacity(files.len());
let mut files_to_compact = Vec::with_capacity(files.len());
// Split files into levels
let prev_level = target_level.prev();
let (target_level_files, mut prev_level_files) =
split_by_level(files, target_level, prev_level, partition);
// compute time range of target_level_files, if any
let target_time_range = FilesTimeRange::try_new(&target_level_files);
        // Go over all files of the previous level and check if they are NOT eligible to upgrade
        // by hitting one of these conditions:
        // . Size < max_desired_file_size/2
        // . Overlap with the time range of target_level_files
        // . Overlap with any files in the same level
        // Otherwise, they are large and do not overlap. Put them in the potential upgradable list
        // to check whether they are actually upgradable or not. If they are in the middle of the
        // non-eligible files above, they are not upgradable.
let mut potential_upgradable_files = Vec::with_capacity(prev_level_files.len());
while let Some(file) = prev_level_files.pop() {
// size is small
if file.file_size_bytes < self.max_desired_file_size_bytes as i64 / 2 {
files_to_compact.push(file);
} else if let Some(target_time_range) = target_time_range {
// overlap with target_level_files
if target_time_range.contains(&file) ||
// overlap with files in the same level
overlaps_in_time(&file, &prev_level_files) ||
overlaps_in_time(&file, &files_to_compact)
{
files_to_compact.push(file);
} else {
potential_upgradable_files.push(file);
}
} else if prev_level_files.iter().any(|f| f.overlaps(&file))
|| files_to_compact.iter().any(|f| f.overlaps(&file))
{
// overlap with files in the same level
files_to_compact.push(file);
} else {
potential_upgradable_files.push(file);
}
}
// Add target_level_files to files_to_compact
files_to_compact.extend(target_level_files);
        // Compute the time range of files_to_compact again, to check whether the potential
        // upgradable files are actually upgradable
let to_compact_time_range = FilesTimeRange::try_new(&files_to_compact);
// Go over all potential upgradable files and check if they are actually upgradable
// by not overlapping with the min_max_range of files_to_compact
while let Some(file) = potential_upgradable_files.pop() {
if let Some(to_compact_time_range) = to_compact_time_range {
if !to_compact_time_range.contains(&file) {
files_to_upgrade.push(file);
} else {
files_to_compact.push(file);
}
} else {
files_to_upgrade.push(file);
}
}
(files_to_compact, files_to_upgrade)
}
}
#[cfg(test)]
mod tests {
use compactor_test_utils::{
create_fake_partition_id, create_l0_files, create_l1_files, create_l1_files_mix_size,
create_overlapped_files, create_overlapped_files_2, create_overlapped_files_3,
create_overlapped_files_3_mix_size, create_overlapped_l0_l1_files,
create_overlapped_l1_l2_files, create_overlapped_l1_l2_files_mix_size,
create_overlapped_l1_l2_files_mix_size_2, create_overlapping_l0_files, format_files,
format_files_split,
};
use super::*;
use data_types::CompactionLevel;
const MAX_SIZE: u64 = 100;
#[test]
fn test_display() {
assert_eq!(
UpgradeSplit::new(MAX_SIZE).to_string(),
"Upgrade split for TargetLevel version - Size: 100"
);
}
#[test]
#[should_panic]
fn test_wrong_target_level() {
let split = UpgradeSplit::new(MAX_SIZE);
let (_files_to_compact, _files_to_upgrade) =
split.apply(vec![], CompactionLevel::Initial, create_fake_partition_id());
}
#[test]
#[should_panic(
expected = "unexpected compaction level for partition 0, expected CompactionLevel::L1 or CompactionLevel::L0 but got CompactionLevel::L2"
)]
fn test_unexpected_compaction_level_2() {
let files = create_overlapped_files();
let split = UpgradeSplit::new(MAX_SIZE);
// There are L2 files and will panic
split.apply(
files,
CompactionLevel::FileNonOverlapped,
create_fake_partition_id(),
);
}
#[test]
#[should_panic(
expected = "unexpected compaction level for partition 0, expected CompactionLevel::L2 or CompactionLevel::L1 but got CompactionLevel::L0"
)]
fn test_unexpected_compaction_level_0() {
let files = create_overlapped_files();
let split = UpgradeSplit::new(MAX_SIZE);
// There are L0 files and will panic
split.apply(files, CompactionLevel::Final, create_fake_partition_id());
}
#[test]
fn test_apply_empty_files() {
let split = UpgradeSplit::new(MAX_SIZE);
let (files_to_compact, files_to_upgrade) = split.apply(
vec![],
CompactionLevel::FileNonOverlapped,
create_fake_partition_id(),
);
assert_eq!((files_to_compact, files_to_upgrade), (vec![], vec![]));
let (files_to_compact, files_to_upgrade) =
split.apply(vec![], CompactionLevel::Final, create_fake_partition_id());
assert_eq!((files_to_compact, files_to_upgrade), (vec![], vec![]));
}
#[test]
fn test_apply_one_level_overlap_small_l0() {
let files = create_overlapping_l0_files((MAX_SIZE / 2 - 1) as i64);
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0, all files 49b "
- "L0.2[150,180] 0ns |L0.2| "
- "L0.1[100,200] 0ns |--L0.1---| "
- "L0.3[800,900] 0ns |--L0.3---| "
"###
);
let split = UpgradeSplit::new(MAX_SIZE);
let (files_to_compact, files_to_upgrade) = split.apply(
files,
CompactionLevel::FileNonOverlapped,
create_fake_partition_id(),
);
// All files are small --> nothing to upgrade
insta::assert_yaml_snapshot!(
format_files_split("files_to_compact", &files_to_compact, "files_to_upgrade", &files_to_upgrade),
@r###"
---
- files_to_compact
- "L0, all files 49b "
- "L0.3[800,900] 0ns |--L0.3---| "
- "L0.1[100,200] 0ns |--L0.1---| "
- "L0.2[150,180] 0ns |L0.2| "
- files_to_upgrade
"###
);
}
#[test]
fn test_apply_one_level_overlap_large_l0() {
let files = create_overlapping_l0_files((MAX_SIZE + 1) as i64);
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0, all files 101b "
- "L0.2[150,180] 0ns |L0.2| "
- "L0.1[100,200] 0ns |--L0.1---| "
- "L0.3[800,900] 0ns |--L0.3---| "
"###
);
let split = UpgradeSplit::new(MAX_SIZE);
let (files_to_compact, files_to_upgrade) = split.apply(
files,
CompactionLevel::FileNonOverlapped,
create_fake_partition_id(),
);
// All files are large but only one eligible for upgrade
// files_to_compact = [L0.1, L0.2]
// files_to_upgrade = [L0.3]
insta::assert_yaml_snapshot!(
format_files_split("files_to_compact", &files_to_compact, "files_to_upgrade", &files_to_upgrade),
@r###"
---
- files_to_compact
- "L0, all files 101b "
- "L0.1[100,200] 0ns |------------------------------------------L0.1------------------------------------------|"
- "L0.2[150,180] 0ns |----------L0.2-----------| "
- files_to_upgrade
- "L0, all files 101b "
- "L0.3[800,900] 0ns |------------------------------------------L0.3------------------------------------------|"
"###
);
}
#[test]
fn test_apply_one_level_small_l0() {
let files = create_l0_files((MAX_SIZE / 2 - 1) as i64);
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0, all files 49b "
- "L0.2[650,750] 0ns |-------L0.2-------| "
- "L0.1[450,620] 0ns |--------------L0.1--------------| "
- "L0.3[800,900] 0ns |-------L0.3-------|"
"###
);
let split = UpgradeSplit::new(MAX_SIZE);
let (files_to_compact, files_to_upgrade) = split.apply(
files,
CompactionLevel::FileNonOverlapped,
create_fake_partition_id(),
);
// All files are small --> nothing to upgrade
insta::assert_yaml_snapshot!(
format_files_split("files_to_compact", &files_to_compact, "files_to_upgrade", &files_to_upgrade),
@r###"
---
- files_to_compact
- "L0, all files 49b "
- "L0.3[800,900] 0ns |-------L0.3-------|"
- "L0.1[450,620] 0ns |--------------L0.1--------------| "
- "L0.2[650,750] 0ns |-------L0.2-------| "
- files_to_upgrade
"###
);
}
#[test]
fn test_apply_one_level_large_l0() {
let files = create_l0_files((MAX_SIZE + 1) as i64);
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0, all files 101b "
- "L0.2[650,750] 0ns |-------L0.2-------| "
- "L0.1[450,620] 0ns |--------------L0.1--------------| "
- "L0.3[800,900] 0ns |-------L0.3-------|"
"###
);
let split = UpgradeSplit::new(MAX_SIZE);
let (files_to_compact, files_to_upgrade) = split.apply(
files,
CompactionLevel::FileNonOverlapped,
create_fake_partition_id(),
);
// All files are large and eligible for upgrade
insta::assert_yaml_snapshot!(
format_files_split("files_to_compact", &files_to_compact, "files_to_upgrade", &files_to_upgrade),
@r###"
---
- files_to_compact
- files_to_upgrade
- "L0, all files 101b "
- "L0.2[650,750] 0ns |-------L0.2-------| "
- "L0.1[450,620] 0ns |--------------L0.1--------------| "
- "L0.3[800,900] 0ns |-------L0.3-------|"
"###
);
}
#[test]
fn test_apply_one_level_small_l1() {
let files = create_l1_files((MAX_SIZE - 1) as i64);
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L1, all files 99b "
- "L1.13[600,700] 0ns |------L1.13-------|"
- "L1.12[400,500] 0ns |------L1.12-------| "
- "L1.11[250,350] 0ns |------L1.11-------| "
"###
);
let split = UpgradeSplit::new(MAX_SIZE);
let (files_to_compact, files_to_upgrade) = split.apply(
files,
CompactionLevel::FileNonOverlapped,
create_fake_partition_id(),
);
// All files are small --> nothing to upgrade
insta::assert_yaml_snapshot!(
format_files_split("files_to_compact", &files_to_compact, "files_to_upgrade", &files_to_upgrade),
@r###"
---
- files_to_compact
- "L1, all files 99b "
- "L1.13[600,700] 0ns |------L1.13-------|"
- "L1.12[400,500] 0ns |------L1.12-------| "
- "L1.11[250,350] 0ns |------L1.11-------| "
- files_to_upgrade
"###
);
}
#[test]
fn test_apply_one_level_large_l1() {
let files = create_l1_files((MAX_SIZE + 1) as i64);
let split = UpgradeSplit::new(MAX_SIZE);
let (files_to_compact, files_to_upgrade) =
split.apply(files, CompactionLevel::Final, create_fake_partition_id());
// All files are large and eligible for upgrade
insta::assert_yaml_snapshot!(
format_files_split("files_to_compact", &files_to_compact, "files_to_upgrade", &files_to_upgrade),
@r###"
---
- files_to_compact
- files_to_upgrade
- "L1, all files 101b "
- "L1.13[600,700] 0ns |------L1.13-------|"
- "L1.12[400,500] 0ns |------L1.12-------| "
- "L1.11[250,350] 0ns |------L1.11-------| "
"###
);
}
#[test]
fn test_apply_one_level_l1_mix_size() {
let files = create_l1_files_mix_size((MAX_SIZE / 2) as i64);
        // . Small files (< size): L1.1, L1.3
        // . Large files (>= size): L1.2, L1.4, L1.5
//
// . files_to_compact = [L1.1, L1.2, L1.3]
// . files_to_upgrade = [L1.4, L1.5]
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L1 "
- "L1.15[1000,1100] 0ns 150b |-L1.15--| "
- "L1.13[600,700] 0ns 40b |-L1.13--| "
- "L1.12[400,500] 0ns 51b |-L1.12--| "
- "L1.11[250,350] 0ns 49b |-L1.11--| "
- "L1.14[800,900] 0ns 50b |-L1.14--| "
"###
);
let split = UpgradeSplit::new(MAX_SIZE);
let (files_to_compact, files_to_upgrade) =
split.apply(files, CompactionLevel::Final, create_fake_partition_id());
// Some files are large and eligible for upgrade
insta::assert_yaml_snapshot!(
format_files_split("files_to_compact", &files_to_compact, "files_to_upgrade", &files_to_upgrade),
@r###"
---
- files_to_compact
- "L1 "
- "L1.11[250,350] 0ns 49b |------L1.11-------| "
- "L1.13[600,700] 0ns 40b |------L1.13-------|"
- "L1.12[400,500] 0ns 51b |------L1.12-------| "
- files_to_upgrade
- "L1 "
- "L1.15[1000,1100] 0ns 150b |-----------L1.15------------|"
- "L1.14[800,900] 0ns 50b |-----------L1.14------------| "
"###
);
}
#[test]
fn test_apply_all_small_target_l1() {
let files = create_overlapped_l0_l1_files((MAX_SIZE / 2 - 1) as i64);
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0, all files 49b "
- "L0.2[650,750] 180s |---L0.2----| "
- "L0.1[450,620] 120s |--------L0.1---------| "
- "L0.3[800,900] 300s |---L0.3----| "
- "L1, all files 49b "
- "L1.13[600,700] 60s |---L1.13---| "
- "L1.12[400,500] 60s |---L1.12---| "
- "L1.11[250,350] 60s |---L1.11---| "
"###
);
let split = UpgradeSplit::new(MAX_SIZE);
let (files_to_compact, files_to_upgrade) = split.apply(
files,
CompactionLevel::FileNonOverlapped,
create_fake_partition_id(),
);
// All files are small --> nothing to upgrade
insta::assert_yaml_snapshot!(
format_files_split("files_to_compact", &files_to_compact, "files_to_upgrade", &files_to_upgrade),
@r###"
---
- files_to_compact
- "L0, all files 49b "
- "L0.3[800,900] 300s |---L0.3----| "
- "L0.1[450,620] 120s |--------L0.1---------| "
- "L0.2[650,750] 180s |---L0.2----| "
- "L1, all files 49b "
- "L1.13[600,700] 60s |---L1.13---| "
- "L1.12[400,500] 60s |---L1.12---| "
- "L1.11[250,350] 60s |---L1.11---| "
- files_to_upgrade
"###
);
}
#[test]
fn test_apply_all_large_target_l1() {
let files = create_overlapped_l0_l1_files((MAX_SIZE) as i64);
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0, all files 100b "
- "L0.2[650,750] 180s |---L0.2----| "
- "L0.1[450,620] 120s |--------L0.1---------| "
- "L0.3[800,900] 300s |---L0.3----| "
- "L1, all files 100b "
- "L1.13[600,700] 60s |---L1.13---| "
- "L1.12[400,500] 60s |---L1.12---| "
- "L1.11[250,350] 60s |---L1.11---| "
"###
);
let split = UpgradeSplit::new(MAX_SIZE);
let (files_to_compact, files_to_upgrade) = split.apply(
files,
CompactionLevel::FileNonOverlapped,
create_fake_partition_id(),
);
// All files are large --> L0.3 is eligible for upgrade
insta::assert_yaml_snapshot!(
format_files_split("files_to_compact", &files_to_compact, "files_to_upgrade", &files_to_upgrade),
@r###"
---
- files_to_compact
- "L0, all files 100b "
- "L0.1[450,620] 120s |------------L0.1------------| "
- "L0.2[650,750] 180s |------L0.2------|"
- "L1, all files 100b "
- "L1.13[600,700] 60s |-----L1.13------| "
- "L1.12[400,500] 60s |-----L1.12------| "
- "L1.11[250,350] 60s |-----L1.11------| "
- files_to_upgrade
- "L0, all files 100b "
- "L0.3[800,900] 300s |------------------------------------------L0.3------------------------------------------|"
"###
);
}
#[test]
fn test_apply_all_small_target_l2() {
let files = create_overlapped_l1_l2_files((MAX_SIZE / 2 - 1) as i64);
let split = UpgradeSplit::new(MAX_SIZE);
let (files_to_compact, files_to_upgrade) =
split.apply(files, CompactionLevel::Final, create_fake_partition_id());
// All files are small --> nothing to upgrade
insta::assert_yaml_snapshot!(
format_files_split("files_to_compact", &files_to_compact, "files_to_upgrade", &files_to_upgrade),
@r###"
---
- files_to_compact
- "L1, all files 49b "
- "L1.11[250,350] 0ns |--L1.11---| "
- "L1.12[400,500] 0ns |--L1.12---| "
- "L1.13[600,700] 0ns |--L1.13---| "
- "L2, all files 49b "
- "L2.21[0,100] 0ns |--L2.21---| "
- "L2.22[200,300] 0ns |--L2.22---| "
- files_to_upgrade
"###
);
}
#[test]
fn test_apply_all_large_target_l2() {
let files = create_overlapped_l1_l2_files(MAX_SIZE as i64);
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L1, all files 100b "
- "L1.13[600,700] 0ns |--L1.13---| "
- "L1.12[400,500] 0ns |--L1.12---| "
- "L1.11[250,350] 0ns |--L1.11---| "
- "L2, all files 100b "
- "L2.21[0,100] 0ns |--L2.21---| "
- "L2.22[200,300] 0ns |--L2.22---| "
"###
);
let split = UpgradeSplit::new(MAX_SIZE);
let (files_to_compact, files_to_upgrade) =
split.apply(files, CompactionLevel::Final, create_fake_partition_id());
// All files are large --> L1.2 and L1.3 are eligible for upgrade
insta::assert_yaml_snapshot!(
format_files_split("files_to_compact", &files_to_compact, "files_to_upgrade", &files_to_upgrade),
@r###"
---
- files_to_compact
- "L1, all files 100b "
- "L1.11[250,350] 0ns |---------L1.11---------| "
- "L2, all files 100b "
- "L2.21[0,100] 0ns |---------L2.21---------| "
- "L2.22[200,300] 0ns |---------L2.22---------| "
- files_to_upgrade
- "L1, all files 100b "
- "L1.13[600,700] 0ns |-----------L1.13------------|"
- "L1.12[400,500] 0ns |-----------L1.12------------| "
"###
);
}
#[test]
fn test_apply_all_small_target_l2_mix_size() {
let files = create_overlapped_l1_l2_files_mix_size((MAX_SIZE / 2) as i64);
// Small files (< size): [L1.3]
// Large files: [L2.1, L2.2, L1.1, L1.2]
// ==> nothing to upgrade
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L1 "
- "L1.13[600,700] 0ns 49b |--L1.13---| "
- "L1.12[400,500] 0ns 50b |--L1.12---| "
- "L1.11[250,350] 0ns 50b |--L1.11---| "
- "L2 "
- "L2.21[0,100] 0ns 50b |--L2.21---| "
- "L2.22[200,300] 0ns 50b |--L2.22---| "
"###
);
let split = UpgradeSplit::new(MAX_SIZE);
let (files_to_compact, files_to_upgrade) =
split.apply(files, CompactionLevel::Final, create_fake_partition_id());
insta::assert_yaml_snapshot!(
format_files_split("files_to_compact", &files_to_compact, "files_to_upgrade", &files_to_upgrade),
@r###"
---
- files_to_compact
- "L1 "
- "L1.11[250,350] 0ns 50b |--L1.11---| "
- "L1.13[600,700] 0ns 49b |--L1.13---| "
- "L1.12[400,500] 0ns 50b |--L1.12---| "
- "L2 "
- "L2.21[0,100] 0ns 50b |--L2.21---| "
- "L2.22[200,300] 0ns 50b |--L2.22---| "
- files_to_upgrade
"###
);
}
#[test]
fn test_apply_all_small_target_l2_mix_size_2() {
let files = create_overlapped_l1_l2_files_mix_size_2((MAX_SIZE / 2) as i64);
// Small files (< size): [L1.2]
// Large files: [L2.1, L2.2, L1.1, L1.3]
// ==> L1.3 is eligible for upgrade
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L1 "
- "L1.13[600,700] 0ns 50b |--L1.13---| "
- "L1.12[400,500] 0ns 49b |--L1.12---| "
- "L1.11[250,350] 0ns 50b |--L1.11---| "
- "L2 "
- "L2.21[0,100] 0ns 50b |--L2.21---| "
- "L2.22[200,300] 0ns 50b |--L2.22---| "
"###
);
let split = UpgradeSplit::new(MAX_SIZE);
let (files_to_compact, files_to_upgrade) =
split.apply(files, CompactionLevel::Final, create_fake_partition_id());
insta::assert_yaml_snapshot!(
format_files_split("files_to_compact", &files_to_compact, "files_to_upgrade", &files_to_upgrade),
@r###"
---
- files_to_compact
- "L1 "
- "L1.11[250,350] 0ns 50b |-----L1.11------| "
- "L1.12[400,500] 0ns 49b |-----L1.12------|"
- "L2 "
- "L2.21[0,100] 0ns 50b |-----L2.21------| "
- "L2.22[200,300] 0ns 50b |-----L2.22------| "
- files_to_upgrade
- "L1, all files 50b "
- "L1.13[600,700] 0ns |-----------------------------------------L1.13------------------------------------------|"
"###
);
}
#[test]
    fn test_apply_all_large_but_no_upgrade() {
let files = create_overlapped_files_2(MAX_SIZE as i64);
// L0s in the time range of L1 ==> nothing to upgrade
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0, all files 100b "
- "L0.2[520,550] 0ns |L0.2| "
- "L0.1[250,350] 0ns |---L0.1---| "
- "L1, all files 100b "
- "L1.13[400,500] 0ns |--L1.13---| "
- "L1.12[200,300] 0ns |--L1.12---| "
- "L1.11[0,100] 0ns |--L1.11---| "
- "L1.14[600,700] 0ns |--L1.14---| "
"###
);
let split = UpgradeSplit::new(MAX_SIZE);
let (files_to_compact, files_to_upgrade) = split.apply(
files,
CompactionLevel::FileNonOverlapped,
create_fake_partition_id(),
);
insta::assert_yaml_snapshot!(
format_files_split("files_to_compact", &files_to_compact, "files_to_upgrade", &files_to_upgrade),
@r###"
---
- files_to_compact
- "L0, all files 100b "
- "L0.1[250,350] 0ns |---L0.1---| "
- "L0.2[520,550] 0ns |L0.2| "
- "L1, all files 100b "
- "L1.13[400,500] 0ns |--L1.13---| "
- "L1.12[200,300] 0ns |--L1.12---| "
- "L1.11[0,100] 0ns |--L1.11---| "
- "L1.14[600,700] 0ns |--L1.14---| "
- files_to_upgrade
"###
);
}
#[test]
fn test_apply_all_small_target_l1_2() {
let files = create_overlapped_files_3((MAX_SIZE / 2 - 1) as i64);
// All small ==> nothing to upgrade
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0, all files 49b "
- "L0.3[400,500] 0ns |-L0.3-| "
- "L0.2[200,300] 0ns |-L0.2-| "
- "L0.1[0,100] 0ns |-L0.1-| "
- "L0.4[600,700] 0ns |-L0.4-| "
- "L0.5[800,900] 0ns |-L0.5-| "
- "L0.6[1000,1100] 0ns |-L0.6-| "
- "L1, all files 49b "
- "L1.11[250,350] 0ns |L1.11-| "
- "L1.12[650,750] 0ns |L1.12-| "
"###
);
let split = UpgradeSplit::new(MAX_SIZE);
let (files_to_compact, files_to_upgrade) = split.apply(
files,
CompactionLevel::FileNonOverlapped,
create_fake_partition_id(),
);
insta::assert_yaml_snapshot!(
format_files_split("files_to_compact", &files_to_compact, "files_to_upgrade", &files_to_upgrade),
@r###"
---
- files_to_compact
- "L0, all files 49b "
- "L0.6[1000,1100] 0ns |-L0.6-| "
- "L0.5[800,900] 0ns |-L0.5-| "
- "L0.4[600,700] 0ns |-L0.4-| "
- "L0.1[0,100] 0ns |-L0.1-| "
- "L0.2[200,300] 0ns |-L0.2-| "
- "L0.3[400,500] 0ns |-L0.3-| "
- "L1, all files 49b "
- "L1.11[250,350] 0ns |L1.11-| "
- "L1.12[650,750] 0ns |L1.12-| "
- files_to_upgrade
"###
);
}
#[test]
fn test_apply_all_large_target_l1_2() {
let files = create_overlapped_files_3((MAX_SIZE + 10) as i64);
// All large ==> L0.1, L0.5, L0.6 are eligible for upgrade
// files_to_compact: [L0.2, L0.3, L0.4, L1.1, L1.2]
// files_to_upgrade: [L0.1, L0.5, L0.6]
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0, all files 110b "
- "L0.3[400,500] 0ns |-L0.3-| "
- "L0.2[200,300] 0ns |-L0.2-| "
- "L0.1[0,100] 0ns |-L0.1-| "
- "L0.4[600,700] 0ns |-L0.4-| "
- "L0.5[800,900] 0ns |-L0.5-| "
- "L0.6[1000,1100] 0ns |-L0.6-| "
- "L1, all files 110b "
- "L1.11[250,350] 0ns |L1.11-| "
- "L1.12[650,750] 0ns |L1.12-| "
"###
);
let split = UpgradeSplit::new(MAX_SIZE);
let (files_to_compact, files_to_upgrade) = split.apply(
files,
CompactionLevel::FileNonOverlapped,
create_fake_partition_id(),
);
insta::assert_yaml_snapshot!(
format_files_split("files_to_compact", &files_to_compact, "files_to_upgrade", &files_to_upgrade),
@r###"
---
- files_to_compact
- "L0, all files 110b "
- "L0.4[600,700] 0ns |-----L0.4-----| "
- "L0.2[200,300] 0ns |-----L0.2-----| "
- "L0.3[400,500] 0ns |-----L0.3-----| "
- "L1, all files 110b "
- "L1.11[250,350] 0ns |----L1.11-----| "
- "L1.12[650,750] 0ns |----L1.12-----| "
- files_to_upgrade
- "L0, all files 110b "
- "L0.1[0,100] 0ns |-L0.1-| "
- "L0.5[800,900] 0ns |-L0.5-| "
- "L0.6[1000,1100] 0ns |-L0.6-| "
"###
);
}
#[test]
fn test_apply_mix_size_target_l1_2() {
let files = create_overlapped_files_3_mix_size((MAX_SIZE / 2) as i64);
// Small files (< size): L0.6
// Large files: the rest
// ==> only L0.1 is eligible for upgrade
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0 "
- "L0.3[400,500] 0ns 50b |-L0.3-| "
- "L0.2[200,300] 0ns 50b |-L0.2-| "
- "L0.1[0,100] 0ns 50b |-L0.1-| "
- "L0.4[600,700] 0ns 50b |-L0.4-| "
- "L0.5[800,900] 0ns 50b |-L0.5-| "
- "L0.6[1000,1100] 0ns 49b |-L0.6-| "
- "L1 "
- "L1.11[250,350] 0ns 50b |L1.11-| "
- "L1.12[650,750] 0ns 50b |L1.12-| "
"###
);
let split = UpgradeSplit::new(MAX_SIZE);
let (files_to_compact, files_to_upgrade) = split.apply(
files,
CompactionLevel::FileNonOverlapped,
create_fake_partition_id(),
);
insta::assert_yaml_snapshot!(
format_files_split("files_to_compact", &files_to_compact, "files_to_upgrade", &files_to_upgrade),
@r###"
---
- files_to_compact
- "L0 "
- "L0.6[1000,1100] 0ns 49b |--L0.6--|"
- "L0.4[600,700] 0ns 50b |--L0.4--| "
- "L0.2[200,300] 0ns 50b |--L0.2--| "
- "L0.3[400,500] 0ns 50b |--L0.3--| "
- "L0.5[800,900] 0ns 50b |--L0.5--| "
- "L1 "
- "L1.11[250,350] 0ns 50b |-L1.11--| "
- "L1.12[650,750] 0ns 50b |-L1.12--| "
- files_to_upgrade
- "L0, all files 50b "
- "L0.1[0,100] 0ns |------------------------------------------L0.1------------------------------------------|"
"###
);
}
}
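The two cheap eligibility checks the algorithm applies first (size, then time-range overlap) can be sketched standalone. This is a simplified illustration only; the real `UpgradeSplit` additionally rejects files sandwiched between non-eligible ones, and the `F` struct and `upgradable` function here are hypothetical, not crate APIs:

#[derive(Debug, Clone, Copy)]
struct F {
    min: i64,
    max: i64,
    size: u64,
}

// A (target_level - 1) file is a candidate for upgrade when it is large enough
// and lies completely outside the time range spanned by the files being compacted.
fn upgradable(file: &F, compact_range: Option<(i64, i64)>, max_desired: u64) -> bool {
    if file.size < max_desired / 2 {
        return false; // too small: compact it instead
    }
    match compact_range {
        Some((lo, hi)) => file.max < lo || file.min > hi,
        None => true, // nothing to compact against
    }
}

fn main() {
    let range = Some((250, 750)); // time range of the files to compact
    assert!(upgradable(&F { min: 800, max: 900, size: 60 }, range, 100));
    assert!(!upgradable(&F { min: 800, max: 900, size: 40 }, range, 100)); // undersized
    assert!(!upgradable(&F { min: 600, max: 700, size: 60 }, range, 100)); // overlaps
}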


@@ -1,373 +0,0 @@
//! Current hardcoded component setup.
//!
//! TODO: Make this a runtime-config.
use std::{sync::Arc, time::Duration};
use compactor_scheduler::{create_scheduler, Scheduler};
use data_types::CompactionLevel;
use object_store::memory::InMemory;
use crate::{config::Config, error::ErrorKind, object_store::ignore_writes::IgnoreWrites};
use super::{
changed_files_filter::logging::LoggingChangedFiles,
columns_source::catalog::CatalogColumnsSource,
commit::CommitToScheduler,
compaction_job_done_sink::{
error_kind::ErrorKindCompactionJobDoneSinkWrapper,
logging::LoggingCompactionJobDoneSinkWrapper, metrics::MetricsCompactionJobDoneSinkWrapper,
outcome::CompactionJobDoneSinkToScheduler, CompactionJobDoneSink,
},
compaction_job_stream::{
endless::EndlessCompactionJobStream, once::OnceCompactionJobStream, CompactionJobStream,
},
compaction_jobs_source::{
logging::LoggingCompactionJobsWrapper, metrics::MetricsCompactionJobsSourceWrapper,
not_empty::NotEmptyCompactionJobsSourceWrapper,
randomize_order::RandomizeOrderCompactionJobsSourcesWrapper,
scheduled::ScheduledCompactionJobsSource, CompactionJobsSource,
},
df_plan_exec::{
dedicated::DedicatedDataFusionPlanExec, noop::NoopDataFusionPlanExec, DataFusionPlanExec,
},
df_planner::{planner_v1::V1DataFusionPlanner, DataFusionPlanner},
divide_initial::multiple_branches::MultipleBranchesDivideInitial,
file_classifier::{
logging::LoggingFileClassifierWrapper, split_based::SplitBasedFileClassifier,
FileClassifier,
},
file_filter::level_range::LevelRangeFileFilter,
files_split::{
non_overlap_split::NonOverlapSplit, target_level_split::TargetLevelSplit,
upgrade_split::UpgradeSplit,
},
ir_planner::{logging::LoggingIRPlannerWrapper, planner_v1::V1IRPlanner, IRPlanner},
namespaces_source::catalog::CatalogNamespacesSource,
parquet_file_sink::{
dedicated::DedicatedExecParquetFileSinkWrapper, logging::LoggingParquetFileSinkWrapper,
object_store::ObjectStoreParquetFileSink,
},
parquet_files_sink::{dispatch::DispatchParquetFilesSink, ParquetFilesSink},
partition_files_source::{
catalog::{CatalogPartitionFilesSource, QueryRateLimiter},
rate_limit::RateLimit,
PartitionFilesSource,
},
partition_filter::{
and::AndPartitionFilter, greater_matching_files::GreaterMatchingFilesPartitionFilter,
greater_size_matching_files::GreaterSizeMatchingFilesPartitionFilter,
has_files::HasFilesPartitionFilter, has_matching_file::HasMatchingFilePartitionFilter,
logging::LoggingPartitionFilterWrapper, max_num_columns::MaxNumColumnsPartitionFilter,
metrics::MetricsPartitionFilterWrapper, or::OrPartitionFilter, PartitionFilter,
},
partition_info_source::{sub_sources::SubSourcePartitionInfoSource, PartitionInfoSource},
partition_source::{
catalog::CatalogPartitionSource, logging::LoggingPartitionSourceWrapper,
metrics::MetricsPartitionSourceWrapper,
},
post_classification_partition_filter::{
logging::LoggingPostClassificationFilterWrapper,
metrics::MetricsPostClassificationFilterWrapper, possible_progress::PossibleProgressFilter,
PostClassificationPartitionFilter,
},
round_info_source::{LevelBasedRoundInfo, LoggingRoundInfoWrapper, RoundInfoSource},
round_split::many_files::ManyFilesRoundSplit,
scratchpad::{noop::NoopScratchpadGen, prod::ProdScratchpadGen, ScratchpadGen},
split_or_compact::{
logging::LoggingSplitOrCompactWrapper, metrics::MetricsSplitOrCompactWrapper,
split_compact::SplitCompact,
},
tables_source::catalog::CatalogTablesSource,
Components,
};
/// Get hardcoded components.
pub fn hardcoded_components(config: &Config) -> Arc<Components> {
let scheduler = create_scheduler(
config.scheduler_config.clone(),
Arc::clone(&config.catalog),
Arc::clone(&config.time_provider),
Arc::clone(&config.metric_registry),
config.shadow_mode,
);
let (compaction_jobs_source, commit, compaction_job_done_sink) =
make_jobs_source_commit_jobs_sink(config, Arc::clone(&scheduler));
Arc::new(Components {
compaction_job_stream: make_compaction_job_stream(config, compaction_jobs_source),
partition_info_source: make_partition_info_source(config),
partition_files_source: make_partition_files_source(config),
round_info_source: make_round_info_source(config),
partition_filter: make_partition_filter(config),
compaction_job_done_sink,
commit,
ir_planner: make_ir_planner(config),
df_planner: make_df_planner(config),
df_plan_exec: make_df_plan_exec(config),
parquet_files_sink: make_parquet_files_sink(config),
round_split: Arc::new(ManyFilesRoundSplit::new()),
divide_initial: Arc::new(MultipleBranchesDivideInitial::new()),
scratchpad_gen: make_scratchpad_gen(config),
file_classifier: make_file_classifier(config),
post_classification_partition_filter: make_post_classification_partition_filter(config),
changed_files_filter: Arc::new(LoggingChangedFiles::new()),
})
}
fn make_jobs_source_commit_jobs_sink(
config: &Config,
scheduler: Arc<dyn Scheduler>,
) -> (
Arc<dyn CompactionJobsSource>,
Arc<CommitToScheduler>,
Arc<dyn CompactionJobDoneSink>,
) {
let compaction_jobs_source = ScheduledCompactionJobsSource::new(Arc::clone(&scheduler));
let commit = CommitToScheduler::new(Arc::clone(&scheduler));
let compaction_job_done_sink = CompactionJobDoneSinkToScheduler::new(Arc::clone(&scheduler));
// compactors are responsible for error classification
// and any future decisions regarding graceful shutdown
let compaction_job_done_sink: Arc<dyn CompactionJobDoneSink> = if config.all_errors_are_fatal {
Arc::new(compaction_job_done_sink)
} else {
Arc::new(ErrorKindCompactionJobDoneSinkWrapper::new(
compaction_job_done_sink,
ErrorKind::variants()
.iter()
.filter(|kind| {
// use explicit match statement so we never forget to add new variants
match kind {
ErrorKind::OutOfMemory | ErrorKind::Timeout | ErrorKind::Unknown => true,
ErrorKind::ObjectStore => false,
}
})
.copied()
.collect(),
scheduler,
))
};
let compaction_job_done_sink = Arc::new(LoggingCompactionJobDoneSinkWrapper::new(
MetricsCompactionJobDoneSinkWrapper::new(compaction_job_done_sink, &config.metric_registry),
));
    // Note: Place the "not empty" wrapper at the very end so that the logging and metric wrappers work
    // even when there is no data.
let compaction_jobs_source =
LoggingCompactionJobsWrapper::new(MetricsCompactionJobsSourceWrapper::new(
RandomizeOrderCompactionJobsSourcesWrapper::new(compaction_jobs_source, 1234),
&config.metric_registry,
));
let compaction_jobs_source: Arc<dyn CompactionJobsSource> = if config.process_once {
        // do not wrap in the "not empty" filter because we do NOT want to throttle in this case,
        // but just exit early
Arc::new(compaction_jobs_source)
} else {
Arc::new(NotEmptyCompactionJobsSourceWrapper::new(
compaction_jobs_source,
Duration::from_secs(5),
Arc::clone(&config.time_provider),
))
};
(
compaction_jobs_source,
Arc::new(commit),
compaction_job_done_sink,
)
}
fn make_compaction_job_stream(
config: &Config,
compaction_jobs_source: Arc<dyn CompactionJobsSource>,
) -> Arc<dyn CompactionJobStream> {
if config.process_once {
Arc::new(OnceCompactionJobStream::new(compaction_jobs_source))
} else {
Arc::new(EndlessCompactionJobStream::new(compaction_jobs_source))
}
}
fn make_partition_info_source(config: &Config) -> Arc<dyn PartitionInfoSource> {
Arc::new(SubSourcePartitionInfoSource::new(
CatalogColumnsSource::new(config.backoff_config.clone(), Arc::clone(&config.catalog)),
LoggingPartitionSourceWrapper::new(MetricsPartitionSourceWrapper::new(
CatalogPartitionSource::new(config.backoff_config.clone(), Arc::clone(&config.catalog)),
&config.metric_registry,
)),
CatalogTablesSource::new(config.backoff_config.clone(), Arc::clone(&config.catalog)),
CatalogNamespacesSource::new(config.backoff_config.clone(), Arc::clone(&config.catalog)),
))
}
fn make_partition_files_source(config: &Config) -> Arc<dyn PartitionFilesSource> {
match config.max_partition_fetch_queries_per_second {
Some(rps) => Arc::new(CatalogPartitionFilesSource::new(
config.backoff_config.clone(),
QueryRateLimiter::new(Arc::clone(&config.catalog), RateLimit::new(rps, 25)),
)),
None => Arc::new(CatalogPartitionFilesSource::new(
config.backoff_config.clone(),
Arc::clone(&config.catalog),
)),
}
}
fn make_round_info_source(config: &Config) -> Arc<dyn RoundInfoSource> {
Arc::new(LoggingRoundInfoWrapper::new(Arc::new(
LevelBasedRoundInfo::new(
config.max_num_files_per_plan,
config.max_compact_size_bytes(),
),
)))
}
// Conditions to compact this partition
fn make_partition_filter(config: &Config) -> Arc<dyn PartitionFilter> {
let mut partition_filters = exceptional_cases_partition_filters(config);
partition_filters.push(continue_condition_filter(config));
let partition_continue_conditions = "continue_conditions";
Arc::new(LoggingPartitionFilterWrapper::new(
MetricsPartitionFilterWrapper::new(
AndPartitionFilter::new(partition_filters),
&config.metric_registry,
partition_continue_conditions,
),
partition_continue_conditions,
))
}
fn exceptional_cases_partition_filters(config: &Config) -> Vec<Arc<dyn PartitionFilter>> {
// Capacity is hardcoded to a somewhat arbitrary number to prevent some reallocations
let mut partition_filters: Vec<Arc<dyn PartitionFilter>> = Vec::with_capacity(8);
partition_filters.push(Arc::new(HasFilesPartitionFilter::new()));
partition_filters.push(Arc::new(MaxNumColumnsPartitionFilter::new(
config.max_num_columns_per_table,
)));
partition_filters
}
fn continue_condition_filter(config: &Config) -> Arc<dyn PartitionFilter> {
// (Has-L0) OR -- to avoid overlapped files
// (num(L1) > N) OR -- to avoid many files
    // (total_size(L1) > max_desired_file_size) -- to avoid compacting and then splitting
Arc::new(OrPartitionFilter::new(vec![
Arc::new(HasMatchingFilePartitionFilter::new(
LevelRangeFileFilter::new(CompactionLevel::Initial..=CompactionLevel::Initial),
)),
Arc::new(GreaterMatchingFilesPartitionFilter::new(
LevelRangeFileFilter::new(
CompactionLevel::FileNonOverlapped..=CompactionLevel::FileNonOverlapped,
),
config.min_num_l1_files_to_compact,
)),
Arc::new(GreaterSizeMatchingFilesPartitionFilter::new(
LevelRangeFileFilter::new(
CompactionLevel::FileNonOverlapped..=CompactionLevel::FileNonOverlapped,
),
config.max_desired_file_size_bytes,
)),
]))
}
fn make_ir_planner(config: &Config) -> Arc<dyn IRPlanner> {
Arc::new(LoggingIRPlannerWrapper::new(V1IRPlanner::new(
config.max_desired_file_size_bytes,
config.percentage_max_file_size,
config.split_percentage,
)))
}
fn make_df_planner(config: &Config) -> Arc<dyn DataFusionPlanner> {
Arc::new(V1DataFusionPlanner::new(
config.parquet_store_scratchpad.clone(),
Arc::clone(&config.exec),
))
}
fn make_df_plan_exec(config: &Config) -> Arc<dyn DataFusionPlanExec> {
if config.simulate_without_object_store {
Arc::new(NoopDataFusionPlanExec::new())
} else {
Arc::new(DedicatedDataFusionPlanExec::new(Arc::clone(&config.exec)))
}
}
fn make_parquet_files_sink(config: &Config) -> Arc<dyn ParquetFilesSink> {
if let Some(sink) = config.parquet_files_sink_override.as_ref() {
Arc::clone(sink)
} else {
let parquet_file_sink = Arc::new(LoggingParquetFileSinkWrapper::new(
DedicatedExecParquetFileSinkWrapper::new(
ObjectStoreParquetFileSink::new(
config.exec.pool(),
config.parquet_store_scratchpad.clone(),
Arc::clone(&config.time_provider),
),
Arc::clone(&config.exec),
),
));
Arc::new(DispatchParquetFilesSink::new(parquet_file_sink))
}
}
fn make_scratchpad_gen(config: &Config) -> Arc<dyn ScratchpadGen> {
if config.simulate_without_object_store || !config.enable_scratchpad {
Arc::new(NoopScratchpadGen::new())
} else {
let scratchpad_store_output = if config.shadow_mode {
Arc::new(IgnoreWrites::new(Arc::new(InMemory::new())))
} else {
Arc::clone(config.parquet_store_real.object_store())
};
Arc::new(ProdScratchpadGen::new(
config.shadow_mode,
config.partition_scratchpad_concurrency,
config.backoff_config.clone(),
Arc::clone(config.parquet_store_real.object_store()),
Arc::clone(config.parquet_store_scratchpad.object_store()),
scratchpad_store_output,
))
}
}
fn make_file_classifier(config: &Config) -> Arc<dyn FileClassifier> {
Arc::new(LoggingFileClassifierWrapper::new(Arc::new(
SplitBasedFileClassifier::new(
TargetLevelSplit::new(),
NonOverlapSplit::new(config.max_desired_file_size_bytes / 20), // rewrite non-overlapping files up to 5% of max
UpgradeSplit::new(config.max_desired_file_size_bytes),
LoggingSplitOrCompactWrapper::new(MetricsSplitOrCompactWrapper::new(
SplitCompact::new(
config.max_num_files_per_plan,
config.max_compact_size_bytes(),
config.max_desired_file_size_bytes,
),
&config.metric_registry,
)),
),
)))
}
fn make_post_classification_partition_filter(
config: &Config,
) -> Arc<dyn PostClassificationPartitionFilter> {
let partition_resource_limit_conditions = "resource_limit_conditions";
Arc::new(LoggingPostClassificationFilterWrapper::new(
MetricsPostClassificationFilterWrapper::new(
PossibleProgressFilter::new(config.max_compact_size_bytes()),
&config.metric_registry,
partition_resource_limit_conditions,
),
partition_resource_limit_conditions,
))
}
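Every component above follows the same construction idiom: a concrete implementation is wrapped in logging/metrics decorators and then type-erased behind `Arc<dyn Trait>`. A minimal standalone sketch of that layering pattern follows; the `Source` trait and both structs are hypothetical stand-ins, not crate APIs:

use std::{fmt::Display, sync::Arc};

trait Source: Display + Send + Sync {
    fn fetch(&self) -> Vec<u64>;
}

struct CatalogSource;

impl Source for CatalogSource {
    fn fetch(&self) -> Vec<u64> {
        vec![1, 2, 3]
    }
}

impl Display for CatalogSource {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "catalog")
    }
}

// Decorator: delegates to `inner` and logs around the call.
struct LoggingSource<T: Source> {
    inner: T,
}

impl<T: Source> Source for LoggingSource<T> {
    fn fetch(&self) -> Vec<u64> {
        let out = self.inner.fetch();
        println!("fetched {} items from {}", out.len(), self.inner);
        out
    }
}

impl<T: Source> Display for LoggingSource<T> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "logging({})", self.inner)
    }
}

fn main() {
    // Same shape as make_partition_info_source & friends: wrap, then erase to Arc<dyn _>.
    let source: Arc<dyn Source> = Arc::new(LoggingSource { inner: CatalogSource });
    assert_eq!(source.fetch(), vec![1, 2, 3]);
    println!("{source}");
}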


@@ -1,136 +0,0 @@
use std::{fmt::Display, sync::Arc};
use data_types::{CompactionLevel, ParquetFile};
use observability_deps::tracing::info;
use parquet_file::ParquetFilePath;
use uuid::Uuid;
use crate::{
file_classification::{CompactReason, FileToSplit, FilesToSplitOrCompact, SplitReason},
partition_info::PartitionInfo,
plan_ir::PlanIR,
};
use super::IRPlanner;
#[derive(Debug)]
pub struct LoggingIRPlannerWrapper<T>
where
T: IRPlanner,
{
inner: T,
}
impl<T> LoggingIRPlannerWrapper<T>
where
T: IRPlanner,
{
pub fn new(inner: T) -> Self {
Self { inner }
}
}
impl<T> Display for LoggingIRPlannerWrapper<T>
where
T: IRPlanner,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "logging({})", self.inner)
}
}
impl<T> IRPlanner for LoggingIRPlannerWrapper<T>
where
T: IRPlanner,
{
fn create_plans(
&self,
partition: Arc<PartitionInfo>,
target_level: CompactionLevel,
split_or_compact: FilesToSplitOrCompact,
object_store_ids: Vec<Uuid>,
object_store_paths: Vec<ParquetFilePath>,
) -> Vec<PlanIR> {
self.inner.create_plans(
partition,
target_level,
split_or_compact,
object_store_ids,
object_store_paths,
)
}
fn compact_plan(
&self,
files: Vec<ParquetFile>,
object_store_paths: Vec<ParquetFilePath>,
object_store_ids: Vec<Uuid>,
reason: CompactReason,
partition: Arc<PartitionInfo>,
compaction_level: CompactionLevel,
) -> PlanIR {
let partition_id = partition.partition_id;
let n_input_files = files.len();
let column_count = partition.column_count();
let input_file_size_bytes = files.iter().map(|f| f.file_size_bytes).sum::<i64>();
let plan = self.inner.compact_plan(
files,
object_store_paths,
object_store_ids,
reason,
partition,
compaction_level,
);
info!(
partition_id = partition_id.get(),
n_input_files,
column_count,
input_file_size_bytes,
n_output_files = plan.n_output_files(),
compaction_level = compaction_level as i16,
?reason,
%plan,
"created IR compact plan",
);
plan
}
fn split_plan(
&self,
file_to_split: FileToSplit,
object_store_path: ParquetFilePath,
object_store_id: Uuid,
reason: SplitReason,
partition: Arc<PartitionInfo>,
compaction_level: CompactionLevel,
) -> PlanIR {
let partition_id = partition.partition_id;
let n_input_files = 1;
let column_count = partition.column_count();
let input_file_size_bytes = file_to_split.file.file_size_bytes;
let plan = self.inner.split_plan(
file_to_split,
object_store_path,
object_store_id,
reason,
partition,
compaction_level,
);
info!(
partition_id = partition_id.get(),
n_input_files,
column_count,
input_file_size_bytes,
n_output_files = plan.n_output_files(),
compaction_level = compaction_level as i16,
?reason,
%plan,
"created IR split plan",
);
plan
}
}


@@ -1,52 +0,0 @@
use std::{
fmt::{Debug, Display},
sync::Arc,
};
use data_types::{CompactionLevel, ParquetFile};
use parquet_file::ParquetFilePath;
use uuid::Uuid;
pub mod logging;
pub mod planner_v1;
use crate::{
file_classification::{CompactReason, FileToSplit, FilesToSplitOrCompact, SplitReason},
partition_info::PartitionInfo,
plan_ir::PlanIR,
};
/// Creates [`PlanIR`] that describes what files should be compacted and updated
pub trait IRPlanner: Debug + Display + Send + Sync {
/// Build compact or split plans as appropriate
fn create_plans(
&self,
partition: Arc<PartitionInfo>,
target_level: CompactionLevel,
split_or_compact: FilesToSplitOrCompact,
object_store_ids: Vec<Uuid>,
object_store_paths: Vec<ParquetFilePath>,
) -> Vec<PlanIR>;
    /// Build a plan to compact the given files
fn compact_plan(
&self,
files: Vec<ParquetFile>,
paths: Vec<ParquetFilePath>,
object_store_ids: Vec<Uuid>,
reason: CompactReason,
partition: Arc<PartitionInfo>,
target_level: CompactionLevel,
) -> PlanIR;
    /// Build a plan to split a given file at the given split times
fn split_plan(
&self,
file_to_split: FileToSplit,
path: ParquetFilePath,
object_store_id: Uuid,
reason: SplitReason,
partition: Arc<PartitionInfo>,
target_level: CompactionLevel,
) -> PlanIR;
}


@@ -1,493 +0,0 @@
use std::{fmt::Display, sync::Arc};
use data_types::{ChunkOrder, CompactionLevel, ParquetFile, Timestamp, TimestampMinMax};
use parquet_file::ParquetFilePath;
use uuid::Uuid;
use crate::{
file_classification::{CompactReason, FileToSplit, FilesToSplitOrCompact, SplitReason},
partition_info::PartitionInfo,
plan_ir::{FileIR, PlanIR},
};
use super::IRPlanner;
/// Builder for compaction plans.
///
/// This uses the first draft / version of how the compactor splits files / time ranges. There will
/// probably be future implementations (maybe called V2, but maybe it also gets a proper name).
#[derive(Debug)]
pub struct V1IRPlanner {
max_desired_file_size_bytes: u64,
percentage_max_file_size: u16,
split_percentage: u16,
}
impl V1IRPlanner {
/// Create a new compact plan builder.
pub fn new(
max_desired_file_size_bytes: u64,
percentage_max_file_size: u16,
split_percentage: u16,
) -> Self {
Self {
max_desired_file_size_bytes,
percentage_max_file_size,
split_percentage,
}
}
    // compute cutoff bytes for files
fn cutoff_bytes(max_desired_file_size_bytes: u64, percentage_max_file_size: u16) -> (u64, u64) {
(
(max_desired_file_size_bytes * percentage_max_file_size as u64) / 100,
(max_desired_file_size_bytes * (100 + percentage_max_file_size as u64)) / 100,
)
}
    // Compute the times at which to split the data.
    // Return a list of times at which we want the data to be split, computed from the
    // max_desired_file_size each file should not exceed and the total_size the input
    // time range [min_time, max_time] contains.
    // The split times assume that the data is evenly distributed in the time range; if that
    // is not the case, the resulting files are not guaranteed to be below max_desired_file_size.
    // Hence, the interval between two contiguous returned times is the
    // max_desired_file_size/total_size fraction of the time range.
// Example:
// . Input
// min_time = 1
// max_time = 21
// total_size = 100
// max_desired_file_size = 30
//
    // . Percentage = 30/100 = 0.3
// . Time range between 2 times = (21 - 1) * 0.3 = 6
//
// . Output = [7, 13, 19] in which
// 7 = 1 (min_time) + 6 (time range)
// 13 = 7 (previous time) + 6 (time range)
// 19 = 13 (previous time) + 6 (time range)
pub fn compute_split_time(
chunk_times: Vec<TimestampMinMax>,
min_time: i64,
max_time: i64,
total_size: u64,
max_desired_file_size: u64,
) -> Vec<i64> {
// Too small to split
if total_size <= max_desired_file_size {
return vec![max_time];
}
// Same min and max time, nothing to split
if min_time == max_time {
return vec![max_time];
}
let mut split_times = vec![];
let percentage = max_desired_file_size as f64 / total_size as f64;
let interval = ((max_time - min_time) as f64 * percentage).ceil() as i64;
let mut min = min_time;
loop {
let split_time = min + interval;
if split_time >= max_time {
break;
} else if Self::time_range_present(&chunk_times, min, split_time) {
split_times.push(split_time);
}
min = split_time;
}
split_times
}
    // time_range_present returns true if the given time range overlaps any of the chunks.
fn time_range_present(chunk_times: &[TimestampMinMax], min_time: i64, max_time: i64) -> bool {
chunk_times
.iter()
.any(|&chunk| chunk.max >= min_time && chunk.min <= max_time)
}
}
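The interval arithmetic above is easy to verify standalone. The following is a simplified sketch (it drops the `time_range_present` check and uses plain integers rather than the crate's types) that reproduces the doc-comment example:

fn split_times(min_time: i64, max_time: i64, total_size: u64, max_desired: u64) -> Vec<i64> {
    if total_size <= max_desired || min_time == max_time {
        return vec![max_time]; // too small to split, or nothing to split
    }
    // The interval is the max_desired/total_size fraction of the time range, rounded up.
    let fraction = max_desired as f64 / total_size as f64;
    let interval = ((max_time - min_time) as f64 * fraction).ceil() as i64;
    let mut out = vec![];
    let mut t = min_time + interval;
    while t < max_time {
        out.push(t);
        t += interval;
    }
    out
}

fn main() {
    // min_time = 1, max_time = 21, total_size = 100, max_desired = 30 → interval 6.
    assert_eq!(split_times(1, 21, 100, 30), vec![7, 13, 19]);
}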
impl Display for V1IRPlanner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "v1")
}
}
impl IRPlanner for V1IRPlanner {
/// Build compact or split plans as appropriate
fn create_plans(
&self,
partition: Arc<PartitionInfo>,
target_level: CompactionLevel,
split_or_compact: FilesToSplitOrCompact,
object_store_ids: Vec<Uuid>,
object_store_paths: Vec<ParquetFilePath>,
) -> Vec<PlanIR> {
match split_or_compact {
FilesToSplitOrCompact::Compact(files, reason) => {
vec![self.compact_plan(
files,
object_store_paths,
object_store_ids,
reason,
partition,
target_level,
)]
}
FilesToSplitOrCompact::Split(files, reason) => {
files
.into_iter()
.zip(object_store_ids)
.zip(object_store_paths)
.map(|((file_to_split, object_store_id), object_store_path)| {
// target level of a split file is the same as its level
let target_level = file_to_split.file.compaction_level;
self.split_plan(
file_to_split,
object_store_path,
object_store_id,
reason,
Arc::clone(&partition),
target_level,
)
})
.collect()
}
FilesToSplitOrCompact::None(reason) => vec![PlanIR::None { reason }], // Nothing to do
}
}
/// Build a plan to compact many files into a single file. Since we limit the size of the files,
/// if the compact result is larger than that limit, we will split the output into many files
fn compact_plan(
&self,
files: Vec<ParquetFile>,
paths: Vec<ParquetFilePath>,
object_store_ids: Vec<Uuid>,
reason: CompactReason,
_partition: Arc<PartitionInfo>,
target_level: CompactionLevel,
) -> PlanIR {
// gather data
// total file size is the sum of the file sizes of the files to compact
let total_size = files.iter().map(|f| f.file_size_bytes).sum::<i64>() as u64;
let chunk_times = files
.iter()
.map(|f| TimestampMinMax::new(f.min_time.get(), f.max_time.get()))
.collect::<Vec<_>>();
let min_time = files
.iter()
.map(|f| f.min_time.get())
.min()
.expect("at least one file");
let max_time = files
.iter()
.map(|f| f.max_time.get())
.max()
.expect("at least one file");
let (small_cutoff_bytes, large_cutoff_bytes) = Self::cutoff_bytes(
self.max_desired_file_size_bytes,
self.percentage_max_file_size,
);
let files = files
.into_iter()
.zip(object_store_ids)
.zip(paths)
.map(|((file, object_store_id), path)| {
let order = order(file.compaction_level, target_level, file.max_l0_created_at);
FileIR {
file: ParquetFile {
object_store_id,
..file
},
path,
order,
}
})
.collect::<Vec<_>>();
// Build logical compact plan
if total_size <= small_cutoff_bytes || reason == CompactReason::ManySmallFiles {
PlanIR::Compact {
files,
target_level,
reason,
}
} else {
let split_times = if small_cutoff_bytes < total_size && total_size <= large_cutoff_bytes
{
            // Split compaction into two files: the earlier covering split_percentage
            // of the time range, the later covering the rest
vec![min_time + ((max_time - min_time) * self.split_percentage as i64) / 100]
} else {
// Split compaction into multiple files
Self::compute_split_time(
chunk_times,
min_time,
max_time,
total_size,
self.max_desired_file_size_bytes,
)
};
if split_times.is_empty() || (split_times.len() == 1 && split_times[0] == max_time) {
// The split times might not have actually split anything, so in this case, compact
// everything into one file
PlanIR::Compact {
files,
target_level,
reason,
}
} else {
// split compact query plan to split the result into multiple files
PlanIR::Split {
files,
split_times,
target_level,
reason: SplitReason::CompactAndSplitOutput(reason),
}
}
}
}
/// Build a plan to split a file into multiple files based on the given split times
fn split_plan(
&self,
file_to_split: FileToSplit,
path: ParquetFilePath,
object_store_id: Uuid,
reason: SplitReason,
_partition: Arc<PartitionInfo>,
target_level: CompactionLevel,
) -> PlanIR {
let FileToSplit { file, split_times } = file_to_split;
let order = order(file.compaction_level, target_level, file.max_l0_created_at);
let file = FileIR {
file: ParquetFile {
object_store_id,
..file
},
path,
order,
};
PlanIR::Split {
files: vec![file],
split_times,
target_level,
reason,
}
}
}
// Order of the chunks so they can be deduplicated correctly
fn order(
compaction_level: CompactionLevel,
target_level: CompactionLevel,
max_l0_created_at: Timestamp,
) -> ChunkOrder {
// TODO: If we change the design specified in driver.rs's compact functions, we will need to refine this
// Currently, we only compact files of level_n with level_n+1 and produce level_n+1 files,
// under the strict design constraints that:
// . Level-0 files can overlap with any files.
// . Level-N files (N > 0) cannot overlap with any files in the same level.
// . For Level-0 files, we always pick the files with the smaller `max_l0_created_at` to compact (with
// each other and overlapped L1 files) first. `max_l0_created_at` is the max created time of all L0 files
// that were compacted into this given file. This value is used to order chunks for deduplication.
// . Level-N+1 files are results of compacting Level-N and/or Level-N+1 files; their `created_at`
// can be after the `created_at` of other Level-N files, but they may include data loaded before
// the other Level-N files. Hence we should never use the `created_at` of Level-N+1 files to order
// them against Level-N files.
// . We can only compact different sets of files of the same partition concurrently into the same target_level.
// Given that, we set the order of a chunk from its (compaction_level, target_level) pair as follows:
// . compaction_level < target_level : the order is `max_l0_created_at`
// . compaction_level == target_level : the order is 0, to make sure it is at the front of the ordered list.
// This means the chunks with `compaction_level == target_level` will be in arbitrary order, which is
// fine as long as they are in front of the chunks with `compaction_level < target_level`
match (compaction_level, target_level) {
(CompactionLevel::Initial, CompactionLevel::Initial)
| (CompactionLevel::Initial, CompactionLevel::FileNonOverlapped)
| (CompactionLevel::FileNonOverlapped, CompactionLevel::Final) => {
ChunkOrder::new(max_l0_created_at.get())
}
(CompactionLevel::FileNonOverlapped, CompactionLevel::FileNonOverlapped)
| (CompactionLevel::Final, CompactionLevel::Final) => ChunkOrder::new(0),
_ => {
panic!(
"Invalid compaction level combination: ({compaction_level:?}, {target_level:?})",
);
}
}
}
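// Illustration (a hedged sketch added here, not part of the original code):
// an L0 file with `max_l0_created_at = 123` being compacted into L1 is
// ordered by that creation time, while a file already at the target level
// sorts to the front (assuming `ChunkOrder` implements `PartialEq`):
//
//   let t = Timestamp::new(123);
//   assert_eq!(
//       order(CompactionLevel::Initial, CompactionLevel::FileNonOverlapped, t),
//       ChunkOrder::new(123)
//   );
//   assert_eq!(
//       order(CompactionLevel::FileNonOverlapped, CompactionLevel::FileNonOverlapped, t),
//       ChunkOrder::new(0)
//   );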
#[cfg(test)]
mod tests {
use super::*;
use data_types::TimestampMinMax;
#[test]
fn test_cutoff_bytes() {
let (small, large) = V1IRPlanner::cutoff_bytes(100, 30);
assert_eq!(small, 30);
assert_eq!(large, 130);
let (small, large) = V1IRPlanner::cutoff_bytes(100 * 1024 * 1024, 30);
assert_eq!(small, 30 * 1024 * 1024);
assert_eq!(large, 130 * 1024 * 1024);
let (small, large) = V1IRPlanner::cutoff_bytes(100, 60);
assert_eq!(small, 60);
assert_eq!(large, 160);
}
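// A hedged extra case, added for illustration: the assertions above are
// consistent with `cutoff_bytes(max, pct)` returning
// `(max * pct / 100, max * (100 + pct) / 100)`.
#[test]
fn test_cutoff_bytes_inferred_formula() {
// 50% of 200 is 100; 200 plus that 50% is 300
let (small, large) = V1IRPlanner::cutoff_bytes(200, 50);
assert_eq!(small, 100);
assert_eq!(large, 300);
}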
#[test]
fn test_compute_split_time() {
let min_time = 1;
let max_time = 11;
let total_size = 100;
let max_desired_file_size = 100;
let chunk_times = vec![TimestampMinMax {
min: min_time,
max: max_time,
}];
// no split
let result = V1IRPlanner::compute_split_time(
chunk_times.clone(),
min_time,
max_time,
total_size,
max_desired_file_size,
);
assert_eq!(result.len(), 1);
assert_eq!(result[0], max_time);
// split 70% and 30%
let max_desired_file_size = 70;
let result = V1IRPlanner::compute_split_time(
chunk_times.clone(),
min_time,
max_time,
total_size,
max_desired_file_size,
);
// only need to store the last split time
assert_eq!(result.len(), 1);
assert_eq!(result[0], 8); // = 1 (min_time) + 7
// split 40%, 40%, 20%
let max_desired_file_size = 40;
let result = V1IRPlanner::compute_split_time(
chunk_times,
min_time,
max_time,
total_size,
max_desired_file_size,
);
// store first and second split time
assert_eq!(result.len(), 2);
assert_eq!(result[0], 5); // = 1 (min_time) + 4
assert_eq!(result[1], 9); // = 5 (previous split_time) + 4
}
#[test]
fn compute_split_time_when_min_time_equals_max() {
// Imagine a customer is backfilling a large amount of data and for some reason, all the
// times on the data are exactly the same. That means the min_time and max_time will be the
// same, but the total_size will be greater than the desired size.
// We will not split it because the split has to stick to non-overlapped time range
let min_time = 1;
let max_time = 1;
let total_size = 200;
let max_desired_file_size = 100;
let chunk_times = vec![TimestampMinMax {
min: min_time,
max: max_time,
}];
let result = V1IRPlanner::compute_split_time(
chunk_times,
min_time,
max_time,
total_size,
max_desired_file_size,
);
// must return vector of one containing max_time
assert_eq!(result.len(), 1);
assert_eq!(result[0], 1);
}
#[test]
fn compute_split_time_please_dont_explode() {
// degenerate case where the step size is so small that it is < 1 (but > 0). In this case we shall still
// not loop forever.
let min_time = 10;
let max_time = 20;
let total_size = 600000;
let max_desired_file_size = 10000;
let chunk_times = vec![TimestampMinMax {
min: min_time,
max: max_time,
}];
let result = V1IRPlanner::compute_split_time(
chunk_times,
min_time,
max_time,
total_size,
max_desired_file_size,
);
assert_eq!(result.len(), 9);
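// An inference from the assertion above (not verified against compute_split_time's
// source): the desired piece count is 600000 / 10000 = 60, but the range [10, 20]
// only has 9 interior nanoseconds (11..=19), so at most 9 split times can be produced.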
}
#[test]
fn compute_split_time_chunk_gaps() {
// When the chunks have large gaps, we should not introduce splits that create time ranges
// known to be empty. Split T2 below should not exist.
// │ │
//┌────────────────┐ ┌──────────────┐
//│ Chunk 1 │ │ │ │ Chunk 2 │
//└────────────────┘ └──────────────┘
// │ │
// Split T1 Split T2
// Create a scenario where naive splitting would produce 2 splits (3 chunks) as shown above,
// but the only chunk data present is in the lowest and highest quarters of the time range.
let min_time = 1;
let max_time = 100;
let total_size = 200;
let max_desired_file_size = total_size / 3;
let chunk_times = vec![
TimestampMinMax { min: 1, max: 24 },
TimestampMinMax { min: 75, max: 100 },
];
let result = V1IRPlanner::compute_split_time(
chunk_times,
min_time,
max_time,
total_size,
max_desired_file_size,
);
// must return vector of one, containing a Split T1 shown above.
assert_eq!(result.len(), 1);
assert_eq!(result[0], 34);
}
}

@ -1,83 +0,0 @@
use std::sync::Arc;
use self::{
changed_files_filter::ChangedFilesFilter, commit::CommitToScheduler,
compaction_job_done_sink::CompactionJobDoneSink, compaction_job_stream::CompactionJobStream,
df_plan_exec::DataFusionPlanExec, df_planner::DataFusionPlanner, divide_initial::DivideInitial,
file_classifier::FileClassifier, ir_planner::IRPlanner, parquet_files_sink::ParquetFilesSink,
partition_files_source::PartitionFilesSource, partition_filter::PartitionFilter,
partition_info_source::PartitionInfoSource,
post_classification_partition_filter::PostClassificationPartitionFilter,
round_info_source::RoundInfoSource, round_split::RoundSplit, scratchpad::ScratchpadGen,
};
pub mod changed_files_filter;
pub mod columns_source;
pub(crate) mod commit;
pub mod compaction_job_done_sink;
pub mod compaction_job_stream;
pub mod compaction_jobs_source;
pub mod df_plan_exec;
pub mod df_planner;
pub mod divide_initial;
pub mod file_classifier;
pub mod file_filter;
pub mod files_split;
pub mod hardcoded;
pub mod ir_planner;
pub mod namespaces_source;
pub mod parquet_file_sink;
pub mod parquet_files_sink;
pub mod partition_files_source;
pub mod partition_filter;
pub mod partition_info_source;
pub mod partition_source;
pub mod post_classification_partition_filter;
pub mod report;
pub mod round_info_source;
pub mod round_split;
pub mod scratchpad;
pub mod split_or_compact;
pub mod tables_source;
pub mod timeout;
/// Pluggable system to determine compactor behavior. Please see
/// [Crate Level Documentation](crate) for more details on the
/// design.
#[derive(Debug, Clone)]
pub struct Components {
/// Source of partitions for the compactor to compact
pub compaction_job_stream: Arc<dyn CompactionJobStream>,
/// Source of information about a partition needed for compaction
pub partition_info_source: Arc<dyn PartitionInfoSource>,
/// Source of files in a partition for compaction
pub partition_files_source: Arc<dyn PartitionFilesSource>,
/// Determines what type of compaction round the compactor will be doing
pub round_info_source: Arc<dyn RoundInfoSource>,
/// stop condition for completing a partition compaction
pub partition_filter: Arc<dyn PartitionFilter>,
/// condition to avoid running out of resources during compaction
pub post_classification_partition_filter: Arc<dyn PostClassificationPartitionFilter>,
/// Records "compaction job is done" status for given partition.
pub compaction_job_done_sink: Arc<dyn CompactionJobDoneSink>,
/// Commits changes (i.e. deletion and creation).
pub commit: Arc<CommitToScheduler>,
/// Creates `PlanIR` that describes what files should be compacted and updated
pub ir_planner: Arc<dyn IRPlanner>,
/// Creates an Execution plan for a `PlanIR`
pub df_planner: Arc<dyn DataFusionPlanner>,
/// Executes a DataFusion plan to multiple output streams.
pub df_plan_exec: Arc<dyn DataFusionPlanExec>,
/// Writes the streams created by [`DataFusionPlanExec`] to the object store.
pub parquet_files_sink: Arc<dyn ParquetFilesSink>,
/// Split files into two buckets "now" and "later".
pub round_split: Arc<dyn RoundSplit>,
/// Divide files in a partition into "branches"
pub divide_initial: Arc<dyn DivideInitial>,
/// Create intermediate temporary storage
pub scratchpad_gen: Arc<dyn ScratchpadGen>,
/// Classify files for each compaction branch.
pub file_classifier: Arc<dyn FileClassifier>,
/// Check for other processes modifying files.
pub changed_files_filter: Arc<dyn ChangedFilesFilter>,
}

@ -1,61 +0,0 @@
use std::{fmt::Display, sync::Arc};
use async_trait::async_trait;
use backoff::{Backoff, BackoffConfig};
use data_types::{Namespace, NamespaceId, NamespaceSchema};
use iox_catalog::interface::{get_schema_by_id, Catalog, SoftDeletedRows};
use super::NamespacesSource;
#[derive(Debug)]
pub struct CatalogNamespacesSource {
backoff_config: BackoffConfig,
catalog: Arc<dyn Catalog>,
}
impl CatalogNamespacesSource {
pub fn new(backoff_config: BackoffConfig, catalog: Arc<dyn Catalog>) -> Self {
Self {
backoff_config,
catalog,
}
}
}
impl Display for CatalogNamespacesSource {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "catalog")
}
}
#[async_trait]
impl NamespacesSource for CatalogNamespacesSource {
async fn fetch_by_id(&self, ns: NamespaceId) -> Option<Namespace> {
Backoff::new(&self.backoff_config)
.retry_all_errors("namespace_of_given_namespace_id", || async {
self.catalog
.repositories()
.await
.namespaces()
.get_by_id(ns, SoftDeletedRows::AllRows)
.await
})
.await
.expect("retry forever")
}
async fn fetch_schema_by_id(&self, ns: NamespaceId) -> Option<NamespaceSchema> {
Backoff::new(&self.backoff_config)
.retry_all_errors("namespace_of_given_namespace_id", || async {
let mut repos = self.catalog.repositories().await;
let res = get_schema_by_id(ns, repos.as_mut(), SoftDeletedRows::AllRows).await;
match res {
Ok(schema) => Ok(Some(schema)),
Err(iox_catalog::interface::Error::NamespaceNotFoundById { .. }) => Ok(None),
Err(e) => Err(e),
}
})
.await
.expect("retry forever")
}
}
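// Usage sketch (hypothetical; assumes `BackoffConfig` implements `Default` and a
// `catalog: Arc<dyn Catalog>` handle is in scope):
//
//   let source = CatalogNamespacesSource::new(BackoffConfig::default(), Arc::clone(&catalog));
//   let schema = source.fetch_schema_by_id(NamespaceId::new(1)).await;
//
// Both fetches retry all errors forever via [`Backoff`], so a `None` result means the
// namespace genuinely does not exist, not that the catalog was unreachable.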

@ -1,210 +0,0 @@
use std::{collections::HashMap, fmt::Display};
use async_trait::async_trait;
use data_types::{Namespace, NamespaceId, NamespaceSchema};
use super::NamespacesSource;
#[derive(Debug, Clone)]
/// contains [`Namespace`] and a [`NamespaceSchema`]
pub struct NamespaceWrapper {
/// namespace
pub ns: Namespace,
/// schema
pub schema: NamespaceSchema,
}
#[derive(Debug)]
pub struct MockNamespacesSource {
namespaces: HashMap<NamespaceId, NamespaceWrapper>,
}
impl MockNamespacesSource {
#[allow(dead_code)] // not used anywhere
pub fn new(namespaces: HashMap<NamespaceId, NamespaceWrapper>) -> Self {
Self { namespaces }
}
}
impl Display for MockNamespacesSource {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "mock")
}
}
#[async_trait]
impl NamespacesSource for MockNamespacesSource {
async fn fetch_by_id(&self, ns: NamespaceId) -> Option<Namespace> {
let wrapper = self.namespaces.get(&ns);
wrapper.map(|wrapper| wrapper.ns.clone())
}
async fn fetch_schema_by_id(&self, ns: NamespaceId) -> Option<NamespaceSchema> {
let wrapper = self.namespaces.get(&ns);
wrapper.map(|wrapper| wrapper.schema.clone())
}
}
#[cfg(test)]
mod tests {
use std::collections::BTreeMap;
use data_types::{
Column, ColumnId, ColumnType, ColumnsByName, MaxColumnsPerTable, MaxTables, TableId,
TableSchema,
};
use super::*;
#[test]
fn test_display() {
assert_eq!(
MockNamespacesSource::new(HashMap::default()).to_string(),
"mock",
)
}
#[tokio::test]
async fn test_fetch_namespace() {
let ns_1 = NamespaceBuilder::new(1).build();
let ns_2 = NamespaceBuilder::new(2).build();
let namespaces = HashMap::from([
(NamespaceId::new(1), ns_1.clone()),
(NamespaceId::new(2), ns_2.clone()),
]);
let source = MockNamespacesSource::new(namespaces);
// different tables
assert_eq!(
source.fetch_by_id(NamespaceId::new(1)).await,
Some(ns_1.clone().ns),
);
assert_eq!(source.fetch_by_id(NamespaceId::new(2)).await, Some(ns_2.ns),);
// fetching does not drain
assert_eq!(source.fetch_by_id(NamespaceId::new(1)).await, Some(ns_1.ns),);
// unknown namespace => None result
assert_eq!(source.fetch_by_id(NamespaceId::new(3)).await, None,);
}
#[tokio::test]
async fn test_fetch_namespace_schema() {
let ns_1 = NamespaceBuilder::new(1).build();
let ns_2 = NamespaceBuilder::new(2).build();
let namespaces = HashMap::from([
(NamespaceId::new(1), ns_1.clone()),
(NamespaceId::new(2), ns_2.clone()),
]);
let source = MockNamespacesSource::new(namespaces);
// different tables
assert_eq!(
source.fetch_schema_by_id(NamespaceId::new(1)).await,
Some(ns_1.clone().schema),
);
assert_eq!(
source.fetch_schema_by_id(NamespaceId::new(2)).await,
Some(ns_2.schema),
);
// fetching does not drain
assert_eq!(
source.fetch_schema_by_id(NamespaceId::new(1)).await,
Some(ns_1.schema),
);
// unknown namespace => None result
assert_eq!(source.fetch_schema_by_id(NamespaceId::new(3)).await, None,);
}
#[derive(Debug)]
/// Build [`NamespaceWrapper`] for testing
pub struct NamespaceBuilder {
namespace: NamespaceWrapper,
}
impl NamespaceBuilder {
pub fn new(id: i64) -> Self {
let tables = BTreeMap::from([
(
"table1".to_string(),
TableSchema {
id: TableId::new(1),
partition_template: Default::default(),
columns: ColumnsByName::new([
Column {
name: "col1".to_string(),
id: ColumnId::new(1),
column_type: ColumnType::I64,
table_id: TableId::new(1),
},
Column {
name: "col2".to_string(),
id: ColumnId::new(2),
column_type: ColumnType::String,
table_id: TableId::new(1),
},
]),
},
),
(
"table2".to_string(),
TableSchema {
id: TableId::new(2),
partition_template: Default::default(),
columns: ColumnsByName::new([
Column {
name: "col1".to_string(),
id: ColumnId::new(3),
column_type: ColumnType::I64,
table_id: TableId::new(2),
},
Column {
name: "col2".to_string(),
id: ColumnId::new(4),
column_type: ColumnType::String,
table_id: TableId::new(2),
},
Column {
name: "col3".to_string(),
id: ColumnId::new(5),
column_type: ColumnType::F64,
table_id: TableId::new(2),
},
]),
},
),
]);
let id = NamespaceId::new(id);
Self {
namespace: NamespaceWrapper {
ns: Namespace {
id,
name: "ns".to_string(),
max_tables: MaxTables::new(10),
max_columns_per_table: MaxColumnsPerTable::new(10),
retention_period_ns: None,
deleted_at: None,
partition_template: Default::default(),
},
schema: NamespaceSchema {
id,
tables,
max_tables: MaxTables::new(42),
max_columns_per_table: MaxColumnsPerTable::new(10),
retention_period_ns: None,
partition_template: Default::default(),
},
},
}
}
pub fn build(self) -> NamespaceWrapper {
self.namespace
}
}
}

@ -1,20 +0,0 @@
use std::fmt::{Debug, Display};
use async_trait::async_trait;
use data_types::{Namespace, NamespaceId, NamespaceSchema};
pub mod catalog;
pub mod mock;
#[async_trait]
pub trait NamespacesSource: Debug + Display + Send + Sync {
/// Get the Namespace for a given namespace ID
///
/// This method performs retries.
async fn fetch_by_id(&self, ns: NamespaceId) -> Option<Namespace>;
/// Get the NamespaceSchema for a given namespace ID
///
/// todo: make this method perform retries.
async fn fetch_schema_by_id(&self, ns: NamespaceId) -> Option<NamespaceSchema>;
}

@ -1,107 +0,0 @@
use std::{fmt::Display, sync::Arc};
use async_trait::async_trait;
use data_types::{CompactionLevel, ParquetFileParams};
use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream};
use iox_query::exec::{Executor, ExecutorType};
use iox_time::Time;
use crate::partition_info::PartitionInfo;
use super::ParquetFileSink;
#[derive(Debug)]
pub struct DedicatedExecParquetFileSinkWrapper<T>
where
T: ParquetFileSink + 'static,
{
exec: Arc<Executor>,
inner: Arc<T>,
}
impl<T> DedicatedExecParquetFileSinkWrapper<T>
where
T: ParquetFileSink + 'static,
{
pub fn new(inner: T, exec: Arc<Executor>) -> Self {
Self {
inner: Arc::new(inner),
exec,
}
}
}
impl<T> Display for DedicatedExecParquetFileSinkWrapper<T>
where
T: ParquetFileSink + 'static,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "dedicated_exec({})", self.inner)
}
}
#[async_trait]
impl<T> ParquetFileSink for DedicatedExecParquetFileSinkWrapper<T>
where
T: ParquetFileSink + 'static,
{
async fn store(
&self,
stream: SendableRecordBatchStream,
partition: Arc<PartitionInfo>,
level: CompactionLevel,
max_l0_created_at: Time,
) -> Result<Option<ParquetFileParams>, DataFusionError> {
let inner = Arc::clone(&self.inner);
self.exec
.executor(ExecutorType::Reorg)
.spawn(async move {
inner
.store(stream, partition, level, max_l0_created_at)
.await
})
.await
.map_err(|e| DataFusionError::External(e.into()))?
}
}
#[cfg(test)]
mod tests {
use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
use schema::SchemaBuilder;
use crate::components::parquet_file_sink::mock::MockParquetFileSink;
use crate::test_utils::PartitionInfoBuilder;
use super::*;
#[test]
fn test_display() {
let sink = DedicatedExecParquetFileSinkWrapper::new(
MockParquetFileSink::new(true),
Arc::new(Executor::new_testing()),
);
assert_eq!(sink.to_string(), "dedicated_exec(mock)",)
}
#[tokio::test]
async fn test_panic() {
let sink = DedicatedExecParquetFileSinkWrapper::new(
MockParquetFileSink::new(true),
Arc::new(Executor::new_testing()),
);
let schema = SchemaBuilder::new().build().unwrap().as_arrow();
let stream = Box::pin(RecordBatchStreamAdapter::new(
Arc::clone(&schema),
futures::stream::once(async move { panic!("foo") }),
));
let partition = Arc::new(PartitionInfoBuilder::new().build());
let level = CompactionLevel::FileNonOverlapped;
let max_l0_created_at = Time::from_timestamp_nanos(0);
let err = sink
.store(stream, partition, level, max_l0_created_at)
.await
.unwrap_err();
assert_eq!(err.to_string(), "External error: Panic: foo",);
}
}

@ -1,80 +0,0 @@
use std::{fmt::Display, sync::Arc};
use async_trait::async_trait;
use data_types::{CompactionLevel, ParquetFileParams};
use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream};
use iox_time::Time;
use observability_deps::tracing::{info, warn};
use crate::partition_info::PartitionInfo;
use super::ParquetFileSink;
#[derive(Debug)]
pub struct LoggingParquetFileSinkWrapper<T>
where
T: ParquetFileSink,
{
inner: T,
}
impl<T> LoggingParquetFileSinkWrapper<T>
where
T: ParquetFileSink,
{
pub fn new(inner: T) -> Self {
Self { inner }
}
}
impl<T> Display for LoggingParquetFileSinkWrapper<T>
where
T: ParquetFileSink,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "logging({})", self.inner)
}
}
#[async_trait]
impl<T> ParquetFileSink for LoggingParquetFileSinkWrapper<T>
where
T: ParquetFileSink,
{
async fn store(
&self,
stream: SendableRecordBatchStream,
partition: Arc<PartitionInfo>,
level: CompactionLevel,
max_l0_created_at: Time,
) -> Result<Option<ParquetFileParams>, DataFusionError> {
let res = self
.inner
.store(stream, Arc::clone(&partition), level, max_l0_created_at)
.await;
match &res {
Ok(Some(f)) => {
info!(
partition_id = partition.partition_id.get(),
object_store_id=%f.object_store_id,
file_size_bytes=f.file_size_bytes,
"Stored file",
)
}
Ok(None) => {
warn!(
partition_id = partition.partition_id.get(),
"SplitExec produced an empty result stream"
);
}
Err(e) => {
warn!(
%e,
partition_id=partition.partition_id.get(),
"Error while uploading file",
);
}
}
res
}
}

@ -1,253 +0,0 @@
use std::{
fmt::Display,
sync::{Arc, Mutex},
};
use async_trait::async_trait;
use data_types::{ColumnSet, CompactionLevel, ParquetFileParams, Timestamp};
use datafusion::{
arrow::{datatypes::SchemaRef, record_batch::RecordBatch},
error::DataFusionError,
physical_plan::SendableRecordBatchStream,
};
use futures::TryStreamExt;
use iox_time::Time;
use uuid::Uuid;
use crate::partition_info::PartitionInfo;
use super::ParquetFileSink;
#[derive(Debug, Clone)]
pub struct StoredFile {
pub batches: Vec<RecordBatch>,
pub level: CompactionLevel,
pub partition: Arc<PartitionInfo>,
pub schema: SchemaRef,
}
#[derive(Debug)]
pub struct MockParquetFileSink {
filter_empty_files: bool,
records: Mutex<Vec<StoredFile>>,
}
impl MockParquetFileSink {
/// If `filter_empty_files` is true, parquet files with zero rows will not be recorded as `ParquetFile`s in the catalog.
#[cfg(test)]
pub fn new(filter_empty_files: bool) -> Self {
Self {
filter_empty_files,
records: Default::default(),
}
}
#[allow(dead_code)] // not used anywhere
pub fn records(&self) -> Vec<StoredFile> {
self.records.lock().expect("not poisoned").clone()
}
}
impl Display for MockParquetFileSink {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "mock")
}
}
#[async_trait]
impl ParquetFileSink for MockParquetFileSink {
async fn store(
&self,
stream: SendableRecordBatchStream,
partition: Arc<PartitionInfo>,
level: CompactionLevel,
max_l0_created_at: Time,
) -> Result<Option<ParquetFileParams>, DataFusionError> {
let schema = stream.schema();
let batches: Vec<_> = stream.try_collect().await?;
let row_count = batches.iter().map(|b| b.num_rows()).sum::<usize>();
let mut guard = self.records.lock().expect("not poisoned");
let out = ((row_count > 0) || !self.filter_empty_files).then(|| ParquetFileParams {
namespace_id: partition.namespace_id,
table_id: partition.table.id,
partition_id: partition.partition_id(),
object_store_id: Uuid::from_u128(guard.len() as u128),
min_time: Timestamp::new(0),
max_time: Timestamp::new(0),
file_size_bytes: 1,
row_count: 1,
compaction_level: level,
created_at: Timestamp::new(1),
column_set: ColumnSet::new(vec![]),
max_l0_created_at: max_l0_created_at.into(),
});
guard.push(StoredFile {
batches,
level,
partition,
schema,
});
Ok(out)
}
}
#[cfg(test)]
mod tests {
use arrow_util::assert_batches_eq;
use data_types::{NamespaceId, TableId};
use datafusion::{
arrow::{array::new_null_array, datatypes::DataType},
physical_plan::stream::RecordBatchStreamAdapter,
};
use schema::SchemaBuilder;
use crate::test_utils::PartitionInfoBuilder;
use super::*;
#[test]
fn test_display() {
assert_eq!(MockParquetFileSink::new(false).to_string(), "mock");
}
#[tokio::test]
async fn test_store_filter_empty() {
let sink = MockParquetFileSink::new(true);
let schema = SchemaBuilder::new()
.field("f", DataType::Int64)
.unwrap()
.build()
.unwrap()
.as_arrow();
let partition = Arc::new(PartitionInfoBuilder::new().build());
let level = CompactionLevel::FileNonOverlapped;
let max_l0_created_at = Time::from_timestamp_nanos(1);
let stream = Box::pin(RecordBatchStreamAdapter::new(
Arc::clone(&schema),
futures::stream::empty(),
));
assert_eq!(
sink.store(stream, Arc::clone(&partition), level, max_l0_created_at)
.await
.unwrap(),
None,
);
let record_batch = RecordBatch::new_empty(Arc::clone(&schema));
let record_batch_captured = record_batch.clone();
let stream = Box::pin(RecordBatchStreamAdapter::new(
Arc::clone(&schema),
futures::stream::once(async move { Ok(record_batch_captured) }),
));
assert_eq!(
sink.store(stream, Arc::clone(&partition), level, max_l0_created_at)
.await
.unwrap(),
None,
);
let record_batch = RecordBatch::try_new(
Arc::clone(&schema),
vec![new_null_array(&DataType::Int64, 1)],
)
.unwrap();
let record_batch_captured = record_batch.clone();
let stream = Box::pin(RecordBatchStreamAdapter::new(
Arc::clone(&schema),
futures::stream::once(async move { Ok(record_batch_captured) }),
));
let partition_id = partition.partition_id();
assert_eq!(
sink.store(stream, Arc::clone(&partition), level, max_l0_created_at)
.await
.unwrap(),
Some(ParquetFileParams {
namespace_id: NamespaceId::new(2),
table_id: TableId::new(3),
partition_id,
object_store_id: Uuid::from_u128(2),
min_time: Timestamp::new(0),
max_time: Timestamp::new(0),
file_size_bytes: 1,
row_count: 1,
compaction_level: CompactionLevel::FileNonOverlapped,
created_at: Timestamp::new(1),
column_set: ColumnSet::new([]),
max_l0_created_at: max_l0_created_at.into(),
}),
);
let records = sink.records();
assert_eq!(records.len(), 3);
assert_eq!(records[0].batches.len(), 0);
assert_eq!(records[0].schema, schema);
assert_eq!(records[0].level, level);
assert_eq!(records[0].partition, partition);
assert_batches_eq!(["+---+", "| f |", "+---+", "+---+",], &records[1].batches);
assert_eq!(records[1].batches.len(), 1);
assert_eq!(records[1].schema, schema);
assert_eq!(records[1].level, level);
assert_eq!(records[1].partition, partition);
assert_batches_eq!(
["+---+", "| f |", "+---+", "| |", "+---+",],
&records[2].batches
);
assert_eq!(records[2].batches.len(), 1);
assert_eq!(records[2].schema, schema);
assert_eq!(records[2].level, level);
assert_eq!(records[2].partition, partition);
}
#[tokio::test]
async fn test_store_keep_empty() {
let sink = MockParquetFileSink::new(false);
let schema = SchemaBuilder::new()
.field("f", DataType::Int64)
.unwrap()
.build()
.unwrap()
.as_arrow();
let partition = Arc::new(PartitionInfoBuilder::new().build());
let level = CompactionLevel::FileNonOverlapped;
let max_l0_created_at = Time::from_timestamp_nanos(1);
let stream = Box::pin(RecordBatchStreamAdapter::new(
Arc::clone(&schema),
futures::stream::empty(),
));
let partition_id = partition.partition_id();
assert_eq!(
sink.store(stream, Arc::clone(&partition), level, max_l0_created_at)
.await
.unwrap(),
Some(ParquetFileParams {
namespace_id: NamespaceId::new(2),
table_id: TableId::new(3),
partition_id,
object_store_id: Uuid::from_u128(0),
min_time: Timestamp::new(0),
max_time: Timestamp::new(0),
file_size_bytes: 1,
row_count: 1,
compaction_level: CompactionLevel::FileNonOverlapped,
created_at: Timestamp::new(1),
column_set: ColumnSet::new([]),
max_l0_created_at: max_l0_created_at.into(),
}),
);
let records = sink.records();
assert_eq!(records.len(), 1);
assert_eq!(records[0].batches.len(), 0);
assert_eq!(records[0].schema, schema);
assert_eq!(records[0].level, level);
assert_eq!(records[0].partition, partition);
}
}

@ -1,46 +0,0 @@
use std::{
fmt::{Debug, Display},
sync::Arc,
};
use async_trait::async_trait;
use data_types::{CompactionLevel, ParquetFileParams};
use datafusion::{error::DataFusionError, physical_plan::SendableRecordBatchStream};
use iox_time::Time;
use crate::partition_info::PartitionInfo;
pub mod dedicated;
pub mod logging;
pub mod mock;
pub mod object_store;
/// Writes streams of data to the object store as one or more parquet files
#[async_trait]
pub trait ParquetFileSink: Debug + Display + Send + Sync {
async fn store(
&self,
stream: SendableRecordBatchStream,
partition: Arc<PartitionInfo>,
level: CompactionLevel,
max_l0_created_at: Time,
) -> Result<Option<ParquetFileParams>, DataFusionError>;
}
#[async_trait]
impl<T> ParquetFileSink for Arc<T>
where
T: ParquetFileSink + ?Sized,
{
async fn store(
&self,
stream: SendableRecordBatchStream,
partition: Arc<PartitionInfo>,
level: CompactionLevel,
max_l0_created_at: Time,
) -> Result<Option<ParquetFileParams>, DataFusionError> {
self.as_ref()
.store(stream, partition, level, max_l0_created_at)
.await
}
}

@ -1,106 +0,0 @@
use std::{fmt::Display, sync::Arc};
use async_trait::async_trait;
use data_types::{CompactionLevel, ParquetFileParams};
use datafusion::{
error::DataFusionError, execution::memory_pool::MemoryPool,
physical_plan::SendableRecordBatchStream,
};
use iox_time::{Time, TimeProvider};
use parquet_file::{
metadata::IoxMetadata,
serialize::CodecError,
storage::{ParquetStorage, UploadError},
};
use uuid::Uuid;
use crate::partition_info::PartitionInfo;
use super::ParquetFileSink;
#[derive(Debug)]
pub struct ObjectStoreParquetFileSink {
// pool on which to register parquet buffering
pool: Arc<dyn MemoryPool>,
store: ParquetStorage,
time_provider: Arc<dyn TimeProvider>,
}
impl ObjectStoreParquetFileSink {
pub fn new(
pool: Arc<dyn MemoryPool>,
store: ParquetStorage,
time_provider: Arc<dyn TimeProvider>,
) -> Self {
Self {
pool,
store,
time_provider,
}
}
}
impl Display for ObjectStoreParquetFileSink {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "object_store")
}
}
#[async_trait]
impl ParquetFileSink for ObjectStoreParquetFileSink {
async fn store(
&self,
stream: SendableRecordBatchStream,
partition: Arc<PartitionInfo>,
level: CompactionLevel,
max_l0_created_at: Time,
) -> Result<Option<ParquetFileParams>, DataFusionError> {
let meta = IoxMetadata {
object_store_id: Uuid::new_v4(),
creation_timestamp: self.time_provider.now(),
namespace_id: partition.namespace_id,
namespace_name: partition.namespace_name.clone().into(),
table_id: partition.table.id,
table_name: partition.table.name.clone().into(),
partition_key: partition.partition_key.clone(),
compaction_level: level,
sort_key: partition.sort_key.clone(),
max_l0_created_at,
};
// Stream the record batches from the compaction exec, serialize
// them, and directly upload the resulting Parquet files to
// object storage.
let pool = Arc::clone(&self.pool);
let (parquet_meta, file_size) = match self
.store
.upload(stream, &partition.partition_id(), &meta, pool)
.await
{
Ok(v) => v,
Err(UploadError::Serialise(CodecError::NoRows | CodecError::NoRecordBatches)) => {
// This MAY be a bug.
//
// This also may happen legitimately, though very, very
// rarely. See test_empty_parquet_file_panic for an
// explanation.
return Ok(None);
}
Err(e) => {
return Err(e.into());
}
};
let parquet_file =
meta.to_parquet_file(partition.partition_id(), file_size, &parquet_meta, |name| {
partition
.table_schema
.columns
.get(name)
.expect("unknown column")
.id
});
Ok(Some(parquet_file))
}
}

@ -1,93 +0,0 @@
use std::{fmt::Display, sync::Arc};
use async_trait::async_trait;
use data_types::{CompactionLevel, ParquetFileParams};
use datafusion::physical_plan::SendableRecordBatchStream;
use futures::{stream::FuturesOrdered, TryFutureExt, TryStreamExt};
use iox_time::Time;
use crate::{
components::parquet_file_sink::ParquetFileSink, error::DynError, partition_info::PartitionInfo,
plan_ir::PlanIR,
};
use super::ParquetFilesSink;
#[derive(Debug)]
/// Writes parquet files to an inner [`ParquetFileSink`] (note the
/// lack of "s").
pub struct DispatchParquetFilesSink<T>
where
T: ParquetFileSink,
{
inner: Arc<T>,
}
impl<T> DispatchParquetFilesSink<T>
where
T: ParquetFileSink,
{
pub fn new(inner: T) -> Self {
Self {
inner: Arc::new(inner),
}
}
}
impl<T> Display for DispatchParquetFilesSink<T>
where
T: ParquetFileSink,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "dispatch({})", self.inner)
}
}
#[async_trait]
impl<T> ParquetFilesSink for DispatchParquetFilesSink<T>
where
T: ParquetFileSink + 'static,
{
async fn stream_into_file_sink(
&self,
streams: Vec<SendableRecordBatchStream>,
partition_info: Arc<PartitionInfo>,
target_level: CompactionLevel,
plan_ir: &PlanIR,
) -> Result<Vec<ParquetFileParams>, DynError> {
// compute max_l0_created_at
let max_l0_created_at: Time = plan_ir
.input_files()
.iter()
.map(|f| f.file.max_l0_created_at)
.max()
.expect("max_l0_created_at should have value")
.into();
let inner = Arc::clone(&self.inner);
streams
.into_iter()
.map(move |stream| {
let inner = Arc::clone(&inner);
let partition_info = Arc::clone(&partition_info);
async move {
inner
.store(stream, partition_info, target_level, max_l0_created_at)
.await
}
})
// NB: FuturesOrdered runs the futures concurrently while preserving output order
.collect::<FuturesOrdered<_>>()
// Discard the streams that resulted in empty output / no file uploaded
// to the object store.
.try_filter_map(|v| futures::future::ready(Ok(v)))
// Collect all the persisted parquet files together.
.try_collect::<Vec<_>>()
.map_err(|e| Box::new(e) as _)
.await
}
fn as_any(&self) -> &dyn std::any::Any {
self
}
}
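// Standalone sketch of the `FuturesOrdered` + `try_filter_map` pattern used above
// (hypothetical values, independent of the compactor types):
//
//   use futures::{stream::FuturesOrdered, TryStreamExt};
//   let results: Vec<u32> = (1..=3u32)
//       .map(|n| async move { Ok::<_, std::io::Error>((n % 2 == 1).then_some(n)) })
//       .collect::<FuturesOrdered<_>>()
//       .try_filter_map(|v| futures::future::ready(Ok(v)))
//       .try_collect()
//       .await?;
//   assert_eq!(results, vec![1, 3]);
//
// The futures are polled concurrently, but outputs are yielded in input order, and
// `None` values (analogous to empty parquet outputs) are discarded.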

@ -1,33 +0,0 @@
use std::{
fmt::{Debug, Display},
sync::Arc,
};
use async_trait::async_trait;
use data_types::{CompactionLevel, ParquetFileParams};
use datafusion::physical_plan::SendableRecordBatchStream;
use crate::{error::DynError, partition_info::PartitionInfo, plan_ir::PlanIR};
pub mod dispatch;
/// Writes streams, which correspond to `plan_ir.files()`, to
/// parquet files on object store, returning information about the
/// files that were created.
#[async_trait]
pub trait ParquetFilesSink: Debug + Display + Send + Sync {
/// Writes the streams of RecordBatches, corresponding to the list
/// of files on `plan_ir` to parquet files of the specified
/// `target_level` files on object store, and returns the details
/// needed to create entries in the catalog for those files.
async fn stream_into_file_sink(
&self,
streams: Vec<SendableRecordBatchStream>,
partition_info: Arc<PartitionInfo>,
target_level: CompactionLevel,
plan_ir: &PlanIR,
) -> Result<Vec<ParquetFileParams>, DynError>;
/// Return this files sink as an `Any` dynamic object
fn as_any(&self) -> &dyn std::any::Any;
}

@ -1,182 +0,0 @@
use std::{
fmt::{Debug, Display},
sync::Arc,
};
use async_trait::async_trait;
use backoff::{Backoff, BackoffConfig};
use data_types::{ParquetFile, PartitionId, TransitionPartitionId};
use iox_catalog::interface::Catalog;
use observability_deps::tracing::warn;
use super::{rate_limit::RateLimit, PartitionFilesSource};
#[async_trait]
pub(crate) trait CatalogQuerier: Send + Sync + Debug {
async fn get_partitions(
&self,
partition_id: PartitionId,
) -> Result<Vec<ParquetFile>, iox_catalog::interface::Error>;
}
/// A [`QueryRateLimiter`] applies a [`RateLimit`] to a [`CatalogQuerier`].
#[derive(Debug)]
pub struct QueryRateLimiter<T> {
inner: T,
rate_limit: RateLimit,
}
impl<T> QueryRateLimiter<T> {
pub fn new(inner: T, rate_limit: RateLimit) -> Self {
Self { inner, rate_limit }
}
}
#[async_trait]
impl<T> CatalogQuerier for QueryRateLimiter<T>
where
T: CatalogQuerier,
{
async fn get_partitions(
&self,
partition_id: PartitionId,
) -> Result<Vec<ParquetFile>, iox_catalog::interface::Error> {
while let Some(d) = self.rate_limit.can_proceed() {
warn!(%partition_id, "partition fetch rate limited");
// Don't busy-loop - wait the fraction of a second until a retry
// is allowed.
tokio::time::sleep(d).await;
}
self.inner.get_partitions(partition_id).await
}
}
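// Usage sketch (hypothetical numbers): wrap a catalog handle so at most 100
// partition-file fetches per second are issued, with a burst allowance of 10
// after idle periods:
//
//   let limited = QueryRateLimiter::new(catalog, RateLimit::new(100, 10));
//   let files = limited.get_partitions(PartitionId::new(42)).await?;
//
// `CatalogPartitionFilesSource` below defaults its catalog type parameter to
// exactly this wrapper.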
#[async_trait]
impl CatalogQuerier for Arc<dyn Catalog> {
async fn get_partitions(
&self,
partition_id: PartitionId,
) -> Result<Vec<ParquetFile>, iox_catalog::interface::Error> {
self.repositories()
.await
.parquet_files()
.list_by_partition_not_to_delete(&TransitionPartitionId::Deprecated(partition_id))
.await
}
}
#[derive(Debug)]
pub struct CatalogPartitionFilesSource<T = QueryRateLimiter<Arc<dyn Catalog>>> {
backoff_config: BackoffConfig,
catalog: T,
}
impl<T> CatalogPartitionFilesSource<T> {
pub fn new(backoff_config: BackoffConfig, catalog: T) -> Self {
Self {
backoff_config,
catalog,
}
}
}
impl<T> Display for CatalogPartitionFilesSource<T> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "catalog")
}
}
#[async_trait]
impl<T> PartitionFilesSource for CatalogPartitionFilesSource<T>
where
T: CatalogQuerier,
{
async fn fetch(&self, partition_id: PartitionId) -> Vec<ParquetFile> {
Backoff::new(&self.backoff_config)
.retry_all_errors("parquet_files_of_given_partition", || async {
self.catalog.get_partitions(partition_id).await
})
.await
.expect("retry forever")
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::{sync::Mutex, time::Duration};
use tokio::time::Instant;
/// A [`CatalogQuerier`] that always returns OK, and counts the number of
/// calls made.
#[derive(Debug, Default)]
struct MockInner(Mutex<usize>);
#[async_trait]
impl CatalogQuerier for &MockInner {
async fn get_partitions(
&self,
_partition_id: PartitionId,
) -> Result<Vec<ParquetFile>, iox_catalog::interface::Error> {
*self.0.lock().unwrap() += 1;
Ok(vec![])
}
}
#[tokio::test]
async fn test_rate_limit() {
const ALLOWED_PER_SECOND: usize = 100;
let inner = MockInner::default();
let r = QueryRateLimiter::new(
&inner,
RateLimit::new(ALLOWED_PER_SECOND, ALLOWED_PER_SECOND / 10),
);
let mut start = Instant::now();
// If there are ALLOWED_PER_SECOND queries allowed per second, then it
// should take 1 second to issue ALLOWED_PER_SECOND number of queries.
//
// Attempt to make 1/10th the number of permissible queries per second,
// which should take at least 1/10th of a second due to smoothing, so
// the test does not take so long.
for _ in 0..(ALLOWED_PER_SECOND / 10) {
r.get_partitions(PartitionId::new(42)).await.unwrap();
}
// It should have taken at least 1/10th of a second
let duration = Instant::now() - start;
assert!(duration > Duration::from_millis(100));
// It should have taken less than 2/10th of a second
// If this test is flaky, increase this a bit.
assert!(duration < Duration::from_millis(200));
// Exactly 1/10th the number of queries should be dispatched to the
// inner impl.
assert_eq!(*inner.0.lock().unwrap(), ALLOWED_PER_SECOND / 10);
// Now repeat with a delay to fill the burst balance.
start = Instant::now();
for i in 0..(ALLOWED_PER_SECOND / 10) {
r.get_partitions(PartitionId::new(42)).await.unwrap();
if i == 0 {
// delay until the next request, to fill the burst balance.
tokio::time::sleep(Duration::from_millis(90)).await;
}
}
// It should have taken at least 1/10th of a second
let duration = Instant::now() - start;
assert!(duration > Duration::from_millis(100));
// It should have taken less than 2/10th of a second
// If this test is flaky, increase this a bit.
assert!(duration < Duration::from_millis(200));
// Exactly 2/10th the number of queries should be dispatched to the
// inner impl.
assert_eq!(*inner.0.lock().unwrap(), 2 * ALLOWED_PER_SECOND / 10);
}
}

@ -1,102 +0,0 @@
use std::{collections::HashMap, fmt::Display};
use super::PartitionFilesSource;
use async_trait::async_trait;
use data_types::{ParquetFile, PartitionId, TransitionPartitionId};
#[derive(Debug)]
pub struct MockPartitionFilesSource {
// This complexity is because we're in the process of moving to partition hash IDs rather than
// partition catalog IDs, and Parquet files might only have the partition hash ID on their
// record, but the compactor deals with partition catalog IDs because we haven't transitioned
// it yet. This should become simpler when the transition is complete.
partition_lookup: HashMap<PartitionId, TransitionPartitionId>,
file_lookup: HashMap<TransitionPartitionId, Vec<ParquetFile>>,
}
impl MockPartitionFilesSource {
#[cfg(test)]
pub fn new(
partition_lookup: HashMap<PartitionId, TransitionPartitionId>,
parquet_files: Vec<ParquetFile>,
) -> Self {
let mut file_lookup: HashMap<TransitionPartitionId, Vec<ParquetFile>> = HashMap::new();
for file in parquet_files {
let files = file_lookup.entry(file.partition_id.clone()).or_default();
files.push(file);
}
Self {
partition_lookup,
file_lookup,
}
}
}
impl Display for MockPartitionFilesSource {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "mock")
}
}
#[async_trait]
impl PartitionFilesSource for MockPartitionFilesSource {
async fn fetch(&self, partition_id: PartitionId) -> Vec<ParquetFile> {
self.partition_lookup
.get(&partition_id)
.and_then(|partition_hash_id| self.file_lookup.get(partition_hash_id).cloned())
.unwrap_or_default()
}
}
#[cfg(test)]
mod tests {
use super::*;
use iox_tests::{partition_identifier, ParquetFileBuilder};
#[test]
fn test_display() {
assert_eq!(
MockPartitionFilesSource::new(Default::default(), Default::default()).to_string(),
"mock",
)
}
#[tokio::test]
async fn test_fetch() {
let partition_id_1 = PartitionId::new(1);
let partition_id_2 = PartitionId::new(2);
let partition_identifier_1 = partition_identifier(1);
let partition_identifier_2 = partition_identifier(2);
let f_1_1 = ParquetFileBuilder::new(1)
.with_partition(partition_identifier_1.clone())
.build();
let f_1_2 = ParquetFileBuilder::new(2)
.with_partition(partition_identifier_1.clone())
.build();
let f_2_1 = ParquetFileBuilder::new(3)
.with_partition(partition_identifier_2.clone())
.build();
let partition_lookup = HashMap::from([
(partition_id_1, partition_identifier_1.clone()),
(partition_id_2, partition_identifier_2.clone()),
]);
let files = vec![f_1_1.clone(), f_1_2.clone(), f_2_1.clone()];
let source = MockPartitionFilesSource::new(partition_lookup, files);
// different partitions
assert_eq!(
source.fetch(partition_id_1).await,
vec![f_1_1.clone(), f_1_2.clone()],
);
assert_eq!(source.fetch(partition_id_2).await, vec![f_2_1],);
// fetching does not drain
assert_eq!(source.fetch(partition_id_1).await, vec![f_1_1, f_1_2],);
// unknown partition => empty result
assert_eq!(source.fetch(PartitionId::new(3)).await, vec![],);
}
}

@ -1,19 +0,0 @@
use std::fmt::{Debug, Display};
use async_trait::async_trait;
use data_types::{ParquetFile, PartitionId};
pub mod catalog;
pub mod mock;
pub mod rate_limit;
/// Finds files in a partition for compaction
#[async_trait]
pub trait PartitionFilesSource: Debug + Display + Send + Sync {
/// Get undeleted parquet files for a given partition.
///
/// This MUST NOT perform any filtering (except for the "not marked for deletion" flag).
///
/// This method performs retries.
async fn fetch(&self, partition: PartitionId) -> Vec<ParquetFile>;
}

@ -1,75 +0,0 @@
use std::{sync::Mutex, time::Duration};
use tokio::time::Instant;
/// A rate limiter that smooths `N` queries evenly over each second.
#[derive(Debug)]
pub struct RateLimit {
last_query: Mutex<Instant>,
min_interval: Mutex<Duration>,
// If we compute a simple interval and enforce at least that much time between each query, a variable
// query rate will be unpredictably slower than the specified rate. So when the delay between queries
// is more than the average, allow it to 'bank' some extra credit - so a few can go quicker after a
// period of inactivity.
//
// burst_balance is how many queries can proceed immediately, without delay, due to the slowness of
// previous queries.
burst_balance: Mutex<usize>,
// max_burst is the maximum burst balance we allow. Without it, a large delay between queries could
// allow too large a burst of queries.
max_burst: Mutex<usize>,
}
impl RateLimit {
pub(crate) fn new(rps: usize, max_burst: usize) -> Self {
Self {
last_query: Mutex::new(Instant::now()),
min_interval: Mutex::new(Duration::from_secs(1) / rps as u32),
max_burst: Mutex::new(max_burst),
burst_balance: Mutex::new(0),
}
}
pub fn can_proceed(&self) -> Option<Duration> {
let mut last_query = self.last_query.lock().unwrap();
let mut burst_balance = self.burst_balance.lock().unwrap();
let interval = *self.min_interval.lock().unwrap();
let now = Instant::now();
// Has enough time passed since the last query was allowed?
let next_allowed = last_query.checked_add(interval).unwrap();
if now < next_allowed {
// This request came quickly after the prior one. If we have burst balance, we can use it,
// otherwise we'll have to wait.
if *burst_balance > 0 {
*burst_balance -= 1;
return None;
}
return Some(next_allowed - now);
}
// For simplicity, only add to the burst balance in whole numbers. So if the time since the last
// request was more than 2x the min interval, we can add to the burst balance.
let credits = now.duration_since(next_allowed).as_nanos() / interval.as_nanos();
if credits > 1 {
let max = *self.max_burst.lock().unwrap();
*burst_balance += credits as usize - 1;
if *burst_balance > max {
*burst_balance = max;
}
}
*last_query = now;
None
}
pub fn update_rps(&self, rps: usize, max_burst: usize) {
*self.min_interval.lock().unwrap() = Duration::from_secs(1) / rps as u32;
*self.max_burst.lock().unwrap() = max_burst;
let mut burst_balance = self.burst_balance.lock().unwrap();
if *burst_balance > max_burst {
*burst_balance = max_burst;
}
}
}
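// Worked example of the credit math above (illustrative numbers): with rps = 100,
// min_interval is 10ms. If 35ms elapse between queries, then now - next_allowed = 25ms
// and credits = 25 / 10 = 2, so the burst balance grows by credits - 1 = 1 (capped at
// max_burst). A caller arriving only 5ms after the previous query can then proceed
// immediately by spending that banked credit instead of sleeping.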

@ -1,99 +0,0 @@
use std::{fmt::Display, sync::Arc};
use async_trait::async_trait;
use data_types::ParquetFile;
use crate::{error::DynError, PartitionInfo};
use super::PartitionFilter;
#[derive(Debug)]
pub struct AndPartitionFilter {
filters: Vec<Arc<dyn PartitionFilter>>,
}
impl AndPartitionFilter {
pub fn new(filters: Vec<Arc<dyn PartitionFilter>>) -> Self {
Self { filters }
}
}
impl Display for AndPartitionFilter {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "and([")?;
for (i, sub) in self.filters.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
write!(f, "{sub}")?;
}
write!(f, "])")
}
}
#[async_trait]
impl PartitionFilter for AndPartitionFilter {
async fn apply(
&self,
partition_info: &PartitionInfo,
files: &[ParquetFile],
) -> Result<bool, DynError> {
for filter in &self.filters {
if !filter.apply(partition_info, files).await? {
return Ok(false);
}
}
Ok(true)
}
}
#[cfg(test)]
mod tests {
use crate::{
components::partition_filter::{
has_files::HasFilesPartitionFilter, FalsePartitionFilter, TruePartitionFilter,
},
test_utils::PartitionInfoBuilder,
};
use super::*;
#[test]
fn test_display() {
let has_files = Arc::new(HasFilesPartitionFilter::new());
let max_num_files = Arc::new(TruePartitionFilter::new());
let filter = AndPartitionFilter::new(vec![has_files, max_num_files]);
assert_eq!(format!("{filter}"), "and([has_files, true])");
}
#[tokio::test]
async fn test_apply() {
let p_info = Arc::new(PartitionInfoBuilder::new().build());
let filter = AndPartitionFilter::new(vec![
Arc::new(TruePartitionFilter::new()),
Arc::new(TruePartitionFilter::new()),
]);
assert!(filter.apply(&p_info, &[]).await.unwrap());
let filter = AndPartitionFilter::new(vec![
Arc::new(TruePartitionFilter::new()),
Arc::new(FalsePartitionFilter::new()),
]);
assert!(!filter.apply(&p_info, &[]).await.unwrap());
let filter = AndPartitionFilter::new(vec![
Arc::new(FalsePartitionFilter::new()),
Arc::new(TruePartitionFilter::new()),
]);
assert!(!filter.apply(&p_info, &[]).await.unwrap());
let filter = AndPartitionFilter::new(vec![
Arc::new(FalsePartitionFilter::new()),
Arc::new(FalsePartitionFilter::new()),
]);
assert!(!filter.apply(&p_info, &[]).await.unwrap());
}
}

@ -1,123 +0,0 @@
use std::fmt::Display;
use async_trait::async_trait;
use data_types::ParquetFile;
use crate::{components::file_filter::FileFilter, error::DynError, PartitionInfo};
use super::PartitionFilter;
/// A partition filter that matches partitions that have at least `min_num_files` files
/// matching the given file filter.
#[derive(Debug)]
pub struct GreaterMatchingFilesPartitionFilter<T>
where
T: FileFilter,
{
filter: T,
min_num_files: usize,
}
impl<T> GreaterMatchingFilesPartitionFilter<T>
where
T: FileFilter,
{
pub fn new(filter: T, min_num_files: usize) -> Self {
Self {
filter,
min_num_files,
}
}
}
impl<T> Display for GreaterMatchingFilesPartitionFilter<T>
where
T: FileFilter,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"greater_matching_file({}, {})",
self.filter, self.min_num_files
)
}
}
#[async_trait]
impl<T> PartitionFilter for GreaterMatchingFilesPartitionFilter<T>
where
T: FileFilter,
{
async fn apply(
&self,
_partition_info: &PartitionInfo,
files: &[ParquetFile],
) -> Result<bool, DynError> {
Ok(files.iter().filter(|file| self.filter.apply(file)).count() >= self.min_num_files)
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use data_types::CompactionLevel;
use crate::{
components::file_filter::level_range::LevelRangeFileFilter,
test_utils::PartitionInfoBuilder,
};
use iox_tests::ParquetFileBuilder;
use super::*;
#[test]
fn test_display() {
let filter = GreaterMatchingFilesPartitionFilter::new(
LevelRangeFileFilter::new(
CompactionLevel::FileNonOverlapped..=CompactionLevel::FileNonOverlapped,
),
1,
);
assert_eq!(
filter.to_string(),
"greater_matching_file(level_range(1..=1), 1)"
);
}
#[tokio::test]
async fn test_apply() {
let filter = GreaterMatchingFilesPartitionFilter::new(
LevelRangeFileFilter::new(
CompactionLevel::FileNonOverlapped..=CompactionLevel::FileNonOverlapped,
),
2,
);
let f1 = ParquetFileBuilder::new(0)
.with_compaction_level(CompactionLevel::FileNonOverlapped)
.build();
let f2 = ParquetFileBuilder::new(1)
.with_compaction_level(CompactionLevel::FileNonOverlapped)
.build();
let f3 = ParquetFileBuilder::new(2)
.with_compaction_level(CompactionLevel::FileNonOverlapped)
.build();
let p_info = Arc::new(PartitionInfoBuilder::new().build());
// empty, not enough matching
assert!(!filter.apply(&p_info, &[]).await.unwrap());
// Not enough matching
assert!(!filter.apply(&p_info, &[f1.clone()]).await.unwrap());
// enough matching
assert!(filter
.apply(&p_info, &[f1.clone(), f2.clone()])
.await
.unwrap());
// enough matching
assert!(filter.apply(&p_info, &[f1, f2, f3]).await.unwrap());
}
}

@ -1,139 +0,0 @@
use std::fmt::Display;
use async_trait::async_trait;
use data_types::ParquetFile;
use crate::{components::file_filter::FileFilter, error::DynError, PartitionInfo};
use super::PartitionFilter;
/// A partition filter that matches partitions that have files
/// matching the given file filter whose total size is >= max_desired_file_bytes.
/// The ideas behind this:
/// 1. Do not compact a large input size, to avoid hitting OOM/crashing.
/// 2. Do not compact a too-large input size that leads to an unnecessary split into many files.
/// - Because we limit the size of an output file, if the compacted result is too large, we will split it into many files.
/// - Because Level-1 files do not overlap, it is a waste to compact too large a size and then split.
#[derive(Debug)]
pub struct GreaterSizeMatchingFilesPartitionFilter<T>
where
T: FileFilter,
{
filter: T,
max_desired_file_bytes: u64,
}
impl<T> GreaterSizeMatchingFilesPartitionFilter<T>
where
T: FileFilter,
{
pub fn new(filter: T, max_desired_file_bytes: u64) -> Self {
Self {
filter,
max_desired_file_bytes,
}
}
}
impl<T> Display for GreaterSizeMatchingFilesPartitionFilter<T>
where
T: FileFilter,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"greater_size_matching_file({}, {})",
self.filter, self.max_desired_file_bytes
)
}
}
#[async_trait]
impl<T> PartitionFilter for GreaterSizeMatchingFilesPartitionFilter<T>
where
T: FileFilter,
{
async fn apply(
&self,
_partition_info: &PartitionInfo,
files: &[ParquetFile],
) -> Result<bool, DynError> {
// Matching files
let matching_files: Vec<&ParquetFile> = files
.iter()
.filter(|file| self.filter.apply(file))
.collect();
// Sum of file_size_bytes matching files
let sum: i64 = matching_files.iter().map(|file| file.file_size_bytes).sum();
Ok(sum >= self.max_desired_file_bytes as i64)
}
}
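// Worked example (mirroring the tests below): with max_desired_file_bytes = 15,
// matching files of sizes 10 and 14 sum to 24 >= 15, so the partition passes;
// a single 10-byte or 14-byte matching file does not.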
#[cfg(test)]
mod tests {
use std::sync::Arc;
use data_types::CompactionLevel;
use crate::{
components::file_filter::level_range::LevelRangeFileFilter,
test_utils::PartitionInfoBuilder,
};
use iox_tests::ParquetFileBuilder;
use super::*;
#[test]
fn test_display() {
let filter = GreaterSizeMatchingFilesPartitionFilter::new(
LevelRangeFileFilter::new(
CompactionLevel::FileNonOverlapped..=CompactionLevel::FileNonOverlapped,
),
1,
);
assert_eq!(
filter.to_string(),
"greater_size_matching_file(level_range(1..=1), 1)"
);
}
#[tokio::test]
async fn test_apply() {
let filter = GreaterSizeMatchingFilesPartitionFilter::new(
LevelRangeFileFilter::new(
CompactionLevel::FileNonOverlapped..=CompactionLevel::FileNonOverlapped,
),
15,
);
let f1 = ParquetFileBuilder::new(0)
.with_compaction_level(CompactionLevel::FileNonOverlapped)
.with_file_size_bytes(10)
.build();
let f2 = ParquetFileBuilder::new(1)
.with_compaction_level(CompactionLevel::FileNonOverlapped)
.with_file_size_bytes(14)
.build();
let f3 = ParquetFileBuilder::new(2)
.with_compaction_level(CompactionLevel::FileNonOverlapped)
.with_file_size_bytes(15)
.build();
let p_info = Arc::new(PartitionInfoBuilder::new().build());
// empty, not large enough
assert!(!filter.apply(&p_info, &[]).await.unwrap());
// Not large enough
assert!(!filter.apply(&p_info, &[f1.clone()]).await.unwrap());
assert!(!filter.apply(&p_info, &[f2.clone()]).await.unwrap());
// large enough
assert!(filter
.apply(&p_info, &[f1.clone(), f2.clone()])
.await
.unwrap());
assert!(filter.apply(&p_info, &[f3.clone()]).await.unwrap());
assert!(filter.apply(&p_info, &[f1, f2, f3]).await.unwrap());
}
}

@ -1,59 +0,0 @@
use std::fmt::Display;
use async_trait::async_trait;
use crate::{error::DynError, PartitionInfo};
use super::PartitionFilter;
#[derive(Debug, Default)]
pub struct HasFilesPartitionFilter;
impl HasFilesPartitionFilter {
pub fn new() -> Self {
Self
}
}
impl Display for HasFilesPartitionFilter {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "has_files")
}
}
#[async_trait]
impl PartitionFilter for HasFilesPartitionFilter {
async fn apply(
&self,
_partition_info: &PartitionInfo,
files: &[data_types::ParquetFile],
) -> Result<bool, DynError> {
Ok(!files.is_empty())
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use iox_tests::ParquetFileBuilder;
use crate::test_utils::PartitionInfoBuilder;
use super::*;
#[test]
fn test_display() {
assert_eq!(HasFilesPartitionFilter::new().to_string(), "has_files");
}
#[tokio::test]
async fn test_apply() {
let filter = HasFilesPartitionFilter::new();
let f = ParquetFileBuilder::new(0).build();
let p_info = Arc::new(PartitionInfoBuilder::new().build());
assert!(!filter.apply(&p_info, &[]).await.unwrap());
assert!(filter.apply(&p_info, &[f]).await.unwrap());
}
}

@ -1,98 +0,0 @@
use std::fmt::Display;
use async_trait::async_trait;
use data_types::ParquetFile;
use crate::{components::file_filter::FileFilter, error::DynError, PartitionInfo};
use super::PartitionFilter;
#[derive(Debug)]
pub struct HasMatchingFilePartitionFilter<T>
where
T: FileFilter,
{
filter: T,
}
impl<T> HasMatchingFilePartitionFilter<T>
where
T: FileFilter,
{
pub fn new(filter: T) -> Self {
Self { filter }
}
}
impl<T> Display for HasMatchingFilePartitionFilter<T>
where
T: FileFilter,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "has_matching_file({})", self.filter)
}
}
#[async_trait]
impl<T> PartitionFilter for HasMatchingFilePartitionFilter<T>
where
T: FileFilter,
{
async fn apply(
&self,
_partition_info: &PartitionInfo,
files: &[ParquetFile],
) -> Result<bool, DynError> {
Ok(files.iter().any(|file| self.filter.apply(file)))
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use data_types::CompactionLevel;
use crate::{
components::file_filter::level_range::LevelRangeFileFilter,
test_utils::PartitionInfoBuilder,
};
use iox_tests::ParquetFileBuilder;
use super::*;
#[test]
fn test_display() {
let filter = HasMatchingFilePartitionFilter::new(LevelRangeFileFilter::new(
CompactionLevel::Initial..=CompactionLevel::FileNonOverlapped,
));
assert_eq!(filter.to_string(), "has_matching_file(level_range(0..=1))");
}
#[tokio::test]
async fn test_apply() {
let filter = HasMatchingFilePartitionFilter::new(LevelRangeFileFilter::new(
CompactionLevel::Initial..=CompactionLevel::FileNonOverlapped,
));
let f1 = ParquetFileBuilder::new(0)
.with_compaction_level(CompactionLevel::FileNonOverlapped)
.build();
let f2 = ParquetFileBuilder::new(1)
.with_compaction_level(CompactionLevel::Final)
.build();
let p_info = Arc::new(PartitionInfoBuilder::new().build());
// empty
assert!(!filter.apply(&p_info, &[]).await.unwrap());
// all matching
assert!(filter.apply(&p_info, &[f1.clone()]).await.unwrap());
// none matching
assert!(!filter.apply(&p_info, &[f2.clone()]).await.unwrap());
// some matching
assert!(filter.apply(&p_info, &[f1, f2]).await.unwrap());
}
}

@ -1,110 +0,0 @@
use std::fmt::Display;
use async_trait::async_trait;
use data_types::ParquetFile;
use observability_deps::tracing::{debug, error, info};
use crate::{error::DynError, PartitionInfo};
use super::PartitionFilter;
#[derive(Debug)]
pub struct LoggingPartitionFilterWrapper<T>
where
T: PartitionFilter,
{
inner: T,
filter_type: &'static str,
}
impl<T> LoggingPartitionFilterWrapper<T>
where
T: PartitionFilter,
{
pub fn new(inner: T, filter_type: &'static str) -> Self {
Self { inner, filter_type }
}
}
impl<T> Display for LoggingPartitionFilterWrapper<T>
where
T: PartitionFilter,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "logging({}, {})", self.inner, self.filter_type)
}
}
#[async_trait]
impl<T> PartitionFilter for LoggingPartitionFilterWrapper<T>
where
T: PartitionFilter,
{
async fn apply(
&self,
partition_info: &PartitionInfo,
files: &[ParquetFile],
) -> Result<bool, DynError> {
let res = self.inner.apply(partition_info, files).await;
match &res {
Ok(true) => {
debug!(
partition_id = partition_info.partition_id.get(),
filter_type = self.filter_type,
"NOT filtered partition"
);
}
Ok(false) => {
info!(
partition_id = partition_info.partition_id.get(),
filter_type = self.filter_type,
"filtered partition"
);
}
Err(e) => {
error!(partition_id = partition_info.partition_id.get(), filter_type = self.filter_type, %e, "error filtering filtered partition");
}
}
res
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use test_helpers::tracing::TracingCapture;
use crate::{
components::partition_filter::has_files::HasFilesPartitionFilter,
test_utils::PartitionInfoBuilder,
};
use iox_tests::ParquetFileBuilder;
use super::*;
#[test]
fn test_display() {
let filter = LoggingPartitionFilterWrapper::new(HasFilesPartitionFilter::new(), "test");
assert_eq!(filter.to_string(), "logging(has_files, test)");
}
#[tokio::test]
async fn test_apply() {
let filter = LoggingPartitionFilterWrapper::new(HasFilesPartitionFilter::new(), "test");
let f = ParquetFileBuilder::new(0).build();
let p_info1 = Arc::new(PartitionInfoBuilder::new().with_partition_id(1).build());
let p_info2 = Arc::new(PartitionInfoBuilder::new().with_partition_id(2).build());
let capture = TracingCapture::new();
assert!(!filter.apply(&p_info1, &[]).await.unwrap());
assert!(filter.apply(&p_info2, &[f]).await.unwrap());
assert_eq!(
capture.to_string(),
"level = INFO; message = filtered partition; partition_id = 1; filter_type = \"test\";
level = DEBUG; message = NOT filtered partition; partition_id = 2; filter_type = \"test\"; ",
);
}
}

View File

@ -1,116 +0,0 @@
use std::fmt::Display;
use async_trait::async_trait;
use data_types::ParquetFile;
use crate::{
error::{DynError, ErrorKind, SimpleError},
PartitionInfo,
};
use super::PartitionFilter;
#[derive(Debug)]
pub struct MaxNumColumnsPartitionFilter {
max_num_columns: usize,
}
impl MaxNumColumnsPartitionFilter {
pub fn new(max_num_columns: usize) -> Self {
Self { max_num_columns }
}
}
impl Display for MaxNumColumnsPartitionFilter {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "max_num_columns")
}
}
#[async_trait]
impl PartitionFilter for MaxNumColumnsPartitionFilter {
async fn apply(
&self,
partition_info: &PartitionInfo,
_files: &[ParquetFile],
) -> Result<bool, DynError> {
let col_count = partition_info.column_count();
if col_count <= self.max_num_columns {
Ok(true)
} else {
Err(SimpleError::new(
ErrorKind::OutOfMemory,
format!(
"table of partition {} has {} number of columns, limit is {}",
partition_info.partition_id, col_count, self.max_num_columns
),
)
.into())
}
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use crate::{error::ErrorKindExt, test_utils::PartitionInfoBuilder};
use iox_tests::ParquetFileBuilder;
use super::*;
#[test]
fn test_display() {
assert_eq!(
MaxNumColumnsPartitionFilter::new(1).to_string(),
"max_num_columns"
);
}
#[tokio::test]
async fn test_apply_skip() {
let filter = MaxNumColumnsPartitionFilter::new(2);
let f1 = ParquetFileBuilder::new(1).with_file_size_bytes(7).build();
let f2 = ParquetFileBuilder::new(2).with_file_size_bytes(4).build();
let p_info = Arc::new(
PartitionInfoBuilder::new()
.with_partition_id(1)
.with_num_columns(3)
.build(),
);
let err = filter.apply(&p_info, &[f1, f2]).await.unwrap_err();
assert_eq!(err.classify(), ErrorKind::OutOfMemory);
assert_eq!(
err.to_string(),
"table of partition 1 has 3 number of columns, limit is 2"
);
// empty files
// This filter does not look at the file set, so empty files produce the same error
let err = filter.apply(&p_info, &[]).await.unwrap_err();
assert_eq!(err.classify(), ErrorKind::OutOfMemory);
assert_eq!(
err.to_string(),
"table of partition 1 has 3 number of columns, limit is 2"
);
}
#[tokio::test]
async fn test_apply() {
let filter = MaxNumColumnsPartitionFilter::new(5);
let f1 = ParquetFileBuilder::new(1).with_file_size_bytes(7).build();
let f2 = ParquetFileBuilder::new(2).with_file_size_bytes(4).build();
let p_info = Arc::new(
PartitionInfoBuilder::new()
.with_partition_id(1)
.with_num_columns(3)
.build(),
);
assert!(filter.apply(&p_info, &[f1, f2]).await.unwrap());
assert!(filter.apply(&p_info, &[]).await.unwrap());
}
}

View File

@ -1,156 +0,0 @@
use std::fmt::Display;
use async_trait::async_trait;
use data_types::ParquetFile;
use metric::{Registry, U64Counter};
use crate::{error::DynError, PartitionInfo};
use super::PartitionFilter;
const METRIC_NAME_PARTITION_FILTER_COUNT: &str = "iox_compactor_partition_filter_count";
#[derive(Debug)]
pub struct MetricsPartitionFilterWrapper<T>
where
T: PartitionFilter,
{
pass_counter: U64Counter,
filter_counter: U64Counter,
error_counter: U64Counter,
inner: T,
filter_type: &'static str,
}
impl<T> MetricsPartitionFilterWrapper<T>
where
T: PartitionFilter,
{
pub fn new(inner: T, registry: &Registry, filter_type: &'static str) -> Self {
let metric = registry.register_metric::<U64Counter>(
METRIC_NAME_PARTITION_FILTER_COUNT,
"Number of times the compactor fetched fresh partitions",
);
let pass_counter = metric.recorder(&[("result", "pass"), ("filter_type", filter_type)]);
let filter_counter = metric.recorder(&[("result", "filter"), ("filter_type", filter_type)]);
let error_counter = metric.recorder(&[("result", "error"), ("filter_type", filter_type)]);
Self {
pass_counter,
filter_counter,
error_counter,
inner,
filter_type,
}
}
}
impl<T> Display for MetricsPartitionFilterWrapper<T>
where
T: PartitionFilter,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "metrics({}, {})", self.inner, self.filter_type)
}
}
#[async_trait]
impl<T> PartitionFilter for MetricsPartitionFilterWrapper<T>
where
T: PartitionFilter,
{
async fn apply(
&self,
partition_info: &PartitionInfo,
files: &[ParquetFile],
) -> Result<bool, DynError> {
let res = self.inner.apply(partition_info, files).await;
match res {
Ok(true) => {
self.pass_counter.inc(1);
}
Ok(false) => {
self.filter_counter.inc(1);
}
Err(_) => {
self.error_counter.inc(1);
}
}
res
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use metric::{assert_counter, Attributes};
use crate::{
components::partition_filter::has_files::HasFilesPartitionFilter,
test_utils::PartitionInfoBuilder,
};
use iox_tests::ParquetFileBuilder;
use super::*;
#[test]
fn test_display() {
let registry = Registry::new();
let filter =
MetricsPartitionFilterWrapper::new(HasFilesPartitionFilter::new(), &registry, "test");
assert_eq!(filter.to_string(), "metrics(has_files, test)",);
}
#[tokio::test]
async fn test_apply() {
let registry = Registry::new();
let filter =
MetricsPartitionFilterWrapper::new(HasFilesPartitionFilter::new(), &registry, "test");
let p_info = Arc::new(PartitionInfoBuilder::new().with_partition_id(1).build());
let f = ParquetFileBuilder::new(0).build();
assert_pass_counter(&registry, 0);
assert_filter_counter(&registry, 0);
assert_error_counter(&registry, 0);
assert!(!filter.apply(&p_info, &[]).await.unwrap());
assert!(!filter.apply(&p_info, &[]).await.unwrap());
assert!(filter.apply(&p_info, &[f]).await.unwrap());
assert_pass_counter(&registry, 1);
assert_filter_counter(&registry, 2);
assert_error_counter(&registry, 0);
}
fn assert_pass_counter(registry: &Registry, value: u64) {
assert_counter!(
registry,
U64Counter,
METRIC_NAME_PARTITION_FILTER_COUNT,
labels = Attributes::from(&[("result", "pass"), ("filter_type", "test")]),
value = value,
);
}
fn assert_filter_counter(registry: &Registry, value: u64) {
assert_counter!(
registry,
U64Counter,
METRIC_NAME_PARTITION_FILTER_COUNT,
labels = Attributes::from(&[("result", "filter"), ("filter_type", "test")]),
value = value,
);
}
fn assert_error_counter(registry: &Registry, value: u64) {
assert_counter!(
registry,
U64Counter,
METRIC_NAME_PARTITION_FILTER_COUNT,
labels = Attributes::from(&[("result", "error"), ("filter_type", "test")]),
value = value,
);
}
}
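A self-contained sketch of the decorator pattern the logging and metrics wrappers above share: an inner filter is wrapped, its verdict is passed through unchanged, and each outcome (pass / filter / error) is tallied on the side. The `Filter` trait and atomic counters below are simplified stand-ins for the crate's async `PartitionFilter` and `metric` types, not the real API.
use std::fmt::Display;
use std::sync::atomic::{AtomicU64, Ordering};
// Simplified, synchronous stand-in for the crate's PartitionFilter trait.
trait Filter: Display {
    fn apply(&self, files: &[u64]) -> Result<bool, String>;
}
// Minimal inner filter: "does the partition have any files?"
struct HasFiles;
impl Display for HasFiles {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "has_files")
    }
}
impl Filter for HasFiles {
    fn apply(&self, files: &[u64]) -> Result<bool, String> {
        Ok(!files.is_empty())
    }
}
// Decorator that tallies outcomes without changing the inner verdict.
struct MetricsWrapper<T: Filter> {
    inner: T,
    pass: AtomicU64,
    filtered: AtomicU64,
    errors: AtomicU64,
}
impl<T: Filter> MetricsWrapper<T> {
    fn new(inner: T) -> Self {
        Self {
            inner,
            pass: AtomicU64::new(0),
            filtered: AtomicU64::new(0),
            errors: AtomicU64::new(0),
        }
    }
}
impl<T: Filter> Display for MetricsWrapper<T> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "metrics({})", self.inner)
    }
}
impl<T: Filter> Filter for MetricsWrapper<T> {
    fn apply(&self, files: &[u64]) -> Result<bool, String> {
        let res = self.inner.apply(files);
        match &res {
            Ok(true) => self.pass.fetch_add(1, Ordering::Relaxed),
            Ok(false) => self.filtered.fetch_add(1, Ordering::Relaxed),
            Err(_) => self.errors.fetch_add(1, Ordering::Relaxed),
        };
        res
    }
}
fn main() {
    let filter = MetricsWrapper::new(HasFiles);
    assert_eq!(filter.to_string(), "metrics(has_files)");
    assert!(!filter.apply(&[]).unwrap());
    assert!(filter.apply(&[1]).unwrap());
    assert_eq!(filter.filtered.load(Ordering::Relaxed), 1);
    assert_eq!(filter.pass.load(Ordering::Relaxed), 1);
}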

View File

@ -1,88 +0,0 @@
use std::fmt::{Debug, Display};
use async_trait::async_trait;
use data_types::ParquetFile;
use crate::{error::DynError, PartitionInfo};
pub mod and;
pub mod greater_matching_files;
pub mod greater_size_matching_files;
pub mod has_files;
pub mod has_matching_file;
pub mod logging;
pub mod max_num_columns;
pub mod metrics;
pub mod or;
/// Filters partition based on ID and Parquet files.
///
/// May return an error. In this case, the partition will be marked as "skipped".
#[async_trait]
pub trait PartitionFilter: Debug + Display + Send + Sync {
/// Return `true` if the compactor should run a compaction on this partition. Return `false`
/// if this partition does not need any more compaction.
async fn apply(
&self,
partition_info: &PartitionInfo,
files: &[ParquetFile],
) -> Result<bool, DynError>;
}
// Simple partition filters for testing purposes
/// True partition filter.
#[derive(Debug)]
pub struct TruePartitionFilter;
impl Display for TruePartitionFilter {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "true")
}
}
#[async_trait]
impl PartitionFilter for TruePartitionFilter {
async fn apply(
&self,
_partition_info: &PartitionInfo,
_files: &[ParquetFile],
) -> Result<bool, DynError> {
Ok(true)
}
}
impl TruePartitionFilter {
#[allow(dead_code)]
pub fn new() -> Self {
Self
}
}
/// False partition filter.
#[derive(Debug)]
pub struct FalsePartitionFilter;
impl Display for FalsePartitionFilter {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "false")
}
}
#[async_trait]
impl PartitionFilter for FalsePartitionFilter {
async fn apply(
&self,
_partition_info: &PartitionInfo,
_files: &[ParquetFile],
) -> Result<bool, DynError> {
Ok(false)
}
}
impl FalsePartitionFilter {
#[allow(dead_code)]
pub fn new() -> Self {
Self
}
}

View File

@ -1,107 +0,0 @@
use std::{fmt::Display, sync::Arc};
use async_trait::async_trait;
use data_types::ParquetFile;
use crate::{error::DynError, PartitionInfo};
use super::PartitionFilter;
#[derive(Debug)]
pub struct OrPartitionFilter {
filters: Vec<Arc<dyn PartitionFilter>>,
}
impl OrPartitionFilter {
pub fn new(filters: Vec<Arc<dyn PartitionFilter>>) -> Self {
Self { filters }
}
}
impl Display for OrPartitionFilter {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "or([")?;
for (i, sub) in self.filters.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
write!(f, "{sub}")?;
}
write!(f, "])")
}
}
#[async_trait]
impl PartitionFilter for OrPartitionFilter {
async fn apply(
&self,
partition_info: &PartitionInfo,
files: &[ParquetFile],
) -> Result<bool, DynError> {
for filter in &self.filters {
if filter.apply(partition_info, files).await? {
return Ok(true);
}
}
Ok(false)
}
}
#[cfg(test)]
mod tests {
use crate::{
components::partition_filter::{
has_files::HasFilesPartitionFilter, FalsePartitionFilter, TruePartitionFilter,
},
test_utils::PartitionInfoBuilder,
};
use super::*;
#[test]
fn test_display() {
let has_files = Arc::new(HasFilesPartitionFilter::new());
let max_num_files = Arc::new(TruePartitionFilter::new());
let filter = OrPartitionFilter::new(vec![has_files, max_num_files]);
assert_eq!(format!("{filter}"), "or([has_files, true])");
}
#[tokio::test]
async fn test_apply() {
let p_info = Arc::new(PartitionInfoBuilder::new().with_partition_id(1).build());
let filter = OrPartitionFilter::new(vec![
Arc::new(TruePartitionFilter::new()),
Arc::new(TruePartitionFilter::new()),
]);
assert!(filter.apply(&p_info, &[]).await.unwrap());
let filter = OrPartitionFilter::new(vec![
Arc::new(TruePartitionFilter::new()),
Arc::new(FalsePartitionFilter::new()),
]);
assert!(filter.apply(&p_info, &[]).await.unwrap());
let filter = OrPartitionFilter::new(vec![
Arc::new(FalsePartitionFilter::new()),
Arc::new(TruePartitionFilter::new()),
]);
assert!(filter.apply(&p_info, &[]).await.unwrap());
let filter = OrPartitionFilter::new(vec![
Arc::new(FalsePartitionFilter::new()),
Arc::new(FalsePartitionFilter::new()),
]);
assert!(!filter.apply(&p_info, &[]).await.unwrap());
let filter = OrPartitionFilter::new(vec![
Arc::new(FalsePartitionFilter::new()),
Arc::new(FalsePartitionFilter::new()),
Arc::new(TruePartitionFilter::new()),
]);
assert!(filter.apply(&p_info, &[]).await.unwrap());
}
}
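A minimal sketch of the short-circuit semantics OrPartitionFilter implements: evaluation stops at the first sub-filter that passes, so later sub-filters (and any side effects or errors they would produce) never run. The `Filter` trait and `Const` filter are illustrative stand-ins, not the crate's types.
trait Filter {
    fn apply(&self) -> Result<bool, String>;
}
// A filter with a fixed verdict, like the True/False test filters above.
struct Const(bool);
impl Filter for Const {
    fn apply(&self) -> Result<bool, String> {
        Ok(self.0)
    }
}
struct Or(Vec<Box<dyn Filter>>);
impl Filter for Or {
    fn apply(&self) -> Result<bool, String> {
        for f in &self.0 {
            // First passing filter wins; an error also stops evaluation.
            if f.apply()? {
                return Ok(true);
            }
        }
        Ok(false)
    }
}
fn main() {
    let or = Or(vec![Box::new(Const(false)), Box::new(Const(true))]);
    assert!(or.apply().unwrap());
    let or = Or(vec![Box::new(Const(false)), Box::new(Const(false))]);
    assert!(!or.apply().unwrap());
}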

View File

@ -1,17 +0,0 @@
use std::{
fmt::{Debug, Display},
sync::Arc,
};
use async_trait::async_trait;
use data_types::PartitionId;
use crate::{error::DynError, partition_info::PartitionInfo};
pub mod sub_sources;
/// Fetches the subset of information about a partition needed for compaction
#[async_trait]
pub trait PartitionInfoSource: Debug + Display + Send + Sync {
async fn fetch(&self, partition_id: PartitionId) -> Result<Arc<PartitionInfo>, DynError>;
}

View File

@ -1,159 +0,0 @@
use std::{fmt::Display, sync::Arc};
use async_trait::async_trait;
use data_types::PartitionId;
use schema::sort::SortKey;
use crate::{
components::{
columns_source::ColumnsSource, namespaces_source::NamespacesSource,
partition_source::PartitionSource, tables_source::TablesSource,
},
error::DynError,
partition_info::PartitionInfo,
};
use super::PartitionInfoSource;
#[derive(Debug)]
pub struct SubSourcePartitionInfoSource<C, P, T, N>
where
C: ColumnsSource,
P: PartitionSource,
T: TablesSource,
N: NamespacesSource,
{
columns_source: C,
partition_source: P,
tables_source: T,
namespaces_source: N,
}
impl<C, P, T, N> SubSourcePartitionInfoSource<C, P, T, N>
where
C: ColumnsSource,
P: PartitionSource,
T: TablesSource,
N: NamespacesSource,
{
pub fn new(
columns_source: C,
partition_source: P,
tables_source: T,
namespaces_source: N,
) -> Self {
Self {
columns_source,
partition_source,
tables_source,
namespaces_source,
}
}
}
impl<C, P, T, N> Display for SubSourcePartitionInfoSource<C, P, T, N>
where
C: ColumnsSource,
P: PartitionSource,
T: TablesSource,
N: NamespacesSource,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"sub_sources(partition={}, tables={}, namespaces={})",
self.partition_source, self.tables_source, self.namespaces_source
)
}
}
#[async_trait]
impl<C, P, T, N> PartitionInfoSource for SubSourcePartitionInfoSource<C, P, T, N>
where
C: ColumnsSource,
P: PartitionSource,
T: TablesSource,
N: NamespacesSource,
{
async fn fetch(&self, partition_id: PartitionId) -> Result<Arc<PartitionInfo>, DynError> {
// Get info for the partition
let partition = self
.partition_source
.fetch_by_id(partition_id)
.await
.ok_or_else::<DynError, _>(|| String::from("Cannot find partition info").into())?;
let table = self
.tables_source
.fetch(partition.table_id)
.await
.ok_or_else::<DynError, _>(|| String::from("Cannot find table").into())?;
// TODO: after we have catalog function to read table schema, we should use it
// and avoid reading namespace schema
let namespace = self
.namespaces_source
.fetch_by_id(table.namespace_id)
.await
.ok_or_else::<DynError, _>(|| String::from("Cannot find namespace").into())?;
let namespace_schema = self
.namespaces_source
.fetch_schema_by_id(table.namespace_id)
.await
.ok_or_else::<DynError, _>(|| String::from("Cannot find namespace schema").into())?;
let table_schema = namespace_schema
.tables
.get(&table.name)
.ok_or_else::<DynError, _>(|| String::from("Cannot find table schema").into())?;
// fetch table columns to get column names for the partition's sort_key_ids
let columns = self.columns_source.fetch(table.id).await;
// sort_key_ids of the partition
let sort_key_ids = partition.sort_key_ids_none_if_empty();
// sort_key of the partition. This will be removed but until then, use it to validate the
// sort_key computed by mapping sort_key_ids to column names
let p_sort_key = partition.sort_key();
// convert column ids to column names
let sort_key = sort_key_ids.as_ref().map(|ids| {
let names = ids
.iter()
.map(|id| {
columns
.iter()
.find(|c| c.id == *id)
.map(|c| c.name.clone())
.ok_or_else::<DynError, _>(|| {
format!(
"Cannot find column with id {} for table {}",
id.get(),
table.name
)
.into()
})
})
.collect::<Result<Vec<_>, _>>()
.expect("Cannot find column names for sort key ids");
SortKey::from_columns(names.iter().map(|s| &**s))
});
// This is here to catch bugs if any while mapping sort_key_ids to column names
// This wil be removed once sort_key is removed from partition
assert_eq!(sort_key, p_sort_key);
Ok(Arc::new(PartitionInfo {
partition_id,
partition_hash_id: partition.hash_id().cloned(),
namespace_id: table.namespace_id,
namespace_name: namespace.name,
table: Arc::new(table),
table_schema: Arc::new(table_schema.clone()),
sort_key,
partition_key: partition.partition_key,
}))
}
}
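A self-contained sketch of the id-to-name mapping performed above when reconstructing the partition sort key: each sort key column id is looked up in the table's column list, order is preserved, and a missing id becomes an error rather than a silent gap. `Column` is a stand-in for the catalog type, and the real code panics via `expect` instead of returning the error.
struct Column {
    id: i64,
    name: String,
}
fn sort_key_names(sort_key_ids: &[i64], columns: &[Column]) -> Result<Vec<String>, String> {
    sort_key_ids
        .iter()
        .map(|id| {
            columns
                .iter()
                .find(|c| c.id == *id)
                .map(|c| c.name.clone())
                .ok_or_else(|| format!("Cannot find column with id {id}"))
        })
        .collect()
}
fn main() {
    let columns = vec![
        Column { id: 1, name: "tag1".to_string() },
        Column { id: 2, name: "time".to_string() },
    ];
    // The order of the ids, not of the column list, determines the sort key order.
    assert_eq!(sort_key_names(&[2, 1], &columns).unwrap(), vec!["time", "tag1"]);
    assert!(sort_key_names(&[3], &columns).is_err());
}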

View File

@ -1,43 +0,0 @@
use std::{fmt::Display, sync::Arc};
use async_trait::async_trait;
use backoff::{Backoff, BackoffConfig};
use data_types::{Partition, PartitionId, TransitionPartitionId};
use iox_catalog::{interface::Catalog, partition_lookup};
use super::PartitionSource;
#[derive(Debug)]
pub struct CatalogPartitionSource {
backoff_config: BackoffConfig,
catalog: Arc<dyn Catalog>,
}
impl CatalogPartitionSource {
pub fn new(backoff_config: BackoffConfig, catalog: Arc<dyn Catalog>) -> Self {
Self {
backoff_config,
catalog,
}
}
}
impl Display for CatalogPartitionSource {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "catalog")
}
}
#[async_trait]
impl PartitionSource for CatalogPartitionSource {
async fn fetch_by_id(&self, partition_id: PartitionId) -> Option<Partition> {
Backoff::new(&self.backoff_config)
.retry_all_errors("partition_by_id", || async {
let mut repos = self.catalog.repositories().await;
let id = TransitionPartitionId::Deprecated(partition_id);
partition_lookup(repos.as_mut(), &id).await
})
.await
.expect("retry forever")
}
}
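A dependency-free sketch of the "retry forever" behavior the catalog source gets from Backoff::retry_all_errors: keep retrying with a growing delay until the operation succeeds. The real backoff crate in this workspace adds jitter and configurable bounds; the doubling-with-cap policy below is an assumption for illustration.
use std::thread::sleep;
use std::time::Duration;
fn retry_forever<T, E: std::fmt::Debug>(mut op: impl FnMut() -> Result<T, E>) -> T {
    let mut delay = Duration::from_millis(100);
    loop {
        match op() {
            Ok(v) => return v,
            Err(e) => {
                eprintln!("retrying after error: {e:?}");
                sleep(delay);
                // Double the delay, capped at 10 seconds.
                delay = (delay * 2).min(Duration::from_secs(10));
            }
        }
    }
}
fn main() {
    let mut attempts = 0;
    let v = retry_forever(|| {
        attempts += 1;
        if attempts < 3 { Err("catalog unavailable") } else { Ok(42) }
    });
    assert_eq!(v, 42);
}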

View File

@ -1,89 +0,0 @@
use std::fmt::Display;
use async_trait::async_trait;
use data_types::{Partition, PartitionId};
use observability_deps::tracing::{info, warn};
use super::PartitionSource;
#[derive(Debug)]
pub struct LoggingPartitionSourceWrapper<T>
where
T: PartitionSource,
{
inner: T,
}
impl<T> LoggingPartitionSourceWrapper<T>
where
T: PartitionSource,
{
pub fn new(inner: T) -> Self {
Self { inner }
}
}
impl<T> Display for LoggingPartitionSourceWrapper<T>
where
T: PartitionSource,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "logging({})", self.inner)
}
}
#[async_trait]
impl<T> PartitionSource for LoggingPartitionSourceWrapper<T>
where
T: PartitionSource,
{
async fn fetch_by_id(&self, partition_id: PartitionId) -> Option<Partition> {
let partition = self.inner.fetch_by_id(partition_id).await;
match &partition {
Some(_) => {
info!(partition_id = partition_id.get(), "Fetch a partition",);
}
None => {
warn!(partition_id = partition_id.get(), "Partition not found",);
}
}
partition
}
}
#[cfg(test)]
mod tests {
use test_helpers::tracing::TracingCapture;
use crate::components::partition_source::mock::MockPartitionSource;
use iox_tests::PartitionBuilder;
use super::*;
#[test]
fn test_display() {
let source = LoggingPartitionSourceWrapper::new(MockPartitionSource::new(vec![]));
assert_eq!(source.to_string(), "logging(mock)",);
}
#[tokio::test]
async fn test_fetch_by_id() {
let p = PartitionBuilder::new(5).build();
let source = LoggingPartitionSourceWrapper::new(MockPartitionSource::new(vec![p.clone()]));
let capture = TracingCapture::new();
assert_eq!(
source.fetch_by_id(PartitionId::new(5)).await,
Some(p.clone())
);
assert_eq!(source.fetch_by_id(PartitionId::new(5)).await, Some(p));
assert_eq!(source.fetch_by_id(PartitionId::new(1)).await, None);
assert_eq!(
capture.to_string(),
"level = INFO; message = Fetch a partition; partition_id = 5; \n\
level = INFO; message = Fetch a partition; partition_id = 5; \n\
level = WARN; message = Partition not found; partition_id = 1; ",
);
}
}

View File

@ -1,124 +0,0 @@
use std::fmt::Display;
use async_trait::async_trait;
use data_types::{Partition, PartitionId};
use metric::{Registry, U64Counter};
use super::PartitionSource;
const METRIC_NAME_PARTITION_FETCH_COUNT: &str = "iox_compactor_partition_fetch_count";
#[derive(Debug)]
pub struct MetricsPartitionSourceWrapper<T>
where
T: PartitionSource,
{
fetch_found_counter: U64Counter,
fetch_notfound_counter: U64Counter,
inner: T,
}
impl<T> MetricsPartitionSourceWrapper<T>
where
T: PartitionSource,
{
pub fn new(inner: T, registry: &Registry) -> Self {
let fetch_metric = registry.register_metric::<U64Counter>(
METRIC_NAME_PARTITION_FETCH_COUNT,
"Number of times the compactor fetched information for a dedicated partition",
);
let fetch_found_counter = fetch_metric.recorder(&[("result", "found")]);
let fetch_notfound_counter = fetch_metric.recorder(&[("result", "not_found")]);
Self {
fetch_found_counter,
fetch_notfound_counter,
inner,
}
}
}
impl<T> Display for MetricsPartitionSourceWrapper<T>
where
T: PartitionSource,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "metrics({})", self.inner)
}
}
#[async_trait]
impl<T> PartitionSource for MetricsPartitionSourceWrapper<T>
where
T: PartitionSource,
{
async fn fetch_by_id(&self, partition_id: PartitionId) -> Option<Partition> {
let res = self.inner.fetch_by_id(partition_id).await;
match res {
Some(_) => self.fetch_found_counter.inc(1),
None => self.fetch_notfound_counter.inc(1),
}
res
}
}
#[cfg(test)]
mod tests {
use metric::{assert_counter, Attributes};
use crate::components::partition_source::mock::MockPartitionSource;
use iox_tests::PartitionBuilder;
use super::*;
#[test]
fn test_display() {
let registry = Registry::new();
let source =
MetricsPartitionSourceWrapper::new(MockPartitionSource::new(vec![]), &registry);
assert_eq!(source.to_string(), "metrics(mock)",);
}
#[tokio::test]
async fn test_fetch_by_id() {
let registry = Registry::new();
let p = PartitionBuilder::new(5).build();
let source = MetricsPartitionSourceWrapper::new(
MockPartitionSource::new(vec![p.clone()]),
&registry,
);
assert_fetch_found_counter(&registry, 0);
assert_fetch_notfound_counter(&registry, 0);
assert_eq!(
source.fetch_by_id(PartitionId::new(5)).await,
Some(p.clone())
);
assert_eq!(source.fetch_by_id(PartitionId::new(5)).await, Some(p));
assert_eq!(source.fetch_by_id(PartitionId::new(1)).await, None);
assert_fetch_found_counter(&registry, 2);
assert_fetch_notfound_counter(&registry, 1);
}
fn assert_fetch_found_counter(registry: &Registry, value: u64) {
assert_counter!(
registry,
U64Counter,
METRIC_NAME_PARTITION_FETCH_COUNT,
labels = Attributes::from(&[("result", "found")]),
value = value,
);
}
fn assert_fetch_notfound_counter(registry: &Registry, value: u64) {
assert_counter!(
registry,
U64Counter,
METRIC_NAME_PARTITION_FETCH_COUNT,
labels = Attributes::from(&[("result", "not_found")]),
value = value,
);
}
}

View File

@ -1,73 +0,0 @@
use std::fmt::Display;
use async_trait::async_trait;
use data_types::{Partition, PartitionId};
use super::PartitionSource;
#[derive(Debug)]
pub struct MockPartitionSource {
partitions: Vec<Partition>,
}
impl MockPartitionSource {
#[allow(dead_code)] // not used anywhere
pub fn new(partitions: Vec<Partition>) -> Self {
Self { partitions }
}
}
impl Display for MockPartitionSource {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "mock")
}
}
#[async_trait]
impl PartitionSource for MockPartitionSource {
async fn fetch_by_id(&self, partition_id: PartitionId) -> Option<Partition> {
self.partitions
.iter()
.find(|p| p.id == partition_id)
.cloned()
}
}
#[cfg(test)]
mod tests {
use iox_tests::PartitionBuilder;
use super::*;
#[test]
fn test_display() {
assert_eq!(MockPartitionSource::new(vec![]).to_string(), "mock",);
}
#[tokio::test]
async fn test_fetch_by_id() {
let cj_1 = PartitionBuilder::new(5).build();
let cj_2 = PartitionBuilder::new(1).build();
let cj_3 = PartitionBuilder::new(12).build();
let compaction_jobs = vec![cj_1.clone(), cj_2.clone(), cj_3.clone()];
let source = MockPartitionSource::new(compaction_jobs);
assert_eq!(
source.fetch_by_id(PartitionId::new(5)).await,
Some(cj_1.clone())
);
assert_eq!(
source.fetch_by_id(PartitionId::new(1)).await,
Some(cj_2.clone())
);
// fetching does not drain
assert_eq!(
source.fetch_by_id(PartitionId::new(5)).await,
Some(cj_1.clone())
);
// unknown table => None result
assert_eq!(source.fetch_by_id(PartitionId::new(3)).await, None,);
}
}

View File

@ -1,18 +0,0 @@
use std::fmt::{Debug, Display};
use async_trait::async_trait;
use data_types::{Partition, PartitionId};
pub mod catalog;
pub mod logging;
pub mod metrics;
pub mod mock;
/// A source of [partition](Partition) that may potentially need compacting.
#[async_trait]
pub trait PartitionSource: Debug + Display + Send + Sync {
/// Get partition for a given partition ID.
///
/// This method performs retries.
async fn fetch_by_id(&self, partition_id: PartitionId) -> Option<Partition>;
}

View File

@ -1,151 +0,0 @@
use std::fmt::Display;
use async_trait::async_trait;
use observability_deps::tracing::{debug, error, info};
use crate::{error::DynError, file_classification::FilesForProgress, PartitionInfo};
use super::PostClassificationPartitionFilter;
#[derive(Debug)]
pub struct LoggingPostClassificationFilterWrapper<T>
where
T: PostClassificationPartitionFilter,
{
inner: T,
filter_type: &'static str,
}
impl<T> LoggingPostClassificationFilterWrapper<T>
where
T: PostClassificationPartitionFilter,
{
pub fn new(inner: T, filter_type: &'static str) -> Self {
Self { inner, filter_type }
}
}
impl<T> Display for LoggingPostClassificationFilterWrapper<T>
where
T: PostClassificationPartitionFilter,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "logging({}, {})", self.inner, self.filter_type)
}
}
#[async_trait]
impl<T> PostClassificationPartitionFilter for LoggingPostClassificationFilterWrapper<T>
where
T: PostClassificationPartitionFilter,
{
async fn apply(
&self,
partition_info: &PartitionInfo,
files_to_make_progress_on: &FilesForProgress,
) -> Result<bool, DynError> {
let res = self
.inner
.apply(partition_info, files_to_make_progress_on)
.await;
match &res {
Ok(true) => {
debug!(
partition_id = partition_info.partition_id.get(),
filter_type = self.filter_type,
"NOT filtered partition"
);
}
Ok(false) => {
info!(
partition_id = partition_info.partition_id.get(),
filter_type = self.filter_type,
"filtered partition"
);
}
Err(e) => {
error!(
partition_id = partition_info.partition_id.get(),
filter_type = self.filter_type,
%e,
"error filtering filtered partition"
);
}
}
res
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use test_helpers::tracing::TracingCapture;
use crate::{
components::post_classification_partition_filter::mock::MockPostClassificationPartitionFilter,
test_utils::PartitionInfoBuilder,
};
use super::*;
#[test]
fn test_display() {
let filter = LoggingPostClassificationFilterWrapper::new(
MockPostClassificationPartitionFilter::new(vec![Ok(true)]),
"test",
);
assert_eq!(filter.to_string(), "logging(mock, test)");
}
#[tokio::test]
async fn test_apply() {
let filter = LoggingPostClassificationFilterWrapper::new(
MockPostClassificationPartitionFilter::new(vec![
Ok(true),
Ok(false),
Err("problem".into()),
]),
"test",
);
let p_info1 = Arc::new(PartitionInfoBuilder::new().with_partition_id(1).build());
let p_info2 = Arc::new(PartitionInfoBuilder::new().with_partition_id(2).build());
let p_info3 = Arc::new(PartitionInfoBuilder::new().with_partition_id(3).build());
let capture = TracingCapture::new();
assert!(filter
.apply(&p_info1, &FilesForProgress::empty())
.await
.unwrap());
assert!(!filter
.apply(&p_info2, &FilesForProgress::empty())
.await
.unwrap());
assert_eq!(
filter
.apply(&p_info3, &FilesForProgress::empty())
.await
.unwrap_err()
.to_string(),
"problem"
);
assert_eq!(
capture.to_string(),
"level = DEBUG; \
message = NOT filtered partition; \
partition_id = 1; \
filter_type = \"test\"; \n\
level = INFO; \
message = filtered partition; \
partition_id = 2; \
filter_type = \"test\"; \n\
level = ERROR; \
message = error filtering filtered partition; \
partition_id = 3; \
filter_type = \"test\"; \
e = problem; ",
);
}
}

View File

@ -1,180 +0,0 @@
use std::fmt::Display;
use async_trait::async_trait;
use metric::{Registry, U64Counter};
use crate::{error::DynError, file_classification::FilesForProgress, PartitionInfo};
use super::PostClassificationPartitionFilter;
const METRIC_NAME_PARTITION_FILTER_COUNT: &str =
"iox_compactor_post_classification_partition_filter_count";
#[derive(Debug)]
pub struct MetricsPostClassificationFilterWrapper<T>
where
T: PostClassificationPartitionFilter,
{
pass_counter: U64Counter,
filter_counter: U64Counter,
error_counter: U64Counter,
inner: T,
filter_type: &'static str,
}
impl<T> MetricsPostClassificationFilterWrapper<T>
where
T: PostClassificationPartitionFilter,
{
pub fn new(inner: T, registry: &Registry, filter_type: &'static str) -> Self {
let metric = registry.register_metric::<U64Counter>(
METRIC_NAME_PARTITION_FILTER_COUNT,
"Number of times the compactor filtered partitions after its files were classified",
);
let pass_counter = metric.recorder(&[("result", "pass"), ("filter_type", filter_type)]);
let filter_counter = metric.recorder(&[("result", "filter"), ("filter_type", filter_type)]);
let error_counter = metric.recorder(&[("result", "error"), ("filter_type", filter_type)]);
Self {
pass_counter,
filter_counter,
error_counter,
inner,
filter_type,
}
}
}
impl<T> Display for MetricsPostClassificationFilterWrapper<T>
where
T: PostClassificationPartitionFilter,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "metrics({}, {})", self.inner, self.filter_type)
}
}
#[async_trait]
impl<T> PostClassificationPartitionFilter for MetricsPostClassificationFilterWrapper<T>
where
T: PostClassificationPartitionFilter,
{
async fn apply(
&self,
partition_info: &PartitionInfo,
files_to_make_progress_on: &FilesForProgress,
) -> Result<bool, DynError> {
let res = self
.inner
.apply(partition_info, files_to_make_progress_on)
.await;
match res {
Ok(true) => {
self.pass_counter.inc(1);
}
Ok(false) => {
self.filter_counter.inc(1);
}
Err(_) => {
self.error_counter.inc(1);
}
}
res
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use metric::{assert_counter, Attributes};
use crate::{
components::post_classification_partition_filter::mock::MockPostClassificationPartitionFilter,
test_utils::PartitionInfoBuilder,
};
use super::*;
#[test]
fn test_display() {
let registry = Registry::new();
let filter = MetricsPostClassificationFilterWrapper::new(
MockPostClassificationPartitionFilter::new(vec![Ok(true)]),
&registry,
"test",
);
assert_eq!(filter.to_string(), "metrics(mock, test)",);
}
#[tokio::test]
async fn test_apply() {
let registry = Registry::new();
let filter = MetricsPostClassificationFilterWrapper::new(
MockPostClassificationPartitionFilter::new(vec![
Ok(true),
Ok(false),
Err("problem".into()),
]),
&registry,
"test",
);
let p_info = Arc::new(PartitionInfoBuilder::new().with_partition_id(1).build());
assert_pass_counter(&registry, 0);
assert_filter_counter(&registry, 0);
assert_error_counter(&registry, 0);
assert!(filter
.apply(&p_info, &FilesForProgress::empty())
.await
.unwrap());
assert!(!filter
.apply(&p_info, &FilesForProgress::empty())
.await
.unwrap());
assert_eq!(
filter
.apply(&p_info, &FilesForProgress::empty())
.await
.unwrap_err()
.to_string(),
"problem"
);
assert_pass_counter(&registry, 1);
assert_filter_counter(&registry, 1);
assert_error_counter(&registry, 1);
}
fn assert_pass_counter(registry: &Registry, value: u64) {
assert_counter!(
registry,
U64Counter,
METRIC_NAME_PARTITION_FILTER_COUNT,
labels = Attributes::from(&[("result", "pass"), ("filter_type", "test")]),
value = value,
);
}
fn assert_filter_counter(registry: &Registry, value: u64) {
assert_counter!(
registry,
U64Counter,
METRIC_NAME_PARTITION_FILTER_COUNT,
labels = Attributes::from(&[("result", "filter"), ("filter_type", "test")]),
value = value,
);
}
fn assert_error_counter(registry: &Registry, value: u64) {
assert_counter!(
registry,
U64Counter,
METRIC_NAME_PARTITION_FILTER_COUNT,
labels = Attributes::from(&[("result", "error"), ("filter_type", "test")]),
value = value,
);
}
}

View File

@ -1,60 +0,0 @@
use std::{
fmt::{Debug, Display},
sync::Mutex,
};
use async_trait::async_trait;
use crate::{error::DynError, file_classification::FilesForProgress, PartitionInfo};
use super::PostClassificationPartitionFilter;
pub struct MockPostClassificationPartitionFilter {
return_values: Mutex<Box<dyn Iterator<Item = Result<bool, DynError>> + Send>>,
}
impl MockPostClassificationPartitionFilter {
#[cfg(test)]
pub fn new(return_values: Vec<Result<bool, DynError>>) -> Self {
Self {
return_values: Mutex::new(Box::new(return_values.into_iter())),
}
}
}
impl Display for MockPostClassificationPartitionFilter {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "mock")
}
}
impl Debug for MockPostClassificationPartitionFilter {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "mock")
}
}
#[async_trait]
impl PostClassificationPartitionFilter for MockPostClassificationPartitionFilter {
async fn apply(
&self,
_partition_info: &PartitionInfo,
_files_to_make_progress_on: &FilesForProgress,
) -> Result<bool, DynError> {
self.return_values.lock().unwrap().next().unwrap()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_display() {
assert_eq!(
MockPostClassificationPartitionFilter::new(vec![Ok(true), Err("problem".into())])
.to_string(),
"mock"
);
}
}
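A self-contained sketch of the scripted-mock pattern above: queued return values are handed out one call at a time through a Mutex-guarded boxed iterator, which lets the mock be driven through &self as the trait requires. The types are simplified stand-ins for the crate's.
use std::sync::Mutex;
struct ScriptedMock {
    return_values: Mutex<Box<dyn Iterator<Item = Result<bool, String>> + Send>>,
}
impl ScriptedMock {
    fn new(values: Vec<Result<bool, String>>) -> Self {
        Self {
            return_values: Mutex::new(Box::new(values.into_iter())),
        }
    }
    fn apply(&self) -> Result<bool, String> {
        // Panics when called more times than values were scripted, turning an
        // under-specified test into a loud failure.
        self.return_values.lock().unwrap().next().unwrap()
    }
}
fn main() {
    let mock = ScriptedMock::new(vec![Ok(true), Err("problem".to_string())]);
    assert_eq!(mock.apply(), Ok(true));
    assert!(mock.apply().is_err());
}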

View File

@ -1,24 +0,0 @@
use std::fmt::{Debug, Display};
use async_trait::async_trait;
use crate::{error::DynError, file_classification::FilesForProgress, PartitionInfo};
pub mod logging;
pub mod metrics;
pub mod mock;
pub mod possible_progress;
/// Filters partition based on ID and Parquet files after the files have been classified.
///
/// May return an error. In this case, the partition will be marked as "skipped".
#[async_trait]
pub trait PostClassificationPartitionFilter: Debug + Display + Send + Sync {
/// Return `true` if the compactor should run a compaction on this partition. Return `false`
/// if this partition does not need any more compaction.
async fn apply(
&self,
partition_info: &PartitionInfo,
files_to_make_progress_on: &FilesForProgress,
) -> Result<bool, DynError>;
}

View File

@ -1,107 +0,0 @@
use std::fmt::Display;
use async_trait::async_trait;
use crate::{
error::{DynError, ErrorKind, SimpleError},
file_classification::FilesForProgress,
PartitionInfo,
};
use super::PostClassificationPartitionFilter;
#[derive(Debug)]
pub struct PossibleProgressFilter {
max_parquet_bytes: usize,
}
impl PossibleProgressFilter {
pub fn new(max_parquet_bytes: usize) -> Self {
Self { max_parquet_bytes }
}
}
impl Display for PossibleProgressFilter {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "possible_progress")
}
}
#[async_trait]
impl PostClassificationPartitionFilter for PossibleProgressFilter {
async fn apply(
&self,
partition_info: &PartitionInfo,
files_to_make_progress_on: &FilesForProgress,
) -> Result<bool, DynError> {
if !files_to_make_progress_on.is_empty() {
// There are files to compact or split; we can make progress
Ok(true)
} else {
// No files means the split_compact cannot find any reasonable set of files to make progress on
Err(SimpleError::new(
ErrorKind::OutOfMemory,
format!(
"partition {} has overlapped files that exceed max compact size limit {}. \
This may happen if a large amount of data has the same timestamp",
partition_info.partition_id, self.max_parquet_bytes
),
)
.into())
}
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use crate::{
error::ErrorKindExt,
file_classification::{CompactReason, FilesToSplitOrCompact},
test_utils::PartitionInfoBuilder,
};
use iox_tests::ParquetFileBuilder;
use super::*;
#[test]
fn test_display() {
assert_eq!(
PossibleProgressFilter::new(10).to_string(),
"possible_progress"
);
}
#[tokio::test]
async fn test_apply_empty() {
let filter = PossibleProgressFilter::new(10);
let p_info = Arc::new(PartitionInfoBuilder::new().with_partition_id(1).build());
let err = filter
.apply(&p_info, &FilesForProgress::empty())
.await
.unwrap_err();
assert_eq!(err.classify(), ErrorKind::OutOfMemory);
assert_eq!(
err.to_string(),
"partition 1 has overlapped files that exceed max compact size limit 10. \
This may happen if a large amount of data has the same timestamp"
);
}
#[tokio::test]
async fn test_apply_not_empty() {
let filter = PossibleProgressFilter::new(10);
let p_info = Arc::new(PartitionInfoBuilder::new().with_partition_id(1).build());
let f1 = ParquetFileBuilder::new(1).with_file_size_bytes(7).build();
let files_for_progress = FilesForProgress {
upgrade: vec![],
split_or_compact: FilesToSplitOrCompact::Compact(
vec![f1],
// This reason is arbitrary
CompactReason::ManySmallFiles,
),
};
assert!(filter.apply(&p_info, &files_for_progress).await.unwrap());
}
}
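A self-contained sketch of the error-classification flow above: a filter returns a boxed dynamic error carrying a coarse kind, and the caller recovers that kind to decide how to react (e.g., mark the partition as skipped). The types below are simplified stand-ins for the crate's DynError / SimpleError / ErrorKind machinery.
use std::fmt;
#[derive(Debug, Clone, Copy, PartialEq)]
enum ErrorKind {
    OutOfMemory,
    Unknown,
}
#[derive(Debug)]
struct SimpleError {
    kind: ErrorKind,
    msg: String,
}
impl fmt::Display for SimpleError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.msg)
    }
}
impl std::error::Error for SimpleError {}
type DynError = Box<dyn std::error::Error + Send + Sync>;
// Downcast-based classification, standing in for the crate's ErrorKindExt.
fn classify(e: &DynError) -> ErrorKind {
    e.downcast_ref::<SimpleError>()
        .map(|e| e.kind)
        .unwrap_or(ErrorKind::Unknown)
}
fn check_progress(num_files: usize) -> Result<bool, DynError> {
    if num_files > 0 {
        Ok(true)
    } else {
        Err(Box::new(SimpleError {
            kind: ErrorKind::OutOfMemory,
            msg: "no reasonable set of files to make progress on".to_string(),
        }))
    }
}
fn main() {
    let err = check_progress(0).unwrap_err();
    assert_eq!(classify(&err), ErrorKind::OutOfMemory);
    assert!(check_progress(2).unwrap());
}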

View File

@ -1,124 +0,0 @@
//! Report component system state.
use observability_deps::tracing::info;
use crate::config::Config;
use super::Components;
/// Log config at info level.
pub fn log_config(config: &Config) {
// use struct unpack so we don't forget any members
let Config {
// no need to print the internal state of the registry
metric_registry: _,
// no need to print the internal state of the trace collector
trace_collector: _,
catalog,
scheduler_config,
parquet_store_real,
parquet_store_scratchpad,
exec,
time_provider,
backoff_config,
partition_concurrency,
df_concurrency,
partition_scratchpad_concurrency,
max_desired_file_size_bytes,
percentage_max_file_size,
split_percentage,
partition_timeout,
shadow_mode,
enable_scratchpad,
min_num_l1_files_to_compact,
process_once,
parquet_files_sink_override,
simulate_without_object_store,
all_errors_are_fatal,
max_num_columns_per_table,
max_num_files_per_plan,
max_partition_fetch_queries_per_second,
gossip_bind_address,
gossip_seeds,
} = &config;
let parquet_files_sink_override = parquet_files_sink_override
.as_ref()
.map(|_| "Some")
.unwrap_or("None");
info!(
%catalog,
%scheduler_config,
%parquet_store_real,
%parquet_store_scratchpad,
%exec,
%time_provider,
?backoff_config,
partition_concurrency=partition_concurrency.get(),
df_concurrency=df_concurrency.get(),
partition_scratchpad_concurrency=partition_scratchpad_concurrency.get(),
max_desired_file_size_bytes,
percentage_max_file_size,
split_percentage,
partition_timeout_secs=partition_timeout.as_secs_f32(),
shadow_mode,
enable_scratchpad,
min_num_l1_files_to_compact,
process_once,
simulate_without_object_store,
%parquet_files_sink_override,
all_errors_are_fatal,
max_num_columns_per_table,
max_num_files_per_plan,
max_partition_fetch_queries_per_second,
?gossip_bind_address,
?gossip_seeds,
"config",
);
}
/// Log component system at info level.
pub fn log_components(components: &Components) {
// use struct unpack so we don't forget any members
let Components {
compaction_job_stream,
partition_info_source,
partition_files_source,
round_info_source,
partition_filter,
post_classification_partition_filter: partition_too_large_to_compact_filter,
compaction_job_done_sink,
commit,
ir_planner,
df_planner,
df_plan_exec,
parquet_files_sink,
round_split,
divide_initial,
scratchpad_gen,
file_classifier,
changed_files_filter,
} = components;
info!(
%compaction_job_stream,
%partition_info_source,
%partition_files_source,
%round_info_source,
%partition_filter,
%partition_too_large_to_compact_filter,
%compaction_job_done_sink,
%commit,
%ir_planner,
%df_planner,
%df_plan_exec,
%parquet_files_sink,
%round_split,
%divide_initial,
%scratchpad_gen,
%file_classifier,
%changed_files_filter,
"component setup",
);
}
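A minimal sketch of the "struct unpack" trick both functions above rely on: destructuring the struct with no `..` rest pattern means adding a field to Config becomes a compile error at the log site, so the log line cannot silently fall behind the struct. The three-field Config here is a made-up stand-in.
struct Config {
    shadow_mode: bool,
    process_once: bool,
    split_percentage: u16,
}
fn log_config(config: &Config) {
    // No `..` here: a new Config field breaks this destructure until it is
    // either logged or explicitly ignored with `field: _`.
    let Config {
        shadow_mode,
        process_once,
        split_percentage,
    } = config;
    println!(
        "config: shadow_mode={shadow_mode} process_once={process_once} split_percentage={split_percentage}"
    );
}
fn main() {
    log_config(&Config {
        shadow_mode: false,
        process_once: true,
        split_percentage: 80,
    });
}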

View File

@ -1,855 +0,0 @@
use std::{
cmp::max,
fmt::{Debug, Display},
sync::{Arc, Mutex},
};
use crate::components::{
split_or_compact::start_level_files_to_split::{
linear_dist_ranges, merge_l1_spanned_chains, merge_small_l0_chains, select_split_times,
split_into_chains,
},
Components,
};
use async_trait::async_trait;
use data_types::{CompactionLevel, ParquetFile, Timestamp, TransitionPartitionId};
use itertools::Itertools;
use observability_deps::tracing::{debug, info};
use crate::{
error::DynError, round_info::CompactRange, round_info::CompactType, PartitionInfo, RoundInfo,
};
/// Calculates information about what this compaction round does.
/// When we get deeper into the compaction decision making, there
/// may not be as much context information available. It may not
/// be possible to reach the same conclusions about the intention
/// for this compaction round. So RoundInfo must contain enough
/// information to carry that intention through the compactions.
#[async_trait]
pub trait RoundInfoSource: Debug + Display + Send + Sync {
async fn calculate(
&self,
components: Arc<Components>,
last_round_info: Option<Arc<RoundInfo>>,
partition_info: &PartitionInfo,
files: Vec<ParquetFile>,
) -> Result<(Arc<RoundInfo>, bool), DynError>;
}
#[derive(Debug)]
pub struct LoggingRoundInfoWrapper {
inner: Arc<dyn RoundInfoSource>,
}
impl LoggingRoundInfoWrapper {
pub fn new(inner: Arc<dyn RoundInfoSource>) -> Self {
Self { inner }
}
}
impl Display for LoggingRoundInfoWrapper {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "LoggingRoundInfoWrapper({})", self.inner)
}
}
#[async_trait]
impl RoundInfoSource for LoggingRoundInfoWrapper {
async fn calculate(
&self,
components: Arc<Components>,
last_round_info: Option<Arc<RoundInfo>>,
partition_info: &PartitionInfo,
files: Vec<ParquetFile>,
) -> Result<(Arc<RoundInfo>, bool), DynError> {
let res = self
.inner
.calculate(components, last_round_info, partition_info, files)
.await;
if let Ok((round_info, done)) = &res {
debug!(round_info_source=%self.inner, %round_info, %done, "running round");
}
res
}
}
/// Computes the type of round based on the levels of the input files
#[derive(Debug)]
pub struct LevelBasedRoundInfo {
pub max_num_files_per_plan: usize,
pub max_total_file_size_per_plan: usize,
}
impl Display for LevelBasedRoundInfo {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "LevelBasedRoundInfo {}", self.max_num_files_per_plan)
}
}
impl LevelBasedRoundInfo {
pub fn new(max_num_files_per_plan: usize, max_total_file_size_per_plan: usize) -> Self {
Self {
max_num_files_per_plan,
max_total_file_size_per_plan,
}
}
/// Returns true if the scenario looks like ManySmallFiles, but we can't group them well into branches.
/// TODO: use this or remove it. For now, keep it in case we need the temporary workaround again.
/// This can be used to identify criteria to trigger a SimulatedLeadingEdge as a temporary workaround
/// for a situation that isn't well handled, when the desire is to postpone optimal handling to a later PR.
#[allow(dead_code)]
pub fn many_ungroupable_files(
&self,
files: &[ParquetFile],
start_level: CompactionLevel,
max_total_file_size_to_group: usize,
) -> bool {
if self.too_many_small_files_to_compact(files, CompactionLevel::Initial) {
let start_level_files = files
.iter()
.filter(|f| f.compaction_level == start_level)
.collect::<Vec<_>>();
let start_count = start_level_files.len();
let mut chains = split_into_chains(start_level_files.into_iter().cloned().collect());
chains = merge_small_l0_chains(chains, max_total_file_size_to_group);
if chains.len() > 1 && chains.len() > start_count / 3 {
return true;
}
}
false
}
/// Returns true if the number of files at the given start_level, plus
/// their overlapped files in the next level, is over the limit, and if those
/// files are sufficiently small.
///
/// "Over the limit" means that the maximum number of files that a subsequent compaction
/// branch may choose to compact in a single plan would exceed `max_num_files_per_plan`.
pub fn too_many_small_files_to_compact(
&self,
files: &[ParquetFile],
start_level: CompactionLevel,
) -> bool {
let start_level_files = files
.iter()
.filter(|f| f.compaction_level == start_level)
.collect::<Vec<_>>();
let num_start_level = start_level_files.len();
let size_start_level: usize = start_level_files
.iter()
.map(|f| f.file_size_bytes as usize)
.sum();
let start_max_l0_created_at = start_level_files
.iter()
.map(|f| f.max_l0_created_at)
.unique()
.count();
let next_level_files = files
.iter()
.filter(|f| f.compaction_level == start_level.next())
.collect::<Vec<_>>();
// The compactor may compact all the start level and next level files together in one
// branch in the worst case, so if that would result in too many files to compact in a single
// plan, run a pre-phase to reduce the number of files first
let num_overlapped_files = get_num_overlapped_files(start_level_files, next_level_files);
if num_start_level > 1
&& num_start_level + num_overlapped_files > self.max_num_files_per_plan
{
// This scenario meets the simple criterion that the start level files + their overlaps are lots of files.
// But ManySmallFiles implies we must compact only within the start level to reduce the quantity of
// start level files. There are several reasons why that might be unhelpful.
// Reason 1: if all the start level files have the same max_l0_created_at, then they were split from
// the same file. If we previously decided to split them, we should not undo that now.
if start_max_l0_created_at == 1 {
return false;
}
// Reason 2: Maybe it's many LARGE files making reduction of file count in the start level impossible.
if size_start_level / num_start_level
> self.max_total_file_size_per_plan / self.max_num_files_per_plan
{
// Average start level file size is more than the average implied by max bytes & files per plan.
// Even though there are "many files", this is not "many small files".
// There isn't much (perhaps not any) file reduction to be done, and attempting it can get us stuck
// in a loop.
return false;
}
// Reason 3: Maybe there are so many start level files because we did a bunch of splits.
// Note that we'll do splits to ensure each start level file overlaps at most one target level file.
// If the prior round did that, and now we declare this ManySmallFiles, which forces compactions
// within the start level, we'll undo the splits performed in the prior round, which can get us
// stuck in a loop.
let chains = split_into_chains(files.to_vec());
let mut max_target_level_files: usize = 0;
let mut max_chain_len: usize = 0;
for chain in chains {
let target_file_cnt = chain
.iter()
.filter(|f| f.compaction_level == start_level.next())
.count();
max_target_level_files = max(max_target_level_files, target_file_cnt);
let chain_len = chain.len();
max_chain_len = max(max_chain_len, chain_len);
}
if max_target_level_files <= 1 && max_chain_len <= self.max_num_files_per_plan {
// All of our start level files overlap with at most one target level file. If the prior round did
// splits to cause this, declaring this a ManySmallFiles case can lead to an endless loop.
// If we got lucky and this happened without splits, declaring this ManySmallFiles will waste
// our good fortune.
return false;
}
return true;
}
false
}
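// An illustrative helper (not in the original file): the "Reason 2" guard
// above, distilled. It compares the average start-level file size with the
// average size implied by the per-plan byte and file limits; when the files
// are already large on average, ManySmallFiles would churn without reducing
// the file count. Hypothetical associated function, for illustration only.
#[allow(dead_code)]
fn avg_file_size_exceeds_plan_avg(
    size_start_level: usize,
    num_start_level: usize,
    max_total_file_size_per_plan: usize,
    max_num_files_per_plan: usize,
) -> bool {
    // Integer division mirrors the comparison in the code above.
    size_start_level / num_start_level
        > max_total_file_size_per_plan / max_num_files_per_plan
}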
/// consider_vertical_splitting determines if vertical splitting is necessary, and if so, a vec of split times is
/// returned.
pub fn consider_vertical_splitting(
&self,
partition_id: TransitionPartitionId,
files: Vec<ParquetFile>,
max_compact_size: usize,
) -> Vec<i64> {
let file_cnt = files.len();
let (start_level_files, target_level_files): (Vec<ParquetFile>, Vec<ParquetFile>) = files
.into_iter()
.filter(|f| f.compaction_level != CompactionLevel::Final)
.partition(|f| f.compaction_level == CompactionLevel::Initial);
let len = start_level_files.len();
let mut split_times = Vec::with_capacity(len);
let cap: usize = start_level_files
.iter()
.map(|f| f.file_size_bytes as usize)
.sum();
// TODO: remove this:
if start_level_files.len() > 300 && cap / file_cnt < max_compact_size / 10 {
info!("skipping vertical splitting on partition_id {} for now, due to excessive file count. file count: {}, cap: {} MB",
partition_id, start_level_files.len(), cap/1024/1024);
return vec![];
}
// A single file over max size can just get upgraded to L1, then L2, unless it overlaps other L0s.
// So multiple files over the max compact size may need to be split
if start_level_files.len() > 1 && cap > max_compact_size {
// files in this range are too big to compact in one job, so they will be split into smaller, more manageable ranges.
// We can't know the data distribution within each file without reading the file (too expensive), but we can
// still learn a lot about the data distribution across the set of files by assuming even distribution within each
// file and considering the distribution of files within the set's time range.
let linear_ranges = linear_dist_ranges(
&start_level_files,
cap,
max_compact_size,
partition_id.clone(),
);
let mut first_range = true;
for range in linear_ranges {
// split at every time range of linear distribution.
if !first_range {
split_times.push(range.min - 1);
}
first_range = false;
// how many start level files are in this range?
let overlaps = start_level_files
.iter()
.filter(|f| {
f.overlaps_time_range(Timestamp::new(range.min), Timestamp::new(range.max))
})
.count();
if overlaps > 1 && range.cap > max_compact_size {
// Since we'll be splitting the start level files within this range, it would be nice to align the split times to
// the min/max times of target level files. select_split_times will use the min/max times of target level files
// as hints and see what lines up with where the range needs to be split.
let mut split_hints: Vec<i64> =
Vec::with_capacity(range.cap * 2 / max_compact_size + 1);
// split time is the last time included in the 'left' side of the split. Our goal with these hints is to avoid
// overlaps with L1 files: we'd like the 'left' file to end before an L1 file starts (split=min-1), or it can
// include up to the last ns of the L1 file (split=max).
for f in &target_level_files {
if f.min_time.get() - 1 > range.min && f.min_time.get() < range.max {
split_hints.push(f.min_time.get() - 1);
}
if f.max_time.get() > range.min && f.max_time.get() < range.max {
split_hints.push(f.max_time.get());
}
}
// We may have started splitting files, and now there's a new L0 added that spans our previous splitting.
// We'll detect multiple L0 files ending at the same time, and add that to the split hints.
let end_times = start_level_files
.iter()
.map(|f| f.max_time.get())
.sorted()
.dedup_with_count();
for (count, time) in end_times {
if count > 1 {
// whether we previously split here or not, with at least 2 L0s ending here, it's a good place to split.
split_hints.push(time);
}
}
let splits = select_split_times(
range.cap,
max_compact_size,
range.min,
range.max,
split_hints.clone(),
);
split_times.extend(splits);
}
}
}
split_times.sort();
split_times.dedup();
split_times
}
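// An illustrative helper (not in the original file): the end-time hint logic
// above, isolated. Any timestamp where at least two start-level files end is
// a good vertical-split candidate, whether or not we split there before.
// Hypothetical associated function, for illustration only.
#[allow(dead_code)]
fn end_time_split_hints(start_level_files: &[ParquetFile]) -> Vec<i64> {
    start_level_files
        .iter()
        .map(|f| f.max_time.get())
        .sorted()
        .dedup_with_count()
        .filter(|(count, _)| *count > 1)
        .map(|(_, time)| time)
        .collect()
}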
// derive_draft_ranges takes a last round info option and a vec of files - one of them must be populated.
// From this, we'll get a draft of CompactRanges for the current round of compaction. It's a draft because
// it's only partially set up, having only the files, min, max, and cap set. The op, branches, and files_for_later
// will be determined shortly.
// We split up into several ranges to keep the L0->L1 compaction simple (the overlaps allowed in L0 make it messy).
// But we don't want to create artificial divisions in L2, so L2s get set aside until we've consolidated to
// a single CompactRange.
fn derive_draft_ranges(
&self,
partition_info: &PartitionInfo,
last_round_info: Option<Arc<RoundInfo>>,
files: Vec<ParquetFile>,
) -> (Vec<CompactRange>, Option<Vec<ParquetFile>>) {
// We require exactly 1 source of information: either 'files' because this is the first round, or 'last_round_info' from the prior round.
if let Some(last_round_info) = last_round_info {
assert!(
files.is_empty(),
"last_round_info and files must not both be populated"
);
self.evaluate_prior_ranges(partition_info, last_round_info)
} else {
assert!(
!files.is_empty(),
"last_round_info and files must not both be empty"
);
// This is the first round, so no prior round info.
// We'll take a look at 'files' and see what we can do.
self.split_files_into_ranges(files)
}
}
// evaluate_prior_ranges is a helper function for derive_draft_ranges, used when there is prior round info.
// It takes the prior round's ranges, and splits them if they did vertical splitting, or combines them if
// they finished compacting their L0s.
fn evaluate_prior_ranges(
&self,
partition_info: &PartitionInfo,
last_round_info: Arc<RoundInfo>,
) -> (Vec<CompactRange>, Option<Vec<ParquetFile>>) {
// We'll start with the ranges from the prior round.
let mut ranges = Vec::with_capacity(last_round_info.ranges.len());
// As we iterate through the last_round_info's ranges, we'll try to consolidate ranges for any that don't have L0s.
let mut prior_range: Option<CompactRange> = None;
for range in &last_round_info.ranges {
// The prior round should have handled its `files_for_now`, so that should be `None`.
// What the prior round considered `files_for_later` will now become `files_for_now`.
assert!(
range.files_for_now.lock().unwrap().is_none(),
"files_for_now should be empty for range {}->{} on partition {}",
range.min,
range.max,
partition_info.partition_id()
);
assert!(
range.branches.lock().unwrap().is_none(),
"branches should be empty for range {}->{} on partition {}",
range.min,
range.max,
partition_info.partition_id()
);
let files_for_now = range.files_for_later.lock().unwrap().take();
assert!(
files_for_now.is_some(),
"files_for_later should not be None for range {}->{} on partition {}",
range.min,
range.max,
partition_info.partition_id()
);
let mut files_for_now = files_for_now.unwrap();
assert!(
!files_for_now.is_empty(),
"files_for_later should not be empty for range {}->{} on partition {}",
range.min,
range.max,
partition_info.partition_id()
);
if let Some(split_times) = range.op.split_times() {
// In the prior round, this range did vertical splitting. Those split times now divide this range into several ranges.
if prior_range.is_some() {
ranges.push(prior_range.unwrap());
prior_range = None;
}
let mut split_ranges = Vec::with_capacity(split_times.len());
let mut max = range.max;
for split_time in split_times.into_iter().rev() {
// By iterating in reverse, everything above the split time is in this split
let this_split_files_for_now: Vec<ParquetFile>;
(this_split_files_for_now, files_for_now) = files_for_now
.into_iter()
.partition(|f| f.max_time.get() > split_time);
let cap = this_split_files_for_now
.iter()
.map(|f| f.file_size_bytes as usize)
.sum::<usize>();
let this_split_files_for_now = if this_split_files_for_now.is_empty() {
None
} else {
Some(this_split_files_for_now.clone())
};
split_ranges.insert(
0,
CompactRange {
op: CompactType::Deferred {},
min: split_time + 1,
max,
cap,
has_l0s: true,
files_for_now: Mutex::new(this_split_files_for_now),
branches: Mutex::new(None),
files_for_later: Mutex::new(None),
},
);
// split_time is the highest time in the 'left' file, so that will be max time for the next range.
max = split_time;
}
if !files_for_now.is_empty() {
let cap = files_for_now
.iter()
.map(|f| f.file_size_bytes as usize)
.sum::<usize>();
let files_for_now = Some(files_for_now.clone());
split_ranges.insert(
0,
CompactRange {
op: CompactType::Deferred {},
min: range.min,
max,
cap,
has_l0s: true,
files_for_now: Mutex::new(files_for_now),
branches: Mutex::new(None),
files_for_later: Mutex::new(None),
},
);
}
ranges.append(&mut split_ranges);
} else {
// Carry forward the prior range
let has_l0s = files_for_now
.iter()
.any(|f| f.compaction_level == CompactionLevel::Initial);
if prior_range.is_some() && (!prior_range.as_mut().unwrap().has_l0s || !has_l0s) {
// This and the prior range don't both have L0s; we can consolidate.
let prior = prior_range.as_mut().unwrap();
prior.max = range.max;
prior.cap += range.cap;
prior.has_l0s = prior.has_l0s || has_l0s;
prior.add_files_for_now(files_for_now);
} else {
if let Some(prior_range) = prior_range {
// we'll not be consolidating with the prior range, so push it
ranges.push(prior_range);
}
let files_for_now = if files_for_now.is_empty() {
None
} else {
Some(files_for_now.clone())
};
let this_range = CompactRange {
op: range.op.clone(),
min: range.min,
max: range.max,
cap: range.cap,
has_l0s,
files_for_now: Mutex::new(files_for_now),
branches: Mutex::new(None),
files_for_later: Mutex::new(None),
};
prior_range = Some(this_range);
};
}
}
if let Some(prior_range) = prior_range {
ranges.push(prior_range);
}
// If we still have several ranges, L2s (if any) need to stay in round_info.files_for_later. If we have 1 range
// without L0s, the L2s can go in that range.
let mut deferred_l2s = last_round_info.take_l2_files_for_later();
if ranges.len() == 1 && !ranges[0].has_l0s && deferred_l2s.is_some() {
ranges[0].add_files_for_now(deferred_l2s.unwrap());
deferred_l2s = None;
}
(ranges, deferred_l2s)
}
// split_files_into_ranges is a helper function for derive_draft_ranges, used when there is no prior round info.
// It's given the files found in the catalog and puts them into range(s).
fn split_files_into_ranges(
&self,
files: Vec<ParquetFile>,
) -> (Vec<CompactRange>, Option<Vec<ParquetFile>>) {
let (l0_files, other_files): (Vec<ParquetFile>, Vec<ParquetFile>) = files
.into_iter()
.partition(|f| f.compaction_level == CompactionLevel::Initial);
if !l0_files.is_empty() {
// We'll get all the L0 files compacted to L1 before dealing with L2 files - so separate them.
let (l2_files_for_later, mut l1_files): (Vec<ParquetFile>, Vec<ParquetFile>) =
other_files
.into_iter()
.partition(|f| f.compaction_level == CompactionLevel::Final);
// Break up the start level files into chains of files that overlap each other.
// Then we'll determine if vertical splitting is needed within each chain.
let chains = split_into_chains(l0_files);
// This function is detecting what ranges we already have, not identifying splitting to make ranges we want.
// So we may have to combine some chains based on L1s overlapping.
let chains = merge_l1_spanned_chains(chains, &l1_files);
// the goal is nice bite sized chains. If some are very small, merge them with their neighbor(s).
let chains = merge_small_l0_chains(chains, self.max_total_file_size_per_plan);
let mut ranges = Vec::with_capacity(chains.len());
let mut this_split: Vec<ParquetFile>;
for mut chain in chains {
let mut max = chain.iter().map(|f| f.max_time).max().unwrap().get();
// 'chain' is the L0s that will become a region. We also need the L1s and L2s that belong in this region.
(this_split, l1_files) =
l1_files.into_iter().partition(|f| f.min_time.get() <= max);
if !this_split.is_empty() {
max = max.max(this_split.iter().map(|f| f.max_time).max().unwrap().get());
}
this_split.append(&mut chain);
let min = this_split.iter().map(|f| f.min_time).min().unwrap().get();
let cap = this_split
.iter()
.map(|f| f.file_size_bytes as usize)
.sum::<usize>();
ranges.push(CompactRange {
op: CompactType::Deferred {},
min,
max,
cap,
has_l0s: true,
files_for_now: Mutex::new(Some(this_split)),
branches: Mutex::new(None),
files_for_later: Mutex::new(None),
});
}
this_split = l1_files;
if !this_split.is_empty() {
let min = this_split.iter().map(|f| f.min_time).min().unwrap().get();
let max = this_split.iter().map(|f| f.max_time).max().unwrap().get();
let cap = this_split
.iter()
.map(|f| f.file_size_bytes as usize)
.sum::<usize>();
ranges.push(CompactRange {
op: CompactType::Deferred {},
min,
max,
cap,
has_l0s: true,
files_for_now: Mutex::new(Some(this_split)),
branches: Mutex::new(None),
files_for_later: Mutex::new(None),
});
}
let l2_files_for_later = if l2_files_for_later.is_empty() {
None
} else {
Some(l2_files_for_later)
};
(ranges, l2_files_for_later)
} else {
// No start level files, we can put everything in one range.
let min = other_files.iter().map(|f| f.min_time).min().unwrap().get();
let max = other_files.iter().map(|f| f.max_time).max().unwrap().get();
let cap = other_files
.iter()
.map(|f| f.file_size_bytes as usize)
.sum::<usize>();
(
vec![CompactRange {
op: CompactType::Deferred {},
min,
max,
cap,
has_l0s: false,
files_for_now: Mutex::new(Some(other_files)),
branches: Mutex::new(None),
files_for_later: Mutex::new(None),
}],
None,
)
}
}
}
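// A self-contained sketch of the chain idea used in split_files_into_ranges:
// sort intervals by start time, then group transitively overlapping intervals
// into chains. The crate's split_into_chains works on ParquetFiles; plain
// (min, max) pairs stand in here as an assumption.
#[cfg(test)]
#[test]
fn overlap_chain_sketch() {
    fn chains(mut ivs: Vec<(i64, i64)>) -> Vec<Vec<(i64, i64)>> {
        ivs.sort_by_key(|iv| iv.0);
        let mut out: Vec<Vec<(i64, i64)>> = Vec::new();
        let mut chain_max = i64::MIN;
        for iv in ivs {
            if !out.is_empty() && iv.0 <= chain_max {
                // Overlaps (or touches) the current chain: extend it.
                chain_max = chain_max.max(iv.1);
                out.last_mut().unwrap().push(iv);
            } else {
                // Gap found (or first interval): start a new chain.
                chain_max = iv.1;
                out.push(vec![iv]);
            }
        }
        out
    }
    // [0,10] and [5,20] overlap; [30,40] starts a new chain.
    assert_eq!(
        chains(vec![(5, 20), (0, 10), (30, 40)]),
        vec![vec![(0, 10), (5, 20)], vec![(30, 40)]]
    );
}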
#[async_trait]
impl RoundInfoSource for LevelBasedRoundInfo {
// The calculated RoundInfo is the most impactful decision for this round of compactions.
// Later decisions should be just working out details to implement what RoundInfo dictates.
async fn calculate(
&self,
components: Arc<Components>,
last_round_info: Option<Arc<RoundInfo>>,
partition_info: &PartitionInfo,
files: Vec<ParquetFile>,
) -> Result<(Arc<RoundInfo>, bool), DynError> {
// Step 1: Establish range boundaries, with files in each range.
let (prior_ranges, mut l2_files_for_later) =
self.derive_draft_ranges(partition_info, last_round_info, files);
let range_cnt = prior_ranges.len();
// Step 2: Determine the op for each range.
let mut ranges: Vec<CompactRange> = Vec::with_capacity(range_cnt);
for mut range in prior_ranges {
let files_for_now = range.files_for_now.lock().unwrap().take();
assert!(
files_for_now.is_some(),
"files_for_now should not be None for range {}->{} on partition {}",
range.min,
range.max,
partition_info.partition_id()
);
let files_for_now = files_for_now.unwrap();
// If we're down to a single range, we should check if we're done.
if range_cnt == 1
&& !components
.partition_filter
.apply(partition_info, &files_for_now)
.await?
{
return Ok((
Arc::new(RoundInfo {
ranges,
l2_files_for_later: Mutex::new(l2_files_for_later),
}),
true,
));
}
range.has_l0s = files_for_now
.iter()
.any(|f| f.compaction_level == CompactionLevel::Initial);
if range.has_l0s {
let split_times = self.consider_vertical_splitting(
partition_info.partition_id(),
files_for_now.clone().to_vec(),
self.max_total_file_size_per_plan,
);
if !split_times.is_empty() {
range.op = CompactType::VerticalSplit { split_times };
} else if self
.too_many_small_files_to_compact(&files_for_now, CompactionLevel::Initial)
{
range.op = CompactType::ManySmallFiles {
start_level: CompactionLevel::Initial,
max_num_files_to_group: self.max_num_files_per_plan,
max_total_file_size_to_group: self.max_total_file_size_per_plan,
};
} else {
range.op = CompactType::TargetLevel {
target_level: CompactionLevel::FileNonOverlapped,
max_total_file_size_to_group: self.max_total_file_size_per_plan,
};
}
} else if range_cnt == 1 {
range.op = CompactType::TargetLevel {
target_level: CompactionLevel::Final,
max_total_file_size_to_group: self.max_total_file_size_per_plan,
};
} else {
// The L0s of this range are compacted, but this range needs to hang out a while until its neighbors catch up.
range.op = CompactType::Deferred {};
};
if range.op.is_deferred() {
range.add_files_for_later(files_for_now);
ranges.push(range);
} else {
// start_level is usually the lowest level we have files in, but occasionally we decide to
// compact L1->L2 when L0s still exist. If this comes back as L1, we'll ignore L0s for this
// round and force an early L1-L2 compaction.
let (files_for_now, mut files_later) = components.round_split.split(
files_for_now,
range.op.clone(),
partition_info.partition_id(),
);
let (branches, more_for_later) = components.divide_initial.divide(
files_for_now,
range.op.clone(),
partition_info.partition_id(),
);
files_later.extend(more_for_later);
if !branches.is_empty() {
range.branches = Mutex::new(Some(branches));
} // else, leave it None, since Some is assumed to be non-empty.
if !files_later.is_empty() {
range.files_for_later = Mutex::new(Some(files_later));
} // else, leave it None, since Some is assumed to be non-empty.
ranges.push(range);
}
}
if ranges.len() == 1 && !ranges[0].has_l0s && l2_files_for_later.is_some() {
// Single range without L0s, it's time to work on the L2s.
ranges[0].add_files_for_now(l2_files_for_later.unwrap());
l2_files_for_later = None;
}
Ok((
Arc::new(RoundInfo {
ranges,
l2_files_for_later: Mutex::new(l2_files_for_later),
}),
false,
))
}
}
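// A condensed sketch of the per-range op selection in `calculate` above, with
// the crate's predicates collapsed into booleans. The enum and names here are
// illustrative only; the real decision consults consider_vertical_splitting
// and too_many_small_files_to_compact on the actual files.
#[cfg(test)]
#[test]
fn op_selection_sketch() {
    #[derive(Debug, PartialEq)]
    enum Op {
        VerticalSplit,
        ManySmallFiles,
        CompactToL1,
        CompactToL2,
        Deferred,
    }
    fn choose(has_l0s: bool, wants_split: bool, too_many_small: bool, single_range: bool) -> Op {
        if has_l0s {
            if wants_split {
                Op::VerticalSplit // split wide L0s before anything else
            } else if too_many_small {
                Op::ManySmallFiles // batch up small L0s first
            } else {
                Op::CompactToL1 // ordinary L0 -> L1 compaction
            }
        } else if single_range {
            Op::CompactToL2 // L0s are done and there are no neighbors to wait for
        } else {
            Op::Deferred // wait for neighboring ranges to catch up
        }
    }
    assert_eq!(choose(true, true, true, false), Op::VerticalSplit);
    assert_eq!(choose(true, false, true, false), Op::ManySmallFiles);
    assert_eq!(choose(false, false, false, false), Op::Deferred);
}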
fn get_num_overlapped_files(
start_level_files: Vec<&ParquetFile>,
next_level_files: Vec<&ParquetFile>,
) -> usize {
// min_time and max_time of files in start_level
let (min_time, max_time) =
start_level_files
.iter()
.fold((None, None), |(min_time, max_time), f| {
let min_time = min_time
.map(|v: Timestamp| v.min(f.min_time))
.unwrap_or(f.min_time);
let max_time = max_time
.map(|v: Timestamp| v.max(f.max_time))
.unwrap_or(f.max_time);
(Some(min_time), Some(max_time))
});
// There must be values, otherwise panic
let min_time = min_time.unwrap();
let max_time = max_time.unwrap();
// number of files in next level that overlap with files in start_level
let count_overlapped = next_level_files
.iter()
.filter(|f| f.min_time <= max_time && f.max_time >= min_time)
.count();
count_overlapped
}
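// A worked sketch of the interval test above: two inclusive ranges [a0, a1]
// and [b0, b1] overlap iff a0 <= b1 && a1 >= b0. With start-level files
// spanning [10, 30]: [25, 40] overlaps, [5, 10] touches the boundary (and
// counts), while [31, 40] does not.
#[cfg(test)]
#[test]
fn overlap_predicate_sketch() {
    let overlaps =
        |f: (i64, i64), min_time: i64, max_time: i64| f.0 <= max_time && f.1 >= min_time;
    assert!(overlaps((25, 40), 10, 30));
    assert!(overlaps((5, 10), 10, 30)); // boundary touch counts as overlap
    assert!(!overlaps((31, 40), 10, 30));
}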
#[cfg(test)]
mod tests {
use data_types::CompactionLevel;
use iox_tests::ParquetFileBuilder;
use crate::components::round_info_source::LevelBasedRoundInfo;
#[test]
fn test_too_many_small_files_to_compact() {
// L0 files
let f1 = ParquetFileBuilder::new(1)
.with_time_range(0, 100)
.with_compaction_level(CompactionLevel::Initial)
.with_max_l0_created_at(0)
.build();
let f2 = ParquetFileBuilder::new(2)
.with_time_range(0, 100)
.with_compaction_level(CompactionLevel::Initial)
.with_max_l0_created_at(2)
.build();
// non overlapping L1 file
let f3 = ParquetFileBuilder::new(3)
.with_time_range(101, 200)
.with_compaction_level(CompactionLevel::FileNonOverlapped)
.build();
// overlapping L1 file
let f4 = ParquetFileBuilder::new(4)
.with_time_range(50, 150)
.with_compaction_level(CompactionLevel::FileNonOverlapped)
.build();
// max 2 files per plan
let round_info = LevelBasedRoundInfo {
max_num_files_per_plan: 2,
max_total_file_size_per_plan: 1000,
};
// f1 and f2 are not over limit
assert!(!round_info
.too_many_small_files_to_compact(&[f1.clone(), f2.clone()], CompactionLevel::Initial));
// f1, f2 and f3 are not over limit
assert!(!round_info.too_many_small_files_to_compact(
&[f1.clone(), f2.clone(), f3.clone()],
CompactionLevel::Initial
));
// f1, f2 and f4 are over limit
assert!(round_info.too_many_small_files_to_compact(
&[f1.clone(), f2.clone(), f4.clone()],
CompactionLevel::Initial
));
// f1, f2, f3 and f4 are over limit
assert!(
round_info.too_many_small_files_to_compact(&[f1, f2, f3, f4], CompactionLevel::Initial)
);
}
}


@ -1,174 +0,0 @@
use std::fmt::Display;
use data_types::{CompactionLevel, ParquetFile, TransitionPartitionId};
use crate::round_info::CompactType;
use super::RoundSplit;
#[derive(Debug, Default)]
pub struct ManyFilesRoundSplit;
impl ManyFilesRoundSplit {
pub fn new() -> Self {
Self
}
}
impl Display for ManyFilesRoundSplit {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "many_files")
}
}
// TODO(joe): maintain this comment through the next few PRs; see how true the comment is/remains.
// split is the first of three file list manipulation layers. Based on the RoundInfo, split will do
// some simple filtering to remove files easily identifiable as not relevant to this round.
impl RoundSplit for ManyFilesRoundSplit {
fn split(
&self,
files: Vec<ParquetFile>,
op: CompactType,
partition: TransitionPartitionId,
) -> (Vec<ParquetFile>, Vec<ParquetFile>) {
// Specify specific arms to avoid missing any new variants
match op {
CompactType::ManySmallFiles { start_level, .. } => {
// Split start_level from the rest
let (start_level_files, rest) = files
.into_iter()
.partition(|f| f.compaction_level == start_level);
(start_level_files, rest)
}
// A TargetLevel round only needs its start (source) and target (destination) levels.
// All other files are a distraction that should wait for another round.
CompactType::TargetLevel { target_level, .. } => {
// Split start_level & target level from the rest
let start_level = target_level.prev();
let (start_files, rest) = files.into_iter().partition(|f| {
f.compaction_level == start_level
|| f.compaction_level == target_level
|| f.compaction_level == CompactionLevel::Final
});
(start_files, rest)
}
CompactType::SimulatedLeadingEdge { .. } => {
// Split first two levels from the rest
let (start_files, rest) = files.into_iter().partition(|f| {
f.compaction_level == CompactionLevel::Initial
|| f.compaction_level == CompactionLevel::FileNonOverlapped
});
(start_files, rest)
}
CompactType::VerticalSplit { split_times } => {
// We're splitting L0 files at split_times. So any L0 that overlaps a split_time needs to be processed, and all other files are ignored until later.
let (split_files, rest): (Vec<ParquetFile>, Vec<ParquetFile>) =
files.into_iter().partition(|f| {
f.compaction_level != CompactionLevel::Final && f.needs_split(&split_times)
});
assert!(
!split_files.is_empty(),
"if we decided to split, there should be something to split, instead found no split_files for partition {}",
partition
);
(split_files, rest)
}
CompactType::Deferred { .. } => {
// Nothing now, it's all for later
(vec![], files)
}
}
}
}
#[cfg(test)]
mod tests {
use data_types::{CompactionLevel, PartitionId};
use iox_tests::ParquetFileBuilder;
use super::*;
#[test]
fn test_display() {
assert_eq!(ManyFilesRoundSplit::new().to_string(), "many_files");
}
#[test]
fn test_split_many_files() {
let op = CompactType::ManySmallFiles {
start_level: CompactionLevel::Initial,
max_num_files_to_group: 2,
max_total_file_size_to_group: 100,
};
let split = ManyFilesRoundSplit::new();
let default_partition = TransitionPartitionId::Deprecated(PartitionId::new(0));
// empty input
assert_eq!(
split.split(vec![], op.clone(), default_partition.clone()),
(vec![], vec![])
);
// all L0
let f1 = ParquetFileBuilder::new(1)
.with_compaction_level(CompactionLevel::Initial)
.build();
let f2 = ParquetFileBuilder::new(2)
.with_compaction_level(CompactionLevel::Initial)
.build();
assert_eq!(
split.split(
vec![f1.clone(), f2.clone()],
op.clone(),
default_partition.clone()
),
(vec![f1.clone(), f2.clone()], vec![])
);
// some L0, some L1, and some L2
let f3 = ParquetFileBuilder::new(3)
.with_compaction_level(CompactionLevel::FileNonOverlapped)
.build();
let f4 = ParquetFileBuilder::new(4)
.with_compaction_level(CompactionLevel::Final)
.build();
assert_eq!(
split.split(
vec![f1.clone(), f2.clone(), f3.clone(), f4.clone()],
op.clone(),
default_partition,
),
(vec![f1, f2], vec![f3, f4])
);
}
#[test]
fn test_split_target_level() {
let op = CompactType::TargetLevel {
target_level: CompactionLevel::Final,
max_total_file_size_to_group: 100 * 1024 * 1024,
};
let split = ManyFilesRoundSplit::new();
let default_partition = TransitionPartitionId::Deprecated(PartitionId::new(0));
// empty input
assert_eq!(
split.split(vec![], op.clone(), default_partition.clone()),
(vec![], vec![])
);
// non empty
let f1 = ParquetFileBuilder::new(1).build();
let f2 = ParquetFileBuilder::new(2).build();
assert_eq!(
split.split(vec![f1.clone(), f2.clone()], op.clone(), default_partition),
(vec![f1, f2], vec![])
);
}
}


@ -1,21 +0,0 @@
use std::fmt::{Debug, Display};
use data_types::{ParquetFile, TransitionPartitionId};
use crate::round_info::CompactType;
pub mod many_files;
pub trait RoundSplit: Debug + Display + Send + Sync {
/// Split files into two buckets "now" and "later".
///
/// All files belong to the same partition.
///
/// - **now:** will be processed in this round
/// - **later:** will be processed in the next round
fn split(
&self,
files: Vec<ParquetFile>,
op: CompactType,
partition: TransitionPartitionId,
) -> (Vec<ParquetFile>, Vec<ParquetFile>);
}
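// A minimal sketch of implementing this trait, assuming the surrounding
// imports are in scope: a splitter that defers everything, mirroring the
// CompactType::Deferred arm of ManyFilesRoundSplit.
#[derive(Debug, Default)]
pub struct DeferAllRoundSplit;

impl Display for DeferAllRoundSplit {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "defer_all")
    }
}

impl RoundSplit for DeferAllRoundSplit {
    fn split(
        &self,
        files: Vec<ParquetFile>,
        _op: CompactType,
        _partition: TransitionPartitionId,
    ) -> (Vec<ParquetFile>, Vec<ParquetFile>) {
        // Nothing is processed now; everything waits for a later round.
        (vec![], files)
    }
}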


@ -1,58 +0,0 @@
use std::fmt::{Debug, Display};
use std::sync::Arc;
use async_trait::async_trait;
use parquet_file::ParquetFilePath;
use uuid::Uuid;
pub mod noop;
pub mod prod;
mod util;
#[cfg(test)]
mod test_util;
/// Create a [`Scratchpad`] for use as intermediate storage
pub trait ScratchpadGen: Debug + Display + Send + Sync {
fn pad(&self) -> Arc<dyn Scratchpad>;
}
/// An intermediate in-memory store (can be a disk later if we want)
/// to stage all inputs and outputs of the compaction. The reasons
/// are:
///
/// **fewer IO ops:** DataFusion's streaming IO requires slightly more IO
/// requests (at least 2 per file) due to the way it is optimized to
/// read as little as possible. It first reads the metadata and then
/// decides which content to fetch. In the compaction case this is
/// (esp. w/o delete predicates) EVERYTHING. So in contrast to the
/// querier, there is no advantage to this approach. On the contrary, it
/// easily adds 100ms latency to every single input file.
///
/// **less traffic**: For divide&conquer partitions (i.e. when we need
/// to run multiple compaction steps to deal with them) it is kinda
/// pointless to upload an intermediate result just to download it
/// again. The scratchpad avoids that.
///
/// **higher throughput**: We want to limit the number of concurrent
/// DataFusion jobs because we don't wanna blow up the whole process
/// by having too much in-flight arrow data at the same time. However,
/// while we performed the actual computation, we were waiting for
/// object store IO. This was limiting our throughput substantially.
///
/// **shadow mode**: De-coupling the stores in this way makes it easier
/// to implement compactor: shadow mode #6645. Shadow mode relies on
/// leaving the compaction output in the scratchpad so
/// `clean_written_from_scratchpad` is a no-op for shadow mode.
///
/// Note that we assume here that the input parquet files are WAY
/// SMALLER than the uncompressed Arrow data during compaction itself.
#[async_trait]
pub trait Scratchpad: Debug + Send + Sync + 'static {
fn uuids(&self, files: &[ParquetFilePath]) -> Vec<Uuid>;
async fn load_to_scratchpad(&self, files: &[ParquetFilePath]) -> Vec<Uuid>;
async fn make_public(&self, files: &[ParquetFilePath]) -> Vec<Uuid>;
async fn clean_from_scratchpad(&self, files: &[ParquetFilePath]);
async fn clean_written_from_scratchpad(&self, files: &[ParquetFilePath]);
async fn clean(&self);
}
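// A call-sequence sketch (the compaction step in the middle stands in for
// DataFusion work that is not part of this trait): stage inputs into the
// scratchpad, compact, publish the outputs, then clean both sets so the
// scratchpad stays small.
#[allow(dead_code)]
async fn compact_via_scratchpad(
    pad: &dyn Scratchpad,
    inputs: &[ParquetFilePath],
    outputs: &[ParquetFilePath],
) {
    let _input_uuids = pad.load_to_scratchpad(inputs).await; // input store -> scratchpad
    // ... run the compaction against the scratchpad copies here ...
    let _output_uuids = pad.make_public(outputs).await; // scratchpad -> output store
    pad.clean_from_scratchpad(inputs).await; // inputs are no longer needed
    pad.clean_written_from_scratchpad(outputs).await; // no-op in shadow mode
}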


@ -1,53 +0,0 @@
use std::{fmt::Display, sync::Arc};
use async_trait::async_trait;
use parquet_file::ParquetFilePath;
use uuid::Uuid;
use super::{Scratchpad, ScratchpadGen};
/// A scratchpad that ignores all inputs and outputs, for use in testing
#[derive(Debug, Default)]
pub struct NoopScratchpadGen;
impl NoopScratchpadGen {
pub fn new() -> Self {
Self
}
}
impl Display for NoopScratchpadGen {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "noop")
}
}
impl ScratchpadGen for NoopScratchpadGen {
fn pad(&self) -> Arc<dyn Scratchpad> {
Arc::new(NoopScratchpad)
}
}
#[derive(Debug)]
struct NoopScratchpad;
#[async_trait]
impl Scratchpad for NoopScratchpad {
fn uuids(&self, files: &[ParquetFilePath]) -> Vec<Uuid> {
files.iter().map(|f| f.objest_store_id()).collect()
}
async fn load_to_scratchpad(&self, files: &[ParquetFilePath]) -> Vec<Uuid> {
files.iter().map(|f| f.objest_store_id()).collect()
}
async fn make_public(&self, files: &[ParquetFilePath]) -> Vec<Uuid> {
files.iter().map(|f| f.objest_store_id()).collect()
}
async fn clean_from_scratchpad(&self, _files: &[ParquetFilePath]) {}
async fn clean_written_from_scratchpad(&self, _files: &[ParquetFilePath]) {}
async fn clean(&self) {}
}


@ -1,551 +0,0 @@
use std::{
collections::{hash_map::Entry, HashMap},
fmt::Display,
num::NonZeroUsize,
sync::{Arc, RwLock},
};
use async_trait::async_trait;
use backoff::BackoffConfig;
use object_store::DynObjectStore;
use observability_deps::tracing::warn;
use parquet_file::ParquetFilePath;
use uuid::Uuid;
use super::{
util::{copy_files, delete_files},
Scratchpad, ScratchpadGen,
};
#[derive(Debug)]
pub struct ProdScratchpadGen {
concurrency: NonZeroUsize,
shadow_mode: bool,
backoff_config: BackoffConfig,
store_input: Arc<DynObjectStore>,
store_scratchpad: Arc<DynObjectStore>,
store_output: Arc<DynObjectStore>,
}
impl ProdScratchpadGen {
pub fn new(
shadow_mode: bool,
concurrency: NonZeroUsize,
backoff_config: BackoffConfig,
store_input: Arc<DynObjectStore>,
store_scratchpad: Arc<DynObjectStore>,
store_output: Arc<DynObjectStore>,
) -> Self {
Self {
shadow_mode,
concurrency,
backoff_config,
store_input,
store_scratchpad,
store_output,
}
}
}
impl Display for ProdScratchpadGen {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "prod")
}
}
/// ScratchpadGen is the factory pattern; it creates Scratchpads
impl ScratchpadGen for ProdScratchpadGen {
fn pad(&self) -> Arc<dyn Scratchpad> {
Arc::new(ProdScratchpad {
shadow_mode: self.shadow_mode,
concurrency: self.concurrency,
backoff_config: self.backoff_config.clone(),
store_input: Arc::clone(&self.store_input),
store_scratchpad: Arc::clone(&self.store_scratchpad),
store_output: Arc::clone(&self.store_output),
mask: Uuid::new_v4(),
files_unmasked: RwLock::new(HashMap::default()),
})
}
}
struct ProdScratchpad {
shadow_mode: bool,
concurrency: NonZeroUsize,
backoff_config: BackoffConfig,
store_input: Arc<DynObjectStore>,
store_scratchpad: Arc<DynObjectStore>,
store_output: Arc<DynObjectStore>,
mask: Uuid,
/// Set of known, unmasked files.
///
/// If the file is part of this map, it is in the scratchpad. If the boolean value is set, the file was already
/// copied to the output store.
files_unmasked: RwLock<HashMap<ParquetFilePath, bool>>,
}
impl std::fmt::Debug for ProdScratchpad {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let ref_files_unmasked = self.files_unmasked.read().unwrap();
f.debug_struct("ProdScratchpad")
.field("concurrency", &self.concurrency)
.field("backoff_config", &self.backoff_config)
.field("store_input", &self.store_input)
.field("store_scratchpad", &self.store_scratchpad)
.field("store_output", &self.store_output)
.field("mask", &self.mask)
.field("files_unmasked", &ref_files_unmasked)
.finish()
}
}
impl ProdScratchpad {
fn apply_mask(&self, files: &[ParquetFilePath]) -> (Vec<ParquetFilePath>, Vec<Uuid>) {
files
.iter()
.map(|f| {
let uuid = Self::xor_uuids(f.objest_store_id(), self.mask);
let f = (f.clone()).with_object_store_id(uuid);
(f, uuid)
})
.unzip()
}
fn xor_uuids(a: Uuid, b: Uuid) -> Uuid {
Uuid::from_u128(a.as_u128() ^ b.as_u128())
}
fn check_known(
&self,
files_unmasked: &[ParquetFilePath],
files_masked: &[ParquetFilePath],
output: bool,
) -> (Vec<ParquetFilePath>, Vec<ParquetFilePath>) {
let mut ref_files_unmasked = self.files_unmasked.write().unwrap();
files_unmasked
.iter()
.zip(files_masked)
.filter(|(f_unmasked, _f_masked)| {
match ref_files_unmasked.entry((*f_unmasked).clone()) {
Entry::Occupied(mut o) => {
let old_var = *o.get();
*o.get_mut() |= output;
output && !old_var
}
Entry::Vacant(v) => {
v.insert(output);
true
}
}
})
.map(|(un, masked)| (un.clone(), masked.clone()))
.unzip()
}
}
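// The XOR mask is an involution: applying the same per-scratchpad mask twice
// restores the original UUID, which is why apply_mask can translate object
// store ids in either direction. A quick standalone check of that property:
#[cfg(test)]
#[test]
fn xor_mask_roundtrip() {
    let original = Uuid::new_v4();
    let mask = Uuid::new_v4();
    let masked = Uuid::from_u128(original.as_u128() ^ mask.as_u128());
    let unmasked = Uuid::from_u128(masked.as_u128() ^ mask.as_u128());
    assert_eq!(original, unmasked);
}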
impl Drop for ProdScratchpad {
fn drop(&mut self) {
let mut ref_files_unmasked = self.files_unmasked.write().unwrap();
if !ref_files_unmasked.is_empty() {
warn!("scratchpad context not cleaned, may leak resources");
// clean up eventually
// Note: Use manual clean up code and do not create yet-another ProdScratchpad to avoid infinite recursions
// during drop.
let files = ref_files_unmasked
.drain()
.map(|(k, _in_out)| k)
.collect::<Vec<_>>();
let (files_masked, _uuids) = self.apply_mask(&files);
let store_scratchpad = Arc::clone(&self.store_scratchpad);
let concurrency = self.concurrency;
let backoff_config = self.backoff_config.clone();
tokio::spawn(async move {
delete_files(
&files_masked,
Arc::clone(&store_scratchpad),
&backoff_config,
concurrency,
)
.await;
});
}
}
}
#[async_trait]
impl Scratchpad for ProdScratchpad {
fn uuids(&self, files: &[ParquetFilePath]) -> Vec<Uuid> {
let (_, uuids) = self.apply_mask(files);
uuids
}
async fn load_to_scratchpad(&self, files: &[ParquetFilePath]) -> Vec<Uuid> {
let (files_to, uuids) = self.apply_mask(files);
let (files_from, files_to) = self.check_known(files, &files_to, false);
copy_files(
&files_from,
&files_to,
Arc::clone(&self.store_input),
Arc::clone(&self.store_scratchpad),
&self.backoff_config,
self.concurrency,
)
.await;
uuids
}
async fn make_public(&self, files: &[ParquetFilePath]) -> Vec<Uuid> {
let (files_to, uuids) = self.apply_mask(files);
// only keep files that we did not know about; all others have already been synced between the two stores
let (files_to, files_from) = self.check_known(&files_to, files, true);
copy_files(
&files_from,
&files_to,
Arc::clone(&self.store_scratchpad),
Arc::clone(&self.store_output),
&self.backoff_config,
self.concurrency,
)
.await;
uuids
}
// clean_from_scratchpad selectively removes some files from the scratchpad.
// This should be called after uploading files to the object store.
// Cleaning should be done regularly, so the scratchpad doesn't get too big.
async fn clean_from_scratchpad(&self, files: &[ParquetFilePath]) {
let files_masked: Vec<ParquetFilePath>;
let _uuid: Vec<Uuid>;
// scope the files_unmasked lock to protect manipulation of the scratchpad's state, but release it
// before doing the async delete of files removed from the scratchpad.
{
let mut ref_files_unmasked = self.files_unmasked.write().unwrap();
let files = files
.iter()
.filter(|f| ref_files_unmasked.remove(f).is_some())
.cloned()
.collect::<Vec<_>>();
(files_masked, _uuid) = self.apply_mask(&files);
}
delete_files(
&files_masked,
Arc::clone(&self.store_scratchpad),
&self.backoff_config,
self.concurrency,
)
.await;
}
// clean_written_from_scratchpad is the same as clean_from_scratchpad, but it does not remove files
// when in shadow mode, since in shadow mode the scratchpad is the only copy of files.
async fn clean_written_from_scratchpad(&self, files: &[ParquetFilePath]) {
if !self.shadow_mode {
self.clean_from_scratchpad(files).await;
}
}
async fn clean(&self) {
// clean will remove all files in the scratchpad as of the time files_unmasked is locked.
let files: Vec<_> = self
.files_unmasked
.read()
.unwrap()
.keys()
.cloned()
.collect();
// self.files_unmasked is locked again in clean_from_scratchpad. If another thread removes a file
// between the read above and that relock, clean_from_scratchpad will skip it.
self.clean_from_scratchpad(&files).await;
}
}
#[cfg(test)]
mod tests {
use std::time::Duration;
use test_helpers::{maybe_start_logging, tracing::TracingCapture};
use crate::components::scratchpad::test_util::{assert_content, file_path, stores};
use compactor_test_utils::list_object_store;
use super::*;
#[test]
fn test_display() {
let (store_input, store_scratchpad, store_output) = stores();
let gen = ProdScratchpadGen::new(
true,
NonZeroUsize::new(1).unwrap(),
BackoffConfig::default(),
store_input,
store_scratchpad,
store_output,
);
assert_eq!(gen.to_string(), "prod");
}
#[tokio::test]
async fn test_staging() {
maybe_start_logging();
let (store_input, store_scratchpad, store_output) = stores();
let gen = ProdScratchpadGen::new(
true,
NonZeroUsize::new(1).unwrap(),
BackoffConfig::default(),
Arc::clone(&store_input),
Arc::clone(&store_scratchpad),
Arc::clone(&store_output),
);
let pad = gen.pad();
let f1 = file_path(1);
let f2 = file_path(2);
let f3 = file_path(3);
let f4 = file_path(4);
let f5_masked = file_path(5);
let f6_masked = file_path(6);
let f7_masked = file_path(7);
for f in [&f1, &f2, &f3, &f4] {
store_input
.put(&f.object_store_path(), vec![].into())
.await
.unwrap();
}
assert_content(&store_input, [&f1, &f2, &f3, &f4]).await;
assert_content(&store_scratchpad, []).await;
assert_content(&store_output, []).await;
let early_get_uuids = pad.uuids(&[f1.clone(), f2.clone()]);
let uuids = pad.load_to_scratchpad(&[f1.clone(), f2.clone()]).await;
assert_eq!(uuids.len(), 2);
assert_eq!(early_get_uuids, uuids);
let f1_masked = f1.clone().with_object_store_id(uuids[0]);
let f2_masked = f2.clone().with_object_store_id(uuids[1]);
assert_content(&store_input, [&f1, &f2, &f3, &f4]).await;
assert_content(&store_scratchpad, [&f1_masked, &f2_masked]).await;
assert_content(&store_output, []).await;
let uuids = pad.load_to_scratchpad(&[f2.clone(), f3.clone()]).await;
assert_eq!(uuids.len(), 2);
assert_eq!(f2_masked.objest_store_id(), uuids[0]);
let f3_masked = f3.clone().with_object_store_id(uuids[1]);
assert_content(&store_input, [&f1, &f2, &f3, &f4]).await;
assert_content(&store_scratchpad, [&f1_masked, &f2_masked, &f3_masked]).await;
assert_content(&store_output, []).await;
for f in [&f5_masked, &f6_masked, &f7_masked] {
store_scratchpad
.put(&f.object_store_path(), vec![].into())
.await
.unwrap();
}
assert_content(&store_input, [&f1, &f2, &f3, &f4]).await;
assert_content(
&store_scratchpad,
[
&f1_masked, &f2_masked, &f3_masked, &f5_masked, &f6_masked, &f7_masked,
],
)
.await;
assert_content(&store_output, []).await;
let uuids = pad
.make_public(&[f5_masked.clone(), f6_masked.clone()])
.await;
assert_eq!(uuids.len(), 2);
let f5 = f5_masked.clone().with_object_store_id(uuids[0]);
let f6 = f6_masked.clone().with_object_store_id(uuids[1]);
assert_content(&store_input, [&f1, &f2, &f3, &f4]).await;
assert_content(
&store_scratchpad,
[
&f1_masked, &f2_masked, &f3_masked, &f5_masked, &f6_masked, &f7_masked,
],
)
.await;
assert_content(&store_output, [&f5, &f6]).await;
let uuids = pad.make_public(&[f1_masked.clone()]).await;
assert_eq!(uuids.len(), 1);
assert_eq!(f1.objest_store_id(), uuids[0]);
assert_content(&store_input, [&f1, &f2, &f3, &f4]).await;
assert_content(
&store_scratchpad,
[
&f1_masked, &f2_masked, &f3_masked, &f5_masked, &f6_masked, &f7_masked,
],
)
.await;
assert_content(&store_output, [&f1, &f5, &f6]).await;
// we're in shadow mode, so written (compaction output) files must not be removed.
pad.clean_written_from_scratchpad(&[f1.clone(), f5.clone()])
.await;
// they're still there
assert_content(
&store_scratchpad,
[
&f1_masked, &f2_masked, &f3_masked, &f5_masked, &f6_masked, &f7_masked,
],
)
.await;
pad.clean_from_scratchpad(&[f1.clone(), f5.clone()]).await;
assert_content(
&store_scratchpad,
[&f2_masked, &f3_masked, &f6_masked, &f7_masked],
)
.await;
// Reload a cleaned file back into the scratchpad, simulating a backlogged partition that
// requires several compaction loops (where the output of one compaction is later the input
// to a subsequent compaction).
let uuids = pad.load_to_scratchpad(&[f1.clone()]).await;
assert_eq!(uuids.len(), 1);
assert_eq!(f1_masked.objest_store_id(), uuids[0]);
assert_content(&store_input, [&f1, &f2, &f3, &f4]).await;
assert_content(
&store_scratchpad,
[&f1_masked, &f2_masked, &f3_masked, &f6_masked, &f7_masked],
)
.await;
assert_content(&store_output, [&f1, &f5, &f6]).await;
pad.clean().await;
assert_content(&store_input, [&f1, &f2, &f3, &f4]).await;
assert_content(&store_scratchpad, [&f7_masked]).await; // pad didn't know about this file
assert_content(&store_output, [&f1, &f5, &f6]).await;
}
#[tokio::test]
async fn test_collision() {
let (store_input, store_scratchpad, store_output) = stores();
let gen = ProdScratchpadGen::new(
false,
NonZeroUsize::new(1).unwrap(),
BackoffConfig::default(),
Arc::clone(&store_input),
Arc::clone(&store_scratchpad),
Arc::clone(&store_output),
);
let pad1 = gen.pad();
let pad2 = gen.pad();
let f = file_path(1);
store_input
.put(&f.object_store_path(), Default::default())
.await
.unwrap();
let uuids = pad1.load_to_scratchpad(&[f.clone()]).await;
assert_eq!(uuids.len(), 1);
let f_masked1 = f.clone().with_object_store_id(uuids[0]);
let uuids = pad2.load_to_scratchpad(&[f.clone()]).await;
assert_eq!(uuids.len(), 1);
let f_masked2 = f.with_object_store_id(uuids[0]);
assert_content(&store_scratchpad, [&f_masked1, &f_masked2]).await;
pad2.clean().await;
assert_content(&store_scratchpad, [&f_masked1]).await;
}
#[tokio::test]
async fn test_clean_on_drop() {
let (store_input, store_scratchpad, store_output) = stores();
let gen = ProdScratchpadGen::new(
false,
NonZeroUsize::new(1).unwrap(),
BackoffConfig::default(),
Arc::clone(&store_input),
Arc::clone(&store_scratchpad),
Arc::clone(&store_output),
);
let pad = gen.pad();
let f = file_path(1);
store_input
.put(&f.object_store_path(), Default::default())
.await
.unwrap();
pad.load_to_scratchpad(&[f]).await;
let capture = TracingCapture::new();
drop(pad);
// warning emitted
assert_eq!(
capture.to_string(),
"level = WARN; message = scratchpad context not cleaned, may leak resources; "
);
// eventually cleaned up
tokio::time::timeout(Duration::from_secs(5), async {
loop {
if list_object_store(&store_scratchpad).await.is_empty() {
return;
}
tokio::time::sleep(Duration::from_millis(10)).await;
}
})
.await
.expect("no timeout");
}
#[tokio::test]
#[should_panic(expected = "foo")]
async fn test_clean_does_not_crash_on_panic() {
let (store_input, store_scratchpad, store_output) = stores();
let gen = ProdScratchpadGen::new(
false,
NonZeroUsize::new(1).unwrap(),
BackoffConfig::default(),
Arc::clone(&store_input),
Arc::clone(&store_scratchpad),
Arc::clone(&store_output),
);
let pad = gen.pad();
let f = file_path(1);
store_input
.put(&f.object_store_path(), Default::default())
.await
.unwrap();
pad.load_to_scratchpad(&[f]).await;
panic!("foo");
}
}


@ -1,43 +0,0 @@
use std::{collections::HashSet, sync::Arc};
use data_types::{NamespaceId, PartitionId, TableId, TransitionPartitionId};
use object_store::{memory::InMemory, DynObjectStore};
use parquet_file::ParquetFilePath;
use uuid::Uuid;
use compactor_test_utils::list_object_store;
pub fn stores() -> (
Arc<DynObjectStore>,
Arc<DynObjectStore>,
Arc<DynObjectStore>,
) {
(
Arc::new(InMemory::new()),
Arc::new(InMemory::new()),
Arc::new(InMemory::new()),
)
}
pub fn file_path(i: u128) -> ParquetFilePath {
ParquetFilePath::new(
NamespaceId::new(1),
TableId::new(1),
&TransitionPartitionId::Deprecated(PartitionId::new(1)),
Uuid::from_u128(i),
)
}
pub async fn assert_content<const N: usize>(
store: &Arc<DynObjectStore>,
files: [&ParquetFilePath; N],
) {
let expected = files
.iter()
.map(|f| f.object_store_path())
.collect::<HashSet<_>>();
assert_eq!(expected.len(), N, "duplicate files in expected clause");
let actual = list_object_store(store).await;
assert_eq!(actual, expected);
}


@ -1,63 +0,0 @@
use std::{num::NonZeroUsize, sync::Arc};
use backoff::{Backoff, BackoffConfig};
use futures::StreamExt;
use object_store::DynObjectStore;
use parquet_file::ParquetFilePath;
pub async fn copy_files(
files_in: &[ParquetFilePath],
files_out: &[ParquetFilePath],
from: Arc<DynObjectStore>,
to: Arc<DynObjectStore>,
backoff_config: &BackoffConfig,
concurrency: NonZeroUsize,
) {
futures::stream::iter(files_in.iter().cloned().zip(files_out.to_vec()))
.map(|(f_in, f_out)| {
let backoff_config = backoff_config.clone();
let from = Arc::clone(&from);
let to = Arc::clone(&to);
let path_in = f_in.object_store_path();
let path_out = f_out.object_store_path();
async move {
Backoff::new(&backoff_config)
.retry_all_errors("copy file", || async {
let bytes = from.get(&path_in).await?.bytes().await?;
to.put(&path_out, bytes).await?;
Ok::<_, object_store::Error>(())
})
.await
.expect("retry forever")
}
})
.buffer_unordered(concurrency.get())
.collect::<()>()
.await;
}
pub async fn delete_files(
files: &[ParquetFilePath],
store: Arc<DynObjectStore>,
backoff_config: &BackoffConfig,
concurrency: NonZeroUsize,
) {
// Note: `files.to_vec()` is required to avoid rustc freaking out about lifetimes
futures::stream::iter(files.to_vec())
.map(|f| {
let backoff_config = backoff_config.clone();
let store = Arc::clone(&store);
let path = f.object_store_path();
async move {
Backoff::new(&backoff_config)
.retry_all_errors("delete file", || async { store.delete(&path).await })
.await
.expect("retry forever")
}
})
.buffer_unordered(concurrency.get())
.collect::<()>()
.await;
}
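// A usage sketch (hypothetical caller, assuming a store and paths are in
// hand): both helpers retry each object forever via Backoff and bound the
// number of in-flight operations with buffer_unordered, so one slow object
// stalls a single slot rather than the whole batch.
#[allow(dead_code)]
async fn delete_batch_sketch(files: Vec<ParquetFilePath>, store: Arc<DynObjectStore>) {
    delete_files(
        &files,
        store,
        &BackoffConfig::default(),
        NonZeroUsize::new(8).unwrap(), // e.g. at most 8 concurrent deletes
    )
    .await;
}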

File diff suppressed because it is too large


@ -1,519 +0,0 @@
use data_types::{ParquetFile, TimestampMinMax, TransitionPartitionId};
use crate::{components::ir_planner::planner_v1::V1IRPlanner, file_classification::FileToSplit};
// max number of files in a minimum possible compacting set
const MAX_FILE_NUM: usize = 2;
// percentage of soft limit of max desired file size allowed to be exceeded when splitting
pub const PERCENTAGE_OF_SOFT_EXCEEDED: f64 = 0.1;
/// Return `[files_to_split]` and `[files_not_to_split]` of the given files.
/// files_to_split are the files that are larger than max_desired_file_size.
/// files_not_to_split are the files that are smaller than max_desired_file_size.
pub fn compute_split_times_for_large_files(
files: Vec<ParquetFile>,
max_desired_file_size: u64,
max_compact_size: usize,
partition: TransitionPartitionId,
) -> (Vec<FileToSplit>, Vec<ParquetFile>) {
// Sanity checks
// There must be at least one file and at most MAX_FILE_NUM files
assert!(
files.len() <= MAX_FILE_NUM && !files.is_empty(),
"there must be at least one file and at most {MAX_FILE_NUM} files, instead found {} files, partition_id={}",
files.len(),
partition,
);
// max compact size must be at least MAX_FILE_NUM times larger than max desired file size to ensure the split works
assert!(
max_compact_size >= MAX_FILE_NUM * max_desired_file_size as usize,
"max_compact_size {max_compact_size} must be at least {MAX_FILE_NUM} times larger than max_desired_file_size {max_desired_file_size}, partition_id={partition}",
);
// Total size of files must be larger than max_compact_size
let total_size: i64 = files.iter().map(|f| f.file_size_bytes).sum();
assert!(
total_size as usize > max_compact_size,
"total size of files {total_size} must be larger than max_compact_size {max_compact_size}, partition_id={partition}",
);
// Split files over max_desired_file_size into multiple files, each softly around max_desired_file_size
let mut files_to_split = Vec::with_capacity(files.len());
let mut files_not_to_split = Vec::with_capacity(files.len());
for file in files.into_iter() {
let file_size = file.file_size_bytes as u64;
let min_time = file.min_time.get();
let max_time = file.max_time.get();
// TODO: it would be nice to check if these files overlap (e.g. if they're multiple levels), and
// coordinate the split time across all the files, rather than deciding the split time for each file
// as if it's the only file under consideration.
// only split files that are larger than max_desired_file_size and span at least 2 ns
let max_file_size =
(max_desired_file_size as f64 * (1.0 + PERCENTAGE_OF_SOFT_EXCEEDED)) as u64;
if file_size > max_file_size && file.min_time < file.max_time {
if file.min_time < file.max_time - 1 {
// The time range of the file is big enough we have choices for split time(s), so compute them.
let file_times = vec![TimestampMinMax {
min: min_time,
max: max_time,
}];
let split_times = V1IRPlanner::compute_split_time(
file_times,
min_time,
max_time,
file_size,
max_desired_file_size,
);
files_to_split.push(FileToSplit { file, split_times });
} else {
// The file covers 2ns. There's nothing to compute; split it at the only place possible.
// When splitting, split time is the last ns included in the 'left' file on the split.
// So setting `min` as the split time means `min` goes to the left, and `max` goes to the right.
let split_times = vec![file.min_time.get()];
files_to_split.push(FileToSplit { file, split_times });
}
} else {
files_not_to_split.push(file);
}
}
(files_to_split, files_not_to_split)
}
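// A simplified sketch of the split-time arithmetic, assuming splits aim for
// roughly max_desired_file_size per piece (the real logic in
// V1IRPlanner::compute_split_time also weighs the data distribution): a 300b
// file over [400, 620] with a 100b target needs 3 pieces, hence 2 split times.
#[cfg(test)]
#[test]
fn even_split_sketch() {
    fn even_split_times(min: i64, max: i64, file_size: u64, max_desired: u64) -> Vec<i64> {
        // ceil(file_size / max_desired) pieces, split times evenly spaced in time
        let pieces = ((file_size + max_desired - 1) / max_desired).max(1) as i64;
        (1..pieces).map(|i| min + (max - min) * i / pieces).collect()
    }
    assert_eq!(even_split_times(400, 620, 300, 100), vec![473, 546]);
}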
#[cfg(test)]
mod tests {
use std::sync::Arc;
use compactor_test_utils::{
create_fake_partition_id, create_overlapped_l0_l1_files_3,
create_overlapped_two_overlapped_files, format_files, format_files_split, TestTimes,
};
use data_types::CompactionLevel;
use iox_tests::ParquetFileBuilder;
use iox_time::{MockProvider, Time};
use crate::components::split_or_compact::large_files_to_split::compute_split_times_for_large_files;
const FILE_SIZE: i64 = 100;
// empty input
#[test]
#[should_panic(
expected = "there must be at least one file and at most 2 files, instead found 0 files, partition_id=0"
)]
fn test_empty_input() {
let (_files_to_split, _files_not_to_split) = compute_split_times_for_large_files(
vec![],
(FILE_SIZE + 1) as u64,
((FILE_SIZE + 1) * 3) as usize,
create_fake_partition_id(),
);
}
// more than 2 files
#[test]
#[should_panic(
expected = "there must be at least one file and at most 2 files, instead found 5 files, partition_id=0"
)]
fn test_too_many_files() {
let files = create_overlapped_l0_l1_files_3(FILE_SIZE);
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0, all files 100b "
- "L0.2[650,750] 180s |------L0.2------| "
- "L0.1[450,550] 120s |------L0.1------| "
- "L0.3[800,900] 300s |------L0.3------|"
- "L1, all files 100b "
- "L1.13[600,700] 60s |-----L1.13------| "
- "L1.12[400,500] 60s |-----L1.12------| "
"###
);
let (_files_to_split, _files_not_to_split) = compute_split_times_for_large_files(
files,
(FILE_SIZE + 1) as u64,
((FILE_SIZE + 1) * 3) as usize,
create_fake_partition_id(),
);
}
// invalid max compact size
#[test]
#[should_panic(
expected = "max_compact_size 111 must be at least 2 times larger than max_desired_file_size 101"
)]
fn test_invalid_max_compact_size() {
let files = create_overlapped_two_overlapped_files(FILE_SIZE);
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0, all files 100b "
- "L0.1[450,620] 120s |-------------------------------L0.1--------------------------------| "
- "L1, all files 100b "
- "L1.11[400,500] 60s |----------------L1.11-----------------| "
"###
);
let (_files_to_split, _files_not_to_split) = compute_split_times_for_large_files(
files,
(FILE_SIZE + 1) as u64,
((FILE_SIZE + 1) + 10) as usize,
create_fake_partition_id(),
);
}
// invalid total size
#[test]
#[should_panic(
expected = "total size of files 200 must be larger than max_compact_size 300, partition_id=0"
)]
fn test_invalid_total_size() {
let files = create_overlapped_two_overlapped_files(FILE_SIZE);
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0, all files 100b "
- "L0.1[450,620] 120s |-------------------------------L0.1--------------------------------| "
- "L1, all files 100b "
- "L1.11[400,500] 60s |----------------L1.11-----------------| "
"###
);
let (_files_to_split, _files_not_to_split) = compute_split_times_for_large_files(
files,
FILE_SIZE as u64,
(FILE_SIZE * 3) as usize,
create_fake_partition_id(),
);
}
// split both large files
#[test]
fn test_split_both_large_files() {
let file_size = FILE_SIZE;
let max_desired_file_size = (FILE_SIZE / 4) as u64;
let max_compact_size = (max_desired_file_size * 3) as usize;
let files = create_overlapped_two_overlapped_files(file_size);
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0, all files 100b "
- "L0.1[450,620] 120s |-------------------------------L0.1--------------------------------| "
- "L1, all files 100b "
- "L1.11[400,500] 60s |----------------L1.11-----------------| "
"###
);
let (files_to_split, files_not_to_split) = compute_split_times_for_large_files(
files,
max_desired_file_size,
max_compact_size,
create_fake_partition_id(),
);
// See layout of 2 set of files
let files_to_split = files_to_split
.into_iter()
.map(|f| f.file)
.collect::<Vec<_>>();
insta::assert_yaml_snapshot!(
format_files_split("files to split", &files_to_split , "files not to split:", &files_not_to_split),
@r###"
---
- files to split
- "L0, all files 100b "
- "L0.1[450,620] 120s |-------------------------------L0.1--------------------------------| "
- "L1, all files 100b "
- "L1.11[400,500] 60s |----------------L1.11-----------------| "
- "files not to split:"
"###
);
}
// split only the large file start level file
#[test]
fn test_split_large_start_level() {
let time_provider = Arc::new(MockProvider::new(Time::from_timestamp(0, 0).unwrap()));
let time = TestTimes::new(&time_provider);
let large_size = FILE_SIZE * 3;
let small_size = FILE_SIZE / 2;
let max_desired_file_size = FILE_SIZE as u64;
let max_compact_size = (max_desired_file_size * 3) as usize;
let l1_1 = ParquetFileBuilder::new(11)
.with_compaction_level(CompactionLevel::FileNonOverlapped)
.with_time_range(400, 500)
.with_file_size_bytes(small_size)
.with_max_l0_created_at(time.time_1_minute_future)
.build();
// L0_1 overlaps with L1_1
let l0_1 = ParquetFileBuilder::new(1)
.with_compaction_level(CompactionLevel::Initial)
.with_time_range(450, 620)
.with_file_size_bytes(large_size)
.with_max_l0_created_at(time.time_2_minutes_future)
.build();
let files = vec![l1_1, l0_1];
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0 "
- "L0.1[450,620] 120s 300b |-------------------------------L0.1--------------------------------| "
- "L1 "
- "L1.11[400,500] 60s 50b |----------------L1.11-----------------| "
"###
);
let (files_to_split, files_not_to_split) = compute_split_times_for_large_files(
files,
max_desired_file_size,
max_compact_size,
create_fake_partition_id(),
);
// The split files should be L0_1 with 2 split times to split the file into 3 smaller files
assert_eq!(files_to_split.len(), 1);
assert_eq!(files_to_split[0].split_times.len(), 2);
// See layout of 2 set of files
let files_to_split = files_to_split
.into_iter()
.map(|f| f.file)
.collect::<Vec<_>>();
insta::assert_yaml_snapshot!(
format_files_split("files to split", &files_to_split , "files not to split:", &files_not_to_split),
@r###"
---
- files to split
- "L0, all files 300b "
- "L0.1[450,620] 120s |------------------------------------------L0.1------------------------------------------|"
- "files not to split:"
- "L1, all files 50b "
- "L1.11[400,500] 60s |-----------------------------------------L1.11------------------------------------------|"
"###
);
}
// split only the large file target level file
#[test]
fn test_split_large_target_level() {
let time_provider = Arc::new(MockProvider::new(Time::from_timestamp(0, 0).unwrap()));
let time = TestTimes::new(&time_provider);
let large_size = FILE_SIZE * 3;
let small_size = FILE_SIZE / 2;
let max_desired_file_size = FILE_SIZE as u64;
let max_compact_size = (max_desired_file_size * 3) as usize;
let l1_1 = ParquetFileBuilder::new(11)
.with_compaction_level(CompactionLevel::FileNonOverlapped)
.with_time_range(400, 500)
.with_file_size_bytes(large_size)
.with_max_l0_created_at(time.time_1_minute_future)
.build();
// L0_1 overlaps with L1_1
let l0_1 = ParquetFileBuilder::new(1)
.with_compaction_level(CompactionLevel::Initial)
.with_time_range(450, 620)
.with_file_size_bytes(small_size)
.with_max_l0_created_at(time.time_2_minutes_future)
.build();
let files = vec![l1_1, l0_1];
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0 "
- "L0.1[450,620] 120s 50b |-------------------------------L0.1--------------------------------| "
- "L1 "
- "L1.11[400,500] 60s 300b |----------------L1.11-----------------| "
"###
);
let (files_to_split, files_not_to_split) = compute_split_times_for_large_files(
files,
max_desired_file_size,
max_compact_size,
create_fake_partition_id(),
);
// The split files should be L1_1 with 2 split times to split the file into 3 smaller files
assert_eq!(files_to_split.len(), 1);
assert_eq!(files_to_split[0].split_times.len(), 2);
// See layout of 2 set of files
let files_to_split = files_to_split
.into_iter()
.map(|f| f.file)
.collect::<Vec<_>>();
insta::assert_yaml_snapshot!(
format_files_split("files to split", &files_to_split , "files not to split:", &files_not_to_split),
@r###"
---
- files to split
- "L1, all files 300b "
- "L1.11[400,500] 60s |-----------------------------------------L1.11------------------------------------------|"
- "files not to split:"
- "L0, all files 50b "
- "L0.1[450,620] 120s |------------------------------------------L0.1------------------------------------------|"
"###
);
}
// tiny time-range on one file
#[test]
fn test_one_file_with_tiny_time_range() {
let time_provider = Arc::new(MockProvider::new(Time::from_timestamp(0, 0).unwrap()));
let time = TestTimes::new(&time_provider);
let large_size = FILE_SIZE * 3;
let max_desired_file_size = FILE_SIZE as u64;
let max_compact_size = (max_desired_file_size * 3) as usize;
let l1_1 = ParquetFileBuilder::new(11)
.with_compaction_level(CompactionLevel::FileNonOverlapped)
.with_time_range(400, 401)
.with_file_size_bytes(large_size)
.with_max_l0_created_at(time.time_1_minute_future)
.build();
// L0_1 overlaps with L1_1
let l0_1 = ParquetFileBuilder::new(1)
.with_compaction_level(CompactionLevel::Initial)
.with_time_range(400, 620)
.with_file_size_bytes(large_size)
.with_max_l0_created_at(time.time_2_minutes_future)
.build();
let files = vec![l1_1, l0_1];
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0, all files 300b "
- "L0.1[400,620] 120s |-----------------------------------------L0.1------------------------------------------| "
- "L1, all files 300b "
- "L1.11[400,401] 60s |L1.11| "
"###
);
let (files_to_split, files_not_to_split) = compute_split_times_for_large_files(
files,
max_desired_file_size,
max_compact_size,
create_fake_partition_id(),
);
// The files to split should be L1_1, split at the only time possible (since it's a 2ns file, there is only
// one choice), and L0_1 with 2 split times to split that file into 3 smaller files
assert_eq!(files_to_split.len(), 2);
assert_eq!(files_to_split[0].split_times.len(), 1);
assert_eq!(files_to_split[1].split_times.len(), 2);
// See layout of 2 set of files
let files_to_split = files_to_split
.into_iter()
.map(|f| f.file)
.collect::<Vec<_>>();
insta::assert_yaml_snapshot!(
format_files_split("files to split", &files_to_split , "files not to split:", &files_not_to_split),
@r###"
---
- files to split
- "L0, all files 300b "
- "L0.1[400,620] 120s |-----------------------------------------L0.1------------------------------------------| "
- "L1, all files 300b "
- "L1.11[400,401] 60s |L1.11| "
- "files not to split:"
"###
);
}
// tiny time-range on both files
#[test]
fn test_two_files_with_tiny_time_range() {
let time_provider = Arc::new(MockProvider::new(Time::from_timestamp(0, 0).unwrap()));
let time = TestTimes::new(&time_provider);
let large_size = FILE_SIZE * 3;
let max_desired_file_size = FILE_SIZE as u64;
let max_compact_size = (max_desired_file_size * 3) as usize;
let l1_1 = ParquetFileBuilder::new(11)
.with_compaction_level(CompactionLevel::FileNonOverlapped)
.with_time_range(400, 401)
.with_file_size_bytes(large_size)
.with_max_l0_created_at(time.time_1_minute_future)
.build();
// L0_1 overlaps with L1_1
let l0_1 = ParquetFileBuilder::new(1)
.with_compaction_level(CompactionLevel::Initial)
.with_time_range(400, 400)
.with_file_size_bytes(large_size)
.with_max_l0_created_at(time.time_2_minutes_future)
.build();
let files = vec![l1_1, l0_1];
insta::assert_yaml_snapshot!(
format_files("initial", &files),
@r###"
---
- initial
- "L0, all files 300b "
- "L0.1[400,400] 120s |L0.1| "
- "L1, all files 300b "
- "L1.11[400,401] 60s |-----------------------------------------L1.11------------------------------------------|"
"###
);
let (files_to_split, files_not_to_split) = compute_split_times_for_large_files(
files,
max_desired_file_size,
max_compact_size,
create_fake_partition_id(),
);
// The files to split should be L1_11, split at the only time possible (since it's a 2ns file, there is only one choice)
assert_eq!(files_to_split.len(), 1);
assert_eq!(files_to_split[0].split_times.len(), 1);
assert_eq!(files_to_split[0].split_times[0], 400);
// See layout of 2 set of files
let files_to_split = files_to_split
.into_iter()
.map(|f| f.file)
.collect::<Vec<_>>();
insta::assert_yaml_snapshot!(
format_files_split("files to split", &files_to_split , "files not to split:", &files_not_to_split),
@r###"
---
- files to split
- "L1, all files 300b "
- "L1.11[400,401] 60s |-----------------------------------------L1.11------------------------------------------|"
- "files not to split:"
- "L0, all files 300b "
- "L0.1[400,400] 120s |------------------------------------------L0.1------------------------------------------|"
"###
);
}
}


@ -1,60 +0,0 @@
use std::fmt::Display;
use data_types::{CompactionLevel, ParquetFile};
use observability_deps::tracing::info;
use crate::{file_classification::FilesToSplitOrCompact, partition_info::PartitionInfo};
use super::SplitOrCompact;
#[derive(Debug)]
pub struct LoggingSplitOrCompactWrapper<T>
where
T: SplitOrCompact,
{
inner: T,
}
impl<T> LoggingSplitOrCompactWrapper<T>
where
T: SplitOrCompact,
{
pub fn new(inner: T) -> Self {
Self { inner }
}
}
impl<T> Display for LoggingSplitOrCompactWrapper<T>
where
T: SplitOrCompact,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "display({})", self.inner)
}
}
impl<T> SplitOrCompact for LoggingSplitOrCompactWrapper<T>
where
T: SplitOrCompact,
{
fn apply(
&self,
partition_info: &PartitionInfo,
files: Vec<ParquetFile>,
target_level: CompactionLevel,
) -> (FilesToSplitOrCompact, Vec<ParquetFile>) {
let (files_to_split_or_compact, files_to_keep) =
self.inner.apply(partition_info, files, target_level);
info!(
partition_id = partition_info.partition_id.get(),
target_level = %target_level,
files_to_compact = files_to_split_or_compact.num_files_to_compact(),
files_to_split = files_to_split_or_compact.num_files_to_split(),
files_to_keep = files_to_keep.len(),
"split or compact"
);
(files_to_split_or_compact, files_to_keep)
}
}
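// A wiring sketch (test-style; the SplitCompact arguments mirror the
// metrics-wrapper tests elsewhere in this module tree, and their exact
// meanings are assumptions here): the wrapper decorates a concrete
// SplitOrCompact and logs each decision without changing the result.
#[cfg(test)]
mod logging_wrapper_sketch {
    use super::LoggingSplitOrCompactWrapper;
    use crate::components::split_or_compact::split_compact::SplitCompact;

    #[test]
    fn wraps_inner_component() {
        let wrapper = LoggingSplitOrCompactWrapper::new(SplitCompact::new(100, 100, 100));
        // Display delegates to the inner component, wrapped in "display(...)".
        assert!(wrapper.to_string().starts_with("display("));
    }
}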


@ -1,212 +0,0 @@
use std::fmt::Display;
use data_types::{CompactionLevel, ParquetFile};
use metric::{Registry, U64Counter, U64Histogram, U64HistogramOptions};
use super::SplitOrCompact;
use crate::{file_classification::FilesToSplitOrCompact, partition_info::PartitionInfo};
const METRIC_NAME_FILES_TO_SPLIT: &str = "iox_compactor_files_to_split";
const METRIC_NAME_SPLIT_DECISION_COUNT: &str = "iox_compactor_split_decision";
const METRIC_NAME_COMPACT_DECISION_COUNT: &str = "iox_compactor_compact_decision";
#[derive(Debug)]
pub struct MetricsSplitOrCompactWrapper<T>
where
T: SplitOrCompact,
{
files_to_split: U64Histogram,
split_decision_count: U64Counter,
compact_decision_count: U64Counter,
inner: T,
}
impl<T> MetricsSplitOrCompactWrapper<T>
where
T: SplitOrCompact,
{
pub fn new(inner: T, registry: &Registry) -> Self {
let files_to_split = registry
.register_metric_with_options::<U64Histogram, _>(
METRIC_NAME_FILES_TO_SPLIT,
"Number of files needing to be split to minimize overlap",
|| U64HistogramOptions::new([1, 10, 100, 1_000, 10_000, u64::MAX]),
)
.recorder(&[]);
let split_decision_count = registry
.register_metric::<U64Counter>(
METRIC_NAME_SPLIT_DECISION_COUNT,
"Number of times the compactor decided to split files",
)
.recorder(&[]);
let compact_decision_count = registry
.register_metric::<U64Counter>(
METRIC_NAME_COMPACT_DECISION_COUNT,
"Number of times the compactor decided to compact files",
)
.recorder(&[]);
Self {
files_to_split,
split_decision_count,
compact_decision_count,
inner,
}
}
}
impl<T> SplitOrCompact for MetricsSplitOrCompactWrapper<T>
where
T: SplitOrCompact,
{
fn apply(
&self,
partition_info: &PartitionInfo,
files: Vec<ParquetFile>,
target_level: CompactionLevel,
) -> (FilesToSplitOrCompact, Vec<ParquetFile>) {
let (files_to_split_or_compact, files_to_keep) =
self.inner.apply(partition_info, files, target_level);
match &files_to_split_or_compact {
FilesToSplitOrCompact::Split(..) => {
self.files_to_split
.record(files_to_split_or_compact.num_files_to_split() as u64);
self.split_decision_count.inc(1);
}
FilesToSplitOrCompact::Compact(..) => self.compact_decision_count.inc(1),
_ => {} // Nothing to do
}
(files_to_split_or_compact, files_to_keep)
}
}
impl<T> Display for MetricsSplitOrCompactWrapper<T>
where
T: SplitOrCompact,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "metrics({})", self.inner)
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::sync::Arc;
use compactor_test_utils::{create_overlapped_l0_l1_files_2, create_overlapped_l1_l2_files_2};
use data_types::CompactionLevel;
use metric::{assert_counter, assert_histogram};
use crate::{
components::split_or_compact::{split_compact::SplitCompact, SplitOrCompact},
test_utils::PartitionInfoBuilder,
};
const MAX_FILE: usize = 100;
#[test]
fn empty_records_nothing() {
let registry = Registry::new();
let files = vec![];
let p_info = Arc::new(PartitionInfoBuilder::new().build());
let split_compact = MetricsSplitOrCompactWrapper::new(
SplitCompact::new(MAX_FILE, MAX_FILE, MAX_FILE as u64),
&registry,
);
let (_files_to_split_or_compact, _files_to_keep) =
split_compact.apply(&p_info, files, CompactionLevel::Initial);
assert_histogram!(
registry,
U64Histogram,
METRIC_NAME_FILES_TO_SPLIT,
samples = 0,
);
assert_counter!(
registry,
U64Counter,
METRIC_NAME_SPLIT_DECISION_COUNT,
value = 0,
);
assert_counter!(
registry,
U64Counter,
METRIC_NAME_COMPACT_DECISION_COUNT,
value = 0,
);
}
#[test]
fn files_to_split_get_recorded() {
let registry = Registry::new();
let files = create_overlapped_l0_l1_files_2(MAX_FILE as i64);
let p_info = Arc::new(PartitionInfoBuilder::new().build());
let split_compact = MetricsSplitOrCompactWrapper::new(
SplitCompact::new(MAX_FILE, MAX_FILE, MAX_FILE as u64),
&registry,
);
let (_files_to_split_or_compact, _files_to_keep) =
split_compact.apply(&p_info, files, CompactionLevel::FileNonOverlapped);
assert_histogram!(
registry,
U64Histogram,
METRIC_NAME_FILES_TO_SPLIT,
samples = 1,
sum = 1,
);
assert_counter!(
registry,
U64Counter,
METRIC_NAME_SPLIT_DECISION_COUNT,
value = 1,
);
assert_counter!(
registry,
U64Counter,
METRIC_NAME_COMPACT_DECISION_COUNT,
value = 0,
);
}
#[test]
fn files_to_compact_get_recorded() {
let registry = Registry::new();
let files = create_overlapped_l1_l2_files_2(MAX_FILE as i64);
let p_info = Arc::new(PartitionInfoBuilder::new().build());
let split_compact = MetricsSplitOrCompactWrapper::new(
SplitCompact::new(MAX_FILE, MAX_FILE * 3, MAX_FILE as u64),
&registry,
);
let (_files_to_split_or_compact, _files_to_keep) =
split_compact.apply(&p_info, files, CompactionLevel::Final);
assert_histogram!(
registry,
U64Histogram,
METRIC_NAME_FILES_TO_SPLIT,
samples = 0,
);
assert_counter!(
registry,
U64Counter,
METRIC_NAME_SPLIT_DECISION_COUNT,
value = 0,
);
assert_counter!(
registry,
U64Counter,
METRIC_NAME_COMPACT_DECISION_COUNT,
value = 1,
);
}
}

Some files were not shown because too many files have changed in this diff