docs: Clarify comments on sort order of input/output of filtering

pull/24376/head
Carol (Nichols || Goulding) 2022-09-16 13:18:44 -04:00
parent cde0a94fd5
commit ca4c5d65e7
No known key found for this signature in database
GPG Key ID: E907EE5A736F87D4
1 changed files with 8 additions and 9 deletions

View File

@@ -28,19 +28,19 @@ pub(crate) enum FilterResult {
},
}
/// Given a list of level N files sorted by max sequence number and a list of level N + 1 files for
/// a partition, select a subset of files that:
/// Given a list of sorted level N files and a list of level N + 1 files for a partition, select a
/// subset of files that:
///
/// - Has a subset of the level N files selected, from the start of the sorted level N list
/// - Has a total size less than `max_bytes`
/// - Has only level N + 1 files that overlap in time with the level N files
///
/// The returned files will be ordered with the level N + 1 files first, then the level N files
/// ordered in ascending order by their max sequence number.
/// in the same order as they were in the input.
pub(crate) fn filter_parquet_files(
// partition of the parquet files
partition: Arc<PartitionCompactionCandidateWithInfo>,
// Level N files sorted by max sequence number
// Level N files, sorted
level_n: Vec<CompactorParquetFile>,
// Level N + 1 files
level_n_plus_1: Vec<CompactorParquetFile>,
@@ -67,7 +67,7 @@ pub(crate) fn filter_parquet_files(
}
fn filter_parquet_files_inner(
// Level N files sorted by max sequence number
// Level N files, sorted
level_n: Vec<CompactorParquetFile>,
// Level N + 1 files
mut remaining_level_n_plus_1: Vec<CompactorParquetFile>,
@@ -95,8 +95,7 @@ fn filter_parquet_files_inner(
let num_level_n_plus_1_considering = remaining_level_n_plus_1.len();
// This will start by holding the level N + 1 files that are found to overlap an included level
// N file. At the end of this function, the level N files are added to the end so they are
// sorted last for deduplication purposes.
// N file. At the end of this function, the level N files are added to the end.
let mut files_to_return = Vec::with_capacity(level_n.len() + remaining_level_n_plus_1.len());
// Estimated memory bytes needed to compact returned LN+1 files
let mut ln_plus_1_estimated_budget =
@@ -199,8 +198,8 @@ fn filter_parquet_files_inner(
ln_plus_1_estimated_budget,
);
// Return the level N+1 files first, followed by the level N files, assuming we've maintained
// their ordering by max sequence number.
// Return the level N+1 files first, followed by the level N files. The order is arbitrary;
// ordering for deduplication happens using `QueryChunk.order`.
files_to_return.extend(level_n_to_return);
FilterResult::Proceed {