Merge branch 'main' into idpe-17789/provide-job-on-commit

pull/24376/head
wiedld 2023-07-31 08:20:45 -07:00 committed by GitHub
commit cc70a2c38b
85 changed files with 3291 additions and 1229 deletions

Cargo.lock generated

@ -699,7 +699,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6798148dccfbff0fae41c7574d2fa8f1ef3492fba0face179de5d8d447d67b05"
dependencies = [
"memchr",
"regex-automata 0.3.3",
"regex-automata 0.3.4",
"serde",
]
@ -763,6 +763,7 @@ dependencies = [
"pdatastructs",
"proptest",
"rand",
"test_helpers",
"tokio",
"tokio-util",
"trace",
@ -2662,6 +2663,7 @@ dependencies = [
"flatbuffers",
"futures",
"generated_types",
"gossip",
"hashbrown 0.14.0",
"influxdb_iox_client",
"ingester_query_grpc",
@ -3107,6 +3109,7 @@ dependencies = [
"authz",
"clap_blocks",
"data_types",
"gossip",
"hashbrown 0.14.0",
"hyper",
"iox_catalog",
@ -4575,7 +4578,7 @@ checksum = "b2eae68fc220f7cf2532e4494aded17545fce192d59cd996e0fe7887f4ceb575"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata 0.3.3",
"regex-automata 0.3.4",
"regex-syntax 0.7.4",
]
@ -4590,9 +4593,9 @@ dependencies = [
[[package]]
name = "regex-automata"
version = "0.3.3"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39354c10dd07468c2e73926b23bb9c2caca74c5501e38a35da70406f1d923310"
checksum = "b7b6d6190b7594385f61bd3911cd1be99dfddcfc365a4160cc2ab5bff4aed294"
dependencies = [
"aho-corasick",
"memchr",
@ -4693,6 +4696,7 @@ dependencies = [
"flate2",
"futures",
"generated_types",
"gossip",
"hashbrown 0.14.0",
"hyper",
"influxdb-line-protocol",
@ -4906,18 +4910,18 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
[[package]]
name = "serde"
version = "1.0.177"
version = "1.0.179"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "63ba2516aa6bf82e0b19ca8b50019d52df58455d3cf9bdaf6315225fdd0c560a"
checksum = "0a5bf42b8d227d4abf38a1ddb08602e229108a517cd4e5bb28f9c7eaafdce5c0"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.177"
version = "1.0.179"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "401797fe7833d72109fedec6bfcbe67c0eed9b99772f26eb8afd261f0abc6fd3"
checksum = "741e124f5485c7e60c03b043f79f320bff3527f4bbf12cf3831750dc46a0ec2c"
dependencies = [
"proc-macro2",
"quote",
@ -6265,6 +6269,7 @@ dependencies = [
"pin-project",
"sysinfo",
"tempfile",
"test_helpers",
"tokio",
"tokio-util",
"trace",
@ -6879,7 +6884,7 @@ dependencies = [
"rand",
"rand_core",
"regex",
"regex-automata 0.3.3",
"regex-automata 0.3.4",
"regex-syntax 0.7.4",
"reqwest",
"ring",


@ -23,6 +23,7 @@ workspace-hack = { version = "0.1", path = "../workspace-hack" }
[dev-dependencies]
criterion = { version = "0.5", default-features = false, features = ["rayon"]}
proptest = { version = "1", default_features = false, features = ["std"] }
test_helpers = { path = "../test_helpers" }
[lib]
# Allow --save-baseline to work


@ -5,6 +5,7 @@ use std::{collections::HashMap, sync::Arc, time::Duration};
use iox_time::{MockProvider, Time};
use parking_lot::Mutex;
use rand::rngs::mock::StepRng;
use test_helpers::maybe_start_logging;
use tokio::{runtime::Handle, sync::Notify};
use crate::{
@ -116,6 +117,7 @@ async fn test_refresh_does_not_update_lru_time() {
time_provider,
loader,
notify_idle,
pool,
..
} = TestStateLruAndRefresh::new();
@ -135,12 +137,14 @@ async fn test_refresh_does_not_update_lru_time() {
let barrier = loader.block_next(1, String::from("foo"));
backend.set(1, String::from("a"));
pool.wait_converged().await;
// trigger refresh
time_provider.inc(Duration::from_secs(1));
time_provider.inc(Duration::from_secs(1));
backend.set(2, String::from("b"));
pool.wait_converged().await;
time_provider.inc(Duration::from_secs(1));
@ -150,6 +154,7 @@ async fn test_refresh_does_not_update_lru_time() {
// add a third item to the cache, forcing LRU to evict one of the items
backend.set(3, String::from("c"));
pool.wait_converged().await;
// Should evict `1` even though it was refreshed after `2` was added
assert_eq!(backend.get(&1), None);
@ -192,6 +197,8 @@ async fn test_if_refresh_to_slow_then_expire() {
#[tokio::test]
async fn test_refresh_can_trigger_lru_eviction() {
maybe_start_logging();
let TestStateLRUAndRefresh {
mut backend,
refresh_duration_provider,
@ -224,13 +231,16 @@ async fn test_refresh_can_trigger_lru_eviction() {
backend.set(1, String::from("a"));
backend.set(2, String::from("c"));
backend.set(3, String::from("d"));
assert_eq!(backend.get(&1), Some(String::from("a")));
pool.wait_converged().await;
assert_eq!(backend.get(&2), Some(String::from("c")));
assert_eq!(backend.get(&3), Some(String::from("d")));
time_provider.inc(Duration::from_millis(1));
assert_eq!(backend.get(&1), Some(String::from("a")));
// refresh
time_provider.inc(Duration::from_secs(1));
time_provider.inc(Duration::from_secs(10));
notify_idle.notified_with_timeout().await;
pool.wait_converged().await;
// needed to evict 2->"c"
assert_eq!(backend.get(&1), Some(String::from("b")));
@ -285,6 +295,7 @@ async fn test_remove_if_check_does_not_extend_lifetime() {
size_estimator,
time_provider,
remove_if_handle,
pool,
..
} = TestStateLruAndRemoveIf::new().await;
@ -293,15 +304,18 @@ async fn test_remove_if_check_does_not_extend_lifetime() {
size_estimator.mock_size(3, String::from("c"), TestSize(4));
backend.set(1, String::from("a"));
pool.wait_converged().await;
time_provider.inc(Duration::from_secs(1));
backend.set(2, String::from("b"));
pool.wait_converged().await;
time_provider.inc(Duration::from_secs(1));
// Checking remove_if should not count as a "use" of 1
// for the "least recently used" calculation
remove_if_handle.remove_if(&1, |_| false);
backend.set(3, String::from("c"));
pool.wait_converged().await;
// adding "c" totals 12 size, but backend has room for only 10
// so "least recently used" (in this case 1, not 2) should be removed
@ -397,6 +411,7 @@ impl TestStateLRUAndRefresh {
"my_pool",
TestSize(10),
Arc::clone(&metric_registry),
&Handle::current(),
));
backend.add_policy(LruPolicy::new(
Arc::clone(&pool),
@ -442,6 +457,7 @@ impl TestStateTtlAndLRU {
"my_pool",
TestSize(10),
Arc::clone(&metric_registry),
&Handle::current(),
));
backend.add_policy(LruPolicy::new(
Arc::clone(&pool),
@ -465,6 +481,7 @@ struct TestStateLruAndRemoveIf {
time_provider: Arc<MockProvider>,
size_estimator: Arc<TestSizeEstimator>,
remove_if_handle: RemoveIfHandle<u8, String>,
pool: Arc<ResourcePool<TestSize>>,
}
impl TestStateLruAndRemoveIf {
@ -479,6 +496,7 @@ impl TestStateLruAndRemoveIf {
"my_pool",
TestSize(10),
Arc::clone(&metric_registry),
&Handle::current(),
));
backend.add_policy(LruPolicy::new(
Arc::clone(&pool),
@ -495,6 +513,7 @@ impl TestStateLruAndRemoveIf {
time_provider,
size_estimator,
remove_if_handle,
pool,
}
}
}
@ -507,6 +526,7 @@ struct TestStateLruAndRefresh {
time_provider: Arc<MockProvider>,
loader: Arc<TestLoader<u8, (), String>>,
notify_idle: Arc<Notify>,
pool: Arc<ResourcePool<TestSize>>,
}
impl TestStateLruAndRefresh {
@ -537,6 +557,7 @@ impl TestStateLruAndRefresh {
"my_pool",
TestSize(10),
Arc::clone(&metric_registry),
&Handle::current(),
));
backend.add_policy(LruPolicy::new(
Arc::clone(&pool),
@ -551,6 +572,7 @@ impl TestStateLruAndRefresh {
time_provider,
loader,
notify_idle,
pool,
}
}
}

File diff suppressed because it is too large


@ -393,7 +393,11 @@ where
/// structures while calling this function if you plan to also [subscribe](Subscriber) to
/// changes because this would easily lead to deadlocks.
pub fn execute_requests(&mut self, change_requests: Vec<ChangeRequest<'_, K, V>>) {
let inner = self.inner.upgrade().expect("backend gone");
let Some(inner) = self.inner.upgrade() else {
// backend gone, can happen during shutdowns, try not to panic
return;
};
lock_inner!(mut guard = inner);
perform_changes(&mut guard, change_requests);
}
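
For context, a minimal standalone sketch of the pattern this change adopts (all names below are hypothetical, not from this codebase): upgrading a `Weak` reference with `let ... else` turns a call that arrives after the backend has been dropped, e.g. during shutdown, into a silent no-op instead of a panic.

use std::sync::{Arc, Weak};

struct Backend;

struct Handle {
    inner: Weak<Backend>,
}

impl Handle {
    fn execute(&self) {
        // Backend already dropped (shutdown in progress): do nothing rather than panic.
        let Some(_inner) = self.inner.upgrade() else {
            return;
        };
        // ... apply change requests against `_inner` here ...
    }
}

fn main() {
    let backend = Arc::new(Backend);
    let handle = Handle { inner: Arc::downgrade(&backend) };
    handle.execute(); // backend alive: changes would be applied
    drop(backend);
    handle.execute(); // backend gone: no-op instead of a "backend gone" panic
}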


@ -9,7 +9,15 @@ use std::{
///
/// Can be used to represent in-RAM memory as well as on-disc memory.
pub trait Resource:
Add<Output = Self> + Copy + Debug + Into<u64> + PartialOrd + Send + Sub<Output = Self> + 'static
Add<Output = Self>
+ Copy
+ Debug
+ Into<u64>
+ PartialOrd
+ Send
+ Sync
+ Sub<Output = Self>
+ 'static
{
/// Create resource consumption of zero.
fn zero() -> Self;
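
As context for the added `Sync` bound (a sketch, not code from this change; `RamBytes` is a hypothetical type): requiring `Sync` lets resource values be shared between threads, which lines up with the pool now receiving a tokio runtime `Handle` in the tests above. The compile-time check below confirms a simple byte-counting newtype satisfies every listed supertrait.

use std::fmt::Debug;
use std::ops::{Add, Sub};

// Hypothetical byte-counting resource; `Send + Sync + 'static` hold automatically
// because it only contains a u64.
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
struct RamBytes(u64);

impl Add for RamBytes {
    type Output = Self;
    fn add(self, rhs: Self) -> Self {
        Self(self.0 + rhs.0)
    }
}

impl Sub for RamBytes {
    type Output = Self;
    fn sub(self, rhs: Self) -> Self {
        Self(self.0.saturating_sub(rhs.0))
    }
}

impl From<RamBytes> for u64 {
    fn from(v: RamBytes) -> u64 {
        v.0
    }
}

// Compile-time check against the supertrait bounds listed above, including the new `Sync`.
fn assert_resource_bounds<T>()
where
    T: Add<Output = T> + Copy + Debug + Into<u64> + PartialOrd + Send + Sync + Sub<Output = T> + 'static,
{
}

fn main() {
    assert_resource_bounds::<RamBytes>();
}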

clap_blocks/src/gossip.rs Normal file

@ -0,0 +1,49 @@
//! CLI config for cluster gossip communication.
use crate::socket_addr::SocketAddr;
/// Configuration parameters for the cluster gossip communication mechanism.
#[derive(Debug, Clone, clap::Parser)]
#[allow(missing_copy_implementations)]
pub struct GossipConfig {
/// A comma-delimited set of seed gossip peer addresses.
///
/// Example: "10.0.0.1:4242,10.0.0.2:4242"
///
/// These seeds will be used to discover all other peers that talk to the
/// same seeds. Typically all nodes in the cluster should use the same set
/// of seeds.
#[clap(
long = "gossip-seed-list",
env = "INFLUXDB_IOX_GOSSIP_SEED_LIST",
required = false,
num_args=1..,
value_delimiter = ',',
requires = "gossip_bind_address", // Field name, not flag
)]
pub seed_list: Vec<String>,
/// The UDP socket address IOx will use for gossip communication between
/// peers.
///
/// Example: "0.0.0.0:4242"
///
/// If not provided, the gossip sub-system is disabled.
#[clap(
long = "gossip-bind-address",
env = "INFLUXDB_IOX_GOSSIP_BIND_ADDR",
requires = "seed_list", // Field name, not flag
action
)]
pub gossip_bind_address: Option<SocketAddr>,
}
impl GossipConfig {
/// Initialise the gossip config to be disabled.
pub fn disabled() -> Self {
Self {
seed_list: vec![],
gossip_bind_address: None,
}
}
}
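
For a feel of how these mutually `requires`-linked flags behave, here is a hedged, standalone approximation (it uses `std::net::SocketAddr` in place of the crate's own `SocketAddr` wrapper, and `GossipishConfig` is a made-up name): both flags must be supplied together, and leaving out `--gossip-bind-address` keeps gossip disabled.

use std::net::SocketAddr;

use clap::Parser;

/// Stand-alone approximation of the gossip flags, for illustration only.
#[derive(Debug, Parser)]
struct GossipishConfig {
    /// Comma-delimited seed peers; requires the bind address to also be set.
    #[clap(
        long = "gossip-seed-list",
        required = false,
        num_args = 1..,
        value_delimiter = ',',
        requires = "gossip_bind_address", // Field name, not flag
    )]
    seed_list: Vec<String>,

    /// UDP bind address; if omitted, gossip stays disabled.
    #[clap(long = "gossip-bind-address", requires = "seed_list")]
    gossip_bind_address: Option<SocketAddr>,
}

fn main() {
    // Both flags together: accepted.
    let ok = GossipishConfig::try_parse_from([
        "prog",
        "--gossip-seed-list",
        "10.0.0.1:4242,10.0.0.2:4242",
        "--gossip-bind-address",
        "0.0.0.0:4242",
    ]);
    assert!(ok.is_ok());

    // Seed list without a bind address: rejected by the `requires` constraint.
    let err = GossipishConfig::try_parse_from(["prog", "--gossip-seed-list", "10.0.0.1:4242"]);
    assert!(err.is_err());
}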


@ -2,10 +2,16 @@
use std::path::PathBuf;
use crate::gossip::GossipConfig;
/// CLI config for the ingester using the RPC write path
#[derive(Debug, Clone, clap::Parser)]
#[allow(missing_copy_implementations)]
pub struct IngesterConfig {
/// Gossip config.
#[clap(flatten)]
pub gossip_config: GossipConfig,
/// Where this ingester instance should store its write-ahead log files. Each ingester instance
/// must have its own directory.
#[clap(long = "wal-directory", env = "INFLUXDB_IOX_WAL_DIRECTORY", action)]


@ -22,6 +22,7 @@ pub mod catalog_dsn;
pub mod compactor;
pub mod compactor_scheduler;
pub mod garbage_collector;
pub mod gossip;
pub mod ingester;
pub mod ingester_address;
pub mod object_store;


@ -1,6 +1,7 @@
//! CLI config for the router using the RPC write path
use crate::{
gossip::GossipConfig,
ingester_address::IngesterAddress,
single_tenant::{
CONFIG_AUTHZ_ENV_NAME, CONFIG_AUTHZ_FLAG, CONFIG_CST_ENV_NAME, CONFIG_CST_FLAG,
@ -15,6 +16,10 @@ use std::{
#[derive(Debug, Clone, clap::Parser)]
#[allow(missing_copy_implementations)]
pub struct RouterConfig {
/// Gossip config.
#[clap(flatten)]
pub gossip_config: GossipConfig,
/// Addr for connection to authz
#[clap(
long = CONFIG_AUTHZ_FLAG,


@ -171,7 +171,7 @@ fn to_queryable_parquet_chunk(
parquet_file_id = file.file.id.get(),
parquet_file_namespace_id = file.file.namespace_id.get(),
parquet_file_table_id = file.file.table_id.get(),
parquet_file_partition_id = file.file.partition_id.get(),
parquet_file_partition_id = %file.file.partition_id,
parquet_file_object_store_id = uuid.to_string().as_str(),
"built parquet chunk from metadata"
);


@ -70,8 +70,7 @@ impl ParquetFileSink for MockParquetFileSink {
let out = ((row_count > 0) || !self.filter_empty_files).then(|| ParquetFileParams {
namespace_id: partition.namespace_id,
table_id: partition.table.id,
partition_id: partition.partition_id,
partition_hash_id: partition.partition_hash_id.clone(),
partition_id: partition.transition_partition_id(),
object_store_id: Uuid::from_u128(guard.len() as u128),
min_time: Timestamp::new(0),
max_time: Timestamp::new(0),
@ -95,7 +94,7 @@ impl ParquetFileSink for MockParquetFileSink {
#[cfg(test)]
mod tests {
use arrow_util::assert_batches_eq;
use data_types::{NamespaceId, PartitionId, TableId};
use data_types::{NamespaceId, TableId};
use datafusion::{
arrow::{array::new_null_array, datatypes::DataType},
physical_plan::stream::RecordBatchStreamAdapter,
@ -159,7 +158,7 @@ mod tests {
Arc::clone(&schema),
futures::stream::once(async move { Ok(record_batch_captured) }),
));
let partition_hash_id = partition.partition_hash_id.clone();
let partition_id = partition.transition_partition_id();
assert_eq!(
sink.store(stream, Arc::clone(&partition), level, max_l0_created_at)
.await
@ -167,8 +166,7 @@ mod tests {
Some(ParquetFileParams {
namespace_id: NamespaceId::new(2),
table_id: TableId::new(3),
partition_id: PartitionId::new(1),
partition_hash_id,
partition_id,
object_store_id: Uuid::from_u128(2),
min_time: Timestamp::new(0),
max_time: Timestamp::new(0),
@ -223,7 +221,7 @@ mod tests {
Arc::clone(&schema),
futures::stream::empty(),
));
let partition_hash_id = partition.partition_hash_id.clone();
let partition_id = partition.transition_partition_id();
assert_eq!(
sink.store(stream, Arc::clone(&partition), level, max_l0_created_at)
.await
@ -231,8 +229,7 @@ mod tests {
Some(ParquetFileParams {
namespace_id: NamespaceId::new(2),
table_id: TableId::new(3),
partition_id: PartitionId::new(1),
partition_hash_id,
partition_id,
object_store_id: Uuid::from_u128(0),
min_time: Timestamp::new(0),
max_time: Timestamp::new(0),


@ -1,19 +1,35 @@
use std::{collections::HashMap, fmt::Display};
use async_trait::async_trait;
use data_types::{ParquetFile, PartitionId};
use super::PartitionFilesSource;
use async_trait::async_trait;
use data_types::{ParquetFile, PartitionId, TransitionPartitionId};
#[derive(Debug)]
pub struct MockPartitionFilesSource {
files: HashMap<PartitionId, Vec<ParquetFile>>,
// This complexity is because we're in the process of moving to partition hash IDs rather than
// partition catalog IDs, and Parquet files might only have the partition hash ID on their
// record, but the compactor deals with partition catalog IDs because we haven't transitioned
// it yet. This should become simpler when the transition is complete.
partition_lookup: HashMap<PartitionId, TransitionPartitionId>,
file_lookup: HashMap<TransitionPartitionId, Vec<ParquetFile>>,
}
impl MockPartitionFilesSource {
#[allow(dead_code)] // not used anywhere
pub fn new(files: HashMap<PartitionId, Vec<ParquetFile>>) -> Self {
Self { files }
#[cfg(test)]
pub fn new(
partition_lookup: HashMap<PartitionId, TransitionPartitionId>,
parquet_files: Vec<ParquetFile>,
) -> Self {
let mut file_lookup: HashMap<TransitionPartitionId, Vec<ParquetFile>> = HashMap::new();
for file in parquet_files {
let files = file_lookup.entry(file.partition_id.clone()).or_default();
files.push(file);
}
Self {
partition_lookup,
file_lookup,
}
}
}
@ -25,46 +41,60 @@ impl Display for MockPartitionFilesSource {
#[async_trait]
impl PartitionFilesSource for MockPartitionFilesSource {
async fn fetch(&self, partition: PartitionId) -> Vec<ParquetFile> {
self.files.get(&partition).cloned().unwrap_or_default()
async fn fetch(&self, partition_id: PartitionId) -> Vec<ParquetFile> {
self.partition_lookup
.get(&partition_id)
.and_then(|partition_hash_id| self.file_lookup.get(partition_hash_id).cloned())
.unwrap_or_default()
}
}
#[cfg(test)]
mod tests {
use iox_tests::ParquetFileBuilder;
use super::*;
use iox_tests::{partition_identifier, ParquetFileBuilder};
#[test]
fn test_display() {
assert_eq!(
MockPartitionFilesSource::new(HashMap::default()).to_string(),
MockPartitionFilesSource::new(Default::default(), Default::default()).to_string(),
"mock",
)
}
#[tokio::test]
async fn test_fetch() {
let f_1_1 = ParquetFileBuilder::new(1).with_partition(1).build();
let f_1_2 = ParquetFileBuilder::new(2).with_partition(1).build();
let f_2_1 = ParquetFileBuilder::new(3).with_partition(2).build();
let partition_id_1 = PartitionId::new(1);
let partition_id_2 = PartitionId::new(2);
let partition_identifier_1 = partition_identifier(1);
let partition_identifier_2 = partition_identifier(2);
let f_1_1 = ParquetFileBuilder::new(1)
.with_partition(partition_identifier_1.clone())
.build();
let f_1_2 = ParquetFileBuilder::new(2)
.with_partition(partition_identifier_1.clone())
.build();
let f_2_1 = ParquetFileBuilder::new(3)
.with_partition(partition_identifier_2.clone())
.build();
let files = HashMap::from([
(PartitionId::new(1), vec![f_1_1.clone(), f_1_2.clone()]),
(PartitionId::new(2), vec![f_2_1.clone()]),
let partition_lookup = HashMap::from([
(partition_id_1, partition_identifier_1.clone()),
(partition_id_2, partition_identifier_2.clone()),
]);
let source = MockPartitionFilesSource::new(files);
let files = vec![f_1_1.clone(), f_1_2.clone(), f_2_1.clone()];
let source = MockPartitionFilesSource::new(partition_lookup, files);
// different partitions
assert_eq!(
source.fetch(PartitionId::new(1)).await,
source.fetch(partition_id_1).await,
vec![f_1_1.clone(), f_1_2.clone()],
);
assert_eq!(source.fetch(PartitionId::new(2)).await, vec![f_2_1],);
assert_eq!(source.fetch(partition_id_2).await, vec![f_2_1],);
// fetching does not drain
assert_eq!(source.fetch(PartitionId::new(1)).await, vec![f_1_1, f_1_2],);
assert_eq!(source.fetch(partition_id_1).await, vec![f_1_1, f_1_2],);
// unknown partition => empty result
assert_eq!(source.fetch(PartitionId::new(3)).await, vec![],);


@ -172,7 +172,11 @@ impl RoundInfoSource for LevelBasedRoundInfo {
_partition_info: &PartitionInfo,
files: &[ParquetFile],
) -> Result<RoundInfo, DynError> {
let start_level = get_start_level(files);
let start_level = get_start_level(
files,
self.max_num_files_per_plan,
self.max_total_file_size_per_plan,
);
if self.too_many_small_files_to_compact(files, start_level) {
return Ok(RoundInfo::ManySmallFiles {
@ -187,23 +191,53 @@ impl RoundInfoSource for LevelBasedRoundInfo {
}
}
fn get_start_level(files: &[ParquetFile]) -> CompactionLevel {
// get_start_level decides what level to start compaction from. Often this is the lowest level
// we have ParquetFiles in, but occasionally we decide to compact L1->L2 when L0s still exist.
//
// If we ignore the invariants about where intra-level overlaps are allowed, this would be a pure math
// problem of optimizing write amplification.
//
// However, allowing intra-level overlaps in L0 but not L1/L2 adds extra challenge to compacting L0s to L1.
// This is especially true when there are large quantities of overlapping L0s and L1s, potentially resulting
// in many split/compact cycles to resolve the overlaps.
//
// Since L1 & L2 only have inter-level overlaps, they can be compacted with just a few splits to align the L1s
// with the L2s. The relative ease of moving data from L1 to L2 provides additional motivation to compact the
// L1s to L2s when a backlog of L0s exists. The easily solvable L1->L2 compaction can give us a clean slate in
// L1, greatly simplifying the remaining L0->L1 compactions.
fn get_start_level(files: &[ParquetFile], max_files: usize, max_bytes: usize) -> CompactionLevel {
// panic if the files are empty
assert!(!files.is_empty());
// Start with initial level
// If there are files in this level, it is the start level
// Otherwise repeat until reaching the final level.
let mut level = CompactionLevel::Initial;
while level != CompactionLevel::Final {
if files.iter().any(|f| f.compaction_level == level) {
return level;
}
let mut l0_cnt: usize = 0;
let mut l0_bytes: usize = 0;
let mut l1_bytes: usize = 0;
level = level.next();
for f in files {
match f.compaction_level {
CompactionLevel::Initial => {
l0_cnt += 1;
l0_bytes += f.file_size_bytes as usize;
}
CompactionLevel::FileNonOverlapped => {
l1_bytes += f.file_size_bytes as usize;
}
_ => {}
}
}
level
if l1_bytes > 3 * max_bytes && (l0_cnt > max_files || l0_bytes > max_bytes) {
// L1 is big enough to pose an overlap challenge compacting from L0, and there is quite a bit more coming from L0.
// The criteria for this early L1->L2 compaction significantly impact write amplification. The above values optimize
// existing test cases, but may be changed as additional test cases are added.
CompactionLevel::FileNonOverlapped
} else if l0_bytes > 0 {
CompactionLevel::Initial
} else if l1_bytes > 0 {
CompactionLevel::FileNonOverlapped
} else {
CompactionLevel::Final
}
}
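
A rough worked example of the new start-level heuristic (a sketch under assumed per-plan limits; `starts_at_l1` is a hypothetical helper that restates only the branch condition above):

// Simplified restatement of the early L1->L2 condition, for illustration only;
// the numbers below are assumed limits, not values taken from the tests.
fn starts_at_l1(l0_cnt: usize, l0_bytes: usize, l1_bytes: usize, max_files: usize, max_bytes: usize) -> bool {
    l1_bytes > 3 * max_bytes && (l0_cnt > max_files || l0_bytes > max_bytes)
}

fn main() {
    const MB: usize = 1024 * 1024;
    // Assume limits of 20 files / 100 MB per plan.
    // 350 MB of L1 plus 30 backlogged L0s: compact L1->L2 first for a clean slate.
    assert!(starts_at_l1(30, 40 * MB, 350 * MB, 20, 100 * MB));
    // Same L1 size but only a handful of small L0s: start from L0 as usual.
    assert!(!starts_at_l1(5, 40 * MB, 350 * MB, 20, 100 * MB));
}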
fn get_num_overlapped_files(


@ -301,7 +301,26 @@ pub fn merge_small_l0_chains(
for chain in &chains {
let this_chain_bytes = chain.iter().map(|f| f.file_size_bytes as usize).sum();
if prior_chain_bytes > 0 && prior_chain_bytes + this_chain_bytes <= max_compact_size {
// matching max_l0_created_at times indicates that the files were deliberately split. We shouldn't merge
// chains with matching max_l0_created_at times, because that would encourage undoing the previous split,
// which at a minimum increases write amplification, and may cause unproductive split/compact loops.
let mut matches = 0;
if prior_chain_bytes > 0 {
for f in chain.iter() {
for f2 in &merged_chains[prior_chain_idx as usize] {
if f.max_l0_created_at == f2.max_l0_created_at {
matches += 1;
break;
}
}
}
}
// Merge it if: there is a prior chain to merge with, merging wouldn't make it too big, and it wouldn't undo a previous split
if prior_chain_bytes > 0
&& prior_chain_bytes + this_chain_bytes <= max_compact_size
&& matches == 0
{
// this chain can be added to the prior chain.
merged_chains[prior_chain_idx as usize].append(&mut chain.clone());
prior_chain_bytes += this_chain_bytes;


@ -68,8 +68,8 @@ async fn test_num_files_over_limit() {
assert_levels(
&files,
vec![
(8, CompactionLevel::FileNonOverlapped),
(9, CompactionLevel::FileNonOverlapped),
(10, CompactionLevel::FileNonOverlapped),
],
);
}


@ -746,97 +746,85 @@ async fn random_backfill_over_l2s() {
- "Committing partition 1:"
- " Soft Deleting 4 files: L0.76, L0.77, L0.79, L0.80"
- " Creating 8 files"
- "**** Simulation run 15, type=compact(ManySmallFiles). 10 Input Files, 200mb total:"
- "L0 "
- "L0.75[42,356] 1.04us 33mb|-----------L0.75-----------| "
- "L0.86[357,357] 1.04us 0b |L0.86| "
- "L0.87[358,670] 1.04us 33mb |-----------L0.87-----------| "
- "L0.84[671,672] 1.04us 109kb |L0.84| "
- "L0.85[673,986] 1.04us 33mb |-----------L0.85-----------| "
- "L0.78[42,356] 1.05us 33mb|-----------L0.78-----------| "
- "L0.90[357,357] 1.05us 0b |L0.90| "
- "L0.91[358,670] 1.05us 33mb |-----------L0.91-----------| "
- "L0.88[671,672] 1.05us 109kb |L0.88| "
- "L0.89[673,986] 1.05us 33mb |-----------L0.89-----------| "
- "**** 1 Output Files (parquet_file_id not yet assigned), 200mb total:"
- "L0, all files 200mb "
- "L0.?[42,986] 1.05us |------------------------------------------L0.?------------------------------------------|"
- "Committing partition 1:"
- " Soft Deleting 10 files: L0.75, L0.78, L0.84, L0.85, L0.86, L0.87, L0.88, L0.89, L0.90, L0.91"
- " Creating 1 files"
- "**** Simulation run 16, type=split(HighL0OverlapSingleFile)(split_times=[670]). 1 Input Files, 100mb total:"
- "L1, all files 100mb "
- "L1.82[358,672] 1.03us |-----------------------------------------L1.82------------------------------------------|"
- "**** 2 Output Files (parquet_file_id not yet assigned), 100mb total:"
- "L1 "
- "L1.?[358,670] 1.03us 99mb|-----------------------------------------L1.?------------------------------------------| "
- "L1.?[671,672] 1.03us 651kb |L1.?|"
- "**** Simulation run 17, type=split(HighL0OverlapSingleFile)(split_times=[356]). 1 Input Files, 100mb total:"
- "L1, all files 100mb "
- "L1.81[42,357] 1.03us |-----------------------------------------L1.81------------------------------------------|"
- "**** 2 Output Files (parquet_file_id not yet assigned), 100mb total:"
- "L1 "
- "L1.?[42,356] 1.03us 100mb|-----------------------------------------L1.?------------------------------------------| "
- "L1.?[357,357] 1.03us 325kb |L1.?|"
- "**** Simulation run 18, type=split(HighL0OverlapSingleFile)(split_times=[356, 670]). 1 Input Files, 200mb total:"
- "L0, all files 200mb "
- "L0.92[42,986] 1.05us |-----------------------------------------L0.92------------------------------------------|"
- "**** 3 Output Files (parquet_file_id not yet assigned), 200mb total:"
- "L0 "
- "L0.?[42,356] 1.05us 67mb |-----------L0.?------------| "
- "L0.?[357,670] 1.05us 66mb |-----------L0.?------------| "
- "L0.?[671,986] 1.05us 67mb |------------L0.?------------| "
- "Committing partition 1:"
- " Soft Deleting 3 files: L1.81, L1.82, L0.92"
- " Creating 7 files"
- "**** Simulation run 19, type=split(ReduceOverlap)(split_times=[672]). 1 Input Files, 67mb total:"
- "**** Simulation run 15, type=compact(ManySmallFiles). 2 Input Files, 67mb total:"
- "L0, all files 33mb "
- "L0.75[42,356] 1.04us |-----------------------------------------L0.75------------------------------------------|"
- "L0.78[42,356] 1.05us |-----------------------------------------L0.78------------------------------------------|"
- "**** 1 Output Files (parquet_file_id not yet assigned), 67mb total:"
- "L0, all files 67mb "
- "L0.99[671,986] 1.05us |-----------------------------------------L0.99------------------------------------------|"
- "**** 2 Output Files (parquet_file_id not yet assigned), 67mb total:"
- "L0 "
- "L0.?[671,672] 1.05us 218kb|L0.?| "
- "L0.?[673,986] 1.05us 67mb|-----------------------------------------L0.?------------------------------------------| "
- "**** Simulation run 20, type=split(ReduceOverlap)(split_times=[357]). 1 Input Files, 66mb total:"
- "L0, all files 66mb "
- "L0.98[357,670] 1.05us |-----------------------------------------L0.98------------------------------------------|"
- "**** 2 Output Files (parquet_file_id not yet assigned), 66mb total:"
- "L0 "
- "L0.?[357,357] 1.05us 0b |L0.?| "
- "L0.?[358,670] 1.05us 66mb|-----------------------------------------L0.?------------------------------------------| "
- "L0.?[42,356] 1.05us |------------------------------------------L0.?------------------------------------------|"
- "Committing partition 1:"
- " Soft Deleting 2 files: L0.98, L0.99"
- " Creating 4 files"
- "**** Simulation run 21, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[232]). 4 Input Files, 167mb total:"
- " Soft Deleting 2 files: L0.75, L0.78"
- " Creating 1 files"
- "**** Simulation run 16, type=compact(ManySmallFiles). 2 Input Files, 66mb total:"
- "L0, all files 33mb "
- "L0.87[358,670] 1.04us |-----------------------------------------L0.87------------------------------------------|"
- "L0.91[358,670] 1.05us |-----------------------------------------L0.91------------------------------------------|"
- "**** 1 Output Files (parquet_file_id not yet assigned), 66mb total:"
- "L0, all files 66mb "
- "L0.?[358,670] 1.05us |------------------------------------------L0.?------------------------------------------|"
- "Committing partition 1:"
- " Soft Deleting 2 files: L0.87, L0.91"
- " Creating 1 files"
- "**** Simulation run 17, type=compact(ManySmallFiles). 2 Input Files, 218kb total:"
- "L0, all files 109kb "
- "L0.84[671,672] 1.04us |-----------------------------------------L0.84------------------------------------------|"
- "L0.88[671,672] 1.05us |-----------------------------------------L0.88------------------------------------------|"
- "**** 1 Output Files (parquet_file_id not yet assigned), 218kb total:"
- "L0, all files 218kb "
- "L0.?[671,672] 1.05us |------------------------------------------L0.?------------------------------------------|"
- "Committing partition 1:"
- " Soft Deleting 2 files: L0.84, L0.88"
- " Creating 1 files"
- "**** Simulation run 18, type=compact(ManySmallFiles). 2 Input Files, 67mb total:"
- "L0, all files 33mb "
- "L0.85[673,986] 1.04us |-----------------------------------------L0.85------------------------------------------|"
- "L0.89[673,986] 1.05us |-----------------------------------------L0.89------------------------------------------|"
- "**** 1 Output Files (parquet_file_id not yet assigned), 67mb total:"
- "L0, all files 67mb "
- "L0.?[673,986] 1.05us |------------------------------------------L0.?------------------------------------------|"
- "Committing partition 1:"
- " Soft Deleting 2 files: L0.85, L0.89"
- " Creating 1 files"
- "**** Simulation run 19, type=compact(ManySmallFiles). 2 Input Files, 0b total:"
- "L0, all files 0b "
- "L0.86[357,357] 1.04us |-----------------------------------------L0.86------------------------------------------|"
- "L0.90[357,357] 1.05us |-----------------------------------------L0.90------------------------------------------|"
- "**** 1 Output Files (parquet_file_id not yet assigned), 0b total:"
- "L0, all files 0b "
- "L0.?[357,357] 1.05us |------------------------------------------L0.?------------------------------------------|"
- "Committing partition 1:"
- " Soft Deleting 2 files: L0.86, L0.90"
- " Creating 1 files"
- "**** Simulation run 20, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[232]). 3 Input Files, 167mb total:"
- "L0 "
- "L0.97[42,356] 1.05us 67mb|-----------------------------------------L0.97-----------------------------------------| "
- "L0.102[357,357] 1.05us 0b |L0.102|"
- "L0.92[42,356] 1.05us 67mb|-----------------------------------------L0.92-----------------------------------------| "
- "L0.96[357,357] 1.05us 0b |L0.96|"
- "L1 "
- "L1.95[42,356] 1.03us 100mb|-----------------------------------------L1.95-----------------------------------------| "
- "L1.96[357,357] 1.03us 325kb |L1.96|"
- "L1.81[42,357] 1.03us 100mb|-----------------------------------------L1.81------------------------------------------|"
- "**** 2 Output Files (parquet_file_id not yet assigned), 167mb total:"
- "L1 "
- "L1.?[42,232] 1.05us 101mb|------------------------L1.?------------------------| "
- "L1.?[233,357] 1.05us 66mb |--------------L1.?---------------| "
- "Committing partition 1:"
- " Soft Deleting 4 files: L1.95, L1.96, L0.97, L0.102"
- " Soft Deleting 3 files: L1.81, L0.92, L0.96"
- " Creating 2 files"
- "**** Simulation run 22, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[547]). 4 Input Files, 166mb total:"
- "**** Simulation run 21, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[547]). 3 Input Files, 166mb total:"
- "L0 "
- "L0.103[358,670] 1.05us 66mb|----------------------------------------L0.103-----------------------------------------| "
- "L0.100[671,672] 1.05us 218kb |L0.100|"
- "L0.93[358,670] 1.05us 66mb|-----------------------------------------L0.93-----------------------------------------| "
- "L0.94[671,672] 1.05us 218kb |L0.94|"
- "L1 "
- "L1.93[358,670] 1.03us 99mb|-----------------------------------------L1.93-----------------------------------------| "
- "L1.94[671,672] 1.03us 651kb |L1.94|"
- "L1.82[358,672] 1.03us 100mb|-----------------------------------------L1.82------------------------------------------|"
- "**** 2 Output Files (parquet_file_id not yet assigned), 166mb total:"
- "L1 "
- "L1.?[358,547] 1.05us 100mb|------------------------L1.?------------------------| "
- "L1.?[548,672] 1.05us 66mb |--------------L1.?---------------| "
- "Committing partition 1:"
- " Soft Deleting 4 files: L1.93, L1.94, L0.100, L0.103"
- " Soft Deleting 3 files: L1.82, L0.93, L0.94"
- " Creating 2 files"
- "**** Simulation run 23, type=split(CompactAndSplitOutput(TotalSizeLessThanMaxCompactSize))(split_times=[861]). 2 Input Files, 167mb total:"
- "**** Simulation run 22, type=split(CompactAndSplitOutput(TotalSizeLessThanMaxCompactSize))(split_times=[861]). 2 Input Files, 167mb total:"
- "L0 "
- "L0.101[673,986] 1.05us 67mb|-----------------------------------------L0.101-----------------------------------------|"
- "L0.95[673,986] 1.05us 67mb|-----------------------------------------L0.95------------------------------------------|"
- "L1 "
- "L1.83[673,986] 1.03us 100mb|-----------------------------------------L1.83------------------------------------------|"
- "**** 2 Output Files (parquet_file_id not yet assigned), 167mb total:"
@ -844,60 +832,60 @@ async fn random_backfill_over_l2s() {
- "L1.?[673,861] 1.05us 100mb|------------------------L1.?------------------------| "
- "L1.?[862,986] 1.05us 67mb |--------------L1.?---------------| "
- "Committing partition 1:"
- " Soft Deleting 2 files: L1.83, L0.101"
- " Soft Deleting 2 files: L1.83, L0.95"
- " Creating 2 files"
- "**** Simulation run 24, type=split(ReduceOverlap)(split_times=[399, 499]). 1 Input Files, 100mb total:"
- "**** Simulation run 23, type=split(ReduceOverlap)(split_times=[399, 499]). 1 Input Files, 100mb total:"
- "L1, all files 100mb "
- "L1.106[358,547] 1.05us |-----------------------------------------L1.106-----------------------------------------|"
- "L1.99[358,547] 1.05us |-----------------------------------------L1.99------------------------------------------|"
- "**** 3 Output Files (parquet_file_id not yet assigned), 100mb total:"
- "L1 "
- "L1.?[358,399] 1.05us 22mb|------L1.?-------| "
- "L1.?[400,499] 1.05us 52mb |--------------------L1.?---------------------| "
- "L1.?[500,547] 1.05us 26mb |--------L1.?--------| "
- "**** Simulation run 25, type=split(ReduceOverlap)(split_times=[299]). 1 Input Files, 66mb total:"
- "**** Simulation run 24, type=split(ReduceOverlap)(split_times=[299]). 1 Input Files, 66mb total:"
- "L1, all files 66mb "
- "L1.105[233,357] 1.05us |-----------------------------------------L1.105-----------------------------------------|"
- "L1.98[233,357] 1.05us |-----------------------------------------L1.98------------------------------------------|"
- "**** 2 Output Files (parquet_file_id not yet assigned), 66mb total:"
- "L1 "
- "L1.?[233,299] 1.05us 35mb|--------------------L1.?---------------------| "
- "L1.?[300,357] 1.05us 31mb |-----------------L1.?------------------| "
- "**** Simulation run 26, type=split(ReduceOverlap)(split_times=[99, 199]). 1 Input Files, 101mb total:"
- "**** Simulation run 25, type=split(ReduceOverlap)(split_times=[99, 199]). 1 Input Files, 101mb total:"
- "L1, all files 101mb "
- "L1.104[42,232] 1.05us |-----------------------------------------L1.104-----------------------------------------|"
- "L1.97[42,232] 1.05us |-----------------------------------------L1.97------------------------------------------|"
- "**** 3 Output Files (parquet_file_id not yet assigned), 101mb total:"
- "L1 "
- "L1.?[42,99] 1.05us 30mb |----------L1.?-----------| "
- "L1.?[100,199] 1.05us 52mb |--------------------L1.?--------------------| "
- "L1.?[200,232] 1.05us 18mb |----L1.?-----| "
- "**** Simulation run 27, type=split(ReduceOverlap)(split_times=[599]). 1 Input Files, 66mb total:"
- "**** Simulation run 26, type=split(ReduceOverlap)(split_times=[599]). 1 Input Files, 66mb total:"
- "L1, all files 66mb "
- "L1.107[548,672] 1.05us |-----------------------------------------L1.107-----------------------------------------|"
- "L1.100[548,672] 1.05us |-----------------------------------------L1.100-----------------------------------------|"
- "**** 2 Output Files (parquet_file_id not yet assigned), 66mb total:"
- "L1 "
- "L1.?[548,599] 1.05us 27mb|---------------L1.?----------------| "
- "L1.?[600,672] 1.05us 39mb |-----------------------L1.?-----------------------| "
- "**** Simulation run 28, type=split(ReduceOverlap)(split_times=[899]). 1 Input Files, 67mb total:"
- "**** Simulation run 27, type=split(ReduceOverlap)(split_times=[899]). 1 Input Files, 67mb total:"
- "L1, all files 67mb "
- "L1.109[862,986] 1.05us |-----------------------------------------L1.109-----------------------------------------|"
- "L1.102[862,986] 1.05us |-----------------------------------------L1.102-----------------------------------------|"
- "**** 2 Output Files (parquet_file_id not yet assigned), 67mb total:"
- "L1 "
- "L1.?[862,899] 1.05us 20mb|----------L1.?----------| "
- "L1.?[900,986] 1.05us 47mb |----------------------------L1.?----------------------------| "
- "**** Simulation run 29, type=split(ReduceOverlap)(split_times=[699, 799]). 1 Input Files, 100mb total:"
- "**** Simulation run 28, type=split(ReduceOverlap)(split_times=[699, 799]). 1 Input Files, 100mb total:"
- "L1, all files 100mb "
- "L1.108[673,861] 1.05us |-----------------------------------------L1.108-----------------------------------------|"
- "L1.101[673,861] 1.05us |-----------------------------------------L1.101-----------------------------------------|"
- "**** 3 Output Files (parquet_file_id not yet assigned), 100mb total:"
- "L1 "
- "L1.?[673,699] 1.05us 14mb|---L1.?---| "
- "L1.?[700,799] 1.05us 53mb |--------------------L1.?---------------------| "
- "L1.?[800,861] 1.05us 34mb |-----------L1.?------------| "
- "Committing partition 1:"
- " Soft Deleting 6 files: L1.104, L1.105, L1.106, L1.107, L1.108, L1.109"
- " Soft Deleting 6 files: L1.97, L1.98, L1.99, L1.100, L1.101, L1.102"
- " Creating 15 files"
- "**** Simulation run 30, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[71, 142]). 4 Input Files, 283mb total:"
- "**** Simulation run 29, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[71, 142]). 4 Input Files, 283mb total:"
- "L1 "
- "L1.115[42,99] 1.05us 30mb |--------L1.115---------| "
- "L1.116[100,199] 1.05us 52mb |------------------L1.116------------------| "
- "L1.108[42,99] 1.05us 30mb |--------L1.108---------| "
- "L1.109[100,199] 1.05us 52mb |------------------L1.109------------------| "
- "L2 "
- "L2.1[0,99] 99ns 100mb |-------------------L2.1-------------------| "
- "L2.2[100,199] 199ns 100mb |-------------------L2.2-------------------| "
@ -907,13 +895,13 @@ async fn random_backfill_over_l2s() {
- "L2.?[72,142] 1.05us 99mb |------------L2.?-------------| "
- "L2.?[143,199] 1.05us 82mb |---------L2.?----------| "
- "Committing partition 1:"
- " Soft Deleting 4 files: L2.1, L2.2, L1.115, L1.116"
- " Soft Deleting 4 files: L2.1, L2.2, L1.108, L1.109"
- " Creating 3 files"
- "**** Simulation run 31, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[271, 342]). 5 Input Files, 284mb total:"
- "**** Simulation run 30, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[271, 342]). 5 Input Files, 284mb total:"
- "L1 "
- "L1.117[200,232] 1.05us 18mb|---L1.117---| "
- "L1.113[233,299] 1.05us 35mb |----------L1.113-----------| "
- "L1.114[300,357] 1.05us 31mb |--------L1.114---------| "
- "L1.110[200,232] 1.05us 18mb|---L1.110---| "
- "L1.106[233,299] 1.05us 35mb |----------L1.106-----------| "
- "L1.107[300,357] 1.05us 31mb |--------L1.107---------| "
- "L2 "
- "L2.3[200,299] 299ns 100mb|-------------------L2.3-------------------| "
- "L2.4[300,399] 399ns 100mb |-------------------L2.4-------------------| "
@ -923,14 +911,14 @@ async fn random_backfill_over_l2s() {
- "L2.?[272,342] 1.05us 100mb |------------L2.?-------------| "
- "L2.?[343,399] 1.05us 83mb |---------L2.?----------| "
- "Committing partition 1:"
- " Soft Deleting 5 files: L2.3, L2.4, L1.113, L1.114, L1.117"
- " Soft Deleting 5 files: L2.3, L2.4, L1.106, L1.107, L1.110"
- " Creating 3 files"
- "**** Simulation run 32, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[404, 465]). 4 Input Files, 257mb total:"
- "**** Simulation run 31, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[404, 465]). 4 Input Files, 257mb total:"
- "L1 "
- "L1.110[358,399] 1.05us 22mb |-------L1.110--------| "
- "L1.111[400,499] 1.05us 52mb |------------------------L1.111-------------------------| "
- "L1.103[358,399] 1.05us 22mb |-------L1.103--------| "
- "L1.104[400,499] 1.05us 52mb |------------------------L1.104-------------------------| "
- "L2 "
- "L2.130[343,399] 1.05us 83mb|------------L2.130------------| "
- "L2.123[343,399] 1.05us 83mb|------------L2.123------------| "
- "L2.5[400,499] 499ns 100mb |-------------------------L2.5--------------------------| "
- "**** 3 Output Files (parquet_file_id not yet assigned), 257mb total:"
- "L2 "
@ -938,13 +926,13 @@ async fn random_backfill_over_l2s() {
- "L2.?[405,465] 1.05us 99mb |--------------L2.?--------------| "
- "L2.?[466,499] 1.05us 58mb |------L2.?-------| "
- "Committing partition 1:"
- " Soft Deleting 4 files: L2.5, L1.110, L1.111, L2.130"
- " Soft Deleting 4 files: L2.5, L1.103, L1.104, L2.123"
- " Creating 3 files"
- "**** Simulation run 33, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[569, 638]). 5 Input Files, 292mb total:"
- "**** Simulation run 32, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[569, 638]). 5 Input Files, 292mb total:"
- "L1 "
- "L1.112[500,547] 1.05us 26mb|------L1.112-------| "
- "L1.118[548,599] 1.05us 27mb |-------L1.118--------| "
- "L1.119[600,672] 1.05us 39mb |------------L1.119------------| "
- "L1.105[500,547] 1.05us 26mb|------L1.105-------| "
- "L1.111[548,599] 1.05us 27mb |-------L1.111--------| "
- "L1.112[600,672] 1.05us 39mb |------------L1.112------------| "
- "L2 "
- "L2.6[500,599] 599ns 100mb|-------------------L2.6-------------------| "
- "L2.7[600,699] 699ns 100mb |-------------------L2.7-------------------| "
@ -954,14 +942,14 @@ async fn random_backfill_over_l2s() {
- "L2.?[570,638] 1.05us 100mb |------------L2.?------------| "
- "L2.?[639,699] 1.05us 91mb |----------L2.?-----------| "
- "Committing partition 1:"
- " Soft Deleting 5 files: L2.6, L2.7, L1.112, L1.118, L1.119"
- " Soft Deleting 5 files: L2.6, L2.7, L1.105, L1.111, L1.112"
- " Creating 3 files"
- "**** Simulation run 34, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[702, 765]). 4 Input Files, 258mb total:"
- "**** Simulation run 33, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[702, 765]). 4 Input Files, 258mb total:"
- "L1 "
- "L1.122[673,699] 1.05us 14mb |---L1.122---| "
- "L1.123[700,799] 1.05us 53mb |-----------------------L1.123------------------------| "
- "L1.115[673,699] 1.05us 14mb |---L1.115---| "
- "L1.116[700,799] 1.05us 53mb |-----------------------L1.116------------------------| "
- "L2 "
- "L2.136[639,699] 1.05us 91mb|------------L2.136-------------| "
- "L2.129[639,699] 1.05us 91mb|------------L2.129-------------| "
- "L2.8[700,799] 799ns 100mb |------------------------L2.8-------------------------| "
- "**** 3 Output Files (parquet_file_id not yet assigned), 258mb total:"
- "L2 "
@ -969,12 +957,12 @@ async fn random_backfill_over_l2s() {
- "L2.?[703,765] 1.05us 100mb |--------------L2.?--------------| "
- "L2.?[766,799] 1.05us 56mb |------L2.?------| "
- "Committing partition 1:"
- " Soft Deleting 4 files: L2.8, L1.122, L1.123, L2.136"
- " Soft Deleting 4 files: L2.8, L1.115, L1.116, L2.129"
- " Creating 3 files"
- "**** Simulation run 35, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[865]). 3 Input Files, 154mb total:"
- "**** Simulation run 34, type=split(CompactAndSplitOutput(FoundSubsetLessThanMaxCompactSize))(split_times=[865]). 3 Input Files, 154mb total:"
- "L1 "
- "L1.124[800,861] 1.05us 34mb|-----------------------L1.124------------------------| "
- "L1.120[862,899] 1.05us 20mb |------------L1.120-------------| "
- "L1.117[800,861] 1.05us 34mb|-----------------------L1.117------------------------| "
- "L1.113[862,899] 1.05us 20mb |------------L1.113-------------| "
- "L2 "
- "L2.9[800,899] 899ns 100mb|-----------------------------------------L2.9------------------------------------------| "
- "**** 2 Output Files (parquet_file_id not yet assigned), 154mb total:"
@ -982,28 +970,28 @@ async fn random_backfill_over_l2s() {
- "L2.?[800,865] 1.05us 101mb|--------------------------L2.?---------------------------| "
- "L2.?[866,899] 1.05us 53mb |-----------L2.?------------| "
- "Committing partition 1:"
- " Soft Deleting 3 files: L2.9, L1.120, L1.124"
- " Soft Deleting 3 files: L2.9, L1.113, L1.117"
- " Creating 2 files"
- "**** Final Output Files (4.58gb written)"
- "**** Final Output Files (4.06gb written)"
- "L1 "
- "L1.121[900,986] 1.05us 47mb |L1.121| "
- "L1.114[900,986] 1.05us 47mb |L1.114| "
- "L2 "
- "L2.10[900,999] 999ns 100mb |L2.10-| "
- "L2.125[0,71] 1.05us 101mb|L2.125| "
- "L2.126[72,142] 1.05us 99mb |L2.126| "
- "L2.127[143,199] 1.05us 82mb |L2.127| "
- "L2.128[200,271] 1.05us 101mb |L2.128| "
- "L2.129[272,342] 1.05us 100mb |L2.129| "
- "L2.131[343,404] 1.05us 100mb |L2.131| "
- "L2.132[405,465] 1.05us 99mb |L2.132| "
- "L2.133[466,499] 1.05us 58mb |L2.133| "
- "L2.134[500,569] 1.05us 101mb |L2.134| "
- "L2.135[570,638] 1.05us 100mb |L2.135| "
- "L2.137[639,702] 1.05us 101mb |L2.137| "
- "L2.138[703,765] 1.05us 100mb |L2.138| "
- "L2.139[766,799] 1.05us 56mb |L2.139| "
- "L2.140[800,865] 1.05us 101mb |L2.140| "
- "L2.141[866,899] 1.05us 53mb |L2.141| "
- "L2.118[0,71] 1.05us 101mb|L2.118| "
- "L2.119[72,142] 1.05us 99mb |L2.119| "
- "L2.120[143,199] 1.05us 82mb |L2.120| "
- "L2.121[200,271] 1.05us 101mb |L2.121| "
- "L2.122[272,342] 1.05us 100mb |L2.122| "
- "L2.124[343,404] 1.05us 100mb |L2.124| "
- "L2.125[405,465] 1.05us 99mb |L2.125| "
- "L2.126[466,499] 1.05us 58mb |L2.126| "
- "L2.127[500,569] 1.05us 101mb |L2.127| "
- "L2.128[570,638] 1.05us 100mb |L2.128| "
- "L2.130[639,702] 1.05us 101mb |L2.130| "
- "L2.131[703,765] 1.05us 100mb |L2.131| "
- "L2.132[766,799] 1.05us 56mb |L2.132| "
- "L2.133[800,865] 1.05us 101mb |L2.133| "
- "L2.134[866,899] 1.05us 53mb |L2.134| "
"###
);
}
@ -3020,63 +3008,66 @@ async fn actual_case_from_catalog_1() {
- "WARNING: file L0.161[327,333] 336ns 183mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L0.162[330,338] 340ns 231mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L0.163[331,338] 341ns 232mb exceeds soft limit 100mb by more than 50%"
- "**** Final Output Files (17.64gb written)"
- "**** Final Output Files (15.47gb written)"
- "L2 "
- "L2.578[134,149] 342ns 202mb |L2.578| "
- "L2.579[150,165] 342ns 218mb |L2.579| "
- "L2.580[166,176] 342ns 186mb |L2.580| "
- "L2.581[177,182] 342ns 150mb |L2.581| "
- "L2.582[183,197] 342ns 267mb |L2.582| "
- "L2.583[198,207] 342ns 157mb |L2.583| "
- "L2.584[208,220] 342ns 147mb |L2.584| "
- "L2.585[221,232] 342ns 270mb |L2.585| "
- "L2.588[233,253] 342ns 286mb |L2.588| "
- "L2.589[254,270] 342ns 289mb |L2.589| "
- "L2.590[271,281] 342ns 225mb |L2.590| "
- "L2.591[282,296] 342ns 234mb |L2.591| "
- "L2.592[297,302] 342ns 232mb |L2.592| "
- "L2.593[303,308] 342ns 244mb |L2.593| "
- "L2.594[309,314] 342ns 282mb |L2.594|"
- "L2.595[315,317] 342ns 214mb |L2.595|"
- "L2.596[318,320] 342ns 222mb |L2.596|"
- "L2.597[321,323] 342ns 146mb |L2.597|"
- "L2.598[324,326] 342ns 254mb |L2.598|"
- "L2.599[327,329] 342ns 197mb |L2.599|"
- "L2.600[330,332] 342ns 228mb |L2.600|"
- "L2.601[333,335] 342ns 199mb |L2.601|"
- "L2.602[336,338] 342ns 280mb |L2.602|"
- "L2.850[1,26] 342ns 101mb |L2.850| "
- "L2.853[69,85] 342ns 104mb |L2.853| "
- "L2.854[86,98] 342ns 107mb |L2.854| "
- "L2.861[27,48] 342ns 103mb |L2.861| "
- "L2.862[49,68] 342ns 98mb |L2.862| "
- "L2.863[99,108] 342ns 102mb |L2.863| "
- "L2.864[109,117] 342ns 91mb |L2.864| "
- "L2.865[118,124] 342ns 91mb |L2.865| "
- "L2.866[125,130] 342ns 107mb |L2.866| "
- "L2.867[131,133] 342ns 64mb |L2.867| "
- "L2.868[339,339] 342ns 25mb |L2.868|"
- "WARNING: file L2.578[134,149] 342ns 202mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.579[150,165] 342ns 218mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.580[166,176] 342ns 186mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.581[177,182] 342ns 150mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.582[183,197] 342ns 267mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.583[198,207] 342ns 157mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.585[221,232] 342ns 270mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.588[233,253] 342ns 286mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.589[254,270] 342ns 289mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.590[271,281] 342ns 225mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.591[282,296] 342ns 234mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.592[297,302] 342ns 232mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.593[303,308] 342ns 244mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.594[309,314] 342ns 282mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.595[315,317] 342ns 214mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.596[318,320] 342ns 222mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.598[324,326] 342ns 254mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.599[327,329] 342ns 197mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.600[330,332] 342ns 228mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.601[333,335] 342ns 199mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.602[336,338] 342ns 280mb exceeds soft limit 100mb by more than 50%"
- "L2.594[150,165] 342ns 218mb |L2.594| "
- "L2.595[166,171] 342ns 118mb |L2.595| "
- "L2.598[183,197] 342ns 267mb |L2.598| "
- "L2.599[198,207] 342ns 157mb |L2.599| "
- "L2.600[208,220] 342ns 147mb |L2.600| "
- "L2.601[221,232] 342ns 270mb |L2.601| "
- "L2.602[233,244] 342ns 147mb |L2.602| "
- "L2.603[245,253] 342ns 139mb |L2.603| "
- "L2.604[271,276] 342ns 117mb |L2.604| "
- "L2.605[277,281] 342ns 109mb |L2.605| "
- "L2.612[254,261] 342ns 105mb |L2.612| "
- "L2.613[262,270] 342ns 184mb |L2.613| "
- "L2.616[309,311] 342ns 101mb |L2.616|"
- "L2.617[312,314] 342ns 181mb |L2.617|"
- "L2.618[315,317] 342ns 214mb |L2.618|"
- "L2.619[318,320] 342ns 222mb |L2.619|"
- "L2.620[321,323] 342ns 146mb |L2.620|"
- "L2.621[324,326] 342ns 254mb |L2.621|"
- "L2.622[327,329] 342ns 197mb |L2.622|"
- "L2.623[330,332] 342ns 228mb |L2.623|"
- "L2.624[333,335] 342ns 199mb |L2.624|"
- "L2.625[336,337] 342ns 156mb |L2.625|"
- "L2.626[338,338] 342ns 124mb |L2.626|"
- "L2.628[1,36] 342ns 103mb |L2.628-| "
- "L2.629[37,71] 342ns 103mb |L2.629-| "
- "L2.630[72,83] 342ns 103mb |L2.630| "
- "L2.638[172,177] 342ns 109mb |L2.638| "
- "L2.639[178,182] 342ns 109mb |L2.639| "
- "L2.640[282,288] 342ns 100mb |L2.640| "
- "L2.643[300,303] 342ns 110mb |L2.643| "
- "L2.646[84,94] 342ns 107mb |L2.646| "
- "L2.647[95,104] 342ns 97mb |L2.647| "
- "L2.648[105,111] 342ns 86mb |L2.648| "
- "L2.649[112,119] 342ns 114mb |L2.649| "
- "L2.650[120,126] 342ns 98mb |L2.650| "
- "L2.651[127,130] 342ns 82mb |L2.651| "
- "L2.652[131,138] 342ns 108mb |L2.652| "
- "L2.653[139,145] 342ns 93mb |L2.653| "
- "L2.654[146,149] 342ns 77mb |L2.654| "
- "L2.655[289,293] 342ns 110mb |L2.655| "
- "L2.656[294,297] 342ns 82mb |L2.656| "
- "L2.657[298,299] 342ns 82mb |L2.657| "
- "L2.658[304,306] 342ns 113mb |L2.658| "
- "L2.659[307,308] 342ns 113mb |L2.659| "
- "L2.660[339,339] 342ns 25mb |L2.660|"
- "WARNING: file L2.594[150,165] 342ns 218mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.598[183,197] 342ns 267mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.599[198,207] 342ns 157mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.601[221,232] 342ns 270mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.613[262,270] 342ns 184mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.617[312,314] 342ns 181mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.618[315,317] 342ns 214mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.619[318,320] 342ns 222mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.621[324,326] 342ns 254mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.622[327,329] 342ns 197mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.623[330,332] 342ns 228mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.624[333,335] 342ns 199mb exceeds soft limit 100mb by more than 50%"
- "WARNING: file L2.625[336,337] 342ns 156mb exceeds soft limit 100mb by more than 50%"
"###
);
}


@ -4670,17 +4670,17 @@ async fn l0s_almost_needing_vertical_split() {
- "L0.998[24,100] 1.02us |-----------------------------------------L0.998-----------------------------------------|"
- "L0.999[24,100] 1.02us |-----------------------------------------L0.999-----------------------------------------|"
- "L0.1000[24,100] 1.02us |----------------------------------------L0.1000-----------------------------------------|"
- "**** Final Output Files (6.5gb written)"
- "**** Final Output Files (5.23gb written)"
- "L2 "
- "L2.3141[24,37] 1.02us 108mb|---L2.3141---| "
- "L2.3150[38,49] 1.02us 102mb |--L2.3150--| "
- "L2.3151[50,60] 1.02us 93mb |-L2.3151-| "
- "L2.3152[61,63] 1.02us 37mb |L2.3152| "
- "L2.3153[64,73] 1.02us 101mb |L2.3153-| "
- "L2.3154[74,82] 1.02us 90mb |L2.3154| "
- "L2.3155[83,90] 1.02us 101mb |L2.3155| "
- "L2.3156[91,98] 1.02us 93mb |L2.3156| "
- "L2.3157[99,100] 1.02us 26mb |L2.3157|"
- "L2.3086[24,35] 1.02us 102mb|--L2.3086--| "
- "L2.3095[36,47] 1.02us 105mb |--L2.3095--| "
- "L2.3096[48,58] 1.02us 95mb |-L2.3096-| "
- "L2.3097[59,65] 1.02us 76mb |L2.3097| "
- "L2.3098[66,76] 1.02us 106mb |-L2.3098-| "
- "L2.3099[77,86] 1.02us 96mb |L2.3099-| "
- "L2.3100[87,90] 1.02us 53mb |L2.3100| "
- "L2.3101[91,98] 1.02us 90mb |L2.3101| "
- "L2.3102[99,100] 1.02us 26mb |L2.3102|"
"###
);
}

File diff suppressed because it is too large


@ -78,14 +78,12 @@ where
#[cfg(test)]
mod tests {
use std::sync::Arc;
use assert_matches::assert_matches;
use test_helpers::tracing::TracingCapture;
use super::*;
use crate::commit::mock::{CommitHistoryEntry, MockCommit};
use iox_tests::ParquetFileBuilder;
use assert_matches::assert_matches;
use iox_tests::{partition_identifier, ParquetFileBuilder};
use std::sync::Arc;
use test_helpers::tracing::TracingCapture;
#[test]
fn test_display() {
@ -111,14 +109,21 @@ mod tests {
.with_row_count(105)
.build();
let created_1 = ParquetFileBuilder::new(1000).with_partition(1).build();
let created_2 = ParquetFileBuilder::new(1001).with_partition(1).build();
let partition_id_1 = PartitionId::new(1);
let transition_partition_id_1 = partition_identifier(1);
let created_1 = ParquetFileBuilder::new(1000)
.with_partition(transition_partition_id_1.clone())
.build();
let created_2 = ParquetFileBuilder::new(1001)
.with_partition(transition_partition_id_1)
.build();
let capture = TracingCapture::new();
let ids = commit
.commit(
PartitionId::new(1),
partition_id_1,
&[existing_1.clone()],
&[],
&[created_1.clone().into(), created_2.clone().into()],
@ -130,9 +135,11 @@ mod tests {
Ok(res) if res == vec![ParquetFileId::new(1000), ParquetFileId::new(1001)]
);
let partition_id_2 = PartitionId::new(2);
let ids = commit
.commit(
PartitionId::new(2),
partition_id_2,
&[existing_2.clone(), existing_3.clone()],
&[existing_1.clone()],
&[],
@ -151,14 +158,14 @@ level = INFO; message = committed parquet file change; target_level = Final; par
inner.history(),
vec![
CommitHistoryEntry {
partition_id: PartitionId::new(1),
partition_id: partition_id_1,
delete: vec![existing_1.clone()],
upgrade: vec![],
created: vec![created_1, created_2],
target_level: CompactionLevel::Final,
},
CommitHistoryEntry {
partition_id: PartitionId::new(2),
partition_id: partition_id_2,
delete: vec![existing_2, existing_3],
upgrade: vec![existing_1],
created: vec![],


@ -303,15 +303,12 @@ where
#[cfg(test)]
mod tests {
use std::sync::Arc;
use assert_matches::assert_matches;
use metric::{assert_histogram, Attributes};
use crate::commit::mock::{CommitHistoryEntry, MockCommit};
use iox_tests::ParquetFileBuilder;
use super::*;
use crate::commit::mock::{CommitHistoryEntry, MockCommit};
use assert_matches::assert_matches;
use iox_tests::{partition_identifier, ParquetFileBuilder};
use metric::{assert_histogram, Attributes};
use std::sync::Arc;
#[test]
fn test_display() {
@ -326,6 +323,9 @@ mod tests {
let inner = Arc::new(MockCommit::new());
let commit = MetricsCommitWrapper::new(Arc::clone(&inner), &registry);
let partition_id_1 = PartitionId::new(1);
let transition_partition_id_1 = partition_identifier(1);
let existing_1 = ParquetFileBuilder::new(1)
.with_file_size_bytes(10_001)
.with_row_count(1_001)
@ -350,7 +350,7 @@ mod tests {
let created = ParquetFileBuilder::new(1000)
.with_file_size_bytes(10_016)
.with_row_count(1_016)
.with_partition(1)
.with_partition(transition_partition_id_1)
.with_compaction_level(CompactionLevel::Initial)
.build();
@ -392,7 +392,7 @@ mod tests {
let ids = commit
.commit(
PartitionId::new(1),
partition_id_1,
&[existing_1.clone()],
&[existing_2a.clone()],
&[created.clone().into()],
@ -401,9 +401,11 @@ mod tests {
.await;
assert_matches!(ids, Ok(res) if res == vec![ParquetFileId::new(1000)]);
let partition_id_2 = PartitionId::new(2);
let ids = commit
.commit(
PartitionId::new(2),
partition_id_2,
&[existing_2b.clone(), existing_3.clone()],
&[existing_4.clone()],
&[],
@ -449,14 +451,14 @@ mod tests {
inner.history(),
vec![
CommitHistoryEntry {
partition_id: PartitionId::new(1),
partition_id: partition_id_1,
delete: vec![existing_1],
upgrade: vec![existing_2a.clone()],
created: vec![created],
target_level: CompactionLevel::FileNonOverlapped,
},
CommitHistoryEntry {
partition_id: PartitionId::new(2),
partition_id: partition_id_2,
delete: vec![existing_2b, existing_3],
upgrade: vec![existing_4],
created: vec![],

View File

@ -78,10 +78,9 @@ impl Commit for MockCommit {
#[cfg(test)]
mod tests {
use assert_matches::assert_matches;
use iox_tests::ParquetFileBuilder;
use super::*;
use assert_matches::assert_matches;
use iox_tests::{partition_identifier, ParquetFileBuilder};
#[test]
fn test_display() {
@ -92,6 +91,11 @@ mod tests {
async fn test_commit() {
let commit = MockCommit::new();
let partition_id_1 = PartitionId::new(1);
let transition_partition_id_1 = partition_identifier(1);
let partition_id_2 = PartitionId::new(2);
let transition_partition_id_2 = partition_identifier(2);
let existing_1 = ParquetFileBuilder::new(1).build();
let existing_2 = ParquetFileBuilder::new(2).build();
let existing_3 = ParquetFileBuilder::new(3).build();
@ -101,14 +105,22 @@ mod tests {
let existing_7 = ParquetFileBuilder::new(7).build();
let existing_8 = ParquetFileBuilder::new(8).build();
let created_1_1 = ParquetFileBuilder::new(1000).with_partition(1).build();
let created_1_2 = ParquetFileBuilder::new(1001).with_partition(1).build();
let created_1_3 = ParquetFileBuilder::new(1003).with_partition(1).build();
let created_2_1 = ParquetFileBuilder::new(1002).with_partition(2).build();
let created_1_1 = ParquetFileBuilder::new(1000)
.with_partition(transition_partition_id_1.clone())
.build();
let created_1_2 = ParquetFileBuilder::new(1001)
.with_partition(transition_partition_id_1.clone())
.build();
let created_1_3 = ParquetFileBuilder::new(1003)
.with_partition(transition_partition_id_1)
.build();
let created_2_1 = ParquetFileBuilder::new(1002)
.with_partition(transition_partition_id_2)
.build();
let ids = commit
.commit(
PartitionId::new(1),
partition_id_1,
&[existing_1.clone(), existing_2.clone()],
&[existing_3.clone(), existing_4.clone()],
&[created_1_1.clone().into(), created_1_2.clone().into()],
@ -122,7 +134,7 @@ mod tests {
let ids = commit
.commit(
PartitionId::new(2),
partition_id_2,
&[existing_3.clone()],
&[],
&[created_2_1.clone().into()],
@ -136,7 +148,7 @@ mod tests {
let ids = commit
.commit(
PartitionId::new(1),
partition_id_1,
&[existing_5.clone(), existing_6.clone(), existing_7.clone()],
&[],
&[created_1_3.clone().into()],
@ -151,7 +163,7 @@ mod tests {
// simulate full implosion of the file (this may happen w/ delete predicates)
let ids = commit
.commit(
PartitionId::new(1),
partition_id_1,
&[existing_8.clone()],
&[],
&[],
@ -167,28 +179,28 @@ mod tests {
commit.history(),
vec![
CommitHistoryEntry {
partition_id: PartitionId::new(1),
partition_id: partition_id_1,
delete: vec![existing_1, existing_2],
upgrade: vec![existing_3.clone(), existing_4.clone()],
created: vec![created_1_1, created_1_2],
target_level: CompactionLevel::FileNonOverlapped,
},
CommitHistoryEntry {
partition_id: PartitionId::new(2),
partition_id: partition_id_2,
delete: vec![existing_3],
upgrade: vec![],
created: vec![created_2_1],
target_level: CompactionLevel::Final,
},
CommitHistoryEntry {
partition_id: PartitionId::new(1),
partition_id: partition_id_1,
delete: vec![existing_5, existing_6, existing_7,],
upgrade: vec![],
created: vec![created_1_3],
target_level: CompactionLevel::FileNonOverlapped,
},
CommitHistoryEntry {
partition_id: PartitionId::new(1),
partition_id: partition_id_1,
delete: vec![existing_8],
upgrade: vec![],
created: vec![],

View File

@ -4,7 +4,7 @@ use assert_matches::assert_matches;
use compactor_scheduler::{
create_scheduler, CompactionJob, LocalSchedulerConfig, Scheduler, SchedulerConfig,
};
use data_types::{ColumnType, ParquetFile, ParquetFileParams, PartitionId};
use data_types::{ColumnType, ParquetFile, ParquetFileParams, PartitionId, TransitionPartitionId};
use iox_tests::{ParquetFileBuilder, TestCatalog, TestParquetFileBuilder, TestPartition};
mod end_job;
@ -65,7 +65,7 @@ impl TestLocalScheduler {
pub async fn create_params_for_new_parquet_file(&self) -> ParquetFileParams {
ParquetFileBuilder::new(42)
.with_partition(self.get_partition_id().get())
.with_partition(self.get_transition_partition_id())
.build()
.into()
}
@ -81,4 +81,8 @@ impl TestLocalScheduler {
pub fn get_partition_id(&self) -> PartitionId {
self.test_partition.partition.id
}
pub fn get_transition_partition_id(&self) -> TransitionPartitionId {
self.test_partition.partition.transition_partition_id()
}
}

View File

@ -202,8 +202,7 @@ impl SimulatedFile {
ParquetFileParams {
namespace_id: partition_info.namespace_id,
table_id: partition_info.table.id,
partition_id: partition_info.partition_id,
partition_hash_id: partition_info.partition_hash_id.clone(),
partition_id: partition_info.transition_partition_id(),
object_store_id: Uuid::new_v4(),
min_time,
max_time,

View File

@ -527,10 +527,9 @@ pub struct ParquetFile {
pub namespace_id: NamespaceId,
/// the table
pub table_id: TableId,
/// the partition
pub partition_id: PartitionId,
/// the partition hash ID, if generated
pub partition_hash_id: Option<PartitionHashId>,
/// the partition identifier
#[sqlx(flatten)]
pub partition_id: TransitionPartitionId,
/// the uuid used in the object store path for this file
pub object_store_id: Uuid,
/// the min timestamp of data in this file
@ -588,7 +587,6 @@ impl ParquetFile {
namespace_id: params.namespace_id,
table_id: params.table_id,
partition_id: params.partition_id,
partition_hash_id: params.partition_hash_id,
object_store_id: params.object_store_id,
min_time: params.min_time,
max_time: params.max_time,
@ -602,21 +600,9 @@ impl ParquetFile {
}
}
/// If this parquet file params will be storing a `PartitionHashId` in the catalog, use that.
/// Otherwise, use the database-assigned `PartitionId`.
pub fn transition_partition_id(&self) -> TransitionPartitionId {
TransitionPartitionId::from((self.partition_id, self.partition_hash_id.as_ref()))
}
/// Estimate the memory consumption of this object and its contents
pub fn size(&self) -> usize {
std::mem::size_of_val(self)
+ self
.partition_hash_id
.as_ref()
.map(|id| id.size() - std::mem::size_of_val(id))
.unwrap_or_default()
+ self.column_set.size()
std::mem::size_of_val(self) + self.partition_id.size() + self.column_set.size()
- std::mem::size_of_val(&self.column_set)
}
@ -638,10 +624,8 @@ pub struct ParquetFileParams {
pub namespace_id: NamespaceId,
/// the table
pub table_id: TableId,
/// the partition
pub partition_id: PartitionId,
/// the partition hash ID, if generated
pub partition_hash_id: Option<PartitionHashId>,
/// the partition identifier
pub partition_id: TransitionPartitionId,
/// the uuid used in the object store path for this file
pub object_store_id: Uuid,
/// the min timestamp of data in this file
@ -662,21 +646,12 @@ pub struct ParquetFileParams {
pub max_l0_created_at: Timestamp,
}
impl ParquetFileParams {
/// If this parquet file params will be storing a `PartitionHashId` in the catalog, use that.
/// Otherwise, use the database-assigned `PartitionId`.
pub fn transition_partition_id(&self) -> TransitionPartitionId {
TransitionPartitionId::from((self.partition_id, self.partition_hash_id.as_ref()))
}
}
impl From<ParquetFile> for ParquetFileParams {
fn from(value: ParquetFile) -> Self {
Self {
namespace_id: value.namespace_id,
table_id: value.table_id,
partition_id: value.partition_id,
partition_hash_id: value.partition_hash_id,
object_store_id: value.object_store_id,
min_time: value.min_time,
max_time: value.max_time,

View File

@ -31,6 +31,34 @@ impl TransitionPartitionId {
}
}
impl<'a, R> sqlx::FromRow<'a, R> for TransitionPartitionId
where
R: sqlx::Row,
&'static str: sqlx::ColumnIndex<R>,
PartitionId: sqlx::decode::Decode<'a, R::Database>,
PartitionId: sqlx::types::Type<R::Database>,
Option<PartitionHashId>: sqlx::decode::Decode<'a, R::Database>,
Option<PartitionHashId>: sqlx::types::Type<R::Database>,
{
fn from_row(row: &'a R) -> sqlx::Result<Self> {
let partition_id: Option<PartitionId> = row.try_get("partition_id")?;
let partition_hash_id: Option<PartitionHashId> = row.try_get("partition_hash_id")?;
let transition_partition_id = match (partition_id, partition_hash_id) {
(_, Some(hash_id)) => TransitionPartitionId::Deterministic(hash_id),
(Some(id), _) => TransitionPartitionId::Deprecated(id),
(None, None) => {
return Err(sqlx::Error::ColumnDecode {
index: "partition_id".into(),
source: "Both partition_id and partition_hash_id were NULL".into(),
})
}
};
Ok(transition_partition_id)
}
}
impl From<(PartitionId, Option<&PartitionHashId>)> for TransitionPartitionId {
fn from((partition_id, partition_hash_id): (PartitionId, Option<&PartitionHashId>)) -> Self {
partition_hash_id

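The sketch below (not part of the diff) illustrates how the `From<(PartitionId, Option<&PartitionHashId>)>` conversion above resolves the two identifier forms, which is also what the new `#[sqlx(flatten)]` decoding of `ParquetFile::partition_id` yields; the `PartitionHashId::new(table_id, &partition_key)` constructor is assumed from its use elsewhere in this change.

use data_types::{PartitionHashId, PartitionId, PartitionKey, TableId, TransitionPartitionId};

fn transition_id_examples() {
    let hash_id = PartitionHashId::new(TableId::new(2), &PartitionKey::from("arbitrary"));

    // With a hash ID present, the deterministic identifier is chosen.
    let id = TransitionPartitionId::from((PartitionId::new(1), Some(&hash_id)));
    assert!(matches!(id, TransitionPartitionId::Deterministic(_)));

    // Without one, the deprecated catalog-assigned ID is used.
    let id = TransitionPartitionId::from((PartitionId::new(1), None::<&PartitionHashId>));
    assert!(matches!(id, TransitionPartitionId::Deprecated(_)));
}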
View File

@ -267,8 +267,7 @@ mod tests {
let parquet_file_params = ParquetFileParams {
namespace_id: namespace.id,
table_id: partition.table_id,
partition_id: partition.id,
partition_hash_id: partition.hash_id().cloned(),
partition_id: partition.transition_partition_id(),
object_store_id: Uuid::new_v4(),
min_time: Timestamp::new(1),
max_time: Timestamp::new(10),
@ -298,7 +297,7 @@ mod tests {
let location = ParquetFilePath::new(
file_in_catalog.namespace_id,
file_in_catalog.table_id,
&file_in_catalog.transition_partition_id(),
&file_in_catalog.partition_id.clone(),
file_in_catalog.object_store_id,
)
.object_store_path();
@ -376,7 +375,7 @@ mod tests {
let location = ParquetFilePath::new(
file_in_catalog.namespace_id,
file_in_catalog.table_id,
&file_in_catalog.transition_partition_id(),
&file_in_catalog.partition_id.clone(),
file_in_catalog.object_store_id,
)
.object_store_path();
@ -469,7 +468,7 @@ mod tests {
let loc = ParquetFilePath::new(
file_in_catalog.namespace_id,
file_in_catalog.table_id,
&file_in_catalog.transition_partition_id(),
&file_in_catalog.partition_id.clone(),
file_in_catalog.object_store_id,
)
.object_store_path();

View File

@ -52,6 +52,7 @@ fn generate_grpc_types(root: &Path) -> Result<()> {
let proto_files = vec![
authz_path.join("authz.proto"),
catalog_path.join("parquet_file.proto"),
catalog_path.join("partition_identifier.proto"),
catalog_path.join("service.proto"),
compactor_path.join("service.proto"),
delete_path.join("service.proto"),

View File

@ -2,6 +2,8 @@ syntax = "proto3";
package influxdata.iox.catalog.v1;
option go_package = "github.com/influxdata/iox/catalog/v1";
import "influxdata/iox/catalog/v1/partition_identifier.proto";
message ParquetFile {
reserved 7;
reserved "min_sequence_number";
@ -11,6 +13,8 @@ message ParquetFile {
reserved "shard_id";
reserved 8;
reserved "max_sequence_number";
reserved 5;
reserved "partition_id";
// the id of the file in the catalog
int64 id = 1;
@ -18,8 +22,9 @@ message ParquetFile {
int64 namespace_id = 3;
// the table id
int64 table_id = 4;
// the partition id
int64 partition_id = 5;
PartitionIdentifier partition_identifier = 19;
// the object store uuid
string object_store_id = 6;
// the min timestamp of data in this file

View File

@ -0,0 +1,12 @@
syntax = "proto3";
package influxdata.iox.catalog.v1;
option go_package = "github.com/influxdata/iox/catalog/v1";
message PartitionIdentifier {
// Either the catalog-assigned partition ID or the deterministic identifier created from the
// table ID and partition key.
oneof id {
int64 catalog_id = 1;
bytes hash_id = 2;
}
}
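As a hedged illustration, the prost-generated Rust type for this message carries one of the two oneof variants; the module path and the `Vec<u8>` mapping for `bytes` are assumptions based on how the generated types are used elsewhere in this change, and the hash bytes are placeholders (a real hash ID is a 32-byte value, such as the base64 string asserted in the end-to-end tests below).

use generated_types::influxdata::iox::catalog::v1::{partition_identifier, PartitionIdentifier};

fn partition_identifier_examples() -> (PartitionIdentifier, PartitionIdentifier) {
    // Deprecated catalog-assigned identifier.
    let by_catalog_id = PartitionIdentifier {
        id: Some(partition_identifier::Id::CatalogId(42)),
    };
    // Deterministic identifier derived from the table ID and partition key
    // (placeholder bytes for illustration only).
    let by_hash_id = PartitionIdentifier {
        id: Some(partition_identifier::Id::HashId(vec![0xb8, 0x62, 0xa7])),
    };
    (by_catalog_id, by_hash_id)
}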

View File

@ -3,6 +3,7 @@ package influxdata.iox.catalog.v1;
option go_package = "github.com/influxdata/iox/catalog/v1";
import "influxdata/iox/catalog/v1/parquet_file.proto";
import "influxdata/iox/catalog/v1/partition_identifier.proto";
service CatalogService {
// Get the parquet_file catalog records in the given partition
@ -19,8 +20,11 @@ service CatalogService {
}
message GetParquetFilesByPartitionIdRequest {
// the partition id
int64 partition_id = 1;
// Was the catalog-assigned partition ID.
reserved 1;
reserved "partition_id";
PartitionIdentifier partition_identifier = 2;
}
message GetParquetFilesByPartitionIdResponse {
@ -35,15 +39,17 @@ message Partition {
reserved "sequencer_id";
reserved 7;
reserved "shard_id";
reserved 1;
reserved "id";
// the partition id
int64 id = 1;
// the table id the partition is in
int64 table_id = 3;
// the partition key
string key = 4;
// the sort key for data in parquet files in the partition
repeated string array_sort_key = 6;
PartitionIdentifier identifier = 8;
}
message GetPartitionsByTableIdRequest {

View File

@ -1,5 +1,5 @@
use async_trait::async_trait;
use tracing::warn;
use tracing::{debug, warn};
// Re-export the bytes type to ensure upstream users of this crate are
// interacting with the same type.
@ -32,5 +32,7 @@ pub struct NopDispatcher;
#[async_trait::async_trait]
impl Dispatcher for NopDispatcher {
async fn dispatch(&self, _payload: crate::Bytes) {}
async fn dispatch(&self, _payload: crate::Bytes) {
debug!("received no-op message payload");
}
}

View File

@ -1,10 +1,13 @@
use data_types::{PartitionHashId, PartitionId, TransitionPartitionId};
use futures_util::TryStreamExt;
use influxdb_iox_client::{
catalog::{self, generated_types::ParquetFile},
catalog::{
self,
generated_types::{partition_identifier, ParquetFile, PartitionIdentifier},
},
connection::Connection,
store,
};
use observability_deps::tracing::{debug, info};
use std::path::{Path, PathBuf};
use thiserror::Error;
use tokio::{
@ -35,10 +38,6 @@ type Result<T, E = ExportError> = std::result::Result<T, E>;
pub struct RemoteExporter {
catalog_client: catalog::Client,
store_client: store::Client,
/// Optional partition filter. If `Some(partition_id)`, only these
/// files with that `partition_id` are downloaded.
partition_filter: Option<i64>,
}
impl RemoteExporter {
@ -46,19 +45,9 @@ impl RemoteExporter {
Self {
catalog_client: catalog::Client::new(connection.clone()),
store_client: store::Client::new(connection),
partition_filter: None,
}
}
/// Specify that only files and metadata for the specific
/// partition id should be exported.
pub fn with_partition_filter(mut self, partition_id: i64) -> Self {
info!(partition_id, "Filtering by partition");
self.partition_filter = Some(partition_id);
self
}
/// Exports all data and metadata for `table_name` in
/// `namespace` to local files.
///
@ -95,39 +84,14 @@ impl RemoteExporter {
let indexed_parquet_file_metadata = parquet_files.into_iter().enumerate();
for (index, parquet_file) in indexed_parquet_file_metadata {
if self.should_export(parquet_file.partition_id) {
self.export_parquet_file(
&output_directory,
index,
num_parquet_files,
&parquet_file,
)
self.export_parquet_file(&output_directory, index, num_parquet_files, &parquet_file)
.await?;
} else {
debug!(
"skipping file {} of {num_parquet_files} ({} does not match request)",
index + 1,
parquet_file.partition_id
);
}
}
println!("Done.");
Ok(())
}
/// Return true if this partition should be exported
fn should_export(&self, partition_id: i64) -> bool {
self.partition_filter
.map(|partition_filter| {
// if a partition filter was specified, only export
// the file if the partition matches
partition_filter == partition_id
})
// export files if there is no partition
.unwrap_or(true)
}
/// Exports table and partition information for the specified
/// table. Overwrites existing files, if any, to ensure it has the
/// latest catalog information.
@ -158,13 +122,11 @@ impl RemoteExporter {
.await?;
for partition in partitions {
let partition_id = partition.id;
if self.should_export(partition_id) {
let partition_json = serde_json::to_string_pretty(&partition)?;
let filename = format!("partition.{partition_id}.json");
let file_path = output_directory.join(&filename);
write_string_to_file(&partition_json, &file_path).await?;
}
let partition_id = to_partition_id(partition.identifier.as_ref());
let partition_json = serde_json::to_string_pretty(&partition)?;
let filename = format!("partition.{partition_id}.json");
let file_path = output_directory.join(&filename);
write_string_to_file(&partition_json, &file_path).await?;
}
Ok(())
@ -183,9 +145,10 @@ impl RemoteExporter {
parquet_file: &ParquetFile,
) -> Result<()> {
let uuid = &parquet_file.object_store_id;
let partition_id = parquet_file.partition_id;
let file_size_bytes = parquet_file.file_size_bytes as u64;
let partition_id = to_partition_id(parquet_file.partition_identifier.as_ref());
// copy out the metadata as pbjson encoded data always (to
// ensure we have the most up to date version)
{
@ -230,6 +193,21 @@ impl RemoteExporter {
}
}
fn to_partition_id(partition_identifier: Option<&PartitionIdentifier>) -> TransitionPartitionId {
match partition_identifier
.and_then(|pi| pi.id.as_ref())
.expect("Catalog service should send the partition identifier")
{
partition_identifier::Id::HashId(bytes) => TransitionPartitionId::Deterministic(
PartitionHashId::try_from(&bytes[..])
.expect("Catalog service should send valid hash_id bytes"),
),
partition_identifier::Id::CatalogId(id) => {
TransitionPartitionId::Deprecated(PartitionId::new(*id))
}
}
}
/// writes the contents of a string to a file, overwriting the previous contents, if any
async fn write_string_to_file(contents: &str, path: &Path) -> Result<()> {
let mut file = OpenOptions::new()

View File

@ -7,7 +7,7 @@ use data_types::{
NamespacePartitionTemplateOverride, TablePartitionTemplateOverride, PARTITION_BY_DAY_PROTO,
},
ColumnSet, ColumnType, CompactionLevel, Namespace, NamespaceName, NamespaceNameError,
ParquetFileParams, Partition, PartitionHashId, Statistics, Table, TableId, Timestamp,
ParquetFileParams, Partition, Statistics, Table, TableId, Timestamp,
};
use generated_types::influxdata::iox::catalog::v1 as proto;
// ParquetFile as ProtoParquetFile, Partition as ProtoPartition,
@ -567,9 +567,6 @@ impl RemoteImporter {
// need to make columns in the target catalog
let column_set = insert_columns(table.id, decoded_iox_parquet_metadata, repos).await?;
// Create the the partition_hash_id
let partition_hash_id = Some(PartitionHashId::new(table.id, &partition.partition_key));
let params = if let Some(proto_parquet_file) = &parquet_metadata {
let compaction_level = proto_parquet_file
.compaction_level
@ -579,8 +576,7 @@ impl RemoteImporter {
ParquetFileParams {
namespace_id: namespace.id,
table_id: table.id,
partition_hash_id,
partition_id: partition.id,
partition_id: partition.transition_partition_id(),
object_store_id,
min_time: Timestamp::new(proto_parquet_file.min_time),
max_time: Timestamp::new(proto_parquet_file.max_time),
@ -599,8 +595,7 @@ impl RemoteImporter {
ParquetFileParams {
namespace_id: namespace.id,
table_id: table.id,
partition_hash_id,
partition_id: partition.id,
partition_id: partition.transition_partition_id(),
object_store_id,
min_time,
max_time,

View File

@ -67,7 +67,7 @@ libc = { version = "0.2" }
num_cpus = "1.16.0"
once_cell = { version = "1.18", features = ["parking_lot"] }
rustyline = { version = "12.0", default-features = false, features = ["with-file-history"]}
serde = "1.0.177"
serde = "1.0.179"
serde_json = "1.0.104"
snafu = "0.7"
tempfile = "3.7.0"

View File

@ -55,10 +55,6 @@ struct GetTable {
#[clap(action)]
table: String,
/// If specified, only files from the specified partitions are downloaded
#[clap(action, short, long)]
partition_id: Option<i64>,
/// The output directory to use. If not specified, files will be placed in a directory named
/// after the table in the current working directory.
#[clap(action, short)]
@ -91,13 +87,9 @@ pub async fn command(connection: Connection, config: Config) -> Result<()> {
Command::GetTable(GetTable {
namespace,
table,
partition_id,
output_directory,
}) => {
let mut exporter = RemoteExporter::new(connection);
if let Some(partition_id) = partition_id {
exporter = exporter.with_partition_filter(partition_id);
}
Ok(exporter
.export_table(output_directory, namespace, table)
.await?)

View File

@ -7,6 +7,7 @@ use clap_blocks::{
catalog_dsn::CatalogDsnConfig,
compactor::CompactorConfig,
compactor_scheduler::CompactorSchedulerConfig,
gossip::GossipConfig,
ingester::IngesterConfig,
ingester_address::IngesterAddress,
object_store::{make_object_store, ObjectStoreConfig},
@ -476,6 +477,7 @@ impl Config {
persist_queue_depth,
persist_hot_partition_cost,
rpc_write_max_incoming_bytes: 1024 * 1024 * 1024, // 1GiB
gossip_config: GossipConfig::disabled(),
};
let router_config = RouterConfig {
@ -489,6 +491,7 @@ impl Config {
rpc_write_replicas: 1.try_into().unwrap(),
rpc_write_max_outgoing_bytes: ingester_config.rpc_write_max_incoming_bytes,
rpc_write_health_error_window_seconds: Duration::from_secs(5),
gossip_config: GossipConfig::disabled(),
};
// create a CompactorConfig for the all in one server based on
@ -637,6 +640,7 @@ pub async fn command(config: Config) -> Result<()> {
Arc::clone(&catalog),
Arc::clone(&object_store),
&router_config,
&GossipConfig::disabled(),
router_run_config
.tracing_config()
.traces_jaeger_trace_context_header_name

View File

@ -98,6 +98,7 @@ pub async fn command(config: Config) -> Result<()> {
catalog,
object_store,
&config.router_config,
&config.router_config.gossip_config,
config
.run_config
.tracing_config()

View File

@ -157,10 +157,12 @@ async fn sharded_compactor_0_always_compacts_partition_1() {
.assert()
.success()
.stdout(
// Important parts are the expected partition ID
predicate::str::contains(r#""partitionId": "1","#)
// and compaction level
.and(predicate::str::contains(r#""compactionLevel": 1"#)),
// Important parts are the expected partition identifier
predicate::str::contains(
r#""hashId": "uGKn6bMp7mpBjN4ZEZjq6xUSdT8ZuHqB3vKubD0O0jc=""#,
)
// and compaction level
.and(predicate::str::contains(r#""compactionLevel": 1"#)),
);
}
.boxed()
@ -240,10 +242,12 @@ async fn sharded_compactor_1_never_compacts_partition_1() {
.assert()
.success()
.stdout(
// Important parts are the expected partition ID
predicate::str::contains(r#""partitionId": "1","#)
// and compaction level is 0 so it's not returned
.and(predicate::str::contains("compactionLevel").not()),
// Important parts are the expected partition identifier
predicate::str::contains(
r#""hashId": "uGKn6bMp7mpBjN4ZEZjq6xUSdT8ZuHqB3vKubD0O0jc=""#,
)
// and compaction level is 0 so it's not returned
.and(predicate::str::contains("compactionLevel").not()),
);
}
.boxed()

View File

@ -280,10 +280,9 @@ async fn remote_partition_and_get_from_store_and_pull() {
.arg("1")
.assert()
.success()
.stdout(
predicate::str::contains(r#""id": "1""#)
.and(predicate::str::contains(r#""partitionId": "1","#)),
)
.stdout(predicate::str::contains(
r#""hashId": "uGKn6bMp7mpBjN4ZEZjq6xUSdT8ZuHqB3vKubD0O0jc=""#,
))
.get_output()
.stdout
.clone();

View File

@ -29,9 +29,15 @@ impl Client {
&mut self,
partition_id: i64,
) -> Result<Vec<ParquetFile>, Error> {
let partition_identifier = PartitionIdentifier {
id: Some(partition_identifier::Id::CatalogId(partition_id)),
};
let response = self
.inner
.get_parquet_files_by_partition_id(GetParquetFilesByPartitionIdRequest { partition_id })
.get_parquet_files_by_partition_id(GetParquetFilesByPartitionIdRequest {
partition_identifier: Some(partition_identifier),
})
.await?;
Ok(response.into_inner().parquet_files)
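A brief usage sketch for the updated client wrapper follows; the outer method name is assumed to match the inner gRPC call it delegates to, and the connection handle is taken as given (established elsewhere, as in the remote exporter above).

use influxdb_iox_client::{catalog, connection::Connection};

// Assumption: `connection` is an already-established IOx gRPC connection.
async fn list_partition_files(connection: Connection) -> Result<(), Box<dyn std::error::Error>> {
    let mut client = catalog::Client::new(connection);
    // Callers still pass a plain catalog partition ID; the client wraps it in a
    // `PartitionIdentifier` carrying the `CatalogId` variant, as shown above.
    let files = client.get_parquet_files_by_partition_id(1).await?;
    println!("fetched {} parquet file records", files.len());
    Ok(())
}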

View File

@ -48,6 +48,7 @@ trace = { version = "0.1.0", path = "../trace" }
uuid = "1.4.1"
wal = { version = "0.1.0", path = "../wal" }
workspace-hack = { version = "0.1", path = "../workspace-hack" }
gossip = { version = "0.1.0", path = "../gossip" }
[dev-dependencies]
assert_matches = "1.5.0"

View File

@ -1,6 +1,6 @@
//! Partition level data buffer structures.
use std::{collections::VecDeque, sync::Arc};
use std::sync::Arc;
use data_types::{
sequence_number_set::SequenceNumberSet, NamespaceId, PartitionHashId, PartitionId,
@ -8,11 +8,12 @@ use data_types::{
};
use mutable_batch::MutableBatch;
use observability_deps::tracing::*;
use schema::sort::SortKey;
use schema::{merge::SchemaMerger, sort::SortKey, Schema};
use self::{
buffer::{traits::Queryable, BufferState, DataBuffer, Persisting},
buffer::{traits::Queryable, DataBuffer},
persisting::{BatchIdent, PersistingData},
persisting_list::PersistingList,
};
use super::{namespace::NamespaceName, table::TableMetadata};
use crate::{
@ -21,6 +22,7 @@ use crate::{
mod buffer;
pub(crate) mod persisting;
mod persisting_list;
pub(crate) mod resolver;
/// The load state of the [`SortKey`] for a given partition.
@ -89,7 +91,7 @@ pub struct PartitionData {
///
/// The [`BatchIdent`] is a generational counter that is used to tag each
/// persisting batch with a unique, opaque identifier.
persisting: VecDeque<(BatchIdent, BufferState<Persisting>)>,
persisting: PersistingList,
/// The number of persist operations started over the lifetime of this
/// [`PartitionData`].
@ -123,7 +125,7 @@ impl PartitionData {
table_id,
table,
buffer: DataBuffer::default(),
persisting: VecDeque::with_capacity(1),
persisting: PersistingList::default(),
started_persistence_count: BatchIdent::default(),
completed_persistence_count: 0,
}
@ -169,7 +171,7 @@ impl PartitionData {
/// persisting batches, plus 1 for the "hot" buffer. Reading the row count
/// of each batch is `O(1)`. This method is expected to be fast.
pub(crate) fn rows(&self) -> usize {
self.persisting.iter().map(|(_, v)| v.rows()).sum::<usize>() + self.buffer.rows()
self.persisting.rows() + self.buffer.rows()
}
/// Return the timestamp min/max values for the data contained within this
@ -188,11 +190,8 @@ impl PartitionData {
/// statistics for each batch is `O(1)`. This method is expected to be fast.
pub(crate) fn timestamp_stats(&self) -> Option<TimestampMinMax> {
self.persisting
.iter()
.map(|(_, v)| {
v.timestamp_stats()
.expect("persisting batches must be non-empty")
})
.timestamp_stats()
.into_iter()
.chain(self.buffer.timestamp_stats())
.reduce(|acc, v| TimestampMinMax {
min: acc.min.min(v.min),
@ -200,6 +199,30 @@ impl PartitionData {
})
}
/// Return the schema of the data currently buffered within this
/// [`PartitionData`].
///
/// This schema is not additive - it is the union of the schemas of the
/// individual batches currently buffered, and as such columns are removed as
/// the batches containing those columns are persisted and dropped.
pub(crate) fn schema(&self) -> Option<Schema> {
if self.persisting.is_empty() && self.buffer.rows() == 0 {
return None;
}
Some(
self.persisting
.schema()
.into_iter()
.cloned()
.chain(self.buffer.schema())
.fold(SchemaMerger::new(), |acc, v| {
acc.merge(&v).expect("schemas are incompatible")
})
.build(),
)
}
/// Return all data for this partition, ordered by the calls to
/// [`PartitionData::buffer_write()`].
pub(crate) fn get_query_data(&mut self, projection: &OwnedProjection) -> Option<QueryAdaptor> {
@ -213,8 +236,7 @@ impl PartitionData {
// existing rows materialise to the correct output.
let data = self
.persisting
.iter()
.flat_map(|(_, b)| b.get_query_data(projection))
.get_query_data(projection)
.chain(buffered_data)
.collect::<Vec<_>>();
@ -287,7 +309,7 @@ impl PartitionData {
// Increment the "started persist" counter.
//
// This is used to cheaply identify batches given to the
// mark_persisted() call.
// mark_persisted() call and ensure monotonicity.
let batch_ident = self.started_persistence_count.next();
debug!(
@ -310,10 +332,9 @@ impl PartitionData {
batch_ident,
);
// Push the new buffer to the back of the persisting queue, so that
// iterating from back to front during queries iterates over writes from
// oldest to newest.
self.persisting.push_back((batch_ident, fsm));
// Push the buffer into the persisting list (which maintains batch
// order).
self.persisting.push(batch_ident, fsm);
Some(data)
}
@ -328,22 +349,11 @@ impl PartitionData {
/// This method panics if [`Self`] is not marked as undergoing a persist
/// operation, or `batch` is not currently being persisted.
pub(crate) fn mark_persisted(&mut self, batch: PersistingData) -> SequenceNumberSet {
// Find the batch in the persisting queue.
let idx = self
.persisting
.iter()
.position(|(old, _)| *old == batch.batch_ident())
.expect("no currently persisting batch");
// Remove the batch from the queue, preserving the order of the queue
// for batch iteration during queries.
let (old_ident, fsm) = self.persisting.remove(idx).unwrap();
assert_eq!(old_ident, batch.batch_ident());
let fsm = self.persisting.remove(batch.batch_ident());
self.completed_persistence_count += 1;
debug!(
batch_ident = %old_ident,
persistence_count = %self.completed_persistence_count,
namespace_id = %self.namespace_id,
table_id = %self.table_id,

View File

@ -7,7 +7,7 @@ use schema::Projection;
///
/// A [`Buffer`] can contain no writes.
///
/// [`BufferState`]: super::super::BufferState
/// [`BufferState`]: super::BufferState
#[derive(Debug, Default)]
pub(super) struct Buffer {
buffer: Option<MutableBatch>,

View File

@ -77,7 +77,7 @@ pub(crate) struct BufferState<T> {
impl BufferState<Buffering> {
/// Initialise a new buffer state machine.
pub(super) fn new() -> Self {
pub(crate) fn new() -> Self {
Self {
state: Buffering::default(),
sequence_numbers: SequenceNumberSet::default(),

View File

@ -2,14 +2,18 @@ use std::fmt::Display;
use crate::query_adaptor::QueryAdaptor;
/// An opaque generational identifier of a buffer in a [`PartitionData`].
/// An opaque, monotonic generational identifier of a buffer in a
/// [`PartitionData`].
///
/// A [`BatchIdent`] is strictly greater than all those that were obtained
/// before it.
///
/// [`PartitionData`]: super::PartitionData
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub(super) struct BatchIdent(u64);
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd)]
pub(crate) struct BatchIdent(u64);
impl BatchIdent {
/// Return the next unique value.
/// Return the next unique monotonic value.
pub(super) fn next(&mut self) -> Self {
self.0 += 1;
Self(self.0)

View File

@ -0,0 +1,467 @@
use std::collections::VecDeque;
use arrow::record_batch::RecordBatch;
use data_types::TimestampMinMax;
use schema::{merge::SchemaMerger, Schema};
use crate::query::projection::OwnedProjection;
use super::{
buffer::{traits::Queryable, BufferState, Persisting},
persisting::BatchIdent,
};
/// An ordered list of buffered, persisting data as [`BufferState<Persisting>`]
/// FSM instances.
///
/// This type maintains a cache of row count & timestamp min/max statistics
/// across all persisting batches, and performs incremental computation at
/// persist time, moving it out of the query execution path.
#[derive(Debug)]
pub(crate) struct PersistingList {
/// The currently persisting [`DataBuffer`] instances, if any.
///
/// This queue is ordered from oldest at the head, to newest at the tail -
/// forward iteration order matches write order.
///
/// The [`BatchIdent`] is a generational counter that is used to tag each
/// persisting batch with a unique, opaque, monotonic identifier.
///
/// [`DataBuffer`]: super::buffer::DataBuffer
persisting: VecDeque<(BatchIdent, BufferState<Persisting>)>,
cached: Option<CachedStats>,
}
impl Default for PersistingList {
fn default() -> Self {
Self {
persisting: VecDeque::with_capacity(1),
cached: None,
}
}
}
impl PersistingList {
/// Add this `buffer` which was assigned `ident` when marked as persisting
/// to the list.
///
/// This call incrementally recomputes the cached data statistics.
///
/// # Panics
///
/// Panics if a batch with a later `ident` has already been added to this
/// list - calls MUST push ordered buffers/idents to maintain correct
/// ordering of row updates across batches.
///
/// The provided buffer MUST be non-empty (containing a timestamp column,
/// and a schema)
pub(crate) fn push(&mut self, ident: BatchIdent, buffer: BufferState<Persisting>) {
// Recompute the statistics.
match &mut self.cached {
Some(v) => v.push(&buffer),
None => {
// Set the cached stats, as there's no other stats to merge
// with, so skip merging schemas.
self.cached = Some(CachedStats {
rows: buffer.rows(),
timestamps: buffer
.timestamp_stats()
.expect("persisting batch must contain timestamps"),
schema: buffer.schema().expect("persisting batch must have schema"),
});
}
}
// Invariant: the batch being added MUST be ordered strictly after
// existing batches.
//
// The BatchIdent provides this ordering assurance, as it is a monotonic
// (opaque) identifier.
assert!(self
.persisting
.back()
.map(|(last, _)| ident > *last)
.unwrap_or(true));
self.persisting.push_back((ident, buffer));
}
/// Remove the buffer identified by `ident` from the list.
///
/// There is no ordering requirement for this call, but it is more efficient
/// when removals match the order of calls to [`PersistingList::push()`].
///
/// # Panics
///
/// This method panics if there is currently no batch identified by `ident`
/// in the list.
pub(crate) fn remove(&mut self, ident: BatchIdent) -> BufferState<Persisting> {
let idx = self
.persisting
.iter()
.position(|(old, _)| *old == ident)
.expect("no currently persisting batch");
let (old_ident, fsm) = self.persisting.remove(idx).unwrap();
assert_eq!(old_ident, ident);
// Recompute the cache of all remaining persisting batch stats (if any)
self.cached = CachedStats::new(self.persisting.iter().map(|(_, v)| v));
fsm
}
pub(crate) fn is_empty(&self) -> bool {
self.persisting.is_empty()
}
/// Returns the row count sum across all batches in this list.
///
/// This is an `O(1)` operation.
pub(crate) fn rows(&self) -> usize {
self.cached.as_ref().map(|v| v.rows).unwrap_or_default()
}
/// Returns the timestamp min/max values across all batches in this list.
///
/// This is an `O(1)` operation.
pub(crate) fn timestamp_stats(&self) -> Option<TimestampMinMax> {
self.cached.as_ref().map(|v| v.timestamps)
}
/// Returns the merged schema of all batches in this list.
///
/// This is an `O(1)` operation.
pub(crate) fn schema(&self) -> Option<&Schema> {
self.cached.as_ref().map(|v| &v.schema)
}
/// Returns the [`RecordBatch`]es in this list, optionally applying the given
/// projection.
///
/// This is an `O(n)` operation.
pub(crate) fn get_query_data<'a, 'b: 'a>(
&'a self,
projection: &'b OwnedProjection,
) -> impl Iterator<Item = RecordBatch> + 'a {
self.persisting
.iter()
.flat_map(move |(_, b)| b.get_query_data(projection))
}
}
/// The set of cached statistics describing the batches of data within the
/// [`PersistingList`].
#[derive(Debug)]
struct CachedStats {
rows: usize,
timestamps: TimestampMinMax,
/// The merged schema of all the persisting batches.
schema: Schema,
}
impl CachedStats {
/// Generate a new [`CachedStats`] from an iterator of batches, if any.
///
/// # Panics
///
/// If any batches are empty (containing no schema or timestamp column), or
/// the batches do not contain compatible schemas, this call panics.
fn new<'a, T>(mut iter: T) -> Option<Self>
where
T: Iterator<Item = &'a BufferState<Persisting>> + 'a,
{
let v = iter.next()?;
let mut schema = SchemaMerger::new();
schema = schema
.merge(&v.schema().expect("persisting batch must be non-empty"))
.unwrap();
let mut rows = v.rows();
debug_assert!(rows > 0);
let mut timestamps = v
.timestamp_stats()
.expect("unprojected batch should have timestamp");
for buf in iter {
rows += buf.rows();
if let Some(v) = buf.schema() {
debug_assert!(buf.rows() > 0);
schema = schema
.merge(&v)
.expect("persist list contains incompatible schemas");
let ts = buf
.timestamp_stats()
.expect("no timestamp for batch containing rows");
timestamps.min = timestamps.min.min(ts.min);
timestamps.max = timestamps.max.max(ts.max);
}
}
Some(Self {
rows,
timestamps,
schema: schema.build(),
})
}
// Incrementally recompute the cached stats by adding `buffer` to the
// statistics.
fn push(&mut self, buffer: &BufferState<Persisting>) {
// This re-computation below MUST complete - no early exit is allowed or
// the stats will be left in an inconsistent state.
self.rows += buffer.rows();
let ts = buffer
.timestamp_stats()
.expect("persisting batch must contain timestamps");
self.timestamps.min = self.timestamps.min.min(ts.min);
self.timestamps.max = self.timestamps.max.max(ts.max);
let mut schema = SchemaMerger::new();
schema = schema.merge(&self.schema).unwrap();
schema = schema
.merge(&buffer.schema().expect("persisting batch must have schema"))
.expect("incompatible schema");
self.schema = schema.build()
}
}
#[cfg(test)]
mod tests {
use std::collections::BTreeSet;
use arrow_util::assert_batches_eq;
use assert_matches::assert_matches;
use data_types::SequenceNumber;
use mutable_batch_lp::test_helpers::lp_to_mutable_batch;
use crate::buffer_tree::partition::buffer::Transition;
use super::*;
/// Ensure the ordering of yielded batches matches that of the calls to
/// push(), preserving batch ordering, and in turn, causal row ordering.
#[test]
fn test_batch_ordering() {
let mut list = PersistingList::default();
let mut ident_oracle = BatchIdent::default();
assert!(list.is_empty());
// Generate a buffer with a single row.
let buffer = buffer_with_lp(r#"bananas,tag=platanos great="yes" 42"#);
// Add it to the list.
list.push(ident_oracle.next(), buffer);
// The statistics must now match the expected values.
assert!(!list.is_empty());
assert_eq!(list.rows(), 1);
assert_matches!(
list.timestamp_stats(),
Some(TimestampMinMax { min: 42, max: 42 })
);
assert_schema_matches(list.schema().unwrap(), &["time", "great", "tag"]);
// Assert the row content
let data = list
.get_query_data(&OwnedProjection::default())
.collect::<Vec<_>>();
let expected = vec![
"+-------+----------+--------------------------------+",
"| great | tag | time |",
"+-------+----------+--------------------------------+",
"| yes | platanos | 1970-01-01T00:00:00.000000042Z |",
"+-------+----------+--------------------------------+",
];
assert_eq!(data.len(), 1);
assert_batches_eq!(&expected, &data);
// Push a new buffer updating the last row to check yielded row ordering.
let buffer = buffer_with_lp(r#"bananas,tag=platanos great="definitely" 42"#);
list.push(ident_oracle.next(), buffer);
// The statistics must now match the expected values.
assert!(!list.is_empty());
assert_eq!(list.rows(), 2);
assert_matches!(
list.timestamp_stats(),
Some(TimestampMinMax { min: 42, max: 42 })
);
assert_schema_matches(list.schema().unwrap(), &["time", "great", "tag"]);
// Assert the row content
let data = list
.get_query_data(&OwnedProjection::default())
.collect::<Vec<_>>();
let expected = vec![
"+------------+----------+--------------------------------+",
"| great | tag | time |",
"+------------+----------+--------------------------------+",
"| yes | platanos | 1970-01-01T00:00:00.000000042Z |",
"| definitely | platanos | 1970-01-01T00:00:00.000000042Z |",
"+------------+----------+--------------------------------+",
];
assert_eq!(data.len(), 2);
assert_batches_eq!(&expected, &data);
}
/// Assert projection across batches works, and does not panic when given a
/// missing column.
#[test]
fn test_projection() {
let mut list = PersistingList::default();
let mut ident_oracle = BatchIdent::default();
assert!(list.is_empty());
// Populate the list.
list.push(
ident_oracle.next(),
buffer_with_lp(
"\
bananas,tag=platanos v=1 42\n\
bananas,tag=platanos v=2,bananas=100 4242\n\
",
),
);
list.push(
ident_oracle.next(),
buffer_with_lp(
"\
bananas,tag=platanos v=3 424242\n\
bananas v=4,bananas=200 42424242\n\
",
),
);
// Assert the row content
let data = list
.get_query_data(&OwnedProjection::from(vec!["time", "tag", "missing"]))
.collect::<Vec<_>>();
let expected = vec![
"+--------------------------------+----------+",
"| time | tag |",
"+--------------------------------+----------+",
"| 1970-01-01T00:00:00.000000042Z | platanos |",
"| 1970-01-01T00:00:00.000004242Z | platanos |",
"| 1970-01-01T00:00:00.000424242Z | platanos |",
"| 1970-01-01T00:00:00.042424242Z | |",
"+--------------------------------+----------+",
];
assert_batches_eq!(&expected, &data);
}
/// Validate the cached statistics as batches are added and removed.
#[test]
fn test_cached_statistics() {
let mut list = PersistingList::default();
let mut ident_oracle = BatchIdent::default();
assert!(list.is_empty());
// Generate a buffer with a single row.
let first_batch = ident_oracle.next();
list.push(
first_batch,
buffer_with_lp(r#"bananas,tag=platanos great="yes" 42"#),
);
// The statistics must now match the expected values.
assert!(!list.is_empty());
assert_eq!(list.rows(), 1);
assert_matches!(
list.timestamp_stats(),
Some(TimestampMinMax { min: 42, max: 42 })
);
assert_schema_matches(list.schema().unwrap(), &["time", "great", "tag"]);
// Push another row.
let second_batch = ident_oracle.next();
list.push(
second_batch,
buffer_with_lp(r#"bananas,another=yes great="definitely",incremental=true 4242"#),
);
// The statistics must now match the expected values.
assert!(!list.is_empty());
assert_eq!(list.rows(), 2);
assert_matches!(
list.timestamp_stats(),
Some(TimestampMinMax { min: 42, max: 4242 })
);
assert_schema_matches(
list.schema().unwrap(),
&["time", "great", "tag", "another", "incremental"],
);
// Remove the first batch.
list.remove(first_batch);
// The statistics must now match the second batch values.
assert!(!list.is_empty());
assert_eq!(list.rows(), 1);
assert_matches!(
list.timestamp_stats(),
Some(TimestampMinMax {
min: 4242,
max: 4242
})
);
assert_schema_matches(
list.schema().unwrap(),
&["time", "great", "another", "incremental"],
);
// Remove the second/final batch.
list.remove(second_batch);
assert!(list.is_empty());
assert_eq!(list.rows(), 0);
assert_matches!(list.timestamp_stats(), None);
assert_matches!(list.schema(), None);
}
/// Assert the schema columns match the given names.
fn assert_schema_matches(schema: &Schema, cols: &[&str]) {
let schema = schema.as_arrow();
let got = schema
.all_fields()
.into_iter()
.map(|v| v.name().to_owned())
.collect::<BTreeSet<_>>();
let want = cols
.iter()
.map(ToString::to_string)
.collect::<BTreeSet<_>>();
assert_eq!(got, want);
}
/// Return a persisting buffer containing the given LP content.
fn buffer_with_lp(lp: &str) -> BufferState<Persisting> {
let mut buffer = BufferState::new();
// Write some data to a buffer.
buffer
.write(lp_to_mutable_batch(lp).1, SequenceNumber::new(0))
.expect("write to empty buffer should succeed");
// Convert the buffer into a persisting snapshot.
match buffer.snapshot() {
Transition::Ok(v) => v.into_persisting(),
Transition::Unchanged(_) => panic!("did not transition to snapshot state"),
}
}
}

View File

@ -1,3 +1,5 @@
use gossip::{GossipHandle, NopDispatcher};
/// This needs to be pub for the benchmarks but should not be used outside the crate.
#[cfg(feature = "benches")]
pub use wal_replay::*;
@ -5,7 +7,7 @@ pub use wal_replay::*;
mod graceful_shutdown;
mod wal_replay;
use std::{path::PathBuf, sync::Arc, time::Duration};
use std::{net::SocketAddr, path::PathBuf, sync::Arc, time::Duration};
use arrow_flight::flight_service_server::FlightService;
use backoff::BackoffConfig;
@ -109,6 +111,9 @@ pub struct IngesterGuard<T> {
/// The task handle executing the graceful shutdown once triggered.
graceful_shutdown_handler: tokio::task::JoinHandle<()>,
shutdown_complete: Shared<oneshot::Receiver<()>>,
/// An optional handle to the gossip sub-system, if running.
gossip_handle: Option<GossipHandle>,
}
impl<T> IngesterGuard<T>
@ -137,6 +142,27 @@ impl<T> Drop for IngesterGuard<T> {
}
}
/// Configuration parameters for the optional gossip sub-system.
#[derive(Debug, Default)]
pub enum GossipConfig {
/// Disable the gossip sub-system.
#[default]
Disabled,
/// Enable the gossip sub-system, listening on the specified `bind_addr` and
/// using `peers` as the initial peer seed list.
Enabled {
/// UDP socket address to use for gossip communication.
bind_addr: SocketAddr,
/// Initial peer seed list in the form of either:
///
/// - "dns.address.example:port"
/// - "10.0.0.1:port"
///
peers: Vec<String>,
},
}
/// Errors that occur during initialisation of an `ingester` instance.
#[derive(Debug, Error)]
pub enum InitError {
@ -152,6 +178,10 @@ pub enum InitError {
/// An error replaying the entries in the WAL.
#[error(transparent)]
WalReplay(Box<dyn std::error::Error>),
/// An error binding the UDP socket for gossip communication.
#[error("failed to bind udp gossip socket: {0}")]
GossipBind(std::io::Error),
}
/// Initialise a new `ingester` instance, returning the gRPC service handler
@ -238,6 +268,7 @@ pub async fn new<F>(
persist_queue_depth: usize,
persist_hot_partition_cost: usize,
object_store: ParquetStorage,
gossip: GossipConfig,
shutdown: F,
) -> Result<IngesterGuard<impl IngesterRpcInterface>, InitError>
where
@ -351,11 +382,9 @@ where
// Initialize disk metrics to emit disk capacity / free statistics for the
// WAL directory.
let disk_metric_task = tokio::task::spawn(
DiskSpaceMetrics::new(wal_directory, &metrics)
.expect("failed to resolve WAL directory to disk")
.run(),
);
let (disk_metric_task, _snapshot_rx) = DiskSpaceMetrics::new(wal_directory, &metrics)
.expect("failed to resolve WAL directory to disk");
let disk_metric_task = tokio::task::spawn(disk_metric_task.run());
// Replay the WAL log files, if any.
let max_sequence_number =
@ -422,6 +451,23 @@ where
wal_reference_handle,
));
// Optionally start the gossip subsystem
let gossip_handle = match gossip {
GossipConfig::Disabled => {
info!("gossip disabled");
None
}
GossipConfig::Enabled { bind_addr, peers } => {
// Start the gossip sub-system, which logs during init.
let handle =
gossip::Builder::new(peers, NopDispatcher::default(), Arc::clone(&metrics))
.bind(bind_addr)
.await
.map_err(InitError::GossipBind)?;
Some(handle)
}
};
Ok(IngesterGuard {
rpc: GrpcDelegate::new(
Arc::new(write_path),
@ -438,5 +484,6 @@ where
disk_metric_task,
graceful_shutdown_handler: shutdown_task,
shutdown_complete: shutdown_rx.shared(),
gossip_handle,
})
}
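A hedged sketch of opting in to the new gossip sub-system when constructing an ingester; the bind address and peer seeds are placeholders, and wiring them from CLI configuration is out of scope here.

use std::net::SocketAddr;

use ingester::GossipConfig;

fn example_gossip_config(enable: bool) -> GossipConfig {
    if enable {
        GossipConfig::Enabled {
            // Placeholder UDP bind address and seed peers.
            bind_addr: "0.0.0.0:4242".parse::<SocketAddr>().expect("valid socket address"),
            peers: vec!["10.0.0.1:4242".to_owned(), "gossip.peer.example:4242".to_owned()],
        }
    } else {
        // The default keeps gossip disabled, as the test context below does.
        GossipConfig::default()
    }
}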

View File

@ -200,6 +200,7 @@
unused_crate_dependencies,
missing_docs
)]
#![allow(clippy::default_constructed_unit_structs)]
// Workaround for "unused crate" lint false positives.
#[cfg(test)]

View File

@ -2,7 +2,8 @@ use std::{fmt::Debug, sync::Arc, time::Duration};
use async_trait::async_trait;
use data_types::{
sequence_number_set::SequenceNumberSet, NamespaceId, ParquetFileParams, PartitionId, TableId,
sequence_number_set::SequenceNumberSet, NamespaceId, ParquetFileParams, TableId,
TransitionPartitionId,
};
use crate::wal::reference_tracker::WalReferenceHandle;
@ -54,9 +55,9 @@ impl CompletedPersist {
self.meta.table_id
}
/// Returns the [`PartitionId`] of the persisted data.
pub(crate) fn partition_id(&self) -> PartitionId {
self.meta.partition_id
/// Returns the [`TransitionPartitionId`] of the persisted data.
pub(crate) fn partition_id(&self) -> &TransitionPartitionId {
&self.meta.partition_id
}
/// Returns the [`SequenceNumberSet`] of the persisted data.
@ -166,15 +167,16 @@ pub(crate) mod mock {
#[cfg(test)]
mod tests {
use super::*;
use crate::test_util::{ARBITRARY_NAMESPACE_ID, ARBITRARY_PARTITION_ID, ARBITRARY_TABLE_ID};
use crate::test_util::{
ARBITRARY_NAMESPACE_ID, ARBITRARY_TABLE_ID, ARBITRARY_TRANSITION_PARTITION_ID,
};
use data_types::{ColumnId, ColumnSet, SequenceNumber, Timestamp};
fn arbitrary_file_meta() -> ParquetFileParams {
ParquetFileParams {
namespace_id: ARBITRARY_NAMESPACE_ID,
table_id: ARBITRARY_TABLE_ID,
partition_id: ARBITRARY_PARTITION_ID,
partition_hash_id: None,
partition_id: ARBITRARY_TRANSITION_PARTITION_ID.clone(),
object_store_id: Default::default(),
min_time: Timestamp::new(42),
max_time: Timestamp::new(42),
@ -226,7 +228,7 @@ mod tests {
assert_eq!(note.namespace_id(), meta.namespace_id);
assert_eq!(note.table_id(), meta.table_id);
assert_eq!(note.partition_id(), meta.partition_id);
assert_eq!(note.partition_id(), &meta.partition_id);
assert_eq!(note.column_count(), meta.column_set.len());
assert_eq!(note.row_count(), meta.row_count as usize);

View File

@ -151,7 +151,9 @@ mod tests {
use super::*;
use crate::{
persist::completion_observer::mock::MockCompletionObserver,
test_util::{ARBITRARY_NAMESPACE_ID, ARBITRARY_PARTITION_ID, ARBITRARY_TABLE_ID},
test_util::{
ARBITRARY_NAMESPACE_ID, ARBITRARY_TABLE_ID, ARBITRARY_TRANSITION_PARTITION_ID,
},
};
use data_types::{
sequence_number_set::SequenceNumberSet, ColumnId, ColumnSet, ParquetFileParams, Timestamp,
@ -169,8 +171,7 @@ mod tests {
let meta = ParquetFileParams {
namespace_id: ARBITRARY_NAMESPACE_ID,
table_id: ARBITRARY_TABLE_ID,
partition_id: ARBITRARY_PARTITION_ID,
partition_hash_id: None,
partition_id: ARBITRARY_TRANSITION_PARTITION_ID.clone(),
object_store_id: Default::default(),
min_time: Timestamp::new(Duration::from_secs(1_000).as_nanos() as _),
max_time: Timestamp::new(Duration::from_secs(1_042).as_nanos() as _), // 42 seconds later

View File

@ -16,7 +16,7 @@ mod tests {
use std::{sync::Arc, time::Duration};
use assert_matches::assert_matches;
use data_types::{CompactionLevel, ParquetFile, TransitionPartitionId};
use data_types::{CompactionLevel, ParquetFile};
use futures::TryStreamExt;
use iox_catalog::{
interface::{get_schema_by_id, Catalog, SoftDeletedRows},
@ -190,7 +190,7 @@ mod tests {
// Generate a partition with data
let partition = partition_with_write(Arc::clone(&catalog)).await;
let table_id = partition.lock().table_id();
let partition_id = partition.lock().partition_id();
let partition_id = partition.lock().transition_partition_id();
let namespace_id = partition.lock().namespace_id();
assert_matches!(partition.lock().sort_key(), SortKeyState::Provided(None));
@ -221,7 +221,7 @@ mod tests {
assert_matches!(&completion_observer.calls().as_slice(), &[n] => {
assert_eq!(n.namespace_id(), namespace_id);
assert_eq!(n.table_id(), table_id);
assert_eq!(n.partition_id(), partition_id);
assert_eq!(n.partition_id(), &partition_id);
assert_eq!(n.sequence_numbers().len(), 1);
});
@ -243,12 +243,12 @@ mod tests {
.repositories()
.await
.parquet_files()
.list_by_partition_not_to_delete(&TransitionPartitionId::Deprecated(partition_id))
.list_by_partition_not_to_delete(&partition_id)
.await
.expect("query for parquet files failed");
// Validate a single file was inserted with the expected properties.
let (object_store_id, file_size_bytes) = assert_matches!(&*files, &[ParquetFile {
let (object_store_id, file_size_bytes) = assert_matches!(&*files, [ParquetFile {
namespace_id: got_namespace_id,
table_id: got_table_id,
partition_id: got_partition_id,
@ -263,12 +263,12 @@ mod tests {
{
assert_eq!(created_at.get(), max_l0_created_at.get());
assert_eq!(got_namespace_id, namespace_id);
assert_eq!(got_table_id, table_id);
assert_eq!(got_partition_id, partition_id);
assert_eq!(got_namespace_id, &namespace_id);
assert_eq!(got_table_id, &table_id);
assert_eq!(got_partition_id, &partition_id);
assert_eq!(row_count, 1);
assert_eq!(compaction_level, CompactionLevel::Initial);
assert_eq!(*row_count, 1);
assert_eq!(compaction_level, &CompactionLevel::Initial);
(object_store_id, file_size_bytes)
}
@ -292,7 +292,7 @@ mod tests {
}] => {
let want_path = format!("{object_store_id}.parquet");
assert!(location.as_ref().ends_with(&want_path));
assert_eq!(size, file_size_bytes as usize);
assert_eq!(size, *file_size_bytes as usize);
}
)
}
@ -326,8 +326,7 @@ mod tests {
// Generate a partition with data
let partition = partition_with_write(Arc::clone(&catalog)).await;
let table_id = partition.lock().table_id();
let partition_id = partition.lock().partition_id();
let transition_partition_id = partition.lock().transition_partition_id();
let partition_id = partition.lock().transition_partition_id();
let namespace_id = partition.lock().namespace_id();
assert_matches!(partition.lock().sort_key(), SortKeyState::Provided(None));
@ -344,7 +343,7 @@ mod tests {
.await
.partitions()
.cas_sort_key(
&transition_partition_id,
&partition_id,
None,
&["bananas", "are", "good", "for", "you"],
)
@ -367,7 +366,7 @@ mod tests {
assert_matches!(&completion_observer.calls().as_slice(), &[n] => {
assert_eq!(n.namespace_id(), namespace_id);
assert_eq!(n.table_id(), table_id);
assert_eq!(n.partition_id(), partition_id);
assert_eq!(n.partition_id(), &partition_id);
assert_eq!(n.sequence_numbers().len(), 1);
});
@ -392,12 +391,12 @@ mod tests {
.repositories()
.await
.parquet_files()
.list_by_partition_not_to_delete(&TransitionPartitionId::Deprecated(partition_id))
.list_by_partition_not_to_delete(&partition_id)
.await
.expect("query for parquet files failed");
// Validate a single file was inserted with the expected properties.
let (object_store_id, file_size_bytes) = assert_matches!(&*files, &[ParquetFile {
let (object_store_id, file_size_bytes) = assert_matches!(&*files, [ParquetFile {
namespace_id: got_namespace_id,
table_id: got_table_id,
partition_id: got_partition_id,
@ -412,12 +411,12 @@ mod tests {
{
assert_eq!(created_at.get(), max_l0_created_at.get());
assert_eq!(got_namespace_id, namespace_id);
assert_eq!(got_table_id, table_id);
assert_eq!(got_partition_id, partition_id);
assert_eq!(got_namespace_id, &namespace_id);
assert_eq!(got_table_id, &table_id);
assert_eq!(got_partition_id, &partition_id);
assert_eq!(row_count, 1);
assert_eq!(compaction_level, CompactionLevel::Initial);
assert_eq!(*row_count, 1);
assert_eq!(compaction_level, &CompactionLevel::Initial);
(object_store_id, file_size_bytes)
}
@ -438,18 +437,14 @@ mod tests {
assert_eq!(files.len(), 2, "expected two uploaded files");
// Ensure the catalog record points at a valid file in object storage.
let want_path = ParquetFilePath::new(
namespace_id,
table_id,
&transition_partition_id,
object_store_id,
)
.object_store_path();
let want_path =
ParquetFilePath::new(namespace_id, table_id, &partition_id, *object_store_id)
.object_store_path();
let file = files
.into_iter()
.find(|f| f.location == want_path)
.expect("did not find final file in object storage");
assert_eq!(file.size, file_size_bytes as usize);
assert_eq!(file.size, *file_size_bytes as usize);
}
}

View File

@ -55,7 +55,8 @@ pub(crate) mod mock {
use std::{sync::Arc, time::Duration};
use data_types::{
ColumnId, ColumnSet, NamespaceId, ParquetFileParams, PartitionId, TableId, Timestamp,
ColumnId, ColumnSet, NamespaceId, ParquetFileParams, PartitionHashId, PartitionKey,
TableId, Timestamp, TransitionPartitionId,
};
use test_helpers::timeout::FutureTimeout;
use tokio::task::JoinHandle;
@ -155,13 +156,16 @@ pub(crate) mod mock {
let wait_ms: u64 = rand::random::<u64>() % 100;
tokio::time::sleep(Duration::from_millis(wait_ms)).await;
let sequence_numbers = partition.lock().mark_persisted(data);
let table_id = TableId::new(2);
let partition_hash_id =
PartitionHashId::new(table_id, &PartitionKey::from("arbitrary"));
let partition_id = TransitionPartitionId::Deterministic(partition_hash_id);
completion_observer
.persist_complete(Arc::new(CompletedPersist::new(
ParquetFileParams {
namespace_id: NamespaceId::new(1),
table_id: TableId::new(2),
partition_id: PartitionId::new(3),
partition_hash_id: None,
table_id,
partition_id,
object_store_id: Default::default(),
min_time: Timestamp::new(42),
max_time: Timestamp::new(42),

View File

@ -394,8 +394,7 @@ where
ParquetFileParams {
namespace_id: NamespaceId::new(1),
table_id: TableId::new(2),
partition_id: PartitionId::new(3),
partition_hash_id: None,
partition_id: ARBITRARY_TRANSITION_PARTITION_ID.clone(),
object_store_id: Default::default(),
min_time: Timestamp::new(42),
max_time: Timestamp::new(42),

View File

@ -30,7 +30,7 @@ use futures::{stream::FuturesUnordered, FutureExt, StreamExt, TryStreamExt};
use generated_types::influxdata::iox::ingester::v1::{
write_service_server::WriteService, WriteRequest,
};
use ingester::{IngesterGuard, IngesterRpcInterface};
use ingester::{GossipConfig, IngesterGuard, IngesterRpcInterface};
use ingester_query_grpc::influxdata::iox::ingester::v1::IngesterQueryRequest;
use iox_catalog::{
interface::{Catalog, SoftDeletedRows},
@ -168,6 +168,7 @@ impl TestContextBuilder {
max_persist_queue_depth,
persist_hot_partition_cost,
storage.clone(),
GossipConfig::default(),
shutdown_rx.map(|v| v.expect("shutdown sender dropped without calling shutdown")),
)
.await

View File

@ -0,0 +1,24 @@
DROP TRIGGER IF EXISTS update_partition ON parquet_file;
ALTER TABLE parquet_file
ALTER COLUMN partition_id
DROP NOT NULL;
CREATE OR REPLACE FUNCTION update_partition_on_new_file_at()
RETURNS TRIGGER
LANGUAGE PLPGSQL
AS $$
BEGIN
UPDATE partition
SET new_file_at = NEW.created_at
WHERE (NEW.partition_id IS NULL OR id = NEW.partition_id)
AND (NEW.partition_hash_id IS NULL OR hash_id = NEW.partition_hash_id);
RETURN NEW;
END;
$$;
CREATE TRIGGER update_partition
AFTER INSERT ON parquet_file
FOR EACH ROW
EXECUTE PROCEDURE update_partition_on_new_file_at();

View File

@ -0,0 +1,98 @@
CREATE TABLE parquet_file_temp
AS SELECT * FROM parquet_file;
DROP TABLE parquet_file;
CREATE TABLE parquet_file
(
id INTEGER
constraint parquet_file_pkey
primary key autoincrement,
shard_id numeric not null
constraint parquet_file_sequencer_id_fkey
references shard,
table_id numeric not null
references table_name,
partition_id numeric
references partition,
partition_hash_id bytea
references partition (hash_id),
object_store_id uuid not null
constraint parquet_location_unique
unique,
max_sequence_number numeric,
min_time numeric,
max_time numeric,
to_delete numeric,
row_count numeric default 0 not null,
file_size_bytes numeric default 0 not null,
compaction_level smallint default 0 not null,
created_at numeric,
namespace_id numeric not null
references namespace
on delete cascade,
column_set numeric[] not null,
max_l0_created_at numeric default 0 not null
);
create index if not exists parquet_file_deleted_at_idx
on parquet_file (to_delete);
create index if not exists parquet_file_partition_idx
on parquet_file (partition_id);
create index if not exists parquet_file_table_idx
on parquet_file (table_id);
create index if not exists parquet_file_shard_compaction_delete_idx
on parquet_file (shard_id, compaction_level, to_delete);
create index if not exists parquet_file_shard_compaction_delete_created_idx
on parquet_file (shard_id, compaction_level, to_delete, created_at);
create index if not exists parquet_file_partition_created_idx
on parquet_file (partition_id, created_at);
CREATE INDEX IF NOT EXISTS parquet_file_partition_hash_id_idx
ON parquet_file (partition_hash_id)
WHERE partition_hash_id IS NOT NULL;
create trigger if not exists update_partition
after insert
on parquet_file
for each row
begin
UPDATE partition
SET new_file_at = NEW.created_at
WHERE (NEW.partition_id IS NULL OR id = NEW.partition_id)
AND (NEW.partition_hash_id IS NULL OR hash_id = NEW.partition_hash_id);
end;
create trigger if not exists update_billing
after insert
on parquet_file
for each row
begin
INSERT INTO billing_summary (namespace_id, total_file_size_bytes)
VALUES (NEW.namespace_id, NEW.file_size_bytes)
ON CONFLICT (namespace_id) DO UPDATE
SET total_file_size_bytes = billing_summary.total_file_size_bytes + NEW.file_size_bytes
WHERE billing_summary.namespace_id = NEW.namespace_id;
end;
create trigger if not exists decrement_summary
after update
on parquet_file
for each row
when OLD.to_delete IS NULL AND NEW.to_delete IS NOT NULL
begin
UPDATE billing_summary
SET total_file_size_bytes = billing_summary.total_file_size_bytes - OLD.file_size_bytes
WHERE billing_summary.namespace_id = OLD.namespace_id;
end;
INSERT INTO parquet_file
SELECT * FROM parquet_file_temp;
DROP TABLE parquet_file_temp;

View File

@ -1865,7 +1865,7 @@ pub(crate) mod test_helpers {
let other_params = ParquetFileParams {
table_id: other_partition.table_id,
partition_id: other_partition.id,
partition_id: other_partition.transition_partition_id(),
object_store_id: Uuid::new_v4(),
min_time: Timestamp::new(50),
max_time: Timestamp::new(60),
@ -1978,7 +1978,7 @@ pub(crate) mod test_helpers {
let f1_params = ParquetFileParams {
table_id: partition2.table_id,
partition_id: partition2.id,
partition_id: partition2.transition_partition_id(),
object_store_id: Uuid::new_v4(),
min_time: Timestamp::new(1),
max_time: Timestamp::new(10),
@ -2449,7 +2449,7 @@ pub(crate) mod test_helpers {
let l0_five_hour_ago_file_params = ParquetFileParams {
object_store_id: Uuid::new_v4(),
created_at: time_five_hour_ago,
partition_id: partition2.id,
partition_id: partition2.transition_partition_id(),
..parquet_file_params.clone()
};
repos
@ -2492,7 +2492,7 @@ pub(crate) mod test_helpers {
let l1_file_params = ParquetFileParams {
object_store_id: Uuid::new_v4(),
created_at: time_now,
partition_id: partition2.id,
partition_id: partition2.transition_partition_id(),
compaction_level: CompactionLevel::FileNonOverlapped,
..parquet_file_params.clone()
};
@ -2578,7 +2578,7 @@ pub(crate) mod test_helpers {
let l2_file_params = ParquetFileParams {
object_store_id: Uuid::new_v4(),
created_at: time_now,
partition_id: partition3.id,
partition_id: partition3.transition_partition_id(),
compaction_level: CompactionLevel::Final,
..parquet_file_params.clone()
};
@ -2619,7 +2619,7 @@ pub(crate) mod test_helpers {
let l0_one_hour_ago_file_params = ParquetFileParams {
object_store_id: Uuid::new_v4(),
created_at: time_one_hour_ago,
partition_id: partition3.id,
partition_id: partition3.transition_partition_id(),
..parquet_file_params.clone()
};
repos
@ -2720,8 +2720,7 @@ pub(crate) mod test_helpers {
level1_file.compaction_level = CompactionLevel::FileNonOverlapped;
let other_partition_params = ParquetFileParams {
partition_id: partition2.id,
partition_hash_id: partition2.hash_id().cloned(),
partition_id: partition2.transition_partition_id(),
object_store_id: Uuid::new_v4(),
..parquet_file_params.clone()
};
@ -2744,12 +2743,20 @@ pub(crate) mod test_helpers {
expected_ids.sort();
assert_eq!(file_ids, expected_ids);
// remove namespace to avoid it from affecting later tests
repos
.namespaces()
.soft_delete("namespace_parquet_file_test_list_by_partiton_not_to_delete")
// Using the catalog partition ID should return the same files, even if the Parquet file
// records don't have the partition ID on them (which is the default now)
let files = repos
.parquet_files()
.list_by_partition_not_to_delete(&TransitionPartitionId::Deprecated(partition.id))
.await
.expect("delete namespace should succeed");
.unwrap();
assert_eq!(files.len(), 2);
let mut file_ids: Vec<_> = files.into_iter().map(|f| f.id).collect();
file_ids.sort();
let mut expected_ids = vec![parquet_file.id, level1_file.id];
expected_ids.sort();
assert_eq!(file_ids, expected_ids);
}
async fn test_update_to_compaction_level_1(catalog: Arc<dyn Catalog>) {

View File

@ -396,8 +396,7 @@ pub mod test_helpers {
ParquetFileParams {
namespace_id: namespace.id,
table_id: table.id,
partition_id: partition.id,
partition_hash_id: partition.hash_id().cloned(),
partition_id: partition.transition_partition_id(),
object_store_id: Uuid::new_v4(),
min_time: Timestamp::new(1),
max_time: Timestamp::new(10),

View File

@ -887,14 +887,28 @@ impl ParquetFileRepo for MemTxn {
) -> Result<Vec<ParquetFile>> {
let stage = self.stage();
let partition = stage
.partitions
.iter()
.find(|p| match partition_id {
TransitionPartitionId::Deterministic(hash_id) => p
.hash_id()
.map(|p_hash_id| p_hash_id == hash_id)
.unwrap_or(false),
TransitionPartitionId::Deprecated(id) => id == &p.id,
})
.unwrap()
.clone();
Ok(stage
.parquet_files
.iter()
.filter(|f| match partition_id {
TransitionPartitionId::Deterministic(hash_id) => {
f.partition_hash_id.as_ref().map_or(false, |h| h == hash_id)
}
TransitionPartitionId::Deprecated(id) => f.partition_id == *id,
.filter(|f| match &f.partition_id {
TransitionPartitionId::Deterministic(hash_id) => partition
.hash_id()
.map(|p_hash_id| p_hash_id == hash_id)
.unwrap_or(false),
TransitionPartitionId::Deprecated(id) => id == &partition.id,
})
.filter(|f| f.to_delete.is_none())
.cloned()
@ -996,17 +1010,15 @@ async fn create_parquet_file(
ParquetFileId::new(stage.parquet_files.len() as i64 + 1),
);
let created_at = parquet_file.created_at;
let partition_id = parquet_file.partition_id;
let partition_id = parquet_file.partition_id.clone();
stage.parquet_files.push(parquet_file);
// Update the new_file_at field of its partition to the time of created_at
let partition = stage
.partitions
.iter_mut()
.find(|p| p.id == partition_id)
.ok_or(Error::PartitionNotFound {
id: TransitionPartitionId::Deprecated(partition_id),
})?;
.find(|p| p.transition_partition_id() == partition_id)
.ok_or(Error::PartitionNotFound { id: partition_id })?;
partition.new_file_at = Some(created_at);
Ok(stage.parquet_files.last().unwrap().clone())

View File

@ -1627,22 +1627,26 @@ RETURNING id;
let query = match partition_id {
TransitionPartitionId::Deterministic(hash_id) => sqlx::query_as::<_, ParquetFile>(
r#"
SELECT id, namespace_id, table_id, partition_id, partition_hash_id, object_store_id, min_time,
max_time, to_delete, file_size_bytes, row_count, compaction_level, created_at, column_set,
max_l0_created_at
SELECT parquet_file.id, namespace_id, parquet_file.table_id, partition_id, partition_hash_id,
object_store_id, min_time, max_time, parquet_file.to_delete, file_size_bytes, row_count,
compaction_level, created_at, column_set, max_l0_created_at
FROM parquet_file
WHERE parquet_file.partition_hash_id = $1
INNER JOIN partition
ON partition.id = parquet_file.partition_id OR partition.hash_id = parquet_file.partition_hash_id
WHERE partition.hash_id = $1
AND parquet_file.to_delete IS NULL;
"#,
)
.bind(hash_id), // $1
TransitionPartitionId::Deprecated(id) => sqlx::query_as::<_, ParquetFile>(
r#"
SELECT id, namespace_id, table_id, partition_id, partition_hash_id, object_store_id, min_time,
max_time, to_delete, file_size_bytes, row_count, compaction_level, created_at, column_set,
max_l0_created_at
SELECT parquet_file.id, namespace_id, parquet_file.table_id, partition_id, partition_hash_id,
object_store_id, min_time, max_time, parquet_file.to_delete, file_size_bytes, row_count,
compaction_level, created_at, column_set, max_l0_created_at
FROM parquet_file
WHERE parquet_file.partition_id = $1
INNER JOIN partition
ON partition.id = parquet_file.partition_id OR partition.hash_id = parquet_file.partition_hash_id
WHERE partition.id = $1
AND parquet_file.to_delete IS NULL;
"#,
)
@ -1754,7 +1758,6 @@ where
namespace_id,
table_id,
partition_id,
partition_hash_id,
object_store_id,
min_time,
max_time,
@ -1766,6 +1769,11 @@ where
max_l0_created_at,
} = parquet_file_params;
let (partition_id, partition_hash_id) = match partition_id {
TransitionPartitionId::Deterministic(hash_id) => (None, Some(hash_id)),
TransitionPartitionId::Deprecated(id) => (Some(id), None),
};
let partition_hash_id_ref = &partition_hash_id.as_ref();
let query = sqlx::query_scalar::<_, ParquetFileId>(
r#"
@ -2203,7 +2211,10 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, new_file_at;
.create(parquet_file_params)
.await
.unwrap();
assert!(parquet_file.partition_hash_id.is_none());
assert_matches!(
parquet_file.partition_id,
TransitionPartitionId::Deprecated(_)
);
}
#[test]

View File

@ -1221,8 +1221,8 @@ struct ParquetFilePod {
id: ParquetFileId,
namespace_id: NamespaceId,
table_id: TableId,
partition_id: PartitionId,
partition_hash_id: Option<PartitionHashId>,
#[sqlx(flatten)]
partition_id: TransitionPartitionId,
object_store_id: Uuid,
min_time: Timestamp,
max_time: Timestamp,
@ -1242,7 +1242,6 @@ impl From<ParquetFilePod> for ParquetFile {
namespace_id: value.namespace_id,
table_id: value.table_id,
partition_id: value.partition_id,
partition_hash_id: value.partition_hash_id,
object_store_id: value.object_store_id,
min_time: value.min_time,
max_time: value.max_time,
@ -1395,22 +1394,26 @@ RETURNING id;
let query = match partition_id {
TransitionPartitionId::Deterministic(hash_id) => sqlx::query_as::<_, ParquetFilePod>(
r#"
SELECT id, namespace_id, table_id, partition_id, partition_hash_id, object_store_id, min_time,
max_time, to_delete, file_size_bytes, row_count, compaction_level, created_at, column_set,
max_l0_created_at
SELECT parquet_file.id, namespace_id, parquet_file.table_id, partition_id, partition_hash_id,
object_store_id, min_time, max_time, parquet_file.to_delete, file_size_bytes, row_count,
compaction_level, created_at, column_set, max_l0_created_at
FROM parquet_file
WHERE parquet_file.partition_hash_id = $1
INNER JOIN partition
ON partition.id = parquet_file.partition_id OR partition.hash_id = parquet_file.partition_hash_id
WHERE partition.hash_id = $1
AND parquet_file.to_delete IS NULL;
"#,
)
.bind(hash_id), // $1
TransitionPartitionId::Deprecated(id) => sqlx::query_as::<_, ParquetFilePod>(
r#"
SELECT id, namespace_id, table_id, partition_id, partition_hash_id, object_store_id, min_time,
max_time, to_delete, file_size_bytes, row_count, compaction_level, created_at, column_set,
max_l0_created_at
SELECT parquet_file.id, namespace_id, parquet_file.table_id, partition_id, partition_hash_id,
object_store_id, min_time, max_time, parquet_file.to_delete, file_size_bytes, row_count,
compaction_level, created_at, column_set, max_l0_created_at
FROM parquet_file
WHERE parquet_file.partition_id = $1
INNER JOIN partition
ON partition.id = parquet_file.partition_id OR partition.hash_id = parquet_file.partition_hash_id
WHERE partition.id = $1
AND parquet_file.to_delete IS NULL;
"#,
)
@ -1533,7 +1536,6 @@ where
namespace_id,
table_id,
partition_id,
partition_hash_id,
object_store_id,
min_time,
max_time,
@ -1545,7 +1547,10 @@ where
max_l0_created_at,
} = parquet_file_params;
let partition_hash_id_ref = &partition_hash_id.as_ref();
let (partition_id, partition_hash_id) = match partition_id {
TransitionPartitionId::Deterministic(hash_id) => (None, Some(hash_id)),
TransitionPartitionId::Deprecated(id) => (Some(id), None),
};
let res = sqlx::query_as::<_, ParquetFilePod>(
r#"
INSERT INTO parquet_file (
@ -1562,7 +1567,7 @@ RETURNING
.bind(TRANSITION_SHARD_ID) // $1
.bind(table_id) // $2
.bind(partition_id) // $3
.bind(partition_hash_id_ref) // $4
.bind(partition_hash_id.as_ref()) // $4
.bind(object_store_id) // $5
.bind(min_time) // $6
.bind(max_time) // $7
@ -1811,7 +1816,10 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, new_file_at;
.create(parquet_file_params)
.await
.unwrap();
assert!(parquet_file.partition_hash_id.is_none());
assert_matches!(
parquet_file.partition_id,
TransitionPartitionId::Deprecated(_)
);
}
macro_rules! test_column_create_or_get_many_unchecked {

View File

@ -1,6 +1,7 @@
use data_types::{
ColumnSet, CompactionLevel, NamespaceId, ParquetFile, ParquetFileId, Partition,
PartitionHashId, PartitionId, PartitionKey, SkippedCompaction, Table, TableId, Timestamp,
TransitionPartitionId,
};
use uuid::Uuid;
@ -20,8 +21,7 @@ impl ParquetFileBuilder {
id: ParquetFileId::new(id),
namespace_id: NamespaceId::new(0),
table_id,
partition_id: PartitionId::new(0),
partition_hash_id: Some(PartitionHashId::new(
partition_id: TransitionPartitionId::Deterministic(PartitionHashId::new(
table_id,
&PartitionKey::from("arbitrary"),
)),
@ -39,11 +39,11 @@ impl ParquetFileBuilder {
}
}
/// Set the partition id
pub fn with_partition(self, id: i64) -> Self {
/// Set the partition identifier
pub fn with_partition(self, partition_id: TransitionPartitionId) -> Self {
Self {
file: ParquetFile {
partition_id: PartitionId::new(id),
partition_id,
..self.file
},
}

View File

@ -602,8 +602,7 @@ impl TestPartition {
let parquet_file_params = ParquetFileParams {
namespace_id: self.namespace.namespace.id,
table_id: self.table.table.id,
partition_id: self.partition.id,
partition_hash_id: self.partition.hash_id().cloned(),
partition_id: self.partition.transition_partition_id(),
object_store_id: object_store_id.unwrap_or_else(Uuid::new_v4),
min_time: Timestamp::new(min_time),
max_time: Timestamp::new(max_time),

View File

@ -17,6 +17,8 @@
// Workaround for "unused crate" lint false positives.
use workspace_hack as _;
use data_types::{PartitionHashId, PartitionKey, TableId, TransitionPartitionId};
mod catalog;
pub use catalog::{
TestCatalog, TestNamespace, TestParquetFile, TestParquetFileBuilder, TestPartition, TestTable,
@ -24,3 +26,14 @@ pub use catalog::{
mod builders;
pub use builders::{ParquetFileBuilder, PartitionBuilder, SkippedCompactionBuilder, TableBuilder};
/// Create a partition identifier from an int (which gets used as the table ID) and a partition key
/// with the string "arbitrary". Most useful in cases where there isn't any actual catalog
/// interaction (that is, in mocks) and when the important property of the partition identifiers is
/// that they're either the same as or different from other partition identifiers.
pub fn partition_identifier(table_id: i64) -> TransitionPartitionId {
TransitionPartitionId::Deterministic(PartitionHashId::new(
TableId::new(table_id),
&PartitionKey::from("arbitrary"),
))
}
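A minimal usage sketch of this helper (hypothetical test code, not part of the change), illustrating the property described in the doc comment: the same table ID always yields the same identifier, while different table IDs yield different ones.
#[test]
fn partition_identifier_equality_follows_table_id() {
    // Same table ID plus the fixed "arbitrary" key -> the same deterministic hash ID.
    assert_eq!(partition_identifier(1), partition_identifier(1));
    // A different table ID -> a different identifier.
    assert_ne!(partition_identifier(1), partition_identifier(2));
}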

View File

@ -25,7 +25,7 @@ use generated_types::influxdata::iox::{
},
};
use hyper::{Body, Request, Response};
use ingester::{IngesterGuard, IngesterRpcInterface};
use ingester::{GossipConfig, IngesterGuard, IngesterRpcInterface};
use iox_catalog::interface::Catalog;
use iox_query::exec::Executor;
use ioxd_common::{
@ -210,6 +210,14 @@ pub async fn create_ingester_server_type(
) -> Result<Arc<dyn ServerType>> {
let (shutdown_tx, shutdown_rx) = oneshot::channel();
let gossip = match ingester_config.gossip_config.gossip_bind_address {
None => GossipConfig::Disabled,
Some(v) => GossipConfig::Enabled {
bind_addr: v.into(),
peers: ingester_config.gossip_config.seed_list.clone(),
},
};
let grpc = ingester::new(
catalog,
Arc::clone(&metrics),
@ -221,6 +229,7 @@ pub async fn create_ingester_server_type(
ingester_config.persist_queue_depth,
ingester_config.persist_hot_partition_cost,
object_store,
gossip,
shutdown_rx.map(|v| v.expect("shutdown sender dropped without calling shutdown")),
)
.await?;

View File

@ -10,6 +10,7 @@ async-trait = "0.1"
authz = { path = "../authz" }
clap_blocks = { path = "../clap_blocks" }
data_types = { path = "../data_types" }
gossip = { version = "0.1.0", path = "../gossip" }
hashbrown = { workspace = true }
hyper = "0.14"
iox_catalog = { path = "../iox_catalog" }

View File

@ -10,7 +10,9 @@
missing_debug_implementations,
unused_crate_dependencies
)]
#![allow(clippy::default_constructed_unit_structs)]
use gossip::NopDispatcher;
// Workaround for "unused crate" lint false positives.
use workspace_hack as _;
@ -21,7 +23,7 @@ use std::{
use async_trait::async_trait;
use authz::{Authorizer, AuthorizerInstrumentation, IoxAuthorizer};
use clap_blocks::router::RouterConfig;
use clap_blocks::{gossip::GossipConfig, router::RouterConfig};
use data_types::NamespaceName;
use hashbrown::HashMap;
use hyper::{Body, Request, Response};
@ -86,6 +88,10 @@ pub enum Error {
source: Box<dyn std::error::Error>,
addr: String,
},
/// An error binding the UDP socket for gossip communication.
#[error("failed to bind udp gossip socket: {0}")]
GossipBind(std::io::Error),
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
@ -218,6 +224,7 @@ pub async fn create_router_server_type(
catalog: Arc<dyn Catalog>,
object_store: Arc<DynObjectStore>,
router_config: &RouterConfig,
gossip_config: &GossipConfig,
trace_context_header_name: String,
) -> Result<Arc<dyn ServerType>> {
let ingester_connections = router_config.ingester_addresses.iter().map(|addr| {
@ -333,6 +340,28 @@ pub async fn create_router_server_type(
// Record the overall request handling latency
let handler_stack = InstrumentationDecorator::new("request", &metrics, handler_stack);
// Optionally initialise the gossip subsystem.
//
// NOTE: the handle is completely unused, but needs to live as long as the
// server does to do anything useful (RAII), so it is placed in the
// RpcWriteRouterServer, which doesn't need it at all.
//
// TODO: remove handle from RpcWriteRouterServer when using handle
let gossip_handle = match gossip_config.gossip_bind_address {
Some(bind_addr) => {
let handle = gossip::Builder::new(
gossip_config.seed_list.clone(),
NopDispatcher::default(),
Arc::clone(&metrics),
)
.bind(*bind_addr)
.await
.map_err(Error::GossipBind)?;
Some(handle)
}
None => None,
};
// Initialize the HTTP API delegate
let write_request_unifier: Result<Box<dyn WriteRequestUnifier>> = match (
router_config.single_tenant_deployment,
@ -379,8 +408,13 @@ pub async fn create_router_server_type(
// `RpcWriteRouterServerType`.
let grpc = RpcWriteGrpcDelegate::new(catalog, object_store);
let router_server =
RpcWriteRouterServer::new(http, grpc, metrics, common_state.trace_collector());
let router_server = RpcWriteRouterServer::new(
http,
grpc,
metrics,
common_state.trace_collector(),
gossip_handle,
);
let server_type = Arc::new(RpcWriteRouterServerType::new(router_server, common_state));
Ok(server_type)
}

View File

@ -108,7 +108,7 @@ impl From<&ParquetFile> for ParquetFilePath {
Self {
namespace_id: f.namespace_id,
table_id: f.table_id,
partition_id: f.transition_partition_id(),
partition_id: f.partition_id.clone(),
object_store_id: f.object_store_id,
}
}
@ -119,7 +119,7 @@ impl From<&ParquetFileParams> for ParquetFilePath {
Self {
namespace_id: f.namespace_id,
table_id: f.table_id,
partition_id: f.transition_partition_id(),
partition_id: f.partition_id.clone(),
object_store_id: f.object_store_id,
}
}

View File

@ -91,7 +91,7 @@ use bytes::Bytes;
use data_types::{
ColumnId, ColumnSet, ColumnSummary, CompactionLevel, InfluxDbType, NamespaceId,
ParquetFileParams, PartitionHashId, PartitionId, PartitionKey, StatValues, Statistics, TableId,
Timestamp,
Timestamp, TransitionPartitionId,
};
use generated_types::influxdata::iox::ingester::v1 as proto;
use iox_time::Time;
@ -443,6 +443,7 @@ impl IoxMetadata {
where
F: for<'a> Fn(&'a str) -> ColumnId,
{
let partition_id = TransitionPartitionId::from((partition_id, partition_hash_id.as_ref()));
let decoded = metadata.decode().expect("invalid IOx metadata");
trace!(
?partition_id,
@ -487,7 +488,6 @@ impl IoxMetadata {
namespace_id: self.namespace_id,
table_id: self.table_id,
partition_id,
partition_hash_id,
object_store_id: self.object_store_id,
min_time,
max_time,

View File

@ -113,11 +113,13 @@ impl CatalogCache {
"ram_metadata",
RamSize(ram_pool_metadata_bytes),
Arc::clone(&metric_registry),
&Handle::current(),
));
let ram_pool_data = Arc::new(ResourcePool::new(
"ram_data",
RamSize(ram_pool_data_bytes),
Arc::clone(&metric_registry),
&Handle::current(),
));
let partition_cache = PartitionCache::new(

View File

@ -361,8 +361,8 @@ mod tests {
partition.create_parquet_file(builder).await;
let table_id = table.table.id;
let single_file_size = 240;
let two_file_size = 448;
let single_file_size = 256;
let two_file_size = 480;
assert!(single_file_size < two_file_size);
let cache = make_cache(&catalog);

View File

@ -17,7 +17,7 @@ use cache_system::{
};
use data_types::{
partition_template::{build_column_values, ColumnValue},
ColumnId, Partition, PartitionId, TransitionPartitionId,
ColumnId, Partition, TransitionPartitionId,
};
use datafusion::scalar::ScalarValue;
use iox_catalog::{interface::Catalog, partition_lookup_batch};
@ -38,7 +38,7 @@ const CACHE_ID: &str = "partition";
type CacheT = Box<
dyn Cache<
K = PartitionId,
K = TransitionPartitionId,
V = Option<CachedPartition>,
GetExtra = (Arc<CachedTable>, Option<Span>),
PeekExtra = ((), Option<Span>),
@ -49,7 +49,7 @@ type CacheT = Box<
#[derive(Debug)]
pub struct PartitionCache {
cache: CacheT,
remove_if_handle: RemoveIfHandle<PartitionId, Option<CachedPartition>>,
remove_if_handle: RemoveIfHandle<TransitionPartitionId, Option<CachedPartition>>,
flusher: Arc<dyn BatchLoaderFlusher>,
}
@ -64,7 +64,8 @@ impl PartitionCache {
testing: bool,
) -> Self {
let loader = FunctionLoader::new(
move |partition_ids: Vec<PartitionId>, cached_tables: Vec<Arc<CachedTable>>| {
move |partition_ids: Vec<TransitionPartitionId>,
cached_tables: Vec<Arc<CachedTable>>| {
// sanity checks
assert_eq!(partition_ids.len(), cached_tables.len());
@ -75,23 +76,20 @@ impl PartitionCache {
// prepare output buffer
let mut out = (0..partition_ids.len()).map(|_| None).collect::<Vec<_>>();
let mut out_map =
HashMap::<PartitionId, usize>::with_capacity(partition_ids.len());
HashMap::<TransitionPartitionId, usize>::with_capacity(partition_ids.len());
for (idx, id) in partition_ids.iter().enumerate() {
match out_map.entry(*id) {
Entry::Occupied(_) => unreachable!("cache system requested same partition from loader concurrently, this should have been prevented by the CacheDriver"),
match out_map.entry(id.clone()) {
Entry::Occupied(_) => unreachable!(
"cache system requested same partition from loader concurrently, \
this should have been prevented by the CacheDriver"
),
Entry::Vacant(v) => {
v.insert(idx);
}
}
}
// build `&[&TransitionPartitionId]` for batch catalog request
let ids = partition_ids
.iter()
.copied()
.map(TransitionPartitionId::Deprecated)
.collect::<Vec<_>>();
let ids = ids.iter().collect::<Vec<_>>();
let ids: Vec<&TransitionPartitionId> = partition_ids.iter().collect();
// fetch catalog data
let partitions = Backoff::new(&backoff_config)
@ -104,7 +102,7 @@ impl PartitionCache {
// build output
for p in partitions {
let idx = out_map[&p.id];
let idx = out_map[&p.transition_partition_id()];
let cached_table = &cached_tables[idx];
let p = CachedPartition::new(p, cached_table);
out[idx] = Some(p);
@ -180,7 +178,7 @@ impl PartitionCache {
self.remove_if_handle.remove_if_and_get(
&self.cache,
partition_id,
partition_id.clone(),
move |cached_partition| {
let invalidates = if let Some(sort_key) =
&cached_partition.and_then(|p| p.sort_key)
@ -195,7 +193,7 @@ impl PartitionCache {
if invalidates {
debug!(
partition_id = partition_id.get(),
partition_id = %partition_id,
"invalidate partition cache",
);
}
@ -217,13 +215,13 @@ impl PartitionCache {
/// Request for [`PartitionCache::get`].
#[derive(Debug)]
pub struct PartitionRequest {
pub partition_id: PartitionId,
pub partition_id: TransitionPartitionId,
pub sort_key_should_cover: Vec<ColumnId>,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CachedPartition {
pub id: PartitionId,
pub id: TransitionPartitionId,
pub sort_key: Option<Arc<PartitionSortKey>>,
pub column_ranges: ColumnRanges,
}
@ -299,7 +297,7 @@ impl CachedPartition {
column_ranges.shrink_to_fit();
Self {
id: partition.id,
id: partition.transition_partition_id(),
sort_key,
column_ranges: Arc::new(column_ranges),
}
@ -368,7 +366,10 @@ mod tests {
ram::test_util::test_ram_pool, test_util::assert_catalog_access_metric_count,
};
use async_trait::async_trait;
use data_types::{partition_template::TablePartitionTemplateOverride, ColumnType};
use data_types::{
partition_template::TablePartitionTemplateOverride, ColumnType, PartitionHashId,
PartitionId, PartitionKey, TableId,
};
use futures::StreamExt;
use generated_types::influxdata::iox::partition_template::v1::{
template_part::Part, PartitionTemplate, TemplatePart,
@ -419,8 +420,11 @@ mod tests {
true,
);
let p1_id = p1.transition_partition_id();
let p2_id = p2.transition_partition_id();
let sort_key1a = cache
.get_one(Arc::clone(&cached_table), p1.id, &Vec::new(), None)
.get_one(Arc::clone(&cached_table), &p1_id, &Vec::new(), None)
.await
.unwrap()
.sort_key;
@ -434,24 +438,24 @@ mod tests {
);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
1,
);
let sort_key2 = cache
.get_one(Arc::clone(&cached_table), p2.id, &Vec::new(), None)
.get_one(Arc::clone(&cached_table), &p2_id, &Vec::new(), None)
.await
.unwrap()
.sort_key;
assert_eq!(sort_key2, None);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
2,
);
let sort_key1b = cache
.get_one(Arc::clone(&cached_table), p1.id, &Vec::new(), None)
.get_one(Arc::clone(&cached_table), &p1_id, &Vec::new(), None)
.await
.unwrap()
.sort_key;
@ -461,16 +465,37 @@ mod tests {
));
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
2,
);
// non-existing partition
for _ in 0..2 {
// Non-existing partition identified by partition hash ID
let res = cache
.get_one(
Arc::clone(&cached_table),
PartitionId::new(i64::MAX),
&TransitionPartitionId::Deterministic(PartitionHashId::new(
TableId::new(i64::MAX),
&PartitionKey::from("bananas_not_found"),
)),
&[],
None,
)
.await;
assert_eq!(res, None);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_hash_id_batch",
3,
);
// Non-existing partition identified by deprecated catalog IDs; this part can be
// removed when partition identification is fully transitioned to partition hash IDs
let res = cache
.get_one(
Arc::clone(&cached_table),
&TransitionPartitionId::Deprecated(PartitionId::new(i64::MAX)),
&Vec::new(),
None,
)
@ -479,7 +504,7 @@ mod tests {
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
3,
1,
);
}
}
@ -548,8 +573,14 @@ mod tests {
true,
);
let p1_id = p1.transition_partition_id();
let p2_id = p2.transition_partition_id();
let p3_id = p3.transition_partition_id();
let p4_id = p4.transition_partition_id();
let p5_id = p5.transition_partition_id();
let ranges1a = cache
.get_one(Arc::clone(&cached_table), p1.id, &[], None)
.get_one(Arc::clone(&cached_table), &p1_id, &[], None)
.await
.unwrap()
.column_ranges;
@ -578,12 +609,12 @@ mod tests {
));
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
1,
);
let ranges2 = cache
.get_one(Arc::clone(&cached_table), p2.id, &[], None)
.get_one(Arc::clone(&cached_table), &p2_id, &[], None)
.await
.unwrap()
.column_ranges;
@ -599,12 +630,12 @@ mod tests {
);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
2,
);
let ranges3 = cache
.get_one(Arc::clone(&cached_table), p3.id, &[], None)
.get_one(Arc::clone(&cached_table), &p3_id, &[], None)
.await
.unwrap()
.column_ranges;
@ -629,12 +660,12 @@ mod tests {
);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
3,
);
let ranges4 = cache
.get_one(Arc::clone(&cached_table), p4.id, &[], None)
.get_one(Arc::clone(&cached_table), &p4_id, &[], None)
.await
.unwrap()
.column_ranges;
@ -659,12 +690,12 @@ mod tests {
);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
4,
);
let ranges5 = cache
.get_one(Arc::clone(&cached_table), p5.id, &[], None)
.get_one(Arc::clone(&cached_table), &p5_id, &[], None)
.await
.unwrap()
.column_ranges;
@ -680,28 +711,48 @@ mod tests {
);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
5,
);
let ranges1b = cache
.get_one(Arc::clone(&cached_table), p1.id, &[], None)
.get_one(Arc::clone(&cached_table), &p1_id, &[], None)
.await
.unwrap()
.column_ranges;
assert!(Arc::ptr_eq(&ranges1a, &ranges1b));
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
5,
);
// non-existing partition
for _ in 0..2 {
// Non-existing partition identified by partition hash ID
let res = cache
.get_one(
Arc::clone(&cached_table),
PartitionId::new(i64::MAX),
&TransitionPartitionId::Deterministic(PartitionHashId::new(
TableId::new(i64::MAX),
&PartitionKey::from("bananas_not_found"),
)),
&[],
None,
)
.await;
assert_eq!(res, None);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_hash_id_batch",
6,
);
// Non-existing partition identified by deprecated catalog IDs; this part can be
// removed when partition identification is fully transitioned to partition hash IDs
let res = cache
.get_one(
Arc::clone(&cached_table),
&TransitionPartitionId::Deprecated(PartitionId::new(i64::MAX)),
&[],
None,
)
@ -710,7 +761,7 @@ mod tests {
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
6,
1,
);
}
}
@ -724,7 +775,7 @@ mod tests {
let c1 = t.create_column("foo", ColumnType::Tag).await;
let c2 = t.create_column("time", ColumnType::Time).await;
let p = t.create_partition("k1").await;
let p_id = p.partition.id;
let p_id = p.partition.transition_partition_id();
let p_sort_key = p.partition.sort_key();
let cached_table = Arc::new(CachedTable {
id: t.table.id,
@ -751,41 +802,41 @@ mod tests {
);
let sort_key = cache
.get_one(Arc::clone(&cached_table), p_id, &[], None)
.get_one(Arc::clone(&cached_table), &p_id, &[], None)
.await
.unwrap()
.sort_key;
assert_eq!(sort_key, None,);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
1,
);
// requesting nothing will not expire
assert!(p_sort_key.is_none());
let sort_key = cache
.get_one(Arc::clone(&cached_table), p_id, &[], None)
.get_one(Arc::clone(&cached_table), &p_id, &[], None)
.await
.unwrap()
.sort_key;
assert_eq!(sort_key, None,);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
1,
);
// but requesting something will expire
let sort_key = cache
.get_one(Arc::clone(&cached_table), p_id, &[c1.column.id], None)
.get_one(Arc::clone(&cached_table), &p_id, &[c1.column.id], None)
.await
.unwrap()
.sort_key;
assert_eq!(sort_key, None,);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
2,
);
@ -801,7 +852,7 @@ mod tests {
// expire & fetch
let p_sort_key = p.partition.sort_key();
let sort_key = cache
.get_one(Arc::clone(&cached_table), p_id, &[c1.column.id], None)
.get_one(Arc::clone(&cached_table), &p_id, &[c1.column.id], None)
.await
.unwrap()
.sort_key;
@ -815,7 +866,7 @@ mod tests {
);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
3,
);
@ -827,7 +878,7 @@ mod tests {
vec![c1.column.id, c2.column.id],
] {
let sort_key_2 = cache
.get_one(Arc::clone(&cached_table), p_id, &should_cover, None)
.get_one(Arc::clone(&cached_table), &p_id, &should_cover, None)
.await
.unwrap()
.sort_key;
@ -837,7 +888,7 @@ mod tests {
));
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
3,
);
}
@ -847,7 +898,7 @@ mod tests {
let sort_key_2 = cache
.get_one(
Arc::clone(&cached_table),
p_id,
&p_id,
&[c1.column.id, c3.column.id],
None,
)
@ -861,7 +912,7 @@ mod tests {
assert_eq!(sort_key, sort_key_2);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
4,
);
}
@ -892,34 +943,45 @@ mod tests {
true,
);
let p1_id = p1.transition_partition_id();
let p2_id = p2.transition_partition_id();
let mut res = cache
.get(
Arc::clone(&cached_table),
vec![
PartitionRequest {
partition_id: p1.id,
partition_id: p1_id.clone(),
sort_key_should_cover: vec![],
},
PartitionRequest {
partition_id: p2.id,
partition_id: p2_id.clone(),
sort_key_should_cover: vec![],
},
PartitionRequest {
partition_id: p1.id,
partition_id: p1_id.clone(),
sort_key_should_cover: vec![],
},
// requesting non-existing partitions is fine, they just don't appear in
// the output
PartitionRequest {
partition_id: TransitionPartitionId::Deprecated(PartitionId::new(i64::MAX)),
sort_key_should_cover: vec![],
},
PartitionRequest {
// requesting non-existing partitions is fine, they just don't appear in the output
partition_id: PartitionId::new(i64::MAX),
partition_id: TransitionPartitionId::Deterministic(PartitionHashId::new(
TableId::new(i64::MAX),
&PartitionKey::from("bananas_not_found"),
)),
sort_key_should_cover: vec![],
},
],
None,
)
.await;
res.sort_by_key(|p| p.id);
let ids = res.iter().map(|p| p.id).collect::<Vec<_>>();
assert_eq!(ids, vec![p1.id, p1.id, p2.id]);
res.sort_by(|a, b| a.id.cmp(&b.id));
let ids = res.into_iter().map(|p| p.id).collect::<Vec<_>>();
assert_eq!(ids, vec![p1_id.clone(), p1_id, p2_id]);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
@ -1008,7 +1070,7 @@ mod tests {
c_id: ColumnId,
/// Partitions within that table.
partitions: Vec<PartitionId>,
partitions: Vec<TransitionPartitionId>,
}
impl ConcurrencyTestState {
@ -1032,7 +1094,7 @@ mod tests {
t.create_partition_with_sort_key(&format!("p{i}"), &["time"])
.await
.partition
.id
.transition_partition_id()
}
})
.collect::<Vec<_>>()
@ -1046,7 +1108,8 @@ mod tests {
}
}
/// Perform the actual [`PartitionCache::get`] call and run some basic sanity checks on the result.
/// Perform the actual [`PartitionCache::get`] call and run some basic sanity checks on the
/// result.
async fn run(self, cache: Arc<PartitionCache>) {
let Self {
cached_table,
@ -1060,15 +1123,15 @@ mod tests {
partitions
.iter()
.map(|p| PartitionRequest {
partition_id: *p,
partition_id: p.clone(),
sort_key_should_cover: vec![],
})
.collect(),
None,
)
.await;
results.sort_by_key(|p| p.id);
let partitions_res = results.iter().map(|p| p.id).collect::<Vec<_>>();
results.sort_by(|a, b| a.id.cmp(&b.id));
let partitions_res = results.iter().map(|p| p.id.clone()).collect::<Vec<_>>();
assert_eq!(partitions, partitions_res);
assert!(results
.iter()
@ -1086,7 +1149,7 @@ mod tests {
async fn get_one(
&self,
cached_table: Arc<CachedTable>,
partition_id: PartitionId,
partition_id: &TransitionPartitionId,
sort_key_should_cover: &[ColumnId],
span: Option<Span>,
) -> Option<CachedPartition>;
@ -1097,14 +1160,14 @@ mod tests {
async fn get_one(
&self,
cached_table: Arc<CachedTable>,
partition_id: PartitionId,
partition_id: &TransitionPartitionId,
sort_key_should_cover: &[ColumnId],
span: Option<Span>,
) -> Option<CachedPartition> {
self.get(
cached_table,
vec![PartitionRequest {
partition_id,
partition_id: partition_id.clone(),
sort_key_should_cover: sort_key_should_cover.to_vec(),
}],
span,

View File

@ -43,12 +43,14 @@ pub mod test_util {
use std::sync::Arc;
use cache_system::backend::policy::lru::ResourcePool;
use tokio::runtime::Handle;
pub fn test_ram_pool() -> Arc<ResourcePool<RamSize>> {
Arc::new(ResourcePool::new(
"pool",
RamSize(usize::MAX),
Arc::new(metric::Registry::new()),
&Handle::current(),
))
}
}

View File

@ -859,10 +859,6 @@ impl IngesterPartition {
}
}
pub(crate) fn partition_id(&self) -> PartitionId {
self.partition_id
}
pub(crate) fn transition_partition_id(&self) -> TransitionPartitionId {
TransitionPartitionId::from((self.partition_id, self.partition_hash_id.as_ref()))
}

View File

@ -1,6 +1,6 @@
use std::{collections::HashMap, sync::Arc};
use data_types::{ChunkId, ChunkOrder, ColumnId, ParquetFile, PartitionId, TransitionPartitionId};
use data_types::{ChunkId, ChunkOrder, ColumnId, ParquetFile, TransitionPartitionId};
use futures::StreamExt;
use hashbrown::HashSet;
use iox_catalog::interface::Catalog;
@ -56,7 +56,7 @@ impl ChunkAdapter {
&self,
cached_table: Arc<CachedTable>,
files: Arc<[Arc<ParquetFile>]>,
cached_partitions: &HashMap<PartitionId, CachedPartition>,
cached_partitions: &HashMap<TransitionPartitionId, CachedPartition>,
span: Option<Span>,
) -> Vec<QuerierParquetChunk> {
let span_recorder = SpanRecorder::new(span);
@ -170,18 +170,13 @@ impl ChunkAdapter {
let order = ChunkOrder::new(parquet_file.file.max_l0_created_at.get());
let partition_id = parquet_file.file.partition_id;
let transition_partition_id = TransitionPartitionId::from((
partition_id,
parquet_file.file.partition_hash_id.as_ref(),
));
let partition_id = parquet_file.file.partition_id.clone();
let meta = Arc::new(QuerierParquetChunkMeta {
chunk_id,
order,
sort_key: Some(sort_key),
partition_id,
transition_partition_id,
});
let parquet_chunk = Arc::new(ParquetChunk::new(

View File

@ -1,6 +1,6 @@
//! Querier Chunks
use data_types::{ChunkId, ChunkOrder, PartitionId, TransitionPartitionId};
use data_types::{ChunkId, ChunkOrder, TransitionPartitionId};
use datafusion::physical_plan::Statistics;
use iox_query::chunk_statistics::{create_chunk_statistics, ColumnRanges};
use parquet_file::chunk::ParquetChunk;
@ -25,10 +25,7 @@ pub struct QuerierParquetChunkMeta {
sort_key: Option<SortKey>,
/// Partition ID.
partition_id: PartitionId,
/// Transition partition ID.
transition_partition_id: TransitionPartitionId,
partition_id: TransitionPartitionId,
}
impl QuerierParquetChunkMeta {
@ -43,13 +40,8 @@ impl QuerierParquetChunkMeta {
}
/// Partition ID.
pub fn partition_id(&self) -> PartitionId {
self.partition_id
}
/// Partition ID.
pub fn transition_partition_id(&self) -> &TransitionPartitionId {
&self.transition_partition_id
pub fn partition_id(&self) -> &TransitionPartitionId {
&self.partition_id
}
}
@ -251,7 +243,7 @@ pub mod tests {
.get(
Arc::clone(&self.cached_table),
vec![PartitionRequest {
partition_id: self.parquet_file.partition_id,
partition_id: self.parquet_file.partition_id.clone(),
sort_key_should_cover: vec![],
}],
None,
@ -261,7 +253,7 @@ pub mod tests {
.next()
.unwrap();
let cached_partitions =
HashMap::from([(self.parquet_file.partition_id, cached_partition)]);
HashMap::from([(self.parquet_file.partition_id.clone(), cached_partition)]);
self.adapter
.new_chunks(
Arc::clone(&self.cached_table),

View File

@ -15,11 +15,11 @@ impl QueryChunk for QuerierParquetChunk {
}
fn partition_id(&self) -> PartitionId {
self.meta().partition_id()
unimplemented!()
}
fn transition_partition_id(&self) -> &TransitionPartitionId {
self.meta().transition_partition_id()
self.meta().partition_id()
}
fn sort_key(&self) -> Option<&SortKey> {

View File

@ -8,7 +8,7 @@ use crate::{
parquet::ChunkAdapter,
IngesterConnection,
};
use data_types::{ColumnId, NamespaceId, ParquetFile, PartitionId, TableId};
use data_types::{ColumnId, NamespaceId, ParquetFile, TableId, TransitionPartitionId};
use datafusion::error::DataFusionError;
use futures::join;
use iox_query::{provider, provider::ChunkPruner, QueryChunk};
@ -282,7 +282,7 @@ impl QuerierTable {
let chunks = partitions
.into_iter()
.filter_map(|mut c| {
let cached_partition = cached_partitions.get(&c.partition_id())?;
let cached_partition = cached_partitions.get(&c.transition_partition_id())?;
c.set_partition_column_ranges(&cached_partition.column_ranges);
Some(c)
})
@ -322,16 +322,16 @@ impl QuerierTable {
ingester_partitions: &[IngesterPartition],
parquet_files: &[Arc<ParquetFile>],
span: Option<Span>,
) -> HashMap<PartitionId, CachedPartition> {
) -> HashMap<TransitionPartitionId, CachedPartition> {
let span_recorder = SpanRecorder::new(span);
let mut should_cover: HashMap<PartitionId, HashSet<ColumnId>> =
let mut should_cover: HashMap<TransitionPartitionId, HashSet<ColumnId>> =
HashMap::with_capacity(ingester_partitions.len());
// For ingester partitions we only need the column ranges -- which are static -- not the sort key. So it is
// sufficient to collect the partition IDs.
for p in ingester_partitions {
should_cover.entry(p.partition_id()).or_default();
should_cover.entry(p.transition_partition_id()).or_default();
}
// For parquet files we must ensure that the -- potentially evolving -- sort key covers the primary key.
@ -342,7 +342,7 @@ impl QuerierTable {
.collect::<HashSet<_>>();
for f in parquet_files {
should_cover
.entry(f.partition_id)
.entry(f.partition_id.clone())
.or_default()
.extend(f.column_set.iter().copied().filter(|id| pk.contains(id)));
}
@ -366,7 +366,7 @@ impl QuerierTable {
)
.await;
partitions.into_iter().map(|p| (p.id, p)).collect()
partitions.into_iter().map(|p| (p.id.clone(), p)).collect()
}
/// Get a chunk pruner that can be used to prune chunks retrieved via [`chunks`](Self::chunks)
@ -889,7 +889,7 @@ mod tests {
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 4);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
1,
);
assert_cache_access_metric_count(&catalog.metric_registry, "partition", 2);
@ -899,7 +899,7 @@ mod tests {
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 4);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
1,
);
assert_cache_access_metric_count(&catalog.metric_registry, "partition", 4);
@ -912,7 +912,7 @@ mod tests {
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 5);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
1,
);
@ -922,7 +922,7 @@ mod tests {
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 5);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
1,
);
assert_cache_access_metric_count(&catalog.metric_registry, "partition", 6);
@ -936,7 +936,7 @@ mod tests {
assert_catalog_access_metric_count(&catalog.metric_registry, "partition_get_by_id", 5);
assert_catalog_access_metric_count(
&catalog.metric_registry,
"partition_get_by_id_batch",
"partition_get_by_hash_id_batch",
2,
);
assert_cache_access_metric_count(&catalog.metric_registry, "partition", 8);

View File

@ -15,6 +15,7 @@ dml = { path = "../dml" }
flate2 = "1.0"
futures = "0.3.28"
generated_types = { path = "../generated_types" }
gossip = { version = "0.1.0", path = "../gossip" }
hashbrown = { workspace = true }
hyper = "0.14"
iox_catalog = { path = "../iox_catalog" }

View File

@ -16,6 +16,9 @@ pub struct RpcWriteRouterServer<D, N> {
http: HttpDelegate<D, N>,
grpc: RpcWriteGrpcDelegate,
// TODO: this shouldn't be here but it is here while it's unused elsewhere
_gossip_handle: Option<gossip::GossipHandle>,
}
impl<D, N> RpcWriteRouterServer<D, N> {
@ -26,12 +29,14 @@ impl<D, N> RpcWriteRouterServer<D, N> {
grpc: RpcWriteGrpcDelegate,
metrics: Arc<metric::Registry>,
trace_collector: Option<Arc<dyn TraceCollector>>,
gossip_handle: Option<gossip::GossipHandle>,
) -> Self {
Self {
metrics,
trace_collector,
http,
grpc,
_gossip_handle: gossip_handle,
}
}

View File

@ -18,7 +18,7 @@
// Workaround for "unused crate" lint false positives.
use workspace_hack as _;
use data_types::{PartitionId, TableId, TransitionPartitionId};
use data_types::{PartitionHashId, PartitionId, TableId, TransitionPartitionId};
use generated_types::influxdata::iox::catalog::v1::*;
use iox_catalog::interface::{Catalog, SoftDeletedRows};
use observability_deps::tracing::*;
@ -47,14 +47,14 @@ impl catalog_service_server::CatalogService for CatalogService {
) -> Result<Response<GetParquetFilesByPartitionIdResponse>, Status> {
let mut repos = self.catalog.repositories().await;
let req = request.into_inner();
let partition_id = TransitionPartitionId::Deprecated(PartitionId::new(req.partition_id));
let partition_id = to_partition_id(req.partition_identifier)?;
let parquet_files = repos
.parquet_files()
.list_by_partition_not_to_delete(&partition_id)
.await
.map_err(|e| {
warn!(error=%e, %req.partition_id, "failed to get parquet_files for partition");
warn!(error=%e, %partition_id, "failed to get parquet_files for partition");
Status::not_found(e.to_string())
})?;
@ -169,13 +169,52 @@ impl catalog_service_server::CatalogService for CatalogService {
}
}
fn to_partition_identifier(partition_id: &TransitionPartitionId) -> PartitionIdentifier {
match partition_id {
TransitionPartitionId::Deterministic(hash_id) => PartitionIdentifier {
id: Some(partition_identifier::Id::HashId(
hash_id.as_bytes().to_owned(),
)),
},
TransitionPartitionId::Deprecated(id) => PartitionIdentifier {
id: Some(partition_identifier::Id::CatalogId(id.get())),
},
}
}
fn to_partition_id(
partition_identifier: Option<PartitionIdentifier>,
) -> Result<TransitionPartitionId, Status> {
let partition_id =
match partition_identifier
.and_then(|pi| pi.id)
.ok_or(Status::invalid_argument(
"No partition identifier specified",
))? {
partition_identifier::Id::HashId(bytes) => TransitionPartitionId::Deterministic(
PartitionHashId::try_from(&bytes[..]).map_err(|e| {
Status::invalid_argument(format!(
"Could not parse bytes as a `PartitionHashId`: {e}"
))
})?,
),
partition_identifier::Id::CatalogId(id) => {
TransitionPartitionId::Deprecated(PartitionId::new(id))
}
};
Ok(partition_id)
}
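A round-trip sketch (hypothetical test code, assuming the two helpers above are in scope): converting an identifier to its protobuf form and parsing it back should return an equal value; the hash-ID variant follows the same path via PartitionHashId::try_from.
#[test]
fn partition_identifier_round_trips_through_protobuf() {
    // The deprecated catalog-ID variant survives the conversion unchanged.
    let id = TransitionPartitionId::Deprecated(PartitionId::new(42));
    let proto = to_partition_identifier(&id);
    assert_eq!(to_partition_id(Some(proto)).unwrap(), id);
}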
// converts the catalog ParquetFile to protobuf
fn to_parquet_file(p: data_types::ParquetFile) -> ParquetFile {
let partition_identifier = to_partition_identifier(&p.partition_id);
ParquetFile {
id: p.id.get(),
namespace_id: p.namespace_id.get(),
table_id: p.table_id.get(),
partition_id: p.partition_id.get(),
partition_identifier: Some(partition_identifier),
object_store_id: p.object_store_id.to_string(),
min_time: p.min_time.get(),
max_time: p.max_time.get(),
@ -191,8 +230,10 @@ fn to_parquet_file(p: data_types::ParquetFile) -> ParquetFile {
// converts the catalog Partition to protobuf
fn to_partition(p: data_types::Partition) -> Partition {
let identifier = to_partition_identifier(&p.transition_partition_id());
Partition {
id: p.id.get(),
identifier: Some(identifier),
key: p.partition_key.to_string(),
table_id: p.table_id.get(),
array_sort_key: p.sort_key,
@ -230,8 +271,7 @@ mod tests {
let p1params = ParquetFileParams {
namespace_id: namespace.id,
table_id: table.id,
partition_id: partition.id,
partition_hash_id: partition.hash_id().cloned(),
partition_id: partition.transition_partition_id(),
object_store_id: Uuid::new_v4(),
min_time: Timestamp::new(1),
max_time: Timestamp::new(5),
@ -248,13 +288,15 @@ mod tests {
};
p1 = repos.parquet_files().create(p1params).await.unwrap();
p2 = repos.parquet_files().create(p2params).await.unwrap();
partition_id = partition.id;
partition_id = partition.transition_partition_id();
Arc::clone(&catalog)
};
let partition_identifier = to_partition_identifier(&partition_id);
let grpc = super::CatalogService::new(catalog);
let request = GetParquetFilesByPartitionIdRequest {
partition_id: partition_id.get(),
partition_identifier: Some(partition_identifier),
};
let tonic_response = grpc

View File

@ -75,7 +75,7 @@ impl object_store_service_server::ObjectStoreService for ObjectStoreService {
let path = ParquetFilePath::new(
parquet_file.namespace_id,
parquet_file.table_id,
&parquet_file.transition_partition_id(),
&parquet_file.partition_id.clone(),
parquet_file.object_store_id,
);
let path = path.object_store_path();
@ -128,8 +128,7 @@ mod tests {
let p1params = ParquetFileParams {
namespace_id: namespace.id,
table_id: table.id,
partition_id: partition.id,
partition_hash_id: partition.hash_id().cloned(),
partition_id: partition.transition_partition_id(),
object_store_id: Uuid::new_v4(),
min_time: Timestamp::new(1),
max_time: Timestamp::new(5),
@ -150,7 +149,7 @@ mod tests {
let path = ParquetFilePath::new(
p1.namespace_id,
p1.table_id,
&p1.transition_partition_id(),
&p1.partition_id.clone(),
p1.object_store_id,
);
let path = path.object_store_path();

View File

@ -25,3 +25,4 @@ sysinfo = "0.29.7"
tempfile = "3.7.0"
# Need the multi-threaded executor for testing
tokio = { version = "1.29", features = ["macros", "parking_lot", "rt-multi-thread", "time"] }
test_helpers = { path = "../test_helpers", features = ["future_timeout"] }

View File

@ -1,7 +1,10 @@
use std::{borrow::Cow, path::PathBuf, time::Duration};
use std::borrow::Cow;
use std::path::PathBuf;
use std::time::Duration;
use metric::{Attributes, U64Gauge};
use sysinfo::{DiskExt, RefreshKind, System, SystemExt};
use tokio::sync::watch;
/// The interval at which disk metrics are updated.
///
@ -9,6 +12,32 @@ use sysinfo::{DiskExt, RefreshKind, System, SystemExt};
/// interval.
const UPDATE_INTERVAL: Duration = Duration::from_secs(13);
/// An immutable snapshot of space and usage statistics for some disk.
#[derive(Clone, Copy, Debug)]
pub struct DiskSpaceSnapshot {
available_disk_space: u64,
total_disk_space: u64,
}
impl DiskSpaceSnapshot {
/// The available space in bytes on the disk.
pub fn available_disk_space(&self) -> u64 {
self.available_disk_space
}
/// The maximum capacity in bytes of the disk.
pub fn total_disk_space(&self) -> u64 {
self.total_disk_space
}
/// Overall usage of the disk, as a percentage [0.0, 1.0].
#[inline]
pub fn disk_usage_ratio(&self) -> f64 {
debug_assert!(self.available_disk_space <= self.total_disk_space);
1.0 - (self.available_disk_space as f64 / self.total_disk_space as f64)
}
}
/// A periodic reporter of disk capacity / free statistics for a given
/// directory.
#[derive(Debug)]
@ -22,12 +51,19 @@ pub struct DiskSpaceMetrics {
/// The index into [`System::disks()`] for the disk containing the observed
/// directory.
disk_idx: usize,
/// A stream of [`DiskSpaceSnapshot`] produced by the metric reporter for
/// consumption by any listeners.
snapshot_tx: watch::Sender<DiskSpaceSnapshot>,
}
impl DiskSpaceMetrics {
/// Create a new [`DiskSpaceMetrics`], returning [`None`] if no disk can be
/// found for the specified `directory`.
pub fn new(directory: PathBuf, registry: &metric::Registry) -> Option<Self> {
pub fn new(
directory: PathBuf,
registry: &metric::Registry,
) -> Option<(Self, watch::Receiver<DiskSpaceSnapshot>)> {
let path: Cow<'static, str> = Cow::from(directory.display().to_string());
let mut directory = directory.canonicalize().ok()?;
@ -52,14 +88,14 @@ impl DiskSpaceMetrics {
// Resolve the mount point once.
// The directory path may be `/path/to/dir` and the mount point is `/`.
let disk_idx = loop {
if let Some((idx, _disk)) = system
let (disk_idx, initial_disk) = loop {
if let Some((idx, disk)) = system
.disks()
.iter()
.enumerate()
.find(|(_idx, disk)| disk.mount_point() == directory)
{
break idx;
break (idx, disk);
}
// The mount point for this directory could not be found.
if !directory.pop() {
@ -67,18 +103,26 @@ impl DiskSpaceMetrics {
}
};
Some(Self {
available_disk_space,
total_disk_space,
system,
disk_idx,
})
let (snapshot_tx, snapshot_rx) = watch::channel(DiskSpaceSnapshot {
available_disk_space: initial_disk.available_space(),
total_disk_space: initial_disk.total_space(),
});
Some((
Self {
available_disk_space,
total_disk_space,
system,
disk_idx,
snapshot_tx,
},
snapshot_rx,
))
}
/// Start the [`DiskSpaceMetrics`] evaluation loop, blocking forever.
pub async fn run(mut self) {
let mut interval = tokio::time::interval(UPDATE_INTERVAL);
loop {
interval.tick().await;
@ -93,6 +137,13 @@ impl DiskSpaceMetrics {
self.available_disk_space.set(disk.available_space());
self.total_disk_space.set(disk.total_space());
// Produce and send a [`DiskSpaceSnapshot`] for any listeners
// that might exist.
_ = self.snapshot_tx.send(DiskSpaceSnapshot {
available_disk_space: disk.available_space(),
total_disk_space: disk.total_space(),
});
}
}
}
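A consumer sketch (hypothetical caller code, not part of this change) showing how the returned watch receiver might be used alongside the metric loop; the directory, the spawning layout, and the 0.9 threshold are illustrative assumptions only.
async fn watch_disk_space() {
    let registry = metric::Registry::new();
    let (metrics, mut snapshot_rx) = DiskSpaceMetrics::new(std::path::PathBuf::from("/"), &registry)
        .expect("a mount point for '/' should resolve");
    // Run the reporter in the background; keep the receiver for snapshots.
    tokio::spawn(metrics.run());
    while snapshot_rx.changed().await.is_ok() {
        // `DiskSpaceSnapshot` is `Copy`, so the watch borrow can be dereferenced.
        let snapshot = *snapshot_rx.borrow();
        if snapshot.disk_usage_ratio() > 0.9 {
            // React to low free space here (illustrative policy only).
        }
    }
}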
@ -103,6 +154,7 @@ mod tests {
use metric::Metric;
use tempfile::tempdir_in;
use test_helpers::timeout::FutureTimeout;
use super::*;
@ -121,11 +173,9 @@ mod tests {
let registry = Arc::new(metric::Registry::new());
let _handle = tokio::spawn(
DiskSpaceMetrics::new(pathbuf, &registry)
.expect("root always exists")
.run(),
);
let (_handle, mut snapshot_rx) =
DiskSpaceMetrics::new(pathbuf, &registry).expect("root always exists");
let _handle = tokio::spawn(_handle.run());
// Wait for the metric to be emitted and non-zero - this should be very
// quick!
@ -151,10 +201,45 @@ mod tests {
.fetch();
if recorded_free_metric > 0 && recorded_total_metric > 0 {
snapshot_rx
.changed()
.with_timeout_panic(Duration::from_secs(5))
.await
.expect("snapshot value should have changed");
let snapshot = *snapshot_rx.borrow();
assert_eq!(snapshot.available_disk_space, recorded_free_metric);
assert_eq!(snapshot.total_disk_space, recorded_total_metric);
return;
}
tokio::time::sleep(Duration::from_millis(50)).await;
}
}
// Token test to assert disk usage ratio
#[test]
fn assert_disk_usage_ratio() {
// 80% used
let snapshot = DiskSpaceSnapshot {
available_disk_space: 2000,
total_disk_space: 10000,
};
assert_eq!(snapshot.disk_usage_ratio(), 0.8);
// 90% used
let snapshot = DiskSpaceSnapshot {
available_disk_space: 2000,
total_disk_space: 20000,
};
assert_eq!(snapshot.disk_usage_ratio(), 0.9);
// Free!
let snapshot = DiskSpaceSnapshot {
available_disk_space: 42,
total_disk_space: 42,
};
assert_eq!(snapshot.disk_usage_ratio(), 0.0);
}
}