Merge pull request #6568 from influxdata/dom/shutdown-persist
feat(ingester2): persist on shutdownpull/24376/head
commit
7e3bb25815
|
@ -267,6 +267,14 @@ mod kafkaless_rpc_write {
|
||||||
|
|
||||||
// Restart the ingester and ensure it gets a new UUID
|
// Restart the ingester and ensure it gets a new UUID
|
||||||
cluster.restart_ingester().await;
|
cluster.restart_ingester().await;
|
||||||
|
|
||||||
|
// Populate the ingester with some data so it returns a successful
|
||||||
|
// response containing the UUID.
|
||||||
|
let lp = format!("{},tag1=A,tag2=B val=42i 123456", table_name);
|
||||||
|
let response = cluster.write_to_router(lp).await;
|
||||||
|
assert_eq!(response.status(), StatusCode::NO_CONTENT);
|
||||||
|
|
||||||
|
// Query for the new UUID and assert it has changed.
|
||||||
let mut performed_query = querier_flight.do_get(query).await.unwrap().into_inner();
|
let mut performed_query = querier_flight.do_get(query).await.unwrap().into_inner();
|
||||||
let (msg, app_metadata) = next_message(&mut performed_query).await.unwrap();
|
let (msg, app_metadata) = next_message(&mut performed_query).await.unwrap();
|
||||||
assert!(matches!(msg, DecodedPayload::None), "{:?}", msg);
|
assert!(matches!(msg, DecodedPayload::None), "{:?}", msg);
|
||||||
|
|
|
@ -2,11 +2,13 @@ crate::maybe_pub!(
|
||||||
pub use super::wal_replay::*;
|
pub use super::wal_replay::*;
|
||||||
);
|
);
|
||||||
|
|
||||||
|
mod graceful_shutdown;
|
||||||
mod wal_replay;
|
mod wal_replay;
|
||||||
|
|
||||||
use std::{path::PathBuf, sync::Arc, time::Duration};
|
use std::{path::PathBuf, sync::Arc, time::Duration};
|
||||||
|
|
||||||
use backoff::BackoffConfig;
|
use backoff::BackoffConfig;
|
||||||
|
use futures::{future::Shared, Future, FutureExt};
|
||||||
use generated_types::influxdata::iox::{
|
use generated_types::influxdata::iox::{
|
||||||
catalog::v1::catalog_service_server::{CatalogService, CatalogServiceServer},
|
catalog::v1::catalog_service_server::{CatalogService, CatalogServiceServer},
|
||||||
ingester::v1::write_service_server::{WriteService, WriteServiceServer},
|
ingester::v1::write_service_server::{WriteService, WriteServiceServer},
|
||||||
|
@ -14,8 +16,10 @@ use generated_types::influxdata::iox::{
|
||||||
use iox_arrow_flight::flight_service_server::{FlightService, FlightServiceServer};
|
use iox_arrow_flight::flight_service_server::{FlightService, FlightServiceServer};
|
||||||
use iox_catalog::interface::Catalog;
|
use iox_catalog::interface::Catalog;
|
||||||
use iox_query::exec::Executor;
|
use iox_query::exec::Executor;
|
||||||
|
use observability_deps::tracing::*;
|
||||||
use parquet_file::storage::ParquetStorage;
|
use parquet_file::storage::ParquetStorage;
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
|
use tokio::sync::oneshot;
|
||||||
use wal::Wal;
|
use wal::Wal;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
|
@ -33,6 +37,8 @@ use crate::{
|
||||||
TRANSITION_SHARD_INDEX,
|
TRANSITION_SHARD_INDEX,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
use self::graceful_shutdown::graceful_shutdown_handler;
|
||||||
|
|
||||||
/// Acquire opaque handles to the Ingester RPC service implementations.
|
/// Acquire opaque handles to the Ingester RPC service implementations.
|
||||||
///
|
///
|
||||||
/// This trait serves as the public crate API boundary - callers external to the
|
/// This trait serves as the public crate API boundary - callers external to the
|
||||||
|
@ -77,18 +83,34 @@ pub struct IngesterGuard<T> {
|
||||||
///
|
///
|
||||||
/// Aborted on drop.
|
/// Aborted on drop.
|
||||||
rotation_task: tokio::task::JoinHandle<()>,
|
rotation_task: tokio::task::JoinHandle<()>,
|
||||||
|
|
||||||
|
/// The task handle executing the graceful shutdown once triggered.
|
||||||
|
graceful_shutdown_handler: tokio::task::JoinHandle<()>,
|
||||||
|
shutdown_complete: Shared<oneshot::Receiver<()>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T> IngesterGuard<T> {
|
impl<T> IngesterGuard<T>
|
||||||
|
where
|
||||||
|
T: Send + Sync,
|
||||||
|
{
|
||||||
/// Obtain a handle to the gRPC handlers.
|
/// Obtain a handle to the gRPC handlers.
|
||||||
pub fn rpc(&self) -> &T {
|
pub fn rpc(&self) -> &T {
|
||||||
&self.rpc
|
&self.rpc
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Block and wait until the ingester has gracefully stopped.
|
||||||
|
pub async fn join(&self) {
|
||||||
|
self.shutdown_complete
|
||||||
|
.clone()
|
||||||
|
.await
|
||||||
|
.expect("graceful shutdown task panicked")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T> Drop for IngesterGuard<T> {
|
impl<T> Drop for IngesterGuard<T> {
|
||||||
fn drop(&mut self) {
|
fn drop(&mut self) {
|
||||||
self.rotation_task.abort();
|
self.rotation_task.abort();
|
||||||
|
self.graceful_shutdown_handler.abort();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -148,8 +170,21 @@ pub enum InitError {
|
||||||
/// value should be tuned to be slightly less than the interval between persist
|
/// value should be tuned to be slightly less than the interval between persist
|
||||||
/// operations, but not so long that it causes catalog load spikes at persist
|
/// operations, but not so long that it causes catalog load spikes at persist
|
||||||
/// time (which can be observed by the catalog instrumentation metrics).
|
/// time (which can be observed by the catalog instrumentation metrics).
|
||||||
|
///
|
||||||
|
/// ## Graceful Shutdown
|
||||||
|
///
|
||||||
|
/// When `shutdown` completes, the ingester blocks ingest (returning an error to
|
||||||
|
/// all new write requests) while still executing query requests. The ingester
|
||||||
|
/// then persists all data currently buffered.
|
||||||
|
///
|
||||||
|
/// Callers can wait for this buffer persist to complete by awaiting
|
||||||
|
/// [`IngesterGuard::join()`], which will resolve once all data has been flushed
|
||||||
|
/// to object storage.
|
||||||
|
///
|
||||||
|
/// The ingester will continue answering queries until the gRPC server is
|
||||||
|
/// stopped by the caller (managed outside of this crate).
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
pub async fn new(
|
pub async fn new<F>(
|
||||||
catalog: Arc<dyn Catalog>,
|
catalog: Arc<dyn Catalog>,
|
||||||
metrics: Arc<metric::Registry>,
|
metrics: Arc<metric::Registry>,
|
||||||
persist_background_fetch_time: Duration,
|
persist_background_fetch_time: Duration,
|
||||||
|
@ -160,7 +195,11 @@ pub async fn new(
|
||||||
persist_queue_depth: usize,
|
persist_queue_depth: usize,
|
||||||
persist_hot_partition_cost: usize,
|
persist_hot_partition_cost: usize,
|
||||||
object_store: ParquetStorage,
|
object_store: ParquetStorage,
|
||||||
) -> Result<IngesterGuard<impl IngesterRpcInterface>, InitError> {
|
shutdown: F,
|
||||||
|
) -> Result<IngesterGuard<impl IngesterRpcInterface>, InitError>
|
||||||
|
where
|
||||||
|
F: Future<Output = ()> + Send + 'static,
|
||||||
|
{
|
||||||
// Create the transition shard.
|
// Create the transition shard.
|
||||||
let mut txn = catalog
|
let mut txn = catalog
|
||||||
.start_transaction()
|
.start_transaction()
|
||||||
|
@ -269,11 +308,11 @@ pub async fn new(
|
||||||
let write_path = WalSink::new(Arc::clone(&buffer), Arc::clone(&wal));
|
let write_path = WalSink::new(Arc::clone(&buffer), Arc::clone(&wal));
|
||||||
|
|
||||||
// Spawn a background thread to periodically rotate the WAL segment file.
|
// Spawn a background thread to periodically rotate the WAL segment file.
|
||||||
let handle = tokio::spawn(periodic_rotation(
|
let rotation_task = tokio::spawn(periodic_rotation(
|
||||||
wal,
|
Arc::clone(&wal),
|
||||||
wal_rotation_period,
|
wal_rotation_period,
|
||||||
Arc::clone(&buffer),
|
Arc::clone(&buffer),
|
||||||
persist_handle,
|
Arc::clone(&persist_handle),
|
||||||
));
|
));
|
||||||
|
|
||||||
// Restore the highest sequence number from the WAL files, and default to 0
|
// Restore the highest sequence number from the WAL files, and default to 0
|
||||||
|
@ -288,6 +327,16 @@ pub async fn new(
|
||||||
.unwrap_or(0),
|
.unwrap_or(0),
|
||||||
));
|
));
|
||||||
|
|
||||||
|
let (shutdown_tx, shutdown_rx) = oneshot::channel();
|
||||||
|
let shutdown_task = tokio::spawn(graceful_shutdown_handler(
|
||||||
|
shutdown,
|
||||||
|
shutdown_tx,
|
||||||
|
Arc::clone(&ingest_state),
|
||||||
|
Arc::clone(&buffer),
|
||||||
|
persist_handle,
|
||||||
|
wal,
|
||||||
|
));
|
||||||
|
|
||||||
Ok(IngesterGuard {
|
Ok(IngesterGuard {
|
||||||
rpc: GrpcDelegate::new(
|
rpc: GrpcDelegate::new(
|
||||||
Arc::new(write_path),
|
Arc::new(write_path),
|
||||||
|
@ -297,6 +346,8 @@ pub async fn new(
|
||||||
catalog,
|
catalog,
|
||||||
metrics,
|
metrics,
|
||||||
),
|
),
|
||||||
rotation_task: handle,
|
rotation_task,
|
||||||
|
graceful_shutdown_handler: shutdown_task,
|
||||||
|
shutdown_complete: shutdown_rx.shared(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,372 @@
|
||||||
|
use std::{sync::Arc, time::Duration};
|
||||||
|
|
||||||
|
use futures::Future;
|
||||||
|
use observability_deps::tracing::*;
|
||||||
|
use tokio::sync::oneshot;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
ingest_state::{IngestState, IngestStateError},
|
||||||
|
partition_iter::PartitionIter,
|
||||||
|
persist::{drain_buffer::persist_partitions, queue::PersistQueue},
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Defines how often the shutdown task polls the partition buffers for
|
||||||
|
/// emptiness.
|
||||||
|
///
|
||||||
|
/// Polls faster in tests to avoid unnecessary delay.
|
||||||
|
#[cfg(test)]
|
||||||
|
const SHUTDOWN_POLL_INTERVAL: Duration = Duration::from_millis(50);
|
||||||
|
#[cfg(not(test))]
|
||||||
|
const SHUTDOWN_POLL_INTERVAL: Duration = Duration::from_secs(1);
|
||||||
|
|
||||||
|
/// Awaits `fut`, before blocking ingest and persisting all data.
|
||||||
|
///
|
||||||
|
/// Returns once all outstanding persist jobs have completed (regardless of what
|
||||||
|
/// started them) and all buffered data has been flushed to object store.
|
||||||
|
///
|
||||||
|
/// Correctly accounts for persist jobs that have been started (by a call to
|
||||||
|
/// [`PartitionData::mark_persisting()`] but not yet enqueued).
|
||||||
|
///
|
||||||
|
/// Ingest is blocked by setting [`IngestStateError::GracefulStop`] in the
|
||||||
|
/// [`IngestState`].
|
||||||
|
///
|
||||||
|
/// [`PartitionData::mark_persisting()`]:
|
||||||
|
/// crate::buffer_tree::partition::PartitionData::mark_persisting()
|
||||||
|
pub(super) async fn graceful_shutdown_handler<F, T, P>(
|
||||||
|
fut: F,
|
||||||
|
complete: oneshot::Sender<()>,
|
||||||
|
ingest_state: Arc<IngestState>,
|
||||||
|
buffer: T,
|
||||||
|
persist: P,
|
||||||
|
wal: Arc<wal::Wal>,
|
||||||
|
) where
|
||||||
|
F: Future<Output = ()> + Send,
|
||||||
|
T: PartitionIter + Sync,
|
||||||
|
P: PersistQueue + Clone,
|
||||||
|
{
|
||||||
|
fut.await;
|
||||||
|
info!("gracefully stopping ingester");
|
||||||
|
|
||||||
|
// Reject RPC writes.
|
||||||
|
//
|
||||||
|
// There MAY be writes ongoing that started before this state was set.
|
||||||
|
ingest_state.set(IngestStateError::GracefulStop);
|
||||||
|
|
||||||
|
info!("persisting all data before shutdown");
|
||||||
|
|
||||||
|
// Drain the buffer tree, persisting all data.
|
||||||
|
//
|
||||||
|
// Returns once the persist jobs it starts have complete.
|
||||||
|
persist_partitions(buffer.partition_iter(), &persist).await;
|
||||||
|
|
||||||
|
// There may have been concurrent persist jobs started previously by hot
|
||||||
|
// partition persistence or WAL rotation (or some other, arbitrary persist
|
||||||
|
// source) that have not yet completed (this is unlikely). There may also be
|
||||||
|
// late arriving writes that started before ingest was blocked, but did not
|
||||||
|
// buffer until after the persist was completed above (also unlikely).
|
||||||
|
//
|
||||||
|
// Wait until there is no data in the buffer at all before proceeding,
|
||||||
|
// therefore ensuring those concurrent persist operations have completed and
|
||||||
|
// no late arriving data remains buffered.
|
||||||
|
//
|
||||||
|
// NOTE: There is a small race in which a late arriving write starts before
|
||||||
|
// ingest is blocked, is then stalled the entire time partitions are
|
||||||
|
// persisted, remains stalled while this "empty" check occurs, and then
|
||||||
|
// springs to life and buffers in the buffer tree after this check has
|
||||||
|
// completed - I think this is extreme enough to accept as a theoretical
|
||||||
|
// possibility that doesn't need covering off in practice.
|
||||||
|
while buffer
|
||||||
|
.partition_iter()
|
||||||
|
.any(|p| p.lock().get_query_data().is_some())
|
||||||
|
{
|
||||||
|
if persist_partitions(buffer.partition_iter(), &persist).await != 0 {
|
||||||
|
// Late arriving writes needed persisting.
|
||||||
|
debug!("re-persisting late arriving data");
|
||||||
|
} else {
|
||||||
|
// At least one partition is returning data, and there is no data to
|
||||||
|
// start persisting, therefore there is an outstanding persist
|
||||||
|
// operation that hasn't yet been marked as complete.
|
||||||
|
debug!("waiting for concurrent persist to complete");
|
||||||
|
}
|
||||||
|
|
||||||
|
tokio::time::sleep(SHUTDOWN_POLL_INTERVAL).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
// There is now no data buffered in the ingester - all data has been
|
||||||
|
// persisted to object storage.
|
||||||
|
//
|
||||||
|
// Therefore there are no ops that need replaying to rebuild the (now empty)
|
||||||
|
// buffer state, therefore all WAL segments can be deleted to prevent
|
||||||
|
// spurious replay and re-uploading of the same data.
|
||||||
|
//
|
||||||
|
// This should be made redundant by persist-driven WAL dropping:
|
||||||
|
//
|
||||||
|
// https://github.com/influxdata/influxdb_iox/issues/6566
|
||||||
|
//
|
||||||
|
wal.rotate().expect("failed to rotate wal");
|
||||||
|
for file in wal.closed_segments() {
|
||||||
|
if let Err(error) = wal.delete(file.id()).await {
|
||||||
|
// This MAY occur due to concurrent segment deletion driven by the
|
||||||
|
// WAL rotation task.
|
||||||
|
//
|
||||||
|
// If this is a legitimate failure to delete (not a "not found")
|
||||||
|
// then this causes the data to be re-uploaded - an acceptable
|
||||||
|
// outcome, and preferable to panicking here and not dropping the
|
||||||
|
// rest of the deletable files.
|
||||||
|
warn!(%error, "failed to drop WAL segment");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("persisted all data - stopping ingester");
|
||||||
|
let _ = complete.send(());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use std::{future::ready, sync::Arc, task::Poll};
|
||||||
|
|
||||||
|
use assert_matches::assert_matches;
|
||||||
|
use data_types::{NamespaceId, PartitionId, PartitionKey, SequenceNumber, ShardId, TableId};
|
||||||
|
use futures::FutureExt;
|
||||||
|
use lazy_static::lazy_static;
|
||||||
|
use mutable_batch_lp::test_helpers::lp_to_mutable_batch;
|
||||||
|
use parking_lot::Mutex;
|
||||||
|
use test_helpers::timeout::FutureTimeout;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
buffer_tree::{
|
||||||
|
namespace::NamespaceName, partition::PartitionData, partition::SortKeyState,
|
||||||
|
table::TableName,
|
||||||
|
},
|
||||||
|
deferred_load::DeferredLoad,
|
||||||
|
persist::queue::mock::MockPersistQueue,
|
||||||
|
};
|
||||||
|
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
const PARTITION_ID: PartitionId = PartitionId::new(1);
|
||||||
|
const TRANSITION_SHARD_ID: ShardId = ShardId::new(84);
|
||||||
|
|
||||||
|
lazy_static! {
|
||||||
|
static ref PARTITION_KEY: PartitionKey = PartitionKey::from("platanos");
|
||||||
|
static ref TABLE_NAME: TableName = TableName::from("bananas");
|
||||||
|
static ref NAMESPACE_NAME: NamespaceName = NamespaceName::from("namespace-bananas");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initialise a partition containing buffered data.
|
||||||
|
fn new_partition() -> Arc<Mutex<PartitionData>> {
|
||||||
|
let mut partition = PartitionData::new(
|
||||||
|
PARTITION_ID,
|
||||||
|
PARTITION_KEY.clone(),
|
||||||
|
NamespaceId::new(3),
|
||||||
|
Arc::new(DeferredLoad::new(Duration::from_secs(1), async {
|
||||||
|
NAMESPACE_NAME.clone()
|
||||||
|
})),
|
||||||
|
TableId::new(4),
|
||||||
|
Arc::new(DeferredLoad::new(Duration::from_secs(1), async {
|
||||||
|
TABLE_NAME.clone()
|
||||||
|
})),
|
||||||
|
SortKeyState::Provided(None),
|
||||||
|
TRANSITION_SHARD_ID,
|
||||||
|
);
|
||||||
|
|
||||||
|
let mb = lp_to_mutable_batch(r#"bananas,city=London people=2,pigeons="millions" 10"#).1;
|
||||||
|
partition
|
||||||
|
.buffer_write(mb, SequenceNumber::new(1))
|
||||||
|
.expect("failed to write dummy data");
|
||||||
|
|
||||||
|
Arc::new(Mutex::new(partition))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initialise a WAL with > 1 segment.
|
||||||
|
async fn new_wal() -> (tempfile::TempDir, Arc<wal::Wal>) {
|
||||||
|
let dir = tempfile::tempdir().expect("failed to get temporary WAL directory");
|
||||||
|
let wal = wal::Wal::new(dir.path())
|
||||||
|
.await
|
||||||
|
.expect("failed to initialise WAL to write");
|
||||||
|
|
||||||
|
wal.rotate().expect("failed to rotate WAL");
|
||||||
|
|
||||||
|
(dir, wal)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_graceful_shutdown() {
|
||||||
|
let persist = Arc::new(MockPersistQueue::default());
|
||||||
|
let ingest_state = Arc::new(IngestState::default());
|
||||||
|
let (_tempdir, wal) = new_wal().await;
|
||||||
|
let partition = new_partition();
|
||||||
|
|
||||||
|
let (tx, rx) = oneshot::channel();
|
||||||
|
graceful_shutdown_handler(
|
||||||
|
ready(()),
|
||||||
|
tx,
|
||||||
|
ingest_state,
|
||||||
|
vec![Arc::clone(&partition)],
|
||||||
|
Arc::clone(&persist),
|
||||||
|
Arc::clone(&wal),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
// Wait for the shutdown to complete.
|
||||||
|
rx.with_timeout_panic(Duration::from_secs(5))
|
||||||
|
.await
|
||||||
|
.expect("shutdown task panicked");
|
||||||
|
|
||||||
|
// Assert the data was persisted
|
||||||
|
let persist_calls = persist.calls();
|
||||||
|
assert_matches!(&*persist_calls, [p] => {
|
||||||
|
assert!(Arc::ptr_eq(p, &partition));
|
||||||
|
});
|
||||||
|
|
||||||
|
// Assert there are now no WAL segment files that will be replayed
|
||||||
|
assert!(wal.closed_segments().is_empty());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_graceful_shutdown_concurrent_persist() {
|
||||||
|
let persist = Arc::new(MockPersistQueue::default());
|
||||||
|
let ingest_state = Arc::new(IngestState::default());
|
||||||
|
let (_tempdir, wal) = new_wal().await;
|
||||||
|
let partition = new_partition();
|
||||||
|
|
||||||
|
// Mark the partition as persisting
|
||||||
|
let persist_job = partition
|
||||||
|
.lock()
|
||||||
|
.mark_persisting()
|
||||||
|
.expect("non-empty partition should begin persisting");
|
||||||
|
|
||||||
|
// Start the graceful shutdown job in another thread, as it SHOULD block
|
||||||
|
// until the persist job is marked as complete.
|
||||||
|
let (tx, rx) = oneshot::channel();
|
||||||
|
let handle = tokio::spawn(graceful_shutdown_handler(
|
||||||
|
ready(()),
|
||||||
|
tx,
|
||||||
|
ingest_state,
|
||||||
|
vec![Arc::clone(&partition)],
|
||||||
|
Arc::clone(&persist),
|
||||||
|
Arc::clone(&wal),
|
||||||
|
));
|
||||||
|
|
||||||
|
// Wait a small duration of time for the first buffer emptiness check to
|
||||||
|
// fire.
|
||||||
|
tokio::time::sleep(Duration::from_millis(200)).await;
|
||||||
|
|
||||||
|
// Assert the shutdown hasn't completed.
|
||||||
|
//
|
||||||
|
// This is racy, but will fail false negative and will not flake in CI.
|
||||||
|
// If this fails in CI, it is a legitimate bug (shutdown task should not
|
||||||
|
// have stopped).
|
||||||
|
let rx = rx.shared();
|
||||||
|
assert_matches!(futures::poll!(rx.clone()), Poll::Pending);
|
||||||
|
|
||||||
|
// Mark the persist job as having completed, unblocking the shutdown
|
||||||
|
// task.
|
||||||
|
partition.lock().mark_persisted(persist_job);
|
||||||
|
|
||||||
|
// Wait for the shutdown to complete.
|
||||||
|
rx.with_timeout_panic(Duration::from_secs(5))
|
||||||
|
.await
|
||||||
|
.expect("shutdown task panicked");
|
||||||
|
|
||||||
|
assert!(handle
|
||||||
|
.with_timeout_panic(Duration::from_secs(1))
|
||||||
|
.await
|
||||||
|
.is_ok());
|
||||||
|
|
||||||
|
// Assert the data was not passed to the persist task (it couldn't have
|
||||||
|
// been, as this caller held the PersistData)
|
||||||
|
assert!(persist.calls().is_empty());
|
||||||
|
|
||||||
|
// Assert there are now no WAL segment files that will be replayed
|
||||||
|
assert!(wal.closed_segments().is_empty());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// An implementation of [`PartitionIter`] that yields an extra new,
|
||||||
|
/// non-empty partition each time [`PartitionIter::partition_iter()`] is
|
||||||
|
/// called.
|
||||||
|
#[derive(Debug)]
|
||||||
|
struct SneakyPartitionBuffer {
|
||||||
|
max: usize,
|
||||||
|
partitions: Mutex<Vec<Arc<Mutex<PartitionData>>>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SneakyPartitionBuffer {
|
||||||
|
fn new(max: usize) -> Self {
|
||||||
|
Self {
|
||||||
|
max,
|
||||||
|
partitions: Default::default(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn partitions(&self) -> Vec<Arc<Mutex<PartitionData>>> {
|
||||||
|
self.partitions.lock().clone()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PartitionIter for SneakyPartitionBuffer {
|
||||||
|
fn partition_iter(&self) -> Box<dyn Iterator<Item = Arc<Mutex<PartitionData>>> + Send> {
|
||||||
|
let mut partitions = self.partitions.lock();
|
||||||
|
|
||||||
|
// If this hasn't reached the maximum number of times to be sneaky,
|
||||||
|
// add another partition.
|
||||||
|
if partitions.len() != self.max {
|
||||||
|
partitions.push(new_partition());
|
||||||
|
}
|
||||||
|
|
||||||
|
Box::new(partitions.clone().into_iter())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_graceful_shutdown_concurrent_new_writes() {
|
||||||
|
let persist = Arc::new(MockPersistQueue::default());
|
||||||
|
let ingest_state = Arc::new(IngestState::default());
|
||||||
|
let (_tempdir, wal) = new_wal().await;
|
||||||
|
|
||||||
|
// Initialise a buffer that keeps yielding more and more newly wrote
|
||||||
|
// data, up until the maximum.
|
||||||
|
const MAX_NEW_PARTITIONS: usize = 3;
|
||||||
|
let buffer = Arc::new(SneakyPartitionBuffer::new(MAX_NEW_PARTITIONS));
|
||||||
|
|
||||||
|
// Start the graceful shutdown job in another thread, as it SHOULD block
|
||||||
|
// until the persist job is marked as complete.
|
||||||
|
let (tx, rx) = oneshot::channel();
|
||||||
|
let handle = tokio::spawn(graceful_shutdown_handler(
|
||||||
|
ready(()),
|
||||||
|
tx,
|
||||||
|
ingest_state,
|
||||||
|
Arc::clone(&buffer),
|
||||||
|
Arc::clone(&persist),
|
||||||
|
Arc::clone(&wal),
|
||||||
|
));
|
||||||
|
|
||||||
|
// Wait for the shutdown to complete.
|
||||||
|
rx.with_timeout_panic(Duration::from_secs(5))
|
||||||
|
.await
|
||||||
|
.expect("shutdown task panicked");
|
||||||
|
|
||||||
|
assert!(handle
|
||||||
|
.with_timeout_panic(Duration::from_secs(1))
|
||||||
|
.await
|
||||||
|
.is_ok());
|
||||||
|
|
||||||
|
// Assert all the data yielded by the sneaky buffer was passed to the
|
||||||
|
// persist task.
|
||||||
|
let persist_calls = persist.calls();
|
||||||
|
let must_have_persisted = |p: &Arc<Mutex<PartitionData>>| {
|
||||||
|
for call in &persist_calls {
|
||||||
|
if Arc::ptr_eq(call, p) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
false
|
||||||
|
};
|
||||||
|
if !buffer.partitions().iter().all(must_have_persisted) {
|
||||||
|
panic!("at least one sneaky buffer was not passed to the persist system");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Assert there are now no WAL segment files that will be replayed
|
||||||
|
assert!(wal.closed_segments().is_empty());
|
||||||
|
}
|
||||||
|
}
|
|
@ -130,7 +130,7 @@ where
|
||||||
);
|
);
|
||||||
|
|
||||||
// Persist all the data that was replayed from the WAL segment.
|
// Persist all the data that was replayed from the WAL segment.
|
||||||
persist_partitions(sink.partition_iter(), persist.clone()).await;
|
persist_partitions(sink.partition_iter(), &persist).await;
|
||||||
|
|
||||||
// Drop the newly persisted data - it should not be replayed.
|
// Drop the newly persisted data - it should not be replayed.
|
||||||
wal.delete(file.id())
|
wal.delete(file.id())
|
||||||
|
|
|
@ -18,3 +18,9 @@ where
|
||||||
(**self).partition_iter()
|
(**self).partition_iter()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl PartitionIter for Vec<Arc<Mutex<PartitionData>>> {
|
||||||
|
fn partition_iter(&self) -> Box<dyn Iterator<Item = Arc<Mutex<PartitionData>>> + Send> {
|
||||||
|
Box::new(self.clone().into_iter())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -14,12 +14,12 @@ use super::queue::PersistQueue;
|
||||||
const PERSIST_ENQUEUE_CONCURRENCY: usize = 5;
|
const PERSIST_ENQUEUE_CONCURRENCY: usize = 5;
|
||||||
|
|
||||||
// Persist a set of [`PartitionData`], blocking for completion of all enqueued
|
// Persist a set of [`PartitionData`], blocking for completion of all enqueued
|
||||||
// persist jobs.
|
// persist jobs and returning the number of partitions that were persisted.
|
||||||
//
|
//
|
||||||
// This call is not atomic, partitions are marked for persistence incrementally.
|
// This call is not atomic, partitions are marked for persistence incrementally.
|
||||||
// Writes that landed into the partition buffer after this call, but before the
|
// Writes that landed into the partition buffer after this call, but before the
|
||||||
// partition data is read will be included in the persisted data.
|
// partition data is read will be included in the persisted data.
|
||||||
pub(crate) async fn persist_partitions<T, P>(iter: T, persist: P)
|
pub(crate) async fn persist_partitions<T, P>(iter: T, persist: &P) -> usize
|
||||||
where
|
where
|
||||||
T: Iterator<Item = Arc<Mutex<PartitionData>>> + Send,
|
T: Iterator<Item = Arc<Mutex<PartitionData>>> + Send,
|
||||||
P: PersistQueue + Clone,
|
P: PersistQueue + Clone,
|
||||||
|
@ -70,8 +70,12 @@ where
|
||||||
"queued all non-empty partitions for persist"
|
"queued all non-empty partitions for persist"
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let count = notifications.len();
|
||||||
|
|
||||||
// Wait for all the persist completion notifications.
|
// Wait for all the persist completion notifications.
|
||||||
for n in notifications {
|
for n in notifications {
|
||||||
n.await.expect("persist worker task panic");
|
n.await.expect("persist worker task panic");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
count
|
||||||
}
|
}
|
||||||
|
|
|
@ -119,7 +119,7 @@ use crate::{
|
||||||
/// crate::ingest_state::IngestStateError::PersistSaturated
|
/// crate::ingest_state::IngestStateError::PersistSaturated
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub(crate) struct PersistHandle {
|
pub(crate) struct PersistHandle {
|
||||||
/// THe state/dependencies shared across all worker tasks.
|
/// The state/dependencies shared across all worker tasks.
|
||||||
worker_state: Arc<SharedWorkerState>,
|
worker_state: Arc<SharedWorkerState>,
|
||||||
|
|
||||||
/// Task handles for the worker tasks, aborted on drop of all
|
/// Task handles for the worker tasks, aborted on drop of all
|
||||||
|
|
|
@ -68,6 +68,8 @@ pub(crate) async fn periodic_rotation<T, P>(
|
||||||
// special code path between "hot partition persist" and "wal rotation
|
// special code path between "hot partition persist" and "wal rotation
|
||||||
// persist" - it all works the same way!
|
// persist" - it all works the same way!
|
||||||
//
|
//
|
||||||
|
// https://github.com/influxdata/influxdb_iox/issues/6566
|
||||||
|
//
|
||||||
// TODO: this properly as described above.
|
// TODO: this properly as described above.
|
||||||
|
|
||||||
tokio::time::sleep(Duration::from_secs(5)).await;
|
tokio::time::sleep(Duration::from_secs(5)).await;
|
||||||
|
@ -86,7 +88,7 @@ pub(crate) async fn periodic_rotation<T, P>(
|
||||||
// - a small price to pay for not having to block ingest while the WAL
|
// - a small price to pay for not having to block ingest while the WAL
|
||||||
// is rotated, all outstanding writes + queries complete, and all then
|
// is rotated, all outstanding writes + queries complete, and all then
|
||||||
// partitions are marked as persisting.
|
// partitions are marked as persisting.
|
||||||
persist_partitions(buffer.partition_iter(), persist.clone()).await;
|
persist_partitions(buffer.partition_iter(), &persist).await;
|
||||||
|
|
||||||
debug!(
|
debug!(
|
||||||
closed_id = %stats.id(),
|
closed_id = %stats.id(),
|
||||||
|
|
|
@ -45,10 +45,11 @@ impl<I: IngesterRpcInterface> IngesterServerType<I> {
|
||||||
metrics: Arc<Registry>,
|
metrics: Arc<Registry>,
|
||||||
common_state: &CommonServerState,
|
common_state: &CommonServerState,
|
||||||
max_simultaneous_queries: usize,
|
max_simultaneous_queries: usize,
|
||||||
|
shutdown: CancellationToken,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
Self {
|
Self {
|
||||||
server,
|
server,
|
||||||
shutdown: CancellationToken::new(),
|
shutdown,
|
||||||
metrics,
|
metrics,
|
||||||
trace_collector: common_state.trace_collector(),
|
trace_collector: common_state.trace_collector(),
|
||||||
max_simultaneous_queries,
|
max_simultaneous_queries,
|
||||||
|
@ -101,7 +102,7 @@ impl<I: IngesterRpcInterface + Sync + Send + Debug + 'static> ServerType for Ing
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn join(self: Arc<Self>) {
|
async fn join(self: Arc<Self>) {
|
||||||
self.shutdown.cancelled().await;
|
self.server.join().await;
|
||||||
}
|
}
|
||||||
|
|
||||||
fn shutdown(&self) {
|
fn shutdown(&self) {
|
||||||
|
@ -148,6 +149,8 @@ pub async fn create_ingester_server_type(
|
||||||
exec: Arc<Executor>,
|
exec: Arc<Executor>,
|
||||||
object_store: ParquetStorage,
|
object_store: ParquetStorage,
|
||||||
) -> Result<Arc<dyn ServerType>> {
|
) -> Result<Arc<dyn ServerType>> {
|
||||||
|
let shutdown = CancellationToken::new();
|
||||||
|
|
||||||
let grpc = ingester2::new(
|
let grpc = ingester2::new(
|
||||||
catalog,
|
catalog,
|
||||||
Arc::clone(&metrics),
|
Arc::clone(&metrics),
|
||||||
|
@ -159,6 +162,10 @@ pub async fn create_ingester_server_type(
|
||||||
ingester_config.persist_queue_depth,
|
ingester_config.persist_queue_depth,
|
||||||
ingester_config.persist_hot_partition_cost,
|
ingester_config.persist_hot_partition_cost,
|
||||||
object_store,
|
object_store,
|
||||||
|
{
|
||||||
|
let shutdown = shutdown.clone();
|
||||||
|
async move { shutdown.cancelled().await }
|
||||||
|
},
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
|
@ -167,5 +174,6 @@ pub async fn create_ingester_server_type(
|
||||||
metrics,
|
metrics,
|
||||||
common_state,
|
common_state,
|
||||||
ingester_config.concurrent_query_limit,
|
ingester_config.concurrent_query_limit,
|
||||||
|
shutdown,
|
||||||
)))
|
)))
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue