chore(garbage collector): backoff first connect to objectstore (#7652)

* chore(garbage collector): backoff first connect to objectstore

This PR replaces the initial sleep added in #7562 with an exponential backoff
retry when connecting to the object store. This avoids waiting out the
configured sleep, which can be quite long, in a world where the service is
getting redeployed often.
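
For intuition, here is a minimal, self-contained sketch of the retry-with-exponential-backoff idea. The PR itself uses the workspace backoff crate (wired up in the diff below); try_connect and the delay constants here are hypothetical, for illustration only.

```rust
use std::time::Duration;
use tokio::time::sleep;

// Hypothetical stand-in for the first object store list/connect call.
async fn try_connect(attempt: u32) -> Result<(), &'static str> {
    if attempt < 3 {
        Err("object store not reachable yet")
    } else {
        Ok(())
    }
}

#[tokio::main]
async fn main() {
    // Start with a short delay and double it after every failure, up to a cap,
    // instead of unconditionally sleeping for the full configured interval.
    let mut delay = Duration::from_millis(100);
    let cap = Duration::from_secs(30);
    let mut attempt = 0;
    loop {
        match try_connect(attempt).await {
            Ok(()) => break,
            Err(e) => {
                println!("connect failed ({e}), retrying in {delay:?}");
                sleep(delay).await;
                delay = (delay * 2).min(cap);
                attempt += 1;
            }
        }
    }
    println!("connected after {attempt} failed attempts");
}
```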

* for #7562

* chore: rewrite lister::perform based on PR feedback

This commit redoes perform as a do..while-style loop, putting the call to list
from the object store at the top of the loop so the infinite backoff retry can
be used on each loop iteration - it might fail on more than just the first
time!

There are three selects because there are three wait stages, and each needs to
check for shutdown: the object store list, processing the list, and sleeping on
the poll interval.
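
As a rough, self-contained illustration of that pattern: each wait stage is raced against the CancellationToken with tokio::select!, so a shutdown request is noticed no matter which stage the worker is in. The 200ms trigger task and the one-hour sleep below are placeholders, not values from this PR.

```rust
use std::time::Duration;
use tokio::{select, time::sleep};
use tokio_util::sync::CancellationToken;

#[tokio::main]
async fn main() {
    let shutdown = CancellationToken::new();

    // Simulate an external shutdown request arriving a little later.
    let trigger = shutdown.clone();
    tokio::spawn(async move {
        sleep(Duration::from_millis(200)).await;
        trigger.cancel();
    });

    // One wait stage (here: the poll-interval sleep) raced against the token,
    // so shutdown never has to wait for the stage to finish on its own.
    select! {
        _ = sleep(Duration::from_secs(3600)) => {
            println!("poll interval elapsed, looping again");
        }
        _ = shutdown.cancelled() => {
            println!("shutdown requested, exiting the loop");
        }
    }
}
```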

* chore: hoist cancellation check higher; limit listing to 1000 files

Responding to PR feedback.
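
The per-pass cap uses StreamExt::take from the futures crate; here is a small sketch of the idea, with a numeric stream standing in as a hypothetical substitute for the object store listing.

```rust
use futures::{stream, StreamExt};

#[tokio::main]
async fn main() {
    // Hypothetical stand-in for the object store listing stream.
    let listing = stream::iter(0..5_000);

    // Cap one processing pass at 1000 items; the remainder is picked up on a
    // later pass, after the poll-interval sleep.
    let mut batch = listing.take(1000);
    let mut processed = 0usize;
    while let Some(_item) = batch.next().await {
        processed += 1;
    }
    assert_eq!(processed, 1000);
    println!("processed {processed} items this pass");
}
```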

* chore: add error info message

* chore: make build. :|

* chore: linter
Phil Bracikowski 2023-04-27 10:51:52 -07:00 committed by GitHub
parent 7ae92caf4b
commit 8609015821
4 changed files with 55 additions and 41 deletions

Cargo.lock (generated)

@@ -2035,6 +2035,7 @@ dependencies = [
name = "garbage_collector"
version = "0.1.0"
dependencies = [
"backoff",
"bytes",
"chrono",
"clap 4.2.4",

garbage_collector/Cargo.toml

@@ -13,6 +13,7 @@ data_types = { path = "../data_types" }
futures = "0.3"
humantime = "2.1.0"
iox_catalog = { path = "../iox_catalog" }
backoff = { path = "../backoff" }
object_store = { version = "0.5.6" }
observability_deps = { path = "../observability_deps" }
snafu = "0.7"

garbage_collector/src/lib.rs

@@ -30,7 +30,7 @@ use object_store::DynObjectStore;
use observability_deps::tracing::*;
use snafu::prelude::*;
use std::{fmt::Debug, sync::Arc};
use tokio::sync::mpsc;
use tokio::{select, sync::mpsc};
use tokio_util::sync::CancellationToken;
/// Logic for listing, checking and deleting files in object storage
@@ -96,12 +96,23 @@ impl GarbageCollector {
let (tx1, rx1) = mpsc::channel(BUFFER_SIZE);
let (tx2, rx2) = mpsc::channel(BUFFER_SIZE);
let os_lister = tokio::spawn(os_lister::perform(
shutdown.clone(),
Arc::clone(&object_store),
tx1,
sub_config.objectstore_sleep_interval_minutes,
));
let sdt = shutdown.clone();
let osa = Arc::clone(&object_store);
let os_lister = tokio::spawn(async move {
select! {
ret = os_lister::perform(
osa,
tx1,
sub_config.objectstore_sleep_interval_minutes,
) => {
ret
},
_ = sdt.cancelled() => {
Ok(())
},
}
});
let os_checker = tokio::spawn(os_checker::perform(
shutdown.clone(),
Arc::clone(&catalog),

os_lister module (object store lister)

@@ -1,52 +1,53 @@
use backoff::*;
use futures::prelude::*;
use futures::stream::BoxStream;
use object_store::{DynObjectStore, ObjectMeta};
use observability_deps::tracing::*;
use snafu::prelude::*;
use std::{sync::Arc, time::Duration};
use tokio::{select, sync::mpsc, time::sleep};
use tokio_util::sync::CancellationToken;
use tokio::{sync::mpsc, time::sleep};
/// perform a object store list, limiting to 1000 files per loop iteration, waiting sleep interval
/// per loop.
pub(crate) async fn perform(
shutdown: CancellationToken,
object_store: Arc<DynObjectStore>,
checker: mpsc::Sender<ObjectMeta>,
sleep_interval_minutes: u64,
) -> Result<()> {
// sleep poll interval to avoid issues with immediately polling the object store at startup
info!("object store polling will start in {sleep_interval_minutes} configured minutes");
sleep(Duration::from_secs(60 * sleep_interval_minutes)).await;
let mut items = object_store.list(None).await.context(ListingSnafu)?;
loop {
select! {
_ = shutdown.cancelled() => {
break
},
item = items.next() => {
match item {
Some(item) => {
let item = item.context(MalformedSnafu)?;
debug!(location = %item.location, "Object store item");
checker.send(item).await?;
}
None => {
// sleep for the configured time, then list again and go around the loop
// again
debug!("end of object store item list");
select! {
_ = shutdown.cancelled() => {
break;
}
_ = sleep(Duration::from_secs(60 * sleep_interval_minutes)) => {
items = object_store.list(None).await.context(ListingSnafu)?;
}
}
}
}
// backoff retry to avoid issues with immediately polling the object store at startup
let mut backoff = Backoff::new(&BackoffConfig::default());
let mut items = backoff
.retry_all_errors("list_os_files", || object_store.list(None))
.await
.expect("backoff retries forever");
// ignore the result, if it was successful, sleep; if there was an error, sleep still to
// make sure we don't loop onto the same error repeatedly
// (todo: maybe improve in the future based on error).
let ret = process_item_list(&mut items, &checker).await;
match ret {
Ok(_) => {}
Err(e) => {
info!("error processing items from object store, continuing: {e}")
}
}
}
sleep(Duration::from_secs(60 * sleep_interval_minutes)).await;
}
}
async fn process_item_list(
items: &mut BoxStream<'_, object_store::Result<ObjectMeta>>,
checker: &mpsc::Sender<ObjectMeta>,
) -> Result<()> {
while let Some(item) = items.take(1000).next().await {
let item = item.context(MalformedSnafu)?;
debug!(location = %item.location, "Object store item");
checker.send(item).await?;
}
debug!("end of object store item list");
Ok(())
}