diff --git a/Cargo.lock b/Cargo.lock index 755a359720..60270e2f66 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3909,6 +3909,7 @@ dependencies = [ name = "panic_logging" version = "0.1.0" dependencies = [ + "metric", "observability_deps", "workspace-hack", ] diff --git a/influxdb_iox/src/commands/run/all_in_one.rs b/influxdb_iox/src/commands/run/all_in_one.rs index 29e24247e2..787880286b 100644 --- a/influxdb_iox/src/commands/run/all_in_one.rs +++ b/influxdb_iox/src/commands/run/all_in_one.rs @@ -370,7 +370,7 @@ pub async fn command(config: Config) -> Result<()> { info!(?ingester_addresses, "starting querier"); let querier = create_querier_server_type( &common_state, - metrics, + Arc::clone(&metrics), catalog, object_store, time_provider, @@ -388,5 +388,5 @@ pub async fn command(config: Config) -> Result<()> { Service::create_grpc_only(querier, &querier_run_config), ]; - Ok(main::main(common_state, services).await?) + Ok(main::main(common_state, services, metrics).await?) } diff --git a/influxdb_iox/src/commands/run/compactor.rs b/influxdb_iox/src/commands/run/compactor.rs index 1fc8657561..04a6479f64 100644 --- a/influxdb_iox/src/commands/run/compactor.rs +++ b/influxdb_iox/src/commands/run/compactor.rs @@ -91,7 +91,7 @@ pub async fn command(config: Config) -> Result<(), Error> { let server_type = create_compactor_server_type( &common_state, - metric_registry, + Arc::clone(&metric_registry), catalog, object_store, exec, @@ -103,5 +103,5 @@ pub async fn command(config: Config) -> Result<(), Error> { info!("starting compactor"); let services = vec![Service::create(server_type, common_state.run_config())]; - Ok(main::main(common_state, services).await?) + Ok(main::main(common_state, services, metric_registry).await?) } diff --git a/influxdb_iox/src/commands/run/database.rs b/influxdb_iox/src/commands/run/database.rs index f57cb852ff..48db2dca59 100644 --- a/influxdb_iox/src/commands/run/database.rs +++ b/influxdb_iox/src/commands/run/database.rs @@ -130,5 +130,10 @@ pub async fn command(config: Config) -> Result<()> { )); let services = vec![Service::create(server_type, common_state.run_config())]; - Ok(main::main(common_state, services).await?) + Ok(main::main( + common_state, + services, + Arc::new(metric::Registry::default()), + ) + .await?) } diff --git a/influxdb_iox/src/commands/run/ingester.rs b/influxdb_iox/src/commands/run/ingester.rs index 4390a947f0..435f48d87d 100644 --- a/influxdb_iox/src/commands/run/ingester.rs +++ b/influxdb_iox/src/commands/run/ingester.rs @@ -89,7 +89,7 @@ pub async fn command(config: Config) -> Result<()> { let exec = Arc::new(Executor::new(config.query_exec_thread_count)); let server_type = create_ingester_server_type( &common_state, - metric_registry, + Arc::clone(&metric_registry), catalog, object_store, exec, @@ -101,5 +101,5 @@ pub async fn command(config: Config) -> Result<()> { info!("starting ingester"); let services = vec![Service::create(server_type, common_state.run_config())]; - Ok(main::main(common_state, services).await?) + Ok(main::main(common_state, services, metric_registry).await?) } diff --git a/influxdb_iox/src/commands/run/main.rs b/influxdb_iox/src/commands/run/main.rs index 88302f009b..79b4c60b09 100644 --- a/influxdb_iox/src/commands/run/main.rs +++ b/influxdb_iox/src/commands/run/main.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use ioxd_common::Service; use ioxd_common::{grpc_listener, http_listener, serve, server_type::CommonServerState}; use observability_deps::tracing::{error, info}; @@ -66,7 +68,11 @@ fn build_malloc_conf() -> String { /// /// Due to its invasive nature (install global panic handling, /// logging, etc) this function should not be used during unit tests. -pub async fn main(common_state: CommonServerState, services: Vec) -> Result<()> { +pub async fn main( + common_state: CommonServerState, + services: Vec, + metrics: Arc, +) -> Result<()> { let git_hash = env!("GIT_HASH", "starting influxdb_iox server"); let num_cpus = num_cpus::get(); let build_malloc_conf = build_malloc_conf(); @@ -98,7 +104,7 @@ pub async fn main(common_state: CommonServerState, services: Vec) -> Re // lifetime of the program - this is actually a good thing, as it prevents // the panic handler from being removed while unwinding a panic (which in // turn, causes a panic - see #548) - let f = SendPanicsToTracing::new(); + let f = SendPanicsToTracing::new().with_metrics(&*metrics); std::mem::forget(f); // Register jemalloc metrics diff --git a/influxdb_iox/src/commands/run/querier.rs b/influxdb_iox/src/commands/run/querier.rs index 09758c18a1..0f1495f415 100644 --- a/influxdb_iox/src/commands/run/querier.rs +++ b/influxdb_iox/src/commands/run/querier.rs @@ -89,7 +89,7 @@ pub async fn command(config: Config) -> Result<(), Error> { let exec = Arc::new(Executor::new(num_threads)); let server_type = create_querier_server_type( &common_state, - metric_registry, + Arc::clone(&metric_registry), catalog, object_store, time_provider, @@ -101,5 +101,5 @@ pub async fn command(config: Config) -> Result<(), Error> { info!("starting querier"); let services = vec![Service::create(server_type, common_state.run_config())]; - Ok(main::main(common_state, services).await?) + Ok(main::main(common_state, services, metric_registry).await?) } diff --git a/influxdb_iox/src/commands/run/router.rs b/influxdb_iox/src/commands/run/router.rs index 25dbb17b60..16bcd22543 100644 --- a/influxdb_iox/src/commands/run/router.rs +++ b/influxdb_iox/src/commands/run/router.rs @@ -159,5 +159,10 @@ pub async fn command(config: Config) -> Result<()> { )); let services = vec![Service::create(server_type, common_state.run_config())]; - Ok(main::main(common_state, services).await?) + Ok(main::main( + common_state, + services, + Arc::new(metric::Registry::default()), + ) + .await?) } diff --git a/influxdb_iox/src/commands/run/router2.rs b/influxdb_iox/src/commands/run/router2.rs index 5970d27c69..9f198e8add 100644 --- a/influxdb_iox/src/commands/run/router2.rs +++ b/influxdb_iox/src/commands/run/router2.rs @@ -96,5 +96,5 @@ pub async fn command(config: Config) -> Result<()> { info!("starting router2"); let services = vec![Service::create(server_type, common_state.run_config())]; - Ok(main::main(common_state, services).await?) + Ok(main::main(common_state, services, metrics).await?) } diff --git a/influxdb_iox/src/commands/run/test.rs b/influxdb_iox/src/commands/run/test.rs index b2ac0a0924..4c12f6e628 100644 --- a/influxdb_iox/src/commands/run/test.rs +++ b/influxdb_iox/src/commands/run/test.rs @@ -61,5 +61,10 @@ pub async fn command(config: Config) -> Result<()> { )); let services = vec![Service::create(server_type, common_state.run_config())]; - Ok(main::main(common_state, services).await?) + Ok(main::main( + common_state, + services, + Arc::new(metric::Registry::default()), + ) + .await?) } diff --git a/panic_logging/Cargo.toml b/panic_logging/Cargo.toml index e2daecb464..93f8358558 100644 --- a/panic_logging/Cargo.toml +++ b/panic_logging/Cargo.toml @@ -5,5 +5,6 @@ authors = ["Paul Dix "] edition = "2021" [dependencies] # In alphabetical order +metric = { path = "../metric" } observability_deps = { path = "../observability_deps" } workspace-hack = { path = "../workspace-hack"} diff --git a/panic_logging/src/lib.rs b/panic_logging/src/lib.rs index 8357090509..06583adf64 100644 --- a/panic_logging/src/lib.rs +++ b/panic_logging/src/lib.rs @@ -11,6 +11,7 @@ use std::{fmt, panic, sync::Arc}; +use metric::U64Counter; use observability_deps::tracing::{error, warn}; use panic::PanicInfo; @@ -24,6 +25,7 @@ type PanicFunctionPtr = Arc) + Sync + Send + 'static>> /// prior panic hook. /// /// Upon drop, restores the pre-existing panic hook +#[derive(Default)] pub struct SendPanicsToTracing { /// The previously installed panic hook -- Note it is wrapped in an /// `Option` so we can `.take` it during the call to `drop()`; @@ -40,12 +42,23 @@ impl SendPanicsToTracing { Self { old_panic_hook } } -} -// recommended by clippy -impl Default for SendPanicsToTracing { - fn default() -> Self { - Self::new() + /// Configure this panic handler to emit a panic count metric. + /// + /// The metric is named `thread_panic_count_total` and is incremented each + /// time the panic handler is invoked. + pub fn with_metrics(self, metrics: &metric::Registry) -> Self { + let panic_count = metrics + .register_metric::("thread_panic_count", "number of thread panics observed") + .recorder(&[]); + + let old_hook = Arc::clone(self.old_panic_hook.as_ref().expect("no hook set")); + panic::set_hook(Box::new(move |info| { + panic_count.inc(1); + tracing_panic_hook(&old_hook, info) + })); + + self } } @@ -97,3 +110,36 @@ fn tracing_panic_hook(other_hook: &PanicFunctionPtr, panic_info: &PanicInfo<'_>) // panic function) other_hook(panic_info) } + +#[cfg(test)] +mod tests { + use metric::{Attributes, Metric}; + + use super::*; + + fn assert_count(metrics: &metric::Registry, count: u64) { + let got = metrics + .get_instrument::>("thread_panic_count") + .expect("failed to read metric") + .get_observer(&Attributes::from(&[])) + .expect("failed to get observer") + .fetch(); + assert_eq!(got, count); + } + + #[test] + fn test_panic_counter() { + let metrics = metric::Registry::default(); + let _guard = SendPanicsToTracing::new().with_metrics(&metrics); + + assert_count(&metrics, 0); + + std::thread::spawn(|| { + panic!("it's bananas"); + }) + .join() + .expect_err("wat"); + + assert_count(&metrics, 1); + } +}