Merge pull request #4367 from influxdata/dom/panic-metrics

feat: panic metrics
pull/24376/head
kodiakhq[bot] 2022-04-20 11:43:20 +00:00 committed by GitHub
commit fcb82b2c99
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 88 additions and 19 deletions

1
Cargo.lock generated
View File

@ -3909,6 +3909,7 @@ dependencies = [
name = "panic_logging"
version = "0.1.0"
dependencies = [
"metric",
"observability_deps",
"workspace-hack",
]

View File

@ -370,7 +370,7 @@ pub async fn command(config: Config) -> Result<()> {
info!(?ingester_addresses, "starting querier");
let querier = create_querier_server_type(
&common_state,
metrics,
Arc::clone(&metrics),
catalog,
object_store,
time_provider,
@ -388,5 +388,5 @@ pub async fn command(config: Config) -> Result<()> {
Service::create_grpc_only(querier, &querier_run_config),
];
Ok(main::main(common_state, services).await?)
Ok(main::main(common_state, services, metrics).await?)
}

View File

@ -91,7 +91,7 @@ pub async fn command(config: Config) -> Result<(), Error> {
let server_type = create_compactor_server_type(
&common_state,
metric_registry,
Arc::clone(&metric_registry),
catalog,
object_store,
exec,
@ -103,5 +103,5 @@ pub async fn command(config: Config) -> Result<(), Error> {
info!("starting compactor");
let services = vec![Service::create(server_type, common_state.run_config())];
Ok(main::main(common_state, services).await?)
Ok(main::main(common_state, services, metric_registry).await?)
}

View File

@ -130,5 +130,10 @@ pub async fn command(config: Config) -> Result<()> {
));
let services = vec![Service::create(server_type, common_state.run_config())];
Ok(main::main(common_state, services).await?)
Ok(main::main(
common_state,
services,
Arc::new(metric::Registry::default()),
)
.await?)
}

View File

@ -89,7 +89,7 @@ pub async fn command(config: Config) -> Result<()> {
let exec = Arc::new(Executor::new(config.query_exec_thread_count));
let server_type = create_ingester_server_type(
&common_state,
metric_registry,
Arc::clone(&metric_registry),
catalog,
object_store,
exec,
@ -101,5 +101,5 @@ pub async fn command(config: Config) -> Result<()> {
info!("starting ingester");
let services = vec![Service::create(server_type, common_state.run_config())];
Ok(main::main(common_state, services).await?)
Ok(main::main(common_state, services, metric_registry).await?)
}

View File

@ -1,3 +1,5 @@
use std::sync::Arc;
use ioxd_common::Service;
use ioxd_common::{grpc_listener, http_listener, serve, server_type::CommonServerState};
use observability_deps::tracing::{error, info};
@ -66,7 +68,11 @@ fn build_malloc_conf() -> String {
///
/// Due to its invasive nature (it installs global panic handling,
/// logging, etc.), this function should not be used during unit tests.
pub async fn main(common_state: CommonServerState, services: Vec<Service>) -> Result<()> {
pub async fn main(
common_state: CommonServerState,
services: Vec<Service>,
metrics: Arc<metric::Registry>,
) -> Result<()> {
let git_hash = env!("GIT_HASH", "starting influxdb_iox server");
let num_cpus = num_cpus::get();
let build_malloc_conf = build_malloc_conf();
@ -98,7 +104,7 @@ pub async fn main(common_state: CommonServerState, services: Vec<Service>) -> Re
// lifetime of the program - this is actually a good thing, as it prevents
// the panic handler from being removed while unwinding a panic (which in
// turn, causes a panic - see #548)
let f = SendPanicsToTracing::new();
let f = SendPanicsToTracing::new().with_metrics(&*metrics);
std::mem::forget(f);
// Register jemalloc metrics

View File

@ -89,7 +89,7 @@ pub async fn command(config: Config) -> Result<(), Error> {
let exec = Arc::new(Executor::new(num_threads));
let server_type = create_querier_server_type(
&common_state,
metric_registry,
Arc::clone(&metric_registry),
catalog,
object_store,
time_provider,
@ -101,5 +101,5 @@ pub async fn command(config: Config) -> Result<(), Error> {
info!("starting querier");
let services = vec![Service::create(server_type, common_state.run_config())];
Ok(main::main(common_state, services).await?)
Ok(main::main(common_state, services, metric_registry).await?)
}

View File

@ -159,5 +159,10 @@ pub async fn command(config: Config) -> Result<()> {
));
let services = vec![Service::create(server_type, common_state.run_config())];
Ok(main::main(common_state, services).await?)
Ok(main::main(
common_state,
services,
Arc::new(metric::Registry::default()),
)
.await?)
}

View File

@ -96,5 +96,5 @@ pub async fn command(config: Config) -> Result<()> {
info!("starting router2");
let services = vec![Service::create(server_type, common_state.run_config())];
Ok(main::main(common_state, services).await?)
Ok(main::main(common_state, services, metrics).await?)
}

View File

@ -61,5 +61,10 @@ pub async fn command(config: Config) -> Result<()> {
));
let services = vec![Service::create(server_type, common_state.run_config())];
Ok(main::main(common_state, services).await?)
Ok(main::main(
common_state,
services,
Arc::new(metric::Registry::default()),
)
.await?)
}

View File

@ -5,5 +5,6 @@ authors = ["Paul Dix <paul@pauldix.net>"]
edition = "2021"
[dependencies] # In alphabetical order
metric = { path = "../metric" }
observability_deps = { path = "../observability_deps" }
workspace-hack = { path = "../workspace-hack"}

View File

@ -11,6 +11,7 @@
use std::{fmt, panic, sync::Arc};
use metric::U64Counter;
use observability_deps::tracing::{error, warn};
use panic::PanicInfo;
@ -24,6 +25,7 @@ type PanicFunctionPtr = Arc<Box<dyn Fn(&PanicInfo<'_>) + Sync + Send + 'static>>
/// prior panic hook.
///
/// Upon drop, restores the pre-existing panic hook
#[derive(Default)]
pub struct SendPanicsToTracing {
/// The previously installed panic hook -- Note it is wrapped in an
/// `Option` so we can `.take` it during the call to `drop()`;
@ -40,12 +42,23 @@ impl SendPanicsToTracing {
Self { old_panic_hook }
}
}
// recommended by clippy
impl Default for SendPanicsToTracing {
fn default() -> Self {
Self::new()
/// Attach a panic counter to this panic handler.
///
/// A `U64Counter` is registered in `metrics` under the name
/// `thread_panic_count`, and a replacement panic hook is installed that
/// increments the counter by one before delegating to
/// `tracing_panic_hook` with the previously captured hook.
///
/// NOTE(review): earlier documentation called this metric
/// `thread_panic_count_total`; presumably that is the name after export --
/// confirm against the metric exporter.
///
/// # Panics
///
/// Panics with "no hook set" if this handler holds no previous panic hook.
pub fn with_metrics(self, metrics: &metric::Registry) -> Self {
    // Register the counter first, then obtain a recorder with no attributes.
    let registered = metrics
        .register_metric::<U64Counter>("thread_panic_count", "number of thread panics observed");
    let counter = registered.recorder(&[]);

    // Share ownership of the previously captured hook with the new closure.
    let previous_hook = self
        .old_panic_hook
        .as_ref()
        .map(Arc::clone)
        .expect("no hook set");

    panic::set_hook(Box::new(move |panic_info| {
        counter.inc(1);
        tracing_panic_hook(&previous_hook, panic_info)
    }));

    self
}
}
@ -97,3 +110,36 @@ fn tracing_panic_hook(other_hook: &PanicFunctionPtr, panic_info: &PanicInfo<'_>)
// panic function)
other_hook(panic_info)
}
#[cfg(test)]
mod tests {
    use metric::{Attributes, Metric};

    use super::*;

    /// Assert that the `thread_panic_count` counter (looked up with an empty
    /// attribute set) in `metrics` currently reads `count`.
    fn assert_count(metrics: &metric::Registry, count: u64) {
        let instrument = metrics
            .get_instrument::<Metric<U64Counter>>("thread_panic_count")
            .expect("failed to read metric");
        let observer = instrument
            .get_observer(&Attributes::from(&[]))
            .expect("failed to get observer");

        assert_eq!(observer.fetch(), count);
    }

    /// A panic on a spawned thread must bump the panic counter from 0 to 1.
    #[test]
    fn test_panic_counter() {
        let registry = metric::Registry::default();
        // Keep the handler alive for the duration of the test.
        let _guard = SendPanicsToTracing::new().with_metrics(&registry);

        // Nothing has panicked yet.
        assert_count(&registry, 0);

        // Panic on a separate thread so the test thread itself survives.
        let handle = std::thread::spawn(|| {
            panic!("it's bananas");
        });
        handle.join().expect_err("wat");

        assert_count(&registry, 1);
    }
}