diff --git a/Cargo.lock b/Cargo.lock index c4f2200da8..d7d50be3b4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4621,6 +4621,7 @@ dependencies = [ "metric", "parking_lot 0.12.1", "predicate", + "tonic", "trace", "tracker", "workspace-hack", diff --git a/service_common/Cargo.toml b/service_common/Cargo.toml index 17a4133754..024d2c886b 100644 --- a/service_common/Cargo.toml +++ b/service_common/Cargo.toml @@ -14,6 +14,7 @@ metric = { path = "../metric" } parking_lot = "0.12" trace = { path = "../trace" } tracker = { path = "../tracker" } +tonic = "0.8" workspace-hack = { path = "../workspace-hack"} # Crates.io dependencies, in alphabetical order diff --git a/service_common/src/error.rs b/service_common/src/error.rs new file mode 100644 index 0000000000..104f6bd1d2 --- /dev/null +++ b/service_common/src/error.rs @@ -0,0 +1,131 @@ +//! Routines for error handling + +use datafusion::error::DataFusionError; + +/// Converts a [`DataFusionError`] into the appropriate [`tonic::Code`] +/// +/// Note: the goal of this function is that the "user sees the error +/// message rather than an opaque message". Typically the messages of +/// [`tonic::Code::Internal`] are not displayed to the user as they +/// result from bugs in the software rather and the user can't do +/// anything about them. +/// +/// In an ideal world, it would be totally clear from a +/// [`DataFusionError`] which errors belonged in which mapping. +/// +/// However, this is not always the case, so the code takes the +/// "conservative UX" approach to "show the user the message". This +/// may be at odds with the conservative approach from a security +/// perspective. +/// +/// Basically because I wasn't sure they were all internal errors -- +/// for example, you can get an Arrow error if you try and divide a +/// column by zero, depending on the data. +pub fn datafusion_error_to_tonic_code(e: &DataFusionError) -> tonic::Code { + match e { + DataFusionError::ResourcesExhausted(_) => tonic::Code::ResourceExhausted, + // Map as many as possible back into user visible (non internal) errors + DataFusionError::SQL(_) + | DataFusionError::SchemaError(_) + // Execution, ArrowError and ParquetError might be due to an + // internal error (e.g. some sort of IO error or bug) or due + // to a user input error (e.g. you can get an Arrow error if + // you try and divide by a column and it has zeros). + // + // Since we are not sure they are all internal errors we + // classify them as InvalidArgument so the user has a chance + // to see them + // + // Potential future TODO: we could inspect the error and + // decide. e.g. For Box we could downcast the type + // if IOx only puts a single concrete enum in there. + | DataFusionError::Execution(_) + | DataFusionError::ArrowError(_) + | DataFusionError::ParquetError(_) + // DataFusion most often returns "NotImplemented" when a + // particular SQL feature is not implemented. This + // information is useful to the user who may be able to + // express their query using different syntax that is implemented. + // + // the grpc / tonic "NotImplemented" code typically means + // that the client called an API endpoint that wasn't + // implemented. + // + // See examples in: + // https://github.com/apache/arrow-datafusion/search?q=NotImplemented + | DataFusionError::NotImplemented(_) + | DataFusionError::Plan(_) => tonic::Code::InvalidArgument, + e @ DataFusionError::Context(_,_) => { + // traverse context chain without recursion + datafusion_error_to_tonic_code(leaf_error(e)) + } + // Map as many as possible back into user visible + // (non internal) errors and only treat the ones + // the user likely can't do anything about as internal + DataFusionError::ObjectStore(_) + | DataFusionError::IoError(_) + // External originate from outside DataFusion’s core codebase. + // As of 2022-10-17, these always come external object store + // errors (e.g. misconfiguration or bad path) which would be + // an internal error and thus we classify them as such. + | DataFusionError::External(_) + | DataFusionError::Internal(_) => tonic::Code::Internal, + // explicitly don't have a catchall here so any + // newly added DataFusion error will raise a compiler error for us to address + } +} + +/// Returns a reference to the bottommost error in the chain +fn leaf_error(inner: &DataFusionError) -> &DataFusionError { + let mut source = inner; + while let DataFusionError::Context(_msg, inner) = source { + source = inner; + } + source +} + +#[cfg(test)] +mod test { + use datafusion::sql::sqlparser::parser::ParserError; + + use super::*; + + #[test] + fn test_error_translation() { + let s = "foo".to_string(); + + // this is basically a second implementation of the translation table to help avoid mistakes + do_test( + DataFusionError::ResourcesExhausted(s.clone()), + tonic::Code::ResourceExhausted, + ); + + let e = ParserError::ParserError(s.clone()); + do_test(DataFusionError::SQL(e), tonic::Code::InvalidArgument); + + do_test( + DataFusionError::NotImplemented(s.clone()), + tonic::Code::InvalidArgument, + ); + do_test( + DataFusionError::Plan(s.clone()), + tonic::Code::InvalidArgument, + ); + + do_test(DataFusionError::Internal(s), tonic::Code::Internal); + } + + #[test] + fn test_error_context_traversal() { + let inner_error = DataFusionError::ResourcesExhausted("foo".to_string()); + + do_test( + DataFusionError::Context("it happened!".to_string(), Box::new(inner_error)), + tonic::Code::ResourceExhausted, + ); + } + + fn do_test(e: DataFusionError, code: tonic::Code) { + assert_eq!(datafusion_error_to_tonic_code(&e), code); + } +} diff --git a/service_common/src/lib.rs b/service_common/src/lib.rs index cbf0fd6c78..99cff740ee 100644 --- a/service_common/src/lib.rs +++ b/service_common/src/lib.rs @@ -1,5 +1,6 @@ //! Common methods for RPC service implementations +mod error; pub mod planner; pub mod test_util; @@ -25,3 +26,5 @@ pub trait QueryDatabaseProvider: std::fmt::Debug + Send + Sync + 'static { /// Acquire concurrency-limiting sempahore async fn acquire_semaphore(&self, span: Option) -> InstrumentedAsyncOwnedSemaphorePermit; } + +pub use error::datafusion_error_to_tonic_code; diff --git a/service_grpc_flight/src/lib.rs b/service_grpc_flight/src/lib.rs index 6f1bc41a82..2fe6c48181 100644 --- a/service_grpc_flight/src/lib.rs +++ b/service_grpc_flight/src/lib.rs @@ -20,7 +20,7 @@ use observability_deps::tracing::{debug, info, warn}; use pin_project::{pin_project, pinned_drop}; use prost::Message; use serde::Deserialize; -use service_common::{planner::Planner, QueryDatabaseProvider}; +use service_common::{datafusion_error_to_tonic_code, planner::Planner, QueryDatabaseProvider}; use snafu::{ResultExt, Snafu}; use std::{fmt::Debug, pin::Pin, sync::Arc, task::Poll, time::Instant}; use tokio::task::JoinHandle; @@ -96,8 +96,8 @@ impl From for tonic::Status { } impl Error { - /// Converts a result from the business logic into the appropriate tonic - /// status + /// Converts a result from the business logic into the appropriate tonic (gRPC) + /// status message to send back to users fn into_status(self) -> tonic::Status { let msg = self.to_string(); @@ -108,18 +108,7 @@ impl Error { | Self::InvalidQuery { .. } | Self::InvalidDatabaseName { .. } => tonic::Code::InvalidArgument, Self::Planning { source, .. } | Self::Query { source, .. } => { - // traverse context chain - let mut source = source; - while let DataFusionError::Context(_msg, inner) = source { - source = *inner; - } - - match source { - DataFusionError::ResourcesExhausted(_) => tonic::Code::ResourceExhausted, - DataFusionError::Plan(_) => tonic::Code::InvalidArgument, - DataFusionError::NotImplemented(_) => tonic::Code::Unimplemented, - _ => tonic::Code::Internal, - } + datafusion_error_to_tonic_code(&source) } Self::Optimize { .. } | Self::Serialization { .. } => tonic::Code::Internal, }; diff --git a/service_grpc_influxrpc/src/service.rs b/service_grpc_influxrpc/src/service.rs index caca45feca..97223551fe 100644 --- a/service_grpc_influxrpc/src/service.rs +++ b/service_grpc_influxrpc/src/service.rs @@ -34,7 +34,7 @@ use iox_query::{ }; use observability_deps::tracing::{error, info, trace}; use pin_project::pin_project; -use service_common::{planner::Planner, QueryDatabaseProvider}; +use service_common::{datafusion_error_to_tonic_code, planner::Planner, QueryDatabaseProvider}; use snafu::{OptionExt, ResultExt, Snafu}; use std::{ collections::{BTreeSet, HashMap}, @@ -197,20 +197,7 @@ impl Error { | Self::PlanningGroupSeries { source, .. } | Self::FilteringSeries { source, .. } | Self::GroupingSeries { source, .. } - | Self::ListingTagValues { source, .. } => { - // traverse context chain - let mut source = source; - while let DataFusionError::Context(_msg, inner) = source { - source = *inner; - } - - match source { - DataFusionError::ResourcesExhausted(_) => tonic::Code::ResourceExhausted, - DataFusionError::Plan(_) => tonic::Code::InvalidArgument, - DataFusionError::NotImplemented(_) => tonic::Code::Unimplemented, - _ => tonic::Code::Internal, - } - } + | Self::ListingTagValues { source, .. } => datafusion_error_to_tonic_code(&source), Self::ConvertingPredicate { .. } | Self::ConvertingReadGroupAggregate { .. } | Self::ConvertingReadGroupType { .. } @@ -1645,15 +1632,15 @@ mod tests { fixture: &Fixture, path: &'static str, status: &'static str, - count: u64, + expected: u64, ) { - let metric = fixture + let metrics = fixture .test_storage .metric_registry .get_instrument::>("grpc_requests") .unwrap(); - let observation = metric + let observation = metrics .get_observer(&Attributes::from([ ( "path", @@ -1664,7 +1651,11 @@ mod tests { .unwrap() .fetch(); - assert_eq!(observation, count); + assert_eq!( + observation, expected, + "\n\npath: {}\nstatus:{}\nobservation:{}\nexpected:{}\n\nAll metrics:\n\n{:#?}", + path, status, observation, expected, metrics + ); } #[tokio::test] @@ -2178,8 +2169,11 @@ mod tests { "Error converting tag_key to UTF-8 in tag_values request" ); - grpc_request_metric_has_count(&fixture, "TagValues", "client_error", 1); + // error from backend error grpc_request_metric_has_count(&fixture, "TagValues", "server_error", 1); + + // error from bad utf8 + grpc_request_metric_has_count(&fixture, "TagValues", "client_error", 1); } #[tokio::test]