fix: improve tonic status codes returned for query errors (#5864)
* fix: improve tonic status codes returned for query errors * fix: update tests * fix: Add rationale for categorization and avoid recurson handling DataFusionError::Context * fix: Apply suggestions from code review Thanks @carols10cents! Co-authored-by: Carol (Nichols || Goulding) <193874+carols10cents@users.noreply.github.com> * fix: Update tests * fix: add future TOOD Co-authored-by: Carol (Nichols || Goulding) <193874+carols10cents@users.noreply.github.com> Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com>pull/24376/head
parent
d78591f52e
commit
e16b04891e
|
|
@ -4621,6 +4621,7 @@ dependencies = [
|
|||
"metric",
|
||||
"parking_lot 0.12.1",
|
||||
"predicate",
|
||||
"tonic",
|
||||
"trace",
|
||||
"tracker",
|
||||
"workspace-hack",
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ metric = { path = "../metric" }
|
|||
parking_lot = "0.12"
|
||||
trace = { path = "../trace" }
|
||||
tracker = { path = "../tracker" }
|
||||
tonic = "0.8"
|
||||
workspace-hack = { path = "../workspace-hack"}
|
||||
|
||||
# Crates.io dependencies, in alphabetical order
|
||||
|
|
|
|||
|
|
@ -0,0 +1,131 @@
|
|||
//! Routines for error handling
|
||||
|
||||
use datafusion::error::DataFusionError;
|
||||
|
||||
/// Converts a [`DataFusionError`] into the appropriate [`tonic::Code`]
|
||||
///
|
||||
/// Note: the goal of this function is that the "user sees the error
|
||||
/// message rather than an opaque message". Typically the messages of
|
||||
/// [`tonic::Code::Internal`] are not displayed to the user as they
|
||||
/// result from bugs in the software rather and the user can't do
|
||||
/// anything about them.
|
||||
///
|
||||
/// In an ideal world, it would be totally clear from a
|
||||
/// [`DataFusionError`] which errors belonged in which mapping.
|
||||
///
|
||||
/// However, this is not always the case, so the code takes the
|
||||
/// "conservative UX" approach to "show the user the message". This
|
||||
/// may be at odds with the conservative approach from a security
|
||||
/// perspective.
|
||||
///
|
||||
/// Basically because I wasn't sure they were all internal errors --
|
||||
/// for example, you can get an Arrow error if you try and divide a
|
||||
/// column by zero, depending on the data.
|
||||
pub fn datafusion_error_to_tonic_code(e: &DataFusionError) -> tonic::Code {
|
||||
match e {
|
||||
DataFusionError::ResourcesExhausted(_) => tonic::Code::ResourceExhausted,
|
||||
// Map as many as possible back into user visible (non internal) errors
|
||||
DataFusionError::SQL(_)
|
||||
| DataFusionError::SchemaError(_)
|
||||
// Execution, ArrowError and ParquetError might be due to an
|
||||
// internal error (e.g. some sort of IO error or bug) or due
|
||||
// to a user input error (e.g. you can get an Arrow error if
|
||||
// you try and divide by a column and it has zeros).
|
||||
//
|
||||
// Since we are not sure they are all internal errors we
|
||||
// classify them as InvalidArgument so the user has a chance
|
||||
// to see them
|
||||
//
|
||||
// Potential future TODO: we could inspect the error and
|
||||
// decide. e.g. For Box<dyn ...> we could downcast the type
|
||||
// if IOx only puts a single concrete enum in there.
|
||||
| DataFusionError::Execution(_)
|
||||
| DataFusionError::ArrowError(_)
|
||||
| DataFusionError::ParquetError(_)
|
||||
// DataFusion most often returns "NotImplemented" when a
|
||||
// particular SQL feature is not implemented. This
|
||||
// information is useful to the user who may be able to
|
||||
// express their query using different syntax that is implemented.
|
||||
//
|
||||
// the grpc / tonic "NotImplemented" code typically means
|
||||
// that the client called an API endpoint that wasn't
|
||||
// implemented.
|
||||
//
|
||||
// See examples in:
|
||||
// https://github.com/apache/arrow-datafusion/search?q=NotImplemented
|
||||
| DataFusionError::NotImplemented(_)
|
||||
| DataFusionError::Plan(_) => tonic::Code::InvalidArgument,
|
||||
e @ DataFusionError::Context(_,_) => {
|
||||
// traverse context chain without recursion
|
||||
datafusion_error_to_tonic_code(leaf_error(e))
|
||||
}
|
||||
// Map as many as possible back into user visible
|
||||
// (non internal) errors and only treat the ones
|
||||
// the user likely can't do anything about as internal
|
||||
DataFusionError::ObjectStore(_)
|
||||
| DataFusionError::IoError(_)
|
||||
// External originate from outside DataFusion’s core codebase.
|
||||
// As of 2022-10-17, these always come external object store
|
||||
// errors (e.g. misconfiguration or bad path) which would be
|
||||
// an internal error and thus we classify them as such.
|
||||
| DataFusionError::External(_)
|
||||
| DataFusionError::Internal(_) => tonic::Code::Internal,
|
||||
// explicitly don't have a catchall here so any
|
||||
// newly added DataFusion error will raise a compiler error for us to address
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a reference to the bottommost error in the chain
|
||||
fn leaf_error(inner: &DataFusionError) -> &DataFusionError {
|
||||
let mut source = inner;
|
||||
while let DataFusionError::Context(_msg, inner) = source {
|
||||
source = inner;
|
||||
}
|
||||
source
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use datafusion::sql::sqlparser::parser::ParserError;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_error_translation() {
|
||||
let s = "foo".to_string();
|
||||
|
||||
// this is basically a second implementation of the translation table to help avoid mistakes
|
||||
do_test(
|
||||
DataFusionError::ResourcesExhausted(s.clone()),
|
||||
tonic::Code::ResourceExhausted,
|
||||
);
|
||||
|
||||
let e = ParserError::ParserError(s.clone());
|
||||
do_test(DataFusionError::SQL(e), tonic::Code::InvalidArgument);
|
||||
|
||||
do_test(
|
||||
DataFusionError::NotImplemented(s.clone()),
|
||||
tonic::Code::InvalidArgument,
|
||||
);
|
||||
do_test(
|
||||
DataFusionError::Plan(s.clone()),
|
||||
tonic::Code::InvalidArgument,
|
||||
);
|
||||
|
||||
do_test(DataFusionError::Internal(s), tonic::Code::Internal);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_error_context_traversal() {
|
||||
let inner_error = DataFusionError::ResourcesExhausted("foo".to_string());
|
||||
|
||||
do_test(
|
||||
DataFusionError::Context("it happened!".to_string(), Box::new(inner_error)),
|
||||
tonic::Code::ResourceExhausted,
|
||||
);
|
||||
}
|
||||
|
||||
fn do_test(e: DataFusionError, code: tonic::Code) {
|
||||
assert_eq!(datafusion_error_to_tonic_code(&e), code);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,5 +1,6 @@
|
|||
//! Common methods for RPC service implementations
|
||||
|
||||
mod error;
|
||||
pub mod planner;
|
||||
pub mod test_util;
|
||||
|
||||
|
|
@ -25,3 +26,5 @@ pub trait QueryDatabaseProvider: std::fmt::Debug + Send + Sync + 'static {
|
|||
/// Acquire concurrency-limiting sempahore
|
||||
async fn acquire_semaphore(&self, span: Option<Span>) -> InstrumentedAsyncOwnedSemaphorePermit;
|
||||
}
|
||||
|
||||
pub use error::datafusion_error_to_tonic_code;
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ use observability_deps::tracing::{debug, info, warn};
|
|||
use pin_project::{pin_project, pinned_drop};
|
||||
use prost::Message;
|
||||
use serde::Deserialize;
|
||||
use service_common::{planner::Planner, QueryDatabaseProvider};
|
||||
use service_common::{datafusion_error_to_tonic_code, planner::Planner, QueryDatabaseProvider};
|
||||
use snafu::{ResultExt, Snafu};
|
||||
use std::{fmt::Debug, pin::Pin, sync::Arc, task::Poll, time::Instant};
|
||||
use tokio::task::JoinHandle;
|
||||
|
|
@ -96,8 +96,8 @@ impl From<Error> for tonic::Status {
|
|||
}
|
||||
|
||||
impl Error {
|
||||
/// Converts a result from the business logic into the appropriate tonic
|
||||
/// status
|
||||
/// Converts a result from the business logic into the appropriate tonic (gRPC)
|
||||
/// status message to send back to users
|
||||
fn into_status(self) -> tonic::Status {
|
||||
let msg = self.to_string();
|
||||
|
||||
|
|
@ -108,18 +108,7 @@ impl Error {
|
|||
| Self::InvalidQuery { .. }
|
||||
| Self::InvalidDatabaseName { .. } => tonic::Code::InvalidArgument,
|
||||
Self::Planning { source, .. } | Self::Query { source, .. } => {
|
||||
// traverse context chain
|
||||
let mut source = source;
|
||||
while let DataFusionError::Context(_msg, inner) = source {
|
||||
source = *inner;
|
||||
}
|
||||
|
||||
match source {
|
||||
DataFusionError::ResourcesExhausted(_) => tonic::Code::ResourceExhausted,
|
||||
DataFusionError::Plan(_) => tonic::Code::InvalidArgument,
|
||||
DataFusionError::NotImplemented(_) => tonic::Code::Unimplemented,
|
||||
_ => tonic::Code::Internal,
|
||||
}
|
||||
datafusion_error_to_tonic_code(&source)
|
||||
}
|
||||
Self::Optimize { .. } | Self::Serialization { .. } => tonic::Code::Internal,
|
||||
};
|
||||
|
|
|
|||
|
|
@ -34,7 +34,7 @@ use iox_query::{
|
|||
};
|
||||
use observability_deps::tracing::{error, info, trace};
|
||||
use pin_project::pin_project;
|
||||
use service_common::{planner::Planner, QueryDatabaseProvider};
|
||||
use service_common::{datafusion_error_to_tonic_code, planner::Planner, QueryDatabaseProvider};
|
||||
use snafu::{OptionExt, ResultExt, Snafu};
|
||||
use std::{
|
||||
collections::{BTreeSet, HashMap},
|
||||
|
|
@ -197,20 +197,7 @@ impl Error {
|
|||
| Self::PlanningGroupSeries { source, .. }
|
||||
| Self::FilteringSeries { source, .. }
|
||||
| Self::GroupingSeries { source, .. }
|
||||
| Self::ListingTagValues { source, .. } => {
|
||||
// traverse context chain
|
||||
let mut source = source;
|
||||
while let DataFusionError::Context(_msg, inner) = source {
|
||||
source = *inner;
|
||||
}
|
||||
|
||||
match source {
|
||||
DataFusionError::ResourcesExhausted(_) => tonic::Code::ResourceExhausted,
|
||||
DataFusionError::Plan(_) => tonic::Code::InvalidArgument,
|
||||
DataFusionError::NotImplemented(_) => tonic::Code::Unimplemented,
|
||||
_ => tonic::Code::Internal,
|
||||
}
|
||||
}
|
||||
| Self::ListingTagValues { source, .. } => datafusion_error_to_tonic_code(&source),
|
||||
Self::ConvertingPredicate { .. }
|
||||
| Self::ConvertingReadGroupAggregate { .. }
|
||||
| Self::ConvertingReadGroupType { .. }
|
||||
|
|
@ -1645,15 +1632,15 @@ mod tests {
|
|||
fixture: &Fixture,
|
||||
path: &'static str,
|
||||
status: &'static str,
|
||||
count: u64,
|
||||
expected: u64,
|
||||
) {
|
||||
let metric = fixture
|
||||
let metrics = fixture
|
||||
.test_storage
|
||||
.metric_registry
|
||||
.get_instrument::<Metric<U64Counter>>("grpc_requests")
|
||||
.unwrap();
|
||||
|
||||
let observation = metric
|
||||
let observation = metrics
|
||||
.get_observer(&Attributes::from([
|
||||
(
|
||||
"path",
|
||||
|
|
@ -1664,7 +1651,11 @@ mod tests {
|
|||
.unwrap()
|
||||
.fetch();
|
||||
|
||||
assert_eq!(observation, count);
|
||||
assert_eq!(
|
||||
observation, expected,
|
||||
"\n\npath: {}\nstatus:{}\nobservation:{}\nexpected:{}\n\nAll metrics:\n\n{:#?}",
|
||||
path, status, observation, expected, metrics
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
|
@ -2178,8 +2169,11 @@ mod tests {
|
|||
"Error converting tag_key to UTF-8 in tag_values request"
|
||||
);
|
||||
|
||||
grpc_request_metric_has_count(&fixture, "TagValues", "client_error", 1);
|
||||
// error from backend error
|
||||
grpc_request_metric_has_count(&fixture, "TagValues", "server_error", 1);
|
||||
|
||||
// error from bad utf8
|
||||
grpc_request_metric_has_count(&fixture, "TagValues", "client_error", 1);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
|
|
|||
Loading…
Reference in New Issue