fix: improve tonic status codes returned for query errors (#5864)

* fix: improve tonic status codes returned for query errors

* fix: update tests

* fix: Add rationale for categorization and avoid recurson handling DataFusionError::Context

* fix: Apply suggestions from code review

Thanks @carols10cents!

Co-authored-by: Carol (Nichols || Goulding) <193874+carols10cents@users.noreply.github.com>

* fix: Update tests

* fix: add future TOOD

Co-authored-by: Carol (Nichols || Goulding) <193874+carols10cents@users.noreply.github.com>
Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com>
pull/24376/head
Andrew Lamb 2022-10-18 06:47:52 -04:00 committed by GitHub
parent d78591f52e
commit e16b04891e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 154 additions and 35 deletions

1
Cargo.lock generated
View File

@ -4621,6 +4621,7 @@ dependencies = [
"metric",
"parking_lot 0.12.1",
"predicate",
"tonic",
"trace",
"tracker",
"workspace-hack",

View File

@ -14,6 +14,7 @@ metric = { path = "../metric" }
parking_lot = "0.12"
trace = { path = "../trace" }
tracker = { path = "../tracker" }
tonic = "0.8"
workspace-hack = { path = "../workspace-hack"}
# Crates.io dependencies, in alphabetical order

131
service_common/src/error.rs Normal file
View File

@ -0,0 +1,131 @@
//! Routines for error handling
use datafusion::error::DataFusionError;
/// Converts a [`DataFusionError`] into the appropriate [`tonic::Code`]
///
/// Note: the goal of this function is that the "user sees the error
/// message rather than an opaque message". Typically the messages of
/// [`tonic::Code::Internal`] are not displayed to the user as they
/// result from bugs in the software rather and the user can't do
/// anything about them.
///
/// In an ideal world, it would be totally clear from a
/// [`DataFusionError`] which errors belonged in which mapping.
///
/// However, this is not always the case, so the code takes the
/// "conservative UX" approach to "show the user the message". This
/// may be at odds with the conservative approach from a security
/// perspective.
///
/// Basically because I wasn't sure they were all internal errors --
/// for example, you can get an Arrow error if you try and divide a
/// column by zero, depending on the data.
pub fn datafusion_error_to_tonic_code(e: &DataFusionError) -> tonic::Code {
match e {
DataFusionError::ResourcesExhausted(_) => tonic::Code::ResourceExhausted,
// Map as many as possible back into user visible (non internal) errors
DataFusionError::SQL(_)
| DataFusionError::SchemaError(_)
// Execution, ArrowError and ParquetError might be due to an
// internal error (e.g. some sort of IO error or bug) or due
// to a user input error (e.g. you can get an Arrow error if
// you try and divide by a column and it has zeros).
//
// Since we are not sure they are all internal errors we
// classify them as InvalidArgument so the user has a chance
// to see them
//
// Potential future TODO: we could inspect the error and
// decide. e.g. For Box<dyn ...> we could downcast the type
// if IOx only puts a single concrete enum in there.
| DataFusionError::Execution(_)
| DataFusionError::ArrowError(_)
| DataFusionError::ParquetError(_)
// DataFusion most often returns "NotImplemented" when a
// particular SQL feature is not implemented. This
// information is useful to the user who may be able to
// express their query using different syntax that is implemented.
//
// the grpc / tonic "NotImplemented" code typically means
// that the client called an API endpoint that wasn't
// implemented.
//
// See examples in:
// https://github.com/apache/arrow-datafusion/search?q=NotImplemented
| DataFusionError::NotImplemented(_)
| DataFusionError::Plan(_) => tonic::Code::InvalidArgument,
e @ DataFusionError::Context(_,_) => {
// traverse context chain without recursion
datafusion_error_to_tonic_code(leaf_error(e))
}
// Map as many as possible back into user visible
// (non internal) errors and only treat the ones
// the user likely can't do anything about as internal
DataFusionError::ObjectStore(_)
| DataFusionError::IoError(_)
// External originate from outside DataFusions core codebase.
// As of 2022-10-17, these always come external object store
// errors (e.g. misconfiguration or bad path) which would be
// an internal error and thus we classify them as such.
| DataFusionError::External(_)
| DataFusionError::Internal(_) => tonic::Code::Internal,
// explicitly don't have a catchall here so any
// newly added DataFusion error will raise a compiler error for us to address
}
}
/// Returns a reference to the bottommost error in the chain
fn leaf_error(inner: &DataFusionError) -> &DataFusionError {
let mut source = inner;
while let DataFusionError::Context(_msg, inner) = source {
source = inner;
}
source
}
#[cfg(test)]
mod test {
use datafusion::sql::sqlparser::parser::ParserError;
use super::*;
#[test]
fn test_error_translation() {
let s = "foo".to_string();
// this is basically a second implementation of the translation table to help avoid mistakes
do_test(
DataFusionError::ResourcesExhausted(s.clone()),
tonic::Code::ResourceExhausted,
);
let e = ParserError::ParserError(s.clone());
do_test(DataFusionError::SQL(e), tonic::Code::InvalidArgument);
do_test(
DataFusionError::NotImplemented(s.clone()),
tonic::Code::InvalidArgument,
);
do_test(
DataFusionError::Plan(s.clone()),
tonic::Code::InvalidArgument,
);
do_test(DataFusionError::Internal(s), tonic::Code::Internal);
}
#[test]
fn test_error_context_traversal() {
let inner_error = DataFusionError::ResourcesExhausted("foo".to_string());
do_test(
DataFusionError::Context("it happened!".to_string(), Box::new(inner_error)),
tonic::Code::ResourceExhausted,
);
}
fn do_test(e: DataFusionError, code: tonic::Code) {
assert_eq!(datafusion_error_to_tonic_code(&e), code);
}
}

View File

@ -1,5 +1,6 @@
//! Common methods for RPC service implementations
mod error;
pub mod planner;
pub mod test_util;
@ -25,3 +26,5 @@ pub trait QueryDatabaseProvider: std::fmt::Debug + Send + Sync + 'static {
/// Acquire concurrency-limiting sempahore
async fn acquire_semaphore(&self, span: Option<Span>) -> InstrumentedAsyncOwnedSemaphorePermit;
}
pub use error::datafusion_error_to_tonic_code;

View File

@ -20,7 +20,7 @@ use observability_deps::tracing::{debug, info, warn};
use pin_project::{pin_project, pinned_drop};
use prost::Message;
use serde::Deserialize;
use service_common::{planner::Planner, QueryDatabaseProvider};
use service_common::{datafusion_error_to_tonic_code, planner::Planner, QueryDatabaseProvider};
use snafu::{ResultExt, Snafu};
use std::{fmt::Debug, pin::Pin, sync::Arc, task::Poll, time::Instant};
use tokio::task::JoinHandle;
@ -96,8 +96,8 @@ impl From<Error> for tonic::Status {
}
impl Error {
/// Converts a result from the business logic into the appropriate tonic
/// status
/// Converts a result from the business logic into the appropriate tonic (gRPC)
/// status message to send back to users
fn into_status(self) -> tonic::Status {
let msg = self.to_string();
@ -108,18 +108,7 @@ impl Error {
| Self::InvalidQuery { .. }
| Self::InvalidDatabaseName { .. } => tonic::Code::InvalidArgument,
Self::Planning { source, .. } | Self::Query { source, .. } => {
// traverse context chain
let mut source = source;
while let DataFusionError::Context(_msg, inner) = source {
source = *inner;
}
match source {
DataFusionError::ResourcesExhausted(_) => tonic::Code::ResourceExhausted,
DataFusionError::Plan(_) => tonic::Code::InvalidArgument,
DataFusionError::NotImplemented(_) => tonic::Code::Unimplemented,
_ => tonic::Code::Internal,
}
datafusion_error_to_tonic_code(&source)
}
Self::Optimize { .. } | Self::Serialization { .. } => tonic::Code::Internal,
};

View File

@ -34,7 +34,7 @@ use iox_query::{
};
use observability_deps::tracing::{error, info, trace};
use pin_project::pin_project;
use service_common::{planner::Planner, QueryDatabaseProvider};
use service_common::{datafusion_error_to_tonic_code, planner::Planner, QueryDatabaseProvider};
use snafu::{OptionExt, ResultExt, Snafu};
use std::{
collections::{BTreeSet, HashMap},
@ -197,20 +197,7 @@ impl Error {
| Self::PlanningGroupSeries { source, .. }
| Self::FilteringSeries { source, .. }
| Self::GroupingSeries { source, .. }
| Self::ListingTagValues { source, .. } => {
// traverse context chain
let mut source = source;
while let DataFusionError::Context(_msg, inner) = source {
source = *inner;
}
match source {
DataFusionError::ResourcesExhausted(_) => tonic::Code::ResourceExhausted,
DataFusionError::Plan(_) => tonic::Code::InvalidArgument,
DataFusionError::NotImplemented(_) => tonic::Code::Unimplemented,
_ => tonic::Code::Internal,
}
}
| Self::ListingTagValues { source, .. } => datafusion_error_to_tonic_code(&source),
Self::ConvertingPredicate { .. }
| Self::ConvertingReadGroupAggregate { .. }
| Self::ConvertingReadGroupType { .. }
@ -1645,15 +1632,15 @@ mod tests {
fixture: &Fixture,
path: &'static str,
status: &'static str,
count: u64,
expected: u64,
) {
let metric = fixture
let metrics = fixture
.test_storage
.metric_registry
.get_instrument::<Metric<U64Counter>>("grpc_requests")
.unwrap();
let observation = metric
let observation = metrics
.get_observer(&Attributes::from([
(
"path",
@ -1664,7 +1651,11 @@ mod tests {
.unwrap()
.fetch();
assert_eq!(observation, count);
assert_eq!(
observation, expected,
"\n\npath: {}\nstatus:{}\nobservation:{}\nexpected:{}\n\nAll metrics:\n\n{:#?}",
path, status, observation, expected, metrics
);
}
#[tokio::test]
@ -2178,8 +2169,11 @@ mod tests {
"Error converting tag_key to UTF-8 in tag_values request"
);
grpc_request_metric_has_count(&fixture, "TagValues", "client_error", 1);
// error from backend error
grpc_request_metric_has_count(&fixture, "TagValues", "server_error", 1);
// error from bad utf8
grpc_request_metric_has_count(&fixture, "TagValues", "client_error", 1);
}
#[tokio::test]