Merge pull request #2537 from influxdata/crepererum/issue2493c

feat: `Predicate` serialization
pull/24376/head
kodiakhq[bot] 2021-09-15 16:27:02 +00:00 committed by GitHub
commit c4d39f328f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 421 additions and 0 deletions

2
Cargo.lock generated
View File

@ -3150,11 +3150,13 @@ dependencies = [
"data_types",
"datafusion 0.1.0",
"datafusion_util",
"generated_types",
"internal_types",
"observability_deps",
"regex",
"snafu",
"sqlparser",
"test_helpers",
"tokio",
]

View File

@ -39,6 +39,7 @@ fn generate_grpc_types(root: &Path) -> Result<()> {
idpe_path.join("source.proto"),
catalog_path.join("catalog.proto"),
catalog_path.join("parquet_metadata.proto"),
catalog_path.join("predicate.proto"),
management_path.join("database_rules.proto"),
management_path.join("chunk.proto"),
management_path.join("partition.proto"),

View File

@ -0,0 +1,80 @@
syntax = "proto3";
package influxdata.iox.catalog.v1;
// Represents a parsed predicate for evaluation by the InfluxDB IOx query engine.
message Predicate {
// Optional table restriction. If present, restricts the results to only tables these tables.
OptionalStringSet table_names = 1;
// Optional field restriction. If present, restricts the results to only tables which have *at least one* of the
// fields in field_columns.
OptionalStringSet field_columns = 2;
// Optional partition key filter
OptionalString partition_key = 3;
// Optional timestamp range: only rows within this range are included in results. Other rows are excluded.
TimestampRange range = 4;
// Optional arbitrary predicates, represented as list of expressions applied a logical conjunction (aka they are
// 'AND'ed together). Only rows that evaluate to TRUE for all these expressions should be returned. Other rows are
// excluded from the results.
repeated Expr exprs = 5;
}
// A optional string set.
//
// This is used instead of a `repeated string` to differenctiate between "empty set" and "none".
message OptionalStringSet {
repeated string values = 1;
}
// An optional string.
message OptionalString {
string value = 1;
}
// Specifies a continuous range of nanosecond timestamps.
message TimestampRange {
// Start defines the inclusive lower bound.
int64 start = 1;
// End defines the exclusive upper bound.
int64 end = 2;
}
// Single expression to be used as parts of a predicate.
//
// Only very simple expression of the type `<columng> <op> <scalar>` are supported.
message Expr {
// Column (w/o table name).
string column = 1;
// Operator.
Op op = 2;
// Scalar value.
Scalar scalar = 3;
}
// Binary operator that can be evaluated on a column and a scalar value.
enum Op {
// Unspecified operator, will result in an error.
OP_UNSPECIFIED = 0;
// Strict equality (`=`).
OP_EQ = 1;
// Inequality (`!=`).
OP_NE = 2;
}
// Scalar value of a certain type.
message Scalar {
oneof value {
bool value_bool = 1;
int64 value_i64 = 2;
double value_f64 = 3;
string value_string = 4;
}
}

View File

@ -9,6 +9,7 @@ chrono = "0.4"
data_types = { path = "../data_types" }
datafusion = { path = "../datafusion" }
datafusion_util = { path = "../datafusion_util" }
generated_types = { path = "../generated_types" }
internal_types = { path = "../internal_types" }
observability_deps = { path = "../observability_deps" }
regex = "1"
@ -16,4 +17,5 @@ snafu = "0.6.9"
sqlparser = "0.10.0"
[dev-dependencies]
test_helpers = { path = "../test_helpers" }
tokio = { version = "1.11", features = ["macros"] }

View File

@ -1,2 +1,3 @@
pub mod predicate;
pub mod regex;
pub mod serialize;

335
predicate/src/serialize.rs Normal file
View File

@ -0,0 +1,335 @@
//! Code to serialize and deserialize certain expressions.
//!
//! Note that [Ballista] also provides a serialization using [Protocol Buffers 3]. However the protocol is meant as a
//! communication channel between workers and clients of Ballista, not for long term preservation. For IOx we need a
//! more stable solution. Luckily we only need to support a very small subset of expression.
//!
//! [Ballista]: https://github.com/apache/arrow-datafusion/blob/22fcb3d7a68a56afbe12eab9e7d98f7b8de33703/ballista/rust/core/proto/ballista.proto
//! [Protocol Buffers 3]: https://developers.google.com/protocol-buffers/docs/proto3
use std::{collections::BTreeSet, ops::Deref};
use data_types::timestamp::TimestampRange;
use datafusion::{
logical_plan::{Column, Expr, Operator},
scalar::ScalarValue,
};
use generated_types::influxdata::iox::catalog::v1 as proto;
use snafu::{OptionExt, Snafu};
use crate::predicate::Predicate;
#[derive(Debug, Snafu)]
pub enum SerializeError {
#[snafu(display("unsupported expression: {:?}", expr))]
UnsupportedExpression { expr: Expr },
#[snafu(display("unsupported operants: left {:?}; right {:?}", left, right))]
UnsupportedOperants { left: Expr, right: Expr },
#[snafu(display("unsupported scalar value: {:?}", value))]
UnsupportedScalarValue { value: ScalarValue },
#[snafu(display("unsupported operator: {:?}", op))]
UnsupportedOperator { op: Operator },
}
/// Serialize IOx [`Predicate`] to a protobuf object.
pub fn serialize(predicate: &Predicate) -> Result<proto::Predicate, SerializeError> {
let proto_predicate = proto::Predicate {
table_names: serialize_optional_string_set(&predicate.table_names),
field_columns: serialize_optional_string_set(&predicate.field_columns),
partition_key: serialize_optional_string(&predicate.partition_key),
range: serialize_timestamp_range(&predicate.range),
exprs: predicate
.exprs
.iter()
.map(serialize_expr)
.collect::<Result<Vec<proto::Expr>, SerializeError>>()?,
};
Ok(proto_predicate)
}
fn serialize_optional_string_set(
set: &Option<BTreeSet<String>>,
) -> Option<proto::OptionalStringSet> {
set.as_ref().map(|set| proto::OptionalStringSet {
values: set.iter().cloned().collect(),
})
}
fn serialize_optional_string(s: &Option<String>) -> Option<proto::OptionalString> {
s.as_ref()
.map(|s| proto::OptionalString { value: s.clone() })
}
fn serialize_timestamp_range(r: &Option<TimestampRange>) -> Option<proto::TimestampRange> {
r.as_ref().map(|r| proto::TimestampRange {
start: r.start,
end: r.end,
})
}
fn serialize_expr(expr: &Expr) -> Result<proto::Expr, SerializeError> {
match expr {
Expr::BinaryExpr { left, op, right } => {
let (column, scalar) = match (left.deref(), right.deref()) {
// The delete predicate parser currently only supports `<column><op><value>`, not `<value><op><column>`,
// however this could can easily be extended to support the latter case as well.
(Expr::Column(column), Expr::Literal(value)) => {
let column = column.name.clone();
let scalar = serialize_scalar_value(value)?;
(column, scalar)
}
(other_left, other_right) => {
return Err(SerializeError::UnsupportedOperants {
left: other_left.clone(),
right: other_right.clone(),
});
}
};
let op = serialize_operator(op)?;
let proto_expr = proto::Expr {
column,
op: op.into(),
scalar: Some(scalar),
};
Ok(proto_expr)
}
other => Err(SerializeError::UnsupportedExpression {
expr: other.clone(),
}),
}
}
fn serialize_scalar_value(value: &ScalarValue) -> Result<proto::Scalar, SerializeError> {
// see https://github.com/apache/arrow-datafusion/blob/195b69995db8044ce283d72fb78eb6b74b8842f5/datafusion/src/sql/planner.rs#L274-L295
match value {
ScalarValue::Utf8(Some(value)) => Ok(proto::Scalar {
value: Some(proto::scalar::Value::ValueString(value.clone())),
}),
ScalarValue::Int64(Some(value)) => Ok(proto::Scalar {
value: Some(proto::scalar::Value::ValueI64(*value)),
}),
ScalarValue::Float64(Some(value)) => Ok(proto::Scalar {
value: Some(proto::scalar::Value::ValueF64(*value)),
}),
ScalarValue::Boolean(Some(value)) => Ok(proto::Scalar {
value: Some(proto::scalar::Value::ValueBool(*value)),
}),
other => Err(SerializeError::UnsupportedScalarValue {
value: other.clone(),
}),
}
}
fn serialize_operator(op: &Operator) -> Result<proto::Op, SerializeError> {
match op {
Operator::Eq => Ok(proto::Op::Eq),
Operator::NotEq => Ok(proto::Op::Ne),
other => Err(SerializeError::UnsupportedOperator { op: *other }),
}
}
#[derive(Debug, Snafu)]
pub enum DeserializeError {
#[snafu(display("unspecified operator"))]
UnspecifiedOperator,
#[snafu(display("illegal operator enum value: {}", value))]
IllegalOperatorEnumValue { value: i32 },
#[snafu(display("missing scalar value"))]
MissingScalarValue,
}
/// Deserialize IOx [`Predicate`] from a protobuf object.
pub fn deserialize(
proto_predicate: &proto::Predicate,
table_name: &str,
) -> Result<Predicate, DeserializeError> {
let predicate = Predicate {
table_names: deserialize_optional_string_set(&proto_predicate.table_names),
field_columns: deserialize_optional_string_set(&proto_predicate.field_columns),
partition_key: deserialize_optional_string(&proto_predicate.partition_key),
range: deserialize_timestamp_range(&proto_predicate.range),
exprs: proto_predicate
.exprs
.iter()
.map(|expr| deserialize_expr(expr, table_name))
.collect::<Result<Vec<Expr>, DeserializeError>>()?,
};
Ok(predicate)
}
fn deserialize_optional_string_set(
set: &Option<proto::OptionalStringSet>,
) -> Option<BTreeSet<String>> {
set.as_ref().map(|set| set.values.iter().cloned().collect())
}
fn deserialize_optional_string(s: &Option<proto::OptionalString>) -> Option<String> {
s.as_ref().map(|s| s.value.clone())
}
fn deserialize_timestamp_range(r: &Option<proto::TimestampRange>) -> Option<TimestampRange> {
r.as_ref().map(|r| TimestampRange {
start: r.start,
end: r.end,
})
}
fn deserialize_expr(proto_expr: &proto::Expr, table_name: &str) -> Result<Expr, DeserializeError> {
let column = Column {
relation: Some(table_name.to_string()),
name: proto_expr.column.clone(),
};
let op = deserialize_operator(&proto::Op::from_i32(proto_expr.op).context(
IllegalOperatorEnumValue {
value: proto_expr.op,
},
)?)?;
let value = deserialize_scalar_value(&proto_expr.scalar)?;
let expr = Expr::BinaryExpr {
left: Box::new(Expr::Column(column)),
op,
right: Box::new(Expr::Literal(value)),
};
Ok(expr)
}
fn deserialize_scalar_value(
value: &Option<proto::Scalar>,
) -> Result<ScalarValue, DeserializeError> {
match value
.as_ref()
.context(MissingScalarValue)?
.value
.as_ref()
.context(MissingScalarValue)?
{
proto::scalar::Value::ValueBool(value) => Ok(ScalarValue::Boolean(Some(*value))),
proto::scalar::Value::ValueI64(value) => Ok(ScalarValue::Int64(Some(*value))),
proto::scalar::Value::ValueF64(value) => Ok(ScalarValue::Float64(Some(*value))),
proto::scalar::Value::ValueString(value) => Ok(ScalarValue::Utf8(Some(value.clone()))),
}
}
fn deserialize_operator(op: &proto::Op) -> Result<Operator, DeserializeError> {
match op {
proto::Op::Unspecified => Err(DeserializeError::UnspecifiedOperator),
proto::Op::Eq => Ok(Operator::Eq),
proto::Op::Ne => Ok(Operator::NotEq),
}
}
#[cfg(test)]
mod tests {
use arrow::datatypes::DataType;
use test_helpers::assert_contains;
use crate::predicate::{ParseDeletePredicate, PredicateBuilder};
use super::*;
#[test]
fn test_roundtrip() {
let table_name = "my_table";
let predicate = delete_predicate(table_name);
let proto = serialize(&predicate).unwrap();
let recovered = deserialize(&proto, table_name).unwrap();
assert_eq!(predicate, recovered);
}
#[test]
fn test_fail_serialize_unsupported_expression() {
let table_name = "my_table";
let mut predicate = delete_predicate(table_name);
predicate.exprs.push(Expr::Not(Box::new(Expr::BinaryExpr {
left: Box::new(Expr::Column(Column {
relation: None,
name: "foo".to_string(),
})),
op: Operator::Eq,
right: Box::new(Expr::Literal(ScalarValue::Utf8(Some("x".to_string())))),
})));
let err = serialize(&predicate).unwrap_err();
assert_contains!(err.to_string(), "unsupported expression:");
}
#[test]
fn test_fail_serialize_unsupported_operants() {
let table_name = "my_table";
let mut predicate = delete_predicate(table_name);
predicate.exprs.push(Expr::BinaryExpr {
left: Box::new(Expr::Column(Column {
relation: None,
name: "foo".to_string(),
})),
op: Operator::Eq,
right: Box::new(Expr::Column(Column {
relation: None,
name: "bar".to_string(),
})),
});
let err = serialize(&predicate).unwrap_err();
assert_contains!(err.to_string(), "unsupported operants:");
}
#[test]
fn test_fail_serialize_unsupported_scalar_value() {
let table_name = "my_table";
let mut predicate = delete_predicate(table_name);
predicate.exprs.push(Expr::BinaryExpr {
left: Box::new(Expr::Column(Column {
relation: None,
name: "foo".to_string(),
})),
op: Operator::Eq,
right: Box::new(Expr::Literal(ScalarValue::List(
Some(Box::new(vec![])),
Box::new(DataType::Float64),
))),
});
let err = serialize(&predicate).unwrap_err();
assert_contains!(err.to_string(), "unsupported scalar value:");
}
#[test]
fn test_fail_serialize_unsupported_operator() {
let table_name = "my_table";
let mut predicate = delete_predicate(table_name);
predicate.exprs.push(Expr::BinaryExpr {
left: Box::new(Expr::Column(Column {
relation: None,
name: "foo".to_string(),
})),
op: Operator::Like,
right: Box::new(Expr::Literal(ScalarValue::Utf8(Some("x".to_string())))),
});
let err = serialize(&predicate).unwrap_err();
assert_contains!(err.to_string(), "unsupported operator:");
}
fn delete_predicate(table_name: &str) -> Predicate {
let start_time = "11";
let stop_time = "22";
let predicate = r#"city=Boston and cost!=100 and temp=87.5 and good=true"#;
let parse_delete_pred =
ParseDeletePredicate::try_new(table_name, start_time, stop_time, predicate).unwrap();
let mut del_predicate_builder = PredicateBuilder::new()
.table(table_name)
.timestamp_range(parse_delete_pred.start_time, parse_delete_pred.stop_time);
for expr in parse_delete_pred.predicate {
del_predicate_builder = del_predicate_builder.add_expr(expr);
}
del_predicate_builder.build()
}
}