Optimize the performance of filter by JSON field (#24268)

- Construct the JSON pointer only once per expression, instead of rebuilding it for each row
- Avoid copying the nested path for each row

Signed-off-by: yah01 <yang.cen@zilliz.com>
pull/24289/head
yah01 2023-05-22 00:47:25 +08:00 committed by GitHub
parent 3f96c335bb
commit ceda0ed598
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 126 additions and 123 deletions

View File

@ -109,25 +109,24 @@ class Json {
}
bool
exist(std::vector<std::string> nested_path) const {
exist(std::string_view pointer) const {
return doc().at_pointer(pointer).error() == simdjson::SUCCESS;
}
static std::string
pointer(std::vector<std::string> nested_path) {
std::for_each(
nested_path.begin(), nested_path.end(), [](std::string& key) {
boost::replace_all(key, "~", "~0");
boost::replace_all(key, "/", "~1");
});
auto pointer = "/" + boost::algorithm::join(nested_path, "/");
return doc().at_pointer(pointer).error() == simdjson::SUCCESS;
return pointer;
}
template <typename T>
value_result<T>
at(std::vector<std::string> nested_path) const {
std::for_each(
nested_path.begin(), nested_path.end(), [](std::string& key) {
boost::replace_all(key, "~", "~0");
boost::replace_all(key, "/", "~1");
});
auto pointer = "/" + boost::algorithm::join(nested_path, "/");
at(std::string_view pointer) const {
return doc().at_pointer(pointer).get<T>();
}

View File

@ -359,55 +359,53 @@ ExecExprVisitor::ExecUnaryRangeVisitorDispatcher(UnaryRangeExpr& expr_raw)
auto field_id = expr.column_.field_id;
switch (op) {
case OpType::Equal: {
auto index_func = [val](Index* index) {
return index->In(1, &val);
};
auto elem_func = [val](T x) { return (x == val); };
auto index_func = [&](Index* index) { return index->In(1, &val); };
auto elem_func = [&](T x) { return (x == val); };
return ExecRangeVisitorImpl<T>(field_id, index_func, elem_func);
}
case OpType::NotEqual: {
auto index_func = [val](Index* index) {
auto index_func = [&](Index* index) {
return index->NotIn(1, &val);
};
auto elem_func = [val](T x) { return (x != val); };
auto elem_func = [&](T x) { return (x != val); };
return ExecRangeVisitorImpl<T>(field_id, index_func, elem_func);
}
case OpType::GreaterEqual: {
auto index_func = [val](Index* index) {
auto index_func = [&](Index* index) {
return index->Range(val, OpType::GreaterEqual);
};
auto elem_func = [val](T x) { return (x >= val); };
auto elem_func = [&](T x) { return (x >= val); };
return ExecRangeVisitorImpl<T>(field_id, index_func, elem_func);
}
case OpType::GreaterThan: {
auto index_func = [val](Index* index) {
auto index_func = [&](Index* index) {
return index->Range(val, OpType::GreaterThan);
};
auto elem_func = [val](T x) { return (x > val); };
auto elem_func = [&](T x) { return (x > val); };
return ExecRangeVisitorImpl<T>(field_id, index_func, elem_func);
}
case OpType::LessEqual: {
auto index_func = [val](Index* index) {
auto index_func = [&](Index* index) {
return index->Range(val, OpType::LessEqual);
};
auto elem_func = [val](T x) { return (x <= val); };
auto elem_func = [&](T x) { return (x <= val); };
return ExecRangeVisitorImpl<T>(field_id, index_func, elem_func);
}
case OpType::LessThan: {
auto index_func = [val](Index* index) {
auto index_func = [&](Index* index) {
return index->Range(val, OpType::LessThan);
};
auto elem_func = [val](T x) { return (x < val); };
auto elem_func = [&](T x) { return (x < val); };
return ExecRangeVisitorImpl<T>(field_id, index_func, elem_func);
}
case OpType::PrefixMatch: {
auto index_func = [val](Index* index) {
auto index_func = [&](Index* index) {
auto dataset = std::make_unique<Dataset>();
dataset->Set(milvus::index::OPERATOR_TYPE, OpType::PrefixMatch);
dataset->Set(milvus::index::PREFIX_VALUE, val);
return index->Query(std::move(dataset));
};
auto elem_func = [val, op](T x) { return Match(x, val, op); };
auto elem_func = [&](T x) { return Match(x, val, op); };
return ExecRangeVisitorImpl<T>(field_id, index_func, elem_func);
}
// TODO: PostfixMatch
@ -427,7 +425,7 @@ ExecExprVisitor::ExecUnaryRangeVisitorDispatcherJson(UnaryRangeExpr& expr_raw)
auto op = expr.op_type_;
auto val = expr.value_;
auto& nested_path = expr.column_.nested_path;
auto pointer = milvus::Json::pointer(std::move(expr.column_.nested_path));
auto field_id = expr.column_.field_id;
auto index_func = [=](Index* index) { return TargetBitmap{}; };
using GetType =
@ -435,77 +433,77 @@ ExecExprVisitor::ExecUnaryRangeVisitorDispatcherJson(UnaryRangeExpr& expr_raw)
std::string_view,
ExprValueType>;
#define UnaryRangeJSONCompare(cmp) \
do { \
auto x = json.template at<GetType>(nested_path); \
if (x.error()) { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
auto x = json.template at<double>(nested_path); \
return !x.error() && (cmp); \
} \
return false; \
} \
return (cmp); \
#define UnaryRangeJSONCompare(cmp) \
do { \
auto x = json.template at<GetType>(pointer); \
if (x.error()) { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
auto x = json.template at<double>(pointer); \
return !x.error() && (cmp); \
} \
return false; \
} \
return (cmp); \
} while (false)
#define UnaryRangeJSONCompareNotEqual(cmp) \
do { \
auto x = json.template at<GetType>(nested_path); \
if (x.error()) { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
auto x = json.template at<double>(nested_path); \
return x.error() || (cmp); \
} \
return true; \
} \
return (cmp); \
#define UnaryRangeJSONCompareNotEqual(cmp) \
do { \
auto x = json.template at<GetType>(pointer); \
if (x.error()) { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
auto x = json.template at<double>(pointer); \
return x.error() || (cmp); \
} \
return true; \
} \
return (cmp); \
} while (false)
switch (op) {
case OpType::Equal: {
auto elem_func = [val, nested_path](const milvus::Json& json) {
auto elem_func = [&](const milvus::Json& json) {
UnaryRangeJSONCompare(x.value() == val);
};
return ExecRangeVisitorImpl<milvus::Json>(
field_id, index_func, elem_func);
}
case OpType::NotEqual: {
auto elem_func = [val, nested_path](const milvus::Json& json) {
auto elem_func = [&](const milvus::Json& json) {
UnaryRangeJSONCompareNotEqual(x.value() != val);
};
return ExecRangeVisitorImpl<milvus::Json>(
field_id, index_func, elem_func);
}
case OpType::GreaterEqual: {
auto elem_func = [val, nested_path](const milvus::Json& json) {
auto elem_func = [&](const milvus::Json& json) {
UnaryRangeJSONCompare(x.value() >= val);
};
return ExecRangeVisitorImpl<milvus::Json>(
field_id, index_func, elem_func);
}
case OpType::GreaterThan: {
auto elem_func = [val, nested_path](const milvus::Json& json) {
auto elem_func = [&](const milvus::Json& json) {
UnaryRangeJSONCompare(x.value() > val);
};
return ExecRangeVisitorImpl<milvus::Json>(
field_id, index_func, elem_func);
}
case OpType::LessEqual: {
auto elem_func = [val, nested_path](const milvus::Json& json) {
auto elem_func = [&](const milvus::Json& json) {
UnaryRangeJSONCompare(x.value() <= val);
};
return ExecRangeVisitorImpl<milvus::Json>(
field_id, index_func, elem_func);
}
case OpType::LessThan: {
auto elem_func = [val, nested_path](const milvus::Json& json) {
auto elem_func = [&](const milvus::Json& json) {
UnaryRangeJSONCompare(x.value() < val);
};
return ExecRangeVisitorImpl<milvus::Json>(
field_id, index_func, elem_func);
}
case OpType::PrefixMatch: {
auto elem_func = [val, op, nested_path](const milvus::Json& json) {
auto elem_func = [&](const milvus::Json& json) {
UnaryRangeJSONCompare(Match(ExprValueType(x.value()), val, op));
};
return ExecRangeVisitorImpl<milvus::Json>(
@ -692,32 +690,32 @@ ExecExprVisitor::ExecBinaryArithOpEvalRangeVisitorDispatcherJson(
auto right_operand = expr.right_operand_;
auto op = expr.op_type_;
auto val = expr.value_;
auto& nested_path = expr.column_.nested_path;
auto pointer = milvus::Json::pointer(std::move(expr.column_.nested_path));
#define BinaryArithRangeJSONCompare(cmp) \
do { \
auto x = json.template at<GetType>(nested_path); \
if (x.error()) { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
auto x = json.template at<double>(nested_path); \
return !x.error() && (cmp); \
} \
return false; \
} \
return (cmp); \
#define BinaryArithRangeJSONCompare(cmp) \
do { \
auto x = json.template at<GetType>(pointer); \
if (x.error()) { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
auto x = json.template at<double>(pointer); \
return !x.error() && (cmp); \
} \
return false; \
} \
return (cmp); \
} while (false)
#define BinaryArithRangeJSONCompareNotEqual(cmp) \
do { \
auto x = json.template at<GetType>(nested_path); \
if (x.error()) { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
auto x = json.template at<double>(nested_path); \
return x.error() || (cmp); \
} \
return true; \
} \
return (cmp); \
#define BinaryArithRangeJSONCompareNotEqual(cmp) \
do { \
auto x = json.template at<GetType>(pointer); \
if (x.error()) { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
auto x = json.template at<double>(pointer); \
return x.error() || (cmp); \
} \
return true; \
} \
return (cmp); \
} while (false)
switch (op) {
@ -918,26 +916,26 @@ ExecExprVisitor::ExecBinaryRangeVisitorDispatcherJson(BinaryRangeExpr& expr_raw)
bool upper_inclusive = expr.upper_inclusive_;
ExprValueType val1 = expr.lower_value_;
ExprValueType val2 = expr.upper_value_;
auto& nested_path = expr.column_.nested_path;
auto pointer = milvus::Json::pointer(std::move(expr.column_.nested_path));
// no json index now
auto index_func = [=](Index* index) { return TargetBitmap{}; };
#define BinaryRangeJSONCompare(cmp) \
do { \
auto x = json.template at<GetType>(expr.column_.nested_path); \
if (x.error()) { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
auto x = json.template at<double>(expr.column_.nested_path); \
if (!x.error()) { \
auto value = x.value(); \
return (cmp); \
} \
} \
return false; \
} \
auto value = x.value(); \
return (cmp); \
#define BinaryRangeJSONCompare(cmp) \
do { \
auto x = json.template at<GetType>(pointer); \
if (x.error()) { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
auto x = json.template at<double>(pointer); \
if (!x.error()) { \
auto value = x.value(); \
return (cmp); \
} \
} \
return false; \
} \
auto value = x.value(); \
return (cmp); \
} while (false)
if (lower_inclusive && upper_inclusive) {
@ -1730,7 +1728,7 @@ ExecExprVisitor::ExecTermVisitorImplTemplateJson(TermExpr& expr_raw)
-> BitsetType {
using Index = index::ScalarIndex<milvus::Json>;
auto& expr = static_cast<TermExprImpl<ExprValueType>&>(expr_raw);
auto& nested_path = expr.column_.nested_path;
auto pointer = milvus::Json::pointer(std::move(expr.column_.nested_path));
auto index_func = [=](Index* index) { return TargetBitmap{}; };
std::unordered_set<ExprValueType> term_set(expr.terms_.begin(),
@ -1742,12 +1740,12 @@ ExecExprVisitor::ExecTermVisitorImplTemplateJson(TermExpr& expr_raw)
expr.column_.field_id, index_func, elem_func);
}
auto elem_func = [&term_set, nested_path](const milvus::Json& json) {
auto elem_func = [&term_set, &pointer](const milvus::Json& json) {
using GetType =
std::conditional_t<std::is_same_v<ExprValueType, std::string>,
std::string_view,
ExprValueType>;
auto x = json.template at<GetType>(nested_path);
auto x = json.template at<GetType>(pointer);
if (x.error()) {
return false;
}
@ -1840,13 +1838,13 @@ ExecExprVisitor::visit(ExistsExpr& expr) {
AssertInfo(expr.column_.data_type == field_meta.get_data_type(),
"[ExecExprVisitor]DataType of expr isn't field_meta data type");
BitsetType res;
auto& nested_path = expr.column_.nested_path;
auto pointer = milvus::Json::pointer(std::move(expr.column_.nested_path));
switch (expr.column_.data_type) {
case DataType::JSON: {
using Index = index::ScalarIndex<milvus::Json>;
auto index_func = [=](Index* index) { return TargetBitmap{}; };
auto elem_func = [nested_path](const milvus::Json& json) {
auto x = json.exist(nested_path);
auto index_func = [&](Index* index) { return TargetBitmap{}; };
auto elem_func = [&](const milvus::Json& json) {
auto x = json.exist(pointer);
return x;
};
res = ExecRangeVisitorImpl<milvus::Json>(

View File

@ -412,6 +412,7 @@ TEST(Expr, TestBinaryRangeJSON) {
}
return lower <= value && value <= upper;
};
auto pointer = milvus::Json::pointer(testcase.nested_path);
RetrievePlanNode plan;
plan.predicate_ = std::make_unique<BinaryRangeExprImpl<int64_t>>(
ColumnInfo(json_fid, DataType::JSON, testcase.nested_path),
@ -428,7 +429,7 @@ TEST(Expr, TestBinaryRangeJSON) {
if (testcase.nested_path[0] == "int") {
auto val = milvus::Json(simdjson::padded_string(json_col[i]))
.template at<int64_t>(testcase.nested_path)
.template at<int64_t>(pointer)
.value();
auto ref = check(val);
ASSERT_EQ(ans, ref)
@ -436,7 +437,7 @@ TEST(Expr, TestBinaryRangeJSON) {
<< testcase.upper_inclusive << testcase.upper;
} else {
auto val = milvus::Json(simdjson::padded_string(json_col[i]))
.template at<double>(testcase.nested_path)
.template at<double>(pointer)
.value();
auto ref = check(val);
ASSERT_EQ(ans, ref)
@ -490,6 +491,7 @@ TEST(Expr, TestExistsJson) {
for (auto testcase : testcases) {
auto check = [&](bool value) { return value; };
RetrievePlanNode plan;
auto pointer = milvus::Json::pointer(testcase.nested_path);
plan.predicate_ = std::make_unique<ExistsExprImpl>(
ColumnInfo(json_fid, DataType::JSON, testcase.nested_path));
auto final = visitor.call_child(*plan.predicate_.value());
@ -498,7 +500,7 @@ TEST(Expr, TestExistsJson) {
for (int i = 0; i < N * num_iters; ++i) {
auto ans = final[i];
auto val = milvus::Json(simdjson::padded_string(json_col[i]))
.exist(testcase.nested_path);
.exist(pointer);
auto ref = check(val);
ASSERT_EQ(ans, ref);
}
@ -593,6 +595,7 @@ TEST(Expr, TestUnaryRangeJson) {
}
RetrievePlanNode plan;
auto pointer = milvus::Json::pointer(testcase.nested_path);
plan.predicate_ = std::make_unique<UnaryRangeExprImpl<int64_t>>(
ColumnInfo(json_fid, DataType::JSON, testcase.nested_path),
op,
@ -606,14 +609,14 @@ TEST(Expr, TestUnaryRangeJson) {
if (testcase.nested_path[0] == "int") {
auto val =
milvus::Json(simdjson::padded_string(json_col[i]))
.template at<int64_t>(testcase.nested_path)
.template at<int64_t>(pointer)
.value();
auto ref = f(val);
ASSERT_EQ(ans, ref);
} else {
auto val =
milvus::Json(simdjson::padded_string(json_col[i]))
.template at<double>(testcase.nested_path)
.template at<double>(pointer)
.value();
auto ref = f(val);
ASSERT_EQ(ans, ref);
@ -671,6 +674,7 @@ TEST(Expr, TestTermJson) {
return term_set.find(value) != term_set.end();
};
RetrievePlanNode plan;
auto pointer = milvus::Json::pointer(testcase.nested_path);
plan.predicate_ = std::make_unique<TermExprImpl<int64_t>>(
ColumnInfo(json_fid, DataType::JSON, testcase.nested_path),
testcase.term,
@ -681,7 +685,7 @@ TEST(Expr, TestTermJson) {
for (int i = 0; i < N * num_iters; ++i) {
auto ans = final[i];
auto val = milvus::Json(simdjson::padded_string(json_col[i]))
.template at<int64_t>(testcase.nested_path)
.template at<int64_t>(pointer)
.value();
auto ref = check(val);
ASSERT_EQ(ans, ref);
@ -1806,6 +1810,7 @@ TEST(Expr, TestBinaryArithOpEvalRangeJSON) {
return value + testcase.right_operand != testcase.value;
};
RetrievePlanNode plan;
auto pointer = milvus::Json::pointer(testcase.nested_path);
plan.predicate_ =
std::make_unique<BinaryArithOpEvalRangeExprImpl<int64_t>>(
ColumnInfo(json_fid, DataType::JSON, testcase.nested_path),
@ -1822,13 +1827,13 @@ TEST(Expr, TestBinaryArithOpEvalRangeJSON) {
if (testcase.nested_path[0] == "int") {
auto val = milvus::Json(simdjson::padded_string(json_col[i]))
.template at<int64_t>(testcase.nested_path)
.template at<int64_t>(pointer)
.value();
auto ref = check(val);
ASSERT_EQ(ans, ref) << testcase.value << " " << val;
} else {
auto val = milvus::Json(simdjson::padded_string(json_col[i]))
.template at<double>(testcase.nested_path)
.template at<double>(pointer)
.value();
auto ref = check(val);
ASSERT_EQ(ans, ref) << testcase.value << " " << val;
@ -1892,6 +1897,7 @@ TEST(Expr, TestBinaryArithOpEvalRangeJSONFloat) {
return value + testcase.right_operand != testcase.value;
};
RetrievePlanNode plan;
auto pointer = milvus::Json::pointer(testcase.nested_path);
plan.predicate_ =
std::make_unique<BinaryArithOpEvalRangeExprImpl<double>>(
ColumnInfo(json_fid, DataType::JSON, testcase.nested_path),
@ -1907,7 +1913,7 @@ TEST(Expr, TestBinaryArithOpEvalRangeJSONFloat) {
auto ans = final[i];
auto val = milvus::Json(simdjson::padded_string(json_col[i]))
.template at<double>(testcase.nested_path)
.template at<double>(pointer)
.value();
auto ref = check(val);
ASSERT_EQ(ans, ref) << testcase.value << " " << val;
@ -2536,25 +2542,25 @@ TEST(Expr, TestUnaryRangeWithJSON) {
auto ans = final[i];
if (dtype == DataType::BOOL) {
auto val = milvus::Json(simdjson::padded_string(json_col[i]))
.template at<bool>({"bool"})
.template at<bool>("/bool")
.value();
auto ref = ref_func(val);
ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val;
} else if (dtype == DataType::INT64) {
auto val = milvus::Json(simdjson::padded_string(json_col[i]))
.template at<int64_t>({"int"})
.template at<int64_t>("/int")
.value();
auto ref = ref_func(val);
ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val;
} else if (dtype == DataType::DOUBLE) {
auto val = milvus::Json(simdjson::padded_string(json_col[i]))
.template at<double>({"double"})
.template at<double>("/double")
.value();
auto ref = ref_func(val);
ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val;
} else if (dtype == DataType::STRING) {
auto val = milvus::Json(simdjson::padded_string(json_col[i]))
.template at<std::string_view>({"string"})
.template at<std::string_view>("/string")
.value();
auto ref = ref_func(val);
ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val;
@ -2713,25 +2719,25 @@ TEST(Expr, TestTermWithJSON) {
auto ans = final[i];
if (dtype == DataType::BOOL) {
auto val = milvus::Json(simdjson::padded_string(json_col[i]))
.template at<bool>({"bool"})
.template at<bool>("/bool")
.value();
auto ref = ref_func(val);
ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val;
} else if (dtype == DataType::INT64) {
auto val = milvus::Json(simdjson::padded_string(json_col[i]))
.template at<int64_t>({"int"})
.template at<int64_t>("/int")
.value();
auto ref = ref_func(val);
ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val;
} else if (dtype == DataType::DOUBLE) {
auto val = milvus::Json(simdjson::padded_string(json_col[i]))
.template at<double>({"double"})
.template at<double>("/double")
.value();
auto ref = ref_func(val);
ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val;
} else if (dtype == DataType::STRING) {
auto val = milvus::Json(simdjson::padded_string(json_col[i]))
.template at<std::string_view>({"string"})
.template at<std::string_view>("/string")
.value();
auto ref = ref_func(val);
ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val;
@ -2864,27 +2870,27 @@ TEST(Expr, TestExistsWithJSON) {
auto ans = final[i];
if (dtype == DataType::BOOL) {
auto val = milvus::Json(simdjson::padded_string(json_col[i]))
.exist({"bool"});
.exist("/bool");
auto ref = ref_func(val);
ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val;
} else if (dtype == DataType::INT64) {
auto val = milvus::Json(simdjson::padded_string(json_col[i]))
.exist({"int"});
.exist("/int");
auto ref = ref_func(val);
ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val;
} else if (dtype == DataType::DOUBLE) {
auto val = milvus::Json(simdjson::padded_string(json_col[i]))
.exist({"double"});
.exist("/double");
auto ref = ref_func(val);
ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val;
} else if (dtype == DataType::STRING) {
auto val = milvus::Json(simdjson::padded_string(json_col[i]))
.exist({"string"});
.exist("/string");
auto ref = ref_func(val);
ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val;
} else if (dtype == DataType::VARCHAR) {
auto val = milvus::Json(simdjson::padded_string(json_col[i]))
.exist({"varchar"});
.exist("/varchar");
auto ref = ref_func(val);
ASSERT_EQ(ans, ref) << clause << "@" << i << "!!" << val;
} else {