fix: Align brute force search with json index for exists expr (#41056)

issue: #35528 
pr: #41004

Signed-off-by: sunby <sunbingyi1992@gmail.com>
pull/41071/head
Bingyi Sun 2025-04-02 18:28:23 +08:00 committed by GitHub
parent 5017e9ab8c
commit 240f766511
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 72 additions and 66 deletions

View File

@ -201,7 +201,9 @@ class Json {
// Reports whether `pointer` (a JSON pointer such as "/a") resolves to a
// non-null value in this document.
//
// Returns false when the lookup itself fails (at_pointer reports an error)
// and also when the path resolves to an explicit JSON null, so that
// `{"a": null}` is treated the same as a document without key "a".
bool
exist(std::string_view pointer) const {
    // Keep the document in a named local so the lookup result is inspected
    // while the document it was obtained from is still in scope.
    auto doc = this->doc();
    auto res = doc.at_pointer(pointer);
    // Check the error first: is_null() is only consulted for a successful
    // lookup.
    return res.error() == simdjson::SUCCESS && !res.is_null();
}
// construct JSON pointer with provided path

View File

@ -115,7 +115,8 @@ JsonInvertedIndex<T>::build_index_for_json(
auto exists = path_exists(json_column->dom_doc(), tokens);
if (!exists ||
json_column->doc().at_pointer(nested_path_).is_null()) {
nested_path_ != "" &&
json_column->doc().at_pointer(nested_path_).is_null()) {
error_recorder_.Record(
*json_column, nested_path_, simdjson::NO_SUCH_FIELD);
this->null_offset_.push_back(offset);

View File

@ -18,6 +18,7 @@
#include <regex>
#include <string>
#include <string_view>
#include <tuple>
#include <type_traits>
#include <vector>
#include <chrono>
@ -16712,22 +16713,40 @@ TEST(JsonIndexTest, TestJsonNotEqualExpr) {
EXPECT_EQ(final.count(), 2 * json_strs.size() - 4);
}
TEST(JsonIndexTest, TestExistsExpr) {
std::unordered_map<std::string, bool> json_strs_match = {
{R"({"a": 1.0})", true},
{R"({"a": "abc"})", true},
{R"({"a": 3.0})", true},
{R"({"a": true})", true},
{R"({"a": {"b": 1}})", true},
{R"({"a": []})", true},
{R"({"a": ["a", "b"]})", true},
{R"({"a": null})", false}, // exists null
{R"(1)", false},
{R"("abc")", false},
{R"(1.0)", false},
{R"(true)", false},
{R"([1, 2, 3])", false},
{R"({"a": 1, "b": 2})", true}};
class JsonIndexExistsTest : public ::testing::TestWithParam<std::string> {};
INSTANTIATE_TEST_SUITE_P(JsonIndexExistsTestParams,
JsonIndexExistsTest,
::testing::Values("/a", ""));
TEST_P(JsonIndexExistsTest, TestExistsExpr) {
std::vector<std::string> json_strs = {
R"({"a": 1.0})",
R"({"a": "abc"})",
R"({"a": 3.0})",
R"({"a": true})",
R"({"a": {"b": 1}})",
R"({"a": []})",
R"({"a": ["a", "b"]})",
R"({"a": null})", // exists null
R"(1)",
R"("abc")",
R"(1.0)",
R"(true)",
R"([1, 2, 3])",
R"({"a": 1, "b": 2})",
R"({})",
R"(null)",
};
// bool: exists or not
std::vector<std::tuple<std::vector<std::string>, bool, uint32_t>>
test_cases = {
{{"a"}, true, 0b1111111000000100},
{{"a", "b"}, true, 0b0000100000000000},
};
auto json_index_path = GetParam();
auto schema = std::make_shared<Schema>();
auto vec_fid = schema->AddDebugField(
@ -16747,7 +16766,7 @@ TEST(JsonIndexTest, TestExistsExpr) {
auto inv_index = index::IndexFactory::GetInstance().CreateJsonIndex(
index::INVERTED_INDEX_TYPE,
JsonCastType::DOUBLE,
"/a",
json_index_path,
file_manager_ctx);
using json_index_type = index::JsonInvertedIndex<double>;
@ -16757,13 +16776,8 @@ TEST(JsonIndexTest, TestExistsExpr) {
auto json_field =
std::make_shared<FieldData<milvus::Json>>(DataType::JSON, false);
std::vector<milvus::Json> jsons;
BitsetType expect;
expect.resize(json_strs_match.size());
int i = 0;
for (auto& [json_str, match] : json_strs_match) {
for (auto& json_str : json_strs) {
jsons.push_back(milvus::Json(simdjson::padded_string(json_str)));
expect.set(i, match);
i++;
}
json_field->add_json_data(jsons);
@ -16774,14 +16788,36 @@ TEST(JsonIndexTest, TestExistsExpr) {
load_index_info.field_id = json_fid.get();
load_index_info.field_type = DataType::JSON;
load_index_info.index = std::move(json_index);
load_index_info.index_params = {{JSON_PATH, "/a"}};
load_index_info.index_params = {{JSON_PATH, json_index_path}};
seg->LoadIndex(load_index_info);
auto exists_expr = std::make_shared<expr::ExistsExpr>(
expr::ColumnInfo(json_fid, DataType::JSON, {"a"}, true));
auto plan = std::make_shared<plan::FilterBitsNode>(DEFAULT_PLANNODE_ID,
exists_expr);
auto result = ExecuteQueryExpr(
plan, seg.get(), json_strs_match.size(), MAX_TIMESTAMP);
EXPECT_TRUE(result == expect);
auto json_field_data_info =
FieldDataInfo(json_fid.get(), json_strs.size(), {json_field});
seg->LoadFieldData(json_fid, json_field_data_info);
for (auto& [nested_path, exists, expect] : test_cases) {
BitsetType expect_res;
expect_res.resize(json_strs.size());
for (int i = json_strs.size() - 1; expect > 0; i--) {
expect_res.set(i, (expect & 1) != 0);
expect >>= 1;
}
std::shared_ptr<expr::ITypeFilterExpr> exists_expr;
if (exists) {
exists_expr = std::make_shared<expr::ExistsExpr>(
expr::ColumnInfo(json_fid, DataType::JSON, nested_path, true));
} else {
auto child_expr = std::make_shared<expr::ExistsExpr>(
expr::ColumnInfo(json_fid, DataType::JSON, nested_path, true));
exists_expr = std::make_shared<expr::LogicalUnaryExpr>(
expr::LogicalUnaryExpr::OpType::LogicalNot, child_expr);
}
auto plan = std::make_shared<plan::FilterBitsNode>(DEFAULT_PLANNODE_ID,
exists_expr);
auto result =
ExecuteQueryExpr(plan, seg.get(), json_strs.size(), MAX_TIMESTAMP);
EXPECT_TRUE(result == expect_res);
}
}

View File

@ -19,39 +19,6 @@
using namespace milvus;
using namespace milvus::index;
// Smoke test: builds a JSON inverted index on the path "/hello" over rows
// where one row ({"world": 2}) does not contain that path. There are no
// explicit assertions; the test passes when build, finish and reader
// creation all complete.
TEST(JsonIndexTest, TestBuildNonExistJsonPath) {
    const std::string json_path = "/hello";

    auto schema = std::make_shared<Schema>();
    auto json_fid = schema->AddDebugField("json", DataType::JSON);

    // Describe the JSON field to the file manager context used by the index.
    auto file_manager_ctx = storage::FileManagerContext();
    file_manager_ctx.fieldDataMeta.field_schema.set_data_type(
        milvus::proto::schema::JSON);
    file_manager_ctx.fieldDataMeta.field_schema.set_fieldid(json_fid.get());
    file_manager_ctx.fieldDataMeta.field_id = json_fid.get();

    auto inv_index = index::IndexFactory::GetInstance().CreateJsonIndex(
        index::INVERTED_INDEX_TYPE,
        JsonCastType::DOUBLE,
        json_path,
        file_manager_ctx);
    // The index is created with JsonCastType::DOUBLE, so downcast to the
    // matching JsonInvertedIndex<double> instantiation.
    // NOTE(review): assumes DOUBLE maps to JsonInvertedIndex<double>, as the
    // exists-expr test in this change does — confirm against CreateJsonIndex.
    using json_index_type = JsonInvertedIndex<double>;
    auto json_index = std::unique_ptr<json_index_type>(
        static_cast<json_index_type*>(inv_index.release()));

    // Second row has no "hello" key, i.e. the indexed path is absent.
    const std::vector<std::string> json_raw_data = {R"({"hello": 1})",
                                                    R"({"world": 2})"};

    std::vector<milvus::Json> jsons;
    jsons.reserve(json_raw_data.size());
    for (const auto& raw : json_raw_data) {
        jsons.push_back(milvus::Json(simdjson::padded_string(raw)));
    }

    auto json_field =
        std::make_shared<FieldData<milvus::Json>>(DataType::JSON, false);
    json_field->add_json_data(jsons);

    json_index->BuildWithFieldData({json_field});
    json_index->finish();
    json_index->create_reader();
}
TEST(JsonIndexTest, TestJSONErrRecorder) {
std::vector<std::string> json_raw_data = {
R"(1)",