// Copyright (C) 2019-2020 Zilliz. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software distributed under the License // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express // or implied. See the License for the specific language governing permissions and limitations under the License #include #include #include #include #include #include #include #include #include #include #include #include #include "common/LoadInfo.h" #include "pb/milvus.pb.h" #include "pb/plan.pb.h" #include "query/ExprImpl.h" #include "segcore/Collection.h" #include "segcore/reduce_c.h" #include "segcore/Reduce.h" #include "test_utils/DataGen.h" namespace chrono = std::chrono; using namespace milvus; using namespace milvus::segcore; using namespace knowhere; namespace { const int DIM = 16; const int64_t ROW_COUNT = 100 * 1000; const char* get_default_schema_config() { static std::string conf = R"(name: "default-collection" fields: < fieldID: 100 name: "fakevec" data_type: FloatVector type_params: < key: "dim" value: "16" > index_params: < key: "metric_type" value: "L2" > > fields: < fieldID: 101 name: "age" data_type: Int64 is_primary_key: true >)"; static std::string fake_conf = ""; return conf.c_str(); } std::vector translate_text_plan_to_binary_plan(const char* text_plan) { proto::plan::PlanNode plan_node; auto ok = google::protobuf::TextFormat::ParseFromString(text_plan, &plan_node); AssertInfo(ok, "Failed to parse"); std::string binary_plan; plan_node.SerializeToString(&binary_plan); std::vector ret; ret.resize(binary_plan.size()); std::memcpy(ret.data(), binary_plan.c_str(), binary_plan.size()); return ret; } auto generate_data(int N) { std::vector raw_data; std::vector timestamps; 
std::vector uids; std::default_random_engine e(42); std::normal_distribution<> dis(0.0, 1.0); for (int i = 0; i < N; ++i) { uids.push_back(10 * N + i); timestamps.push_back(0); float vec[DIM]; for (auto& x : vec) { x = dis(e); } raw_data.insert(raw_data.end(), (const char*)std::begin(vec), (const char*)std::end(vec)); int age = e() % 100; raw_data.insert(raw_data.end(), (const char*)&age, ((const char*)&age) + sizeof(age)); } return std::make_tuple(raw_data, timestamps, uids); } std::string generate_query_data(int nq) { namespace ser = milvus::proto::milvus; std::default_random_engine e(67); int dim = DIM; std::normal_distribution dis(0.0, 1.0); ser::PlaceholderGroup raw_group; auto value = raw_group.add_placeholders(); value->set_tag("$0"); value->set_type(ser::PlaceholderType::FloatVector); for (int i = 0; i < nq; ++i) { std::vector vec; for (int d = 0; d < dim; ++d) { vec.push_back(dis(e)); } value->add_values(vec.data(), vec.size() * sizeof(float)); } auto blob = raw_group.SerializeAsString(); return blob; } std::string generate_collection_schema(std::string metric_type, int dim, bool is_binary) { namespace schema = milvus::proto::schema; schema::CollectionSchema collection_schema; collection_schema.set_name("collection_test"); auto vec_field_schema = collection_schema.add_fields(); vec_field_schema->set_name("fakevec"); vec_field_schema->set_fieldid(100); if (is_binary) { vec_field_schema->set_data_type(schema::DataType::BinaryVector); } else { vec_field_schema->set_data_type(schema::DataType::FloatVector); } auto metric_type_param = vec_field_schema->add_index_params(); metric_type_param->set_key("metric_type"); metric_type_param->set_value(metric_type); auto dim_param = vec_field_schema->add_type_params(); dim_param->set_key("dim"); dim_param->set_value(std::to_string(dim)); auto other_field_schema = collection_schema.add_fields(); other_field_schema->set_name("counter"); other_field_schema->set_fieldid(101); 
other_field_schema->set_data_type(schema::DataType::Int64); other_field_schema->set_is_primary_key(true); auto other_field_schema2 = collection_schema.add_fields(); other_field_schema2->set_name("doubleField"); other_field_schema2->set_fieldid(102); other_field_schema2->set_data_type(schema::DataType::Double); std::string schema_string; auto marshal = google::protobuf::TextFormat::PrintToString(collection_schema, &schema_string); assert(marshal == true); return schema_string; } VecIndexPtr generate_index(void* raw_data, knowhere::Config conf, int64_t dim, int64_t topK, int64_t N, std::string index_type) { auto indexing = knowhere::VecIndexFactory::GetInstance().CreateVecIndex(index_type, knowhere::IndexMode::MODE_CPU); auto database = knowhere::GenDataset(N, dim, raw_data); indexing->Train(database, conf); indexing->AddWithoutIds(database, conf); EXPECT_EQ(indexing->Count(), N); EXPECT_EQ(indexing->Dim(), dim); EXPECT_EQ(indexing->Count(), N); EXPECT_EQ(indexing->Dim(), dim); return indexing; } } // namespace TEST(CApiTest, CollectionTest) { auto collection = NewCollection(get_default_schema_config()); DeleteCollection(collection); } TEST(CApiTest, GetCollectionNameTest) { auto collection = NewCollection(get_default_schema_config()); auto name = GetCollectionName(collection); assert(strcmp(name, "default-collection") == 0); DeleteCollection(collection); } TEST(CApiTest, SegmentTest) { auto collection = NewCollection(get_default_schema_config()); auto segment = NewSegment(collection, Growing, -1); DeleteCollection(collection); DeleteSegment(segment); } template std::vector serialize(const Message* msg) { auto l = msg->ByteSize(); std::vector ret(l); auto ok = msg->SerializeToArray(ret.data(), l); assert(ok); return ret; } TEST(CApiTest, InsertTest) { auto c_collection = NewCollection(get_default_schema_config()); auto segment = NewSegment(c_collection, Growing, -1); auto col = (milvus::segcore::Collection*)c_collection; int N = 10000; auto dataset = 
DataGen(col->get_schema(), N); int64_t offset; PreInsert(segment, N, &offset); auto insert_data = serialize(dataset.raw_); auto res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(), insert_data.size()); assert(res.error_code == Success); DeleteCollection(c_collection); DeleteSegment(segment); } TEST(CApiTest, DeleteTest) { auto collection = NewCollection(get_default_schema_config()); auto segment = NewSegment(collection, Growing, -1); std::vector delete_row_ids = {100000, 100001, 100002}; auto ids = std::make_unique(); ids->mutable_int_id()->mutable_data()->Add(delete_row_ids.begin(), delete_row_ids.end()); auto delete_data = serialize(ids.get()); uint64_t delete_timestamps[] = {0, 0, 0}; auto offset = PreDelete(segment, 3); auto del_res = Delete(segment, offset, 3, delete_data.data(), delete_data.size(), delete_timestamps); assert(del_res.error_code == Success); DeleteCollection(collection); DeleteSegment(segment); } TEST(CApiTest, MultiDeleteGrowingSegment) { auto collection = NewCollection(get_default_schema_config()); auto segment = NewSegment(collection, Growing, -1); auto col = (milvus::segcore::Collection*)collection; int N = 10; auto dataset = DataGen(col->get_schema(), N); auto insert_data = serialize(dataset.raw_); // insert, pks= {0, 1, 2, 3, 4, 5, 6, 7, 8, 9} int64_t offset; PreInsert(segment, N, &offset); auto res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(), insert_data.size()); assert(res.error_code == Success); // delete data pks = {1} std::vector delete_pks = {1}; auto ids = std::make_unique(); ids->mutable_int_id()->mutable_data()->Add(delete_pks.begin(), delete_pks.end()); auto delete_data = serialize(ids.get()); std::vector delete_timestamps(1, dataset.timestamps_[N - 1]); offset = PreDelete(segment, 1); auto del_res = Delete(segment, offset, 1, delete_data.data(), delete_data.size(), delete_timestamps.data()); assert(del_res.error_code == 
Success); // retrieve pks = {1} std::vector retrive_pks = {1}; auto schema = ((milvus::segcore::Collection*)collection)->get_schema(); auto plan = std::make_unique(*schema); auto term_expr = std::make_unique>(FieldId(101), DataType::INT64, retrive_pks); plan->plan_node_ = std::make_unique(); plan->plan_node_->predicate_ = std::move(term_expr); std::vector target_field_ids{FieldId(100), FieldId(101)}; plan->field_ids_ = target_field_ids; CRetrieveResult retrieve_result; res = Retrieve(segment, plan.get(), dataset.timestamps_[N - 1], &retrieve_result); ASSERT_EQ(res.error_code, Success); auto query_result = std::make_unique(); auto suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size); ASSERT_TRUE(suc); ASSERT_EQ(query_result->ids().int_id().data().size(), 0); // retrieve pks = {2} retrive_pks = {2}; term_expr = std::make_unique>(FieldId(101), DataType::INT64, retrive_pks); plan->plan_node_->predicate_ = std::move(term_expr); res = Retrieve(segment, plan.get(), dataset.timestamps_[N - 1], &retrieve_result); ASSERT_EQ(res.error_code, Success); suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size); ASSERT_TRUE(suc); ASSERT_EQ(query_result->ids().int_id().data().size(), 1); // delete pks = {2} delete_pks = {2}; ids = std::make_unique(); ids->mutable_int_id()->mutable_data()->Add(delete_pks.begin(), delete_pks.end()); delete_data = serialize(ids.get()); offset = PreDelete(segment, 1); del_res = Delete(segment, offset, 1, delete_data.data(), delete_data.size(), delete_timestamps.data()); assert(del_res.error_code == Success); // retrieve pks in {2} res = Retrieve(segment, plan.get(), dataset.timestamps_[N - 1], &retrieve_result); ASSERT_EQ(res.error_code, Success); suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size); ASSERT_TRUE(suc); ASSERT_EQ(query_result->ids().int_id().data().size(), 0); DeleteRetrievePlan(plan.release()); 
DeleteRetrieveResult(&retrieve_result); DeleteCollection(collection); DeleteSegment(segment); } TEST(CApiTest, MultiDeleteSealedSegment) { auto collection = NewCollection(get_default_schema_config()); auto segment = NewSegment(collection, Sealed, -1); auto col = (milvus::segcore::Collection*)collection; int N = 10; auto dataset = DataGen(col->get_schema(), N); // load field data for (auto& [field_id, field_meta] : col->get_schema()->get_fields()) { auto array = dataset.get_col(field_id); auto data = serialize(array.get()); auto load_info = CLoadFieldDataInfo{field_id.get(), data.data(), data.size(), N}; auto res = LoadFieldData(segment, load_info); assert(res.error_code == Success); auto count = GetRowCount(segment); assert(count == N); } // load timestamps FieldMeta ts_field_meta(FieldName("Timestamp"), TimestampFieldID, DataType::INT64); auto ts_array = CreateScalarDataArrayFrom(dataset.timestamps_.data(), N, ts_field_meta); auto ts_data = serialize(ts_array.get()); auto load_info = CLoadFieldDataInfo{TimestampFieldID.get(), ts_data.data(), ts_data.size(), N}; auto res = LoadFieldData(segment, load_info); assert(res.error_code == Success); auto count = GetRowCount(segment); assert(count == N); // load rowID FieldMeta row_id_field_meta(FieldName("RowID"), RowFieldID, DataType::INT64); auto row_id_array = CreateScalarDataArrayFrom(dataset.row_ids_.data(), N, row_id_field_meta); auto row_id_data = serialize(row_id_array.get()); load_info = CLoadFieldDataInfo{RowFieldID.get(), row_id_data.data(), row_id_data.size(), N}; res = LoadFieldData(segment, load_info); assert(res.error_code == Success); count = GetRowCount(segment); assert(count == N); // delete data pks = {1} std::vector delete_pks = {1}; auto ids = std::make_unique(); ids->mutable_int_id()->mutable_data()->Add(delete_pks.begin(), delete_pks.end()); auto delete_data = serialize(ids.get()); std::vector delete_timestamps(1, dataset.timestamps_[N - 1]); auto offset = PreDelete(segment, 1); auto del_res = 
Delete(segment, offset, 1, delete_data.data(), delete_data.size(), delete_timestamps.data()); assert(del_res.error_code == Success); // retrieve pks = {1} std::vector retrive_pks = {1}; auto schema = ((milvus::segcore::Collection*)collection)->get_schema(); auto plan = std::make_unique(*schema); auto term_expr = std::make_unique>(FieldId(101), DataType::INT64, retrive_pks); plan->plan_node_ = std::make_unique(); plan->plan_node_->predicate_ = std::move(term_expr); std::vector target_field_ids{FieldId(100), FieldId(101)}; plan->field_ids_ = target_field_ids; CRetrieveResult retrieve_result; res = Retrieve(segment, plan.get(), dataset.timestamps_[N - 1], &retrieve_result); ASSERT_EQ(res.error_code, Success); auto query_result = std::make_unique(); auto suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size); ASSERT_TRUE(suc); ASSERT_EQ(query_result->ids().int_id().data().size(), 0); // retrieve pks = {2} retrive_pks = {2}; term_expr = std::make_unique>(FieldId(101), DataType::INT64, retrive_pks); plan->plan_node_->predicate_ = std::move(term_expr); res = Retrieve(segment, plan.get(), dataset.timestamps_[N - 1], &retrieve_result); ASSERT_EQ(res.error_code, Success); suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size); ASSERT_TRUE(suc); ASSERT_EQ(query_result->ids().int_id().data().size(), 1); // delete pks = {2} delete_pks = {2}; ids = std::make_unique(); ids->mutable_int_id()->mutable_data()->Add(delete_pks.begin(), delete_pks.end()); delete_data = serialize(ids.get()); offset = PreDelete(segment, 1); del_res = Delete(segment, offset, 1, delete_data.data(), delete_data.size(), delete_timestamps.data()); assert(del_res.error_code == Success); // retrieve pks in {2} res = Retrieve(segment, plan.get(), dataset.timestamps_[N - 1], &retrieve_result); ASSERT_EQ(res.error_code, Success); suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size); ASSERT_TRUE(suc); 
ASSERT_EQ(query_result->ids().int_id().data().size(), 0); DeleteRetrievePlan(plan.release()); DeleteRetrieveResult(&retrieve_result); DeleteCollection(collection); DeleteSegment(segment); } TEST(CApiTest, DeleteRepeatedPksFromGrowingSegment) { auto collection = NewCollection(get_default_schema_config()); auto segment = NewSegment(collection, Growing, -1); auto col = (milvus::segcore::Collection*)collection; int N = 10; auto dataset = DataGen(col->get_schema(), N); auto insert_data = serialize(dataset.raw_); // first insert, pks= {0, 1, 2, 3, 4, 5, 6, 7, 8, 9} int64_t offset; PreInsert(segment, N, &offset); auto res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(), insert_data.size()); assert(res.error_code == Success); // second insert, pks= {0, 1, 2, 3, 4, 5, 6, 7, 8, 9} PreInsert(segment, N, &offset); res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(), insert_data.size()); assert(res.error_code == Success); // create retrieve plan pks in {1, 2, 3} std::vector retrive_row_ids = {1, 2, 3}; auto schema = ((milvus::segcore::Collection*)collection)->get_schema(); auto plan = std::make_unique(*schema); auto term_expr = std::make_unique>(FieldId(101), DataType::INT64, retrive_row_ids); plan->plan_node_ = std::make_unique(); plan->plan_node_->predicate_ = std::move(term_expr); std::vector target_field_ids{FieldId(100), FieldId(101)}; plan->field_ids_ = target_field_ids; CRetrieveResult retrieve_result; res = Retrieve(segment, plan.get(), dataset.timestamps_[N - 1], &retrieve_result); ASSERT_EQ(res.error_code, Success); auto query_result = std::make_unique(); auto suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size); ASSERT_TRUE(suc); ASSERT_EQ(query_result->ids().int_id().data().size(), 6); // delete data pks = {1, 2, 3} std::vector delete_row_ids = {1, 2, 3}; auto ids = std::make_unique(); 
ids->mutable_int_id()->mutable_data()->Add(delete_row_ids.begin(), delete_row_ids.end()); auto delete_data = serialize(ids.get()); std::vector delete_timestamps(3, dataset.timestamps_[N - 1]); offset = PreDelete(segment, 3); auto del_res = Delete(segment, offset, 3, delete_data.data(), delete_data.size(), delete_timestamps.data()); assert(del_res.error_code == Success); // retrieve pks in {1, 2, 3} res = Retrieve(segment, plan.get(), dataset.timestamps_[N - 1], &retrieve_result); ASSERT_EQ(res.error_code, Success); query_result = std::make_unique(); suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size); ASSERT_TRUE(suc); ASSERT_EQ(query_result->ids().int_id().data().size(), 0); DeleteRetrievePlan(plan.release()); DeleteRetrieveResult(&retrieve_result); DeleteCollection(collection); DeleteSegment(segment); } TEST(CApiTest, DeleteRepeatedPksFromSealedSegment) { auto collection = NewCollection(get_default_schema_config()); auto segment = NewSegment(collection, Sealed, -1); auto col = (milvus::segcore::Collection*)collection; int N = 20; auto dataset = DataGen(col->get_schema(), N, 42, 0, 2); for (auto& [field_id, field_meta] : col->get_schema()->get_fields()) { auto array = dataset.get_col(field_id); auto data = serialize(array.get()); auto load_info = CLoadFieldDataInfo{field_id.get(), data.data(), data.size(), N}; auto res = LoadFieldData(segment, load_info); assert(res.error_code == Success); auto count = GetRowCount(segment); assert(count == N); } FieldMeta ts_field_meta(FieldName("Timestamp"), TimestampFieldID, DataType::INT64); auto ts_array = CreateScalarDataArrayFrom(dataset.timestamps_.data(), N, ts_field_meta); auto ts_data = serialize(ts_array.get()); auto load_info = CLoadFieldDataInfo{TimestampFieldID.get(), ts_data.data(), ts_data.size(), N}; auto res = LoadFieldData(segment, load_info); assert(res.error_code == Success); auto count = GetRowCount(segment); assert(count == N); FieldMeta 
row_id_field_meta(FieldName("RowID"), RowFieldID, DataType::INT64); auto row_id_array = CreateScalarDataArrayFrom(dataset.row_ids_.data(), N, row_id_field_meta); auto row_id_data = serialize(row_id_array.get()); load_info = CLoadFieldDataInfo{RowFieldID.get(), row_id_data.data(), row_id_data.size(), N}; res = LoadFieldData(segment, load_info); assert(res.error_code == Success); count = GetRowCount(segment); assert(count == N); // create retrieve plan pks in {1, 2, 3} std::vector retrive_row_ids = {1, 2, 3}; auto schema = ((milvus::segcore::Collection*)collection)->get_schema(); auto plan = std::make_unique(*schema); auto term_expr = std::make_unique>(FieldId(101), DataType::INT64, retrive_row_ids); plan->plan_node_ = std::make_unique(); plan->plan_node_->predicate_ = std::move(term_expr); std::vector target_field_ids{FieldId(100), FieldId(101)}; plan->field_ids_ = target_field_ids; CRetrieveResult retrieve_result; res = Retrieve(segment, plan.get(), dataset.timestamps_[N - 1], &retrieve_result); ASSERT_EQ(res.error_code, Success); auto query_result = std::make_unique(); auto suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size); ASSERT_TRUE(suc); ASSERT_EQ(query_result->ids().int_id().data().size(), 6); // delete data pks = {1, 2, 3} std::vector delete_row_ids = {1, 2, 3}; auto ids = std::make_unique(); ids->mutable_int_id()->mutable_data()->Add(delete_row_ids.begin(), delete_row_ids.end()); auto delete_data = serialize(ids.get()); std::vector delete_timestamps(3, dataset.timestamps_[N - 1]); auto offset = PreDelete(segment, 3); auto del_res = Delete(segment, offset, 3, delete_data.data(), delete_data.size(), delete_timestamps.data()); assert(del_res.error_code == Success); // retrieve pks in {1, 2, 3} res = Retrieve(segment, plan.get(), dataset.timestamps_[N - 1], &retrieve_result); ASSERT_EQ(res.error_code, Success); query_result = std::make_unique(); suc = query_result->ParseFromArray(retrieve_result.proto_blob, 
retrieve_result.proto_size); ASSERT_TRUE(suc); ASSERT_EQ(query_result->ids().int_id().data().size(), 0); DeleteRetrievePlan(plan.release()); DeleteRetrieveResult(&retrieve_result); DeleteCollection(collection); DeleteSegment(segment); } TEST(CApiTest, SearchTest) { auto c_collection = NewCollection(get_default_schema_config()); auto segment = NewSegment(c_collection, Growing, -1); auto col = (milvus::segcore::Collection*)c_collection; int N = 10000; auto dataset = DataGen(col->get_schema(), N); int64_t ts_offset = 1000; int64_t offset; PreInsert(segment, N, &offset); auto insert_data = serialize(dataset.raw_); auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(), insert_data.size()); ASSERT_EQ(ins_res.error_code, Success); const char* dsl_string = R"( { "bool": { "vector": { "fakevec": { "metric_type": "L2", "params": { "nprobe": 10 }, "query": "$0", "topk": 10, "round_decimal": 3 } } } })"; int num_queries = 10; auto blob = generate_query_data(num_queries); void* plan = nullptr; auto status = CreateSearchPlan(c_collection, dsl_string, &plan); ASSERT_EQ(status.error_code, Success); void* placeholderGroup = nullptr; status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup); ASSERT_EQ(status.error_code, Success); std::vector placeholderGroups; placeholderGroups.push_back(placeholderGroup); CSearchResult search_result; auto res = Search(segment, plan, placeholderGroup, N + ts_offset, &search_result, -1); ASSERT_EQ(res.error_code, Success); CSearchResult search_result2; auto res2 = Search(segment, plan, placeholderGroup, ts_offset, &search_result2, -1); ASSERT_EQ(res2.error_code, Success); DeleteSearchPlan(plan); DeletePlaceholderGroup(placeholderGroup); DeleteSearchResult(search_result); DeleteSearchResult(search_result2); DeleteCollection(c_collection); DeleteSegment(segment); } TEST(CApiTest, SearchTestWithExpr) { auto c_collection = NewCollection(get_default_schema_config()); 
auto segment = NewSegment(c_collection, Growing, -1); auto col = (milvus::segcore::Collection*)c_collection; int N = 10000; auto dataset = DataGen(col->get_schema(), N); int64_t offset; PreInsert(segment, N, &offset); auto insert_data = serialize(dataset.raw_); auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(), insert_data.size()); ASSERT_EQ(ins_res.error_code, Success); const char* serialized_expr_plan = R"(vector_anns: < field_id: 100 query_info: < topk: 10 metric_type: "L2" search_params: "{\"nprobe\": 10}" > placeholder_tag: "$0" >)"; int num_queries = 10; auto blob = generate_query_data(num_queries); void* plan = nullptr; auto binary_plan = translate_text_plan_to_binary_plan(serialized_expr_plan); auto status = CreateSearchPlanByExpr(c_collection, binary_plan.data(), binary_plan.size(), &plan); ASSERT_EQ(status.error_code, Success); void* placeholderGroup = nullptr; status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup); ASSERT_EQ(status.error_code, Success); std::vector placeholderGroups; placeholderGroups.push_back(placeholderGroup); dataset.timestamps_.clear(); dataset.timestamps_.push_back(1); CSearchResult search_result; auto res = Search(segment, plan, placeholderGroup, dataset.timestamps_[0], &search_result, -1); ASSERT_EQ(res.error_code, Success); DeleteSearchPlan(plan); DeletePlaceholderGroup(placeholderGroup); DeleteSearchResult(search_result); DeleteCollection(c_collection); DeleteSegment(segment); } TEST(CApiTest, RetrieveTestWithExpr) { auto collection = NewCollection(get_default_schema_config()); auto segment = NewSegment(collection, Growing, -1); auto schema = ((milvus::segcore::Collection*)collection)->get_schema(); auto plan = std::make_unique(*schema); int N = 10000; auto dataset = DataGen(schema, N); int64_t offset; PreInsert(segment, N, &offset); auto insert_data = serialize(dataset.raw_); auto ins_res = Insert(segment, offset, N, 
dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(), insert_data.size()); ASSERT_EQ(ins_res.error_code, Success); // create retrieve plan "age in [0]" std::vector values(1, 0); auto term_expr = std::make_unique>(FieldId(101), DataType::INT64, values); plan->plan_node_ = std::make_unique(); plan->plan_node_->predicate_ = std::move(term_expr); std::vector target_field_ids{FieldId(100), FieldId(101)}; plan->field_ids_ = target_field_ids; CRetrieveResult retrieve_result; auto res = Retrieve(segment, plan.get(), dataset.timestamps_[0], &retrieve_result); ASSERT_EQ(res.error_code, Success); DeleteRetrievePlan(plan.release()); DeleteRetrieveResult(&retrieve_result); DeleteCollection(collection); DeleteSegment(segment); } TEST(CApiTest, GetMemoryUsageInBytesTest) { auto collection = NewCollection(get_default_schema_config()); auto segment = NewSegment(collection, Growing, -1); auto old_memory_usage_size = GetMemoryUsageInBytes(segment); // std::cout << "old_memory_usage_size = " << old_memory_usage_size << std::endl; assert(old_memory_usage_size == 0); auto schema = ((milvus::segcore::Collection*)collection)->get_schema(); int N = 10000; auto dataset = DataGen(schema, N); int64_t offset; PreInsert(segment, N, &offset); auto insert_data = serialize(dataset.raw_); auto res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(), insert_data.size()); assert(res.error_code == Success); auto memory_usage_size = GetMemoryUsageInBytes(segment); // std::cout << "new_memory_usage_size = " << memory_usage_size << std::endl; // TODO:: assert // assert(memory_usage_size == 2785280); DeleteCollection(collection); DeleteSegment(segment); } TEST(CApiTest, GetDeletedCountTest) { auto collection = NewCollection(get_default_schema_config()); auto segment = NewSegment(collection, Growing, -1); std::vector delete_row_ids = {100000, 100001, 100002}; auto ids = std::make_unique(); 
ids->mutable_int_id()->mutable_data()->Add(delete_row_ids.begin(), delete_row_ids.end()); auto delete_data = serialize(ids.get()); uint64_t delete_timestamps[] = {0, 0, 0}; auto offset = PreDelete(segment, 3); auto del_res = Delete(segment, offset, 3, delete_data.data(), delete_data.size(), delete_timestamps); assert(del_res.error_code == Success); // TODO: assert(deleted_count == len(delete_row_ids)) auto deleted_count = GetDeletedCount(segment); assert(deleted_count == 0); DeleteCollection(collection); DeleteSegment(segment); } TEST(CApiTest, GetRowCountTest) { auto collection = NewCollection(get_default_schema_config()); auto segment = NewSegment(collection, Growing, -1); auto schema = ((milvus::segcore::Collection*)collection)->get_schema(); int N = 10000; auto dataset = DataGen(schema, N); int64_t offset; PreInsert(segment, N, &offset); auto insert_data = serialize(dataset.raw_); auto res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(), insert_data.size()); assert(res.error_code == Success); auto row_count = GetRowCount(segment); assert(row_count == N); DeleteCollection(collection); DeleteSegment(segment); } // TEST(CApiTest, SchemaTest) { // std::string schema_string = // "id: 6873737669791618215\nname: \"collection0\"\nschema: \u003c\n " // "field_metas: \u003c\n field_name: \"age\"\n type: INT32\n dim: 1\n \u003e\n " // "field_metas: \u003c\n field_name: \"field_1\"\n type: VECTOR_FLOAT\n dim: 16\n \u003e\n" // "\u003e\ncreate_time: 1600416765\nsegment_ids: 6873737669791618215\npartition_tags: \"default\"\n"; // // auto collection = NewCollection(schema_string.data()); // auto segment = NewSegment(collection, Growing, -1); // DeleteCollection(collection); // DeleteSegment(segment); //} void CheckSearchResultDuplicate(const std::vector& results) { auto sr = (SearchResult*)results[0]; auto topk = sr->topk_; auto num_queries = sr->num_queries_; // fill primary keys std::vector result_pks(num_queries * topk); 
for (int i = 0; i < results.size(); i++) { auto search_result = (SearchResult*)results[i]; auto size = search_result->result_offsets_.size(); if (size == 0) { continue; } for (int j = 0; j < size; j++) { auto offset = search_result->result_offsets_[j]; result_pks[offset] = search_result->primary_keys_[j]; } } // check primary key duplicates // int64_t cnt = 0; // std::unordered_set pk_set; // for (int qi = 0; qi < num_queries; qi++) { // pk_set.clear(); // for (int k = 0; k < topk; k++) { // int64_t idx = topk * qi + k; // pk_set.insert(result_pks[idx]); // } // cnt += pk_set.size(); // } // assert(cnt == topk * num_queries); } TEST(CApiTest, ReduceRemoveDuplicates) { auto collection = NewCollection(get_default_schema_config()); auto segment = NewSegment(collection, Growing, -1); auto schema = ((milvus::segcore::Collection*)collection)->get_schema(); int N = 10000; auto dataset = DataGen(schema, N); int64_t offset; PreInsert(segment, N, &offset); auto insert_data = serialize(dataset.raw_); auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(), insert_data.size()); assert(ins_res.error_code == Success); const char* dsl_string = R"( { "bool": { "vector": { "fakevec": { "metric_type": "L2", "params": { "nprobe": 10 }, "query": "$0", "topk": 10, "round_decimal": 3 } } } })"; int num_queries = 10; auto blob = generate_query_data(num_queries); void* plan = nullptr; auto status = CreateSearchPlan(collection, dsl_string, &plan); assert(status.error_code == Success); void* placeholderGroup = nullptr; status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup); assert(status.error_code == Success); std::vector placeholderGroups; placeholderGroups.push_back(placeholderGroup); dataset.timestamps_.clear(); dataset.timestamps_.push_back(1); { std::vector results; CSearchResult res1, res2; status = Search(segment, plan, placeholderGroup, dataset.timestamps_[0], &res1, -1); assert(status.error_code 
== Success); status = Search(segment, plan, placeholderGroup, dataset.timestamps_[0], &res2, -1); assert(status.error_code == Success); results.push_back(res1); results.push_back(res2); status = ReduceSearchResultsAndFillData(plan, results.data(), results.size()); assert(status.error_code == Success); // TODO:: insert no duplicate pks and check reduce results CheckSearchResultDuplicate(results); DeleteSearchResult(res1); DeleteSearchResult(res2); } { std::vector results; CSearchResult res1, res2, res3; status = Search(segment, plan, placeholderGroup, dataset.timestamps_[0], &res1, -1); assert(status.error_code == Success); status = Search(segment, plan, placeholderGroup, dataset.timestamps_[0], &res2, -1); assert(status.error_code == Success); status = Search(segment, plan, placeholderGroup, dataset.timestamps_[0], &res3, -1); assert(status.error_code == Success); results.push_back(res1); results.push_back(res2); results.push_back(res3); status = ReduceSearchResultsAndFillData(plan, results.data(), results.size()); assert(status.error_code == Success); // TODO:: insert no duplicate pks and check reduce results CheckSearchResultDuplicate(results); DeleteSearchResult(res1); DeleteSearchResult(res2); DeleteSearchResult(res3); } DeleteSearchPlan(plan); DeletePlaceholderGroup(placeholderGroup); DeleteCollection(collection); DeleteSegment(segment); } void testReduceSearchWithExpr(int N, int topK, int num_queries) { auto collection = NewCollection(get_default_schema_config()); auto segment = NewSegment(collection, Growing, -1); auto schema = ((milvus::segcore::Collection*)collection)->get_schema(); auto dataset = DataGen(schema, N); int64_t offset; PreInsert(segment, N, &offset); auto insert_data = serialize(dataset.raw_); auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(), insert_data.size()); assert(ins_res.error_code == Success); auto fmt = boost::format(R"(vector_anns: < field_id: 100 query_info: < topk: 
%1% metric_type: "L2" search_params: "{\"nprobe\": 10}" > placeholder_tag: "$0"> output_field_ids: 100)") % topK; auto serialized_expr_plan = fmt.str(); auto blob = generate_query_data(num_queries); void* plan = nullptr; auto binary_plan = translate_text_plan_to_binary_plan(serialized_expr_plan.data()); auto status = CreateSearchPlanByExpr(collection, binary_plan.data(), binary_plan.size(), &plan); assert(status.error_code == Success); void* placeholderGroup = nullptr; status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup); assert(status.error_code == Success); std::vector placeholderGroups; placeholderGroups.push_back(placeholderGroup); dataset.timestamps_.clear(); dataset.timestamps_.push_back(1); std::vector results; CSearchResult res1; CSearchResult res2; auto res = Search(segment, plan, placeholderGroup, dataset.timestamps_[0], &res1, -1); assert(res.error_code == Success); res = Search(segment, plan, placeholderGroup, dataset.timestamps_[0], &res2, -1); assert(res.error_code == Success); results.push_back(res1); results.push_back(res2); // 1. reduce status = ReduceSearchResultsAndFillData(plan, results.data(), results.size()); assert(status.error_code == Success); // 2. 
marshal CSearchResultDataBlobs cSearchResultData; auto req_sizes = std::vector{num_queries / 2, num_queries / 2}; if (num_queries == 1) { req_sizes = std::vector{num_queries}; } status = Marshal(&cSearchResultData, results.data(), plan, results.size(), req_sizes.data(), req_sizes.size()); assert(status.error_code == Success); auto search_result_data_blobs = reinterpret_cast(cSearchResultData); // check result for (int i = 0; i < req_sizes.size(); i++) { milvus::proto::schema::SearchResultData search_result_data; auto suc = search_result_data.ParseFromArray(search_result_data_blobs->blobs[i].data(), search_result_data_blobs->blobs[i].size()); assert(suc); assert(search_result_data.top_k() == topK); assert(search_result_data.num_queries() == req_sizes[i]); // assert(search_result_data.scores().size() == topK * req_sizes[i]); // assert(search_result_data.ids().int_id().data_size() == topK * req_sizes[i]); } DeleteSearchResultDataBlobs(cSearchResultData); DeleteSearchPlan(plan); DeletePlaceholderGroup(placeholderGroup); DeleteSearchResult(res1); DeleteSearchResult(res2); DeleteCollection(collection); DeleteSegment(segment); } TEST(CApiTest, ReduceSearchWithExpr) { testReduceSearchWithExpr(100, 1, 1); testReduceSearchWithExpr(100, 10, 10); testReduceSearchWithExpr(10000, 1, 1); testReduceSearchWithExpr(10000, 10, 10); } TEST(CApiTest, LoadIndexInfo) { // generator index constexpr auto TOPK = 10; auto N = 1024 * 10; auto [raw_data, timestamps, uids] = generate_data(N); auto indexing = std::make_shared(); auto conf = knowhere::Config{{knowhere::meta::DIM, DIM}, {knowhere::meta::TOPK, TOPK}, {knowhere::IndexParams::nlist, 100}, {knowhere::IndexParams::nprobe, 4}, {knowhere::IndexParams::m, 4}, {knowhere::IndexParams::nbits, 8}, {knowhere::Metric::TYPE, knowhere::Metric::L2}, {knowhere::meta::DEVICEID, 0}}; auto database = knowhere::GenDataset(N, DIM, raw_data.data()); indexing->Train(database, conf); indexing->AddWithoutIds(database, conf); EXPECT_EQ(indexing->Count(), N); 
EXPECT_EQ(indexing->Dim(), DIM); auto binary_set = indexing->Serialize(conf); CBinarySet c_binary_set = (CBinarySet)&binary_set; void* c_load_index_info = nullptr; auto status = NewLoadIndexInfo(&c_load_index_info); assert(status.error_code == Success); std::string index_param_key1 = "index_type"; std::string index_param_value1 = "IVF_PQ"; status = AppendIndexParam(c_load_index_info, index_param_key1.data(), index_param_value1.data()); std::string index_param_key2 = "index_mode"; std::string index_param_value2 = "cpu"; status = AppendIndexParam(c_load_index_info, index_param_key2.data(), index_param_value2.data()); assert(status.error_code == Success); std::string field_name = "field0"; status = AppendFieldInfo(c_load_index_info, 0, CDataType::FloatVector); assert(status.error_code == Success); status = AppendIndex(c_load_index_info, c_binary_set); assert(status.error_code == Success); DeleteLoadIndexInfo(c_load_index_info); } TEST(CApiTest, LoadIndex_Search) { // generator index constexpr auto TOPK = 10; auto N = 1024 * 1024; auto num_query = 100; auto [raw_data, timestamps, uids] = generate_data(N); auto indexing = std::make_shared(); auto conf = knowhere::Config{{knowhere::meta::DIM, DIM}, {knowhere::meta::TOPK, TOPK}, {knowhere::IndexParams::nlist, 100}, {knowhere::IndexParams::nprobe, 4}, {knowhere::IndexParams::m, 4}, {knowhere::IndexParams::nbits, 8}, {knowhere::Metric::TYPE, knowhere::Metric::L2}, {knowhere::meta::DEVICEID, 0}}; auto database = knowhere::GenDataset(N, DIM, raw_data.data()); indexing->Train(database, conf); indexing->AddWithoutIds(database, conf); EXPECT_EQ(indexing->Count(), N); EXPECT_EQ(indexing->Dim(), DIM); // serializ index to binarySet auto binary_set = indexing->Serialize(conf); // fill loadIndexInfo LoadIndexInfo load_index_info; auto& index_params = load_index_info.index_params; index_params["index_type"] = "IVF_PQ"; index_params["index_mode"] = "CPU"; auto mode = knowhere::IndexMode::MODE_CPU; load_index_info.index = 
knowhere::VecIndexFactory::GetInstance().CreateVecIndex(index_params["index_type"], mode); load_index_info.index->Load(binary_set); // search auto query_dataset = knowhere::GenDataset(num_query, DIM, raw_data.data() + DIM * 4200); auto result = indexing->Query(query_dataset, conf, nullptr); auto ids = result->Get(knowhere::meta::IDS); auto dis = result->Get(knowhere::meta::DISTANCE); // for (int i = 0; i < std::min(num_query * K, 100); ++i) { // std::cout << ids[i] << "->" << dis[i] << std::endl; //} } TEST(CApiTest, Indexing_Without_Predicate) { // insert data to segment constexpr auto TOPK = 5; std::string schema_string = generate_collection_schema("L2", DIM, false); auto collection = NewCollection(schema_string.c_str()); auto schema = ((segcore::Collection*)collection)->get_schema(); auto segment = NewSegment(collection, Growing, -1); auto N = ROW_COUNT; auto dataset = DataGen(schema, N); auto vec_col = dataset.get_col(FieldId(100)); auto query_ptr = vec_col.data() + 42000 * DIM; int64_t offset; PreInsert(segment, N, &offset); auto insert_data = serialize(dataset.raw_); auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(), insert_data.size()); assert(ins_res.error_code == Success); const char* dsl_string = R"( { "bool": { "vector": { "fakevec": { "metric_type": "L2", "params": { "nprobe": 10 }, "query": "$0", "topk": 5, "round_decimal": -1 } } } })"; // create place_holder_group int num_queries = 5; auto raw_group = CreatePlaceholderGroupFromBlob(num_queries, DIM, query_ptr); auto blob = raw_group.SerializeAsString(); // search on segment's small index void* plan = nullptr; auto status = CreateSearchPlan(collection, dsl_string, &plan); assert(status.error_code == Success); void* placeholderGroup = nullptr; status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup); assert(status.error_code == Success); std::vector placeholderGroups; 
placeholderGroups.push_back(placeholderGroup); Timestamp time = 10000000; CSearchResult c_search_result_on_smallIndex; auto res_before_load_index = Search(segment, plan, placeholderGroup, time, &c_search_result_on_smallIndex, -1); assert(res_before_load_index.error_code == Success); // load index to segment auto conf = knowhere::Config{{knowhere::meta::DIM, DIM}, {knowhere::meta::TOPK, TOPK}, {knowhere::IndexParams::nlist, 100}, {knowhere::IndexParams::nprobe, 10}, {knowhere::IndexParams::m, 4}, {knowhere::IndexParams::nbits, 8}, {knowhere::Metric::TYPE, knowhere::Metric::L2}, {knowhere::meta::DEVICEID, 0}}; auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_IVFPQ); // gen query dataset auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr); auto result_on_index = indexing->Query(query_dataset, conf, nullptr); auto ids = result_on_index->Get(knowhere::meta::IDS); auto dis = result_on_index->Get(knowhere::meta::DISTANCE); std::vector vec_ids(ids, ids + TOPK * num_queries); std::vector vec_dis; for (int j = 0; j < TOPK * num_queries; ++j) { vec_dis.push_back(dis[j] * -1); } auto search_result_on_raw_index = (SearchResult*)c_search_result_on_smallIndex; search_result_on_raw_index->seg_offsets_ = vec_ids; search_result_on_raw_index->distances_ = vec_dis; auto binary_set = indexing->Serialize(conf); void* c_load_index_info = nullptr; status = NewLoadIndexInfo(&c_load_index_info); assert(status.error_code == Success); std::string index_type_key = "index_type"; std::string index_type_value = "IVF_PQ"; std::string index_mode_key = "index_mode"; std::string index_mode_value = "cpu"; std::string metric_type_key = "metric_type"; std::string metric_type_value = "L2"; AppendIndexParam(c_load_index_info, index_type_key.c_str(), index_type_value.c_str()); AppendIndexParam(c_load_index_info, index_mode_key.c_str(), index_mode_value.c_str()); AppendIndexParam(c_load_index_info, metric_type_key.c_str(), 
metric_type_value.c_str()); AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector); AppendIndex(c_load_index_info, (CBinarySet)&binary_set); auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info); CSearchResult c_search_result_on_bigIndex; auto res_after_load_index = Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1); assert(res_after_load_index.error_code == Success); auto search_result_on_raw_index_json = SearchResultToJson(*search_result_on_raw_index); auto search_result_on_bigIndex_json = SearchResultToJson((*(SearchResult*)c_search_result_on_bigIndex)); // std::cout << search_result_on_raw_index_json.dump(1) << std::endl; // std::cout << search_result_on_bigIndex_json.dump(1) << std::endl; ASSERT_EQ(search_result_on_raw_index_json.dump(1), search_result_on_bigIndex_json.dump(1)); DeleteLoadIndexInfo(c_load_index_info); DeleteSearchPlan(plan); DeletePlaceholderGroup(placeholderGroup); DeleteSearchResult(c_search_result_on_smallIndex); DeleteSearchResult(c_search_result_on_bigIndex); DeleteCollection(collection); DeleteSegment(segment); } TEST(CApiTest, Indexing_Expr_Without_Predicate) { // insert data to segment constexpr auto TOPK = 5; std::string schema_string = generate_collection_schema("L2", DIM, false); auto collection = NewCollection(schema_string.c_str()); auto schema = ((segcore::Collection*)collection)->get_schema(); auto segment = NewSegment(collection, Growing, -1); auto N = ROW_COUNT; auto dataset = DataGen(schema, N); auto vec_col = dataset.get_col(FieldId(100)); auto query_ptr = vec_col.data() + 42000 * DIM; int64_t offset; PreInsert(segment, N, &offset); auto insert_data = serialize(dataset.raw_); auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(), insert_data.size()); assert(ins_res.error_code == Success); const char* serialized_expr_plan = R"(vector_anns: < field_id: 100 query_info: < topk: 5 
round_decimal: -1 metric_type: "L2" search_params: "{\"nprobe\": 10}" > placeholder_tag: "$0" >)"; // create place_holder_group int num_queries = 5; auto raw_group = CreatePlaceholderGroupFromBlob(num_queries, DIM, query_ptr); auto blob = raw_group.SerializeAsString(); // search on segment's small index void* plan = nullptr; auto binary_plan = translate_text_plan_to_binary_plan(serialized_expr_plan); auto status = CreateSearchPlanByExpr(collection, binary_plan.data(), binary_plan.size(), &plan); assert(status.error_code == Success); void* placeholderGroup = nullptr; status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup); assert(status.error_code == Success); std::vector placeholderGroups; placeholderGroups.push_back(placeholderGroup); Timestamp time = 10000000; CSearchResult c_search_result_on_smallIndex; auto res_before_load_index = Search(segment, plan, placeholderGroup, time, &c_search_result_on_smallIndex, -1); assert(res_before_load_index.error_code == Success); // load index to segment auto conf = knowhere::Config{{knowhere::meta::DIM, DIM}, {knowhere::meta::TOPK, TOPK}, {knowhere::IndexParams::nlist, 100}, {knowhere::IndexParams::nprobe, 10}, {knowhere::IndexParams::m, 4}, {knowhere::IndexParams::nbits, 8}, {knowhere::Metric::TYPE, knowhere::Metric::L2}, {knowhere::meta::DEVICEID, 0}}; auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_IVFPQ); // gen query dataset auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr); auto result_on_index = indexing->Query(query_dataset, conf, nullptr); auto ids = result_on_index->Get(knowhere::meta::IDS); auto dis = result_on_index->Get(knowhere::meta::DISTANCE); std::vector vec_ids(ids, ids + TOPK * num_queries); std::vector vec_dis; for (int j = 0; j < TOPK * num_queries; ++j) { vec_dis.push_back(dis[j] * -1); } auto search_result_on_raw_index = (SearchResult*)c_search_result_on_smallIndex; search_result_on_raw_index->seg_offsets_ = 
vec_ids; search_result_on_raw_index->distances_ = vec_dis; auto binary_set = indexing->Serialize(conf); void* c_load_index_info = nullptr; status = NewLoadIndexInfo(&c_load_index_info); assert(status.error_code == Success); std::string index_type_key = "index_type"; std::string index_type_value = "IVF_PQ"; std::string index_mode_key = "index_mode"; std::string index_mode_value = "cpu"; std::string metric_type_key = "metric_type"; std::string metric_type_value = "L2"; AppendIndexParam(c_load_index_info, index_type_key.c_str(), index_type_value.c_str()); AppendIndexParam(c_load_index_info, index_mode_key.c_str(), index_mode_value.c_str()); AppendIndexParam(c_load_index_info, metric_type_key.c_str(), metric_type_value.c_str()); AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector); AppendIndex(c_load_index_info, (CBinarySet)&binary_set); auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info); CSearchResult c_search_result_on_bigIndex; auto res_after_load_index = Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1); assert(res_after_load_index.error_code == Success); auto search_result_on_raw_index_json = SearchResultToJson(*search_result_on_raw_index); auto search_result_on_bigIndex_json = SearchResultToJson((*(SearchResult*)c_search_result_on_bigIndex)); // std::cout << search_result_on_raw_index_json.dump(1) << std::endl; // std::cout << search_result_on_bigIndex_json.dump(1) << std::endl; ASSERT_EQ(search_result_on_raw_index_json.dump(1), search_result_on_bigIndex_json.dump(1)); DeleteLoadIndexInfo(c_load_index_info); DeleteSearchPlan(plan); DeletePlaceholderGroup(placeholderGroup); DeleteSearchResult(c_search_result_on_smallIndex); DeleteSearchResult(c_search_result_on_bigIndex); DeleteCollection(collection); DeleteSegment(segment); } TEST(CApiTest, Indexing_With_float_Predicate_Range) { // insert data to segment constexpr auto TOPK = 5; std::string schema_string = 
generate_collection_schema("L2", DIM, false); auto collection = NewCollection(schema_string.c_str()); auto schema = ((segcore::Collection*)collection)->get_schema(); auto segment = NewSegment(collection, Growing, -1); auto N = ROW_COUNT; auto dataset = DataGen(schema, N); auto vec_col = dataset.get_col(FieldId(100)); auto query_ptr = vec_col.data() + 42000 * DIM; int64_t offset; PreInsert(segment, N, &offset); auto insert_data = serialize(dataset.raw_); auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(), insert_data.size()); assert(ins_res.error_code == Success); const char* dsl_string = R"({ "bool": { "must": [ { "range": { "counter": { "GE": 42000, "LT": 42010 } } }, { "vector": { "fakevec": { "metric_type": "L2", "params": { "nprobe": 10 }, "query": "$0", "topk": 5, "round_decimal": -1 } } } ] } })"; // create place_holder_group int num_queries = 10; auto raw_group = CreatePlaceholderGroupFromBlob(num_queries, DIM, query_ptr); auto blob = raw_group.SerializeAsString(); // search on segment's small index void* plan = nullptr; auto status = CreateSearchPlan(collection, dsl_string, &plan); assert(status.error_code == Success); void* placeholderGroup = nullptr; status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup); assert(status.error_code == Success); std::vector placeholderGroups; placeholderGroups.push_back(placeholderGroup); Timestamp time = 10000000; CSearchResult c_search_result_on_smallIndex; auto res_before_load_index = Search(segment, plan, placeholderGroup, time, &c_search_result_on_smallIndex, -1); assert(res_before_load_index.error_code == Success); // load index to segment auto conf = knowhere::Config{{knowhere::meta::DIM, DIM}, {knowhere::meta::TOPK, TOPK}, {knowhere::IndexParams::nlist, 100}, {knowhere::IndexParams::nprobe, 10}, {knowhere::IndexParams::m, 4}, {knowhere::IndexParams::nbits, 8}, {knowhere::Metric::TYPE, knowhere::Metric::L2}, 
{knowhere::meta::DEVICEID, 0}}; auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_IVFPQ); // gen query dataset auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr); auto result_on_index = indexing->Query(query_dataset, conf, nullptr); auto ids = result_on_index->Get(knowhere::meta::IDS); auto dis = result_on_index->Get(knowhere::meta::DISTANCE); std::vector vec_ids(ids, ids + TOPK * num_queries); std::vector vec_dis; for (int j = 0; j < TOPK * num_queries; ++j) { vec_dis.push_back(dis[j] * -1); } auto search_result_on_raw_index = (SearchResult*)c_search_result_on_smallIndex; search_result_on_raw_index->seg_offsets_ = vec_ids; search_result_on_raw_index->distances_ = vec_dis; auto binary_set = indexing->Serialize(conf); void* c_load_index_info = nullptr; status = NewLoadIndexInfo(&c_load_index_info); assert(status.error_code == Success); std::string index_type_key = "index_type"; std::string index_type_value = "IVF_PQ"; std::string index_mode_key = "index_mode"; std::string index_mode_value = "cpu"; std::string metric_type_key = "metric_type"; std::string metric_type_value = "L2"; AppendIndexParam(c_load_index_info, index_type_key.c_str(), index_type_value.c_str()); AppendIndexParam(c_load_index_info, index_mode_key.c_str(), index_mode_value.c_str()); AppendIndexParam(c_load_index_info, metric_type_key.c_str(), metric_type_value.c_str()); AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector); AppendIndex(c_load_index_info, (CBinarySet)&binary_set); auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info); CSearchResult c_search_result_on_bigIndex; auto res_after_load_index = Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1); assert(res_after_load_index.error_code == Success); auto search_result_on_bigIndex = (SearchResult*)c_search_result_on_bigIndex; for (int i = 0; i < num_queries; ++i) { auto offset = i * TOPK; 
ASSERT_EQ(search_result_on_bigIndex->seg_offsets_[offset], 42000 + i); ASSERT_EQ(search_result_on_bigIndex->distances_[offset], search_result_on_raw_index->distances_[offset]); } DeleteLoadIndexInfo(c_load_index_info); DeleteSearchPlan(plan); DeletePlaceholderGroup(placeholderGroup); DeleteSearchResult(c_search_result_on_smallIndex); DeleteSearchResult(c_search_result_on_bigIndex); DeleteCollection(collection); DeleteSegment(segment); } TEST(CApiTest, Indexing_Expr_With_float_Predicate_Range) { // insert data to segment constexpr auto TOPK = 5; std::string schema_string = generate_collection_schema("L2", DIM, false); auto collection = NewCollection(schema_string.c_str()); auto schema = ((segcore::Collection*)collection)->get_schema(); auto segment = NewSegment(collection, Growing, -1); auto N = 1000 * 1000; auto dataset = DataGen(schema, N); auto vec_col = dataset.get_col(FieldId(100)); auto query_ptr = vec_col.data() + 420000 * DIM; { int64_t offset; PreInsert(segment, N, &offset); auto insert_data = serialize(dataset.raw_); auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(), insert_data.size()); assert(ins_res.error_code == Success); } const char* serialized_expr_plan = R"(vector_anns: < field_id: 100 predicates: < binary_expr: < op: LogicalAnd left: < unary_range_expr: < column_info: < field_id: 101 data_type: Int64 > op: GreaterEqual value: < int64_val: 420000 > > > right: < unary_range_expr: < column_info: < field_id: 101 data_type: Int64 > op: LessThan value: < int64_val: 420010 > > > > > query_info: < topk: 5 round_decimal: -1 metric_type: "L2" search_params: "{\"nprobe\": 10}" > placeholder_tag: "$0" >)"; // create place_holder_group int num_queries = 10; auto raw_group = CreatePlaceholderGroupFromBlob(num_queries, DIM, query_ptr); auto blob = raw_group.SerializeAsString(); // search on segment's small index void* plan = nullptr; auto binary_plan = 
translate_text_plan_to_binary_plan(serialized_expr_plan); auto status = CreateSearchPlanByExpr(collection, binary_plan.data(), binary_plan.size(), &plan); assert(status.error_code == Success); void* placeholderGroup = nullptr; status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup); assert(status.error_code == Success); std::vector placeholderGroups; placeholderGroups.push_back(placeholderGroup); Timestamp time = 10000000; CSearchResult c_search_result_on_smallIndex; auto res_before_load_index = Search(segment, plan, placeholderGroup, time, &c_search_result_on_smallIndex, -1); assert(res_before_load_index.error_code == Success); // load index to segment auto conf = knowhere::Config{{knowhere::meta::DIM, DIM}, {knowhere::meta::TOPK, TOPK}, {knowhere::IndexParams::nlist, 100}, {knowhere::IndexParams::nprobe, 10}, {knowhere::IndexParams::m, 4}, {knowhere::IndexParams::nbits, 8}, {knowhere::Metric::TYPE, knowhere::Metric::L2}, {knowhere::meta::DEVICEID, 0}}; auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_IVFPQ); // gen query dataset auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr); auto result_on_index = indexing->Query(query_dataset, conf, nullptr); auto ids = result_on_index->Get(knowhere::meta::IDS); auto dis = result_on_index->Get(knowhere::meta::DISTANCE); std::vector vec_ids(ids, ids + TOPK * num_queries); std::vector vec_dis; for (int j = 0; j < TOPK * num_queries; ++j) { vec_dis.push_back(dis[j] * -1); } auto search_result_on_raw_index = (SearchResult*)c_search_result_on_smallIndex; search_result_on_raw_index->seg_offsets_ = vec_ids; search_result_on_raw_index->distances_ = vec_dis; auto binary_set = indexing->Serialize(conf); void* c_load_index_info = nullptr; status = NewLoadIndexInfo(&c_load_index_info); assert(status.error_code == Success); std::string index_type_key = "index_type"; std::string index_type_value = "IVF_PQ"; std::string index_mode_key = "index_mode"; 
std::string index_mode_value = "cpu"; std::string metric_type_key = "metric_type"; std::string metric_type_value = "L2"; AppendIndexParam(c_load_index_info, index_type_key.c_str(), index_type_value.c_str()); AppendIndexParam(c_load_index_info, index_mode_key.c_str(), index_mode_value.c_str()); AppendIndexParam(c_load_index_info, metric_type_key.c_str(), metric_type_value.c_str()); AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector); AppendIndex(c_load_index_info, (CBinarySet)&binary_set); auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info); CSearchResult c_search_result_on_bigIndex; auto res_after_load_index = Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1); assert(res_after_load_index.error_code == Success); auto search_result_on_bigIndex = (SearchResult*)c_search_result_on_bigIndex; for (int i = 0; i < num_queries; ++i) { auto offset = i * TOPK; ASSERT_EQ(search_result_on_bigIndex->seg_offsets_[offset], 420000 + i); ASSERT_EQ(search_result_on_bigIndex->distances_[offset], search_result_on_raw_index->distances_[offset]); } DeleteLoadIndexInfo(c_load_index_info); DeleteSearchPlan(plan); DeletePlaceholderGroup(placeholderGroup); DeleteSearchResult(c_search_result_on_smallIndex); DeleteSearchResult(c_search_result_on_bigIndex); DeleteCollection(collection); DeleteSegment(segment); } TEST(CApiTest, Indexing_With_float_Predicate_Term) { // insert data to segment constexpr auto TOPK = 5; std::string schema_string = generate_collection_schema("L2", DIM, false); auto collection = NewCollection(schema_string.c_str()); auto schema = ((segcore::Collection*)collection)->get_schema(); auto segment = NewSegment(collection, Growing, -1); auto N = ROW_COUNT; auto dataset = DataGen(schema, N); auto vec_col = dataset.get_col(FieldId(100)); auto query_ptr = vec_col.data() + 42000 * DIM; int64_t offset; PreInsert(segment, N, &offset); auto insert_data = serialize(dataset.raw_); auto ins_res 
= Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(), insert_data.size()); assert(ins_res.error_code == Success); const char* dsl_string = R"({ "bool": { "must": [ { "term": { "counter": { "values": [42000, 42001, 42002, 42003, 42004] } } }, { "vector": { "fakevec": { "metric_type": "L2", "params": { "nprobe": 10 }, "query": "$0", "topk": 5, "round_decimal": -1 } } } ] } })"; // create place_holder_group int num_queries = 5; auto raw_group = CreatePlaceholderGroupFromBlob(num_queries, DIM, query_ptr); auto blob = raw_group.SerializeAsString(); // search on segment's small index void* plan = nullptr; auto status = CreateSearchPlan(collection, dsl_string, &plan); assert(status.error_code == Success); void* placeholderGroup = nullptr; status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup); assert(status.error_code == Success); std::vector placeholderGroups; placeholderGroups.push_back(placeholderGroup); Timestamp time = 10000000; CSearchResult c_search_result_on_smallIndex; auto res_before_load_index = Search(segment, plan, placeholderGroup, time, &c_search_result_on_smallIndex, -1); assert(res_before_load_index.error_code == Success); // load index to segment auto conf = knowhere::Config{{knowhere::meta::DIM, DIM}, {knowhere::meta::TOPK, TOPK}, {knowhere::IndexParams::nlist, 100}, {knowhere::IndexParams::nprobe, 10}, {knowhere::IndexParams::m, 4}, {knowhere::IndexParams::nbits, 8}, {knowhere::Metric::TYPE, knowhere::Metric::L2}, {knowhere::meta::DEVICEID, 0}}; auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_IVFPQ); // gen query dataset auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr); auto result_on_index = indexing->Query(query_dataset, conf, nullptr); auto ids = result_on_index->Get(knowhere::meta::IDS); auto dis = result_on_index->Get(knowhere::meta::DISTANCE); std::vector vec_ids(ids, ids + TOPK * num_queries); std::vector 
vec_dis; for (int j = 0; j < TOPK * num_queries; ++j) { vec_dis.push_back(dis[j] * -1); } auto search_result_on_raw_index = (SearchResult*)c_search_result_on_smallIndex; search_result_on_raw_index->seg_offsets_ = vec_ids; search_result_on_raw_index->distances_ = vec_dis; auto binary_set = indexing->Serialize(conf); void* c_load_index_info = nullptr; status = NewLoadIndexInfo(&c_load_index_info); assert(status.error_code == Success); std::string index_type_key = "index_type"; std::string index_type_value = "IVF_PQ"; std::string index_mode_key = "index_mode"; std::string index_mode_value = "cpu"; std::string metric_type_key = "metric_type"; std::string metric_type_value = "L2"; AppendIndexParam(c_load_index_info, index_type_key.c_str(), index_type_value.c_str()); AppendIndexParam(c_load_index_info, index_mode_key.c_str(), index_mode_value.c_str()); AppendIndexParam(c_load_index_info, metric_type_key.c_str(), metric_type_value.c_str()); AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector); AppendIndex(c_load_index_info, (CBinarySet)&binary_set); auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info); CSearchResult c_search_result_on_bigIndex; auto res_after_load_index = Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1); assert(res_after_load_index.error_code == Success); auto search_result_on_bigIndex = (SearchResult*)c_search_result_on_bigIndex; for (int i = 0; i < num_queries; ++i) { auto offset = i * TOPK; ASSERT_EQ(search_result_on_bigIndex->seg_offsets_[offset], 42000 + i); ASSERT_EQ(search_result_on_bigIndex->distances_[offset], search_result_on_raw_index->distances_[offset]); } DeleteLoadIndexInfo(c_load_index_info); DeleteSearchPlan(plan); DeletePlaceholderGroup(placeholderGroup); DeleteSearchResult(c_search_result_on_smallIndex); DeleteSearchResult(c_search_result_on_bigIndex); DeleteCollection(collection); DeleteSegment(segment); } TEST(CApiTest, 
Indexing_Expr_With_float_Predicate_Term) {
    // insert data to segment
    constexpr auto TOPK = 5;

    std::string schema_string = generate_collection_schema("L2", DIM, false);
    auto collection = NewCollection(schema_string.c_str());
    auto schema = ((segcore::Collection*)collection)->get_schema();
    auto segment = NewSegment(collection, Growing, -1);

    auto N = 1000 * 1000;
    auto dataset = DataGen(schema, N);
    auto vec_col = dataset.get_col<float>(FieldId(100));
    auto query_ptr = vec_col.data() + 420000 * DIM;

    int64_t offset;
    PreInsert(segment, N, &offset);
    auto insert_data = serialize(dataset.raw_);
    auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(),
                          insert_data.data(), insert_data.size());
    assert(ins_res.error_code == Success);

    const char* serialized_expr_plan = R"(
    vector_anns: <
      field_id: 100
      predicates: <
        term_expr: <
          column_info: <
            field_id: 101
            data_type: Int64
          >
          values: <
            int64_val: 420000
          >
          values: <
            int64_val: 420001
          >
          values: <
            int64_val: 420002
          >
          values: <
            int64_val: 420003
          >
          values: <
            int64_val: 420004
          >
        >
      >
      query_info: <
        topk: 5
        round_decimal: -1
        metric_type: "L2"
        search_params: "{\"nprobe\": 10}"
      >
      placeholder_tag: "$0"
    >)";

    // create place_holder_group
    int num_queries = 5;
    auto raw_group = CreatePlaceholderGroupFromBlob(num_queries, DIM, query_ptr);
    auto blob = raw_group.SerializeAsString();

    // search on segment's small index
    void* plan = nullptr;
    auto binary_plan = translate_text_plan_to_binary_plan(serialized_expr_plan);
    auto status = CreateSearchPlanByExpr(collection, binary_plan.data(), binary_plan.size(), &plan);
    assert(status.error_code == Success);

    void* placeholderGroup = nullptr;
    status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup);
    assert(status.error_code == Success);

    std::vector<CPlaceholderGroup> placeholderGroups;
    placeholderGroups.push_back(placeholderGroup);
    Timestamp time = 10000000;

    CSearchResult c_search_result_on_smallIndex;
    auto res_before_load_index =
        Search(segment, plan, placeholderGroup, time, &c_search_result_on_smallIndex, -1);
    assert(res_before_load_index.error_code == Success);

    // load index to segment
    auto conf = knowhere::Config{{knowhere::meta::DIM, DIM},
                                 {knowhere::meta::TOPK, TOPK},
                                 {knowhere::IndexParams::nlist, 100},
                                 {knowhere::IndexParams::nprobe, 10},
                                 {knowhere::IndexParams::m, 4},
                                 {knowhere::IndexParams::nbits, 8},
                                 {knowhere::Metric::TYPE, knowhere::Metric::L2},
                                 {knowhere::meta::DEVICEID, 0}};
    auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_IVFPQ);

    // gen query dataset
    auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr);
    auto result_on_index = indexing->Query(query_dataset, conf, nullptr);
    auto ids = result_on_index->Get<int64_t*>(knowhere::meta::IDS);
    auto dis = result_on_index->Get<float*>(knowhere::meta::DISTANCE);
    std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
    std::vector<float> vec_dis;
    // negate the distances so they compare equal to segcore's convention
    for (int j = 0; j < TOPK * num_queries; ++j) {
        vec_dis.push_back(dis[j] * -1);
    }

    auto search_result_on_raw_index = (SearchResult*)c_search_result_on_smallIndex;
    search_result_on_raw_index->seg_offsets_ = vec_ids;
    search_result_on_raw_index->distances_ = vec_dis;

    auto binary_set = indexing->Serialize(conf);
    void* c_load_index_info = nullptr;
    status = NewLoadIndexInfo(&c_load_index_info);
    assert(status.error_code == Success);
    std::string index_type_key = "index_type";
    std::string index_type_value = "IVF_PQ";
    std::string index_mode_key = "index_mode";
    std::string index_mode_value = "cpu";
    std::string metric_type_key = "metric_type";
    std::string metric_type_value = "L2";

    AppendIndexParam(c_load_index_info, index_type_key.c_str(), index_type_value.c_str());
    AppendIndexParam(c_load_index_info, index_mode_key.c_str(), index_mode_value.c_str());
    AppendIndexParam(c_load_index_info, metric_type_key.c_str(), metric_type_value.c_str());
    AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector);
    AppendIndex(c_load_index_info, (CBinarySet)&binary_set);

    // search on the sealed segment that carries the big index
    auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info);
    CSearchResult c_search_result_on_bigIndex;
    auto res_after_load_index =
        Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
    assert(res_after_load_index.error_code == Success);

    auto search_result_on_bigIndex = (SearchResult*)c_search_result_on_bigIndex;
    // the term predicate {420000..420004} pins the best hit of query i to row 420000 + i
    for (int i = 0; i < num_queries; ++i) {
        auto offset = i * TOPK;
        ASSERT_EQ(search_result_on_bigIndex->seg_offsets_[offset], 420000 + i);
        ASSERT_EQ(search_result_on_bigIndex->distances_[offset], search_result_on_raw_index->distances_[offset]);
    }

    DeleteLoadIndexInfo(c_load_index_info);
    DeleteSearchPlan(plan);
    DeletePlaceholderGroup(placeholderGroup);
    DeleteSearchResult(c_search_result_on_smallIndex);
    DeleteSearchResult(c_search_result_on_bigIndex);
    DeleteCollection(collection);
    DeleteSegment(segment);
}

TEST(CApiTest, Indexing_With_binary_Predicate_Range) {
    // insert data to segment
    constexpr auto TOPK = 5;

    std::string schema_string = generate_collection_schema("JACCARD", DIM, true);
    auto collection = NewCollection(schema_string.c_str());
    auto schema = ((segcore::Collection*)collection)->get_schema();
    auto segment = NewSegment(collection, Growing, -1);

    auto N = 1000 * 1000;
    auto dataset = DataGen(schema, N);
    auto vec_col = dataset.get_col<uint8_t>(FieldId(100));
    // binary vectors pack DIM bits into DIM / 8 bytes per row
    auto query_ptr = vec_col.data() + 420000 * DIM / 8;

    int64_t offset;
    PreInsert(segment, N, &offset);
    auto insert_data = serialize(dataset.raw_);
    auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(),
                          insert_data.data(), insert_data.size());
    assert(ins_res.error_code == Success);

    const char* dsl_string = R"({
        "bool": {
            "must": [
            {
                "range": {
                    "counter": {
                        "GE": 420000,
                        "LT": 420010
                    }
                }
            },
            {
                "vector": {
                    "fakevec": {
                        "metric_type": "JACCARD",
                        "params": {
                            "nprobe": 10
                        },
                        "query": "$0",
                        "topk": 5,
                        "round_decimal": -1
                    }
                }
            }
            ]
        }
    })";

    // create place_holder_group
    int num_queries = 5;
    auto raw_group = CreateBinaryPlaceholderGroupFromBlob(num_queries, DIM,
// Indexing_With_binary_Predicate_Range (cont.): parse the DSL plan and the
// placeholder group, search the growing segment, then build a BIN_IVF_FLAT
// index with JACCARD metric over the same binary vectors and negate the raw
// distances into the small-index SearchResult for later comparison.
query_ptr); auto blob = raw_group.SerializeAsString(); // search on segment's small index void* plan = nullptr; auto status = CreateSearchPlan(collection, dsl_string, &plan); assert(status.error_code == Success); void* placeholderGroup = nullptr; status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup); assert(status.error_code == Success); std::vector placeholderGroups; placeholderGroups.push_back(placeholderGroup); Timestamp time = 10000000; CSearchResult c_search_result_on_smallIndex; auto res_before_load_index = Search(segment, plan, placeholderGroup, time, &c_search_result_on_smallIndex, -1); assert(res_before_load_index.error_code == Success); // load index to segment auto conf = knowhere::Config{ {knowhere::meta::DIM, DIM}, {knowhere::meta::TOPK, TOPK}, {knowhere::IndexParams::nprobe, 10}, {knowhere::IndexParams::nlist, 100}, {knowhere::IndexParams::m, 4}, {knowhere::IndexParams::nbits, 8}, {knowhere::Metric::TYPE, knowhere::Metric::JACCARD}, }; auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_BIN_IVFFLAT); // gen query dataset auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr); auto result_on_index = indexing->Query(query_dataset, conf, nullptr); auto ids = result_on_index->Get(knowhere::meta::IDS); auto dis = result_on_index->Get(knowhere::meta::DISTANCE); std::vector vec_ids(ids, ids + TOPK * num_queries); std::vector vec_dis; for (int j = 0; j < TOPK * num_queries; ++j) { vec_dis.push_back(dis[j] * -1); } auto search_result_on_raw_index = (SearchResult*)c_search_result_on_smallIndex; search_result_on_raw_index->seg_offsets_ = vec_ids; search_result_on_raw_index->distances_ = vec_dis; auto binary_set = indexing->Serialize(conf); void* c_load_index_info = nullptr; status = NewLoadIndexInfo(&c_load_index_info); assert(status.error_code == Success); std::string index_type_key = "index_type"; std::string index_type_value = "BIN_IVF_FLAT"; std::string index_mode_key = 
// (cont.) Register BIN_IVF_FLAT/cpu/JACCARD load-index params for binary field
// 100, create a sealed segment with the index, and assert the sealed search
// returns offset 420000 + i per query with distances equal to the raw index.
// The next test, Indexing_Expr_With_binary_Predicate_Range, starts mid-line
// below: same scenario but the predicate comes from a serialized expression
// plan (42000 <= counter < 42010) instead of a DSL string, and N = ROW_COUNT.
"index_mode"; std::string index_mode_value = "cpu"; std::string metric_type_key = "metric_type"; std::string metric_type_value = "JACCARD"; AppendIndexParam(c_load_index_info, index_type_key.c_str(), index_type_value.c_str()); AppendIndexParam(c_load_index_info, index_mode_key.c_str(), index_mode_value.c_str()); AppendIndexParam(c_load_index_info, metric_type_key.c_str(), metric_type_value.c_str()); AppendFieldInfo(c_load_index_info, 100, CDataType::BinaryVector); AppendIndex(c_load_index_info, (CBinarySet)&binary_set); auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info); CSearchResult c_search_result_on_bigIndex; auto res_after_load_index = Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1); assert(res_after_load_index.error_code == Success); auto search_result_on_bigIndex = (SearchResult*)c_search_result_on_bigIndex; for (int i = 0; i < num_queries; ++i) { auto offset = i * TOPK; ASSERT_EQ(search_result_on_bigIndex->seg_offsets_[offset], 420000 + i); ASSERT_EQ(search_result_on_bigIndex->distances_[offset], search_result_on_raw_index->distances_[offset]); } DeleteLoadIndexInfo(c_load_index_info); DeleteSearchPlan(plan); DeletePlaceholderGroup(placeholderGroup); DeleteSearchResult(c_search_result_on_smallIndex); DeleteSearchResult(c_search_result_on_bigIndex); DeleteCollection(collection); DeleteSegment(segment); } TEST(CApiTest, Indexing_Expr_With_binary_Predicate_Range) { // insert data to segment constexpr auto TOPK = 5; std::string schema_string = generate_collection_schema("JACCARD", DIM, true); auto collection = NewCollection(schema_string.c_str()); auto schema = ((segcore::Collection*)collection)->get_schema(); auto segment = NewSegment(collection, Growing, -1); auto N = ROW_COUNT; auto dataset = DataGen(schema, N); auto vec_col = dataset.get_col(FieldId(100)); auto query_ptr = vec_col.data() + 42000 * DIM / 8; int64_t offset; PreInsert(segment, N, &offset); auto insert_data = 
// Indexing_Expr_With_binary_Predicate_Range (cont.): insert rows, then define
// the predicate as a textual plan proto (LogicalAnd of GreaterEqual 42000 and
// LessThan 42010 on Int64 field 101) translated to binary via
// translate_text_plan_to_binary_plan, and search the growing segment.
serialize(dataset.raw_); auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(), insert_data.size()); assert(ins_res.error_code == Success); const char* serialized_expr_plan = R"(vector_anns: < field_id: 100 predicates: < binary_expr: < op: LogicalAnd left: < unary_range_expr: < column_info: < field_id: 101 data_type: Int64 > op: GreaterEqual value: < int64_val: 42000 > > > right: < unary_range_expr: < column_info: < field_id: 101 data_type: Int64 > op: LessThan value: < int64_val: 42010 > > > > > query_info: < topk: 5 round_decimal: -1 metric_type: "JACCARD" search_params: "{\"nprobe\": 10}" > placeholder_tag: "$0" >)"; // create place_holder_group int num_queries = 5; auto raw_group = CreateBinaryPlaceholderGroupFromBlob(num_queries, DIM, query_ptr); auto blob = raw_group.SerializeAsString(); // search on segment's small index void* plan = nullptr; auto binary_plan = translate_text_plan_to_binary_plan(serialized_expr_plan); auto status = CreateSearchPlanByExpr(collection, binary_plan.data(), binary_plan.size(), &plan); assert(status.error_code == Success); void* placeholderGroup = nullptr; status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup); assert(status.error_code == Success); std::vector placeholderGroups; placeholderGroups.push_back(placeholderGroup); Timestamp time = 10000000; CSearchResult c_search_result_on_smallIndex; auto res_before_load_index = Search(segment, plan, placeholderGroup, time, &c_search_result_on_smallIndex, -1); ASSERT_TRUE(res_before_load_index.error_code == Success) << res_before_load_index.error_msg; // load index to segment auto conf = knowhere::Config{ {knowhere::meta::DIM, DIM}, {knowhere::meta::TOPK, TOPK}, {knowhere::IndexParams::nprobe, 10}, {knowhere::IndexParams::nlist, 100}, {knowhere::IndexParams::m, 4}, {knowhere::IndexParams::nbits, 8}, {knowhere::Metric::TYPE, knowhere::Metric::JACCARD}, }; auto indexing = 
// (cont.) Same pattern as the DSL variant: build BIN_IVF_FLAT over the binary
// vectors, negate raw distances into the small-index result, register the
// serialized index (BIN_IVF_FLAT/cpu/JACCARD, field 100) in a LoadIndexInfo,
// and create the sealed segment for the comparison search.
generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_BIN_IVFFLAT); // gen query dataset auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr); auto result_on_index = indexing->Query(query_dataset, conf, nullptr); auto ids = result_on_index->Get(knowhere::meta::IDS); auto dis = result_on_index->Get(knowhere::meta::DISTANCE); std::vector vec_ids(ids, ids + TOPK * num_queries); std::vector vec_dis; for (int j = 0; j < TOPK * num_queries; ++j) { vec_dis.push_back(dis[j] * -1); } auto search_result_on_raw_index = (SearchResult*)c_search_result_on_smallIndex; search_result_on_raw_index->seg_offsets_ = vec_ids; search_result_on_raw_index->distances_ = vec_dis; auto binary_set = indexing->Serialize(conf); void* c_load_index_info = nullptr; status = NewLoadIndexInfo(&c_load_index_info); assert(status.error_code == Success); std::string index_type_key = "index_type"; std::string index_type_value = "BIN_IVF_FLAT"; std::string index_mode_key = "index_mode"; std::string index_mode_value = "cpu"; std::string metric_type_key = "metric_type"; std::string metric_type_value = "JACCARD"; AppendIndexParam(c_load_index_info, index_type_key.c_str(), index_type_value.c_str()); AppendIndexParam(c_load_index_info, index_mode_key.c_str(), index_mode_value.c_str()); AppendIndexParam(c_load_index_info, metric_type_key.c_str(), metric_type_value.c_str()); AppendFieldInfo(c_load_index_info, 100, CDataType::BinaryVector); AppendIndex(c_load_index_info, (CBinarySet)&binary_set); auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info); CSearchResult c_search_result_on_bigIndex; auto res_after_load_index = Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1); assert(res_after_load_index.error_code == Success); auto search_result_on_bigIndex = (SearchResult*)c_search_result_on_bigIndex; for (int i = 0; i < num_queries; ++i) { auto offset = i * TOPK; 
// End of Indexing_Expr_With_binary_Predicate_Range: sealed results must match
// offset 42000 + i and the raw-index distances; then cleanup.  The next test,
// Indexing_With_binary_Predicate_Term, starts mid-line: a DSL "term" predicate
// (counter IN {42000..42004}) combined with a JACCARD ANN clause on a growing
// segment of binary vectors.
ASSERT_EQ(search_result_on_bigIndex->seg_offsets_[offset], 42000 + i); ASSERT_EQ(search_result_on_bigIndex->distances_[offset], search_result_on_raw_index->distances_[offset]); } DeleteLoadIndexInfo(c_load_index_info); DeleteSearchPlan(plan); DeletePlaceholderGroup(placeholderGroup); DeleteSearchResult(c_search_result_on_smallIndex); DeleteSearchResult(c_search_result_on_bigIndex); DeleteCollection(collection); DeleteSegment(segment); } TEST(CApiTest, Indexing_With_binary_Predicate_Term) { // insert data to segment constexpr auto TOPK = 5; std::string schema_string = generate_collection_schema("JACCARD", DIM, true); auto collection = NewCollection(schema_string.c_str()); auto schema = ((segcore::Collection*)collection)->get_schema(); auto segment = NewSegment(collection, Growing, -1); auto N = ROW_COUNT; auto dataset = DataGen(schema, N); auto vec_col = dataset.get_col(FieldId(100)); auto query_ptr = vec_col.data() + 42000 * DIM / 8; int64_t offset; PreInsert(segment, N, &offset); auto insert_data = serialize(dataset.raw_); auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(), insert_data.size()); assert(ins_res.error_code == Success); const char* dsl_string = R"({ "bool": { "must": [ { "term": { "counter": { "values": [42000, 42001, 42002, 42003, 42004] } } }, { "vector": { "fakevec": { "metric_type": "JACCARD", "params": { "nprobe": 10 }, "query": "$0", "topk": 5, "round_decimal": -1 } } } ] } })"; // create place_holder_group int num_queries = 5; auto raw_group = CreateBinaryPlaceholderGroupFromBlob(num_queries, DIM, query_ptr); auto blob = raw_group.SerializeAsString(); // search on segment's small index void* plan = nullptr; auto status = CreateSearchPlan(collection, dsl_string, &plan); assert(status.error_code == Success); void* placeholderGroup = nullptr; status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup); assert(status.error_code == Success); std::vector 
// (cont.) Search the growing segment, build BIN_IVF_FLAT (JACCARD) over the
// same vectors, copy negated raw-index distances into the small-index result,
// then register BIN_IVF_FLAT/cpu/JACCARD load-index params for field 100.
placeholderGroups; placeholderGroups.push_back(placeholderGroup); Timestamp time = 10000000; CSearchResult c_search_result_on_smallIndex; auto res_before_load_index = Search(segment, plan, placeholderGroup, time, &c_search_result_on_smallIndex, -1); assert(res_before_load_index.error_code == Success); // load index to segment auto conf = knowhere::Config{ {knowhere::meta::DIM, DIM}, {knowhere::meta::TOPK, TOPK}, {knowhere::IndexParams::nprobe, 10}, {knowhere::IndexParams::nlist, 100}, {knowhere::IndexParams::m, 4}, {knowhere::IndexParams::nbits, 8}, {knowhere::Metric::TYPE, knowhere::Metric::JACCARD}, }; auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_BIN_IVFFLAT); // gen query dataset auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr); auto result_on_index = indexing->Query(query_dataset, conf, nullptr); auto ids = result_on_index->Get(knowhere::meta::IDS); auto dis = result_on_index->Get(knowhere::meta::DISTANCE); std::vector vec_ids(ids, ids + TOPK * num_queries); std::vector vec_dis; for (int j = 0; j < TOPK * num_queries; ++j) { vec_dis.push_back(dis[j] * -1); } auto search_result_on_raw_index = (SearchResult*)c_search_result_on_smallIndex; search_result_on_raw_index->seg_offsets_ = vec_ids; search_result_on_raw_index->distances_ = vec_dis; auto binary_set = indexing->Serialize(conf); void* c_load_index_info = nullptr; status = NewLoadIndexInfo(&c_load_index_info); assert(status.error_code == Success); std::string index_type_key = "index_type"; std::string index_type_value = "BIN_IVF_FLAT"; std::string index_mode_key = "index_mode"; std::string index_mode_value = "cpu"; std::string metric_type_key = "metric_type"; std::string metric_type_value = "JACCARD"; AppendIndexParam(c_load_index_info, index_type_key.c_str(), index_type_value.c_str()); AppendIndexParam(c_load_index_info, index_mode_key.c_str(), index_mode_value.c_str()); AppendIndexParam(c_load_index_info, metric_type_key.c_str(), 
// End of Indexing_With_binary_Predicate_Term: unlike the range tests, this one
// runs ReduceSearchResultsAndFillData over the single sealed result and uses
// get_result_count(i) as the per-query offset before asserting ids/distances.
// The next test, Indexing_Expr_With_binary_Predicate_Term, begins mid-line:
// same term predicate expressed as a textual plan proto (term_expr with five
// int64 values 42000..42004).
metric_type_value.c_str()); AppendFieldInfo(c_load_index_info, 100, CDataType::BinaryVector); AppendIndex(c_load_index_info, (CBinarySet)&binary_set); auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info); CSearchResult c_search_result_on_bigIndex; auto res_after_load_index = Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1); assert(res_after_load_index.error_code == Success); std::vector results; results.push_back(c_search_result_on_bigIndex); status = ReduceSearchResultsAndFillData(plan, results.data(), results.size()); assert(status.error_code == Success); auto search_result_on_bigIndex = (SearchResult*)c_search_result_on_bigIndex; for (int i = 0; i < num_queries; ++i) { auto offset = search_result_on_bigIndex->get_result_count(i); ASSERT_EQ(search_result_on_bigIndex->seg_offsets_[offset], 42000 + i); ASSERT_EQ(search_result_on_bigIndex->distances_[offset], search_result_on_raw_index->distances_[i * TOPK]); } DeleteLoadIndexInfo(c_load_index_info); DeleteSearchPlan(plan); DeletePlaceholderGroup(placeholderGroup); DeleteSearchResult(c_search_result_on_smallIndex); DeleteSearchResult(c_search_result_on_bigIndex); DeleteCollection(collection); DeleteSegment(segment); } TEST(CApiTest, Indexing_Expr_With_binary_Predicate_Term) { // insert data to segment constexpr auto TOPK = 5; std::string schema_string = generate_collection_schema("JACCARD", DIM, true); auto collection = NewCollection(schema_string.c_str()); auto schema = ((segcore::Collection*)collection)->get_schema(); auto segment = NewSegment(collection, Growing, -1); auto N = ROW_COUNT; auto dataset = DataGen(schema, N); auto vec_col = dataset.get_col(FieldId(100)); auto query_ptr = vec_col.data() + 42000 * DIM / 8; int64_t offset; PreInsert(segment, N, &offset); auto insert_data = serialize(dataset.raw_); auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), 
// (cont.) Translate the term_expr plan to binary, parse the placeholder group,
// search the growing segment, and prepare the BIN_IVF_FLAT (JACCARD) index.
insert_data.size()); assert(ins_res.error_code == Success); const char* serialized_expr_plan = R"(vector_anns: < field_id: 100 predicates: < term_expr: < column_info: < field_id: 101 data_type: Int64 > values: < int64_val: 42000 > values: < int64_val: 42001 > values: < int64_val: 42002 > values: < int64_val: 42003 > values: < int64_val: 42004 > > > query_info: < topk: 5 round_decimal: -1 metric_type: "JACCARD" search_params: "{\"nprobe\": 10}" > placeholder_tag: "$0" >)"; // create place_holder_group int num_queries = 5; auto raw_group = CreateBinaryPlaceholderGroupFromBlob(num_queries, DIM, query_ptr); auto blob = raw_group.SerializeAsString(); // search on segment's small index void* plan = nullptr; auto binary_plan = translate_text_plan_to_binary_plan(serialized_expr_plan); auto status = CreateSearchPlanByExpr(collection, binary_plan.data(), binary_plan.size(), &plan); assert(status.error_code == Success); void* placeholderGroup = nullptr; status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup); assert(status.error_code == Success); std::vector placeholderGroups; placeholderGroups.push_back(placeholderGroup); Timestamp time = 10000000; CSearchResult c_search_result_on_smallIndex; auto res_before_load_index = Search(segment, plan, placeholderGroup, time, &c_search_result_on_smallIndex, -1); assert(res_before_load_index.error_code == Success); // load index to segment auto conf = knowhere::Config{ {knowhere::meta::DIM, DIM}, {knowhere::meta::TOPK, TOPK}, {knowhere::IndexParams::nprobe, 10}, {knowhere::IndexParams::nlist, 100}, {knowhere::IndexParams::m, 4}, {knowhere::IndexParams::nbits, 8}, {knowhere::Metric::TYPE, knowhere::Metric::JACCARD}, }; auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_BIN_IVFFLAT); // gen query dataset auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr); auto result_on_index = indexing->Query(query_dataset, conf, nullptr); auto ids = 
// (cont.) Negate raw-index distances into the small-index result, register the
// serialized index in a LoadIndexInfo, build the sealed segment, search it,
// reduce, and verify per-query id 42000 + i at get_result_count(i).
result_on_index->Get(knowhere::meta::IDS); auto dis = result_on_index->Get(knowhere::meta::DISTANCE); std::vector vec_ids(ids, ids + TOPK * num_queries); std::vector vec_dis; for (int j = 0; j < TOPK * num_queries; ++j) { vec_dis.push_back(dis[j] * -1); } auto search_result_on_raw_index = (SearchResult*)c_search_result_on_smallIndex; search_result_on_raw_index->seg_offsets_ = vec_ids; search_result_on_raw_index->distances_ = vec_dis; auto binary_set = indexing->Serialize(conf); void* c_load_index_info = nullptr; status = NewLoadIndexInfo(&c_load_index_info); assert(status.error_code == Success); std::string index_type_key = "index_type"; std::string index_type_value = "BIN_IVF_FLAT"; std::string index_mode_key = "index_mode"; std::string index_mode_value = "cpu"; std::string metric_type_key = "metric_type"; std::string metric_type_value = "JACCARD"; AppendIndexParam(c_load_index_info, index_type_key.c_str(), index_type_value.c_str()); AppendIndexParam(c_load_index_info, index_mode_key.c_str(), index_mode_value.c_str()); AppendIndexParam(c_load_index_info, metric_type_key.c_str(), metric_type_value.c_str()); AppendFieldInfo(c_load_index_info, 100, CDataType::BinaryVector); AppendIndex(c_load_index_info, (CBinarySet)&binary_set); auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info); CSearchResult c_search_result_on_bigIndex; auto res_after_load_index = Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1); assert(res_after_load_index.error_code == Success); std::vector results; results.push_back(c_search_result_on_bigIndex); status = ReduceSearchResultsAndFillData(plan, results.data(), results.size()); assert(status.error_code == Success); auto search_result_on_bigIndex = (SearchResult*)c_search_result_on_bigIndex; for (int i = 0; i < num_queries; ++i) { auto offset = search_result_on_bigIndex->get_result_count(i); ASSERT_EQ(search_result_on_bigIndex->seg_offsets_[offset], 42000 + i); 
// Tail of Indexing_Expr_With_binary_Predicate_Term (starts above): verify the
// reduced sealed-segment distances match the raw-index distances, then clean up.
ASSERT_EQ(search_result_on_bigIndex->distances_[offset], search_result_on_raw_index->distances_[i * TOPK]);
    }

    DeleteLoadIndexInfo(c_load_index_info);
    DeleteSearchPlan(plan);
    DeletePlaceholderGroup(placeholderGroup);
    DeleteSearchResult(c_search_result_on_smallIndex);
    DeleteSearchResult(c_search_result_on_bigIndex);
    DeleteCollection(collection);
    DeleteSegment(segment);
}

// Loads a single Int64 field ("age", fieldID 101) into a sealed segment
// through the C API and checks GetRowCount reports every loaded row.
TEST(CApiTest, SealedSegmentTest) {
    auto collection = NewCollection(get_default_schema_config());
    auto segment = NewSegment(collection, Sealed, -1);

    int N = 10000;
    std::default_random_engine e(67);  // fixed seed -> deterministic test data
    // NOTE(review): extraction stripped template arguments throughout this
    // file -- "std::vector(N)" has no such deduction guide and cannot compile.
    // Restored as std::vector<int64_t> to match the Int64 "age" field loaded
    // below; confirm against the upstream test_c_api.cpp.
    auto ages = std::vector<int64_t>(N);
    for (auto& age : ages) {
        age = e() % 2000;
    }
    auto blob = (void*)(&ages[0]);  // kept from original; not used afterwards
    FieldMeta field_meta(FieldName("age"), FieldId(101), DataType::INT64);
    auto array = CreateScalarDataArrayFrom(ages.data(), N, field_meta);
    auto age_data = serialize(array.get());
    auto load_info = CLoadFieldDataInfo{101, age_data.data(), age_data.size(), N};
    auto res = LoadFieldData(segment, load_info);
    assert(res.error_code == Success);

    auto count = GetRowCount(segment);
    assert(count == N);

    DeleteCollection(collection);
    DeleteSegment(segment);
}

// Sealed-segment float-vector search with a DSL range predicate
// (42000 <= counter < 42010); this portion builds the dataset and the
// serialized counter / RowID field blobs (continues past this line block).
TEST(CApiTest, SealedSegment_search_float_Predicate_Range) {
    constexpr auto TOPK = 5;

    std::string schema_string = generate_collection_schema("L2", DIM, false);
    auto collection = NewCollection(schema_string.c_str());
    auto schema = ((segcore::Collection*)collection)->get_schema();
    auto segment = NewSegment(collection, Sealed, -1);

    auto N = ROW_COUNT;
    auto dataset = DataGen(schema, N);
    // NOTE(review): template argument restored -- L2 collection, so the vector
    // column is float and queries start at row 42000 (stride DIM floats/row).
    auto vec_col = dataset.get_col<float>(FieldId(100));
    auto query_ptr = vec_col.data() + 42000 * DIM;

    // NOTE(review): template argument restored -- "counter" is Int64 (see the
    // FieldMeta on the next line).
    auto counter_col = dataset.get_col<int64_t>(FieldId(101));
    FieldMeta counter_field_meta(FieldName("counter"), FieldId(101), DataType::INT64);
    auto count_array = CreateScalarDataArrayFrom(counter_col.data(), N, counter_field_meta);
    auto counter_data = serialize(count_array.get());

    FieldMeta row_id_field_meta(FieldName("RowID"), RowFieldID, DataType::INT64);
    auto row_ids_array =
// SealedSegment_search_float_Predicate_Range (cont.): serialize RowID and
// Timestamp system fields, define the DSL range predicate
// (42000 <= counter < 42010) with an L2 ANN clause, and parse plan/placeholder.
CreateScalarDataArrayFrom(dataset.row_ids_.data(), N, row_id_field_meta); auto row_ids_data = serialize(row_ids_array.get()); FieldMeta timestamp_field_meta(FieldName("Timestamp"), TimestampFieldID, DataType::INT64); auto timestamps_array = CreateScalarDataArrayFrom(dataset.timestamps_.data(), N, timestamp_field_meta); auto timestamps_data = serialize(timestamps_array.get()); const char* dsl_string = R"({ "bool": { "must": [ { "range": { "counter": { "GE": 42000, "LT": 42010 } } }, { "vector": { "fakevec": { "metric_type": "L2", "params": { "nprobe": 10 }, "query": "$0", "topk": 5, "round_decimal": -1 } } } ] } })"; // create place_holder_group int num_queries = 10; auto raw_group = CreatePlaceholderGroupFromBlob(num_queries, DIM, query_ptr); auto blob = raw_group.SerializeAsString(); // search on segment's small index void* plan = nullptr; auto status = CreateSearchPlan(collection, dsl_string, &plan); assert(status.error_code == Success); void* placeholderGroup = nullptr; status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup); assert(status.error_code == Success); std::vector placeholderGroups; placeholderGroups.push_back(placeholderGroup); Timestamp time = 10000000; // load index to segment auto conf = knowhere::Config{{knowhere::meta::DIM, DIM}, {knowhere::meta::TOPK, TOPK}, {knowhere::IndexParams::nlist, 100}, {knowhere::IndexParams::nprobe, 10}, {knowhere::IndexParams::m, 4}, {knowhere::IndexParams::nbits, 8}, {knowhere::Metric::TYPE, knowhere::Metric::L2}, {knowhere::meta::DEVICEID, 0}}; auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_IVFPQ); // gen query dataset auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr); auto result_on_index = indexing->Query(query_dataset, conf, nullptr); auto ids = result_on_index->Get(knowhere::meta::IDS); auto dis = result_on_index->Get(knowhere::meta::DISTANCE); std::vector vec_ids(ids, ids + TOPK * num_queries); std::vector vec_dis; for 
// (cont.) Register IVF_PQ/cpu/L2 load-index params, sanity-query the loaded
// index via load_index_info->index, then load counter (101), RowID (0) and
// Timestamp (1) field blobs into the sealed segment before SealedCreator.
(int j = 0; j < TOPK * num_queries; ++j) { vec_dis.push_back(dis[j] * -1); } auto binary_set = indexing->Serialize(conf); void* c_load_index_info = nullptr; status = NewLoadIndexInfo(&c_load_index_info); assert(status.error_code == Success); std::string index_type_key = "index_type"; std::string index_type_value = "IVF_PQ"; std::string index_mode_key = "index_mode"; std::string index_mode_value = "cpu"; std::string metric_type_key = "metric_type"; std::string metric_type_value = "L2"; AppendIndexParam(c_load_index_info, index_type_key.c_str(), index_type_value.c_str()); AppendIndexParam(c_load_index_info, index_mode_key.c_str(), index_mode_value.c_str()); AppendIndexParam(c_load_index_info, metric_type_key.c_str(), metric_type_value.c_str()); AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector); AppendIndex(c_load_index_info, (CBinarySet)&binary_set); auto load_index_info = (LoadIndexInfo*)c_load_index_info; auto query_dataset2 = knowhere::GenDataset(num_queries, DIM, query_ptr); auto index = std::dynamic_pointer_cast(load_index_info->index); auto result_on_index2 = index->Query(query_dataset2, conf, nullptr); auto ids2 = result_on_index2->Get(knowhere::meta::IDS); auto dis2 = result_on_index2->Get(knowhere::meta::DISTANCE); auto c_counter_field_data = CLoadFieldDataInfo{ 101, counter_data.data(), counter_data.size(), N, }; status = LoadFieldData(segment, c_counter_field_data); assert(status.error_code == Success); auto c_id_field_data = CLoadFieldDataInfo{ 0, row_ids_data.data(), row_ids_data.size(), N, }; status = LoadFieldData(segment, c_id_field_data); assert(status.error_code == Success); auto c_ts_field_data = CLoadFieldDataInfo{ 1, timestamps_data.data(), timestamps_data.size(), N, }; status = LoadFieldData(segment, c_ts_field_data); assert(status.error_code == Success); auto sealed_segment = SealedCreator(schema, dataset, *(LoadIndexInfo*)c_load_index_info); CSearchResult c_search_result_on_bigIndex; auto res_after_load_index = 
// (cont.) Verify per-query id 42000 + i, clean up, then
// TEST(CApiTest, SealedSegment_search_without_predicates) begins mid-line:
// sealed segment, plain ANN DSL (no predicate), ts_offset 1000.  The line below
// ends INSIDE the dsl_string raw literal -- it continues on the next line.
Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1); assert(res_after_load_index.error_code == Success); auto search_result_on_bigIndex = (SearchResult*)c_search_result_on_bigIndex; for (int i = 0; i < num_queries; ++i) { auto offset = i * TOPK; ASSERT_EQ(search_result_on_bigIndex->seg_offsets_[offset], 42000 + i); } DeleteLoadIndexInfo(c_load_index_info); DeleteSearchPlan(plan); DeletePlaceholderGroup(placeholderGroup); DeleteSearchResult(c_search_result_on_bigIndex); DeleteCollection(collection); DeleteSegment(segment); } TEST(CApiTest, SealedSegment_search_without_predicates) { constexpr auto TOPK = 5; std::string schema_string = generate_collection_schema("L2", DIM, false); auto collection = NewCollection(schema_string.c_str()); auto schema = ((segcore::Collection*)collection)->get_schema(); auto segment = NewSegment(collection, Sealed, -1); auto N = ROW_COUNT; uint64_t ts_offset = 1000; auto dataset = DataGen(schema, N, ts_offset); auto vec_col = dataset.get_col(FieldId(100)); auto query_ptr = vec_col.data() + 42000 * DIM; auto vec_array = dataset.get_col(FieldId(100)); auto vec_data = serialize(vec_array.get()); auto counter_col = dataset.get_col(FieldId(101)); FieldMeta counter_field_meta(FieldName("counter"), FieldId(101), DataType::INT64); auto count_array = CreateScalarDataArrayFrom(counter_col.data(), N, counter_field_meta); auto counter_data = serialize(count_array.get()); FieldMeta row_id_field_meta(FieldName("RowID"), RowFieldID, DataType::INT64); auto row_ids_array = CreateScalarDataArrayFrom(dataset.row_ids_.data(), N, row_id_field_meta); auto row_ids_data = serialize(row_ids_array.get()); FieldMeta timestamp_field_meta(FieldName("Timestamp"), TimestampFieldID, DataType::INT64); auto timestamps_array = CreateScalarDataArrayFrom(dataset.timestamps_.data(), N, timestamp_field_meta); auto timestamps_data = serialize(timestamps_array.get()); const char* dsl_string = R"( { "bool": { "vector": { "fakevec": { 
"metric_type": "L2", "params": { "nprobe": 10 }, "query": "$0", "topk": 5, "round_decimal": -1 } } } })"; auto c_vec_field_data = CLoadFieldDataInfo{ 100, vec_data.data(), vec_data.size(), N, }; auto status = LoadFieldData(segment, c_vec_field_data); assert(status.error_code == Success); auto c_counter_field_data = CLoadFieldDataInfo{ 101, counter_data.data(), counter_data.size(), N, }; status = LoadFieldData(segment, c_counter_field_data); assert(status.error_code == Success); auto c_id_field_data = CLoadFieldDataInfo{ 0, row_ids_data.data(), row_ids_data.size(), N, }; status = LoadFieldData(segment, c_id_field_data); assert(status.error_code == Success); auto c_ts_field_data = CLoadFieldDataInfo{ 1, timestamps_data.data(), timestamps_data.size(), N, }; status = LoadFieldData(segment, c_ts_field_data); assert(status.error_code == Success); int num_queries = 10; auto blob = generate_query_data(num_queries); void* plan = nullptr; status = CreateSearchPlan(collection, dsl_string, &plan); ASSERT_EQ(status.error_code, Success); void* placeholderGroup = nullptr; status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup); ASSERT_EQ(status.error_code, Success); std::vector placeholderGroups; placeholderGroups.push_back(placeholderGroup); CSearchResult search_result; auto res = Search(segment, plan, placeholderGroup, N + ts_offset, &search_result, -1); std::cout << res.error_msg << std::endl; ASSERT_EQ(res.error_code, Success); CSearchResult search_result2; auto res2 = Search(segment, plan, placeholderGroup, ts_offset, &search_result2, -1); ASSERT_EQ(res2.error_code, Success); DeleteSearchPlan(plan); DeletePlaceholderGroup(placeholderGroup); DeleteSearchResult(search_result); DeleteSearchResult(search_result2); DeleteCollection(collection); DeleteSegment(segment); } TEST(CApiTest, SealedSegment_search_float_With_Expr_Predicate_Range) { constexpr auto TOPK = 5; std::string schema_string = generate_collection_schema("L2", DIM, false); auto collection 
= NewCollection(schema_string.c_str()); auto schema = ((segcore::Collection*)collection)->get_schema(); auto segment = NewSegment(collection, Sealed, -1); auto N = ROW_COUNT; auto dataset = DataGen(schema, N); auto vec_col = dataset.get_col(FieldId(100)); auto query_ptr = vec_col.data() + 42000 * DIM; auto counter_col = dataset.get_col(FieldId(101)); FieldMeta counter_field_meta(FieldName("counter"), FieldId(101), DataType::INT64); auto count_array = CreateScalarDataArrayFrom(counter_col.data(), N, counter_field_meta); auto counter_data = serialize(count_array.get()); FieldMeta row_id_field_meta(FieldName("RowID"), RowFieldID, DataType::INT64); auto row_ids_array = CreateScalarDataArrayFrom(dataset.row_ids_.data(), N, row_id_field_meta); auto row_ids_data = serialize(row_ids_array.get()); FieldMeta timestamp_field_meta(FieldName("Timestamp"), TimestampFieldID, DataType::INT64); auto timestamps_array = CreateScalarDataArrayFrom(dataset.timestamps_.data(), N, timestamp_field_meta); auto timestamps_data = serialize(timestamps_array.get()); const char* serialized_expr_plan = R"(vector_anns: < field_id: 100 predicates: < binary_expr: < op: LogicalAnd left: < unary_range_expr: < column_info: < field_id: 101 data_type: Int64 > op: GreaterEqual value: < int64_val: 42000 > > > right: < unary_range_expr: < column_info: < field_id: 101 data_type: Int64 > op: LessThan value: < int64_val: 42010 > > > > > query_info: < topk: 5 round_decimal: -1 metric_type: "L2" search_params: "{\"nprobe\": 10}" > placeholder_tag: "$0" >)"; // create place_holder_group int num_queries = 10; auto raw_group = CreatePlaceholderGroupFromBlob(num_queries, DIM, query_ptr); auto blob = raw_group.SerializeAsString(); // search on segment's small index void* plan = nullptr; auto binary_plan = translate_text_plan_to_binary_plan(serialized_expr_plan); auto status = CreateSearchPlanByExpr(collection, binary_plan.data(), binary_plan.size(), &plan); assert(status.error_code == Success); void* placeholderGroup 
// SealedSegment_search_float_With_Expr_Predicate_Range (cont.): parse the
// placeholder group, build the IVF_PQ (L2) index over the float vectors, and
// register IVF_PQ/cpu/L2 load-index params for field 100 in a LoadIndexInfo.
= nullptr; status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup); assert(status.error_code == Success); std::vector placeholderGroups; placeholderGroups.push_back(placeholderGroup); Timestamp time = 10000000; // load index to segment auto conf = knowhere::Config{{knowhere::meta::DIM, DIM}, {knowhere::meta::TOPK, TOPK}, {knowhere::IndexParams::nlist, 100}, {knowhere::IndexParams::nprobe, 10}, {knowhere::IndexParams::m, 4}, {knowhere::IndexParams::nbits, 8}, {knowhere::Metric::TYPE, knowhere::Metric::L2}, {knowhere::meta::DEVICEID, 0}}; auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_IVFPQ); // gen query dataset auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr); auto result_on_index = indexing->Query(query_dataset, conf, nullptr); auto ids = result_on_index->Get(knowhere::meta::IDS); auto dis = result_on_index->Get(knowhere::meta::DISTANCE); std::vector vec_ids(ids, ids + TOPK * num_queries); std::vector vec_dis; for (int j = 0; j < TOPK * num_queries; ++j) { vec_dis.push_back(dis[j] * -1); } auto binary_set = indexing->Serialize(conf); void* c_load_index_info = nullptr; status = NewLoadIndexInfo(&c_load_index_info); assert(status.error_code == Success); std::string index_type_key = "index_type"; std::string index_type_value = "IVF_PQ"; std::string index_mode_key = "index_mode"; std::string index_mode_value = "cpu"; std::string metric_type_key = "metric_type"; std::string metric_type_value = "L2"; AppendIndexParam(c_load_index_info, index_type_key.c_str(), index_type_value.c_str()); AppendIndexParam(c_load_index_info, index_mode_key.c_str(), index_mode_value.c_str()); AppendIndexParam(c_load_index_info, metric_type_key.c_str(), metric_type_value.c_str()); AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector); AppendIndex(c_load_index_info, (CBinarySet)&binary_set); auto load_index_info = (LoadIndexInfo*)c_load_index_info; auto query_dataset2 = 
// (cont.) Unlike the DSL variant, this test loads field data into the SAME
// sealed segment, calls UpdateSealedSegmentIndex for the vector index, then
// also builds and attaches a "sort" scalar index on counter (field 101) via a
// second LoadIndexInfo before the final Search on the segment itself.
knowhere::GenDataset(num_queries, DIM, query_ptr); auto index = std::dynamic_pointer_cast(load_index_info->index); auto result_on_index2 = index->Query(query_dataset2, conf, nullptr); auto ids2 = result_on_index2->Get(knowhere::meta::IDS); auto dis2 = result_on_index2->Get(knowhere::meta::DISTANCE); auto c_counter_field_data = CLoadFieldDataInfo{ 101, counter_data.data(), counter_data.size(), N, }; status = LoadFieldData(segment, c_counter_field_data); assert(status.error_code == Success); auto c_id_field_data = CLoadFieldDataInfo{ 0, row_ids_data.data(), row_ids_data.size(), N, }; status = LoadFieldData(segment, c_id_field_data); assert(status.error_code == Success); auto c_ts_field_data = CLoadFieldDataInfo{ 1, timestamps_data.data(), timestamps_data.size(), N, }; status = LoadFieldData(segment, c_ts_field_data); assert(status.error_code == Success); status = UpdateSealedSegmentIndex(segment, c_load_index_info); assert(status.error_code == Success); auto counter_index = GenScalarIndexing(N, counter_col.data()); auto counter_index_binary_set = counter_index->Serialize(conf); CLoadIndexInfo counter_index_info = nullptr; status = NewLoadIndexInfo(&counter_index_info); assert(status.error_code == Success); status = AppendFieldInfo(counter_index_info, 101, CDataType::Int64); assert(status.error_code == Success); std::string counter_index_type_key = "index_type"; std::string counter_index_type_value = "sort"; status = AppendIndexParam(counter_index_info, counter_index_type_key.c_str(), counter_index_type_value.c_str()); assert(status.error_code == Success); status = AppendIndex(counter_index_info, (CBinarySet)&counter_index_binary_set); assert(status.error_code == Success); status = UpdateSealedSegmentIndex(segment, counter_index_info); assert(status.error_code == Success); CSearchResult c_search_result_on_bigIndex; auto res_after_load_index = Search(segment, plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1); assert(res_after_load_index.error_code == 
// (cont.) Final check: each query's best hit is row 42000 + i; then cleanup.
Success); auto search_result_on_bigIndex = (SearchResult*)c_search_result_on_bigIndex; for (int i = 0; i < num_queries; ++i) { auto offset = i * TOPK; ASSERT_EQ(search_result_on_bigIndex->seg_offsets_[offset], 42000 + i); } DeleteLoadIndexInfo(c_load_index_info); DeleteSearchPlan(plan); DeletePlaceholderGroup(placeholderGroup); DeleteSearchResult(c_search_result_on_bigIndex); DeleteCollection(collection); DeleteSegment(segment); }