milvus/internal/core/unittest/test_hybrid_index.cpp

577 lines
19 KiB
C++

// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <gtest/gtest.h>
#include <functional>
#include <boost/filesystem.hpp>
#include <unordered_set>
#include <memory>
#include "common/Tracer.h"
#include "index/BitmapIndex.h"
#include "index/HybridScalarIndex.h"
#include "storage/Util.h"
#include "storage/InsertData.h"
#include "indexbuilder/IndexFactory.h"
#include "index/IndexFactory.h"
#include "test_utils/indexbuilder_test_utils.h"
#include "index/Meta.h"
#include "pb/schema.pb.h"
using namespace milvus::index;
using namespace milvus::indexbuilder;
using namespace milvus;
using namespace milvus::index;
// Produces `size` pseudo-random values, each obtained as rand() % cardinality
// and implicitly converted to T. Uses the global rand() state, so the output
// depends on whatever seed is currently active.
template <typename T>
static std::vector<T>
GenerateData(const size_t size, const size_t cardinality) {
    std::vector<T> values;
    for (size_t remaining = size; remaining != 0; --remaining) {
        const auto drawn = rand() % cardinality;
        values.push_back(drawn);
    }
    return values;
}

// bool specialization: each element is true when rand() yields an even number.
// `cardinality` is not consulted here.
template <>
std::vector<bool>
GenerateData<bool>(const size_t size, const size_t cardinality) {
    std::vector<bool> flags(size);
    for (size_t idx = 0; idx < size; ++idx) {
        flags[idx] = (rand() % 2 == 0);
    }
    return flags;
}

// std::string specialization: each element is the decimal text of
// rand() % cardinality.
template <>
std::vector<std::string>
GenerateData<std::string>(const size_t size, const size_t cardinality) {
    std::vector<std::string> texts;
    size_t produced = 0;
    while (produced < size) {
        const auto bucket = rand() % cardinality;
        texts.push_back(std::to_string(bucket));
        ++produced;
    }
    return texts;
}
// Typed end-to-end fixture for the hybrid scalar index.
//
// SetUp() calls SetParam() to pick the data shape (row count, cardinality,
// nullability, index ids), creates a local chunk manager and then Init()
// which:
//   1. generates nb_ values with GenerateData<T>(nb_, cardinality_),
//   2. wraps them in a FieldData (marking every odd row as null when
//      nullable_ is set), serializes them as InsertData and writes the bytes
//      through chunk_manager_,
//   3. builds and uploads a HYBRID_INDEX_TYPE index from that insert file,
//   4. loads the uploaded index files into index_.
//
// The Test*Func() helpers then query index_ and compare every bit of the
// returned bitset against a reference computed from data_ / valid_data_.
// Derived fixtures override SetParam() to exercise other configurations.
template <typename T>
class HybridIndexTestV1 : public testing::Test {
 protected:
    // Generates the data set, persists it as an insert log, builds the hybrid
    // index from it and loads the result into index_.
    //
    // The id parameters are forwarded into storage::FieldDataMeta /
    // storage::IndexMeta and into the insert-log path.
    void
    Init(int64_t collection_id,
         int64_t partition_id,
         int64_t segment_id,
         int64_t field_id,
         int64_t index_build_id,
         int64_t index_version) {
        proto::schema::FieldSchema field_schema;
        field_schema.set_nullable(nullable_);
        // NOTE(review): float/double are handled here but SetUp() has no
        // matching type_ mapping, and std::string maps to String here while
        // SetUp() uses DataType::VARCHAR - confirm both are intended.
        if constexpr (std::is_same_v<int8_t, T>) {
            field_schema.set_data_type(proto::schema::DataType::Int8);
        } else if constexpr (std::is_same_v<int16_t, T>) {
            field_schema.set_data_type(proto::schema::DataType::Int16);
        } else if constexpr (std::is_same_v<int32_t, T>) {
            field_schema.set_data_type(proto::schema::DataType::Int32);
        } else if constexpr (std::is_same_v<int64_t, T>) {
            field_schema.set_data_type(proto::schema::DataType::Int64);
        } else if constexpr (std::is_same_v<float, T>) {
            field_schema.set_data_type(proto::schema::DataType::Float);
        } else if constexpr (std::is_same_v<double, T>) {
            field_schema.set_data_type(proto::schema::DataType::Double);
        } else if constexpr (std::is_same_v<std::string, T>) {
            field_schema.set_data_type(proto::schema::DataType::String);
        }
        auto field_meta = storage::FieldDataMeta{
            collection_id, partition_id, segment_id, field_id, field_schema};
        auto index_meta = storage::IndexMeta{
            segment_id, field_id, index_build_id, index_version};

        // Keep a copy of the generated values in data_; the Test*Func()
        // helpers use it as the reference for every row.
        std::vector<T> data_gen;
        data_gen = GenerateData<T>(nb_, cardinality_);
        for (const auto& x : data_gen) {
            data_.push_back(x);
        }

        auto field_data = storage::CreateFieldData(type_, nullable_);
        if (nullable_) {
            valid_data_.reserve(nb_);
            // One validity bit per row, packed 8 rows per byte with row i at
            // bit (i % 8) of byte (i / 8). The buffer is zero-initialized so
            // that null rows and the padding bits past nb_ are well defined,
            // and it is owned by a vector so it is released even if
            // FillFieldData throws.
            std::vector<uint8_t> valid_bitmap((nb_ + 7) / 8, 0);
            for (size_t i = 0; i < nb_; i++) {
                // Even rows are valid, odd rows are null.
                const bool is_valid = (i % 2 == 0);
                valid_data_.push_back(is_valid);
                if (is_valid) {
                    valid_bitmap[i / 8] |= static_cast<uint8_t>(1u << (i % 8));
                }
            }
            field_data->FillFieldData(
                data_.data(), valid_bitmap.data(), data_.size());
        } else {
            field_data->FillFieldData(data_.data(), data_.size());
        }

        storage::InsertData insert_data(field_data);
        insert_data.SetFieldDataMeta(field_meta);
        insert_data.SetTimestamps(0, 100);
        auto serialized_bytes = insert_data.Serialize(storage::Remote);
        auto log_path = fmt::format("/{}/{}/{}/{}/{}/{}",
                                    "/tmp/test_hybrid/",
                                    collection_id,
                                    partition_id,
                                    segment_id,
                                    field_id,
                                    0);
        chunk_manager_->Write(
            log_path, serialized_bytes.data(), serialized_bytes.size());

        storage::FileManagerContext ctx(field_meta, index_meta, chunk_manager_);
        std::vector<std::string> index_files;
        Config config;
        config["index_type"] = milvus::index::HYBRID_INDEX_TYPE;
        config["insert_files"] = std::vector<std::string>{log_path};
        config["bitmap_cardinality_limit"] = "1000";
        {
            // Build phase: the builder is scoped so it is destroyed before
            // the index is loaded back below.
            auto build_index =
                indexbuilder::IndexFactory::GetInstance().CreateIndex(
                    type_, config, ctx);
            build_index->Build();
            auto create_index_result = build_index->Upload();
            auto memSize = create_index_result->GetMemSize();
            auto serializedSize = create_index_result->GetSerializedSize();
            ASSERT_GT(memSize, 0);
            ASSERT_GT(serializedSize, 0);
            index_files = create_index_result->GetIndexFiles();
        }

        // Load phase: create a fresh index object and load the uploaded files.
        index::CreateIndexInfo index_info{};
        index_info.index_type = milvus::index::HYBRID_INDEX_TYPE;
        index_info.field_type = type_;
        config["index_files"] = index_files;
        ctx.set_for_loading_index(true);
        index_ =
            index::IndexFactory::GetInstance().CreateIndex(index_info, ctx);
        index_->Load(milvus::tracer::TraceContext{}, config);
    }

    // Chooses the data shape for this fixture. Derived fixtures override this
    // to change cardinality / nullability / index ids.
    virtual void
    SetParam() {
        nb_ = 10000;
        cardinality_ = 30;
        nullable_ = false;
        index_version_ = 1001;
        index_build_id_ = 1001;
    }

    // Maps T to the matching DataType, creates a local chunk manager and runs
    // Init() with fixed collection/partition/segment/field ids.
    void
    SetUp() override {
        SetParam();
        if constexpr (std::is_same_v<T, int8_t>) {
            type_ = DataType::INT8;
        } else if constexpr (std::is_same_v<T, int16_t>) {
            type_ = DataType::INT16;
        } else if constexpr (std::is_same_v<T, int32_t>) {
            type_ = DataType::INT32;
        } else if constexpr (std::is_same_v<T, int64_t>) {
            type_ = DataType::INT64;
        } else if constexpr (std::is_same_v<T, std::string>) {
            type_ = DataType::VARCHAR;
        }
        int64_t collection_id = 1;
        int64_t partition_id = 2;
        int64_t segment_id = 3;
        int64_t field_id = 101;
        // NOTE(review): the directory name mentions "bitmap-index"; if another
        // suite uses the same root, the cleanup in the destructor could
        // interfere with it - confirm.
        std::string root_path = "/tmp/test-bitmap-index";
        storage::StorageConfig storage_config;
        storage_config.storage_type = "local";
        storage_config.root_path = root_path;
        chunk_manager_ = storage::CreateChunkManager(storage_config);
        Init(collection_id,
             partition_id,
             segment_id,
             field_id,
             index_build_id_,
             index_version_);
    }

    // Removes everything under the chunk manager's root path. Cleanup is
    // best-effort: the error_code overload is used so a filesystem failure
    // cannot throw out of the destructor.
    ~HybridIndexTestV1() override {
        // chunk_manager_ is only assigned in SetUp(); nothing to clean
        // otherwise.
        if (chunk_manager_ != nullptr) {
            boost::system::error_code ec;
            boost::filesystem::remove_all(chunk_manager_->GetRootPath(), ec);
        }
    }

    // Returns index_ viewed as HybridScalarIndex<T>, or nullptr when index_
    // is not of that dynamic type. Callers must check the result.
    index::HybridScalarIndex<T>*
    GetHybridIndex() const {
        return dynamic_cast<index::HybridScalarIndex<T>*>(index_.get());
    }

    // True when row i was written as null (only possible when nullable_).
    bool
    IsNullRow(size_t i) const {
        return nullable_ && !valid_data_[i];
    }

 public:
    // Queries In() with the first 10 generated values and expects a row to be
    // set exactly when its value is in that set; null rows must be unset.
    void
    TestInFunc() {
        boost::container::vector<T> test_data;
        std::unordered_set<T> s;
        size_t nq = 10;
        for (size_t i = 0; i < nq; i++) {
            test_data.push_back(data_[i]);
            s.insert(data_[i]);
        }
        auto index_ptr = GetHybridIndex();
        ASSERT_NE(index_ptr, nullptr);
        auto bitset = index_ptr->In(test_data.size(), test_data.data());
        for (size_t i = 0; i < bitset.size(); i++) {
            if (IsNullRow(i)) {
                ASSERT_EQ(bitset[i], false);
            } else {
                ASSERT_EQ(bitset[i], s.find(data_[i]) != s.end());
            }
        }
    }

    // Queries NotIn() with the first 10 generated values and expects a row to
    // be set exactly when its value is NOT in that set; null rows must be
    // unset.
    void
    TestNotInFunc() {
        boost::container::vector<T> test_data;
        std::unordered_set<T> s;
        size_t nq = 10;
        for (size_t i = 0; i < nq; i++) {
            test_data.push_back(data_[i]);
            s.insert(data_[i]);
        }
        auto index_ptr = GetHybridIndex();
        ASSERT_NE(index_ptr, nullptr);
        auto bitset = index_ptr->NotIn(test_data.size(), test_data.data());
        for (size_t i = 0; i < bitset.size(); i++) {
            if (IsNullRow(i)) {
                ASSERT_EQ(bitset[i], false);
            } else {
                ASSERT_NE(bitset[i], s.find(data_[i]) != s.end());
            }
        }
    }

    // IsNull() must be set exactly for the rows written as null.
    void
    TestIsNullFunc() {
        auto index_ptr = GetHybridIndex();
        ASSERT_NE(index_ptr, nullptr);
        auto bitset = index_ptr->IsNull();
        for (size_t i = 0; i < bitset.size(); i++) {
            if (IsNullRow(i)) {
                ASSERT_EQ(bitset[i], true);
            } else {
                ASSERT_EQ(bitset[i], false);
            }
        }
    }

    // IsNotNull() must be set exactly for the rows that are not null.
    void
    TestIsNotNullFunc() {
        auto index_ptr = GetHybridIndex();
        ASSERT_NE(index_ptr, nullptr);
        auto bitset = index_ptr->IsNotNull();
        for (size_t i = 0; i < bitset.size(); i++) {
            if (IsNullRow(i)) {
                ASSERT_EQ(bitset[i], false);
            } else {
                ASSERT_EQ(bitset[i], true);
            }
        }
    }

    // Checks the single-bound Range(value, op) query for >, >=, <, <= against
    // the constant 10. Skipped (compiled out) for std::string.
    void
    TestCompareValueFunc() {
        if constexpr (!std::is_same_v<T, std::string>) {
            using RefFunc = std::function<bool(int64_t)>;
            std::vector<std::tuple<T, OpType, RefFunc>> test_cases{
                {10,
                 OpType::GreaterThan,
                 [&](int64_t i) -> bool { return data_[i] > 10; }},
                {10,
                 OpType::GreaterEqual,
                 [&](int64_t i) -> bool { return data_[i] >= 10; }},
                {10,
                 OpType::LessThan,
                 [&](int64_t i) -> bool { return data_[i] < 10; }},
                {10,
                 OpType::LessEqual,
                 [&](int64_t i) -> bool { return data_[i] <= 10; }},
            };
            auto index_ptr = GetHybridIndex();
            ASSERT_NE(index_ptr, nullptr);
            for (const auto& [test_value, op, ref] : test_cases) {
                auto bitset = index_ptr->Range(test_value, op);
                for (size_t i = 0; i < bitset.size(); i++) {
                    auto ans = bitset[i];
                    auto should = ref(i);
                    if (IsNullRow(i)) {
                        // Null rows never match, whatever the stored value.
                        ASSERT_EQ(ans, false)
                            << "op: " << op << ", @" << i << ", ans: " << ans
                            << ", ref: " << should;
                    } else {
                        ASSERT_EQ(ans, should)
                            << "op: " << op << ", @" << i << ", ans: " << ans
                            << ", ref: " << should;
                    }
                }
            }
        }
    }

    // Checks the two-bound Range(lower, lower_inclusive, upper,
    // upper_inclusive) query over [10, 30] with all four inclusivity
    // combinations. Skipped (compiled out) for std::string.
    void
    TestRangeCompareFunc() {
        if constexpr (!std::is_same_v<T, std::string>) {
            using RefFunc = std::function<bool(int64_t)>;
            struct TestParam {
                int64_t lower_val;
                int64_t upper_val;
                bool lower_inclusive;
                bool upper_inclusive;
                RefFunc ref;
            };
            std::vector<TestParam> test_cases = {
                {
                    10,
                    30,
                    false,
                    false,
                    [&](int64_t i) { return 10 < data_[i] && data_[i] < 30; },
                },
                {
                    10,
                    30,
                    true,
                    false,
                    [&](int64_t i) { return 10 <= data_[i] && data_[i] < 30; },
                },
                {
                    10,
                    30,
                    true,
                    true,
                    [&](int64_t i) { return 10 <= data_[i] && data_[i] <= 30; },
                },
                {
                    10,
                    30,
                    false,
                    true,
                    [&](int64_t i) { return 10 < data_[i] && data_[i] <= 30; },
                }};
            auto index_ptr = GetHybridIndex();
            ASSERT_NE(index_ptr, nullptr);
            for (const auto& test_case : test_cases) {
                auto bitset = index_ptr->Range(test_case.lower_val,
                                               test_case.lower_inclusive,
                                               test_case.upper_val,
                                               test_case.upper_inclusive);
                for (size_t i = 0; i < bitset.size(); i++) {
                    auto ans = bitset[i];
                    auto should = test_case.ref(i);
                    if (IsNullRow(i)) {
                        // Null rows never match, whatever the stored value.
                        ASSERT_EQ(ans, false)
                            << "lower:" << test_case.lower_val
                            << "upper:" << test_case.upper_val << ", @" << i
                            << ", ans: " << ans << ", ref: " << false;
                    } else {
                        ASSERT_EQ(ans, should)
                            << "lower:" << test_case.lower_val
                            << "upper:" << test_case.upper_val << ", @" << i
                            << ", ans: " << ans << ", ref: " << should;
                    }
                }
            }
        }
    }

 public:
    // Loaded index under test (set at the end of Init()).
    IndexBasePtr index_;
    // Field type matching T; assigned in SetUp().
    DataType type_{};
    // Number of generated rows.
    size_t nb_{0};
    // Modulus passed to GenerateData(), i.e. upper bound on distinct values.
    size_t cardinality_{0};
    // Reference copy of the generated values, one per row.
    boost::container::vector<T> data_;
    std::shared_ptr<storage::ChunkManager> chunk_manager_;
    // Whether the field is written with a validity bitmap.
    bool nullable_{false};
    // Per-row validity; only populated when nullable_ is true.
    FixedVector<bool> valid_data_;
    int index_build_id_{0};
    int index_version_{0};
};
// Type-parameterized suite over the HybridIndexTestV1 fixture. Each test body
// below delegates to the fixture helper of the matching name; the index has
// already been built and loaded by the fixture's SetUp().
TYPED_TEST_SUITE_P(HybridIndexTestV1);
// Count() is expected to equal nb_, the number of generated rows.
TYPED_TEST_P(HybridIndexTestV1, CountFuncTest) {
auto count = this->index_->Count();
EXPECT_EQ(count, this->nb_);
}
// In() against the first values of the data set.
TYPED_TEST_P(HybridIndexTestV1, INFuncTest) {
this->TestInFunc();
}
// NotIn() against the first values of the data set.
TYPED_TEST_P(HybridIndexTestV1, NotINFuncTest) {
this->TestNotInFunc();
}
// IsNull() per-row check.
TYPED_TEST_P(HybridIndexTestV1, IsNullFuncTest) {
this->TestIsNullFunc();
}
// IsNotNull() per-row check.
TYPED_TEST_P(HybridIndexTestV1, IsNotNullFuncTest) {
this->TestIsNotNullFunc();
}
// Single-bound Range(value, op) check.
TYPED_TEST_P(HybridIndexTestV1, CompareValFuncTest) {
this->TestCompareValueFunc();
}
// Two-bound Range(lower, ..., upper, ...) check.
TYPED_TEST_P(HybridIndexTestV1, TestRangeCompareFuncTest) {
this->TestRangeCompareFunc();
}
// Element types every hybrid-index suite in this file is instantiated with.
using BitmapType =
testing::Types<int8_t, int16_t, int32_t, int64_t, std::string>;
// Every TYPED_TEST_P above has to be listed here to be registered.
REGISTER_TYPED_TEST_SUITE_P(HybridIndexTestV1,
CountFuncTest,
INFuncTest,
IsNullFuncTest,
IsNotNullFuncTest,
NotINFuncTest,
CompareValFuncTest,
TestRangeCompareFuncTest);
// The V1 fixture uses cardinality_ = 30, below the bitmap_cardinality_limit
// of "1000" that Init() puts into the build config.
INSTANTIATE_TYPED_TEST_SUITE_P(HybridIndexE2ECheck_LowCardinality,
HybridIndexTestV1,
BitmapType);
// High-cardinality variant of HybridIndexTestV1: same row count, no nulls,
// but values are drawn modulo 2000 and the index ids are 1002.
template <typename T>
class HybridIndexTestV2 : public HybridIndexTestV1<T> {
 public:
    ~HybridIndexTestV2() override = default;

    // Overrides the fixture parameters read by the base SetUp()/Init().
    void
    SetParam() override {
        this->index_build_id_ = 1002;
        this->index_version_ = 1002;
        this->nullable_ = false;
        this->cardinality_ = 2000;
        this->nb_ = 10000;
    }
};
// Type-parameterized suite over the HybridIndexTestV2 fixture
// (cardinality_ = 2000, not nullable). The test bodies mirror the
// HybridIndexTestV1 suite and delegate to the inherited fixture helpers.
TYPED_TEST_SUITE_P(HybridIndexTestV2);
// Count() is expected to equal nb_, the number of generated rows.
TYPED_TEST_P(HybridIndexTestV2, CountFuncTest) {
auto count = this->index_->Count();
EXPECT_EQ(count, this->nb_);
}
// In() against the first values of the data set.
TYPED_TEST_P(HybridIndexTestV2, INFuncTest) {
this->TestInFunc();
}
// NotIn() against the first values of the data set.
TYPED_TEST_P(HybridIndexTestV2, NotINFuncTest) {
this->TestNotInFunc();
}
// IsNull() per-row check.
TYPED_TEST_P(HybridIndexTestV2, IsNullFuncTest) {
this->TestIsNullFunc();
}
// IsNotNull() per-row check.
TYPED_TEST_P(HybridIndexTestV2, IsNotNullFuncTest) {
this->TestIsNotNullFunc();
}
// Single-bound Range(value, op) check.
TYPED_TEST_P(HybridIndexTestV2, CompareValFuncTest) {
this->TestCompareValueFunc();
}
// Two-bound Range(lower, ..., upper, ...) check.
TYPED_TEST_P(HybridIndexTestV2, TestRangeCompareFuncTest) {
this->TestRangeCompareFunc();
}
// Nullable variant of HybridIndexTestV1: values are drawn modulo 2000, the
// field is marked nullable and the index ids are 1003.
template <typename T>
class HybridIndexTestNullable : public HybridIndexTestV1<T> {
 public:
    ~HybridIndexTestNullable() override = default;

    // Overrides the fixture parameters read by the base SetUp()/Init().
    void
    SetParam() override {
        this->index_build_id_ = 1003;
        this->index_version_ = 1003;
        this->nullable_ = true;
        this->cardinality_ = 2000;
        this->nb_ = 10000;
    }
};
// Type-parameterized suite over the HybridIndexTestNullable fixture
// (cardinality_ = 2000, nullable_ = true). The test bodies mirror the
// HybridIndexTestV1 suite and delegate to the inherited fixture helpers.
TYPED_TEST_SUITE_P(HybridIndexTestNullable);
// Count() is expected to equal nb_, the number of generated rows
// (null rows included).
TYPED_TEST_P(HybridIndexTestNullable, CountFuncTest) {
auto count = this->index_->Count();
EXPECT_EQ(count, this->nb_);
}
// In() against the first values of the data set; null rows must not match.
TYPED_TEST_P(HybridIndexTestNullable, INFuncTest) {
this->TestInFunc();
}
// NotIn() against the first values of the data set; null rows must not match.
TYPED_TEST_P(HybridIndexTestNullable, NotINFuncTest) {
this->TestNotInFunc();
}
// IsNull() per-row check.
TYPED_TEST_P(HybridIndexTestNullable, IsNullFuncTest) {
this->TestIsNullFunc();
}
// IsNotNull() per-row check.
TYPED_TEST_P(HybridIndexTestNullable, IsNotNullFuncTest) {
this->TestIsNotNullFunc();
}
// Single-bound Range(value, op) check.
TYPED_TEST_P(HybridIndexTestNullable, CompareValFuncTest) {
this->TestCompareValueFunc();
}
// Two-bound Range(lower, ..., upper, ...) check.
TYPED_TEST_P(HybridIndexTestNullable, TestRangeCompareFuncTest) {
this->TestRangeCompareFunc();
}
// Element types the suites below are instantiated with. This repeats the
// BitmapType alias declared earlier in this file with the identical type list.
using BitmapType =
testing::Types<int8_t, int16_t, int32_t, int64_t, std::string>;
// Every TYPED_TEST_P of the HybridIndexTestV2 suite has to be listed here.
REGISTER_TYPED_TEST_SUITE_P(HybridIndexTestV2,
CountFuncTest,
INFuncTest,
IsNullFuncTest,
IsNotNullFuncTest,
NotINFuncTest,
CompareValFuncTest,
TestRangeCompareFuncTest);
// Every TYPED_TEST_P of the HybridIndexTestNullable suite has to be listed
// here.
REGISTER_TYPED_TEST_SUITE_P(HybridIndexTestNullable,
CountFuncTest,
INFuncTest,
IsNullFuncTest,
IsNotNullFuncTest,
NotINFuncTest,
CompareValFuncTest,
TestRangeCompareFuncTest);
// The V2 fixture uses cardinality_ = 2000, above the bitmap_cardinality_limit
// of "1000" that the base fixture puts into the build config.
INSTANTIATE_TYPED_TEST_SUITE_P(HybridIndexE2ECheck_HighCardinality,
HybridIndexTestV2,
BitmapType);
// Same cardinality as V2 but with nullable_ = true.
INSTANTIATE_TYPED_TEST_SUITE_P(HybridIndexE2ECheck_Nullable,
HybridIndexTestNullable,
BitmapType);