From 626b1b2f5e75b4500b9f18dc7c0aeec0cac5284e Mon Sep 17 00:00:00 2001 From: zhagnlu <1542303831@qq.com> Date: Thu, 8 Aug 2024 10:32:16 +0800 Subject: [PATCH] fix:redefine hybrid internal index type (#35314) #32900 Signed-off-by: luzhang Co-authored-by: luzhang --- internal/core/src/index/BitmapIndex.h | 7 +- internal/core/src/index/HybridScalarIndex.cpp | 16 +- .../core/src/index/InvertedIndexTantivy.h | 7 +- internal/core/src/index/ScalarIndex.h | 5 + internal/core/unittest/CMakeLists.txt | 1 + internal/core/unittest/test_bitmap_index.cpp | 403 ++++++++++++++++++ 6 files changed, 429 insertions(+), 10 deletions(-) create mode 100644 internal/core/unittest/test_bitmap_index.cpp diff --git a/internal/core/src/index/BitmapIndex.h b/internal/core/src/index/BitmapIndex.h index a29f401f75..c37cfec1cd 100644 --- a/internal/core/src/index/BitmapIndex.h +++ b/internal/core/src/index/BitmapIndex.h @@ -117,6 +117,11 @@ class BitmapIndex : public ScalarIndex { const TargetBitmap Query(const DatasetPtr& dataset) override; + bool + SupportPatternMatch() const override { + return SupportRegexQuery(); + } + const TargetBitmap PatternMatch(const std::string& pattern) override { PatternMatchTranslator translator; @@ -126,7 +131,7 @@ class BitmapIndex : public ScalarIndex { bool SupportRegexQuery() const override { - return true; + return std::is_same_v; } const TargetBitmap diff --git a/internal/core/src/index/HybridScalarIndex.cpp b/internal/core/src/index/HybridScalarIndex.cpp index 0118039781..e9280523df 100644 --- a/internal/core/src/index/HybridScalarIndex.cpp +++ b/internal/core/src/index/HybridScalarIndex.cpp @@ -50,9 +50,9 @@ HybridScalarIndex::SelectIndexBuildType(size_t n, const T* values) { distinct_vals.insert(values[i]); } - // Decide whether to select bitmap index or stl sort + // Decide whether to select bitmap index or inverted index if (distinct_vals.size() >= bitmap_index_cardinality_limit_) { - internal_index_type_ = ScalarIndexType::STLSORT; + 
internal_index_type_ = ScalarIndexType::INVERTED; } else { internal_index_type_ = ScalarIndexType::BITMAP; } @@ -71,9 +71,9 @@ HybridScalarIndex::SelectIndexBuildType( } } - // Decide whether to select bitmap index or marisa index + // Decide whether to select bitmap index or inverted index if (distinct_vals.size() >= bitmap_index_cardinality_limit_) { - internal_index_type_ = ScalarIndexType::MARISA; + internal_index_type_ = ScalarIndexType::INVERTED; } else { internal_index_type_ = ScalarIndexType::BITMAP; } @@ -96,9 +96,9 @@ HybridScalarIndex::SelectBuildTypeForPrimitiveType( } } - // Decide whether to select bitmap index or stl sort + // Decide whether to select bitmap index or inverted index if (distinct_vals.size() >= bitmap_index_cardinality_limit_) { - internal_index_type_ = ScalarIndexType::STLSORT; + internal_index_type_ = ScalarIndexType::INVERTED; } else { internal_index_type_ = ScalarIndexType::BITMAP; } @@ -121,9 +121,9 @@ HybridScalarIndex::SelectBuildTypeForPrimitiveType( } } - // Decide whether to select bitmap index or marisa sort + // Decide whether to select bitmap index or inverted index if (distinct_vals.size() >= bitmap_index_cardinality_limit_) { - internal_index_type_ = ScalarIndexType::MARISA; + internal_index_type_ = ScalarIndexType::INVERTED; } else { internal_index_type_ = ScalarIndexType::BITMAP; } diff --git a/internal/core/src/index/InvertedIndexTantivy.h b/internal/core/src/index/InvertedIndexTantivy.h index 98114a7914..12165c572b 100644 --- a/internal/core/src/index/InvertedIndexTantivy.h +++ b/internal/core/src/index/InvertedIndexTantivy.h @@ -154,9 +154,14 @@ class InvertedIndexTantivy : public ScalarIndex { return RegexQuery(regex_pattern); } + bool + SupportPatternMatch() const override { + return SupportRegexQuery(); + } + bool SupportRegexQuery() const override { - return true; + return std::is_same_v; } const TargetBitmap diff --git a/internal/core/src/index/ScalarIndex.h b/internal/core/src/index/ScalarIndex.h index 
badff11383..bdb576dc2f 100644 --- a/internal/core/src/index/ScalarIndex.h +++ b/internal/core/src/index/ScalarIndex.h @@ -114,6 +114,11 @@ class ScalarIndex : public IndexBase { virtual const TargetBitmap Query(const DatasetPtr& dataset); + virtual bool + SupportPatternMatch() const { + return false; + } + virtual const TargetBitmap PatternMatch(const std::string& pattern) { PanicInfo(Unsupported, "pattern match is not supported"); diff --git a/internal/core/unittest/CMakeLists.txt b/internal/core/unittest/CMakeLists.txt index 246b844a6f..cfe2ad00b7 100644 --- a/internal/core/unittest/CMakeLists.txt +++ b/internal/core/unittest/CMakeLists.txt @@ -31,6 +31,7 @@ set(MILVUS_TEST_FILES test_growing.cpp test_growing_index.cpp test_indexing.cpp + test_bitmap_index.cpp test_hybrid_index.cpp test_array_bitmap_index.cpp test_index_c_api.cpp diff --git a/internal/core/unittest/test_bitmap_index.cpp b/internal/core/unittest/test_bitmap_index.cpp new file mode 100644 index 0000000000..3ea8811965 --- /dev/null +++ b/internal/core/unittest/test_bitmap_index.cpp @@ -0,0 +1,403 @@ +// Copyright(C) 2019 - 2020 Zilliz.All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. 
See the License for the specific language governing permissions and limitations under the License + +#include +#include +#include +#include +#include + +#include "common/Tracer.h" +#include "index/BitmapIndex.h" +#include "storage/Util.h" +#include "storage/InsertData.h" +#include "indexbuilder/IndexFactory.h" +#include "index/IndexFactory.h" +#include "test_utils/indexbuilder_test_utils.h" +#include "index/Meta.h" + +using namespace milvus::index; +using namespace milvus::indexbuilder; +using namespace milvus; +using namespace milvus::index; + +template +static std::vector +GenerateData(const size_t size, const size_t cardinality) { + std::vector result; + for (size_t i = 0; i < size; ++i) { + result.push_back(rand() % cardinality); + } + return result; +} + +template <> +std::vector +GenerateData(const size_t size, const size_t cardinality) { + std::vector result; + for (size_t i = 0; i < size; ++i) { + result.push_back(rand() % 2 == 0); + } + return result; +} + +template <> +std::vector +GenerateData(const size_t size, const size_t cardinality) { + std::vector result; + for (size_t i = 0; i < size; ++i) { + result.push_back(std::to_string(rand() % cardinality)); + } + return result; +} + +template +class BitmapIndexTest : public testing::Test { + protected: + void + Init(int64_t collection_id, + int64_t partition_id, + int64_t segment_id, + int64_t field_id, + int64_t index_build_id, + int64_t index_version) { + proto::schema::FieldSchema field_schema; + if constexpr (std::is_same_v) { + field_schema.set_data_type(proto::schema::DataType::Int8); + } else if constexpr (std::is_same_v) { + field_schema.set_data_type(proto::schema::DataType::Int16); + } else if constexpr (std::is_same_v) { + field_schema.set_data_type(proto::schema::DataType::Int32); + } else if constexpr (std::is_same_v) { + field_schema.set_data_type(proto::schema::DataType::Int64); + } else if constexpr (std::is_same_v) { + field_schema.set_data_type(proto::schema::DataType::Float); + } else if 
constexpr (std::is_same_v) { + field_schema.set_data_type(proto::schema::DataType::Double); + } else if constexpr (std::is_same_v) { + field_schema.set_data_type(proto::schema::DataType::String); + } + auto field_meta = storage::FieldDataMeta{ + collection_id, partition_id, segment_id, field_id, field_schema}; + auto index_meta = storage::IndexMeta{ + segment_id, field_id, index_build_id, index_version}; + + std::vector data_gen; + data_gen = GenerateData(nb_, cardinality_); + for (auto x : data_gen) { + data_.push_back(x); + } + + auto field_data = storage::CreateFieldData(type_); + field_data->FillFieldData(data_.data(), data_.size()); + storage::InsertData insert_data(field_data); + insert_data.SetFieldDataMeta(field_meta); + insert_data.SetTimestamps(0, 100); + + auto serialized_bytes = insert_data.Serialize(storage::Remote); + + auto log_path = fmt::format("/{}/{}/{}/{}/{}/{}", + "/tmp/test_bitmap/", + collection_id, + partition_id, + segment_id, + field_id, + 0); + chunk_manager_->Write( + log_path, serialized_bytes.data(), serialized_bytes.size()); + + storage::FileManagerContext ctx(field_meta, index_meta, chunk_manager_); + std::vector index_files; + + Config config; + config["index_type"] = milvus::index::BITMAP_INDEX_TYPE; + config["insert_files"] = std::vector{log_path}; + + auto build_index = + indexbuilder::IndexFactory::GetInstance().CreateIndex( + type_, config, ctx); + build_index->Build(); + + auto binary_set = build_index->Upload(); + for (const auto& [key, _] : binary_set.binary_map_) { + index_files.push_back(key); + } + + index::CreateIndexInfo index_info{}; + index_info.index_type = milvus::index::BITMAP_INDEX_TYPE; + index_info.field_type = type_; + + config["index_files"] = index_files; + + index_ = + index::IndexFactory::GetInstance().CreateIndex(index_info, ctx); + index_->Load(milvus::tracer::TraceContext{}, config); + } + + virtual void + SetParam() { + nb_ = 10000; + cardinality_ = 30; + } + void + SetUp() override { + SetParam(); + + 
if constexpr (std::is_same_v) { + type_ = DataType::INT8; + } else if constexpr (std::is_same_v) { + type_ = DataType::INT16; + } else if constexpr (std::is_same_v) { + type_ = DataType::INT32; + } else if constexpr (std::is_same_v) { + type_ = DataType::INT64; + } else if constexpr (std::is_same_v) { + type_ = DataType::VARCHAR; + } + int64_t collection_id = 1; + int64_t partition_id = 2; + int64_t segment_id = 3; + int64_t field_id = 101; + int64_t index_build_id = 1000; + int64_t index_version = 10000; + std::string root_path = "/tmp/test-bitmap-index/"; + + storage::StorageConfig storage_config; + storage_config.storage_type = "local"; + storage_config.root_path = root_path; + chunk_manager_ = storage::CreateChunkManager(storage_config); + + Init(collection_id, + partition_id, + segment_id, + field_id, + index_build_id, + index_version); + } + + virtual ~BitmapIndexTest() override { + boost::filesystem::remove_all(chunk_manager_->GetRootPath()); + } + + public: + void + TestInFunc() { + boost::container::vector test_data; + std::unordered_set s; + size_t nq = 10; + for (size_t i = 0; i < nq; i++) { + test_data.push_back(data_[i]); + s.insert(data_[i]); + } + auto index_ptr = dynamic_cast*>(index_.get()); + auto bitset = index_ptr->In(test_data.size(), test_data.data()); + for (size_t i = 0; i < bitset.size(); i++) { + ASSERT_EQ(bitset[i], s.find(data_[i]) != s.end()); + } + } + + void + TestNotInFunc() { + boost::container::vector test_data; + std::unordered_set s; + size_t nq = 10; + for (size_t i = 0; i < nq; i++) { + test_data.push_back(data_[i]); + s.insert(data_[i]); + } + auto index_ptr = dynamic_cast*>(index_.get()); + auto bitset = index_ptr->NotIn(test_data.size(), test_data.data()); + for (size_t i = 0; i < bitset.size(); i++) { + ASSERT_EQ(bitset[i], s.find(data_[i]) == s.end()); + } + } + + void + TestCompareValueFunc() { + if constexpr (!std::is_same_v) { + using RefFunc = std::function; + std::vector> test_cases{ + {10, + OpType::GreaterThan, + 
[&](int64_t i) -> bool { return data_[i] > 10; }}, + {10, + OpType::GreaterEqual, + [&](int64_t i) -> bool { return data_[i] >= 10; }}, + {10, + OpType::LessThan, + [&](int64_t i) -> bool { return data_[i] < 10; }}, + {10, + OpType::LessEqual, + [&](int64_t i) -> bool { return data_[i] <= 10; }}, + }; + for (const auto& [test_value, op, ref] : test_cases) { + auto index_ptr = + dynamic_cast*>(index_.get()); + auto bitset = index_ptr->Range(test_value, op); + for (size_t i = 0; i < bitset.size(); i++) { + auto ans = bitset[i]; + auto should = ref(i); + ASSERT_EQ(ans, should) + << "op: " << op << ", @" << i << ", ans: " << ans + << ", ref: " << should; + } + } + } + } + + void + TestRangeCompareFunc() { + if constexpr (!std::is_same_v) { + using RefFunc = std::function; + struct TestParam { + int64_t lower_val; + int64_t upper_val; + bool lower_inclusive; + bool upper_inclusive; + RefFunc ref; + }; + std::vector test_cases = { + { + 10, + 30, + false, + false, + [&](int64_t i) { return 10 < data_[i] && data_[i] < 30; }, + }, + { + 10, + 30, + true, + false, + [&](int64_t i) { return 10 <= data_[i] && data_[i] < 30; }, + }, + { + 10, + 30, + true, + true, + [&](int64_t i) { return 10 <= data_[i] && data_[i] <= 30; }, + }, + { + 10, + 30, + false, + true, + [&](int64_t i) { return 10 < data_[i] && data_[i] <= 30; }, + }}; + + for (const auto& test_case : test_cases) { + auto index_ptr = + dynamic_cast*>(index_.get()); + auto bitset = index_ptr->Range(test_case.lower_val, + test_case.lower_inclusive, + test_case.upper_val, + test_case.upper_inclusive); + for (size_t i = 0; i < bitset.size(); i++) { + auto ans = bitset[i]; + auto should = test_case.ref(i); + ASSERT_EQ(ans, should) + << "lower:" << test_case.lower_val + << "upper:" << test_case.upper_val << ", @" << i + << ", ans: " << ans << ", ref: " << should; + } + } + } + } + + public: + IndexBasePtr index_; + DataType type_; + size_t nb_; + size_t cardinality_; + boost::container::vector data_; + std::shared_ptr 
chunk_manager_; +}; + +TYPED_TEST_SUITE_P(BitmapIndexTest); + +TYPED_TEST_P(BitmapIndexTest, CountFuncTest) { + auto count = this->index_->Count(); + EXPECT_EQ(count, this->nb_); +} + +TYPED_TEST_P(BitmapIndexTest, INFuncTest) { + this->TestInFunc(); +} + +TYPED_TEST_P(BitmapIndexTest, NotINFuncTest) { + this->TestNotInFunc(); +} + +TYPED_TEST_P(BitmapIndexTest, CompareValFuncTest) { + this->TestCompareValueFunc(); +} + +using BitmapType = + testing::Types; + +REGISTER_TYPED_TEST_SUITE_P(BitmapIndexTest, + CountFuncTest, + INFuncTest, + NotINFuncTest, + CompareValFuncTest); + +INSTANTIATE_TYPED_TEST_SUITE_P(BitmapE2ECheck, BitmapIndexTest, BitmapType); + +template +class BitmapIndexTestV2 : public BitmapIndexTest { + public: + virtual void + SetParam() override { + this->nb_ = 10000; + this->cardinality_ = 2000; + } + + virtual ~BitmapIndexTestV2() { + } +}; + +TYPED_TEST_SUITE_P(BitmapIndexTestV2); + +TYPED_TEST_P(BitmapIndexTestV2, CountFuncTest) { + auto count = this->index_->Count(); + EXPECT_EQ(count, this->nb_); +} + +TYPED_TEST_P(BitmapIndexTestV2, INFuncTest) { + this->TestInFunc(); +} + +TYPED_TEST_P(BitmapIndexTestV2, NotINFuncTest) { + this->TestNotInFunc(); +} + +TYPED_TEST_P(BitmapIndexTestV2, CompareValFuncTest) { + this->TestCompareValueFunc(); +} + +TYPED_TEST_P(BitmapIndexTestV2, TestRangeCompareFuncTest) { + this->TestRangeCompareFunc(); +} + +using BitmapType = + testing::Types; + +REGISTER_TYPED_TEST_SUITE_P(BitmapIndexTestV2, + CountFuncTest, + INFuncTest, + NotINFuncTest, + CompareValFuncTest, + TestRangeCompareFuncTest); + +INSTANTIATE_TYPED_TEST_SUITE_P(BitmapIndexE2ECheck_HighCardinality, + BitmapIndexTestV2, + BitmapType); \ No newline at end of file