enhance: support bitmap index for scalar type (#32902)

#32900

Signed-off-by: luzhang <luzhang@zilliz.com>
Co-authored-by: luzhang <luzhang@zilliz.com>
pull/33147/head
zhagnlu 2024-05-19 21:49:38 +08:00 committed by GitHub
parent 5cc38aa9f8
commit d669fbcf46
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 1171 additions and 7 deletions

View File

@ -42,6 +42,7 @@ class MilvusConan(ConanFile):
"opentelemetry-cpp/1.8.1.1@milvus/dev",
"librdkafka/1.9.1",
"abseil/20230125.3",
"roaring/3.0.0",
)
generators = ("cmake", "cmake_find_package")
default_options = {

View File

@ -0,0 +1,666 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include "index/BitmapIndex.h"
#include "common/Slice.h"
#include "index/Meta.h"
#include "index/ScalarIndex.h"
#include "index/Utils.h"
#include "storage/Util.h"
#include "storage/space.h"
namespace milvus {
namespace index {
// Constructs an empty, not-yet-built bitmap index.
//
// A MemFileManagerImpl is created only when `file_manager_context` is valid;
// otherwise file_manager_ stays null.
template <typename T>
BitmapIndex<T>::BitmapIndex(
    const storage::FileManagerContext& file_manager_context)
    // total_num_rows_ is zeroed so it never holds an indeterminate value
    // before Build()/Load() assigns it.
    : is_built_(false), total_num_rows_(0) {
    if (file_manager_context.Valid()) {
        file_manager_ =
            std::make_shared<storage::MemFileManagerImpl>(file_manager_context);
        AssertInfo(file_manager_ != nullptr, "create file manager failed!");
    }
}
// Constructs an empty, not-yet-built bitmap index bound to a storage space.
//
// `space` is retained in space_ and is also handed to the file manager, which
// is created only when `file_manager_context` is valid.
template <typename T>
BitmapIndex<T>::BitmapIndex(
    const storage::FileManagerContext& file_manager_context,
    std::shared_ptr<milvus_storage::Space> space)
    // total_num_rows_ is zeroed so it never holds an indeterminate value
    // before BuildV2()/LoadV2() assigns it.
    : is_built_(false), data_(), total_num_rows_(0), space_(space) {
    if (file_manager_context.Valid()) {
        file_manager_ = std::make_shared<storage::MemFileManagerImpl>(
            file_manager_context, space);
        AssertInfo(file_manager_ != nullptr, "create file manager failed!");
    }
}
// Builds the index from the insert files listed under "insert_files" in
// `config`.
//
// The raw field data is loaded through file_manager_, and every row offset is
// added to the roaring bitmap of the value stored in that row. Does nothing if
// the index is already built.
//
// Throws SegcoreError(DataIsEmpty) when the files contain no rows.
//
// NOTE(review): only data_ is populated here; bitsets_ is not filled (unlike
// Build(size_t, const T*)), so the query methods only see data after the index
// has been serialized and loaded again.
template <typename T>
void
BitmapIndex<T>::Build(const Config& config) {
    if (is_built_) {
        return;
    }
    // file_manager_ is only created for a valid FileManagerContext; fail
    // loudly instead of dereferencing a null pointer below.
    AssertInfo(file_manager_ != nullptr,
               "file manager is not initialized when build bitmap index");
    auto insert_files =
        GetValueFromConfig<std::vector<std::string>>(config, "insert_files");
    AssertInfo(insert_files.has_value(),
               "insert file paths is empty when build index");
    auto field_datas =
        file_manager_->CacheRawDataToMemory(insert_files.value());

    // Accumulate in 64 bits so large segments cannot overflow the counter.
    int64_t total_num_rows = 0;
    for (const auto& field_data : field_datas) {
        total_num_rows += field_data->get_num_rows();
    }
    if (total_num_rows == 0) {
        throw SegcoreError(DataIsEmpty,
                           "scalar bitmap index can not build null values");
    }
    total_num_rows_ = total_num_rows;

    // `offset` is the global row offset across all loaded slices.
    int64_t offset = 0;
    for (const auto& data : field_datas) {
        const int64_t slice_row_num = data->get_num_rows();
        for (int64_t i = 0; i < slice_row_num; ++i) {
            auto val = reinterpret_cast<const T*>(data->RawValue(i));
            data_[*val].add(offset);
            offset++;
        }
    }
    is_built_ = true;
}
// Builds the index from `n` in-memory values.
//
// Row i is recorded in the roaring bitmap of data[i]; afterwards every roaring
// bitmap is expanded into a dense bitset in bitsets_, so the index can be
// queried right away. Does nothing if the index is already built.
//
// Throws SegcoreError(DataIsEmpty) when n == 0.
template <typename T>
void
BitmapIndex<T>::Build(size_t n, const T* data) {
    if (is_built_) {
        return;
    }
    if (n == 0) {
        throw SegcoreError(DataIsEmpty,
                           "BitmapIndex can not build null values");
    }
    // Read through the const pointer directly (no const_cast) and index with
    // size_t so the loop counter has the same type as `n`.
    for (size_t i = 0; i < n; ++i) {
        data_[data[i]].add(static_cast<uint32_t>(i));
    }
    // Must be set before ConvertRoaringToBitset, which sizes bitsets by it.
    total_num_rows_ = n;
    for (const auto& [key, rows] : data_) {
        bitsets_[key] = ConvertRoaringToBitset(rows);
    }
    is_built_ = true;
}
// Builds the index by scanning the bound storage space.
//
// Every record batch returned by the space reader is converted to field data
// for the indexed column (named by the file manager's index meta); then every
// row offset is added to the roaring bitmap of the value stored in that row.
// Does nothing if the index is already built.
//
// Throws SegcoreError(DataIsEmpty) when the scan yields no rows.
//
// NOTE(review): only data_ is populated here; bitsets_ is not filled (unlike
// Build(size_t, const T*)).
template <typename T>
void
BitmapIndex<T>::BuildV2(const Config& config) {
    if (is_built_) {
        return;
    }
    // Both collaborators are optional at construction time; fail loudly
    // instead of dereferencing a null pointer below.
    AssertInfo(file_manager_ != nullptr,
               "file manager is not initialized when build bitmap index");
    AssertInfo(space_ != nullptr,
               "space is not initialized when build bitmap index");
    auto field_name = file_manager_->GetIndexMeta().field_name;
    auto reader = space_->ScanData();
    std::vector<FieldDataPtr> field_datas;
    for (auto rec = reader->Next(); rec != nullptr; rec = reader->Next()) {
        if (!rec.ok()) {
            PanicInfo(DataFormatBroken, "failed to read data");
        }
        auto data = rec.ValueUnsafe();
        // Row count of this batch only (the overall total is computed below).
        auto batch_num_rows = data->num_rows();
        auto col_data = data->GetColumnByName(field_name);
        auto field_data = storage::CreateFieldData(
            DataType(GetDType<T>()), 0, batch_num_rows);
        field_data->FillFieldData(col_data);
        field_datas.push_back(field_data);
    }

    // Accumulate in 64 bits so large segments cannot overflow the counter.
    int64_t total_num_rows = 0;
    for (const auto& field_data : field_datas) {
        total_num_rows += field_data->get_num_rows();
    }
    if (total_num_rows == 0) {
        throw SegcoreError(DataIsEmpty,
                           "scalar bitmap index can not build null values");
    }
    total_num_rows_ = total_num_rows;

    // `offset` is the global row offset across all scanned batches.
    int64_t offset = 0;
    for (const auto& data : field_datas) {
        const int64_t slice_row_num = data->get_num_rows();
        for (int64_t i = 0; i < slice_row_num; ++i) {
            auto val = reinterpret_cast<const T*>(data->RawValue(i));
            data_[*val].add(offset);
            offset++;
        }
    }
    is_built_ = true;
}
// Returns the number of bytes SerializeIndexData() writes: for every entry of
// data_, sizeof(T) for the key plus the serialized size of its roaring bitmap.
template <typename T>
size_t
BitmapIndex<T>::GetIndexDataSize() {
    // Must be size_t: `auto x = 0` deduces int, which would truncate the
    // running total for indexes larger than INT_MAX bytes.
    size_t index_data_size = 0;
    for (const auto& pair : data_) {
        index_data_size += pair.second.getSizeInBytes() + sizeof(T);
    }
    return index_data_size;
}
// String specialization: every entry is stored as a size_t length prefix, the
// key bytes, and the serialized roaring bitmap.
template <>
size_t
BitmapIndex<std::string>::GetIndexDataSize() {
    // Must be size_t: `auto x = 0` deduces int, which would truncate the
    // running total for indexes larger than INT_MAX bytes.
    size_t index_data_size = 0;
    for (const auto& pair : data_) {
        index_data_size +=
            pair.second.getSizeInBytes() + pair.first.size() + sizeof(size_t);
    }
    return index_data_size;
}
// Writes every entry of data_ to `data_ptr` as the raw bytes of the key
// followed by the serialized roaring bitmap. The caller must provide at least
// GetIndexDataSize() bytes.
template <typename T>
void
BitmapIndex<T>::SerializeIndexData(uint8_t* data_ptr) {
    uint8_t* cursor = data_ptr;
    for (auto& [key, rows] : data_) {
        memcpy(cursor, &key, sizeof(T));
        cursor += sizeof(T);

        rows.write(reinterpret_cast<char*>(cursor));
        cursor += rows.getSizeInBytes();
    }
}
// String specialization: every entry is written as a size_t length prefix,
// the key bytes, and the serialized roaring bitmap.
template <>
void
BitmapIndex<std::string>::SerializeIndexData(uint8_t* data_ptr) {
    uint8_t* cursor = data_ptr;
    for (auto& [key, rows] : data_) {
        const size_t key_size = key.size();
        memcpy(cursor, &key_size, sizeof(size_t));
        cursor += sizeof(size_t);

        memcpy(cursor, key.data(), key_size);
        cursor += key_size;

        rows.write(reinterpret_cast<char*>(cursor));
        cursor += rows.getSizeInBytes();
    }
}
// Serializes the built index into a BinarySet with three entries:
//   BITMAP_INDEX_DATA     - the (key, roaring bitmap) payload of data_
//   BITMAP_INDEX_LENGTH   - number of distinct keys, as a size_t
//   BITMAP_INDEX_NUM_ROWS - total_num_rows_, as a size_t
// Asserts that the index has been built.
template <typename T>
BinarySet
BitmapIndex<T>::Serialize(const Config& config) {
    AssertInfo(is_built_, "index has not been built yet");

    // Payload holding every key together with its roaring bitmap.
    auto payload_bytes = GetIndexDataSize();
    std::shared_ptr<uint8_t[]> payload(new uint8_t[payload_bytes]);
    SerializeIndexData(payload.get());

    // Number of distinct keys.
    auto cardinality = data_.size();
    std::shared_ptr<uint8_t[]> cardinality_buf(new uint8_t[sizeof(size_t)]);
    memcpy(cardinality_buf.get(), &cardinality, sizeof(size_t));

    // Number of indexed rows.
    std::shared_ptr<uint8_t[]> row_count_buf(new uint8_t[sizeof(size_t)]);
    memcpy(row_count_buf.get(), &total_num_rows_, sizeof(size_t));

    BinarySet result;
    result.Append(BITMAP_INDEX_DATA, payload, payload_bytes);
    result.Append(BITMAP_INDEX_LENGTH, cardinality_buf, sizeof(size_t));
    result.Append(BITMAP_INDEX_NUM_ROWS, row_count_buf, sizeof(size_t));

    LOG_INFO("build bitmap index with cardinality = {}, num_rows = {}",
             cardinality,
             total_num_rows_);
    return result;
}
// Serializes the index, hands the result to the file manager via AddFile, and
// returns a BinarySet that lists each remote path with its size (the entries
// carry no data buffer).
template <typename T>
BinarySet
BitmapIndex<T>::Upload(const Config& config) {
    auto serialized = Serialize(config);
    file_manager_->AddFile(serialized);

    auto uploaded = file_manager_->GetRemotePathsToFileSize();
    BinarySet summary;
    for (auto& [remote_path, file_size] : uploaded) {
        summary.Append(remote_path, nullptr, file_size);
    }
    return summary;
}
// Same as Upload(), but hands the serialized index to the file manager via
// AddFileV2. Returns a BinarySet that lists each remote path with its size
// (the entries carry no data buffer).
template <typename T>
BinarySet
BitmapIndex<T>::UploadV2(const Config& config) {
    auto serialized = Serialize(config);
    file_manager_->AddFileV2(serialized);

    auto uploaded = file_manager_->GetRemotePathsToFileSize();
    BinarySet summary;
    for (auto& [remote_path, file_size] : uploaded) {
        summary.Append(remote_path, nullptr, file_size);
    }
    return summary;
}
// Loads the index from an in-memory BinarySet.
//
// milvus::Assemble() is run on the set first and needs a mutable reference,
// hence the const_cast: the caller's `binary_set` is passed to it as mutable
// even though it is received as const. The actual decoding is done by
// LoadWithoutAssemble().
template <typename T>
void
BitmapIndex<T>::Load(const BinarySet& binary_set, const Config& config) {
milvus::Assemble(const_cast<BinarySet&>(binary_set));
LoadWithoutAssemble(binary_set, config);
}
// Expands a roaring bitmap of row offsets into a dense bitset with
// total_num_rows_ bits, setting one bit per offset contained in `values`.
// Asserts that total_num_rows_ is non-zero.
template <typename T>
TargetBitmap
BitmapIndex<T>::ConvertRoaringToBitset(const roaring::Roaring& values) {
    AssertInfo(total_num_rows_ != 0, "total num rows should not be 0");

    TargetBitmap dense(total_num_rows_, false);
    for (auto it = values.begin(); it != values.end(); ++it) {
        dense.set(*it);
    }
    return dense;
}
// Decodes `index_length` entries from `data_ptr`, each laid out as the raw
// bytes of a key followed by a serialized roaring bitmap, and stores the dense
// bitset of every key in bitsets_.
template <typename T>
void
BitmapIndex<T>::DeserializeIndexData(const uint8_t* data_ptr,
                                     size_t index_length) {
    const uint8_t* cursor = data_ptr;
    for (size_t remaining = index_length; remaining > 0; --remaining) {
        T key;
        memcpy(&key, cursor, sizeof(T));
        cursor += sizeof(T);

        auto rows =
            roaring::Roaring::read(reinterpret_cast<const char*>(cursor));
        cursor += rows.getSizeInBytes();

        bitsets_[key] = ConvertRoaringToBitset(rows);
    }
}
// String specialization: every entry is a size_t length prefix, the key bytes,
// and a serialized roaring bitmap. The dense bitset of every key is stored in
// bitsets_.
template <>
void
BitmapIndex<std::string>::DeserializeIndexData(const uint8_t* data_ptr,
                                               size_t index_length) {
    const uint8_t* cursor = data_ptr;
    for (size_t remaining = index_length; remaining > 0; --remaining) {
        size_t key_size;
        memcpy(&key_size, cursor, sizeof(size_t));
        cursor += sizeof(size_t);

        std::string key(reinterpret_cast<const char*>(cursor), key_size);
        cursor += key_size;

        auto rows =
            roaring::Roaring::read(reinterpret_cast<const char*>(cursor));
        cursor += rows.getSizeInBytes();

        bitsets_[key] = ConvertRoaringToBitset(rows);
    }
}
// Decodes an already assembled BinarySet produced by Serialize().
//
// Reads the key count (BITMAP_INDEX_LENGTH) and the row count
// (BITMAP_INDEX_NUM_ROWS), then rebuilds bitsets_ from BITMAP_INDEX_DATA and
// marks the index as built.
//
// Asserts that all three entries exist and that the two scalar entries fit
// into a size_t, so a malformed set cannot overwrite memory beyond the
// destination variables.
template <typename T>
void
BitmapIndex<T>::LoadWithoutAssemble(const BinarySet& binary_set,
                                    const Config& config) {
    auto index_length_buffer = binary_set.GetByName(BITMAP_INDEX_LENGTH);
    AssertInfo(index_length_buffer != nullptr,
               "bitmap index length is missing when load bitmap index");
    AssertInfo(
        static_cast<size_t>(index_length_buffer->size) <= sizeof(size_t),
        "bitmap index length has unexpected size");
    // Zero-initialized so a short buffer cannot leave indeterminate bytes.
    size_t index_length = 0;
    memcpy(&index_length,
           index_length_buffer->data.get(),
           static_cast<size_t>(index_length_buffer->size));

    auto num_rows_buffer = binary_set.GetByName(BITMAP_INDEX_NUM_ROWS);
    AssertInfo(num_rows_buffer != nullptr,
               "bitmap index num rows is missing when load bitmap index");
    AssertInfo(static_cast<size_t>(num_rows_buffer->size) <= sizeof(size_t),
               "bitmap index num rows has unexpected size");
    total_num_rows_ = 0;
    memcpy(&total_num_rows_,
           num_rows_buffer->data.get(),
           static_cast<size_t>(num_rows_buffer->size));

    auto index_data_buffer = binary_set.GetByName(BITMAP_INDEX_DATA);
    AssertInfo(index_data_buffer != nullptr,
               "bitmap index data is missing when load bitmap index");
    const uint8_t* data_ptr = index_data_buffer->data.get();
    DeserializeIndexData(data_ptr, index_length);

    LOG_INFO("load bitmap index with cardinality = {}, num_rows = {}",
             Cardinality(),
             total_num_rows_);
    is_built_ = true;
}
// Loads the index from the bound storage space.
//
// Steps:
//   1. collect every statistics blob whose name starts with the remote index
//      object prefix reported by the file manager;
//   2. read and deserialize each blob, keyed by the file name component after
//      the last '/';
//   3. assemble the pieces with AssembleIndexDatas and wrap them in a
//      BinarySet;
//   4. decode that set with LoadWithoutAssemble().
//
// PanicInfo(S3Error) is raised when a blob's size or content cannot be read.
template <typename T>
void
BitmapIndex<T>::LoadV2(const Config& config) {
auto blobs = space_->StatisticsBlobs();
std::vector<std::string> index_files;
auto prefix = file_manager_->GetRemoteIndexObjectPrefixV2();
for (auto& b : blobs) {
// rfind(prefix, 0) == 0 is a "starts with prefix" test.
if (b.name.rfind(prefix, 0) == 0) {
index_files.push_back(b.name);
}
}
std::map<std::string, FieldDataPtr> index_datas{};
for (auto& file_name : index_files) {
auto res = space_->GetBlobByteSize(file_name);
if (!res.ok()) {
PanicInfo(S3Error, "unable to read index blob");
}
auto index_blob_data =
std::shared_ptr<uint8_t[]>(new uint8_t[res.value()]);
auto status = space_->ReadBlob(file_name, index_blob_data.get());
if (!status.ok()) {
PanicInfo(S3Error, "unable to read index blob");
}
auto raw_index_blob =
storage::DeserializeFileData(index_blob_data, res.value());
// Key the data by the last path component of the blob name.
auto key = file_name.substr(file_name.find_last_of('/') + 1);
index_datas[key] = raw_index_blob->GetFieldData();
}
AssembleIndexDatas(index_datas);
BinarySet binary_set;
for (auto& [key, data] : index_datas) {
auto size = data->Size();
// The bytes stay owned by `data`; the shared_ptr below is a non-owning
// view, so its deleter intentionally does nothing.
auto deleter = [&](uint8_t*) {};  // avoid repeated deconstruction
auto buf = std::shared_ptr<uint8_t[]>(
(uint8_t*)const_cast<void*>(data->Data()), deleter);
binary_set.Append(key, buf, size);
}
LoadWithoutAssemble(binary_set, config);
}
// Loads the index from the remote files listed under "index_files" in
// `config`.
//
// The files are fetched through the file manager, assembled with
// AssembleIndexDatas, wrapped in a BinarySet and decoded by
// LoadWithoutAssemble(). `ctx` is not used by this implementation.
template <typename T>
void
BitmapIndex<T>::Load(milvus::tracer::TraceContext ctx, const Config& config) {
    auto index_files =
        GetValueFromConfig<std::vector<std::string>>(config, "index_files");
    AssertInfo(index_files.has_value(),
               "index file paths is empty when load bitmap index");

    auto index_datas = file_manager_->LoadIndexToMemory(index_files.value());
    AssembleIndexDatas(index_datas);

    // The bytes stay owned by the field data objects; the BinarySet only gets
    // non-owning views, so the deleter intentionally does nothing.
    auto no_op_deleter = [](uint8_t*) {};
    BinarySet binary_set;
    for (auto& entry : index_datas) {
        auto& field_data = entry.second;
        auto* raw_bytes =
            static_cast<uint8_t*>(const_cast<void*>(field_data->Data()));
        auto view = std::shared_ptr<uint8_t[]>(raw_bytes, no_op_deleter);
        binary_set.Append(entry.first, view, field_data->Size());
    }
    LoadWithoutAssemble(binary_set, config);
}
// Returns a bitmap with one bit per row; a bit is set when the row's value
// equals any of the `n` entries of `values`. Values that are not present in
// the index contribute nothing. Asserts that the index has been built.
template <typename T>
const TargetBitmap
BitmapIndex<T>::In(const size_t n, const T* values) {
    AssertInfo(is_built_, "index has not been built");
    TargetBitmap res(total_num_rows_, false);
    for (size_t i = 0; i < n; ++i) {
        // One lookup per value (instead of find() followed by at()), and no
        // copy of the key, which matters for std::string.
        if (auto it = bitsets_.find(values[i]); it != bitsets_.end()) {
            res |= it->second;
        }
    }
    return res;
}
// Returns a bitmap with one bit per row; a bit is set when the row's value
// equals none of the `n` entries of `values`. Computed as the complement of
// the In() result. Asserts that the index has been built.
template <typename T>
const TargetBitmap
BitmapIndex<T>::NotIn(const size_t n, const T* values) {
    AssertInfo(is_built_, "index has not been built");
    TargetBitmap res(total_num_rows_, false);
    for (size_t i = 0; i < n; ++i) {
        // One lookup per value (instead of find() followed by at()), and no
        // copy of the key, which matters for std::string.
        if (auto it = bitsets_.find(values[i]); it != bitsets_.end()) {
            res |= it->second;
        }
    }
    // Rows matching any value are set at this point; invert to get the rest.
    res.flip();
    return res;
}
// Returns a bitmap of the rows whose value satisfies `<row value> op value`
// for op in {LessThan, LessEqual, GreaterThan, GreaterEqual}.
//
// Asserts that the index has been built. Throws SegcoreError(OpTypeInvalid)
// for any other operator.
template <typename T>
const TargetBitmap
BitmapIndex<T>::Range(const T value, const OpType op) {
    AssertInfo(is_built_, "index has not been built");
    TargetBitmap res(total_num_rows_, false);
    if (ShouldSkip(value, value, op)) {
        return res;
    }
    // [lb, ub) is the run of keys that satisfy the predicate. The map's own
    // lower_bound/upper_bound are logarithmic, whereas the generic
    // std::lower_bound/upper_bound walk the map's bidirectional iterators
    // linearly and need a temporary TargetBitmap to build the probe pair.
    auto lb = bitsets_.begin();
    auto ub = bitsets_.end();
    switch (op) {
        case OpType::LessThan: {
            ub = bitsets_.lower_bound(value);
            break;
        }
        case OpType::LessEqual: {
            ub = bitsets_.upper_bound(value);
            break;
        }
        case OpType::GreaterThan: {
            lb = bitsets_.upper_bound(value);
            break;
        }
        case OpType::GreaterEqual: {
            lb = bitsets_.lower_bound(value);
            break;
        }
        default: {
            throw SegcoreError(OpTypeInvalid,
                               fmt::format("Invalid OperatorType: {}", op));
        }
    }
    for (; lb != ub; ++lb) {
        res |= lb->second;
    }
    return res;
}
// Returns a bitmap of the rows whose value lies between `lower_value` and
// `upper_value`; `lb_inclusive` / `ub_inclusive` select whether each bound
// itself is part of the range.
//
// An empty bitmap is returned when the interval is empty (lower > upper, or
// lower == upper with either bound exclusive) or when ShouldSkip() reports
// that it does not overlap the indexed keys. Asserts that the index has been
// built.
template <typename T>
const TargetBitmap
BitmapIndex<T>::Range(const T lower_value,
                      bool lb_inclusive,
                      const T upper_value,
                      bool ub_inclusive) {
    AssertInfo(is_built_, "index has not been built");
    TargetBitmap res(total_num_rows_, false);
    if (lower_value > upper_value ||
        (lower_value == upper_value && !(lb_inclusive && ub_inclusive))) {
        return res;
    }
    if (ShouldSkip(lower_value, upper_value, OpType::Range)) {
        return res;
    }
    // [lb, ub) is the run of keys inside the interval. The map's own
    // lower_bound/upper_bound are logarithmic, whereas the generic
    // std::lower_bound/upper_bound walk the map's bidirectional iterators
    // linearly and need a temporary TargetBitmap to build the probe pair.
    auto lb = lb_inclusive ? bitsets_.lower_bound(lower_value)
                           : bitsets_.upper_bound(lower_value);
    auto ub = ub_inclusive ? bitsets_.upper_bound(upper_value)
                           : bitsets_.lower_bound(upper_value);
    for (; lb != ub; ++lb) {
        res |= lb->second;
    }
    return res;
}
// Returns the value stored at row `idx` by scanning the per-value bitsets for
// the first one that has bit `idx` set.
//
// Asserts that the index has been built and that idx < total_num_rows_.
// Throws SegcoreError(UnexpectedError) when no bitset contains the row.
template <typename T>
T
BitmapIndex<T>::Reverse_Lookup(size_t idx) const {
    AssertInfo(is_built_, "index has not been built");
    AssertInfo(idx < total_num_rows_, "out of range of total count");
    for (const auto& [key, bitset] : bitsets_) {
        if (bitset[idx]) {
            return key;
        }
    }
    throw SegcoreError(
        UnexpectedError,
        fmt::format(
            "scalar bitmap index can not lookup target value of index {}",
            idx));
}
// Cheap pre-check: returns true when the predicate cannot match any indexed
// key, judged only from the smallest and largest key in bitsets_.
//
// For the single-value operators the caller passes the same value as
// `lower_value` and `upper_value`; for OpType::Range they are the two bounds.
// An index without keys is always skipped. Throws
// SegcoreError(OpTypeInvalid) for any other operator.
template <typename T>
bool
BitmapIndex<T>::ShouldSkip(const T lower_value,
                           const T upper_value,
                           const OpType op) {
    if (bitsets_.empty()) {
        return true;
    }
    auto min_key = bitsets_.begin()->first;
    auto max_key = bitsets_.rbegin()->first;
    switch (op) {
        case OpType::LessThan:
            // Nothing is smaller than the value if even the minimum is not.
            return min_key >= lower_value;
        case OpType::LessEqual:
            return min_key > lower_value;
        case OpType::GreaterThan:
            // Nothing is larger than the value if even the maximum is not.
            return max_key <= lower_value;
        case OpType::GreaterEqual:
            return max_key < lower_value;
        case OpType::Range:
            // The bounds and the key span do not overlap.
            return min_key > upper_value || max_key < lower_value;
        default:
            throw SegcoreError(
                OpTypeInvalid,
                fmt::format("Invalid OperatorType for "
                            "checking scalar index optimization: {}",
                            op));
    }
}
// Explicit instantiations: the member definitions above live in this
// translation unit, so every element type that BitmapIndex is used with has
// to be instantiated here.
template class BitmapIndex<bool>;
template class BitmapIndex<int8_t>;
template class BitmapIndex<int16_t>;
template class BitmapIndex<int32_t>;
template class BitmapIndex<int64_t>;
template class BitmapIndex<float>;
template class BitmapIndex<double>;
template class BitmapIndex<std::string>;
} // namespace index
} // namespace milvus

View File

@ -0,0 +1,144 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <roaring/roaring.hh>
#include "index/ScalarIndex.h"
#include "storage/FileManager.h"
#include "storage/DiskFileManagerImpl.h"
#include "storage/MemFileManagerImpl.h"
#include "storage/space.h"
namespace milvus {
namespace index {
/*
 * @brief Implementation of Bitmap Index
 * @details Maps every distinct scalar value of a field to the set of row
 *          offsets that hold it. The implementation file instantiates the
 *          template for bool, the signed integer types, float, double and
 *          std::string.
 */
template <typename T>
class BitmapIndex : public ScalarIndex<T> {
 public:
    explicit BitmapIndex(
        const storage::FileManagerContext& file_manager_context =
            storage::FileManagerContext());

    explicit BitmapIndex(
        const storage::FileManagerContext& file_manager_context,
        std::shared_ptr<milvus_storage::Space> space);

    ~BitmapIndex() override = default;

    BinarySet
    Serialize(const Config& config) override;

    void
    Load(const BinarySet& index_binary, const Config& config = {}) override;

    void
    Load(milvus::tracer::TraceContext ctx, const Config& config = {}) override;

    void
    LoadV2(const Config& config = {}) override;

    // Number of indexed rows.
    //
    // Every per-value bitset is created with total_num_rows_ bits, so the row
    // count is reported directly. Reading bitsets_.begin() instead is
    // undefined behaviour while bitsets_ is empty, e.g. before the index is
    // built or after Build(config)/BuildV2, which only fill data_.
    int64_t
    Count() override {
        return static_cast<int64_t>(total_num_rows_);
    }

    void
    Build(size_t n, const T* values) override;

    void
    Build(const Config& config = {}) override;

    void
    BuildV2(const Config& config = {}) override;

    const TargetBitmap
    In(size_t n, const T* values) override;

    const TargetBitmap
    NotIn(size_t n, const T* values) override;

    const TargetBitmap
    Range(T value, OpType op) override;

    const TargetBitmap
    Range(T lower_bound_value,
          bool lb_inclusive,
          T upper_bound_value,
          bool ub_inclusive) override;

    T
    Reverse_Lookup(size_t offset) const override;

    // Reports the same value as Count().
    int64_t
    Size() override {
        return Count();
    }

    BinarySet
    Upload(const Config& config = {}) override;

    BinarySet
    UploadV2(const Config& config = {}) override;

    const bool
    HasRawData() const override {
        return true;
    }

    // Number of distinct values that currently have a dense bitset.
    int64_t
    Cardinality() {
        return bitsets_.size();
    }

 private:
    size_t
    GetIndexDataSize();

    void
    SerializeIndexData(uint8_t* index_data_ptr);

    void
    DeserializeIndexData(const uint8_t* data_ptr, size_t index_length);

    bool
    ShouldSkip(const T lower_value, const T upper_value, const OpType op);

    TargetBitmap
    ConvertRoaringToBitset(const roaring::Roaring& values);

    void
    LoadWithoutAssemble(const BinarySet& binary_set, const Config& config);

 private:
    // Default initializers keep the scalar members well defined even if a
    // constructor does not mention them.
    bool is_built_{false};
    Config config_;
    // value -> compressed set of row offsets; filled by the Build* methods and
    // used for serialization.
    std::map<T, roaring::Roaring> data_;
    // value -> dense per-row bitset; filled by Build(n, values) and by the
    // load path, and used by the query methods.
    std::map<T, TargetBitmap> bitsets_;
    size_t total_num_rows_{0};
    std::shared_ptr<storage::MemFileManagerImpl> file_manager_;
    std::shared_ptr<milvus_storage::Space> space_;
};
} // namespace index
} // namespace milvus

View File

@ -19,6 +19,7 @@ set(INDEX_FILES
ScalarIndexSort.cpp
SkipIndex.cpp
InvertedIndexTantivy.cpp
BitmapIndex.cpp
)
milvus_add_pkg_config("milvus_index")

View File

@ -27,6 +27,7 @@
#include "index/StringIndexMarisa.h"
#include "index/BoolIndex.h"
#include "index/InvertedIndexTantivy.h"
#include "index/BitmapIndex.h"
namespace milvus::index {
@ -42,6 +43,9 @@ IndexFactory::CreateScalarIndex(
return std::make_unique<InvertedIndexTantivy<T>>(cfg,
file_manager_context);
}
if (index_type == BITMAP_INDEX_TYPE) {
return std::make_unique<BitmapIndex<T>>(file_manager_context);
}
return CreateScalarIndexSort<T>(file_manager_context);
}
@ -65,6 +69,9 @@ IndexFactory::CreateScalarIndex<std::string>(
return std::make_unique<InvertedIndexTantivy<std::string>>(
cfg, file_manager_context);
}
if (index_type == BITMAP_INDEX_TYPE) {
return std::make_unique<BitmapIndex<std::string>>(file_manager_context);
}
return CreateStringIndexMarisa(file_manager_context);
#else
throw SegcoreError(Unsupported, "unsupported platform");
@ -84,6 +91,9 @@ IndexFactory::CreateScalarIndex(
return std::make_unique<InvertedIndexTantivy<T>>(
cfg, file_manager_context, space);
}
if (index_type == BITMAP_INDEX_TYPE) {
return std::make_unique<BitmapIndex<T>>(file_manager_context, space);
}
return CreateScalarIndexSort<T>(file_manager_context, space);
}
@ -101,6 +111,10 @@ IndexFactory::CreateScalarIndex<std::string>(
return std::make_unique<InvertedIndexTantivy<std::string>>(
cfg, file_manager_context, space);
}
if (index_type == BITMAP_INDEX_TYPE) {
return std::make_unique<BitmapIndex<std::string>>(file_manager_context,
space);
}
return CreateStringIndexMarisa(file_manager_context, space);
#else
throw SegcoreError(Unsupported, "unsupported platform");

View File

@ -30,6 +30,12 @@ constexpr const char* PREFIX_VALUE = "prefix_value";
constexpr const char* MARISA_TRIE_INDEX = "marisa_trie_index";
constexpr const char* MARISA_STR_IDS = "marisa_trie_str_ids";
// below meta key of store bitmap indexes
constexpr const char* BITMAP_INDEX_DATA = "bitmap_index_data";
constexpr const char* BITMAP_INDEX_META = "bitmap_index_meta";
constexpr const char* BITMAP_INDEX_LENGTH = "bitmap_index_length";
constexpr const char* BITMAP_INDEX_NUM_ROWS = "bitmap_index_num_rows";
constexpr const char* INDEX_TYPE = "index_type";
constexpr const char* METRIC_TYPE = "metric_type";
@ -37,6 +43,7 @@ constexpr const char* METRIC_TYPE = "metric_type";
constexpr const char* ASCENDING_SORT = "STL_SORT";
constexpr const char* MARISA_TRIE = "Trie";
constexpr const char* INVERTED_INDEX_TYPE = "INVERTED";
constexpr const char* BITMAP_INDEX_TYPE = "BITMAP";
// index meta
constexpr const char* COLLECTION_ID = "collection_id";

View File

@ -32,6 +32,7 @@ set(MILVUS_TEST_FILES
test_growing.cpp
test_growing_index.cpp
test_indexing.cpp
test_bitmap_index.cpp
test_index_c_api.cpp
test_index_wrapper.cpp
test_init.cpp

View File

@ -0,0 +1,274 @@
// Copyright(C) 2019 - 2020 Zilliz.All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <gtest/gtest.h>
#include <functional>
#include <boost/filesystem.hpp>
#include <unordered_set>
#include <memory>
#include "common/Tracer.h"
#include "index/BitmapIndex.h"
#include "storage/Util.h"
#include "storage/InsertData.h"
#include "indexbuilder/IndexFactory.h"
#include "index/IndexFactory.h"
#include "test_utils/indexbuilder_test_utils.h"
#include "index/Meta.h"
using namespace milvus::index;
using namespace milvus::indexbuilder;
using namespace milvus;
using namespace milvus::index;
// Produces `size` pseudo-random values in [0, cardinality), converted to T.
// Values come from rand(), so the sequence depends on the current seed.
template <typename T>
static std::vector<T>
GenerateData(const size_t size, const size_t cardinality) {
    std::vector<T> values;
    for (size_t produced = 0; produced != size; ++produced) {
        values.push_back(rand() % cardinality);
    }
    return values;
}

// bool specialization: `cardinality` is ignored, every value is a coin flip.
template <>
std::vector<bool>
GenerateData<bool>(const size_t size, const size_t cardinality) {
    std::vector<bool> values;
    for (size_t produced = 0; produced != size; ++produced) {
        const bool is_even = (rand() % 2 == 0);
        values.push_back(is_even);
    }
    return values;
}

// std::string specialization: decimal text of a value in [0, cardinality).
template <>
std::vector<std::string>
GenerateData<std::string>(const size_t size, const size_t cardinality) {
    std::vector<std::string> values;
    for (size_t produced = 0; produced != size; ++produced) {
        values.push_back(std::to_string(rand() % cardinality));
    }
    return values;
}
// End-to-end fixture for BitmapIndex<T>.
//
// SetUp() creates a local chunk manager rooted at /tmp/test-bitmap-index/ and
// calls Init(), which writes generated data as an insert log, builds and
// uploads a BITMAP index through the index builder factory, and finally loads
// the uploaded files into `index_`. The Test*Func helpers compare query
// results of the loaded index against the raw data kept in `data_`.
template <typename T>
class BitmapIndexTest : public testing::Test {
protected:
// Generates the data, persists it, builds + uploads the index and loads it
// back into index_.
void
Init(int64_t collection_id,
int64_t partition_id,
int64_t segment_id,
int64_t field_id,
int64_t index_build_id,
int64_t index_version) {
auto field_meta = storage::FieldDataMeta{
collection_id, partition_id, segment_id, field_id};
auto index_meta = storage::IndexMeta{
segment_id, field_id, index_build_id, index_version};
std::vector<T> data_gen;
data_gen = GenerateData<T>(nb_, cardinality_);
for (auto x : data_gen) {
data_.push_back(x);
}
// Serialize the generated values as an insert log and store it through the
// chunk manager so the index builder can read it back.
auto field_data = storage::CreateFieldData(type_);
field_data->FillFieldData(data_.data(), data_.size());
storage::InsertData insert_data(field_data);
insert_data.SetFieldDataMeta(field_meta);
insert_data.SetTimestamps(0, 100);
auto serialized_bytes = insert_data.Serialize(storage::Remote);
auto log_path = fmt::format("{}/{}/{}/{}/{}",
collection_id,
partition_id,
segment_id,
field_id,
0);
chunk_manager_->Write(
log_path, serialized_bytes.data(), serialized_bytes.size());
storage::FileManagerContext ctx(field_meta, index_meta, chunk_manager_);
std::vector<std::string> index_files;
// Build the index from the insert log and upload it.
Config config;
config["index_type"] = milvus::index::BITMAP_INDEX_TYPE;
config["insert_files"] = std::vector<std::string>{log_path};
auto build_index =
indexbuilder::IndexFactory::GetInstance().CreateIndex(
type_, config, ctx);
build_index->Build();
auto binary_set = build_index->Upload();
for (const auto& [key, _] : binary_set.binary_map_) {
index_files.push_back(key);
}
// Load the uploaded files into a fresh index instance; the tests query
// this loaded instance, not the builder.
index::CreateIndexInfo index_info{};
index_info.index_type = milvus::index::BITMAP_INDEX_TYPE;
index_info.field_type = type_;
config["index_files"] = index_files;
index_ =
index::IndexFactory::GetInstance().CreateIndex(index_info, ctx);
index_->Load(milvus::tracer::TraceContext{}, config);
}
// Picks the DataType matching T, creates the local chunk manager and runs
// Init() with fixed ids.
// NOTE(review): type_ is only assigned for int8/16/32/64 and std::string; any
// other T would leave it unset.
void
SetUp() override {
nb_ = 10000;
cardinality_ = 30;
if constexpr (std::is_same_v<T, int8_t>) {
type_ = DataType::INT8;
} else if constexpr (std::is_same_v<T, int16_t>) {
type_ = DataType::INT16;
} else if constexpr (std::is_same_v<T, int32_t>) {
type_ = DataType::INT32;
} else if constexpr (std::is_same_v<T, int64_t>) {
type_ = DataType::INT64;
} else if constexpr (std::is_same_v<T, std::string>) {
type_ = DataType::VARCHAR;
}
int64_t collection_id = 1;
int64_t partition_id = 2;
int64_t segment_id = 3;
int64_t field_id = 101;
int64_t index_build_id = 1000;
int64_t index_version = 10000;
std::string root_path = "/tmp/test-bitmap-index/";
storage::StorageConfig storage_config;
storage_config.storage_type = "local";
storage_config.root_path = root_path;
chunk_manager_ = storage::CreateChunkManager(storage_config);
Init(collection_id,
partition_id,
segment_id,
field_id,
index_build_id,
index_version);
}
// Removes everything written under the chunk manager's root path.
virtual ~BitmapIndexTest() override {
boost::filesystem::remove_all(chunk_manager_->GetRootPath());
}
public:
// In(): queries the first 10 generated values and checks every row against a
// set built from those same values.
void
TestInFunc() {
boost::container::vector<T> test_data;
std::unordered_set<T> s;
size_t nq = 10;
for (size_t i = 0; i < nq; i++) {
test_data.push_back(data_[i]);
s.insert(data_[i]);
}
auto index_ptr = dynamic_cast<index::BitmapIndex<T>*>(index_.get());
auto bitset = index_ptr->In(test_data.size(), test_data.data());
for (size_t i = 0; i < bitset.size(); i++) {
ASSERT_EQ(bitset[i], s.find(data_[i]) != s.end());
}
}
// NotIn(): same setup as TestInFunc, expecting the complementary result.
void
TestNotInFunc() {
boost::container::vector<T> test_data;
std::unordered_set<T> s;
size_t nq = 10;
for (size_t i = 0; i < nq; i++) {
test_data.push_back(data_[i]);
s.insert(data_[i]);
}
auto index_ptr = dynamic_cast<index::BitmapIndex<T>*>(index_.get());
auto bitset = index_ptr->NotIn(test_data.size(), test_data.data());
for (size_t i = 0; i < bitset.size(); i++) {
ASSERT_EQ(bitset[i], s.find(data_[i]) == s.end());
}
}
// Range(value, op): checks >, >=, <, <= against the constant 10. Compiled out
// for std::string, where the numeric comparison does not apply.
void
TestCompareValueFunc() {
if constexpr (!std::is_same_v<T, std::string>) {
using RefFunc = std::function<bool(int64_t)>;
std::vector<std::tuple<T, OpType, RefFunc>> test_cases{
{10,
OpType::GreaterThan,
[&](int64_t i) -> bool { return data_[i] > 10; }},
{10,
OpType::GreaterEqual,
[&](int64_t i) -> bool { return data_[i] >= 10; }},
{10,
OpType::LessThan,
[&](int64_t i) -> bool { return data_[i] < 10; }},
{10,
OpType::LessEqual,
[&](int64_t i) -> bool { return data_[i] <= 10; }},
};
for (const auto& [test_value, op, ref] : test_cases) {
auto index_ptr =
dynamic_cast<index::BitmapIndex<T>*>(index_.get());
auto bitset = index_ptr->Range(test_value, op);
for (size_t i = 0; i < bitset.size(); i++) {
auto ans = bitset[i];
auto should = ref(i);
ASSERT_EQ(ans, should)
<< "op: " << op << ", @" << i << ", ans: " << ans
<< ", ref: " << should;
}
}
}
}
private:
// Local storage backend used for both the insert log and the index files.
std::shared_ptr<storage::ChunkManager> chunk_manager_;
public:
// Index instance loaded from the uploaded files.
IndexBasePtr index_;
// DataType corresponding to T, chosen in SetUp().
DataType type_;
// Number of generated rows.
size_t nb_;
// Passed to GenerateData as the number of distinct values to draw from.
size_t cardinality_;
// The generated raw values, used as ground truth by the tests.
boost::container::vector<T> data_;
};
// Type-parameterized test cases for BitmapIndexTest; they are instantiated
// for every type in BitmapType at the bottom.
TYPED_TEST_SUITE_P(BitmapIndexTest);
// The loaded index must report as many rows as were generated.
TYPED_TEST_P(BitmapIndexTest, CountFuncTest) {
auto count = this->index_->Count();
EXPECT_EQ(count, this->nb_);
}
TYPED_TEST_P(BitmapIndexTest, INFuncTest) {
this->TestInFunc();
}
TYPED_TEST_P(BitmapIndexTest, NotINFuncTest) {
this->TestNotInFunc();
}
TYPED_TEST_P(BitmapIndexTest, CompareValFuncTest) {
this->TestCompareValueFunc();
}
// Element types the suite is run with.
using BitmapType =
testing::Types<int8_t, int16_t, int32_t, int64_t, std::string>;
REGISTER_TYPED_TEST_SUITE_P(BitmapIndexTest,
CountFuncTest,
INFuncTest,
NotINFuncTest,
CompareValFuncTest);
INSTANTIATE_TYPED_TEST_SUITE_P(BitmapE2ECheck, BitmapIndexTest, BitmapType);

View File

@ -5212,4 +5212,4 @@ TEST(CApiTest, RANGE_SEARCH_WITH_RADIUS_AND_RANGE_FILTER_WHEN_IP_BFLOAT16) {
TEST(CApiTest, IsLoadWithDisk) {
ASSERT_TRUE(IsLoadWithDisk(INVERTED_INDEX_TYPE, 0));
}
}

View File

@ -382,7 +382,7 @@ TYPED_TEST_P(TypedScalarIndexTestV2, Base) {
auto new_scalar_index =
dynamic_cast<milvus::index::ScalarIndex<T>*>(new_index.get());
new_scalar_index->LoadV2();
ASSERT_EQ(nb, scalar_index->Count());
ASSERT_EQ(nb, new_scalar_index->Count());
}
}

View File

@ -478,26 +478,30 @@ GenDsFromPB(const google::protobuf::Message& msg) {
template <typename T>
inline std::vector<std::string>
GetIndexTypes() {
return std::vector<std::string>{"sort"};
return std::vector<std::string>{"sort", milvus::index::BITMAP_INDEX_TYPE};
}
template <>
inline std::vector<std::string>
GetIndexTypes<std::string>() {
return std::vector<std::string>{"sort", "marisa"};
return std::vector<std::string>{
"sort", "marisa", milvus::index::BITMAP_INDEX_TYPE};
}
template <typename T>
inline std::vector<std::string>
GetIndexTypesV2() {
return std::vector<std::string>{"sort", milvus::index::INVERTED_INDEX_TYPE};
return std::vector<std::string>{"sort",
milvus::index::INVERTED_INDEX_TYPE,
milvus::index::BITMAP_INDEX_TYPE};
}
template <>
inline std::vector<std::string>
GetIndexTypesV2<std::string>() {
return std::vector<std::string>{milvus::index::INVERTED_INDEX_TYPE,
"marisa"};
return std::vector<std::string>{"marisa",
milvus::index::INVERTED_INDEX_TYPE,
milvus::index::BITMAP_INDEX_TYPE};
}
} // namespace

View File

@ -0,0 +1,22 @@
package indexparamcheck
import (
"testing"
"github.com/stretchr/testify/assert"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
)
// Test_BitmapIndexChecker verifies that the BITMAP checker accepts empty
// train params, accepts Int64, Float and String fields, and rejects JSON and
// Array fields.
func Test_BitmapIndexChecker(t *testing.T) {
	checker := newBITMAPChecker()

	assert.NoError(t, checker.CheckTrain(map[string]string{}))

	accepted := []schemapb.DataType{
		schemapb.DataType_Int64,
		schemapb.DataType_Float,
		schemapb.DataType_String,
	}
	for _, dataType := range accepted {
		assert.NoError(t, checker.CheckValidDataType(dataType))
	}

	rejected := []schemapb.DataType{
		schemapb.DataType_JSON,
		schemapb.DataType_Array,
	}
	for _, dataType := range rejected {
		assert.Error(t, checker.CheckValidDataType(dataType))
	}
}

View File

@ -0,0 +1,28 @@
package indexparamcheck
import (
"fmt"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
// BITMAPChecker checks if a BITMAP index can be built.
type BITMAPChecker struct {
scalarIndexChecker
}
// CheckTrain validates the index params by delegating to the embedded
// scalarIndexChecker.
func (c *BITMAPChecker) CheckTrain(params map[string]string) error {
return c.scalarIndexChecker.CheckTrain(params)
}
// CheckValidDataType returns nil when dType is accepted by
// typeutil.IsArithmetic or typeutil.IsStringType, and an error otherwise.
func (c *BITMAPChecker) CheckValidDataType(dType schemapb.DataType) error {
	if typeutil.IsArithmetic(dType) || typeutil.IsStringType(dType) {
		return nil
	}
	return fmt.Errorf("bitmap index are only supported on numeric and string field")
}
// newBITMAPChecker returns a new, zero-valued BITMAPChecker.
func newBITMAPChecker() *BITMAPChecker {
return &BITMAPChecker{}
}

View File

@ -65,6 +65,7 @@ func (mgr *indexCheckerMgrImpl) registerIndexChecker() {
mgr.checkers["Asceneding"] = newSTLSORTChecker()
mgr.checkers[IndexTRIE] = newTRIEChecker()
mgr.checkers[IndexTrie] = newTRIEChecker()
mgr.checkers[IndexBitmap] = newBITMAPChecker()
mgr.checkers["marisa-trie"] = newTRIEChecker()
mgr.checkers[AutoIndex] = newAUTOINDEXChecker()
}

View File

@ -37,6 +37,7 @@ const (
IndexSTLSORT IndexType = "STL_SORT"
IndexTRIE IndexType = "TRIE"
IndexTrie IndexType = "Trie"
IndexBitmap IndexType = "BITMAP"
AutoIndex IndexType = "AUTOINDEX"
)