feat: support inverted index (#28783)

issue: https://github.com/milvus-io/milvus/issues/27704

Add inverted index for some data types in Milvus. This index type can
save a lot of memory compared to loading all data into RAM and speed up
the term query and range query.

Supported: `INT8`, `INT16`, `INT32`, `INT64`, `FLOAT`, `DOUBLE`, `BOOL`
and `VARCHAR`.

Not supported: `ARRAY` and `JSON`.

Note:
- The inverted index for `VARCHAR` is not designed to serve full-text
search for now. Every row is treated as a whole keyword instead of being
tokenized into multiple terms.
- The inverted index doesn't support retrieval well, so if you create an
inverted index for a field, operations that depend on the raw data will
fall back to chunk storage, which brings some performance loss — for
example, comparisons between two columns and retrieval of output fields.

The inverted index is very easy to use.

Taking below collection as an example:

```python
fields = [
		FieldSchema(name="pk", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=100),
		FieldSchema(name="int8", dtype=DataType.INT8),
		FieldSchema(name="int16", dtype=DataType.INT16),
		FieldSchema(name="int32", dtype=DataType.INT32),
		FieldSchema(name="int64", dtype=DataType.INT64),
		FieldSchema(name="float", dtype=DataType.FLOAT),
		FieldSchema(name="double", dtype=DataType.DOUBLE),
		FieldSchema(name="bool", dtype=DataType.BOOL),
		FieldSchema(name="varchar", dtype=DataType.VARCHAR, max_length=1000),
		FieldSchema(name="random", dtype=DataType.DOUBLE),
		FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
schema = CollectionSchema(fields)
collection = Collection("demo", schema)
```

Then we can simply create inverted index for field via:

```python
index_type = "INVERTED"
collection.create_index("int8", {"index_type": index_type})
collection.create_index("int16", {"index_type": index_type})
collection.create_index("int32", {"index_type": index_type})
collection.create_index("int64", {"index_type": index_type})
collection.create_index("float", {"index_type": index_type})
collection.create_index("double", {"index_type": index_type})
collection.create_index("bool", {"index_type": index_type})
collection.create_index("varchar", {"index_type": index_type})
```

Then, term queries and range queries on the field are automatically sped
up by the inverted index:

```python
result = collection.query(expr='int64 in [1, 2, 3]', output_fields=["pk"])
result = collection.query(expr='int64 < 5', output_fields=["pk"])
result = collection.query(expr='int64 > 2997', output_fields=["pk"])
result = collection.query(expr='1 < int64 < 5', output_fields=["pk"])
```

---------

Signed-off-by: longjiquan <jiquan.long@zilliz.com>
pull/29610/head
Jiquan Long 2023-12-31 19:50:47 +08:00 committed by GitHub
parent 984e7bba9b
commit 3f46c6d459
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
50 changed files with 3028 additions and 185 deletions

View File

@ -7,12 +7,13 @@ fi
CorePath=$1
formatThis() {
find "$1" | grep -E "(*\.cpp|*\.h|*\.cc)$" | grep -v "gen_tools/templates" | grep -v "/thirdparty" | grep -v "\.pb\." | xargs clang-format-10 -i
find "$1" | grep -E "(*\.cpp|*\.h|*\.cc)$" | grep -v "gen_tools/templates" | grep -v "\.pb\." | xargs clang-format-10 -i
}
formatThis "${CorePath}/src"
formatThis "${CorePath}/unittest"
formatThis "${CorePath}/unittest/bench"
formatThis "${CorePath}/thirdparty/tantivy"
${CorePath}/build-support/add_cpp_license.sh ${CorePath}/build-support/cpp_license.txt ${CorePath}
${CorePath}/build-support/add_cmake_license.sh ${CorePath}/build-support/cmake_license.txt ${CorePath}
${CorePath}/build-support/add_cmake_license.sh ${CorePath}/build-support/cmake_license.txt ${CorePath}

View File

@ -1,3 +1,14 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <oneapi/tbb/concurrent_queue.h>
#include <atomic>

View File

@ -18,11 +18,12 @@ set(INDEX_FILES
ScalarIndex.cpp
ScalarIndexSort.cpp
SkipIndex.cpp
InvertedIndexTantivy.cpp
)
milvus_add_pkg_config("milvus_index")
add_library(milvus_index SHARED ${INDEX_FILES})
target_link_libraries(milvus_index milvus_storage milvus-storage)
target_link_libraries(milvus_index milvus_storage milvus-storage tantivy_binding)
install(TARGETS milvus_index DESTINATION "${CMAKE_INSTALL_LIBDIR}")

View File

@ -25,6 +25,7 @@
#include "index/ScalarIndexSort.h"
#include "index/StringIndexMarisa.h"
#include "index/BoolIndex.h"
#include "index/InvertedIndexTantivy.h"
namespace milvus::index {
@ -32,7 +33,14 @@ template <typename T>
ScalarIndexPtr<T>
IndexFactory::CreateScalarIndex(
const IndexType& index_type,
const storage::FileManagerContext& file_manager_context) {
const storage::FileManagerContext& file_manager_context,
DataType d_type) {
if (index_type == INVERTED_INDEX_TYPE) {
TantivyConfig cfg;
cfg.data_type_ = d_type;
return std::make_unique<InvertedIndexTantivy<T>>(cfg,
file_manager_context);
}
return CreateScalarIndexSort<T>(file_manager_context);
}
@ -44,11 +52,18 @@ IndexFactory::CreateScalarIndex(
//
template <>
ScalarIndexPtr<std::string>
IndexFactory::CreateScalarIndex<std::string>(
inline ScalarIndexPtr<std::string>
IndexFactory::CreateScalarIndex(
const IndexType& index_type,
const storage::FileManagerContext& file_manager_context) {
const storage::FileManagerContext& file_manager_context,
DataType d_type) {
#if defined(__linux__) || defined(__APPLE__)
if (index_type == INVERTED_INDEX_TYPE) {
TantivyConfig cfg;
cfg.data_type_ = d_type;
return std::make_unique<InvertedIndexTantivy<std::string>>(
cfg, file_manager_context);
}
return CreateStringIndexMarisa(file_manager_context);
#else
throw SegcoreError(Unsupported, "unsupported platform");
@ -60,7 +75,14 @@ ScalarIndexPtr<T>
IndexFactory::CreateScalarIndex(
const IndexType& index_type,
const storage::FileManagerContext& file_manager_context,
std::shared_ptr<milvus_storage::Space> space) {
std::shared_ptr<milvus_storage::Space> space,
DataType d_type) {
if (index_type == INVERTED_INDEX_TYPE) {
TantivyConfig cfg;
cfg.data_type_ = d_type;
return std::make_unique<InvertedIndexTantivy<T>>(
cfg, file_manager_context, space);
}
return CreateScalarIndexSort<T>(file_manager_context, space);
}
@ -69,8 +91,15 @@ ScalarIndexPtr<std::string>
IndexFactory::CreateScalarIndex(
const IndexType& index_type,
const storage::FileManagerContext& file_manager_context,
std::shared_ptr<milvus_storage::Space> space) {
std::shared_ptr<milvus_storage::Space> space,
DataType d_type) {
#if defined(__linux__) || defined(__APPLE__)
if (index_type == INVERTED_INDEX_TYPE) {
TantivyConfig cfg;
cfg.data_type_ = d_type;
return std::make_unique<InvertedIndexTantivy<std::string>>(
cfg, file_manager_context, space);
}
return CreateStringIndexMarisa(file_manager_context, space);
#else
throw SegcoreError(Unsupported, "unsupported platform");
@ -111,25 +140,32 @@ IndexFactory::CreateScalarIndex(
switch (data_type) {
// create scalar index
case DataType::BOOL:
return CreateScalarIndex<bool>(index_type, file_manager_context);
return CreateScalarIndex<bool>(
index_type, file_manager_context, data_type);
case DataType::INT8:
return CreateScalarIndex<int8_t>(index_type, file_manager_context);
return CreateScalarIndex<int8_t>(
index_type, file_manager_context, data_type);
case DataType::INT16:
return CreateScalarIndex<int16_t>(index_type, file_manager_context);
return CreateScalarIndex<int16_t>(
index_type, file_manager_context, data_type);
case DataType::INT32:
return CreateScalarIndex<int32_t>(index_type, file_manager_context);
return CreateScalarIndex<int32_t>(
index_type, file_manager_context, data_type);
case DataType::INT64:
return CreateScalarIndex<int64_t>(index_type, file_manager_context);
return CreateScalarIndex<int64_t>(
index_type, file_manager_context, data_type);
case DataType::FLOAT:
return CreateScalarIndex<float>(index_type, file_manager_context);
return CreateScalarIndex<float>(
index_type, file_manager_context, data_type);
case DataType::DOUBLE:
return CreateScalarIndex<double>(index_type, file_manager_context);
return CreateScalarIndex<double>(
index_type, file_manager_context, data_type);
// create string index
case DataType::STRING:
case DataType::VARCHAR:
return CreateScalarIndex<std::string>(index_type,
file_manager_context);
return CreateScalarIndex<std::string>(
index_type, file_manager_context, data_type);
default:
throw SegcoreError(
DataTypeInvalid,
@ -187,25 +223,32 @@ IndexFactory::CreateScalarIndex(const CreateIndexInfo& create_index_info,
switch (data_type) {
// create scalar index
case DataType::BOOL:
return CreateScalarIndex<bool>(index_type, file_manager, space);
return CreateScalarIndex<bool>(
index_type, file_manager, space, data_type);
case DataType::INT8:
return CreateScalarIndex<int8_t>(index_type, file_manager, space);
return CreateScalarIndex<int8_t>(
index_type, file_manager, space, data_type);
case DataType::INT16:
return CreateScalarIndex<int16_t>(index_type, file_manager, space);
return CreateScalarIndex<int16_t>(
index_type, file_manager, space, data_type);
case DataType::INT32:
return CreateScalarIndex<int32_t>(index_type, file_manager, space);
return CreateScalarIndex<int32_t>(
index_type, file_manager, space, data_type);
case DataType::INT64:
return CreateScalarIndex<int64_t>(index_type, file_manager, space);
return CreateScalarIndex<int64_t>(
index_type, file_manager, space, data_type);
case DataType::FLOAT:
return CreateScalarIndex<float>(index_type, file_manager, space);
return CreateScalarIndex<float>(
index_type, file_manager, space, data_type);
case DataType::DOUBLE:
return CreateScalarIndex<double>(index_type, file_manager, space);
return CreateScalarIndex<double>(
index_type, file_manager, space, data_type);
// create string index
case DataType::STRING:
case DataType::VARCHAR:
return CreateScalarIndex<std::string>(
index_type, file_manager, space);
index_type, file_manager, space, data_type);
default:
throw SegcoreError(
DataTypeInvalid,

View File

@ -60,6 +60,7 @@ class IndexFactory {
CreateIndex(const CreateIndexInfo& create_index_info,
const storage::FileManagerContext& file_manager_context,
std::shared_ptr<milvus_storage::Space> space);
IndexBasePtr
CreateVectorIndex(const CreateIndexInfo& create_index_info,
const storage::FileManagerContext& file_manager_context);
@ -86,18 +87,29 @@ class IndexFactory {
ScalarIndexPtr<T>
CreateScalarIndex(const IndexType& index_type,
const storage::FileManagerContext& file_manager =
storage::FileManagerContext());
storage::FileManagerContext(),
DataType d_type = DataType::NONE);
template <typename T>
ScalarIndexPtr<T>
CreateScalarIndex(const IndexType& index_type,
const storage::FileManagerContext& file_manager,
std::shared_ptr<milvus_storage::Space> space);
std::shared_ptr<milvus_storage::Space> space,
DataType d_type = DataType::NONE);
};
template <>
ScalarIndexPtr<std::string>
IndexFactory::CreateScalarIndex<std::string>(
const IndexType& index_type,
const storage::FileManagerContext& file_manager_context);
const storage::FileManagerContext& file_manager_context,
DataType d_type);
template <>
ScalarIndexPtr<std::string>
IndexFactory::CreateScalarIndex<std::string>(
const IndexType& index_type,
const storage::FileManagerContext& file_manager_context,
std::shared_ptr<milvus_storage::Space> space,
DataType d_type);
} // namespace milvus::index

View File

@ -0,0 +1,410 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include "tantivy-binding.h"
#include "common/Slice.h"
#include "storage/LocalChunkManagerSingleton.h"
#include "index/InvertedIndexTantivy.h"
#include "log/Log.h"
#include "index/Utils.h"
#include "storage/Util.h"
#include <boost/filesystem.hpp>
namespace milvus::index {
// Construct an inverted index over field data.
//
// Two situations are distinguished by whether a tantivy index already
// exists at the local path:
//  - build path: create a fresh TantivyIndexWrapper that will receive rows
//    via add_data();
//  - load path: the files were (or will be) cached to disk by Load()/LoadV2(),
//    so no writer is created here.
//
// NOTE(review): the local working directory is hard-coded under /tmp —
// consider deriving it from the configured local storage root instead.
template <typename T>
InvertedIndexTantivy<T>::InvertedIndexTantivy(
    const TantivyConfig& cfg,
    const storage::FileManagerContext& ctx,
    std::shared_ptr<milvus_storage::Space> space)
    : cfg_(cfg), space_(space) {
    mem_file_manager_ = std::make_shared<MemFileManager>(ctx, ctx.space_);
    disk_file_manager_ = std::make_shared<DiskFileManager>(ctx, ctx.space_);
    // The field id names the tantivy field inside the index.
    auto field =
        std::to_string(disk_file_manager_->GetFieldDataMeta().field_id);
    auto prefix = disk_file_manager_->GetLocalIndexObjectPrefix();
    path_ = fmt::format("/tmp/{}", prefix);
    boost::filesystem::create_directories(path_);
    // Map the Milvus data type to the coarser tantivy storage type.
    d_type_ = cfg_.to_tantivy_data_type();
    if (tantivy_index_exist(path_.c_str())) {
        LOG_INFO(
            "index {} already exists, which should happen in loading progress",
            path_);
    } else {
        wrapper_ = std::make_shared<TantivyIndexWrapper>(
            field.c_str(), d_type_, path_.c_str());
    }
}
// Best-effort cleanup: remove the local on-disk tantivy directory created by
// the constructor (build path) or by Load()/LoadV2() (load path).
//
// NOTE(review): RemoveDir may throw; a throwing destructor risks
// std::terminate during stack unwinding — consider swallowing errors here.
template <typename T>
InvertedIndexTantivy<T>::~InvertedIndexTantivy() {
    auto local_chunk_manager =
        storage::LocalChunkManagerSingleton::GetInstance().GetChunkManager();
    auto prefix = path_;
    local_chunk_manager->RemoveDir(prefix);
}
// Seal the tantivy writer: commit all pending rows so the index files on
// disk are complete. Called once from Upload() before shipping the files.
template <typename T>
void
InvertedIndexTantivy<T>::finish() {
    wrapper_->finish();
}
// Deprecated serialization hook. The tantivy index lives entirely on disk
// and is shipped by Upload(), so there is nothing to serialize in memory:
// always hand back an empty binary set.
template <typename T>
BinarySet
InvertedIndexTantivy<T>::Serialize(const Config& /* unused */) {
    return BinarySet{};
}
// Seal the index and push every file of the local index directory to remote
// storage. The returned BinarySet carries only the remote paths and sizes
// (nullptr payloads) — the actual bytes stay on disk/remote.
template <typename T>
BinarySet
InvertedIndexTantivy<T>::Upload(const Config& config) {
    finish();

    for (const auto& entry : std::filesystem::directory_iterator(path_)) {
        disk_file_manager_->AddFile(entry.path());
    }

    BinarySet ret;
    auto remote_paths_to_size = disk_file_manager_->GetRemotePathsToFileSize();
    for (auto& file : remote_paths_to_size) {
        // Record path -> size only; no binary payload is attached.
        ret.Append(file.first, nullptr, file.second);
    }

    return ret;
}
// storage-v2 upload path: identical to Upload() since the files are shipped
// through the disk file manager either way.
template <typename T>
BinarySet
InvertedIndexTantivy<T>::UploadV2(const Config& config) {
    return Upload(config);
}
// Build the inverted index from raw insert binlogs:
//  1) cache the raw column data into memory via the mem file manager;
//  2) feed every cached chunk into the tantivy writer, interpreting the
//     chunk's raw buffer with the element type that matches the field type.
//
// @param config must contain "insert_files": the binlog paths of the field.
// Panics (NotImplemented) on data types the index does not support.
template <typename T>
void
InvertedIndexTantivy<T>::Build(const Config& config) {
    auto insert_files =
        GetValueFromConfig<std::vector<std::string>>(config, "insert_files");
    AssertInfo(insert_files.has_value(), "insert_files were empty");
    auto field_datas =
        mem_file_manager_->CacheRawDataToMemory(insert_files.value());

    // Shared per-type ingestion loop: casts each chunk's raw buffer to the
    // element type and appends all of its rows to the tantivy index.
    auto add_all_chunks = [&](auto type_tag) {
        using ElemT = decltype(type_tag);
        for (const auto& data : field_datas) {
            auto n = data->get_num_rows();
            wrapper_->add_data<ElemT>(static_cast<const ElemT*>(data->Data()),
                                      n);
        }
    };

    switch (cfg_.data_type_) {
        case DataType::BOOL:
            add_all_chunks(bool{});
            break;
        case DataType::INT8:
            add_all_chunks(int8_t{});
            break;
        case DataType::INT16:
            add_all_chunks(int16_t{});
            break;
        case DataType::INT32:
            add_all_chunks(int32_t{});
            break;
        case DataType::INT64:
            add_all_chunks(int64_t{});
            break;
        case DataType::FLOAT:
            add_all_chunks(float{});
            break;
        case DataType::DOUBLE:
            add_all_chunks(double{});
            break;
        case DataType::VARCHAR:
            // VARCHAR chunks store std::string objects contiguously.
            add_all_chunks(std::string{});
            break;
        default:
            PanicInfo(ErrorCode::NotImplemented,
                      fmt::format("todo: not supported, {}", cfg_.data_type_));
    }
}
// storage-v2 build path: scan the column out of the milvus-storage Space
// instead of binlogs, materialize it into FieldData chunks, then ingest the
// chunks into tantivy exactly as the v1 path does.
//
// Panics on scan-iterator creation failure (S3Error), on broken record
// batches (DataFormatBroken), and on unsupported data types.
template <typename T>
void
InvertedIndexTantivy<T>::BuildV2(const Config& config) {
    auto field_name = mem_file_manager_->GetIndexMeta().field_name;
    auto res = space_->ScanData();
    if (!res.ok()) {
        PanicInfo(S3Error, "failed to create scan iterator");
    }
    auto reader = res.value();
    std::vector<FieldDataPtr> field_datas;
    for (auto rec = reader->Next(); rec != nullptr; rec = reader->Next()) {
        if (!rec.ok()) {
            PanicInfo(DataFormatBroken, "failed to read data");
        }
        auto data = rec.ValueUnsafe();
        auto total_num_rows = data->num_rows();
        auto col_data = data->GetColumnByName(field_name);
        auto field_data = storage::CreateFieldData(
            DataType(GetDType<T>()), 0, total_num_rows);
        field_data->FillFieldData(col_data);
        field_datas.push_back(field_data);
    }

    // Shared per-type ingestion loop: casts each chunk's raw buffer to the
    // element type and appends all of its rows to the tantivy index.
    auto add_all_chunks = [&](auto type_tag) {
        using ElemT = decltype(type_tag);
        for (const auto& data : field_datas) {
            auto n = data->get_num_rows();
            wrapper_->add_data<ElemT>(static_cast<const ElemT*>(data->Data()),
                                      n);
        }
    };

    switch (cfg_.data_type_) {
        case DataType::BOOL:
            add_all_chunks(bool{});
            break;
        case DataType::INT8:
            add_all_chunks(int8_t{});
            break;
        case DataType::INT16:
            add_all_chunks(int16_t{});
            break;
        case DataType::INT32:
            add_all_chunks(int32_t{});
            break;
        case DataType::INT64:
            add_all_chunks(int64_t{});
            break;
        case DataType::FLOAT:
            add_all_chunks(float{});
            break;
        case DataType::DOUBLE:
            add_all_chunks(double{});
            break;
        case DataType::VARCHAR:
            // VARCHAR chunks store std::string objects contiguously.
            add_all_chunks(std::string{});
            break;
        default:
            PanicInfo(ErrorCode::NotImplemented,
                      fmt::format("todo: not supported, {}", cfg_.data_type_));
    }
}
// Load a previously-built inverted index: download the tantivy files named
// in config["index_files"] into the local index directory, then open a
// read-only TantivyIndexWrapper over them.
template <typename T>
void
InvertedIndexTantivy<T>::Load(const Config& config) {
    auto index_files =
        GetValueFromConfig<std::vector<std::string>>(config, "index_files");
    // Fixed: the message previously said "disk ann index" — a copy-paste
    // from the DiskANN loader that made failures here misleading.
    AssertInfo(index_files.has_value(),
               "index file paths is empty when load inverted index");
    auto prefix = disk_file_manager_->GetLocalIndexObjectPrefix();
    disk_file_manager_->CacheIndexToDisk(index_files.value());
    wrapper_ = std::make_shared<TantivyIndexWrapper>(prefix.c_str());
}
// storage-v2 load path: the disk file manager already knows which remote
// files belong to this index, so no explicit file list is needed.
template <typename T>
void
InvertedIndexTantivy<T>::LoadV2(const Config& config) {
    disk_file_manager_->CacheIndexToDisk();
    auto prefix = disk_file_manager_->GetLocalIndexObjectPrefix();
    wrapper_ = std::make_shared<TantivyIndexWrapper>(prefix.c_str());
}
// Set (v == true) or clear (v == false) one bit per row offset returned by
// a tantivy query. Offsets are assumed to lie within the bitset's range.
inline void
apply_hits(TargetBitmap& bitset, const RustArrayWrapper& w, bool v) {
    const auto& hits = w.array_;
    for (size_t i = 0; i < hits.len; ++i) {
        bitset[hits.array[i]] = v;
    }
}
// Rows whose value equals ANY of the given terms: union of the posting
// lists of each term query, starting from an all-clear bitmap.
template <typename T>
const TargetBitmap
InvertedIndexTantivy<T>::In(size_t n, const T* values) {
    TargetBitmap bitset(Count());
    for (const T* p = values; p != values + n; ++p) {
        apply_hits(bitset, wrapper_->term_query(*p), true);
    }
    return bitset;
}
// Complement of In(): start with every row set, then clear the rows that
// match any of the given terms.
template <typename T>
const TargetBitmap
InvertedIndexTantivy<T>::NotIn(size_t n, const T* values) {
    TargetBitmap bitset(Count(), true);
    for (const T* p = values; p != values + n; ++p) {
        apply_hits(bitset, wrapper_->term_query(*p), false);
    }
    return bitset;
}
// One-sided range query. The comparison operator selects the tantivy bound
// query: LessThan/LessEqual become upper-bound queries, GreaterThan/
// GreaterEqual become lower-bound queries; the bool flag is inclusiveness.
// Throws SegcoreError(OpTypeInvalid) for any other operator.
template <typename T>
const TargetBitmap
InvertedIndexTantivy<T>::Range(T value, OpType op) {
    TargetBitmap bitset(Count());

    switch (op) {
        case OpType::LessThan: {
            auto array = wrapper_->upper_bound_range_query(value, false);
            apply_hits(bitset, array, true);
        } break;
        case OpType::LessEqual: {
            auto array = wrapper_->upper_bound_range_query(value, true);
            apply_hits(bitset, array, true);
        } break;
        case OpType::GreaterThan: {
            auto array = wrapper_->lower_bound_range_query(value, false);
            apply_hits(bitset, array, true);
        } break;
        case OpType::GreaterEqual: {
            auto array = wrapper_->lower_bound_range_query(value, true);
            apply_hits(bitset, array, true);
        } break;
        default:
            throw SegcoreError(OpTypeInvalid,
                               fmt::format("Invalid OperatorType: {}", op));
    }

    return bitset;
}
// Two-sided range query (lower op value op upper); tantivy evaluates both
// bounds in a single range query, with per-bound inclusiveness flags.
template <typename T>
const TargetBitmap
InvertedIndexTantivy<T>::Range(T lower_bound_value,
                               bool lb_inclusive,
                               T upper_bound_value,
                               bool ub_inclusive) {
    TargetBitmap bitset(Count());
    auto array = wrapper_->range_query(
        lower_bound_value, upper_bound_value, lb_inclusive, ub_inclusive);
    apply_hits(bitset, array, true);
    return bitset;
}
// Rows whose (keyword) value starts with `prefix`. Only meaningful for the
// std::string instantiation; dispatched from Query() on OpType::PrefixMatch.
template <typename T>
const TargetBitmap
InvertedIndexTantivy<T>::PrefixMatch(const std::string_view prefix) {
    TargetBitmap bitset(Count());
    // The tantivy binding takes an owning std::string, so materialize the view.
    auto hits = wrapper_->prefix_query(std::string(prefix));
    apply_hits(bitset, hits, true);
    return bitset;
}
// Non-string types have no extra operators here; delegate to the generic
// ScalarIndex dispatch (term / range queries).
template <typename T>
const TargetBitmap
InvertedIndexTantivy<T>::Query(const DatasetPtr& dataset) {
    return ScalarIndex<T>::Query(dataset);
}
// String specialization: intercept OpType::PrefixMatch and serve it with the
// tantivy prefix query; everything else falls through to the base dispatch.
template <>
const TargetBitmap
InvertedIndexTantivy<std::string>::Query(const DatasetPtr& dataset) {
    auto op = dataset->Get<OpType>(OPERATOR_TYPE);
    if (op == OpType::PrefixMatch) {
        auto prefix = dataset->Get<std::string>(PREFIX_VALUE);
        return PrefixMatch(prefix);
    }
    return ScalarIndex<std::string>::Query(dataset);
}
// Explicit instantiations for every scalar type the index factory may
// request, so the template definitions can stay in this translation unit.
template class InvertedIndexTantivy<bool>;
template class InvertedIndexTantivy<int8_t>;
template class InvertedIndexTantivy<int16_t>;
template class InvertedIndexTantivy<int32_t>;
template class InvertedIndexTantivy<int64_t>;
template class InvertedIndexTantivy<float>;
template class InvertedIndexTantivy<double>;
template class InvertedIndexTantivy<std::string>;
} // namespace milvus::index

View File

@ -0,0 +1,176 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include "index/Index.h"
#include "storage/FileManager.h"
#include "storage/DiskFileManagerImpl.h"
#include "storage/MemFileManagerImpl.h"
#include "tantivy-binding.h"
#include "tantivy-wrapper.h"
#include "index/StringIndex.h"
#include "index/TantivyConfig.h"
#include "storage/space.h"
namespace milvus::index {
using TantivyIndexWrapper = milvus::tantivy::TantivyIndexWrapper;
using RustArrayWrapper = milvus::tantivy::RustArrayWrapper;
// Scalar index backed by tantivy (a Rust search library accessed through a
// C binding). The index is built and queried on disk rather than fully in
// RAM. Per the PR description: VARCHAR values are indexed as whole keywords
// (no tokenization), and since HasRawData() is false, raw-data operations
// fall back to chunk storage.
template <typename T>
class InvertedIndexTantivy : public ScalarIndex<T> {
 public:
    using MemFileManager = storage::MemFileManagerImpl;
    using MemFileManagerPtr = std::shared_ptr<MemFileManager>;
    using DiskFileManager = storage::DiskFileManagerImpl;
    using DiskFileManagerPtr = std::shared_ptr<DiskFileManager>;

    // Convenience ctor for the non-storage-v2 path (no Space).
    explicit InvertedIndexTantivy(const TantivyConfig& cfg,
                                  const storage::FileManagerContext& ctx)
        : InvertedIndexTantivy(cfg, ctx, nullptr) {
    }

    explicit InvertedIndexTantivy(const TantivyConfig& cfg,
                                  const storage::FileManagerContext& ctx,
                                  std::shared_ptr<milvus_storage::Space> space);

    // Removes the local on-disk index directory.
    // NOTE(review): relies on the base class for virtual dispatch — confirm
    // ScalarIndex declares a virtual destructor.
    ~InvertedIndexTantivy();

    /*
     * deprecated.
     * TODO: why not remove this?
     */
    void
    Load(const BinarySet& binary_set, const Config& config = {}) override {
        PanicInfo(ErrorCode::NotImplemented, "load v1 should be deprecated");
    }

    // Load index files listed in config["index_files"] from remote storage.
    void
    Load(const Config& config = {}) override;

    // storage-v2 load path.
    void
    LoadV2(const Config& config = {}) override;

    /*
     * deprecated.
     * TODO: why not remove this?
     */
    void
    BuildWithDataset(const DatasetPtr& dataset,
                     const Config& config = {}) override {
        PanicInfo(ErrorCode::NotImplemented,
                  "BuildWithDataset should be deprecated");
    }

    // Build from insert binlogs (config["insert_files"]).
    void
    Build(const Config& config = {}) override;

    // storage-v2 build path (scans the column from the Space).
    void
    BuildV2(const Config& config = {}) override;

    // Number of indexed rows, as reported by the tantivy wrapper.
    int64_t
    Count() override {
        return wrapper_->count();
    }

    /*
     * deprecated.
     * TODO: why not remove this?
     */
    void
    BuildWithRawData(size_t n,
                     const void* values,
                     const Config& config = {}) override {
        PanicInfo(ErrorCode::NotImplemented,
                  "BuildWithRawData should be deprecated");
    }

    /*
     * deprecated.
     * TODO: why not remove this?
     */
    BinarySet
    Serialize(const Config& config /* not used */) override;

    // Seal the index and push its files to remote storage.
    BinarySet
    Upload(const Config& config = {}) override;

    BinarySet
    UploadV2(const Config& config = {}) override;

    /*
     * deprecated, only used in small chunk index.
     */
    void
    Build(size_t n, const T* values) override {
        PanicInfo(ErrorCode::NotImplemented, "Build should not be called");
    }

    // Bitmap of rows equal to any of the given values.
    const TargetBitmap
    In(size_t n, const T* values) override;

    // Bitmap of rows equal to none of the given values.
    const TargetBitmap
    NotIn(size_t n, const T* values) override;

    // One-sided range query (op in {<, <=, >, >=}).
    const TargetBitmap
    Range(T value, OpType op) override;

    // Two-sided range query with per-bound inclusiveness.
    const TargetBitmap
    Range(T lower_bound_value,
          bool lb_inclusive,
          T upper_bound_value,
          bool ub_inclusive) override;

    // False: the index does not retain raw values, so callers needing raw
    // data must fall back to chunk storage.
    const bool
    HasRawData() const override {
        return false;
    }

    T
    Reverse_Lookup(size_t offset) const override {
        PanicInfo(ErrorCode::NotImplemented,
                  "Reverse_Lookup should not be handled by inverted index");
    }

    int64_t
    Size() override {
        return Count();
    }

    // Keyword prefix match; only meaningful for the std::string instantiation.
    const TargetBitmap
    PrefixMatch(const std::string_view prefix);

    const TargetBitmap
    Query(const DatasetPtr& dataset) override;

 private:
    // Commit pending rows so the on-disk index files are complete.
    void
    finish();

 private:
    std::shared_ptr<TantivyIndexWrapper> wrapper_;
    TantivyConfig cfg_;
    TantivyDataType d_type_;
    // Local working directory holding the tantivy index files.
    std::string path_;

    /*
     * To avoid IO amplification, we use both mem file manager & disk file manager
     * 1, build phase, we just need the raw data in memory, we use MemFileManager.CacheRawDataToMemory;
     * 2, upload phase, the index was already on the disk, we use DiskFileManager.AddFile directly;
     * 3, load phase, we need the index on the disk instead of memory, we use DiskFileManager.CacheIndexToDisk;
     * Btw, this approach can be applied to DiskANN also.
     */
    MemFileManagerPtr mem_file_manager_;
    DiskFileManagerPtr disk_file_manager_;
    std::shared_ptr<milvus_storage::Space> space_;
};
} // namespace milvus::index

View File

@ -36,6 +36,7 @@ constexpr const char* METRIC_TYPE = "metric_type";
// scalar index type
constexpr const char* ASCENDING_SORT = "STL_SORT";
constexpr const char* MARISA_TRIE = "Trie";
constexpr const char* INVERTED_INDEX_TYPE = "INVERTED";
// index meta
constexpr const char* COLLECTION_ID = "collection_id";

View File

@ -0,0 +1,51 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include "storage/Types.h"
#include "tantivy-binding.h"
namespace milvus::index {
// Configuration carried into InvertedIndexTantivy: the Milvus field type the
// index is built for, plus its mapping onto tantivy's storage types.
struct TantivyConfig {
    DataType data_type_;

    // Map the Milvus data type to the coarser tantivy type: all integer
    // widths share I64, both float widths share F64, BOOL maps to Bool, and
    // VARCHAR is indexed as a whole keyword. Panics (NotImplemented) on any
    // other type.
    TantivyDataType
    to_tantivy_data_type() {
        if (data_type_ == DataType::BOOL) {
            return TantivyDataType::Bool;
        }
        if (data_type_ == DataType::INT8 || data_type_ == DataType::INT16 ||
            data_type_ == DataType::INT32 || data_type_ == DataType::INT64) {
            return TantivyDataType::I64;
        }
        if (data_type_ == DataType::FLOAT || data_type_ == DataType::DOUBLE) {
            return TantivyDataType::F64;
        }
        if (data_type_ == DataType::VARCHAR) {
            return TantivyDataType::Keyword;
        }
        PanicInfo(ErrorCode::NotImplemented,
                  fmt::format("not implemented data type: {}", data_type_));
    }
};
} // namespace milvus::index

View File

@ -27,6 +27,9 @@ ScalarIndexCreator::ScalarIndexCreator(
const storage::FileManagerContext& file_manager_context)
: config_(config), dtype_(dtype) {
milvus::index::CreateIndexInfo index_info;
if (config.contains("index_type")) {
index_type_ = config.at("index_type").get<std::string>();
}
index_info.field_type = dtype_;
index_info.index_type = index_type();
index_ = index::IndexFactory::GetInstance().CreateIndex(
@ -74,8 +77,7 @@ ScalarIndexCreator::Load(const milvus::BinarySet& binary_set) {
std::string
ScalarIndexCreator::index_type() {
// TODO
return "sort";
return index_type_;
}
BinarySet

View File

@ -60,6 +60,7 @@ class ScalarIndexCreator : public IndexCreatorBase {
index::IndexBasePtr index_ = nullptr;
Config config_;
DataType dtype_;
IndexType index_type_;
};
using ScalarIndexCreatorPtr = std::unique_ptr<ScalarIndexCreator>;

View File

@ -1,3 +1,14 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include <map>

View File

@ -1,3 +1,14 @@
# Copyright (C) 2019-2020 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under the License
# Copyright (c) Microsoft Corporation. All rights reserved.
# SPDX-License-Identifier: MIT

View File

@ -1,3 +1,14 @@
# Copyright (C) 2019-2020 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under the License
# Copyright (c) Microsoft Corporation. All rights reserved.
# SPDX-License-Identifier: MIT

View File

@ -1,3 +1,14 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include "../AzureBlobChunkManager.h"
#include <azure/identity/workload_identity_credential.hpp>
#include <gtest/gtest.h>

View File

@ -37,6 +37,7 @@ add_subdirectory(rocksdb)
add_subdirectory(rdkafka)
add_subdirectory(simdjson)
add_subdirectory(opendal)
add_subdirectory(tantivy)
add_subdirectory(milvus-storage)

View File

@ -0,0 +1,66 @@
# Build the tantivy-binding Rust crate with cargo and expose the resulting
# static library to the C++ build as the imported target `tantivy_binding`.
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
set(CARGO_CMD cargo build)
set(TARGET_DIR "debug")
else ()
set(CARGO_CMD cargo build --release)
set(TARGET_DIR "release")
endif ()
# Install locations for the static library and the generated/wrapper headers.
set(TANTIVY_LIB_DIR "${CMAKE_INSTALL_PREFIX}/lib")
set(TANTIVY_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/include")
set(TANTIVY_NAME "libtantivy_binding${CMAKE_STATIC_LIBRARY_SUFFIX}")
# cargo writes the artifact under CARGO_TARGET_DIR/<debug|release>/.
set(LIB_FILE "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_DIR}/${TANTIVY_NAME}")
set(LIB_HEADER_FOLDER "${CMAKE_CURRENT_SOURCE_DIR}/tantivy-binding/include")
# In fact, cargo was already installed on our builder environment.
# Below settings are used to suit for first local development.
set(HOME_VAR $ENV{HOME})
set(PATH_VAR $ENV{PATH})
set(ENV{PATH} ${HOME_VAR}/.cargo/bin:${PATH_VAR})
message($ENV{PATH})
# Diagnostic target: list the cargo toolchain binaries available at build time.
add_custom_command(OUTPUT ls_cargo
COMMENT "ls cargo"
COMMAND ls ${HOME_VAR}/.cargo/bin/
)
add_custom_target(ls_cargo_target DEPENDS ls_cargo)
# Compile the crate; CARGO_TARGET_DIR redirects output into the CMake build tree.
add_custom_command(OUTPUT compile_tantivy
COMMENT "Compiling tantivy binding"
COMMAND CARGO_TARGET_DIR=${CMAKE_CURRENT_BINARY_DIR} ${CARGO_CMD}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/tantivy-binding)
add_custom_target(tantivy_binding_target DEPENDS compile_tantivy ls_cargo_target)
# Copy the cbindgen-generated header, the C++ wrapper header and the static
# library into the install prefix so downstream targets can consume them.
set(INSTALL_COMMAND
cp ${LIB_HEADER_FOLDER}/tantivy-binding.h ${TANTIVY_INCLUDE_DIR}/ &&
cp ${CMAKE_CURRENT_SOURCE_DIR}/tantivy-wrapper.h ${TANTIVY_INCLUDE_DIR}/ &&
cp ${LIB_FILE} ${TANTIVY_LIB_DIR}/)
add_custom_command(OUTPUT install_tantivy
COMMENT "Install tantivy target ${LIB_FILE} to ${TANTIVY_LIB_DIR}"
COMMAND ${INSTALL_COMMAND}
)
add_custom_target(install_tantivy_target DEPENDS install_tantivy tantivy_binding_target)
# Imported target other CMake code links against (see test_tantivy below).
add_library(tantivy_binding STATIC IMPORTED)
add_dependencies(tantivy_binding
install_tantivy_target
)
set_target_properties(tantivy_binding
PROPERTIES
IMPORTED_GLOBAL TRUE
IMPORTED_LOCATION "${TANTIVY_LIB_DIR}/${TANTIVY_NAME}"
INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_INSTALL_PREFIX}/include")
# ASan instrumentation for debug builds of the standalone binding test.
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
add_compile_options(-fno-stack-protector -fno-omit-frame-pointer -fno-var-tracking -fsanitize=address)
add_link_options(-fno-stack-protector -fno-omit-frame-pointer -fno-var-tracking -fsanitize=address)
endif()
add_executable(test_tantivy test.cpp)
target_link_libraries(test_tantivy
tantivy_binding
boost_filesystem
dl
)

View File

@ -0,0 +1,16 @@
# Generated by Cargo
# will have compiled files and executables
debug/
target/
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
Cargo.lock
# These are backup files generated by rustfmt
**/*.rs.bk
# MSVC Windows builds of rustc generate these, which store debugging information
*.pdb
.vscode/

View File

@ -0,0 +1,18 @@
[package]
name = "tantivy-binding"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
tantivy = "0.21.0"
futures = "0.3.21"
libc = "0.2"
scopeguard = "1.2"
[build-dependencies]
cbindgen = "0.26.0"
[lib]
crate-type = ["staticlib"]

View File

@ -0,0 +1,13 @@
use std::{env, path::PathBuf};

/// Build script: generate the C/C++ header for this crate's `extern "C"` API
/// with cbindgen. The header lands at `include/<crate-name>.h` in the crate
/// root, from where CMake copies it into the install prefix.
fn main() {
    let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
    let crate_name = env::var("CARGO_PKG_NAME").unwrap();
    let header_path = PathBuf::from(&manifest_dir)
        .join("include")
        .join(format!("{}.h", crate_name));
    let bindings = cbindgen::generate(&manifest_dir).unwrap();
    bindings.write_to_file(header_path);
}

View File

@ -0,0 +1,2 @@
language = "C++"
pragma_once = true

View File

@ -0,0 +1,102 @@
#pragma once

// NOTE(review): this header appears to be generated by cbindgen from the
// tantivy-binding Rust crate (see build.rs / cbindgen.toml in this change);
// regenerate it rather than editing by hand — TODO confirm it is checked in
// only as a build artifact.

#include <cstdarg>
#include <cstdint>
#include <cstdlib>
#include <ostream>
#include <new>

// Mirrors the Rust `TantivyDataType` enum (data_type.rs).
enum class TantivyDataType : uint8_t {
Keyword,
I64,
F64,
Bool,
};

// FFI view of a Rust Vec<u32> of matching document ids; must be released
// with free_rust_array, never with free/delete.
struct RustArray {
uint32_t *array;
size_t len;
size_t cap;
};

extern "C" {

// ---- index writer lifecycle (index_writer_c.rs) ----
void *tantivy_create_index(const char *field_name, TantivyDataType data_type, const char *path);
void tantivy_free_index_writer(void *ptr);
void tantivy_finish_index(void *ptr);
void *tantivy_create_reader_for_index(void *ptr);

// ---- building: append rows to the index ----
void tantivy_index_add_int8s(void *ptr, const int8_t *array, uintptr_t len);
void tantivy_index_add_int16s(void *ptr, const int16_t *array, uintptr_t len);
void tantivy_index_add_int32s(void *ptr, const int32_t *array, uintptr_t len);
void tantivy_index_add_int64s(void *ptr, const int64_t *array, uintptr_t len);
void tantivy_index_add_f32s(void *ptr, const float *array, uintptr_t len);
void tantivy_index_add_f64s(void *ptr, const double *array, uintptr_t len);
void tantivy_index_add_bools(void *ptr, const bool *array, uintptr_t len);
void tantivy_index_add_keyword(void *ptr, const char *s);

// ---- reader lifecycle and queries (index_reader_c.rs) ----
bool tantivy_index_exist(const char *path);
void free_rust_array(RustArray array);
void *tantivy_load_index(const char *path);
void tantivy_free_index_reader(void *ptr);
uint32_t tantivy_index_count(void *ptr);
RustArray tantivy_term_query_i64(void *ptr, int64_t term);
RustArray tantivy_lower_bound_range_query_i64(void *ptr, int64_t lower_bound, bool inclusive);
RustArray tantivy_upper_bound_range_query_i64(void *ptr, int64_t upper_bound, bool inclusive);
RustArray tantivy_range_query_i64(void *ptr,
int64_t lower_bound,
int64_t upper_bound,
bool lb_inclusive,
bool ub_inclusive);
RustArray tantivy_term_query_f64(void *ptr, double term);
RustArray tantivy_lower_bound_range_query_f64(void *ptr, double lower_bound, bool inclusive);
RustArray tantivy_upper_bound_range_query_f64(void *ptr, double upper_bound, bool inclusive);
RustArray tantivy_range_query_f64(void *ptr,
double lower_bound,
double upper_bound,
bool lb_inclusive,
bool ub_inclusive);
RustArray tantivy_term_query_bool(void *ptr, bool term);
RustArray tantivy_term_query_keyword(void *ptr, const char *term);
RustArray tantivy_lower_bound_range_query_keyword(void *ptr,
const char *lower_bound,
bool inclusive);
RustArray tantivy_upper_bound_range_query_keyword(void *ptr,
const char *upper_bound,
bool inclusive);
RustArray tantivy_range_query_keyword(void *ptr,
const char *lower_bound,
const char *upper_bound,
bool lb_inclusive,
bool ub_inclusive);
RustArray tantivy_prefix_query_keyword(void *ptr, const char *prefix);

} // extern "C"

View File

@ -0,0 +1,30 @@
use libc::size_t;

/// FFI-safe view of a Rust `Vec<u32>` handed across the C boundary.
/// `len` and `cap` are both retained so the original `Vec` can be rebuilt
/// and dropped on the Rust side by `free_rust_array`.
#[repr(C)]
pub struct RustArray {
    array: *mut u32,
    len: size_t,
    cap: size_t,
}

impl RustArray {
    /// Leak `vec` and expose its buffer to C. Ownership returns to Rust only
    /// when the caller passes the struct back to `free_rust_array`.
    pub fn from_vec(vec: Vec<u32>) -> RustArray {
        let len = vec.len();
        let cap = vec.capacity();
        let v = vec.leak();
        RustArray {
            array: v.as_mut_ptr(),
            len,
            cap,
        }
    }
}

/// Reclaim and drop a `RustArray` previously produced by `RustArray::from_vec`.
///
/// Fix: tolerate a null `array` pointer (the C++ `RustArrayWrapper` nulls the
/// pointer when moved-from); `Vec::from_raw_parts` on a null pointer is
/// undefined behavior.
#[no_mangle]
pub extern "C" fn free_rust_array(array: RustArray) {
    let RustArray { array, len, cap } = array;
    if array.is_null() {
        return;
    }
    unsafe {
        drop(Vec::from_raw_parts(array, len, cap));
    }
}

View File

@ -0,0 +1,9 @@
// Data types supported by the inverted index. Must stay layout-compatible
// (`repr(u8)`, same variant order) with the C++ `TantivyDataType` enum in the
// cbindgen-generated tantivy-binding.h.
#[repr(u8)]
pub enum TantivyDataType {
// Text,
Keyword,
// U64,
I64,
F64,
Bool,
}

View File

@ -0,0 +1,204 @@
use std::ops::Bound;
use std::str::FromStr;
use tantivy::collector::TopDocs;
use tantivy::directory::MmapDirectory;
use tantivy::query::{Query, RangeQuery, TermQuery, RegexQuery};
use tantivy::schema::{Field, IndexRecordOption};
use tantivy::{Index, IndexReader, ReloadPolicy, Term};
use crate::util::make_bounds;

// Read-side wrapper around a single-field tantivy index. Every query returns
// the matching tantivy doc ids, which the caller treats as row offsets.
pub struct IndexReaderWrapper {
pub field_name: String,
pub field: Field,
pub reader: IndexReader,
// Total doc count, summed from segment metas at construction time.
pub cnt: u32,
}
impl IndexReaderWrapper {
// Build a reader over `index`. Reload policy is Manual: the index is
// expected to be immutable once finished, so one explicit reload suffices.
pub fn new(index: &Index, field_name: &String, field: Field) -> IndexReaderWrapper {
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let metas = index
.searchable_segment_metas()
.unwrap();
let mut sum: u32 = 0;
// NOTE(review): `max_doc` counts all docs in a segment, presumably
// including deleted ones; fine here only if nothing is ever deleted —
// confirm.
for meta in metas {
sum += meta.max_doc();
}
reader.reload().unwrap();
IndexReaderWrapper {
field_name: field_name.to_string(),
field,
reader,
cnt: sum,
}
}
// Open an on-disk index produced by IndexWriterWrapper. Assumes the schema
// has exactly one (the first) field.
pub fn load(path: &str) -> IndexReaderWrapper {
let dir = MmapDirectory::open(path).unwrap();
let index = Index::open(dir).unwrap();
let field = index.schema().fields().next().unwrap().0;
let schema = index.schema();
let field_name = schema.get_field_name(field);
IndexReaderWrapper::new(&index, &String::from_str(field_name).unwrap(), field)
}
pub fn count(&self) -> u32 {
self.cnt
}
// Run `q` and collect all matching doc ids. Uses TopDocs limited to the
// total doc count so every hit is returned.
// NOTE(review): tantivy's TopDocs::with_limit panics when the limit is 0 —
// an empty index would panic here; verify this cannot be reached with
// cnt == 0.
fn search(&self, q: &dyn Query) -> Vec<u32> {
let searcher = self.reader.searcher();
let cnt = self.cnt;
let hits = searcher
.search(q, &TopDocs::with_limit(cnt as usize))
.unwrap();
let mut ret = Vec::new();
for (_, address) in hits {
ret.push(address.doc_id);
}
ret
}
// ---- i64 queries (also serve INT8/16/32 via widening on the write side) ----
pub fn term_query_i64(&self, term: i64) -> Vec<u32> {
let q = TermQuery::new(
Term::from_field_i64(self.field, term),
IndexRecordOption::Basic,
);
self.search(&q)
}
pub fn lower_bound_range_query_i64(&self, lower_bound: i64, inclusive: bool) -> Vec<u32> {
let q = RangeQuery::new_i64_bounds(
self.field_name.to_string(),
make_bounds(lower_bound, inclusive),
Bound::Unbounded,
);
self.search(&q)
}
pub fn upper_bound_range_query_i64(&self, upper_bound: i64, inclusive: bool) -> Vec<u32> {
let q = RangeQuery::new_i64_bounds(
self.field_name.to_string(),
Bound::Unbounded,
make_bounds(upper_bound, inclusive),
);
self.search(&q)
}
pub fn range_query_i64(
&self,
lower_bound: i64,
upper_bound: i64,
lb_inclusive: bool,
ub_inclusive: bool,
) -> Vec<u32> {
let lb = make_bounds(lower_bound, lb_inclusive);
let ub = make_bounds(upper_bound, ub_inclusive);
let q = RangeQuery::new_i64_bounds(self.field_name.to_string(), lb, ub);
self.search(&q)
}
// ---- f64 queries (also serve FLOAT via widening on the write side) ----
pub fn term_query_f64(&self, term: f64) -> Vec<u32> {
let q = TermQuery::new(
Term::from_field_f64(self.field, term),
IndexRecordOption::Basic,
);
self.search(&q)
}
pub fn lower_bound_range_query_f64(&self, lower_bound: f64, inclusive: bool) -> Vec<u32> {
let q = RangeQuery::new_f64_bounds(
self.field_name.to_string(),
make_bounds(lower_bound, inclusive),
Bound::Unbounded,
);
self.search(&q)
}
pub fn upper_bound_range_query_f64(&self, upper_bound: f64, inclusive: bool) -> Vec<u32> {
let q = RangeQuery::new_f64_bounds(
self.field_name.to_string(),
Bound::Unbounded,
make_bounds(upper_bound, inclusive),
);
self.search(&q)
}
pub fn range_query_f64(
&self,
lower_bound: f64,
upper_bound: f64,
lb_inclusive: bool,
ub_inclusive: bool,
) -> Vec<u32> {
let lb = make_bounds(lower_bound, lb_inclusive);
let ub = make_bounds(upper_bound, ub_inclusive);
let q = RangeQuery::new_f64_bounds(self.field_name.to_string(), lb, ub);
self.search(&q)
}
pub fn term_query_bool(&self, term: bool) -> Vec<u32> {
let q = TermQuery::new(
Term::from_field_bool(self.field, term),
IndexRecordOption::Basic,
);
self.search(&q)
}
// ---- keyword (VARCHAR) queries: each row is one raw, untokenized term ----
pub fn term_query_keyword(&self, term: &str) -> Vec<u32> {
let q = TermQuery::new(
Term::from_field_text(self.field, term),
IndexRecordOption::Basic,
);
self.search(&q)
}
pub fn lower_bound_range_query_keyword(&self, lower_bound: &str, inclusive: bool) -> Vec<u32> {
let q = RangeQuery::new_str_bounds(
self.field_name.to_string(),
make_bounds(lower_bound, inclusive),
Bound::Unbounded,
);
self.search(&q)
}
pub fn upper_bound_range_query_keyword(&self, upper_bound: &str, inclusive: bool) -> Vec<u32> {
let q = RangeQuery::new_str_bounds(
self.field_name.to_string(),
Bound::Unbounded,
make_bounds(upper_bound, inclusive),
);
self.search(&q)
}
pub fn range_query_keyword(
&self,
lower_bound: &str,
upper_bound: &str,
lb_inclusive: bool,
ub_inclusive: bool,
) -> Vec<u32> {
let lb = make_bounds(lower_bound, lb_inclusive);
let ub = make_bounds(upper_bound, ub_inclusive);
let q = RangeQuery::new_str_bounds(self.field_name.to_string(), lb, ub);
self.search(&q)
}
// Prefix match implemented as a regex over the whole term.
// NOTE(review): `prefix` is interpolated into the regex unescaped, so any
// regex metacharacter in the prefix (".", "*", "(", ...) changes the match
// semantics or makes from_pattern fail — the prefix should be regex-escaped
// before interpolation. `(.|\n)*` is used as a DOTALL stand-in so the tail
// may contain newlines.
pub fn prefix_query_keyword(
&self,
prefix: &str,
) -> Vec<u32> {
let pattern = format!("{}(.|\n)*", prefix);
let q = RegexQuery::from_pattern(&pattern, self.field).unwrap();
self.search(&q)
}
}

View File

@ -0,0 +1,206 @@
use std::ffi::{c_char, c_void, CStr};
use crate::{
index_reader::IndexReaderWrapper, util_c::tantivy_index_exist, util::{create_binding, free_binding}, array::RustArray,
};

// C ABI shims over IndexReaderWrapper. Every function receives the opaque
// pointer produced by tantivy_load_index / tantivy_create_reader_for_index and
// trusts the C++ caller for pointer validity and NUL-terminated strings.
// Returned RustArrays must be released with free_rust_array.
#[no_mangle]
pub extern "C" fn tantivy_load_index(path: *const c_char) -> *mut c_void {
assert!(tantivy_index_exist(path));
let path_str = unsafe { CStr::from_ptr(path) };
let wrapper = IndexReaderWrapper::load(path_str.to_str().unwrap());
create_binding(wrapper)
}
#[no_mangle]
pub extern "C" fn tantivy_free_index_reader(ptr: *mut c_void) {
free_binding::<IndexReaderWrapper>(ptr);
}
// -------------------------query--------------------
#[no_mangle]
pub extern "C" fn tantivy_index_count(ptr: *mut c_void) -> u32 {
let real = ptr as *mut IndexReaderWrapper;
unsafe { (*real).count() }
}
// ---- i64 ----
#[no_mangle]
pub extern "C" fn tantivy_term_query_i64(ptr: *mut c_void, term: i64) -> RustArray {
let real = ptr as *mut IndexReaderWrapper;
unsafe {
let hits = (*real).term_query_i64(term);
RustArray::from_vec(hits)
}
}
#[no_mangle]
pub extern "C" fn tantivy_lower_bound_range_query_i64(
ptr: *mut c_void,
lower_bound: i64,
inclusive: bool,
) -> RustArray {
let real = ptr as *mut IndexReaderWrapper;
unsafe {
let hits = (*real).lower_bound_range_query_i64(lower_bound, inclusive);
RustArray::from_vec(hits)
}
}
#[no_mangle]
pub extern "C" fn tantivy_upper_bound_range_query_i64(
ptr: *mut c_void,
upper_bound: i64,
inclusive: bool,
) -> RustArray {
let real = ptr as *mut IndexReaderWrapper;
unsafe {
let hits = (*real).upper_bound_range_query_i64(upper_bound, inclusive);
RustArray::from_vec(hits)
}
}
#[no_mangle]
pub extern "C" fn tantivy_range_query_i64(
ptr: *mut c_void,
lower_bound: i64,
upper_bound: i64,
lb_inclusive: bool,
ub_inclusive: bool,
) -> RustArray {
let real = ptr as *mut IndexReaderWrapper;
unsafe {
let hits = (*real).range_query_i64(lower_bound, upper_bound, lb_inclusive, ub_inclusive);
RustArray::from_vec(hits)
}
}
// ---- f64 ----
#[no_mangle]
pub extern "C" fn tantivy_term_query_f64(ptr: *mut c_void, term: f64) -> RustArray {
let real = ptr as *mut IndexReaderWrapper;
unsafe {
let hits = (*real).term_query_f64(term);
RustArray::from_vec(hits)
}
}
#[no_mangle]
pub extern "C" fn tantivy_lower_bound_range_query_f64(
ptr: *mut c_void,
lower_bound: f64,
inclusive: bool,
) -> RustArray {
let real = ptr as *mut IndexReaderWrapper;
unsafe {
let hits = (*real).lower_bound_range_query_f64(lower_bound, inclusive);
RustArray::from_vec(hits)
}
}
#[no_mangle]
pub extern "C" fn tantivy_upper_bound_range_query_f64(
ptr: *mut c_void,
upper_bound: f64,
inclusive: bool,
) -> RustArray {
let real = ptr as *mut IndexReaderWrapper;
unsafe {
let hits = (*real).upper_bound_range_query_f64(upper_bound, inclusive);
RustArray::from_vec(hits)
}
}
#[no_mangle]
pub extern "C" fn tantivy_range_query_f64(
ptr: *mut c_void,
lower_bound: f64,
upper_bound: f64,
lb_inclusive: bool,
ub_inclusive: bool,
) -> RustArray {
let real = ptr as *mut IndexReaderWrapper;
unsafe {
let hits = (*real).range_query_f64(lower_bound, upper_bound, lb_inclusive, ub_inclusive);
RustArray::from_vec(hits)
}
}
// ---- bool ----
#[no_mangle]
pub extern "C" fn tantivy_term_query_bool(ptr: *mut c_void, term: bool) -> RustArray {
let real = ptr as *mut IndexReaderWrapper;
unsafe {
let hits = (*real).term_query_bool(term);
RustArray::from_vec(hits)
}
}
// ---- keyword (VARCHAR) ----
#[no_mangle]
pub extern "C" fn tantivy_term_query_keyword(ptr: *mut c_void, term: *const c_char) -> RustArray {
let real = ptr as *mut IndexReaderWrapper;
unsafe {
let c_str = CStr::from_ptr(term);
let hits = (*real).term_query_keyword(c_str.to_str().unwrap());
RustArray::from_vec(hits)
}
}
#[no_mangle]
pub extern "C" fn tantivy_lower_bound_range_query_keyword(
ptr: *mut c_void,
lower_bound: *const c_char,
inclusive: bool,
) -> RustArray {
let real = ptr as *mut IndexReaderWrapper;
unsafe {
let c_lower_bound = CStr::from_ptr(lower_bound);
let hits =
(*real).lower_bound_range_query_keyword(c_lower_bound.to_str().unwrap(), inclusive);
RustArray::from_vec(hits)
}
}
#[no_mangle]
pub extern "C" fn tantivy_upper_bound_range_query_keyword(
ptr: *mut c_void,
upper_bound: *const c_char,
inclusive: bool,
) -> RustArray {
let real = ptr as *mut IndexReaderWrapper;
unsafe {
let c_upper_bound = CStr::from_ptr(upper_bound);
let hits =
(*real).upper_bound_range_query_keyword(c_upper_bound.to_str().unwrap(), inclusive);
RustArray::from_vec(hits)
}
}
#[no_mangle]
pub extern "C" fn tantivy_range_query_keyword(
ptr: *mut c_void,
lower_bound: *const c_char,
upper_bound: *const c_char,
lb_inclusive: bool,
ub_inclusive: bool,
) -> RustArray {
let real = ptr as *mut IndexReaderWrapper;
unsafe {
let c_lower_bound = CStr::from_ptr(lower_bound);
let c_upper_bound = CStr::from_ptr(upper_bound);
let hits = (*real).range_query_keyword(
c_lower_bound.to_str().unwrap(),
c_upper_bound.to_str().unwrap(),
lb_inclusive,
ub_inclusive,
);
RustArray::from_vec(hits)
}
}
#[no_mangle]
pub extern "C" fn tantivy_prefix_query_keyword(ptr: *mut c_void, prefix: *const c_char) -> RustArray {
let real = ptr as *mut IndexReaderWrapper;
unsafe {
let c_str = CStr::from_ptr(prefix);
let hits = (*real).prefix_query_keyword(c_str.to_str().unwrap());
RustArray::from_vec(hits)
}
}

View File

@ -0,0 +1,108 @@
use futures::executor::block_on;
use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, INDEXED};
use tantivy::{doc, tokenizer, Index, IndexWriter};
use crate::data_type::TantivyDataType;
use crate::index_reader::IndexReaderWrapper;

// Write-side wrapper that builds a single-field tantivy index on disk.
// Intended usage: new() -> add_*() for every row -> finish() -> (optionally)
// create_reader() to query the freshly built index.
pub struct IndexWriterWrapper {
pub field_name: String,
pub field: Field,
pub data_type: TantivyDataType,
pub path: String,
pub index: Index,
pub index_writer: IndexWriter,
}
impl IndexWriterWrapper {
// Open a reader over this writer's index. Presumably only meaningful
// after finish() has committed — confirm with callers (the C shim labels
// it test-only).
pub fn create_reader(&self) -> IndexReaderWrapper {
IndexReaderWrapper::new(&self.index, &self.field_name, self.field)
}
// Create a fresh index directory at `path` with a one-field schema.
// Keyword fields use the raw tokenizer so each row is indexed as a single
// untokenized term (no full-text search).
pub fn new(field_name: String, data_type: TantivyDataType, path: String) -> IndexWriterWrapper {
let field: Field;
let mut schema_builder = Schema::builder();
let mut use_raw_tokenizer = false;
match data_type {
TantivyDataType::I64 => {
field = schema_builder.add_i64_field(&field_name, INDEXED);
}
TantivyDataType::F64 => {
field = schema_builder.add_f64_field(&field_name, INDEXED);
}
TantivyDataType::Bool => {
field = schema_builder.add_bool_field(&field_name, INDEXED);
}
TantivyDataType::Keyword => {
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("raw_tokenizer")
.set_index_option(IndexRecordOption::Basic);
let text_options = TextOptions::default().set_indexing_options(text_field_indexing);
field = schema_builder.add_text_field(&field_name, text_options);
use_raw_tokenizer = true;
}
}
let schema = schema_builder.build();
let index = Index::create_in_dir(path.clone(), schema).unwrap();
if use_raw_tokenizer {
index
.tokenizers()
.register("raw_tokenizer", tokenizer::RawTokenizer::default());
}
// Single writer thread with a 15 MB indexing budget.
let index_writer = index.writer_with_num_threads(1, 15_000_000).unwrap();
IndexWriterWrapper {
field_name,
field,
data_type,
path,
index,
index_writer,
}
}
// Narrow integer/float types are widened so the index stores only i64/f64,
// matching the query side (term_query_i64 / term_query_f64).
pub fn add_i8(&self, data: i8) {
self.add_i64(data.into())
}
pub fn add_i16(&self, data: i16) {
self.add_i64(data.into())
}
pub fn add_i32(&self, data: i32) {
self.add_i64(data.into())
}
pub fn add_i64(&self, data: i64) {
self.index_writer
.add_document(doc!(self.field => data))
.unwrap();
}
pub fn add_f32(&self, data: f32) {
self.add_f64(data.into())
}
pub fn add_f64(&self, data: f64) {
self.index_writer
.add_document(doc!(self.field => data))
.unwrap();
}
pub fn add_bool(&self, data: bool) {
self.index_writer
.add_document(doc!(self.field => data))
.unwrap();
}
pub fn add_keyword(&self, data: &str) {
self.index_writer
.add_document(doc!(self.field => data))
.unwrap();
}
// Commit all pending documents and garbage-collect superseded segment
// files so only the final index files remain on disk.
pub fn finish(&mut self) {
self.index_writer.commit().unwrap();
block_on(self.index_writer.garbage_collect_files()).unwrap();
}
}

View File

@ -0,0 +1,133 @@
use core::slice;
use std::ffi::{c_char, c_void, CStr};
use crate::{
data_type::TantivyDataType, index_writer::IndexWriterWrapper, util::{create_binding, free_binding},
};

// C ABI shims over IndexWriterWrapper. The opaque pointer returned by
// tantivy_create_index is passed back to every other function; the C++ caller
// is trusted for pointer validity, array lengths and NUL-terminated strings.
#[no_mangle]
pub extern "C" fn tantivy_create_index(
field_name: *const c_char,
data_type: TantivyDataType,
path: *const c_char,
) -> *mut c_void {
let field_name_str = unsafe { CStr::from_ptr(field_name) };
let path_str = unsafe { CStr::from_ptr(path) };
let wrapper = IndexWriterWrapper::new(
String::from(field_name_str.to_str().unwrap()),
data_type,
String::from(path_str.to_str().unwrap()),
);
create_binding(wrapper)
}
#[no_mangle]
pub extern "C" fn tantivy_free_index_writer(ptr: *mut c_void) {
free_binding::<IndexWriterWrapper>(ptr);
}
// Commit the index; after this the data on disk is complete and loadable.
#[no_mangle]
pub extern "C" fn tantivy_finish_index(ptr: *mut c_void) {
let real = ptr as *mut IndexWriterWrapper;
unsafe {
(*real).finish();
}
}
// should be only used for test
#[no_mangle]
pub extern "C" fn tantivy_create_reader_for_index(ptr: *mut c_void) -> *mut c_void{
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let reader = (*real).create_reader();
create_binding(reader)
}
}
// -------------------------build--------------------
// Each add_* reads `len` elements from `array` and appends them row by row;
// narrow types are widened to i64/f64 by the writer.
#[no_mangle]
pub extern "C" fn tantivy_index_add_int8s(ptr: *mut c_void, array: *const i8, len: usize) {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe {
for data in arr {
(*real).add_i8(*data);
}
}
}
#[no_mangle]
pub extern "C" fn tantivy_index_add_int16s(ptr: *mut c_void, array: *const i16, len: usize) {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe {
for data in arr {
(*real).add_i16(*data);
}
}
}
#[no_mangle]
pub extern "C" fn tantivy_index_add_int32s(ptr: *mut c_void, array: *const i32, len: usize) {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe {
for data in arr {
(*real).add_i32(*data);
}
}
}
#[no_mangle]
pub extern "C" fn tantivy_index_add_int64s(ptr: *mut c_void, array: *const i64, len: usize) {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe {
for data in arr {
(*real).add_i64(*data);
}
}
}
#[no_mangle]
pub extern "C" fn tantivy_index_add_f32s(ptr: *mut c_void, array: *const f32, len: usize) {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe {
for data in arr {
(*real).add_f32(*data);
}
}
}
#[no_mangle]
pub extern "C" fn tantivy_index_add_f64s(ptr: *mut c_void, array: *const f64, len: usize) {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe {
for data in arr {
(*real).add_f64(*data);
}
}
}
#[no_mangle]
pub extern "C" fn tantivy_index_add_bools(ptr: *mut c_void, array: *const bool, len: usize) {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe {
for data in arr {
(*real).add_bool(*data);
}
}
}
// TODO: this is not a very efficient way, since we must call this function many times, which
// will bring a lot of overhead caused by the rust binding.
#[no_mangle]
pub extern "C" fn tantivy_index_add_keyword(ptr: *mut c_void, s: *const c_char) {
let real = ptr as *mut IndexWriterWrapper;
let c_str = unsafe { CStr::from_ptr(s) };
unsafe { (*real).add_keyword(c_str.to_str().unwrap()) }
}

View File

@ -0,0 +1,23 @@
// Crate root for the tantivy C binding: wires the FFI modules together.
mod data_type;
mod index_writer;
mod index_writer_c;
mod util;
mod util_c;
mod array;
mod index_reader;
mod index_reader_c;

// NOTE(review): `add` and the test below are leftovers from the cargo
// library template; they are not part of the FFI surface and could be
// removed in a follow-up.
pub fn add(left: usize, right: usize) -> usize {
left + right
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn it_works() {
let result = add(2, 2);
assert_eq!(result, 4);
}
}

View File

@ -0,0 +1,30 @@
use std::ops::Bound;
use std::ffi::c_void;
use tantivy::{directory::MmapDirectory, Index};

/// Return true when `path` holds a tantivy index.
///
/// Fix: the original `MmapDirectory::open(path).unwrap()` panicked when the
/// directory did not exist or was unreadable; this function is reached from C
/// (`tantivy_index_exist`) and a panic must not unwind across the FFI
/// boundary. Any open/metadata error is now reported as "no index".
pub fn index_exist(path: &str) -> bool {
    match MmapDirectory::open(path) {
        Ok(dir) => Index::exists(&dir).unwrap_or(false),
        Err(_) => false,
    }
}

/// Wrap `bound` as an inclusive or exclusive range bound.
pub fn make_bounds<T>(bound: T, inclusive: bool) -> Bound<T> {
    if inclusive {
        Bound::Included(bound)
    } else {
        Bound::Excluded(bound)
    }
}

/// Move `wrapper` to the heap and return it as an opaque pointer for C.
/// Must be released with `free_binding::<T>` using the same `T`.
pub fn create_binding<T>(wrapper: T) -> *mut c_void {
    let bp = Box::new(wrapper);
    let p_heap: *mut T = Box::into_raw(bp);
    p_heap as *mut c_void
}

/// Reconstruct the `Box<T>` produced by `create_binding` and drop it.
pub fn free_binding<T>(ptr: *mut c_void) {
    let real = ptr as *mut T;
    unsafe {
        drop(Box::from_raw(real));
    }
}

View File

@ -0,0 +1,9 @@
use std::ffi::{c_char, CStr};
use crate::{util::index_exist};

// C ABI shim: report whether `path` (NUL-terminated) holds a tantivy index.
#[no_mangle]
pub extern "C" fn tantivy_index_exist(path: *const c_char) -> bool {
let path_str = unsafe { CStr::from_ptr(path) };
index_exist(path_str.to_str().unwrap())
}

View File

@ -0,0 +1,362 @@
#include <cassert>
#include <iostream>
#include <sstream>
#include <string>
#include <type_traits>

#include <fmt/format.h>

#include "tantivy-binding.h"
namespace milvus::tantivy {
// RAII owner of a RustArray handed over from the Rust side. Releases the
// underlying Rust Vec exactly once via free_rust_array; move-only so two
// wrappers can never double-free the same allocation.
struct RustArrayWrapper {
    explicit RustArrayWrapper(RustArray array) : array_(array) {
    }

    // Fix: deleted copy operations now take const& (the conventional
    // signature); copying remains forbidden either way.
    RustArrayWrapper(const RustArrayWrapper&) = delete;
    RustArrayWrapper&
    operator=(const RustArrayWrapper&) = delete;

    RustArrayWrapper(RustArrayWrapper&& other) noexcept {
        array_.array = other.array_.array;
        array_.len = other.array_.len;
        array_.cap = other.array_.cap;
        other.array_.array = nullptr;
        other.array_.len = 0;
        other.array_.cap = 0;
    }

    RustArrayWrapper&
    operator=(RustArrayWrapper&& other) noexcept {
        if (this != &other) {
            free();
            array_.array = other.array_.array;
            array_.len = other.array_.len;
            array_.cap = other.array_.cap;
            other.array_.array = nullptr;
            other.array_.len = 0;
            other.array_.cap = 0;
        }
        return *this;
    }

    ~RustArrayWrapper() {
        free();
    }

    // Print the hit list to stdout; debugging aid only.
    void
    debug() {
        std::stringstream ss;
        ss << "[ ";
        // Fix: index with size_t — array_.len is size_t, and the old `int i`
        // mixed signed/unsigned in the comparison.
        for (size_t i = 0; i < array_.len; i++) {
            ss << array_.array[i] << " ";
        }
        ss << "]";
        std::cout << ss.str() << std::endl;
    }

    RustArray array_;

 private:
    void
    free() {
        // nullptr means moved-from: ownership already transferred, nothing to
        // release on the Rust side.
        if (array_.array != nullptr) {
            free_rust_array(array_);
        }
    }
};
// Map a C++ element type to the TantivyDataType the index should use:
// bool -> Bool, any other integral -> I64, floating point -> F64.
// The bool check must come first because std::is_integral_v<bool> is true.
// NOTE(review): unsupported T throws a std::string (fmt::format's result),
// not an exception type — callers catching std::exception will miss it;
// consider std::runtime_error in a follow-up.
template <typename T>
inline TantivyDataType
guess_data_type() {
if constexpr (std::is_same_v<T, bool>) {
return TantivyDataType::Bool;
}
if constexpr (std::is_integral_v<T>) {
return TantivyDataType::I64;
}
if constexpr (std::is_floating_point_v<T>) {
return TantivyDataType::F64;
}
throw fmt::format("guess_data_type: unsupported data type: {}",
typeid(T).name());
}
// C++ facade over the Rust tantivy binding. Two construction modes:
//  - (field_name, data_type, path): create a new index, add_data(), then
//    finish() before querying;
//  - (path): load an existing, already-finished index read-only.
// Move-only; owns the opaque writer/reader handles and frees them in free().
// NOTE(review): the unsupported-type branches throw std::string (fmt::format's
// result) rather than an exception type; kept as-is to preserve what callers
// may catch.
struct TantivyIndexWrapper {
    using IndexWriter = void*;
    using IndexReader = void*;

    TantivyIndexWrapper() = default;

    TantivyIndexWrapper(TantivyIndexWrapper&) = delete;
    TantivyIndexWrapper&
    operator=(TantivyIndexWrapper&) = delete;

    TantivyIndexWrapper(TantivyIndexWrapper&& other) noexcept {
        writer_ = other.writer_;
        reader_ = other.reader_;
        finished_ = other.finished_;
        other.writer_ = nullptr;
        other.reader_ = nullptr;
        other.finished_ = false;
    }

    TantivyIndexWrapper&
    operator=(TantivyIndexWrapper&& other) noexcept {
        if (this != &other) {
            free();
            writer_ = other.writer_;
            reader_ = other.reader_;
            finished_ = other.finished_;
            other.writer_ = nullptr;
            other.reader_ = nullptr;
            other.finished_ = false;
        }
        return *this;
    }

    // Create a fresh single-field index at `path` (write mode).
    TantivyIndexWrapper(const char* field_name,
                        TantivyDataType data_type,
                        const char* path) {
        writer_ = tantivy_create_index(field_name, data_type, path);
    }

    // Open an existing index at `path` (read-only mode).
    explicit TantivyIndexWrapper(const char* path) {
        assert(tantivy_index_exist(path));
        reader_ = tantivy_load_index(path);
    }

    ~TantivyIndexWrapper() {
        free();
    }

    // Append `len` values of type T to the index. std::string rows are sent
    // one by one over the FFI (known overhead, see the Rust-side TODO).
    template <typename T>
    void
    add_data(const T* array, uintptr_t len) {
        if constexpr (std::is_same_v<T, bool>) {
            tantivy_index_add_bools(writer_, array, len);
            return;
        }
        if constexpr (std::is_same_v<T, int8_t>) {
            tantivy_index_add_int8s(writer_, array, len);
            return;
        }
        if constexpr (std::is_same_v<T, int16_t>) {
            tantivy_index_add_int16s(writer_, array, len);
            return;
        }
        if constexpr (std::is_same_v<T, int32_t>) {
            tantivy_index_add_int32s(writer_, array, len);
            return;
        }
        if constexpr (std::is_same_v<T, int64_t>) {
            tantivy_index_add_int64s(writer_, array, len);
            return;
        }
        if constexpr (std::is_same_v<T, float>) {
            tantivy_index_add_f32s(writer_, array, len);
            return;
        }
        if constexpr (std::is_same_v<T, double>) {
            tantivy_index_add_f64s(writer_, array, len);
            return;
        }
        if constexpr (std::is_same_v<T, std::string>) {
            // TODO: not very efficient, a lot of overhead due to rust-ffi call.
            for (uintptr_t i = 0; i < len; i++) {
                tantivy_index_add_keyword(
                    writer_, static_cast<const std::string*>(array)[i].c_str());
            }
            return;
        }
        throw fmt::format("InvertedIndex.add_data: unsupported data type: {}",
                          typeid(T).name());
    }

    // Commit the index and open a reader over it. Idempotent.
    // Fix: `finished_` was never set to true, so a second call would commit
    // again and overwrite (leak) the previously created reader handle.
    inline void
    finish() {
        if (!finished_) {
            tantivy_finish_index(writer_);
            reader_ = tantivy_create_reader_for_index(writer_);
            finished_ = true;
        }
    }

    // Total number of documents in the index; requires a valid reader_.
    inline uint32_t
    count() {
        return tantivy_index_count(reader_);
    }

 public:
    // Exact-match query; returns the matching row offsets.
    template <typename T>
    RustArrayWrapper
    term_query(T term) {
        auto array = [&]() {
            if constexpr (std::is_same_v<T, bool>) {
                return tantivy_term_query_bool(reader_, term);
            }
            if constexpr (std::is_integral_v<T>) {
                return tantivy_term_query_i64(reader_,
                                              static_cast<int64_t>(term));
            }
            if constexpr (std::is_floating_point_v<T>) {
                return tantivy_term_query_f64(reader_,
                                              static_cast<double>(term));
            }
            if constexpr (std::is_same_v<T, std::string>) {
                return tantivy_term_query_keyword(
                    reader_, static_cast<std::string>(term).c_str());
            }
            throw fmt::format(
                "InvertedIndex.term_query: unsupported data type: {}",
                typeid(T).name());
        }();
        return RustArrayWrapper(array);
    }

    // value > lower_bound (or >= when inclusive).
    template <typename T>
    RustArrayWrapper
    lower_bound_range_query(T lower_bound, bool inclusive) {
        auto array = [&]() {
            if constexpr (std::is_integral_v<T>) {
                return tantivy_lower_bound_range_query_i64(
                    reader_, static_cast<int64_t>(lower_bound), inclusive);
            }
            if constexpr (std::is_floating_point_v<T>) {
                return tantivy_lower_bound_range_query_f64(
                    reader_, static_cast<double>(lower_bound), inclusive);
            }
            if constexpr (std::is_same_v<T, std::string>) {
                return tantivy_lower_bound_range_query_keyword(
                    reader_,
                    static_cast<std::string>(lower_bound).c_str(),
                    inclusive);
            }
            throw fmt::format(
                "InvertedIndex.lower_bound_range_query: unsupported data type: "
                "{}",
                typeid(T).name());
        }();
        return RustArrayWrapper(array);
    }

    // value < upper_bound (or <= when inclusive).
    template <typename T>
    RustArrayWrapper
    upper_bound_range_query(T upper_bound, bool inclusive) {
        auto array = [&]() {
            if constexpr (std::is_integral_v<T>) {
                return tantivy_upper_bound_range_query_i64(
                    reader_, static_cast<int64_t>(upper_bound), inclusive);
            }
            if constexpr (std::is_floating_point_v<T>) {
                return tantivy_upper_bound_range_query_f64(
                    reader_, static_cast<double>(upper_bound), inclusive);
            }
            if constexpr (std::is_same_v<T, std::string>) {
                return tantivy_upper_bound_range_query_keyword(
                    reader_,
                    static_cast<std::string>(upper_bound).c_str(),
                    inclusive);
            }
            throw fmt::format(
                "InvertedIndex.upper_bound_range_query: unsupported data type: "
                "{}",
                typeid(T).name());
        }();
        return RustArrayWrapper(array);
    }

    // lower_bound < value < upper_bound, each side optionally inclusive.
    template <typename T>
    RustArrayWrapper
    range_query(T lower_bound,
                T upper_bound,
                bool lb_inclusive,
                bool ub_inclusive) {
        auto array = [&]() {
            if constexpr (std::is_integral_v<T>) {
                return tantivy_range_query_i64(
                    reader_,
                    static_cast<int64_t>(lower_bound),
                    static_cast<int64_t>(upper_bound),
                    lb_inclusive,
                    ub_inclusive);
            }
            if constexpr (std::is_floating_point_v<T>) {
                return tantivy_range_query_f64(reader_,
                                               static_cast<double>(lower_bound),
                                               static_cast<double>(upper_bound),
                                               lb_inclusive,
                                               ub_inclusive);
            }
            if constexpr (std::is_same_v<T, std::string>) {
                return tantivy_range_query_keyword(
                    reader_,
                    static_cast<std::string>(lower_bound).c_str(),
                    static_cast<std::string>(upper_bound).c_str(),
                    lb_inclusive,
                    ub_inclusive);
            }
            throw fmt::format(
                "InvertedIndex.range_query: unsupported data type: {}",
                typeid(T).name());
        }();
        return RustArrayWrapper(array);
    }

    // Keyword-only: rows whose term starts with `prefix`.
    RustArrayWrapper
    prefix_query(const std::string& prefix) {
        auto array = tantivy_prefix_query_keyword(reader_, prefix.c_str());
        return RustArrayWrapper(array);
    }

 public:
    inline IndexWriter
    get_writer() {
        return writer_;
    }

    inline IndexReader
    get_reader() {
        return reader_;
    }

 private:
    void
    check_search() {
        // TODO
    }

    void
    free() {
        if (writer_ != nullptr) {
            tantivy_free_index_writer(writer_);
        }
        if (reader_ != nullptr) {
            tantivy_free_index_reader(reader_);
        }
    }

 private:
    bool finished_ = false;
    IndexWriter writer_ = nullptr;
    IndexReader reader_ = nullptr;
};
} // namespace milvus::tantivy

View File

@ -0,0 +1,165 @@
#include <cstdint>
#include <cassert>
#include <boost/filesystem.hpp>
#include <iostream>
#include "tantivy-binding.h"
#include "tantivy-wrapper.h"
using namespace milvus::tantivy;
// Smoke test for numeric element types: build an index over {1..6}, then run
// one term query and the three range-query variants, printing the hits.
template <typename T>
void
run() {
std::cout << "run " << typeid(T).name() << std::endl;
auto path = "/tmp/inverted-index/test-binding/";
boost::filesystem::remove_all(path);
boost::filesystem::create_directories(path);
// NOTE(review): this branch is dead — the directory was wiped and recreated
// empty just above, so tantivy_index_exist(path) cannot be true here.
if (tantivy_index_exist(path)) {
auto w = TantivyIndexWrapper(path);
auto cnt = w.count();
std::cout << "index already exist, open it, count: " << cnt
<< std::endl;
return;
}
auto w = TantivyIndexWrapper("test_field_name", guess_data_type<T>(), path);
T arr[] = {1, 2, 3, 4, 5, 6};
auto l = sizeof(arr) / sizeof(T);
w.add_data(arr, l);
w.finish();
assert(w.count() == l);
{
auto hits = w.term_query<T>(2);
hits.debug();
}
{
auto hits = w.lower_bound_range_query<T>(1, false);
hits.debug();
}
{
auto hits = w.upper_bound_range_query<T>(4, false);
hits.debug();
}
{
auto hits = w.range_query<T>(2, 4, false, false);
hits.debug();
}
}
template <>
void
run<bool>() {
    // Boolean specialization: only term queries are exercised, since
    // range queries on booleans are not meaningful.
    std::cout << "run bool" << std::endl;

    auto path = "/tmp/inverted-index/test-binding/";
    boost::filesystem::remove_all(path);
    boost::filesystem::create_directories(path);

    if (tantivy_index_exist(path)) {
        auto w = TantivyIndexWrapper(path);
        auto cnt = w.count();
        std::cout << "index already exist, open it, count: " << cnt
                  << std::endl;
        return;
    }

    auto wrapper =
        TantivyIndexWrapper("test_field_name", TantivyDataType::Bool, path);
    bool values[] = {true, false, false, true, false, true};
    auto total = sizeof(values) / sizeof(bool);
    wrapper.add_data(values, total);
    wrapper.finish();
    assert(wrapper.count() == total);

    {
        auto hits = wrapper.term_query<bool>(true);
        hits.debug();
    }
}
template <>
void
run<std::string>() {
    // Keyword specialization: each row is indexed as one whole keyword,
    // so term, range and prefix queries are all exercised here.
    std::cout << "run string" << std::endl;

    auto path = "/tmp/inverted-index/test-binding/";
    boost::filesystem::remove_all(path);
    boost::filesystem::create_directories(path);

    if (tantivy_index_exist(path)) {
        auto w = TantivyIndexWrapper(path);
        auto cnt = w.count();
        std::cout << "index already exist, open it, count: " << cnt
                  << std::endl;
        return;
    }

    auto wrapper =
        TantivyIndexWrapper("test_field_name", TantivyDataType::Keyword, path);
    std::vector<std::string> values = {"a", "b", "aaa", "abbb"};
    auto total = values.size();
    wrapper.add_data<std::string>(values.data(), total);
    wrapper.finish();
    assert(wrapper.count() == total);

    {
        auto hits = wrapper.term_query<std::string>("a");
        hits.debug();
    }
    {
        auto hits = wrapper.lower_bound_range_query<std::string>("aa", true);
        hits.debug();
    }
    {
        auto hits = wrapper.upper_bound_range_query<std::string>("ab", true);
        hits.debug();
    }
    {
        auto hits = wrapper.range_query<std::string>("aa", "ab", true, true);
        hits.debug();
    }
    {
        auto hits = wrapper.prefix_query("a");
        hits.debug();
    }
}
// Drive the binding smoke test for every supported scalar type; bool
// and std::string go through their dedicated template specializations.
int
main(int argc, char* argv[]) {
run<int8_t>();
run<int16_t>();
run<int32_t>();
run<int64_t>();
run<float>();
run<double>();
run<bool>();
run<std::string>();
return 0;
}

View File

@ -60,6 +60,7 @@ set(MILVUS_TEST_FILES
test_binlog_index.cpp
test_storage.cpp
test_exec.cpp
test_inverted_index.cpp
)
if ( BUILD_DISK_ANN STREQUAL "ON" )

View File

@ -32,10 +32,9 @@ TEST(Array, TestConstructArray) {
ASSERT_EQ(int_array.get_data<int>(i), i);
}
ASSERT_TRUE(int_array.is_same_array(field_int_array));
auto int_array_tmp = Array(
const_cast<char*>(int_array.data()),
int_array.byte_size(),
int_array.get_element_type(),
auto int_array_tmp = Array(const_cast<char*>(int_array.data()),
int_array.byte_size(),
int_array.get_element_type(),
{});
auto int_8_array = Array(const_cast<char*>(int_array.data()),
int_array.byte_size(),
@ -48,10 +47,9 @@ TEST(Array, TestConstructArray) {
{});
ASSERT_EQ(int_array.length(), int_16_array.length());
ASSERT_TRUE(int_array_tmp == int_array);
auto int_array_view = ArrayView(
const_cast<char*>(int_array.data()),
int_array.byte_size(),
int_array.get_element_type(),
auto int_array_view = ArrayView(const_cast<char*>(int_array.data()),
int_array.byte_size(),
int_array.get_element_type(),
{});
ASSERT_EQ(int_array.length(), int_array_view.length());
ASSERT_EQ(int_array.byte_size(), int_array_view.byte_size());
@ -76,10 +74,9 @@ TEST(Array, TestConstructArray) {
long_array.get_element_type(),
{});
ASSERT_TRUE(long_array_tmp == long_array);
auto long_array_view = ArrayView(
const_cast<char*>(long_array.data()),
long_array.byte_size(),
long_array.get_element_type(),
auto long_array_view = ArrayView(const_cast<char*>(long_array.data()),
long_array.byte_size(),
long_array.get_element_type(),
{});
ASSERT_EQ(long_array.length(), long_array_view.length());
ASSERT_EQ(long_array.byte_size(), long_array_view.byte_size());
@ -114,10 +111,9 @@ TEST(Array, TestConstructArray) {
string_array.get_element_type(),
std::move(string_element_offsets));
ASSERT_TRUE(string_array_tmp == string_array);
auto string_array_view = ArrayView(
const_cast<char*>(string_array.data()),
string_array.byte_size(),
string_array.get_element_type(),
auto string_array_view = ArrayView(const_cast<char*>(string_array.data()),
string_array.byte_size(),
string_array.get_element_type(),
std::move(string_view_element_offsets));
ASSERT_EQ(string_array.length(), string_array_view.length());
ASSERT_EQ(string_array.byte_size(), string_array_view.byte_size());
@ -143,10 +139,9 @@ TEST(Array, TestConstructArray) {
bool_array.get_element_type(),
{});
ASSERT_TRUE(bool_array_tmp == bool_array);
auto bool_array_view = ArrayView(
const_cast<char*>(bool_array.data()),
bool_array.byte_size(),
bool_array.get_element_type(),
auto bool_array_view = ArrayView(const_cast<char*>(bool_array.data()),
bool_array.byte_size(),
bool_array.get_element_type(),
{});
ASSERT_EQ(bool_array.length(), bool_array_view.length());
ASSERT_EQ(bool_array.byte_size(), bool_array_view.byte_size());
@ -172,10 +167,9 @@ TEST(Array, TestConstructArray) {
float_array.get_element_type(),
{});
ASSERT_TRUE(float_array_tmp == float_array);
auto float_array_view = ArrayView(
const_cast<char*>(float_array.data()),
float_array.byte_size(),
float_array.get_element_type(),
auto float_array_view = ArrayView(const_cast<char*>(float_array.data()),
float_array.byte_size(),
float_array.get_element_type(),
{});
ASSERT_EQ(float_array.length(), float_array_view.length());
ASSERT_EQ(float_array.byte_size(), float_array_view.byte_size());
@ -202,10 +196,9 @@ TEST(Array, TestConstructArray) {
double_array.get_element_type(),
{});
ASSERT_TRUE(double_array_tmp == double_array);
auto double_array_view = ArrayView(
const_cast<char*>(double_array.data()),
double_array.byte_size(),
double_array.get_element_type(),
auto double_array_view = ArrayView(const_cast<char*>(double_array.data()),
double_array.byte_size(),
double_array.get_element_type(),
{});
ASSERT_EQ(double_array.length(), double_array_view.length());
ASSERT_EQ(double_array.byte_size(), double_array_view.byte_size());

View File

@ -198,12 +198,12 @@ TEST(CBoolIndexTest, All) {
{ DeleteBinarySet(binary_set); }
}
delete[] (char*)(half_ds->GetTensor());
delete[](char*)(half_ds->GetTensor());
}
// TODO: more scalar type.
TEST(CInt64IndexTest, All) {
auto arr = GenArr<int64_t>(NB);
auto arr = GenSortedArr<int64_t>(NB);
auto params = GenParams<int64_t>();
for (const auto& tp : params) {
@ -315,6 +315,6 @@ TEST(CStringIndexTest, All) {
{ DeleteBinarySet(binary_set); }
}
delete[] (char*)(str_ds->GetTensor());
delete[](char*)(str_ds->GetTensor());
}
#endif

View File

@ -0,0 +1,511 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <gtest/gtest.h>
#include <functional>
#include <boost/filesystem.hpp>
#include <unordered_set>
#include "index/InvertedIndexTantivy.h"
#include "storage/Util.h"
#include "storage/InsertData.h"
#include "indexbuilder/IndexFactory.h"
#include "index/IndexFactory.h"
#include "test_utils/indexbuilder_test_utils.h"
#include "index/Meta.h"
using namespace milvus;
// TODO: I would suggest that our all indexes use this test to simulate the real production environment.
namespace milvus::test {
// Build the binlog field metadata (collection/partition/segment/field
// identifiers) consumed by the storage layer in these tests.
auto
gen_field_meta(int64_t collection_id = 1,
               int64_t partition_id = 2,
               int64_t segment_id = 3,
               int64_t field_id = 101) -> storage::FieldDataMeta {
    storage::FieldDataMeta meta{};
    meta.collection_id = collection_id;
    meta.partition_id = partition_id;
    meta.segment_id = segment_id;
    meta.field_id = field_id;
    return meta;
}
// Build the index metadata (segment/field identifiers plus build id and
// version) used when constructing and uploading the test index.
auto
gen_index_meta(int64_t segment_id = 3,
               int64_t field_id = 101,
               int64_t index_build_id = 1000,
               int64_t index_version = 10000) -> storage::IndexMeta {
    storage::IndexMeta meta{};
    meta.segment_id = segment_id;
    meta.field_id = field_id;
    meta.build_id = index_build_id;
    meta.index_version = index_version;
    return meta;
}
// Storage config pointing the chunk manager at a local-filesystem root.
auto
gen_local_storage_config(const std::string& root_path)
    -> storage::StorageConfig {
    storage::StorageConfig config{};
    config.storage_type = "local";
    config.root_path = root_path;
    return config;
}
// RAII helper around a ChunkManager for tests: records every file it
// writes and removes them (plus the chunk manager's root path) on
// destruction, so a failed test does not leave files behind.
struct ChunkManagerWrapper {
    // explicit: a ChunkManagerPtr should never silently convert into a
    // wrapper (constructing one has cleanup side effects on destruction).
    explicit ChunkManagerWrapper(storage::ChunkManagerPtr cm) : cm_(cm) {
    }

    ~ChunkManagerWrapper() {
        for (const auto& file : written_) {
            cm_->Remove(file);
        }

        boost::filesystem::remove_all(cm_->GetRootPath());
    }

    // Write `len` bytes from `buf` to `filepath` and remember the path
    // for cleanup in the destructor.
    void
    Write(const std::string& filepath, void* buf, uint64_t len) {
        written_.insert(filepath);
        cm_->Write(filepath, buf, len);
    }

    const storage::ChunkManagerPtr cm_;
    std::unordered_set<std::string> written_;
};
} // namespace milvus::test
// End-to-end test of the inverted index for a numeric/bool type:
// 1. generate data and serialize it to a binlog via the local chunk manager,
// 2. build the INVERTED index through the indexbuilder factory and upload it,
// 3. load it back through the index factory and verify In/NotIn and
//    (for non-bool types) unary and binary Range queries against a
//    brute-force reference computed on the raw data.
template <typename T, DataType dtype>
void
test_run() {
int64_t collection_id = 1;
int64_t partition_id = 2;
int64_t segment_id = 3;
int64_t field_id = 101;
int64_t index_build_id = 1000;
int64_t index_version = 10000;
auto field_meta =
test::gen_field_meta(collection_id, partition_id, segment_id, field_id);
auto index_meta = test::gen_index_meta(
segment_id, field_id, index_build_id, index_version);
std::string root_path = "/tmp/test-inverted-index/";
auto storage_config = test::gen_local_storage_config(root_path);
auto cm = storage::CreateChunkManager(storage_config);
size_t nb = 10000;
std::vector<T> data_gen;
boost::container::vector<T> data;
// Sorted data for ordered types; random coin flips for bool.
if constexpr (!std::is_same_v<T, bool>) {
data_gen = GenSortedArr<T>(nb);
} else {
for (size_t i = 0; i < nb; i++) {
data_gen.push_back(rand() % 2 == 0);
}
}
for (auto x : data_gen) {
data.push_back(x);
}
// Serialize the column into insert (binlog) format and write it through
// the chunk manager so the index build can read it like production data.
auto field_data = storage::CreateFieldData(dtype);
field_data->FillFieldData(data.data(), data.size());
storage::InsertData insert_data(field_data);
insert_data.SetFieldDataMeta(field_meta);
insert_data.SetTimestamps(0, 100);
auto serialized_bytes = insert_data.Serialize(storage::Remote);
auto get_binlog_path = [=](int64_t log_id) {
return fmt::format("{}/{}/{}/{}/{}",
collection_id,
partition_id,
segment_id,
field_id,
log_id);
};
auto log_path = get_binlog_path(0);
auto cm_w = test::ChunkManagerWrapper(cm);
cm_w.Write(log_path, serialized_bytes.data(), serialized_bytes.size());
storage::FileManagerContext ctx(field_meta, index_meta, cm);
std::vector<std::string> index_files;
// Build phase: create the inverted index from the binlog and upload it,
// collecting the produced index file names for the load phase.
{
Config config;
config["index_type"] = milvus::index::INVERTED_INDEX_TYPE;
config["insert_files"] = std::vector<std::string>{log_path};
auto index = indexbuilder::IndexFactory::GetInstance().CreateIndex(
dtype, config, ctx);
index->Build();
auto bs = index->Upload();
for (const auto& [key, _] : bs.binary_map_) {
index_files.push_back(key);
}
}
// Load phase: reload the uploaded index and verify query results.
{
index::CreateIndexInfo index_info{};
index_info.index_type = milvus::index::INVERTED_INDEX_TYPE;
index_info.field_type = dtype;
Config config;
config["index_files"] = index_files;
auto index =
index::IndexFactory::GetInstance().CreateIndex(index_info, ctx);
index->Load(config);
auto cnt = index->Count();
ASSERT_EQ(cnt, nb);
using IndexType = index::ScalarIndex<T>;
auto real_index = dynamic_cast<IndexType*>(index.get());
if constexpr (!std::is_floating_point_v<T>) {
// hard to compare floating-point value.
// In: bitset must flag exactly the rows whose value is in the
// queried term set.
{
boost::container::vector<T> test_data;
std::unordered_set<T> s;
size_t nq = 10;
for (size_t i = 0; i < nq && i < nb; i++) {
test_data.push_back(data[i]);
s.insert(data[i]);
}
auto bitset =
real_index->In(test_data.size(), test_data.data());
ASSERT_EQ(cnt, bitset.size());
for (size_t i = 0; i < bitset.size(); i++) {
ASSERT_EQ(bitset[i], s.find(data[i]) != s.end());
}
}
// NotIn: complement of the In result for the same term set.
{
boost::container::vector<T> test_data;
std::unordered_set<T> s;
size_t nq = 10;
for (size_t i = 0; i < nq && i < nb; i++) {
test_data.push_back(data[i]);
s.insert(data[i]);
}
auto bitset =
real_index->NotIn(test_data.size(), test_data.data());
ASSERT_EQ(cnt, bitset.size());
for (size_t i = 0; i < bitset.size(); i++) {
ASSERT_NE(bitset[i], s.find(data[i]) != s.end());
}
}
}
using RefFunc = std::function<bool(int64_t)>;
if constexpr (!std::is_same_v<T, bool>) {
// range query on boolean is not reasonable.
// Unary range: each operator is checked row-by-row against the
// equivalent comparison on the raw data.
{
std::vector<std::tuple<T, OpType, RefFunc>> test_cases{
{20,
OpType::GreaterThan,
[&](int64_t i) -> bool { return data[i] > 20; }},
{20,
OpType::GreaterEqual,
[&](int64_t i) -> bool { return data[i] >= 20; }},
{20,
OpType::LessThan,
[&](int64_t i) -> bool { return data[i] < 20; }},
{20,
OpType::LessEqual,
[&](int64_t i) -> bool { return data[i] <= 20; }},
};
for (const auto& [test_value, op, ref] : test_cases) {
auto bitset = real_index->Range(test_value, op);
ASSERT_EQ(cnt, bitset.size());
for (size_t i = 0; i < bitset.size(); i++) {
auto ans = bitset[i];
auto should = ref(i);
ASSERT_EQ(ans, should)
<< "op: " << op << ", @" << i << ", ans: " << ans
<< ", ref: " << should;
}
}
}
// Binary range: all four inclusive/exclusive bound combinations
// of (1, 20) are checked against the raw-data reference.
{
std::vector<std::tuple<T, bool, T, bool, RefFunc>> test_cases{
{1,
false,
20,
false,
[&](int64_t i) -> bool {
return 1 < data[i] && data[i] < 20;
}},
{1,
false,
20,
true,
[&](int64_t i) -> bool {
return 1 < data[i] && data[i] <= 20;
}},
{1,
true,
20,
false,
[&](int64_t i) -> bool {
return 1 <= data[i] && data[i] < 20;
}},
{1,
true,
20,
true,
[&](int64_t i) -> bool {
return 1 <= data[i] && data[i] <= 20;
}},
};
for (const auto& [lb, lb_inclusive, ub, ub_inclusive, ref] :
test_cases) {
auto bitset =
real_index->Range(lb, lb_inclusive, ub, ub_inclusive);
ASSERT_EQ(cnt, bitset.size());
for (size_t i = 0; i < bitset.size(); i++) {
auto ans = bitset[i];
auto should = ref(i);
ASSERT_EQ(ans, should) << "@" << i << ", ans: " << ans
<< ", ref: " << should;
}
}
}
}
}
}
// End-to-end test of the inverted index for VARCHAR: same
// build/upload/load pipeline as test_run, with random string data and
// an additional PrefixMatch query check at the end.
void
test_string() {
using T = std::string;
DataType dtype = DataType::VARCHAR;
int64_t collection_id = 1;
int64_t partition_id = 2;
int64_t segment_id = 3;
int64_t field_id = 101;
int64_t index_build_id = 1000;
int64_t index_version = 10000;
auto field_meta =
test::gen_field_meta(collection_id, partition_id, segment_id, field_id);
auto index_meta = test::gen_index_meta(
segment_id, field_id, index_build_id, index_version);
std::string root_path = "/tmp/test-inverted-index/";
auto storage_config = test::gen_local_storage_config(root_path);
auto cm = storage::CreateChunkManager(storage_config);
size_t nb = 10000;
boost::container::vector<T> data;
for (size_t i = 0; i < nb; i++) {
data.push_back(std::to_string(rand()));
}
// Serialize the column into insert (binlog) format and write it through
// the chunk manager so the index build can read it like production data.
auto field_data = storage::CreateFieldData(dtype);
field_data->FillFieldData(data.data(), data.size());
storage::InsertData insert_data(field_data);
insert_data.SetFieldDataMeta(field_meta);
insert_data.SetTimestamps(0, 100);
auto serialized_bytes = insert_data.Serialize(storage::Remote);
auto get_binlog_path = [=](int64_t log_id) {
return fmt::format("{}/{}/{}/{}/{}",
collection_id,
partition_id,
segment_id,
field_id,
log_id);
};
auto log_path = get_binlog_path(0);
auto cm_w = test::ChunkManagerWrapper(cm);
cm_w.Write(log_path, serialized_bytes.data(), serialized_bytes.size());
storage::FileManagerContext ctx(field_meta, index_meta, cm);
std::vector<std::string> index_files;
// Build phase: create the inverted index from the binlog and upload it.
{
Config config;
config["index_type"] = milvus::index::INVERTED_INDEX_TYPE;
config["insert_files"] = std::vector<std::string>{log_path};
auto index = indexbuilder::IndexFactory::GetInstance().CreateIndex(
dtype, config, ctx);
index->Build();
auto bs = index->Upload();
for (const auto& [key, _] : bs.binary_map_) {
index_files.push_back(key);
}
}
// Load phase: reload the uploaded index and verify query results.
{
index::CreateIndexInfo index_info{};
index_info.index_type = milvus::index::INVERTED_INDEX_TYPE;
index_info.field_type = dtype;
Config config;
config["index_files"] = index_files;
auto index =
index::IndexFactory::GetInstance().CreateIndex(index_info, ctx);
index->Load(config);
auto cnt = index->Count();
ASSERT_EQ(cnt, nb);
using IndexType = index::ScalarIndex<T>;
auto real_index = dynamic_cast<IndexType*>(index.get());
// In: bitset must flag exactly the rows whose value is in the set.
{
boost::container::vector<T> test_data;
std::unordered_set<T> s;
size_t nq = 10;
for (size_t i = 0; i < nq && i < nb; i++) {
test_data.push_back(data[i]);
s.insert(data[i]);
}
auto bitset = real_index->In(test_data.size(), test_data.data());
ASSERT_EQ(cnt, bitset.size());
for (size_t i = 0; i < bitset.size(); i++) {
ASSERT_EQ(bitset[i], s.find(data[i]) != s.end());
}
}
// NotIn: complement of the In result for the same term set.
{
boost::container::vector<T> test_data;
std::unordered_set<T> s;
size_t nq = 10;
for (size_t i = 0; i < nq && i < nb; i++) {
test_data.push_back(data[i]);
s.insert(data[i]);
}
auto bitset = real_index->NotIn(test_data.size(), test_data.data());
ASSERT_EQ(cnt, bitset.size());
for (size_t i = 0; i < bitset.size(); i++) {
ASSERT_NE(bitset[i], s.find(data[i]) != s.end());
}
}
using RefFunc = std::function<bool(int64_t)>;
// Unary range: lexicographic comparisons against "20", checked
// row-by-row against the raw data.
{
std::vector<std::tuple<T, OpType, RefFunc>> test_cases{
{"20",
OpType::GreaterThan,
[&](int64_t i) -> bool { return data[i] > "20"; }},
{"20",
OpType::GreaterEqual,
[&](int64_t i) -> bool { return data[i] >= "20"; }},
{"20",
OpType::LessThan,
[&](int64_t i) -> bool { return data[i] < "20"; }},
{"20",
OpType::LessEqual,
[&](int64_t i) -> bool { return data[i] <= "20"; }},
};
for (const auto& [test_value, op, ref] : test_cases) {
auto bitset = real_index->Range(test_value, op);
ASSERT_EQ(cnt, bitset.size());
for (size_t i = 0; i < bitset.size(); i++) {
auto ans = bitset[i];
auto should = ref(i);
ASSERT_EQ(ans, should)
<< "op: " << op << ", @" << i << ", ans: " << ans
<< ", ref: " << should;
}
}
}
// Binary range: all four inclusive/exclusive bound combinations of
// ("1", "20"), checked against the raw-data reference.
{
std::vector<std::tuple<T, bool, T, bool, RefFunc>> test_cases{
{"1",
false,
"20",
false,
[&](int64_t i) -> bool {
return "1" < data[i] && data[i] < "20";
}},
{"1",
false,
"20",
true,
[&](int64_t i) -> bool {
return "1" < data[i] && data[i] <= "20";
}},
{"1",
true,
"20",
false,
[&](int64_t i) -> bool {
return "1" <= data[i] && data[i] < "20";
}},
{"1",
true,
"20",
true,
[&](int64_t i) -> bool {
return "1" <= data[i] && data[i] <= "20";
}},
};
for (const auto& [lb, lb_inclusive, ub, ub_inclusive, ref] :
test_cases) {
auto bitset =
real_index->Range(lb, lb_inclusive, ub, ub_inclusive);
ASSERT_EQ(cnt, bitset.size());
for (size_t i = 0; i < bitset.size(); i++) {
auto ans = bitset[i];
auto should = ref(i);
ASSERT_EQ(ans, should)
<< "@" << i << ", ans: " << ans << ", ref: " << should;
}
}
}
// PrefixMatch: query with the first row's value as prefix and compare
// against boost::starts_with on the raw data.
{
auto dataset = std::make_shared<Dataset>();
auto prefix = data[0];
dataset->Set(index::OPERATOR_TYPE, OpType::PrefixMatch);
dataset->Set(index::PREFIX_VALUE, prefix);
auto bitset = real_index->Query(dataset);
ASSERT_EQ(cnt, bitset.size());
for (size_t i = 0; i < bitset.size(); i++) {
ASSERT_EQ(bitset[i], boost::starts_with(data[i], prefix));
}
}
}
}
// Run the full build/upload/load/query round-trip for every data type
// the inverted index supports (ARRAY and JSON are not supported).
TEST(InvertedIndex, Naive) {
test_run<int8_t, DataType::INT8>();
test_run<int16_t, DataType::INT16>();
test_run<int32_t, DataType::INT32>();
test_run<int64_t, DataType::INT64>();
test_run<bool, DataType::BOOL>();
test_run<float, DataType::FLOAT>();
test_run<double, DataType::DOUBLE>();
test_string();
}

View File

@ -22,6 +22,7 @@
#include "test_utils/DataGen.h"
#include <boost/filesystem.hpp>
#include "test_utils/storage_test_utils.h"
#include "test_utils/TmpPath.h"
constexpr int64_t nb = 100;
namespace indexcgo = milvus::proto::indexcgo;
@ -75,7 +76,7 @@ TYPED_TEST_P(TypedScalarIndexTest, Count) {
create_index_info);
auto scalar_index =
dynamic_cast<milvus::index::ScalarIndex<T>*>(index.get());
auto arr = GenArr<T>(nb);
auto arr = GenSortedArr<T>(nb);
scalar_index->Build(nb, arr.data());
ASSERT_EQ(nb, scalar_index->Count());
}
@ -94,7 +95,7 @@ TYPED_TEST_P(TypedScalarIndexTest, HasRawData) {
create_index_info);
auto scalar_index =
dynamic_cast<milvus::index::ScalarIndex<T>*>(index.get());
auto arr = GenArr<T>(nb);
auto arr = GenSortedArr<T>(nb);
scalar_index->Build(nb, arr.data());
ASSERT_EQ(nb, scalar_index->Count());
ASSERT_TRUE(scalar_index->HasRawData());
@ -114,7 +115,7 @@ TYPED_TEST_P(TypedScalarIndexTest, In) {
create_index_info);
auto scalar_index =
dynamic_cast<milvus::index::ScalarIndex<T>*>(index.get());
auto arr = GenArr<T>(nb);
auto arr = GenSortedArr<T>(nb);
scalar_index->Build(nb, arr.data());
assert_in<T>(scalar_index, arr);
}
@ -133,7 +134,7 @@ TYPED_TEST_P(TypedScalarIndexTest, NotIn) {
create_index_info);
auto scalar_index =
dynamic_cast<milvus::index::ScalarIndex<T>*>(index.get());
auto arr = GenArr<T>(nb);
auto arr = GenSortedArr<T>(nb);
scalar_index->Build(nb, arr.data());
assert_not_in<T>(scalar_index, arr);
}
@ -152,7 +153,7 @@ TYPED_TEST_P(TypedScalarIndexTest, Reverse) {
create_index_info);
auto scalar_index =
dynamic_cast<milvus::index::ScalarIndex<T>*>(index.get());
auto arr = GenArr<T>(nb);
auto arr = GenSortedArr<T>(nb);
scalar_index->Build(nb, arr.data());
assert_reverse<T>(scalar_index, arr);
}
@ -171,7 +172,7 @@ TYPED_TEST_P(TypedScalarIndexTest, Range) {
create_index_info);
auto scalar_index =
dynamic_cast<milvus::index::ScalarIndex<T>*>(index.get());
auto arr = GenArr<T>(nb);
auto arr = GenSortedArr<T>(nb);
scalar_index->Build(nb, arr.data());
assert_range<T>(scalar_index, arr);
}
@ -190,7 +191,7 @@ TYPED_TEST_P(TypedScalarIndexTest, Codec) {
create_index_info);
auto scalar_index =
dynamic_cast<milvus::index::ScalarIndex<T>*>(index.get());
auto arr = GenArr<T>(nb);
auto arr = GenSortedArr<T>(nb);
scalar_index->Build(nb, arr.data());
auto binary_set = index->Serialize(nullptr);
@ -231,101 +232,90 @@ class TypedScalarIndexTestV2 : public ::testing::Test {
struct Helper {};
protected:
std::unordered_map<std::type_index, const std::shared_ptr<arrow::DataType>>
m_fields = {{typeid(int8_t), arrow::int8()},
{typeid(int16_t), arrow::int16()},
{typeid(int32_t), arrow::int32()},
{typeid(int64_t), arrow::int64()},
{typeid(float), arrow::float32()},
{typeid(double), arrow::float64()}};
std::shared_ptr<arrow::Schema>
TestSchema(int vec_size) {
arrow::FieldVector fields;
fields.push_back(arrow::field("pk", arrow::int64()));
fields.push_back(arrow::field("ts", arrow::int64()));
fields.push_back(arrow::field("scalar", m_fields[typeid(T)]));
fields.push_back(
arrow::field("vec", arrow::fixed_size_binary(vec_size)));
return std::make_shared<arrow::Schema>(fields);
}
std::shared_ptr<arrow::RecordBatchReader>
TestRecords(int vec_size, GeneratedData& dataset, std::vector<T>& scalars) {
arrow::Int64Builder pk_builder;
arrow::Int64Builder ts_builder;
arrow::NumericBuilder<typename Helper::C> scalar_builder;
arrow::FixedSizeBinaryBuilder vec_builder(
arrow::fixed_size_binary(vec_size));
auto xb_data = dataset.get_col<float>(milvus::FieldId(100));
auto data = reinterpret_cast<char*>(xb_data.data());
for (auto i = 0; i < nb; ++i) {
EXPECT_TRUE(pk_builder.Append(i).ok());
EXPECT_TRUE(ts_builder.Append(i).ok());
EXPECT_TRUE(vec_builder.Append(data + i * vec_size).ok());
}
for (auto& v : scalars) {
EXPECT_TRUE(scalar_builder.Append(v).ok());
}
std::shared_ptr<arrow::Array> pk_array;
EXPECT_TRUE(pk_builder.Finish(&pk_array).ok());
std::shared_ptr<arrow::Array> ts_array;
EXPECT_TRUE(ts_builder.Finish(&ts_array).ok());
std::shared_ptr<arrow::Array> scalar_array;
EXPECT_TRUE(scalar_builder.Finish(&scalar_array).ok());
std::shared_ptr<arrow::Array> vec_array;
EXPECT_TRUE(vec_builder.Finish(&vec_array).ok());
auto schema = TestSchema(vec_size);
auto rec_batch = arrow::RecordBatch::Make(
schema, nb, {pk_array, ts_array, scalar_array, vec_array});
auto reader =
arrow::RecordBatchReader::Make({rec_batch}, schema).ValueOrDie();
return reader;
}
std::shared_ptr<milvus_storage::Space>
TestSpace(int vec_size, GeneratedData& dataset, std::vector<T>& scalars) {
auto arrow_schema = TestSchema(vec_size);
auto schema_options = std::make_shared<milvus_storage::SchemaOptions>();
schema_options->primary_column = "pk";
schema_options->version_column = "ts";
schema_options->vector_column = "vec";
auto schema = std::make_shared<milvus_storage::Schema>(arrow_schema,
schema_options);
EXPECT_TRUE(schema->Validate().ok());
auto space_res = milvus_storage::Space::Open(
"file://" + boost::filesystem::canonical(temp_path).string(),
milvus_storage::Options{schema});
EXPECT_TRUE(space_res.has_value());
auto space = std::move(space_res.value());
auto rec = TestRecords(vec_size, dataset, scalars);
auto write_opt = milvus_storage::WriteOption{nb};
space->Write(rec.get(), &write_opt);
return std::move(space);
}
void
SetUp() override {
temp_path = boost::filesystem::temp_directory_path() /
boost::filesystem::unique_path();
boost::filesystem::create_directory(temp_path);
auto vec_size = DIM * 4;
auto dataset = GenDataset(nb, knowhere::metric::L2, false);
auto scalars = GenArr<T>(nb);
space = TestSpace(vec_size, dataset, scalars);
}
void
TearDown() override {
boost::filesystem::remove_all(temp_path);
}
protected:
boost::filesystem::path temp_path;
std::shared_ptr<milvus_storage::Space> space;
};
// Maps each supported C++ scalar type to its arrow DataType, used when
// composing the test schema's "scalar" column.
static std::unordered_map<std::type_index,
const std::shared_ptr<arrow::DataType>>
m_fields = {{typeid(int8_t), arrow::int8()},
{typeid(int16_t), arrow::int16()},
{typeid(int32_t), arrow::int32()},
{typeid(int64_t), arrow::int64()},
{typeid(float), arrow::float32()},
{typeid(double), arrow::float64()}};
// Build the arrow schema for the storage-v2 test space: int64 pk and ts
// columns, the typed scalar column under test, and a fixed-size binary
// vector column of `vec_size` bytes.
template <typename T>
std::shared_ptr<arrow::Schema>
TestSchema(int vec_size) {
    arrow::FieldVector fields;
    fields.push_back(arrow::field("pk", arrow::int64()));
    fields.push_back(arrow::field("ts", arrow::int64()));
    // at() instead of operator[]: an unsupported T should fail loudly
    // (std::out_of_range) rather than silently inserting a null DataType
    // into the map and producing an invalid schema.
    fields.push_back(arrow::field("scalar", m_fields.at(typeid(T))));
    fields.push_back(arrow::field("vec", arrow::fixed_size_binary(vec_size)));
    return std::make_shared<arrow::Schema>(fields);
}
// Build an arrow RecordBatchReader holding nb rows: sequential pk/ts,
// the provided scalar values, and vectors taken from the generated
// dataset's float column (field id 100), reinterpreted as raw bytes.
template <typename T>
std::shared_ptr<arrow::RecordBatchReader>
TestRecords(int vec_size, GeneratedData& dataset, std::vector<T>& scalars) {
arrow::Int64Builder pk_builder;
arrow::Int64Builder ts_builder;
// Helper::C is the arrow builder type matching the scalar C++ type T.
arrow::NumericBuilder<typename TypedScalarIndexTestV2<T>::Helper::C>
scalar_builder;
arrow::FixedSizeBinaryBuilder vec_builder(
arrow::fixed_size_binary(vec_size));
auto xb_data = dataset.get_col<float>(milvus::FieldId(100));
auto data = reinterpret_cast<char*>(xb_data.data());
for (auto i = 0; i < nb; ++i) {
EXPECT_TRUE(pk_builder.Append(i).ok());
EXPECT_TRUE(ts_builder.Append(i).ok());
EXPECT_TRUE(vec_builder.Append(data + i * vec_size).ok());
}
for (auto& v : scalars) {
EXPECT_TRUE(scalar_builder.Append(v).ok());
}
// Finish each builder into an array; column order must match TestSchema.
std::shared_ptr<arrow::Array> pk_array;
EXPECT_TRUE(pk_builder.Finish(&pk_array).ok());
std::shared_ptr<arrow::Array> ts_array;
EXPECT_TRUE(ts_builder.Finish(&ts_array).ok());
std::shared_ptr<arrow::Array> scalar_array;
EXPECT_TRUE(scalar_builder.Finish(&scalar_array).ok());
std::shared_ptr<arrow::Array> vec_array;
EXPECT_TRUE(vec_builder.Finish(&vec_array).ok());
auto schema = TestSchema<T>(vec_size);
auto rec_batch = arrow::RecordBatch::Make(
schema, nb, {pk_array, ts_array, scalar_array, vec_array});
auto reader =
arrow::RecordBatchReader::Make({rec_batch}, schema).ValueOrDie();
return reader;
}
// Open a milvus-storage Space rooted at temp_path (pk/ts/vec column
// roles configured on the schema) and write one batch of test records
// into it. The Space is returned for subsequent index build/load.
template <typename T>
std::shared_ptr<milvus_storage::Space>
TestSpace(boost::filesystem::path& temp_path,
int vec_size,
GeneratedData& dataset,
std::vector<T>& scalars) {
auto arrow_schema = TestSchema<T>(vec_size);
auto schema_options = std::make_shared<milvus_storage::SchemaOptions>();
schema_options->primary_column = "pk";
schema_options->version_column = "ts";
schema_options->vector_column = "vec";
auto schema =
std::make_shared<milvus_storage::Schema>(arrow_schema, schema_options);
EXPECT_TRUE(schema->Validate().ok());
auto space_res = milvus_storage::Space::Open(
"file://" + boost::filesystem::canonical(temp_path).string(),
milvus_storage::Options{schema});
EXPECT_TRUE(space_res.has_value());
auto space = std::move(space_res.value());
auto rec = TestRecords<T>(vec_size, dataset, scalars);
auto write_opt = milvus_storage::WriteOption{nb};
space->Write(rec.get(), &write_opt);
// NOTE(review): std::move on return looks pessimizing, but may be
// needed if `space`'s type requires a converting move into the return
// type — confirm against Space::Open's result type before removing.
return std::move(space);
}
template <>
struct TypedScalarIndexTestV2<int8_t>::Helper {
using C = arrow::Int8Type;
@ -361,7 +351,7 @@ TYPED_TEST_CASE_P(TypedScalarIndexTestV2);
TYPED_TEST_P(TypedScalarIndexTestV2, Base) {
using T = TypeParam;
auto dtype = milvus::GetDType<T>();
auto index_types = GetIndexTypes<T>();
auto index_types = GetIndexTypesV2<T>();
for (const auto& index_type : index_types) {
milvus::index::CreateIndexInfo create_index_info;
create_index_info.field_type = milvus::DataType(dtype);
@ -371,11 +361,18 @@ TYPED_TEST_P(TypedScalarIndexTestV2, Base) {
auto storage_config = get_default_local_storage_config();
auto chunk_manager =
milvus::storage::CreateChunkManager(storage_config);
milvus::test::TmpPath tmp_path;
auto temp_path = tmp_path.get();
auto vec_size = DIM * 4;
auto dataset = GenDataset(nb, knowhere::metric::L2, false);
auto scalars = GenSortedArr<T>(nb);
auto space = TestSpace<T>(temp_path, vec_size, dataset, scalars);
milvus::storage::FileManagerContext file_manager_context(
{}, {.field_name = "scalar"}, chunk_manager, this->space);
{}, {.field_name = "scalar"}, chunk_manager, space);
auto index =
milvus::index::IndexFactory::GetInstance().CreateScalarIndex(
create_index_info, file_manager_context, this->space);
create_index_info, file_manager_context, space);
auto scalar_index =
dynamic_cast<milvus::index::ScalarIndex<T>*>(index.get());
scalar_index->BuildV2();
@ -383,7 +380,7 @@ TYPED_TEST_P(TypedScalarIndexTestV2, Base) {
auto new_index =
milvus::index::IndexFactory::GetInstance().CreateScalarIndex(
create_index_info, file_manager_context, this->space);
create_index_info, file_manager_context, space);
auto new_scalar_index =
dynamic_cast<milvus::index::ScalarIndex<T>*>(new_index.get());
new_scalar_index->LoadV2();

View File

@ -138,7 +138,7 @@ TYPED_TEST_P(TypedScalarIndexCreatorTest, Codec) {
milvus::DataType(dtype),
config,
milvus::storage::FileManagerContext());
auto arr = GenArr<T>(nb);
auto arr = GenSortedArr<T>(nb);
build_index<T>(creator, arr);
auto binary_set = creator->Serialize();
auto copy_creator = milvus::indexbuilder::CreateScalarIndex(

View File

@ -0,0 +1,35 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <boost/filesystem.hpp>
#include <string>
namespace milvus::test {
struct TmpPath {
TmpPath() {
temp_path_ = boost::filesystem::temp_directory_path() /
boost::filesystem::unique_path();
boost::filesystem::create_directory(temp_path_);
}
~TmpPath() {
boost::filesystem::remove_all(temp_path_);
}
auto
get() {
return temp_path_;
}
private:
boost::filesystem::path temp_path_;
};
} // namespace milvus::test

View File

@ -18,6 +18,7 @@
#include <google/protobuf/text_format.h>
#include "DataGen.h"
#include "index/Meta.h"
#include "index/ScalarIndex.h"
#include "index/StringIndex.h"
#include "index/Utils.h"
@ -348,7 +349,7 @@ template <typename T,
typename = typename std::enable_if_t<std::is_arithmetic_v<T> ||
std::is_same_v<T, std::string>>>
inline std::vector<T>
GenArr(int64_t n) {
GenSortedArr(int64_t n) {
auto max_i8 = std::numeric_limits<int8_t>::max() - 1;
std::vector<T> arr;
arr.resize(n);
@ -374,15 +375,14 @@ GenStrArr(int64_t n) {
template <>
inline std::vector<std::string>
GenArr<std::string>(int64_t n) {
GenSortedArr<std::string>(int64_t n) {
return GenStrArr(n);
}
std::vector<ScalarTestParams>
GenBoolParams() {
std::vector<ScalarTestParams> ret;
ret.emplace_back(
ScalarTestParams(MapParams(), {{"index_type", "inverted_index"}}));
ret.emplace_back(ScalarTestParams(MapParams(), {{"index_type", "sort"}}));
ret.emplace_back(ScalarTestParams(MapParams(), {{"index_type", "flat"}}));
return ret;
}
@ -408,8 +408,7 @@ GenParams() {
}
std::vector<ScalarTestParams> ret;
ret.emplace_back(
ScalarTestParams(MapParams(), {{"index_type", "inverted_index"}}));
ret.emplace_back(ScalarTestParams(MapParams(), {{"index_type", "sort"}}));
ret.emplace_back(ScalarTestParams(MapParams(), {{"index_type", "flat"}}));
return ret;
}
@ -442,13 +441,25 @@ GenDsFromPB(const google::protobuf::Message& msg) {
template <typename T>
inline std::vector<std::string>
GetIndexTypes() {
return std::vector<std::string>{"inverted_index"};
return std::vector<std::string>{"sort"};
}
template <>
inline std::vector<std::string>
GetIndexTypes<std::string>() {
return std::vector<std::string>{"marisa"};
return std::vector<std::string>{"sort", "marisa"};
}
template <typename T>
inline std::vector<std::string>
GetIndexTypesV2() {
return std::vector<std::string>{"sort", milvus::index::INVERTED_INDEX_TYPE};
}
// Strings are indexed with the inverted index and the marisa trie instead of
// the sorted index used for arithmetic types.
template <>
inline std::vector<std::string>
GetIndexTypesV2<std::string>() {
    std::vector<std::string> types{milvus::index::INVERTED_INDEX_TYPE};
    types.emplace_back("marisa");
    return types;
}
} // namespace

View File

@ -594,7 +594,7 @@ func (it *indexBuildTask) parseFieldMetaFromBinlog(ctx context.Context) error {
it.partitionID = partitionID
it.segmentID = segmentID
for fID, value := range insertData.Data {
it.fieldType = indexcgowrapper.GenDataset(value).DType
it.fieldType = value.GetDataType()
it.fieldID = fID
break
}

View File

@ -160,6 +160,14 @@ func (cit *createIndexTask) parseIndexParams() error {
if exist && !validateArithmeticIndexType(specifyIndexType) {
return merr.WrapErrParameterInvalid(DefaultArithmeticIndexType, specifyIndexType, "index type not match")
}
} else if typeutil.IsBoolType(cit.fieldSchema.DataType) {
if !exist {
return merr.WrapErrParameterInvalidMsg("no index type specified")
}
if specifyIndexType != InvertedIndexType {
return merr.WrapErrParameterInvalidMsg("index type (%s) not supported for boolean, supported: %s",
specifyIndexType, InvertedIndexType)
}
} else {
return merr.WrapErrParameterInvalid("supported field",
fmt.Sprintf("create index on %s field", cit.fieldSchema.DataType.String()),

View File

@ -67,6 +67,8 @@ const (
// DefaultStringIndexType name of default index type for varChar/string field
DefaultStringIndexType = "Trie"
InvertedIndexType = "INVERTED"
)
var logger = log.L().WithOptions(zap.Fields(zap.String("role", typeutil.ProxyRole)))
@ -247,12 +249,12 @@ func validatePartitionTag(partitionTag string, strictCheck bool) error {
func validateStringIndexType(indexType string) bool {
// compatible with the index type marisa-trie of attu versions prior to 2.3.0
return indexType == DefaultStringIndexType || indexType == "marisa-trie"
return indexType == DefaultStringIndexType || indexType == "marisa-trie" || indexType == InvertedIndexType
}
func validateArithmeticIndexType(indexType string) bool {
// compatible with the index type Asceneding of attu versions prior to 2.3.0
return indexType == DefaultArithmeticIndexType || indexType == "Asceneding"
return indexType == DefaultArithmeticIndexType || indexType == "Asceneding" || indexType == InvertedIndexType
}
func validateFieldName(fieldName string) error {

View File

@ -606,6 +606,7 @@ func (insertCodec *InsertCodec) DeserializeInto(fieldBinlogs []*Blob, rowNum int
stringFieldData := insertData.Data[fieldID].(*StringFieldData)
stringFieldData.Data = append(stringFieldData.Data, stringPayload...)
stringFieldData.DataType = dataType
totalLength += len(stringPayload)
insertData.Data[fieldID] = stringFieldData

View File

@ -330,7 +330,7 @@ func TestInsertCodec(t *testing.T) {
Int64Field: &Int64FieldData{[]int64{}},
FloatField: &FloatFieldData{[]float32{}},
DoubleField: &DoubleFieldData{[]float64{}},
StringField: &StringFieldData{[]string{}},
StringField: &StringFieldData{[]string{}, schemapb.DataType_VarChar},
BinaryVectorField: &BinaryVectorFieldData{[]byte{}, 8},
FloatVectorField: &FloatVectorFieldData{[]float32{}, 4},
Float16VectorField: &Float16VectorFieldData{[]byte{}, 4},
@ -706,7 +706,7 @@ func TestMemorySize(t *testing.T) {
Int64Field: &Int64FieldData{[]int64{}},
FloatField: &FloatFieldData{[]float32{}},
DoubleField: &DoubleFieldData{[]float64{}},
StringField: &StringFieldData{[]string{}},
StringField: &StringFieldData{[]string{}, schemapb.DataType_VarChar},
BinaryVectorField: &BinaryVectorFieldData{[]byte{}, 8},
FloatVectorField: &FloatVectorFieldData{[]float32{}, 4},
},

View File

@ -121,6 +121,7 @@ type FieldData interface {
RowNum() int
GetRow(i int) any
AppendRow(row interface{}) error
GetDataType() schemapb.DataType
}
func NewFieldData(dataType schemapb.DataType, fieldSchema *schemapb.FieldSchema) (FieldData, error) {
@ -198,7 +199,8 @@ func NewFieldData(dataType schemapb.DataType, fieldSchema *schemapb.FieldSchema)
}, nil
case schemapb.DataType_String, schemapb.DataType_VarChar:
return &StringFieldData{
Data: make([]string, 0),
Data: make([]string, 0),
DataType: dataType,
}, nil
default:
return nil, fmt.Errorf("Unexpected schema data type: %d", dataType)
@ -227,7 +229,8 @@ type DoubleFieldData struct {
Data []float64
}
type StringFieldData struct {
Data []string
Data []string
DataType schemapb.DataType
}
type ArrayFieldData struct {
ElementType schemapb.DataType
@ -417,6 +420,29 @@ func (data *BinaryVectorFieldData) GetMemorySize() int { return binary.Size(dat
func (data *FloatVectorFieldData) GetMemorySize() int { return binary.Size(data.Data) + 4 }
func (data *Float16VectorFieldData) GetMemorySize() int { return binary.Size(data.Data) + 4 }
// GetDataType implements FieldData.GetDataType
//
// For these concrete field-data structs the data type is fully determined by
// the struct itself, so each implementation returns the matching constant.
func (data *BoolFieldData) GetDataType() schemapb.DataType { return schemapb.DataType_Bool }
func (data *Int8FieldData) GetDataType() schemapb.DataType { return schemapb.DataType_Int8 }
func (data *Int16FieldData) GetDataType() schemapb.DataType { return schemapb.DataType_Int16 }
func (data *Int32FieldData) GetDataType() schemapb.DataType { return schemapb.DataType_Int32 }
func (data *Int64FieldData) GetDataType() schemapb.DataType { return schemapb.DataType_Int64 }
func (data *FloatFieldData) GetDataType() schemapb.DataType { return schemapb.DataType_Float }
func (data *DoubleFieldData) GetDataType() schemapb.DataType { return schemapb.DataType_Double }

// StringFieldData carries its data type explicitly because the same struct
// backs both DataType_String and DataType_VarChar columns (see NewFieldData),
// so it reports whichever type it was constructed with.
func (data *StringFieldData) GetDataType() schemapb.DataType { return data.DataType }
func (data *ArrayFieldData) GetDataType() schemapb.DataType { return schemapb.DataType_Array }
func (data *JSONFieldData) GetDataType() schemapb.DataType { return schemapb.DataType_JSON }
func (data *BinaryVectorFieldData) GetDataType() schemapb.DataType {
	return schemapb.DataType_BinaryVector
}

func (data *FloatVectorFieldData) GetDataType() schemapb.DataType {
	return schemapb.DataType_FloatVector
}

func (data *Float16VectorFieldData) GetDataType() schemapb.DataType {
	return schemapb.DataType_Float16Vector
}
// why not binary.Size(data) directly? binary.Size(data) return -1
// binary.Size returns how many bytes Write would generate to encode the value v, which
// must be a fixed-size value or a slice of fixed-size values, or a pointer to such data.

View File

@ -147,6 +147,14 @@ func (s *InsertDataSuite) TestMemorySize() {
s.Equal(s.iDataTwoRows.Data[Float16VectorField].GetMemorySize(), 20)
}
// TestGetDataType checks that every FieldData held by the one-row insert data
// reports the same data type as the corresponding field in the schema.
func (s *InsertDataSuite) TestGetDataType() {
	for _, field := range s.schema.GetFields() {
		// Every schema field must have a matching entry in the insert data.
		fieldData, ok := s.iDataOneRow.Data[field.GetFieldID()]
		s.True(ok)
		s.Equal(field.GetDataType(), fieldData.GetDataType())
	}
}
func (s *InsertDataSuite) SetupTest() {
var err error
s.iDataEmpty, err = NewInsertData(s.schema)

View File

@ -171,7 +171,7 @@ func genScalarIndexCases(dtype schemapb.DataType) []indexTestCase {
dtype: dtype,
typeParams: nil,
indexParams: map[string]string{
common.IndexTypeKey: "inverted_index",
common.IndexTypeKey: "sort",
},
},
{
@ -190,7 +190,7 @@ func genStringIndexCases(dtype schemapb.DataType) []indexTestCase {
dtype: dtype,
typeParams: nil,
indexParams: map[string]string{
common.IndexTypeKey: "inverted_index",
common.IndexTypeKey: "sort",
},
},
{

View File

@ -85,7 +85,7 @@ func GenDataset(data storage.FieldData) *Dataset {
}
case *storage.StringFieldData:
return &Dataset{
DType: schemapb.DataType_String,
DType: schemapb.DataType_VarChar,
Data: map[string]interface{}{
keyRawArr: f.Data,
},