mirror of https://github.com/milvus-io/milvus.git
feat: support inverted index (#28783)
issue: https://github.com/milvus-io/milvus/issues/27704 Add inverted index for some data types in Milvus. This index type can save a lot of memory compared to loading all data into RAM and speed up the term query and range query. Supported: `INT8`, `INT16`, `INT32`, `INT64`, `FLOAT`, `DOUBLE`, `BOOL` and `VARCHAR`. Not supported: `ARRAY` and `JSON`. Note: - The inverted index for `VARCHAR` is not designed to serve full-text search now. We will treat every row as a whole keyword instead of tokenizing it into multiple terms. - The inverted index doesn't support retrieval well, so if you create an inverted index for a field, those operations which depend on the raw data will fall back to chunk storage, which will bring some performance loss. For example, comparisons between two columns and retrieval of output fields. The inverted index is very easy to use. Take the collection below as an example: ```python fields = [ FieldSchema(name="pk", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=100), FieldSchema(name="int8", dtype=DataType.INT8), FieldSchema(name="int16", dtype=DataType.INT16), FieldSchema(name="int32", dtype=DataType.INT32), FieldSchema(name="int64", dtype=DataType.INT64), FieldSchema(name="float", dtype=DataType.FLOAT), FieldSchema(name="double", dtype=DataType.DOUBLE), FieldSchema(name="bool", dtype=DataType.BOOL), FieldSchema(name="varchar", dtype=DataType.VARCHAR, max_length=1000), FieldSchema(name="random", dtype=DataType.DOUBLE), FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim), ] schema = CollectionSchema(fields) collection = Collection("demo", schema) ``` Then we can simply create an inverted index for a field via: ```python index_type = "INVERTED" collection.create_index("int8", {"index_type": index_type}) collection.create_index("int16", {"index_type": index_type}) collection.create_index("int32", {"index_type": index_type}) collection.create_index("int64", {"index_type": index_type}) collection.create_index("float", 
{"index_type": index_type}) collection.create_index("double", {"index_type": index_type}) collection.create_index("bool", {"index_type": index_type}) collection.create_index("varchar", {"index_type": index_type}) ``` Then, term queries and range queries on the field can be sped up automatically by the inverted index: ```python result = collection.query(expr='int64 in [1, 2, 3]', output_fields=["pk"]) result = collection.query(expr='int64 < 5', output_fields=["pk"]) result = collection.query(expr='int64 > 2997', output_fields=["pk"]) result = collection.query(expr='1 < int64 < 5', output_fields=["pk"]) ``` --------- Signed-off-by: longjiquan <jiquan.long@zilliz.com>pull/29610/head
parent
984e7bba9b
commit
3f46c6d459
|
@ -7,12 +7,13 @@ fi
|
|||
CorePath=$1

# Run clang-format in-place over every C++ source/header under the given
# directory, skipping generated template code and protobuf-generated files.
# NOTE: the original pattern "(*\.cpp|*\.h|*\.cc)$" has a leading '*' with
# nothing to repeat, which is undefined in POSIX ERE; anchor on the
# extension instead.
formatThis() {
    find "$1" | grep -E "\.(cpp|h|cc)$" | grep -v "gen_tools/templates" | grep -v "\.pb\." | xargs clang-format-10 -i
}

formatThis "${CorePath}/src"
formatThis "${CorePath}/unittest"
formatThis "${CorePath}/unittest/bench"
formatThis "${CorePath}/thirdparty/tantivy"

# Stamp license headers onto C++ and CMake files (the cmake step was
# accidentally duplicated in the original; run it once).
"${CorePath}/build-support/add_cpp_license.sh" "${CorePath}/build-support/cpp_license.txt" "${CorePath}"
"${CorePath}/build-support/add_cmake_license.sh" "${CorePath}/build-support/cmake_license.txt" "${CorePath}"
|
||||
|
|
|
@ -1,3 +1,14 @@
|
|||
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
// or implied. See the License for the specific language governing permissions and limitations under the License
|
||||
|
||||
#include <oneapi/tbb/concurrent_queue.h>
|
||||
|
||||
#include <atomic>
|
||||
|
|
|
@ -18,11 +18,12 @@ set(INDEX_FILES
|
|||
ScalarIndex.cpp
|
||||
ScalarIndexSort.cpp
|
||||
SkipIndex.cpp
|
||||
InvertedIndexTantivy.cpp
|
||||
)
|
||||
|
||||
milvus_add_pkg_config("milvus_index")
|
||||
add_library(milvus_index SHARED ${INDEX_FILES})
|
||||
|
||||
target_link_libraries(milvus_index milvus_storage milvus-storage)
|
||||
target_link_libraries(milvus_index milvus_storage milvus-storage tantivy_binding)
|
||||
|
||||
install(TARGETS milvus_index DESTINATION "${CMAKE_INSTALL_LIBDIR}")
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
#include "index/ScalarIndexSort.h"
|
||||
#include "index/StringIndexMarisa.h"
|
||||
#include "index/BoolIndex.h"
|
||||
#include "index/InvertedIndexTantivy.h"
|
||||
|
||||
namespace milvus::index {
|
||||
|
||||
|
@ -32,7 +33,14 @@ template <typename T>
|
|||
template <typename T>
ScalarIndexPtr<T>
IndexFactory::CreateScalarIndex(
    const IndexType& index_type,
    const storage::FileManagerContext& file_manager_context,
    DataType d_type) {
    // The tantivy-backed inverted index needs the concrete field data type
    // to choose the matching tantivy column type; every other scalar index
    // type ignores d_type and falls back to the sort-based index.
    if (index_type == INVERTED_INDEX_TYPE) {
        TantivyConfig cfg;
        cfg.data_type_ = d_type;
        return std::make_unique<InvertedIndexTantivy<T>>(cfg,
                                                         file_manager_context);
    }
    return CreateScalarIndexSort<T>(file_manager_context);
}
|
||||
|
||||
|
@ -44,11 +52,18 @@ IndexFactory::CreateScalarIndex(
|
|||
//
|
||||
|
||||
template <>
|
||||
ScalarIndexPtr<std::string>
|
||||
IndexFactory::CreateScalarIndex<std::string>(
|
||||
inline ScalarIndexPtr<std::string>
|
||||
IndexFactory::CreateScalarIndex(
|
||||
const IndexType& index_type,
|
||||
const storage::FileManagerContext& file_manager_context) {
|
||||
const storage::FileManagerContext& file_manager_context,
|
||||
DataType d_type) {
|
||||
#if defined(__linux__) || defined(__APPLE__)
|
||||
if (index_type == INVERTED_INDEX_TYPE) {
|
||||
TantivyConfig cfg;
|
||||
cfg.data_type_ = d_type;
|
||||
return std::make_unique<InvertedIndexTantivy<std::string>>(
|
||||
cfg, file_manager_context);
|
||||
}
|
||||
return CreateStringIndexMarisa(file_manager_context);
|
||||
#else
|
||||
throw SegcoreError(Unsupported, "unsupported platform");
|
||||
|
@ -60,7 +75,14 @@ ScalarIndexPtr<T>
|
|||
IndexFactory::CreateScalarIndex(
|
||||
const IndexType& index_type,
|
||||
const storage::FileManagerContext& file_manager_context,
|
||||
std::shared_ptr<milvus_storage::Space> space) {
|
||||
std::shared_ptr<milvus_storage::Space> space,
|
||||
DataType d_type) {
|
||||
if (index_type == INVERTED_INDEX_TYPE) {
|
||||
TantivyConfig cfg;
|
||||
cfg.data_type_ = d_type;
|
||||
return std::make_unique<InvertedIndexTantivy<T>>(
|
||||
cfg, file_manager_context, space);
|
||||
}
|
||||
return CreateScalarIndexSort<T>(file_manager_context, space);
|
||||
}
|
||||
|
||||
|
@ -69,8 +91,15 @@ ScalarIndexPtr<std::string>
|
|||
IndexFactory::CreateScalarIndex(
|
||||
const IndexType& index_type,
|
||||
const storage::FileManagerContext& file_manager_context,
|
||||
std::shared_ptr<milvus_storage::Space> space) {
|
||||
std::shared_ptr<milvus_storage::Space> space,
|
||||
DataType d_type) {
|
||||
#if defined(__linux__) || defined(__APPLE__)
|
||||
if (index_type == INVERTED_INDEX_TYPE) {
|
||||
TantivyConfig cfg;
|
||||
cfg.data_type_ = d_type;
|
||||
return std::make_unique<InvertedIndexTantivy<std::string>>(
|
||||
cfg, file_manager_context, space);
|
||||
}
|
||||
return CreateStringIndexMarisa(file_manager_context, space);
|
||||
#else
|
||||
throw SegcoreError(Unsupported, "unsupported platform");
|
||||
|
@ -111,25 +140,32 @@ IndexFactory::CreateScalarIndex(
|
|||
switch (data_type) {
|
||||
// create scalar index
|
||||
case DataType::BOOL:
|
||||
return CreateScalarIndex<bool>(index_type, file_manager_context);
|
||||
return CreateScalarIndex<bool>(
|
||||
index_type, file_manager_context, data_type);
|
||||
case DataType::INT8:
|
||||
return CreateScalarIndex<int8_t>(index_type, file_manager_context);
|
||||
return CreateScalarIndex<int8_t>(
|
||||
index_type, file_manager_context, data_type);
|
||||
case DataType::INT16:
|
||||
return CreateScalarIndex<int16_t>(index_type, file_manager_context);
|
||||
return CreateScalarIndex<int16_t>(
|
||||
index_type, file_manager_context, data_type);
|
||||
case DataType::INT32:
|
||||
return CreateScalarIndex<int32_t>(index_type, file_manager_context);
|
||||
return CreateScalarIndex<int32_t>(
|
||||
index_type, file_manager_context, data_type);
|
||||
case DataType::INT64:
|
||||
return CreateScalarIndex<int64_t>(index_type, file_manager_context);
|
||||
return CreateScalarIndex<int64_t>(
|
||||
index_type, file_manager_context, data_type);
|
||||
case DataType::FLOAT:
|
||||
return CreateScalarIndex<float>(index_type, file_manager_context);
|
||||
return CreateScalarIndex<float>(
|
||||
index_type, file_manager_context, data_type);
|
||||
case DataType::DOUBLE:
|
||||
return CreateScalarIndex<double>(index_type, file_manager_context);
|
||||
return CreateScalarIndex<double>(
|
||||
index_type, file_manager_context, data_type);
|
||||
|
||||
// create string index
|
||||
case DataType::STRING:
|
||||
case DataType::VARCHAR:
|
||||
return CreateScalarIndex<std::string>(index_type,
|
||||
file_manager_context);
|
||||
return CreateScalarIndex<std::string>(
|
||||
index_type, file_manager_context, data_type);
|
||||
default:
|
||||
throw SegcoreError(
|
||||
DataTypeInvalid,
|
||||
|
@ -187,25 +223,32 @@ IndexFactory::CreateScalarIndex(const CreateIndexInfo& create_index_info,
|
|||
switch (data_type) {
|
||||
// create scalar index
|
||||
case DataType::BOOL:
|
||||
return CreateScalarIndex<bool>(index_type, file_manager, space);
|
||||
return CreateScalarIndex<bool>(
|
||||
index_type, file_manager, space, data_type);
|
||||
case DataType::INT8:
|
||||
return CreateScalarIndex<int8_t>(index_type, file_manager, space);
|
||||
return CreateScalarIndex<int8_t>(
|
||||
index_type, file_manager, space, data_type);
|
||||
case DataType::INT16:
|
||||
return CreateScalarIndex<int16_t>(index_type, file_manager, space);
|
||||
return CreateScalarIndex<int16_t>(
|
||||
index_type, file_manager, space, data_type);
|
||||
case DataType::INT32:
|
||||
return CreateScalarIndex<int32_t>(index_type, file_manager, space);
|
||||
return CreateScalarIndex<int32_t>(
|
||||
index_type, file_manager, space, data_type);
|
||||
case DataType::INT64:
|
||||
return CreateScalarIndex<int64_t>(index_type, file_manager, space);
|
||||
return CreateScalarIndex<int64_t>(
|
||||
index_type, file_manager, space, data_type);
|
||||
case DataType::FLOAT:
|
||||
return CreateScalarIndex<float>(index_type, file_manager, space);
|
||||
return CreateScalarIndex<float>(
|
||||
index_type, file_manager, space, data_type);
|
||||
case DataType::DOUBLE:
|
||||
return CreateScalarIndex<double>(index_type, file_manager, space);
|
||||
return CreateScalarIndex<double>(
|
||||
index_type, file_manager, space, data_type);
|
||||
|
||||
// create string index
|
||||
case DataType::STRING:
|
||||
case DataType::VARCHAR:
|
||||
return CreateScalarIndex<std::string>(
|
||||
index_type, file_manager, space);
|
||||
index_type, file_manager, space, data_type);
|
||||
default:
|
||||
throw SegcoreError(
|
||||
DataTypeInvalid,
|
||||
|
|
|
@ -60,6 +60,7 @@ class IndexFactory {
|
|||
CreateIndex(const CreateIndexInfo& create_index_info,
|
||||
const storage::FileManagerContext& file_manager_context,
|
||||
std::shared_ptr<milvus_storage::Space> space);
|
||||
|
||||
IndexBasePtr
|
||||
CreateVectorIndex(const CreateIndexInfo& create_index_info,
|
||||
const storage::FileManagerContext& file_manager_context);
|
||||
|
@ -86,18 +87,29 @@ class IndexFactory {
|
|||
ScalarIndexPtr<T>
|
||||
CreateScalarIndex(const IndexType& index_type,
|
||||
const storage::FileManagerContext& file_manager =
|
||||
storage::FileManagerContext());
|
||||
storage::FileManagerContext(),
|
||||
DataType d_type = DataType::NONE);
|
||||
|
||||
template <typename T>
|
||||
ScalarIndexPtr<T>
|
||||
CreateScalarIndex(const IndexType& index_type,
|
||||
const storage::FileManagerContext& file_manager,
|
||||
std::shared_ptr<milvus_storage::Space> space);
|
||||
std::shared_ptr<milvus_storage::Space> space,
|
||||
DataType d_type = DataType::NONE);
|
||||
};
|
||||
|
||||
template <>
|
||||
ScalarIndexPtr<std::string>
|
||||
IndexFactory::CreateScalarIndex<std::string>(
|
||||
const IndexType& index_type,
|
||||
const storage::FileManagerContext& file_manager_context);
|
||||
const storage::FileManagerContext& file_manager_context,
|
||||
DataType d_type);
|
||||
|
||||
template <>
|
||||
ScalarIndexPtr<std::string>
|
||||
IndexFactory::CreateScalarIndex<std::string>(
|
||||
const IndexType& index_type,
|
||||
const storage::FileManagerContext& file_manager_context,
|
||||
std::shared_ptr<milvus_storage::Space> space,
|
||||
DataType d_type);
|
||||
} // namespace milvus::index
|
||||
|
|
|
@ -0,0 +1,410 @@
|
|||
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
// or implied. See the License for the specific language governing permissions and limitations under the License
|
||||
|
||||
#include "index/InvertedIndexTantivy.h"

// std::filesystem is required by Upload()'s directory_iterator; it was
// previously pulled in only transitively.
#include <filesystem>

#include <boost/filesystem.hpp>

#include "tantivy-binding.h"
#include "common/Slice.h"
#include "storage/LocalChunkManagerSingleton.h"
#include "log/Log.h"
#include "index/Utils.h"
#include "storage/Util.h"
|
||||
|
||||
namespace milvus::index {
|
||||
template <typename T>
|
||||
InvertedIndexTantivy<T>::InvertedIndexTantivy(
|
||||
const TantivyConfig& cfg,
|
||||
const storage::FileManagerContext& ctx,
|
||||
std::shared_ptr<milvus_storage::Space> space)
|
||||
: cfg_(cfg), space_(space) {
|
||||
mem_file_manager_ = std::make_shared<MemFileManager>(ctx, ctx.space_);
|
||||
disk_file_manager_ = std::make_shared<DiskFileManager>(ctx, ctx.space_);
|
||||
auto field =
|
||||
std::to_string(disk_file_manager_->GetFieldDataMeta().field_id);
|
||||
auto prefix = disk_file_manager_->GetLocalIndexObjectPrefix();
|
||||
path_ = fmt::format("/tmp/{}", prefix);
|
||||
boost::filesystem::create_directories(path_);
|
||||
d_type_ = cfg_.to_tantivy_data_type();
|
||||
if (tantivy_index_exist(path_.c_str())) {
|
||||
LOG_INFO(
|
||||
"index {} already exists, which should happen in loading progress",
|
||||
path_);
|
||||
} else {
|
||||
wrapper_ = std::make_shared<TantivyIndexWrapper>(
|
||||
field.c_str(), d_type_, path_.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
InvertedIndexTantivy<T>::~InvertedIndexTantivy() {
|
||||
auto local_chunk_manager =
|
||||
storage::LocalChunkManagerSingleton::GetInstance().GetChunkManager();
|
||||
auto prefix = path_;
|
||||
local_chunk_manager->RemoveDir(prefix);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
InvertedIndexTantivy<T>::finish() {
|
||||
wrapper_->finish();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
BinarySet
|
||||
InvertedIndexTantivy<T>::Serialize(const Config& config) {
|
||||
BinarySet res_set;
|
||||
|
||||
return res_set;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
BinarySet
|
||||
InvertedIndexTantivy<T>::Upload(const Config& config) {
|
||||
finish();
|
||||
|
||||
for (const auto& entry : std::filesystem::directory_iterator(path_)) {
|
||||
disk_file_manager_->AddFile(entry.path());
|
||||
}
|
||||
|
||||
BinarySet ret;
|
||||
|
||||
auto remote_paths_to_size = disk_file_manager_->GetRemotePathsToFileSize();
|
||||
for (auto& file : remote_paths_to_size) {
|
||||
ret.Append(file.first, nullptr, file.second);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
BinarySet
|
||||
InvertedIndexTantivy<T>::UploadV2(const Config& config) {
|
||||
return Upload(config);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
InvertedIndexTantivy<T>::Build(const Config& config) {
|
||||
auto insert_files =
|
||||
GetValueFromConfig<std::vector<std::string>>(config, "insert_files");
|
||||
AssertInfo(insert_files.has_value(), "insert_files were empty");
|
||||
auto field_datas =
|
||||
mem_file_manager_->CacheRawDataToMemory(insert_files.value());
|
||||
switch (cfg_.data_type_) {
|
||||
case DataType::BOOL: {
|
||||
for (const auto& data : field_datas) {
|
||||
auto n = data->get_num_rows();
|
||||
wrapper_->add_data<bool>(static_cast<const bool*>(data->Data()),
|
||||
n);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case DataType::INT8: {
|
||||
for (const auto& data : field_datas) {
|
||||
auto n = data->get_num_rows();
|
||||
wrapper_->add_data<int8_t>(
|
||||
static_cast<const int8_t*>(data->Data()), n);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case DataType::INT16: {
|
||||
for (const auto& data : field_datas) {
|
||||
auto n = data->get_num_rows();
|
||||
wrapper_->add_data<int16_t>(
|
||||
static_cast<const int16_t*>(data->Data()), n);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case DataType::INT32: {
|
||||
for (const auto& data : field_datas) {
|
||||
auto n = data->get_num_rows();
|
||||
wrapper_->add_data<int32_t>(
|
||||
static_cast<const int32_t*>(data->Data()), n);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case DataType::INT64: {
|
||||
for (const auto& data : field_datas) {
|
||||
auto n = data->get_num_rows();
|
||||
wrapper_->add_data<int64_t>(
|
||||
static_cast<const int64_t*>(data->Data()), n);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case DataType::FLOAT: {
|
||||
for (const auto& data : field_datas) {
|
||||
auto n = data->get_num_rows();
|
||||
wrapper_->add_data<float>(
|
||||
static_cast<const float*>(data->Data()), n);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case DataType::DOUBLE: {
|
||||
for (const auto& data : field_datas) {
|
||||
auto n = data->get_num_rows();
|
||||
wrapper_->add_data<double>(
|
||||
static_cast<const double*>(data->Data()), n);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case DataType::VARCHAR: {
|
||||
for (const auto& data : field_datas) {
|
||||
auto n = data->get_num_rows();
|
||||
wrapper_->add_data<std::string>(
|
||||
static_cast<const std::string*>(data->Data()), n);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
PanicInfo(ErrorCode::NotImplemented,
|
||||
fmt::format("todo: not supported, {}", cfg_.data_type_));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
InvertedIndexTantivy<T>::BuildV2(const Config& config) {
|
||||
auto field_name = mem_file_manager_->GetIndexMeta().field_name;
|
||||
auto res = space_->ScanData();
|
||||
if (!res.ok()) {
|
||||
PanicInfo(S3Error, "failed to create scan iterator");
|
||||
}
|
||||
auto reader = res.value();
|
||||
std::vector<FieldDataPtr> field_datas;
|
||||
for (auto rec = reader->Next(); rec != nullptr; rec = reader->Next()) {
|
||||
if (!rec.ok()) {
|
||||
PanicInfo(DataFormatBroken, "failed to read data");
|
||||
}
|
||||
auto data = rec.ValueUnsafe();
|
||||
auto total_num_rows = data->num_rows();
|
||||
auto col_data = data->GetColumnByName(field_name);
|
||||
auto field_data = storage::CreateFieldData(
|
||||
DataType(GetDType<T>()), 0, total_num_rows);
|
||||
field_data->FillFieldData(col_data);
|
||||
field_datas.push_back(field_data);
|
||||
}
|
||||
|
||||
switch (cfg_.data_type_) {
|
||||
case DataType::BOOL: {
|
||||
for (const auto& data : field_datas) {
|
||||
auto n = data->get_num_rows();
|
||||
wrapper_->add_data<bool>(static_cast<const bool*>(data->Data()),
|
||||
n);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case DataType::INT8: {
|
||||
for (const auto& data : field_datas) {
|
||||
auto n = data->get_num_rows();
|
||||
wrapper_->add_data<int8_t>(
|
||||
static_cast<const int8_t*>(data->Data()), n);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case DataType::INT16: {
|
||||
for (const auto& data : field_datas) {
|
||||
auto n = data->get_num_rows();
|
||||
wrapper_->add_data<int16_t>(
|
||||
static_cast<const int16_t*>(data->Data()), n);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case DataType::INT32: {
|
||||
for (const auto& data : field_datas) {
|
||||
auto n = data->get_num_rows();
|
||||
wrapper_->add_data<int32_t>(
|
||||
static_cast<const int32_t*>(data->Data()), n);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case DataType::INT64: {
|
||||
for (const auto& data : field_datas) {
|
||||
auto n = data->get_num_rows();
|
||||
wrapper_->add_data<int64_t>(
|
||||
static_cast<const int64_t*>(data->Data()), n);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case DataType::FLOAT: {
|
||||
for (const auto& data : field_datas) {
|
||||
auto n = data->get_num_rows();
|
||||
wrapper_->add_data<float>(
|
||||
static_cast<const float*>(data->Data()), n);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case DataType::DOUBLE: {
|
||||
for (const auto& data : field_datas) {
|
||||
auto n = data->get_num_rows();
|
||||
wrapper_->add_data<double>(
|
||||
static_cast<const double*>(data->Data()), n);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case DataType::VARCHAR: {
|
||||
for (const auto& data : field_datas) {
|
||||
auto n = data->get_num_rows();
|
||||
wrapper_->add_data<std::string>(
|
||||
static_cast<const std::string*>(data->Data()), n);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
PanicInfo(ErrorCode::NotImplemented,
|
||||
fmt::format("todo: not supported, {}", cfg_.data_type_));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
InvertedIndexTantivy<T>::Load(const Config& config) {
|
||||
auto index_files =
|
||||
GetValueFromConfig<std::vector<std::string>>(config, "index_files");
|
||||
AssertInfo(index_files.has_value(),
|
||||
"index file paths is empty when load disk ann index data");
|
||||
auto prefix = disk_file_manager_->GetLocalIndexObjectPrefix();
|
||||
disk_file_manager_->CacheIndexToDisk(index_files.value());
|
||||
wrapper_ = std::make_shared<TantivyIndexWrapper>(prefix.c_str());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
InvertedIndexTantivy<T>::LoadV2(const Config& config) {
|
||||
disk_file_manager_->CacheIndexToDisk();
|
||||
auto prefix = disk_file_manager_->GetLocalIndexObjectPrefix();
|
||||
wrapper_ = std::make_shared<TantivyIndexWrapper>(prefix.c_str());
|
||||
}
|
||||
|
||||
inline void
|
||||
apply_hits(TargetBitmap& bitset, const RustArrayWrapper& w, bool v) {
|
||||
for (size_t j = 0; j < w.array_.len; j++) {
|
||||
bitset[w.array_.array[j]] = v;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const TargetBitmap
|
||||
InvertedIndexTantivy<T>::In(size_t n, const T* values) {
|
||||
TargetBitmap bitset(Count());
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
auto array = wrapper_->term_query(values[i]);
|
||||
apply_hits(bitset, array, true);
|
||||
}
|
||||
return bitset;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const TargetBitmap
|
||||
InvertedIndexTantivy<T>::NotIn(size_t n, const T* values) {
|
||||
TargetBitmap bitset(Count(), true);
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
auto array = wrapper_->term_query(values[i]);
|
||||
apply_hits(bitset, array, false);
|
||||
}
|
||||
return bitset;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const TargetBitmap
|
||||
InvertedIndexTantivy<T>::Range(T value, OpType op) {
|
||||
TargetBitmap bitset(Count());
|
||||
|
||||
switch (op) {
|
||||
case OpType::LessThan: {
|
||||
auto array = wrapper_->upper_bound_range_query(value, false);
|
||||
apply_hits(bitset, array, true);
|
||||
} break;
|
||||
case OpType::LessEqual: {
|
||||
auto array = wrapper_->upper_bound_range_query(value, true);
|
||||
apply_hits(bitset, array, true);
|
||||
} break;
|
||||
case OpType::GreaterThan: {
|
||||
auto array = wrapper_->lower_bound_range_query(value, false);
|
||||
apply_hits(bitset, array, true);
|
||||
} break;
|
||||
case OpType::GreaterEqual: {
|
||||
auto array = wrapper_->lower_bound_range_query(value, true);
|
||||
apply_hits(bitset, array, true);
|
||||
} break;
|
||||
default:
|
||||
throw SegcoreError(OpTypeInvalid,
|
||||
fmt::format("Invalid OperatorType: {}", op));
|
||||
}
|
||||
|
||||
return bitset;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const TargetBitmap
|
||||
InvertedIndexTantivy<T>::Range(T lower_bound_value,
|
||||
bool lb_inclusive,
|
||||
T upper_bound_value,
|
||||
bool ub_inclusive) {
|
||||
TargetBitmap bitset(Count());
|
||||
auto array = wrapper_->range_query(
|
||||
lower_bound_value, upper_bound_value, lb_inclusive, ub_inclusive);
|
||||
apply_hits(bitset, array, true);
|
||||
return bitset;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const TargetBitmap
|
||||
InvertedIndexTantivy<T>::PrefixMatch(const std::string_view prefix) {
|
||||
TargetBitmap bitset(Count());
|
||||
std::string s(prefix);
|
||||
auto array = wrapper_->prefix_query(s);
|
||||
apply_hits(bitset, array, true);
|
||||
return bitset;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const TargetBitmap
|
||||
InvertedIndexTantivy<T>::Query(const DatasetPtr& dataset) {
|
||||
return ScalarIndex<T>::Query(dataset);
|
||||
}
|
||||
|
||||
template <>
|
||||
const TargetBitmap
|
||||
InvertedIndexTantivy<std::string>::Query(const DatasetPtr& dataset) {
|
||||
auto op = dataset->Get<OpType>(OPERATOR_TYPE);
|
||||
if (op == OpType::PrefixMatch) {
|
||||
auto prefix = dataset->Get<std::string>(PREFIX_VALUE);
|
||||
return PrefixMatch(prefix);
|
||||
}
|
||||
return ScalarIndex<std::string>::Query(dataset);
|
||||
}
|
||||
|
||||
template class InvertedIndexTantivy<bool>;
|
||||
template class InvertedIndexTantivy<int8_t>;
|
||||
template class InvertedIndexTantivy<int16_t>;
|
||||
template class InvertedIndexTantivy<int32_t>;
|
||||
template class InvertedIndexTantivy<int64_t>;
|
||||
template class InvertedIndexTantivy<float>;
|
||||
template class InvertedIndexTantivy<double>;
|
||||
template class InvertedIndexTantivy<std::string>;
|
||||
} // namespace milvus::index
|
|
@ -0,0 +1,176 @@
|
|||
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
// or implied. See the License for the specific language governing permissions and limitations under the License
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "index/Index.h"
|
||||
#include "storage/FileManager.h"
|
||||
#include "storage/DiskFileManagerImpl.h"
|
||||
#include "storage/MemFileManagerImpl.h"
|
||||
#include "tantivy-binding.h"
|
||||
#include "tantivy-wrapper.h"
|
||||
#include "index/StringIndex.h"
|
||||
#include "index/TantivyConfig.h"
|
||||
#include "storage/space.h"
|
||||
|
||||
namespace milvus::index {
|
||||
|
||||
using TantivyIndexWrapper = milvus::tantivy::TantivyIndexWrapper;
|
||||
using RustArrayWrapper = milvus::tantivy::RustArrayWrapper;
|
||||
|
||||
template <typename T>
|
||||
class InvertedIndexTantivy : public ScalarIndex<T> {
|
||||
public:
|
||||
using MemFileManager = storage::MemFileManagerImpl;
|
||||
using MemFileManagerPtr = std::shared_ptr<MemFileManager>;
|
||||
using DiskFileManager = storage::DiskFileManagerImpl;
|
||||
using DiskFileManagerPtr = std::shared_ptr<DiskFileManager>;
|
||||
|
||||
explicit InvertedIndexTantivy(const TantivyConfig& cfg,
|
||||
const storage::FileManagerContext& ctx)
|
||||
: InvertedIndexTantivy(cfg, ctx, nullptr) {
|
||||
}
|
||||
|
||||
explicit InvertedIndexTantivy(const TantivyConfig& cfg,
|
||||
const storage::FileManagerContext& ctx,
|
||||
std::shared_ptr<milvus_storage::Space> space);
|
||||
|
||||
~InvertedIndexTantivy();
|
||||
|
||||
/*
|
||||
* deprecated.
|
||||
* TODO: why not remove this?
|
||||
*/
|
||||
void
|
||||
Load(const BinarySet& binary_set, const Config& config = {}) override {
|
||||
PanicInfo(ErrorCode::NotImplemented, "load v1 should be deprecated");
|
||||
}
|
||||
|
||||
void
|
||||
Load(const Config& config = {}) override;
|
||||
|
||||
void
|
||||
LoadV2(const Config& config = {}) override;
|
||||
|
||||
/*
|
||||
* deprecated.
|
||||
* TODO: why not remove this?
|
||||
*/
|
||||
void
|
||||
BuildWithDataset(const DatasetPtr& dataset,
|
||||
const Config& config = {}) override {
|
||||
PanicInfo(ErrorCode::NotImplemented,
|
||||
"BuildWithDataset should be deprecated");
|
||||
}
|
||||
|
||||
void
|
||||
Build(const Config& config = {}) override;
|
||||
|
||||
void
|
||||
BuildV2(const Config& config = {}) override;
|
||||
|
||||
int64_t
|
||||
Count() override {
|
||||
return wrapper_->count();
|
||||
}
|
||||
|
||||
/*
|
||||
* deprecated.
|
||||
* TODO: why not remove this?
|
||||
*/
|
||||
void
|
||||
BuildWithRawData(size_t n,
|
||||
const void* values,
|
||||
const Config& config = {}) override {
|
||||
PanicInfo(ErrorCode::NotImplemented,
|
||||
"BuildWithRawData should be deprecated");
|
||||
}
|
||||
|
||||
/*
|
||||
* deprecated.
|
||||
* TODO: why not remove this?
|
||||
*/
|
||||
BinarySet
|
||||
Serialize(const Config& config /* not used */) override;
|
||||
|
||||
BinarySet
|
||||
Upload(const Config& config = {}) override;
|
||||
|
||||
BinarySet
|
||||
UploadV2(const Config& config = {}) override;
|
||||
|
||||
/*
|
||||
* deprecated, only used in small chunk index.
|
||||
*/
|
||||
void
|
||||
Build(size_t n, const T* values) override {
|
||||
PanicInfo(ErrorCode::NotImplemented, "Build should not be called");
|
||||
}
|
||||
|
||||
const TargetBitmap
|
||||
In(size_t n, const T* values) override;
|
||||
|
||||
const TargetBitmap
|
||||
NotIn(size_t n, const T* values) override;
|
||||
|
||||
const TargetBitmap
|
||||
Range(T value, OpType op) override;
|
||||
|
||||
const TargetBitmap
|
||||
Range(T lower_bound_value,
|
||||
bool lb_inclusive,
|
||||
T upper_bound_value,
|
||||
bool ub_inclusive) override;
|
||||
|
||||
const bool
|
||||
HasRawData() const override {
|
||||
return false;
|
||||
}
|
||||
|
||||
T
|
||||
Reverse_Lookup(size_t offset) const override {
|
||||
PanicInfo(ErrorCode::NotImplemented,
|
||||
"Reverse_Lookup should not be handled by inverted index");
|
||||
}
|
||||
|
||||
int64_t
|
||||
Size() override {
|
||||
return Count();
|
||||
}
|
||||
|
||||
const TargetBitmap
|
||||
PrefixMatch(const std::string_view prefix);
|
||||
|
||||
const TargetBitmap
|
||||
Query(const DatasetPtr& dataset) override;
|
||||
|
||||
private:
|
||||
void
|
||||
finish();
|
||||
|
||||
private:
|
||||
std::shared_ptr<TantivyIndexWrapper> wrapper_;
|
||||
TantivyConfig cfg_;
|
||||
TantivyDataType d_type_;
|
||||
std::string path_;
|
||||
|
||||
/*
|
||||
* To avoid IO amplification, we use both mem file manager & disk file manager
|
||||
* 1, build phase, we just need the raw data in memory, we use MemFileManager.CacheRawDataToMemory;
|
||||
* 2, upload phase, the index was already on the disk, we use DiskFileManager.AddFile directly;
|
||||
* 3, load phase, we need the index on the disk instead of memory, we use DiskFileManager.CacheIndexToDisk;
|
||||
* Btw, this approach can be applied to DiskANN also.
|
||||
*/
|
||||
MemFileManagerPtr mem_file_manager_;
|
||||
DiskFileManagerPtr disk_file_manager_;
|
||||
std::shared_ptr<milvus_storage::Space> space_;
|
||||
};
|
||||
} // namespace milvus::index
|
|
@ -36,6 +36,7 @@ constexpr const char* METRIC_TYPE = "metric_type";
|
|||
// scalar index type
|
||||
constexpr const char* ASCENDING_SORT = "STL_SORT";
|
||||
constexpr const char* MARISA_TRIE = "Trie";
|
||||
constexpr const char* INVERTED_INDEX_TYPE = "INVERTED";
|
||||
|
||||
// index meta
|
||||
constexpr const char* COLLECTION_ID = "collection_id";
|
||||
|
|
|
@ -0,0 +1,51 @@
|
|||
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
// or implied. See the License for the specific language governing permissions and limitations under the License
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "storage/Types.h"
|
||||
#include "tantivy-binding.h"
|
||||
|
||||
namespace milvus::index {
|
||||
struct TantivyConfig {
|
||||
DataType data_type_;
|
||||
|
||||
TantivyDataType
|
||||
to_tantivy_data_type() {
|
||||
switch (data_type_) {
|
||||
case DataType::BOOL: {
|
||||
return TantivyDataType::Bool;
|
||||
}
|
||||
|
||||
case DataType::INT8:
|
||||
case DataType::INT16:
|
||||
case DataType::INT32:
|
||||
case DataType::INT64: {
|
||||
return TantivyDataType::I64;
|
||||
}
|
||||
|
||||
case DataType::FLOAT:
|
||||
case DataType::DOUBLE: {
|
||||
return TantivyDataType::F64;
|
||||
}
|
||||
|
||||
case DataType::VARCHAR: {
|
||||
return TantivyDataType::Keyword;
|
||||
}
|
||||
|
||||
default:
|
||||
PanicInfo(
|
||||
ErrorCode::NotImplemented,
|
||||
fmt::format("not implemented data type: {}", data_type_));
|
||||
}
|
||||
}
|
||||
};
|
||||
} // namespace milvus::index
|
|
@ -27,6 +27,9 @@ ScalarIndexCreator::ScalarIndexCreator(
|
|||
const storage::FileManagerContext& file_manager_context)
|
||||
: config_(config), dtype_(dtype) {
|
||||
milvus::index::CreateIndexInfo index_info;
|
||||
if (config.contains("index_type")) {
|
||||
index_type_ = config.at("index_type").get<std::string>();
|
||||
}
|
||||
index_info.field_type = dtype_;
|
||||
index_info.index_type = index_type();
|
||||
index_ = index::IndexFactory::GetInstance().CreateIndex(
|
||||
|
@ -74,8 +77,7 @@ ScalarIndexCreator::Load(const milvus::BinarySet& binary_set) {
|
|||
|
||||
std::string
|
||||
ScalarIndexCreator::index_type() {
|
||||
// TODO
|
||||
return "sort";
|
||||
return index_type_;
|
||||
}
|
||||
|
||||
BinarySet
|
||||
|
|
|
@ -60,6 +60,7 @@ class ScalarIndexCreator : public IndexCreatorBase {
|
|||
index::IndexBasePtr index_ = nullptr;
|
||||
Config config_;
|
||||
DataType dtype_;
|
||||
IndexType index_type_;
|
||||
};
|
||||
|
||||
using ScalarIndexCreatorPtr = std::unique_ptr<ScalarIndexCreator>;
|
||||
|
|
|
@ -1,3 +1,14 @@
|
|||
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
// or implied. See the License for the specific language governing permissions and limitations under the License
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <map>
|
||||
|
|
|
@ -1,3 +1,14 @@
|
|||
# Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
# or implied. See the License for the specific language governing permissions and limitations under the License
|
||||
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
|
|
|
@ -1,3 +1,14 @@
|
|||
# Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
# or implied. See the License for the specific language governing permissions and limitations under the License
|
||||
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
|
|
|
@ -1,3 +1,14 @@
|
|||
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
// or implied. See the License for the specific language governing permissions and limitations under the License
|
||||
|
||||
#include "../AzureBlobChunkManager.h"
|
||||
#include <azure/identity/workload_identity_credential.hpp>
|
||||
#include <gtest/gtest.h>
|
||||
|
|
|
@ -37,6 +37,7 @@ add_subdirectory(rocksdb)
|
|||
add_subdirectory(rdkafka)
|
||||
add_subdirectory(simdjson)
|
||||
add_subdirectory(opendal)
|
||||
add_subdirectory(tantivy)
|
||||
|
||||
add_subdirectory(milvus-storage)
|
||||
|
||||
|
|
|
@ -0,0 +1,66 @@
|
|||
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
|
||||
set(CARGO_CMD cargo build)
|
||||
set(TARGET_DIR "debug")
|
||||
else ()
|
||||
set(CARGO_CMD cargo build --release)
|
||||
set(TARGET_DIR "release")
|
||||
endif ()
|
||||
|
||||
set(TANTIVY_LIB_DIR "${CMAKE_INSTALL_PREFIX}/lib")
|
||||
set(TANTIVY_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/include")
|
||||
set(TANTIVY_NAME "libtantivy_binding${CMAKE_STATIC_LIBRARY_SUFFIX}")
|
||||
|
||||
set(LIB_FILE "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_DIR}/${TANTIVY_NAME}")
|
||||
set(LIB_HEADER_FOLDER "${CMAKE_CURRENT_SOURCE_DIR}/tantivy-binding/include")
|
||||
|
||||
# In fact, cargo was already installed on our builder environment.
|
||||
# Below settings are used to suit for first local development.
|
||||
set(HOME_VAR $ENV{HOME})
|
||||
set(PATH_VAR $ENV{PATH})
|
||||
set(ENV{PATH} ${HOME_VAR}/.cargo/bin:${PATH_VAR})
|
||||
message($ENV{PATH})
|
||||
|
||||
add_custom_command(OUTPUT ls_cargo
|
||||
COMMENT "ls cargo"
|
||||
COMMAND ls ${HOME_VAR}/.cargo/bin/
|
||||
)
|
||||
add_custom_target(ls_cargo_target DEPENDS ls_cargo)
|
||||
|
||||
add_custom_command(OUTPUT compile_tantivy
|
||||
COMMENT "Compiling tantivy binding"
|
||||
COMMAND CARGO_TARGET_DIR=${CMAKE_CURRENT_BINARY_DIR} ${CARGO_CMD}
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/tantivy-binding)
|
||||
add_custom_target(tantivy_binding_target DEPENDS compile_tantivy ls_cargo_target)
|
||||
|
||||
set(INSTALL_COMMAND
|
||||
cp ${LIB_HEADER_FOLDER}/tantivy-binding.h ${TANTIVY_INCLUDE_DIR}/ &&
|
||||
cp ${CMAKE_CURRENT_SOURCE_DIR}/tantivy-wrapper.h ${TANTIVY_INCLUDE_DIR}/ &&
|
||||
cp ${LIB_FILE} ${TANTIVY_LIB_DIR}/)
|
||||
add_custom_command(OUTPUT install_tantivy
|
||||
COMMENT "Install tantivy target ${LIB_FILE} to ${TANTIVY_LIB_DIR}"
|
||||
COMMAND ${INSTALL_COMMAND}
|
||||
)
|
||||
add_custom_target(install_tantivy_target DEPENDS install_tantivy tantivy_binding_target)
|
||||
|
||||
add_library(tantivy_binding STATIC IMPORTED)
|
||||
add_dependencies(tantivy_binding
|
||||
install_tantivy_target
|
||||
)
|
||||
|
||||
set_target_properties(tantivy_binding
|
||||
PROPERTIES
|
||||
IMPORTED_GLOBAL TRUE
|
||||
IMPORTED_LOCATION "${TANTIVY_LIB_DIR}/${TANTIVY_NAME}"
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_INSTALL_PREFIX}/include")
|
||||
|
||||
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
|
||||
add_compile_options(-fno-stack-protector -fno-omit-frame-pointer -fno-var-tracking -fsanitize=address)
|
||||
add_link_options(-fno-stack-protector -fno-omit-frame-pointer -fno-var-tracking -fsanitize=address)
|
||||
endif()
|
||||
|
||||
add_executable(test_tantivy test.cpp)
|
||||
target_link_libraries(test_tantivy
|
||||
tantivy_binding
|
||||
boost_filesystem
|
||||
dl
|
||||
)
|
|
@ -0,0 +1,16 @@
|
|||
# Generated by Cargo
|
||||
# will have compiled files and executables
|
||||
debug/
|
||||
target/
|
||||
|
||||
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
|
||||
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
|
||||
Cargo.lock
|
||||
|
||||
# These are backup files generated by rustfmt
|
||||
**/*.rs.bk
|
||||
|
||||
# MSVC Windows builds of rustc generate these, which store debugging information
|
||||
*.pdb
|
||||
|
||||
.vscode/
|
|
@ -0,0 +1,18 @@
|
|||
[package]
|
||||
name = "tantivy-binding"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
tantivy = "0.21.0"
|
||||
futures = "0.3.21"
|
||||
libc = "0.2"
|
||||
scopeguard = "1.2"
|
||||
|
||||
[build-dependencies]
|
||||
cbindgen = "0.26.0"
|
||||
|
||||
[lib]
|
||||
crate-type = ["staticlib"]
|
|
@ -0,0 +1,13 @@
|
|||
use std::{env, path::PathBuf};
|
||||
|
||||
|
||||
fn main() {
|
||||
let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
|
||||
let package_name = env::var("CARGO_PKG_NAME").unwrap();
|
||||
let output_file = PathBuf::from(&crate_dir)
|
||||
.join("include")
|
||||
.join(format!("{}.h", package_name));
|
||||
cbindgen::generate(&crate_dir)
|
||||
.unwrap()
|
||||
.write_to_file(output_file);
|
||||
}
|
|
@ -0,0 +1,2 @@
|
|||
language = "C++"
|
||||
pragma_once = true
|
|
@ -0,0 +1,102 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstdarg>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <ostream>
|
||||
#include <new>
|
||||
|
||||
enum class TantivyDataType : uint8_t {
|
||||
Keyword,
|
||||
I64,
|
||||
F64,
|
||||
Bool,
|
||||
};
|
||||
|
||||
struct RustArray {
|
||||
uint32_t *array;
|
||||
size_t len;
|
||||
size_t cap;
|
||||
};
|
||||
|
||||
extern "C" {
|
||||
|
||||
void *tantivy_create_index(const char *field_name, TantivyDataType data_type, const char *path);
|
||||
|
||||
void tantivy_free_index_writer(void *ptr);
|
||||
|
||||
void tantivy_finish_index(void *ptr);
|
||||
|
||||
void *tantivy_create_reader_for_index(void *ptr);
|
||||
|
||||
void tantivy_index_add_int8s(void *ptr, const int8_t *array, uintptr_t len);
|
||||
|
||||
void tantivy_index_add_int16s(void *ptr, const int16_t *array, uintptr_t len);
|
||||
|
||||
void tantivy_index_add_int32s(void *ptr, const int32_t *array, uintptr_t len);
|
||||
|
||||
void tantivy_index_add_int64s(void *ptr, const int64_t *array, uintptr_t len);
|
||||
|
||||
void tantivy_index_add_f32s(void *ptr, const float *array, uintptr_t len);
|
||||
|
||||
void tantivy_index_add_f64s(void *ptr, const double *array, uintptr_t len);
|
||||
|
||||
void tantivy_index_add_bools(void *ptr, const bool *array, uintptr_t len);
|
||||
|
||||
void tantivy_index_add_keyword(void *ptr, const char *s);
|
||||
|
||||
bool tantivy_index_exist(const char *path);
|
||||
|
||||
void free_rust_array(RustArray array);
|
||||
|
||||
void *tantivy_load_index(const char *path);
|
||||
|
||||
void tantivy_free_index_reader(void *ptr);
|
||||
|
||||
uint32_t tantivy_index_count(void *ptr);
|
||||
|
||||
RustArray tantivy_term_query_i64(void *ptr, int64_t term);
|
||||
|
||||
RustArray tantivy_lower_bound_range_query_i64(void *ptr, int64_t lower_bound, bool inclusive);
|
||||
|
||||
RustArray tantivy_upper_bound_range_query_i64(void *ptr, int64_t upper_bound, bool inclusive);
|
||||
|
||||
RustArray tantivy_range_query_i64(void *ptr,
|
||||
int64_t lower_bound,
|
||||
int64_t upper_bound,
|
||||
bool lb_inclusive,
|
||||
bool ub_inclusive);
|
||||
|
||||
RustArray tantivy_term_query_f64(void *ptr, double term);
|
||||
|
||||
RustArray tantivy_lower_bound_range_query_f64(void *ptr, double lower_bound, bool inclusive);
|
||||
|
||||
RustArray tantivy_upper_bound_range_query_f64(void *ptr, double upper_bound, bool inclusive);
|
||||
|
||||
RustArray tantivy_range_query_f64(void *ptr,
|
||||
double lower_bound,
|
||||
double upper_bound,
|
||||
bool lb_inclusive,
|
||||
bool ub_inclusive);
|
||||
|
||||
RustArray tantivy_term_query_bool(void *ptr, bool term);
|
||||
|
||||
RustArray tantivy_term_query_keyword(void *ptr, const char *term);
|
||||
|
||||
RustArray tantivy_lower_bound_range_query_keyword(void *ptr,
|
||||
const char *lower_bound,
|
||||
bool inclusive);
|
||||
|
||||
RustArray tantivy_upper_bound_range_query_keyword(void *ptr,
|
||||
const char *upper_bound,
|
||||
bool inclusive);
|
||||
|
||||
RustArray tantivy_range_query_keyword(void *ptr,
|
||||
const char *lower_bound,
|
||||
const char *upper_bound,
|
||||
bool lb_inclusive,
|
||||
bool ub_inclusive);
|
||||
|
||||
RustArray tantivy_prefix_query_keyword(void *ptr, const char *prefix);
|
||||
|
||||
} // extern "C"
|
|
@ -0,0 +1,30 @@
|
|||
use libc::size_t;
|
||||
|
||||
|
||||
// C-compatible view of a leaked Rust Vec<u32> (doc ids of query hits).
// Ownership crosses the FFI boundary; the C++ side must hand the struct
// back to `free_rust_array` so the Vec can be rebuilt and dropped.
#[repr(C)]
pub struct RustArray {
    array: *mut u32, // pointer to the leaked buffer
    len: size_t,     // number of valid elements
    cap: size_t,     // original Vec capacity (required to reconstruct it)
}
|
||||
|
||||
impl RustArray {
|
||||
pub fn from_vec(vec: Vec<u32>) -> RustArray {
|
||||
let len = vec.len();
|
||||
let cap = vec.capacity();
|
||||
let v = vec.leak();
|
||||
RustArray {
|
||||
array: v.as_mut_ptr(),
|
||||
len,
|
||||
cap,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Reclaim an array produced by `RustArray::from_vec`: rebuild the original
// Vec from its raw parts and let it drop, freeing the buffer.
// Safety: must be called exactly once per RustArray, fields unmodified.
#[no_mangle]
pub extern "C" fn free_rust_array(array: RustArray) {
    let RustArray { array, len , cap} = array;
    unsafe {
        // Reconstructing the Vec transfers ownership back; dropping it frees.
        Vec::from_raw_parts(array, len, cap);
    }
}
|
|
@ -0,0 +1,9 @@
|
|||
// Column types exposed over FFI; repr(u8) must stay in sync with the
// `TantivyDataType : uint8_t` enum in the generated tantivy-binding.h.
#[repr(u8)]
pub enum TantivyDataType {
    // Text,
    Keyword, // untokenized string: the whole value is one term
    // U64,
    I64,     // all integer widths are widened to i64
    F64,     // all float widths are widened to f64
    Bool,
}
|
|
@ -0,0 +1,204 @@
|
|||
use std::ops::Bound;
|
||||
use std::str::FromStr;
|
||||
|
||||
|
||||
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::directory::MmapDirectory;
|
||||
use tantivy::query::{Query, RangeQuery, TermQuery, RegexQuery};
|
||||
use tantivy::schema::{Field, IndexRecordOption};
|
||||
use tantivy::{Index, IndexReader, ReloadPolicy, Term};
|
||||
|
||||
|
||||
use crate::util::make_bounds;
|
||||
|
||||
// Read-side wrapper around a tantivy index built for a single field.
pub struct IndexReaderWrapper {
    pub field_name: String, // name of the indexed field
    pub field: Field,       // tantivy field handle
    pub reader: IndexReader,
    // Total doc count over searchable segments, captured at open time; used
    // as the TopDocs limit so every matching row can be returned.
    pub cnt: u32,
}
|
||||
|
||||
impl IndexReaderWrapper {
|
||||
pub fn new(index: &Index, field_name: &String, field: Field) -> IndexReaderWrapper {
    // Manual reload policy: the index is immutable once built, so the
    // reader is refreshed exactly once (below) instead of on a watcher.
    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::Manual)
        .try_into()
        .unwrap();
    // Total row count = sum of max_doc across searchable segments.
    // NOTE(review): max_doc counts deleted docs too; presumably fine since
    // this index is append-only -- confirm if deletes are ever introduced.
    let metas = index
        .searchable_segment_metas()
        .unwrap();
    let mut sum: u32 = 0;
    for meta in metas {
        sum += meta.max_doc();
    }
    reader.reload().unwrap();
    IndexReaderWrapper {
        field_name: field_name.to_string(),
        field,
        reader,
        cnt: sum,
    }
}
|
||||
|
||||
// Open an existing on-disk index from `path` via mmap. Assumes the schema
// holds exactly one field (the indexed column): the first schema field is
// taken as that column.
pub fn load(path: &str) -> IndexReaderWrapper {
    let dir = MmapDirectory::open(path).unwrap();
    let index = Index::open(dir).unwrap();
    let field = index.schema().fields().next().unwrap().0;
    let schema = index.schema();
    let field_name = schema.get_field_name(field);
    IndexReaderWrapper::new(&index, &String::from_str(field_name).unwrap(), field)
}
|
||||
|
||||
pub fn count(&self) -> u32 {
|
||||
self.cnt
|
||||
}
|
||||
|
||||
fn search(&self, q: &dyn Query) -> Vec<u32> {
|
||||
let searcher = self.reader.searcher();
|
||||
let cnt = self.cnt;
|
||||
let hits = searcher
|
||||
.search(q, &TopDocs::with_limit(cnt as usize))
|
||||
.unwrap();
|
||||
let mut ret = Vec::new();
|
||||
for (_, address) in hits {
|
||||
ret.push(address.doc_id);
|
||||
}
|
||||
ret
|
||||
}
|
||||
|
||||
pub fn term_query_i64(&self, term: i64) -> Vec<u32> {
|
||||
let q = TermQuery::new(
|
||||
Term::from_field_i64(self.field, term),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
self.search(&q)
|
||||
}
|
||||
|
||||
pub fn lower_bound_range_query_i64(&self, lower_bound: i64, inclusive: bool) -> Vec<u32> {
|
||||
let q = RangeQuery::new_i64_bounds(
|
||||
self.field_name.to_string(),
|
||||
make_bounds(lower_bound, inclusive),
|
||||
Bound::Unbounded,
|
||||
);
|
||||
self.search(&q)
|
||||
}
|
||||
|
||||
pub fn upper_bound_range_query_i64(&self, upper_bound: i64, inclusive: bool) -> Vec<u32> {
|
||||
let q = RangeQuery::new_i64_bounds(
|
||||
self.field_name.to_string(),
|
||||
Bound::Unbounded,
|
||||
make_bounds(upper_bound, inclusive),
|
||||
);
|
||||
self.search(&q)
|
||||
}
|
||||
|
||||
pub fn range_query_i64(
|
||||
&self,
|
||||
lower_bound: i64,
|
||||
upper_bound: i64,
|
||||
lb_inclusive: bool,
|
||||
ub_inclusive: bool,
|
||||
) -> Vec<u32> {
|
||||
let lb = make_bounds(lower_bound, lb_inclusive);
|
||||
let ub = make_bounds(upper_bound, ub_inclusive);
|
||||
let q = RangeQuery::new_i64_bounds(self.field_name.to_string(), lb, ub);
|
||||
self.search(&q)
|
||||
}
|
||||
|
||||
pub fn term_query_f64(&self, term: f64) -> Vec<u32> {
|
||||
let q = TermQuery::new(
|
||||
Term::from_field_f64(self.field, term),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
self.search(&q)
|
||||
}
|
||||
|
||||
pub fn lower_bound_range_query_f64(&self, lower_bound: f64, inclusive: bool) -> Vec<u32> {
|
||||
let q = RangeQuery::new_f64_bounds(
|
||||
self.field_name.to_string(),
|
||||
make_bounds(lower_bound, inclusive),
|
||||
Bound::Unbounded,
|
||||
);
|
||||
self.search(&q)
|
||||
}
|
||||
|
||||
pub fn upper_bound_range_query_f64(&self, upper_bound: f64, inclusive: bool) -> Vec<u32> {
|
||||
let q = RangeQuery::new_f64_bounds(
|
||||
self.field_name.to_string(),
|
||||
Bound::Unbounded,
|
||||
make_bounds(upper_bound, inclusive),
|
||||
);
|
||||
self.search(&q)
|
||||
}
|
||||
|
||||
pub fn range_query_f64(
|
||||
&self,
|
||||
lower_bound: f64,
|
||||
upper_bound: f64,
|
||||
lb_inclusive: bool,
|
||||
ub_inclusive: bool,
|
||||
) -> Vec<u32> {
|
||||
let lb = make_bounds(lower_bound, lb_inclusive);
|
||||
let ub = make_bounds(upper_bound, ub_inclusive);
|
||||
let q = RangeQuery::new_f64_bounds(self.field_name.to_string(), lb, ub);
|
||||
self.search(&q)
|
||||
}
|
||||
|
||||
pub fn term_query_bool(&self, term: bool) -> Vec<u32> {
|
||||
let q = TermQuery::new(
|
||||
Term::from_field_bool(self.field, term),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
self.search(&q)
|
||||
}
|
||||
|
||||
pub fn term_query_keyword(&self, term: &str) -> Vec<u32> {
|
||||
let q = TermQuery::new(
|
||||
Term::from_field_text(self.field, term),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
self.search(&q)
|
||||
}
|
||||
|
||||
pub fn lower_bound_range_query_keyword(&self, lower_bound: &str, inclusive: bool) -> Vec<u32> {
|
||||
let q = RangeQuery::new_str_bounds(
|
||||
self.field_name.to_string(),
|
||||
make_bounds(lower_bound, inclusive),
|
||||
Bound::Unbounded,
|
||||
);
|
||||
self.search(&q)
|
||||
}
|
||||
|
||||
pub fn upper_bound_range_query_keyword(&self, upper_bound: &str, inclusive: bool) -> Vec<u32> {
|
||||
let q = RangeQuery::new_str_bounds(
|
||||
self.field_name.to_string(),
|
||||
Bound::Unbounded,
|
||||
make_bounds(upper_bound, inclusive),
|
||||
);
|
||||
self.search(&q)
|
||||
}
|
||||
|
||||
pub fn range_query_keyword(
|
||||
&self,
|
||||
lower_bound: &str,
|
||||
upper_bound: &str,
|
||||
lb_inclusive: bool,
|
||||
ub_inclusive: bool,
|
||||
) -> Vec<u32> {
|
||||
let lb = make_bounds(lower_bound, lb_inclusive);
|
||||
let ub = make_bounds(upper_bound, ub_inclusive);
|
||||
let q = RangeQuery::new_str_bounds(self.field_name.to_string(), lb, ub);
|
||||
self.search(&q)
|
||||
}
|
||||
|
||||
pub fn prefix_query_keyword(
|
||||
&self,
|
||||
prefix: &str,
|
||||
) -> Vec<u32> {
|
||||
let pattern = format!("{}(.|\n)*", prefix);
|
||||
let q = RegexQuery::from_pattern(&pattern, self.field).unwrap();
|
||||
self.search(&q)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,206 @@
|
|||
use std::ffi::{c_char, c_void, CStr};
|
||||
|
||||
use crate::{
|
||||
index_reader::IndexReaderWrapper, util_c::tantivy_index_exist, util::{create_binding, free_binding}, array::RustArray,
|
||||
};
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_load_index(path: *const c_char) -> *mut c_void {
|
||||
assert!(tantivy_index_exist(path));
|
||||
let path_str = unsafe { CStr::from_ptr(path) };
|
||||
let wrapper = IndexReaderWrapper::load(path_str.to_str().unwrap());
|
||||
create_binding(wrapper)
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_free_index_reader(ptr: *mut c_void) {
|
||||
free_binding::<IndexReaderWrapper>(ptr);
|
||||
}
|
||||
|
||||
// -------------------------query--------------------
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_count(ptr: *mut c_void) -> u32 {
|
||||
let real = ptr as *mut IndexReaderWrapper;
|
||||
unsafe { (*real).count() }
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_term_query_i64(ptr: *mut c_void, term: i64) -> RustArray {
|
||||
let real = ptr as *mut IndexReaderWrapper;
|
||||
unsafe {
|
||||
let hits = (*real).term_query_i64(term);
|
||||
RustArray::from_vec(hits)
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_lower_bound_range_query_i64(
|
||||
ptr: *mut c_void,
|
||||
lower_bound: i64,
|
||||
inclusive: bool,
|
||||
) -> RustArray {
|
||||
let real = ptr as *mut IndexReaderWrapper;
|
||||
unsafe {
|
||||
let hits = (*real).lower_bound_range_query_i64(lower_bound, inclusive);
|
||||
RustArray::from_vec(hits)
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_upper_bound_range_query_i64(
|
||||
ptr: *mut c_void,
|
||||
upper_bound: i64,
|
||||
inclusive: bool,
|
||||
) -> RustArray {
|
||||
let real = ptr as *mut IndexReaderWrapper;
|
||||
unsafe {
|
||||
let hits = (*real).upper_bound_range_query_i64(upper_bound, inclusive);
|
||||
RustArray::from_vec(hits)
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_range_query_i64(
|
||||
ptr: *mut c_void,
|
||||
lower_bound: i64,
|
||||
upper_bound: i64,
|
||||
lb_inclusive: bool,
|
||||
ub_inclusive: bool,
|
||||
) -> RustArray {
|
||||
let real = ptr as *mut IndexReaderWrapper;
|
||||
unsafe {
|
||||
let hits = (*real).range_query_i64(lower_bound, upper_bound, lb_inclusive, ub_inclusive);
|
||||
RustArray::from_vec(hits)
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_term_query_f64(ptr: *mut c_void, term: f64) -> RustArray {
|
||||
let real = ptr as *mut IndexReaderWrapper;
|
||||
unsafe {
|
||||
let hits = (*real).term_query_f64(term);
|
||||
RustArray::from_vec(hits)
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_lower_bound_range_query_f64(
|
||||
ptr: *mut c_void,
|
||||
lower_bound: f64,
|
||||
inclusive: bool,
|
||||
) -> RustArray {
|
||||
let real = ptr as *mut IndexReaderWrapper;
|
||||
unsafe {
|
||||
let hits = (*real).lower_bound_range_query_f64(lower_bound, inclusive);
|
||||
RustArray::from_vec(hits)
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_upper_bound_range_query_f64(
|
||||
ptr: *mut c_void,
|
||||
upper_bound: f64,
|
||||
inclusive: bool,
|
||||
) -> RustArray {
|
||||
let real = ptr as *mut IndexReaderWrapper;
|
||||
unsafe {
|
||||
let hits = (*real).upper_bound_range_query_f64(upper_bound, inclusive);
|
||||
RustArray::from_vec(hits)
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_range_query_f64(
|
||||
ptr: *mut c_void,
|
||||
lower_bound: f64,
|
||||
upper_bound: f64,
|
||||
lb_inclusive: bool,
|
||||
ub_inclusive: bool,
|
||||
) -> RustArray {
|
||||
let real = ptr as *mut IndexReaderWrapper;
|
||||
unsafe {
|
||||
let hits = (*real).range_query_f64(lower_bound, upper_bound, lb_inclusive, ub_inclusive);
|
||||
RustArray::from_vec(hits)
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_term_query_bool(ptr: *mut c_void, term: bool) -> RustArray {
|
||||
let real = ptr as *mut IndexReaderWrapper;
|
||||
unsafe {
|
||||
let hits = (*real).term_query_bool(term);
|
||||
RustArray::from_vec(hits)
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_term_query_keyword(ptr: *mut c_void, term: *const c_char) -> RustArray {
|
||||
let real = ptr as *mut IndexReaderWrapper;
|
||||
unsafe {
|
||||
let c_str = CStr::from_ptr(term);
|
||||
let hits = (*real).term_query_keyword(c_str.to_str().unwrap());
|
||||
RustArray::from_vec(hits)
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_lower_bound_range_query_keyword(
|
||||
ptr: *mut c_void,
|
||||
lower_bound: *const c_char,
|
||||
inclusive: bool,
|
||||
) -> RustArray {
|
||||
let real = ptr as *mut IndexReaderWrapper;
|
||||
unsafe {
|
||||
let c_lower_bound = CStr::from_ptr(lower_bound);
|
||||
let hits =
|
||||
(*real).lower_bound_range_query_keyword(c_lower_bound.to_str().unwrap(), inclusive);
|
||||
RustArray::from_vec(hits)
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_upper_bound_range_query_keyword(
|
||||
ptr: *mut c_void,
|
||||
upper_bound: *const c_char,
|
||||
inclusive: bool,
|
||||
) -> RustArray {
|
||||
let real = ptr as *mut IndexReaderWrapper;
|
||||
unsafe {
|
||||
let c_upper_bound = CStr::from_ptr(upper_bound);
|
||||
let hits =
|
||||
(*real).upper_bound_range_query_keyword(c_upper_bound.to_str().unwrap(), inclusive);
|
||||
RustArray::from_vec(hits)
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_range_query_keyword(
|
||||
ptr: *mut c_void,
|
||||
lower_bound: *const c_char,
|
||||
upper_bound: *const c_char,
|
||||
lb_inclusive: bool,
|
||||
ub_inclusive: bool,
|
||||
) -> RustArray {
|
||||
let real = ptr as *mut IndexReaderWrapper;
|
||||
unsafe {
|
||||
let c_lower_bound = CStr::from_ptr(lower_bound);
|
||||
let c_upper_bound = CStr::from_ptr(upper_bound);
|
||||
let hits = (*real).range_query_keyword(
|
||||
c_lower_bound.to_str().unwrap(),
|
||||
c_upper_bound.to_str().unwrap(),
|
||||
lb_inclusive,
|
||||
ub_inclusive,
|
||||
);
|
||||
RustArray::from_vec(hits)
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_prefix_query_keyword(ptr: *mut c_void, prefix: *const c_char) -> RustArray {
|
||||
let real = ptr as *mut IndexReaderWrapper;
|
||||
unsafe {
|
||||
let c_str = CStr::from_ptr(prefix);
|
||||
let hits = (*real).prefix_query_keyword(c_str.to_str().unwrap());
|
||||
RustArray::from_vec(hits)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,108 @@
|
|||
use futures::executor::block_on;
|
||||
|
||||
use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, INDEXED};
|
||||
use tantivy::{doc, tokenizer, Index, IndexWriter};
|
||||
|
||||
use crate::data_type::TantivyDataType;
|
||||
use crate::index_reader::IndexReaderWrapper;
|
||||
|
||||
// Write-side wrapper: owns the tantivy index plus a single-threaded writer
// for one field.
pub struct IndexWriterWrapper {
    pub field_name: String,
    pub field: Field,
    pub data_type: TantivyDataType,
    pub path: String, // directory the index is persisted in
    pub index: Index,
    pub index_writer: IndexWriter,
}
|
||||
|
||||
impl IndexWriterWrapper {
|
||||
pub fn create_reader(&self) -> IndexReaderWrapper {
|
||||
IndexReaderWrapper::new(&self.index, &self.field_name, self.field)
|
||||
}
|
||||
|
||||
pub fn new(field_name: String, data_type: TantivyDataType, path: String) -> IndexWriterWrapper {
    let field: Field;
    let mut schema_builder = Schema::builder();
    // Keyword columns need the raw tokenizer registered on the index so the
    // whole value is indexed as a single term (no full-text tokenization).
    let mut use_raw_tokenizer = false;
    match data_type {
        TantivyDataType::I64 => {
            field = schema_builder.add_i64_field(&field_name, INDEXED);
        }
        TantivyDataType::F64 => {
            field = schema_builder.add_f64_field(&field_name, INDEXED);
        }
        TantivyDataType::Bool => {
            field = schema_builder.add_bool_field(&field_name, INDEXED);
        }
        TantivyDataType::Keyword => {
            let text_field_indexing = TextFieldIndexing::default()
                .set_tokenizer("raw_tokenizer")
                .set_index_option(IndexRecordOption::Basic);
            let text_options = TextOptions::default().set_indexing_options(text_field_indexing);
            field = schema_builder.add_text_field(&field_name, text_options);
            use_raw_tokenizer = true;
        }
    }
    let schema = schema_builder.build();
    // NOTE(review): create_in_dir requires `path` to already exist and to
    // not already contain an index -- presumably guaranteed by the caller.
    let index = Index::create_in_dir(path.clone(), schema).unwrap();
    if use_raw_tokenizer {
        // Must be registered before any document referencing the tokenizer
        // is added.
        index
            .tokenizers()
            .register("raw_tokenizer", tokenizer::RawTokenizer::default());
    }
    // Single writer thread with a 15 MB heap budget.
    let index_writer = index.writer_with_num_threads(1, 15_000_000).unwrap();
    IndexWriterWrapper {
        field_name,
        field,
        data_type,
        path,
        index,
        index_writer,
    }
}
|
||||
|
||||
pub fn add_i8(&self, data: i8) {
|
||||
self.add_i64(data.into())
|
||||
}
|
||||
|
||||
pub fn add_i16(&self, data: i16) {
|
||||
self.add_i64(data.into())
|
||||
}
|
||||
|
||||
pub fn add_i32(&self, data: i32) {
|
||||
self.add_i64(data.into())
|
||||
}
|
||||
|
||||
pub fn add_i64(&self, data: i64) {
|
||||
self.index_writer
|
||||
.add_document(doc!(self.field => data))
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
pub fn add_f32(&self, data: f32) {
|
||||
self.add_f64(data.into())
|
||||
}
|
||||
|
||||
pub fn add_f64(&self, data: f64) {
|
||||
self.index_writer
|
||||
.add_document(doc!(self.field => data))
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
pub fn add_bool(&self, data: bool) {
|
||||
self.index_writer
|
||||
.add_document(doc!(self.field => data))
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
pub fn add_keyword(&self, data: &str) {
|
||||
self.index_writer
|
||||
.add_document(doc!(self.field => data))
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
// Commit all pending documents, then garbage-collect obsolete segment
// files so only the final index files remain on disk.
pub fn finish(&mut self) {
    self.index_writer.commit().unwrap();
    // garbage_collect_files is async; block since this is a sync FFI path.
    block_on(self.index_writer.garbage_collect_files()).unwrap();
}
|
||||
}
|
|
@ -0,0 +1,133 @@
|
|||
use core::slice;
|
||||
use std::ffi::{c_char, c_void, CStr};
|
||||
|
||||
use crate::{
|
||||
data_type::TantivyDataType, index_writer::IndexWriterWrapper, util::{create_binding, free_binding},
|
||||
};
|
||||
|
||||
// C entry point: build an IndexWriterWrapper for `field_name` at `path`
// and return it as an opaque pointer. Release with
// `tantivy_free_index_writer`.
// Safety: both pointers must reference valid NUL-terminated UTF-8 strings.
#[no_mangle]
pub extern "C" fn tantivy_create_index(
    field_name: *const c_char,
    data_type: TantivyDataType,
    path: *const c_char,
) -> *mut c_void {
    let field_name_str = unsafe { CStr::from_ptr(field_name) };
    let path_str = unsafe { CStr::from_ptr(path) };
    let wrapper = IndexWriterWrapper::new(
        String::from(field_name_str.to_str().unwrap()),
        data_type,
        String::from(path_str.to_str().unwrap()),
    );
    create_binding(wrapper)
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_free_index_writer(ptr: *mut c_void) {
|
||||
free_binding::<IndexWriterWrapper>(ptr);
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_finish_index(ptr: *mut c_void) {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
unsafe {
|
||||
(*real).finish();
|
||||
}
|
||||
}
|
||||
|
||||
// should be only used for test
/// Build a reader over the writer's index and return it as an opaque
/// pointer. `ptr` must come from `tantivy_create_index`.
#[no_mangle]
pub extern "C" fn tantivy_create_reader_for_index(ptr: *mut c_void) -> *mut c_void {
    let real = ptr as *mut IndexWriterWrapper;
    unsafe {
        let reader = (*real).create_reader();
        create_binding(reader)
    }
}
|
||||
|
||||
|
||||
// -------------------------build--------------------
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_int8s(ptr: *mut c_void, array: *const i8, len: usize) {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||
unsafe {
|
||||
for data in arr {
|
||||
(*real).add_i8(*data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_int16s(ptr: *mut c_void, array: *const i16, len: usize) {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||
unsafe {
|
||||
for data in arr {
|
||||
(*real).add_i16(*data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_int32s(ptr: *mut c_void, array: *const i32, len: usize) {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||
unsafe {
|
||||
for data in arr {
|
||||
(*real).add_i32(*data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_int64s(ptr: *mut c_void, array: *const i64, len: usize) {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||
unsafe {
|
||||
for data in arr {
|
||||
(*real).add_i64(*data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_f32s(ptr: *mut c_void, array: *const f32, len: usize) {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||
unsafe {
|
||||
for data in arr {
|
||||
(*real).add_f32(*data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_f64s(ptr: *mut c_void, array: *const f64, len: usize) {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||
unsafe {
|
||||
for data in arr {
|
||||
(*real).add_f64(*data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_bools(ptr: *mut c_void, array: *const bool, len: usize) {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||
unsafe {
|
||||
for data in arr {
|
||||
(*real).add_bool(*data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: this is not a very efficient way, since we must call this function many times, which
// will bring a lot of overhead caused by the rust binding.
/// Add one NUL-terminated UTF-8 string to the index as a single keyword.
/// Panics (aborting across FFI) if `s` is not valid UTF-8.
#[no_mangle]
pub extern "C" fn tantivy_index_add_keyword(ptr: *mut c_void, s: *const c_char) {
    let real = ptr as *mut IndexWriterWrapper;
    let c_str = unsafe { CStr::from_ptr(s) };
    unsafe { (*real).add_keyword(c_str.to_str().unwrap()) }
}
|
|
@ -0,0 +1,23 @@
|
|||
mod data_type;
|
||||
mod index_writer;
|
||||
mod index_writer_c;
|
||||
mod util;
|
||||
mod util_c;
|
||||
mod array;
|
||||
mod index_reader;
|
||||
mod index_reader_c;
|
||||
|
||||
/// Leftover scaffold from `cargo new --lib`; only exercised by the
/// smoke test below.
pub fn add(left: usize, right: usize) -> usize {
    left + right
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Smoke test for the template `add` helper above.
    #[test]
    fn it_works() {
        let sum = add(2, 2);
        assert_eq!(sum, 4);
    }
}
|
|
@ -0,0 +1,30 @@
|
|||
use std::ops::Bound;
|
||||
use std::ffi::{c_void};
|
||||
|
||||
use tantivy::{directory::MmapDirectory, Index};
|
||||
|
||||
/// Return whether `path` already contains a tantivy index.
/// Panics if the directory cannot be opened (e.g. it does not exist).
pub fn index_exist(path: &str) -> bool {
    let dir = MmapDirectory::open(path).unwrap();
    Index::exists(&dir).unwrap()
}
|
||||
|
||||
/// Map a bound value plus an inclusivity flag onto the std `Bound`
/// enum consumed by tantivy range queries.
pub fn make_bounds<T>(bound: T, inclusive: bool) -> Bound<T> {
    match inclusive {
        true => Bound::Included(bound),
        false => Bound::Excluded(bound),
    }
}
|
||||
|
||||
/// Move `wrapper` onto the heap and return it to the C side as a
/// type-erased raw pointer. Release it later via `free_binding::<T>`.
pub fn create_binding<T>(wrapper: T) -> *mut c_void {
    Box::into_raw(Box::new(wrapper)) as *mut c_void
}
|
||||
|
||||
/// Reclaim and drop a `T` previously leaked through `create_binding`.
///
/// A null pointer is ignored: `Box::from_raw(null)` is undefined
/// behavior, so guard rather than crash when the C side passes NULL.
pub fn free_binding<T>(ptr: *mut c_void) {
    if ptr.is_null() {
        return;
    }
    let real = ptr as *mut T;
    unsafe {
        drop(Box::from_raw(real));
    }
}
|
|
@ -0,0 +1,9 @@
|
|||
use std::ffi::{c_char, CStr};
|
||||
|
||||
use crate::{util::index_exist};
|
||||
|
||||
/// C entry point: return whether a tantivy index exists at `path`.
///
/// # Safety
/// `path` must be a valid, NUL-terminated UTF-8 C string; invalid
/// UTF-8 or a missing directory panics across the FFI boundary.
#[no_mangle]
pub extern "C" fn tantivy_index_exist(path: *const c_char) -> bool {
    let path_str = unsafe { CStr::from_ptr(path) };
    index_exist(path_str.to_str().unwrap())
}
|
|
@ -0,0 +1,362 @@
|
|||
#include <cassert>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <typeinfo>

#include <fmt/format.h>

#include "tantivy-binding.h"
|
||||
|
||||
namespace milvus::tantivy {
|
||||
struct RustArrayWrapper {
|
||||
explicit RustArrayWrapper(RustArray array) : array_(array) {
|
||||
}
|
||||
|
||||
RustArrayWrapper(RustArrayWrapper&) = delete;
|
||||
RustArrayWrapper&
|
||||
operator=(RustArrayWrapper&) = delete;
|
||||
|
||||
RustArrayWrapper(RustArrayWrapper&& other) noexcept {
|
||||
array_.array = other.array_.array;
|
||||
array_.len = other.array_.len;
|
||||
array_.cap = other.array_.cap;
|
||||
other.array_.array = nullptr;
|
||||
other.array_.len = 0;
|
||||
other.array_.cap = 0;
|
||||
}
|
||||
|
||||
RustArrayWrapper&
|
||||
operator=(RustArrayWrapper&& other) noexcept {
|
||||
if (this != &other) {
|
||||
free();
|
||||
array_.array = other.array_.array;
|
||||
array_.len = other.array_.len;
|
||||
array_.cap = other.array_.cap;
|
||||
other.array_.array = nullptr;
|
||||
other.array_.len = 0;
|
||||
other.array_.cap = 0;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
~RustArrayWrapper() {
|
||||
free();
|
||||
}
|
||||
|
||||
void
|
||||
debug() {
|
||||
std::stringstream ss;
|
||||
ss << "[ ";
|
||||
for (int i = 0; i < array_.len; i++) {
|
||||
ss << array_.array[i] << " ";
|
||||
}
|
||||
ss << "]";
|
||||
std::cout << ss.str() << std::endl;
|
||||
}
|
||||
|
||||
RustArray array_;
|
||||
|
||||
private:
|
||||
void
|
||||
free() {
|
||||
if (array_.array != nullptr) {
|
||||
free_rust_array(array_);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
inline TantivyDataType
|
||||
guess_data_type() {
|
||||
if constexpr (std::is_same_v<T, bool>) {
|
||||
return TantivyDataType::Bool;
|
||||
}
|
||||
|
||||
if constexpr (std::is_integral_v<T>) {
|
||||
return TantivyDataType::I64;
|
||||
}
|
||||
|
||||
if constexpr (std::is_floating_point_v<T>) {
|
||||
return TantivyDataType::F64;
|
||||
}
|
||||
|
||||
throw fmt::format("guess_data_type: unsupported data type: {}",
|
||||
typeid(T).name());
|
||||
}
|
||||
|
||||
struct TantivyIndexWrapper {
|
||||
using IndexWriter = void*;
|
||||
using IndexReader = void*;
|
||||
|
||||
TantivyIndexWrapper() = default;
|
||||
|
||||
TantivyIndexWrapper(TantivyIndexWrapper&) = delete;
|
||||
TantivyIndexWrapper&
|
||||
operator=(TantivyIndexWrapper&) = delete;
|
||||
|
||||
TantivyIndexWrapper(TantivyIndexWrapper&& other) noexcept {
|
||||
writer_ = other.writer_;
|
||||
reader_ = other.reader_;
|
||||
finished_ = other.finished_;
|
||||
other.writer_ = nullptr;
|
||||
other.reader_ = nullptr;
|
||||
other.finished_ = false;
|
||||
}
|
||||
|
||||
TantivyIndexWrapper&
|
||||
operator=(TantivyIndexWrapper&& other) noexcept {
|
||||
if (this != &other) {
|
||||
free();
|
||||
writer_ = other.writer_;
|
||||
reader_ = other.reader_;
|
||||
finished_ = other.finished_;
|
||||
other.writer_ = nullptr;
|
||||
other.reader_ = nullptr;
|
||||
other.finished_ = false;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
TantivyIndexWrapper(const char* field_name,
|
||||
TantivyDataType data_type,
|
||||
const char* path) {
|
||||
writer_ = tantivy_create_index(field_name, data_type, path);
|
||||
}
|
||||
|
||||
explicit TantivyIndexWrapper(const char* path) {
|
||||
assert(tantivy_index_exist(path));
|
||||
reader_ = tantivy_load_index(path);
|
||||
}
|
||||
|
||||
~TantivyIndexWrapper() {
|
||||
free();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
add_data(const T* array, uintptr_t len) {
|
||||
if constexpr (std::is_same_v<T, bool>) {
|
||||
tantivy_index_add_bools(writer_, array, len);
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, int8_t>) {
|
||||
tantivy_index_add_int8s(writer_, array, len);
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, int16_t>) {
|
||||
tantivy_index_add_int16s(writer_, array, len);
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, int32_t>) {
|
||||
tantivy_index_add_int32s(writer_, array, len);
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, int64_t>) {
|
||||
tantivy_index_add_int64s(writer_, array, len);
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
tantivy_index_add_f32s(writer_, array, len);
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, double>) {
|
||||
tantivy_index_add_f64s(writer_, array, len);
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, std::string>) {
|
||||
// TODO: not very efficient, a lot of overhead due to rust-ffi call.
|
||||
for (uintptr_t i = 0; i < len; i++) {
|
||||
tantivy_index_add_keyword(
|
||||
writer_, static_cast<const std::string*>(array)[i].c_str());
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
throw fmt::format("InvertedIndex.add_data: unsupported data type: {}",
|
||||
typeid(T).name());
|
||||
}
|
||||
|
||||
inline void
|
||||
finish() {
|
||||
if (!finished_) {
|
||||
tantivy_finish_index(writer_);
|
||||
reader_ = tantivy_create_reader_for_index(writer_);
|
||||
}
|
||||
}
|
||||
|
||||
inline uint32_t
|
||||
count() {
|
||||
return tantivy_index_count(reader_);
|
||||
}
|
||||
|
||||
public:
|
||||
template <typename T>
|
||||
RustArrayWrapper
|
||||
term_query(T term) {
|
||||
auto array = [&]() {
|
||||
if constexpr (std::is_same_v<T, bool>) {
|
||||
return tantivy_term_query_bool(reader_, term);
|
||||
}
|
||||
|
||||
if constexpr (std::is_integral_v<T>) {
|
||||
return tantivy_term_query_i64(reader_,
|
||||
static_cast<int64_t>(term));
|
||||
}
|
||||
|
||||
if constexpr (std::is_floating_point_v<T>) {
|
||||
return tantivy_term_query_f64(reader_,
|
||||
static_cast<double>(term));
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, std::string>) {
|
||||
return tantivy_term_query_keyword(
|
||||
reader_, static_cast<std::string>(term).c_str());
|
||||
}
|
||||
|
||||
throw fmt::format(
|
||||
"InvertedIndex.term_query: unsupported data type: {}",
|
||||
typeid(T).name());
|
||||
}();
|
||||
return RustArrayWrapper(array);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
RustArrayWrapper
|
||||
lower_bound_range_query(T lower_bound, bool inclusive) {
|
||||
auto array = [&]() {
|
||||
if constexpr (std::is_integral_v<T>) {
|
||||
return tantivy_lower_bound_range_query_i64(
|
||||
reader_, static_cast<int64_t>(lower_bound), inclusive);
|
||||
}
|
||||
|
||||
if constexpr (std::is_floating_point_v<T>) {
|
||||
return tantivy_lower_bound_range_query_f64(
|
||||
reader_, static_cast<double>(lower_bound), inclusive);
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, std::string>) {
|
||||
return tantivy_lower_bound_range_query_keyword(
|
||||
reader_,
|
||||
static_cast<std::string>(lower_bound).c_str(),
|
||||
inclusive);
|
||||
}
|
||||
|
||||
throw fmt::format(
|
||||
"InvertedIndex.lower_bound_range_query: unsupported data type: "
|
||||
"{}",
|
||||
typeid(T).name());
|
||||
}();
|
||||
return RustArrayWrapper(array);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
RustArrayWrapper
|
||||
upper_bound_range_query(T upper_bound, bool inclusive) {
|
||||
auto array = [&]() {
|
||||
if constexpr (std::is_integral_v<T>) {
|
||||
return tantivy_upper_bound_range_query_i64(
|
||||
reader_, static_cast<int64_t>(upper_bound), inclusive);
|
||||
}
|
||||
|
||||
if constexpr (std::is_floating_point_v<T>) {
|
||||
return tantivy_upper_bound_range_query_f64(
|
||||
reader_, static_cast<double>(upper_bound), inclusive);
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, std::string>) {
|
||||
return tantivy_upper_bound_range_query_keyword(
|
||||
reader_,
|
||||
static_cast<std::string>(upper_bound).c_str(),
|
||||
inclusive);
|
||||
}
|
||||
|
||||
throw fmt::format(
|
||||
"InvertedIndex.upper_bound_range_query: unsupported data type: "
|
||||
"{}",
|
||||
typeid(T).name());
|
||||
}();
|
||||
return RustArrayWrapper(array);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
RustArrayWrapper
|
||||
range_query(T lower_bound,
|
||||
T upper_bound,
|
||||
bool lb_inclusive,
|
||||
bool ub_inclusive) {
|
||||
auto array = [&]() {
|
||||
if constexpr (std::is_integral_v<T>) {
|
||||
return tantivy_range_query_i64(
|
||||
reader_,
|
||||
static_cast<int64_t>(lower_bound),
|
||||
static_cast<int64_t>(upper_bound),
|
||||
lb_inclusive,
|
||||
ub_inclusive);
|
||||
}
|
||||
|
||||
if constexpr (std::is_floating_point_v<T>) {
|
||||
return tantivy_range_query_f64(reader_,
|
||||
static_cast<double>(lower_bound),
|
||||
static_cast<double>(upper_bound),
|
||||
lb_inclusive,
|
||||
ub_inclusive);
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, std::string>) {
|
||||
return tantivy_range_query_keyword(
|
||||
reader_,
|
||||
static_cast<std::string>(lower_bound).c_str(),
|
||||
static_cast<std::string>(upper_bound).c_str(),
|
||||
lb_inclusive,
|
||||
ub_inclusive);
|
||||
}
|
||||
|
||||
throw fmt::format(
|
||||
"InvertedIndex.range_query: unsupported data type: {}",
|
||||
typeid(T).name());
|
||||
}();
|
||||
return RustArrayWrapper(array);
|
||||
}
|
||||
|
||||
RustArrayWrapper
|
||||
prefix_query(const std::string& prefix) {
|
||||
auto array = tantivy_prefix_query_keyword(reader_, prefix.c_str());
|
||||
return RustArrayWrapper(array);
|
||||
}
|
||||
|
||||
public:
|
||||
inline IndexWriter
|
||||
get_writer() {
|
||||
return writer_;
|
||||
}
|
||||
|
||||
inline IndexReader
|
||||
get_reader() {
|
||||
return reader_;
|
||||
}
|
||||
|
||||
private:
|
||||
void
|
||||
check_search() {
|
||||
// TODO
|
||||
}
|
||||
|
||||
void
|
||||
free() {
|
||||
if (writer_ != nullptr) {
|
||||
tantivy_free_index_writer(writer_);
|
||||
}
|
||||
|
||||
if (reader_ != nullptr) {
|
||||
tantivy_free_index_reader(reader_);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
bool finished_ = false;
|
||||
IndexWriter writer_ = nullptr;
|
||||
IndexReader reader_ = nullptr;
|
||||
};
|
||||
} // namespace milvus::tantivy
|
|
@ -0,0 +1,165 @@
|
|||
#include <cstdint>
|
||||
#include <cassert>
|
||||
#include <boost/filesystem.hpp>
|
||||
#include <iostream>
|
||||
|
||||
#include "tantivy-binding.h"
|
||||
#include "tantivy-wrapper.h"
|
||||
|
||||
using namespace milvus::tantivy;
|
||||
|
||||
// End-to-end smoke test for scalar type T: build an index over six
// values, then run term/range queries. Hits are only printed (not
// asserted); the only hard check is the row count.
template <typename T>
void
run() {
    std::cout << "run " << typeid(T).name() << std::endl;

    auto path = "/tmp/inverted-index/test-binding/";
    // always start from an empty directory so the build path runs.
    boost::filesystem::remove_all(path);
    boost::filesystem::create_directories(path);

    // NOTE(review): dead branch in practice -- the directory was just
    // wiped above, so the index can never pre-exist here.
    if (tantivy_index_exist(path)) {
        auto w = TantivyIndexWrapper(path);
        auto cnt = w.count();
        std::cout << "index already exist, open it, count: " << cnt
                  << std::endl;
        return;
    }

    auto w = TantivyIndexWrapper("test_field_name", guess_data_type<T>(), path);

    T arr[] = {1, 2, 3, 4, 5, 6};
    auto l = sizeof(arr) / sizeof(T);

    w.add_data(arr, l);

    w.finish();

    assert(w.count() == l);

    {
        // exact match: value == 2
        auto hits = w.term_query<T>(2);
        hits.debug();
    }

    {
        // value > 1
        auto hits = w.lower_bound_range_query<T>(1, false);
        hits.debug();
    }

    {
        // value < 4
        auto hits = w.upper_bound_range_query<T>(4, false);
        hits.debug();
    }

    {
        // 2 < value < 4
        auto hits = w.range_query<T>(2, 4, false, false);
        hits.debug();
    }
}
|
||||
|
||||
// bool specialization: only term queries make sense, so the range
// queries from the primary template are skipped.
template <>
void
run<bool>() {
    std::cout << "run bool" << std::endl;

    auto path = "/tmp/inverted-index/test-binding/";
    // always start from an empty directory so the build path runs.
    boost::filesystem::remove_all(path);
    boost::filesystem::create_directories(path);

    // NOTE(review): dead branch in practice -- the directory was just
    // wiped above, so the index can never pre-exist here.
    if (tantivy_index_exist(path)) {
        auto w = TantivyIndexWrapper(path);
        auto cnt = w.count();
        std::cout << "index already exist, open it, count: " << cnt
                  << std::endl;
        return;
    }

    auto w =
        TantivyIndexWrapper("test_field_name", TantivyDataType::Bool, path);

    bool arr[] = {true, false, false, true, false, true};
    auto l = sizeof(arr) / sizeof(bool);

    w.add_data(arr, l);

    w.finish();

    assert(w.count() == l);

    {
        // exact match: value == true
        auto hits = w.term_query<bool>(true);
        hits.debug();
    }
}
|
||||
|
||||
// std::string specialization: keyword column, so term/range queries use
// lexicographic order and prefix_query becomes available.
template <>
void
run<std::string>() {
    std::cout << "run string" << std::endl;

    auto path = "/tmp/inverted-index/test-binding/";
    // always start from an empty directory so the build path runs.
    boost::filesystem::remove_all(path);
    boost::filesystem::create_directories(path);

    // NOTE(review): dead branch in practice -- the directory was just
    // wiped above, so the index can never pre-exist here.
    if (tantivy_index_exist(path)) {
        auto w = TantivyIndexWrapper(path);
        auto cnt = w.count();
        std::cout << "index already exist, open it, count: " << cnt
                  << std::endl;
        return;
    }

    auto w =
        TantivyIndexWrapper("test_field_name", TantivyDataType::Keyword, path);

    std::vector<std::string> arr = {"a", "b", "aaa", "abbb"};
    auto l = arr.size();

    w.add_data<std::string>(arr.data(), l);

    w.finish();

    assert(w.count() == l);

    {
        // exact match: value == "a"
        auto hits = w.term_query<std::string>("a");
        hits.debug();
    }

    {
        // value >= "aa" (lexicographic)
        auto hits = w.lower_bound_range_query<std::string>("aa", true);
        hits.debug();
    }

    {
        // value <= "ab" (lexicographic)
        auto hits = w.upper_bound_range_query<std::string>("ab", true);
        hits.debug();
    }

    {
        // "aa" <= value <= "ab"
        auto hits = w.range_query<std::string>("aa", "ab", true, true);
        hits.debug();
    }

    {
        // keywords starting with "a"
        auto hits = w.prefix_query("a");
        hits.debug();
    }
}
|
||||
|
||||
// Exercise the binding for every supported type. Each run<T>() wipes
// and recreates the index directory, so the calls are independent.
int
main(int argc, char* argv[]) {
    run<int8_t>();
    run<int16_t>();
    run<int32_t>();
    run<int64_t>();

    run<float>();
    run<double>();

    run<bool>();

    run<std::string>();

    return 0;
}
|
|
@ -60,6 +60,7 @@ set(MILVUS_TEST_FILES
|
|||
test_binlog_index.cpp
|
||||
test_storage.cpp
|
||||
test_exec.cpp
|
||||
test_inverted_index.cpp
|
||||
)
|
||||
|
||||
if ( BUILD_DISK_ANN STREQUAL "ON" )
|
||||
|
|
|
@ -32,10 +32,9 @@ TEST(Array, TestConstructArray) {
|
|||
ASSERT_EQ(int_array.get_data<int>(i), i);
|
||||
}
|
||||
ASSERT_TRUE(int_array.is_same_array(field_int_array));
|
||||
auto int_array_tmp = Array(
|
||||
const_cast<char*>(int_array.data()),
|
||||
int_array.byte_size(),
|
||||
int_array.get_element_type(),
|
||||
auto int_array_tmp = Array(const_cast<char*>(int_array.data()),
|
||||
int_array.byte_size(),
|
||||
int_array.get_element_type(),
|
||||
{});
|
||||
auto int_8_array = Array(const_cast<char*>(int_array.data()),
|
||||
int_array.byte_size(),
|
||||
|
@ -48,10 +47,9 @@ TEST(Array, TestConstructArray) {
|
|||
{});
|
||||
ASSERT_EQ(int_array.length(), int_16_array.length());
|
||||
ASSERT_TRUE(int_array_tmp == int_array);
|
||||
auto int_array_view = ArrayView(
|
||||
const_cast<char*>(int_array.data()),
|
||||
int_array.byte_size(),
|
||||
int_array.get_element_type(),
|
||||
auto int_array_view = ArrayView(const_cast<char*>(int_array.data()),
|
||||
int_array.byte_size(),
|
||||
int_array.get_element_type(),
|
||||
{});
|
||||
ASSERT_EQ(int_array.length(), int_array_view.length());
|
||||
ASSERT_EQ(int_array.byte_size(), int_array_view.byte_size());
|
||||
|
@ -76,10 +74,9 @@ TEST(Array, TestConstructArray) {
|
|||
long_array.get_element_type(),
|
||||
{});
|
||||
ASSERT_TRUE(long_array_tmp == long_array);
|
||||
auto long_array_view = ArrayView(
|
||||
const_cast<char*>(long_array.data()),
|
||||
long_array.byte_size(),
|
||||
long_array.get_element_type(),
|
||||
auto long_array_view = ArrayView(const_cast<char*>(long_array.data()),
|
||||
long_array.byte_size(),
|
||||
long_array.get_element_type(),
|
||||
{});
|
||||
ASSERT_EQ(long_array.length(), long_array_view.length());
|
||||
ASSERT_EQ(long_array.byte_size(), long_array_view.byte_size());
|
||||
|
@ -114,10 +111,9 @@ TEST(Array, TestConstructArray) {
|
|||
string_array.get_element_type(),
|
||||
std::move(string_element_offsets));
|
||||
ASSERT_TRUE(string_array_tmp == string_array);
|
||||
auto string_array_view = ArrayView(
|
||||
const_cast<char*>(string_array.data()),
|
||||
string_array.byte_size(),
|
||||
string_array.get_element_type(),
|
||||
auto string_array_view = ArrayView(const_cast<char*>(string_array.data()),
|
||||
string_array.byte_size(),
|
||||
string_array.get_element_type(),
|
||||
std::move(string_view_element_offsets));
|
||||
ASSERT_EQ(string_array.length(), string_array_view.length());
|
||||
ASSERT_EQ(string_array.byte_size(), string_array_view.byte_size());
|
||||
|
@ -143,10 +139,9 @@ TEST(Array, TestConstructArray) {
|
|||
bool_array.get_element_type(),
|
||||
{});
|
||||
ASSERT_TRUE(bool_array_tmp == bool_array);
|
||||
auto bool_array_view = ArrayView(
|
||||
const_cast<char*>(bool_array.data()),
|
||||
bool_array.byte_size(),
|
||||
bool_array.get_element_type(),
|
||||
auto bool_array_view = ArrayView(const_cast<char*>(bool_array.data()),
|
||||
bool_array.byte_size(),
|
||||
bool_array.get_element_type(),
|
||||
{});
|
||||
ASSERT_EQ(bool_array.length(), bool_array_view.length());
|
||||
ASSERT_EQ(bool_array.byte_size(), bool_array_view.byte_size());
|
||||
|
@ -172,10 +167,9 @@ TEST(Array, TestConstructArray) {
|
|||
float_array.get_element_type(),
|
||||
{});
|
||||
ASSERT_TRUE(float_array_tmp == float_array);
|
||||
auto float_array_view = ArrayView(
|
||||
const_cast<char*>(float_array.data()),
|
||||
float_array.byte_size(),
|
||||
float_array.get_element_type(),
|
||||
auto float_array_view = ArrayView(const_cast<char*>(float_array.data()),
|
||||
float_array.byte_size(),
|
||||
float_array.get_element_type(),
|
||||
{});
|
||||
ASSERT_EQ(float_array.length(), float_array_view.length());
|
||||
ASSERT_EQ(float_array.byte_size(), float_array_view.byte_size());
|
||||
|
@ -202,10 +196,9 @@ TEST(Array, TestConstructArray) {
|
|||
double_array.get_element_type(),
|
||||
{});
|
||||
ASSERT_TRUE(double_array_tmp == double_array);
|
||||
auto double_array_view = ArrayView(
|
||||
const_cast<char*>(double_array.data()),
|
||||
double_array.byte_size(),
|
||||
double_array.get_element_type(),
|
||||
auto double_array_view = ArrayView(const_cast<char*>(double_array.data()),
|
||||
double_array.byte_size(),
|
||||
double_array.get_element_type(),
|
||||
{});
|
||||
ASSERT_EQ(double_array.length(), double_array_view.length());
|
||||
ASSERT_EQ(double_array.byte_size(), double_array_view.byte_size());
|
||||
|
|
|
@ -198,12 +198,12 @@ TEST(CBoolIndexTest, All) {
|
|||
{ DeleteBinarySet(binary_set); }
|
||||
}
|
||||
|
||||
delete[] (char*)(half_ds->GetTensor());
|
||||
delete[](char*)(half_ds->GetTensor());
|
||||
}
|
||||
|
||||
// TODO: more scalar type.
|
||||
TEST(CInt64IndexTest, All) {
|
||||
auto arr = GenArr<int64_t>(NB);
|
||||
auto arr = GenSortedArr<int64_t>(NB);
|
||||
|
||||
auto params = GenParams<int64_t>();
|
||||
for (const auto& tp : params) {
|
||||
|
@ -315,6 +315,6 @@ TEST(CStringIndexTest, All) {
|
|||
{ DeleteBinarySet(binary_set); }
|
||||
}
|
||||
|
||||
delete[] (char*)(str_ds->GetTensor());
|
||||
delete[](char*)(str_ds->GetTensor());
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,511 @@
|
|||
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
// or implied. See the License for the specific language governing permissions and limitations under the License
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <functional>
|
||||
#include <boost/filesystem.hpp>
|
||||
#include <unordered_set>
|
||||
|
||||
#include "index/InvertedIndexTantivy.h"
|
||||
#include "storage/Util.h"
|
||||
#include "storage/InsertData.h"
|
||||
#include "indexbuilder/IndexFactory.h"
|
||||
#include "index/IndexFactory.h"
|
||||
#include "test_utils/indexbuilder_test_utils.h"
|
||||
#include "index/Meta.h"
|
||||
|
||||
using namespace milvus;
|
||||
|
||||
// TODO: I would suggest that our all indexes use this test to simulate the real production environment.
|
||||
|
||||
namespace milvus::test {
|
||||
auto
|
||||
gen_field_meta(int64_t collection_id = 1,
|
||||
int64_t partition_id = 2,
|
||||
int64_t segment_id = 3,
|
||||
int64_t field_id = 101) -> storage::FieldDataMeta {
|
||||
return storage::FieldDataMeta{
|
||||
.collection_id = collection_id,
|
||||
.partition_id = partition_id,
|
||||
.segment_id = segment_id,
|
||||
.field_id = field_id,
|
||||
};
|
||||
}
|
||||
|
||||
auto
|
||||
gen_index_meta(int64_t segment_id = 3,
|
||||
int64_t field_id = 101,
|
||||
int64_t index_build_id = 1000,
|
||||
int64_t index_version = 10000) -> storage::IndexMeta {
|
||||
return storage::IndexMeta{
|
||||
.segment_id = segment_id,
|
||||
.field_id = field_id,
|
||||
.build_id = index_build_id,
|
||||
.index_version = index_version,
|
||||
};
|
||||
}
|
||||
|
||||
auto
|
||||
gen_local_storage_config(const std::string& root_path)
|
||||
-> storage::StorageConfig {
|
||||
auto ret = storage::StorageConfig{};
|
||||
ret.storage_type = "local";
|
||||
ret.root_path = root_path;
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct ChunkManagerWrapper {
|
||||
ChunkManagerWrapper(storage::ChunkManagerPtr cm) : cm_(cm) {
|
||||
}
|
||||
|
||||
~ChunkManagerWrapper() {
|
||||
for (const auto& file : written_) {
|
||||
cm_->Remove(file);
|
||||
}
|
||||
|
||||
boost::filesystem::remove_all(cm_->GetRootPath());
|
||||
}
|
||||
|
||||
void
|
||||
Write(const std::string& filepath, void* buf, uint64_t len) {
|
||||
written_.insert(filepath);
|
||||
cm_->Write(filepath, buf, len);
|
||||
}
|
||||
|
||||
const storage::ChunkManagerPtr cm_;
|
||||
std::unordered_set<std::string> written_;
|
||||
};
|
||||
} // namespace milvus::test
|
||||
|
||||
// Full production-like round trip for one scalar type: generate data,
// serialize it as a binlog through the chunk manager, build an
// inverted index from that binlog, upload it, load it back through the
// index factory, and verify In/NotIn/Range answers against the raw data.
template <typename T, DataType dtype>
void
test_run() {
    int64_t collection_id = 1;
    int64_t partition_id = 2;
    int64_t segment_id = 3;
    int64_t field_id = 101;
    int64_t index_build_id = 1000;
    int64_t index_version = 10000;

    auto field_meta =
        test::gen_field_meta(collection_id, partition_id, segment_id, field_id);
    auto index_meta = test::gen_index_meta(
        segment_id, field_id, index_build_id, index_version);

    std::string root_path = "/tmp/test-inverted-index/";
    auto storage_config = test::gen_local_storage_config(root_path);
    auto cm = storage::CreateChunkManager(storage_config);

    // generate the raw column: sorted values for comparable types,
    // random coin flips for bool (sorting bools is meaningless here).
    size_t nb = 10000;
    std::vector<T> data_gen;
    boost::container::vector<T> data;
    if constexpr (!std::is_same_v<T, bool>) {
        data_gen = GenSortedArr<T>(nb);
    } else {
        for (size_t i = 0; i < nb; i++) {
            data_gen.push_back(rand() % 2 == 0);
        }
    }
    // boost vector avoids std::vector<bool>'s packed-bit proxy so
    // data.data() is a real T* below.
    for (auto x : data_gen) {
        data.push_back(x);
    }

    // serialize the column the same way a real insert binlog is built.
    auto field_data = storage::CreateFieldData(dtype);
    field_data->FillFieldData(data.data(), data.size());
    storage::InsertData insert_data(field_data);
    insert_data.SetFieldDataMeta(field_meta);
    insert_data.SetTimestamps(0, 100);

    auto serialized_bytes = insert_data.Serialize(storage::Remote);

    // binlog path layout: collection/partition/segment/field/log_id
    auto get_binlog_path = [=](int64_t log_id) {
        return fmt::format("{}/{}/{}/{}/{}",
                           collection_id,
                           partition_id,
                           segment_id,
                           field_id,
                           log_id);
    };

    auto log_path = get_binlog_path(0);

    // wrapper removes the written binlog + storage root on scope exit.
    auto cm_w = test::ChunkManagerWrapper(cm);
    cm_w.Write(log_path, serialized_bytes.data(), serialized_bytes.size());

    storage::FileManagerContext ctx(field_meta, index_meta, cm);
    std::vector<std::string> index_files;

    // build + upload the inverted index from the binlog.
    {
        Config config;
        config["index_type"] = milvus::index::INVERTED_INDEX_TYPE;
        config["insert_files"] = std::vector<std::string>{log_path};

        auto index = indexbuilder::IndexFactory::GetInstance().CreateIndex(
            dtype, config, ctx);
        index->Build();

        auto bs = index->Upload();
        for (const auto& [key, _] : bs.binary_map_) {
            index_files.push_back(key);
        }
    }

    // load the uploaded index back and check query results.
    {
        index::CreateIndexInfo index_info{};
        index_info.index_type = milvus::index::INVERTED_INDEX_TYPE;
        index_info.field_type = dtype;

        Config config;
        config["index_files"] = index_files;

        auto index =
            index::IndexFactory::GetInstance().CreateIndex(index_info, ctx);
        index->Load(config);

        auto cnt = index->Count();
        ASSERT_EQ(cnt, nb);

        using IndexType = index::ScalarIndex<T>;
        auto real_index = dynamic_cast<IndexType*>(index.get());

        if constexpr (!std::is_floating_point_v<T>) {
            // hard to compare floating-point value.
            {
                // In(): a row is set iff its value is one of the queried
                // terms (here: the first nq values of the column).
                boost::container::vector<T> test_data;
                std::unordered_set<T> s;
                size_t nq = 10;
                for (size_t i = 0; i < nq && i < nb; i++) {
                    test_data.push_back(data[i]);
                    s.insert(data[i]);
                }
                auto bitset =
                    real_index->In(test_data.size(), test_data.data());
                ASSERT_EQ(cnt, bitset.size());
                for (size_t i = 0; i < bitset.size(); i++) {
                    ASSERT_EQ(bitset[i], s.find(data[i]) != s.end());
                }
            }

            {
                // NotIn(): exact complement of the In() bitset above.
                boost::container::vector<T> test_data;
                std::unordered_set<T> s;
                size_t nq = 10;
                for (size_t i = 0; i < nq && i < nb; i++) {
                    test_data.push_back(data[i]);
                    s.insert(data[i]);
                }
                auto bitset =
                    real_index->NotIn(test_data.size(), test_data.data());
                ASSERT_EQ(cnt, bitset.size());
                for (size_t i = 0; i < bitset.size(); i++) {
                    ASSERT_NE(bitset[i], s.find(data[i]) != s.end());
                }
            }
        }

        using RefFunc = std::function<bool(int64_t)>;

        if constexpr (!std::is_same_v<T, bool>) {
            // range query on boolean is not reasonable.

            {
                // one-sided ranges: each case pairs an operator with a
                // reference predicate evaluated on the raw column.
                std::vector<std::tuple<T, OpType, RefFunc>> test_cases{
                    {20,
                     OpType::GreaterThan,
                     [&](int64_t i) -> bool { return data[i] > 20; }},
                    {20,
                     OpType::GreaterEqual,
                     [&](int64_t i) -> bool { return data[i] >= 20; }},
                    {20,
                     OpType::LessThan,
                     [&](int64_t i) -> bool { return data[i] < 20; }},
                    {20,
                     OpType::LessEqual,
                     [&](int64_t i) -> bool { return data[i] <= 20; }},
                };
                for (const auto& [test_value, op, ref] : test_cases) {
                    auto bitset = real_index->Range(test_value, op);
                    ASSERT_EQ(cnt, bitset.size());
                    for (size_t i = 0; i < bitset.size(); i++) {
                        auto ans = bitset[i];
                        auto should = ref(i);
                        ASSERT_EQ(ans, should)
                            << "op: " << op << ", @" << i << ", ans: " << ans
                            << ", ref: " << should;
                    }
                }
            }

            {
                // two-sided ranges: all four inclusivity combinations
                // of (1, 20), each checked against the raw column.
                std::vector<std::tuple<T, bool, T, bool, RefFunc>> test_cases{
                    {1,
                     false,
                     20,
                     false,
                     [&](int64_t i) -> bool {
                         return 1 < data[i] && data[i] < 20;
                     }},
                    {1,
                     false,
                     20,
                     true,
                     [&](int64_t i) -> bool {
                         return 1 < data[i] && data[i] <= 20;
                     }},
                    {1,
                     true,
                     20,
                     false,
                     [&](int64_t i) -> bool {
                         return 1 <= data[i] && data[i] < 20;
                     }},
                    {1,
                     true,
                     20,
                     true,
                     [&](int64_t i) -> bool {
                         return 1 <= data[i] && data[i] <= 20;
                     }},
                };
                for (const auto& [lb, lb_inclusive, ub, ub_inclusive, ref] :
                     test_cases) {
                    auto bitset =
                        real_index->Range(lb, lb_inclusive, ub, ub_inclusive);
                    ASSERT_EQ(cnt, bitset.size());
                    for (size_t i = 0; i < bitset.size(); i++) {
                        auto ans = bitset[i];
                        auto should = ref(i);
                        ASSERT_EQ(ans, should) << "@" << i << ", ans: " << ans
                                               << ", ref: " << should;
                    }
                }
            }
        }
    }
}
|
||||
|
||||
void
|
||||
test_string() {
|
||||
using T = std::string;
|
||||
DataType dtype = DataType::VARCHAR;
|
||||
|
||||
int64_t collection_id = 1;
|
||||
int64_t partition_id = 2;
|
||||
int64_t segment_id = 3;
|
||||
int64_t field_id = 101;
|
||||
int64_t index_build_id = 1000;
|
||||
int64_t index_version = 10000;
|
||||
|
||||
auto field_meta =
|
||||
test::gen_field_meta(collection_id, partition_id, segment_id, field_id);
|
||||
auto index_meta = test::gen_index_meta(
|
||||
segment_id, field_id, index_build_id, index_version);
|
||||
|
||||
std::string root_path = "/tmp/test-inverted-index/";
|
||||
auto storage_config = test::gen_local_storage_config(root_path);
|
||||
auto cm = storage::CreateChunkManager(storage_config);
|
||||
|
||||
size_t nb = 10000;
|
||||
boost::container::vector<T> data;
|
||||
for (size_t i = 0; i < nb; i++) {
|
||||
data.push_back(std::to_string(rand()));
|
||||
}
|
||||
|
||||
auto field_data = storage::CreateFieldData(dtype);
|
||||
field_data->FillFieldData(data.data(), data.size());
|
||||
storage::InsertData insert_data(field_data);
|
||||
insert_data.SetFieldDataMeta(field_meta);
|
||||
insert_data.SetTimestamps(0, 100);
|
||||
|
||||
auto serialized_bytes = insert_data.Serialize(storage::Remote);
|
||||
|
||||
auto get_binlog_path = [=](int64_t log_id) {
|
||||
return fmt::format("{}/{}/{}/{}/{}",
|
||||
collection_id,
|
||||
partition_id,
|
||||
segment_id,
|
||||
field_id,
|
||||
log_id);
|
||||
};
|
||||
|
||||
auto log_path = get_binlog_path(0);
|
||||
|
||||
auto cm_w = test::ChunkManagerWrapper(cm);
|
||||
cm_w.Write(log_path, serialized_bytes.data(), serialized_bytes.size());
|
||||
|
||||
storage::FileManagerContext ctx(field_meta, index_meta, cm);
|
||||
std::vector<std::string> index_files;
|
||||
|
||||
{
|
||||
Config config;
|
||||
config["index_type"] = milvus::index::INVERTED_INDEX_TYPE;
|
||||
config["insert_files"] = std::vector<std::string>{log_path};
|
||||
|
||||
auto index = indexbuilder::IndexFactory::GetInstance().CreateIndex(
|
||||
dtype, config, ctx);
|
||||
index->Build();
|
||||
|
||||
auto bs = index->Upload();
|
||||
for (const auto& [key, _] : bs.binary_map_) {
|
||||
index_files.push_back(key);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
index::CreateIndexInfo index_info{};
|
||||
index_info.index_type = milvus::index::INVERTED_INDEX_TYPE;
|
||||
index_info.field_type = dtype;
|
||||
|
||||
Config config;
|
||||
config["index_files"] = index_files;
|
||||
|
||||
auto index =
|
||||
index::IndexFactory::GetInstance().CreateIndex(index_info, ctx);
|
||||
index->Load(config);
|
||||
|
||||
auto cnt = index->Count();
|
||||
ASSERT_EQ(cnt, nb);
|
||||
|
||||
using IndexType = index::ScalarIndex<T>;
|
||||
auto real_index = dynamic_cast<IndexType*>(index.get());
|
||||
|
||||
{
|
||||
boost::container::vector<T> test_data;
|
||||
std::unordered_set<T> s;
|
||||
size_t nq = 10;
|
||||
for (size_t i = 0; i < nq && i < nb; i++) {
|
||||
test_data.push_back(data[i]);
|
||||
s.insert(data[i]);
|
||||
}
|
||||
auto bitset = real_index->In(test_data.size(), test_data.data());
|
||||
ASSERT_EQ(cnt, bitset.size());
|
||||
for (size_t i = 0; i < bitset.size(); i++) {
|
||||
ASSERT_EQ(bitset[i], s.find(data[i]) != s.end());
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
boost::container::vector<T> test_data;
|
||||
std::unordered_set<T> s;
|
||||
size_t nq = 10;
|
||||
for (size_t i = 0; i < nq && i < nb; i++) {
|
||||
test_data.push_back(data[i]);
|
||||
s.insert(data[i]);
|
||||
}
|
||||
auto bitset = real_index->NotIn(test_data.size(), test_data.data());
|
||||
ASSERT_EQ(cnt, bitset.size());
|
||||
for (size_t i = 0; i < bitset.size(); i++) {
|
||||
ASSERT_NE(bitset[i], s.find(data[i]) != s.end());
|
||||
}
|
||||
}
|
||||
|
||||
using RefFunc = std::function<bool(int64_t)>;
|
||||
|
||||
{
|
||||
std::vector<std::tuple<T, OpType, RefFunc>> test_cases{
|
||||
{"20",
|
||||
OpType::GreaterThan,
|
||||
[&](int64_t i) -> bool { return data[i] > "20"; }},
|
||||
{"20",
|
||||
OpType::GreaterEqual,
|
||||
[&](int64_t i) -> bool { return data[i] >= "20"; }},
|
||||
{"20",
|
||||
OpType::LessThan,
|
||||
[&](int64_t i) -> bool { return data[i] < "20"; }},
|
||||
{"20",
|
||||
OpType::LessEqual,
|
||||
[&](int64_t i) -> bool { return data[i] <= "20"; }},
|
||||
};
|
||||
for (const auto& [test_value, op, ref] : test_cases) {
|
||||
auto bitset = real_index->Range(test_value, op);
|
||||
ASSERT_EQ(cnt, bitset.size());
|
||||
for (size_t i = 0; i < bitset.size(); i++) {
|
||||
auto ans = bitset[i];
|
||||
auto should = ref(i);
|
||||
ASSERT_EQ(ans, should)
|
||||
<< "op: " << op << ", @" << i << ", ans: " << ans
|
||||
<< ", ref: " << should;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
std::vector<std::tuple<T, bool, T, bool, RefFunc>> test_cases{
|
||||
{"1",
|
||||
false,
|
||||
"20",
|
||||
false,
|
||||
[&](int64_t i) -> bool {
|
||||
return "1" < data[i] && data[i] < "20";
|
||||
}},
|
||||
{"1",
|
||||
false,
|
||||
"20",
|
||||
true,
|
||||
[&](int64_t i) -> bool {
|
||||
return "1" < data[i] && data[i] <= "20";
|
||||
}},
|
||||
{"1",
|
||||
true,
|
||||
"20",
|
||||
false,
|
||||
[&](int64_t i) -> bool {
|
||||
return "1" <= data[i] && data[i] < "20";
|
||||
}},
|
||||
{"1",
|
||||
true,
|
||||
"20",
|
||||
true,
|
||||
[&](int64_t i) -> bool {
|
||||
return "1" <= data[i] && data[i] <= "20";
|
||||
}},
|
||||
};
|
||||
for (const auto& [lb, lb_inclusive, ub, ub_inclusive, ref] :
|
||||
test_cases) {
|
||||
auto bitset =
|
||||
real_index->Range(lb, lb_inclusive, ub, ub_inclusive);
|
||||
ASSERT_EQ(cnt, bitset.size());
|
||||
for (size_t i = 0; i < bitset.size(); i++) {
|
||||
auto ans = bitset[i];
|
||||
auto should = ref(i);
|
||||
ASSERT_EQ(ans, should)
|
||||
<< "@" << i << ", ans: " << ans << ", ref: " << should;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
auto dataset = std::make_shared<Dataset>();
|
||||
auto prefix = data[0];
|
||||
dataset->Set(index::OPERATOR_TYPE, OpType::PrefixMatch);
|
||||
dataset->Set(index::PREFIX_VALUE, prefix);
|
||||
auto bitset = real_index->Query(dataset);
|
||||
ASSERT_EQ(cnt, bitset.size());
|
||||
for (size_t i = 0; i < bitset.size(); i++) {
|
||||
ASSERT_EQ(bitset[i], boost::starts_with(data[i], prefix));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(InvertedIndex, Naive) {
|
||||
test_run<int8_t, DataType::INT8>();
|
||||
test_run<int16_t, DataType::INT16>();
|
||||
test_run<int32_t, DataType::INT32>();
|
||||
test_run<int64_t, DataType::INT64>();
|
||||
|
||||
test_run<bool, DataType::BOOL>();
|
||||
|
||||
test_run<float, DataType::FLOAT>();
|
||||
test_run<double, DataType::DOUBLE>();
|
||||
|
||||
test_string();
|
||||
}
|
|
@ -22,6 +22,7 @@
|
|||
#include "test_utils/DataGen.h"
|
||||
#include <boost/filesystem.hpp>
|
||||
#include "test_utils/storage_test_utils.h"
|
||||
#include "test_utils/TmpPath.h"
|
||||
|
||||
constexpr int64_t nb = 100;
|
||||
namespace indexcgo = milvus::proto::indexcgo;
|
||||
|
@ -75,7 +76,7 @@ TYPED_TEST_P(TypedScalarIndexTest, Count) {
|
|||
create_index_info);
|
||||
auto scalar_index =
|
||||
dynamic_cast<milvus::index::ScalarIndex<T>*>(index.get());
|
||||
auto arr = GenArr<T>(nb);
|
||||
auto arr = GenSortedArr<T>(nb);
|
||||
scalar_index->Build(nb, arr.data());
|
||||
ASSERT_EQ(nb, scalar_index->Count());
|
||||
}
|
||||
|
@ -94,7 +95,7 @@ TYPED_TEST_P(TypedScalarIndexTest, HasRawData) {
|
|||
create_index_info);
|
||||
auto scalar_index =
|
||||
dynamic_cast<milvus::index::ScalarIndex<T>*>(index.get());
|
||||
auto arr = GenArr<T>(nb);
|
||||
auto arr = GenSortedArr<T>(nb);
|
||||
scalar_index->Build(nb, arr.data());
|
||||
ASSERT_EQ(nb, scalar_index->Count());
|
||||
ASSERT_TRUE(scalar_index->HasRawData());
|
||||
|
@ -114,7 +115,7 @@ TYPED_TEST_P(TypedScalarIndexTest, In) {
|
|||
create_index_info);
|
||||
auto scalar_index =
|
||||
dynamic_cast<milvus::index::ScalarIndex<T>*>(index.get());
|
||||
auto arr = GenArr<T>(nb);
|
||||
auto arr = GenSortedArr<T>(nb);
|
||||
scalar_index->Build(nb, arr.data());
|
||||
assert_in<T>(scalar_index, arr);
|
||||
}
|
||||
|
@ -133,7 +134,7 @@ TYPED_TEST_P(TypedScalarIndexTest, NotIn) {
|
|||
create_index_info);
|
||||
auto scalar_index =
|
||||
dynamic_cast<milvus::index::ScalarIndex<T>*>(index.get());
|
||||
auto arr = GenArr<T>(nb);
|
||||
auto arr = GenSortedArr<T>(nb);
|
||||
scalar_index->Build(nb, arr.data());
|
||||
assert_not_in<T>(scalar_index, arr);
|
||||
}
|
||||
|
@ -152,7 +153,7 @@ TYPED_TEST_P(TypedScalarIndexTest, Reverse) {
|
|||
create_index_info);
|
||||
auto scalar_index =
|
||||
dynamic_cast<milvus::index::ScalarIndex<T>*>(index.get());
|
||||
auto arr = GenArr<T>(nb);
|
||||
auto arr = GenSortedArr<T>(nb);
|
||||
scalar_index->Build(nb, arr.data());
|
||||
assert_reverse<T>(scalar_index, arr);
|
||||
}
|
||||
|
@ -171,7 +172,7 @@ TYPED_TEST_P(TypedScalarIndexTest, Range) {
|
|||
create_index_info);
|
||||
auto scalar_index =
|
||||
dynamic_cast<milvus::index::ScalarIndex<T>*>(index.get());
|
||||
auto arr = GenArr<T>(nb);
|
||||
auto arr = GenSortedArr<T>(nb);
|
||||
scalar_index->Build(nb, arr.data());
|
||||
assert_range<T>(scalar_index, arr);
|
||||
}
|
||||
|
@ -190,7 +191,7 @@ TYPED_TEST_P(TypedScalarIndexTest, Codec) {
|
|||
create_index_info);
|
||||
auto scalar_index =
|
||||
dynamic_cast<milvus::index::ScalarIndex<T>*>(index.get());
|
||||
auto arr = GenArr<T>(nb);
|
||||
auto arr = GenSortedArr<T>(nb);
|
||||
scalar_index->Build(nb, arr.data());
|
||||
|
||||
auto binary_set = index->Serialize(nullptr);
|
||||
|
@ -231,101 +232,90 @@ class TypedScalarIndexTestV2 : public ::testing::Test {
|
|||
struct Helper {};
|
||||
|
||||
protected:
|
||||
std::unordered_map<std::type_index, const std::shared_ptr<arrow::DataType>>
|
||||
m_fields = {{typeid(int8_t), arrow::int8()},
|
||||
{typeid(int16_t), arrow::int16()},
|
||||
{typeid(int32_t), arrow::int32()},
|
||||
{typeid(int64_t), arrow::int64()},
|
||||
{typeid(float), arrow::float32()},
|
||||
{typeid(double), arrow::float64()}};
|
||||
|
||||
std::shared_ptr<arrow::Schema>
|
||||
TestSchema(int vec_size) {
|
||||
arrow::FieldVector fields;
|
||||
fields.push_back(arrow::field("pk", arrow::int64()));
|
||||
fields.push_back(arrow::field("ts", arrow::int64()));
|
||||
fields.push_back(arrow::field("scalar", m_fields[typeid(T)]));
|
||||
fields.push_back(
|
||||
arrow::field("vec", arrow::fixed_size_binary(vec_size)));
|
||||
return std::make_shared<arrow::Schema>(fields);
|
||||
}
|
||||
|
||||
std::shared_ptr<arrow::RecordBatchReader>
|
||||
TestRecords(int vec_size, GeneratedData& dataset, std::vector<T>& scalars) {
|
||||
arrow::Int64Builder pk_builder;
|
||||
arrow::Int64Builder ts_builder;
|
||||
arrow::NumericBuilder<typename Helper::C> scalar_builder;
|
||||
arrow::FixedSizeBinaryBuilder vec_builder(
|
||||
arrow::fixed_size_binary(vec_size));
|
||||
auto xb_data = dataset.get_col<float>(milvus::FieldId(100));
|
||||
auto data = reinterpret_cast<char*>(xb_data.data());
|
||||
for (auto i = 0; i < nb; ++i) {
|
||||
EXPECT_TRUE(pk_builder.Append(i).ok());
|
||||
EXPECT_TRUE(ts_builder.Append(i).ok());
|
||||
EXPECT_TRUE(vec_builder.Append(data + i * vec_size).ok());
|
||||
}
|
||||
for (auto& v : scalars) {
|
||||
EXPECT_TRUE(scalar_builder.Append(v).ok());
|
||||
}
|
||||
std::shared_ptr<arrow::Array> pk_array;
|
||||
EXPECT_TRUE(pk_builder.Finish(&pk_array).ok());
|
||||
std::shared_ptr<arrow::Array> ts_array;
|
||||
EXPECT_TRUE(ts_builder.Finish(&ts_array).ok());
|
||||
std::shared_ptr<arrow::Array> scalar_array;
|
||||
EXPECT_TRUE(scalar_builder.Finish(&scalar_array).ok());
|
||||
std::shared_ptr<arrow::Array> vec_array;
|
||||
EXPECT_TRUE(vec_builder.Finish(&vec_array).ok());
|
||||
auto schema = TestSchema(vec_size);
|
||||
auto rec_batch = arrow::RecordBatch::Make(
|
||||
schema, nb, {pk_array, ts_array, scalar_array, vec_array});
|
||||
auto reader =
|
||||
arrow::RecordBatchReader::Make({rec_batch}, schema).ValueOrDie();
|
||||
return reader;
|
||||
}
|
||||
|
||||
std::shared_ptr<milvus_storage::Space>
|
||||
TestSpace(int vec_size, GeneratedData& dataset, std::vector<T>& scalars) {
|
||||
auto arrow_schema = TestSchema(vec_size);
|
||||
auto schema_options = std::make_shared<milvus_storage::SchemaOptions>();
|
||||
schema_options->primary_column = "pk";
|
||||
schema_options->version_column = "ts";
|
||||
schema_options->vector_column = "vec";
|
||||
auto schema = std::make_shared<milvus_storage::Schema>(arrow_schema,
|
||||
schema_options);
|
||||
EXPECT_TRUE(schema->Validate().ok());
|
||||
|
||||
auto space_res = milvus_storage::Space::Open(
|
||||
"file://" + boost::filesystem::canonical(temp_path).string(),
|
||||
milvus_storage::Options{schema});
|
||||
EXPECT_TRUE(space_res.has_value());
|
||||
|
||||
auto space = std::move(space_res.value());
|
||||
auto rec = TestRecords(vec_size, dataset, scalars);
|
||||
auto write_opt = milvus_storage::WriteOption{nb};
|
||||
space->Write(rec.get(), &write_opt);
|
||||
return std::move(space);
|
||||
}
|
||||
void
|
||||
SetUp() override {
|
||||
temp_path = boost::filesystem::temp_directory_path() /
|
||||
boost::filesystem::unique_path();
|
||||
boost::filesystem::create_directory(temp_path);
|
||||
|
||||
auto vec_size = DIM * 4;
|
||||
auto dataset = GenDataset(nb, knowhere::metric::L2, false);
|
||||
auto scalars = GenArr<T>(nb);
|
||||
space = TestSpace(vec_size, dataset, scalars);
|
||||
}
|
||||
void
|
||||
TearDown() override {
|
||||
boost::filesystem::remove_all(temp_path);
|
||||
}
|
||||
|
||||
protected:
|
||||
boost::filesystem::path temp_path;
|
||||
std::shared_ptr<milvus_storage::Space> space;
|
||||
};
|
||||
|
||||
static std::unordered_map<std::type_index,
|
||||
const std::shared_ptr<arrow::DataType>>
|
||||
m_fields = {{typeid(int8_t), arrow::int8()},
|
||||
{typeid(int16_t), arrow::int16()},
|
||||
{typeid(int32_t), arrow::int32()},
|
||||
{typeid(int64_t), arrow::int64()},
|
||||
{typeid(float), arrow::float32()},
|
||||
{typeid(double), arrow::float64()}};
|
||||
|
||||
template <typename T>
|
||||
std::shared_ptr<arrow::Schema>
|
||||
TestSchema(int vec_size) {
|
||||
arrow::FieldVector fields;
|
||||
fields.push_back(arrow::field("pk", arrow::int64()));
|
||||
fields.push_back(arrow::field("ts", arrow::int64()));
|
||||
fields.push_back(arrow::field("scalar", m_fields[typeid(T)]));
|
||||
fields.push_back(arrow::field("vec", arrow::fixed_size_binary(vec_size)));
|
||||
return std::make_shared<arrow::Schema>(fields);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::shared_ptr<arrow::RecordBatchReader>
|
||||
TestRecords(int vec_size, GeneratedData& dataset, std::vector<T>& scalars) {
|
||||
arrow::Int64Builder pk_builder;
|
||||
arrow::Int64Builder ts_builder;
|
||||
arrow::NumericBuilder<typename TypedScalarIndexTestV2<T>::Helper::C>
|
||||
scalar_builder;
|
||||
arrow::FixedSizeBinaryBuilder vec_builder(
|
||||
arrow::fixed_size_binary(vec_size));
|
||||
auto xb_data = dataset.get_col<float>(milvus::FieldId(100));
|
||||
auto data = reinterpret_cast<char*>(xb_data.data());
|
||||
for (auto i = 0; i < nb; ++i) {
|
||||
EXPECT_TRUE(pk_builder.Append(i).ok());
|
||||
EXPECT_TRUE(ts_builder.Append(i).ok());
|
||||
EXPECT_TRUE(vec_builder.Append(data + i * vec_size).ok());
|
||||
}
|
||||
for (auto& v : scalars) {
|
||||
EXPECT_TRUE(scalar_builder.Append(v).ok());
|
||||
}
|
||||
std::shared_ptr<arrow::Array> pk_array;
|
||||
EXPECT_TRUE(pk_builder.Finish(&pk_array).ok());
|
||||
std::shared_ptr<arrow::Array> ts_array;
|
||||
EXPECT_TRUE(ts_builder.Finish(&ts_array).ok());
|
||||
std::shared_ptr<arrow::Array> scalar_array;
|
||||
EXPECT_TRUE(scalar_builder.Finish(&scalar_array).ok());
|
||||
std::shared_ptr<arrow::Array> vec_array;
|
||||
EXPECT_TRUE(vec_builder.Finish(&vec_array).ok());
|
||||
auto schema = TestSchema<T>(vec_size);
|
||||
auto rec_batch = arrow::RecordBatch::Make(
|
||||
schema, nb, {pk_array, ts_array, scalar_array, vec_array});
|
||||
auto reader =
|
||||
arrow::RecordBatchReader::Make({rec_batch}, schema).ValueOrDie();
|
||||
return reader;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::shared_ptr<milvus_storage::Space>
|
||||
TestSpace(boost::filesystem::path& temp_path,
|
||||
int vec_size,
|
||||
GeneratedData& dataset,
|
||||
std::vector<T>& scalars) {
|
||||
auto arrow_schema = TestSchema<T>(vec_size);
|
||||
auto schema_options = std::make_shared<milvus_storage::SchemaOptions>();
|
||||
schema_options->primary_column = "pk";
|
||||
schema_options->version_column = "ts";
|
||||
schema_options->vector_column = "vec";
|
||||
auto schema =
|
||||
std::make_shared<milvus_storage::Schema>(arrow_schema, schema_options);
|
||||
EXPECT_TRUE(schema->Validate().ok());
|
||||
|
||||
auto space_res = milvus_storage::Space::Open(
|
||||
"file://" + boost::filesystem::canonical(temp_path).string(),
|
||||
milvus_storage::Options{schema});
|
||||
EXPECT_TRUE(space_res.has_value());
|
||||
|
||||
auto space = std::move(space_res.value());
|
||||
auto rec = TestRecords<T>(vec_size, dataset, scalars);
|
||||
auto write_opt = milvus_storage::WriteOption{nb};
|
||||
space->Write(rec.get(), &write_opt);
|
||||
return std::move(space);
|
||||
}
|
||||
|
||||
template <>
|
||||
struct TypedScalarIndexTestV2<int8_t>::Helper {
|
||||
using C = arrow::Int8Type;
|
||||
|
@ -361,7 +351,7 @@ TYPED_TEST_CASE_P(TypedScalarIndexTestV2);
|
|||
TYPED_TEST_P(TypedScalarIndexTestV2, Base) {
|
||||
using T = TypeParam;
|
||||
auto dtype = milvus::GetDType<T>();
|
||||
auto index_types = GetIndexTypes<T>();
|
||||
auto index_types = GetIndexTypesV2<T>();
|
||||
for (const auto& index_type : index_types) {
|
||||
milvus::index::CreateIndexInfo create_index_info;
|
||||
create_index_info.field_type = milvus::DataType(dtype);
|
||||
|
@ -371,11 +361,18 @@ TYPED_TEST_P(TypedScalarIndexTestV2, Base) {
|
|||
auto storage_config = get_default_local_storage_config();
|
||||
auto chunk_manager =
|
||||
milvus::storage::CreateChunkManager(storage_config);
|
||||
|
||||
milvus::test::TmpPath tmp_path;
|
||||
auto temp_path = tmp_path.get();
|
||||
auto vec_size = DIM * 4;
|
||||
auto dataset = GenDataset(nb, knowhere::metric::L2, false);
|
||||
auto scalars = GenSortedArr<T>(nb);
|
||||
auto space = TestSpace<T>(temp_path, vec_size, dataset, scalars);
|
||||
milvus::storage::FileManagerContext file_manager_context(
|
||||
{}, {.field_name = "scalar"}, chunk_manager, this->space);
|
||||
{}, {.field_name = "scalar"}, chunk_manager, space);
|
||||
auto index =
|
||||
milvus::index::IndexFactory::GetInstance().CreateScalarIndex(
|
||||
create_index_info, file_manager_context, this->space);
|
||||
create_index_info, file_manager_context, space);
|
||||
auto scalar_index =
|
||||
dynamic_cast<milvus::index::ScalarIndex<T>*>(index.get());
|
||||
scalar_index->BuildV2();
|
||||
|
@ -383,7 +380,7 @@ TYPED_TEST_P(TypedScalarIndexTestV2, Base) {
|
|||
|
||||
auto new_index =
|
||||
milvus::index::IndexFactory::GetInstance().CreateScalarIndex(
|
||||
create_index_info, file_manager_context, this->space);
|
||||
create_index_info, file_manager_context, space);
|
||||
auto new_scalar_index =
|
||||
dynamic_cast<milvus::index::ScalarIndex<T>*>(new_index.get());
|
||||
new_scalar_index->LoadV2();
|
||||
|
|
|
@ -138,7 +138,7 @@ TYPED_TEST_P(TypedScalarIndexCreatorTest, Codec) {
|
|||
milvus::DataType(dtype),
|
||||
config,
|
||||
milvus::storage::FileManagerContext());
|
||||
auto arr = GenArr<T>(nb);
|
||||
auto arr = GenSortedArr<T>(nb);
|
||||
build_index<T>(creator, arr);
|
||||
auto binary_set = creator->Serialize();
|
||||
auto copy_creator = milvus::indexbuilder::CreateScalarIndex(
|
||||
|
|
|
@ -0,0 +1,35 @@
|
|||
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
// or implied. See the License for the specific language governing permissions and limitations under the License
|
||||
|
||||
#include <boost/filesystem.hpp>
|
||||
#include <string>
|
||||
|
||||
namespace milvus::test {
|
||||
struct TmpPath {
|
||||
TmpPath() {
|
||||
temp_path_ = boost::filesystem::temp_directory_path() /
|
||||
boost::filesystem::unique_path();
|
||||
boost::filesystem::create_directory(temp_path_);
|
||||
}
|
||||
~TmpPath() {
|
||||
boost::filesystem::remove_all(temp_path_);
|
||||
}
|
||||
|
||||
auto
|
||||
get() {
|
||||
return temp_path_;
|
||||
}
|
||||
|
||||
private:
|
||||
boost::filesystem::path temp_path_;
|
||||
};
|
||||
|
||||
} // namespace milvus::test
|
|
@ -18,6 +18,7 @@
|
|||
#include <google/protobuf/text_format.h>
|
||||
|
||||
#include "DataGen.h"
|
||||
#include "index/Meta.h"
|
||||
#include "index/ScalarIndex.h"
|
||||
#include "index/StringIndex.h"
|
||||
#include "index/Utils.h"
|
||||
|
@ -348,7 +349,7 @@ template <typename T,
|
|||
typename = typename std::enable_if_t<std::is_arithmetic_v<T> ||
|
||||
std::is_same_v<T, std::string>>>
|
||||
inline std::vector<T>
|
||||
GenArr(int64_t n) {
|
||||
GenSortedArr(int64_t n) {
|
||||
auto max_i8 = std::numeric_limits<int8_t>::max() - 1;
|
||||
std::vector<T> arr;
|
||||
arr.resize(n);
|
||||
|
@ -374,15 +375,14 @@ GenStrArr(int64_t n) {
|
|||
|
||||
template <>
|
||||
inline std::vector<std::string>
|
||||
GenArr<std::string>(int64_t n) {
|
||||
GenSortedArr<std::string>(int64_t n) {
|
||||
return GenStrArr(n);
|
||||
}
|
||||
|
||||
std::vector<ScalarTestParams>
|
||||
GenBoolParams() {
|
||||
std::vector<ScalarTestParams> ret;
|
||||
ret.emplace_back(
|
||||
ScalarTestParams(MapParams(), {{"index_type", "inverted_index"}}));
|
||||
ret.emplace_back(ScalarTestParams(MapParams(), {{"index_type", "sort"}}));
|
||||
ret.emplace_back(ScalarTestParams(MapParams(), {{"index_type", "flat"}}));
|
||||
return ret;
|
||||
}
|
||||
|
@ -408,8 +408,7 @@ GenParams() {
|
|||
}
|
||||
|
||||
std::vector<ScalarTestParams> ret;
|
||||
ret.emplace_back(
|
||||
ScalarTestParams(MapParams(), {{"index_type", "inverted_index"}}));
|
||||
ret.emplace_back(ScalarTestParams(MapParams(), {{"index_type", "sort"}}));
|
||||
ret.emplace_back(ScalarTestParams(MapParams(), {{"index_type", "flat"}}));
|
||||
return ret;
|
||||
}
|
||||
|
@ -442,13 +441,25 @@ GenDsFromPB(const google::protobuf::Message& msg) {
|
|||
template <typename T>
|
||||
inline std::vector<std::string>
|
||||
GetIndexTypes() {
|
||||
return std::vector<std::string>{"inverted_index"};
|
||||
return std::vector<std::string>{"sort"};
|
||||
}
|
||||
|
||||
template <>
|
||||
inline std::vector<std::string>
|
||||
GetIndexTypes<std::string>() {
|
||||
return std::vector<std::string>{"marisa"};
|
||||
return std::vector<std::string>{"sort", "marisa"};
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline std::vector<std::string>
|
||||
GetIndexTypesV2() {
|
||||
return std::vector<std::string>{"sort", milvus::index::INVERTED_INDEX_TYPE};
|
||||
}
|
||||
|
||||
template <>
|
||||
inline std::vector<std::string>
|
||||
GetIndexTypesV2<std::string>() {
|
||||
return std::vector<std::string>{milvus::index::INVERTED_INDEX_TYPE, "marisa"};
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
|
|
@ -594,7 +594,7 @@ func (it *indexBuildTask) parseFieldMetaFromBinlog(ctx context.Context) error {
|
|||
it.partitionID = partitionID
|
||||
it.segmentID = segmentID
|
||||
for fID, value := range insertData.Data {
|
||||
it.fieldType = indexcgowrapper.GenDataset(value).DType
|
||||
it.fieldType = value.GetDataType()
|
||||
it.fieldID = fID
|
||||
break
|
||||
}
|
||||
|
|
|
@ -160,6 +160,14 @@ func (cit *createIndexTask) parseIndexParams() error {
|
|||
if exist && !validateArithmeticIndexType(specifyIndexType) {
|
||||
return merr.WrapErrParameterInvalid(DefaultArithmeticIndexType, specifyIndexType, "index type not match")
|
||||
}
|
||||
} else if typeutil.IsBoolType(cit.fieldSchema.DataType) {
|
||||
if !exist {
|
||||
return merr.WrapErrParameterInvalidMsg("no index type specified")
|
||||
}
|
||||
if specifyIndexType != InvertedIndexType {
|
||||
return merr.WrapErrParameterInvalidMsg("index type (%s) not supported for boolean, supported: %s",
|
||||
specifyIndexType, InvertedIndexType)
|
||||
}
|
||||
} else {
|
||||
return merr.WrapErrParameterInvalid("supported field",
|
||||
fmt.Sprintf("create index on %s field", cit.fieldSchema.DataType.String()),
|
||||
|
|
|
@ -67,6 +67,8 @@ const (
|
|||
|
||||
// DefaultStringIndexType name of default index type for varChar/string field
|
||||
DefaultStringIndexType = "Trie"
|
||||
|
||||
InvertedIndexType = "INVERTED"
|
||||
)
|
||||
|
||||
var logger = log.L().WithOptions(zap.Fields(zap.String("role", typeutil.ProxyRole)))
|
||||
|
@ -247,12 +249,12 @@ func validatePartitionTag(partitionTag string, strictCheck bool) error {
|
|||
|
||||
func validateStringIndexType(indexType string) bool {
|
||||
// compatible with the index type marisa-trie of attu versions prior to 2.3.0
|
||||
return indexType == DefaultStringIndexType || indexType == "marisa-trie"
|
||||
return indexType == DefaultStringIndexType || indexType == "marisa-trie" || indexType == InvertedIndexType
|
||||
}
|
||||
|
||||
func validateArithmeticIndexType(indexType string) bool {
|
||||
// compatible with the index type Asceneding of attu versions prior to 2.3.0
|
||||
return indexType == DefaultArithmeticIndexType || indexType == "Asceneding"
|
||||
return indexType == DefaultArithmeticIndexType || indexType == "Asceneding" || indexType == InvertedIndexType
|
||||
}
|
||||
|
||||
func validateFieldName(fieldName string) error {
|
||||
|
|
|
@ -606,6 +606,7 @@ func (insertCodec *InsertCodec) DeserializeInto(fieldBinlogs []*Blob, rowNum int
|
|||
stringFieldData := insertData.Data[fieldID].(*StringFieldData)
|
||||
|
||||
stringFieldData.Data = append(stringFieldData.Data, stringPayload...)
|
||||
stringFieldData.DataType = dataType
|
||||
totalLength += len(stringPayload)
|
||||
insertData.Data[fieldID] = stringFieldData
|
||||
|
||||
|
|
|
@ -330,7 +330,7 @@ func TestInsertCodec(t *testing.T) {
|
|||
Int64Field: &Int64FieldData{[]int64{}},
|
||||
FloatField: &FloatFieldData{[]float32{}},
|
||||
DoubleField: &DoubleFieldData{[]float64{}},
|
||||
StringField: &StringFieldData{[]string{}},
|
||||
StringField: &StringFieldData{[]string{}, schemapb.DataType_VarChar},
|
||||
BinaryVectorField: &BinaryVectorFieldData{[]byte{}, 8},
|
||||
FloatVectorField: &FloatVectorFieldData{[]float32{}, 4},
|
||||
Float16VectorField: &Float16VectorFieldData{[]byte{}, 4},
|
||||
|
@ -706,7 +706,7 @@ func TestMemorySize(t *testing.T) {
|
|||
Int64Field: &Int64FieldData{[]int64{}},
|
||||
FloatField: &FloatFieldData{[]float32{}},
|
||||
DoubleField: &DoubleFieldData{[]float64{}},
|
||||
StringField: &StringFieldData{[]string{}},
|
||||
StringField: &StringFieldData{[]string{}, schemapb.DataType_VarChar},
|
||||
BinaryVectorField: &BinaryVectorFieldData{[]byte{}, 8},
|
||||
FloatVectorField: &FloatVectorFieldData{[]float32{}, 4},
|
||||
},
|
||||
|
|
|
@ -121,6 +121,7 @@ type FieldData interface {
|
|||
RowNum() int
|
||||
GetRow(i int) any
|
||||
AppendRow(row interface{}) error
|
||||
GetDataType() schemapb.DataType
|
||||
}
|
||||
|
||||
func NewFieldData(dataType schemapb.DataType, fieldSchema *schemapb.FieldSchema) (FieldData, error) {
|
||||
|
@ -198,7 +199,8 @@ func NewFieldData(dataType schemapb.DataType, fieldSchema *schemapb.FieldSchema)
|
|||
}, nil
|
||||
case schemapb.DataType_String, schemapb.DataType_VarChar:
|
||||
return &StringFieldData{
|
||||
Data: make([]string, 0),
|
||||
Data: make([]string, 0),
|
||||
DataType: dataType,
|
||||
}, nil
|
||||
default:
|
||||
return nil, fmt.Errorf("Unexpected schema data type: %d", dataType)
|
||||
|
@ -227,7 +229,8 @@ type DoubleFieldData struct {
|
|||
Data []float64
|
||||
}
|
||||
type StringFieldData struct {
|
||||
Data []string
|
||||
Data []string
|
||||
DataType schemapb.DataType
|
||||
}
|
||||
type ArrayFieldData struct {
|
||||
ElementType schemapb.DataType
|
||||
|
@ -417,6 +420,29 @@ func (data *BinaryVectorFieldData) GetMemorySize() int { return binary.Size(dat
|
|||
func (data *FloatVectorFieldData) GetMemorySize() int { return binary.Size(data.Data) + 4 }
|
||||
func (data *Float16VectorFieldData) GetMemorySize() int { return binary.Size(data.Data) + 4 }
|
||||
|
||||
// GetDataType implements FieldData.GetDataType
|
||||
func (data *BoolFieldData) GetDataType() schemapb.DataType { return schemapb.DataType_Bool }
|
||||
func (data *Int8FieldData) GetDataType() schemapb.DataType { return schemapb.DataType_Int8 }
|
||||
func (data *Int16FieldData) GetDataType() schemapb.DataType { return schemapb.DataType_Int16 }
|
||||
func (data *Int32FieldData) GetDataType() schemapb.DataType { return schemapb.DataType_Int32 }
|
||||
func (data *Int64FieldData) GetDataType() schemapb.DataType { return schemapb.DataType_Int64 }
|
||||
func (data *FloatFieldData) GetDataType() schemapb.DataType { return schemapb.DataType_Float }
|
||||
func (data *DoubleFieldData) GetDataType() schemapb.DataType { return schemapb.DataType_Double }
|
||||
func (data *StringFieldData) GetDataType() schemapb.DataType { return data.DataType }
|
||||
func (data *ArrayFieldData) GetDataType() schemapb.DataType { return schemapb.DataType_Array }
|
||||
func (data *JSONFieldData) GetDataType() schemapb.DataType { return schemapb.DataType_JSON }
|
||||
func (data *BinaryVectorFieldData) GetDataType() schemapb.DataType {
|
||||
return schemapb.DataType_BinaryVector
|
||||
}
|
||||
|
||||
func (data *FloatVectorFieldData) GetDataType() schemapb.DataType {
|
||||
return schemapb.DataType_FloatVector
|
||||
}
|
||||
|
||||
func (data *Float16VectorFieldData) GetDataType() schemapb.DataType {
|
||||
return schemapb.DataType_Float16Vector
|
||||
}
|
||||
|
||||
// why not binary.Size(data) directly? binary.Size(data) return -1
|
||||
// binary.Size returns how many bytes Write would generate to encode the value v, which
|
||||
// must be a fixed-size value or a slice of fixed-size values, or a pointer to such data.
|
||||
|
|
|
@ -147,6 +147,14 @@ func (s *InsertDataSuite) TestMemorySize() {
|
|||
s.Equal(s.iDataTwoRows.Data[Float16VectorField].GetMemorySize(), 20)
|
||||
}
|
||||
|
||||
func (s *InsertDataSuite) TestGetDataType() {
|
||||
for _, field := range s.schema.GetFields() {
|
||||
fieldData, ok := s.iDataOneRow.Data[field.GetFieldID()]
|
||||
s.True(ok)
|
||||
s.Equal(field.GetDataType(), fieldData.GetDataType())
|
||||
}
|
||||
}
|
||||
|
||||
func (s *InsertDataSuite) SetupTest() {
|
||||
var err error
|
||||
s.iDataEmpty, err = NewInsertData(s.schema)
|
||||
|
|
|
@ -171,7 +171,7 @@ func genScalarIndexCases(dtype schemapb.DataType) []indexTestCase {
|
|||
dtype: dtype,
|
||||
typeParams: nil,
|
||||
indexParams: map[string]string{
|
||||
common.IndexTypeKey: "inverted_index",
|
||||
common.IndexTypeKey: "sort",
|
||||
},
|
||||
},
|
||||
{
|
||||
|
@ -190,7 +190,7 @@ func genStringIndexCases(dtype schemapb.DataType) []indexTestCase {
|
|||
dtype: dtype,
|
||||
typeParams: nil,
|
||||
indexParams: map[string]string{
|
||||
common.IndexTypeKey: "inverted_index",
|
||||
common.IndexTypeKey: "sort",
|
||||
},
|
||||
},
|
||||
{
|
||||
|
|
|
@ -85,7 +85,7 @@ func GenDataset(data storage.FieldData) *Dataset {
|
|||
}
|
||||
case *storage.StringFieldData:
|
||||
return &Dataset{
|
||||
DType: schemapb.DataType_String,
|
||||
DType: schemapb.DataType_VarChar,
|
||||
Data: map[string]interface{}{
|
||||
keyRawArr: f.Data,
|
||||
},
|
||||
|
|
Loading…
Reference in New Issue