fix: enable to build index with single segment (#39233)

fix https://github.com/milvus-io/milvus/issues/39232

---------

Signed-off-by: SpadeA-Tang <tangchenjie1210@gmail.com>
pull/39079/head^2
Spade A 2025-01-16 11:01:06 +08:00 committed by GitHub
parent bca2a62b78
commit 8c4ba70a4c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
32 changed files with 1199 additions and 246 deletions

View File

@ -45,9 +45,9 @@ class IndexBase {
Load(milvus::tracer::TraceContext ctx, const Config& config = {}) = 0;
virtual void
BuildWithRawData(size_t n,
const void* values,
const Config& config = {}) = 0;
BuildWithRawDataForUT(size_t n,
const void* values,
const Config& config = {}) = 0;
virtual void
BuildWithDataset(const DatasetPtr& dataset, const Config& config = {}) = 0;

View File

@ -35,10 +35,14 @@ namespace milvus::index {
template <typename T>
ScalarIndexPtr<T>
IndexFactory::CreatePrimitiveScalarIndex(
const IndexType& index_type,
const CreateIndexInfo& create_index_info,
const storage::FileManagerContext& file_manager_context) {
auto index_type = create_index_info.index_type;
if (index_type == INVERTED_INDEX_TYPE) {
return std::make_unique<InvertedIndexTantivy<T>>(file_manager_context);
// scalar_index_engine_version 0 means we should build the tantivy index within a single segment
return std::make_unique<InvertedIndexTantivy<T>>(
file_manager_context,
create_index_info.scalar_index_engine_version == 0);
}
if (index_type == BITMAP_INDEX_TYPE) {
return std::make_unique<BitmapIndex<T>>(file_manager_context);
@ -59,12 +63,15 @@ IndexFactory::CreatePrimitiveScalarIndex(
template <>
ScalarIndexPtr<std::string>
IndexFactory::CreatePrimitiveScalarIndex<std::string>(
const IndexType& index_type,
const CreateIndexInfo& create_index_info,
const storage::FileManagerContext& file_manager_context) {
auto index_type = create_index_info.index_type;
#if defined(__linux__) || defined(__APPLE__)
if (index_type == INVERTED_INDEX_TYPE) {
// scalar_index_engine_version 0 means we should build the tantivy index within a single segment
return std::make_unique<InvertedIndexTantivy<std::string>>(
file_manager_context);
file_manager_context,
create_index_info.scalar_index_engine_version == 0);
}
if (index_type == BITMAP_INDEX_TYPE) {
return std::make_unique<BitmapIndex<std::string>>(file_manager_context);
@ -294,37 +301,37 @@ IndexFactory::CreateIndex(
IndexBasePtr
IndexFactory::CreatePrimitiveScalarIndex(
DataType data_type,
IndexType index_type,
const CreateIndexInfo& create_index_info,
const storage::FileManagerContext& file_manager_context) {
switch (data_type) {
// create scalar index
case DataType::BOOL:
return CreatePrimitiveScalarIndex<bool>(index_type,
return CreatePrimitiveScalarIndex<bool>(create_index_info,
file_manager_context);
case DataType::INT8:
return CreatePrimitiveScalarIndex<int8_t>(index_type,
return CreatePrimitiveScalarIndex<int8_t>(create_index_info,
file_manager_context);
case DataType::INT16:
return CreatePrimitiveScalarIndex<int16_t>(index_type,
return CreatePrimitiveScalarIndex<int16_t>(create_index_info,
file_manager_context);
case DataType::INT32:
return CreatePrimitiveScalarIndex<int32_t>(index_type,
return CreatePrimitiveScalarIndex<int32_t>(create_index_info,
file_manager_context);
case DataType::INT64:
return CreatePrimitiveScalarIndex<int64_t>(index_type,
return CreatePrimitiveScalarIndex<int64_t>(create_index_info,
file_manager_context);
case DataType::FLOAT:
return CreatePrimitiveScalarIndex<float>(index_type,
return CreatePrimitiveScalarIndex<float>(create_index_info,
file_manager_context);
case DataType::DOUBLE:
return CreatePrimitiveScalarIndex<double>(index_type,
return CreatePrimitiveScalarIndex<double>(create_index_info,
file_manager_context);
// create string index
case DataType::STRING:
case DataType::VARCHAR:
return CreatePrimitiveScalarIndex<std::string>(
index_type, file_manager_context);
create_index_info, file_manager_context);
default:
PanicInfo(
DataTypeInvalid,
@ -334,14 +341,15 @@ IndexFactory::CreatePrimitiveScalarIndex(
IndexBasePtr
IndexFactory::CreateCompositeScalarIndex(
IndexType index_type,
const CreateIndexInfo& create_index_info,
const storage::FileManagerContext& file_manager_context) {
auto index_type = create_index_info.index_type;
if (index_type == HYBRID_INDEX_TYPE || index_type == BITMAP_INDEX_TYPE ||
index_type == INVERTED_INDEX_TYPE) {
auto element_type = static_cast<DataType>(
file_manager_context.fieldDataMeta.field_schema.element_type());
return CreatePrimitiveScalarIndex(
element_type, index_type, file_manager_context);
element_type, create_index_info, file_manager_context);
} else {
PanicInfo(
Unsupported,
@ -373,9 +381,9 @@ IndexFactory::CreateScalarIndex(
case DataType::VARCHAR:
case DataType::STRING:
return CreatePrimitiveScalarIndex(
data_type, create_index_info.index_type, file_manager_context);
data_type, create_index_info, file_manager_context);
case DataType::ARRAY: {
return CreateCompositeScalarIndex(create_index_info.index_type,
return CreateCompositeScalarIndex(create_index_info,
file_manager_context);
}
case DataType::JSON: {

View File

@ -85,14 +85,14 @@ class IndexFactory {
IndexBasePtr
CreatePrimitiveScalarIndex(
DataType data_type,
IndexType index_type,
const CreateIndexInfo& create_index_info,
const storage::FileManagerContext& file_manager_context =
storage::FileManagerContext());
// For types like array, struct, union, etc
IndexBasePtr
CreateCompositeScalarIndex(
IndexType index_type,
const CreateIndexInfo& create_index_info,
const storage::FileManagerContext& file_manager_context =
storage::FileManagerContext());
@ -115,7 +115,7 @@ class IndexFactory {
template <typename T>
ScalarIndexPtr<T>
CreatePrimitiveScalarIndex(const IndexType& index_type,
CreatePrimitiveScalarIndex(const CreateIndexInfo& create_index_info,
const storage::FileManagerContext& file_manager =
storage::FileManagerContext());
};

View File

@ -26,6 +26,7 @@ struct CreateIndexInfo {
IndexVersion index_engine_version;
std::string field_name;
int64_t dim;
int32_t scalar_index_engine_version;
};
} // namespace milvus::index

View File

@ -84,14 +84,15 @@ InvertedIndexTantivy<T>::InitForBuildIndex() {
path_);
}
wrapper_ = std::make_shared<TantivyIndexWrapper>(
field.c_str(), d_type_, path_.c_str());
field.c_str(), d_type_, path_.c_str(), inverted_index_single_segment_);
}
template <typename T>
InvertedIndexTantivy<T>::InvertedIndexTantivy(
const storage::FileManagerContext& ctx)
const storage::FileManagerContext& ctx, bool inverted_index_single_segment)
: ScalarIndex<T>(INVERTED_INDEX_TYPE),
schema_(ctx.fieldDataMeta.field_schema) {
schema_(ctx.fieldDataMeta.field_schema),
inverted_index_single_segment_(inverted_index_single_segment) {
mem_file_manager_ = std::make_shared<MemFileManager>(ctx);
disk_file_manager_ = std::make_shared<DiskFileManager>(ctx);
// push init wrapper to load process
@ -387,9 +388,9 @@ InvertedIndexTantivy<T>::RegexQuery(const std::string& regex_pattern) {
template <typename T>
void
InvertedIndexTantivy<T>::BuildWithRawData(size_t n,
const void* values,
const Config& config) {
InvertedIndexTantivy<T>::BuildWithRawDataForUT(size_t n,
const void* values,
const Config& config) {
if constexpr (std::is_same_v<bool, T>) {
schema_.set_data_type(proto::schema::DataType::Bool);
}
@ -421,16 +422,35 @@ InvertedIndexTantivy<T>::BuildWithRawData(size_t n,
boost::filesystem::create_directories(path_);
d_type_ = get_tantivy_data_type(schema_);
std::string field = "test_inverted_index";
inverted_index_single_segment_ =
GetValueFromConfig<int32_t>(config,
milvus::index::SCALAR_INDEX_ENGINE_VERSION)
.value_or(1) == 0;
wrapper_ = std::make_shared<TantivyIndexWrapper>(
field.c_str(), d_type_, path_.c_str());
if (config.find("is_array") != config.end()) {
// only used in ut.
auto arr = static_cast<const boost::container::vector<T>*>(values);
for (size_t i = 0; i < n; i++) {
wrapper_->template add_multi_data(arr[i].data(), arr[i].size(), i);
field.c_str(), d_type_, path_.c_str(), inverted_index_single_segment_);
if (!inverted_index_single_segment_) {
if (config.find("is_array") != config.end()) {
// only used in ut.
auto arr = static_cast<const boost::container::vector<T>*>(values);
for (size_t i = 0; i < n; i++) {
wrapper_->template add_multi_data(
arr[i].data(), arr[i].size(), i);
}
} else {
wrapper_->add_data<T>(static_cast<const T*>(values), n, 0);
}
} else {
wrapper_->add_data<T>(static_cast<const T*>(values), n, 0);
if (config.find("is_array") != config.end()) {
// only used in ut.
auto arr = static_cast<const boost::container::vector<T>*>(values);
for (size_t i = 0; i < n; i++) {
wrapper_->template add_multi_data_by_single_segment_writer(
arr[i].data(), arr[i].size());
}
} else {
wrapper_->add_data_by_single_segment_writer<T>(
static_cast<const T*>(values), n);
}
}
wrapper_->create_reader();
finish();
@ -458,26 +478,48 @@ InvertedIndexTantivy<T>::BuildWithFieldData(
case proto::schema::DataType::Double:
case proto::schema::DataType::String:
case proto::schema::DataType::VarChar: {
int64_t offset = 0;
if (schema_.nullable()) {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
for (int i = 0; i < n; i++) {
if (!data->is_valid(i)) {
null_offset.push_back(i);
// Generally, we do not build the inverted index with a single segment; the exception is building an index
// for a query node of an older version (2.4). See the comments above the declaration of `inverted_index_single_segment_`.
if (!inverted_index_single_segment_) {
int64_t offset = 0;
if (schema_.nullable()) {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
for (int i = 0; i < n; i++) {
if (!data->is_valid(i)) {
null_offset.push_back(i);
}
wrapper_->add_multi_data<T>(
static_cast<const T*>(data->RawValue(i)),
data->is_valid(i),
offset++);
}
wrapper_->add_multi_data<T>(
static_cast<const T*>(data->RawValue(i)),
data->is_valid(i),
offset++);
}
} else {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
wrapper_->add_data<T>(
static_cast<const T*>(data->Data()), n, offset);
offset += n;
}
}
} else {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
wrapper_->add_data<T>(
static_cast<const T*>(data->Data()), n, offset);
offset += n;
if (schema_.nullable()) {
for (int i = 0; i < n; i++) {
if (!data->is_valid(i)) {
null_offset.push_back(i);
}
wrapper_
->add_multi_data_by_single_segment_writer<T>(
static_cast<const T*>(data->RawValue(i)),
data->is_valid(i));
}
continue;
}
wrapper_->add_data_by_single_segment_writer<T>(
static_cast<const T*>(data->Data()), n);
}
}
break;
@ -508,10 +550,15 @@ InvertedIndexTantivy<T>::build_index_for_array(
null_offset.push_back(i);
}
auto length = data->is_valid(i) ? array_column[i].length() : 0;
wrapper_->template add_multi_data(
reinterpret_cast<const T*>(array_column[i].data()),
length,
offset++);
if (!inverted_index_single_segment_) {
wrapper_->template add_multi_data(
reinterpret_cast<const T*>(array_column[i].data()),
length,
offset++);
} else {
wrapper_->template add_multi_data_by_single_segment_writer(
reinterpret_cast<const T*>(array_column[i].data()), length);
}
}
}
}
@ -537,7 +584,13 @@ InvertedIndexTantivy<std::string>::build_index_for_array(
array_column[i].template get_data<std::string>(j));
}
auto length = data->is_valid(i) ? output.size() : 0;
wrapper_->template add_multi_data(output.data(), length, offset++);
if (!inverted_index_single_segment_) {
wrapper_->template add_multi_data(
output.data(), length, offset++);
} else {
wrapper_->template add_multi_data_by_single_segment_writer(
output.data(), length);
}
}
}
}

View File

@ -38,7 +38,8 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
InvertedIndexTantivy() : ScalarIndex<T>(INVERTED_INDEX_TYPE) {
}
explicit InvertedIndexTantivy(const storage::FileManagerContext& ctx);
explicit InvertedIndexTantivy(const storage::FileManagerContext& ctx,
bool inverted_index_single_segment = false);
~InvertedIndexTantivy();
@ -80,11 +81,11 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
return wrapper_->count();
}
// BuildWithRawData should be only used in ut. Only string is supported.
// BuildWithRawDataForUT should be only used in ut. Only string is supported.
void
BuildWithRawData(size_t n,
const void* values,
const Config& config = {}) override;
BuildWithRawDataForUT(size_t n,
const void* values,
const Config& config = {}) override;
BinarySet
Serialize(const Config& config) override;
@ -205,5 +206,14 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
// all data need to be built to align the offset
// so need to store null_offset in inverted index additionally
std::vector<size_t> null_offset{};
// `inverted_index_single_segment_` controls whether the tantivy index is built with a single segment.
//
// In older versions of milvus, the query node can only read a tantivy index built within a single segment,
// whereas newer versions build and read multi-segment indexes by default.
// However, the index may be built on a node separate from the query node, where the index building node runs a
// newer version while the query node runs an older version. So we have this `inverted_index_single_segment_` to tell the index
// building node which type of tantivy index to build.
bool inverted_index_single_segment_{false};
};
} // namespace milvus::index

View File

@ -46,6 +46,8 @@ constexpr const char* MARISA_TRIE_UPPER = "TRIE";
constexpr const char* INVERTED_INDEX_TYPE = "INVERTED";
constexpr const char* BITMAP_INDEX_TYPE = "BITMAP";
constexpr const char* HYBRID_INDEX_TYPE = "HYBRID";
// Config key under which the scalar index engine version is looked up.
constexpr const char* SCALAR_INDEX_ENGINE_VERSION =
"scalar_index_engine_version";
// index meta
constexpr const char* COLLECTION_ID = "collection_id";

View File

@ -72,9 +72,9 @@ ScalarIndex<T>::Query(const DatasetPtr& dataset) {
template <>
void
ScalarIndex<std::string>::BuildWithRawData(size_t n,
const void* values,
const Config& config) {
ScalarIndex<std::string>::BuildWithRawDataForUT(size_t n,
const void* values,
const Config& config) {
proto::schema::StringArray arr;
auto ok = arr.ParseFromArray(values, n);
Assert(ok);
@ -86,9 +86,9 @@ ScalarIndex<std::string>::BuildWithRawData(size_t n,
template <>
void
ScalarIndex<bool>::BuildWithRawData(size_t n,
const void* values,
const Config& config) {
ScalarIndex<bool>::BuildWithRawDataForUT(size_t n,
const void* values,
const Config& config) {
proto::schema::BoolArray arr;
auto ok = arr.ParseFromArray(values, n);
Assert(ok);
@ -97,54 +97,54 @@ ScalarIndex<bool>::BuildWithRawData(size_t n,
template <>
void
ScalarIndex<int8_t>::BuildWithRawData(size_t n,
const void* values,
const Config& config) {
ScalarIndex<int8_t>::BuildWithRawDataForUT(size_t n,
const void* values,
const Config& config) {
auto data = reinterpret_cast<int8_t*>(const_cast<void*>(values));
Build(n, data);
}
template <>
void
ScalarIndex<int16_t>::BuildWithRawData(size_t n,
const void* values,
const Config& config) {
ScalarIndex<int16_t>::BuildWithRawDataForUT(size_t n,
const void* values,
const Config& config) {
auto data = reinterpret_cast<int16_t*>(const_cast<void*>(values));
Build(n, data);
}
template <>
void
ScalarIndex<int32_t>::BuildWithRawData(size_t n,
const void* values,
const Config& config) {
ScalarIndex<int32_t>::BuildWithRawDataForUT(size_t n,
const void* values,
const Config& config) {
auto data = reinterpret_cast<int32_t*>(const_cast<void*>(values));
Build(n, data);
}
template <>
void
ScalarIndex<int64_t>::BuildWithRawData(size_t n,
const void* values,
const Config& config) {
ScalarIndex<int64_t>::BuildWithRawDataForUT(size_t n,
const void* values,
const Config& config) {
auto data = reinterpret_cast<int64_t*>(const_cast<void*>(values));
Build(n, data);
}
template <>
void
ScalarIndex<float>::BuildWithRawData(size_t n,
const void* values,
const Config& config) {
ScalarIndex<float>::BuildWithRawDataForUT(size_t n,
const void* values,
const Config& config) {
auto data = reinterpret_cast<float*>(const_cast<void*>(values));
Build(n, data);
}
template <>
void
ScalarIndex<double>::BuildWithRawData(size_t n,
const void* values,
const Config& config) {
ScalarIndex<double>::BuildWithRawDataForUT(size_t n,
const void* values,
const Config& config) {
auto data = reinterpret_cast<double*>(const_cast<void*>(values));
Build(n, data);
}

View File

@ -65,9 +65,9 @@ class ScalarIndex : public IndexBase {
}
void
BuildWithRawData(size_t n,
const void* values,
const Config& config = {}) override;
BuildWithRawDataForUT(size_t n,
const void* values,
const Config& config = {}) override;
void
BuildWithDataset(const DatasetPtr& dataset,

View File

@ -42,9 +42,9 @@ class VectorIndex : public IndexBase {
public:
void
BuildWithRawData(size_t n,
const void* values,
const Config& config = {}) override {
BuildWithRawDataForUT(size_t n,
const void* values,
const Config& config = {}) override {
PanicInfo(Unsupported,
"vector index don't support build index with raw data");
};

View File

@ -30,6 +30,13 @@ ScalarIndexCreator::ScalarIndexCreator(
if (config.contains("index_type")) {
index_type_ = config.at("index_type").get<std::string>();
}
// The config should carry a value for milvus::index::SCALAR_INDEX_ENGINE_VERSION in the production call chain.
// value_or(1) covers unit tests that do not set this value.
index_info.scalar_index_engine_version =
milvus::index::GetValueFromConfig<int32_t>(
config, milvus::index::SCALAR_INDEX_ENGINE_VERSION)
.value_or(1);
index_info.field_type = dtype_;
index_info.index_type = index_type();
index_ = index::IndexFactory::GetInstance().CreateIndex(
@ -40,7 +47,7 @@ void
ScalarIndexCreator::Build(const milvus::DatasetPtr& dataset) {
auto size = dataset->GetRows();
auto data = dataset->GetTensor();
index_->BuildWithRawData(size, data);
index_->BuildWithRawDataForUT(size, data);
}
void

View File

@ -166,30 +166,17 @@ CreateIndex(CIndex* res_index,
auto field_type =
static_cast<DataType>(build_index_info->field_schema().data_type());
milvus::index::CreateIndexInfo index_info;
index_info.field_type = field_type;
auto storage_config =
get_storage_config(build_index_info->storage_config());
auto config = get_config(build_index_info);
// get index type
auto index_type = milvus::index::GetValueFromConfig<std::string>(
config, "index_type");
AssertInfo(index_type.has_value(), "index type is empty");
index_info.index_type = index_type.value();
auto engine_version = build_index_info->current_index_version();
index_info.index_engine_version = engine_version;
config[milvus::index::INDEX_ENGINE_VERSION] =
std::to_string(engine_version);
// get metric type
if (milvus::IsVectorDataType(field_type)) {
auto metric_type = milvus::index::GetValueFromConfig<std::string>(
config, "metric_type");
AssertInfo(metric_type.has_value(), "metric type is empty");
index_info.metric_type = metric_type.value();
}
auto scalar_index_engine_version =
build_index_info->current_scalar_index_version();
config[milvus::index::SCALAR_INDEX_ENGINE_VERSION] =
scalar_index_engine_version;
// init file manager
milvus::storage::FieldDataMeta field_meta{

View File

@ -1193,6 +1193,7 @@ name = "tantivy-binding"
version = "0.1.0"
dependencies = [
"cbindgen",
"either",
"env_logger",
"futures",
"jieba-rs",

View File

@ -17,6 +17,7 @@ lazy_static = "1.4.0"
serde_json = "1.0.128"
jieba-rs = "0.6.8"
regex = "1.11.1"
either = "1.13.0"
[build-dependencies]
cbindgen = "0.26.0"

View File

@ -149,6 +149,10 @@ RustResult tantivy_create_index(const char *field_name,
uintptr_t num_threads,
uintptr_t overall_memory_budget_in_bytes);
RustResult tantivy_create_index_with_single_segment(const char *field_name,
TantivyDataType data_type,
const char *path);
void tantivy_free_index_writer(void *ptr);
RustResult tantivy_finish_index(void *ptr);
@ -162,78 +166,140 @@ RustResult tantivy_index_add_int8s(void *ptr,
uintptr_t len,
int64_t offset_begin);
RustResult tantivy_index_add_int8s_by_single_segment_writer(void *ptr,
const int8_t *array,
uintptr_t len);
RustResult tantivy_index_add_int16s(void *ptr,
const int16_t *array,
uintptr_t len,
int64_t offset_begin);
RustResult tantivy_index_add_int16s_by_single_segment_writer(void *ptr,
const int16_t *array,
uintptr_t len);
RustResult tantivy_index_add_int32s(void *ptr,
const int32_t *array,
uintptr_t len,
int64_t offset_begin);
RustResult tantivy_index_add_int32s_by_single_segment_writer(void *ptr,
const int32_t *array,
uintptr_t len);
RustResult tantivy_index_add_int64s(void *ptr,
const int64_t *array,
uintptr_t len,
int64_t offset_begin);
RustResult tantivy_index_add_int64s_by_single_segment_writer(void *ptr,
const int64_t *array,
uintptr_t len);
RustResult tantivy_index_add_f32s(void *ptr,
const float *array,
uintptr_t len,
int64_t offset_begin);
RustResult tantivy_index_add_f32s_by_single_segment_writer(void *ptr,
const float *array,
uintptr_t len);
RustResult tantivy_index_add_f64s(void *ptr,
const double *array,
uintptr_t len,
int64_t offset_begin);
RustResult tantivy_index_add_f64s_by_single_segment_writer(void *ptr,
const double *array,
uintptr_t len);
RustResult tantivy_index_add_bools(void *ptr,
const bool *array,
uintptr_t len,
int64_t offset_begin);
RustResult tantivy_index_add_bools_by_single_segment_writer(void *ptr,
const bool *array,
uintptr_t len);
RustResult tantivy_index_add_string(void *ptr, const char *s, int64_t offset);
RustResult tantivy_index_add_string_by_single_segment_writer(void *ptr, const char *s);
RustResult tantivy_index_add_multi_int8s(void *ptr,
const int8_t *array,
uintptr_t len,
int64_t offset);
RustResult tantivy_index_add_multi_int8s_by_single_segment_writer(void *ptr,
const int8_t *array,
uintptr_t len);
RustResult tantivy_index_add_multi_int16s(void *ptr,
const int16_t *array,
uintptr_t len,
int64_t offset);
RustResult tantivy_index_add_multi_int16s_by_single_segment_writer(void *ptr,
const int16_t *array,
uintptr_t len);
RustResult tantivy_index_add_multi_int32s(void *ptr,
const int32_t *array,
uintptr_t len,
int64_t offset);
RustResult tantivy_index_add_multi_int32s_by_single_segment_writer(void *ptr,
const int32_t *array,
uintptr_t len);
RustResult tantivy_index_add_multi_int64s(void *ptr,
const int64_t *array,
uintptr_t len,
int64_t offset);
RustResult tantivy_index_add_multi_int64s_by_single_segment_writer(void *ptr,
const int64_t *array,
uintptr_t len);
RustResult tantivy_index_add_multi_f32s(void *ptr,
const float *array,
uintptr_t len,
int64_t offset);
RustResult tantivy_index_add_multi_f32s_by_single_segment_writer(void *ptr,
const float *array,
uintptr_t len);
RustResult tantivy_index_add_multi_f64s(void *ptr,
const double *array,
uintptr_t len,
int64_t offset);
RustResult tantivy_index_add_multi_f64s_by_single_segment_writer(void *ptr,
const double *array,
uintptr_t len);
RustResult tantivy_index_add_multi_bools(void *ptr,
const bool *array,
uintptr_t len,
int64_t offset);
RustResult tantivy_index_add_multi_bools_by_single_segment_writer(void *ptr,
const bool *array,
uintptr_t len);
RustResult tantivy_index_add_multi_keywords(void *ptr,
const char *const *array,
uintptr_t len,
int64_t offset);
RustResult tantivy_index_add_multi_keywords_by_single_segment_writer(void *ptr,
const char *const *array,
uintptr_t len);
RustResult tantivy_create_text_writer(const char *field_name,
const char *path,
const char *tokenizer_name,

View File

@ -1,4 +1,3 @@
use std::default;
use std::ffi::c_void;
use std::ptr::null;
@ -132,19 +131,15 @@ pub extern "C" fn free_rust_result(result: RustResult) {
}
_ => {}
}
unsafe {
if !result.error.is_null() {
free_rust_string(result.error as *mut c_char);
}
if !result.error.is_null() {
free_rust_string(result.error as *mut c_char);
}
}
#[no_mangle]
pub extern "C" fn free_rust_error(error: *const c_char) {
unsafe {
if !error.is_null() {
free_rust_string(error as *mut c_char);
}
if !error.is_null() {
free_rust_string(error as *mut c_char);
}
}

View File

@ -1,4 +1,5 @@
#[repr(u8)]
#[derive(Debug)]
pub enum TantivyDataType {
Text,
Keyword,

View File

@ -1,7 +1,5 @@
use core::{fmt, str};
use serde_json as json;
#[derive(Debug)]
pub enum TantivyBindingError {
JsonError(serde_json::Error),

View File

@ -1,13 +1,9 @@
use std::{
ffi::{c_char, c_void, CStr},
ptr::null,
};
use std::ffi::{c_char, c_void, CStr};
use crate::{
array::{RustArray, RustResult},
array::RustResult,
cstr_to_str,
index_reader::IndexReaderWrapper,
string_c::create_string,
util::{create_binding, free_binding},
util_c::tantivy_index_exist,
};

View File

@ -4,7 +4,7 @@ use tantivy::{
Term,
};
use crate::error::{Result, TantivyBindingError};
use crate::error::Result;
use crate::{index_reader::IndexReaderWrapper, tokenizer::standard_analyzer};
impl IndexReaderWrapper {

View File

@ -1,13 +1,9 @@
use std::{ffi::CStr, ptr::null};
use std::ffi::CStr;
use libc::{c_char, c_void};
use crate::{
array::{RustArray, RustResult},
cstr_to_str,
index_reader::IndexReaderWrapper,
log::init_log,
string_c::{c_str_to_str, create_string},
array::RustResult, cstr_to_str, index_reader::IndexReaderWrapper, log::init_log,
tokenizer::create_tokenizer,
};

View File

@ -1,12 +1,14 @@
use std::ffi::CStr;
use std::sync::Arc;
use either::Either;
use futures::executor::block_on;
use libc::c_char;
use log::info;
use tantivy::schema::{
Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST, INDEXED,
Field, IndexRecordOption, Schema, SchemaBuilder, TextFieldIndexing, TextOptions, FAST, INDEXED,
};
use tantivy::{doc, tokenizer, Document, Index, IndexWriter};
use tantivy::{doc, Document, Index, IndexWriter, SingleSegmentIndexWriter};
use crate::data_type::TantivyDataType;
@ -16,11 +18,34 @@ use crate::log::init_log;
pub(crate) struct IndexWriterWrapper {
pub(crate) field: Field,
pub(crate) index_writer: IndexWriter,
pub(crate) id_field: Field,
pub(crate) index_writer: Either<IndexWriter, SingleSegmentIndexWriter>,
pub(crate) id_field: Option<Field>,
pub(crate) index: Arc<Index>,
}
/// Registers `field_name` in `schema_builder` with the indexing options that
/// match `data_type`, and returns the handle of the newly added field.
///
/// * `I64`, `F64` and `Bool` become `INDEXED` fields of the matching type.
/// * `Keyword` becomes a text field indexed with the "raw" tokenizer and
///   `IndexRecordOption::Basic`.
///
/// # Panics
///
/// Panics for `TantivyDataType::Text`; this helper does not set up an analyzer.
#[inline]
fn schema_builder_add_field(
    schema_builder: &mut SchemaBuilder,
    field_name: &str,
    data_type: TantivyDataType,
) -> Field {
    match data_type {
        TantivyDataType::I64 => schema_builder.add_i64_field(field_name, INDEXED),
        TantivyDataType::F64 => schema_builder.add_f64_field(field_name, INDEXED),
        TantivyDataType::Bool => schema_builder.add_bool_field(field_name, INDEXED),
        TantivyDataType::Keyword => {
            let text_field_indexing = TextFieldIndexing::default()
                .set_tokenizer("raw")
                .set_index_option(IndexRecordOption::Basic);
            let text_options = TextOptions::default().set_indexing_options(text_field_indexing);
            // `field_name` is already a `&str`, so it is passed as-is like in the other arms.
            schema_builder.add_text_field(field_name, text_options)
        }
        TantivyDataType::Text => {
            panic!("text should be indexed with analyzer");
        }
    }
}
impl IndexWriterWrapper {
pub fn new(
field_name: String,
@ -30,30 +55,10 @@ impl IndexWriterWrapper {
overall_memory_budget_in_bytes: usize,
) -> Result<IndexWriterWrapper> {
init_log();
let field: Field;
info!("create index writer, field_name: {}, data_type: {:?}", field_name, data_type);
let mut schema_builder = Schema::builder();
match data_type {
TantivyDataType::I64 => {
field = schema_builder.add_i64_field(&field_name, INDEXED);
}
TantivyDataType::F64 => {
field = schema_builder.add_f64_field(&field_name, INDEXED);
}
TantivyDataType::Bool => {
field = schema_builder.add_bool_field(&field_name, INDEXED);
}
TantivyDataType::Keyword => {
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("raw")
.set_index_option(IndexRecordOption::Basic);
let text_options = TextOptions::default().set_indexing_options(text_field_indexing);
field = schema_builder.add_text_field(&field_name, text_options);
}
TantivyDataType::Text => {
panic!("text should be indexed with analyzer");
}
}
let field = schema_builder_add_field(&mut schema_builder, &field_name, data_type);
// Rows in a multi-segment index cannot be mapped directly to milvus row data, so we add this doc_id field.
let id_field = schema_builder.add_i64_field("doc_id", FAST);
let schema = schema_builder.build();
let index = Index::create_in_dir(path.clone(), schema)?;
@ -61,8 +66,28 @@ impl IndexWriterWrapper {
index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?;
Ok(IndexWriterWrapper {
field,
index_writer,
id_field,
index_writer: Either::Left(index_writer),
id_field: Some(id_field),
index: Arc::new(index),
})
}
/// Creates a wrapper whose writer is a tantivy `SingleSegmentIndexWriter`.
///
/// The schema contains only the data field: no `doc_id` field is added, so
/// `id_field` is `None` in the returned wrapper.
///
/// Returns an error if the index cannot be created in `path` or if the
/// writer cannot be constructed.
pub fn new_with_single_segment(
    field_name: String,
    data_type: TantivyDataType,
    path: String,
) -> Result<IndexWriterWrapper> {
    // Memory budget handed to the single segment writer.
    const SINGLE_SEGMENT_MEMORY_BUDGET_IN_BYTES: usize = 15 * 1024 * 1024;

    init_log();
    info!("create single segment index writer, field_name: {}, data_type: {:?}", field_name, data_type);
    let mut schema_builder = Schema::builder();
    let field = schema_builder_add_field(&mut schema_builder, &field_name, data_type);
    let schema = schema_builder.build();
    // `path` is not used after this call, so it is moved instead of cloned.
    let index = Index::create_in_dir(path, schema)?;
    let index_writer =
        SingleSegmentIndexWriter::new(index.clone(), SINGLE_SEGMENT_MEMORY_BUDGET_IN_BYTES)?;
    Ok(IndexWriterWrapper {
        field,
        index_writer: Either::Right(index_writer),
        id_field: None,
        index: Arc::new(index),
    })
}
@ -71,6 +96,30 @@ impl IndexWriterWrapper {
IndexReaderWrapper::from_index(self.index.clone())
}
/// Adds `document` through the multi-segment `IndexWriter`.
///
/// # Panics
///
/// Panics if this wrapper holds the single segment writer instead.
fn index_writer_add_document(&self, document: Document) -> Result<()> {
    let multi_segment_writer = match &self.index_writer {
        Either::Left(writer) => writer,
        Either::Right(_) => panic!("unexpected writer"),
    };
    multi_segment_writer.add_document(document)?;
    Ok(())
}
/// Adds `document` through the `SingleSegmentIndexWriter`.
///
/// # Panics
///
/// Panics if this wrapper holds the multi-segment `IndexWriter` instead.
fn single_segment_index_writer_add_document(&mut self, document: Document) -> Result<()> {
    let single_segment_writer = match &mut self.index_writer {
        Either::Right(writer) => writer,
        Either::Left(_) => panic!("unexpected writer"),
    };
    single_segment_writer.add_document(document)?;
    Ok(())
}
pub fn add_i8(&mut self, data: i8, offset: i64) -> Result<()> {
self.add_i64(data.into(), offset)
}
@ -84,11 +133,10 @@ impl IndexWriterWrapper {
}
pub fn add_i64(&mut self, data: i64, offset: i64) -> Result<()> {
let _ = self.index_writer.add_document(doc!(
self.index_writer_add_document(doc!(
self.field => data,
self.id_field => offset,
))?;
Ok(())
self.id_field.unwrap() => offset,
))
}
pub fn add_f32(&mut self, data: f32, offset: i64) -> Result<()> {
@ -96,27 +144,24 @@ impl IndexWriterWrapper {
}
pub fn add_f64(&mut self, data: f64, offset: i64) -> Result<()> {
let _ = self.index_writer.add_document(doc!(
self.index_writer_add_document(doc!(
self.field => data,
self.id_field => offset,
))?;
Ok(())
self.id_field.unwrap() => offset,
))
}
pub fn add_bool(&mut self, data: bool, offset: i64) -> Result<()> {
let _ = self.index_writer.add_document(doc!(
self.index_writer_add_document(doc!(
self.field => data,
self.id_field => offset,
))?;
Ok(())
self.id_field.unwrap() => offset,
))
}
pub fn add_string(&mut self, data: &str, offset: i64) -> Result<()> {
let _ = self.index_writer.add_document(doc!(
self.index_writer_add_document(doc!(
self.field => data,
self.id_field => offset,
))?;
Ok(())
self.id_field.unwrap() => offset,
))
}
pub fn add_multi_i8s(&mut self, datas: &[i8], offset: i64) -> Result<()> {
@ -124,9 +169,8 @@ impl IndexWriterWrapper {
for data in datas {
document.add_field_value(self.field, *data as i64);
}
document.add_i64(self.id_field, offset);
let _ = self.index_writer.add_document(document)?;
Ok(())
document.add_i64(self.id_field.unwrap(), offset);
self.index_writer_add_document(document)
}
pub fn add_multi_i16s(&mut self, datas: &[i16], offset: i64) -> Result<()> {
@ -134,9 +178,8 @@ impl IndexWriterWrapper {
for data in datas {
document.add_field_value(self.field, *data as i64);
}
document.add_i64(self.id_field, offset);
let _ = self.index_writer.add_document(document)?;
Ok(())
document.add_i64(self.id_field.unwrap(), offset);
self.index_writer_add_document(document)
}
pub fn add_multi_i32s(&mut self, datas: &[i32], offset: i64) -> Result<()> {
@ -144,9 +187,8 @@ impl IndexWriterWrapper {
for data in datas {
document.add_field_value(self.field, *data as i64);
}
document.add_i64(self.id_field, offset);
let _ = self.index_writer.add_document(document)?;
Ok(())
document.add_i64(self.id_field.unwrap(), offset);
self.index_writer_add_document(document)
}
pub fn add_multi_i64s(&mut self, datas: &[i64], offset: i64) -> Result<()> {
@ -154,9 +196,8 @@ impl IndexWriterWrapper {
for data in datas {
document.add_field_value(self.field, *data);
}
document.add_i64(self.id_field, offset);
let _ = self.index_writer.add_document(document)?;
Ok(())
document.add_i64(self.id_field.unwrap(), offset);
self.index_writer_add_document(document)
}
pub fn add_multi_f32s(&mut self, datas: &[f32], offset: i64) -> Result<()> {
@ -164,9 +205,8 @@ impl IndexWriterWrapper {
for data in datas {
document.add_field_value(self.field, *data as f64);
}
document.add_i64(self.id_field, offset);
let _ = self.index_writer.add_document(document)?;
Ok(())
document.add_i64(self.id_field.unwrap(), offset);
self.index_writer_add_document(document)
}
pub fn add_multi_f64s(&mut self, datas: &[f64], offset: i64) -> Result<()> {
@ -174,9 +214,8 @@ impl IndexWriterWrapper {
for data in datas {
document.add_field_value(self.field, *data);
}
document.add_i64(self.id_field, offset);
let _ = self.index_writer.add_document(document)?;
Ok(())
document.add_i64(self.id_field.unwrap(), offset);
self.index_writer_add_document(document)
}
pub fn add_multi_bools(&mut self, datas: &[bool], offset: i64) -> Result<()> {
@ -184,9 +223,8 @@ impl IndexWriterWrapper {
for data in datas {
document.add_field_value(self.field, *data);
}
document.add_i64(self.id_field, offset);
let _ = self.index_writer.add_document(document)?;
Ok(())
document.add_i64(self.id_field.unwrap(), offset);
self.index_writer_add_document(document)
}
pub fn add_multi_keywords(&mut self, datas: &[*const c_char], offset: i64) -> Result<()> {
@ -195,31 +233,148 @@ impl IndexWriterWrapper {
let data = unsafe { CStr::from_ptr(*element) };
document.add_field_value(self.field, data.to_str()?);
}
document.add_i64(self.id_field, offset);
let _ = self.index_writer.add_document(document)?;
Ok(())
document.add_i64(self.id_field.unwrap(), offset);
self.index_writer_add_document(document)
}
/// Adds one `i8` value through the single-segment writer.
///
/// The value is widened losslessly to `i64` and forwarded to
/// `add_i64_by_single_segment_writer`.
pub fn add_i8_by_single_segment_writer(&mut self, data: i8) -> Result<()> {
    let widened = i64::from(data);
    self.add_i64_by_single_segment_writer(widened)
}
/// Adds one `i16` value through the single-segment writer.
///
/// The value is widened losslessly to `i64` and forwarded to
/// `add_i64_by_single_segment_writer`.
pub fn add_i16_by_single_segment_writer(&mut self, data: i16) -> Result<()> {
    let widened = i64::from(data);
    self.add_i64_by_single_segment_writer(widened)
}
/// Adds one `i32` value through the single-segment writer.
///
/// The value is widened losslessly to `i64` and forwarded to
/// `add_i64_by_single_segment_writer`.
pub fn add_i32_by_single_segment_writer(&mut self, data: i32) -> Result<()> {
    let widened = i64::from(data);
    self.add_i64_by_single_segment_writer(widened)
}
/// Adds one `i64` value through the single-segment writer.
///
/// Builds a document that carries only `self.field` (no id/offset field is
/// attached here) and hands it to `single_segment_index_writer_add_document`.
pub fn add_i64_by_single_segment_writer(&mut self, data: i64) -> Result<()> {
    let document = doc!(self.field => data);
    self.single_segment_index_writer_add_document(document)
}
/// Adds one `f32` value through the single-segment writer.
///
/// The value is widened to `f64` and forwarded to
/// `add_f64_by_single_segment_writer`.
pub fn add_f32_by_single_segment_writer(&mut self, data: f32) -> Result<()> {
    let widened = f64::from(data);
    self.add_f64_by_single_segment_writer(widened)
}
/// Adds one `f64` value through the single-segment writer.
///
/// Builds a document that carries only `self.field` and hands it to
/// `single_segment_index_writer_add_document`.
pub fn add_f64_by_single_segment_writer(&mut self, data: f64) -> Result<()> {
    let document = doc!(self.field => data);
    self.single_segment_index_writer_add_document(document)
}
/// Adds one `bool` value through the single-segment writer.
///
/// Builds a document that carries only `self.field` and hands it to
/// `single_segment_index_writer_add_document`.
pub fn add_bool_by_single_segment_writer(&mut self, data: bool) -> Result<()> {
    let document = doc!(self.field => data);
    self.single_segment_index_writer_add_document(document)
}
/// Adds one string value through the single-segment writer.
///
/// Builds a document that carries only `self.field` and hands it to
/// `single_segment_index_writer_add_document`.
pub fn add_string_by_single_segment_writer(&mut self, data: &str) -> Result<()> {
    let document = doc!(self.field => data);
    self.single_segment_index_writer_add_document(document)
}
/// Adds one multi-valued document built from `datas` through the
/// single-segment writer.
///
/// Every element is widened to `i64` and stored under `self.field` of the same
/// document; an empty slice yields a document without field values.
pub fn add_multi_i8s_by_single_segment_writer(&mut self, datas: &[i8]) -> Result<()> {
    let mut row = Document::default();
    for &value in datas {
        row.add_field_value(self.field, i64::from(value));
    }
    self.single_segment_index_writer_add_document(row)
}
/// Adds one multi-valued document built from `datas` through the
/// single-segment writer.
///
/// Every element is widened to `i64` and stored under `self.field` of the same
/// document; an empty slice yields a document without field values.
pub fn add_multi_i16s_by_single_segment_writer(&mut self, datas: &[i16]) -> Result<()> {
    let mut row = Document::default();
    for &value in datas {
        row.add_field_value(self.field, i64::from(value));
    }
    self.single_segment_index_writer_add_document(row)
}
/// Adds one multi-valued document built from `datas` through the
/// single-segment writer.
///
/// Every element is widened to `i64` and stored under `self.field` of the same
/// document; an empty slice yields a document without field values.
pub fn add_multi_i32s_by_single_segment_writer(&mut self, datas: &[i32]) -> Result<()> {
    let mut row = Document::default();
    for &value in datas {
        row.add_field_value(self.field, i64::from(value));
    }
    self.single_segment_index_writer_add_document(row)
}
/// Adds one multi-valued document built from `datas` through the
/// single-segment writer.
///
/// Every element is stored under `self.field` of the same document; an empty
/// slice yields a document without field values.
pub fn add_multi_i64s_by_single_segment_writer(&mut self, datas: &[i64]) -> Result<()> {
    let mut row = Document::default();
    for &value in datas {
        row.add_field_value(self.field, value);
    }
    self.single_segment_index_writer_add_document(row)
}
/// Adds one multi-valued document built from `datas` through the
/// single-segment writer.
///
/// Every element is widened to `f64` and stored under `self.field` of the same
/// document; an empty slice yields a document without field values.
pub fn add_multi_f32s_by_single_segment_writer(&mut self, datas: &[f32]) -> Result<()> {
    let mut row = Document::default();
    for &value in datas {
        row.add_field_value(self.field, f64::from(value));
    }
    self.single_segment_index_writer_add_document(row)
}
/// Adds one multi-valued document built from `datas` through the
/// single-segment writer.
///
/// Every element is stored under `self.field` of the same document; an empty
/// slice yields a document without field values.
pub fn add_multi_f64s_by_single_segment_writer(&mut self, datas: &[f64]) -> Result<()> {
    let mut row = Document::default();
    for &value in datas {
        row.add_field_value(self.field, value);
    }
    self.single_segment_index_writer_add_document(row)
}
/// Adds one multi-valued document built from `datas` through the
/// single-segment writer.
///
/// Every element is stored under `self.field` of the same document; an empty
/// slice yields a document without field values.
pub fn add_multi_bools_by_single_segment_writer(&mut self, datas: &[bool]) -> Result<()> {
    let mut row = Document::default();
    for &value in datas {
        row.add_field_value(self.field, value);
    }
    self.single_segment_index_writer_add_document(row)
}
/// Adds one multi-valued keyword document through the single-segment writer.
///
/// Each pointer in `datas` is read as a C string and must decode as UTF-8;
/// the first element that fails to decode aborts the call with an error and
/// nothing is added.
///
/// NOTE(review): every pointer is dereferenced without a null check —
/// presumably callers guarantee valid NUL-terminated strings; confirm.
pub fn add_multi_keywords_by_single_segment_writer(
    &mut self,
    datas: &[*const c_char],
) -> Result<()> {
    let mut row = Document::default();
    for &raw in datas {
        let keyword = unsafe { CStr::from_ptr(raw) }.to_str()?;
        row.add_field_value(self.field, keyword);
    }
    self.single_segment_index_writer_add_document(row)
}
fn manual_merge(&mut self) -> Result<()> {
let metas = self.index_writer.index().searchable_segment_metas()?;
let policy = self.index_writer.get_merge_policy();
let index_writer = self.index_writer.as_mut().left().unwrap();
let metas = index_writer.index().searchable_segment_metas()?;
let policy = index_writer.get_merge_policy();
let candidates = policy.compute_merge_candidates(metas.as_slice());
for candidate in candidates {
self.index_writer.merge(candidate.0.as_slice()).wait()?;
index_writer.merge(candidate.0.as_slice()).wait()?;
}
Ok(())
}
pub fn finish(mut self) -> Result<()> {
self.index_writer.commit()?;
// self.manual_merge();
block_on(self.index_writer.garbage_collect_files())?;
self.index_writer.wait_merging_threads()?;
/// Finalizes the index and consumes the wrapper.
///
/// * `Either::Left` writer: commits, garbage-collects files and waits for
///   the merging threads to finish.
/// * `Either::Right` (single-segment) writer: calls `finalize()`.
///
/// Any failure is returned as `Err`. The single-segment branch used to call
/// `expect`, which would panic instead of reporting the error to the caller
/// even though this function already returns `Result`.
pub fn finish(self) -> Result<()> {
    match self.index_writer {
        Either::Left(mut index_writer) => {
            index_writer.commit()?;
            // self.manual_merge();
            block_on(index_writer.garbage_collect_files())?;
            index_writer.wait_merging_threads()?;
        }
        Either::Right(single_segment_index_writer) => {
            // Propagate the error; the returned index handle is not needed here.
            single_segment_index_writer.finalize()?;
        }
    }
    Ok(())
}
pub(crate) fn commit(&mut self) -> Result<()> {
self.index_writer.commit()?;
self.index_writer.as_mut().left().unwrap().commit()?;
Ok(())
}
}

View File

@ -1,10 +1,5 @@
use core::slice;
use std::{
ffi::{c_char, c_void, CStr},
ptr::null,
};
use tantivy::Index;
use std::ffi::{c_char, c_void, CStr};
use crate::{
array::RustResult,
@ -47,6 +42,24 @@ pub extern "C" fn tantivy_create_index(
}
}
/// C entry point: creates an `IndexWriterWrapper` through
/// `IndexWriterWrapper::new_with_single_segment`.
///
/// `field_name` and `path` are converted with `cstr_to_str!`. On success the
/// result carries the pointer produced by `create_binding`; on failure it
/// carries the error rendered as a string.
#[no_mangle]
pub extern "C" fn tantivy_create_index_with_single_segment(
    field_name: *const c_char,
    data_type: TantivyDataType,
    path: *const c_char,
) -> RustResult {
    let field_name_str = cstr_to_str!(field_name);
    let path_str = cstr_to_str!(path);

    let created = IndexWriterWrapper::new_with_single_segment(
        String::from(field_name_str),
        data_type,
        String::from(path_str),
    );
    created.map_or_else(
        |err| RustResult::from_error(err.to_string()),
        |wrapper| RustResult::from_ptr(create_binding(wrapper)),
    )
}
#[no_mangle]
pub extern "C" fn tantivy_free_index_writer(ptr: *mut c_void) {
free_binding::<IndexWriterWrapper>(ptr);
@ -77,6 +90,29 @@ pub extern "C" fn tantivy_create_reader_from_writer(ptr: *mut c_void) -> RustRes
}
// -------------------------build--------------------
/// Applies `e` to every element of `arr`, pairing element `i` with the row
/// offset `offset + i`.
///
/// Stops at, and returns, the first error produced by `e`.
fn execute<T: Copy>(
    arr: &[T],
    offset: i64,
    e: fn(&mut IndexWriterWrapper, T, i64) -> Result<()>,
    w: &mut IndexWriterWrapper,
) -> Result<()> {
    arr.iter()
        .enumerate()
        .try_for_each(|(position, &value)| e(&mut *w, value, offset + (position as i64)))
}
/// Applies `e` to every element of `arr` in order.
///
/// No per-row offset is passed to `e`. Stops at, and returns, the first
/// error produced by `e`.
fn execute_by_single_segment_writer<T: Copy>(
    arr: &[T],
    e: fn(&mut IndexWriterWrapper, T) -> Result<()>,
    w: &mut IndexWriterWrapper,
) -> Result<()> {
    // The index is not used here, so iterate the slice directly rather than
    // enumerating and discarding the position.
    for &data in arr {
        e(w, data)?;
    }
    Ok(())
}
#[no_mangle]
pub extern "C" fn tantivy_index_add_int8s(
ptr: *mut c_void,
@ -89,6 +125,24 @@ pub extern "C" fn tantivy_index_add_int8s(
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i8, &mut (*real)).into() }
}
/// C entry point: adds `len` int8 values, one document per value, through the
/// single-segment writer behind `ptr`.
///
/// `ptr` is reinterpreted as `*mut IndexWriterWrapper` and dereferenced
/// without validation. A null `array` is treated as an empty input.
#[no_mangle]
pub extern "C" fn tantivy_index_add_int8s_by_single_segment_writer(
    ptr: *mut c_void,
    array: *const i8,
    len: usize,
) -> RustResult {
    let real = ptr as *mut IndexWriterWrapper;
    // `slice::from_raw_parts` requires a non-null pointer even for a
    // zero-length slice, and an empty C++ container may hand over a null
    // data pointer, so guard before building the slice.
    let arr: &[i8] = if array.is_null() {
        &[]
    } else {
        unsafe { slice::from_raw_parts(array, len) }
    };
    unsafe {
        execute_by_single_segment_writer(
            arr,
            IndexWriterWrapper::add_i8_by_single_segment_writer,
            &mut (*real),
        )
        .into()
    }
}
#[no_mangle]
pub extern "C" fn tantivy_index_add_int16s(
ptr: *mut c_void,
@ -101,6 +155,24 @@ pub extern "C" fn tantivy_index_add_int16s(
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i16, &mut (*real)).into() }
}
/// C entry point: adds `len` int16 values, one document per value, through
/// the single-segment writer behind `ptr`.
///
/// `ptr` is reinterpreted as `*mut IndexWriterWrapper` and dereferenced
/// without validation. A null `array` is treated as an empty input.
#[no_mangle]
pub extern "C" fn tantivy_index_add_int16s_by_single_segment_writer(
    ptr: *mut c_void,
    array: *const i16,
    len: usize,
) -> RustResult {
    let real = ptr as *mut IndexWriterWrapper;
    // `slice::from_raw_parts` requires a non-null pointer even for a
    // zero-length slice, and an empty C++ container may hand over a null
    // data pointer, so guard before building the slice.
    let arr: &[i16] = if array.is_null() {
        &[]
    } else {
        unsafe { slice::from_raw_parts(array, len) }
    };
    unsafe {
        execute_by_single_segment_writer(
            arr,
            IndexWriterWrapper::add_i16_by_single_segment_writer,
            &mut (*real),
        )
        .into()
    }
}
#[no_mangle]
pub extern "C" fn tantivy_index_add_int32s(
ptr: *mut c_void,
@ -113,6 +185,24 @@ pub extern "C" fn tantivy_index_add_int32s(
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i32, &mut (*real)).into() }
}
/// C entry point: adds `len` int32 values, one document per value, through
/// the single-segment writer behind `ptr`.
///
/// `ptr` is reinterpreted as `*mut IndexWriterWrapper` and dereferenced
/// without validation. A null `array` is treated as an empty input.
#[no_mangle]
pub extern "C" fn tantivy_index_add_int32s_by_single_segment_writer(
    ptr: *mut c_void,
    array: *const i32,
    len: usize,
) -> RustResult {
    let real = ptr as *mut IndexWriterWrapper;
    // `slice::from_raw_parts` requires a non-null pointer even for a
    // zero-length slice, and an empty C++ container may hand over a null
    // data pointer, so guard before building the slice.
    let arr: &[i32] = if array.is_null() {
        &[]
    } else {
        unsafe { slice::from_raw_parts(array, len) }
    };
    unsafe {
        execute_by_single_segment_writer(
            arr,
            IndexWriterWrapper::add_i32_by_single_segment_writer,
            &mut (*real),
        )
        .into()
    }
}
#[no_mangle]
pub extern "C" fn tantivy_index_add_int64s(
ptr: *mut c_void,
@ -126,18 +216,23 @@ pub extern "C" fn tantivy_index_add_int64s(
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i64, &mut (*real)).into() }
}
fn execute<T: Copy>(
arr: &[T],
offset: i64,
mut e: fn(&mut IndexWriterWrapper, T, i64) -> Result<()>,
w: &mut IndexWriterWrapper,
) -> Result<()> {
#[no_mangle]
pub extern "C" fn tantivy_index_add_int64s_by_single_segment_writer(
ptr: *mut c_void,
array: *const i64,
len: usize,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe {
for (index, data) in arr.iter().enumerate() {
e(w, *data, offset + (index as i64))?;
}
execute_by_single_segment_writer(
arr,
IndexWriterWrapper::add_i64_by_single_segment_writer,
&mut (*real),
)
.into()
}
Ok(())
}
#[no_mangle]
@ -152,6 +247,24 @@ pub extern "C" fn tantivy_index_add_f32s(
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_f32, &mut (*real)).into() }
}
/// C entry point: adds `len` f32 values, one document per value, through the
/// single-segment writer behind `ptr`.
///
/// `ptr` is reinterpreted as `*mut IndexWriterWrapper` and dereferenced
/// without validation. A null `array` is treated as an empty input.
#[no_mangle]
pub extern "C" fn tantivy_index_add_f32s_by_single_segment_writer(
    ptr: *mut c_void,
    array: *const f32,
    len: usize,
) -> RustResult {
    let real = ptr as *mut IndexWriterWrapper;
    // `slice::from_raw_parts` requires a non-null pointer even for a
    // zero-length slice, and an empty C++ container may hand over a null
    // data pointer, so guard before building the slice.
    let arr: &[f32] = if array.is_null() {
        &[]
    } else {
        unsafe { slice::from_raw_parts(array, len) }
    };
    unsafe {
        execute_by_single_segment_writer(
            arr,
            IndexWriterWrapper::add_f32_by_single_segment_writer,
            &mut (*real),
        )
        .into()
    }
}
#[no_mangle]
pub extern "C" fn tantivy_index_add_f64s(
ptr: *mut c_void,
@ -164,6 +277,24 @@ pub extern "C" fn tantivy_index_add_f64s(
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_f64, &mut (*real)).into() }
}
/// C entry point: adds `len` f64 values, one document per value, through the
/// single-segment writer behind `ptr`.
///
/// `ptr` is reinterpreted as `*mut IndexWriterWrapper` and dereferenced
/// without validation. A null `array` is treated as an empty input.
#[no_mangle]
pub extern "C" fn tantivy_index_add_f64s_by_single_segment_writer(
    ptr: *mut c_void,
    array: *const f64,
    len: usize,
) -> RustResult {
    let real = ptr as *mut IndexWriterWrapper;
    // `slice::from_raw_parts` requires a non-null pointer even for a
    // zero-length slice, and an empty C++ container may hand over a null
    // data pointer, so guard before building the slice.
    let arr: &[f64] = if array.is_null() {
        &[]
    } else {
        unsafe { slice::from_raw_parts(array, len) }
    };
    unsafe {
        execute_by_single_segment_writer(
            arr,
            IndexWriterWrapper::add_f64_by_single_segment_writer,
            &mut (*real),
        )
        .into()
    }
}
#[no_mangle]
pub extern "C" fn tantivy_index_add_bools(
ptr: *mut c_void,
@ -184,6 +315,24 @@ pub extern "C" fn tantivy_index_add_bools(
}
}
/// C entry point: adds `len` bool values, one document per value, through the
/// single-segment writer behind `ptr`.
///
/// `ptr` is reinterpreted as `*mut IndexWriterWrapper` and dereferenced
/// without validation. A null `array` is treated as an empty input.
#[no_mangle]
pub extern "C" fn tantivy_index_add_bools_by_single_segment_writer(
    ptr: *mut c_void,
    array: *const bool,
    len: usize,
) -> RustResult {
    let real = ptr as *mut IndexWriterWrapper;
    // `slice::from_raw_parts` requires a non-null pointer even for a
    // zero-length slice, and an empty C++ container may hand over a null
    // data pointer, so guard before building the slice.
    let arr: &[bool] = if array.is_null() {
        &[]
    } else {
        unsafe { slice::from_raw_parts(array, len) }
    };
    unsafe {
        execute_by_single_segment_writer(
            arr,
            IndexWriterWrapper::add_bool_by_single_segment_writer,
            &mut (*real),
        )
        .into()
    }
}
// TODO: this is not a very efficient way, since we must call this function many times, which
// will bring a lot of overhead caused by the rust binding.
#[no_mangle]
@ -197,6 +346,16 @@ pub extern "C" fn tantivy_index_add_string(
unsafe { (*real).add_string(s, offset).into() }
}
/// C entry point: adds one string value through the single-segment writer
/// behind `ptr`.
///
/// `s` is converted with `cstr_to_str!`; `ptr` is reinterpreted as
/// `*mut IndexWriterWrapper` and dereferenced without validation.
#[no_mangle]
pub extern "C" fn tantivy_index_add_string_by_single_segment_writer(
    ptr: *mut c_void,
    s: *const c_char,
) -> RustResult {
    let text = cstr_to_str!(s);
    let writer = ptr as *mut IndexWriterWrapper;
    unsafe { (*writer).add_string_by_single_segment_writer(text).into() }
}
// --------------------------------------------- array ------------------------------------------
#[no_mangle]
@ -213,6 +372,19 @@ pub extern "C" fn tantivy_index_add_multi_int8s(
}
}
/// C entry point: adds one multi-valued int8 document through the
/// single-segment writer behind `ptr`.
///
/// `array`/`len` are turned into a slice by `convert_to_rust_slice!`; `ptr`
/// is reinterpreted as `*mut IndexWriterWrapper` without validation.
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_int8s_by_single_segment_writer(
    ptr: *mut c_void,
    array: *const i8,
    len: usize,
) -> RustResult {
    unsafe {
        let writer = &mut *(ptr as *mut IndexWriterWrapper);
        let values = convert_to_rust_slice!(array, len);
        writer.add_multi_i8s_by_single_segment_writer(values).into()
    }
}
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_int16s(
ptr: *mut c_void,
@ -227,6 +399,19 @@ pub extern "C" fn tantivy_index_add_multi_int16s(
}
}
/// C entry point: adds one multi-valued int16 document through the
/// single-segment writer behind `ptr`.
///
/// `array`/`len` are turned into a slice by `convert_to_rust_slice!`; `ptr`
/// is reinterpreted as `*mut IndexWriterWrapper` without validation.
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_int16s_by_single_segment_writer(
    ptr: *mut c_void,
    array: *const i16,
    len: usize,
) -> RustResult {
    unsafe {
        let writer = &mut *(ptr as *mut IndexWriterWrapper);
        let values = convert_to_rust_slice!(array, len);
        writer.add_multi_i16s_by_single_segment_writer(values).into()
    }
}
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_int32s(
ptr: *mut c_void,
@ -241,6 +426,19 @@ pub extern "C" fn tantivy_index_add_multi_int32s(
}
}
/// C entry point: adds one multi-valued int32 document through the
/// single-segment writer behind `ptr`.
///
/// `array`/`len` are turned into a slice by `convert_to_rust_slice!`; `ptr`
/// is reinterpreted as `*mut IndexWriterWrapper` without validation.
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_int32s_by_single_segment_writer(
    ptr: *mut c_void,
    array: *const i32,
    len: usize,
) -> RustResult {
    unsafe {
        let writer = &mut *(ptr as *mut IndexWriterWrapper);
        let values = convert_to_rust_slice!(array, len);
        writer.add_multi_i32s_by_single_segment_writer(values).into()
    }
}
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_int64s(
ptr: *mut c_void,
@ -255,6 +453,19 @@ pub extern "C" fn tantivy_index_add_multi_int64s(
}
}
/// C entry point: adds one multi-valued int64 document through the
/// single-segment writer behind `ptr`.
///
/// `array`/`len` are turned into a slice by `convert_to_rust_slice!`; `ptr`
/// is reinterpreted as `*mut IndexWriterWrapper` without validation.
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_int64s_by_single_segment_writer(
    ptr: *mut c_void,
    array: *const i64,
    len: usize,
) -> RustResult {
    unsafe {
        let writer = &mut *(ptr as *mut IndexWriterWrapper);
        let values = convert_to_rust_slice!(array, len);
        writer.add_multi_i64s_by_single_segment_writer(values).into()
    }
}
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_f32s(
ptr: *mut c_void,
@ -269,6 +480,19 @@ pub extern "C" fn tantivy_index_add_multi_f32s(
}
}
/// C entry point: adds one multi-valued f32 document through the
/// single-segment writer behind `ptr`.
///
/// `array`/`len` are turned into a slice by `convert_to_rust_slice!`; `ptr`
/// is reinterpreted as `*mut IndexWriterWrapper` without validation.
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_f32s_by_single_segment_writer(
    ptr: *mut c_void,
    array: *const f32,
    len: usize,
) -> RustResult {
    unsafe {
        let writer = &mut *(ptr as *mut IndexWriterWrapper);
        let values = convert_to_rust_slice!(array, len);
        writer.add_multi_f32s_by_single_segment_writer(values).into()
    }
}
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_f64s(
ptr: *mut c_void,
@ -283,6 +507,19 @@ pub extern "C" fn tantivy_index_add_multi_f64s(
}
}
/// C entry point: adds one multi-valued f64 document through the
/// single-segment writer behind `ptr`.
///
/// `array`/`len` are turned into a slice by `convert_to_rust_slice!`; `ptr`
/// is reinterpreted as `*mut IndexWriterWrapper` without validation.
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_f64s_by_single_segment_writer(
    ptr: *mut c_void,
    array: *const f64,
    len: usize,
) -> RustResult {
    unsafe {
        let writer = &mut *(ptr as *mut IndexWriterWrapper);
        let values = convert_to_rust_slice!(array, len);
        writer.add_multi_f64s_by_single_segment_writer(values).into()
    }
}
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_bools(
ptr: *mut c_void,
@ -297,6 +534,19 @@ pub extern "C" fn tantivy_index_add_multi_bools(
}
}
/// C entry point: adds one multi-valued bool document through the
/// single-segment writer behind `ptr`.
///
/// `array`/`len` are turned into a slice by `convert_to_rust_slice!`; `ptr`
/// is reinterpreted as `*mut IndexWriterWrapper` without validation.
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_bools_by_single_segment_writer(
    ptr: *mut c_void,
    array: *const bool,
    len: usize,
) -> RustResult {
    unsafe {
        let writer = &mut *(ptr as *mut IndexWriterWrapper);
        let values = convert_to_rust_slice!(array, len);
        writer.add_multi_bools_by_single_segment_writer(values).into()
    }
}
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_keywords(
ptr: *mut c_void,
@ -310,3 +560,18 @@ pub extern "C" fn tantivy_index_add_multi_keywords(
(*real).add_multi_keywords(arr, offset).into()
}
}
/// C entry point: adds one multi-valued keyword document through the
/// single-segment writer behind `ptr`.
///
/// `array`/`len` describe an array of C-string pointers and are turned into a
/// slice by `convert_to_rust_slice!`; `ptr` is reinterpreted as
/// `*mut IndexWriterWrapper` without validation.
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_keywords_by_single_segment_writer(
    ptr: *mut c_void,
    array: *const *const c_char,
    len: usize,
) -> RustResult {
    unsafe {
        let writer = &mut *(ptr as *mut IndexWriterWrapper);
        let keywords = convert_to_rust_slice!(array, len);
        writer
            .add_multi_keywords_by_single_segment_writer(keywords)
            .into()
    }
}

View File

@ -1,5 +1,6 @@
use std::sync::Arc;
use either::Either;
use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST};
use tantivy::tokenizer::TextAnalyzer;
use tantivy::Index;
@ -44,8 +45,8 @@ impl IndexWriterWrapper {
IndexWriterWrapper {
field,
index_writer,
id_field,
index_writer: Either::Left(index_writer),
id_field: Some(id_field),
index: Arc::new(index),
}
}

View File

@ -1,13 +1,10 @@
use std::ffi::c_char;
use std::ffi::c_void;
use std::ffi::CStr;
use crate::array::RustResult;
use crate::cstr_to_str;
use crate::error::Result;
use crate::index_writer::IndexWriterWrapper;
use crate::log::init_log;
use crate::string_c::c_str_to_str;
use crate::tokenizer::create_tokenizer;
use crate::util::create_binding;

View File

@ -81,15 +81,22 @@ struct TantivyIndexWrapper {
TantivyIndexWrapper(const char* field_name,
TantivyDataType data_type,
const char* path,
bool inverted_single_semgnent = false,
uintptr_t num_threads = DEFAULT_NUM_THREADS,
uintptr_t overall_memory_budget_in_bytes =
DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES) {
auto res = RustResultWrapper(
tantivy_create_index(field_name,
data_type,
path,
num_threads,
overall_memory_budget_in_bytes));
RustResultWrapper res;
if (inverted_single_semgnent) {
res = RustResultWrapper(tantivy_create_index_with_single_segment(
field_name, data_type, path));
} else {
res = RustResultWrapper(
tantivy_create_index(field_name,
data_type,
path,
num_threads,
overall_memory_budget_in_bytes));
}
AssertInfo(res.result_->success,
"failed to create index: {}",
res.result_->error);
@ -340,6 +347,193 @@ struct TantivyIndexWrapper {
typeid(T).name());
}
// Adds `len` scalar values from `array` through the single-segment writer
// binding that matches T (bool, int8/16/32/64, float, double, std::string).
// No row offset is passed to the binding. Strings are sent one FFI call per
// element. Any other T throws the formatted message (the value produced by
// fmt::format). Must not be called once finished_ is set (assert).
template <typename T>
void
add_data_by_single_segment_writer(const T* array, uintptr_t len) {
    assert(!finished_);

    if constexpr (std::is_same_v<T, bool>) {
        auto result = RustResultWrapper(
            tantivy_index_add_bools_by_single_segment_writer(
                writer_, array, len));
        AssertInfo(result.result_->success,
                   "failed to add bools: {}",
                   result.result_->error);
    } else if constexpr (std::is_same_v<T, int8_t>) {
        auto result = RustResultWrapper(
            tantivy_index_add_int8s_by_single_segment_writer(
                writer_, array, len));
        AssertInfo(result.result_->success,
                   "failed to add int8s: {}",
                   result.result_->error);
    } else if constexpr (std::is_same_v<T, int16_t>) {
        auto result = RustResultWrapper(
            tantivy_index_add_int16s_by_single_segment_writer(
                writer_, array, len));
        AssertInfo(result.result_->success,
                   "failed to add int16s: {}",
                   result.result_->error);
    } else if constexpr (std::is_same_v<T, int32_t>) {
        auto result = RustResultWrapper(
            tantivy_index_add_int32s_by_single_segment_writer(
                writer_, array, len));
        AssertInfo(result.result_->success,
                   "failed to add int32s: {}",
                   result.result_->error);
    } else if constexpr (std::is_same_v<T, int64_t>) {
        auto result = RustResultWrapper(
            tantivy_index_add_int64s_by_single_segment_writer(
                writer_, array, len));
        AssertInfo(result.result_->success,
                   "failed to add int64s: {}",
                   result.result_->error);
    } else if constexpr (std::is_same_v<T, float>) {
        auto result = RustResultWrapper(
            tantivy_index_add_f32s_by_single_segment_writer(
                writer_, array, len));
        AssertInfo(result.result_->success,
                   "failed to add f32s: {}",
                   result.result_->error);
    } else if constexpr (std::is_same_v<T, double>) {
        auto result = RustResultWrapper(
            tantivy_index_add_f64s_by_single_segment_writer(
                writer_, array, len));
        AssertInfo(result.result_->success,
                   "failed to add f64s: {}",
                   result.result_->error);
    } else if constexpr (std::is_same_v<T, std::string>) {
        // TODO: not very efficient, a lot of overhead due to rust-ffi call.
        const std::string* const end = array + len;
        for (const std::string* current = array; current != end; ++current) {
            auto result = RustResultWrapper(
                tantivy_index_add_string_by_single_segment_writer(
                    writer_, current->c_str()));
            AssertInfo(result.result_->success,
                       "failed to add string: {}",
                       result.result_->error);
        }
    } else {
        throw fmt::format("InvertedIndex.add_data: unsupported data type: {}",
                          typeid(T).name());
    }
}
// Adds ONE multi-valued document holding the `len` elements of `array`
// through the single-segment writer binding that matches T (bool,
// int8/16/32/64, float, double, std::string). No row offset is passed to the
// binding. For strings, a temporary array of c_str() pointers is built; the
// pointers stay valid for the duration of the call because `array` is not
// modified here. Any other T throws the formatted message (the value produced
// by fmt::format). Must not be called once finished_ is set (assert).
template <typename T>
void
add_multi_data_by_single_segment_writer(const T* array, uintptr_t len) {
    assert(!finished_);

    if constexpr (std::is_same_v<T, bool>) {
        auto res = RustResultWrapper(
            tantivy_index_add_multi_bools_by_single_segment_writer(
                writer_, array, len));
        AssertInfo(res.result_->success,
                   "failed to add multi bools: {}",
                   res.result_->error);
        return;
    }

    if constexpr (std::is_same_v<T, int8_t>) {
        auto res = RustResultWrapper(
            tantivy_index_add_multi_int8s_by_single_segment_writer(
                writer_, array, len));
        AssertInfo(res.result_->success,
                   "failed to add multi int8s: {}",
                   res.result_->error);
        return;
    }

    if constexpr (std::is_same_v<T, int16_t>) {
        auto res = RustResultWrapper(
            tantivy_index_add_multi_int16s_by_single_segment_writer(
                writer_, array, len));
        AssertInfo(res.result_->success,
                   "failed to add multi int16s: {}",
                   res.result_->error);
        return;
    }

    if constexpr (std::is_same_v<T, int32_t>) {
        auto res = RustResultWrapper(
            tantivy_index_add_multi_int32s_by_single_segment_writer(
                writer_, array, len));
        AssertInfo(res.result_->success,
                   "failed to add multi int32s: {}",
                   res.result_->error);
        return;
    }

    if constexpr (std::is_same_v<T, int64_t>) {
        auto res = RustResultWrapper(
            tantivy_index_add_multi_int64s_by_single_segment_writer(
                writer_, array, len));
        AssertInfo(res.result_->success,
                   "failed to add multi int64s: {}",
                   res.result_->error);
        return;
    }

    if constexpr (std::is_same_v<T, float>) {
        auto res = RustResultWrapper(
            tantivy_index_add_multi_f32s_by_single_segment_writer(
                writer_, array, len));
        AssertInfo(res.result_->success,
                   "failed to add multi f32s: {}",
                   res.result_->error);
        return;
    }

    if constexpr (std::is_same_v<T, double>) {
        auto res = RustResultWrapper(
            tantivy_index_add_multi_f64s_by_single_segment_writer(
                writer_, array, len));
        AssertInfo(res.result_->success,
                   "failed to add multi f64s: {}",
                   res.result_->error);
        return;
    }

    if constexpr (std::is_same_v<T, std::string>) {
        std::vector<const char*> views;
        // The final size is known up front; reserve once instead of growing
        // the vector repeatedly while collecting the pointers.
        views.reserve(len);
        for (uintptr_t i = 0; i < len; i++) {
            views.push_back(array[i].c_str());
        }
        auto res = RustResultWrapper(
            tantivy_index_add_multi_keywords_by_single_segment_writer(
                writer_, views.data(), len));
        AssertInfo(res.result_->success,
                   "failed to add multi keywords: {}",
                   res.result_->error);
        return;
    }

    throw fmt::format(
        "InvertedIndex.add_multi_data: unsupported data type: {}",
        typeid(T).name());
}
inline void
finish() {
if (finished_) {

View File

@ -90,6 +90,7 @@ set(MILVUS_TEST_FILES
test_chunked_column.cpp
test_rust_result.cpp
test_cached_search_iterator.cpp
test_build_inverted_index_with_single_segment.cpp
)
if ( INDEX_ENGINE STREQUAL "cardinal" )

View File

@ -113,7 +113,7 @@ class ArrayInvertedIndexTest : public ::testing::Test {
auto index = std::make_unique<index::InvertedIndexTantivy<T>>();
Config cfg;
cfg["is_array"] = true;
index->BuildWithRawData(N_, vec_of_array_.data(), cfg);
index->BuildWithRawDataForUT(N_, vec_of_array_.data(), cfg);
LoadIndexInfo info{
.field_id = schema_->get_field_id(FieldName("array")).get(),
.index = std::move(index),

View File

@ -0,0 +1,214 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <random>
#include <gtest/gtest.h>
#include "pb/plan.pb.h"
#include "segcore/SegmentSealedImpl.h"
#include "index/InvertedIndexTantivy.h"
#include "test_utils/DataGen.h"
#include "common/Schema.h"
#include "test_utils/GenExprProto.h"
#include "query/PlanProto.h"
#include "query/ExecPlanNodeVisitor.h"
using namespace milvus;
using namespace milvus::query;
using namespace milvus::segcore;
// Builds the test schema: an INT64 primary key named "pk" plus a field named
// "index" whose DataType corresponds to T (third AddDebugField argument is
// false). For a T outside the listed types only "pk" is added.
template <typename T>
SchemaPtr
GenSchema() {
    auto schema = std::make_shared<Schema>();
    auto pk = schema->AddDebugField("pk", DataType::INT64);
    schema->set_primary_field_id(pk);

    const auto add_index_field = [&schema](DataType type) {
        schema->AddDebugField("index", type, false);
    };

    if constexpr (std::is_same_v<T, bool>) {
        add_index_field(DataType::BOOL);
    } else if constexpr (std::is_same_v<T, int8_t>) {
        add_index_field(DataType::INT8);
    } else if constexpr (std::is_same_v<T, int16_t>) {
        add_index_field(DataType::INT16);
    } else if constexpr (std::is_same_v<T, int32_t>) {
        add_index_field(DataType::INT32);
    } else if constexpr (std::is_same_v<T, int64_t>) {
        add_index_field(DataType::INT64);
    } else if constexpr (std::is_same_v<T, float>) {
        add_index_field(DataType::FLOAT);
    } else if constexpr (std::is_same_v<T, double>) {
        add_index_field(DataType::DOUBLE);
    } else if constexpr (std::is_same_v<T, std::string>) {
        add_index_field(DataType::VARCHAR);
    }
    return schema;
}
// Typed fixture: generates N_ rows for the GenSchema<T>() schema, keeps a
// copy of the "index" column in index_column_data_, loads the raw field data
// into a sealed segment, then builds an InvertedIndexTantivy<T> with
// SCALAR_INDEX_ENGINE_VERSION set to 0 and loads it into the same segment.
template <typename T>
class BuildInvertedIndexWithSingleSegmentTest : public ::testing::Test {
 public:
    void
    SetUp() override {
        schema_ = GenSchema<T>();
        seg_ = CreateSealedSegment(schema_);
        N_ = 3000;
        uint64_t seed = 1234;
        auto raw_data = DataGen(schema_, N_, seed);

        // Copies the first N_ entries of a generated column into the fixture.
        const auto copy_rows = [this](const auto& column) {
            for (size_t row = 0; row < N_; row++) {
                index_column_data_.push_back(column[row]);
            }
        };

        if constexpr (std::is_same_v<T, bool>) {
            copy_rows(
                raw_data.get_col(schema_->get_field_id(FieldName("index")))
                    ->scalars()
                    .bool_data()
                    .data());
        } else if constexpr (std::is_same_v<T, int64_t>) {
            copy_rows(
                raw_data.get_col(schema_->get_field_id(FieldName("index")))
                    ->scalars()
                    .long_data()
                    .data());
        } else if constexpr (std::is_integral_v<T>) {
            // Remaining integral types (int8/16/32) are read via int_data().
            copy_rows(
                raw_data.get_col(schema_->get_field_id(FieldName("index")))
                    ->scalars()
                    .int_data()
                    .data());
        } else if constexpr (std::is_same_v<T, float>) {
            copy_rows(
                raw_data.get_col(schema_->get_field_id(FieldName("index")))
                    ->scalars()
                    .float_data()
                    .data());
        } else if constexpr (std::is_same_v<T, double>) {
            copy_rows(
                raw_data.get_col(schema_->get_field_id(FieldName("index")))
                    ->scalars()
                    .double_data()
                    .data());
        } else if constexpr (std::is_same_v<T, std::string>) {
            copy_rows(
                raw_data.get_col(schema_->get_field_id(FieldName("index")))
                    ->scalars()
                    .string_data()
                    .data());
        }

        SealedLoadFieldData(raw_data, *seg_);
        LoadInvertedIndex();
    }

    void
    TearDown() override {
    }

    // Builds the inverted index from index_column_data_ and loads it into the
    // segment for the "index" field.
    void
    LoadInvertedIndex() {
        auto inverted_index =
            std::make_unique<index::InvertedIndexTantivy<T>>();
        Config build_config;
        build_config[milvus::index::SCALAR_INDEX_ENGINE_VERSION] = 0;
        inverted_index->BuildWithRawDataForUT(
            N_, index_column_data_.data(), build_config);
        LoadIndexInfo info{
            .field_id = schema_->get_field_id(FieldName("index")).get(),
            .index = std::move(inverted_index),
        };
        seg_->LoadIndex(info);
    }

    // Returns the value of the "index" column stored at row `offset`.
    T
    FieldValueAt(int64_t offset) {
        return index_column_data_[offset];
    }

 public:
    SchemaPtr schema_;
    SegmentSealedUPtr seg_;
    int64_t N_;
    boost::container::vector<T> index_column_data_;
};
TYPED_TEST_SUITE_P(BuildInvertedIndexWithSingleSegmentTest);
// Runs ten rounds: pick a random row, query the segment with
// `index == <value at that row>`, and check the returned bitset against a
// brute-force comparison over the copied column data.
TYPED_TEST_P(BuildInvertedIndexWithSingleSegmentTest,
             ReadFromSingleSegmentIndex) {
    const auto& meta = this->schema_->operator[](FieldName("index"));

    // Seed the generator once for the whole test instead of constructing a
    // random_device and engine on every round.
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<int64_t> row_dist(0, this->N_ - 1);

    // The fixture creates a sealed segment; fail clearly if that ever changes
    // rather than passing a null pointer to the executor.
    auto* sealed_segment = dynamic_cast<SegmentSealedImpl*>(this->seg_.get());
    ASSERT_NE(sealed_segment, nullptr);

    for (int round = 0; round < 10; round++) {
        auto column_info = test::GenColumnInfo(
            meta.get_id().get(),
            static_cast<proto::schema::DataType>(meta.get_data_type()),
            false,
            false,
            static_cast<proto::schema::DataType>(meta.get_element_type()));

        const int64_t random_idx = row_dist(gen);

        auto unary_range_expr = std::make_unique<proto::plan::UnaryRangeExpr>();
        unary_range_expr->set_allocated_column_info(column_info);
        unary_range_expr->set_op(proto::plan::OpType::Equal);
        auto val = this->FieldValueAt(random_idx);
        unary_range_expr->set_allocated_value(test::GenGenericValue(val));

        auto expr = test::GenExpr();
        expr->set_allocated_unary_range_expr(unary_range_expr.release());

        auto parser = ProtoParser(*this->schema_);
        auto typed_expr = parser.ParseExprs(*expr);
        auto parsed = std::make_shared<plan::FilterBitsNode>(
            DEFAULT_PLANNODE_ID, typed_expr);

        BitsetType result_bits =
            ExecuteQueryExpr(parsed, sealed_segment, this->N_, MAX_TIMESTAMP);
        ASSERT_EQ(result_bits.size(), this->N_);

        for (int64_t row = 0; row < this->N_; row++) {
            // For floating-point columns the probed row itself is not
            // checked (kept from the original test).
            if (std::is_floating_point_v<decltype(val)> && row == random_idx) {
                continue;
            }
            const bool expected = this->index_column_data_[row] ==
                                  this->index_column_data_[random_idx];
            ASSERT_EQ(result_bits[row], expected)
                << "i: " << row << ", final[i]: " << result_bits[row]
                << ", ref(i): " << expected << ", random_idx: " << random_idx
                << ", value: " << this->index_column_data_[random_idx]
                << ", value: " << this->index_column_data_[row];
        }
    }
}
// Register the suite's test case so it can be instantiated per type.
REGISTER_TYPED_TEST_CASE_P(BuildInvertedIndexWithSingleSegmentTest,
ReadFromSingleSegmentIndex);
// Scalar element types the suite is run against.
using ElementType = testing::
Types<bool, int8_t, int16_t, int32_t, int64_t, float, double, std::string>;
// Instantiate the typed suite once per entry of ElementType, under the
// "Naive" prefix.
INSTANTIATE_TYPED_TEST_SUITE_P(Naive,
BuildInvertedIndexWithSingleSegmentTest,
ElementType);

View File

@ -354,7 +354,7 @@ TEST_F(TestChunkSegment, TestCompareExpr) {
data.begin() + i * test_data_count);
}
index->BuildWithRawData(data.size(), data.data());
index->BuildWithRawDataForUT(data.size(), data.data());
segcore::LoadIndexInfo load_index_info;
load_index_info.index = std::move(index);
load_index_info.field_id = fid.get();

View File

@ -255,7 +255,7 @@ class SealedSegmentRegexQueryTest : public ::testing::Test {
auto index = index::CreateStringIndexSort();
std::vector<uint8_t> buffer(arr.ByteSize());
ASSERT_TRUE(arr.SerializeToArray(buffer.data(), arr.ByteSize()));
index->BuildWithRawData(arr.ByteSize(), buffer.data());
index->BuildWithRawDataForUT(arr.ByteSize(), buffer.data());
LoadIndexInfo info{
.field_id = schema->get_field_id(FieldName("str")).get(),
.index = std::move(index),
@ -264,7 +264,7 @@ class SealedSegmentRegexQueryTest : public ::testing::Test {
}
{
auto index = index::CreateScalarIndexSort<int64_t>();
index->BuildWithRawData(N, raw_int.data());
index->BuildWithRawDataForUT(N, raw_int.data());
LoadIndexInfo info{
.field_id =
schema->get_field_id(FieldName("another_int64")).get(),
@ -278,7 +278,7 @@ class SealedSegmentRegexQueryTest : public ::testing::Test {
LoadInvertedIndex() {
auto index =
std::make_unique<index::InvertedIndexTantivy<std::string>>();
index->BuildWithRawData(N, raw_str.data());
index->BuildWithRawDataForUT(N, raw_str.data());
LoadIndexInfo info{
.field_id = schema->get_field_id(FieldName("str")).get(),
.index = std::move(index),
@ -295,7 +295,7 @@ class SealedSegmentRegexQueryTest : public ::testing::Test {
auto index = std::make_unique<MockStringIndex>();
std::vector<uint8_t> buffer(arr.ByteSize());
ASSERT_TRUE(arr.SerializeToArray(buffer.data(), arr.ByteSize()));
index->BuildWithRawData(arr.ByteSize(), buffer.data());
index->BuildWithRawDataForUT(arr.ByteSize(), buffer.data());
LoadIndexInfo info{
.field_id = schema->get_field_id(FieldName("str")).get(),
.index = std::move(index),

View File

@ -151,8 +151,12 @@ TEST_F(StringIndexMarisaTest, Range) {
TEST_F(StringIndexMarisaTest, Reverse) {
auto index_types = GetIndexTypes<std::string>();
for (const auto& index_type : index_types) {
auto index = milvus::index::IndexFactory::GetInstance()
.CreatePrimitiveScalarIndex<std::string>(index_type);
CreateIndexInfo create_index_info{
.index_type = index_type,
};
auto index =
milvus::index::IndexFactory::GetInstance()
.CreatePrimitiveScalarIndex<std::string>(create_index_info);
index->Build(nb, strs.data());
assert_reverse<std::string>(index.get(), strs);
}
@ -311,7 +315,7 @@ TEST_F(StringIndexMarisaTest, BaseIndexCodec) {
*str_arr.mutable_data() = {strings.begin(), strings.end()};
std::vector<uint8_t> data(str_arr.ByteSizeLong(), 0);
str_arr.SerializeToArray(data.data(), str_arr.ByteSizeLong());
index->BuildWithRawData(str_arr.ByteSizeLong(), data.data());
index->BuildWithRawDataForUT(str_arr.ByteSizeLong(), data.data());
std::vector<std::string> invalid_strings = {std::to_string(nb)};
auto copy_index = milvus::index::CreateStringIndexMarisa();