fix: enable to build index with single segment (#39233)

fix https://github.com/milvus-io/milvus/issues/39232

---------

Signed-off-by: SpadeA-Tang <tangchenjie1210@gmail.com>
pull/39079/head^2
Spade A 2025-01-16 11:01:06 +08:00 committed by GitHub
parent bca2a62b78
commit 8c4ba70a4c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
32 changed files with 1199 additions and 246 deletions

View File

@ -45,9 +45,9 @@ class IndexBase {
Load(milvus::tracer::TraceContext ctx, const Config& config = {}) = 0; Load(milvus::tracer::TraceContext ctx, const Config& config = {}) = 0;
virtual void virtual void
BuildWithRawData(size_t n, BuildWithRawDataForUT(size_t n,
const void* values, const void* values,
const Config& config = {}) = 0; const Config& config = {}) = 0;
virtual void virtual void
BuildWithDataset(const DatasetPtr& dataset, const Config& config = {}) = 0; BuildWithDataset(const DatasetPtr& dataset, const Config& config = {}) = 0;

View File

@ -35,10 +35,14 @@ namespace milvus::index {
template <typename T> template <typename T>
ScalarIndexPtr<T> ScalarIndexPtr<T>
IndexFactory::CreatePrimitiveScalarIndex( IndexFactory::CreatePrimitiveScalarIndex(
const IndexType& index_type, const CreateIndexInfo& create_index_info,
const storage::FileManagerContext& file_manager_context) { const storage::FileManagerContext& file_manager_context) {
auto index_type = create_index_info.index_type;
if (index_type == INVERTED_INDEX_TYPE) { if (index_type == INVERTED_INDEX_TYPE) {
return std::make_unique<InvertedIndexTantivy<T>>(file_manager_context); // scalar_index_engine_version 0 means we should built tantivy index within single segment
return std::make_unique<InvertedIndexTantivy<T>>(
file_manager_context,
create_index_info.scalar_index_engine_version == 0);
} }
if (index_type == BITMAP_INDEX_TYPE) { if (index_type == BITMAP_INDEX_TYPE) {
return std::make_unique<BitmapIndex<T>>(file_manager_context); return std::make_unique<BitmapIndex<T>>(file_manager_context);
@ -59,12 +63,15 @@ IndexFactory::CreatePrimitiveScalarIndex(
template <> template <>
ScalarIndexPtr<std::string> ScalarIndexPtr<std::string>
IndexFactory::CreatePrimitiveScalarIndex<std::string>( IndexFactory::CreatePrimitiveScalarIndex<std::string>(
const IndexType& index_type, const CreateIndexInfo& create_index_info,
const storage::FileManagerContext& file_manager_context) { const storage::FileManagerContext& file_manager_context) {
auto index_type = create_index_info.index_type;
#if defined(__linux__) || defined(__APPLE__) #if defined(__linux__) || defined(__APPLE__)
if (index_type == INVERTED_INDEX_TYPE) { if (index_type == INVERTED_INDEX_TYPE) {
// scalar_index_engine_version 0 means we should built tantivy index within single segment
return std::make_unique<InvertedIndexTantivy<std::string>>( return std::make_unique<InvertedIndexTantivy<std::string>>(
file_manager_context); file_manager_context,
create_index_info.scalar_index_engine_version == 0);
} }
if (index_type == BITMAP_INDEX_TYPE) { if (index_type == BITMAP_INDEX_TYPE) {
return std::make_unique<BitmapIndex<std::string>>(file_manager_context); return std::make_unique<BitmapIndex<std::string>>(file_manager_context);
@ -294,37 +301,37 @@ IndexFactory::CreateIndex(
IndexBasePtr IndexBasePtr
IndexFactory::CreatePrimitiveScalarIndex( IndexFactory::CreatePrimitiveScalarIndex(
DataType data_type, DataType data_type,
IndexType index_type, const CreateIndexInfo& create_index_info,
const storage::FileManagerContext& file_manager_context) { const storage::FileManagerContext& file_manager_context) {
switch (data_type) { switch (data_type) {
// create scalar index // create scalar index
case DataType::BOOL: case DataType::BOOL:
return CreatePrimitiveScalarIndex<bool>(index_type, return CreatePrimitiveScalarIndex<bool>(create_index_info,
file_manager_context); file_manager_context);
case DataType::INT8: case DataType::INT8:
return CreatePrimitiveScalarIndex<int8_t>(index_type, return CreatePrimitiveScalarIndex<int8_t>(create_index_info,
file_manager_context); file_manager_context);
case DataType::INT16: case DataType::INT16:
return CreatePrimitiveScalarIndex<int16_t>(index_type, return CreatePrimitiveScalarIndex<int16_t>(create_index_info,
file_manager_context); file_manager_context);
case DataType::INT32: case DataType::INT32:
return CreatePrimitiveScalarIndex<int32_t>(index_type, return CreatePrimitiveScalarIndex<int32_t>(create_index_info,
file_manager_context); file_manager_context);
case DataType::INT64: case DataType::INT64:
return CreatePrimitiveScalarIndex<int64_t>(index_type, return CreatePrimitiveScalarIndex<int64_t>(create_index_info,
file_manager_context); file_manager_context);
case DataType::FLOAT: case DataType::FLOAT:
return CreatePrimitiveScalarIndex<float>(index_type, return CreatePrimitiveScalarIndex<float>(create_index_info,
file_manager_context); file_manager_context);
case DataType::DOUBLE: case DataType::DOUBLE:
return CreatePrimitiveScalarIndex<double>(index_type, return CreatePrimitiveScalarIndex<double>(create_index_info,
file_manager_context); file_manager_context);
// create string index // create string index
case DataType::STRING: case DataType::STRING:
case DataType::VARCHAR: case DataType::VARCHAR:
return CreatePrimitiveScalarIndex<std::string>( return CreatePrimitiveScalarIndex<std::string>(
index_type, file_manager_context); create_index_info, file_manager_context);
default: default:
PanicInfo( PanicInfo(
DataTypeInvalid, DataTypeInvalid,
@ -334,14 +341,15 @@ IndexFactory::CreatePrimitiveScalarIndex(
IndexBasePtr IndexBasePtr
IndexFactory::CreateCompositeScalarIndex( IndexFactory::CreateCompositeScalarIndex(
IndexType index_type, const CreateIndexInfo& create_index_info,
const storage::FileManagerContext& file_manager_context) { const storage::FileManagerContext& file_manager_context) {
auto index_type = create_index_info.index_type;
if (index_type == HYBRID_INDEX_TYPE || index_type == BITMAP_INDEX_TYPE || if (index_type == HYBRID_INDEX_TYPE || index_type == BITMAP_INDEX_TYPE ||
index_type == INVERTED_INDEX_TYPE) { index_type == INVERTED_INDEX_TYPE) {
auto element_type = static_cast<DataType>( auto element_type = static_cast<DataType>(
file_manager_context.fieldDataMeta.field_schema.element_type()); file_manager_context.fieldDataMeta.field_schema.element_type());
return CreatePrimitiveScalarIndex( return CreatePrimitiveScalarIndex(
element_type, index_type, file_manager_context); element_type, create_index_info, file_manager_context);
} else { } else {
PanicInfo( PanicInfo(
Unsupported, Unsupported,
@ -373,9 +381,9 @@ IndexFactory::CreateScalarIndex(
case DataType::VARCHAR: case DataType::VARCHAR:
case DataType::STRING: case DataType::STRING:
return CreatePrimitiveScalarIndex( return CreatePrimitiveScalarIndex(
data_type, create_index_info.index_type, file_manager_context); data_type, create_index_info, file_manager_context);
case DataType::ARRAY: { case DataType::ARRAY: {
return CreateCompositeScalarIndex(create_index_info.index_type, return CreateCompositeScalarIndex(create_index_info,
file_manager_context); file_manager_context);
} }
case DataType::JSON: { case DataType::JSON: {

View File

@ -85,14 +85,14 @@ class IndexFactory {
IndexBasePtr IndexBasePtr
CreatePrimitiveScalarIndex( CreatePrimitiveScalarIndex(
DataType data_type, DataType data_type,
IndexType index_type, const CreateIndexInfo& create_index_info,
const storage::FileManagerContext& file_manager_context = const storage::FileManagerContext& file_manager_context =
storage::FileManagerContext()); storage::FileManagerContext());
// For types like array, struct, union, etc // For types like array, struct, union, etc
IndexBasePtr IndexBasePtr
CreateCompositeScalarIndex( CreateCompositeScalarIndex(
IndexType index_type, const CreateIndexInfo& create_index_info,
const storage::FileManagerContext& file_manager_context = const storage::FileManagerContext& file_manager_context =
storage::FileManagerContext()); storage::FileManagerContext());
@ -115,7 +115,7 @@ class IndexFactory {
template <typename T> template <typename T>
ScalarIndexPtr<T> ScalarIndexPtr<T>
CreatePrimitiveScalarIndex(const IndexType& index_type, CreatePrimitiveScalarIndex(const CreateIndexInfo& create_index_info,
const storage::FileManagerContext& file_manager = const storage::FileManagerContext& file_manager =
storage::FileManagerContext()); storage::FileManagerContext());
}; };

View File

@ -26,6 +26,7 @@ struct CreateIndexInfo {
IndexVersion index_engine_version; IndexVersion index_engine_version;
std::string field_name; std::string field_name;
int64_t dim; int64_t dim;
int32_t scalar_index_engine_version;
}; };
} // namespace milvus::index } // namespace milvus::index

View File

@ -84,14 +84,15 @@ InvertedIndexTantivy<T>::InitForBuildIndex() {
path_); path_);
} }
wrapper_ = std::make_shared<TantivyIndexWrapper>( wrapper_ = std::make_shared<TantivyIndexWrapper>(
field.c_str(), d_type_, path_.c_str()); field.c_str(), d_type_, path_.c_str(), inverted_index_single_segment_);
} }
template <typename T> template <typename T>
InvertedIndexTantivy<T>::InvertedIndexTantivy( InvertedIndexTantivy<T>::InvertedIndexTantivy(
const storage::FileManagerContext& ctx) const storage::FileManagerContext& ctx, bool inverted_index_single_segment)
: ScalarIndex<T>(INVERTED_INDEX_TYPE), : ScalarIndex<T>(INVERTED_INDEX_TYPE),
schema_(ctx.fieldDataMeta.field_schema) { schema_(ctx.fieldDataMeta.field_schema),
inverted_index_single_segment_(inverted_index_single_segment) {
mem_file_manager_ = std::make_shared<MemFileManager>(ctx); mem_file_manager_ = std::make_shared<MemFileManager>(ctx);
disk_file_manager_ = std::make_shared<DiskFileManager>(ctx); disk_file_manager_ = std::make_shared<DiskFileManager>(ctx);
// push init wrapper to load process // push init wrapper to load process
@ -387,9 +388,9 @@ InvertedIndexTantivy<T>::RegexQuery(const std::string& regex_pattern) {
template <typename T> template <typename T>
void void
InvertedIndexTantivy<T>::BuildWithRawData(size_t n, InvertedIndexTantivy<T>::BuildWithRawDataForUT(size_t n,
const void* values, const void* values,
const Config& config) { const Config& config) {
if constexpr (std::is_same_v<bool, T>) { if constexpr (std::is_same_v<bool, T>) {
schema_.set_data_type(proto::schema::DataType::Bool); schema_.set_data_type(proto::schema::DataType::Bool);
} }
@ -421,16 +422,35 @@ InvertedIndexTantivy<T>::BuildWithRawData(size_t n,
boost::filesystem::create_directories(path_); boost::filesystem::create_directories(path_);
d_type_ = get_tantivy_data_type(schema_); d_type_ = get_tantivy_data_type(schema_);
std::string field = "test_inverted_index"; std::string field = "test_inverted_index";
inverted_index_single_segment_ =
GetValueFromConfig<int32_t>(config,
milvus::index::SCALAR_INDEX_ENGINE_VERSION)
.value_or(1) == 0;
wrapper_ = std::make_shared<TantivyIndexWrapper>( wrapper_ = std::make_shared<TantivyIndexWrapper>(
field.c_str(), d_type_, path_.c_str()); field.c_str(), d_type_, path_.c_str(), inverted_index_single_segment_);
if (config.find("is_array") != config.end()) { if (!inverted_index_single_segment_) {
// only used in ut. if (config.find("is_array") != config.end()) {
auto arr = static_cast<const boost::container::vector<T>*>(values); // only used in ut.
for (size_t i = 0; i < n; i++) { auto arr = static_cast<const boost::container::vector<T>*>(values);
wrapper_->template add_multi_data(arr[i].data(), arr[i].size(), i); for (size_t i = 0; i < n; i++) {
wrapper_->template add_multi_data(
arr[i].data(), arr[i].size(), i);
}
} else {
wrapper_->add_data<T>(static_cast<const T*>(values), n, 0);
} }
} else { } else {
wrapper_->add_data<T>(static_cast<const T*>(values), n, 0); if (config.find("is_array") != config.end()) {
// only used in ut.
auto arr = static_cast<const boost::container::vector<T>*>(values);
for (size_t i = 0; i < n; i++) {
wrapper_->template add_multi_data_by_single_segment_writer(
arr[i].data(), arr[i].size());
}
} else {
wrapper_->add_data_by_single_segment_writer<T>(
static_cast<const T*>(values), n);
}
} }
wrapper_->create_reader(); wrapper_->create_reader();
finish(); finish();
@ -458,26 +478,48 @@ InvertedIndexTantivy<T>::BuildWithFieldData(
case proto::schema::DataType::Double: case proto::schema::DataType::Double:
case proto::schema::DataType::String: case proto::schema::DataType::String:
case proto::schema::DataType::VarChar: { case proto::schema::DataType::VarChar: {
int64_t offset = 0; // Generally, we will not build inverted index with single segment except for building index
if (schema_.nullable()) { // for query node with older version(2.4). See more comments above `inverted_index_single_segment_`.
for (const auto& data : field_datas) { if (!inverted_index_single_segment_) {
auto n = data->get_num_rows(); int64_t offset = 0;
for (int i = 0; i < n; i++) { if (schema_.nullable()) {
if (!data->is_valid(i)) { for (const auto& data : field_datas) {
null_offset.push_back(i); auto n = data->get_num_rows();
for (int i = 0; i < n; i++) {
if (!data->is_valid(i)) {
null_offset.push_back(i);
}
wrapper_->add_multi_data<T>(
static_cast<const T*>(data->RawValue(i)),
data->is_valid(i),
offset++);
} }
wrapper_->add_multi_data<T>( }
static_cast<const T*>(data->RawValue(i)), } else {
data->is_valid(i), for (const auto& data : field_datas) {
offset++); auto n = data->get_num_rows();
wrapper_->add_data<T>(
static_cast<const T*>(data->Data()), n, offset);
offset += n;
} }
} }
} else { } else {
for (const auto& data : field_datas) { for (const auto& data : field_datas) {
auto n = data->get_num_rows(); auto n = data->get_num_rows();
wrapper_->add_data<T>( if (schema_.nullable()) {
static_cast<const T*>(data->Data()), n, offset); for (int i = 0; i < n; i++) {
offset += n; if (!data->is_valid(i)) {
null_offset.push_back(i);
}
wrapper_
->add_multi_data_by_single_segment_writer<T>(
static_cast<const T*>(data->RawValue(i)),
data->is_valid(i));
}
continue;
}
wrapper_->add_data_by_single_segment_writer<T>(
static_cast<const T*>(data->Data()), n);
} }
} }
break; break;
@ -508,10 +550,15 @@ InvertedIndexTantivy<T>::build_index_for_array(
null_offset.push_back(i); null_offset.push_back(i);
} }
auto length = data->is_valid(i) ? array_column[i].length() : 0; auto length = data->is_valid(i) ? array_column[i].length() : 0;
wrapper_->template add_multi_data( if (!inverted_index_single_segment_) {
reinterpret_cast<const T*>(array_column[i].data()), wrapper_->template add_multi_data(
length, reinterpret_cast<const T*>(array_column[i].data()),
offset++); length,
offset++);
} else {
wrapper_->template add_multi_data_by_single_segment_writer(
reinterpret_cast<const T*>(array_column[i].data()), length);
}
} }
} }
} }
@ -537,7 +584,13 @@ InvertedIndexTantivy<std::string>::build_index_for_array(
array_column[i].template get_data<std::string>(j)); array_column[i].template get_data<std::string>(j));
} }
auto length = data->is_valid(i) ? output.size() : 0; auto length = data->is_valid(i) ? output.size() : 0;
wrapper_->template add_multi_data(output.data(), length, offset++); if (!inverted_index_single_segment_) {
wrapper_->template add_multi_data(
output.data(), length, offset++);
} else {
wrapper_->template add_multi_data_by_single_segment_writer(
output.data(), length);
}
} }
} }
} }

View File

@ -38,7 +38,8 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
InvertedIndexTantivy() : ScalarIndex<T>(INVERTED_INDEX_TYPE) { InvertedIndexTantivy() : ScalarIndex<T>(INVERTED_INDEX_TYPE) {
} }
explicit InvertedIndexTantivy(const storage::FileManagerContext& ctx); explicit InvertedIndexTantivy(const storage::FileManagerContext& ctx,
bool inverted_index_single_segment = false);
~InvertedIndexTantivy(); ~InvertedIndexTantivy();
@ -80,11 +81,11 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
return wrapper_->count(); return wrapper_->count();
} }
// BuildWithRawData should be only used in ut. Only string is supported. // BuildWithRawDataForUT should be only used in ut. Only string is supported.
void void
BuildWithRawData(size_t n, BuildWithRawDataForUT(size_t n,
const void* values, const void* values,
const Config& config = {}) override; const Config& config = {}) override;
BinarySet BinarySet
Serialize(const Config& config) override; Serialize(const Config& config) override;
@ -205,5 +206,14 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
// all data need to be built to align the offset // all data need to be built to align the offset
// so need to store null_offset in inverted index additionally // so need to store null_offset in inverted index additionally
std::vector<size_t> null_offset{}; std::vector<size_t> null_offset{};
// `inverted_index_single_segment_` is used to control whether to build tantivy index with single segment.
//
// In the older version of milvus, the query node can only read tantivy index built whtin single segment
// where the newer version builds and reads index of multi segments by default.
// However, the index may be built from a separate node from the query node where the index buliding node is a
// new version while the query node is a older version. So we have this `inverted_index_single_segment_` to control the index
// building node to build specific type of tantivy index.
bool inverted_index_single_segment_{false};
}; };
} // namespace milvus::index } // namespace milvus::index

View File

@ -46,6 +46,8 @@ constexpr const char* MARISA_TRIE_UPPER = "TRIE";
constexpr const char* INVERTED_INDEX_TYPE = "INVERTED"; constexpr const char* INVERTED_INDEX_TYPE = "INVERTED";
constexpr const char* BITMAP_INDEX_TYPE = "BITMAP"; constexpr const char* BITMAP_INDEX_TYPE = "BITMAP";
constexpr const char* HYBRID_INDEX_TYPE = "HYBRID"; constexpr const char* HYBRID_INDEX_TYPE = "HYBRID";
constexpr const char* SCALAR_INDEX_ENGINE_VERSION =
"scalar_index_engine_version";
// index meta // index meta
constexpr const char* COLLECTION_ID = "collection_id"; constexpr const char* COLLECTION_ID = "collection_id";

View File

@ -72,9 +72,9 @@ ScalarIndex<T>::Query(const DatasetPtr& dataset) {
template <> template <>
void void
ScalarIndex<std::string>::BuildWithRawData(size_t n, ScalarIndex<std::string>::BuildWithRawDataForUT(size_t n,
const void* values, const void* values,
const Config& config) { const Config& config) {
proto::schema::StringArray arr; proto::schema::StringArray arr;
auto ok = arr.ParseFromArray(values, n); auto ok = arr.ParseFromArray(values, n);
Assert(ok); Assert(ok);
@ -86,9 +86,9 @@ ScalarIndex<std::string>::BuildWithRawData(size_t n,
template <> template <>
void void
ScalarIndex<bool>::BuildWithRawData(size_t n, ScalarIndex<bool>::BuildWithRawDataForUT(size_t n,
const void* values, const void* values,
const Config& config) { const Config& config) {
proto::schema::BoolArray arr; proto::schema::BoolArray arr;
auto ok = arr.ParseFromArray(values, n); auto ok = arr.ParseFromArray(values, n);
Assert(ok); Assert(ok);
@ -97,54 +97,54 @@ ScalarIndex<bool>::BuildWithRawData(size_t n,
template <> template <>
void void
ScalarIndex<int8_t>::BuildWithRawData(size_t n, ScalarIndex<int8_t>::BuildWithRawDataForUT(size_t n,
const void* values, const void* values,
const Config& config) { const Config& config) {
auto data = reinterpret_cast<int8_t*>(const_cast<void*>(values)); auto data = reinterpret_cast<int8_t*>(const_cast<void*>(values));
Build(n, data); Build(n, data);
} }
template <> template <>
void void
ScalarIndex<int16_t>::BuildWithRawData(size_t n, ScalarIndex<int16_t>::BuildWithRawDataForUT(size_t n,
const void* values, const void* values,
const Config& config) { const Config& config) {
auto data = reinterpret_cast<int16_t*>(const_cast<void*>(values)); auto data = reinterpret_cast<int16_t*>(const_cast<void*>(values));
Build(n, data); Build(n, data);
} }
template <> template <>
void void
ScalarIndex<int32_t>::BuildWithRawData(size_t n, ScalarIndex<int32_t>::BuildWithRawDataForUT(size_t n,
const void* values, const void* values,
const Config& config) { const Config& config) {
auto data = reinterpret_cast<int32_t*>(const_cast<void*>(values)); auto data = reinterpret_cast<int32_t*>(const_cast<void*>(values));
Build(n, data); Build(n, data);
} }
template <> template <>
void void
ScalarIndex<int64_t>::BuildWithRawData(size_t n, ScalarIndex<int64_t>::BuildWithRawDataForUT(size_t n,
const void* values, const void* values,
const Config& config) { const Config& config) {
auto data = reinterpret_cast<int64_t*>(const_cast<void*>(values)); auto data = reinterpret_cast<int64_t*>(const_cast<void*>(values));
Build(n, data); Build(n, data);
} }
template <> template <>
void void
ScalarIndex<float>::BuildWithRawData(size_t n, ScalarIndex<float>::BuildWithRawDataForUT(size_t n,
const void* values, const void* values,
const Config& config) { const Config& config) {
auto data = reinterpret_cast<float*>(const_cast<void*>(values)); auto data = reinterpret_cast<float*>(const_cast<void*>(values));
Build(n, data); Build(n, data);
} }
template <> template <>
void void
ScalarIndex<double>::BuildWithRawData(size_t n, ScalarIndex<double>::BuildWithRawDataForUT(size_t n,
const void* values, const void* values,
const Config& config) { const Config& config) {
auto data = reinterpret_cast<double*>(const_cast<void*>(values)); auto data = reinterpret_cast<double*>(const_cast<void*>(values));
Build(n, data); Build(n, data);
} }

View File

@ -65,9 +65,9 @@ class ScalarIndex : public IndexBase {
} }
void void
BuildWithRawData(size_t n, BuildWithRawDataForUT(size_t n,
const void* values, const void* values,
const Config& config = {}) override; const Config& config = {}) override;
void void
BuildWithDataset(const DatasetPtr& dataset, BuildWithDataset(const DatasetPtr& dataset,

View File

@ -42,9 +42,9 @@ class VectorIndex : public IndexBase {
public: public:
void void
BuildWithRawData(size_t n, BuildWithRawDataForUT(size_t n,
const void* values, const void* values,
const Config& config = {}) override { const Config& config = {}) override {
PanicInfo(Unsupported, PanicInfo(Unsupported,
"vector index don't support build index with raw data"); "vector index don't support build index with raw data");
}; };

View File

@ -30,6 +30,13 @@ ScalarIndexCreator::ScalarIndexCreator(
if (config.contains("index_type")) { if (config.contains("index_type")) {
index_type_ = config.at("index_type").get<std::string>(); index_type_ = config.at("index_type").get<std::string>();
} }
// Config should have value for milvus::index::SCALAR_INDEX_ENGINE_VERSION for production calling chain.
// Use value_or(1) for unit test without setting this value
index_info.scalar_index_engine_version =
milvus::index::GetValueFromConfig<int32_t>(
config, milvus::index::SCALAR_INDEX_ENGINE_VERSION)
.value_or(1);
index_info.field_type = dtype_; index_info.field_type = dtype_;
index_info.index_type = index_type(); index_info.index_type = index_type();
index_ = index::IndexFactory::GetInstance().CreateIndex( index_ = index::IndexFactory::GetInstance().CreateIndex(
@ -40,7 +47,7 @@ void
ScalarIndexCreator::Build(const milvus::DatasetPtr& dataset) { ScalarIndexCreator::Build(const milvus::DatasetPtr& dataset) {
auto size = dataset->GetRows(); auto size = dataset->GetRows();
auto data = dataset->GetTensor(); auto data = dataset->GetTensor();
index_->BuildWithRawData(size, data); index_->BuildWithRawDataForUT(size, data);
} }
void void

View File

@ -166,30 +166,17 @@ CreateIndex(CIndex* res_index,
auto field_type = auto field_type =
static_cast<DataType>(build_index_info->field_schema().data_type()); static_cast<DataType>(build_index_info->field_schema().data_type());
milvus::index::CreateIndexInfo index_info;
index_info.field_type = field_type;
auto storage_config = auto storage_config =
get_storage_config(build_index_info->storage_config()); get_storage_config(build_index_info->storage_config());
auto config = get_config(build_index_info); auto config = get_config(build_index_info);
// get index type
auto index_type = milvus::index::GetValueFromConfig<std::string>(
config, "index_type");
AssertInfo(index_type.has_value(), "index type is empty");
index_info.index_type = index_type.value();
auto engine_version = build_index_info->current_index_version(); auto engine_version = build_index_info->current_index_version();
index_info.index_engine_version = engine_version;
config[milvus::index::INDEX_ENGINE_VERSION] = config[milvus::index::INDEX_ENGINE_VERSION] =
std::to_string(engine_version); std::to_string(engine_version);
auto scalar_index_engine_version =
// get metric type build_index_info->current_scalar_index_version();
if (milvus::IsVectorDataType(field_type)) { config[milvus::index::SCALAR_INDEX_ENGINE_VERSION] =
auto metric_type = milvus::index::GetValueFromConfig<std::string>( scalar_index_engine_version;
config, "metric_type");
AssertInfo(metric_type.has_value(), "metric type is empty");
index_info.metric_type = metric_type.value();
}
// init file manager // init file manager
milvus::storage::FieldDataMeta field_meta{ milvus::storage::FieldDataMeta field_meta{

View File

@ -1193,6 +1193,7 @@ name = "tantivy-binding"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"cbindgen", "cbindgen",
"either",
"env_logger", "env_logger",
"futures", "futures",
"jieba-rs", "jieba-rs",

View File

@ -17,6 +17,7 @@ lazy_static = "1.4.0"
serde_json = "1.0.128" serde_json = "1.0.128"
jieba-rs = "0.6.8" jieba-rs = "0.6.8"
regex = "1.11.1" regex = "1.11.1"
either = "1.13.0"
[build-dependencies] [build-dependencies]
cbindgen = "0.26.0" cbindgen = "0.26.0"

View File

@ -149,6 +149,10 @@ RustResult tantivy_create_index(const char *field_name,
uintptr_t num_threads, uintptr_t num_threads,
uintptr_t overall_memory_budget_in_bytes); uintptr_t overall_memory_budget_in_bytes);
RustResult tantivy_create_index_with_single_segment(const char *field_name,
TantivyDataType data_type,
const char *path);
void tantivy_free_index_writer(void *ptr); void tantivy_free_index_writer(void *ptr);
RustResult tantivy_finish_index(void *ptr); RustResult tantivy_finish_index(void *ptr);
@ -162,78 +166,140 @@ RustResult tantivy_index_add_int8s(void *ptr,
uintptr_t len, uintptr_t len,
int64_t offset_begin); int64_t offset_begin);
RustResult tantivy_index_add_int8s_by_single_segment_writer(void *ptr,
const int8_t *array,
uintptr_t len);
RustResult tantivy_index_add_int16s(void *ptr, RustResult tantivy_index_add_int16s(void *ptr,
const int16_t *array, const int16_t *array,
uintptr_t len, uintptr_t len,
int64_t offset_begin); int64_t offset_begin);
RustResult tantivy_index_add_int16s_by_single_segment_writer(void *ptr,
const int16_t *array,
uintptr_t len);
RustResult tantivy_index_add_int32s(void *ptr, RustResult tantivy_index_add_int32s(void *ptr,
const int32_t *array, const int32_t *array,
uintptr_t len, uintptr_t len,
int64_t offset_begin); int64_t offset_begin);
RustResult tantivy_index_add_int32s_by_single_segment_writer(void *ptr,
const int32_t *array,
uintptr_t len);
RustResult tantivy_index_add_int64s(void *ptr, RustResult tantivy_index_add_int64s(void *ptr,
const int64_t *array, const int64_t *array,
uintptr_t len, uintptr_t len,
int64_t offset_begin); int64_t offset_begin);
RustResult tantivy_index_add_int64s_by_single_segment_writer(void *ptr,
const int64_t *array,
uintptr_t len);
RustResult tantivy_index_add_f32s(void *ptr, RustResult tantivy_index_add_f32s(void *ptr,
const float *array, const float *array,
uintptr_t len, uintptr_t len,
int64_t offset_begin); int64_t offset_begin);
RustResult tantivy_index_add_f32s_by_single_segment_writer(void *ptr,
const float *array,
uintptr_t len);
RustResult tantivy_index_add_f64s(void *ptr, RustResult tantivy_index_add_f64s(void *ptr,
const double *array, const double *array,
uintptr_t len, uintptr_t len,
int64_t offset_begin); int64_t offset_begin);
RustResult tantivy_index_add_f64s_by_single_segment_writer(void *ptr,
const double *array,
uintptr_t len);
RustResult tantivy_index_add_bools(void *ptr, RustResult tantivy_index_add_bools(void *ptr,
const bool *array, const bool *array,
uintptr_t len, uintptr_t len,
int64_t offset_begin); int64_t offset_begin);
RustResult tantivy_index_add_bools_by_single_segment_writer(void *ptr,
const bool *array,
uintptr_t len);
RustResult tantivy_index_add_string(void *ptr, const char *s, int64_t offset); RustResult tantivy_index_add_string(void *ptr, const char *s, int64_t offset);
RustResult tantivy_index_add_string_by_single_segment_writer(void *ptr, const char *s);
RustResult tantivy_index_add_multi_int8s(void *ptr, RustResult tantivy_index_add_multi_int8s(void *ptr,
const int8_t *array, const int8_t *array,
uintptr_t len, uintptr_t len,
int64_t offset); int64_t offset);
RustResult tantivy_index_add_multi_int8s_by_single_segment_writer(void *ptr,
const int8_t *array,
uintptr_t len);
RustResult tantivy_index_add_multi_int16s(void *ptr, RustResult tantivy_index_add_multi_int16s(void *ptr,
const int16_t *array, const int16_t *array,
uintptr_t len, uintptr_t len,
int64_t offset); int64_t offset);
RustResult tantivy_index_add_multi_int16s_by_single_segment_writer(void *ptr,
const int16_t *array,
uintptr_t len);
RustResult tantivy_index_add_multi_int32s(void *ptr, RustResult tantivy_index_add_multi_int32s(void *ptr,
const int32_t *array, const int32_t *array,
uintptr_t len, uintptr_t len,
int64_t offset); int64_t offset);
RustResult tantivy_index_add_multi_int32s_by_single_segment_writer(void *ptr,
const int32_t *array,
uintptr_t len);
RustResult tantivy_index_add_multi_int64s(void *ptr, RustResult tantivy_index_add_multi_int64s(void *ptr,
const int64_t *array, const int64_t *array,
uintptr_t len, uintptr_t len,
int64_t offset); int64_t offset);
RustResult tantivy_index_add_multi_int64s_by_single_segment_writer(void *ptr,
const int64_t *array,
uintptr_t len);
RustResult tantivy_index_add_multi_f32s(void *ptr, RustResult tantivy_index_add_multi_f32s(void *ptr,
const float *array, const float *array,
uintptr_t len, uintptr_t len,
int64_t offset); int64_t offset);
RustResult tantivy_index_add_multi_f32s_by_single_segment_writer(void *ptr,
const float *array,
uintptr_t len);
RustResult tantivy_index_add_multi_f64s(void *ptr, RustResult tantivy_index_add_multi_f64s(void *ptr,
const double *array, const double *array,
uintptr_t len, uintptr_t len,
int64_t offset); int64_t offset);
RustResult tantivy_index_add_multi_f64s_by_single_segment_writer(void *ptr,
const double *array,
uintptr_t len);
RustResult tantivy_index_add_multi_bools(void *ptr, RustResult tantivy_index_add_multi_bools(void *ptr,
const bool *array, const bool *array,
uintptr_t len, uintptr_t len,
int64_t offset); int64_t offset);
RustResult tantivy_index_add_multi_bools_by_single_segment_writer(void *ptr,
const bool *array,
uintptr_t len);
RustResult tantivy_index_add_multi_keywords(void *ptr, RustResult tantivy_index_add_multi_keywords(void *ptr,
const char *const *array, const char *const *array,
uintptr_t len, uintptr_t len,
int64_t offset); int64_t offset);
RustResult tantivy_index_add_multi_keywords_by_single_segment_writer(void *ptr,
const char *const *array,
uintptr_t len);
RustResult tantivy_create_text_writer(const char *field_name, RustResult tantivy_create_text_writer(const char *field_name,
const char *path, const char *path,
const char *tokenizer_name, const char *tokenizer_name,

View File

@ -1,4 +1,3 @@
use std::default;
use std::ffi::c_void; use std::ffi::c_void;
use std::ptr::null; use std::ptr::null;
@ -132,19 +131,15 @@ pub extern "C" fn free_rust_result(result: RustResult) {
} }
_ => {} _ => {}
} }
unsafe { if !result.error.is_null() {
if !result.error.is_null() { free_rust_string(result.error as *mut c_char);
free_rust_string(result.error as *mut c_char);
}
} }
} }
#[no_mangle] #[no_mangle]
pub extern "C" fn free_rust_error(error: *const c_char) { pub extern "C" fn free_rust_error(error: *const c_char) {
unsafe { if !error.is_null() {
if !error.is_null() { free_rust_string(error as *mut c_char);
free_rust_string(error as *mut c_char);
}
} }
} }

View File

@ -1,4 +1,5 @@
#[repr(u8)] #[repr(u8)]
#[derive(Debug)]
pub enum TantivyDataType { pub enum TantivyDataType {
Text, Text,
Keyword, Keyword,

View File

@ -1,7 +1,5 @@
use core::{fmt, str}; use core::{fmt, str};
use serde_json as json;
#[derive(Debug)] #[derive(Debug)]
pub enum TantivyBindingError { pub enum TantivyBindingError {
JsonError(serde_json::Error), JsonError(serde_json::Error),

View File

@ -1,13 +1,9 @@
use std::{ use std::ffi::{c_char, c_void, CStr};
ffi::{c_char, c_void, CStr},
ptr::null,
};
use crate::{ use crate::{
array::{RustArray, RustResult}, array::RustResult,
cstr_to_str, cstr_to_str,
index_reader::IndexReaderWrapper, index_reader::IndexReaderWrapper,
string_c::create_string,
util::{create_binding, free_binding}, util::{create_binding, free_binding},
util_c::tantivy_index_exist, util_c::tantivy_index_exist,
}; };

View File

@ -4,7 +4,7 @@ use tantivy::{
Term, Term,
}; };
use crate::error::{Result, TantivyBindingError}; use crate::error::Result;
use crate::{index_reader::IndexReaderWrapper, tokenizer::standard_analyzer}; use crate::{index_reader::IndexReaderWrapper, tokenizer::standard_analyzer};
impl IndexReaderWrapper { impl IndexReaderWrapper {

View File

@ -1,13 +1,9 @@
use std::{ffi::CStr, ptr::null}; use std::ffi::CStr;
use libc::{c_char, c_void}; use libc::{c_char, c_void};
use crate::{ use crate::{
array::{RustArray, RustResult}, array::RustResult, cstr_to_str, index_reader::IndexReaderWrapper, log::init_log,
cstr_to_str,
index_reader::IndexReaderWrapper,
log::init_log,
string_c::{c_str_to_str, create_string},
tokenizer::create_tokenizer, tokenizer::create_tokenizer,
}; };

View File

@ -1,12 +1,14 @@
use std::ffi::CStr; use std::ffi::CStr;
use std::sync::Arc; use std::sync::Arc;
use either::Either;
use futures::executor::block_on; use futures::executor::block_on;
use libc::c_char; use libc::c_char;
use log::info;
use tantivy::schema::{ use tantivy::schema::{
Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST, INDEXED, Field, IndexRecordOption, Schema, SchemaBuilder, TextFieldIndexing, TextOptions, FAST, INDEXED,
}; };
use tantivy::{doc, tokenizer, Document, Index, IndexWriter}; use tantivy::{doc, Document, Index, IndexWriter, SingleSegmentIndexWriter};
use crate::data_type::TantivyDataType; use crate::data_type::TantivyDataType;
@ -16,11 +18,34 @@ use crate::log::init_log;
pub(crate) struct IndexWriterWrapper { pub(crate) struct IndexWriterWrapper {
pub(crate) field: Field, pub(crate) field: Field,
pub(crate) index_writer: IndexWriter, pub(crate) index_writer: Either<IndexWriter, SingleSegmentIndexWriter>,
pub(crate) id_field: Field, pub(crate) id_field: Option<Field>,
pub(crate) index: Arc<Index>, pub(crate) index: Arc<Index>,
} }
#[inline]
fn schema_builder_add_field(
schema_builder: &mut SchemaBuilder,
field_name: &str,
data_type: TantivyDataType,
) -> Field {
match data_type {
TantivyDataType::I64 => schema_builder.add_i64_field(field_name, INDEXED),
TantivyDataType::F64 => schema_builder.add_f64_field(field_name, INDEXED),
TantivyDataType::Bool => schema_builder.add_bool_field(field_name, INDEXED),
TantivyDataType::Keyword => {
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("raw")
.set_index_option(IndexRecordOption::Basic);
let text_options = TextOptions::default().set_indexing_options(text_field_indexing);
schema_builder.add_text_field(&field_name, text_options)
}
TantivyDataType::Text => {
panic!("text should be indexed with analyzer");
}
}
}
impl IndexWriterWrapper { impl IndexWriterWrapper {
pub fn new( pub fn new(
field_name: String, field_name: String,
@ -30,30 +55,10 @@ impl IndexWriterWrapper {
overall_memory_budget_in_bytes: usize, overall_memory_budget_in_bytes: usize,
) -> Result<IndexWriterWrapper> { ) -> Result<IndexWriterWrapper> {
init_log(); init_log();
info!("create index writer, field_name: {}, data_type: {:?}", field_name, data_type);
let field: Field;
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
match data_type { let field = schema_builder_add_field(&mut schema_builder, &field_name, data_type);
TantivyDataType::I64 => { // We cannot build direct connection from rows in multi-segments to milvus row data. So we have this doc_id field.
field = schema_builder.add_i64_field(&field_name, INDEXED);
}
TantivyDataType::F64 => {
field = schema_builder.add_f64_field(&field_name, INDEXED);
}
TantivyDataType::Bool => {
field = schema_builder.add_bool_field(&field_name, INDEXED);
}
TantivyDataType::Keyword => {
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("raw")
.set_index_option(IndexRecordOption::Basic);
let text_options = TextOptions::default().set_indexing_options(text_field_indexing);
field = schema_builder.add_text_field(&field_name, text_options);
}
TantivyDataType::Text => {
panic!("text should be indexed with analyzer");
}
}
let id_field = schema_builder.add_i64_field("doc_id", FAST); let id_field = schema_builder.add_i64_field("doc_id", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_dir(path.clone(), schema)?; let index = Index::create_in_dir(path.clone(), schema)?;
@ -61,8 +66,28 @@ impl IndexWriterWrapper {
index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?; index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?;
Ok(IndexWriterWrapper { Ok(IndexWriterWrapper {
field, field,
index_writer, index_writer: Either::Left(index_writer),
id_field, id_field: Some(id_field),
index: Arc::new(index),
})
}
pub fn new_with_single_segment(
field_name: String,
data_type: TantivyDataType,
path: String,
) -> Result<IndexWriterWrapper> {
init_log();
info!("create single segment index writer, field_name: {}, data_type: {:?}", field_name, data_type);
let mut schema_builder = Schema::builder();
let field = schema_builder_add_field(&mut schema_builder, &field_name, data_type);
let schema = schema_builder.build();
let index = Index::create_in_dir(path.clone(), schema)?;
let index_writer = SingleSegmentIndexWriter::new(index.clone(), 15 * 1024 * 1024)?;
Ok(IndexWriterWrapper {
field,
index_writer: Either::Right(index_writer),
id_field: None,
index: Arc::new(index), index: Arc::new(index),
}) })
} }
@ -71,6 +96,30 @@ impl IndexWriterWrapper {
IndexReaderWrapper::from_index(self.index.clone()) IndexReaderWrapper::from_index(self.index.clone())
} }
fn index_writer_add_document(&self, document: Document) -> Result<()> {
match self.index_writer {
Either::Left(ref writer) => {
let _ = writer.add_document(document)?;
}
Either::Right(_) => {
panic!("unexpected writer");
}
}
Ok(())
}
fn single_segment_index_writer_add_document(&mut self, document: Document) -> Result<()> {
match self.index_writer {
Either::Left(_) => {
panic!("unexpected writer");
}
Either::Right(ref mut single_segmnet_writer) => {
let _ = single_segmnet_writer.add_document(document)?;
}
}
Ok(())
}
pub fn add_i8(&mut self, data: i8, offset: i64) -> Result<()> { pub fn add_i8(&mut self, data: i8, offset: i64) -> Result<()> {
self.add_i64(data.into(), offset) self.add_i64(data.into(), offset)
} }
@ -84,11 +133,10 @@ impl IndexWriterWrapper {
} }
pub fn add_i64(&mut self, data: i64, offset: i64) -> Result<()> { pub fn add_i64(&mut self, data: i64, offset: i64) -> Result<()> {
let _ = self.index_writer.add_document(doc!( self.index_writer_add_document(doc!(
self.field => data, self.field => data,
self.id_field => offset, self.id_field.unwrap() => offset,
))?; ))
Ok(())
} }
pub fn add_f32(&mut self, data: f32, offset: i64) -> Result<()> { pub fn add_f32(&mut self, data: f32, offset: i64) -> Result<()> {
@ -96,27 +144,24 @@ impl IndexWriterWrapper {
} }
pub fn add_f64(&mut self, data: f64, offset: i64) -> Result<()> { pub fn add_f64(&mut self, data: f64, offset: i64) -> Result<()> {
let _ = self.index_writer.add_document(doc!( self.index_writer_add_document(doc!(
self.field => data, self.field => data,
self.id_field => offset, self.id_field.unwrap() => offset,
))?; ))
Ok(())
} }
pub fn add_bool(&mut self, data: bool, offset: i64) -> Result<()> { pub fn add_bool(&mut self, data: bool, offset: i64) -> Result<()> {
let _ = self.index_writer.add_document(doc!( self.index_writer_add_document(doc!(
self.field => data, self.field => data,
self.id_field => offset, self.id_field.unwrap() => offset,
))?; ))
Ok(())
} }
pub fn add_string(&mut self, data: &str, offset: i64) -> Result<()> { pub fn add_string(&mut self, data: &str, offset: i64) -> Result<()> {
let _ = self.index_writer.add_document(doc!( self.index_writer_add_document(doc!(
self.field => data, self.field => data,
self.id_field => offset, self.id_field.unwrap() => offset,
))?; ))
Ok(())
} }
pub fn add_multi_i8s(&mut self, datas: &[i8], offset: i64) -> Result<()> { pub fn add_multi_i8s(&mut self, datas: &[i8], offset: i64) -> Result<()> {
@ -124,9 +169,8 @@ impl IndexWriterWrapper {
for data in datas { for data in datas {
document.add_field_value(self.field, *data as i64); document.add_field_value(self.field, *data as i64);
} }
document.add_i64(self.id_field, offset); document.add_i64(self.id_field.unwrap(), offset);
let _ = self.index_writer.add_document(document)?; self.index_writer_add_document(document)
Ok(())
} }
pub fn add_multi_i16s(&mut self, datas: &[i16], offset: i64) -> Result<()> { pub fn add_multi_i16s(&mut self, datas: &[i16], offset: i64) -> Result<()> {
@ -134,9 +178,8 @@ impl IndexWriterWrapper {
for data in datas { for data in datas {
document.add_field_value(self.field, *data as i64); document.add_field_value(self.field, *data as i64);
} }
document.add_i64(self.id_field, offset); document.add_i64(self.id_field.unwrap(), offset);
let _ = self.index_writer.add_document(document)?; self.index_writer_add_document(document)
Ok(())
} }
pub fn add_multi_i32s(&mut self, datas: &[i32], offset: i64) -> Result<()> { pub fn add_multi_i32s(&mut self, datas: &[i32], offset: i64) -> Result<()> {
@ -144,9 +187,8 @@ impl IndexWriterWrapper {
for data in datas { for data in datas {
document.add_field_value(self.field, *data as i64); document.add_field_value(self.field, *data as i64);
} }
document.add_i64(self.id_field, offset); document.add_i64(self.id_field.unwrap(), offset);
let _ = self.index_writer.add_document(document)?; self.index_writer_add_document(document)
Ok(())
} }
pub fn add_multi_i64s(&mut self, datas: &[i64], offset: i64) -> Result<()> { pub fn add_multi_i64s(&mut self, datas: &[i64], offset: i64) -> Result<()> {
@ -154,9 +196,8 @@ impl IndexWriterWrapper {
for data in datas { for data in datas {
document.add_field_value(self.field, *data); document.add_field_value(self.field, *data);
} }
document.add_i64(self.id_field, offset); document.add_i64(self.id_field.unwrap(), offset);
let _ = self.index_writer.add_document(document)?; self.index_writer_add_document(document)
Ok(())
} }
pub fn add_multi_f32s(&mut self, datas: &[f32], offset: i64) -> Result<()> { pub fn add_multi_f32s(&mut self, datas: &[f32], offset: i64) -> Result<()> {
@ -164,9 +205,8 @@ impl IndexWriterWrapper {
for data in datas { for data in datas {
document.add_field_value(self.field, *data as f64); document.add_field_value(self.field, *data as f64);
} }
document.add_i64(self.id_field, offset); document.add_i64(self.id_field.unwrap(), offset);
let _ = self.index_writer.add_document(document)?; self.index_writer_add_document(document)
Ok(())
} }
pub fn add_multi_f64s(&mut self, datas: &[f64], offset: i64) -> Result<()> { pub fn add_multi_f64s(&mut self, datas: &[f64], offset: i64) -> Result<()> {
@ -174,9 +214,8 @@ impl IndexWriterWrapper {
for data in datas { for data in datas {
document.add_field_value(self.field, *data); document.add_field_value(self.field, *data);
} }
document.add_i64(self.id_field, offset); document.add_i64(self.id_field.unwrap(), offset);
let _ = self.index_writer.add_document(document)?; self.index_writer_add_document(document)
Ok(())
} }
pub fn add_multi_bools(&mut self, datas: &[bool], offset: i64) -> Result<()> { pub fn add_multi_bools(&mut self, datas: &[bool], offset: i64) -> Result<()> {
@ -184,9 +223,8 @@ impl IndexWriterWrapper {
for data in datas { for data in datas {
document.add_field_value(self.field, *data); document.add_field_value(self.field, *data);
} }
document.add_i64(self.id_field, offset); document.add_i64(self.id_field.unwrap(), offset);
let _ = self.index_writer.add_document(document)?; self.index_writer_add_document(document)
Ok(())
} }
pub fn add_multi_keywords(&mut self, datas: &[*const c_char], offset: i64) -> Result<()> { pub fn add_multi_keywords(&mut self, datas: &[*const c_char], offset: i64) -> Result<()> {
@ -195,31 +233,148 @@ impl IndexWriterWrapper {
let data = unsafe { CStr::from_ptr(*element) }; let data = unsafe { CStr::from_ptr(*element) };
document.add_field_value(self.field, data.to_str()?); document.add_field_value(self.field, data.to_str()?);
} }
document.add_i64(self.id_field, offset); document.add_i64(self.id_field.unwrap(), offset);
let _ = self.index_writer.add_document(document)?; self.index_writer_add_document(document)
Ok(()) }
pub fn add_i8_by_single_segment_writer(&mut self, data: i8) -> Result<()> {
self.add_i64_by_single_segment_writer(data.into())
}
pub fn add_i16_by_single_segment_writer(&mut self, data: i16) -> Result<()> {
self.add_i64_by_single_segment_writer(data.into())
}
pub fn add_i32_by_single_segment_writer(&mut self, data: i32) -> Result<()> {
self.add_i64_by_single_segment_writer(data.into())
}
pub fn add_i64_by_single_segment_writer(&mut self, data: i64) -> Result<()> {
self.single_segment_index_writer_add_document(doc!(
self.field => data
))
}
pub fn add_f32_by_single_segment_writer(&mut self, data: f32) -> Result<()> {
self.add_f64_by_single_segment_writer(data.into())
}
pub fn add_f64_by_single_segment_writer(&mut self, data: f64) -> Result<()> {
self.single_segment_index_writer_add_document(doc!(
self.field => data
))
}
pub fn add_bool_by_single_segment_writer(&mut self, data: bool) -> Result<()> {
self.single_segment_index_writer_add_document(doc!(
self.field => data
))
}
pub fn add_string_by_single_segment_writer(&mut self, data: &str) -> Result<()> {
self.single_segment_index_writer_add_document(doc!(
self.field => data
))
}
pub fn add_multi_i8s_by_single_segment_writer(&mut self, datas: &[i8]) -> Result<()> {
let mut document = Document::default();
for data in datas {
document.add_field_value(self.field, *data as i64);
}
self.single_segment_index_writer_add_document(document)
}
pub fn add_multi_i16s_by_single_segment_writer(&mut self, datas: &[i16]) -> Result<()> {
let mut document = Document::default();
for data in datas {
document.add_field_value(self.field, *data as i64);
}
self.single_segment_index_writer_add_document(document)
}
pub fn add_multi_i32s_by_single_segment_writer(&mut self, datas: &[i32]) -> Result<()> {
let mut document = Document::default();
for data in datas {
document.add_field_value(self.field, *data as i64);
}
self.single_segment_index_writer_add_document(document)
}
pub fn add_multi_i64s_by_single_segment_writer(&mut self, datas: &[i64]) -> Result<()> {
let mut document = Document::default();
for data in datas {
document.add_field_value(self.field, *data);
}
self.single_segment_index_writer_add_document(document)
}
pub fn add_multi_f32s_by_single_segment_writer(&mut self, datas: &[f32]) -> Result<()> {
let mut document = Document::default();
for data in datas {
document.add_field_value(self.field, *data as f64);
}
self.single_segment_index_writer_add_document(document)
}
pub fn add_multi_f64s_by_single_segment_writer(&mut self, datas: &[f64]) -> Result<()> {
let mut document = Document::default();
for data in datas {
document.add_field_value(self.field, *data);
}
self.single_segment_index_writer_add_document(document)
}
pub fn add_multi_bools_by_single_segment_writer(&mut self, datas: &[bool]) -> Result<()> {
let mut document = Document::default();
for data in datas {
document.add_field_value(self.field, *data);
}
self.single_segment_index_writer_add_document(document)
}
pub fn add_multi_keywords_by_single_segment_writer(
&mut self,
datas: &[*const c_char],
) -> Result<()> {
let mut document = Document::default();
for element in datas {
let data = unsafe { CStr::from_ptr(*element) };
document.add_field_value(self.field, data.to_str()?);
}
self.single_segment_index_writer_add_document(document)
} }
fn manual_merge(&mut self) -> Result<()> { fn manual_merge(&mut self) -> Result<()> {
let metas = self.index_writer.index().searchable_segment_metas()?; let index_writer = self.index_writer.as_mut().left().unwrap();
let policy = self.index_writer.get_merge_policy(); let metas = index_writer.index().searchable_segment_metas()?;
let policy = index_writer.get_merge_policy();
let candidates = policy.compute_merge_candidates(metas.as_slice()); let candidates = policy.compute_merge_candidates(metas.as_slice());
for candidate in candidates { for candidate in candidates {
self.index_writer.merge(candidate.0.as_slice()).wait()?; index_writer.merge(candidate.0.as_slice()).wait()?;
} }
Ok(()) Ok(())
} }
pub fn finish(mut self) -> Result<()> { pub fn finish(self) -> Result<()> {
self.index_writer.commit()?; match self.index_writer {
// self.manual_merge(); Either::Left(mut index_writer) => {
block_on(self.index_writer.garbage_collect_files())?; index_writer.commit()?;
self.index_writer.wait_merging_threads()?; // self.manual_merge();
block_on(index_writer.garbage_collect_files())?;
index_writer.wait_merging_threads()?;
}
Either::Right(single_segment_index_writer) => {
single_segment_index_writer
.finalize()
.expect("failed to build inverted index");
}
}
Ok(()) Ok(())
} }
pub(crate) fn commit(&mut self) -> Result<()> { pub(crate) fn commit(&mut self) -> Result<()> {
self.index_writer.commit()?; self.index_writer.as_mut().left().unwrap().commit()?;
Ok(()) Ok(())
} }
} }

View File

@ -1,10 +1,5 @@
use core::slice; use core::slice;
use std::{ use std::ffi::{c_char, c_void, CStr};
ffi::{c_char, c_void, CStr},
ptr::null,
};
use tantivy::Index;
use crate::{ use crate::{
array::RustResult, array::RustResult,
@ -47,6 +42,24 @@ pub extern "C" fn tantivy_create_index(
} }
} }
#[no_mangle]
pub extern "C" fn tantivy_create_index_with_single_segment(
field_name: *const c_char,
data_type: TantivyDataType,
path: *const c_char,
) -> RustResult {
let field_name_str = cstr_to_str!(field_name);
let path_str = cstr_to_str!(path);
match IndexWriterWrapper::new_with_single_segment(
String::from(field_name_str),
data_type,
String::from(path_str),
) {
Ok(wrapper) => RustResult::from_ptr(create_binding(wrapper)),
Err(e) => RustResult::from_error(e.to_string()),
}
}
#[no_mangle] #[no_mangle]
pub extern "C" fn tantivy_free_index_writer(ptr: *mut c_void) { pub extern "C" fn tantivy_free_index_writer(ptr: *mut c_void) {
free_binding::<IndexWriterWrapper>(ptr); free_binding::<IndexWriterWrapper>(ptr);
@ -77,6 +90,29 @@ pub extern "C" fn tantivy_create_reader_from_writer(ptr: *mut c_void) -> RustRes
} }
// -------------------------build-------------------- // -------------------------build--------------------
fn execute<T: Copy>(
arr: &[T],
offset: i64,
e: fn(&mut IndexWriterWrapper, T, i64) -> Result<()>,
w: &mut IndexWriterWrapper,
) -> Result<()> {
for (index, data) in arr.iter().enumerate() {
e(w, *data, offset + (index as i64))?;
}
Ok(())
}
fn execute_by_single_segment_writer<T: Copy>(
arr: &[T],
e: fn(&mut IndexWriterWrapper, T) -> Result<()>,
w: &mut IndexWriterWrapper,
) -> Result<()> {
for (_, data) in arr.iter().enumerate() {
e(w, *data)?;
}
Ok(())
}
#[no_mangle] #[no_mangle]
pub extern "C" fn tantivy_index_add_int8s( pub extern "C" fn tantivy_index_add_int8s(
ptr: *mut c_void, ptr: *mut c_void,
@ -89,6 +125,24 @@ pub extern "C" fn tantivy_index_add_int8s(
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i8, &mut (*real)).into() } unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i8, &mut (*real)).into() }
} }
#[no_mangle]
pub extern "C" fn tantivy_index_add_int8s_by_single_segment_writer(
ptr: *mut c_void,
array: *const i8,
len: usize,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe {
execute_by_single_segment_writer(
arr,
IndexWriterWrapper::add_i8_by_single_segment_writer,
&mut (*real),
)
.into()
}
}
#[no_mangle] #[no_mangle]
pub extern "C" fn tantivy_index_add_int16s( pub extern "C" fn tantivy_index_add_int16s(
ptr: *mut c_void, ptr: *mut c_void,
@ -101,6 +155,24 @@ pub extern "C" fn tantivy_index_add_int16s(
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i16, &mut (*real)).into() } unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i16, &mut (*real)).into() }
} }
#[no_mangle]
pub extern "C" fn tantivy_index_add_int16s_by_single_segment_writer(
ptr: *mut c_void,
array: *const i16,
len: usize,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe {
execute_by_single_segment_writer(
arr,
IndexWriterWrapper::add_i16_by_single_segment_writer,
&mut (*real),
)
.into()
}
}
#[no_mangle] #[no_mangle]
pub extern "C" fn tantivy_index_add_int32s( pub extern "C" fn tantivy_index_add_int32s(
ptr: *mut c_void, ptr: *mut c_void,
@ -113,6 +185,24 @@ pub extern "C" fn tantivy_index_add_int32s(
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i32, &mut (*real)).into() } unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i32, &mut (*real)).into() }
} }
#[no_mangle]
pub extern "C" fn tantivy_index_add_int32s_by_single_segment_writer(
ptr: *mut c_void,
array: *const i32,
len: usize,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe {
execute_by_single_segment_writer(
arr,
IndexWriterWrapper::add_i32_by_single_segment_writer,
&mut (*real),
)
.into()
}
}
#[no_mangle] #[no_mangle]
pub extern "C" fn tantivy_index_add_int64s( pub extern "C" fn tantivy_index_add_int64s(
ptr: *mut c_void, ptr: *mut c_void,
@ -126,18 +216,23 @@ pub extern "C" fn tantivy_index_add_int64s(
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i64, &mut (*real)).into() } unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i64, &mut (*real)).into() }
} }
fn execute<T: Copy>( #[no_mangle]
arr: &[T], pub extern "C" fn tantivy_index_add_int64s_by_single_segment_writer(
offset: i64, ptr: *mut c_void,
mut e: fn(&mut IndexWriterWrapper, T, i64) -> Result<()>, array: *const i64,
w: &mut IndexWriterWrapper, len: usize,
) -> Result<()> { ) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe { unsafe {
for (index, data) in arr.iter().enumerate() { execute_by_single_segment_writer(
e(w, *data, offset + (index as i64))?; arr,
} IndexWriterWrapper::add_i64_by_single_segment_writer,
&mut (*real),
)
.into()
} }
Ok(())
} }
#[no_mangle] #[no_mangle]
@ -152,6 +247,24 @@ pub extern "C" fn tantivy_index_add_f32s(
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_f32, &mut (*real)).into() } unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_f32, &mut (*real)).into() }
} }
#[no_mangle]
pub extern "C" fn tantivy_index_add_f32s_by_single_segment_writer(
ptr: *mut c_void,
array: *const f32,
len: usize,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe {
execute_by_single_segment_writer(
arr,
IndexWriterWrapper::add_f32_by_single_segment_writer,
&mut (*real),
)
.into()
}
}
#[no_mangle] #[no_mangle]
pub extern "C" fn tantivy_index_add_f64s( pub extern "C" fn tantivy_index_add_f64s(
ptr: *mut c_void, ptr: *mut c_void,
@ -164,6 +277,24 @@ pub extern "C" fn tantivy_index_add_f64s(
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_f64, &mut (*real)).into() } unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_f64, &mut (*real)).into() }
} }
#[no_mangle]
pub extern "C" fn tantivy_index_add_f64s_by_single_segment_writer(
ptr: *mut c_void,
array: *const f64,
len: usize,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe {
execute_by_single_segment_writer(
arr,
IndexWriterWrapper::add_f64_by_single_segment_writer,
&mut (*real),
)
.into()
}
}
#[no_mangle] #[no_mangle]
pub extern "C" fn tantivy_index_add_bools( pub extern "C" fn tantivy_index_add_bools(
ptr: *mut c_void, ptr: *mut c_void,
@ -184,6 +315,24 @@ pub extern "C" fn tantivy_index_add_bools(
} }
} }
#[no_mangle]
pub extern "C" fn tantivy_index_add_bools_by_single_segment_writer(
ptr: *mut c_void,
array: *const bool,
len: usize,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe {
execute_by_single_segment_writer(
arr,
IndexWriterWrapper::add_bool_by_single_segment_writer,
&mut (*real),
)
.into()
}
}
// TODO: this is not a very efficient way, since we must call this function many times, which // TODO: this is not a very efficient way, since we must call this function many times, which
// will bring a lot of overhead caused by the rust binding. // will bring a lot of overhead caused by the rust binding.
#[no_mangle] #[no_mangle]
@ -197,6 +346,16 @@ pub extern "C" fn tantivy_index_add_string(
unsafe { (*real).add_string(s, offset).into() } unsafe { (*real).add_string(s, offset).into() }
} }
#[no_mangle]
pub extern "C" fn tantivy_index_add_string_by_single_segment_writer(
ptr: *mut c_void,
s: *const c_char,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let s = cstr_to_str!(s);
unsafe { (*real).add_string_by_single_segment_writer(s).into() }
}
// --------------------------------------------- array ------------------------------------------ // --------------------------------------------- array ------------------------------------------
#[no_mangle] #[no_mangle]
@ -213,6 +372,19 @@ pub extern "C" fn tantivy_index_add_multi_int8s(
} }
} }
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_int8s_by_single_segment_writer(
ptr: *mut c_void,
array: *const i8,
len: usize,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = convert_to_rust_slice!(array, len);
(*real).add_multi_i8s_by_single_segment_writer(arr).into()
}
}
#[no_mangle] #[no_mangle]
pub extern "C" fn tantivy_index_add_multi_int16s( pub extern "C" fn tantivy_index_add_multi_int16s(
ptr: *mut c_void, ptr: *mut c_void,
@ -227,6 +399,19 @@ pub extern "C" fn tantivy_index_add_multi_int16s(
} }
} }
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_int16s_by_single_segment_writer(
ptr: *mut c_void,
array: *const i16,
len: usize,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = convert_to_rust_slice!(array, len);
(*real).add_multi_i16s_by_single_segment_writer(arr).into()
}
}
#[no_mangle] #[no_mangle]
pub extern "C" fn tantivy_index_add_multi_int32s( pub extern "C" fn tantivy_index_add_multi_int32s(
ptr: *mut c_void, ptr: *mut c_void,
@ -241,6 +426,19 @@ pub extern "C" fn tantivy_index_add_multi_int32s(
} }
} }
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_int32s_by_single_segment_writer(
ptr: *mut c_void,
array: *const i32,
len: usize,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = convert_to_rust_slice!(array, len);
(*real).add_multi_i32s_by_single_segment_writer(arr).into()
}
}
#[no_mangle] #[no_mangle]
pub extern "C" fn tantivy_index_add_multi_int64s( pub extern "C" fn tantivy_index_add_multi_int64s(
ptr: *mut c_void, ptr: *mut c_void,
@ -255,6 +453,19 @@ pub extern "C" fn tantivy_index_add_multi_int64s(
} }
} }
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_int64s_by_single_segment_writer(
ptr: *mut c_void,
array: *const i64,
len: usize,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = convert_to_rust_slice!(array, len);
(*real).add_multi_i64s_by_single_segment_writer(arr).into()
}
}
#[no_mangle] #[no_mangle]
pub extern "C" fn tantivy_index_add_multi_f32s( pub extern "C" fn tantivy_index_add_multi_f32s(
ptr: *mut c_void, ptr: *mut c_void,
@ -269,6 +480,19 @@ pub extern "C" fn tantivy_index_add_multi_f32s(
} }
} }
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_f32s_by_single_segment_writer(
ptr: *mut c_void,
array: *const f32,
len: usize,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = convert_to_rust_slice!(array, len);
(*real).add_multi_f32s_by_single_segment_writer(arr).into()
}
}
#[no_mangle] #[no_mangle]
pub extern "C" fn tantivy_index_add_multi_f64s( pub extern "C" fn tantivy_index_add_multi_f64s(
ptr: *mut c_void, ptr: *mut c_void,
@ -283,6 +507,19 @@ pub extern "C" fn tantivy_index_add_multi_f64s(
} }
} }
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_f64s_by_single_segment_writer(
ptr: *mut c_void,
array: *const f64,
len: usize,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = convert_to_rust_slice!(array, len);
(*real).add_multi_f64s_by_single_segment_writer(arr).into()
}
}
#[no_mangle] #[no_mangle]
pub extern "C" fn tantivy_index_add_multi_bools( pub extern "C" fn tantivy_index_add_multi_bools(
ptr: *mut c_void, ptr: *mut c_void,
@ -297,6 +534,19 @@ pub extern "C" fn tantivy_index_add_multi_bools(
} }
} }
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_bools_by_single_segment_writer(
ptr: *mut c_void,
array: *const bool,
len: usize,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = convert_to_rust_slice!(array, len);
(*real).add_multi_bools_by_single_segment_writer(arr).into()
}
}
#[no_mangle] #[no_mangle]
pub extern "C" fn tantivy_index_add_multi_keywords( pub extern "C" fn tantivy_index_add_multi_keywords(
ptr: *mut c_void, ptr: *mut c_void,
@ -310,3 +560,18 @@ pub extern "C" fn tantivy_index_add_multi_keywords(
(*real).add_multi_keywords(arr, offset).into() (*real).add_multi_keywords(arr, offset).into()
} }
} }
#[no_mangle]
pub extern "C" fn tantivy_index_add_multi_keywords_by_single_segment_writer(
ptr: *mut c_void,
array: *const *const c_char,
len: usize,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = convert_to_rust_slice!(array, len);
(*real)
.add_multi_keywords_by_single_segment_writer(arr)
.into()
}
}

View File

@ -1,5 +1,6 @@
use std::sync::Arc; use std::sync::Arc;
use either::Either;
use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST}; use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST};
use tantivy::tokenizer::TextAnalyzer; use tantivy::tokenizer::TextAnalyzer;
use tantivy::Index; use tantivy::Index;
@ -44,8 +45,8 @@ impl IndexWriterWrapper {
IndexWriterWrapper { IndexWriterWrapper {
field, field,
index_writer, index_writer: Either::Left(index_writer),
id_field, id_field: Some(id_field),
index: Arc::new(index), index: Arc::new(index),
} }
} }

View File

@ -1,13 +1,10 @@
use std::ffi::c_char; use std::ffi::c_char;
use std::ffi::c_void;
use std::ffi::CStr; use std::ffi::CStr;
use crate::array::RustResult; use crate::array::RustResult;
use crate::cstr_to_str; use crate::cstr_to_str;
use crate::error::Result;
use crate::index_writer::IndexWriterWrapper; use crate::index_writer::IndexWriterWrapper;
use crate::log::init_log; use crate::log::init_log;
use crate::string_c::c_str_to_str;
use crate::tokenizer::create_tokenizer; use crate::tokenizer::create_tokenizer;
use crate::util::create_binding; use crate::util::create_binding;

View File

@ -81,15 +81,22 @@ struct TantivyIndexWrapper {
TantivyIndexWrapper(const char* field_name, TantivyIndexWrapper(const char* field_name,
TantivyDataType data_type, TantivyDataType data_type,
const char* path, const char* path,
bool inverted_single_semgnent = false,
uintptr_t num_threads = DEFAULT_NUM_THREADS, uintptr_t num_threads = DEFAULT_NUM_THREADS,
uintptr_t overall_memory_budget_in_bytes = uintptr_t overall_memory_budget_in_bytes =
DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES) { DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES) {
auto res = RustResultWrapper( RustResultWrapper res;
tantivy_create_index(field_name, if (inverted_single_semgnent) {
data_type, res = RustResultWrapper(tantivy_create_index_with_single_segment(
path, field_name, data_type, path));
num_threads, } else {
overall_memory_budget_in_bytes)); res = RustResultWrapper(
tantivy_create_index(field_name,
data_type,
path,
num_threads,
overall_memory_budget_in_bytes));
}
AssertInfo(res.result_->success, AssertInfo(res.result_->success,
"failed to create index: {}", "failed to create index: {}",
res.result_->error); res.result_->error);
@ -340,6 +347,193 @@ struct TantivyIndexWrapper {
typeid(T).name()); typeid(T).name());
} }
template <typename T>
void
add_data_by_single_segment_writer(const T* array, uintptr_t len) {
assert(!finished_);
if constexpr (std::is_same_v<T, bool>) {
auto res = RustResultWrapper(
tantivy_index_add_bools_by_single_segment_writer(
writer_, array, len));
AssertInfo(res.result_->success,
"failed to add bools: {}",
res.result_->error);
return;
}
if constexpr (std::is_same_v<T, int8_t>) {
auto res = RustResultWrapper(
tantivy_index_add_int8s_by_single_segment_writer(
writer_, array, len));
AssertInfo(res.result_->success,
"failed to add int8s: {}",
res.result_->error);
return;
}
if constexpr (std::is_same_v<T, int16_t>) {
auto res = RustResultWrapper(
tantivy_index_add_int16s_by_single_segment_writer(
writer_, array, len));
AssertInfo(res.result_->success,
"failed to add int16s: {}",
res.result_->error);
return;
}
if constexpr (std::is_same_v<T, int32_t>) {
auto res = RustResultWrapper(
tantivy_index_add_int32s_by_single_segment_writer(
writer_, array, len));
AssertInfo(res.result_->success,
"failed to add int32s: {}",
res.result_->error);
return;
}
if constexpr (std::is_same_v<T, int64_t>) {
auto res = RustResultWrapper(
tantivy_index_add_int64s_by_single_segment_writer(
writer_, array, len));
AssertInfo(res.result_->success,
"failed to add int64s: {}",
res.result_->error);
return;
}
if constexpr (std::is_same_v<T, float>) {
auto res = RustResultWrapper(
tantivy_index_add_f32s_by_single_segment_writer(
writer_, array, len));
AssertInfo(res.result_->success,
"failed to add f32s: {}",
res.result_->error);
return;
}
if constexpr (std::is_same_v<T, double>) {
auto res = RustResultWrapper(
tantivy_index_add_f64s_by_single_segment_writer(
writer_, array, len));
AssertInfo(res.result_->success,
"failed to add f64s: {}",
res.result_->error);
return;
}
if constexpr (std::is_same_v<T, std::string>) {
// TODO: not very efficient, a lot of overhead due to rust-ffi call.
for (uintptr_t i = 0; i < len; i++) {
auto res = RustResultWrapper(
tantivy_index_add_string_by_single_segment_writer(
writer_,
static_cast<const std::string*>(array)[i].c_str()));
AssertInfo(res.result_->success,
"failed to add string: {}",
res.result_->error);
}
return;
}
throw fmt::format("InvertedIndex.add_data: unsupported data type: {}",
typeid(T).name());
}
template <typename T>
void
add_multi_data_by_single_segment_writer(const T* array, uintptr_t len) {
assert(!finished_);
if constexpr (std::is_same_v<T, bool>) {
auto res = RustResultWrapper(
tantivy_index_add_multi_bools_by_single_segment_writer(
writer_, array, len));
AssertInfo(res.result_->success,
"failed to add multi bools: {}",
res.result_->error);
return;
}
if constexpr (std::is_same_v<T, int8_t>) {
auto res = RustResultWrapper(
tantivy_index_add_multi_int8s_by_single_segment_writer(
writer_, array, len));
AssertInfo(res.result_->success,
"failed to add multi int8s: {}",
res.result_->error);
return;
}
if constexpr (std::is_same_v<T, int16_t>) {
auto res = RustResultWrapper(
tantivy_index_add_multi_int16s_by_single_segment_writer(
writer_, array, len));
AssertInfo(res.result_->success,
"failed to add multi int16s: {}",
res.result_->error);
return;
}
if constexpr (std::is_same_v<T, int32_t>) {
auto res = RustResultWrapper(
tantivy_index_add_multi_int32s_by_single_segment_writer(
writer_, array, len));
AssertInfo(res.result_->success,
"failed to add multi int32s: {}",
res.result_->error);
return;
}
if constexpr (std::is_same_v<T, int64_t>) {
auto res = RustResultWrapper(
tantivy_index_add_multi_int64s_by_single_segment_writer(
writer_, array, len));
AssertInfo(res.result_->success,
"failed to add multi int64s: {}",
res.result_->error);
return;
}
if constexpr (std::is_same_v<T, float>) {
auto res = RustResultWrapper(
tantivy_index_add_multi_f32s_by_single_segment_writer(
writer_, array, len));
AssertInfo(res.result_->success,
"failed to add multi f32s: {}",
res.result_->error);
return;
}
if constexpr (std::is_same_v<T, double>) {
auto res = RustResultWrapper(
tantivy_index_add_multi_f64s_by_single_segment_writer(
writer_, array, len));
AssertInfo(res.result_->success,
"failed to add multi f64s: {}",
res.result_->error);
return;
}
if constexpr (std::is_same_v<T, std::string>) {
std::vector<const char*> views;
for (uintptr_t i = 0; i < len; i++) {
views.push_back(array[i].c_str());
}
auto res = RustResultWrapper(
tantivy_index_add_multi_keywords_by_single_segment_writer(
writer_, views.data(), len));
AssertInfo(res.result_->success,
"failed to add multi keywords: {}",
res.result_->error);
return;
}
throw fmt::format(
"InvertedIndex.add_multi_data: unsupported data type: {}",
typeid(T).name());
}
inline void inline void
finish() { finish() {
if (finished_) { if (finished_) {

View File

@ -90,6 +90,7 @@ set(MILVUS_TEST_FILES
test_chunked_column.cpp test_chunked_column.cpp
test_rust_result.cpp test_rust_result.cpp
test_cached_search_iterator.cpp test_cached_search_iterator.cpp
test_build_inverted_index_with_single_segment.cpp
) )
if ( INDEX_ENGINE STREQUAL "cardinal" ) if ( INDEX_ENGINE STREQUAL "cardinal" )

View File

@ -113,7 +113,7 @@ class ArrayInvertedIndexTest : public ::testing::Test {
auto index = std::make_unique<index::InvertedIndexTantivy<T>>(); auto index = std::make_unique<index::InvertedIndexTantivy<T>>();
Config cfg; Config cfg;
cfg["is_array"] = true; cfg["is_array"] = true;
index->BuildWithRawData(N_, vec_of_array_.data(), cfg); index->BuildWithRawDataForUT(N_, vec_of_array_.data(), cfg);
LoadIndexInfo info{ LoadIndexInfo info{
.field_id = schema_->get_field_id(FieldName("array")).get(), .field_id = schema_->get_field_id(FieldName("array")).get(),
.index = std::move(index), .index = std::move(index),

View File

@ -0,0 +1,214 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <random>
#include <gtest/gtest.h>
#include "pb/plan.pb.h"
#include "segcore/SegmentSealedImpl.h"
#include "index/InvertedIndexTantivy.h"
#include "test_utils/DataGen.h"
#include "common/Schema.h"
#include "test_utils/GenExprProto.h"
#include "query/PlanProto.h"
#include "query/ExecPlanNodeVisitor.h"
using namespace milvus;
using namespace milvus::query;
using namespace milvus::segcore;
template <typename T>
SchemaPtr
GenSchema() {
auto schema_ = std::make_shared<Schema>();
auto pk = schema_->AddDebugField("pk", DataType::INT64);
schema_->set_primary_field_id(pk);
if constexpr (std::is_same_v<T, bool>) {
schema_->AddDebugField("index", DataType::BOOL, false);
} else if constexpr (std::is_same_v<T, int8_t>) {
schema_->AddDebugField("index", DataType::INT8, false);
} else if constexpr (std::is_same_v<T, int16_t>) {
schema_->AddDebugField("index", DataType::INT16, false);
} else if constexpr (std::is_same_v<T, int32_t>) {
schema_->AddDebugField("index", DataType::INT32, false);
} else if constexpr (std::is_same_v<T, int64_t>) {
schema_->AddDebugField("index", DataType::INT64, false);
} else if constexpr (std::is_same_v<T, float>) {
schema_->AddDebugField("index", DataType::FLOAT, false);
} else if constexpr (std::is_same_v<T, double>) {
schema_->AddDebugField("index", DataType::DOUBLE, false);
} else if constexpr (std::is_same_v<T, std::string>) {
schema_->AddDebugField("index", DataType::VARCHAR, false);
}
return schema_;
}
template <typename T>
class BuildInvertedIndexWithSingleSegmentTest : public ::testing::Test {
public:
void
SetUp() override {
schema_ = GenSchema<T>();
seg_ = CreateSealedSegment(schema_);
N_ = 3000;
uint64_t seed = 1234;
auto raw_data = DataGen(schema_, N_, seed);
if constexpr (std::is_same_v<T, bool>) {
auto index_col =
raw_data.get_col(schema_->get_field_id(FieldName("index")))
->scalars()
.bool_data()
.data();
for (size_t i = 0; i < N_; i++) {
index_column_data_.push_back(index_col[i]);
}
} else if constexpr (std::is_same_v<T, int64_t>) {
auto index_col =
raw_data.get_col(schema_->get_field_id(FieldName("index")))
->scalars()
.long_data()
.data();
for (size_t i = 0; i < N_; i++) {
index_column_data_.push_back(index_col[i]);
}
} else if constexpr (std::is_integral_v<T>) {
auto index_col =
raw_data.get_col(schema_->get_field_id(FieldName("index")))
->scalars()
.int_data()
.data();
for (size_t i = 0; i < N_; i++) {
index_column_data_.push_back(index_col[i]);
}
} else if constexpr (std::is_same_v<T, float>) {
auto index_col =
raw_data.get_col(schema_->get_field_id(FieldName("index")))
->scalars()
.float_data()
.data();
for (size_t i = 0; i < N_; i++) {
index_column_data_.push_back(index_col[i]);
}
} else if constexpr (std::is_same_v<T, double>) {
auto index_col =
raw_data.get_col(schema_->get_field_id(FieldName("index")))
->scalars()
.double_data()
.data();
for (size_t i = 0; i < N_; i++) {
index_column_data_.push_back(index_col[i]);
}
} else if constexpr (std::is_same_v<T, std::string>) {
auto index_col =
raw_data.get_col(schema_->get_field_id(FieldName("index")))
->scalars()
.string_data()
.data();
for (size_t i = 0; i < N_; i++) {
index_column_data_.push_back(index_col[i]);
}
}
SealedLoadFieldData(raw_data, *seg_);
LoadInvertedIndex();
}
void
TearDown() override {
}
void
LoadInvertedIndex() {
auto index = std::make_unique<index::InvertedIndexTantivy<T>>();
Config cfg;
cfg[milvus::index::SCALAR_INDEX_ENGINE_VERSION] = 0;
index->BuildWithRawDataForUT(N_, index_column_data_.data(), cfg);
LoadIndexInfo info{
.field_id = schema_->get_field_id(FieldName("index")).get(),
.index = std::move(index),
};
seg_->LoadIndex(info);
}
T
FieldValueAt(int64_t offset) {
return index_column_data_[offset];
}
public:
SchemaPtr schema_;
SegmentSealedUPtr seg_;
int64_t N_;
boost::container::vector<T> index_column_data_;
};
TYPED_TEST_SUITE_P(BuildInvertedIndexWithSingleSegmentTest);
TYPED_TEST_P(BuildInvertedIndexWithSingleSegmentTest,
ReadFromSingleSegmentIndex) {
const auto& meta = this->schema_->operator[](FieldName("index"));
for (size_t i = 0; i < 10; i++) {
auto column_info = test::GenColumnInfo(
meta.get_id().get(),
static_cast<proto::schema::DataType>(meta.get_data_type()),
false,
false,
static_cast<proto::schema::DataType>(meta.get_element_type()));
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<> int_dist(1, this->N_);
int random_idx = int_dist(gen) - 1;
auto unary_range_expr = std::make_unique<proto::plan::UnaryRangeExpr>();
unary_range_expr->set_allocated_column_info(column_info);
unary_range_expr->set_op(proto::plan::OpType::Equal);
auto val = this->FieldValueAt(random_idx);
unary_range_expr->set_allocated_value(test::GenGenericValue(val));
auto expr = test::GenExpr();
expr->set_allocated_unary_range_expr(unary_range_expr.release());
auto parser = ProtoParser(*this->schema_);
auto typed_expr = parser.ParseExprs(*expr);
auto parsed = std::make_shared<plan::FilterBitsNode>(
DEFAULT_PLANNODE_ID, typed_expr);
auto segpromote = dynamic_cast<SegmentSealedImpl*>(this->seg_.get());
BitsetType final;
final = ExecuteQueryExpr(parsed, segpromote, this->N_, MAX_TIMESTAMP);
auto ref = [this, random_idx](size_t offset) -> bool {
return this->index_column_data_[offset] ==
this->index_column_data_[random_idx];
};
ASSERT_EQ(final.size(), this->N_);
for (size_t i = 0; i < this->N_; i++) {
if (std::is_floating_point_v<decltype(val)> && i == random_idx) {
continue;
}
ASSERT_EQ(final[i], ref(i))
<< "i: " << i << ", final[i]: " << final[i]
<< ", ref(i): " << ref(i) << ", random_idx: " << random_idx
<< ", value: " << this->index_column_data_[random_idx]
<< ", value: " << this->index_column_data_[i];
}
}
}
REGISTER_TYPED_TEST_CASE_P(BuildInvertedIndexWithSingleSegmentTest,
ReadFromSingleSegmentIndex);
using ElementType = testing::
Types<bool, int8_t, int16_t, int32_t, int64_t, float, double, std::string>;
INSTANTIATE_TYPED_TEST_SUITE_P(Naive,
BuildInvertedIndexWithSingleSegmentTest,
ElementType);

View File

@ -354,7 +354,7 @@ TEST_F(TestChunkSegment, TestCompareExpr) {
data.begin() + i * test_data_count); data.begin() + i * test_data_count);
} }
index->BuildWithRawData(data.size(), data.data()); index->BuildWithRawDataForUT(data.size(), data.data());
segcore::LoadIndexInfo load_index_info; segcore::LoadIndexInfo load_index_info;
load_index_info.index = std::move(index); load_index_info.index = std::move(index);
load_index_info.field_id = fid.get(); load_index_info.field_id = fid.get();

View File

@ -255,7 +255,7 @@ class SealedSegmentRegexQueryTest : public ::testing::Test {
auto index = index::CreateStringIndexSort(); auto index = index::CreateStringIndexSort();
std::vector<uint8_t> buffer(arr.ByteSize()); std::vector<uint8_t> buffer(arr.ByteSize());
ASSERT_TRUE(arr.SerializeToArray(buffer.data(), arr.ByteSize())); ASSERT_TRUE(arr.SerializeToArray(buffer.data(), arr.ByteSize()));
index->BuildWithRawData(arr.ByteSize(), buffer.data()); index->BuildWithRawDataForUT(arr.ByteSize(), buffer.data());
LoadIndexInfo info{ LoadIndexInfo info{
.field_id = schema->get_field_id(FieldName("str")).get(), .field_id = schema->get_field_id(FieldName("str")).get(),
.index = std::move(index), .index = std::move(index),
@ -264,7 +264,7 @@ class SealedSegmentRegexQueryTest : public ::testing::Test {
} }
{ {
auto index = index::CreateScalarIndexSort<int64_t>(); auto index = index::CreateScalarIndexSort<int64_t>();
index->BuildWithRawData(N, raw_int.data()); index->BuildWithRawDataForUT(N, raw_int.data());
LoadIndexInfo info{ LoadIndexInfo info{
.field_id = .field_id =
schema->get_field_id(FieldName("another_int64")).get(), schema->get_field_id(FieldName("another_int64")).get(),
@ -278,7 +278,7 @@ class SealedSegmentRegexQueryTest : public ::testing::Test {
LoadInvertedIndex() { LoadInvertedIndex() {
auto index = auto index =
std::make_unique<index::InvertedIndexTantivy<std::string>>(); std::make_unique<index::InvertedIndexTantivy<std::string>>();
index->BuildWithRawData(N, raw_str.data()); index->BuildWithRawDataForUT(N, raw_str.data());
LoadIndexInfo info{ LoadIndexInfo info{
.field_id = schema->get_field_id(FieldName("str")).get(), .field_id = schema->get_field_id(FieldName("str")).get(),
.index = std::move(index), .index = std::move(index),
@ -295,7 +295,7 @@ class SealedSegmentRegexQueryTest : public ::testing::Test {
auto index = std::make_unique<MockStringIndex>(); auto index = std::make_unique<MockStringIndex>();
std::vector<uint8_t> buffer(arr.ByteSize()); std::vector<uint8_t> buffer(arr.ByteSize());
ASSERT_TRUE(arr.SerializeToArray(buffer.data(), arr.ByteSize())); ASSERT_TRUE(arr.SerializeToArray(buffer.data(), arr.ByteSize()));
index->BuildWithRawData(arr.ByteSize(), buffer.data()); index->BuildWithRawDataForUT(arr.ByteSize(), buffer.data());
LoadIndexInfo info{ LoadIndexInfo info{
.field_id = schema->get_field_id(FieldName("str")).get(), .field_id = schema->get_field_id(FieldName("str")).get(),
.index = std::move(index), .index = std::move(index),

View File

@ -151,8 +151,12 @@ TEST_F(StringIndexMarisaTest, Range) {
TEST_F(StringIndexMarisaTest, Reverse) { TEST_F(StringIndexMarisaTest, Reverse) {
auto index_types = GetIndexTypes<std::string>(); auto index_types = GetIndexTypes<std::string>();
for (const auto& index_type : index_types) { for (const auto& index_type : index_types) {
auto index = milvus::index::IndexFactory::GetInstance() CreateIndexInfo create_index_info{
.CreatePrimitiveScalarIndex<std::string>(index_type); .index_type = index_type,
};
auto index =
milvus::index::IndexFactory::GetInstance()
.CreatePrimitiveScalarIndex<std::string>(create_index_info);
index->Build(nb, strs.data()); index->Build(nb, strs.data());
assert_reverse<std::string>(index.get(), strs); assert_reverse<std::string>(index.get(), strs);
} }
@ -311,7 +315,7 @@ TEST_F(StringIndexMarisaTest, BaseIndexCodec) {
*str_arr.mutable_data() = {strings.begin(), strings.end()}; *str_arr.mutable_data() = {strings.begin(), strings.end()};
std::vector<uint8_t> data(str_arr.ByteSizeLong(), 0); std::vector<uint8_t> data(str_arr.ByteSizeLong(), 0);
str_arr.SerializeToArray(data.data(), str_arr.ByteSizeLong()); str_arr.SerializeToArray(data.data(), str_arr.ByteSizeLong());
index->BuildWithRawData(str_arr.ByteSizeLong(), data.data()); index->BuildWithRawDataForUT(str_arr.ByteSizeLong(), data.data());
std::vector<std::string> invalid_strings = {std::to_string(nb)}; std::vector<std::string> invalid_strings = {std::to_string(nb)};
auto copy_index = milvus::index::CreateStringIndexMarisa(); auto copy_index = milvus::index::CreateStringIndexMarisa();