mirror of https://github.com/milvus-io/milvus.git
fix: enable to build index with single segment (#39233)
fix https://github.com/milvus-io/milvus/issues/39232 --------- Signed-off-by: SpadeA-Tang <tangchenjie1210@gmail.com>pull/39079/head^2
parent
bca2a62b78
commit
8c4ba70a4c
|
@ -45,9 +45,9 @@ class IndexBase {
|
||||||
Load(milvus::tracer::TraceContext ctx, const Config& config = {}) = 0;
|
Load(milvus::tracer::TraceContext ctx, const Config& config = {}) = 0;
|
||||||
|
|
||||||
virtual void
|
virtual void
|
||||||
BuildWithRawData(size_t n,
|
BuildWithRawDataForUT(size_t n,
|
||||||
const void* values,
|
const void* values,
|
||||||
const Config& config = {}) = 0;
|
const Config& config = {}) = 0;
|
||||||
|
|
||||||
virtual void
|
virtual void
|
||||||
BuildWithDataset(const DatasetPtr& dataset, const Config& config = {}) = 0;
|
BuildWithDataset(const DatasetPtr& dataset, const Config& config = {}) = 0;
|
||||||
|
|
|
@ -35,10 +35,14 @@ namespace milvus::index {
|
||||||
template <typename T>
|
template <typename T>
|
||||||
ScalarIndexPtr<T>
|
ScalarIndexPtr<T>
|
||||||
IndexFactory::CreatePrimitiveScalarIndex(
|
IndexFactory::CreatePrimitiveScalarIndex(
|
||||||
const IndexType& index_type,
|
const CreateIndexInfo& create_index_info,
|
||||||
const storage::FileManagerContext& file_manager_context) {
|
const storage::FileManagerContext& file_manager_context) {
|
||||||
|
auto index_type = create_index_info.index_type;
|
||||||
if (index_type == INVERTED_INDEX_TYPE) {
|
if (index_type == INVERTED_INDEX_TYPE) {
|
||||||
return std::make_unique<InvertedIndexTantivy<T>>(file_manager_context);
|
// scalar_index_engine_version 0 means we should built tantivy index within single segment
|
||||||
|
return std::make_unique<InvertedIndexTantivy<T>>(
|
||||||
|
file_manager_context,
|
||||||
|
create_index_info.scalar_index_engine_version == 0);
|
||||||
}
|
}
|
||||||
if (index_type == BITMAP_INDEX_TYPE) {
|
if (index_type == BITMAP_INDEX_TYPE) {
|
||||||
return std::make_unique<BitmapIndex<T>>(file_manager_context);
|
return std::make_unique<BitmapIndex<T>>(file_manager_context);
|
||||||
|
@ -59,12 +63,15 @@ IndexFactory::CreatePrimitiveScalarIndex(
|
||||||
template <>
|
template <>
|
||||||
ScalarIndexPtr<std::string>
|
ScalarIndexPtr<std::string>
|
||||||
IndexFactory::CreatePrimitiveScalarIndex<std::string>(
|
IndexFactory::CreatePrimitiveScalarIndex<std::string>(
|
||||||
const IndexType& index_type,
|
const CreateIndexInfo& create_index_info,
|
||||||
const storage::FileManagerContext& file_manager_context) {
|
const storage::FileManagerContext& file_manager_context) {
|
||||||
|
auto index_type = create_index_info.index_type;
|
||||||
#if defined(__linux__) || defined(__APPLE__)
|
#if defined(__linux__) || defined(__APPLE__)
|
||||||
if (index_type == INVERTED_INDEX_TYPE) {
|
if (index_type == INVERTED_INDEX_TYPE) {
|
||||||
|
// scalar_index_engine_version 0 means we should built tantivy index within single segment
|
||||||
return std::make_unique<InvertedIndexTantivy<std::string>>(
|
return std::make_unique<InvertedIndexTantivy<std::string>>(
|
||||||
file_manager_context);
|
file_manager_context,
|
||||||
|
create_index_info.scalar_index_engine_version == 0);
|
||||||
}
|
}
|
||||||
if (index_type == BITMAP_INDEX_TYPE) {
|
if (index_type == BITMAP_INDEX_TYPE) {
|
||||||
return std::make_unique<BitmapIndex<std::string>>(file_manager_context);
|
return std::make_unique<BitmapIndex<std::string>>(file_manager_context);
|
||||||
|
@ -294,37 +301,37 @@ IndexFactory::CreateIndex(
|
||||||
IndexBasePtr
|
IndexBasePtr
|
||||||
IndexFactory::CreatePrimitiveScalarIndex(
|
IndexFactory::CreatePrimitiveScalarIndex(
|
||||||
DataType data_type,
|
DataType data_type,
|
||||||
IndexType index_type,
|
const CreateIndexInfo& create_index_info,
|
||||||
const storage::FileManagerContext& file_manager_context) {
|
const storage::FileManagerContext& file_manager_context) {
|
||||||
switch (data_type) {
|
switch (data_type) {
|
||||||
// create scalar index
|
// create scalar index
|
||||||
case DataType::BOOL:
|
case DataType::BOOL:
|
||||||
return CreatePrimitiveScalarIndex<bool>(index_type,
|
return CreatePrimitiveScalarIndex<bool>(create_index_info,
|
||||||
file_manager_context);
|
file_manager_context);
|
||||||
case DataType::INT8:
|
case DataType::INT8:
|
||||||
return CreatePrimitiveScalarIndex<int8_t>(index_type,
|
return CreatePrimitiveScalarIndex<int8_t>(create_index_info,
|
||||||
file_manager_context);
|
file_manager_context);
|
||||||
case DataType::INT16:
|
case DataType::INT16:
|
||||||
return CreatePrimitiveScalarIndex<int16_t>(index_type,
|
return CreatePrimitiveScalarIndex<int16_t>(create_index_info,
|
||||||
file_manager_context);
|
file_manager_context);
|
||||||
case DataType::INT32:
|
case DataType::INT32:
|
||||||
return CreatePrimitiveScalarIndex<int32_t>(index_type,
|
return CreatePrimitiveScalarIndex<int32_t>(create_index_info,
|
||||||
file_manager_context);
|
file_manager_context);
|
||||||
case DataType::INT64:
|
case DataType::INT64:
|
||||||
return CreatePrimitiveScalarIndex<int64_t>(index_type,
|
return CreatePrimitiveScalarIndex<int64_t>(create_index_info,
|
||||||
file_manager_context);
|
file_manager_context);
|
||||||
case DataType::FLOAT:
|
case DataType::FLOAT:
|
||||||
return CreatePrimitiveScalarIndex<float>(index_type,
|
return CreatePrimitiveScalarIndex<float>(create_index_info,
|
||||||
file_manager_context);
|
file_manager_context);
|
||||||
case DataType::DOUBLE:
|
case DataType::DOUBLE:
|
||||||
return CreatePrimitiveScalarIndex<double>(index_type,
|
return CreatePrimitiveScalarIndex<double>(create_index_info,
|
||||||
file_manager_context);
|
file_manager_context);
|
||||||
|
|
||||||
// create string index
|
// create string index
|
||||||
case DataType::STRING:
|
case DataType::STRING:
|
||||||
case DataType::VARCHAR:
|
case DataType::VARCHAR:
|
||||||
return CreatePrimitiveScalarIndex<std::string>(
|
return CreatePrimitiveScalarIndex<std::string>(
|
||||||
index_type, file_manager_context);
|
create_index_info, file_manager_context);
|
||||||
default:
|
default:
|
||||||
PanicInfo(
|
PanicInfo(
|
||||||
DataTypeInvalid,
|
DataTypeInvalid,
|
||||||
|
@ -334,14 +341,15 @@ IndexFactory::CreatePrimitiveScalarIndex(
|
||||||
|
|
||||||
IndexBasePtr
|
IndexBasePtr
|
||||||
IndexFactory::CreateCompositeScalarIndex(
|
IndexFactory::CreateCompositeScalarIndex(
|
||||||
IndexType index_type,
|
const CreateIndexInfo& create_index_info,
|
||||||
const storage::FileManagerContext& file_manager_context) {
|
const storage::FileManagerContext& file_manager_context) {
|
||||||
|
auto index_type = create_index_info.index_type;
|
||||||
if (index_type == HYBRID_INDEX_TYPE || index_type == BITMAP_INDEX_TYPE ||
|
if (index_type == HYBRID_INDEX_TYPE || index_type == BITMAP_INDEX_TYPE ||
|
||||||
index_type == INVERTED_INDEX_TYPE) {
|
index_type == INVERTED_INDEX_TYPE) {
|
||||||
auto element_type = static_cast<DataType>(
|
auto element_type = static_cast<DataType>(
|
||||||
file_manager_context.fieldDataMeta.field_schema.element_type());
|
file_manager_context.fieldDataMeta.field_schema.element_type());
|
||||||
return CreatePrimitiveScalarIndex(
|
return CreatePrimitiveScalarIndex(
|
||||||
element_type, index_type, file_manager_context);
|
element_type, create_index_info, file_manager_context);
|
||||||
} else {
|
} else {
|
||||||
PanicInfo(
|
PanicInfo(
|
||||||
Unsupported,
|
Unsupported,
|
||||||
|
@ -373,9 +381,9 @@ IndexFactory::CreateScalarIndex(
|
||||||
case DataType::VARCHAR:
|
case DataType::VARCHAR:
|
||||||
case DataType::STRING:
|
case DataType::STRING:
|
||||||
return CreatePrimitiveScalarIndex(
|
return CreatePrimitiveScalarIndex(
|
||||||
data_type, create_index_info.index_type, file_manager_context);
|
data_type, create_index_info, file_manager_context);
|
||||||
case DataType::ARRAY: {
|
case DataType::ARRAY: {
|
||||||
return CreateCompositeScalarIndex(create_index_info.index_type,
|
return CreateCompositeScalarIndex(create_index_info,
|
||||||
file_manager_context);
|
file_manager_context);
|
||||||
}
|
}
|
||||||
case DataType::JSON: {
|
case DataType::JSON: {
|
||||||
|
|
|
@ -85,14 +85,14 @@ class IndexFactory {
|
||||||
IndexBasePtr
|
IndexBasePtr
|
||||||
CreatePrimitiveScalarIndex(
|
CreatePrimitiveScalarIndex(
|
||||||
DataType data_type,
|
DataType data_type,
|
||||||
IndexType index_type,
|
const CreateIndexInfo& create_index_info,
|
||||||
const storage::FileManagerContext& file_manager_context =
|
const storage::FileManagerContext& file_manager_context =
|
||||||
storage::FileManagerContext());
|
storage::FileManagerContext());
|
||||||
|
|
||||||
// For types like array, struct, union, etc
|
// For types like array, struct, union, etc
|
||||||
IndexBasePtr
|
IndexBasePtr
|
||||||
CreateCompositeScalarIndex(
|
CreateCompositeScalarIndex(
|
||||||
IndexType index_type,
|
const CreateIndexInfo& create_index_info,
|
||||||
const storage::FileManagerContext& file_manager_context =
|
const storage::FileManagerContext& file_manager_context =
|
||||||
storage::FileManagerContext());
|
storage::FileManagerContext());
|
||||||
|
|
||||||
|
@ -115,7 +115,7 @@ class IndexFactory {
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
ScalarIndexPtr<T>
|
ScalarIndexPtr<T>
|
||||||
CreatePrimitiveScalarIndex(const IndexType& index_type,
|
CreatePrimitiveScalarIndex(const CreateIndexInfo& create_index_info,
|
||||||
const storage::FileManagerContext& file_manager =
|
const storage::FileManagerContext& file_manager =
|
||||||
storage::FileManagerContext());
|
storage::FileManagerContext());
|
||||||
};
|
};
|
||||||
|
|
|
@ -26,6 +26,7 @@ struct CreateIndexInfo {
|
||||||
IndexVersion index_engine_version;
|
IndexVersion index_engine_version;
|
||||||
std::string field_name;
|
std::string field_name;
|
||||||
int64_t dim;
|
int64_t dim;
|
||||||
|
int32_t scalar_index_engine_version;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace milvus::index
|
} // namespace milvus::index
|
||||||
|
|
|
@ -84,14 +84,15 @@ InvertedIndexTantivy<T>::InitForBuildIndex() {
|
||||||
path_);
|
path_);
|
||||||
}
|
}
|
||||||
wrapper_ = std::make_shared<TantivyIndexWrapper>(
|
wrapper_ = std::make_shared<TantivyIndexWrapper>(
|
||||||
field.c_str(), d_type_, path_.c_str());
|
field.c_str(), d_type_, path_.c_str(), inverted_index_single_segment_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
InvertedIndexTantivy<T>::InvertedIndexTantivy(
|
InvertedIndexTantivy<T>::InvertedIndexTantivy(
|
||||||
const storage::FileManagerContext& ctx)
|
const storage::FileManagerContext& ctx, bool inverted_index_single_segment)
|
||||||
: ScalarIndex<T>(INVERTED_INDEX_TYPE),
|
: ScalarIndex<T>(INVERTED_INDEX_TYPE),
|
||||||
schema_(ctx.fieldDataMeta.field_schema) {
|
schema_(ctx.fieldDataMeta.field_schema),
|
||||||
|
inverted_index_single_segment_(inverted_index_single_segment) {
|
||||||
mem_file_manager_ = std::make_shared<MemFileManager>(ctx);
|
mem_file_manager_ = std::make_shared<MemFileManager>(ctx);
|
||||||
disk_file_manager_ = std::make_shared<DiskFileManager>(ctx);
|
disk_file_manager_ = std::make_shared<DiskFileManager>(ctx);
|
||||||
// push init wrapper to load process
|
// push init wrapper to load process
|
||||||
|
@ -387,9 +388,9 @@ InvertedIndexTantivy<T>::RegexQuery(const std::string& regex_pattern) {
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void
|
void
|
||||||
InvertedIndexTantivy<T>::BuildWithRawData(size_t n,
|
InvertedIndexTantivy<T>::BuildWithRawDataForUT(size_t n,
|
||||||
const void* values,
|
const void* values,
|
||||||
const Config& config) {
|
const Config& config) {
|
||||||
if constexpr (std::is_same_v<bool, T>) {
|
if constexpr (std::is_same_v<bool, T>) {
|
||||||
schema_.set_data_type(proto::schema::DataType::Bool);
|
schema_.set_data_type(proto::schema::DataType::Bool);
|
||||||
}
|
}
|
||||||
|
@ -421,16 +422,35 @@ InvertedIndexTantivy<T>::BuildWithRawData(size_t n,
|
||||||
boost::filesystem::create_directories(path_);
|
boost::filesystem::create_directories(path_);
|
||||||
d_type_ = get_tantivy_data_type(schema_);
|
d_type_ = get_tantivy_data_type(schema_);
|
||||||
std::string field = "test_inverted_index";
|
std::string field = "test_inverted_index";
|
||||||
|
inverted_index_single_segment_ =
|
||||||
|
GetValueFromConfig<int32_t>(config,
|
||||||
|
milvus::index::SCALAR_INDEX_ENGINE_VERSION)
|
||||||
|
.value_or(1) == 0;
|
||||||
wrapper_ = std::make_shared<TantivyIndexWrapper>(
|
wrapper_ = std::make_shared<TantivyIndexWrapper>(
|
||||||
field.c_str(), d_type_, path_.c_str());
|
field.c_str(), d_type_, path_.c_str(), inverted_index_single_segment_);
|
||||||
if (config.find("is_array") != config.end()) {
|
if (!inverted_index_single_segment_) {
|
||||||
// only used in ut.
|
if (config.find("is_array") != config.end()) {
|
||||||
auto arr = static_cast<const boost::container::vector<T>*>(values);
|
// only used in ut.
|
||||||
for (size_t i = 0; i < n; i++) {
|
auto arr = static_cast<const boost::container::vector<T>*>(values);
|
||||||
wrapper_->template add_multi_data(arr[i].data(), arr[i].size(), i);
|
for (size_t i = 0; i < n; i++) {
|
||||||
|
wrapper_->template add_multi_data(
|
||||||
|
arr[i].data(), arr[i].size(), i);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
wrapper_->add_data<T>(static_cast<const T*>(values), n, 0);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
wrapper_->add_data<T>(static_cast<const T*>(values), n, 0);
|
if (config.find("is_array") != config.end()) {
|
||||||
|
// only used in ut.
|
||||||
|
auto arr = static_cast<const boost::container::vector<T>*>(values);
|
||||||
|
for (size_t i = 0; i < n; i++) {
|
||||||
|
wrapper_->template add_multi_data_by_single_segment_writer(
|
||||||
|
arr[i].data(), arr[i].size());
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
wrapper_->add_data_by_single_segment_writer<T>(
|
||||||
|
static_cast<const T*>(values), n);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
wrapper_->create_reader();
|
wrapper_->create_reader();
|
||||||
finish();
|
finish();
|
||||||
|
@ -458,26 +478,48 @@ InvertedIndexTantivy<T>::BuildWithFieldData(
|
||||||
case proto::schema::DataType::Double:
|
case proto::schema::DataType::Double:
|
||||||
case proto::schema::DataType::String:
|
case proto::schema::DataType::String:
|
||||||
case proto::schema::DataType::VarChar: {
|
case proto::schema::DataType::VarChar: {
|
||||||
int64_t offset = 0;
|
// Generally, we will not build inverted index with single segment except for building index
|
||||||
if (schema_.nullable()) {
|
// for query node with older version(2.4). See more comments above `inverted_index_single_segment_`.
|
||||||
for (const auto& data : field_datas) {
|
if (!inverted_index_single_segment_) {
|
||||||
auto n = data->get_num_rows();
|
int64_t offset = 0;
|
||||||
for (int i = 0; i < n; i++) {
|
if (schema_.nullable()) {
|
||||||
if (!data->is_valid(i)) {
|
for (const auto& data : field_datas) {
|
||||||
null_offset.push_back(i);
|
auto n = data->get_num_rows();
|
||||||
|
for (int i = 0; i < n; i++) {
|
||||||
|
if (!data->is_valid(i)) {
|
||||||
|
null_offset.push_back(i);
|
||||||
|
}
|
||||||
|
wrapper_->add_multi_data<T>(
|
||||||
|
static_cast<const T*>(data->RawValue(i)),
|
||||||
|
data->is_valid(i),
|
||||||
|
offset++);
|
||||||
}
|
}
|
||||||
wrapper_->add_multi_data<T>(
|
}
|
||||||
static_cast<const T*>(data->RawValue(i)),
|
} else {
|
||||||
data->is_valid(i),
|
for (const auto& data : field_datas) {
|
||||||
offset++);
|
auto n = data->get_num_rows();
|
||||||
|
wrapper_->add_data<T>(
|
||||||
|
static_cast<const T*>(data->Data()), n, offset);
|
||||||
|
offset += n;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (const auto& data : field_datas) {
|
for (const auto& data : field_datas) {
|
||||||
auto n = data->get_num_rows();
|
auto n = data->get_num_rows();
|
||||||
wrapper_->add_data<T>(
|
if (schema_.nullable()) {
|
||||||
static_cast<const T*>(data->Data()), n, offset);
|
for (int i = 0; i < n; i++) {
|
||||||
offset += n;
|
if (!data->is_valid(i)) {
|
||||||
|
null_offset.push_back(i);
|
||||||
|
}
|
||||||
|
wrapper_
|
||||||
|
->add_multi_data_by_single_segment_writer<T>(
|
||||||
|
static_cast<const T*>(data->RawValue(i)),
|
||||||
|
data->is_valid(i));
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
wrapper_->add_data_by_single_segment_writer<T>(
|
||||||
|
static_cast<const T*>(data->Data()), n);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -508,10 +550,15 @@ InvertedIndexTantivy<T>::build_index_for_array(
|
||||||
null_offset.push_back(i);
|
null_offset.push_back(i);
|
||||||
}
|
}
|
||||||
auto length = data->is_valid(i) ? array_column[i].length() : 0;
|
auto length = data->is_valid(i) ? array_column[i].length() : 0;
|
||||||
wrapper_->template add_multi_data(
|
if (!inverted_index_single_segment_) {
|
||||||
reinterpret_cast<const T*>(array_column[i].data()),
|
wrapper_->template add_multi_data(
|
||||||
length,
|
reinterpret_cast<const T*>(array_column[i].data()),
|
||||||
offset++);
|
length,
|
||||||
|
offset++);
|
||||||
|
} else {
|
||||||
|
wrapper_->template add_multi_data_by_single_segment_writer(
|
||||||
|
reinterpret_cast<const T*>(array_column[i].data()), length);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -537,7 +584,13 @@ InvertedIndexTantivy<std::string>::build_index_for_array(
|
||||||
array_column[i].template get_data<std::string>(j));
|
array_column[i].template get_data<std::string>(j));
|
||||||
}
|
}
|
||||||
auto length = data->is_valid(i) ? output.size() : 0;
|
auto length = data->is_valid(i) ? output.size() : 0;
|
||||||
wrapper_->template add_multi_data(output.data(), length, offset++);
|
if (!inverted_index_single_segment_) {
|
||||||
|
wrapper_->template add_multi_data(
|
||||||
|
output.data(), length, offset++);
|
||||||
|
} else {
|
||||||
|
wrapper_->template add_multi_data_by_single_segment_writer(
|
||||||
|
output.data(), length);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -38,7 +38,8 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
|
||||||
InvertedIndexTantivy() : ScalarIndex<T>(INVERTED_INDEX_TYPE) {
|
InvertedIndexTantivy() : ScalarIndex<T>(INVERTED_INDEX_TYPE) {
|
||||||
}
|
}
|
||||||
|
|
||||||
explicit InvertedIndexTantivy(const storage::FileManagerContext& ctx);
|
explicit InvertedIndexTantivy(const storage::FileManagerContext& ctx,
|
||||||
|
bool inverted_index_single_segment = false);
|
||||||
|
|
||||||
~InvertedIndexTantivy();
|
~InvertedIndexTantivy();
|
||||||
|
|
||||||
|
@ -80,11 +81,11 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
|
||||||
return wrapper_->count();
|
return wrapper_->count();
|
||||||
}
|
}
|
||||||
|
|
||||||
// BuildWithRawData should be only used in ut. Only string is supported.
|
// BuildWithRawDataForUT should be only used in ut. Only string is supported.
|
||||||
void
|
void
|
||||||
BuildWithRawData(size_t n,
|
BuildWithRawDataForUT(size_t n,
|
||||||
const void* values,
|
const void* values,
|
||||||
const Config& config = {}) override;
|
const Config& config = {}) override;
|
||||||
|
|
||||||
BinarySet
|
BinarySet
|
||||||
Serialize(const Config& config) override;
|
Serialize(const Config& config) override;
|
||||||
|
@ -205,5 +206,14 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
|
||||||
// all data need to be built to align the offset
|
// all data need to be built to align the offset
|
||||||
// so need to store null_offset in inverted index additionally
|
// so need to store null_offset in inverted index additionally
|
||||||
std::vector<size_t> null_offset{};
|
std::vector<size_t> null_offset{};
|
||||||
|
|
||||||
|
// `inverted_index_single_segment_` is used to control whether to build tantivy index with single segment.
|
||||||
|
//
|
||||||
|
// In the older version of milvus, the query node can only read tantivy index built whtin single segment
|
||||||
|
// where the newer version builds and reads index of multi segments by default.
|
||||||
|
// However, the index may be built from a separate node from the query node where the index buliding node is a
|
||||||
|
// new version while the query node is a older version. So we have this `inverted_index_single_segment_` to control the index
|
||||||
|
// building node to build specific type of tantivy index.
|
||||||
|
bool inverted_index_single_segment_{false};
|
||||||
};
|
};
|
||||||
} // namespace milvus::index
|
} // namespace milvus::index
|
||||||
|
|
|
@ -46,6 +46,8 @@ constexpr const char* MARISA_TRIE_UPPER = "TRIE";
|
||||||
constexpr const char* INVERTED_INDEX_TYPE = "INVERTED";
|
constexpr const char* INVERTED_INDEX_TYPE = "INVERTED";
|
||||||
constexpr const char* BITMAP_INDEX_TYPE = "BITMAP";
|
constexpr const char* BITMAP_INDEX_TYPE = "BITMAP";
|
||||||
constexpr const char* HYBRID_INDEX_TYPE = "HYBRID";
|
constexpr const char* HYBRID_INDEX_TYPE = "HYBRID";
|
||||||
|
constexpr const char* SCALAR_INDEX_ENGINE_VERSION =
|
||||||
|
"scalar_index_engine_version";
|
||||||
|
|
||||||
// index meta
|
// index meta
|
||||||
constexpr const char* COLLECTION_ID = "collection_id";
|
constexpr const char* COLLECTION_ID = "collection_id";
|
||||||
|
|
|
@ -72,9 +72,9 @@ ScalarIndex<T>::Query(const DatasetPtr& dataset) {
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
void
|
void
|
||||||
ScalarIndex<std::string>::BuildWithRawData(size_t n,
|
ScalarIndex<std::string>::BuildWithRawDataForUT(size_t n,
|
||||||
const void* values,
|
const void* values,
|
||||||
const Config& config) {
|
const Config& config) {
|
||||||
proto::schema::StringArray arr;
|
proto::schema::StringArray arr;
|
||||||
auto ok = arr.ParseFromArray(values, n);
|
auto ok = arr.ParseFromArray(values, n);
|
||||||
Assert(ok);
|
Assert(ok);
|
||||||
|
@ -86,9 +86,9 @@ ScalarIndex<std::string>::BuildWithRawData(size_t n,
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
void
|
void
|
||||||
ScalarIndex<bool>::BuildWithRawData(size_t n,
|
ScalarIndex<bool>::BuildWithRawDataForUT(size_t n,
|
||||||
const void* values,
|
const void* values,
|
||||||
const Config& config) {
|
const Config& config) {
|
||||||
proto::schema::BoolArray arr;
|
proto::schema::BoolArray arr;
|
||||||
auto ok = arr.ParseFromArray(values, n);
|
auto ok = arr.ParseFromArray(values, n);
|
||||||
Assert(ok);
|
Assert(ok);
|
||||||
|
@ -97,54 +97,54 @@ ScalarIndex<bool>::BuildWithRawData(size_t n,
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
void
|
void
|
||||||
ScalarIndex<int8_t>::BuildWithRawData(size_t n,
|
ScalarIndex<int8_t>::BuildWithRawDataForUT(size_t n,
|
||||||
const void* values,
|
const void* values,
|
||||||
const Config& config) {
|
const Config& config) {
|
||||||
auto data = reinterpret_cast<int8_t*>(const_cast<void*>(values));
|
auto data = reinterpret_cast<int8_t*>(const_cast<void*>(values));
|
||||||
Build(n, data);
|
Build(n, data);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
void
|
void
|
||||||
ScalarIndex<int16_t>::BuildWithRawData(size_t n,
|
ScalarIndex<int16_t>::BuildWithRawDataForUT(size_t n,
|
||||||
const void* values,
|
const void* values,
|
||||||
const Config& config) {
|
const Config& config) {
|
||||||
auto data = reinterpret_cast<int16_t*>(const_cast<void*>(values));
|
auto data = reinterpret_cast<int16_t*>(const_cast<void*>(values));
|
||||||
Build(n, data);
|
Build(n, data);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
void
|
void
|
||||||
ScalarIndex<int32_t>::BuildWithRawData(size_t n,
|
ScalarIndex<int32_t>::BuildWithRawDataForUT(size_t n,
|
||||||
const void* values,
|
const void* values,
|
||||||
const Config& config) {
|
const Config& config) {
|
||||||
auto data = reinterpret_cast<int32_t*>(const_cast<void*>(values));
|
auto data = reinterpret_cast<int32_t*>(const_cast<void*>(values));
|
||||||
Build(n, data);
|
Build(n, data);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
void
|
void
|
||||||
ScalarIndex<int64_t>::BuildWithRawData(size_t n,
|
ScalarIndex<int64_t>::BuildWithRawDataForUT(size_t n,
|
||||||
const void* values,
|
const void* values,
|
||||||
const Config& config) {
|
const Config& config) {
|
||||||
auto data = reinterpret_cast<int64_t*>(const_cast<void*>(values));
|
auto data = reinterpret_cast<int64_t*>(const_cast<void*>(values));
|
||||||
Build(n, data);
|
Build(n, data);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
void
|
void
|
||||||
ScalarIndex<float>::BuildWithRawData(size_t n,
|
ScalarIndex<float>::BuildWithRawDataForUT(size_t n,
|
||||||
const void* values,
|
const void* values,
|
||||||
const Config& config) {
|
const Config& config) {
|
||||||
auto data = reinterpret_cast<float*>(const_cast<void*>(values));
|
auto data = reinterpret_cast<float*>(const_cast<void*>(values));
|
||||||
Build(n, data);
|
Build(n, data);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
void
|
void
|
||||||
ScalarIndex<double>::BuildWithRawData(size_t n,
|
ScalarIndex<double>::BuildWithRawDataForUT(size_t n,
|
||||||
const void* values,
|
const void* values,
|
||||||
const Config& config) {
|
const Config& config) {
|
||||||
auto data = reinterpret_cast<double*>(const_cast<void*>(values));
|
auto data = reinterpret_cast<double*>(const_cast<void*>(values));
|
||||||
Build(n, data);
|
Build(n, data);
|
||||||
}
|
}
|
||||||
|
|
|
@ -65,9 +65,9 @@ class ScalarIndex : public IndexBase {
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
BuildWithRawData(size_t n,
|
BuildWithRawDataForUT(size_t n,
|
||||||
const void* values,
|
const void* values,
|
||||||
const Config& config = {}) override;
|
const Config& config = {}) override;
|
||||||
|
|
||||||
void
|
void
|
||||||
BuildWithDataset(const DatasetPtr& dataset,
|
BuildWithDataset(const DatasetPtr& dataset,
|
||||||
|
|
|
@ -42,9 +42,9 @@ class VectorIndex : public IndexBase {
|
||||||
|
|
||||||
public:
|
public:
|
||||||
void
|
void
|
||||||
BuildWithRawData(size_t n,
|
BuildWithRawDataForUT(size_t n,
|
||||||
const void* values,
|
const void* values,
|
||||||
const Config& config = {}) override {
|
const Config& config = {}) override {
|
||||||
PanicInfo(Unsupported,
|
PanicInfo(Unsupported,
|
||||||
"vector index don't support build index with raw data");
|
"vector index don't support build index with raw data");
|
||||||
};
|
};
|
||||||
|
|
|
@ -30,6 +30,13 @@ ScalarIndexCreator::ScalarIndexCreator(
|
||||||
if (config.contains("index_type")) {
|
if (config.contains("index_type")) {
|
||||||
index_type_ = config.at("index_type").get<std::string>();
|
index_type_ = config.at("index_type").get<std::string>();
|
||||||
}
|
}
|
||||||
|
// Config should have value for milvus::index::SCALAR_INDEX_ENGINE_VERSION for production calling chain.
|
||||||
|
// Use value_or(1) for unit test without setting this value
|
||||||
|
index_info.scalar_index_engine_version =
|
||||||
|
milvus::index::GetValueFromConfig<int32_t>(
|
||||||
|
config, milvus::index::SCALAR_INDEX_ENGINE_VERSION)
|
||||||
|
.value_or(1);
|
||||||
|
|
||||||
index_info.field_type = dtype_;
|
index_info.field_type = dtype_;
|
||||||
index_info.index_type = index_type();
|
index_info.index_type = index_type();
|
||||||
index_ = index::IndexFactory::GetInstance().CreateIndex(
|
index_ = index::IndexFactory::GetInstance().CreateIndex(
|
||||||
|
@ -40,7 +47,7 @@ void
|
||||||
ScalarIndexCreator::Build(const milvus::DatasetPtr& dataset) {
|
ScalarIndexCreator::Build(const milvus::DatasetPtr& dataset) {
|
||||||
auto size = dataset->GetRows();
|
auto size = dataset->GetRows();
|
||||||
auto data = dataset->GetTensor();
|
auto data = dataset->GetTensor();
|
||||||
index_->BuildWithRawData(size, data);
|
index_->BuildWithRawDataForUT(size, data);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
|
|
@ -166,30 +166,17 @@ CreateIndex(CIndex* res_index,
|
||||||
auto field_type =
|
auto field_type =
|
||||||
static_cast<DataType>(build_index_info->field_schema().data_type());
|
static_cast<DataType>(build_index_info->field_schema().data_type());
|
||||||
|
|
||||||
milvus::index::CreateIndexInfo index_info;
|
|
||||||
index_info.field_type = field_type;
|
|
||||||
|
|
||||||
auto storage_config =
|
auto storage_config =
|
||||||
get_storage_config(build_index_info->storage_config());
|
get_storage_config(build_index_info->storage_config());
|
||||||
auto config = get_config(build_index_info);
|
auto config = get_config(build_index_info);
|
||||||
// get index type
|
|
||||||
auto index_type = milvus::index::GetValueFromConfig<std::string>(
|
|
||||||
config, "index_type");
|
|
||||||
AssertInfo(index_type.has_value(), "index type is empty");
|
|
||||||
index_info.index_type = index_type.value();
|
|
||||||
|
|
||||||
auto engine_version = build_index_info->current_index_version();
|
auto engine_version = build_index_info->current_index_version();
|
||||||
index_info.index_engine_version = engine_version;
|
|
||||||
config[milvus::index::INDEX_ENGINE_VERSION] =
|
config[milvus::index::INDEX_ENGINE_VERSION] =
|
||||||
std::to_string(engine_version);
|
std::to_string(engine_version);
|
||||||
|
auto scalar_index_engine_version =
|
||||||
// get metric type
|
build_index_info->current_scalar_index_version();
|
||||||
if (milvus::IsVectorDataType(field_type)) {
|
config[milvus::index::SCALAR_INDEX_ENGINE_VERSION] =
|
||||||
auto metric_type = milvus::index::GetValueFromConfig<std::string>(
|
scalar_index_engine_version;
|
||||||
config, "metric_type");
|
|
||||||
AssertInfo(metric_type.has_value(), "metric type is empty");
|
|
||||||
index_info.metric_type = metric_type.value();
|
|
||||||
}
|
|
||||||
|
|
||||||
// init file manager
|
// init file manager
|
||||||
milvus::storage::FieldDataMeta field_meta{
|
milvus::storage::FieldDataMeta field_meta{
|
||||||
|
|
|
@ -1193,6 +1193,7 @@ name = "tantivy-binding"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cbindgen",
|
"cbindgen",
|
||||||
|
"either",
|
||||||
"env_logger",
|
"env_logger",
|
||||||
"futures",
|
"futures",
|
||||||
"jieba-rs",
|
"jieba-rs",
|
||||||
|
|
|
@ -17,6 +17,7 @@ lazy_static = "1.4.0"
|
||||||
serde_json = "1.0.128"
|
serde_json = "1.0.128"
|
||||||
jieba-rs = "0.6.8"
|
jieba-rs = "0.6.8"
|
||||||
regex = "1.11.1"
|
regex = "1.11.1"
|
||||||
|
either = "1.13.0"
|
||||||
|
|
||||||
[build-dependencies]
|
[build-dependencies]
|
||||||
cbindgen = "0.26.0"
|
cbindgen = "0.26.0"
|
||||||
|
|
|
@ -149,6 +149,10 @@ RustResult tantivy_create_index(const char *field_name,
|
||||||
uintptr_t num_threads,
|
uintptr_t num_threads,
|
||||||
uintptr_t overall_memory_budget_in_bytes);
|
uintptr_t overall_memory_budget_in_bytes);
|
||||||
|
|
||||||
|
RustResult tantivy_create_index_with_single_segment(const char *field_name,
|
||||||
|
TantivyDataType data_type,
|
||||||
|
const char *path);
|
||||||
|
|
||||||
void tantivy_free_index_writer(void *ptr);
|
void tantivy_free_index_writer(void *ptr);
|
||||||
|
|
||||||
RustResult tantivy_finish_index(void *ptr);
|
RustResult tantivy_finish_index(void *ptr);
|
||||||
|
@ -162,78 +166,140 @@ RustResult tantivy_index_add_int8s(void *ptr,
|
||||||
uintptr_t len,
|
uintptr_t len,
|
||||||
int64_t offset_begin);
|
int64_t offset_begin);
|
||||||
|
|
||||||
|
RustResult tantivy_index_add_int8s_by_single_segment_writer(void *ptr,
|
||||||
|
const int8_t *array,
|
||||||
|
uintptr_t len);
|
||||||
|
|
||||||
RustResult tantivy_index_add_int16s(void *ptr,
|
RustResult tantivy_index_add_int16s(void *ptr,
|
||||||
const int16_t *array,
|
const int16_t *array,
|
||||||
uintptr_t len,
|
uintptr_t len,
|
||||||
int64_t offset_begin);
|
int64_t offset_begin);
|
||||||
|
|
||||||
|
RustResult tantivy_index_add_int16s_by_single_segment_writer(void *ptr,
|
||||||
|
const int16_t *array,
|
||||||
|
uintptr_t len);
|
||||||
|
|
||||||
RustResult tantivy_index_add_int32s(void *ptr,
|
RustResult tantivy_index_add_int32s(void *ptr,
|
||||||
const int32_t *array,
|
const int32_t *array,
|
||||||
uintptr_t len,
|
uintptr_t len,
|
||||||
int64_t offset_begin);
|
int64_t offset_begin);
|
||||||
|
|
||||||
|
RustResult tantivy_index_add_int32s_by_single_segment_writer(void *ptr,
|
||||||
|
const int32_t *array,
|
||||||
|
uintptr_t len);
|
||||||
|
|
||||||
RustResult tantivy_index_add_int64s(void *ptr,
|
RustResult tantivy_index_add_int64s(void *ptr,
|
||||||
const int64_t *array,
|
const int64_t *array,
|
||||||
uintptr_t len,
|
uintptr_t len,
|
||||||
int64_t offset_begin);
|
int64_t offset_begin);
|
||||||
|
|
||||||
|
RustResult tantivy_index_add_int64s_by_single_segment_writer(void *ptr,
|
||||||
|
const int64_t *array,
|
||||||
|
uintptr_t len);
|
||||||
|
|
||||||
RustResult tantivy_index_add_f32s(void *ptr,
|
RustResult tantivy_index_add_f32s(void *ptr,
|
||||||
const float *array,
|
const float *array,
|
||||||
uintptr_t len,
|
uintptr_t len,
|
||||||
int64_t offset_begin);
|
int64_t offset_begin);
|
||||||
|
|
||||||
|
RustResult tantivy_index_add_f32s_by_single_segment_writer(void *ptr,
|
||||||
|
const float *array,
|
||||||
|
uintptr_t len);
|
||||||
|
|
||||||
RustResult tantivy_index_add_f64s(void *ptr,
|
RustResult tantivy_index_add_f64s(void *ptr,
|
||||||
const double *array,
|
const double *array,
|
||||||
uintptr_t len,
|
uintptr_t len,
|
||||||
int64_t offset_begin);
|
int64_t offset_begin);
|
||||||
|
|
||||||
|
RustResult tantivy_index_add_f64s_by_single_segment_writer(void *ptr,
|
||||||
|
const double *array,
|
||||||
|
uintptr_t len);
|
||||||
|
|
||||||
RustResult tantivy_index_add_bools(void *ptr,
|
RustResult tantivy_index_add_bools(void *ptr,
|
||||||
const bool *array,
|
const bool *array,
|
||||||
uintptr_t len,
|
uintptr_t len,
|
||||||
int64_t offset_begin);
|
int64_t offset_begin);
|
||||||
|
|
||||||
|
RustResult tantivy_index_add_bools_by_single_segment_writer(void *ptr,
|
||||||
|
const bool *array,
|
||||||
|
uintptr_t len);
|
||||||
|
|
||||||
RustResult tantivy_index_add_string(void *ptr, const char *s, int64_t offset);
|
RustResult tantivy_index_add_string(void *ptr, const char *s, int64_t offset);
|
||||||
|
|
||||||
|
RustResult tantivy_index_add_string_by_single_segment_writer(void *ptr, const char *s);
|
||||||
|
|
||||||
RustResult tantivy_index_add_multi_int8s(void *ptr,
|
RustResult tantivy_index_add_multi_int8s(void *ptr,
|
||||||
const int8_t *array,
|
const int8_t *array,
|
||||||
uintptr_t len,
|
uintptr_t len,
|
||||||
int64_t offset);
|
int64_t offset);
|
||||||
|
|
||||||
|
RustResult tantivy_index_add_multi_int8s_by_single_segment_writer(void *ptr,
|
||||||
|
const int8_t *array,
|
||||||
|
uintptr_t len);
|
||||||
|
|
||||||
RustResult tantivy_index_add_multi_int16s(void *ptr,
|
RustResult tantivy_index_add_multi_int16s(void *ptr,
|
||||||
const int16_t *array,
|
const int16_t *array,
|
||||||
uintptr_t len,
|
uintptr_t len,
|
||||||
int64_t offset);
|
int64_t offset);
|
||||||
|
|
||||||
|
RustResult tantivy_index_add_multi_int16s_by_single_segment_writer(void *ptr,
|
||||||
|
const int16_t *array,
|
||||||
|
uintptr_t len);
|
||||||
|
|
||||||
RustResult tantivy_index_add_multi_int32s(void *ptr,
|
RustResult tantivy_index_add_multi_int32s(void *ptr,
|
||||||
const int32_t *array,
|
const int32_t *array,
|
||||||
uintptr_t len,
|
uintptr_t len,
|
||||||
int64_t offset);
|
int64_t offset);
|
||||||
|
|
||||||
|
RustResult tantivy_index_add_multi_int32s_by_single_segment_writer(void *ptr,
|
||||||
|
const int32_t *array,
|
||||||
|
uintptr_t len);
|
||||||
|
|
||||||
RustResult tantivy_index_add_multi_int64s(void *ptr,
|
RustResult tantivy_index_add_multi_int64s(void *ptr,
|
||||||
const int64_t *array,
|
const int64_t *array,
|
||||||
uintptr_t len,
|
uintptr_t len,
|
||||||
int64_t offset);
|
int64_t offset);
|
||||||
|
|
||||||
|
RustResult tantivy_index_add_multi_int64s_by_single_segment_writer(void *ptr,
|
||||||
|
const int64_t *array,
|
||||||
|
uintptr_t len);
|
||||||
|
|
||||||
RustResult tantivy_index_add_multi_f32s(void *ptr,
|
RustResult tantivy_index_add_multi_f32s(void *ptr,
|
||||||
const float *array,
|
const float *array,
|
||||||
uintptr_t len,
|
uintptr_t len,
|
||||||
int64_t offset);
|
int64_t offset);
|
||||||
|
|
||||||
|
RustResult tantivy_index_add_multi_f32s_by_single_segment_writer(void *ptr,
|
||||||
|
const float *array,
|
||||||
|
uintptr_t len);
|
||||||
|
|
||||||
RustResult tantivy_index_add_multi_f64s(void *ptr,
|
RustResult tantivy_index_add_multi_f64s(void *ptr,
|
||||||
const double *array,
|
const double *array,
|
||||||
uintptr_t len,
|
uintptr_t len,
|
||||||
int64_t offset);
|
int64_t offset);
|
||||||
|
|
||||||
|
RustResult tantivy_index_add_multi_f64s_by_single_segment_writer(void *ptr,
|
||||||
|
const double *array,
|
||||||
|
uintptr_t len);
|
||||||
|
|
||||||
RustResult tantivy_index_add_multi_bools(void *ptr,
|
RustResult tantivy_index_add_multi_bools(void *ptr,
|
||||||
const bool *array,
|
const bool *array,
|
||||||
uintptr_t len,
|
uintptr_t len,
|
||||||
int64_t offset);
|
int64_t offset);
|
||||||
|
|
||||||
|
RustResult tantivy_index_add_multi_bools_by_single_segment_writer(void *ptr,
|
||||||
|
const bool *array,
|
||||||
|
uintptr_t len);
|
||||||
|
|
||||||
RustResult tantivy_index_add_multi_keywords(void *ptr,
|
RustResult tantivy_index_add_multi_keywords(void *ptr,
|
||||||
const char *const *array,
|
const char *const *array,
|
||||||
uintptr_t len,
|
uintptr_t len,
|
||||||
int64_t offset);
|
int64_t offset);
|
||||||
|
|
||||||
|
RustResult tantivy_index_add_multi_keywords_by_single_segment_writer(void *ptr,
|
||||||
|
const char *const *array,
|
||||||
|
uintptr_t len);
|
||||||
|
|
||||||
RustResult tantivy_create_text_writer(const char *field_name,
|
RustResult tantivy_create_text_writer(const char *field_name,
|
||||||
const char *path,
|
const char *path,
|
||||||
const char *tokenizer_name,
|
const char *tokenizer_name,
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
use std::default;
|
|
||||||
use std::ffi::c_void;
|
use std::ffi::c_void;
|
||||||
use std::ptr::null;
|
use std::ptr::null;
|
||||||
|
|
||||||
|
@ -132,19 +131,15 @@ pub extern "C" fn free_rust_result(result: RustResult) {
|
||||||
}
|
}
|
||||||
_ => {}
|
_ => {}
|
||||||
}
|
}
|
||||||
unsafe {
|
if !result.error.is_null() {
|
||||||
if !result.error.is_null() {
|
free_rust_string(result.error as *mut c_char);
|
||||||
free_rust_string(result.error as *mut c_char);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn free_rust_error(error: *const c_char) {
|
pub extern "C" fn free_rust_error(error: *const c_char) {
|
||||||
unsafe {
|
if !error.is_null() {
|
||||||
if !error.is_null() {
|
free_rust_string(error as *mut c_char);
|
||||||
free_rust_string(error as *mut c_char);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
#[repr(u8)]
|
#[repr(u8)]
|
||||||
|
#[derive(Debug)]
|
||||||
pub enum TantivyDataType {
|
pub enum TantivyDataType {
|
||||||
Text,
|
Text,
|
||||||
Keyword,
|
Keyword,
|
||||||
|
|
|
@ -1,7 +1,5 @@
|
||||||
use core::{fmt, str};
|
use core::{fmt, str};
|
||||||
|
|
||||||
use serde_json as json;
|
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub enum TantivyBindingError {
|
pub enum TantivyBindingError {
|
||||||
JsonError(serde_json::Error),
|
JsonError(serde_json::Error),
|
||||||
|
|
|
@ -1,13 +1,9 @@
|
||||||
use std::{
|
use std::ffi::{c_char, c_void, CStr};
|
||||||
ffi::{c_char, c_void, CStr},
|
|
||||||
ptr::null,
|
|
||||||
};
|
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
array::{RustArray, RustResult},
|
array::RustResult,
|
||||||
cstr_to_str,
|
cstr_to_str,
|
||||||
index_reader::IndexReaderWrapper,
|
index_reader::IndexReaderWrapper,
|
||||||
string_c::create_string,
|
|
||||||
util::{create_binding, free_binding},
|
util::{create_binding, free_binding},
|
||||||
util_c::tantivy_index_exist,
|
util_c::tantivy_index_exist,
|
||||||
};
|
};
|
||||||
|
|
|
@ -4,7 +4,7 @@ use tantivy::{
|
||||||
Term,
|
Term,
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::error::{Result, TantivyBindingError};
|
use crate::error::Result;
|
||||||
use crate::{index_reader::IndexReaderWrapper, tokenizer::standard_analyzer};
|
use crate::{index_reader::IndexReaderWrapper, tokenizer::standard_analyzer};
|
||||||
|
|
||||||
impl IndexReaderWrapper {
|
impl IndexReaderWrapper {
|
||||||
|
|
|
@ -1,13 +1,9 @@
|
||||||
use std::{ffi::CStr, ptr::null};
|
use std::ffi::CStr;
|
||||||
|
|
||||||
use libc::{c_char, c_void};
|
use libc::{c_char, c_void};
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
array::{RustArray, RustResult},
|
array::RustResult, cstr_to_str, index_reader::IndexReaderWrapper, log::init_log,
|
||||||
cstr_to_str,
|
|
||||||
index_reader::IndexReaderWrapper,
|
|
||||||
log::init_log,
|
|
||||||
string_c::{c_str_to_str, create_string},
|
|
||||||
tokenizer::create_tokenizer,
|
tokenizer::create_tokenizer,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -1,12 +1,14 @@
|
||||||
use std::ffi::CStr;
|
use std::ffi::CStr;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use either::Either;
|
||||||
use futures::executor::block_on;
|
use futures::executor::block_on;
|
||||||
use libc::c_char;
|
use libc::c_char;
|
||||||
|
use log::info;
|
||||||
use tantivy::schema::{
|
use tantivy::schema::{
|
||||||
Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST, INDEXED,
|
Field, IndexRecordOption, Schema, SchemaBuilder, TextFieldIndexing, TextOptions, FAST, INDEXED,
|
||||||
};
|
};
|
||||||
use tantivy::{doc, tokenizer, Document, Index, IndexWriter};
|
use tantivy::{doc, Document, Index, IndexWriter, SingleSegmentIndexWriter};
|
||||||
|
|
||||||
use crate::data_type::TantivyDataType;
|
use crate::data_type::TantivyDataType;
|
||||||
|
|
||||||
|
@ -16,11 +18,34 @@ use crate::log::init_log;
|
||||||
|
|
||||||
pub(crate) struct IndexWriterWrapper {
|
pub(crate) struct IndexWriterWrapper {
|
||||||
pub(crate) field: Field,
|
pub(crate) field: Field,
|
||||||
pub(crate) index_writer: IndexWriter,
|
pub(crate) index_writer: Either<IndexWriter, SingleSegmentIndexWriter>,
|
||||||
pub(crate) id_field: Field,
|
pub(crate) id_field: Option<Field>,
|
||||||
pub(crate) index: Arc<Index>,
|
pub(crate) index: Arc<Index>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn schema_builder_add_field(
|
||||||
|
schema_builder: &mut SchemaBuilder,
|
||||||
|
field_name: &str,
|
||||||
|
data_type: TantivyDataType,
|
||||||
|
) -> Field {
|
||||||
|
match data_type {
|
||||||
|
TantivyDataType::I64 => schema_builder.add_i64_field(field_name, INDEXED),
|
||||||
|
TantivyDataType::F64 => schema_builder.add_f64_field(field_name, INDEXED),
|
||||||
|
TantivyDataType::Bool => schema_builder.add_bool_field(field_name, INDEXED),
|
||||||
|
TantivyDataType::Keyword => {
|
||||||
|
let text_field_indexing = TextFieldIndexing::default()
|
||||||
|
.set_tokenizer("raw")
|
||||||
|
.set_index_option(IndexRecordOption::Basic);
|
||||||
|
let text_options = TextOptions::default().set_indexing_options(text_field_indexing);
|
||||||
|
schema_builder.add_text_field(&field_name, text_options)
|
||||||
|
}
|
||||||
|
TantivyDataType::Text => {
|
||||||
|
panic!("text should be indexed with analyzer");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl IndexWriterWrapper {
|
impl IndexWriterWrapper {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
field_name: String,
|
field_name: String,
|
||||||
|
@ -30,30 +55,10 @@ impl IndexWriterWrapper {
|
||||||
overall_memory_budget_in_bytes: usize,
|
overall_memory_budget_in_bytes: usize,
|
||||||
) -> Result<IndexWriterWrapper> {
|
) -> Result<IndexWriterWrapper> {
|
||||||
init_log();
|
init_log();
|
||||||
|
info!("create index writer, field_name: {}, data_type: {:?}", field_name, data_type);
|
||||||
let field: Field;
|
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = Schema::builder();
|
||||||
match data_type {
|
let field = schema_builder_add_field(&mut schema_builder, &field_name, data_type);
|
||||||
TantivyDataType::I64 => {
|
// We cannot build direct connection from rows in multi-segments to milvus row data. So we have this doc_id field.
|
||||||
field = schema_builder.add_i64_field(&field_name, INDEXED);
|
|
||||||
}
|
|
||||||
TantivyDataType::F64 => {
|
|
||||||
field = schema_builder.add_f64_field(&field_name, INDEXED);
|
|
||||||
}
|
|
||||||
TantivyDataType::Bool => {
|
|
||||||
field = schema_builder.add_bool_field(&field_name, INDEXED);
|
|
||||||
}
|
|
||||||
TantivyDataType::Keyword => {
|
|
||||||
let text_field_indexing = TextFieldIndexing::default()
|
|
||||||
.set_tokenizer("raw")
|
|
||||||
.set_index_option(IndexRecordOption::Basic);
|
|
||||||
let text_options = TextOptions::default().set_indexing_options(text_field_indexing);
|
|
||||||
field = schema_builder.add_text_field(&field_name, text_options);
|
|
||||||
}
|
|
||||||
TantivyDataType::Text => {
|
|
||||||
panic!("text should be indexed with analyzer");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
let id_field = schema_builder.add_i64_field("doc_id", FAST);
|
let id_field = schema_builder.add_i64_field("doc_id", FAST);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_dir(path.clone(), schema)?;
|
let index = Index::create_in_dir(path.clone(), schema)?;
|
||||||
|
@ -61,8 +66,28 @@ impl IndexWriterWrapper {
|
||||||
index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?;
|
index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?;
|
||||||
Ok(IndexWriterWrapper {
|
Ok(IndexWriterWrapper {
|
||||||
field,
|
field,
|
||||||
index_writer,
|
index_writer: Either::Left(index_writer),
|
||||||
id_field,
|
id_field: Some(id_field),
|
||||||
|
index: Arc::new(index),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn new_with_single_segment(
|
||||||
|
field_name: String,
|
||||||
|
data_type: TantivyDataType,
|
||||||
|
path: String,
|
||||||
|
) -> Result<IndexWriterWrapper> {
|
||||||
|
init_log();
|
||||||
|
info!("create single segment index writer, field_name: {}, data_type: {:?}", field_name, data_type);
|
||||||
|
let mut schema_builder = Schema::builder();
|
||||||
|
let field = schema_builder_add_field(&mut schema_builder, &field_name, data_type);
|
||||||
|
let schema = schema_builder.build();
|
||||||
|
let index = Index::create_in_dir(path.clone(), schema)?;
|
||||||
|
let index_writer = SingleSegmentIndexWriter::new(index.clone(), 15 * 1024 * 1024)?;
|
||||||
|
Ok(IndexWriterWrapper {
|
||||||
|
field,
|
||||||
|
index_writer: Either::Right(index_writer),
|
||||||
|
id_field: None,
|
||||||
index: Arc::new(index),
|
index: Arc::new(index),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -71,6 +96,30 @@ impl IndexWriterWrapper {
|
||||||
IndexReaderWrapper::from_index(self.index.clone())
|
IndexReaderWrapper::from_index(self.index.clone())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn index_writer_add_document(&self, document: Document) -> Result<()> {
|
||||||
|
match self.index_writer {
|
||||||
|
Either::Left(ref writer) => {
|
||||||
|
let _ = writer.add_document(document)?;
|
||||||
|
}
|
||||||
|
Either::Right(_) => {
|
||||||
|
panic!("unexpected writer");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn single_segment_index_writer_add_document(&mut self, document: Document) -> Result<()> {
|
||||||
|
match self.index_writer {
|
||||||
|
Either::Left(_) => {
|
||||||
|
panic!("unexpected writer");
|
||||||
|
}
|
||||||
|
Either::Right(ref mut single_segmnet_writer) => {
|
||||||
|
let _ = single_segmnet_writer.add_document(document)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
pub fn add_i8(&mut self, data: i8, offset: i64) -> Result<()> {
|
pub fn add_i8(&mut self, data: i8, offset: i64) -> Result<()> {
|
||||||
self.add_i64(data.into(), offset)
|
self.add_i64(data.into(), offset)
|
||||||
}
|
}
|
||||||
|
@ -84,11 +133,10 @@ impl IndexWriterWrapper {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_i64(&mut self, data: i64, offset: i64) -> Result<()> {
|
pub fn add_i64(&mut self, data: i64, offset: i64) -> Result<()> {
|
||||||
let _ = self.index_writer.add_document(doc!(
|
self.index_writer_add_document(doc!(
|
||||||
self.field => data,
|
self.field => data,
|
||||||
self.id_field => offset,
|
self.id_field.unwrap() => offset,
|
||||||
))?;
|
))
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_f32(&mut self, data: f32, offset: i64) -> Result<()> {
|
pub fn add_f32(&mut self, data: f32, offset: i64) -> Result<()> {
|
||||||
|
@ -96,27 +144,24 @@ impl IndexWriterWrapper {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_f64(&mut self, data: f64, offset: i64) -> Result<()> {
|
pub fn add_f64(&mut self, data: f64, offset: i64) -> Result<()> {
|
||||||
let _ = self.index_writer.add_document(doc!(
|
self.index_writer_add_document(doc!(
|
||||||
self.field => data,
|
self.field => data,
|
||||||
self.id_field => offset,
|
self.id_field.unwrap() => offset,
|
||||||
))?;
|
))
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_bool(&mut self, data: bool, offset: i64) -> Result<()> {
|
pub fn add_bool(&mut self, data: bool, offset: i64) -> Result<()> {
|
||||||
let _ = self.index_writer.add_document(doc!(
|
self.index_writer_add_document(doc!(
|
||||||
self.field => data,
|
self.field => data,
|
||||||
self.id_field => offset,
|
self.id_field.unwrap() => offset,
|
||||||
))?;
|
))
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_string(&mut self, data: &str, offset: i64) -> Result<()> {
|
pub fn add_string(&mut self, data: &str, offset: i64) -> Result<()> {
|
||||||
let _ = self.index_writer.add_document(doc!(
|
self.index_writer_add_document(doc!(
|
||||||
self.field => data,
|
self.field => data,
|
||||||
self.id_field => offset,
|
self.id_field.unwrap() => offset,
|
||||||
))?;
|
))
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_multi_i8s(&mut self, datas: &[i8], offset: i64) -> Result<()> {
|
pub fn add_multi_i8s(&mut self, datas: &[i8], offset: i64) -> Result<()> {
|
||||||
|
@ -124,9 +169,8 @@ impl IndexWriterWrapper {
|
||||||
for data in datas {
|
for data in datas {
|
||||||
document.add_field_value(self.field, *data as i64);
|
document.add_field_value(self.field, *data as i64);
|
||||||
}
|
}
|
||||||
document.add_i64(self.id_field, offset);
|
document.add_i64(self.id_field.unwrap(), offset);
|
||||||
let _ = self.index_writer.add_document(document)?;
|
self.index_writer_add_document(document)
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_multi_i16s(&mut self, datas: &[i16], offset: i64) -> Result<()> {
|
pub fn add_multi_i16s(&mut self, datas: &[i16], offset: i64) -> Result<()> {
|
||||||
|
@ -134,9 +178,8 @@ impl IndexWriterWrapper {
|
||||||
for data in datas {
|
for data in datas {
|
||||||
document.add_field_value(self.field, *data as i64);
|
document.add_field_value(self.field, *data as i64);
|
||||||
}
|
}
|
||||||
document.add_i64(self.id_field, offset);
|
document.add_i64(self.id_field.unwrap(), offset);
|
||||||
let _ = self.index_writer.add_document(document)?;
|
self.index_writer_add_document(document)
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_multi_i32s(&mut self, datas: &[i32], offset: i64) -> Result<()> {
|
pub fn add_multi_i32s(&mut self, datas: &[i32], offset: i64) -> Result<()> {
|
||||||
|
@ -144,9 +187,8 @@ impl IndexWriterWrapper {
|
||||||
for data in datas {
|
for data in datas {
|
||||||
document.add_field_value(self.field, *data as i64);
|
document.add_field_value(self.field, *data as i64);
|
||||||
}
|
}
|
||||||
document.add_i64(self.id_field, offset);
|
document.add_i64(self.id_field.unwrap(), offset);
|
||||||
let _ = self.index_writer.add_document(document)?;
|
self.index_writer_add_document(document)
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_multi_i64s(&mut self, datas: &[i64], offset: i64) -> Result<()> {
|
pub fn add_multi_i64s(&mut self, datas: &[i64], offset: i64) -> Result<()> {
|
||||||
|
@ -154,9 +196,8 @@ impl IndexWriterWrapper {
|
||||||
for data in datas {
|
for data in datas {
|
||||||
document.add_field_value(self.field, *data);
|
document.add_field_value(self.field, *data);
|
||||||
}
|
}
|
||||||
document.add_i64(self.id_field, offset);
|
document.add_i64(self.id_field.unwrap(), offset);
|
||||||
let _ = self.index_writer.add_document(document)?;
|
self.index_writer_add_document(document)
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_multi_f32s(&mut self, datas: &[f32], offset: i64) -> Result<()> {
|
pub fn add_multi_f32s(&mut self, datas: &[f32], offset: i64) -> Result<()> {
|
||||||
|
@ -164,9 +205,8 @@ impl IndexWriterWrapper {
|
||||||
for data in datas {
|
for data in datas {
|
||||||
document.add_field_value(self.field, *data as f64);
|
document.add_field_value(self.field, *data as f64);
|
||||||
}
|
}
|
||||||
document.add_i64(self.id_field, offset);
|
document.add_i64(self.id_field.unwrap(), offset);
|
||||||
let _ = self.index_writer.add_document(document)?;
|
self.index_writer_add_document(document)
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_multi_f64s(&mut self, datas: &[f64], offset: i64) -> Result<()> {
|
pub fn add_multi_f64s(&mut self, datas: &[f64], offset: i64) -> Result<()> {
|
||||||
|
@ -174,9 +214,8 @@ impl IndexWriterWrapper {
|
||||||
for data in datas {
|
for data in datas {
|
||||||
document.add_field_value(self.field, *data);
|
document.add_field_value(self.field, *data);
|
||||||
}
|
}
|
||||||
document.add_i64(self.id_field, offset);
|
document.add_i64(self.id_field.unwrap(), offset);
|
||||||
let _ = self.index_writer.add_document(document)?;
|
self.index_writer_add_document(document)
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_multi_bools(&mut self, datas: &[bool], offset: i64) -> Result<()> {
|
pub fn add_multi_bools(&mut self, datas: &[bool], offset: i64) -> Result<()> {
|
||||||
|
@ -184,9 +223,8 @@ impl IndexWriterWrapper {
|
||||||
for data in datas {
|
for data in datas {
|
||||||
document.add_field_value(self.field, *data);
|
document.add_field_value(self.field, *data);
|
||||||
}
|
}
|
||||||
document.add_i64(self.id_field, offset);
|
document.add_i64(self.id_field.unwrap(), offset);
|
||||||
let _ = self.index_writer.add_document(document)?;
|
self.index_writer_add_document(document)
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_multi_keywords(&mut self, datas: &[*const c_char], offset: i64) -> Result<()> {
|
pub fn add_multi_keywords(&mut self, datas: &[*const c_char], offset: i64) -> Result<()> {
|
||||||
|
@ -195,31 +233,148 @@ impl IndexWriterWrapper {
|
||||||
let data = unsafe { CStr::from_ptr(*element) };
|
let data = unsafe { CStr::from_ptr(*element) };
|
||||||
document.add_field_value(self.field, data.to_str()?);
|
document.add_field_value(self.field, data.to_str()?);
|
||||||
}
|
}
|
||||||
document.add_i64(self.id_field, offset);
|
document.add_i64(self.id_field.unwrap(), offset);
|
||||||
let _ = self.index_writer.add_document(document)?;
|
self.index_writer_add_document(document)
|
||||||
Ok(())
|
}
|
||||||
|
|
||||||
|
pub fn add_i8_by_single_segment_writer(&mut self, data: i8) -> Result<()> {
|
||||||
|
self.add_i64_by_single_segment_writer(data.into())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add_i16_by_single_segment_writer(&mut self, data: i16) -> Result<()> {
|
||||||
|
self.add_i64_by_single_segment_writer(data.into())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add_i32_by_single_segment_writer(&mut self, data: i32) -> Result<()> {
|
||||||
|
self.add_i64_by_single_segment_writer(data.into())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add_i64_by_single_segment_writer(&mut self, data: i64) -> Result<()> {
|
||||||
|
self.single_segment_index_writer_add_document(doc!(
|
||||||
|
self.field => data
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add_f32_by_single_segment_writer(&mut self, data: f32) -> Result<()> {
|
||||||
|
self.add_f64_by_single_segment_writer(data.into())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add_f64_by_single_segment_writer(&mut self, data: f64) -> Result<()> {
|
||||||
|
self.single_segment_index_writer_add_document(doc!(
|
||||||
|
self.field => data
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add_bool_by_single_segment_writer(&mut self, data: bool) -> Result<()> {
|
||||||
|
self.single_segment_index_writer_add_document(doc!(
|
||||||
|
self.field => data
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add_string_by_single_segment_writer(&mut self, data: &str) -> Result<()> {
|
||||||
|
self.single_segment_index_writer_add_document(doc!(
|
||||||
|
self.field => data
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add_multi_i8s_by_single_segment_writer(&mut self, datas: &[i8]) -> Result<()> {
|
||||||
|
let mut document = Document::default();
|
||||||
|
for data in datas {
|
||||||
|
document.add_field_value(self.field, *data as i64);
|
||||||
|
}
|
||||||
|
self.single_segment_index_writer_add_document(document)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add_multi_i16s_by_single_segment_writer(&mut self, datas: &[i16]) -> Result<()> {
|
||||||
|
let mut document = Document::default();
|
||||||
|
for data in datas {
|
||||||
|
document.add_field_value(self.field, *data as i64);
|
||||||
|
}
|
||||||
|
self.single_segment_index_writer_add_document(document)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add_multi_i32s_by_single_segment_writer(&mut self, datas: &[i32]) -> Result<()> {
|
||||||
|
let mut document = Document::default();
|
||||||
|
for data in datas {
|
||||||
|
document.add_field_value(self.field, *data as i64);
|
||||||
|
}
|
||||||
|
self.single_segment_index_writer_add_document(document)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add_multi_i64s_by_single_segment_writer(&mut self, datas: &[i64]) -> Result<()> {
|
||||||
|
let mut document = Document::default();
|
||||||
|
for data in datas {
|
||||||
|
document.add_field_value(self.field, *data);
|
||||||
|
}
|
||||||
|
self.single_segment_index_writer_add_document(document)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add_multi_f32s_by_single_segment_writer(&mut self, datas: &[f32]) -> Result<()> {
|
||||||
|
let mut document = Document::default();
|
||||||
|
for data in datas {
|
||||||
|
document.add_field_value(self.field, *data as f64);
|
||||||
|
}
|
||||||
|
self.single_segment_index_writer_add_document(document)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add_multi_f64s_by_single_segment_writer(&mut self, datas: &[f64]) -> Result<()> {
|
||||||
|
let mut document = Document::default();
|
||||||
|
for data in datas {
|
||||||
|
document.add_field_value(self.field, *data);
|
||||||
|
}
|
||||||
|
self.single_segment_index_writer_add_document(document)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add_multi_bools_by_single_segment_writer(&mut self, datas: &[bool]) -> Result<()> {
|
||||||
|
let mut document = Document::default();
|
||||||
|
for data in datas {
|
||||||
|
document.add_field_value(self.field, *data);
|
||||||
|
}
|
||||||
|
self.single_segment_index_writer_add_document(document)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add_multi_keywords_by_single_segment_writer(
|
||||||
|
&mut self,
|
||||||
|
datas: &[*const c_char],
|
||||||
|
) -> Result<()> {
|
||||||
|
let mut document = Document::default();
|
||||||
|
for element in datas {
|
||||||
|
let data = unsafe { CStr::from_ptr(*element) };
|
||||||
|
document.add_field_value(self.field, data.to_str()?);
|
||||||
|
}
|
||||||
|
self.single_segment_index_writer_add_document(document)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn manual_merge(&mut self) -> Result<()> {
|
fn manual_merge(&mut self) -> Result<()> {
|
||||||
let metas = self.index_writer.index().searchable_segment_metas()?;
|
let index_writer = self.index_writer.as_mut().left().unwrap();
|
||||||
let policy = self.index_writer.get_merge_policy();
|
let metas = index_writer.index().searchable_segment_metas()?;
|
||||||
|
let policy = index_writer.get_merge_policy();
|
||||||
let candidates = policy.compute_merge_candidates(metas.as_slice());
|
let candidates = policy.compute_merge_candidates(metas.as_slice());
|
||||||
for candidate in candidates {
|
for candidate in candidates {
|
||||||
self.index_writer.merge(candidate.0.as_slice()).wait()?;
|
index_writer.merge(candidate.0.as_slice()).wait()?;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn finish(mut self) -> Result<()> {
|
pub fn finish(self) -> Result<()> {
|
||||||
self.index_writer.commit()?;
|
match self.index_writer {
|
||||||
// self.manual_merge();
|
Either::Left(mut index_writer) => {
|
||||||
block_on(self.index_writer.garbage_collect_files())?;
|
index_writer.commit()?;
|
||||||
self.index_writer.wait_merging_threads()?;
|
// self.manual_merge();
|
||||||
|
block_on(index_writer.garbage_collect_files())?;
|
||||||
|
index_writer.wait_merging_threads()?;
|
||||||
|
}
|
||||||
|
Either::Right(single_segment_index_writer) => {
|
||||||
|
single_segment_index_writer
|
||||||
|
.finalize()
|
||||||
|
.expect("failed to build inverted index");
|
||||||
|
}
|
||||||
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn commit(&mut self) -> Result<()> {
|
pub(crate) fn commit(&mut self) -> Result<()> {
|
||||||
self.index_writer.commit()?;
|
self.index_writer.as_mut().left().unwrap().commit()?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,10 +1,5 @@
|
||||||
use core::slice;
|
use core::slice;
|
||||||
use std::{
|
use std::ffi::{c_char, c_void, CStr};
|
||||||
ffi::{c_char, c_void, CStr},
|
|
||||||
ptr::null,
|
|
||||||
};
|
|
||||||
|
|
||||||
use tantivy::Index;
|
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
array::RustResult,
|
array::RustResult,
|
||||||
|
@ -47,6 +42,24 @@ pub extern "C" fn tantivy_create_index(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[no_mangle]
|
||||||
|
pub extern "C" fn tantivy_create_index_with_single_segment(
|
||||||
|
field_name: *const c_char,
|
||||||
|
data_type: TantivyDataType,
|
||||||
|
path: *const c_char,
|
||||||
|
) -> RustResult {
|
||||||
|
let field_name_str = cstr_to_str!(field_name);
|
||||||
|
let path_str = cstr_to_str!(path);
|
||||||
|
match IndexWriterWrapper::new_with_single_segment(
|
||||||
|
String::from(field_name_str),
|
||||||
|
data_type,
|
||||||
|
String::from(path_str),
|
||||||
|
) {
|
||||||
|
Ok(wrapper) => RustResult::from_ptr(create_binding(wrapper)),
|
||||||
|
Err(e) => RustResult::from_error(e.to_string()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_free_index_writer(ptr: *mut c_void) {
|
pub extern "C" fn tantivy_free_index_writer(ptr: *mut c_void) {
|
||||||
free_binding::<IndexWriterWrapper>(ptr);
|
free_binding::<IndexWriterWrapper>(ptr);
|
||||||
|
@ -77,6 +90,29 @@ pub extern "C" fn tantivy_create_reader_from_writer(ptr: *mut c_void) -> RustRes
|
||||||
}
|
}
|
||||||
|
|
||||||
// -------------------------build--------------------
|
// -------------------------build--------------------
|
||||||
|
fn execute<T: Copy>(
|
||||||
|
arr: &[T],
|
||||||
|
offset: i64,
|
||||||
|
e: fn(&mut IndexWriterWrapper, T, i64) -> Result<()>,
|
||||||
|
w: &mut IndexWriterWrapper,
|
||||||
|
) -> Result<()> {
|
||||||
|
for (index, data) in arr.iter().enumerate() {
|
||||||
|
e(w, *data, offset + (index as i64))?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn execute_by_single_segment_writer<T: Copy>(
|
||||||
|
arr: &[T],
|
||||||
|
e: fn(&mut IndexWriterWrapper, T) -> Result<()>,
|
||||||
|
w: &mut IndexWriterWrapper,
|
||||||
|
) -> Result<()> {
|
||||||
|
for (_, data) in arr.iter().enumerate() {
|
||||||
|
e(w, *data)?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_index_add_int8s(
|
pub extern "C" fn tantivy_index_add_int8s(
|
||||||
ptr: *mut c_void,
|
ptr: *mut c_void,
|
||||||
|
@ -89,6 +125,24 @@ pub extern "C" fn tantivy_index_add_int8s(
|
||||||
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i8, &mut (*real)).into() }
|
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i8, &mut (*real)).into() }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[no_mangle]
|
||||||
|
pub extern "C" fn tantivy_index_add_int8s_by_single_segment_writer(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const i8,
|
||||||
|
len: usize,
|
||||||
|
) -> RustResult {
|
||||||
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
|
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||||
|
unsafe {
|
||||||
|
execute_by_single_segment_writer(
|
||||||
|
arr,
|
||||||
|
IndexWriterWrapper::add_i8_by_single_segment_writer,
|
||||||
|
&mut (*real),
|
||||||
|
)
|
||||||
|
.into()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_index_add_int16s(
|
pub extern "C" fn tantivy_index_add_int16s(
|
||||||
ptr: *mut c_void,
|
ptr: *mut c_void,
|
||||||
|
@ -101,6 +155,24 @@ pub extern "C" fn tantivy_index_add_int16s(
|
||||||
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i16, &mut (*real)).into() }
|
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i16, &mut (*real)).into() }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[no_mangle]
|
||||||
|
pub extern "C" fn tantivy_index_add_int16s_by_single_segment_writer(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const i16,
|
||||||
|
len: usize,
|
||||||
|
) -> RustResult {
|
||||||
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
|
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||||
|
unsafe {
|
||||||
|
execute_by_single_segment_writer(
|
||||||
|
arr,
|
||||||
|
IndexWriterWrapper::add_i16_by_single_segment_writer,
|
||||||
|
&mut (*real),
|
||||||
|
)
|
||||||
|
.into()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_index_add_int32s(
|
pub extern "C" fn tantivy_index_add_int32s(
|
||||||
ptr: *mut c_void,
|
ptr: *mut c_void,
|
||||||
|
@ -113,6 +185,24 @@ pub extern "C" fn tantivy_index_add_int32s(
|
||||||
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i32, &mut (*real)).into() }
|
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i32, &mut (*real)).into() }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[no_mangle]
|
||||||
|
pub extern "C" fn tantivy_index_add_int32s_by_single_segment_writer(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const i32,
|
||||||
|
len: usize,
|
||||||
|
) -> RustResult {
|
||||||
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
|
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||||
|
unsafe {
|
||||||
|
execute_by_single_segment_writer(
|
||||||
|
arr,
|
||||||
|
IndexWriterWrapper::add_i32_by_single_segment_writer,
|
||||||
|
&mut (*real),
|
||||||
|
)
|
||||||
|
.into()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_index_add_int64s(
|
pub extern "C" fn tantivy_index_add_int64s(
|
||||||
ptr: *mut c_void,
|
ptr: *mut c_void,
|
||||||
|
@ -126,18 +216,23 @@ pub extern "C" fn tantivy_index_add_int64s(
|
||||||
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i64, &mut (*real)).into() }
|
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i64, &mut (*real)).into() }
|
||||||
}
|
}
|
||||||
|
|
||||||
fn execute<T: Copy>(
|
#[no_mangle]
|
||||||
arr: &[T],
|
pub extern "C" fn tantivy_index_add_int64s_by_single_segment_writer(
|
||||||
offset: i64,
|
ptr: *mut c_void,
|
||||||
mut e: fn(&mut IndexWriterWrapper, T, i64) -> Result<()>,
|
array: *const i64,
|
||||||
w: &mut IndexWriterWrapper,
|
len: usize,
|
||||||
) -> Result<()> {
|
) -> RustResult {
|
||||||
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
|
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||||
|
|
||||||
unsafe {
|
unsafe {
|
||||||
for (index, data) in arr.iter().enumerate() {
|
execute_by_single_segment_writer(
|
||||||
e(w, *data, offset + (index as i64))?;
|
arr,
|
||||||
}
|
IndexWriterWrapper::add_i64_by_single_segment_writer,
|
||||||
|
&mut (*real),
|
||||||
|
)
|
||||||
|
.into()
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
|
@ -152,6 +247,24 @@ pub extern "C" fn tantivy_index_add_f32s(
|
||||||
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_f32, &mut (*real)).into() }
|
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_f32, &mut (*real)).into() }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[no_mangle]
|
||||||
|
pub extern "C" fn tantivy_index_add_f32s_by_single_segment_writer(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const f32,
|
||||||
|
len: usize,
|
||||||
|
) -> RustResult {
|
||||||
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
|
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||||
|
unsafe {
|
||||||
|
execute_by_single_segment_writer(
|
||||||
|
arr,
|
||||||
|
IndexWriterWrapper::add_f32_by_single_segment_writer,
|
||||||
|
&mut (*real),
|
||||||
|
)
|
||||||
|
.into()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_index_add_f64s(
|
pub extern "C" fn tantivy_index_add_f64s(
|
||||||
ptr: *mut c_void,
|
ptr: *mut c_void,
|
||||||
|
@ -164,6 +277,24 @@ pub extern "C" fn tantivy_index_add_f64s(
|
||||||
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_f64, &mut (*real)).into() }
|
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_f64, &mut (*real)).into() }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[no_mangle]
|
||||||
|
pub extern "C" fn tantivy_index_add_f64s_by_single_segment_writer(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const f64,
|
||||||
|
len: usize,
|
||||||
|
) -> RustResult {
|
||||||
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
|
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||||
|
unsafe {
|
||||||
|
execute_by_single_segment_writer(
|
||||||
|
arr,
|
||||||
|
IndexWriterWrapper::add_f64_by_single_segment_writer,
|
||||||
|
&mut (*real),
|
||||||
|
)
|
||||||
|
.into()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_index_add_bools(
|
pub extern "C" fn tantivy_index_add_bools(
|
||||||
ptr: *mut c_void,
|
ptr: *mut c_void,
|
||||||
|
@ -184,6 +315,24 @@ pub extern "C" fn tantivy_index_add_bools(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[no_mangle]
|
||||||
|
pub extern "C" fn tantivy_index_add_bools_by_single_segment_writer(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const bool,
|
||||||
|
len: usize,
|
||||||
|
) -> RustResult {
|
||||||
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
|
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||||
|
unsafe {
|
||||||
|
execute_by_single_segment_writer(
|
||||||
|
arr,
|
||||||
|
IndexWriterWrapper::add_bool_by_single_segment_writer,
|
||||||
|
&mut (*real),
|
||||||
|
)
|
||||||
|
.into()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: this is not a very efficient way, since we must call this function many times, which
|
// TODO: this is not a very efficient way, since we must call this function many times, which
|
||||||
// will bring a lot of overhead caused by the rust binding.
|
// will bring a lot of overhead caused by the rust binding.
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
|
@ -197,6 +346,16 @@ pub extern "C" fn tantivy_index_add_string(
|
||||||
unsafe { (*real).add_string(s, offset).into() }
|
unsafe { (*real).add_string(s, offset).into() }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[no_mangle]
|
||||||
|
pub extern "C" fn tantivy_index_add_string_by_single_segment_writer(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
s: *const c_char,
|
||||||
|
) -> RustResult {
|
||||||
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
|
let s = cstr_to_str!(s);
|
||||||
|
unsafe { (*real).add_string_by_single_segment_writer(s).into() }
|
||||||
|
}
|
||||||
|
|
||||||
// --------------------------------------------- array ------------------------------------------
|
// --------------------------------------------- array ------------------------------------------
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
|
@ -213,6 +372,19 @@ pub extern "C" fn tantivy_index_add_multi_int8s(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[no_mangle]
|
||||||
|
pub extern "C" fn tantivy_index_add_multi_int8s_by_single_segment_writer(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const i8,
|
||||||
|
len: usize,
|
||||||
|
) -> RustResult {
|
||||||
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
|
unsafe {
|
||||||
|
let arr = convert_to_rust_slice!(array, len);
|
||||||
|
(*real).add_multi_i8s_by_single_segment_writer(arr).into()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_index_add_multi_int16s(
|
pub extern "C" fn tantivy_index_add_multi_int16s(
|
||||||
ptr: *mut c_void,
|
ptr: *mut c_void,
|
||||||
|
@ -227,6 +399,19 @@ pub extern "C" fn tantivy_index_add_multi_int16s(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[no_mangle]
|
||||||
|
pub extern "C" fn tantivy_index_add_multi_int16s_by_single_segment_writer(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const i16,
|
||||||
|
len: usize,
|
||||||
|
) -> RustResult {
|
||||||
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
|
unsafe {
|
||||||
|
let arr = convert_to_rust_slice!(array, len);
|
||||||
|
(*real).add_multi_i16s_by_single_segment_writer(arr).into()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_index_add_multi_int32s(
|
pub extern "C" fn tantivy_index_add_multi_int32s(
|
||||||
ptr: *mut c_void,
|
ptr: *mut c_void,
|
||||||
|
@ -241,6 +426,19 @@ pub extern "C" fn tantivy_index_add_multi_int32s(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[no_mangle]
|
||||||
|
pub extern "C" fn tantivy_index_add_multi_int32s_by_single_segment_writer(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const i32,
|
||||||
|
len: usize,
|
||||||
|
) -> RustResult {
|
||||||
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
|
unsafe {
|
||||||
|
let arr = convert_to_rust_slice!(array, len);
|
||||||
|
(*real).add_multi_i32s_by_single_segment_writer(arr).into()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_index_add_multi_int64s(
|
pub extern "C" fn tantivy_index_add_multi_int64s(
|
||||||
ptr: *mut c_void,
|
ptr: *mut c_void,
|
||||||
|
@ -255,6 +453,19 @@ pub extern "C" fn tantivy_index_add_multi_int64s(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[no_mangle]
|
||||||
|
pub extern "C" fn tantivy_index_add_multi_int64s_by_single_segment_writer(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const i64,
|
||||||
|
len: usize,
|
||||||
|
) -> RustResult {
|
||||||
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
|
unsafe {
|
||||||
|
let arr = convert_to_rust_slice!(array, len);
|
||||||
|
(*real).add_multi_i64s_by_single_segment_writer(arr).into()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_index_add_multi_f32s(
|
pub extern "C" fn tantivy_index_add_multi_f32s(
|
||||||
ptr: *mut c_void,
|
ptr: *mut c_void,
|
||||||
|
@ -269,6 +480,19 @@ pub extern "C" fn tantivy_index_add_multi_f32s(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[no_mangle]
|
||||||
|
pub extern "C" fn tantivy_index_add_multi_f32s_by_single_segment_writer(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const f32,
|
||||||
|
len: usize,
|
||||||
|
) -> RustResult {
|
||||||
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
|
unsafe {
|
||||||
|
let arr = convert_to_rust_slice!(array, len);
|
||||||
|
(*real).add_multi_f32s_by_single_segment_writer(arr).into()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_index_add_multi_f64s(
|
pub extern "C" fn tantivy_index_add_multi_f64s(
|
||||||
ptr: *mut c_void,
|
ptr: *mut c_void,
|
||||||
|
@ -283,6 +507,19 @@ pub extern "C" fn tantivy_index_add_multi_f64s(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[no_mangle]
|
||||||
|
pub extern "C" fn tantivy_index_add_multi_f64s_by_single_segment_writer(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const f64,
|
||||||
|
len: usize,
|
||||||
|
) -> RustResult {
|
||||||
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
|
unsafe {
|
||||||
|
let arr = convert_to_rust_slice!(array, len);
|
||||||
|
(*real).add_multi_f64s_by_single_segment_writer(arr).into()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_index_add_multi_bools(
|
pub extern "C" fn tantivy_index_add_multi_bools(
|
||||||
ptr: *mut c_void,
|
ptr: *mut c_void,
|
||||||
|
@ -297,6 +534,19 @@ pub extern "C" fn tantivy_index_add_multi_bools(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[no_mangle]
|
||||||
|
pub extern "C" fn tantivy_index_add_multi_bools_by_single_segment_writer(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const bool,
|
||||||
|
len: usize,
|
||||||
|
) -> RustResult {
|
||||||
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
|
unsafe {
|
||||||
|
let arr = convert_to_rust_slice!(array, len);
|
||||||
|
(*real).add_multi_bools_by_single_segment_writer(arr).into()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub extern "C" fn tantivy_index_add_multi_keywords(
|
pub extern "C" fn tantivy_index_add_multi_keywords(
|
||||||
ptr: *mut c_void,
|
ptr: *mut c_void,
|
||||||
|
@ -310,3 +560,18 @@ pub extern "C" fn tantivy_index_add_multi_keywords(
|
||||||
(*real).add_multi_keywords(arr, offset).into()
|
(*real).add_multi_keywords(arr, offset).into()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[no_mangle]
|
||||||
|
pub extern "C" fn tantivy_index_add_multi_keywords_by_single_segment_writer(
|
||||||
|
ptr: *mut c_void,
|
||||||
|
array: *const *const c_char,
|
||||||
|
len: usize,
|
||||||
|
) -> RustResult {
|
||||||
|
let real = ptr as *mut IndexWriterWrapper;
|
||||||
|
unsafe {
|
||||||
|
let arr = convert_to_rust_slice!(array, len);
|
||||||
|
(*real)
|
||||||
|
.add_multi_keywords_by_single_segment_writer(arr)
|
||||||
|
.into()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use either::Either;
|
||||||
use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST};
|
use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST};
|
||||||
use tantivy::tokenizer::TextAnalyzer;
|
use tantivy::tokenizer::TextAnalyzer;
|
||||||
use tantivy::Index;
|
use tantivy::Index;
|
||||||
|
@ -44,8 +45,8 @@ impl IndexWriterWrapper {
|
||||||
|
|
||||||
IndexWriterWrapper {
|
IndexWriterWrapper {
|
||||||
field,
|
field,
|
||||||
index_writer,
|
index_writer: Either::Left(index_writer),
|
||||||
id_field,
|
id_field: Some(id_field),
|
||||||
index: Arc::new(index),
|
index: Arc::new(index),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,13 +1,10 @@
|
||||||
use std::ffi::c_char;
|
use std::ffi::c_char;
|
||||||
use std::ffi::c_void;
|
|
||||||
use std::ffi::CStr;
|
use std::ffi::CStr;
|
||||||
|
|
||||||
use crate::array::RustResult;
|
use crate::array::RustResult;
|
||||||
use crate::cstr_to_str;
|
use crate::cstr_to_str;
|
||||||
use crate::error::Result;
|
|
||||||
use crate::index_writer::IndexWriterWrapper;
|
use crate::index_writer::IndexWriterWrapper;
|
||||||
use crate::log::init_log;
|
use crate::log::init_log;
|
||||||
use crate::string_c::c_str_to_str;
|
|
||||||
use crate::tokenizer::create_tokenizer;
|
use crate::tokenizer::create_tokenizer;
|
||||||
use crate::util::create_binding;
|
use crate::util::create_binding;
|
||||||
|
|
||||||
|
|
|
@ -81,15 +81,22 @@ struct TantivyIndexWrapper {
|
||||||
TantivyIndexWrapper(const char* field_name,
|
TantivyIndexWrapper(const char* field_name,
|
||||||
TantivyDataType data_type,
|
TantivyDataType data_type,
|
||||||
const char* path,
|
const char* path,
|
||||||
|
bool inverted_single_semgnent = false,
|
||||||
uintptr_t num_threads = DEFAULT_NUM_THREADS,
|
uintptr_t num_threads = DEFAULT_NUM_THREADS,
|
||||||
uintptr_t overall_memory_budget_in_bytes =
|
uintptr_t overall_memory_budget_in_bytes =
|
||||||
DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES) {
|
DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES) {
|
||||||
auto res = RustResultWrapper(
|
RustResultWrapper res;
|
||||||
tantivy_create_index(field_name,
|
if (inverted_single_semgnent) {
|
||||||
data_type,
|
res = RustResultWrapper(tantivy_create_index_with_single_segment(
|
||||||
path,
|
field_name, data_type, path));
|
||||||
num_threads,
|
} else {
|
||||||
overall_memory_budget_in_bytes));
|
res = RustResultWrapper(
|
||||||
|
tantivy_create_index(field_name,
|
||||||
|
data_type,
|
||||||
|
path,
|
||||||
|
num_threads,
|
||||||
|
overall_memory_budget_in_bytes));
|
||||||
|
}
|
||||||
AssertInfo(res.result_->success,
|
AssertInfo(res.result_->success,
|
||||||
"failed to create index: {}",
|
"failed to create index: {}",
|
||||||
res.result_->error);
|
res.result_->error);
|
||||||
|
@ -340,6 +347,193 @@ struct TantivyIndexWrapper {
|
||||||
typeid(T).name());
|
typeid(T).name());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void
|
||||||
|
add_data_by_single_segment_writer(const T* array, uintptr_t len) {
|
||||||
|
assert(!finished_);
|
||||||
|
|
||||||
|
if constexpr (std::is_same_v<T, bool>) {
|
||||||
|
auto res = RustResultWrapper(
|
||||||
|
tantivy_index_add_bools_by_single_segment_writer(
|
||||||
|
writer_, array, len));
|
||||||
|
AssertInfo(res.result_->success,
|
||||||
|
"failed to add bools: {}",
|
||||||
|
res.result_->error);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if constexpr (std::is_same_v<T, int8_t>) {
|
||||||
|
auto res = RustResultWrapper(
|
||||||
|
tantivy_index_add_int8s_by_single_segment_writer(
|
||||||
|
writer_, array, len));
|
||||||
|
AssertInfo(res.result_->success,
|
||||||
|
"failed to add int8s: {}",
|
||||||
|
res.result_->error);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if constexpr (std::is_same_v<T, int16_t>) {
|
||||||
|
auto res = RustResultWrapper(
|
||||||
|
tantivy_index_add_int16s_by_single_segment_writer(
|
||||||
|
writer_, array, len));
|
||||||
|
AssertInfo(res.result_->success,
|
||||||
|
"failed to add int16s: {}",
|
||||||
|
res.result_->error);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if constexpr (std::is_same_v<T, int32_t>) {
|
||||||
|
auto res = RustResultWrapper(
|
||||||
|
tantivy_index_add_int32s_by_single_segment_writer(
|
||||||
|
writer_, array, len));
|
||||||
|
AssertInfo(res.result_->success,
|
||||||
|
"failed to add int32s: {}",
|
||||||
|
res.result_->error);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if constexpr (std::is_same_v<T, int64_t>) {
|
||||||
|
auto res = RustResultWrapper(
|
||||||
|
tantivy_index_add_int64s_by_single_segment_writer(
|
||||||
|
writer_, array, len));
|
||||||
|
AssertInfo(res.result_->success,
|
||||||
|
"failed to add int64s: {}",
|
||||||
|
res.result_->error);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if constexpr (std::is_same_v<T, float>) {
|
||||||
|
auto res = RustResultWrapper(
|
||||||
|
tantivy_index_add_f32s_by_single_segment_writer(
|
||||||
|
writer_, array, len));
|
||||||
|
AssertInfo(res.result_->success,
|
||||||
|
"failed to add f32s: {}",
|
||||||
|
res.result_->error);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if constexpr (std::is_same_v<T, double>) {
|
||||||
|
auto res = RustResultWrapper(
|
||||||
|
tantivy_index_add_f64s_by_single_segment_writer(
|
||||||
|
writer_, array, len));
|
||||||
|
AssertInfo(res.result_->success,
|
||||||
|
"failed to add f64s: {}",
|
||||||
|
res.result_->error);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if constexpr (std::is_same_v<T, std::string>) {
|
||||||
|
// TODO: not very efficient, a lot of overhead due to rust-ffi call.
|
||||||
|
for (uintptr_t i = 0; i < len; i++) {
|
||||||
|
auto res = RustResultWrapper(
|
||||||
|
tantivy_index_add_string_by_single_segment_writer(
|
||||||
|
writer_,
|
||||||
|
static_cast<const std::string*>(array)[i].c_str()));
|
||||||
|
AssertInfo(res.result_->success,
|
||||||
|
"failed to add string: {}",
|
||||||
|
res.result_->error);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
throw fmt::format("InvertedIndex.add_data: unsupported data type: {}",
|
||||||
|
typeid(T).name());
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void
|
||||||
|
add_multi_data_by_single_segment_writer(const T* array, uintptr_t len) {
|
||||||
|
assert(!finished_);
|
||||||
|
|
||||||
|
if constexpr (std::is_same_v<T, bool>) {
|
||||||
|
auto res = RustResultWrapper(
|
||||||
|
tantivy_index_add_multi_bools_by_single_segment_writer(
|
||||||
|
writer_, array, len));
|
||||||
|
AssertInfo(res.result_->success,
|
||||||
|
"failed to add multi bools: {}",
|
||||||
|
res.result_->error);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if constexpr (std::is_same_v<T, int8_t>) {
|
||||||
|
auto res = RustResultWrapper(
|
||||||
|
tantivy_index_add_multi_int8s_by_single_segment_writer(
|
||||||
|
writer_, array, len));
|
||||||
|
AssertInfo(res.result_->success,
|
||||||
|
"failed to add multi int8s: {}",
|
||||||
|
res.result_->error);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if constexpr (std::is_same_v<T, int16_t>) {
|
||||||
|
auto res = RustResultWrapper(
|
||||||
|
tantivy_index_add_multi_int16s_by_single_segment_writer(
|
||||||
|
writer_, array, len));
|
||||||
|
AssertInfo(res.result_->success,
|
||||||
|
"failed to add multi int16s: {}",
|
||||||
|
res.result_->error);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if constexpr (std::is_same_v<T, int32_t>) {
|
||||||
|
auto res = RustResultWrapper(
|
||||||
|
tantivy_index_add_multi_int32s_by_single_segment_writer(
|
||||||
|
writer_, array, len));
|
||||||
|
AssertInfo(res.result_->success,
|
||||||
|
"failed to add multi int32s: {}",
|
||||||
|
res.result_->error);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if constexpr (std::is_same_v<T, int64_t>) {
|
||||||
|
auto res = RustResultWrapper(
|
||||||
|
tantivy_index_add_multi_int64s_by_single_segment_writer(
|
||||||
|
writer_, array, len));
|
||||||
|
AssertInfo(res.result_->success,
|
||||||
|
"failed to add multi int64s: {}",
|
||||||
|
res.result_->error);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if constexpr (std::is_same_v<T, float>) {
|
||||||
|
auto res = RustResultWrapper(
|
||||||
|
tantivy_index_add_multi_f32s_by_single_segment_writer(
|
||||||
|
writer_, array, len));
|
||||||
|
AssertInfo(res.result_->success,
|
||||||
|
"failed to add multi f32s: {}",
|
||||||
|
res.result_->error);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if constexpr (std::is_same_v<T, double>) {
|
||||||
|
auto res = RustResultWrapper(
|
||||||
|
tantivy_index_add_multi_f64s_by_single_segment_writer(
|
||||||
|
writer_, array, len));
|
||||||
|
AssertInfo(res.result_->success,
|
||||||
|
"failed to add multi f64s: {}",
|
||||||
|
res.result_->error);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if constexpr (std::is_same_v<T, std::string>) {
|
||||||
|
std::vector<const char*> views;
|
||||||
|
for (uintptr_t i = 0; i < len; i++) {
|
||||||
|
views.push_back(array[i].c_str());
|
||||||
|
}
|
||||||
|
auto res = RustResultWrapper(
|
||||||
|
tantivy_index_add_multi_keywords_by_single_segment_writer(
|
||||||
|
writer_, views.data(), len));
|
||||||
|
AssertInfo(res.result_->success,
|
||||||
|
"failed to add multi keywords: {}",
|
||||||
|
res.result_->error);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
throw fmt::format(
|
||||||
|
"InvertedIndex.add_multi_data: unsupported data type: {}",
|
||||||
|
typeid(T).name());
|
||||||
|
}
|
||||||
|
|
||||||
inline void
|
inline void
|
||||||
finish() {
|
finish() {
|
||||||
if (finished_) {
|
if (finished_) {
|
||||||
|
|
|
@ -90,6 +90,7 @@ set(MILVUS_TEST_FILES
|
||||||
test_chunked_column.cpp
|
test_chunked_column.cpp
|
||||||
test_rust_result.cpp
|
test_rust_result.cpp
|
||||||
test_cached_search_iterator.cpp
|
test_cached_search_iterator.cpp
|
||||||
|
test_build_inverted_index_with_single_segment.cpp
|
||||||
)
|
)
|
||||||
|
|
||||||
if ( INDEX_ENGINE STREQUAL "cardinal" )
|
if ( INDEX_ENGINE STREQUAL "cardinal" )
|
||||||
|
|
|
@ -113,7 +113,7 @@ class ArrayInvertedIndexTest : public ::testing::Test {
|
||||||
auto index = std::make_unique<index::InvertedIndexTantivy<T>>();
|
auto index = std::make_unique<index::InvertedIndexTantivy<T>>();
|
||||||
Config cfg;
|
Config cfg;
|
||||||
cfg["is_array"] = true;
|
cfg["is_array"] = true;
|
||||||
index->BuildWithRawData(N_, vec_of_array_.data(), cfg);
|
index->BuildWithRawDataForUT(N_, vec_of_array_.data(), cfg);
|
||||||
LoadIndexInfo info{
|
LoadIndexInfo info{
|
||||||
.field_id = schema_->get_field_id(FieldName("array")).get(),
|
.field_id = schema_->get_field_id(FieldName("array")).get(),
|
||||||
.index = std::move(index),
|
.index = std::move(index),
|
||||||
|
|
|
@ -0,0 +1,214 @@
|
||||||
|
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
||||||
|
// with the License. You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||||
|
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||||
|
// or implied. See the License for the specific language governing permissions and limitations under the License
|
||||||
|
|
||||||
|
#include <random>
|
||||||
|
#include <gtest/gtest.h>
|
||||||
|
|
||||||
|
#include "pb/plan.pb.h"
|
||||||
|
#include "segcore/SegmentSealedImpl.h"
|
||||||
|
#include "index/InvertedIndexTantivy.h"
|
||||||
|
#include "test_utils/DataGen.h"
|
||||||
|
#include "common/Schema.h"
|
||||||
|
#include "test_utils/GenExprProto.h"
|
||||||
|
#include "query/PlanProto.h"
|
||||||
|
#include "query/ExecPlanNodeVisitor.h"
|
||||||
|
|
||||||
|
using namespace milvus;
|
||||||
|
using namespace milvus::query;
|
||||||
|
using namespace milvus::segcore;
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
SchemaPtr
|
||||||
|
GenSchema() {
|
||||||
|
auto schema_ = std::make_shared<Schema>();
|
||||||
|
auto pk = schema_->AddDebugField("pk", DataType::INT64);
|
||||||
|
schema_->set_primary_field_id(pk);
|
||||||
|
|
||||||
|
if constexpr (std::is_same_v<T, bool>) {
|
||||||
|
schema_->AddDebugField("index", DataType::BOOL, false);
|
||||||
|
} else if constexpr (std::is_same_v<T, int8_t>) {
|
||||||
|
schema_->AddDebugField("index", DataType::INT8, false);
|
||||||
|
} else if constexpr (std::is_same_v<T, int16_t>) {
|
||||||
|
schema_->AddDebugField("index", DataType::INT16, false);
|
||||||
|
} else if constexpr (std::is_same_v<T, int32_t>) {
|
||||||
|
schema_->AddDebugField("index", DataType::INT32, false);
|
||||||
|
} else if constexpr (std::is_same_v<T, int64_t>) {
|
||||||
|
schema_->AddDebugField("index", DataType::INT64, false);
|
||||||
|
} else if constexpr (std::is_same_v<T, float>) {
|
||||||
|
schema_->AddDebugField("index", DataType::FLOAT, false);
|
||||||
|
} else if constexpr (std::is_same_v<T, double>) {
|
||||||
|
schema_->AddDebugField("index", DataType::DOUBLE, false);
|
||||||
|
} else if constexpr (std::is_same_v<T, std::string>) {
|
||||||
|
schema_->AddDebugField("index", DataType::VARCHAR, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
return schema_;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
class BuildInvertedIndexWithSingleSegmentTest : public ::testing::Test {
|
||||||
|
public:
|
||||||
|
void
|
||||||
|
SetUp() override {
|
||||||
|
schema_ = GenSchema<T>();
|
||||||
|
seg_ = CreateSealedSegment(schema_);
|
||||||
|
N_ = 3000;
|
||||||
|
uint64_t seed = 1234;
|
||||||
|
auto raw_data = DataGen(schema_, N_, seed);
|
||||||
|
|
||||||
|
if constexpr (std::is_same_v<T, bool>) {
|
||||||
|
auto index_col =
|
||||||
|
raw_data.get_col(schema_->get_field_id(FieldName("index")))
|
||||||
|
->scalars()
|
||||||
|
.bool_data()
|
||||||
|
.data();
|
||||||
|
for (size_t i = 0; i < N_; i++) {
|
||||||
|
index_column_data_.push_back(index_col[i]);
|
||||||
|
}
|
||||||
|
} else if constexpr (std::is_same_v<T, int64_t>) {
|
||||||
|
auto index_col =
|
||||||
|
raw_data.get_col(schema_->get_field_id(FieldName("index")))
|
||||||
|
->scalars()
|
||||||
|
.long_data()
|
||||||
|
.data();
|
||||||
|
for (size_t i = 0; i < N_; i++) {
|
||||||
|
index_column_data_.push_back(index_col[i]);
|
||||||
|
}
|
||||||
|
} else if constexpr (std::is_integral_v<T>) {
|
||||||
|
auto index_col =
|
||||||
|
raw_data.get_col(schema_->get_field_id(FieldName("index")))
|
||||||
|
->scalars()
|
||||||
|
.int_data()
|
||||||
|
.data();
|
||||||
|
for (size_t i = 0; i < N_; i++) {
|
||||||
|
index_column_data_.push_back(index_col[i]);
|
||||||
|
}
|
||||||
|
} else if constexpr (std::is_same_v<T, float>) {
|
||||||
|
auto index_col =
|
||||||
|
raw_data.get_col(schema_->get_field_id(FieldName("index")))
|
||||||
|
->scalars()
|
||||||
|
.float_data()
|
||||||
|
.data();
|
||||||
|
for (size_t i = 0; i < N_; i++) {
|
||||||
|
index_column_data_.push_back(index_col[i]);
|
||||||
|
}
|
||||||
|
} else if constexpr (std::is_same_v<T, double>) {
|
||||||
|
auto index_col =
|
||||||
|
raw_data.get_col(schema_->get_field_id(FieldName("index")))
|
||||||
|
->scalars()
|
||||||
|
.double_data()
|
||||||
|
.data();
|
||||||
|
for (size_t i = 0; i < N_; i++) {
|
||||||
|
index_column_data_.push_back(index_col[i]);
|
||||||
|
}
|
||||||
|
} else if constexpr (std::is_same_v<T, std::string>) {
|
||||||
|
auto index_col =
|
||||||
|
raw_data.get_col(schema_->get_field_id(FieldName("index")))
|
||||||
|
->scalars()
|
||||||
|
.string_data()
|
||||||
|
.data();
|
||||||
|
for (size_t i = 0; i < N_; i++) {
|
||||||
|
index_column_data_.push_back(index_col[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
SealedLoadFieldData(raw_data, *seg_);
|
||||||
|
LoadInvertedIndex();
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
TearDown() override {
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
LoadInvertedIndex() {
|
||||||
|
auto index = std::make_unique<index::InvertedIndexTantivy<T>>();
|
||||||
|
Config cfg;
|
||||||
|
cfg[milvus::index::SCALAR_INDEX_ENGINE_VERSION] = 0;
|
||||||
|
index->BuildWithRawDataForUT(N_, index_column_data_.data(), cfg);
|
||||||
|
LoadIndexInfo info{
|
||||||
|
.field_id = schema_->get_field_id(FieldName("index")).get(),
|
||||||
|
.index = std::move(index),
|
||||||
|
};
|
||||||
|
seg_->LoadIndex(info);
|
||||||
|
}
|
||||||
|
|
||||||
|
T
|
||||||
|
FieldValueAt(int64_t offset) {
|
||||||
|
return index_column_data_[offset];
|
||||||
|
}
|
||||||
|
|
||||||
|
public:
|
||||||
|
SchemaPtr schema_;
|
||||||
|
SegmentSealedUPtr seg_;
|
||||||
|
int64_t N_;
|
||||||
|
boost::container::vector<T> index_column_data_;
|
||||||
|
};
|
||||||
|
|
||||||
|
TYPED_TEST_SUITE_P(BuildInvertedIndexWithSingleSegmentTest);
|
||||||
|
|
||||||
|
TYPED_TEST_P(BuildInvertedIndexWithSingleSegmentTest,
|
||||||
|
ReadFromSingleSegmentIndex) {
|
||||||
|
const auto& meta = this->schema_->operator[](FieldName("index"));
|
||||||
|
for (size_t i = 0; i < 10; i++) {
|
||||||
|
auto column_info = test::GenColumnInfo(
|
||||||
|
meta.get_id().get(),
|
||||||
|
static_cast<proto::schema::DataType>(meta.get_data_type()),
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
static_cast<proto::schema::DataType>(meta.get_element_type()));
|
||||||
|
|
||||||
|
std::random_device rd;
|
||||||
|
std::mt19937 gen(rd());
|
||||||
|
|
||||||
|
std::uniform_int_distribution<> int_dist(1, this->N_);
|
||||||
|
int random_idx = int_dist(gen) - 1;
|
||||||
|
|
||||||
|
auto unary_range_expr = std::make_unique<proto::plan::UnaryRangeExpr>();
|
||||||
|
unary_range_expr->set_allocated_column_info(column_info);
|
||||||
|
unary_range_expr->set_op(proto::plan::OpType::Equal);
|
||||||
|
auto val = this->FieldValueAt(random_idx);
|
||||||
|
unary_range_expr->set_allocated_value(test::GenGenericValue(val));
|
||||||
|
|
||||||
|
auto expr = test::GenExpr();
|
||||||
|
expr->set_allocated_unary_range_expr(unary_range_expr.release());
|
||||||
|
auto parser = ProtoParser(*this->schema_);
|
||||||
|
auto typed_expr = parser.ParseExprs(*expr);
|
||||||
|
auto parsed = std::make_shared<plan::FilterBitsNode>(
|
||||||
|
DEFAULT_PLANNODE_ID, typed_expr);
|
||||||
|
auto segpromote = dynamic_cast<SegmentSealedImpl*>(this->seg_.get());
|
||||||
|
BitsetType final;
|
||||||
|
final = ExecuteQueryExpr(parsed, segpromote, this->N_, MAX_TIMESTAMP);
|
||||||
|
auto ref = [this, random_idx](size_t offset) -> bool {
|
||||||
|
return this->index_column_data_[offset] ==
|
||||||
|
this->index_column_data_[random_idx];
|
||||||
|
};
|
||||||
|
ASSERT_EQ(final.size(), this->N_);
|
||||||
|
for (size_t i = 0; i < this->N_; i++) {
|
||||||
|
if (std::is_floating_point_v<decltype(val)> && i == random_idx) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
ASSERT_EQ(final[i], ref(i))
|
||||||
|
<< "i: " << i << ", final[i]: " << final[i]
|
||||||
|
<< ", ref(i): " << ref(i) << ", random_idx: " << random_idx
|
||||||
|
<< ", value: " << this->index_column_data_[random_idx]
|
||||||
|
<< ", value: " << this->index_column_data_[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
REGISTER_TYPED_TEST_CASE_P(BuildInvertedIndexWithSingleSegmentTest,
|
||||||
|
ReadFromSingleSegmentIndex);
|
||||||
|
|
||||||
|
using ElementType = testing::
|
||||||
|
Types<bool, int8_t, int16_t, int32_t, int64_t, float, double, std::string>;
|
||||||
|
INSTANTIATE_TYPED_TEST_SUITE_P(Naive,
|
||||||
|
BuildInvertedIndexWithSingleSegmentTest,
|
||||||
|
ElementType);
|
|
@ -354,7 +354,7 @@ TEST_F(TestChunkSegment, TestCompareExpr) {
|
||||||
data.begin() + i * test_data_count);
|
data.begin() + i * test_data_count);
|
||||||
}
|
}
|
||||||
|
|
||||||
index->BuildWithRawData(data.size(), data.data());
|
index->BuildWithRawDataForUT(data.size(), data.data());
|
||||||
segcore::LoadIndexInfo load_index_info;
|
segcore::LoadIndexInfo load_index_info;
|
||||||
load_index_info.index = std::move(index);
|
load_index_info.index = std::move(index);
|
||||||
load_index_info.field_id = fid.get();
|
load_index_info.field_id = fid.get();
|
||||||
|
|
|
@ -255,7 +255,7 @@ class SealedSegmentRegexQueryTest : public ::testing::Test {
|
||||||
auto index = index::CreateStringIndexSort();
|
auto index = index::CreateStringIndexSort();
|
||||||
std::vector<uint8_t> buffer(arr.ByteSize());
|
std::vector<uint8_t> buffer(arr.ByteSize());
|
||||||
ASSERT_TRUE(arr.SerializeToArray(buffer.data(), arr.ByteSize()));
|
ASSERT_TRUE(arr.SerializeToArray(buffer.data(), arr.ByteSize()));
|
||||||
index->BuildWithRawData(arr.ByteSize(), buffer.data());
|
index->BuildWithRawDataForUT(arr.ByteSize(), buffer.data());
|
||||||
LoadIndexInfo info{
|
LoadIndexInfo info{
|
||||||
.field_id = schema->get_field_id(FieldName("str")).get(),
|
.field_id = schema->get_field_id(FieldName("str")).get(),
|
||||||
.index = std::move(index),
|
.index = std::move(index),
|
||||||
|
@ -264,7 +264,7 @@ class SealedSegmentRegexQueryTest : public ::testing::Test {
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
auto index = index::CreateScalarIndexSort<int64_t>();
|
auto index = index::CreateScalarIndexSort<int64_t>();
|
||||||
index->BuildWithRawData(N, raw_int.data());
|
index->BuildWithRawDataForUT(N, raw_int.data());
|
||||||
LoadIndexInfo info{
|
LoadIndexInfo info{
|
||||||
.field_id =
|
.field_id =
|
||||||
schema->get_field_id(FieldName("another_int64")).get(),
|
schema->get_field_id(FieldName("another_int64")).get(),
|
||||||
|
@ -278,7 +278,7 @@ class SealedSegmentRegexQueryTest : public ::testing::Test {
|
||||||
LoadInvertedIndex() {
|
LoadInvertedIndex() {
|
||||||
auto index =
|
auto index =
|
||||||
std::make_unique<index::InvertedIndexTantivy<std::string>>();
|
std::make_unique<index::InvertedIndexTantivy<std::string>>();
|
||||||
index->BuildWithRawData(N, raw_str.data());
|
index->BuildWithRawDataForUT(N, raw_str.data());
|
||||||
LoadIndexInfo info{
|
LoadIndexInfo info{
|
||||||
.field_id = schema->get_field_id(FieldName("str")).get(),
|
.field_id = schema->get_field_id(FieldName("str")).get(),
|
||||||
.index = std::move(index),
|
.index = std::move(index),
|
||||||
|
@ -295,7 +295,7 @@ class SealedSegmentRegexQueryTest : public ::testing::Test {
|
||||||
auto index = std::make_unique<MockStringIndex>();
|
auto index = std::make_unique<MockStringIndex>();
|
||||||
std::vector<uint8_t> buffer(arr.ByteSize());
|
std::vector<uint8_t> buffer(arr.ByteSize());
|
||||||
ASSERT_TRUE(arr.SerializeToArray(buffer.data(), arr.ByteSize()));
|
ASSERT_TRUE(arr.SerializeToArray(buffer.data(), arr.ByteSize()));
|
||||||
index->BuildWithRawData(arr.ByteSize(), buffer.data());
|
index->BuildWithRawDataForUT(arr.ByteSize(), buffer.data());
|
||||||
LoadIndexInfo info{
|
LoadIndexInfo info{
|
||||||
.field_id = schema->get_field_id(FieldName("str")).get(),
|
.field_id = schema->get_field_id(FieldName("str")).get(),
|
||||||
.index = std::move(index),
|
.index = std::move(index),
|
||||||
|
|
|
@ -151,8 +151,12 @@ TEST_F(StringIndexMarisaTest, Range) {
|
||||||
TEST_F(StringIndexMarisaTest, Reverse) {
|
TEST_F(StringIndexMarisaTest, Reverse) {
|
||||||
auto index_types = GetIndexTypes<std::string>();
|
auto index_types = GetIndexTypes<std::string>();
|
||||||
for (const auto& index_type : index_types) {
|
for (const auto& index_type : index_types) {
|
||||||
auto index = milvus::index::IndexFactory::GetInstance()
|
CreateIndexInfo create_index_info{
|
||||||
.CreatePrimitiveScalarIndex<std::string>(index_type);
|
.index_type = index_type,
|
||||||
|
};
|
||||||
|
auto index =
|
||||||
|
milvus::index::IndexFactory::GetInstance()
|
||||||
|
.CreatePrimitiveScalarIndex<std::string>(create_index_info);
|
||||||
index->Build(nb, strs.data());
|
index->Build(nb, strs.data());
|
||||||
assert_reverse<std::string>(index.get(), strs);
|
assert_reverse<std::string>(index.get(), strs);
|
||||||
}
|
}
|
||||||
|
@ -311,7 +315,7 @@ TEST_F(StringIndexMarisaTest, BaseIndexCodec) {
|
||||||
*str_arr.mutable_data() = {strings.begin(), strings.end()};
|
*str_arr.mutable_data() = {strings.begin(), strings.end()};
|
||||||
std::vector<uint8_t> data(str_arr.ByteSizeLong(), 0);
|
std::vector<uint8_t> data(str_arr.ByteSizeLong(), 0);
|
||||||
str_arr.SerializeToArray(data.data(), str_arr.ByteSizeLong());
|
str_arr.SerializeToArray(data.data(), str_arr.ByteSizeLong());
|
||||||
index->BuildWithRawData(str_arr.ByteSizeLong(), data.data());
|
index->BuildWithRawDataForUT(str_arr.ByteSizeLong(), data.data());
|
||||||
|
|
||||||
std::vector<std::string> invalid_strings = {std::to_string(nb)};
|
std::vector<std::string> invalid_strings = {std::to_string(nb)};
|
||||||
auto copy_index = milvus::index::CreateStringIndexMarisa();
|
auto copy_index = milvus::index::CreateStringIndexMarisa();
|
||||||
|
|
Loading…
Reference in New Issue