mirror of https://github.com/milvus-io/milvus.git
fix: enable to build index with single segment (#39233)
fix https://github.com/milvus-io/milvus/issues/39232 --------- Signed-off-by: SpadeA-Tang <tangchenjie1210@gmail.com>pull/39079/head^2
parent
bca2a62b78
commit
8c4ba70a4c
|
@ -45,9 +45,9 @@ class IndexBase {
|
|||
Load(milvus::tracer::TraceContext ctx, const Config& config = {}) = 0;
|
||||
|
||||
virtual void
|
||||
BuildWithRawData(size_t n,
|
||||
const void* values,
|
||||
const Config& config = {}) = 0;
|
||||
BuildWithRawDataForUT(size_t n,
|
||||
const void* values,
|
||||
const Config& config = {}) = 0;
|
||||
|
||||
virtual void
|
||||
BuildWithDataset(const DatasetPtr& dataset, const Config& config = {}) = 0;
|
||||
|
|
|
@ -35,10 +35,14 @@ namespace milvus::index {
|
|||
template <typename T>
|
||||
ScalarIndexPtr<T>
|
||||
IndexFactory::CreatePrimitiveScalarIndex(
|
||||
const IndexType& index_type,
|
||||
const CreateIndexInfo& create_index_info,
|
||||
const storage::FileManagerContext& file_manager_context) {
|
||||
auto index_type = create_index_info.index_type;
|
||||
if (index_type == INVERTED_INDEX_TYPE) {
|
||||
return std::make_unique<InvertedIndexTantivy<T>>(file_manager_context);
|
||||
// scalar_index_engine_version 0 means we should built tantivy index within single segment
|
||||
return std::make_unique<InvertedIndexTantivy<T>>(
|
||||
file_manager_context,
|
||||
create_index_info.scalar_index_engine_version == 0);
|
||||
}
|
||||
if (index_type == BITMAP_INDEX_TYPE) {
|
||||
return std::make_unique<BitmapIndex<T>>(file_manager_context);
|
||||
|
@ -59,12 +63,15 @@ IndexFactory::CreatePrimitiveScalarIndex(
|
|||
template <>
|
||||
ScalarIndexPtr<std::string>
|
||||
IndexFactory::CreatePrimitiveScalarIndex<std::string>(
|
||||
const IndexType& index_type,
|
||||
const CreateIndexInfo& create_index_info,
|
||||
const storage::FileManagerContext& file_manager_context) {
|
||||
auto index_type = create_index_info.index_type;
|
||||
#if defined(__linux__) || defined(__APPLE__)
|
||||
if (index_type == INVERTED_INDEX_TYPE) {
|
||||
// scalar_index_engine_version 0 means we should built tantivy index within single segment
|
||||
return std::make_unique<InvertedIndexTantivy<std::string>>(
|
||||
file_manager_context);
|
||||
file_manager_context,
|
||||
create_index_info.scalar_index_engine_version == 0);
|
||||
}
|
||||
if (index_type == BITMAP_INDEX_TYPE) {
|
||||
return std::make_unique<BitmapIndex<std::string>>(file_manager_context);
|
||||
|
@ -294,37 +301,37 @@ IndexFactory::CreateIndex(
|
|||
IndexBasePtr
|
||||
IndexFactory::CreatePrimitiveScalarIndex(
|
||||
DataType data_type,
|
||||
IndexType index_type,
|
||||
const CreateIndexInfo& create_index_info,
|
||||
const storage::FileManagerContext& file_manager_context) {
|
||||
switch (data_type) {
|
||||
// create scalar index
|
||||
case DataType::BOOL:
|
||||
return CreatePrimitiveScalarIndex<bool>(index_type,
|
||||
return CreatePrimitiveScalarIndex<bool>(create_index_info,
|
||||
file_manager_context);
|
||||
case DataType::INT8:
|
||||
return CreatePrimitiveScalarIndex<int8_t>(index_type,
|
||||
return CreatePrimitiveScalarIndex<int8_t>(create_index_info,
|
||||
file_manager_context);
|
||||
case DataType::INT16:
|
||||
return CreatePrimitiveScalarIndex<int16_t>(index_type,
|
||||
return CreatePrimitiveScalarIndex<int16_t>(create_index_info,
|
||||
file_manager_context);
|
||||
case DataType::INT32:
|
||||
return CreatePrimitiveScalarIndex<int32_t>(index_type,
|
||||
return CreatePrimitiveScalarIndex<int32_t>(create_index_info,
|
||||
file_manager_context);
|
||||
case DataType::INT64:
|
||||
return CreatePrimitiveScalarIndex<int64_t>(index_type,
|
||||
return CreatePrimitiveScalarIndex<int64_t>(create_index_info,
|
||||
file_manager_context);
|
||||
case DataType::FLOAT:
|
||||
return CreatePrimitiveScalarIndex<float>(index_type,
|
||||
return CreatePrimitiveScalarIndex<float>(create_index_info,
|
||||
file_manager_context);
|
||||
case DataType::DOUBLE:
|
||||
return CreatePrimitiveScalarIndex<double>(index_type,
|
||||
return CreatePrimitiveScalarIndex<double>(create_index_info,
|
||||
file_manager_context);
|
||||
|
||||
// create string index
|
||||
case DataType::STRING:
|
||||
case DataType::VARCHAR:
|
||||
return CreatePrimitiveScalarIndex<std::string>(
|
||||
index_type, file_manager_context);
|
||||
create_index_info, file_manager_context);
|
||||
default:
|
||||
PanicInfo(
|
||||
DataTypeInvalid,
|
||||
|
@ -334,14 +341,15 @@ IndexFactory::CreatePrimitiveScalarIndex(
|
|||
|
||||
IndexBasePtr
|
||||
IndexFactory::CreateCompositeScalarIndex(
|
||||
IndexType index_type,
|
||||
const CreateIndexInfo& create_index_info,
|
||||
const storage::FileManagerContext& file_manager_context) {
|
||||
auto index_type = create_index_info.index_type;
|
||||
if (index_type == HYBRID_INDEX_TYPE || index_type == BITMAP_INDEX_TYPE ||
|
||||
index_type == INVERTED_INDEX_TYPE) {
|
||||
auto element_type = static_cast<DataType>(
|
||||
file_manager_context.fieldDataMeta.field_schema.element_type());
|
||||
return CreatePrimitiveScalarIndex(
|
||||
element_type, index_type, file_manager_context);
|
||||
element_type, create_index_info, file_manager_context);
|
||||
} else {
|
||||
PanicInfo(
|
||||
Unsupported,
|
||||
|
@ -373,9 +381,9 @@ IndexFactory::CreateScalarIndex(
|
|||
case DataType::VARCHAR:
|
||||
case DataType::STRING:
|
||||
return CreatePrimitiveScalarIndex(
|
||||
data_type, create_index_info.index_type, file_manager_context);
|
||||
data_type, create_index_info, file_manager_context);
|
||||
case DataType::ARRAY: {
|
||||
return CreateCompositeScalarIndex(create_index_info.index_type,
|
||||
return CreateCompositeScalarIndex(create_index_info,
|
||||
file_manager_context);
|
||||
}
|
||||
case DataType::JSON: {
|
||||
|
|
|
@ -85,14 +85,14 @@ class IndexFactory {
|
|||
IndexBasePtr
|
||||
CreatePrimitiveScalarIndex(
|
||||
DataType data_type,
|
||||
IndexType index_type,
|
||||
const CreateIndexInfo& create_index_info,
|
||||
const storage::FileManagerContext& file_manager_context =
|
||||
storage::FileManagerContext());
|
||||
|
||||
// For types like array, struct, union, etc
|
||||
IndexBasePtr
|
||||
CreateCompositeScalarIndex(
|
||||
IndexType index_type,
|
||||
const CreateIndexInfo& create_index_info,
|
||||
const storage::FileManagerContext& file_manager_context =
|
||||
storage::FileManagerContext());
|
||||
|
||||
|
@ -115,7 +115,7 @@ class IndexFactory {
|
|||
|
||||
template <typename T>
|
||||
ScalarIndexPtr<T>
|
||||
CreatePrimitiveScalarIndex(const IndexType& index_type,
|
||||
CreatePrimitiveScalarIndex(const CreateIndexInfo& create_index_info,
|
||||
const storage::FileManagerContext& file_manager =
|
||||
storage::FileManagerContext());
|
||||
};
|
||||
|
|
|
@ -26,6 +26,7 @@ struct CreateIndexInfo {
|
|||
IndexVersion index_engine_version;
|
||||
std::string field_name;
|
||||
int64_t dim;
|
||||
int32_t scalar_index_engine_version;
|
||||
};
|
||||
|
||||
} // namespace milvus::index
|
||||
|
|
|
@ -84,14 +84,15 @@ InvertedIndexTantivy<T>::InitForBuildIndex() {
|
|||
path_);
|
||||
}
|
||||
wrapper_ = std::make_shared<TantivyIndexWrapper>(
|
||||
field.c_str(), d_type_, path_.c_str());
|
||||
field.c_str(), d_type_, path_.c_str(), inverted_index_single_segment_);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
InvertedIndexTantivy<T>::InvertedIndexTantivy(
|
||||
const storage::FileManagerContext& ctx)
|
||||
const storage::FileManagerContext& ctx, bool inverted_index_single_segment)
|
||||
: ScalarIndex<T>(INVERTED_INDEX_TYPE),
|
||||
schema_(ctx.fieldDataMeta.field_schema) {
|
||||
schema_(ctx.fieldDataMeta.field_schema),
|
||||
inverted_index_single_segment_(inverted_index_single_segment) {
|
||||
mem_file_manager_ = std::make_shared<MemFileManager>(ctx);
|
||||
disk_file_manager_ = std::make_shared<DiskFileManager>(ctx);
|
||||
// push init wrapper to load process
|
||||
|
@ -387,9 +388,9 @@ InvertedIndexTantivy<T>::RegexQuery(const std::string& regex_pattern) {
|
|||
|
||||
template <typename T>
|
||||
void
|
||||
InvertedIndexTantivy<T>::BuildWithRawData(size_t n,
|
||||
const void* values,
|
||||
const Config& config) {
|
||||
InvertedIndexTantivy<T>::BuildWithRawDataForUT(size_t n,
|
||||
const void* values,
|
||||
const Config& config) {
|
||||
if constexpr (std::is_same_v<bool, T>) {
|
||||
schema_.set_data_type(proto::schema::DataType::Bool);
|
||||
}
|
||||
|
@ -421,16 +422,35 @@ InvertedIndexTantivy<T>::BuildWithRawData(size_t n,
|
|||
boost::filesystem::create_directories(path_);
|
||||
d_type_ = get_tantivy_data_type(schema_);
|
||||
std::string field = "test_inverted_index";
|
||||
inverted_index_single_segment_ =
|
||||
GetValueFromConfig<int32_t>(config,
|
||||
milvus::index::SCALAR_INDEX_ENGINE_VERSION)
|
||||
.value_or(1) == 0;
|
||||
wrapper_ = std::make_shared<TantivyIndexWrapper>(
|
||||
field.c_str(), d_type_, path_.c_str());
|
||||
if (config.find("is_array") != config.end()) {
|
||||
// only used in ut.
|
||||
auto arr = static_cast<const boost::container::vector<T>*>(values);
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
wrapper_->template add_multi_data(arr[i].data(), arr[i].size(), i);
|
||||
field.c_str(), d_type_, path_.c_str(), inverted_index_single_segment_);
|
||||
if (!inverted_index_single_segment_) {
|
||||
if (config.find("is_array") != config.end()) {
|
||||
// only used in ut.
|
||||
auto arr = static_cast<const boost::container::vector<T>*>(values);
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
wrapper_->template add_multi_data(
|
||||
arr[i].data(), arr[i].size(), i);
|
||||
}
|
||||
} else {
|
||||
wrapper_->add_data<T>(static_cast<const T*>(values), n, 0);
|
||||
}
|
||||
} else {
|
||||
wrapper_->add_data<T>(static_cast<const T*>(values), n, 0);
|
||||
if (config.find("is_array") != config.end()) {
|
||||
// only used in ut.
|
||||
auto arr = static_cast<const boost::container::vector<T>*>(values);
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
wrapper_->template add_multi_data_by_single_segment_writer(
|
||||
arr[i].data(), arr[i].size());
|
||||
}
|
||||
} else {
|
||||
wrapper_->add_data_by_single_segment_writer<T>(
|
||||
static_cast<const T*>(values), n);
|
||||
}
|
||||
}
|
||||
wrapper_->create_reader();
|
||||
finish();
|
||||
|
@ -458,26 +478,48 @@ InvertedIndexTantivy<T>::BuildWithFieldData(
|
|||
case proto::schema::DataType::Double:
|
||||
case proto::schema::DataType::String:
|
||||
case proto::schema::DataType::VarChar: {
|
||||
int64_t offset = 0;
|
||||
if (schema_.nullable()) {
|
||||
for (const auto& data : field_datas) {
|
||||
auto n = data->get_num_rows();
|
||||
for (int i = 0; i < n; i++) {
|
||||
if (!data->is_valid(i)) {
|
||||
null_offset.push_back(i);
|
||||
// Generally, we will not build inverted index with single segment except for building index
|
||||
// for query node with older version(2.4). See more comments above `inverted_index_single_segment_`.
|
||||
if (!inverted_index_single_segment_) {
|
||||
int64_t offset = 0;
|
||||
if (schema_.nullable()) {
|
||||
for (const auto& data : field_datas) {
|
||||
auto n = data->get_num_rows();
|
||||
for (int i = 0; i < n; i++) {
|
||||
if (!data->is_valid(i)) {
|
||||
null_offset.push_back(i);
|
||||
}
|
||||
wrapper_->add_multi_data<T>(
|
||||
static_cast<const T*>(data->RawValue(i)),
|
||||
data->is_valid(i),
|
||||
offset++);
|
||||
}
|
||||
wrapper_->add_multi_data<T>(
|
||||
static_cast<const T*>(data->RawValue(i)),
|
||||
data->is_valid(i),
|
||||
offset++);
|
||||
}
|
||||
} else {
|
||||
for (const auto& data : field_datas) {
|
||||
auto n = data->get_num_rows();
|
||||
wrapper_->add_data<T>(
|
||||
static_cast<const T*>(data->Data()), n, offset);
|
||||
offset += n;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (const auto& data : field_datas) {
|
||||
auto n = data->get_num_rows();
|
||||
wrapper_->add_data<T>(
|
||||
static_cast<const T*>(data->Data()), n, offset);
|
||||
offset += n;
|
||||
if (schema_.nullable()) {
|
||||
for (int i = 0; i < n; i++) {
|
||||
if (!data->is_valid(i)) {
|
||||
null_offset.push_back(i);
|
||||
}
|
||||
wrapper_
|
||||
->add_multi_data_by_single_segment_writer<T>(
|
||||
static_cast<const T*>(data->RawValue(i)),
|
||||
data->is_valid(i));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
wrapper_->add_data_by_single_segment_writer<T>(
|
||||
static_cast<const T*>(data->Data()), n);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
@ -508,10 +550,15 @@ InvertedIndexTantivy<T>::build_index_for_array(
|
|||
null_offset.push_back(i);
|
||||
}
|
||||
auto length = data->is_valid(i) ? array_column[i].length() : 0;
|
||||
wrapper_->template add_multi_data(
|
||||
reinterpret_cast<const T*>(array_column[i].data()),
|
||||
length,
|
||||
offset++);
|
||||
if (!inverted_index_single_segment_) {
|
||||
wrapper_->template add_multi_data(
|
||||
reinterpret_cast<const T*>(array_column[i].data()),
|
||||
length,
|
||||
offset++);
|
||||
} else {
|
||||
wrapper_->template add_multi_data_by_single_segment_writer(
|
||||
reinterpret_cast<const T*>(array_column[i].data()), length);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -537,7 +584,13 @@ InvertedIndexTantivy<std::string>::build_index_for_array(
|
|||
array_column[i].template get_data<std::string>(j));
|
||||
}
|
||||
auto length = data->is_valid(i) ? output.size() : 0;
|
||||
wrapper_->template add_multi_data(output.data(), length, offset++);
|
||||
if (!inverted_index_single_segment_) {
|
||||
wrapper_->template add_multi_data(
|
||||
output.data(), length, offset++);
|
||||
} else {
|
||||
wrapper_->template add_multi_data_by_single_segment_writer(
|
||||
output.data(), length);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -38,7 +38,8 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
|
|||
InvertedIndexTantivy() : ScalarIndex<T>(INVERTED_INDEX_TYPE) {
|
||||
}
|
||||
|
||||
explicit InvertedIndexTantivy(const storage::FileManagerContext& ctx);
|
||||
explicit InvertedIndexTantivy(const storage::FileManagerContext& ctx,
|
||||
bool inverted_index_single_segment = false);
|
||||
|
||||
~InvertedIndexTantivy();
|
||||
|
||||
|
@ -80,11 +81,11 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
|
|||
return wrapper_->count();
|
||||
}
|
||||
|
||||
// BuildWithRawData should be only used in ut. Only string is supported.
|
||||
// BuildWithRawDataForUT should be only used in ut. Only string is supported.
|
||||
void
|
||||
BuildWithRawData(size_t n,
|
||||
const void* values,
|
||||
const Config& config = {}) override;
|
||||
BuildWithRawDataForUT(size_t n,
|
||||
const void* values,
|
||||
const Config& config = {}) override;
|
||||
|
||||
BinarySet
|
||||
Serialize(const Config& config) override;
|
||||
|
@ -205,5 +206,14 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
|
|||
// all data need to be built to align the offset
|
||||
// so need to store null_offset in inverted index additionally
|
||||
std::vector<size_t> null_offset{};
|
||||
|
||||
// `inverted_index_single_segment_` is used to control whether to build tantivy index with single segment.
|
||||
//
|
||||
// In the older version of milvus, the query node can only read tantivy index built whtin single segment
|
||||
// where the newer version builds and reads index of multi segments by default.
|
||||
// However, the index may be built from a separate node from the query node where the index buliding node is a
|
||||
// new version while the query node is a older version. So we have this `inverted_index_single_segment_` to control the index
|
||||
// building node to build specific type of tantivy index.
|
||||
bool inverted_index_single_segment_{false};
|
||||
};
|
||||
} // namespace milvus::index
|
||||
|
|
|
@ -46,6 +46,8 @@ constexpr const char* MARISA_TRIE_UPPER = "TRIE";
|
|||
constexpr const char* INVERTED_INDEX_TYPE = "INVERTED";
|
||||
constexpr const char* BITMAP_INDEX_TYPE = "BITMAP";
|
||||
constexpr const char* HYBRID_INDEX_TYPE = "HYBRID";
|
||||
constexpr const char* SCALAR_INDEX_ENGINE_VERSION =
|
||||
"scalar_index_engine_version";
|
||||
|
||||
// index meta
|
||||
constexpr const char* COLLECTION_ID = "collection_id";
|
||||
|
|
|
@ -72,9 +72,9 @@ ScalarIndex<T>::Query(const DatasetPtr& dataset) {
|
|||
|
||||
template <>
|
||||
void
|
||||
ScalarIndex<std::string>::BuildWithRawData(size_t n,
|
||||
const void* values,
|
||||
const Config& config) {
|
||||
ScalarIndex<std::string>::BuildWithRawDataForUT(size_t n,
|
||||
const void* values,
|
||||
const Config& config) {
|
||||
proto::schema::StringArray arr;
|
||||
auto ok = arr.ParseFromArray(values, n);
|
||||
Assert(ok);
|
||||
|
@ -86,9 +86,9 @@ ScalarIndex<std::string>::BuildWithRawData(size_t n,
|
|||
|
||||
template <>
|
||||
void
|
||||
ScalarIndex<bool>::BuildWithRawData(size_t n,
|
||||
const void* values,
|
||||
const Config& config) {
|
||||
ScalarIndex<bool>::BuildWithRawDataForUT(size_t n,
|
||||
const void* values,
|
||||
const Config& config) {
|
||||
proto::schema::BoolArray arr;
|
||||
auto ok = arr.ParseFromArray(values, n);
|
||||
Assert(ok);
|
||||
|
@ -97,54 +97,54 @@ ScalarIndex<bool>::BuildWithRawData(size_t n,
|
|||
|
||||
template <>
|
||||
void
|
||||
ScalarIndex<int8_t>::BuildWithRawData(size_t n,
|
||||
const void* values,
|
||||
const Config& config) {
|
||||
ScalarIndex<int8_t>::BuildWithRawDataForUT(size_t n,
|
||||
const void* values,
|
||||
const Config& config) {
|
||||
auto data = reinterpret_cast<int8_t*>(const_cast<void*>(values));
|
||||
Build(n, data);
|
||||
}
|
||||
|
||||
template <>
|
||||
void
|
||||
ScalarIndex<int16_t>::BuildWithRawData(size_t n,
|
||||
const void* values,
|
||||
const Config& config) {
|
||||
ScalarIndex<int16_t>::BuildWithRawDataForUT(size_t n,
|
||||
const void* values,
|
||||
const Config& config) {
|
||||
auto data = reinterpret_cast<int16_t*>(const_cast<void*>(values));
|
||||
Build(n, data);
|
||||
}
|
||||
|
||||
template <>
|
||||
void
|
||||
ScalarIndex<int32_t>::BuildWithRawData(size_t n,
|
||||
const void* values,
|
||||
const Config& config) {
|
||||
ScalarIndex<int32_t>::BuildWithRawDataForUT(size_t n,
|
||||
const void* values,
|
||||
const Config& config) {
|
||||
auto data = reinterpret_cast<int32_t*>(const_cast<void*>(values));
|
||||
Build(n, data);
|
||||
}
|
||||
|
||||
template <>
|
||||
void
|
||||
ScalarIndex<int64_t>::BuildWithRawData(size_t n,
|
||||
const void* values,
|
||||
const Config& config) {
|
||||
ScalarIndex<int64_t>::BuildWithRawDataForUT(size_t n,
|
||||
const void* values,
|
||||
const Config& config) {
|
||||
auto data = reinterpret_cast<int64_t*>(const_cast<void*>(values));
|
||||
Build(n, data);
|
||||
}
|
||||
|
||||
template <>
|
||||
void
|
||||
ScalarIndex<float>::BuildWithRawData(size_t n,
|
||||
const void* values,
|
||||
const Config& config) {
|
||||
ScalarIndex<float>::BuildWithRawDataForUT(size_t n,
|
||||
const void* values,
|
||||
const Config& config) {
|
||||
auto data = reinterpret_cast<float*>(const_cast<void*>(values));
|
||||
Build(n, data);
|
||||
}
|
||||
|
||||
template <>
|
||||
void
|
||||
ScalarIndex<double>::BuildWithRawData(size_t n,
|
||||
const void* values,
|
||||
const Config& config) {
|
||||
ScalarIndex<double>::BuildWithRawDataForUT(size_t n,
|
||||
const void* values,
|
||||
const Config& config) {
|
||||
auto data = reinterpret_cast<double*>(const_cast<void*>(values));
|
||||
Build(n, data);
|
||||
}
|
||||
|
|
|
@ -65,9 +65,9 @@ class ScalarIndex : public IndexBase {
|
|||
}
|
||||
|
||||
void
|
||||
BuildWithRawData(size_t n,
|
||||
const void* values,
|
||||
const Config& config = {}) override;
|
||||
BuildWithRawDataForUT(size_t n,
|
||||
const void* values,
|
||||
const Config& config = {}) override;
|
||||
|
||||
void
|
||||
BuildWithDataset(const DatasetPtr& dataset,
|
||||
|
|
|
@ -42,9 +42,9 @@ class VectorIndex : public IndexBase {
|
|||
|
||||
public:
|
||||
void
|
||||
BuildWithRawData(size_t n,
|
||||
const void* values,
|
||||
const Config& config = {}) override {
|
||||
BuildWithRawDataForUT(size_t n,
|
||||
const void* values,
|
||||
const Config& config = {}) override {
|
||||
PanicInfo(Unsupported,
|
||||
"vector index don't support build index with raw data");
|
||||
};
|
||||
|
|
|
@ -30,6 +30,13 @@ ScalarIndexCreator::ScalarIndexCreator(
|
|||
if (config.contains("index_type")) {
|
||||
index_type_ = config.at("index_type").get<std::string>();
|
||||
}
|
||||
// Config should have value for milvus::index::SCALAR_INDEX_ENGINE_VERSION for production calling chain.
|
||||
// Use value_or(1) for unit test without setting this value
|
||||
index_info.scalar_index_engine_version =
|
||||
milvus::index::GetValueFromConfig<int32_t>(
|
||||
config, milvus::index::SCALAR_INDEX_ENGINE_VERSION)
|
||||
.value_or(1);
|
||||
|
||||
index_info.field_type = dtype_;
|
||||
index_info.index_type = index_type();
|
||||
index_ = index::IndexFactory::GetInstance().CreateIndex(
|
||||
|
@ -40,7 +47,7 @@ void
|
|||
ScalarIndexCreator::Build(const milvus::DatasetPtr& dataset) {
|
||||
auto size = dataset->GetRows();
|
||||
auto data = dataset->GetTensor();
|
||||
index_->BuildWithRawData(size, data);
|
||||
index_->BuildWithRawDataForUT(size, data);
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
@ -166,30 +166,17 @@ CreateIndex(CIndex* res_index,
|
|||
auto field_type =
|
||||
static_cast<DataType>(build_index_info->field_schema().data_type());
|
||||
|
||||
milvus::index::CreateIndexInfo index_info;
|
||||
index_info.field_type = field_type;
|
||||
|
||||
auto storage_config =
|
||||
get_storage_config(build_index_info->storage_config());
|
||||
auto config = get_config(build_index_info);
|
||||
// get index type
|
||||
auto index_type = milvus::index::GetValueFromConfig<std::string>(
|
||||
config, "index_type");
|
||||
AssertInfo(index_type.has_value(), "index type is empty");
|
||||
index_info.index_type = index_type.value();
|
||||
|
||||
auto engine_version = build_index_info->current_index_version();
|
||||
index_info.index_engine_version = engine_version;
|
||||
config[milvus::index::INDEX_ENGINE_VERSION] =
|
||||
std::to_string(engine_version);
|
||||
|
||||
// get metric type
|
||||
if (milvus::IsVectorDataType(field_type)) {
|
||||
auto metric_type = milvus::index::GetValueFromConfig<std::string>(
|
||||
config, "metric_type");
|
||||
AssertInfo(metric_type.has_value(), "metric type is empty");
|
||||
index_info.metric_type = metric_type.value();
|
||||
}
|
||||
auto scalar_index_engine_version =
|
||||
build_index_info->current_scalar_index_version();
|
||||
config[milvus::index::SCALAR_INDEX_ENGINE_VERSION] =
|
||||
scalar_index_engine_version;
|
||||
|
||||
// init file manager
|
||||
milvus::storage::FieldDataMeta field_meta{
|
||||
|
|
|
@ -1193,6 +1193,7 @@ name = "tantivy-binding"
|
|||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"cbindgen",
|
||||
"either",
|
||||
"env_logger",
|
||||
"futures",
|
||||
"jieba-rs",
|
||||
|
|
|
@ -17,6 +17,7 @@ lazy_static = "1.4.0"
|
|||
serde_json = "1.0.128"
|
||||
jieba-rs = "0.6.8"
|
||||
regex = "1.11.1"
|
||||
either = "1.13.0"
|
||||
|
||||
[build-dependencies]
|
||||
cbindgen = "0.26.0"
|
||||
|
|
|
@ -149,6 +149,10 @@ RustResult tantivy_create_index(const char *field_name,
|
|||
uintptr_t num_threads,
|
||||
uintptr_t overall_memory_budget_in_bytes);
|
||||
|
||||
RustResult tantivy_create_index_with_single_segment(const char *field_name,
|
||||
TantivyDataType data_type,
|
||||
const char *path);
|
||||
|
||||
void tantivy_free_index_writer(void *ptr);
|
||||
|
||||
RustResult tantivy_finish_index(void *ptr);
|
||||
|
@ -162,78 +166,140 @@ RustResult tantivy_index_add_int8s(void *ptr,
|
|||
uintptr_t len,
|
||||
int64_t offset_begin);
|
||||
|
||||
RustResult tantivy_index_add_int8s_by_single_segment_writer(void *ptr,
|
||||
const int8_t *array,
|
||||
uintptr_t len);
|
||||
|
||||
RustResult tantivy_index_add_int16s(void *ptr,
|
||||
const int16_t *array,
|
||||
uintptr_t len,
|
||||
int64_t offset_begin);
|
||||
|
||||
RustResult tantivy_index_add_int16s_by_single_segment_writer(void *ptr,
|
||||
const int16_t *array,
|
||||
uintptr_t len);
|
||||
|
||||
RustResult tantivy_index_add_int32s(void *ptr,
|
||||
const int32_t *array,
|
||||
uintptr_t len,
|
||||
int64_t offset_begin);
|
||||
|
||||
RustResult tantivy_index_add_int32s_by_single_segment_writer(void *ptr,
|
||||
const int32_t *array,
|
||||
uintptr_t len);
|
||||
|
||||
RustResult tantivy_index_add_int64s(void *ptr,
|
||||
const int64_t *array,
|
||||
uintptr_t len,
|
||||
int64_t offset_begin);
|
||||
|
||||
RustResult tantivy_index_add_int64s_by_single_segment_writer(void *ptr,
|
||||
const int64_t *array,
|
||||
uintptr_t len);
|
||||
|
||||
RustResult tantivy_index_add_f32s(void *ptr,
|
||||
const float *array,
|
||||
uintptr_t len,
|
||||
int64_t offset_begin);
|
||||
|
||||
RustResult tantivy_index_add_f32s_by_single_segment_writer(void *ptr,
|
||||
const float *array,
|
||||
uintptr_t len);
|
||||
|
||||
RustResult tantivy_index_add_f64s(void *ptr,
|
||||
const double *array,
|
||||
uintptr_t len,
|
||||
int64_t offset_begin);
|
||||
|
||||
RustResult tantivy_index_add_f64s_by_single_segment_writer(void *ptr,
|
||||
const double *array,
|
||||
uintptr_t len);
|
||||
|
||||
RustResult tantivy_index_add_bools(void *ptr,
|
||||
const bool *array,
|
||||
uintptr_t len,
|
||||
int64_t offset_begin);
|
||||
|
||||
RustResult tantivy_index_add_bools_by_single_segment_writer(void *ptr,
|
||||
const bool *array,
|
||||
uintptr_t len);
|
||||
|
||||
RustResult tantivy_index_add_string(void *ptr, const char *s, int64_t offset);
|
||||
|
||||
RustResult tantivy_index_add_string_by_single_segment_writer(void *ptr, const char *s);
|
||||
|
||||
RustResult tantivy_index_add_multi_int8s(void *ptr,
|
||||
const int8_t *array,
|
||||
uintptr_t len,
|
||||
int64_t offset);
|
||||
|
||||
RustResult tantivy_index_add_multi_int8s_by_single_segment_writer(void *ptr,
|
||||
const int8_t *array,
|
||||
uintptr_t len);
|
||||
|
||||
RustResult tantivy_index_add_multi_int16s(void *ptr,
|
||||
const int16_t *array,
|
||||
uintptr_t len,
|
||||
int64_t offset);
|
||||
|
||||
RustResult tantivy_index_add_multi_int16s_by_single_segment_writer(void *ptr,
|
||||
const int16_t *array,
|
||||
uintptr_t len);
|
||||
|
||||
RustResult tantivy_index_add_multi_int32s(void *ptr,
|
||||
const int32_t *array,
|
||||
uintptr_t len,
|
||||
int64_t offset);
|
||||
|
||||
RustResult tantivy_index_add_multi_int32s_by_single_segment_writer(void *ptr,
|
||||
const int32_t *array,
|
||||
uintptr_t len);
|
||||
|
||||
RustResult tantivy_index_add_multi_int64s(void *ptr,
|
||||
const int64_t *array,
|
||||
uintptr_t len,
|
||||
int64_t offset);
|
||||
|
||||
RustResult tantivy_index_add_multi_int64s_by_single_segment_writer(void *ptr,
|
||||
const int64_t *array,
|
||||
uintptr_t len);
|
||||
|
||||
RustResult tantivy_index_add_multi_f32s(void *ptr,
|
||||
const float *array,
|
||||
uintptr_t len,
|
||||
int64_t offset);
|
||||
|
||||
RustResult tantivy_index_add_multi_f32s_by_single_segment_writer(void *ptr,
|
||||
const float *array,
|
||||
uintptr_t len);
|
||||
|
||||
RustResult tantivy_index_add_multi_f64s(void *ptr,
|
||||
const double *array,
|
||||
uintptr_t len,
|
||||
int64_t offset);
|
||||
|
||||
RustResult tantivy_index_add_multi_f64s_by_single_segment_writer(void *ptr,
|
||||
const double *array,
|
||||
uintptr_t len);
|
||||
|
||||
RustResult tantivy_index_add_multi_bools(void *ptr,
|
||||
const bool *array,
|
||||
uintptr_t len,
|
||||
int64_t offset);
|
||||
|
||||
RustResult tantivy_index_add_multi_bools_by_single_segment_writer(void *ptr,
|
||||
const bool *array,
|
||||
uintptr_t len);
|
||||
|
||||
RustResult tantivy_index_add_multi_keywords(void *ptr,
|
||||
const char *const *array,
|
||||
uintptr_t len,
|
||||
int64_t offset);
|
||||
|
||||
RustResult tantivy_index_add_multi_keywords_by_single_segment_writer(void *ptr,
|
||||
const char *const *array,
|
||||
uintptr_t len);
|
||||
|
||||
RustResult tantivy_create_text_writer(const char *field_name,
|
||||
const char *path,
|
||||
const char *tokenizer_name,
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
use std::default;
|
||||
use std::ffi::c_void;
|
||||
use std::ptr::null;
|
||||
|
||||
|
@ -132,19 +131,15 @@ pub extern "C" fn free_rust_result(result: RustResult) {
|
|||
}
|
||||
_ => {}
|
||||
}
|
||||
unsafe {
|
||||
if !result.error.is_null() {
|
||||
free_rust_string(result.error as *mut c_char);
|
||||
}
|
||||
if !result.error.is_null() {
|
||||
free_rust_string(result.error as *mut c_char);
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn free_rust_error(error: *const c_char) {
|
||||
unsafe {
|
||||
if !error.is_null() {
|
||||
free_rust_string(error as *mut c_char);
|
||||
}
|
||||
if !error.is_null() {
|
||||
free_rust_string(error as *mut c_char);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
#[repr(u8)]
|
||||
#[derive(Debug)]
|
||||
pub enum TantivyDataType {
|
||||
Text,
|
||||
Keyword,
|
||||
|
|
|
@ -1,7 +1,5 @@
|
|||
use core::{fmt, str};
|
||||
|
||||
use serde_json as json;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum TantivyBindingError {
|
||||
JsonError(serde_json::Error),
|
||||
|
|
|
@ -1,13 +1,9 @@
|
|||
use std::{
|
||||
ffi::{c_char, c_void, CStr},
|
||||
ptr::null,
|
||||
};
|
||||
use std::ffi::{c_char, c_void, CStr};
|
||||
|
||||
use crate::{
|
||||
array::{RustArray, RustResult},
|
||||
array::RustResult,
|
||||
cstr_to_str,
|
||||
index_reader::IndexReaderWrapper,
|
||||
string_c::create_string,
|
||||
util::{create_binding, free_binding},
|
||||
util_c::tantivy_index_exist,
|
||||
};
|
||||
|
|
|
@ -4,7 +4,7 @@ use tantivy::{
|
|||
Term,
|
||||
};
|
||||
|
||||
use crate::error::{Result, TantivyBindingError};
|
||||
use crate::error::Result;
|
||||
use crate::{index_reader::IndexReaderWrapper, tokenizer::standard_analyzer};
|
||||
|
||||
impl IndexReaderWrapper {
|
||||
|
|
|
@ -1,13 +1,9 @@
|
|||
use std::{ffi::CStr, ptr::null};
|
||||
use std::ffi::CStr;
|
||||
|
||||
use libc::{c_char, c_void};
|
||||
|
||||
use crate::{
|
||||
array::{RustArray, RustResult},
|
||||
cstr_to_str,
|
||||
index_reader::IndexReaderWrapper,
|
||||
log::init_log,
|
||||
string_c::{c_str_to_str, create_string},
|
||||
array::RustResult, cstr_to_str, index_reader::IndexReaderWrapper, log::init_log,
|
||||
tokenizer::create_tokenizer,
|
||||
};
|
||||
|
||||
|
|
|
@ -1,12 +1,14 @@
|
|||
use std::ffi::CStr;
|
||||
use std::sync::Arc;
|
||||
|
||||
use either::Either;
|
||||
use futures::executor::block_on;
|
||||
use libc::c_char;
|
||||
use log::info;
|
||||
use tantivy::schema::{
|
||||
Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST, INDEXED,
|
||||
Field, IndexRecordOption, Schema, SchemaBuilder, TextFieldIndexing, TextOptions, FAST, INDEXED,
|
||||
};
|
||||
use tantivy::{doc, tokenizer, Document, Index, IndexWriter};
|
||||
use tantivy::{doc, Document, Index, IndexWriter, SingleSegmentIndexWriter};
|
||||
|
||||
use crate::data_type::TantivyDataType;
|
||||
|
||||
|
@ -16,11 +18,34 @@ use crate::log::init_log;
|
|||
|
||||
pub(crate) struct IndexWriterWrapper {
|
||||
pub(crate) field: Field,
|
||||
pub(crate) index_writer: IndexWriter,
|
||||
pub(crate) id_field: Field,
|
||||
pub(crate) index_writer: Either<IndexWriter, SingleSegmentIndexWriter>,
|
||||
pub(crate) id_field: Option<Field>,
|
||||
pub(crate) index: Arc<Index>,
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn schema_builder_add_field(
|
||||
schema_builder: &mut SchemaBuilder,
|
||||
field_name: &str,
|
||||
data_type: TantivyDataType,
|
||||
) -> Field {
|
||||
match data_type {
|
||||
TantivyDataType::I64 => schema_builder.add_i64_field(field_name, INDEXED),
|
||||
TantivyDataType::F64 => schema_builder.add_f64_field(field_name, INDEXED),
|
||||
TantivyDataType::Bool => schema_builder.add_bool_field(field_name, INDEXED),
|
||||
TantivyDataType::Keyword => {
|
||||
let text_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("raw")
|
||||
.set_index_option(IndexRecordOption::Basic);
|
||||
let text_options = TextOptions::default().set_indexing_options(text_field_indexing);
|
||||
schema_builder.add_text_field(&field_name, text_options)
|
||||
}
|
||||
TantivyDataType::Text => {
|
||||
panic!("text should be indexed with analyzer");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IndexWriterWrapper {
|
||||
pub fn new(
|
||||
field_name: String,
|
||||
|
@ -30,30 +55,10 @@ impl IndexWriterWrapper {
|
|||
overall_memory_budget_in_bytes: usize,
|
||||
) -> Result<IndexWriterWrapper> {
|
||||
init_log();
|
||||
|
||||
let field: Field;
|
||||
info!("create index writer, field_name: {}, data_type: {:?}", field_name, data_type);
|
||||
let mut schema_builder = Schema::builder();
|
||||
match data_type {
|
||||
TantivyDataType::I64 => {
|
||||
field = schema_builder.add_i64_field(&field_name, INDEXED);
|
||||
}
|
||||
TantivyDataType::F64 => {
|
||||
field = schema_builder.add_f64_field(&field_name, INDEXED);
|
||||
}
|
||||
TantivyDataType::Bool => {
|
||||
field = schema_builder.add_bool_field(&field_name, INDEXED);
|
||||
}
|
||||
TantivyDataType::Keyword => {
|
||||
let text_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("raw")
|
||||
.set_index_option(IndexRecordOption::Basic);
|
||||
let text_options = TextOptions::default().set_indexing_options(text_field_indexing);
|
||||
field = schema_builder.add_text_field(&field_name, text_options);
|
||||
}
|
||||
TantivyDataType::Text => {
|
||||
panic!("text should be indexed with analyzer");
|
||||
}
|
||||
}
|
||||
let field = schema_builder_add_field(&mut schema_builder, &field_name, data_type);
|
||||
// We cannot build direct connection from rows in multi-segments to milvus row data. So we have this doc_id field.
|
||||
let id_field = schema_builder.add_i64_field("doc_id", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_dir(path.clone(), schema)?;
|
||||
|
@ -61,8 +66,28 @@ impl IndexWriterWrapper {
|
|||
index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?;
|
||||
Ok(IndexWriterWrapper {
|
||||
field,
|
||||
index_writer,
|
||||
id_field,
|
||||
index_writer: Either::Left(index_writer),
|
||||
id_field: Some(id_field),
|
||||
index: Arc::new(index),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn new_with_single_segment(
|
||||
field_name: String,
|
||||
data_type: TantivyDataType,
|
||||
path: String,
|
||||
) -> Result<IndexWriterWrapper> {
|
||||
init_log();
|
||||
info!("create single segment index writer, field_name: {}, data_type: {:?}", field_name, data_type);
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder_add_field(&mut schema_builder, &field_name, data_type);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_dir(path.clone(), schema)?;
|
||||
let index_writer = SingleSegmentIndexWriter::new(index.clone(), 15 * 1024 * 1024)?;
|
||||
Ok(IndexWriterWrapper {
|
||||
field,
|
||||
index_writer: Either::Right(index_writer),
|
||||
id_field: None,
|
||||
index: Arc::new(index),
|
||||
})
|
||||
}
|
||||
|
@ -71,6 +96,30 @@ impl IndexWriterWrapper {
|
|||
IndexReaderWrapper::from_index(self.index.clone())
|
||||
}
|
||||
|
||||
fn index_writer_add_document(&self, document: Document) -> Result<()> {
|
||||
match self.index_writer {
|
||||
Either::Left(ref writer) => {
|
||||
let _ = writer.add_document(document)?;
|
||||
}
|
||||
Either::Right(_) => {
|
||||
panic!("unexpected writer");
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn single_segment_index_writer_add_document(&mut self, document: Document) -> Result<()> {
|
||||
match self.index_writer {
|
||||
Either::Left(_) => {
|
||||
panic!("unexpected writer");
|
||||
}
|
||||
Either::Right(ref mut single_segmnet_writer) => {
|
||||
let _ = single_segmnet_writer.add_document(document)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn add_i8(&mut self, data: i8, offset: i64) -> Result<()> {
|
||||
self.add_i64(data.into(), offset)
|
||||
}
|
||||
|
@ -84,11 +133,10 @@ impl IndexWriterWrapper {
|
|||
}
|
||||
|
||||
pub fn add_i64(&mut self, data: i64, offset: i64) -> Result<()> {
|
||||
let _ = self.index_writer.add_document(doc!(
|
||||
self.index_writer_add_document(doc!(
|
||||
self.field => data,
|
||||
self.id_field => offset,
|
||||
))?;
|
||||
Ok(())
|
||||
self.id_field.unwrap() => offset,
|
||||
))
|
||||
}
|
||||
|
||||
pub fn add_f32(&mut self, data: f32, offset: i64) -> Result<()> {
|
||||
|
@ -96,27 +144,24 @@ impl IndexWriterWrapper {
|
|||
}
|
||||
|
||||
pub fn add_f64(&mut self, data: f64, offset: i64) -> Result<()> {
|
||||
let _ = self.index_writer.add_document(doc!(
|
||||
self.index_writer_add_document(doc!(
|
||||
self.field => data,
|
||||
self.id_field => offset,
|
||||
))?;
|
||||
Ok(())
|
||||
self.id_field.unwrap() => offset,
|
||||
))
|
||||
}
|
||||
|
||||
pub fn add_bool(&mut self, data: bool, offset: i64) -> Result<()> {
|
||||
let _ = self.index_writer.add_document(doc!(
|
||||
self.index_writer_add_document(doc!(
|
||||
self.field => data,
|
||||
self.id_field => offset,
|
||||
))?;
|
||||
Ok(())
|
||||
self.id_field.unwrap() => offset,
|
||||
))
|
||||
}
|
||||
|
||||
pub fn add_string(&mut self, data: &str, offset: i64) -> Result<()> {
|
||||
let _ = self.index_writer.add_document(doc!(
|
||||
self.index_writer_add_document(doc!(
|
||||
self.field => data,
|
||||
self.id_field => offset,
|
||||
))?;
|
||||
Ok(())
|
||||
self.id_field.unwrap() => offset,
|
||||
))
|
||||
}
|
||||
|
||||
pub fn add_multi_i8s(&mut self, datas: &[i8], offset: i64) -> Result<()> {
|
||||
|
@ -124,9 +169,8 @@ impl IndexWriterWrapper {
|
|||
for data in datas {
|
||||
document.add_field_value(self.field, *data as i64);
|
||||
}
|
||||
document.add_i64(self.id_field, offset);
|
||||
let _ = self.index_writer.add_document(document)?;
|
||||
Ok(())
|
||||
document.add_i64(self.id_field.unwrap(), offset);
|
||||
self.index_writer_add_document(document)
|
||||
}
|
||||
|
||||
pub fn add_multi_i16s(&mut self, datas: &[i16], offset: i64) -> Result<()> {
|
||||
|
@ -134,9 +178,8 @@ impl IndexWriterWrapper {
|
|||
for data in datas {
|
||||
document.add_field_value(self.field, *data as i64);
|
||||
}
|
||||
document.add_i64(self.id_field, offset);
|
||||
let _ = self.index_writer.add_document(document)?;
|
||||
Ok(())
|
||||
document.add_i64(self.id_field.unwrap(), offset);
|
||||
self.index_writer_add_document(document)
|
||||
}
|
||||
|
||||
pub fn add_multi_i32s(&mut self, datas: &[i32], offset: i64) -> Result<()> {
|
||||
|
@ -144,9 +187,8 @@ impl IndexWriterWrapper {
|
|||
for data in datas {
|
||||
document.add_field_value(self.field, *data as i64);
|
||||
}
|
||||
document.add_i64(self.id_field, offset);
|
||||
let _ = self.index_writer.add_document(document)?;
|
||||
Ok(())
|
||||
document.add_i64(self.id_field.unwrap(), offset);
|
||||
self.index_writer_add_document(document)
|
||||
}
|
||||
|
||||
pub fn add_multi_i64s(&mut self, datas: &[i64], offset: i64) -> Result<()> {
|
||||
|
@ -154,9 +196,8 @@ impl IndexWriterWrapper {
|
|||
for data in datas {
|
||||
document.add_field_value(self.field, *data);
|
||||
}
|
||||
document.add_i64(self.id_field, offset);
|
||||
let _ = self.index_writer.add_document(document)?;
|
||||
Ok(())
|
||||
document.add_i64(self.id_field.unwrap(), offset);
|
||||
self.index_writer_add_document(document)
|
||||
}
|
||||
|
||||
pub fn add_multi_f32s(&mut self, datas: &[f32], offset: i64) -> Result<()> {
|
||||
|
@ -164,9 +205,8 @@ impl IndexWriterWrapper {
|
|||
for data in datas {
|
||||
document.add_field_value(self.field, *data as f64);
|
||||
}
|
||||
document.add_i64(self.id_field, offset);
|
||||
let _ = self.index_writer.add_document(document)?;
|
||||
Ok(())
|
||||
document.add_i64(self.id_field.unwrap(), offset);
|
||||
self.index_writer_add_document(document)
|
||||
}
|
||||
|
||||
pub fn add_multi_f64s(&mut self, datas: &[f64], offset: i64) -> Result<()> {
|
||||
|
@ -174,9 +214,8 @@ impl IndexWriterWrapper {
|
|||
for data in datas {
|
||||
document.add_field_value(self.field, *data);
|
||||
}
|
||||
document.add_i64(self.id_field, offset);
|
||||
let _ = self.index_writer.add_document(document)?;
|
||||
Ok(())
|
||||
document.add_i64(self.id_field.unwrap(), offset);
|
||||
self.index_writer_add_document(document)
|
||||
}
|
||||
|
||||
pub fn add_multi_bools(&mut self, datas: &[bool], offset: i64) -> Result<()> {
|
||||
|
@ -184,9 +223,8 @@ impl IndexWriterWrapper {
|
|||
for data in datas {
|
||||
document.add_field_value(self.field, *data);
|
||||
}
|
||||
document.add_i64(self.id_field, offset);
|
||||
let _ = self.index_writer.add_document(document)?;
|
||||
Ok(())
|
||||
document.add_i64(self.id_field.unwrap(), offset);
|
||||
self.index_writer_add_document(document)
|
||||
}
|
||||
|
||||
pub fn add_multi_keywords(&mut self, datas: &[*const c_char], offset: i64) -> Result<()> {
|
||||
|
@ -195,31 +233,148 @@ impl IndexWriterWrapper {
|
|||
let data = unsafe { CStr::from_ptr(*element) };
|
||||
document.add_field_value(self.field, data.to_str()?);
|
||||
}
|
||||
document.add_i64(self.id_field, offset);
|
||||
let _ = self.index_writer.add_document(document)?;
|
||||
Ok(())
|
||||
document.add_i64(self.id_field.unwrap(), offset);
|
||||
self.index_writer_add_document(document)
|
||||
}
|
||||
|
||||
pub fn add_i8_by_single_segment_writer(&mut self, data: i8) -> Result<()> {
|
||||
self.add_i64_by_single_segment_writer(data.into())
|
||||
}
|
||||
|
||||
pub fn add_i16_by_single_segment_writer(&mut self, data: i16) -> Result<()> {
|
||||
self.add_i64_by_single_segment_writer(data.into())
|
||||
}
|
||||
|
||||
pub fn add_i32_by_single_segment_writer(&mut self, data: i32) -> Result<()> {
|
||||
self.add_i64_by_single_segment_writer(data.into())
|
||||
}
|
||||
|
||||
pub fn add_i64_by_single_segment_writer(&mut self, data: i64) -> Result<()> {
|
||||
self.single_segment_index_writer_add_document(doc!(
|
||||
self.field => data
|
||||
))
|
||||
}
|
||||
|
||||
pub fn add_f32_by_single_segment_writer(&mut self, data: f32) -> Result<()> {
|
||||
self.add_f64_by_single_segment_writer(data.into())
|
||||
}
|
||||
|
||||
pub fn add_f64_by_single_segment_writer(&mut self, data: f64) -> Result<()> {
|
||||
self.single_segment_index_writer_add_document(doc!(
|
||||
self.field => data
|
||||
))
|
||||
}
|
||||
|
||||
pub fn add_bool_by_single_segment_writer(&mut self, data: bool) -> Result<()> {
|
||||
self.single_segment_index_writer_add_document(doc!(
|
||||
self.field => data
|
||||
))
|
||||
}
|
||||
|
||||
pub fn add_string_by_single_segment_writer(&mut self, data: &str) -> Result<()> {
|
||||
self.single_segment_index_writer_add_document(doc!(
|
||||
self.field => data
|
||||
))
|
||||
}
|
||||
|
||||
pub fn add_multi_i8s_by_single_segment_writer(&mut self, datas: &[i8]) -> Result<()> {
|
||||
let mut document = Document::default();
|
||||
for data in datas {
|
||||
document.add_field_value(self.field, *data as i64);
|
||||
}
|
||||
self.single_segment_index_writer_add_document(document)
|
||||
}
|
||||
|
||||
pub fn add_multi_i16s_by_single_segment_writer(&mut self, datas: &[i16]) -> Result<()> {
|
||||
let mut document = Document::default();
|
||||
for data in datas {
|
||||
document.add_field_value(self.field, *data as i64);
|
||||
}
|
||||
self.single_segment_index_writer_add_document(document)
|
||||
}
|
||||
|
||||
pub fn add_multi_i32s_by_single_segment_writer(&mut self, datas: &[i32]) -> Result<()> {
|
||||
let mut document = Document::default();
|
||||
for data in datas {
|
||||
document.add_field_value(self.field, *data as i64);
|
||||
}
|
||||
self.single_segment_index_writer_add_document(document)
|
||||
}
|
||||
|
||||
pub fn add_multi_i64s_by_single_segment_writer(&mut self, datas: &[i64]) -> Result<()> {
|
||||
let mut document = Document::default();
|
||||
for data in datas {
|
||||
document.add_field_value(self.field, *data);
|
||||
}
|
||||
self.single_segment_index_writer_add_document(document)
|
||||
}
|
||||
|
||||
pub fn add_multi_f32s_by_single_segment_writer(&mut self, datas: &[f32]) -> Result<()> {
|
||||
let mut document = Document::default();
|
||||
for data in datas {
|
||||
document.add_field_value(self.field, *data as f64);
|
||||
}
|
||||
self.single_segment_index_writer_add_document(document)
|
||||
}
|
||||
|
||||
pub fn add_multi_f64s_by_single_segment_writer(&mut self, datas: &[f64]) -> Result<()> {
|
||||
let mut document = Document::default();
|
||||
for data in datas {
|
||||
document.add_field_value(self.field, *data);
|
||||
}
|
||||
self.single_segment_index_writer_add_document(document)
|
||||
}
|
||||
|
||||
pub fn add_multi_bools_by_single_segment_writer(&mut self, datas: &[bool]) -> Result<()> {
|
||||
let mut document = Document::default();
|
||||
for data in datas {
|
||||
document.add_field_value(self.field, *data);
|
||||
}
|
||||
self.single_segment_index_writer_add_document(document)
|
||||
}
|
||||
|
||||
pub fn add_multi_keywords_by_single_segment_writer(
|
||||
&mut self,
|
||||
datas: &[*const c_char],
|
||||
) -> Result<()> {
|
||||
let mut document = Document::default();
|
||||
for element in datas {
|
||||
let data = unsafe { CStr::from_ptr(*element) };
|
||||
document.add_field_value(self.field, data.to_str()?);
|
||||
}
|
||||
self.single_segment_index_writer_add_document(document)
|
||||
}
|
||||
|
||||
fn manual_merge(&mut self) -> Result<()> {
|
||||
let metas = self.index_writer.index().searchable_segment_metas()?;
|
||||
let policy = self.index_writer.get_merge_policy();
|
||||
let index_writer = self.index_writer.as_mut().left().unwrap();
|
||||
let metas = index_writer.index().searchable_segment_metas()?;
|
||||
let policy = index_writer.get_merge_policy();
|
||||
let candidates = policy.compute_merge_candidates(metas.as_slice());
|
||||
for candidate in candidates {
|
||||
self.index_writer.merge(candidate.0.as_slice()).wait()?;
|
||||
index_writer.merge(candidate.0.as_slice()).wait()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn finish(mut self) -> Result<()> {
|
||||
self.index_writer.commit()?;
|
||||
// self.manual_merge();
|
||||
block_on(self.index_writer.garbage_collect_files())?;
|
||||
self.index_writer.wait_merging_threads()?;
|
||||
pub fn finish(self) -> Result<()> {
|
||||
match self.index_writer {
|
||||
Either::Left(mut index_writer) => {
|
||||
index_writer.commit()?;
|
||||
// self.manual_merge();
|
||||
block_on(index_writer.garbage_collect_files())?;
|
||||
index_writer.wait_merging_threads()?;
|
||||
}
|
||||
Either::Right(single_segment_index_writer) => {
|
||||
single_segment_index_writer
|
||||
.finalize()
|
||||
.expect("failed to build inverted index");
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn commit(&mut self) -> Result<()> {
|
||||
self.index_writer.commit()?;
|
||||
self.index_writer.as_mut().left().unwrap().commit()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,10 +1,5 @@
|
|||
use core::slice;
|
||||
use std::{
|
||||
ffi::{c_char, c_void, CStr},
|
||||
ptr::null,
|
||||
};
|
||||
|
||||
use tantivy::Index;
|
||||
use std::ffi::{c_char, c_void, CStr};
|
||||
|
||||
use crate::{
|
||||
array::RustResult,
|
||||
|
@ -47,6 +42,24 @@ pub extern "C" fn tantivy_create_index(
|
|||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_create_index_with_single_segment(
|
||||
field_name: *const c_char,
|
||||
data_type: TantivyDataType,
|
||||
path: *const c_char,
|
||||
) -> RustResult {
|
||||
let field_name_str = cstr_to_str!(field_name);
|
||||
let path_str = cstr_to_str!(path);
|
||||
match IndexWriterWrapper::new_with_single_segment(
|
||||
String::from(field_name_str),
|
||||
data_type,
|
||||
String::from(path_str),
|
||||
) {
|
||||
Ok(wrapper) => RustResult::from_ptr(create_binding(wrapper)),
|
||||
Err(e) => RustResult::from_error(e.to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_free_index_writer(ptr: *mut c_void) {
|
||||
free_binding::<IndexWriterWrapper>(ptr);
|
||||
|
@ -77,6 +90,29 @@ pub extern "C" fn tantivy_create_reader_from_writer(ptr: *mut c_void) -> RustRes
|
|||
}
|
||||
|
||||
// -------------------------build--------------------
|
||||
fn execute<T: Copy>(
|
||||
arr: &[T],
|
||||
offset: i64,
|
||||
e: fn(&mut IndexWriterWrapper, T, i64) -> Result<()>,
|
||||
w: &mut IndexWriterWrapper,
|
||||
) -> Result<()> {
|
||||
for (index, data) in arr.iter().enumerate() {
|
||||
e(w, *data, offset + (index as i64))?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn execute_by_single_segment_writer<T: Copy>(
|
||||
arr: &[T],
|
||||
e: fn(&mut IndexWriterWrapper, T) -> Result<()>,
|
||||
w: &mut IndexWriterWrapper,
|
||||
) -> Result<()> {
|
||||
for (_, data) in arr.iter().enumerate() {
|
||||
e(w, *data)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_int8s(
|
||||
ptr: *mut c_void,
|
||||
|
@ -89,6 +125,24 @@ pub extern "C" fn tantivy_index_add_int8s(
|
|||
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i8, &mut (*real)).into() }
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_int8s_by_single_segment_writer(
|
||||
ptr: *mut c_void,
|
||||
array: *const i8,
|
||||
len: usize,
|
||||
) -> RustResult {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||
unsafe {
|
||||
execute_by_single_segment_writer(
|
||||
arr,
|
||||
IndexWriterWrapper::add_i8_by_single_segment_writer,
|
||||
&mut (*real),
|
||||
)
|
||||
.into()
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_int16s(
|
||||
ptr: *mut c_void,
|
||||
|
@ -101,6 +155,24 @@ pub extern "C" fn tantivy_index_add_int16s(
|
|||
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i16, &mut (*real)).into() }
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_int16s_by_single_segment_writer(
|
||||
ptr: *mut c_void,
|
||||
array: *const i16,
|
||||
len: usize,
|
||||
) -> RustResult {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||
unsafe {
|
||||
execute_by_single_segment_writer(
|
||||
arr,
|
||||
IndexWriterWrapper::add_i16_by_single_segment_writer,
|
||||
&mut (*real),
|
||||
)
|
||||
.into()
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_int32s(
|
||||
ptr: *mut c_void,
|
||||
|
@ -113,6 +185,24 @@ pub extern "C" fn tantivy_index_add_int32s(
|
|||
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i32, &mut (*real)).into() }
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_int32s_by_single_segment_writer(
|
||||
ptr: *mut c_void,
|
||||
array: *const i32,
|
||||
len: usize,
|
||||
) -> RustResult {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||
unsafe {
|
||||
execute_by_single_segment_writer(
|
||||
arr,
|
||||
IndexWriterWrapper::add_i32_by_single_segment_writer,
|
||||
&mut (*real),
|
||||
)
|
||||
.into()
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_int64s(
|
||||
ptr: *mut c_void,
|
||||
|
@ -126,18 +216,23 @@ pub extern "C" fn tantivy_index_add_int64s(
|
|||
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i64, &mut (*real)).into() }
|
||||
}
|
||||
|
||||
fn execute<T: Copy>(
|
||||
arr: &[T],
|
||||
offset: i64,
|
||||
mut e: fn(&mut IndexWriterWrapper, T, i64) -> Result<()>,
|
||||
w: &mut IndexWriterWrapper,
|
||||
) -> Result<()> {
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_int64s_by_single_segment_writer(
|
||||
ptr: *mut c_void,
|
||||
array: *const i64,
|
||||
len: usize,
|
||||
) -> RustResult {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||
|
||||
unsafe {
|
||||
for (index, data) in arr.iter().enumerate() {
|
||||
e(w, *data, offset + (index as i64))?;
|
||||
}
|
||||
execute_by_single_segment_writer(
|
||||
arr,
|
||||
IndexWriterWrapper::add_i64_by_single_segment_writer,
|
||||
&mut (*real),
|
||||
)
|
||||
.into()
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
|
@ -152,6 +247,24 @@ pub extern "C" fn tantivy_index_add_f32s(
|
|||
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_f32, &mut (*real)).into() }
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_f32s_by_single_segment_writer(
|
||||
ptr: *mut c_void,
|
||||
array: *const f32,
|
||||
len: usize,
|
||||
) -> RustResult {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||
unsafe {
|
||||
execute_by_single_segment_writer(
|
||||
arr,
|
||||
IndexWriterWrapper::add_f32_by_single_segment_writer,
|
||||
&mut (*real),
|
||||
)
|
||||
.into()
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_f64s(
|
||||
ptr: *mut c_void,
|
||||
|
@ -164,6 +277,24 @@ pub extern "C" fn tantivy_index_add_f64s(
|
|||
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_f64, &mut (*real)).into() }
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_f64s_by_single_segment_writer(
|
||||
ptr: *mut c_void,
|
||||
array: *const f64,
|
||||
len: usize,
|
||||
) -> RustResult {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||
unsafe {
|
||||
execute_by_single_segment_writer(
|
||||
arr,
|
||||
IndexWriterWrapper::add_f64_by_single_segment_writer,
|
||||
&mut (*real),
|
||||
)
|
||||
.into()
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_bools(
|
||||
ptr: *mut c_void,
|
||||
|
@ -184,6 +315,24 @@ pub extern "C" fn tantivy_index_add_bools(
|
|||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_bools_by_single_segment_writer(
|
||||
ptr: *mut c_void,
|
||||
array: *const bool,
|
||||
len: usize,
|
||||
) -> RustResult {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
let arr = unsafe { slice::from_raw_parts(array, len) };
|
||||
unsafe {
|
||||
execute_by_single_segment_writer(
|
||||
arr,
|
||||
IndexWriterWrapper::add_bool_by_single_segment_writer,
|
||||
&mut (*real),
|
||||
)
|
||||
.into()
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: this is not a very efficient way, since we must call this function many times, which
|
||||
// will bring a lot of overhead caused by the rust binding.
|
||||
#[no_mangle]
|
||||
|
@ -197,6 +346,16 @@ pub extern "C" fn tantivy_index_add_string(
|
|||
unsafe { (*real).add_string(s, offset).into() }
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_string_by_single_segment_writer(
|
||||
ptr: *mut c_void,
|
||||
s: *const c_char,
|
||||
) -> RustResult {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
let s = cstr_to_str!(s);
|
||||
unsafe { (*real).add_string_by_single_segment_writer(s).into() }
|
||||
}
|
||||
|
||||
// --------------------------------------------- array ------------------------------------------
|
||||
|
||||
#[no_mangle]
|
||||
|
@ -213,6 +372,19 @@ pub extern "C" fn tantivy_index_add_multi_int8s(
|
|||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_multi_int8s_by_single_segment_writer(
|
||||
ptr: *mut c_void,
|
||||
array: *const i8,
|
||||
len: usize,
|
||||
) -> RustResult {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
unsafe {
|
||||
let arr = convert_to_rust_slice!(array, len);
|
||||
(*real).add_multi_i8s_by_single_segment_writer(arr).into()
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_multi_int16s(
|
||||
ptr: *mut c_void,
|
||||
|
@ -227,6 +399,19 @@ pub extern "C" fn tantivy_index_add_multi_int16s(
|
|||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_multi_int16s_by_single_segment_writer(
|
||||
ptr: *mut c_void,
|
||||
array: *const i16,
|
||||
len: usize,
|
||||
) -> RustResult {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
unsafe {
|
||||
let arr = convert_to_rust_slice!(array, len);
|
||||
(*real).add_multi_i16s_by_single_segment_writer(arr).into()
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_multi_int32s(
|
||||
ptr: *mut c_void,
|
||||
|
@ -241,6 +426,19 @@ pub extern "C" fn tantivy_index_add_multi_int32s(
|
|||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_multi_int32s_by_single_segment_writer(
|
||||
ptr: *mut c_void,
|
||||
array: *const i32,
|
||||
len: usize,
|
||||
) -> RustResult {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
unsafe {
|
||||
let arr = convert_to_rust_slice!(array, len);
|
||||
(*real).add_multi_i32s_by_single_segment_writer(arr).into()
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_multi_int64s(
|
||||
ptr: *mut c_void,
|
||||
|
@ -255,6 +453,19 @@ pub extern "C" fn tantivy_index_add_multi_int64s(
|
|||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_multi_int64s_by_single_segment_writer(
|
||||
ptr: *mut c_void,
|
||||
array: *const i64,
|
||||
len: usize,
|
||||
) -> RustResult {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
unsafe {
|
||||
let arr = convert_to_rust_slice!(array, len);
|
||||
(*real).add_multi_i64s_by_single_segment_writer(arr).into()
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_multi_f32s(
|
||||
ptr: *mut c_void,
|
||||
|
@ -269,6 +480,19 @@ pub extern "C" fn tantivy_index_add_multi_f32s(
|
|||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_multi_f32s_by_single_segment_writer(
|
||||
ptr: *mut c_void,
|
||||
array: *const f32,
|
||||
len: usize,
|
||||
) -> RustResult {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
unsafe {
|
||||
let arr = convert_to_rust_slice!(array, len);
|
||||
(*real).add_multi_f32s_by_single_segment_writer(arr).into()
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_multi_f64s(
|
||||
ptr: *mut c_void,
|
||||
|
@ -283,6 +507,19 @@ pub extern "C" fn tantivy_index_add_multi_f64s(
|
|||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_multi_f64s_by_single_segment_writer(
|
||||
ptr: *mut c_void,
|
||||
array: *const f64,
|
||||
len: usize,
|
||||
) -> RustResult {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
unsafe {
|
||||
let arr = convert_to_rust_slice!(array, len);
|
||||
(*real).add_multi_f64s_by_single_segment_writer(arr).into()
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_multi_bools(
|
||||
ptr: *mut c_void,
|
||||
|
@ -297,6 +534,19 @@ pub extern "C" fn tantivy_index_add_multi_bools(
|
|||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_multi_bools_by_single_segment_writer(
|
||||
ptr: *mut c_void,
|
||||
array: *const bool,
|
||||
len: usize,
|
||||
) -> RustResult {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
unsafe {
|
||||
let arr = convert_to_rust_slice!(array, len);
|
||||
(*real).add_multi_bools_by_single_segment_writer(arr).into()
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_multi_keywords(
|
||||
ptr: *mut c_void,
|
||||
|
@ -310,3 +560,18 @@ pub extern "C" fn tantivy_index_add_multi_keywords(
|
|||
(*real).add_multi_keywords(arr, offset).into()
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_index_add_multi_keywords_by_single_segment_writer(
|
||||
ptr: *mut c_void,
|
||||
array: *const *const c_char,
|
||||
len: usize,
|
||||
) -> RustResult {
|
||||
let real = ptr as *mut IndexWriterWrapper;
|
||||
unsafe {
|
||||
let arr = convert_to_rust_slice!(array, len);
|
||||
(*real)
|
||||
.add_multi_keywords_by_single_segment_writer(arr)
|
||||
.into()
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
use std::sync::Arc;
|
||||
|
||||
use either::Either;
|
||||
use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST};
|
||||
use tantivy::tokenizer::TextAnalyzer;
|
||||
use tantivy::Index;
|
||||
|
@ -44,8 +45,8 @@ impl IndexWriterWrapper {
|
|||
|
||||
IndexWriterWrapper {
|
||||
field,
|
||||
index_writer,
|
||||
id_field,
|
||||
index_writer: Either::Left(index_writer),
|
||||
id_field: Some(id_field),
|
||||
index: Arc::new(index),
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,13 +1,10 @@
|
|||
use std::ffi::c_char;
|
||||
use std::ffi::c_void;
|
||||
use std::ffi::CStr;
|
||||
|
||||
use crate::array::RustResult;
|
||||
use crate::cstr_to_str;
|
||||
use crate::error::Result;
|
||||
use crate::index_writer::IndexWriterWrapper;
|
||||
use crate::log::init_log;
|
||||
use crate::string_c::c_str_to_str;
|
||||
use crate::tokenizer::create_tokenizer;
|
||||
use crate::util::create_binding;
|
||||
|
||||
|
|
|
@ -81,15 +81,22 @@ struct TantivyIndexWrapper {
|
|||
TantivyIndexWrapper(const char* field_name,
|
||||
TantivyDataType data_type,
|
||||
const char* path,
|
||||
bool inverted_single_semgnent = false,
|
||||
uintptr_t num_threads = DEFAULT_NUM_THREADS,
|
||||
uintptr_t overall_memory_budget_in_bytes =
|
||||
DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES) {
|
||||
auto res = RustResultWrapper(
|
||||
tantivy_create_index(field_name,
|
||||
data_type,
|
||||
path,
|
||||
num_threads,
|
||||
overall_memory_budget_in_bytes));
|
||||
RustResultWrapper res;
|
||||
if (inverted_single_semgnent) {
|
||||
res = RustResultWrapper(tantivy_create_index_with_single_segment(
|
||||
field_name, data_type, path));
|
||||
} else {
|
||||
res = RustResultWrapper(
|
||||
tantivy_create_index(field_name,
|
||||
data_type,
|
||||
path,
|
||||
num_threads,
|
||||
overall_memory_budget_in_bytes));
|
||||
}
|
||||
AssertInfo(res.result_->success,
|
||||
"failed to create index: {}",
|
||||
res.result_->error);
|
||||
|
@ -340,6 +347,193 @@ struct TantivyIndexWrapper {
|
|||
typeid(T).name());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
add_data_by_single_segment_writer(const T* array, uintptr_t len) {
|
||||
assert(!finished_);
|
||||
|
||||
if constexpr (std::is_same_v<T, bool>) {
|
||||
auto res = RustResultWrapper(
|
||||
tantivy_index_add_bools_by_single_segment_writer(
|
||||
writer_, array, len));
|
||||
AssertInfo(res.result_->success,
|
||||
"failed to add bools: {}",
|
||||
res.result_->error);
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, int8_t>) {
|
||||
auto res = RustResultWrapper(
|
||||
tantivy_index_add_int8s_by_single_segment_writer(
|
||||
writer_, array, len));
|
||||
AssertInfo(res.result_->success,
|
||||
"failed to add int8s: {}",
|
||||
res.result_->error);
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, int16_t>) {
|
||||
auto res = RustResultWrapper(
|
||||
tantivy_index_add_int16s_by_single_segment_writer(
|
||||
writer_, array, len));
|
||||
AssertInfo(res.result_->success,
|
||||
"failed to add int16s: {}",
|
||||
res.result_->error);
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, int32_t>) {
|
||||
auto res = RustResultWrapper(
|
||||
tantivy_index_add_int32s_by_single_segment_writer(
|
||||
writer_, array, len));
|
||||
AssertInfo(res.result_->success,
|
||||
"failed to add int32s: {}",
|
||||
res.result_->error);
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, int64_t>) {
|
||||
auto res = RustResultWrapper(
|
||||
tantivy_index_add_int64s_by_single_segment_writer(
|
||||
writer_, array, len));
|
||||
AssertInfo(res.result_->success,
|
||||
"failed to add int64s: {}",
|
||||
res.result_->error);
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
auto res = RustResultWrapper(
|
||||
tantivy_index_add_f32s_by_single_segment_writer(
|
||||
writer_, array, len));
|
||||
AssertInfo(res.result_->success,
|
||||
"failed to add f32s: {}",
|
||||
res.result_->error);
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, double>) {
|
||||
auto res = RustResultWrapper(
|
||||
tantivy_index_add_f64s_by_single_segment_writer(
|
||||
writer_, array, len));
|
||||
AssertInfo(res.result_->success,
|
||||
"failed to add f64s: {}",
|
||||
res.result_->error);
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, std::string>) {
|
||||
// TODO: not very efficient, a lot of overhead due to rust-ffi call.
|
||||
for (uintptr_t i = 0; i < len; i++) {
|
||||
auto res = RustResultWrapper(
|
||||
tantivy_index_add_string_by_single_segment_writer(
|
||||
writer_,
|
||||
static_cast<const std::string*>(array)[i].c_str()));
|
||||
AssertInfo(res.result_->success,
|
||||
"failed to add string: {}",
|
||||
res.result_->error);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
throw fmt::format("InvertedIndex.add_data: unsupported data type: {}",
|
||||
typeid(T).name());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
add_multi_data_by_single_segment_writer(const T* array, uintptr_t len) {
|
||||
assert(!finished_);
|
||||
|
||||
if constexpr (std::is_same_v<T, bool>) {
|
||||
auto res = RustResultWrapper(
|
||||
tantivy_index_add_multi_bools_by_single_segment_writer(
|
||||
writer_, array, len));
|
||||
AssertInfo(res.result_->success,
|
||||
"failed to add multi bools: {}",
|
||||
res.result_->error);
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, int8_t>) {
|
||||
auto res = RustResultWrapper(
|
||||
tantivy_index_add_multi_int8s_by_single_segment_writer(
|
||||
writer_, array, len));
|
||||
AssertInfo(res.result_->success,
|
||||
"failed to add multi int8s: {}",
|
||||
res.result_->error);
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, int16_t>) {
|
||||
auto res = RustResultWrapper(
|
||||
tantivy_index_add_multi_int16s_by_single_segment_writer(
|
||||
writer_, array, len));
|
||||
AssertInfo(res.result_->success,
|
||||
"failed to add multi int16s: {}",
|
||||
res.result_->error);
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, int32_t>) {
|
||||
auto res = RustResultWrapper(
|
||||
tantivy_index_add_multi_int32s_by_single_segment_writer(
|
||||
writer_, array, len));
|
||||
AssertInfo(res.result_->success,
|
||||
"failed to add multi int32s: {}",
|
||||
res.result_->error);
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, int64_t>) {
|
||||
auto res = RustResultWrapper(
|
||||
tantivy_index_add_multi_int64s_by_single_segment_writer(
|
||||
writer_, array, len));
|
||||
AssertInfo(res.result_->success,
|
||||
"failed to add multi int64s: {}",
|
||||
res.result_->error);
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
auto res = RustResultWrapper(
|
||||
tantivy_index_add_multi_f32s_by_single_segment_writer(
|
||||
writer_, array, len));
|
||||
AssertInfo(res.result_->success,
|
||||
"failed to add multi f32s: {}",
|
||||
res.result_->error);
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, double>) {
|
||||
auto res = RustResultWrapper(
|
||||
tantivy_index_add_multi_f64s_by_single_segment_writer(
|
||||
writer_, array, len));
|
||||
AssertInfo(res.result_->success,
|
||||
"failed to add multi f64s: {}",
|
||||
res.result_->error);
|
||||
return;
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, std::string>) {
|
||||
std::vector<const char*> views;
|
||||
for (uintptr_t i = 0; i < len; i++) {
|
||||
views.push_back(array[i].c_str());
|
||||
}
|
||||
auto res = RustResultWrapper(
|
||||
tantivy_index_add_multi_keywords_by_single_segment_writer(
|
||||
writer_, views.data(), len));
|
||||
AssertInfo(res.result_->success,
|
||||
"failed to add multi keywords: {}",
|
||||
res.result_->error);
|
||||
return;
|
||||
}
|
||||
|
||||
throw fmt::format(
|
||||
"InvertedIndex.add_multi_data: unsupported data type: {}",
|
||||
typeid(T).name());
|
||||
}
|
||||
|
||||
inline void
|
||||
finish() {
|
||||
if (finished_) {
|
||||
|
|
|
@ -90,6 +90,7 @@ set(MILVUS_TEST_FILES
|
|||
test_chunked_column.cpp
|
||||
test_rust_result.cpp
|
||||
test_cached_search_iterator.cpp
|
||||
test_build_inverted_index_with_single_segment.cpp
|
||||
)
|
||||
|
||||
if ( INDEX_ENGINE STREQUAL "cardinal" )
|
||||
|
|
|
@ -113,7 +113,7 @@ class ArrayInvertedIndexTest : public ::testing::Test {
|
|||
auto index = std::make_unique<index::InvertedIndexTantivy<T>>();
|
||||
Config cfg;
|
||||
cfg["is_array"] = true;
|
||||
index->BuildWithRawData(N_, vec_of_array_.data(), cfg);
|
||||
index->BuildWithRawDataForUT(N_, vec_of_array_.data(), cfg);
|
||||
LoadIndexInfo info{
|
||||
.field_id = schema_->get_field_id(FieldName("array")).get(),
|
||||
.index = std::move(index),
|
||||
|
|
|
@ -0,0 +1,214 @@
|
|||
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
// or implied. See the License for the specific language governing permissions and limitations under the License
|
||||
|
||||
#include <random>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "pb/plan.pb.h"
|
||||
#include "segcore/SegmentSealedImpl.h"
|
||||
#include "index/InvertedIndexTantivy.h"
|
||||
#include "test_utils/DataGen.h"
|
||||
#include "common/Schema.h"
|
||||
#include "test_utils/GenExprProto.h"
|
||||
#include "query/PlanProto.h"
|
||||
#include "query/ExecPlanNodeVisitor.h"
|
||||
|
||||
using namespace milvus;
|
||||
using namespace milvus::query;
|
||||
using namespace milvus::segcore;
|
||||
|
||||
template <typename T>
|
||||
SchemaPtr
|
||||
GenSchema() {
|
||||
auto schema_ = std::make_shared<Schema>();
|
||||
auto pk = schema_->AddDebugField("pk", DataType::INT64);
|
||||
schema_->set_primary_field_id(pk);
|
||||
|
||||
if constexpr (std::is_same_v<T, bool>) {
|
||||
schema_->AddDebugField("index", DataType::BOOL, false);
|
||||
} else if constexpr (std::is_same_v<T, int8_t>) {
|
||||
schema_->AddDebugField("index", DataType::INT8, false);
|
||||
} else if constexpr (std::is_same_v<T, int16_t>) {
|
||||
schema_->AddDebugField("index", DataType::INT16, false);
|
||||
} else if constexpr (std::is_same_v<T, int32_t>) {
|
||||
schema_->AddDebugField("index", DataType::INT32, false);
|
||||
} else if constexpr (std::is_same_v<T, int64_t>) {
|
||||
schema_->AddDebugField("index", DataType::INT64, false);
|
||||
} else if constexpr (std::is_same_v<T, float>) {
|
||||
schema_->AddDebugField("index", DataType::FLOAT, false);
|
||||
} else if constexpr (std::is_same_v<T, double>) {
|
||||
schema_->AddDebugField("index", DataType::DOUBLE, false);
|
||||
} else if constexpr (std::is_same_v<T, std::string>) {
|
||||
schema_->AddDebugField("index", DataType::VARCHAR, false);
|
||||
}
|
||||
|
||||
return schema_;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
class BuildInvertedIndexWithSingleSegmentTest : public ::testing::Test {
|
||||
public:
|
||||
void
|
||||
SetUp() override {
|
||||
schema_ = GenSchema<T>();
|
||||
seg_ = CreateSealedSegment(schema_);
|
||||
N_ = 3000;
|
||||
uint64_t seed = 1234;
|
||||
auto raw_data = DataGen(schema_, N_, seed);
|
||||
|
||||
if constexpr (std::is_same_v<T, bool>) {
|
||||
auto index_col =
|
||||
raw_data.get_col(schema_->get_field_id(FieldName("index")))
|
||||
->scalars()
|
||||
.bool_data()
|
||||
.data();
|
||||
for (size_t i = 0; i < N_; i++) {
|
||||
index_column_data_.push_back(index_col[i]);
|
||||
}
|
||||
} else if constexpr (std::is_same_v<T, int64_t>) {
|
||||
auto index_col =
|
||||
raw_data.get_col(schema_->get_field_id(FieldName("index")))
|
||||
->scalars()
|
||||
.long_data()
|
||||
.data();
|
||||
for (size_t i = 0; i < N_; i++) {
|
||||
index_column_data_.push_back(index_col[i]);
|
||||
}
|
||||
} else if constexpr (std::is_integral_v<T>) {
|
||||
auto index_col =
|
||||
raw_data.get_col(schema_->get_field_id(FieldName("index")))
|
||||
->scalars()
|
||||
.int_data()
|
||||
.data();
|
||||
for (size_t i = 0; i < N_; i++) {
|
||||
index_column_data_.push_back(index_col[i]);
|
||||
}
|
||||
} else if constexpr (std::is_same_v<T, float>) {
|
||||
auto index_col =
|
||||
raw_data.get_col(schema_->get_field_id(FieldName("index")))
|
||||
->scalars()
|
||||
.float_data()
|
||||
.data();
|
||||
for (size_t i = 0; i < N_; i++) {
|
||||
index_column_data_.push_back(index_col[i]);
|
||||
}
|
||||
} else if constexpr (std::is_same_v<T, double>) {
|
||||
auto index_col =
|
||||
raw_data.get_col(schema_->get_field_id(FieldName("index")))
|
||||
->scalars()
|
||||
.double_data()
|
||||
.data();
|
||||
for (size_t i = 0; i < N_; i++) {
|
||||
index_column_data_.push_back(index_col[i]);
|
||||
}
|
||||
} else if constexpr (std::is_same_v<T, std::string>) {
|
||||
auto index_col =
|
||||
raw_data.get_col(schema_->get_field_id(FieldName("index")))
|
||||
->scalars()
|
||||
.string_data()
|
||||
.data();
|
||||
for (size_t i = 0; i < N_; i++) {
|
||||
index_column_data_.push_back(index_col[i]);
|
||||
}
|
||||
}
|
||||
SealedLoadFieldData(raw_data, *seg_);
|
||||
LoadInvertedIndex();
|
||||
}
|
||||
|
||||
void
|
||||
TearDown() override {
|
||||
}
|
||||
|
||||
void
|
||||
LoadInvertedIndex() {
|
||||
auto index = std::make_unique<index::InvertedIndexTantivy<T>>();
|
||||
Config cfg;
|
||||
cfg[milvus::index::SCALAR_INDEX_ENGINE_VERSION] = 0;
|
||||
index->BuildWithRawDataForUT(N_, index_column_data_.data(), cfg);
|
||||
LoadIndexInfo info{
|
||||
.field_id = schema_->get_field_id(FieldName("index")).get(),
|
||||
.index = std::move(index),
|
||||
};
|
||||
seg_->LoadIndex(info);
|
||||
}
|
||||
|
||||
T
|
||||
FieldValueAt(int64_t offset) {
|
||||
return index_column_data_[offset];
|
||||
}
|
||||
|
||||
public:
|
||||
SchemaPtr schema_;
|
||||
SegmentSealedUPtr seg_;
|
||||
int64_t N_;
|
||||
boost::container::vector<T> index_column_data_;
|
||||
};
|
||||
|
||||
TYPED_TEST_SUITE_P(BuildInvertedIndexWithSingleSegmentTest);
|
||||
|
||||
TYPED_TEST_P(BuildInvertedIndexWithSingleSegmentTest,
|
||||
ReadFromSingleSegmentIndex) {
|
||||
const auto& meta = this->schema_->operator[](FieldName("index"));
|
||||
for (size_t i = 0; i < 10; i++) {
|
||||
auto column_info = test::GenColumnInfo(
|
||||
meta.get_id().get(),
|
||||
static_cast<proto::schema::DataType>(meta.get_data_type()),
|
||||
false,
|
||||
false,
|
||||
static_cast<proto::schema::DataType>(meta.get_element_type()));
|
||||
|
||||
std::random_device rd;
|
||||
std::mt19937 gen(rd());
|
||||
|
||||
std::uniform_int_distribution<> int_dist(1, this->N_);
|
||||
int random_idx = int_dist(gen) - 1;
|
||||
|
||||
auto unary_range_expr = std::make_unique<proto::plan::UnaryRangeExpr>();
|
||||
unary_range_expr->set_allocated_column_info(column_info);
|
||||
unary_range_expr->set_op(proto::plan::OpType::Equal);
|
||||
auto val = this->FieldValueAt(random_idx);
|
||||
unary_range_expr->set_allocated_value(test::GenGenericValue(val));
|
||||
|
||||
auto expr = test::GenExpr();
|
||||
expr->set_allocated_unary_range_expr(unary_range_expr.release());
|
||||
auto parser = ProtoParser(*this->schema_);
|
||||
auto typed_expr = parser.ParseExprs(*expr);
|
||||
auto parsed = std::make_shared<plan::FilterBitsNode>(
|
||||
DEFAULT_PLANNODE_ID, typed_expr);
|
||||
auto segpromote = dynamic_cast<SegmentSealedImpl*>(this->seg_.get());
|
||||
BitsetType final;
|
||||
final = ExecuteQueryExpr(parsed, segpromote, this->N_, MAX_TIMESTAMP);
|
||||
auto ref = [this, random_idx](size_t offset) -> bool {
|
||||
return this->index_column_data_[offset] ==
|
||||
this->index_column_data_[random_idx];
|
||||
};
|
||||
ASSERT_EQ(final.size(), this->N_);
|
||||
for (size_t i = 0; i < this->N_; i++) {
|
||||
if (std::is_floating_point_v<decltype(val)> && i == random_idx) {
|
||||
continue;
|
||||
}
|
||||
ASSERT_EQ(final[i], ref(i))
|
||||
<< "i: " << i << ", final[i]: " << final[i]
|
||||
<< ", ref(i): " << ref(i) << ", random_idx: " << random_idx
|
||||
<< ", value: " << this->index_column_data_[random_idx]
|
||||
<< ", value: " << this->index_column_data_[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
REGISTER_TYPED_TEST_CASE_P(BuildInvertedIndexWithSingleSegmentTest,
|
||||
ReadFromSingleSegmentIndex);
|
||||
|
||||
using ElementType = testing::
|
||||
Types<bool, int8_t, int16_t, int32_t, int64_t, float, double, std::string>;
|
||||
INSTANTIATE_TYPED_TEST_SUITE_P(Naive,
|
||||
BuildInvertedIndexWithSingleSegmentTest,
|
||||
ElementType);
|
|
@ -354,7 +354,7 @@ TEST_F(TestChunkSegment, TestCompareExpr) {
|
|||
data.begin() + i * test_data_count);
|
||||
}
|
||||
|
||||
index->BuildWithRawData(data.size(), data.data());
|
||||
index->BuildWithRawDataForUT(data.size(), data.data());
|
||||
segcore::LoadIndexInfo load_index_info;
|
||||
load_index_info.index = std::move(index);
|
||||
load_index_info.field_id = fid.get();
|
||||
|
|
|
@ -255,7 +255,7 @@ class SealedSegmentRegexQueryTest : public ::testing::Test {
|
|||
auto index = index::CreateStringIndexSort();
|
||||
std::vector<uint8_t> buffer(arr.ByteSize());
|
||||
ASSERT_TRUE(arr.SerializeToArray(buffer.data(), arr.ByteSize()));
|
||||
index->BuildWithRawData(arr.ByteSize(), buffer.data());
|
||||
index->BuildWithRawDataForUT(arr.ByteSize(), buffer.data());
|
||||
LoadIndexInfo info{
|
||||
.field_id = schema->get_field_id(FieldName("str")).get(),
|
||||
.index = std::move(index),
|
||||
|
@ -264,7 +264,7 @@ class SealedSegmentRegexQueryTest : public ::testing::Test {
|
|||
}
|
||||
{
|
||||
auto index = index::CreateScalarIndexSort<int64_t>();
|
||||
index->BuildWithRawData(N, raw_int.data());
|
||||
index->BuildWithRawDataForUT(N, raw_int.data());
|
||||
LoadIndexInfo info{
|
||||
.field_id =
|
||||
schema->get_field_id(FieldName("another_int64")).get(),
|
||||
|
@ -278,7 +278,7 @@ class SealedSegmentRegexQueryTest : public ::testing::Test {
|
|||
LoadInvertedIndex() {
|
||||
auto index =
|
||||
std::make_unique<index::InvertedIndexTantivy<std::string>>();
|
||||
index->BuildWithRawData(N, raw_str.data());
|
||||
index->BuildWithRawDataForUT(N, raw_str.data());
|
||||
LoadIndexInfo info{
|
||||
.field_id = schema->get_field_id(FieldName("str")).get(),
|
||||
.index = std::move(index),
|
||||
|
@ -295,7 +295,7 @@ class SealedSegmentRegexQueryTest : public ::testing::Test {
|
|||
auto index = std::make_unique<MockStringIndex>();
|
||||
std::vector<uint8_t> buffer(arr.ByteSize());
|
||||
ASSERT_TRUE(arr.SerializeToArray(buffer.data(), arr.ByteSize()));
|
||||
index->BuildWithRawData(arr.ByteSize(), buffer.data());
|
||||
index->BuildWithRawDataForUT(arr.ByteSize(), buffer.data());
|
||||
LoadIndexInfo info{
|
||||
.field_id = schema->get_field_id(FieldName("str")).get(),
|
||||
.index = std::move(index),
|
||||
|
|
|
@ -151,8 +151,12 @@ TEST_F(StringIndexMarisaTest, Range) {
|
|||
TEST_F(StringIndexMarisaTest, Reverse) {
|
||||
auto index_types = GetIndexTypes<std::string>();
|
||||
for (const auto& index_type : index_types) {
|
||||
auto index = milvus::index::IndexFactory::GetInstance()
|
||||
.CreatePrimitiveScalarIndex<std::string>(index_type);
|
||||
CreateIndexInfo create_index_info{
|
||||
.index_type = index_type,
|
||||
};
|
||||
auto index =
|
||||
milvus::index::IndexFactory::GetInstance()
|
||||
.CreatePrimitiveScalarIndex<std::string>(create_index_info);
|
||||
index->Build(nb, strs.data());
|
||||
assert_reverse<std::string>(index.get(), strs);
|
||||
}
|
||||
|
@ -311,7 +315,7 @@ TEST_F(StringIndexMarisaTest, BaseIndexCodec) {
|
|||
*str_arr.mutable_data() = {strings.begin(), strings.end()};
|
||||
std::vector<uint8_t> data(str_arr.ByteSizeLong(), 0);
|
||||
str_arr.SerializeToArray(data.data(), str_arr.ByteSizeLong());
|
||||
index->BuildWithRawData(str_arr.ByteSizeLong(), data.data());
|
||||
index->BuildWithRawDataForUT(str_arr.ByteSizeLong(), data.data());
|
||||
|
||||
std::vector<std::string> invalid_strings = {std::to_string(nb)};
|
||||
auto copy_index = milvus::index::CreateStringIndexMarisa();
|
||||
|
|
Loading…
Reference in New Issue