2024-09-01 09:13:03 +00:00
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#include "tantivy-binding.h"
|
|
|
|
#include "rust-binding.h"
|
|
|
|
#include "rust-hashmap.h"
|
|
|
|
#include "token-stream.h"
|
|
|
|
|
|
|
|
namespace milvus::tantivy {
|
|
|
|
|
|
|
|
struct Tokenizer {
|
|
|
|
public:
|
|
|
|
NO_COPY_OR_ASSIGN(Tokenizer);
|
|
|
|
|
2024-11-06 09:48:24 +00:00
|
|
|
explicit Tokenizer(std::string&& params) {
|
|
|
|
auto shared_params = std::make_shared<std::string>(std::move(params));
|
|
|
|
ptr_ = tantivy_create_tokenizer(shared_params->c_str());
|
2024-09-01 09:13:03 +00:00
|
|
|
if (ptr_ == nullptr) {
|
2024-09-10 07:11:08 +00:00
|
|
|
throw std::invalid_argument("invalid tokenizer parameters");
|
2024-09-01 09:13:03 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-11-06 09:48:24 +00:00
|
|
|
explicit Tokenizer(void* _ptr) : ptr_(_ptr) {
|
|
|
|
}
|
|
|
|
|
2024-09-01 09:13:03 +00:00
|
|
|
~Tokenizer() {
|
|
|
|
if (ptr_ != nullptr) {
|
|
|
|
tantivy_free_tokenizer(ptr_);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
std::unique_ptr<TokenStream>
|
|
|
|
CreateTokenStream(std::string&& text) {
|
|
|
|
auto shared_text = std::make_shared<std::string>(std::move(text));
|
|
|
|
auto token_stream =
|
|
|
|
tantivy_create_token_stream(ptr_, shared_text->c_str());
|
|
|
|
return std::make_unique<TokenStream>(token_stream, shared_text);
|
|
|
|
}
|
|
|
|
|
2024-11-06 09:48:24 +00:00
|
|
|
std::unique_ptr<Tokenizer>
|
|
|
|
Clone() {
|
|
|
|
auto newptr = tantivy_clone_tokenizer(ptr_);
|
|
|
|
return std::make_unique<milvus::tantivy::Tokenizer>(newptr);
|
|
|
|
}
|
|
|
|
|
2024-09-01 09:13:03 +00:00
|
|
|
// CreateTokenStreamCopyText will copy the text and then create token stream based on the text.
|
|
|
|
std::unique_ptr<TokenStream>
|
|
|
|
CreateTokenStreamCopyText(const std::string& text) {
|
|
|
|
auto shared_text = std::make_shared<std::string>(text);
|
|
|
|
auto token_stream =
|
|
|
|
tantivy_create_token_stream(ptr_, shared_text->c_str());
|
|
|
|
return std::make_unique<TokenStream>(token_stream, shared_text);
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
void* ptr_;
|
|
|
|
};
|
|
|
|
|
|
|
|
} // namespace milvus::tantivy
|