enhance: pack analyzer code and support lindera tokenizer (#39660)

relate: https://github.com/milvus-io/milvus/issues/39659

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
aoiasd 2025-02-24 12:13:55 +08:00 committed by GitHub
parent dd68814c15
commit 38f1608910
20 changed files with 1939 additions and 206 deletions
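Taken together, the diffs below rename the Rust-side tokenizer entry points to analyzer entry points and add a lindera-based tokenizer that is selected through the analyzer params. A minimal sketch of the new usage from inside the tantivy-binding crate, assuming the same JSON shape exercised by the tests further down:

use crate::analyzer::create_analyzer;

// "dict_kind" picks one of the dictionaries enabled via Cargo features
// (ipadic, ipadic-neologd, unidic, ko-dic, cc-cedict).
let params = r#"{
    "tokenizer": {
        "type": "lindera",
        "dict_kind": "ipadic"
    }
}"#;
let analyzer = create_analyzer(params).expect("valid analyzer params");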

File diff suppressed because it is too large.


@ -5,8 +5,17 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[features]
default = ["lindera-ipadic", "lindera-ko-dic", "lindera-cc-cedict"]
lindera-ipadic = ["lindera/ipadic"]
lindera-ipadic-neologd = ["lindera/ipadic-neologd"]
lindera-unidic = ["lindera/unidic"]
lindera-ko-dic = ["lindera/ko-dic"]
lindera-cc-cedict = ["lindera/cc-cedict"]
[dependencies]
tantivy = { git = "https://github.com/milvus-io/tantivy", tag = "v0.1.0" } # we made a private fix for milvus; this should be removed once milvus fixes the underlying bug.
lindera = "0.38.1"
futures = "0.3.21"
libc = "0.2"
scopeguard = "1.2"
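All three default dictionaries (ipadic, ko-dic, cc-cedict) are compiled in unless the defaults are overridden; a slimmer build can presumably pick a single dictionary with cargo's usual feature flags, e.g. cargo build --no-default-features --features lindera-unidic.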


@ -342,11 +342,11 @@ bool tantivy_token_stream_advance(void *token_stream);
const char *tantivy_token_stream_get_token(void *token_stream);
RustResult tantivy_create_tokenizer(const char *analyzer_params);
RustResult tantivy_create_analyzer(const char *analyzer_params);
void *tantivy_clone_tokenizer(void *ptr);
void *tantivy_clone_analyzer(void *ptr);
void tantivy_free_tokenizer(void *tokenizer);
void tantivy_free_analyzer(void *tokenizer);
bool tantivy_index_exist(const char *path);


@ -1,80 +1,19 @@
use log::warn;
use serde_json as json;
use std::collections::HashMap;
use tantivy::tokenizer::StopWordFilter;
use tantivy::tokenizer::*;
use serde_json as json;
use crate::error::Result;
use crate::error::TantivyBindingError;
use crate::jieba_tokenizer::JiebaTokenizer;
use crate::stop_words;
use crate::tokenizer_filter::*;
use crate::util::*;
use crate::analyzer::{
build_in_analyzer::*,
tokenizers::get_builder_with_tokenizer,
filter::*,
util::*
};
// default build-in analyzer
pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
let builder = standard_builder().filter(LowerCaser);
if stop_words.len() > 0 {
return builder.filter(StopWordFilter::remove(stop_words)).build();
}
builder.build()
}
fn chinese_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
let builder = jieba_builder().filter(CnAlphaNumOnlyFilter);
if stop_words.len() > 0 {
return builder.filter(StopWordFilter::remove(stop_words)).build();
}
builder.build()
}
fn english_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
let builder = standard_builder()
.filter(LowerCaser)
.filter(Stemmer::new(Language::English))
.filter(StopWordFilter::remove(
stop_words::ENGLISH.iter().map(|&word| word.to_owned()),
));
if stop_words.len() > 0 {
return builder.filter(StopWordFilter::remove(stop_words)).build();
}
builder.build()
}
fn standard_builder() -> TextAnalyzerBuilder {
TextAnalyzer::builder(SimpleTokenizer::default()).dynamic()
}
fn whitespace_builder() -> TextAnalyzerBuilder {
TextAnalyzer::builder(WhitespaceTokenizer::default()).dynamic()
}
fn jieba_builder() -> TextAnalyzerBuilder {
TextAnalyzer::builder(JiebaTokenizer::new()).dynamic()
}
fn get_builder_by_name(name: &String) -> Result<TextAnalyzerBuilder> {
match name.as_str() {
"standard" => Ok(standard_builder()),
"whitespace" => Ok(whitespace_builder()),
"jieba" => Ok(jieba_builder()),
other => {
warn!("unsupported tokenizer: {}", other);
Err(TantivyBindingError::InternalError(format!(
"unsupported tokenizer: {}",
other
)))
}
}
}
struct AnalyzerBuilder<'a> {
// builder: TextAnalyzerBuilder
filters: HashMap<String, SystemFilter>,
params: &'a json::Map<String, json::Value>,
}
@ -87,20 +26,21 @@ impl AnalyzerBuilder<'_> {
}
}
fn get_tokenizer_name(&self) -> Result<String>{
fn get_tokenizer_params(&self) -> Result<&json::Value>{
let tokenizer=self.params.get("tokenizer");
if tokenizer.is_none(){
return Err(TantivyBindingError::InternalError(format!(
"tokenizer name or type must be set"
)));
}
if !tokenizer.unwrap().is_string() {
return Err(TantivyBindingError::InternalError(format!(
"tokenizer name should be string"
)));
let value = tokenizer.unwrap();
if value.is_object() || value.is_string() {
return Ok(tokenizer.unwrap())
}
Ok(tokenizer.unwrap().as_str().unwrap().to_string())
Err(TantivyBindingError::InternalError(format!(
"tokenizer name should be string or dict"
)))
}
fn add_custom_filter(
@ -196,7 +136,7 @@ impl AnalyzerBuilder<'_> {
let str_list = get_string_list(value, "filter stop_words")?;
Ok(get_stop_words_list(str_list))
}
None => Ok(vec![]),
_ => Ok(vec![]),
}
}
@ -227,8 +167,8 @@ impl AnalyzerBuilder<'_> {
};
//build custom analyzer
let tokenizer_name = self.get_tokenizer_name()?;
let mut builder = get_builder_by_name(&tokenizer_name)?;
let tokenizer_params = self.get_tokenizer_params()?;
let mut builder = get_builder_with_tokenizer(&tokenizer_params)?;
// build with option
builder = self.build_option(builder)?;
@ -236,7 +176,7 @@ impl AnalyzerBuilder<'_> {
}
}
pub(crate) fn create_tokenizer_with_filter(params: &String) -> Result<TextAnalyzer> {
pub(crate) fn create_analyzer_with_filter(params: &String) -> Result<TextAnalyzer> {
match json::from_str::<json::Value>(&params) {
Ok(value) => {
if value.is_null() {
@ -280,16 +220,16 @@ pub(crate) fn create_tokenizer_with_filter(params: &String) -> Result<TextAnalyz
}
}
pub(crate) fn create_tokenizer(params: &str) -> Result<TextAnalyzer> {
pub(crate) fn create_analyzer(params: &str) -> Result<TextAnalyzer> {
if params.len() == 0 {
return Ok(standard_analyzer(vec![]));
}
create_tokenizer_with_filter(&format!("{{\"analyzer\":{}}}", params))
create_analyzer_with_filter(&format!("{{\"analyzer\":{}}}", params))
}
#[cfg(test)]
mod tests {
use crate::tokenizer::create_tokenizer;
use crate::analyzer::analyzer::create_analyzer;
#[test]
fn test_standard_analyzer() {
@ -298,7 +238,7 @@ mod tests {
"stop_words": ["_english_"]
}"#;
let tokenizer = create_tokenizer(&params.to_string());
let tokenizer = create_analyzer(&params.to_string());
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
}
@ -308,7 +248,7 @@ mod tests {
"type": "chinese"
}"#;
let tokenizer = create_tokenizer(&params.to_string());
let tokenizer = create_analyzer(&params.to_string());
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
let mut binding = tokenizer.unwrap();
let mut stream = binding.token_stream("系统安全;,'';lxyz密码");
@ -321,4 +261,28 @@ mod tests {
print!("test tokens :{:?}\n", results)
}
#[test]
fn test_lindera_analyzer() {
let params = r#"{
"tokenizer": {
"type": "lindera",
"dict_kind": "ipadic"
}
}"#;
let tokenizer = create_analyzer(&params.to_string());
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
let mut binding = tokenizer.unwrap();
let mut stream = binding.token_stream("東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です");
let mut results = Vec::<String>::new();
while stream.advance() {
let token = stream.token();
results.push(token.text.clone());
}
print!("test tokens :{:?}\n", results)
}
}
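Because get_tokenizer_params accepts either a string or an object, both of the following parameter styles should reach get_builder_with_tokenizer; a sketch, assuming the crate-internal API shown above:

use crate::analyzer::create_analyzer;

// shorthand: tokenizer given as a plain name
let by_name = create_analyzer(r#"{"tokenizer": "whitespace"}"#);
// full form: tokenizer given as an object with a "type" field
let by_object = create_analyzer(r#"{"tokenizer": {"type": "jieba"}}"#);
assert!(by_name.is_ok() && by_object.is_ok());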


@ -0,0 +1,40 @@
use tantivy::tokenizer::*;
use crate::analyzer::tokenizers::*;
use crate::analyzer::filter::*;
use crate::analyzer::stop_words;
// default build-in analyzer
pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
let builder = standard_builder().filter(LowerCaser);
if stop_words.len() > 0 {
return builder.filter(StopWordFilter::remove(stop_words)).build();
}
builder.build()
}
pub fn chinese_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
let builder = jieba_builder().filter(CnAlphaNumOnlyFilter);
if stop_words.len() > 0 {
return builder.filter(StopWordFilter::remove(stop_words)).build();
}
builder.build()
}
pub fn english_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
let builder = standard_builder()
.filter(LowerCaser)
.filter(Stemmer::new(Language::English))
.filter(StopWordFilter::remove(
stop_words::ENGLISH.iter().map(|&word| word.to_owned()),
));
if stop_words.len() > 0 {
return builder.filter(StopWordFilter::remove(stop_words)).build();
}
builder.build()
}
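These built-in analyzers back the type shortcut in the analyzer params; a minimal sketch of driving the re-exported standard_analyzer directly, following the token-stream pattern used in the tests:

use tantivy::tokenizer::TokenStream;
use crate::analyzer::standard_analyzer;

// lowercases input and strips the supplied stop words
let mut analyzer = standard_analyzer(vec!["of".to_string(), "the".to_string()]);
let mut stream = analyzer.token_stream("The state of the art");
let mut tokens = Vec::<String>::new();
while stream.advance() {
    tokens.push(stream.token().text.clone());
}
// expected: ["state", "art"]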


@ -2,9 +2,8 @@ use regex;
use serde_json as json;
use tantivy::tokenizer::*;
use crate::error::Result;
use crate::error::TantivyBindingError;
use crate::util::*;
use crate::error::{Result,TantivyBindingError};
use crate::analyzer::util::*;
pub(crate) enum SystemFilter {
Invalid,
@ -79,7 +78,7 @@ fn get_decompounder_filter(params: &json::Map<String, json::Value>) -> Result<Sy
for element in stop_words {
match element.as_str() {
Some(word) => str_list.push(word.to_string()),
None => {
_ => {
return Err(TantivyBindingError::InternalError(
"decompounder word list item should be string".to_string(),
))
@ -114,12 +113,10 @@ fn get_stemmer_filter(params: &json::Map<String, json::Value>) -> Result<SystemF
}
trait LanguageParser {
type Error;
fn into_language(self) -> Result<Language>;
}
impl LanguageParser for &str {
type Error = TantivyBindingError;
fn into_language(self) -> Result<Language> {
match self.to_lowercase().as_str() {
"arabig" => Ok(Language::Arabic),


@ -0,0 +1,9 @@
mod analyzer;
mod stop_words;
mod tokenizers;
mod build_in_analyzer;
mod filter;
mod util;
pub(crate) use self::analyzer::create_analyzer;
pub(crate) use self::build_in_analyzer::standard_analyzer;


@ -0,0 +1,157 @@
use core::result::Result::Err;
use log::warn;
use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use lindera::token::Token as LToken;
use lindera::tokenizer::{Tokenizer as LTokenizer, TokenizerBuilder};
use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
use tantivy::tokenizer::{Token, Tokenizer, TokenStream};
use serde_json as json;
use crate::error::{Result,TantivyBindingError};
pub struct LinderaTokenStream<'a> {
pub tokens: Vec<LToken<'a>>,
pub token: &'a mut Token,
}
impl<'a> TokenStream for LinderaTokenStream<'a> {
fn advance(&mut self) -> bool {
if self.tokens.is_empty() {
return false;
}
let token = self.tokens.remove(0);
self.token.text = token.text.to_string();
self.token.offset_from = token.byte_start;
self.token.offset_to = token.byte_end;
self.token.position = token.position;
self.token.position_length = token.position_length;
true
}
fn token(&self) -> &Token {
self.token
}
fn token_mut(&mut self) -> &mut Token {
self.token
}
}
#[derive(Clone)]
pub struct LinderaTokenizer {
tokenizer: LTokenizer,
token: Token,
}
impl LinderaTokenizer {
/// Create a new `LinderaTokenizer`.
/// This function will create a new `LinderaTokenizer` from the given JSON params, loading the dictionary named by `dict_kind`.
pub fn from_json(params: &json::Map<String, json::Value>) -> Result<LinderaTokenizer> {
let kind = fetch_lindera_kind(params)?;
let dictionary = load_dictionary_from_kind(kind);
if dictionary.is_err(){
return Err(TantivyBindingError::InvalidArgument(format!(
"lindera tokenizer with invalid dict_kind"
)));
}
let segmenter = Segmenter::new(Mode::Normal, dictionary.unwrap(), None);
Ok(LinderaTokenizer::from_segmenter(segmenter))
}
/// Create a new `LinderaTokenizer`.
/// This function will create a new `LinderaTokenizer` with the specified `lindera::segmenter::Segmenter`.
pub fn from_segmenter(segmenter: lindera::segmenter::Segmenter) -> LinderaTokenizer {
LinderaTokenizer {
tokenizer: LTokenizer::new(segmenter),
token: Default::default(),
}
}
}
impl Tokenizer for LinderaTokenizer {
type TokenStream<'a> = LinderaTokenStream<'a>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> LinderaTokenStream<'a> {
self.token.reset();
LinderaTokenStream {
tokens: self.tokenizer.tokenize(text).unwrap(),
token: &mut self.token,
}
}
}
trait DictionaryKindParser {
fn into_dict_kind(self) -> Result<DictionaryKind>;
}
impl DictionaryKindParser for &str{
fn into_dict_kind(self) -> Result<DictionaryKind> {
match self{
"ipadic" => Ok(DictionaryKind::IPADIC),
"ipadic-neologd" => Ok(DictionaryKind::IPADICNEologd),
"unidic" => Ok(DictionaryKind::UniDic),
"ko-dic" => Ok(DictionaryKind::KoDic),
"cc-cedict" => Ok(DictionaryKind::CcCedict),
other => Err(TantivyBindingError::InvalidArgument(format!(
"unsupported lindera dict type: {}",
other
)))
}
}
}
fn fetch_lindera_kind(params:&json::Map<String, json::Value>) -> Result<DictionaryKind>{
match params.get("dict_kind"){
Some(val) => {
if !val.is_string(){
return Err(TantivyBindingError::InvalidArgument(format!(
"lindera tokenizer dict kind should be string"
)))
}
val.as_str().unwrap().into_dict_kind()
},
_ => {
return Err(TantivyBindingError::InvalidArgument(format!(
"lindera tokenizer dict_kind must be set"
)))
}
}
}
#[cfg(test)]
mod tests {
use serde_json as json;
use crate::analyzer::tokenizers::lindera_tokenizer::LinderaTokenizer;
#[test]
fn test_lindera_tokenizer(){
let params = r#"{
"type": "lindera",
"dict_kind": "ipadic"
}"#;
let json_param = json::from_str::<json::Map<String, json::Value>>(&params);
assert!(json_param.is_ok());
let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap());
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
}
#[test]
#[cfg(feature = "lindera-cc-cedict")]
fn test_lindera_tokenizer_cc(){
let params = r#"{
"type": "lindera",
"dict_kind": "cc-cedict"
}"#;
let json_param = json::from_str::<json::Map<String, json::Value>>(&params);
assert!(json_param.is_ok());
let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap());
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
}
}
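The tokenizer can also be driven on its own through tantivy's Tokenizer trait; a sketch, assuming it runs next to the tests above and that the ipadic dictionary feature is enabled:

use serde_json as json;
use tantivy::tokenizer::{Tokenizer, TokenStream};
use crate::analyzer::tokenizers::lindera_tokenizer::LinderaTokenizer;

let params = json::from_str::<json::Map<String, json::Value>>(
    r#"{"type": "lindera", "dict_kind": "ipadic"}"#,
)
.unwrap();
let mut tokenizer = LinderaTokenizer::from_json(&params).expect("valid dict_kind");
let mut stream = tokenizer.token_stream("東京スカイツリー");
while stream.advance() {
    println!("{}", stream.token().text);
}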


@ -0,0 +1,7 @@
mod tokenizer;
mod jieba_tokenizer;
mod lindera_tokenizer;
pub(crate) use self::tokenizer::*;
use self::jieba_tokenizer::JiebaTokenizer;
use self::lindera_tokenizer::LinderaTokenizer;


@ -0,0 +1,73 @@
use tantivy::tokenizer::{TextAnalyzer, TextAnalyzerBuilder};
use lindera::segmenter::Segmenter;
use tantivy::tokenizer::*;
use lindera::mode::Mode;
use serde_json as json;
use log::warn;
use crate::analyzer::tokenizers::{JiebaTokenizer, LinderaTokenizer};
use crate::error::{Result,TantivyBindingError};
pub fn standard_builder() -> TextAnalyzerBuilder {
TextAnalyzer::builder(SimpleTokenizer::default()).dynamic()
}
pub fn whitespace_builder() -> TextAnalyzerBuilder {
TextAnalyzer::builder(WhitespaceTokenizer::default()).dynamic()
}
pub fn jieba_builder() -> TextAnalyzerBuilder {
TextAnalyzer::builder(JiebaTokenizer::new()).dynamic()
}
pub fn lindera_builder(params: Option<&json::Map<String, json::Value>>) -> Result<TextAnalyzerBuilder>{
if params.is_none(){
return Err(TantivyBindingError::InvalidArgument(format!(
"lindera tokenizer must be costum"
)))
}
let tokenizer = LinderaTokenizer::from_json(params.unwrap())?;
Ok(TextAnalyzer::builder(tokenizer).dynamic())
}
pub fn get_builder_with_tokenizer(params: &json::Value) -> Result<TextAnalyzerBuilder> {
let name;
let params_map;
if params.is_string(){
name = params.as_str().unwrap();
params_map = None;
}else{
let m = params.as_object().unwrap();
match m.get("type"){
Some(val) => {
if !val.is_string(){
return Err(TantivyBindingError::InvalidArgument(format!(
"tokenizer type should be string"
)))
}
name = val.as_str().unwrap();
},
_ => {
return Err(TantivyBindingError::InvalidArgument(format!(
"costum tokenizer must set type"
)))
},
}
params_map = Some(m);
}
match name {
"standard" => Ok(standard_builder()),
"whitespace" => Ok(whitespace_builder()),
"jieba" => Ok(jieba_builder()),
"lindera" => lindera_builder(params_map),
other => {
warn!("unsupported tokenizer: {}", other);
Err(TantivyBindingError::InvalidArgument(format!(
"unsupported tokenizer: {}",
other
)))
}
}
}
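get_builder_with_tokenizer is the single dispatch point for both the name form and the object form; a sketch of calling it directly with a parsed json::Value, assuming it runs inside the same module:

use serde_json as json;

// an object form must carry "type"; lindera additionally needs its own params
let value: json::Value =
    json::from_str(r#"{"type": "lindera", "dict_kind": "cc-cedict"}"#).unwrap();
let analyzer = get_builder_with_tokenizer(&value)
    .expect("supported tokenizer type")
    .build();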


@ -0,0 +1,45 @@
use serde_json as json;
use crate::error::{Result,TantivyBindingError};
use crate::analyzer::stop_words;
pub(crate) fn get_string_list(value: &json::Value, label: &str) -> Result<Vec<String>> {
if !value.is_array() {
return Err(TantivyBindingError::InternalError(
format!("{} should be array", label).to_string(),
));
}
let stop_words = value.as_array().unwrap();
let mut str_list = Vec::<String>::new();
for element in stop_words {
match element.as_str() {
Some(word) => str_list.push(word.to_string()),
_ => {
return Err(TantivyBindingError::InternalError(
format!("{} list item should be string", label).to_string(),
))
}
}
}
Ok(str_list)
}
pub(crate) fn get_stop_words_list(str_list: Vec<String>) -> Vec<String> {
let mut stop_words = Vec::new();
for str in str_list {
if str.len() > 0 && str.chars().nth(0).unwrap() == '_' {
match str.as_str() {
"_english_" => {
for word in stop_words::ENGLISH {
stop_words.push(word.to_string());
}
continue;
}
_other => {}
}
}
stop_words.push(str);
}
stop_words
}
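The underscore convention lets a stop_words list pull in a bundled set; a small sketch of what the helper above produces, assuming common words such as "the" are in stop_words::ENGLISH:

// "_english_" expands to the bundled English stop-word list,
// ordinary entries pass through unchanged.
let words = get_stop_words_list(vec!["_english_".to_string(), "milvus".to_string()]);
assert!(words.contains(&"milvus".to_string()));
assert!(words.contains(&"the".to_string())); // assumes "the" is in the bundled list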


@ -5,7 +5,7 @@ use tantivy::{
};
use crate::error::Result;
use crate::{index_reader::IndexReaderWrapper, tokenizer::standard_analyzer};
use crate::{index_reader::IndexReaderWrapper, analyzer::standard_analyzer};
impl IndexReaderWrapper {
// split the query string into multiple tokens using index's default tokenizer,


@ -4,7 +4,7 @@ use libc::{c_char, c_void};
use crate::{
array::RustResult, cstr_to_str, index_reader::IndexReaderWrapper, log::init_log,
tokenizer::create_tokenizer,
analyzer::create_analyzer,
};
#[no_mangle]
@ -39,7 +39,7 @@ pub extern "C" fn tantivy_register_tokenizer(
let real = ptr as *mut IndexReaderWrapper;
let tokenizer_name = cstr_to_str!(tokenizer_name);
let params = cstr_to_str!(analyzer_params);
let analyzer = create_tokenizer(params);
let analyzer = create_analyzer(params);
match analyzer {
Ok(text_analyzer) => unsafe {
(*real).register_tokenizer(String::from(tokenizer_name), text_analyzer);


@ -5,7 +5,8 @@ use crate::array::RustResult;
use crate::cstr_to_str;
use crate::index_writer::IndexWriterWrapper;
use crate::log::init_log;
use crate::tokenizer::create_tokenizer;
use crate::string_c::c_str_to_str;
use crate::analyzer::create_analyzer;
use crate::util::create_binding;
#[no_mangle]
@ -23,7 +24,7 @@ pub extern "C" fn tantivy_create_text_writer(
let path_str = cstr_to_str!(path);
let tokenizer_name_str = cstr_to_str!(tokenizer_name);
let params = cstr_to_str!(analyzer_params);
let analyzer = create_tokenizer(params);
let analyzer = create_analyzer(params);
match analyzer {
Ok(text_analyzer) => {
let wrapper = IndexWriterWrapper::create_text_writer(


@ -12,14 +12,11 @@ mod index_writer;
mod index_writer_c;
mod index_writer_text;
mod index_writer_text_c;
mod jieba_tokenizer;
mod log;
mod stop_words;
mod string_c;
mod token_stream_c;
mod tokenizer;
mod analyzer;
mod tokenizer_c;
mod tokenizer_filter;
mod util;
mod util_c;
mod vec_collector;


@ -5,15 +5,15 @@ use crate::{
array::RustResult,
log::init_log,
string_c::c_str_to_str,
tokenizer::create_tokenizer,
analyzer::create_analyzer,
util::{create_binding, free_binding},
};
#[no_mangle]
pub extern "C" fn tantivy_create_tokenizer(analyzer_params: *const c_char) -> RustResult {
pub extern "C" fn tantivy_create_analyzer(analyzer_params: *const c_char) -> RustResult {
init_log();
let params = unsafe { c_str_to_str(analyzer_params).to_string() };
let analyzer = create_tokenizer(&params);
let analyzer = create_analyzer(&params);
match analyzer {
Ok(text_analyzer) => RustResult::from_ptr(create_binding(text_analyzer)),
Err(err) => RustResult::from_error(format!(
@ -25,13 +25,13 @@ pub extern "C" fn tantivy_create_tokenizer(analyzer_params: *const c_char) -> Ru
}
#[no_mangle]
pub extern "C" fn tantivy_clone_tokenizer(ptr: *mut c_void) -> *mut c_void {
pub extern "C" fn tantivy_clone_analyzer(ptr: *mut c_void) -> *mut c_void {
let analyzer = ptr as *mut TextAnalyzer;
let clone = unsafe { (*analyzer).clone() };
create_binding(clone)
}
#[no_mangle]
pub extern "C" fn tantivy_free_tokenizer(tokenizer: *mut c_void) {
pub extern "C" fn tantivy_free_analyzer(tokenizer: *mut c_void) {
free_binding::<TextAnalyzer>(tokenizer);
}


@ -1,12 +1,7 @@
use serde_json as json;
use std::ffi::c_void;
use std::ops::Bound;
use tantivy::{directory::MmapDirectory, Index};
use crate::error::Result;
use crate::error::TantivyBindingError;
use crate::stop_words;
pub fn index_exist(path: &str) -> bool {
let dir = MmapDirectory::open(path).unwrap();
Index::exists(&dir).unwrap()
@ -31,45 +26,4 @@ pub fn free_binding<T>(ptr: *mut c_void) {
unsafe {
drop(Box::from_raw(real));
}
}
pub(crate) fn get_string_list(value: &json::Value, label: &str) -> Result<Vec<String>> {
if !value.is_array() {
return Err(TantivyBindingError::InternalError(
format!("{} should be array", label).to_string(),
));
}
let stop_words = value.as_array().unwrap();
let mut str_list = Vec::<String>::new();
for element in stop_words {
match element.as_str() {
Some(word) => str_list.push(word.to_string()),
None => {
return Err(TantivyBindingError::InternalError(
format!("{} list item should be string", label).to_string(),
))
}
}
}
Ok(str_list)
}
pub(crate) fn get_stop_words_list(str_list: Vec<String>) -> Vec<String> {
let mut stop_words = Vec::new();
for str in str_list {
if str.len() > 0 && str.chars().nth(0).unwrap() == '_' {
match str.as_str() {
"_english_" => {
for word in stop_words::ENGLISH {
stop_words.push(word.to_string());
}
continue;
}
_other => {}
}
}
stop_words.push(str);
}
stop_words
}
}


@ -15,7 +15,7 @@ struct Tokenizer {
explicit Tokenizer(std::string&& params) {
auto shared_params = std::make_shared<std::string>(std::move(params));
auto res =
RustResultWrapper(tantivy_create_tokenizer(shared_params->c_str()));
RustResultWrapper(tantivy_create_analyzer(shared_params->c_str()));
AssertInfo(res.result_->success,
"Tokenizer creation failed: {}",
res.result_->error);
@ -27,7 +27,7 @@ struct Tokenizer {
~Tokenizer() {
if (ptr_ != nullptr) {
tantivy_free_tokenizer(ptr_);
tantivy_free_analyzer(ptr_);
}
}
@ -41,7 +41,7 @@ struct Tokenizer {
std::unique_ptr<Tokenizer>
Clone() {
auto newptr = tantivy_clone_tokenizer(ptr_);
auto newptr = tantivy_clone_analyzer(ptr_);
return std::make_unique<milvus::tantivy::Tokenizer>(newptr);
}