enhance: Optimize Chinese analyzer and support CnAlphaNumOnlyFilter (#37727)

relate: https://github.com/milvus-io/milvus/issues/35853

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
pull/37695/head
aoiasd 2024-11-16 10:28:30 +08:00 committed by GitHub
parent 0ba868ae64
commit 3b5a0df159
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 58 additions and 7 deletions

View File

@ -24,7 +24,7 @@ pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
} }
fn chinese_analyzer(stop_words: Vec<String>) -> TextAnalyzer{ fn chinese_analyzer(stop_words: Vec<String>) -> TextAnalyzer{
let builder = jieba_builder().filter(CnCharOnlyFilter); let builder = jieba_builder().filter(CnAlphaNumOnlyFilter);
if stop_words.len() > 0{ if stop_words.len() > 0{
return builder.filter(StopWordFilter::remove(stop_words)).build(); return builder.filter(StopWordFilter::remove(stop_words)).build();
} }
@ -275,17 +275,17 @@ mod tests {
}"#; }"#;
let tokenizer = create_tokenizer(&params.to_string()); let tokenizer = create_tokenizer(&params.to_string());
assert!(tokenizer.is_ok()); assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap().reason());
let mut bining = tokenizer.unwrap(); let mut bining = tokenizer.unwrap();
let regex = regex::Regex::new("\\p{Han}+").unwrap();
let mut stream = bining.token_stream("系统安全;,'';lxyz密码"); let mut stream = bining.token_stream("系统安全;,'';lxyz密码");
let mut results = Vec::<String>::new();
while stream.advance(){ while stream.advance(){
let token = stream.token(); let token = stream.token();
let text = token.text.clone(); results.push(token.text.clone());
print!("test token :{} symbol: {}\n", text.as_str(), regex.is_match(text.as_str()))
} }
print!("test tokens :{:?}\n", results)
} }
} }

View File

@ -11,6 +11,7 @@ pub(crate) enum SystemFilter{
AsciiFolding(AsciiFoldingFilter), AsciiFolding(AsciiFoldingFilter),
AlphaNumOnly(AlphaNumOnlyFilter), AlphaNumOnly(AlphaNumOnlyFilter),
CnCharOnly(CnCharOnlyFilter), CnCharOnly(CnCharOnlyFilter),
CnAlphaNumOnly(CnAlphaNumOnlyFilter),
Length(RemoveLongFilter), Length(RemoveLongFilter),
Stop(StopWordFilter), Stop(StopWordFilter),
Decompounder(SplitCompoundWords), Decompounder(SplitCompoundWords),
@ -24,6 +25,7 @@ impl SystemFilter{
Self::AsciiFolding(filter) => builder.filter(filter).dynamic(), Self::AsciiFolding(filter) => builder.filter(filter).dynamic(),
Self::AlphaNumOnly(filter) => builder.filter(filter).dynamic(), Self::AlphaNumOnly(filter) => builder.filter(filter).dynamic(),
Self::CnCharOnly(filter) => builder.filter(filter).dynamic(), Self::CnCharOnly(filter) => builder.filter(filter).dynamic(),
Self::CnAlphaNumOnly(filter) => builder.filter(filter).dynamic(),
Self::Length(filter) => builder.filter(filter).dynamic(), Self::Length(filter) => builder.filter(filter).dynamic(),
Self::Stop(filter) => builder.filter(filter).dynamic(), Self::Stop(filter) => builder.filter(filter).dynamic(),
Self::Decompounder(filter) => builder.filter(filter).dynamic(), Self::Decompounder(filter) => builder.filter(filter).dynamic(),
@ -129,6 +131,7 @@ impl From<&str> for SystemFilter{
"asciifolding" => Self::AsciiFolding(AsciiFoldingFilter), "asciifolding" => Self::AsciiFolding(AsciiFoldingFilter),
"alphanumonly" => Self::AlphaNumOnly(AlphaNumOnlyFilter), "alphanumonly" => Self::AlphaNumOnly(AlphaNumOnlyFilter),
"cncharonly" => Self::CnCharOnly(CnCharOnlyFilter), "cncharonly" => Self::CnCharOnly(CnCharOnlyFilter),
"cnalphanumonly" => Self::CnAlphaNumOnly(CnAlphaNumOnlyFilter),
_ => Self::Invalid, _ => Self::Invalid,
} }
} }
@ -205,3 +208,51 @@ impl<T: TokenStream> TokenStream for CnCharOnlyFilterStream<T> {
self.tail.token_mut() self.tail.token_mut()
} }
} }
/// Token filter whose streams only yield tokens whose text matches the
/// pattern `[\p{Han}a-zA-Z0-9]+` (Han characters, ASCII letters, ASCII digits).
///
/// The filter itself carries no state; the pattern is attached when a token
/// stream is created.
pub struct CnAlphaNumOnlyFilter;
/// Token stream adapter that forwards tokens from an upstream stream and
/// skips those whose text does not match `regex`.
pub struct CnAlphaNumOnlyFilterStream<T> {
// Pattern every upstream token's text is tested against.
regex: regex::Regex,
// Upstream token stream being filtered.
tail: T,
}
impl TokenFilter for CnAlphaNumOnlyFilter {
    /// Tokenizer type produced by applying this filter to a tokenizer `T`.
    type Tokenizer<T: Tokenizer> = CnAlphaNumOnlyFilterWrapper<T>;

    /// Consumes the filter and wraps `tokenizer` so that the streams it
    /// produces go through the Chinese/alphanumeric token check.
    fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T> {
        let wrapped = CnAlphaNumOnlyFilterWrapper(tokenizer);
        wrapped
    }
}
/// Tokenizer adapter holding the inner tokenizer `T`; built by
/// `CnAlphaNumOnlyFilter::transform`.
#[derive(Clone)]
pub struct CnAlphaNumOnlyFilterWrapper<T>(T);
impl<T: Tokenizer> Tokenizer for CnAlphaNumOnlyFilterWrapper<T> {
    /// Filtered stream type wrapping the inner tokenizer's stream.
    type TokenStream<'a> = CnAlphaNumOnlyFilterStream<T::TokenStream<'a>>;

    /// Tokenizes `text` with the inner tokenizer and wraps the resulting
    /// stream so that only tokens matching `[\p{Han}a-zA-Z0-9]+` are yielded.
    ///
    /// The pattern is a constant, so it is compiled once per process and the
    /// compiled handle is cloned for each stream instead of recompiling the
    /// regex on every call (this method runs once per analyzed text).
    ///
    /// # Panics
    ///
    /// Panics on first use if the hard-coded pattern fails to compile, which
    /// would indicate a programming error rather than bad input.
    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        // Function-local static: shared by every instantiation of this
        // generic impl because it does not depend on `T`.
        static CN_ALPHA_NUM: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();

        let regex = CN_ALPHA_NUM
            .get_or_init(|| {
                regex::Regex::new(r"[\p{Han}a-zA-Z0-9]+")
                    .expect("hard-coded CnAlphaNumOnly pattern must be a valid regex")
            })
            // Cloning a compiled `Regex` does not recompile the pattern.
            .clone();

        CnAlphaNumOnlyFilterStream {
            regex,
            tail: self.0.token_stream(text),
        }
    }
}
impl<T: TokenStream> TokenStream for CnAlphaNumOnlyFilterStream<T> {
    /// Moves to the next upstream token accepted by the pattern.
    ///
    /// Returns `true` when positioned on an accepted token and `false` once
    /// the upstream stream is exhausted.
    ///
    /// NOTE(review): `is_match` is an unanchored search, so a token is
    /// accepted as soon as it contains at least one Han character or ASCII
    /// letter/digit; it does not have to consist solely of such characters.
    /// Confirm this is the intended meaning of "only".
    fn advance(&mut self) -> bool {
        loop {
            if !self.tail.advance() {
                // Upstream has no more tokens.
                return false;
            }
            let current = self.tail.token();
            if self.regex.is_match(&current.text) {
                return true;
            }
            // Rejected token: keep pulling from upstream.
        }
    }

    /// Borrows the token the upstream stream is currently positioned on.
    fn token(&self) -> &Token {
        let upstream = &self.tail;
        upstream.token()
    }

    /// Mutably borrows the token the upstream stream is currently positioned on.
    fn token_mut(&mut self) -> &mut Token {
        let upstream = &mut self.tail;
        upstream.token_mut()
    }
}