From 3b5a0df15978525e911003f812a76e1018ab7a42 Mon Sep 17 00:00:00 2001 From: aoiasd <45024769+aoiasd@users.noreply.github.com> Date: Sat, 16 Nov 2024 10:28:30 +0800 Subject: [PATCH] enhance: Optimize chinese analyzer and support CnAlphaNumFilter (#37727) relate: https://github.com/milvus-io/milvus/issues/35853 Signed-off-by: aoiasd --- .../tantivy/tantivy-binding/src/tokenizer.rs | 14 ++--- .../tantivy-binding/src/tokenizer_filter.rs | 51 +++++++++++++++++++ 2 files changed, 58 insertions(+), 7 deletions(-) diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs index 7a45575854..0e915b56d9 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs @@ -24,7 +24,7 @@ pub(crate) fn standard_analyzer(stop_words: Vec) -> TextAnalyzer { } fn chinese_analyzer(stop_words: Vec) -> TextAnalyzer{ - let builder = jieba_builder().filter(CnCharOnlyFilter); + let builder = jieba_builder().filter(CnAlphaNumOnlyFilter); if stop_words.len() > 0{ return builder.filter(StopWordFilter::remove(stop_words)).build(); } @@ -275,17 +275,17 @@ mod tests { }"#; let tokenizer = create_tokenizer(¶ms.to_string()); - assert!(tokenizer.is_ok()); + assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap().reason()); let mut bining = tokenizer.unwrap(); - - let regex = regex::Regex::new("\\p{Han}+").unwrap(); - let mut stream = bining.token_stream("系统安全;,'';lxyz密码"); + + let mut results = Vec::::new(); while stream.advance(){ let token = stream.token(); - let text = token.text.clone(); - print!("test token :{} symbol: {}\n", text.as_str(), regex.is_match(text.as_str())) + results.push(token.text.clone()); } + + print!("test tokens :{:?}\n", results) } } \ No newline at end of file diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs index 52036a06f3..d426a1b1ef 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs @@ -11,6 +11,7 @@ pub(crate) enum SystemFilter{ AsciiFolding(AsciiFoldingFilter), AlphaNumOnly(AlphaNumOnlyFilter), CnCharOnly(CnCharOnlyFilter), + CnAlphaNumOnly(CnAlphaNumOnlyFilter), Length(RemoveLongFilter), Stop(StopWordFilter), Decompounder(SplitCompoundWords), @@ -24,6 +25,7 @@ impl SystemFilter{ Self::AsciiFolding(filter) => builder.filter(filter).dynamic(), Self::AlphaNumOnly(filter) => builder.filter(filter).dynamic(), Self::CnCharOnly(filter) => builder.filter(filter).dynamic(), + Self::CnAlphaNumOnly(filter) => builder.filter(filter).dynamic(), Self::Length(filter) => builder.filter(filter).dynamic(), Self::Stop(filter) => builder.filter(filter).dynamic(), Self::Decompounder(filter) => builder.filter(filter).dynamic(), @@ -129,6 +131,7 @@ impl From<&str> for SystemFilter{ "asciifolding" => Self::AsciiFolding(AsciiFoldingFilter), "alphanumonly" => Self::AlphaNumOnly(AlphaNumOnlyFilter), "cncharonly" => Self::CnCharOnly(CnCharOnlyFilter), + "cnalphanumonly" => Self::CnAlphaNumOnly(CnAlphaNumOnlyFilter), _ => Self::Invalid, } } @@ -201,6 +204,54 @@ impl TokenStream for CnCharOnlyFilterStream { self.tail.token() } + fn token_mut(&mut self) -> &mut Token { + self.tail.token_mut() + } +} + +pub struct CnAlphaNumOnlyFilter; + +pub struct CnAlphaNumOnlyFilterStream { + regex: regex::Regex, + tail: T, +} + +impl TokenFilter for CnAlphaNumOnlyFilter{ + type Tokenizer = CnAlphaNumOnlyFilterWrapper; + + fn transform(self, tokenizer: T) -> CnAlphaNumOnlyFilterWrapper { + CnAlphaNumOnlyFilterWrapper(tokenizer) + } +} +#[derive(Clone)] +pub struct CnAlphaNumOnlyFilterWrapper(T); + +impl Tokenizer for CnAlphaNumOnlyFilterWrapper { + type TokenStream<'a> = CnAlphaNumOnlyFilterStream>; + + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { + CnAlphaNumOnlyFilterStream { + regex: regex::Regex::new(r"[\p{Han}a-zA-Z0-9]+").unwrap(), + tail: self.0.token_stream(text), + } + } +} + +impl TokenStream for CnAlphaNumOnlyFilterStream { + fn advance(&mut self) -> bool { + while self.tail.advance() { + if self.regex.is_match(&self.tail.token().text) { + return true; + } + } + + false + } + + fn token(&self) -> &Token { + self.tail.token() + } + fn token_mut(&mut self) -> &mut Token { self.tail.token_mut() }