mirror of https://github.com/milvus-io/milvus.git
enhance: Optimize chinese analyzer and support CnAlphaNumFilter (#37727)
relate: https://github.com/milvus-io/milvus/issues/35853
Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
pull/37695/head
parent
0ba868ae64
commit
3b5a0df159
|
@ -24,7 +24,7 @@ pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn chinese_analyzer(stop_words: Vec<String>) -> TextAnalyzer{
|
fn chinese_analyzer(stop_words: Vec<String>) -> TextAnalyzer{
|
||||||
let builder = jieba_builder().filter(CnCharOnlyFilter);
|
let builder = jieba_builder().filter(CnAlphaNumOnlyFilter);
|
||||||
if stop_words.len() > 0{
|
if stop_words.len() > 0{
|
||||||
return builder.filter(StopWordFilter::remove(stop_words)).build();
|
return builder.filter(StopWordFilter::remove(stop_words)).build();
|
||||||
}
|
}
|
||||||
|
@ -275,17 +275,17 @@ mod tests {
|
||||||
}"#;
|
}"#;
|
||||||
|
|
||||||
let tokenizer = create_tokenizer(¶ms.to_string());
|
let tokenizer = create_tokenizer(¶ms.to_string());
|
||||||
assert!(tokenizer.is_ok());
|
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap().reason());
|
||||||
let mut bining = tokenizer.unwrap();
|
let mut bining = tokenizer.unwrap();
|
||||||
|
|
||||||
let regex = regex::Regex::new("\\p{Han}+").unwrap();
|
|
||||||
|
|
||||||
let mut stream = bining.token_stream("系统安全;,'';lxyz密码");
|
let mut stream = bining.token_stream("系统安全;,'';lxyz密码");
|
||||||
|
|
||||||
|
let mut results = Vec::<String>::new();
|
||||||
while stream.advance(){
|
while stream.advance(){
|
||||||
let token = stream.token();
|
let token = stream.token();
|
||||||
let text = token.text.clone();
|
results.push(token.text.clone());
|
||||||
print!("test token :{} symbol: {}\n", text.as_str(), regex.is_match(text.as_str()))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
print!("test tokens :{:?}\n", results)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
|
@ -11,6 +11,7 @@ pub(crate) enum SystemFilter{
|
||||||
AsciiFolding(AsciiFoldingFilter),
|
AsciiFolding(AsciiFoldingFilter),
|
||||||
AlphaNumOnly(AlphaNumOnlyFilter),
|
AlphaNumOnly(AlphaNumOnlyFilter),
|
||||||
CnCharOnly(CnCharOnlyFilter),
|
CnCharOnly(CnCharOnlyFilter),
|
||||||
|
CnAlphaNumOnly(CnAlphaNumOnlyFilter),
|
||||||
Length(RemoveLongFilter),
|
Length(RemoveLongFilter),
|
||||||
Stop(StopWordFilter),
|
Stop(StopWordFilter),
|
||||||
Decompounder(SplitCompoundWords),
|
Decompounder(SplitCompoundWords),
|
||||||
|
@ -24,6 +25,7 @@ impl SystemFilter{
|
||||||
Self::AsciiFolding(filter) => builder.filter(filter).dynamic(),
|
Self::AsciiFolding(filter) => builder.filter(filter).dynamic(),
|
||||||
Self::AlphaNumOnly(filter) => builder.filter(filter).dynamic(),
|
Self::AlphaNumOnly(filter) => builder.filter(filter).dynamic(),
|
||||||
Self::CnCharOnly(filter) => builder.filter(filter).dynamic(),
|
Self::CnCharOnly(filter) => builder.filter(filter).dynamic(),
|
||||||
|
Self::CnAlphaNumOnly(filter) => builder.filter(filter).dynamic(),
|
||||||
Self::Length(filter) => builder.filter(filter).dynamic(),
|
Self::Length(filter) => builder.filter(filter).dynamic(),
|
||||||
Self::Stop(filter) => builder.filter(filter).dynamic(),
|
Self::Stop(filter) => builder.filter(filter).dynamic(),
|
||||||
Self::Decompounder(filter) => builder.filter(filter).dynamic(),
|
Self::Decompounder(filter) => builder.filter(filter).dynamic(),
|
||||||
|
@ -129,6 +131,7 @@ impl From<&str> for SystemFilter{
|
||||||
"asciifolding" => Self::AsciiFolding(AsciiFoldingFilter),
|
"asciifolding" => Self::AsciiFolding(AsciiFoldingFilter),
|
||||||
"alphanumonly" => Self::AlphaNumOnly(AlphaNumOnlyFilter),
|
"alphanumonly" => Self::AlphaNumOnly(AlphaNumOnlyFilter),
|
||||||
"cncharonly" => Self::CnCharOnly(CnCharOnlyFilter),
|
"cncharonly" => Self::CnCharOnly(CnCharOnlyFilter),
|
||||||
|
"cnalphanumonly" => Self::CnAlphaNumOnly(CnAlphaNumOnlyFilter),
|
||||||
_ => Self::Invalid,
|
_ => Self::Invalid,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -205,3 +208,51 @@ impl<T: TokenStream> TokenStream for CnCharOnlyFilterStream<T> {
|
||||||
self.tail.token_mut()
|
self.tail.token_mut()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Token filter that keeps only tokens whose text contains at least one Han
/// character, ASCII letter, or ASCII digit.
///
/// Applying it to a tokenizer yields a `CnAlphaNumOnlyFilterWrapper`, whose
/// token streams perform the actual check.
pub struct CnAlphaNumOnlyFilter;
|
||||||
|
|
||||||
|
pub struct CnAlphaNumOnlyFilterStream<T> {
|
||||||
|
regex: regex::Regex,
|
||||||
|
tail: T,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TokenFilter for CnAlphaNumOnlyFilter{
|
||||||
|
type Tokenizer<T: Tokenizer> = CnAlphaNumOnlyFilterWrapper<T>;
|
||||||
|
|
||||||
|
fn transform<T: Tokenizer>(self, tokenizer: T) -> CnAlphaNumOnlyFilterWrapper<T> {
|
||||||
|
CnAlphaNumOnlyFilterWrapper(tokenizer)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/// Tokenizer adapter created by `CnAlphaNumOnlyFilter::transform`.
///
/// Field `0` is the wrapped tokenizer whose output gets filtered.
#[derive(Clone)]
pub struct CnAlphaNumOnlyFilterWrapper<T>(T);
|
||||||
|
|
||||||
|
impl<T: Tokenizer> Tokenizer for CnAlphaNumOnlyFilterWrapper<T> {
|
||||||
|
type TokenStream<'a> = CnAlphaNumOnlyFilterStream<T::TokenStream<'a>>;
|
||||||
|
|
||||||
|
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||||
|
CnAlphaNumOnlyFilterStream {
|
||||||
|
regex: regex::Regex::new(r"[\p{Han}a-zA-Z0-9]+").unwrap(),
|
||||||
|
tail: self.0.token_stream(text),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: TokenStream> TokenStream for CnAlphaNumOnlyFilterStream<T> {
|
||||||
|
fn advance(&mut self) -> bool {
|
||||||
|
while self.tail.advance() {
|
||||||
|
if self.regex.is_match(&self.tail.token().text) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
false
|
||||||
|
}
|
||||||
|
|
||||||
|
fn token(&self) -> &Token {
|
||||||
|
self.tail.token()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn token_mut(&mut self) -> &mut Token {
|
||||||
|
self.tail.token_mut()
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue