enhance: support using jieba tokenizer with custom dictionary (#39854)

related: https://github.com/milvus-io/milvus/issues/40168

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
aoiasd 2025-04-08 14:52:27 +08:00 committed by GitHub
parent 96eca2531f
commit 6f17720e4e
4 changed files with 183 additions and 18 deletions
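
With this change the jieba tokenizer accepts an optional custom dictionary. Based on the code and tests in this diff: `dict` is an array of words, where the sentinel `"_default_"` additionally loads jieba's built-in dictionary (omitting `dict` keeps the old default behavior); `mode` is `"exact"` or `"search"` (default `"search"`); `hmm` toggles HMM-based detection of unseen words (default `true`). An illustrative params map (the custom word is an example, not from the source):

    {
        "type": "jieba",
        "dict": ["_default_", "结巴分词器"],
        "mode": "exact",
        "hmm": false
    }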


@@ -163,7 +163,7 @@ impl AnalyzerBuilder<'_> {
            }
            return self.build_template(type_.as_str().unwrap());
        }
-       None => {}
+       _ => {}
    };

    //build custom analyzer


@@ -16,7 +16,7 @@ pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
}

pub fn chinese_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
-    let builder = jieba_builder().filter(CnAlphaNumOnlyFilter);
+    let builder = jieba_builder(None).unwrap().filter(CnAlphaNumOnlyFilter);
    if stop_words.len() > 0 {
        return builder.filter(StopWordFilter::remove(stop_words)).build();
    }


@@ -1,6 +1,11 @@
use core::{option::Option::Some, result::Result::Ok};
use jieba_rs;
-use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
+use lazy_static::lazy_static;
+use serde_json as json;
+use std::borrow::Cow;
+use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
+
+use crate::error::{Result, TantivyBindingError};

lazy_static! {
    static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
@@ -14,9 +19,10 @@ pub enum JiebaMode {
}

#[derive(Clone)]
-pub struct JiebaTokenizer{
+pub struct JiebaTokenizer<'a> {
    mode: JiebaMode,
    hmm: bool,
+    tokenizer: Cow<'a, jieba_rs::Jieba>,
}

pub struct JiebaTokenStream {
@@ -43,21 +49,120 @@ impl TokenStream for JiebaTokenStream {
    }
}

-impl JiebaTokenizer {
-    pub fn new() -> JiebaTokenizer{
-        JiebaTokenizer{mode: JiebaMode::Search, hmm: true}
+fn get_jieba_dict(params: &json::Map<String, json::Value>) -> Result<(Vec<String>, bool)> {
+    match params.get("dict") {
+        Some(value) => {
+            if !value.is_array() {
+                return Err(TantivyBindingError::InvalidArgument(format!(
+                    "jieba tokenizer dict must be array"
+                )));
+            }
+            let mut dict = Vec::<String>::new();
+            let mut use_default = false;
+            // collect custom words; the sentinel "_default_" enables the built-in dictionary
+            for word in value.as_array().unwrap() {
+                if !word.is_string() {
+                    return Err(TantivyBindingError::InvalidArgument(format!(
+                        "jieba tokenizer dict item must be string"
+                    )));
+                }
+                let text = word.as_str().unwrap().to_string();
+                if text == "_default_" {
+                    use_default = true;
+                } else {
+                    dict.push(text);
+                }
+            }
+            Ok((dict, use_default))
+        }
+        _ => {
+            // no "dict" param: no custom words, default dictionary enabled
+            Ok((vec![], true))
+        }
+    }
+}
+
+fn get_jieba_mode(params: &json::Map<String, json::Value>) -> Result<JiebaMode> {
+    match params.get("mode") {
+        Some(value) => {
+            if !value.is_string() {
+                return Err(TantivyBindingError::InvalidArgument(format!(
+                    "jieba tokenizer mode must be string"
+                )));
+            }
+            let mode = value.as_str().unwrap();
+            match mode {
+                "exact" => Ok(JiebaMode::Exact),
+                "search" => Ok(JiebaMode::Search),
+                _ => Err(TantivyBindingError::InvalidArgument(format!(
+                    "jieba tokenizer mode must be \"exact\" or \"search\""
+                ))),
+            }
+        }
+        _ => Ok(JiebaMode::Search),
+    }
+}
+
+fn get_jieba_hmm(params: &json::Map<String, json::Value>) -> Result<bool> {
+    match params.get("hmm") {
+        Some(value) => {
+            if !value.is_boolean() {
+                return Err(TantivyBindingError::InvalidArgument(format!(
+                    "jieba tokenizer hmm must be boolean"
+                )));
+            }
+            return Ok(value.as_bool().unwrap());
+        }
+        _ => Ok(true),
+    }
+}
+
+impl<'a> JiebaTokenizer<'a> {
+    pub fn new() -> JiebaTokenizer<'a> {
+        JiebaTokenizer {
+            mode: JiebaMode::Search,
+            hmm: true,
+            tokenizer: Cow::Borrowed(&JIEBA),
+        }
    }

-    fn tokenize(&self, text: &str) -> Vec<Token>{
+    pub fn from_json(params: &json::Map<String, json::Value>) -> Result<JiebaTokenizer<'a>> {
+        let mut tokenizer: jieba_rs::Jieba;
+        let (dict, use_default) = get_jieba_dict(params)?;
+        if use_default {
+            tokenizer = jieba_rs::Jieba::new()
+        } else {
+            tokenizer = jieba_rs::Jieba::empty()
+        }
+        for word in dict {
+            tokenizer.add_word(word.as_str(), None, None);
+        }
+
+        let mode = get_jieba_mode(params)?;
+        let hmm = get_jieba_hmm(params)?;
+        Ok(JiebaTokenizer {
+            mode: mode,
+            hmm: hmm,
+            tokenizer: Cow::Owned(tokenizer),
+        })
+    }
+
+    fn tokenize(&self, text: &str) -> Vec<Token> {
        let mut indices = text.char_indices().collect::<Vec<_>>();
        indices.push((text.len(), '\0'));
-        let ori_tokens = match self.mode{
+        let ori_tokens = match self.mode {
            JiebaMode::Exact => {
-                JIEBA.tokenize(text, jieba_rs::TokenizeMode::Default, self.hmm)
-            },
+                self.tokenizer
+                    .tokenize(text, jieba_rs::TokenizeMode::Default, self.hmm)
+            }
            JiebaMode::Search => {
-                JIEBA.tokenize(text, jieba_rs::TokenizeMode::Search, self.hmm)
-            },
+                self.tokenizer
+                    .tokenize(text, jieba_rs::TokenizeMode::Search, self.hmm)
+            }
        };

        let mut tokens = Vec::with_capacity(ori_tokens.len());
@@ -74,11 +179,67 @@ impl JiebaTokenizer {
    }
}

-impl Tokenizer for JiebaTokenizer {
+impl Tokenizer for JiebaTokenizer<'static> {
    type TokenStream<'a> = JiebaTokenStream;
    fn token_stream(&mut self, text: &str) -> JiebaTokenStream {
        let tokens = self.tokenize(text);
        JiebaTokenStream { tokens, index: 0 }
    }
}
+
+#[cfg(test)]
+mod tests {
+    use serde_json as json;
+
+    use crate::analyzer::tokenizers::jieba_tokenizer::JiebaTokenizer;
+    use tantivy::tokenizer::TokenStream;
+    use tantivy::tokenizer::Tokenizer;
+
+    #[test]
+    fn test_jieba_tokenizer() {
+        let params = r#"{
+            "type": "jieba"
+        }"#;
+        let json_param = json::from_str::<json::Map<String, json::Value>>(&params);
+        assert!(json_param.is_ok());
+
+        let tokenizer = JiebaTokenizer::from_json(&json_param.unwrap());
+        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
+
+        let mut binding = tokenizer.unwrap();
+        let mut stream = binding.token_stream("结巴分词器");
+        let mut results = Vec::<String>::new();
+        while stream.advance() {
+            let token = stream.token();
+            results.push(token.text.clone());
+        }
+        print!("test tokens: {:?}\n", results)
+    }
+
+    #[test]
+    fn test_jieba_tokenizer_with_dict() {
+        let params = r#"{
+            "type": "jieba",
+            "dict": ["结巴分词器"],
+            "mode": "exact",
+            "hmm": false
+        }"#;
+        let json_param = json::from_str::<json::Map<String, json::Value>>(&params);
+        assert!(json_param.is_ok());
+
+        let tokenizer = JiebaTokenizer::from_json(&json_param.unwrap());
+        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
+
+        let mut binding = tokenizer.unwrap();
+        let mut stream = binding.token_stream("milvus结巴分词器中文测试");
+        let mut results = Vec::<String>::new();
+        while stream.advance() {
+            let token = stream.token();
+            results.push(token.text.clone());
+        }
+        print!("test tokens: {:?}\n", results)
+    }
+}
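
For reference, `from_json` starts from `jieba_rs::Jieba::new()` when `"_default_"` is present (or `dict` is omitted) and from `jieba_rs::Jieba::empty()` otherwise, then registers each listed word. A minimal sketch using this file's types; the input text and the custom word "Milvus" are illustrative, not from the source:

    let params = json::from_str::<json::Map<String, json::Value>>(
        r#"{"dict": ["_default_", "Milvus"], "mode": "exact", "hmm": false}"#,
    )
    .unwrap();
    let mut tokenizer = JiebaTokenizer::from_json(&params).unwrap();
    // with the custom word registered on top of the default dictionary,
    // "Milvus" should come out as a single token
    let mut stream = tokenizer.token_stream("Milvus结巴分词器");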


@@ -14,8 +14,12 @@ pub fn whitespace_builder() -> TextAnalyzerBuilder {
    TextAnalyzer::builder(WhitespaceTokenizer::default()).dynamic()
}

-pub fn jieba_builder() -> TextAnalyzerBuilder {
-    TextAnalyzer::builder(JiebaTokenizer::new()).dynamic()
+pub fn jieba_builder(params: Option<&json::Map<String, json::Value>>) -> Result<TextAnalyzerBuilder> {
+    if params.is_none() {
+        return Ok(TextAnalyzer::builder(JiebaTokenizer::new()).dynamic());
+    }
+    let tokenizer = JiebaTokenizer::from_json(params.unwrap())?;
+    Ok(TextAnalyzer::builder(tokenizer).dynamic())
}
pub fn lindera_builder(
@@ -59,7 +63,7 @@ pub fn get_builder_with_tokenizer(params: &json::Value) -> Result<TextAnalyzerBuilder> {
    match name {
        "standard" => Ok(standard_builder()),
        "whitespace" => Ok(whitespace_builder()),
-        "jieba" => Ok(jieba_builder()),
+        "jieba" => jieba_builder(params_map),
        "lindera" => lindera_builder(params_map),
        other => {
            warn!("unsupported tokenizer: {}", other);
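
End to end, the new plumbing carries the dictionary from the analyzer params down to the tokenizer. A minimal sketch assuming the functions shown above; the word list and the `analyzer` variable name are illustrative:

    // "jieba" is dispatched to jieba_builder(params_map), which builds the
    // tokenizer via JiebaTokenizer::from_json
    let params: json::Value = json::from_str(
        r#"{
            "type": "jieba",
            "dict": ["_default_", "自定义词"],
            "mode": "search",
            "hmm": true
        }"#,
    )
    .unwrap();
    let analyzer = get_builder_with_tokenizer(&params).unwrap().build();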