mirror of https://github.com/milvus-io/milvus.git
feat: support extend default dict for jieba tokenizer (#41360)
relate: https://github.com/milvus-io/milvus/issues/41213 Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>pull/41462/head
parent
e615e3daa9
commit
11f2fae42e
|
|
@ -100,6 +100,7 @@ cwrapper_rocksdb_build/
|
|||
|
||||
# local file data
|
||||
**/data/*
|
||||
!**/tantivy-binding/src/analyzer/data/*
|
||||
|
||||
internal/proto/**/*.pb.go
|
||||
pkg/streaming/**/*.pb.go
|
||||
|
|
|
|||
584429
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/data/jieba/dict.txt.big
vendored
Normal file
584429
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/data/jieba/dict.txt.big
vendored
Normal file
File diff suppressed because it is too large
Load Diff
|
|
@ -1,5 +1,6 @@
|
|||
use core::{option::Option::Some, result::Result::Ok};
|
||||
use jieba_rs;
|
||||
use std::io::BufReader;
|
||||
use lazy_static::lazy_static;
|
||||
use serde_json as json;
|
||||
use std::borrow::Cow;
|
||||
|
|
@ -11,6 +12,8 @@ lazy_static! {
|
|||
static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
|
||||
}
|
||||
|
||||
/// Contents of the vendored `dict.txt.big` jieba dictionary, embedded into the
/// binary at compile time via `include_str!`.
///
/// `JiebaTokenizer::from_json` reads this text when the `dict` parameter
/// selects the `_extend_default_` system dictionary.
static EXTEND_DEFAULT_DICT: &str = include_str!("../data/jieba/dict.txt.big");
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Clone)]
|
||||
pub enum JiebaMode {
|
||||
|
|
@ -49,7 +52,7 @@ impl TokenStream for JiebaTokenStream {
|
|||
}
|
||||
}
|
||||
|
||||
fn get_jieba_dict(params: &json::Map<String, json::Value>) -> Result<(Vec<String>, bool)> {
|
||||
fn get_jieba_dict(params: &json::Map<String, json::Value>) -> Result<(Vec<String>, Option<String>)> {
|
||||
match params.get("dict") {
|
||||
Some(value) => {
|
||||
if !value.is_array() {
|
||||
|
|
@ -58,8 +61,8 @@ fn get_jieba_dict(params: &json::Map<String, json::Value>) -> Result<(Vec<String
|
|||
)));
|
||||
}
|
||||
let mut dict = Vec::<String>::new();
|
||||
let mut use_default = false;
|
||||
// value
|
||||
let mut system_dict = None;
|
||||
|
||||
for word in value.as_array().unwrap() {
|
||||
if !word.is_string() {
|
||||
return Err(TantivyBindingError::InvalidArgument(format!(
|
||||
|
|
@ -67,17 +70,21 @@ fn get_jieba_dict(params: &json::Map<String, json::Value>) -> Result<(Vec<String
|
|||
)));
|
||||
}
|
||||
let text = word.as_str().unwrap().to_string();
|
||||
if text == "_default_" {
|
||||
use_default = true;
|
||||
} else {
|
||||
if text == "_default_" || text == "_extend_default_" {
|
||||
if system_dict.is_some() {
|
||||
return Err(TantivyBindingError::InvalidArgument(format!(
|
||||
"jieba tokenizer dict can only set one default dict"
|
||||
)));
|
||||
}
|
||||
system_dict = Some(text)
|
||||
} else{
|
||||
dict.push(text);
|
||||
}
|
||||
}
|
||||
Ok((dict, use_default))
|
||||
Ok((dict, system_dict))
|
||||
}
|
||||
_ => {
|
||||
// tokenizer.load_dict(dict)
|
||||
Ok((vec![], true))
|
||||
Ok((vec![], Some("_default_".to_string())))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -129,13 +136,23 @@ impl<'a> JiebaTokenizer<'a> {
|
|||
}
|
||||
|
||||
pub fn from_json(params: &json::Map<String, json::Value>) -> Result<JiebaTokenizer<'a>> {
|
||||
let mut tokenizer: jieba_rs::Jieba;
|
||||
let (dict, use_default) = get_jieba_dict(params)?;
|
||||
if use_default {
|
||||
tokenizer = jieba_rs::Jieba::new()
|
||||
} else {
|
||||
tokenizer = jieba_rs::Jieba::empty()
|
||||
}
|
||||
let (dict, system_dict) = get_jieba_dict(params)?;
|
||||
|
||||
let mut tokenizer = system_dict.map_or(Ok(jieba_rs::Jieba::empty()), |name| {
|
||||
match name.as_str() {
|
||||
"_default_" => Ok(jieba_rs::Jieba::new()),
|
||||
"_extend_default_" => {
|
||||
let mut buf = BufReader::new(EXTEND_DEFAULT_DICT.as_bytes());
|
||||
jieba_rs::Jieba::with_dict(&mut buf).map_err(|e|
|
||||
TantivyBindingError::InternalError(format!("failed to load extend default system dict: {}", e))
|
||||
)
|
||||
},
|
||||
_ => Err(TantivyBindingError::InternalError(format!(
|
||||
"invalid system dict name: {}",
|
||||
name
|
||||
)))
|
||||
}
|
||||
})?;
|
||||
|
||||
for word in dict {
|
||||
tokenizer.add_word(word.as_str(), None, None);
|
||||
|
|
@ -242,4 +259,27 @@ mod tests {
|
|||
|
||||
print!("test tokens :{:?}\n", results)
|
||||
}
|
||||
|
||||
/// Builds a jieba tokenizer whose only `dict` entry is the `_extend_default_`
/// system dictionary and tokenizes a mixed Latin / traditional-Chinese string.
///
/// The test fails if the parameters do not parse, if the tokenizer cannot be
/// constructed from the embedded dictionary, or if no token is produced.
#[test]
fn test_jieba_tokenizer_with_extend_default_dict() {
    let params = r#"{
        "type": "jieba",
        "dict": ["_extend_default_"]
    }"#;
    let json_param = json::from_str::<json::Map<String, json::Value>>(params)
        .expect("test params must be valid JSON");

    // Surface the construction error through its Display form, as the
    // original assertion message did.
    let mut binding =
        JiebaTokenizer::from_json(&json_param).unwrap_or_else(|e| panic!("error: {}", e));
    let mut stream = binding.token_stream("milvus結巴分詞器中文測試");

    let mut results = Vec::<String>::new();
    while stream.advance() {
        results.push(stream.token().text.clone());
    }

    // The exact segmentation depends on the dictionary contents, so only
    // require that the non-empty input yields at least one token.
    assert!(
        !results.is_empty(),
        "expected at least one token from the extend default dict tokenizer"
    );
    println!("test tokens :{:?}", results);
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue