mirror of https://github.com/milvus-io/milvus.git

feat: support extend default dict for jieba tokenizer (#41360)

related: https://github.com/milvus-io/milvus/issues/41213
Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
branch: pull/41462/head

parent e615e3daa9
commit 11f2fae42e
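In brief: the jieba tokenizer's dict parameter now accepts "_extend_default_" alongside the existing "_default_", selecting a larger bundled system dictionary (dict.txt.big) instead of jieba-rs's stock one; custom words can still be listed in the same array. A hypothetical test sketch in the style of the test added at the bottom of this diff; the custom word is made up for illustration:

#[test]
fn example_extend_default_with_custom_word() {
    // Hypothetical, mirroring the test added in this patch: the extended
    // system dict can be combined with user-supplied custom words.
    let params = r#"{
        "type": "jieba",
        "dict": ["_extend_default_", "向量数据库"]
    }"#;
    let map = json::from_str::<json::Map<String, json::Value>>(params).unwrap();
    let tokenizer = JiebaTokenizer::from_json(&map);
    assert!(tokenizer.is_ok());
}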
.gitignore
@@ -100,6 +100,7 @@ cwrapper_rocksdb_build/
 # local file data
 **/data/*
+!**/tantivy-binding/src/analyzer/data/*

 internal/proto/**/*.pb.go
 pkg/streaming/**/*.pb.go
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/data/jieba/dict.txt.big (new vendored file, 584429 lines)
File diff suppressed because it is too large.
Note: the .gitignore exception added above (!**/tantivy-binding/src/analyzer/data/*) keeps this vendored data file tracked despite the blanket **/data/* ignore.
@@ -1,5 +1,6 @@
 use core::{option::Option::Some, result::Result::Ok};
 use jieba_rs;
+use std::io::BufReader;
 use lazy_static::lazy_static;
 use serde_json as json;
 use std::borrow::Cow;
@@ -11,6 +12,8 @@ lazy_static! {
     static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
 }

+static EXTEND_DEFAULT_DICT: &str = include_str!("../data/jieba/dict.txt.big");
+
 #[allow(dead_code)]
 #[derive(Clone)]
 pub enum JiebaMode {
@@ -49,7 +52,7 @@ impl TokenStream for JiebaTokenStream {
     }
 }

-fn get_jieba_dict(params: &json::Map<String, json::Value>) -> Result<(Vec<String>, bool)> {
+fn get_jieba_dict(params: &json::Map<String, json::Value>) -> Result<(Vec<String>, Option<String>)> {
     match params.get("dict") {
         Some(value) => {
             if !value.is_array() {
@@ -58,8 +61,8 @@ fn get_jieba_dict(params: &json::Map<String, json::Value>) -> Result<(Vec<String>, Option<String>)> {
                 )));
             }
             let mut dict = Vec::<String>::new();
-            let mut use_default = false;
-            // value
+            let mut system_dict = None;
+
             for word in value.as_array().unwrap() {
                 if !word.is_string() {
                     return Err(TantivyBindingError::InvalidArgument(format!(
@@ -67,17 +70,21 @@ fn get_jieba_dict(params: &json::Map<String, json::Value>) -> Result<(Vec<String>, Option<String>)> {
                     )));
                 }
                 let text = word.as_str().unwrap().to_string();
-                if text == "_default_" {
-                    use_default = true;
-                } else {
+                if text == "_default_" || text == "_extend_default_" {
+                    if system_dict.is_some() {
+                        return Err(TantivyBindingError::InvalidArgument(format!(
+                            "jieba tokenizer dict can only set one default dict"
+                        )));
+                    }
+                    system_dict = Some(text)
+                } else {
                     dict.push(text);
                 }
             }
-            Ok((dict, use_default))
+            Ok((dict, system_dict))
         }
         _ => {
-            // tokenizer.load_dict(dict)
-            Ok((vec![], true))
+            Ok((vec![], Some("_default_".to_string())))
         }
     }
 }
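The revised contract of get_jieba_dict: the dict array may mix custom words with at most one system-dict token ("_default_" or "_extend_default_"); listing two system tokens is rejected as InvalidArgument, and omitting "dict" entirely falls back to "_default_". A hedged sketch of that behavior, assuming it runs inside this crate where get_jieba_dict and its Result alias are visible; the demo wrapper is hypothetical:

fn demo() -> Result<()> {
    // One custom word plus the extended system dict: both are returned.
    let params = json::from_str::<json::Map<String, json::Value>>(
        r#"{"dict": ["自定义词", "_extend_default_"]}"#,
    ).unwrap();
    let (custom, system) = get_jieba_dict(&params)?;
    assert_eq!(custom, vec!["自定义词".to_string()]);
    assert_eq!(system, Some("_extend_default_".to_string()));

    // Two system tokens in one array are rejected.
    let bad = json::from_str::<json::Map<String, json::Value>>(
        r#"{"dict": ["_default_", "_extend_default_"]}"#,
    ).unwrap();
    assert!(get_jieba_dict(&bad).is_err());

    // Omitting "dict" selects the stock default dictionary.
    let empty = json::Map::new();
    assert_eq!(get_jieba_dict(&empty)?, (vec![], Some("_default_".to_string())));
    Ok(())
}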
@@ -129,13 +136,23 @@ impl<'a> JiebaTokenizer<'a> {
     }

     pub fn from_json(params: &json::Map<String, json::Value>) -> Result<JiebaTokenizer<'a>> {
-        let mut tokenizer: jieba_rs::Jieba;
-        let (dict, use_default) = get_jieba_dict(params)?;
-        if use_default {
-            tokenizer = jieba_rs::Jieba::new()
-        } else {
-            tokenizer = jieba_rs::Jieba::empty()
-        }
+        let (dict, system_dict) = get_jieba_dict(params)?;
+        let mut tokenizer = system_dict.map_or(Ok(jieba_rs::Jieba::empty()), |name| {
+            match name.as_str() {
+                "_default_" => Ok(jieba_rs::Jieba::new()),
+                "_extend_default_" => {
+                    let mut buf = BufReader::new(EXTEND_DEFAULT_DICT.as_bytes());
+                    jieba_rs::Jieba::with_dict(&mut buf).map_err(|e|
+                        TantivyBindingError::InternalError(format!("failed to load extend default system dict: {}", e))
+                    )
+                },
+                _ => Err(TantivyBindingError::InternalError(format!(
+                    "invalid system dict name: {}",
+                    name
+                ))),
+            }
+        })?;

         for word in dict {
             tokenizer.add_word(word.as_str(), None, None);
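Design note: include_str! embeds the whole dict.txt.big into the compiled binary as a &'static str at build time, so loading the extended dictionary is pure in-memory parsing with no runtime file I/O. For readers unfamiliar with jieba-rs, a minimal standalone sketch of the same with_dict loading path, using a tiny inline dictionary in jieba's "word [frequency] [POS tag]" line format; the words and frequencies are made up for illustration:

use std::io::BufReader;

fn main() {
    // Two made-up entries in jieba dict format: word, frequency, POS tag.
    let dict = "向量数据库 100 n\n分词器 50 n\n";
    let mut reader = BufReader::new(dict.as_bytes());

    // Same jieba-rs API the patch uses for EXTEND_DEFAULT_DICT.
    let jieba = jieba_rs::Jieba::with_dict(&mut reader).expect("dict should parse");

    // Cut without HMM; both dictionary words should be recognized.
    let words = jieba.cut("向量数据库分词器", false);
    println!("{:?}", words);
}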
@@ -242,4 +259,27 @@ mod tests {

         print!("test tokens :{:?}\n", results)
     }
+
+    #[test]
+    fn test_jieba_tokenizer_with_extend_default_dict() {
+        let params = r#"{
+            "type": "jieba",
+            "dict": ["_extend_default_"]
+        }"#;
+        let json_param = json::from_str::<json::Map<String, json::Value>>(&params);
+        assert!(json_param.is_ok());
+
+        let tokenizer = JiebaTokenizer::from_json(&json_param.unwrap());
+        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
+        let mut bining = tokenizer.unwrap();
+        let mut stream = bining.token_stream("milvus結巴分詞器中文測試");
+
+        let mut results = Vec::<String>::new();
+        while stream.advance() {
+            let token = stream.token();
+            results.push(token.text.clone());
+        }
+
+        print!("test tokens :{:?}\n", results)
+    }
 }