mirror of https://github.com/milvus-io/milvus.git
enhance: support using the jieba tokenizer with a custom dictionary (#39854)

related: https://github.com/milvus-io/milvus/issues/40168

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>

parent 96eca2531f
commit 6f17720e4e
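For orientation before the diff: the new tokenizer params accept a "dict" array of user words (with the special marker "_default_" to also load jieba's built-in dictionary), a "mode" of "exact" or "search", and an "hmm" flag. A minimal sketch of such params being parsed, mirroring the tests added below (the concrete words are placeholders):

use serde_json as json;

fn main() {
    // "_default_" keeps the built-in dictionary alongside the user words;
    // "mode" defaults to "search" and "hmm" to true when omitted.
    let params = r#"{
        "type": "jieba",
        "dict": ["_default_", "结巴分词器"],
        "mode": "exact",
        "hmm": false
    }"#;
    let map = json::from_str::<json::Map<String, json::Value>>(params).unwrap();
    assert_eq!(map.get("mode").unwrap().as_str(), Some("exact"));
}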
@@ -163,7 +163,7 @@ impl AnalyzerBuilder<'_> {
                 }
                 return self.build_template(type_.as_str().unwrap());
             }
             None => {}
             _ => {}
         };

         //build custom analyzer
@@ -16,7 +16,7 @@ pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
 }

 pub fn chinese_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
-    let builder = jieba_builder().filter(CnAlphaNumOnlyFilter);
+    let builder = jieba_builder(None).unwrap().filter(CnAlphaNumOnlyFilter);
     if stop_words.len() > 0 {
         return builder.filter(StopWordFilter::remove(stop_words)).build();
     }
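Since jieba_builder now returns a Result, the built-in chinese_analyzer passes None and unwraps, preserving the previous default behavior. A hedged usage sketch, assuming tantivy's usual token_stream / advance / token flow:

fn demo() {
    // assumes this runs inside the analyzer module where chinese_analyzer is defined
    let mut analyzer = chinese_analyzer(vec!["的".to_string()]);
    let mut stream = analyzer.token_stream("结巴分词器的中文测试");
    while stream.advance() {
        println!("{}", stream.token().text);
    }
}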
@@ -1,6 +1,11 @@
 use core::{option::Option::Some, result::Result::Ok};
 use jieba_rs;
-use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
 use lazy_static::lazy_static;
+use serde_json as json;
+use std::borrow::Cow;
+use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
+
+use crate::error::{Result, TantivyBindingError};

 lazy_static! {
     static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
@@ -14,9 +19,10 @@ pub enum JiebaMode {
 }

 #[derive(Clone)]
-pub struct JiebaTokenizer {
+pub struct JiebaTokenizer<'a> {
     mode: JiebaMode,
     hmm: bool,
+    tokenizer: Cow<'a, jieba_rs::Jieba>,
 }

 pub struct JiebaTokenStream {
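Design note: the Cow<'a, jieba_rs::Jieba> field lets the default constructor keep borrowing the process-wide lazy_static JIEBA instance (no copy of the large dictionary), while the custom-dictionary path owns a purpose-built instance. A minimal sketch of the same borrow-or-own pattern, with placeholder names:

use std::borrow::Cow;

static DEFAULT_DICT: &str = "built-in dictionary";

fn pick_dict(custom: Option<String>) -> Cow<'static, str> {
    match custom {
        // custom dictionary: each instance owns its data
        Some(words) => Cow::Owned(words),
        // default: borrow the shared static, zero-copy
        None => Cow::Borrowed(DEFAULT_DICT),
    }
}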
@@ -43,21 +49,120 @@ impl TokenStream for JiebaTokenStream {
     }
 }

-impl JiebaTokenizer {
-    pub fn new() -> JiebaTokenizer {
-        JiebaTokenizer { mode: JiebaMode::Search, hmm: true }
+fn get_jieba_dict(params: &json::Map<String, json::Value>) -> Result<(Vec<String>, bool)> {
+    match params.get("dict") {
+        Some(value) => {
+            if !value.is_array() {
+                return Err(TantivyBindingError::InvalidArgument(format!(
+                    "jieba tokenizer dict must be array"
+                )));
+            }
+            let mut dict = Vec::<String>::new();
+            let mut use_default = false;
+            // collect user words; the marker "_default_" enables the built-in dictionary
+            for word in value.as_array().unwrap() {
+                if !word.is_string() {
+                    return Err(TantivyBindingError::InvalidArgument(format!(
+                        "jieba tokenizer dict item must be string"
+                    )));
+                }
+                let text = word.as_str().unwrap().to_string();
+                if text == "_default_" {
+                    use_default = true;
+                } else {
+                    dict.push(text);
+                }
+            }
+            Ok((dict, use_default))
+        }
+        _ => {
+            // no "dict" param: use the built-in dictionary only
+            Ok((vec![], true))
+        }
+    }
+}
+
+fn get_jieba_mode(params: &json::Map<String, json::Value>) -> Result<JiebaMode> {
+    match params.get("mode") {
+        Some(value) => {
+            if !value.is_string() {
+                return Err(TantivyBindingError::InvalidArgument(format!(
+                    "jieba tokenizer mode must be string"
+                )));
+            }
+
+            let mode = value.as_str().unwrap();
+            match mode {
+                "exact" => Ok(JiebaMode::Exact),
+                "search" => Ok(JiebaMode::Search),
+                _ => Err(TantivyBindingError::InvalidArgument(format!(
+                    "jieba tokenizer mode must be \"exact\" or \"search\""
+                ))),
+            }
+        }
+        _ => Ok(JiebaMode::Search),
+    }
+}
+
+fn get_jieba_hmm(params: &json::Map<String, json::Value>) -> Result<bool> {
+    match params.get("hmm") {
+        Some(value) => {
+            if !value.is_boolean() {
+                return Err(TantivyBindingError::InvalidArgument(format!(
+                    "jieba tokenizer hmm must be boolean"
+                )));
+            }
+
+            return Ok(value.as_bool().unwrap());
+        }
+        _ => Ok(true),
+    }
+}
+
+impl<'a> JiebaTokenizer<'a> {
+    pub fn new() -> JiebaTokenizer<'a> {
+        JiebaTokenizer {
+            mode: JiebaMode::Search,
+            hmm: true,
+            tokenizer: Cow::Borrowed(&JIEBA),
+        }
     }

-    fn tokenize(&self, text: &str) -> Vec<Token>{
+    pub fn from_json(params: &json::Map<String, json::Value>) -> Result<JiebaTokenizer<'a>> {
+        let mut tokenizer: jieba_rs::Jieba;
+        let (dict, use_default) = get_jieba_dict(params)?;
+        if use_default {
+            tokenizer = jieba_rs::Jieba::new()
+        } else {
+            tokenizer = jieba_rs::Jieba::empty()
+        }
+
+        for word in dict {
+            tokenizer.add_word(word.as_str(), None, None);
+        }
+
+        let mode = get_jieba_mode(params)?;
+        let hmm = get_jieba_hmm(params)?;
+
+        Ok(JiebaTokenizer {
+            mode: mode,
+            hmm: hmm,
+            tokenizer: Cow::Owned(tokenizer),
+        })
+    }
+
+    fn tokenize(&self, text: &str) -> Vec<Token> {
         let mut indices = text.char_indices().collect::<Vec<_>>();
         indices.push((text.len(), '\0'));
-        let ori_tokens = match self.mode{
+        let ori_tokens = match self.mode {
             JiebaMode::Exact => {
-                JIEBA.tokenize(text, jieba_rs::TokenizeMode::Default, self.hmm)
-            },
+                self.tokenizer
+                    .tokenize(text, jieba_rs::TokenizeMode::Default, self.hmm)
+            }
             JiebaMode::Search => {
-                JIEBA.tokenize(text, jieba_rs::TokenizeMode::Search, self.hmm)
-            },
+                self.tokenizer
+                    .tokenize(text, jieba_rs::TokenizeMode::Search, self.hmm)
+            }
         };

         let mut tokens = Vec::with_capacity(ori_tokens.len());
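In jieba_rs terms, from_json boils down to: start from Jieba::new() (full default dictionary) or Jieba::empty(), then add_word each user entry. A hedged sketch against jieba_rs's public API (the output depends on the dictionary, so the comment is indicative only):

use jieba_rs::Jieba;

fn main() {
    let mut jieba = Jieba::empty();               // no built-in dictionary
    jieba.add_word("结巴分词器", None, None);      // frequency and tag left to defaults
    let words = jieba.cut("milvus结巴分词器", false); // hmm disabled
    // Expect "结巴分词器" as one piece; text not covered by the
    // dictionary falls back to character/ASCII-run pieces.
    println!("{:?}", words);
}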
@@ -74,11 +179,67 @@ impl JiebaTokenizer {
     }
 }

-impl Tokenizer for JiebaTokenizer {
+impl Tokenizer for JiebaTokenizer<'static> {
     type TokenStream<'a> = JiebaTokenStream;

     fn token_stream(&mut self, text: &str) -> JiebaTokenStream {
         let tokens = self.tokenize(text);
         JiebaTokenStream { tokens, index: 0 }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use serde_json as json;
+
+    use crate::analyzer::tokenizers::jieba_tokenizer::JiebaTokenizer;
+    use tantivy::tokenizer::TokenStream;
+    use tantivy::tokenizer::Tokenizer;
+
+    #[test]
+    fn test_jieba_tokenizer() {
+        let params = r#"{
+            "type": "jieba"
+        }"#;
+        let json_param = json::from_str::<json::Map<String, json::Value>>(&params);
+        assert!(json_param.is_ok());
+
+        let tokenizer = JiebaTokenizer::from_json(&json_param.unwrap());
+        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
+        let mut binding = tokenizer.unwrap();
+        let mut stream = binding.token_stream("结巴分词器");
+
+        let mut results = Vec::<String>::new();
+        while stream.advance() {
+            let token = stream.token();
+            results.push(token.text.clone());
+        }
+
+        println!("test tokens: {:?}", results);
+    }
+
+    #[test]
+    fn test_jieba_tokenizer_with_dict() {
+        let params = r#"{
+            "type": "jieba",
+            "dict": ["结巴分词器"],
+            "mode": "exact",
+            "hmm": false
+        }"#;
+        let json_param = json::from_str::<json::Map<String, json::Value>>(&params);
+        assert!(json_param.is_ok());
+
+        let tokenizer = JiebaTokenizer::from_json(&json_param.unwrap());
+        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
+        let mut binding = tokenizer.unwrap();
+        let mut stream = binding.token_stream("milvus结巴分词器中文测试");
+
+        let mut results = Vec::<String>::new();
+        while stream.advance() {
+            let token = stream.token();
+            results.push(token.text.clone());
+        }
+
+        println!("test tokens: {:?}", results);
+    }
+}
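If the second test were to assert rather than print, a cautious check could be appended at its end (with "结巴分词器" in the custom dict, exact mode with hmm off should keep it as a single token; the segmentation of the surrounding text is dictionary-dependent):

assert!(results.contains(&"结巴分词器".to_string()));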
@@ -14,8 +14,12 @@ pub fn whitespace_builder() -> TextAnalyzerBuilder {
     TextAnalyzer::builder(WhitespaceTokenizer::default()).dynamic()
 }

-pub fn jieba_builder() -> TextAnalyzerBuilder {
-    TextAnalyzer::builder(JiebaTokenizer::new()).dynamic()
+pub fn jieba_builder(params: Option<&json::Map<String, json::Value>>) -> Result<TextAnalyzerBuilder> {
+    if params.is_none() {
+        return Ok(TextAnalyzer::builder(JiebaTokenizer::new()).dynamic());
+    }
+    let tokenizer = JiebaTokenizer::from_json(params.unwrap())?;
+    Ok(TextAnalyzer::builder(tokenizer).dynamic())
 }

 pub fn lindera_builder(
@@ -59,7 +63,7 @@ pub fn get_builder_with_tokenizer(params: &json::Value) -> Result<TextAnalyzerBuilder> {
     match name {
         "standard" => Ok(standard_builder()),
         "whitespace" => Ok(whitespace_builder()),
-        "jieba" => Ok(jieba_builder()),
+        "jieba" => jieba_builder(params_map),
         "lindera" => lindera_builder(params_map),
         other => {
             warn!("unsupported tokenizer: {}", other);
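End to end, the dispatch above lets a tokenizer config carry the new jieba options through to the analyzer builder. A hedged sketch (assuming serde_json's json! macro and that this runs inside the tantivy-binding crate, next to get_builder_with_tokenizer):

use serde_json as json;

fn jieba_with_dict() -> crate::error::Result<()> {
    let params = json::json!({
        "type": "jieba",
        "dict": ["_default_", "结巴分词器"],
        "mode": "search"
    });
    let builder = get_builder_with_tokenizer(&params)?;
    let _analyzer = builder.build();
    Ok(())
}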