mirror of https://github.com/milvus-io/milvus.git
enhance: pack analyzer code and support lindera tokenizer (#39660)
relate: https://github.com/milvus-io/milvus/issues/39659
Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
parent dd68814c15
commit 38f1608910
File diff suppressed because it is too large
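For orientation, the new `create_analyzer` entry point introduced in this diff takes a JSON analyzer config. A minimal in-crate sketch (not part of the commit, mirroring the lindera test added below; the `ipadic` dictionary requires the default `lindera-ipadic` feature):

use crate::analyzer::create_analyzer;
use crate::error::Result;

// Sketch: build an analyzer that uses the lindera tokenizer with the IPADIC
// dictionary and stream the tokens it produces.
fn lindera_analyzer_demo() -> Result<()> {
    let params = r#"{
        "tokenizer": {
            "type": "lindera",
            "dict_kind": "ipadic"
        }
    }"#;

    let mut analyzer = create_analyzer(params)?;
    let mut stream = analyzer.token_stream("東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です");
    while stream.advance() {
        println!("{}", stream.token().text);
    }
    Ok(())
}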
@@ -5,8 +5,17 @@ edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[features]
default = ["lindera-ipadic", "lindera-ko-dic", "lindera-cc-cedict"]
lindera-ipadic = ["lindera/ipadic"]
lindera-ipadic-neologd = ["lindera/ipadic-neologd"]
lindera-unidic = ["lindera/unidic"]
lindera-ko-dic = ["lindera/ko-dic"]
lindera-cc-cedict = ["lindera/cc-cedict"]

[dependencies]
tantivy = { git = "https://github.com/milvus-io/tantivy", tag = "v0.1.0" } # we carry a private fix for Milvus; remove this once the bug is fixed upstream.
lindera = "0.38.1"
futures = "0.3.21"
libc = "0.2"
scopeguard = "1.2"
@@ -342,11 +342,11 @@ bool tantivy_token_stream_advance(void *token_stream);

const char *tantivy_token_stream_get_token(void *token_stream);

RustResult tantivy_create_tokenizer(const char *analyzer_params);
RustResult tantivy_create_analyzer(const char *analyzer_params);

void *tantivy_clone_tokenizer(void *ptr);
void *tantivy_clone_analyzer(void *ptr);

void tantivy_free_tokenizer(void *tokenizer);
void tantivy_free_analyzer(void *tokenizer);

bool tantivy_index_exist(const char *path);
@@ -1,80 +1,19 @@
use log::warn;
use serde_json as json;
use std::collections::HashMap;
use tantivy::tokenizer::StopWordFilter;
use tantivy::tokenizer::*;
use serde_json as json;

use crate::error::Result;
use crate::error::TantivyBindingError;
use crate::jieba_tokenizer::JiebaTokenizer;
use crate::stop_words;
use crate::tokenizer_filter::*;
use crate::util::*;
use crate::analyzer::{
    build_in_analyzer::*,
    tokenizers::get_builder_with_tokenizer,
    filter::*,
    util::*
};

// default build-in analyzer
pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
    let builder = standard_builder().filter(LowerCaser);

    if stop_words.len() > 0 {
        return builder.filter(StopWordFilter::remove(stop_words)).build();
    }

    builder.build()
}

fn chinese_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
    let builder = jieba_builder().filter(CnAlphaNumOnlyFilter);
    if stop_words.len() > 0 {
        return builder.filter(StopWordFilter::remove(stop_words)).build();
    }

    builder.build()
}

fn english_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
    let builder = standard_builder()
        .filter(LowerCaser)
        .filter(Stemmer::new(Language::English))
        .filter(StopWordFilter::remove(
            stop_words::ENGLISH.iter().map(|&word| word.to_owned()),
        ));

    if stop_words.len() > 0 {
        return builder.filter(StopWordFilter::remove(stop_words)).build();
    }

    builder.build()
}

fn standard_builder() -> TextAnalyzerBuilder {
    TextAnalyzer::builder(SimpleTokenizer::default()).dynamic()
}

fn whitespace_builder() -> TextAnalyzerBuilder {
    TextAnalyzer::builder(WhitespaceTokenizer::default()).dynamic()
}

fn jieba_builder() -> TextAnalyzerBuilder {
    TextAnalyzer::builder(JiebaTokenizer::new()).dynamic()
}

fn get_builder_by_name(name: &String) -> Result<TextAnalyzerBuilder> {
    match name.as_str() {
        "standard" => Ok(standard_builder()),
        "whitespace" => Ok(whitespace_builder()),
        "jieba" => Ok(jieba_builder()),
        other => {
            warn!("unsupported tokenizer: {}", other);
            Err(TantivyBindingError::InternalError(format!(
                "unsupported tokenizer: {}",
                other
            )))
        }
    }
}

struct AnalyzerBuilder<'a> {
    // builder: TextAnalyzerBuilder
    filters: HashMap<String, SystemFilter>,
    params: &'a json::Map<String, json::Value>,
}
@@ -87,20 +26,21 @@ impl AnalyzerBuilder<'_> {
        }
    }

    fn get_tokenizer_name(&self) -> Result<String> {
    fn get_tokenizer_params(&self) -> Result<&json::Value> {
        let tokenizer = self.params.get("tokenizer");
        if tokenizer.is_none() {
            return Err(TantivyBindingError::InternalError(format!(
                "tokenizer name or type must be set"
            )));
        }
        if !tokenizer.unwrap().is_string() {
            return Err(TantivyBindingError::InternalError(format!(
                "tokenizer name should be string"
            )));
        let value = tokenizer.unwrap();
        if value.is_object() || value.is_string() {
            return Ok(tokenizer.unwrap())
        }

        Ok(tokenizer.unwrap().as_str().unwrap().to_string())
        Err(TantivyBindingError::InternalError(format!(
            "tokenizer name should be string or dict"
        )))
    }

    fn add_custom_filter(
@@ -196,7 +136,7 @@ impl AnalyzerBuilder<'_> {
                let str_list = get_string_list(value, "filter stop_words")?;
                Ok(get_stop_words_list(str_list))
            }
            None => Ok(vec![]),
            _ => Ok(vec![]),
        }
    }
@@ -227,8 +167,8 @@ impl AnalyzerBuilder<'_> {
        };

        //build custom analyzer
        let tokenizer_name = self.get_tokenizer_name()?;
        let mut builder = get_builder_by_name(&tokenizer_name)?;
        let tokenizer_params = self.get_tokenizer_params()?;
        let mut builder = get_builder_with_tokenizer(&tokenizer_params)?;

        // build with option
        builder = self.build_option(builder)?;
@@ -236,7 +176,7 @@ impl AnalyzerBuilder<'_> {
    }
}

pub(crate) fn create_tokenizer_with_filter(params: &String) -> Result<TextAnalyzer> {
pub(crate) fn create_analyzer_with_filter(params: &String) -> Result<TextAnalyzer> {
    match json::from_str::<json::Value>(&params) {
        Ok(value) => {
            if value.is_null() {
@@ -280,16 +220,16 @@ pub(crate) fn create_tokenizer_with_filter(params: &String) -> Result<TextAnalyz
    }
}

pub(crate) fn create_tokenizer(params: &str) -> Result<TextAnalyzer> {
pub(crate) fn create_analyzer(params: &str) -> Result<TextAnalyzer> {
    if params.len() == 0 {
        return Ok(standard_analyzer(vec![]));
    }
    create_tokenizer_with_filter(&format!("{{\"analyzer\":{}}}", params))
    create_analyzer_with_filter(&format!("{{\"analyzer\":{}}}", params))
}

#[cfg(test)]
mod tests {
    use crate::tokenizer::create_tokenizer;
    use crate::analyzer::analyzer::create_analyzer;

    #[test]
    fn test_standard_analyzer() {
@@ -298,7 +238,7 @@ mod tests {
            "stop_words": ["_english_"]
        }"#;

        let tokenizer = create_tokenizer(&params.to_string());
        let tokenizer = create_analyzer(&params.to_string());
        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
    }
@@ -308,7 +248,7 @@ mod tests {
            "type": "chinese"
        }"#;

        let tokenizer = create_tokenizer(&params.to_string());
        let tokenizer = create_analyzer(&params.to_string());
        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
        let mut binding = tokenizer.unwrap();
        let mut stream = binding.token_stream("系统安全;,'';lxyz密码");
@@ -321,4 +261,28 @@ mod tests {

        print!("test tokens :{:?}\n", results)
    }

    #[test]
    fn test_lindera_analyzer() {
        let params = r#"{
            "tokenizer": {
                "type": "lindera",
                "dict_kind": "ipadic"
            }
        }"#;

        let tokenizer = create_analyzer(&params.to_string());
        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());

        let mut binding = tokenizer.unwrap();
        let mut stream = binding.token_stream("東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です");

        let mut results = Vec::<String>::new();
        while stream.advance() {
            let token = stream.token();
            results.push(token.text.clone());
        }

        print!("test tokens :{:?}\n", results)
    }
}
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/build_in_analyzer.rs (new file, 40 lines)
@@ -0,0 +1,40 @@
use tantivy::tokenizer::*;

use crate::analyzer::tokenizers::*;
use crate::analyzer::filter::*;
use crate::analyzer::stop_words;

// default build-in analyzer
pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
    let builder = standard_builder().filter(LowerCaser);

    if stop_words.len() > 0 {
        return builder.filter(StopWordFilter::remove(stop_words)).build();
    }

    builder.build()
}

pub fn chinese_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
    let builder = jieba_builder().filter(CnAlphaNumOnlyFilter);
    if stop_words.len() > 0 {
        return builder.filter(StopWordFilter::remove(stop_words)).build();
    }

    builder.build()
}

pub fn english_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
    let builder = standard_builder()
        .filter(LowerCaser)
        .filter(Stemmer::new(Language::English))
        .filter(StopWordFilter::remove(
            stop_words::ENGLISH.iter().map(|&word| word.to_owned()),
        ));

    if stop_words.len() > 0 {
        return builder.filter(StopWordFilter::remove(stop_words)).build();
    }

    builder.build()
}
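As a quick usage sketch (not part of the diff), a test module appended to this file could exercise the jieba-backed Chinese analyzer directly; the input string is borrowed from the existing chinese test:

#[cfg(test)]
mod tests {
    use super::chinese_analyzer;

    #[test]
    fn chinese_analyzer_tokenizes() {
        // Build the built-in chinese analyzer with no extra stop words.
        let mut analyzer = chinese_analyzer(vec![]);
        let mut stream = analyzer.token_stream("系统安全;,'';lxyz密码");
        let mut tokens = Vec::new();
        while stream.advance() {
            tokens.push(stream.token().text.clone());
        }
        assert!(!tokens.is_empty());
    }
}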
@@ -2,9 +2,8 @@ use regex;
use serde_json as json;
use tantivy::tokenizer::*;

use crate::error::Result;
use crate::error::TantivyBindingError;
use crate::util::*;
use crate::error::{Result, TantivyBindingError};
use crate::analyzer::util::*;

pub(crate) enum SystemFilter {
    Invalid,

@@ -79,7 +78,7 @@ fn get_decompounder_filter(params: &json::Map<String, json::Value>) -> Result<Sy
    for element in stop_words {
        match element.as_str() {
            Some(word) => str_list.push(word.to_string()),
            None => {
            _ => {
                return Err(TantivyBindingError::InternalError(
                    "decompounder word list item should be string".to_string(),
                ))

@@ -114,12 +113,10 @@ fn get_stemmer_filter(params: &json::Map<String, json::Value>) -> Result<SystemF
}

trait LanguageParser {
    type Error;
    fn into_language(self) -> Result<Language>;
}

impl LanguageParser for &str {
    type Error = TantivyBindingError;
    fn into_language(self) -> Result<Language> {
        match self.to_lowercase().as_str() {
            "arabig" => Ok(Language::Arabic),
@@ -0,0 +1,9 @@
mod analyzer;
mod stop_words;
mod tokenizers;
mod build_in_analyzer;
mod filter;
mod util;

pub(crate) use self::analyzer::create_analyzer;
pub(crate) use self::build_in_analyzer::standard_analyzer;
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs (new file, 157 lines)
@@ -0,0 +1,157 @@

use core::result::Result::Err;
use log::warn;

use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use lindera::token::Token as LToken;
use lindera::tokenizer::{Tokenizer as LTokenizer, TokenizerBuilder};
use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
use tantivy::tokenizer::{Token, Tokenizer, TokenStream};

use serde_json as json;
use crate::error::{Result, TantivyBindingError};

pub struct LinderaTokenStream<'a> {
    pub tokens: Vec<LToken<'a>>,
    pub token: &'a mut Token,
}

impl<'a> TokenStream for LinderaTokenStream<'a> {
    fn advance(&mut self) -> bool {
        if self.tokens.is_empty() {
            return false;
        }
        let token = self.tokens.remove(0);
        self.token.text = token.text.to_string();
        self.token.offset_from = token.byte_start;
        self.token.offset_to = token.byte_end;
        self.token.position = token.position;
        self.token.position_length = token.position_length;

        true
    }

    fn token(&self) -> &Token {
        self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        self.token
    }
}

#[derive(Clone)]
pub struct LinderaTokenizer {
    tokenizer: LTokenizer,
    token: Token,
}

impl LinderaTokenizer {
    /// Create a new `LinderaTokenizer`.
    /// This function creates a new `LinderaTokenizer` from the JSON params, loading the dictionary selected by `dict_kind`.
    pub fn from_json(params: &json::Map<String, json::Value>) -> Result<LinderaTokenizer> {
        let kind = fetch_lindera_kind(params)?;
        let dictionary = load_dictionary_from_kind(kind);
        if dictionary.is_err() {
            return Err(TantivyBindingError::InvalidArgument(format!(
                "lindera tokenizer with invalid dict_kind"
            )));
        }
        let segmenter = Segmenter::new(Mode::Normal, dictionary.unwrap(), None);
        Ok(LinderaTokenizer::from_segmenter(segmenter))
    }

    /// Create a new `LinderaTokenizer`.
    /// This function will create a new `LinderaTokenizer` with the specified `lindera::segmenter::Segmenter`.
    pub fn from_segmenter(segmenter: lindera::segmenter::Segmenter) -> LinderaTokenizer {
        LinderaTokenizer {
            tokenizer: LTokenizer::new(segmenter),
            token: Default::default(),
        }
    }
}

impl Tokenizer for LinderaTokenizer {
    type TokenStream<'a> = LinderaTokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> LinderaTokenStream<'a> {
        self.token.reset();
        LinderaTokenStream {
            tokens: self.tokenizer.tokenize(text).unwrap(),
            token: &mut self.token,
        }
    }
}

trait DictionaryKindParser {
    fn into_dict_kind(self) -> Result<DictionaryKind>;
}

impl DictionaryKindParser for &str {
    fn into_dict_kind(self) -> Result<DictionaryKind> {
        match self {
            "ipadic" => Ok(DictionaryKind::IPADIC),
            "ipadic-neologd" => Ok(DictionaryKind::IPADICNEologd),
            "unidic" => Ok(DictionaryKind::UniDic),
            "ko-dic" => Ok(DictionaryKind::KoDic),
            "cc-cedict" => Ok(DictionaryKind::CcCedict),
            other => Err(TantivyBindingError::InvalidArgument(format!(
                "unsupported lindera dict type: {}",
                other
            )))
        }
    }
}

fn fetch_lindera_kind(params: &json::Map<String, json::Value>) -> Result<DictionaryKind> {
    match params.get("dict_kind") {
        Some(val) => {
            if !val.is_string() {
                return Err(TantivyBindingError::InvalidArgument(format!(
                    "lindera tokenizer dict kind should be string"
                )))
            }
            val.as_str().unwrap().into_dict_kind()
        },
        _ => {
            return Err(TantivyBindingError::InvalidArgument(format!(
                "lindera tokenizer dict_kind must be set"
            )))
        }
    }
}

#[cfg(test)]
mod tests {
    use serde_json as json;

    use crate::analyzer::tokenizers::lindera_tokenizer::LinderaTokenizer;

    #[test]
    fn test_lindera_tokenizer() {
        let params = r#"{
            "type": "lindera",
            "dict_kind": "ipadic"
        }"#;
        let json_param = json::from_str::<json::Map<String, json::Value>>(&params);
        assert!(json_param.is_ok());

        let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap());
        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
    }

    #[test]
    #[cfg(feature = "lindera-cc-cedict")]
    fn test_lindera_tokenizer_cc() {
        let params = r#"{
            "type": "lindera",
            "dict_kind": "cc-cedict"
        }"#;
        let json_param = json::from_str::<json::Map<String, json::Value>>(&params);
        assert!(json_param.is_ok());

        let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap());
        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
    }
}
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/mod.rs (new file, 7 lines)
@@ -0,0 +1,7 @@
mod tokenizer;
mod jieba_tokenizer;
mod lindera_tokenizer;

pub(crate) use self::tokenizer::*;
use self::jieba_tokenizer::JiebaTokenizer;
use self::lindera_tokenizer::LinderaTokenizer;
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/tokenizer.rs (new file, 73 lines)
@@ -0,0 +1,73 @@
use tantivy::tokenizer::{TextAnalyzer, TextAnalyzerBuilder};
use lindera::segmenter::Segmenter;
use tantivy::tokenizer::*;
use lindera::mode::Mode;
use serde_json as json;
use log::warn;

use crate::analyzer::tokenizers::{JiebaTokenizer, LinderaTokenizer};
use crate::error::{Result, TantivyBindingError};


pub fn standard_builder() -> TextAnalyzerBuilder {
    TextAnalyzer::builder(SimpleTokenizer::default()).dynamic()
}

pub fn whitespace_builder() -> TextAnalyzerBuilder {
    TextAnalyzer::builder(WhitespaceTokenizer::default()).dynamic()
}

pub fn jieba_builder() -> TextAnalyzerBuilder {
    TextAnalyzer::builder(JiebaTokenizer::new()).dynamic()
}

pub fn lindera_builder(params: Option<&json::Map<String, json::Value>>) -> Result<TextAnalyzerBuilder> {
    if params.is_none() {
        return Err(TantivyBindingError::InvalidArgument(format!(
            "lindera tokenizer must be custom"
        )))
    }
    let tokenizer = LinderaTokenizer::from_json(params.unwrap())?;
    Ok(TextAnalyzer::builder(tokenizer).dynamic())
}

pub fn get_builder_with_tokenizer(params: &json::Value) -> Result<TextAnalyzerBuilder> {
    let name;
    let params_map;
    if params.is_string() {
        name = params.as_str().unwrap();
        params_map = None;
    } else {
        let m = params.as_object().unwrap();
        match m.get("type") {
            Some(val) => {
                if !val.is_string() {
                    return Err(TantivyBindingError::InvalidArgument(format!(
                        "tokenizer type should be string"
                    )))
                }
                name = val.as_str().unwrap();
            },
            _ => {
                return Err(TantivyBindingError::InvalidArgument(format!(
                    "custom tokenizer must set type"
                )))
            },
        }
        params_map = Some(m);
    }

    match name {
        "standard" => Ok(standard_builder()),
        "whitespace" => Ok(whitespace_builder()),
        "jieba" => Ok(jieba_builder()),
        "lindera" => lindera_builder(params_map),
        other => {
            warn!("unsupported tokenizer: {}", other);
            Err(TantivyBindingError::InvalidArgument(format!(
                "unsupported tokenizer: {}",
                other
            )))
        }
    }
}
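To make the dispatch above concrete, a test module like the following (a sketch, not part of the diff; the lindera case additionally needs an enabled ipadic dictionary feature) exercises the two parameter shapes `get_builder_with_tokenizer` accepts: a bare tokenizer name, or an object whose `type` field selects the tokenizer.

#[cfg(test)]
mod builder_tests {
    use serde_json::json;
    use super::get_builder_with_tokenizer;

    #[test]
    fn accepts_name_or_object() {
        // String form: just the tokenizer name.
        assert!(get_builder_with_tokenizer(&json!("whitespace")).is_ok());
        // Object form: "type" selects the tokenizer, extra keys (e.g. dict_kind) configure it.
        assert!(get_builder_with_tokenizer(&json!({"type": "lindera", "dict_kind": "ipadic"})).is_ok());
    }
}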
@@ -0,0 +1,45 @@
use serde_json as json;

use crate::error::{Result, TantivyBindingError};
use crate::analyzer::stop_words;

pub(crate) fn get_string_list(value: &json::Value, label: &str) -> Result<Vec<String>> {
    if !value.is_array() {
        return Err(TantivyBindingError::InternalError(
            format!("{} should be array", label).to_string(),
        ));
    }

    let stop_words = value.as_array().unwrap();
    let mut str_list = Vec::<String>::new();
    for element in stop_words {
        match element.as_str() {
            Some(word) => str_list.push(word.to_string()),
            _ => {
                return Err(TantivyBindingError::InternalError(
                    format!("{} list item should be string", label).to_string(),
                ))
            }
        }
    }
    Ok(str_list)
}

pub(crate) fn get_stop_words_list(str_list: Vec<String>) -> Vec<String> {
    let mut stop_words = Vec::new();
    for str in str_list {
        if str.len() > 0 && str.chars().nth(0).unwrap() == '_' {
            match str.as_str() {
                "_english_" => {
                    for word in stop_words::ENGLISH {
                        stop_words.push(word.to_string());
                    }
                    continue;
                }
                _other => {}
            }
        }
        stop_words.push(str);
    }
    stop_words
}
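For illustration (not part of the diff), a test appended to this util module shows that `get_stop_words_list` expands the special `_english_` entry into the bundled English stop-word list while passing other entries through unchanged:

#[cfg(test)]
mod stop_words_tests {
    use super::get_stop_words_list;

    #[test]
    fn expands_english_macro_entry() {
        let words = get_stop_words_list(vec!["_english_".to_string(), "foo".to_string()]);
        // Every entry of stop_words::ENGLISH is expanded in; "foo" passes through unchanged.
        assert!(words.contains(&"foo".to_string()));
        assert!(words.len() > 1);
    }
}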
@@ -5,7 +5,7 @@ use tantivy::{
};

use crate::error::Result;
use crate::{index_reader::IndexReaderWrapper, tokenizer::standard_analyzer};
use crate::{index_reader::IndexReaderWrapper, analyzer::standard_analyzer};

impl IndexReaderWrapper {
    // split the query string into multiple tokens using index's default tokenizer,
@@ -4,7 +4,7 @@ use libc::{c_char, c_void};

use crate::{
    array::RustResult, cstr_to_str, index_reader::IndexReaderWrapper, log::init_log,
    tokenizer::create_tokenizer,
    analyzer::create_analyzer,
};

#[no_mangle]

@@ -39,7 +39,7 @@ pub extern "C" fn tantivy_register_tokenizer(
    let real = ptr as *mut IndexReaderWrapper;
    let tokenizer_name = cstr_to_str!(tokenizer_name);
    let params = cstr_to_str!(analyzer_params);
    let analyzer = create_tokenizer(params);
    let analyzer = create_analyzer(params);
    match analyzer {
        Ok(text_analyzer) => unsafe {
            (*real).register_tokenizer(String::from(tokenizer_name), text_analyzer);
@@ -5,7 +5,8 @@ use crate::array::RustResult;
use crate::cstr_to_str;
use crate::index_writer::IndexWriterWrapper;
use crate::log::init_log;
use crate::tokenizer::create_tokenizer;
use crate::string_c::c_str_to_str;
use crate::analyzer::create_analyzer;
use crate::util::create_binding;

#[no_mangle]

@@ -23,7 +24,7 @@ pub extern "C" fn tantivy_create_text_writer(
    let path_str = cstr_to_str!(path);
    let tokenizer_name_str = cstr_to_str!(tokenizer_name);
    let params = cstr_to_str!(analyzer_params);
    let analyzer = create_tokenizer(params);
    let analyzer = create_analyzer(params);
    match analyzer {
        Ok(text_analyzer) => {
            let wrapper = IndexWriterWrapper::create_text_writer(
@@ -12,14 +12,11 @@ mod index_writer;
mod index_writer_c;
mod index_writer_text;
mod index_writer_text_c;
mod jieba_tokenizer;
mod log;
mod stop_words;
mod string_c;
mod token_stream_c;
mod tokenizer;
mod analyzer;
mod tokenizer_c;
mod tokenizer_filter;
mod util;
mod util_c;
mod vec_collector;
@@ -5,15 +5,15 @@ use crate::{
    array::RustResult,
    log::init_log,
    string_c::c_str_to_str,
    tokenizer::create_tokenizer,
    analyzer::create_analyzer,
    util::{create_binding, free_binding},
};

#[no_mangle]
pub extern "C" fn tantivy_create_tokenizer(analyzer_params: *const c_char) -> RustResult {
pub extern "C" fn tantivy_create_analyzer(analyzer_params: *const c_char) -> RustResult {
    init_log();
    let params = unsafe { c_str_to_str(analyzer_params).to_string() };
    let analyzer = create_tokenizer(&params);
    let analyzer = create_analyzer(&params);
    match analyzer {
        Ok(text_analyzer) => RustResult::from_ptr(create_binding(text_analyzer)),
        Err(err) => RustResult::from_error(format!(

@@ -25,13 +25,13 @@ pub extern "C" fn tantivy_create_tokenizer(analyzer_params: *const c_char) -> Ru
}

#[no_mangle]
pub extern "C" fn tantivy_clone_tokenizer(ptr: *mut c_void) -> *mut c_void {
pub extern "C" fn tantivy_clone_analyzer(ptr: *mut c_void) -> *mut c_void {
    let analyzer = ptr as *mut TextAnalyzer;
    let clone = unsafe { (*analyzer).clone() };
    create_binding(clone)
}

#[no_mangle]
pub extern "C" fn tantivy_free_tokenizer(tokenizer: *mut c_void) {
pub extern "C" fn tantivy_free_analyzer(tokenizer: *mut c_void) {
    free_binding::<TextAnalyzer>(tokenizer);
}
@@ -1,12 +1,7 @@
use serde_json as json;
use std::ffi::c_void;
use std::ops::Bound;
use tantivy::{directory::MmapDirectory, Index};

use crate::error::Result;
use crate::error::TantivyBindingError;
use crate::stop_words;

pub fn index_exist(path: &str) -> bool {
    let dir = MmapDirectory::open(path).unwrap();
    Index::exists(&dir).unwrap()

@@ -31,45 +26,4 @@ pub fn free_binding<T>(ptr: *mut c_void) {
    unsafe {
        drop(Box::from_raw(real));
    }
}

pub(crate) fn get_string_list(value: &json::Value, label: &str) -> Result<Vec<String>> {
    if !value.is_array() {
        return Err(TantivyBindingError::InternalError(
            format!("{} should be array", label).to_string(),
        ));
    }

    let stop_words = value.as_array().unwrap();
    let mut str_list = Vec::<String>::new();
    for element in stop_words {
        match element.as_str() {
            Some(word) => str_list.push(word.to_string()),
            None => {
                return Err(TantivyBindingError::InternalError(
                    format!("{} list item should be string", label).to_string(),
                ))
            }
        }
    }
    Ok(str_list)
}

pub(crate) fn get_stop_words_list(str_list: Vec<String>) -> Vec<String> {
    let mut stop_words = Vec::new();
    for str in str_list {
        if str.len() > 0 && str.chars().nth(0).unwrap() == '_' {
            match str.as_str() {
                "_english_" => {
                    for word in stop_words::ENGLISH {
                        stop_words.push(word.to_string());
                    }
                    continue;
                }
                _other => {}
            }
        }
        stop_words.push(str);
    }
    stop_words
}
}
@@ -15,7 +15,7 @@ struct Tokenizer {
    explicit Tokenizer(std::string&& params) {
        auto shared_params = std::make_shared<std::string>(std::move(params));
        auto res =
            RustResultWrapper(tantivy_create_tokenizer(shared_params->c_str()));
            RustResultWrapper(tantivy_create_analyzer(shared_params->c_str()));
        AssertInfo(res.result_->success,
                   "Tokenizer creation failed: {}",
                   res.result_->error);

@@ -27,7 +27,7 @@ struct Tokenizer {

    ~Tokenizer() {
        if (ptr_ != nullptr) {
            tantivy_free_tokenizer(ptr_);
            tantivy_free_analyzer(ptr_);
        }
    }

@@ -41,7 +41,7 @@ struct Tokenizer {

    std::unique_ptr<Tokenizer>
    Clone() {
        auto newptr = tantivy_clone_tokenizer(ptr_);
        auto newptr = tantivy_clone_analyzer(ptr_);
        return std::make_unique<milvus::tantivy::Tokenizer>(newptr);
    }