feat: tantivy tokenizer binding (#35801)

fix: #35800

---------

Signed-off-by: longjiquan <jiquan.long@zilliz.com>
Jiquan Long 2024-09-01 17:13:03 +08:00 committed by GitHub
parent 1413ffe9b1
commit 5ea2454fdf
32 changed files with 1225 additions and 19 deletions

View File

@ -0,0 +1,39 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include "segcore/map_c.h"
#include <memory>
#include <map>
#include <string>
using Map = std::map<std::string, std::string>;
CMap
create_cmap() {
auto m = std::make_unique<Map>();
return m.release();
}
void
free_cmap(CMap m) {
delete static_cast<Map*>(m);
}
void
cmap_set(CMap m,
const char* key,
uint32_t key_len,
const char* value,
uint32_t value_len) {
auto mm = static_cast<Map*>(m);
(*mm)[std::string(key, key_len)] = std::string(value, value_len);
}

View File

@ -0,0 +1,37 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef void* CMap;
CMap
create_cmap();
void
free_cmap(CMap m);
void
cmap_set(CMap m,
const char* key,
uint32_t key_len,
const char* value,
uint32_t value_len);
#ifdef __cplusplus
}
#endif
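A minimal usage sketch of this C API (illustrative only; the `main` function below is hypothetical and not part of this change):

#include <cstring>
#include "segcore/map_c.h"

int main() {
    // Build a parameter map on the C side, then release it.
    CMap params = create_cmap();
    const char* key = "tokenizer";
    const char* value = "jieba";
    cmap_set(params,
             key, static_cast<uint32_t>(std::strlen(key)),
             value, static_cast<uint32_t>(std::strlen(value)));
    free_cmap(params);
    return 0;
}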

View File

@ -0,0 +1,38 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <stdlib.h>
#include <string.h>
#include "segcore/token_stream_c.h"
#include "token-stream.h"
void
free_token_stream(CTokenStream token_stream) {
delete static_cast<milvus::tantivy::TokenStream*>(token_stream);
}
bool
token_stream_advance(CTokenStream token_stream) {
return static_cast<milvus::tantivy::TokenStream*>(token_stream)->advance();
}
// Note: returned token must be freed by the caller using `free_token`.
const char*
token_stream_get_token(CTokenStream token_stream) {
return static_cast<milvus::tantivy::TokenStream*>(token_stream)
->get_token_no_copy();
}
void
free_token(void* token) {
free_rust_string(static_cast<const char*>(token));
}

View File

@ -0,0 +1,37 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include <stdint.h>
#include "map_c.h"
#include "common/type_c.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef void* CTokenStream;
void free_token_stream(CTokenStream);
bool token_stream_advance(CTokenStream);
// Note: the returned string must be freed by the caller using `free_token`.
const char* token_stream_get_token(CTokenStream);
void
free_token(void* token);
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,41 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include "segcore/tokenizer_c.h"
#include "common/EasyAssert.h"
#include "tokenizer.h"
using Map = std::map<std::string, std::string>;
CStatus
create_tokenizer(CMap m, CTokenizer* tokenizer) {
try {
auto mm = reinterpret_cast<Map*>(m);
auto impl = std::make_unique<milvus::tantivy::Tokenizer>(*mm);
*tokenizer = impl.release();
return milvus::SuccessCStatus();
} catch (std::exception& e) {
return milvus::FailureCStatus(&e);
}
}
void
free_tokenizer(CTokenizer tokenizer) {
auto impl = reinterpret_cast<milvus::tantivy::Tokenizer*>(tokenizer);
delete impl;
}
CTokenStream
create_token_stream(CTokenizer tokenizer, const char* text, uint32_t text_len) {
auto impl = reinterpret_cast<milvus::tantivy::Tokenizer*>(tokenizer);
return impl->CreateTokenStream(std::string(text, text_len)).release();
}

View File

@ -0,0 +1,37 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include <stdint.h>
#include "segcore/map_c.h"
#include "segcore/token_stream_c.h"
#include "common/type_c.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef void* CTokenizer;
CStatus
create_tokenizer(CMap m, CTokenizer* tokenizer);
void
free_tokenizer(CTokenizer tokenizer);
CTokenStream
create_token_stream(CTokenizer tokenizer, const char* text, uint32_t text_len);
#ifdef __cplusplus
}
#endif
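Taken together with map_c.h and token_stream_c.h, the intended call sequence looks roughly like the sketch below (hypothetical `main`, not part of this change; freeing `error_msg` mirrors the Go-side HandleCStatus):

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include "segcore/map_c.h"
#include "segcore/tokenizer_c.h"
#include "segcore/token_stream_c.h"

int main() {
    // Configure a jieba tokenizer through the C map.
    CMap params = create_cmap();
    cmap_set(params, "tokenizer", 9, "jieba", 5);

    CTokenizer tokenizer = nullptr;
    CStatus status = create_tokenizer(params, &tokenizer);
    if (status.error_code != 0) {
        std::fprintf(stderr, "%s\n", status.error_msg);
        std::free(const_cast<char*>(status.error_msg));  // allocated by the callee
        free_cmap(params);
        return 1;
    }

    const char* text = "football, basketball, pingpang";
    CTokenStream stream =
        create_token_stream(tokenizer, text, static_cast<uint32_t>(std::strlen(text)));
    while (token_stream_advance(stream)) {
        const char* token = token_stream_get_token(stream);
        std::printf("%s\n", token);
        free_token(const_cast<char*>(token));  // caller owns the returned token
    }

    // Free in reverse order of creation.
    free_token_stream(stream);
    free_tokenizer(tokenizer);
    free_cmap(params);
    return 0;
}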

View File

@ -58,6 +58,13 @@ if (CMAKE_BUILD_TYPE STREQUAL "Debug")
add_link_options(-fno-stack-protector -fno-omit-frame-pointer -fno-var-tracking -fsanitize=address)
endif()
# TODO: move the tests below to ut.
option(BUILD_TANTIVY_WITH_UT "compile tantivy with ut" OFF)
if (BUILD_TANTIVY_WITH_UT)
message(STATUS "compile tantivy with ut")
add_executable(test_tantivy test.cpp)
target_link_libraries(test_tantivy
tantivy_binding
@ -78,4 +85,12 @@ target_link_libraries(ffi_demo
dl
)
add_executable(tokenizer_demo tokenizer_demo.cpp)
target_link_libraries(tokenizer_demo
tantivy_binding
dl
)
else ()
endif ()
set( TANTIVY_INCLUDE_DIR ${LIB_HEADER_FOLDER};${CMAKE_CURRENT_SOURCE_DIR} CACHE INTERNAL "Path to tantivy include directory" )

View File

@ -0,0 +1,44 @@
#pragma once
#include <map>
#include <string>
#include "tantivy-binding.h"
#include "rust-binding.h"
namespace milvus::tantivy {
struct RustHashMap {
public:
NO_COPY_OR_ASSIGN(RustHashMap);
RustHashMap() {
ptr_ = create_hashmap();
}
~RustHashMap() {
if (ptr_ != nullptr) {
free_hashmap(ptr_);
}
}
void
from(const std::map<std::string, std::string>& m) {
for (const auto& [k, v] : m) {
set(k, v);
}
}
void*
get_pointer() {
return ptr_;
}
void
set(const std::string& k, const std::string& v) {
hashmap_set_value(ptr_, k.c_str(), v.c_str());
}
private:
void* ptr_ = nullptr;
};
} // namespace milvus::tantivy
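As an illustration, this wrapper lets C++ code hand a std::map to the Rust side without touching the FFI directly; a hedged sketch (the helper function below is hypothetical):

#include <map>
#include <string>
#include "rust-hashmap.h"

void* make_jieba_tokenizer() {
    std::map<std::string, std::string> params{{"tokenizer", "jieba"}};
    milvus::tantivy::RustHashMap m;
    m.from(params);  // copies the entries into the Rust-side HashMap
    // The Rust side reads the map during tokenizer construction and does not
    // retain it, so `m` may be freed (here, by its destructor) right after.
    return tantivy_create_tokenizer(m.get_pointer());  // caller frees via tantivy_free_tokenizer
}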

View File

@ -180,6 +180,15 @@ dependencies = [
"libc",
]
[[package]]
name = "cedarwood"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90"
dependencies = [
"smallvec",
]
[[package]]
name = "census"
version = "0.4.2"
@ -443,6 +452,15 @@ dependencies = [
"slab",
]
[[package]]
name = "fxhash"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
dependencies = [
"byteorder",
]
[[package]]
name = "generator"
version = "0.7.5"
@ -559,6 +577,21 @@ version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
[[package]]
name = "jieba-rs"
version = "0.6.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93f0c1347cd3ac8d7c6e3a2dc33ac496d365cf09fc0831aa61111e1a6738983e"
dependencies = [
"cedarwood",
"fxhash",
"hashbrown 0.14.3",
"lazy_static",
"phf",
"phf_codegen",
"regex",
]
[[package]]
name = "jobserver"
version = "0.1.28"
@ -754,6 +787,44 @@ dependencies = [
"stable_deref_trait",
]
[[package]]
name = "phf"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc"
dependencies = [
"phf_shared",
]
[[package]]
name = "phf_codegen"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a"
dependencies = [
"phf_generator",
"phf_shared",
]
[[package]]
name = "phf_generator"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0"
dependencies = [
"phf_shared",
"rand",
]
[[package]]
name = "phf_shared"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b"
dependencies = [
"siphasher",
]
[[package]]
name = "pin-project-lite"
version = "0.2.13"
@ -796,6 +867,21 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"rand_core",
]
[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
[[package]]
name = "rayon"
version = "1.10.0"
@ -953,6 +1039,12 @@ dependencies = [
"lazy_static",
]
[[package]]
name = "siphasher"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
[[package]]
name = "sketches-ddsketch"
version = "0.2.2"
@ -1070,10 +1162,12 @@ dependencies = [
"cbindgen",
"env_logger",
"futures",
"lazy_static",
"libc",
"log",
"scopeguard",
"tantivy",
"tantivy-jieba",
"zstd-sys",
]
@ -1126,6 +1220,17 @@ dependencies = [
"utf8-ranges",
]
[[package]]
name = "tantivy-jieba"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44022293c12a8f878e03439b2f11806d3d394130fe33d4e7781cba91abbac0a4"
dependencies = [
"jieba-rs",
"lazy_static",
"tantivy-tokenizer-api",
]
[[package]]
name = "tantivy-query-grammar"
version = "0.21.0"

View File

@ -13,6 +13,8 @@ scopeguard = "1.2"
zstd-sys = "=2.0.9"
env_logger = "0.11.3"
log = "0.4.21"
tantivy-jieba = "0.10.0"
lazy_static = "1.4.0"
[build-dependencies]
cbindgen = "0.26.0"

View File

@ -25,6 +25,12 @@ void free_rust_array(RustArray array);
void print_vector_of_strings(const char *const *ptr, uintptr_t len);
void *create_hashmap();
void hashmap_set_value(void *map, const char *key, const char *value);
void free_hashmap(void *map);
void *tantivy_load_index(const char *path);
void tantivy_free_index_reader(void *ptr);
@ -122,6 +128,20 @@ void tantivy_index_add_multi_keywords(void *ptr,
uintptr_t len,
int64_t offset);
void free_rust_string(const char *ptr);
void *tantivy_create_token_stream(void *tokenizer, const char *text);
void tantivy_free_token_stream(void *token_stream);
bool tantivy_token_stream_advance(void *token_stream);
const char *tantivy_token_stream_get_token(void *token_stream);
void *tantivy_create_tokenizer(void *tokenizer_params);
void tantivy_free_tokenizer(void *tokenizer);
bool tantivy_index_exist(const char *path);
} // extern "C"
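The create/free pairs above imply an ordering contract: a token stream borrows both its tokenizer and the input text (see the note in the Rust token stream binding below), so it must be freed before either of them. A hedged C++ sketch of the raw FFI flow (the function below is hypothetical):

#include <string>
#include "tantivy-binding.h"

void tokenize_once(const std::string& text) {
    void* params = create_hashmap();
    hashmap_set_value(params, "tokenizer", "default");
    void* tokenizer = tantivy_create_tokenizer(params);
    free_hashmap(params);  // the tokenizer does not retain the map
    if (tokenizer == nullptr) {
        return;  // invalid tokenizer parameters
    }

    void* stream = tantivy_create_token_stream(tokenizer, text.c_str());
    while (tantivy_token_stream_advance(stream)) {
        const char* token = tantivy_token_stream_get_token(stream);
        // ... use token ...
        free_rust_string(token);
    }

    tantivy_free_token_stream(stream);  // free the stream first
    tantivy_free_tokenizer(tokenizer);  // then the tokenizer; `text` outlives both
}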

View File

@ -0,0 +1,28 @@
use std::collections::HashMap;
use std::ffi::CStr;
use std::os::raw::c_char;
use libc::c_void;
use crate::util::{create_binding, free_binding};
#[no_mangle]
pub extern "C" fn create_hashmap() -> *mut c_void {
let map: HashMap<String, String> = HashMap::new();
create_binding(map)
}
#[no_mangle]
pub extern "C" fn hashmap_set_value(map: *mut c_void, key: *const c_char, value: *const c_char) {
let m = map as *mut HashMap<String, String>;
let k = unsafe { CStr::from_ptr(key).to_str().unwrap() };
let v = unsafe { CStr::from_ptr(value).to_str().unwrap() };
unsafe {
(*m).insert(String::from(k), String::from(v));
}
}
#[no_mangle]
pub extern "C" fn free_hashmap(map: *mut c_void) {
free_binding::<HashMap<String, String>>(map);
}

View File

@ -10,7 +10,6 @@ use tantivy::{doc, tokenizer, Document, Index, IndexWriter};
use crate::data_type::TantivyDataType;
use crate::index_reader::IndexReaderWrapper;
use crate::log::init_log;
pub(crate) struct IndexWriterWrapper {

View File

@ -2,6 +2,7 @@ mod array;
mod data_type;
mod demo_c;
mod docid_collector;
mod hashmap_c;
mod hashset_collector;
mod index_reader;
mod index_reader_c;
@ -9,6 +10,10 @@ mod index_writer;
mod index_writer_c;
mod linkedlist_collector;
mod log;
mod string_c;
mod token_stream_c;
mod tokenizer;
mod tokenizer_c;
mod util;
mod util_c;
mod vec_collector;

View File

@ -0,0 +1,22 @@
use std::ffi::{CStr, CString};
use libc::c_char;
use std::str;
// Be careful when using this function: the returned str borrows from the input,
// so the input must not be freed while the returned str is still in use.
pub(crate) unsafe fn c_str_to_str<'a>(s: *const c_char) -> &'a str {
let rs = CStr::from_ptr(s);
str::from_utf8_unchecked(rs.to_bytes())
}
pub(crate) fn create_string(s: &str) -> *const c_char {
CString::new(s).unwrap().into_raw()
}
#[no_mangle]
pub extern "C" fn free_rust_string(ptr: *const c_char) {
unsafe {
let _ = CString::from_raw(ptr as *mut c_char);
}
}

View File

@ -0,0 +1,40 @@
use std::ffi::c_char;
use libc::c_void;
use tantivy::tokenizer::{BoxTokenStream, TextAnalyzer};
use crate::string_c::c_str_to_str;
use crate::{
string_c::create_string,
util::{create_binding, free_binding},
};
// Note: the tokenizer and text must be released after the token_stream.
#[no_mangle]
pub extern "C" fn tantivy_create_token_stream(
tokenizer: *mut c_void,
text: *const c_char,
) -> *mut c_void {
let analyzer = tokenizer as *mut TextAnalyzer;
let token_stream = unsafe { (*analyzer).token_stream(c_str_to_str(text)) };
create_binding(token_stream)
}
#[no_mangle]
pub extern "C" fn tantivy_free_token_stream(token_stream: *mut c_void) {
free_binding::<BoxTokenStream<'_>>(token_stream);
}
#[no_mangle]
pub extern "C" fn tantivy_token_stream_advance(token_stream: *mut c_void) -> bool {
let real = token_stream as *mut BoxTokenStream<'_>;
unsafe { (*real).advance() }
}
// Note: the returned token must be released by calling `free_rust_string` after use.
#[no_mangle]
pub extern "C" fn tantivy_token_stream_get_token(token_stream: *mut c_void) -> *const c_char {
let real = token_stream as *mut BoxTokenStream<'_>;
let token = unsafe { (*real).token().text.as_str() };
create_string(token)
}

View File

@ -0,0 +1,34 @@
use lazy_static::lazy_static;
use log::info;
use std::collections::HashMap;
use tantivy::tokenizer::{TextAnalyzer, TokenizerManager};
lazy_static! {
static ref DEFAULT_TOKENIZER_MANAGER: TokenizerManager = TokenizerManager::default();
}
pub(crate) fn default_tokenizer() -> TextAnalyzer {
DEFAULT_TOKENIZER_MANAGER.get("default").unwrap()
}
fn jieba_tokenizer() -> TextAnalyzer {
tantivy_jieba::JiebaTokenizer {}.into()
}
pub(crate) fn create_tokenizer(params: &HashMap<String, String>) -> Option<TextAnalyzer> {
match params.get("tokenizer") {
Some(tokenizer_name) => match tokenizer_name.as_str() {
"default" => {
return Some(default_tokenizer());
}
"jieba" => return Some(jieba_tokenizer()),
_ => {
return None;
}
},
None => {
info!("no tokenizer is specific, use default tokenizer");
return Some(default_tokenizer());
}
}
}

View File

@ -0,0 +1,26 @@
use std::collections::HashMap;
use libc::c_void;
use tantivy::tokenizer::TextAnalyzer;
use crate::{
tokenizer::create_tokenizer,
util::{create_binding, free_binding},
};
#[no_mangle]
pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *mut c_void) -> *mut c_void {
let analyzer = unsafe {
let m = tokenizer_params as *const HashMap<String, String>;
create_tokenizer(&(*m))
};
match analyzer {
Some(text_analyzer) => create_binding(text_analyzer),
None => std::ptr::null_mut(),
}
}
#[no_mangle]
pub extern "C" fn tantivy_free_tokenizer(tokenizer: *mut c_void) {
free_binding::<TextAnalyzer>(tokenizer);
}

View File

@ -0,0 +1,50 @@
#pragma once
#include <assert.h>
#include <memory>
#include <string>
#include "tantivy-binding.h"
#include "rust-binding.h"
namespace milvus::tantivy {
struct TokenStream {
public:
NO_COPY_OR_ASSIGN(TokenStream);
TokenStream(void* ptr, std::shared_ptr<std::string> text)
: ptr_(ptr), text_(text) {
assert(ptr != nullptr);
}
~TokenStream() {
if (ptr_ != nullptr) {
tantivy_free_token_stream(ptr_);
}
}
public:
bool
advance() {
return tantivy_token_stream_advance(ptr_);
}
std::string
get_token() {
auto token = tantivy_token_stream_get_token(ptr_);
std::string s(token);
free_rust_string(token);
return s;
}
// Note: the returned token must be freed by calling `free_rust_string`.
const char*
get_token_no_copy() {
return tantivy_token_stream_get_token(ptr_);
}
public:
void* ptr_;
std::shared_ptr<std::string> text_;
};
} // namespace milvus::tantivy

View File

@ -0,0 +1,50 @@
#pragma once
#include <stdexcept>
#include "tantivy-binding.h"
#include "rust-binding.h"
#include "rust-hashmap.h"
#include "token-stream.h"
namespace milvus::tantivy {
struct Tokenizer {
public:
NO_COPY_OR_ASSIGN(Tokenizer);
explicit Tokenizer(const std::map<std::string, std::string>& params) {
RustHashMap m;
m.from(params);
ptr_ = tantivy_create_tokenizer(m.get_pointer());
if (ptr_ == nullptr) {
// throw a std::exception so create_tokenizer's catch (std::exception&) can
// turn it into a failure CStatus; a thrown string literal would escape it.
throw std::runtime_error("invalid tokenizer parameters");
}
}
~Tokenizer() {
if (ptr_ != nullptr) {
tantivy_free_tokenizer(ptr_);
}
}
std::unique_ptr<TokenStream>
CreateTokenStream(std::string&& text) {
auto shared_text = std::make_shared<std::string>(std::move(text));
auto token_stream =
tantivy_create_token_stream(ptr_, shared_text->c_str());
return std::make_unique<TokenStream>(token_stream, shared_text);
}
// CreateTokenStreamCopyText copies the text and then creates a token stream over the copy.
std::unique_ptr<TokenStream>
CreateTokenStreamCopyText(const std::string& text) {
auto shared_text = std::make_shared<std::string>(text);
auto token_stream =
tantivy_create_token_stream(ptr_, shared_text->c_str());
return std::make_unique<TokenStream>(token_stream, shared_text);
}
private:
void* ptr_;
};
} // namespace milvus::tantivy
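A short sketch of the difference between the two factory methods (hypothetical usage): CreateTokenStream moves the text into a shared_ptr held by the TokenStream, so the string stays alive exactly as long as the stream that borrows it, while CreateTokenStreamCopyText pays one copy to leave the caller's string untouched.

#include <string>
#include "tokenizer.h"

void demo(milvus::tantivy::Tokenizer& tokenizer, const std::string& s) {
    // Copying variant: `s` remains usable afterwards.
    auto copied = tokenizer.CreateTokenStreamCopyText(s);

    // Moving variant: cheaper, but the argument is consumed.
    std::string owned = s;
    auto moved = tokenizer.CreateTokenStream(std::move(owned));

    while (moved->advance()) {
        std::string token = moved->get_token();  // owned copy, no manual free needed
        // ... use token ...
    }
}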

View File

@ -0,0 +1,39 @@
#include <iostream>
#include "token-stream.h"
#include "tokenizer.h"
using Map = std::map<std::string, std::string>;
using namespace milvus::tantivy;
void
test_tokenizer(const Map& m, std::string&& text) {
Tokenizer tokenizer(m);
auto token_stream = tokenizer.CreateTokenStream(std::move(text));
while (token_stream->advance()) {
auto token = token_stream->get_token();
std::cout << token << std::endl;
}
}
int
main(int argc, char* argv[]) {
// default tokenizer
{
Map m;
test_tokenizer(m, "football, basketball, pingpang");
}
// jieba tokenizer
{
Map m;
std::string tokenizer_name = "jieba";
m["tokenizer"] = tokenizer_name;
test_tokenizer(m,
"张华考上了北京大学;李萍进了中等技术学校;我在百货公司"
"当售货员:我们都有光明的前途");
}
return 0;
}

View File

@ -0,0 +1,45 @@
package ctokenizer
/*
#cgo pkg-config: milvus_core
#include <stdlib.h> // free
#include "segcore/map_c.h"
*/
import "C"
import "unsafe"
type CMap struct {
ptr C.CMap
}
func NewCMap() *CMap {
return &CMap{
ptr: C.create_cmap(),
}
}
func (m *CMap) GetPointer() C.CMap {
return m.ptr
}
func (m *CMap) Set(key string, value string) {
cKey := C.CString(key)
defer C.free(unsafe.Pointer(cKey))
cValue := C.CString(value)
defer C.free(unsafe.Pointer(cValue))
C.cmap_set(m.ptr, cKey, (C.uint32_t)(len(key)), cValue, (C.uint32_t)(len(value)))
}
func (m *CMap) From(gm map[string]string) {
for k, v := range gm {
m.Set(k, v)
}
}
func (m *CMap) Destroy() {
if m.ptr != nil {
C.free_cmap(m.ptr)
}
}

View File

@ -0,0 +1,40 @@
package ctokenizer
/*
#cgo pkg-config: milvus_core
#include <stdlib.h> // free
#include "segcore/token_stream_c.h"
*/
import "C"
import (
"unsafe"
"github.com/milvus-io/milvus/internal/util/tokenizerapi"
)
var _ tokenizerapi.TokenStream = (*CTokenStream)(nil)
type CTokenStream struct {
ptr C.CTokenStream
}
func NewCTokenStream(ptr C.CTokenStream) *CTokenStream {
return &CTokenStream{
ptr: ptr,
}
}
func (impl *CTokenStream) Advance() bool {
return bool(C.token_stream_advance(impl.ptr))
}
func (impl *CTokenStream) Token() string {
token := C.token_stream_get_token(impl.ptr)
defer C.free_token(unsafe.Pointer(token))
return C.GoString(token)
}
func (impl *CTokenStream) Destroy() {
C.free_token_stream(impl.ptr)
}

View File

@ -0,0 +1,38 @@
package ctokenizer
/*
#cgo pkg-config: milvus_core
#include <stdlib.h> // free
#include "segcore/tokenizer_c.h"
#include "segcore/token_stream_c.h"
*/
import "C"
import (
"unsafe"
"github.com/milvus-io/milvus/internal/util/tokenizerapi"
)
var _ tokenizerapi.Tokenizer = (*CTokenizer)(nil)
type CTokenizer struct {
ptr C.CTokenizer
}
func NewCTokenizer(ptr C.CTokenizer) *CTokenizer {
return &CTokenizer{
ptr: ptr,
}
}
func (impl *CTokenizer) NewTokenStream(text string) tokenizerapi.TokenStream {
cText := C.CString(text)
defer C.free(unsafe.Pointer(cText))
ptr := C.create_token_stream(impl.ptr, cText, (C.uint32_t)(len(text)))
return NewCTokenStream(ptr)
}
func (impl *CTokenizer) Destroy() {
C.free_tokenizer(impl.ptr)
}

View File

@ -0,0 +1,27 @@
package ctokenizer
/*
#cgo pkg-config: milvus_core
#include <stdlib.h> // free
#include "segcore/tokenizer_c.h"
#include "segcore/token_stream_c.h"
*/
import "C"
import (
"github.com/milvus-io/milvus/internal/util/tokenizerapi"
)
func NewTokenizer(m map[string]string) (tokenizerapi.Tokenizer, error) {
mm := NewCMap()
defer mm.Destroy()
mm.From(m)
var ptr C.CTokenizer
status := C.create_tokenizer(mm.GetPointer(), &ptr)
if err := HandleCStatus(&status, "failed to create tokenizer"); err != nil {
return nil, err
}
return NewCTokenizer(ptr), nil
}

View File

@ -0,0 +1,39 @@
package ctokenizer
import (
"fmt"
"testing"
"github.com/stretchr/testify/assert"
)
func TestTokenizer(t *testing.T) {
// default tokenizer.
{
m := make(map[string]string)
tokenizer, err := NewTokenizer(m)
assert.NoError(t, err)
defer tokenizer.Destroy()
tokenStream := tokenizer.NewTokenStream("football, basketball, pingpang")
defer tokenStream.Destroy()
for tokenStream.Advance() {
fmt.Println(tokenStream.Token())
}
}
// jieba tokenizer.
{
m := make(map[string]string)
m["tokenizer"] = "jieba"
tokenizer, err := NewTokenizer(m)
assert.NoError(t, err)
defer tokenizer.Destroy()
tokenStream := tokenizer.NewTokenStream("张华考上了北京大学;李萍进了中等技术学校;我在百货公司当售货员:我们都有光明的前途")
defer tokenStream.Destroy()
for tokenStream.Advance() {
fmt.Println(tokenStream.Token())
}
}
}

View File

@ -0,0 +1,37 @@
package ctokenizer
/*
#cgo pkg-config: milvus_core
#include <stdlib.h> // free
#include "common/type_c.h"
*/
import "C"
import (
"fmt"
"unsafe"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/util/merr"
)
// HandleCStatus deals with the error returned from CGO
func HandleCStatus(status *C.CStatus, extraInfo string) error {
if status.error_code == 0 {
return nil
}
errorCode := int(status.error_code)
errorMsg := C.GoString(status.error_msg)
defer C.free(unsafe.Pointer(status.error_msg))
logMsg := fmt.Sprintf("%s, C Runtime Exception: %s\n", extraInfo, errorMsg)
log.Warn(logMsg)
if errorCode == 2003 {
return merr.WrapErrSegcoreUnsupported(int32(errorCode), logMsg)
}
if errorCode == 2033 {
log.Info("fake finished the task")
return merr.ErrSegcorePretendFinished
}
return merr.WrapErrSegcore(int32(errorCode), logMsg)
}

View File

@ -0,0 +1,146 @@
// Code generated by mockery v2.32.4. DO NOT EDIT.
package mocks
import mock "github.com/stretchr/testify/mock"
// TokenStream is an autogenerated mock type for the TokenStream type
type TokenStream struct {
mock.Mock
}
type TokenStream_Expecter struct {
mock *mock.Mock
}
func (_m *TokenStream) EXPECT() *TokenStream_Expecter {
return &TokenStream_Expecter{mock: &_m.Mock}
}
// Advance provides a mock function with given fields:
func (_m *TokenStream) Advance() bool {
ret := _m.Called()
var r0 bool
if rf, ok := ret.Get(0).(func() bool); ok {
r0 = rf()
} else {
r0 = ret.Get(0).(bool)
}
return r0
}
// TokenStream_Advance_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'Advance'
type TokenStream_Advance_Call struct {
*mock.Call
}
// Advance is a helper method to define mock.On call
func (_e *TokenStream_Expecter) Advance() *TokenStream_Advance_Call {
return &TokenStream_Advance_Call{Call: _e.mock.On("Advance")}
}
func (_c *TokenStream_Advance_Call) Run(run func()) *TokenStream_Advance_Call {
_c.Call.Run(func(args mock.Arguments) {
run()
})
return _c
}
func (_c *TokenStream_Advance_Call) Return(_a0 bool) *TokenStream_Advance_Call {
_c.Call.Return(_a0)
return _c
}
func (_c *TokenStream_Advance_Call) RunAndReturn(run func() bool) *TokenStream_Advance_Call {
_c.Call.Return(run)
return _c
}
// Destroy provides a mock function with given fields:
func (_m *TokenStream) Destroy() {
_m.Called()
}
// TokenStream_Destroy_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'Destroy'
type TokenStream_Destroy_Call struct {
*mock.Call
}
// Destroy is a helper method to define mock.On call
func (_e *TokenStream_Expecter) Destroy() *TokenStream_Destroy_Call {
return &TokenStream_Destroy_Call{Call: _e.mock.On("Destroy")}
}
func (_c *TokenStream_Destroy_Call) Run(run func()) *TokenStream_Destroy_Call {
_c.Call.Run(func(args mock.Arguments) {
run()
})
return _c
}
func (_c *TokenStream_Destroy_Call) Return() *TokenStream_Destroy_Call {
_c.Call.Return()
return _c
}
func (_c *TokenStream_Destroy_Call) RunAndReturn(run func()) *TokenStream_Destroy_Call {
_c.Call.Return(run)
return _c
}
// Token provides a mock function with given fields:
func (_m *TokenStream) Token() string {
ret := _m.Called()
var r0 string
if rf, ok := ret.Get(0).(func() string); ok {
r0 = rf()
} else {
r0 = ret.Get(0).(string)
}
return r0
}
// TokenStream_Token_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'Token'
type TokenStream_Token_Call struct {
*mock.Call
}
// Token is a helper method to define mock.On call
func (_e *TokenStream_Expecter) Token() *TokenStream_Token_Call {
return &TokenStream_Token_Call{Call: _e.mock.On("Token")}
}
func (_c *TokenStream_Token_Call) Run(run func()) *TokenStream_Token_Call {
_c.Call.Run(func(args mock.Arguments) {
run()
})
return _c
}
func (_c *TokenStream_Token_Call) Return(_a0 string) *TokenStream_Token_Call {
_c.Call.Return(_a0)
return _c
}
func (_c *TokenStream_Token_Call) RunAndReturn(run func() string) *TokenStream_Token_Call {
_c.Call.Return(run)
return _c
}
// NewTokenStream creates a new instance of TokenStream. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
// The first argument is typically a *testing.T value.
func NewTokenStream(t interface {
mock.TestingT
Cleanup(func())
}) *TokenStream {
mock := &TokenStream{}
mock.Mock.Test(t)
t.Cleanup(func() { mock.AssertExpectations(t) })
return mock
}

View File

@ -0,0 +1,111 @@
// Code generated by mockery v2.32.4. DO NOT EDIT.
package mocks
import (
tokenizerapi "github.com/milvus-io/milvus/internal/util/tokenizerapi"
mock "github.com/stretchr/testify/mock"
)
// Tokenizer is an autogenerated mock type for the Tokenizer type
type Tokenizer struct {
mock.Mock
}
type Tokenizer_Expecter struct {
mock *mock.Mock
}
func (_m *Tokenizer) EXPECT() *Tokenizer_Expecter {
return &Tokenizer_Expecter{mock: &_m.Mock}
}
// Destroy provides a mock function with given fields:
func (_m *Tokenizer) Destroy() {
_m.Called()
}
// Tokenizer_Destroy_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'Destroy'
type Tokenizer_Destroy_Call struct {
*mock.Call
}
// Destroy is a helper method to define mock.On call
func (_e *Tokenizer_Expecter) Destroy() *Tokenizer_Destroy_Call {
return &Tokenizer_Destroy_Call{Call: _e.mock.On("Destroy")}
}
func (_c *Tokenizer_Destroy_Call) Run(run func()) *Tokenizer_Destroy_Call {
_c.Call.Run(func(args mock.Arguments) {
run()
})
return _c
}
func (_c *Tokenizer_Destroy_Call) Return() *Tokenizer_Destroy_Call {
_c.Call.Return()
return _c
}
func (_c *Tokenizer_Destroy_Call) RunAndReturn(run func()) *Tokenizer_Destroy_Call {
_c.Call.Return(run)
return _c
}
// NewTokenStream provides a mock function with given fields: text
func (_m *Tokenizer) NewTokenStream(text string) tokenizerapi.TokenStream {
ret := _m.Called(text)
var r0 tokenizerapi.TokenStream
if rf, ok := ret.Get(0).(func(string) tokenizerapi.TokenStream); ok {
r0 = rf(text)
} else {
if ret.Get(0) != nil {
r0 = ret.Get(0).(tokenizerapi.TokenStream)
}
}
return r0
}
// Tokenizer_NewTokenStream_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'NewTokenStream'
type Tokenizer_NewTokenStream_Call struct {
*mock.Call
}
// NewTokenStream is a helper method to define mock.On call
// - text string
func (_e *Tokenizer_Expecter) NewTokenStream(text interface{}) *Tokenizer_NewTokenStream_Call {
return &Tokenizer_NewTokenStream_Call{Call: _e.mock.On("NewTokenStream", text)}
}
func (_c *Tokenizer_NewTokenStream_Call) Run(run func(text string)) *Tokenizer_NewTokenStream_Call {
_c.Call.Run(func(args mock.Arguments) {
run(args[0].(string))
})
return _c
}
func (_c *Tokenizer_NewTokenStream_Call) Return(_a0 tokenizerapi.TokenStream) *Tokenizer_NewTokenStream_Call {
_c.Call.Return(_a0)
return _c
}
func (_c *Tokenizer_NewTokenStream_Call) RunAndReturn(run func(string) tokenizerapi.TokenStream) *Tokenizer_NewTokenStream_Call {
_c.Call.Return(run)
return _c
}
// NewTokenizer creates a new instance of Tokenizer. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
// The first argument is typically a *testing.T value.
func NewTokenizer(t interface {
mock.TestingT
Cleanup(func())
}) *Tokenizer {
mock := &Tokenizer{}
mock.Mock.Test(t)
t.Cleanup(func() { mock.AssertExpectations(t) })
return mock
}

View File

@ -0,0 +1,8 @@
package tokenizerapi
//go:generate mockery --name=TokenStream --with-expecter
type TokenStream interface {
Advance() bool
Token() string
Destroy()
}

View File

@ -0,0 +1,7 @@
package tokenizerapi
//go:generate mockery --name=Tokenizer --with-expecter
type Tokenizer interface {
NewTokenStream(text string) TokenStream
Destroy()
}