mirror of https://github.com/milvus-io/milvus.git
feat: tantivy tokenizer binding (#35801)
fix: #35800

---------

Signed-off-by: longjiquan <jiquan.long@zilliz.com>

Branch: pull/35901/head
parent 1413ffe9b1
commit 5ea2454fdf
@@ -0,0 +1,39 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License

#include "segcore/map_c.h"

#include <memory>
#include <map>
#include <string>

using Map = std::map<std::string, std::string>;

CMap
create_cmap() {
    auto m = std::make_unique<Map>();
    return m.release();
}

void
free_cmap(CMap m) {
    delete static_cast<Map*>(m);
}

void
cmap_set(CMap m,
         const char* key,
         uint32_t key_len,
         const char* value,
         uint32_t value_len) {
    auto mm = static_cast<Map*>(m);
    (*mm)[std::string(key, key_len)] = std::string(value, value_len);
}
@@ -0,0 +1,37 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License

#pragma once

#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

typedef void* CMap;

CMap
create_cmap();

void
free_cmap(CMap m);

void
cmap_set(CMap m,
         const char* key,
         uint32_t key_len,
         const char* value,
         uint32_t value_len);

#ifdef __cplusplus
}
#endif
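The CMap API above is a thin C-compatible handle over a `std::map<std::string, std::string>`, used to carry tokenizer parameters across the FFI boundary. A minimal usage sketch from the C++ side (the `main` scaffold here is illustrative only, not part of the change):

```cpp
#include <cstring>

#include "segcore/map_c.h"

int main() {
    CMap params = create_cmap();
    // Keys and values are passed with explicit lengths, so callers are not
    // forced to rely on NUL-terminated strings.
    const char* key = "tokenizer";
    const char* value = "jieba";
    cmap_set(params, key, std::strlen(key), value, std::strlen(value));
    // ... hand `params` to create_tokenizer(...) here ...
    free_cmap(params);
    return 0;
}
```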
@@ -0,0 +1,38 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License

#include <stdlib.h>
#include <string.h>

#include "segcore/token_stream_c.h"
#include "token-stream.h"

void
free_token_stream(CTokenStream token_stream) {
    delete static_cast<milvus::tantivy::TokenStream*>(token_stream);
}

bool
token_stream_advance(CTokenStream token_stream) {
    return static_cast<milvus::tantivy::TokenStream*>(token_stream)->advance();
}

// Note: returned token must be freed by the caller using `free_token`.
const char*
token_stream_get_token(CTokenStream token_stream) {
    return static_cast<milvus::tantivy::TokenStream*>(token_stream)
        ->get_token_no_copy();
}

void
free_token(void* token) {
    free_rust_string(static_cast<const char*>(token));
}
@@ -0,0 +1,37 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License

#pragma once

#include <stdint.h>

#include "map_c.h"
#include "common/type_c.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef void* CTokenStream;

void free_token_stream(CTokenStream);

bool token_stream_advance(CTokenStream);

// Note: returned string must be freed by the caller using `free_token`.
const char* token_stream_get_token(CTokenStream);

void
free_token(void* token);

#ifdef __cplusplus
}
#endif
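Putting the implementation and its header together, the consumption pattern for a CTokenStream looks like the sketch below. Each string returned by token_stream_get_token owns a Rust-allocated buffer and must be released with free_token; the `drain` helper is hypothetical, assuming a stream obtained from create_token_stream further down:

```cpp
#include <cstdio>

#include "segcore/token_stream_c.h"

void drain(CTokenStream stream) {
    while (token_stream_advance(stream)) {
        const char* token = token_stream_get_token(stream);
        std::printf("%s\n", token);
        // The token is Rust-allocated; free_token forwards to free_rust_string.
        free_token(const_cast<char*>(token));
    }
    free_token_stream(stream);
}
```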
@@ -0,0 +1,41 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License

#include "segcore/tokenizer_c.h"
#include "common/EasyAssert.h"

#include "tokenizer.h"

using Map = std::map<std::string, std::string>;

CStatus
create_tokenizer(CMap m, CTokenizer* tokenizer) {
    try {
        auto mm = reinterpret_cast<Map*>(m);
        auto impl = std::make_unique<milvus::tantivy::Tokenizer>(*mm);
        *tokenizer = impl.release();
        return milvus::SuccessCStatus();
    } catch (std::exception& e) {
        return milvus::FailureCStatus(&e);
    }
}

void
free_tokenizer(CTokenizer tokenizer) {
    auto impl = reinterpret_cast<milvus::tantivy::Tokenizer*>(tokenizer);
    delete impl;
}

CTokenStream
create_token_stream(CTokenizer tokenizer, const char* text, uint32_t text_len) {
    auto impl = reinterpret_cast<milvus::tantivy::Tokenizer*>(tokenizer);
    return impl->CreateTokenStream(std::string(text, text_len)).release();
}
@@ -0,0 +1,37 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License

#pragma once

#include <stdint.h>

#include "segcore/map_c.h"
#include "segcore/token_stream_c.h"
#include "common/type_c.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef void* CTokenizer;

CStatus
create_tokenizer(CMap m, CTokenizer* tokenizer);

void
free_tokenizer(CTokenizer tokenizer);

CTokenStream
create_token_stream(CTokenizer tokenizer, const char* text, uint32_t text_len);

#ifdef __cplusplus
}
#endif
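The C surface now composes end to end: build a CMap of parameters, create a tokenizer (checking the returned CStatus), derive a token stream, then tear everything down in reverse order. A hedged sketch, assuming CStatus exposes the `error_code`/`error_msg` fields that the Go wrapper later in this diff relies on:

```cpp
#include <cstring>

#include "segcore/map_c.h"
#include "segcore/tokenizer_c.h"
#include "segcore/token_stream_c.h"

int main() {
    CMap params = create_cmap();
    cmap_set(params, "tokenizer", std::strlen("tokenizer"), "default",
             std::strlen("default"));

    CTokenizer tokenizer = nullptr;
    CStatus status = create_tokenizer(params, &tokenizer);
    if (status.error_code != 0) {
        return 1;  // status.error_msg would carry the C++ exception message
    }

    const char* text = "football, basketball, pingpang";
    CTokenStream stream =
        create_token_stream(tokenizer, text, std::strlen(text));
    // ... drain the stream as in the earlier sketch ...
    free_token_stream(stream);
    free_tokenizer(tokenizer);
    free_cmap(params);
    return 0;
}
```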
@@ -58,6 +58,13 @@ if (CMAKE_BUILD_TYPE STREQUAL "Debug")
    add_link_options(-fno-stack-protector -fno-omit-frame-pointer -fno-var-tracking -fsanitize=address)
endif()

# TODO: move the tests below to ut.

option(BUILD_TANTIVY_WITH_UT "compile tantivy with ut" OFF)

if (BUILD_TANTIVY_WITH_UT)
    message(STATUS "compile tantivy with ut")

    add_executable(test_tantivy test.cpp)
    target_link_libraries(test_tantivy
            tantivy_binding

@@ -78,4 +85,12 @@ target_link_libraries(ffi_demo
            dl
    )

    add_executable(tokenizer_demo tokenizer_demo.cpp)
    target_link_libraries(tokenizer_demo
            tantivy_binding
            dl
    )
else ()
endif ()

set( TANTIVY_INCLUDE_DIR ${LIB_HEADER_FOLDER};${CMAKE_CURRENT_SOURCE_DIR} CACHE INTERNAL "Path to tantivy include directory" )
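With this change, the tantivy self-test and the demo executables (including the new `tokenizer_demo`) are built only when the new option is switched on, e.g. by configuring with `-DBUILD_TANTIVY_WITH_UT=ON`; the option defaults to `OFF`, so normal builds skip them.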
@ -0,0 +1,44 @@
|
|||
#pragma once
|
||||
|
||||
#include <map>
|
||||
|
||||
#include "tantivy-binding.h"
|
||||
#include "rust-binding.h"
|
||||
|
||||
namespace milvus::tantivy {
|
||||
|
||||
struct RustHashMap {
|
||||
public:
|
||||
NO_COPY_OR_ASSIGN(RustHashMap);
|
||||
|
||||
RustHashMap() {
|
||||
ptr_ = create_hashmap();
|
||||
}
|
||||
|
||||
~RustHashMap() {
|
||||
if (ptr_ != nullptr) {
|
||||
free_hashmap(ptr_);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
from(const std::map<std::string, std::string>& m) {
|
||||
for (const auto& [k, v] : m) {
|
||||
set(k, v);
|
||||
}
|
||||
}
|
||||
|
||||
void*
|
||||
get_pointer() {
|
||||
return ptr_;
|
||||
}
|
||||
|
||||
void
|
||||
set(const std::string& k, const std::string& v) {
|
||||
hashmap_set_value(ptr_, k.c_str(), v.c_str());
|
||||
}
|
||||
|
||||
private:
|
||||
void* ptr_ = nullptr;
|
||||
};
|
||||
} // namespace milvus::tantivy
|
|
@@ -180,6 +180,15 @@ dependencies = [
 "libc",
]

[[package]]
name = "cedarwood"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90"
dependencies = [
 "smallvec",
]

[[package]]
name = "census"
version = "0.4.2"

@@ -443,6 +452,15 @@ dependencies = [
 "slab",
]

[[package]]
name = "fxhash"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
dependencies = [
 "byteorder",
]

[[package]]
name = "generator"
version = "0.7.5"

@@ -559,6 +577,21 @@ version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"

[[package]]
name = "jieba-rs"
version = "0.6.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93f0c1347cd3ac8d7c6e3a2dc33ac496d365cf09fc0831aa61111e1a6738983e"
dependencies = [
 "cedarwood",
 "fxhash",
 "hashbrown 0.14.3",
 "lazy_static",
 "phf",
 "phf_codegen",
 "regex",
]

[[package]]
name = "jobserver"
version = "0.1.28"

@@ -754,6 +787,44 @@ dependencies = [
 "stable_deref_trait",
]

[[package]]
name = "phf"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc"
dependencies = [
 "phf_shared",
]

[[package]]
name = "phf_codegen"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a"
dependencies = [
 "phf_generator",
 "phf_shared",
]

[[package]]
name = "phf_generator"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0"
dependencies = [
 "phf_shared",
 "rand",
]

[[package]]
name = "phf_shared"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b"
dependencies = [
 "siphasher",
]

[[package]]
name = "pin-project-lite"
version = "0.2.13"

@@ -796,6 +867,21 @@ dependencies = [
 "proc-macro2",
]

[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
 "rand_core",
]

[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"

[[package]]
name = "rayon"
version = "1.10.0"

@@ -953,6 +1039,12 @@ dependencies = [
 "lazy_static",
]

[[package]]
name = "siphasher"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"

[[package]]
name = "sketches-ddsketch"
version = "0.2.2"

@@ -1070,10 +1162,12 @@ dependencies = [
 "cbindgen",
 "env_logger",
 "futures",
 "lazy_static",
 "libc",
 "log",
 "scopeguard",
 "tantivy",
 "tantivy-jieba",
 "zstd-sys",
]

@@ -1126,6 +1220,17 @@ dependencies = [
 "utf8-ranges",
]

[[package]]
name = "tantivy-jieba"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44022293c12a8f878e03439b2f11806d3d394130fe33d4e7781cba91abbac0a4"
dependencies = [
 "jieba-rs",
 "lazy_static",
 "tantivy-tokenizer-api",
]

[[package]]
name = "tantivy-query-grammar"
version = "0.21.0"
@@ -13,6 +13,8 @@ scopeguard = "1.2"
zstd-sys = "=2.0.9"
env_logger = "0.11.3"
log = "0.4.21"
tantivy-jieba = "0.10.0"
lazy_static = "1.4.0"

[build-dependencies]
cbindgen = "0.26.0"
@@ -25,6 +25,12 @@ void free_rust_array(RustArray array);

void print_vector_of_strings(const char *const *ptr, uintptr_t len);

void *create_hashmap();

void hashmap_set_value(void *map, const char *key, const char *value);

void free_hashmap(void *map);

void *tantivy_load_index(const char *path);

void tantivy_free_index_reader(void *ptr);

@@ -122,6 +128,20 @@ void tantivy_index_add_multi_keywords(void *ptr,
                                      uintptr_t len,
                                      int64_t offset);

void free_rust_string(const char *ptr);

void *tantivy_create_token_stream(void *tokenizer, const char *text);

void tantivy_free_token_stream(void *token_stream);

bool tantivy_token_stream_advance(void *token_stream);

const char *tantivy_token_stream_get_token(void *token_stream);

void *tantivy_create_tokenizer(void *tokenizer_params);

void tantivy_free_tokenizer(void *tokenizer);

bool tantivy_index_exist(const char *path);

}  // extern "C"
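These declarations extend tantivy-binding.h, the C header for the Rust crate; given the `cbindgen` build-dependency in Cargo.toml above, the header is presumably regenerated from the `#[no_mangle] extern "C"` functions added in the Rust sources that follow.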
@ -0,0 +1,28 @@
|
|||
use std::collections::HashMap;
|
||||
use std::ffi::CStr;
|
||||
use std::os::raw::c_char;
|
||||
|
||||
use libc::c_void;
|
||||
|
||||
use crate::util::{create_binding, free_binding};
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn create_hashmap() -> *mut c_void {
|
||||
let map: HashMap<String, String> = HashMap::new();
|
||||
create_binding(map)
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn hashmap_set_value(map: *mut c_void, key: *const c_char, value: *const c_char) {
|
||||
let m = map as *mut HashMap<String, String>;
|
||||
let k = unsafe { CStr::from_ptr(key).to_str().unwrap() };
|
||||
let v = unsafe { CStr::from_ptr(value).to_str().unwrap() };
|
||||
unsafe {
|
||||
(*m).insert(String::from(k), String::from(v));
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn free_hashmap(map: *mut c_void) {
|
||||
free_binding::<HashMap<String, String>>(map);
|
||||
}
|
|
@@ -10,7 +10,6 @@ use tantivy::{doc, tokenizer, Document, Index, IndexWriter};

use crate::data_type::TantivyDataType;

use crate::index_reader::IndexReaderWrapper;
use crate::log::init_log;

pub(crate) struct IndexWriterWrapper {
@@ -2,6 +2,7 @@ mod array;
mod data_type;
mod demo_c;
mod docid_collector;
mod hashmap_c;
mod hashset_collector;
mod index_reader;
mod index_reader_c;

@@ -9,6 +10,10 @@ mod index_writer;
mod index_writer_c;
mod linkedlist_collector;
mod log;
mod string_c;
mod token_stream_c;
mod tokenizer;
mod tokenizer_c;
mod util;
mod util_c;
mod vec_collector;
@ -0,0 +1,22 @@
|
|||
use std::ffi::{CStr, CString};
|
||||
|
||||
use libc::c_char;
|
||||
|
||||
use std::str;
|
||||
|
||||
// Be careful to use this function, since the returned str depends on the input to be not freed.
|
||||
pub(crate) unsafe fn c_str_to_str<'a>(s: *const c_char) -> &'a str {
|
||||
let rs = CStr::from_ptr(s);
|
||||
str::from_utf8_unchecked(rs.to_bytes())
|
||||
}
|
||||
|
||||
pub(crate) fn create_string(s: &str) -> *const c_char {
|
||||
CString::new(s).unwrap().into_raw()
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn free_rust_string(ptr: *const c_char) {
|
||||
unsafe {
|
||||
let _ = CString::from_raw(ptr as *mut c_char);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,40 @@
|
|||
use std::ffi::{c_char};
|
||||
|
||||
use libc::c_void;
|
||||
use tantivy::tokenizer::{BoxTokenStream, TextAnalyzer};
|
||||
|
||||
use crate::string_c::c_str_to_str;
|
||||
use crate::{
|
||||
string_c::create_string,
|
||||
util::{create_binding, free_binding},
|
||||
};
|
||||
|
||||
// Note: the tokenizer and text must be released after the token_stream.
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_create_token_stream(
|
||||
tokenizer: *mut c_void,
|
||||
text: *const c_char,
|
||||
) -> *mut c_void {
|
||||
let analyzer = tokenizer as *mut TextAnalyzer;
|
||||
let token_stream = unsafe { (*analyzer).token_stream(c_str_to_str(text)) };
|
||||
create_binding(token_stream)
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_free_token_stream(token_stream: *mut c_void) {
|
||||
free_binding::<BoxTokenStream<'_>>(token_stream);
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_token_stream_advance(token_stream: *mut c_void) -> bool {
|
||||
let real = token_stream as *mut BoxTokenStream<'_>;
|
||||
unsafe { (*real).advance() }
|
||||
}
|
||||
|
||||
// Note: the returned token should be released by calling `free_string` after use.
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_token_stream_get_token(token_stream: *mut c_void) -> *const c_char {
|
||||
let real = token_stream as *mut BoxTokenStream<'_>;
|
||||
let token = unsafe { (*real).token().text.as_str() };
|
||||
create_string(token)
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
use lazy_static::lazy_static;
|
||||
use log::info;
|
||||
use std::collections::HashMap;
|
||||
use tantivy::tokenizer::{TextAnalyzer, TokenizerManager};
|
||||
|
||||
lazy_static! {
|
||||
static ref DEFAULT_TOKENIZER_MANAGER: TokenizerManager = TokenizerManager::default();
|
||||
}
|
||||
|
||||
pub(crate) fn default_tokenizer() -> TextAnalyzer {
|
||||
DEFAULT_TOKENIZER_MANAGER.get("default").unwrap()
|
||||
}
|
||||
|
||||
fn jieba_tokenizer() -> TextAnalyzer {
|
||||
tantivy_jieba::JiebaTokenizer {}.into()
|
||||
}
|
||||
|
||||
pub(crate) fn create_tokenizer(params: &HashMap<String, String>) -> Option<TextAnalyzer> {
|
||||
match params.get("tokenizer") {
|
||||
Some(tokenizer_name) => match tokenizer_name.as_str() {
|
||||
"default" => {
|
||||
return Some(default_tokenizer());
|
||||
}
|
||||
"jieba" => return Some(jieba_tokenizer()),
|
||||
_ => {
|
||||
return None;
|
||||
}
|
||||
},
|
||||
None => {
|
||||
info!("no tokenizer is specific, use default tokenizer");
|
||||
return Some(default_tokenizer());
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,26 @@
|
|||
use std::collections::HashMap;
|
||||
|
||||
use libc::c_void;
|
||||
use tantivy::tokenizer::TextAnalyzer;
|
||||
|
||||
use crate::{
|
||||
tokenizer::create_tokenizer,
|
||||
util::{create_binding, free_binding},
|
||||
};
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *mut c_void) -> *mut c_void {
|
||||
let analyzer = unsafe {
|
||||
let m = tokenizer_params as *const HashMap<String, String>;
|
||||
create_tokenizer(&(*m))
|
||||
};
|
||||
match analyzer {
|
||||
Some(text_analyzer) => create_binding(text_analyzer),
|
||||
None => std::ptr::null_mut(),
|
||||
}
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_free_tokenizer(tokenizer: *mut c_void) {
|
||||
free_binding::<TextAnalyzer>(tokenizer);
|
||||
}
|
|
@ -0,0 +1,50 @@
|
|||
#pragma once
|
||||
|
||||
#include <assert.h>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "tantivy-binding.h"
|
||||
#include "rust-binding.h"
|
||||
|
||||
namespace milvus::tantivy {
|
||||
struct TokenStream {
|
||||
public:
|
||||
NO_COPY_OR_ASSIGN(TokenStream);
|
||||
|
||||
TokenStream(void* ptr, std::shared_ptr<std::string> text)
|
||||
: ptr_(ptr), text_(text) {
|
||||
assert(ptr != nullptr);
|
||||
}
|
||||
|
||||
~TokenStream() {
|
||||
if (ptr_ != nullptr) {
|
||||
tantivy_free_token_stream(ptr_);
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
bool
|
||||
advance() {
|
||||
return tantivy_token_stream_advance(ptr_);
|
||||
}
|
||||
|
||||
std::string
|
||||
get_token() {
|
||||
auto token = tantivy_token_stream_get_token(ptr_);
|
||||
std::string s(token);
|
||||
free_rust_string(token);
|
||||
return s;
|
||||
}
|
||||
|
||||
// Note: the returned token must be freed by calling `free_rust_string`.
|
||||
const char*
|
||||
get_token_no_copy() {
|
||||
return tantivy_token_stream_get_token(ptr_);
|
||||
}
|
||||
|
||||
public:
|
||||
void* ptr_;
|
||||
std::shared_ptr<std::string> text_;
|
||||
};
|
||||
} // namespace milvus::tantivy
|
|
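TokenStream offers two accessors with different ownership semantics: get_token() copies the Rust string into a std::string and frees the Rust allocation immediately, while get_token_no_copy() returns the raw Rust pointer and defers the free to the caller (the C wrapper token_stream_get_token builds on the latter). A brief illustrative sketch; the `drain` helpers are hypothetical:

```cpp
#include <iostream>

#include "token-stream.h"

void drain(milvus::tantivy::TokenStream& stream) {
    while (stream.advance()) {
        // Owning copy: nothing further to free.
        std::cout << stream.get_token() << std::endl;
    }
}

void drain_no_copy(milvus::tantivy::TokenStream& stream) {
    while (stream.advance()) {
        const char* token = stream.get_token_no_copy();
        std::cout << token << std::endl;
        free_rust_string(token);  // caller owns the Rust allocation
    }
}
```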
@ -0,0 +1,50 @@
|
|||
#pragma once
|
||||
|
||||
#include "tantivy-binding.h"
|
||||
#include "rust-binding.h"
|
||||
#include "rust-hashmap.h"
|
||||
#include "token-stream.h"
|
||||
|
||||
namespace milvus::tantivy {
|
||||
|
||||
struct Tokenizer {
|
||||
public:
|
||||
NO_COPY_OR_ASSIGN(Tokenizer);
|
||||
|
||||
explicit Tokenizer(const std::map<std::string, std::string>& params) {
|
||||
RustHashMap m;
|
||||
m.from(params);
|
||||
ptr_ = tantivy_create_tokenizer(m.get_pointer());
|
||||
if (ptr_ == nullptr) {
|
||||
throw "invalid tokenizer parameters";
|
||||
}
|
||||
}
|
||||
|
||||
~Tokenizer() {
|
||||
if (ptr_ != nullptr) {
|
||||
tantivy_free_tokenizer(ptr_);
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<TokenStream>
|
||||
CreateTokenStream(std::string&& text) {
|
||||
auto shared_text = std::make_shared<std::string>(std::move(text));
|
||||
auto token_stream =
|
||||
tantivy_create_token_stream(ptr_, shared_text->c_str());
|
||||
return std::make_unique<TokenStream>(token_stream, shared_text);
|
||||
}
|
||||
|
||||
// CreateTokenStreamCopyText will copy the text and then create token stream based on the text.
|
||||
std::unique_ptr<TokenStream>
|
||||
CreateTokenStreamCopyText(const std::string& text) {
|
||||
auto shared_text = std::make_shared<std::string>(text);
|
||||
auto token_stream =
|
||||
tantivy_create_token_stream(ptr_, shared_text->c_str());
|
||||
return std::make_unique<TokenStream>(token_stream, shared_text);
|
||||
}
|
||||
|
||||
private:
|
||||
void* ptr_;
|
||||
};
|
||||
|
||||
} // namespace milvus::tantivy
|
|
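Both factory methods pin the analyzed text in a std::shared_ptr owned by the returned TokenStream. This mirrors the note above tantivy_create_token_stream: the Rust side borrows the text rather than copying it, so the buffer must outlive the stream. CreateTokenStream moves the caller's string in, while CreateTokenStreamCopyText copies it first; either way the TokenStream keeps the text alive for its whole lifetime.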
@ -0,0 +1,39 @@
|
|||
#include <iostream>
|
||||
#include "token-stream.h"
|
||||
#include "tokenizer.h"
|
||||
|
||||
using Map = std::map<std::string, std::string>;
|
||||
|
||||
using namespace milvus::tantivy;
|
||||
|
||||
void
|
||||
test_tokenizer(const Map& m, std::string&& text) {
|
||||
Tokenizer tokenizer(m);
|
||||
|
||||
auto token_stream = tokenizer.CreateTokenStream(std::move(text));
|
||||
while (token_stream->advance()) {
|
||||
auto token = token_stream->get_token();
|
||||
std::cout << token << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
main(int argc, char* argv[]) {
|
||||
// default tokenizer
|
||||
{
|
||||
Map m;
|
||||
test_tokenizer(m, "football, basketball, pingpang");
|
||||
}
|
||||
|
||||
// jieba tokenizer
|
||||
{
|
||||
Map m;
|
||||
std::string tokenizer_name = "jieba";
|
||||
m["tokenizer"] = tokenizer_name;
|
||||
test_tokenizer(m,
|
||||
"张华考上了北京大学;李萍进了中等技术学校;我在百货公司"
|
||||
"当售货员:我们都有光明的前途");
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,45 @@
|
|||
package ctokenizer
|
||||
|
||||
/*
|
||||
#cgo pkg-config: milvus_core
|
||||
#include <stdlib.h> // free
|
||||
#include "segcore/map_c.h"
|
||||
*/
|
||||
import "C"
|
||||
import "unsafe"
|
||||
|
||||
type CMap struct {
|
||||
ptr C.CMap
|
||||
}
|
||||
|
||||
func NewCMap() *CMap {
|
||||
return &CMap{
|
||||
ptr: C.create_cmap(),
|
||||
}
|
||||
}
|
||||
|
||||
func (m *CMap) GetPointer() C.CMap {
|
||||
return m.ptr
|
||||
}
|
||||
|
||||
func (m *CMap) Set(key string, value string) {
|
||||
cKey := C.CString(key)
|
||||
defer C.free(unsafe.Pointer(cKey))
|
||||
|
||||
cValue := C.CString(value)
|
||||
defer C.free(unsafe.Pointer(cValue))
|
||||
|
||||
C.cmap_set(m.ptr, cKey, (C.uint32_t)(len(key)), cValue, (C.uint32_t)(len(value)))
|
||||
}
|
||||
|
||||
func (m *CMap) From(gm map[string]string) {
|
||||
for k, v := range gm {
|
||||
m.Set(k, v)
|
||||
}
|
||||
}
|
||||
|
||||
func (m *CMap) Destroy() {
|
||||
if m.ptr != nil {
|
||||
C.free_cmap(m.ptr)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,40 @@
|
|||
package ctokenizer
|
||||
|
||||
/*
|
||||
#cgo pkg-config: milvus_core
|
||||
#include <stdlib.h> // free
|
||||
#include "segcore/token_stream_c.h"
|
||||
*/
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"unsafe"
|
||||
|
||||
"github.com/milvus-io/milvus/internal/util/tokenizerapi"
|
||||
)
|
||||
|
||||
var _ tokenizerapi.TokenStream = (*CTokenStream)(nil)
|
||||
|
||||
type CTokenStream struct {
|
||||
ptr C.CTokenStream
|
||||
}
|
||||
|
||||
func NewCTokenStream(ptr C.CTokenStream) *CTokenStream {
|
||||
return &CTokenStream{
|
||||
ptr: ptr,
|
||||
}
|
||||
}
|
||||
|
||||
func (impl *CTokenStream) Advance() bool {
|
||||
return bool(C.token_stream_advance(impl.ptr))
|
||||
}
|
||||
|
||||
func (impl *CTokenStream) Token() string {
|
||||
token := C.token_stream_get_token(impl.ptr)
|
||||
defer C.free_token(unsafe.Pointer(token))
|
||||
return C.GoString(token)
|
||||
}
|
||||
|
||||
func (impl *CTokenStream) Destroy() {
|
||||
C.free_token_stream(impl.ptr)
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
package ctokenizer
|
||||
|
||||
/*
|
||||
#cgo pkg-config: milvus_core
|
||||
#include <stdlib.h> // free
|
||||
#include "segcore/tokenizer_c.h"
|
||||
#include "segcore/token_stream_c.h"
|
||||
*/
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"unsafe"
|
||||
|
||||
"github.com/milvus-io/milvus/internal/util/tokenizerapi"
|
||||
)
|
||||
|
||||
var _ tokenizerapi.Tokenizer = (*CTokenizer)(nil)
|
||||
|
||||
type CTokenizer struct {
|
||||
ptr C.CTokenizer
|
||||
}
|
||||
|
||||
func NewCTokenizer(ptr C.CTokenizer) *CTokenizer {
|
||||
return &CTokenizer{
|
||||
ptr: ptr,
|
||||
}
|
||||
}
|
||||
|
||||
func (impl *CTokenizer) NewTokenStream(text string) tokenizerapi.TokenStream {
|
||||
cText := C.CString(text)
|
||||
defer C.free(unsafe.Pointer(cText))
|
||||
ptr := C.create_token_stream(impl.ptr, cText, (C.uint32_t)(len(text)))
|
||||
return NewCTokenStream(ptr)
|
||||
}
|
||||
|
||||
func (impl *CTokenizer) Destroy() {
|
||||
C.free_tokenizer(impl.ptr)
|
||||
}
|
|
@ -0,0 +1,27 @@
|
|||
package ctokenizer
|
||||
|
||||
/*
|
||||
#cgo pkg-config: milvus_core
|
||||
#include <stdlib.h> // free
|
||||
#include "segcore/tokenizer_c.h"
|
||||
#include "segcore/token_stream_c.h"
|
||||
*/
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"github.com/milvus-io/milvus/internal/util/tokenizerapi"
|
||||
)
|
||||
|
||||
func NewTokenizer(m map[string]string) (tokenizerapi.Tokenizer, error) {
|
||||
mm := NewCMap()
|
||||
defer mm.Destroy()
|
||||
mm.From(m)
|
||||
|
||||
var ptr C.CTokenizer
|
||||
status := C.create_tokenizer(mm.GetPointer(), &ptr)
|
||||
if err := HandleCStatus(&status, "failed to create tokenizer"); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return NewCTokenizer(ptr), nil
|
||||
}
|
|
@ -0,0 +1,39 @@
|
|||
package ctokenizer
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestTokenizer(t *testing.T) {
|
||||
// default tokenizer.
|
||||
{
|
||||
m := make(map[string]string)
|
||||
tokenizer, err := NewTokenizer(m)
|
||||
assert.NoError(t, err)
|
||||
defer tokenizer.Destroy()
|
||||
|
||||
tokenStream := tokenizer.NewTokenStream("football, basketball, pingpang")
|
||||
defer tokenStream.Destroy()
|
||||
for tokenStream.Advance() {
|
||||
fmt.Println(tokenStream.Token())
|
||||
}
|
||||
}
|
||||
|
||||
// jieba tokenizer.
|
||||
{
|
||||
m := make(map[string]string)
|
||||
m["tokenizer"] = "jieba"
|
||||
tokenizer, err := NewTokenizer(m)
|
||||
assert.NoError(t, err)
|
||||
defer tokenizer.Destroy()
|
||||
|
||||
tokenStream := tokenizer.NewTokenStream("张华考上了北京大学;李萍进了中等技术学校;我在百货公司当售货员:我们都有光明的前途")
|
||||
defer tokenStream.Destroy()
|
||||
for tokenStream.Advance() {
|
||||
fmt.Println(tokenStream.Token())
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,37 @@
|
|||
package ctokenizer
|
||||
|
||||
/*
|
||||
#cgo pkg-config: milvus_core
|
||||
#include <stdlib.h> // free
|
||||
#include "common/type_c.h"
|
||||
*/
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"unsafe"
|
||||
|
||||
"github.com/milvus-io/milvus/pkg/log"
|
||||
"github.com/milvus-io/milvus/pkg/util/merr"
|
||||
)
|
||||
|
||||
// HandleCStatus deal with the error returned from CGO
|
||||
func HandleCStatus(status *C.CStatus, extraInfo string) error {
|
||||
if status.error_code == 0 {
|
||||
return nil
|
||||
}
|
||||
errorCode := int(status.error_code)
|
||||
errorMsg := C.GoString(status.error_msg)
|
||||
defer C.free(unsafe.Pointer(status.error_msg))
|
||||
|
||||
logMsg := fmt.Sprintf("%s, C Runtime Exception: %s\n", extraInfo, errorMsg)
|
||||
log.Warn(logMsg)
|
||||
if errorCode == 2003 {
|
||||
return merr.WrapErrSegcoreUnsupported(int32(errorCode), logMsg)
|
||||
}
|
||||
if errorCode == 2033 {
|
||||
log.Info("fake finished the task")
|
||||
return merr.ErrSegcorePretendFinished
|
||||
}
|
||||
return merr.WrapErrSegcore(int32(errorCode), logMsg)
|
||||
}
|
|
@ -0,0 +1,146 @@
|
|||
// Code generated by mockery v2.32.4. DO NOT EDIT.
|
||||
|
||||
package mocks
|
||||
|
||||
import mock "github.com/stretchr/testify/mock"
|
||||
|
||||
// TokenStream is an autogenerated mock type for the TokenStream type
|
||||
type TokenStream struct {
|
||||
mock.Mock
|
||||
}
|
||||
|
||||
type TokenStream_Expecter struct {
|
||||
mock *mock.Mock
|
||||
}
|
||||
|
||||
func (_m *TokenStream) EXPECT() *TokenStream_Expecter {
|
||||
return &TokenStream_Expecter{mock: &_m.Mock}
|
||||
}
|
||||
|
||||
// Advance provides a mock function with given fields:
|
||||
func (_m *TokenStream) Advance() bool {
|
||||
ret := _m.Called()
|
||||
|
||||
var r0 bool
|
||||
if rf, ok := ret.Get(0).(func() bool); ok {
|
||||
r0 = rf()
|
||||
} else {
|
||||
r0 = ret.Get(0).(bool)
|
||||
}
|
||||
|
||||
return r0
|
||||
}
|
||||
|
||||
// TokenStream_Advance_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'Advance'
|
||||
type TokenStream_Advance_Call struct {
|
||||
*mock.Call
|
||||
}
|
||||
|
||||
// Advance is a helper method to define mock.On call
|
||||
func (_e *TokenStream_Expecter) Advance() *TokenStream_Advance_Call {
|
||||
return &TokenStream_Advance_Call{Call: _e.mock.On("Advance")}
|
||||
}
|
||||
|
||||
func (_c *TokenStream_Advance_Call) Run(run func()) *TokenStream_Advance_Call {
|
||||
_c.Call.Run(func(args mock.Arguments) {
|
||||
run()
|
||||
})
|
||||
return _c
|
||||
}
|
||||
|
||||
func (_c *TokenStream_Advance_Call) Return(_a0 bool) *TokenStream_Advance_Call {
|
||||
_c.Call.Return(_a0)
|
||||
return _c
|
||||
}
|
||||
|
||||
func (_c *TokenStream_Advance_Call) RunAndReturn(run func() bool) *TokenStream_Advance_Call {
|
||||
_c.Call.Return(run)
|
||||
return _c
|
||||
}
|
||||
|
||||
// Destroy provides a mock function with given fields:
|
||||
func (_m *TokenStream) Destroy() {
|
||||
_m.Called()
|
||||
}
|
||||
|
||||
// TokenStream_Destroy_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'Destroy'
|
||||
type TokenStream_Destroy_Call struct {
|
||||
*mock.Call
|
||||
}
|
||||
|
||||
// Destroy is a helper method to define mock.On call
|
||||
func (_e *TokenStream_Expecter) Destroy() *TokenStream_Destroy_Call {
|
||||
return &TokenStream_Destroy_Call{Call: _e.mock.On("Destroy")}
|
||||
}
|
||||
|
||||
func (_c *TokenStream_Destroy_Call) Run(run func()) *TokenStream_Destroy_Call {
|
||||
_c.Call.Run(func(args mock.Arguments) {
|
||||
run()
|
||||
})
|
||||
return _c
|
||||
}
|
||||
|
||||
func (_c *TokenStream_Destroy_Call) Return() *TokenStream_Destroy_Call {
|
||||
_c.Call.Return()
|
||||
return _c
|
||||
}
|
||||
|
||||
func (_c *TokenStream_Destroy_Call) RunAndReturn(run func()) *TokenStream_Destroy_Call {
|
||||
_c.Call.Return(run)
|
||||
return _c
|
||||
}
|
||||
|
||||
// Token provides a mock function with given fields:
|
||||
func (_m *TokenStream) Token() string {
|
||||
ret := _m.Called()
|
||||
|
||||
var r0 string
|
||||
if rf, ok := ret.Get(0).(func() string); ok {
|
||||
r0 = rf()
|
||||
} else {
|
||||
r0 = ret.Get(0).(string)
|
||||
}
|
||||
|
||||
return r0
|
||||
}
|
||||
|
||||
// TokenStream_Token_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'Token'
|
||||
type TokenStream_Token_Call struct {
|
||||
*mock.Call
|
||||
}
|
||||
|
||||
// Token is a helper method to define mock.On call
|
||||
func (_e *TokenStream_Expecter) Token() *TokenStream_Token_Call {
|
||||
return &TokenStream_Token_Call{Call: _e.mock.On("Token")}
|
||||
}
|
||||
|
||||
func (_c *TokenStream_Token_Call) Run(run func()) *TokenStream_Token_Call {
|
||||
_c.Call.Run(func(args mock.Arguments) {
|
||||
run()
|
||||
})
|
||||
return _c
|
||||
}
|
||||
|
||||
func (_c *TokenStream_Token_Call) Return(_a0 string) *TokenStream_Token_Call {
|
||||
_c.Call.Return(_a0)
|
||||
return _c
|
||||
}
|
||||
|
||||
func (_c *TokenStream_Token_Call) RunAndReturn(run func() string) *TokenStream_Token_Call {
|
||||
_c.Call.Return(run)
|
||||
return _c
|
||||
}
|
||||
|
||||
// NewTokenStream creates a new instance of TokenStream. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
|
||||
// The first argument is typically a *testing.T value.
|
||||
func NewTokenStream(t interface {
|
||||
mock.TestingT
|
||||
Cleanup(func())
|
||||
}) *TokenStream {
|
||||
mock := &TokenStream{}
|
||||
mock.Mock.Test(t)
|
||||
|
||||
t.Cleanup(func() { mock.AssertExpectations(t) })
|
||||
|
||||
return mock
|
||||
}
|
|
@ -0,0 +1,111 @@
|
|||
// Code generated by mockery v2.32.4. DO NOT EDIT.
|
||||
|
||||
package mocks
|
||||
|
||||
import (
|
||||
tokenizerapi "github.com/milvus-io/milvus/internal/util/tokenizerapi"
|
||||
mock "github.com/stretchr/testify/mock"
|
||||
)
|
||||
|
||||
// Tokenizer is an autogenerated mock type for the Tokenizer type
|
||||
type Tokenizer struct {
|
||||
mock.Mock
|
||||
}
|
||||
|
||||
type Tokenizer_Expecter struct {
|
||||
mock *mock.Mock
|
||||
}
|
||||
|
||||
func (_m *Tokenizer) EXPECT() *Tokenizer_Expecter {
|
||||
return &Tokenizer_Expecter{mock: &_m.Mock}
|
||||
}
|
||||
|
||||
// Destroy provides a mock function with given fields:
|
||||
func (_m *Tokenizer) Destroy() {
|
||||
_m.Called()
|
||||
}
|
||||
|
||||
// Tokenizer_Destroy_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'Destroy'
|
||||
type Tokenizer_Destroy_Call struct {
|
||||
*mock.Call
|
||||
}
|
||||
|
||||
// Destroy is a helper method to define mock.On call
|
||||
func (_e *Tokenizer_Expecter) Destroy() *Tokenizer_Destroy_Call {
|
||||
return &Tokenizer_Destroy_Call{Call: _e.mock.On("Destroy")}
|
||||
}
|
||||
|
||||
func (_c *Tokenizer_Destroy_Call) Run(run func()) *Tokenizer_Destroy_Call {
|
||||
_c.Call.Run(func(args mock.Arguments) {
|
||||
run()
|
||||
})
|
||||
return _c
|
||||
}
|
||||
|
||||
func (_c *Tokenizer_Destroy_Call) Return() *Tokenizer_Destroy_Call {
|
||||
_c.Call.Return()
|
||||
return _c
|
||||
}
|
||||
|
||||
func (_c *Tokenizer_Destroy_Call) RunAndReturn(run func()) *Tokenizer_Destroy_Call {
|
||||
_c.Call.Return(run)
|
||||
return _c
|
||||
}
|
||||
|
||||
// NewTokenStream provides a mock function with given fields: text
|
||||
func (_m *Tokenizer) NewTokenStream(text string) tokenizerapi.TokenStream {
|
||||
ret := _m.Called(text)
|
||||
|
||||
var r0 tokenizerapi.TokenStream
|
||||
if rf, ok := ret.Get(0).(func(string) tokenizerapi.TokenStream); ok {
|
||||
r0 = rf(text)
|
||||
} else {
|
||||
if ret.Get(0) != nil {
|
||||
r0 = ret.Get(0).(tokenizerapi.TokenStream)
|
||||
}
|
||||
}
|
||||
|
||||
return r0
|
||||
}
|
||||
|
||||
// Tokenizer_NewTokenStream_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'NewTokenStream'
|
||||
type Tokenizer_NewTokenStream_Call struct {
|
||||
*mock.Call
|
||||
}
|
||||
|
||||
// NewTokenStream is a helper method to define mock.On call
|
||||
// - text string
|
||||
func (_e *Tokenizer_Expecter) NewTokenStream(text interface{}) *Tokenizer_NewTokenStream_Call {
|
||||
return &Tokenizer_NewTokenStream_Call{Call: _e.mock.On("NewTokenStream", text)}
|
||||
}
|
||||
|
||||
func (_c *Tokenizer_NewTokenStream_Call) Run(run func(text string)) *Tokenizer_NewTokenStream_Call {
|
||||
_c.Call.Run(func(args mock.Arguments) {
|
||||
run(args[0].(string))
|
||||
})
|
||||
return _c
|
||||
}
|
||||
|
||||
func (_c *Tokenizer_NewTokenStream_Call) Return(_a0 tokenizerapi.TokenStream) *Tokenizer_NewTokenStream_Call {
|
||||
_c.Call.Return(_a0)
|
||||
return _c
|
||||
}
|
||||
|
||||
func (_c *Tokenizer_NewTokenStream_Call) RunAndReturn(run func(string) tokenizerapi.TokenStream) *Tokenizer_NewTokenStream_Call {
|
||||
_c.Call.Return(run)
|
||||
return _c
|
||||
}
|
||||
|
||||
// NewTokenizer creates a new instance of Tokenizer. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
|
||||
// The first argument is typically a *testing.T value.
|
||||
func NewTokenizer(t interface {
|
||||
mock.TestingT
|
||||
Cleanup(func())
|
||||
}) *Tokenizer {
|
||||
mock := &Tokenizer{}
|
||||
mock.Mock.Test(t)
|
||||
|
||||
t.Cleanup(func() { mock.AssertExpectations(t) })
|
||||
|
||||
return mock
|
||||
}
|
|
@ -0,0 +1,8 @@
|
|||
package tokenizerapi
|
||||
|
||||
//go:generate mockery --name=TokenStream --with-expecter
|
||||
type TokenStream interface {
|
||||
Advance() bool
|
||||
Token() string
|
||||
Destroy()
|
||||
}
|
|
@ -0,0 +1,7 @@
|
|||
package tokenizerapi
|
||||
|
||||
//go:generate mockery --name=Tokenizer --with-expecter
|
||||
type Tokenizer interface {
|
||||
NewTokenStream(text string) TokenStream
|
||||
Destroy()
|
||||
}
|