From 5ea2454fdf5b2310f984b4b2e55fd42357a2562f Mon Sep 17 00:00:00 2001
From: Jiquan Long
Date: Sun, 1 Sep 2024 17:13:03 +0800
Subject: [PATCH] feat: tantivy tokenizer binding (#35801)

fix: #35800

---------

Signed-off-by: longjiquan
---
 internal/core/src/segcore/CMakeLists.txt      |   2 +-
 internal/core/src/segcore/map_c.cpp           |  39 +++++
 internal/core/src/segcore/map_c.h             |  37 +++++
 internal/core/src/segcore/token_stream_c.cpp  |  38 +++++
 internal/core/src/segcore/token_stream_c.h    |  37 +++++
 internal/core/src/segcore/tokenizer_c.cpp     |  41 +++++
 internal/core/src/segcore/tokenizer_c.h       |  37 +++++
 .../core/thirdparty/tantivy/CMakeLists.txt    |  49 ++++--
 .../core/thirdparty/tantivy/rust-hashmap.h    |  44 ++++++
 .../tantivy/tantivy-binding/Cargo.lock        | 105 +++++++++++++
 .../tantivy/tantivy-binding/Cargo.toml        |   2 +
 .../tantivy-binding/include/tantivy-binding.h |  20 +++
 .../tantivy/tantivy-binding/src/hashmap_c.rs  |  28 ++++
 .../tantivy-binding/src/index_writer.rs       |   1 -
 .../tantivy/tantivy-binding/src/lib.rs        |   5 +
 .../tantivy/tantivy-binding/src/string_c.rs   |  22 +++
 .../tantivy-binding/src/token_stream_c.rs     |  40 +++++
 .../tantivy/tantivy-binding/src/tokenizer.rs  |  34 ++++
 .../tantivy-binding/src/tokenizer_c.rs        |  26 ++++
 .../core/thirdparty/tantivy/token-stream.h    |  50 ++++++
 internal/core/thirdparty/tantivy/tokenizer.h  |  50 ++++++
 .../thirdparty/tantivy/tokenizer_demo.cpp     |  39 +++++
 internal/util/ctokenizer/c_map.go             |  45 ++++++
 internal/util/ctokenizer/c_token_stream.go    |  40 +++++
 internal/util/ctokenizer/c_tokenizer.go       |  38 +++++
 .../util/ctokenizer/c_tokenizer_factory.go    |  27 ++++
 internal/util/ctokenizer/c_tokenizer_test.go  |  39 +++++
 internal/util/ctokenizer/helper.go            |  37 +++++
 .../util/tokenizerapi/mocks/TokenStream.go    | 146 ++++++++++++++++++
 internal/util/tokenizerapi/mocks/Tokenizer.go | 111 +++++++++++++
 internal/util/tokenizerapi/token_stream.go    |   8 +
 internal/util/tokenizerapi/tokenizer.go       |   7 +
 32 files changed, 1225 insertions(+), 19 deletions(-)
 create mode 100644 internal/core/src/segcore/map_c.cpp
 create mode 100644 internal/core/src/segcore/map_c.h
 create mode 100644 internal/core/src/segcore/token_stream_c.cpp
 create mode 100644 internal/core/src/segcore/token_stream_c.h
 create mode 100644 internal/core/src/segcore/tokenizer_c.cpp
 create mode 100644 internal/core/src/segcore/tokenizer_c.h
 create mode 100644 internal/core/thirdparty/tantivy/rust-hashmap.h
 create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/hashmap_c.rs
 create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/string_c.rs
 create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/token_stream_c.rs
 create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs
 create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs
 create mode 100644 internal/core/thirdparty/tantivy/token-stream.h
 create mode 100644 internal/core/thirdparty/tantivy/tokenizer.h
 create mode 100644 internal/core/thirdparty/tantivy/tokenizer_demo.cpp
 create mode 100644 internal/util/ctokenizer/c_map.go
 create mode 100644 internal/util/ctokenizer/c_token_stream.go
 create mode 100644 internal/util/ctokenizer/c_tokenizer.go
 create mode 100644 internal/util/ctokenizer/c_tokenizer_factory.go
 create mode 100644 internal/util/ctokenizer/c_tokenizer_test.go
 create mode 100644 internal/util/ctokenizer/helper.go
 create mode 100644 internal/util/tokenizerapi/mocks/TokenStream.go
 create mode 100644 internal/util/tokenizerapi/mocks/Tokenizer.go
 create mode 100644 internal/util/tokenizerapi/token_stream.go
 create mode 100644 internal/util/tokenizerapi/tokenizer.go

diff --git a/internal/core/src/segcore/CMakeLists.txt b/internal/core/src/segcore/CMakeLists.txt
index 618396132b..63eec8e63d 100644
--- a/internal/core/src/segcore/CMakeLists.txt
+++ b/internal/core/src/segcore/CMakeLists.txt
@@ -11,4 +11,4 @@
 
 add_source_at_current_directory_recursively()
 
-add_library(milvus_segcore OBJECT ${SOURCE_FILES})
\ No newline at end of file
+add_library(milvus_segcore OBJECT ${SOURCE_FILES})
diff --git a/internal/core/src/segcore/map_c.cpp b/internal/core/src/segcore/map_c.cpp
new file mode 100644
index 0000000000..e0a21f1c38
--- /dev/null
+++ b/internal/core/src/segcore/map_c.cpp
@@ -0,0 +1,39 @@
+// Copyright (C) 2019-2020 Zilliz. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software distributed under the License
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+// or implied. See the License for the specific language governing permissions and limitations under the License
+
+#include "segcore/map_c.h"
+
+#include <map>
+#include <memory>
+#include <string>
+
+using Map = std::map<std::string, std::string>;
+
+CMap
+create_cmap() {
+    auto m = std::make_unique<Map>();
+    return m.release();
+}
+
+void
+free_cmap(CMap m) {
+    delete static_cast<Map*>(m);
+}
+
+void
+cmap_set(CMap m,
+         const char* key,
+         uint32_t key_len,
+         const char* value,
+         uint32_t value_len) {
+    auto mm = static_cast<Map*>(m);
+    (*mm)[std::string(key, key_len)] = std::string(value, value_len);
+}
diff --git a/internal/core/src/segcore/map_c.h b/internal/core/src/segcore/map_c.h
new file mode 100644
index 0000000000..74a6f10f05
--- /dev/null
+++ b/internal/core/src/segcore/map_c.h
@@ -0,0 +1,37 @@
+// Copyright (C) 2019-2020 Zilliz. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software distributed under the License
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+// or implied. See the License for the specific language governing permissions and limitations under the License
+
+#pragma once
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void* CMap;
+
+CMap
+create_cmap();
+
+void
+free_cmap(CMap m);
+
+void
+cmap_set(CMap m,
+         const char* key,
+         uint32_t key_len,
+         const char* value,
+         uint32_t value_len);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/internal/core/src/segcore/token_stream_c.cpp b/internal/core/src/segcore/token_stream_c.cpp
new file mode 100644
index 0000000000..b3410268b9
--- /dev/null
+++ b/internal/core/src/segcore/token_stream_c.cpp
@@ -0,0 +1,38 @@
+// Copyright (C) 2019-2020 Zilliz. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software distributed under the License
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+// or implied. See the License for the specific language governing permissions and limitations under the License
+
+#include <memory>
+#include <string>
+
+#include "segcore/token_stream_c.h"
+#include "token-stream.h"
+
+void
+free_token_stream(CTokenStream token_stream) {
+    delete static_cast<milvus::tantivy::TokenStream*>(token_stream);
+}
+
+bool
+token_stream_advance(CTokenStream token_stream) {
+    return static_cast<milvus::tantivy::TokenStream*>(token_stream)->advance();
+}
+
+// Note: the returned token must be freed by the caller using `free_token`.
+const char*
+token_stream_get_token(CTokenStream token_stream) {
+    return static_cast<milvus::tantivy::TokenStream*>(token_stream)
+        ->get_token_no_copy();
+}
+
+void
+free_token(void* token) {
+    free_rust_string(static_cast<const char*>(token));
+}
diff --git a/internal/core/src/segcore/token_stream_c.h b/internal/core/src/segcore/token_stream_c.h
new file mode 100644
index 0000000000..3d830881f5
--- /dev/null
+++ b/internal/core/src/segcore/token_stream_c.h
@@ -0,0 +1,37 @@
+// Copyright (C) 2019-2020 Zilliz. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software distributed under the License
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+// or implied. See the License for the specific language governing permissions and limitations under the License
+
+#pragma once
+
+#include <stdbool.h>
+
+#include "map_c.h"
+#include "common/type_c.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void* CTokenStream;
+
+void free_token_stream(CTokenStream);
+
+bool token_stream_advance(CTokenStream);
+
+// Note: the returned string must be freed by the caller using `free_token`.
+const char* token_stream_get_token(CTokenStream);
+
+void
+free_token(void* token);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/internal/core/src/segcore/tokenizer_c.cpp b/internal/core/src/segcore/tokenizer_c.cpp
new file mode 100644
index 0000000000..292817eeb0
--- /dev/null
+++ b/internal/core/src/segcore/tokenizer_c.cpp
@@ -0,0 +1,41 @@
+// Copyright (C) 2019-2020 Zilliz. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software distributed under the License
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+// or implied. See the License for the specific language governing permissions and limitations under the License
+
+#include "segcore/tokenizer_c.h"
+#include "common/EasyAssert.h"
+
+#include "tokenizer.h"
+
+using Map = std::map<std::string, std::string>;
+
+CStatus
+create_tokenizer(CMap m, CTokenizer* tokenizer) {
+    try {
+        auto mm = reinterpret_cast<Map*>(m);
+        auto impl = std::make_unique<milvus::tantivy::Tokenizer>(*mm);
+        *tokenizer = impl.release();
+        return milvus::SuccessCStatus();
+    } catch (std::exception& e) {
+        return milvus::FailureCStatus(&e);
+    }
+}
+
+void
+free_tokenizer(CTokenizer tokenizer) {
+    auto impl = reinterpret_cast<milvus::tantivy::Tokenizer*>(tokenizer);
+    delete impl;
+}
+
+CTokenStream
+create_token_stream(CTokenizer tokenizer, const char* text, uint32_t text_len) {
+    auto impl = reinterpret_cast<milvus::tantivy::Tokenizer*>(tokenizer);
+    return impl->CreateTokenStream(std::string(text, text_len)).release();
+}
diff --git a/internal/core/src/segcore/tokenizer_c.h b/internal/core/src/segcore/tokenizer_c.h
new file mode 100644
index 0000000000..96bb307d36
--- /dev/null
+++ b/internal/core/src/segcore/tokenizer_c.h
@@ -0,0 +1,37 @@
+// Copyright (C) 2019-2020 Zilliz. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software distributed under the License
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+// or implied. See the License for the specific language governing permissions and limitations under the License
+
+#pragma once
+
+#include <stdint.h>
+
+#include "segcore/map_c.h"
+#include "segcore/token_stream_c.h"
+#include "common/type_c.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void* CTokenizer;
+
+CStatus
+create_tokenizer(CMap m, CTokenizer* tokenizer);
+
+void
+free_tokenizer(CTokenizer tokenizer);
+
+CTokenStream
+create_token_stream(CTokenizer tokenizer, const char* text, uint32_t text_len);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/internal/core/thirdparty/tantivy/CMakeLists.txt b/internal/core/thirdparty/tantivy/CMakeLists.txt
index 46371c9dc7..2fa79a3fef 100644
--- a/internal/core/thirdparty/tantivy/CMakeLists.txt
+++ b/internal/core/thirdparty/tantivy/CMakeLists.txt
@@ -58,24 +58,39 @@
 if (CMAKE_BUILD_TYPE STREQUAL "Debug")
     add_link_options(-fno-stack-protector -fno-omit-frame-pointer -fno-var-tracking -fsanitize=address)
 endif()
 
-add_executable(test_tantivy test.cpp)
-target_link_libraries(test_tantivy
-        tantivy_binding
-        boost_filesystem
-        dl
-        )
+# TODO: move the tests below to ut.
-add_executable(bench_tantivy bench.cpp)
-target_link_libraries(bench_tantivy
-        tantivy_binding
-        boost_filesystem
-        dl
-        )
+option(BUILD_TANTIVY_WITH_UT "compile tantivy with ut" OFF)
 
-add_executable(ffi_demo ffi_demo.cpp)
-target_link_libraries(ffi_demo
-        tantivy_binding
-        dl
-        )
+if (BUILD_TANTIVY_WITH_UT)
+    message(STATUS "compile tantivy with ut")
+    add_executable(test_tantivy test.cpp)
+    target_link_libraries(test_tantivy
+            tantivy_binding
+            boost_filesystem
+            dl
+            )
+
+    add_executable(bench_tantivy bench.cpp)
+    target_link_libraries(bench_tantivy
+            tantivy_binding
+            boost_filesystem
+            dl
+            )
+
+    add_executable(ffi_demo ffi_demo.cpp)
+    target_link_libraries(ffi_demo
+            tantivy_binding
+            dl
+            )
+
+    add_executable(tokenizer_demo tokenizer_demo.cpp)
+    target_link_libraries(tokenizer_demo
+            tantivy_binding
+            dl
+            )
+endif ()
 
 set( TANTIVY_INCLUDE_DIR ${LIB_HEADER_FOLDER};${CMAKE_CURRENT_SOURCE_DIR}
         CACHE INTERNAL "Path to tantivy include directory" )
diff --git a/internal/core/thirdparty/tantivy/rust-hashmap.h b/internal/core/thirdparty/tantivy/rust-hashmap.h
new file mode 100644
index 0000000000..0376de94d6
--- /dev/null
+++ b/internal/core/thirdparty/tantivy/rust-hashmap.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "tantivy-binding.h"
+#include "rust-binding.h"
+
+namespace milvus::tantivy {
+
+struct RustHashMap {
+ public:
+    NO_COPY_OR_ASSIGN(RustHashMap);
+
+    RustHashMap() {
+        ptr_ = create_hashmap();
+    }
+
+    ~RustHashMap() {
+        if (ptr_ != nullptr) {
+            free_hashmap(ptr_);
+        }
+    }
+
+    void
+    from(const std::map<std::string, std::string>& m) {
+        for (const auto& [k, v] : m) {
+            set(k, v);
+        }
+    }
+
+    void*
+    get_pointer() {
+        return ptr_;
+    }
+
+    void
+    set(const std::string& k, const std::string& v) {
+        hashmap_set_value(ptr_, k.c_str(), v.c_str());
+    }
+
+ private:
+    void* ptr_ = nullptr;
+};
+}  // namespace milvus::tantivy
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock
index 4ed3a35e4b..47872ac812 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock
@@ -180,6 +180,15 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "cedarwood"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90"
+dependencies = [
+ "smallvec",
+]
+
 [[package]]
 name = "census"
 version = "0.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -443,6 +452,15 @@ dependencies = [
  "slab",
 ]
 
+[[package]]
+name = "fxhash"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
+dependencies = [
+ "byteorder",
+]
+
 [[package]]
 name = "generator"
 version = "0.7.5"
@@ -559,6 +577,21 @@ version = "1.0.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
 
+[[package]]
+name = "jieba-rs"
+version = "0.6.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93f0c1347cd3ac8d7c6e3a2dc33ac496d365cf09fc0831aa61111e1a6738983e"
+dependencies = [
+ "cedarwood",
+ "fxhash",
+ "hashbrown 0.14.3",
+ "lazy_static",
+ "phf",
+ "phf_codegen",
+ "regex",
+]
+
 [[package]]
 name = "jobserver"
 version = "0.1.28"
@@ -754,6 +787,44 @@ dependencies = [
  "stable_deref_trait",
 ]
 
+[[package]]
+name = "phf"
+version = "0.11.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" +dependencies = [ + "phf_shared", + "rand", +] + +[[package]] +name = "phf_shared" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" +dependencies = [ + "siphasher", +] + [[package]] name = "pin-project-lite" version = "0.2.13" @@ -796,6 +867,21 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" + [[package]] name = "rayon" version = "1.10.0" @@ -953,6 +1039,12 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + [[package]] name = "sketches-ddsketch" version = "0.2.2" @@ -1070,10 +1162,12 @@ dependencies = [ "cbindgen", "env_logger", "futures", + "lazy_static", "libc", "log", "scopeguard", "tantivy", + "tantivy-jieba", "zstd-sys", ] @@ -1126,6 +1220,17 @@ dependencies = [ "utf8-ranges", ] +[[package]] +name = "tantivy-jieba" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44022293c12a8f878e03439b2f11806d3d394130fe33d4e7781cba91abbac0a4" +dependencies = [ + "jieba-rs", + "lazy_static", + "tantivy-tokenizer-api", +] + [[package]] name = "tantivy-query-grammar" version = "0.21.0" diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml index 12de291c5b..3bf9759d47 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml +++ b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml @@ -13,6 +13,8 @@ scopeguard = "1.2" zstd-sys = "=2.0.9" env_logger = "0.11.3" log = "0.4.21" +tantivy-jieba = "0.10.0" +lazy_static = "1.4.0" [build-dependencies] cbindgen = "0.26.0" diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h b/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h index 782adeb346..2f21472338 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h +++ b/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h @@ -25,6 +25,12 @@ void free_rust_array(RustArray array); void print_vector_of_strings(const char *const *ptr, uintptr_t len); +void *create_hashmap(); + +void hashmap_set_value(void *map, const char *key, const char *value); + +void free_hashmap(void *map); + void *tantivy_load_index(const char *path); void tantivy_free_index_reader(void *ptr); @@ -122,6 +128,20 @@ 
                                       uintptr_t len,
                                       int64_t offset);
 
+void free_rust_string(const char *ptr);
+
+void *tantivy_create_token_stream(void *tokenizer, const char *text);
+
+void tantivy_free_token_stream(void *token_stream);
+
+bool tantivy_token_stream_advance(void *token_stream);
+
+const char *tantivy_token_stream_get_token(void *token_stream);
+
+void *tantivy_create_tokenizer(void *tokenizer_params);
+
+void tantivy_free_tokenizer(void *tokenizer);
+
 bool tantivy_index_exist(const char *path);
 
 } // extern "C"
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/hashmap_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/hashmap_c.rs
new file mode 100644
index 0000000000..8185a27910
--- /dev/null
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/hashmap_c.rs
@@ -0,0 +1,28 @@
+use std::collections::HashMap;
+use std::ffi::CStr;
+use std::os::raw::c_char;
+
+use libc::c_void;
+
+use crate::util::{create_binding, free_binding};
+
+#[no_mangle]
+pub extern "C" fn create_hashmap() -> *mut c_void {
+    let map: HashMap<String, String> = HashMap::new();
+    create_binding(map)
+}
+
+#[no_mangle]
+pub extern "C" fn hashmap_set_value(map: *mut c_void, key: *const c_char, value: *const c_char) {
+    let m = map as *mut HashMap<String, String>;
+    let k = unsafe { CStr::from_ptr(key).to_str().unwrap() };
+    let v = unsafe { CStr::from_ptr(value).to_str().unwrap() };
+    unsafe {
+        (*m).insert(String::from(k), String::from(v));
+    }
+}
+
+#[no_mangle]
+pub extern "C" fn free_hashmap(map: *mut c_void) {
+    free_binding::<HashMap<String, String>>(map);
+}
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs
index 7c2601df4f..e2e7c01f2f 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs
@@ -10,7 +10,6 @@ use tantivy::{doc, tokenizer, Document, Index, IndexWriter};
 
 use crate::data_type::TantivyDataType;
 
-use crate::index_reader::IndexReaderWrapper;
 use crate::log::init_log;
 
 pub(crate) struct IndexWriterWrapper {
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs
index 36bb21fd92..41515724f3 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs
@@ -2,6 +2,7 @@ mod array;
 mod data_type;
 mod demo_c;
 mod docid_collector;
+mod hashmap_c;
 mod hashset_collector;
 mod index_reader;
 mod index_reader_c;
@@ -9,6 +10,10 @@ mod index_writer;
 mod index_writer_c;
 mod linkedlist_collector;
 mod log;
+mod string_c;
+mod token_stream_c;
+mod tokenizer;
+mod tokenizer_c;
 mod util;
 mod util_c;
 mod vec_collector;
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/string_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/string_c.rs
new file mode 100644
index 0000000000..fc1c1ea091
--- /dev/null
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/string_c.rs
@@ -0,0 +1,22 @@
+use std::ffi::{CStr, CString};
+
+use libc::c_char;
+
+use std::str;
+
+// Be careful when using this function: the returned &str borrows from the
+// input pointer, so `s` must not be freed while the result is in use.
+pub(crate) unsafe fn c_str_to_str<'a>(s: *const c_char) -> &'a str {
+    let rs = CStr::from_ptr(s);
+    str::from_utf8_unchecked(rs.to_bytes())
+}
+
+pub(crate) fn create_string(s: &str) -> *const c_char {
+    CString::new(s).unwrap().into_raw()
+}
+
+#[no_mangle]
+pub extern "C" fn free_rust_string(ptr: *const c_char) {
+    unsafe {
+        let _ = CString::from_raw(ptr as *mut c_char);
+    }
+}
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/token_stream_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/token_stream_c.rs
new file mode 100644
index 0000000000..e3b4c9f7c2
--- /dev/null
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/token_stream_c.rs
@@ -0,0 +1,40 @@
+use std::ffi::c_char;
+
+use libc::c_void;
+use tantivy::tokenizer::{BoxTokenStream, TextAnalyzer};
+
+use crate::string_c::c_str_to_str;
+use crate::{
+    string_c::create_string,
+    util::{create_binding, free_binding},
+};
+
+// Note: the tokenizer and the text must outlive the token stream.
+#[no_mangle]
+pub extern "C" fn tantivy_create_token_stream(
+    tokenizer: *mut c_void,
+    text: *const c_char,
+) -> *mut c_void {
+    let analyzer = tokenizer as *mut TextAnalyzer;
+    let token_stream = unsafe { (*analyzer).token_stream(c_str_to_str(text)) };
+    create_binding(token_stream)
+}
+
+#[no_mangle]
+pub extern "C" fn tantivy_free_token_stream(token_stream: *mut c_void) {
+    free_binding::<BoxTokenStream<'_>>(token_stream);
+}
+
+#[no_mangle]
+pub extern "C" fn tantivy_token_stream_advance(token_stream: *mut c_void) -> bool {
+    let real = token_stream as *mut BoxTokenStream<'_>;
+    unsafe { (*real).advance() }
+}
+
+// Note: the returned token should be released by calling `free_rust_string` after use.
+#[no_mangle]
+pub extern "C" fn tantivy_token_stream_get_token(token_stream: *mut c_void) -> *const c_char {
+    let real = token_stream as *mut BoxTokenStream<'_>;
+    let token = unsafe { (*real).token().text.as_str() };
+    create_string(token)
+}
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs
new file mode 100644
index 0000000000..a72e0d6b1f
--- /dev/null
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs
@@ -0,0 +1,34 @@
+use lazy_static::lazy_static;
+use log::info;
+use std::collections::HashMap;
+use tantivy::tokenizer::{TextAnalyzer, TokenizerManager};
+
+lazy_static! {
+    static ref DEFAULT_TOKENIZER_MANAGER: TokenizerManager = TokenizerManager::default();
+}
+
+pub(crate) fn default_tokenizer() -> TextAnalyzer {
+    DEFAULT_TOKENIZER_MANAGER.get("default").unwrap()
+}
+
+fn jieba_tokenizer() -> TextAnalyzer {
+    tantivy_jieba::JiebaTokenizer {}.into()
+}
+
+pub(crate) fn create_tokenizer(params: &HashMap<String, String>) -> Option<TextAnalyzer> {
+    match params.get("tokenizer") {
+        Some(tokenizer_name) => match tokenizer_name.as_str() {
+            "default" => {
+                return Some(default_tokenizer());
+            }
+            "jieba" => return Some(jieba_tokenizer()),
+            _ => {
+                return None;
+            }
+        },
+        None => {
+            info!("no tokenizer specified, using default tokenizer");
+            return Some(default_tokenizer());
+        }
+    }
+}
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs
new file mode 100644
index 0000000000..c2caf097fc
--- /dev/null
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_c.rs
@@ -0,0 +1,26 @@
+use std::collections::HashMap;
+
+use libc::c_void;
+use tantivy::tokenizer::TextAnalyzer;
+
+use crate::{
+    tokenizer::create_tokenizer,
+    util::{create_binding, free_binding},
+};
+
+#[no_mangle]
+pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *mut c_void) -> *mut c_void {
+    let analyzer = unsafe {
+        let m = tokenizer_params as *const HashMap<String, String>;
+        create_tokenizer(&(*m))
+    };
+    match analyzer {
+        Some(text_analyzer) => create_binding(text_analyzer),
+        None => std::ptr::null_mut(),
+    }
+}
+
+#[no_mangle]
+pub extern "C" fn tantivy_free_tokenizer(tokenizer: *mut c_void) {
+    free_binding::<TextAnalyzer>(tokenizer);
+}
diff --git a/internal/core/thirdparty/tantivy/token-stream.h b/internal/core/thirdparty/tantivy/token-stream.h
new file mode 100644
index 0000000000..03718be21b
--- /dev/null
+++ b/internal/core/thirdparty/tantivy/token-stream.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include <cassert>
+#include <memory>
+#include <string>
+
+#include "tantivy-binding.h"
+#include "rust-binding.h"
+
+namespace milvus::tantivy {
+struct TokenStream {
+ public:
+    NO_COPY_OR_ASSIGN(TokenStream);
+
+    TokenStream(void* ptr, std::shared_ptr<std::string> text)
+        : ptr_(ptr), text_(text) {
+        assert(ptr != nullptr);
+    }
+
+    ~TokenStream() {
+        if (ptr_ != nullptr) {
+            tantivy_free_token_stream(ptr_);
+        }
+    }
+
+ public:
+    bool
+    advance() {
+        return tantivy_token_stream_advance(ptr_);
+    }
+
+    std::string
+    get_token() {
+        auto token = tantivy_token_stream_get_token(ptr_);
+        std::string s(token);
+        free_rust_string(token);
+        return s;
+    }
+
+    // Note: the returned token must be freed by calling `free_rust_string`.
+    const char*
+    get_token_no_copy() {
+        return tantivy_token_stream_get_token(ptr_);
+    }
+
+ public:
+    void* ptr_;
+    std::shared_ptr<std::string> text_;
+};
+}  // namespace milvus::tantivy
diff --git a/internal/core/thirdparty/tantivy/tokenizer.h b/internal/core/thirdparty/tantivy/tokenizer.h
new file mode 100644
index 0000000000..94ef893818
--- /dev/null
+++ b/internal/core/thirdparty/tantivy/tokenizer.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include <stdexcept>
+
+#include "tantivy-binding.h"
+#include "rust-binding.h"
+#include "rust-hashmap.h"
+#include "token-stream.h"
+
+namespace milvus::tantivy {
+
+struct Tokenizer {
+ public:
+    NO_COPY_OR_ASSIGN(Tokenizer);
+
+    explicit Tokenizer(const std::map<std::string, std::string>& params) {
+        RustHashMap m;
+        m.from(params);
+        ptr_ = tantivy_create_tokenizer(m.get_pointer());
+        if (ptr_ == nullptr) {
+            // Throw a std::exception so that create_tokenizer's catch block
+            // can translate the failure into a CStatus.
+            throw std::runtime_error("invalid tokenizer parameters");
+        }
+    }
+
+    ~Tokenizer() {
+        if (ptr_ != nullptr) {
+            tantivy_free_tokenizer(ptr_);
+        }
+    }
+
+    std::unique_ptr<TokenStream>
+    CreateTokenStream(std::string&& text) {
+        auto shared_text = std::make_shared<std::string>(std::move(text));
+        auto token_stream =
+            tantivy_create_token_stream(ptr_, shared_text->c_str());
+        return std::make_unique<TokenStream>(token_stream, shared_text);
+    }
+
+    // CreateTokenStreamCopyText copies the text and then creates a token stream over the copy.
+    std::unique_ptr<TokenStream>
+    CreateTokenStreamCopyText(const std::string& text) {
+        auto shared_text = std::make_shared<std::string>(text);
+        auto token_stream =
+            tantivy_create_token_stream(ptr_, shared_text->c_str());
+        return std::make_unique<TokenStream>(token_stream, shared_text);
+    }
+
+ private:
+    void* ptr_;
+};
+
+}  // namespace milvus::tantivy
diff --git a/internal/core/thirdparty/tantivy/tokenizer_demo.cpp b/internal/core/thirdparty/tantivy/tokenizer_demo.cpp
new file mode 100644
index 0000000000..76a912b83f
--- /dev/null
+++ b/internal/core/thirdparty/tantivy/tokenizer_demo.cpp
@@ -0,0 +1,39 @@
+#include <iostream>
+#include "token-stream.h"
+#include "tokenizer.h"
+
+using Map = std::map<std::string, std::string>;
+
+using namespace milvus::tantivy;
+
+void
+test_tokenizer(const Map& m, std::string&& text) {
+    Tokenizer tokenizer(m);
+
+    auto token_stream = tokenizer.CreateTokenStream(std::move(text));
+    while (token_stream->advance()) {
+        auto token = token_stream->get_token();
+        std::cout << token << std::endl;
+    }
+}
+
+int
+main(int argc, char* argv[]) {
+    // default tokenizer
+    {
+        Map m;
+        test_tokenizer(m, "football, basketball, pingpang");
+    }
+
+    // jieba tokenizer
+    {
+        Map m;
+        std::string tokenizer_name = "jieba";
+        m["tokenizer"] = tokenizer_name;
+        test_tokenizer(m,
+                       "张华考上了北京大学;李萍进了中等技术学校;我在百货公司"
+                       "当售货员:我们都有光明的前途");
+    }
+
+    return 0;
+}
diff --git a/internal/util/ctokenizer/c_map.go b/internal/util/ctokenizer/c_map.go
new file mode 100644
index 0000000000..ba6cc1dc94
--- /dev/null
+++ b/internal/util/ctokenizer/c_map.go
@@ -0,0 +1,45 @@
+package ctokenizer
+
+/*
+#cgo pkg-config: milvus_core
+#include <stdlib.h>	// free
+#include "segcore/map_c.h"
+*/
+import "C"
+import "unsafe"
+
+type CMap struct {
+	ptr C.CMap
+}
+
+func NewCMap() *CMap {
+	return &CMap{
+		ptr: C.create_cmap(),
+	}
+}
+
+func (m *CMap) GetPointer() C.CMap {
+	return m.ptr
+}
+
+func (m *CMap) Set(key string, value string) {
+	cKey := C.CString(key)
+	defer C.free(unsafe.Pointer(cKey))
+
+	cValue := C.CString(value)
+	defer C.free(unsafe.Pointer(cValue))
+
+	C.cmap_set(m.ptr, cKey, (C.uint32_t)(len(key)), cValue, (C.uint32_t)(len(value)))
+}
+
+func (m *CMap) From(gm map[string]string) {
+	for k, v := range gm {
+		m.Set(k, v)
+	}
+}
+
+func (m *CMap) Destroy() {
+	if m.ptr != nil {
+		C.free_cmap(m.ptr)
+	}
+}
diff --git a/internal/util/ctokenizer/c_token_stream.go b/internal/util/ctokenizer/c_token_stream.go
new file mode 100644
index 0000000000..48109b8ba6
--- /dev/null
+++ b/internal/util/ctokenizer/c_token_stream.go
@@ -0,0 +1,40 @@
+package ctokenizer
+
+/*
+#cgo pkg-config: milvus_core
+#include <stdlib.h>	// free
+#include "segcore/token_stream_c.h"
+*/
+import "C"
+
+import (
+	"unsafe"
+
+	"github.com/milvus-io/milvus/internal/util/tokenizerapi"
+)
+
+var _ tokenizerapi.TokenStream = (*CTokenStream)(nil)
+
+type CTokenStream struct {
+	ptr C.CTokenStream
+}
+
+func NewCTokenStream(ptr C.CTokenStream) *CTokenStream {
+	return &CTokenStream{
+		ptr: ptr,
+	}
+}
+
+func (impl *CTokenStream) Advance() bool {
+	return bool(C.token_stream_advance(impl.ptr))
+}
+
+func (impl *CTokenStream) Token() string {
+	token := C.token_stream_get_token(impl.ptr)
+	defer C.free_token(unsafe.Pointer(token))
+	return C.GoString(token)
+}
+
+func (impl *CTokenStream) Destroy() {
+	C.free_token_stream(impl.ptr)
+}
diff --git a/internal/util/ctokenizer/c_tokenizer.go b/internal/util/ctokenizer/c_tokenizer.go
new file mode 100644
index 0000000000..915aa4cfa1
--- /dev/null
+++ b/internal/util/ctokenizer/c_tokenizer.go
@@ -0,0 +1,38 @@
+package ctokenizer
+
+/*
+#cgo pkg-config: milvus_core
+#include <stdlib.h>	// free
+#include "segcore/tokenizer_c.h"
+#include "segcore/token_stream_c.h"
+*/
+import "C"
+
+import (
+	"unsafe"
+
+	"github.com/milvus-io/milvus/internal/util/tokenizerapi"
+)
+
+var _ tokenizerapi.Tokenizer = (*CTokenizer)(nil)
+
+type CTokenizer struct {
+	ptr C.CTokenizer
+}
+
+func NewCTokenizer(ptr C.CTokenizer) *CTokenizer {
+	return &CTokenizer{
+		ptr: ptr,
+	}
+}
+
+func (impl *CTokenizer) NewTokenStream(text string) tokenizerapi.TokenStream {
+	cText := C.CString(text)
+	defer C.free(unsafe.Pointer(cText))
+	ptr := C.create_token_stream(impl.ptr, cText, (C.uint32_t)(len(text)))
+	return NewCTokenStream(ptr)
+}
+
+func (impl *CTokenizer) Destroy() {
+	C.free_tokenizer(impl.ptr)
+}
diff --git a/internal/util/ctokenizer/c_tokenizer_factory.go b/internal/util/ctokenizer/c_tokenizer_factory.go
new file mode 100644
index 0000000000..c5690d8861
--- /dev/null
+++ b/internal/util/ctokenizer/c_tokenizer_factory.go
@@ -0,0 +1,27 @@
+package ctokenizer
+
+/*
+#cgo pkg-config: milvus_core
+#include <stdlib.h>	// free
+#include "segcore/tokenizer_c.h"
+#include "segcore/token_stream_c.h"
+*/
+import "C"
+
+import (
+	"github.com/milvus-io/milvus/internal/util/tokenizerapi"
+)
+
+func NewTokenizer(m map[string]string) (tokenizerapi.Tokenizer, error) {
+	mm := NewCMap()
+	defer mm.Destroy()
+	mm.From(m)
+
+	var ptr C.CTokenizer
+	status := C.create_tokenizer(mm.GetPointer(), &ptr)
+	if err := HandleCStatus(&status, "failed to create tokenizer"); err != nil {
+		return nil, err
+	}
+
+	return NewCTokenizer(ptr), nil
+}
diff --git a/internal/util/ctokenizer/c_tokenizer_test.go b/internal/util/ctokenizer/c_tokenizer_test.go
new file mode 100644
index 0000000000..9b9517020d
--- /dev/null
+++ b/internal/util/ctokenizer/c_tokenizer_test.go
@@ -0,0 +1,39 @@
+package ctokenizer
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestTokenizer(t *testing.T) {
+	// default tokenizer.
+	{
+		m := make(map[string]string)
+		tokenizer, err := NewTokenizer(m)
+		assert.NoError(t, err)
+		defer tokenizer.Destroy()
+
+		tokenStream := tokenizer.NewTokenStream("football, basketball, pingpang")
+		defer tokenStream.Destroy()
+		for tokenStream.Advance() {
+			fmt.Println(tokenStream.Token())
+		}
+	}
+
+	// jieba tokenizer.
+	{
+		m := make(map[string]string)
+		m["tokenizer"] = "jieba"
+		tokenizer, err := NewTokenizer(m)
+		assert.NoError(t, err)
+		defer tokenizer.Destroy()
+
+		tokenStream := tokenizer.NewTokenStream("张华考上了北京大学;李萍进了中等技术学校;我在百货公司当售货员:我们都有光明的前途")
+		defer tokenStream.Destroy()
+		for tokenStream.Advance() {
+			fmt.Println(tokenStream.Token())
+		}
+	}
+}
diff --git a/internal/util/ctokenizer/helper.go b/internal/util/ctokenizer/helper.go
new file mode 100644
index 0000000000..38e681e201
--- /dev/null
+++ b/internal/util/ctokenizer/helper.go
@@ -0,0 +1,37 @@
+package ctokenizer
+
+/*
+#cgo pkg-config: milvus_core
+#include <stdlib.h>	// free
+#include "common/type_c.h"
+*/
+import "C"
+
+import (
+	"fmt"
+	"unsafe"
+
+	"github.com/milvus-io/milvus/pkg/log"
+	"github.com/milvus-io/milvus/pkg/util/merr"
+)
+
+// HandleCStatus deals with the error returned from CGO.
+func HandleCStatus(status *C.CStatus, extraInfo string) error {
+	if status.error_code == 0 {
+		return nil
+	}
+	errorCode := int(status.error_code)
+	errorMsg := C.GoString(status.error_msg)
+	defer C.free(unsafe.Pointer(status.error_msg))
+
+	logMsg := fmt.Sprintf("%s, C Runtime Exception: %s\n", extraInfo, errorMsg)
+	log.Warn(logMsg)
+	if errorCode == 2003 {
+		return merr.WrapErrSegcoreUnsupported(int32(errorCode), logMsg)
+	}
+	if errorCode == 2033 {
+		log.Info("fake finished the task")
+		return merr.ErrSegcorePretendFinished
+	}
+	return merr.WrapErrSegcore(int32(errorCode), logMsg)
+}
diff --git a/internal/util/tokenizerapi/mocks/TokenStream.go b/internal/util/tokenizerapi/mocks/TokenStream.go
new file mode 100644
index 0000000000..ae556b619a
--- /dev/null
+++ b/internal/util/tokenizerapi/mocks/TokenStream.go
@@ -0,0 +1,146 @@
+// Code generated by mockery v2.32.4. DO NOT EDIT.
+
+package mocks
+
+import mock "github.com/stretchr/testify/mock"
+
+// TokenStream is an autogenerated mock type for the TokenStream type
+type TokenStream struct {
+	mock.Mock
+}
+
+type TokenStream_Expecter struct {
+	mock *mock.Mock
+}
+
+func (_m *TokenStream) EXPECT() *TokenStream_Expecter {
+	return &TokenStream_Expecter{mock: &_m.Mock}
+}
+
+// Advance provides a mock function with given fields:
+func (_m *TokenStream) Advance() bool {
+	ret := _m.Called()
+
+	var r0 bool
+	if rf, ok := ret.Get(0).(func() bool); ok {
+		r0 = rf()
+	} else {
+		r0 = ret.Get(0).(bool)
+	}
+
+	return r0
+}
+
+// TokenStream_Advance_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'Advance'
+type TokenStream_Advance_Call struct {
+	*mock.Call
+}
+
+// Advance is a helper method to define mock.On call
+func (_e *TokenStream_Expecter) Advance() *TokenStream_Advance_Call {
+	return &TokenStream_Advance_Call{Call: _e.mock.On("Advance")}
+}
+
+func (_c *TokenStream_Advance_Call) Run(run func()) *TokenStream_Advance_Call {
+	_c.Call.Run(func(args mock.Arguments) {
+		run()
+	})
+	return _c
+}
+
+func (_c *TokenStream_Advance_Call) Return(_a0 bool) *TokenStream_Advance_Call {
+	_c.Call.Return(_a0)
+	return _c
+}
+
+func (_c *TokenStream_Advance_Call) RunAndReturn(run func() bool) *TokenStream_Advance_Call {
+	_c.Call.Return(run)
+	return _c
+}
+
+// Destroy provides a mock function with given fields:
+func (_m *TokenStream) Destroy() {
+	_m.Called()
+}
+
+// TokenStream_Destroy_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'Destroy'
+type TokenStream_Destroy_Call struct {
+	*mock.Call
+}
+
+// Destroy is a helper method to define mock.On call
+func (_e *TokenStream_Expecter) Destroy() *TokenStream_Destroy_Call {
+	return &TokenStream_Destroy_Call{Call: _e.mock.On("Destroy")}
+}
+
+func (_c *TokenStream_Destroy_Call) Run(run func()) *TokenStream_Destroy_Call {
+	_c.Call.Run(func(args mock.Arguments) {
+		run()
+	})
+	return _c
+}
+
+func (_c *TokenStream_Destroy_Call) Return() *TokenStream_Destroy_Call {
+	_c.Call.Return()
+	return _c
+}
+
+func (_c *TokenStream_Destroy_Call) RunAndReturn(run func()) *TokenStream_Destroy_Call {
+	_c.Call.Return(run)
+	return _c
+}
+
+// Token provides a mock function with given fields:
+func (_m *TokenStream) Token() string {
+	ret := _m.Called()
+
+	var r0 string
+	if rf, ok := ret.Get(0).(func() string); ok {
+		r0 = rf()
+	} else {
+		r0 = ret.Get(0).(string)
+	}
+
+	return r0
+}
+
+// TokenStream_Token_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'Token'
+type TokenStream_Token_Call struct {
+	*mock.Call
+}
+
+// Token is a helper method to define mock.On call
+func (_e *TokenStream_Expecter) Token() *TokenStream_Token_Call {
+	return &TokenStream_Token_Call{Call: _e.mock.On("Token")}
+}
+
+func (_c *TokenStream_Token_Call) Run(run func()) *TokenStream_Token_Call {
+	_c.Call.Run(func(args mock.Arguments) {
+		run()
+	})
+	return _c
+}
+
+func (_c *TokenStream_Token_Call) Return(_a0 string) *TokenStream_Token_Call {
+	_c.Call.Return(_a0)
+	return _c
+}
+
+func (_c *TokenStream_Token_Call) RunAndReturn(run func() string) *TokenStream_Token_Call {
+	_c.Call.Return(run)
+	return _c
+}
+
+// NewTokenStream creates a new instance of TokenStream. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
+// The first argument is typically a *testing.T value.
+func NewTokenStream(t interface {
+	mock.TestingT
+	Cleanup(func())
+}) *TokenStream {
+	mock := &TokenStream{}
+	mock.Mock.Test(t)
+
+	t.Cleanup(func() { mock.AssertExpectations(t) })
+
+	return mock
+}
diff --git a/internal/util/tokenizerapi/mocks/Tokenizer.go b/internal/util/tokenizerapi/mocks/Tokenizer.go
new file mode 100644
index 0000000000..e0dad6c19d
--- /dev/null
+++ b/internal/util/tokenizerapi/mocks/Tokenizer.go
@@ -0,0 +1,111 @@
+// Code generated by mockery v2.32.4. DO NOT EDIT.
+
+package mocks
+
+import (
+	tokenizerapi "github.com/milvus-io/milvus/internal/util/tokenizerapi"
+	mock "github.com/stretchr/testify/mock"
+)
+
+// Tokenizer is an autogenerated mock type for the Tokenizer type
+type Tokenizer struct {
+	mock.Mock
+}
+
+type Tokenizer_Expecter struct {
+	mock *mock.Mock
+}
+
+func (_m *Tokenizer) EXPECT() *Tokenizer_Expecter {
+	return &Tokenizer_Expecter{mock: &_m.Mock}
+}
+
+// Destroy provides a mock function with given fields:
+func (_m *Tokenizer) Destroy() {
+	_m.Called()
+}
+
+// Tokenizer_Destroy_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'Destroy'
+type Tokenizer_Destroy_Call struct {
+	*mock.Call
+}
+
+// Destroy is a helper method to define mock.On call
+func (_e *Tokenizer_Expecter) Destroy() *Tokenizer_Destroy_Call {
+	return &Tokenizer_Destroy_Call{Call: _e.mock.On("Destroy")}
+}
+
+func (_c *Tokenizer_Destroy_Call) Run(run func()) *Tokenizer_Destroy_Call {
+	_c.Call.Run(func(args mock.Arguments) {
+		run()
+	})
+	return _c
+}
+
+func (_c *Tokenizer_Destroy_Call) Return() *Tokenizer_Destroy_Call {
+	_c.Call.Return()
+	return _c
+}
+
+func (_c *Tokenizer_Destroy_Call) RunAndReturn(run func()) *Tokenizer_Destroy_Call {
+	_c.Call.Return(run)
+	return _c
+}
+
+// NewTokenStream provides a mock function with given fields: text
+func (_m *Tokenizer) NewTokenStream(text string) tokenizerapi.TokenStream {
+	ret := _m.Called(text)
+
+	var r0 tokenizerapi.TokenStream
+	if rf, ok := ret.Get(0).(func(string) tokenizerapi.TokenStream); ok {
+		r0 = rf(text)
+	} else {
+		if ret.Get(0) != nil {
+			r0 = ret.Get(0).(tokenizerapi.TokenStream)
+		}
+	}
+
+	return r0
+}
+
+// Tokenizer_NewTokenStream_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'NewTokenStream'
+type Tokenizer_NewTokenStream_Call struct {
+	*mock.Call
+}
+
+// NewTokenStream is a helper method to define mock.On call
+//   - text string
+func (_e *Tokenizer_Expecter) NewTokenStream(text interface{}) *Tokenizer_NewTokenStream_Call {
+	return &Tokenizer_NewTokenStream_Call{Call: _e.mock.On("NewTokenStream", text)}
+}
+
+func (_c *Tokenizer_NewTokenStream_Call) Run(run func(text string)) *Tokenizer_NewTokenStream_Call {
+	_c.Call.Run(func(args mock.Arguments) {
+		run(args[0].(string))
+	})
+	return _c
+}
+
+func (_c *Tokenizer_NewTokenStream_Call) Return(_a0 tokenizerapi.TokenStream) *Tokenizer_NewTokenStream_Call {
+	_c.Call.Return(_a0)
+	return _c
+}
+
+func (_c *Tokenizer_NewTokenStream_Call) RunAndReturn(run func(string) tokenizerapi.TokenStream) *Tokenizer_NewTokenStream_Call {
+	_c.Call.Return(run)
+	return _c
+}
+
+// NewTokenizer creates a new instance of Tokenizer. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
+// The first argument is typically a *testing.T value.
+func NewTokenizer(t interface {
+	mock.TestingT
+	Cleanup(func())
+}) *Tokenizer {
+	mock := &Tokenizer{}
+	mock.Mock.Test(t)
+
+	t.Cleanup(func() { mock.AssertExpectations(t) })
+
+	return mock
+}
diff --git a/internal/util/tokenizerapi/token_stream.go b/internal/util/tokenizerapi/token_stream.go
new file mode 100644
index 0000000000..2df0f0202e
--- /dev/null
+++ b/internal/util/tokenizerapi/token_stream.go
@@ -0,0 +1,8 @@
+package tokenizerapi
+
+//go:generate mockery --name=TokenStream --with-expecter
+type TokenStream interface {
+	Advance() bool
+	Token() string
+	Destroy()
+}
diff --git a/internal/util/tokenizerapi/tokenizer.go b/internal/util/tokenizerapi/tokenizer.go
new file mode 100644
index 0000000000..2b6debbec7
--- /dev/null
+++ b/internal/util/tokenizerapi/tokenizer.go
@@ -0,0 +1,7 @@
+package tokenizerapi
+
+//go:generate mockery --name=Tokenizer --with-expecter
+type Tokenizer interface {
+	NewTokenStream(text string) TokenStream
+	Destroy()
+}
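Usage note: the subtle part of the C++ wrapper added above is token ownership
across the FFI boundary. The sketch below is illustrative only and is not part
of the diff; it builds solely on the headers introduced in this patch.
TokenStream::get_token() copies the Rust-allocated token and frees it
immediately, while get_token_no_copy() returns the raw Rust allocation and
leaves the caller responsible for calling free_rust_string().

    #include <iostream>
    #include <map>
    #include <string>

    #include "token-stream.h"
    #include "tokenizer.h"

    int
    main() {
        std::map<std::string, std::string> params;
        params["tokenizer"] = "jieba";
        milvus::tantivy::Tokenizer tokenizer(params);

        // CreateTokenStreamCopyText keeps its own copy of the text alive for
        // the lifetime of the stream, so a temporary argument is safe here.
        auto stream = tokenizer.CreateTokenStreamCopyText("结巴分词的简单用法");
        while (stream->advance()) {
            // get_token() copies the token into a std::string and releases
            // the Rust allocation before returning.
            std::cout << stream->get_token() << std::endl;

            // The zero-copy variant transfers ownership instead:
            //   const char* raw = stream->get_token_no_copy();
            //   ... use raw ...
            //   free_rust_string(raw);
        }
        return 0;
    }

The Go layer follows the same contract internally: CTokenStream.Token() pairs
token_stream_get_token with free_token, so Go callers never manage the Rust
allocation themselves.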