// Source: milvus/internal/core/thirdparty/tantivy/jieba_demo.cpp (64 lines, 1.8 KiB, C++)

#include <cassert>
#include <cstdint>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

#include <boost/filesystem/operations.hpp>

#include "tantivy-binding.h"
#include "tantivy-wrapper.h"
using namespace milvus::tantivy;
// Collects the `w.array_.len` elements starting at `w.array_.array` into an
// ordered set, so callers can test membership and count distinct values.
std::set<uint32_t>
to_set(const RustArrayWrapper& w) {
    auto first = w.array_.array;
    auto last = first + w.array_.len;

    std::set<uint32_t> ids;
    ids.insert(first, last);
    return ids;
}
// Demo / smoke test for the "jieba" tokenizer.
//
// Builds a text index, adds four Chinese documents at offsets 0..3, commits,
// opens a reader, and then runs three match queries, comparing the returned
// offsets against the expected ones.
//
// Returns 0 when every query yields exactly the expected offsets, 1 otherwise.
// The checks are explicit comparisons rather than assert(), so they are still
// performed when the program is compiled with NDEBUG.
int
main(int argc, char* argv[]) {
    // Command-line arguments are not used by this demo.
    (void)argc;
    (void)argv;

    std::string tokenizer_name = "jieba";
    std::map<std::string, std::string> analyzer_params;
    analyzer_params["tokenizer"] = tokenizer_name;

    // NOTE(review): the positional arguments are presumably (field name,
    // in-memory flag, path, tokenizer name, analyzer params) -- confirm
    // against the constructor declared in tantivy-wrapper.h.
    auto text_index = TantivyIndexWrapper(
        "text_demo", true, "", tokenizer_name.c_str(), analyzer_params);

    // Adds one document to the index at the given offset.
    auto write_single_text = [&text_index](const std::string& s,
                                           int64_t offset) {
        text_index.add_data(&s, 1, offset);
    };

    write_single_text(
        "张华考上了北京大学;李萍进了中等技术学校;我在百货公司当售货员:我"
        "们都有光明的前途",
        0);
    write_single_text("测试中文分词器的效果", 1);
    write_single_text("黄金时代", 2);
    write_single_text("青铜时代", 3);
    text_index.commit();

    text_index.create_reader();
    text_index.register_tokenizer(tokenizer_name.c_str(), analyzer_params);

    // Writes the ids of a set to stderr, space separated.
    auto print_ids = [](const std::set<uint32_t>& ids) {
        for (auto id : ids) {
            std::cerr << ' ' << id;
        }
    };

    // Runs one match query and reports whether the returned offsets are
    // exactly `expected`; on mismatch, prints both sets to stderr.
    auto query_matches = [&text_index, &print_ids](
                             const char* query,
                             const std::set<uint32_t>& expected) {
        const auto hits = to_set(text_index.match_query(query));
        if (hits == expected) {
            return true;
        }
        std::cerr << "match_query(\"" << query << "\") mismatch: expected {";
        print_ids(expected);
        std::cerr << " } but got {";
        print_ids(hits);
        std::cerr << " }\n";
        return false;
    };

    // Run every query even after a failure so all mismatches are reported.
    bool all_ok = true;
    all_ok = query_matches("北京", {0}) && all_ok;
    all_ok = query_matches("效果", {1}) && all_ok;
    all_ok = query_matches("时代", {2, 3}) && all_ok;

    return all_ok ? 0 : 1;
}