milvus/internal/core/thirdparty/tantivy/tokenizer_demo.cpp

#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

#include "token-stream.h"
#include "tokenizer.h"

using Map = std::map<std::string, std::string>;

using namespace milvus::tantivy;
// Build a tokenizer from the params in `m`, tokenize `text`, and print one
// token per line.
void
test_tokenizer(const Map& m, std::string&& text) {
    Tokenizer tokenizer(m);
    auto token_stream = tokenizer.CreateTokenStream(std::move(text));
    while (token_stream->advance()) {
        auto token = token_stream->get_token();
        std::cout << token << std::endl;
    }
}
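
// A minimal sketch, not part of the original demo: the same
// advance()/get_token() loop can collect tokens into a vector instead of
// printing them, which is handier for assertions in tests. The helper name
// `collect_tokens` is hypothetical; it uses only the calls exercised above.
std::vector<std::string>
collect_tokens(const Map& m, std::string&& text) {
    Tokenizer tokenizer(m);
    auto token_stream = tokenizer.CreateTokenStream(std::move(text));
    std::vector<std::string> tokens;
    while (token_stream->advance()) {
        tokens.push_back(token_stream->get_token());
    }
    return tokens;
}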
int
main(int argc, char* argv[]) {
    // default tokenizer
    {
        Map m;
        test_tokenizer(m, "football, basketball, pingpang");
        test_tokenizer(m, "Avatar");
        test_tokenizer(m, "Action, Adventure, Fantasy, Science Fiction");
        test_tokenizer(m, "Ingenious Film Partners, Twentieth Century Fox");
        test_tokenizer(m, "Sam Worthington as Jake Sully");
    }
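
    // Usage sketch for the hypothetical collect_tokens helper above: with the
    // default (empty) params map, inspect the token list directly instead of
    // printing it.
    {
        Map m;
        auto tokens = collect_tokens(m, "football, basketball, pingpang");
        std::cout << tokens.size() << " tokens from the default tokenizer"
                  << std::endl;
    }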
    // jieba tokenizer: dictionary-based Chinese word segmentation
    {
        Map m;
        std::string tokenizer_name = "jieba";
        m["tokenizer"] = tokenizer_name;
        // "Zhang Hua was admitted to Peking University; Li Ping entered a
        // secondary technical school; I became a sales clerk at a department
        // store: we all have bright futures."
        test_tokenizer(m,
                       "张华考上了北京大学;李萍进了中等技术学校;我在百货公司"
                       "当售货员:我们都有光明的前途");
        test_tokenizer(m, "青铜时代");  // "The Bronze Age"
        test_tokenizer(m, "黄金时代");  // "The Golden Age"
        test_tokenizer(m, "时代");      // "age / era"
        test_tokenizer(m, "测试中文分词器的效果");  // "test the Chinese
                                                     // tokenizer's output"
    }
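
    // Sketch (assumption, not exercised by the original demo: a Tokenizer can
    // be reused across inputs): build the jieba tokenizer once and stream
    // several texts through it.
    {
        Map m;
        m["tokenizer"] = "jieba";
        Tokenizer tokenizer(m);
        for (auto* text : {"青铜时代", "黄金时代"}) {
            auto token_stream = tokenizer.CreateTokenStream(std::string(text));
            while (token_stream->advance()) {
                std::cout << token_stream->get_token() << std::endl;
            }
        }
    }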
    return 0;
}