mirror of https://github.com/milvus-io/milvus.git
378 lines
14 KiB
C++
378 lines
14 KiB
C++
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
|
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
|
// or implied. See the License for the specific language governing permissions and limitations under the License
|
|
|
|
#include <gtest/gtest.h>
|
|
#include <string>
|
|
#include <random>
|
|
|
|
#include "common/Schema.h"
|
|
#include "test_utils/GenExprProto.h"
|
|
#include "query/PlanProto.h"
|
|
#include "query/ExecPlanNodeVisitor.h"
|
|
#include "expr/ITypeExpr.h"
|
|
#include "test_utils/storage_test_utils.h"
|
|
#include "index/IndexFactory.h"
|
|
#include "index/NgramInvertedIndex.h"
|
|
#include "segcore/load_index_c.h"
|
|
|
|
using namespace milvus;
|
|
using namespace milvus::query;
|
|
using namespace milvus::segcore;
|
|
using namespace milvus::exec;
|
|
|
|
// The empty pattern must not produce an ngram literal.
TEST(ConvertToNgramLiteralTest, EmptyString) {
    const auto parsed = parse_ngram_pattern("");
    ASSERT_FALSE(parsed.has_value());
}
|
|
|
|
// A pattern without any wildcard is reported as an exact match and its
// literal is the pattern itself.
TEST(ConvertToNgramLiteralTest, ExactMatchSimple) {
    const auto parsed = parse_ngram_pattern("abc");
    ASSERT_TRUE(parsed.has_value());

    EXPECT_EQ(parsed->type, MatchType::ExactMatch);
    EXPECT_EQ(parsed->literal, "abc");
}
|
|
|
|
// A backslash-escaped '%' is expected to be kept as a plain '%' character in
// the literal instead of acting as a wildcard.
TEST(ConvertToNgramLiteralTest, ExactMatchWithEscapedPercent) {
    const auto parsed = parse_ngram_pattern("ab\\%cd");
    ASSERT_TRUE(parsed.has_value());

    EXPECT_EQ(parsed->type, MatchType::ExactMatch);
    EXPECT_EQ(parsed->literal, "ab%cd");
}
|
|
|
|
// The input contains an unescaped '.'; the expected literal carries a
// backslash in front of it ("a\.b"), i.e. the escaping is done by the parser.
TEST(ConvertToNgramLiteralTest, ExactMatchWithEscapedSpecialChar) {
    const auto parsed = parse_ngram_pattern("a.b");
    ASSERT_TRUE(parsed.has_value());

    EXPECT_EQ(parsed->type, MatchType::ExactMatch);
    EXPECT_EQ(parsed->literal, "a\\.b");
}
|
|
|
|
// A single leading '%' is stripped from the literal and the pattern is
// classified as MatchType::PrefixMatch.
// NOTE(review): in LIKE semantics "%abc" matches values that END with "abc";
// the PrefixMatch name presumably refers to where the wildcard sits -- confirm
// against parse_ngram_pattern before relying on the enum name.
TEST(ConvertToNgramLiteralTest, PrefixMatchSimple) {
    const auto parsed = parse_ngram_pattern("%abc");
    ASSERT_TRUE(parsed.has_value());

    EXPECT_EQ(parsed->type, MatchType::PrefixMatch);
    EXPECT_EQ(parsed->literal, "abc");
}
|
|
|
|
// A single trailing '%' is stripped from the literal and the pattern is
// classified as MatchType::PostfixMatch.
// NOTE(review): in LIKE semantics "abc%" matches values that START with
// "abc"; the PostfixMatch name presumably refers to where the wildcard sits --
// confirm against parse_ngram_pattern before relying on the enum name.
TEST(ConvertToNgramLiteralTest, PostfixMatchSimple) {
    const auto parsed = parse_ngram_pattern("abc%");
    ASSERT_TRUE(parsed.has_value());

    EXPECT_EQ(parsed->type, MatchType::PostfixMatch);
    EXPECT_EQ(parsed->literal, "abc");
}
|
|
|
|
// A literal wrapped in '%' on both sides is classified as an inner match and
// both wildcards are removed from the literal.
TEST(ConvertToNgramLiteralTest, InnerMatchSimple) {
    const auto parsed = parse_ngram_pattern("%abc%");
    ASSERT_TRUE(parsed.has_value());

    EXPECT_EQ(parsed->type, MatchType::InnerMatch);
    EXPECT_EQ(parsed->literal, "abc");
}
|
|
|
|
// A lone '%' in the middle of the pattern is neither prefix, postfix nor
// inner form, so no ngram literal is expected.
TEST(ConvertToNgramLiteralTest, MatchSinglePercentMiddle) {
    const auto parsed = parse_ngram_pattern("a%b");
    ASSERT_FALSE(parsed.has_value());
}
|
|
|
|
// Patterns that fall into the generic "Match" category (anything that is not
// exact / %xxx / xxx% / %xxx%) must not produce an ngram literal.
TEST(ConvertToNgramLiteralTest, MatchTypeReturnsNullopt) {
    // n below is the number of '%' characters in the pattern.
    for (const char* pattern : {
             // bare wildcard
             "%",
             // n=2, not %xxx% -> Match -> nullopt
             "%a%b",
             // n=2, not %xxx% -> Match -> nullopt
             "a%b%c",
             // n=2, not %xxx% because length is not > 2 -> Match -> nullopt
             "%%",
             // more than two wildcards -> Match -> nullopt
             "%a%b%c%",
         }) {
        EXPECT_FALSE(parse_ngram_pattern(pattern).has_value()) << pattern;
    }
}
|
|
|
|
// An unescaped '_' makes the pattern unusable for the ngram path regardless
// of whether '%' appears in front, behind, on both sides or not at all.
TEST(ConvertToNgramLiteralTest, UnescapedUnderscoreReturnsNullopt) {
    for (const char* pattern : {"a_b", "%a_b", "a_b%", "%a_b%"}) {
        EXPECT_FALSE(parse_ngram_pattern(pattern).has_value()) << pattern;
    }
}
|
|
|
|
// A backslash-escaped '_' is expected to be kept as a plain '_' character and
// the pattern is still an exact match.
TEST(ConvertToNgramLiteralTest, EscapedUnderscore) {
    const auto parsed = parse_ngram_pattern("a\\_b");
    ASSERT_TRUE(parsed.has_value());

    EXPECT_EQ(parsed->type, MatchType::ExactMatch);
    EXPECT_EQ(parsed->literal, "a_b");
}
|
|
|
|
auto
|
|
generate_field_meta(int64_t collection_id = 1,
|
|
int64_t partition_id = 2,
|
|
int64_t segment_id = 3,
|
|
int64_t field_id = 101,
|
|
DataType data_type = DataType::NONE,
|
|
DataType element_type = DataType::NONE,
|
|
bool nullable = false) -> storage::FieldDataMeta {
|
|
auto meta = storage::FieldDataMeta{
|
|
.collection_id = collection_id,
|
|
.partition_id = partition_id,
|
|
.segment_id = segment_id,
|
|
.field_id = field_id,
|
|
};
|
|
meta.field_schema.set_data_type(
|
|
static_cast<proto::schema::DataType>(data_type));
|
|
meta.field_schema.set_element_type(
|
|
static_cast<proto::schema::DataType>(element_type));
|
|
meta.field_schema.set_nullable(nullable);
|
|
return meta;
|
|
}
|
|
|
|
auto
|
|
generate_index_meta(int64_t segment_id = 3,
|
|
int64_t field_id = 101,
|
|
int64_t index_build_id = 1000,
|
|
int64_t index_version = 10000) -> storage::IndexMeta {
|
|
return storage::IndexMeta{
|
|
.segment_id = segment_id,
|
|
.field_id = field_id,
|
|
.build_id = index_build_id,
|
|
.index_version = index_version,
|
|
};
|
|
}
|
|
|
|
auto
|
|
generate_local_storage_config(const std::string& root_path)
|
|
-> storage::StorageConfig {
|
|
auto ret = storage::StorageConfig{};
|
|
ret.storage_type = "local";
|
|
ret.root_path = root_path;
|
|
return ret;
|
|
}
|
|
|
|
void
|
|
test_ngram_with_data(const boost::container::vector<std::string>& data,
|
|
const std::string& literal,
|
|
const std::vector<bool>& expected_result) {
|
|
int64_t collection_id = 1;
|
|
int64_t partition_id = 2;
|
|
int64_t segment_id = 3;
|
|
int64_t index_build_id = 4000;
|
|
int64_t index_version = 4000;
|
|
int64_t index_id = 5000;
|
|
|
|
auto schema = std::make_shared<Schema>();
|
|
auto field_id = schema->AddDebugField("ngram", DataType::VARCHAR);
|
|
|
|
auto field_meta = generate_field_meta(collection_id,
|
|
partition_id,
|
|
segment_id,
|
|
field_id.get(),
|
|
DataType::VARCHAR,
|
|
DataType::NONE,
|
|
false);
|
|
auto index_meta = generate_index_meta(
|
|
segment_id, field_id.get(), index_build_id, index_version);
|
|
|
|
std::string root_path = "/tmp/test-inverted-index/";
|
|
auto storage_config = generate_local_storage_config(root_path);
|
|
auto cm = CreateChunkManager(storage_config);
|
|
|
|
std::random_device rd;
|
|
std::mt19937 gen(rd());
|
|
std::uniform_int_distribution<> distrib(1, 100);
|
|
|
|
size_t nb = data.size();
|
|
|
|
auto field_data = storage::CreateFieldData(DataType::VARCHAR, false);
|
|
field_data->FillFieldData(data.data(), data.size());
|
|
|
|
auto segment = CreateSealedSegment(schema);
|
|
auto field_data_info = PrepareSingleFieldInsertBinlog(collection_id,
|
|
partition_id,
|
|
segment_id,
|
|
field_id.get(),
|
|
{field_data},
|
|
cm);
|
|
segment->LoadFieldData(field_data_info);
|
|
|
|
auto payload_reader =
|
|
std::make_shared<milvus::storage::PayloadReader>(field_data);
|
|
storage::InsertData insert_data(payload_reader);
|
|
insert_data.SetFieldDataMeta(field_meta);
|
|
insert_data.SetTimestamps(0, 100);
|
|
|
|
auto serialized_bytes = insert_data.Serialize(storage::Remote);
|
|
|
|
auto get_binlog_path = [=](int64_t log_id) {
|
|
return fmt::format("{}/{}/{}/{}/{}",
|
|
collection_id,
|
|
partition_id,
|
|
segment_id,
|
|
field_id.get(),
|
|
log_id);
|
|
};
|
|
|
|
auto log_path = get_binlog_path(0);
|
|
|
|
auto cm_w = ChunkManagerWrapper(cm);
|
|
cm_w.Write(log_path, serialized_bytes.data(), serialized_bytes.size());
|
|
|
|
storage::FileManagerContext ctx(field_meta, index_meta, cm);
|
|
std::vector<std::string> index_files;
|
|
|
|
{
|
|
Config config;
|
|
config["index_type"] = milvus::index::INVERTED_INDEX_TYPE;
|
|
config["insert_files"] = std::vector<std::string>{log_path};
|
|
|
|
auto ngram_params = index::NgramParams{
|
|
.loading_index = false,
|
|
.min_gram = 2,
|
|
.max_gram = 4,
|
|
};
|
|
auto index =
|
|
std::make_shared<index::NgramInvertedIndex>(ctx, ngram_params);
|
|
index->Build(config);
|
|
|
|
auto create_index_result = index->Upload();
|
|
auto memSize = create_index_result->GetMemSize();
|
|
auto serializedSize = create_index_result->GetSerializedSize();
|
|
ASSERT_GT(memSize, 0);
|
|
ASSERT_GT(serializedSize, 0);
|
|
index_files = create_index_result->GetIndexFiles();
|
|
}
|
|
|
|
{
|
|
index::CreateIndexInfo index_info{};
|
|
index_info.index_type = milvus::index::INVERTED_INDEX_TYPE;
|
|
index_info.field_type = DataType::VARCHAR;
|
|
|
|
Config config;
|
|
config[milvus::index::INDEX_FILES] = index_files;
|
|
config[milvus::LOAD_PRIORITY] =
|
|
milvus::proto::common::LoadPriority::HIGH;
|
|
|
|
auto ngram_params = index::NgramParams{
|
|
.loading_index = true,
|
|
.min_gram = 2,
|
|
.max_gram = 4,
|
|
};
|
|
auto index =
|
|
std::make_unique<index::NgramInvertedIndex>(ctx, ngram_params);
|
|
index->Load(milvus::tracer::TraceContext{}, config);
|
|
|
|
auto cnt = index->Count();
|
|
ASSERT_EQ(cnt, nb);
|
|
|
|
exec::SegmentExpr segment_expr(std::move(std::vector<exec::ExprPtr>{}),
|
|
"SegmentExpr",
|
|
segment.get(),
|
|
field_id,
|
|
{},
|
|
DataType::VARCHAR,
|
|
nb,
|
|
8192,
|
|
0);
|
|
|
|
auto bitset = index->InnerMatchQuery(literal, &segment_expr).value();
|
|
for (size_t i = 0; i < nb; i++) {
|
|
ASSERT_EQ(bitset[i], expected_result[i]);
|
|
}
|
|
}
|
|
|
|
{
|
|
std::map<std::string, std::string> index_params{
|
|
{milvus::index::INDEX_TYPE, milvus::index::NGRAM_INDEX_TYPE},
|
|
{milvus::index::MIN_GRAM, "2"},
|
|
{milvus::index::MAX_GRAM, "4"},
|
|
{milvus::LOAD_PRIORITY, "HIGH"},
|
|
};
|
|
milvus::segcore::LoadIndexInfo load_index_info{
|
|
.collection_id = collection_id,
|
|
.partition_id = partition_id,
|
|
.segment_id = segment_id,
|
|
.field_id = field_id.get(),
|
|
.field_type = DataType::VARCHAR,
|
|
.enable_mmap = true,
|
|
.mmap_dir_path = "/tmp/test-ngram-index-mmap-dir",
|
|
.index_id = index_id,
|
|
.index_build_id = index_build_id,
|
|
.index_version = index_version,
|
|
.index_params = index_params,
|
|
.index_files = index_files,
|
|
.schema = field_meta.field_schema,
|
|
.index_size = 1024 * 1024 * 1024,
|
|
};
|
|
|
|
uint8_t trace_id[16] = {0};
|
|
uint8_t span_id[8] = {0};
|
|
trace_id[0] = 1;
|
|
span_id[0] = 2;
|
|
CTraceContext trace{
|
|
.traceID = trace_id,
|
|
.spanID = span_id,
|
|
.traceFlags = 0,
|
|
};
|
|
auto cload_index_info = static_cast<CLoadIndexInfo>(&load_index_info);
|
|
AppendIndexV2(trace, cload_index_info);
|
|
UpdateSealedSegmentIndex(segment.get(), cload_index_info);
|
|
|
|
auto unary_range_expr =
|
|
test::GenUnaryRangeExpr(OpType::InnerMatch, literal);
|
|
auto column_info = test::GenColumnInfo(
|
|
field_id.get(), proto::schema::DataType::VarChar, false, false);
|
|
unary_range_expr->set_allocated_column_info(column_info);
|
|
auto expr = test::GenExpr();
|
|
expr->set_allocated_unary_range_expr(unary_range_expr);
|
|
auto parser = ProtoParser(schema);
|
|
auto typed_expr = parser.ParseExprs(*expr);
|
|
auto parsed = std::make_shared<plan::FilterBitsNode>(
|
|
DEFAULT_PLANNODE_ID, typed_expr);
|
|
BitsetType final;
|
|
final = ExecuteQueryExpr(parsed, segment.get(), nb, MAX_TIMESTAMP);
|
|
for (size_t i = 0; i < nb; i++) {
|
|
ASSERT_EQ(final[i], expected_result[i]);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Inner-match for the lowercase literal "secondary school" over five
// Wikipedia-style rows; the expected hit pattern is listed per row.
TEST(NgramIndex, TestNgramWikiEpisode) {
    boost::container::vector<std::string> docs;

    // Row 0, miss: contains "secondary studies" and "school called", but
    // never the two words next to each other.
    docs.emplace_back(
        "'Indira Davelba Murillo Alvarado (Tegucigalpa, "
        "the youngest of eight siblings. She attended primary school at the "
        "Escuela 14 de Julio, and her secondary studies at the Instituto "
        "school called \"Indi del Bosque\", where she taught the children of "
        "Honduran women'");

    // Row 1, hit: "public secondary school".
    docs.emplace_back(
        "Richmond Green Secondary School is a public secondary school in "
        "Richmond Hill, Ontario, Canada.");

    // Row 2, hit: "500-year-old secondary school".
    docs.emplace_back(
        "The Gymnasium in 2002 Gymnasium Philippinum or Philippinum High "
        "School is an almost 500-year-old secondary school in Marburg, Hesse, "
        "Germany.");

    // Row 3, hit: "Canadian secondary school".
    docs.emplace_back(
        "Sir Winston Churchill Secondary School is a Canadian secondary school "
        "located in St. Catharines, Ontario.");

    // Row 4, miss: only the capitalized "Secondary School" is present.
    docs.emplace_back("Sir Winston Churchill Secondary School");

    const std::vector<bool> expected_hits{false, true, true, true, false};

    test_ngram_with_data(docs, "secondary school", expected_hits);
}
|
|
|
|
// 10000 identical rows holding the query words in the wrong order.
TEST(NgramIndex, TestNgramAllFalse) {
    constexpr size_t kRowCount = 10000;

    boost::container::vector<std::string> docs(kRowCount,
                                               "elementary school secondary");

    // Every row can be hit by the ngram tantivy lookup, but all of them are
    // filtered out by the second phase, so no row is expected to match.
    const std::vector<bool> expected_hits(kRowCount, false);

    test_ngram_with_data(docs, "secondary school", expected_hits);
}
|