Support flat

Signed-off-by: FluorineDog <guilin.gou@zilliz.com>
pull/4973/head^2
FluorineDog 2021-02-27 12:46:37 +08:00 committed by yefu.chen
parent 04c36eb8dd
commit 66146223ca
10 changed files with 120 additions and 56 deletions

View File

@ -15,6 +15,7 @@
#include "utils/EasyAssert.h"
#include <string>
#include <stdexcept>
#include <optional>
namespace milvus {
inline int
@ -82,7 +83,7 @@ datatype_is_vector(DataType datatype) {
return datatype == DataType::VECTOR_BINARY || datatype == DataType::VECTOR_FLOAT;
}
struct FieldMeta {
class FieldMeta {
public:
FieldMeta(const FieldMeta&) = delete;
FieldMeta(FieldMeta&&) = default;
@ -95,7 +96,7 @@ struct FieldMeta {
Assert(!is_vector());
}
FieldMeta(const FieldName& name, FieldId id, DataType type, int64_t dim, MetricType metric_type)
FieldMeta(const FieldName& name, FieldId id, DataType type, int64_t dim, std::optional<MetricType> metric_type)
: name_(name), id_(id), type_(type), vector_info_(VectorInfo{dim, metric_type}) {
Assert(is_vector());
}
@ -113,7 +114,7 @@ struct FieldMeta {
return vector_info_->dim_;
}
MetricType
std::optional<MetricType>
get_metric_type() const {
Assert(is_vector());
Assert(vector_info_.has_value());
@ -147,7 +148,7 @@ struct FieldMeta {
private:
struct VectorInfo {
int64_t dim_;
MetricType metric_type_;
std::optional<MetricType> metric_type_;
};
FieldName name_;
FieldId id_;

View File

@ -34,7 +34,7 @@ class Schema {
// auto gen field_id for convenience
FieldId
AddDebugField(const std::string& name, DataType data_type, int64_t dim, MetricType metric_type) {
AddDebugField(const std::string& name, DataType data_type, int64_t dim, std::optional<MetricType> metric_type) {
static int64_t debug_id = 2001;
auto field_id = FieldId(debug_id);
debug_id += 2;

View File

@ -70,35 +70,40 @@ FloatSearch(const segcore::SegmentGrowingImpl& segment,
// std::vector<float> final_dis(total_count, std::numeric_limits<float>::max());
SubQueryResult final_qr(num_queries, topK, metric_type);
dataset::QueryDataset query_dataset{metric_type, num_queries, topK, dim, query_data};
auto max_indexed_id = indexing_record.get_finished_ack();
const auto& field_indexing = indexing_record.get_vec_field_indexing(vecfield_offset);
auto search_conf = field_indexing.get_search_conf(topK);
for (int chunk_id = 0; chunk_id < max_indexed_id; ++chunk_id) {
auto size_per_chunk = field_indexing.get_size_per_chunk();
auto indexing = field_indexing.get_chunk_indexing(chunk_id);
auto sub_view = BitsetSubView(bitset, chunk_id * size_per_chunk, size_per_chunk);
auto sub_qr = SearchOnIndex(query_dataset, *indexing, search_conf, sub_view);
// convert chunk uid to segment uid
for (auto& x : sub_qr.mutable_labels()) {
if (x != -1) {
x += chunk_id * size_per_chunk;
}
}
final_qr.merge(sub_qr);
}
auto vec_ptr = record.get_field_data<FloatVector>(vecfield_offset);
int current_chunk_id = 0;
if (indexing_record.is_in(vecfield_offset)) {
auto max_indexed_id = indexing_record.get_finished_ack();
const auto& field_indexing = indexing_record.get_vec_field_indexing(vecfield_offset);
auto search_conf = field_indexing.get_search_conf(topK);
Assert(vec_ptr->get_size_per_chunk() == field_indexing.get_size_per_chunk());
for (int chunk_id = current_chunk_id; chunk_id < max_indexed_id; ++chunk_id) {
auto size_per_chunk = field_indexing.get_size_per_chunk();
auto indexing = field_indexing.get_chunk_indexing(chunk_id);
auto sub_view = BitsetSubView(bitset, chunk_id * size_per_chunk, size_per_chunk);
auto sub_qr = SearchOnIndex(query_dataset, *indexing, search_conf, sub_view);
// convert chunk uid to segment uid
for (auto& x : sub_qr.mutable_labels()) {
if (x != -1) {
x += chunk_id * size_per_chunk;
}
}
final_qr.merge(sub_qr);
}
current_chunk_id = max_indexed_id;
}
// step 4: brute force search where small indexing is unavailable
auto vec_size_per_chunk = vec_ptr->get_size_per_chunk();
Assert(vec_size_per_chunk == field_indexing.get_size_per_chunk());
auto max_chunk = upper_div(ins_barrier, vec_size_per_chunk);
for (int chunk_id = max_indexed_id; chunk_id < max_chunk; ++chunk_id) {
for (int chunk_id = current_chunk_id; chunk_id < max_chunk; ++chunk_id) {
auto& chunk = vec_ptr->get_chunk(chunk_id);
auto element_begin = chunk_id * vec_size_per_chunk;
@ -116,6 +121,7 @@ FloatSearch(const segcore::SegmentGrowingImpl& segment,
}
final_qr.merge(sub_qr);
}
current_chunk_id = max_chunk;
results.result_distances_ = std::move(final_qr.mutable_values());
results.internal_seg_offsets_ = std::move(final_qr.mutable_labels());

View File

@ -39,21 +39,26 @@ VectorFieldIndexing::BuildIndexRange(int64_t ack_beg, int64_t ack_end, const Vec
// Returns a knowhere::Config holding the field dimension and metric type taken
// from field_meta_, together with fixed nlist (100), nprobe (4) and device id (0).
// The field's metric type is optional; this function asserts that it is present.
knowhere::Config
VectorFieldIndexing::get_build_conf() const {
// TODO
// The metric type must be set here; its name is what goes into the config.
auto type_opt = field_meta_.get_metric_type();
Assert(type_opt.has_value());
auto type_name = MetricTypeToName(type_opt.value());
return knowhere::Config{{knowhere::meta::DIM, field_meta_.get_dim()},
{knowhere::IndexParams::nlist, 100},
{knowhere::IndexParams::nprobe, 4},
// NOTE(review): the next two Metric::TYPE entries appear to be the removed and
// added sides of one diff line; only the type_name form matches the optional
// metric type unwrapped above — confirm against the real file.
{knowhere::Metric::TYPE, MetricTypeToName(field_meta_.get_metric_type())},
{knowhere::Metric::TYPE, type_name},
{knowhere::meta::DEVICEID, 0}};
}
// Returns a knowhere::Config for searching: field dimension and metric type from
// field_meta_, the requested top_K, and fixed nlist (100), nprobe (4) and
// device id (0).
// NOTE(review): this span appears to interleave the removed (first return) and
// added (everything after it) sides of a diff hunk — confirm against the real
// file which statements are live.
knowhere::Config
VectorFieldIndexing::get_search_conf(int top_K) const {
return knowhere::Config{{knowhere::meta::DIM, field_meta_.get_dim()},
{knowhere::meta::TOPK, top_K},
{knowhere::IndexParams::nlist, 100},
{knowhere::IndexParams::nprobe, 4},
{knowhere::Metric::TYPE, MetricTypeToName(field_meta_.get_metric_type())},
{knowhere::meta::DEVICEID, 0}};
// TODO
// The field's metric type is optional; it is asserted to be present before its
// name is placed into the config.
auto type_opt = field_meta_.get_metric_type();
Assert(type_opt.has_value());
auto type_name = MetricTypeToName(type_opt.value());
return knowhere::Config{{knowhere::meta::DIM, field_meta_.get_dim()}, {knowhere::meta::TOPK, top_K},
{knowhere::IndexParams::nlist, 100}, {knowhere::IndexParams::nprobe, 4},
{knowhere::Metric::TYPE, type_name}, {knowhere::meta::DEVICEID, 0}};
}
void

View File

@ -109,14 +109,25 @@ class IndexingRecord {
// Walks schema_ and registers an indexing object (via CreateIndex) in
// field_indexings_ for each field, keyed by the field's position in the schema.
// In the offset_id-based loop, vector fields of type VECTOR_BINARY and vector
// fields without a metric type are skipped and therefore get no entry.
// NOTE(review): this span appears to interleave removed and added diff lines
// (two loop headers, two counters, two trailing asserts) — confirm against the
// real file which statements are live.
void
Initialize() {
int offset = 0;
for (auto& field : schema_) {
if (field.get_data_type() != DataType::VECTOR_BINARY) {
field_indexings_.try_emplace(FieldOffset(offset), CreateIndex(field, size_per_chunk_));
int offset_id = 0;
for (const FieldMeta& field : schema_) {
// Capture this field's offset, then advance the counter up front so that the
// `continue` statements below cannot skip the increment.
auto offset = FieldOffset(offset_id);
++offset_id;
if (field.is_vector()) {
// TODO: binary vectors get no small index for now; re-enable once config.yaml is ready
if (field.get_data_type() == DataType::VECTOR_BINARY) {
continue;
}
// A vector field without a metric type is treated as flat and gets no index
if (!field.get_metric_type().has_value()) {
continue;
}
}
++offset;
field_indexings_.try_emplace(offset, CreateIndex(field, size_per_chunk_));
}
// Every schema field must have been visited exactly once.
assert(offset == schema_.size());
assert(offset_id == schema_.size());
}
// concurrent, reentrant
@ -131,7 +142,7 @@ class IndexingRecord {
// Returns the indexing object stored in field_indexings_ for field_offset.
// The offset is expected to be present; this is checked by assertion before the
// lookup.
// NOTE(review): the lowercase assert and the Assert macro look like the removed
// and added sides of one diff line — confirm which one the real file keeps.
const FieldIndexing&
get_field_indexing(FieldOffset field_offset) const {
assert(field_indexings_.count(field_offset));
Assert(field_indexings_.count(field_offset));
return *field_indexings_.at(field_offset);
}
@ -142,6 +153,12 @@ class IndexingRecord {
AssertInfo(ptr, "invalid indexing");
return *ptr;
}
// Reports whether field_indexings_ holds an entry for the given field offset.
bool
is_in(FieldOffset field_offset) const {
    const auto entry_count = field_indexings_.count(field_offset);
    return entry_count != 0;
}
template <typename T>
auto
get_scalar_field_indexing(FieldOffset field_offset) const -> const ScalarFieldIndexing<T>& {

View File

@ -178,7 +178,8 @@ SegmentSealedImpl::vector_search(int64_t vec_count,
query::dataset::QueryDataset dataset;
dataset.query_data = query_data;
dataset.num_queries = query_count;
dataset.metric_type = field_meta.get_metric_type();
// if(field_meta.is)
dataset.metric_type = query_info.metric_type_;
dataset.topk = query_info.topK_;
dataset.dim = field_meta.get_dim();

View File

@ -29,17 +29,8 @@ FetchContent_Declare(
BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/googletest-build
DOWNLOAD_DIR ${THIRDPARTY_DOWNLOAD_PATH} )
FetchContent_GetProperties( googletest )
if ( NOT googletest_POPULATED )
FetchContent_Populate( googletest )
# Adding the following targets:
# gtest, gtest_main, gmock, gmock_main
add_subdirectory( ${googletest_SOURCE_DIR}
${googletest_BINARY_DIR}
EXCLUDE_FROM_ALL )
endif()
FetchContent_MakeAvailable(googletest)
# include(GoogleTest)
# ****************************************************************
# Create ALIAS Target

View File

@ -1,5 +1,3 @@
enable_testing()
include_directories(${CMAKE_HOME_DIRECTORY}/src)
include_directories(${CMAKE_HOME_DIRECTORY}/src/index/knowhere)
set(MILVUS_TEST_FILES

View File

@ -411,6 +411,50 @@ TEST(Query, ExecEmpty) {
}
}
// Runs a predicate-free vector search on a growing segment whose float-vector
// field is declared without a metric type (std::nullopt); the query DSL itself
// names the metric ("L2"). The test only prints the JSON form of the result and
// makes no assertions on its contents.
TEST(Query, ExecWithoutPredicateFlat) {
    using namespace milvus::query;
    using namespace milvus::segcore;

    // Schema: a 16-dim float vector with no metric type, plus a scalar float field.
    auto schema = std::make_shared<Schema>();
    schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, std::nullopt);
    schema->AddDebugField("age", DataType::FLOAT);

    // Vector-only query: top 5 by L2 against placeholder "$0".
    std::string dsl = R"({
"bool": {
"must": [
{
"vector": {
"fakevec": {
"metric_type": "L2",
"params": {
"nprobe": 10
},
"query": "$0",
"topk": 5
}
}
}
]
}
})";
    auto plan = CreatePlan(*schema, dsl);

    // Fill a growing segment with N generated rows.
    int64_t N = 1000 * 1000;
    auto dataset = DataGen(schema, N);
    auto segment = CreateGrowingSegment(schema);
    segment->PreInsert(N);
    segment->Insert(0, N, dataset.row_ids_.data(), dataset.timestamps_.data(), dataset.raw_);

    // Build a placeholder group of 5 query vectors of dimension 16.
    auto num_queries = 5;
    auto ph_group_raw = CreatePlaceholderGroup(num_queries, 16, 1024);
    auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());

    QueryResult qr;
    Timestamp time = 1000000;
    std::vector<const PlaceholderGroup*> ph_group_arr = {ph_group.get()};
    qr = segment->Search(plan.get(), ph_group_arr.data(), &time, 1);

    // Dump the result for manual inspection.
    auto json = QueryResultToJson(qr);
    std::cout << json.dump(2);
}
TEST(Query, ExecWithoutPredicate) {
using namespace milvus::query;
using namespace milvus::segcore;

View File

@ -21,6 +21,7 @@
// #include "utils/Json.h"
#include "test_utils/DataGen.h"
#include <random>
#include <optional>
using std::cin;
using std::cout;
using std::endl;