enhance: avoid one extra copy when retrieving data from a growing segment (#28323)

Signed-off-by: yah01 <yah2er0ne@outlook.com>
pull/28349/head
yah01 2023-11-10 15:44:22 +08:00 committed by GitHub
parent 70995383bf
commit 267c67dfee
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 84 additions and 45 deletions

View File

@ -23,6 +23,7 @@
#include "common/EasyAssert.h" #include "common/EasyAssert.h"
#include "common/Types.h" #include "common/Types.h"
#include "fmt/format.h" #include "fmt/format.h"
#include "log/Log.h"
#include "nlohmann/json.hpp" #include "nlohmann/json.hpp"
#include "query/PlanNode.h" #include "query/PlanNode.h"
#include "query/SearchOnSealed.h" #include "query/SearchOnSealed.h"
@ -347,42 +348,52 @@ SegmentGrowingImpl::bulk_subscript(FieldId field_id,
auto vec_ptr = insert_record_.get_field_data_base(field_id); auto vec_ptr = insert_record_.get_field_data_base(field_id);
auto& field_meta = schema_->operator[](field_id); auto& field_meta = schema_->operator[](field_id);
if (field_meta.is_vector()) { if (field_meta.is_vector()) {
aligned_vector<char> output(field_meta.get_sizeof() * count); auto result = CreateVectorDataArray(count, field_meta);
if (field_meta.get_data_type() == DataType::VECTOR_FLOAT) { if (field_meta.get_data_type() == DataType::VECTOR_FLOAT) {
bulk_subscript_impl<FloatVector>(field_id, bulk_subscript_impl<FloatVector>(field_id,
field_meta.get_sizeof(), field_meta.get_sizeof(),
vec_ptr, vec_ptr,
seg_offsets, seg_offsets,
count, count,
output.data()); result->mutable_vectors()
->mutable_float_vector()
->mutable_data()
->mutable_data());
} else if (field_meta.get_data_type() == DataType::VECTOR_BINARY) { } else if (field_meta.get_data_type() == DataType::VECTOR_BINARY) {
bulk_subscript_impl<BinaryVector>(field_id, bulk_subscript_impl<BinaryVector>(
field_id,
field_meta.get_sizeof(), field_meta.get_sizeof(),
vec_ptr, vec_ptr,
seg_offsets, seg_offsets,
count, count,
output.data()); result->mutable_vectors()->mutable_binary_vector()->data());
} else if (field_meta.get_data_type() == DataType::VECTOR_FLOAT16) { } else if (field_meta.get_data_type() == DataType::VECTOR_FLOAT16) {
bulk_subscript_impl<Float16Vector>(field_id, bulk_subscript_impl<Float16Vector>(
field_id,
field_meta.get_sizeof(), field_meta.get_sizeof(),
vec_ptr, vec_ptr,
seg_offsets, seg_offsets,
count, count,
output.data()); result->mutable_vectors()->mutable_float16_vector()->data());
} else { } else {
PanicInfo(DataTypeInvalid, "logical error"); PanicInfo(DataTypeInvalid, "logical error");
} }
return CreateVectorDataArrayFrom(output.data(), count, field_meta); return result;
} }
AssertInfo(!field_meta.is_vector(), AssertInfo(!field_meta.is_vector(),
"Scalar field meta type is vector type"); "Scalar field meta type is vector type");
switch (field_meta.get_data_type()) { switch (field_meta.get_data_type()) {
case DataType::BOOL: { case DataType::BOOL: {
FixedVector<bool> output(count); auto result = CreateScalarDataArray(count, field_meta);
bulk_subscript_impl<bool>( bulk_subscript_impl<bool>(vec_ptr,
vec_ptr, seg_offsets, count, output.data()); seg_offsets,
return CreateScalarDataArrayFrom(output.data(), count, field_meta); count,
result->mutable_scalars()
->mutable_bool_data()
->mutable_data()
->mutable_data());
return result;
} }
case DataType::INT8: { case DataType::INT8: {
FixedVector<int8_t> output(count); FixedVector<int8_t> output(count);
@ -397,28 +408,48 @@ SegmentGrowingImpl::bulk_subscript(FieldId field_id,
return CreateScalarDataArrayFrom(output.data(), count, field_meta); return CreateScalarDataArrayFrom(output.data(), count, field_meta);
} }
case DataType::INT32: { case DataType::INT32: {
FixedVector<int32_t> output(count); auto result = CreateScalarDataArray(count, field_meta);
bulk_subscript_impl<int32_t>( bulk_subscript_impl<int32_t>(vec_ptr,
vec_ptr, seg_offsets, count, output.data()); seg_offsets,
return CreateScalarDataArrayFrom(output.data(), count, field_meta); count,
result->mutable_scalars()
->mutable_int_data()
->mutable_data()
->mutable_data());
return result;
} }
case DataType::INT64: { case DataType::INT64: {
FixedVector<int64_t> output(count); auto result = CreateScalarDataArray(count, field_meta);
bulk_subscript_impl<int64_t>( bulk_subscript_impl<int64_t>(vec_ptr,
vec_ptr, seg_offsets, count, output.data()); seg_offsets,
return CreateScalarDataArrayFrom(output.data(), count, field_meta); count,
result->mutable_scalars()
->mutable_long_data()
->mutable_data()
->mutable_data());
return result;
} }
case DataType::FLOAT: { case DataType::FLOAT: {
FixedVector<float> output(count); auto result = CreateScalarDataArray(count, field_meta);
bulk_subscript_impl<float>( bulk_subscript_impl<float>(vec_ptr,
vec_ptr, seg_offsets, count, output.data()); seg_offsets,
return CreateScalarDataArrayFrom(output.data(), count, field_meta); count,
result->mutable_scalars()
->mutable_float_data()
->mutable_data()
->mutable_data());
return result;
} }
case DataType::DOUBLE: { case DataType::DOUBLE: {
FixedVector<double> output(count); auto result = CreateScalarDataArray(count, field_meta);
bulk_subscript_impl<double>( bulk_subscript_impl<double>(vec_ptr,
vec_ptr, seg_offsets, count, output.data()); seg_offsets,
return CreateScalarDataArrayFrom(output.data(), count, field_meta); count,
result->mutable_scalars()
->mutable_double_data()
->mutable_data()
->mutable_data());
return result;
} }
case DataType::VARCHAR: { case DataType::VARCHAR: {
FixedVector<std::string> output(count); FixedVector<std::string> output(count);

View File

@ -100,6 +100,7 @@ TEST(Growing, RealCount) {
TEST(Growing, FillData) { TEST(Growing, FillData) {
auto schema = std::make_shared<Schema>(); auto schema = std::make_shared<Schema>();
auto metric_type = knowhere::metric::L2; auto metric_type = knowhere::metric::L2;
auto bool_field = schema->AddDebugField("bool", DataType::BOOL);
auto int8_field = schema->AddDebugField("int8", DataType::INT8); auto int8_field = schema->AddDebugField("int8", DataType::INT8);
auto int16_field = schema->AddDebugField("int16", DataType::INT16); auto int16_field = schema->AddDebugField("int16", DataType::INT16);
auto int32_field = schema->AddDebugField("int32", DataType::INT32); auto int32_field = schema->AddDebugField("int32", DataType::INT32);
@ -145,6 +146,7 @@ TEST(Growing, FillData) {
int64_t dim = 128; int64_t dim = 128;
for (int64_t i = 0; i < n_batch; i++) { for (int64_t i = 0; i < n_batch; i++) {
auto dataset = DataGen(schema, per_batch); auto dataset = DataGen(schema, per_batch);
auto bool_values = dataset.get_col<bool>(bool_field);
auto int8_values = dataset.get_col<int8_t>(int8_field); auto int8_values = dataset.get_col<int8_t>(int8_field);
auto int16_values = dataset.get_col<int16_t>(int16_field); auto int16_values = dataset.get_col<int16_t>(int16_field);
auto int32_values = dataset.get_col<int32_t>(int32_field); auto int32_values = dataset.get_col<int32_t>(int32_field);
@ -172,6 +174,8 @@ TEST(Growing, FillData) {
dataset.raw_); dataset.raw_);
auto num_inserted = (i + 1) * per_batch; auto num_inserted = (i + 1) * per_batch;
auto ids_ds = GenRandomIds(num_inserted); auto ids_ds = GenRandomIds(num_inserted);
auto bool_result =
segment->bulk_subscript(bool_field, ids_ds->GetIds(), num_inserted);
auto int8_result = auto int8_result =
segment->bulk_subscript(int8_field, ids_ds->GetIds(), num_inserted); segment->bulk_subscript(int8_field, ids_ds->GetIds(), num_inserted);
auto int16_result = segment->bulk_subscript( auto int16_result = segment->bulk_subscript(
@ -203,6 +207,7 @@ TEST(Growing, FillData) {
auto vec_result = auto vec_result =
segment->bulk_subscript(vec, ids_ds->GetIds(), num_inserted); segment->bulk_subscript(vec, ids_ds->GetIds(), num_inserted);
EXPECT_EQ(bool_result->scalars().bool_data().data_size(), num_inserted);
EXPECT_EQ(int8_result->scalars().int_data().data_size(), num_inserted); EXPECT_EQ(int8_result->scalars().int_data().data_size(), num_inserted);
EXPECT_EQ(int16_result->scalars().int_data().data_size(), num_inserted); EXPECT_EQ(int16_result->scalars().int_data().data_size(), num_inserted);
EXPECT_EQ(int32_result->scalars().int_data().data_size(), num_inserted); EXPECT_EQ(int32_result->scalars().int_data().data_size(), num_inserted);

View File

@ -14,6 +14,7 @@
#include <map> #include <map>
#include <tuple> #include <tuple>
#include "common/Types.h"
#include "indexbuilder/IndexFactory.h" #include "indexbuilder/IndexFactory.h"
#include "indexbuilder/VecIndexCreator.h" #include "indexbuilder/VecIndexCreator.h"
#include "common/QueryResult.h" #include "common/QueryResult.h"
@ -103,8 +104,8 @@ class IndexWrapperTest : public ::testing::TestWithParam<Param> {
bool is_binary; bool is_binary;
DataType vec_field_data_type; DataType vec_field_data_type;
knowhere::DataSetPtr xb_dataset; knowhere::DataSetPtr xb_dataset;
std::vector<float> xb_data; FixedVector<float> xb_data;
std::vector<uint8_t> xb_bin_data; FixedVector<uint8_t> xb_bin_data;
knowhere::DataSetPtr xq_dataset; knowhere::DataSetPtr xq_dataset;
int64_t query_offset = 100; int64_t query_offset = 100;
int64_t NB = 10000; int64_t NB = 10000;
@ -141,8 +142,8 @@ TEST_P(IndexWrapperTest, BuildAndQuery) {
auto dataset = GenDataset(NB, metric_type, is_binary); auto dataset = GenDataset(NB, metric_type, is_binary);
knowhere::DataSetPtr xb_dataset; knowhere::DataSetPtr xb_dataset;
std::vector<uint8_t> bin_vecs; FixedVector<uint8_t> bin_vecs;
std::vector<float> f_vecs; FixedVector<float> f_vecs;
if (is_binary) { if (is_binary) {
bin_vecs = dataset.get_col<uint8_t>(milvus::FieldId(100)); bin_vecs = dataset.get_col<uint8_t>(milvus::FieldId(100));
xb_dataset = knowhere::GenDataSet(NB, DIM, bin_vecs.data()); xb_dataset = knowhere::GenDataSet(NB, DIM, bin_vecs.data());
@ -153,7 +154,7 @@ TEST_P(IndexWrapperTest, BuildAndQuery) {
ASSERT_NO_THROW(index->Build(xb_dataset)); ASSERT_NO_THROW(index->Build(xb_dataset));
auto binary_set = index->Serialize(); auto binary_set = index->Serialize();
std::vector<std::string> index_files; FixedVector<std::string> index_files;
for (auto& binary : binary_set.binary_map_) { for (auto& binary : binary_set.binary_map_) {
index_files.emplace_back(binary.first); index_files.emplace_back(binary.first);
} }

View File

@ -17,6 +17,7 @@
#include <vector> #include <vector>
#include "common/EasyAssert.h" #include "common/EasyAssert.h"
#include "common/Types.h"
#include "knowhere/comp/index_param.h" #include "knowhere/comp/index_param.h"
#include "nlohmann/json.hpp" #include "nlohmann/json.hpp"
#include "query/SearchBruteForce.h" #include "query/SearchBruteForce.h"
@ -356,8 +357,8 @@ class IndexTest : public ::testing::TestWithParam<Param> {
milvus::Config range_search_conf; milvus::Config range_search_conf;
milvus::DataType vec_field_data_type; milvus::DataType vec_field_data_type;
knowhere::DataSetPtr xb_dataset; knowhere::DataSetPtr xb_dataset;
std::vector<float> xb_data; FixedVector<float> xb_data;
std::vector<uint8_t> xb_bin_data; FixedVector<uint8_t> xb_bin_data;
knowhere::DataSetPtr xq_dataset; knowhere::DataSetPtr xq_dataset;
int64_t query_offset = 100; int64_t query_offset = 100;
int64_t NB = 3000; int64_t NB = 3000;
@ -610,7 +611,7 @@ TEST(Indexing, SearchDiskAnnWithInvalidParam) {
// build disk ann index // build disk ann index
auto dataset = GenDataset(NB, metric_type, false); auto dataset = GenDataset(NB, metric_type, false);
std::vector<float> xb_data = FixedVector<float> xb_data =
dataset.get_col<float>(milvus::FieldId(field_id)); dataset.get_col<float>(milvus::FieldId(field_id));
knowhere::DataSetPtr xb_dataset = knowhere::DataSetPtr xb_dataset =
knowhere::GenDataSet(NB, DIM, xb_data.data()); knowhere::GenDataSet(NB, DIM, xb_data.data());

View File

@ -21,6 +21,7 @@
#include "Constants.h" #include "Constants.h"
#include "common/EasyAssert.h" #include "common/EasyAssert.h"
#include "common/Schema.h" #include "common/Schema.h"
#include "common/Types.h"
#include "index/ScalarIndexSort.h" #include "index/ScalarIndexSort.h"
#include "index/StringIndexSort.h" #include "index/StringIndexSort.h"
#include "index/VectorMemIndex.h" #include "index/VectorMemIndex.h"
@ -81,9 +82,9 @@ struct GeneratedData {
} }
template <typename T> template <typename T>
std::vector<T> FixedVector<T>
get_col(FieldId field_id) const { get_col(FieldId field_id) const {
std::vector<T> ret(raw_->num_rows()); FixedVector<T> ret(raw_->num_rows());
for (auto i = 0; i < raw_->fields_data_size(); i++) { for (auto i = 0; i < raw_->fields_data_size(); i++) {
auto target_field_data = raw_->fields_data(i); auto target_field_data = raw_->fields_data(i);
if (field_id.get() != target_field_data.field_id()) { if (field_id.get() != target_field_data.field_id()) {