mirror of https://github.com/milvus-io/milvus.git
enhance: support null in c data_datacodec and load null value (#32183)
1. Support reading and writing null in segcore: valid_data is stored in FieldData (using uint8_t to save memory).
2. Support loading null binlogs: the reader reads and writes data into the column (sealed segment) or into insertRecord (growing segment). The sealed segment stores valid_data directly; the growing segment, given the prior implementation and for easier code reading, converts uint8_t to fbvector<bool>, which may be optimized in the future.
3. Retrieve valid_data: parse valid_data in search/query. #31728

Signed-off-by: lixinguo <xinguo.li@zilliz.com>
Co-authored-by: lixinguo <xinguo.li@zilliz.com>
pull/34924/head
parent 92de49e38c
commit 5616b7e8d2
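In short, each nullable field now carries two parallel buffers: the row data itself and a validity bitmap packed eight rows per byte. A minimal, self-contained sketch of that layout (illustrative names only, not the segcore API):

#include <cstdint>
#include <cstdio>
#include <vector>

// Sketch of the nullable-field layout this commit introduces: row values in
// one buffer, validity in a separate uint8_t bitmap (8 rows per byte).
struct NullableField {
    std::vector<int64_t> data;        // row values; null rows hold a placeholder
    std::vector<uint8_t> valid_data;  // 1 bit per row, LSB-first within each byte

    void Append(int64_t v, bool valid) {
        size_t row = data.size();
        data.push_back(valid ? v : 0);
        if (valid_data.size() * 8 <= row) {
            valid_data.push_back(0);  // grow the bitmap one byte at a time
        }
        if (valid) {
            valid_data[row >> 3] |= uint8_t(1) << (row & 0x07);
        }
    }

    bool IsValid(size_t row) const {
        return (valid_data[row >> 3] >> (row & 0x07)) & 1;
    }
};

int main() {
    NullableField f;
    f.Append(42, true);
    f.Append(0, false);  // null row
    f.Append(7, true);
    for (size_t i = 0; i < f.data.size(); ++i) {
        std::printf("row %zu: %s\n", i, f.IsValid(i) ? "valid" : "null");
    }
}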
@@ -871,4 +871,4 @@ trace:
#maxMemSize will be the whole available GPU memory.
gpu:
initMemSize: # Gpu Memory Pool init size
maxMemSize: # Gpu Memory Pool Max size
maxMemSize: # Gpu Memory Pool Max size
@@ -30,6 +30,9 @@ template <typename Type, bool is_type_entire_row>
void
FieldDataImpl<Type, is_type_entire_row>::FillFieldData(const void* source,
ssize_t element_count) {
AssertInfo(!nullable_,
"need to fill valid_data, use the 3-argument version instead");

if (element_count == 0) {
return;
}

@@ -40,7 +43,38 @@ FieldDataImpl<Type, is_type_entire_row>::FillFieldData(const void* source,
}
std::copy_n(static_cast<const Type*>(source),
element_count * dim_,
field_data_.data() + length_ * dim_);
data_.data() + length_ * dim_);
length_ += element_count;
}

template <typename Type, bool is_type_entire_row>
void
FieldDataImpl<Type, is_type_entire_row>::FillFieldData(
const void* field_data, const uint8_t* valid_data, ssize_t element_count) {
AssertInfo(
nullable_,
"no need to fill valid_data, use the 2-argument version instead");
if (element_count == 0) {
return;
}

std::lock_guard lck(tell_mutex_);
if (length_ + element_count > get_num_rows()) {
resize_field_data(length_ + element_count);
}
std::copy_n(static_cast<const Type*>(field_data),
element_count * dim_,
data_.data() + length_ * dim_);

ssize_t byte_count = (element_count + 7) / 8;
// Note: if nullable == true and valid_data is nullptr,
// it means null_count == 0, so fill the bitmap with 0xFF
if (valid_data == nullptr) {
valid_data_.resize(byte_count, 0xFF);
} else {
std::copy_n(valid_data, byte_count, valid_data_.data());
}

length_ += element_count;
}
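The two lines above are the core of the bitmap arithmetic: (element_count + 7) / 8 rounds the row count up to whole bytes, and a 0xFF byte marks eight valid rows at once when the caller supplies no bitmap. A small standalone check of both facts (plain C++, not the segcore types):

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
    // 10 rows need ceil(10 / 8) = 2 bitmap bytes.
    int64_t element_count = 10;
    int64_t byte_count = (element_count + 7) / 8;
    assert(byte_count == 2);

    // With no caller-supplied bitmap, every row is marked valid (0xFF).
    std::vector<uint8_t> valid_data(byte_count, 0xFF);
    for (int64_t row = 0; row < element_count; ++row) {
        assert(((valid_data[row >> 3] >> (row & 0x07)) & 1) == 1);
    }
}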
@@ -66,6 +100,7 @@ FieldDataImpl<Type, is_type_entire_row>::FillFieldData(
if (element_count == 0) {
return;
}
null_count = array->null_count();
switch (data_type_) {
case DataType::BOOL: {
AssertInfo(array->type()->id() == arrow::Type::type::BOOL,

@@ -76,42 +111,71 @@ FieldDataImpl<Type, is_type_entire_row>::FillFieldData(
for (size_t index = 0; index < element_count; ++index) {
values[index] = bool_array->Value(index);
}
if (nullable_) {
return FillFieldData(values.data(),
bool_array->null_bitmap_data(),
element_count);
}
return FillFieldData(values.data(), element_count);
}
case DataType::INT8: {
auto array_info =
GetDataInfoFromArray<arrow::Int8Array, arrow::Type::type::INT8>(
array);
if (nullable_) {
return FillFieldData(
array_info.first, array->null_bitmap_data(), element_count);
}
return FillFieldData(array_info.first, array_info.second);
}
case DataType::INT16: {
auto array_info =
GetDataInfoFromArray<arrow::Int16Array,
arrow::Type::type::INT16>(array);
if (nullable_) {
return FillFieldData(
array_info.first, array->null_bitmap_data(), element_count);
}
return FillFieldData(array_info.first, array_info.second);
}
case DataType::INT32: {
auto array_info =
GetDataInfoFromArray<arrow::Int32Array,
arrow::Type::type::INT32>(array);
if (nullable_) {
return FillFieldData(
array_info.first, array->null_bitmap_data(), element_count);
}
return FillFieldData(array_info.first, array_info.second);
}
case DataType::INT64: {
auto array_info =
GetDataInfoFromArray<arrow::Int64Array,
arrow::Type::type::INT64>(array);
if (nullable_) {
return FillFieldData(
array_info.first, array->null_bitmap_data(), element_count);
}
return FillFieldData(array_info.first, array_info.second);
}
case DataType::FLOAT: {
auto array_info =
GetDataInfoFromArray<arrow::FloatArray,
arrow::Type::type::FLOAT>(array);
if (nullable_) {
return FillFieldData(
array_info.first, array->null_bitmap_data(), element_count);
}
return FillFieldData(array_info.first, array_info.second);
}
case DataType::DOUBLE: {
auto array_info =
GetDataInfoFromArray<arrow::DoubleArray,
arrow::Type::type::DOUBLE>(array);
if (nullable_) {
return FillFieldData(
array_info.first, array->null_bitmap_data(), element_count);
}
return FillFieldData(array_info.first, array_info.second);
}
case DataType::STRING:

@@ -124,6 +188,10 @@ FieldDataImpl<Type, is_type_entire_row>::FillFieldData(
for (size_t index = 0; index < element_count; ++index) {
values[index] = string_array->GetString(index);
}
if (nullable_) {
return FillFieldData(
values.data(), array->null_bitmap_data(), element_count);
}
return FillFieldData(values.data(), element_count);
}
case DataType::JSON: {

@@ -136,17 +204,33 @@ FieldDataImpl<Type, is_type_entire_row>::FillFieldData(
values[index] =
Json(simdjson::padded_string(json_array->GetString(index)));
}
if (nullable_) {
return FillFieldData(
values.data(), array->null_bitmap_data(), element_count);
}
return FillFieldData(values.data(), element_count);
}
case DataType::ARRAY: {
auto array_array =
std::dynamic_pointer_cast<arrow::BinaryArray>(array);
std::vector<Array> values(element_count);
int null_number = 0;
for (size_t index = 0; index < element_count; ++index) {
ScalarArray field_data;
field_data.ParseFromString(array_array->GetString(index));
if (array_array->GetString(index) == "") {
null_number++;
continue;
}
auto success =
field_data.ParseFromString(array_array->GetString(index));
AssertInfo(success, "parse from string failed");
values[index] = Array(field_data);
}
if (nullable_) {
return FillFieldData(
values.data(), array->null_bitmap_data(), element_count);
}
AssertInfo(null_number == 0, "get empty string when not nullable");
return FillFieldData(values.data(), element_count);
}
case DataType::VECTOR_FLOAT:

@@ -201,27 +285,33 @@ template class FieldDataImpl<bfloat16, false>;
template class FieldDataImpl<knowhere::sparse::SparseRow<float>, true>;

FieldDataPtr
InitScalarFieldData(const DataType& type, int64_t cap_rows) {
InitScalarFieldData(const DataType& type, bool nullable, int64_t cap_rows) {
switch (type) {
case DataType::BOOL:
return std::make_shared<FieldData<bool>>(type, cap_rows);
return std::make_shared<FieldData<bool>>(type, nullable, cap_rows);
case DataType::INT8:
return std::make_shared<FieldData<int8_t>>(type, cap_rows);
return std::make_shared<FieldData<int8_t>>(
type, nullable, cap_rows);
case DataType::INT16:
return std::make_shared<FieldData<int16_t>>(type, cap_rows);
return std::make_shared<FieldData<int16_t>>(
type, nullable, cap_rows);
case DataType::INT32:
return std::make_shared<FieldData<int32_t>>(type, cap_rows);
return std::make_shared<FieldData<int32_t>>(
type, nullable, cap_rows);
case DataType::INT64:
return std::make_shared<FieldData<int64_t>>(type, cap_rows);
return std::make_shared<FieldData<int64_t>>(
type, nullable, cap_rows);
case DataType::FLOAT:
return std::make_shared<FieldData<float>>(type, cap_rows);
return std::make_shared<FieldData<float>>(type, nullable, cap_rows);
case DataType::DOUBLE:
return std::make_shared<FieldData<double>>(type, cap_rows);
return std::make_shared<FieldData<double>>(
type, nullable, cap_rows);
case DataType::STRING:
case DataType::VARCHAR:
return std::make_shared<FieldData<std::string>>(type, cap_rows);
return std::make_shared<FieldData<std::string>>(
type, nullable, cap_rows);
case DataType::JSON:
return std::make_shared<FieldData<Json>>(type, cap_rows);
return std::make_shared<FieldData<Json>>(type, nullable, cap_rows);
default:
PanicInfo(DataTypeInvalid,
"InitScalarFieldData not support data type " +
@@ -30,14 +30,18 @@ template <typename Type>
class FieldData : public FieldDataImpl<Type, true> {
public:
static_assert(IsScalar<Type> || std::is_same_v<Type, PkType>);
explicit FieldData(DataType data_type, int64_t buffered_num_rows = 0)
explicit FieldData(DataType data_type,
bool nullable,
int64_t buffered_num_rows = 0)
: FieldDataImpl<Type, true>::FieldDataImpl(
1, data_type, buffered_num_rows) {
1, data_type, nullable, buffered_num_rows) {
}
static_assert(IsScalar<Type> || std::is_same_v<Type, PkType>);
explicit FieldData(DataType data_type, FixedVector<Type>&& inner_data)
explicit FieldData(DataType data_type,
bool nullable,
FixedVector<Type>&& inner_data)
: FieldDataImpl<Type, true>::FieldDataImpl(
1, data_type, std::move(inner_data)) {
1, data_type, nullable, std::move(inner_data)) {
}
};

@@ -45,8 +49,10 @@ template <>
class FieldData<std::string> : public FieldDataStringImpl {
public:
static_assert(IsScalar<std::string> || std::is_same_v<std::string, PkType>);
explicit FieldData(DataType data_type, int64_t buffered_num_rows = 0)
: FieldDataStringImpl(data_type, buffered_num_rows) {
explicit FieldData(DataType data_type,
bool nullable,
int64_t buffered_num_rows = 0)
: FieldDataStringImpl(data_type, nullable, buffered_num_rows) {
}
};

@@ -54,8 +60,10 @@ template <>
class FieldData<Json> : public FieldDataJsonImpl {
public:
static_assert(IsScalar<std::string> || std::is_same_v<std::string, PkType>);
explicit FieldData(DataType data_type, int64_t buffered_num_rows = 0)
: FieldDataJsonImpl(data_type, buffered_num_rows) {
explicit FieldData(DataType data_type,
bool nullable,
int64_t buffered_num_rows = 0)
: FieldDataJsonImpl(data_type, nullable, buffered_num_rows) {
}
};

@@ -63,8 +71,10 @@ template <>
class FieldData<Array> : public FieldDataArrayImpl {
public:
static_assert(IsScalar<Array> || std::is_same_v<std::string, PkType>);
explicit FieldData(DataType data_type, int64_t buffered_num_rows = 0)
: FieldDataArrayImpl(data_type, buffered_num_rows) {
explicit FieldData(DataType data_type,
bool nullable,
int64_t buffered_num_rows = 0)
: FieldDataArrayImpl(data_type, nullable, buffered_num_rows) {
}
};

@@ -75,7 +85,7 @@ class FieldData<FloatVector> : public FieldDataImpl<float, false> {
DataType data_type,
int64_t buffered_num_rows = 0)
: FieldDataImpl<float, false>::FieldDataImpl(
dim, data_type, buffered_num_rows) {
dim, data_type, false, buffered_num_rows) {
}
};

@@ -86,7 +96,7 @@ class FieldData<BinaryVector> : public FieldDataImpl<uint8_t, false> {
DataType data_type,
int64_t buffered_num_rows = 0)
: binary_dim_(dim),
FieldDataImpl(dim / 8, data_type, buffered_num_rows) {
FieldDataImpl(dim / 8, data_type, false, buffered_num_rows) {
Assert(dim % 8 == 0);
}

@@ -106,7 +116,7 @@ class FieldData<Float16Vector> : public FieldDataImpl<float16, false> {
DataType data_type,
int64_t buffered_num_rows = 0)
: FieldDataImpl<float16, false>::FieldDataImpl(
dim, data_type, buffered_num_rows) {
dim, data_type, false, buffered_num_rows) {
}
};

@@ -117,7 +127,7 @@ class FieldData<BFloat16Vector> : public FieldDataImpl<bfloat16, false> {
DataType data_type,
int64_t buffered_num_rows = 0)
: FieldDataImpl<bfloat16, false>::FieldDataImpl(
dim, data_type, buffered_num_rows) {
dim, data_type, false, buffered_num_rows) {
}
};

@@ -134,6 +144,6 @@ using FieldDataChannel = Channel<FieldDataPtr>;
using FieldDataChannelPtr = std::shared_ptr<FieldDataChannel>;

FieldDataPtr
InitScalarFieldData(const DataType& type, int64_t cap_rows);
InitScalarFieldData(const DataType& type, bool nullable, int64_t cap_rows);

} // namespace milvus

@@ -25,6 +25,7 @@
#include <mutex>
#include <shared_mutex>

#include "Types.h"
#include "arrow/api.h"
#include "arrow/array/array_binary.h"
#include "common/FieldMeta.h"

@@ -40,7 +41,8 @@ using DataType = milvus::DataType;

class FieldDataBase {
public:
explicit FieldDataBase(DataType data_type) : data_type_(data_type) {
explicit FieldDataBase(DataType data_type, bool nullable)
: data_type_(data_type), nullable_(nullable) {
}
virtual ~FieldDataBase() = default;

@@ -49,6 +51,11 @@ class FieldDataBase {
virtual void
FillFieldData(const void* source, ssize_t element_count) = 0;

virtual void
FillFieldData(const void* field_data,
const uint8_t* valid_data,
ssize_t element_count) = 0;

virtual void
FillFieldData(const std::shared_ptr<arrow::Array> array) = 0;

@@ -57,6 +64,9 @@ class FieldDataBase {
virtual void*
Data() = 0;

virtual uint8_t*
ValidData() = 0;

// For all FieldDataImpl subclasses, this method returns a Type* that points
// at the offset-th row of this field data.
virtual const void*

@@ -66,9 +76,15 @@ class FieldDataBase {
virtual int64_t
Size() const = 0;

virtual int64_t
DataSize() const = 0;

virtual int64_t
ValidDataSize() const = 0;

// Returns the serialized bytes size of the index-th row.
virtual int64_t
Size(ssize_t index) const = 0;
DataSize(ssize_t index) const = 0;

// Number of filled rows
virtual size_t

@@ -77,6 +93,9 @@ class FieldDataBase {
virtual bool
IsFull() const = 0;

virtual bool
IsNullable() const = 0;

virtual void
Reserve(size_t cap) = 0;

@@ -94,8 +113,15 @@ class FieldDataBase {
return data_type_;
}

virtual int64_t
get_null_count() const = 0;

virtual bool
is_valid(ssize_t offset) const = 0;

protected:
const DataType data_type_;
const bool nullable_;
};

template <typename Type, bool is_type_entire_row = false>

@@ -112,25 +138,53 @@ class FieldDataImpl : public FieldDataBase {
public:
explicit FieldDataImpl(ssize_t dim,
DataType data_type,
bool nullable,
int64_t buffered_num_rows = 0)
: FieldDataBase(data_type),
: FieldDataBase(data_type, nullable),
num_rows_(buffered_num_rows),
dim_(is_type_entire_row ? 1 : dim) {
field_data_.resize(num_rows_ * dim_);
data_.resize(num_rows_ * dim_);
if (nullable) {
if (IsVectorDataType(data_type)) {
PanicInfo(NotImplemented, "vector type not support null");
}
valid_data_.resize((num_rows_ + 7) / 8);
}
}

explicit FieldDataImpl(size_t dim,
DataType type,
FixedVector<Type>&& field_data)
: FieldDataBase(type), dim_(is_type_entire_row ? 1 : dim) {
field_data_ = std::move(field_data);
Assert(field_data.size() % dim == 0);
num_rows_ = field_data.size() / dim;
bool nullable,
FixedVector<Type>&& data)
: FieldDataBase(type, nullable), dim_(is_type_entire_row ? 1 : dim) {
AssertInfo(!nullable, "need to fill valid_data when nullable is true");
data_ = std::move(data);
Assert(data.size() % dim == 0);
num_rows_ = data.size() / dim;
}

explicit FieldDataImpl(size_t dim,
DataType type,
bool nullable,
FixedVector<Type>&& data,
FixedVector<uint8_t>&& valid_data)
: FieldDataBase(type, nullable), dim_(is_type_entire_row ? 1 : dim) {
AssertInfo(nullable,
"no need to fill valid_data when nullable is false");
data_ = std::move(data);
valid_data_ = std::move(valid_data);
Assert(data.size() % dim == 0);
num_rows_ = data.size() / dim;
}

void
FillFieldData(const void* source, ssize_t element_count) override;

void
FillFieldData(const void* field_data,
const uint8_t* valid_data,
ssize_t element_count) override;

void
FillFieldData(const std::shared_ptr<arrow::Array> array) override;
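The three constructors above divide the work cleanly: capacity-only (optionally reserving bitmap bytes), data-only (legal only when not nullable), and data plus bitmap (legal only when nullable). A hedged sketch of choosing between them, with an illustrative bitmap value (repo types such as FixedVector assumed to be in scope):

void ExampleConstruct() {
    using milvus::DataType;
    // Buffered, nullable: reserves (num_rows + 7) / 8 bitmap bytes up front.
    milvus::FieldDataImpl<int64_t, true> buffered(
        /*dim=*/1, DataType::INT64, /*nullable=*/true, /*buffered_num_rows=*/100);

    // Pre-filled, not nullable: moving data in without a bitmap is only
    // legal when nullable == false (the ctor asserts this).
    FixedVector<int64_t> values = {1, 2, 3};
    milvus::FieldDataImpl<int64_t, true> plain(
        /*dim=*/1, DataType::INT64, /*nullable=*/false, std::move(values));

    // Pre-filled, nullable: the caller supplies the packed bitmap as well;
    // 0x05 == 0b00000101 marks rows 0 and 2 valid and row 1 null.
    FixedVector<int64_t> data = {1, 0, 3};
    FixedVector<uint8_t> valid = {0x05};
    milvus::FieldDataImpl<int64_t, true> with_nulls(
        /*dim=*/1, DataType::INT64, /*nullable=*/true,
        std::move(data), std::move(valid));
}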
@@ -155,7 +209,12 @@ class FieldDataImpl : public FieldDataBase {

void*
Data() override {
return field_data_.data();
return data_.data();
}

uint8_t*
ValidData() override {
return valid_data_.data();
}

const void*

@@ -164,16 +223,36 @@ class FieldDataImpl : public FieldDataBase {
"field data subscript out of range");
AssertInfo(offset < length(),
"subscript position don't has valid value");
return &field_data_[offset];
return &data_[offset];
}

// std::optional<const void*>
// Value(ssize_t offset) {
// if (!is_type_entire_row) {
// return RawValue(offset);
// }
// AssertInfo(offset < get_num_rows(),
// "field data subscript out of range");
// AssertInfo(offset < length(),
// "subscript position don't has valid value");
// if (nullable_ && !valid_data_[offset]) {
// return std::nullopt;
// }
// return &field_data_[offset];
// }

int64_t
Size() const override {
return DataSize() + ValidDataSize();
}

int64_t
DataSize() const override {
return sizeof(Type) * length() * dim_;
}

int64_t
Size(ssize_t offset) const override {
DataSize(ssize_t offset) const override {
AssertInfo(offset < get_num_rows(),
"field data subscript out of range");
AssertInfo(offset < length(),

@@ -181,6 +260,14 @@ class FieldDataImpl : public FieldDataBase {
return sizeof(Type) * dim_;
}

int64_t
ValidDataSize() const override {
if (nullable_) {
return sizeof(uint8_t) * (length() + 7) / 8;
}
return 0;
}

size_t
Length() const override {
return length_;
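Taken together, Size() now decomposes into plain arithmetic: a nullable INT64 field (dim 1) holding 10 rows reports DataSize() = 8 * 10 = 80 bytes plus ValidDataSize() = (10 + 7) / 8 = 2 bitmap bytes, so Size() = 82; declared non-nullable, the same field reports exactly 80.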
@@ -193,12 +280,20 @@ class FieldDataImpl : public FieldDataBase {
return buffered_num_rows == filled_num_rows;
}

bool
IsNullable() const override {
return nullable_;
}

void
Reserve(size_t cap) override {
std::lock_guard lck(num_rows_mutex_);
if (cap > num_rows_) {
num_rows_ = cap;
field_data_.resize(num_rows_ * dim_);
data_.resize(num_rows_ * dim_);
}
if (nullable_) {
valid_data_.resize((num_rows_ + 7) / 8);
}
}

@@ -214,7 +309,10 @@ class FieldDataImpl : public FieldDataBase {
std::lock_guard lck(num_rows_mutex_);
if (num_rows > num_rows_) {
num_rows_ = num_rows;
field_data_.resize(num_rows_ * dim_);
data_.resize(num_rows_ * dim_);
if (nullable_) {
valid_data_.resize((num_rows + 7) / 8);
}
}
}

@@ -229,12 +327,34 @@ class FieldDataImpl : public FieldDataBase {
return dim_;
}

int64_t
get_null_count() const override {
std::shared_lock lck(tell_mutex_);
return null_count;
}

bool
is_valid(ssize_t offset) const override {
std::shared_lock lck(tell_mutex_);
AssertInfo(offset < get_num_rows(),
"field data subscript out of range");
AssertInfo(offset < length(),
"subscript position don't has valid value");
if (!nullable_) {
return true;
}
auto bit = (valid_data_[offset >> 3] >> ((offset & 0x07))) & 1;
return bit;
}

protected:
FixedVector<Type> field_data_;
// number of elements field_data_ can hold
FixedVector<Type> data_{};
FixedVector<uint8_t> valid_data_{};
// number of elements data_ can hold
int64_t num_rows_;
mutable std::shared_mutex num_rows_mutex_;
// number of actual elements in field_data_
int64_t null_count{0};
// number of actual elements in data_
size_t length_{};
mutable std::shared_mutex tell_mutex_;
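The bit extraction in is_valid addresses the bitmap LSB-first: offset >> 3 picks the byte and offset & 0x07 the bit inside it. A standalone round-trip check of that convention (plain C++, not the segcore types):

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
    std::vector<uint8_t> valid_data(2, 0);

    // Mark rows 0, 3, and 9 valid, LSB-first within each byte.
    for (int offset : {0, 3, 9}) {
        valid_data[offset >> 3] |= uint8_t(1) << (offset & 0x07);
    }
    assert(valid_data[0] == 0b00001001);  // bits 0 and 3 of byte 0
    assert(valid_data[1] == 0b00000010);  // bit 1 of byte 1 is row 9

    // Same extraction as is_valid() above.
    auto bit = (valid_data[9 >> 3] >> (9 & 0x07)) & 1;
    assert(bit == 1);
}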
@@ -244,27 +364,30 @@ class FieldDataImpl : public FieldDataBase {

class FieldDataStringImpl : public FieldDataImpl<std::string, true> {
public:
explicit FieldDataStringImpl(DataType data_type, int64_t total_num_rows = 0)
: FieldDataImpl<std::string, true>(1, data_type, total_num_rows) {
explicit FieldDataStringImpl(DataType data_type,
bool nullable,
int64_t total_num_rows = 0)
: FieldDataImpl<std::string, true>(
1, data_type, nullable, total_num_rows) {
}

int64_t
Size() const override {
DataSize() const override {
int64_t data_size = 0;
for (size_t offset = 0; offset < length(); ++offset) {
data_size += field_data_[offset].size();
data_size += data_[offset].size();
}

return data_size;
}

int64_t
Size(ssize_t offset) const override {
DataSize(ssize_t offset) const override {
AssertInfo(offset < get_num_rows(),
"field data subscript out of range");
AssertInfo(offset < length(),
"subscript position don't has valid value");
return field_data_[offset].size();
return data_[offset].size();
}

void

@@ -281,36 +404,46 @@ class FieldDataStringImpl : public FieldDataImpl<std::string, true> {

auto i = 0;
for (const auto& str : *array) {
field_data_[length_ + i] = str.value();
data_[length_ + i] = str.value();
i++;
}
if (IsNullable()) {
auto valid_data = array->null_bitmap_data();
if (valid_data == nullptr) {
valid_data_.resize((n + 7) / 8, 0xFF);
} else {
std::copy_n(valid_data, (n + 7) / 8, valid_data_.data());
}
}
length_ += n;
}
};

class FieldDataJsonImpl : public FieldDataImpl<Json, true> {
public:
explicit FieldDataJsonImpl(DataType data_type, int64_t total_num_rows = 0)
: FieldDataImpl<Json, true>(1, data_type, total_num_rows) {
explicit FieldDataJsonImpl(DataType data_type,
bool nullable,
int64_t total_num_rows = 0)
: FieldDataImpl<Json, true>(1, data_type, nullable, total_num_rows) {
}

int64_t
Size() const override {
DataSize() const override {
int64_t data_size = 0;
for (size_t offset = 0; offset < length(); ++offset) {
data_size += field_data_[offset].data().size();
data_size += data_[offset].data().size();
}

return data_size;
}

int64_t
Size(ssize_t offset) const override {
DataSize(ssize_t offset) const override {
AssertInfo(offset < get_num_rows(),
"field data subscript out of range");
AssertInfo(offset < length(),
"subscript position don't has valid value");
return field_data_[offset].data().size();
return data_[offset].data().size();
}

void

@@ -337,10 +470,17 @@ class FieldDataJsonImpl : public FieldDataImpl<Json, true> {

auto i = 0;
for (const auto& json : *array) {
field_data_[length_ + i] =
Json(simdjson::padded_string(json.value()));
data_[length_ + i] = Json(simdjson::padded_string(json.value()));
i++;
}
if (IsNullable()) {
auto valid_data = array->null_bitmap_data();
if (valid_data == nullptr) {
valid_data_.resize((n + 7) / 8, 0xFF);
} else {
std::copy_n(valid_data, (n + 7) / 8, valid_data_.data());
}
}
length_ += n;
}
};
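Both overrides above rely on an Arrow detail worth spelling out: arrow::Array::null_bitmap_data() returns nullptr when the array carries no validity buffer (typically when null_count == 0), which is why the nullptr branch fills valid_data_ with 0xFF rather than copying. A hedged sketch of the same pattern against a plain Arrow array (ignoring non-zero array offsets):

#include <arrow/api.h>

#include <algorithm>
#include <cstdint>
#include <vector>

// Sketch: derive a packed validity buffer from an Arrow array, defaulting
// to all-valid when Arrow omits the bitmap entirely.
std::vector<uint8_t>
ExtractValidity(const arrow::Array& array) {
    const int64_t n = array.length();
    const int64_t byte_count = (n + 7) / 8;
    std::vector<uint8_t> valid(byte_count);
    if (const uint8_t* bitmap = array.null_bitmap_data()) {
        std::copy_n(bitmap, byte_count, valid.data());
    } else {
        std::fill(valid.begin(), valid.end(), 0xFF);  // no nulls recorded
    }
    return valid;
}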
@@ -351,28 +491,28 @@ class FieldDataSparseVectorImpl
explicit FieldDataSparseVectorImpl(DataType data_type,
int64_t total_num_rows = 0)
: FieldDataImpl<knowhere::sparse::SparseRow<float>, true>(
/*dim=*/1, data_type, total_num_rows),
/*dim=*/1, data_type, false, total_num_rows),
vec_dim_(0) {
AssertInfo(data_type == DataType::VECTOR_SPARSE_FLOAT,
"invalid data type for sparse vector");
}

int64_t
Size() const override {
DataSize() const override {
int64_t data_size = 0;
for (size_t i = 0; i < length(); ++i) {
data_size += field_data_[i].data_byte_size();
data_size += data_[i].data_byte_size();
}
return data_size;
}

int64_t
Size(ssize_t offset) const override {
DataSize(ssize_t offset) const override {
AssertInfo(offset < get_num_rows(),
"field data subscript out of range");
AssertInfo(offset < length(),
"subscript position don't has valid value");
return field_data_[offset].data_byte_size();
return data_[offset].data_byte_size();
}

// source is a pointer to element_count of

@@ -393,7 +533,7 @@ class FieldDataSparseVectorImpl
auto& row = ptr[i];
vec_dim_ = std::max(vec_dim_, row.dim());
}
std::copy_n(ptr, element_count, field_data_.data() + length_);
std::copy_n(ptr, element_count, data_.data() + length_);
length_ += element_count;
}

@@ -412,7 +552,7 @@ class FieldDataSparseVectorImpl

for (int64_t i = 0; i < array->length(); ++i) {
auto view = array->GetView(i);
auto& row = field_data_[length_ + i];
auto& row = data_[length_ + i];
row = CopyAndWrapSparseRow(view.data(), view.size());
vec_dim_ = std::max(vec_dim_, row.dim());
}

@@ -430,27 +570,28 @@ class FieldDataSparseVectorImpl

class FieldDataArrayImpl : public FieldDataImpl<Array, true> {
public:
explicit FieldDataArrayImpl(DataType data_type, int64_t total_num_rows = 0)
: FieldDataImpl<Array, true>(1, data_type, total_num_rows) {
explicit FieldDataArrayImpl(DataType data_type,
bool nullable,
int64_t total_num_rows = 0)
: FieldDataImpl<Array, true>(1, data_type, nullable, total_num_rows) {
}

int64_t
Size() const {
DataSize() const override {
int64_t data_size = 0;
for (size_t offset = 0; offset < length(); ++offset) {
data_size += field_data_[offset].byte_size();
data_size += data_[offset].byte_size();
}

return data_size;
}

int64_t
Size(ssize_t offset) const {
DataSize(ssize_t offset) const override {
AssertInfo(offset < get_num_rows(),
"field data subscript out of range");
AssertInfo(offset < length(),
"subscript position don't has valid value");
return field_data_[offset].byte_size();
return data_[offset].byte_size();
}
};

@@ -35,27 +35,34 @@ class FieldMeta {
FieldMeta&
operator=(FieldMeta&&) = default;

FieldMeta(const FieldName& name, FieldId id, DataType type)
: name_(name), id_(id), type_(type) {
FieldMeta(const FieldName& name, FieldId id, DataType type, bool nullable)
: name_(name), id_(id), type_(type), nullable_(nullable) {
Assert(!IsVectorDataType(type_));
}

FieldMeta(const FieldName& name,
FieldId id,
DataType type,
int64_t max_length)
int64_t max_length,
bool nullable)
: name_(name),
id_(id),
type_(type),
string_info_(StringInfo{max_length}) {
string_info_(StringInfo{max_length}),
nullable_(nullable) {
Assert(IsStringDataType(type_));
}

FieldMeta(const FieldName& name,
FieldId id,
DataType type,
DataType element_type)
: name_(name), id_(id), type_(type), element_type_(element_type) {
DataType element_type,
bool nullable)
: name_(name),
id_(id),
type_(type),
element_type_(element_type),
nullable_(nullable) {
Assert(IsArrayDataType(type_));
}

@@ -65,12 +72,15 @@ class FieldMeta {
FieldId id,
DataType type,
int64_t dim,
std::optional<knowhere::MetricType> metric_type)
std::optional<knowhere::MetricType> metric_type,
bool nullable)
: name_(name),
id_(id),
type_(type),
vector_info_(VectorInfo{dim, std::move(metric_type)}) {
vector_info_(VectorInfo{dim, std::move(metric_type)}),
nullable_(nullable) {
Assert(IsVectorDataType(type_));
Assert(!nullable);
}

int64_t

@@ -126,6 +136,11 @@ class FieldMeta {
return IsStringDataType(type_);
}

bool
is_nullable() const {
return nullable_;
}

size_t
get_sizeof() const {
AssertInfo(!IsSparseFloatVectorDataType(type_),

@@ -157,6 +172,7 @@ class FieldMeta {
FieldId id_;
DataType type_ = DataType::NONE;
DataType element_type_ = DataType::NONE;
bool nullable_;
std::optional<VectorInfo> vector_info_;
std::optional<StringInfo> string_info_;
};

@@ -38,7 +38,7 @@ Schema::ParseFrom(const milvus::proto::schema::CollectionSchema& schema_proto) {
schema_proto.fields()) {
auto field_id = FieldId(child.fieldid());
auto name = FieldName(child.name());

auto nullable = child.nullable();
if (field_id.get() < 100) {
// system field id
auto is_system =

@@ -60,22 +60,27 @@ Schema::ParseFrom(const milvus::proto::schema::CollectionSchema& schema_proto) {
dim = boost::lexical_cast<int64_t>(type_map.at("dim"));
}
if (!index_map.count("metric_type")) {
schema->AddField(name, field_id, data_type, dim, std::nullopt);
schema->AddField(
name, field_id, data_type, dim, std::nullopt, false);
} else {
auto metric_type = index_map.at("metric_type");
schema->AddField(name, field_id, data_type, dim, metric_type);
schema->AddField(
name, field_id, data_type, dim, metric_type, false);
}
} else if (IsStringDataType(data_type)) {
auto type_map = RepeatedKeyValToMap(child.type_params());
AssertInfo(type_map.count(MAX_LENGTH), "max_length not found");
auto max_len =
boost::lexical_cast<int64_t>(type_map.at(MAX_LENGTH));
schema->AddField(name, field_id, data_type, max_len);
schema->AddField(name, field_id, data_type, max_len, nullable);
} else if (IsArrayDataType(data_type)) {
schema->AddField(
name, field_id, data_type, DataType(child.element_type()));
schema->AddField(name,
field_id,
data_type,
DataType(child.element_type()),
nullable);
} else {
schema->AddField(name, field_id, data_type);
schema->AddField(name, field_id, data_type, nullable);
}

if (child.is_primary_key()) {

@@ -93,6 +98,7 @@ Schema::ParseFrom(const milvus::proto::schema::CollectionSchema& schema_proto) {

const FieldMeta FieldMeta::RowIdMeta(FieldName("RowID"),
RowFieldID,
DataType::INT64);
DataType::INT64,
false);

} // namespace milvus
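End to end, the nullable flag now flows from the proto schema into FieldMeta, and vector fields are asserted non-nullable. A hedged sketch of declaring fields programmatically (call shapes mirror the AddField overloads in this diff; the header path and metric constant are illustrative assumptions):

#include "common/Schema.h"  // assumed header for Schema/FieldMeta

void ExampleSchema() {
    using namespace milvus;
    Schema schema;
    // Scalar INT64 field that may contain nulls (user field ids start at 100).
    schema.AddField(FieldName("age"), FieldId(101), DataType::INT64,
                    /*nullable=*/true);
    // Vector fields must stay non-nullable; the FieldMeta ctor asserts this.
    schema.AddField(FieldName("embedding"), FieldId(102),
                    DataType::VECTOR_FLOAT, /*dim=*/128,
                    std::optional<knowhere::MetricType>("L2"),
                    /*nullable=*/false);
}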
@@ -34,29 +34,35 @@ static int64_t debug_id = START_USER_FIELDID;
class Schema {
public:
FieldId
AddDebugField(const std::string& name, DataType data_type) {
AddDebugField(const std::string& name,
DataType data_type,
bool nullable = false) {
auto field_id = FieldId(debug_id);
debug_id++;
this->AddField(FieldName(name), field_id, data_type);
this->AddField(FieldName(name), field_id, data_type, nullable);
return field_id;
}

FieldId
AddDebugField(const std::string& name,
DataType data_type,
DataType element_type) {
DataType element_type,
bool nullable = false) {
auto field_id = FieldId(debug_id);
debug_id++;
this->AddField(FieldName(name), field_id, data_type, element_type);
this->AddField(
FieldName(name), field_id, data_type, element_type, nullable);
return field_id;
}

FieldId
AddDebugArrayField(const std::string& name, DataType element_type) {
AddDebugArrayField(const std::string& name,
DataType element_type,
bool nullable) {
auto field_id = FieldId(debug_id);
debug_id++;
this->AddField(
FieldName(name), field_id, DataType::ARRAY, element_type);
FieldName(name), field_id, DataType::ARRAY, element_type, nullable);
return field_id;
}

@@ -68,16 +74,19 @@ class Schema {
std::optional<knowhere::MetricType> metric_type) {
auto field_id = FieldId(debug_id);
debug_id++;
auto field_meta =
FieldMeta(FieldName(name), field_id, data_type, dim, metric_type);
auto field_meta = FieldMeta(
FieldName(name), field_id, data_type, dim, metric_type, false);
this->AddField(std::move(field_meta));
return field_id;
}

// scalar type
void
AddField(const FieldName& name, const FieldId id, DataType data_type) {
auto field_meta = FieldMeta(name, id, data_type);
AddField(const FieldName& name,
const FieldId id,
DataType data_type,
bool nullable) {
auto field_meta = FieldMeta(name, id, data_type, nullable);
this->AddField(std::move(field_meta));
}

@@ -86,8 +95,10 @@ class Schema {
AddField(const FieldName& name,
const FieldId id,
DataType data_type,
DataType element_type) {
auto field_meta = FieldMeta(name, id, data_type, element_type);
DataType element_type,
bool nullable) {
auto field_meta =
FieldMeta(name, id, data_type, element_type, nullable);
this->AddField(std::move(field_meta));
}

@@ -96,8 +107,9 @@ class Schema {
AddField(const FieldName& name,
const FieldId id,
DataType data_type,
int64_t max_length) {
auto field_meta = FieldMeta(name, id, data_type, max_length);
int64_t max_length,
bool nullable) {
auto field_meta = FieldMeta(name, id, data_type, max_length, nullable);
this->AddField(std::move(field_meta));
}

@@ -107,8 +119,10 @@ class Schema {
const FieldId id,
DataType data_type,
int64_t dim,
std::optional<knowhere::MetricType> metric_type) {
auto field_meta = FieldMeta(name, id, data_type, dim, metric_type);
std::optional<knowhere::MetricType> metric_type,
bool nullable) {
auto field_meta =
FieldMeta(name, id, data_type, dim, metric_type, false);
this->AddField(std::move(field_meta));
}

@@ -65,7 +65,8 @@ class ColumnVector final : public BaseVector {
size_t length,
std::optional<size_t> null_count = std::nullopt)
: BaseVector(data_type, length, null_count) {
values_ = InitScalarFieldData(data_type, length);
// todo: support null expr
values_ = InitScalarFieldData(data_type, false, length);
}

// ColumnVector(FixedVector<bool>&& data)

@@ -78,7 +79,7 @@ class ColumnVector final : public BaseVector {
ColumnVector(TargetBitmap&& bitmap)
: BaseVector(DataType::INT8, bitmap.size()) {
values_ = std::make_shared<FieldDataImpl<uint8_t, false>>(
bitmap.size(), DataType::INT8, std::move(bitmap).into());
bitmap.size(), DataType::INT8, false, std::move(bitmap).into());
}

virtual ~ColumnVector() override {

@@ -117,8 +117,9 @@ BitmapIndex<T>::BuildV2(const Config& config) {
auto data = rec.ValueUnsafe();
auto total_num_rows = data->num_rows();
auto col_data = data->GetColumnByName(field_name);
// todo: support nullable index
auto field_data = storage::CreateFieldData(
DataType(GetDType<T>()), 0, total_num_rows);
DataType(GetDType<T>()), false, 0, total_num_rows);
field_data->FillFieldData(col_data);
field_datas.push_back(field_data);
}

@@ -295,8 +295,9 @@ HybridScalarIndex<T>::BuildV2(const Config& config) {
auto data = rec.ValueUnsafe();
auto total_num_rows = data->num_rows();
auto col_data = data->GetColumnByName(field_name);
// todo: support nullable index
auto field_data = storage::CreateFieldData(
DataType(GetDType<T>()), 0, total_num_rows);
DataType(GetDType<T>()), false, 0, total_num_rows);
field_data->FillFieldData(col_data);
field_datas.push_back(field_data);
}

@@ -169,8 +169,9 @@ InvertedIndexTantivy<T>::BuildV2(const Config& config) {
auto data = rec.ValueUnsafe();
auto total_num_rows = data->num_rows();
auto col_data = data->GetColumnByName(field_name);
// todo: support nullable index
auto field_data = storage::CreateFieldData(
DataType(GetDType<T>()), 0, total_num_rows);
DataType(GetDType<T>()), false, 0, total_num_rows);
field_data->FillFieldData(col_data);
field_datas.push_back(field_data);
}

@@ -72,8 +72,15 @@ ScalarIndexSort<T>::BuildV2(const Config& config) {
auto data = rec.ValueUnsafe();
auto total_num_rows = data->num_rows();
auto col_data = data->GetColumnByName(field_name);
auto nullable =
col_data->type()->id() == arrow::Type::NA ? true : false;
// building a scalar index on a nullable field will be supported in the
// future; for now it is not supported, so just skip it
// todo: support nullable index
AssertInfo(!nullable,
"not support to build index in nullable field_data");
auto field_data = storage::CreateFieldData(
DataType(GetDType<T>()), 0, total_num_rows);
DataType(GetDType<T>()), nullable, 0, total_num_rows);
field_data->FillFieldData(col_data);
field_datas.push_back(field_data);
}

@@ -83,8 +83,15 @@ StringIndexMarisa::BuildV2(const Config& config) {
auto data = rec.ValueUnsafe();
auto total_num_rows = data->num_rows();
auto col_data = data->GetColumnByName(field_name);
auto field_data =
storage::CreateFieldData(DataType::STRING, 0, total_num_rows);
auto nullable =
col_data->type()->id() == arrow::Type::NA ? true : false;
// building a scalar index on a nullable field will be supported in the
// future; for now it is not supported, so just skip it
// todo: support nullable index
AssertInfo(!nullable,
"not support to build index in nullable field_data");
auto field_data = storage::CreateFieldData(
DataType::STRING, nullable, 0, total_num_rows);
field_data->FillFieldData(col_data);
field_datas.push_back(field_data);
}

@@ -249,9 +249,9 @@ AssembleIndexDatas(std::map<std::string, FieldDataPtr>& index_datas) {
std::string prefix = item[NAME];
int slice_num = item[SLICE_NUM];
auto total_len = static_cast<size_t>(item[TOTAL_LEN]);

// todo: support nullable index
auto new_field_data =
storage::CreateFieldData(DataType::INT8, 1, total_len);
storage::CreateFieldData(DataType::INT8, false, 1, total_len);

for (auto i = 0; i < slice_num; ++i) {
std::string file_name = GenSlicedFileName(prefix, i);

@@ -288,9 +288,9 @@ AssembleIndexDatas(std::map<std::string, FieldDataChannelPtr>& index_datas,
std::string prefix = item[NAME];
int slice_num = item[SLICE_NUM];
auto total_len = static_cast<size_t>(item[TOTAL_LEN]);

// todo: support nullable index
auto new_field_data =
storage::CreateFieldData(DataType::INT8, 1, total_len);
storage::CreateFieldData(DataType::INT8, false, 1, total_len);

for (auto i = 0; i < slice_num; ++i) {
std::string file_name = GenSlicedFileName(prefix, i);

@@ -259,9 +259,9 @@ VectorMemIndex<T>::LoadV2(const Config& config) {
std::string prefix = item[NAME];
int slice_num = item[SLICE_NUM];
auto total_len = static_cast<size_t>(item[TOTAL_LEN]);

auto new_field_data =
milvus::storage::CreateFieldData(DataType::INT8, 1, total_len);
// todo: support nullable index
auto new_field_data = milvus::storage::CreateFieldData(
DataType::INT8, false, 1, total_len);
for (auto i = 0; i < slice_num; ++i) {
std::string file_name =
index_prefix + "/" + GenSlicedFileName(prefix, i);

@@ -358,9 +358,9 @@ VectorMemIndex<T>::Load(milvus::tracer::TraceContext ctx,
std::string prefix = item[NAME];
int slice_num = item[SLICE_NUM];
auto total_len = static_cast<size_t>(item[TOTAL_LEN]);

// todo: support nullable index
auto new_field_data = milvus::storage::CreateFieldData(
DataType::INT8, 1, total_len);
DataType::INT8, false, 1, total_len);

std::vector<std::string> batch;
batch.reserve(slice_num);

@@ -462,8 +462,9 @@ VectorMemIndex<T>::BuildV2(const Config& config) {
}
auto total_num_rows = data->num_rows();
auto col_data = data->GetColumnByName(field_name);
// todo: support nullable index
auto field_data =
storage::CreateFieldData(field_type, dim, total_num_rows);
storage::CreateFieldData(field_type, false, dim, total_num_rows);
field_data->FillFieldData(col_data);
field_datas.push_back(field_data);
}

@@ -21,6 +21,7 @@
#include <cstddef>
#include <cstring>
#include <filesystem>
#include <memory>
#include <queue>
#include <string>
#include <vector>

@@ -52,7 +53,6 @@ namespace milvus {
*/
constexpr size_t STRING_PADDING = 1;
constexpr size_t ARRAY_PADDING = 1;

constexpr size_t BLOCK_SIZE = 8192;

class ColumnBase {

@@ -74,10 +74,10 @@ class ColumnBase {

type_size_ = field_meta.get_sizeof();

cap_size_ = type_size_ * reserve;
data_cap_size_ = field_meta.get_sizeof() * reserve;

// use anon mapping so we are able to free this memory with munmap only
size_t mapped_size = data_cap_size_ + padding_;
data_ = static_cast<char*>(mmap(nullptr,
mapped_size,
PROT_READ | PROT_WRITE,

@@ -89,6 +89,10 @@ class ColumnBase {
strerror(errno),
mapped_size);

if (field_meta.is_nullable()) {
nullable_ = true;
valid_data_.reserve(reserve);
}
UpdateMetricWhenMmap(mapped_size);
}

@@ -97,24 +101,29 @@ class ColumnBase {
int dim,
const DataType& data_type,
storage::MmapChunkManagerPtr mcm,
storage::MmapChunkDescriptorPtr descriptor)
storage::MmapChunkDescriptorPtr descriptor,
bool nullable)
: mcm_(mcm),
mmap_descriptor_(descriptor),
type_size_(GetDataTypeSize(data_type, dim)),
num_rows_(0),
size_(0),
cap_size_(reserve),
mapping_type_(MAP_WITH_MANAGER) {
data_size_(0),
data_cap_size_(reserve),
mapping_type_(MAP_WITH_MANAGER),
nullable_(nullable) {
AssertInfo((mcm != nullptr) && descriptor != nullptr,
"use wrong mmap chunk manager and mmap chunk descriptor to "
"create column.");

SetPaddingSize(data_type);
size_t mapped_size = cap_size_ + padding_;
size_t mapped_size = data_cap_size_ + padding_;
data_ = (char*)mcm_->Allocate(mmap_descriptor_, (uint64_t)mapped_size);
AssertInfo(data_ != nullptr,
"fail to create with mmap manager: map_size = {}",
mapped_size);
if (nullable_) {
valid_data_.reserve(reserve);
}
}

// mmap mode ctor

@@ -128,11 +137,11 @@ class ColumnBase {
num_rows_ = size / type_size_;
}

size_ = size;
cap_size_ = size;
data_size_ = size;
data_cap_size_ = size;
// use exactly same size of file, padding shall be written in file already
// see also https://github.com/milvus-io/milvus/issues/34442
size_t mapped_size = cap_size_;
size_t mapped_size = data_cap_size_;
data_ = static_cast<char*>(mmap(
nullptr, mapped_size, PROT_READ, MAP_SHARED, file.Descriptor(), 0));
AssertInfo(data_ != MAP_FAILED,

@@ -140,6 +149,12 @@ class ColumnBase {
strerror(errno));
madvise(data_, mapped_size, MADV_WILLNEED);

// valid_data is stored in memory
if (field_meta.is_nullable()) {
nullable_ = true;
valid_data_.reserve(num_rows_);
}

UpdateMetricWhenMmap(mapped_size);
}

@@ -148,15 +163,17 @@ class ColumnBase {
ColumnBase(const File& file,
size_t size,
int dim,
const DataType& data_type)
: size_(size),
cap_size_(size),
const DataType& data_type,
bool nullable)
: data_size_(size),
data_cap_size_(size),
nullable_(nullable),
mapping_type_(MappingType::MAP_WITH_FILE) {
SetPaddingSize(data_type);

// use exact same size of file, padding shall be written in file already
// see also https://github.com/milvus-io/milvus/issues/34442
size_t mapped_size = cap_size_;
size_t mapped_size = data_cap_size_;
if (!IsVariableDataType(data_type)) {
type_size_ = GetDataTypeSize(data_type, dim);
num_rows_ = size / type_size_;

@@ -167,35 +184,44 @@ class ColumnBase {
"failed to create file-backed map, err: {}",
strerror(errno));

if (nullable) {
valid_data_.reserve(num_rows_);
}

UpdateMetricWhenMmap(mapped_size);
}

virtual ~ColumnBase() {
if (data_ != nullptr) {
if (mapping_type_ != MappingType::MAP_WITH_MANAGER) {
size_t mapped_size = cap_size_ + padding_;
size_t mapped_size = data_cap_size_ + padding_;
if (munmap(data_, mapped_size)) {
AssertInfo(true,
"failed to unmap variable field, err={}",
strerror(errno));
}
}
UpdateMetricWhenMunmap(cap_size_ + padding_);
UpdateMetricWhenMunmap(data_cap_size_ + padding_);
}
if (nullable_) {
valid_data_.clear();
}
}

ColumnBase(ColumnBase&& column) noexcept
: data_(column.data_),
cap_size_(column.cap_size_),
nullable_(column.nullable_),
valid_data_(column.valid_data_),
padding_(column.padding_),
type_size_(column.type_size_),
num_rows_(column.num_rows_),
size_(column.size_) {
data_size_(column.data_size_) {
column.data_ = nullptr;
column.cap_size_ = 0;
column.data_cap_size_ = 0;
column.padding_ = 0;
column.num_rows_ = 0;
column.size_ = 0;
column.data_size_ = 0;
column.nullable_ = false;
}

// Data() points at an addr that contains the elements

@@ -210,6 +236,21 @@ class ColumnBase {
return data_;
}

bool
IsValid(size_t offset) const {
return valid_data_[offset];
}

bool
IsNullable() const {
return nullable_;
}

size_t
DataSize() const {
return data_size_;
}

size_t
NumRows() const {
return num_rows_;

@@ -217,14 +258,15 @@ class ColumnBase {

virtual size_t
ByteSize() const {
return cap_size_ + padding_;
// folly::fbvector<bool> is implemented with bit compression.
return data_cap_size_ + padding_ + (valid_data_.size() + 7) / 8;
}

// The capacity of the column,
// DO NOT call this for variable length column(including SparseFloatColumn).
virtual size_t
Capacity() const {
return cap_size_ / type_size_;
return data_cap_size_ / type_size_;
}

virtual SpanBase

@@ -245,28 +287,55 @@ class ColumnBase {

virtual void
AppendBatch(const FieldDataPtr data) {
size_t required_size = size_ + data->Size();
if (required_size > cap_size_) {
Expand(required_size * 2 + padding_);
size_t required_size = data_size_ + data->DataSize();
if (required_size > data_cap_size_) {
ExpandData(required_size * 2 + padding_);
}

std::copy_n(static_cast<const char*>(data->Data()),
data->Size(),
data_ + size_);
size_ = required_size;
data->DataSize(),
data_ + data_size_);
data_size_ = required_size;
if (nullable_) {
size_t required_rows = num_rows_ + data->get_num_rows();
if (required_rows > valid_data_.size()) {
valid_data_.reserve(required_rows * 2);
}

for (size_t i = 0; i < data->get_num_rows(); i++) {
valid_data_.push_back(data->is_valid(i));
}
}
num_rows_ += data->Length();
}

// Append one row
virtual void
Append(const char* data, size_t size) {
size_t required_size = size_ + size;
if (required_size > cap_size_) {
Expand(required_size * 2);
AssertInfo(!nullable_,
"no need to pass valid_data when nullable is false");
size_t required_size = data_size_ + size;
if (required_size > data_cap_size_) {
ExpandData(required_size * 2);
}

std::copy_n(data, size, data_ + size_);
size_ = required_size;
std::copy_n(data, size, data_ + data_size_);
data_size_ = required_size;
num_rows_++;
}

// Append one row
virtual void
Append(const char* data, const bool valid_data, size_t size) {
AssertInfo(nullable_, "need to pass valid_data_ when nullable is true");
size_t required_size = data_size_ + size;
if (required_size > data_cap_size_) {
ExpandData(required_size * 2);
}

std::copy_n(data, size, data_ + data_size_);
valid_data_.push_back(valid_data);
data_size_ = required_size;
num_rows_++;
}
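Note the asymmetry the two Append overloads enforce: payload bytes grow by size while validity grows by exactly one entry per row, since ColumnBase keeps validity as FixedVector<bool> rather than the packed uint8_t bitmap used by FieldData. A toy standalone model of that pairing (illustrative types only):

#include <cassert>
#include <string>
#include <vector>

// Toy model of the nullable column append path: raw bytes for the payload,
// one bool per row for validity.
struct ToyColumn {
    std::string data;              // stands in for the mmap'ed buffer
    std::vector<bool> valid_data;  // per-row validity, like FixedVector<bool>
    size_t num_rows = 0;

    void Append(const char* src, bool valid, size_t size) {
        data.append(src, size);
        valid_data.push_back(valid);
        num_rows++;
    }
};

int main() {
    ToyColumn col;
    col.Append("abc", true, 3);
    col.Append("", false, 0);  // a null row contributes no payload bytes
    assert(col.num_rows == 2 && col.data.size() == 3);
    assert(col.valid_data[0] && !col.valid_data[1]);
}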
@@ -290,10 +359,15 @@ class ColumnBase {
}
}

void
SetValidData(FixedVector<bool>&& valid_data) {
valid_data_ = std::move(valid_data);
}

protected:
// only for memory mode and mmap manager mode, not mmap
void
Expand(size_t new_size) {
ExpandData(size_t new_size) {
if (new_size == 0) {
return;
}

@@ -317,8 +391,8 @@ class ColumnBase {
new_size + padding_);

if (data_ != nullptr) {
std::memcpy(data, data_, size_);
if (munmap(data_, cap_size_ + padding_)) {
std::memcpy(data, data_, data_size_);
if (munmap(data_, data_cap_size_ + padding_)) {
auto err = errno;
size_t mapped_size = new_size + padding_;
munmap(data, mapped_size);

@@ -328,13 +402,13 @@ class ColumnBase {
false,
"failed to unmap while expanding: {}, old_map_size={}",
strerror(err),
cap_size_ + padding_);
data_cap_size_ + padding_);
}
UpdateMetricWhenMunmap(cap_size_ + padding_);
UpdateMetricWhenMunmap(data_cap_size_ + padding_);
}

data_ = data;
cap_size_ = new_size;
data_cap_size_ = new_size;
mapping_type_ = MappingType::MAP_WITH_ANONYMOUS;
} else if (mapping_type_ == MappingType::MAP_WITH_MANAGER) {
size_t new_mapped_size = new_size + padding_;

@@ -342,25 +416,30 @@ class ColumnBase {
AssertInfo(data != nullptr,
"fail to create with mmap manager: map_size = {}",
new_mapped_size);
std::memcpy(data, data_, size_);
std::memcpy(data, data_, data_cap_size_);
// allocated space is only appended within one growing segment, so no need to munmap()
data_ = (char*)data;
cap_size_ = new_size;
data_cap_size_ = new_size;
mapping_type_ = MappingType::MAP_WITH_MANAGER;
}
}

char* data_{nullptr};
bool nullable_{false};
// When merging multiple valid_data bitmaps, the bit operation logic is very complex
// because FixedVector<bool> uses bit granularity for storage and access,
// so FixedVector is also used to store valid_data on the sealed segment.
FixedVector<bool> valid_data_;
// capacity in bytes
size_t cap_size_{0};
size_t data_cap_size_{0};
size_t padding_{0};
// type_size_ is not used for sparse float vector column.
size_t type_size_{1};
size_t num_rows_{0};

// length in bytes
size_t size_{0};
storage::MmapChunkDescriptorPtr mmap_descriptor_ = nullptr;
size_t data_size_{0};

private:
void

@@ -413,16 +492,21 @@ class Column : public ColumnBase {
}

// mmap mode ctor
Column(const File& file, size_t size, int dim, DataType data_type)
: ColumnBase(file, size, dim, data_type) {
Column(const File& file,
size_t size,
int dim,
DataType data_type,
bool nullable)
: ColumnBase(file, size, dim, data_type, nullable) {
}

Column(size_t reserve,
int dim,
const DataType& data_type,
storage::MmapChunkManagerPtr mcm,
storage::MmapChunkDescriptorPtr descriptor)
: ColumnBase(reserve, dim, data_type, mcm, descriptor) {
storage::MmapChunkDescriptorPtr descriptor,
bool nullable)
: ColumnBase(reserve, dim, data_type, mcm, descriptor, nullable) {
}

Column(Column&& column) noexcept : ColumnBase(std::move(column)) {

@@ -432,7 +516,7 @@ class Column : public ColumnBase {

SpanBase
Span() const override {
return SpanBase(data_, num_rows_, cap_size_ / num_rows_);
return SpanBase(data_, num_rows_, data_cap_size_ / num_rows_);
}
};

@@ -453,7 +537,7 @@ class SparseFloatColumn : public ColumnBase {
size_t size,
int dim,
const DataType& data_type)
: ColumnBase(file, size, dim, data_type) {
: ColumnBase(file, size, dim, data_type, false) {
}
// mmap with mmap manager
SparseFloatColumn(size_t reserve,

@@ -461,7 +545,7 @@ class SparseFloatColumn : public ColumnBase {
const DataType& data_type,
storage::MmapChunkManagerPtr mcm,
storage::MmapChunkDescriptorPtr descriptor)
: ColumnBase(reserve, dim, data_type, mcm, descriptor) {
: ColumnBase(reserve, dim, data_type, mcm, descriptor, false) {
}

SparseFloatColumn(SparseFloatColumn&& column) noexcept

@@ -477,6 +561,14 @@ class SparseFloatColumn : public ColumnBase {
return static_cast<const char*>(static_cast<const void*>(vec_.data()));
}

// This is used to advise mmap prefetch; we don't currently support mmap for
// sparse float vector, so it is not implemented for now.
size_t
ByteSize() const override {
PanicInfo(ErrorCode::Unsupported,
"ByteSize not supported for sparse float column");
}

size_t
Capacity() const override {
PanicInfo(ErrorCode::Unsupported,

@@ -524,7 +616,7 @@ class SparseFloatColumn : public ColumnBase {
num_rows_ = indices.size();
// so that indices[num_rows_] - indices[num_rows_ - 1] is the size of
// the last row.
indices.push_back(size_);
indices.push_back(data_size_);
for (size_t i = 0; i < num_rows_; i++) {
auto vec_size = indices[i + 1] - indices[i];
AssertInfo(

@@ -564,8 +656,9 @@ class VariableColumn : public ColumnBase {
int dim,
const DataType& data_type,
storage::MmapChunkManagerPtr mcm,
storage::MmapChunkDescriptorPtr descriptor)
: ColumnBase(reserve, dim, data_type, mcm, descriptor) {
storage::MmapChunkDescriptorPtr descriptor,
bool nullable)
: ColumnBase(reserve, dim, data_type, mcm, descriptor, nullable) {
}

VariableColumn(VariableColumn&& column) noexcept

@@ -622,7 +715,7 @@ class VariableColumn : public ColumnBase {
pos += sizeof(uint32_t) + size;
}

return BufferView{pos, size_ - (pos - data_)};
return BufferView{pos, data_size_ - (pos - data_)};
}

ViewType

@@ -654,9 +747,12 @@ class VariableColumn : public ColumnBase {
void
Append(FieldDataPtr chunk) {
for (auto i = 0; i < chunk->get_num_rows(); i++) {
indices_.emplace_back(size_);
indices_.emplace_back(data_size_);
auto data = static_cast<const T*>(chunk->RawValue(i));
size_ += sizeof(uint32_t) + data->size();
data_size_ += sizeof(uint32_t) + data->size();
if (nullable_) {
valid_data_.push_back(chunk->is_valid(i));
}
}
load_buf_.emplace(std::move(chunk));
}

@@ -671,9 +767,9 @@ class VariableColumn : public ColumnBase {

// for variable length column in memory mode only
if (data_ == nullptr) {
size_t total_size = size_;
size_ = 0;
Expand(total_size);
size_t total_data_size = data_size_;
data_size_ = 0;
ExpandData(total_data_size);

while (!load_buf_.empty()) {
auto chunk = std::move(load_buf_.front());

@@ -681,12 +777,19 @@
||||
// data_ as: |size|data|size|data......
|
||||
for (auto i = 0; i < chunk->get_num_rows(); i++) {
|
||||
auto current_size = (uint32_t)chunk->Size(i);
|
||||
std::memcpy(data_ + size_, ¤t_size, sizeof(uint32_t));
|
||||
size_ += sizeof(uint32_t);
|
||||
auto current_size = (uint32_t)chunk->DataSize(i);
|
||||
std::memcpy(
|
||||
data_ + data_size_, ¤t_size, sizeof(uint32_t));
|
||||
data_size_ += sizeof(uint32_t);
|
||||
auto data = static_cast<const T*>(chunk->RawValue(i));
|
||||
std::memcpy(data_ + size_, data->c_str(), data->size());
|
||||
size_ += data->size();
|
||||
std::memcpy(
|
||||
data_ + data_size_, data->c_str(), data->size());
|
||||
data_size_ += data->size();
|
||||
}
|
||||
if (nullable_) {
|
||||
for (size_t i = 0; i < chunk->get_num_rows(); i++) {
|
||||
valid_data_.push_back(chunk->is_valid(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -711,7 +814,6 @@ class VariableColumn : public ColumnBase {
|
|||
private:
|
||||
// loading states
|
||||
std::queue<FieldDataPtr> load_buf_{};
|
||||
|
||||
// raw data index, record indices located 0, interval, 2 * interval, 3 * interval
|
||||
// ... just like page index, interval set to 8192 that matches search engine's batch size
|
||||
std::vector<uint64_t> indices_{};
|
||||
|
@ -735,8 +837,9 @@ class ArrayColumn : public ColumnBase {
|
|||
int dim,
|
||||
const DataType& data_type,
|
||||
storage::MmapChunkManagerPtr mcm,
|
||||
storage::MmapChunkDescriptorPtr descriptor)
|
||||
: ColumnBase(reserve, dim, data_type, mcm, descriptor) {
|
||||
storage::MmapChunkDescriptorPtr descriptor,
|
||||
bool nullable)
|
||||
: ColumnBase(reserve, dim, data_type, mcm, descriptor, nullable) {
|
||||
}
|
||||
|
||||
ArrayColumn(ArrayColumn&& column) noexcept
|
||||
|
@ -769,9 +872,14 @@ class ArrayColumn : public ColumnBase {
|
|||
}
|
||||
|
||||
void
|
||||
Append(const Array& array) {
|
||||
indices_.emplace_back(size_);
|
||||
Append(const Array& array, bool valid_data = false) {
|
||||
indices_.emplace_back(data_size_);
|
||||
element_indices_.emplace_back(array.get_offsets());
|
||||
if (nullable_) {
|
||||
return ColumnBase::Append(static_cast<const char*>(array.data()),
|
||||
array.byte_size(),
|
||||
valid_data);
|
||||
}
|
||||
ColumnBase::Append(static_cast<const char*>(array.data()),
|
||||
array.byte_size());
|
||||
}
|
||||
|
@ -797,7 +905,7 @@ class ArrayColumn : public ColumnBase {
|
|||
std::move(element_indices_[i]));
|
||||
}
|
||||
views_.emplace_back(data_ + indices_.back(),
|
||||
size_ - indices_.back(),
|
||||
data_size_ - indices_.back(),
|
||||
element_type_,
|
||||
std::move(element_indices_[indices_.size() - 1]));
|
||||
element_indices_.clear();
|
||||
|
@ -810,4 +918,4 @@ class ArrayColumn : public ColumnBase {
|
|||
std::vector<ArrayView> views_{};
|
||||
DataType element_type_;
|
||||
};
|
||||
} // namespace milvus
|
||||
} // namespace milvus
|
|
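Aside: the valid_data_ comment above notes that merging bit-granularity bitmaps is complex. A minimal standalone sketch (not Milvus code; names are illustrative) of why: appending onto a bitmap whose length is not a multiple of 8 forces every source bit to be re-addressed individually, whereas byte-per-row validity can simply be concatenated.

#include <cstdint>
#include <cstdio>
#include <vector>

// Append `src_bits` validity bits from `src` onto `dst`, which already
// holds `dst_bits` bits. Bits are stored LSB-first, as in Arrow.
void AppendBitmap(std::vector<uint8_t>& dst, size_t dst_bits,
                  const uint8_t* src, size_t src_bits) {
    dst.resize((dst_bits + src_bits + 7) / 8, 0);
    for (size_t i = 0; i < src_bits; ++i) {
        bool bit = (src[i >> 3] >> (i & 7)) & 1;
        size_t j = dst_bits + i;
        if (bit) {
            dst[j >> 3] |= uint8_t(1) << (j & 7);
        }
    }
}

int main() {
    std::vector<uint8_t> merged{0b00000101};  // 3 bits: valid, null, valid
    uint8_t more = 0b00000011;                // 2 bits: valid, valid
    AppendBitmap(merged, 3, &more, 2);
    std::printf("%02x\n", unsigned(merged[0]));  // expect 1d (0b00011101)
}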
@ -87,7 +87,8 @@ WriteFieldData(File& file,
               const FieldDataPtr& data,
               uint64_t& total_written,
               std::vector<uint64_t>& indices,
               std::vector<std::vector<uint64_t>>& element_indices) {
               std::vector<std::vector<uint64_t>>& element_indices,
               FixedVector<bool>& valid_data) {
    if (IsVariableDataType(data_type)) {
        switch (data_type) {
            case DataType::VARCHAR:

@ -168,13 +169,22 @@ WriteFieldData(File& file,
        }
    } else {
        // write as: data|data|data|data|data|data......
        size_t written = file.Write(data->Data(), data->Size());
        if (written < data->Size()) {
        size_t written = file.Write(data->Data(), data->DataSize());
        if (written < data->DataSize()) {
            THROW_FILE_WRITE_ERROR
        }
        for (auto i = 0; i < data->get_num_rows(); i++) {
            indices.emplace_back(total_written);
            total_written += data->Size(i);
            total_written += data->DataSize(i);
        }
    }
    if (data->IsNullable()) {
        size_t required_rows = valid_data.size() + data->get_num_rows();
        if (required_rows > valid_data.size()) {
            valid_data.reserve(required_rows * 2);
        }
        for (size_t i = 0; i < data->get_num_rows(); i++) {
            valid_data.push_back(data->is_valid(i));
        }
    }
}
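Aside: a standalone sketch (illustrative, not the Milvus implementation) of the |size|data|size|data... layout used for variable-length values above, where each value is prefixed by a uint32_t byte length.

#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

// Serialize each string as: uint32_t length, then the raw bytes.
std::vector<char> Pack(const std::vector<std::string>& rows) {
    std::vector<char> buf;
    for (const auto& s : rows) {
        uint32_t n = static_cast<uint32_t>(s.size());
        buf.insert(buf.end(), reinterpret_cast<char*>(&n),
                   reinterpret_cast<char*>(&n) + sizeof(n));
        buf.insert(buf.end(), s.begin(), s.end());
    }
    return buf;
}

// Walk the buffer, reading the length prefix before each value.
std::vector<std::string> Unpack(const std::vector<char>& buf) {
    std::vector<std::string> rows;
    size_t pos = 0;
    while (pos < buf.size()) {
        uint32_t n;
        std::memcpy(&n, buf.data() + pos, sizeof(n));
        pos += sizeof(n);
        rows.emplace_back(buf.data() + pos, n);
        pos += n;
    }
    return rows;
}

int main() {
    auto buf = Pack({"milvus", "", "null-aware"});
    for (const auto& s : Unpack(buf)) std::cout << '[' << s << "]\n";
}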
@ -105,7 +105,7 @@ SearchOnGrowing(const segcore::SegmentGrowingImpl& segment,
        segment.get_chunk_mutex());
    int32_t current_chunk_id = 0;
    // step 3: brute force search where small indexing is unavailable
    auto vec_ptr = record.get_field_data_base(vecfield_id);
    auto vec_ptr = record.get_data_base(vecfield_id);
    auto vec_size_per_chunk = vec_ptr->get_size_per_chunk();
    auto max_chunk = upper_div(active_count, vec_size_per_chunk);
@ -41,8 +41,7 @@ class GrowingDataGetter : public DataGetter<T> {
    const segcore::ConcurrentVector<T>* growing_raw_data_;
    GrowingDataGetter(const segcore::SegmentGrowingImpl& segment,
                      FieldId fieldId) {
        growing_raw_data_ =
            segment.get_insert_record().get_field_data<T>(fieldId);
        growing_raw_data_ = segment.get_insert_record().get_data<T>(fieldId);
    }

    GrowingDataGetter(const GrowingDataGetter<T>& other)
@ -326,7 +326,6 @@ class ConcurrentVectorImpl : public VectorBase {
            fill_chunk(chunk_id, 0, element_count, source, source_offset);
        }
    }

    void
    fill_chunk(ssize_t chunk_id,
               ssize_t chunk_offset,
@ -312,7 +312,7 @@ class IndexingRecord {
        }
        auto& indexing = field_indexings_.at(fieldId);
        auto type = indexing->get_field_meta().get_data_type();
        auto field_raw_data = record.get_field_data_base(fieldId);
        auto field_raw_data = record.get_data_base(fieldId);
        if (type == DataType::VECTOR_FLOAT &&
            reserved_offset + size >= indexing->get_build_threshold()) {
            indexing->AppendSegmentIndexDense(

@ -349,11 +349,11 @@ class IndexingRecord {

        if (type == DataType::VECTOR_FLOAT &&
            reserved_offset + size >= indexing->get_build_threshold()) {
            auto vec_base = record.get_field_data_base(fieldId);
            auto vec_base = record.get_data_base(fieldId);
            indexing->AppendSegmentIndexDense(
                reserved_offset, size, vec_base, data->Data());
        } else if (type == DataType::VECTOR_SPARSE_FLOAT) {
            auto vec_base = record.get_field_data_base(fieldId);
            auto vec_base = record.get_data_base(fieldId);
            indexing->AppendSegmentIndexSparse(
                reserved_offset,
                size,
@ -12,6 +12,7 @@
#pragma once

#include <algorithm>
#include <cstddef>
#include <memory>
#include <mutex>
#include <shared_mutex>

@ -407,6 +408,65 @@ class OffsetOrderedArray : public OffsetMap {
    std::vector<std::pair<T, int32_t>> array_;
};
class ThreadSafeValidData {
 public:
    explicit ThreadSafeValidData() = default;
    explicit ThreadSafeValidData(FixedVector<bool> data)
        : data_(std::move(data)) {
    }

    void
    set_data_raw(const std::vector<FieldDataPtr>& datas) {
        std::unique_lock<std::shared_mutex> lck(mutex_);
        auto total = 0;
        for (auto& field_data : datas) {
            total += field_data->get_num_rows();
        }
        if (length_ + total > data_.size()) {
            data_.reserve(length_ + total);
        }
        length_ += total;
        for (auto& field_data : datas) {
            auto num_row = field_data->get_num_rows();
            for (size_t i = 0; i < num_row; i++) {
                data_.push_back(field_data->is_valid(i));
            }
        }
    }

    void
    set_data_raw(size_t num_rows,
                 const DataArray* data,
                 const FieldMeta& field_meta) {
        std::unique_lock<std::shared_mutex> lck(mutex_);
        if (field_meta.is_nullable()) {
            if (length_ + num_rows > data_.size()) {
                data_.reserve(length_ + num_rows);
            }

            auto src = data->valid_data().data();
            for (size_t i = 0; i < num_rows; ++i) {
                data_.push_back(src[i]);
                // data_[length_ + i] = src[i];
            }
            length_ += num_rows;
        }
    }

    bool
    is_valid(size_t offset) {
        std::shared_lock<std::shared_mutex> lck(mutex_);
        Assert(offset < length_);
        return data_[offset];
    }

 private:
    mutable std::shared_mutex mutex_{};
    FixedVector<bool> data_;
    // number of actual elements
    size_t length_{0};
};
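Aside: a minimal sketch of the locking pattern ThreadSafeValidData relies on: writers take a unique_lock on a shared_mutex while readers take shared_locks, so concurrent is_valid lookups do not serialize against each other (standalone code with illustrative names, using std::vector<bool> in place of FixedVector<bool>).

#include <cstddef>
#include <mutex>
#include <shared_mutex>
#include <vector>

class ValidBits {
 public:
    void append(const std::vector<bool>& bits) {
        std::unique_lock<std::shared_mutex> lk(mu_);  // exclusive: one writer
        bits_.insert(bits_.end(), bits.begin(), bits.end());
    }
    bool is_valid(size_t i) const {
        std::shared_lock<std::shared_mutex> lk(mu_);  // shared: many readers
        return bits_.at(i);
    }

 private:
    mutable std::shared_mutex mu_;
    std::vector<bool> bits_;
};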
template <bool is_sealed = false>
struct InsertRecord {
    InsertRecord(

@ -419,6 +479,9 @@ struct InsertRecord {
        for (auto& field : schema) {
            auto field_id = field.first;
            auto& field_meta = field.second;
            if (field_meta.is_nullable()) {
                this->append_valid_data(field_id);
            }
            if (pk2offset_ == nullptr && pk_field_id.has_value() &&
                pk_field_id.value() == field_id) {
                switch (field_meta.get_data_type()) {

@ -451,28 +514,28 @@ struct InsertRecord {
            }
            if (field_meta.is_vector()) {
                if (field_meta.get_data_type() == DataType::VECTOR_FLOAT) {
                    this->append_field_data<FloatVector>(
                    this->append_data<FloatVector>(
                        field_id, field_meta.get_dim(), size_per_chunk);
                    continue;
                } else if (field_meta.get_data_type() ==
                           DataType::VECTOR_BINARY) {
                    this->append_field_data<BinaryVector>(
                    this->append_data<BinaryVector>(
                        field_id, field_meta.get_dim(), size_per_chunk);
                    continue;
                } else if (field_meta.get_data_type() ==
                           DataType::VECTOR_FLOAT16) {
                    this->append_field_data<Float16Vector>(
                    this->append_data<Float16Vector>(
                        field_id, field_meta.get_dim(), size_per_chunk);
                    continue;
                } else if (field_meta.get_data_type() ==
                           DataType::VECTOR_BFLOAT16) {
                    this->append_field_data<BFloat16Vector>(
                    this->append_data<BFloat16Vector>(
                        field_id, field_meta.get_dim(), size_per_chunk);
                    continue;
                } else if (field_meta.get_data_type() ==
                           DataType::VECTOR_SPARSE_FLOAT) {
                    this->append_field_data<SparseFloatVector>(field_id,
                                                               size_per_chunk);
                    this->append_data<SparseFloatVector>(field_id,
                                                         size_per_chunk);
                    continue;
                } else {
                    PanicInfo(DataTypeInvalid,

@ -482,44 +545,43 @@ struct InsertRecord {
            }
            switch (field_meta.get_data_type()) {
                case DataType::BOOL: {
                    this->append_field_data<bool>(field_id, size_per_chunk);
                    this->append_data<bool>(field_id, size_per_chunk);
                    break;
                }
                case DataType::INT8: {
                    this->append_field_data<int8_t>(field_id, size_per_chunk);
                    this->append_data<int8_t>(field_id, size_per_chunk);
                    break;
                }
                case DataType::INT16: {
                    this->append_field_data<int16_t>(field_id, size_per_chunk);
                    this->append_data<int16_t>(field_id, size_per_chunk);
                    break;
                }
                case DataType::INT32: {
                    this->append_field_data<int32_t>(field_id, size_per_chunk);
                    this->append_data<int32_t>(field_id, size_per_chunk);
                    break;
                }
                case DataType::INT64: {
                    this->append_field_data<int64_t>(field_id, size_per_chunk);
                    this->append_data<int64_t>(field_id, size_per_chunk);
                    break;
                }
                case DataType::FLOAT: {
                    this->append_field_data<float>(field_id, size_per_chunk);
                    this->append_data<float>(field_id, size_per_chunk);
                    break;
                }
                case DataType::DOUBLE: {
                    this->append_field_data<double>(field_id, size_per_chunk);
                    this->append_data<double>(field_id, size_per_chunk);
                    break;
                }
                case DataType::VARCHAR: {
                    this->append_field_data<std::string>(field_id,
                                                         size_per_chunk);
                    this->append_data<std::string>(field_id, size_per_chunk);
                    break;
                }
                case DataType::JSON: {
                    this->append_field_data<Json>(field_id, size_per_chunk);
                    this->append_data<Json>(field_id, size_per_chunk);
                    break;
                }
                case DataType::ARRAY: {
                    this->append_field_data<Array>(field_id, size_per_chunk);
                    this->append_data<Array>(field_id, size_per_chunk);
                    break;
                }
                default: {

@ -666,23 +728,22 @@ struct InsertRecord {
        pk2offset_->seal();
    }

    // get field data without knowing the type
    // get data without knowing the type
    VectorBase*
    get_field_data_base(FieldId field_id) const {
        AssertInfo(fields_data_.find(field_id) != fields_data_.end(),
    get_data_base(FieldId field_id) const {
        AssertInfo(data_.find(field_id) != data_.end(),
                   "Cannot find field_data with field_id: " +
                       std::to_string(field_id.get()));
        AssertInfo(
            fields_data_.at(field_id) != nullptr,
            "fields_data_ at i is null" + std::to_string(field_id.get()));
        return fields_data_.at(field_id).get();
        AssertInfo(data_.at(field_id) != nullptr,
                   "data_ at i is null" + std::to_string(field_id.get()));
        return data_.at(field_id).get();
    }

    // get field data in given type, const version
    template <typename Type>
    const ConcurrentVector<Type>*
    get_field_data(FieldId field_id) const {
        auto base_ptr = get_field_data_base(field_id);
    get_data(FieldId field_id) const {
        auto base_ptr = get_data_base(field_id);
        auto ptr = dynamic_cast<const ConcurrentVector<Type>*>(base_ptr);
        Assert(ptr);
        return ptr;

@ -691,36 +752,58 @@ struct InsertRecord {
    // get field data in given type, non-const version
    template <typename Type>
    ConcurrentVector<Type>*
    get_field_data(FieldId field_id) {
        auto base_ptr = get_field_data_base(field_id);
    get_data(FieldId field_id) {
        auto base_ptr = get_data_base(field_id);
        auto ptr = dynamic_cast<ConcurrentVector<Type>*>(base_ptr);
        Assert(ptr);
        return ptr;
    }

    ThreadSafeValidData*
    get_valid_data(FieldId field_id) const {
        AssertInfo(valid_data_.find(field_id) != valid_data_.end(),
                   "Cannot find valid_data with field_id: " +
                       std::to_string(field_id.get()));
        AssertInfo(valid_data_.at(field_id) != nullptr,
                   "valid_data_ at i is null" + std::to_string(field_id.get()));
        return valid_data_.at(field_id).get();
    }

    bool
    is_valid_data_exist(FieldId field_id) {
        return valid_data_.find(field_id) != valid_data_.end();
    }

    // append a column of scalar or sparse float vector type
    template <typename Type>
    void
    append_field_data(FieldId field_id, int64_t size_per_chunk) {
    append_data(FieldId field_id, int64_t size_per_chunk) {
        static_assert(IsScalar<Type> || IsSparse<Type>);
        fields_data_.emplace(field_id,
                             std::make_unique<ConcurrentVector<Type>>(
                                 size_per_chunk, mmap_descriptor_));
        data_.emplace(field_id,
                      std::make_unique<ConcurrentVector<Type>>(
                          size_per_chunk, mmap_descriptor_));
    }

    // track validity (non-null) flags for a nullable column
    void
    append_valid_data(FieldId field_id) {
        valid_data_.emplace(field_id, std::make_unique<ThreadSafeValidData>());
    }

    // append a column of vector type
    template <typename VectorType>
    void
    append_field_data(FieldId field_id, int64_t dim, int64_t size_per_chunk) {
    append_data(FieldId field_id, int64_t dim, int64_t size_per_chunk) {
        static_assert(std::is_base_of_v<VectorTrait, VectorType>);
        fields_data_.emplace(field_id,
                             std::make_unique<ConcurrentVector<VectorType>>(
                                 dim, size_per_chunk, mmap_descriptor_));
        data_.emplace(field_id,
                      std::make_unique<ConcurrentVector<VectorType>>(
                          dim, size_per_chunk, mmap_descriptor_));
    }

    void
    drop_field_data(FieldId field_id) {
        fields_data_.erase(field_id);
        data_.erase(field_id);
        valid_data_.erase(field_id);
    }

    const ConcurrentVector<Timestamp>&

@ -740,7 +823,7 @@ struct InsertRecord {
        ack_responder_.clear();
        timestamp_index_ = TimestampIndex();
        pk2offset_->clear();
        fields_data_.clear();
        data_.clear();
    }

    bool

@ -762,7 +845,9 @@ struct InsertRecord {
    std::unique_ptr<OffsetMap> pk2offset_;

 private:
    std::unordered_map<FieldId, std::unique_ptr<VectorBase>> fields_data_{};
    std::unordered_map<FieldId, std::unique_ptr<VectorBase>> data_{};
    std::unordered_map<FieldId, std::unique_ptr<ThreadSafeValidData>>
        valid_data_{};
    mutable std::shared_mutex shared_mutex_{};
    storage::MmapChunkDescriptorPtr mmap_descriptor_;
};
@ -57,11 +57,11 @@ SegmentGrowingImpl::try_remove_chunks(FieldId fieldId) {
    if (indexing_record_.SyncDataWithIndex(fieldId)) {
        VectorBase* vec_data_base =
            dynamic_cast<segcore::ConcurrentVector<FloatVector>*>(
                insert_record_.get_field_data_base(fieldId));
                insert_record_.get_data_base(fieldId));
        if (!vec_data_base) {
            vec_data_base =
                dynamic_cast<segcore::ConcurrentVector<SparseFloatVector>*>(
                    insert_record_.get_field_data_base(fieldId));
                    insert_record_.get_data_base(fieldId));
        }
        if (vec_data_base && vec_data_base->num_chunk() > 0 &&
            chunk_mutex_.try_lock()) {

@ -105,11 +105,17 @@ SegmentGrowingImpl::Insert(int64_t reserved_offset,
                   fmt::format("can't find field {}", field_id.get()));
        auto data_offset = field_id_to_offset[field_id];
        if (!indexing_record_.SyncDataWithIndex(field_id)) {
            insert_record_.get_field_data_base(field_id)->set_data_raw(
            insert_record_.get_data_base(field_id)->set_data_raw(
                reserved_offset,
                num_rows,
                &insert_record_proto->fields_data(data_offset),
                field_meta);
            if (field_meta.is_nullable()) {
                insert_record_.get_valid_data(field_id)->set_data_raw(
                    num_rows,
                    &insert_record_proto->fields_data(data_offset),
                    field_meta);
            }
        }
        // insert vector data into index
        if (segcore_config_.get_enable_interim_segment_index()) {

@ -230,8 +236,12 @@ SegmentGrowingImpl::LoadFieldData(const LoadFieldDataInfo& infos) {
    }

    if (!indexing_record_.SyncDataWithIndex(field_id)) {
        insert_record_.get_field_data_base(field_id)->set_data_raw(
        insert_record_.get_data_base(field_id)->set_data_raw(
            reserved_offset, field_data);
        if (insert_record_.is_valid_data_exist(field_id)) {
            insert_record_.get_valid_data(field_id)->set_data_raw(
                field_data);
        }
    }
    if (segcore_config_.get_enable_interim_segment_index()) {
        auto offset = reserved_offset;

@ -318,7 +328,7 @@ SegmentGrowingImpl::LoadFieldDataV2(const LoadFieldDataInfo& infos) {
    }

    if (!indexing_record_.SyncDataWithIndex(field_id)) {
        insert_record_.get_field_data_base(field_id)->set_data_raw(
        insert_record_.get_data_base(field_id)->set_data_raw(
            reserved_offset, field_data);
    }
    if (segcore_config_.get_enable_interim_segment_index()) {

@ -420,7 +430,7 @@ SegmentGrowingImpl::LoadDeletedRecord(const LoadDeletedRecordInfo& info) {

SpanBase
SegmentGrowingImpl::chunk_data_impl(FieldId field_id, int64_t chunk_id) const {
    auto vec = get_insert_record().get_field_data_base(field_id);
    auto vec = get_insert_record().get_data_base(field_id);
    return vec->get_span_base(chunk_id);
}

@ -457,7 +467,7 @@ std::unique_ptr<DataArray>
SegmentGrowingImpl::bulk_subscript(FieldId field_id,
                                   const int64_t* seg_offsets,
                                   int64_t count) const {
    auto vec_ptr = insert_record_.get_field_data_base(field_id);
    auto vec_ptr = insert_record_.get_data_base(field_id);
    auto& field_meta = schema_->operator[](field_id);
    if (field_meta.is_vector()) {
        auto result = CreateVectorDataArray(count, field_meta);

@ -514,6 +524,14 @@ SegmentGrowingImpl::bulk_subscript(FieldId field_id,
    AssertInfo(!field_meta.is_vector(),
               "Scalar field meta type is vector type");
    auto result = CreateScalarDataArray(count, field_meta);
    if (field_meta.is_nullable()) {
        auto valid_data_ptr = insert_record_.get_valid_data(field_id);
        auto res = result->mutable_valid_data()->mutable_data();
        for (int64_t i = 0; i < count; ++i) {
            auto offset = seg_offsets[i];
            res[i] = valid_data_ptr->is_valid(offset);
        }
    }
    switch (field_meta.get_data_type()) {
        case DataType::BOOL: {
            bulk_subscript_impl<bool>(vec_ptr,
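Aside: the nullable branch in bulk_subscript above is a plain gather over segment offsets; a standalone sketch with illustrative names:

#include <cstdint>
#include <vector>

// Copy per-row validity for the requested segment offsets into `out`.
void GatherValidity(const std::vector<bool>& valid,
                    const int64_t* seg_offsets, int64_t count, bool* out) {
    for (int64_t i = 0; i < count; ++i) {
        out[i] = valid[seg_offsets[i]];
    }
}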
@ -443,7 +443,12 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) {
                        auto rawValue = field_data->RawValue(i);
                        auto array =
                            static_cast<const milvus::Array*>(rawValue);
                        var_column->Append(*array);
                        if (field_data->IsNullable()) {
                            var_column->Append(*array,
                                               field_data->is_valid(i));
                        } else {
                            var_column->Append(*array);
                        }

                        // we store the offset of each array element, so there is an additional uint64_t per element
                        field_data_size =

@ -480,7 +485,6 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) {
            FieldDataPtr field_data;
            while (data.channel->pop(field_data)) {
                column->AppendBatch(field_data);

                stats_.mem_size += field_data->Size();
            }
            LoadPrimitiveSkipIndex(

@ -550,18 +554,19 @@ SegmentSealedImpl::MapFieldData(const FieldId field_id, FieldDataInfo& data) {
    uint64_t total_written = 0;
    std::vector<uint64_t> indices{};
    std::vector<std::vector<uint64_t>> element_indices{};
    FixedVector<bool> valid_data{};
    while (data.channel->pop(field_data)) {
        WriteFieldData(file,
                       data_type,
                       field_data,
                       total_written,
                       indices,
                       element_indices);
                       element_indices,
                       valid_data);
    }
    WriteFieldPadding(file, data_type, total_written);

    auto num_rows = data.row_count;
    std::shared_ptr<ColumnBase> column{};
    auto num_rows = data.row_count;
    if (IsVariableDataType(data_type)) {
        switch (data_type) {
            case milvus::DataType::STRING:

@ -604,6 +609,8 @@ SegmentSealedImpl::MapFieldData(const FieldId field_id, FieldDataInfo& data) {
        column = std::make_shared<Column>(file, total_written, field_meta);
    }

    column->SetValidData(std::move(valid_data));

    {
        std::unique_lock lck(mutex_);
        fields_.emplace(field_id, column);

@ -712,7 +719,7 @@ SegmentSealedImpl::chunk_data_impl(FieldId field_id, int64_t chunk_id) const {
        auto& field_data = it->second;
        return field_data->Span();
    }
    auto field_data = insert_record_.get_field_data_base(field_id);
    auto field_data = insert_record_.get_data_base(field_id);
    AssertInfo(field_data->num_chunk() == 1,
               "num chunk not equal to 1 for sealed segment");
    return field_data->get_span_base(0);

@ -1236,6 +1243,13 @@ SegmentSealedImpl::get_raw_data(FieldId field_id,
    // to make sure it won't get released if segment released
    auto column = fields_.at(field_id);
    auto ret = fill_with_empty(field_id, count);
    if (column->IsNullable()) {
        auto dst = ret->mutable_valid_data()->mutable_data();
        for (int64_t i = 0; i < count; ++i) {
            auto offset = seg_offsets[i];
            dst[i] = column->IsValid(offset);
        }
    }
    switch (field_meta.get_data_type()) {
        case DataType::VARCHAR:
        case DataType::STRING: {
@ -232,6 +232,10 @@ CreateScalarDataArray(int64_t count, const FieldMeta& field_meta) {
    data_array->set_type(static_cast<milvus::proto::schema::DataType>(
        field_meta.get_data_type()));

    if (field_meta.is_nullable()) {
        data_array->mutable_valid_data()->Resize(count, false);
    }

    auto scalar_array = data_array->mutable_scalars();
    switch (data_type) {
        case DataType::BOOL: {

@ -360,6 +364,7 @@ CreateVectorDataArray(int64_t count, const FieldMeta& field_meta) {

std::unique_ptr<DataArray>
CreateScalarDataArrayFrom(const void* data_raw,
                          const void* valid_data,
                          int64_t count,
                          const FieldMeta& field_meta) {
    auto data_type = field_meta.get_data_type();

@ -367,6 +372,11 @@ CreateScalarDataArrayFrom(const void* data_raw,
    data_array->set_field_id(field_meta.get_id().get());
    data_array->set_type(static_cast<milvus::proto::schema::DataType>(
        field_meta.get_data_type()));
    if (field_meta.is_nullable()) {
        auto valid_data_ = reinterpret_cast<const bool*>(valid_data);
        auto obj = data_array->mutable_valid_data();
        obj->Add(valid_data_, valid_data_ + count);
    }

    auto scalar_array = data_array->mutable_scalars();
    switch (data_type) {

@ -517,12 +527,14 @@ CreateVectorDataArrayFrom(const void* data_raw,

std::unique_ptr<DataArray>
CreateDataArrayFrom(const void* data_raw,
                    const void* valid_data,
                    int64_t count,
                    const FieldMeta& field_meta) {
    auto data_type = field_meta.get_data_type();

    if (!IsVectorDataType(data_type)) {
        return CreateScalarDataArrayFrom(data_raw, count, field_meta);
        return CreateScalarDataArrayFrom(
            data_raw, valid_data, count, field_meta);
    }

    return CreateVectorDataArrayFrom(data_raw, count, field_meta);

@ -535,6 +547,7 @@ MergeDataArray(std::vector<MergeBase>& merge_bases,
    auto data_type = field_meta.get_data_type();
    auto data_array = std::make_unique<DataArray>();
    data_array->set_field_id(field_meta.get_id().get());
    auto nullable = field_meta.is_nullable();
    data_array->set_type(static_cast<milvus::proto::schema::DataType>(
        field_meta.get_data_type()));

@ -588,6 +601,12 @@ MergeDataArray(std::vector<MergeBase>& merge_bases,
            continue;
        }

        if (nullable) {
            auto data = src_field_data->valid_data().data();
            auto obj = data_array->mutable_valid_data();
            *(obj->Add()) = data[src_offset];
        }

        auto scalar_array = data_array->mutable_scalars();
        switch (data_type) {
            case DataType::BOOL: {

@ -781,6 +800,7 @@ LoadFieldDatasFromRemote2(std::shared_ptr<milvus_storage::Space> space,
            data->GetColumnByName(field.second.get_name().get());
        auto field_data = storage::CreateFieldData(
            field.second.get_data_type(),
            field.second.is_nullable(),
            field.second.is_vector() ? field.second.get_dim() : 0,
            total_num_rows);
        field_data->FillFieldData(col_data);
@ -63,6 +63,7 @@ CreateVectorDataArray(int64_t count, const FieldMeta& field_meta);

std::unique_ptr<DataArray>
CreateScalarDataArrayFrom(const void* data_raw,
                          const void* valid_data,
                          int64_t count,
                          const FieldMeta& field_meta);

@ -73,6 +74,7 @@ CreateVectorDataArrayFrom(const void* data_raw,

std::unique_ptr<DataArray>
CreateDataArrayFrom(const void* data_raw,
                    const void* valid_data,
                    int64_t count,
                    const FieldMeta& field_meta);
@ -379,7 +379,8 @@ LoadFieldRawData(CSegmentInterface c_segment,
            dim = field_meta.get_dim();
        }
    }
    auto field_data = milvus::storage::CreateFieldData(data_type, dim);
    auto field_data =
        milvus::storage::CreateFieldData(data_type, false, dim);
    field_data->FillFieldData(data, row_count);
    milvus::FieldDataChannelPtr channel =
        std::make_shared<milvus::FieldDataChannel>();
@ -112,7 +112,7 @@ ChunkCache::Mmap(const FieldDataPtr& field_data,
    uint64_t offset = 0;
    for (auto i = 0; i < field_data->get_num_rows(); ++i) {
        indices.push_back(offset);
        offset += field_data->Size(i);
        offset += field_data->DataSize(i);
    }
    auto sparse_column = std::make_shared<SparseFloatColumn>(
        data_size, dim, data_type, mcm_, descriptor);

@ -123,7 +123,7 @@ ChunkCache::Mmap(const FieldDataPtr& field_data,
            false, "TODO: unimplemented for variable data type: {}", data_type);
    } else {
        column = std::make_shared<Column>(
            data_size, dim, data_type, mcm_, descriptor);
            data_size, dim, data_type, mcm_, descriptor, field_data->IsNullable());
    }
    column->AppendBatch(field_data);
    return column;
@ -31,6 +31,10 @@ DeserializeRemoteFileData(BinlogReaderPtr reader) {
    DescriptorEvent descriptor_event(reader);
    DataType data_type =
        DataType(descriptor_event.event_data.fix_part.data_type);
    auto& extras = descriptor_event.event_data.extras;
    bool nullable = (extras.find(NULLABLE) != extras.end())
                        ? std::any_cast<bool>(extras[NULLABLE])
                        : false;
    auto descriptor_fix_part = descriptor_event.event_data.fix_part;
    FieldDataMeta data_meta{descriptor_fix_part.collection_id,
                            descriptor_fix_part.partition_id,
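Aside: the extras lookup above follows a common std::any pattern; a standalone sketch, where the "nullable" string stands in for the real NULLABLE key constant:

#include <any>
#include <iostream>
#include <map>
#include <string>

int main() {
    std::map<std::string, std::any> extras{{"nullable", true}};
    // Default to false when the key is absent (old binlogs have no flag).
    bool nullable = extras.count("nullable")
                        ? std::any_cast<bool>(extras.at("nullable"))
                        : false;
    std::cout << std::boolalpha << nullable << '\n';  // true
}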
@ -42,7 +46,7 @@ DeserializeRemoteFileData(BinlogReaderPtr reader) {
        auto event_data_length =
            header.event_length_ - GetEventHeaderSize(header);
        auto insert_event_data =
            InsertEventData(reader, event_data_length, data_type);
            InsertEventData(reader, event_data_length, data_type, nullable);
        auto insert_data =
            std::make_unique<InsertData>(insert_event_data.field_data);
        insert_data->SetFieldDataMeta(data_meta);

@ -54,7 +58,7 @@ DeserializeRemoteFileData(BinlogReaderPtr reader) {
        auto event_data_length =
            header.event_length_ - GetEventHeaderSize(header);
        auto index_event_data =
            IndexEventData(reader, event_data_length, data_type);
            IndexEventData(reader, event_data_length, data_type, nullable);
        auto field_data = index_event_data.field_data;
        // for compatibility with golang indexcode.Serialize, which sets dataType to String
        if (data_type == DataType::STRING) {

@ -63,7 +67,7 @@ DeserializeRemoteFileData(BinlogReaderPtr reader) {
            AssertInfo(
                field_data->get_num_rows() == 1,
                "wrong length of string num in old index binlog file");
            auto new_field_data = CreateFieldData(DataType::INT8);
            auto new_field_data = CreateFieldData(DataType::INT8, nullable);
            new_field_data->FillFieldData(
                (*static_cast<const std::string*>(field_data->RawValue(0)))
                    .c_str(),
@ -411,7 +411,7 @@ DiskFileManagerImpl::CacheRawDataToDisk(
        num_rows += total_num_rows;
        auto col_data = data->GetColumnByName(index_meta_.field_name);
        auto field_data = storage::CreateFieldData(
            index_meta_.field_type, index_meta_.dim, total_num_rows);
            index_meta_.field_type, false, index_meta_.dim, total_num_rows);
        field_data->FillFieldData(col_data);
        dim = field_data->get_dim();
        auto data_size =

@ -741,7 +741,7 @@ DiskFileManagerImpl::CacheOptFieldToDisk(
        }
        auto col_data = data->GetColumnByName(field_name);
        auto field_data =
            storage::CreateFieldData(field_type, 1, total_num_rows);
            storage::CreateFieldData(field_type, false, 1, total_num_rows);
        field_data->FillFieldData(col_data);
        field_datas.emplace_back(field_data);
    }
@ -209,7 +209,8 @@ DescriptorEventData::Serialize() {

BaseEventData::BaseEventData(BinlogReaderPtr reader,
                             int event_length,
                             DataType data_type) {
                             DataType data_type,
                             bool nullable) {
    auto ast = reader->Read(sizeof(start_timestamp), &start_timestamp);
    AssertInfo(ast.ok(), "read start timestamp failed");
    ast = reader->Read(sizeof(end_timestamp), &end_timestamp);

@ -220,7 +221,7 @@ BaseEventData::BaseEventData(BinlogReaderPtr reader,
    auto res = reader->Read(payload_length);
    AssertInfo(res.first.ok(), "read payload failed");
    auto payload_reader = std::make_shared<PayloadReader>(
        res.second.get(), payload_length, data_type);
        res.second.get(), payload_length, data_type, nullable);
    field_data = payload_reader->get_field_data();
}

@ -230,10 +231,11 @@ BaseEventData::Serialize() {
    std::shared_ptr<PayloadWriter> payload_writer;
    if (IsVectorDataType(data_type) &&
        !IsSparseFloatVectorDataType(data_type)) {
        payload_writer =
            std::make_unique<PayloadWriter>(data_type, field_data->get_dim());
        payload_writer = std::make_unique<PayloadWriter>(
            data_type, field_data->get_dim(), field_data->IsNullable());
    } else {
        payload_writer = std::make_unique<PayloadWriter>(data_type);
        payload_writer = std::make_unique<PayloadWriter>(
            data_type, field_data->IsNullable());
    }
    switch (data_type) {
        case DataType::VARCHAR:

@ -242,8 +244,8 @@ BaseEventData::Serialize() {
                 ++offset) {
                auto str = static_cast<const std::string*>(
                    field_data->RawValue(offset));
                payload_writer->add_one_string_payload(str->c_str(),
                                                       str->size());
                auto size = field_data->is_valid(offset) ? str->size() : -1;
                payload_writer->add_one_string_payload(str->c_str(), size);
            }
            break;
        }

@ -253,10 +255,12 @@ BaseEventData::Serialize() {
                auto array =
                    static_cast<const Array*>(field_data->RawValue(offset));
                auto array_string = array->output_data().SerializeAsString();
                auto size =
                    field_data->is_valid(offset) ? array_string.size() : -1;

                payload_writer->add_one_binary_payload(
                    reinterpret_cast<const uint8_t*>(array_string.c_str()),
                    array_string.size());
                    size);
            }
            break;
        }

@ -289,8 +293,10 @@ BaseEventData::Serialize() {
            auto payload =
                Payload{data_type,
                        static_cast<const uint8_t*>(field_data->Data()),
                        field_data->ValidData(),
                        field_data->get_num_rows(),
                        field_data->get_dim()};
                        field_data->get_dim(),
                        field_data->IsNullable()};
            payload_writer->add_payload(payload);
        }
    }

@ -310,11 +316,13 @@ BaseEventData::Serialize() {
    return res;
}

BaseEvent::BaseEvent(BinlogReaderPtr reader, DataType data_type) {
BaseEvent::BaseEvent(BinlogReaderPtr reader,
                     DataType data_type,
                     bool nullable) {
    event_header = EventHeader(reader);
    auto event_data_length =
        event_header.event_length_ - GetEventHeaderSize(event_header);
    event_data = BaseEventData(reader, event_data_length, data_type);
    event_data = BaseEventData(reader, event_data_length, data_type, nullable);
}

std::vector<uint8_t>

@ -370,8 +378,9 @@ std::vector<uint8_t>
LocalInsertEvent::Serialize() {
    int row_num = field_data->get_num_rows();
    int dimension = field_data->get_dim();
    int payload_size = field_data->Size();
    int len = sizeof(row_num) + sizeof(dimension) + payload_size;
    int data_size = field_data->DataSize();
    int valid_data_size = field_data->ValidDataSize();
    int len = sizeof(row_num) + sizeof(dimension) + data_size + valid_data_size;

    std::vector<uint8_t> res(len);
    int offset = 0;

@ -379,8 +388,9 @@ LocalInsertEvent::Serialize() {
    offset += sizeof(row_num);
    memcpy(res.data() + offset, &dimension, sizeof(dimension));
    offset += sizeof(dimension);
    memcpy(res.data() + offset, field_data->Data(), payload_size);

    memcpy(res.data() + offset, field_data->Data(), data_size);
    offset += data_size;
    memcpy(res.data() + offset, field_data->ValidData(), valid_data_size);
    return res;
}
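Aside: a standalone sketch of the serialized LocalInsertEvent layout after this change, | row_num | dimension | data bytes | valid-data bytes | (illustrative, not the Milvus struct):

#include <cstdint>
#include <cstring>
#include <vector>

std::vector<uint8_t> SerializeEvent(int row_num, int dimension,
                                    const std::vector<uint8_t>& data,
                                    const std::vector<uint8_t>& valid) {
    std::vector<uint8_t> res(sizeof(row_num) + sizeof(dimension) +
                             data.size() + valid.size());
    size_t off = 0;
    std::memcpy(res.data() + off, &row_num, sizeof(row_num));
    off += sizeof(row_num);
    std::memcpy(res.data() + off, &dimension, sizeof(dimension));
    off += sizeof(dimension);
    std::memcpy(res.data() + off, data.data(), data.size());
    off += data.size();
    // validity bytes are appended after the raw data
    std::memcpy(res.data() + off, valid.data(), valid.size());
    return res;
}

int main() {
    auto buf = SerializeEvent(3, 1, {1, 2, 3}, {1, 0, 1});
    return buf.size() == sizeof(int) * 2 + 6 ? 0 : 1;
}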
@ -393,7 +403,7 @@ LocalIndexEvent::LocalIndexEvent(BinlogReaderPtr reader) {
    auto res = reader->Read(index_size);
    AssertInfo(res.first.ok(), "read payload failed");
    auto payload_reader = std::make_shared<PayloadReader>(
        res.second.get(), index_size, DataType::INT8);
        res.second.get(), index_size, DataType::INT8, false);
    field_data = payload_reader->get_field_data();
}
@ -80,7 +80,8 @@ struct BaseEventData {
    BaseEventData() = default;
    explicit BaseEventData(BinlogReaderPtr reader,
                           int event_length,
                           DataType data_type);
                           DataType data_type,
                           bool nullable);

    std::vector<uint8_t>
    Serialize();

@ -103,7 +104,9 @@ struct BaseEvent {
    int64_t event_offset;

    BaseEvent() = default;
    explicit BaseEvent(BinlogReaderPtr reader, DataType data_type);
    explicit BaseEvent(BinlogReaderPtr reader,
                       DataType data_type,
                       bool nullable);

    std::vector<uint8_t>
    Serialize();
@ -69,7 +69,7 @@ InsertData::serialize_to_remote_file() {
    }
    des_event_data.extras[ORIGIN_SIZE_KEY] =
        std::to_string(field_data_->Size());
    //(todo:smellthemoon) set nullable
    des_event_data.extras[NULLABLE] = field_data_->IsNullable();

    auto& des_event_header = descriptor_event.event_header;
    // TODO :: set timestamp
@ -27,8 +27,9 @@ namespace milvus::storage {

PayloadReader::PayloadReader(const uint8_t* data,
                             int length,
                             DataType data_type)
    : column_type_(data_type) {
                             DataType data_type,
                             bool nullable)
    : column_type_(data_type), nullable(nullable) {
    auto input = std::make_shared<arrow::io::BufferReader>(data, length);
    init(input);
}

@ -72,7 +73,7 @@ PayloadReader::init(std::shared_ptr<arrow::io::BufferReader> input) {
    st = arrow_reader->GetRecordBatchReader(&rb_reader);
    AssertInfo(st.ok(), "get record batch reader");

    field_data_ = CreateFieldData(column_type_, dim_, total_num_rows);
    field_data_ = CreateFieldData(column_type_, nullable, dim_, total_num_rows);
    for (arrow::Result<std::shared_ptr<arrow::RecordBatch>> maybe_batch :
         *rb_reader) {
        AssertInfo(maybe_batch.ok(), "get batch record success");
@ -26,7 +26,10 @@ namespace milvus::storage {

class PayloadReader {
 public:
    explicit PayloadReader(const uint8_t* data, int length, DataType data_type);
    explicit PayloadReader(const uint8_t* data,
                           int length,
                           DataType data_type,
                           bool nullable);

    ~PayloadReader() = default;

@ -41,6 +44,7 @@ class PayloadReader {
 private:
    DataType column_type_;
    int dim_;
    bool nullable;
    FieldDataPtr field_data_;
};
@ -32,8 +32,10 @@ class PayloadInputStream;
struct Payload {
    DataType data_type;
    const uint8_t* raw_data;
    int64_t rows;
    const uint8_t* valid_data;
    const int64_t rows;
    std::optional<int> dimension;
    bool nullable;
};

class PayloadOutputStream : public arrow::io::OutputStream {
@ -23,18 +23,19 @@
namespace milvus::storage {

// create payload writer for numeric data type
PayloadWriter::PayloadWriter(const DataType column_type)
    : column_type_(column_type) {
PayloadWriter::PayloadWriter(const DataType column_type, bool nullable)
    : column_type_(column_type), nullable_(nullable) {
    builder_ = CreateArrowBuilder(column_type);
    schema_ = CreateArrowSchema(column_type);
    schema_ = CreateArrowSchema(column_type, nullable);
}

// create payload writer for vector data type
PayloadWriter::PayloadWriter(const DataType column_type, int dim)
    : column_type_(column_type) {
PayloadWriter::PayloadWriter(const DataType column_type, int dim, bool nullable)
    : column_type_(column_type), nullable_(nullable) {
    AssertInfo(column_type != DataType::VECTOR_SPARSE_FLOAT,
               "PayloadWriter for Sparse Float Vector should be created "
               "using the constructor without dimension");
    AssertInfo(nullable == false, "only scalar types support null now");
    init_dimension(dim);
}

@ -48,7 +49,7 @@ PayloadWriter::init_dimension(int dim) {

    dimension_ = dim;
    builder_ = CreateArrowBuilder(column_type_, dim);
    schema_ = CreateArrowSchema(column_type_, dim);
    schema_ = CreateArrowSchema(column_type_, dim, nullable_);
}

void
@ -25,8 +25,8 @@
namespace milvus::storage {
class PayloadWriter {
 public:
    explicit PayloadWriter(const DataType column_type);
    explicit PayloadWriter(const DataType column_type, int dim);
    explicit PayloadWriter(const DataType column_type, int dim, bool nullable);
    explicit PayloadWriter(const DataType column_type, bool nullable);
    ~PayloadWriter() = default;

    void

@ -58,6 +58,7 @@ class PayloadWriter {

 private:
    DataType column_type_;
    bool nullable_;
    std::shared_ptr<arrow::ArrayBuilder> builder_;
    std::shared_ptr<arrow::Schema> schema_;
    std::shared_ptr<PayloadOutputStream> output_;
@ -75,6 +75,18 @@ std::map<std::string, int> ReadAheadPolicy_Map = {
    {"willneed", MADV_WILLNEED},
    {"dontneed", MADV_DONTNEED}};

// in arrow, the null_bitmap is read from the least significant bit
std::vector<uint8_t>
genValidIter(const uint8_t* valid_data, int length) {
    std::vector<uint8_t> valid_data_;
    valid_data_.reserve(length);
    for (size_t i = 0; i < length; ++i) {
        auto bit = (valid_data[i >> 3] >> (i & 0x07)) & 1;
        valid_data_.push_back(bit);
    }
    return valid_data_;
}
StorageType
ReadMediumType(BinlogReaderPtr reader) {
    AssertInfo(reader->Tell() == 0,

@ -106,12 +118,22 @@ template <typename DT, typename BT>
void
add_numeric_payload(std::shared_ptr<arrow::ArrayBuilder> builder,
                    DT* start,
                    const uint8_t* valid_data,
                    bool nullable,
                    int length) {
    AssertInfo(builder != nullptr, "empty arrow builder");
    auto numeric_builder = std::dynamic_pointer_cast<BT>(builder);
    auto ast = numeric_builder->AppendValues(start, start + length);
    AssertInfo(
        ast.ok(), "append value to arrow builder failed: {}", ast.ToString());
    arrow::Status ast;
    if (nullable) {
        // need an iterator over the valid_data bits when writing
        auto iter = genValidIter(valid_data, length);
        ast =
            numeric_builder->AppendValues(start, start + length, iter.begin());
        AssertInfo(ast.ok(), "append value to arrow builder failed");
    } else {
        ast = numeric_builder->AppendValues(start, start + length);
        AssertInfo(ast.ok(), "append value to arrow builder failed");
    }
}
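Aside: a hedged sketch of the Arrow-side effect, written against public Arrow APIs only (requires the Arrow C++ dev package): appending values together with per-element validity yields an array whose null bitmap Arrow manages.

#include <arrow/api.h>
#include <iostream>
#include <vector>

int main() {
    arrow::Int64Builder builder;
    // second element is marked invalid, i.e. null
    auto st = builder.AppendValues({1, 2, 3},
                                   std::vector<bool>{true, false, true});
    if (!st.ok()) return 1;
    std::shared_ptr<arrow::Array> array;
    if (!builder.Finish(&array).ok()) return 1;
    std::cout << "null_count = " << array->null_count() << '\n';  // 1
}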
void

@ -121,48 +143,49 @@ AddPayloadToArrowBuilder(std::shared_ptr<arrow::ArrayBuilder> builder,
    auto raw_data = const_cast<uint8_t*>(payload.raw_data);
    auto length = payload.rows;
    auto data_type = payload.data_type;
    auto nullable = payload.nullable;

    switch (data_type) {
        case DataType::BOOL: {
            auto bool_data = reinterpret_cast<bool*>(raw_data);
            add_numeric_payload<bool, arrow::BooleanBuilder>(
                builder, bool_data, length);
                builder, bool_data, payload.valid_data, nullable, length);
            break;
        }
        case DataType::INT8: {
            auto int8_data = reinterpret_cast<int8_t*>(raw_data);
            add_numeric_payload<int8_t, arrow::Int8Builder>(
                builder, int8_data, length);
                builder, int8_data, payload.valid_data, nullable, length);
            break;
        }
        case DataType::INT16: {
            auto int16_data = reinterpret_cast<int16_t*>(raw_data);
            add_numeric_payload<int16_t, arrow::Int16Builder>(
                builder, int16_data, length);
                builder, int16_data, payload.valid_data, nullable, length);
            break;
        }
        case DataType::INT32: {
            auto int32_data = reinterpret_cast<int32_t*>(raw_data);
            add_numeric_payload<int32_t, arrow::Int32Builder>(
                builder, int32_data, length);
                builder, int32_data, payload.valid_data, nullable, length);
            break;
        }
        case DataType::INT64: {
            auto int64_data = reinterpret_cast<int64_t*>(raw_data);
            add_numeric_payload<int64_t, arrow::Int64Builder>(
                builder, int64_data, length);
                builder, int64_data, payload.valid_data, nullable, length);
            break;
        }
        case DataType::FLOAT: {
            auto float_data = reinterpret_cast<float*>(raw_data);
            add_numeric_payload<float, arrow::FloatBuilder>(
                builder, float_data, length);
                builder, float_data, payload.valid_data, nullable, length);
            break;
        }
        case DataType::DOUBLE: {
            auto double_data = reinterpret_cast<double_t*>(raw_data);
            add_numeric_payload<double, arrow::DoubleBuilder>(
                builder, double_data, length);
                builder, double_data, payload.valid_data, nullable, length);
            break;
        }
        case DataType::VECTOR_FLOAT16:
@ -292,40 +315,50 @@ CreateArrowBuilder(DataType data_type, int dim) {
}

std::shared_ptr<arrow::Schema>
CreateArrowSchema(DataType data_type) {
CreateArrowSchema(DataType data_type, bool nullable) {
    switch (static_cast<DataType>(data_type)) {
        case DataType::BOOL: {
            return arrow::schema({arrow::field("val", arrow::boolean())});
            return arrow::schema(
                {arrow::field("val", arrow::boolean(), nullable)});
        }
        case DataType::INT8: {
            return arrow::schema({arrow::field("val", arrow::int8())});
            return arrow::schema(
                {arrow::field("val", arrow::int8(), nullable)});
        }
        case DataType::INT16: {
            return arrow::schema({arrow::field("val", arrow::int16())});
            return arrow::schema(
                {arrow::field("val", arrow::int16(), nullable)});
        }
        case DataType::INT32: {
            return arrow::schema({arrow::field("val", arrow::int32())});
            return arrow::schema(
                {arrow::field("val", arrow::int32(), nullable)});
        }
        case DataType::INT64: {
            return arrow::schema({arrow::field("val", arrow::int64())});
            return arrow::schema(
                {arrow::field("val", arrow::int64(), nullable)});
        }
        case DataType::FLOAT: {
            return arrow::schema({arrow::field("val", arrow::float32())});
            return arrow::schema(
                {arrow::field("val", arrow::float32(), nullable)});
        }
        case DataType::DOUBLE: {
            return arrow::schema({arrow::field("val", arrow::float64())});
            return arrow::schema(
                {arrow::field("val", arrow::float64(), nullable)});
        }
        case DataType::VARCHAR:
        case DataType::STRING: {
            return arrow::schema({arrow::field("val", arrow::utf8())});
            return arrow::schema(
                {arrow::field("val", arrow::utf8(), nullable)});
        }
        case DataType::ARRAY:
        case DataType::JSON: {
            return arrow::schema({arrow::field("val", arrow::binary())});
            return arrow::schema(
                {arrow::field("val", arrow::binary(), nullable)});
        }
        // sparse float vector doesn't require a dim
        case DataType::VECTOR_SPARSE_FLOAT: {
            return arrow::schema({arrow::field("val", arrow::binary())});
            return arrow::schema(
                {arrow::field("val", arrow::binary(), nullable)});
        }
        default: {
            PanicInfo(
@ -335,30 +368,37 @@ CreateArrowSchema(DataType data_type) {
}

std::shared_ptr<arrow::Schema>
CreateArrowSchema(DataType data_type, int dim) {
CreateArrowSchema(DataType data_type, int dim, bool nullable) {
    switch (static_cast<DataType>(data_type)) {
        case DataType::VECTOR_FLOAT: {
            AssertInfo(dim > 0, "invalid dim value: {}", dim);
            return arrow::schema({arrow::field(
                "val", arrow::fixed_size_binary(dim * sizeof(float)))});
            return arrow::schema(
                {arrow::field("val",
                              arrow::fixed_size_binary(dim * sizeof(float)),
                              nullable)});
        }
        case DataType::VECTOR_BINARY: {
            AssertInfo(dim % 8 == 0 && dim > 0, "invalid dim value: {}", dim);
            return arrow::schema(
                {arrow::field("val", arrow::fixed_size_binary(dim / 8))});
            return arrow::schema({arrow::field(
                "val", arrow::fixed_size_binary(dim / 8), nullable)});
        }
        case DataType::VECTOR_FLOAT16: {
            AssertInfo(dim > 0, "invalid dim value: {}", dim);
            return arrow::schema({arrow::field(
                "val", arrow::fixed_size_binary(dim * sizeof(float16)))});
            return arrow::schema(
                {arrow::field("val",
                              arrow::fixed_size_binary(dim * sizeof(float16)),
                              nullable)});
        }
        case DataType::VECTOR_BFLOAT16: {
            AssertInfo(dim > 0, "invalid dim value");
            return arrow::schema({arrow::field(
                "val", arrow::fixed_size_binary(dim * sizeof(bfloat16)))});
            return arrow::schema(
                {arrow::field("val",
                              arrow::fixed_size_binary(dim * sizeof(bfloat16)),
                              nullable)});
        }
        case DataType::VECTOR_SPARSE_FLOAT: {
            return arrow::schema({arrow::field("val", arrow::binary())});
            return arrow::schema(
                {arrow::field("val", arrow::binary(), nullable)});
        }
        default: {
            PanicInfo(
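Aside: what the new nullable argument changes at the Arrow level: arrow::field records nullability in the schema. A small sketch against the public Arrow API (requires the Arrow C++ dev package):

#include <arrow/api.h>
#include <iostream>

int main() {
    auto nullable_field = arrow::field("val", arrow::int64(), /*nullable=*/true);
    auto required_field = arrow::field("val", arrow::int64(), /*nullable=*/false);
    std::cout << nullable_field->ToString() << '\n';  // typically "val: int64"
    std::cout << required_field->ToString() << '\n';  // typically "val: int64 not null"
}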
@ -499,7 +539,7 @@ EncodeAndUploadIndexSlice(ChunkManager* chunk_manager,
|
|||
IndexMeta index_meta,
|
||||
FieldDataMeta field_meta,
|
||||
std::string object_key) {
|
||||
auto field_data = CreateFieldData(DataType::INT8);
|
||||
auto field_data = CreateFieldData(DataType::INT8, false);
|
||||
field_data->FillFieldData(buf, batch_size);
|
||||
auto indexData = std::make_shared<IndexData>(field_data);
|
||||
indexData->set_index_meta(index_meta);
|
||||
|
@ -518,7 +558,8 @@ EncodeAndUploadIndexSlice2(std::shared_ptr<milvus_storage::Space> space,
|
|||
IndexMeta index_meta,
|
||||
FieldDataMeta field_meta,
|
||||
std::string object_key) {
|
||||
auto field_data = CreateFieldData(DataType::INT8);
|
||||
// todo: support nullable index
|
||||
auto field_data = CreateFieldData(DataType::INT8, false);
|
||||
field_data->FillFieldData(buf, batch_size);
|
||||
auto indexData = std::make_shared<IndexData>(field_data);
|
||||
indexData->set_index_meta(index_meta);
|
||||
|
@ -542,7 +583,8 @@ EncodeAndUploadFieldSlice(ChunkManager* chunk_manager,
|
|||
auto dim = IsSparseFloatVectorDataType(field_meta.get_data_type())
|
||||
? -1
|
||||
: field_meta.get_dim();
|
||||
auto field_data = CreateFieldData(field_meta.get_data_type(), dim, 0);
|
||||
auto field_data = CreateFieldData(
|
||||
field_meta.get_data_type(), field_meta.is_nullable(), dim, 0);
|
||||
field_data->FillFieldData(buf, element_count);
|
||||
auto insertData = std::make_shared<InsertData>(field_data);
|
||||
insertData->SetFieldDataMeta(field_data_meta);
|
||||
|
@@ -779,30 +821,42 @@ CreateChunkManager(const StorageConfig& storage_config) {
 }
 
 FieldDataPtr
-CreateFieldData(const DataType& type, int64_t dim, int64_t total_num_rows) {
+CreateFieldData(const DataType& type,
+                bool nullable,
+                int64_t dim,
+                int64_t total_num_rows) {
     switch (type) {
         case DataType::BOOL:
-            return std::make_shared<FieldData<bool>>(type, total_num_rows);
+            return std::make_shared<FieldData<bool>>(
+                type, nullable, total_num_rows);
         case DataType::INT8:
-            return std::make_shared<FieldData<int8_t>>(type, total_num_rows);
+            return std::make_shared<FieldData<int8_t>>(
+                type, nullable, total_num_rows);
         case DataType::INT16:
-            return std::make_shared<FieldData<int16_t>>(type, total_num_rows);
+            return std::make_shared<FieldData<int16_t>>(
+                type, nullable, total_num_rows);
         case DataType::INT32:
-            return std::make_shared<FieldData<int32_t>>(type, total_num_rows);
+            return std::make_shared<FieldData<int32_t>>(
+                type, nullable, total_num_rows);
         case DataType::INT64:
-            return std::make_shared<FieldData<int64_t>>(type, total_num_rows);
+            return std::make_shared<FieldData<int64_t>>(
+                type, nullable, total_num_rows);
         case DataType::FLOAT:
-            return std::make_shared<FieldData<float>>(type, total_num_rows);
+            return std::make_shared<FieldData<float>>(
+                type, nullable, total_num_rows);
         case DataType::DOUBLE:
-            return std::make_shared<FieldData<double>>(type, total_num_rows);
+            return std::make_shared<FieldData<double>>(
+                type, nullable, total_num_rows);
         case DataType::STRING:
         case DataType::VARCHAR:
-            return std::make_shared<FieldData<std::string>>(type,
-                                                            total_num_rows);
+            return std::make_shared<FieldData<std::string>>(
+                type, nullable, total_num_rows);
         case DataType::JSON:
-            return std::make_shared<FieldData<Json>>(type, total_num_rows);
+            return std::make_shared<FieldData<Json>>(
+                type, nullable, total_num_rows);
         case DataType::ARRAY:
-            return std::make_shared<FieldData<Array>>(type, total_num_rows);
+            return std::make_shared<FieldData<Array>>(
+                type, nullable, total_num_rows);
         case DataType::VECTOR_FLOAT:
            return std::make_shared<FieldData<FloatVector>>(
                 dim, type, total_num_rows);
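
Note: taken together, the new CreateFieldData parameter and the two FillFieldData overloads give the following call pattern. A minimal sketch, assuming the storage/Util.h entry points shown in this diff (the data and bitmap values are illustrative):

    #include <cstdint>
    #include "storage/Util.h"

    using namespace milvus;

    void fill_sketch() {
        int64_t values[] = {1, 2, 3, 4, 5};

        // Non-nullable column: the 2-argument FillFieldData, no bitmap.
        auto plain = storage::CreateFieldData(DataType::INT64, false);
        plain->FillFieldData(values, 5);

        // Nullable column: the 3-argument FillFieldData takes an LSB-first
        // validity bitmap, one bit per row (one byte covers eight rows).
        auto nullable = storage::CreateFieldData(DataType::INT64, true);
        uint8_t valid[] = {0x1B};  // rows 0, 1, 3, 4 valid; row 2 is null
        nullable->FillFieldData(values, valid, 5);
    }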
@@ -859,11 +913,16 @@ MergeFieldData(std::vector<FieldDataPtr>& data_array) {
     for (const auto& data : data_array) {
         total_length += data->Length();
     }
 
-    auto merged_data = storage::CreateFieldData(data_array[0]->get_data_type());
+    auto merged_data = storage::CreateFieldData(data_array[0]->get_data_type(),
+                                                data_array[0]->IsNullable());
     merged_data->Reserve(total_length);
     for (const auto& data : data_array) {
-        merged_data->FillFieldData(data->Data(), data->Length());
+        if (merged_data->IsNullable()) {
+            merged_data->FillFieldData(
+                data->Data(), data->ValidData(), data->Length());
+        } else {
+            merged_data->FillFieldData(data->Data(), data->Length());
+        }
     }
     return merged_data;
 }
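
Note: with the branch above in place, merging preserves nullability. A hedged usage sketch (chunk_a and chunk_b are hypothetical FieldDataPtr values built as in the previous sketch):

    // Concatenate two chunks of the same field; if the first chunk is
    // nullable, the merged column is created nullable and the validity
    // bytes are copied along with the payload.
    std::vector<milvus::FieldDataPtr> chunks = {chunk_a, chunk_b};
    auto merged = milvus::storage::MergeFieldData(chunks);
    // merged->Length() == chunk_a->Length() + chunk_b->Length()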
@@ -58,10 +58,10 @@ std::shared_ptr<arrow::ArrayBuilder>
 CreateArrowBuilder(DataType data_type, int dim);
 
 std::shared_ptr<arrow::Schema>
-CreateArrowSchema(DataType data_type);
+CreateArrowSchema(DataType data_type, bool nullable);
 
 std::shared_ptr<arrow::Schema>
-CreateArrowSchema(DataType data_type, int dim);
+CreateArrowSchema(DataType data_type, int dim, bool nullable);
 
 int
 GetDimensionFromFileMetaData(const parquet::ColumnDescriptor* schema,

@@ -156,6 +156,7 @@ CreateChunkManager(const StorageConfig& storage_config);
 
 FieldDataPtr
 CreateFieldData(const DataType& type,
+                bool nullable = false,
                 int64_t dim = 1,
                 int64_t total_num_rows = 0);
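
Note: because the flag was inserted ahead of dim, an old positional call such as CreateFieldData(type, dim) would now silently bind dim to the bool parameter, which is why every call site in the test diffs below spells the flag out even though it defaults to false. For illustration:

    // before: auto fd = storage::CreateFieldData(DataType::VECTOR_FLOAT, dim);
    // after:  the second position is the nullable flag
    auto fd = storage::CreateFieldData(DataType::VECTOR_FLOAT, false, dim);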
@@ -815,8 +815,11 @@ TEST(Expr, PraseArrayContainsExpr) {
     auto schema = std::make_shared<Schema>();
     schema->AddDebugField(
         "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2);
-    schema->AddField(
-        FieldName("array"), FieldId(101), DataType::ARRAY, DataType::INT64);
+    schema->AddField(FieldName("array"),
+                     FieldId(101),
+                     DataType::ARRAY,
+                     DataType::INT64,
+                     false);
     auto plan =
         CreateSearchPlanByExpr(*schema, plan_str.data(), plan_str.size());
 }
@@ -35,21 +35,21 @@ GenTestSchema() {
     schema_->set_primary_field_id(pk);
 
     if constexpr (std::is_same_v<T, bool>) {
-        schema_->AddDebugArrayField("array", DataType::BOOL);
+        schema_->AddDebugArrayField("array", DataType::BOOL, false);
     } else if constexpr (std::is_same_v<T, int8_t>) {
-        schema_->AddDebugArrayField("array", DataType::INT8);
+        schema_->AddDebugArrayField("array", DataType::INT8, false);
     } else if constexpr (std::is_same_v<T, int16_t>) {
-        schema_->AddDebugArrayField("array", DataType::INT16);
+        schema_->AddDebugArrayField("array", DataType::INT16, false);
     } else if constexpr (std::is_same_v<T, int32_t>) {
-        schema_->AddDebugArrayField("array", DataType::INT32);
+        schema_->AddDebugArrayField("array", DataType::INT32, false);
     } else if constexpr (std::is_same_v<T, int64_t>) {
-        schema_->AddDebugArrayField("array", DataType::INT64);
+        schema_->AddDebugArrayField("array", DataType::INT64, false);
     } else if constexpr (std::is_same_v<T, float>) {
-        schema_->AddDebugArrayField("array", DataType::FLOAT);
+        schema_->AddDebugArrayField("array", DataType::FLOAT, false);
     } else if constexpr (std::is_same_v<T, double>) {
-        schema_->AddDebugArrayField("array", DataType::DOUBLE);
+        schema_->AddDebugArrayField("array", DataType::DOUBLE, false);
     } else if constexpr (std::is_same_v<T, std::string>) {
-        schema_->AddDebugArrayField("array", DataType::VARCHAR);
+        schema_->AddDebugArrayField("array", DataType::VARCHAR, false);
     }
 
     return schema_;
@@ -72,7 +72,7 @@ class BinlogIndexTest : public ::testing::TestWithParam<Param> {
         schema->AddDebugField("fakevec", data_type, data_d, metric_type);
         auto i64_fid = schema->AddDebugField("counter", DataType::INT64);
         schema->set_primary_field_id(i64_fid);
-        vec_field_data = storage::CreateFieldData(data_type, data_d);
+        vec_field_data = storage::CreateFieldData(data_type, false, data_d);
 
         if (data_type == DataType::VECTOR_FLOAT) {
             auto vec_data = GenRandomFloatVecData(data_n, data_d);

@@ -123,9 +123,9 @@ class BinlogIndexTest : public ::testing::TestWithParam<Param> {
         // load id
         LoadFieldDataInfo row_id_info;
         FieldMeta row_id_field_meta(
-            FieldName("RowID"), RowFieldID, DataType::INT64);
-        auto field_data =
-            std::make_shared<milvus::FieldData<int64_t>>(DataType::INT64);
+            FieldName("RowID"), RowFieldID, DataType::INT64, false);
+        auto field_data = std::make_shared<milvus::FieldData<int64_t>>(
+            DataType::INT64, false);
         field_data->FillFieldData(dataset.row_ids_.data(), data_n);
         auto field_data_info = FieldDataInfo{
             RowFieldID.get(), data_n, std::vector<FieldDataPtr>{field_data}};

@@ -133,9 +133,9 @@ class BinlogIndexTest : public ::testing::TestWithParam<Param> {
         // load ts
         LoadFieldDataInfo ts_info;
         FieldMeta ts_field_meta(
-            FieldName("Timestamp"), TimestampFieldID, DataType::INT64);
-        field_data =
-            std::make_shared<milvus::FieldData<int64_t>>(DataType::INT64);
+            FieldName("Timestamp"), TimestampFieldID, DataType::INT64, false);
+        field_data = std::make_shared<milvus::FieldData<int64_t>>(
+            DataType::INT64, false);
         field_data->FillFieldData(dataset.timestamps_.data(), data_n);
         field_data_info = FieldDataInfo{TimestampFieldID.get(),
                                         data_n,
@@ -1011,6 +1011,74 @@ TEST(CApiTest, DeleteRepeatedPksFromSealedSegment) {
     DeleteSegment(segment);
 }
 
+TEST(CApiTest, SearcTestWhenNullable) {
+    auto c_collection = NewCollection(get_default_schema_config_nullable());
+    CSegmentInterface segment;
+    auto status = NewSegment(c_collection, Growing, -1, &segment);
+    ASSERT_EQ(status.error_code, Success);
+    auto col = (milvus::segcore::Collection*)c_collection;
+
+    int N = 10000;
+    auto dataset = DataGen(col->get_schema(), N);
+    int64_t ts_offset = 1000;
+
+    int64_t offset;
+    PreInsert(segment, N, &offset);
+
+    auto insert_data = serialize(dataset.raw_);
+    auto ins_res = Insert(segment,
+                          offset,
+                          N,
+                          dataset.row_ids_.data(),
+                          dataset.timestamps_.data(),
+                          insert_data.data(),
+                          insert_data.size());
+    ASSERT_EQ(ins_res.error_code, Success);
+
+    milvus::proto::plan::PlanNode plan_node;
+    auto vector_anns = plan_node.mutable_vector_anns();
+    vector_anns->set_vector_type(milvus::proto::plan::VectorType::FloatVector);
+    vector_anns->set_placeholder_tag("$0");
+    vector_anns->set_field_id(100);
+    auto query_info = vector_anns->mutable_query_info();
+    query_info->set_topk(10);
+    query_info->set_round_decimal(3);
+    query_info->set_metric_type("L2");
+    query_info->set_search_params(R"({"nprobe": 10})");
+    auto plan_str = plan_node.SerializeAsString();
+
+    int num_queries = 10;
+    auto blob = generate_query_data(num_queries);
+
+    void* plan = nullptr;
+    status = CreateSearchPlanByExpr(
+        c_collection, plan_str.data(), plan_str.size(), &plan);
+    ASSERT_EQ(status.error_code, Success);
+
+    void* placeholderGroup = nullptr;
+    status = ParsePlaceholderGroup(
+        plan, blob.data(), blob.length(), &placeholderGroup);
+    ASSERT_EQ(status.error_code, Success);
+
+    std::vector<CPlaceholderGroup> placeholderGroups;
+    placeholderGroups.push_back(placeholderGroup);
+
+    CSearchResult search_result;
+    auto res = CSearch(segment, plan, placeholderGroup, {}, &search_result);
+    ASSERT_EQ(res.error_code, Success);
+
+    CSearchResult search_result2;
+    auto res2 = CSearch(segment, plan, placeholderGroup, {}, &search_result2);
+    ASSERT_EQ(res2.error_code, Success);
+
+    DeleteSearchPlan(plan);
+    DeletePlaceholderGroup(placeholderGroup);
+    DeleteSearchResult(search_result);
+    DeleteSearchResult(search_result2);
+    DeleteCollection(c_collection);
+    DeleteSegment(segment);
+}
+
 TEST(CApiTest, InsertSamePkAfterDeleteOnGrowingSegment) {
     auto collection = NewCollection(get_default_schema_config());
     CSegmentInterface segment;
@@ -4238,8 +4306,9 @@ TEST(CApiTest, SealedSegment_Update_Field_Size) {
 
 TEST(CApiTest, GrowingSegment_Load_Field_Data) {
     auto schema = std::make_shared<Schema>();
-    schema->AddField(FieldName("RowID"), FieldId(0), DataType::INT64);
-    schema->AddField(FieldName("Timestamp"), FieldId(1), DataType::INT64);
+    schema->AddField(FieldName("RowID"), FieldId(0), DataType::INT64, false);
+    schema->AddField(
+        FieldName("Timestamp"), FieldId(1), DataType::INT64, false);
     auto str_fid = schema->AddDebugField("string", DataType::VARCHAR);
     auto vec_fid = schema->AddDebugField(
         "vector_float", DataType::VECTOR_FLOAT, DIM, "L2");

@@ -66,7 +66,8 @@ TEST_F(ChunkCacheTest, Read) {
         fake_id,
         milvus::DataType::VECTOR_FLOAT,
         dim,
-        metric_type);
+        metric_type,
+        false);
 
     auto lcm = milvus::storage::LocalChunkManagerSingleton::GetInstance()
                    .GetChunkManager();

@@ -114,7 +115,8 @@ TEST_F(ChunkCacheTest, TestMultithreads) {
         fake_id,
         milvus::DataType::VECTOR_FLOAT,
         dim,
-        metric_type);
+        metric_type,
+        false);
 
     auto lcm = milvus::storage::LocalChunkManagerSingleton::GetInstance()
                    .GetChunkManager();
@@ -15,6 +15,7 @@
 // limitations under the License.
 
 #include <gtest/gtest.h>
+#include <string>
 
 #include "storage/DataCodec.h"
 #include "storage/InsertData.h"

@@ -22,6 +23,7 @@
 #include "storage/Util.h"
 #include "common/Consts.h"
 #include "common/Json.h"
+#include <cstddef>
 #include "test_utils/Constants.h"
 #include "test_utils/DataGen.h"
 

@@ -29,7 +31,8 @@ using namespace milvus;
 
 TEST(storage, InsertDataBool) {
     FixedVector<bool> data = {true, false, true, false, true};
-    auto field_data = milvus::storage::CreateFieldData(storage::DataType::BOOL);
+    auto field_data =
+        milvus::storage::CreateFieldData(storage::DataType::BOOL, false);
     field_data->FillFieldData(data.data(), data.size());
 
     storage::InsertData insert_data(field_data);
@@ -48,14 +51,51 @@ TEST(storage, InsertDataBool) {
     auto new_payload = new_insert_data->GetFieldData();
     ASSERT_EQ(new_payload->get_data_type(), storage::DataType::BOOL);
     ASSERT_EQ(new_payload->get_num_rows(), data.size());
+    ASSERT_EQ(new_payload->get_null_count(), 0);
     FixedVector<bool> new_data(data.size());
-    memcpy(new_data.data(), new_payload->Data(), new_payload->Size());
+    memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize());
     ASSERT_EQ(data, new_data);
 }
 
+TEST(storage, InsertDataBoolNullable) {
+    FixedVector<bool> data = {true, false, false, false, true};
+    auto field_data =
+        milvus::storage::CreateFieldData(storage::DataType::BOOL, true);
+    uint8_t* valid_data = new uint8_t[1]{0x13};
+
+    field_data->FillFieldData(data.data(), valid_data, data.size());
+
+    storage::InsertData insert_data(field_data);
+    storage::FieldDataMeta field_data_meta{100, 101, 102, 103};
+    insert_data.SetFieldDataMeta(field_data_meta);
+    insert_data.SetTimestamps(0, 100);
+
+    auto serialized_bytes = insert_data.Serialize(storage::StorageType::Remote);
+    std::shared_ptr<uint8_t[]> serialized_data_ptr(serialized_bytes.data(),
+                                                   [&](uint8_t*) {});
+    auto new_insert_data = storage::DeserializeFileData(
+        serialized_data_ptr, serialized_bytes.size());
+    ASSERT_EQ(new_insert_data->GetCodecType(), storage::InsertDataType);
+    ASSERT_EQ(new_insert_data->GetTimeRage(),
+              std::make_pair(Timestamp(0), Timestamp(100)));
+    auto new_payload = new_insert_data->GetFieldData();
+    ASSERT_EQ(new_payload->get_data_type(), storage::DataType::BOOL);
+    ASSERT_EQ(new_payload->get_num_rows(), data.size());
+    ASSERT_EQ(new_payload->get_null_count(), 2);
+    FixedVector<bool> new_data(data.size());
+    memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize());
+    // valid_data is 0001 0011; read from the LSB, a '1' bit means the row at
+    // the corresponding index is valid.
+    ASSERT_EQ(data[0], new_data[0]);
+    ASSERT_EQ(data[1], new_data[1]);
+    ASSERT_EQ(data[4], new_data[4]);
+    ASSERT_EQ(*new_payload->ValidData(), *valid_data);
+    delete[] valid_data;
+}
+
 TEST(storage, InsertDataInt8) {
     FixedVector<int8_t> data = {1, 2, 3, 4, 5};
-    auto field_data = milvus::storage::CreateFieldData(storage::DataType::INT8);
+    auto field_data =
+        milvus::storage::CreateFieldData(storage::DataType::INT8, false);
     field_data->FillFieldData(data.data(), data.size());
 
     storage::InsertData insert_data(field_data);
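
Note: the 0x13 bitmap that these nullable tests share decodes as follows; a self-contained sketch in plain C++ (no Milvus types involved):

    #include <cstdint>
    #include <cstdio>

    int main() {
        uint8_t valid_data = 0x13;  // binary 0001 0011
        for (int row = 0; row < 5; ++row) {
            // LSB-first: bit `row` says whether that row holds a real value.
            bool valid = (valid_data >> row) & 1;
            std::printf("row %d: %s\n", row, valid ? "valid" : "null");
        }
        // Rows 0, 1 and 4 are valid; rows 2 and 3 are null, which matches the
        // get_null_count() == 2 assertions in the tests around this sketch.
        return 0;
    }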
@@ -74,15 +114,48 @@ TEST(storage, InsertDataInt8) {
     auto new_payload = new_insert_data->GetFieldData();
     ASSERT_EQ(new_payload->get_data_type(), storage::DataType::INT8);
     ASSERT_EQ(new_payload->get_num_rows(), data.size());
+    ASSERT_EQ(new_payload->get_null_count(), 0);
     FixedVector<int8_t> new_data(data.size());
-    memcpy(new_data.data(), new_payload->Data(), new_payload->Size());
+    memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize());
     ASSERT_EQ(data, new_data);
 }
 
+TEST(storage, InsertDataInt8Nullable) {
+    FixedVector<int8_t> data = {1, 2, 3, 4, 5};
+    auto field_data =
+        milvus::storage::CreateFieldData(storage::DataType::INT8, true);
+    uint8_t* valid_data = new uint8_t[1]{0x13};
+    field_data->FillFieldData(data.data(), valid_data, data.size());
+
+    storage::InsertData insert_data(field_data);
+    storage::FieldDataMeta field_data_meta{100, 101, 102, 103};
+    insert_data.SetFieldDataMeta(field_data_meta);
+    insert_data.SetTimestamps(0, 100);
+
+    auto serialized_bytes = insert_data.Serialize(storage::StorageType::Remote);
+    std::shared_ptr<uint8_t[]> serialized_data_ptr(serialized_bytes.data(),
+                                                   [&](uint8_t*) {});
+    auto new_insert_data = storage::DeserializeFileData(
+        serialized_data_ptr, serialized_bytes.size());
+    ASSERT_EQ(new_insert_data->GetCodecType(), storage::InsertDataType);
+    ASSERT_EQ(new_insert_data->GetTimeRage(),
+              std::make_pair(Timestamp(0), Timestamp(100)));
+    auto new_payload = new_insert_data->GetFieldData();
+    ASSERT_EQ(new_payload->get_data_type(), storage::DataType::INT8);
+    ASSERT_EQ(new_payload->get_num_rows(), data.size());
+    FixedVector<int8_t> new_data(data.size());
+    memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize());
+    data = {1, 2, 0, 0, 5};
+    ASSERT_EQ(data, new_data);
+    ASSERT_EQ(new_payload->get_null_count(), 2);
+    ASSERT_EQ(*new_payload->ValidData(), *valid_data);
+    delete[] valid_data;
+}
+
 TEST(storage, InsertDataInt16) {
     FixedVector<int16_t> data = {1, 2, 3, 4, 5};
     auto field_data =
-        milvus::storage::CreateFieldData(storage::DataType::INT16);
+        milvus::storage::CreateFieldData(storage::DataType::INT16, false);
     field_data->FillFieldData(data.data(), data.size());
 
     storage::InsertData insert_data(field_data);
@@ -101,15 +174,48 @@ TEST(storage, InsertDataInt16) {
     auto new_payload = new_insert_data->GetFieldData();
     ASSERT_EQ(new_payload->get_data_type(), storage::DataType::INT16);
     ASSERT_EQ(new_payload->get_num_rows(), data.size());
+    ASSERT_EQ(new_payload->get_null_count(), 0);
     FixedVector<int16_t> new_data(data.size());
-    memcpy(new_data.data(), new_payload->Data(), new_payload->Size());
+    memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize());
     ASSERT_EQ(data, new_data);
 }
 
+TEST(storage, InsertDataInt16Nullable) {
+    FixedVector<int16_t> data = {1, 2, 3, 4, 5};
+    auto field_data =
+        milvus::storage::CreateFieldData(storage::DataType::INT16, true);
+    uint8_t* valid_data = new uint8_t[1]{0x13};
+    field_data->FillFieldData(data.data(), valid_data, data.size());
+
+    storage::InsertData insert_data(field_data);
+    storage::FieldDataMeta field_data_meta{100, 101, 102, 103};
+    insert_data.SetFieldDataMeta(field_data_meta);
+    insert_data.SetTimestamps(0, 100);
+
+    auto serialized_bytes = insert_data.Serialize(storage::StorageType::Remote);
+    std::shared_ptr<uint8_t[]> serialized_data_ptr(serialized_bytes.data(),
+                                                   [&](uint8_t*) {});
+    auto new_insert_data = storage::DeserializeFileData(
+        serialized_data_ptr, serialized_bytes.size());
+    ASSERT_EQ(new_insert_data->GetCodecType(), storage::InsertDataType);
+    ASSERT_EQ(new_insert_data->GetTimeRage(),
+              std::make_pair(Timestamp(0), Timestamp(100)));
+    auto new_payload = new_insert_data->GetFieldData();
+    ASSERT_EQ(new_payload->get_data_type(), storage::DataType::INT16);
+    ASSERT_EQ(new_payload->get_num_rows(), data.size());
+    FixedVector<int16_t> new_data(data.size());
+    memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize());
+    data = {1, 2, 0, 0, 5};
+    ASSERT_EQ(data, new_data);
+    ASSERT_EQ(new_payload->get_null_count(), 2);
+    ASSERT_EQ(*new_payload->ValidData(), *valid_data);
+    delete[] valid_data;
+}
+
 TEST(storage, InsertDataInt32) {
     FixedVector<int32_t> data = {true, false, true, false, true};
     auto field_data =
-        milvus::storage::CreateFieldData(storage::DataType::INT32);
+        milvus::storage::CreateFieldData(storage::DataType::INT32, false);
     field_data->FillFieldData(data.data(), data.size());
 
     storage::InsertData insert_data(field_data);
@@ -128,15 +234,48 @@ TEST(storage, InsertDataInt32) {
     auto new_payload = new_insert_data->GetFieldData();
     ASSERT_EQ(new_payload->get_data_type(), storage::DataType::INT32);
     ASSERT_EQ(new_payload->get_num_rows(), data.size());
+    ASSERT_EQ(new_payload->get_null_count(), 0);
     FixedVector<int32_t> new_data(data.size());
-    memcpy(new_data.data(), new_payload->Data(), new_payload->Size());
+    memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize());
     ASSERT_EQ(data, new_data);
 }
 
+TEST(storage, InsertDataInt32Nullable) {
+    FixedVector<int32_t> data = {1, 2, 3, 4, 5};
+    auto field_data =
+        milvus::storage::CreateFieldData(storage::DataType::INT32, true);
+    uint8_t* valid_data = new uint8_t[1]{0x13};
+    field_data->FillFieldData(data.data(), valid_data, data.size());
+
+    storage::InsertData insert_data(field_data);
+    storage::FieldDataMeta field_data_meta{100, 101, 102, 103};
+    insert_data.SetFieldDataMeta(field_data_meta);
+    insert_data.SetTimestamps(0, 100);
+
+    auto serialized_bytes = insert_data.Serialize(storage::StorageType::Remote);
+    std::shared_ptr<uint8_t[]> serialized_data_ptr(serialized_bytes.data(),
+                                                   [&](uint8_t*) {});
+    auto new_insert_data = storage::DeserializeFileData(
+        serialized_data_ptr, serialized_bytes.size());
+    ASSERT_EQ(new_insert_data->GetCodecType(), storage::InsertDataType);
+    ASSERT_EQ(new_insert_data->GetTimeRage(),
+              std::make_pair(Timestamp(0), Timestamp(100)));
+    auto new_payload = new_insert_data->GetFieldData();
+    ASSERT_EQ(new_payload->get_data_type(), storage::DataType::INT32);
+    ASSERT_EQ(new_payload->get_num_rows(), data.size());
+    FixedVector<int32_t> new_data(data.size());
+    memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize());
+    data = {1, 2, 0, 0, 5};
+    ASSERT_EQ(data, new_data);
+    ASSERT_EQ(new_payload->get_null_count(), 2);
+    ASSERT_EQ(*new_payload->ValidData(), *valid_data);
+    delete[] valid_data;
+}
+
 TEST(storage, InsertDataInt64) {
     FixedVector<int64_t> data = {1, 2, 3, 4, 5};
     auto field_data =
-        milvus::storage::CreateFieldData(storage::DataType::INT64);
+        milvus::storage::CreateFieldData(storage::DataType::INT64, false);
     field_data->FillFieldData(data.data(), data.size());
 
     storage::InsertData insert_data(field_data);
@@ -155,16 +294,49 @@ TEST(storage, InsertDataInt64) {
     auto new_payload = new_insert_data->GetFieldData();
     ASSERT_EQ(new_payload->get_data_type(), storage::DataType::INT64);
     ASSERT_EQ(new_payload->get_num_rows(), data.size());
+    ASSERT_EQ(new_payload->get_null_count(), 0);
     FixedVector<int64_t> new_data(data.size());
-    memcpy(new_data.data(), new_payload->Data(), new_payload->Size());
+    memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize());
     ASSERT_EQ(data, new_data);
 }
 
+TEST(storage, InsertDataInt64Nullable) {
+    FixedVector<int64_t> data = {1, 2, 3, 4, 5};
+    auto field_data =
+        milvus::storage::CreateFieldData(storage::DataType::INT64, true);
+    uint8_t* valid_data = new uint8_t[1]{0x13};
+    field_data->FillFieldData(data.data(), valid_data, data.size());
+
+    storage::InsertData insert_data(field_data);
+    storage::FieldDataMeta field_data_meta{100, 101, 102, 103};
+    insert_data.SetFieldDataMeta(field_data_meta);
+    insert_data.SetTimestamps(0, 100);
+
+    auto serialized_bytes = insert_data.Serialize(storage::StorageType::Remote);
+    std::shared_ptr<uint8_t[]> serialized_data_ptr(serialized_bytes.data(),
+                                                   [&](uint8_t*) {});
+    auto new_insert_data = storage::DeserializeFileData(
+        serialized_data_ptr, serialized_bytes.size());
+    ASSERT_EQ(new_insert_data->GetCodecType(), storage::InsertDataType);
+    ASSERT_EQ(new_insert_data->GetTimeRage(),
+              std::make_pair(Timestamp(0), Timestamp(100)));
+    auto new_payload = new_insert_data->GetFieldData();
+    ASSERT_EQ(new_payload->get_data_type(), storage::DataType::INT64);
+    ASSERT_EQ(new_payload->get_num_rows(), data.size());
+    FixedVector<int64_t> new_data(data.size());
+    memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize());
+    data = {1, 2, 0, 0, 5};
+    ASSERT_EQ(data, new_data);
+    ASSERT_EQ(new_payload->get_null_count(), 2);
+    ASSERT_EQ(*new_payload->ValidData(), *valid_data);
+    delete[] valid_data;
+}
+
 TEST(storage, InsertDataString) {
     FixedVector<std::string> data = {
         "test1", "test2", "test3", "test4", "test5"};
     auto field_data =
-        milvus::storage::CreateFieldData(storage::DataType::VARCHAR);
+        milvus::storage::CreateFieldData(storage::DataType::VARCHAR, false);
     field_data->FillFieldData(data.data(), data.size());
 
     storage::InsertData insert_data(field_data);
@@ -184,18 +356,56 @@ TEST(storage, InsertDataString) {
     ASSERT_EQ(new_payload->get_data_type(), storage::DataType::VARCHAR);
     ASSERT_EQ(new_payload->get_num_rows(), data.size());
     FixedVector<std::string> new_data(data.size());
+    ASSERT_EQ(new_payload->get_null_count(), 0);
     for (int i = 0; i < data.size(); ++i) {
         new_data[i] =
             *static_cast<const std::string*>(new_payload->RawValue(i));
-        ASSERT_EQ(new_payload->Size(i), data[i].size());
+        ASSERT_EQ(new_payload->DataSize(i), data[i].size());
     }
     ASSERT_EQ(data, new_data);
 }
 
+TEST(storage, InsertDataStringNullable) {
+    FixedVector<std::string> data = {
+        "test1", "test2", "test3", "test4", "test5"};
+    auto field_data =
+        milvus::storage::CreateFieldData(storage::DataType::STRING, true);
+    uint8_t* valid_data = new uint8_t[1]{0x13};
+    field_data->FillFieldData(data.data(), valid_data, data.size());
+
+    storage::InsertData insert_data(field_data);
+    storage::FieldDataMeta field_data_meta{100, 101, 102, 103};
+    insert_data.SetFieldDataMeta(field_data_meta);
+    insert_data.SetTimestamps(0, 100);
+
+    auto serialized_bytes = insert_data.Serialize(storage::StorageType::Remote);
+    std::shared_ptr<uint8_t[]> serialized_data_ptr(serialized_bytes.data(),
+                                                   [&](uint8_t*) {});
+    auto new_insert_data = storage::DeserializeFileData(
+        serialized_data_ptr, serialized_bytes.size());
+    ASSERT_EQ(new_insert_data->GetCodecType(), storage::InsertDataType);
+    ASSERT_EQ(new_insert_data->GetTimeRage(),
+              std::make_pair(Timestamp(0), Timestamp(100)));
+    auto new_payload = new_insert_data->GetFieldData();
+    ASSERT_EQ(new_payload->get_data_type(), storage::DataType::STRING);
+    ASSERT_EQ(new_payload->get_num_rows(), data.size());
+    FixedVector<std::string> new_data(data.size());
+    memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize());
+    data = {"test1", "test2", "", "", "test5"};
+    for (int i = 0; i < data.size(); ++i) {
+        new_data[i] =
+            *static_cast<const std::string*>(new_payload->RawValue(i));
+        ASSERT_EQ(new_payload->DataSize(i), data[i].size());
+    }
+    ASSERT_EQ(new_payload->get_null_count(), 2);
+    ASSERT_EQ(*new_payload->ValidData(), *valid_data);
+    delete[] valid_data;
+}
+
 TEST(storage, InsertDataFloat) {
     FixedVector<float> data = {1, 2, 3, 4, 5};
     auto field_data =
-        milvus::storage::CreateFieldData(storage::DataType::FLOAT);
+        milvus::storage::CreateFieldData(storage::DataType::FLOAT, false);
     field_data->FillFieldData(data.data(), data.size());
 
     storage::InsertData insert_data(field_data);
@@ -214,15 +424,47 @@ TEST(storage, InsertDataFloat) {
     auto new_payload = new_insert_data->GetFieldData();
     ASSERT_EQ(new_payload->get_data_type(), storage::DataType::FLOAT);
     ASSERT_EQ(new_payload->get_num_rows(), data.size());
+    ASSERT_EQ(new_payload->get_null_count(), 0);
     FixedVector<float> new_data(data.size());
-    memcpy(new_data.data(), new_payload->Data(), new_payload->Size());
+    memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize());
     ASSERT_EQ(data, new_data);
 }
 
+TEST(storage, InsertDataFloatNullable) {
+    FixedVector<float> data = {1, 2, 3, 4, 5};
+    auto field_data =
+        milvus::storage::CreateFieldData(storage::DataType::FLOAT, true);
+    uint8_t* valid_data = new uint8_t[1]{0x13};
+    field_data->FillFieldData(data.data(), valid_data, data.size());
+
+    storage::InsertData insert_data(field_data);
+    storage::FieldDataMeta field_data_meta{100, 101, 102, 103};
+    insert_data.SetFieldDataMeta(field_data_meta);
+    insert_data.SetTimestamps(0, 100);
+
+    auto serialized_bytes = insert_data.Serialize(storage::StorageType::Remote);
+    std::shared_ptr<uint8_t[]> serialized_data_ptr(serialized_bytes.data(),
+                                                   [&](uint8_t*) {});
+    auto new_insert_data = storage::DeserializeFileData(
+        serialized_data_ptr, serialized_bytes.size());
+    ASSERT_EQ(new_insert_data->GetCodecType(), storage::InsertDataType);
+    ASSERT_EQ(new_insert_data->GetTimeRage(),
+              std::make_pair(Timestamp(0), Timestamp(100)));
+    auto new_payload = new_insert_data->GetFieldData();
+    ASSERT_EQ(new_payload->get_data_type(), storage::DataType::FLOAT);
+    ASSERT_EQ(new_payload->get_num_rows(), data.size());
+    FixedVector<float> new_data(data.size());
+    memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize());
+    data = {1, 2, 0, 0, 5};
+    ASSERT_EQ(data, new_data);
+    ASSERT_EQ(new_payload->get_null_count(), 2);
+    ASSERT_EQ(*new_payload->ValidData(), *valid_data);
+}
+
 TEST(storage, InsertDataDouble) {
     FixedVector<double> data = {1.0, 2.0, 3.0, 4.2, 5.3};
     auto field_data =
-        milvus::storage::CreateFieldData(storage::DataType::DOUBLE);
+        milvus::storage::CreateFieldData(storage::DataType::DOUBLE, false);
     field_data->FillFieldData(data.data(), data.size());
 
     storage::InsertData insert_data(field_data);
@@ -241,16 +483,49 @@ TEST(storage, InsertDataDouble) {
     auto new_payload = new_insert_data->GetFieldData();
     ASSERT_EQ(new_payload->get_data_type(), storage::DataType::DOUBLE);
     ASSERT_EQ(new_payload->get_num_rows(), data.size());
+    ASSERT_EQ(new_payload->get_null_count(), 0);
     FixedVector<double> new_data(data.size());
-    memcpy(new_data.data(), new_payload->Data(), new_payload->Size());
+    memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize());
     ASSERT_EQ(data, new_data);
 }
 
+TEST(storage, InsertDataDoubleNullable) {
+    FixedVector<double> data = {1, 2, 3, 4, 5};
+    auto field_data =
+        milvus::storage::CreateFieldData(storage::DataType::DOUBLE, true);
+    uint8_t* valid_data = new uint8_t[1]{0x13};
+    field_data->FillFieldData(data.data(), valid_data, data.size());
+
+    storage::InsertData insert_data(field_data);
+    storage::FieldDataMeta field_data_meta{100, 101, 102, 103};
+    insert_data.SetFieldDataMeta(field_data_meta);
+    insert_data.SetTimestamps(0, 100);
+
+    auto serialized_bytes = insert_data.Serialize(storage::StorageType::Remote);
+    std::shared_ptr<uint8_t[]> serialized_data_ptr(serialized_bytes.data(),
+                                                   [&](uint8_t*) {});
+    auto new_insert_data = storage::DeserializeFileData(
+        serialized_data_ptr, serialized_bytes.size());
+    ASSERT_EQ(new_insert_data->GetCodecType(), storage::InsertDataType);
+    ASSERT_EQ(new_insert_data->GetTimeRage(),
+              std::make_pair(Timestamp(0), Timestamp(100)));
+    auto new_payload = new_insert_data->GetFieldData();
+    ASSERT_EQ(new_payload->get_data_type(), storage::DataType::DOUBLE);
+    ASSERT_EQ(new_payload->get_num_rows(), data.size());
+    FixedVector<double> new_data(data.size());
+    memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize());
+    data = {1, 2, 0, 0, 5};
+    ASSERT_EQ(data, new_data);
+    ASSERT_EQ(new_payload->get_null_count(), 2);
+    ASSERT_EQ(*new_payload->ValidData(), *valid_data);
+    delete[] valid_data;
+}
+
 TEST(storage, InsertDataFloatVector) {
     std::vector<float> data = {1, 2, 3, 4, 5, 6, 7, 8};
     int DIM = 2;
-    auto field_data =
-        milvus::storage::CreateFieldData(storage::DataType::VECTOR_FLOAT, DIM);
+    auto field_data = milvus::storage::CreateFieldData(
+        storage::DataType::VECTOR_FLOAT, false, DIM);
     field_data->FillFieldData(data.data(), data.size() / DIM);
 
     storage::InsertData insert_data(field_data);
@@ -269,6 +544,7 @@ TEST(storage, InsertDataFloatVector) {
     auto new_payload = new_insert_data->GetFieldData();
     ASSERT_EQ(new_payload->get_data_type(), storage::DataType::VECTOR_FLOAT);
     ASSERT_EQ(new_payload->get_num_rows(), data.size() / DIM);
+    ASSERT_EQ(new_payload->get_null_count(), 0);
     std::vector<float> new_data(data.size());
     memcpy(new_data.data(),
            new_payload->Data(),

@@ -281,7 +557,7 @@ TEST(storage, InsertDataSparseFloat) {
     auto vecs = milvus::segcore::GenerateRandomSparseFloatVector(
         n_rows, kTestSparseDim, kTestSparseVectorDensity);
     auto field_data = milvus::storage::CreateFieldData(
-        storage::DataType::VECTOR_SPARSE_FLOAT, kTestSparseDim, n_rows);
+        storage::DataType::VECTOR_SPARSE_FLOAT, false, kTestSparseDim, n_rows);
     field_data->FillFieldData(vecs.get(), n_rows);
 
     storage::InsertData insert_data(field_data);

@@ -301,6 +577,7 @@ TEST(storage, InsertDataSparseFloat) {
     ASSERT_TRUE(new_payload->get_data_type() ==
                 storage::DataType::VECTOR_SPARSE_FLOAT);
     ASSERT_EQ(new_payload->get_num_rows(), n_rows);
+    ASSERT_EQ(new_payload->get_null_count(), 0);
     auto new_data = static_cast<const knowhere::sparse::SparseRow<float>*>(
         new_payload->Data());
@@ -318,8 +595,8 @@
 TEST(storage, InsertDataBinaryVector) {
     std::vector<uint8_t> data = {1, 2, 3, 4, 5, 6, 7, 8};
     int DIM = 16;
-    auto field_data =
-        milvus::storage::CreateFieldData(storage::DataType::VECTOR_BINARY, DIM);
+    auto field_data = milvus::storage::CreateFieldData(
+        storage::DataType::VECTOR_BINARY, false, DIM);
     field_data->FillFieldData(data.data(), data.size() * 8 / DIM);
 
     storage::InsertData insert_data(field_data);

@@ -338,8 +615,9 @@ TEST(storage, InsertDataBinaryVector) {
     auto new_payload = new_insert_data->GetFieldData();
     ASSERT_EQ(new_payload->get_data_type(), storage::DataType::VECTOR_BINARY);
     ASSERT_EQ(new_payload->get_num_rows(), data.size() * 8 / DIM);
+    ASSERT_EQ(new_payload->get_null_count(), 0);
     std::vector<uint8_t> new_data(data.size());
-    memcpy(new_data.data(), new_payload->Data(), new_payload->Size());
+    memcpy(new_data.data(), new_payload->Data(), new_payload->DataSize());
     ASSERT_EQ(data, new_data);
 }
 

@@ -347,7 +625,7 @@ TEST(storage, InsertDataFloat16Vector) {
     std::vector<float16> data = {1, 2, 3, 4, 5, 6, 7, 8};
     int DIM = 2;
     auto field_data = milvus::storage::CreateFieldData(
-        storage::DataType::VECTOR_FLOAT16, DIM);
+        storage::DataType::VECTOR_FLOAT16, false, DIM);
     field_data->FillFieldData(data.data(), data.size() / DIM);
 
     storage::InsertData insert_data(field_data);

@@ -366,6 +644,7 @@ TEST(storage, InsertDataFloat16Vector) {
     auto new_payload = new_insert_data->GetFieldData();
     ASSERT_EQ(new_payload->get_data_type(), storage::DataType::VECTOR_FLOAT16);
     ASSERT_EQ(new_payload->get_num_rows(), data.size() / DIM);
+    ASSERT_EQ(new_payload->get_null_count(), 0);
     std::vector<float16> new_data(data.size());
     memcpy(new_data.data(),
            new_payload->Data(),
@@ -373,39 +652,10 @@
     ASSERT_EQ(data, new_data);
 }
 
-TEST(storage, InsertDataBFloat16Vector) {
-    std::vector<bfloat16> data = {1, 2, 3, 4, 5, 6, 7, 8};
-    int DIM = 2;
-    auto field_data = milvus::storage::CreateFieldData(
-        storage::DataType::VECTOR_BFLOAT16, DIM);
-    field_data->FillFieldData(data.data(), data.size() / DIM);
-
-    storage::InsertData insert_data(field_data);
-    storage::FieldDataMeta field_data_meta{100, 101, 102, 103};
-    insert_data.SetFieldDataMeta(field_data_meta);
-    insert_data.SetTimestamps(0, 100);
-
-    auto serialized_bytes = insert_data.Serialize(storage::StorageType::Remote);
-    std::shared_ptr<uint8_t[]> serialized_data_ptr(serialized_bytes.data(),
-                                                   [&](uint8_t*) {});
-    auto new_insert_data = storage::DeserializeFileData(
-        serialized_data_ptr, serialized_bytes.size());
-    ASSERT_EQ(new_insert_data->GetCodecType(), storage::InsertDataType);
-    ASSERT_EQ(new_insert_data->GetTimeRage(),
-              std::make_pair(Timestamp(0), Timestamp(100)));
-    auto new_payload = new_insert_data->GetFieldData();
-    ASSERT_EQ(new_payload->get_data_type(), storage::DataType::VECTOR_BFLOAT16);
-    ASSERT_EQ(new_payload->get_num_rows(), data.size() / DIM);
-    std::vector<bfloat16> new_data(data.size());
-    memcpy(new_data.data(),
-           new_payload->Data(),
-           new_payload->get_num_rows() * sizeof(bfloat16) * DIM);
-    ASSERT_EQ(data, new_data);
-}
-
 TEST(storage, IndexData) {
     std::vector<uint8_t> data = {1, 2, 3, 4, 5, 6, 7, 8};
-    auto field_data = milvus::storage::CreateFieldData(storage::DataType::INT8);
+    auto field_data =
+        milvus::storage::CreateFieldData(storage::DataType::INT8, false);
     field_data->FillFieldData(data.data(), data.size());
 
     storage::IndexData index_data(field_data);

@@ -427,7 +677,7 @@ TEST(storage, IndexData) {
     ASSERT_EQ(new_field_data->get_data_type(), storage::DataType::INT8);
     ASSERT_EQ(new_field_data->Size(), data.size());
     std::vector<uint8_t> new_data(data.size());
-    memcpy(new_data.data(), new_field_data->Data(), new_field_data->Size());
+    memcpy(new_data.data(), new_field_data->Data(), new_field_data->DataSize());
     ASSERT_EQ(data, new_data);
 }
 
@@ -441,7 +691,7 @@ TEST(storage, InsertDataStringArray) {
     auto string_array = Array(field_string_data);
     FixedVector<Array> data = {string_array};
     auto field_data =
-        milvus::storage::CreateFieldData(storage::DataType::ARRAY);
+        milvus::storage::CreateFieldData(storage::DataType::ARRAY, false);
     field_data->FillFieldData(data.data(), data.size());
 
     storage::InsertData insert_data(field_data);

@@ -463,7 +713,56 @@ TEST(storage, InsertDataStringArray) {
     FixedVector<Array> new_data(data.size());
     for (int i = 0; i < data.size(); ++i) {
         new_data[i] = *static_cast<const Array*>(new_payload->RawValue(i));
-        ASSERT_EQ(new_payload->Size(i), data[i].byte_size());
+        ASSERT_EQ(new_payload->DataSize(i), data[i].byte_size());
         ASSERT_TRUE(data[i].operator==(new_data[i]));
     }
 }
 
+TEST(storage, InsertDataStringArrayNullable) {
+    milvus::proto::schema::ScalarField field_string_data;
+    field_string_data.mutable_string_data()->add_data("test_array1");
+    field_string_data.mutable_string_data()->add_data("test_array2");
+    field_string_data.mutable_string_data()->add_data("test_array3");
+    field_string_data.mutable_string_data()->add_data("test_array4");
+    field_string_data.mutable_string_data()->add_data("test_array5");
+    auto string_array = Array(field_string_data);
+    milvus::proto::schema::ScalarField field_int_data;
+    field_string_data.mutable_int_data()->add_data(1);
+    field_string_data.mutable_int_data()->add_data(2);
+    field_string_data.mutable_int_data()->add_data(3);
+    field_string_data.mutable_int_data()->add_data(4);
+    field_string_data.mutable_int_data()->add_data(5);
+    auto int_array = Array(field_int_data);
+    FixedVector<Array> data = {string_array, int_array};
+    auto field_data =
+        milvus::storage::CreateFieldData(storage::DataType::ARRAY, true);
+    uint8_t* valid_data = new uint8_t[1]{0x01};
+    field_data->FillFieldData(data.data(), valid_data, data.size());
+
+    storage::InsertData insert_data(field_data);
+    storage::FieldDataMeta field_data_meta{100, 101, 102, 103};
+    insert_data.SetFieldDataMeta(field_data_meta);
+    insert_data.SetTimestamps(0, 100);
+
+    auto serialized_bytes = insert_data.Serialize(storage::StorageType::Remote);
+    std::shared_ptr<uint8_t[]> serialized_data_ptr(serialized_bytes.data(),
+                                                   [&](uint8_t*) {});
+    auto new_insert_data = storage::DeserializeFileData(
+        serialized_data_ptr, serialized_bytes.size());
+    ASSERT_EQ(new_insert_data->GetCodecType(), storage::InsertDataType);
+    ASSERT_EQ(new_insert_data->GetTimeRage(),
+              std::make_pair(Timestamp(0), Timestamp(100)));
+    auto new_payload = new_insert_data->GetFieldData();
+    ASSERT_EQ(new_payload->get_data_type(), storage::DataType::ARRAY);
+    ASSERT_EQ(new_payload->get_num_rows(), data.size());
+    ASSERT_EQ(new_payload->get_null_count(), 1);
+    FixedVector<Array> expected_data = {string_array, Array()};
+    FixedVector<Array> new_data(data.size());
+    for (int i = 0; i < data.size(); ++i) {
+        new_data[i] = *static_cast<const Array*>(new_payload->RawValue(i));
+        ASSERT_EQ(new_payload->DataSize(i), data[i].byte_size());
+        ASSERT_TRUE(expected_data[i].operator==(new_data[i]));
+    }
+    ASSERT_EQ(*new_payload->ValidData(), *valid_data);
+    delete[] valid_data;
+}
@@ -104,7 +104,8 @@ TEST_F(DiskAnnFileManagerTest, AddFilePositiveParallel) {
     auto buf = std::unique_ptr<uint8_t[]>(new uint8_t[file_size]);
     lcm->Read(file, buf.get(), file_size);
 
-    auto index = milvus::storage::CreateFieldData(storage::DataType::INT8);
+    auto index =
+        milvus::storage::CreateFieldData(storage::DataType::INT8, false);
     index->FillFieldData(buf.get(), file_size);
     auto rows = index->get_num_rows();
     auto rawData = (uint8_t*)(index->Data());

@@ -268,7 +269,7 @@ auto
 PrepareInsertData(const int64_t opt_field_data_range) -> std::string {
     std::vector<NativeType> data =
         PrepareRawFieldData<NativeType>(opt_field_data_range);
-    auto field_data = storage::CreateFieldData(DT, 1, kEntityCnt);
+    auto field_data = storage::CreateFieldData(DT, false, 1, kEntityCnt);
     field_data->FillFieldData(data.data(), kEntityCnt);
     storage::InsertData insert_data(field_data);
     insert_data.SetFieldDataMeta(kOptVecFieldDataMeta);

@@ -1613,7 +1613,8 @@ TEST(Expr, TestExprPerformance) {
 
 TEST_P(ExprTest, test_term_pk) {
     auto schema = std::make_shared<Schema>();
-    schema->AddField(FieldName("Timestamp"), FieldId(1), DataType::INT64);
+    schema->AddField(
+        FieldName("Timestamp"), FieldId(1), DataType::INT64, false);
     auto vec_fid = schema->AddDebugField("fakevec", data_type, 16, metric_type);
     auto str1_fid = schema->AddDebugField("string1", DataType::VARCHAR);
     auto int64_fid = schema->AddDebugField("int64", DataType::INT64);
@@ -32,7 +32,7 @@ prepareSegmentSystemFieldData(const std::unique_ptr<SegmentSealed>& segment,
                               size_t row_count,
                               GeneratedData& data_set) {
     auto field_data =
-        std::make_shared<milvus::FieldData<int64_t>>(DataType::INT64);
+        std::make_shared<milvus::FieldData<int64_t>>(DataType::INT64, false);
     field_data->FillFieldData(data_set.row_ids_.data(), row_count);
     auto field_data_info =
         FieldDataInfo{RowFieldID.get(),

@@ -40,7 +40,8 @@ prepareSegmentSystemFieldData(const std::unique_ptr<SegmentSealed>& segment,
                   std::vector<milvus::FieldDataPtr>{field_data}};
     segment->LoadFieldData(RowFieldID, field_data_info);
 
-    field_data = std::make_shared<milvus::FieldData<int64_t>>(DataType::INT64);
+    field_data =
+        std::make_shared<milvus::FieldData<int64_t>>(DataType::INT64, false);
     field_data->FillFieldData(data_set.timestamps_.data(), row_count);
     field_data_info =
         FieldDataInfo{TimestampFieldID.get(),
@@ -323,5 +323,176 @@ TEST_P(GrowingTest, FillData) {
                   num_inserted);
         EXPECT_EQ(float_array_result->scalars().array_data().data_size(),
                   num_inserted);
+
+        EXPECT_EQ(bool_result->valid_data_size(), 0);
+        EXPECT_EQ(int8_result->valid_data_size(), 0);
+        EXPECT_EQ(int16_result->valid_data_size(), 0);
+        EXPECT_EQ(int32_result->valid_data_size(), 0);
+        EXPECT_EQ(int64_result->valid_data_size(), 0);
+        EXPECT_EQ(float_result->valid_data_size(), 0);
+        EXPECT_EQ(double_result->valid_data_size(), 0);
+        EXPECT_EQ(varchar_result->valid_data_size(), 0);
+        EXPECT_EQ(json_result->valid_data_size(), 0);
+        EXPECT_EQ(int_array_result->valid_data_size(), 0);
+        EXPECT_EQ(long_array_result->valid_data_size(), 0);
+        EXPECT_EQ(bool_array_result->valid_data_size(), 0);
+        EXPECT_EQ(string_array_result->valid_data_size(), 0);
+        EXPECT_EQ(double_array_result->valid_data_size(), 0);
+        EXPECT_EQ(float_array_result->valid_data_size(), 0);
     }
 }
+
+TEST(Growing, FillNullableData) {
+    auto schema = std::make_shared<Schema>();
+    auto metric_type = knowhere::metric::L2;
+    auto bool_field = schema->AddDebugField("bool", DataType::BOOL, true);
+    auto int8_field = schema->AddDebugField("int8", DataType::INT8, true);
+    auto int16_field = schema->AddDebugField("int16", DataType::INT16, true);
+    auto int32_field = schema->AddDebugField("int32", DataType::INT32, true);
+    auto int64_field = schema->AddDebugField("int64", DataType::INT64);
+    auto float_field = schema->AddDebugField("float", DataType::FLOAT, true);
+    auto double_field = schema->AddDebugField("double", DataType::DOUBLE, true);
+    auto varchar_field =
+        schema->AddDebugField("varchar", DataType::VARCHAR, true);
+    auto json_field = schema->AddDebugField("json", DataType::JSON, true);
+    auto int_array_field = schema->AddDebugField(
+        "int_array", DataType::ARRAY, DataType::INT8, true);
+    auto long_array_field = schema->AddDebugField(
+        "long_array", DataType::ARRAY, DataType::INT64, true);
+    auto bool_array_field = schema->AddDebugField(
+        "bool_array", DataType::ARRAY, DataType::BOOL, true);
+    auto string_array_field = schema->AddDebugField(
+        "string_array", DataType::ARRAY, DataType::VARCHAR, true);
+    auto double_array_field = schema->AddDebugField(
+        "double_array", DataType::ARRAY, DataType::DOUBLE, true);
+    auto float_array_field = schema->AddDebugField(
+        "float_array", DataType::ARRAY, DataType::FLOAT, true);
+    auto vec = schema->AddDebugField(
+        "embeddings", DataType::VECTOR_FLOAT, 128, metric_type);
+    schema->set_primary_field_id(int64_field);
+
+    std::map<std::string, std::string> index_params = {
+        {"index_type", "IVF_FLAT"},
+        {"metric_type", metric_type},
+        {"nlist", "128"}};
+    std::map<std::string, std::string> type_params = {{"dim", "128"}};
+    FieldIndexMeta fieldIndexMeta(
+        vec, std::move(index_params), std::move(type_params));
+    auto config = SegcoreConfig::default_config();
+    config.set_chunk_rows(1024);
+    config.set_enable_interim_segment_index(true);
+    std::map<FieldId, FieldIndexMeta> filedMap = {{vec, fieldIndexMeta}};
+    IndexMetaPtr metaPtr =
+        std::make_shared<CollectionIndexMeta>(100000, std::move(filedMap));
+    auto segment_growing = CreateGrowingSegment(schema, metaPtr, 1, config);
+    auto segment = dynamic_cast<SegmentGrowingImpl*>(segment_growing.get());
+
+    int64_t per_batch = 1000;
+    int64_t n_batch = 3;
+    int64_t dim = 128;
+    for (int64_t i = 0; i < n_batch; i++) {
+        auto dataset = DataGen(schema, per_batch);
+        auto bool_values = dataset.get_col<bool>(bool_field);
+        auto int8_values = dataset.get_col<int8_t>(int8_field);
+        auto int16_values = dataset.get_col<int16_t>(int16_field);
+        auto int32_values = dataset.get_col<int32_t>(int32_field);
+        auto int64_values = dataset.get_col<int64_t>(int64_field);
+        auto float_values = dataset.get_col<float>(float_field);
+        auto double_values = dataset.get_col<double>(double_field);
+        auto varchar_values = dataset.get_col<std::string>(varchar_field);
+        auto json_values = dataset.get_col<std::string>(json_field);
+        auto int_array_values = dataset.get_col<ScalarArray>(int_array_field);
+        auto long_array_values = dataset.get_col<ScalarArray>(long_array_field);
+        auto bool_array_values = dataset.get_col<ScalarArray>(bool_array_field);
+        auto string_array_values =
+            dataset.get_col<ScalarArray>(string_array_field);
+        auto double_array_values =
+            dataset.get_col<ScalarArray>(double_array_field);
+        auto float_array_values =
+            dataset.get_col<ScalarArray>(float_array_field);
+        auto vector_values = dataset.get_col<float>(vec);
+
+        auto offset = segment->PreInsert(per_batch);
+        segment->Insert(offset,
+                        per_batch,
+                        dataset.row_ids_.data(),
+                        dataset.timestamps_.data(),
+                        dataset.raw_);
+        auto num_inserted = (i + 1) * per_batch;
+        auto ids_ds = GenRandomIds(num_inserted);
+        auto bool_result =
+            segment->bulk_subscript(bool_field, ids_ds->GetIds(), num_inserted);
+        auto int8_result =
+            segment->bulk_subscript(int8_field, ids_ds->GetIds(), num_inserted);
+        auto int16_result = segment->bulk_subscript(
+            int16_field, ids_ds->GetIds(), num_inserted);
+        auto int32_result = segment->bulk_subscript(
+            int32_field, ids_ds->GetIds(), num_inserted);
+        auto int64_result = segment->bulk_subscript(
+            int64_field, ids_ds->GetIds(), num_inserted);
+        auto float_result = segment->bulk_subscript(
+            float_field, ids_ds->GetIds(), num_inserted);
+        auto double_result = segment->bulk_subscript(
+            double_field, ids_ds->GetIds(), num_inserted);
+        auto varchar_result = segment->bulk_subscript(
+            varchar_field, ids_ds->GetIds(), num_inserted);
+        auto json_result =
+            segment->bulk_subscript(json_field, ids_ds->GetIds(), num_inserted);
+        auto int_array_result = segment->bulk_subscript(
+            int_array_field, ids_ds->GetIds(), num_inserted);
+        auto long_array_result = segment->bulk_subscript(
+            long_array_field, ids_ds->GetIds(), num_inserted);
+        auto bool_array_result = segment->bulk_subscript(
+            bool_array_field, ids_ds->GetIds(), num_inserted);
+        auto string_array_result = segment->bulk_subscript(
+            string_array_field, ids_ds->GetIds(), num_inserted);
+        auto double_array_result = segment->bulk_subscript(
+            double_array_field, ids_ds->GetIds(), num_inserted);
+        auto float_array_result = segment->bulk_subscript(
+            float_array_field, ids_ds->GetIds(), num_inserted);
+        auto vec_result =
+            segment->bulk_subscript(vec, ids_ds->GetIds(), num_inserted);
+
+        EXPECT_EQ(bool_result->scalars().bool_data().data_size(), num_inserted);
+        EXPECT_EQ(int8_result->scalars().int_data().data_size(), num_inserted);
+        EXPECT_EQ(int16_result->scalars().int_data().data_size(), num_inserted);
+        EXPECT_EQ(int32_result->scalars().int_data().data_size(), num_inserted);
+        EXPECT_EQ(int64_result->scalars().long_data().data_size(),
+                  num_inserted);
+        EXPECT_EQ(float_result->scalars().float_data().data_size(),
+                  num_inserted);
+        EXPECT_EQ(double_result->scalars().double_data().data_size(),
+                  num_inserted);
+        EXPECT_EQ(varchar_result->scalars().string_data().data_size(),
+                  num_inserted);
+        EXPECT_EQ(json_result->scalars().json_data().data_size(), num_inserted);
+        EXPECT_EQ(vec_result->vectors().float_vector().data_size(),
+                  num_inserted * dim);
+        EXPECT_EQ(int_array_result->scalars().array_data().data_size(),
+                  num_inserted);
+        EXPECT_EQ(long_array_result->scalars().array_data().data_size(),
+                  num_inserted);
+        EXPECT_EQ(bool_array_result->scalars().array_data().data_size(),
+                  num_inserted);
+        EXPECT_EQ(string_array_result->scalars().array_data().data_size(),
+                  num_inserted);
+        EXPECT_EQ(double_array_result->scalars().array_data().data_size(),
+                  num_inserted);
+        EXPECT_EQ(float_array_result->scalars().array_data().data_size(),
+                  num_inserted);
+        EXPECT_EQ(bool_result->valid_data_size(), num_inserted);
+        EXPECT_EQ(int8_result->valid_data_size(), num_inserted);
+        EXPECT_EQ(int16_result->valid_data_size(), num_inserted);
+        EXPECT_EQ(int32_result->valid_data_size(), num_inserted);
+        EXPECT_EQ(float_result->valid_data_size(), num_inserted);
+        EXPECT_EQ(double_result->valid_data_size(), num_inserted);
+        EXPECT_EQ(varchar_result->valid_data_size(), num_inserted);
+        EXPECT_EQ(json_result->valid_data_size(), num_inserted);
+        EXPECT_EQ(int_array_result->valid_data_size(), num_inserted);
+        EXPECT_EQ(long_array_result->valid_data_size(), num_inserted);
+        EXPECT_EQ(bool_array_result->valid_data_size(), num_inserted);
+        EXPECT_EQ(string_array_result->valid_data_size(), num_inserted);
+        EXPECT_EQ(double_array_result->valid_data_size(), num_inserted);
+        EXPECT_EQ(float_array_result->valid_data_size(), num_inserted);
+    }
+}
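
Note: the valid_data_size() checks above pin down the retrieval contract: for a nullable field, the result proto carries a validity array parallel to the payload, while non-nullable fields return an empty one. A hedged consumer-side sketch, assuming the proto::schema::FieldData accessors used in these tests (the function name and header path are illustrative):

    #include <iostream>
    #include "pb/schema.pb.h"

    // Entry i of valid_data() says whether payload entry i is a real value;
    // null slots are placeholders (the scalar tests above read them as 0).
    void dump_nullable_int(const milvus::proto::schema::FieldData& col) {
        const auto& values = col.scalars().int_data().data();
        const auto& valid = col.valid_data();
        for (int i = 0; i < values.size(); ++i) {
            if (valid[i]) {
                std::cout << values[i] << '\n';
            } else {
                std::cout << "null\n";
            }
        }
    }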
@@ -150,10 +150,10 @@ TEST_P(GrowingIndexTest, Correctness) {
         const VectorBase* field_data = nullptr;
         if (is_sparse) {
             field_data = segmentImplPtr->get_insert_record()
-                             .get_field_data<milvus::SparseFloatVector>(vec);
+                             .get_data<milvus::SparseFloatVector>(vec);
         } else {
             field_data = segmentImplPtr->get_insert_record()
-                             .get_field_data<milvus::FloatVector>(vec);
+                             .get_data<milvus::FloatVector>(vec);
         }
 
         auto inserted = (i + 1) * per_batch;
@@ -330,7 +330,7 @@ test_string() {
         data.push_back(std::to_string(rand()));
     }
 
-    auto field_data = storage::CreateFieldData(dtype);
+    auto field_data = storage::CreateFieldData(dtype, false);
     field_data->FillFieldData(data.data(), data.size());
     storage::InsertData insert_data(field_data);
     insert_data.SetFieldDataMeta(field_meta);

@@ -194,7 +194,7 @@ test_run() {
     for (int64_t i = 0; i < nb * dim; ++i) {
         data_gen[i] = rand();
     }
-    auto field_data = storage::CreateFieldData(dtype, dim);
+    auto field_data = storage::CreateFieldData(dtype, false, dim);
     field_data->FillFieldData(data_gen.data(), data_gen.size() / dim);
     storage::InsertData insert_data(field_data);
     insert_data.SetFieldDataMeta(field_meta);
@ -555,6 +555,7 @@ TEST(Query, FillSegment) {
|
|||
{
|
||||
auto field = proto.add_fields();
|
||||
field->set_name("fakevec");
|
||||
field->set_nullable(false);
|
||||
field->set_is_primary_key(false);
|
||||
field->set_description("asdgfsagf");
|
||||
field->set_fieldid(100);
|
||||
|
@ -570,6 +571,7 @@ TEST(Query, FillSegment) {
|
|||
{
|
||||
auto field = proto.add_fields();
|
||||
field->set_name("the_key");
|
||||
field->set_nullable(false);
|
||||
field->set_fieldid(101);
|
||||
field->set_is_primary_key(true);
|
||||
field->set_description("asdgfsagf");
|
||||
|
@ -579,6 +581,7 @@ TEST(Query, FillSegment) {
|
|||
{
|
||||
auto field = proto.add_fields();
|
||||
field->set_name("the_value");
|
||||
field->set_nullable(true);
|
||||
field->set_fieldid(102);
|
||||
field->set_is_primary_key(false);
|
||||
field->set_description("asdgfsagf");
|
||||
|
@ -595,6 +598,7 @@ TEST(Query, FillSegment) {
|
|||
dataset.get_col<float>(FieldId(100)); // vector field
|
||||
const auto std_i32_vec =
|
||||
dataset.get_col<int32_t>(FieldId(102)); // scalar field
|
||||
const auto i32_vec_valid_data = dataset.get_col_valid(FieldId(102));
|
||||
|
||||
std::vector<std::unique_ptr<SegmentInternalInterface>> segments;
|
||||
segments.emplace_back([&] {
|
||||
|
@ -659,6 +663,8 @@ TEST(Query, FillSegment) {
|
|||
auto output_i32_field_data =
|
||||
fields_data.at(i32_field_id)->scalars().int_data().data();
|
||||
ASSERT_EQ(output_i32_field_data.size(), topk * num_queries);
|
||||
auto output_i32_valid_data = fields_data.at(i32_field_id)->valid_data();
|
||||
ASSERT_EQ(output_i32_valid_data.size(), topk * num_queries);
|
||||
|
||||
for (int i = 0; i < topk * num_queries; i++) {
|
||||
int64_t val = std::get<int64_t>(result->primary_keys_[i]);
|
||||
|
@@ -666,6 +672,7 @@ TEST(Query, FillSegment) {
             auto internal_offset = result->seg_offsets_[i];
             auto std_val = std_vec[internal_offset];
             auto std_i32 = std_i32_vec[internal_offset];
+            auto std_i32_valid = i32_vec_valid_data[internal_offset];
             std::vector<float> std_vfloat(dim);
             std::copy_n(std_vfloat_vec.begin() + dim * internal_offset,
                         dim,

@@ -684,6 +691,10 @@ TEST(Query, FillSegment) {
             int i32;
             memcpy(&i32, &output_i32_field_data[i], sizeof(int32_t));
             ASSERT_EQ(i32, std_i32);
+            // check int32 valid field
+            bool i32_valid;
+            memcpy(&i32_valid, &output_i32_valid_data[i], sizeof(bool));
+            ASSERT_EQ(i32_valid, std_i32_valid);
         }
     }
 }

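Taken together, the FillSegment hunks pin down the consumer side of the feature: every retrieved result now carries a valid_data flag per row, parallel to its values. A condensed sketch of that pattern; DataArray is the result type used elsewhere in these tests, and use/handle_null are hypothetical stand-ins.

void use(int32_t value);   // hypothetical consumer
void handle_null(int row); // hypothetical null handler

void consume_nullable_int32(const DataArray& arr) {
    const auto& values = arr.scalars().int_data().data();
    const auto& valid = arr.valid_data();
    for (int i = 0; i < values.size(); ++i) {
        if (valid[i]) {
            use(values[i]);   // row i holds a real value
        } else {
            handle_null(i);   // row i was inserted as null
        }
    }
}
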
@@ -862,9 +862,9 @@ TEST(Sealed, LoadScalarIndex) {
 
     LoadFieldDataInfo row_id_info;
     FieldMeta row_id_field_meta(
-        FieldName("RowID"), RowFieldID, DataType::INT64);
+        FieldName("RowID"), RowFieldID, DataType::INT64, false);
     auto field_data =
-        std::make_shared<milvus::FieldData<int64_t>>(DataType::INT64);
+        std::make_shared<milvus::FieldData<int64_t>>(DataType::INT64, false);
     field_data->FillFieldData(dataset.row_ids_.data(), N);
     auto field_data_info = FieldDataInfo{
         RowFieldID.get(), N, std::vector<FieldDataPtr>{field_data}};

@@ -872,8 +872,9 @@ TEST(Sealed, LoadScalarIndex) {
 
     LoadFieldDataInfo ts_info;
     FieldMeta ts_field_meta(
-        FieldName("Timestamp"), TimestampFieldID, DataType::INT64);
-    field_data = std::make_shared<milvus::FieldData<int64_t>>(DataType::INT64);
+        FieldName("Timestamp"), TimestampFieldID, DataType::INT64, false);
+    field_data =
+        std::make_shared<milvus::FieldData<int64_t>>(DataType::INT64, false);
     field_data->FillFieldData(dataset.timestamps_.data(), N);
     field_data_info = FieldDataInfo{
         TimestampFieldID.get(), N, std::vector<FieldDataPtr>{field_data}};

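Both system fields stay non-nullable; the hunks show FieldMeta and FieldData<T> each gaining a trailing nullable argument. A minimal construction sketch with the argument order taken from the call sites above; the includes are assumptions.

#include <memory>
#include "common/FieldMeta.h"  // assumed headers, inferred from this diff
#include "common/FieldData.h"

void declare_system_fields() {
    using namespace milvus;
    FieldMeta row_id_meta(
        FieldName("RowID"), RowFieldID, DataType::INT64, /*nullable=*/false);
    auto row_ids = std::make_shared<FieldData<int64_t>>(DataType::INT64,
                                                        /*nullable=*/false);
}
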
@@ -1142,7 +1143,8 @@ TEST(Sealed, BF) {
     SealedLoadFieldData(dataset, *segment, {fake_id.get()});
 
     auto vec_data = GenRandomFloatVecs(N, dim);
-    auto field_data = storage::CreateFieldData(DataType::VECTOR_FLOAT, dim);
+    auto field_data =
+        storage::CreateFieldData(DataType::VECTOR_FLOAT, false, dim);
     field_data->FillFieldData(vec_data.data(), N);
     auto field_data_info =
         FieldDataInfo{fake_id.get(), N, std::vector<FieldDataPtr>{field_data}};

@@ -1196,7 +1198,8 @@ TEST(Sealed, BF_Overflow) {
     SealedLoadFieldData(dataset, *segment, {fake_id.get()});
 
     auto vec_data = GenMaxFloatVecs(N, dim);
-    auto field_data = storage::CreateFieldData(DataType::VECTOR_FLOAT, dim);
+    auto field_data =
+        storage::CreateFieldData(DataType::VECTOR_FLOAT, false, dim);
     field_data->FillFieldData(vec_data.data(), N);
     auto field_data_info =
         FieldDataInfo{fake_id.get(), N, std::vector<FieldDataPtr>{field_data}};

@@ -1874,7 +1877,8 @@ TEST(Sealed, SkipIndexSkipUnaryRange) {
 
     //test for int64
     std::vector<int64_t> pks = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
-    auto pk_field_data = storage::CreateFieldData(DataType::INT64, 1, 10);
+    auto pk_field_data =
+        storage::CreateFieldData(DataType::INT64, false, 1, 10);
     pk_field_data->FillFieldData(pks.data(), N);
     segment->LoadPrimitiveSkipIndex(
         pk_fid, 0, DataType::INT64, pk_field_data->Data(), N);

@@ -1915,7 +1919,8 @@ TEST(Sealed, SkipIndexSkipUnaryRange) {
 
     //test for int32
     std::vector<int32_t> int32s = {2, 2, 3, 4, 5, 6, 7, 8, 9, 12};
-    auto int32_field_data = storage::CreateFieldData(DataType::INT32, 1, 10);
+    auto int32_field_data =
+        storage::CreateFieldData(DataType::INT32, false, 1, 10);
     int32_field_data->FillFieldData(int32s.data(), N);
     segment->LoadPrimitiveSkipIndex(
         i32_fid, 0, DataType::INT32, int32_field_data->Data(), N);

@@ -1925,7 +1930,8 @@ TEST(Sealed, SkipIndexSkipUnaryRange) {
 
     //test for int16
     std::vector<int16_t> int16s = {2, 2, 3, 4, 5, 6, 7, 8, 9, 12};
-    auto int16_field_data = storage::CreateFieldData(DataType::INT16, 1, 10);
+    auto int16_field_data =
+        storage::CreateFieldData(DataType::INT16, false, 1, 10);
     int16_field_data->FillFieldData(int16s.data(), N);
     segment->LoadPrimitiveSkipIndex(
         i16_fid, 0, DataType::INT16, int16_field_data->Data(), N);

@@ -1935,7 +1941,8 @@ TEST(Sealed, SkipIndexSkipUnaryRange) {
 
     //test for int8
     std::vector<int8_t> int8s = {2, 2, 3, 4, 5, 6, 7, 8, 9, 12};
-    auto int8_field_data = storage::CreateFieldData(DataType::INT8, 1, 10);
+    auto int8_field_data =
+        storage::CreateFieldData(DataType::INT8, false, 1, 10);
     int8_field_data->FillFieldData(int8s.data(), N);
     segment->LoadPrimitiveSkipIndex(
         i8_fid, 0, DataType::INT8, int8_field_data->Data(), N);

@@ -1946,7 +1953,8 @@ TEST(Sealed, SkipIndexSkipUnaryRange) {
     // test for float
     std::vector<float> floats = {
         1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0};
-    auto float_field_data = storage::CreateFieldData(DataType::FLOAT, 1, 10);
+    auto float_field_data =
+        storage::CreateFieldData(DataType::FLOAT, false, 1, 10);
     float_field_data->FillFieldData(floats.data(), N);
     segment->LoadPrimitiveSkipIndex(
         float_fid, 0, DataType::FLOAT, float_field_data->Data(), N);

@@ -1957,7 +1965,8 @@ TEST(Sealed, SkipIndexSkipUnaryRange) {
     // test for double
     std::vector<double> doubles = {
         1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0};
-    auto double_field_data = storage::CreateFieldData(DataType::DOUBLE, 1, 10);
+    auto double_field_data =
+        storage::CreateFieldData(DataType::DOUBLE, false, 1, 10);
     double_field_data->FillFieldData(doubles.data(), N);
     segment->LoadPrimitiveSkipIndex(
         double_fid, 0, DataType::DOUBLE, double_field_data->Data(), N);

@@ -1980,7 +1989,8 @@ TEST(Sealed, SkipIndexSkipBinaryRange) {
 
     //test for int64
     std::vector<int64_t> pks = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
-    auto pk_field_data = storage::CreateFieldData(DataType::INT64, 1, 10);
+    auto pk_field_data =
+        storage::CreateFieldData(DataType::INT64, false, 1, 10);
     pk_field_data->FillFieldData(pks.data(), N);
     segment->LoadPrimitiveSkipIndex(
         pk_fid, 0, DataType::INT64, pk_field_data->Data(), N);

@@ -2015,7 +2025,8 @@ TEST(Sealed, SkipIndexSkipStringRange) {
 
     //test for string
     std::vector<std::string> strings = {"e", "f", "g", "g", "j"};
-    auto string_field_data = storage::CreateFieldData(DataType::VARCHAR, 1, N);
+    auto string_field_data =
+        storage::CreateFieldData(DataType::VARCHAR, false, 1, N);
     string_field_data->FillFieldData(strings.data(), N);
     auto string_field_data_info = FieldDataInfo{
         string_fid.get(), N, std::vector<FieldDataPtr>{string_field_data}};

@@ -2191,4 +2202,174 @@ TEST(Sealed, QueryAllFields) {
               dataset_size);
     EXPECT_EQ(float_array_result->scalars().array_data().data_size(),
               dataset_size);
+
+    EXPECT_EQ(bool_result->valid_data_size(), 0);
+    EXPECT_EQ(int8_result->valid_data_size(), 0);
+    EXPECT_EQ(int16_result->valid_data_size(), 0);
+    EXPECT_EQ(int32_result->valid_data_size(), 0);
+    EXPECT_EQ(int64_result->valid_data_size(), 0);
+    EXPECT_EQ(float_result->valid_data_size(), 0);
+    EXPECT_EQ(double_result->valid_data_size(), 0);
+    EXPECT_EQ(varchar_result->valid_data_size(), 0);
+    EXPECT_EQ(json_result->valid_data_size(), 0);
+    EXPECT_EQ(int_array_result->valid_data_size(), 0);
+    EXPECT_EQ(long_array_result->valid_data_size(), 0);
+    EXPECT_EQ(bool_array_result->valid_data_size(), 0);
+    EXPECT_EQ(string_array_result->valid_data_size(), 0);
+    EXPECT_EQ(double_array_result->valid_data_size(), 0);
+    EXPECT_EQ(float_array_result->valid_data_size(), 0);
 }
+
+TEST(Sealed, QueryAllNullableFields) {
+    auto schema = std::make_shared<Schema>();
+    auto metric_type = knowhere::metric::L2;
+    auto bool_field = schema->AddDebugField("bool", DataType::BOOL, true);
+    auto int8_field = schema->AddDebugField("int8", DataType::INT8, true);
+    auto int16_field = schema->AddDebugField("int16", DataType::INT16, true);
+    auto int32_field = schema->AddDebugField("int32", DataType::INT32, true);
+    auto int64_field = schema->AddDebugField("int64", DataType::INT64, false);
+    auto float_field = schema->AddDebugField("float", DataType::FLOAT, true);
+    auto double_field = schema->AddDebugField("double", DataType::DOUBLE, true);
+    auto varchar_field =
+        schema->AddDebugField("varchar", DataType::VARCHAR, true);
+    auto json_field = schema->AddDebugField("json", DataType::JSON, true);
+    auto int_array_field = schema->AddDebugField(
+        "int_array", DataType::ARRAY, DataType::INT8, true);
+    auto long_array_field = schema->AddDebugField(
+        "long_array", DataType::ARRAY, DataType::INT64, true);
+    auto bool_array_field = schema->AddDebugField(
+        "bool_array", DataType::ARRAY, DataType::BOOL, true);
+    auto string_array_field = schema->AddDebugField(
+        "string_array", DataType::ARRAY, DataType::VARCHAR, true);
+    auto double_array_field = schema->AddDebugField(
+        "double_array", DataType::ARRAY, DataType::DOUBLE, true);
+    auto float_array_field = schema->AddDebugField(
+        "float_array", DataType::ARRAY, DataType::FLOAT, true);
+    auto vec = schema->AddDebugField(
+        "embeddings", DataType::VECTOR_FLOAT, 128, metric_type);
+    schema->set_primary_field_id(int64_field);
+
+    std::map<std::string, std::string> index_params = {
+        {"index_type", "IVF_FLAT"},
+        {"metric_type", metric_type},
+        {"nlist", "128"}};
+    std::map<std::string, std::string> type_params = {{"dim", "128"}};
+    FieldIndexMeta fieldIndexMeta(
+        vec, std::move(index_params), std::move(type_params));
+    std::map<FieldId, FieldIndexMeta> filedMap = {{vec, fieldIndexMeta}};
+    IndexMetaPtr metaPtr =
+        std::make_shared<CollectionIndexMeta>(100000, std::move(filedMap));
+    auto segment_sealed = CreateSealedSegment(schema, metaPtr);
+    auto segment = dynamic_cast<SegmentSealedImpl*>(segment_sealed.get());
+
+    int64_t dataset_size = 1000;
+    int64_t dim = 128;
+    auto dataset = DataGen(schema, dataset_size);
+    SealedLoadFieldData(dataset, *segment);
+
+    auto bool_values = dataset.get_col<bool>(bool_field);
+    auto int8_values = dataset.get_col<int8_t>(int8_field);
+    auto int16_values = dataset.get_col<int16_t>(int16_field);
+    auto int32_values = dataset.get_col<int32_t>(int32_field);
+    auto int64_values = dataset.get_col<int64_t>(int64_field);
+    auto float_values = dataset.get_col<float>(float_field);
+    auto double_values = dataset.get_col<double>(double_field);
+    auto varchar_values = dataset.get_col<std::string>(varchar_field);
+    auto json_values = dataset.get_col<std::string>(json_field);
+    auto int_array_values = dataset.get_col<ScalarArray>(int_array_field);
+    auto long_array_values = dataset.get_col<ScalarArray>(long_array_field);
+    auto bool_array_values = dataset.get_col<ScalarArray>(bool_array_field);
+    auto string_array_values = dataset.get_col<ScalarArray>(string_array_field);
+    auto double_array_values = dataset.get_col<ScalarArray>(double_array_field);
+    auto float_array_values = dataset.get_col<ScalarArray>(float_array_field);
+    auto vector_values = dataset.get_col<float>(vec);
+
+    auto bool_valid_values = dataset.get_col_valid(bool_field);
+    auto int8_valid_values = dataset.get_col_valid(int8_field);
+    auto int16_valid_values = dataset.get_col_valid(int16_field);
+    auto int32_valid_values = dataset.get_col_valid(int32_field);
+    auto float_valid_values = dataset.get_col_valid(float_field);
+    auto double_valid_values = dataset.get_col_valid(double_field);
+    auto varchar_valid_values = dataset.get_col_valid(varchar_field);
+    auto json_valid_values = dataset.get_col_valid(json_field);
+    auto int_array_valid_values = dataset.get_col_valid(int_array_field);
+    auto long_array_valid_values = dataset.get_col_valid(long_array_field);
+    auto bool_array_valid_values = dataset.get_col_valid(bool_array_field);
+    auto string_array_valid_values = dataset.get_col_valid(string_array_field);
+    auto double_array_valid_values = dataset.get_col_valid(double_array_field);
+    auto float_array_valid_values = dataset.get_col_valid(float_array_field);
+
+    auto ids_ds = GenRandomIds(dataset_size);
+    auto bool_result =
+        segment->bulk_subscript(bool_field, ids_ds->GetIds(), dataset_size);
+    auto int8_result =
+        segment->bulk_subscript(int8_field, ids_ds->GetIds(), dataset_size);
+    auto int16_result =
+        segment->bulk_subscript(int16_field, ids_ds->GetIds(), dataset_size);
+    auto int32_result =
+        segment->bulk_subscript(int32_field, ids_ds->GetIds(), dataset_size);
+    auto int64_result =
+        segment->bulk_subscript(int64_field, ids_ds->GetIds(), dataset_size);
+    auto float_result =
+        segment->bulk_subscript(float_field, ids_ds->GetIds(), dataset_size);
+    auto double_result =
+        segment->bulk_subscript(double_field, ids_ds->GetIds(), dataset_size);
+    auto varchar_result =
+        segment->bulk_subscript(varchar_field, ids_ds->GetIds(), dataset_size);
+    auto json_result =
+        segment->bulk_subscript(json_field, ids_ds->GetIds(), dataset_size);
+    auto int_array_result = segment->bulk_subscript(
+        int_array_field, ids_ds->GetIds(), dataset_size);
+    auto long_array_result = segment->bulk_subscript(
+        long_array_field, ids_ds->GetIds(), dataset_size);
+    auto bool_array_result = segment->bulk_subscript(
+        bool_array_field, ids_ds->GetIds(), dataset_size);
+    auto string_array_result = segment->bulk_subscript(
+        string_array_field, ids_ds->GetIds(), dataset_size);
+    auto double_array_result = segment->bulk_subscript(
+        double_array_field, ids_ds->GetIds(), dataset_size);
+    auto float_array_result = segment->bulk_subscript(
+        float_array_field, ids_ds->GetIds(), dataset_size);
+    auto vec_result =
+        segment->bulk_subscript(vec, ids_ds->GetIds(), dataset_size);
+
+    EXPECT_EQ(bool_result->scalars().bool_data().data_size(), dataset_size);
+    EXPECT_EQ(int8_result->scalars().int_data().data_size(), dataset_size);
+    EXPECT_EQ(int16_result->scalars().int_data().data_size(), dataset_size);
+    EXPECT_EQ(int32_result->scalars().int_data().data_size(), dataset_size);
+    EXPECT_EQ(int64_result->scalars().long_data().data_size(), dataset_size);
+    EXPECT_EQ(float_result->scalars().float_data().data_size(), dataset_size);
+    EXPECT_EQ(double_result->scalars().double_data().data_size(), dataset_size);
+    EXPECT_EQ(varchar_result->scalars().string_data().data_size(),
+              dataset_size);
+    EXPECT_EQ(json_result->scalars().json_data().data_size(), dataset_size);
+    EXPECT_EQ(vec_result->vectors().float_vector().data_size(),
+              dataset_size * dim);
+    EXPECT_EQ(int_array_result->scalars().array_data().data_size(),
+              dataset_size);
+    EXPECT_EQ(long_array_result->scalars().array_data().data_size(),
+              dataset_size);
+    EXPECT_EQ(bool_array_result->scalars().array_data().data_size(),
+              dataset_size);
+    EXPECT_EQ(string_array_result->scalars().array_data().data_size(),
+              dataset_size);
+    EXPECT_EQ(double_array_result->scalars().array_data().data_size(),
+              dataset_size);
+    EXPECT_EQ(float_array_result->scalars().array_data().data_size(),
+              dataset_size);
+
+    EXPECT_EQ(bool_result->valid_data_size(), dataset_size);
+    EXPECT_EQ(int8_result->valid_data_size(), dataset_size);
+    EXPECT_EQ(int16_result->valid_data_size(), dataset_size);
+    EXPECT_EQ(int32_result->valid_data_size(), dataset_size);
+    EXPECT_EQ(float_result->valid_data_size(), dataset_size);
+    EXPECT_EQ(double_result->valid_data_size(), dataset_size);
+    EXPECT_EQ(varchar_result->valid_data_size(), dataset_size);
+    EXPECT_EQ(json_result->valid_data_size(), dataset_size);
+    EXPECT_EQ(int_array_result->valid_data_size(), dataset_size);
+    EXPECT_EQ(long_array_result->valid_data_size(), dataset_size);
+    EXPECT_EQ(bool_array_result->valid_data_size(), dataset_size);
+    EXPECT_EQ(string_array_result->valid_data_size(), dataset_size);
+    EXPECT_EQ(double_array_result->valid_data_size(), dataset_size);
+    EXPECT_EQ(float_array_result->valid_data_size(), dataset_size);
+}

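QueryAllFields and QueryAllNullableFields together fix the retrieval contract: valid_data_size() is 0 for a non-nullable field and equals the number of fetched rows for a nullable one. The invariant, stated as a hypothetical helper over the DataArray type used in these tests:

// Sketch only: the invariant the two tests above assert field by field.
void expect_valid_data_contract(const DataArray& arr,
                                bool nullable,
                                int64_t fetched_rows) {
    // Non-nullable fields carry no validity bitmap at all; nullable
    // fields carry exactly one flag per fetched row.
    EXPECT_EQ(arr.valid_data_size(), nullable ? fetched_rows : 0);
}
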
@@ -148,7 +148,7 @@ TEST(Util, get_common_prefix) {
     EXPECT_STREQ(common_prefix.c_str(), "");
 }
 
-TEST(Util, dis_closer){
+TEST(Util, dis_closer) {
     EXPECT_TRUE(milvus::query::dis_closer(0.1, 0.2, "L2"));
     EXPECT_FALSE(milvus::query::dis_closer(0.2, 0.1, "L2"));
     EXPECT_FALSE(milvus::query::dis_closer(0.1, 0.1, "L2"));

@@ -215,6 +215,21 @@ struct GeneratedData {
         return std::move(ret);
     }
 
+    FixedVector<bool>
+    get_col_valid(FieldId field_id) const {
+        for (const auto& target_field_data : raw_->fields_data()) {
+            if (field_id.get() == target_field_data.field_id()) {
+                auto& field_meta = schema_->operator[](field_id);
+                Assert(field_meta.is_nullable());
+                FixedVector<bool> ret(raw_->num_rows());
+                auto src_data = target_field_data.valid_data().data();
+                std::copy_n(src_data, raw_->num_rows(), ret.data());
+                return ret;
+            }
+        }
+        PanicInfo(FieldIDInvalid, "field id not find");
+    }
+
     std::unique_ptr<DataArray>
     get_col(FieldId field_id) const {
         for (const auto& target_field_data : raw_->fields_data()) {

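A short usage sketch of the new accessor: it asserts the field is nullable, then copies the per-row flags out of the generated proto. Here schema, field, and DataGen are the surrounding test utilities.

// Sketch: reading back generated validity flags for a nullable field.
auto dataset = DataGen(schema, /*N=*/1000);
FixedVector<bool> valid = dataset.get_col_valid(field);
// DataGen marks even rows valid and odd rows null (see the next hunk),
// so offset 0 is always valid.
ASSERT_TRUE(valid[0]);
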
@@ -318,8 +333,14 @@ inline GeneratedData DataGen(SchemaPtr schema,
     auto insert_data = std::make_unique<InsertRecordProto>();
     auto insert_cols = [&insert_data](
                            auto& data, int64_t count, auto& field_meta) {
+        FixedVector<bool> valid_data(count);
+        if (field_meta.is_nullable()) {
+            for (int i = 0; i < count; ++i) {
+                valid_data[i] = i % 2 == 0 ? true : false;
+            }
+        }
         auto array = milvus::segcore::CreateDataArrayFrom(
-            data.data(), count, field_meta);
+            data.data(), valid_data.data(), count, field_meta);
         insert_data->mutable_fields_data()->AddAllocated(array.release());
     };
 

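DataGen's convention, visible above, is that every even row of a nullable column is valid and every odd row is null; the retrieval tests assert against exactly this pattern. The implied counts for an N-row column, as a sketch:

#include <cstdint>

// With the alternating pattern above, an N-row nullable column holds
// ceil(N / 2) valid rows and floor(N / 2) nulls.
int64_t expected_valid_count(int64_t n) { return (n + 1) / 2; }
int64_t expected_null_count(int64_t n) { return n / 2; }
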
@@ -378,7 +399,7 @@ inline GeneratedData DataGen(SchemaPtr schema,
                 auto res = GenerateRandomSparseFloatVector(
                     N, kTestSparseDim, kTestSparseVectorDensity, seed);
                 auto array = milvus::segcore::CreateDataArrayFrom(
-                    res.get(), N, field_meta);
+                    res.get(), nullptr, N, field_meta);
                 insert_data->mutable_fields_data()->AddAllocated(
                     array.release());
                 break;

@@ -647,7 +668,7 @@ DataGenForJsonArray(SchemaPtr schema,
     auto insert_cols = [&insert_data](
                            auto& data, int64_t count, auto& field_meta) {
         auto array = milvus::segcore::CreateDataArrayFrom(
-            data.data(), count, field_meta);
+            data.data(), nullptr, count, field_meta);
         insert_data->mutable_fields_data()->AddAllocated(array.release());
     };
     for (auto field_id : schema->get_field_ids()) {

@@ -953,9 +974,30 @@ CreateFieldDataFromDataArray(ssize_t raw_count,
     auto createFieldData = [&field_data, &raw_count](const void* raw_data,
                                                      DataType data_type,
                                                      int64_t dim) {
-        field_data = storage::CreateFieldData(data_type, dim);
+        field_data = storage::CreateFieldData(data_type, false, dim);
         field_data->FillFieldData(raw_data, raw_count);
     };
+    auto createNullableFieldData = [&field_data, &raw_count](
+                                       const void* raw_data,
+                                       const bool* raw_valid_data,
+                                       DataType data_type,
+                                       int64_t dim) {
+        field_data = storage::CreateFieldData(data_type, true, dim);
+        int byteSize = (raw_count + 7) / 8;
+        uint8_t* valid_data = new uint8_t[byteSize];
+        for (int i = 0; i < raw_count; i++) {
+            bool value = raw_valid_data[i];
+            int byteIndex = i / 8;
+            int bitIndex = i % 8;
+            if (value) {
+                valid_data[byteIndex] |= (1 << bitIndex);
+            } else {
+                valid_data[byteIndex] &= ~(1 << bitIndex);
+            }
+        }
+        field_data->FillFieldData(raw_data, valid_data, raw_count);
+        delete[] valid_data;
+    };
 
     if (field_meta.is_vector()) {
         switch (field_meta.get_data_type()) {

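The new lambda packs one bool per row into an LSB-first byte bitmap before handing it to the three-argument FillFieldData. A self-contained sketch of the same packing and its inverse; unlike the lambda, it zero-initializes the buffer, so the trailing bits of the last byte are deterministic rather than uninitialized.

#include <cstdint>
#include <vector>

// Pack bools LSB-first: row i lands in byte i / 8, bit i % 8,
// matching the loop in createNullableFieldData above.
std::vector<uint8_t> pack_valid_bits(const std::vector<bool>& valid) {
    std::vector<uint8_t> out((valid.size() + 7) / 8, 0);  // zero-init
    for (size_t i = 0; i < valid.size(); ++i) {
        if (valid[i]) {
            out[i / 8] |= static_cast<uint8_t>(1u << (i % 8));
        }
    }
    return out;
}

// Inverse: read bit i back out of the packed buffer.
bool is_valid(const std::vector<uint8_t>& bits, size_t i) {
    return (bits[i / 8] >> (i % 8)) & 1u;
}
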
@@ -998,48 +1040,98 @@ CreateFieldDataFromDataArray(ssize_t raw_count,
         switch (field_meta.get_data_type()) {
             case DataType::BOOL: {
                 auto raw_data = data->scalars().bool_data().data().data();
-                createFieldData(raw_data, DataType::BOOL, dim);
+                if (field_meta.is_nullable()) {
+                    auto raw_valid_data = data->valid_data().data();
+                    createNullableFieldData(
+                        raw_data, raw_valid_data, DataType::BOOL, dim);
+                } else {
+                    createFieldData(raw_data, DataType::BOOL, dim);
+                }
                 break;
             }
             case DataType::INT8: {
                 auto src_data = data->scalars().int_data().data();
                 std::vector<int8_t> data_raw(src_data.size());
                 std::copy_n(src_data.data(), src_data.size(), data_raw.data());
-                createFieldData(data_raw.data(), DataType::INT8, dim);
+                if (field_meta.is_nullable()) {
+                    auto raw_valid_data = data->valid_data().data();
+                    createNullableFieldData(
+                        data_raw.data(), raw_valid_data, DataType::INT8, dim);
+                } else {
+                    createFieldData(data_raw.data(), DataType::INT8, dim);
+                }
                 break;
             }
             case DataType::INT16: {
                 auto src_data = data->scalars().int_data().data();
                 std::vector<int16_t> data_raw(src_data.size());
                 std::copy_n(src_data.data(), src_data.size(), data_raw.data());
-                createFieldData(data_raw.data(), DataType::INT16, dim);
+                if (field_meta.is_nullable()) {
+                    auto raw_valid_data = data->valid_data().data();
+                    createNullableFieldData(
+                        data_raw.data(), raw_valid_data, DataType::INT16, dim);
+                } else {
+                    createFieldData(data_raw.data(), DataType::INT16, dim);
+                }
                 break;
             }
             case DataType::INT32: {
                 auto raw_data = data->scalars().int_data().data().data();
-                createFieldData(raw_data, DataType::INT32, dim);
+                if (field_meta.is_nullable()) {
+                    auto raw_valid_data = data->valid_data().data();
+                    createNullableFieldData(
+                        raw_data, raw_valid_data, DataType::INT32, dim);
+                } else {
+                    createFieldData(raw_data, DataType::INT32, dim);
+                }
                 break;
             }
             case DataType::INT64: {
                 auto raw_data = data->scalars().long_data().data().data();
-                createFieldData(raw_data, DataType::INT64, dim);
+                if (field_meta.is_nullable()) {
+                    auto raw_valid_data = data->valid_data().data();
+                    createNullableFieldData(
+                        raw_data, raw_valid_data, DataType::INT64, dim);
+                } else {
+                    createFieldData(raw_data, DataType::INT64, dim);
+                }
                 break;
             }
             case DataType::FLOAT: {
                 auto raw_data = data->scalars().float_data().data().data();
-                createFieldData(raw_data, DataType::FLOAT, dim);
+                if (field_meta.is_nullable()) {
+                    auto raw_valid_data = data->valid_data().data();
+                    createNullableFieldData(
+                        raw_data, raw_valid_data, DataType::FLOAT, dim);
+                } else {
+                    createFieldData(raw_data, DataType::FLOAT, dim);
+                }
                 break;
             }
             case DataType::DOUBLE: {
                 auto raw_data = data->scalars().double_data().data().data();
-                createFieldData(raw_data, DataType::DOUBLE, dim);
+                if (field_meta.is_nullable()) {
+                    auto raw_valid_data = data->valid_data().data();
+                    createNullableFieldData(
+                        raw_data, raw_valid_data, DataType::DOUBLE, dim);
+                } else {
+                    createFieldData(raw_data, DataType::DOUBLE, dim);
+                }
                 break;
             }
             case DataType::VARCHAR: {
                 auto begin = data->scalars().string_data().data().begin();
                 auto end = data->scalars().string_data().data().end();
                 std::vector<std::string> data_raw(begin, end);
-                createFieldData(data_raw.data(), DataType::VARCHAR, dim);
+                if (field_meta.is_nullable()) {
+                    auto raw_valid_data = data->valid_data().data();
+                    createNullableFieldData(data_raw.data(),
+                                            raw_valid_data,
+                                            DataType::VARCHAR,
+                                            dim);
+                } else {
+                    createFieldData(data_raw.data(), DataType::VARCHAR, dim);
+                }
                 break;
             }
             case DataType::JSON: {

@@ -1049,7 +1141,13 @@ CreateFieldDataFromDataArray(ssize_t raw_count,
                     auto str = src_data.Get(i);
                     data_raw[i] = Json(simdjson::padded_string(str));
                 }
-                createFieldData(data_raw.data(), DataType::JSON, dim);
+                if (field_meta.is_nullable()) {
+                    auto raw_valid_data = data->valid_data().data();
+                    createNullableFieldData(
+                        data_raw.data(), raw_valid_data, DataType::JSON, dim);
+                } else {
+                    createFieldData(data_raw.data(), DataType::JSON, dim);
+                }
                 break;
             }
             case DataType::ARRAY: {

@@ -1058,7 +1156,13 @@ CreateFieldDataFromDataArray(ssize_t raw_count,
                 for (int i = 0; i < src_data.size(); i++) {
                     data_raw[i] = Array(src_data.at(i));
                 }
-                createFieldData(data_raw.data(), DataType::ARRAY, dim);
+                if (field_meta.is_nullable()) {
+                    auto raw_valid_data = data->valid_data().data();
+                    createNullableFieldData(
+                        data_raw.data(), raw_valid_data, DataType::ARRAY, dim);
+                } else {
+                    createFieldData(data_raw.data(), DataType::ARRAY, dim);
+                }
                 break;
             }
             default: {

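Every scalar case in this switch now repeats the same nullable/non-nullable branch. A sketch of how the dispatch could be folded into one helper; this is a refactoring suggestion, not code from the commit. Here data, field_meta, and the two lambdas are as defined in this function.

// Sketch only: a possible consolidation of the repeated branch above.
auto dispatchFieldData = [&](const void* raw, DataType type, int64_t dim) {
    if (field_meta.is_nullable()) {
        createNullableFieldData(raw, data->valid_data().data(), type, dim);
    } else {
        createFieldData(raw, type, dim);
    }
};
// e.g. the INT32 case would reduce to:
// dispatchFieldData(data->scalars().int_data().data().data(),
//                   DataType::INT32, dim);
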
@@ -1077,8 +1181,8 @@ SealedLoadFieldData(const GeneratedData& dataset,
                     bool with_mmap = false) {
     auto row_count = dataset.row_ids_.size();
     {
-        auto field_data =
-            std::make_shared<milvus::FieldData<int64_t>>(DataType::INT64);
+        auto field_data = std::make_shared<milvus::FieldData<int64_t>>(
+            DataType::INT64, false);
         field_data->FillFieldData(dataset.row_ids_.data(), row_count);
         auto field_data_info =
             FieldDataInfo(RowFieldID.get(),

@@ -1087,8 +1191,8 @@ SealedLoadFieldData(const GeneratedData& dataset,
         seg.LoadFieldData(RowFieldID, field_data_info);
     }
     {
-        auto field_data =
-            std::make_shared<milvus::FieldData<int64_t>>(DataType::INT64);
+        auto field_data = std::make_shared<milvus::FieldData<int64_t>>(
+            DataType::INT64, false);
         field_data->FillFieldData(dataset.timestamps_.data(), row_count);
         auto field_data_info =
             FieldDataInfo(TimestampFieldID.get(),

@@ -119,29 +119,62 @@ CheckSearchResultDuplicate(const std::vector<CSearchResult>& results,
 const char*
 get_default_schema_config() {
     static std::string conf = R"(name: "default-collection"
-fields: <
-  fieldID: 100
-  name: "fakevec"
-  data_type: FloatVector
-  type_params: <
-    key: "dim"
-    value: "16"
-  >
-  index_params: <
-    key: "metric_type"
-    value: "L2"
-  >
->
-fields: <
-  fieldID: 101
-  name: "age"
-  data_type: Int64
-  is_primary_key: true
->)";
+fields: <
+  fieldID: 100
+  name: "fakevec"
+  data_type: FloatVector
+  type_params: <
+    key: "dim"
+    value: "16"
+  >
+  index_params: <
+    key: "metric_type"
+    value: "L2"
+  >
+>
+fields: <
+  fieldID: 101
+  name: "age"
+  data_type: Int64
+  is_primary_key: true
+>)";
+    static std::string fake_conf = "";
     return conf.c_str();
 }
 
+const char*
+get_default_schema_config_nullable() {
+    static std::string conf = R"(name: "default-collection"
+fields: <
+  fieldID: 100
+  name: "fakevec"
+  data_type: FloatVector
+  type_params: <
+    key: "dim"
+    value: "16"
+  >
+  index_params: <
+    key: "metric_type"
+    value: "L2"
+  >
+>
+fields: <
+  fieldID: 101
+  name: "age"
+  data_type: Int64
+  is_primary_key: true
+>
+fields: <
+  fieldID: 102
+  name: "nullable"
+  data_type: Int32
+  nullable:true
+>)";
+    static std::string fake_conf = "";
+    return conf.c_str();
+}
+
 
 CStatus
 CSearch(CSegmentInterface c_segment,
         CSearchPlan c_plan,

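The nullable variant differs from the default config only by the trailing Int32 field marked nullable:true. A trivial sketch of selecting between the two in a test; both functions are defined in the hunk above.

// Sketch: choosing the schema text for a test case.
const char* pick_schema(bool with_nullable_field) {
    return with_nullable_field ? get_default_schema_config_nullable()
                               : get_default_schema_config();
}
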
@@ -89,15 +89,15 @@ PrepareInsertBinlog(int64_t collection_id,
     };
 
     {
-        auto field_data =
-            std::make_shared<milvus::FieldData<int64_t>>(DataType::INT64);
+        auto field_data = std::make_shared<milvus::FieldData<int64_t>>(
+            DataType::INT64, false);
         field_data->FillFieldData(dataset.row_ids_.data(), row_count);
         auto path = prefix + "/" + std::to_string(RowFieldID.get());
         SaveFieldData(field_data, path, RowFieldID.get());
     }
     {
-        auto field_data =
-            std::make_shared<milvus::FieldData<int64_t>>(DataType::INT64);
+        auto field_data = std::make_shared<milvus::FieldData<int64_t>>(
+            DataType::INT64, false);
         field_data->FillFieldData(dataset.timestamps_.data(), row_count);
         auto path = prefix + "/" + std::to_string(TimestampFieldID.get());
         SaveFieldData(field_data, path, TimestampFieldID.get());

@@ -221,7 +221,7 @@ func (codec *IndexFileBinlogCodec) DeserializeImpl(blobs []*Blob) (
         switch dataType {
         // just for backward compatibility
         case schemapb.DataType_Int8:
-            // todo: smellthemoon, valid_data may need to check when create index
+            // todo: valid_data may need to check when create index
             content, _, err := eventReader.GetByteFromPayload()
             if err != nil {
                 log.Warn("failed to get byte from payload",