mirror of https://github.com/milvus-io/milvus.git
fix: fix binary vector data size (#33750)
issue: https://github.com/milvus-io/milvus/issues/22837 - fix byte size wrong for binary vectors - fix the expect/actual error msg Signed-off-by: chasingegg <chao.gao@zilliz.com>pull/33968/head
parent
a789c60380
commit
0d20303e54
|
@ -139,6 +139,16 @@ GetDataTypeSize(DataType data_type, int dim = 1) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
inline size_t
|
||||||
|
GetVecRowSize(int64_t dim) {
|
||||||
|
if constexpr (std::is_same_v<T, bin1>) {
|
||||||
|
return (dim / 8) * sizeof(bin1);
|
||||||
|
} else {
|
||||||
|
return dim * sizeof(T);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: use magic_enum when available
|
// TODO: use magic_enum when available
|
||||||
inline std::string
|
inline std::string
|
||||||
GetDataTypeName(DataType data_type) {
|
GetDataTypeName(DataType data_type) {
|
||||||
|
@ -393,6 +403,18 @@ IndexIsSparse(const IndexType& index_type) {
|
||||||
index_type == knowhere::IndexEnum::INDEX_SPARSE_WAND;
|
index_type == knowhere::IndexEnum::INDEX_SPARSE_WAND;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline bool
|
||||||
|
IsFloatVectorMetricType(const MetricType& metric_type) {
|
||||||
|
return metric_type == knowhere::metric::L2 ||
|
||||||
|
metric_type == knowhere::metric::IP ||
|
||||||
|
metric_type == knowhere::metric::COSINE;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline bool
|
||||||
|
IsBinaryVectorMetricType(const MetricType& metric_type) {
|
||||||
|
return !IsFloatVectorMetricType(metric_type);
|
||||||
|
}
|
||||||
|
|
||||||
// Plus 1 because we can't use greater(>) symbol
|
// Plus 1 because we can't use greater(>) symbol
|
||||||
constexpr size_t REF_SIZE_THRESHOLD = 16 + 1;
|
constexpr size_t REF_SIZE_THRESHOLD = 16 + 1;
|
||||||
|
|
||||||
|
|
|
@ -274,7 +274,7 @@ IndexFactory::CreateVectorIndex(
|
||||||
index_type, metric_type, version, file_manager_context);
|
index_type, metric_type, version, file_manager_context);
|
||||||
}
|
}
|
||||||
case DataType::VECTOR_BINARY: {
|
case DataType::VECTOR_BINARY: {
|
||||||
return std::make_unique<VectorMemIndex<uint8_t>>(
|
return std::make_unique<VectorMemIndex<bin1>>(
|
||||||
index_type, metric_type, version, file_manager_context);
|
index_type, metric_type, version, file_manager_context);
|
||||||
}
|
}
|
||||||
case DataType::VECTOR_FLOAT16: {
|
case DataType::VECTOR_FLOAT16: {
|
||||||
|
@ -360,7 +360,7 @@ IndexFactory::CreateVectorIndex(
|
||||||
create_index_info, file_manager_context, space);
|
create_index_info, file_manager_context, space);
|
||||||
}
|
}
|
||||||
case DataType::VECTOR_BINARY: {
|
case DataType::VECTOR_BINARY: {
|
||||||
return std::make_unique<VectorMemIndex<uint8_t>>(
|
return std::make_unique<VectorMemIndex<bin1>>(
|
||||||
create_index_info, file_manager_context, space);
|
create_index_info, file_manager_context, space);
|
||||||
}
|
}
|
||||||
case DataType::VECTOR_FLOAT16: {
|
case DataType::VECTOR_FLOAT16: {
|
||||||
|
|
|
@ -91,6 +91,20 @@ SetValueToConfig(Config& cfg, const std::string& key, const T value) {
|
||||||
cfg[key] = value;
|
cfg[key] = value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
inline void
|
||||||
|
CheckMetricTypeSupport(const MetricType& metric_type) {
|
||||||
|
if constexpr (std::is_same_v<T, bin1>) {
|
||||||
|
AssertInfo(
|
||||||
|
IsBinaryVectorMetricType(metric_type),
|
||||||
|
"binary vector does not float vector metric type: " + metric_type);
|
||||||
|
} else {
|
||||||
|
AssertInfo(
|
||||||
|
IsFloatVectorMetricType(metric_type),
|
||||||
|
"float vector does not binary vector metric type: " + metric_type);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int64_t
|
int64_t
|
||||||
GetDimFromConfig(const Config& config);
|
GetDimFromConfig(const Config& config);
|
||||||
|
|
||||||
|
|
|
@ -42,6 +42,7 @@ VectorDiskAnnIndex<T>::VectorDiskAnnIndex(
|
||||||
const IndexVersion& version,
|
const IndexVersion& version,
|
||||||
const storage::FileManagerContext& file_manager_context)
|
const storage::FileManagerContext& file_manager_context)
|
||||||
: VectorIndex(index_type, metric_type) {
|
: VectorIndex(index_type, metric_type) {
|
||||||
|
CheckMetricTypeSupport<T>(metric_type);
|
||||||
file_manager_ =
|
file_manager_ =
|
||||||
std::make_shared<storage::DiskFileManagerImpl>(file_manager_context);
|
std::make_shared<storage::DiskFileManagerImpl>(file_manager_context);
|
||||||
AssertInfo(file_manager_ != nullptr, "create file manager failed!");
|
AssertInfo(file_manager_ != nullptr, "create file manager failed!");
|
||||||
|
@ -80,6 +81,7 @@ VectorDiskAnnIndex<T>::VectorDiskAnnIndex(
|
||||||
std::shared_ptr<milvus_storage::Space> space,
|
std::shared_ptr<milvus_storage::Space> space,
|
||||||
const storage::FileManagerContext& file_manager_context)
|
const storage::FileManagerContext& file_manager_context)
|
||||||
: space_(space), VectorIndex(index_type, metric_type) {
|
: space_(space), VectorIndex(index_type, metric_type) {
|
||||||
|
CheckMetricTypeSupport<T>(metric_type);
|
||||||
file_manager_ = std::make_shared<storage::DiskFileManagerImpl>(
|
file_manager_ = std::make_shared<storage::DiskFileManagerImpl>(
|
||||||
file_manager_context, file_manager_context.space_);
|
file_manager_context, file_manager_context.space_);
|
||||||
AssertInfo(file_manager_ != nullptr, "create file manager failed!");
|
AssertInfo(file_manager_ != nullptr, "create file manager failed!");
|
||||||
|
@ -316,7 +318,7 @@ VectorDiskAnnIndex<T>::BuildWithDataset(const DatasetPtr& dataset,
|
||||||
local_chunk_manager->Write(local_data_path, offset, &dim, sizeof(dim));
|
local_chunk_manager->Write(local_data_path, offset, &dim, sizeof(dim));
|
||||||
offset += sizeof(dim);
|
offset += sizeof(dim);
|
||||||
|
|
||||||
auto data_size = num * dim * sizeof(T);
|
size_t data_size = static_cast<size_t>(num) * milvus::GetVecRowSize<T>(dim);
|
||||||
auto raw_data = const_cast<void*>(milvus::GetDatasetTensor(dataset));
|
auto raw_data = const_cast<void*>(milvus::GetDatasetTensor(dataset));
|
||||||
local_chunk_manager->Write(local_data_path, offset, raw_data, data_size);
|
local_chunk_manager->Write(local_data_path, offset, raw_data, data_size);
|
||||||
|
|
||||||
|
@ -448,12 +450,7 @@ VectorDiskAnnIndex<T>::GetVector(const DatasetPtr dataset) const {
|
||||||
auto tensor = res.value()->GetTensor();
|
auto tensor = res.value()->GetTensor();
|
||||||
auto row_num = res.value()->GetRows();
|
auto row_num = res.value()->GetRows();
|
||||||
auto dim = res.value()->GetDim();
|
auto dim = res.value()->GetDim();
|
||||||
int64_t data_size;
|
int64_t data_size = milvus::GetVecRowSize<T>(dim) * row_num;
|
||||||
if constexpr (std::is_same_v<T, bin1>) {
|
|
||||||
data_size = dim / 8 * row_num;
|
|
||||||
} else {
|
|
||||||
data_size = dim * row_num * sizeof(T);
|
|
||||||
}
|
|
||||||
std::vector<uint8_t> raw_data;
|
std::vector<uint8_t> raw_data;
|
||||||
raw_data.resize(data_size);
|
raw_data.resize(data_size);
|
||||||
memcpy(raw_data.data(), tensor, data_size);
|
memcpy(raw_data.data(), tensor, data_size);
|
||||||
|
|
|
@ -61,6 +61,7 @@ VectorMemIndex<T>::VectorMemIndex(
|
||||||
const IndexVersion& version,
|
const IndexVersion& version,
|
||||||
const storage::FileManagerContext& file_manager_context)
|
const storage::FileManagerContext& file_manager_context)
|
||||||
: VectorIndex(index_type, metric_type) {
|
: VectorIndex(index_type, metric_type) {
|
||||||
|
CheckMetricTypeSupport<T>(metric_type);
|
||||||
AssertInfo(!is_unsupported(index_type, metric_type),
|
AssertInfo(!is_unsupported(index_type, metric_type),
|
||||||
index_type + " doesn't support metric: " + metric_type);
|
index_type + " doesn't support metric: " + metric_type);
|
||||||
if (file_manager_context.Valid()) {
|
if (file_manager_context.Valid()) {
|
||||||
|
@ -90,6 +91,7 @@ VectorMemIndex<T>::VectorMemIndex(
|
||||||
: VectorIndex(create_index_info.index_type, create_index_info.metric_type),
|
: VectorIndex(create_index_info.index_type, create_index_info.metric_type),
|
||||||
space_(space),
|
space_(space),
|
||||||
create_index_info_(create_index_info) {
|
create_index_info_(create_index_info) {
|
||||||
|
CheckMetricTypeSupport<T>(create_index_info.metric_type);
|
||||||
AssertInfo(!is_unsupported(create_index_info.index_type,
|
AssertInfo(!is_unsupported(create_index_info.index_type,
|
||||||
create_index_info.metric_type),
|
create_index_info.metric_type),
|
||||||
create_index_info.index_type +
|
create_index_info.index_type +
|
||||||
|
@ -668,12 +670,7 @@ VectorMemIndex<T>::GetVector(const DatasetPtr dataset) const {
|
||||||
auto tensor = res.value()->GetTensor();
|
auto tensor = res.value()->GetTensor();
|
||||||
auto row_num = res.value()->GetRows();
|
auto row_num = res.value()->GetRows();
|
||||||
auto dim = res.value()->GetDim();
|
auto dim = res.value()->GetDim();
|
||||||
int64_t data_size;
|
int64_t data_size = milvus::GetVecRowSize<T>(dim) * row_num;
|
||||||
if constexpr (std::is_same_v<T, bin1>) {
|
|
||||||
data_size = dim / 8 * row_num;
|
|
||||||
} else {
|
|
||||||
data_size = dim * row_num * sizeof(T);
|
|
||||||
}
|
|
||||||
std::vector<uint8_t> raw_data;
|
std::vector<uint8_t> raw_data;
|
||||||
raw_data.resize(data_size);
|
raw_data.resize(data_size);
|
||||||
memcpy(raw_data.data(), tensor, data_size);
|
memcpy(raw_data.data(), tensor, data_size);
|
||||||
|
|
|
@ -414,7 +414,7 @@ DiskFileManagerImpl::CacheRawDataToDisk(
|
||||||
field_data->FillFieldData(col_data);
|
field_data->FillFieldData(col_data);
|
||||||
dim = field_data->get_dim();
|
dim = field_data->get_dim();
|
||||||
auto data_size =
|
auto data_size =
|
||||||
field_data->get_num_rows() * index_meta_.dim * sizeof(DataType);
|
field_data->get_num_rows() * milvus::GetVecRowSize<DataType>(dim);
|
||||||
local_chunk_manager->Write(local_data_path,
|
local_chunk_manager->Write(local_data_path,
|
||||||
write_offset,
|
write_offset,
|
||||||
const_cast<void*>(field_data->Data()),
|
const_cast<void*>(field_data->Data()),
|
||||||
|
@ -516,8 +516,8 @@ DiskFileManagerImpl::CacheRawDataToDisk(std::vector<std::string> remote_files) {
|
||||||
"inconsistent dim value in multi binlogs!");
|
"inconsistent dim value in multi binlogs!");
|
||||||
dim = field_data->get_dim();
|
dim = field_data->get_dim();
|
||||||
|
|
||||||
auto data_size =
|
auto data_size = field_data->get_num_rows() *
|
||||||
field_data->get_num_rows() * dim * sizeof(DataType);
|
milvus::GetVecRowSize<DataType>(dim);
|
||||||
local_chunk_manager->Write(
|
local_chunk_manager->Write(
|
||||||
local_data_path,
|
local_data_path,
|
||||||
write_offset,
|
write_offset,
|
||||||
|
|
|
@ -103,8 +103,8 @@ class GrowingTest
|
||||||
public:
|
public:
|
||||||
void
|
void
|
||||||
SetUp() override {
|
SetUp() override {
|
||||||
auto index_type = std::get<0>(GetParam());
|
index_type = std::get<0>(GetParam());
|
||||||
auto metric_type = std::get<1>(GetParam());
|
metric_type = std::get<1>(GetParam());
|
||||||
if (index_type == knowhere::IndexEnum::INDEX_FAISS_IVFFLAT ||
|
if (index_type == knowhere::IndexEnum::INDEX_FAISS_IVFFLAT ||
|
||||||
index_type == knowhere::IndexEnum::INDEX_FAISS_IVFFLAT_CC) {
|
index_type == knowhere::IndexEnum::INDEX_FAISS_IVFFLAT_CC) {
|
||||||
data_type = DataType::VECTOR_FLOAT;
|
data_type = DataType::VECTOR_FLOAT;
|
||||||
|
|
|
@ -329,8 +329,7 @@ class IndexTest : public ::testing::TestWithParam<Param> {
|
||||||
index_type == knowhere::IndexEnum::INDEX_SPARSE_WAND) {
|
index_type == knowhere::IndexEnum::INDEX_SPARSE_WAND) {
|
||||||
is_sparse = true;
|
is_sparse = true;
|
||||||
vec_field_data_type = milvus::DataType::VECTOR_SPARSE_FLOAT;
|
vec_field_data_type = milvus::DataType::VECTOR_SPARSE_FLOAT;
|
||||||
} else if (index_type == knowhere::IndexEnum::INDEX_FAISS_BIN_IVFFLAT ||
|
} else if (IsBinaryVectorMetricType(metric_type)) {
|
||||||
index_type == knowhere::IndexEnum::INDEX_FAISS_BIN_IDMAP) {
|
|
||||||
is_binary = true;
|
is_binary = true;
|
||||||
vec_field_data_type = milvus::DataType::VECTOR_BINARY;
|
vec_field_data_type = milvus::DataType::VECTOR_BINARY;
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -141,7 +141,7 @@ func (v *validateUtil) checkAligned(data []*schemapb.FieldData, schema *typeutil
|
||||||
}
|
}
|
||||||
errDimMismatch := func(fieldName string, dataDim int64, schemaDim int64) error {
|
errDimMismatch := func(fieldName string, dataDim int64, schemaDim int64) error {
|
||||||
msg := fmt.Sprintf("the dim (%d) of field data(%s) is not equal to schema dim (%d)", dataDim, fieldName, schemaDim)
|
msg := fmt.Sprintf("the dim (%d) of field data(%s) is not equal to schema dim (%d)", dataDim, fieldName, schemaDim)
|
||||||
return merr.WrapErrParameterInvalid(dataDim, schemaDim, msg)
|
return merr.WrapErrParameterInvalid(schemaDim, dataDim, msg)
|
||||||
}
|
}
|
||||||
for _, field := range data {
|
for _, field := range data {
|
||||||
switch field.GetType() {
|
switch field.GetType() {
|
||||||
|
|
Loading…
Reference in New Issue