mirror of https://github.com/milvus-io/milvus.git
support get entity by ids in segcore (#5456)
Signed-off-by: fluorinedog <fluorinedog@gmail.com>pull/5475/head
parent
fce792b8bf
commit
b1a9aea6a6
|
@ -71,14 +71,13 @@ endif()
|
|||
# **************************** Compiler arguments ****************************
|
||||
message( STATUS "Building Milvus CPU version" )
|
||||
|
||||
|
||||
#append_flags( CMAKE_CXX_FLAGS
|
||||
# FLAGS
|
||||
# "-fPIC"
|
||||
# "-DELPP_THREAD_SAFE"
|
||||
# "-fopenmp"
|
||||
# "-Werror"
|
||||
# )
|
||||
append_flags( CMAKE_CXX_FLAGS
|
||||
FLAGS
|
||||
"-fPIC"
|
||||
"-DELPP_THREAD_SAFE"
|
||||
"-fopenmp"
|
||||
"-Werror"
|
||||
)
|
||||
|
||||
# **************************** Coding style check tools ****************************
|
||||
find_package( ClangTools )
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
*cmake-build-debug*
|
||||
*cmake-build-release*
|
||||
*cmake-build-*
|
||||
*cmake_build*
|
||||
*/build/*
|
||||
*thirdparty*
|
||||
*src/grpc*
|
||||
*output*
|
||||
|
|
|
@ -27,6 +27,7 @@ add_subdirectory( exceptions )
|
|||
add_subdirectory( utils )
|
||||
add_subdirectory( log )
|
||||
add_subdirectory( pb )
|
||||
add_subdirectory( storage )
|
||||
add_subdirectory( segcore )
|
||||
add_subdirectory( cache )
|
||||
add_subdirectory( query )
|
||||
|
|
|
@ -109,6 +109,7 @@ datatype_is_floating(DataType datatype) {
|
|||
|
||||
class FieldMeta {
|
||||
public:
|
||||
static const FieldMeta RowIdMeta;
|
||||
FieldMeta(const FieldMeta&) = delete;
|
||||
FieldMeta(FieldMeta&&) = default;
|
||||
FieldMeta&
|
||||
|
@ -179,4 +180,5 @@ class FieldMeta {
|
|||
DataType type_ = DataType::NONE;
|
||||
std::optional<VectorInfo> vector_info_;
|
||||
};
|
||||
|
||||
} // namespace milvus
|
||||
|
|
|
@ -32,7 +32,7 @@ RepeatedKeyValToMap(const google::protobuf::RepeatedPtrField<proto::common::KeyV
|
|||
std::shared_ptr<Schema>
|
||||
Schema::ParseFrom(const milvus::proto::schema::CollectionSchema& schema_proto) {
|
||||
auto schema = std::make_shared<Schema>();
|
||||
schema->set_auto_id(schema_proto.autoid());
|
||||
// schema->set_auto_id(schema_proto.autoid());
|
||||
|
||||
// NOTE: only two system
|
||||
|
||||
|
@ -51,11 +51,6 @@ Schema::ParseFrom(const milvus::proto::schema::CollectionSchema& schema_proto) {
|
|||
|
||||
auto data_type = DataType(child.data_type());
|
||||
|
||||
if (child.is_primary_key()) {
|
||||
AssertInfo(!schema->primary_key_offset_opt_.has_value(), "repetitive primary key");
|
||||
schema->primary_key_offset_opt_ = field_offset;
|
||||
}
|
||||
|
||||
if (datatype_is_vector(data_type)) {
|
||||
auto type_map = RepeatedKeyValToMap(child.type_params());
|
||||
auto index_map = RepeatedKeyValToMap(child.index_params());
|
||||
|
@ -71,13 +66,22 @@ Schema::ParseFrom(const milvus::proto::schema::CollectionSchema& schema_proto) {
|
|||
} else {
|
||||
schema->AddField(name, field_id, data_type);
|
||||
}
|
||||
|
||||
if (child.is_primary_key()) {
|
||||
AssertInfo(!schema->get_primary_key_offset().has_value(), "repetitive primary key");
|
||||
Assert(!schema_proto.autoid());
|
||||
schema->set_primary_key(field_offset);
|
||||
}
|
||||
}
|
||||
if (schema->is_auto_id_) {
|
||||
AssertInfo(!schema->primary_key_offset_opt_.has_value(), "auto id mode: shouldn't have primary key");
|
||||
if (schema->get_is_auto_id()) {
|
||||
AssertInfo(!schema->get_primary_key_offset().has_value(), "auto id mode: shouldn't have primary key");
|
||||
} else {
|
||||
AssertInfo(schema->primary_key_offset_opt_.has_value(), "primary key should be specified when autoId is off");
|
||||
AssertInfo(schema->get_primary_key_offset().has_value(), "primary key should be specified when autoId is off");
|
||||
}
|
||||
|
||||
return schema;
|
||||
}
|
||||
|
||||
const FieldMeta FieldMeta::RowIdMeta(FieldName("RowID"), FieldId(0), DataType::INT64);
|
||||
|
||||
} // namespace milvus
|
||||
|
|
|
@ -11,8 +11,8 @@
|
|||
|
||||
#pragma once
|
||||
#include "FieldMeta.h"
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <memory>
|
||||
|
@ -66,6 +66,12 @@ class Schema {
|
|||
is_auto_id_ = is_auto_id;
|
||||
}
|
||||
|
||||
void
|
||||
set_primary_key(FieldOffset field_offset) {
|
||||
is_auto_id_ = false;
|
||||
this->primary_key_offset_opt_ = field_offset;
|
||||
}
|
||||
|
||||
bool
|
||||
get_is_auto_id() const {
|
||||
return is_auto_id_;
|
||||
|
|
|
@ -17,7 +17,9 @@
|
|||
#include <boost/align/aligned_allocator.hpp>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
#include <NamedType/named_type.hpp>
|
||||
#include "pb/schema.pb.h"
|
||||
|
||||
namespace milvus {
|
||||
using Timestamp = uint64_t; // TODO: use TiKV-like timestamp
|
||||
|
@ -25,6 +27,11 @@ using engine::DataType;
|
|||
using engine::FieldElementType;
|
||||
using engine::idx_t;
|
||||
|
||||
using ScalarArray = proto::schema::ScalarField;
|
||||
using DataArray = proto::schema::FieldData;
|
||||
using VectorArray = proto::schema::VectorField;
|
||||
using IdArray = proto::schema::IDs;
|
||||
|
||||
using MetricType = faiss::MetricType;
|
||||
|
||||
MetricType
|
||||
|
@ -43,6 +50,8 @@ constexpr std::false_type always_false{};
|
|||
template <typename T>
|
||||
using aligned_vector = std::vector<T, boost::alignment::aligned_allocator<T, 64>>;
|
||||
|
||||
struct EntityResult {};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
struct QueryResult {
|
||||
QueryResult() = default;
|
||||
|
@ -72,9 +81,22 @@ struct QueryResult {
|
|||
|
||||
using QueryResultPtr = std::shared_ptr<QueryResult>;
|
||||
|
||||
using FieldId = fluent::NamedType<int64_t, struct FieldIdTag, fluent::Comparable, fluent::Hashable>;
|
||||
using FieldName = fluent::NamedType<std::string, struct FieldNameTag, fluent::Comparable, fluent::Hashable>;
|
||||
using FieldOffset = fluent::NamedType<int64_t, struct FieldOffsetTag, fluent::Comparable, fluent::Hashable>;
|
||||
struct EntityResults {
|
||||
// use protobuf results to simplify
|
||||
};
|
||||
|
||||
namespace impl {
|
||||
// hide identifier name to make auto completion happy
|
||||
struct FieldIdTag;
|
||||
struct FieldNameTag;
|
||||
struct FieldOffsetTag;
|
||||
struct SegOffsetTag;
|
||||
}; // namespace impl
|
||||
|
||||
using FieldId = fluent::NamedType<int64_t, impl::FieldIdTag, fluent::Comparable, fluent::Hashable>;
|
||||
using FieldName = fluent::NamedType<std::string, impl::FieldNameTag, fluent::Comparable, fluent::Hashable>;
|
||||
using FieldOffset = fluent::NamedType<int64_t, impl::FieldOffsetTag, fluent::Comparable, fluent::Hashable>;
|
||||
using SegOffset = fluent::NamedType<int64_t, impl::SegOffsetTag, fluent::Arithmetic>;
|
||||
|
||||
using BitsetView = faiss::BitsetView;
|
||||
inline BitsetView
|
||||
|
|
|
@ -27,6 +27,7 @@ set(SEGCORE_FILES
|
|||
SegmentInterface.cpp
|
||||
SegcoreConfig.cpp
|
||||
segcore_init_c.cpp
|
||||
ScalarIndex.cpp
|
||||
)
|
||||
add_library(milvus_segcore SHARED
|
||||
${SEGCORE_FILES}
|
||||
|
@ -38,5 +39,6 @@ target_link_libraries(milvus_segcore
|
|||
milvus_common
|
||||
milvus_query
|
||||
milvus_utils
|
||||
milvus_storage
|
||||
)
|
||||
|
||||
|
|
|
@ -0,0 +1,61 @@
|
|||
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
// or implied. See the License for the specific language governing permissions and limitations under the License
|
||||
|
||||
#include "ScalarIndex.h"
|
||||
|
||||
namespace milvus::segcore {
|
||||
std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
|
||||
ScalarIndexVector::do_search_ids(const IdArray& ids) const {
|
||||
auto res_ids = std::make_unique<IdArray>();
|
||||
// TODO: support string array
|
||||
static_assert(std::is_same_v<T, int64_t>);
|
||||
Assert(ids.has_int_id());
|
||||
auto src_ids = ids.int_id();
|
||||
auto dst_ids = res_ids->mutable_int_id();
|
||||
std::vector<SegOffset> dst_offsets;
|
||||
|
||||
// TODO: a possible optimization:
|
||||
// TODO: sort the input id array to make access cache friendly
|
||||
|
||||
// assume no repeated key now
|
||||
// TODO: support repeated key
|
||||
for (auto id : src_ids.data()) {
|
||||
using Pair = std::pair<T, SegOffset>;
|
||||
auto [iter_beg, iter_end] =
|
||||
std::equal_range(mapping_.begin(), mapping_.end(), std::make_pair(id, SegOffset(0)),
|
||||
[](const Pair& left, const Pair& right) { return left.first < right.first; });
|
||||
|
||||
if (iter_beg == iter_end) {
|
||||
// no data
|
||||
continue;
|
||||
}
|
||||
// TODO: for repeated key, decide the final offset with Timestamp
|
||||
// no repeated key, simplified logic
|
||||
Assert(iter_beg + 1 == iter_end);
|
||||
auto [entry_id, entry_offset] = *iter_beg;
|
||||
|
||||
dst_ids->add_data(entry_id);
|
||||
dst_offsets.push_back(entry_offset);
|
||||
}
|
||||
return {std::move(res_ids), std::move(dst_offsets)};
|
||||
}
|
||||
void
|
||||
ScalarIndexVector::append_data(const ScalarIndexVector::T* ids, int64_t count, SegOffset base) {
|
||||
for (int64_t i = 0; i < count; ++i) {
|
||||
auto offset = base + SegOffset(i);
|
||||
mapping_.emplace_back(ids[i], offset);
|
||||
}
|
||||
}
|
||||
void
|
||||
ScalarIndexVector::build() {
|
||||
std::sort(mapping_.begin(), mapping_.end());
|
||||
}
|
||||
} // namespace milvus::segcore
|
|
@ -0,0 +1,47 @@
|
|||
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
// or implied. See the License for the specific language governing permissions and limitations under the License
|
||||
|
||||
#pragma once
|
||||
#include "exceptions/EasyAssert.h"
|
||||
#include "common/Types.h"
|
||||
#include "pb/schema.pb.h"
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
|
||||
namespace milvus::segcore {
|
||||
|
||||
class ScalarIndexBase {
|
||||
public:
|
||||
virtual std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
|
||||
do_search_ids(const IdArray& ids) const = 0;
|
||||
virtual ~ScalarIndexBase() = default;
|
||||
};
|
||||
|
||||
class ScalarIndexVector : public ScalarIndexBase {
|
||||
using T = int64_t;
|
||||
|
||||
public:
|
||||
// TODO: use proto::schema::ids
|
||||
void
|
||||
append_data(const T* ids, int64_t count, SegOffset base);
|
||||
|
||||
void
|
||||
build();
|
||||
|
||||
std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
|
||||
do_search_ids(const IdArray& ids) const override;
|
||||
|
||||
private:
|
||||
std::vector<std::pair<T, SegOffset>> mapping_;
|
||||
};
|
||||
} // namespace milvus::segcore
|
|
@ -84,4 +84,131 @@ SegmentInternalInterface::Search(const query::Plan* plan,
|
|||
return results;
|
||||
}
|
||||
|
||||
// Note: this is temporary solution.
|
||||
// modify bulk script implement to make process more clear
|
||||
static std::unique_ptr<ScalarArray>
|
||||
CreateScalarArrayFrom(const void* data_raw, int64_t count, DataType data_type) {
|
||||
auto scalar_array = std::make_unique<ScalarArray>();
|
||||
switch (data_type) {
|
||||
case DataType::BOOL: {
|
||||
auto data = reinterpret_cast<const double*>(data_raw);
|
||||
auto obj = scalar_array->mutable_bool_data();
|
||||
obj->mutable_data()->Add(data, data + count);
|
||||
break;
|
||||
}
|
||||
case DataType::INT8: {
|
||||
auto data = reinterpret_cast<const int8_t*>(data_raw);
|
||||
auto obj = scalar_array->mutable_int_data();
|
||||
obj->mutable_data()->Add(data, data + count);
|
||||
break;
|
||||
}
|
||||
case DataType::INT16: {
|
||||
auto data = reinterpret_cast<const int16_t*>(data_raw);
|
||||
auto obj = scalar_array->mutable_int_data();
|
||||
obj->mutable_data()->Add(data, data + count);
|
||||
break;
|
||||
}
|
||||
case DataType::INT32: {
|
||||
auto data = reinterpret_cast<const int16_t*>(data_raw);
|
||||
auto obj = scalar_array->mutable_int_data();
|
||||
obj->mutable_data()->Add(data, data + count);
|
||||
break;
|
||||
}
|
||||
case DataType::INT64: {
|
||||
auto data = reinterpret_cast<const int64_t*>(data_raw);
|
||||
auto obj = scalar_array->mutable_long_data();
|
||||
obj->mutable_data()->Add(data, data + count);
|
||||
break;
|
||||
}
|
||||
case DataType::FLOAT: {
|
||||
auto data = reinterpret_cast<const float*>(data_raw);
|
||||
auto obj = scalar_array->mutable_float_data();
|
||||
obj->mutable_data()->Add(data, data + count);
|
||||
break;
|
||||
}
|
||||
case DataType::DOUBLE: {
|
||||
auto data = reinterpret_cast<const double*>(data_raw);
|
||||
auto obj = scalar_array->mutable_double_data();
|
||||
obj->mutable_data()->Add(data, data + count);
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
PanicInfo("unsupported datatype");
|
||||
}
|
||||
}
|
||||
return scalar_array;
|
||||
}
|
||||
|
||||
static std::unique_ptr<DataArray>
|
||||
CreateDataArrayFrom(const void* data_raw, int64_t count, const FieldMeta& field_meta) {
|
||||
auto data_type = field_meta.get_data_type();
|
||||
auto data_array = std::make_unique<DataArray>();
|
||||
|
||||
if (!datatype_is_vector(data_type)) {
|
||||
auto scalar_array = CreateScalarArrayFrom(data_raw, count, data_type);
|
||||
data_array->set_allocated_scalars(scalar_array.release());
|
||||
} else {
|
||||
auto vector_array = data_array->mutable_vectors();
|
||||
auto dim = field_meta.get_dim();
|
||||
vector_array->set_dim(dim);
|
||||
switch (data_type) {
|
||||
case DataType::VECTOR_FLOAT: {
|
||||
auto length = count * dim;
|
||||
auto data = reinterpret_cast<const float*>(data_raw);
|
||||
auto obj = vector_array->mutable_float_vector();
|
||||
obj->mutable_data()->Add(data, data + length);
|
||||
break;
|
||||
}
|
||||
case DataType::VECTOR_BINARY: {
|
||||
Assert(dim % 8 == 0);
|
||||
auto num_bytes = count * dim / 8;
|
||||
auto data = reinterpret_cast<const char*>(data_raw);
|
||||
auto obj = vector_array->mutable_binary_vector();
|
||||
obj->assign(data, num_bytes);
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
PanicInfo("unsupportted datatype");
|
||||
}
|
||||
}
|
||||
}
|
||||
return data_array;
|
||||
}
|
||||
|
||||
std::unique_ptr<DataArray>
|
||||
SegmentInternalInterface::BulkSubScript(FieldOffset field_offset, const SegOffset* seg_offsets, int64_t count) const {
|
||||
if (field_offset.get() >= 0) {
|
||||
auto& field_meta = get_schema()[field_offset];
|
||||
aligned_vector<char> data(field_meta.get_sizeof() * count);
|
||||
bulk_subscript(field_offset, (const int64_t*)seg_offsets, count, data.data());
|
||||
return CreateDataArrayFrom(data.data(), count, field_meta);
|
||||
} else {
|
||||
Assert(field_offset.get() == -1);
|
||||
aligned_vector<char> data(sizeof(int64_t) * count);
|
||||
bulk_subscript(SystemFieldType::RowId, (const int64_t*)seg_offsets, count, data.data());
|
||||
return CreateDataArrayFrom(data.data(), count, FieldMeta::RowIdMeta);
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<proto::milvus::RetrieveResults>
|
||||
SegmentInternalInterface::GetEntityById(const std::vector<FieldOffset>& field_offsets, const IdArray& id_array) const {
|
||||
auto results = std::make_unique<proto::milvus::RetrieveResults>();
|
||||
|
||||
auto [ids_, seg_offsets] = search_ids(id_array);
|
||||
results->set_allocated_ids(ids_.release());
|
||||
|
||||
auto fields_data = results->mutable_fields_data();
|
||||
for (auto field_offset : field_offsets) {
|
||||
auto col = BulkSubScript(field_offset, seg_offsets.data(), seg_offsets.size());
|
||||
fields_data->AddAllocated(col.release());
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
|
||||
SegmentInternalInterface::search_ids(const IdArray& id_array) const {
|
||||
// TODO: protobuf in a nutshell
|
||||
PanicInfo("unimplemented");
|
||||
}
|
||||
|
||||
} // namespace milvus::segcore
|
||||
|
|
|
@ -18,6 +18,10 @@
|
|||
#include <knowhere/index/vector_index/VecIndex.h>
|
||||
#include "common/SystemProperty.h"
|
||||
#include "query/PlanNode.h"
|
||||
#include "pb/schema.pb.h"
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
|
||||
namespace milvus::segcore {
|
||||
|
||||
|
@ -35,6 +39,9 @@ class SegmentInterface {
|
|||
const Timestamp timestamps[],
|
||||
int64_t num_groups) const = 0;
|
||||
|
||||
virtual std::unique_ptr<proto::milvus::RetrieveResults>
|
||||
GetEntityById(const std::vector<FieldOffset>& field_offsets, const IdArray& id_array) const = 0;
|
||||
|
||||
virtual int64_t
|
||||
GetMemoryUsageInBytes() const = 0;
|
||||
|
||||
|
@ -79,6 +86,9 @@ class SegmentInternalInterface : public SegmentInterface {
|
|||
void
|
||||
FillTargetEntry(const query::Plan* plan, QueryResult& results) const override;
|
||||
|
||||
std::unique_ptr<proto::milvus::RetrieveResults>
|
||||
GetEntityById(const std::vector<FieldOffset>& field_offsets, const IdArray& id_array) const override;
|
||||
|
||||
public:
|
||||
virtual void
|
||||
vector_search(int64_t vec_count,
|
||||
|
@ -109,6 +119,7 @@ class SegmentInternalInterface : public SegmentInterface {
|
|||
virtual const knowhere::Index*
|
||||
chunk_index_impl(FieldOffset field_offset, int64_t chunk_id) const = 0;
|
||||
|
||||
// TODO remove system fields
|
||||
// calculate output[i] = Vec[seg_offsets[i]}, where Vec binds to system_type
|
||||
virtual void
|
||||
bulk_subscript(SystemFieldType system_type, const int64_t* seg_offsets, int64_t count, void* output) const = 0;
|
||||
|
@ -117,6 +128,14 @@ class SegmentInternalInterface : public SegmentInterface {
|
|||
virtual void
|
||||
bulk_subscript(FieldOffset field_offset, const int64_t* seg_offsets, int64_t count, void* output) const = 0;
|
||||
|
||||
// TODO: special hack: FieldOffset == -1 -> RowId.
|
||||
// TODO: remove this hack when transfer is done
|
||||
virtual std::unique_ptr<DataArray>
|
||||
BulkSubScript(FieldOffset field_offset, const SegOffset* seg_offsets, int64_t count) const;
|
||||
|
||||
virtual std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
|
||||
search_ids(const IdArray& id_array) const;
|
||||
|
||||
virtual void
|
||||
check_search(const query::Plan* plan) const = 0;
|
||||
|
||||
|
|
|
@ -11,8 +11,9 @@
|
|||
#pragma once
|
||||
#include <memory>
|
||||
|
||||
#include "SegmentInterface.h"
|
||||
#include "segcore/SegmentInterface.h"
|
||||
#include "common/LoadInfo.h"
|
||||
#include <utility>
|
||||
|
||||
namespace milvus::segcore {
|
||||
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
#include "query/SearchOnSealed.h"
|
||||
#include "query/ScalarIndex.h"
|
||||
#include "query/SearchBruteForce.h"
|
||||
|
||||
namespace milvus::segcore {
|
||||
|
||||
static inline void
|
||||
|
@ -57,6 +58,14 @@ SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& info) {
|
|||
auto field_id = FieldId(info.field_id);
|
||||
Assert(info.blob);
|
||||
Assert(info.row_count > 0);
|
||||
auto create_index = [](const int64_t* data, int64_t size) {
|
||||
Assert(size);
|
||||
auto pk_index = std::make_unique<ScalarIndexVector>();
|
||||
pk_index->append_data(data, size, SegOffset(0));
|
||||
pk_index->build();
|
||||
return pk_index;
|
||||
};
|
||||
|
||||
if (SystemProperty::Instance().IsSystem(field_id)) {
|
||||
auto system_field_type = SystemProperty::Instance().GetSystemFieldType(field_id);
|
||||
Assert(system_field_type == SystemFieldType::RowId);
|
||||
|
@ -65,14 +74,15 @@ SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& info) {
|
|||
// prepare data
|
||||
aligned_vector<idx_t> vec_data(info.row_count);
|
||||
std::copy_n(src_ptr, info.row_count, vec_data.data());
|
||||
auto pk_index = create_index(vec_data.data(), vec_data.size());
|
||||
|
||||
// write data under lock
|
||||
std::unique_lock lck(mutex_);
|
||||
update_row_count(info.row_count);
|
||||
AssertInfo(row_ids_.empty(), "already exists");
|
||||
row_ids_ = std::move(vec_data);
|
||||
primary_key_index_ = std::move(pk_index);
|
||||
++system_ready_count_;
|
||||
|
||||
} else {
|
||||
// prepare data
|
||||
auto field_offset = schema_->get_offset(field_id);
|
||||
|
@ -90,6 +100,11 @@ SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& info) {
|
|||
index = query::generate_scalar_index(span, field_meta.get_data_type());
|
||||
}
|
||||
|
||||
std::unique_ptr<ScalarIndexBase> pk_index_;
|
||||
if (schema_->get_primary_key_offset() == field_offset) {
|
||||
pk_index_ = create_index((const int64_t*)vec_data.data(), info.row_count);
|
||||
}
|
||||
|
||||
// write data under lock
|
||||
std::unique_lock lck(mutex_);
|
||||
update_row_count(info.row_count);
|
||||
|
@ -103,6 +118,9 @@ SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& info) {
|
|||
field_datas_[field_offset.get()] = std::move(vec_data);
|
||||
scalar_indexings_[field_offset.get()] = std::move(index);
|
||||
}
|
||||
if (schema_->get_primary_key_offset() == field_offset) {
|
||||
primary_key_index_ = std::move(pk_index_);
|
||||
}
|
||||
|
||||
set_bit(field_data_ready_bitset_, field_offset, true);
|
||||
}
|
||||
|
@ -135,6 +153,7 @@ SegmentSealedImpl::chunk_data_impl(FieldOffset field_offset, int64_t chunk_id) c
|
|||
|
||||
const knowhere::Index*
|
||||
SegmentSealedImpl::chunk_index_impl(FieldOffset field_offset, int64_t chunk_id) const {
|
||||
Assert(chunk_id == 0);
|
||||
// TODO: support scalar index
|
||||
auto ptr = scalar_indexings_[field_offset.get()].get();
|
||||
Assert(ptr);
|
||||
|
@ -305,7 +324,7 @@ SegmentSealedImpl::bulk_subscript_impl(
|
|||
auto offset = seg_offsets[i];
|
||||
auto dst = dst_vec + i * element_sizeof;
|
||||
const char* src;
|
||||
if (offset != 0) {
|
||||
if (offset != -1) {
|
||||
src = src_vec + element_sizeof * offset;
|
||||
} else {
|
||||
src = none.data();
|
||||
|
@ -383,6 +402,43 @@ SegmentSealedImpl::HasFieldData(FieldId field_id) const {
|
|||
}
|
||||
}
|
||||
|
||||
std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
|
||||
SegmentSealedImpl::search_ids(const IdArray& id_array) const {
|
||||
AssertInfo(id_array.has_int_id(), "string ids are not implemented");
|
||||
auto arr = id_array.int_id();
|
||||
Assert(primary_key_index_);
|
||||
return primary_key_index_->do_search_ids(id_array);
|
||||
}
|
||||
|
||||
// void
|
||||
// SegmentSealedImpl::build_index_if_primary_key(FieldId field_id) {
|
||||
// auto create_index = [](const int64_t* data, int64_t size) {
|
||||
// Assert(size);
|
||||
// auto pk_index = std::make_unique<ScalarIndexVector>();
|
||||
// pk_index->append_data(data, size, SegOffset(0));
|
||||
// pk_index->build();
|
||||
// return pk_index;
|
||||
// };
|
||||
//
|
||||
// if (SystemProperty::Instance().IsSystem(field_id)) {
|
||||
// Assert(SystemProperty::Instance().GetSystemFieldType(field_id) == SystemFieldType::RowId);
|
||||
// Assert(schema_->get_is_auto_id());
|
||||
// Assert(row_count_opt_.has_value());
|
||||
// auto row_count = row_count_opt_.value();
|
||||
// Assert(row_count == row_ids_.size());
|
||||
// primary_key_index_ = create_index(row_ids_.data(), row_count);
|
||||
//
|
||||
// } else if (this->schema_->get_primary_key_offset() == schema_->get_offset(field_id)) {
|
||||
// auto pk_offset = schema_->get_offset(field_id);
|
||||
// auto& field_data = field_datas_[pk_offset.get()];
|
||||
// Assert(row_count_opt_.has_value());
|
||||
// auto row_count = row_count_opt_.value();
|
||||
// Assert(field_data.size() == row_count * sizeof(int64_t));
|
||||
//
|
||||
// primary_key_index_ = create_index((const int64_t*)field_data.data(), row_count);
|
||||
// }
|
||||
// }
|
||||
|
||||
SegmentSealedPtr
|
||||
CreateSealedSegment(SchemaPtr schema) {
|
||||
return std::make_unique<SegmentSealedImpl>(schema);
|
||||
|
|
|
@ -12,9 +12,11 @@
|
|||
#pragma once
|
||||
#include "segcore/SegmentSealed.h"
|
||||
#include "SealedIndexingRecord.h"
|
||||
#include "ScalarIndex.h"
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
namespace milvus::segcore {
|
||||
class SegmentSealedImpl : public SegmentSealed {
|
||||
|
@ -107,17 +109,30 @@ class SegmentSealedImpl : public SegmentSealed {
|
|||
return system_ready_count_ == 1;
|
||||
}
|
||||
|
||||
virtual std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
|
||||
search_ids(const IdArray& id_array) const;
|
||||
|
||||
// virtual void
|
||||
// build_index_if_primary_key(FieldId field_id);
|
||||
|
||||
private:
|
||||
// segment loading state
|
||||
boost::dynamic_bitset<> field_data_ready_bitset_;
|
||||
boost::dynamic_bitset<> vecindex_ready_bitset_;
|
||||
std::atomic<int> system_ready_count_ = 0;
|
||||
// segment datas
|
||||
|
||||
// TODO: generate index for scalar
|
||||
std::optional<int64_t> row_count_opt_;
|
||||
|
||||
// TODO: use protobuf format
|
||||
// TODO: remove duplicated indexing
|
||||
std::vector<std::unique_ptr<knowhere::Index>> scalar_indexings_;
|
||||
SealedIndexingRecord vecindexs_;
|
||||
std::unique_ptr<ScalarIndexBase> primary_key_index_;
|
||||
|
||||
std::vector<aligned_vector<char>> field_datas_;
|
||||
|
||||
SealedIndexingRecord vecindexs_;
|
||||
aligned_vector<idx_t> row_ids_;
|
||||
SchemaPtr schema_;
|
||||
};
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
# Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
# or implied. See the License for the specific language governing permissions and limitations under the License
|
||||
|
||||
set(MILVUS_STORAGE_SRCS
|
||||
StorageCache.cpp
|
||||
)
|
||||
|
||||
add_library(milvus_storage ${MILVUS_STORAGE_SRCS})
|
||||
target_link_libraries(milvus_storage milvus_proto milvus_utils knowhere boost_bitset_ext)
|
|
@ -0,0 +1,12 @@
|
|||
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
// or implied. See the License for the specific language governing permissions and limitations under the License
|
||||
|
||||
#include "storage/StorageCache.h"
|
|
@ -0,0 +1,32 @@
|
|||
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
// or implied. See the License for the specific language governing permissions and limitations under the License
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <optional>
|
||||
#include "common/Types.h"
|
||||
|
||||
namespace milvus::storage {
|
||||
|
||||
// struct DataBlock {
|
||||
// // TODO
|
||||
// };
|
||||
|
||||
// struct BlockId {
|
||||
// int64_t segment_id_;
|
||||
// FieldId field_id_;
|
||||
// };
|
||||
|
||||
// class StorageCacheEngine {
|
||||
// // TODO
|
||||
// };
|
||||
|
||||
} // namespace milvus::storage
|
|
@ -23,7 +23,7 @@ set(MILVUS_TEST_FILES
|
|||
test_indexing.cpp
|
||||
test_query.cpp
|
||||
test_expr.cpp
|
||||
test_bitmap.cpp
|
||||
test_bitmap.cpp
|
||||
test_binary.cpp
|
||||
test_index_wrapper.cpp
|
||||
test_common.cpp
|
||||
|
@ -35,6 +35,7 @@ set(MILVUS_TEST_FILES
|
|||
init_gtest.cpp
|
||||
test_init.cpp
|
||||
test_plan_proto.cpp
|
||||
test_get_entity_by_ids.cpp
|
||||
)
|
||||
|
||||
add_executable(all_tests
|
||||
|
@ -49,6 +50,7 @@ set(INDEX_BUILDER_TEST_FILES
|
|||
add_executable(index_builder_test
|
||||
${INDEX_BUILDER_TEST_FILES}
|
||||
)
|
||||
|
||||
target_link_libraries(index_builder_test
|
||||
gtest
|
||||
gtest_main
|
||||
|
@ -66,5 +68,5 @@ target_link_libraries(all_tests
|
|||
pthread
|
||||
)
|
||||
|
||||
install (TARGETS all_tests DESTINATION unittest)
|
||||
install (TARGETS index_builder_test DESTINATION unittest)
|
||||
install(TARGETS all_tests DESTINATION unittest)
|
||||
install(TARGETS index_builder_test DESTINATION unittest)
|
||||
|
|
|
@ -0,0 +1,157 @@
|
|||
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
// or implied. See the License for the specific language governing permissions and limitations under the License
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "test_utils/DataGen.h"
|
||||
#include "segcore/ScalarIndex.h"
|
||||
using namespace milvus;
|
||||
using namespace milvus::segcore;
|
||||
|
||||
TEST(GetEntityByIds, ScalarIndex) {
|
||||
SUCCEED();
|
||||
auto index = std::make_unique<ScalarIndexVector>();
|
||||
std::vector<int64_t> data;
|
||||
int N = 1000;
|
||||
auto req_ids = std::make_unique<IdArray>();
|
||||
auto req_ids_arr = req_ids->mutable_int_id();
|
||||
|
||||
for (int i = 0; i < N; ++i) {
|
||||
data.push_back(i * 3 % N);
|
||||
req_ids_arr->add_data(i);
|
||||
}
|
||||
index->append_data(data.data(), N, SegOffset(10000));
|
||||
index->build();
|
||||
|
||||
auto [res_ids, res_offsets] = index->do_search_ids(*req_ids);
|
||||
auto res_ids_arr = res_ids->int_id();
|
||||
|
||||
for (int i = 0; i < N; ++i) {
|
||||
auto res_offset = res_offsets[i].get() - 10000;
|
||||
auto res_id = res_ids_arr.data(i);
|
||||
auto std_id = (res_offset * 3 % N);
|
||||
ASSERT_EQ(res_id, std_id);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(GetEntityByIds, AUTOID) {
|
||||
auto schema = std::make_shared<Schema>();
|
||||
auto fid_64 = schema->AddDebugField("i64", DataType::INT64);
|
||||
auto DIM = 16;
|
||||
auto fid_vec = schema->AddDebugField("vector_64", DataType::VECTOR_FLOAT, DIM, MetricType::METRIC_L2);
|
||||
|
||||
int64_t N = 10000;
|
||||
int64_t req_size = 10;
|
||||
auto choose = [=](int i) { return i * 3 % N; };
|
||||
|
||||
auto dataset = DataGen(schema, N);
|
||||
auto segment = CreateSealedSegment(schema);
|
||||
SealedLoader(dataset, *segment);
|
||||
|
||||
auto req_ids = std::make_unique<IdArray>();
|
||||
auto req_ids_arr = req_ids->mutable_int_id();
|
||||
|
||||
auto i64_col = dataset.get_col<int64_t>(0);
|
||||
auto vf_col = dataset.get_col<float>(1);
|
||||
for (int i = 0; i < req_size; ++i) {
|
||||
req_ids_arr->add_data(dataset.row_ids_[choose(i)]);
|
||||
}
|
||||
|
||||
// should be ruled out
|
||||
req_ids_arr->add_data(-1);
|
||||
|
||||
std::vector<FieldOffset> target_offsets{FieldOffset(0), FieldOffset(1)};
|
||||
auto retrieve_results = segment->GetEntityById(target_offsets, *req_ids);
|
||||
auto ids = retrieve_results->ids().int_id();
|
||||
Assert(retrieve_results->fields_data_size() == target_offsets.size());
|
||||
FieldOffset field_offset(0);
|
||||
auto field0 = retrieve_results->fields_data(0);
|
||||
Assert(field0.has_scalars());
|
||||
auto field0_data = field0.scalars().long_data();
|
||||
for (int i = 0; i < req_size; ++i) {
|
||||
auto id = ids.data(i);
|
||||
auto index = choose(i);
|
||||
ASSERT_EQ(id, dataset.row_ids_[index]);
|
||||
auto data = field0_data.data(i);
|
||||
ASSERT_EQ(data, i64_col[index]);
|
||||
}
|
||||
|
||||
auto field1 = retrieve_results->fields_data(1);
|
||||
Assert(field1.has_vectors());
|
||||
auto field1_data = field1.vectors().float_vector();
|
||||
ASSERT_EQ(field1_data.data_size(), DIM * req_size);
|
||||
|
||||
for (int i = 0; i < req_size; ++i) {
|
||||
for (int d = 0; d < DIM; ++d) {
|
||||
auto index = choose(i);
|
||||
auto data = field1_data.data(i * DIM + d);
|
||||
auto ref = vf_col[index * DIM + d];
|
||||
ASSERT_EQ(data, ref);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(GetEntityByIds, PrimaryKey) {
|
||||
auto schema = std::make_shared<Schema>();
|
||||
auto fid_64 = schema->AddDebugField("counter_i64", DataType::INT64);
|
||||
auto DIM = 16;
|
||||
auto fid_vec = schema->AddDebugField("vector_64", DataType::VECTOR_FLOAT, DIM, MetricType::METRIC_L2);
|
||||
schema->set_primary_key(FieldOffset(0));
|
||||
|
||||
int64_t N = 10000;
|
||||
int64_t req_size = 10;
|
||||
auto choose = [=](int i) { return i * 3 % N; };
|
||||
|
||||
auto dataset = DataGen(schema, N);
|
||||
auto segment = CreateSealedSegment(schema);
|
||||
SealedLoader(dataset, *segment);
|
||||
|
||||
auto req_ids = std::make_unique<IdArray>();
|
||||
auto req_ids_arr = req_ids->mutable_int_id();
|
||||
|
||||
auto i64_col = dataset.get_col<int64_t>(0);
|
||||
auto vf_col = dataset.get_col<float>(1);
|
||||
for (int i = 0; i < req_size; ++i) {
|
||||
req_ids_arr->add_data(i64_col[choose(i)]);
|
||||
}
|
||||
|
||||
// should be ruled out
|
||||
req_ids_arr->add_data(-1);
|
||||
|
||||
std::vector<FieldOffset> target_offsets{FieldOffset(0), FieldOffset(1)};
|
||||
auto retrieve_results = segment->GetEntityById(target_offsets, *req_ids);
|
||||
auto ids = retrieve_results->ids().int_id();
|
||||
Assert(retrieve_results->fields_data_size() == target_offsets.size());
|
||||
FieldOffset field_offset(0);
|
||||
auto field0 = retrieve_results->fields_data(0);
|
||||
Assert(field0.has_scalars());
|
||||
auto field0_data = field0.scalars().long_data();
|
||||
for (int i = 0; i < req_size; ++i) {
|
||||
auto id = ids.data(i);
|
||||
auto index = choose(i);
|
||||
ASSERT_EQ(id, i64_col[index]);
|
||||
auto data = field0_data.data(i);
|
||||
ASSERT_EQ(data, i64_col[index]);
|
||||
}
|
||||
|
||||
auto field1 = retrieve_results->fields_data(1);
|
||||
Assert(field1.has_vectors());
|
||||
auto field1_data = field1.vectors().float_vector();
|
||||
ASSERT_EQ(field1_data.data_size(), DIM * req_size);
|
||||
|
||||
for (int i = 0; i < req_size; ++i) {
|
||||
for (int d = 0; d < DIM; ++d) {
|
||||
auto index = choose(i);
|
||||
auto data = field1_data.data(i * DIM + d);
|
||||
auto ref = vf_col[index * DIM + d];
|
||||
ASSERT_EQ(data, ref);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -186,7 +186,7 @@ DataGen(SchemaPtr schema, int64_t N, uint64_t seed = 42) {
|
|||
res.row_ids_.push_back(i);
|
||||
res.timestamps_.push_back(i);
|
||||
}
|
||||
|
||||
std::shuffle(res.row_ids_.begin(), res.row_ids_.end(), er);
|
||||
res.generate_rows(N, schema);
|
||||
return std::move(res);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue