support get entity by ids in segcore (#5456)

Signed-off-by: fluorinedog <fluorinedog@gmail.com>
pull/5475/head
FluorineDog 2021-05-28 10:39:30 +08:00 committed by GitHub
parent fce792b8bf
commit b1a9aea6a6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 613 additions and 31 deletions

View File

@ -71,14 +71,13 @@ endif()
# **************************** Compiler arguments ****************************
message( STATUS "Building Milvus CPU version" )
#append_flags( CMAKE_CXX_FLAGS
# FLAGS
# "-fPIC"
# "-DELPP_THREAD_SAFE"
# "-fopenmp"
# "-Werror"
# )
append_flags( CMAKE_CXX_FLAGS
FLAGS
"-fPIC"
"-DELPP_THREAD_SAFE"
"-fopenmp"
"-Werror"
)
# **************************** Coding style check tools ****************************
find_package( ClangTools )

View File

@ -1,6 +1,6 @@
*cmake-build-debug*
*cmake-build-release*
*cmake-build-*
*cmake_build*
*/build/*
*thirdparty*
*src/grpc*
*output*

View File

@ -27,6 +27,7 @@ add_subdirectory( exceptions )
add_subdirectory( utils )
add_subdirectory( log )
add_subdirectory( pb )
add_subdirectory( storage )
add_subdirectory( segcore )
add_subdirectory( cache )
add_subdirectory( query )

View File

@ -109,6 +109,7 @@ datatype_is_floating(DataType datatype) {
class FieldMeta {
public:
static const FieldMeta RowIdMeta;
FieldMeta(const FieldMeta&) = delete;
FieldMeta(FieldMeta&&) = default;
FieldMeta&
@ -179,4 +180,5 @@ class FieldMeta {
DataType type_ = DataType::NONE;
std::optional<VectorInfo> vector_info_;
};
} // namespace milvus

View File

@ -32,7 +32,7 @@ RepeatedKeyValToMap(const google::protobuf::RepeatedPtrField<proto::common::KeyV
std::shared_ptr<Schema>
Schema::ParseFrom(const milvus::proto::schema::CollectionSchema& schema_proto) {
auto schema = std::make_shared<Schema>();
schema->set_auto_id(schema_proto.autoid());
// schema->set_auto_id(schema_proto.autoid());
// NOTE: only two system
@ -51,11 +51,6 @@ Schema::ParseFrom(const milvus::proto::schema::CollectionSchema& schema_proto) {
auto data_type = DataType(child.data_type());
if (child.is_primary_key()) {
AssertInfo(!schema->primary_key_offset_opt_.has_value(), "repetitive primary key");
schema->primary_key_offset_opt_ = field_offset;
}
if (datatype_is_vector(data_type)) {
auto type_map = RepeatedKeyValToMap(child.type_params());
auto index_map = RepeatedKeyValToMap(child.index_params());
@ -71,13 +66,22 @@ Schema::ParseFrom(const milvus::proto::schema::CollectionSchema& schema_proto) {
} else {
schema->AddField(name, field_id, data_type);
}
if (child.is_primary_key()) {
AssertInfo(!schema->get_primary_key_offset().has_value(), "repetitive primary key");
Assert(!schema_proto.autoid());
schema->set_primary_key(field_offset);
}
}
if (schema->is_auto_id_) {
AssertInfo(!schema->primary_key_offset_opt_.has_value(), "auto id mode: shouldn't have primary key");
if (schema->get_is_auto_id()) {
AssertInfo(!schema->get_primary_key_offset().has_value(), "auto id mode: shouldn't have primary key");
} else {
AssertInfo(schema->primary_key_offset_opt_.has_value(), "primary key should be specified when autoId is off");
AssertInfo(schema->get_primary_key_offset().has_value(), "primary key should be specified when autoId is off");
}
return schema;
}
const FieldMeta FieldMeta::RowIdMeta(FieldName("RowID"), FieldId(0), DataType::INT64);
} // namespace milvus

View File

@ -11,8 +11,8 @@
#pragma once
#include "FieldMeta.h"
#include <vector>
#include <utility>
#include <vector>
#include <string>
#include <unordered_map>
#include <memory>
@ -66,6 +66,12 @@ class Schema {
is_auto_id_ = is_auto_id;
}
void
set_primary_key(FieldOffset field_offset) {
is_auto_id_ = false;
this->primary_key_offset_opt_ = field_offset;
}
bool
get_is_auto_id() const {
return is_auto_id_;

View File

@ -17,7 +17,9 @@
#include <boost/align/aligned_allocator.hpp>
#include <memory>
#include <vector>
#include <utility>
#include <NamedType/named_type.hpp>
#include "pb/schema.pb.h"
namespace milvus {
using Timestamp = uint64_t; // TODO: use TiKV-like timestamp
@ -25,6 +27,11 @@ using engine::DataType;
using engine::FieldElementType;
using engine::idx_t;
using ScalarArray = proto::schema::ScalarField;
using DataArray = proto::schema::FieldData;
using VectorArray = proto::schema::VectorField;
using IdArray = proto::schema::IDs;
using MetricType = faiss::MetricType;
MetricType
@ -43,6 +50,8 @@ constexpr std::false_type always_false{};
template <typename T>
using aligned_vector = std::vector<T, boost::alignment::aligned_allocator<T, 64>>;
struct EntityResult {};
///////////////////////////////////////////////////////////////////////////////////////////////////
struct QueryResult {
QueryResult() = default;
@ -72,9 +81,22 @@ struct QueryResult {
using QueryResultPtr = std::shared_ptr<QueryResult>;
using FieldId = fluent::NamedType<int64_t, struct FieldIdTag, fluent::Comparable, fluent::Hashable>;
using FieldName = fluent::NamedType<std::string, struct FieldNameTag, fluent::Comparable, fluent::Hashable>;
using FieldOffset = fluent::NamedType<int64_t, struct FieldOffsetTag, fluent::Comparable, fluent::Hashable>;
struct EntityResults {
// use protobuf results to simplify
};
namespace impl {
// hide identifier name to make auto completion happy
struct FieldIdTag;
struct FieldNameTag;
struct FieldOffsetTag;
struct SegOffsetTag;
}; // namespace impl
using FieldId = fluent::NamedType<int64_t, impl::FieldIdTag, fluent::Comparable, fluent::Hashable>;
using FieldName = fluent::NamedType<std::string, impl::FieldNameTag, fluent::Comparable, fluent::Hashable>;
using FieldOffset = fluent::NamedType<int64_t, impl::FieldOffsetTag, fluent::Comparable, fluent::Hashable>;
using SegOffset = fluent::NamedType<int64_t, impl::SegOffsetTag, fluent::Arithmetic>;
using BitsetView = faiss::BitsetView;
inline BitsetView

View File

@ -27,6 +27,7 @@ set(SEGCORE_FILES
SegmentInterface.cpp
SegcoreConfig.cpp
segcore_init_c.cpp
ScalarIndex.cpp
)
add_library(milvus_segcore SHARED
${SEGCORE_FILES}
@ -38,5 +39,6 @@ target_link_libraries(milvus_segcore
milvus_common
milvus_query
milvus_utils
milvus_storage
)

View File

@ -0,0 +1,61 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include "ScalarIndex.h"
namespace milvus::segcore {
std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
ScalarIndexVector::do_search_ids(const IdArray& ids) const {
auto res_ids = std::make_unique<IdArray>();
// TODO: support string array
static_assert(std::is_same_v<T, int64_t>);
Assert(ids.has_int_id());
auto src_ids = ids.int_id();
auto dst_ids = res_ids->mutable_int_id();
std::vector<SegOffset> dst_offsets;
// TODO: a possible optimization:
// TODO: sort the input id array to make access cache friendly
// assume no repeated key now
// TODO: support repeated key
for (auto id : src_ids.data()) {
using Pair = std::pair<T, SegOffset>;
auto [iter_beg, iter_end] =
std::equal_range(mapping_.begin(), mapping_.end(), std::make_pair(id, SegOffset(0)),
[](const Pair& left, const Pair& right) { return left.first < right.first; });
if (iter_beg == iter_end) {
// no data
continue;
}
// TODO: for repeated key, decide the final offset with Timestamp
// no repeated key, simplified logic
Assert(iter_beg + 1 == iter_end);
auto [entry_id, entry_offset] = *iter_beg;
dst_ids->add_data(entry_id);
dst_offsets.push_back(entry_offset);
}
return {std::move(res_ids), std::move(dst_offsets)};
}
void
ScalarIndexVector::append_data(const ScalarIndexVector::T* ids, int64_t count, SegOffset base) {
for (int64_t i = 0; i < count; ++i) {
auto offset = base + SegOffset(i);
mapping_.emplace_back(ids[i], offset);
}
}
void
ScalarIndexVector::build() {
std::sort(mapping_.begin(), mapping_.end());
}
} // namespace milvus::segcore

View File

@ -0,0 +1,47 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include "exceptions/EasyAssert.h"
#include "common/Types.h"
#include "pb/schema.pb.h"
#include <memory>
#include <vector>
#include <utility>
namespace milvus::segcore {
class ScalarIndexBase {
public:
virtual std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
do_search_ids(const IdArray& ids) const = 0;
virtual ~ScalarIndexBase() = default;
};
class ScalarIndexVector : public ScalarIndexBase {
using T = int64_t;
public:
// TODO: use proto::schema::ids
void
append_data(const T* ids, int64_t count, SegOffset base);
void
build();
std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
do_search_ids(const IdArray& ids) const override;
private:
std::vector<std::pair<T, SegOffset>> mapping_;
};
} // namespace milvus::segcore

View File

@ -84,4 +84,131 @@ SegmentInternalInterface::Search(const query::Plan* plan,
return results;
}
// Note: this is temporary solution.
// modify bulk script implement to make process more clear
static std::unique_ptr<ScalarArray>
CreateScalarArrayFrom(const void* data_raw, int64_t count, DataType data_type) {
auto scalar_array = std::make_unique<ScalarArray>();
switch (data_type) {
case DataType::BOOL: {
auto data = reinterpret_cast<const double*>(data_raw);
auto obj = scalar_array->mutable_bool_data();
obj->mutable_data()->Add(data, data + count);
break;
}
case DataType::INT8: {
auto data = reinterpret_cast<const int8_t*>(data_raw);
auto obj = scalar_array->mutable_int_data();
obj->mutable_data()->Add(data, data + count);
break;
}
case DataType::INT16: {
auto data = reinterpret_cast<const int16_t*>(data_raw);
auto obj = scalar_array->mutable_int_data();
obj->mutable_data()->Add(data, data + count);
break;
}
case DataType::INT32: {
auto data = reinterpret_cast<const int16_t*>(data_raw);
auto obj = scalar_array->mutable_int_data();
obj->mutable_data()->Add(data, data + count);
break;
}
case DataType::INT64: {
auto data = reinterpret_cast<const int64_t*>(data_raw);
auto obj = scalar_array->mutable_long_data();
obj->mutable_data()->Add(data, data + count);
break;
}
case DataType::FLOAT: {
auto data = reinterpret_cast<const float*>(data_raw);
auto obj = scalar_array->mutable_float_data();
obj->mutable_data()->Add(data, data + count);
break;
}
case DataType::DOUBLE: {
auto data = reinterpret_cast<const double*>(data_raw);
auto obj = scalar_array->mutable_double_data();
obj->mutable_data()->Add(data, data + count);
break;
}
default: {
PanicInfo("unsupported datatype");
}
}
return scalar_array;
}
static std::unique_ptr<DataArray>
CreateDataArrayFrom(const void* data_raw, int64_t count, const FieldMeta& field_meta) {
auto data_type = field_meta.get_data_type();
auto data_array = std::make_unique<DataArray>();
if (!datatype_is_vector(data_type)) {
auto scalar_array = CreateScalarArrayFrom(data_raw, count, data_type);
data_array->set_allocated_scalars(scalar_array.release());
} else {
auto vector_array = data_array->mutable_vectors();
auto dim = field_meta.get_dim();
vector_array->set_dim(dim);
switch (data_type) {
case DataType::VECTOR_FLOAT: {
auto length = count * dim;
auto data = reinterpret_cast<const float*>(data_raw);
auto obj = vector_array->mutable_float_vector();
obj->mutable_data()->Add(data, data + length);
break;
}
case DataType::VECTOR_BINARY: {
Assert(dim % 8 == 0);
auto num_bytes = count * dim / 8;
auto data = reinterpret_cast<const char*>(data_raw);
auto obj = vector_array->mutable_binary_vector();
obj->assign(data, num_bytes);
break;
}
default: {
PanicInfo("unsupportted datatype");
}
}
}
return data_array;
}
std::unique_ptr<DataArray>
SegmentInternalInterface::BulkSubScript(FieldOffset field_offset, const SegOffset* seg_offsets, int64_t count) const {
if (field_offset.get() >= 0) {
auto& field_meta = get_schema()[field_offset];
aligned_vector<char> data(field_meta.get_sizeof() * count);
bulk_subscript(field_offset, (const int64_t*)seg_offsets, count, data.data());
return CreateDataArrayFrom(data.data(), count, field_meta);
} else {
Assert(field_offset.get() == -1);
aligned_vector<char> data(sizeof(int64_t) * count);
bulk_subscript(SystemFieldType::RowId, (const int64_t*)seg_offsets, count, data.data());
return CreateDataArrayFrom(data.data(), count, FieldMeta::RowIdMeta);
}
}
std::unique_ptr<proto::milvus::RetrieveResults>
SegmentInternalInterface::GetEntityById(const std::vector<FieldOffset>& field_offsets, const IdArray& id_array) const {
auto results = std::make_unique<proto::milvus::RetrieveResults>();
auto [ids_, seg_offsets] = search_ids(id_array);
results->set_allocated_ids(ids_.release());
auto fields_data = results->mutable_fields_data();
for (auto field_offset : field_offsets) {
auto col = BulkSubScript(field_offset, seg_offsets.data(), seg_offsets.size());
fields_data->AddAllocated(col.release());
}
return results;
}
std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
SegmentInternalInterface::search_ids(const IdArray& id_array) const {
// TODO: protobuf in a nutshell
PanicInfo("unimplemented");
}
} // namespace milvus::segcore

View File

@ -18,6 +18,10 @@
#include <knowhere/index/vector_index/VecIndex.h>
#include "common/SystemProperty.h"
#include "query/PlanNode.h"
#include "pb/schema.pb.h"
#include <memory>
#include <vector>
#include <utility>
namespace milvus::segcore {
@ -35,6 +39,9 @@ class SegmentInterface {
const Timestamp timestamps[],
int64_t num_groups) const = 0;
virtual std::unique_ptr<proto::milvus::RetrieveResults>
GetEntityById(const std::vector<FieldOffset>& field_offsets, const IdArray& id_array) const = 0;
virtual int64_t
GetMemoryUsageInBytes() const = 0;
@ -79,6 +86,9 @@ class SegmentInternalInterface : public SegmentInterface {
void
FillTargetEntry(const query::Plan* plan, QueryResult& results) const override;
std::unique_ptr<proto::milvus::RetrieveResults>
GetEntityById(const std::vector<FieldOffset>& field_offsets, const IdArray& id_array) const override;
public:
virtual void
vector_search(int64_t vec_count,
@ -109,6 +119,7 @@ class SegmentInternalInterface : public SegmentInterface {
virtual const knowhere::Index*
chunk_index_impl(FieldOffset field_offset, int64_t chunk_id) const = 0;
// TODO remove system fields
// calculate output[i] = Vec[seg_offsets[i]}, where Vec binds to system_type
virtual void
bulk_subscript(SystemFieldType system_type, const int64_t* seg_offsets, int64_t count, void* output) const = 0;
@ -117,6 +128,14 @@ class SegmentInternalInterface : public SegmentInterface {
virtual void
bulk_subscript(FieldOffset field_offset, const int64_t* seg_offsets, int64_t count, void* output) const = 0;
// TODO: special hack: FieldOffset == -1 -> RowId.
// TODO: remove this hack when transfer is done
virtual std::unique_ptr<DataArray>
BulkSubScript(FieldOffset field_offset, const SegOffset* seg_offsets, int64_t count) const;
virtual std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
search_ids(const IdArray& id_array) const;
virtual void
check_search(const query::Plan* plan) const = 0;

View File

@ -11,8 +11,9 @@
#pragma once
#include <memory>
#include "SegmentInterface.h"
#include "segcore/SegmentInterface.h"
#include "common/LoadInfo.h"
#include <utility>
namespace milvus::segcore {

View File

@ -13,6 +13,7 @@
#include "query/SearchOnSealed.h"
#include "query/ScalarIndex.h"
#include "query/SearchBruteForce.h"
namespace milvus::segcore {
static inline void
@ -57,6 +58,14 @@ SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& info) {
auto field_id = FieldId(info.field_id);
Assert(info.blob);
Assert(info.row_count > 0);
auto create_index = [](const int64_t* data, int64_t size) {
Assert(size);
auto pk_index = std::make_unique<ScalarIndexVector>();
pk_index->append_data(data, size, SegOffset(0));
pk_index->build();
return pk_index;
};
if (SystemProperty::Instance().IsSystem(field_id)) {
auto system_field_type = SystemProperty::Instance().GetSystemFieldType(field_id);
Assert(system_field_type == SystemFieldType::RowId);
@ -65,14 +74,15 @@ SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& info) {
// prepare data
aligned_vector<idx_t> vec_data(info.row_count);
std::copy_n(src_ptr, info.row_count, vec_data.data());
auto pk_index = create_index(vec_data.data(), vec_data.size());
// write data under lock
std::unique_lock lck(mutex_);
update_row_count(info.row_count);
AssertInfo(row_ids_.empty(), "already exists");
row_ids_ = std::move(vec_data);
primary_key_index_ = std::move(pk_index);
++system_ready_count_;
} else {
// prepare data
auto field_offset = schema_->get_offset(field_id);
@ -90,6 +100,11 @@ SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& info) {
index = query::generate_scalar_index(span, field_meta.get_data_type());
}
std::unique_ptr<ScalarIndexBase> pk_index_;
if (schema_->get_primary_key_offset() == field_offset) {
pk_index_ = create_index((const int64_t*)vec_data.data(), info.row_count);
}
// write data under lock
std::unique_lock lck(mutex_);
update_row_count(info.row_count);
@ -103,6 +118,9 @@ SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& info) {
field_datas_[field_offset.get()] = std::move(vec_data);
scalar_indexings_[field_offset.get()] = std::move(index);
}
if (schema_->get_primary_key_offset() == field_offset) {
primary_key_index_ = std::move(pk_index_);
}
set_bit(field_data_ready_bitset_, field_offset, true);
}
@ -135,6 +153,7 @@ SegmentSealedImpl::chunk_data_impl(FieldOffset field_offset, int64_t chunk_id) c
const knowhere::Index*
SegmentSealedImpl::chunk_index_impl(FieldOffset field_offset, int64_t chunk_id) const {
Assert(chunk_id == 0);
// TODO: support scalar index
auto ptr = scalar_indexings_[field_offset.get()].get();
Assert(ptr);
@ -305,7 +324,7 @@ SegmentSealedImpl::bulk_subscript_impl(
auto offset = seg_offsets[i];
auto dst = dst_vec + i * element_sizeof;
const char* src;
if (offset != 0) {
if (offset != -1) {
src = src_vec + element_sizeof * offset;
} else {
src = none.data();
@ -383,6 +402,43 @@ SegmentSealedImpl::HasFieldData(FieldId field_id) const {
}
}
std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
SegmentSealedImpl::search_ids(const IdArray& id_array) const {
AssertInfo(id_array.has_int_id(), "string ids are not implemented");
auto arr = id_array.int_id();
Assert(primary_key_index_);
return primary_key_index_->do_search_ids(id_array);
}
// void
// SegmentSealedImpl::build_index_if_primary_key(FieldId field_id) {
// auto create_index = [](const int64_t* data, int64_t size) {
// Assert(size);
// auto pk_index = std::make_unique<ScalarIndexVector>();
// pk_index->append_data(data, size, SegOffset(0));
// pk_index->build();
// return pk_index;
// };
//
// if (SystemProperty::Instance().IsSystem(field_id)) {
// Assert(SystemProperty::Instance().GetSystemFieldType(field_id) == SystemFieldType::RowId);
// Assert(schema_->get_is_auto_id());
// Assert(row_count_opt_.has_value());
// auto row_count = row_count_opt_.value();
// Assert(row_count == row_ids_.size());
// primary_key_index_ = create_index(row_ids_.data(), row_count);
//
// } else if (this->schema_->get_primary_key_offset() == schema_->get_offset(field_id)) {
// auto pk_offset = schema_->get_offset(field_id);
// auto& field_data = field_datas_[pk_offset.get()];
// Assert(row_count_opt_.has_value());
// auto row_count = row_count_opt_.value();
// Assert(field_data.size() == row_count * sizeof(int64_t));
//
// primary_key_index_ = create_index((const int64_t*)field_data.data(), row_count);
// }
// }
SegmentSealedPtr
CreateSealedSegment(SchemaPtr schema) {
return std::make_unique<SegmentSealedImpl>(schema);

View File

@ -12,9 +12,11 @@
#pragma once
#include "segcore/SegmentSealed.h"
#include "SealedIndexingRecord.h"
#include "ScalarIndex.h"
#include <map>
#include <vector>
#include <memory>
#include <utility>
namespace milvus::segcore {
class SegmentSealedImpl : public SegmentSealed {
@ -107,17 +109,30 @@ class SegmentSealedImpl : public SegmentSealed {
return system_ready_count_ == 1;
}
virtual std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
search_ids(const IdArray& id_array) const;
// virtual void
// build_index_if_primary_key(FieldId field_id);
private:
// segment loading state
boost::dynamic_bitset<> field_data_ready_bitset_;
boost::dynamic_bitset<> vecindex_ready_bitset_;
std::atomic<int> system_ready_count_ = 0;
// segment datas
// TODO: generate index for scalar
std::optional<int64_t> row_count_opt_;
// TODO: use protobuf format
// TODO: remove duplicated indexing
std::vector<std::unique_ptr<knowhere::Index>> scalar_indexings_;
SealedIndexingRecord vecindexs_;
std::unique_ptr<ScalarIndexBase> primary_key_index_;
std::vector<aligned_vector<char>> field_datas_;
SealedIndexingRecord vecindexs_;
aligned_vector<idx_t> row_ids_;
SchemaPtr schema_;
};

View File

@ -0,0 +1,17 @@
# Copyright (C) 2019-2020 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under the License
set(MILVUS_STORAGE_SRCS
StorageCache.cpp
)
add_library(milvus_storage ${MILVUS_STORAGE_SRCS})
target_link_libraries(milvus_storage milvus_proto milvus_utils knowhere boost_bitset_ext)

View File

@ -0,0 +1,12 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include "storage/StorageCache.h"

View File

@ -0,0 +1,32 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include <optional>
#include "common/Types.h"
namespace milvus::storage {
// struct DataBlock {
// // TODO
// };
// struct BlockId {
// int64_t segment_id_;
// FieldId field_id_;
// };
// class StorageCacheEngine {
// // TODO
// };
} // namespace milvus::storage

View File

@ -23,7 +23,7 @@ set(MILVUS_TEST_FILES
test_indexing.cpp
test_query.cpp
test_expr.cpp
test_bitmap.cpp
test_bitmap.cpp
test_binary.cpp
test_index_wrapper.cpp
test_common.cpp
@ -35,6 +35,7 @@ set(MILVUS_TEST_FILES
init_gtest.cpp
test_init.cpp
test_plan_proto.cpp
test_get_entity_by_ids.cpp
)
add_executable(all_tests
@ -49,6 +50,7 @@ set(INDEX_BUILDER_TEST_FILES
add_executable(index_builder_test
${INDEX_BUILDER_TEST_FILES}
)
target_link_libraries(index_builder_test
gtest
gtest_main
@ -66,5 +68,5 @@ target_link_libraries(all_tests
pthread
)
install (TARGETS all_tests DESTINATION unittest)
install (TARGETS index_builder_test DESTINATION unittest)
install(TARGETS all_tests DESTINATION unittest)
install(TARGETS index_builder_test DESTINATION unittest)

View File

@ -0,0 +1,157 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <gtest/gtest.h>
#include "test_utils/DataGen.h"
#include "segcore/ScalarIndex.h"
using namespace milvus;
using namespace milvus::segcore;
TEST(GetEntityByIds, ScalarIndex) {
SUCCEED();
auto index = std::make_unique<ScalarIndexVector>();
std::vector<int64_t> data;
int N = 1000;
auto req_ids = std::make_unique<IdArray>();
auto req_ids_arr = req_ids->mutable_int_id();
for (int i = 0; i < N; ++i) {
data.push_back(i * 3 % N);
req_ids_arr->add_data(i);
}
index->append_data(data.data(), N, SegOffset(10000));
index->build();
auto [res_ids, res_offsets] = index->do_search_ids(*req_ids);
auto res_ids_arr = res_ids->int_id();
for (int i = 0; i < N; ++i) {
auto res_offset = res_offsets[i].get() - 10000;
auto res_id = res_ids_arr.data(i);
auto std_id = (res_offset * 3 % N);
ASSERT_EQ(res_id, std_id);
}
}
TEST(GetEntityByIds, AUTOID) {
auto schema = std::make_shared<Schema>();
auto fid_64 = schema->AddDebugField("i64", DataType::INT64);
auto DIM = 16;
auto fid_vec = schema->AddDebugField("vector_64", DataType::VECTOR_FLOAT, DIM, MetricType::METRIC_L2);
int64_t N = 10000;
int64_t req_size = 10;
auto choose = [=](int i) { return i * 3 % N; };
auto dataset = DataGen(schema, N);
auto segment = CreateSealedSegment(schema);
SealedLoader(dataset, *segment);
auto req_ids = std::make_unique<IdArray>();
auto req_ids_arr = req_ids->mutable_int_id();
auto i64_col = dataset.get_col<int64_t>(0);
auto vf_col = dataset.get_col<float>(1);
for (int i = 0; i < req_size; ++i) {
req_ids_arr->add_data(dataset.row_ids_[choose(i)]);
}
// should be ruled out
req_ids_arr->add_data(-1);
std::vector<FieldOffset> target_offsets{FieldOffset(0), FieldOffset(1)};
auto retrieve_results = segment->GetEntityById(target_offsets, *req_ids);
auto ids = retrieve_results->ids().int_id();
Assert(retrieve_results->fields_data_size() == target_offsets.size());
FieldOffset field_offset(0);
auto field0 = retrieve_results->fields_data(0);
Assert(field0.has_scalars());
auto field0_data = field0.scalars().long_data();
for (int i = 0; i < req_size; ++i) {
auto id = ids.data(i);
auto index = choose(i);
ASSERT_EQ(id, dataset.row_ids_[index]);
auto data = field0_data.data(i);
ASSERT_EQ(data, i64_col[index]);
}
auto field1 = retrieve_results->fields_data(1);
Assert(field1.has_vectors());
auto field1_data = field1.vectors().float_vector();
ASSERT_EQ(field1_data.data_size(), DIM * req_size);
for (int i = 0; i < req_size; ++i) {
for (int d = 0; d < DIM; ++d) {
auto index = choose(i);
auto data = field1_data.data(i * DIM + d);
auto ref = vf_col[index * DIM + d];
ASSERT_EQ(data, ref);
}
}
}
TEST(GetEntityByIds, PrimaryKey) {
auto schema = std::make_shared<Schema>();
auto fid_64 = schema->AddDebugField("counter_i64", DataType::INT64);
auto DIM = 16;
auto fid_vec = schema->AddDebugField("vector_64", DataType::VECTOR_FLOAT, DIM, MetricType::METRIC_L2);
schema->set_primary_key(FieldOffset(0));
int64_t N = 10000;
int64_t req_size = 10;
auto choose = [=](int i) { return i * 3 % N; };
auto dataset = DataGen(schema, N);
auto segment = CreateSealedSegment(schema);
SealedLoader(dataset, *segment);
auto req_ids = std::make_unique<IdArray>();
auto req_ids_arr = req_ids->mutable_int_id();
auto i64_col = dataset.get_col<int64_t>(0);
auto vf_col = dataset.get_col<float>(1);
for (int i = 0; i < req_size; ++i) {
req_ids_arr->add_data(i64_col[choose(i)]);
}
// should be ruled out
req_ids_arr->add_data(-1);
std::vector<FieldOffset> target_offsets{FieldOffset(0), FieldOffset(1)};
auto retrieve_results = segment->GetEntityById(target_offsets, *req_ids);
auto ids = retrieve_results->ids().int_id();
Assert(retrieve_results->fields_data_size() == target_offsets.size());
FieldOffset field_offset(0);
auto field0 = retrieve_results->fields_data(0);
Assert(field0.has_scalars());
auto field0_data = field0.scalars().long_data();
for (int i = 0; i < req_size; ++i) {
auto id = ids.data(i);
auto index = choose(i);
ASSERT_EQ(id, i64_col[index]);
auto data = field0_data.data(i);
ASSERT_EQ(data, i64_col[index]);
}
auto field1 = retrieve_results->fields_data(1);
Assert(field1.has_vectors());
auto field1_data = field1.vectors().float_vector();
ASSERT_EQ(field1_data.data_size(), DIM * req_size);
for (int i = 0; i < req_size; ++i) {
for (int d = 0; d < DIM; ++d) {
auto index = choose(i);
auto data = field1_data.data(i * DIM + d);
auto ref = vf_col[index * DIM + d];
ASSERT_EQ(data, ref);
}
}
}

View File

@ -186,7 +186,7 @@ DataGen(SchemaPtr schema, int64_t N, uint64_t seed = 42) {
res.row_ids_.push_back(i);
res.timestamps_.push_back(i);
}
std::shuffle(res.row_ids_.begin(), res.row_ids_.end(), er);
res.generate_rows(N, schema);
return std::move(res);
}