Support string type in segcore (#16546)

Signed-off-by: xige-16 <xi.ge@zilliz.com>
Co-authored-by: dragondriver <jiquan.long@zilliz.com>

Co-authored-by: dragondriver <jiquan.long@zilliz.com>
pull/16735/head
xige-16 2022-04-29 13:35:49 +08:00 committed by GitHub
parent 9537394971
commit 515d0369de
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
156 changed files with 7657 additions and 4449 deletions

View File

@ -15,6 +15,11 @@
#include <string>
namespace milvus {
inline bool
IsVectorType(CDataType dtype) {
return dtype == CDataType::FloatVector || dtype == CDataType::BinaryVector;
}
template <typename T, typename = std::enable_if_t<std::is_fundamental_v<T> || std::is_same_v<T, std::string>>>
inline CDataType
GetDType() {

View File

@ -17,7 +17,11 @@
#pragma once
#include <stdint.h>
#include "Types.h"
const int64_t INVALID_ID = -1;
const int64_t INVALID_OFFSET = -1;
const int64_t INVALID_FIELD_ID = -1;
const int64_t INVALID_SEG_OFFSET = -1;
const milvus::PkType INVALID_PK; // of std::monostate if not set.
// TODO: default field start id, could get from config.yaml
const int64_t START_USER_FIELDID = 100;
const char MAX_LENGTH_PER_ROW[] = "max_length_per_row";

View File

@ -73,6 +73,8 @@ datatype_name(DataType data_type) {
return "float";
case DataType::DOUBLE:
return "double";
case DataType::VARCHAR:
return "varChar";
case DataType::VECTOR_FLOAT:
return "vector_float";
case DataType::VECTOR_BINARY: {
@ -90,6 +92,17 @@ datatype_is_vector(DataType datatype) {
return datatype == DataType::VECTOR_BINARY || datatype == DataType::VECTOR_FLOAT;
}
inline bool
datatype_is_string(DataType datatype) {
switch (datatype) {
case DataType::VARCHAR:
case DataType::STRING:
return true;
default:
return false;
}
}
inline bool
datatype_is_integer(DataType datatype) {
switch (datatype) {
@ -128,6 +141,11 @@ class FieldMeta {
Assert(!is_vector());
}
FieldMeta(const FieldName& name, FieldId id, DataType type, int64_t max_length_per_row)
: name_(name), id_(id), type_(type), string_info_(StringInfo{max_length_per_row}) {
Assert(is_string());
}
FieldMeta(const FieldName& name, FieldId id, DataType type, int64_t dim, std::optional<MetricType> metric_type)
: name_(name), id_(id), type_(type), vector_info_(VectorInfo{dim, metric_type}) {
Assert(is_vector());
@ -139,6 +157,12 @@ class FieldMeta {
return type_ == DataType::VECTOR_BINARY || type_ == DataType::VECTOR_FLOAT;
}
bool
is_string() const {
Assert(type_ != DataType::NONE);
return type_ == DataType::VARCHAR || type_ == DataType::STRING;
}
int64_t
get_dim() const {
Assert(is_vector());
@ -146,6 +170,13 @@ class FieldMeta {
return vector_info_->dim_;
}
int64_t
get_max_len() const {
Assert(is_string());
Assert(string_info_.has_value());
return string_info_->max_length_per_row;
}
std::optional<MetricType>
get_metric_type() const {
Assert(is_vector());
@ -168,10 +199,12 @@ class FieldMeta {
return type_;
}
int
int64_t
get_sizeof() const {
if (is_vector()) {
return datatype_sizeof(type_, get_dim());
} else if (is_string()) {
return string_info_->max_length_per_row;
} else {
return datatype_sizeof(type_);
}
@ -182,10 +215,14 @@ class FieldMeta {
int64_t dim_;
std::optional<MetricType> metric_type_;
};
struct StringInfo {
int64_t max_length_per_row;
};
FieldName name_;
FieldId id_;
DataType type_ = DataType::NONE;
std::optional<VectorInfo> vector_info_;
std::optional<StringInfo> string_info_;
};
} // namespace milvus

View File

@ -18,25 +18,31 @@
#include <map>
#include <string>
#include "Types.h"
#include "knowhere/index/vector_index/VecIndex.h"
#include "common/CDataType.h"
#include "knowhere/index/Index.h"
struct LoadIndexInfo {
int64_t field_id;
CDataType field_type;
std::map<std::string, std::string> index_params;
knowhere::VecIndexPtr index;
knowhere::IndexPtr index;
};
// NOTE: field_id can be system field
// NOTE: Refer to common/SystemProperty.cpp for details
// TODO: use arrow to pass field data instead of proto
struct LoadFieldDataInfo {
int64_t field_id;
const void* blob = nullptr;
// const void* blob = nullptr;
const milvus::DataArray* field_data;
int64_t row_count = -1;
};
struct LoadDeletedRecordInfo {
const void* timestamps = nullptr;
const void* primary_keys = nullptr;
const milvus::IdArray* primary_keys = nullptr;
int64_t row_count = -1;
};

View File

@ -17,6 +17,7 @@
#pragma once
#include <memory>
#include <map>
#include <limits>
#include <string>
#include <utility>
@ -32,52 +33,49 @@
namespace milvus {
struct SearchResult {
SearchResult() = default;
SearchResult(int64_t num_queries, int64_t topk) : topk_(topk), num_queries_(num_queries) {
auto count = get_row_count();
distances_.resize(count);
ids_.resize(count);
int64_t
get_total_result_count() const {
int64_t count = 0;
for (auto topk : real_topK_per_nq_) {
count += topk;
}
return count;
}
int64_t
get_row_count() const {
return topk_ * num_queries_;
}
// vector type
void
AddField(const FieldName& name,
const FieldId id,
DataType data_type,
int64_t dim,
std::optional<MetricType> metric_type) {
this->AddField(FieldMeta(name, id, data_type, dim, metric_type));
}
// scalar type
void
AddField(const FieldName& name, const FieldId id, DataType data_type) {
this->AddField(FieldMeta(name, id, data_type));
}
void
AddField(FieldMeta&& field_meta) {
output_fields_meta_.emplace_back(std::move(field_meta));
get_result_count(int nq_offset) const {
AssertInfo(nq_offset <= real_topK_per_nq_.size(), "wrong nq offset when get real search result count");
int64_t count = 0;
for (auto i = 0; i < nq_offset; i++) {
count += real_topK_per_nq_[i];
}
return count;
}
public:
int64_t num_queries_;
int64_t topk_;
std::vector<float> distances_;
std::vector<int64_t> ids_; // primary keys
public:
// TODO(gexi): utilize these fields
void* segment_;
// first fill data during search, and then update data after reducing search results
std::vector<float> distances_;
std::vector<int64_t> seg_offsets_;
// fist fill data during fillPrimaryKey, and then update data after reducing search results
std::vector<PkType> primary_keys_;
DataType pk_type_;
// fill data during reducing search result
std::vector<int64_t> result_offsets_;
std::vector<int64_t> primary_keys_;
aligned_vector<char> ids_data_;
std::vector<aligned_vector<char>> output_fields_data_;
std::vector<FieldMeta> output_fields_meta_;
// after reducing search result done, size(distances_) = size(seg_offsets_) = size(primary_keys_) =
// size(primary_keys_)
// set output fields data when fill target entity
std::map<FieldId, std::unique_ptr<milvus::DataArray>> output_fields_data_;
// used for reduce, filter invalid pk, get real topks count
std::vector<int64_t> real_topK_per_nq_;
};
using SearchResultPtr = std::shared_ptr<SearchResult>;

View File

@ -43,7 +43,6 @@ Schema::ParseFrom(const milvus::proto::schema::CollectionSchema& schema_proto) {
// NOTE: only two system
for (const milvus::proto::schema::FieldSchema& child : schema_proto.fields()) {
auto field_offset = FieldOffset(schema->size());
auto field_id = FieldId(child.fieldid());
auto name = FieldName(child.name());
@ -69,25 +68,26 @@ Schema::ParseFrom(const milvus::proto::schema::CollectionSchema& schema_proto) {
auto metric_type = GetMetricType(index_map.at("metric_type"));
schema->AddField(name, field_id, data_type, dim, metric_type);
}
} else if (datatype_is_string(data_type)) {
auto type_map = RepeatedKeyValToMap(child.type_params());
AssertInfo(type_map.count(MAX_LENGTH_PER_ROW), "max_length_per_row not found");
auto max_len = boost::lexical_cast<int64_t>(type_map.at(MAX_LENGTH_PER_ROW));
schema->AddField(name, field_id, data_type, max_len);
} else {
schema->AddField(name, field_id, data_type);
}
if (child.is_primary_key()) {
AssertInfo(!schema->get_primary_key_offset().has_value(), "repetitive primary key");
Assert(!schema_proto.autoid());
schema->set_primary_key(field_offset);
AssertInfo(!schema->get_primary_field_id().has_value(), "repetitive primary key");
schema->set_primary_field_id(field_id);
}
}
if (schema->get_is_auto_id()) {
AssertInfo(!schema->get_primary_key_offset().has_value(), "auto id mode: shouldn't have primary key");
} else {
AssertInfo(schema->get_primary_key_offset().has_value(), "primary key should be specified when autoId is off");
}
AssertInfo(schema->get_primary_field_id().has_value(), "primary key should be specified");
return schema;
}
const FieldMeta FieldMeta::RowIdMeta(FieldName("RowID"), FieldId(0), DataType::INT64);
const FieldMeta FieldMeta::RowIdMeta(FieldName("RowID"), RowFieldID, DataType::INT64);
} // namespace milvus

View File

@ -25,16 +25,18 @@
#include "FieldMeta.h"
#include "pb/schema.pb.h"
#include "Consts.h"
namespace milvus {
static int64_t debug_id = START_USER_FIELDID;
class Schema {
public:
FieldId
AddDebugField(const std::string& name, DataType data_type) {
static int64_t debug_id = 1000;
auto field_id = FieldId(debug_id);
debug_id += 2;
debug_id++;
this->AddField(FieldName(name), field_id, data_type);
return field_id;
}
@ -42,9 +44,8 @@ class Schema {
// auto gen field_id for convenience
FieldId
AddDebugField(const std::string& name, DataType data_type, int64_t dim, std::optional<MetricType> metric_type) {
static int64_t debug_id = 2001;
auto field_id = FieldId(debug_id);
debug_id += 2;
debug_id++;
auto field_meta = FieldMeta(FieldName(name), field_id, data_type, dim, metric_type);
this->AddField(std::move(field_meta));
return field_id;
@ -57,6 +58,13 @@ class Schema {
this->AddField(std::move(field_meta));
}
// string type
void
AddField(const FieldName& name, const FieldId id, DataType data_type, int64_t max_length_per_row) {
auto field_meta = FieldMeta(name, id, data_type, max_length_per_row);
this->AddField(std::move(field_meta));
}
// vector type
void
AddField(const FieldName& name,
@ -69,19 +77,8 @@ class Schema {
}
void
set_auto_id(bool is_auto_id) {
is_auto_id_ = is_auto_id;
}
void
set_primary_key(FieldOffset field_offset) {
is_auto_id_ = false;
this->primary_key_offset_opt_ = field_offset;
}
bool
get_is_auto_id() const {
return is_auto_id_;
set_primary_field_id(FieldId field_id) {
this->primary_field_id_opt_ = field_id;
}
auto
@ -100,10 +97,10 @@ class Schema {
}
const FieldMeta&
operator[](FieldOffset field_offset) const {
Assert(field_offset.get() >= 0);
Assert(field_offset.get() < fields_.size());
return fields_[field_offset.get()];
operator[](FieldId field_id) const {
Assert(field_id.get() >= 0);
AssertInfo(fields_.count(field_id), "Cannot find field_id");
return fields_.at(field_id);
}
auto
@ -111,39 +108,32 @@ class Schema {
return total_sizeof_;
}
const std::vector<int64_t>&
get_sizeof_infos() const {
return sizeof_infos_;
FieldId
get_field_id(const FieldName& field_name) const {
AssertInfo(name_ids_.count(field_name), "Cannot find field_name");
return name_ids_.at(field_name);
}
FieldOffset
get_offset(const FieldName& field_name) const {
Assert(name_offsets_.count(field_name));
return name_offsets_.at(field_name);
}
FieldOffset
get_offset(const FieldId& field_id) const {
Assert(id_offsets_.count(field_id));
return id_offsets_.at(field_id);
}
const std::vector<FieldMeta>&
const std::unordered_map<FieldId, FieldMeta>&
get_fields() const {
return fields_;
}
const FieldMeta&
operator[](const FieldName& field_name) const {
auto offset_iter = name_offsets_.find(field_name);
AssertInfo(offset_iter != name_offsets_.end(), "Cannot find field_name: " + field_name.get());
auto offset = offset_iter->second;
return (*this)[offset];
const std::vector<FieldId>&
get_field_ids() const {
return field_ids_;
}
std::optional<FieldOffset>
get_primary_key_offset() const {
return primary_key_offset_opt_;
const FieldMeta&
operator[](const FieldName& field_name) const {
auto id_iter = name_ids_.find(field_name);
AssertInfo(id_iter != name_ids_.end(), "Cannot find field_name: " + field_name.get());
return fields_.at(id_iter->second);
}
std::optional<FieldId>
get_primary_field_id() const {
return primary_field_id_opt_;
}
public:
@ -152,29 +142,33 @@ class Schema {
void
AddField(FieldMeta&& field_meta) {
auto offset = fields_.size();
AssertInfo(!name_offsets_.count(field_meta.get_name()), "duplicated field name");
name_offsets_.emplace(field_meta.get_name(), offset);
AssertInfo(!id_offsets_.count(field_meta.get_id()), "duplicated field id");
id_offsets_.emplace(field_meta.get_id(), offset);
auto field_name = field_meta.get_name();
auto field_id = field_meta.get_id();
AssertInfo(!name_ids_.count(field_name), "duplicated field name");
AssertInfo(!id_names_.count(field_id), "duplicated field id");
name_ids_.emplace(field_name, field_id);
id_names_.emplace(field_id, field_name);
fields_.emplace(field_id, field_meta);
field_ids_.emplace_back(field_id);
auto field_sizeof = field_meta.get_sizeof();
sizeof_infos_.push_back(std::move(field_sizeof));
fields_.emplace_back(std::move(field_meta));
total_sizeof_ += field_sizeof;
}
private:
int64_t debug_id = START_USER_FIELDID;
std::vector<FieldId> field_ids_;
// this is where data holds
std::vector<FieldMeta> fields_;
std::unordered_map<FieldId, FieldMeta> fields_;
// a mapping for random access
std::unordered_map<FieldName, FieldOffset> name_offsets_; // field_name -> offset
std::unordered_map<FieldId, FieldOffset> id_offsets_; // field_id -> offset
std::vector<int64_t> sizeof_infos_;
int total_sizeof_ = 0;
bool is_auto_id_ = true;
std::optional<FieldOffset> primary_key_offset_opt_;
std::unordered_map<FieldName, FieldId> name_ids_; // field_name -> field_id
std::unordered_map<FieldId, FieldName> id_names_; // field_id -> field_name
int64_t total_sizeof_ = 0;
std::optional<FieldId> primary_field_id_opt_;
};
using SchemaPtr = std::shared_ptr<Schema>;

View File

@ -18,6 +18,7 @@
#include <cassert>
#include <type_traits>
#include <string>
#include "Types.h"
#include "VectorTrait.h"
@ -56,7 +57,7 @@ class Span;
// TODO: refine Span to support T=FloatVector
template <typename T>
class Span<T, typename std::enable_if_t<std::is_fundamental_v<T>>> {
class Span<T, typename std::enable_if_t<IsScalar<T> || std::is_same_v<T, PkType>>> {
public:
using embeded_type = T;
explicit Span(const T* data, int64_t row_count) : data_(data), row_count_(row_count) {

View File

@ -17,7 +17,7 @@
#include <yaml-cpp/yaml.h>
#include "SystemProperty.h"
#include "Types.h"
#include "Consts.h"
#include "exceptions/EasyAssert.h"
namespace milvus {
@ -73,10 +73,10 @@ InstanceImpl() {
using Type = SystemFieldType;
impl.name_to_types_.emplace(FieldName("RowID"), Type::RowId);
impl.id_to_types_.emplace(FieldId(0), Type::RowId);
impl.id_to_types_.emplace(RowFieldID, Type::RowId);
impl.name_to_types_.emplace(FieldName("Timestamp"), Type::Timestamp);
impl.id_to_types_.emplace(FieldId(1), Type::Timestamp);
impl.id_to_types_.emplace(TimestampFieldID, Type::Timestamp);
return impl;
}();

View File

@ -22,6 +22,7 @@
#include "common/type_c.h"
#include "pb/schema.pb.h"
#include "CGoHelper.h"
#include "common/Consts.h"
namespace milvus {
@ -55,4 +56,9 @@ MetricTypeToName(MetricType metric_type) {
return metric_bimap.right.at(metric_type);
}
bool
IsPrimaryKeyDataType(DataType data_type) {
return data_type == engine::DataType::INT64 || data_type == DataType::VARCHAR;
}
} // namespace milvus

View File

@ -22,11 +22,14 @@
#include <utility>
#include <vector>
#include <boost/align/aligned_allocator.hpp>
#include <boost/container/vector.hpp>
#include <boost/dynamic_bitset.hpp>
#include <NamedType/named_type.hpp>
#include <variant>
#include "knowhere/common/MetricType.h"
#include "pb/schema.pb.h"
#include "pb/segcore.pb.h"
#include "utils/Types.h"
namespace milvus {
@ -36,12 +39,15 @@ constexpr auto MAX_TIMESTAMP = std::numeric_limits<Timestamp>::max();
using engine::DataType;
using engine::idx_t;
constexpr auto MAX_ROW_COUNT = std::numeric_limits<engine::idx_t>::max();
using ScalarArray = proto::schema::ScalarField;
using DataArray = proto::schema::FieldData;
using VectorArray = proto::schema::VectorField;
using IdArray = proto::schema::IDs;
using MetricType = faiss::MetricType;
using InsertData = proto::segcore::InsertRecord;
using PkType = std::variant<std::monostate, int64_t, std::string>;
MetricType
GetMetricType(const std::string& type);
@ -49,6 +55,9 @@ GetMetricType(const std::string& type);
std::string
MetricTypeToName(MetricType metric_type);
bool
IsPrimaryKeyDataType(DataType data_type);
// NOTE: dependent type
// used at meta-template programming
template <class...>
@ -70,11 +79,16 @@ struct SegOffsetTag;
using FieldId = fluent::NamedType<int64_t, impl::FieldIdTag, fluent::Comparable, fluent::Hashable>;
using FieldName = fluent::NamedType<std::string, impl::FieldNameTag, fluent::Comparable, fluent::Hashable>;
using FieldOffset = fluent::NamedType<int64_t, impl::FieldOffsetTag, fluent::Comparable, fluent::Hashable>;
// using FieldOffset = fluent::NamedType<int64_t, impl::FieldOffsetTag, fluent::Comparable, fluent::Hashable>;
using SegOffset = fluent::NamedType<int64_t, impl::SegOffsetTag, fluent::Arithmetic>;
using BitsetType = boost::dynamic_bitset<>;
using BitsetTypePtr = std::shared_ptr<boost::dynamic_bitset<>>;
using BitsetTypeOpt = std::optional<BitsetType>;
template <typename Type>
using FixedVector = boost::container::vector<Type>;
const FieldId RowFieldID = FieldId(0);
const FieldId TimestampFieldID = FieldId(1);
} // namespace milvus

View File

@ -0,0 +1,48 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include <string>
namespace milvus {
inline bool
PrefixMatch(const std::string& str, const std::string& prefix) {
auto ret = strncmp(str.c_str(), prefix.c_str(), prefix.length());
if (ret != 0) {
return false;
}
return true;
}
inline bool
PostfixMatch(const std::string& str, const std::string& postfix) {
if (postfix.length() > str.length()) {
return false;
}
int offset = str.length() - postfix.length();
auto ret = strncmp(str.c_str() + offset, postfix.c_str(), postfix.length());
if (ret != 0) {
return false;
}
//
// int i = postfix.length() - 1;
// int j = str.length() - 1;
// for (; i >= 0; i--, j--) {
// if (postfix[i] != str[j]) {
// return false;
// }
// }
return true;
}
} // namespace milvus

View File

@ -16,6 +16,7 @@
#pragma once
#include "Types.h"
#include <string>
namespace milvus {
@ -48,7 +49,7 @@ template <typename T>
constexpr bool IsVector = std::is_base_of_v<VectorTrait, T>;
template <typename T>
constexpr bool IsScalar = std::is_fundamental_v<T>;
constexpr bool IsScalar = std::is_fundamental_v<T> || std::is_same_v<T, std::string>;
template <typename T, typename Enabled = void>
struct EmbeddedTypeImpl;
@ -66,4 +67,17 @@ struct EmbeddedTypeImpl<T, std::enable_if_t<IsVector<T>>> {
template <typename T>
using EmbeddedType = typename EmbeddedTypeImpl<T>::type;
struct FundamentalTag {};
struct StringTag {};
template <class T>
struct TagDispatchTrait {
using Tag = FundamentalTag;
};
template <>
struct TagDispatchTrait<std::string> {
using Tag = StringTag;
};
} // namespace milvus

View File

@ -70,13 +70,13 @@ typedef struct CProto {
typedef struct CLoadFieldDataInfo {
int64_t field_id;
void* blob;
const char* blob;
int64_t row_count;
} CLoadFieldDataInfo;
typedef struct CLoadDeletedRecordInfo {
void* timestamps;
void* primary_keys;
const char* primary_keys;
int64_t row_count;
} CLoadDeletedRecordInfo;

View File

@ -33,6 +33,9 @@ class IndexBase : public Index {
virtual const TargetBitmapPtr
Query(const DatasetPtr& dataset) = 0;
virtual size_t
Count() = 0;
};
using IndexBasePtr = std::unique_ptr<IndexBase>;

View File

@ -37,6 +37,11 @@ class ScalarIndexSort : public ScalarIndex<T> {
void
BuildWithDataset(const DatasetPtr& dataset) override;
size_t
Count() override {
return data_.size();
}
void
Build(size_t n, const T* values) override;

View File

@ -35,6 +35,11 @@ class StringIndexMarisa : public StringIndex {
void
Load(const BinarySet& set) override;
size_t
Count() override {
return str_ids_.size();
}
void
Build(size_t n, const std::string* values) override;

View File

@ -0,0 +1,72 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <memory>
#include <vector>
#include <string>
#include "index/ScalarIndexSort.h"
#include "index/StringIndex.h"
#include "common/Utils.h"
namespace milvus::scalar {
// TODO: should inherit from StringIndex?
class StringIndexSort : public ScalarIndexSort<std::string> {
public:
void
BuildWithDataset(const DatasetPtr& dataset) override {
auto size = dataset->Get<int64_t>(knowhere::meta::ROWS);
auto data = dataset->Get<const void*>(knowhere::meta::TENSOR);
proto::schema::StringArray arr;
arr.ParseFromArray(data, size);
{
// TODO: optimize here. avoid memory copy.
std::vector<std::string> vecs{arr.data().begin(), arr.data().end()};
Build(arr.data().size(), vecs.data());
}
{
// TODO: test this way.
// auto strs = (const std::string*)arr.data().data();
// Build(arr.data().size(), strs);
}
}
const TargetBitmapPtr
Query(const DatasetPtr& dataset) override {
auto op = dataset->Get<OperatorType>(OPERATOR_TYPE);
if (op == PrefixMatchOp) {
auto prefix = dataset->Get<std::string>(PREFIX_VALUE);
return PrefixMatch(prefix);
}
return ScalarIndex<std::string>::Query(dataset);
}
const TargetBitmapPtr
PrefixMatch(std::string prefix) {
auto data = GetData();
TargetBitmapPtr bitset = std::make_unique<TargetBitmap>(data.size());
for (size_t i = 0; i < data.size(); i++) {
if (milvus::PrefixMatch(data[i].a_, prefix)) {
bitset->set(data[i].idx_);
}
}
return bitset;
}
};
using StringIndexSortPtr = std::unique_ptr<StringIndexSort>;
inline StringIndexSortPtr
CreateStringIndexSort() {
return std::make_unique<StringIndexSort>();
}
} // namespace milvus::scalar

View File

@ -532,13 +532,14 @@ const char descriptor_table_protodef_plan_2eproto[] PROTOBUF_SECTION_VARIABLE(pr
"\030\001 \001(\0132\035.milvus.proto.plan.VectorANNSH\000\022"
"-\n\npredicates\030\002 \001(\0132\027.milvus.proto.plan."
"ExprH\000\022\030\n\020output_field_ids\030\003 \003(\003B\006\n\004node"
"*n\n\006OpType\022\013\n\007Invalid\020\000\022\017\n\013GreaterThan\020\001"
"\022\020\n\014GreaterEqual\020\002\022\014\n\010LessThan\020\003\022\r\n\tLess"
"Equal\020\004\022\t\n\005Equal\020\005\022\014\n\010NotEqual\020\006*G\n\013Arit"
"hOpType\022\013\n\007Unknown\020\000\022\007\n\003Add\020\001\022\007\n\003Sub\020\002\022\007"
"\n\003Mul\020\003\022\007\n\003Div\020\004\022\007\n\003Mod\020\005B3Z1github.com/"
"milvus-io/milvus/internal/proto/planpbb\006"
"proto3"
"*\221\001\n\006OpType\022\013\n\007Invalid\020\000\022\017\n\013GreaterThan\020"
"\001\022\020\n\014GreaterEqual\020\002\022\014\n\010LessThan\020\003\022\r\n\tLes"
"sEqual\020\004\022\t\n\005Equal\020\005\022\014\n\010NotEqual\020\006\022\017\n\013Pre"
"fixMatch\020\007\022\020\n\014PostfixMatch\020\010*G\n\013ArithOpT"
"ype\022\013\n\007Unknown\020\000\022\007\n\003Add\020\001\022\007\n\003Sub\020\002\022\007\n\003Mu"
"l\020\003\022\007\n\003Div\020\004\022\007\n\003Mod\020\005B3Z1github.com/milv"
"us-io/milvus/internal/proto/planpbb\006prot"
"o3"
;
static const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable*const descriptor_table_plan_2eproto_deps[1] = {
&::descriptor_table_schema_2eproto,
@ -560,7 +561,7 @@ static ::PROTOBUF_NAMESPACE_ID::internal::SCCInfoBase*const descriptor_table_pla
static ::PROTOBUF_NAMESPACE_ID::internal::once_flag descriptor_table_plan_2eproto_once;
static bool descriptor_table_plan_2eproto_initialized = false;
const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_plan_2eproto = {
&descriptor_table_plan_2eproto_initialized, descriptor_table_protodef_plan_2eproto, "plan.proto", 2846,
&descriptor_table_plan_2eproto_initialized, descriptor_table_protodef_plan_2eproto, "plan.proto", 2882,
&descriptor_table_plan_2eproto_once, descriptor_table_plan_2eproto_sccs, descriptor_table_plan_2eproto_deps, 12, 1,
schemas, file_default_instances, TableStruct_plan_2eproto::offsets,
file_level_metadata_plan_2eproto, 14, file_level_enum_descriptors_plan_2eproto, file_level_service_descriptors_plan_2eproto,
@ -628,6 +629,8 @@ bool OpType_IsValid(int value) {
case 4:
case 5:
case 6:
case 7:
case 8:
return true;
default:
return false;

View File

@ -183,12 +183,14 @@ enum OpType : int {
LessEqual = 4,
Equal = 5,
NotEqual = 6,
PrefixMatch = 7,
PostfixMatch = 8,
OpType_INT_MIN_SENTINEL_DO_NOT_USE_ = std::numeric_limits<::PROTOBUF_NAMESPACE_ID::int32>::min(),
OpType_INT_MAX_SENTINEL_DO_NOT_USE_ = std::numeric_limits<::PROTOBUF_NAMESPACE_ID::int32>::max()
};
bool OpType_IsValid(int value);
constexpr OpType OpType_MIN = Invalid;
constexpr OpType OpType_MAX = NotEqual;
constexpr OpType OpType_MAX = PostfixMatch;
constexpr int OpType_ARRAYSIZE = OpType_MAX + 1;
const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* OpType_descriptor();

View File

@ -33,9 +33,28 @@ class LoadSegmentMetaDefaultTypeInternal {
public:
::PROTOBUF_NAMESPACE_ID::internal::ExplicitlyConstructed<LoadSegmentMeta> _instance;
} _LoadSegmentMeta_default_instance_;
class InsertRecordDefaultTypeInternal {
public:
::PROTOBUF_NAMESPACE_ID::internal::ExplicitlyConstructed<InsertRecord> _instance;
} _InsertRecord_default_instance_;
} // namespace segcore
} // namespace proto
} // namespace milvus
static void InitDefaultsscc_info_InsertRecord_segcore_2eproto() {
GOOGLE_PROTOBUF_VERIFY_VERSION;
{
void* ptr = &::milvus::proto::segcore::_InsertRecord_default_instance_;
new (ptr) ::milvus::proto::segcore::InsertRecord();
::PROTOBUF_NAMESPACE_ID::internal::OnShutdownDestroyMessage(ptr);
}
::milvus::proto::segcore::InsertRecord::InitAsDefaultInstance();
}
::PROTOBUF_NAMESPACE_ID::internal::SCCInfo<1> scc_info_InsertRecord_segcore_2eproto =
{{ATOMIC_VAR_INIT(::PROTOBUF_NAMESPACE_ID::internal::SCCInfoBase::kUninitialized), 1, InitDefaultsscc_info_InsertRecord_segcore_2eproto}, {
&scc_info_FieldData_schema_2eproto.base,}};
static void InitDefaultsscc_info_LoadFieldMeta_segcore_2eproto() {
GOOGLE_PROTOBUF_VERIFY_VERSION;
@ -81,7 +100,7 @@ static void InitDefaultsscc_info_RetrieveResults_segcore_2eproto() {
&scc_info_IDs_schema_2eproto.base,
&scc_info_FieldData_schema_2eproto.base,}};
static ::PROTOBUF_NAMESPACE_ID::Metadata file_level_metadata_segcore_2eproto[3];
static ::PROTOBUF_NAMESPACE_ID::Metadata file_level_metadata_segcore_2eproto[4];
static constexpr ::PROTOBUF_NAMESPACE_ID::EnumDescriptor const** file_level_enum_descriptors_segcore_2eproto = nullptr;
static constexpr ::PROTOBUF_NAMESPACE_ID::ServiceDescriptor const** file_level_service_descriptors_segcore_2eproto = nullptr;
@ -109,17 +128,26 @@ const ::PROTOBUF_NAMESPACE_ID::uint32 TableStruct_segcore_2eproto::offsets[] PRO
~0u, // no _weak_field_map_
PROTOBUF_FIELD_OFFSET(::milvus::proto::segcore::LoadSegmentMeta, metas_),
PROTOBUF_FIELD_OFFSET(::milvus::proto::segcore::LoadSegmentMeta, total_size_),
~0u, // no _has_bits_
PROTOBUF_FIELD_OFFSET(::milvus::proto::segcore::InsertRecord, _internal_metadata_),
~0u, // no _extensions_
~0u, // no _oneof_case_
~0u, // no _weak_field_map_
PROTOBUF_FIELD_OFFSET(::milvus::proto::segcore::InsertRecord, fields_data_),
PROTOBUF_FIELD_OFFSET(::milvus::proto::segcore::InsertRecord, num_rows_),
};
static const ::PROTOBUF_NAMESPACE_ID::internal::MigrationSchema schemas[] PROTOBUF_SECTION_VARIABLE(protodesc_cold) = {
{ 0, -1, sizeof(::milvus::proto::segcore::RetrieveResults)},
{ 8, -1, sizeof(::milvus::proto::segcore::LoadFieldMeta)},
{ 16, -1, sizeof(::milvus::proto::segcore::LoadSegmentMeta)},
{ 23, -1, sizeof(::milvus::proto::segcore::InsertRecord)},
};
static ::PROTOBUF_NAMESPACE_ID::Message const * const file_default_instances[] = {
reinterpret_cast<const ::PROTOBUF_NAMESPACE_ID::Message*>(&::milvus::proto::segcore::_RetrieveResults_default_instance_),
reinterpret_cast<const ::PROTOBUF_NAMESPACE_ID::Message*>(&::milvus::proto::segcore::_LoadFieldMeta_default_instance_),
reinterpret_cast<const ::PROTOBUF_NAMESPACE_ID::Message*>(&::milvus::proto::segcore::_LoadSegmentMeta_default_instance_),
reinterpret_cast<const ::PROTOBUF_NAMESPACE_ID::Message*>(&::milvus::proto::segcore::_InsertRecord_default_instance_),
};
const char descriptor_table_protodef_segcore_2eproto[] PROTOBUF_SECTION_VARIABLE(protodesc_cold) =
@ -131,14 +159,17 @@ const char descriptor_table_protodef_segcore_2eproto[] PROTOBUF_SECTION_VARIABLE
"in_timestamp\030\001 \001(\003\022\025\n\rmax_timestamp\030\002 \001("
"\003\022\021\n\trow_count\030\003 \001(\003\"Y\n\017LoadSegmentMeta\022"
"2\n\005metas\030\001 \003(\0132#.milvus.proto.segcore.Lo"
"adFieldMeta\022\022\n\ntotal_size\030\002 \001(\003B6Z4githu"
"b.com/milvus-io/milvus/internal/proto/se"
"gcorepbb\006proto3"
"adFieldMeta\022\022\n\ntotal_size\030\002 \001(\003\"U\n\014Inser"
"tRecord\0223\n\013fields_data\030\001 \003(\0132\036.milvus.pr"
"oto.schema.FieldData\022\020\n\010num_rows\030\002 \001(\003B6"
"Z4github.com/milvus-io/milvus/internal/p"
"roto/segcorepbb\006proto3"
;
static const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable*const descriptor_table_segcore_2eproto_deps[1] = {
&::descriptor_table_schema_2eproto,
};
static ::PROTOBUF_NAMESPACE_ID::internal::SCCInfoBase*const descriptor_table_segcore_2eproto_sccs[3] = {
static ::PROTOBUF_NAMESPACE_ID::internal::SCCInfoBase*const descriptor_table_segcore_2eproto_sccs[4] = {
&scc_info_InsertRecord_segcore_2eproto.base,
&scc_info_LoadFieldMeta_segcore_2eproto.base,
&scc_info_LoadSegmentMeta_segcore_2eproto.base,
&scc_info_RetrieveResults_segcore_2eproto.base,
@ -146,10 +177,10 @@ static ::PROTOBUF_NAMESPACE_ID::internal::SCCInfoBase*const descriptor_table_seg
static ::PROTOBUF_NAMESPACE_ID::internal::once_flag descriptor_table_segcore_2eproto_once;
static bool descriptor_table_segcore_2eproto_initialized = false;
const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_segcore_2eproto = {
&descriptor_table_segcore_2eproto_initialized, descriptor_table_protodef_segcore_2eproto, "segcore.proto", 415,
&descriptor_table_segcore_2eproto_once, descriptor_table_segcore_2eproto_sccs, descriptor_table_segcore_2eproto_deps, 3, 1,
&descriptor_table_segcore_2eproto_initialized, descriptor_table_protodef_segcore_2eproto, "segcore.proto", 502,
&descriptor_table_segcore_2eproto_once, descriptor_table_segcore_2eproto_sccs, descriptor_table_segcore_2eproto_deps, 4, 1,
schemas, file_default_instances, TableStruct_segcore_2eproto::offsets,
file_level_metadata_segcore_2eproto, 3, file_level_enum_descriptors_segcore_2eproto, file_level_service_descriptors_segcore_2eproto,
file_level_metadata_segcore_2eproto, 4, file_level_enum_descriptors_segcore_2eproto, file_level_service_descriptors_segcore_2eproto,
};
// Force running AddDescriptors() at dynamic initialization time.
@ -1199,6 +1230,314 @@ void LoadSegmentMeta::InternalSwap(LoadSegmentMeta* other) {
}
// ===================================================================
void InsertRecord::InitAsDefaultInstance() {
}
class InsertRecord::_Internal {
public:
};
void InsertRecord::clear_fields_data() {
fields_data_.Clear();
}
InsertRecord::InsertRecord()
: ::PROTOBUF_NAMESPACE_ID::Message(), _internal_metadata_(nullptr) {
SharedCtor();
// @@protoc_insertion_point(constructor:milvus.proto.segcore.InsertRecord)
}
InsertRecord::InsertRecord(const InsertRecord& from)
: ::PROTOBUF_NAMESPACE_ID::Message(),
_internal_metadata_(nullptr),
fields_data_(from.fields_data_) {
_internal_metadata_.MergeFrom(from._internal_metadata_);
num_rows_ = from.num_rows_;
// @@protoc_insertion_point(copy_constructor:milvus.proto.segcore.InsertRecord)
}
void InsertRecord::SharedCtor() {
::PROTOBUF_NAMESPACE_ID::internal::InitSCC(&scc_info_InsertRecord_segcore_2eproto.base);
num_rows_ = PROTOBUF_LONGLONG(0);
}
InsertRecord::~InsertRecord() {
// @@protoc_insertion_point(destructor:milvus.proto.segcore.InsertRecord)
SharedDtor();
}
void InsertRecord::SharedDtor() {
}
void InsertRecord::SetCachedSize(int size) const {
_cached_size_.Set(size);
}
const InsertRecord& InsertRecord::default_instance() {
::PROTOBUF_NAMESPACE_ID::internal::InitSCC(&::scc_info_InsertRecord_segcore_2eproto.base);
return *internal_default_instance();
}
void InsertRecord::Clear() {
// @@protoc_insertion_point(message_clear_start:milvus.proto.segcore.InsertRecord)
::PROTOBUF_NAMESPACE_ID::uint32 cached_has_bits = 0;
// Prevent compiler warnings about cached_has_bits being unused
(void) cached_has_bits;
fields_data_.Clear();
num_rows_ = PROTOBUF_LONGLONG(0);
_internal_metadata_.Clear();
}
#if GOOGLE_PROTOBUF_ENABLE_EXPERIMENTAL_PARSER
const char* InsertRecord::_InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) {
#define CHK_(x) if (PROTOBUF_PREDICT_FALSE(!(x))) goto failure
while (!ctx->Done(&ptr)) {
::PROTOBUF_NAMESPACE_ID::uint32 tag;
ptr = ::PROTOBUF_NAMESPACE_ID::internal::ReadTag(ptr, &tag);
CHK_(ptr);
switch (tag >> 3) {
// repeated .milvus.proto.schema.FieldData fields_data = 1;
case 1:
if (PROTOBUF_PREDICT_TRUE(static_cast<::PROTOBUF_NAMESPACE_ID::uint8>(tag) == 10)) {
ptr -= 1;
do {
ptr += 1;
ptr = ctx->ParseMessage(add_fields_data(), ptr);
CHK_(ptr);
if (!ctx->DataAvailable(ptr)) break;
} while (::PROTOBUF_NAMESPACE_ID::internal::UnalignedLoad<::PROTOBUF_NAMESPACE_ID::uint8>(ptr) == 10);
} else goto handle_unusual;
continue;
// int64 num_rows = 2;
case 2:
if (PROTOBUF_PREDICT_TRUE(static_cast<::PROTOBUF_NAMESPACE_ID::uint8>(tag) == 16)) {
num_rows_ = ::PROTOBUF_NAMESPACE_ID::internal::ReadVarint(&ptr);
CHK_(ptr);
} else goto handle_unusual;
continue;
default: {
handle_unusual:
if ((tag & 7) == 4 || tag == 0) {
ctx->SetLastTag(tag);
goto success;
}
ptr = UnknownFieldParse(tag, &_internal_metadata_, ptr, ctx);
CHK_(ptr != nullptr);
continue;
}
} // switch
} // while
success:
return ptr;
failure:
ptr = nullptr;
goto success;
#undef CHK_
}
#else // GOOGLE_PROTOBUF_ENABLE_EXPERIMENTAL_PARSER
bool InsertRecord::MergePartialFromCodedStream(
::PROTOBUF_NAMESPACE_ID::io::CodedInputStream* input) {
#define DO_(EXPRESSION) if (!PROTOBUF_PREDICT_TRUE(EXPRESSION)) goto failure
::PROTOBUF_NAMESPACE_ID::uint32 tag;
// @@protoc_insertion_point(parse_start:milvus.proto.segcore.InsertRecord)
for (;;) {
::std::pair<::PROTOBUF_NAMESPACE_ID::uint32, bool> p = input->ReadTagWithCutoffNoLastTag(127u);
tag = p.first;
if (!p.second) goto handle_unusual;
switch (::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::GetTagFieldNumber(tag)) {
// repeated .milvus.proto.schema.FieldData fields_data = 1;
case 1: {
if (static_cast< ::PROTOBUF_NAMESPACE_ID::uint8>(tag) == (10 & 0xFF)) {
DO_(::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::ReadMessage(
input, add_fields_data()));
} else {
goto handle_unusual;
}
break;
}
// int64 num_rows = 2;
case 2: {
if (static_cast< ::PROTOBUF_NAMESPACE_ID::uint8>(tag) == (16 & 0xFF)) {
DO_((::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::ReadPrimitive<
::PROTOBUF_NAMESPACE_ID::int64, ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::TYPE_INT64>(
input, &num_rows_)));
} else {
goto handle_unusual;
}
break;
}
default: {
handle_unusual:
if (tag == 0) {
goto success;
}
DO_(::PROTOBUF_NAMESPACE_ID::internal::WireFormat::SkipField(
input, tag, _internal_metadata_.mutable_unknown_fields()));
break;
}
}
}
success:
// @@protoc_insertion_point(parse_success:milvus.proto.segcore.InsertRecord)
return true;
failure:
// @@protoc_insertion_point(parse_failure:milvus.proto.segcore.InsertRecord)
return false;
#undef DO_
}
#endif // GOOGLE_PROTOBUF_ENABLE_EXPERIMENTAL_PARSER
void InsertRecord::SerializeWithCachedSizes(
::PROTOBUF_NAMESPACE_ID::io::CodedOutputStream* output) const {
// @@protoc_insertion_point(serialize_start:milvus.proto.segcore.InsertRecord)
::PROTOBUF_NAMESPACE_ID::uint32 cached_has_bits = 0;
(void) cached_has_bits;
// repeated .milvus.proto.schema.FieldData fields_data = 1;
for (unsigned int i = 0,
n = static_cast<unsigned int>(this->fields_data_size()); i < n; i++) {
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteMessageMaybeToArray(
1,
this->fields_data(static_cast<int>(i)),
output);
}
// int64 num_rows = 2;
if (this->num_rows() != 0) {
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt64(2, this->num_rows(), output);
}
if (_internal_metadata_.have_unknown_fields()) {
::PROTOBUF_NAMESPACE_ID::internal::WireFormat::SerializeUnknownFields(
_internal_metadata_.unknown_fields(), output);
}
// @@protoc_insertion_point(serialize_end:milvus.proto.segcore.InsertRecord)
}
::PROTOBUF_NAMESPACE_ID::uint8* InsertRecord::InternalSerializeWithCachedSizesToArray(
::PROTOBUF_NAMESPACE_ID::uint8* target) const {
// @@protoc_insertion_point(serialize_to_array_start:milvus.proto.segcore.InsertRecord)
::PROTOBUF_NAMESPACE_ID::uint32 cached_has_bits = 0;
(void) cached_has_bits;
// repeated .milvus.proto.schema.FieldData fields_data = 1;
for (unsigned int i = 0,
n = static_cast<unsigned int>(this->fields_data_size()); i < n; i++) {
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::
InternalWriteMessageToArray(
1, this->fields_data(static_cast<int>(i)), target);
}
// int64 num_rows = 2;
if (this->num_rows() != 0) {
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::WriteInt64ToArray(2, this->num_rows(), target);
}
if (_internal_metadata_.have_unknown_fields()) {
target = ::PROTOBUF_NAMESPACE_ID::internal::WireFormat::SerializeUnknownFieldsToArray(
_internal_metadata_.unknown_fields(), target);
}
// @@protoc_insertion_point(serialize_to_array_end:milvus.proto.segcore.InsertRecord)
return target;
}
size_t InsertRecord::ByteSizeLong() const {
// @@protoc_insertion_point(message_byte_size_start:milvus.proto.segcore.InsertRecord)
size_t total_size = 0;
if (_internal_metadata_.have_unknown_fields()) {
total_size +=
::PROTOBUF_NAMESPACE_ID::internal::WireFormat::ComputeUnknownFieldsSize(
_internal_metadata_.unknown_fields());
}
::PROTOBUF_NAMESPACE_ID::uint32 cached_has_bits = 0;
// Prevent compiler warnings about cached_has_bits being unused
(void) cached_has_bits;
// repeated .milvus.proto.schema.FieldData fields_data = 1;
{
unsigned int count = static_cast<unsigned int>(this->fields_data_size());
total_size += 1UL * count;
for (unsigned int i = 0; i < count; i++) {
total_size +=
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::MessageSize(
this->fields_data(static_cast<int>(i)));
}
}
// int64 num_rows = 2;
if (this->num_rows() != 0) {
total_size += 1 +
::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::Int64Size(
this->num_rows());
}
int cached_size = ::PROTOBUF_NAMESPACE_ID::internal::ToCachedSize(total_size);
SetCachedSize(cached_size);
return total_size;
}
void InsertRecord::MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) {
// @@protoc_insertion_point(generalized_merge_from_start:milvus.proto.segcore.InsertRecord)
GOOGLE_DCHECK_NE(&from, this);
const InsertRecord* source =
::PROTOBUF_NAMESPACE_ID::DynamicCastToGenerated<InsertRecord>(
&from);
if (source == nullptr) {
// @@protoc_insertion_point(generalized_merge_from_cast_fail:milvus.proto.segcore.InsertRecord)
::PROTOBUF_NAMESPACE_ID::internal::ReflectionOps::Merge(from, this);
} else {
// @@protoc_insertion_point(generalized_merge_from_cast_success:milvus.proto.segcore.InsertRecord)
MergeFrom(*source);
}
}
void InsertRecord::MergeFrom(const InsertRecord& from) {
// @@protoc_insertion_point(class_specific_merge_from_start:milvus.proto.segcore.InsertRecord)
GOOGLE_DCHECK_NE(&from, this);
_internal_metadata_.MergeFrom(from._internal_metadata_);
::PROTOBUF_NAMESPACE_ID::uint32 cached_has_bits = 0;
(void) cached_has_bits;
fields_data_.MergeFrom(from.fields_data_);
if (from.num_rows() != 0) {
set_num_rows(from.num_rows());
}
}
void InsertRecord::CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) {
// @@protoc_insertion_point(generalized_copy_from_start:milvus.proto.segcore.InsertRecord)
if (&from == this) return;
Clear();
MergeFrom(from);
}
void InsertRecord::CopyFrom(const InsertRecord& from) {
// @@protoc_insertion_point(class_specific_copy_from_start:milvus.proto.segcore.InsertRecord)
if (&from == this) return;
Clear();
MergeFrom(from);
}
bool InsertRecord::IsInitialized() const {
return true;
}
void InsertRecord::InternalSwap(InsertRecord* other) {
using std::swap;
_internal_metadata_.Swap(&other->_internal_metadata_);
CastToBase(&fields_data_)->InternalSwap(CastToBase(&other->fields_data_));
swap(num_rows_, other->num_rows_);
}
::PROTOBUF_NAMESPACE_ID::Metadata InsertRecord::GetMetadata() const {
return GetMetadataStatic();
}
// @@protoc_insertion_point(namespace_scope)
} // namespace segcore
} // namespace proto
@ -1213,6 +1552,9 @@ template<> PROTOBUF_NOINLINE ::milvus::proto::segcore::LoadFieldMeta* Arena::Cre
template<> PROTOBUF_NOINLINE ::milvus::proto::segcore::LoadSegmentMeta* Arena::CreateMaybeMessage< ::milvus::proto::segcore::LoadSegmentMeta >(Arena* arena) {
return Arena::CreateInternal< ::milvus::proto::segcore::LoadSegmentMeta >(arena);
}
template<> PROTOBUF_NOINLINE ::milvus::proto::segcore::InsertRecord* Arena::CreateMaybeMessage< ::milvus::proto::segcore::InsertRecord >(Arena* arena) {
return Arena::CreateInternal< ::milvus::proto::segcore::InsertRecord >(arena);
}
PROTOBUF_NAMESPACE_CLOSE
// @@protoc_insertion_point(global_scope)

View File

@ -48,7 +48,7 @@ struct TableStruct_segcore_2eproto {
PROTOBUF_SECTION_VARIABLE(protodesc_cold);
static const ::PROTOBUF_NAMESPACE_ID::internal::AuxillaryParseTableField aux[]
PROTOBUF_SECTION_VARIABLE(protodesc_cold);
static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTable schema[3]
static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTable schema[4]
PROTOBUF_SECTION_VARIABLE(protodesc_cold);
static const ::PROTOBUF_NAMESPACE_ID::internal::FieldMetadata field_metadata[];
static const ::PROTOBUF_NAMESPACE_ID::internal::SerializationTable serialization_table[];
@ -58,6 +58,9 @@ extern const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table
namespace milvus {
namespace proto {
namespace segcore {
class InsertRecord;
class InsertRecordDefaultTypeInternal;
extern InsertRecordDefaultTypeInternal _InsertRecord_default_instance_;
class LoadFieldMeta;
class LoadFieldMetaDefaultTypeInternal;
extern LoadFieldMetaDefaultTypeInternal _LoadFieldMeta_default_instance_;
@ -71,6 +74,7 @@ extern RetrieveResultsDefaultTypeInternal _RetrieveResults_default_instance_;
} // namespace proto
} // namespace milvus
PROTOBUF_NAMESPACE_OPEN
template<> ::milvus::proto::segcore::InsertRecord* Arena::CreateMaybeMessage<::milvus::proto::segcore::InsertRecord>(Arena*);
template<> ::milvus::proto::segcore::LoadFieldMeta* Arena::CreateMaybeMessage<::milvus::proto::segcore::LoadFieldMeta>(Arena*);
template<> ::milvus::proto::segcore::LoadSegmentMeta* Arena::CreateMaybeMessage<::milvus::proto::segcore::LoadSegmentMeta>(Arena*);
template<> ::milvus::proto::segcore::RetrieveResults* Arena::CreateMaybeMessage<::milvus::proto::segcore::RetrieveResults>(Arena*);
@ -529,6 +533,150 @@ class LoadSegmentMeta :
mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
friend struct ::TableStruct_segcore_2eproto;
};
// -------------------------------------------------------------------
class InsertRecord :
public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:milvus.proto.segcore.InsertRecord) */ {
public:
InsertRecord();
virtual ~InsertRecord();
InsertRecord(const InsertRecord& from);
InsertRecord(InsertRecord&& from) noexcept
: InsertRecord() {
*this = ::std::move(from);
}
inline InsertRecord& operator=(const InsertRecord& from) {
CopyFrom(from);
return *this;
}
inline InsertRecord& operator=(InsertRecord&& from) noexcept {
if (GetArenaNoVirtual() == from.GetArenaNoVirtual()) {
if (this != &from) InternalSwap(&from);
} else {
CopyFrom(from);
}
return *this;
}
static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
return GetDescriptor();
}
static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
return GetMetadataStatic().descriptor;
}
static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
return GetMetadataStatic().reflection;
}
static const InsertRecord& default_instance();
static void InitAsDefaultInstance(); // FOR INTERNAL USE ONLY
static inline const InsertRecord* internal_default_instance() {
return reinterpret_cast<const InsertRecord*>(
&_InsertRecord_default_instance_);
}
static constexpr int kIndexInFileMessages =
3;
friend void swap(InsertRecord& a, InsertRecord& b) {
a.Swap(&b);
}
inline void Swap(InsertRecord* other) {
if (other == this) return;
InternalSwap(other);
}
// implements Message ----------------------------------------------
inline InsertRecord* New() const final {
return CreateMaybeMessage<InsertRecord>(nullptr);
}
InsertRecord* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
return CreateMaybeMessage<InsertRecord>(arena);
}
void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
void CopyFrom(const InsertRecord& from);
void MergeFrom(const InsertRecord& from);
PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
bool IsInitialized() const final;
size_t ByteSizeLong() const final;
#if GOOGLE_PROTOBUF_ENABLE_EXPERIMENTAL_PARSER
const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
#else
bool MergePartialFromCodedStream(
::PROTOBUF_NAMESPACE_ID::io::CodedInputStream* input) final;
#endif // GOOGLE_PROTOBUF_ENABLE_EXPERIMENTAL_PARSER
void SerializeWithCachedSizes(
::PROTOBUF_NAMESPACE_ID::io::CodedOutputStream* output) const final;
::PROTOBUF_NAMESPACE_ID::uint8* InternalSerializeWithCachedSizesToArray(
::PROTOBUF_NAMESPACE_ID::uint8* target) const final;
int GetCachedSize() const final { return _cached_size_.Get(); }
private:
inline void SharedCtor();
inline void SharedDtor();
void SetCachedSize(int size) const final;
void InternalSwap(InsertRecord* other);
friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
return "milvus.proto.segcore.InsertRecord";
}
private:
inline ::PROTOBUF_NAMESPACE_ID::Arena* GetArenaNoVirtual() const {
return nullptr;
}
inline void* MaybeArenaPtr() const {
return nullptr;
}
public:
::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
private:
static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_segcore_2eproto);
return ::descriptor_table_segcore_2eproto.file_level_metadata[kIndexInFileMessages];
}
public:
// nested types ----------------------------------------------------
// accessors -------------------------------------------------------
enum : int {
kFieldsDataFieldNumber = 1,
kNumRowsFieldNumber = 2,
};
// repeated .milvus.proto.schema.FieldData fields_data = 1;
int fields_data_size() const;
void clear_fields_data();
::milvus::proto::schema::FieldData* mutable_fields_data(int index);
::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< ::milvus::proto::schema::FieldData >*
mutable_fields_data();
const ::milvus::proto::schema::FieldData& fields_data(int index) const;
::milvus::proto::schema::FieldData* add_fields_data();
const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< ::milvus::proto::schema::FieldData >&
fields_data() const;
// int64 num_rows = 2;
void clear_num_rows();
::PROTOBUF_NAMESPACE_ID::int64 num_rows() const;
void set_num_rows(::PROTOBUF_NAMESPACE_ID::int64 value);
// @@protoc_insertion_point(class_scope:milvus.proto.segcore.InsertRecord)
private:
class _Internal;
::PROTOBUF_NAMESPACE_ID::internal::InternalMetadataWithArena _internal_metadata_;
::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< ::milvus::proto::schema::FieldData > fields_data_;
::PROTOBUF_NAMESPACE_ID::int64 num_rows_;
mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
friend struct ::TableStruct_segcore_2eproto;
};
// ===================================================================
@ -736,6 +884,51 @@ inline void LoadSegmentMeta::set_total_size(::PROTOBUF_NAMESPACE_ID::int64 value
// @@protoc_insertion_point(field_set:milvus.proto.segcore.LoadSegmentMeta.total_size)
}
// -------------------------------------------------------------------
// InsertRecord
// repeated .milvus.proto.schema.FieldData fields_data = 1;
inline int InsertRecord::fields_data_size() const {
return fields_data_.size();
}
inline ::milvus::proto::schema::FieldData* InsertRecord::mutable_fields_data(int index) {
// @@protoc_insertion_point(field_mutable:milvus.proto.segcore.InsertRecord.fields_data)
return fields_data_.Mutable(index);
}
inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< ::milvus::proto::schema::FieldData >*
InsertRecord::mutable_fields_data() {
// @@protoc_insertion_point(field_mutable_list:milvus.proto.segcore.InsertRecord.fields_data)
return &fields_data_;
}
inline const ::milvus::proto::schema::FieldData& InsertRecord::fields_data(int index) const {
// @@protoc_insertion_point(field_get:milvus.proto.segcore.InsertRecord.fields_data)
return fields_data_.Get(index);
}
inline ::milvus::proto::schema::FieldData* InsertRecord::add_fields_data() {
// @@protoc_insertion_point(field_add:milvus.proto.segcore.InsertRecord.fields_data)
return fields_data_.Add();
}
inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< ::milvus::proto::schema::FieldData >&
InsertRecord::fields_data() const {
// @@protoc_insertion_point(field_list:milvus.proto.segcore.InsertRecord.fields_data)
return fields_data_;
}
// int64 num_rows = 2;
inline void InsertRecord::clear_num_rows() {
num_rows_ = PROTOBUF_LONGLONG(0);
}
inline ::PROTOBUF_NAMESPACE_ID::int64 InsertRecord::num_rows() const {
// @@protoc_insertion_point(field_get:milvus.proto.segcore.InsertRecord.num_rows)
return num_rows_;
}
inline void InsertRecord::set_num_rows(::PROTOBUF_NAMESPACE_ID::int64 value) {
num_rows_ = value;
// @@protoc_insertion_point(field_set:milvus.proto.segcore.InsertRecord.num_rows)
}
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif // __GNUC__
@ -743,6 +936,8 @@ inline void LoadSegmentMeta::set_total_size(::PROTOBUF_NAMESPACE_ID::int64 value
// -------------------------------------------------------------------
// -------------------------------------------------------------------
// @@protoc_insertion_point(namespace_scope)

View File

@ -86,15 +86,14 @@ struct LogicalBinaryExpr : BinaryExprBase {
};
struct TermExpr : Expr {
const FieldOffset field_offset_;
const FieldId field_id_;
const DataType data_type_;
protected:
// prevent accidential instantiation
TermExpr() = delete;
TermExpr(const FieldOffset field_offset, const DataType data_type)
: field_offset_(field_offset), data_type_(data_type) {
TermExpr(const FieldId field_id, const DataType data_type) : field_id_(field_id), data_type_(data_type) {
}
public:
@ -110,6 +109,8 @@ enum class OpType {
LessEqual = 4,
Equal = 5,
NotEqual = 6,
PrefixMatch = 7,
PostfixMatch = 8,
};
enum class ArithOpType {
@ -134,7 +135,7 @@ static const std::map<ArithOpType, std::string> mapping_arith_op_ = {
};
struct BinaryArithOpEvalRangeExpr : Expr {
const FieldOffset field_offset_;
const FieldId field_id_;
const DataType data_type_;
const OpType op_type_;
const ArithOpType arith_op_;
@ -143,11 +144,11 @@ struct BinaryArithOpEvalRangeExpr : Expr {
// prevent accidential instantiation
BinaryArithOpEvalRangeExpr() = delete;
BinaryArithOpEvalRangeExpr(const FieldOffset field_offset,
BinaryArithOpEvalRangeExpr(const FieldId field_id,
const DataType data_type,
const OpType op_type,
const ArithOpType arith_op)
: field_offset_(field_offset), data_type_(data_type), op_type_(op_type), arith_op_(arith_op) {
: field_id_(field_id), data_type_(data_type), op_type_(op_type), arith_op_(arith_op) {
}
public:
@ -163,7 +164,7 @@ static const std::map<std::string, OpType> mapping_ = {
};
struct UnaryRangeExpr : Expr {
const FieldOffset field_offset_;
const FieldId field_id_;
const DataType data_type_;
const OpType op_type_;
@ -171,8 +172,8 @@ struct UnaryRangeExpr : Expr {
// prevent accidential instantiation
UnaryRangeExpr() = delete;
UnaryRangeExpr(const FieldOffset field_offset, const DataType data_type, const OpType op_type)
: field_offset_(field_offset), data_type_(data_type), op_type_(op_type) {
UnaryRangeExpr(const FieldId field_id, const DataType data_type, const OpType op_type)
: field_id_(field_id), data_type_(data_type), op_type_(op_type) {
}
public:
@ -181,7 +182,7 @@ struct UnaryRangeExpr : Expr {
};
struct BinaryRangeExpr : Expr {
const FieldOffset field_offset_;
const FieldId field_id_;
const DataType data_type_;
const bool lower_inclusive_;
const bool upper_inclusive_;
@ -190,11 +191,11 @@ struct BinaryRangeExpr : Expr {
// prevent accidential instantiation
BinaryRangeExpr() = delete;
BinaryRangeExpr(const FieldOffset field_offset,
BinaryRangeExpr(const FieldId field_id,
const DataType data_type,
const bool lower_inclusive,
const bool upper_inclusive)
: field_offset_(field_offset),
: field_id_(field_id),
data_type_(data_type),
lower_inclusive_(lower_inclusive),
upper_inclusive_(upper_inclusive) {
@ -206,8 +207,8 @@ struct BinaryRangeExpr : Expr {
};
struct CompareExpr : Expr {
FieldOffset left_field_offset_;
FieldOffset right_field_offset_;
FieldId left_field_id_;
FieldId right_field_id_;
DataType left_data_type_;
DataType right_data_type_;
OpType op_type_;

View File

@ -28,8 +28,8 @@ template <typename T>
struct TermExprImpl : TermExpr {
const std::vector<T> terms_;
TermExprImpl(const FieldOffset field_offset, const DataType data_type, const std::vector<T>& terms)
: TermExpr(field_offset, data_type), terms_(terms) {
TermExprImpl(const FieldId field_id, const DataType data_type, const std::vector<T>& terms)
: TermExpr(field_id, data_type), terms_(terms) {
}
};
@ -38,13 +38,13 @@ struct BinaryArithOpEvalRangeExprImpl : BinaryArithOpEvalRangeExpr {
const T right_operand_;
const T value_;
BinaryArithOpEvalRangeExprImpl(const FieldOffset field_offset,
BinaryArithOpEvalRangeExprImpl(const FieldId field_id,
const DataType data_type,
const ArithOpType arith_op,
const T right_operand,
const OpType op_type,
const T value)
: BinaryArithOpEvalRangeExpr(field_offset, data_type, op_type, arith_op),
: BinaryArithOpEvalRangeExpr(field_id, data_type, op_type, arith_op),
right_operand_(right_operand),
value_(value) {
}
@ -54,8 +54,8 @@ template <typename T>
struct UnaryRangeExprImpl : UnaryRangeExpr {
const T value_;
UnaryRangeExprImpl(const FieldOffset field_offset, const DataType data_type, const OpType op_type, const T value)
: UnaryRangeExpr(field_offset, data_type, op_type), value_(value) {
UnaryRangeExprImpl(const FieldId field_id, const DataType data_type, const OpType op_type, const T value)
: UnaryRangeExpr(field_id, data_type, op_type), value_(value) {
}
};
@ -64,13 +64,13 @@ struct BinaryRangeExprImpl : BinaryRangeExpr {
const T lower_value_;
const T upper_value_;
BinaryRangeExprImpl(const FieldOffset field_offset,
BinaryRangeExprImpl(const FieldId field_id,
const DataType data_type,
const bool lower_inclusive,
const bool upper_inclusive,
const T lower_value,
const T upper_value)
: BinaryRangeExpr(field_offset, data_type, lower_inclusive, upper_inclusive),
: BinaryRangeExpr(field_id, data_type, lower_inclusive, upper_inclusive),
lower_value_(lower_value),
upper_value_(upper_value) {
}

View File

@ -75,13 +75,13 @@ Parser::ParseCompareNode(const Json& out_body) {
Assert(item0.is_string());
auto left_field_name = FieldName(item0.get<std::string>());
expr->left_data_type_ = schema[left_field_name].get_data_type();
expr->left_field_offset_ = schema.get_offset(left_field_name);
expr->left_field_id_ = schema.get_field_id(left_field_name);
auto& item1 = body[1];
Assert(item1.is_string());
auto right_field_name = FieldName(item1.get<std::string>());
expr->right_data_type_ = schema[right_field_name].get_data_type();
expr->right_field_offset_ = schema.get_offset(right_field_name);
expr->right_field_id_ = schema.get_field_id(right_field_name);
return expr;
}
@ -188,7 +188,7 @@ Parser::ParseVecNode(const Json& out_body) {
AssertInfo(topk > 0, "topk must greater than 0");
AssertInfo(topk < 16384, "topk is too large");
auto field_offset = schema.get_offset(field_name);
auto field_id = schema.get_field_id(field_name);
auto vec_node = [&]() -> std::unique_ptr<VectorPlanNode> {
auto& field_meta = schema.operator[](field_name);
@ -202,12 +202,12 @@ Parser::ParseVecNode(const Json& out_body) {
vec_node->search_info_.topk_ = topk;
vec_node->search_info_.metric_type_ = GetMetricType(vec_info.at("metric_type"));
vec_node->search_info_.search_params_ = vec_info.at("params");
vec_node->search_info_.field_offset_ = field_offset;
vec_node->search_info_.field_id_ = field_id;
vec_node->search_info_.round_decimal_ = vec_info.at("round_decimal");
vec_node->placeholder_tag_ = vec_info.at("query");
auto tag = vec_node->placeholder_tag_;
AssertInfo(!tag2field_.count(tag), "duplicated placeholder tag");
tag2field_.emplace(tag, field_offset);
tag2field_.emplace(tag, field_id);
return vec_node;
}
@ -232,7 +232,8 @@ Parser::ParseTermNodeImpl(const FieldName& field_name, const Json& body) {
terms[i] = value;
}
std::sort(terms.begin(), terms.end());
return std::make_unique<TermExprImpl<T>>(schema.get_offset(field_name), schema[field_name].get_data_type(), terms);
return std::make_unique<TermExprImpl<T>>(schema.get_field_id(field_name), schema[field_name].get_data_type(),
terms);
}
template <typename T>
@ -298,8 +299,8 @@ Parser::ParseRangeNodeImpl(const FieldName& field_name, const Json& body) {
}
return std::make_unique<BinaryArithOpEvalRangeExprImpl<T>>(
schema.get_offset(field_name), schema[field_name].get_data_type(), arith_op_mapping_.at(arith_op_name),
right_operand, mapping_.at(op_name), value);
schema.get_field_id(field_name), schema[field_name].get_data_type(),
arith_op_mapping_.at(arith_op_name), right_operand, mapping_.at(op_name), value);
}
if constexpr (std::is_same_v<T, bool>) {
@ -312,7 +313,7 @@ Parser::ParseRangeNodeImpl(const FieldName& field_name, const Json& body) {
static_assert(always_false<T>, "unsupported type");
}
return std::make_unique<UnaryRangeExprImpl<T>>(
schema.get_offset(field_name), schema[field_name].get_data_type(), mapping_.at(op_name), item.value());
schema.get_field_id(field_name), schema[field_name].get_data_type(), mapping_.at(op_name), item.value());
} else if (body.size() == 2) {
bool has_lower_value = false;
bool has_upper_value = false;
@ -351,7 +352,7 @@ Parser::ParseRangeNodeImpl(const FieldName& field_name, const Json& body) {
}
}
AssertInfo(has_lower_value && has_upper_value, "illegal binary-range node");
return std::make_unique<BinaryRangeExprImpl<T>>(schema.get_offset(field_name),
return std::make_unique<BinaryRangeExprImpl<T>>(schema.get_field_id(field_name),
schema[field_name].get_data_type(), lower_inclusive,
upper_inclusive, lower_value, upper_value);
} else {

View File

@ -85,7 +85,7 @@ class Parser {
private:
const Schema& schema;
std::map<std::string, FieldOffset> tag2field_; // PlaceholderName -> field offset
std::map<std::string, FieldId> tag2field_; // PlaceholderName -> field id
std::optional<std::unique_ptr<VectorPlanNode>> vector_node_opt_;
};

View File

@ -32,8 +32,8 @@ ParsePlaceholderGroup(const Plan* plan, const std::string& blob) {
Placeholder element;
element.tag_ = info.tag();
Assert(plan->tag2field_.count(element.tag_));
auto field_offset = plan->tag2field_.at(element.tag_);
auto& field_meta = plan->schema_[field_offset];
auto field_id = plan->tag2field_.at(element.tag_);
auto& field_meta = plan->schema_[field_id];
element.num_of_queries_ = info.values_size();
AssertInfo(element.num_of_queries_, "must have queries");
Assert(element.num_of_queries_ > 0);
@ -86,9 +86,9 @@ GetNumOfQueries(const PlaceholderGroup* group) {
// std::unique_ptr<RetrievePlan>
// CreateRetrievePlan(const Schema& schema, proto::segcore::RetrieveRequest&& request) {
// auto plan = std::make_unique<RetrievePlan>();
// plan->ids_ = std::unique_ptr<proto::schema::IDs>(request.release_ids());
// plan->seg_offsets_ = std::unique_ptr<proto::schema::IDs>(request.release_ids());
// for (auto& field_id : request.output_fields_id()) {
// plan->field_offsets_.push_back(schema.get_offset(FieldId(field_id)));
// plan->field_ids_.push_back(schema.get_offset(FieldId(field_id)));
// }
// return plan;
//}

View File

@ -22,6 +22,7 @@
#include "exceptions/EasyAssert.h"
#include "pb/milvus.pb.h"
#include "utils/Json.h"
#include "common/Consts.h"
namespace milvus::query {
@ -33,8 +34,10 @@ struct ExtractedPlanInfo {
}
void
add_involved_field(FieldOffset field_offset) {
involved_fields_.set(field_offset.get());
add_involved_field(FieldId field_id) {
auto pos = field_id.get() - START_USER_FIELDID;
AssertInfo(pos >= 0, "field id is invalid");
involved_fields_.set(pos);
}
public:
@ -49,8 +52,8 @@ struct Plan {
public:
const Schema& schema_;
std::unique_ptr<VectorPlanNode> plan_node_;
std::map<std::string, FieldOffset> tag2field_; // PlaceholderName -> FieldOffset
std::vector<FieldOffset> target_entries_;
std::map<std::string, FieldId> tag2field_; // PlaceholderName -> FieldId
std::vector<FieldId> target_entries_;
void
check_identical(Plan& other);
@ -86,7 +89,7 @@ struct RetrievePlan {
public:
const Schema& schema_;
std::unique_ptr<RetrievePlanNode> plan_node_;
std::vector<FieldOffset> field_offsets_;
std::vector<FieldId> field_ids_;
};
using PlanPtr = std::unique_ptr<Plan>;

View File

@ -37,7 +37,7 @@ using PlanNodePtr = std::unique_ptr<PlanNode>;
struct SearchInfo {
int64_t topk_;
int64_t round_decimal_;
FieldOffset field_offset_;
FieldId field_id_;
MetricType metric_type_;
nlohmann::json search_params_;
};

View File

@ -11,18 +11,21 @@
#include <google/protobuf/text_format.h>
#include <string>
#include "ExprImpl.h"
#include "PlanProto.h"
#include "generated/ExtractInfoExprVisitor.h"
#include "generated/ExtractInfoPlanNodeVisitor.h"
#include "common/VectorTrait.h"
namespace milvus::query {
namespace planpb = milvus::proto::plan;
template <typename T>
std::unique_ptr<TermExprImpl<T>>
ExtractTermExprImpl(FieldOffset field_offset, DataType data_type, const planpb::TermExpr& expr_proto) {
static_assert(std::is_fundamental_v<T>);
ExtractTermExprImpl(FieldId field_id, DataType data_type, const planpb::TermExpr& expr_proto) {
static_assert(IsScalar<T>);
auto size = expr_proto.values_size();
std::vector<T> terms(size);
for (int i = 0; i < size; ++i) {
@ -36,18 +39,21 @@ ExtractTermExprImpl(FieldOffset field_offset, DataType data_type, const planpb::
} else if constexpr (std::is_floating_point_v<T>) {
Assert(value_proto.val_case() == planpb::GenericValue::kFloatVal);
terms[i] = static_cast<T>(value_proto.float_val());
} else if constexpr (std::is_same_v<T, std::string>) {
Assert(value_proto.val_case() == planpb::GenericValue::kStringVal);
terms[i] = static_cast<T>(value_proto.string_val());
} else {
static_assert(always_false<T>);
}
}
std::sort(terms.begin(), terms.end());
return std::make_unique<TermExprImpl<T>>(field_offset, data_type, terms);
return std::make_unique<TermExprImpl<T>>(field_id, data_type, terms);
}
template <typename T>
std::unique_ptr<UnaryRangeExprImpl<T>>
ExtractUnaryRangeExprImpl(FieldOffset field_offset, DataType data_type, const planpb::UnaryRangeExpr& expr_proto) {
static_assert(std::is_fundamental_v<T>);
ExtractUnaryRangeExprImpl(FieldId field_id, DataType data_type, const planpb::UnaryRangeExpr& expr_proto) {
static_assert(IsScalar<T>);
auto getValue = [&](const auto& value_proto) -> T {
if constexpr (std::is_same_v<T, bool>) {
Assert(value_proto.val_case() == planpb::GenericValue::kBoolVal);
@ -58,18 +64,21 @@ ExtractUnaryRangeExprImpl(FieldOffset field_offset, DataType data_type, const pl
} else if constexpr (std::is_floating_point_v<T>) {
Assert(value_proto.val_case() == planpb::GenericValue::kFloatVal);
return static_cast<T>(value_proto.float_val());
} else if constexpr (std::is_same_v<T, std::string>) {
Assert(value_proto.val_case() == planpb::GenericValue::kStringVal);
return static_cast<T>(value_proto.string_val());
} else {
static_assert(always_false<T>);
}
};
return std::make_unique<UnaryRangeExprImpl<T>>(field_offset, data_type, static_cast<OpType>(expr_proto.op()),
return std::make_unique<UnaryRangeExprImpl<T>>(field_id, data_type, static_cast<OpType>(expr_proto.op()),
getValue(expr_proto.value()));
}
template <typename T>
std::unique_ptr<BinaryRangeExprImpl<T>>
ExtractBinaryRangeExprImpl(FieldOffset field_offset, DataType data_type, const planpb::BinaryRangeExpr& expr_proto) {
static_assert(std::is_fundamental_v<T>);
ExtractBinaryRangeExprImpl(FieldId field_id, DataType data_type, const planpb::BinaryRangeExpr& expr_proto) {
static_assert(IsScalar<T>);
auto getValue = [&](const auto& value_proto) -> T {
if constexpr (std::is_same_v<T, bool>) {
Assert(value_proto.val_case() == planpb::GenericValue::kBoolVal);
@ -80,18 +89,21 @@ ExtractBinaryRangeExprImpl(FieldOffset field_offset, DataType data_type, const p
} else if constexpr (std::is_floating_point_v<T>) {
Assert(value_proto.val_case() == planpb::GenericValue::kFloatVal);
return static_cast<T>(value_proto.float_val());
} else if constexpr (std::is_same_v<T, std::string>) {
Assert(value_proto.val_case() == planpb::GenericValue::kStringVal);
return static_cast<T>(value_proto.string_val());
} else {
static_assert(always_false<T>);
}
};
return std::make_unique<BinaryRangeExprImpl<T>>(field_offset, data_type, expr_proto.lower_inclusive(),
return std::make_unique<BinaryRangeExprImpl<T>>(field_id, data_type, expr_proto.lower_inclusive(),
expr_proto.upper_inclusive(), getValue(expr_proto.lower_value()),
getValue(expr_proto.upper_value()));
}
template <typename T>
std::unique_ptr<BinaryArithOpEvalRangeExprImpl<T>>
ExtractBinaryArithOpEvalRangeExprImpl(FieldOffset field_offset,
ExtractBinaryArithOpEvalRangeExprImpl(FieldId field_id,
DataType data_type,
const planpb::BinaryArithOpEvalRangeExpr& expr_proto) {
static_assert(std::is_fundamental_v<T>);
@ -110,7 +122,7 @@ ExtractBinaryArithOpEvalRangeExprImpl(FieldOffset field_offset,
}
};
return std::make_unique<BinaryArithOpEvalRangeExprImpl<T>>(
field_offset, data_type, static_cast<ArithOpType>(expr_proto.arith_op()), getValue(expr_proto.right_operand()),
field_id, data_type, static_cast<ArithOpType>(expr_proto.arith_op()), getValue(expr_proto.right_operand()),
static_cast<OpType>(expr_proto.op()), getValue(expr_proto.value()));
}
@ -131,8 +143,7 @@ ProtoParser::PlanNodeFromProto(const planpb::PlanNode& plan_node_proto) {
SearchInfo search_info;
auto field_id = FieldId(anns_proto.field_id());
auto field_offset = schema.get_offset(field_id);
search_info.field_offset_ = field_offset;
search_info.field_id_ = field_id;
search_info.metric_type_ = GetMetricType(query_info_proto.metric_type());
search_info.topk_ = query_info_proto.topk();
@ -165,6 +176,7 @@ ProtoParser::RetrievePlanNodeFromProto(const planpb::PlanNode& plan_node_proto)
std::unique_ptr<Plan>
ProtoParser::CreatePlan(const proto::plan::PlanNode& plan_node_proto) {
// std::cout << plan_node_proto.DebugString() << std::endl;
auto plan = std::make_unique<Plan>(schema);
auto plan_node = PlanNodeFromProto(plan_node_proto);
@ -172,14 +184,13 @@ ProtoParser::CreatePlan(const proto::plan::PlanNode& plan_node_proto) {
ExtractInfoPlanNodeVisitor extractor(plan_info);
plan_node->accept(extractor);
plan->tag2field_["$0"] = plan_node->search_info_.field_offset_;
plan->tag2field_["$0"] = plan_node->search_info_.field_id_;
plan->plan_node_ = std::move(plan_node);
plan->extra_info_opt_ = std::move(plan_info);
for (auto field_id_raw : plan_node_proto.output_field_ids()) {
auto field_id = FieldId(field_id_raw);
auto offset = schema.get_offset(field_id);
plan->target_entries_.push_back(offset);
plan->target_entries_.push_back(field_id);
}
return plan;
@ -197,8 +208,7 @@ ProtoParser::CreateRetrievePlan(const proto::plan::PlanNode& plan_node_proto) {
retrieve_plan->plan_node_ = std::move(plan_node);
for (auto field_id_raw : plan_node_proto.output_field_ids()) {
auto field_id = FieldId(field_id_raw);
auto offset = schema.get_offset(field_id);
retrieve_plan->field_offsets_.push_back(offset);
retrieve_plan->field_ids_.push_back(field_id);
}
return retrieve_plan;
}
@ -207,32 +217,34 @@ ExprPtr
ProtoParser::ParseUnaryRangeExpr(const proto::plan::UnaryRangeExpr& expr_pb) {
auto& column_info = expr_pb.column_info();
auto field_id = FieldId(column_info.field_id());
auto field_offset = schema.get_offset(field_id);
auto data_type = schema[field_offset].get_data_type();
auto data_type = schema[field_id].get_data_type();
Assert(data_type == static_cast<DataType>(column_info.data_type()));
auto result = [&]() -> ExprPtr {
switch (data_type) {
case DataType::BOOL: {
return ExtractUnaryRangeExprImpl<bool>(field_offset, data_type, expr_pb);
return ExtractUnaryRangeExprImpl<bool>(field_id, data_type, expr_pb);
}
case DataType::INT8: {
return ExtractUnaryRangeExprImpl<int8_t>(field_offset, data_type, expr_pb);
return ExtractUnaryRangeExprImpl<int8_t>(field_id, data_type, expr_pb);
}
case DataType::INT16: {
return ExtractUnaryRangeExprImpl<int16_t>(field_offset, data_type, expr_pb);
return ExtractUnaryRangeExprImpl<int16_t>(field_id, data_type, expr_pb);
}
case DataType::INT32: {
return ExtractUnaryRangeExprImpl<int32_t>(field_offset, data_type, expr_pb);
return ExtractUnaryRangeExprImpl<int32_t>(field_id, data_type, expr_pb);
}
case DataType::INT64: {
return ExtractUnaryRangeExprImpl<int64_t>(field_offset, data_type, expr_pb);
return ExtractUnaryRangeExprImpl<int64_t>(field_id, data_type, expr_pb);
}
case DataType::FLOAT: {
return ExtractUnaryRangeExprImpl<float>(field_offset, data_type, expr_pb);
return ExtractUnaryRangeExprImpl<float>(field_id, data_type, expr_pb);
}
case DataType::DOUBLE: {
return ExtractUnaryRangeExprImpl<double>(field_offset, data_type, expr_pb);
return ExtractUnaryRangeExprImpl<double>(field_id, data_type, expr_pb);
}
case DataType::VARCHAR: {
return ExtractUnaryRangeExprImpl<std::string>(field_id, data_type, expr_pb);
}
default: {
PanicInfo("unsupported data type");
@ -246,32 +258,34 @@ ExprPtr
ProtoParser::ParseBinaryRangeExpr(const proto::plan::BinaryRangeExpr& expr_pb) {
auto& columnInfo = expr_pb.column_info();
auto field_id = FieldId(columnInfo.field_id());
auto field_offset = schema.get_offset(field_id);
auto data_type = schema[field_offset].get_data_type();
auto data_type = schema[field_id].get_data_type();
Assert(data_type == (DataType)columnInfo.data_type());
auto result = [&]() -> ExprPtr {
switch (data_type) {
case DataType::BOOL: {
return ExtractBinaryRangeExprImpl<bool>(field_offset, data_type, expr_pb);
return ExtractBinaryRangeExprImpl<bool>(field_id, data_type, expr_pb);
}
case DataType::INT8: {
return ExtractBinaryRangeExprImpl<int8_t>(field_offset, data_type, expr_pb);
return ExtractBinaryRangeExprImpl<int8_t>(field_id, data_type, expr_pb);
}
case DataType::INT16: {
return ExtractBinaryRangeExprImpl<int16_t>(field_offset, data_type, expr_pb);
return ExtractBinaryRangeExprImpl<int16_t>(field_id, data_type, expr_pb);
}
case DataType::INT32: {
return ExtractBinaryRangeExprImpl<int32_t>(field_offset, data_type, expr_pb);
return ExtractBinaryRangeExprImpl<int32_t>(field_id, data_type, expr_pb);
}
case DataType::INT64: {
return ExtractBinaryRangeExprImpl<int64_t>(field_offset, data_type, expr_pb);
return ExtractBinaryRangeExprImpl<int64_t>(field_id, data_type, expr_pb);
}
case DataType::FLOAT: {
return ExtractBinaryRangeExprImpl<float>(field_offset, data_type, expr_pb);
return ExtractBinaryRangeExprImpl<float>(field_id, data_type, expr_pb);
}
case DataType::DOUBLE: {
return ExtractBinaryRangeExprImpl<double>(field_offset, data_type, expr_pb);
return ExtractBinaryRangeExprImpl<double>(field_id, data_type, expr_pb);
}
case DataType::VARCHAR: {
return ExtractBinaryRangeExprImpl<std::string>(field_id, data_type, expr_pb);
}
default: {
PanicInfo("unsupported data type");
@ -285,21 +299,19 @@ ExprPtr
ProtoParser::ParseCompareExpr(const proto::plan::CompareExpr& expr_pb) {
auto& left_column_info = expr_pb.left_column_info();
auto left_field_id = FieldId(left_column_info.field_id());
auto left_field_offset = schema.get_offset(left_field_id);
auto left_data_type = schema[left_field_offset].get_data_type();
auto left_data_type = schema[left_field_id].get_data_type();
Assert(left_data_type == static_cast<DataType>(left_column_info.data_type()));
auto& right_column_info = expr_pb.right_column_info();
auto right_field_id = FieldId(right_column_info.field_id());
auto right_field_offset = schema.get_offset(right_field_id);
auto right_data_type = schema[right_field_offset].get_data_type();
auto right_data_type = schema[right_field_id].get_data_type();
Assert(right_data_type == static_cast<DataType>(right_column_info.data_type()));
return [&]() -> ExprPtr {
auto result = std::make_unique<CompareExpr>();
result->left_field_offset_ = left_field_offset;
result->left_field_id_ = left_field_id;
result->left_data_type_ = left_data_type;
result->right_field_offset_ = right_field_offset;
result->right_field_id_ = right_field_id;
result->right_data_type_ = right_data_type;
result->op_type_ = static_cast<OpType>(expr_pb.op());
return result;
@ -310,33 +322,35 @@ ExprPtr
ProtoParser::ParseTermExpr(const proto::plan::TermExpr& expr_pb) {
auto& columnInfo = expr_pb.column_info();
auto field_id = FieldId(columnInfo.field_id());
auto field_offset = schema.get_offset(field_id);
auto data_type = schema[field_offset].get_data_type();
auto data_type = schema[field_id].get_data_type();
Assert(data_type == (DataType)columnInfo.data_type());
// auto& field_meta = schema[field_offset];
auto result = [&]() -> ExprPtr {
switch (data_type) {
case DataType::BOOL: {
return ExtractTermExprImpl<bool>(field_offset, data_type, expr_pb);
return ExtractTermExprImpl<bool>(field_id, data_type, expr_pb);
}
case DataType::INT8: {
return ExtractTermExprImpl<int8_t>(field_offset, data_type, expr_pb);
return ExtractTermExprImpl<int8_t>(field_id, data_type, expr_pb);
}
case DataType::INT16: {
return ExtractTermExprImpl<int16_t>(field_offset, data_type, expr_pb);
return ExtractTermExprImpl<int16_t>(field_id, data_type, expr_pb);
}
case DataType::INT32: {
return ExtractTermExprImpl<int32_t>(field_offset, data_type, expr_pb);
return ExtractTermExprImpl<int32_t>(field_id, data_type, expr_pb);
}
case DataType::INT64: {
return ExtractTermExprImpl<int64_t>(field_offset, data_type, expr_pb);
return ExtractTermExprImpl<int64_t>(field_id, data_type, expr_pb);
}
case DataType::FLOAT: {
return ExtractTermExprImpl<float>(field_offset, data_type, expr_pb);
return ExtractTermExprImpl<float>(field_id, data_type, expr_pb);
}
case DataType::DOUBLE: {
return ExtractTermExprImpl<double>(field_offset, data_type, expr_pb);
return ExtractTermExprImpl<double>(field_id, data_type, expr_pb);
}
case DataType::VARCHAR: {
return ExtractTermExprImpl<std::string>(field_id, data_type, expr_pb);
}
default: {
PanicInfo("unsupported data type");
@ -366,29 +380,28 @@ ExprPtr
ProtoParser::ParseBinaryArithOpEvalRangeExpr(const proto::plan::BinaryArithOpEvalRangeExpr& expr_pb) {
auto& column_info = expr_pb.column_info();
auto field_id = FieldId(column_info.field_id());
auto field_offset = schema.get_offset(field_id);
auto data_type = schema[field_offset].get_data_type();
auto data_type = schema[field_id].get_data_type();
Assert(data_type == static_cast<DataType>(column_info.data_type()));
auto result = [&]() -> ExprPtr {
switch (data_type) {
case DataType::INT8: {
return ExtractBinaryArithOpEvalRangeExprImpl<int8_t>(field_offset, data_type, expr_pb);
return ExtractBinaryArithOpEvalRangeExprImpl<int8_t>(field_id, data_type, expr_pb);
}
case DataType::INT16: {
return ExtractBinaryArithOpEvalRangeExprImpl<int16_t>(field_offset, data_type, expr_pb);
return ExtractBinaryArithOpEvalRangeExprImpl<int16_t>(field_id, data_type, expr_pb);
}
case DataType::INT32: {
return ExtractBinaryArithOpEvalRangeExprImpl<int32_t>(field_offset, data_type, expr_pb);
return ExtractBinaryArithOpEvalRangeExprImpl<int32_t>(field_id, data_type, expr_pb);
}
case DataType::INT64: {
return ExtractBinaryArithOpEvalRangeExprImpl<int64_t>(field_offset, data_type, expr_pb);
return ExtractBinaryArithOpEvalRangeExprImpl<int64_t>(field_id, data_type, expr_pb);
}
case DataType::FLOAT: {
return ExtractBinaryArithOpEvalRangeExprImpl<float>(field_offset, data_type, expr_pb);
return ExtractBinaryArithOpEvalRangeExprImpl<float>(field_id, data_type, expr_pb);
}
case DataType::DOUBLE: {
return ExtractBinaryArithOpEvalRangeExprImpl<double>(field_offset, data_type, expr_pb);
return ExtractBinaryArithOpEvalRangeExprImpl<double>(field_id, data_type, expr_pb);
}
default: {
PanicInfo("unsupported data type");

View File

@ -0,0 +1,72 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include "common/VectorTrait.h"
#include "exceptions/EasyAssert.h"
#include "query/Expr.h"
#include "common/Utils.h"
#include "query/Utils.h"
#include <functional>
#include <string>
namespace milvus::query {
template <typename Op, typename T, typename U>
bool
RelationalImpl(const T& t, const U& u, FundamentalTag, FundamentalTag) {
return Op{}(t, u);
}
template <typename Op, typename T, typename U>
bool
RelationalImpl(const T& t, const U& u, FundamentalTag, StringTag) {
PanicInfo("incompitible data type");
}
template <typename Op, typename T, typename U>
bool
RelationalImpl(const T& t, const U& u, StringTag, FundamentalTag) {
PanicInfo("incompitible data type");
}
template <typename Op, typename T, typename U>
bool
RelationalImpl(const T& t, const U& u, StringTag, StringTag) {
return Op{}(t, u);
}
template <typename Op>
struct Relational {
template <typename T, typename U>
bool
operator()(const T& t, const U& u) const {
return RelationalImpl<Op, T, U>(t, u, typename TagDispatchTrait<T>::Tag{}, typename TagDispatchTrait<U>::Tag{});
}
template <typename... T>
bool
operator()(const T&...) const {
PanicInfo("incompatible operands");
}
};
template <OpType op>
struct MatchOp {
template <typename T, typename U>
bool
operator()(const T& t, const U& u) {
return Match(t, u, op);
}
};
} // namespace milvus::query

View File

@ -12,6 +12,7 @@
#pragma once
#include <memory>
#include <string>
#include <index/ScalarIndexSort.h>
#include "common/FieldMeta.h"
@ -45,6 +46,8 @@ generate_scalar_index(SpanBase data, DataType data_type) {
return generate_scalar_index(Span<float>(data));
case DataType::DOUBLE:
return generate_scalar_index(Span<double>(data));
case DataType::VARCHAR:
return generate_scalar_index(Span<std::string>(data));
default:
PanicInfo("unsupported type");
}

View File

@ -76,7 +76,7 @@ BinarySearchBruteForceFast(MetricType metric_type,
const BitsetView& bitset) {
SubSearchResult sub_result(num_queries, topk, metric_type, round_decimal);
float* result_distances = sub_result.get_distances();
idx_t* result_ids = sub_result.get_ids();
idx_t* result_ids = sub_result.get_seg_offsets();
int64_t code_size = dim / 8;
const idx_t block_size = size_per_chunk;
@ -101,10 +101,12 @@ FloatSearchBruteForce(const dataset::SearchDataset& dataset,
auto query_data = reinterpret_cast<const float*>(dataset.query_data);
auto chunk_data = reinterpret_cast<const float*>(chunk_data_raw);
if (metric_type == MetricType::METRIC_L2) {
faiss::float_maxheap_array_t buf{(size_t)num_queries, (size_t)topk, sub_qr.get_ids(), sub_qr.get_distances()};
faiss::float_maxheap_array_t buf{(size_t)num_queries, (size_t)topk, sub_qr.get_seg_offsets(),
sub_qr.get_distances()};
faiss::knn_L2sqr(query_data, chunk_data, dim, num_queries, size_per_chunk, &buf, nullptr, bitset);
} else {
faiss::float_minheap_array_t buf{(size_t)num_queries, (size_t)topk, sub_qr.get_ids(), sub_qr.get_distances()};
faiss::float_minheap_array_t buf{(size_t)num_queries, (size_t)topk, sub_qr.get_seg_offsets(),
sub_qr.get_distances()};
faiss::knn_inner_product(query_data, chunk_data, dim, num_queries, size_per_chunk, &buf, bitset);
}
sub_qr.round_values();

View File

@ -30,8 +30,8 @@ FloatSearch(const segcore::SegmentGrowingImpl& segment,
// step 1.1: get meta
// step 1.2: get which vector field to search
auto vecfield_offset = info.field_offset_;
auto& field = schema[vecfield_offset];
auto vecfield_id = info.field_id_;
auto& field = schema[vecfield_id];
AssertInfo(field.get_data_type() == DataType::VECTOR_FLOAT, "[FloatSearch]Field data type isn't VECTOR_FLOAT");
auto dim = field.get_dim();
@ -44,13 +44,13 @@ FloatSearch(const segcore::SegmentGrowingImpl& segment,
// std::vector<float> final_dis(total_count, std::numeric_limits<float>::max());
SubSearchResult final_qr(num_queries, topk, metric_type, round_decimal);
dataset::SearchDataset search_dataset{metric_type, num_queries, topk, round_decimal, dim, query_data};
auto vec_ptr = record.get_field_data<FloatVector>(vecfield_offset);
auto vec_ptr = record.get_field_data<FloatVector>(vecfield_id);
int current_chunk_id = 0;
if (indexing_record.is_in(vecfield_offset)) {
if (indexing_record.is_in(vecfield_id)) {
auto max_indexed_id = indexing_record.get_finished_ack();
const auto& field_indexing = indexing_record.get_vec_field_indexing(vecfield_offset);
const auto& field_indexing = indexing_record.get_vec_field_indexing(vecfield_id);
auto search_conf = field_indexing.get_search_params(topk);
AssertInfo(vec_ptr->get_size_per_chunk() == field_indexing.get_size_per_chunk(),
"[FloatSearch]Chunk size of vector not equal to chunk size of field index");
@ -63,7 +63,7 @@ FloatSearch(const segcore::SegmentGrowingImpl& segment,
auto sub_qr = SearchOnIndex(search_dataset, *indexing, search_conf, sub_view);
// convert chunk uid to segment uid
for (auto& x : sub_qr.mutable_ids()) {
for (auto& x : sub_qr.mutable_seg_offsets()) {
if (x != -1) {
x += chunk_id * size_per_chunk;
}
@ -89,7 +89,7 @@ FloatSearch(const segcore::SegmentGrowingImpl& segment,
auto sub_qr = FloatSearchBruteForce(search_dataset, chunk.data(), size_per_chunk, sub_view);
// convert chunk uid to segment uid
for (auto& x : sub_qr.mutable_ids()) {
for (auto& x : sub_qr.mutable_seg_offsets()) {
if (x != -1) {
x += chunk_id * vec_size_per_chunk;
}
@ -98,7 +98,7 @@ FloatSearch(const segcore::SegmentGrowingImpl& segment,
}
current_chunk_id = max_chunk;
results.distances_ = std::move(final_qr.mutable_distances());
results.ids_ = std::move(final_qr.mutable_ids());
results.seg_offsets_ = std::move(final_qr.mutable_seg_offsets());
results.topk_ = topk;
results.num_queries_ = num_queries;
@ -123,8 +123,8 @@ BinarySearch(const segcore::SegmentGrowingImpl& segment,
// step 2.1: get meta
// step 2.2: get which vector field to search
auto vecfield_offset = info.field_offset_;
auto& field = schema[vecfield_offset];
auto vecfield_id = info.field_id_;
auto& field = schema[vecfield_id];
AssertInfo(field.get_data_type() == DataType::VECTOR_BINARY, "[BinarySearch]Field data type isn't VECTOR_BINARY");
auto dim = field.get_dim();
@ -134,7 +134,7 @@ BinarySearch(const segcore::SegmentGrowingImpl& segment,
// step 3: small indexing search
query::dataset::SearchDataset search_dataset{metric_type, num_queries, topk, round_decimal, dim, query_data};
auto vec_ptr = record.get_field_data<BinaryVector>(vecfield_offset);
auto vec_ptr = record.get_field_data<BinaryVector>(vecfield_id);
auto max_indexed_id = 0;
// step 4: brute force search where small indexing is unavailable
@ -151,7 +151,7 @@ BinarySearch(const segcore::SegmentGrowingImpl& segment,
auto sub_result = BinarySearchBruteForce(search_dataset, chunk.data(), nsize, sub_view);
// convert chunk uid to segment uid
for (auto& x : sub_result.mutable_ids()) {
for (auto& x : sub_result.mutable_seg_offsets()) {
if (x != -1) {
x += chunk_id * vec_size_per_chunk;
}
@ -161,7 +161,7 @@ BinarySearch(const segcore::SegmentGrowingImpl& segment,
final_result.round_values();
results.distances_ = std::move(final_result.mutable_distances());
results.ids_ = std::move(final_result.mutable_ids());
results.seg_offsets_ = std::move(final_result.mutable_seg_offsets());
results.topk_ = topk;
results.num_queries_ = num_queries;
@ -178,7 +178,7 @@ SearchOnGrowing(const segcore::SegmentGrowingImpl& segment,
const BitsetView& bitset,
SearchResult& results) {
// TODO: add data_type to info
auto data_type = segment.get_schema()[info.field_offset_].get_data_type();
auto data_type = segment.get_schema()[info.field_id_].get_data_type();
AssertInfo(datatype_is_vector(data_type), "[SearchOnGrowing]Data type isn't vector type");
if (data_type == DataType::VECTOR_FLOAT) {
auto typed_data = reinterpret_cast<const float*>(query_data);

View File

@ -35,7 +35,7 @@ SearchOnIndex(const dataset::SearchDataset& search_dataset,
SubSearchResult sub_qr(num_queries, topK, metric_type, round_decimal);
std::copy_n(dis, num_queries * topK, sub_qr.get_distances());
std::copy_n(uids, num_queries * topK, sub_qr.get_ids());
std::copy_n(uids, num_queries * topK, sub_qr.get_seg_offsets());
sub_qr.round_values();
return sub_qr;
}

View File

@ -32,13 +32,13 @@ SearchOnSealed(const Schema& schema,
auto topk = search_info.topk_;
auto round_decimal = search_info.round_decimal_;
auto field_offset = search_info.field_offset_;
auto& field = schema[field_offset];
auto field_id = search_info.field_id_;
auto& field = schema[field_id];
// Assert(field.get_data_type() == DataType::VECTOR_FLOAT);
auto dim = field.get_dim();
AssertInfo(record.is_ready(field_offset), "[SearchOnSealed]Record isn't ready");
auto field_indexing = record.get_field_indexing(field_offset);
AssertInfo(record.is_ready(field_id), "[SearchOnSealed]Record isn't ready");
auto field_indexing = record.get_field_indexing(field_id);
AssertInfo(field_indexing->metric_type_ == search_info.metric_type_,
"Metric type of field index isn't the same with search info");
@ -67,12 +67,12 @@ SearchOnSealed(const Schema& schema,
distances[i] = round(distances[i] * multiplier) / multiplier;
}
}
result.ids_.resize(total_num);
result.seg_offsets_.resize(total_num);
result.distances_.resize(total_num);
result.num_queries_ = num_queries;
result.topk_ = topk;
std::copy_n(ids, total_num, result.ids_.data());
std::copy_n(ids, total_num, result.seg_offsets_.data());
std::copy_n(distances, total_num, result.distances_.data());
}
} // namespace milvus::query

View File

@ -27,7 +27,7 @@ SubSearchResult::merge_impl(const SubSearchResult& right) {
for (int64_t qn = 0; qn < num_queries_; ++qn) {
auto offset = qn * topk_;
int64_t* __restrict__ left_ids = this->get_ids() + offset;
int64_t* __restrict__ left_ids = this->get_seg_offsets() + offset;
float* __restrict__ left_distances = this->get_distances() + offset;
auto right_ids = right.get_ids() + offset;

View File

@ -23,7 +23,7 @@ class SubSearchResult {
: metric_type_(metric_type),
num_queries_(num_queries),
topk_(topk),
ids_(num_queries * topk, -1),
seg_offsets_(num_queries * topk, -1),
distances_(num_queries * topk, init_value(metric_type)),
round_decimal_(round_decimal) {
}
@ -57,12 +57,12 @@ class SubSearchResult {
const int64_t*
get_ids() const {
return ids_.data();
return seg_offsets_.data();
}
int64_t*
get_ids() {
return ids_.data();
get_seg_offsets() {
return seg_offsets_.data();
}
const float*
@ -76,8 +76,8 @@ class SubSearchResult {
}
auto&
mutable_ids() {
return ids_;
mutable_seg_offsets() {
return seg_offsets_;
}
auto&
@ -104,7 +104,7 @@ class SubSearchResult {
int64_t topk_;
int64_t round_decimal_;
MetricType metric_type_;
std::vector<int64_t> ids_;
std::vector<int64_t> seg_offsets_;
std::vector<float> distances_;
};

View File

@ -0,0 +1,38 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include <string>
#include "query/Expr.h"
#include "common/Utils.h"
namespace milvus::query {
template <typename T, typename U>
inline bool
Match(const T& x, const U& y, OpType op) {
PanicInfo("not supported");
}
template <>
inline bool
Match<std::string>(const std::string& str, const std::string& val, OpType op) {
switch (op) {
case OpType::PrefixMatch:
return PrefixMatch(str, val);
case OpType::PostfixMatch:
return PostfixMatch(str, val);
default:
PanicInfo("not supported");
}
}
} // namespace milvus::query

View File

@ -62,11 +62,11 @@ class ExecExprVisitor : public ExprVisitor {
public:
template <typename T, typename IndexFunc, typename ElementFunc>
auto
ExecRangeVisitorImpl(FieldOffset field_offset, IndexFunc func, ElementFunc element_func) -> BitsetType;
ExecRangeVisitorImpl(FieldId field_id, IndexFunc func, ElementFunc element_func) -> BitsetType;
template <typename T, typename ElementFunc>
auto
ExecDataRangeVisitorImpl(FieldOffset field_offset, ElementFunc element_func) -> BitsetType;
ExecDataRangeVisitorImpl(FieldId field_id, ElementFunc element_func) -> BitsetType;
template <typename T>
auto

View File

@ -18,6 +18,8 @@
#include "query/ExprImpl.h"
#include "query/generated/ExecExprVisitor.h"
#include "segcore/SegmentGrowingImpl.h"
#include "query/Utils.h"
#include "query/Relational.h"
namespace milvus::query {
// THIS CONTAINS EXTRA BODY FOR VISITOR
@ -42,7 +44,7 @@ class ExecExprVisitor : ExprVisitor {
public:
template <typename T, typename IndexFunc, typename ElementFunc>
auto
ExecRangeVisitorImpl(FieldOffset field_offset, IndexFunc func, ElementFunc element_func) -> BitsetType;
ExecRangeVisitorImpl(FieldId field_id, IndexFunc func, ElementFunc element_func) -> BitsetType;
template <typename T>
auto
@ -144,18 +146,17 @@ Assemble(const std::deque<BitsetType>& srcs) -> BitsetType {
template <typename T, typename IndexFunc, typename ElementFunc>
auto
ExecExprVisitor::ExecRangeVisitorImpl(FieldOffset field_offset, IndexFunc index_func, ElementFunc element_func)
-> BitsetType {
ExecExprVisitor::ExecRangeVisitorImpl(FieldId field_id, IndexFunc index_func, ElementFunc element_func) -> BitsetType {
auto& schema = segment_.get_schema();
auto& field_meta = schema[field_offset];
auto indexing_barrier = segment_.num_chunk_index(field_offset);
auto& field_meta = schema[field_id];
auto indexing_barrier = segment_.num_chunk_index(field_id);
auto size_per_chunk = segment_.size_per_chunk();
auto num_chunk = upper_div(row_count_, size_per_chunk);
std::deque<BitsetType> results;
using Index = scalar::ScalarIndex<T>;
for (auto chunk_id = 0; chunk_id < indexing_barrier; ++chunk_id) {
const Index& indexing = segment_.chunk_scalar_index<T>(field_offset, chunk_id);
const Index& indexing = segment_.chunk_scalar_index<T>(field_id, chunk_id);
// NOTE: knowhere is not const-ready
// This is a dirty workaround
auto data = index_func(const_cast<Index*>(&indexing));
@ -165,7 +166,7 @@ ExecExprVisitor::ExecRangeVisitorImpl(FieldOffset field_offset, IndexFunc index_
for (auto chunk_id = indexing_barrier; chunk_id < num_chunk; ++chunk_id) {
auto this_size = chunk_id == num_chunk - 1 ? row_count_ - chunk_id * size_per_chunk : size_per_chunk;
BitsetType result(this_size);
auto chunk = segment_.chunk_data<T>(field_offset, chunk_id);
auto chunk = segment_.chunk_data<T>(field_id, chunk_id);
const T* data = chunk.data();
for (int index = 0; index < this_size; ++index) {
result[index] = element_func(data[index]);
@ -180,9 +181,9 @@ ExecExprVisitor::ExecRangeVisitorImpl(FieldOffset field_offset, IndexFunc index_
template <typename T, typename ElementFunc>
auto
ExecExprVisitor::ExecDataRangeVisitorImpl(FieldOffset field_offset, ElementFunc element_func) -> BitsetType {
ExecExprVisitor::ExecDataRangeVisitorImpl(FieldId field_id, ElementFunc element_func) -> BitsetType {
auto& schema = segment_.get_schema();
auto& field_meta = schema[field_offset];
auto& field_meta = schema[field_id];
auto size_per_chunk = segment_.size_per_chunk();
auto num_chunk = upper_div(row_count_, size_per_chunk);
std::deque<BitsetType> results;
@ -190,7 +191,7 @@ ExecExprVisitor::ExecDataRangeVisitorImpl(FieldOffset field_offset, ElementFunc
for (auto chunk_id = 0; chunk_id < num_chunk; ++chunk_id) {
auto this_size = chunk_id == num_chunk - 1 ? row_count_ - chunk_id * size_per_chunk : size_per_chunk;
BitsetType result(this_size);
auto chunk = segment_.chunk_data<T>(field_offset, chunk_id);
auto chunk = segment_.chunk_data<T>(field_id, chunk_id);
const T* data = chunk.data();
for (int index = 0; index < this_size; ++index) {
result[index] = element_func(data[index]);
@ -217,33 +218,44 @@ ExecExprVisitor::ExecUnaryRangeVisitorDispatcher(UnaryRangeExpr& expr_raw) -> Bi
case OpType::Equal: {
auto index_func = [val](Index* index) { return index->In(1, &val); };
auto elem_func = [val](T x) { return (x == val); };
return ExecRangeVisitorImpl<T>(expr.field_offset_, index_func, elem_func);
return ExecRangeVisitorImpl<T>(expr.field_id_, index_func, elem_func);
}
case OpType::NotEqual: {
auto index_func = [val](Index* index) { return index->NotIn(1, &val); };
auto elem_func = [val](T x) { return (x != val); };
return ExecRangeVisitorImpl<T>(expr.field_offset_, index_func, elem_func);
return ExecRangeVisitorImpl<T>(expr.field_id_, index_func, elem_func);
}
case OpType::GreaterEqual: {
auto index_func = [val](Index* index) { return index->Range(val, Operator::GE); };
auto elem_func = [val](T x) { return (x >= val); };
return ExecRangeVisitorImpl<T>(expr.field_offset_, index_func, elem_func);
return ExecRangeVisitorImpl<T>(expr.field_id_, index_func, elem_func);
}
case OpType::GreaterThan: {
auto index_func = [val](Index* index) { return index->Range(val, Operator::GT); };
auto elem_func = [val](T x) { return (x > val); };
return ExecRangeVisitorImpl<T>(expr.field_offset_, index_func, elem_func);
return ExecRangeVisitorImpl<T>(expr.field_id_, index_func, elem_func);
}
case OpType::LessEqual: {
auto index_func = [val](Index* index) { return index->Range(val, Operator::LE); };
auto elem_func = [val](T x) { return (x <= val); };
return ExecRangeVisitorImpl<T>(expr.field_offset_, index_func, elem_func);
return ExecRangeVisitorImpl<T>(expr.field_id_, index_func, elem_func);
}
case OpType::LessThan: {
auto index_func = [val](Index* index) { return index->Range(val, Operator::LT); };
auto elem_func = [val](T x) { return (x < val); };
return ExecRangeVisitorImpl<T>(expr.field_offset_, index_func, elem_func);
return ExecRangeVisitorImpl<T>(expr.field_id_, index_func, elem_func);
}
case OpType::PrefixMatch: {
auto index_func = [val](Index* index) {
auto dataset = std::make_unique<knowhere::Dataset>();
dataset->Set(scalar::OPERATOR_TYPE, Operator::PrefixMatchOp);
dataset->Set(scalar::PREFIX_VALUE, val);
return index->Query(std::move(dataset));
};
auto elem_func = [val, op](T x) { return Match(x, val, op); };
return ExecRangeVisitorImpl<T>(expr.field_id_, index_func, elem_func);
}
// TODO: PostfixMatch
default: {
PanicInfo("unsupported range node");
}
@ -268,25 +280,25 @@ ExecExprVisitor::ExecBinaryArithOpEvalRangeVisitorDispatcher(BinaryArithOpEvalRa
switch (arith_op) {
case ArithOpType::Add: {
auto elem_func = [val, right_operand](T x) { return ((x + right_operand) == val); };
return ExecDataRangeVisitorImpl<T>(expr.field_offset_, elem_func);
return ExecDataRangeVisitorImpl<T>(expr.field_id_, elem_func);
}
case ArithOpType::Sub: {
auto elem_func = [val, right_operand](T x) { return ((x - right_operand) == val); };
return ExecDataRangeVisitorImpl<T>(expr.field_offset_, elem_func);
return ExecDataRangeVisitorImpl<T>(expr.field_id_, elem_func);
}
case ArithOpType::Mul: {
auto elem_func = [val, right_operand](T x) { return ((x * right_operand) == val); };
return ExecDataRangeVisitorImpl<T>(expr.field_offset_, elem_func);
return ExecDataRangeVisitorImpl<T>(expr.field_id_, elem_func);
}
case ArithOpType::Div: {
auto elem_func = [val, right_operand](T x) { return ((x / right_operand) == val); };
return ExecDataRangeVisitorImpl<T>(expr.field_offset_, elem_func);
return ExecDataRangeVisitorImpl<T>(expr.field_id_, elem_func);
}
case ArithOpType::Mod: {
auto elem_func = [val, right_operand](T x) {
return (static_cast<T>(fmod(x, right_operand)) == val);
};
return ExecDataRangeVisitorImpl<T>(expr.field_offset_, elem_func);
return ExecDataRangeVisitorImpl<T>(expr.field_id_, elem_func);
}
default: {
PanicInfo("unsupported arithmetic operation");
@ -297,25 +309,25 @@ ExecExprVisitor::ExecBinaryArithOpEvalRangeVisitorDispatcher(BinaryArithOpEvalRa
switch (arith_op) {
case ArithOpType::Add: {
auto elem_func = [val, right_operand](T x) { return ((x + right_operand) != val); };
return ExecDataRangeVisitorImpl<T>(expr.field_offset_, elem_func);
return ExecDataRangeVisitorImpl<T>(expr.field_id_, elem_func);
}
case ArithOpType::Sub: {
auto elem_func = [val, right_operand](T x) { return ((x - right_operand) != val); };
return ExecDataRangeVisitorImpl<T>(expr.field_offset_, elem_func);
return ExecDataRangeVisitorImpl<T>(expr.field_id_, elem_func);
}
case ArithOpType::Mul: {
auto elem_func = [val, right_operand](T x) { return ((x * right_operand) != val); };
return ExecDataRangeVisitorImpl<T>(expr.field_offset_, elem_func);
return ExecDataRangeVisitorImpl<T>(expr.field_id_, elem_func);
}
case ArithOpType::Div: {
auto elem_func = [val, right_operand](T x) { return ((x / right_operand) != val); };
return ExecDataRangeVisitorImpl<T>(expr.field_offset_, elem_func);
return ExecDataRangeVisitorImpl<T>(expr.field_id_, elem_func);
}
case ArithOpType::Mod: {
auto elem_func = [val, right_operand](T x) {
return (static_cast<T>(fmod(x, right_operand)) != val);
};
return ExecDataRangeVisitorImpl<T>(expr.field_offset_, elem_func);
return ExecDataRangeVisitorImpl<T>(expr.field_id_, elem_func);
}
default: {
PanicInfo("unsupported arithmetic operation");
@ -348,23 +360,23 @@ ExecExprVisitor::ExecBinaryRangeVisitorDispatcher(BinaryRangeExpr& expr_raw) ->
auto index_func = [=](Index* index) { return index->Range(val1, lower_inclusive, val2, upper_inclusive); };
if (lower_inclusive && upper_inclusive) {
auto elem_func = [val1, val2](T x) { return (val1 <= x && x <= val2); };
return ExecRangeVisitorImpl<T>(expr.field_offset_, index_func, elem_func);
return ExecRangeVisitorImpl<T>(expr.field_id_, index_func, elem_func);
} else if (lower_inclusive && !upper_inclusive) {
auto elem_func = [val1, val2](T x) { return (val1 <= x && x < val2); };
return ExecRangeVisitorImpl<T>(expr.field_offset_, index_func, elem_func);
return ExecRangeVisitorImpl<T>(expr.field_id_, index_func, elem_func);
} else if (!lower_inclusive && upper_inclusive) {
auto elem_func = [val1, val2](T x) { return (val1 < x && x <= val2); };
return ExecRangeVisitorImpl<T>(expr.field_offset_, index_func, elem_func);
return ExecRangeVisitorImpl<T>(expr.field_id_, index_func, elem_func);
} else {
auto elem_func = [val1, val2](T x) { return (val1 < x && x < val2); };
return ExecRangeVisitorImpl<T>(expr.field_offset_, index_func, elem_func);
return ExecRangeVisitorImpl<T>(expr.field_id_, index_func, elem_func);
}
}
#pragma clang diagnostic pop
void
ExecExprVisitor::visit(UnaryRangeExpr& expr) {
auto& field_meta = segment_.get_schema()[expr.field_offset_];
auto& field_meta = segment_.get_schema()[expr.field_id_];
AssertInfo(expr.data_type_ == field_meta.get_data_type(),
"[ExecExprVisitor]DataType of expr isn't field_meta data type");
BitsetType res;
@ -397,6 +409,10 @@ ExecExprVisitor::visit(UnaryRangeExpr& expr) {
res = ExecUnaryRangeVisitorDispatcher<double>(expr);
break;
}
case DataType::VARCHAR: {
res = ExecUnaryRangeVisitorDispatcher<std::string>(expr);
break;
}
default:
PanicInfo("unsupported");
}
@ -406,7 +422,7 @@ ExecExprVisitor::visit(UnaryRangeExpr& expr) {
void
ExecExprVisitor::visit(BinaryArithOpEvalRangeExpr& expr) {
auto& field_meta = segment_.get_schema()[expr.field_offset_];
auto& field_meta = segment_.get_schema()[expr.field_id_];
AssertInfo(expr.data_type_ == field_meta.get_data_type(),
"[ExecExprVisitor]DataType of expr isn't field_meta data type");
BitsetType res;
@ -444,7 +460,7 @@ ExecExprVisitor::visit(BinaryArithOpEvalRangeExpr& expr) {
void
ExecExprVisitor::visit(BinaryRangeExpr& expr) {
auto& field_meta = segment_.get_schema()[expr.field_offset_];
auto& field_meta = segment_.get_schema()[expr.field_id_];
AssertInfo(expr.data_type_ == field_meta.get_data_type(),
"[ExecExprVisitor]DataType of expr isn't field_meta data type");
BitsetType res;
@ -477,6 +493,10 @@ ExecExprVisitor::visit(BinaryRangeExpr& expr) {
res = ExecBinaryRangeVisitorDispatcher<double>(expr);
break;
}
case DataType::VARCHAR: {
res = ExecBinaryRangeVisitorDispatcher<std::string>(expr);
break;
}
default:
PanicInfo("unsupported");
}
@ -501,52 +521,56 @@ struct relational {
template <typename Op>
auto
ExecExprVisitor::ExecCompareExprDispatcher(CompareExpr& expr, Op op) -> BitsetType {
using number = boost::variant<bool, int8_t, int16_t, int32_t, int64_t, float, double>;
using number = boost::variant<bool, int8_t, int16_t, int32_t, int64_t, float, double, std::string>;
auto size_per_chunk = segment_.size_per_chunk();
auto num_chunk = upper_div(row_count_, size_per_chunk);
std::deque<BitsetType> bitsets;
for (int64_t chunk_id = 0; chunk_id < num_chunk; ++chunk_id) {
auto size = chunk_id == num_chunk - 1 ? row_count_ - chunk_id * size_per_chunk : size_per_chunk;
auto getChunkData = [&, chunk_id](DataType type, FieldOffset offset) -> std::function<const number(int)> {
auto getChunkData = [&, chunk_id](DataType type, FieldId field_id) -> std::function<const number(int)> {
switch (type) {
case DataType::BOOL: {
auto chunk_data = segment_.chunk_data<bool>(offset, chunk_id).data();
auto chunk_data = segment_.chunk_data<bool>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
}
case DataType::INT8: {
auto chunk_data = segment_.chunk_data<int8_t>(offset, chunk_id).data();
auto chunk_data = segment_.chunk_data<int8_t>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
}
case DataType::INT16: {
auto chunk_data = segment_.chunk_data<int16_t>(offset, chunk_id).data();
auto chunk_data = segment_.chunk_data<int16_t>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
}
case DataType::INT32: {
auto chunk_data = segment_.chunk_data<int32_t>(offset, chunk_id).data();
auto chunk_data = segment_.chunk_data<int32_t>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
}
case DataType::INT64: {
auto chunk_data = segment_.chunk_data<int64_t>(offset, chunk_id).data();
auto chunk_data = segment_.chunk_data<int64_t>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
}
case DataType::FLOAT: {
auto chunk_data = segment_.chunk_data<float>(offset, chunk_id).data();
auto chunk_data = segment_.chunk_data<float>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
}
case DataType::DOUBLE: {
auto chunk_data = segment_.chunk_data<double>(offset, chunk_id).data();
auto chunk_data = segment_.chunk_data<double>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
}
case DataType::VARCHAR: {
auto chunk_data = segment_.chunk_data<std::string>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
}
default:
PanicInfo("unsupported datatype");
}
};
auto left = getChunkData(expr.left_data_type_, expr.left_field_offset_);
auto right = getChunkData(expr.right_data_type_, expr.right_field_offset_);
auto left = getChunkData(expr.left_data_type_, expr.left_field_id_);
auto right = getChunkData(expr.right_data_type_, expr.right_field_id_);
BitsetType bitset(size);
for (int i = 0; i < size; ++i) {
bool is_in = boost::apply_visitor(relational<decltype(op)>{}, left(i), right(i));
bool is_in = boost::apply_visitor(Relational<decltype(op)>{}, left(i), right(i));
bitset[i] = is_in;
}
bitsets.emplace_back(std::move(bitset));
@ -559,8 +583,8 @@ ExecExprVisitor::ExecCompareExprDispatcher(CompareExpr& expr, Op op) -> BitsetTy
void
ExecExprVisitor::visit(CompareExpr& expr) {
auto& schema = segment_.get_schema();
auto& left_field_meta = schema[expr.left_field_offset_];
auto& right_field_meta = schema[expr.right_field_offset_];
auto& left_field_meta = schema[expr.left_field_id_];
auto& right_field_meta = schema[expr.right_field_id_];
AssertInfo(expr.left_data_type_ == left_field_meta.get_data_type(),
"[ExecExprVisitor]Left data type not equal to left field mata type");
AssertInfo(expr.right_data_type_ == right_field_meta.get_data_type(),
@ -592,6 +616,12 @@ ExecExprVisitor::visit(CompareExpr& expr) {
res = ExecCompareExprDispatcher(expr, std::less<>{});
break;
}
case OpType::PrefixMatch: {
res = ExecCompareExprDispatcher(expr, MatchOp<OpType::PrefixMatch>{});
break;
}
// case OpType::PostfixMatch: {
// }
default: {
PanicInfo("unsupported optype");
}
@ -605,21 +635,37 @@ auto
ExecExprVisitor::ExecTermVisitorImpl(TermExpr& expr_raw) -> BitsetType {
auto& expr = static_cast<TermExprImpl<T>&>(expr_raw);
auto& schema = segment_.get_schema();
auto primary_offset = schema.get_primary_key_offset();
auto field_offset = expr_raw.field_offset_;
auto& field_meta = schema[field_offset];
auto primary_filed_id = schema.get_primary_field_id();
auto field_id = expr_raw.field_id_;
auto& field_meta = schema[field_id];
bool use_pk_index = false;
if (primary_offset.has_value()) {
use_pk_index = primary_offset.value() == field_offset && field_meta.get_data_type() == engine::DataType::INT64;
if (primary_filed_id.has_value()) {
use_pk_index = primary_filed_id.value() == field_id && IsPrimaryKeyDataType(field_meta.get_data_type());
}
if (use_pk_index) {
auto id_array = std::make_unique<IdArray>();
auto dst_ids = id_array->mutable_int_id();
for (const auto& id : expr.terms_) {
dst_ids->add_data(id);
switch (field_meta.get_data_type()) {
case DataType::INT64: {
auto dst_ids = id_array->mutable_int_id();
for (const auto& id : expr.terms_) {
dst_ids->add_data((int64_t&)id);
}
break;
}
case DataType::VARCHAR: {
auto dst_ids = id_array->mutable_str_id();
for (const auto& id : expr.terms_) {
dst_ids->add_data((std::string&)id);
}
break;
}
default: {
PanicInfo("unsupported type");
}
}
auto [uids, seg_offsets] = segment_.search_ids(*id_array, timestamp_);
BitsetType bitset(row_count_);
for (const auto& offset : seg_offsets) {
@ -636,7 +682,7 @@ ExecExprVisitor::ExecTermVisitorImpl(TermExpr& expr_raw) -> BitsetType {
auto num_chunk = upper_div(row_count_, size_per_chunk);
std::unordered_set<T> term_set(expr.terms_.begin(), expr.terms_.end());
for (int64_t chunk_id = 0; chunk_id < num_chunk; ++chunk_id) {
Span<T> chunk = segment_.chunk_data<T>(field_offset, chunk_id);
Span<T> chunk = segment_.chunk_data<T>(field_id, chunk_id);
auto chunk_data = chunk.data();
auto size = (chunk_id == num_chunk - 1) ? row_count_ - chunk_id * size_per_chunk : size_per_chunk;
BitsetType bitset(size);
@ -650,9 +696,34 @@ ExecExprVisitor::ExecTermVisitorImpl(TermExpr& expr_raw) -> BitsetType {
return final_result;
}
// TODO: refactor this to use `scalar::ScalarIndex::In`.
// made a test to compare the performance.
// vector<bool> don't match the template.
// boost::container::vector<bool> match.
template <>
auto
ExecExprVisitor::ExecTermVisitorImpl<std::string>(TermExpr& expr_raw) -> BitsetType {
using T = std::string;
auto& expr = static_cast<TermExprImpl<T>&>(expr_raw);
using Index = scalar::ScalarIndex<T>;
using Operator = scalar::OperatorType;
const auto& terms = expr.terms_;
auto n = terms.size();
std::unordered_set<T> term_set(expr.terms_.begin(), expr.terms_.end());
auto index_func = [&terms, n](Index* index) { return index->In(n, terms.data()); };
auto elem_func = [&terms, &term_set](T x) {
//// terms has already been sorted.
// return std::binary_search(terms.begin(), terms.end(), x);
return term_set.find(x) != term_set.end();
};
return ExecRangeVisitorImpl<T>(expr.field_id_, index_func, elem_func);
}
void
ExecExprVisitor::visit(TermExpr& expr) {
auto& field_meta = segment_.get_schema()[expr.field_offset_];
auto& field_meta = segment_.get_schema()[expr.field_id_];
AssertInfo(expr.data_type_ == field_meta.get_data_type(),
"[ExecExprVisitor]DataType of expr isn't field_meta data type ");
BitsetType res;
@ -685,6 +756,10 @@ ExecExprVisitor::visit(TermExpr& expr) {
res = ExecTermVisitorImpl<double>(expr);
break;
}
case DataType::VARCHAR: {
res = ExecTermVisitorImpl<std::string>(expr);
break;
}
default:
PanicInfo("unsupported");
}

View File

@ -61,7 +61,7 @@ empty_search_result(int64_t num_queries, int64_t topk, int64_t round_decimal, Me
SubSearchResult result(num_queries, topk, metric_type, round_decimal);
final_result.num_queries_ = num_queries;
final_result.topk_ = topk;
final_result.ids_ = std::move(result.mutable_ids());
final_result.seg_offsets_ = std::move(result.mutable_seg_offsets());
final_result.distances_ = std::move(result.mutable_distances());
return final_result;
}

View File

@ -40,28 +40,28 @@ ExtractInfoExprVisitor::visit(LogicalBinaryExpr& expr) {
void
ExtractInfoExprVisitor::visit(TermExpr& expr) {
plan_info_.add_involved_field(expr.field_offset_);
plan_info_.add_involved_field(expr.field_id_);
}
void
ExtractInfoExprVisitor::visit(UnaryRangeExpr& expr) {
plan_info_.add_involved_field(expr.field_offset_);
plan_info_.add_involved_field(expr.field_id_);
}
void
ExtractInfoExprVisitor::visit(BinaryRangeExpr& expr) {
plan_info_.add_involved_field(expr.field_offset_);
plan_info_.add_involved_field(expr.field_id_);
}
void
ExtractInfoExprVisitor::visit(CompareExpr& expr) {
plan_info_.add_involved_field(expr.left_field_offset_);
plan_info_.add_involved_field(expr.right_field_offset_);
plan_info_.add_involved_field(expr.left_field_id_);
plan_info_.add_involved_field(expr.right_field_id_);
}
void
ExtractInfoExprVisitor::visit(BinaryArithOpEvalRangeExpr& expr) {
plan_info_.add_involved_field(expr.field_offset_);
plan_info_.add_involved_field(expr.field_id_);
}
} // namespace milvus::query

View File

@ -30,7 +30,7 @@ class ExtractInfoPlanNodeVisitor : PlanNodeVisitor {
void
ExtractInfoPlanNodeVisitor::visit(FloatVectorANNS& node) {
plan_info_.add_involved_field(node.search_info_.field_offset_);
plan_info_.add_involved_field(node.search_info_.field_id_);
if (node.predicate_.has_value()) {
ExtractInfoExprVisitor expr_visitor(plan_info_);
node.predicate_.value()->accept(expr_visitor);
@ -39,7 +39,7 @@ ExtractInfoPlanNodeVisitor::visit(FloatVectorANNS& node) {
void
ExtractInfoPlanNodeVisitor::visit(BinaryVectorANNS& node) {
plan_info_.add_involved_field(node.search_info_.field_offset_);
plan_info_.add_involved_field(node.search_info_.field_id_);
if (node.predicate_.has_value()) {
ExtractInfoExprVisitor expr_visitor(plan_info_);
node.predicate_.value()->accept(expr_visitor);

View File

@ -132,7 +132,7 @@ ShowExprVisitor::visit(TermExpr& expr) {
}();
Json res{{"expr_type", "Term"},
{"field_offset", expr.field_offset_.get()},
{"field_id", expr.field_id_.get()},
{"data_type", datatype_name(expr.data_type_)},
{"terms", std::move(terms)}};
@ -147,7 +147,7 @@ UnaryRangeExtract(const UnaryRangeExpr& expr_raw) {
auto expr = dynamic_cast<const UnaryRangeExprImpl<T>*>(&expr_raw);
AssertInfo(expr, "[ShowExprVisitor]UnaryRangeExpr cast to UnaryRangeExprImpl failed");
Json res{{"expr_type", "UnaryRange"},
{"field_offset", expr->field_offset_.get()},
{"field_id", expr->field_id_.get()},
{"data_type", datatype_name(expr->data_type_)},
{"op", OpType_Name(static_cast<OpType>(expr->op_type_))},
{"value", expr->value_}};
@ -193,7 +193,7 @@ BinaryRangeExtract(const BinaryRangeExpr& expr_raw) {
auto expr = dynamic_cast<const BinaryRangeExprImpl<T>*>(&expr_raw);
AssertInfo(expr, "[ShowExprVisitor]BinaryRangeExpr cast to BinaryRangeExprImpl failed");
Json res{{"expr_type", "BinaryRange"},
{"field_offset", expr->field_offset_.get()},
{"field_id", expr->field_id_.get()},
{"data_type", datatype_name(expr->data_type_)},
{"lower_inclusive", expr->lower_inclusive_},
{"upper_inclusive", expr->upper_inclusive_},
@ -240,9 +240,9 @@ ShowExprVisitor::visit(CompareExpr& expr) {
AssertInfo(!json_opt_.has_value(), "[ShowExprVisitor]Ret json already has value before visit");
Json res{{"expr_type", "Compare"},
{"left_field_offset", expr.left_field_offset_.get()},
{"left_field_id", expr.left_field_id_.get()},
{"left_data_type", datatype_name(expr.left_data_type_)},
{"right_field_offset", expr.right_field_offset_.get()},
{"right_field_id", expr.right_field_id_.get()},
{"right_data_type", datatype_name(expr.right_data_type_)},
{"op", OpType_Name(static_cast<OpType>(expr.op_type_))}};
json_opt_ = res;
@ -260,7 +260,7 @@ BinaryArithOpEvalRangeExtract(const BinaryArithOpEvalRangeExpr& expr_raw) {
AssertInfo(expr, "[ShowExprVisitor]BinaryArithOpEvalRangeExpr cast to BinaryArithOpEvalRangeExprImpl failed");
Json res{{"expr_type", "BinaryArithOpEvalRange"},
{"field_offset", expr->field_offset_.get()},
{"field_offset", expr->field_id_.get()},
{"data_type", datatype_name(expr->data_type_)},
{"arith_op", ArithOpType_Name(static_cast<ArithOpType>(expr->arith_op_))},
{"right_operand", expr->right_operand_},

View File

@ -55,7 +55,7 @@ ShowPlanNodeVisitor::visit(FloatVectorANNS& node) {
Json json_body{
{"node_type", "FloatVectorANNS"}, //
{"metric_type", MetricTypeToName(info.metric_type_)}, //
{"field_offset_", info.field_offset_.get()}, //
{"field_id_", info.field_id_.get()}, //
{"topk", info.topk_}, //
{"search_params", info.search_params_}, //
{"placeholder_tag", node.placeholder_tag_}, //
@ -77,7 +77,7 @@ ShowPlanNodeVisitor::visit(BinaryVectorANNS& node) {
Json json_body{
{"node_type", "BinaryVectorANNS"}, //
{"metric_type", MetricTypeToName(info.metric_type_)}, //
{"field_offset_", info.field_offset_.get()}, //
{"field_id_", info.field_id_.get()}, //
{"topk", info.topk_}, //
{"search_params", info.search_params_}, //
{"placeholder_tag", node.placeholder_tag_}, //

View File

@ -32,7 +32,8 @@ set(SEGCORE_FILES
segcore_init_c.cpp
ScalarIndex.cpp
TimestampIndex.cpp
)
Utils.cpp
ConcurrentVector.cpp)
add_library(milvus_segcore SHARED
${SEGCORE_FILES}
)

View File

@ -0,0 +1,122 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include "segcore/ConcurrentVector.h"
namespace milvus::segcore {
void
VectorBase::set_data_raw(ssize_t element_offset,
ssize_t element_count,
const DataArray* data,
const FieldMeta& field_meta) {
if (field_meta.is_vector()) {
if (field_meta.get_data_type() == DataType::VECTOR_FLOAT) {
return set_data_raw(element_offset, data->vectors().float_vector().data().data(), element_count);
} else if (field_meta.get_data_type() == DataType::VECTOR_BINARY) {
return set_data_raw(element_offset, data->vectors().binary_vector().data(), element_count);
} else {
PanicInfo("unsupported");
}
}
switch (field_meta.get_data_type()) {
case DataType::BOOL: {
return set_data_raw(element_offset, data->scalars().bool_data().data().data(), element_count);
}
case DataType::INT8: {
auto src_data = data->scalars().int_data().data();
std::vector<int8_t> data_raw(src_data.size());
std::copy_n(src_data.data(), src_data.size(), data_raw.data());
return set_data_raw(element_offset, data_raw.data(), element_count);
}
case DataType::INT16: {
auto src_data = data->scalars().int_data().data();
std::vector<int16_t> data_raw(src_data.size());
std::copy_n(src_data.data(), src_data.size(), data_raw.data());
return set_data_raw(element_offset, data_raw.data(), element_count);
}
case DataType::INT32: {
return set_data_raw(element_offset, data->scalars().int_data().data().data(), element_count);
}
case DataType::INT64: {
return set_data_raw(element_offset, data->scalars().long_data().data().data(), element_count);
}
case DataType::FLOAT: {
return set_data_raw(element_offset, data->scalars().float_data().data().data(), element_count);
}
case DataType::DOUBLE: {
return set_data_raw(element_offset, data->scalars().double_data().data().data(), element_count);
}
case DataType::VARCHAR: {
auto begin = data->scalars().string_data().data().begin();
auto end = data->scalars().string_data().data().end();
std::vector<std::string> data_raw(begin, end);
return set_data_raw(element_offset, data_raw.data(), element_count);
}
default: {
PanicInfo("unsupported");
}
}
}
void
VectorBase::fill_chunk_data(ssize_t element_count, const DataArray* data, const FieldMeta& field_meta) {
if (field_meta.is_vector()) {
if (field_meta.get_data_type() == DataType::VECTOR_FLOAT) {
return fill_chunk_data(data->vectors().float_vector().data().data(), element_count);
} else if (field_meta.get_data_type() == DataType::VECTOR_BINARY) {
return fill_chunk_data(data->vectors().binary_vector().data(), element_count);
} else {
PanicInfo("unsupported");
}
}
switch (field_meta.get_data_type()) {
case DataType::BOOL: {
return fill_chunk_data(data->scalars().bool_data().data().data(), element_count);
}
case DataType::INT8: {
auto src_data = data->scalars().int_data().data();
std::vector<int8_t> data_raw(src_data.size());
std::copy_n(src_data.data(), src_data.size(), data_raw.data());
return fill_chunk_data(data_raw.data(), element_count);
}
case DataType::INT16: {
auto src_data = data->scalars().int_data().data();
std::vector<int16_t> data_raw(src_data.size());
std::copy_n(src_data.data(), src_data.size(), data_raw.data());
return fill_chunk_data(data_raw.data(), element_count);
}
case DataType::INT32: {
return fill_chunk_data(data->scalars().int_data().data().data(), element_count);
}
case DataType::INT64: {
return fill_chunk_data(data->scalars().long_data().data().data(), element_count);
}
case DataType::FLOAT: {
return fill_chunk_data(data->scalars().float_data().data().data(), element_count);
}
case DataType::DOUBLE: {
return fill_chunk_data(data->scalars().double_data().data().data(), element_count);
}
case DataType::VARCHAR: {
auto begin = data->scalars().string_data().data().begin();
auto end = data->scalars().string_data().data().end();
std::vector<std::string> data_raw(begin, end);
return fill_chunk_data(data_raw.data(), element_count);
}
default: {
PanicInfo("unsupported");
}
}
}
} // namespace milvus::segcore

View File

@ -15,23 +15,22 @@
#include <cassert>
#include <deque>
#include <mutex>
#include <string>
#include <unordered_map>
#include <shared_mutex>
#include <utility>
#include <vector>
#include <boost/container/vector.hpp>
#include <tbb/concurrent_vector.h>
#include "common/Types.h"
#include "common/Span.h"
#include "exceptions/EasyAssert.h"
#include "utils/Utils.h"
#include "common/FieldMeta.h"
namespace milvus::segcore {
template <typename Type>
using FixedVector = boost::container::vector<Type>;
template <typename Type>
class ThreadSafeVector {
public:
@ -66,6 +65,13 @@ class ThreadSafeVector {
return size_;
}
void
clear() {
std::lock_guard lck(mutex_);
size_ = 0;
vec_.clear();
}
private:
std::atomic<int64_t> size_ = 0;
std::deque<Type> vec_;
@ -84,6 +90,15 @@ class VectorBase {
virtual void
set_data_raw(ssize_t element_offset, const void* source, ssize_t element_count) = 0;
void
set_data_raw(ssize_t element_offset, ssize_t element_count, const DataArray* data, const FieldMeta& field_meta);
virtual void
fill_chunk_data(const void* source, ssize_t element_count) = 0;
void
fill_chunk_data(ssize_t element_count, const DataArray* data, const FieldMeta& field_meta);
virtual SpanBase
get_span_base(int64_t chunk_id) const = 0;
@ -92,6 +107,15 @@ class VectorBase {
return size_per_chunk_;
}
virtual const void*
get_chunk_data(ssize_t chunk_index) const = 0;
virtual ssize_t
num_chunk() const = 0;
virtual bool
empty() = 0;
protected:
const int64_t size_per_chunk_;
};
@ -128,13 +152,13 @@ class ConcurrentVectorImpl : public VectorBase {
get_span(int64_t chunk_id) const {
auto& chunk = get_chunk(chunk_id);
if constexpr (is_scalar) {
return Span<TraitType>(chunk.data(), size_per_chunk_);
return Span<TraitType>(chunk.data(), chunk.size());
} else if constexpr (std::is_same_v<Type, int64_t> || std::is_same_v<Type, int>) {
// only for testing
PanicInfo("unimplemented");
} else {
static_assert(std::is_same_v<typename TraitType::embedded_type, Type>);
return Span<TraitType>(chunk.data(), size_per_chunk_, Dim);
return Span<TraitType>(chunk.data(), chunk.size(), Dim);
}
}
@ -143,17 +167,27 @@ class ConcurrentVectorImpl : public VectorBase {
return get_span(chunk_id);
}
void
fill_chunk_data(const void* source, ssize_t element_count) override {
if (element_count == 0) {
return;
}
AssertInfo(chunks_.size() == 0, "no empty concurrent vector");
chunks_.emplace_to_at_least(1, Dim * element_count);
set_data(0, static_cast<const Type*>(source), element_count);
}
void
set_data_raw(ssize_t element_offset, const void* source, ssize_t element_count) override {
if (element_count == 0) {
return;
}
this->grow_to_at_least(element_offset + element_count);
set_data(element_offset, static_cast<const Type*>(source), element_count);
}
void
set_data(ssize_t element_offset, const Type* source, ssize_t element_count) {
if (element_count == 0) {
return;
}
this->grow_to_at_least(element_offset + element_count);
auto chunk_id = element_offset / size_per_chunk_;
auto chunk_offset = element_offset % size_per_chunk_;
ssize_t source_offset = 0;
@ -190,6 +224,11 @@ class ConcurrentVectorImpl : public VectorBase {
return chunks_[chunk_index];
}
const void*
get_chunk_data(ssize_t chunk_index) const override {
return chunks_[chunk_index].data();
}
// just for fun, don't use it directly
const Type*
get_element(ssize_t element_index) const {
@ -207,10 +246,26 @@ class ConcurrentVectorImpl : public VectorBase {
}
ssize_t
num_chunk() const {
num_chunk() const override {
return chunks_.size();
}
bool
empty() override {
for (size_t i = 0; i < chunks_.size(); i++) {
if (get_chunk(i).size() > 0) {
return false;
}
}
return true;
}
void
clear() {
chunks_.clear();
}
private:
void
fill_chunk(
@ -234,7 +289,7 @@ class ConcurrentVectorImpl : public VectorBase {
template <typename Type>
class ConcurrentVector : public ConcurrentVectorImpl<Type, true> {
public:
static_assert(std::is_fundamental_v<Type>);
static_assert(IsScalar<Type> || std::is_same_v<Type, PkType>);
explicit ConcurrentVector(int64_t size_per_chunk)
: ConcurrentVectorImpl<Type, true>::ConcurrentVectorImpl(1, size_per_chunk) {
}

View File

@ -32,9 +32,7 @@ struct DeletedRecord {
};
static constexpr int64_t deprecated_size_per_chunk = 32 * 1024;
DeletedRecord()
: lru_(std::make_shared<TmpBitmap>()),
timestamps_(deprecated_size_per_chunk),
uids_(deprecated_size_per_chunk) {
: lru_(std::make_shared<TmpBitmap>()), timestamps_(deprecated_size_per_chunk), pks_(deprecated_size_per_chunk) {
lru_->bitmap_ptr = std::make_shared<BitsetType>();
}
@ -60,7 +58,7 @@ struct DeletedRecord {
std::atomic<int64_t> reserved = 0;
AckResponder ack_responder_;
ConcurrentVector<Timestamp> timestamps_;
ConcurrentVector<idx_t> uids_;
ConcurrentVector<PkType> pks_;
int64_t record_size_ = 0;
private:

View File

@ -11,7 +11,8 @@
#include <string>
#include <thread>
#include <index/ScalarIndexSort.h>
#include "index/ScalarIndexSort.h"
#include "index/StringIndexSort.h"
#include "common/SystemProperty.h"
#include "knowhere/index/vector_index/IndexIVF.h"
@ -111,9 +112,15 @@ ScalarFieldIndexing<T>::BuildIndexRange(int64_t ack_beg, int64_t ack_end, const
const auto& chunk = source->get_chunk(chunk_id);
// build index for chunk
// TODO
auto indexing = scalar::CreateScalarIndexSort<T>();
indexing->Build(vec_base->get_size_per_chunk(), chunk.data());
data_[chunk_id] = std::move(indexing);
if constexpr (std::is_same_v<T, std::string>) {
auto indexing = scalar::CreateStringIndexSort();
indexing->Build(vec_base->get_size_per_chunk(), chunk.data());
data_[chunk_id] = std::move(indexing);
} else {
auto indexing = scalar::CreateScalarIndexSort<T>();
indexing->Build(vec_base->get_size_per_chunk(), chunk.data());
data_[chunk_id] = std::move(indexing);
}
}
}
@ -142,6 +149,8 @@ CreateIndex(const FieldMeta& field_meta, const SegcoreConfig& segcore_config) {
return std::make_unique<ScalarFieldIndexing<float>>(field_meta, segcore_config);
case DataType::DOUBLE:
return std::make_unique<ScalarFieldIndexing<double>>(field_meta, segcore_config);
case DataType::VARCHAR:
return std::make_unique<ScalarFieldIndexing<std::string>>(field_meta, segcore_config);
default:
PanicInfo("unsupported");
}

View File

@ -118,22 +118,21 @@ class IndexingRecord {
void
Initialize() {
int offset_id = 0;
for (const FieldMeta& field : schema_) {
auto offset = FieldOffset(offset_id);
for (auto& [field_id, field_meta] : schema_.get_fields()) {
++offset_id;
if (field.is_vector()) {
if (field_meta.is_vector()) {
// TODO: skip binary small index now, reenable after config.yaml is ready
if (field.get_data_type() == DataType::VECTOR_BINARY) {
if (field_meta.get_data_type() == DataType::VECTOR_BINARY) {
continue;
}
// flat should be skipped
if (!field.get_metric_type().has_value()) {
if (!field_meta.get_metric_type().has_value()) {
continue;
}
}
field_indexings_.try_emplace(offset, CreateIndex(field, segcore_config_));
field_indexings_.try_emplace(field_id, CreateIndex(field_meta, segcore_config_));
}
assert(offset_id == schema_.size());
}
@ -149,28 +148,28 @@ class IndexingRecord {
}
const FieldIndexing&
get_field_indexing(FieldOffset field_offset) const {
Assert(field_indexings_.count(field_offset));
return *field_indexings_.at(field_offset);
get_field_indexing(FieldId field_id) const {
Assert(field_indexings_.count(field_id));
return *field_indexings_.at(field_id);
}
const VectorFieldIndexing&
get_vec_field_indexing(FieldOffset field_offset) const {
auto& field_indexing = get_field_indexing(field_offset);
get_vec_field_indexing(FieldId field_id) const {
auto& field_indexing = get_field_indexing(field_id);
auto ptr = dynamic_cast<const VectorFieldIndexing*>(&field_indexing);
AssertInfo(ptr, "invalid indexing");
return *ptr;
}
bool
is_in(FieldOffset field_offset) const {
return field_indexings_.count(field_offset);
is_in(FieldId field_id) const {
return field_indexings_.count(field_id);
}
template <typename T>
auto
get_scalar_field_indexing(FieldOffset field_offset) const -> const ScalarFieldIndexing<T>& {
auto& entry = get_field_indexing(field_offset);
get_scalar_field_indexing(FieldId field_id) const -> const ScalarFieldIndexing<T>& {
auto& entry = get_field_indexing(field_id);
auto ptr = dynamic_cast<const ScalarFieldIndexing<T>*>(&entry);
AssertInfo(ptr, "invalid indexing");
return *ptr;
@ -189,7 +188,7 @@ class IndexingRecord {
private:
// field_offset => indexing
std::map<FieldOffset, std::unique_ptr<FieldIndexing>> field_indexings_;
std::map<FieldId, std::unique_ptr<FieldIndexing>> field_indexings_;
};
} // namespace milvus::segcore

View File

@ -14,46 +14,53 @@
namespace milvus::segcore {
InsertRecord::InsertRecord(const Schema& schema, int64_t size_per_chunk)
: uids_(size_per_chunk), timestamps_(size_per_chunk) {
: row_ids_(size_per_chunk), timestamps_(size_per_chunk) {
for (auto& field : schema) {
if (field.is_vector()) {
if (field.get_data_type() == DataType::VECTOR_FLOAT) {
this->append_field_data<FloatVector>(field.get_dim(), size_per_chunk);
auto field_id = field.first;
auto& field_meta = field.second;
if (field_meta.is_vector()) {
if (field_meta.get_data_type() == DataType::VECTOR_FLOAT) {
this->append_field_data<FloatVector>(field_id, field_meta.get_dim(), size_per_chunk);
continue;
} else if (field.get_data_type() == DataType::VECTOR_BINARY) {
this->append_field_data<BinaryVector>(field.get_dim(), size_per_chunk);
} else if (field_meta.get_data_type() == DataType::VECTOR_BINARY) {
this->append_field_data<BinaryVector>(field_id, field_meta.get_dim(), size_per_chunk);
continue;
} else {
PanicInfo("unsupported");
}
}
switch (field.get_data_type()) {
switch (field_meta.get_data_type()) {
case DataType::BOOL: {
this->append_field_data<bool>(size_per_chunk);
this->append_field_data<bool>(field_id, size_per_chunk);
break;
}
case DataType::INT8: {
this->append_field_data<int8_t>(size_per_chunk);
this->append_field_data<int8_t>(field_id, size_per_chunk);
break;
}
case DataType::INT16: {
this->append_field_data<int16_t>(size_per_chunk);
this->append_field_data<int16_t>(field_id, size_per_chunk);
break;
}
case DataType::INT32: {
this->append_field_data<int32_t>(size_per_chunk);
this->append_field_data<int32_t>(field_id, size_per_chunk);
break;
}
case DataType::INT64: {
this->append_field_data<int64_t>(size_per_chunk);
this->append_field_data<int64_t>(field_id, size_per_chunk);
break;
}
case DataType::FLOAT: {
this->append_field_data<float>(size_per_chunk);
this->append_field_data<float>(field_id, size_per_chunk);
break;
}
case DataType::DOUBLE: {
this->append_field_data<double>(size_per_chunk);
this->append_field_data<double>(field_id, size_per_chunk);
break;
}
case DataType::VARCHAR: {
this->append_field_data<std::string>(field_id, size_per_chunk);
break;
}
default: {

View File

@ -13,34 +13,41 @@
#include <memory>
#include <vector>
#include <unordered_map>
#include "common/Schema.h"
#include "segcore/AckResponder.h"
#include "segcore/ConcurrentVector.h"
#include "segcore/Record.h"
#include "TimestampIndex.h"
namespace milvus::segcore {
struct InsertRecord {
ConcurrentVector<Timestamp> timestamps_;
ConcurrentVector<idx_t> row_ids_;
// used for preInsert of growing segment
std::atomic<int64_t> reserved = 0;
AckResponder ack_responder_;
ConcurrentVector<Timestamp> timestamps_;
ConcurrentVector<idx_t> uids_;
// used for timestamps index of sealed segment
TimestampIndex timestamp_index_;
explicit InsertRecord(const Schema& schema, int64_t size_per_chunk);
// get field data without knowing the type
VectorBase*
get_field_data_base(FieldOffset field_offset) const {
auto ptr = fields_data_[field_offset.get()].get();
get_field_data_base(FieldId field_id) const {
auto ptr = fields_data_.at(field_id).get();
return ptr;
}
// get field data in given type, const version
template <typename Type>
const ConcurrentVector<Type>*
get_field_data(FieldOffset field_offset) const {
auto base_ptr = get_field_data_base(field_offset);
get_field_data(FieldId field_id) const {
auto base_ptr = get_field_data_base(field_id);
auto ptr = dynamic_cast<const ConcurrentVector<Type>*>(base_ptr);
Assert(ptr);
return ptr;
@ -49,8 +56,8 @@ struct InsertRecord {
// get field data in given type, non-const version
template <typename Type>
ConcurrentVector<Type>*
get_field_data(FieldOffset field_offset) {
auto base_ptr = get_field_data_base(field_offset);
get_field_data(FieldId field_id) {
auto base_ptr = get_field_data_base(field_id);
auto ptr = dynamic_cast<ConcurrentVector<Type>*>(base_ptr);
Assert(ptr);
return ptr;
@ -59,21 +66,27 @@ struct InsertRecord {
// append a column of scalar type
template <typename Type>
void
append_field_data(int64_t size_per_chunk) {
static_assert(std::is_fundamental_v<Type>);
fields_data_.emplace_back(std::make_unique<ConcurrentVector<Type>>(size_per_chunk));
append_field_data(FieldId field_id, int64_t size_per_chunk) {
static_assert(IsScalar<Type>);
fields_data_.emplace(field_id, std::make_unique<ConcurrentVector<Type>>(size_per_chunk));
}
// append a column of vector type
template <typename VectorType>
void
append_field_data(int64_t dim, int64_t size_per_chunk) {
append_field_data(FieldId field_id, int64_t dim, int64_t size_per_chunk) {
static_assert(std::is_base_of_v<VectorTrait, VectorType>);
fields_data_.emplace_back(std::make_unique<ConcurrentVector<VectorType>>(dim, size_per_chunk));
fields_data_.emplace(field_id, std::make_unique<ConcurrentVector<VectorType>>(dim, size_per_chunk));
}
void
drop_field_data(FieldId field_id) {
fields_data_.erase(field_id);
}
private:
std::vector<std::unique_ptr<VectorBase>> fields_data_;
// std::vector<std::unique_ptr<VectorBase>> fields_data_;
std::unordered_map<FieldId, std::unique_ptr<VectorBase>> fields_data_;
};
} // namespace milvus::segcore

View File

@ -19,14 +19,15 @@
using milvus::SearchResult;
struct SearchResultPair {
int64_t primary_key_;
milvus::PkType primary_key_;
float distance_;
milvus::SearchResult* search_result_;
int64_t index_;
int64_t offset_;
int64_t offset_rb_; // right bound
SearchResultPair(int64_t primary_key, float distance, SearchResult* result, int64_t index, int64_t lb, int64_t rb)
SearchResultPair(
milvus::PkType primary_key, float distance, SearchResult* result, int64_t index, int64_t lb, int64_t rb)
: primary_key_(primary_key),
distance_(distance),
search_result_(result),
@ -37,10 +38,10 @@ struct SearchResultPair {
bool
operator>(const SearchResultPair& other) const {
if (this->primary_key_ == INVALID_ID) {
if (this->primary_key_ == INVALID_PK) {
return false;
} else {
if (other.primary_key_ == INVALID_ID) {
if (other.primary_key_ == INVALID_PK) {
return true;
} else {
return (distance_ > other.distance_);
@ -50,17 +51,12 @@ struct SearchResultPair {
void
reset() {
offset_++;
if (offset_ < offset_rb_) {
offset_++;
if (offset_ < offset_rb_) {
primary_key_ = search_result_->primary_keys_.at(offset_);
distance_ = search_result_->distances_.at(offset_);
} else {
primary_key_ = INVALID_ID;
distance_ = std::numeric_limits<float>::max();
}
primary_key_ = search_result_->primary_keys_.at(offset_);
distance_ = search_result_->distances_.at(offset_);
} else {
primary_key_ = INVALID_ID;
primary_key_ = INVALID_PK;
distance_ = std::numeric_limits<float>::max();
}
}

View File

@ -12,6 +12,7 @@
#pragma once
#include <map>
#include <unordered_map>
#include <memory>
#include <shared_mutex>
#include <utility>
@ -32,36 +33,36 @@ using SealedIndexingEntryPtr = std::unique_ptr<SealedIndexingEntry>;
struct SealedIndexingRecord {
void
append_field_indexing(FieldOffset field_offset, MetricType metric_type, knowhere::VecIndexPtr indexing) {
append_field_indexing(FieldId field_id, MetricType metric_type, knowhere::VecIndexPtr indexing) {
auto ptr = std::make_unique<SealedIndexingEntry>();
ptr->indexing_ = indexing;
ptr->metric_type_ = metric_type;
std::unique_lock lck(mutex_);
field_indexings_[field_offset] = std::move(ptr);
field_indexings_[field_id] = std::move(ptr);
}
const SealedIndexingEntry*
get_field_indexing(FieldOffset field_offset) const {
get_field_indexing(FieldId field_id) const {
std::shared_lock lck(mutex_);
AssertInfo(field_indexings_.count(field_offset), "field_offset not found");
return field_indexings_.at(field_offset).get();
AssertInfo(field_indexings_.count(field_id), "field_id not found");
return field_indexings_.at(field_id).get();
}
void
drop_field_indexing(FieldOffset field_offset) {
drop_field_indexing(FieldId field_id) {
std::unique_lock lck(mutex_);
field_indexings_.erase(field_offset);
field_indexings_.erase(field_id);
}
bool
is_ready(FieldOffset field_offset) const {
is_ready(FieldId field_id) const {
std::shared_lock lck(mutex_);
return field_indexings_.count(field_offset);
return field_indexings_.count(field_id);
}
private:
// field_offset -> SealedIndexingEntry
std::map<FieldOffset, SealedIndexingEntryPtr> field_indexings_;
std::unordered_map<FieldId, SealedIndexingEntryPtr> field_indexings_;
mutable std::shared_mutex mutex_;
};

View File

@ -43,19 +43,12 @@ class SegmentGrowing : public SegmentInternalInterface {
virtual int64_t
PreInsert(int64_t size) = 0;
virtual Status
Insert(int64_t reserved_offset,
int64_t size,
const int64_t* row_ids,
const Timestamp* timestamps,
const RowBasedRawData& values) = 0;
virtual void
Insert(int64_t reserved_offset,
int64_t size,
const int64_t* row_ids,
const Timestamp* timestamps,
const ColumnBasedRawData& values) = 0;
const InsertData* insert_data) = 0;
// virtual int64_t
// PreDelete(int64_t size) = 0;

View File

@ -23,12 +23,13 @@
#include "segcore/Reduce.h"
#include "segcore/SegmentGrowingImpl.h"
#include "utils/Utils.h"
#include "segcore/Utils.h"
namespace milvus::segcore {
int64_t
SegmentGrowingImpl::PreInsert(int64_t size) {
auto reserved_begin = record_.reserved.fetch_add(size);
auto reserved_begin = insert_record_.reserved.fetch_add(size);
return reserved_begin;
}
@ -65,12 +66,12 @@ SegmentGrowingImpl::get_deleted_bitmap(int64_t del_barrier,
}
for (auto del_index = start; del_index < end; ++del_index) {
// get uid in delete logs
auto uid = deleted_record_.uids_[del_index];
auto uid = deleted_record_.pks_[del_index];
// map uid to corresponding offsets, select the max one, which should be the target
// the max one should be closest to query_timestamp, so the delete log should refer to it
int64_t the_offset = -1;
auto [iter_b, iter_e] = uid2offset_.equal_range(uid);
auto [iter_b, iter_e] = pk2offset_.equal_range(uid);
for (auto iter = iter_b; iter != iter_e; ++iter) {
auto offset = iter->second;
@ -79,7 +80,7 @@ SegmentGrowingImpl::get_deleted_bitmap(int64_t del_barrier,
if (the_offset == -1) {
continue;
}
if (record_.timestamps_[the_offset] >= query_timestamp) {
if (insert_record_.timestamps_[the_offset] >= query_timestamp) {
bitmap->reset(the_offset);
} else {
bitmap->set(the_offset);
@ -105,121 +106,80 @@ SegmentGrowingImpl::mask_with_delete(BitsetType& bitset, int64_t ins_barrier, Ti
bitset |= delete_bitset;
}
Status
SegmentGrowingImpl::Insert(int64_t reserved_begin,
void
SegmentGrowingImpl::Insert(int64_t reserved_offset,
int64_t size,
const int64_t* uids_raw,
const int64_t* row_ids,
const Timestamp* timestamps_raw,
const RowBasedRawData& entities_raw) {
AssertInfo(entities_raw.count == size, "Entities_raw count not equal to insert size");
// step 1: check schema if valid
if (entities_raw.sizeof_per_row != schema_->get_total_sizeof()) {
std::string msg = "entity length = " + std::to_string(entities_raw.sizeof_per_row) +
", schema length = " + std::to_string(schema_->get_total_sizeof());
throw std::runtime_error(msg);
const InsertData* insert_data) {
AssertInfo(insert_data->num_rows() == size, "Entities_raw count not equal to insert size");
// AssertInfo(insert_data->fields_data_size() == schema_->size(),
// "num fields of insert data not equal to num of schema fields");
// step 1: check insert data if valid
std::unordered_map<FieldId, int64_t> field_id_to_offset;
int64_t field_offset = 0;
for (auto field : insert_data->fields_data()) {
auto field_id = FieldId(field.field_id());
AssertInfo(!field_id_to_offset.count(field_id), "duplicate field data");
field_id_to_offset.emplace(field_id, field_offset++);
}
// step 2: sort timestamp
auto raw_data = reinterpret_cast<const char*>(entities_raw.raw_data);
auto len_per_row = entities_raw.sizeof_per_row;
std::vector<std::tuple<Timestamp, idx_t, int64_t>> ordering;
ordering.resize(size);
// #pragma omp parallel for
// query node already guarantees that the timestamp is ordered, avoid field data copy in c++
// step 3: fill into Segment.ConcurrentVector
insert_record_.timestamps_.set_data_raw(reserved_offset, timestamps_raw, size);
insert_record_.row_ids_.set_data_raw(reserved_offset, row_ids, size);
for (auto [field_id, field_meta] : schema_->get_fields()) {
AssertInfo(field_id_to_offset.count(field_id), "Cannot find field_id");
auto data_offset = field_id_to_offset[field_id];
insert_record_.get_field_data_base(field_id)->set_data_raw(reserved_offset, size,
&insert_data->fields_data(data_offset), field_meta);
}
// step 4: set pks to offset
auto field_id = schema_->get_primary_field_id().value_or(FieldId(-1));
AssertInfo(field_id.get() != INVALID_FIELD_ID, "Primary key is -1");
std::vector<PkType> pks(size);
ParsePksFromFieldData(pks, insert_data->fields_data(field_id_to_offset[field_id]));
for (int i = 0; i < size; ++i) {
ordering[i] = std::make_tuple(timestamps_raw[i], uids_raw[i], i);
}
std::sort(ordering.begin(), ordering.end());
// step 3: and convert row-based data to column-based data accordingly
auto sizeof_infos = schema_->get_sizeof_infos();
std::vector<int> offset_infos(schema_->size() + 1, 0);
std::partial_sum(sizeof_infos.begin(), sizeof_infos.end(), offset_infos.begin() + 1);
std::vector<aligned_vector<uint8_t>> entities(schema_->size());
for (int fid = 0; fid < schema_->size(); ++fid) {
auto len = sizeof_infos[fid];
entities[fid].resize(len * size);
pk2offset_.insert(std::make_pair(pks[i], reserved_offset + i));
}
std::vector<idx_t> uids(size);
std::vector<Timestamp> timestamps(size);
// #pragma omp parallel for
for (int index = 0; index < size; ++index) {
auto [t, uid, order_index] = ordering[index];
timestamps[index] = t;
uids[index] = uid;
for (int fid = 0; fid < schema_->size(); ++fid) {
auto len = sizeof_infos[fid];
auto offset = offset_infos[fid];
auto src = raw_data + order_index * len_per_row + offset;
auto dst = entities[fid].data() + index * len;
memcpy(dst, src, len);
}
}
do_insert(reserved_begin, size, uids.data(), timestamps.data(), entities);
return Status::OK();
}
void
SegmentGrowingImpl::do_insert(int64_t reserved_begin,
int64_t size,
const idx_t* row_ids,
const Timestamp* timestamps,
const std::vector<aligned_vector<uint8_t>>& columns_data) {
// step 4: fill into Segment.ConcurrentVector
record_.timestamps_.set_data(reserved_begin, timestamps, size);
record_.uids_.set_data(reserved_begin, row_ids, size);
for (int fid = 0; fid < schema_->size(); ++fid) {
auto field_offset = FieldOffset(fid);
record_.get_field_data_base(field_offset)->set_data_raw(reserved_begin, columns_data[fid].data(), size);
}
if (schema_->get_is_auto_id()) {
for (int i = 0; i < size; ++i) {
auto row_id = row_ids[i];
// NOTE: this must be the last step, cannot be put above
uid2offset_.insert(std::make_pair(row_id, reserved_begin + i));
}
} else {
auto offset = schema_->get_primary_key_offset().value_or(FieldOffset(-1));
AssertInfo(offset.get() != -1, "Primary key offset is -1");
auto& row = columns_data[offset.get()];
auto row_ptr = reinterpret_cast<const int64_t*>(row.data());
for (int i = 0; i < size; ++i) {
uid2offset_.insert(std::make_pair(row_ptr[i], reserved_begin + i));
}
}
record_.ack_responder_.AddSegment(reserved_begin, reserved_begin + size);
// step 5: update small indexes
insert_record_.ack_responder_.AddSegment(reserved_offset, reserved_offset + size);
if (enable_small_index_) {
int64_t chunk_rows = segcore_config_.get_chunk_rows();
indexing_record_.UpdateResourceAck(record_.ack_responder_.GetAck() / chunk_rows, record_);
indexing_record_.UpdateResourceAck(insert_record_.ack_responder_.GetAck() / chunk_rows, insert_record_);
}
}
Status
SegmentGrowingImpl::Delete(int64_t reserved_begin,
int64_t size,
const int64_t* uids_raw,
const Timestamp* timestamps_raw) {
std::vector<std::tuple<Timestamp, idx_t>> ordering;
ordering.resize(size);
// #pragma omp parallel for
for (int i = 0; i < size; ++i) {
ordering[i] = std::make_tuple(timestamps_raw[i], uids_raw[i]);
SegmentGrowingImpl::Delete(int64_t reserved_begin, int64_t size, const IdArray* ids, const Timestamp* timestamps_raw) {
auto field_id = schema_->get_primary_field_id().value_or(FieldId(-1));
AssertInfo(field_id.get() != -1, "Primary key is -1");
auto& field_meta = schema_->operator[](field_id);
std::vector<PkType> pks(size);
ParsePksFromIDs(pks, field_meta.get_data_type(), *ids);
// step 1: sort timestamp
std::vector<std::tuple<Timestamp, PkType>> ordering(size);
for (int i = 0; i < size; i++) {
ordering[i] = std::make_tuple(timestamps_raw[i], pks[i]);
}
std::sort(ordering.begin(), ordering.end());
std::vector<idx_t> uids(size);
std::vector<Timestamp> timestamps(size);
// #pragma omp parallel for
for (int index = 0; index < size; ++index) {
auto [t, uid] = ordering[index];
timestamps[index] = t;
uids[index] = uid;
std::vector<PkType> sort_pks(size);
std::vector<Timestamp> sort_timestamps(size);
for (int i = 0; i < size; i++) {
auto [t, pk] = ordering[i];
sort_timestamps[i] = t;
sort_pks[i] = pk;
}
deleted_record_.timestamps_.set_data(reserved_begin, timestamps.data(), size);
deleted_record_.uids_.set_data(reserved_begin, uids.data(), size);
// step 2: fill delete record
deleted_record_.timestamps_.set_data_raw(reserved_begin, sort_timestamps.data(), size);
deleted_record_.pks_.set_data_raw(reserved_begin, sort_pks.data(), size);
deleted_record_.ack_responder_.AddSegment(reserved_begin, reserved_begin + size);
return Status::OK();
}
@ -228,7 +188,7 @@ int64_t
SegmentGrowingImpl::GetMemoryUsageInBytes() const {
int64_t total_bytes = 0;
auto chunk_rows = segcore_config_.get_chunk_rows();
int64_t ins_n = upper_align(record_.reserved, chunk_rows);
int64_t ins_n = upper_align(insert_record_.reserved, chunk_rows);
total_bytes += ins_n * (schema_->get_total_sizeof() + 16 + 1);
int64_t del_n = upper_align(deleted_record_.reserved, chunk_rows);
total_bytes += del_n * (16 * 2);
@ -236,8 +196,8 @@ SegmentGrowingImpl::GetMemoryUsageInBytes() const {
}
SpanBase
SegmentGrowingImpl::chunk_data_impl(FieldOffset field_offset, int64_t chunk_id) const {
auto vec = get_insert_record().get_field_data_base(field_offset);
SegmentGrowingImpl::chunk_data_impl(FieldId field_id, int64_t chunk_id) const {
auto vec = get_insert_record().get_field_data_base(field_id);
return vec->get_span_base(chunk_id);
}
@ -256,7 +216,7 @@ SegmentGrowingImpl::vector_search(int64_t vec_count,
const BitsetView& bitset,
SearchResult& output) const {
auto& sealed_indexing = this->get_sealed_indexing_record();
if (sealed_indexing.is_ready(search_info.field_offset_)) {
if (sealed_indexing.is_ready(search_info.field_id_)) {
query::SearchOnSealed(this->get_schema(), sealed_indexing, search_info, query_data, query_count, bitset, output,
id_);
} else {
@ -264,54 +224,64 @@ SegmentGrowingImpl::vector_search(int64_t vec_count,
}
}
void
SegmentGrowingImpl::bulk_subscript(FieldOffset field_offset,
const int64_t* seg_offsets,
int64_t count,
void* output) const {
std::unique_ptr<DataArray>
SegmentGrowingImpl::bulk_subscript(FieldId field_id, const int64_t* seg_offsets, int64_t count) const {
// TODO: support more types
auto vec_ptr = record_.get_field_data_base(field_offset);
auto& field_meta = schema_->operator[](field_offset);
auto vec_ptr = insert_record_.get_field_data_base(field_id);
auto& field_meta = schema_->operator[](field_id);
if (field_meta.is_vector()) {
aligned_vector<char> output(field_meta.get_sizeof() * count);
if (field_meta.get_data_type() == DataType::VECTOR_FLOAT) {
bulk_subscript_impl<FloatVector>(field_meta.get_sizeof(), *vec_ptr, seg_offsets, count, output);
bulk_subscript_impl<FloatVector>(field_meta.get_sizeof(), *vec_ptr, seg_offsets, count, output.data());
} else if (field_meta.get_data_type() == DataType::VECTOR_BINARY) {
bulk_subscript_impl<BinaryVector>(field_meta.get_sizeof(), *vec_ptr, seg_offsets, count, output);
bulk_subscript_impl<BinaryVector>(field_meta.get_sizeof(), *vec_ptr, seg_offsets, count, output.data());
} else {
PanicInfo("logical error");
}
return;
return CreateVectorDataArrayFrom(output.data(), count, field_meta);
}
AssertInfo(!field_meta.is_vector(), "Scalar field meta type is vector type");
switch (field_meta.get_data_type()) {
case DataType::BOOL: {
bulk_subscript_impl<bool>(*vec_ptr, seg_offsets, count, false, output);
break;
FixedVector<bool> output(count);
bulk_subscript_impl<bool>(*vec_ptr, seg_offsets, count, output.data());
return CreateScalarDataArrayFrom(output.data(), count, field_meta);
}
case DataType::INT8: {
bulk_subscript_impl<int8_t>(*vec_ptr, seg_offsets, count, -1, output);
break;
FixedVector<bool> output(count);
bulk_subscript_impl<int8_t>(*vec_ptr, seg_offsets, count, output.data());
return CreateScalarDataArrayFrom(output.data(), count, field_meta);
}
case DataType::INT16: {
bulk_subscript_impl<int16_t>(*vec_ptr, seg_offsets, count, -1, output);
break;
FixedVector<int16_t> output(count);
bulk_subscript_impl<int16_t>(*vec_ptr, seg_offsets, count, output.data());
return CreateScalarDataArrayFrom(output.data(), count, field_meta);
}
case DataType::INT32: {
bulk_subscript_impl<int32_t>(*vec_ptr, seg_offsets, count, -1, output);
break;
FixedVector<int32_t> output(count);
bulk_subscript_impl<int32_t>(*vec_ptr, seg_offsets, count, output.data());
return CreateScalarDataArrayFrom(output.data(), count, field_meta);
}
case DataType::INT64: {
bulk_subscript_impl<int64_t>(*vec_ptr, seg_offsets, count, -1, output);
break;
FixedVector<int64_t> output(count);
bulk_subscript_impl<int64_t>(*vec_ptr, seg_offsets, count, output.data());
return CreateScalarDataArrayFrom(output.data(), count, field_meta);
}
case DataType::FLOAT: {
bulk_subscript_impl<float>(*vec_ptr, seg_offsets, count, -1.0, output);
break;
FixedVector<float> output(count);
bulk_subscript_impl<float>(*vec_ptr, seg_offsets, count, output.data());
return CreateScalarDataArrayFrom(output.data(), count, field_meta);
}
case DataType::DOUBLE: {
bulk_subscript_impl<double>(*vec_ptr, seg_offsets, count, -1.0, output);
break;
FixedVector<double> output(count);
bulk_subscript_impl<double>(*vec_ptr, seg_offsets, count, output.data());
return CreateScalarDataArrayFrom(output.data(), count, field_meta);
}
case DataType::VARCHAR: {
FixedVector<std::string> output(count);
bulk_subscript_impl<std::string>(*vec_ptr, seg_offsets, count, output.data());
return CreateScalarDataArrayFrom(output.data(), count, field_meta);
}
default: {
PanicInfo("unsupported type");
@ -342,8 +312,10 @@ SegmentGrowingImpl::bulk_subscript_impl(int64_t element_sizeof,
template <typename T>
void
SegmentGrowingImpl::bulk_subscript_impl(
const VectorBase& vec_raw, const int64_t* seg_offsets, int64_t count, T default_value, void* output_raw) const {
SegmentGrowingImpl::bulk_subscript_impl(const VectorBase& vec_raw,
const int64_t* seg_offsets,
int64_t count,
void* output_raw) const {
static_assert(IsScalar<T>);
auto vec_ptr = dynamic_cast<const ConcurrentVector<T>*>(&vec_raw);
AssertInfo(vec_ptr, "Pointer of vec_raw is nullptr");
@ -351,7 +323,9 @@ SegmentGrowingImpl::bulk_subscript_impl(
auto output = reinterpret_cast<T*>(output_raw);
for (int64_t i = 0; i < count; ++i) {
auto offset = seg_offsets[i];
output[i] = (offset == INVALID_SEG_OFFSET ? default_value : vec[offset]);
if (offset != INVALID_SEG_OFFSET) {
output[i] = vec[offset];
}
}
}
@ -364,79 +338,23 @@ SegmentGrowingImpl::bulk_subscript(SystemFieldType system_type,
case SystemFieldType::Timestamp:
PanicInfo("timestamp unsupported");
case SystemFieldType::RowId:
bulk_subscript_impl<int64_t>(this->record_.uids_, seg_offsets, count, INVALID_ID, output);
bulk_subscript_impl<int64_t>(this->insert_record_.row_ids_, seg_offsets, count, output);
break;
default:
PanicInfo("unknown subscript fields");
}
}
// copied from stack overflow
template <typename T>
std::vector<size_t>
sort_indexes(const T* src, int64_t size) {
// initialize original index locations
std::vector<size_t> idx(size);
iota(idx.begin(), idx.end(), 0);
// sort indexes based on comparing values in v
// using std::stable_sort instead of std::sort
// to avoid unnecessary index re-orderings
// when v contains elements of equal values
std::stable_sort(idx.begin(), idx.end(), [src](size_t i1, size_t i2) { return src[i1] < src[i2]; });
return idx;
}
void
SegmentGrowingImpl::Insert(int64_t reserved_offset,
int64_t size,
const int64_t* row_ids_raw,
const Timestamp* timestamps_raw,
const ColumnBasedRawData& values) {
auto indexes = sort_indexes(timestamps_raw, size);
std::vector<Timestamp> timestamps(size);
std::vector<idx_t> row_ids(size);
AssertInfo(values.count == size, "Insert values count not equal to insert size");
for (int64_t i = 0; i < size; ++i) {
auto offset = indexes[i];
timestamps[i] = timestamps_raw[offset];
row_ids[i] = row_ids_raw[i];
}
std::vector<aligned_vector<uint8_t>> columns_data;
for (int field_offset = 0; field_offset < schema_->size(); ++field_offset) {
auto& field_meta = schema_->operator[](FieldOffset(field_offset));
aligned_vector<uint8_t> column;
auto element_sizeof = field_meta.get_sizeof();
auto& src_vec = values.columns_[field_offset];
AssertInfo(src_vec.size() == element_sizeof * size, "Vector size is not aligned");
for (int64_t i = 0; i < size; ++i) {
auto offset = indexes[i];
auto beg = src_vec.data() + offset * element_sizeof;
column.insert(column.end(), beg, beg + element_sizeof);
}
columns_data.emplace_back(std::move(column));
}
do_insert(reserved_offset, size, row_ids.data(), timestamps.data(), columns_data);
}
std::vector<SegOffset>
SegmentGrowingImpl::search_ids(const BitsetType& bitset, Timestamp timestamp) const {
std::vector<SegOffset> res_offsets;
for (int i = 0; i < bitset.size(); i++) {
if (bitset[i]) {
SegOffset the_offset(-1);
auto offset = SegOffset(i);
if (record_.timestamps_[offset.get()] < timestamp) {
the_offset = std::max(the_offset, offset);
if (insert_record_.timestamps_[offset.get()] <= timestamp) {
res_offsets.push_back(offset);
}
if (the_offset == SegOffset(-1)) {
continue;
}
res_offsets.push_back(the_offset);
}
}
return res_offsets;
@ -448,16 +366,10 @@ SegmentGrowingImpl::search_ids(const BitsetView& bitset, Timestamp timestamp) co
for (int i = 0; i < bitset.size(); ++i) {
if (!bitset.test(i)) {
SegOffset the_offset(-1);
auto offset = SegOffset(i);
if (record_.timestamps_[offset.get()] < timestamp) {
the_offset = std::max(the_offset, offset);
if (insert_record_.timestamps_[offset.get()] <= timestamp) {
res_offsets.push_back(offset);
}
if (the_offset == SegOffset(-1)) {
continue;
}
res_offsets.push_back(the_offset);
}
}
return res_offsets;
@ -466,24 +378,36 @@ SegmentGrowingImpl::search_ids(const BitsetView& bitset, Timestamp timestamp) co
std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
SegmentGrowingImpl::search_ids(const IdArray& id_array, Timestamp timestamp) const {
AssertInfo(id_array.has_int_id(), "Id array doesn't have int_id element");
auto& src_int_arr = id_array.int_id();
auto field_id = schema_->get_primary_field_id().value_or(FieldId(-1));
AssertInfo(field_id.get() != -1, "Primary key is -1");
auto& field_meta = schema_->operator[](field_id);
auto data_type = field_meta.get_data_type();
auto ids_size = GetSizeOfIdArray(id_array);
std::vector<PkType> pks(ids_size);
ParsePksFromIDs(pks, data_type, id_array);
auto res_id_arr = std::make_unique<IdArray>();
auto res_int_id_arr = res_id_arr->mutable_int_id();
std::vector<SegOffset> res_offsets;
for (auto uid : src_int_arr.data()) {
auto [iter_b, iter_e] = uid2offset_.equal_range(uid);
SegOffset the_offset(-1);
for (auto pk : pks) {
auto [iter_b, iter_e] = pk2offset_.equal_range(pk);
for (auto iter = iter_b; iter != iter_e; ++iter) {
auto offset = SegOffset(iter->second);
if (record_.timestamps_[offset.get()] < timestamp) {
the_offset = std::max(the_offset, offset);
if (insert_record_.timestamps_[offset.get()] <= timestamp) {
switch (data_type) {
case DataType::INT64: {
res_id_arr->mutable_int_id()->add_data(std::get<int64_t>(pk));
break;
}
case DataType::VARCHAR: {
res_id_arr->mutable_str_id()->add_data(std::get<std::string>(pk));
break;
}
default: {
PanicInfo("unsupported type");
}
}
res_offsets.push_back(offset);
}
// if not found, skip
if (the_offset == SegOffset(-1)) {
continue;
}
res_int_id_arr->add_data(uid);
res_offsets.push_back(the_offset);
}
}
return {std::move(res_id_arr), std::move(res_offsets)};

View File

@ -42,26 +42,19 @@ class SegmentGrowingImpl : public SegmentGrowing {
int64_t
PreInsert(int64_t size) override;
Status
Insert(int64_t reserved_offset,
int64_t size,
const int64_t* row_ids,
const Timestamp* timestamps,
const RowBasedRawData& values) override;
void
Insert(int64_t reserved_offset,
int64_t size,
const int64_t* row_ids,
const Timestamp* timestamps,
const ColumnBasedRawData& values) override;
const InsertData* insert_data) override;
int64_t
PreDelete(int64_t size) override;
// TODO: add id into delete log, possibly bitmap
Status
Delete(int64_t reserverd_offset, int64_t size, const int64_t* row_ids, const Timestamp* timestamps) override;
Delete(int64_t reserverd_offset, int64_t size, const IdArray* pks, const Timestamp* timestamps) override;
int64_t
GetMemoryUsageInBytes() const override;
@ -72,7 +65,7 @@ class SegmentGrowingImpl : public SegmentGrowing {
public:
const InsertRecord&
get_insert_record() const {
return record_;
return insert_record_;
}
const IndexingRecord&
@ -97,14 +90,14 @@ class SegmentGrowingImpl : public SegmentGrowing {
// return count of index that has index, i.e., [0, num_chunk_index) have built index
int64_t
num_chunk_index(FieldOffset field_offset) const final {
num_chunk_index(FieldId field_id) const final {
return indexing_record_.get_finished_ack();
}
// deprecated
const knowhere::Index*
chunk_index_impl(FieldOffset field_offset, int64_t chunk_id) const final {
return indexing_record_.get_field_indexing(field_offset).get_chunk_indexing(chunk_id);
chunk_index_impl(FieldId field_id, int64_t chunk_id) const final {
return indexing_record_.get_field_indexing(field_id).get_chunk_indexing(chunk_id);
}
int64_t
@ -121,7 +114,7 @@ class SegmentGrowingImpl : public SegmentGrowing {
int64_t
get_row_count() const override {
return record_.ack_responder_.GetAck();
return insert_record_.ack_responder_.GetAck();
}
ssize_t
@ -135,8 +128,7 @@ class SegmentGrowingImpl : public SegmentGrowing {
// for scalar vectors
template <typename T>
void
bulk_subscript_impl(
const VectorBase& vec_raw, const int64_t* seg_offsets, int64_t count, T default_value, void* output_raw) const;
bulk_subscript_impl(const VectorBase& vec_raw, const int64_t* seg_offsets, int64_t count, void* output_raw) const;
template <typename T>
void
@ -149,8 +141,8 @@ class SegmentGrowingImpl : public SegmentGrowing {
void
bulk_subscript(SystemFieldType system_type, const int64_t* seg_offsets, int64_t count, void* output) const override;
void
bulk_subscript(FieldOffset field_offset, const int64_t* seg_offsets, int64_t count, void* output) const override;
std::unique_ptr<DataArray>
bulk_subscript(FieldId field_id, const int64_t* seg_offsets, int64_t count) const override;
public:
friend std::unique_ptr<SegmentGrowing>
@ -159,7 +151,7 @@ class SegmentGrowingImpl : public SegmentGrowing {
explicit SegmentGrowingImpl(SchemaPtr schema, const SegcoreConfig& segcore_config, int64_t segment_id)
: segcore_config_(segcore_config),
schema_(std::move(schema)),
record_(*schema_, segcore_config.get_chunk_rows()),
insert_record_(*schema_, segcore_config.get_chunk_rows()),
indexing_record_(*schema_, segcore_config_),
id_(segment_id) {
}
@ -189,6 +181,16 @@ class SegmentGrowingImpl : public SegmentGrowing {
std::vector<SegOffset>
search_ids(const BitsetView& view, Timestamp timestamp) const override;
bool
HasIndex(FieldId field_id) const override {
return true;
}
bool
HasFieldData(FieldId field_id) const override {
return true;
}
protected:
std::shared_ptr<DeletedRecord::TmpBitmap>
get_deleted_bitmap(int64_t del_barrier,
@ -200,31 +202,29 @@ class SegmentGrowingImpl : public SegmentGrowing {
num_chunk() const override;
SpanBase
chunk_data_impl(FieldOffset field_offset, int64_t chunk_id) const override;
chunk_data_impl(FieldId field_id, int64_t chunk_id) const override;
void
check_search(const query::Plan* plan) const override {
Assert(plan);
}
private:
void
do_insert(int64_t reserved_begin,
int64_t size,
const idx_t* row_ids,
const Timestamp* timestamps,
const std::vector<aligned_vector<uint8_t>>& columns_data);
private:
SegcoreConfig segcore_config_;
SchemaPtr schema_;
InsertRecord record_;
mutable DeletedRecord deleted_record_;
// small indexes for every chunk
IndexingRecord indexing_record_;
SealedIndexingRecord sealed_indexing_record_;
SealedIndexingRecord sealed_indexing_record_; // not used
tbb::concurrent_unordered_multimap<idx_t, int64_t> uid2offset_;
// inserted fields data and row_ids, timestamps
InsertRecord insert_record_;
// deleted pks
mutable DeletedRecord deleted_record_;
// pks to row offset
tbb::concurrent_unordered_multimap<PkType, int64_t, std::hash<PkType>> pk2offset_;
int64_t id_;
private:

View File

@ -11,6 +11,7 @@
#include "SegmentInterface.h"
#include "query/generated/ExecPlanNodeVisitor.h"
#include "Utils.h"
namespace milvus::segcore {
@ -19,23 +20,21 @@ SegmentInternalInterface::FillPrimaryKeys(const query::Plan* plan, SearchResult&
std::shared_lock lck(mutex_);
AssertInfo(plan, "empty plan");
auto size = results.distances_.size();
AssertInfo(results.ids_.size() == size, "Size of result distances is not equal to size of ids");
AssertInfo(results.seg_offsets_.size() == size, "Size of result distances is not equal to size of ids");
Assert(results.primary_keys_.size() == 0);
results.primary_keys_.resize(size);
auto element_sizeof = sizeof(int64_t);
aligned_vector<char> blob(size * element_sizeof);
if (plan->schema_.get_is_auto_id()) {
bulk_subscript(SystemFieldType::RowId, results.ids_.data(), size, blob.data());
} else {
auto key_offset_opt = get_schema().get_primary_key_offset();
AssertInfo(key_offset_opt.has_value(), "Cannot get primary key offset from schema");
auto key_offset = key_offset_opt.value();
AssertInfo(get_schema()[key_offset].get_data_type() == DataType::INT64, "Primary key field is not INT64 type");
bulk_subscript(key_offset, results.ids_.data(), size, blob.data());
}
auto pk_field_id_opt = get_schema().get_primary_field_id();
AssertInfo(pk_field_id_opt.has_value(), "Cannot get primary key offset from schema");
auto pk_field_id = pk_field_id_opt.value();
AssertInfo(IsPrimaryKeyDataType(get_schema()[pk_field_id].get_data_type()),
"Primary key field is not INT64 or VARCHAR type");
auto field_data = bulk_subscript(pk_field_id, results.seg_offsets_.data(), size);
results.pk_type_ = engine::DataType(field_data->type());
memcpy(results.primary_keys_.data(), blob.data(), element_sizeof * size);
std::vector<PkType> pks(size);
ParsePksFromFieldData(pks, *field_data.get());
results.primary_keys_ = std::move(pks);
}
void
@ -43,39 +42,12 @@ SegmentInternalInterface::FillTargetEntry(const query::Plan* plan, SearchResult&
std::shared_lock lck(mutex_);
AssertInfo(plan, "empty plan");
auto size = results.distances_.size();
AssertInfo(results.ids_.size() == size, "Size of result distances is not equal to size of ids");
std::vector<int64_t> element_sizeofs;
std::vector<aligned_vector<char>> blobs;
// fill row_ids
{
results.ids_data_.resize(size * sizeof(int64_t));
if (plan->schema_.get_is_auto_id()) {
bulk_subscript(SystemFieldType::RowId, results.ids_.data(), size, results.ids_data_.data());
} else {
auto key_offset_opt = get_schema().get_primary_key_offset();
AssertInfo(key_offset_opt.has_value(), "Cannot get primary key offset from schema");
auto key_offset = key_offset_opt.value();
AssertInfo(get_schema()[key_offset].get_data_type() == DataType::INT64,
"Primary key field is not INT64 type");
bulk_subscript(key_offset, results.ids_.data(), size, results.ids_data_.data());
}
}
AssertInfo(results.seg_offsets_.size() == size, "Size of result distances is not equal to size of ids");
// fill other entries except primary key by result_offset
for (auto field_offset : plan->target_entries_) {
auto& field_meta = get_schema()[field_offset];
auto element_sizeof = field_meta.get_sizeof();
aligned_vector<char> blob(size * element_sizeof);
bulk_subscript(field_offset, results.ids_.data(), size, blob.data());
results.output_fields_data_.emplace_back(std::move(blob));
if (field_meta.is_vector()) {
results.AddField(field_meta.get_name(), field_meta.get_id(), field_meta.get_data_type(),
field_meta.get_dim(), field_meta.get_metric_type());
} else {
results.AddField(field_meta.get_name(), field_meta.get_id(), field_meta.get_data_type());
}
for (auto field_id : plan->target_entries_) {
auto field_data = bulk_subscript(field_id, results.seg_offsets_.data(), size);
results.output_fields_data_[field_id] = std::move(field_data);
}
}
@ -92,114 +64,6 @@ SegmentInternalInterface::Search(const query::Plan* plan,
return results;
}
// Note: this is temporary solution.
// modify bulk script implement to make process more clear
static std::unique_ptr<ScalarArray>
CreateScalarArrayFrom(const void* data_raw, int64_t count, DataType data_type) {
auto scalar_array = std::make_unique<ScalarArray>();
switch (data_type) {
case DataType::BOOL: {
auto data = reinterpret_cast<const double*>(data_raw);
auto obj = scalar_array->mutable_bool_data();
obj->mutable_data()->Add(data, data + count);
break;
}
case DataType::INT8: {
auto data = reinterpret_cast<const int8_t*>(data_raw);
auto obj = scalar_array->mutable_int_data();
obj->mutable_data()->Add(data, data + count);
break;
}
case DataType::INT16: {
auto data = reinterpret_cast<const int16_t*>(data_raw);
auto obj = scalar_array->mutable_int_data();
obj->mutable_data()->Add(data, data + count);
break;
}
case DataType::INT32: {
auto data = reinterpret_cast<const int32_t*>(data_raw);
auto obj = scalar_array->mutable_int_data();
obj->mutable_data()->Add(data, data + count);
break;
}
case DataType::INT64: {
auto data = reinterpret_cast<const int64_t*>(data_raw);
auto obj = scalar_array->mutable_long_data();
obj->mutable_data()->Add(data, data + count);
break;
}
case DataType::FLOAT: {
auto data = reinterpret_cast<const float*>(data_raw);
auto obj = scalar_array->mutable_float_data();
obj->mutable_data()->Add(data, data + count);
break;
}
case DataType::DOUBLE: {
auto data = reinterpret_cast<const double*>(data_raw);
auto obj = scalar_array->mutable_double_data();
obj->mutable_data()->Add(data, data + count);
break;
}
default: {
PanicInfo("unsupported datatype");
}
}
return scalar_array;
}
std::unique_ptr<DataArray>
CreateDataArrayFrom(const void* data_raw, int64_t count, const FieldMeta& field_meta) {
auto data_type = field_meta.get_data_type();
auto data_array = std::make_unique<DataArray>();
data_array->set_field_id(field_meta.get_id().get());
data_array->set_type(milvus::proto::schema::DataType(field_meta.get_data_type()));
if (!datatype_is_vector(data_type)) {
auto scalar_array = CreateScalarArrayFrom(data_raw, count, data_type);
data_array->set_allocated_scalars(scalar_array.release());
} else {
auto vector_array = data_array->mutable_vectors();
auto dim = field_meta.get_dim();
vector_array->set_dim(dim);
switch (data_type) {
case DataType::VECTOR_FLOAT: {
auto length = count * dim;
auto data = reinterpret_cast<const float*>(data_raw);
auto obj = vector_array->mutable_float_vector();
obj->mutable_data()->Add(data, data + length);
break;
}
case DataType::VECTOR_BINARY: {
AssertInfo(dim % 8 == 0, "Binary vector field dimension is not a multiple of 8");
auto num_bytes = count * dim / 8;
auto data = reinterpret_cast<const char*>(data_raw);
auto obj = vector_array->mutable_binary_vector();
obj->assign(data, num_bytes);
break;
}
default: {
PanicInfo("unsupported datatype");
}
}
}
return data_array;
}
std::unique_ptr<DataArray>
SegmentInternalInterface::BulkSubScript(FieldOffset field_offset, const SegOffset* seg_offsets, int64_t count) const {
if (field_offset.get() >= 0) {
auto& field_meta = get_schema()[field_offset];
aligned_vector<char> data(field_meta.get_sizeof() * count);
bulk_subscript(field_offset, (const int64_t*)seg_offsets, count, data.data());
return CreateDataArrayFrom(data.data(), count, field_meta);
} else {
Assert(field_offset.get() == -1);
aligned_vector<char> data(sizeof(int64_t) * count);
bulk_subscript(SystemFieldType::RowId, (const int64_t*)seg_offsets, count, data.data());
return CreateDataArrayFrom(data.data(), count, FieldMeta::RowIdMeta);
}
}
std::unique_ptr<proto::segcore::RetrieveResults>
SegmentInternalInterface::Retrieve(const query::RetrievePlan* plan, Timestamp timestamp) const {
std::shared_lock lck(mutex_);
@ -212,16 +76,33 @@ SegmentInternalInterface::Retrieve(const query::RetrievePlan* plan, Timestamp ti
auto fields_data = results->mutable_fields_data();
auto ids = results->mutable_ids();
auto pk_offset = plan->schema_.get_primary_key_offset();
for (auto field_offset : plan->field_offsets_) {
auto col = BulkSubScript(field_offset, (SegOffset*)retrieve_results.result_offsets_.data(),
retrieve_results.result_offsets_.size());
auto pk_field_id = plan->schema_.get_primary_field_id();
for (auto field_id : plan->field_ids_) {
auto& field_mata = plan->schema_[field_id];
auto col =
bulk_subscript(field_id, retrieve_results.result_offsets_.data(), retrieve_results.result_offsets_.size());
auto col_data = col.release();
fields_data->AddAllocated(col_data);
if (pk_offset.has_value() && pk_offset.value() == field_offset) {
auto int_ids = ids->mutable_int_id();
auto src_data = col_data->scalars().long_data();
int_ids->mutable_data()->Add(src_data.data().begin(), src_data.data().end());
if (pk_field_id.has_value() && pk_field_id.value() == field_id) {
switch (field_mata.get_data_type()) {
case DataType::INT64: {
auto int_ids = ids->mutable_int_id();
auto src_data = col_data->scalars().long_data();
int_ids->mutable_data()->Add(src_data.data().begin(), src_data.data().end());
break;
}
case DataType::VARCHAR: {
auto str_ids = ids->mutable_str_id();
auto src_data = col_data->scalars().string_data();
for (auto i = 0; i < src_data.data_size(); ++i)
*(str_ids->mutable_data()->Add()) = src_data.data(i);
break;
}
default: {
PanicInfo("unsupported data type");
}
}
}
}
return results;

View File

@ -51,6 +51,7 @@ class SegmentInterface {
virtual std::unique_ptr<proto::segcore::RetrieveResults>
Retrieve(const query::RetrievePlan* Plan, Timestamp timestamp) const = 0;
// TODO: memory use is not correct when load string or load string index
virtual int64_t
GetMemoryUsageInBytes() const = 0;
@ -64,7 +65,7 @@ class SegmentInterface {
PreDelete(int64_t size) = 0;
virtual Status
Delete(int64_t reserved_offset, int64_t size, const int64_t* row_ids, const Timestamp* timestamps) = 0;
Delete(int64_t reserved_offset, int64_t size, const IdArray* pks, const Timestamp* timestamps) = 0;
};
// internal API for DSL calculation
@ -73,16 +74,16 @@ class SegmentInternalInterface : public SegmentInterface {
public:
template <typename T>
Span<T>
chunk_data(FieldOffset field_offset, int64_t chunk_id) const {
return static_cast<Span<T>>(chunk_data_impl(field_offset, chunk_id));
chunk_data(FieldId field_id, int64_t chunk_id) const {
return static_cast<Span<T>>(chunk_data_impl(field_id, chunk_id));
}
template <typename T>
const scalar::ScalarIndex<T>&
chunk_scalar_index(FieldOffset field_offset, int64_t chunk_id) const {
chunk_scalar_index(FieldId field_id, int64_t chunk_id) const {
static_assert(IsScalar<T>);
using IndexType = scalar::ScalarIndex<T>;
auto base_ptr = chunk_index_impl(field_offset, chunk_id);
auto base_ptr = chunk_index_impl(field_id, chunk_id);
auto ptr = dynamic_cast<const IndexType*>(base_ptr);
AssertInfo(ptr, "entry mismatch");
return *ptr;
@ -102,6 +103,12 @@ class SegmentInternalInterface : public SegmentInterface {
std::unique_ptr<proto::segcore::RetrieveResults>
Retrieve(const query::RetrievePlan* plan, Timestamp timestamp) const override;
virtual bool
HasIndex(FieldId field_id) const = 0;
virtual bool
HasFieldData(FieldId field_id) const = 0;
virtual std::string
debug() const = 0;
@ -120,7 +127,7 @@ class SegmentInternalInterface : public SegmentInterface {
// count of chunk that has index available
virtual int64_t
num_chunk_index(FieldOffset field_offset) const = 0;
num_chunk_index(FieldId field_id) const = 0;
virtual void
mask_with_timestamps(BitsetType& bitset_chunk, Timestamp timestamp) const = 0;
@ -148,11 +155,11 @@ class SegmentInternalInterface : public SegmentInterface {
protected:
// internal API: return chunk_data in span
virtual SpanBase
chunk_data_impl(FieldOffset field_offset, int64_t chunk_id) const = 0;
chunk_data_impl(FieldId field_id, int64_t chunk_id) const = 0;
// internal API: return chunk_index in span, support scalar index only
virtual const knowhere::Index*
chunk_index_impl(FieldOffset field_offset, int64_t chunk_id) const = 0;
chunk_index_impl(FieldId field_id, int64_t chunk_id) const = 0;
// TODO remove system fields
// calculate output[i] = Vec[seg_offsets[i]}, where Vec binds to system_type
@ -160,13 +167,8 @@ class SegmentInternalInterface : public SegmentInterface {
bulk_subscript(SystemFieldType system_type, const int64_t* seg_offsets, int64_t count, void* output) const = 0;
// calculate output[i] = Vec[seg_offsets[i]}, where Vec binds to field_offset
virtual void
bulk_subscript(FieldOffset field_offset, const int64_t* seg_offsets, int64_t count, void* output) const = 0;
// TODO: special hack: FieldOffset == -1 -> RowId.
// TODO: remove this hack when transfer is done
virtual std::unique_ptr<DataArray>
BulkSubScript(FieldOffset field_offset, const SegOffset* seg_offsets, int64_t count) const;
bulk_subscript(FieldId field_id, const int64_t* seg_offsets, int64_t count) const = 0;
virtual void
check_search(const query::Plan* plan) const = 0;
@ -175,10 +177,4 @@ class SegmentInternalInterface : public SegmentInterface {
mutable std::shared_mutex mutex_;
};
static std::unique_ptr<ScalarArray>
CreateScalarArrayFrom(const void* data_raw, int64_t count, DataType data_type);
std::unique_ptr<DataArray>
CreateDataArrayFrom(const void* data_raw, int64_t count, const FieldMeta& field_meta);
} // namespace milvus::segcore

View File

@ -34,10 +34,6 @@ class SegmentSealed : public SegmentInternalInterface {
DropIndex(const FieldId field_id) = 0;
virtual void
DropFieldData(const FieldId field_id) = 0;
virtual bool
HasIndex(FieldId field_id) const = 0;
virtual bool
HasFieldData(FieldId field_id) const = 0;
};
using SegmentSealedPtr = std::unique_ptr<SegmentSealed>;

View File

@ -14,17 +14,22 @@
#include "query/SearchBruteForce.h"
#include "query/SearchOnSealed.h"
#include "query/ScalarIndex.h"
#include "Utils.h"
namespace milvus::segcore {
static inline void
set_bit(BitsetType& bitset, FieldOffset field_offset, bool flag = true) {
bitset[field_offset.get()] = flag;
set_bit(BitsetType& bitset, FieldId field_id, bool flag = true) {
auto pos = field_id.get() - START_USER_FIELDID;
AssertInfo(pos >= 0, "invalid field id");
bitset[pos] = flag;
}
static inline bool
get_bit(const BitsetType& bitset, FieldOffset field_offset) {
return bitset[field_offset.get()];
get_bit(const BitsetType& bitset, FieldId field_id) {
auto pos = field_id.get() - START_USER_FIELDID;
AssertInfo(pos >= 0, "invalid field id");
return bitset[pos];
}
int64_t
@ -33,136 +38,168 @@ SegmentSealedImpl::PreDelete(int64_t size) {
return reserved_begin;
}
void
print(const std::map<std::string, std::string>& m) {
for (const auto& [k, v] : m) {
std::cout << k << ": " << v << std::endl;
}
}
void
print(const LoadIndexInfo& info) {
std::cout << "------------------LoadIndexInfo----------------------" << std::endl;
std::cout << "field_id: " << info.field_id << std::endl;
std::cout << "field_type: " << info.field_type << std::endl;
std::cout << "index_params:" << std::endl;
print(info.index_params);
std::cout << "------------------LoadIndexInfo----------------------" << std::endl;
}
void
print(const LoadFieldDataInfo& info) {
std::cout << "------------------LoadFieldDataInfo----------------------" << std::endl;
std::cout << "field_id: " << info.field_id << std::endl;
std::cout << "------------------LoadFieldDataInfo----------------------" << std::endl;
}
void
SegmentSealedImpl::LoadIndex(const LoadIndexInfo& info) {
// print(info);
// NOTE: lock only when data is ready to avoid starvation
auto field_id = FieldId(info.field_id);
auto field_offset = schema_->get_offset(field_id);
auto& field_meta = schema_->operator[](field_id);
if (field_meta.is_vector()) {
LoadVecIndex(info);
} else {
LoadScalarIndex(info);
}
}
void
SegmentSealedImpl::LoadVecIndex(const LoadIndexInfo& info) {
// NOTE: lock only when data is ready to avoid starvation
auto field_id = FieldId(info.field_id);
auto index = std::dynamic_pointer_cast<knowhere::VecIndex>(info.index);
AssertInfo(info.index_params.count("metric_type"), "Can't get metric_type in index_params");
auto metric_type_str = info.index_params.at("metric_type");
auto row_count = info.index->Count();
auto row_count = index->Count();
AssertInfo(row_count > 0, "Index count is 0");
std::unique_lock lck(mutex_);
AssertInfo(!get_bit(vecindex_ready_bitset_, field_offset),
"Can't get bitset element at " + std::to_string(field_offset.get()));
AssertInfo(!get_bit(vecindex_ready_bitset_, field_id),
"Can't get bitset element at " + std::to_string(field_id.get()));
if (row_count_opt_.has_value()) {
AssertInfo(row_count_opt_.value() == row_count, "load data has different row count from other columns");
} else {
row_count_opt_ = row_count;
}
AssertInfo(!vecindexs_.is_ready(field_offset), "vec index is not ready");
vecindexs_.append_field_indexing(field_offset, GetMetricType(metric_type_str), info.index);
AssertInfo(!vector_indexings_.is_ready(field_id), "vec index is not ready");
vector_indexings_.append_field_indexing(field_id, GetMetricType(metric_type_str), index);
set_bit(vecindex_ready_bitset_, field_offset, true);
set_bit(vecindex_ready_bitset_, field_id, true);
lck.unlock();
}
void
SegmentSealedImpl::LoadScalarIndex(const LoadIndexInfo& info) {
// NOTE: lock only when data is ready to avoid starvation
auto field_id = FieldId(info.field_id);
auto index = std::dynamic_pointer_cast<scalar::IndexBase>(info.index);
auto row_count = index->Count();
AssertInfo(row_count > 0, "Index count is 0");
std::unique_lock lck(mutex_);
if (row_count_opt_.has_value()) {
AssertInfo(row_count_opt_.value() == row_count, "load data has different row count from other columns");
} else {
row_count_opt_ = row_count;
}
scalar_indexings_[field_id] = std::move(index);
set_bit(field_data_ready_bitset_, field_id, true);
lck.unlock();
}
void
SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& info) {
// print(info);
// NOTE: lock only when data is ready to avoid starvation
AssertInfo(info.row_count > 0, "The row count of field data is 0");
auto field_id = FieldId(info.field_id);
AssertInfo(info.blob, "Field info blob is null");
auto create_index = [](const int64_t* data, int64_t size) {
AssertInfo(size, "Vector data size is 0 when create index");
auto pk_index = std::make_unique<ScalarIndexVector>();
pk_index->append_data(data, size, SegOffset(0));
pk_index->build();
return pk_index;
};
AssertInfo(info.field_data != nullptr, "Field info blob is null");
auto size = info.row_count;
if (SystemProperty::Instance().IsSystem(field_id)) {
auto system_field_type = SystemProperty::Instance().GetSystemFieldType(field_id);
if (system_field_type == SystemFieldType::Timestamp) {
auto src_ptr = reinterpret_cast<const Timestamp*>(info.blob);
aligned_vector<Timestamp> vec_data(info.row_count);
std::copy_n(src_ptr, info.row_count, vec_data.data());
auto timestamps = reinterpret_cast<const Timestamp*>(info.field_data->scalars().long_data().data().data());
auto size = info.row_count;
// TODO: load from outside
TimestampIndex index;
auto min_slice_length = size < 4096 ? 1 : 4096;
auto meta = GenerateFakeSlices(src_ptr, size, min_slice_length);
auto meta = GenerateFakeSlices(timestamps, size, min_slice_length);
index.set_length_meta(std::move(meta));
index.build_with(src_ptr, size);
index.build_with(timestamps, size);
// use special index
std::unique_lock lck(mutex_);
update_row_count(info.row_count);
AssertInfo(timestamps_.empty(), "already exists");
timestamps_ = std::move(vec_data);
timestamp_index_ = std::move(index);
AssertInfo(insert_record_.timestamps_.empty(), "already exists");
insert_record_.timestamps_.fill_chunk_data(timestamps, size);
insert_record_.timestamp_index_ = std::move(index);
AssertInfo(insert_record_.timestamps_.num_chunk() == 1, "num chunk not equal to 1 for sealed segment");
} else {
AssertInfo(system_field_type == SystemFieldType::RowId, "System field type of id column is not RowId");
auto src_ptr = reinterpret_cast<const idx_t*>(info.blob);
// prepare data
aligned_vector<idx_t> vec_data(info.row_count);
std::copy_n(src_ptr, info.row_count, vec_data.data());
std::unique_ptr<ScalarIndexBase> pk_index_;
// fix unintentional index update
if (schema_->get_is_auto_id()) {
pk_index_ = create_index(vec_data.data(), vec_data.size());
}
auto row_ids = reinterpret_cast<const idx_t*>(info.field_data->scalars().long_data().data().data());
// write data under lock
std::unique_lock lck(mutex_);
update_row_count(info.row_count);
AssertInfo(row_ids_.empty(), "already exists");
row_ids_ = std::move(vec_data);
if (schema_->get_is_auto_id()) {
primary_key_index_ = std::move(pk_index_);
}
AssertInfo(insert_record_.row_ids_.empty(), "already exists");
insert_record_.row_ids_.fill_chunk_data(row_ids, size);
AssertInfo(insert_record_.row_ids_.num_chunk() == 1, "num chunk not equal to 1 for sealed segment");
}
++system_ready_count_;
} else {
// prepare data
auto field_offset = schema_->get_offset(field_id);
auto& field_meta = schema_->operator[](field_offset);
// Assert(!field_meta.is_vector());
auto element_sizeof = field_meta.get_sizeof();
auto span = SpanBase(info.blob, info.row_count, element_sizeof);
auto length_in_bytes = element_sizeof * info.row_count;
aligned_vector<char> vec_data(length_in_bytes);
memcpy(vec_data.data(), info.blob, length_in_bytes);
// generate scalar index
std::unique_ptr<knowhere::Index> index;
if (!field_meta.is_vector()) {
index = query::generate_scalar_index(span, field_meta.get_data_type());
}
std::unique_ptr<ScalarIndexBase> pk_index_;
if (schema_->get_primary_key_offset() == field_offset) {
pk_index_ = create_index((const int64_t*)vec_data.data(), info.row_count);
}
auto& field_meta = schema_->operator[](field_id);
auto data_type = field_meta.get_data_type();
AssertInfo(data_type == engine::DataType(info.field_data->type()),
"field type of load data is inconsistent with the schema");
auto field_data = insert_record_.get_field_data_base(field_id);
AssertInfo(field_data->empty(), "already exists");
// write data under lock
std::unique_lock lck(mutex_);
update_row_count(info.row_count);
AssertInfo(fields_data_[field_offset.get()].empty(), "field data already exists");
// insert data to insertRecord
field_data->fill_chunk_data(size, info.field_data, field_meta);
AssertInfo(field_data->num_chunk() == 1, "num chunk not equal to 1 for sealed segment");
// set pks to offset
if (schema_->get_primary_field_id() == field_id) {
AssertInfo(field_id.get() != -1, "Primary key is -1");
AssertInfo(pk2offset_.empty(), "already exists");
std::vector<PkType> pks(size);
ParsePksFromFieldData(pks, *info.field_data);
for (int i = 0; i < size; ++i) {
pk2offset_.insert(std::make_pair(pks[i], i));
}
}
if (field_meta.is_vector()) {
AssertInfo(!vecindexs_.is_ready(field_offset), "field data can't be loaded when indexing exists");
fields_data_[field_offset.get()] = std::move(vec_data);
} else {
AssertInfo(!scalar_indexings_[field_offset.get()], "scalar indexing not cleared");
fields_data_[field_offset.get()] = std::move(vec_data);
scalar_indexings_[field_offset.get()] = std::move(index);
AssertInfo(!vector_indexings_.is_ready(field_id), "field data can't be loaded when indexing exists");
} else if (!scalar_indexings_.count(field_id)) {
// generate scalar index
std::unique_ptr<knowhere::Index> index;
index = query::generate_scalar_index(field_data->get_span_base(0), data_type);
scalar_indexings_[field_id] = std::move(index);
}
if (schema_->get_primary_key_offset() == field_offset) {
primary_key_index_ = std::move(pk_index_);
}
set_bit(field_data_ready_bitset_, field_offset, true);
set_bit(field_data_ready_bitset_, field_id, true);
}
update_row_count(info.row_count);
}
void
@ -170,19 +207,25 @@ SegmentSealedImpl::LoadDeletedRecord(const LoadDeletedRecordInfo& info) {
AssertInfo(info.row_count > 0, "The row count of deleted record is 0");
AssertInfo(info.primary_keys, "Deleted primary keys is null");
AssertInfo(info.timestamps, "Deleted timestamps is null");
auto primary_keys = reinterpret_cast<const idx_t*>(info.primary_keys);
auto timestamps = reinterpret_cast<const Timestamp*>(info.timestamps);
// step 1: get pks and timestamps
auto field_id = schema_->get_primary_field_id().value_or(FieldId(-1));
AssertInfo(field_id.get() != -1, "Primary key is -1");
auto& field_meta = schema_->operator[](field_id);
int64_t size = info.row_count;
std::vector<PkType> pks(size);
ParsePksFromIDs(pks, field_meta.get_data_type(), *info.primary_keys);
auto timestamps = reinterpret_cast<const Timestamp*>(info.timestamps);
deleted_record_.uids_.set_data(0, primary_keys, size);
deleted_record_.timestamps_.set_data(0, timestamps, size);
// step 2: fill pks and timestamps
deleted_record_.pks_.set_data_raw(0, pks.data(), size);
deleted_record_.timestamps_.set_data_raw(0, timestamps, size);
deleted_record_.ack_responder_.AddSegment(0, size);
deleted_record_.reserved.fetch_add(size);
deleted_record_.record_size_ = size;
}
int64_t
SegmentSealedImpl::num_chunk_index(FieldOffset field_offset) const {
SegmentSealedImpl::num_chunk_index(FieldId field_id) const {
return 1;
}
@ -197,22 +240,23 @@ SegmentSealedImpl::size_per_chunk() const {
}
SpanBase
SegmentSealedImpl::chunk_data_impl(FieldOffset field_offset, int64_t chunk_id) const {
SegmentSealedImpl::chunk_data_impl(FieldId field_id, int64_t chunk_id) const {
std::shared_lock lck(mutex_);
AssertInfo(get_bit(field_data_ready_bitset_, field_offset),
"Can't get bitset element at " + std::to_string(field_offset.get()));
auto& field_meta = schema_->operator[](field_offset);
AssertInfo(get_bit(field_data_ready_bitset_, field_id),
"Can't get bitset element at " + std::to_string(field_id.get()));
auto& field_meta = schema_->operator[](field_id);
auto element_sizeof = field_meta.get_sizeof();
SpanBase base(fields_data_[field_offset.get()].data(), row_count_opt_.value(), element_sizeof);
return base;
auto field_data = insert_record_.get_field_data_base(field_id);
AssertInfo(field_data->num_chunk() == 1, "num chunk not equal to 1 for sealed segment");
return field_data->get_span_base(0);
}
const knowhere::Index*
SegmentSealedImpl::chunk_index_impl(FieldOffset field_offset, int64_t chunk_id) const {
SegmentSealedImpl::chunk_index_impl(FieldId field_id, int64_t chunk_id) const {
AssertInfo(chunk_id == 0, "Chunk_id is not equal to 0");
// TODO: support scalar index
auto ptr = scalar_indexings_[field_offset.get()].get();
AssertInfo(ptr, "Scalar index of " + std::to_string(field_offset.get()) + " is null");
auto ptr = scalar_indexings_.at(field_id).get();
AssertInfo(ptr, "Scalar index of " + std::to_string(field_id.get()) + " is null");
return ptr;
}
@ -245,17 +289,25 @@ SegmentSealedImpl::get_deleted_bitmap(int64_t del_barrier,
current->del_barrier = del_barrier;
auto bitmap = current->bitmap_ptr;
// Sealed segment only has one chunk with chunk_id 0
auto span = deleted_record_.uids_.get_span_base(0);
auto uids_ptr = reinterpret_cast<const idx_t*>(span.data());
auto delete_pks_data = deleted_record_.pks_.get_chunk_data(0);
auto delete_pks = reinterpret_cast<const PkType*>(delete_pks_data);
auto del_size = deleted_record_.reserved.load();
std::vector<idx_t> ids(del_size);
std::copy_n(uids_ptr, del_size, ids.data());
auto [uids, seg_offsets] = primary_key_index_->do_search_ids(ids);
for (int i = 0; i < uids.size(); ++i) {
std::vector<SegOffset> seg_offsets;
std::vector<PkType> pks;
for (int i = 0; i < del_size; ++i) {
auto [iter_b, iter_e] = pk2offset_.equal_range(delete_pks[i]);
for (auto iter = iter_b; iter != iter_e; ++iter) {
auto [entry_pk, entry_offset] = *iter;
pks.emplace_back(entry_pk);
seg_offsets.emplace_back(SegOffset(entry_offset));
}
}
for (int i = 0; i < pks.size(); ++i) {
bitmap->set(seg_offsets[i].get());
}
if (uids.size() == 0 || seg_offsets.size() == 0) {
if (pks.size() == 0 || seg_offsets.size() == 0) {
return current;
}
@ -305,16 +357,16 @@ SegmentSealedImpl::vector_search(int64_t vec_count,
const BitsetView& bitset,
SearchResult& output) const {
AssertInfo(is_system_field_ready(), "System field is not ready");
auto field_offset = search_info.field_offset_;
auto& field_meta = schema_->operator[](field_offset);
auto field_id = search_info.field_id_;
auto& field_meta = schema_->operator[](field_id);
AssertInfo(field_meta.is_vector(), "The meta type of vector field is not vector type");
if (get_bit(vecindex_ready_bitset_, field_offset)) {
AssertInfo(vecindexs_.is_ready(field_offset),
"vector indexes isn't ready for field " + std::to_string(field_offset.get()));
query::SearchOnSealed(*schema_, vecindexs_, search_info, query_data, query_count, bitset, output, id_);
if (get_bit(vecindex_ready_bitset_, field_id)) {
AssertInfo(vector_indexings_.is_ready(field_id),
"vector indexes isn't ready for field " + std::to_string(field_id.get()));
query::SearchOnSealed(*schema_, vector_indexings_, search_info, query_data, query_count, bitset, output, id_);
return;
} else if (!get_bit(field_data_ready_bitset_, field_offset)) {
} else if (!get_bit(field_data_ready_bitset_, field_id)) {
PanicInfo("Field Data is not loaded");
}
@ -327,11 +379,13 @@ SegmentSealedImpl::vector_search(int64_t vec_count,
dataset.dim = field_meta.get_dim();
dataset.round_decimal = search_info.round_decimal_;
AssertInfo(get_bit(field_data_ready_bitset_, field_offset),
"Can't get bitset element at " + std::to_string(field_offset.get()));
AssertInfo(get_bit(field_data_ready_bitset_, field_id),
"Can't get bitset element at " + std::to_string(field_id.get()));
AssertInfo(row_count_opt_.has_value(), "Can't get row count value");
auto row_count = row_count_opt_.value();
auto chunk_data = fields_data_[field_offset.get()].data();
auto vec_data = insert_record_.get_field_data_base(field_id);
AssertInfo(vec_data->num_chunk() == 1, "num chunk not equal to 1 for sealed segment");
auto chunk_data = vec_data->get_chunk_data(0);
auto sub_qr = [&] {
if (field_meta.get_data_type() == DataType::VECTOR_FLOAT) {
@ -343,7 +397,7 @@ SegmentSealedImpl::vector_search(int64_t vec_count,
SearchResult results;
results.distances_ = std::move(sub_qr.mutable_distances());
results.ids_ = std::move(sub_qr.mutable_ids());
results.seg_offsets_ = std::move(sub_qr.mutable_seg_offsets());
results.topk_ = dataset.topk;
results.num_queries_ = dataset.num_queries;
@ -358,21 +412,17 @@ SegmentSealedImpl::DropFieldData(const FieldId field_id) {
std::unique_lock lck(mutex_);
--system_ready_count_;
if (system_field_type == SystemFieldType::RowId) {
auto row_ids = std::move(row_ids_);
insert_record_.row_ids_.clear();
} else if (system_field_type == SystemFieldType::Timestamp) {
auto ts = std::move(timestamps_);
insert_record_.timestamps_.clear();
}
lck.unlock();
} else {
auto field_offset = schema_->get_offset(field_id);
auto& field_meta = schema_->operator[](field_offset);
auto& field_meta = schema_->operator[](field_id);
std::unique_lock lck(mutex_);
set_bit(field_data_ready_bitset_, field_offset, false);
auto vec = std::move(fields_data_[field_offset.get()]);
set_bit(field_data_ready_bitset_, field_id, false);
insert_record_.drop_field_data(field_id);
lck.unlock();
vec.clear();
}
}
@ -380,14 +430,13 @@ void
SegmentSealedImpl::DropIndex(const FieldId field_id) {
AssertInfo(!SystemProperty::Instance().IsSystem(field_id),
"Field id:" + std::to_string(field_id.get()) + " isn't one of system type when drop index");
auto field_offset = schema_->get_offset(field_id);
auto& field_meta = schema_->operator[](field_offset);
auto& field_meta = schema_->operator[](field_id);
AssertInfo(field_meta.is_vector(),
"Field meta of offset:" + std::to_string(field_offset.get()) + " is not vector type");
"Field meta of offset:" + std::to_string(field_id.get()) + " is not vector type");
std::unique_lock lck(mutex_);
vecindexs_.drop_field_indexing(field_offset);
set_bit(vecindex_ready_bitset_, field_offset, false);
vector_indexings_.drop_field_indexing(field_id);
set_bit(vecindex_ready_bitset_, field_id, false);
}
void
@ -406,15 +455,16 @@ SegmentSealedImpl::check_search(const query::Plan* plan) const {
auto absent_fields = request_fields - field_ready_bitset;
if (absent_fields.any()) {
auto field_offset = FieldOffset(absent_fields.find_first());
auto& field_meta = schema_->operator[](field_offset);
auto field_id = FieldId(absent_fields.find_first() + START_USER_FIELDID);
auto& field_meta = schema_->operator[](field_id);
PanicInfo("User Field(" + field_meta.get_name().get() + ") is not loaded");
}
}
SegmentSealedImpl::SegmentSealedImpl(SchemaPtr schema, int64_t segment_id)
: schema_(schema),
fields_data_(schema->size()),
// fields_data_(schema->size()),
insert_record_(*schema, MAX_ROW_COUNT),
field_data_ready_bitset_(schema->size()),
vecindex_ready_bitset_(schema->size()),
scalar_indexings_(schema->size()),
@ -428,7 +478,9 @@ SegmentSealedImpl::bulk_subscript(SystemFieldType system_type,
void* output) const {
AssertInfo(is_system_field_ready(), "System field isn't ready when do bulk_insert");
AssertInfo(system_type == SystemFieldType::RowId, "System field type of id column is not RowId");
bulk_subscript_impl<int64_t>(row_ids_.data(), seg_offsets, count, output);
AssertInfo(insert_record_.row_ids_.num_chunk() == 1, "num chunk not equal to 1 for sealed segment");
auto field_data = insert_record_.row_ids_.get_chunk_data(0);
bulk_subscript_impl<int64_t>(field_data, seg_offsets, count, output);
}
template <typename T>
@ -439,7 +491,9 @@ SegmentSealedImpl::bulk_subscript_impl(const void* src_raw, const int64_t* seg_o
auto dst = reinterpret_cast<T*>(dst_raw);
for (int64_t i = 0; i < count; ++i) {
auto offset = seg_offsets[i];
dst[i] = (offset == INVALID_SEG_OFFSET ? INVALID_ID : src[offset]);
if (offset != INVALID_SEG_OFFSET) {
dst[i] = src[offset];
}
}
}
@ -458,51 +512,115 @@ SegmentSealedImpl::bulk_subscript_impl(
}
}
void
SegmentSealedImpl::bulk_subscript(FieldOffset field_offset,
const int64_t* seg_offsets,
int64_t count,
void* output) const {
// Assert(get_bit(field_data_ready_bitset_, field_offset));
if (!get_bit(field_data_ready_bitset_, field_offset)) {
return;
}
auto& field_meta = schema_->operator[](field_offset);
auto src_vec = fields_data_[field_offset.get()].data();
std::unique_ptr<DataArray>
SegmentSealedImpl::fill_with_empty(FieldId field_id, int64_t count) const {
auto& field_meta = schema_->operator[](field_id);
switch (field_meta.get_data_type()) {
case DataType::BOOL: {
bulk_subscript_impl<bool>(src_vec, seg_offsets, count, output);
break;
FixedVector<bool> output(count);
return CreateScalarDataArrayFrom(output.data(), count, field_meta);
}
case DataType::INT8: {
bulk_subscript_impl<int8_t>(src_vec, seg_offsets, count, output);
break;
FixedVector<int8_t> output(count);
return CreateScalarDataArrayFrom(output.data(), count, field_meta);
}
case DataType::INT16: {
bulk_subscript_impl<int16_t>(src_vec, seg_offsets, count, output);
break;
FixedVector<int16_t> output(count);
return CreateScalarDataArrayFrom(output.data(), count, field_meta);
}
case DataType::INT32: {
bulk_subscript_impl<int32_t>(src_vec, seg_offsets, count, output);
break;
FixedVector<int32_t> output(count);
return CreateScalarDataArrayFrom(output.data(), count, field_meta);
}
case DataType::INT64: {
bulk_subscript_impl<int64_t>(src_vec, seg_offsets, count, output);
break;
FixedVector<int64_t> output(count);
return CreateScalarDataArrayFrom(output.data(), count, field_meta);
}
case DataType::FLOAT: {
bulk_subscript_impl<float>(src_vec, seg_offsets, count, output);
break;
FixedVector<float> output(count);
return CreateScalarDataArrayFrom(output.data(), count, field_meta);
}
case DataType::DOUBLE: {
bulk_subscript_impl<double>(src_vec, seg_offsets, count, output);
break;
FixedVector<double> output(count);
return CreateScalarDataArrayFrom(output.data(), count, field_meta);
}
case DataType::VARCHAR: {
FixedVector<std::string> output(count);
return CreateScalarDataArrayFrom(output.data(), count, field_meta);
}
case DataType::VECTOR_FLOAT:
case DataType::VECTOR_BINARY: {
bulk_subscript_impl(field_meta.get_sizeof(), src_vec, seg_offsets, count, output);
break;
aligned_vector<char> output(field_meta.get_sizeof() * count);
return CreateVectorDataArrayFrom(output.data(), count, field_meta);
}
default: {
PanicInfo("unsupported");
}
}
}
std::unique_ptr<DataArray>
SegmentSealedImpl::bulk_subscript(FieldId field_id, const int64_t* seg_offsets, int64_t count) const {
if (!HasFieldData(field_id)) {
return fill_with_empty(field_id, count);
}
Assert(get_bit(field_data_ready_bitset_, field_id));
auto& field_meta = schema_->operator[](field_id);
auto field_data = insert_record_.get_field_data_base(field_id);
AssertInfo(field_data->num_chunk() == 1, std::string("num chunk not equal to 1 for sealed segment, num_chunk: ") +
std::to_string(field_data->num_chunk()));
auto src_vec = field_data->get_chunk_data(0);
switch (field_meta.get_data_type()) {
case DataType::BOOL: {
FixedVector<bool> output(count);
bulk_subscript_impl<bool>(src_vec, seg_offsets, count, output.data());
return CreateScalarDataArrayFrom(output.data(), count, field_meta);
}
case DataType::INT8: {
FixedVector<int8_t> output(count);
bulk_subscript_impl<int8_t>(src_vec, seg_offsets, count, output.data());
return CreateScalarDataArrayFrom(output.data(), count, field_meta);
}
case DataType::INT16: {
FixedVector<int16_t> output(count);
bulk_subscript_impl<int16_t>(src_vec, seg_offsets, count, output.data());
return CreateScalarDataArrayFrom(output.data(), count, field_meta);
}
case DataType::INT32: {
FixedVector<int32_t> output(count);
bulk_subscript_impl<int32_t>(src_vec, seg_offsets, count, output.data());
return CreateScalarDataArrayFrom(output.data(), count, field_meta);
}
case DataType::INT64: {
FixedVector<int64_t> output(count);
bulk_subscript_impl<int64_t>(src_vec, seg_offsets, count, output.data());
return CreateScalarDataArrayFrom(output.data(), count, field_meta);
}
case DataType::FLOAT: {
FixedVector<float> output(count);
bulk_subscript_impl<float>(src_vec, seg_offsets, count, output.data());
return CreateScalarDataArrayFrom(output.data(), count, field_meta);
}
case DataType::DOUBLE: {
FixedVector<double> output(count);
bulk_subscript_impl<double>(src_vec, seg_offsets, count, output.data());
return CreateScalarDataArrayFrom(output.data(), count, field_meta);
}
case DataType::VARCHAR: {
FixedVector<std::string> output(count);
bulk_subscript_impl<std::string>(src_vec, seg_offsets, count, output.data());
return CreateScalarDataArrayFrom(output.data(), count, field_meta);
}
case DataType::VECTOR_FLOAT:
case DataType::VECTOR_BINARY: {
aligned_vector<char> output(field_meta.get_sizeof() * count);
bulk_subscript_impl(field_meta.get_sizeof(), src_vec, seg_offsets, count, output.data());
return CreateVectorDataArrayFrom(output.data(), count, field_meta);
}
default: {
@ -516,8 +634,7 @@ SegmentSealedImpl::HasIndex(FieldId field_id) const {
std::shared_lock lck(mutex_);
AssertInfo(!SystemProperty::Instance().IsSystem(field_id),
"Field id:" + std::to_string(field_id.get()) + " isn't one of system type when drop index");
auto field_offset = schema_->get_offset(field_id);
return get_bit(vecindex_ready_bitset_, field_offset);
return get_bit(vecindex_ready_bitset_, field_id);
}
bool
@ -526,40 +643,73 @@ SegmentSealedImpl::HasFieldData(FieldId field_id) const {
if (SystemProperty::Instance().IsSystem(field_id)) {
return is_system_field_ready();
} else {
auto field_offset = schema_->get_offset(field_id);
return get_bit(field_data_ready_bitset_, field_offset);
return get_bit(field_data_ready_bitset_, field_id);
}
}
std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
SegmentSealedImpl::search_ids(const IdArray& id_array, Timestamp timestamp) const {
AssertInfo(id_array.has_int_id(), "string ids are not implemented");
auto arr = id_array.int_id();
AssertInfo(primary_key_index_, "Primary key index is null");
return primary_key_index_->do_search_ids(id_array);
AssertInfo(id_array.has_int_id(), "Id array doesn't have int_id element");
auto field_id = schema_->get_primary_field_id().value_or(FieldId(-1));
AssertInfo(field_id.get() != -1, "Primary key is -1");
auto& field_meta = schema_->operator[](field_id);
auto data_type = field_meta.get_data_type();
auto ids_size = GetSizeOfIdArray(id_array);
std::vector<PkType> pks(ids_size);
ParsePksFromIDs(pks, data_type, id_array);
auto res_id_arr = std::make_unique<IdArray>();
std::vector<SegOffset> res_offsets;
for (auto pk : pks) {
auto [iter_b, iter_e] = pk2offset_.equal_range(pk);
for (auto iter = iter_b; iter != iter_e; ++iter) {
auto offset = SegOffset(iter->second);
if (insert_record_.timestamps_[offset.get()] <= timestamp) {
switch (data_type) {
case DataType::INT64: {
res_id_arr->mutable_int_id()->add_data(std::get<int64_t>(pk));
break;
}
case DataType::VARCHAR: {
res_id_arr->mutable_str_id()->add_data(std::get<std::string>(pk));
break;
}
default: {
PanicInfo("unsupported type");
}
}
res_offsets.push_back(offset);
}
}
}
return {std::move(res_id_arr), std::move(res_offsets)};
}
Status
SegmentSealedImpl::Delete(int64_t reserved_offset,
int64_t row_count,
const int64_t* uids_raw,
const Timestamp* timestamps_raw) {
std::vector<std::tuple<Timestamp, idx_t>> ordering(row_count);
for (int i = 0; i < row_count; i++) {
ordering[i] = std::make_tuple(timestamps_raw[i], uids_raw[i]);
SegmentSealedImpl::Delete(int64_t reserved_offset, int64_t size, const IdArray* ids, const Timestamp* timestamps_raw) {
auto field_id = schema_->get_primary_field_id().value_or(FieldId(-1));
AssertInfo(field_id.get() != -1, "Primary key is -1");
auto& field_meta = schema_->operator[](field_id);
std::vector<PkType> pks(size);
ParsePksFromIDs(pks, field_meta.get_data_type(), *ids);
// step 1: sort timestamp
std::vector<std::tuple<Timestamp, PkType>> ordering(size);
for (int i = 0; i < size; i++) {
ordering[i] = std::make_tuple(timestamps_raw[i], pks[i]);
}
std::sort(ordering.begin(), ordering.end());
std::vector<idx_t> src_uids(row_count);
std::vector<Timestamp> src_timestamps(row_count);
std::vector<PkType> sort_pks(size);
std::vector<Timestamp> sort_timestamps(size);
for (int i = 0; i < row_count; i++) {
auto [t, uid] = ordering[i];
src_timestamps[i] = t;
src_uids[i] = uid;
for (int i = 0; i < size; i++) {
auto [t, pk] = ordering[i];
sort_timestamps[i] = t;
sort_pks[i] = pk;
}
deleted_record_.timestamps_.set_data(reserved_offset, src_timestamps.data(), row_count);
deleted_record_.uids_.set_data(reserved_offset, src_uids.data(), row_count);
deleted_record_.ack_responder_.AddSegment(reserved_offset, row_count);
deleted_record_.timestamps_.set_data_raw(reserved_offset, sort_timestamps.data(), size);
deleted_record_.pks_.set_data_raw(reserved_offset, sort_pks.data(), size);
deleted_record_.ack_responder_.AddSegment(reserved_offset, size);
return Status::OK();
}
@ -568,7 +718,10 @@ SegmentSealedImpl::search_ids(const BitsetType& bitset, Timestamp timestamp) con
std::vector<SegOffset> dst_offset;
for (int i = 0; i < bitset.size(); i++) {
if (bitset[i]) {
dst_offset.emplace_back(SegOffset(i));
auto offset = SegOffset(i);
if (insert_record_.timestamps_[offset.get()] <= timestamp) {
dst_offset.push_back(offset);
}
}
}
return dst_offset;
@ -579,7 +732,10 @@ SegmentSealedImpl::search_ids(const BitsetView& bitset, Timestamp timestamp) con
std::vector<SegOffset> dst_offset;
for (int i = 0; i < bitset.size(); i++) {
if (!bitset.test(i)) {
dst_offset.emplace_back(SegOffset(i));
auto offset = SegOffset(i);
if (insert_record_.timestamps_[offset.get()] <= timestamp) {
dst_offset.push_back(offset);
}
}
}
return dst_offset;
@ -589,7 +745,6 @@ std::string
SegmentSealedImpl::debug() const {
std::string log_str;
log_str += "Sealed\n";
log_str += "Index:" + primary_key_index_->debug();
log_str += "\n";
return log_str;
}
@ -601,7 +756,7 @@ SegmentSealedImpl::LoadSegmentMeta(const proto::segcore::LoadSegmentMeta& segmen
for (auto& info : segment_meta.metas()) {
slice_lengths.push_back(info.row_count());
}
timestamp_index_.set_length_meta(std::move(slice_lengths));
insert_record_.timestamp_index_.set_length_meta(std::move(slice_lengths));
PanicInfo("unimplemented");
}
@ -614,13 +769,15 @@ SegmentSealedImpl::get_active_count(Timestamp ts) const {
void
SegmentSealedImpl::mask_with_timestamps(BitsetType& bitset_chunk, Timestamp timestamp) const {
// TODO change the
AssertInfo(this->timestamps_.size() == get_row_count(), "Timestamp size not equal to row count");
auto range = timestamp_index_.get_active_range(timestamp);
AssertInfo(insert_record_.timestamps_.num_chunk() == 1, "num chunk not equal to 1 for sealed segment");
auto timestamps_data = insert_record_.timestamps_.get_chunk(0);
AssertInfo(timestamps_data.size() == get_row_count(), "Timestamp size not equal to row count");
auto range = insert_record_.timestamp_index_.get_active_range(timestamp);
// range == (size_, size_) and size_ is this->timestamps_.size().
// it means these data are all useful, we don't need to update bitset_chunk.
// It can be thought of as an AND operation with another bitmask that is all 1s, but it is not necessary to do so.
if (range.first == range.second && range.first == this->timestamps_.size()) {
if (range.first == range.second && range.first == timestamps_data.size()) {
// just skip
return;
}
@ -630,7 +787,7 @@ SegmentSealedImpl::mask_with_timestamps(BitsetType& bitset_chunk, Timestamp time
bitset_chunk.reset();
return;
}
auto mask = TimestampIndex::GenerateBitset(timestamp, range, this->timestamps_.data(), this->timestamps_.size());
auto mask = TimestampIndex::GenerateBitset(timestamp, range, timestamps_data.data(), timestamps_data.size());
bitset_chunk &= mask;
}

View File

@ -12,6 +12,7 @@
#pragma once
#include <deque>
#include <unordered_map>
#include <map>
#include <memory>
#include <string>
@ -62,7 +63,7 @@ class SegmentSealedImpl : public SegmentSealed {
public:
int64_t
num_chunk_index(FieldOffset field_offset) const override;
num_chunk_index(FieldId field_id) const override;
int64_t
num_chunk() const override;
@ -78,15 +79,15 @@ class SegmentSealedImpl : public SegmentSealed {
PreDelete(int64_t size) override;
Status
Delete(int64_t reserved_offset, int64_t size, const int64_t* row_ids, const Timestamp* timestamps) override;
Delete(int64_t reserved_offset, int64_t size, const IdArray* pks, const Timestamp* timestamps) override;
protected:
// blob and row_count
SpanBase
chunk_data_impl(FieldOffset field_offset, int64_t chunk_id) const override;
chunk_data_impl(FieldId field_id, int64_t chunk_id) const override;
const knowhere::Index*
chunk_index_impl(FieldOffset field_offset, int64_t chunk_id) const override;
chunk_index_impl(FieldId field_id, int64_t chunk_id) const override;
// Calculate: output[i] = Vec[seg_offset[i]],
// where Vec is determined from field_offset
@ -95,8 +96,8 @@ class SegmentSealedImpl : public SegmentSealed {
// Calculate: output[i] = Vec[seg_offset[i]]
// where Vec is determined from field_offset
void
bulk_subscript(FieldOffset field_offset, const int64_t* seg_offsets, int64_t count, void* output) const override;
std::unique_ptr<DataArray>
bulk_subscript(FieldId field_id, const int64_t* seg_offsets, int64_t count) const override;
void
check_search(const query::Plan* plan) const override;
@ -119,6 +120,9 @@ class SegmentSealedImpl : public SegmentSealed {
bulk_subscript_impl(
int64_t element_sizeof, const void* src_raw, const int64_t* seg_offsets, int64_t count, void* dst_raw);
std::unique_ptr<DataArray>
fill_with_empty(FieldId field_id, int64_t count) const;
void
update_row_count(int64_t row_count) {
if (row_count_opt_.has_value()) {
@ -162,8 +166,11 @@ class SegmentSealedImpl : public SegmentSealed {
std::vector<SegOffset>
search_ids(const BitsetType& view, Timestamp timestamp) const override;
// virtual void
// build_index_if_primary_key(FieldId field_id);
void
LoadVecIndex(const LoadIndexInfo& info);
void
LoadScalarIndex(const LoadIndexInfo& info);
private:
// segment loading state
@ -175,18 +182,21 @@ class SegmentSealedImpl : public SegmentSealed {
// TODO: generate index for scalar
std::optional<int64_t> row_count_opt_;
// TODO: use protobuf format
// TODO: remove duplicated indexing
std::vector<std::unique_ptr<knowhere::Index>> scalar_indexings_;
std::unique_ptr<ScalarIndexBase> primary_key_index_;
// scalar field index
std::unordered_map<FieldId, knowhere::IndexPtr> scalar_indexings_;
// vector field index
SealedIndexingRecord vector_indexings_;
std::vector<aligned_vector<char>> fields_data_;
// inserted fields data and row_ids, timestamps
InsertRecord insert_record_;
// deleted pks
mutable DeletedRecord deleted_record_;
SealedIndexingRecord vecindexs_;
aligned_vector<idx_t> row_ids_;
aligned_vector<Timestamp> timestamps_;
TimestampIndex timestamp_index_;
// pks to row offset
tbb::concurrent_unordered_multimap<PkType, int64_t, std::hash<PkType>> pk2offset_;
// std::unique_ptr<ScalarIndexBase> primary_key_index_;
SchemaPtr schema_;
int64_t id_;
};

View File

@ -0,0 +1,258 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include "Utils.h"
namespace milvus::segcore {
void
ParsePksFromFieldData(std::vector<PkType>& pks, const DataArray& data) {
switch (DataType(data.type())) {
case DataType::INT64: {
auto source_data = reinterpret_cast<const int64_t*>(data.scalars().long_data().data().data());
std::copy_n(source_data, pks.size(), pks.data());
break;
}
case DataType::VARCHAR: {
auto src_data = data.scalars().string_data().data();
std::copy(src_data.begin(), src_data.end(), pks.begin());
break;
}
default: {
PanicInfo("unsupported");
}
}
}
void
ParsePksFromIDs(std::vector<PkType>& pks, DataType data_type, const IdArray& data) {
switch (data_type) {
case DataType::INT64: {
auto source_data = reinterpret_cast<const int64_t*>(data.int_id().data().data());
std::copy_n(source_data, pks.size(), pks.data());
break;
}
case DataType::VARCHAR: {
auto source_data = data.str_id().data();
std::copy(source_data.begin(), source_data.end(), pks.begin());
break;
}
default: {
PanicInfo("unsupported");
}
}
}
int64_t
GetSizeOfIdArray(const IdArray& data) {
if (data.has_int_id()) {
return data.int_id().data_size();
}
if (data.has_str_id()) {
return data.str_id().data_size();
}
PanicInfo("unsupported id type");
}
// Note: this is temporary solution.
// modify bulk script implement to make process more clear
std::unique_ptr<DataArray>
CreateScalarDataArrayFrom(const void* data_raw, int64_t count, const FieldMeta& field_meta) {
auto data_type = field_meta.get_data_type();
auto data_array = std::make_unique<DataArray>();
data_array->set_field_id(field_meta.get_id().get());
data_array->set_type(milvus::proto::schema::DataType(field_meta.get_data_type()));
auto scalar_array = data_array->mutable_scalars();
switch (data_type) {
case DataType::BOOL: {
auto data = reinterpret_cast<const double*>(data_raw);
auto obj = scalar_array->mutable_bool_data();
obj->mutable_data()->Add(data, data + count);
break;
}
case DataType::INT8: {
auto data = reinterpret_cast<const int8_t*>(data_raw);
auto obj = scalar_array->mutable_int_data();
obj->mutable_data()->Add(data, data + count);
break;
}
case DataType::INT16: {
auto data = reinterpret_cast<const int16_t*>(data_raw);
auto obj = scalar_array->mutable_int_data();
obj->mutable_data()->Add(data, data + count);
break;
}
case DataType::INT32: {
auto data = reinterpret_cast<const int32_t*>(data_raw);
auto obj = scalar_array->mutable_int_data();
obj->mutable_data()->Add(data, data + count);
break;
}
case DataType::INT64: {
auto data = reinterpret_cast<const int64_t*>(data_raw);
auto obj = scalar_array->mutable_long_data();
obj->mutable_data()->Add(data, data + count);
break;
}
case DataType::FLOAT: {
auto data = reinterpret_cast<const float*>(data_raw);
auto obj = scalar_array->mutable_float_data();
obj->mutable_data()->Add(data, data + count);
break;
}
case DataType::DOUBLE: {
auto data = reinterpret_cast<const double*>(data_raw);
auto obj = scalar_array->mutable_double_data();
obj->mutable_data()->Add(data, data + count);
break;
}
case DataType::VARCHAR: {
auto data = reinterpret_cast<const std::string*>(data_raw);
auto obj = scalar_array->mutable_string_data();
for (auto i = 0; i < count; i++) *(obj->mutable_data()->Add()) = data[i];
break;
}
default: {
PanicInfo("unsupported datatype");
}
}
return data_array;
}
std::unique_ptr<DataArray>
CreateVectorDataArrayFrom(const void* data_raw, int64_t count, const FieldMeta& field_meta) {
auto data_type = field_meta.get_data_type();
auto data_array = std::make_unique<DataArray>();
data_array->set_field_id(field_meta.get_id().get());
data_array->set_type(milvus::proto::schema::DataType(field_meta.get_data_type()));
auto vector_array = data_array->mutable_vectors();
auto dim = field_meta.get_dim();
vector_array->set_dim(dim);
switch (data_type) {
case DataType::VECTOR_FLOAT: {
auto length = count * dim;
auto data = reinterpret_cast<const float*>(data_raw);
auto obj = vector_array->mutable_float_vector();
obj->mutable_data()->Add(data, data + length);
break;
}
case DataType::VECTOR_BINARY: {
AssertInfo(dim % 8 == 0, "Binary vector field dimension is not a multiple of 8");
auto num_bytes = count * dim / 8;
auto data = reinterpret_cast<const char*>(data_raw);
auto obj = vector_array->mutable_binary_vector();
obj->assign(data, num_bytes);
break;
}
default: {
PanicInfo("unsupported datatype");
}
}
return data_array;
}
std::unique_ptr<DataArray>
CreateDataArrayFrom(const void* data_raw, int64_t count, const FieldMeta& field_meta) {
auto data_type = field_meta.get_data_type();
if (!datatype_is_vector(data_type)) {
return CreateScalarDataArrayFrom(data_raw, count, field_meta);
}
return CreateVectorDataArrayFrom(data_raw, count, field_meta);
}
// TODO remove merge dataArray, instead fill target entity when get data slice
std::unique_ptr<DataArray>
MergeDataArray(std::vector<std::pair<milvus::SearchResult*, int64_t>>& result_offsets, const FieldMeta& field_meta) {
auto data_type = field_meta.get_data_type();
auto data_array = std::make_unique<DataArray>();
data_array->set_field_id(field_meta.get_id().get());
data_array->set_type(milvus::proto::schema::DataType(field_meta.get_data_type()));
for (auto& result_pair : result_offsets) {
auto src_field_data = result_pair.first->output_fields_data_[field_meta.get_id()].get();
auto src_offset = result_pair.second;
AssertInfo(data_type == DataType(src_field_data->type()), "merge field data type not consistent");
if (field_meta.is_vector()) {
auto vector_array = data_array->mutable_vectors();
auto dim = field_meta.get_dim();
vector_array->set_dim(dim);
if (field_meta.get_data_type() == DataType::VECTOR_FLOAT) {
auto data = src_field_data->vectors().float_vector().data().data();
auto obj = vector_array->mutable_float_vector();
obj->mutable_data()->Add(data + src_offset * dim, data + (src_offset + 1) * dim);
} else if (field_meta.get_data_type() == DataType::VECTOR_BINARY) {
AssertInfo(dim % 8 == 0, "Binary vector field dimension is not a multiple of 8");
auto num_bytes = dim / 8;
auto data = src_field_data->vectors().binary_vector().data();
auto obj = vector_array->mutable_binary_vector();
obj->assign(data + src_offset * num_bytes, num_bytes);
} else {
PanicInfo("logical error");
}
continue;
}
auto scalar_array = data_array->mutable_scalars();
switch (data_type) {
case DataType::BOOL: {
auto data = src_field_data->scalars().bool_data().data().data();
auto obj = scalar_array->mutable_bool_data();
*(obj->mutable_data()->Add()) = data[src_offset];
continue;
}
case DataType::INT8:
case DataType::INT16:
case DataType::INT32: {
auto data = src_field_data->scalars().int_data().data().data();
auto obj = scalar_array->mutable_int_data();
*(obj->mutable_data()->Add()) = data[src_offset];
continue;
}
case DataType::INT64: {
auto data = src_field_data->scalars().long_data().data().data();
auto obj = scalar_array->mutable_long_data();
*(obj->mutable_data()->Add()) = data[src_offset];
continue;
}
case DataType::FLOAT: {
auto data = src_field_data->scalars().float_data().data().data();
auto obj = scalar_array->mutable_float_data();
*(obj->mutable_data()->Add()) = data[src_offset];
continue;
}
case DataType::DOUBLE: {
auto data = src_field_data->scalars().double_data().data().data();
auto obj = scalar_array->mutable_double_data();
*(obj->mutable_data()->Add()) = data[src_offset];
continue;
}
case DataType::VARCHAR: {
auto data = src_field_data->scalars().string_data();
auto obj = scalar_array->mutable_string_data();
*(obj->mutable_data()->Add()) = data.data(src_offset);
continue;
}
default: {
PanicInfo("unsupported datatype");
}
}
}
return data_array;
}
} // namespace milvus::segcore

View File

@ -12,8 +12,12 @@
#include <stdlib.h>
#include <string>
#include <exception>
#include <memory>
#include <utility>
#include <vector>
#include <stdexcept>
#include <knowhere/common/MetricType.h>
#include "common/QueryResult.h"
namespace milvus::segcore {
@ -51,4 +55,28 @@ MetricTypeToString(faiss::MetricType metric_type) {
}
}
void
ParsePksFromFieldData(std::vector<PkType>& pks, const DataArray& data);
void
ParsePksFromIDs(std::vector<PkType>& pks, DataType data_type, const IdArray& data);
int64_t
GetSizeOfIdArray(const IdArray& data);
// Note: this is temporary solution.
// modify bulk script implement to make process more clear
std::unique_ptr<DataArray>
CreateScalarDataArrayFrom(const void* data_raw, int64_t count, const FieldMeta& field_meta);
std::unique_ptr<DataArray>
CreateVectorDataArrayFrom(const void* data_raw, int64_t count, const FieldMeta& field_meta);
std::unique_ptr<DataArray>
CreateDataArrayFrom(const void* data_raw, int64_t count, const FieldMeta& field_meta);
// TODO remove merge dataArray, instead fill target entity when get data slice
std::unique_ptr<DataArray>
MergeDataArray(std::vector<std::pair<milvus::SearchResult*, int64_t>>& result_offsets, const FieldMeta& field_meta);
} // namespace milvus::segcore

View File

@ -14,6 +14,8 @@
#include "knowhere/common/BinarySet.h"
#include "knowhere/index/vector_index/VecIndexFactory.h"
#include "segcore/load_index_c.h"
#include "index/IndexFactory.h"
#include "common/CDataType.h"
CStatus
NewLoadIndexInfo(CLoadIndexInfo* c_load_index_info) {
@ -59,10 +61,11 @@ AppendIndexParam(CLoadIndexInfo c_load_index_info, const char* c_index_key, cons
}
CStatus
AppendFieldInfo(CLoadIndexInfo c_load_index_info, int64_t field_id) {
AppendFieldInfo(CLoadIndexInfo c_load_index_info, int64_t field_id, enum CDataType field_type) {
try {
auto load_index_info = (LoadIndexInfo*)c_load_index_info;
load_index_info->field_id = field_id;
load_index_info->field_type = field_type;
auto status = CStatus();
status.error_code = Success;
@ -77,7 +80,7 @@ AppendFieldInfo(CLoadIndexInfo c_load_index_info, int64_t field_id) {
}
CStatus
AppendIndex(CLoadIndexInfo c_load_index_info, CBinarySet c_binary_set) {
appendVecIndex(CLoadIndexInfo c_load_index_info, CBinarySet c_binary_set) {
try {
auto load_index_info = (LoadIndexInfo*)c_load_index_info;
auto binary_set = (knowhere::BinarySet*)c_binary_set;
@ -107,3 +110,37 @@ AppendIndex(CLoadIndexInfo c_load_index_info, CBinarySet c_binary_set) {
return status;
}
}
CStatus
appendScalarIndex(CLoadIndexInfo c_load_index_info, CBinarySet c_binary_set) {
try {
auto load_index_info = (LoadIndexInfo*)c_load_index_info;
auto field_type = load_index_info->field_type;
auto binary_set = (knowhere::BinarySet*)c_binary_set;
auto& index_params = load_index_info->index_params;
bool find_index_type = index_params.count("index_type") > 0 ? true : false;
AssertInfo(find_index_type == true, "Can't find index type in index_params");
load_index_info->index =
milvus::scalar::IndexFactory::GetInstance().CreateIndex(field_type, index_params["index_type"]);
load_index_info->index->Load(*binary_set);
auto status = CStatus();
status.error_code = Success;
status.error_msg = "";
return status;
} catch (std::exception& e) {
auto status = CStatus();
status.error_code = UnexpectedError;
status.error_msg = strdup(e.what());
return status;
}
}
CStatus
AppendIndex(CLoadIndexInfo c_load_index_info, CBinarySet c_binary_set) {
auto load_index_info = (LoadIndexInfo*)c_load_index_info;
auto field_type = load_index_info->field_type;
if (milvus::IsVectorType(field_type)) {
return appendVecIndex(c_load_index_info, c_binary_set);
}
return appendScalarIndex(c_load_index_info, c_binary_set);
}

View File

@ -33,7 +33,7 @@ CStatus
AppendIndexParam(CLoadIndexInfo c_load_index_info, const char* index_key, const char* index_value);
CStatus
AppendFieldInfo(CLoadIndexInfo c_load_index_info, int64_t field_id);
AppendFieldInfo(CLoadIndexInfo c_load_index_info, int64_t field_id, enum CDataType field_type);
CStatus
AppendIndex(CLoadIndexInfo c_load_index_info, CBinarySet c_binary_set);

View File

@ -25,6 +25,7 @@
#include "segcore/ReduceStructure.h"
#include "segcore/SegmentInterface.h"
#include "segcore/reduce_c.h"
#include "segcore/Utils.h"
using SearchResult = milvus::SearchResult;
@ -40,32 +41,36 @@ using SearchResult = milvus::SearchResult;
void
ReduceResultData(std::vector<SearchResult*>& search_results, int64_t nq, int64_t topk) {
AssertInfo(topk > 0, "topk must greater than 0");
auto num_segments = search_results.size();
AssertInfo(num_segments > 0, "num segment must greater than 0");
for (int i = 0; i < num_segments; i++) {
auto search_result = search_results[i];
auto result_count = search_result->get_total_result_count();
AssertInfo(search_result != nullptr, "search result must not equal to nullptr");
AssertInfo(search_result->primary_keys_.size() == nq * topk, "incorrect search result primary key size");
AssertInfo(search_result->distances_.size() == nq * topk, "incorrect search result distance size");
AssertInfo(search_result->primary_keys_.size() == result_count, "incorrect search result primary key size");
AssertInfo(search_result->distances_.size() == result_count, "incorrect search result distance size");
}
std::vector<std::vector<int64_t>> final_real_topks(num_segments);
for (auto& topks : final_real_topks) {
topks.resize(nq);
}
std::vector<std::vector<int64_t>> search_records(num_segments);
std::unordered_set<int64_t> pk_set;
std::unordered_set<milvus::PkType> pk_set;
int64_t skip_dup_cnt = 0;
// reduce search results
int64_t result_offset = 0;
for (int64_t qi = 0; qi < nq; qi++) {
std::vector<SearchResultPair> result_pairs;
int64_t base_offset = qi * topk;
for (int i = 0; i < num_segments; i++) {
auto search_result = search_results[i];
auto base_offset = search_result->get_result_count(qi);
auto primary_key = search_result->primary_keys_[base_offset];
auto distance = search_result->distances_[base_offset];
result_pairs.push_back(
SearchResultPair(primary_key, distance, search_result, i, base_offset, base_offset + topk));
result_pairs.push_back(SearchResultPair(primary_key, distance, search_result, i, base_offset,
base_offset + search_result->real_topK_per_nq_[qi]));
}
int64_t curr_offset = base_offset;
#if 0
for (int i = 0; i < topk; ++i) {
@ -78,20 +83,22 @@ ReduceResultData(std::vector<SearchResult*>& search_results, int64_t nq, int64_t
}
#else
pk_set.clear();
while (curr_offset - base_offset < topk) {
int64_t last_nq_result_offset = result_offset;
while (result_offset - last_nq_result_offset < topk) {
std::sort(result_pairs.begin(), result_pairs.end(), std::greater<>());
auto& pilot = result_pairs[0];
auto index = pilot.index_;
int64_t curr_pk = pilot.primary_key_;
auto curr_pk = pilot.primary_key_;
// no valid search result for this nq, break to next
if (curr_pk == INVALID_PK) {
break;
}
// remove duplicates
if (curr_pk == INVALID_ID || pk_set.count(curr_pk) == 0) {
pilot.search_result_->result_offsets_.push_back(curr_offset++);
// when inserted data are dirty, it's possible that primary keys are duplicated,
// in this case, "offset_" may be greater than "offset_rb_" (#10530)
search_records[index].push_back(pilot.offset_ < pilot.offset_rb_ ? pilot.offset_ : INVALID_OFFSET);
if (curr_pk != INVALID_ID) {
pk_set.insert(curr_pk);
}
if (pk_set.count(curr_pk) == 0) {
pilot.search_result_->result_offsets_.push_back(result_offset++);
search_records[index].push_back(pilot.offset_);
pk_set.insert(curr_pk);
final_real_topks[index][qi]++;
} else {
// skip entity with same primary key
skip_dup_cnt++;
@ -109,123 +116,167 @@ ReduceResultData(std::vector<SearchResult*>& search_results, int64_t nq, int64_t
continue;
}
std::vector<int64_t> primary_keys;
std::vector<milvus::PkType> primary_keys;
std::vector<float> distances;
std::vector<int64_t> ids;
for (int j = 0; j < search_records[i].size(); j++) {
auto& offset = search_records[i][j];
primary_keys.push_back(offset != INVALID_OFFSET ? search_result->primary_keys_[offset] : INVALID_ID);
distances.push_back(offset != INVALID_OFFSET ? search_result->distances_[offset]
: std::numeric_limits<float>::max());
ids.push_back(offset != INVALID_OFFSET ? search_result->ids_[offset] : INVALID_ID);
primary_keys.push_back(search_result->primary_keys_[offset]);
distances.push_back(search_result->distances_[offset]);
ids.push_back(search_result->seg_offsets_[offset]);
}
search_result->primary_keys_ = primary_keys;
search_result->distances_ = distances;
search_result->ids_ = ids;
search_result->primary_keys_ = std::move(primary_keys);
search_result->distances_ = std::move(distances);
search_result->seg_offsets_ = std::move(ids);
search_result->real_topK_per_nq_ = std::move(final_real_topks[i]);
}
}
void
ReorganizeSearchResults(std::vector<SearchResult*>& search_results,
int32_t nq,
int32_t topK,
milvus::aligned_vector<int64_t>& result_ids,
std::vector<float>& result_distances,
std::vector<milvus::aligned_vector<char>>& result_output_fields_data) {
auto num_segments = search_results.size();
auto results_count = 0;
for (int i = 0; i < num_segments; i++) {
auto search_result = search_results[i];
AssertInfo(search_result != nullptr, "null search result when reorganize");
AssertInfo(search_result->output_fields_meta_.size() == result_output_fields_data.size(),
"illegal fields meta size"
", fields_meta_size = " +
std::to_string(search_result->output_fields_meta_.size()) +
", expected_size = " + std::to_string(result_output_fields_data.size()));
auto num_results = search_result->result_offsets_.size();
if (num_results == 0) {
continue;
}
#pragma omp parallel for
for (int j = 0; j < num_results; j++) {
auto loc = search_result->result_offsets_[j];
// AssertInfo(loc < nq * topK, "result location of out range, location = " +
// std::to_string(loc));
// set result ids
memcpy(&result_ids[loc], &search_result->ids_data_[j * sizeof(int64_t)], sizeof(int64_t));
// set result distances
result_distances[loc] = search_result->distances_[j];
// set result output fields data
for (int k = 0; k < search_result->output_fields_meta_.size(); k++) {
auto ele_size = search_result->output_fields_meta_[k].get_sizeof();
memcpy(&result_output_fields_data[k][loc * ele_size],
&search_result->output_fields_data_[k][j * ele_size], ele_size);
}
}
results_count += num_results;
struct Int64PKVisitor {
template <typename T>
int64_t
operator()(T t) const {
PanicInfo("invalid int64 pk value");
}
};
AssertInfo(results_count == nq * topK,
"size of reduce result is less than nq * topK"
", result_count = " +
std::to_string(results_count) + ", nq * topK = " + std::to_string(nq * topK));
template <>
int64_t
Int64PKVisitor::operator()<int64_t>(int64_t t) const {
return t;
}
struct StrPKVisitor {
template <typename T>
std::string
operator()(T t) const {
PanicInfo("invalid string pk value");
}
};
template <>
std::string
StrPKVisitor::operator()<std::string>(std::string t) const {
return t;
}
std::vector<char>
GetSearchResultDataSlice(milvus::aligned_vector<int64_t>& result_ids,
std::vector<float>& result_distances,
std::vector<milvus::aligned_vector<char>>& result_output_fields_data,
int32_t nq,
int32_t topK,
int32_t nq_begin,
int32_t nq_end,
std::vector<milvus::FieldMeta>& output_fields_meta) {
GetSearchResultDataSlice(std::vector<SearchResult*>& search_results,
milvus::query::Plan* plan,
int64_t nq_offset_begin,
int64_t nq_offset_end,
int64_t result_offset_begin,
int64_t result_offset_end,
int64_t nq,
int64_t topK) {
AssertInfo(nq_offset_begin <= nq_offset_end,
"illegal offsets when GetSearchResultDataSlice, nq_offset_begin = " + std::to_string(nq_offset_begin) +
", nq_offset_end = " + std::to_string(nq_offset_end));
AssertInfo(nq_offset_end <= nq, "illegal nq_offset_end when GetSearchResultDataSlice, nq_offset_end = " +
std::to_string(nq_offset_end) + ", nq = " + std::to_string(nq));
AssertInfo(result_offset_begin <= result_offset_end,
"illegal result offsets when GetSearchResultDataSlice, result_offset_begin = " +
std::to_string(result_offset_begin) + ", result_offset_end = " + std::to_string(result_offset_end));
AssertInfo(result_offset_end <= nq * topK,
"illegal result_offset_end when GetSearchResultDataSlice, result_offset_end = " +
std::to_string(result_offset_end) + ", nq = " + std::to_string(nq) +
", topk = " + std::to_string(topK));
auto search_result_data = std::make_unique<milvus::proto::schema::SearchResultData>();
// set topK and nq
search_result_data->set_top_k(topK);
search_result_data->set_num_queries(nq);
search_result_data->set_num_queries(nq_offset_end - nq_offset_begin);
search_result_data->mutable_topks()->Resize(nq_offset_end - nq_offset_begin, 0);
auto offset_begin = nq_begin * topK;
auto offset_end = nq_end * topK;
AssertInfo(offset_begin <= offset_end,
"illegal offsets when GetSearchResultDataSlice"
", offset_begin = " +
std::to_string(offset_begin) + ", offset_end = " + std::to_string(offset_end));
AssertInfo(offset_end <= topK * nq,
"illegal offset_end when GetSearchResultDataSlice"
", offset_end = " +
std::to_string(offset_end) + ", nq = " + std::to_string(nq) + ", topK = " + std::to_string(topK));
auto num_segments = search_results.size();
auto total_result_count = result_offset_end - result_offset_begin;
// set ids
auto proto_ids = std::make_unique<milvus::proto::schema::IDs>();
auto ids = std::make_unique<milvus::proto::schema::LongArray>();
*ids->mutable_data() = {result_ids.begin() + offset_begin, result_ids.begin() + offset_end};
proto_ids->set_allocated_int_id(ids.release());
search_result_data->set_allocated_ids(proto_ids.release());
AssertInfo(search_result_data->ids().int_id().data_size() == offset_end - offset_begin,
"wrong ids size"
", size = " +
std::to_string(search_result_data->ids().int_id().data_size()) +
", expected size = " + std::to_string(offset_end - offset_begin));
// use for fill field data
std::vector<std::pair<SearchResult*, int64_t>> result_offsets(total_result_count);
// set scores
*search_result_data->mutable_scores() = {result_distances.begin() + offset_begin,
result_distances.begin() + offset_end};
AssertInfo(search_result_data->scores_size() == offset_end - offset_begin,
// reverse space for pks
auto primary_field_id = plan->schema_.get_primary_field_id().value_or(milvus::FieldId(-1));
AssertInfo(primary_field_id.get() != INVALID_FIELD_ID, "Primary key is -1");
auto pk_type = plan->schema_[primary_field_id].get_data_type();
switch (pk_type) {
case milvus::DataType::INT64: {
auto ids = std::make_unique<milvus::proto::schema::LongArray>();
ids->mutable_data()->Resize(total_result_count, 0);
search_result_data->mutable_ids()->set_allocated_int_id(ids.release());
break;
}
case milvus::DataType::VARCHAR: {
auto ids = std::make_unique<milvus::proto::schema::StringArray>();
std::vector<std::string> string_pks(total_result_count);
*ids->mutable_data() = {string_pks.begin(), string_pks.end()};
search_result_data->mutable_ids()->set_allocated_str_id(ids.release());
break;
}
default: {
PanicInfo("unsupported primary key type");
}
}
// reverse space for distances
search_result_data->mutable_scores()->Resize(total_result_count, 0);
// fill pks and distances
for (auto nq_offset = nq_offset_begin; nq_offset < nq_offset_end; nq_offset++) {
int64_t result_count = 0;
for (int i = 0; i < num_segments; i++) {
auto search_result = search_results[i];
AssertInfo(search_result != nullptr, "null search result when reorganize");
if (search_result->result_offsets_.size() == 0) {
continue;
}
auto seg_result_offset_start = search_result->get_result_count(nq_offset);
auto seg_result_offset_end = seg_result_offset_start + search_result->real_topK_per_nq_[nq_offset];
for (auto j = seg_result_offset_start; j < seg_result_offset_end; j++) {
auto loc = search_result->result_offsets_[j] - result_offset_begin;
// set result pks
switch (pk_type) {
case milvus::DataType::INT64: {
search_result_data->mutable_ids()->mutable_int_id()->mutable_data()->Set(
loc, std::visit(Int64PKVisitor{}, search_result->primary_keys_[j]));
break;
}
case milvus::DataType::VARCHAR: {
*search_result_data->mutable_ids()->mutable_str_id()->mutable_data()->Mutable(loc) =
std::visit(StrPKVisitor{}, search_result->primary_keys_[j]);
break;
}
default: {
PanicInfo("unsupported primary key type");
}
}
// set result distances
search_result_data->mutable_scores()->Set(loc, search_result->distances_[j]);
// set result offset to fill output fields data
result_offsets[loc] = std::make_pair(search_result, j);
}
result_count += search_result->real_topK_per_nq_[nq_offset];
}
// update result topks
search_result_data->mutable_topks()->Set(nq_offset - nq_offset_begin, result_count);
}
AssertInfo(search_result_data->scores_size() == total_result_count,
"wrong scores size"
", size = " +
std::to_string(search_result_data->scores_size()) +
", expected size = " + std::to_string(offset_end - offset_begin));
", expected size = " + std::to_string(total_result_count));
// set output fields
for (int i = 0; i < result_output_fields_data.size(); i++) {
auto& field_meta = output_fields_meta[i];
auto field_size = field_meta.get_sizeof();
auto array = milvus::segcore::CreateDataArrayFrom(
result_output_fields_data[i].data() + offset_begin * field_size, offset_end - offset_begin, field_meta);
search_result_data->mutable_fields_data()->AddAllocated(array.release());
for (auto field_id : plan->target_entries_) {
auto& field_meta = plan->schema_[field_id];
auto field_data = milvus::segcore::MergeDataArray(result_offsets, field_meta);
search_result_data->mutable_fields_data()->AddAllocated(field_data.release());
}
// SearchResultData to blob
@ -239,6 +290,7 @@ GetSearchResultDataSlice(milvus::aligned_vector<int64_t>& result_ids,
CStatus
Marshal(CSearchResultDataBlobs* cSearchResultDataBlobs,
CSearchResult* c_search_results,
CSearchPlan c_plan,
int32_t num_segments,
int32_t* nq_slice_sizes,
int32_t num_slices) {
@ -249,46 +301,44 @@ Marshal(CSearchResultDataBlobs* cSearchResultDataBlobs,
search_results[i] = static_cast<SearchResult*>(c_search_results[i]);
}
AssertInfo(search_results.size() > 0, "empty search result when Marshal");
auto plan = (milvus::query::Plan*)c_plan;
auto topK = search_results[0]->topk_;
auto nq = search_results[0]->num_queries_;
// init result ids, distances
auto result_ids = milvus::aligned_vector<int64_t>(nq * topK);
auto result_distances = std::vector<float>(nq * topK);
// init result output fields data
auto& output_fields_meta = search_results[0]->output_fields_meta_;
auto num_output_fields = output_fields_meta.size();
auto result_output_fields_data = std::vector<milvus::aligned_vector<char>>(num_output_fields);
for (int i = 0; i < num_output_fields; i++) {
auto size = output_fields_meta[i].get_sizeof();
result_output_fields_data[i].resize(size * nq * topK);
std::vector<int64_t> result_count_per_nq(nq);
for (auto search_result : search_results) {
AssertInfo(search_result->real_topK_per_nq_.size() == nq,
"incorrect real_topK_per_nq_ size in search result");
for (int j = 0; j < nq; j++) {
result_count_per_nq[j] += search_result->real_topK_per_nq_[j];
}
}
// Reorganize search results, get result ids, distances and output fields data
ReorganizeSearchResults(search_results, nq, topK, result_ids, result_distances, result_output_fields_data);
// prefix sum, get slices offsets
AssertInfo(num_slices > 0, "empty nq_slice_sizes is not allowed");
auto slice_offsets_size = num_slices + 1;
auto slice_offsets = std::vector<int32_t>(slice_offsets_size);
slice_offsets[0] = 0;
slice_offsets[1] = nq_slice_sizes[0];
for (int i = 2; i < slice_offsets_size; i++) {
slice_offsets[i] = slice_offsets[i - 1] + nq_slice_sizes[i - 1];
auto nq_slice_offsets = std::vector<int32_t>(slice_offsets_size);
auto result_slice_offset = std::vector<int64_t>(slice_offsets_size);
for (int i = 1; i < slice_offsets_size; i++) {
nq_slice_offsets[i] = nq_slice_offsets[i - 1] + nq_slice_sizes[i - 1];
result_slice_offset[i] = result_slice_offset[i - 1];
for (auto j = nq_slice_offsets[i - 1]; j < nq_slice_offsets[i]; j++) {
result_slice_offset[i] += result_count_per_nq[j];
}
}
AssertInfo(slice_offsets[num_slices] == nq,
AssertInfo(nq_slice_offsets[num_slices] == nq,
"illegal req sizes"
", slice_offsets[last] = " +
std::to_string(slice_offsets[num_slices]) + ", nq = " + std::to_string(nq));
", nq_slice_offsets[last] = " +
std::to_string(nq_slice_offsets[num_slices]) + ", nq = " + std::to_string(nq));
// get search result data blobs by slices
auto search_result_data_blobs = std::make_unique<milvus::segcore::SearchResultDataBlobs>();
search_result_data_blobs->blobs.resize(num_slices);
#pragma omp parallel for
//#pragma omp parallel for
for (int i = 0; i < num_slices; i++) {
auto proto = GetSearchResultDataSlice(result_ids, result_distances, result_output_fields_data, nq, topK,
slice_offsets[i], slice_offsets[i + 1], output_fields_meta);
auto proto = GetSearchResultDataSlice(search_results, plan, nq_slice_offsets[i], nq_slice_offsets[i + 1],
result_slice_offset[i], result_slice_offset[i + 1], nq, topK);
search_result_data_blobs->blobs[i] = proto;
}
@ -328,6 +378,36 @@ DeleteSearchResultDataBlobs(CSearchResultDataBlobs cSearchResultDataBlobs) {
delete search_result_data_blobs;
}
void
FilterInvalidSearchResult(SearchResult* search_result) {
auto nq = search_result->num_queries_;
auto topk = search_result->topk_;
AssertInfo(search_result->seg_offsets_.size() == nq * topk,
"wrong seg offsets size, size = " + std::to_string(search_result->seg_offsets_.size()) +
", expected size = " + std::to_string(nq * topk));
AssertInfo(search_result->distances_.size() == nq * topk,
"wrong distances size, size = " + std::to_string(search_result->distances_.size()) +
", expected size = " + std::to_string(nq * topk));
std::vector<int64_t> real_topks(nq);
std::vector<float> distances;
std::vector<int64_t> seg_offsets;
for (auto i = 0; i < nq; i++) {
real_topks[i] = 0;
for (auto j = 0; j < topk; j++) {
auto offset = i * topk + j;
if (search_result->seg_offsets_[offset] != INVALID_SEG_OFFSET) {
real_topks[i]++;
seg_offsets.push_back(search_result->seg_offsets_[offset]);
distances.push_back(search_result->distances_[offset]);
}
}
}
search_result->distances_ = std::move(distances);
search_result->seg_offsets_ = std::move(seg_offsets);
search_result->real_topK_per_nq_ = std::move(real_topks);
}
CStatus
ReduceSearchResultsAndFillData(CSearchPlan c_plan, CSearchResult* c_search_results, int64_t num_segments) {
try {
@ -339,13 +419,20 @@ ReduceSearchResultsAndFillData(CSearchPlan c_plan, CSearchResult* c_search_resul
auto topk = search_results[0]->topk_;
auto num_queries = search_results[0]->num_queries_;
std::vector<SearchResult*> valid_search_results;
// get primary keys for duplicates removal
for (auto& search_result : search_results) {
for (auto search_result : search_results) {
auto segment = (milvus::segcore::SegmentInterface*)(search_result->segment_);
FilterInvalidSearchResult(search_result);
segment->FillPrimaryKeys(plan, *search_result);
if (search_result->get_total_result_count() > 0) {
valid_search_results.push_back(search_result);
}
}
ReduceResultData(search_results, num_queries, topk);
if (valid_search_results.size() > 0) {
ReduceResultData(valid_search_results, num_queries, topk);
}
// fill in other entities
for (auto& search_result : search_results) {

View File

@ -25,6 +25,7 @@ ReduceSearchResultsAndFillData(CSearchPlan c_plan, CSearchResult* search_results
CStatus
Marshal(CSearchResultDataBlobs* cSearchResultDataBlobs,
CSearchResult* c_search_results,
CSearchPlan c_plan,
int32_t num_segments,
int32_t* nq_slice_sizes,
int32_t num_slices);

View File

@ -20,6 +20,7 @@
#include "segcore/SegmentSealedImpl.h"
#include "segcore/SimilarityCorelation.h"
#include "segcore/segment_c.h"
#include "google/protobuf/text_format.h"
////////////////////////////// common interfaces //////////////////////////////
CSegmentInterface
@ -133,54 +134,16 @@ Insert(CSegmentInterface c_segment,
int64_t size,
const int64_t* row_ids,
const uint64_t* timestamps,
void* raw_data,
int sizeof_per_row,
int64_t count) {
const char* data_info) {
try {
auto segment = (milvus::segcore::SegmentGrowing*)c_segment;
milvus::segcore::RowBasedRawData dataChunk{};
auto proto = std::string(data_info);
Assert(!proto.empty());
auto insert_data = std::make_unique<milvus::InsertData>();
auto suc = google::protobuf::TextFormat::ParseFromString(proto, insert_data.get());
AssertInfo(suc, "unmarshal field data string failed");
dataChunk.raw_data = raw_data;
dataChunk.sizeof_per_row = sizeof_per_row;
dataChunk.count = count;
segment->Insert(reserved_offset, size, row_ids, timestamps, dataChunk);
return milvus::SuccessCStatus();
} catch (std::exception& e) {
return milvus::FailureCStatus(UnexpectedError, e.what());
}
}
CStatus
InsertColumnData(CSegmentInterface c_segment,
int64_t reserved_offset,
int64_t size,
const int64_t* row_ids,
const uint64_t* timestamps,
void* raw_data,
int64_t count) {
try {
auto segment = (milvus::segcore::SegmentGrowing*)c_segment;
milvus::segcore::ColumnBasedRawData dataChunk{};
auto& schema = segment->get_schema();
auto sizeof_infos = schema.get_sizeof_infos();
dataChunk.columns_ = std::vector<milvus::aligned_vector<uint8_t>>(schema.size());
// reverse space for each field
for (int fid = 0; fid < schema.size(); ++fid) {
auto len = sizeof_infos[fid];
dataChunk.columns_[fid].resize(len * size);
}
auto col_data = reinterpret_cast<const char*>(raw_data);
int64_t offset = 0;
for (int fid = 0; fid < schema.size(); ++fid) {
auto len = sizeof_infos[fid] * size;
auto src = col_data + offset;
auto dst = dataChunk.columns_[fid].data();
memcpy(dst, src, len);
offset += len;
}
dataChunk.count = count;
segment->Insert(reserved_offset, size, row_ids, timestamps, dataChunk);
segment->Insert(reserved_offset, size, row_ids, timestamps, insert_data.get());
return milvus::SuccessCStatus();
} catch (std::exception& e) {
return milvus::FailureCStatus(UnexpectedError, e.what());
@ -199,15 +162,16 @@ PreInsert(CSegmentInterface c_segment, int64_t size, int64_t* offset) {
}
CStatus
Delete(CSegmentInterface c_segment,
int64_t reserved_offset,
int64_t size,
const int64_t* row_ids,
const uint64_t* timestamps) {
Delete(
CSegmentInterface c_segment, int64_t reserved_offset, int64_t size, const char* ids, const uint64_t* timestamps) {
auto segment = (milvus::segcore::SegmentInterface*)c_segment;
auto proto = std::string(ids);
Assert(!proto.empty());
auto pks = std::make_unique<milvus::proto::schema::IDs>();
auto suc = google::protobuf::TextFormat::ParseFromString(proto, pks.get());
AssertInfo(suc, "unmarshal field data string failed");
try {
auto res = segment->Delete(reserved_offset, size, row_ids, timestamps);
auto res = segment->Delete(reserved_offset, size, pks.get(), timestamps);
return milvus::SuccessCStatus();
} catch (std::exception& e) {
return milvus::FailureCStatus(UnexpectedError, e.what());
@ -228,8 +192,13 @@ LoadFieldData(CSegmentInterface c_segment, CLoadFieldDataInfo load_field_data_in
auto segment_interface = reinterpret_cast<milvus::segcore::SegmentInterface*>(c_segment);
auto segment = dynamic_cast<milvus::segcore::SegmentSealed*>(segment_interface);
AssertInfo(segment != nullptr, "segment conversion failed");
auto proto = std::string(load_field_data_info.blob);
Assert(!proto.empty());
auto field_data = std::make_unique<milvus::DataArray>();
auto suc = google::protobuf::TextFormat::ParseFromString(proto, field_data.get());
AssertInfo(suc, "unmarshal field data string failed");
auto load_info =
LoadFieldDataInfo{load_field_data_info.field_id, load_field_data_info.blob, load_field_data_info.row_count};
LoadFieldDataInfo{load_field_data_info.field_id, field_data.get(), load_field_data_info.row_count};
segment->LoadFieldData(load_info);
return milvus::SuccessCStatus();
} catch (std::exception& e) {
@ -243,8 +212,13 @@ LoadDeletedRecord(CSegmentInterface c_segment, CLoadDeletedRecordInfo deleted_re
auto segment_interface = reinterpret_cast<milvus::segcore::SegmentInterface*>(c_segment);
auto segment = dynamic_cast<milvus::segcore::SegmentSealed*>(segment_interface);
AssertInfo(segment != nullptr, "segment conversion failed");
auto load_info = LoadDeletedRecordInfo{deleted_record_info.timestamps, deleted_record_info.primary_keys,
deleted_record_info.row_count};
auto proto = std::string(deleted_record_info.primary_keys);
Assert(!proto.empty());
auto pks = std::make_unique<milvus::proto::schema::IDs>();
auto suc = google::protobuf::TextFormat::ParseFromString(proto, pks.get());
AssertInfo(suc, "unmarshal field data string failed");
auto load_info =
LoadDeletedRecordInfo{deleted_record_info.timestamps, pks.get(), deleted_record_info.row_count};
segment->LoadDeletedRecord(load_info);
return milvus::SuccessCStatus();
} catch (std::exception& e) {

View File

@ -67,32 +67,11 @@ Insert(CSegmentInterface c_segment,
int64_t size,
const int64_t* row_ids,
const uint64_t* timestamps,
void* raw_data,
int sizeof_per_row,
int64_t count);
CStatus
InsertColumnData(CSegmentInterface c_segment,
int64_t reserved_offset,
int64_t size,
const int64_t* row_ids,
const uint64_t* timestamps,
void* raw_data,
int64_t count);
const char* data_info);
CStatus
PreInsert(CSegmentInterface c_segment, int64_t size, int64_t* offset);
CStatus
Delete(CSegmentInterface c_segment,
int64_t reserved_offset,
int64_t size,
const int64_t* row_ids,
const uint64_t* timestamps);
int64_t
PreDelete(CSegmentInterface c_segment, int64_t size);
////////////////////////////// interfaces for sealed segment //////////////////////////////
CStatus
LoadFieldData(CSegmentInterface c_segment, CLoadFieldDataInfo load_field_data_info);
@ -109,6 +88,12 @@ DropFieldData(CSegmentInterface c_segment, int64_t field_id);
CStatus
DropSealedSegmentIndex(CSegmentInterface c_segment, int64_t field_id);
////////////////////////////// interfaces for SegmentInterface //////////////////////////////
CStatus
Delete(CSegmentInterface c_segment, int64_t reserved_offset, int64_t size, const char* ids, const uint64_t* timestamps);
int64_t
PreDelete(CSegmentInterface c_segment, int64_t size);
#ifdef __cplusplus
}
#endif

View File

@ -41,6 +41,7 @@ enum class DataType {
DOUBLE = 11,
STRING = 20,
VARCHAR = 21,
VECTOR_BINARY = 100,
VECTOR_FLOAT = 101,

View File

@ -33,12 +33,14 @@ set(MILVUS_TEST_FILES
test_query.cpp
test_reduce.cpp
test_reduce_c.cpp
test_relational.cpp
test_retrieve.cpp
test_scalar_index.cpp
test_sealed.cpp
test_segcore.cpp
test_similarity_corelation.cpp
test_span.cpp
test_string_expr.cpp
test_timestamp_index.cpp
test_utils.cpp
)

View File

@ -21,6 +21,7 @@
#include "indexbuilder/index_c.h"
#include "indexbuilder/utils.h"
#include "test_utils/indexbuilder_test_utils.h"
#include "common/Consts.h"
constexpr int64_t NB = 1000000;
@ -59,7 +60,7 @@ IndexBuilder_build(benchmark::State& state) {
auto is_binary = state.range(2);
auto dataset = GenDataset(NB, metric_type, is_binary);
auto xb_data = dataset.get_col<float>(0);
auto xb_data = dataset.get_col<float>(milvus::FieldId(START_USER_FIELDID));
auto xb_dataset = knowhere::GenDataset(NB, DIM, xb_data.data());
for (auto _ : state) {
@ -88,7 +89,7 @@ IndexBuilder_build_and_codec(benchmark::State& state) {
auto is_binary = state.range(2);
auto dataset = GenDataset(NB, metric_type, is_binary);
auto xb_data = dataset.get_col<float>(0);
auto xb_data = dataset.get_col<float>(milvus::FieldId(100));
auto xb_dataset = knowhere::GenDataset(NB, DIM, xb_data.data());
for (auto _ : state) {

View File

@ -76,10 +76,7 @@ Search_SmallIndex(benchmark::State& state) {
segment->disable_small_index();
}
segment->PreInsert(N);
ColumnBasedRawData raw_data;
raw_data.columns_ = dataset_.cols_;
raw_data.count = N;
segment->Insert(0, N, dataset_.row_ids_.data(), dataset_.timestamps_.data(), raw_data);
segment->Insert(0, N, dataset_.row_ids_.data(), dataset_.timestamps_.data(), dataset_.raw_);
Timestamp time = 10000000;
@ -104,8 +101,8 @@ Search_Sealed(benchmark::State& state) {
// Brute Force
} else if (choice == 1) {
// ivf
auto vec = (const float*)dataset_.cols_[0].data();
auto indexing = GenIndexing(N, dim, vec);
auto vec = dataset_.get_col<float>(milvus::FieldId(100));
auto indexing = GenIndexing(N, dim, vec.data());
LoadIndexInfo info;
info.index = indexing;
info.field_id = (*schema)[FieldName("fakevec")].get_id().get();

View File

@ -18,8 +18,9 @@ using namespace milvus::segcore;
TEST(Binary, Insert) {
int64_t N = 100000;
auto schema = std::make_shared<Schema>();
schema->AddDebugField("vecbin", DataType::VECTOR_BINARY, 128, MetricType::METRIC_Jaccard);
schema->AddDebugField("age", DataType::INT32);
auto vec_fid = schema->AddDebugField("vecbin", DataType::VECTOR_BINARY, 128, MetricType::METRIC_Jaccard);
auto i64_fid = schema->AddDebugField("age", DataType::INT64);
schema->set_primary_field_id(i64_fid);
auto dataset = DataGen(schema, N, 10);
auto segment = CreateGrowingSegment(schema);
auto offset = segment->PreInsert(N);

View File

@ -18,10 +18,10 @@ TEST(Bitmap, Naive) {
using namespace milvus::segcore;
auto schema = std::make_shared<Schema>();
schema->AddDebugField("height", DataType::FLOAT);
auto field_id = schema->AddDebugField("height", DataType::FLOAT);
int N = 10000;
auto raw_data = DataGen(schema, N);
auto vec = raw_data.get_col<float>(0);
auto vec = raw_data.get_col<float>(field_id);
auto sort_index = std::make_shared<scalar::ScalarIndexSort<float>>();
sort_index->Build(N, vec.data());
{

View File

@ -53,6 +53,26 @@ TEST_F(BoolIndexTest, Constructor) {
auto index = milvus::scalar::CreateBoolIndex();
}
TEST_F(BoolIndexTest, Count) {
{
auto index = milvus::scalar::CreateBoolIndex();
index->BuildWithDataset(all_true_ds);
ASSERT_EQ(n, index->Count());
}
{
auto index = milvus::scalar::CreateBoolIndex();
index->BuildWithDataset(all_false_ds);
ASSERT_EQ(n, index->Count());
}
{
auto index = milvus::scalar::CreateBoolIndex();
index->BuildWithDataset(half_ds);
ASSERT_EQ(n, index->Count());
}
}
TEST_F(BoolIndexTest, In) {
auto true_test = std::make_unique<bool>(true);
auto false_test = std::make_unique<bool>(false);

File diff suppressed because it is too large Load Diff

View File

@ -103,7 +103,7 @@ TEST(Expr, Range) {
schema->AddDebugField("age", DataType::INT32);
auto plan = CreatePlan(*schema, dsl_string);
ShowPlanNodeVisitor shower;
Assert(plan->tag2field_.at("$0") == schema->get_offset(FieldName("fakevec")));
Assert(plan->tag2field_.at("$0") == schema->get_field_id(FieldName("fakevec")));
auto out = shower.call_child(*plan->plan_node_);
std::cout << out.dump(4);
}
@ -145,7 +145,7 @@ TEST(Expr, RangeBinary) {
schema->AddDebugField("age", DataType::INT32);
auto plan = CreatePlan(*schema, dsl_string);
ShowPlanNodeVisitor shower;
Assert(plan->tag2field_.at("$0") == schema->get_offset(FieldName("fakevec")));
Assert(plan->tag2field_.at("$0") == schema->get_field_id(FieldName("fakevec")));
auto out = shower.call_child(*plan->plan_node_);
std::cout << out.dump(4);
}
@ -231,14 +231,14 @@ TEST(Expr, ShowExecutor) {
using namespace milvus::segcore;
auto node = std::make_unique<FloatVectorANNS>();
auto schema = std::make_shared<Schema>();
schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
auto field_id = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
int64_t num_queries = 100L;
auto raw_data = DataGen(schema, num_queries);
auto& info = node->search_info_;
info.metric_type_ = MetricType::METRIC_L2;
info.topk_ = 20;
info.field_offset_ = FieldOffset(0);
info.field_id_ = field_id;
node->predicate_ = std::nullopt;
ShowPlanNodeVisitor show_visitor;
PlanNodePtr base(node.release());
@ -291,8 +291,9 @@ TEST(Expr, TestRange) {
}
})";
auto schema = std::make_shared<Schema>();
schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
schema->AddDebugField("age", DataType::INT32);
auto vec_fid = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
auto i64_fid = schema->AddDebugField("age", DataType::INT64);
schema->set_primary_field_id(i64_fid);
auto seg = CreateGrowingSegment(schema);
int N = 1000;
@ -300,7 +301,7 @@ TEST(Expr, TestRange) {
int num_iters = 100;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter);
auto new_age_col = raw_data.get_col<int>(1);
auto new_age_col = raw_data.get_col<int>(i64_fid);
age_col.insert(age_col.end(), new_age_col.begin(), new_age_col.end());
seg->PreInsert(N);
seg->Insert(iter * N, N, raw_data.row_ids_.data(), raw_data.timestamps_.data(), raw_data.raw_);
@ -373,8 +374,9 @@ TEST(Expr, TestTerm) {
}
})";
auto schema = std::make_shared<Schema>();
schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
schema->AddDebugField("age", DataType::INT32);
auto vec_fid = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
auto i64_fid = schema->AddDebugField("age", DataType::INT64);
schema->set_primary_field_id(i64_fid);
auto seg = CreateGrowingSegment(schema);
int N = 1000;
@ -382,7 +384,7 @@ TEST(Expr, TestTerm) {
int num_iters = 100;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter);
auto new_age_col = raw_data.get_col<int>(1);
auto new_age_col = raw_data.get_col<int>(i64_fid);
age_col.insert(age_col.end(), new_age_col.begin(), new_age_col.end());
seg->PreInsert(N);
seg->Insert(iter * N, N, raw_data.row_ids_.data(), raw_data.timestamps_.data(), raw_data.raw_);
@ -445,7 +447,7 @@ TEST(Expr, TestSimpleDsl) {
{
Json dsl;
dsl["must"] = Json::array({vec_dsl, get_item(0), get_item(1), get_item(2, 0), get_item(3)});
testcases.emplace_back(dsl, [](int x) { return (x & 0b1111) == 0b1011; });
testcases.emplace_back(dsl, [](int64_t x) { return (x & 0b1111) == 0b1011; });
}
{
@ -453,7 +455,7 @@ TEST(Expr, TestSimpleDsl) {
Json sub_dsl;
sub_dsl["must"] = Json::array({get_item(0), get_item(1), get_item(2, 0), get_item(3)});
dsl["must"] = Json::array({sub_dsl, vec_dsl});
testcases.emplace_back(dsl, [](int x) { return (x & 0b1111) == 0b1011; });
testcases.emplace_back(dsl, [](int64_t x) { return (x & 0b1111) == 0b1011; });
}
{
@ -461,7 +463,7 @@ TEST(Expr, TestSimpleDsl) {
Json sub_dsl;
sub_dsl["should"] = Json::array({get_item(0), get_item(1), get_item(2, 0), get_item(3)});
dsl["must"] = Json::array({sub_dsl, vec_dsl});
testcases.emplace_back(dsl, [](int x) { return !!((x & 0b1111) ^ 0b0100); });
testcases.emplace_back(dsl, [](int64_t x) { return !!((x & 0b1111) ^ 0b0100); });
}
{
@ -469,19 +471,20 @@ TEST(Expr, TestSimpleDsl) {
Json sub_dsl;
sub_dsl["must_not"] = Json::array({get_item(0), get_item(1), get_item(2, 0), get_item(3)});
dsl["must"] = Json::array({sub_dsl, vec_dsl});
testcases.emplace_back(dsl, [](int x) { return (x & 0b1111) != 0b1011; });
testcases.emplace_back(dsl, [](int64_t x) { return (x & 0b1111) != 0b1011; });
}
auto schema = std::make_shared<Schema>();
schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
schema->AddDebugField("age", DataType::INT32);
auto vec_fid = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
auto i64_fid = schema->AddDebugField("age", DataType::INT64);
schema->set_primary_field_id(i64_fid);
auto seg = CreateGrowingSegment(schema);
std::vector<int> age_col;
std::vector<int64_t> age_col;
int num_iters = 100;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter);
auto new_age_col = raw_data.get_col<int>(1);
auto new_age_col = raw_data.get_col<int64_t>(i64_fid);
age_col.insert(age_col.end(), new_age_col.begin(), new_age_col.end());
seg->PreInsert(N);
seg->Insert(iter * N, N, raw_data.row_ids_.data(), raw_data.timestamps_.data(), raw_data.raw_);
@ -543,9 +546,10 @@ TEST(Expr, TestCompare) {
}
})";
auto schema = std::make_shared<Schema>();
schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
schema->AddDebugField("age1", DataType::INT32);
schema->AddDebugField("age2", DataType::INT64);
auto vec_fid = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
auto i32_fid = schema->AddDebugField("age1", DataType::INT32);
auto i64_fid = schema->AddDebugField("age2", DataType::INT64);
schema->set_primary_field_id(i64_fid);
auto seg = CreateGrowingSegment(schema);
int N = 1000;
@ -554,8 +558,8 @@ TEST(Expr, TestCompare) {
int num_iters = 100;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter);
auto new_age1_col = raw_data.get_col<int>(1);
auto new_age2_col = raw_data.get_col<int64_t>(2);
auto new_age1_col = raw_data.get_col<int>(i32_fid);
auto new_age2_col = raw_data.get_col<int64_t>(i64_fid);
age1_col.insert(age1_col.end(), new_age1_col.begin(), new_age1_col.end());
age2_col.insert(age2_col.end(), new_age2_col.begin(), new_age2_col.end());
seg->PreInsert(N);
@ -592,80 +596,93 @@ TEST(Expr, TestBinaryArithOpEvalRange) {
"right_operand": 4,
"value": 8
}
})", [](int8_t v) { return (v + 4) == 8; }, DataType::INT8},
})",
[](int8_t v) { return (v + 4) == 8; }, DataType::INT8},
{R"("EQ": {
"SUB": {
"right_operand": 500,
"value": 1500
}
})", [](int16_t v) { return (v - 500) == 1500; }, DataType::INT16},
})",
[](int16_t v) { return (v - 500) == 1500; }, DataType::INT16},
{R"("EQ": {
"MUL": {
"right_operand": 2,
"value": 4000
}
})", [](int32_t v) { return (v * 2) == 4000; }, DataType::INT32},
})",
[](int32_t v) { return (v * 2) == 4000; }, DataType::INT32},
{R"("EQ": {
"DIV": {
"right_operand": 2,
"value": 1000
}
})", [](int64_t v) { return (v / 2) == 1000; }, DataType::INT64},
})",
[](int64_t v) { return (v / 2) == 1000; }, DataType::INT64},
{R"("EQ": {
"MOD": {
"right_operand": 100,
"value": 0
}
})", [](int32_t v) { return (v % 100) == 0; }, DataType::INT32},
})",
[](int32_t v) { return (v % 100) == 0; }, DataType::INT32},
{R"("EQ": {
"ADD": {
"right_operand": 500,
"value": 2500
}
})", [](float v) { return (v + 500) == 2500; }, DataType::FLOAT},
})",
[](float v) { return (v + 500) == 2500; }, DataType::FLOAT},
{R"("EQ": {
"ADD": {
"right_operand": 500,
"value": 2500
}
})", [](double v) { return (v + 500) == 2500; }, DataType::DOUBLE},
})",
[](double v) { return (v + 500) == 2500; }, DataType::DOUBLE},
// Add test cases for BinaryArithOpEvalRangeExpr NE of various data types
{R"("NE": {
"ADD": {
"right_operand": 500,
"value": 2500
}
})", [](float v) { return (v + 500) != 2500; }, DataType::FLOAT},
})",
[](float v) { return (v + 500) != 2500; }, DataType::FLOAT},
{R"("NE": {
"SUB": {
"right_operand": 500,
"value": 2500
}
})", [](double v) { return (v - 500) != 2500; }, DataType::DOUBLE},
})",
[](double v) { return (v - 500) != 2500; }, DataType::DOUBLE},
{R"("NE": {
"MUL": {
"right_operand": 2,
"value": 2
}
})", [](int8_t v) { return (v * 2) != 2; }, DataType::INT8},
})",
[](int8_t v) { return (v * 2) != 2; }, DataType::INT8},
{R"("NE": {
"DIV": {
"right_operand": 2,
"value": 1000
}
})", [](int16_t v) { return (v / 2) != 1000; }, DataType::INT16},
})",
[](int16_t v) { return (v / 2) != 1000; }, DataType::INT16},
{R"("NE": {
"MOD": {
"right_operand": 100,
"value": 0
}
})", [](int32_t v) { return (v % 100) != 0; }, DataType::INT32},
})",
[](int32_t v) { return (v % 100) != 0; }, DataType::INT32},
{R"("NE": {
"ADD": {
"right_operand": 500,
"value": 2500
}
})", [](int64_t v) { return (v + 500) != 2500; }, DataType::INT64},
})",
[](int64_t v) { return (v + 500) != 2500; }, DataType::INT64},
};
std::string dsl_string_tmp = R"({
@ -713,7 +730,6 @@ TEST(Expr, TestBinaryArithOpEvalRange) {
@@@@
})";
std::string dsl_string_float = R"(
"age_float": {
@@@@
@ -725,13 +741,14 @@ TEST(Expr, TestBinaryArithOpEvalRange) {
})";
auto schema = std::make_shared<Schema>();
schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
schema->AddDebugField("age8", DataType::INT8);
schema->AddDebugField("age16", DataType::INT16);
schema->AddDebugField("age32", DataType::INT32);
schema->AddDebugField("age64", DataType::INT64);
schema->AddDebugField("age_float", DataType::FLOAT);
schema->AddDebugField("age_double", DataType::DOUBLE);
auto vec_fid = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
auto i8_fid = schema->AddDebugField("age8", DataType::INT8);
auto i16_fid = schema->AddDebugField("age16", DataType::INT16);
auto i32_fid = schema->AddDebugField("age32", DataType::INT32);
auto i64_fid = schema->AddDebugField("age64", DataType::INT64);
auto float_fid = schema->AddDebugField("age_float", DataType::FLOAT);
auto double_fid = schema->AddDebugField("age_double", DataType::DOUBLE);
schema->set_primary_field_id(i64_fid);
auto seg = CreateGrowingSegment(schema);
int N = 1000;
@ -745,12 +762,12 @@ TEST(Expr, TestBinaryArithOpEvalRange) {
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter);
auto new_age8_col = raw_data.get_col<int8_t>(1);
auto new_age16_col = raw_data.get_col<int16_t>(2);
auto new_age32_col = raw_data.get_col<int32_t>(3);
auto new_age64_col = raw_data.get_col<int64_t>(4);
auto new_age_float_col = raw_data.get_col<float>(5);
auto new_age_double_col = raw_data.get_col<double>(6);
auto new_age8_col = raw_data.get_col<int8_t>(i8_fid);
auto new_age16_col = raw_data.get_col<int16_t>(i16_fid);
auto new_age32_col = raw_data.get_col<int32_t>(i32_fid);
auto new_age64_col = raw_data.get_col<int64_t>(i64_fid);
auto new_age_float_col = raw_data.get_col<float>(float_fid);
auto new_age_double_col = raw_data.get_col<double>(double_fid);
age8_col.insert(age8_col.end(), new_age8_col.begin(), new_age8_col.end());
age16_col.insert(age16_col.end(), new_age16_col.begin(), new_age16_col.end());
@ -832,39 +849,45 @@ TEST(Expr, TestBinaryArithOpEvalRangeExceptions) {
"right_operand": 500,
"value": 2500.00
}
})", "Assert \"(value.is_number_integer())\"", DataType::INT32},
})",
"Assert \"(value.is_number_integer())\"", DataType::INT32},
{R"("EQ": {
"ADD": {
"right_operand": 500.0,
"value": 2500
}
})", "Assert \"(right_operand.is_number_integer())\"", DataType::INT32},
})",
"Assert \"(right_operand.is_number_integer())\"", DataType::INT32},
{R"("EQ": {
"ADD": {
"right_operand": 500.0,
"value": true
}
})", "Assert \"(value.is_number())\"", DataType::FLOAT},
})",
"Assert \"(value.is_number())\"", DataType::FLOAT},
{R"("EQ": {
"ADD": {
"right_operand": "500",
"value": 2500.0
}
})", "Assert \"(right_operand.is_number())\"", DataType::FLOAT},
})",
"Assert \"(right_operand.is_number())\"", DataType::FLOAT},
// Check unsupported arithmetic operator type
{R"("EQ": {
"EXP": {
"right_operand": 500,
"value": 2500
}
})", "arith op(exp) not found", DataType::INT32},
})",
"arith op(exp) not found", DataType::INT32},
// Check unsupported data type
{R"("EQ": {
"ADD": {
"right_operand": true,
"value": false
}
})", "bool type is not supported", DataType::BOOL},
})",
"bool type is not supported", DataType::BOOL},
};
std::string dsl_string_tmp = R"({
@ -932,12 +955,10 @@ TEST(Expr, TestBinaryArithOpEvalRangeExceptions) {
try {
auto plan = CreatePlan(*schema, dsl_string);
FAIL() << "Expected AssertionError: " << assert_info << " not thrown";
}
catch(const std::exception& err) {
} catch (const std::exception& err) {
std::string err_msg = err.what();
ASSERT_TRUE(err_msg.find(assert_info) != std::string::npos);
}
catch(...) {
} catch (...) {
FAIL() << "Expected AssertionError: " << assert_info << " not thrown";
}
}

View File

@ -43,7 +43,7 @@ TEST(FloatVecIndex, All) {
ok = google::protobuf::TextFormat::PrintToString(index_params, &index_params_str);
assert(ok);
auto dataset = GenDataset(NB, metric_type, false);
auto xb_data = dataset.get_col<float>(0);
auto xb_data = dataset.get_col<float>(milvus::FieldId(100));
CDataType dtype = FloatVector;
CIndex index;
@ -94,7 +94,7 @@ TEST(BinaryVecIndex, All) {
ok = google::protobuf::TextFormat::PrintToString(index_params, &index_params_str);
assert(ok);
auto dataset = GenDataset(NB, metric_type, true);
auto xb_data = dataset.get_col<uint8_t>(0);
auto xb_data = dataset.get_col<uint8_t>(milvus::FieldId(100));
CDataType dtype = BinaryVector;
CIndex index;

View File

@ -76,14 +76,14 @@ class IndexWrapperTest : public ::testing::TestWithParam<Param> {
auto dataset = GenDataset(NB, metric_type, is_binary);
if (!is_binary) {
xb_data = dataset.get_col<float>(0);
xb_data = dataset.get_col<float>(milvus::FieldId(100));
xb_dataset = knowhere::GenDataset(NB, DIM, xb_data.data());
xq_data = dataset.get_col<float>(0);
xq_data = dataset.get_col<float>(milvus::FieldId(100));
xq_dataset = knowhere::GenDataset(NQ, DIM, xq_data.data());
} else {
xb_bin_data = dataset.get_col<uint8_t>(0);
xb_bin_data = dataset.get_col<uint8_t>(milvus::FieldId(100));
xb_dataset = knowhere::GenDataset(NB, DIM, xb_bin_data.data());
xq_bin_data = dataset.get_col<uint8_t>(0);
xq_bin_data = dataset.get_col<uint8_t>(milvus::FieldId(100));
xq_dataset = knowhere::GenDataset(NQ, DIM, xq_bin_data.data());
}
}
@ -113,7 +113,7 @@ TEST(PQ, Build) {
auto conf = generate_conf(index_type, metric_type);
auto index = knowhere::VecIndexFactory::GetInstance().CreateVecIndex(index_type);
auto dataset = GenDataset(NB, metric_type, false);
auto xb_data = dataset.get_col<float>(0);
auto xb_data = dataset.get_col<float>(milvus::FieldId(100));
auto xb_dataset = knowhere::GenDataset(NB, DIM, xb_data.data());
ASSERT_NO_THROW(index->Train(xb_dataset, conf));
ASSERT_NO_THROW(index->AddWithoutIds(xb_dataset, conf));
@ -125,7 +125,7 @@ TEST(IVFFLATNM, Build) {
auto conf = generate_conf(index_type, metric_type);
auto index = knowhere::VecIndexFactory::GetInstance().CreateVecIndex(index_type);
auto dataset = GenDataset(NB, metric_type, false);
auto xb_data = dataset.get_col<float>(0);
auto xb_data = dataset.get_col<float>(milvus::FieldId(100));
auto xb_dataset = knowhere::GenDataset(NB, DIM, xb_data.data());
ASSERT_NO_THROW(index->Train(xb_dataset, conf));
ASSERT_NO_THROW(index->AddWithoutIds(xb_dataset, conf));
@ -139,7 +139,7 @@ TEST(IVFFLATNM, Query) {
auto conf = generate_conf(index_type, metric_type);
auto index = knowhere::VecIndexFactory::GetInstance().CreateVecIndex(index_type);
auto dataset = GenDataset(NB, metric_type, false);
auto xb_data = dataset.get_col<float>(0);
auto xb_data = dataset.get_col<float>(milvus::FieldId(100));
auto xb_dataset = knowhere::GenDataset(NB, DIM, xb_data.data());
ASSERT_NO_THROW(index->Train(xb_dataset, conf));
ASSERT_NO_THROW(index->AddWithoutIds(xb_dataset, conf));
@ -149,7 +149,7 @@ TEST(IVFFLATNM, Query) {
bptr->size = DIM * NB * sizeof(float);
bs.Append(RAW_DATA, bptr);
index->Load(bs);
auto xq_data = dataset.get_col<float>(0);
auto xq_data = dataset.get_col<float>(milvus::FieldId(100));
auto xq_dataset = knowhere::GenDataset(NQ, DIM, xq_data.data());
auto result = index->Query(xq_dataset, conf, nullptr);
@ -189,7 +189,7 @@ TEST(BINFLAT, Build) {
auto conf = generate_conf(index_type, metric_type);
auto index = knowhere::VecIndexFactory::GetInstance().CreateVecIndex(index_type);
auto dataset = GenDataset(NB, metric_type, true);
auto xb_data = dataset.get_col<uint8_t>(0);
auto xb_data = dataset.get_col<uint8_t>(milvus::FieldId(100));
std::vector<knowhere::IDType> ids(NB, 0);
std::iota(ids.begin(), ids.end(), 0);
auto xb_dataset = knowhere::GenDataset(NB, DIM, xb_data.data());
@ -222,12 +222,12 @@ TEST(BinIVFFlat, Build_and_Query) {
auto dim = 128;
auto nq = 10;
auto dataset = GenDataset(std::max(nq, nb), metric_type, true);
auto xb_data = dataset.get_col<uint8_t>(0);
auto xb_data = dataset.get_col<uint8_t>(milvus::FieldId(100));
std::vector<knowhere::IDType> ids(nb, 0);
std::iota(ids.begin(), ids.end(), 0);
auto xb_dataset = knowhere::GenDataset(nb, dim, xb_data.data());
index->BuildAll(xb_dataset, conf);
auto xq_data = dataset.get_col<float>(0);
auto xq_data = dataset.get_col<float>(milvus::FieldId(100));
auto xq_dataset = knowhere::GenDataset(nq, dim, xq_data.data());
auto result = index->Query(xq_dataset, conf, nullptr);
@ -258,7 +258,7 @@ TEST(BINIDMAP, Build) {
auto conf = generate_conf(index_type, metric_type);
auto index = knowhere::VecIndexFactory::GetInstance().CreateVecIndex(index_type);
auto dataset = GenDataset(NB, metric_type, true);
auto xb_data = dataset.get_col<uint8_t>(0);
auto xb_data = dataset.get_col<uint8_t>(milvus::FieldId(100));
std::vector<knowhere::IDType> ids(NB, 0);
std::iota(ids.begin(), ids.end(), 0);
auto xb_dataset = knowhere::GenDataset(NB, DIM, xb_data.data());
@ -278,7 +278,7 @@ TEST(PQWrapper, Build) {
ok = google::protobuf::TextFormat::PrintToString(index_params, &index_params_str);
assert(ok);
auto dataset = GenDataset(NB, metric_type, false);
auto xb_data = dataset.get_col<float>(0);
auto xb_data = dataset.get_col<float>(milvus::FieldId(100));
auto xb_dataset = knowhere::GenDataset(NB, DIM, xb_data.data());
auto index =
std::make_unique<milvus::indexbuilder::VecIndexCreator>(type_params_str.c_str(), index_params_str.c_str());
@ -298,7 +298,7 @@ TEST(IVFFLATNMWrapper, Build) {
ok = google::protobuf::TextFormat::PrintToString(index_params, &index_params_str);
assert(ok);
auto dataset = GenDataset(NB, metric_type, false);
auto xb_data = dataset.get_col<float>(0);
auto xb_data = dataset.get_col<float>(milvus::FieldId(100));
auto xb_dataset = knowhere::GenDataset(NB, DIM, xb_data.data());
auto index =
std::make_unique<milvus::indexbuilder::VecIndexCreator>(type_params_str.c_str(), index_params_str.c_str());
@ -319,7 +319,7 @@ TEST(IVFFLATNMWrapper, Codec) {
ok = google::protobuf::TextFormat::PrintToString(index_params, &index_params_str);
assert(ok);
auto dataset = GenDataset(flat_nb, metric_type, false);
auto xb_data = dataset.get_col<float>(0);
auto xb_data = dataset.get_col<float>(milvus::FieldId(100));
auto xb_dataset = knowhere::GenDataset(flat_nb, DIM, xb_data.data());
auto index_wrapper =
std::make_unique<milvus::indexbuilder::VecIndexCreator>(type_params_str.c_str(), index_params_str.c_str());
@ -353,7 +353,7 @@ TEST(BinFlatWrapper, Build) {
ok = google::protobuf::TextFormat::PrintToString(index_params, &index_params_str);
assert(ok);
auto dataset = GenDataset(NB, metric_type, true);
auto xb_data = dataset.get_col<uint8_t>(0);
auto xb_data = dataset.get_col<uint8_t>(milvus::FieldId(100));
std::vector<knowhere::IDType> ids(NB, 0);
std::iota(ids.begin(), ids.end(), 0);
auto xb_dataset = knowhere::GenDataset(NB, DIM, xb_data.data());
@ -376,7 +376,7 @@ TEST(BinIdMapWrapper, Build) {
ok = google::protobuf::TextFormat::PrintToString(index_params, &index_params_str);
assert(ok);
auto dataset = GenDataset(NB, metric_type, true);
auto xb_data = dataset.get_col<uint8_t>(0);
auto xb_data = dataset.get_col<uint8_t>(milvus::FieldId(100));
std::vector<knowhere::IDType> ids(NB, 0);
std::iota(ids.begin(), ids.end(), 0);
auto xb_dataset = knowhere::GenDataset(NB, DIM, xb_data.data());

View File

@ -268,10 +268,10 @@ TEST(Indexing, BinaryBruteForce) {
int64_t dim = 8192;
auto result_count = topk * num_queries;
auto schema = std::make_shared<Schema>();
schema->AddDebugField("vecbin", DataType::VECTOR_BINARY, dim, MetricType::METRIC_Jaccard);
schema->AddDebugField("age", DataType::INT64);
auto vec_fid = schema->AddDebugField("vecbin", DataType::VECTOR_BINARY, dim, MetricType::METRIC_Jaccard);
auto i64_fid = schema->AddDebugField("age", DataType::INT64);
auto dataset = DataGen(schema, N, 10);
auto bin_vec = dataset.get_col<uint8_t>(0);
auto bin_vec = dataset.get_col<uint8_t>(vec_fid);
auto query_data = 1024 * dim / 8 + bin_vec.data();
query::dataset::SearchDataset search_dataset{
faiss::MetricType::METRIC_Jaccard, //
@ -287,7 +287,7 @@ TEST(Indexing, BinaryBruteForce) {
SearchResult sr;
sr.num_queries_ = num_queries;
sr.topk_ = topk;
sr.ids_ = std::move(sub_result.mutable_ids());
sr.seg_offsets_ = std::move(sub_result.mutable_seg_offsets());
sr.distances_ = std::move(sub_result.mutable_distances());
auto json = SearchResultToJson(sr);

View File

@ -29,20 +29,18 @@ namespace spb = proto::schema;
static SchemaPtr
getStandardSchema() {
auto schema = std::make_shared<Schema>();
schema->AddField(FieldName("FloatVectorField"), FieldId(100 + spb::DataType::FloatVector), DataType::VECTOR_FLOAT,
16, MetricType::METRIC_L2);
schema->AddField(FieldName("BinaryVectorField"), FieldId(100 + spb::DataType::BinaryVector),
DataType::VECTOR_BINARY, 16, MetricType::METRIC_Jaccard);
schema->AddField(FieldName("Int64Field"), FieldId(100 + spb::DataType::Int64), DataType::INT64);
schema->AddField(FieldName("Int32Field"), FieldId(100 + spb::DataType::Int32), DataType::INT32);
schema->AddField(FieldName("Int16Field"), FieldId(100 + spb::DataType::Int16), DataType::INT16);
schema->AddField(FieldName("Int8Field"), FieldId(100 + spb::DataType::Int8), DataType::INT8);
schema->AddField(FieldName("DoubleField"), FieldId(100 + spb::DataType::Double), DataType::DOUBLE);
schema->AddField(FieldName("FloatField"), FieldId(100 + spb::DataType::Float), DataType::FLOAT);
schema->AddDebugField("FloatVectorField", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
schema->AddDebugField("BinaryVectorField", DataType::VECTOR_BINARY, 16, MetricType::METRIC_Jaccard);
schema->AddDebugField("Int64Field", DataType::INT64);
schema->AddDebugField("Int32Field", DataType::INT32);
schema->AddDebugField("Int16Field", DataType::INT16);
schema->AddDebugField("Int8Field", DataType::INT8);
schema->AddDebugField("DoubleField", DataType::DOUBLE);
schema->AddDebugField("FloatField", DataType::FLOAT);
return schema;
}
class PlanProtoTest : public ::testing::TestWithParam<std::tuple<spb::DataType>> {
class PlanProtoTest : public ::testing::TestWithParam<std::tuple<std::string>> {
public:
PlanProtoTest() {
schema = getStandardSchema();
@ -54,40 +52,44 @@ class PlanProtoTest : public ::testing::TestWithParam<std::tuple<spb::DataType>>
INSTANTIATE_TEST_CASE_P(InstName,
PlanProtoTest,
::testing::Values( //
std::make_tuple(spb::DataType::Double), //
std::make_tuple(spb::DataType::Float), //
std::make_tuple(spb::DataType::Int64), //
std::make_tuple(spb::DataType::Int32), //
std::make_tuple(spb::DataType::Int16), //
std::make_tuple(spb::DataType::Int8) //
::testing::Values( //
std::make_tuple("DoubleField"), //
std::make_tuple("FloatField"), //
std::make_tuple("Int64Field"), //
std::make_tuple("Int32Field"), //
std::make_tuple("Int16Field"), //
std::make_tuple("Int8Field") //
));
TEST_P(PlanProtoTest, Range) {
// xxx.query(predicates = "int64field > 3", topk = 10, ...)
auto data_type = std::get<0>(GetParam());
auto data_type_str = spb::DataType_Name(data_type);
auto field_id = 100 + (int)data_type;
auto field_name = data_type_str + "Field";
FieldName vec_field_name = FieldName("FloatVectorField");
FieldId vec_float_field_id = schema->get_field_id(vec_field_name);
auto field_name = std::get<0>(GetParam());
auto field_id = schema->get_field_id(FieldName(field_name));
auto data_type = schema->operator[](field_id).get_data_type();
auto data_type_str = spb::DataType_Name(int(data_type));
string value_tag = "bool_val";
if (datatype_is_floating((DataType)data_type)) {
if (datatype_is_floating(data_type)) {
value_tag = "float_val";
} else if (datatype_is_integer((DataType)data_type)) {
} else if (datatype_is_integer(data_type)) {
value_tag = "int64_val";
}
auto fmt1 = boost::format(R"(
vector_anns: <
field_id: 201
field_id: %1%
predicates: <
unary_range_expr: <
column_info: <
field_id: %1%
data_type: %2%
field_id: %2%
data_type: %3%
>
op: GreaterThan
value: <
%3%: 3
%4%: 3
>
>
>
@ -99,8 +101,8 @@ vector_anns: <
>
placeholder_tag: "$0"
>
)") % field_id % data_type_str %
value_tag;
)") % vec_float_field_id.get() %
field_id.get() % data_type_str % value_tag;
auto proto_text = fmt1.str();
planpb::PlanNode node_proto;
@ -148,34 +150,38 @@ vector_anns: <
TEST_P(PlanProtoTest, TermExpr) {
// xxx.query(predicates = "int64field in [1, 2, 3]", topk = 10, ...)
auto data_type = std::get<0>(GetParam());
auto data_type_str = spb::DataType_Name(data_type);
auto field_id = 100 + (int)data_type;
auto field_name = data_type_str + "Field";
FieldName vec_field_name = FieldName("FloatVectorField");
FieldId vec_float_field_id = schema->get_field_id(vec_field_name);
auto field_name = std::get<0>(GetParam());
auto field_id = schema->get_field_id(FieldName(field_name));
auto data_type = schema->operator[](field_id).get_data_type();
auto data_type_str = spb::DataType_Name(int(data_type));
string value_tag = "bool_val";
if (datatype_is_floating((DataType)data_type)) {
if (datatype_is_floating(data_type)) {
value_tag = "float_val";
} else if (datatype_is_integer((DataType)data_type)) {
} else if (datatype_is_integer(data_type)) {
value_tag = "int64_val";
}
auto fmt1 = boost::format(R"(
vector_anns: <
field_id: 201
field_id: %1%
predicates: <
term_expr: <
column_info: <
field_id: %1%
data_type: %2%
field_id: %2%
data_type: %3%
>
values: <
%3%: 1
%4%: 1
>
values: <
%3%: 2
%4%: 2
>
values: <
%3%: 3
%4%: 3
>
>
>
@ -187,8 +193,8 @@ vector_anns: <
>
placeholder_tag: "$0"
>
)") % field_id % data_type_str %
value_tag;
)") % vec_float_field_id.get() %
field_id.get() % data_type_str % value_tag;
auto proto_text = fmt1.str();
planpb::PlanNode node_proto;
@ -237,32 +243,31 @@ vector_anns: <
TEST(PlanProtoTest, NotExpr) {
auto schema = getStandardSchema();
// xxx.query(predicates = "not (int64field > 3)", topk = 10, ...)
FieldName vec_field_name = FieldName("FloatVectorField");
FieldId vec_float_field_id = schema->get_field_id(vec_field_name);
FieldName int64_field_name = FieldName("Int64Field");
FieldId int64_field_id = schema->get_field_id(int64_field_name);
string value_tag = "int64_val";
auto data_type = spb::DataType::Int64;
auto data_type_str = spb::DataType_Name(data_type);
auto field_id = 100 + (int)data_type;
auto field_name = data_type_str + "Field";
string value_tag = "bool_val";
if (datatype_is_floating((DataType)data_type)) {
value_tag = "float_val";
} else if (datatype_is_integer((DataType)data_type)) {
value_tag = "int64_val";
}
auto data_type_str = spb::DataType_Name(int(data_type));
auto fmt1 = boost::format(R"(
vector_anns: <
field_id: 201
field_id: %1%
predicates: <
unary_expr: <
op: Not
child: <
unary_range_expr: <
column_info: <
field_id: %1%
data_type: %2%
field_id: %2%
data_type: %3%
>
op: GreaterThan
value: <
%3%: 3
%4%: 3
>
>
>
@ -276,8 +281,8 @@ vector_anns: <
>
placeholder_tag: "$0"
>
)") % field_id % data_type_str %
value_tag;
)") % vec_float_field_id.get() %
int64_field_id.get() % data_type_str % value_tag;
auto proto_text = fmt1.str();
planpb::PlanNode node_proto;
@ -319,7 +324,7 @@ vector_anns: <
]
}
}
)") % field_name);
)") % int64_field_name.get());
auto ref_plan = CreatePlan(*schema, dsl_text);
auto ref_json = ShowPlanNodeVisitor().call_child(*ref_plan->plan_node_);
@ -330,32 +335,31 @@ vector_anns: <
TEST(PlanProtoTest, AndOrExpr) {
auto schema = getStandardSchema();
// xxx.query(predicates = "(int64field < 3) && (int64field > 2 || int64field == 1)", topk = 10, ...)
FieldName vec_field_name = FieldName("FloatVectorField");
FieldId vec_float_field_id = schema->get_field_id(vec_field_name);
FieldName int64_field_name = FieldName("Int64Field");
FieldId int64_field_id = schema->get_field_id(int64_field_name);
string value_tag = "int64_val";
auto data_type = spb::DataType::Int64;
auto data_type_str = spb::DataType_Name(data_type);
auto field_id = 100 + (int)data_type;
auto field_name = data_type_str + "Field";
string value_tag = "bool_val";
if (datatype_is_floating((DataType)data_type)) {
value_tag = "float_val";
} else if (datatype_is_integer((DataType)data_type)) {
value_tag = "int64_val";
}
auto data_type_str = spb::DataType_Name(int(data_type));
auto fmt1 = boost::format(R"(
vector_anns: <
field_id: 201
field_id: %1%
predicates: <
binary_expr: <
op: LogicalAnd
left: <
unary_range_expr: <
column_info: <
field_id: 105
data_type: Int64
field_id: %2%
data_type: %3%
>
op: LessThan
value: <
int64_val: 3
%4%: 3
>
>
>
@ -365,24 +369,24 @@ vector_anns: <
left: <
unary_range_expr: <
column_info: <
field_id: 105
data_type: Int64
field_id: %2%
data_type: %3%
>
op: GreaterThan
value: <
int64_val: 2
%4%: 2
>
>
>
right: <
unary_range_expr: <
column_info: <
field_id: 105
data_type: Int64
field_id: %2%
data_type: %3%
>
op: Equal
value: <
int64_val: 1
%4%: 1
>
>
>
@ -398,7 +402,8 @@ vector_anns: <
>
placeholder_tag: "$0"
>
)");
)") % vec_float_field_id.get() %
int64_field_id.get() % data_type_str % value_tag;
auto proto_text = fmt1.str();
planpb::PlanNode node_proto;
@ -457,7 +462,7 @@ vector_anns: <
]
}
}
)") % field_name);
)") % int64_field_name.get());
auto ref_plan = CreatePlan(*schema, dsl_text);
auto ref_json = ShowPlanNodeVisitor().call_child(*ref_plan->plan_node_);
@ -467,25 +472,29 @@ vector_anns: <
TEST_P(PlanProtoTest, CompareExpr) {
auto schema = getStandardSchema();
schema->AddField(FieldName("age1"), FieldId(128), DataType::INT64);
auto age_fid = schema->AddDebugField("age1", DataType::INT64);
// xxx.query(predicates = "int64field < int64field", topk = 10, ...)
auto data_type = std::get<0>(GetParam());
auto field_id = 100 + (int)data_type;
auto data_type_str = spb::DataType_Name(data_type);
auto field_name = data_type_str + "Field";
FieldName vec_field_name = FieldName("FloatVectorField");
FieldId vec_float_field_id = schema->get_field_id(vec_field_name);
auto field_name = std::get<0>(GetParam());
auto field_id = schema->get_field_id(FieldName(field_name));
auto data_type = schema->operator[](field_id).get_data_type();
auto data_type_str = spb::DataType_Name(int(data_type));
auto fmt1 = boost::format(R"(
vector_anns: <
field_id: 201
field_id: %1%
predicates: <
compare_expr: <
left_column_info: <
field_id: 128
field_id: %2%
data_type: Int64
>
right_column_info: <
field_id: %1%
data_type: %2%
field_id: %3%
data_type: %4%
>
op: LessThan
>
@ -498,7 +507,8 @@ vector_anns: <
>
placeholder_tag: "$0"
>
)") % field_id % data_type_str;
)") % vec_float_field_id.get() %
age_fid.get() % field_id.get() % data_type_str;
auto proto_text = fmt1.str();
planpb::PlanNode node_proto;
@ -547,33 +557,48 @@ vector_anns: <
TEST_P(PlanProtoTest, BinaryArithOpEvalRange) {
// xxx.query(predicates = "int64field > 3", topk = 10, ...)
auto data_type = std::get<0>(GetParam());
auto data_type_str = spb::DataType_Name(data_type);
auto field_id = 100 + (int)data_type;
auto field_name = data_type_str + "Field";
// auto data_type = std::get<0>(GetParam());
// auto data_type_str = spb::DataType_Name(data_type);
// auto field_id = 100 + (int)data_type;
// auto field_name = data_type_str + "Field";
// string value_tag = "bool_val";
// if (datatype_is_floating((DataType)data_type)) {
// value_tag = "float_val";
// } else if (datatype_is_integer((DataType)data_type)) {
// value_tag = "int64_val";
// }
FieldName vec_field_name = FieldName("FloatVectorField");
FieldId vec_float_field_id = schema->get_field_id(vec_field_name);
auto field_name = std::get<0>(GetParam());
auto field_id = schema->get_field_id(FieldName(field_name));
auto data_type = schema->operator[](field_id).get_data_type();
auto data_type_str = spb::DataType_Name(int(data_type));
string value_tag = "bool_val";
if (datatype_is_floating((DataType)data_type)) {
if (datatype_is_floating(data_type)) {
value_tag = "float_val";
} else if (datatype_is_integer((DataType)data_type)) {
} else if (datatype_is_integer(data_type)) {
value_tag = "int64_val";
}
auto fmt1 = boost::format(R"(
vector_anns: <
field_id: 201
field_id: %1%
predicates: <
binary_arith_op_eval_range_expr: <
column_info: <
field_id: %1%
data_type: %2%
field_id: %2%
data_type: %3%
>
arith_op: Add
right_operand: <
%3%: 1029
%4%: 1029
>
op: Equal
value: <
%3%: 2016
%4%: 2016
>
>
>
@ -585,8 +610,8 @@ vector_anns: <
>
placeholder_tag: "$0"
>
)") % field_id % data_type_str %
value_tag;
)") % vec_float_field_id.get() %
field_id.get() % data_type_str % value_tag;
auto proto_text = fmt1.str();
planpb::PlanNode node_proto;

View File

@ -35,13 +35,13 @@ TEST(Query, ShowExecutor) {
using namespace milvus;
auto node = std::make_unique<FloatVectorANNS>();
auto schema = std::make_shared<Schema>();
schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
auto field_id = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
int64_t num_queries = 100L;
auto raw_data = DataGen(schema, num_queries);
auto& info = node->search_info_;
info.metric_type_ = MetricType::METRIC_L2;
info.topk_ = 20;
info.field_offset_ = FieldOffset(1000);
info.field_id_ = field_id;
node->predicate_ = std::nullopt;
ShowPlanNodeVisitor show_visitor;
PlanNodePtr base(node.release());
@ -140,6 +140,8 @@ TEST(Query, ExecWithPredicateLoader) {
auto schema = std::make_shared<Schema>();
schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
schema->AddDebugField("age", DataType::FLOAT);
auto counter_fid = schema->AddDebugField("counter", DataType::INT64);
schema->set_primary_field_id(counter_fid);
std::string dsl = R"({
"bool": {
"must": [
@ -171,10 +173,7 @@ TEST(Query, ExecWithPredicateLoader) {
auto dataset = DataGen(schema, N);
auto segment = CreateGrowingSegment(schema);
segment->PreInsert(N);
ColumnBasedRawData raw_data;
raw_data.columns_ = dataset.cols_;
raw_data.count = N;
segment->Insert(0, N, dataset.row_ids_.data(), dataset.timestamps_.data(), raw_data);
segment->Insert(0, N, dataset.row_ids_.data(), dataset.timestamps_.data(), dataset.raw_);
auto plan = CreatePlan(*schema, dsl);
auto num_queries = 5;
@ -219,6 +218,8 @@ TEST(Query, ExecWithPredicateSmallN) {
auto schema = std::make_shared<Schema>();
schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 7, MetricType::METRIC_L2);
schema->AddDebugField("age", DataType::FLOAT);
auto i64_fid = schema->AddDebugField("counter", DataType::INT64);
schema->set_primary_field_id(i64_fid);
std::string dsl = R"({
"bool": {
"must": [
@ -271,6 +272,8 @@ TEST(Query, ExecWithPredicate) {
auto schema = std::make_shared<Schema>();
schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
schema->AddDebugField("age", DataType::FLOAT);
auto i64_fid = schema->AddDebugField("counter", DataType::INT64);
schema->set_primary_field_id(i64_fid);
std::string dsl = R"({
"bool": {
"must": [
@ -347,6 +350,8 @@ TEST(Query, ExecTerm) {
auto schema = std::make_shared<Schema>();
schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
schema->AddDebugField("age", DataType::FLOAT);
auto i64_fid = schema->AddDebugField("counter", DataType::INT64);
schema->set_primary_field_id(i64_fid);
std::string dsl = R"({
"bool": {
"must": [
@ -430,7 +435,7 @@ TEST(Query, ExecEmpty) {
auto sr = segment->Search(plan.get(), *ph_group, time);
std::cout << SearchResultToJson(*sr);
for (auto i : sr->ids_) {
for (auto i : sr->seg_offsets_) {
ASSERT_EQ(i, -1);
}
@ -445,6 +450,8 @@ TEST(Query, ExecWithoutPredicateFlat) {
auto schema = std::make_shared<Schema>();
schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, std::nullopt);
schema->AddDebugField("age", DataType::FLOAT);
auto i64_fid = schema->AddDebugField("counter", DataType::INT64);
schema->set_primary_field_id(i64_fid);
std::string dsl = R"({
"bool": {
"must": [
@ -489,6 +496,8 @@ TEST(Query, ExecWithoutPredicate) {
auto schema = std::make_shared<Schema>();
schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
schema->AddDebugField("age", DataType::FLOAT);
auto i64_fid = schema->AddDebugField("counter", DataType::INT64);
schema->set_primary_field_id(i64_fid);
std::string dsl = R"({
"bool": {
"must": [
@ -576,13 +585,15 @@ TEST(Indexing, InnerProduct) {
]
}
})";
schema->AddDebugField("normalized", DataType::VECTOR_FLOAT, dim, MetricType::METRIC_INNER_PRODUCT);
auto vec_fid = schema->AddDebugField("normalized", DataType::VECTOR_FLOAT, dim, MetricType::METRIC_INNER_PRODUCT);
auto i64_fid = schema->AddDebugField("age", DataType::INT64);
schema->set_primary_field_id(i64_fid);
auto dataset = DataGen(schema, N);
auto segment = CreateGrowingSegment(schema);
auto plan = CreatePlan(*schema, dsl);
segment->PreInsert(N);
segment->Insert(0, N, dataset.row_ids_.data(), dataset.timestamps_.data(), dataset.raw_);
auto col = dataset.get_col<float>(0);
auto col = dataset.get_col<float>(vec_fid);
auto ph_group_raw = CreatePlaceholderGroupFromBlob(num_queries, 16, col.data());
auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());
@ -637,9 +648,9 @@ TEST(Query, FillSegment) {
// dispatch here
int N = 100000;
auto dataset = DataGen(schema, N);
const auto std_vec = dataset.get_col<int64_t>(1); // ids field
const auto std_vfloat_vec = dataset.get_col<float>(0); // vector field
const auto std_i32_vec = dataset.get_col<int32_t>(2); // scalar field
const auto std_vec = dataset.get_col<int64_t>(FieldId(101)); // ids field
const auto std_vfloat_vec = dataset.get_col<float>(FieldId(100)); // vector field
const auto std_i32_vec = dataset.get_col<int32_t>(FieldId(102)); // scalar field
std::vector<std::unique_ptr<SegmentInternalInterface>> segments;
segments.emplace_back([&] {
@ -694,27 +705,32 @@ TEST(Query, FillSegment) {
for (auto& segment : segments) {
plan->target_entries_.clear();
plan->target_entries_.push_back(schema->get_offset(FieldName("fakevec")));
plan->target_entries_.push_back(schema->get_offset(FieldName("the_value")));
plan->target_entries_.push_back(schema->get_field_id(FieldName("fakevec")));
plan->target_entries_.push_back(schema->get_field_id(FieldName("the_value")));
auto result = segment->Search(plan.get(), *ph, ts);
// std::cout << SearchResultToJson(result).dump(2);
result->result_offsets_.resize(topk * num_queries);
segment->FillTargetEntry(plan.get(), *result);
segment->FillPrimaryKeys(plan.get(), *result);
auto fields_data = result->output_fields_data_;
auto fields_meta = result->output_fields_meta_;
auto& fields_data = result->output_fields_data_;
ASSERT_EQ(fields_data.size(), 2);
ASSERT_EQ(fields_data.size(), 2);
ASSERT_EQ(fields_meta[0].get_sizeof(), sizeof(float) * dim);
ASSERT_EQ(fields_meta[1].get_sizeof(), sizeof(int32_t));
ASSERT_EQ(fields_data[0].size(), fields_meta[0].get_sizeof() * topk * num_queries);
ASSERT_EQ(fields_data[1].size(), fields_meta[1].get_sizeof() * topk * num_queries);
for (auto field_id : plan->target_entries_) {
ASSERT_EQ(fields_data.count(field_id), true);
}
auto vec_field_id = schema->get_field_id(FieldName("fakevec"));
auto output_vec_field_data = fields_data.at(vec_field_id)->vectors().float_vector().data();
ASSERT_EQ(output_vec_field_data.size(), topk * num_queries * dim);
auto i32_field_id = schema->get_field_id(FieldName("the_value"));
auto output_i32_field_data = fields_data.at(i32_field_id)->scalars().int_data().data();
ASSERT_EQ(output_i32_field_data.size(), topk * num_queries);
for (int i = 0; i < topk * num_queries; i++) {
int64_t val;
memcpy(&val, &result->ids_data_[i * sizeof(int64_t)], sizeof(int64_t));
int64_t val = std::get<int64_t>(result->primary_keys_[i]);
auto internal_offset = result->ids_[i];
auto internal_offset = result->seg_offsets_[i];
auto std_val = std_vec[internal_offset];
auto std_i32 = std_i32_vec[internal_offset];
std::vector<float> std_vfloat(dim);
@ -724,12 +740,12 @@ TEST(Query, FillSegment) {
if (val != -1) {
// check vector field
std::vector<float> vfloat(dim);
memcpy(vfloat.data(), &fields_data[0][i * sizeof(float) * dim], dim * sizeof(float));
memcpy(vfloat.data(), &output_vec_field_data[i * dim], dim * sizeof(float));
ASSERT_EQ(vfloat, std_vfloat);
// check int32 field
int i32;
memcpy(&i32, &fields_data[1][i * sizeof(int32_t)], sizeof(int32_t));
memcpy(&i32, &output_i32_field_data[i], sizeof(int32_t));
ASSERT_EQ(i32, std_i32);
}
}
@ -740,8 +756,10 @@ TEST(Query, ExecWithPredicateBinary) {
using namespace milvus::query;
using namespace milvus::segcore;
auto schema = std::make_shared<Schema>();
schema->AddDebugField("fakevec", DataType::VECTOR_BINARY, 512, MetricType::METRIC_Jaccard);
schema->AddDebugField("age", DataType::FLOAT);
auto vec_fid = schema->AddDebugField("fakevec", DataType::VECTOR_BINARY, 512, MetricType::METRIC_Jaccard);
auto float_fid = schema->AddDebugField("age", DataType::FLOAT);
auto i64_fid = schema->AddDebugField("counter", DataType::INT64);
schema->set_primary_field_id(i64_fid);
std::string dsl = R"({
"bool": {
"must": [
@ -774,7 +792,7 @@ TEST(Query, ExecWithPredicateBinary) {
auto segment = CreateGrowingSegment(schema);
segment->PreInsert(N);
segment->Insert(0, N, dataset.row_ids_.data(), dataset.timestamps_.data(), dataset.raw_);
auto vec_ptr = dataset.get_col<uint8_t>(0);
auto vec_ptr = dataset.get_col<uint8_t>(vec_fid);
auto plan = CreatePlan(*schema, dsl);
auto num_queries = 5;

View File

@ -52,7 +52,7 @@ TEST(Reduce, SubQueryResult) {
}
SubSearchResult sub_result(num_queries, topk, metric_type, round_decimal);
sub_result.mutable_distances() = distances;
sub_result.mutable_ids() = ids;
sub_result.mutable_seg_offsets() = ids;
final_result.merge(sub_result);
}
@ -62,7 +62,7 @@ TEST(Reduce, SubQueryResult) {
auto ref_x = ref_results[n].top();
ref_results[n].pop();
auto index = n * topk + topk - 1 - k;
auto id = final_result.get_ids()[index];
auto id = final_result.get_seg_offsets()[index];
auto distance = final_result.get_distances()[index];
ASSERT_EQ(id, ref_x);
ASSERT_EQ(distance, ref_x);
@ -104,7 +104,7 @@ TEST(Reduce, SubSearchResultDesc) {
}
SubSearchResult sub_result(num_queries, topk, metric_type, round_decimal);
sub_result.mutable_distances() = distances;
sub_result.mutable_ids() = ids;
sub_result.mutable_seg_offsets() = ids;
final_result.merge(sub_result);
}
@ -114,7 +114,7 @@ TEST(Reduce, SubSearchResultDesc) {
auto ref_x = ref_results[n].top();
ref_results[n].pop();
auto index = n * topk + topk - 1 - k;
auto id = final_result.get_ids()[index];
auto id = final_result.get_seg_offsets()[index];
auto distance = final_result.get_distances()[index];
ASSERT_EQ(id, ref_x);
ASSERT_EQ(distance, ref_x);

View File

@ -19,15 +19,15 @@ TEST(SearchResultPair, Greater) {
auto pair2 = SearchResultPair(1, 2.0, nullptr, 1, 0, 10);
ASSERT_EQ(pair1 > pair2, false);
pair1.primary_key_ = INVALID_ID;
pair1.primary_key_ = INVALID_PK;
pair2.primary_key_ = 1;
ASSERT_EQ(pair1 > pair2, false);
pair1.primary_key_ = 0;
pair2.primary_key_ = INVALID_ID;
pair2.primary_key_ = INVALID_PK;
ASSERT_EQ(pair1 > pair2, true);
pair1.primary_key_ = INVALID_ID;
pair2.primary_key_ = INVALID_ID;
pair1.primary_key_ = INVALID_PK;
pair2.primary_key_ = INVALID_PK;
ASSERT_EQ(pair1 > pair2, false);
}

View File

@ -0,0 +1,81 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <gtest/gtest.h>
#include "query/Relational.h"
#include "common/Utils.h"
#include <string>
TEST(Relational, Basic) {
using namespace milvus::query;
int64_t i64 = 4;
int64_t another_i64 = 5;
std::string s = "str4";
std::string another_s = "str5";
ASSERT_EQ(Relational<decltype(std::equal_to<>{})>()(i64, another_i64), i64 == another_i64);
ASSERT_EQ(Relational<decltype(std::not_equal_to<>{})>()(i64, another_i64), i64 != another_i64);
ASSERT_EQ(Relational<decltype(std::greater_equal<>{})>()(i64, another_i64), i64 >= another_i64);
ASSERT_EQ(Relational<decltype(std::greater<>{})>()(i64, another_i64), i64 > another_i64);
ASSERT_EQ(Relational<decltype(std::less_equal<>{})>()(i64, another_i64), i64 <= another_i64);
ASSERT_EQ(Relational<decltype(std::less<>{})>()(i64, another_i64), i64 < another_i64);
ASSERT_EQ(Relational<decltype(std::equal_to<>{})>()(s, another_s), s == another_s);
ASSERT_EQ(Relational<decltype(std::not_equal_to<>{})>()(s, another_s), s != another_s);
ASSERT_EQ(Relational<decltype(std::greater_equal<>{})>()(s, another_s), s >= another_s);
ASSERT_EQ(Relational<decltype(std::greater<>{})>()(s, another_s), s > another_s);
ASSERT_EQ(Relational<decltype(std::less_equal<>{})>()(s, another_s), s <= another_s);
ASSERT_EQ(Relational<decltype(std::less<>{})>()(s, another_s), s < another_s);
ASSERT_EQ(Relational<decltype(MatchOp<OpType::PrefixMatch>{})>()(s, another_s), milvus::PrefixMatch(s, another_s));
ASSERT_EQ(Relational<decltype(MatchOp<OpType::PostfixMatch>{})>()(s, another_s),
milvus::PostfixMatch(s, another_s));
}
TEST(Relational, DifferentFundamentalType) {
using namespace milvus::query;
int32_t i32 = 3;
int64_t i64 = 4;
ASSERT_EQ(Relational<decltype(std::equal_to<>{})>()(i64, i32), i64 == i32);
ASSERT_EQ(Relational<decltype(std::not_equal_to<>{})>()(i64, i32), i64 != i32);
ASSERT_EQ(Relational<decltype(std::greater_equal<>{})>()(i64, i32), i64 >= i32);
ASSERT_EQ(Relational<decltype(std::greater<>{})>()(i64, i32), i64 > i32);
ASSERT_EQ(Relational<decltype(std::less_equal<>{})>()(i64, i32), i64 <= i32);
ASSERT_EQ(Relational<decltype(std::less<>{})>()(i64, i32), i64 < i32);
}
TEST(Relational, DifferentInCompatibleType) {
using namespace milvus::query;
int64_t i64 = 4;
std::string s = "str4";
ASSERT_ANY_THROW(Relational<decltype(std::equal_to<>{})>()(s, i64));
ASSERT_ANY_THROW(Relational<decltype(std::not_equal_to<>{})>()(s, i64));
ASSERT_ANY_THROW(Relational<decltype(std::greater_equal<>{})>()(s, i64));
ASSERT_ANY_THROW(Relational<decltype(std::greater<>{})>()(s, i64));
ASSERT_ANY_THROW(Relational<decltype(std::less_equal<>{})>()(s, i64));
ASSERT_ANY_THROW(Relational<decltype(std::less<>{})>()(s, i64));
ASSERT_ANY_THROW(Relational<decltype(MatchOp<OpType::PrefixMatch>{})>()(s, i64));
ASSERT_ANY_THROW(Relational<decltype(MatchOp<OpType::PostfixMatch>{})>()(s, i64));
ASSERT_ANY_THROW(Relational<decltype(std::equal_to<>{})>()(i64, s));
ASSERT_ANY_THROW(Relational<decltype(std::not_equal_to<>{})>()(i64, s));
ASSERT_ANY_THROW(Relational<decltype(std::greater_equal<>{})>()(i64, s));
ASSERT_ANY_THROW(Relational<decltype(std::greater<>{})>()(i64, s));
ASSERT_ANY_THROW(Relational<decltype(std::less_equal<>{})>()(i64, s));
ASSERT_ANY_THROW(Relational<decltype(std::less<>{})>()(i64, s));
ASSERT_ANY_THROW(Relational<decltype(MatchOp<OpType::PrefixMatch>{})>()(i64, s));
ASSERT_ANY_THROW(Relational<decltype(MatchOp<OpType::PostfixMatch>{})>()(i64, s));
}

View File

@ -49,7 +49,7 @@ TEST(Retrieve, AutoID) {
auto fid_64 = schema->AddDebugField("i64", DataType::INT64);
auto DIM = 16;
auto fid_vec = schema->AddDebugField("vector_64", DataType::VECTOR_FLOAT, DIM, MetricType::METRIC_L2);
schema->set_primary_key(FieldOffset(0));
schema->set_primary_field_id(fid_64);
int64_t N = 100;
int64_t req_size = 10;
@ -58,22 +58,21 @@ TEST(Retrieve, AutoID) {
auto dataset = DataGen(schema, N);
auto segment = CreateSealedSegment(schema);
SealedLoader(dataset, *segment);
auto i64_col = dataset.get_col<int64_t>(0);
auto i64_col = dataset.get_col<int64_t>(fid_64);
auto plan = std::make_unique<query::RetrievePlan>(*schema);
std::vector<int64_t> values;
for (int i = 0; i < req_size; ++i) {
values.emplace_back(i64_col[choose(i)]);
}
auto term_expr = std::make_unique<query::TermExprImpl<int64_t>>(FieldOffset(0), DataType::INT64, values);
auto term_expr = std::make_unique<query::TermExprImpl<int64_t>>(fid_64, DataType::INT64, values);
plan->plan_node_ = std::make_unique<query::RetrievePlanNode>();
plan->plan_node_->predicate_ = std::move(term_expr);
std::vector<FieldOffset> target_offsets{FieldOffset(0), FieldOffset(1)};
plan->field_offsets_ = target_offsets;
std::vector<FieldId> target_fields_id{fid_64, fid_vec};
plan->field_ids_ = target_fields_id;
auto retrieve_results = segment->Retrieve(plan.get(), 100);
Assert(retrieve_results->fields_data_size() == target_offsets.size());
FieldOffset field_offset(0);
Assert(retrieve_results->fields_data_size() == target_fields_id.size());
auto field0 = retrieve_results->fields_data(0);
Assert(field0.has_scalars());
auto field0_data = field0.scalars().long_data();
@ -100,7 +99,7 @@ TEST(Retrieve, AutoID2) {
auto fid_64 = schema->AddDebugField("i64", DataType::INT64);
auto DIM = 16;
auto fid_vec = schema->AddDebugField("vector_64", DataType::VECTOR_FLOAT, DIM, MetricType::METRIC_L2);
schema->set_primary_key(FieldOffset(0));
schema->set_primary_field_id(fid_64);
int64_t N = 100;
int64_t req_size = 10;
@ -109,22 +108,21 @@ TEST(Retrieve, AutoID2) {
auto dataset = DataGen(schema, N);
auto segment = CreateSealedSegment(schema);
SealedLoader(dataset, *segment);
auto i64_col = dataset.get_col<int64_t>(0);
auto i64_col = dataset.get_col<int64_t>(fid_64);
auto plan = std::make_unique<query::RetrievePlan>(*schema);
std::vector<int64_t> values;
for (int i = 0; i < req_size; ++i) {
values.emplace_back(i64_col[choose(i)]);
}
auto term_expr = std::make_unique<query::TermExprImpl<int64_t>>(FieldOffset(0), DataType::INT64, values);
auto term_expr = std::make_unique<query::TermExprImpl<int64_t>>(fid_64, DataType::INT64, values);
plan->plan_node_ = std::make_unique<query::RetrievePlanNode>();
plan->plan_node_->predicate_ = std::move(term_expr);
std::vector<FieldOffset> target_offsets{FieldOffset(0), FieldOffset(1)};
plan->field_offsets_ = target_offsets;
std::vector<FieldId> target_offsets{fid_64, fid_vec};
plan->field_ids_ = target_offsets;
auto retrieve_results = segment->Retrieve(plan.get(), 100);
Assert(retrieve_results->fields_data_size() == target_offsets.size());
FieldOffset field_offset(0);
auto field0 = retrieve_results->fields_data(0);
Assert(field0.has_scalars());
auto field0_data = field0.scalars().long_data();
@ -146,7 +144,7 @@ TEST(Retrieve, NotExist) {
auto fid_64 = schema->AddDebugField("i64", DataType::INT64);
auto DIM = 16;
auto fid_vec = schema->AddDebugField("vector_64", DataType::VECTOR_FLOAT, DIM, MetricType::METRIC_L2);
schema->set_primary_key(FieldOffset(0));
schema->set_primary_field_id(fid_64);
int64_t N = 100;
int64_t req_size = 10;
@ -156,7 +154,7 @@ TEST(Retrieve, NotExist) {
auto dataset = DataGen(schema, N);
auto segment = CreateSealedSegment(schema);
SealedLoader(dataset, *segment);
auto i64_col = dataset.get_col<int64_t>(0);
auto i64_col = dataset.get_col<int64_t>(fid_64);
auto plan = std::make_unique<query::RetrievePlan>(*schema);
std::vector<int64_t> values;
@ -165,15 +163,14 @@ TEST(Retrieve, NotExist) {
values.emplace_back(choose2(i));
}
auto term_expr = std::make_unique<query::TermExprImpl<int64_t>>(FieldOffset(0), DataType::INT64, values);
auto term_expr = std::make_unique<query::TermExprImpl<int64_t>>(fid_64, DataType::INT64, values);
plan->plan_node_ = std::make_unique<query::RetrievePlanNode>();
plan->plan_node_->predicate_ = std::move(term_expr);
std::vector<FieldOffset> target_offsets{FieldOffset(0), FieldOffset(1)};
plan->field_offsets_ = target_offsets;
std::vector<FieldId> target_offsets{fid_64, fid_vec};
plan->field_ids_ = target_offsets;
auto retrieve_results = segment->Retrieve(plan.get(), 100);
Assert(retrieve_results->fields_data_size() == target_offsets.size());
FieldOffset field_offset(0);
auto field0 = retrieve_results->fields_data(0);
Assert(field0.has_scalars());
auto field0_data = field0.scalars().long_data();
@ -195,7 +192,7 @@ TEST(Retrieve, Empty) {
auto fid_64 = schema->AddDebugField("i64", DataType::INT64);
auto DIM = 16;
auto fid_vec = schema->AddDebugField("vector_64", DataType::VECTOR_FLOAT, DIM, MetricType::METRIC_L2);
schema->set_primary_key(FieldOffset(0));
schema->set_primary_field_id(fid_64);
int64_t N = 100;
int64_t req_size = 10;
@ -208,11 +205,11 @@ TEST(Retrieve, Empty) {
for (int i = 0; i < req_size; ++i) {
values.emplace_back(choose(i));
}
auto term_expr = std::make_unique<query::TermExprImpl<int64_t>>(FieldOffset(0), DataType::INT64, values);
auto term_expr = std::make_unique<query::TermExprImpl<int64_t>>(fid_64, DataType::INT64, values);
plan->plan_node_ = std::make_unique<query::RetrievePlanNode>();
plan->plan_node_->predicate_ = std::move(term_expr);
std::vector<FieldOffset> target_offsets{FieldOffset(0), FieldOffset(1)};
plan->field_offsets_ = target_offsets;
std::vector<FieldId> target_offsets{fid_64, fid_vec};
plan->field_ids_ = target_offsets;
auto retrieve_results = segment->Retrieve(plan.get(), 100);
@ -230,7 +227,7 @@ TEST(Retrieve, LargeTimestamp) {
auto fid_64 = schema->AddDebugField("i64", DataType::INT64);
auto DIM = 16;
auto fid_vec = schema->AddDebugField("vector_64", DataType::VECTOR_FLOAT, DIM, MetricType::METRIC_L2);
schema->set_primary_key(FieldOffset(0));
schema->set_primary_field_id(fid_64);
int64_t N = 100;
int64_t req_size = 10;
@ -240,32 +237,38 @@ TEST(Retrieve, LargeTimestamp) {
auto dataset = DataGen(schema, N, 42, ts_offset + 1);
auto segment = CreateSealedSegment(schema);
SealedLoader(dataset, *segment);
auto i64_col = dataset.get_col<int64_t>(0);
auto i64_col = dataset.get_col<int64_t>(fid_64);
auto plan = std::make_unique<query::RetrievePlan>(*schema);
std::vector<int64_t> values;
for (int i = 0; i < req_size; ++i) {
values.emplace_back(i64_col[choose(i)]);
}
auto term_expr = std::make_unique<query::TermExprImpl<int64_t>>(FieldOffset(0), DataType::INT64, values);
auto term_expr = std::make_unique<query::TermExprImpl<int64_t>>(fid_64, DataType::INT64, values);
plan->plan_node_ = std::make_unique<query::RetrievePlanNode>();
plan->plan_node_->predicate_ = std::move(term_expr);
std::vector<FieldOffset> target_offsets{FieldOffset(0), FieldOffset(1)};
plan->field_offsets_ = target_offsets;
std::vector<FieldId> target_offsets{fid_64, fid_vec};
plan->field_ids_ = target_offsets;
std::vector<int> filter_timestamps{-1, 0, 1, 10, 20};
filter_timestamps.push_back(N / 2);
for (const auto& f_ts : filter_timestamps) {
auto retrieve_results = segment->Retrieve(plan.get(), ts_offset + 1 + f_ts);
Assert(retrieve_results->fields_data_size() == 2);
auto field0 = retrieve_results->fields_data(0);
auto field1 = retrieve_results->fields_data(1);
int target_num = (f_ts + choose_sep) / choose_sep;
if (target_num > req_size) {
target_num = req_size;
}
Assert(field0.scalars().long_data().data_size() == target_num);
Assert(field1.vectors().float_vector().data_size() == target_num * DIM);
for (auto field_data : retrieve_results->fields_data()) {
if (DataType(field_data.type()) == DataType::INT64) {
Assert(field_data.scalars().long_data().data_size() == target_num);
}
if (DataType(field_data.type()) == DataType::VECTOR_FLOAT) {
Assert(field_data.vectors().float_vector().data_size() == target_num * DIM);
}
}
}
}
@ -274,7 +277,7 @@ TEST(Retrieve, Delete) {
auto fid_64 = schema->AddDebugField("i64", DataType::INT64);
auto DIM = 16;
auto fid_vec = schema->AddDebugField("vector_64", DataType::VECTOR_FLOAT, DIM, MetricType::METRIC_L2);
schema->set_primary_key(FieldOffset(0));
schema->set_primary_field_id(fid_64);
int64_t N = 10;
int64_t req_size = 10;
@ -283,23 +286,22 @@ TEST(Retrieve, Delete) {
auto dataset = DataGen(schema, N);
auto segment = CreateSealedSegment(schema);
SealedLoader(dataset, *segment);
auto i64_col = dataset.get_col<int64_t>(0);
auto i64_col = dataset.get_col<int64_t>(fid_64);
auto plan = std::make_unique<query::RetrievePlan>(*schema);
std::vector<int64_t> values;
for (int i = 0; i < req_size; ++i) {
values.emplace_back(i64_col[choose(i)]);
}
auto term_expr = std::make_unique<query::TermExprImpl<int64_t>>(FieldOffset(0), DataType::INT64, values);
auto term_expr = std::make_unique<query::TermExprImpl<int64_t>>(fid_64, DataType::INT64, values);
plan->plan_node_ = std::make_unique<query::RetrievePlanNode>();
plan->plan_node_->predicate_ = std::move(term_expr);
std::vector<FieldOffset> target_offsets{FieldOffset(0), FieldOffset(1)};
plan->field_offsets_ = target_offsets;
std::vector<FieldId> target_offsets{fid_64, fid_vec};
plan->field_ids_ = target_offsets;
{
auto retrieve_results = segment->Retrieve(plan.get(), 100);
Assert(retrieve_results->fields_data_size() == target_offsets.size());
FieldOffset field_offset(0);
auto field0 = retrieve_results->fields_data(0);
Assert(field0.has_scalars());
auto field0_data = field0.scalars().long_data();
@ -326,25 +328,28 @@ TEST(Retrieve, Delete) {
auto load_delete_record = false;
if (load_delete_record) {
std::vector<idx_t> pks{1, 2, 3, 4, 5};
auto ids = std::make_unique<IdArray>();
ids->mutable_int_id()->mutable_data()->Add(pks.begin(), pks.end());
std::vector<Timestamp> timestamps{10, 10, 10, 10, 10};
LoadDeletedRecordInfo info = {timestamps.data(), pks.data(), row_count};
LoadDeletedRecordInfo info = {timestamps.data(), ids.get(), row_count};
segment->LoadDeletedRecord(info);
row_count = 5;
}
int64_t new_count = 6;
std::vector<idx_t> new_pks{0, 1, 2, 3, 4, 5};
auto ids = std::make_unique<IdArray>();
ids->mutable_int_id()->mutable_data()->Add(new_pks.begin(), new_pks.end());
std::vector<idx_t> new_timestamps{10, 10, 10, 10, 10, 10};
auto reserved_offset = segment->PreDelete(new_count);
ASSERT_EQ(reserved_offset, row_count);
segment->Delete(reserved_offset, new_count, reinterpret_cast<const int64_t*>(new_pks.data()),
reinterpret_cast<const Timestamp*>(new_timestamps.data()));
segment->Delete(reserved_offset, new_count, ids.get(), reinterpret_cast<const Timestamp*>(new_timestamps.data()));
{
auto retrieve_results = segment->Retrieve(plan.get(), 100);
Assert(retrieve_results->fields_data_size() == target_offsets.size());
FieldOffset field_offset(0);
auto field0 = retrieve_results->fields_data(0);
Assert(field0.has_scalars());
auto field0_data = field0.scalars().long_data();

View File

@ -52,6 +52,18 @@ TYPED_TEST_P(TypedScalarIndexTest, Constructor) {
}
}
TYPED_TEST_P(TypedScalarIndexTest, Count) {
using T = TypeParam;
auto dtype = milvus::GetDType<T>();
auto index_types = GetIndexTypes<T>();
for (const auto& index_type : index_types) {
auto index = milvus::scalar::IndexFactory::GetInstance().CreateIndex<T>(index_type);
auto arr = GenArr<T>(nb);
index->Build(nb, arr.data());
ASSERT_EQ(nb, index->Count());
}
}
TYPED_TEST_P(TypedScalarIndexTest, In) {
using T = TypeParam;
auto dtype = milvus::GetDType<T>();
@ -101,6 +113,7 @@ TYPED_TEST_P(TypedScalarIndexTest, Codec) {
auto copy_index = milvus::scalar::IndexFactory::GetInstance().CreateIndex<T>(index_type);
copy_index->Load(binary_set);
ASSERT_EQ(nb, copy_index->Count());
assert_in<T>(copy_index, arr);
assert_not_in<T>(copy_index, arr);
assert_range<T>(copy_index, arr);
@ -110,6 +123,6 @@ TYPED_TEST_P(TypedScalarIndexTest, Codec) {
// TODO: it's easy to overflow for int8_t. Design more reasonable ut.
using ScalarT = ::testing::Types<int8_t, int16_t, int32_t, int64_t, float, double>;
REGISTER_TYPED_TEST_CASE_P(TypedScalarIndexTest, Dummy, Constructor, In, NotIn, Range, Codec);
REGISTER_TYPED_TEST_CASE_P(TypedScalarIndexTest, Dummy, Constructor, Count, In, NotIn, Range, Codec);
INSTANTIATE_TYPED_TEST_CASE_P(ArithmeticCheck, TypedScalarIndexTest, ScalarT);

View File

@ -31,7 +31,9 @@ TEST(Sealed, without_predicate) {
auto topK = 5;
auto metric_type = MetricType::METRIC_L2;
auto fake_id = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, dim, metric_type);
schema->AddDebugField("age", DataType::FLOAT);
auto float_fid = schema->AddDebugField("age", DataType::FLOAT);
auto i64_fid = schema->AddDebugField("counter", DataType::INT64);
schema->set_primary_field_id(i64_fid);
std::string dsl = R"({
"bool": {
"must": [
@ -55,7 +57,7 @@ TEST(Sealed, without_predicate) {
auto N = ROW_COUNT;
auto dataset = DataGen(schema, N);
auto vec_col = dataset.get_col<float>(0);
auto vec_col = dataset.get_col<float>(fake_id);
for (int64_t i = 0; i < 1000 * dim; ++i) {
vec_col.push_back(0);
}
@ -99,7 +101,7 @@ TEST(Sealed, without_predicate) {
std::vector<int64_t> vec_ids(ids, ids + topK * num_queries);
std::vector<float> vec_dis(dis, dis + topK * num_queries);
sr->ids_ = vec_ids;
sr->seg_offsets_ = vec_ids;
sr->distances_ = vec_dis;
auto ref_result = SearchResultToJson(*sr);
@ -127,7 +129,8 @@ TEST(Sealed, with_predicate) {
auto topK = 5;
auto metric_type = MetricType::METRIC_L2;
auto fake_id = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, dim, metric_type);
schema->AddDebugField("counter", DataType::INT64);
auto i64_fid = schema->AddDebugField("counter", DataType::INT64);
schema->set_primary_field_id(i64_fid);
std::string dsl = R"({
"bool": {
"must": [
@ -159,7 +162,7 @@ TEST(Sealed, with_predicate) {
auto N = ROW_COUNT;
auto dataset = DataGen(schema, N);
auto vec_col = dataset.get_col<float>(0);
auto vec_col = dataset.get_col<float>(fake_id);
auto query_ptr = vec_col.data() + 42000 * dim;
auto segment = CreateGrowingSegment(schema);
segment->PreInsert(N);
@ -204,7 +207,7 @@ TEST(Sealed, with_predicate) {
for (int i = 0; i < num_queries; ++i) {
auto offset = i * topK;
ASSERT_EQ(sr->ids_[offset], 42000 + i);
ASSERT_EQ(sr->seg_offsets_[offset], 42000 + i);
ASSERT_EQ(sr->distances_[offset], 0.0);
}
}
@ -219,10 +222,11 @@ TEST(Sealed, LoadFieldData) {
auto counter_id = schema->AddDebugField("counter", DataType::INT64);
auto double_id = schema->AddDebugField("double", DataType::DOUBLE);
auto nothing_id = schema->AddDebugField("nothing", DataType::INT32);
schema->set_primary_field_id(counter_id);
auto dataset = DataGen(schema, N);
auto fakevec = dataset.get_col<float>(0);
auto fakevec = dataset.get_col<float>(fakevec_id);
auto indexing = GenIndexing(N, dim, fakevec.data());
@ -277,10 +281,10 @@ TEST(Sealed, LoadFieldData) {
segment->LoadIndex(vec_info);
ASSERT_EQ(segment->num_chunk(), 1);
auto chunk_span1 = segment->chunk_data<int64_t>(FieldOffset(1), 0);
auto chunk_span2 = segment->chunk_data<double>(FieldOffset(2), 0);
auto ref1 = dataset.get_col<int64_t>(1);
auto ref2 = dataset.get_col<double>(2);
auto chunk_span1 = segment->chunk_data<int64_t>(counter_id, 0);
auto chunk_span2 = segment->chunk_data<double>(double_id, 0);
auto ref1 = dataset.get_col<int64_t>(counter_id);
auto ref2 = dataset.get_col<double>(double_id);
for (int i = 0; i < N; ++i) {
ASSERT_EQ(chunk_span1[i], ref1[i]);
ASSERT_EQ(chunk_span2[i], ref2[i]);
@ -324,6 +328,96 @@ TEST(Sealed, LoadFieldData) {
ASSERT_EQ(std_json.dump(-2), json.dump(-2));
}
TEST(Sealed, LoadScalarIndex) {
auto dim = 16;
auto N = ROW_COUNT;
auto metric_type = MetricType::METRIC_L2;
auto schema = std::make_shared<Schema>();
auto fakevec_id = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, dim, metric_type);
auto counter_id = schema->AddDebugField("counter", DataType::INT64);
auto double_id = schema->AddDebugField("double", DataType::DOUBLE);
auto nothing_id = schema->AddDebugField("nothing", DataType::INT32);
schema->set_primary_field_id(counter_id);
auto dataset = DataGen(schema, N);
auto fakevec = dataset.get_col<float>(fakevec_id);
auto indexing = GenIndexing(N, dim, fakevec.data());
auto segment = CreateSealedSegment(schema);
std::string dsl = R"({
"bool": {
"must": [
{
"range": {
"double": {
"GE": -1,
"LT": 1
}
}
},
{
"vector": {
"fakevec": {
"metric_type": "L2",
"params": {
"nprobe": 10
},
"query": "$0",
"topk": 5,
"round_decimal": 3
}
}
}
]
}
})";
Timestamp time = 1000000;
auto plan = CreatePlan(*schema, dsl);
auto num_queries = 5;
auto ph_group_raw = CreatePlaceholderGroup(num_queries, 16, 1024);
auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());
SealedLoader(dataset, *segment);
LoadIndexInfo vec_info;
vec_info.field_id = fakevec_id.get();
vec_info.field_type = CDataType::FloatVector;
vec_info.index = indexing;
vec_info.index_params["metric_type"] = knowhere::Metric::L2;
segment->LoadIndex(vec_info);
LoadIndexInfo counter_index;
counter_index.field_id = counter_id.get();
counter_index.field_type = CDataType::Int64;
counter_index.index_params["index_type"] = "sort";
auto counter_data = dataset.get_col<int64_t>(counter_id);
counter_index.index = std::move(GenScalarIndexing<int64_t>(N, counter_data.data()));
segment->LoadIndex(counter_index);
LoadIndexInfo double_index;
double_index.field_id = double_id.get();
double_index.field_type = CDataType::Double;
double_index.index_params["index_type"] = "sort";
auto double_data = dataset.get_col<double>(double_id);
double_index.index = std::move(GenScalarIndexing<double>(N, double_data.data()));
segment->LoadIndex(double_index);
LoadIndexInfo nothing_index;
nothing_index.field_id = nothing_id.get();
nothing_index.field_type = CDataType::Int32;
nothing_index.index_params["index_type"] = "sort";
auto nothing_data = dataset.get_col<int32_t>(nothing_id);
nothing_index.index = std::move(GenScalarIndexing<int32_t>(N, nothing_data.data()));
segment->LoadIndex(nothing_index);
auto sr = segment->Search(plan.get(), *ph_group, time);
auto json = SearchResultToJson(*sr);
std::cout << json.dump(1);
}
TEST(Sealed, Delete) {
auto dim = 16;
auto topK = 5;
@ -334,10 +428,11 @@ TEST(Sealed, Delete) {
auto counter_id = schema->AddDebugField("counter", DataType::INT64);
auto double_id = schema->AddDebugField("double", DataType::DOUBLE);
auto nothing_id = schema->AddDebugField("nothing", DataType::INT32);
schema->set_primary_field_id(counter_id);
auto dataset = DataGen(schema, N);
auto fakevec = dataset.get_col<float>(0);
auto fakevec = dataset.get_col<float>(fakevec_id);
auto segment = CreateSealedSegment(schema);
std::string dsl = R"({
@ -380,9 +475,11 @@ TEST(Sealed, Delete) {
int64_t row_count = 5;
std::vector<idx_t> pks{1, 2, 3, 4, 5};
auto ids = std::make_unique<IdArray>();
ids->mutable_int_id()->mutable_data()->Add(pks.begin(), pks.end());
std::vector<Timestamp> timestamps{10, 10, 10, 10, 10};
LoadDeletedRecordInfo info = {timestamps.data(), pks.data(), row_count};
LoadDeletedRecordInfo info = {timestamps.data(), ids.get(), row_count};
segment->LoadDeletedRecord(info);
std::vector<uint8_t> tmp_block{0, 0};
@ -392,9 +489,11 @@ TEST(Sealed, Delete) {
int64_t new_count = 3;
std::vector<idx_t> new_pks{6, 7, 8};
auto new_ids = std::make_unique<IdArray>();
new_ids->mutable_int_id()->mutable_data()->Add(new_pks.begin(), new_pks.end());
std::vector<idx_t> new_timestamps{10, 10, 10};
auto reserved_offset = segment->PreDelete(new_count);
ASSERT_EQ(reserved_offset, row_count);
segment->Delete(reserved_offset, new_count, reinterpret_cast<const int64_t*>(new_pks.data()),
segment->Delete(reserved_offset, new_count, new_ids.get(),
reinterpret_cast<const Timestamp*>(new_timestamps.data()));
}

View File

@ -14,6 +14,7 @@
#include <string>
#include "segcore/SegmentGrowingImpl.h"
#include "test_utils/DataGen.h"
using namespace milvus;
@ -56,46 +57,6 @@ TEST(SegmentCoreTest, NormalDistributionTest) {
segment->PreDelete(N);
}
// Test insert row-based data
TEST(SegmentCoreTest, MockTest) {
using namespace milvus::segcore;
using namespace milvus::engine;
auto schema = std::make_shared<Schema>();
schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
schema->AddDebugField("age", DataType::INT32);
std::vector<char> raw_data;
std::vector<Timestamp> timestamps;
std::vector<int64_t> uids;
int N = 10000;
std::default_random_engine e(67);
for (int i = 0; i < N; ++i) {
uids.push_back(100000 + i);
timestamps.push_back(0);
// append vec
float vec[16];
for (auto& x : vec) {
x = e() % 2000 * 0.001 - 1.0;
}
raw_data.insert(raw_data.end(), (const char*)std::begin(vec), (const char*)std::end(vec));
int age = e() % 100;
raw_data.insert(raw_data.end(), (const char*)&age, ((const char*)&age) + sizeof(age));
}
auto line_sizeof = (sizeof(int) + sizeof(float) * 16);
assert(raw_data.size() == line_sizeof * N);
// auto index_meta = std::make_shared<IndexMeta>(schema);
auto segment = CreateGrowingSegment(schema);
RowBasedRawData data_chunk{raw_data.data(), (int)line_sizeof, N};
auto offset = segment->PreInsert(N);
segment->Insert(offset, N, uids.data(), timestamps.data(), data_chunk);
SearchResult search_result;
// segment->Query(nullptr, 0, query_result);
// segment->BuildIndex();
int i = 0;
i++;
}
// Test insert column-based data
TEST(SegmentCoreTest, MockTest2) {
using namespace milvus::segcore;
@ -104,70 +65,14 @@ TEST(SegmentCoreTest, MockTest2) {
// schema
auto schema = std::make_shared<Schema>();
schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
schema->AddDebugField("age", DataType::INT32);
auto i64_fid = schema->AddDebugField("age", DataType::INT64);
schema->set_primary_field_id(i64_fid);
// generate random row-based data
std::vector<char> row_data;
std::vector<Timestamp> timestamps;
std::vector<int64_t> uids;
int N = 10000; // number of records
std::default_random_engine e(67);
for (int i = 0; i < N; ++i) {
uids.push_back(100000 + i);
timestamps.push_back(0);
// append vec
float vec[16];
for (auto& x : vec) {
x = e() % 2000 * 0.001 - 1.0;
}
row_data.insert(row_data.end(), (const char*)std::begin(vec), (const char*)std::end(vec));
int age = e() % 100;
row_data.insert(row_data.end(), (const char*)&age, ((const char*)&age) + sizeof(age));
}
auto line_sizeof = (sizeof(int) + sizeof(float) * 16);
assert(row_data.size() == line_sizeof * N);
int64_t size = N;
const int64_t* uids_raw = uids.data();
const Timestamp* timestamps_raw = timestamps.data();
std::vector<std::tuple<Timestamp, idx_t, int64_t>> ordering(size); // timestamp, pk, order_index
for (int i = 0; i < size; ++i) {
ordering[i] = std::make_tuple(timestamps_raw[i], uids_raw[i], i);
}
std::sort(ordering.begin(), ordering.end()); // sort according to timestamp
// convert row-based data to column-based data accordingly
auto sizeof_infos = schema->get_sizeof_infos();
std::vector<int> offset_infos(schema->size() + 1, 0);
std::partial_sum(sizeof_infos.begin(), sizeof_infos.end(), offset_infos.begin() + 1);
std::vector<aligned_vector<uint8_t>> entities(schema->size());
for (int fid = 0; fid < schema->size(); ++fid) {
auto len = sizeof_infos[fid];
entities[fid].resize(len * size);
}
auto raw_data = row_data.data();
std::vector<idx_t> sorted_uids(size);
std::vector<Timestamp> sorted_timestamps(size);
for (int index = 0; index < size; ++index) {
auto [t, uid, order_index] = ordering[index];
sorted_timestamps[index] = t;
sorted_uids[index] = uid;
for (int fid = 0; fid < schema->size(); ++fid) {
auto len = sizeof_infos[fid];
auto offset = offset_infos[fid];
auto src = raw_data + order_index * line_sizeof + offset;
auto dst = entities[fid].data() + index * len;
memcpy(dst, src, len);
}
}
// insert column-based data
ColumnBasedRawData data_chunk{entities, N};
auto dataset = DataGen(schema, N);
auto segment = CreateGrowingSegment(schema);
auto reserved_begin = segment->PreInsert(N);
segment->Insert(reserved_begin, size, sorted_uids.data(), sorted_timestamps.data(), data_chunk);
segment->Insert(reserved_begin, N, dataset.row_ids_.data(), dataset.timestamps_.data(), dataset.raw_);
}
TEST(SegmentCoreTest, SmallIndex) {

View File

@ -24,27 +24,29 @@ TEST(Span, Naive) {
int64_t N = ROW_COUNT;
constexpr int64_t size_per_chunk = 32 * 1024;
auto schema = std::make_shared<Schema>();
schema->AddDebugField("binaryvec", DataType::VECTOR_BINARY, 512, MetricType::METRIC_Jaccard);
schema->AddDebugField("age", DataType::FLOAT);
schema->AddDebugField("floatvec", DataType::VECTOR_FLOAT, 32, MetricType::METRIC_L2);
auto bin_vec_fid = schema->AddDebugField("binaryvec", DataType::VECTOR_BINARY, 512, MetricType::METRIC_Jaccard);
auto float_fid = schema->AddDebugField("age", DataType::FLOAT);
auto float_vec_fid = schema->AddDebugField("floatvec", DataType::VECTOR_FLOAT, 32, MetricType::METRIC_L2);
auto i64_fid = schema->AddDebugField("counter", DataType::INT64);
schema->set_primary_field_id(i64_fid);
auto dataset = DataGen(schema, N);
auto seg_conf = SegcoreConfig::default_config();
auto segment = CreateGrowingSegment(schema, -1, seg_conf);
segment->PreInsert(N);
segment->Insert(0, N, dataset.row_ids_.data(), dataset.timestamps_.data(), dataset.raw_);
auto vec_ptr = dataset.get_col<uint8_t>(0);
auto age_ptr = dataset.get_col<float>(1);
auto float_ptr = dataset.get_col<float>(2);
auto vec_ptr = dataset.get_col<uint8_t>(bin_vec_fid);
auto age_ptr = dataset.get_col<float>(float_fid);
auto float_ptr = dataset.get_col<float>(float_vec_fid);
SegmentInternalInterface& interface = *segment;
auto num_chunk = interface.num_chunk();
ASSERT_EQ(num_chunk, upper_div(N, size_per_chunk));
auto row_count = interface.get_row_count();
ASSERT_EQ(N, row_count);
for (auto chunk_id = 0; chunk_id < num_chunk; ++chunk_id) {
auto vec_span = interface.chunk_data<BinaryVector>(FieldOffset(0), chunk_id);
auto age_span = interface.chunk_data<float>(FieldOffset(1), chunk_id);
auto float_span = interface.chunk_data<FloatVector>(FieldOffset(2), chunk_id);
auto vec_span = interface.chunk_data<milvus::BinaryVector>(bin_vec_fid, chunk_id);
auto age_span = interface.chunk_data<float>(float_fid, chunk_id);
auto float_span = interface.chunk_data<milvus::FloatVector>(float_vec_fid, chunk_id);
auto begin = chunk_id * size_per_chunk;
auto end = std::min((chunk_id + 1) * size_per_chunk, N);
auto size_of_chunk = end - begin;

View File

@ -0,0 +1,589 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <gtest/gtest.h>
#include <memory>
#include <boost/format.hpp>
#include <regex>
#include "pb/plan.pb.h"
#include "query/Expr.h"
#include "query/generated/PlanNodeVisitor.h"
#include "query/generated/ExecExprVisitor.h"
#include "segcore/SegmentGrowingImpl.h"
#include "test_utils/DataGen.h"
#include "query/PlanProto.h"
#include "query/Utils.h"
#include "query/SearchBruteForce.h"
using namespace milvus;
namespace {
template <typename T>
auto
GenGenericValue(T value) {
auto generic = new proto::plan::GenericValue();
if constexpr (std::is_same_v<T, bool>) {
generic->set_bool_val(static_cast<bool>(value));
} else if constexpr (std::is_integral_v<T>) {
generic->set_int64_val(static_cast<int64_t>(value));
} else if constexpr (std::is_floating_point_v<T>) {
generic->set_float_val(static_cast<float>(value));
} else if constexpr (std::is_same_v<T, std::string>) {
generic->set_string_val(static_cast<std::string>(value));
} else {
static_assert(always_false<T>);
}
return generic;
}
auto
GenColumnInfo(int64_t field_id, proto::schema::DataType field_type, bool auto_id, bool is_pk) {
auto column_info = new proto::plan::ColumnInfo();
column_info->set_field_id(field_id);
column_info->set_data_type(field_type);
column_info->set_is_autoid(auto_id);
column_info->set_is_primary_key(is_pk);
return column_info;
}
auto
GenQueryInfo(int64_t topk, std::string metric_type, std::string search_params, int64_t round_decimal = -1) {
auto query_info = new proto::plan::QueryInfo();
query_info->set_topk(topk);
query_info->set_metric_type(metric_type);
query_info->set_search_params(search_params);
query_info->set_round_decimal(round_decimal);
return query_info;
}
auto
GenAnns(proto::plan::Expr* predicate, bool is_binary, int64_t field_id, std::string placeholder_tag = "$0") {
auto query_info = GenQueryInfo(10, "L2", "{\"nprobe\": 10}", -1);
auto anns = new proto::plan::VectorANNS();
anns->set_is_binary(is_binary);
anns->set_field_id(field_id);
anns->set_allocated_predicates(predicate);
anns->set_allocated_query_info(query_info);
anns->set_placeholder_tag(placeholder_tag);
return anns;
}
template <typename T>
auto
GenTermExpr(const std::vector<T>& values) {
auto term_expr = new proto::plan::TermExpr();
for (int i = 0; i < values.size(); i++) {
auto add_value = term_expr->add_values();
if constexpr (std::is_same_v<T, bool>) {
add_value->set_bool_val(static_cast<T>(values[i]));
} else if constexpr (std::is_integral_v<T>) {
add_value->set_int64_val(static_cast<int64_t>(values[i]));
} else if constexpr (std::is_floating_point_v<T>) {
add_value->set_float_val(static_cast<double>(values[i]));
} else if constexpr (std::is_same_v<T, std::string>) {
add_value->set_string_val(static_cast<T>(values[i]));
} else {
static_assert(always_false<T>);
}
}
return term_expr;
}
auto
GenCompareExpr(proto::plan::OpType op) {
auto compare_expr = new proto::plan::CompareExpr();
compare_expr->set_op(op);
return compare_expr;
}
template <typename T>
auto
GenUnaryRangeExpr(proto::plan::OpType op, T value) {
auto unary_range_expr = new proto::plan::UnaryRangeExpr();
unary_range_expr->set_op(op);
auto generic = GenGenericValue(value);
unary_range_expr->set_allocated_value(generic);
return unary_range_expr;
}
template <typename T>
auto
GenBinaryRangeExpr(bool lb_inclusive, bool ub_inclusive, T lb, T ub) {
auto binary_range_expr = new proto::plan::BinaryRangeExpr();
binary_range_expr->set_lower_inclusive(lb_inclusive);
binary_range_expr->set_upper_inclusive(ub_inclusive);
auto lb_generic = GenGenericValue(lb);
auto ub_generic = GenGenericValue(ub);
binary_range_expr->set_allocated_lower_value(lb_generic);
binary_range_expr->set_allocated_upper_value(ub_generic);
return binary_range_expr;
}
auto
GenNotExpr() {
auto not_expr = new proto::plan::UnaryExpr();
not_expr->set_op(proto::plan::UnaryExpr_UnaryOp_Not);
return not_expr;
}
auto
GenExpr() {
return std::make_unique<proto::plan::Expr>();
}
auto
GenPlanNode() {
return std::make_unique<proto::plan::PlanNode>();
}
void
SetTargetEntry(std::unique_ptr<proto::plan::PlanNode>& plan_node, const std::vector<int64_t>& output_fields) {
for (auto id : output_fields) {
plan_node->add_output_field_ids(id);
}
}
auto
GenTermPlan(const FieldMeta& fvec_meta, const FieldMeta& str_meta, const std::vector<std::string>& strs)
-> std::unique_ptr<proto::plan::PlanNode> {
auto column_info = GenColumnInfo(str_meta.get_id().get(), proto::schema::DataType::VarChar, false, false);
auto term_expr = GenTermExpr<std::string>(strs);
term_expr->set_allocated_column_info(column_info);
auto expr = GenExpr().release();
expr->set_allocated_term_expr(term_expr);
auto anns = GenAnns(expr, fvec_meta.get_data_type() == DataType::VECTOR_BINARY, fvec_meta.get_id().get(), "$0");
auto plan_node = GenPlanNode();
plan_node->set_allocated_vector_anns(anns);
return std::move(plan_node);
}
auto
GenAlwaysFalseExpr(const FieldMeta& fvec_meta, const FieldMeta& str_meta) {
auto column_info = GenColumnInfo(str_meta.get_id().get(), proto::schema::DataType::VarChar, false, false);
auto term_expr = GenTermExpr<std::string>({}); // in empty set, always false.
term_expr->set_allocated_column_info(column_info);
auto expr = GenExpr().release();
expr->set_allocated_term_expr(term_expr);
return expr;
}
auto
GenAlwaysTrueExpr(const FieldMeta& fvec_meta, const FieldMeta& str_meta) {
auto always_false_expr = GenAlwaysFalseExpr(fvec_meta, str_meta);
auto not_expr = GenNotExpr();
not_expr->set_allocated_child(always_false_expr);
auto expr = GenExpr().release();
expr->set_allocated_unary_expr(not_expr);
return expr;
}
auto
GenAlwaysFalsePlan(const FieldMeta& fvec_meta, const FieldMeta& str_meta) {
auto always_false_expr = GenAlwaysFalseExpr(fvec_meta, str_meta);
auto anns = GenAnns(always_false_expr, fvec_meta.get_data_type() == DataType::VECTOR_BINARY,
fvec_meta.get_id().get(), "$0");
auto plan_node = GenPlanNode();
plan_node->set_allocated_vector_anns(anns);
return std::move(plan_node);
}
auto
GenAlwaysTruePlan(const FieldMeta& fvec_meta, const FieldMeta& str_meta) {
auto always_true_expr = GenAlwaysTrueExpr(fvec_meta, str_meta);
auto anns =
GenAnns(always_true_expr, fvec_meta.get_data_type() == DataType::VECTOR_BINARY, fvec_meta.get_id().get(), "$0");
auto plan_node = GenPlanNode();
plan_node->set_allocated_vector_anns(anns);
return std::move(plan_node);
}
SchemaPtr
GenTestSchema() {
auto schema = std::make_shared<Schema>();
schema->AddDebugField("str", DataType::VARCHAR);
schema->AddDebugField("another_str", DataType::VARCHAR);
schema->AddDebugField("fvec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
auto pk = schema->AddDebugField("int64", DataType::INT64);
schema->set_primary_field_id(pk);
return schema;
}
SchemaPtr
GenStrPKSchema() {
auto schema = std::make_shared<Schema>();
auto pk = schema->AddDebugField("str", DataType::VARCHAR);
schema->AddDebugField("another_str", DataType::VARCHAR);
schema->AddDebugField("fvec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
schema->AddDebugField("int64", DataType::INT64);
schema->set_primary_field_id(pk);
return schema;
}
} // namespace
TEST(StringExpr, Term) {
using namespace milvus::query;
using namespace milvus::segcore;
auto schema = GenTestSchema();
const auto& fvec_meta = schema->operator[](FieldName("fvec"));
const auto& str_meta = schema->operator[](FieldName("str"));
auto vec_2k_3k = []() -> std::vector<std::string> {
std::vector<std::string> ret;
for (int i = 2000; i < 3000; i++) {
ret.push_back(std::to_string(i));
}
return ret;
}();
std::map<int, std::vector<std::string>> terms = {
{0, {"2000", "3000"}}, {1, {"2000"}}, {2, {"3000"}}, {3, {}}, {4, {vec_2k_3k}},
};
auto seg = CreateGrowingSegment(schema);
int N = 1000;
std::vector<std::string> str_col;
int num_iters = 100;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter);
auto new_str_col = raw_data.get_col(str_meta.get_id());
auto begin = new_str_col->scalars().string_data().data().begin();
auto end = new_str_col->scalars().string_data().data().end();
str_col.insert(str_col.end(), begin, end);
seg->PreInsert(N);
seg->Insert(iter * N, N, raw_data.row_ids_.data(), raw_data.timestamps_.data(), raw_data.raw_);
}
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
ExecExprVisitor visitor(*seg_promote, seg_promote->get_row_count(), MAX_TIMESTAMP);
for (const auto& [_, term] : terms) {
auto plan_proto = GenTermPlan(fvec_meta, str_meta, term);
auto plan = ProtoParser(*schema).CreatePlan(*plan_proto);
auto final = visitor.call_child(*plan->plan_node_->predicate_.value());
EXPECT_EQ(final.size(), N * num_iters);
for (int i = 0; i < N * num_iters; ++i) {
auto ans = final[i];
auto val = str_col[i];
auto ref = std::find(term.begin(), term.end(), val) != term.end();
ASSERT_EQ(ans, ref) << "@" << i << "!!" << val;
}
}
}
TEST(StringExpr, Compare) {
using namespace milvus::query;
using namespace milvus::segcore;
auto schema = GenTestSchema();
const auto& fvec_meta = schema->operator[](FieldName("fvec"));
const auto& str_meta = schema->operator[](FieldName("str"));
const auto& another_str_meta = schema->operator[](FieldName("another_str"));
auto gen_compare_plan = [&, fvec_meta, str_meta,
another_str_meta](proto::plan::OpType op) -> std::unique_ptr<proto::plan::PlanNode> {
auto str_col_info = GenColumnInfo(str_meta.get_id().get(), proto::schema::DataType::VarChar, false, false);
auto another_str_col_info =
GenColumnInfo(another_str_meta.get_id().get(), proto::schema::DataType::VarChar, false, false);
auto compare_expr = GenCompareExpr(op);
compare_expr->set_allocated_left_column_info(str_col_info);
compare_expr->set_allocated_right_column_info(another_str_col_info);
auto expr = GenExpr().release();
expr->set_allocated_compare_expr(compare_expr);
auto anns = GenAnns(expr, fvec_meta.get_data_type() == DataType::VECTOR_BINARY, fvec_meta.get_id().get(), "$0");
auto plan_node = std::make_unique<proto::plan::PlanNode>();
plan_node->set_allocated_vector_anns(anns);
return std::move(plan_node);
};
std::vector<std::tuple<proto::plan::OpType, std::function<bool(std::string, std::string)>>> testcases{
{proto::plan::OpType::GreaterThan, [](std::string v1, std::string v2) { return v1 > v2; }},
{proto::plan::OpType::GreaterEqual, [](std::string v1, std::string v2) { return v1 >= v2; }},
{proto::plan::OpType::LessThan, [](std::string v1, std::string v2) { return v1 < v2; }},
{proto::plan::OpType::LessEqual, [](std::string v1, std::string v2) { return v1 <= v2; }},
{proto::plan::OpType::Equal, [](std::string v1, std::string v2) { return v1 == v2; }},
{proto::plan::OpType::NotEqual, [](std::string v1, std::string v2) { return v1 != v2; }},
{proto::plan::OpType::PrefixMatch, [](std::string v1, std::string v2) { return PrefixMatch(v1, v2); }},
};
auto seg = CreateGrowingSegment(schema);
int N = 1000;
std::vector<std::string> str_col;
std::vector<std::string> another_str_col;
int num_iters = 100;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter);
auto reserve_col = [&, raw_data](const FieldMeta& field_meta, std::vector<std::string>& str_col) {
auto new_str_col = raw_data.get_col(field_meta.get_id());
auto begin = new_str_col->scalars().string_data().data().begin();
auto end = new_str_col->scalars().string_data().data().end();
str_col.insert(str_col.end(), begin, end);
};
reserve_col(str_meta, str_col);
reserve_col(another_str_meta, another_str_col);
{
seg->PreInsert(N);
seg->Insert(iter * N, N, raw_data.row_ids_.data(), raw_data.timestamps_.data(), raw_data.raw_);
}
}
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
ExecExprVisitor visitor(*seg_promote, seg_promote->get_row_count(), MAX_TIMESTAMP);
for (const auto& [op, ref_func] : testcases) {
auto plan_proto = gen_compare_plan(op);
auto plan = ProtoParser(*schema).CreatePlan(*plan_proto);
auto final = visitor.call_child(*plan->plan_node_->predicate_.value());
EXPECT_EQ(final.size(), N * num_iters);
for (int i = 0; i < N * num_iters; ++i) {
auto ans = final[i];
auto val = str_col[i];
auto another_val = another_str_col[i];
auto ref = ref_func(val, another_val);
ASSERT_EQ(ans, ref) << "@" << op << "@" << i << "!!" << val;
}
}
}
TEST(StringExpr, UnaryRange) {
using namespace milvus::query;
using namespace milvus::segcore;
auto schema = GenTestSchema();
const auto& fvec_meta = schema->operator[](FieldName("fvec"));
const auto& str_meta = schema->operator[](FieldName("str"));
auto gen_unary_range_plan = [&, fvec_meta, str_meta](proto::plan::OpType op,
std::string value) -> std::unique_ptr<proto::plan::PlanNode> {
auto column_info = GenColumnInfo(str_meta.get_id().get(), proto::schema::DataType::VarChar, false, false);
auto unary_range_expr = GenUnaryRangeExpr(op, value);
unary_range_expr->set_allocated_column_info(column_info);
auto expr = GenExpr().release();
expr->set_allocated_unary_range_expr(unary_range_expr);
auto anns = GenAnns(expr, fvec_meta.get_data_type() == DataType::VECTOR_BINARY, fvec_meta.get_id().get(), "$0");
auto plan_node = std::make_unique<proto::plan::PlanNode>();
plan_node->set_allocated_vector_anns(anns);
return std::move(plan_node);
};
std::vector<std::tuple<proto::plan::OpType, std::string, std::function<bool(std::string)>>> testcases{
{proto::plan::OpType::GreaterThan, "2000", [](std::string val) { return val > "2000"; }},
{proto::plan::OpType::GreaterEqual, "2000", [](std::string val) { return val >= "2000"; }},
{proto::plan::OpType::LessThan, "3000", [](std::string val) { return val < "3000"; }},
{proto::plan::OpType::LessEqual, "3000", [](std::string val) { return val <= "3000"; }},
{proto::plan::OpType::PrefixMatch, "a", [](std::string val) { return PrefixMatch(val, "a"); }},
};
auto seg = CreateGrowingSegment(schema);
int N = 1000;
std::vector<std::string> str_col;
int num_iters = 100;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter);
auto new_str_col = raw_data.get_col(str_meta.get_id());
auto begin = new_str_col->scalars().string_data().data().begin();
auto end = new_str_col->scalars().string_data().data().end();
str_col.insert(str_col.end(), begin, end);
seg->PreInsert(N);
seg->Insert(iter * N, N, raw_data.row_ids_.data(), raw_data.timestamps_.data(), raw_data.raw_);
}
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
ExecExprVisitor visitor(*seg_promote, seg_promote->get_row_count(), MAX_TIMESTAMP);
for (const auto& [op, value, ref_func] : testcases) {
auto plan_proto = gen_unary_range_plan(op, value);
auto plan = ProtoParser(*schema).CreatePlan(*plan_proto);
auto final = visitor.call_child(*plan->plan_node_->predicate_.value());
EXPECT_EQ(final.size(), N * num_iters);
for (int i = 0; i < N * num_iters; ++i) {
auto ans = final[i];
auto val = str_col[i];
auto ref = ref_func(val);
ASSERT_EQ(ans, ref) << "@" << op << "@" << value << "@" << i << "!!" << val;
}
}
}
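Equivalently, the predicate proto can be filled in without the Gen* test helpers. A minimal sketch, assuming the generated ColumnInfo setters (set_field_id, set_data_type) and the string_val arm of GenericValue that this PR introduces; accessor names are inferred from the plan.proto diff later in this file, not verified against the generated headers:
proto::plan::UnaryRangeExpr unary;
unary.set_op(proto::plan::OpType::PrefixMatch);
unary.mutable_column_info()->set_field_id(str_meta.get_id().get());
unary.mutable_column_info()->set_data_type(proto::schema::DataType::VarChar);
unary.mutable_value()->set_string_val("a");  // matches rows whose value starts with "a"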
TEST(StringExpr, BinaryRange) {
using namespace milvus::query;
using namespace milvus::segcore;
auto schema = GenTestSchema();
const auto& fvec_meta = schema->operator[](FieldName("fvec"));
const auto& str_meta = schema->operator[](FieldName("str"));
auto gen_binary_range_plan = [&, fvec_meta, str_meta](bool lb_inclusive, bool ub_inclusive, std::string lb,
std::string ub) -> std::unique_ptr<proto::plan::PlanNode> {
auto column_info = GenColumnInfo(str_meta.get_id().get(), proto::schema::DataType::VarChar, false, false);
auto binary_range_expr = GenBinaryRangeExpr(lb_inclusive, ub_inclusive, lb, ub);
binary_range_expr->set_allocated_column_info(column_info);
auto expr = GenExpr().release();
expr->set_allocated_binary_range_expr(binary_range_expr);
auto anns = GenAnns(expr, fvec_meta.get_data_type() == DataType::VECTOR_BINARY, fvec_meta.get_id().get(), "$0");
auto plan_node = std::make_unique<proto::plan::PlanNode>();
plan_node->set_allocated_vector_anns(anns);
return std::move(plan_node);
};
// bool lb_inclusive, bool ub_inclusive, std::string lb, std::string ub
std::vector<std::tuple<bool, bool, std::string, std::string, std::function<bool(std::string)>>> testcases{
{false, false, "2000", "3000", [](std::string val) { return val > "2000" && val < "3000"; }},
{false, true, "2000", "3000", [](std::string val) { return val > "2000" && val <= "3000"; }},
{true, false, "2000", "3000", [](std::string val) { return val >= "2000" && val < "3000"; }},
{true, true, "2000", "3000", [](std::string val) { return val >= "2000" && val <= "3000"; }},
{true, true, "2000", "1000", [](std::string val) { return false; }},
};
auto seg = CreateGrowingSegment(schema);
int N = 1000;
std::vector<std::string> str_col;
int num_iters = 100;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter);
auto new_str_col = raw_data.get_col(str_meta.get_id());
auto begin = new_str_col->scalars().string_data().data().begin();
auto end = new_str_col->scalars().string_data().data().end();
str_col.insert(str_col.end(), begin, end);
seg->PreInsert(N);
seg->Insert(iter * N, N, raw_data.row_ids_.data(), raw_data.timestamps_.data(), raw_data.raw_);
}
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
ExecExprVisitor visitor(*seg_promote, seg_promote->get_row_count(), MAX_TIMESTAMP);
for (const auto& [lb_inclusive, ub_inclusive, lb, ub, ref_func] : testcases) {
auto plan_proto = gen_binary_range_plan(lb_inclusive, ub_inclusive, lb, ub);
auto plan = ProtoParser(*schema).CreatePlan(*plan_proto);
auto final = visitor.call_child(*plan->plan_node_->predicate_.value());
EXPECT_EQ(final.size(), N * num_iters);
for (int i = 0; i < N * num_iters; ++i) {
auto ans = final[i];
auto val = str_col[i];
auto ref = ref_func(val);
ASSERT_EQ(ans, ref) << "@" << lb_inclusive << "@" << ub_inclusive << "@" << lb << "@" << ub << "@" << i
<< "!!" << val;
}
}
}
TEST(AlwaysTrueStringPlan, SearchWithOutputFields) {
using namespace milvus::query;
using namespace milvus::segcore;
auto schema = GenStrPKSchema();
const auto& fvec_meta = schema->operator[](FieldName("fvec"));
const auto& str_meta = schema->operator[](FieldName("str"));
auto N = 100000;
auto dim = fvec_meta.get_dim();
auto round_decimal = -1;
auto dataset = DataGen(schema, N);
auto vec_col = dataset.get_col<float>(fvec_meta.get_id());
auto str_col = dataset.get_col(str_meta.get_id())->scalars().string_data().data();
auto query_ptr = vec_col.data();
auto segment = CreateGrowingSegment(schema);
segment->disable_small_index(); // brute-force search.
segment->PreInsert(N);
segment->Insert(0, N, dataset.row_ids_.data(), dataset.timestamps_.data(), dataset.raw_);
auto plan_proto = GenAlwaysTruePlan(fvec_meta, str_meta);
SetTargetEntry(plan_proto, {str_meta.get_id().get()});
auto plan = ProtoParser(*schema).CreatePlan(*plan_proto);
auto num_queries = 5;
auto topk = 10;
auto ph_group_raw = CreatePlaceholderGroupFromBlob(num_queries, 16, query_ptr);
auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());
Timestamp time = MAX_TIMESTAMP;
std::vector<const PlaceholderGroup*> ph_group_arr = {ph_group.get()};
query::dataset::SearchDataset search_dataset{
faiss::MetricType::METRIC_L2, //
num_queries, //
topk, //
round_decimal,
dim, //
query_ptr //
};
auto sub_result = FloatSearchBruteForce(search_dataset, vec_col.data(), N, nullptr);
auto sr = segment->Search(plan.get(), *ph_group, time);
segment->FillPrimaryKeys(plan.get(), *sr);
segment->FillTargetEntry(plan.get(), *sr);
ASSERT_EQ(sr->pk_type_, DataType::VARCHAR);
ASSERT_TRUE(sr->output_fields_data_.find(str_meta.get_id()) != sr->output_fields_data_.end());
auto retrieved_str_col = sr->output_fields_data_[str_meta.get_id()]->scalars().string_data().data();
for (auto q = 0; q < num_queries; q++) {
for (auto k = 0; k < topk; k++) {
auto offset = q * topk + k;
auto seg_offset = sub_result.get_seg_offsets()[offset];
ASSERT_EQ(std::get<std::string>(sr->primary_keys_[offset]), str_col[seg_offset]);
ASSERT_EQ(retrieved_str_col[offset], str_col[seg_offset]);
}
}
}
TEST(AlwaysTrueStringPlan, QueryWithOutputFields) {
using namespace milvus::query;
using namespace milvus::segcore;
auto schema = GenStrPKSchema();
const auto& fvec_meta = schema->operator[](FieldName("fvec"));
const auto& str_meta = schema->operator[](FieldName("str"));
auto N = 100000;
auto dataset = DataGen(schema, N);
auto vec_col = dataset.get_col<float>(fvec_meta.get_id());
auto str_col = dataset.get_col(str_meta.get_id())->scalars().string_data().data();
auto segment = CreateGrowingSegment(schema);
segment->disable_small_index(); // brute-force search.
segment->PreInsert(N);
segment->Insert(0, N, dataset.row_ids_.data(), dataset.timestamps_.data(), dataset.raw_);
auto expr_proto = GenAlwaysTrueExpr(fvec_meta, str_meta);
auto plan_proto = GenPlanNode();
plan_proto->set_allocated_predicates(expr_proto);
SetTargetEntry(plan_proto, {str_meta.get_id().get()});
auto plan = ProtoParser(*schema).CreateRetrievePlan(*plan_proto);
Timestamp time = MAX_TIMESTAMP;
auto retrieved = segment->Retrieve(plan.get(), time);
ASSERT_EQ(retrieved->ids().str_id().data().size(), N);
ASSERT_EQ(retrieved->offset().size(), N);
ASSERT_EQ(retrieved->fields_data().size(), 1);
ASSERT_EQ(retrieved->fields_data(0).scalars().string_data().data().size(), N);
}

View File

@ -61,6 +61,12 @@ TEST_F(StringIndexMarisaTest, BuildWithDataset) {
index->BuildWithDataset(str_ds);
}
TEST_F(StringIndexMarisaTest, Count) {
auto index = milvus::scalar::CreateStringIndexMarisa();
index->BuildWithDataset(str_ds);
ASSERT_EQ(strs.size(), index->Count());
}
TEST_F(StringIndexMarisaTest, In) {
auto index = milvus::scalar::CreateStringIndexMarisa();
index->BuildWithDataset(str_ds);

View File

@ -13,6 +13,8 @@
#include <string.h>
#include <knowhere/common/MetricType.h>
#include "common/Utils.h"
#include "query/Utils.h"
#include "segcore/Utils.h"
TEST(Util, FaissMetricTypeToString) {
@ -33,3 +35,22 @@ TEST(Util, FaissMetricTypeToString) {
ASSERT_EQ(MetricTypeToString(MetricType::METRIC_BrayCurtis), "METRIC_BrayCurtis");
ASSERT_EQ(MetricTypeToString(MetricType::METRIC_JensenShannon), "METRIC_JensenShannon");
}
TEST(Util, StringMatch) {
using namespace milvus;
using namespace milvus::query;
ASSERT_ANY_THROW(Match(1, 2, OpType::PrefixMatch));
ASSERT_ANY_THROW(Match(std::string("not_match_operation"), std::string("not_match"), OpType::LessEqual));
ASSERT_TRUE(PrefixMatch("prefix1", "prefix"));
ASSERT_TRUE(PostfixMatch("1postfix", "postfix"));
ASSERT_TRUE(Match(std::string("prefix1"), std::string("prefix"), OpType::PrefixMatch));
ASSERT_TRUE(Match(std::string("1postfix"), std::string("postfix"), OpType::PostfixMatch));
ASSERT_FALSE(PrefixMatch("", "longer"));
ASSERT_FALSE(PostfixMatch("", "longer"));
ASSERT_FALSE(PrefixMatch("dontmatch", "prefix"));
ASSERT_FALSE(PostfixMatch("dontmatch", "postfix"));
}
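The assertions above pin down the intended semantics: both matchers require string operands, and a match never succeeds when the pattern is longer than the value. A minimal sketch of those semantics (the committed helpers live in query/Utils.h and may be implemented differently):
// Sketch of the behavior the test asserts, not the committed implementation.
inline bool
PrefixMatchSketch(const std::string& str, const std::string& prefix) {
    return str.size() >= prefix.size() && str.compare(0, prefix.size(), prefix) == 0;
}
inline bool
PostfixMatchSketch(const std::string& str, const std::string& postfix) {
    return str.size() >= postfix.size() &&
           str.compare(str.size() - postfix.size(), postfix.size(), postfix) == 0;
}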

View File

@ -25,89 +25,127 @@
#include "query/SearchOnIndex.h"
#include "segcore/SegmentGrowingImpl.h"
#include "segcore/SegmentSealedImpl.h"
#include "segcore/Utils.h"
#include "index/ScalarIndexSort.h"
#include "index/StringIndexSort.h"
using boost::algorithm::starts_with;
namespace milvus::segcore {
struct GeneratedData {
std::vector<uint8_t> rows_;
std::vector<aligned_vector<uint8_t>> cols_;
std::vector<idx_t> row_ids_;
std::vector<Timestamp> timestamps_;
RowBasedRawData raw_;
InsertData* raw_;
std::vector<FieldId> field_ids;
SchemaPtr schema_;
template <typename T>
auto
get_col(int index) const {
auto& target = cols_.at(index);
std::vector<T> ret(target.size() / sizeof(T));
memcpy(ret.data(), target.data(), target.size());
return ret;
std::vector<T>
get_col(FieldId field_id) const {
std::vector<T> ret(raw_->num_rows());
for (auto target_field_data : raw_->fields_data()) {
if (field_id.get() != target_field_data.field_id()) {
continue;
}
auto& field_meta = schema_->operator[](field_id);
if (field_meta.is_vector()) {
if (field_meta.get_data_type() == DataType::VECTOR_FLOAT) {
int len = raw_->num_rows() * field_meta.get_dim();
ret.resize(len);
auto src_data =
reinterpret_cast<const T*>(target_field_data.vectors().float_vector().data().data());
std::copy_n(src_data, len, ret.data());
} else if (field_meta.get_data_type() == DataType::VECTOR_BINARY) {
int len = raw_->num_rows() * (field_meta.get_dim() / 8);
ret.resize(len);
auto src_data = reinterpret_cast<const T*>(target_field_data.vectors().binary_vector().data());
std::copy_n(src_data, len, ret.data());
} else {
PanicInfo("unsupported");
}
return std::move(ret);
}
switch (field_meta.get_data_type()) {
case DataType::BOOL: {
auto src_data = reinterpret_cast<const T*>(target_field_data.scalars().bool_data().data().data());
std::copy_n(src_data, raw_->num_rows(), ret.data());
break;
}
case DataType::INT8:
case DataType::INT16:
case DataType::INT32: {
auto src_data =
reinterpret_cast<const int32_t*>(target_field_data.scalars().int_data().data().data());
std::copy_n(src_data, raw_->num_rows(), ret.data());
break;
}
case DataType::INT64: {
auto src_data = reinterpret_cast<const T*>(target_field_data.scalars().long_data().data().data());
std::copy_n(src_data, raw_->num_rows(), ret.data());
break;
}
case DataType::FLOAT: {
auto src_data = reinterpret_cast<const T*>(target_field_data.scalars().float_data().data().data());
std::copy_n(src_data, raw_->num_rows(), ret.data());
break;
}
case DataType::DOUBLE: {
auto src_data = reinterpret_cast<const T*>(target_field_data.scalars().double_data().data().data());
std::copy_n(src_data, raw_->num_rows(), ret.data());
break;
}
case DataType::VARCHAR: {
if constexpr (std::is_same_v<T, std::string>) {
// String payloads are pointer-based in the proto: copy element-wise
// instead of reinterpreting the RepeatedPtrField backing array.
auto& src_data = target_field_data.scalars().string_data().data();
std::copy(src_data.begin(), src_data.end(), ret.begin());
} else {
PanicInfo("unsupported");
}
break;
}
default: {
PanicInfo("unsupported");
}
}
}
return std::move(ret);
}
template <typename T>
auto
get_mutable_col(int index) {
auto& target = cols_.at(index);
assert(target.size() == row_ids_.size() * sizeof(T));
auto ptr = reinterpret_cast<T*>(target.data());
return ptr;
std::unique_ptr<DataArray>
get_col(FieldId field_id) const {
for (auto target_field_data : raw_->fields_data()) {
if (field_id.get() == target_field_data.field_id()) {
return std::make_unique<DataArray>(target_field_data);
}
}
PanicInfo("field id not find");
}
private:
GeneratedData() = default;
friend GeneratedData
DataGen(SchemaPtr schema, int64_t N, uint64_t seed, uint64_t ts_offset);
void
generate_rows(int64_t N, SchemaPtr schema);
};
inline void
GeneratedData::generate_rows(int64_t N, SchemaPtr schema) {
std::vector<int> offset_infos(schema->size() + 1, 0);
auto sizeof_infos = schema->get_sizeof_infos();
std::partial_sum(sizeof_infos.begin(), sizeof_infos.end(), offset_infos.begin() + 1);
int64_t len_per_row = offset_infos.back();
assert(len_per_row == schema->get_total_sizeof());
// change column-based data to row-based data
std::vector<uint8_t> result(len_per_row * N);
for (int index = 0; index < N; ++index) {
for (int fid = 0; fid < schema->size(); ++fid) {
auto len = sizeof_infos[fid];
auto offset = offset_infos[fid];
auto src = cols_[fid].data() + index * len;
auto dst = result.data() + index * len_per_row + offset;
memcpy(dst, src, len);
}
}
rows_ = std::move(result);
raw_.raw_data = rows_.data();
raw_.sizeof_per_row = schema->get_total_sizeof();
raw_.count = N;
}
inline GeneratedData
DataGen(SchemaPtr schema, int64_t N, uint64_t seed = 42, uint64_t ts_offset = 0) {
using std::vector;
std::vector<aligned_vector<uint8_t>> cols;
std::default_random_engine er(seed);
std::normal_distribution<> distr(0, 1);
int offset = 0;
auto insert_cols = [&cols](auto& data) {
using T = std::remove_reference_t<decltype(data)>;
auto len = sizeof(typename T::value_type) * data.size();
auto ptr = aligned_vector<uint8_t>(len);
memcpy(ptr.data(), data.data(), len);
cols.emplace_back(std::move(ptr));
auto insert_data = std::make_unique<InsertData>();
auto insert_cols = [&insert_data](auto& data, int64_t count, auto& field_meta) {
auto array = milvus::segcore::CreateDataArrayFrom(data.data(), count, field_meta);
insert_data->mutable_fields_data()->AddAllocated(array.release());
};
for (auto& field : schema->get_fields()) {
switch (field.get_data_type()) {
for (auto field_id : schema->get_field_ids()) {
auto field_meta = schema->operator[](field_id);
switch (field_meta.get_data_type()) {
case engine::DataType::VECTOR_FLOAT: {
auto dim = field.get_dim();
auto dim = field_meta.get_dim();
vector<float> final(dim * N);
bool is_ip = starts_with(field.get_name().get(), "normalized");
bool is_ip = starts_with(field_meta.get_name().get(), "normalized");
#pragma omp parallel for
for (int n = 0; n < N; ++n) {
vector<float> data(dim);
@ -128,23 +166,23 @@ DataGen(SchemaPtr schema, int64_t N, uint64_t seed = 42, uint64_t ts_offset = 0)
std::copy(data.begin(), data.end(), final.begin() + dim * n);
}
insert_cols(final);
insert_cols(final, N, field_meta);
break;
}
case engine::DataType::VECTOR_BINARY: {
auto dim = field.get_dim();
auto dim = field_meta.get_dim();
Assert(dim % 8 == 0);
vector<uint8_t> data(dim / 8 * N);
for (auto& x : data) {
x = er();
}
insert_cols(data);
insert_cols(data, N, field_meta);
break;
}
case engine::DataType::INT64: {
vector<int64_t> data(N);
// begin with counter
if (starts_with(field.get_name().get(), "counter")) {
if (starts_with(field_meta.get_name().get(), "counter")) {
int64_t index = 0;
for (auto& x : data) {
x = index++;
@ -157,7 +195,7 @@ DataGen(SchemaPtr schema, int64_t N, uint64_t seed = 42, uint64_t ts_offset = 0)
i++;
}
}
insert_cols(data);
insert_cols(data, N, field_meta);
break;
}
case engine::DataType::INT32: {
@ -165,7 +203,7 @@ DataGen(SchemaPtr schema, int64_t N, uint64_t seed = 42, uint64_t ts_offset = 0)
for (auto& x : data) {
x = er() % (2 * N);
}
insert_cols(data);
insert_cols(data, N, field_meta);
break;
}
case engine::DataType::INT16: {
@ -173,7 +211,7 @@ DataGen(SchemaPtr schema, int64_t N, uint64_t seed = 42, uint64_t ts_offset = 0)
for (auto& x : data) {
x = er() % (2 * N);
}
insert_cols(data);
insert_cols(data, N, field_meta);
break;
}
case engine::DataType::INT8: {
@ -181,7 +219,7 @@ DataGen(SchemaPtr schema, int64_t N, uint64_t seed = 42, uint64_t ts_offset = 0)
for (auto& x : data) {
x = er() % (2 * N);
}
insert_cols(data);
insert_cols(data, N, field_meta);
break;
}
case engine::DataType::FLOAT: {
@ -189,7 +227,7 @@ DataGen(SchemaPtr schema, int64_t N, uint64_t seed = 42, uint64_t ts_offset = 0)
for (auto& x : data) {
x = distr(er);
}
insert_cols(data);
insert_cols(data, N, field_meta);
break;
}
case engine::DataType::DOUBLE: {
@ -197,7 +235,15 @@ DataGen(SchemaPtr schema, int64_t N, uint64_t seed = 42, uint64_t ts_offset = 0)
for (auto& x : data) {
x = distr(er);
}
insert_cols(data);
insert_cols(data, N, field_meta);
break;
}
case engine::DataType::VARCHAR: {
vector<std::string> data(N);
for (auto& x : data) {
x = std::to_string(er());
}
insert_cols(data, N, field_meta);
break;
}
default: {
@ -206,14 +252,16 @@ DataGen(SchemaPtr schema, int64_t N, uint64_t seed = 42, uint64_t ts_offset = 0)
}
++offset;
}
GeneratedData res;
res.cols_ = std::move(cols);
res.schema_ = schema;
res.raw_ = insert_data.release();
res.raw_->set_num_rows(N);
for (int i = 0; i < N; ++i) {
res.row_ids_.push_back(i);
res.timestamps_.push_back(i + ts_offset);
}
// std::shuffle(res.row_ids_.begin(), res.row_ids_.end(), er);
res.generate_rows(N, schema);
return res;
}
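Callers now fetch columns by FieldId rather than by positional index; a usage sketch matching the tests above:
auto dataset = DataGen(schema, 1000);                   // column data wrapped in InsertData
auto vec = dataset.get_col<float>(fvec_meta.get_id());  // typed copy for scalar/vector fields
auto strs = dataset.get_col(str_meta.get_id());         // DataArray for string fields
auto& first = strs->scalars().string_data().data()[0];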
@ -306,7 +354,7 @@ SearchResultToJson(const SearchResult& sr) {
std::vector<std::string> result;
for (int k = 0; k < topk; ++k) {
int index = q * topk + k;
result.emplace_back(std::to_string(sr.ids_[index]) + "->" + std::to_string(sr.distances_[index]));
result.emplace_back(std::to_string(sr.seg_offsets_[index]) + "->" + std::to_string(sr.distances_[index]));
}
results.emplace_back(std::move(result));
}
@ -319,26 +367,28 @@ SealedLoader(const GeneratedData& dataset, SegmentSealed& seg) {
auto row_count = dataset.row_ids_.size();
{
LoadFieldDataInfo info;
info.blob = dataset.row_ids_.data();
FieldMeta field_meta(FieldName("RowID"), RowFieldID, engine::DataType::INT64);
auto array = CreateScalarDataArrayFrom(dataset.row_ids_.data(), row_count, field_meta);
info.field_data = array.release();
info.row_count = dataset.row_ids_.size();
info.field_id = 0; // field id for RowId
info.field_id = RowFieldID.get(); // field id for RowId
seg.LoadFieldData(info);
}
{
LoadFieldDataInfo info;
info.blob = dataset.timestamps_.data();
FieldMeta field_meta(FieldName("Timestamp"), TimestampFieldID, engine::DataType::INT64);
auto array = CreateScalarDataArrayFrom(dataset.timestamps_.data(), row_count, field_meta);
info.field_data = array.release();
info.row_count = dataset.timestamps_.size();
info.field_id = 1;
info.field_id = TimestampFieldID.get();
seg.LoadFieldData(info);
}
int field_offset = 0;
for (auto& meta : seg.get_schema().get_fields()) {
for (auto field_data : dataset.raw_->fields_data()) {
LoadFieldDataInfo info;
info.field_id = meta.get_id().get();
info.field_id = field_data.field_id();
info.row_count = row_count;
info.blob = dataset.cols_[field_offset].data();
info.field_data = &field_data;
seg.LoadFieldData(info);
++field_offset;
}
}
@ -364,4 +414,18 @@ GenIndexing(int64_t N, int64_t dim, const float* vec) {
return indexing;
}
template <typename T>
inline scalar::IndexBasePtr
GenScalarIndexing(int64_t N, const T* data) {
if constexpr (std::is_same_v<T, std::string>) {
auto indexing = scalar::CreateStringIndexSort();
indexing->Build(N, data);
return indexing;
} else {
auto indexing = scalar::CreateScalarIndexSort<T>();
indexing->Build(N, data);
return indexing;
}
}
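The if constexpr dispatch keeps call sites uniform across scalar types; a usage sketch (data values are hypothetical):
std::vector<std::string> strs{"milvus", "segcore", "varchar"};
auto str_index = GenScalarIndexing<std::string>(strs.size(), strs.data());  // StringIndexSort
std::vector<int64_t> nums{3, 1, 2};
auto num_index = GenScalarIndexing<int64_t>(nums.size(), nums.data());      // ScalarIndexSort<int64_t>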
} // namespace milvus::segcore

View File

@ -497,7 +497,6 @@ func (it *IndexBuildTask) saveIndex(ctx context.Context, blobs []*storage.Blob)
it.savePaths = make([]string, blobCnt)
saveIndexFile := func(idx int) error {
blob := blobs[idx]
savePath := getSavePathByKey(blob.Key)
saveIndexFileFn := func() error {
v, err := it.etcdKV.Load(it.req.MetaPath)

View File

@ -12,6 +12,8 @@ enum OpType {
LessEqual = 4;
Equal = 5;
NotEqual = 6;
PrefixMatch = 7; // startsWith
PostfixMatch = 8; // endsWith
};
enum ArithOpType {
@ -46,12 +48,16 @@ message ColumnInfo {
bool is_autoID = 4;
}
// For example: a startsWith "prefix", a >= "str", b < 2, etc.,
// where both a and b are fields in the schema.
message UnaryRangeExpr {
ColumnInfo column_info = 1;
OpType op = 2;
GenericValue value = 3;
}
// For example: "str1" < a <= "str9", 1 <= b < 9 and etc,
// where both a and b are field in schema.
message BinaryRangeExpr {
ColumnInfo column_info = 1;
bool lower_inclusive = 2;
@ -60,17 +66,22 @@ message BinaryRangeExpr {
GenericValue upper_value = 5;
}
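A sketch of filling this message for a half-open string interval, assuming the generated setters and GenericValue's string_val arm (field numbers as above; accessor names unverified):
proto::plan::BinaryRangeExpr range;
range.set_lower_inclusive(true);
range.set_upper_inclusive(false);
range.mutable_lower_value()->set_string_val("2000");
range.mutable_upper_value()->set_string_val("3000");  // selects "2000" <= a < "3000"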
// For example: a startsWith b, a >= b, a < b, a == b, etc.,
// where both a and b are fields in the schema.
message CompareExpr {
ColumnInfo left_column_info = 1;
ColumnInfo right_column_info = 2;
OpType op = 3;
}
// For example: a in ["term0", "term1"], b in [1, 2, 3, 4] and etc,
// where both a and b are field in schema.
message TermExpr {
ColumnInfo column_info = 1;
repeated GenericValue values = 2;
}
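And a corresponding sketch for TermExpr over a varchar column (add_values is the standard generated accessor for a repeated message field; string_val and the field id are assumptions for illustration):
proto::plan::TermExpr term;
term.mutable_column_info()->set_field_id(str_field_id);  // hypothetical field id
term.mutable_column_info()->set_data_type(proto::schema::DataType::VarChar);
term.add_values()->set_string_val("term0");
term.add_values()->set_string_val("term1");  // selects rows where a in ["term0", "term1"]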
// !(expr).
message UnaryExpr {
enum UnaryOp {
Invalid = 0;
@ -80,6 +91,7 @@ message UnaryExpr {
Expr child = 2;
}
// (expr) op (expr), where op is one of (LogicalAnd, LogicalOr).
message BinaryExpr {
enum BinaryOp {
Invalid = 0;

Some files were not shown because too many files have changed in this diff.