feat: support groupby on growing and non-indexed sealed egment(#30307) (#30644)

related: #30308

Signed-off-by: MrPresent-Han <chun.han@zilliz.com>
pull/30750/head
MrPresent-Han 2024-02-21 14:02:53 +08:00 committed by GitHub
parent 18aac076de
commit 77eb6defb1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
28 changed files with 1024 additions and 381 deletions

View File

@ -20,6 +20,7 @@
#include <map>
#include <limits>
#include <string>
#include <queue>
#include <utility>
#include <vector>
#include <boost/align/aligned_allocator.hpp>
@ -31,6 +32,115 @@
#include "knowhere/index_node.h"
namespace milvus {
struct OffsetDisPair {
private:
std::pair<int64_t, float> off_dis_;
int iterator_idx_;
public:
OffsetDisPair(std::pair<int64_t, float> off_dis, int iter_idx)
: off_dis_(off_dis), iterator_idx_(iter_idx) {
}
const std::pair<int64_t, float>&
GetOffDis() const {
return off_dis_;
}
int
GetIteratorIdx() const {
return iterator_idx_;
}
};
struct OffsetDisPairComparator {
bool
operator()(const std::shared_ptr<OffsetDisPair>& left,
const std::shared_ptr<OffsetDisPair>& right) const {
if (left->GetOffDis().second != right->GetOffDis().second) {
return left->GetOffDis().second < right->GetOffDis().second;
}
return left->GetOffDis().first < right->GetOffDis().first;
}
};
struct VectorIterator {
public:
VectorIterator(int chunk_count, int64_t chunk_rows = -1)
: chunk_rows_(chunk_rows) {
iterators_.reserve(chunk_count);
}
std::optional<std::pair<int64_t, float>>
Next() {
if (!heap_.empty()) {
auto top = heap_.top();
heap_.pop();
if (iterators_[top->GetIteratorIdx()]->HasNext()) {
auto origin_pair = iterators_[top->GetIteratorIdx()]->Next();
origin_pair.first = convert_to_segment_offset(
origin_pair.first, top->GetIteratorIdx());
auto off_dis_pair = std::make_shared<OffsetDisPair>(
origin_pair, top->GetIteratorIdx());
heap_.push(off_dis_pair);
}
return top->GetOffDis();
}
return std::nullopt;
}
bool
HasNext() {
return !heap_.empty();
}
bool
AddIterator(std::shared_ptr<knowhere::IndexNode::iterator> iter) {
if (!sealed && iter != nullptr) {
iterators_.emplace_back(iter);
return true;
}
return false;
}
void
seal() {
sealed = true;
int idx = 0;
for (auto& iter : iterators_) {
if (iter->HasNext()) {
auto off_dis_pair =
std::make_shared<OffsetDisPair>(iter->Next(), idx++);
heap_.push(off_dis_pair);
}
}
}
private:
int64_t
convert_to_segment_offset(int64_t chunk_offset, int chunk_idx) {
if (chunk_rows_ == -1) {
AssertInfo(
iterators_.size() == 1,
"Wrong state for vectorIterators, which having incorrect "
"kw_iterator count:{} "
"without setting value for chunk_rows, "
"cannot convert chunk_offset to segment_offset correctly",
iterators_.size());
return chunk_offset;
}
return chunk_idx * chunk_rows_ + chunk_offset;
}
private:
std::vector<std::shared_ptr<knowhere::IndexNode::iterator>> iterators_;
std::priority_queue<std::shared_ptr<OffsetDisPair>,
std::vector<std::shared_ptr<OffsetDisPair>>,
OffsetDisPairComparator>
heap_;
bool sealed = false;
int64_t chunk_rows_ = -1;
//currently, VectorIterator is guaranteed to be used serially without concurrent problem, in the future
//we may need to add mutex to protect the variable sealed
};
struct SearchResult {
SearchResult() = default;
@ -45,6 +155,37 @@ struct SearchResult {
return topk_per_nq_prefix_sum_[total_nq_];
}
public:
void
AssembleChunkVectorIterators(
int64_t nq,
int chunk_count,
int64_t rows_per_chunk,
const std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>&
kw_iterators) {
AssertInfo(kw_iterators.size() == nq * chunk_count,
"kw_iterators count:{} is not equal to nq*chunk_count:{}, "
"wrong state",
kw_iterators.size(),
nq * chunk_count);
std::vector<std::shared_ptr<VectorIterator>> vector_iterators;
vector_iterators.reserve(nq);
for (int i = 0, vec_iter_idx = 0; i < kw_iterators.size(); i++) {
vec_iter_idx = vec_iter_idx % nq;
if (vector_iterators.size() < nq) {
auto vector_iterator = std::make_shared<VectorIterator>(
chunk_count, rows_per_chunk);
vector_iterators.emplace_back(vector_iterator);
}
auto kw_iterator = kw_iterators[i];
vector_iterators[vec_iter_idx++]->AddIterator(kw_iterator);
}
for (auto vector_iter : vector_iterators) {
vector_iter->seal();
}
this->vector_iterators_ = vector_iterators;
}
public:
int64_t total_nq_;
int64_t unity_topK_;
@ -70,9 +211,9 @@ struct SearchResult {
// used for reduce, filter invalid pk, get real topks count
std::vector<size_t> topk_per_nq_prefix_sum_;
//knowhere iterators, used for group by or other operators in the future
std::optional<std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>>
iterators;
//Vector iterators, used for group by
std::optional<std::vector<std::shared_ptr<VectorIterator>>>
vector_iterators_;
};
using SearchResultPtr = std::shared_ptr<SearchResult>;

View File

@ -314,10 +314,11 @@ VectorDiskAnnIndex<T>::BuildWithDataset(const DatasetPtr& dataset,
}
template <typename T>
std::unique_ptr<SearchResult>
void
VectorDiskAnnIndex<T>::Query(const DatasetPtr dataset,
const SearchInfo& search_info,
const BitsetView& bitset) {
const BitsetView& bitset,
SearchResult& search_result) const {
AssertInfo(GetMetricType() == search_info.metric_type_,
"Metric type of field index isn't the same with search info");
auto num_queries = dataset->GetRows();
@ -392,16 +393,21 @@ VectorDiskAnnIndex<T>::Query(const DatasetPtr dataset,
distances[i] = std::round(distances[i] * multiplier) / multiplier;
}
}
auto result = std::make_unique<SearchResult>();
result->seg_offsets_.resize(total_num);
result->distances_.resize(total_num);
result->total_nq_ = num_queries;
result->unity_topK_ = topk;
search_result.seg_offsets_.resize(total_num);
search_result.distances_.resize(total_num);
search_result.total_nq_ = num_queries;
search_result.unity_topK_ = topk;
std::copy_n(ids, total_num, search_result.seg_offsets_.data());
std::copy_n(distances, total_num, search_result.distances_.data());
}
std::copy_n(ids, total_num, result->seg_offsets_.data());
std::copy_n(distances, total_num, result->distances_.data());
return result;
template <typename T>
knowhere::expected<std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>>
VectorDiskAnnIndex<T>::VectorIterators(const DatasetPtr dataset,
const SearchInfo& search_info,
const BitsetView& bitset) const {
return this->index_.AnnIterator(
*dataset, search_info.search_params_, bitset);
}
template <typename T>

View File

@ -86,10 +86,11 @@ class VectorDiskAnnIndex : public VectorIndex {
void
BuildV2(const Config& config = {}) override;
std::unique_ptr<SearchResult>
void
Query(const DatasetPtr dataset,
const SearchInfo& search_info,
const BitsetView& bitset) override;
const BitsetView& bitset,
SearchResult& search_result) const override;
const bool
HasRawData() const override;
@ -100,6 +101,12 @@ class VectorDiskAnnIndex : public VectorIndex {
void
CleanLocalData() override;
knowhere::expected<
std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>>
VectorIterators(const DatasetPtr dataset,
const SearchInfo& search_info,
const BitsetView& bitset) const override;
private:
knowhere::Json
update_load_json(const Config& config);

View File

@ -54,10 +54,21 @@ class VectorIndex : public IndexBase {
PanicInfo(Unsupported, "vector index don't support add with dataset");
}
virtual std::unique_ptr<SearchResult>
virtual void
Query(const DatasetPtr dataset,
const SearchInfo& search_info,
const BitsetView& bitset) = 0;
const BitsetView& bitset,
SearchResult& search_result) const = 0;
virtual knowhere::expected<
std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>>
VectorIterators(const DatasetPtr dataset,
const SearchInfo& search_info,
const BitsetView& bitset) const {
throw std::runtime_error(
"VectorIndex didn't implement VectorIterator interface, "
"there must be sth wrong in the code");
}
virtual const bool
HasRawData() const = 0;
@ -89,7 +100,7 @@ class VectorIndex : public IndexBase {
CleanLocalData() {
}
void
virtual void
CheckCompatible(const IndexVersion& version) {
std::string err_msg =
"version not support : " + std::to_string(version) +

View File

@ -128,6 +128,15 @@ VectorMemIndex<T>::UploadV2(const Config& config) {
return ret;
}
template <typename T>
knowhere::expected<std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>>
VectorMemIndex<T>::VectorIterators(const milvus::DatasetPtr dataset,
const milvus::SearchInfo& search_info,
const milvus::BitsetView& bitset) const {
return this->index_.AnnIterator(
*dataset, search_info.search_params_, bitset);
}
template <typename T>
BinarySet
VectorMemIndex<T>::Upload(const Config& config) {
@ -522,48 +531,16 @@ VectorMemIndex<T>::AddWithDataset(const DatasetPtr& dataset,
}
template <typename T>
std::unique_ptr<SearchResult>
void
VectorMemIndex<T>::Query(const DatasetPtr dataset,
const SearchInfo& search_info,
const BitsetView& bitset) {
const BitsetView& bitset,
SearchResult& search_result) const {
// AssertInfo(GetMetricType() == search_info.metric_type_,
// "Metric type of field index isn't the same with search info");
auto num_queries = dataset->GetRows();
knowhere::Json search_conf = search_info.search_params_;
if (search_info.group_by_field_id_.has_value()) {
auto result = std::make_unique<SearchResult>();
if (search_conf.contains(knowhere::indexparam::EF)) {
search_conf[knowhere::indexparam::SEED_EF] =
search_conf[knowhere::indexparam::EF];
}
try {
knowhere::expected<
std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>>
iterators_val =
index_.AnnIterator(*dataset, search_conf, bitset);
if (iterators_val.has_value()) {
result->iterators = iterators_val.value();
} else {
LOG_ERROR(
"Returned knowhere iterator has non-ready iterators "
"inside, terminate group_by operation");
PanicInfo(ErrorCode::Unsupported,
"Returned knowhere iterator has non-ready iterators "
"inside, terminate group_by operation");
}
} catch (const std::runtime_error& e) {
LOG_ERROR(
"Caught error:{} when trying to initialize ann iterators for "
"group_by: "
"group_by operation will be terminated",
e.what());
throw e;
}
return result;
//if the target index doesn't support iterators, directly return empty search result
//and the reduce process to filter empty results
}
auto topk = search_info.topk_;
// TODO :: check dim of search data
auto final = [&] {
@ -615,16 +592,12 @@ VectorMemIndex<T>::Query(const DatasetPtr dataset,
distances[i] = std::round(distances[i] * multiplier) / multiplier;
}
}
auto result = std::make_unique<SearchResult>();
result->seg_offsets_.resize(total_num);
result->distances_.resize(total_num);
result->total_nq_ = num_queries;
result->unity_topK_ = topk;
std::copy_n(ids, total_num, result->seg_offsets_.data());
std::copy_n(distances, total_num, result->distances_.data());
return result;
search_result.seg_offsets_.resize(total_num);
search_result.distances_.resize(total_num);
search_result.total_nq_ = num_queries;
search_result.unity_topK_ = topk;
std::copy_n(ids, total_num, search_result.seg_offsets_.data());
std::copy_n(distances, total_num, search_result.distances_.data());
}
template <typename T>

View File

@ -73,10 +73,11 @@ class VectorMemIndex : public VectorIndex {
return index_.Count();
}
std::unique_ptr<SearchResult>
void
Query(const DatasetPtr dataset,
const SearchInfo& search_info,
const BitsetView& bitset) override;
const BitsetView& bitset,
SearchResult& search_result) const override;
const bool
HasRawData() const override;
@ -90,6 +91,12 @@ class VectorMemIndex : public VectorIndex {
BinarySet
UploadV2(const Config& config = {}) override;
knowhere::expected<
std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>>
VectorIterators(const DatasetPtr dataset,
const SearchInfo& search_info,
const BitsetView& bitset) const override;
protected:
virtual void
LoadWithoutAssemble(const BinarySet& binary_set, const Config& config);

View File

@ -85,7 +85,9 @@ VecIndexCreator::Query(const milvus::DatasetPtr& dataset,
const SearchInfo& search_info,
const BitsetView& bitset) {
auto vector_index = dynamic_cast<index::VectorIndex*>(index_.get());
return vector_index->Query(dataset, search_info, bitset);
auto search_result = std::make_unique<SearchResult>();
vector_index->Query(dataset, search_info, bitset, *search_result);
return search_result;
}
BinarySet

View File

@ -22,33 +22,22 @@ namespace milvus {
namespace query {
void
GroupBy(const std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>&
iterators,
GroupBy(const std::vector<std::shared_ptr<VectorIterator>>& iterators,
const SearchInfo& search_info,
std::vector<GroupByValueType>& group_by_values,
const segcore::SegmentInternalInterface& segment,
std::vector<int64_t>& seg_offsets,
std::vector<float>& distances) {
//0. check segment type, for period-1, only support group by for sealed segments
if (!dynamic_cast<const segcore::SegmentSealedImpl*>(&segment)) {
LOG_ERROR(
"Not support group_by operation for non-sealed segment, "
"segment_id:{}",
segment.get_segment_id());
return;
}
//1. get search meta
FieldId group_by_field_id = search_info.group_by_field_id_.value();
auto data_type = segment.GetFieldDataType(group_by_field_id);
switch (data_type) {
case DataType::INT8: {
DataGetter<int8_t> dataGetter(segment, group_by_field_id);
auto dataGetter = GetDataGetter<int8_t>(segment, group_by_field_id);
GroupIteratorsByType<int8_t>(iterators,
group_by_field_id,
search_info.topk_,
dataGetter,
*dataGetter,
group_by_values,
seg_offsets,
distances,
@ -56,11 +45,11 @@ GroupBy(const std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>&
break;
}
case DataType::INT16: {
DataGetter<int16_t> dataGetter(segment, group_by_field_id);
auto dataGetter =
GetDataGetter<int16_t>(segment, group_by_field_id);
GroupIteratorsByType<int16_t>(iterators,
group_by_field_id,
search_info.topk_,
dataGetter,
*dataGetter,
group_by_values,
seg_offsets,
distances,
@ -68,11 +57,11 @@ GroupBy(const std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>&
break;
}
case DataType::INT32: {
DataGetter<int32_t> dataGetter(segment, group_by_field_id);
auto dataGetter =
GetDataGetter<int32_t>(segment, group_by_field_id);
GroupIteratorsByType<int32_t>(iterators,
group_by_field_id,
search_info.topk_,
dataGetter,
*dataGetter,
group_by_values,
seg_offsets,
distances,
@ -80,11 +69,11 @@ GroupBy(const std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>&
break;
}
case DataType::INT64: {
DataGetter<int64_t> dataGetter(segment, group_by_field_id);
auto dataGetter =
GetDataGetter<int64_t>(segment, group_by_field_id);
GroupIteratorsByType<int64_t>(iterators,
group_by_field_id,
search_info.topk_,
dataGetter,
*dataGetter,
group_by_values,
seg_offsets,
distances,
@ -92,11 +81,10 @@ GroupBy(const std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>&
break;
}
case DataType::BOOL: {
DataGetter<bool> dataGetter(segment, group_by_field_id);
auto dataGetter = GetDataGetter<bool>(segment, group_by_field_id);
GroupIteratorsByType<bool>(iterators,
group_by_field_id,
search_info.topk_,
dataGetter,
*dataGetter,
group_by_values,
seg_offsets,
distances,
@ -104,11 +92,11 @@ GroupBy(const std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>&
break;
}
case DataType::VARCHAR: {
DataGetter<std::string> dataGetter(segment, group_by_field_id);
auto dataGetter =
GetDataGetter<std::string>(segment, group_by_field_id);
GroupIteratorsByType<std::string>(iterators,
group_by_field_id,
search_info.topk_,
dataGetter,
*dataGetter,
group_by_values,
seg_offsets,
distances,
@ -127,9 +115,7 @@ GroupBy(const std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>&
template <typename T>
void
GroupIteratorsByType(
const std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>&
iterators,
FieldId field_id,
const std::vector<std::shared_ptr<VectorIterator>>& iterators,
int64_t topK,
const DataGetter<T>& data_getter,
std::vector<GroupByValueType>& group_by_values,
@ -138,7 +124,6 @@ GroupIteratorsByType(
const knowhere::MetricType& metrics_type) {
for (auto& iterator : iterators) {
GroupIteratorResult<T>(iterator,
field_id,
topK,
data_getter,
group_by_values,
@ -150,15 +135,13 @@ GroupIteratorsByType(
template <typename T>
void
GroupIteratorResult(
const std::shared_ptr<knowhere::IndexNode::iterator>& iterator,
FieldId field_id,
int64_t topK,
const DataGetter<T>& data_getter,
std::vector<GroupByValueType>& group_by_values,
std::vector<int64_t>& offsets,
std::vector<float>& distances,
const knowhere::MetricType& metrics_type) {
GroupIteratorResult(const std::shared_ptr<VectorIterator>& iterator,
int64_t topK,
const DataGetter<T>& data_getter,
std::vector<GroupByValueType>& group_by_values,
std::vector<int64_t>& offsets,
std::vector<float>& distances,
const knowhere::MetricType& metrics_type) {
//1.
std::unordered_map<T, std::pair<int64_t, float>> groupMap;
@ -171,7 +154,13 @@ GroupIteratorResult(
return l < r;
};
while (iterator->HasNext() && groupMap.size() < topK) {
auto [offset, dis] = iterator->Next();
auto offset_dis_pair = iterator->Next();
AssertInfo(
offset_dis_pair.has_value(),
"Wrong state! iterator cannot return valid result whereas it still"
"tells hasNext, terminate groupBy operation");
auto offset = offset_dis_pair.value().first;
auto dis = offset_dis_pair.value().second;
T row_data = data_getter.Get(offset);
auto it = groupMap.find(row_data);
if (it == groupMap.end()) {

View File

@ -19,19 +19,51 @@
#include "common/QueryInfo.h"
#include "knowhere/index_node.h"
#include "segcore/SegmentInterface.h"
#include "segcore/SegmentGrowingImpl.h"
#include "segcore/SegmentSealedImpl.h"
#include "segcore/ConcurrentVector.h"
#include "common/Span.h"
namespace milvus {
namespace query {
template <typename T>
struct DataGetter {
class DataGetter {
public:
virtual T
Get(int64_t idx) const = 0;
};
template <typename T>
class GrowingDataGetter : public DataGetter<T> {
public:
const segcore::ConcurrentVector<T>* growing_raw_data_;
GrowingDataGetter(const segcore::SegmentGrowingImpl& segment,
FieldId fieldId) {
growing_raw_data_ =
segment.get_insert_record().get_field_data<T>(fieldId);
}
GrowingDataGetter(const GrowingDataGetter<T>& other)
: growing_raw_data_(other.growing_raw_data_) {
}
T
Get(int64_t idx) const {
return growing_raw_data_->operator[](idx);
}
};
template <typename T>
class SealedDataGetter : public DataGetter<T> {
private:
std::shared_ptr<Span<T>> field_data_;
std::shared_ptr<Span<std::string_view>> str_field_data_;
const index::ScalarIndex<T>* field_index_;
DataGetter(const segcore::SegmentInternalInterface& segment,
FieldId& field_id) {
public:
SealedDataGetter(const segcore::SegmentSealedImpl& segment,
FieldId& field_id) {
if (segment.HasFieldData(field_id)) {
if constexpr (std::is_same_v<T, std::string>) {
auto span = segment.chunk_data<std::string_view>(field_id, 0);
@ -52,7 +84,12 @@ struct DataGetter {
}
}
public:
SealedDataGetter(const SealedDataGetter<T>& other)
: field_data_(other.field_data_),
str_field_data_(other.str_field_data_),
field_index_(other.field_index_) {
}
T
Get(int64_t idx) const {
if (field_data_ || str_field_data_) {
@ -68,9 +105,65 @@ struct DataGetter {
}
};
template <typename T>
static const std::shared_ptr<DataGetter<T>>
GetDataGetter(const segcore::SegmentInternalInterface& segment,
FieldId fieldId) {
if (const segcore::SegmentGrowingImpl* growing_segment =
dynamic_cast<const segcore::SegmentGrowingImpl*>(&segment)) {
return std::make_shared<GrowingDataGetter<T>>(*growing_segment,
fieldId);
} else if (const segcore::SegmentSealedImpl* sealed_segment =
dynamic_cast<const segcore::SegmentSealedImpl*>(&segment)) {
return std::make_shared<SealedDataGetter<T>>(*sealed_segment, fieldId);
} else {
PanicInfo(UnexpectedError,
"The segment used to init data getter is neither growing or "
"sealed, wrong state");
}
}
static bool
PrepareVectorIteratorsFromIndex(const SearchInfo& search_info,
int nq,
const DatasetPtr dataset,
SearchResult& search_result,
const BitsetView& bitset,
const index::VectorIndex& index) {
if (search_info.group_by_field_id_.has_value()) {
try {
knowhere::expected<
std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>>
iterators_val =
index.VectorIterators(dataset, search_info, bitset);
if (iterators_val.has_value()) {
search_result.AssembleChunkVectorIterators(
nq, 1, -1, iterators_val.value());
} else {
LOG_ERROR(
"Returned knowhere iterator has non-ready iterators "
"inside, terminate group_by operation");
PanicInfo(ErrorCode::Unsupported,
"Returned knowhere iterator has non-ready iterators "
"inside, terminate group_by operation");
}
search_result.total_nq_ = dataset->GetRows();
search_result.unity_topK_ = search_info.topk_;
} catch (const std::runtime_error& e) {
LOG_ERROR(
"Caught error:{} when trying to initialize ann iterators for "
"group_by: "
"group_by operation will be terminated",
e.what());
throw e;
}
return true;
}
return false;
}
void
GroupBy(const std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>&
iterators,
GroupBy(const std::vector<std::shared_ptr<VectorIterator>>& iterators,
const SearchInfo& searchInfo,
std::vector<GroupByValueType>& group_by_values,
const segcore::SegmentInternalInterface& segment,
@ -80,9 +173,7 @@ GroupBy(const std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>&
template <typename T>
void
GroupIteratorsByType(
const std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>&
iterators,
FieldId field_id,
const std::vector<std::shared_ptr<VectorIterator>>& iterators,
int64_t topK,
const DataGetter<T>& data_getter,
std::vector<GroupByValueType>& group_by_values,
@ -92,15 +183,13 @@ GroupIteratorsByType(
template <typename T>
void
GroupIteratorResult(
const std::shared_ptr<knowhere::IndexNode::iterator>& iterator,
FieldId field_id,
int64_t topK,
const DataGetter<T>& data_getter,
std::vector<GroupByValueType>& group_by_values,
std::vector<int64_t>& offsets,
std::vector<float>& distances,
const knowhere::MetricType& metrics_type);
GroupIteratorResult(const std::shared_ptr<VectorIterator>& iterator,
int64_t topK,
const DataGetter<T>& data_getter,
std::vector<GroupByValueType>& group_by_values,
std::vector<int64_t>& offsets,
std::vector<float>& distances,
const knowhere::MetricType& metrics_type);
} // namespace query
} // namespace milvus

View File

@ -21,6 +21,8 @@
#include "SubSearchResult.h"
#include "knowhere/comp/brute_force.h"
#include "knowhere/comp/index_param.h"
#include "knowhere/index_node.h"
#include "log/Log.h"
namespace milvus::query {
void
@ -145,4 +147,61 @@ BruteForceSearch(const dataset::SearchDataset& dataset,
sub_result.round_values();
return sub_result;
}
SubSearchResult
BruteForceSearchIterators(const dataset::SearchDataset& dataset,
const void* chunk_data_raw,
int64_t chunk_rows,
const knowhere::Json& conf,
const BitsetView& bitset,
DataType data_type) {
auto nq = dataset.num_queries;
auto dim = dataset.dim;
auto base_dataset = knowhere::GenDataSet(chunk_rows, dim, chunk_data_raw);
auto query_dataset = knowhere::GenDataSet(nq, dim, dataset.query_data);
knowhere::expected<
std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>>
iterators_val;
switch (data_type) {
case DataType::VECTOR_FLOAT:
iterators_val = knowhere::BruteForce::AnnIterator<float>(
base_dataset, query_dataset, conf, bitset);
break;
case DataType::VECTOR_FLOAT16:
iterators_val = knowhere::BruteForce::AnnIterator<float16>(
base_dataset, query_dataset, conf, bitset);
break;
case DataType::VECTOR_BFLOAT16:
iterators_val = knowhere::BruteForce::AnnIterator<bfloat16>(
base_dataset, query_dataset, conf, bitset);
break;
default:
PanicInfo(ErrorCode::Unsupported,
"Unsupported dataType for chunk brute force iterator:{}",
data_type);
}
if (iterators_val.has_value()) {
AssertInfo(
iterators_val.value().size() == nq,
"Wrong state, initialized knowhere_iterators count:{} is not "
"equal to nq:{} for single chunk",
iterators_val.value().size(),
nq);
SubSearchResult subSearchResult(dataset.num_queries,
dataset.topk,
dataset.metric_type,
dataset.round_decimal,
iterators_val.value());
return std::move(subSearchResult);
} else {
LOG_ERROR(
"Failed to get valid knowhere brute-force-iterators from chunk, "
"terminate search_group_by operation");
PanicInfo(ErrorCode::Unsupported,
"Returned knowhere brute-force-iterator has non-ready "
"iterators inside, terminate search_group_by operation");
}
}
} // namespace milvus::query

View File

@ -31,4 +31,12 @@ BruteForceSearch(const dataset::SearchDataset& dataset,
const BitsetView& bitset,
DataType data_type);
SubSearchResult
BruteForceSearchIterators(const dataset::SearchDataset& dataset,
const void* chunk_data_raw,
int64_t chunk_rows,
const knowhere::Json& conf,
const BitsetView& bitset,
DataType data_type);
} // namespace milvus::query

View File

@ -24,9 +24,8 @@ FloatSegmentIndexSearch(const segcore::SegmentGrowingImpl& segment,
const SearchInfo& info,
const void* query_data,
int64_t num_queries,
int64_t ins_barrier,
const BitsetView& bitset,
SubSearchResult& results) {
SearchResult& search_result) {
auto& schema = segment.get_schema();
auto& indexing_record = segment.get_indexing_record();
auto& record = segment.get_insert_record();
@ -49,9 +48,8 @@ FloatSegmentIndexSearch(const segcore::SegmentGrowingImpl& segment,
auto indexing = field_indexing.get_segment_indexing();
SearchInfo search_conf = field_indexing.get_search_params(info);
auto vec_index = dynamic_cast<index::VectorIndex*>(indexing);
auto result =
SearchOnIndex(search_dataset, *vec_index, search_conf, bitset);
results.merge(result);
SearchOnIndex(
search_dataset, *vec_index, search_conf, bitset, search_result);
}
}
@ -62,7 +60,7 @@ SearchOnGrowing(const segcore::SegmentGrowingImpl& segment,
int64_t num_queries,
Timestamp timestamp,
const BitsetView& bitset,
SearchResult& results) {
SearchResult& search_result) {
auto& schema = segment.get_schema();
auto& record = segment.get_insert_record();
auto active_count =
@ -84,23 +82,13 @@ SearchOnGrowing(const segcore::SegmentGrowingImpl& segment,
auto round_decimal = info.round_decimal_;
// step 2: small indexing search
SubSearchResult final_qr(num_queries, topk, metric_type, round_decimal);
dataset::SearchDataset search_dataset{
metric_type, num_queries, topk, round_decimal, dim, query_data};
if (segment.get_indexing_record().SyncDataWithIndex(field.get_id())) {
FloatSegmentIndexSearch(segment,
info,
query_data,
num_queries,
active_count,
bitset,
final_qr);
results.distances_ = std::move(final_qr.mutable_distances());
results.seg_offsets_ = std::move(final_qr.mutable_seg_offsets());
results.unity_topK_ = topk;
results.total_nq_ = num_queries;
FloatSegmentIndexSearch(
segment, info, query_data, num_queries, bitset, search_result);
} else {
SubSearchResult final_qr(num_queries, topk, metric_type, round_decimal);
dataset::SearchDataset search_dataset{
metric_type, num_queries, topk, round_decimal, dim, query_data};
std::shared_lock<std::shared_mutex> read_chunk_mutex(
segment.get_chunk_mutex());
int32_t current_chunk_id = 0;
@ -119,25 +107,44 @@ SearchOnGrowing(const segcore::SegmentGrowingImpl& segment,
auto size_per_chunk = element_end - element_begin;
auto sub_view = bitset.subview(element_begin, size_per_chunk);
auto sub_qr = BruteForceSearch(search_dataset,
chunk_data,
size_per_chunk,
info.search_params_,
sub_view,
data_type);
if (info.group_by_field_id_.has_value()) {
auto sub_qr = BruteForceSearchIterators(search_dataset,
chunk_data,
size_per_chunk,
info.search_params_,
sub_view,
data_type);
final_qr.merge(sub_qr);
} else {
auto sub_qr = BruteForceSearch(search_dataset,
chunk_data,
size_per_chunk,
info.search_params_,
sub_view,
data_type);
// convert chunk uid to segment uid
for (auto& x : sub_qr.mutable_seg_offsets()) {
if (x != -1) {
x += chunk_id * vec_size_per_chunk;
// convert chunk uid to segment uid
for (auto& x : sub_qr.mutable_seg_offsets()) {
if (x != -1) {
x += chunk_id * vec_size_per_chunk;
}
}
final_qr.merge(sub_qr);
}
final_qr.merge(sub_qr);
}
results.distances_ = std::move(final_qr.mutable_distances());
results.seg_offsets_ = std::move(final_qr.mutable_seg_offsets());
results.unity_topK_ = topk;
results.total_nq_ = num_queries;
if (info.group_by_field_id_.has_value()) {
search_result.AssembleChunkVectorIterators(
num_queries,
max_chunk,
vec_size_per_chunk,
final_qr.chunk_iterators());
} else {
search_result.distances_ = std::move(final_qr.mutable_distances());
search_result.seg_offsets_ =
std::move(final_qr.mutable_seg_offsets());
}
search_result.unity_topK_ = topk;
search_result.total_nq_ = num_queries;
}
}

View File

@ -23,6 +23,6 @@ SearchOnGrowing(const segcore::SegmentGrowingImpl& segment,
int64_t num_queries,
Timestamp timestamp,
const BitsetView& bitset,
SearchResult& results);
SearchResult& search_result);
} // namespace milvus::query

View File

@ -10,33 +10,28 @@
// or implied. See the License for the specific language governing permissions and limitations under the License
#include "SearchOnIndex.h"
#include "query/GroupByOperator.h"
namespace milvus::query {
SubSearchResult
void
SearchOnIndex(const dataset::SearchDataset& search_dataset,
const index::VectorIndex& indexing,
const SearchInfo& search_conf,
const BitsetView& bitset) {
const BitsetView& bitset,
SearchResult& search_result) {
auto num_queries = search_dataset.num_queries;
auto topK = search_dataset.topk;
auto dim = search_dataset.dim;
auto metric_type = search_dataset.metric_type;
auto round_decimal = search_dataset.round_decimal;
auto dataset =
knowhere::GenDataSet(num_queries, dim, search_dataset.query_data);
// NOTE: VecIndex Query API forget to add const qualifier
// NOTE: use const_cast as a workaround
auto& indexing_nonconst = const_cast<index::VectorIndex&>(indexing);
auto ans = indexing_nonconst.Query(dataset, search_conf, bitset);
SubSearchResult sub_qr(num_queries, topK, metric_type, round_decimal);
std::copy_n(
ans->distances_.data(), num_queries * topK, sub_qr.get_distances());
std::copy_n(
ans->seg_offsets_.data(), num_queries * topK, sub_qr.get_seg_offsets());
sub_qr.round_values();
return sub_qr;
if (!PrepareVectorIteratorsFromIndex(search_conf,
num_queries,
dataset,
search_result,
bitset,
indexing)) {
indexing.Query(dataset, search_conf, bitset, search_result);
}
}
} // namespace milvus::query

View File

@ -19,10 +19,11 @@
namespace milvus::query {
SubSearchResult
void
SearchOnIndex(const dataset::SearchDataset& search_dataset,
const index::VectorIndex& indexing,
const SearchInfo& search_conf,
const BitsetView& bitset);
const BitsetView& bitset,
SearchResult& search_result);
} // namespace milvus::query

View File

@ -17,6 +17,7 @@
#include "query/SearchBruteForce.h"
#include "query/SearchOnSealed.h"
#include "query/helper.h"
#include "query/GroupByOperator.h"
namespace milvus::query {
@ -27,8 +28,8 @@ SearchOnSealedIndex(const Schema& schema,
const void* query_data,
int64_t num_queries,
const BitsetView& bitset,
SearchResult& result) {
auto topk = search_info.topk_;
SearchResult& search_result) {
auto topK = search_info.topk_;
auto round_decimal = search_info.round_decimal_;
auto field_id = search_info.field_id_;
@ -42,20 +43,19 @@ SearchOnSealedIndex(const Schema& schema,
AssertInfo(field_indexing->metric_type_ == search_info.metric_type_,
"Metric type of field index isn't the same with search info");
auto final = [&] {
auto ds = knowhere::GenDataSet(num_queries, dim, query_data);
auto vec_index =
dynamic_cast<index::VectorIndex*>(field_indexing->indexing_.get());
auto dataset = knowhere::GenDataSet(num_queries, dim, query_data);
auto vec_index =
dynamic_cast<index::VectorIndex*>(field_indexing->indexing_.get());
if (!PrepareVectorIteratorsFromIndex(search_info,
num_queries,
dataset,
search_result,
bitset,
*vec_index)) {
auto index_type = vec_index->GetIndexType();
return vec_index->Query(ds, search_info, bitset);
}();
if (final->iterators.has_value()) {
result.iterators = std::move(final->iterators);
} else {
float* distances = final->distances_.data();
auto total_num = num_queries * topk;
vec_index->Query(dataset, search_info, bitset, search_result);
float* distances = search_result.distances_.data();
auto total_num = num_queries * topK;
if (round_decimal != -1) {
const float multiplier = pow(10.0, round_decimal);
for (int i = 0; i < total_num; i++) {
@ -63,11 +63,9 @@ SearchOnSealedIndex(const Schema& schema,
std::round(distances[i] * multiplier) / multiplier;
}
}
result.seg_offsets_ = std::move(final->seg_offsets_);
result.distances_ = std::move(final->distances_);
}
result.total_nq_ = num_queries;
result.unity_topK_ = topk;
search_result.total_nq_ = num_queries;
search_result.unity_topK_ = topK;
}
void
@ -91,15 +89,26 @@ SearchOnSealed(const Schema& schema,
auto data_type = field.get_data_type();
CheckBruteForceSearchParam(field, search_info);
auto sub_qr = BruteForceSearch(dataset,
vec_data,
row_count,
search_info.search_params_,
bitset,
data_type);
if (search_info.group_by_field_id_.has_value()) {
auto sub_qr = BruteForceSearchIterators(dataset,
vec_data,
row_count,
search_info.search_params_,
bitset,
data_type);
result.AssembleChunkVectorIterators(
num_queries, 1, -1, sub_qr.chunk_iterators());
} else {
auto sub_qr = BruteForceSearch(dataset,
vec_data,
row_count,
search_info.search_params_,
bitset,
data_type);
result.distances_ = std::move(sub_qr.mutable_distances());
result.seg_offsets_ = std::move(sub_qr.mutable_seg_offsets());
result.distances_ = std::move(sub_qr.mutable_distances());
result.seg_offsets_ = std::move(sub_qr.mutable_seg_offsets());
}
result.unity_topK_ = dataset.topk;
result.total_nq_ = dataset.num_queries;
}

View File

@ -25,7 +25,7 @@ SearchOnSealedIndex(const Schema& schema,
const void* query_data,
int64_t num_queries,
const BitsetView& view,
SearchResult& result);
SearchResult& search_result);
void
SearchOnSealed(const Schema& schema,

View File

@ -74,13 +74,19 @@ SubSearchResult::merge_impl(const SubSearchResult& right) {
}
void
SubSearchResult::merge(const SubSearchResult& sub_result) {
AssertInfo(metric_type_ == sub_result.metric_type_,
SubSearchResult::merge(const SubSearchResult& other) {
AssertInfo(metric_type_ == other.metric_type_,
"[SubSearchResult]Metric type check failed when merge");
if (PositivelyRelated(metric_type_)) {
this->merge_impl<true>(sub_result);
if (!other.chunk_iterators_.empty()) {
std::move(std::begin(other.chunk_iterators_),
std::end(other.chunk_iterators_),
std::back_inserter(this->chunk_iterators_));
} else {
this->merge_impl<false>(sub_result);
if (PositivelyRelated(metric_type_)) {
this->merge_impl<true>(other);
} else {
this->merge_impl<false>(other);
}
}
}

View File

@ -17,21 +17,37 @@
#include "common/Types.h"
#include "common/Utils.h"
#include "knowhere/index_node.h"
namespace milvus::query {
class SubSearchResult {
public:
SubSearchResult(int64_t num_queries,
int64_t topk,
const MetricType& metric_type,
int64_t round_decimal)
SubSearchResult(
int64_t num_queries,
int64_t topk,
const MetricType& metric_type,
int64_t round_decimal,
const std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>&
iters)
: num_queries_(num_queries),
topk_(topk),
round_decimal_(round_decimal),
metric_type_(metric_type),
seg_offsets_(num_queries * topk, INVALID_SEG_OFFSET),
distances_(num_queries * topk, init_value(metric_type)) {
distances_(num_queries * topk, init_value(metric_type)),
chunk_iterators_(std::move(iters)) {
}
SubSearchResult(int64_t num_queries,
int64_t topk,
const MetricType& metric_type,
int64_t round_decimal)
: SubSearchResult(
num_queries,
topk,
metric_type,
round_decimal,
std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>{}) {
}
SubSearchResult(SubSearchResult&& other) noexcept
@ -40,7 +56,8 @@ class SubSearchResult {
round_decimal_(other.round_decimal_),
metric_type_(std::move(other.metric_type_)),
seg_offsets_(std::move(other.seg_offsets_)),
distances_(std::move(other.distances_)) {
distances_(std::move(other.distances_)),
chunk_iterators_(std::move(other.chunk_iterators_)) {
}
public:
@ -95,7 +112,12 @@ class SubSearchResult {
round_values();
void
merge(const SubSearchResult& sub_result);
merge(const SubSearchResult& other);
const std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>&
chunk_iterators() {
return this->chunk_iterators_;
}
private:
template <bool is_desc>
@ -109,6 +131,8 @@ class SubSearchResult {
knowhere::MetricType metric_type_;
std::vector<int64_t> seg_offsets_;
std::vector<float> distances_;
std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>
chunk_iterators_;
};
} // namespace milvus::query

View File

@ -190,8 +190,8 @@ ExecPlanNodeVisitor::VectorVisitorImpl(VectorPlanNode& node) {
timestamp_,
final_view,
search_result);
if (search_result.iterators.has_value()) {
GroupBy(search_result.iterators.value(),
if (search_result.vector_iterators_.has_value()) {
GroupBy(search_result.vector_iterators_.value(),
node.search_info_,
search_result.group_by_values_,
*segment,

View File

@ -87,23 +87,24 @@ VectorFieldIndexing::GetDataFromIndex(const int64_t* seg_offsets,
void
VectorFieldIndexing::AppendSegmentIndex(int64_t reserved_offset,
int64_t size,
const VectorBase* vec_base,
const VectorBase* field_raw_data,
const void* data_source) {
AssertInfo(field_meta_.get_data_type() == DataType::VECTOR_FLOAT,
"Data type of vector field is not VECTOR_FLOAT");
auto dim = field_meta_.get_dim();
auto conf = get_build_params();
auto source = dynamic_cast<const ConcurrentVector<FloatVector>*>(vec_base);
auto source =
dynamic_cast<const ConcurrentVector<FloatVector>*>(field_raw_data);
auto per_chunk = source->get_size_per_chunk();
auto size_per_chunk = source->get_size_per_chunk();
//append vector [vector_id_beg, vector_id_end] into index
//build index [vector_id_beg, build_threshold) when index not exist
if (!build) {
idx_t vector_id_beg = index_cur_.load();
idx_t vector_id_end = get_build_threshold() - 1;
auto chunk_id_beg = vector_id_beg / per_chunk;
auto chunk_id_end = vector_id_end / per_chunk;
auto chunk_id_beg = vector_id_beg / size_per_chunk;
auto chunk_id_end = vector_id_end / size_per_chunk;
int64_t vec_num = vector_id_end - vector_id_beg + 1;
// for train index
@ -111,7 +112,7 @@ VectorFieldIndexing::AppendSegmentIndex(int64_t reserved_offset,
unique_ptr<float[]> vec_data;
//all train data in one chunk
if (chunk_id_beg == chunk_id_end) {
data_addr = vec_base->get_chunk_data(chunk_id_beg);
data_addr = field_raw_data->get_chunk_data(chunk_id_beg);
} else {
//merge data from multiple chunks together
vec_data = std::make_unique<float[]>(vec_num * dim);
@ -122,12 +123,13 @@ VectorFieldIndexing::AppendSegmentIndex(int64_t reserved_offset,
int chunk_offset = 0;
int chunk_copysz =
chunk_id == chunk_id_end
? vector_id_end - chunk_id * per_chunk + 1
: per_chunk;
std::memcpy(vec_data.get() + offset * dim,
(const float*)vec_base->get_chunk_data(chunk_id) +
chunk_offset * dim,
chunk_copysz * dim * sizeof(float));
? vector_id_end - chunk_id * size_per_chunk + 1
: size_per_chunk;
std::memcpy(
vec_data.get() + offset * dim,
(const float*)field_raw_data->get_chunk_data(chunk_id) +
chunk_offset * dim,
chunk_copysz * dim * sizeof(float));
offset += chunk_copysz;
}
data_addr = vec_data.get();
@ -146,8 +148,8 @@ VectorFieldIndexing::AppendSegmentIndex(int64_t reserved_offset,
//append rest data when index has built
idx_t vector_id_beg = index_cur_.load();
idx_t vector_id_end = reserved_offset + size - 1;
auto chunk_id_beg = vector_id_beg / per_chunk;
auto chunk_id_end = vector_id_end / per_chunk;
auto chunk_id_beg = vector_id_beg / size_per_chunk;
auto chunk_id_end = vector_id_end / size_per_chunk;
int64_t vec_num = vector_id_end - vector_id_beg + 1;
if (vec_num <= 0) {
@ -163,11 +165,12 @@ VectorFieldIndexing::AppendSegmentIndex(int64_t reserved_offset,
for (int chunk_id = chunk_id_beg; chunk_id <= chunk_id_end;
chunk_id++) {
int chunk_offset = chunk_id == chunk_id_beg
? index_cur_ - chunk_id * per_chunk
? index_cur_ - chunk_id * size_per_chunk
: 0;
int chunk_sz = chunk_id == chunk_id_end
? vector_id_end % per_chunk - chunk_offset + 1
: per_chunk - chunk_offset;
int chunk_sz =
chunk_id == chunk_id_end
? vector_id_end % size_per_chunk - chunk_offset + 1
: size_per_chunk - chunk_offset;
auto dataset = knowhere::GenDataSet(
chunk_sz,
dim,

View File

@ -173,7 +173,7 @@ class VectorFieldIndexing : public FieldIndexing {
void
AppendSegmentIndex(int64_t reserved_offset,
int64_t size,
const VectorBase* vec_base,
const VectorBase* field_raw_data,
const void* data_source) override;
void
@ -289,11 +289,11 @@ class IndexingRecord {
indexing->get_field_meta().get_data_type() ==
DataType::VECTOR_FLOAT &&
reserved_offset + size >= indexing->get_build_threshold()) {
auto vec_base = record.get_field_data_base(fieldId);
auto field_raw_data = record.get_field_data_base(fieldId);
indexing->AppendSegmentIndex(
reserved_offset,
size,
vec_base,
field_raw_data,
stream_data->vectors().float_vector().data().data());
}
}

View File

@ -441,24 +441,8 @@ SegmentGrowingImpl::vector_search(SearchInfo& search_info,
Timestamp timestamp,
const BitsetView& bitset,
SearchResult& output) const {
auto& sealed_indexing = this->get_sealed_indexing_record();
if (sealed_indexing.is_ready(search_info.field_id_)) {
query::SearchOnSealedIndex(this->get_schema(),
sealed_indexing,
search_info,
query_data,
query_count,
bitset,
output);
} else {
query::SearchOnGrowing(*this,
search_info,
query_data,
query_count,
timestamp,
bitset,
output);
}
query::SearchOnGrowing(
*this, search_info, query_data, query_count, timestamp, bitset, output);
}
std::unique_ptr<DataArray>

View File

@ -2125,10 +2125,10 @@ TEST(CApiTest, Indexing_Without_Predicate) {
auto vec_index = dynamic_cast<VectorIndex*>(indexing.get());
auto search_plan = reinterpret_cast<milvus::query::Plan*>(plan);
SearchInfo search_info = search_plan->plan_node_->search_info_;
auto result_on_index =
vec_index->Query(query_dataset, search_info, nullptr);
auto ids = result_on_index->seg_offsets_.data();
auto dis = result_on_index->distances_.data();
SearchResult result_on_index;
vec_index->Query(query_dataset, search_info, nullptr, result_on_index);
auto ids = result_on_index.seg_offsets_.data();
auto dis = result_on_index.distances_.data();
std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
std::vector<float> vec_dis;
for (int j = 0; j < TOPK * num_queries; ++j) {
@ -2276,10 +2276,10 @@ TEST(CApiTest, Indexing_Expr_Without_Predicate) {
auto vec_index = dynamic_cast<VectorIndex*>(indexing.get());
auto search_plan = reinterpret_cast<milvus::query::Plan*>(plan);
SearchInfo search_info = search_plan->plan_node_->search_info_;
auto result_on_index =
vec_index->Query(query_dataset, search_info, nullptr);
auto ids = result_on_index->seg_offsets_.data();
auto dis = result_on_index->distances_.data();
SearchResult result_on_index;
vec_index->Query(query_dataset, search_info, nullptr, result_on_index);
auto ids = result_on_index.seg_offsets_.data();
auto dis = result_on_index.distances_.data();
std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
std::vector<float> vec_dis;
for (int j = 0; j < TOPK * num_queries; ++j) {
@ -2456,10 +2456,10 @@ TEST(CApiTest, Indexing_With_float_Predicate_Range) {
auto vec_index = dynamic_cast<VectorIndex*>(indexing.get());
auto search_plan = reinterpret_cast<milvus::query::Plan*>(plan);
SearchInfo search_info = search_plan->plan_node_->search_info_;
auto result_on_index =
vec_index->Query(query_dataset, search_info, nullptr);
auto ids = result_on_index->seg_offsets_.data();
auto dis = result_on_index->distances_.data();
SearchResult result_on_index;
vec_index->Query(query_dataset, search_info, nullptr, result_on_index);
auto ids = result_on_index.seg_offsets_.data();
auto dis = result_on_index.distances_.data();
std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
std::vector<float> vec_dis;
for (int j = 0; j < TOPK * num_queries; ++j) {
@ -2638,10 +2638,10 @@ TEST(CApiTest, Indexing_Expr_With_float_Predicate_Range) {
auto vec_index = dynamic_cast<VectorIndex*>(indexing.get());
auto search_plan = reinterpret_cast<milvus::query::Plan*>(plan);
SearchInfo search_info = search_plan->plan_node_->search_info_;
auto result_on_index =
vec_index->Query(query_dataset, search_info, nullptr);
auto ids = result_on_index->seg_offsets_.data();
auto dis = result_on_index->distances_.data();
SearchResult result_on_index;
vec_index->Query(query_dataset, search_info, nullptr, result_on_index);
auto ids = result_on_index.seg_offsets_.data();
auto dis = result_on_index.distances_.data();
std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
std::vector<float> vec_dis;
for (int j = 0; j < TOPK * num_queries; ++j) {
@ -2812,10 +2812,10 @@ TEST(CApiTest, Indexing_With_float_Predicate_Term) {
auto vec_index = dynamic_cast<VectorIndex*>(indexing.get());
auto search_plan = reinterpret_cast<milvus::query::Plan*>(plan);
SearchInfo search_info = search_plan->plan_node_->search_info_;
auto result_on_index =
vec_index->Query(query_dataset, search_info, nullptr);
auto ids = result_on_index->seg_offsets_.data();
auto dis = result_on_index->distances_.data();
SearchResult result_on_index;
vec_index->Query(query_dataset, search_info, nullptr, result_on_index);
auto ids = result_on_index.seg_offsets_.data();
auto dis = result_on_index.distances_.data();
std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
std::vector<float> vec_dis;
for (int j = 0; j < TOPK * num_queries; ++j) {
@ -2987,10 +2987,10 @@ TEST(CApiTest, Indexing_Expr_With_float_Predicate_Term) {
auto vec_index = dynamic_cast<VectorIndex*>(indexing.get());
auto search_plan = reinterpret_cast<milvus::query::Plan*>(plan);
SearchInfo search_info = search_plan->plan_node_->search_info_;
auto result_on_index =
vec_index->Query(query_dataset, search_info, nullptr);
auto ids = result_on_index->seg_offsets_.data();
auto dis = result_on_index->distances_.data();
SearchResult result_on_index;
vec_index->Query(query_dataset, search_info, nullptr, result_on_index);
auto ids = result_on_index.seg_offsets_.data();
auto dis = result_on_index.distances_.data();
std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
std::vector<float> vec_dis;
for (int j = 0; j < TOPK * num_queries; ++j) {
@ -3169,10 +3169,10 @@ TEST(CApiTest, Indexing_With_binary_Predicate_Range) {
auto vec_index = dynamic_cast<VectorIndex*>(indexing.get());
auto search_plan = reinterpret_cast<milvus::query::Plan*>(plan);
SearchInfo search_info = search_plan->plan_node_->search_info_;
auto result_on_index =
vec_index->Query(query_dataset, search_info, nullptr);
auto ids = result_on_index->seg_offsets_.data();
auto dis = result_on_index->distances_.data();
SearchResult result_on_index;
vec_index->Query(query_dataset, search_info, nullptr, result_on_index);
auto ids = result_on_index.seg_offsets_.data();
auto dis = result_on_index.distances_.data();
std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
std::vector<float> vec_dis;
for (int j = 0; j < TOPK * num_queries; ++j) {
@ -3351,10 +3351,10 @@ TEST(CApiTest, Indexing_Expr_With_binary_Predicate_Range) {
auto vec_index = dynamic_cast<VectorIndex*>(indexing.get());
auto search_plan = reinterpret_cast<milvus::query::Plan*>(plan);
SearchInfo search_info = search_plan->plan_node_->search_info_;
auto result_on_index =
vec_index->Query(query_dataset, search_info, nullptr);
auto ids = result_on_index->seg_offsets_.data();
auto dis = result_on_index->distances_.data();
SearchResult result_on_index;
vec_index->Query(query_dataset, search_info, nullptr, result_on_index);
auto ids = result_on_index.seg_offsets_.data();
auto dis = result_on_index.distances_.data();
std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
std::vector<float> vec_dis;
for (int j = 0; j < TOPK * num_queries; ++j) {
@ -3527,10 +3527,10 @@ TEST(CApiTest, Indexing_With_binary_Predicate_Term) {
auto vec_index = dynamic_cast<VectorIndex*>(indexing.get());
auto search_plan = reinterpret_cast<milvus::query::Plan*>(plan);
SearchInfo search_info = search_plan->plan_node_->search_info_;
auto result_on_index =
vec_index->Query(query_dataset, search_info, nullptr);
auto ids = result_on_index->seg_offsets_.data();
auto dis = result_on_index->distances_.data();
SearchResult result_on_index;
vec_index->Query(query_dataset, search_info, nullptr, result_on_index);
auto ids = result_on_index.seg_offsets_.data();
auto dis = result_on_index.distances_.data();
std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
std::vector<float> vec_dis;
for (int j = 0; j < TOPK * num_queries; ++j) {
@ -3725,10 +3725,10 @@ TEST(CApiTest, Indexing_Expr_With_binary_Predicate_Term) {
auto vec_index = dynamic_cast<VectorIndex*>(indexing.get());
auto search_plan = reinterpret_cast<milvus::query::Plan*>(plan);
SearchInfo search_info = search_plan->plan_node_->search_info_;
auto result_on_index =
vec_index->Query(query_dataset, search_info, nullptr);
auto ids = result_on_index->seg_offsets_.data();
auto dis = result_on_index->distances_.data();
SearchResult result_on_index;
vec_index->Query(query_dataset, search_info, nullptr, result_on_index);
auto ids = result_on_index.seg_offsets_.data();
auto dis = result_on_index.distances_.data();
std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
std::vector<float> vec_dis;
for (int j = 0; j < TOPK * num_queries; ++j) {
@ -3945,9 +3945,9 @@ TEST(CApiTest, SealedSegment_search_float_Predicate_Range) {
search_info.metric_type_ = knowhere::metric::L2;
search_info.search_params_ = generate_search_conf(
IndexEnum::INDEX_FAISS_IVFSQ8, knowhere::metric::L2);
auto result_on_index =
vec_index->Query(query_dataset, search_info, nullptr);
EXPECT_EQ(result_on_index->distances_.size(), num_queries * TOPK);
SearchResult result_on_index;
vec_index->Query(query_dataset, search_info, nullptr, result_on_index);
EXPECT_EQ(result_on_index.distances_.size(), num_queries * TOPK);
status = LoadFieldRawData(segment, 101, counter_col.data(), N);
ASSERT_EQ(status.error_code, Success);
@ -4192,10 +4192,10 @@ TEST(CApiTest, SealedSegment_search_float_With_Expr_Predicate_Range) {
auto vec_index = dynamic_cast<VectorIndex*>(indexing.get());
auto search_plan = reinterpret_cast<milvus::query::Plan*>(plan);
SearchInfo search_info = search_plan->plan_node_->search_info_;
auto result_on_index =
vec_index->Query(query_dataset, search_info, nullptr);
auto ids = result_on_index->seg_offsets_.data();
auto dis = result_on_index->distances_.data();
SearchResult result_on_index;
vec_index->Query(query_dataset, search_info, nullptr, result_on_index);
auto ids = result_on_index.seg_offsets_.data();
auto dis = result_on_index.distances_.data();
std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
std::vector<float> vec_dis;
for (int j = 0; j < TOPK * num_queries; ++j) {
@ -4943,10 +4943,10 @@ TEST(CApiTest, Indexing_Without_Predicate_float16) {
auto vec_index = dynamic_cast<VectorIndex*>(indexing.get());
auto search_plan = reinterpret_cast<milvus::query::Plan*>(plan);
SearchInfo search_info = search_plan->plan_node_->search_info_;
auto result_on_index =
vec_index->Query(query_dataset, search_info, nullptr);
auto ids = result_on_index->seg_offsets_.data();
auto dis = result_on_index->distances_.data();
SearchResult result_on_index;
vec_index->Query(query_dataset, search_info, nullptr, result_on_index);
auto ids = result_on_index.seg_offsets_.data();
auto dis = result_on_index.distances_.data();
std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
std::vector<float> vec_dis;
for (int j = 0; j < TOPK * num_queries; ++j) {
@ -5095,10 +5095,10 @@ TEST(CApiTest, Indexing_Without_Predicate_bfloat16) {
auto vec_index = dynamic_cast<VectorIndex*>(indexing.get());
auto search_plan = reinterpret_cast<milvus::query::Plan*>(plan);
SearchInfo search_info = search_plan->plan_node_->search_info_;
auto result_on_index =
vec_index->Query(query_dataset, search_info, nullptr);
auto ids = result_on_index->seg_offsets_.data();
auto dis = result_on_index->distances_.data();
SearchResult result_on_index;
vec_index->Query(query_dataset, search_info, nullptr, result_on_index);
auto ids = result_on_index.seg_offsets_.data();
auto dis = result_on_index.distances_.data();
std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
std::vector<float> vec_dis;
for (int j = 0; j < TOPK * num_queries; ++j) {

View File

@ -9,10 +9,6 @@
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
//
// Created by zilliz on 2023/12/1.
//
#include <gtest/gtest.h>
#include "common/Schema.h"
#include "segcore/SegmentSealedImpl.h"
@ -52,7 +48,30 @@ prepareSegmentSystemFieldData(const std::unique_ptr<SegmentSealed>& segment,
segment->LoadFieldData(TimestampFieldID, field_data_info);
}
TEST(GroupBY, Normal2) {
int GetSearchResultBound(const SearchResult& search_result){
int i = 0;
for(; i < search_result.seg_offsets_.size(); i++){
if(search_result.seg_offsets_[i]==INVALID_SEG_OFFSET) break;
}
return i - 1;
}
void CheckGroupBySearchResult(const SearchResult& search_result, int topK, int nq, bool strict){
int total = topK * nq;
ASSERT_EQ(search_result.group_by_values_.size(), total);
ASSERT_EQ(search_result.seg_offsets_.size(), total);
ASSERT_EQ(search_result.distances_.size(), total);
ASSERT_TRUE(search_result.seg_offsets_[0]!=INVALID_SEG_OFFSET);
int res_bound = GetSearchResultBound(search_result);
ASSERT_TRUE(res_bound>0);
if(strict){
ASSERT_TRUE(res_bound==total-1);
} else {
ASSERT_TRUE(res_bound==total-1||search_result.seg_offsets_[res_bound+1]==INVALID_SEG_OFFSET);
}
}
TEST(GroupBY, SealedIndex) {
using namespace milvus;
using namespace milvus::query;
using namespace milvus::segcore;
@ -97,6 +116,7 @@ TEST(GroupBY, Normal2) {
load_index_info.index = std::move(indexing);
load_index_info.index_params[METRICS_TYPE] = knowhere::metric::L2;
segment->LoadIndex(load_index_info);
int topK = 100;
//4. search group by int8
{
@ -111,7 +131,6 @@ TEST(GroupBY, Normal2) {
placeholder_tag: "$0"
>)";
auto plan_str = translate_text_plan_to_binary_plan(raw_plan);
auto plan =
CreateSearchPlanByExpr(*schema, plan_str.data(), plan_str.size());
@ -122,12 +141,9 @@ TEST(GroupBY, Normal2) {
ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());
auto search_result =
segment->Search(plan.get(), ph_group.get(), 1L << 63);
auto& group_by_values = search_result->group_by_values_;
ASSERT_EQ(search_result->group_by_values_.size(),
search_result->seg_offsets_.size());
ASSERT_EQ(search_result->distances_.size(),
search_result->seg_offsets_.size());
CheckGroupBySearchResult(*search_result, topK, num_queries, false);
auto& group_by_values = search_result->group_by_values_;
int size = group_by_values.size();
std::unordered_set<int8_t> i8_set;
float lastDistance = 0.0;
@ -174,12 +190,9 @@ TEST(GroupBY, Normal2) {
ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());
auto search_result =
segment->Search(plan.get(), ph_group.get(), 1L << 63);
auto& group_by_values = search_result->group_by_values_;
ASSERT_EQ(search_result->group_by_values_.size(),
search_result->seg_offsets_.size());
ASSERT_EQ(search_result->distances_.size(),
search_result->seg_offsets_.size());
CheckGroupBySearchResult(*search_result, topK, num_queries, false);
auto& group_by_values = search_result->group_by_values_;
int size = group_by_values.size();
std::unordered_set<int16_t> i16_set;
float lastDistance = 0.0;
@ -226,13 +239,10 @@ TEST(GroupBY, Normal2) {
ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());
auto search_result =
segment->Search(plan.get(), ph_group.get(), 1L << 63);
CheckGroupBySearchResult(*search_result, topK, num_queries, false);
auto& group_by_values = search_result->group_by_values_;
ASSERT_EQ(search_result->group_by_values_.size(),
search_result->seg_offsets_.size());
ASSERT_EQ(search_result->distances_.size(),
search_result->seg_offsets_.size());
int size = group_by_values.size();
std::unordered_set<int32_t> i32_set;
float lastDistance = 0.0;
for (size_t i = 0; i < size; i++) {
@ -278,11 +288,8 @@ TEST(GroupBY, Normal2) {
ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());
auto search_result =
segment->Search(plan.get(), ph_group.get(), 1L << 63);
CheckGroupBySearchResult(*search_result, topK, num_queries, false);
auto& group_by_values = search_result->group_by_values_;
ASSERT_EQ(search_result->group_by_values_.size(),
search_result->seg_offsets_.size());
ASSERT_EQ(search_result->distances_.size(),
search_result->seg_offsets_.size());
int size = group_by_values.size();
std::unordered_set<int64_t> i64_set;
@ -330,12 +337,8 @@ TEST(GroupBY, Normal2) {
ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());
auto search_result =
segment->Search(plan.get(), ph_group.get(), 1L << 63);
CheckGroupBySearchResult(*search_result, topK, num_queries, false);
auto& group_by_values = search_result->group_by_values_;
ASSERT_EQ(search_result->group_by_values_.size(),
search_result->seg_offsets_.size());
ASSERT_EQ(search_result->distances_.size(),
search_result->seg_offsets_.size());
int size = group_by_values.size();
std::unordered_set<std::string> strs_set;
float lastDistance = 0.0;
@ -383,12 +386,8 @@ TEST(GroupBY, Normal2) {
ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());
auto search_result =
segment->Search(plan.get(), ph_group.get(), 1L << 63);
CheckGroupBySearchResult(*search_result, topK, num_queries, false);
auto& group_by_values = search_result->group_by_values_;
ASSERT_EQ(search_result->group_by_values_.size(),
search_result->seg_offsets_.size());
ASSERT_EQ(search_result->distances_.size(),
search_result->seg_offsets_.size());
int size = group_by_values.size();
std::unordered_set<bool> bools_set;
int boolValCount = 0;
@ -415,6 +414,92 @@ TEST(GroupBY, Normal2) {
}
}
TEST(GroupBY, SealedData){
using namespace milvus;
using namespace milvus::query;
using namespace milvus::segcore;
//0. prepare schema
int dim = 64;
auto schema = std::make_shared<Schema>();
auto vec_fid = schema->AddDebugField(
"fakevec", DataType::VECTOR_FLOAT, dim, knowhere::metric::L2);
auto int8_fid = schema->AddDebugField("int8", DataType::INT8);
auto int16_fid = schema->AddDebugField("int16", DataType::INT16);
auto int32_fid = schema->AddDebugField("int32", DataType::INT32);
auto int64_fid = schema->AddDebugField("int64", DataType::INT64);
auto str_fid = schema->AddDebugField("string1", DataType::VARCHAR);
auto bool_fid = schema->AddDebugField("bool", DataType::BOOL);
schema->set_primary_field_id(str_fid);
auto segment = CreateSealedSegment(schema);
size_t N = 100;
//2. load raw data
auto raw_data = DataGen(schema, N);
auto fields = schema->get_fields();
for (auto field_data : raw_data.raw_->fields_data()) {
int64_t field_id = field_data.field_id();
auto info = FieldDataInfo(field_data.field_id(), N);
auto field_meta = fields.at(FieldId(field_id));
info.channel->push(
CreateFieldDataFromDataArray(N, &field_data, field_meta));
info.channel->close();
segment->LoadFieldData(FieldId(field_id), info);
}
prepareSegmentSystemFieldData(segment, N, raw_data);
int topK = 100;
//3. search group by int8
{
const char* raw_plan = R"(vector_anns: <
field_id: 100
query_info: <
topk: 100
metric_type: "L2"
search_params: "{\"ef\": 10}"
group_by_field_id: 101
>
placeholder_tag: "$0"
>)";
auto plan_str = translate_text_plan_to_binary_plan(raw_plan);
auto plan =
CreateSearchPlanByExpr(*schema, plan_str.data(), plan_str.size());
auto num_queries = 1;
auto seed = 1024;
auto ph_group_raw = CreatePlaceholderGroup(num_queries, dim, seed);
auto ph_group =
ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());
auto search_result =
segment->Search(plan.get(), ph_group.get(), 1L << 63);
CheckGroupBySearchResult(*search_result, topK, num_queries, false);
auto& group_by_values = search_result->group_by_values_;
int size = group_by_values.size();
std::unordered_set<int8_t> i8_set;
float lastDistance = 0.0;
for (size_t i = 0; i < size; i++) {
if (std::holds_alternative<int8_t>(group_by_values[i])) {
int8_t g_val = std::get<int8_t>(group_by_values[i]);
ASSERT_FALSE(i8_set.count(g_val) >
0); //no repetition on groupBy field
i8_set.insert(g_val);
auto distance = search_result->distances_.at(i);
ASSERT_TRUE(
lastDistance <=
distance); //distance should be decreased as metrics_type is L2
lastDistance = distance;
} else {
//check padding
ASSERT_EQ(search_result->seg_offsets_[i], INVALID_SEG_OFFSET);
ASSERT_EQ(search_result->distances_[i], 0.0);
}
}
}
}
TEST(GroupBY, Reduce) {
using namespace milvus;
using namespace milvus::query;
@ -540,3 +625,171 @@ TEST(GroupBY, Reduce) {
DeleteSegment(c_segment_1);
DeleteSegment(c_segment_2);
}
TEST(GroupBY, GrowingRawData){
//0. set up growing segment
int dim = 128;
uint64_t seed = 512;
auto schema = std::make_shared<Schema>();
auto metric_type = knowhere::metric::L2;
auto int64_field_id = schema->AddDebugField("int64", DataType::INT64);
auto int32_field_id = schema->AddDebugField("int32", DataType::INT32);
auto vec_field_id = schema->AddDebugField(
"embeddings", DataType::VECTOR_FLOAT, 128, metric_type);
schema->set_primary_field_id(int64_field_id);
auto config = SegcoreConfig::default_config();
config.set_chunk_rows(128);
config.set_enable_interim_segment_index(false);//no growing index, test brute force
auto segment_growing = CreateGrowingSegment(schema, nullptr, 1, config);
auto segment_growing_impl = dynamic_cast<SegmentGrowingImpl*>(segment_growing.get());
//1. prepare raw data in growing segment
int64_t rows_per_batch = 512;
int n_batch = 3;
for(int i = 0; i < n_batch; i++){
auto data_set = DataGen(schema, rows_per_batch);
auto offset = segment_growing_impl->PreInsert(rows_per_batch);
segment_growing_impl->Insert(offset, rows_per_batch, data_set.row_ids_.data(),
data_set.timestamps_.data(),
data_set.raw_);
}
//2. Search group by
const char* raw_plan = R"(vector_anns: <
field_id: 102
query_info: <
topk: 100
metric_type: "L2"
search_params: "{\"ef\": 10}"
group_by_field_id: 101
>
placeholder_tag: "$0"
>)";
auto plan_str = translate_text_plan_to_binary_plan(raw_plan);
auto plan = CreateSearchPlanByExpr(*schema, plan_str.data(), plan_str.size());
auto num_queries = 10;
auto topK = 100;
auto ph_group_raw = CreatePlaceholderGroup(num_queries, dim, seed);
auto ph_group =
ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());
auto search_result =
segment_growing_impl->Search(plan.get(), ph_group.get(), 1L << 63);
CheckGroupBySearchResult(*search_result, topK, num_queries, true);
auto& group_by_values = search_result->group_by_values_;
int idx = 0;
for (int i = 0; i < num_queries; i++){
std::unordered_set<int32_t> i32_set;
float lastDistance = 0.0;
for (int j = 0; j < topK; j++){
if (std::holds_alternative<int32_t>(group_by_values[idx])) {
int32_t g_val = std::get<int32_t>(group_by_values[idx]);
ASSERT_FALSE(i32_set.count(g_val) >0); //no repetition on groupBy field
i32_set.insert(g_val);
auto distance = search_result->distances_.at(idx);
ASSERT_TRUE(
lastDistance <=
distance); //distance should be decreased as metrics_type is L2
lastDistance = distance;
} else {
//check padding
ASSERT_EQ(search_result->seg_offsets_[idx], INVALID_SEG_OFFSET);
ASSERT_EQ(search_result->distances_[idx], 0.0);
}
idx++;
}
}
}
TEST(GroupBY, GrowingIndex){
//0. set up growing segment
int dim = 128;
uint64_t seed = 512;
auto schema = std::make_shared<Schema>();
auto metric_type = knowhere::metric::L2;
auto int64_field_id = schema->AddDebugField("int64", DataType::INT64);
auto int32_field_id = schema->AddDebugField("int32", DataType::INT32);
auto vec_field_id = schema->AddDebugField(
"embeddings", DataType::VECTOR_FLOAT, 128, metric_type);
schema->set_primary_field_id(int64_field_id);
std::map<std::string, std::string> index_params = {
{"index_type", "IVF_FLAT"},
{"metric_type", metric_type},
{"nlist", "128"}};
std::map<std::string, std::string> type_params = {{"dim", "128"}};
FieldIndexMeta fieldIndexMeta(
vec_field_id, std::move(index_params), std::move(type_params));
std::map<FieldId, FieldIndexMeta> fieldMap = {{vec_field_id, fieldIndexMeta}};
IndexMetaPtr metaPtr =
std::make_shared<CollectionIndexMeta>(10000, std::move(fieldMap));
auto config = SegcoreConfig::default_config();
config.set_chunk_rows(128);
config.set_enable_interim_segment_index(true);//no growing index, test growing inter index
config.set_nlist(128);
auto segment_growing = CreateGrowingSegment(schema, metaPtr, 1, config);
auto segment_growing_impl = dynamic_cast<SegmentGrowingImpl*>(segment_growing.get());
//1. prepare raw data in growing segment
int64_t rows_per_batch = 1024;
int n_batch = 10;
for(int i = 0; i < n_batch; i++){
auto data_set = DataGen(schema, rows_per_batch);
auto offset = segment_growing_impl->PreInsert(rows_per_batch);
segment_growing_impl->Insert(offset, rows_per_batch, data_set.row_ids_.data(),
data_set.timestamps_.data(),
data_set.raw_);
}
//2. Search group by int32
const char* raw_plan = R"(vector_anns: <
field_id: 102
query_info: <
topk: 100
metric_type: "L2"
search_params: "{\"ef\": 10}"
group_by_field_id: 101
>
placeholder_tag: "$0"
>)";
auto plan_str = translate_text_plan_to_binary_plan(raw_plan);
auto plan = CreateSearchPlanByExpr(*schema, plan_str.data(), plan_str.size());
auto num_queries = 10;
auto topK = 100;
auto ph_group_raw = CreatePlaceholderGroup(num_queries, dim, seed);
auto ph_group =
ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());
auto search_result =
segment_growing_impl->Search(plan.get(), ph_group.get(), 1L << 63);
CheckGroupBySearchResult(*search_result, topK, num_queries, true);
auto& group_by_values = search_result->group_by_values_;
int idx = 0;
for (int i = 0; i < num_queries; i++){
std::unordered_set<int32_t> i32_set;
float lastDistance = 0.0;
for (int j = 0; j < topK; j++){
if (std::holds_alternative<int32_t>(group_by_values[idx])) {
int32_t g_val = std::get<int32_t>(group_by_values[idx]);
ASSERT_FALSE(i32_set.count(g_val) >0); //no repetition on groupBy field
i32_set.insert(g_val);
auto distance = search_result->distances_.at(idx);
ASSERT_TRUE(
lastDistance <=
distance); //distance should be decreased as metrics_type is L2
lastDistance = distance;
} else {
//check padding
ASSERT_EQ(search_result->seg_offsets_[idx], INVALID_SEG_OFFSET);
ASSERT_EQ(search_result->distances_[idx], 0.0);
}
idx++;
}
}
}

View File

@ -288,13 +288,14 @@ TEST(Indexing, Naive) {
searchInfo.metric_type_ = knowhere::metric::L2;
searchInfo.search_params_ = search_conf;
auto vec_index = dynamic_cast<index::VectorIndex*>(index.get());
auto result = vec_index->Query(query_ds, searchInfo, view);
SearchResult result;
vec_index->Query(query_ds, searchInfo, view, result);
for (int i = 0; i < TOPK; ++i) {
if (result->seg_offsets_[i] < N / 2) {
if (result.seg_offsets_[i] < N / 2) {
std::cout << "WRONG: ";
}
std::cout << result->seg_offsets_[i] << "->" << result->distances_[i]
std::cout << result.seg_offsets_[i] << "->" << result.distances_[i]
<< std::endl;
}
}
@ -397,6 +398,73 @@ INSTANTIATE_TEST_CASE_P(
#endif
std::pair(knowhere::IndexEnum::INDEX_HNSW, knowhere::metric::L2)));
TEST(Indexing, Iterator) {
constexpr int N = 10240;
constexpr int TOPK = 100;
constexpr int dim = 128;
constexpr int chunk_size = 5120;
auto [raw_data, timestamps, uids] = generate_data<dim>(N);
milvus::index::CreateIndexInfo create_index_info;
create_index_info.field_type = DataType::VECTOR_FLOAT;
create_index_info.metric_type = knowhere::metric::L2;
create_index_info.index_type = knowhere::IndexEnum::INDEX_FAISS_IVFFLAT_CC;
create_index_info.index_engine_version =
knowhere::Version::GetCurrentVersion().VersionNumber();
auto index = milvus::index::IndexFactory::GetInstance().CreateIndex(
create_index_info, milvus::storage::FileManagerContext());
auto build_conf = knowhere::Json{
{knowhere::meta::METRIC_TYPE, knowhere::metric::L2},
{knowhere::meta::DIM, std::to_string(dim)},
{knowhere::indexparam::NLIST, "128"},
};
auto search_conf = knowhere::Json{
{knowhere::meta::METRIC_TYPE, knowhere::metric::L2},
{knowhere::indexparam::NPROBE, 4},
};
std::vector<knowhere::DataSetPtr> datasets;
auto raw = raw_data.data();
for (int beg = 0; beg < N; beg += chunk_size) {
auto end = beg + chunk_size;
if (end > N) {
end = N;
}
std::vector<float> ft(raw + dim * beg, raw + dim * end);
auto ds = knowhere::GenDataSet(end - beg, dim, ft.data());
datasets.push_back(ds);
}
for (auto& ds : datasets) {
index->BuildWithDataset(ds, build_conf);
}
auto bitmap = BitsetType(N, false);
BitsetView view = bitmap;
auto query_ds = knowhere::GenDataSet(1, dim, raw_data.data());
milvus::SearchInfo searchInfo;
searchInfo.topk_ = TOPK;
searchInfo.metric_type_ = knowhere::metric::L2;
searchInfo.search_params_ = search_conf;
auto vec_index = dynamic_cast<index::VectorIndex*>(index.get());
knowhere::expected<std::vector<std::shared_ptr<knowhere::IndexNode::iterator>>>
kw_iterators = vec_index->VectorIterators(query_ds, searchInfo, view);
ASSERT_TRUE(kw_iterators.has_value());
ASSERT_EQ(kw_iterators.value().size(), 1);
auto iterator = kw_iterators.value()[0];
ASSERT_TRUE(iterator->HasNext());
while(iterator->HasNext()){
auto [off, dis] = iterator->Next();
ASSERT_TRUE(off >= 0);
ASSERT_TRUE(dis >= 0);
}
}
TEST_P(IndexTest, BuildAndQuery) {
milvus::index::CreateIndexInfo create_index_info;
create_index_info.index_type = index_type;
@ -439,16 +507,17 @@ TEST_P(IndexTest, BuildAndQuery) {
search_info.topk_ = K;
search_info.metric_type_ = metric_type;
search_info.search_params_ = search_conf;
auto result = vec_index->Query(xq_dataset, search_info, nullptr);
EXPECT_EQ(result->total_nq_, NQ);
EXPECT_EQ(result->unity_topK_, K);
EXPECT_EQ(result->distances_.size(), NQ * K);
EXPECT_EQ(result->seg_offsets_.size(), NQ * K);
SearchResult result;
vec_index->Query(xq_dataset, search_info, nullptr, result);
EXPECT_EQ(result.total_nq_, NQ);
EXPECT_EQ(result.unity_topK_, K);
EXPECT_EQ(result.distances_.size(), NQ * K);
EXPECT_EQ(result.seg_offsets_.size(), NQ * K);
if (!is_binary) {
EXPECT_EQ(result->seg_offsets_[0], query_offset);
EXPECT_EQ(result.seg_offsets_[0], query_offset);
}
search_info.search_params_ = range_search_conf;
vec_index->Query(xq_dataset, search_info, nullptr);
vec_index->Query(xq_dataset, search_info, nullptr, result);
}
TEST_P(IndexTest, Mmap) {
@ -497,16 +566,17 @@ TEST_P(IndexTest, Mmap) {
search_info.topk_ = K;
search_info.metric_type_ = metric_type;
search_info.search_params_ = search_conf;
auto result = vec_index->Query(xq_dataset, search_info, nullptr);
EXPECT_EQ(result->total_nq_, NQ);
EXPECT_EQ(result->unity_topK_, K);
EXPECT_EQ(result->distances_.size(), NQ * K);
EXPECT_EQ(result->seg_offsets_.size(), NQ * K);
SearchResult result;
vec_index->Query(xq_dataset, search_info, nullptr, result);
EXPECT_EQ(result.total_nq_, NQ);
EXPECT_EQ(result.unity_topK_, K);
EXPECT_EQ(result.distances_.size(), NQ * K);
EXPECT_EQ(result.seg_offsets_.size(), NQ * K);
if (!is_binary) {
EXPECT_EQ(result->seg_offsets_[0], query_offset);
EXPECT_EQ(result.seg_offsets_[0], query_offset);
}
search_info.search_params_ = range_search_conf;
vec_index->Query(xq_dataset, search_info, nullptr);
vec_index->Query(xq_dataset, search_info, nullptr, result);
}
TEST_P(IndexTest, GetVector) {
@ -658,7 +728,8 @@ TEST(Indexing, SearchDiskAnnWithInvalidParam) {
{knowhere::meta::METRIC_TYPE, metric_type},
{milvus::index::DISK_ANN_QUERY_LIST, K - 1},
};
EXPECT_THROW(vec_index->Query(xq_dataset, search_info, nullptr),
SearchResult result;
EXPECT_THROW(vec_index->Query(xq_dataset, search_info, nullptr, result),
std::runtime_error);
}

View File

@ -115,8 +115,9 @@ TEST(Sealed, without_predicate) {
searchInfo.topk_ = topK;
searchInfo.metric_type_ = knowhere::metric::L2;
searchInfo.search_params_ = search_conf;
auto result = vec_index->Query(query_dataset, searchInfo, nullptr);
auto ref_result = SearchResultToJson(*result);
SearchResult result;
vec_index->Query(query_dataset, searchInfo, nullptr, result);
auto ref_result = SearchResultToJson(result);
LoadIndexInfo load_info;
load_info.field_id = fake_id.get();
@ -235,7 +236,8 @@ TEST(Sealed, with_predicate) {
searchInfo.topk_ = topK;
searchInfo.metric_type_ = knowhere::metric::L2;
searchInfo.search_params_ = search_conf;
auto result = vec_index->Query(query_dataset, searchInfo, nullptr);
SearchResult result;
vec_index->Query(query_dataset, searchInfo, nullptr, result);
LoadIndexInfo load_info;
load_info.field_id = fake_id.get();

View File

@ -78,10 +78,6 @@ func initSearchRequest(ctx context.Context, t *searchTask) error {
if err != nil {
return err
}
if queryInfo.GroupByFieldId != 0 {
t.SearchRequest.IgnoreGrowing = true
// for group by operation, currently, we ignore growing segments
}
t.offset = offset
plan, err := planparserv2.CreateSearchPlan(t.schema.CollectionSchema, t.request.Dsl, annsField, queryInfo)