fix: search-group-by failed to get data from multi-chunked-segment(##… (#38383)

related: #38343

Signed-off-by: MrPresent-Han <chun.han@gmail.com>
Co-authored-by: MrPresent-Han <chun.han@gmail.com>
pull/38536/head
Chun Han 2024-12-13 03:54:43 -05:00 committed by GitHub
parent 3038383e36
commit c1f9158996
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 44 additions and 34 deletions

View File

@ -61,50 +61,53 @@ class GrowingDataGetter : public DataGetter<T> {
template <typename T>
class SealedDataGetter : public DataGetter<T> {
private:
std::shared_ptr<Span<T>> field_data_;
std::shared_ptr<std::vector<std::string_view>> str_field_data_;
const index::ScalarIndex<T>* field_index_;
const segcore::SegmentSealed& segment_;
const FieldId field_id_;
bool from_data_;
mutable std::unordered_map<int64_t, std::vector<std::string_view>>
str_view_map_;
// Fetching a chunk's string views from the segment is CPU-costly, so this map caches them per chunk id.
public:
SealedDataGetter(const segcore::SegmentSealed& segment, FieldId& field_id) {
if (segment.HasFieldData(field_id)) {
if constexpr (std::is_same_v<T, std::string>) {
str_field_data_ =
std::make_shared<std::vector<std::string_view>>(
segment.chunk_view<std::string_view>(field_id, 0)
.first);
} else {
auto span = segment.chunk_data<T>(field_id, 0);
field_data_ = std::make_shared<Span<T>>(
span.data(), span.valid_data(), span.row_count());
}
} else if (segment.HasIndex(field_id)) {
this->field_index_ = &(segment.chunk_scalar_index<T>(field_id, 0));
} else {
PanicInfo(UnexpectedError,
"The segment used to init data getter has no effective "
"data source, neither"
"index or data");
// Builds a getter bound to `segment` and `field_id`.
//
// The data source is decided once, at construction: `from_data_` records
// whether the segment reports raw field data for the field. When it does not,
// the segment must at least report an index for the field; otherwise
// PanicInfo is raised with UnexpectedError and the segment id.
SealedDataGetter(const segcore::SegmentSealed& segment, FieldId& field_id)
    : segment_(segment),
      field_id_(field_id),
      from_data_(segment.HasFieldData(field_id)) {
    if (!from_data_ && !segment_.HasIndex(field_id_)) {
        // Adjacent string literals are concatenated verbatim, so every
        // fragment boundary needs its own separating space.
        PanicInfo(
            UnexpectedError,
            "The segment:{} used to init data getter has no effective "
            "data source, neither "
            "index nor data",
            segment_.get_segment_id());
    }
}
// Copy constructor: copies the field_data_, str_field_data_ and field_index_
// handles from `other`; the body is empty.
// NOTE(review): segment_, field_id_ and from_data_ are not initialized here,
// and segment_ is a reference member. This constructor therefore looks like it
// belongs to the pre-change side of this diff — confirm before relying on it.
SealedDataGetter(const SealedDataGetter<T>& other)
: field_data_(other.field_data_),
str_field_data_(other.str_field_data_),
field_index_(other.field_index_) {
}
T
Get(int64_t idx) const {
if (field_data_ || str_field_data_) {
if (from_data_) {
auto id_offset_pair = segment_.get_chunk_by_offset(field_id_, idx);
auto chunk_id = id_offset_pair.first;
auto inner_offset = id_offset_pair.second;
if constexpr (std::is_same_v<T, std::string>) {
if (str_view_map_.find(chunk_id) == str_view_map_.end()) {
// for now, search_group_by does not handle null values
auto [str_chunk_view, _] =
segment_.chunk_view<std::string_view>(field_id_,
chunk_id);
str_view_map_[chunk_id] = std::move(str_chunk_view);
}
auto& str_chunk_view = str_view_map_[chunk_id];
std::string_view str_val_view =
str_field_data_->operator[](idx);
str_chunk_view.operator[](inner_offset);
return std::string(str_val_view.data(), str_val_view.length());
} else {
Span<T> span = segment_.chunk_data<T>(field_id_, chunk_id);
auto raw = span.operator[](inner_offset);
return raw;
}
return field_data_->operator[](idx);
} else {
auto raw = (*field_index_).Reverse_Lookup(idx);
auto& chunk_index = segment_.chunk_scalar_index<T>(field_id_, 0);
auto raw = chunk_index.Reverse_Lookup(idx);
AssertInfo(raw.has_value(), "field data not found");
return raw.value();
}

View File

@ -134,7 +134,7 @@ class ScalarIndex : public IndexBase {
}
// Reports mmap support for this index: returns true only when index_type_ is
// BITMAP_INDEX_TYPE or HYBRID_INDEX_TYPE, false for every other type.
// NOTE(review): two signature lines appear below (without and with
// `override`); presumably they are the old and new sides of the diff hunk —
// confirm which one is live.
virtual bool
IsMmapSupported() const {
IsMmapSupported() const override {
return index_type_ == milvus::index::BITMAP_INDEX_TYPE ||
index_type_ == milvus::index::HYBRID_INDEX_TYPE;
}

View File

@ -165,7 +165,14 @@ class SegmentSealedImpl : public SegmentSealed {
// Maps a row offset of `field_id` to a (chunk id, offset inside that chunk)
// pair.
// Raises PanicInfo with ErrorCode::FieldIDInvalid when `field_id` is not
// present in fields_; otherwise returns (0, offset) unchanged.
// NOTE(review): the unconditional "Not implemented" PanicInfo on the first
// body line looks like the removed side of this diff hunk; if it were live,
// the code after it would presumably be unreachable — confirm.
std::pair<int64_t, int64_t>
get_chunk_by_offset(FieldId field_id, int64_t offset) const override {
PanicInfo(ErrorCode::Unsupported, "Not implemented");
// Reject lookups for fields this segment does not hold.
if (fields_.find(field_id) == fields_.end()) {
PanicInfo(
ErrorCode::FieldIDInvalid,
"Failed to get chunk offset towards a non-existing field:{}",
field_id.get());
}
// for sealed segment, chunk id is always zero and input offset is the target offset
return std::make_pair(0, offset);
}
int64_t