Use bitset_view at search

Signed-off-by: FluorineDog <guilin.gou@zilliz.com>
pull/4973/head^2
FluorineDog 2021-01-15 13:40:51 +08:00 committed by yefu.chen
parent 7d81222550
commit 8fccf7e630
8 changed files with 57 additions and 42 deletions

View File

@ -13,6 +13,8 @@
#include <type_traits>
#include "common/Types.h"
#include <cassert>
#include "VectorTrait.h"
namespace milvus {
// type erasure to work around virtual restriction
class SpanBase {

View File

@ -11,6 +11,7 @@
#pragma once
#include "utils/Types.h"
#include "faiss/utils/BitsetView.h"
#include <faiss/MetricType.h>
#include <string>
#include <boost/align/aligned_allocator.hpp>
@ -75,6 +76,14 @@ using FieldId = fluent::NamedType<int64_t, struct FieldIdTag, fluent::Comparable
using FieldName = fluent::NamedType<std::string, struct FieldNameTag, fluent::Comparable, fluent::Hashable>;
using FieldOffset = fluent::NamedType<int64_t, struct FieldOffsetTag, fluent::Comparable, fluent::Hashable>;
} // namespace milvus
using BitsetView = faiss::BitsetView;
// Builds a view onto a sub-range of an existing bitset view.
//
//   view   - the bitset view to slice.
//   offset - index of the first bit of the sub-range; must be a multiple of 8
//            because the sub-view is addressed through a byte pointer.
//   size   - value forwarded unchanged as the size of the new view
//            (presumably a bit count, matching how callers build BitsetView
//            from `bytes * 8` -- confirm against faiss::BitsetView).
//
// An empty input yields a default-constructed BitsetView; in that case the
// alignment of `offset` is not checked.
inline BitsetView
BitsetSubView(const BitsetView& view, int64_t offset, int64_t size) {
    if (!view.empty()) {
        // Only whole-byte starting positions can be expressed by advancing data().
        assert(offset % 8 == 0);
        const int64_t byte_offset = offset / 8;
        return BitsetView(view.data() + byte_offset, size);
    }
    return BitsetView();
}
#include "VectorTrait.h"
} // namespace milvus

View File

@ -40,7 +40,7 @@ FloatSearch(const segcore::SegmentGrowingImpl& segment,
const float* query_data,
int64_t num_queries,
Timestamp timestamp,
std::optional<const BitmapSimple*> bitmaps_opt,
const BitsetView& bitset,
QueryResult& results) {
auto& schema = segment.get_schema();
auto& indexing_record = segment.get_indexing_record();
@ -79,14 +79,16 @@ FloatSearch(const segcore::SegmentGrowingImpl& segment,
// TODO: use sub_qr
for (int chunk_id = 0; chunk_id < max_indexed_id; ++chunk_id) {
auto bitset = create_bitmap_view(bitmaps_opt, chunk_id);
auto chunk_size = indexing_entry.get_chunk_size();
auto indexing = indexing_entry.get_vec_indexing(chunk_id);
auto sub_qr = SearchOnIndex(query_dataset, *indexing, search_conf, bitset);
auto sub_view = BitsetSubView(bitset, chunk_id * chunk_size, chunk_size);
auto sub_qr = SearchOnIndex(query_dataset, *indexing, search_conf, sub_view);
// convert chunk uid to segment uid
for (auto& x : sub_qr.mutable_labels()) {
if (x != -1) {
x += chunk_id * indexing_entry.get_chunk_size();
x += chunk_id * chunk_size;
}
}
@ -100,15 +102,14 @@ FloatSearch(const segcore::SegmentGrowingImpl& segment,
auto max_chunk = upper_div(ins_barrier, vec_chunk_size);
for (int chunk_id = max_indexed_id; chunk_id < max_chunk; ++chunk_id) {
auto bitmap_view = create_bitmap_view(bitmaps_opt, chunk_id);
auto& chunk = vec_ptr->get_chunk(chunk_id);
auto element_begin = chunk_id * vec_chunk_size;
auto element_end = std::min(ins_barrier, (chunk_id + 1) * vec_chunk_size);
auto chunk_size = element_end - element_begin;
auto sub_qr = FloatSearchBruteForce(query_dataset, chunk.data(), chunk_size, bitmap_view);
auto sub_view = BitsetSubView(bitset, element_begin, chunk_size);
auto sub_qr = FloatSearchBruteForce(query_dataset, chunk.data(), chunk_size, sub_view);
// convert chunk uid to segment uid
for (auto& x : sub_qr.mutable_labels()) {
@ -133,7 +134,7 @@ BinarySearch(const segcore::SegmentGrowingImpl& segment,
const uint8_t* query_data,
int64_t num_queries,
Timestamp timestamp,
std::optional<const BitmapSimple*> bitmaps_opt,
const faiss::BitsetView& bitset,
QueryResult& results) {
auto& schema = segment.get_schema();
auto& indexing_record = segment.get_indexing_record();
@ -160,8 +161,6 @@ BinarySearch(const segcore::SegmentGrowingImpl& segment,
auto total_count = topK * num_queries;
// step 3: small indexing search
// TODO: this is too intrusive
// TODO: use QuerySubResult instead
query::dataset::BinaryQueryDataset query_dataset{metric_type, num_queries, topK, dim, query_data};
auto vec_ptr = record.get_entity<BinaryVector>(vecfield_offset);
@ -178,8 +177,8 @@ BinarySearch(const segcore::SegmentGrowingImpl& segment,
auto element_end = std::min(ins_barrier, (chunk_id + 1) * vec_chunk_size);
auto nsize = element_end - element_begin;
auto bitmap_view = create_bitmap_view(bitmaps_opt, chunk_id);
auto sub_result = BinarySearchBruteForce(query_dataset, chunk.data(), nsize, bitmap_view);
auto sub_view = BitsetSubView(bitset, element_begin, nsize);
auto sub_result = BinarySearchBruteForce(query_dataset, chunk.data(), nsize, sub_view);
// convert chunk uid to segment uid
for (auto& x : sub_result.mutable_labels()) {

View File

@ -28,7 +28,7 @@ FloatSearch(const segcore::SegmentGrowingImpl& segment,
const float* query_data,
int64_t num_queries,
Timestamp timestamp,
std::optional<const BitmapSimple*> bitmap_opt,
const faiss::BitsetView& bitset,
QueryResult& results);
Status
@ -37,6 +37,6 @@ BinarySearch(const segcore::SegmentGrowingImpl& segment,
const uint8_t* query_data,
int64_t num_queries,
Timestamp timestamp,
std::optional<const BitmapSimple*> bitmaps_opt,
const faiss::BitsetView& bitset,
QueryResult& results);
} // namespace milvus::query

View File

@ -20,8 +20,9 @@
namespace milvus::query {
// negate bitset, and merge them into one
aligned_vector<uint8_t>
AssembleBitmap(const BitmapSimple& bitmap_simple) {
AssembleNegBitmap(const BitmapSimple& bitmap_simple) {
int64_t N = 0;
for (auto& bitmap : bitmap_simple) {
@ -52,7 +53,7 @@ SearchOnSealed(const Schema& schema,
const void* query_data,
int64_t num_queries,
Timestamp timestamp,
std::optional<const BitmapSimple*> bitmaps_opt,
const faiss::BitsetView& bitset,
QueryResult& result) {
auto topK = query_info.topK_;
@ -73,12 +74,7 @@ SearchOnSealed(const Schema& schema,
auto conf = query_info.search_params_;
conf[milvus::knowhere::meta::TOPK] = query_info.topK_;
conf[milvus::knowhere::Metric::TYPE] = MetricTypeToName(indexing_entry->metric_type_);
if (bitmaps_opt.has_value()) {
auto bitmap = AssembleBitmap(*bitmaps_opt.value());
return indexing_entry->indexing_->Query(ds, conf, faiss::BitsetView(bitmap.data(), num_queries));
} else {
return indexing_entry->indexing_->Query(ds, conf, nullptr);
}
return indexing_entry->indexing_->Query(ds, conf, bitset);
}();
auto ids = final->Get<idx_t*>(knowhere::meta::IDS);

View File

@ -16,6 +16,10 @@
#include "query/Search.h"
namespace milvus::query {
// Negates the bitmaps held in bitmap_simple and merges them into a single
// byte vector, which is returned by value.
aligned_vector<uint8_t>
AssembleNegBitmap(const BitmapSimple& bitmap_simple);
void
SearchOnSealed(const Schema& schema,
const segcore::SealedIndexingRecord& record,
@ -23,7 +27,7 @@ SearchOnSealed(const Schema& schema,
const void* query_data,
int64_t num_queries,
Timestamp timestamp,
std::optional<const BitmapSimple*> bitmaps_opt,
const faiss::BitsetView& view,
QueryResult& result);
} // namespace milvus::query

View File

@ -67,21 +67,20 @@ ExecPlanNodeVisitor::visit(FloatVectorANNS& node) {
auto src_data = ph.get_blob<float>();
auto num_queries = ph.num_of_queries_;
ExecExprVisitor::RetType bitmap_holder;
std::optional<const ExecExprVisitor::RetType*> bitset_pack;
aligned_vector<uint8_t> bitset_holder;
BitsetView view;
if (node.predicate_.has_value()) {
bitmap_holder = ExecExprVisitor(*segment).call_child(*node.predicate_.value());
bitset_pack = &bitmap_holder;
ExecExprVisitor::RetType expr_ret = ExecExprVisitor(*segment).call_child(*node.predicate_.value());
bitset_holder = AssembleNegBitmap(expr_ret);
view = BitsetView(bitset_holder.data(), bitset_holder.size() * 8);
}
auto& sealed_indexing = segment->get_sealed_indexing_record();
if (sealed_indexing.is_ready(node.query_info_.field_offset_)) {
SearchOnSealed(segment->get_schema(), sealed_indexing, node.query_info_, src_data, num_queries, timestamp_,
bitset_pack, ret);
view, ret);
} else {
FloatSearch(*segment, node.query_info_, src_data, num_queries, timestamp_, bitset_pack, ret);
FloatSearch(*segment, node.query_info_, src_data, num_queries, timestamp_, view, ret);
}
ret_ = ret;
@ -98,20 +97,20 @@ ExecPlanNodeVisitor::visit(BinaryVectorANNS& node) {
auto src_data = ph.get_blob<uint8_t>();
auto num_queries = ph.num_of_queries_;
ExecExprVisitor::RetType bitmap_holder;
std::optional<const ExecExprVisitor::RetType*> bitset_pack;
aligned_vector<uint8_t> bitset_holder;
BitsetView view;
if (node.predicate_.has_value()) {
bitmap_holder = ExecExprVisitor(*segment).call_child(*node.predicate_.value());
bitset_pack = &bitmap_holder;
ExecExprVisitor::RetType expr_ret = ExecExprVisitor(*segment).call_child(*node.predicate_.value());
bitset_holder = AssembleNegBitmap(expr_ret);
view = BitsetView(bitset_holder.data(), bitset_holder.size() * 8);
}
auto& sealed_indexing = segment->get_sealed_indexing_record();
if (sealed_indexing.is_ready(node.query_info_.field_offset_)) {
SearchOnSealed(segment->get_schema(), sealed_indexing, node.query_info_, src_data, num_queries, timestamp_,
bitset_pack, ret);
view, ret);
} else {
BinarySearch(*segment, node.query_info_, src_data, num_queries, timestamp_, bitset_pack, ret);
BinarySearch(*segment, node.query_info_, src_data, num_queries, timestamp_, view, ret);
}
ret_ = ret;
}

View File

@ -9,7 +9,6 @@
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <gtest/gtest.h>
#include "utils/tools.h"
#include "test_utils/DataGen.h"
@ -22,14 +21,17 @@ TEST(Span, Naive) {
int64_t N = 1000 * 1000;
constexpr int64_t chunk_size = 32 * 1024;
auto schema = std::make_shared<Schema>();
schema->AddDebugField("fakevec", DataType::VECTOR_BINARY, 512, MetricType::METRIC_Jaccard);
schema->AddDebugField("binaryvec", DataType::VECTOR_BINARY, 512, MetricType::METRIC_Jaccard);
schema->AddDebugField("age", DataType::FLOAT);
schema->AddDebugField("floatvec", DataType::VECTOR_FLOAT, 32, MetricType::METRIC_L2);
auto dataset = DataGen(schema, N);
auto segment = CreateGrowingSegment(schema, chunk_size);
segment->PreInsert(N);
segment->Insert(0, N, dataset.row_ids_.data(), dataset.timestamps_.data(), dataset.raw_);
auto vec_ptr = dataset.get_col<uint8_t>(0);
auto age_ptr = dataset.get_col<float>(1);
auto float_ptr = dataset.get_col<float>(2);
SegmentInternalInterface& interface = *segment;
auto num_chunk = interface.get_safe_num_chunk();
ASSERT_EQ(num_chunk, upper_div(N, chunk_size));
@ -38,6 +40,7 @@ TEST(Span, Naive) {
for (auto chunk_id = 0; chunk_id < num_chunk; ++chunk_id) {
auto vec_span = interface.chunk_data<BinaryVector>(FieldOffset(0), chunk_id);
auto age_span = interface.chunk_data<float>(FieldOffset(1), chunk_id);
auto float_span = interface.chunk_data<FloatVector>(FieldOffset(2), chunk_id);
auto begin = chunk_id * chunk_size;
auto end = std::min((chunk_id + 1) * chunk_size, N);
auto chunk_size = end - begin;
@ -47,5 +50,8 @@ TEST(Span, Naive) {
for (int i = 0; i < chunk_size; ++i) {
ASSERT_EQ(age_span.data()[i], age_ptr[i + begin]);
}
for (int i = 0; i < chunk_size; ++i) {
ASSERT_EQ(float_span.data()[i], float_ptr[i + begin * 32]);
}
}
}