From 6939c60befd44b460c9d3928e0d14a9f17750767 Mon Sep 17 00:00:00 2001 From: groot Date: Fri, 25 Sep 2020 13:59:16 +0800 Subject: [PATCH] reduce uid copy during search (#3867) Signed-off-by: groot Signed-off-by: shengjun.li --- core/src/segment/SegmentReader.cpp | 48 +++++++++++++++++++++++++----- core/src/segment/SegmentReader.h | 3 ++ 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/core/src/segment/SegmentReader.cpp b/core/src/segment/SegmentReader.cpp index ad74ac9c53..3fc856f173 100644 --- a/core/src/segment/SegmentReader.cpp +++ b/core/src/segment/SegmentReader.cpp @@ -244,6 +244,8 @@ SegmentReader::LoadUids(std::vector& uids) { return Status(DB_ERROR, err_msg); } + TimeRecorderAuto recorder("SegmentReader::LoadUids"); + uids.clear(); uids.resize(raw->data_.size() / sizeof(engine::idx_t)); memcpy(uids.data(), raw->data_.data(), raw->data_.size()); @@ -269,12 +271,9 @@ SegmentReader::LoadVectorIndex(const std::string& field_name, knowhere::VecIndex return Status(DB_ERROR, "Field is not vector type"); } - // load uids - std::vector uids; - STATUS_CHECK(LoadUids(uids)); - // load deleted doc - faiss::ConcurrentBitsetPtr concurrent_bitset_ptr = std::make_shared(uids.size()); + int64_t row_count = GetRowCount(); + faiss::ConcurrentBitsetPtr concurrent_bitset_ptr = std::make_shared(row_count); segment::DeletedDocsPtr deleted_docs_ptr; LoadDeletedDocs(deleted_docs_ptr); if (deleted_docs_ptr != nullptr) { @@ -307,7 +306,11 @@ SegmentReader::LoadVectorIndex(const std::string& field_name, knowhere::VecIndex engine::BinaryDataPtr raw; STATUS_CHECK(LoadField(field_name, raw, false)); - auto dataset = knowhere::GenDataset(uids.size(), dimension, raw->data_.data()); + // load uids + std::vector uids; + STATUS_CHECK(LoadUids(uids)); + + auto dataset = knowhere::GenDataset(row_count, dimension, raw->data_.data()); // construct IDMAP index knowhere::VecIndexFactory& vec_index_factory = knowhere::VecIndexFactory::GetInstance(); @@ -326,9 +329,9 @@ SegmentReader::LoadVectorIndex(const std::string& field_name, knowhere::VecIndex segment_ptr_->SetVectorIndex(field_name, index_ptr); cache::CpuCacheMgr::GetInstance().InsertItem(temp_index_path, index_ptr); + recorder.RecordSection("construct temp IDMAP index"); } - recorder.RecordSection("create temp IDMAP index"); return Status::OK(); } @@ -377,11 +380,16 @@ SegmentReader::LoadVectorIndex(const std::string& field_name, knowhere::VecIndex STATUS_CHECK(ss_codec.GetVectorIndexFormat()->ConstructIndex(index_type, index_data, raw_data, compress_data, index_ptr)); + // load uids + std::vector uids; + STATUS_CHECK(LoadUids(uids)); + index_ptr->SetUids(uids); index_ptr->SetBlacklist(concurrent_bitset_ptr); segment_ptr_->SetVectorIndex(field_name, index_ptr); cache::CpuCacheMgr::GetInstance().InsertItem(index_file_path, index_ptr); // put into cache + recorder.RecordSection("construct index"); } catch (std::exception& e) { std::string err_msg = "Failed to load vector index: " + std::string(e.what()); LOG_ENGINE_ERROR_ << err_msg; @@ -506,7 +514,7 @@ SegmentReader::LoadBloomFilter(segment::IdBloomFilterPtr& id_bloom_filter_ptr) { Status SegmentReader::LoadDeletedDocs(segment::DeletedDocsPtr& deleted_docs_ptr) { try { - TimeRecorder recorder("SegmentReader::LoadDeletedDocs"); + TimeRecorderAuto recorder("SegmentReader::LoadDeletedDocs"); deleted_docs_ptr = segment_ptr_->GetDeletedDocs(); if (deleted_docs_ptr != nullptr) { @@ -611,6 +619,30 @@ SegmentReader::GetTempIndexPath(const std::string& field_name, std::string& path return Status::OK(); } +int64_t +SegmentReader::GetRowCount() { + engine::BinaryDataPtr raw; + auto status = LoadField(engine::FIELD_UID, raw); + if (!status.ok()) { + LOG_ENGINE_ERROR_ << status.message(); + return 0; + } + + if (raw == nullptr) { + LOG_ENGINE_ERROR_ << "Failed to load id field"; + return 0; + } + + if (raw->data_.size() % sizeof(engine::idx_t) != 0) { + std::string err_msg = "Failed to load uids: illegal file size"; + LOG_ENGINE_ERROR_ << err_msg; + return 0; + } + + int64_t count = raw->data_.size() / sizeof(engine::idx_t); + return count; +} + Status SegmentReader::ClearCache() { TimeRecorderAuto recorder("SegmentReader::ClearCache"); diff --git a/core/src/segment/SegmentReader.h b/core/src/segment/SegmentReader.h index 13bb2c831c..07efbe00d6 100644 --- a/core/src/segment/SegmentReader.h +++ b/core/src/segment/SegmentReader.h @@ -95,6 +95,9 @@ class SegmentReader { return segment_visitor_; } + int64_t + GetRowCount(); + // clear cache from cache manager, use this method for segment merge/compact and collection/partition drop Status ClearCache();