From 4fa45dc754ed98f5d6cbfc7cb72ad685c94a23e6 Mon Sep 17 00:00:00 2001 From: op-hunter Date: Mon, 23 Mar 2020 18:04:46 +0800 Subject: [PATCH] #1661 support HNSW deletion on nmslib (#1729) * support HNSW deletion on nmslib Signed-off-by: lichengming * update changelog Signed-off-by: lichengming * fix lint error on test_hnsw.cpp Signed-off-by: lichengming Co-authored-by: lichengming --- CHANGELOG.md | 1 + .../knowhere/index/vector_index/IndexHNSW.cpp | 16 ++- core/src/index/thirdparty/hnswlib/hnswalg.h | 24 ++-- core/src/index/thirdparty/hnswlib/hnswlib.h | 5 +- core/src/index/unittest/CMakeLists.txt | 11 ++ core/src/index/unittest/test_hnsw.cpp | 134 ++++++++++++++++++ .../delivery/request/DeleteByIDRequest.cpp | 1 + 7 files changed, 173 insertions(+), 19 deletions(-) create mode 100644 core/src/index/unittest/test_hnsw.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 5390d6edf9..91307eece5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ Please mark all change in change log and use the issue from GitHub ## Feature - \#1603 BinaryFlat add 2 Metric: Substructure and Superstructure - \#1660 IVF PQ CPU support deleted vectors searching +- \#1661 HNSW support deleted vectors searching ## Improvement - \#267 Improve search performance: reduce delay diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexHNSW.cpp b/core/src/index/knowhere/knowhere/index/vector_index/IndexHNSW.cpp index cbae93da68..1cb94f4e40 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/IndexHNSW.cpp +++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexHNSW.cpp @@ -132,8 +132,9 @@ IndexHNSW::Query(const DatasetPtr& dataset_ptr, const Config& config) { } GETTENSOR(dataset_ptr) - size_t id_size = sizeof(int64_t) * config[meta::TOPK].get(); - size_t dist_size = sizeof(float) * config[meta::TOPK].get(); + size_t k = config[meta::TOPK].get(); + size_t id_size = sizeof(int64_t) * k; + size_t dist_size = sizeof(float) * k; auto p_id = (int64_t*)malloc(id_size * rows); auto p_dist = (float*)malloc(dist_size * rows); @@ -141,6 +142,9 @@ IndexHNSW::Query(const DatasetPtr& dataset_ptr, const Config& config) { using P = std::pair; auto compare = [](const P& v1, const P& v2) { return v1.first < v2.first; }; + + faiss::ConcurrentBitsetPtr blacklist = nullptr; + GetBlacklist(blacklist); #pragma omp parallel for for (unsigned int i = 0; i < rows; ++i) { std::vector

ret; @@ -153,9 +157,9 @@ IndexHNSW::Query(const DatasetPtr& dataset_ptr, const Config& config) { // } else { // ret = index_->searchKnn((float*)single_query, config[meta::TOPK].get(), compare); // } - ret = index_->searchKnn((float*)single_query, config[meta::TOPK].get(), compare); + ret = index_->searchKnn((float*)single_query, k, compare, blacklist); - while (ret.size() < config[meta::TOPK]) { + while (ret.size() < k) { ret.push_back(std::make_pair(-1, -1)); } std::vector dist; @@ -171,8 +175,8 @@ IndexHNSW::Query(const DatasetPtr& dataset_ptr, const Config& config) { std::transform(ret.begin(), ret.end(), std::back_inserter(ids), [](const std::pair& e) { return e.second; }); - memcpy(p_dist + i * config[meta::TOPK].get(), dist.data(), dist_size); - memcpy(p_id + i * config[meta::TOPK].get(), ids.data(), id_size); + memcpy(p_dist + i * k, dist.data(), dist_size); + memcpy(p_id + i * k, ids.data(), id_size); } auto ret_ds = std::make_shared(); diff --git a/core/src/index/thirdparty/hnswlib/hnswalg.h b/core/src/index/thirdparty/hnswlib/hnswalg.h index 3ac3dd4dc1..96c42e6bd4 100644 --- a/core/src/index/thirdparty/hnswlib/hnswalg.h +++ b/core/src/index/thirdparty/hnswlib/hnswalg.h @@ -253,7 +253,7 @@ public: template std::priority_queue, std::vector>, CompareByFirst> - searchBaseLayerST(tableint ep_id, const void *data_point, size_t ef) const { + searchBaseLayerST(tableint ep_id, const void *data_point, size_t ef, faiss::ConcurrentBitsetPtr bitset) const { VisitedList *vl = visited_list_pool_->getFreeVisitedList(); vl_type *visited_array = vl->mass; vl_type visited_array_tag = vl->curV; @@ -262,7 +262,8 @@ public: std::priority_queue, std::vector>, CompareByFirst> candidate_set; dist_t lowerBound; - if (!has_deletions || !isMarkedDeleted(ep_id)) { +// if (!has_deletions || !isMarkedDeleted(ep_id)) { + if (!has_deletions || !bitset->test((faiss::ConcurrentBitset::id_type_t)getExternalLabel(ep_id))) { dist_t dist = fstdistfunc_(data_point, getDataByInternalId(ep_id), dist_func_param_); lowerBound = dist; top_candidates.emplace(dist, ep_id); @@ -318,7 +319,8 @@ public: _MM_HINT_T0);//////////////////////// #endif - if (!has_deletions || !isMarkedDeleted(candidate_id)) +// if (!has_deletions || !isMarkedDeleted(candidate_id)) + if (!has_deletions || (!bitset->test((faiss::ConcurrentBitset::id_type_t)getExternalLabel(candidate_id)))) top_candidates.emplace(dist, candidate_id); if (top_candidates.size() > ef) @@ -1061,7 +1063,7 @@ public: }; std::priority_queue> - searchKnn(const void *query_data, size_t k) const { + searchKnn(const void *query_data, size_t k, faiss::ConcurrentBitsetPtr bitset) const { std::priority_queue> result; if (cur_element_count == 0) return result; @@ -1093,14 +1095,14 @@ public: } std::priority_queue, std::vector>, CompareByFirst> top_candidates; - if (has_deletions_) { - std::priority_queue, std::vector>, CompareByFirst> top_candidates1=searchBaseLayerST( - currObj, query_data, std::max(ef_, k)); + if (bitset != nullptr) { + std::priority_queue, std::vector>, CompareByFirst> + top_candidates1 = searchBaseLayerST(currObj, query_data, std::max(ef_, k), bitset); top_candidates.swap(top_candidates1); } else{ - std::priority_queue, std::vector>, CompareByFirst> top_candidates1=searchBaseLayerST( - currObj, query_data, std::max(ef_, k)); + std::priority_queue, std::vector>, CompareByFirst> + top_candidates1 = searchBaseLayerST(currObj, query_data, std::max(ef_, k), bitset); top_candidates.swap(top_candidates1); } while (top_candidates.size() > k) { @@ -1116,11 +1118,11 @@ public: template std::vector> - searchKnn(const void* query_data, size_t k, Comp comp) { + searchKnn(const void* query_data, size_t k, Comp comp, faiss::ConcurrentBitsetPtr bitset) { std::vector> result; if (cur_element_count == 0) return result; - auto ret = searchKnn(query_data, k); + auto ret = searchKnn(query_data, k, bitset); while (!ret.empty()) { result.push_back(ret.top()); diff --git a/core/src/index/thirdparty/hnswlib/hnswlib.h b/core/src/index/thirdparty/hnswlib/hnswlib.h index 6089a30b96..ca69daedf3 100644 --- a/core/src/index/thirdparty/hnswlib/hnswlib.h +++ b/core/src/index/thirdparty/hnswlib/hnswlib.h @@ -27,6 +27,7 @@ #include #include +#include namespace hnswlib { typedef int64_t labeltype; @@ -80,9 +81,9 @@ namespace hnswlib { class AlgorithmInterface { public: virtual void addPoint(const void *datapoint, labeltype label)=0; - virtual std::priority_queue> searchKnn(const void *, size_t) const = 0; + virtual std::priority_queue> searchKnn(const void *, size_t, faiss::ConcurrentBitsetPtr bitset) const = 0; template - std::vector> searchKnn(const void*, size_t, Comp) { + std::vector> searchKnn(const void*, size_t, Comp, faiss::ConcurrentBitsetPtr bitset) { } virtual void saveIndex(const std::string &location)=0; virtual ~AlgorithmInterface(){ diff --git a/core/src/index/unittest/CMakeLists.txt b/core/src/index/unittest/CMakeLists.txt index 329a991355..234b75394c 100644 --- a/core/src/index/unittest/CMakeLists.txt +++ b/core/src/index/unittest/CMakeLists.txt @@ -89,6 +89,16 @@ if (NOT TARGET test_idmap) endif () target_link_libraries(test_idmap ${depend_libs} ${unittest_libs} ${basic_libs}) +# +set(hnsw_srcs + ${INDEX_SOURCE_DIR}/knowhere/knowhere/index/vector_index/IndexHNSW.cpp + ) + +if (NOT TARGET test_hnsw) + add_executable(test_hnsw test_hnsw.cpp ${hnsw_srcs} ${util_srcs}) +endif () +target_link_libraries(test_hnsw ${depend_libs} ${unittest_libs} ${basic_libs}) + # if (NOT TARGET test_binaryidmap) add_executable(test_binaryidmap test_binaryidmap.cpp ${ivf_srcs} ${util_srcs}) @@ -128,6 +138,7 @@ endif () target_link_libraries(test_knowhere_common ${depend_libs} ${unittest_libs} ${basic_libs}) install(TARGETS test_ivf DESTINATION unittest) +install(TARGETS test_hnsw DESTINATION unittest) install(TARGETS test_binaryivf DESTINATION unittest) install(TARGETS test_idmap DESTINATION unittest) install(TARGETS test_binaryidmap DESTINATION unittest) diff --git a/core/src/index/unittest/test_hnsw.cpp b/core/src/index/unittest/test_hnsw.cpp new file mode 100644 index 0000000000..8e5a584cba --- /dev/null +++ b/core/src/index/unittest/test_hnsw.cpp @@ -0,0 +1,134 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under the License. + +#include +#include +#include +#include +#include "./utils.h" + +int +main() { + int64_t d = 64; // dimension + int64_t nb = 10000; // database size + int64_t nq = 10; // 10000; // nb of queries + faiss::ConcurrentBitsetPtr bitset = std::make_shared(nb); + + int64_t* ids = new int64_t[nb]; + float* xb = new float[d * nb]; + float* xq = new float[d * nq]; + // int64_t *ids = (int64_t*)malloc(nb * sizeof(int64_t)); + // float* xb = (float*)malloc(d * nb * sizeof(float)); + // float* xq = (float*)malloc(d * nq * sizeof(float)); + + for (int i = 0; i < nb; i++) { + for (int j = 0; j < d; j++) xb[d * i + j] = drand48(); + xb[d * i] += i / 1000.; + ids[i] = i; + } + printf("gen xb and ids done! \n"); + + // srand((unsigned)time(NULL)); + auto random_seed = (unsigned)time(NULL); + printf("delete ids: \n"); + for (int i = 0; i < nq; i++) { + auto tmp = rand_r(&random_seed) % nb; + printf("%ld\n", tmp); + // std::cout << "before delete, test result: " << bitset->test(tmp) << std::endl; + bitset->set(tmp); + // std::cout << "after delete, test result: " << bitset->test(tmp) << std::endl; + for (int j = 0; j < d; j++) xq[d * i + j] = xb[d * tmp + j]; + // xq[d * i] += i / 1000.; + } + printf("\n"); + + int k = 4; + int m = 16; + int ef = 200; + milvus::knowhere::IndexHNSW index; + milvus::knowhere::DatasetPtr base_dataset = generate_dataset(nb, d, (const void*)xb, ids); + /* + base_dataset->Set(milvus::knowhere::meta::ROWS, nb); + base_dataset->Set(milvus::knowhere::meta::DIM, d); + base_dataset->Set(milvus::knowhere::meta::TENSOR, (const void*)xb); + base_dataset->Set(milvus::knowhere::meta::IDS, (const int64_t*)ids); + */ + + milvus::knowhere::Config base_conf{ + {milvus::knowhere::meta::DIM, d}, + {milvus::knowhere::meta::TOPK, k}, + {milvus::knowhere::IndexParams::M, m}, + {milvus::knowhere::IndexParams::efConstruction, ef}, + {milvus::knowhere::Metric::TYPE, milvus::knowhere::Metric::L2}, + }; + milvus::knowhere::DatasetPtr query_dataset = generate_query_dataset(nq, d, (const void*)xq); + milvus::knowhere::Config query_conf{ + {milvus::knowhere::meta::DIM, d}, + {milvus::knowhere::meta::TOPK, k}, + {milvus::knowhere::IndexParams::M, m}, + {milvus::knowhere::IndexParams::ef, ef}, + {milvus::knowhere::Metric::TYPE, milvus::knowhere::Metric::L2}, + }; + + index.Train(base_dataset, base_conf); + index.Add(base_dataset, base_conf); + + printf("------------sanity check----------------\n"); + { // sanity check + auto res = index.Query(query_dataset, query_conf); + printf("Query done!\n"); + const int64_t* I = res->Get(milvus::knowhere::meta::IDS); + float* D = res->Get(milvus::knowhere::meta::DISTANCE); + + printf("I=\n"); + for (int i = 0; i < 5; i++) { + for (int j = 0; j < k; j++) printf("%5ld ", I[i * k + j]); + printf("\n"); + } + + printf("D=\n"); + for (int i = 0; i < 5; i++) { + for (int j = 0; j < k; j++) printf("%7g ", D[i * k + j]); + printf("\n"); + } + } + + printf("---------------search xq-------------\n"); + { // search xq + auto res = index.Query(query_dataset, query_conf); + const int64_t* I = res->Get(milvus::knowhere::meta::IDS); + + printf("I=\n"); + for (int i = 0; i < nq; i++) { + for (int j = 0; j < k; j++) printf("%5ld ", I[i * k + j]); + printf("\n"); + } + } + + printf("----------------search xq with delete------------\n"); + { // search xq with delete + index.SetBlacklist(bitset); + auto res = index.Query(query_dataset, query_conf); + auto I = res->Get(milvus::knowhere::meta::IDS); + + printf("I=\n"); + for (int i = 0; i < nq; i++) { + for (int j = 0; j < k; j++) printf("%5ld ", I[i * k + j]); + printf("\n"); + } + } + + delete[] xb; + delete[] xq; + delete[] ids; + + return 0; +} diff --git a/core/src/server/delivery/request/DeleteByIDRequest.cpp b/core/src/server/delivery/request/DeleteByIDRequest.cpp index 1136d8a894..dac7e28776 100644 --- a/core/src/server/delivery/request/DeleteByIDRequest.cpp +++ b/core/src/server/delivery/request/DeleteByIDRequest.cpp @@ -70,6 +70,7 @@ DeleteByIDRequest::OnExecute() { // Check table's index type supports delete if (table_schema.engine_type_ != (int32_t)engine::EngineType::FAISS_IDMAP && table_schema.engine_type_ != (int32_t)engine::EngineType::FAISS_BIN_IDMAP && + table_schema.engine_type_ != (int32_t)engine::EngineType::HNSW && table_schema.engine_type_ != (int32_t)engine::EngineType::FAISS_IVFFLAT && table_schema.engine_type_ != (int32_t)engine::EngineType::FAISS_BIN_IVFFLAT && table_schema.engine_type_ != (int32_t)engine::EngineType::FAISS_IVFSQ8 &&