Caiyd 1655 gpu ivfflat delete (#1809)

* support GPUIVF delete Signed-off-by: yudong.cai <yudong.cai@zilliz.com> * calc bitset offset correctly Signed-off-by: yudong.cai <yudong.cai@zilliz.com> * support GPUSQ8 delete Signed-off-by: yudong.cai <yudong.cai@zilliz.com> * support GPUIVFPQ delete Signed-off-by: yudong.cai <yudong.cai@zilliz.com> * update unittest for GPU delete Signed-off-by: yudong.cai <yudong.cai@zilliz.com> * delete FAISS compile option -DFAISS_USE_FLOAT16 Signed-off-by: yudong.cai <yudong.cai@zilliz.com> * code opt Signed-off-by: yudong.cai <yudong.cai@zilliz.com> * update changelog Signed-off-by: yudong.cai <yudong.cai@zilliz.com>
2020-04-01 10:42:48 +08:00 · 2020-04-01 10:42:48 +08:00 · 531fca8869
parent c091929298
commit 531fca8869
18 changed files with 247 additions and 75 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -8,6 +8,7 @@ Please mark all change in change log and use the issue from GitHub
 -   \#1762 Server is not forbidden to create new partition which tag is "_default"

 ## Feature
+-   \#1655 GPU index support delete vectors

 ## Improvement

--- a/core/src/index/knowhere/knowhere/index/vector_index/gpu/IndexGPUIVF.cpp
+++ b/core/src/index/knowhere/knowhere/index/vector_index/gpu/IndexGPUIVF.cpp
@ -145,7 +145,7 @@ GPUIVF::QueryImpl(int64_t n, const float* data, int64_t k, float* distances, int
    if (device_index) {
        device_index->nprobe = config[IndexParams::nprobe];
        ResScope rs(res_, gpu_id_);
-        device_index->search(n, (float*)data, k, distances, labels);
+        device_index->search(n, (float*)data, k, distances, labels, bitset_);
    } else {
        KNOWHERE_THROW_MSG("Not a GpuIndexIVF type.");
    }
--- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu
+++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu
@ -366,9 +366,15 @@ GpuIndexIVFPQ::searchImpl_(int n,
  static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch");
  Tensor<long, 2, true> outLabels(const_cast<long*>(labels), {n, k});

-  auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
-
-  index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
+  if (!bitset) {
+    auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
+    index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
+  } else {
+    auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_,
+                                             const_cast<uint8_t*>(bitset->data()), stream,
+                                             {(int) bitset->size()});
+    index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
+  }
 }

 int
--- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu
+++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu
@ -275,9 +275,15 @@ GpuIndexIVFScalarQuantizer::searchImpl_(int n,
  static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch");
  Tensor<long, 2, true> outLabels(const_cast<long*>(labels), {n, k});

-  auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
-
-  index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
+  if (!bitset) {
+    auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
+    index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
+  } else {
+    auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_,
+                                             const_cast<uint8_t*>(bitset->data()), stream,
+                                             {(int) bitset->size()});
+    index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
+  }
 }

 } } // namespace
--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu
@ -164,8 +164,6 @@ IVFFlat::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

-  DeviceTensor<uint8_t, 1, true> bitset(mem, {0}, stream);
-
  // Number of valid vectors that we actually add; we return this
  int numAdded = 0;

@ -176,6 +174,8 @@ IVFFlat::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
    listIds2d(mem, {vecs.getSize(0), 1},  stream);
  auto listIds = listIds2d.view<1>({vecs.getSize(0)});

+  /* pseudo bitset */
+  DeviceTensor<uint8_t, 1, true> bitset(mem, {0}, stream);
  quantizer_->query(vecs, bitset, 1, listDistance2d, listIds2d, false);

  // Calculate residuals for these vectors, if needed
@ -354,8 +354,9 @@ IVFFlat::query(Tensor<float, 2, true>& queries,

  // Find the `nprobe` closest lists; we can use int indices both
  // internally and externally
+  DeviceTensor<uint8_t, 1, true> coarseBitset(mem, {0}, stream);
  quantizer_->query(queries,
-                    bitset,
+                    coarseBitset,
                    nprobe,
                    coarseDistances,
                    coarseIndices,
@ -371,6 +372,7 @@ IVFFlat::query(Tensor<float, 2, true>& queries,

  runIVFFlatScan(queries,
                 coarseIndices,
+                 bitset,
                 deviceListDataPointers_,
                 deviceListIndexPointers_,
                 indicesOptions_,
--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cu
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cu
@ -160,6 +160,7 @@ ivfFlatScan(Tensor<float, 2, true> queries,
 void
 runIVFFlatScanTile(Tensor<float, 2, true>& queries,
                   Tensor<int, 2, true>& listIds,
+                   Tensor<uint8_t, 1, true>& bitset,
                   thrust::device_vector<void*>& listData,
                   thrust::device_vector<void*>& listIndices,
                   IndicesOptions indicesOptions,
@ -315,7 +316,11 @@ runIVFFlatScanTile(Tensor<float, 2, true>& queries,
 #undef RUN_IVF_FLAT

  // k-select the output in chunks, to increase parallelism
-  runPass1SelectLists(prefixSumOffsets,
+  runPass1SelectLists(listIndices,
+                      indicesOptions,
+                      prefixSumOffsets,
+                      listIds,
+                      bitset,
                      allDistances,
                      listIds.getSize(1),
                      k,
@ -344,6 +349,7 @@ runIVFFlatScanTile(Tensor<float, 2, true>& queries,
 void
 runIVFFlatScan(Tensor<float, 2, true>& queries,
               Tensor<int, 2, true>& listIds,
+               Tensor<uint8_t, 1, true>& bitset,
               thrust::device_vector<void*>& listData,
               thrust::device_vector<void*>& listIndices,
               IndicesOptions indicesOptions,
@ -489,6 +495,7 @@ runIVFFlatScan(Tensor<float, 2, true>& queries,

    runIVFFlatScanTile(queryView,
                       listIdsView,
+                       bitset,
                       listData,
                       listIndices,
                       indicesOptions,
--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cuh
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cuh
@ -20,6 +20,7 @@ class GpuResources;

 void runIVFFlatScan(Tensor<float, 2, true>& queries,
                    Tensor<int, 2, true>& listIds,
+                    Tensor<uint8_t, 1, true>& bitset,
                    thrust::device_vector<void*>& listData,
                    thrust::device_vector<void*>& listIndices,
                    IndicesOptions indicesOptions,
--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu
@ -119,8 +119,6 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();
  
-  DeviceTensor<uint8_t, 1, true> bitset(mem, {0}, stream);
-
  // Number of valid vectors that we actually add; we return this
  int numAdded = 0;

@ -130,6 +128,8 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
  DeviceTensor<int, 2, true> listIds2d(mem, {vecs.getSize(0), 1}, stream);
  auto listIds = listIds2d.view<1>({vecs.getSize(0)});

+  /* pseudo bitset */
+  DeviceTensor<uint8_t, 1, true> bitset(mem, {0}, stream);
  quantizer_->query(vecs, bitset, 1, listDistance, listIds2d, false);

  // Copy the lists that we wish to append to back to the CPU
@ -532,10 +532,11 @@ IVFPQ::query(Tensor<float, 2, true>& queries,
  DeviceTensor<int, 2, true>
    coarseIndices(mem, {queries.getSize(0), nprobe}, stream);

+  DeviceTensor<uint8_t, 1, true> coarseBitset(mem, {0}, stream);
  // Find the `nprobe` closest coarse centroids; we can use int
  // indices both internally and externally
  quantizer_->query(queries,
-                    bitset,
+                    coarseBitset,
                    nprobe,
                    coarseDistances,
                    coarseIndices,
@ -543,6 +544,7 @@ IVFPQ::query(Tensor<float, 2, true>& queries,

  if (precomputedCodes_) {
    runPQPrecomputedCodes_(queries,
+                           bitset,
                           coarseDistances,
                           coarseIndices,
                           k,
@ -550,6 +552,7 @@ IVFPQ::query(Tensor<float, 2, true>& queries,
                           outIndices);
  } else {
    runPQNoPrecomputedCodes_(queries,
+                             bitset,
                             coarseDistances,
                             coarseIndices,
                             k,
@ -592,6 +595,7 @@ IVFPQ::getPQCentroids() {
 void
 IVFPQ::runPQPrecomputedCodes_(
  Tensor<float, 2, true>& queries,
+  Tensor<uint8_t, 1, true>& bitset,
  DeviceTensor<float, 2, true>& coarseDistances,
  DeviceTensor<int, 2, true>& coarseIndices,
  int k,
@ -655,6 +659,7 @@ IVFPQ::runPQPrecomputedCodes_(
                                term2, // term 2
                                term3, // term 3
                                coarseIndices,
+                                bitset,
                                useFloat16LookupTables_,
                                bytesPerVector_,
                                numSubQuantizers_,
@ -673,6 +678,7 @@ IVFPQ::runPQPrecomputedCodes_(
 void
 IVFPQ::runPQNoPrecomputedCodes_(
  Tensor<float, 2, true>& queries,
+  Tensor<uint8_t, 1, true>& bitset,
  DeviceTensor<float, 2, true>& coarseDistances,
  DeviceTensor<int, 2, true>& coarseIndices,
  int k,
@ -685,6 +691,7 @@ IVFPQ::runPQNoPrecomputedCodes_(
                                  coarseCentroids,
                                  pqCentroidsInnermostCode_,
                                  coarseIndices,
+                                  bitset,
                                  useFloat16LookupTables_,
                                  bytesPerVector_,
                                  numSubQuantizers_,
--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh
@ -83,6 +83,7 @@ class IVFPQ : public IVFBase {

  /// Runs kernels for scanning inverted lists with precomputed codes
  void runPQPrecomputedCodes_(Tensor<float, 2, true>& queries,
+                              Tensor<uint8_t, 1, true>& bitset,
                              DeviceTensor<float, 2, true>& coarseDistances,
                              DeviceTensor<int, 2, true>& coarseIndices,
                              int k,
@ -91,6 +92,7 @@ class IVFPQ : public IVFBase {

  /// Runs kernels for scanning inverted lists without precomputed codes
  void runPQNoPrecomputedCodes_(Tensor<float, 2, true>& queries,
+                                Tensor<uint8_t, 1, true>& bitset,
                                DeviceTensor<float, 2, true>& coarseDistances,
                                DeviceTensor<int, 2, true>& coarseIndices,
                                int k,
--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFUtils.cuh
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFUtils.cuh
@ -16,6 +16,70 @@
 // post-processing and k-selecting the results
 namespace faiss { namespace gpu {

+// This is warp divergence central, but this is really a final step
+// and happening a small number of times
+inline __device__ int binarySearchForBucket(int* prefixSumOffsets,
+                                            int size,
+                                            int val) {
+  int start = 0;
+  int end = size;
+
+  while (end - start > 0) {
+    int mid = start + (end - start) / 2;
+
+    int midVal = prefixSumOffsets[mid];
+
+    // Find the first bucket that we are <=
+    if (midVal <= val) {
+      start = mid + 1;
+    } else {
+      end = mid;
+    }
+  }
+
+  // We must find the bucket that it is in
+  assert(start != size);
+
+  return start;
+}
+
+inline __device__ long
+getListIndex(int queryId,
+             int offset,
+             void** listIndices,
+             Tensor<int, 2, true>& prefixSumOffsets,
+             Tensor<int, 2, true>& topQueryToCentroid,
+             IndicesOptions opt) {
+  long index = -1;
+
+  // In order to determine the actual user index, we need to first
+  // determine what list it was in.
+  // We do this by binary search in the prefix sum list.
+  int probe = binarySearchForBucket(prefixSumOffsets[queryId].data(),
+                                    prefixSumOffsets.getSize(1),
+                                    offset);
+
+  // This is then the probe for the query; we can find the actual
+  // list ID from this
+  int listId = topQueryToCentroid[queryId][probe];
+
+  // Now, we need to know the offset within the list
+  // We ensure that before the array (at offset -1), there is a 0 value
+  int listStart = *(prefixSumOffsets[queryId][probe].data() - 1);
+  int listOffset = offset - listStart;
+
+  // This gives us our final index
+  if (opt == INDICES_32_BIT) {
+    index = (long) ((int*) listIndices[listId])[listOffset];
+  } else if (opt == INDICES_64_BIT) {
+    index = ((long*) listIndices[listId])[listOffset];
+  } else {
+    index = ((long) listId << 32 | (long) listOffset);
+  }
+
+  return index;
+}
+
 /// Function for multi-pass scanning that collects the length of
 /// intermediate results for all (query, probe) pair
 void runCalcListOffsets(Tensor<int, 2, true>& topQueryToCentroid,
@ -25,7 +89,11 @@ void runCalcListOffsets(Tensor<int, 2, true>& topQueryToCentroid,
                        cudaStream_t stream);

 /// Performs a first pass of k-selection on the results
-void runPass1SelectLists(Tensor<int, 2, true>& prefixSumOffsets,
+void runPass1SelectLists(thrust::device_vector<void*>& listIndices,
+                         IndicesOptions indicesOptions,
+                         Tensor<int, 2, true>& prefixSumOffsets,
+                         Tensor<int, 2, true>& topQueryToCentroid,
+                         Tensor<uint8_t, 1, true>& bitset,
                         Tensor<float, 1, true>& distance,
                         int nprobe,
                         int k,
--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect1.cu
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect1.cu
@ -23,10 +23,14 @@ namespace faiss { namespace gpu {

 template <int ThreadsPerBlock, int NumWarpQ, int NumThreadQ, bool Dir>
 __global__ void
-pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
+pass1SelectLists(void** listIndices,
+                 Tensor<int, 2, true> prefixSumOffsets,
+                 Tensor<int, 2, true> topQueryToCentroid,
+                 Tensor<uint8_t, 1, true> bitset,
                 Tensor<float, 1, true> distance,
                 int nprobe,
                 int k,
+                 IndicesOptions opt,
                 Tensor<float, 3, true> heapDistances,
                 Tensor<int, 3, true> heapIndices) {
  constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
@ -58,16 +62,38 @@ pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,

  int i = threadIdx.x;
  auto distanceStart = distance[start].data();
+  bool bitsetEmpty = (bitset.getSize(0) == 0);
+  long index = -1;

  // BlockSelect add cannot be used in a warp divergent circumstance; we
  // handle the remainder warp below
  for (; i < limit; i += blockDim.x) {
-    heap.add(distanceStart[i], start + i);
+    index = getListIndex(queryId,
+                         start + i,
+                         listIndices,
+                         prefixSumOffsets,
+                         topQueryToCentroid,
+                         opt);
+    if (bitsetEmpty || (!(bitset[index >> 3] & (0x1 << (index & 0x7))))) {
+      heap.add(distanceStart[i], start + i);
+    } else {
+      heap.add((1.0 / 0.0), start + i);
+    }
  }

  // Handle warp divergence separately
  if (i < num) {
-    heap.addThreadQ(distanceStart[i], start + i);
+    index = getListIndex(queryId,
+                         start + i,
+                         listIndices,
+                         prefixSumOffsets,
+                         topQueryToCentroid,
+                         opt);
+    if (bitsetEmpty || (!(bitset[index >> 3] & (0x1 << (index & 0x7))))) {
+      heap.addThreadQ(distanceStart[i], start + i);
+    } else {
+      heap.addThreadQ((1.0 / 0.0), start + i);
+    }
  }

  // Merge all final results
@ -82,7 +108,11 @@ pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
 }

 void
-runPass1SelectLists(Tensor<int, 2, true>& prefixSumOffsets,
+runPass1SelectLists(thrust::device_vector<void*>& listIndices,
+                    IndicesOptions indicesOptions,
+                    Tensor<int, 2, true>& prefixSumOffsets,
+                    Tensor<int, 2, true>& topQueryToCentroid,
+                    Tensor<uint8_t, 1, true>& bitset,
                    Tensor<float, 1, true>& distance,
                    int nprobe,
                    int k,
@ -98,10 +128,14 @@ runPass1SelectLists(Tensor<int, 2, true>& prefixSumOffsets,
 #define RUN_PASS(BLOCK, NUM_WARP_Q, NUM_THREAD_Q, DIR)                  \
  do {                                                                  \
    pass1SelectLists<BLOCK, NUM_WARP_Q, NUM_THREAD_Q, DIR>              \
-      <<<grid, BLOCK, 0, stream>>>(prefixSumOffsets,                    \
+      <<<grid, BLOCK, 0, stream>>>(listIndices.data().get(),            \
+                                   prefixSumOffsets,                    \
+                                   topQueryToCentroid,                  \
+                                   bitset,                              \
                                   distance,                            \
                                   nprobe,                              \
                                   k,                                   \
+                                   indicesOptions,                      \
                                   heapDistances,                       \
                                   heapIndices);                        \
    CUDA_TEST_ERROR();                                                  \
--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect2.cu
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect2.cu
@ -23,30 +23,30 @@ namespace faiss { namespace gpu {

 // This is warp divergence central, but this is really a final step
 // and happening a small number of times
-inline __device__ int binarySearchForBucket(int* prefixSumOffsets,
-                                            int size,
-                                            int val) {
-  int start = 0;
-  int end = size;
-
-  while (end - start > 0) {
-    int mid = start + (end - start) / 2;
-
-    int midVal = prefixSumOffsets[mid];
-
-    // Find the first bucket that we are <=
-    if (midVal <= val) {
-      start = mid + 1;
-    } else {
-      end = mid;
-    }
-  }
-
-  // We must find the bucket that it is in
-  assert(start != size);
-
-  return start;
-}
+//inline __device__ int binarySearchForBucket(int* prefixSumOffsets,
+//                                            int size,
+//                                            int val) {
+//  int start = 0;
+//  int end = size;
+//
+//  while (end - start > 0) {
+//    int mid = start + (end - start) / 2;
+//
+//    int midVal = prefixSumOffsets[mid];
+//
+//    // Find the first bucket that we are <=
+//    if (midVal <= val) {
+//      start = mid + 1;
+//    } else {
+//      end = mid;
+//    }
+//  }
+//
+//  // We must find the bucket that it is in
+//  assert(start != size);
+//
+//  return start;
+//}

 template <int ThreadsPerBlock,
          int NumWarpQ,
@ -113,30 +113,12 @@ pass2SelectLists(Tensor<float, 2, true> heapDistances,
      // calculated by the original scan.
      int offset = heapIndices[queryId][v];

-      // In order to determine the actual user index, we need to first
-      // determine what list it was in.
-      // We do this by binary search in the prefix sum list.
-      int probe = binarySearchForBucket(prefixSumOffsets[queryId].data(),
-                                        prefixSumOffsets.getSize(1),
-                                        offset);
-
-      // This is then the probe for the query; we can find the actual
-      // list ID from this
-      int listId = topQueryToCentroid[queryId][probe];
-
-      // Now, we need to know the offset within the list
-      // We ensure that before the array (at offset -1), there is a 0 value
-      int listStart = *(prefixSumOffsets[queryId][probe].data() - 1);
-      int listOffset = offset - listStart;
-
-      // This gives us our final index
-      if (opt == INDICES_32_BIT) {
-        index = (long) ((int*) listIndices[listId])[listOffset];
-      } else if (opt == INDICES_64_BIT) {
-        index = ((long*) listIndices[listId])[listOffset];
-      } else {
-        index = ((long) listId << 32 | (long) listOffset);
-      }
+      index = getListIndex(queryId,
+                           offset,
+                           listIndices,
+                           prefixSumOffsets,
+                           topQueryToCentroid,
+                           opt);
    }

    outIndices[queryId][i] = index;
--- a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cu
+++ b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cu
@ -224,6 +224,7 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
                 Tensor<float, 3, true>& pqCentroidsInnermostCode,
                 NoTypeTensor<4, true>& codeDistances,
                 Tensor<int, 2, true>& topQueryToCentroid,
+                 Tensor<uint8_t, 1, true>& bitset,
                 bool useFloat16Lookup,
                 int bytesPerCode,
                 int numSubQuantizers,
@ -357,7 +358,11 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
  CUDA_TEST_ERROR();

  // k-select the output in chunks, to increase parallelism
-  runPass1SelectLists(prefixSumOffsets,
+  runPass1SelectLists(listIndices,
+                      indicesOptions,
+                      prefixSumOffsets,
+                      topQueryToCentroid,
+                      bitset,
                      allDistances,
                      topQueryToCentroid.getSize(1),
                      k,
@ -387,6 +392,7 @@ void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
                                     Tensor<float, 2, true>& centroids,
                                     Tensor<float, 3, true>& pqCentroidsInnermostCode,
                                     Tensor<int, 2, true>& topQueryToCentroid,
+                                     Tensor<uint8_t, 1, true>& bitset,
                                     bool useFloat16Lookup,
                                     int bytesPerCode,
                                     int numSubQuantizers,
@ -560,6 +566,7 @@ void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
                     pqCentroidsInnermostCode,
                     codeDistancesView,
                     coarseIndicesView,
+                     bitset,
                     useFloat16Lookup,
                     bytesPerCode,
                     numSubQuantizers,
--- a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh
+++ b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh
@ -24,6 +24,7 @@ void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
                                     Tensor<float, 2, true>& centroids,
                                     Tensor<float, 3, true>& pqCentroidsInnermostCode,
                                     Tensor<int, 2, true>& topQueryToCentroid,
+                                     Tensor<uint8_t, 1, true>& bitset,
                                     bool useFloat16Lookup,
                                     int bytesPerCode,
                                     int numSubQuantizers,
--- a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu
+++ b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu
@ -219,6 +219,7 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
                 NoTypeTensor<3, true>& precompTerm2,
                 NoTypeTensor<3, true>& precompTerm3,
                 Tensor<int, 2, true>& topQueryToCentroid,
+                 Tensor<uint8_t, 1, true>& bitset,
                 bool useFloat16Lookup,
                 int bytesPerCode,
                 int numSubQuantizers,
@ -344,7 +345,11 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
  }

  // k-select the output in chunks, to increase parallelism
-  runPass1SelectLists(prefixSumOffsets,
+  runPass1SelectLists(listIndices,
+                      indicesOptions,
+                      prefixSumOffsets,
+                      topQueryToCentroid,
+                      bitset,
                      allDistances,
                      topQueryToCentroid.getSize(1),
                      k,
@ -377,6 +382,7 @@ void runPQScanMultiPassPrecomputed(Tensor<float, 2, true>& queries,
                                   NoTypeTensor<3, true>& precompTerm2,
                                   NoTypeTensor<3, true>& precompTerm3,
                                   Tensor<int, 2, true>& topQueryToCentroid,
+                                   Tensor<uint8_t, 1, true>& bitset,
                                   bool useFloat16Lookup,
                                   int bytesPerCode,
                                   int numSubQuantizers,
@ -527,6 +533,7 @@ void runPQScanMultiPassPrecomputed(Tensor<float, 2, true>& queries,
                     precompTerm2,
                     term3View,
                     coarseIndicesView,
+                     bitset,
                     useFloat16Lookup,
                     bytesPerCode,
                     numSubQuantizers,
--- a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cuh
+++ b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cuh
@ -22,6 +22,7 @@ void runPQScanMultiPassPrecomputed(Tensor<float, 2, true>& queries,
                                   NoTypeTensor<3, true>& precompTerm2,
                                   NoTypeTensor<3, true>& precompTerm3,
                                   Tensor<int, 2, true>& topQueryToCentroid,
+                                   Tensor<uint8_t, 1, true>& bitset,
                                   bool useFloat16Lookup,
                                   int bytesPerCode,
                                   int numSubQuantizers,
--- a/core/src/index/thirdparty/faiss/makefile.inc.in
+++ b/core/src/index/thirdparty/faiss/makefile.inc.in
@ -22,7 +22,7 @@ NVCCFLAGS    = -I $(CUDA_ROOT)/targets/x86_64-linux/include/ \
 -Xcudafe --diag_suppress=unrecognized_attribute \
 $(CUDA_ARCH) \
 -lineinfo \
-ccbin $(CXX) -DFAISS_USE_FLOAT16
+-ccbin $(CXX)

 OS = $(shell uname -s)

--- a/core/src/index/unittest/test_ivf.cpp
+++ b/core/src/index/unittest/test_ivf.cpp
@ -90,9 +90,13 @@ INSTANTIATE_TEST_CASE_P(
        std::make_tuple(milvus::knowhere::IndexEnum::INDEX_FAISS_IVFPQ, milvus::knowhere::IndexMode::MODE_CPU),
        std::make_tuple(milvus::knowhere::IndexEnum::INDEX_FAISS_IVFSQ8, milvus::knowhere::IndexMode::MODE_CPU)));

-TEST_P(IVFTest, ivf_basic) {
+TEST_P(IVFTest, ivf_basic_cpu) {
    assert(!xb.empty());

+    if (index_mode_ != milvus::knowhere::IndexMode::MODE_CPU) {
+        return;
+    }
+
    // null faiss index
    ASSERT_ANY_THROW(index_->Add(base_dataset, conf_));
    ASSERT_ANY_THROW(index_->AddWithoutIds(base_dataset, conf_));
@ -103,11 +107,10 @@ TEST_P(IVFTest, ivf_basic) {
    EXPECT_EQ(index_->Dim(), dim);

    auto result = index_->Query(query_dataset, conf_);
-    AssertAnns(result, nq, conf_[milvus::knowhere::meta::TOPK]);
+    AssertAnns(result, nq, k);
    // PrintResult(result, nq, k);

-    if (index_mode_ == milvus::knowhere::IndexMode::MODE_CPU &&
-        index_type_ != milvus::knowhere::IndexEnum::INDEX_FAISS_IVFPQ) {
+    if (index_type_ != milvus::knowhere::IndexEnum::INDEX_FAISS_IVFPQ) {
        auto result2 = index_->QueryById(id_dataset, conf_);
        AssertAnns(result2, nq, k);

@ -143,6 +146,43 @@ TEST_P(IVFTest, ivf_basic) {
 #endif
 }

+TEST_P(IVFTest, ivf_basic_gpu) {
+    assert(!xb.empty());
+
+    if (index_mode_ != milvus::knowhere::IndexMode::MODE_GPU) {
+        return;
+    }
+
+    // null faiss index
+    ASSERT_ANY_THROW(index_->Add(base_dataset, conf_));
+    ASSERT_ANY_THROW(index_->AddWithoutIds(base_dataset, conf_));
+
+    index_->Train(base_dataset, conf_);
+    index_->Add(base_dataset, conf_);
+    EXPECT_EQ(index_->Count(), nb);
+    EXPECT_EQ(index_->Dim(), dim);
+
+    auto result = index_->Query(query_dataset, conf_);
+    AssertAnns(result, nq, k);
+    // PrintResult(result, nq, k);
+
+    if (index_type_ != milvus::knowhere::IndexEnum::INDEX_FAISS_IVFSQ8H) {
+        faiss::ConcurrentBitsetPtr concurrent_bitset_ptr = std::make_shared<faiss::ConcurrentBitset>(nb);
+        for (int64_t i = 0; i < nq; ++i) {
+            concurrent_bitset_ptr->set(i);
+        }
+        index_->SetBlacklist(concurrent_bitset_ptr);
+
+        auto result_bs_1 = index_->Query(query_dataset, conf_);
+        AssertAnns(result_bs_1, nq, k, CheckMode::CHECK_NOT_EQUAL);
+        // PrintResult(result, nq, k);
+    }
+
+#ifdef MILVUS_GPU_VERSION
+    milvus::knowhere::FaissGpuResourceMgr::GetInstance().Dump();
+#endif
+}
+
 TEST_P(IVFTest, ivf_serialize) {
    fiu_init(0);
    auto serialize = [](const std::string& filename, milvus::knowhere::BinaryPtr& bin, uint8_t* ret) {