diff --git a/CHANGELOG.md b/CHANGELOG.md
index ea5ca83c98..d72c7df614 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ Please mark all change in change log and use the issue from GitHub
 - \#1762 Server is not forbidden to create new partition which tag is "_default"
 
 ## Feature
+- \#1655 GPU index support delete vectors
 
 ## Improvement
 
diff --git a/core/src/index/knowhere/knowhere/index/vector_index/gpu/IndexGPUIVF.cpp b/core/src/index/knowhere/knowhere/index/vector_index/gpu/IndexGPUIVF.cpp
index 599f602fd7..860d32d57a 100644
--- a/core/src/index/knowhere/knowhere/index/vector_index/gpu/IndexGPUIVF.cpp
+++ b/core/src/index/knowhere/knowhere/index/vector_index/gpu/IndexGPUIVF.cpp
@@ -145,7 +145,7 @@ GPUIVF::QueryImpl(int64_t n, const float* data, int64_t k, float* distances, int
     if (device_index) {
         device_index->nprobe = config[IndexParams::nprobe];
         ResScope rs(res_, gpu_id_);
-        device_index->search(n, (float*)data, k, distances, labels);
+        device_index->search(n, (float*)data, k, distances, labels, bitset_);
     } else {
         KNOWHERE_THROW_MSG("Not a GpuIndexIVF type.");
     }
diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu
index 9fb52911b0..0f969c2ac9 100644
--- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu
+++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu
@@ -366,9 +366,15 @@ GpuIndexIVFPQ::searchImpl_(int n,
   static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch");
   Tensor<long, 2, true> outLabels(const_cast<long*>(labels), {n, k});
 
-  auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
-
-  index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
+  if (!bitset) {
+    auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
+    index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
+  } else {
+    auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_,
+                                             const_cast<uint8_t*>(bitset->data()), stream,
+                                             {(int) bitset->size()});
+    index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
+  }
 }
 
 int
diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu
index 1599cfab6a..c0e194d45b 100644
--- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu
+++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu
@@ -275,9 +275,15 @@ GpuIndexIVFScalarQuantizer::searchImpl_(int n,
   static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch");
   Tensor<long, 2, true> outLabels(const_cast<long*>(labels), {n, k});
 
-  auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
-
-  index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
+  if (!bitset) {
+    auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
+    index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
+  } else {
+    auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_,
+                                             const_cast<uint8_t*>(bitset->data()), stream,
+                                             {(int) bitset->size()});
+    index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
+  }
 }
 
 } } // namespace
diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu
index 5c1db4b7fb..8de3964e1d 100644
--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu
@@ -164,8 +164,6 @@ IVFFlat::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
   auto& mem = resources_->getMemoryManagerCurrentDevice();
   auto stream = resources_->getDefaultStreamCurrentDevice();
 
-  DeviceTensor<uint8_t, 1, true> bitset(mem, {0}, stream);
-
   // Number of valid vectors that we actually add; we return this
   int numAdded = 0;
 
@@ -176,6 +174,8 @@ IVFFlat::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
     listIds2d(mem, {vecs.getSize(0), 1}, stream);
   auto listIds = listIds2d.view<1>({vecs.getSize(0)});
 
+  /* pseudo bitset */
+  DeviceTensor<uint8_t, 1, true> bitset(mem, {0}, stream);
   quantizer_->query(vecs, bitset, 1, listDistance2d, listIds2d, false);
 
   // Calculate residuals for these vectors, if needed
@@ -354,8 +354,9 @@ IVFFlat::query(Tensor<float, 2, true>& queries,
 
   // Find the `nprobe` closest lists; we can use int indices both
   // internally and externally
+  DeviceTensor<uint8_t, 1, true> coarseBitset(mem, {0}, stream);
   quantizer_->query(queries,
-                    bitset,
+                    coarseBitset,
                     nprobe,
                     coarseDistances,
                     coarseIndices,
@@ -371,6 +372,7 @@ IVFFlat::query(Tensor<float, 2, true>& queries,
 
     runIVFFlatScan(queries,
                    coarseIndices,
+                   bitset,
                    deviceListDataPointers_,
                    deviceListIndexPointers_,
                    indicesOptions_,
diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cu
index 5b7c94022f..deb2720db4 100644
--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cu
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cu
@@ -160,6 +160,7 @@ ivfFlatScan(Tensor<float, 2, true> queries,
 void
 runIVFFlatScanTile(Tensor<float, 2, true>& queries,
                    Tensor<int, 2, true>& listIds,
+                   Tensor<uint8_t, 1, true>& bitset,
                    thrust::device_vector<void*>& listData,
                    thrust::device_vector<void*>& listIndices,
                    IndicesOptions indicesOptions,
@@ -315,7 +316,11 @@ runIVFFlatScanTile(Tensor<float, 2, true>& queries,
 #undef RUN_IVF_FLAT
 
   // k-select the output in chunks, to increase parallelism
-  runPass1SelectLists(prefixSumOffsets,
+  runPass1SelectLists(listIndices,
+                      indicesOptions,
+                      prefixSumOffsets,
+                      listIds,
+                      bitset,
                       allDistances,
                       listIds.getSize(1),
                       k,
@@ -344,6 +349,7 @@ runIVFFlatScanTile(Tensor<float, 2, true>& queries,
 void
 runIVFFlatScan(Tensor<float, 2, true>& queries,
                Tensor<int, 2, true>& listIds,
+               Tensor<uint8_t, 1, true>& bitset,
                thrust::device_vector<void*>& listData,
                thrust::device_vector<void*>& listIndices,
                IndicesOptions indicesOptions,
@@ -489,6 +495,7 @@ runIVFFlatScan(Tensor<float, 2, true>& queries,
 
     runIVFFlatScanTile(queryView,
                        listIdsView,
+                       bitset,
                        listData,
                        listIndices,
                        indicesOptions,
diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cuh b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cuh
index 475e71ab5d..ac63579dd2 100644
--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cuh
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cuh
@@ -20,6 +20,7 @@ class GpuResources;
 
 void runIVFFlatScan(Tensor<float, 2, true>& queries,
                     Tensor<int, 2, true>& listIds,
+                    Tensor<uint8_t, 1, true>& bitset,
                     thrust::device_vector<void*>& listData,
                     thrust::device_vector<void*>& listIndices,
                     IndicesOptions indicesOptions,
diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu
index 636eef78c0..6f48f3a6db 100644
--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu
@@ -119,8 +119,6 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
   auto& mem = resources_->getMemoryManagerCurrentDevice();
   auto stream = resources_->getDefaultStreamCurrentDevice();
 
-  DeviceTensor<uint8_t, 1, true> bitset(mem, {0}, stream);
-
   // Number of valid vectors that we actually add; we return this
   int numAdded = 0;
 
@@ -130,6 +128,8 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
   DeviceTensor<int, 2, true> listIds2d(mem, {vecs.getSize(0), 1}, stream);
   auto listIds = listIds2d.view<1>({vecs.getSize(0)});
 
+  /* pseudo bitset */
+  DeviceTensor<uint8_t, 1, true> bitset(mem, {0}, stream);
   quantizer_->query(vecs, bitset, 1, listDistance, listIds2d, false);
 
   // Copy the lists that we wish to append to back to the CPU
@@ -532,10 +532,11 @@ IVFPQ::query(Tensor<float, 2, true>& queries,
   DeviceTensor<int, 2, true>
     coarseIndices(mem, {queries.getSize(0), nprobe}, stream);
+  DeviceTensor<uint8_t, 1, true> coarseBitset(mem, {0}, stream);
 
   // Find the `nprobe` closest coarse centroids; we can use int
   // indices both internally and externally
   quantizer_->query(queries,
-                    bitset,
+                    coarseBitset,
                     nprobe,
                     coarseDistances,
                     coarseIndices,
@@ -543,6 +544,7 @@ IVFPQ::query(Tensor<float, 2, true>& queries,
 
   if (precomputedCodes_) {
     runPQPrecomputedCodes_(queries,
+                           bitset,
                            coarseDistances,
                            coarseIndices,
                            k,
@@ -550,6 +552,7 @@ IVFPQ::query(Tensor<float, 2, true>& queries,
                            outIndices);
   } else {
     runPQNoPrecomputedCodes_(queries,
+                             bitset,
                              coarseDistances,
                              coarseIndices,
                              k,
@@ -592,6 +595,7 @@ IVFPQ::getPQCentroids() {
 void
 IVFPQ::runPQPrecomputedCodes_(
   Tensor<float, 2, true>& queries,
+  Tensor<uint8_t, 1, true>& bitset,
   DeviceTensor<float, 2, true>& coarseDistances,
   DeviceTensor<int, 2, true>& coarseIndices,
   int k,
@@ -655,6 +659,7 @@ IVFPQ::runPQPrecomputedCodes_(
                                  term2, // term 2
                                  term3, // term 3
                                  coarseIndices,
+                                 bitset,
                                  useFloat16LookupTables_,
                                  bytesPerVector_,
                                  numSubQuantizers_,
@@ -673,6 +678,7 @@ IVFPQ::runPQPrecomputedCodes_(
 void
 IVFPQ::runPQNoPrecomputedCodes_(
   Tensor<float, 2, true>& queries,
+  Tensor<uint8_t, 1, true>& bitset,
   DeviceTensor<float, 2, true>& coarseDistances,
   DeviceTensor<int, 2, true>& coarseIndices,
   int k,
@@ -685,6 +691,7 @@ IVFPQ::runPQNoPrecomputedCodes_(
                                 coarseCentroids,
                                 pqCentroidsInnermostCode_,
                                 coarseIndices,
+                                bitset,
                                 useFloat16LookupTables_,
                                 bytesPerVector_,
                                 numSubQuantizers_,
diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh
index 4d41d0a6a6..8771e7a507 100644
--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh
@@ -83,6 +83,7 @@ class IVFPQ : public IVFBase {
 
   /// Runs kernels for scanning inverted lists with precomputed codes
   void runPQPrecomputedCodes_(Tensor<float, 2, true>& queries,
+                              Tensor<uint8_t, 1, true>& bitset,
                               DeviceTensor<float, 2, true>& coarseDistances,
                               DeviceTensor<int, 2, true>& coarseIndices,
                               int k,
@@ -91,6 +92,7 @@ class IVFPQ : public IVFBase {
 
   /// Runs kernels for scanning inverted lists without precomputed codes
   void runPQNoPrecomputedCodes_(Tensor<float, 2, true>& queries,
+                                Tensor<uint8_t, 1, true>& bitset,
                                 DeviceTensor<float, 2, true>& coarseDistances,
                                 DeviceTensor<int, 2, true>& coarseIndices,
                                 int k,
diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFUtils.cuh b/core/src/index/thirdparty/faiss/gpu/impl/IVFUtils.cuh
index eba3a1051b..3eb226568d 100644
--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFUtils.cuh
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFUtils.cuh
@@ -16,6 +16,70 @@
 // post-processing and k-selecting the results
 namespace faiss { namespace gpu {
 
+// This is warp divergence central, but this is really a final step
+// and happening a small number of times
+inline __device__ int binarySearchForBucket(int* prefixSumOffsets,
+                                            int size,
+                                            int val) {
+  int start = 0;
+  int end = size;
+
+  while (end - start > 0) {
+    int mid = start + (end - start) / 2;
+
+    int midVal = prefixSumOffsets[mid];
+
+    // Find the first bucket that we are <=
+    if (midVal <= val) {
+      start = mid + 1;
+    } else {
+      end = mid;
+    }
+  }
+
+  // We must find the bucket that it is in
+  assert(start != size);
+
+  return start;
+}
+
+inline __device__ long
+getListIndex(int queryId,
+             int offset,
+             void** listIndices,
+             Tensor<int, 2, true>& prefixSumOffsets,
+             Tensor<int, 2, true>& topQueryToCentroid,
+             IndicesOptions opt) {
+  long index = -1;
+
+  // In order to determine the actual user index, we need to first
+  // determine what list it was in.
+  // We do this by binary search in the prefix sum list.
+  int probe = binarySearchForBucket(prefixSumOffsets[queryId].data(),
+                                    prefixSumOffsets.getSize(1),
+                                    offset);
+
+  // This is then the probe for the query; we can find the actual
+  // list ID from this
+  int listId = topQueryToCentroid[queryId][probe];
+
+  // Now, we need to know the offset within the list
+  // We ensure that before the array (at offset -1), there is a 0 value
+  int listStart = *(prefixSumOffsets[queryId][probe].data() - 1);
+  int listOffset = offset - listStart;
+
+  // This gives us our final index
+  if (opt == INDICES_32_BIT) {
+    index = (long) ((int*) listIndices[listId])[listOffset];
+  } else if (opt == INDICES_64_BIT) {
+    index = ((long*) listIndices[listId])[listOffset];
+  } else {
+    index = ((long) listId << 32 | (long) listOffset);
+  }
+
+  return index;
+}
+
 /// Function for multi-pass scanning that collects the length of
 /// intermediate results for all (query, probe) pair
 void runCalcListOffsets(Tensor<int, 2, true>& topQueryToCentroid,
@@ -25,7 +89,11 @@ void runCalcListOffsets(Tensor<int, 2, true>& topQueryToCentroid,
                         cudaStream_t stream);
 
 /// Performs a first pass of k-selection on the results
-void runPass1SelectLists(Tensor<int, 2, true>& prefixSumOffsets,
+void runPass1SelectLists(thrust::device_vector<void*>& listIndices,
+                         IndicesOptions indicesOptions,
+                         Tensor<int, 2, true>& prefixSumOffsets,
+                         Tensor<int, 2, true>& topQueryToCentroid,
+                         Tensor<uint8_t, 1, true>& bitset,
                          Tensor<float, 1, true>& distance,
                          int nprobe,
                          int k,
diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect1.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect1.cu
index 63c563c8fd..bbb32ad1d6 100644
--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect1.cu
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect1.cu
@@ -23,10 +23,14 @@ namespace faiss { namespace gpu {
 
 template <int ThreadsPerBlock, int NumWarpQ, int NumThreadQ, bool Dir>
 __global__ void
-pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
+pass1SelectLists(void** listIndices,
+                 Tensor<int, 2, true> prefixSumOffsets,
+                 Tensor<int, 2, true> topQueryToCentroid,
+                 Tensor<uint8_t, 1, true> bitset,
                  Tensor<float, 1, true> distance,
                  int nprobe,
                  int k,
+                 IndicesOptions opt,
                  Tensor<float, 3, true> heapDistances,
                  Tensor<int, 3, true> heapIndices) {
   constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
@@ -58,16 +62,38 @@ pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
   int i = threadIdx.x;
   auto distanceStart = distance[start].data();
 
+  bool bitsetEmpty = (bitset.getSize(0) == 0);
+  long index = -1;
   // BlockSelect add cannot be used in a warp divergent circumstance; we
   // handle the remainder warp below
   for (; i < limit; i += blockDim.x) {
-    heap.add(distanceStart[i], start + i);
+    index = getListIndex(queryId,
+                         start + i,
+                         listIndices,
+                         prefixSumOffsets,
+                         topQueryToCentroid,
+                         opt);
+    if (bitsetEmpty || (!(bitset[index >> 3] & (0x1 << (index & 0x7))))) {
+      heap.add(distanceStart[i], start + i);
+    } else {
+      heap.add((1.0 / 0.0), start + i);
+    }
   }
 
   // Handle warp divergence separately
   if (i < num) {
-    heap.addThreadQ(distanceStart[i], start + i);
+    index = getListIndex(queryId,
+                         start + i,
+                         listIndices,
+                         prefixSumOffsets,
+                         topQueryToCentroid,
+                         opt);
+    if (bitsetEmpty || (!(bitset[index >> 3] & (0x1 << (index & 0x7))))) {
+      heap.addThreadQ(distanceStart[i], start + i);
+    } else {
+      heap.addThreadQ((1.0 / 0.0), start + i);
+    }
   }
 
   // Merge all final results
@@ -82,7 +108,11 @@ pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
 }
 
 void
-runPass1SelectLists(Tensor<int, 2, true>& prefixSumOffsets,
+runPass1SelectLists(thrust::device_vector<void*>& listIndices,
+                    IndicesOptions indicesOptions,
+                    Tensor<int, 2, true>& prefixSumOffsets,
+                    Tensor<int, 2, true>& topQueryToCentroid,
+                    Tensor<uint8_t, 1, true>& bitset,
                     Tensor<float, 1, true>& distance,
                     int nprobe,
                     int k,
@@ -98,10 +128,14 @@ runPass1SelectLists(Tensor<int, 2, true>& prefixSumOffsets,
 
 #define RUN_PASS(BLOCK, NUM_WARP_Q, NUM_THREAD_Q, DIR)                  \
   do {                                                                  \
     pass1SelectLists<BLOCK, NUM_WARP_Q, NUM_THREAD_Q, DIR>              \
-      <<<grid, BLOCK, 0, stream>>>(prefixSumOffsets,                    \
+      <<<grid, BLOCK, 0, stream>>>(listIndices.data().get(),            \
+                                   prefixSumOffsets,                    \
+                                   topQueryToCentroid,                  \
+                                   bitset,                              \
                                    distance,                            \
                                    nprobe,                              \
                                    k,                                   \
+                                   indicesOptions,                      \
                                    heapDistances,                       \
                                    heapIndices);                        \
     CUDA_TEST_ERROR();                                                  \
diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect2.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect2.cu
index e629dbdfe4..8c6b9eb3b8 100644
--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect2.cu
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect2.cu
@@ -23,30 +23,30 @@ namespace faiss { namespace gpu {
 
 // This is warp divergence central, but this is really a final step
 // and happening a small number of times
-inline __device__ int binarySearchForBucket(int* prefixSumOffsets,
-                                            int size,
-                                            int val) {
-  int start = 0;
-  int end = size;
-
-  while (end - start > 0) {
-    int mid = start + (end - start) / 2;
-
-    int midVal = prefixSumOffsets[mid];
-
-    // Find the first bucket that we are <=
-    if (midVal <= val) {
-      start = mid + 1;
-    } else {
-      end = mid;
-    }
-  }
-
-  // We must find the bucket that it is in
-  assert(start != size);
-
-  return start;
-}
+//inline __device__ int binarySearchForBucket(int* prefixSumOffsets,
+//                                            int size,
+//                                            int val) {
+//  int start = 0;
+//  int end = size;
+//
+//  while (end - start > 0) {
+//    int mid = start + (end - start) / 2;
+//
+//    int midVal = prefixSumOffsets[mid];
+//
+//    // Find the first bucket that we are <=
+//    if (midVal <= val) {
+//      start = mid + 1;
+//    } else {
+//      end = mid;
+//    }
+//  }
+//
+//  // We must find the bucket that it is in
+//  assert(start != size);
+//
+//  return start;
+//}
 
 template <int ThreadsPerBlock,
@@ … @@ pass2SelectLists(Tensor<float, 2, true> heapDistances,
       // calculated by the original scan.
       int offset = heapIndices[queryId][v];
 
-      // In order to determine the actual user index, we need to first
-      // determine what list it was in.
-      // We do this by binary search in the prefix sum list.
-      int probe = binarySearchForBucket(prefixSumOffsets[queryId].data(),
-                                        prefixSumOffsets.getSize(1),
-                                        offset);
-
-      // This is then the probe for the query; we can find the actual
-      // list ID from this
-      int listId = topQueryToCentroid[queryId][probe];
-
-      // Now, we need to know the offset within the list
-      // We ensure that before the array (at offset -1), there is a 0 value
-      int listStart = *(prefixSumOffsets[queryId][probe].data() - 1);
-      int listOffset = offset - listStart;
-
-      // This gives us our final index
-      if (opt == INDICES_32_BIT) {
-        index = (long) ((int*) listIndices[listId])[listOffset];
-      } else if (opt == INDICES_64_BIT) {
-        index = ((long*) listIndices[listId])[listOffset];
-      } else {
-        index = ((long) listId << 32 | (long) listOffset);
-      }
+      index = getListIndex(queryId,
+                           offset,
+                           listIndices,
+                           prefixSumOffsets,
+                           topQueryToCentroid,
+                           opt);
     }
 
     outIndices[queryId][i] = index;
diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cu b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cu
index d885d5f7ba..57030c9e34 100644
--- a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cu
+++ b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cu
@@ -224,6 +224,7 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
                  Tensor<float, 3, true>& pqCentroidsInnermostCode,
                  NoTypeTensor<4, true>& codeDistances,
                  Tensor<int, 2, true>& topQueryToCentroid,
+                 Tensor<uint8_t, 1, true>& bitset,
                  bool useFloat16Lookup,
                  int bytesPerCode,
                  int numSubQuantizers,
@@ -357,7 +358,11 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
   CUDA_TEST_ERROR();
 
   // k-select the output in chunks, to increase parallelism
-  runPass1SelectLists(prefixSumOffsets,
+  runPass1SelectLists(listIndices,
+                      indicesOptions,
+                      prefixSumOffsets,
+                      topQueryToCentroid,
+                      bitset,
                       allDistances,
                       topQueryToCentroid.getSize(1),
                       k,
@@ -387,6 +392,7 @@ void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
                                      Tensor<float, 2, true>& centroids,
                                      Tensor<float, 3, true>& pqCentroidsInnermostCode,
                                      Tensor<int, 2, true>& topQueryToCentroid,
+                                     Tensor<uint8_t, 1, true>& bitset,
                                      bool useFloat16Lookup,
                                      int bytesPerCode,
                                      int numSubQuantizers,
@@ -560,6 +566,7 @@ void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
                      pqCentroidsInnermostCode,
                      codeDistancesView,
                      coarseIndicesView,
+                     bitset,
                      useFloat16Lookup,
                      bytesPerCode,
                      numSubQuantizers,
diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh
index 3d77a0ff5c..50c017c04f 100644
--- a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh
+++ b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh
@@ -24,6 +24,7 @@ void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
                                      Tensor<float, 2, true>& centroids,
                                      Tensor<float, 3, true>& pqCentroidsInnermostCode,
                                      Tensor<int, 2, true>& topQueryToCentroid,
+                                     Tensor<uint8_t, 1, true>& bitset,
                                      bool useFloat16Lookup,
                                      int bytesPerCode,
                                      int numSubQuantizers,
diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu
index 58c2114595..583ee477dc 100644
--- a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu
+++ b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu
@@ -219,6 +219,7 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
                  NoTypeTensor<3, true>& precompTerm2,
                  NoTypeTensor<3, true>& precompTerm3,
                  Tensor<int, 2, true>& topQueryToCentroid,
+                 Tensor<uint8_t, 1, true>& bitset,
                  bool useFloat16Lookup,
                  int bytesPerCode,
                  int numSubQuantizers,
@@ -344,7 +345,11 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
   }
 
   // k-select the output in chunks, to increase parallelism
-  runPass1SelectLists(prefixSumOffsets,
+  runPass1SelectLists(listIndices,
+                      indicesOptions,
+                      prefixSumOffsets,
+                      topQueryToCentroid,
+                      bitset,
                       allDistances,
                       topQueryToCentroid.getSize(1),
                       k,
@@ -377,6 +382,7 @@ void runPQScanMultiPassPrecomputed(Tensor<float, 2, true>& queries,
                                    NoTypeTensor<3, true>& precompTerm2,
                                    NoTypeTensor<3, true>& precompTerm3,
                                    Tensor<int, 2, true>& topQueryToCentroid,
+                                   Tensor<uint8_t, 1, true>& bitset,
                                    bool useFloat16Lookup,
                                    int bytesPerCode,
                                    int numSubQuantizers,
@@ -527,6 +533,7 @@ void runPQScanMultiPassPrecomputed(Tensor<float, 2, true>& queries,
                    precompTerm2,
                    term3View,
                    coarseIndicesView,
+                   bitset,
                    useFloat16Lookup,
                    bytesPerCode,
                    numSubQuantizers,
diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cuh b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cuh
index ffe548b785..644ba7d99d 100644
--- a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cuh
+++ b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cuh
@@ -22,6 +22,7 @@ void runPQScanMultiPassPrecomputed(Tensor<float, 2, true>& queries,
                                    NoTypeTensor<3, true>& precompTerm2,
                                    NoTypeTensor<3, true>& precompTerm3,
                                    Tensor<int, 2, true>& topQueryToCentroid,
+                                   Tensor<uint8_t, 1, true>& bitset,
                                    bool useFloat16Lookup,
                                    int bytesPerCode,
                                    int numSubQuantizers,
diff --git a/core/src/index/thirdparty/faiss/makefile.inc.in b/core/src/index/thirdparty/faiss/makefile.inc.in
index 744ca6d50e..01ff95ac1c 100644
--- a/core/src/index/thirdparty/faiss/makefile.inc.in
+++ b/core/src/index/thirdparty/faiss/makefile.inc.in
@@ -22,7 +22,7 @@ NVCCFLAGS = -I $(CUDA_ROOT)/targets/x86_64-linux/include/ \
 -Xcudafe --diag_suppress=unrecognized_attribute \
 $(CUDA_ARCH) \
 -lineinfo \
--ccbin $(CXX) -DFAISS_USE_FLOAT16
+-ccbin $(CXX)
 
 OS = $(shell uname -s)
diff --git a/core/src/index/unittest/test_ivf.cpp b/core/src/index/unittest/test_ivf.cpp
index a57b66ec1b..b549fe17fb 100644
--- a/core/src/index/unittest/test_ivf.cpp
+++ b/core/src/index/unittest/test_ivf.cpp
@@ -90,9 +90,13 @@ INSTANTIATE_TEST_CASE_P(
     std::make_tuple(milvus::knowhere::IndexEnum::INDEX_FAISS_IVFPQ, milvus::knowhere::IndexMode::MODE_CPU),
     std::make_tuple(milvus::knowhere::IndexEnum::INDEX_FAISS_IVFSQ8, milvus::knowhere::IndexMode::MODE_CPU)));
 
-TEST_P(IVFTest, ivf_basic) {
+TEST_P(IVFTest, ivf_basic_cpu) {
     assert(!xb.empty());
 
+    if (index_mode_ != milvus::knowhere::IndexMode::MODE_CPU) {
+        return;
+    }
+
     // null faiss index
     ASSERT_ANY_THROW(index_->Add(base_dataset, conf_));
     ASSERT_ANY_THROW(index_->AddWithoutIds(base_dataset, conf_));
@@ -103,11 +107,10 @@ TEST_P(IVFTest, ivf_basic) {
     EXPECT_EQ(index_->Dim(), dim);
 
     auto result = index_->Query(query_dataset, conf_);
-    AssertAnns(result, nq, conf_[milvus::knowhere::meta::TOPK]);
+    AssertAnns(result, nq, k);
     // PrintResult(result, nq, k);
 
-    if (index_mode_ == milvus::knowhere::IndexMode::MODE_CPU &&
-        index_type_ != milvus::knowhere::IndexEnum::INDEX_FAISS_IVFPQ) {
+    if (index_type_ != milvus::knowhere::IndexEnum::INDEX_FAISS_IVFPQ) {
         auto result2 = index_->QueryById(id_dataset, conf_);
         AssertAnns(result2, nq, k);
@@ -143,6 +146,43 @@ TEST_P(IVFTest, ivf_basic) {
 #endif
 }
 
+TEST_P(IVFTest, ivf_basic_gpu) {
+    assert(!xb.empty());
+
+    if (index_mode_ != milvus::knowhere::IndexMode::MODE_GPU) {
+        return;
+    }
+
+    // null faiss index
+    ASSERT_ANY_THROW(index_->Add(base_dataset, conf_));
+    ASSERT_ANY_THROW(index_->AddWithoutIds(base_dataset, conf_));
+
+    index_->Train(base_dataset, conf_);
+    index_->Add(base_dataset, conf_);
+    EXPECT_EQ(index_->Count(), nb);
+    EXPECT_EQ(index_->Dim(), dim);
+
+    auto result = index_->Query(query_dataset, conf_);
+    AssertAnns(result, nq, k);
+    // PrintResult(result, nq, k);
+
+    if (index_type_ != milvus::knowhere::IndexEnum::INDEX_FAISS_IVFSQ8H) {
+        faiss::ConcurrentBitsetPtr concurrent_bitset_ptr = std::make_shared<faiss::ConcurrentBitset>(nb);
+        for (int64_t i = 0; i < nq; ++i) {
+            concurrent_bitset_ptr->set(i);
+        }
+        index_->SetBlacklist(concurrent_bitset_ptr);
+
+        auto result_bs_1 = index_->Query(query_dataset, conf_);
+        AssertAnns(result_bs_1, nq, k, CheckMode::CHECK_NOT_EQUAL);
+        // PrintResult(result, nq, k);
+    }
+
+#ifdef MILVUS_GPU_VERSION
+    milvus::knowhere::FaissGpuResourceMgr::GetInstance().Dump();
+#endif
+}
+
 TEST_P(IVFTest, ivf_serialize) {
     fiu_init(0);
     auto serialize = [](const std::string& filename, milvus::knowhere::BinaryPtr& bin, uint8_t* ret) {
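
Reviewer note (not part of the patch): the filtering the patch wires through the GPU kernels comes down to the byte-packed bit test in pass1SelectLists, bitset[index >> 3] & (0x1 << (index & 0x7)), where a set bit marks a deleted vector and the hit is replaced with +infinity before k-selection. Below is a minimal host-side C++ sketch of that layout for reference only; the helper names are hypothetical and not part of the Milvus/Faiss API.

// sketch_bitset_layout.cpp -- illustrative only, assumes the same
// one-bit-per-vector, eight-vectors-per-byte layout used by the kernel.
#include <cassert>
#include <cstdint>
#include <vector>

// Mark vector `id` as deleted: set bit (id % 8) of byte (id / 8).
static void markDeleted(std::vector<uint8_t>& bitset, long id) {
    bitset[id >> 3] |= static_cast<uint8_t>(0x1 << (id & 0x7));
}

// Same expression the GPU kernel evaluates before deciding whether to add
// the real distance or +infinity to the selection heap.
static bool isDeleted(const std::vector<uint8_t>& bitset, long id) {
    return bitset[id >> 3] & (0x1 << (id & 0x7));
}

int main() {
    std::vector<uint8_t> bitset((100 + 7) / 8, 0);  // 100 vectors -> 13 bytes, all live
    markDeleted(bitset, 42);
    assert(isDeleted(bitset, 42));
    assert(!isDeleted(bitset, 43));
    return 0;
}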