diff --git a/CHANGELOG.md b/CHANGELOG.md
index ea5ca83c98..d72c7df614 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ Please mark all change in change log and use the issue from GitHub
 - \#1762 Server is not forbidden to create new partition which tag is "_default"
 
 ## Feature
+- \#1655 GPU index support delete vectors
 
 ## Improvement
 
diff --git a/core/src/index/knowhere/knowhere/index/vector_index/gpu/IndexGPUIVF.cpp b/core/src/index/knowhere/knowhere/index/vector_index/gpu/IndexGPUIVF.cpp
index 599f602fd7..860d32d57a 100644
--- a/core/src/index/knowhere/knowhere/index/vector_index/gpu/IndexGPUIVF.cpp
+++ b/core/src/index/knowhere/knowhere/index/vector_index/gpu/IndexGPUIVF.cpp
@@ -145,7 +145,7 @@ GPUIVF::QueryImpl(int64_t n, const float* data, int64_t k, float* distances, int
     if (device_index) {
         device_index->nprobe = config[IndexParams::nprobe];
         ResScope rs(res_, gpu_id_);
-        device_index->search(n, (float*)data, k, distances, labels);
+        device_index->search(n, (float*)data, k, distances, labels, bitset_);
     } else {
         KNOWHERE_THROW_MSG("Not a GpuIndexIVF type.");
     }
diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu
index 9fb52911b0..0f969c2ac9 100644
--- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu
+++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu
@@ -366,9 +366,15 @@ GpuIndexIVFPQ::searchImpl_(int n,
   static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch");
   Tensor<long, 2, true> outLabels(const_cast<long*>(labels), {n, k});
 
-  auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
-
-  index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
+  if (!bitset) {
+    auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
+    index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
+  } else {
+    auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_,
+                                             const_cast<uint8_t*>(bitset->data()), stream,
+                                             {(int) bitset->size()});
+    index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
+  }
 }
 
 int
diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu
index 1599cfab6a..c0e194d45b 100644
--- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu
+++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu
@@ -275,9 +275,15 @@ GpuIndexIVFScalarQuantizer::searchImpl_(int n,
   static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch");
   Tensor<long, 2, true> outLabels(const_cast<long*>(labels), {n, k});
 
-  auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
-
-  index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
+  if (!bitset) {
+    auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
+    index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
+  } else {
+    auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_,
+                                             const_cast<uint8_t*>(bitset->data()), stream,
+                                             {(int) bitset->size()});
+    index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
+  }
 }
 
 } } // namespace
diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu
index 5c1db4b7fb..8de3964e1d 100644
--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu
@@ -164,8 +164,6 @@ IVFFlat::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
   auto& mem = resources_->getMemoryManagerCurrentDevice();
   auto stream = resources_->getDefaultStreamCurrentDevice();
 
-  DeviceTensor<uint8_t, 1, true> bitset(mem, {0}, stream);
-
   // Number of valid vectors that we actually add; we return this
   int numAdded = 0;
 
@@ -176,6 +174,8 @@ IVFFlat::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
     listIds2d(mem, {vecs.getSize(0), 1}, stream);
   auto listIds = listIds2d.view<1>({vecs.getSize(0)});
 
+  /* pseudo bitset */
+  DeviceTensor<uint8_t, 1, true> bitset(mem, {0}, stream);
   quantizer_->query(vecs, bitset, 1, listDistance2d, listIds2d, false);
 
   // Calculate residuals for these vectors, if needed
@@ -354,8 +354,9 @@ IVFFlat::query(Tensor<float, 2, true>& queries,
 
   // Find the `nprobe` closest lists; we can use int indices both
   // internally and externally
+  DeviceTensor<uint8_t, 1, true> coarseBitset(mem, {0}, stream);
   quantizer_->query(queries,
-                    bitset,
+                    coarseBitset,
                     nprobe,
                     coarseDistances,
                     coarseIndices,
@@ -371,6 +372,7 @@ IVFFlat::query(Tensor<float, 2, true>& queries,
 
     runIVFFlatScan(queries,
                    coarseIndices,
+                   bitset,
                    deviceListDataPointers_,
                    deviceListIndexPointers_,
                    indicesOptions_,
diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cu
index 5b7c94022f..deb2720db4 100644
--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cu
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cu
@@ -160,6 +160,7 @@ ivfFlatScan(Tensor<float, 2, true> queries,
 void
 runIVFFlatScanTile(Tensor<float, 2, true>& queries,
                    Tensor<int, 2, true>& listIds,
+                   Tensor<uint8_t, 1, true>& bitset,
                    thrust::device_vector<void*>& listData,
                    thrust::device_vector<void*>& listIndices,
                    IndicesOptions indicesOptions,
@@ -315,7 +316,11 @@ runIVFFlatScanTile(Tensor<float, 2, true>& queries,
 #undef RUN_IVF_FLAT
 
   // k-select the output in chunks, to increase parallelism
-  runPass1SelectLists(prefixSumOffsets,
+  runPass1SelectLists(listIndices,
+                      indicesOptions,
+                      prefixSumOffsets,
+                      listIds,
+                      bitset,
                       allDistances,
                       listIds.getSize(1),
                       k,
@@ -344,6 +349,7 @@ runIVFFlatScanTile(Tensor<float, 2, true>& queries,
 void
 runIVFFlatScan(Tensor<float, 2, true>& queries,
                Tensor<int, 2, true>& listIds,
+               Tensor<uint8_t, 1, true>& bitset,
                thrust::device_vector<void*>& listData,
                thrust::device_vector<void*>& listIndices,
                IndicesOptions indicesOptions,
@@ -489,6 +495,7 @@ runIVFFlatScan(Tensor<float, 2, true>& queries,
 
     runIVFFlatScanTile(queryView,
                        listIdsView,
+                       bitset,
                        listData,
                        listIndices,
                        indicesOptions,
diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cuh b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cuh
index 475e71ab5d..ac63579dd2 100644
--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cuh
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cuh
@@ -20,6 +20,7 @@ class GpuResources;
 
 void runIVFFlatScan(Tensor<float, 2, true>& queries,
                     Tensor<int, 2, true>& listIds,
+                    Tensor<uint8_t, 1, true>& bitset,
                     thrust::device_vector<void*>& listData,
                     thrust::device_vector<void*>& listIndices,
                     IndicesOptions indicesOptions,
diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu
index 636eef78c0..6f48f3a6db 100644
--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu
@@ -119,8 +119,6 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
   auto& mem = resources_->getMemoryManagerCurrentDevice();
   auto stream = resources_->getDefaultStreamCurrentDevice();
 
-  DeviceTensor<uint8_t, 1, true> bitset(mem, {0}, stream);
-
   // Number of valid vectors that we actually add; we return this
   int numAdded = 0;
 
@@ -130,6 +128,8 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
   DeviceTensor<int, 2, true> listIds2d(mem, {vecs.getSize(0), 1}, stream);
   auto listIds = listIds2d.view<1>({vecs.getSize(0)});
 
+  /* pseudo bitset */
+  DeviceTensor<uint8_t, 1, true> bitset(mem, {0}, stream);
   quantizer_->query(vecs, bitset, 1, listDistance, listIds2d, false);
 
   // Copy the lists that we wish to append to back to the CPU
@@ -532,10 +532,11 @@ IVFPQ::query(Tensor<float, 2, true>& queries,
   DeviceTensor<int, 2, true>
     coarseIndices(mem, {queries.getSize(0), nprobe}, stream);
+  DeviceTensor<uint8_t, 1, true> coarseBitset(mem, {0}, stream);
 
   // Find the `nprobe` closest coarse centroids; we can use int
   // indices both internally and externally
   quantizer_->query(queries,
-                    bitset,
+                    coarseBitset,
                     nprobe,
                     coarseDistances,
                     coarseIndices,
@@ -543,6 +544,7 @@ IVFPQ::query(Tensor<float, 2, true>& queries,
 
   if (precomputedCodes_) {
     runPQPrecomputedCodes_(queries,
+                           bitset,
                            coarseDistances,
                            coarseIndices,
                            k,
@@ -550,6 +552,7 @@ IVFPQ::query(Tensor<float, 2, true>& queries,
                            outIndices);
   } else {
     runPQNoPrecomputedCodes_(queries,
+                             bitset,
                              coarseDistances,
                              coarseIndices,
                              k,
@@ -592,6 +595,7 @@ IVFPQ::getPQCentroids() {
 void
 IVFPQ::runPQPrecomputedCodes_(
   Tensor<float, 2, true>& queries,
+  Tensor<uint8_t, 1, true>& bitset,
   DeviceTensor<float, 2, true>& coarseDistances,
   DeviceTensor<int, 2, true>& coarseIndices,
   int k,
@@ -655,6 +659,7 @@ IVFPQ::runPQPrecomputedCodes_(
                                  term2, // term 2
                                  term3, // term 3
                                  coarseIndices,
+                                 bitset,
                                  useFloat16LookupTables_,
                                  bytesPerVector_,
                                  numSubQuantizers_,
@@ -673,6 +678,7 @@ IVFPQ::runPQPrecomputedCodes_(
 void
 IVFPQ::runPQNoPrecomputedCodes_(
   Tensor<float, 2, true>& queries,
+  Tensor<uint8_t, 1, true>& bitset,
   DeviceTensor<float, 2, true>& coarseDistances,
   DeviceTensor<int, 2, true>& coarseIndices,
   int k,
@@ -685,6 +691,7 @@ IVFPQ::runPQNoPrecomputedCodes_(
                                 coarseCentroids,
                                 pqCentroidsInnermostCode_,
                                 coarseIndices,
+                                bitset,
                                 useFloat16LookupTables_,
                                 bytesPerVector_,
                                 numSubQuantizers_,
diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh
index 4d41d0a6a6..8771e7a507 100644
--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh
@@ -83,6 +83,7 @@ class IVFPQ : public IVFBase {
 
   /// Runs kernels for scanning inverted lists with precomputed codes
   void runPQPrecomputedCodes_(Tensor<float, 2, true>& queries,
+                              Tensor<uint8_t, 1, true>& bitset,
                               DeviceTensor<float, 2, true>& coarseDistances,
                               DeviceTensor<int, 2, true>& coarseIndices,
                               int k,
@@ -91,6 +92,7 @@ class IVFPQ : public IVFBase {
 
   /// Runs kernels for scanning inverted lists without precomputed codes
   void runPQNoPrecomputedCodes_(Tensor<float, 2, true>& queries,
+                                Tensor<uint8_t, 1, true>& bitset,
                                 DeviceTensor<float, 2, true>& coarseDistances,
                                 DeviceTensor<int, 2, true>& coarseIndices,
                                 int k,
diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFUtils.cuh b/core/src/index/thirdparty/faiss/gpu/impl/IVFUtils.cuh
index eba3a1051b..3eb226568d 100644
--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFUtils.cuh
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFUtils.cuh
@@ -16,6 +16,70 @@
 // post-processing and k-selecting the results
 namespace faiss { namespace gpu {
 
+// This is warp divergence central, but this is really a final step
+// and happening a small number of times
+inline __device__ int binarySearchForBucket(int* prefixSumOffsets,
+                                            int size,
+                                            int val) {
+  int start = 0;
+  int end = size;
+
+  while (end - start > 0) {
+    int mid = start + (end - start) / 2;
+
+    int midVal = prefixSumOffsets[mid];
+
+    // Find the first bucket that we are <=
+    if (midVal <= val) {
+      start = mid + 1;
+    } else {
+      end = mid;
+    }
+  }
+
+  // We must find the bucket that it is in
+  assert(start != size);
+
+  return start;
+}
+
+inline __device__ long
+getListIndex(int queryId,
+             int offset,
+             void** listIndices,
+             Tensor<int, 2, true>& prefixSumOffsets,
+             Tensor<int, 2, true>& topQueryToCentroid,
+             IndicesOptions opt) {
+  long index = -1;
+
+  // In order to determine the actual user index, we need to first
+  // determine what list it was in.
+  // We do this by binary search in the prefix sum list.
+  int probe = binarySearchForBucket(prefixSumOffsets[queryId].data(),
+                                    prefixSumOffsets.getSize(1),
+                                    offset);
+
+  // This is then the probe for the query; we can find the actual
+  // list ID from this
+  int listId = topQueryToCentroid[queryId][probe];
+
+  // Now, we need to know the offset within the list
+  // We ensure that before the array (at offset -1), there is a 0 value
+  int listStart = *(prefixSumOffsets[queryId][probe].data() - 1);
+  int listOffset = offset - listStart;
+
+  // This gives us our final index
+  if (opt == INDICES_32_BIT) {
+    index = (long) ((int*) listIndices[listId])[listOffset];
+  } else if (opt == INDICES_64_BIT) {
+    index = ((long*) listIndices[listId])[listOffset];
+  } else {
+    index = ((long) listId << 32 | (long) listOffset);
+  }
+
+  return index;
+}
+
 /// Function for multi-pass scanning that collects the length of
 /// intermediate results for all (query, probe) pair
 void runCalcListOffsets(Tensor<int, 2, true>& topQueryToCentroid,
@@ -25,7 +89,11 @@ void runCalcListOffsets(Tensor<int, 2, true>& topQueryToCentroid,
                         cudaStream_t stream);
 
 /// Performs a first pass of k-selection on the results
-void runPass1SelectLists(Tensor<int, 2, true>& prefixSumOffsets,
+void runPass1SelectLists(thrust::device_vector<void*>& listIndices,
+                         IndicesOptions indicesOptions,
+                         Tensor<int, 2, true>& prefixSumOffsets,
+                         Tensor<int, 2, true>& topQueryToCentroid,
+                         Tensor<uint8_t, 1, true>& bitset,
                          Tensor<float, 1, true>& distance,
                          int nprobe,
                          int k,
diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect1.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect1.cu
index 63c563c8fd..bbb32ad1d6 100644
--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect1.cu
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect1.cu
@@ -23,10 +23,14 @@ namespace faiss { namespace gpu {
 
 template <int ThreadsPerBlock, int NumWarpQ, int NumThreadQ, bool Dir>
 __global__ void
-pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
+pass1SelectLists(void** listIndices,
+                 Tensor<int, 2, true> prefixSumOffsets,
+                 Tensor<int, 2, true> topQueryToCentroid,
+                 Tensor<uint8_t, 1, true> bitset,
                  Tensor<float, 1, true> distance,
                  int nprobe,
                  int k,
+                 IndicesOptions opt,
                  Tensor<float, 3, true> heapDistances,
                  Tensor<int, 3, true> heapIndices) {
   constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
@@ -58,16 +62,38 @@ pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
   int i = threadIdx.x;
   auto distanceStart = distance[start].data();
 
+  bool bitsetEmpty = (bitset.getSize(0) == 0);
+  long index = -1;
   // BlockSelect add cannot be used in a warp divergent circumstance; we
   // handle the remainder warp below
   for (; i < limit; i += blockDim.x) {
-    heap.add(distanceStart[i], start + i);
+    index = getListIndex(queryId,
+                         start + i,
+                         listIndices,
+                         prefixSumOffsets,
+                         topQueryToCentroid,
+                         opt);
+    if (bitsetEmpty || (!(bitset[index >> 3] & (0x1 << (index & 0x7))))) {
+      heap.add(distanceStart[i], start + i);
+    } else {
+      heap.add((1.0 / 0.0), start + i);
+    }
   }
 
   // Handle warp divergence separately
   if (i < num) {
-    heap.addThreadQ(distanceStart[i], start + i);
+    index = getListIndex(queryId,
+                         start + i,
+                         listIndices,
+                         prefixSumOffsets,
+                         topQueryToCentroid,
+                         opt);
+    if (bitsetEmpty || (!(bitset[index >> 3] & (0x1 << (index & 0x7))))) {
+      heap.addThreadQ(distanceStart[i], start + i);
+    } else {
+      heap.addThreadQ((1.0 / 0.0), start + i);
+    }
   }
 
   // Merge all final results
@@ -82,7 +108,11 @@ pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
 }
 
 void
-runPass1SelectLists(Tensor<int, 2, true>& prefixSumOffsets,
+runPass1SelectLists(thrust::device_vector<void*>& listIndices,
+                    IndicesOptions indicesOptions,
+                    Tensor<int, 2, true>& prefixSumOffsets,
+                    Tensor<int, 2, true>& topQueryToCentroid,
+                    Tensor<uint8_t, 1, true>& bitset,
                     Tensor<float, 1, true>& distance,
                     int nprobe,
                     int k,
@@ -98,10 +128,14 @@ runPass1SelectLists(Tensor<int, 2, true>& prefixSumOffsets,
 
 #define RUN_PASS(BLOCK, NUM_WARP_Q, NUM_THREAD_Q, DIR)                  \
   do {                                                                  \
     pass1SelectLists<BLOCK, NUM_WARP_Q, NUM_THREAD_Q, DIR>              \
-      <<<grid, BLOCK, 0, stream>>>(prefixSumOffsets,                    \
+      <<<grid, BLOCK, 0, stream>>>(listIndices.data().get(),            \
+                                   prefixSumOffsets,                    \
+                                   topQueryToCentroid,                  \
+                                   bitset,                              \
                                    distance,                            \
                                    nprobe,                              \
                                    k,                                   \
+                                   indicesOptions,                      \
                                    heapDistances,                       \
                                    heapIndices);                        \
     CUDA_TEST_ERROR();                                                  \
diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect2.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect2.cu
index e629dbdfe4..8c6b9eb3b8 100644
--- a/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect2.cu
+++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFUtilsSelect2.cu
@@ -23,30 +23,30 @@ namespace faiss { namespace gpu {
 
 // This is warp divergence central, but this is really a final step
 // and happening a small number of times
-inline __device__ int binarySearchForBucket(int* prefixSumOffsets,
-                                            int size,
-                                            int val) {
-  int start = 0;
-  int end = size;
-
-  while (end - start > 0) {
-    int mid = start + (end - start) / 2;
-
-    int midVal = prefixSumOffsets[mid];
-
-    // Find the first bucket that we are <=
-    if (midVal <= val) {
-      start = mid + 1;
-    } else {
-      end = mid;
-    }
-  }
-
-  // We must find the bucket that it is in
-  assert(start != size);
-
-  return start;
-}
+//inline __device__ int binarySearchForBucket(int* prefixSumOffsets,
+//                                            int size,
+//                                            int val) {
+//  int start = 0;
+//  int end = size;
+//
+//  while (end - start > 0) {
+//    int mid = start + (end - start) / 2;
+//
+//    int midVal = prefixSumOffsets[mid];
+//
+//    // Find the first bucket that we are <=
+//    if (midVal <= val) {
+//      start = mid + 1;
+//    } else {
+//      end = mid;
+//    }
+//  }
+//
+//  // We must find the bucket that it is in
+//  assert(start != size);
+//
+//  return start;
+//}
 
 template <int ThreadsPerBlock,
@@ … @@ pass2SelectLists(Tensor<float, 2, true> heapDistances,
       // calculated by the original scan.
       int offset = heapIndices[queryId][v];
 
-      // In order to determine the actual user index, we need to first
-      // determine what list it was in.
-      // We do this by binary search in the prefix sum list.
-      int probe = binarySearchForBucket(prefixSumOffsets[queryId].data(),
-                                        prefixSumOffsets.getSize(1),
-                                        offset);
-
-      // This is then the probe for the query; we can find the actual
-      // list ID from this
-      int listId = topQueryToCentroid[queryId][probe];
-
-      // Now, we need to know the offset within the list
-      // We ensure that before the array (at offset -1), there is a 0 value
-      int listStart = *(prefixSumOffsets[queryId][probe].data() - 1);
-      int listOffset = offset - listStart;
-
-      // This gives us our final index
-      if (opt == INDICES_32_BIT) {
-        index = (long) ((int*) listIndices[listId])[listOffset];
-      } else if (opt == INDICES_64_BIT) {
-        index = ((long*) listIndices[listId])[listOffset];
-      } else {
-        index = ((long) listId << 32 | (long) listOffset);
-      }
+      index = getListIndex(queryId,
+                           offset,
+                           listIndices,
+                           prefixSumOffsets,
+                           topQueryToCentroid,
+                           opt);
     }
 
     outIndices[queryId][i] = index;
diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cu b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cu
index d885d5f7ba..57030c9e34 100644
--- a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cu
+++ b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cu
@@ -224,6 +224,7 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
                  Tensor<float, 3, true>& pqCentroidsInnermostCode,
                  NoTypeTensor<4, true>& codeDistances,
                  Tensor<int, 2, true>& topQueryToCentroid,
+                 Tensor<uint8_t, 1, true>& bitset,
                  bool useFloat16Lookup,
                  int bytesPerCode,
                  int numSubQuantizers,
@@ -357,7 +358,11 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
   CUDA_TEST_ERROR();
 
   // k-select the output in chunks, to increase parallelism
-  runPass1SelectLists(prefixSumOffsets,
+  runPass1SelectLists(listIndices,
+                      indicesOptions,
+                      prefixSumOffsets,
+                      topQueryToCentroid,
+                      bitset,
                       allDistances,
                       topQueryToCentroid.getSize(1),
                       k,
@@ -387,6 +392,7 @@ void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
                                      Tensor<float, 2, true>& centroids,
                                      Tensor<float, 3, true>& pqCentroidsInnermostCode,
                                      Tensor<int, 2, true>& topQueryToCentroid,
+                                     Tensor<uint8_t, 1, true>& bitset,
                                      bool useFloat16Lookup,
                                      int bytesPerCode,
                                      int numSubQuantizers,
@@ -560,6 +566,7 @@ void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
                      pqCentroidsInnermostCode,
                      codeDistancesView,
                      coarseIndicesView,
+                     bitset,
                      useFloat16Lookup,
                      bytesPerCode,
                      numSubQuantizers,
diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh
index 3d77a0ff5c..50c017c04f 100644
--- a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh
+++ b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh
@@ -24,6 +24,7 @@ void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
                                      Tensor<float, 2, true>& centroids,
                                      Tensor<float, 3, true>& pqCentroidsInnermostCode,
                                      Tensor<int, 2, true>& topQueryToCentroid,
+                                     Tensor<uint8_t, 1, true>& bitset,
                                      bool useFloat16Lookup,
                                      int bytesPerCode,
                                      int numSubQuantizers,
diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu
index 58c2114595..583ee477dc 100644
--- a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu
+++ b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu
@@ -219,6 +219,7 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
                  NoTypeTensor<3, true>& precompTerm2,
                  NoTypeTensor<3, true>& precompTerm3,
                  Tensor<int, 2, true>& topQueryToCentroid,
+                 Tensor<uint8_t, 1, true>& bitset,
                  bool useFloat16Lookup,
                  int bytesPerCode,
                  int numSubQuantizers,
@@ -344,7 +345,11 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
   }
 
   // k-select the output in chunks, to increase parallelism
-  runPass1SelectLists(prefixSumOffsets,
+  runPass1SelectLists(listIndices,
+                      indicesOptions,
+                      prefixSumOffsets,
+                      topQueryToCentroid,
+                      bitset,
                       allDistances,
                       topQueryToCentroid.getSize(1),
                       k,
@@ -377,6 +382,7 @@ void runPQScanMultiPassPrecomputed(Tensor<float, 2, true>& queries,
                                    NoTypeTensor<3, true>& precompTerm2,
                                    NoTypeTensor<3, true>& precompTerm3,
                                    Tensor<int, 2, true>& topQueryToCentroid,
+                                   Tensor<uint8_t, 1, true>& bitset,
                                    bool useFloat16Lookup,
                                    int bytesPerCode,
                                    int numSubQuantizers,
@@ -527,6 +533,7 @@ void runPQScanMultiPassPrecomputed(Tensor<float, 2, true>& queries,
                    precompTerm2,
                    term3View,
                    coarseIndicesView,
+                   bitset,
                    useFloat16Lookup,
                    bytesPerCode,
                    numSubQuantizers,
diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cuh b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cuh
index ffe548b785..644ba7d99d 100644
--- a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cuh
+++ b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cuh
@@ -22,6 +22,7 @@ void runPQScanMultiPassPrecomputed(Tensor<float, 2, true>& queries,
                                    NoTypeTensor<3, true>& precompTerm2,
                                    NoTypeTensor<3, true>& precompTerm3,
                                    Tensor<int, 2, true>& topQueryToCentroid,
+                                   Tensor<uint8_t, 1, true>& bitset,
                                    bool useFloat16Lookup,
                                    int bytesPerCode,
                                    int numSubQuantizers,
diff --git a/core/src/index/thirdparty/faiss/makefile.inc.in b/core/src/index/thirdparty/faiss/makefile.inc.in
index 744ca6d50e..01ff95ac1c 100644
--- a/core/src/index/thirdparty/faiss/makefile.inc.in
+++ b/core/src/index/thirdparty/faiss/makefile.inc.in
@@ -22,7 +22,7 @@ NVCCFLAGS = -I $(CUDA_ROOT)/targets/x86_64-linux/include/ \
 -Xcudafe --diag_suppress=unrecognized_attribute \
 $(CUDA_ARCH) \
 -lineinfo \
--ccbin $(CXX) -DFAISS_USE_FLOAT16
+-ccbin $(CXX)
 
 OS = $(shell uname -s)
diff --git a/core/src/index/unittest/test_ivf.cpp b/core/src/index/unittest/test_ivf.cpp
index a57b66ec1b..b549fe17fb 100644
--- a/core/src/index/unittest/test_ivf.cpp
+++ b/core/src/index/unittest/test_ivf.cpp
@@ -90,9 +90,13 @@ INSTANTIATE_TEST_CASE_P(
     std::make_tuple(milvus::knowhere::IndexEnum::INDEX_FAISS_IVFPQ, milvus::knowhere::IndexMode::MODE_CPU),
     std::make_tuple(milvus::knowhere::IndexEnum::INDEX_FAISS_IVFSQ8, milvus::knowhere::IndexMode::MODE_CPU)));
 
-TEST_P(IVFTest, ivf_basic) {
+TEST_P(IVFTest, ivf_basic_cpu) {
     assert(!xb.empty());
 
+    if (index_mode_ != milvus::knowhere::IndexMode::MODE_CPU) {
+        return;
+    }
+
     // null faiss index
     ASSERT_ANY_THROW(index_->Add(base_dataset, conf_));
     ASSERT_ANY_THROW(index_->AddWithoutIds(base_dataset, conf_));
@@ -103,11 +107,10 @@ TEST_P(IVFTest, ivf_basic) {
     EXPECT_EQ(index_->Dim(), dim);
 
     auto result = index_->Query(query_dataset, conf_);
-    AssertAnns(result, nq, conf_[milvus::knowhere::meta::TOPK]);
+    AssertAnns(result, nq, k);
     // PrintResult(result, nq, k);
 
-    if (index_mode_ == milvus::knowhere::IndexMode::MODE_CPU &&
-        index_type_ != milvus::knowhere::IndexEnum::INDEX_FAISS_IVFPQ) {
+    if (index_type_ != milvus::knowhere::IndexEnum::INDEX_FAISS_IVFPQ) {
         auto result2 = index_->QueryById(id_dataset, conf_);
         AssertAnns(result2, nq, k);
@@ -143,6 +146,43 @@ TEST_P(IVFTest, ivf_basic) {
 #endif
 }
 
+TEST_P(IVFTest, ivf_basic_gpu) {
+    assert(!xb.empty());
+
+    if (index_mode_ != milvus::knowhere::IndexMode::MODE_GPU) {
+        return;
+    }
+
+    // null faiss index
+    ASSERT_ANY_THROW(index_->Add(base_dataset, conf_));
+    ASSERT_ANY_THROW(index_->AddWithoutIds(base_dataset, conf_));
+
+    index_->Train(base_dataset, conf_);
+    index_->Add(base_dataset, conf_);
+    EXPECT_EQ(index_->Count(), nb);
+    EXPECT_EQ(index_->Dim(), dim);
+
+    auto result = index_->Query(query_dataset, conf_);
+    AssertAnns(result, nq, k);
+    // PrintResult(result, nq, k);
+
+    if (index_type_ != milvus::knowhere::IndexEnum::INDEX_FAISS_IVFSQ8H) {
+        faiss::ConcurrentBitsetPtr concurrent_bitset_ptr = std::make_shared<faiss::ConcurrentBitset>(nb);
+        for (int64_t i = 0; i < nq; ++i) {
+            concurrent_bitset_ptr->set(i);
+        }
+        index_->SetBlacklist(concurrent_bitset_ptr);
+
+        auto result_bs_1 = index_->Query(query_dataset, conf_);
+        AssertAnns(result_bs_1, nq, k, CheckMode::CHECK_NOT_EQUAL);
+        // PrintResult(result, nq, k);
+    }
+
+#ifdef MILVUS_GPU_VERSION
+    milvus::knowhere::FaissGpuResourceMgr::GetInstance().Dump();
+#endif
+}
+
 TEST_P(IVFTest, ivf_serialize) {
     fiu_init(0);
     auto serialize = [](const std::string& filename, milvus::knowhere::BinaryPtr& bin, uint8_t* ret) {
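
Reviewer note (not part of the patch): the filtering the patch wires through the GPU kernels comes down to the byte-packed bit test in pass1SelectLists, bitset[index >> 3] & (0x1 << (index & 0x7)), where a set bit marks a deleted vector and the hit is replaced with +infinity before k-selection. Below is a minimal host-side C++ sketch of that layout for reference only; the helper names are hypothetical and not part of the Milvus/Faiss API.

// sketch_bitset_layout.cpp -- illustrative only, assumes the same
// one-bit-per-vector, eight-vectors-per-byte layout used by the kernel.
#include <cassert>
#include <cstdint>
#include <vector>

// Mark vector `id` as deleted: set bit (id % 8) of byte (id / 8).
static void markDeleted(std::vector<uint8_t>& bitset, long id) {
    bitset[id >> 3] |= static_cast<uint8_t>(0x1 << (id & 0x7));
}

// Same expression the GPU kernel evaluates before deciding whether to add
// the real distance or +infinity to the selection heap.
static bool isDeleted(const std::vector<uint8_t>& bitset, long id) {
    return bitset[id >> 3] & (0x1 << (id & 0x7));
}

int main() {
    std::vector<uint8_t> bitset((100 + 7) / 8, 0);  // 100 vectors -> 13 bytes, all live
    markDeleted(bitset, 42);
    assert(isDeleted(bitset, 42));
    assert(!isDeleted(bitset, 43));
    return 0;
}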