mirror of https://github.com/milvus-io/milvus.git
Caiyd 1655 gpu ivfflat delete (#1809)
* support GPUIVF delete Signed-off-by: yudong.cai <yudong.cai@zilliz.com> * calc bitset offset correctly Signed-off-by: yudong.cai <yudong.cai@zilliz.com> * support GPUSQ8 delete Signed-off-by: yudong.cai <yudong.cai@zilliz.com> * support GPUIVFPQ delete Signed-off-by: yudong.cai <yudong.cai@zilliz.com> * update unittest for GPU delete Signed-off-by: yudong.cai <yudong.cai@zilliz.com> * delete FAISS compile option -DFAISS_USE_FLOAT16 Signed-off-by: yudong.cai <yudong.cai@zilliz.com> * code opt Signed-off-by: yudong.cai <yudong.cai@zilliz.com> * update changelog Signed-off-by: yudong.cai <yudong.cai@zilliz.com>pull/1822/head
parent
c091929298
commit
531fca8869
|
@ -8,6 +8,7 @@ Please mark all change in change log and use the issue from GitHub
|
|||
- \#1762 Server is not forbidden to create new partition which tag is "_default"
|
||||
|
||||
## Feature
|
||||
- \#1655 GPU index support delete vectors
|
||||
|
||||
## Improvement
|
||||
|
||||
|
|
|
@ -145,7 +145,7 @@ GPUIVF::QueryImpl(int64_t n, const float* data, int64_t k, float* distances, int
|
|||
if (device_index) {
|
||||
device_index->nprobe = config[IndexParams::nprobe];
|
||||
ResScope rs(res_, gpu_id_);
|
||||
device_index->search(n, (float*)data, k, distances, labels);
|
||||
device_index->search(n, (float*)data, k, distances, labels, bitset_);
|
||||
} else {
|
||||
KNOWHERE_THROW_MSG("Not a GpuIndexIVF type.");
|
||||
}
|
||||
|
|
|
@ -366,9 +366,15 @@ GpuIndexIVFPQ::searchImpl_(int n,
|
|||
static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch");
|
||||
Tensor<long, 2, true> outLabels(const_cast<long*>(labels), {n, k});
|
||||
|
||||
auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
|
||||
|
||||
index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
|
||||
if (!bitset) {
|
||||
auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
|
||||
index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
|
||||
} else {
|
||||
auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_,
|
||||
const_cast<uint8_t*>(bitset->data()), stream,
|
||||
{(int) bitset->size()});
|
||||
index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
|
|
|
@ -275,9 +275,15 @@ GpuIndexIVFScalarQuantizer::searchImpl_(int n,
|
|||
static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch");
|
||||
Tensor<long, 2, true> outLabels(const_cast<long*>(labels), {n, k});
|
||||
|
||||
auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
|
||||
|
||||
index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
|
||||
if (!bitset) {
|
||||
auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
|
||||
index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
|
||||
} else {
|
||||
auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_,
|
||||
const_cast<uint8_t*>(bitset->data()), stream,
|
||||
{(int) bitset->size()});
|
||||
index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
|
||||
}
|
||||
}
|
||||
|
||||
} } // namespace
|
||||
|
|
|
@ -164,8 +164,6 @@ IVFFlat::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
|
|||
auto& mem = resources_->getMemoryManagerCurrentDevice();
|
||||
auto stream = resources_->getDefaultStreamCurrentDevice();
|
||||
|
||||
DeviceTensor<uint8_t, 1, true> bitset(mem, {0}, stream);
|
||||
|
||||
// Number of valid vectors that we actually add; we return this
|
||||
int numAdded = 0;
|
||||
|
||||
|
@ -176,6 +174,8 @@ IVFFlat::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
|
|||
listIds2d(mem, {vecs.getSize(0), 1}, stream);
|
||||
auto listIds = listIds2d.view<1>({vecs.getSize(0)});
|
||||
|
||||
/* pseudo bitset */
|
||||
DeviceTensor<uint8_t, 1, true> bitset(mem, {0}, stream);
|
||||
quantizer_->query(vecs, bitset, 1, listDistance2d, listIds2d, false);
|
||||
|
||||
// Calculate residuals for these vectors, if needed
|
||||
|
@ -354,8 +354,9 @@ IVFFlat::query(Tensor<float, 2, true>& queries,
|
|||
|
||||
// Find the `nprobe` closest lists; we can use int indices both
|
||||
// internally and externally
|
||||
DeviceTensor<uint8_t, 1, true> coarseBitset(mem, {0}, stream);
|
||||
quantizer_->query(queries,
|
||||
bitset,
|
||||
coarseBitset,
|
||||
nprobe,
|
||||
coarseDistances,
|
||||
coarseIndices,
|
||||
|
@ -371,6 +372,7 @@ IVFFlat::query(Tensor<float, 2, true>& queries,
|
|||
|
||||
runIVFFlatScan(queries,
|
||||
coarseIndices,
|
||||
bitset,
|
||||
deviceListDataPointers_,
|
||||
deviceListIndexPointers_,
|
||||
indicesOptions_,
|
||||
|
|
|
@ -160,6 +160,7 @@ ivfFlatScan(Tensor<float, 2, true> queries,
|
|||
void
|
||||
runIVFFlatScanTile(Tensor<float, 2, true>& queries,
|
||||
Tensor<int, 2, true>& listIds,
|
||||
Tensor<uint8_t, 1, true>& bitset,
|
||||
thrust::device_vector<void*>& listData,
|
||||
thrust::device_vector<void*>& listIndices,
|
||||
IndicesOptions indicesOptions,
|
||||
|
@ -315,7 +316,11 @@ runIVFFlatScanTile(Tensor<float, 2, true>& queries,
|
|||
#undef RUN_IVF_FLAT
|
||||
|
||||
// k-select the output in chunks, to increase parallelism
|
||||
runPass1SelectLists(prefixSumOffsets,
|
||||
runPass1SelectLists(listIndices,
|
||||
indicesOptions,
|
||||
prefixSumOffsets,
|
||||
listIds,
|
||||
bitset,
|
||||
allDistances,
|
||||
listIds.getSize(1),
|
||||
k,
|
||||
|
@ -344,6 +349,7 @@ runIVFFlatScanTile(Tensor<float, 2, true>& queries,
|
|||
void
|
||||
runIVFFlatScan(Tensor<float, 2, true>& queries,
|
||||
Tensor<int, 2, true>& listIds,
|
||||
Tensor<uint8_t, 1, true>& bitset,
|
||||
thrust::device_vector<void*>& listData,
|
||||
thrust::device_vector<void*>& listIndices,
|
||||
IndicesOptions indicesOptions,
|
||||
|
@ -489,6 +495,7 @@ runIVFFlatScan(Tensor<float, 2, true>& queries,
|
|||
|
||||
runIVFFlatScanTile(queryView,
|
||||
listIdsView,
|
||||
bitset,
|
||||
listData,
|
||||
listIndices,
|
||||
indicesOptions,
|
||||
|
|
|
@ -20,6 +20,7 @@ class GpuResources;
|
|||
|
||||
void runIVFFlatScan(Tensor<float, 2, true>& queries,
|
||||
Tensor<int, 2, true>& listIds,
|
||||
Tensor<uint8_t, 1, true>& bitset,
|
||||
thrust::device_vector<void*>& listData,
|
||||
thrust::device_vector<void*>& listIndices,
|
||||
IndicesOptions indicesOptions,
|
||||
|
|
|
@ -119,8 +119,6 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
|
|||
auto& mem = resources_->getMemoryManagerCurrentDevice();
|
||||
auto stream = resources_->getDefaultStreamCurrentDevice();
|
||||
|
||||
DeviceTensor<uint8_t, 1, true> bitset(mem, {0}, stream);
|
||||
|
||||
// Number of valid vectors that we actually add; we return this
|
||||
int numAdded = 0;
|
||||
|
||||
|
@ -130,6 +128,8 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
|
|||
DeviceTensor<int, 2, true> listIds2d(mem, {vecs.getSize(0), 1}, stream);
|
||||
auto listIds = listIds2d.view<1>({vecs.getSize(0)});
|
||||
|
||||
/* pseudo bitset */
|
||||
DeviceTensor<uint8_t, 1, true> bitset(mem, {0}, stream);
|
||||
quantizer_->query(vecs, bitset, 1, listDistance, listIds2d, false);
|
||||
|
||||
// Copy the lists that we wish to append to back to the CPU
|
||||
|
@ -532,10 +532,11 @@ IVFPQ::query(Tensor<float, 2, true>& queries,
|
|||
DeviceTensor<int, 2, true>
|
||||
coarseIndices(mem, {queries.getSize(0), nprobe}, stream);
|
||||
|
||||
DeviceTensor<uint8_t, 1, true> coarseBitset(mem, {0}, stream);
|
||||
// Find the `nprobe` closest coarse centroids; we can use int
|
||||
// indices both internally and externally
|
||||
quantizer_->query(queries,
|
||||
bitset,
|
||||
coarseBitset,
|
||||
nprobe,
|
||||
coarseDistances,
|
||||
coarseIndices,
|
||||
|
@ -543,6 +544,7 @@ IVFPQ::query(Tensor<float, 2, true>& queries,
|
|||
|
||||
if (precomputedCodes_) {
|
||||
runPQPrecomputedCodes_(queries,
|
||||
bitset,
|
||||
coarseDistances,
|
||||
coarseIndices,
|
||||
k,
|
||||
|
@ -550,6 +552,7 @@ IVFPQ::query(Tensor<float, 2, true>& queries,
|
|||
outIndices);
|
||||
} else {
|
||||
runPQNoPrecomputedCodes_(queries,
|
||||
bitset,
|
||||
coarseDistances,
|
||||
coarseIndices,
|
||||
k,
|
||||
|
@ -592,6 +595,7 @@ IVFPQ::getPQCentroids() {
|
|||
void
|
||||
IVFPQ::runPQPrecomputedCodes_(
|
||||
Tensor<float, 2, true>& queries,
|
||||
Tensor<uint8_t, 1, true>& bitset,
|
||||
DeviceTensor<float, 2, true>& coarseDistances,
|
||||
DeviceTensor<int, 2, true>& coarseIndices,
|
||||
int k,
|
||||
|
@ -655,6 +659,7 @@ IVFPQ::runPQPrecomputedCodes_(
|
|||
term2, // term 2
|
||||
term3, // term 3
|
||||
coarseIndices,
|
||||
bitset,
|
||||
useFloat16LookupTables_,
|
||||
bytesPerVector_,
|
||||
numSubQuantizers_,
|
||||
|
@ -673,6 +678,7 @@ IVFPQ::runPQPrecomputedCodes_(
|
|||
void
|
||||
IVFPQ::runPQNoPrecomputedCodes_(
|
||||
Tensor<float, 2, true>& queries,
|
||||
Tensor<uint8_t, 1, true>& bitset,
|
||||
DeviceTensor<float, 2, true>& coarseDistances,
|
||||
DeviceTensor<int, 2, true>& coarseIndices,
|
||||
int k,
|
||||
|
@ -685,6 +691,7 @@ IVFPQ::runPQNoPrecomputedCodes_(
|
|||
coarseCentroids,
|
||||
pqCentroidsInnermostCode_,
|
||||
coarseIndices,
|
||||
bitset,
|
||||
useFloat16LookupTables_,
|
||||
bytesPerVector_,
|
||||
numSubQuantizers_,
|
||||
|
|
|
@ -83,6 +83,7 @@ class IVFPQ : public IVFBase {
|
|||
|
||||
/// Runs kernels for scanning inverted lists with precomputed codes
|
||||
void runPQPrecomputedCodes_(Tensor<float, 2, true>& queries,
|
||||
Tensor<uint8_t, 1, true>& bitset,
|
||||
DeviceTensor<float, 2, true>& coarseDistances,
|
||||
DeviceTensor<int, 2, true>& coarseIndices,
|
||||
int k,
|
||||
|
@ -91,6 +92,7 @@ class IVFPQ : public IVFBase {
|
|||
|
||||
/// Runs kernels for scanning inverted lists without precomputed codes
|
||||
void runPQNoPrecomputedCodes_(Tensor<float, 2, true>& queries,
|
||||
Tensor<uint8_t, 1, true>& bitset,
|
||||
DeviceTensor<float, 2, true>& coarseDistances,
|
||||
DeviceTensor<int, 2, true>& coarseIndices,
|
||||
int k,
|
||||
|
|
|
@ -16,6 +16,70 @@
|
|||
// post-processing and k-selecting the results
|
||||
namespace faiss { namespace gpu {
|
||||
|
||||
// This is warp divergence central, but this is really a final step
|
||||
// and happening a small number of times
|
||||
inline __device__ int binarySearchForBucket(int* prefixSumOffsets,
|
||||
int size,
|
||||
int val) {
|
||||
int start = 0;
|
||||
int end = size;
|
||||
|
||||
while (end - start > 0) {
|
||||
int mid = start + (end - start) / 2;
|
||||
|
||||
int midVal = prefixSumOffsets[mid];
|
||||
|
||||
// Find the first bucket that we are <=
|
||||
if (midVal <= val) {
|
||||
start = mid + 1;
|
||||
} else {
|
||||
end = mid;
|
||||
}
|
||||
}
|
||||
|
||||
// We must find the bucket that it is in
|
||||
assert(start != size);
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
inline __device__ long
|
||||
getListIndex(int queryId,
|
||||
int offset,
|
||||
void** listIndices,
|
||||
Tensor<int, 2, true>& prefixSumOffsets,
|
||||
Tensor<int, 2, true>& topQueryToCentroid,
|
||||
IndicesOptions opt) {
|
||||
long index = -1;
|
||||
|
||||
// In order to determine the actual user index, we need to first
|
||||
// determine what list it was in.
|
||||
// We do this by binary search in the prefix sum list.
|
||||
int probe = binarySearchForBucket(prefixSumOffsets[queryId].data(),
|
||||
prefixSumOffsets.getSize(1),
|
||||
offset);
|
||||
|
||||
// This is then the probe for the query; we can find the actual
|
||||
// list ID from this
|
||||
int listId = topQueryToCentroid[queryId][probe];
|
||||
|
||||
// Now, we need to know the offset within the list
|
||||
// We ensure that before the array (at offset -1), there is a 0 value
|
||||
int listStart = *(prefixSumOffsets[queryId][probe].data() - 1);
|
||||
int listOffset = offset - listStart;
|
||||
|
||||
// This gives us our final index
|
||||
if (opt == INDICES_32_BIT) {
|
||||
index = (long) ((int*) listIndices[listId])[listOffset];
|
||||
} else if (opt == INDICES_64_BIT) {
|
||||
index = ((long*) listIndices[listId])[listOffset];
|
||||
} else {
|
||||
index = ((long) listId << 32 | (long) listOffset);
|
||||
}
|
||||
|
||||
return index;
|
||||
}
|
||||
|
||||
/// Function for multi-pass scanning that collects the length of
|
||||
/// intermediate results for all (query, probe) pair
|
||||
void runCalcListOffsets(Tensor<int, 2, true>& topQueryToCentroid,
|
||||
|
@ -25,7 +89,11 @@ void runCalcListOffsets(Tensor<int, 2, true>& topQueryToCentroid,
|
|||
cudaStream_t stream);
|
||||
|
||||
/// Performs a first pass of k-selection on the results
|
||||
void runPass1SelectLists(Tensor<int, 2, true>& prefixSumOffsets,
|
||||
void runPass1SelectLists(thrust::device_vector<void*>& listIndices,
|
||||
IndicesOptions indicesOptions,
|
||||
Tensor<int, 2, true>& prefixSumOffsets,
|
||||
Tensor<int, 2, true>& topQueryToCentroid,
|
||||
Tensor<uint8_t, 1, true>& bitset,
|
||||
Tensor<float, 1, true>& distance,
|
||||
int nprobe,
|
||||
int k,
|
||||
|
|
|
@ -23,10 +23,14 @@ namespace faiss { namespace gpu {
|
|||
|
||||
template <int ThreadsPerBlock, int NumWarpQ, int NumThreadQ, bool Dir>
|
||||
__global__ void
|
||||
pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
|
||||
pass1SelectLists(void** listIndices,
|
||||
Tensor<int, 2, true> prefixSumOffsets,
|
||||
Tensor<int, 2, true> topQueryToCentroid,
|
||||
Tensor<uint8_t, 1, true> bitset,
|
||||
Tensor<float, 1, true> distance,
|
||||
int nprobe,
|
||||
int k,
|
||||
IndicesOptions opt,
|
||||
Tensor<float, 3, true> heapDistances,
|
||||
Tensor<int, 3, true> heapIndices) {
|
||||
constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
|
||||
|
@ -58,16 +62,38 @@ pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
|
|||
|
||||
int i = threadIdx.x;
|
||||
auto distanceStart = distance[start].data();
|
||||
bool bitsetEmpty = (bitset.getSize(0) == 0);
|
||||
long index = -1;
|
||||
|
||||
// BlockSelect add cannot be used in a warp divergent circumstance; we
|
||||
// handle the remainder warp below
|
||||
for (; i < limit; i += blockDim.x) {
|
||||
heap.add(distanceStart[i], start + i);
|
||||
index = getListIndex(queryId,
|
||||
start + i,
|
||||
listIndices,
|
||||
prefixSumOffsets,
|
||||
topQueryToCentroid,
|
||||
opt);
|
||||
if (bitsetEmpty || (!(bitset[index >> 3] & (0x1 << (index & 0x7))))) {
|
||||
heap.add(distanceStart[i], start + i);
|
||||
} else {
|
||||
heap.add((1.0 / 0.0), start + i);
|
||||
}
|
||||
}
|
||||
|
||||
// Handle warp divergence separately
|
||||
if (i < num) {
|
||||
heap.addThreadQ(distanceStart[i], start + i);
|
||||
index = getListIndex(queryId,
|
||||
start + i,
|
||||
listIndices,
|
||||
prefixSumOffsets,
|
||||
topQueryToCentroid,
|
||||
opt);
|
||||
if (bitsetEmpty || (!(bitset[index >> 3] & (0x1 << (index & 0x7))))) {
|
||||
heap.addThreadQ(distanceStart[i], start + i);
|
||||
} else {
|
||||
heap.addThreadQ((1.0 / 0.0), start + i);
|
||||
}
|
||||
}
|
||||
|
||||
// Merge all final results
|
||||
|
@ -82,7 +108,11 @@ pass1SelectLists(Tensor<int, 2, true> prefixSumOffsets,
|
|||
}
|
||||
|
||||
void
|
||||
runPass1SelectLists(Tensor<int, 2, true>& prefixSumOffsets,
|
||||
runPass1SelectLists(thrust::device_vector<void*>& listIndices,
|
||||
IndicesOptions indicesOptions,
|
||||
Tensor<int, 2, true>& prefixSumOffsets,
|
||||
Tensor<int, 2, true>& topQueryToCentroid,
|
||||
Tensor<uint8_t, 1, true>& bitset,
|
||||
Tensor<float, 1, true>& distance,
|
||||
int nprobe,
|
||||
int k,
|
||||
|
@ -98,10 +128,14 @@ runPass1SelectLists(Tensor<int, 2, true>& prefixSumOffsets,
|
|||
#define RUN_PASS(BLOCK, NUM_WARP_Q, NUM_THREAD_Q, DIR) \
|
||||
do { \
|
||||
pass1SelectLists<BLOCK, NUM_WARP_Q, NUM_THREAD_Q, DIR> \
|
||||
<<<grid, BLOCK, 0, stream>>>(prefixSumOffsets, \
|
||||
<<<grid, BLOCK, 0, stream>>>(listIndices.data().get(), \
|
||||
prefixSumOffsets, \
|
||||
topQueryToCentroid, \
|
||||
bitset, \
|
||||
distance, \
|
||||
nprobe, \
|
||||
k, \
|
||||
indicesOptions, \
|
||||
heapDistances, \
|
||||
heapIndices); \
|
||||
CUDA_TEST_ERROR(); \
|
||||
|
|
|
@ -23,30 +23,30 @@ namespace faiss { namespace gpu {
|
|||
|
||||
// This is warp divergence central, but this is really a final step
|
||||
// and happening a small number of times
|
||||
inline __device__ int binarySearchForBucket(int* prefixSumOffsets,
|
||||
int size,
|
||||
int val) {
|
||||
int start = 0;
|
||||
int end = size;
|
||||
|
||||
while (end - start > 0) {
|
||||
int mid = start + (end - start) / 2;
|
||||
|
||||
int midVal = prefixSumOffsets[mid];
|
||||
|
||||
// Find the first bucket that we are <=
|
||||
if (midVal <= val) {
|
||||
start = mid + 1;
|
||||
} else {
|
||||
end = mid;
|
||||
}
|
||||
}
|
||||
|
||||
// We must find the bucket that it is in
|
||||
assert(start != size);
|
||||
|
||||
return start;
|
||||
}
|
||||
//inline __device__ int binarySearchForBucket(int* prefixSumOffsets,
|
||||
// int size,
|
||||
// int val) {
|
||||
// int start = 0;
|
||||
// int end = size;
|
||||
//
|
||||
// while (end - start > 0) {
|
||||
// int mid = start + (end - start) / 2;
|
||||
//
|
||||
// int midVal = prefixSumOffsets[mid];
|
||||
//
|
||||
// // Find the first bucket that we are <=
|
||||
// if (midVal <= val) {
|
||||
// start = mid + 1;
|
||||
// } else {
|
||||
// end = mid;
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// // We must find the bucket that it is in
|
||||
// assert(start != size);
|
||||
//
|
||||
// return start;
|
||||
//}
|
||||
|
||||
template <int ThreadsPerBlock,
|
||||
int NumWarpQ,
|
||||
|
@ -113,30 +113,12 @@ pass2SelectLists(Tensor<float, 2, true> heapDistances,
|
|||
// calculated by the original scan.
|
||||
int offset = heapIndices[queryId][v];
|
||||
|
||||
// In order to determine the actual user index, we need to first
|
||||
// determine what list it was in.
|
||||
// We do this by binary search in the prefix sum list.
|
||||
int probe = binarySearchForBucket(prefixSumOffsets[queryId].data(),
|
||||
prefixSumOffsets.getSize(1),
|
||||
offset);
|
||||
|
||||
// This is then the probe for the query; we can find the actual
|
||||
// list ID from this
|
||||
int listId = topQueryToCentroid[queryId][probe];
|
||||
|
||||
// Now, we need to know the offset within the list
|
||||
// We ensure that before the array (at offset -1), there is a 0 value
|
||||
int listStart = *(prefixSumOffsets[queryId][probe].data() - 1);
|
||||
int listOffset = offset - listStart;
|
||||
|
||||
// This gives us our final index
|
||||
if (opt == INDICES_32_BIT) {
|
||||
index = (long) ((int*) listIndices[listId])[listOffset];
|
||||
} else if (opt == INDICES_64_BIT) {
|
||||
index = ((long*) listIndices[listId])[listOffset];
|
||||
} else {
|
||||
index = ((long) listId << 32 | (long) listOffset);
|
||||
}
|
||||
index = getListIndex(queryId,
|
||||
offset,
|
||||
listIndices,
|
||||
prefixSumOffsets,
|
||||
topQueryToCentroid,
|
||||
opt);
|
||||
}
|
||||
|
||||
outIndices[queryId][i] = index;
|
||||
|
|
|
@ -224,6 +224,7 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
|
|||
Tensor<float, 3, true>& pqCentroidsInnermostCode,
|
||||
NoTypeTensor<4, true>& codeDistances,
|
||||
Tensor<int, 2, true>& topQueryToCentroid,
|
||||
Tensor<uint8_t, 1, true>& bitset,
|
||||
bool useFloat16Lookup,
|
||||
int bytesPerCode,
|
||||
int numSubQuantizers,
|
||||
|
@ -357,7 +358,11 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
|
|||
CUDA_TEST_ERROR();
|
||||
|
||||
// k-select the output in chunks, to increase parallelism
|
||||
runPass1SelectLists(prefixSumOffsets,
|
||||
runPass1SelectLists(listIndices,
|
||||
indicesOptions,
|
||||
prefixSumOffsets,
|
||||
topQueryToCentroid,
|
||||
bitset,
|
||||
allDistances,
|
||||
topQueryToCentroid.getSize(1),
|
||||
k,
|
||||
|
@ -387,6 +392,7 @@ void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
|
|||
Tensor<float, 2, true>& centroids,
|
||||
Tensor<float, 3, true>& pqCentroidsInnermostCode,
|
||||
Tensor<int, 2, true>& topQueryToCentroid,
|
||||
Tensor<uint8_t, 1, true>& bitset,
|
||||
bool useFloat16Lookup,
|
||||
int bytesPerCode,
|
||||
int numSubQuantizers,
|
||||
|
@ -560,6 +566,7 @@ void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
|
|||
pqCentroidsInnermostCode,
|
||||
codeDistancesView,
|
||||
coarseIndicesView,
|
||||
bitset,
|
||||
useFloat16Lookup,
|
||||
bytesPerCode,
|
||||
numSubQuantizers,
|
||||
|
|
|
@ -24,6 +24,7 @@ void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
|
|||
Tensor<float, 2, true>& centroids,
|
||||
Tensor<float, 3, true>& pqCentroidsInnermostCode,
|
||||
Tensor<int, 2, true>& topQueryToCentroid,
|
||||
Tensor<uint8_t, 1, true>& bitset,
|
||||
bool useFloat16Lookup,
|
||||
int bytesPerCode,
|
||||
int numSubQuantizers,
|
||||
|
|
|
@ -219,6 +219,7 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
|
|||
NoTypeTensor<3, true>& precompTerm2,
|
||||
NoTypeTensor<3, true>& precompTerm3,
|
||||
Tensor<int, 2, true>& topQueryToCentroid,
|
||||
Tensor<uint8_t, 1, true>& bitset,
|
||||
bool useFloat16Lookup,
|
||||
int bytesPerCode,
|
||||
int numSubQuantizers,
|
||||
|
@ -344,7 +345,11 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
|
|||
}
|
||||
|
||||
// k-select the output in chunks, to increase parallelism
|
||||
runPass1SelectLists(prefixSumOffsets,
|
||||
runPass1SelectLists(listIndices,
|
||||
indicesOptions,
|
||||
prefixSumOffsets,
|
||||
topQueryToCentroid,
|
||||
bitset,
|
||||
allDistances,
|
||||
topQueryToCentroid.getSize(1),
|
||||
k,
|
||||
|
@ -377,6 +382,7 @@ void runPQScanMultiPassPrecomputed(Tensor<float, 2, true>& queries,
|
|||
NoTypeTensor<3, true>& precompTerm2,
|
||||
NoTypeTensor<3, true>& precompTerm3,
|
||||
Tensor<int, 2, true>& topQueryToCentroid,
|
||||
Tensor<uint8_t, 1, true>& bitset,
|
||||
bool useFloat16Lookup,
|
||||
int bytesPerCode,
|
||||
int numSubQuantizers,
|
||||
|
@ -527,6 +533,7 @@ void runPQScanMultiPassPrecomputed(Tensor<float, 2, true>& queries,
|
|||
precompTerm2,
|
||||
term3View,
|
||||
coarseIndicesView,
|
||||
bitset,
|
||||
useFloat16Lookup,
|
||||
bytesPerCode,
|
||||
numSubQuantizers,
|
||||
|
|
|
@ -22,6 +22,7 @@ void runPQScanMultiPassPrecomputed(Tensor<float, 2, true>& queries,
|
|||
NoTypeTensor<3, true>& precompTerm2,
|
||||
NoTypeTensor<3, true>& precompTerm3,
|
||||
Tensor<int, 2, true>& topQueryToCentroid,
|
||||
Tensor<uint8_t, 1, true>& bitset,
|
||||
bool useFloat16Lookup,
|
||||
int bytesPerCode,
|
||||
int numSubQuantizers,
|
||||
|
|
|
@ -22,7 +22,7 @@ NVCCFLAGS = -I $(CUDA_ROOT)/targets/x86_64-linux/include/ \
|
|||
-Xcudafe --diag_suppress=unrecognized_attribute \
|
||||
$(CUDA_ARCH) \
|
||||
-lineinfo \
|
||||
-ccbin $(CXX) -DFAISS_USE_FLOAT16
|
||||
-ccbin $(CXX)
|
||||
|
||||
OS = $(shell uname -s)
|
||||
|
||||
|
|
|
@ -90,9 +90,13 @@ INSTANTIATE_TEST_CASE_P(
|
|||
std::make_tuple(milvus::knowhere::IndexEnum::INDEX_FAISS_IVFPQ, milvus::knowhere::IndexMode::MODE_CPU),
|
||||
std::make_tuple(milvus::knowhere::IndexEnum::INDEX_FAISS_IVFSQ8, milvus::knowhere::IndexMode::MODE_CPU)));
|
||||
|
||||
TEST_P(IVFTest, ivf_basic) {
|
||||
TEST_P(IVFTest, ivf_basic_cpu) {
|
||||
assert(!xb.empty());
|
||||
|
||||
if (index_mode_ != milvus::knowhere::IndexMode::MODE_CPU) {
|
||||
return;
|
||||
}
|
||||
|
||||
// null faiss index
|
||||
ASSERT_ANY_THROW(index_->Add(base_dataset, conf_));
|
||||
ASSERT_ANY_THROW(index_->AddWithoutIds(base_dataset, conf_));
|
||||
|
@ -103,11 +107,10 @@ TEST_P(IVFTest, ivf_basic) {
|
|||
EXPECT_EQ(index_->Dim(), dim);
|
||||
|
||||
auto result = index_->Query(query_dataset, conf_);
|
||||
AssertAnns(result, nq, conf_[milvus::knowhere::meta::TOPK]);
|
||||
AssertAnns(result, nq, k);
|
||||
// PrintResult(result, nq, k);
|
||||
|
||||
if (index_mode_ == milvus::knowhere::IndexMode::MODE_CPU &&
|
||||
index_type_ != milvus::knowhere::IndexEnum::INDEX_FAISS_IVFPQ) {
|
||||
if (index_type_ != milvus::knowhere::IndexEnum::INDEX_FAISS_IVFPQ) {
|
||||
auto result2 = index_->QueryById(id_dataset, conf_);
|
||||
AssertAnns(result2, nq, k);
|
||||
|
||||
|
@ -143,6 +146,43 @@ TEST_P(IVFTest, ivf_basic) {
|
|||
#endif
|
||||
}
|
||||
|
||||
TEST_P(IVFTest, ivf_basic_gpu) {
|
||||
assert(!xb.empty());
|
||||
|
||||
if (index_mode_ != milvus::knowhere::IndexMode::MODE_GPU) {
|
||||
return;
|
||||
}
|
||||
|
||||
// null faiss index
|
||||
ASSERT_ANY_THROW(index_->Add(base_dataset, conf_));
|
||||
ASSERT_ANY_THROW(index_->AddWithoutIds(base_dataset, conf_));
|
||||
|
||||
index_->Train(base_dataset, conf_);
|
||||
index_->Add(base_dataset, conf_);
|
||||
EXPECT_EQ(index_->Count(), nb);
|
||||
EXPECT_EQ(index_->Dim(), dim);
|
||||
|
||||
auto result = index_->Query(query_dataset, conf_);
|
||||
AssertAnns(result, nq, k);
|
||||
// PrintResult(result, nq, k);
|
||||
|
||||
if (index_type_ != milvus::knowhere::IndexEnum::INDEX_FAISS_IVFSQ8H) {
|
||||
faiss::ConcurrentBitsetPtr concurrent_bitset_ptr = std::make_shared<faiss::ConcurrentBitset>(nb);
|
||||
for (int64_t i = 0; i < nq; ++i) {
|
||||
concurrent_bitset_ptr->set(i);
|
||||
}
|
||||
index_->SetBlacklist(concurrent_bitset_ptr);
|
||||
|
||||
auto result_bs_1 = index_->Query(query_dataset, conf_);
|
||||
AssertAnns(result_bs_1, nq, k, CheckMode::CHECK_NOT_EQUAL);
|
||||
// PrintResult(result, nq, k);
|
||||
}
|
||||
|
||||
#ifdef MILVUS_GPU_VERSION
|
||||
milvus::knowhere::FaissGpuResourceMgr::GetInstance().Dump();
|
||||
#endif
|
||||
}
|
||||
|
||||
TEST_P(IVFTest, ivf_serialize) {
|
||||
fiu_init(0);
|
||||
auto serialize = [](const std::string& filename, milvus::knowhere::BinaryPtr& bin, uint8_t* ret) {
|
||||
|
|
Loading…
Reference in New Issue