mirror of https://github.com/milvus-io/milvus.git
Modify faiss files to reduce the usage of GPU in 0.10.4 (#3893)
* Modify faiss files to reduce the usage of GPU in 0.10.4
* Change the format of the code and release memories of arrays resident on host
* Change the format of the code and release memories of arrays resident on host, fix some typos
* Delete some blank lines
* Temporarily comment the delete function in IVFFlat.cu
* Uncomment some delete functions
* Uncomment some delete functions
* Use tensor to replace arrays in IVFFlat.cu
* Fix some typos

Signed-off-by: lighteningzhang <lighteningzhang@163.com>
Branch: pull/3931/head
parent 7692a6d437
commit 237ce2aba3
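The hunks below all serve one change: rather than asking the coarse quantizer for all `nprobe` lists in a single GPU pass, `IVFFlat::query` now walks `nprobe` in small tiles (8 probes at a time) and merges the per-tile top-k candidates on the host, so the GPU only ever holds one tile's worth of intermediate results. A minimal sketch of that merge strategy in plain C++ (hypothetical `mergeTopK` helper, independent of the faiss GPU types):

    #include <algorithm>
    #include <utility>
    #include <vector>

    // Keep a running top-k while consuming per-tile candidate lists,
    // as the patched IVFFlat::query does on the host side.
    std::vector<std::pair<float, int>> mergeTopK(
        const std::vector<std::vector<std::pair<float, int>>>& tiles, int k) {
      std::vector<std::pair<float, int>> best; // running (distance, id) top-k
      for (const auto& tile : tiles) {
        best.insert(best.end(), tile.begin(), tile.end());
        if ((int)best.size() > k) {
          // Keep only the k smallest distances seen so far
          std::partial_sort(best.begin(), best.begin() + k, best.end());
          best.resize(k);
        }
      }
      std::sort(best.begin(), best.end());
      return best;
    }

Each loop iteration corresponds to one `nprobeTile` pass in the patched `IVFFlat::query` further down; the actual code keeps a 2k-wide host buffer per query and re-sorts it after every tile.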
@@ -250,7 +250,7 @@ GpuIndexIVFFlat::searchImpl_(int n,
  if (!bitset) {
    auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
-   index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
+   index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels, distances, labels);
  } else {
    auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_,
                                             const_cast<uint8_t*>(bitset->data()), stream,
@@ -262,3 +262,4 @@ GpuIndexIVFFlat::searchImpl_(int n,
} } // namespace
@@ -295,6 +295,331 @@ void runDistance(bool computeL2,
  }
}

void runDist(bool computeL2,
             GpuResources* resources,
             Tensor<float, 2, true>& centroids,
             bool centroidsRowMajor,
             Tensor<float, 1, true>* centroidNorms,
             Tensor<float, 2, true>& queries,
             bool queriesRowMajor,
             Tensor<uint8_t, 1, true>& bitset,
             int k,
             Tensor<float, 2, true>& outDistances,
             Tensor<int, 2, true>& outIndices,
             float* hostOutDistances,
             int* hostOutIndices,
             int pos,
             int curTile,
             int nprobe,
             bool ignoreOutDistances) {
  k = curTile;

  // The # of centroids in `centroids` based on memory layout
  auto numCentroids = centroids.getSize(centroidsRowMajor ? 0 : 1);

  // The # of queries in `queries` based on memory layout
  auto numQueries = queries.getSize(queriesRowMajor ? 0 : 1);

  // The dimensions of the vectors to consider
  auto dim = queries.getSize(queriesRowMajor ? 1 : 0);
  FAISS_ASSERT((numQueries == 0 || numCentroids == 0) ||
               dim == centroids.getSize(centroidsRowMajor ? 1 : 0));

  FAISS_ASSERT(outDistances.getSize(0) == numQueries);
  FAISS_ASSERT(outIndices.getSize(0) == numQueries);
  // FAISS_ASSERT(outDistances.getSize(1) == k);
  // FAISS_ASSERT(outIndices.getSize(1) == k);

  auto& mem = resources->getMemoryManagerCurrentDevice();
  auto defaultStream = resources->getDefaultStreamCurrentDevice();

  // If we're querying against a 0-sized set, just return empty results
  if (centroids.numElements() == 0) {
    thrust::fill(thrust::cuda::par.on(defaultStream),
                 outDistances.data(), outDistances.end(),
                 Limits<float>::getMax());

    thrust::fill(thrust::cuda::par.on(defaultStream),
                 outIndices.data(), outIndices.end(),
                 -1);

    return;
  }

  // L2: if ||c||^2 is not pre-computed, calculate it
  DeviceTensor<float, 1, true> cNorms;
  if (computeL2 && !centroidNorms) {
    cNorms =
      std::move(DeviceTensor<float, 1, true>(
                  mem, {numCentroids}, defaultStream));
    runL2Norm(centroids, centroidsRowMajor, cNorms, true, defaultStream);
    centroidNorms = &cNorms;
  }

  //
  // Prepare norm vector ||q||^2; ||c||^2 is already pre-computed
  //
  int qNormSize[1] = {numQueries};
  DeviceTensor<float, 1, true> queryNorms(mem, qNormSize, defaultStream);

  // ||q||^2
  if (computeL2) {
    runL2Norm(queries, queriesRowMajor, queryNorms, true, defaultStream);
  }

  // By default, aim to use up to 512 MB of memory for the processing, with
  // both the number of queries and the number of centroids being at least 512.
  int tileRows = 0;
  int tileCols = 0;
  chooseTileSize(numQueries,
                 numCentroids,
                 dim,
                 sizeof(float),
                 mem.getSizeAvailable(),
                 tileRows,
                 tileCols);

  int numColTiles = utils::divUp(numCentroids, tileCols);

  // We can have any number of vectors to query against, even less than k, in
  // which case we'll return -1 for the index
  FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); // select limitation

  // Temporary output memory space we'll use
  DeviceTensor<float, 2, true> distanceBuf1(
    mem, {tileRows, tileCols}, defaultStream);
  DeviceTensor<float, 2, true> distanceBuf2(
    mem, {tileRows, tileCols}, defaultStream);
  DeviceTensor<float, 2, true>* distanceBufs[2] =
    {&distanceBuf1, &distanceBuf2};

  DeviceTensor<float, 2, true> outDistanceBuf1(
    mem, {tileRows, numColTiles * k}, defaultStream);
  DeviceTensor<float, 2, true> outDistanceBuf2(
    mem, {tileRows, numColTiles * k}, defaultStream);
  DeviceTensor<float, 2, true>* outDistanceBufs[2] =
    {&outDistanceBuf1, &outDistanceBuf2};

  DeviceTensor<int, 2, true> outIndexBuf1(
    mem, {tileRows, numColTiles * k}, defaultStream);
  DeviceTensor<int, 2, true> outIndexBuf2(
    mem, {tileRows, numColTiles * k}, defaultStream);
  DeviceTensor<int, 2, true>* outIndexBufs[2] =
    {&outIndexBuf1, &outIndexBuf2};

  auto streams = resources->getAlternateStreamsCurrentDevice();
  streamWait(streams, {defaultStream});

  int curStream = 0;
  bool interrupt = false;

  // Tile over the input queries
  for (int i = 0; i < numQueries; i += tileRows) {
    if (interrupt || InterruptCallback::is_interrupted()) {
      interrupt = true;
      break;
    }

    int curQuerySize = std::min(tileRows, numQueries - i);

    auto outDistanceView =
      outDistances.narrow(0, i, curQuerySize);
    auto outIndexView =
      outIndices.narrow(0, i, curQuerySize);

    auto queryView =
      queries.narrow(queriesRowMajor ? 0 : 1, i, curQuerySize);
    auto queryNormView =
      queryNorms.narrow(0, i, curQuerySize);

    auto outDistanceBufRowView =
      outDistanceBufs[curStream]->narrow(0, 0, curQuerySize);
    auto outIndexBufRowView =
      outIndexBufs[curStream]->narrow(0, 0, curQuerySize);

    // Tile over the centroids
    for (int j = 0; j < numCentroids; j += tileCols) {
      if (InterruptCallback::is_interrupted()) {
        interrupt = true;
        break;
      }

      int curCentroidSize = std::min(tileCols, numCentroids - j);
      int curColTile = j / tileCols;

      auto centroidsView =
        sliceCentroids(centroids, centroidsRowMajor, j, curCentroidSize);

      auto distanceBufView = distanceBufs[curStream]->
        narrow(0, 0, curQuerySize).narrow(1, 0, curCentroidSize);

      auto outDistanceBufColView =
        outDistanceBufRowView.narrow(1, k * curColTile, k);
      auto outIndexBufColView =
        outIndexBufRowView.narrow(1, k * curColTile, k);

      // L2: distance is ||c||^2 - 2qc + ||q||^2, we compute -2qc
      // IP: just compute qc
      // (query id x dim) x (centroid id, dim)' = (query id, centroid id)
      runMatrixMult(distanceBufView,
                    false, // not transposed
                    queryView,
                    !queriesRowMajor, // transposed MM if col major
                    centroidsView,
                    centroidsRowMajor, // transposed MM if row major
                    computeL2 ? -2.0f : 1.0f,
                    0.0f,
                    resources->getBlasHandleCurrentDevice(),
                    streams[curStream]);

      if (computeL2) {
        // For L2 distance, we use this fused kernel that performs both
        // adding ||c||^2 to -2qc and k-selection, so we only need two
        // passes (one write by the gemm, one read here) over the huge
        // region of output memory
        //
        // If we aren't tiling along the number of centroids, we can perform
        // the output work directly
        if (tileCols == numCentroids) {
          // Write into the final output
          runL2SelMn(hostOutDistances,
                     hostOutIndices,
                     i,
                     curQuerySize,
                     pos,
                     nprobe,
                     distanceBufView,
                     *centroidNorms,
                     bitset,
                     outDistanceView,
                     outIndexView,
                     k,
                     streams[curStream]);

          if (!ignoreOutDistances) {
            // expand (query id) to (query id, k) by duplicating along rows
            // top-k ||c||^2 - 2qc + ||q||^2 in the form (query id, k)
            runSumAlongRows(queryNormView,
                            outDistanceView,
                            true, // L2 distances should not go below zero due
                                  // to roundoff error
                            streams[curStream]);
          }
        } else {
          auto centroidNormsView = centroidNorms->narrow(0, j, curCentroidSize);

          // Write into our intermediate output
          runL2SelectMin(distanceBufView,
                         centroidNormsView,
                         bitset,
                         outDistanceBufColView,
                         outIndexBufColView,
                         k,
                         streams[curStream]);

          if (!ignoreOutDistances) {
            // expand (query id) to (query id, k) by duplicating along rows
            // top-k ||c||^2 - 2qc + ||q||^2 in the form (query id, k)
            runSumAlongRows(queryNormView,
                            outDistanceBufColView,
                            true, // L2 distances should not go below zero due
                                  // to roundoff error
                            streams[curStream]);
          }
        }
      } else {
        // For IP, just k-select the output for this tile
        if (tileCols == numCentroids) {
          // Write into the final output
          runBlockSelect(distanceBufView,
                         bitset,
                         outDistanceView,
                         outIndexView,
                         true, k, streams[curStream]);
        } else {
          // Write into the intermediate output
          runBlockSelect(distanceBufView,
                         bitset,
                         outDistanceBufColView,
                         outIndexBufColView,
                         true, k, streams[curStream]);
        }
      }
    }

    // As we're finished with processing a full set of centroids, perform the
    // final k-selection
    if (tileCols != numCentroids) {
      // The indices are tile-relative; for each tile of k, we need to add
      // tileCols to the index
      runIncrementIndex(outIndexBufRowView, k, tileCols, streams[curStream]);

      runBlockSelectPair(outDistanceBufRowView,
                         outIndexBufRowView,
                         bitset,
                         outDistanceView,
                         outIndexView,
                         computeL2 ? false : true, k, streams[curStream]);
    }

    curStream = (curStream + 1) % 2;
  }

  // Have the desired ordering stream wait on the multi-stream
  streamWait({defaultStream}, streams);

  if (interrupt) {
    FAISS_THROW_MSG("interrupted");
  }
}

void runL2Dist(GpuResources* resources,
               Tensor<float, 2, true>& vectors,
               bool vectorsRowMajor,
               // can be optionally pre-computed; nullptr if we
               // have to compute it upon the call
               Tensor<float, 1, true>* vectorNorms,
               Tensor<float, 2, true>& queries,
               bool queriesRowMajor,
               Tensor<uint8_t, 1, true>& bitset,
               int k,
               Tensor<float, 2, true>& outDistances,
               Tensor<int, 2, true>& outIndices,
               float* hostOutDistances,
               int* hostOutIndices,
               int i,
               int curTile,
               int nprobe,
               bool ignoreOutDistances) {
  runDist(true, // L2
          resources,
          vectors,
          vectorsRowMajor,
          vectorNorms,
          queries,
          queriesRowMajor,
          bitset,
          k,
          outDistances,
          outIndices,
          hostOutDistances,
          hostOutIndices,
          i,
          curTile,
          nprobe,
          ignoreOutDistances);
}
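When the centroid set does not fit in a single tile, each column tile's k winners land in `outIndexBufRowView` with indices relative to their own tile, so they are rebased before the final `runBlockSelectPair` pass. A rough CPU analogy of that rebasing step (assumed semantics of `runIncrementIndex`, not the kernel's actual implementation):

    // Entry t*k + m of a row came from column tile t, whose columns start
    // at t * tileCols in the full centroid range; shift it back to a
    // global centroid index.
    void incrementIndexCpu(int* rowIndices, int numColTiles, int k, int tileCols) {
      for (int t = 0; t < numColTiles; ++t) {
        for (int m = 0; m < k; ++m) {
          rowIndices[t * k + m] += t * tileCols;
        }
      }
    }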

template <typename T>
void runL2Distance(GpuResources* resources,
                   Tensor<T, 2, true>& centroids,
@@ -35,6 +35,48 @@ void runL2Distance(GpuResources* resources,
                   // take shortcuts.
                   bool ignoreOutDistances = false);

void runL2Dist(GpuResources* resources,
               Tensor<float, 2, true>& vectors,
               bool vectorsRowMajor,
               // can be optionally pre-computed; nullptr if we
               // have to compute it upon the call
               Tensor<float, 1, true>* vectorNorms,
               Tensor<float, 2, true>& queries,
               bool queriesRowMajor,
               Tensor<uint8_t, 1, true>& bitset,
               int k,
               Tensor<float, 2, true>& outDistances,
               Tensor<int, 2, true>& outIndices,
               float* hostOutDistances,
               int* hostOutIndices,
               int pos,
               int curTile,
               int nprobe,
               bool ignoreOutDistances = false);

void runDist(bool computeL2,
             GpuResources* resources,
             Tensor<float, 2, true>& centroids,
             bool centroidsRowMajor,
             Tensor<float, 1, true>* centroidNorms,
             Tensor<float, 2, true>& queries,
             bool queriesRowMajor,
             Tensor<uint8_t, 1, true>& bitset,
             int k,
             Tensor<float, 2, true>& outDistances,
             Tensor<int, 2, true>& outIndices,
             float* hostOutDistances,
             int* hostOutIndices,
             int i,
             int curTile,
             int nprobe,
             bool ignoreOutDistances);

/// Calculates brute-force inner product distance between `vectors`
/// and `queries`, returning the k closest results seen
void runIPDistance(GpuResources* resources,
@@ -91,6 +133,7 @@ void bfKnnOnDevice(GpuResources* resources,
                   Tensor<float, 2, true>& outDistances,
                   Tensor<int, 2, true>& outIndices,
                   bool ignoreOutDistances) {
  // We are guaranteed that all data arguments are resident on our preferred
  // `device` here, and are ordered wrt `stream`
@@ -213,4 +256,155 @@ void bfKnnOnDevice(GpuResources* resources,
}

template <typename T>
void bfKnnOnDev(GpuResources* resources,
                int device,
                cudaStream_t stream,
                Tensor<T, 2, true>& vectors,
                bool vectorsRowMajor,
                Tensor<float, 1, true>* vectorNorms,
                Tensor<T, 2, true>& queries,
                bool queriesRowMajor,
                Tensor<uint8_t, 1, true>& bitset,
                int k,
                faiss::MetricType metric,
                float metricArg,
                Tensor<float, 2, true>& outDistances,
                Tensor<int, 2, true>& outIndices,
                float* outDis_h,
                int* outInd_h,
                int i,
                int curTile,
                int nprobe,
                bool ignoreOutDistances) {
  // We are guaranteed that all data arguments are resident on our preferred
  // `device` here, and are ordered wrt `stream`

  // L2 and IP are specialized to use GEMM and an optimized L2 + selection or
  // pure k-selection kernel.
  if ((metric == faiss::MetricType::METRIC_L2) ||
      (metric == faiss::MetricType::METRIC_Lp &&
       metricArg == 2)) {
    runL2Dist(resources,
              vectors,
              vectorsRowMajor,
              vectorNorms,
              queries,
              queriesRowMajor,
              bitset,
              k,
              outDistances,
              outIndices,
              outDis_h,
              outInd_h,
              i,
              curTile,
              nprobe);
  } else if (metric == faiss::MetricType::METRIC_INNER_PRODUCT) {
    runIPDistance(resources,
                  vectors,
                  vectorsRowMajor,
                  queries,
                  queriesRowMajor,
                  bitset,
                  k,
                  outDistances,
                  outIndices);
  } else {
    //
    // General pairwise distance kernel
    //
    // The general distance kernel does not have specializations for
    // transpositions (NN, NT, TN); instead, the transposition is just handled
    // upon data load for now, which could result in poor data loading behavior
    // for NT / TN. This can be fixed at a later date if desired, but efficiency
    // is low versus GEMM anyway.
    //

    Tensor<T, 2> tVectorsDimInnermost =
      vectorsRowMajor ?
      vectors.transposeInnermost(1) :
      vectors.transposeInnermost(0);
    Tensor<T, 2> tQueriesDimInnermost =
      queriesRowMajor ?
      queries.transposeInnermost(1) :
      queries.transposeInnermost(0);

    if ((metric == faiss::MetricType::METRIC_L1) ||
        (metric == faiss::MetricType::METRIC_Lp &&
         metricArg == 1)) {
      runGeneralDistance(resources,
                         tVectorsDimInnermost,
                         tQueriesDimInnermost,
                         bitset,
                         k,
                         L1Distance(),
                         outDistances,
                         outIndices);
    } else if (metric == faiss::MetricType::METRIC_Lp &&
               metricArg == -1) {
      // A way to test L2 distance
      runGeneralDistance(resources,
                         tVectorsDimInnermost,
                         tQueriesDimInnermost,
                         bitset,
                         k,
                         L2Distance(),
                         outDistances,
                         outIndices);
    } else if (metric == faiss::MetricType::METRIC_Lp) {
      runGeneralDistance(resources,
                         tVectorsDimInnermost,
                         tQueriesDimInnermost,
                         bitset,
                         k,
                         LpDistance(metricArg),
                         outDistances,
                         outIndices);
    } else if (metric == faiss::MetricType::METRIC_Linf) {
      runGeneralDistance(resources,
                         tVectorsDimInnermost,
                         tQueriesDimInnermost,
                         bitset,
                         k,
                         LinfDistance(),
                         outDistances,
                         outIndices);
    } else if (metric == faiss::MetricType::METRIC_Canberra) {
      runGeneralDistance(resources,
                         tVectorsDimInnermost,
                         tQueriesDimInnermost,
                         bitset,
                         k,
                         CanberraDistance(),
                         outDistances,
                         outIndices);
    } else if (metric == faiss::MetricType::METRIC_BrayCurtis) {
      runGeneralDistance(resources,
                         tVectorsDimInnermost,
                         tQueriesDimInnermost,
                         bitset,
                         k,
                         BrayCurtisDistance(),
                         outDistances,
                         outIndices);
    } else if (metric == faiss::MetricType::METRIC_JensenShannon) {
      runGeneralDistance(resources,
                         tVectorsDimInnermost,
                         tQueriesDimInnermost,
                         bitset,
                         k,
                         JensenShannonDistance(),
                         outDistances,
                         outIndices);
    } else {
      FAISS_THROW_FMT("unsupported metric type %d", metric);
    }
  }
}

} } // namespace
@@ -1,388 +1,460 @@
/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <faiss/gpu/impl/FlatIndex.cuh>
#include <faiss/gpu/impl/Distance.cuh>
#include <faiss/gpu/impl/L2Norm.cuh>
#include <faiss/gpu/impl/VectorResidual.cuh>
#include <faiss/gpu/utils/ConversionOperators.cuh>
#include <faiss/gpu/utils/CopyUtils.cuh>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/Transpose.cuh>

namespace faiss { namespace gpu {

FlatIndex::FlatIndex(GpuResources* res,
                     int dim,
                     bool useFloat16,
                     bool storeTransposed,
                     MemorySpace space) :
    resources_(res),
    dim_(dim),
    useFloat16_(useFloat16),
    storeTransposed_(storeTransposed),
    space_(space),
    num_(0),
    rawData_(space) {
#ifndef FAISS_USE_FLOAT16
  FAISS_ASSERT(!useFloat16_);
#endif
}

bool
FlatIndex::getUseFloat16() const {
  return useFloat16_;
}

/// Returns the number of vectors we contain
int FlatIndex::getSize() const {
#ifdef FAISS_USE_FLOAT16
  if (useFloat16_) {
    return vectorsHalf_.getSize(0);
  } else {
    return vectors_.getSize(0);
  }
#else
  return vectors_.getSize(0);
#endif
}

int FlatIndex::getDim() const {
#ifdef FAISS_USE_FLOAT16
  if (useFloat16_) {
    return vectorsHalf_.getSize(1);
  } else {
    return vectors_.getSize(1);
  }
#else
  return vectors_.getSize(1);
#endif
}

void
FlatIndex::reserve(size_t numVecs, cudaStream_t stream) {
#ifdef FAISS_USE_FLOAT16
  if (useFloat16_) {
    rawData_.reserve(numVecs * dim_ * sizeof(half), stream);
  } else {
    rawData_.reserve(numVecs * dim_ * sizeof(float), stream);
  }
#else
  rawData_.reserve(numVecs * dim_ * sizeof(float), stream);
#endif
}

template <>
Tensor<float, 2, true>&
FlatIndex::getVectorsRef<float>() {
  // Should not call this unless we are in float32 mode
  FAISS_ASSERT(!useFloat16_);
  return getVectorsFloat32Ref();
}

#ifdef FAISS_USE_FLOAT16
template <>
Tensor<half, 2, true>&
FlatIndex::getVectorsRef<half>() {
  // Should not call this unless we are in float16 mode
  FAISS_ASSERT(useFloat16_);
  return getVectorsFloat16Ref();
}
#endif

Tensor<float, 2, true>&
FlatIndex::getVectorsFloat32Ref() {
  // Should not call this unless we are in float32 mode
  FAISS_ASSERT(!useFloat16_);

  return vectors_;
}

#ifdef FAISS_USE_FLOAT16
Tensor<half, 2, true>&
FlatIndex::getVectorsFloat16Ref() {
  // Should not call this unless we are in float16 mode
  FAISS_ASSERT(useFloat16_);

  return vectorsHalf_;
}
#endif

DeviceTensor<float, 2, true>
FlatIndex::getVectorsFloat32Copy(cudaStream_t stream) {
  return getVectorsFloat32Copy(0, num_, stream);
}

DeviceTensor<float, 2, true>
FlatIndex::getVectorsFloat32Copy(int from, int num, cudaStream_t stream) {
  DeviceTensor<float, 2, true> vecFloat32({num, dim_}, space_);

#ifdef FAISS_USE_FLOAT16
  if (useFloat16_) {
    auto halfNarrow = vectorsHalf_.narrowOutermost(from, num);
    convertTensor<half, float, 2>(stream, halfNarrow, vecFloat32);
  } else {
    vectors_.copyTo(vecFloat32, stream);
  }
#else
  vectors_.copyTo(vecFloat32, stream);
#endif

  return vecFloat32;
}

void
FlatIndex::query(Tensor<float, 2, true>& input,
                 Tensor<uint8_t, 1, true>& bitset,
                 int k,
                 faiss::MetricType metric,
                 float metricArg,
                 Tensor<float, 2, true>& outDistances,
                 Tensor<int, 2, true>& outIndices,
                 bool exactDistance) {
  auto stream = resources_->getDefaultStreamCurrentDevice();
  auto& mem = resources_->getMemoryManagerCurrentDevice();

#ifdef FAISS_USE_FLOAT16
  if (useFloat16_) {
    // We need to convert the input to float16 for comparison to ourselves
    auto inputHalf =
      convertTensor<float, half, 2>(resources_, stream, input);

    query(inputHalf, bitset, k, metric, metricArg,
          outDistances, outIndices, exactDistance);
  } else {
    bfKnnOnDevice(resources_,
                  getCurrentDevice(),
                  stream,
                  storeTransposed_ ? vectorsTransposed_ : vectors_,
                  !storeTransposed_, // is vectors row major?
                  &norms_,
                  input,
                  true, // input is row major
                  bitset,
                  k,
                  metric,
                  metricArg,
                  outDistances,
                  outIndices,
                  !exactDistance);
  }
#else
  bfKnnOnDevice(resources_,
                getCurrentDevice(),
                stream,
                storeTransposed_ ? vectorsTransposed_ : vectors_,
                !storeTransposed_, // is vectors row major?
                &norms_,
                input,
                true, // input is row major
                bitset,
                k,
                metric,
                metricArg,
                outDistances,
                outIndices,
                !exactDistance);
#endif
}

void
FlatIndex::query(Tensor<float, 2, true>& input,
                 Tensor<uint8_t, 1, true>& bitset,
                 int k,
                 faiss::MetricType metric,
                 float metricArg,
                 Tensor<float, 2, true>& outDistances,
                 Tensor<int, 2, true>& outIndices,
                 float* hostOutDistances,
                 int* hostOutIndices,
                 int i,
                 int curTile,
                 int nprobe,
                 bool exactDistance) {
  auto stream = resources_->getDefaultStreamCurrentDevice();
  auto& mem = resources_->getMemoryManagerCurrentDevice();

#ifdef FAISS_USE_FLOAT16
  if (useFloat16_) {
    // We need to convert the input to float16 for comparison to ourselves
    auto inputHalf =
      convertTensor<float, half, 2>(resources_, stream, input);

    query(inputHalf, bitset, k, metric, metricArg,
          outDistances, outIndices, exactDistance);
  } else {
    bfKnnOnDevice(resources_,
                  getCurrentDevice(),
                  stream,
                  storeTransposed_ ? vectorsTransposed_ : vectors_,
                  !storeTransposed_, // is vectors row major?
                  &norms_,
                  input,
                  true, // input is row major
                  bitset,
                  k,
                  metric,
                  metricArg,
                  outDistances,
                  outIndices,
                  !exactDistance);
  }
#else
  bfKnnOnDev(resources_,
             getCurrentDevice(),
             stream,
             storeTransposed_ ? vectorsTransposed_ : vectors_,
             !storeTransposed_, // is vectors row major?
             &norms_,
             input,
             true, // input is row major
             bitset,
             k,
             metric,
             metricArg,
             outDistances,
             outIndices,
             hostOutDistances,
             hostOutIndices,
             i,
             curTile,
             nprobe,
             !exactDistance);
#endif
}

#ifdef FAISS_USE_FLOAT16
void
FlatIndex::query(Tensor<half, 2, true>& input,
                 Tensor<uint8_t, 1, true>& bitset,
                 int k,
                 faiss::MetricType metric,
                 float metricArg,
                 Tensor<float, 2, true>& outDistances,
                 Tensor<int, 2, true>& outIndices,
                 bool exactDistance) {
  FAISS_ASSERT(useFloat16_);

  bfKnnOnDevice(resources_,
                getCurrentDevice(),
                resources_->getDefaultStreamCurrentDevice(),
                storeTransposed_ ? vectorsHalfTransposed_ : vectorsHalf_,
                !storeTransposed_, // is vectors row major?
                &norms_,
                input,
                true, // input is row major
                bitset,
                k,
                metric,
                metricArg,
                outDistances,
                outIndices,
                !exactDistance);
}
#endif

void
FlatIndex::computeResidual(Tensor<float, 2, true>& vecs,
                           Tensor<int, 1, true>& listIds,
                           Tensor<float, 2, true>& residuals) {
#ifdef FAISS_USE_FLOAT16
  if (useFloat16_) {
    runCalcResidual(vecs,
                    getVectorsFloat16Ref(),
                    listIds,
                    residuals,
                    resources_->getDefaultStreamCurrentDevice());
  } else {
    runCalcResidual(vecs,
                    getVectorsFloat32Ref(),
                    listIds,
                    residuals,
                    resources_->getDefaultStreamCurrentDevice());
  }
#else
  runCalcResidual(vecs,
                  getVectorsFloat32Ref(),
                  listIds,
                  residuals,
                  resources_->getDefaultStreamCurrentDevice());
#endif
}

void
FlatIndex::reconstruct(Tensor<int, 1, true>& listIds,
                       Tensor<float, 2, true>& vecs) {
#ifdef FAISS_USE_FLOAT16
  if (useFloat16_) {
    runReconstruct(listIds,
                   getVectorsFloat16Ref(),
                   vecs,
                   resources_->getDefaultStreamCurrentDevice());
  } else {
    runReconstruct(listIds,
                   getVectorsFloat32Ref(),
                   vecs,
                   resources_->getDefaultStreamCurrentDevice());
  }
#else
  runReconstruct(listIds,
                 getVectorsFloat32Ref(),
                 vecs,
                 resources_->getDefaultStreamCurrentDevice());
#endif
}

void
FlatIndex::reconstruct(Tensor<int, 2, true>& listIds,
                       Tensor<float, 3, true>& vecs) {
  auto listIds1 = listIds.downcastOuter<1>();
  auto vecs2 = vecs.downcastOuter<2>();

  reconstruct(listIds1, vecs2);
}

void
FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) {
  if (numVecs == 0) {
    return;
  }

#ifdef FAISS_USE_FLOAT16
  if (useFloat16_) {
    // Make sure that `data` is on our device; we'll run the
    // conversion on our device
    auto devData = toDevice<float, 2>(resources_,
                                      getCurrentDevice(),
                                      (float*) data,
                                      stream,
                                      {numVecs, dim_});

    auto devDataHalf =
      convertTensor<float, half, 2>(resources_, stream, devData);

    rawData_.append((char*) devDataHalf.data(),
                    devDataHalf.getSizeInBytes(),
                    stream,
                    true /* reserve exactly */);
  } else {
    rawData_.append((char*) data,
                    (size_t) dim_ * numVecs * sizeof(float),
                    stream,
                    true /* reserve exactly */);
  }
#else
  rawData_.append((char*) data,
                  (size_t) dim_ * numVecs * sizeof(float),
                  stream,
                  true /* reserve exactly */);
#endif
  num_ += numVecs;

#ifdef FAISS_USE_FLOAT16
  if (useFloat16_) {
    DeviceTensor<half, 2, true> vectorsHalf(
      (half*) rawData_.data(), {(int) num_, dim_}, space_);
    vectorsHalf_ = std::move(vectorsHalf);
  } else {
    DeviceTensor<float, 2, true> vectors(
      (float*) rawData_.data(), {(int) num_, dim_}, space_);
    vectors_ = std::move(vectors);
  }
#else
  DeviceTensor<float, 2, true> vectors(
    (float*) rawData_.data(), {(int) num_, dim_}, space_);
  vectors_ = std::move(vectors);
#endif

  if (storeTransposed_) {
#ifdef FAISS_USE_FLOAT16
    if (useFloat16_) {
      vectorsHalfTransposed_ =
        std::move(DeviceTensor<half, 2, true>({dim_, (int) num_}, space_));
      runTransposeAny(vectorsHalf_, 0, 1, vectorsHalfTransposed_, stream);
    } else {
      vectorsTransposed_ =
        std::move(DeviceTensor<float, 2, true>({dim_, (int) num_}, space_));
      runTransposeAny(vectors_, 0, 1, vectorsTransposed_, stream);
    }
#else
    vectorsTransposed_ =
      std::move(DeviceTensor<float, 2, true>({dim_, (int) num_}, space_));
    runTransposeAny(vectors_, 0, 1, vectorsTransposed_, stream);
#endif
  }

  // Precompute L2 norms of our database
#ifdef FAISS_USE_FLOAT16
  if (useFloat16_) {
    DeviceTensor<float, 1, true> norms({(int) num_}, space_);
    runL2Norm(vectorsHalf_, true, norms, true, stream);
    norms_ = std::move(norms);
  } else {
    DeviceTensor<float, 1, true> norms({(int) num_}, space_);
    runL2Norm(vectors_, true, norms, true, stream);
    norms_ = std::move(norms);
  }
#else
  DeviceTensor<float, 1, true> norms({(int) num_}, space_);
  runL2Norm(vectors_, true, norms, true, stream);
  norms_ = std::move(norms);
#endif
}

void
FlatIndex::reset() {
  rawData_.clear();
  vectors_ = std::move(DeviceTensor<float, 2, true>());
  vectorsTransposed_ = std::move(DeviceTensor<float, 2, true>());
#ifdef FAISS_USE_FLOAT16
  vectorsHalf_ = std::move(DeviceTensor<half, 2, true>());
  vectorsHalfTransposed_ = std::move(DeviceTensor<half, 2, true>());
#endif
  norms_ = std::move(DeviceTensor<float, 1, true>());
  num_ = 0;
}

} }
@@ -1,139 +1,153 @@
/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <faiss/MetricType.h>
#include <faiss/gpu/utils/DeviceTensor.cuh>
#include <faiss/gpu/utils/DeviceVector.cuh>
#include <faiss/gpu/utils/MemorySpace.h>

namespace faiss { namespace gpu {

class GpuResources;

/// Holder of GPU resources for a particular flat index
class FlatIndex {
 public:
  FlatIndex(GpuResources* res,
            int dim,
            bool useFloat16,
            bool storeTransposed,
            MemorySpace space);

  /// Whether or not this flat index primarily stores data in float16
  bool getUseFloat16() const;

  /// Returns the number of vectors we contain
  int getSize() const;

  /// Returns the dimensionality of the vectors
  int getDim() const;

  /// Reserve storage that can contain at least this many vectors
  void reserve(size_t numVecs, cudaStream_t stream);

  /// Returns the vectors based on the type desired; the FlatIndex must be of
  /// the same type (float16 or float32) to not assert
  template <typename T>
  Tensor<T, 2, true>& getVectorsRef();

  /// Returns a reference to our vectors currently in use
  Tensor<float, 2, true>& getVectorsFloat32Ref();

  /// Returns a reference to our vectors currently in use (useFloat16 mode)
#ifdef FAISS_USE_FLOAT16
  Tensor<half, 2, true>& getVectorsFloat16Ref();
#endif

  /// Performs a copy of the vectors on the given device, converting
  /// as needed from float16
  DeviceTensor<float, 2, true> getVectorsFloat32Copy(cudaStream_t stream);

  /// Returns only a subset of the vectors
  DeviceTensor<float, 2, true> getVectorsFloat32Copy(int from,
                                                     int num,
                                                     cudaStream_t stream);

  void query(Tensor<float, 2, true>& vecs,
             Tensor<uint8_t, 1, true>& bitset,
             int k,
             faiss::MetricType metric,
             float metricArg,
             Tensor<float, 2, true>& outDistances,
             Tensor<int, 2, true>& outIndices,
             float* hostOutDistances,
             int* hostOutIndices,
             int i,
             int curTile,
             int nprobe,
             bool exactDistance);

  void query(Tensor<float, 2, true>& vecs,
             Tensor<uint8_t, 1, true>& bitset,
             int k,
             faiss::MetricType metric,
             float metricArg,
             Tensor<float, 2, true>& outDistances,
             Tensor<int, 2, true>& outIndices,
             bool exactDistance);

#ifdef FAISS_USE_FLOAT16
  void query(Tensor<half, 2, true>& vecs,
             Tensor<uint8_t, 1, true>& bitset,
             int k,
             faiss::MetricType metric,
             float metricArg,
             Tensor<float, 2, true>& outDistances,
             Tensor<int, 2, true>& outIndices,
             bool exactDistance);
#endif

  /// Compute residual for set of vectors
  void computeResidual(Tensor<float, 2, true>& vecs,
                       Tensor<int, 1, true>& listIds,
                       Tensor<float, 2, true>& residuals);

  /// Gather vectors given the set of IDs
  void reconstruct(Tensor<int, 1, true>& listIds,
                   Tensor<float, 2, true>& vecs);

  void reconstruct(Tensor<int, 2, true>& listIds,
                   Tensor<float, 3, true>& vecs);

  /// Add vectors to ourselves; the pointer passed can be on the host
  /// or the device
  void add(const float* data, int numVecs, cudaStream_t stream);

  /// Free all storage
  void reset();

 private:
  /// Collection of GPU resources that we use
  GpuResources* resources_;

  /// Dimensionality of our vectors
  const int dim_;

  /// Float16 data format
  const bool useFloat16_;

  /// Store vectors in transposed layout for speed; makes addition to
  /// the index slower
  const bool storeTransposed_;

  /// Memory space for our allocations
  MemorySpace space_;

  /// How many vectors we have
  int num_;

  /// The underlying expandable storage
  DeviceVector<char> rawData_;

  /// Vectors currently in rawData_
  DeviceTensor<float, 2, true> vectors_;
  DeviceTensor<float, 2, true> vectorsTransposed_;

  /// Vectors currently in rawData_, float16 form
#ifdef FAISS_USE_FLOAT16
  DeviceTensor<half, 2, true> vectorsHalf_;
  DeviceTensor<half, 2, true> vectorsHalfTransposed_;
#endif

  /// Precomputed L2 norms
  DeviceTensor<float, 1, true> norms_;
};

} } // namespace
@@ -25,6 +25,9 @@
#include <unordered_map>
#include <numeric>

#include <string.h>
#include <iostream>

namespace faiss { namespace gpu {

IVFFlat::IVFFlat(GpuResources* resources,
@ -410,4 +413,158 @@ IVFFlat::query(Tensor<float, 2, true>& queries,
|
|||
}
|
||||
}
|
||||
|
||||
void Usort(float *dis, int *ind, int sz){
|
||||
for(int i = 0;i < sz;i ++) {
|
||||
for(int j = i + 1; j < sz; j ++) {
|
||||
if(dis[j] < dis[i]){
|
||||
std::swap(dis[i], dis[j]);
|
||||
std::swap(ind[i], ind[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
IVFFlat::query(Tensor<float, 2, true>& queries,
               Tensor<uint8_t, 1, true>& bitset,
               int nprobe,
               int k,
               Tensor<float, 2, true>& outDistances,
               Tensor<long, 2, true>& outIndices,
               float* distances,
               Index::idx_t* labels) {
  // NOTE: `distances` and `labels` are accepted for the widened signature
  // but are not written in this body; results go through outDistances and
  // outIndices.
  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // These are caught at a higher level
  FAISS_ASSERT(nprobe <= GPU_MAX_SELECTION_K);
  FAISS_ASSERT(k <= GPU_MAX_SELECTION_K);
  nprobe = std::min(nprobe, quantizer_->getSize());

  FAISS_ASSERT(queries.getSize(1) == dim_);

  FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
  FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));

  // Reserve space for the quantized information
  DeviceTensor<float, 2, true>
    coarseDistances(mem, {queries.getSize(0), nprobe}, stream);
  DeviceTensor<int, 2, true>
    coarseIndices(mem, {queries.getSize(0), nprobe}, stream);

  DeviceTensor<uint8_t, 1, true> coarseBitset(mem, {0}, stream);

  // Find the `nprobe` closest lists; we can use int indices both
  // internally and externally

  // Host-side staging: each query keeps a 2*k-wide buffer so the running
  // top-k can be merged with each tile's fresh top-k.
  HostTensor<int, 2, true> hostOutStoreIndicesTensor({queries.getSize(0), k * 2});
  HostTensor<float, 2, true> hostOutStoreDistancesTensor({queries.getSize(0), k * 2});

  float* hostCoarseDistances = new float[queries.getSize(0) * nprobe];
  int* hostCoarseIndices = new int[queries.getSize(0) * nprobe];

  float* hostOutStoreDistances = hostOutStoreDistancesTensor.data();
  int* hostOutStoreIndices = hostOutStoreIndicesTensor.data();

  HostTensor<long, 2, true> hostOutIndices(outIndices, stream);
  HostTensor<float, 2, true> hostOutDistances(outDistances, stream);
  float* tmpDistances = hostOutDistances.data();
  long* tmpIndices = hostOutIndices.data();

  // Process the probe lists in tiles of `nprobeTile` to bound peak GPU
  // memory usage.
  const int nprobeTile = 8;

  for (int i = 0; i < nprobe; i += nprobeTile) {
    int curTile = std::min(nprobeTile, nprobe - i);

    quantizer_->query(queries,
                      coarseBitset,
                      curTile,
                      metric_,
                      metricArg_,
                      coarseDistances,
                      coarseIndices,
                      hostCoarseDistances,
                      hostCoarseIndices,
                      i,
                      curTile,
                      nprobe,
                      false);

    DeviceTensor<float, 3, true>
      residualBase(mem, {queries.getSize(0), nprobe, dim_}, stream);

    if (useResidual_) {
      // Reconstruct vectors from the quantizer
      quantizer_->reconstruct(coarseIndices, residualBase);
    }

    runIVFFlatScan(queries,
                   coarseIndices,
                   bitset,
                   deviceListDataPointers_,
                   deviceListIndexPointers_,
                   indicesOptions_,
                   deviceListLengths_,
                   maxListLength_,
                   k,
                   metric_,
                   useResidual_,
                   residualBase,
                   scalarQ_.get(),
                   outDistances,
                   outIndices,
                   resources_);

    fromDevice<float, 2>(outDistances, tmpDistances, stream);
    fromDevice<long, 2>(outIndices, tmpIndices, stream);

    if (i) {
      // Later tiles: append the tile's top-k to the second half of the
      // staging buffer, then re-sort the 2*k candidates per query.
      for (int d = 0; d < queries.getSize(0); d++) {
        for (int m = 0; m < k; m++) {
          hostOutStoreDistances[d * 2 * k + k + m] = tmpDistances[d * k + m];
          hostOutStoreIndices[d * 2 * k + k + m] = tmpIndices[d * k + m];
        }
        Usort(hostOutStoreDistances + k * 2 * d, hostOutStoreIndices + k * 2 * d, 2 * k);
      }
    } else {
      // First tile: just seed the first half of the staging buffer.
      for (int d = 0; d < queries.getSize(0); d++) {
        for (int m = 0; m < k; m++) {
          hostOutStoreDistances[d * 2 * k + m] = tmpDistances[d * k + m];
          hostOutStoreIndices[d * 2 * k + m] = tmpIndices[d * k + m];
        }
      }
    }
  }

  // The best k of the merged candidates are the final results.
  for (int d = 0; d < queries.getSize(0); d++) {
    for (int m = 0; m < k; m++) {
      tmpDistances[d * k + m] = hostOutStoreDistances[d * 2 * k + m];
      tmpIndices[d * k + m] = hostOutStoreIndices[d * 2 * k + m];
    }
  }
  outIndices.copyFrom(hostOutIndices, stream);
  outDistances.copyFrom(hostOutDistances, stream);

  // If the GPU isn't storing indices (they are on the CPU side), we
  // need to perform the re-mapping here
  // FIXME: we might ultimately be calling this function with inputs
  // from the CPU, these are unnecessary copies
  if (indicesOptions_ == INDICES_CPU) {
    HostTensor<long, 2, true> hostOutIndices(outIndices, stream);

    ivfOffsetToUserIndex(hostOutIndices.data(),
                         numLists_,
                         hostOutIndices.getSize(0),
                         hostOutIndices.getSize(1),
                         listOffsetToUserIndex_);

    // Copy back to GPU, since the input to this function is on the
    // GPU
    outIndices.copyFrom(hostOutIndices, stream);
  }

  delete [] hostCoarseDistances;
  delete [] hostCoarseIndices;
}

} } // namespace
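
After the first tile, both halves of the 2*k staging buffer are already sorted ascending, so the quadratic Usort pass could in principle be replaced by a linear two-pointer merge. A minimal sketch of that merge (hypothetical helper, not part of this change):

// Minimal sketch: merge two ascending top-k lists into one ascending top-k.
// Illustrates what the 2*k staging buffer plus Usort computes per query.
static void mergeTopK(const float* dA, const int* iA,   // running top-k
                      const float* dB, const int* iB,   // tile's top-k
                      float* dOut, int* iOut, int k) {
    int a = 0, b = 0;
    for (int m = 0; m < k; m++) {
        // Take the smaller head; ties favor the running list.
        if (b >= k || (a < k && dA[a] <= dB[b])) {
            dOut[m] = dA[a];
            iOut[m] = iA[a];
            a++;
        } else {
            dOut[m] = dB[b];
            iOut[m] = iB[b];
            b++;
        }
    }
}
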
@ -55,7 +55,15 @@ class IVFFlat : public IVFBase {
              int k,
              Tensor<float, 2, true>& outDistances,
              Tensor<long, 2, true>& outIndices);

   void query(Tensor<float, 2, true>& queries,
              Tensor<uint8_t, 1, true>& bitset,
              int nprobe,
              int k,
              Tensor<float, 2, true>& outDistances,
              Tensor<long, 2, true>& outIndices,
              float* distances,
              Index::idx_t* labels);

 private:
   /// Returns the size of our stored vectors, in bytes
   size_t getVectorMemorySize() const;

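The widened declaration threads two caller-owned host result buffers through query(). A minimal sketch of sizing them before the call (helper name and vector types are illustrative, not part of this change):

// Hypothetical caller-side sizing for the widened query() signature: both
// host buffers must hold k results for each of n queries, row-major [n, k].
#include <vector>
#include <faiss/Index.h>

void sizeHostResultBuffers(int n, int k,
                           std::vector<float>& distances,
                           std::vector<faiss::Index::idx_t>& labels) {
    distances.resize(size_t(n) * k);  // final distances, [n, k]
    labels.resize(size_t(n) * k);     // final labels, [n, k]
}
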
@ -7,6 +7,13 @@

#include <faiss/gpu/impl/L2Select.cuh>
#include <stdio.h>
#include <cstring>
#include <faiss/gpu/utils/MemorySpace.h>
#include <faiss/gpu/StandardGpuResources.h>
#include <faiss/gpu/utils/DeviceTensor.cuh>
#include <faiss/gpu/utils/CopyUtils.cuh>

#include <faiss/impl/FaissAssert.h>

#include <faiss/gpu/utils/DeviceDefs.cuh>

@ -181,6 +188,49 @@ __global__ void l2SelectMinK(Tensor<T, 2, true> productDistances,
  }
}

template <typename T, int NumWarpQ, int NumThreadQ, int ThreadsPerBlock>
__global__ void l2SelectMinK(Tensor<T, 2, true> productDistances,
                             Tensor<T, 1, true> centroidDistances,
                             Tensor<T, 2, true> outDistances,
                             Tensor<int, 2, true> outIndices,
                             int k, T initK, int prev) {
  // Each block handles a single row of the distances (results)
  constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;

  __shared__ T smemK[kNumWarps * NumWarpQ];
  __shared__ int smemV[kNumWarps * NumWarpQ];

  BlockSelect<T, int, false, Comparator<T>,
              NumWarpQ, NumThreadQ, ThreadsPerBlock>
    heap(initK, -1, smemK, smemV, k);

  int row = blockIdx.x;

  // Whole warps must participate in the selection
  int limit = utils::roundDown(productDistances.getSize(1), kWarpSize);
  int i = threadIdx.x;

  for (; i < limit; i += blockDim.x) {
    T v = Math<T>::add(centroidDistances[i],
                       productDistances[row][i]);
    heap.add(v, i);
  }

  if (i < productDistances.getSize(1)) {
    T v = Math<T>::add(centroidDistances[i],
                       productDistances[row][i]);
    heap.addThreadQ(v, i);
  }

  heap.reduce();
  // Emit the [prev, prev + k) slice of the selected values into the
  // k-wide output row.
  for (int i = threadIdx.x + prev; i < k + prev; i += blockDim.x) {
    outDistances[row][i - prev] = smemK[i];
    outIndices[row][i - prev] = smemV[i];
  }
}

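For sanity-checking the `prev` slicing, a host-side reference of what one row of this kernel selects (a sketch, assuming productRow and centroidDistances have equal length):

// Host-side reference for one row of l2SelectMinK: the k smallest values of
// centroidDistances[c] + productDistances[row][c], ascending, paired with
// their column indices. The kernel's `prev` then shifts which slice of the
// selected values is emitted.
#include <algorithm>
#include <utility>
#include <vector>

std::vector<std::pair<float, int>>
l2SelectRowReference(const std::vector<float>& productRow,
                     const std::vector<float>& centroidDistances,
                     int k) {
    std::vector<std::pair<float, int>> v;
    v.reserve(productRow.size());
    for (int c = 0; c < (int)productRow.size(); c++) {
        v.emplace_back(productRow[c] + centroidDistances[c], c);
    }
    int kk = std::min(k, (int)v.size());
    std::partial_sort(v.begin(), v.begin() + kk, v.end());
    v.resize(kk);
    return v;  // ascending (distance, column) pairs
}
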
template <typename T>
void runL2SelectMin(Tensor<T, 2, true>& productDistances,
                    Tensor<T, 1, true>& centroidDistances,

@ -245,6 +295,94 @@ void runL2SelectMin(Tensor<T, 2, true>& productDistances,
  CUDA_TEST_ERROR();
}

void runL2SelMn(float* hostOutDistances,
                int* hostOutIndices,
                int startPos,
                int curQuerySize,
                int i,
                int nprobe,
                Tensor<float, 2, true>& productDistances,
                Tensor<float, 1, true>& centroidDistances,
                Tensor<uint8_t, 1, true>& bitset,
                Tensor<float, 2, true>& outDistances,
                Tensor<int, 2, true>& outIndices,
                int k,
                cudaStream_t stream) {
  FAISS_ASSERT(productDistances.getSize(0) == outDistances.getSize(0));
  FAISS_ASSERT(productDistances.getSize(0) == outIndices.getSize(0));
  FAISS_ASSERT(centroidDistances.getSize(0) == productDistances.getSize(1));
  // FAISS_ASSERT(outDistances.getSize(1) == k);
  // FAISS_ASSERT(outIndices.getSize(1) == k);
  FAISS_ASSERT(k <= GPU_MAX_SELECTION_K);

  if (k == 1) {
    constexpr int kThreadsPerBlock = 256;
    constexpr int kRowsPerBlock = 8;

    auto block = dim3(kThreadsPerBlock);
    auto grid = dim3(utils::divUp(outDistances.getSize(0), kRowsPerBlock));

    l2SelectMin1<float, kRowsPerBlock, kThreadsPerBlock>
      <<<grid, block, 0, stream>>>(productDistances, centroidDistances, bitset,
                                   outDistances, outIndices);
  } else {
    auto grid = dim3(outDistances.getSize(0));

#define RUN_L2_SELECT(BLOCK, NUM_WARP_Q, NUM_THREAD_Q)                    \
    do {                                                                  \
      l2SelectMinK<float, NUM_WARP_Q, NUM_THREAD_Q, BLOCK>                \
        <<<grid, BLOCK, 0, stream>>>(productDistances, centroidDistances, \
                                     outDistances, outIndices,            \
                                     k, Limits<float>::getMax(), i);      \
    } while (0)

    // block size 128 for everything <= 1024
    if (k <= 32) {
      RUN_L2_SELECT(128, 32, 2);
    } else if (k <= 64) {
      RUN_L2_SELECT(128, 64, 3);
    } else if (k <= 128) {
      RUN_L2_SELECT(128, 128, 3);
    } else if (k <= 256) {
      RUN_L2_SELECT(128, 256, 4);
    } else if (k <= 512) {
      RUN_L2_SELECT(128, 512, 8);
    } else if (k <= 1024) {
      RUN_L2_SELECT(128, 1024, 8);
#if GPU_MAX_SELECTION_K >= 2048
    } else if (k <= 2048) {
      // smaller block for less shared memory
      RUN_L2_SELECT(64, 2048, 8);
#endif
    } else {
      FAISS_ASSERT(false);
    }

    // Stage this tile's top-k back to the host and scatter it into the
    // nprobe-wide result arrays. NOTE: the indexing below assumes
    // outDistances.getSize(1) == k (the asserts above are commented out).
    float* tmpDistances = new float[outDistances.getSize(0) * outDistances.getSize(1)];
    int* tmpIndices = new int[outDistances.getSize(0) * outDistances.getSize(1)];
    fromDevice<float, 2>(outDistances, tmpDistances, stream);
    fromDevice<int, 2>(outIndices, tmpIndices, stream);

    for (int j = 0; j < curQuerySize; j++) {
      for (int m = 0; m < k; m++) {
        hostOutDistances[(startPos + j) * nprobe + i + m] = tmpDistances[k * j + m];
        hostOutIndices[(startPos + j) * nprobe + i + m] = tmpIndices[k * j + m];
      }
    }

    delete [] tmpDistances;
    delete [] tmpIndices;
  }

  CUDA_TEST_ERROR();
}

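The host scatter above writes each tile's k results into columns [i, i + k) of a row-major [numQueries, nprobe] buffer. A standalone sketch of that layout, with illustrative sizes and placeholder tile values:

// Standalone sketch of the runL2SelMn host scatter: per-tile results
// (k per query) land in columns [tileStart, tileStart + k) of a row-major
// [numQueries, nprobe] buffer. All sizes here are illustrative.
#include <cstdio>
#include <vector>

int main() {
    const int numQueries = 2, nprobe = 8, k = 4;
    std::vector<float> host(numQueries * nprobe, 0.f);

    for (int tileStart = 0; tileStart < nprobe; tileStart += k) {
        // Stand-in for the k-wide results copied back for this tile.
        std::vector<float> tile(numQueries * k, float(tileStart));

        for (int j = 0; j < numQueries; j++) {
            for (int m = 0; m < k; m++) {
                host[j * nprobe + tileStart + m] = tile[k * j + m];
            }
        }
    }

    for (int j = 0; j < numQueries; j++) {
        for (int c = 0; c < nprobe; c++) {
            printf("%4.0f%c", host[j * nprobe + c], c + 1 == nprobe ? '\n' : ' ');
        }
    }
    return 0;
}
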
void runL2SelectMin(Tensor<float, 2, true>& productDistances,
                    Tensor<float, 1, true>& centroidDistances,
                    Tensor<uint8_t, 1, true>& bitset,

@ -20,4 +20,18 @@ void runL2SelectMin(Tensor<float, 2, true>& productDistances,
                    int k,
                    cudaStream_t stream);

void runL2SelMn(float* hostOutDistances,
                int* hostOutIndices,
                int startPos,
                int curQuerySize,
                int i,
                int nprobe,
                Tensor<float, 2, true>& productDistances,
                Tensor<float, 1, true>& centroidDistances,
                Tensor<uint8_t, 1, true>& bitset,
                Tensor<float, 2, true>& outDistances,
                Tensor<int, 2, true>& outIndices,
                int k,
                cudaStream_t stream);

} } // namespace