Modify faiss files to reduce GPU usage in 0.10.4 (#3893)

* Modify faiss files to reduce GPU usage in 0.10.4

Signed-off-by: lighteningzhang <lighteningzhang@163.com>

* Reformat the code and release the memory of host-resident arrays

Signed-off-by: lighteningzhang <lighteningzhang@163.com>

* Reformat the code, release the memory of host-resident arrays, and fix some typos

Signed-off-by: lighteningzhang <lighteningzhang@163.com>

* Delete some blank lines

Signed-off-by: lighteningzhang <lighteningzhang@163.com>

* Temporarily comment out the delete calls in IVFFlat.cu

Signed-off-by: lighteningzhang <lighteningzhang@163.com>

* Uncomment some delete calls

Signed-off-by: lighteningzhang <lighteningzhang@163.com>

* Uncomment some delete calls

Signed-off-by: lighteningzhang <lighteningzhang@163.com>

* Use tensors instead of raw arrays in IVFFlat.cu

Signed-off-by: lighteningzhang <lighteningzhang@163.com>

* Fix some typos

Signed-off-by: lighteningzhang <lighteningzhang@163.com>
lighteningzhang 2020-09-28 17:58:32 +08:00 committed by GitHub
parent 7692a6d437
commit 237ce2aba3
9 changed files with 1452 additions and 529 deletions
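In short, the patch splits each IVF search over the nprobe coarse lists into tiles of at most 8 lists (nprobeTile = 8 in IVFFlat.cu), scans each tile on the GPU, copies that tile's top-k results back to the host, and merges them there into a running top-k, so only one tile's worth of results needs to live on the GPU at a time. Below is a minimal host-side sketch of that merge step; the function and variable names (mergeTileTopK, runningDist, runningInd) are illustrative only and are not part of the patch, which uses Usort and explicit merge loops in IVFFlat::query instead.

#include <utility>
#include <vector>

// Illustrative sketch only: merge one GPU tile's top-k results into a running
// 2*k-wide host buffer whose first k entries hold the best results so far.
// This mirrors the per-tile merge performed in IVFFlat::query further below.
void mergeTileTopK(std::vector<float>& runningDist, // size 2 * k
                   std::vector<int>& runningInd,    // size 2 * k
                   const float* tileDist,           // size k, current tile's top-k
                   const int* tileInd,
                   int k) {
  // Place the new tile's results after the current best k.
  for (int m = 0; m < k; ++m) {
    runningDist[k + m] = tileDist[m];
    runningInd[k + m] = tileInd[m];
  }
  // Sort the combined 2*k candidates by ascending distance, keeping the
  // index array in sync; the first k entries are then the merged top-k.
  for (int a = 0; a < 2 * k; ++a) {
    for (int b = a + 1; b < 2 * k; ++b) {
      if (runningDist[b] < runningDist[a]) {
        std::swap(runningDist[a], runningDist[b]);
        std::swap(runningInd[a], runningInd[b]);
      }
    }
  }
}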


@@ -250,7 +250,7 @@ GpuIndexIVFFlat::searchImpl_(int n,
if (!bitset) {
auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels, distances, labels);
} else {
auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_,
const_cast<uint8_t*>(bitset->data()), stream,
@@ -262,3 +262,4 @@ GpuIndexIVFFlat::searchImpl_(int n,
} } // namespace


@@ -295,6 +295,331 @@ void runDistance(bool computeL2,
}
}
void runDist(bool computeL2,
GpuResources* resources,
Tensor<float, 2, true>& centroids,
bool centroidsRowMajor,
Tensor<float, 1, true>* centroidNorms,
Tensor<float, 2, true>& queries,
bool queriesRowMajor,
Tensor<uint8_t, 1, true>& bitset,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
float* hostOutDistances,
int* hostOutIndices,
int pos,
int curTile,
int nprobe,
bool ignoreOutDistances) {
k = curTile;
// The # of centroids in `centroids` based on memory layout
auto numCentroids = centroids.getSize(centroidsRowMajor ? 0 : 1);
// The # of queries in `queries` based on memory layout
auto numQueries = queries.getSize(queriesRowMajor ? 0 : 1);
// The dimensions of the vectors to consider
auto dim = queries.getSize(queriesRowMajor ? 1 : 0);
FAISS_ASSERT((numQueries == 0 || numCentroids == 0) ||
dim == centroids.getSize(centroidsRowMajor ? 1 : 0));
FAISS_ASSERT(outDistances.getSize(0) == numQueries);
FAISS_ASSERT(outIndices.getSize(0) == numQueries);
// FAISS_ASSERT(outDistances.getSize(1) == k);
// FAISS_ASSERT(outIndices.getSize(1) == k);
auto& mem = resources->getMemoryManagerCurrentDevice();
auto defaultStream = resources->getDefaultStreamCurrentDevice();
// If we're querying against a 0-sized set, just return empty results
if (centroids.numElements() == 0) {
thrust::fill(thrust::cuda::par.on(defaultStream),
outDistances.data(), outDistances.end(),
Limits<float>::getMax());
thrust::fill(thrust::cuda::par.on(defaultStream),
outIndices.data(), outIndices.end(),
-1);
return;
}
// L2: If ||c||^2 is not pre-computed, calculate it
DeviceTensor<float, 1, true> cNorms;
if (computeL2 && !centroidNorms) {
cNorms =
std::move(DeviceTensor<float, 1, true>(
mem, {numCentroids}, defaultStream));
runL2Norm(centroids, centroidsRowMajor, cNorms, true, defaultStream);
centroidNorms = &cNorms;
}
//
// Prepare norm vector ||q||^2; ||c||^2 is already pre-computed
//
int qNormSize[1] = {numQueries};
DeviceTensor<float, 1, true> queryNorms(mem, qNormSize, defaultStream);
// ||q||^2
if (computeL2) {
runL2Norm(queries, queriesRowMajor, queryNorms, true, defaultStream);
}
// By default, aim to use up to 512 MB of memory for the processing, with both
// number of queries and number of centroids being at least 512.
int tileRows = 0;
int tileCols = 0;
chooseTileSize(numQueries,
numCentroids,
dim,
sizeof(float),
mem.getSizeAvailable(),
tileRows,
tileCols);
int numColTiles = utils::divUp(numCentroids, tileCols);
// We can have any number of vectors to query against, even less than k, in
// which case we'll return -1 for the index
FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); // select limitation
// Temporary output memory space we'll use
DeviceTensor<float, 2, true> distanceBuf1(
mem, {tileRows, tileCols}, defaultStream);
DeviceTensor<float, 2, true> distanceBuf2(
mem, {tileRows, tileCols}, defaultStream);
DeviceTensor<float, 2, true>* distanceBufs[2] =
{&distanceBuf1, &distanceBuf2};
DeviceTensor<float, 2, true> outDistanceBuf1(
mem, {tileRows, numColTiles * k}, defaultStream);
DeviceTensor<float, 2, true> outDistanceBuf2(
mem, {tileRows, numColTiles * k}, defaultStream);
DeviceTensor<float, 2, true>* outDistanceBufs[2] =
{&outDistanceBuf1, &outDistanceBuf2};
DeviceTensor<int, 2, true> outIndexBuf1(
mem, {tileRows, numColTiles * k}, defaultStream);
DeviceTensor<int, 2, true> outIndexBuf2(
mem, {tileRows, numColTiles * k}, defaultStream);
DeviceTensor<int, 2, true>* outIndexBufs[2] =
{&outIndexBuf1, &outIndexBuf2};
auto streams = resources->getAlternateStreamsCurrentDevice();
streamWait(streams, {defaultStream});
int curStream = 0;
bool interrupt = false;
// Tile over the input queries
for (int i = 0; i < numQueries; i += tileRows) {
if (interrupt || InterruptCallback::is_interrupted()) {
interrupt = true;
break;
}
int curQuerySize = std::min(tileRows, numQueries - i);
auto outDistanceView =
outDistances.narrow(0, i, curQuerySize);
auto outIndexView =
outIndices.narrow(0, i, curQuerySize);
auto queryView =
queries.narrow(queriesRowMajor ? 0 : 1, i, curQuerySize);
auto queryNormNiew =
queryNorms.narrow(0, i, curQuerySize);
auto outDistanceBufRowView =
outDistanceBufs[curStream]->narrow(0, 0, curQuerySize);
auto outIndexBufRowView =
outIndexBufs[curStream]->narrow(0, 0, curQuerySize);
// Tile over the centroids
for (int j = 0; j < numCentroids; j += tileCols) {
if (InterruptCallback::is_interrupted()) {
interrupt = true;
break;
}
int curCentroidSize = std::min(tileCols, numCentroids - j);
int curColTile = j / tileCols;
auto centroidsView =
sliceCentroids(centroids, centroidsRowMajor, j, curCentroidSize);
auto distanceBufView = distanceBufs[curStream]->
narrow(0, 0, curQuerySize).narrow(1, 0, curCentroidSize);
auto outDistanceBufColView =
outDistanceBufRowView.narrow(1, k * curColTile, k);
auto outIndexBufColView =
outIndexBufRowView.narrow(1, k * curColTile, k);
// L2: distance is ||c||^2 - 2qc + ||q||^2, we compute -2qc
// IP: just compute qc
// (query id x dim) x (centroid id, dim)' = (query id, centroid id)
runMatrixMult(distanceBufView,
false, // not transposed
queryView,
!queriesRowMajor, // transposed MM if col major
centroidsView,
centroidsRowMajor, // transposed MM if row major
computeL2 ? -2.0f : 1.0f,
0.0f,
resources->getBlasHandleCurrentDevice(),
streams[curStream]);
if (computeL2) {
// For L2 distance, we use this fused kernel that performs both
// adding ||c||^2 to -2qc and k-selection, so we only need two
// passes (one write by the gemm, one read here) over the huge
// region of output memory
//
// If we aren't tiling along the number of centroids, we can perform the
// output work directly
if (tileCols == numCentroids) {
// Write into the final output
runL2SelMn(hostOutDistances,
hostOutIndices,
i,
curQuerySize,
pos,
nprobe,
distanceBufView,
*centroidNorms,
bitset,
outDistanceView,
outIndexView,
k,
streams[curStream]);
if (!ignoreOutDistances) {
// expand (query id) to (query id, k) by duplicating along rows
// top-k ||c||^2 - 2qc + ||q||^2 in the form (query id, k)
runSumAlongRows(queryNormNiew,
outDistanceView,
true, // L2 distances should not go below zero due
// to roundoff error
streams[curStream]);
}
} else {
auto centroidNormsView = centroidNorms->narrow(0, j, curCentroidSize);
// Write into our intermediate output
runL2SelectMin(distanceBufView,
centroidNormsView,
bitset,
outDistanceBufColView,
outIndexBufColView,
k,
streams[curStream]);
if (!ignoreOutDistances) {
// expand (query id) to (query id, k) by duplicating along rows
// top-k ||c||^2 - 2qc + ||q||^2 in the form (query id, k)
runSumAlongRows(queryNormNiew,
outDistanceBufColView,
true, // L2 distances should not go below zero due
// to roundoff error
streams[curStream]);
}
}
} else {
// For IP, just k-select the output for this tile
if (tileCols == numCentroids) {
// Write into the final output
runBlockSelect(distanceBufView,
bitset,
outDistanceView,
outIndexView,
true, k, streams[curStream]);
} else {
// Write into the intermediate output
runBlockSelect(distanceBufView,
bitset,
outDistanceBufColView,
outIndexBufColView,
true, k, streams[curStream]);
}
}
}
// As we're finished with processing a full set of centroids, perform the
// final k-selection
if (tileCols != numCentroids) {
// The indices are tile-relative; for each tile of k, we need to add
// tileCols to the index
runIncrementIndex(outIndexBufRowView, k, tileCols, streams[curStream]);
runBlockSelectPair(outDistanceBufRowView,
outIndexBufRowView,
bitset,
outDistanceView,
outIndexView,
computeL2 ? false : true, k, streams[curStream]);
}
curStream = (curStream + 1) % 2;
}
// Have the desired ordering stream wait on the multi-stream
streamWait({defaultStream}, streams);
if (interrupt) {
FAISS_THROW_MSG("interrupted");
}
}
void runL2Dist(GpuResources* resources,
Tensor<float, 2, true>& vectors,
bool vectorsRowMajor,
// can be optionally pre-computed; nullptr if we
// have to compute it upon the call
Tensor<float, 1, true>* vectorNorms,
Tensor<float, 2, true>& queries,
bool queriesRowMajor,
Tensor<uint8_t, 1, true>& bitset,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
float* hostOutDistances,
int* hostOutIndices,
int i,
int curTile,
int nprobe,
bool ignoreOutDistances
) {
runDist(true, // L2
resources,
vectors,
vectorsRowMajor,
vectorNorms,
queries,
queriesRowMajor,
bitset,
k,
outDistances,
outIndices,
hostOutDistances,
hostOutIndices,
i,
curTile,
nprobe,
ignoreOutDistances);
}
template <typename T>
void runL2Distance(GpuResources* resources,
Tensor<T, 2, true>& centroids,


@@ -35,6 +35,48 @@ void runL2Distance(GpuResources* resources,
// take shortcuts.
bool ignoreOutDistances = false);
void runL2Dist(GpuResources* resources,
Tensor<float, 2, true>& vectors,
bool vectorsRowMajor,
// can be optionally pre-computed; nullptr if we
// have to compute it upon the call
Tensor<float, 1, true>* vectorNorms,
Tensor<float, 2, true>& queries,
bool queriesRowMajor,
Tensor<uint8_t, 1, true>& bitset,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
float* hostOutDistances,
int* hostOutIndices,
int pos,
int curTile,
int nprobe,
bool ignoreOutDistances = false
);
void runDist(bool computeL2,
GpuResources* resources,
Tensor<float, 2, true>& centroids,
bool centroidsRowMajor,
Tensor<float, 1, true>* centroidNorms,
Tensor<float, 2, true>& queries,
bool queriesRowMajor,
Tensor<uint8_t, 1, true>& bitset,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
float* hostOutDistances,
int* hostOutIndices,
int i,
int curTile,
int nprobe,
bool ignoreOutDistances);
/// Calculates brute-force inner product distance between `vectors`
/// and `queries`, returning the k closest results seen
void runIPDistance(GpuResources* resources,
@@ -91,6 +133,7 @@ void bfKnnOnDevice(GpuResources* resources,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool ignoreOutDistances) {
// We are guaranteed that all data arguments are resident on our preferred
// `device` here, and are ordered wrt `stream`
@@ -213,4 +256,155 @@ void bfKnnOnDevice(GpuResources* resources,
}
template <typename T>
void bfKnnOnDev(GpuResources* resources,
int device,
cudaStream_t stream,
Tensor<T, 2, true>& vectors,
bool vectorsRowMajor,
Tensor<float, 1, true>* vectorNorms,
Tensor<T, 2, true>& queries,
bool queriesRowMajor,
Tensor<uint8_t, 1, true>& bitset,
int k,
faiss::MetricType metric,
float metricArg,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
float* outDis_h,
int* outInd_h,
int i,
int curTile,
int nprobe,
bool ignoreOutDistances) {
// We are guaranteed that all data arguments are resident on our preferred
// `device` here, and are ordered wrt `stream`
// L2 and IP are specialized to use GEMM and an optimized L2 + selection or
// pure k-selection kernel.
if ((metric == faiss::MetricType::METRIC_L2) ||
(metric == faiss::MetricType::METRIC_Lp &&
metricArg == 2)) {
runL2Dist(resources,
vectors,
vectorsRowMajor,
vectorNorms,
queries,
queriesRowMajor,
bitset,
k,
outDistances,
outIndices,
outDis_h,
outInd_h,
i,
curTile,
nprobe);
} else if (metric == faiss::MetricType::METRIC_INNER_PRODUCT) {
runIPDistance(resources,
vectors,
vectorsRowMajor,
queries,
queriesRowMajor,
bitset,
k,
outDistances,
outIndices);
} else {
//
// General pairwise distance kernel
//
// The general distance kernel does not have specializations for
// transpositions (NN, NT, TN); instead, the transposition is just handled
// upon data load for now, which could result in poor data loading behavior
// for NT / TN. This can be fixed at a later date if desired, but efficiency
// is low versus GEMM anyways.
//
Tensor<T, 2> tVectorsDimInnermost =
vectorsRowMajor ?
vectors.transposeInnermost(1) :
vectors.transposeInnermost(0);
Tensor<T, 2> tQueriesDimInnermost =
queriesRowMajor ?
queries.transposeInnermost(1) :
queries.transposeInnermost(0);
if ((metric == faiss::MetricType::METRIC_L1) ||
(metric == faiss::MetricType::METRIC_Lp &&
metricArg == 1)) {
runGeneralDistance(resources,
tVectorsDimInnermost,
tQueriesDimInnermost,
bitset,
k,
L1Distance(),
outDistances,
outIndices);
} else if (metric == faiss::MetricType::METRIC_Lp &&
metricArg == -1) {
// A way to test L2 distance
runGeneralDistance(resources,
tVectorsDimInnermost,
tQueriesDimInnermost,
bitset,
k,
L2Distance(),
outDistances,
outIndices);
} else if (metric == faiss::MetricType::METRIC_Lp) {
runGeneralDistance(resources,
tVectorsDimInnermost,
tQueriesDimInnermost,
bitset,
k,
LpDistance(metricArg),
outDistances,
outIndices);
} else if (metric == faiss::MetricType::METRIC_Linf) {
runGeneralDistance(resources,
tVectorsDimInnermost,
tQueriesDimInnermost,
bitset,
k,
LinfDistance(),
outDistances,
outIndices);
} else if (metric == faiss::MetricType::METRIC_Canberra) {
runGeneralDistance(resources,
tVectorsDimInnermost,
tQueriesDimInnermost,
bitset,
k,
CanberraDistance(),
outDistances,
outIndices);
} else if (metric == faiss::MetricType::METRIC_BrayCurtis) {
runGeneralDistance(resources,
tVectorsDimInnermost,
tQueriesDimInnermost,
bitset,
k,
BrayCurtisDistance(),
outDistances,
outIndices);
} else if (metric == faiss::MetricType::METRIC_JensenShannon) {
runGeneralDistance(resources,
tVectorsDimInnermost,
tQueriesDimInnermost,
bitset,
k,
JensenShannonDistance(),
outDistances,
outIndices);
} else {
FAISS_THROW_FMT("unsupported metric type %d", metric);
}
}
}
} } // namespace


@@ -1,388 +1,460 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/gpu/impl/FlatIndex.cuh>
#include <faiss/gpu/impl/Distance.cuh>
#include <faiss/gpu/impl/L2Norm.cuh>
#include <faiss/gpu/impl/VectorResidual.cuh>
#include <faiss/gpu/utils/ConversionOperators.cuh>
#include <faiss/gpu/utils/CopyUtils.cuh>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/Transpose.cuh>
namespace faiss { namespace gpu {
FlatIndex::FlatIndex(GpuResources* res,
int dim,
bool useFloat16,
bool storeTransposed,
MemorySpace space) :
resources_(res),
dim_(dim),
useFloat16_(useFloat16),
storeTransposed_(storeTransposed),
space_(space),
num_(0),
rawData_(space) {
#ifndef FAISS_USE_FLOAT16
FAISS_ASSERT(!useFloat16_);
#endif
}
bool
FlatIndex::getUseFloat16() const {
return useFloat16_;
}
/// Returns the number of vectors we contain
int FlatIndex::getSize() const {
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
return vectorsHalf_.getSize(0);
} else {
return vectors_.getSize(0);
}
#else
return vectors_.getSize(0);
#endif
}
int FlatIndex::getDim() const {
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
return vectorsHalf_.getSize(1);
} else {
return vectors_.getSize(1);
}
#else
return vectors_.getSize(1);
#endif
}
void
FlatIndex::reserve(size_t numVecs, cudaStream_t stream) {
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
rawData_.reserve(numVecs * dim_ * sizeof(half), stream);
} else {
rawData_.reserve(numVecs * dim_ * sizeof(float), stream);
}
#else
rawData_.reserve(numVecs * dim_ * sizeof(float), stream);
#endif
}
template <>
Tensor<float, 2, true>&
FlatIndex::getVectorsRef<float>() {
// Should not call this unless we are in float32 mode
FAISS_ASSERT(!useFloat16_);
return getVectorsFloat32Ref();
}
#ifdef FAISS_USE_FLOAT16
template <>
Tensor<half, 2, true>&
FlatIndex::getVectorsRef<half>() {
// Should not call this unless we are in float16 mode
FAISS_ASSERT(useFloat16_);
return getVectorsFloat16Ref();
}
#endif
Tensor<float, 2, true>&
FlatIndex::getVectorsFloat32Ref() {
// Should not call this unless we are in float32 mode
FAISS_ASSERT(!useFloat16_);
return vectors_;
}
#ifdef FAISS_USE_FLOAT16
Tensor<half, 2, true>&
FlatIndex::getVectorsFloat16Ref() {
// Should not call this unless we are in float16 mode
FAISS_ASSERT(useFloat16_);
return vectorsHalf_;
}
#endif
DeviceTensor<float, 2, true>
FlatIndex::getVectorsFloat32Copy(cudaStream_t stream) {
return getVectorsFloat32Copy(0, num_, stream);
}
DeviceTensor<float, 2, true>
FlatIndex::getVectorsFloat32Copy(int from, int num, cudaStream_t stream) {
DeviceTensor<float, 2, true> vecFloat32({num, dim_}, space_);
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
auto halfNarrow = vectorsHalf_.narrowOutermost(from, num);
convertTensor<half, float, 2>(stream, halfNarrow, vecFloat32);
} else {
vectors_.copyTo(vecFloat32, stream);
}
#else
vectors_.copyTo(vecFloat32, stream);
#endif
return vecFloat32;
}
void
FlatIndex::query(Tensor<float, 2, true>& input,
Tensor<uint8_t, 1, true>& bitset,
int k,
faiss::MetricType metric,
float metricArg,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance) {
auto stream = resources_->getDefaultStreamCurrentDevice();
auto& mem = resources_->getMemoryManagerCurrentDevice();
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
// We need to convert the input to float16 for comparison to ourselves
auto inputHalf =
convertTensor<float, half, 2>(resources_, stream, input);
query(inputHalf, bitset, k, metric, metricArg,
outDistances, outIndices, exactDistance);
} else {
bfKnnOnDevice(resources_,
getCurrentDevice(),
stream,
storeTransposed_ ? vectorsTransposed_ : vectors_,
!storeTransposed_, // is vectors row major?
&norms_,
input,
true, // input is row major
bitset,
k,
metric,
metricArg,
outDistances,
outIndices,
!exactDistance);
}
#else
bfKnnOnDevice(resources_,
getCurrentDevice(),
stream,
storeTransposed_ ? vectorsTransposed_ : vectors_,
!storeTransposed_, // is vectors row major?
&norms_,
input,
true, // input is row major
bitset,
k,
metric,
metricArg,
outDistances,
outIndices,
!exactDistance);
#endif
}
#ifdef FAISS_USE_FLOAT16
void
FlatIndex::query(Tensor<half, 2, true>& input,
Tensor<uint8_t, 1, true>& bitset,
int k,
faiss::MetricType metric,
float metricArg,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance) {
FAISS_ASSERT(useFloat16_);
bfKnnOnDevice(resources_,
getCurrentDevice(),
resources_->getDefaultStreamCurrentDevice(),
storeTransposed_ ? vectorsHalfTransposed_ : vectorsHalf_,
!storeTransposed_, // is vectors row major?
&norms_,
input,
true, // input is row major
bitset,
k,
metric,
metricArg,
outDistances,
outIndices,
!exactDistance);
}
#endif
void
FlatIndex::computeResidual(Tensor<float, 2, true>& vecs,
Tensor<int, 1, true>& listIds,
Tensor<float, 2, true>& residuals) {
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
runCalcResidual(vecs,
getVectorsFloat16Ref(),
listIds,
residuals,
resources_->getDefaultStreamCurrentDevice());
} else {
runCalcResidual(vecs,
getVectorsFloat32Ref(),
listIds,
residuals,
resources_->getDefaultStreamCurrentDevice());
}
#else
runCalcResidual(vecs,
getVectorsFloat32Ref(),
listIds,
residuals,
resources_->getDefaultStreamCurrentDevice());
#endif
}
void
FlatIndex::reconstruct(Tensor<int, 1, true>& listIds,
Tensor<float, 2, true>& vecs) {
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
runReconstruct(listIds,
getVectorsFloat16Ref(),
vecs,
resources_->getDefaultStreamCurrentDevice());
} else {
runReconstruct(listIds,
getVectorsFloat32Ref(),
vecs,
resources_->getDefaultStreamCurrentDevice());
}
#else
runReconstruct(listIds,
getVectorsFloat32Ref(),
vecs,
resources_->getDefaultStreamCurrentDevice());
#endif
}
void
FlatIndex::reconstruct(Tensor<int, 2, true>& listIds,
Tensor<float, 3, true>& vecs) {
auto listIds1 = listIds.downcastOuter<1>();
auto vecs2 = vecs.downcastOuter<2>();
reconstruct(listIds1, vecs2);
}
void
FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) {
if (numVecs == 0) {
return;
}
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
// Make sure that `data` is on our device; we'll run the
// conversion on our device
auto devData = toDevice<float, 2>(resources_,
getCurrentDevice(),
(float*) data,
stream,
{numVecs, dim_});
auto devDataHalf =
convertTensor<float, half, 2>(resources_, stream, devData);
rawData_.append((char*) devDataHalf.data(),
devDataHalf.getSizeInBytes(),
stream,
true /* reserve exactly */);
} else {
rawData_.append((char*) data,
(size_t) dim_ * numVecs * sizeof(float),
stream,
true /* reserve exactly */);
}
#else
rawData_.append((char*) data,
(size_t) dim_ * numVecs * sizeof(float),
stream,
true /* reserve exactly */);
#endif
num_ += numVecs;
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
DeviceTensor<half, 2, true> vectorsHalf(
(half*) rawData_.data(), {(int) num_, dim_}, space_);
vectorsHalf_ = std::move(vectorsHalf);
} else {
DeviceTensor<float, 2, true> vectors(
(float*) rawData_.data(), {(int) num_, dim_}, space_);
vectors_ = std::move(vectors);
}
#else
DeviceTensor<float, 2, true> vectors(
(float*) rawData_.data(), {(int) num_, dim_}, space_);
vectors_ = std::move(vectors);
#endif
if (storeTransposed_) {
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
vectorsHalfTransposed_ =
std::move(DeviceTensor<half, 2, true>({dim_, (int) num_}, space_));
runTransposeAny(vectorsHalf_, 0, 1, vectorsHalfTransposed_, stream);
} else {
vectorsTransposed_ =
std::move(DeviceTensor<float, 2, true>({dim_, (int) num_}, space_));
runTransposeAny(vectors_, 0, 1, vectorsTransposed_, stream);
}
#else
vectorsTransposed_ =
std::move(DeviceTensor<float, 2, true>({dim_, (int) num_}, space_));
runTransposeAny(vectors_, 0, 1, vectorsTransposed_, stream);
#endif
}
// Precompute L2 norms of our database
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
DeviceTensor<float, 1, true> norms({(int) num_}, space_);
runL2Norm(vectorsHalf_, true, norms, true, stream);
norms_ = std::move(norms);
} else {
DeviceTensor<float, 1, true> norms({(int) num_}, space_);
runL2Norm(vectors_, true, norms, true, stream);
norms_ = std::move(norms);
}
#else
DeviceTensor<float, 1, true> norms({(int) num_}, space_);
runL2Norm(vectors_, true, norms, true, stream);
norms_ = std::move(norms);
#endif
}
void
FlatIndex::reset() {
rawData_.clear();
vectors_ = std::move(DeviceTensor<float, 2, true>());
vectorsTransposed_ = std::move(DeviceTensor<float, 2, true>());
#ifdef FAISS_USE_FLOAT16
vectorsHalf_ = std::move(DeviceTensor<half, 2, true>());
vectorsHalfTransposed_ = std::move(DeviceTensor<half, 2, true>());
#endif
norms_ = std::move(DeviceTensor<float, 1, true>());
num_ = 0;
}
} }
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/gpu/impl/FlatIndex.cuh>
#include <faiss/gpu/impl/Distance.cuh>
#include <faiss/gpu/impl/L2Norm.cuh>
#include <faiss/gpu/impl/VectorResidual.cuh>
#include <faiss/gpu/utils/ConversionOperators.cuh>
#include <faiss/gpu/utils/CopyUtils.cuh>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/Transpose.cuh>
namespace faiss { namespace gpu {
FlatIndex::FlatIndex(GpuResources* res,
int dim,
bool useFloat16,
bool storeTransposed,
MemorySpace space) :
resources_(res),
dim_(dim),
useFloat16_(useFloat16),
storeTransposed_(storeTransposed),
space_(space),
num_(0),
rawData_(space) {
#ifndef FAISS_USE_FLOAT16
FAISS_ASSERT(!useFloat16_);
#endif
}
bool
FlatIndex::getUseFloat16() const {
return useFloat16_;
}
/// Returns the number of vectors we contain
int FlatIndex::getSize() const {
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
return vectorsHalf_.getSize(0);
} else {
return vectors_.getSize(0);
}
#else
return vectors_.getSize(0);
#endif
}
int FlatIndex::getDim() const {
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
return vectorsHalf_.getSize(1);
} else {
return vectors_.getSize(1);
}
#else
return vectors_.getSize(1);
#endif
}
void
FlatIndex::reserve(size_t numVecs, cudaStream_t stream) {
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
rawData_.reserve(numVecs * dim_ * sizeof(half), stream);
} else {
rawData_.reserve(numVecs * dim_ * sizeof(float), stream);
}
#else
rawData_.reserve(numVecs * dim_ * sizeof(float), stream);
#endif
}
template <>
Tensor<float, 2, true>&
FlatIndex::getVectorsRef<float>() {
// Should not call this unless we are in float32 mode
FAISS_ASSERT(!useFloat16_);
return getVectorsFloat32Ref();
}
#ifdef FAISS_USE_FLOAT16
template <>
Tensor<half, 2, true>&
FlatIndex::getVectorsRef<half>() {
// Should not call this unless we are in float16 mode
FAISS_ASSERT(useFloat16_);
return getVectorsFloat16Ref();
}
#endif
Tensor<float, 2, true>&
FlatIndex::getVectorsFloat32Ref() {
// Should not call this unless we are in float32 mode
FAISS_ASSERT(!useFloat16_);
return vectors_;
}
#ifdef FAISS_USE_FLOAT16
Tensor<half, 2, true>&
FlatIndex::getVectorsFloat16Ref() {
// Should not call this unless we are in float16 mode
FAISS_ASSERT(useFloat16_);
return vectorsHalf_;
}
#endif
DeviceTensor<float, 2, true>
FlatIndex::getVectorsFloat32Copy(cudaStream_t stream) {
return getVectorsFloat32Copy(0, num_, stream);
}
DeviceTensor<float, 2, true>
FlatIndex::getVectorsFloat32Copy(int from, int num, cudaStream_t stream) {
DeviceTensor<float, 2, true> vecFloat32({num, dim_}, space_);
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
auto halfNarrow = vectorsHalf_.narrowOutermost(from, num);
convertTensor<half, float, 2>(stream, halfNarrow, vecFloat32);
} else {
vectors_.copyTo(vecFloat32, stream);
}
#else
vectors_.copyTo(vecFloat32, stream);
#endif
return vecFloat32;
}
void
FlatIndex::query(Tensor<float, 2, true>& input,
Tensor<uint8_t, 1, true>& bitset,
int k,
faiss::MetricType metric,
float metricArg,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance) {
auto stream = resources_->getDefaultStreamCurrentDevice();
auto& mem = resources_->getMemoryManagerCurrentDevice();
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
// We need to convert the input to float16 for comparison to ourselves
auto inputHalf =
convertTensor<float, half, 2>(resources_, stream, input);
query(inputHalf, bitset, k, metric, metricArg,
outDistances, outIndices, exactDistance);
} else {
bfKnnOnDevice(resources_,
getCurrentDevice(),
stream,
storeTransposed_ ? vectorsTransposed_ : vectors_,
!storeTransposed_, // is vectors row major?
&norms_,
input,
true, // input is row major
bitset,
k,
metric,
metricArg,
outDistances,
outIndices,
!exactDistance);
}
#else
bfKnnOnDevice(resources_,
getCurrentDevice(),
stream,
storeTransposed_ ? vectorsTransposed_ : vectors_,
!storeTransposed_, // is vectors row major?
&norms_,
input,
true, // input is row major
bitset,
k,
metric,
metricArg,
outDistances,
outIndices,
!exactDistance);
#endif
}
void
FlatIndex::query(Tensor<float, 2, true>& input,
Tensor<uint8_t, 1, true>& bitset,
int k,
faiss::MetricType metric,
float metricArg,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
float* hostOutDistances,
int* hostOutIndices,
int i,
int curTile,
int nprobe,
bool exactDistance) {
auto stream = resources_->getDefaultStreamCurrentDevice();
auto& mem = resources_->getMemoryManagerCurrentDevice();
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
// We need to convert the input to float16 for comparison to ourselves
auto inputHalf =
convertTensor<float, half, 2>(resources_, stream, input);
query(inputHalf, bitset, k, metric, metricArg,
outDistances, outIndices, exactDistance);
} else {
bfKnnOnDevice(resources_,
getCurrentDevice(),
stream,
storeTransposed_ ? vectorsTransposed_ : vectors_,
!storeTransposed_, // is vectors row major?
&norms_,
input,
true, // input is row major
bitset,
k,
metric,
metricArg,
outDistances,
outIndices,
!exactDistance);
}
#else
bfKnnOnDev(resources_,
getCurrentDevice(),
stream,
storeTransposed_ ? vectorsTransposed_ : vectors_,
!storeTransposed_, // is vectors row major?
&norms_,
input,
true, // input is row major
bitset,
k,
metric,
metricArg,
outDistances,
outIndices,
hostOutDistances,
hostOutIndices,
i,
curTile,
nprobe,
!exactDistance);
#endif
}
#ifdef FAISS_USE_FLOAT16
void
FlatIndex::query(Tensor<half, 2, true>& input,
Tensor<uint8_t, 1, true>& bitset,
int k,
faiss::MetricType metric,
float metricArg,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance) {
FAISS_ASSERT(useFloat16_);
bfKnnOnDevice(resources_,
getCurrentDevice(),
resources_->getDefaultStreamCurrentDevice(),
storeTransposed_ ? vectorsHalfTransposed_ : vectorsHalf_,
!storeTransposed_, // is vectors row major?
&norms_,
input,
true, // input is row major
bitset,
k,
metric,
metricArg,
outDistances,
outIndices,
!exactDistance);
}
#endif
void
FlatIndex::computeResidual(Tensor<float, 2, true>& vecs,
Tensor<int, 1, true>& listIds,
Tensor<float, 2, true>& residuals) {
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
runCalcResidual(vecs,
getVectorsFloat16Ref(),
listIds,
residuals,
resources_->getDefaultStreamCurrentDevice());
} else {
runCalcResidual(vecs,
getVectorsFloat32Ref(),
listIds,
residuals,
resources_->getDefaultStreamCurrentDevice());
}
#else
runCalcResidual(vecs,
getVectorsFloat32Ref(),
listIds,
residuals,
resources_->getDefaultStreamCurrentDevice());
#endif
}
void
FlatIndex::reconstruct(Tensor<int, 1, true>& listIds,
Tensor<float, 2, true>& vecs) {
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
runReconstruct(listIds,
getVectorsFloat16Ref(),
vecs,
resources_->getDefaultStreamCurrentDevice());
} else {
runReconstruct(listIds,
getVectorsFloat32Ref(),
vecs,
resources_->getDefaultStreamCurrentDevice());
}
#else
runReconstruct(listIds,
getVectorsFloat32Ref(),
vecs,
resources_->getDefaultStreamCurrentDevice());
#endif
}
void
FlatIndex::reconstruct(Tensor<int, 2, true>& listIds,
Tensor<float, 3, true>& vecs) {
auto listIds1 = listIds.downcastOuter<1>();
auto vecs2 = vecs.downcastOuter<2>();
reconstruct(listIds1, vecs2);
}
void
FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) {
if (numVecs == 0) {
return;
}
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
// Make sure that `data` is on our device; we'll run the
// conversion on our device
auto devData = toDevice<float, 2>(resources_,
getCurrentDevice(),
(float*) data,
stream,
{numVecs, dim_});
auto devDataHalf =
convertTensor<float, half, 2>(resources_, stream, devData);
rawData_.append((char*) devDataHalf.data(),
devDataHalf.getSizeInBytes(),
stream,
true /* reserve exactly */);
} else {
rawData_.append((char*) data,
(size_t) dim_ * numVecs * sizeof(float),
stream,
true /* reserve exactly */);
}
#else
rawData_.append((char*) data,
(size_t) dim_ * numVecs * sizeof(float),
stream,
true /* reserve exactly */);
#endif
num_ += numVecs;
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
DeviceTensor<half, 2, true> vectorsHalf(
(half*) rawData_.data(), {(int) num_, dim_}, space_);
vectorsHalf_ = std::move(vectorsHalf);
} else {
DeviceTensor<float, 2, true> vectors(
(float*) rawData_.data(), {(int) num_, dim_}, space_);
vectors_ = std::move(vectors);
}
#else
DeviceTensor<float, 2, true> vectors(
(float*) rawData_.data(), {(int) num_, dim_}, space_);
vectors_ = std::move(vectors);
#endif
if (storeTransposed_) {
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
vectorsHalfTransposed_ =
std::move(DeviceTensor<half, 2, true>({dim_, (int) num_}, space_));
runTransposeAny(vectorsHalf_, 0, 1, vectorsHalfTransposed_, stream);
} else {
vectorsTransposed_ =
std::move(DeviceTensor<float, 2, true>({dim_, (int) num_}, space_));
runTransposeAny(vectors_, 0, 1, vectorsTransposed_, stream);
}
#else
vectorsTransposed_ =
std::move(DeviceTensor<float, 2, true>({dim_, (int) num_}, space_));
runTransposeAny(vectors_, 0, 1, vectorsTransposed_, stream);
#endif
}
// Precompute L2 norms of our database
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
DeviceTensor<float, 1, true> norms({(int) num_}, space_);
runL2Norm(vectorsHalf_, true, norms, true, stream);
norms_ = std::move(norms);
} else {
DeviceTensor<float, 1, true> norms({(int) num_}, space_);
runL2Norm(vectors_, true, norms, true, stream);
norms_ = std::move(norms);
}
#else
DeviceTensor<float, 1, true> norms({(int) num_}, space_);
runL2Norm(vectors_, true, norms, true, stream);
norms_ = std::move(norms);
#endif
}
void
FlatIndex::reset() {
rawData_.clear();
vectors_ = std::move(DeviceTensor<float, 2, true>());
vectorsTransposed_ = std::move(DeviceTensor<float, 2, true>());
#ifdef FAISS_USE_FLOAT16
vectorsHalf_ = std::move(DeviceTensor<half, 2, true>());
vectorsHalfTransposed_ = std::move(DeviceTensor<half, 2, true>());
#endif
norms_ = std::move(DeviceTensor<float, 1, true>());
num_ = 0;
}
} }


@@ -1,139 +1,153 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/MetricType.h>
#include <faiss/gpu/utils/DeviceTensor.cuh>
#include <faiss/gpu/utils/DeviceVector.cuh>
#include <faiss/gpu/utils/MemorySpace.h>
namespace faiss { namespace gpu {
class GpuResources;
/// Holder of GPU resources for a particular flat index
class FlatIndex {
public:
FlatIndex(GpuResources* res,
int dim,
bool useFloat16,
bool storeTransposed,
MemorySpace space);
/// Whether or not this flat index primarily stores data in float16
bool getUseFloat16() const;
/// Returns the number of vectors we contain
int getSize() const;
/// Returns the dimensionality of the vectors
int getDim() const;
/// Reserve storage that can contain at least this many vectors
void reserve(size_t numVecs, cudaStream_t stream);
/// Returns the vectors based on the type desired; the FlatIndex must be of
/// the same type (float16 or float32) to not assert
template <typename T>
Tensor<T, 2, true>& getVectorsRef();
/// Returns a reference to our vectors currently in use
Tensor<float, 2, true>& getVectorsFloat32Ref();
/// Returns a reference to our vectors currently in use (useFloat16 mode)
#ifdef FAISS_USE_FLOAT16
Tensor<half, 2, true>& getVectorsFloat16Ref();
#endif
/// Performs a copy of the vectors on the given device, converting
/// as needed from float16
DeviceTensor<float, 2, true> getVectorsFloat32Copy(cudaStream_t stream);
/// Returns only a subset of the vectors
DeviceTensor<float, 2, true> getVectorsFloat32Copy(int from,
int num,
cudaStream_t stream);
void query(Tensor<float, 2, true>& vecs,
Tensor<uint8_t, 1, true>& bitset,
int k,
faiss::MetricType metric,
float metricArg,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance);
#ifdef FAISS_USE_FLOAT16
void query(Tensor<half, 2, true>& vecs,
Tensor<uint8_t, 1, true>& bitset,
int k,
faiss::MetricType metric,
float metricArg,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance);
#endif
/// Compute residual for set of vectors
void computeResidual(Tensor<float, 2, true>& vecs,
Tensor<int, 1, true>& listIds,
Tensor<float, 2, true>& residuals);
/// Gather vectors given the set of IDs
void reconstruct(Tensor<int, 1, true>& listIds,
Tensor<float, 2, true>& vecs);
void reconstruct(Tensor<int, 2, true>& listIds,
Tensor<float, 3, true>& vecs);
/// Add vectors to ourselves; the pointer passed can be on the host
/// or the device
void add(const float* data, int numVecs, cudaStream_t stream);
/// Free all storage
void reset();
private:
/// Collection of GPU resources that we use
GpuResources* resources_;
/// Dimensionality of our vectors
const int dim_;
/// Float16 data format
const bool useFloat16_;
/// Store vectors in transposed layout for speed; makes addition to
/// the index slower
const bool storeTransposed_;
/// Memory space for our allocations
MemorySpace space_;
/// How many vectors we have
int num_;
/// The underlying expandable storage
DeviceVector<char> rawData_;
/// Vectors currently in rawData_
DeviceTensor<float, 2, true> vectors_;
DeviceTensor<float, 2, true> vectorsTransposed_;
/// Vectors currently in rawData_, float16 form
#ifdef FAISS_USE_FLOAT16
DeviceTensor<half, 2, true> vectorsHalf_;
DeviceTensor<half, 2, true> vectorsHalfTransposed_;
#endif
/// Precomputed L2 norms
DeviceTensor<float, 1, true> norms_;
};
} } // namespace
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/MetricType.h>
#include <faiss/gpu/utils/DeviceTensor.cuh>
#include <faiss/gpu/utils/DeviceVector.cuh>
#include <faiss/gpu/utils/MemorySpace.h>
namespace faiss { namespace gpu {
class GpuResources;
/// Holder of GPU resources for a particular flat index
class FlatIndex {
public:
FlatIndex(GpuResources* res,
int dim,
bool useFloat16,
bool storeTransposed,
MemorySpace space);
/// Whether or not this flat index primarily stores data in float16
bool getUseFloat16() const;
/// Returns the number of vectors we contain
int getSize() const;
/// Returns the dimensionality of the vectors
int getDim() const;
/// Reserve storage that can contain at least this many vectors
void reserve(size_t numVecs, cudaStream_t stream);
/// Returns the vectors based on the type desired; the FlatIndex must be of
/// the same type (float16 or float32) to not assert
template <typename T>
Tensor<T, 2, true>& getVectorsRef();
/// Returns a reference to our vectors currently in use
Tensor<float, 2, true>& getVectorsFloat32Ref();
/// Returns a reference to our vectors currently in use (useFloat16 mode)
#ifdef FAISS_USE_FLOAT16
Tensor<half, 2, true>& getVectorsFloat16Ref();
#endif
/// Performs a copy of the vectors on the given device, converting
/// as needed from float16
DeviceTensor<float, 2, true> getVectorsFloat32Copy(cudaStream_t stream);
/// Returns only a subset of the vectors
DeviceTensor<float, 2, true> getVectorsFloat32Copy(int from,
int num,
cudaStream_t stream);
void query(Tensor<float, 2, true>& vecs,
Tensor<uint8_t, 1, true>& bitset,
int k,
faiss::MetricType metric,
float metricArg,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
float* hostOutDistances,
int* hostOutIndices,
int i,
int curTile,
int nprobe,
bool exactDistance);
void query(Tensor<float, 2, true>& vecs,
Tensor<uint8_t, 1, true>& bitset,
int k,
faiss::MetricType metric,
float metricArg,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance);
#ifdef FAISS_USE_FLOAT16
void query(Tensor<half, 2, true>& vecs,
Tensor<uint8_t, 1, true>& bitset,
int k,
faiss::MetricType metric,
float metricArg,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance);
#endif
/// Compute residual for set of vectors
void computeResidual(Tensor<float, 2, true>& vecs,
Tensor<int, 1, true>& listIds,
Tensor<float, 2, true>& residuals);
/// Gather vectors given the set of IDs
void reconstruct(Tensor<int, 1, true>& listIds,
Tensor<float, 2, true>& vecs);
void reconstruct(Tensor<int, 2, true>& listIds,
Tensor<float, 3, true>& vecs);
/// Add vectors to ourselves; the pointer passed can be on the host
/// or the device
void add(const float* data, int numVecs, cudaStream_t stream);
/// Free all storage
void reset();
private:
/// Collection of GPU resources that we use
GpuResources* resources_;
/// Dimensionality of our vectors
const int dim_;
/// Float16 data format
const bool useFloat16_;
/// Store vectors in transposed layout for speed; makes addition to
/// the index slower
const bool storeTransposed_;
/// Memory space for our allocations
MemorySpace space_;
/// How many vectors we have
int num_;
/// The underlying expandable storage
DeviceVector<char> rawData_;
/// Vectors currently in rawData_
DeviceTensor<float, 2, true> vectors_;
DeviceTensor<float, 2, true> vectorsTransposed_;
/// Vectors currently in rawData_, float16 form
#ifdef FAISS_USE_FLOAT16
DeviceTensor<half, 2, true> vectorsHalf_;
DeviceTensor<half, 2, true> vectorsHalfTransposed_;
#endif
/// Precomputed L2 norms
DeviceTensor<float, 1, true> norms_;
};
} } // namespace


@@ -25,6 +25,9 @@
#include <unordered_map>
#include <numeric>
#include <string.h>
#include <iostream>
namespace faiss { namespace gpu {
IVFFlat::IVFFlat(GpuResources* resources,
@@ -410,4 +413,158 @@ IVFFlat::query(Tensor<float, 2, true>& queries,
}
}
// Simple in-place selection sort that orders distances ascending and keeps
// the index array in sync; `sz` is small (2 * k), so the quadratic cost is acceptable.
void Usort(float *dis, int *ind, int sz) {
for (int i = 0; i < sz; i++) {
for (int j = i + 1; j < sz; j++) {
if (dis[j] < dis[i]) {
std::swap(dis[i], dis[j]);
std::swap(ind[i], ind[j]);
}
}
}
}
void
IVFFlat::query(Tensor<float, 2, true>& queries,
Tensor<uint8_t, 1, true>& bitset,
int nprobe,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<long, 2, true>& outIndices,
float* distances,
Index::idx_t* labels) {
auto& mem = resources_->getMemoryManagerCurrentDevice();
auto stream = resources_->getDefaultStreamCurrentDevice();
// These are caught at a higher level
FAISS_ASSERT(nprobe <= GPU_MAX_SELECTION_K);
FAISS_ASSERT(k <= GPU_MAX_SELECTION_K);
nprobe = std::min(nprobe, quantizer_->getSize());
FAISS_ASSERT(queries.getSize(1) == dim_);
FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));
// Reserve space for the quantized information
DeviceTensor<float, 2, true>
coarseDistances(mem, {queries.getSize(0), nprobe}, stream);
DeviceTensor<int, 2, true>
coarseIndices(mem, {queries.getSize(0), nprobe}, stream);
DeviceTensor<uint8_t, 1, true> coarseBitset(mem, {0}, stream);
// Find the `nprobe` closest lists; we can use int indices both
// internally and externally
HostTensor<int, 2, true> hostOutStoreIndicesTensor ({queries.getSize(0), k * 2});
HostTensor<float, 2, true> hostOutStoreDistancesTensor ({queries.getSize(0), k * 2});
float* hostCoarseDistances = new float[queries.getSize(0) * nprobe];
int* hostCoarseIndices = new int[queries.getSize(0) * nprobe];
float* hostOutStoreDistances = hostOutStoreDistancesTensor.data();
int* hostOutStoreIndices = hostOutStoreIndicesTensor.data();
HostTensor<long, 2, true> hostOutIndices(outIndices, stream);
HostTensor<float, 2, true> hostOutDistances(outDistances, stream);
float* tmpDistances = hostOutDistances.data();
long* tmpIndices = hostOutIndices.data();
const int nprobeTile = 8;
for (int i = 0; i < nprobe; i += nprobeTile) {
int curTile = std::min(nprobeTile, nprobe - i);
quantizer_->query(queries,
coarseBitset,
curTile,
metric_,
metricArg_,
coarseDistances,
coarseIndices,
hostCoarseDistances,
hostCoarseIndices,
i,
curTile,
nprobe,
false);
DeviceTensor<float, 3, true>
residualBase(mem, {queries.getSize(0), nprobe, dim_}, stream);
if (useResidual_) {
// Reconstruct vectors from the quantizer
quantizer_->reconstruct(coarseIndices, residualBase);
}
runIVFFlatScan(queries,
coarseIndices,
bitset,
deviceListDataPointers_,
deviceListIndexPointers_,
indicesOptions_,
deviceListLengths_,
maxListLength_,
k,
metric_,
useResidual_,
residualBase,
scalarQ_.get(),
outDistances,
outIndices,
resources_);
fromDevice<float,2>(outDistances, tmpDistances, stream);
fromDevice<long,2>(outIndices, tmpIndices, stream);
if(i) {
for(int d = 0; d < queries.getSize(0); d ++) {
for(int m = 0; m < k; m ++) {
hostOutStoreDistances[d * 2 * k + k + m] = tmpDistances[d * k + m];
hostOutStoreIndices[d * 2 * k + k + m] = tmpIndices[d * k + m];
}
Usort(hostOutStoreDistances + k * 2 * d, hostOutStoreIndices + k * 2 * d, 2 * k);
}
}
else{
for(int d = 0; d < queries.getSize(0); d ++) {
for(int m = 0; m < k; m ++) {
hostOutStoreDistances[d * 2 * k + m] = tmpDistances[d * k + m];
hostOutStoreIndices[d * 2 * k + m] = tmpIndices[d * k + m];
}
}
}
}
for(int d = 0; d < queries.getSize(0); d ++) {
for(int m = 0; m < k; m ++) {
tmpDistances[d * k + m] = hostOutStoreDistances[d * 2 * k + m];
tmpIndices[d * k + m] = hostOutStoreIndices[d * 2 * k + m];
}
}
outIndices.copyFrom(hostOutIndices, stream);
outDistances.copyFrom(hostOutDistances, stream);
// If the GPU isn't storing indices (they are on the CPU side), we
// need to perform the re-mapping here
// FIXME: we might ultimately be calling this function with inputs
// from the CPU, these are unnecessary copies
if (indicesOptions_ == INDICES_CPU) {
HostTensor<long, 2, true> hostOutIndices(outIndices, stream);
ivfOffsetToUserIndex(hostOutIndices.data(),
numLists_,
hostOutIndices.getSize(0),
hostOutIndices.getSize(1),
listOffsetToUserIndex_);
// Copy back to GPU, since the input to this function is on the
// GPU
outIndices.copyFrom(hostOutIndices, stream);
}
delete [] hostCoarseDistances;
delete [] hostCoarseIndices;
}
} } // namespace


@@ -55,7 +55,15 @@ class IVFFlat : public IVFBase {
int k,
Tensor<float, 2, true>& outDistances,
Tensor<long, 2, true>& outIndices);
void query(Tensor<float, 2, true>& queries,
Tensor<uint8_t, 1, true>& bitset,
int nprobe,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<long, 2, true>& outIndices,
float* distances,
Index::idx_t* labels);
private:
/// Returns the size of our stored vectors, in bytes
size_t getVectorMemorySize() const;


@@ -7,6 +7,13 @@
#include <faiss/gpu/impl/L2Select.cuh>
#include <stdio.h>
#include <cstring>
#include <faiss/gpu/utils/MemorySpace.h>
#include <faiss/gpu/StandardGpuResources.h>
#include <faiss/gpu/utils/DeviceTensor.cuh>
#include <faiss/gpu/utils/CopyUtils.cuh>
#include <faiss/impl/FaissAssert.h>
#include <faiss/gpu/utils/DeviceDefs.cuh>
@@ -181,6 +188,49 @@ __global__ void l2SelectMinK(Tensor<T, 2, true> productDistances,
}
}
template <typename T, int NumWarpQ, int NumThreadQ, int ThreadsPerBlock>
__global__ void l2SelectMinK(Tensor<T, 2, true> productDistances,
Tensor<T, 1, true> centroidDistances,
Tensor<T, 2, true> outDistances,
Tensor<int, 2, true> outIndices,
int k, T initK, int prev) {
// Each block handles a single row of the distances (results)
constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
__shared__ T smemK[kNumWarps * NumWarpQ];
__shared__ int smemV[kNumWarps * NumWarpQ];
BlockSelect<T, int, false, Comparator<T>,
NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(initK, -1, smemK, smemV, k);
int row = blockIdx.x;
// Whole warps must participate in the selection
int limit = utils::roundDown(productDistances.getSize(1), kWarpSize);
int i = threadIdx.x;
for (; i < limit; i += blockDim.x) {
T v = Math<T>::add(centroidDistances[i],
productDistances[row][i]);
heap.add(v, i);
}
if (i < productDistances.getSize(1)) {
T v = Math<T>::add(centroidDistances[i],
productDistances[row][i]);
heap.addThreadQ(v, i);
}
heap.reduce();
for (int i = threadIdx.x+prev; i < k+prev; i += blockDim.x) {
outDistances[row][i-prev] = smemK[i];
outIndices[row][i-prev] = smemV[i];
}
}
template <typename T>
void runL2SelectMin(Tensor<T, 2, true>& productDistances,
Tensor<T, 1, true>& centroidDistances,
@@ -245,6 +295,94 @@ void runL2SelectMin(Tensor<T, 2, true>& productDistances,
CUDA_TEST_ERROR();
}
void runL2SelMn(float* hostOutDistances,
int* hostOutIndices,
int startPos,
int curQuerySize,
int i,
int nprobe,
Tensor<float, 2, true>& productDistances,
Tensor<float, 1, true>& centroidDistances,
Tensor<uint8_t, 1, true>& bitset,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
int k,
cudaStream_t stream) {
FAISS_ASSERT(productDistances.getSize(0) == outDistances.getSize(0));
FAISS_ASSERT(productDistances.getSize(0) == outIndices.getSize(0));
FAISS_ASSERT(centroidDistances.getSize(0) == productDistances.getSize(1));
// FAISS_ASSERT(outDistances.getSize(1) == k);
// FAISS_ASSERT(outIndices.getSize(1) == k);
FAISS_ASSERT(k <= GPU_MAX_SELECTION_K);
if (k == 1) {
constexpr int kThreadsPerBlock = 256;
constexpr int kRowsPerBlock = 8;
auto block = dim3(kThreadsPerBlock);
auto grid = dim3(utils::divUp(outDistances.getSize(0), kRowsPerBlock));
l2SelectMin1<float, kRowsPerBlock, kThreadsPerBlock>
<<<grid, block, 0, stream>>>(productDistances, centroidDistances, bitset,
outDistances, outIndices);
} else {
auto grid = dim3(outDistances.getSize(0));
#define RUN_L2_SELECT(BLOCK, NUM_WARP_Q, NUM_THREAD_Q) \
do { \
l2SelectMinK<float, NUM_WARP_Q, NUM_THREAD_Q, BLOCK> \
<<<grid, BLOCK, 0, stream>>>(productDistances, centroidDistances, \
outDistances, outIndices, \
k, Limits<float>::getMax(), i); \
} while (0)
// block size 128 for everything <= 1024
if (k <= 32) {
RUN_L2_SELECT(128, 32, 2);
} else if (k <= 64) {
RUN_L2_SELECT(128, 64, 3);
} else if (k <= 128) {
RUN_L2_SELECT(128, 128, 3);
} else if (k <= 256) {
RUN_L2_SELECT(128, 256, 4);
} else if (k <= 512) {
RUN_L2_SELECT(128, 512, 8);
} else if (k <= 1024) {
RUN_L2_SELECT(128, 1024, 8);
#if GPU_MAX_SELECTION_K >= 2048
} else if (k <= 2048) {
// smaller block for less shared memory
RUN_L2_SELECT(64, 2048, 8);
#endif
} else {
FAISS_ASSERT(false);
}
float* tmpDistances = new float[outDistances.getSize(0) * outDistances.getSize(1)];
int* tmpIndices = new int[outDistances.getSize(0) * outDistances.getSize(1)];
fromDevice<float,2>(outDistances, tmpDistances, stream);
fromDevice<int,2>(outIndices, tmpIndices, stream);
for(int j = 0; j < curQuerySize; j ++) {
for(int m = 0; m < k; m ++) {
hostOutDistances[(startPos + j) * nprobe + i + m] = tmpDistances[k * j + m];
hostOutIndices[(startPos + j) * nprobe + i + m] = tmpIndices[k * j + m];
}
}
delete [] tmpDistances;
delete [] tmpIndices;
}
CUDA_TEST_ERROR();
}
void runL2SelectMin(Tensor<float, 2, true>& productDistances,
Tensor<float, 1, true>& centroidDistances,
Tensor<uint8_t, 1, true>& bitset,


@@ -20,4 +20,18 @@ void runL2SelectMin(Tensor<float, 2, true>& productDistances,
int k,
cudaStream_t stream);
void runL2SelMn(float* hostOutDistances,
int* hostOutIndices,
int startPos,
int curQuerySize,
int i,
int nprobe,
Tensor<float, 2, true>& productDistances,
Tensor<float, 1, true>& centroidDistances,
Tensor<uint8_t, 1, true>& bitset,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
int k,
cudaStream_t stream);
} } // namespace