mirror of https://github.com/milvus-io/milvus.git
* Add Flat Index Delete
* Fix log
* Fix bitset
* Fix reference
* fix bug
* fix bug
* fix bug

Signed-off-by: sahuang <xiaohai.xu@zilliz.com>

pull/1769/head
parent 65ffaedae3
commit f93b464172
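Note: this commit threads a delete/filter bitset through the whole GPU search path. Every `searchImpl_` override, the `FlatIndex`/`IVFFlat`/`IVFPQ` query methods, and the distance and k-select kernels gain a bitset parameter, and the select kernels skip any vector whose bit is set. The convention, visible in the `l2SelectMin1`/`l2SelectMinK` kernels further down, is a packed `uint8_t` array with one bit per vector. A minimal sketch of that bit test (hypothetical helper, not part of the patch):

    #include <cstdint>

    // True when vector i is marked deleted/filtered. Same arithmetic as
    // the kernels in this patch: byte i >> 3 (i / 8), bit i & 0x7 (i % 8).
    inline bool isFiltered(const uint8_t* bitset, int i) {
        return bitset[i >> 3] & (0x1 << (i & 0x7));
    }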
@@ -64,6 +64,9 @@ void bruteForceKnn(GpuResources* resources,
     // temporary memory for it
     DeviceTensor<int, 2, true> tOutIntIndices(mem, {numQueries, k}, stream);

+    // Empty bitset
+    auto bitsetDevice = toDevice<uint8_t, 1>(resources, device, nullptr, stream, {0});
+
     // Do the work
     if (metric == faiss::MetricType::METRIC_L2) {
         runL2Distance(resources,
@@ -72,6 +75,7 @@ void bruteForceKnn(GpuResources* resources,
                       nullptr, // compute norms in temp memory
                       tQueries,
                       queriesRowMajor,
+                      bitsetDevice,
                       k,
                       tOutDistances,
                       tOutIntIndices);
@@ -81,6 +85,7 @@ void bruteForceKnn(GpuResources* resources,
                       vectorsRowMajor,
                       tQueries,
                       queriesRowMajor,
+                      bitsetDevice,
                       k,
                       tOutDistances,
                       tOutIntIndices);
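Note: `bruteForceKnn` itself takes no bitset from its caller yet; it builds a zero-length device tensor (`toDevice<uint8_t, 1>(..., nullptr, stream, {0})`) and passes that down. Downstream, `runL2SelectMin` (see L2Select.cu below) treats a zero-length bitset as "no filter" via `bitset.getSize(0) == 0` and dispatches to the original unfiltered kernels. A sketch of the dispatch idea, using the patch's own names:

    // Zero-length bitset == "no filter": keep the original kernel path.
    if (bitset.getSize(0) == 0) {
        // unfiltered k-select (RUN_L2_SELECT)
    } else {
        // bitset-aware k-select (RUN_L2_SELECT_BITSET)
    }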
@@ -235,7 +235,8 @@ GpuIndex::search(Index::idx_t n,
         if (dataSize >= minPagedSize_) {
             searchFromCpuPaged_(n, x, k,
                                 outDistances.data(),
-                                outLabels.data());
+                                outLabels.data(),
+                                bitset);
             usePaged = true;
         }
     }
@@ -243,7 +244,8 @@ GpuIndex::search(Index::idx_t n,
     if (!usePaged) {
         searchNonPaged_(n, x, k,
                         outDistances.data(),
-                        outLabels.data());
+                        outLabels.data(),
+                        bitset);
     }

     // Copy back if necessary
@@ -256,7 +258,8 @@ GpuIndex::searchNonPaged_(int n,
                           const float* x,
                           int k,
                           float* outDistancesData,
-                          Index::idx_t* outIndicesData) const {
+                          Index::idx_t* outIndicesData,
+                          ConcurrentBitsetPtr bitset) const {
     auto stream = resources_->getDefaultStream(device_);

     // Make sure arguments are on the device we desire; use temporary
@@ -267,7 +270,7 @@ GpuIndex::searchNonPaged_(int n,
                  stream,
                  {n, (int) this->d});

-    searchImpl_(n, vecs.data(), k, outDistancesData, outIndicesData);
+    searchImpl_(n, vecs.data(), k, outDistancesData, outIndicesData, bitset);
 }

 void
@@ -275,7 +278,8 @@ GpuIndex::searchFromCpuPaged_(int n,
                               const float* x,
                               int k,
                               float* outDistancesData,
-                              Index::idx_t* outIndicesData) const {
+                              Index::idx_t* outIndicesData,
+                              ConcurrentBitsetPtr bitset) const {
     Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
     Tensor<Index::idx_t, 2, true> outIndices(outIndicesData, {n, k});

@@ -300,7 +304,8 @@ GpuIndex::searchFromCpuPaged_(int n,
                     x + (size_t) cur * this->d,
                     k,
                     outDistancesSlice.data(),
-                    outIndicesSlice.data());
+                    outIndicesSlice.data(),
+                    bitset);
     }

     return;
@@ -411,7 +416,8 @@ GpuIndex::searchFromCpuPaged_(int n,
                 bufGpus[cur3BufIndex]->data(),
                 k,
                 outDistancesSlice.data(),
-                outIndicesSlice.data(),
+                outIndicesSlice.data(),
+                bitset);

         // Create completion event
         eventGpuExecuteDone[cur3BufIndex] =
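Note: `GpuIndex::search` now forwards the caller's `ConcurrentBitsetPtr` down both branches, so paged and non-paged searches apply the same filter. A sketch of the call chain after this change:

    // GpuIndex::search(n, x, k, distances, labels, bitset)
    //   |- searchFromCpuPaged_(..., bitset)   // dataSize >= minPagedSize_
    //   |     '- per page: searchImpl_(..., bitset)
    //   '- searchNonPaged_(..., bitset)
    //         '- searchImpl_(..., bitset)     // virtual, per index type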
@@ -103,7 +103,8 @@ class GpuIndex : public faiss::Index {
                            const float* x,
                            int k,
                            float* distances,
-                           Index::idx_t* labels) const = 0;
+                           Index::idx_t* labels,
+                           ConcurrentBitsetPtr bitset = nullptr) const = 0;

 private:
     /// Handles paged adds if the add set is too large, passes to
@@ -122,7 +123,8 @@ private:
                          const float* x,
                          int k,
                          float* outDistancesData,
-                         Index::idx_t* outIndicesData) const;
+                         Index::idx_t* outIndicesData,
+                         ConcurrentBitsetPtr bitset = nullptr) const;

     /// Calls searchImpl_ for a single page of GPU-resident data,
     /// handling paging of the data and copies from the CPU
@@ -130,7 +132,8 @@ private:
                              const float* x,
                              int k,
                              float* outDistancesData,
-                             Index::idx_t* outIndicesData) const;
+                             Index::idx_t* outIndicesData,
+                             ConcurrentBitsetPtr bitset = nullptr) const;

 protected:
     /// Manages streams, cuBLAS handles and scratch memory for devices
@@ -203,7 +203,8 @@ GpuIndexFlat::searchImpl_(int n,
                          const float* x,
                          int k,
                          float* distances,
-                         Index::idx_t* labels) const {
+                         Index::idx_t* labels,
+                         ConcurrentBitsetPtr bitset) const {
     auto stream = resources_->getDefaultStream(device_);

     // Input and output data are already resident on the GPU
@@ -215,7 +216,17 @@ GpuIndexFlat::searchImpl_(int n,
     DeviceTensor<int, 2, true> outIntLabels(
         resources_->getMemoryManagerCurrentDevice(), {n, k}, stream);

-    data_->query(queries, k, outDistances, outIntLabels, true);
+    // Copy bitset to GPU
+    if (!bitset) {
+        auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
+        data_->query(queries, bitsetDevice, k, outDistances, outIntLabels, true);
+    } else {
+        auto bitsetData = bitset->bitset();
+        auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_,
+                                                 const_cast<uint8_t*>(bitsetData), stream,
+                                                 {(int) bitset->size()});
+        data_->query(queries, bitsetDevice, k, outDistances, outIntLabels, true);
+    }

     // Convert int to idx_t
     convertTensor<int, faiss::Index::idx_t, 2>(stream,
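Note: `GpuIndexFlat::searchImpl_` is where the CPU-side `ConcurrentBitset` is materialized on the GPU: a null pointer becomes the zero-length "no filter" tensor, otherwise the packed bytes from `bitset->bitset()` are copied to the device with length `bitset->size()`. A hedged usage sketch, assuming `ConcurrentBitsetPtr` is a shared pointer and that the bitset exposes a size constructor and `set()` (only `bitset()` and `size()` are visible in this diff):

    #include <memory>
    #include <faiss/utils/ConcurrentBitset.h>

    // Mark vectors 10 and 42 as deleted, then search with the filter
    // applied (assumed API; not shown in this patch).
    auto bitset = std::make_shared<faiss::ConcurrentBitset>(index.ntotal);
    bitset->set(10);
    bitset->set(42);
    index.search(nq, queries, k, distances, labels, bitset);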
@@ -126,7 +126,8 @@ class GpuIndexFlat : public GpuIndex {
                      const float* x,
                      int k,
                      float* distances,
-                     faiss::Index::idx_t* labels) const override;
+                     faiss::Index::idx_t* labels,
+                     ConcurrentBitsetPtr bitset = nullptr) const override;

 private:
     /// Checks user settings for consistency
@@ -207,14 +207,18 @@ GpuIndexIVFFlat::addImpl_(int n,
     FAISS_ASSERT(index_);
     FAISS_ASSERT(n > 0);

+    auto stream = resources_->getDefaultStream(device_);
+
     // Data is already resident on the GPU
     Tensor<float, 2, true> data(const_cast<float*>(x), {n, (int) this->d});

+    auto bitset = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
+
     static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch");
     Tensor<long, 1, true> labels(const_cast<long*>(xids), {n});

     // Not all vectors may be able to be added (some may contain NaNs etc)
-    index_->classifyAndAddVectors(data, labels);
+    index_->classifyAndAddVectors(data, labels, bitset);

     // but keep the ntotal based on the total number of vectors that we attempted
     // to add
@@ -226,11 +230,14 @@ GpuIndexIVFFlat::searchImpl_(int n,
                              const float* x,
                              int k,
                              float* distances,
-                             Index::idx_t* labels) const {
+                             Index::idx_t* labels,
+                             ConcurrentBitsetPtr bitset) const {
     // Device is already set in GpuIndex::search
     FAISS_ASSERT(index_);
     FAISS_ASSERT(n > 0);

+    auto stream = resources_->getDefaultStream(device_);
+
     // Data is already resident on the GPU
     Tensor<float, 2, true> queries(const_cast<float*>(x), {n, (int) this->d});
     Tensor<float, 2, true> outDistances(distances, {n, k});
@@ -238,7 +245,9 @@ GpuIndexIVFFlat::searchImpl_(int n,
     static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch");
     Tensor<long, 2, true> outLabels(const_cast<long*>(labels), {n, k});

-    index_->query(queries, nprobe, k, outDistances, outLabels);
+    auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
+
+    index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
 }
@@ -70,7 +70,8 @@ class GpuIndexIVFFlat : public GpuIndexIVF {
                      const float* x,
                      int k,
                      float* distances,
-                     Index::idx_t* labels) const override;
+                     Index::idx_t* labels,
+                     ConcurrentBitsetPtr bitset = nullptr) const override;

 private:
     GpuIndexIVFFlatConfig ivfFlatConfig_;
@@ -330,14 +330,18 @@ GpuIndexIVFPQ::addImpl_(int n,
     FAISS_ASSERT(index_);
     FAISS_ASSERT(n > 0);

+    auto stream = resources_->getDefaultStream(device_);
+
     // Data is already resident on the GPU
     Tensor<float, 2, true> data(const_cast<float*>(x), {n, (int) this->d});

+    auto bitset = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
+
     static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch");
     Tensor<long, 1, true> labels(const_cast<long*>(xids), {n});

     // Not all vectors may be able to be added (some may contain NaNs etc)
-    index_->classifyAndAddVectors(data, labels);
+    index_->classifyAndAddVectors(data, labels, bitset);

     // but keep the ntotal based on the total number of vectors that we attempted
     // to add
@@ -349,11 +353,14 @@ GpuIndexIVFPQ::searchImpl_(int n,
                            const float* x,
                            int k,
                            float* distances,
-                           Index::idx_t* labels) const {
+                           Index::idx_t* labels,
+                           ConcurrentBitsetPtr bitset) const {
     // Device is already set in GpuIndex::search
     FAISS_ASSERT(index_);
     FAISS_ASSERT(n > 0);

+    auto stream = resources_->getDefaultStream(device_);
+
     // Data is already resident on the GPU
     Tensor<float, 2, true> queries(const_cast<float*>(x), {n, (int) this->d});
     Tensor<float, 2, true> outDistances(distances, {n, k});
@@ -361,7 +368,9 @@ GpuIndexIVFPQ::searchImpl_(int n,
     static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch");
     Tensor<long, 2, true> outLabels(const_cast<long*>(labels), {n, k});

-    index_->query(queries, nprobe, k, outDistances, outLabels);
+    auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
+
+    index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
 }

 int
@@ -116,7 +116,8 @@ class GpuIndexIVFPQ : public GpuIndexIVF {
                      const float* x,
                      int k,
                      float* distances,
-                     Index::idx_t* labels) const override;
+                     Index::idx_t* labels,
+                     ConcurrentBitsetPtr bitset = nullptr) const override;

 private:
     void verifySettings_() const;
@@ -304,14 +304,18 @@ GpuIndexIVFSQHybrid::addImpl_(int n,
     FAISS_ASSERT(index_);
     FAISS_ASSERT(n > 0);

+    auto stream = resources_->getDefaultStream(device_);
+
     // Data is already resident on the GPU
     Tensor<float, 2, true> data(const_cast<float*>(x), {n, (int) this->d});

+    auto bitset = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
+
     static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch");
     Tensor<long, 1, true> labels(const_cast<long*>(xids), {n});

     // Not all vectors may be able to be added (some may contain NaNs etc)
-    index_->classifyAndAddVectors(data, labels);
+    index_->classifyAndAddVectors(data, labels, bitset);

     // but keep the ntotal based on the total number of vectors that we attempted
     // to add
@@ -323,11 +327,14 @@ GpuIndexIVFSQHybrid::searchImpl_(int n,
                                  const float* x,
                                  int k,
                                  float* distances,
-                                 Index::idx_t* labels) const {
+                                 Index::idx_t* labels,
+                                 ConcurrentBitsetPtr bitset) const {
     // Device is already set in GpuIndex::search
     FAISS_ASSERT(index_);
     FAISS_ASSERT(n > 0);

+    auto stream = resources_->getDefaultStream(device_);
+
     // Data is already resident on the GPU
     Tensor<float, 2, true> queries(const_cast<float*>(x), {n, (int) this->d});
     Tensor<float, 2, true> outDistances(distances, {n, k});
@@ -335,7 +342,9 @@ GpuIndexIVFSQHybrid::searchImpl_(int n,
     static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch");
     Tensor<long, 2, true> outLabels(const_cast<long*>(labels), {n, k});

-    index_->query(queries, nprobe, k, outDistances, outLabels);
+    auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
+
+    index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
 }

 } } // namespace
@@ -79,7 +79,8 @@ class GpuIndexIVFSQHybrid : public GpuIndexIVF {
                      const float* x,
                      int k,
                      float* distances,
-                     Index::idx_t* labels) const override;
+                     Index::idx_t* labels,
+                     ConcurrentBitsetPtr bitset = nullptr) const override;

     /// Called from train to handle SQ residual training
     void trainResiduals_(Index::idx_t n, const float* x);
@@ -239,14 +239,18 @@ GpuIndexIVFScalarQuantizer::addImpl_(int n,
     FAISS_ASSERT(index_);
     FAISS_ASSERT(n > 0);

+    auto stream = resources_->getDefaultStream(device_);
+
     // Data is already resident on the GPU
     Tensor<float, 2, true> data(const_cast<float*>(x), {n, (int) this->d});

+    auto bitset = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
+
     static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch");
     Tensor<long, 1, true> labels(const_cast<long*>(xids), {n});

     // Not all vectors may be able to be added (some may contain NaNs etc)
-    index_->classifyAndAddVectors(data, labels);
+    index_->classifyAndAddVectors(data, labels, bitset);

     // but keep the ntotal based on the total number of vectors that we attempted
     // to add
@@ -258,11 +262,14 @@ GpuIndexIVFScalarQuantizer::searchImpl_(int n,
                                         const float* x,
                                         int k,
                                         float* distances,
-                                        Index::idx_t* labels) const {
+                                        Index::idx_t* labels,
+                                        ConcurrentBitsetPtr bitset) const {
     // Device is already set in GpuIndex::search
     FAISS_ASSERT(index_);
     FAISS_ASSERT(n > 0);

+    auto stream = resources_->getDefaultStream(device_);
+
     // Data is already resident on the GPU
     Tensor<float, 2, true> queries(const_cast<float*>(x), {n, (int) this->d});
     Tensor<float, 2, true> outDistances(distances, {n, k});
@@ -270,7 +277,9 @@ GpuIndexIVFScalarQuantizer::searchImpl_(int n,
     static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch");
     Tensor<long, 2, true> outLabels(const_cast<long*>(labels), {n, k});

-    index_->query(queries, nprobe, k, outDistances, outLabels);
+    auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
+
+    index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
 }

 } } // namespace
@@ -75,7 +75,8 @@ class GpuIndexIVFScalarQuantizer : public GpuIndexIVF {
                      const float* x,
                      int k,
                      float* distances,
-                     Index::idx_t* labels) const override;
+                     Index::idx_t* labels,
+                     ConcurrentBitsetPtr bitset = nullptr) const override;

     /// Called from train to handle SQ residual training
     void trainResiduals_(Index::idx_t n, const float* x);
@@ -130,6 +130,7 @@ void runDistance(bool computeL2,
                  Tensor<T, 1, true>* centroidNorms,
                  Tensor<T, 2, true>& queries,
                  bool queriesRowMajor,
+                 Tensor<uint8_t, 1, true>& bitset,
                  int k,
                  Tensor<T, 2, true>& outDistances,
                  Tensor<int, 2, true>& outIndices,
@@ -306,6 +307,7 @@ void runDistance(bool computeL2,
             // Write into the final output
             runL2SelectMin(distanceBufView,
                            *centroidNorms,
+                           bitset,
                            outDistanceView,
                            outIndexView,
                            k,
@@ -326,6 +328,7 @@ void runDistance(bool computeL2,
             // Write into our intermediate output
             runL2SelectMin(distanceBufView,
                            centroidNormsView,
+                           bitset,
                            outDistanceBufColView,
                            outIndexBufColView,
                            k,
@@ -346,12 +349,14 @@ void runDistance(bool computeL2,
         if (tileCols == numCentroids) {
             // Write into the final output
             runBlockSelect(distanceBufView,
+                           bitset,
                            outDistanceView,
                            outIndexView,
                            true, k, streams[curStream]);
         } else {
             // Write into the intermediate output
             runBlockSelect(distanceBufView,
+                           bitset,
                            outDistanceBufColView,
                            outIndexBufColView,
                            true, k, streams[curStream]);
@@ -368,6 +373,7 @@ void runDistance(bool computeL2,

         runBlockSelectPair(outDistanceBufRowView,
                            outIndexBufRowView,
+                           bitset,
                            outDistanceView,
                            outIndexView,
                            computeL2 ? false : true, k, streams[curStream]);
@@ -384,6 +390,7 @@ void runDistance(bool computeL2,
     }
 }

+// Bitset added
 template <typename T>
 void runL2Distance(GpuResources* resources,
                    Tensor<T, 2, true>& centroids,
@@ -391,6 +398,7 @@ void runL2Distance(GpuResources* resources,
                    Tensor<T, 1, true>* centroidNorms,
                    Tensor<T, 2, true>& queries,
                    bool queriesRowMajor,
+                   Tensor<uint8_t, 1, true>& bitset,
                    int k,
                    Tensor<T, 2, true>& outDistances,
                    Tensor<int, 2, true>& outIndices,
@@ -403,6 +411,7 @@ void runL2Distance(GpuResources* resources,
                 centroidNorms,
                 queries,
                 queriesRowMajor,
+                bitset,
                 k,
                 outDistances,
                 outIndices,
@@ -416,6 +425,7 @@ void runIPDistance(GpuResources* resources,
                    bool centroidsRowMajor,
                    Tensor<T, 2, true>& queries,
                    bool queriesRowMajor,
+                   Tensor<uint8_t, 1, true>& bitset,
                    int k,
                    Tensor<T, 2, true>& outDistances,
                    Tensor<int, 2, true>& outIndices,
@@ -427,6 +437,7 @@ void runIPDistance(GpuResources* resources,
                 nullptr, // no centroid norms provided
                 queries,
                 queriesRowMajor,
+                bitset,
                 k,
                 outDistances,
                 outIndices,
@@ -444,6 +455,7 @@ runIPDistance(GpuResources* resources,
               bool vectorsRowMajor,
               Tensor<float, 2, true>& queries,
               bool queriesRowMajor,
+              Tensor<uint8_t, 1, true>& bitset,
               int k,
               Tensor<float, 2, true>& outDistances,
               Tensor<int, 2, true>& outIndices) {
@@ -452,6 +464,7 @@ runIPDistance(GpuResources* resources,
                   vectorsRowMajor,
                   queries,
                   queriesRowMajor,
+                  bitset,
                   k,
                   outDistances,
                   outIndices,
@@ -464,6 +477,7 @@ runIPDistance(GpuResources* resources,
               bool vectorsRowMajor,
               Tensor<half, 2, true>& queries,
               bool queriesRowMajor,
+              Tensor<uint8_t, 1, true>& bitset,
               int k,
               Tensor<half, 2, true>& outDistances,
               Tensor<int, 2, true>& outIndices,
@@ -473,6 +487,7 @@ runIPDistance(GpuResources* resources,
                   vectorsRowMajor,
                   queries,
                   queriesRowMajor,
+                  bitset,
                   k,
                   outDistances,
                   outIndices,
@@ -486,6 +501,7 @@ runL2Distance(GpuResources* resources,
               Tensor<float, 1, true>* vectorNorms,
               Tensor<float, 2, true>& queries,
               bool queriesRowMajor,
+              Tensor<uint8_t, 1, true>& bitset,
               int k,
               Tensor<float, 2, true>& outDistances,
               Tensor<int, 2, true>& outIndices,
@@ -496,6 +512,7 @@ runL2Distance(GpuResources* resources,
                   vectorNorms,
                   queries,
                   queriesRowMajor,
+                  bitset,
                   k,
                   outDistances,
                   outIndices,
@@ -510,6 +527,7 @@ runL2Distance(GpuResources* resources,
               Tensor<half, 1, true>* vectorNorms,
               Tensor<half, 2, true>& queries,
               bool queriesRowMajor,
+              Tensor<uint8_t, 1, true>& bitset,
               int k,
               Tensor<half, 2, true>& outDistances,
               Tensor<int, 2, true>& outIndices,
@@ -521,6 +539,7 @@ runL2Distance(GpuResources* resources,
                   vectorNorms,
                   queries,
                   queriesRowMajor,
+                  bitset,
                   k,
                   outDistances,
                   outIndices,
@@ -10,6 +10,7 @@

 #include <faiss/gpu/utils/DeviceTensor.cuh>
 #include <faiss/gpu/utils/Float16.cuh>
+#include <faiss/utils/ConcurrentBitset.h>

 namespace faiss { namespace gpu {

@@ -65,4 +66,58 @@ void runL2Distance(GpuResources* resources,
                    bool useHgemm,
                    bool ignoreOutDistances = false);

+// Bitset added
+void runL2Distance(GpuResources* resources,
+                   Tensor<float, 2, true>& vectors,
+                   bool vectorsRowMajor,
+                   // can be optionally pre-computed; nullptr if we
+                   // have to compute it upon the call
+                   Tensor<float, 1, true>* vectorNorms,
+                   Tensor<float, 2, true>& queries,
+                   bool queriesRowMajor,
+                   Tensor<uint8_t, 1, true>& bitset,
+                   int k,
+                   Tensor<float, 2, true>& outDistances,
+                   Tensor<int, 2, true>& outIndices,
+                   // Do we care about `outDistances`? If not, we can
+                   // take shortcuts.
+                   bool ignoreOutDistances = false);
+
+/// Calculates brute-force inner product distance between `vectors`
+/// and `queries`, returning the k closest results seen
+void runIPDistance(GpuResources* resources,
+                   Tensor<float, 2, true>& vectors,
+                   bool vectorsRowMajor,
+                   Tensor<float, 2, true>& queries,
+                   bool queriesRowMajor,
+                   Tensor<uint8_t, 1, true>& bitset,
+                   int k,
+                   Tensor<float, 2, true>& outDistances,
+                   Tensor<int, 2, true>& outIndices);
+
+void runIPDistance(GpuResources* resources,
+                   Tensor<half, 2, true>& vectors,
+                   bool vectorsRowMajor,
+                   Tensor<half, 2, true>& queries,
+                   bool queriesRowMajor,
+                   Tensor<uint8_t, 1, true>& bitset,
+                   int k,
+                   Tensor<half, 2, true>& outDistances,
+                   Tensor<int, 2, true>& outIndices,
+                   bool useHgemm);
+
+void runL2Distance(GpuResources* resources,
+                   Tensor<half, 2, true>& vectors,
+                   bool vectorsRowMajor,
+                   Tensor<half, 1, true>* vectorNorms,
+                   Tensor<half, 2, true>& queries,
+                   bool queriesRowMajor,
+                   Tensor<uint8_t, 1, true>& bitset,
+                   int k,
+                   Tensor<half, 2, true>& outDistances,
+                   Tensor<int, 2, true>& outIndices,
+                   bool useHgemm,
+                   bool ignoreOutDistances = false);
+
 } } // namespace
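Note: the header keeps the original declarations and adds bitset-taking overloads next to them (the "// Bitset added" block), so pre-existing call sites keep compiling while updated ones opt in; the `#include <faiss/utils/ConcurrentBitset.h>` is what pulls the CPU-side type into the GPU headers. Sketch of the resulting overload choice (argument values are placeholders):

    // Both forms now resolve; the bitset argument selects the new overload.
    runIPDistance(resources, vectors, rowMajor, queries, rowMajor,
                  k, outDistances, outIndices);          // original
    runIPDistance(resources, vectors, rowMajor, queries, rowMajor,
                  bitset, k, outDistances, outIndices);  // bitset-aware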
@@ -103,6 +103,7 @@ FlatIndex::getVectorsFloat32Copy(int from, int num, cudaStream_t stream) {

 void
 FlatIndex::query(Tensor<float, 2, true>& input,
+                 Tensor<uint8_t, 1, true>& bitset,
                  int k,
                  Tensor<float, 2, true>& outDistances,
                  Tensor<int, 2, true>& outIndices,
@@ -119,7 +120,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
         DeviceTensor<half, 2, true> outDistancesHalf(
             mem, {outDistances.getSize(0), outDistances.getSize(1)}, stream);

-        query(inputHalf, k, outDistancesHalf, outIndices, exactDistance);
+        query(inputHalf, bitset, k, outDistancesHalf, outIndices, exactDistance);

         if (exactDistance) {
             // Convert outDistances back
@@ -135,6 +136,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
                       &norms_,
                       input,
                       true, // input is row major
+                      bitset,
                       k,
                       outDistances,
                       outIndices,
@@ -145,6 +147,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,
                       !storeTransposed_, // is vectors row major?
                       input,
                       true, // input is row major
+                      bitset,
                       k,
                       outDistances,
                       outIndices);
@@ -154,6 +157,7 @@ FlatIndex::query(Tensor<float, 2, true>& input,

 void
 FlatIndex::query(Tensor<half, 2, true>& input,
+                 Tensor<uint8_t, 1, true>& bitset,
                  int k,
                  Tensor<half, 2, true>& outDistances,
                  Tensor<int, 2, true>& outIndices,
@@ -167,6 +171,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
                   &normsHalf_,
                   input,
                   true, // input is row major
+                  bitset,
                   k,
                   outDistances,
                   outIndices,
@@ -179,6 +184,7 @@ FlatIndex::query(Tensor<half, 2, true>& input,
                   !storeTransposed_, // is vectors row major?
                   input,
                   true, // input is row major
+                  bitset,
                   k,
                   outDistances,
                   outIndices,
@@ -53,12 +53,14 @@ class FlatIndex {
                cudaStream_t stream);

     void query(Tensor<float, 2, true>& vecs,
+               Tensor<uint8_t, 1, true>& bitset,
                int k,
                Tensor<float, 2, true>& outDistances,
                Tensor<int, 2, true>& outIndices,
                bool exactDistance);

     void query(Tensor<half, 2, true>& vecs,
+               Tensor<uint8_t, 1, true>& bitset,
                int k,
                Tensor<half, 2, true>& outDistances,
                Tensor<int, 2, true>& outIndices,
@@ -157,7 +157,8 @@ IVFFlat::addCodeVectorsFromCpu(int listId,

 int
 IVFFlat::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
-                               Tensor<long, 1, true>& indices) {
+                               Tensor<long, 1, true>& indices,
+                               Tensor<uint8_t, 1, true>& bitset) {
     FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0));
     FAISS_ASSERT(vecs.getSize(1) == dim_);

@@ -174,7 +175,7 @@ IVFFlat::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
         listIds2d(mem, {vecs.getSize(0), 1}, stream);
     auto listIds = listIds2d.view<1>({vecs.getSize(0)});

-    quantizer_->query(vecs, 1, listDistance2d, listIds2d, false);
+    quantizer_->query(vecs, bitset, 1, listDistance2d, listIds2d, false);

     // Calculate residuals for these vectors, if needed
     DeviceTensor<float, 2, true>
@@ -326,6 +327,7 @@ IVFFlat::classifyAndAddVectors(Tensor<float, 2, true>& vecs,

 void
 IVFFlat::query(Tensor<float, 2, true>& queries,
+               Tensor<uint8_t, 1, true>& bitset,
                int nprobe,
                int k,
                Tensor<float, 2, true>& outDistances,
@@ -352,6 +354,7 @@ IVFFlat::query(Tensor<float, 2, true>& queries,
     // Find the `nprobe` closest lists; we can use int indices both
     // internally and externally
     quantizer_->query(queries,
+                      bitset,
                       nprobe,
                       coarseDistances,
                       coarseIndices,
@@ -44,11 +44,14 @@ class IVFFlat : public IVFBase {
     /// Returns the number of vectors successfully added. Vectors may
     /// not be able to be added because they contain NaNs.
     int classifyAndAddVectors(Tensor<float, 2, true>& vecs,
-                              Tensor<long, 1, true>& indices);
+                              Tensor<long, 1, true>& indices,
+                              Tensor<uint8_t, 1, true>& bitset);
+

     /// Find the approximate k nearest neigbors for `queries` against
     /// our database
     void query(Tensor<float, 2, true>& queries,
+               Tensor<uint8_t, 1, true>& bitset,
                int nprobe,
                int k,
                Tensor<float, 2, true>& outDistances,
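Note: in the IVF hunks shown, the bitset reaches only the coarse-quantizer probe: `classifyAndAddVectors` and `query` hand it to `quantizer_->query(...)` (a `FlatIndex`), and the inverted-list scan calls are untouched, so in this commit per-vector filtering is effectively wired through the flat/brute-force path only. The add path is fully visible in the diff and is behavior-preserving, since the bitset it passes is the empty one created just above:

    // Coarse assignment of new vectors now goes through the bitset-aware
    // FlatIndex::query; the empty bitset keeps assignment unchanged.
    quantizer_->query(vecs, bitset, 1, listDistance2d, listIds2d, false);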
@@ -110,7 +110,8 @@ IVFPQ::setPrecomputedCodes(bool enable) {

 int
 IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
-                             Tensor<long, 1, true>& indices) {
+                             Tensor<long, 1, true>& indices,
+                             Tensor<uint8_t, 1, true>& bitset) {
     FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0));
     FAISS_ASSERT(vecs.getSize(1) == dim_);

@@ -128,7 +129,7 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
     DeviceTensor<int, 2, true> listIds2d(mem, {vecs.getSize(0), 1}, stream);
     auto listIds = listIds2d.view<1>({vecs.getSize(0)});

-    quantizer_->query(vecs, 1, listDistance, listIds2d, false);
+    quantizer_->query(vecs, bitset, 1, listDistance, listIds2d, false);

     // Copy the lists that we wish to append to back to the CPU
     // FIXME: really this can be into pinned memory and a true async
@@ -184,6 +185,7 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
                   nullptr, // no precomputed norms
                   residualsTransposeView,
                   true, // residualsTransposeView is row major
+                  bitset,
                   1,
                   closestSubQDistanceView,
                   closestSubQIndexView,
@@ -506,6 +508,7 @@ IVFPQ::precomputeCodes_() {

 void
 IVFPQ::query(Tensor<float, 2, true>& queries,
+             Tensor<uint8_t, 1, true>& bitset,
              int nprobe,
              int k,
              Tensor<float, 2, true>& outDistances,
@@ -531,6 +534,7 @@ IVFPQ::query(Tensor<float, 2, true>& queries,
     // Find the `nprobe` closest coarse centroids; we can use int
     // indices both internally and externally
     quantizer_->query(queries,
+                      bitset,
                       nprobe,
                       coarseDistances,
                       coarseIndices,
@@ -52,11 +52,13 @@ class IVFPQ : public IVFBase {
     /// Returns the number of vectors successfully added. Vectors may
     /// not be able to be added because they contain NaNs.
     int classifyAndAddVectors(Tensor<float, 2, true>& vecs,
-                              Tensor<long, 1, true>& indices);
+                              Tensor<long, 1, true>& indices,
+                              Tensor<uint8_t, 1, true>& bitset);

     /// Find the approximate k nearest neigbors for `queries` against
     /// our database
     void query(Tensor<float, 2, true>& queries,
+               Tensor<uint8_t, 1, true>& bitset,
                int nprobe,
                int k,
                Tensor<float, 2, true>& outDistances,
@@ -159,9 +159,160 @@ __global__ void l2SelectMinK(Tensor<T, 2, true> productDistances,
     }
 }

+// With bitset included
+// L2 + select kernel for k == 1, implements re-use of ||c||^2
+template <typename T, int kRowsPerBlock, int kBlockSize>
+__global__ void l2SelectMin1(Tensor<T, 2, true> productDistances,
+                             Tensor<T, 1, true> centroidDistances,
+                             Tensor<uint8_t, 1, true> bitset,
+                             Tensor<T, 2, true> outDistances,
+                             Tensor<int, 2, true> outIndices) {
+    // Each block handles kRowsPerBlock rows of the distances (results)
+    Pair<T, int> threadMin[kRowsPerBlock];
+    __shared__ Pair<T, int> blockMin[kRowsPerBlock * (kBlockSize / kWarpSize)];
+
+    T distance[kRowsPerBlock];
+
+#pragma unroll
+    for (int i = 0; i < kRowsPerBlock; ++i) {
+        threadMin[i].k = Limits<T>::getMax();
+        threadMin[i].v = -1;
+    }
+
+    // blockIdx.x: which chunk of rows we are responsible for updating
+    int rowStart = blockIdx.x * kRowsPerBlock;
+
+    // FIXME: if we have exact multiples, don't need this
+    bool endRow = (blockIdx.x == gridDim.x - 1);
+
+    if (endRow) {
+        if (productDistances.getSize(0) % kRowsPerBlock == 0) {
+            endRow = false;
+        }
+    }
+
+    if (endRow) {
+        for (int row = rowStart; row < productDistances.getSize(0); ++row) {
+            for (int col = threadIdx.x; col < productDistances.getSize(1);
+                 col += blockDim.x) {
+                if (!(bitset[col >> 3] & (0x1 << (col & 0x7)))) {
+                    distance[0] = Math<T>::add(centroidDistances[col],
+                                               productDistances[row][col]);
+
+                    if (Math<T>::lt(distance[0], threadMin[0].k)) {
+                        threadMin[0].k = distance[0];
+                        threadMin[0].v = col;
+                    }
+                }
+            }
+
+            // Reduce within the block
+            threadMin[0] =
+                blockReduceAll<Pair<T, int>, Min<Pair<T, int> >, false, false>(
+                    threadMin[0], Min<Pair<T, int> >(), blockMin);
+
+            if (threadIdx.x == 0) {
+                outDistances[row][0] = threadMin[0].k;
+                outIndices[row][0] = threadMin[0].v;
+            }
+
+            // so we can use the shared memory again
+            __syncthreads();
+
+            threadMin[0].k = Limits<T>::getMax();
+            threadMin[0].v = -1;
+        }
+    } else {
+        for (int col = threadIdx.x; col < productDistances.getSize(1);
+             col += blockDim.x) {
+            T centroidDistance = centroidDistances[col];
+
+#pragma unroll
+            for (int row = 0; row < kRowsPerBlock; ++row) {
+                distance[row] = productDistances[rowStart + row][col];
+            }
+
+#pragma unroll
+            for (int row = 0; row < kRowsPerBlock; ++row) {
+                distance[row] = Math<T>::add(distance[row], centroidDistance);
+            }
+
+#pragma unroll
+            for (int row = 0; row < kRowsPerBlock; ++row) {
+                if (Math<T>::lt(distance[row], threadMin[row].k)) {
+                    threadMin[row].k = distance[row];
+                    threadMin[row].v = col;
+                }
+            }
+        }
+
+        // Reduce within the block
+        blockReduceAll<kRowsPerBlock, Pair<T, int>, Min<Pair<T, int> >, false, false>(
+            threadMin, Min<Pair<T, int> >(), blockMin);
+
+        if (threadIdx.x == 0) {
+#pragma unroll
+            for (int row = 0; row < kRowsPerBlock; ++row) {
+                outDistances[rowStart + row][0] = threadMin[row].k;
+                outIndices[rowStart + row][0] = threadMin[row].v;
+            }
+        }
+    }
+}
+
+// With bitset included
+// L2 + select kernel for k > 1, no re-use of ||c||^2
+template <typename T, int NumWarpQ, int NumThreadQ, int ThreadsPerBlock>
+__global__ void l2SelectMinK(Tensor<T, 2, true> productDistances,
+                             Tensor<T, 1, true> centroidDistances,
+                             Tensor<uint8_t, 1, true> bitset,
+                             Tensor<T, 2, true> outDistances,
+                             Tensor<int, 2, true> outIndices,
+                             int k, T initK) {
+    // Each block handles a single row of the distances (results)
+    constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
+
+    __shared__ T smemK[kNumWarps * NumWarpQ];
+    __shared__ int smemV[kNumWarps * NumWarpQ];
+
+    BlockSelect<T, int, false, Comparator<T>,
+                NumWarpQ, NumThreadQ, ThreadsPerBlock>
+        heap(initK, -1, smemK, smemV, k);
+
+    int row = blockIdx.x;
+
+    // Whole warps must participate in the selection
+    int limit = utils::roundDown(productDistances.getSize(1), kWarpSize);
+    int i = threadIdx.x;
+
+    for (; i < limit; i += blockDim.x) {
+        if (!(bitset[i >> 3] & (0x1 << (i & 0x7)))) {
+            T v = Math<T>::add(centroidDistances[i],
+                               productDistances[row][i]);
+            heap.add(v, i);
+        }
+    }
+
+    if (i < productDistances.getSize(1)) {
+        if (!(bitset[i >> 3] & (0x1 << (i & 0x7)))) {
+            T v = Math<T>::add(centroidDistances[i],
+                               productDistances[row][i]);
+            heap.addThreadQ(v, i);
+        }
+    }
+
+    heap.reduce();
+    for (int i = threadIdx.x; i < k; i += blockDim.x) {
+        outDistances[row][i] = smemK[i];
+        outIndices[row][i] = smemV[i];
+    }
+}
+
 template <typename T>
 void runL2SelectMin(Tensor<T, 2, true>& productDistances,
                     Tensor<T, 1, true>& centroidDistances,
+                    Tensor<uint8_t, 1, true>& bitset,
                     Tensor<T, 2, true>& outDistances,
                     Tensor<int, 2, true>& outIndices,
                     int k,
@@ -181,7 +332,7 @@ void runL2SelectMin(Tensor<T, 2, true>& productDistances,
         auto grid = dim3(utils::divUp(outDistances.getSize(0), kRowsPerBlock));

         l2SelectMin1<T, kRowsPerBlock, kThreadsPerBlock>
-            <<<grid, block, 0, stream>>>(productDistances, centroidDistances,
+            <<<grid, block, 0, stream>>>(productDistances, centroidDistances, bitset,
                                          outDistances, outIndices);
     } else {
         auto grid = dim3(outDistances.getSize(0));
@@ -194,28 +345,63 @@ void runL2SelectMin(Tensor<T, 2, true>& productDistances,
                                          k, Limits<T>::getMax()); \
         } while (0)

-        // block size 128 for everything <= 1024
-        if (k <= 32) {
-            RUN_L2_SELECT(128, 32, 2);
-        } else if (k <= 64) {
-            RUN_L2_SELECT(128, 64, 3);
-        } else if (k <= 128) {
-            RUN_L2_SELECT(128, 128, 3);
-        } else if (k <= 256) {
-            RUN_L2_SELECT(128, 256, 4);
-        } else if (k <= 512) {
-            RUN_L2_SELECT(128, 512, 8);
-        } else if (k <= 1024) {
-            RUN_L2_SELECT(128, 1024, 8);
-
-#if GPU_MAX_SELECTION_K >= 2048
-        } else if (k <= 2048) {
-            // smaller block for less shared memory
-            RUN_L2_SELECT(64, 2048, 8);
-#endif
-
-        } else {
-            FAISS_ASSERT(false);
-        }
+#define RUN_L2_SELECT_BITSET(BLOCK, NUM_WARP_Q, NUM_THREAD_Q)           \
+        do {                                                            \
+            l2SelectMinK<T, NUM_WARP_Q, NUM_THREAD_Q, BLOCK>            \
+                <<<grid, BLOCK, 0, stream>>>(productDistances,          \
+                                             centroidDistances,         \
+                                             bitset, outDistances,      \
+                                             outIndices,                \
+                                             k, Limits<T>::getMax());   \
+        } while (0)
+
+        if (bitset.getSize(0) == 0) {
+            // block size 128 for everything <= 1024
+            if (k <= 32) {
+                RUN_L2_SELECT(128, 32, 2);
+            } else if (k <= 64) {
+                RUN_L2_SELECT(128, 64, 3);
+            } else if (k <= 128) {
+                RUN_L2_SELECT(128, 128, 3);
+            } else if (k <= 256) {
+                RUN_L2_SELECT(128, 256, 4);
+            } else if (k <= 512) {
+                RUN_L2_SELECT(128, 512, 8);
+            } else if (k <= 1024) {
+                RUN_L2_SELECT(128, 1024, 8);
+
+#if GPU_MAX_SELECTION_K >= 2048
+            } else if (k <= 2048) {
+                // smaller block for less shared memory
+                RUN_L2_SELECT(64, 2048, 8);
+#endif
+
+            } else {
+                FAISS_ASSERT(false);
+            }
+        } else {
+            // With bitset
+            if (k <= 32) {
+                RUN_L2_SELECT_BITSET(128, 32, 2);
+            } else if (k <= 64) {
+                RUN_L2_SELECT_BITSET(128, 64, 3);
+            } else if (k <= 128) {
+                RUN_L2_SELECT_BITSET(128, 128, 3);
+            } else if (k <= 256) {
+                RUN_L2_SELECT_BITSET(128, 256, 4);
+            } else if (k <= 512) {
+                RUN_L2_SELECT_BITSET(128, 512, 8);
+            } else if (k <= 1024) {
+                RUN_L2_SELECT_BITSET(128, 1024, 8);
+
+#if GPU_MAX_SELECTION_K >= 2048
+            } else if (k <= 2048) {
+                // smaller block for less shared memory
+                RUN_L2_SELECT_BITSET(64, 2048, 8);
+#endif
+
+            } else {
+                FAISS_ASSERT(false);
+            }
+        }
     }
 }
@@ -224,12 +410,14 @@ void runL2SelectMin(Tensor<T, 2, true>& productDistances,

 void runL2SelectMin(Tensor<float, 2, true>& productDistances,
                     Tensor<float, 1, true>& centroidDistances,
+                    Tensor<uint8_t, 1, true>& bitset,
                     Tensor<float, 2, true>& outDistances,
                     Tensor<int, 2, true>& outIndices,
                     int k,
                     cudaStream_t stream) {
     runL2SelectMin<float>(productDistances,
                           centroidDistances,
+                          bitset,
                           outDistances,
                           outIndices,
                           k,
@@ -238,12 +426,14 @@ void runL2SelectMin(Tensor<float, 2, true>& productDistances,

 void runL2SelectMin(Tensor<half, 2, true>& productDistances,
                     Tensor<half, 1, true>& centroidDistances,
+                    Tensor<uint8_t, 1, true>& bitset,
                     Tensor<half, 2, true>& outDistances,
                     Tensor<int, 2, true>& outIndices,
                     int k,
                     cudaStream_t stream) {
     runL2SelectMin<half>(productDistances,
                          centroidDistances,
+                         bitset,
                          outDistances,
                          outIndices,
                          k,
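Note, two details of the new kernels: the column loop still runs up to `limit = roundDown(cols, kWarpSize)` for every thread because whole warps must participate in the warp-wide selection, with only the `heap.add` call guarded by the bit test; and because filtered columns are never pushed, a row with fewer than `k` surviving vectors keeps the heap's initial values in its trailing slots, distance `Limits<T>::getMax()` and index `-1` (see `heap(initK, -1, smemK, smemV, k)`). Host code can treat those as padding, sketched here:

    // After a filtered k-select, unfilled result slots keep the sentinels.
    for (int j = 0; j < k; ++j) {
        if (outIndices[row][j] == -1) {
            // fewer than k vectors survived the bitset for this row
        }
    }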
@@ -26,4 +26,20 @@ void runL2SelectMin(Tensor<half, 2, true>& productDistances,
                     int k,
                     cudaStream_t stream);

+void runL2SelectMin(Tensor<float, 2, true>& productDistances,
+                    Tensor<float, 1, true>& centroidDistances,
+                    Tensor<uint8_t, 1, true>& bitset,
+                    Tensor<float, 2, true>& outDistances,
+                    Tensor<int, 2, true>& outIndices,
+                    int k,
+                    cudaStream_t stream);
+
+void runL2SelectMin(Tensor<half, 2, true>& productDistances,
+                    Tensor<half, 1, true>& centroidDistances,
+                    Tensor<uint8_t, 1, true>& bitset,
+                    Tensor<half, 2, true>& outDistances,
+                    Tensor<int, 2, true>& outIndices,
+                    int k,
+                    cudaStream_t stream);
+
 } } // namespace
@@ -51,6 +51,7 @@ int main(int argc, char** argv) {
     limitK = GPU_MAX_SELECTION_K;
   }

+  faiss::gpu::DeviceTensor<uint8_t, 1, true> bitset(nullptr, {0});
   for (int k = startK; k <= limitK; k *= 2) {
     faiss::gpu::DeviceTensor<float, 2, true> gpuOutVal({FLAGS_rows, k});
     faiss::gpu::DeviceTensor<int, 2, true> gpuOutInd({FLAGS_rows, k});

@@ -60,7 +61,7 @@ int main(int argc, char** argv) {
       faiss::gpu::runWarpSelect(gpuVal, gpuOutVal, gpuOutInd,
                                 FLAGS_dir, k, 0);
     } else {
-      faiss::gpu::runBlockSelect(gpuVal, gpuOutVal, gpuOutInd,
+      faiss::gpu::runBlockSelect(gpuVal, bitset, gpuOutVal, gpuOutInd,
                                  FLAGS_dir, k, 0);
     }
   }
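The benchmark passes an empty bitset, so it measures the unfiltered path. To exercise filtering, a caller would set the bits of the deleted ids before uploading the bytes to the device; a hypothetical helper (ours, not part of this change):

#include <cstdint>
#include <initializer_list>
#include <vector>

// Build a host-side bitset of (numIds + 7) / 8 bytes with the given ids
// marked deleted; the bytes would then be copied into a
// DeviceTensor<uint8_t, 1, true> before calling runBlockSelect.
std::vector<uint8_t> makeBitset(int numIds, std::initializer_list<int> deleted) {
  std::vector<uint8_t> bits((numIds + 7) / 8, 0);
  for (int id : deleted) {
    bits[id >> 3] |= uint8_t(0x1 << (id & 0x7));
  }
  return bits;
}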
@@ -29,6 +29,8 @@ void testForSize(int rows, int cols, int k, bool dir, bool warp) {
     }
   }

+  faiss::gpu::DeviceTensor<uint8_t, 1, true> bitset(nullptr, {0});
+
   // row -> (val -> idx)
   std::unordered_map<int, std::vector<std::pair<int, float>>> hostOutValAndInd;
   for (int r = 0; r < rows; ++r) {

@@ -59,7 +61,8 @@ void testForSize(int rows, int cols, int k, bool dir, bool warp) {
     if (warp) {
       faiss::gpu::runWarpSelect(gpuVal, gpuOutVal, gpuOutInd, dir, k, 0);
     } else {
-      faiss::gpu::runBlockSelect(gpuVal, gpuOutVal, gpuOutInd, dir, k, 0);
+      faiss::gpu::runBlockSelect(gpuVal, bitset, gpuOutVal, gpuOutInd, dir, k, 0);
     }

     // Copy back to CPU
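Note that the test also constructs a zero-length bitset, so the block-select results are still expected to match the unfiltered host reference exactly; the filtered code path only diverges once bits are actually set.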
@@ -43,6 +43,7 @@ BLOCK_SELECT_DECL(float, false, 2048);
 #endif

 void runBlockSelect(Tensor<float, 2, true>& in,
+                    Tensor<uint8_t, 1, true>& bitset,
                     Tensor<float, 2, true>& outK,
                     Tensor<int, 2, true>& outV,
                     bool dir, int k, cudaStream_t stream) {

@@ -93,6 +94,7 @@ void runBlockSelect(Tensor<float, 2, true>& in,

 void runBlockSelectPair(Tensor<float, 2, true>& inK,
                         Tensor<int, 2, true>& inV,
+                        Tensor<uint8_t, 1, true>& bitset,
                         Tensor<float, 2, true>& outK,
                         Tensor<int, 2, true>& outV,
                         bool dir, int k, cudaStream_t stream) {
@@ -43,6 +43,7 @@ BLOCK_SELECT_DECL(half, false, 2048);
 #endif

 void runBlockSelect(Tensor<half, 2, true>& in,
+                    Tensor<uint8_t, 1, true>& bitset,
                     Tensor<half, 2, true>& outK,
                     Tensor<int, 2, true>& outV,
                     bool dir, int k, cudaStream_t stream) {

@@ -93,6 +94,7 @@ void runBlockSelect(Tensor<half, 2, true>& in,

 void runBlockSelectPair(Tensor<half, 2, true>& inK,
                         Tensor<int, 2, true>& inV,
+                        Tensor<uint8_t, 1, true>& bitset,
                         Tensor<half, 2, true>& outK,
                         Tensor<int, 2, true>& outV,
                         bool dir, int k, cudaStream_t stream) {
@@ -110,24 +110,138 @@ __global__ void blockSelectPair(Tensor<K, 2, true> inK,
   }
 }

+// Bitset included
+template <typename K,
+          typename IndexType,
+          bool Dir,
+          int NumWarpQ,
+          int NumThreadQ,
+          int ThreadsPerBlock>
+__global__ void blockSelect(Tensor<K, 2, true> in,
+                            Tensor<uint8_t, 1, true> bitset,
+                            Tensor<K, 2, true> outK,
+                            Tensor<IndexType, 2, true> outV,
+                            K initK,
+                            IndexType initV,
+                            int k) {
+  constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
+
+  __shared__ K smemK[kNumWarps * NumWarpQ];
+  __shared__ IndexType smemV[kNumWarps * NumWarpQ];
+
+  BlockSelect<K, IndexType, Dir, Comparator<K>,
+              NumWarpQ, NumThreadQ, ThreadsPerBlock>
+    heap(initK, initV, smemK, smemV, k);
+
+  // Grid is exactly sized to rows available
+  int row = blockIdx.x;
+
+  int i = threadIdx.x;
+  K* inStart = in[row][i].data();
+
+  // Whole warps must participate in the selection
+  int limit = utils::roundDown(in.getSize(1), kWarpSize);
+
+  for (; i < limit; i += ThreadsPerBlock) {
+    if (!(bitset[i >> 3] & (0x1 << (i & 0x7)))) {
+      heap.add(*inStart, (IndexType) i);
+    }
+    // The pointer must advance on every iteration, filtered or not,
+    // so that it stays in sync with i.
+    inStart += ThreadsPerBlock;
+  }
+
+  // Handle last remainder fraction of a warp of elements
+  if (i < in.getSize(1)) {
+    if (!(bitset[i >> 3] & (0x1 << (i & 0x7)))) {
+      heap.addThreadQ(*inStart, (IndexType) i);
+    }
+  }
+
+  heap.reduce();
+
+  for (int i = threadIdx.x; i < k; i += ThreadsPerBlock) {
+    outK[row][i] = smemK[i];
+    outV[row][i] = smemV[i];
+  }
+}
+
+template <typename K,
+          typename IndexType,
+          bool Dir,
+          int NumWarpQ,
+          int NumThreadQ,
+          int ThreadsPerBlock>
+__global__ void blockSelectPair(Tensor<K, 2, true> inK,
+                                Tensor<IndexType, 2, true> inV,
+                                Tensor<uint8_t, 1, true> bitset,
+                                Tensor<K, 2, true> outK,
+                                Tensor<IndexType, 2, true> outV,
+                                K initK,
+                                IndexType initV,
+                                int k) {
+  constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
+
+  __shared__ K smemK[kNumWarps * NumWarpQ];
+  __shared__ IndexType smemV[kNumWarps * NumWarpQ];
+
+  BlockSelect<K, IndexType, Dir, Comparator<K>,
+              NumWarpQ, NumThreadQ, ThreadsPerBlock>
+    heap(initK, initV, smemK, smemV, k);
+
+  // Grid is exactly sized to rows available
+  int row = blockIdx.x;
+
+  int i = threadIdx.x;
+  K* inKStart = inK[row][i].data();
+  IndexType* inVStart = inV[row][i].data();
+
+  // Whole warps must participate in the selection
+  int limit = utils::roundDown(inK.getSize(1), kWarpSize);
+
+  for (; i < limit; i += ThreadsPerBlock) {
+    if (!(bitset[i >> 3] & (0x1 << (i & 0x7)))) {
+      heap.add(*inKStart, *inVStart);
+    }
+    // Keep both input pointers in sync with i on every iteration.
+    inKStart += ThreadsPerBlock;
+    inVStart += ThreadsPerBlock;
+  }
+
+  // Handle last remainder fraction of a warp of elements
+  if (i < inK.getSize(1)) {
+    if (!(bitset[i >> 3] & (0x1 << (i & 0x7)))) {
+      heap.addThreadQ(*inKStart, *inVStart);
+    }
+  }
+
+  heap.reduce();
+
+  for (int i = threadIdx.x; i < k; i += ThreadsPerBlock) {
+    outK[row][i] = smemK[i];
+    outV[row][i] = smemV[i];
+  }
+}
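The two kernels above are the core of the change: each thread walks its strided slice of the row, tests the id's bit, and only feeds unfiltered elements into the k-selection heap. As a plain-C++ oracle for what they compute per row (our sketch, assuming dir == true selects the largest values, per the kInit initialization in the dispatch macros further down):

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Reference top-k over one row, skipping ids whose bitset bit is set.
// An empty bitset filters nothing, matching the GPU dispatch behavior.
std::vector<std::pair<float, int>>
referenceBlockSelect(const std::vector<float>& row,
                     const std::vector<uint8_t>& bitset,
                     bool dir, int k) {
  std::vector<std::pair<float, int>> kept;
  for (int i = 0; i < (int) row.size(); ++i) {
    bool filtered = !bitset.empty() &&
                    (bitset[i >> 3] & (0x1 << (i & 0x7)));
    if (!filtered) {
      kept.emplace_back(row[i], i);
    }
  }
  std::sort(kept.begin(), kept.end(),
            [dir](const std::pair<float, int>& a,
                  const std::pair<float, int>& b) {
              return dir ? a.first > b.first : a.first < b.first;
            });
  if ((int) kept.size() > k) {
    kept.resize(k);
  }
  return kept;
}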
 void runBlockSelect(Tensor<float, 2, true>& in,
+                    Tensor<uint8_t, 1, true>& bitset,
                     Tensor<float, 2, true>& outKeys,
                     Tensor<int, 2, true>& outIndices,
                     bool dir, int k, cudaStream_t stream);

 void runBlockSelectPair(Tensor<float, 2, true>& inKeys,
                         Tensor<int, 2, true>& inIndices,
+                        Tensor<uint8_t, 1, true>& bitset,
                         Tensor<float, 2, true>& outKeys,
                         Tensor<int, 2, true>& outIndices,
                         bool dir, int k, cudaStream_t stream);

 void runBlockSelect(Tensor<half, 2, true>& in,
+                    Tensor<uint8_t, 1, true>& bitset,
                     Tensor<half, 2, true>& outKeys,
                     Tensor<int, 2, true>& outIndices,
                     bool dir, int k, cudaStream_t stream);

 void runBlockSelectPair(Tensor<half, 2, true>& inKeys,
                         Tensor<int, 2, true>& inIndices,
+                        Tensor<uint8_t, 1, true>& bitset,
                         Tensor<half, 2, true>& outKeys,
                         Tensor<int, 2, true>& outIndices,
                         bool dir, int k, cudaStream_t stream);
@@ -13,6 +13,7 @@
 #define BLOCK_SELECT_DECL(TYPE, DIR, WARP_Q) \
   extern void runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \
       Tensor<TYPE, 2, true>& in, \
+      Tensor<uint8_t, 1, true>& bitset, \
       Tensor<TYPE, 2, true>& outK, \
       Tensor<int, 2, true>& outV, \
       bool dir, \

@@ -22,15 +23,17 @@
   extern void runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \
       Tensor<TYPE, 2, true>& inK, \
       Tensor<int, 2, true>& inV, \
+      Tensor<uint8_t, 1, true>& bitset, \
       Tensor<TYPE, 2, true>& outK, \
       Tensor<int, 2, true>& outV, \
       bool dir, \
       int k, \
-      cudaStream_t stream)
+      cudaStream_t stream);
 #define BLOCK_SELECT_IMPL(TYPE, DIR, WARP_Q, THREAD_Q) \
   void runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \
       Tensor<TYPE, 2, true>& in, \
+      Tensor<uint8_t, 1, true>& bitset, \
       Tensor<TYPE, 2, true>& outK, \
       Tensor<int, 2, true>& outV, \
       bool dir, \

@@ -52,14 +55,19 @@
     auto kInit = dir ? Limits<TYPE>::getMin() : Limits<TYPE>::getMax(); \
     auto vInit = -1; \
 \
-    blockSelect<TYPE, int, DIR, WARP_Q, THREAD_Q, kBlockSelectNumThreads> \
-      <<<grid, block, 0, stream>>>(in, outK, outV, kInit, vInit, k); \
+    if (bitset.getSize(0) == 0) \
+      blockSelect<TYPE, int, DIR, WARP_Q, THREAD_Q, kBlockSelectNumThreads> \
+        <<<grid, block, 0, stream>>>(in, outK, outV, kInit, vInit, k); \
+    else \
+      blockSelect<TYPE, int, DIR, WARP_Q, THREAD_Q, kBlockSelectNumThreads> \
+        <<<grid, block, 0, stream>>>(in, bitset, outK, outV, kInit, vInit, k); \
     CUDA_TEST_ERROR(); \
   } \
 \
   void runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \
       Tensor<TYPE, 2, true>& inK, \
       Tensor<int, 2, true>& inV, \
+      Tensor<uint8_t, 1, true>& bitset, \
       Tensor<TYPE, 2, true>& outK, \
       Tensor<int, 2, true>& outV, \
       bool dir, \

@@ -79,16 +87,20 @@
     auto kInit = dir ? Limits<TYPE>::getMin() : Limits<TYPE>::getMax(); \
     auto vInit = -1; \
 \
-    blockSelectPair<TYPE, int, DIR, WARP_Q, THREAD_Q, kBlockSelectNumThreads> \
-      <<<grid, block, 0, stream>>>(inK, inV, outK, outV, kInit, vInit, k); \
+    if (bitset.getSize(0) == 0) \
+      blockSelectPair<TYPE, int, DIR, WARP_Q, THREAD_Q, kBlockSelectNumThreads> \
+        <<<grid, block, 0, stream>>>(inK, inV, outK, outV, kInit, vInit, k); \
+    else \
+      blockSelectPair<TYPE, int, DIR, WARP_Q, THREAD_Q, kBlockSelectNumThreads> \
+        <<<grid, block, 0, stream>>>(inK, inV, bitset, outK, outV, kInit, vInit, k); \
     CUDA_TEST_ERROR(); \
   }
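Design note: gating on bitset.getSize(0) == 0 keeps the original kernel launch as the fast path, so callers without deletions (every existing call site, which now passes an empty bitset) see no extra per-element work; only searches that actually carry deletions pay for the bit test.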
 #define BLOCK_SELECT_CALL(TYPE, DIR, WARP_Q) \
   runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \
-      in, outK, outV, dir, k, stream)
+      in, bitset, outK, outV, dir, k, stream)

 #define BLOCK_SELECT_PAIR_CALL(TYPE, DIR, WARP_Q) \
   runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \
-      inK, inV, outK, outV, dir, k, stream)
+      inK, inV, bitset, outK, outV, dir, k, stream)