Modify faiss files to reduce GPU usage in 0.10.4 (#3893)

* Modify faiss files to reduce GPU usage in 0.10.4

Signed-off-by: lighteningzhang <lighteningzhang@163.com>

* Reformat the code and release the memory of host-resident arrays

Signed-off-by: lighteningzhang <lighteningzhang@163.com>

* Reformat the code, release the memory of host-resident arrays, and fix some typos

Signed-off-by: lighteningzhang <lighteningzhang@163.com>

* Delete some blank lines

Signed-off-by: lighteningzhang <lighteningzhang@163.com>

* Temporarily comment out the delete calls in IVFFlat.cu

Signed-off-by: lighteningzhang <lighteningzhang@163.com>

* Uncomment some delete calls

Signed-off-by: lighteningzhang <lighteningzhang@163.com>

* Uncomment some delete calls

Signed-off-by: lighteningzhang <lighteningzhang@163.com>

* Use tensors instead of raw arrays in IVFFlat.cu

Signed-off-by: lighteningzhang <lighteningzhang@163.com>

* Fix some typos

Signed-off-by: lighteningzhang <lighteningzhang@163.com>
lighteningzhang 2020-09-28 17:58:32 +08:00 committed by GitHub
parent 7692a6d437
commit 237ce2aba3
9 changed files with 1452 additions and 529 deletions
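In short, the patch splits each IVF search over the nprobe coarse lists into tiles of at most 8 lists (nprobeTile = 8 in IVFFlat.cu), scans each tile on the GPU, copies that tile's top-k results back to the host, and merges them there into a running top-k, so only one tile's worth of results needs to live on the GPU at a time. Below is a minimal host-side sketch of that merge step; the function and variable names (mergeTileTopK, runningDist, runningInd) are illustrative only and are not part of the patch, which uses Usort and explicit merge loops in IVFFlat::query instead.

#include <utility>
#include <vector>

// Illustrative sketch only: merge one GPU tile's top-k results into a running
// 2*k-wide host buffer whose first k entries hold the best results so far.
// This mirrors the per-tile merge performed in IVFFlat::query further below.
void mergeTileTopK(std::vector<float>& runningDist, // size 2 * k
                   std::vector<int>& runningInd,    // size 2 * k
                   const float* tileDist,           // size k, current tile's top-k
                   const int* tileInd,
                   int k) {
  // Place the new tile's results after the current best k.
  for (int m = 0; m < k; ++m) {
    runningDist[k + m] = tileDist[m];
    runningInd[k + m] = tileInd[m];
  }
  // Sort the combined 2*k candidates by ascending distance, keeping the
  // index array in sync; the first k entries are then the merged top-k.
  for (int a = 0; a < 2 * k; ++a) {
    for (int b = a + 1; b < 2 * k; ++b) {
      if (runningDist[b] < runningDist[a]) {
        std::swap(runningDist[a], runningDist[b]);
        std::swap(runningInd[a], runningInd[b]);
      }
    }
  }
}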


@@ -250,7 +250,7 @@ GpuIndexIVFFlat::searchImpl_(int n,
if (!bitset) {
auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);
index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels, distances, labels);
} else {
auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_,
const_cast<uint8_t*>(bitset->data()), stream,
@@ -262,3 +262,4 @@ GpuIndexIVFFlat::searchImpl_(int n,
} } // namespace


@@ -295,6 +295,331 @@ void runDistance(bool computeL2,
}
}
void runDist(bool computeL2,
GpuResources* resources,
Tensor<float, 2, true>& centroids,
bool centroidsRowMajor,
Tensor<float, 1, true>* centroidNorms,
Tensor<float, 2, true>& queries,
bool queriesRowMajor,
Tensor<uint8_t, 1, true>& bitset,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
float* hostOutDistances,
int* hostOutIndices,
int pos,
int curTile,
int nprobe,
bool ignoreOutDistances) {
k = curTile;
// The # of centroids in `centroids` based on memory layout
auto numCentroids = centroids.getSize(centroidsRowMajor ? 0 : 1);
// The # of queries in `queries` based on memory layout
auto numQueries = queries.getSize(queriesRowMajor ? 0 : 1);
// The dimensions of the vectors to consider
auto dim = queries.getSize(queriesRowMajor ? 1 : 0);
FAISS_ASSERT((numQueries == 0 || numCentroids == 0) ||
dim == centroids.getSize(centroidsRowMajor ? 1 : 0));
FAISS_ASSERT(outDistances.getSize(0) == numQueries);
FAISS_ASSERT(outIndices.getSize(0) == numQueries);
// FAISS_ASSERT(outDistances.getSize(1) == k);
// FAISS_ASSERT(outIndices.getSize(1) == k);
auto& mem = resources->getMemoryManagerCurrentDevice();
auto defaultStream = resources->getDefaultStreamCurrentDevice();
// If we're querying against a 0-sized set, just return empty results
if (centroids.numElements() == 0) {
thrust::fill(thrust::cuda::par.on(defaultStream),
outDistances.data(), outDistances.end(),
Limits<float>::getMax());
thrust::fill(thrust::cuda::par.on(defaultStream),
outIndices.data(), outIndices.end(),
-1);
return;
}
// L2: If ||c||^2 is not pre-computed, calculate it
DeviceTensor<float, 1, true> cNorms;
if (computeL2 && !centroidNorms) {
cNorms =
std::move(DeviceTensor<float, 1, true>(
mem, {numCentroids}, defaultStream));
runL2Norm(centroids, centroidsRowMajor, cNorms, true, defaultStream);
centroidNorms = &cNorms;
}
//
// Prepare norm vector ||q||^2; ||c||^2 is already pre-computed
//
int qNormSize[1] = {numQueries};
DeviceTensor<float, 1, true> queryNorms(mem, qNormSize, defaultStream);
// ||q||^2
if (computeL2) {
runL2Norm(queries, queriesRowMajor, queryNorms, true, defaultStream);
}
// By default, aim to use up to 512 MB of memory for the processing, with both
// number of queries and number of centroids being at least 512.
int tileRows = 0;
int tileCols = 0;
chooseTileSize(numQueries,
numCentroids,
dim,
sizeof(float),
mem.getSizeAvailable(),
tileRows,
tileCols);
int numColTiles = utils::divUp(numCentroids, tileCols);
// We can have any number of vectors to query against, even less than k, in
// which case we'll return -1 for the index
FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); // select limitation
// Temporary output memory space we'll use
DeviceTensor<float, 2, true> distanceBuf1(
mem, {tileRows, tileCols}, defaultStream);
DeviceTensor<float, 2, true> distanceBuf2(
mem, {tileRows, tileCols}, defaultStream);
DeviceTensor<float, 2, true>* distanceBufs[2] =
{&distanceBuf1, &distanceBuf2};
DeviceTensor<float, 2, true> outDistanceBuf1(
mem, {tileRows, numColTiles * k}, defaultStream);
DeviceTensor<float, 2, true> outDistanceBuf2(
mem, {tileRows, numColTiles * k}, defaultStream);
DeviceTensor<float, 2, true>* outDistanceBufs[2] =
{&outDistanceBuf1, &outDistanceBuf2};
DeviceTensor<int, 2, true> outIndexBuf1(
mem, {tileRows, numColTiles * k}, defaultStream);
DeviceTensor<int, 2, true> outIndexBuf2(
mem, {tileRows, numColTiles * k}, defaultStream);
DeviceTensor<int, 2, true>* outIndexBufs[2] =
{&outIndexBuf1, &outIndexBuf2};
auto streams = resources->getAlternateStreamsCurrentDevice();
streamWait(streams, {defaultStream});
int curStream = 0;
bool interrupt = false;
// Tile over the input queries
for (int i = 0; i < numQueries; i += tileRows) {
if (interrupt || InterruptCallback::is_interrupted()) {
interrupt = true;
break;
}
int curQuerySize = std::min(tileRows, numQueries - i);
auto outDistanceView =
outDistances.narrow(0, i, curQuerySize);
auto outIndexView =
outIndices.narrow(0, i, curQuerySize);
auto queryView =
queries.narrow(queriesRowMajor ? 0 : 1, i, curQuerySize);
auto queryNormNiew =
queryNorms.narrow(0, i, curQuerySize);
auto outDistanceBufRowView =
outDistanceBufs[curStream]->narrow(0, 0, curQuerySize);
auto outIndexBufRowView =
outIndexBufs[curStream]->narrow(0, 0, curQuerySize);
// Tile over the centroids
for (int j = 0; j < numCentroids; j += tileCols) {
if (InterruptCallback::is_interrupted()) {
interrupt = true;
break;
}
int curCentroidSize = std::min(tileCols, numCentroids - j);
int curColTile = j / tileCols;
auto centroidsView =
sliceCentroids(centroids, centroidsRowMajor, j, curCentroidSize);
auto distanceBufView = distanceBufs[curStream]->
narrow(0, 0, curQuerySize).narrow(1, 0, curCentroidSize);
auto outDistanceBufColView =
outDistanceBufRowView.narrow(1, k * curColTile, k);
auto outIndexBufColView =
outIndexBufRowView.narrow(1, k * curColTile, k);
// L2: distance is ||c||^2 - 2qc + ||q||^2, we compute -2qc
// IP: just compute qc
// (query id x dim) x (centroid id, dim)' = (query id, centroid id)
runMatrixMult(distanceBufView,
false, // not transposed
queryView,
!queriesRowMajor, // transposed MM if col major
centroidsView,
centroidsRowMajor, // transposed MM if row major
computeL2 ? -2.0f : 1.0f,
0.0f,
resources->getBlasHandleCurrentDevice(),
streams[curStream]);
if (computeL2) {
// For L2 distance, we use this fused kernel that performs both
// adding ||c||^2 to -2qc and k-selection, so we only need two
// passes (one write by the gemm, one read here) over the huge
// region of output memory
//
// If we aren't tiling along the number of centroids, we can perform the
// output work directly
if (tileCols == numCentroids) {
// Write into the final output
runL2SelMn(hostOutDistances,
hostOutIndices,
i,
curQuerySize,
pos,
nprobe,
distanceBufView,
*centroidNorms,
bitset,
outDistanceView,
outIndexView,
k,
streams[curStream]);
if (!ignoreOutDistances) {
// expand (query id) to (query id, k) by duplicating along rows
// top-k ||c||^2 - 2qc + ||q||^2 in the form (query id, k)
runSumAlongRows(queryNormNiew,
outDistanceView,
true, // L2 distances should not go below zero due
// to roundoff error
streams[curStream]);
}
} else {
auto centroidNormsView = centroidNorms->narrow(0, j, curCentroidSize);
// Write into our intermediate output
runL2SelectMin(distanceBufView,
centroidNormsView,
bitset,
outDistanceBufColView,
outIndexBufColView,
k,
streams[curStream]);
if (!ignoreOutDistances) {
// expand (query id) to (query id, k) by duplicating along rows
// top-k ||c||^2 - 2qc + ||q||^2 in the form (query id, k)
runSumAlongRows(queryNormNiew,
outDistanceBufColView,
true, // L2 distances should not go below zero due
// to roundoff error
streams[curStream]);
}
}
} else {
// For IP, just k-select the output for this tile
if (tileCols == numCentroids) {
// Write into the final output
runBlockSelect(distanceBufView,
bitset,
outDistanceView,
outIndexView,
true, k, streams[curStream]);
} else {
// Write into the intermediate output
runBlockSelect(distanceBufView,
bitset,
outDistanceBufColView,
outIndexBufColView,
true, k, streams[curStream]);
}
}
}
// As we're finished with processing a full set of centroids, perform the
// final k-selection
if (tileCols != numCentroids) {
// The indices are tile-relative; for each tile of k, we need to add
// tileCols to the index
runIncrementIndex(outIndexBufRowView, k, tileCols, streams[curStream]);
runBlockSelectPair(outDistanceBufRowView,
outIndexBufRowView,
bitset,
outDistanceView,
outIndexView,
computeL2 ? false : true, k, streams[curStream]);
}
curStream = (curStream + 1) % 2;
}
// Have the desired ordering stream wait on the multi-stream
streamWait({defaultStream}, streams);
if (interrupt) {
FAISS_THROW_MSG("interrupted");
}
}
void runL2Dist(GpuResources* resources,
Tensor<float, 2, true>& vectors,
bool vectorsRowMajor,
// can be optionally pre-computed; nullptr if we
// have to compute it upon the call
Tensor<float, 1, true>* vectorNorms,
Tensor<float, 2, true>& queries,
bool queriesRowMajor,
Tensor<uint8_t, 1, true>& bitset,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
float* hostOutDistances,
int* hostOutIndices,
int i,
int curTile,
int nprobe,
bool ignoreOutDistances
) {
runDist(true, // L2
resources,
vectors,
vectorsRowMajor,
vectorNorms,
queries,
queriesRowMajor,
bitset,
k,
outDistances,
outIndices,
hostOutDistances,
hostOutIndices,
i,
curTile,
nprobe,
ignoreOutDistances);
}
template <typename T>
void runL2Distance(GpuResources* resources,
Tensor<T, 2, true>& centroids,


@@ -35,6 +35,48 @@ void runL2Distance(GpuResources* resources,
// take shortcuts.
bool ignoreOutDistances = false);
void runL2Dist(GpuResources* resources,
Tensor<float, 2, true>& vectors,
bool vectorsRowMajor,
// can be optionally pre-computed; nullptr if we
// have to compute it upon the call
Tensor<float, 1, true>* vectorNorms,
Tensor<float, 2, true>& queries,
bool queriesRowMajor,
Tensor<uint8_t, 1, true>& bitset,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
float* hostOutDistances,
int* hostOutIndices,
int pos,
int curTile,
int nprobe,
bool ignoreOutDistances = false
);
void runDist(bool computeL2,
GpuResources* resources,
Tensor<float, 2, true>& centroids,
bool centroidsRowMajor,
Tensor<float, 1, true>* centroidNorms,
Tensor<float, 2, true>& queries,
bool queriesRowMajor,
Tensor<uint8_t, 1, true>& bitset,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
float* hostOutDistances,
int* hostOutIndices,
int i,
int curTile,
int nprobe,
bool ignoreOutDistances);
/// Calculates brute-force inner product distance between `vectors`
/// and `queries`, returning the k closest results seen
void runIPDistance(GpuResources* resources,
@@ -91,6 +133,7 @@ void bfKnnOnDevice(GpuResources* resources,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool ignoreOutDistances) {
// We are guaranteed that all data arguments are resident on our preferred
// `device` here, and are ordered wrt `stream`
@@ -213,4 +256,155 @@ void bfKnnOnDevice(GpuResources* resources,
}
template <typename T>
void bfKnnOnDev(GpuResources* resources,
int device,
cudaStream_t stream,
Tensor<T, 2, true>& vectors,
bool vectorsRowMajor,
Tensor<float, 1, true>* vectorNorms,
Tensor<T, 2, true>& queries,
bool queriesRowMajor,
Tensor<uint8_t, 1, true>& bitset,
int k,
faiss::MetricType metric,
float metricArg,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
float* outDis_h,
int* outInd_h,
int i,
int curTile,
int nprobe,
bool ignoreOutDistances) {
// We are guaranteed that all data arguments are resident on our preferred
// `device` here, and are ordered wrt `stream`
// L2 and IP are specialized to use GEMM and an optimized L2 + selection or
// pure k-selection kernel.
if ((metric == faiss::MetricType::METRIC_L2) ||
(metric == faiss::MetricType::METRIC_Lp &&
metricArg == 2)) {
runL2Dist(resources,
vectors,
vectorsRowMajor,
vectorNorms,
queries,
queriesRowMajor,
bitset,
k,
outDistances,
outIndices,
outDis_h,
outInd_h,
i,
curTile,
nprobe);
} else if (metric == faiss::MetricType::METRIC_INNER_PRODUCT) {
runIPDistance(resources,
vectors,
vectorsRowMajor,
queries,
queriesRowMajor,
bitset,
k,
outDistances,
outIndices);
} else {
//
// General pairwise distance kernel
//
// The general distance kernel does not have specializations for
// transpositions (NN, NT, TN); instead, the transposition is just handled
// upon data load for now, which could result in poor data loading behavior
// for NT / TN. This can be fixed at a later date if desired, but efficiency
// is low versus GEMM anyways.
//
Tensor<T, 2> tVectorsDimInnermost =
vectorsRowMajor ?
vectors.transposeInnermost(1) :
vectors.transposeInnermost(0);
Tensor<T, 2> tQueriesDimInnermost =
queriesRowMajor ?
queries.transposeInnermost(1) :
queries.transposeInnermost(0);
if ((metric == faiss::MetricType::METRIC_L1) ||
(metric == faiss::MetricType::METRIC_Lp &&
metricArg == 1)) {
runGeneralDistance(resources,
tVectorsDimInnermost,
tQueriesDimInnermost,
bitset,
k,
L1Distance(),
outDistances,
outIndices);
} else if (metric == faiss::MetricType::METRIC_Lp &&
metricArg == -1) {
// A way to test L2 distance
runGeneralDistance(resources,
tVectorsDimInnermost,
tQueriesDimInnermost,
bitset,
k,
L2Distance(),
outDistances,
outIndices);
} else if (metric == faiss::MetricType::METRIC_Lp) {
runGeneralDistance(resources,
tVectorsDimInnermost,
tQueriesDimInnermost,
bitset,
k,
LpDistance(metricArg),
outDistances,
outIndices);
} else if (metric == faiss::MetricType::METRIC_Linf) {
runGeneralDistance(resources,
tVectorsDimInnermost,
tQueriesDimInnermost,
bitset,
k,
LinfDistance(),
outDistances,
outIndices);
} else if (metric == faiss::MetricType::METRIC_Canberra) {
runGeneralDistance(resources,
tVectorsDimInnermost,
tQueriesDimInnermost,
bitset,
k,
CanberraDistance(),
outDistances,
outIndices);
} else if (metric == faiss::MetricType::METRIC_BrayCurtis) {
runGeneralDistance(resources,
tVectorsDimInnermost,
tQueriesDimInnermost,
bitset,
k,
BrayCurtisDistance(),
outDistances,
outIndices);
} else if (metric == faiss::MetricType::METRIC_JensenShannon) {
runGeneralDistance(resources,
tVectorsDimInnermost,
tQueriesDimInnermost,
bitset,
k,
JensenShannonDistance(),
outDistances,
outIndices);
} else {
FAISS_THROW_FMT("unsupported metric type %d", metric);
}
}
}
} } // namespace


@@ -1,388 +1,460 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/gpu/impl/FlatIndex.cuh>
#include <faiss/gpu/impl/Distance.cuh>
#include <faiss/gpu/impl/L2Norm.cuh>
#include <faiss/gpu/impl/VectorResidual.cuh>
#include <faiss/gpu/utils/ConversionOperators.cuh>
#include <faiss/gpu/utils/CopyUtils.cuh>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/Transpose.cuh>
namespace faiss { namespace gpu {
FlatIndex::FlatIndex(GpuResources* res,
int dim,
bool useFloat16,
bool storeTransposed,
MemorySpace space) :
resources_(res),
dim_(dim),
useFloat16_(useFloat16),
storeTransposed_(storeTransposed),
space_(space),
num_(0),
rawData_(space) {
#ifndef FAISS_USE_FLOAT16
FAISS_ASSERT(!useFloat16_);
#endif
}
bool
FlatIndex::getUseFloat16() const {
return useFloat16_;
}
/// Returns the number of vectors we contain
int FlatIndex::getSize() const {
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
return vectorsHalf_.getSize(0);
} else {
return vectors_.getSize(0);
}
#else
return vectors_.getSize(0);
#endif
}
int FlatIndex::getDim() const {
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
return vectorsHalf_.getSize(1);
} else {
return vectors_.getSize(1);
}
#else
return vectors_.getSize(1);
#endif
}
void
FlatIndex::reserve(size_t numVecs, cudaStream_t stream) {
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
rawData_.reserve(numVecs * dim_ * sizeof(half), stream);
} else {
rawData_.reserve(numVecs * dim_ * sizeof(float), stream);
}
#else
rawData_.reserve(numVecs * dim_ * sizeof(float), stream);
#endif
}
template <>
Tensor<float, 2, true>&
FlatIndex::getVectorsRef<float>() {
// Should not call this unless we are in float32 mode
FAISS_ASSERT(!useFloat16_);
return getVectorsFloat32Ref();
}
#ifdef FAISS_USE_FLOAT16
template <>
Tensor<half, 2, true>&
FlatIndex::getVectorsRef<half>() {
// Should not call this unless we are in float16 mode
FAISS_ASSERT(useFloat16_);
return getVectorsFloat16Ref();
}
#endif
Tensor<float, 2, true>&
FlatIndex::getVectorsFloat32Ref() {
// Should not call this unless we are in float32 mode
FAISS_ASSERT(!useFloat16_);
return vectors_;
}
#ifdef FAISS_USE_FLOAT16
Tensor<half, 2, true>&
FlatIndex::getVectorsFloat16Ref() {
// Should not call this unless we are in float16 mode
FAISS_ASSERT(useFloat16_);
return vectorsHalf_;
}
#endif
DeviceTensor<float, 2, true>
FlatIndex::getVectorsFloat32Copy(cudaStream_t stream) {
return getVectorsFloat32Copy(0, num_, stream);
}
DeviceTensor<float, 2, true>
FlatIndex::getVectorsFloat32Copy(int from, int num, cudaStream_t stream) {
DeviceTensor<float, 2, true> vecFloat32({num, dim_}, space_);
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
auto halfNarrow = vectorsHalf_.narrowOutermost(from, num);
convertTensor<half, float, 2>(stream, halfNarrow, vecFloat32);
} else {
vectors_.copyTo(vecFloat32, stream);
}
#else
vectors_.copyTo(vecFloat32, stream);
#endif
return vecFloat32;
}
void
FlatIndex::query(Tensor<float, 2, true>& input,
Tensor<uint8_t, 1, true>& bitset,
int k,
faiss::MetricType metric,
float metricArg,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance) {
auto stream = resources_->getDefaultStreamCurrentDevice();
auto& mem = resources_->getMemoryManagerCurrentDevice();
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
// We need to convert the input to float16 for comparison to ourselves
auto inputHalf =
convertTensor<float, half, 2>(resources_, stream, input);
query(inputHalf, bitset, k, metric, metricArg,
outDistances, outIndices, exactDistance);
} else {
bfKnnOnDevice(resources_,
getCurrentDevice(),
stream,
storeTransposed_ ? vectorsTransposed_ : vectors_,
!storeTransposed_, // is vectors row major?
&norms_,
input,
true, // input is row major
bitset,
k,
metric,
metricArg,
outDistances,
outIndices,
!exactDistance);
}
#else
bfKnnOnDevice(resources_,
getCurrentDevice(),
stream,
storeTransposed_ ? vectorsTransposed_ : vectors_,
!storeTransposed_, // is vectors row major?
&norms_,
input,
true, // input is row major
bitset,
k,
metric,
metricArg,
outDistances,
outIndices,
!exactDistance);
#endif
}
#ifdef FAISS_USE_FLOAT16
void
FlatIndex::query(Tensor<half, 2, true>& input,
Tensor<uint8_t, 1, true>& bitset,
int k,
faiss::MetricType metric,
float metricArg,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance) {
FAISS_ASSERT(useFloat16_);
bfKnnOnDevice(resources_,
getCurrentDevice(),
resources_->getDefaultStreamCurrentDevice(),
storeTransposed_ ? vectorsHalfTransposed_ : vectorsHalf_,
!storeTransposed_, // is vectors row major?
&norms_,
input,
true, // input is row major
bitset,
k,
metric,
metricArg,
outDistances,
outIndices,
!exactDistance);
}
#endif
void
FlatIndex::computeResidual(Tensor<float, 2, true>& vecs,
Tensor<int, 1, true>& listIds,
Tensor<float, 2, true>& residuals) {
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
runCalcResidual(vecs,
getVectorsFloat16Ref(),
listIds,
residuals,
resources_->getDefaultStreamCurrentDevice());
} else {
runCalcResidual(vecs,
getVectorsFloat32Ref(),
listIds,
residuals,
resources_->getDefaultStreamCurrentDevice());
}
#else
runCalcResidual(vecs,
getVectorsFloat32Ref(),
listIds,
residuals,
resources_->getDefaultStreamCurrentDevice());
#endif
}
void
FlatIndex::reconstruct(Tensor<int, 1, true>& listIds,
Tensor<float, 2, true>& vecs) {
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
runReconstruct(listIds,
getVectorsFloat16Ref(),
vecs,
resources_->getDefaultStreamCurrentDevice());
} else {
runReconstruct(listIds,
getVectorsFloat32Ref(),
vecs,
resources_->getDefaultStreamCurrentDevice());
}
#else
runReconstruct(listIds,
getVectorsFloat32Ref(),
vecs,
resources_->getDefaultStreamCurrentDevice());
#endif
}
void
FlatIndex::reconstruct(Tensor<int, 2, true>& listIds,
Tensor<float, 3, true>& vecs) {
auto listIds1 = listIds.downcastOuter<1>();
auto vecs2 = vecs.downcastOuter<2>();
reconstruct(listIds1, vecs2);
}
void
FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) {
if (numVecs == 0) {
return;
}
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
// Make sure that `data` is on our device; we'll run the
// conversion on our device
auto devData = toDevice<float, 2>(resources_,
getCurrentDevice(),
(float*) data,
stream,
{numVecs, dim_});
auto devDataHalf =
convertTensor<float, half, 2>(resources_, stream, devData);
rawData_.append((char*) devDataHalf.data(),
devDataHalf.getSizeInBytes(),
stream,
true /* reserve exactly */);
} else {
rawData_.append((char*) data,
(size_t) dim_ * numVecs * sizeof(float),
stream,
true /* reserve exactly */);
}
#else
rawData_.append((char*) data,
(size_t) dim_ * numVecs * sizeof(float),
stream,
true /* reserve exactly */);
#endif
num_ += numVecs;
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
DeviceTensor<half, 2, true> vectorsHalf(
(half*) rawData_.data(), {(int) num_, dim_}, space_);
vectorsHalf_ = std::move(vectorsHalf);
} else {
DeviceTensor<float, 2, true> vectors(
(float*) rawData_.data(), {(int) num_, dim_}, space_);
vectors_ = std::move(vectors);
}
#else
DeviceTensor<float, 2, true> vectors(
(float*) rawData_.data(), {(int) num_, dim_}, space_);
vectors_ = std::move(vectors);
#endif
if (storeTransposed_) {
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
vectorsHalfTransposed_ =
std::move(DeviceTensor<half, 2, true>({dim_, (int) num_}, space_));
runTransposeAny(vectorsHalf_, 0, 1, vectorsHalfTransposed_, stream);
} else {
vectorsTransposed_ =
std::move(DeviceTensor<float, 2, true>({dim_, (int) num_}, space_));
runTransposeAny(vectors_, 0, 1, vectorsTransposed_, stream);
}
#else
vectorsTransposed_ =
std::move(DeviceTensor<float, 2, true>({dim_, (int) num_}, space_));
runTransposeAny(vectors_, 0, 1, vectorsTransposed_, stream);
#endif
}
// Precompute L2 norms of our database
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
DeviceTensor<float, 1, true> norms({(int) num_}, space_);
runL2Norm(vectorsHalf_, true, norms, true, stream);
norms_ = std::move(norms);
} else {
DeviceTensor<float, 1, true> norms({(int) num_}, space_);
runL2Norm(vectors_, true, norms, true, stream);
norms_ = std::move(norms);
}
#else
DeviceTensor<float, 1, true> norms({(int) num_}, space_);
runL2Norm(vectors_, true, norms, true, stream);
norms_ = std::move(norms);
#endif
}
void
FlatIndex::reset() {
rawData_.clear();
vectors_ = std::move(DeviceTensor<float, 2, true>());
vectorsTransposed_ = std::move(DeviceTensor<float, 2, true>());
#ifdef FAISS_USE_FLOAT16
vectorsHalf_ = std::move(DeviceTensor<half, 2, true>());
vectorsHalfTransposed_ = std::move(DeviceTensor<half, 2, true>());
#endif
norms_ = std::move(DeviceTensor<float, 1, true>());
num_ = 0;
}
} }
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/gpu/impl/FlatIndex.cuh>
#include <faiss/gpu/impl/Distance.cuh>
#include <faiss/gpu/impl/L2Norm.cuh>
#include <faiss/gpu/impl/VectorResidual.cuh>
#include <faiss/gpu/utils/ConversionOperators.cuh>
#include <faiss/gpu/utils/CopyUtils.cuh>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/Transpose.cuh>
namespace faiss { namespace gpu {
FlatIndex::FlatIndex(GpuResources* res,
int dim,
bool useFloat16,
bool storeTransposed,
MemorySpace space) :
resources_(res),
dim_(dim),
useFloat16_(useFloat16),
storeTransposed_(storeTransposed),
space_(space),
num_(0),
rawData_(space) {
#ifndef FAISS_USE_FLOAT16
FAISS_ASSERT(!useFloat16_);
#endif
}
bool
FlatIndex::getUseFloat16() const {
return useFloat16_;
}
/// Returns the number of vectors we contain
int FlatIndex::getSize() const {
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
return vectorsHalf_.getSize(0);
} else {
return vectors_.getSize(0);
}
#else
return vectors_.getSize(0);
#endif
}
int FlatIndex::getDim() const {
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
return vectorsHalf_.getSize(1);
} else {
return vectors_.getSize(1);
}
#else
return vectors_.getSize(1);
#endif
}
void
FlatIndex::reserve(size_t numVecs, cudaStream_t stream) {
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
rawData_.reserve(numVecs * dim_ * sizeof(half), stream);
} else {
rawData_.reserve(numVecs * dim_ * sizeof(float), stream);
}
#else
rawData_.reserve(numVecs * dim_ * sizeof(float), stream);
#endif
}
template <>
Tensor<float, 2, true>&
FlatIndex::getVectorsRef<float>() {
// Should not call this unless we are in float32 mode
FAISS_ASSERT(!useFloat16_);
return getVectorsFloat32Ref();
}
#ifdef FAISS_USE_FLOAT16
template <>
Tensor<half, 2, true>&
FlatIndex::getVectorsRef<half>() {
// Should not call this unless we are in float16 mode
FAISS_ASSERT(useFloat16_);
return getVectorsFloat16Ref();
}
#endif
Tensor<float, 2, true>&
FlatIndex::getVectorsFloat32Ref() {
// Should not call this unless we are in float32 mode
FAISS_ASSERT(!useFloat16_);
return vectors_;
}
#ifdef FAISS_USE_FLOAT16
Tensor<half, 2, true>&
FlatIndex::getVectorsFloat16Ref() {
// Should not call this unless we are in float16 mode
FAISS_ASSERT(useFloat16_);
return vectorsHalf_;
}
#endif
DeviceTensor<float, 2, true>
FlatIndex::getVectorsFloat32Copy(cudaStream_t stream) {
return getVectorsFloat32Copy(0, num_, stream);
}
DeviceTensor<float, 2, true>
FlatIndex::getVectorsFloat32Copy(int from, int num, cudaStream_t stream) {
DeviceTensor<float, 2, true> vecFloat32({num, dim_}, space_);
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
auto halfNarrow = vectorsHalf_.narrowOutermost(from, num);
convertTensor<half, float, 2>(stream, halfNarrow, vecFloat32);
} else {
vectors_.copyTo(vecFloat32, stream);
}
#else
vectors_.copyTo(vecFloat32, stream);
#endif
return vecFloat32;
}
void
FlatIndex::query(Tensor<float, 2, true>& input,
Tensor<uint8_t, 1, true>& bitset,
int k,
faiss::MetricType metric,
float metricArg,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance) {
auto stream = resources_->getDefaultStreamCurrentDevice();
auto& mem = resources_->getMemoryManagerCurrentDevice();
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
// We need to convert the input to float16 for comparison to ourselves
auto inputHalf =
convertTensor<float, half, 2>(resources_, stream, input);
query(inputHalf, bitset, k, metric, metricArg,
outDistances, outIndices, exactDistance);
} else {
bfKnnOnDevice(resources_,
getCurrentDevice(),
stream,
storeTransposed_ ? vectorsTransposed_ : vectors_,
!storeTransposed_, // is vectors row major?
&norms_,
input,
true, // input is row major
bitset,
k,
metric,
metricArg,
outDistances,
outIndices,
!exactDistance);
}
#else
bfKnnOnDevice(resources_,
getCurrentDevice(),
stream,
storeTransposed_ ? vectorsTransposed_ : vectors_,
!storeTransposed_, // is vectors row major?
&norms_,
input,
true, // input is row major
bitset,
k,
metric,
metricArg,
outDistances,
outIndices,
!exactDistance);
#endif
}
void
FlatIndex::query(Tensor<float, 2, true>& input,
Tensor<uint8_t, 1, true>& bitset,
int k,
faiss::MetricType metric,
float metricArg,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
float* hostOutDistances,
int* hostOutIndices,
int i,
int curTile,
int nprobe,
bool exactDistance) {
auto stream = resources_->getDefaultStreamCurrentDevice();
auto& mem = resources_->getMemoryManagerCurrentDevice();
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
// We need to convert the input to float16 for comparison to ourselves
auto inputHalf =
convertTensor<float, half, 2>(resources_, stream, input);
query(inputHalf, bitset, k, metric, metricArg,
outDistances, outIndices, exactDistance);
} else {
bfKnnOnDevice(resources_,
getCurrentDevice(),
stream,
storeTransposed_ ? vectorsTransposed_ : vectors_,
!storeTransposed_, // is vectors row major?
&norms_,
input,
true, // input is row major
bitset,
k,
metric,
metricArg,
outDistances,
outIndices,
!exactDistance);
}
#else
bfKnnOnDev(resources_,
getCurrentDevice(),
stream,
storeTransposed_ ? vectorsTransposed_ : vectors_,
!storeTransposed_, // is vectors row major?
&norms_,
input,
true, // input is row major
bitset,
k,
metric,
metricArg,
outDistances,
outIndices,
hostOutDistances,
hostOutIndices,
i,
curTile,
nprobe,
!exactDistance);
#endif
}
#ifdef FAISS_USE_FLOAT16
void
FlatIndex::query(Tensor<half, 2, true>& input,
Tensor<uint8_t, 1, true>& bitset,
int k,
faiss::MetricType metric,
float metricArg,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance) {
FAISS_ASSERT(useFloat16_);
bfKnnOnDevice(resources_,
getCurrentDevice(),
resources_->getDefaultStreamCurrentDevice(),
storeTransposed_ ? vectorsHalfTransposed_ : vectorsHalf_,
!storeTransposed_, // is vectors row major?
&norms_,
input,
true, // input is row major
bitset,
k,
metric,
metricArg,
outDistances,
outIndices,
!exactDistance);
}
#endif
void
FlatIndex::computeResidual(Tensor<float, 2, true>& vecs,
Tensor<int, 1, true>& listIds,
Tensor<float, 2, true>& residuals) {
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
runCalcResidual(vecs,
getVectorsFloat16Ref(),
listIds,
residuals,
resources_->getDefaultStreamCurrentDevice());
} else {
runCalcResidual(vecs,
getVectorsFloat32Ref(),
listIds,
residuals,
resources_->getDefaultStreamCurrentDevice());
}
#else
runCalcResidual(vecs,
getVectorsFloat32Ref(),
listIds,
residuals,
resources_->getDefaultStreamCurrentDevice());
#endif
}
void
FlatIndex::reconstruct(Tensor<int, 1, true>& listIds,
Tensor<float, 2, true>& vecs) {
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
runReconstruct(listIds,
getVectorsFloat16Ref(),
vecs,
resources_->getDefaultStreamCurrentDevice());
} else {
runReconstruct(listIds,
getVectorsFloat32Ref(),
vecs,
resources_->getDefaultStreamCurrentDevice());
}
#else
runReconstruct(listIds,
getVectorsFloat32Ref(),
vecs,
resources_->getDefaultStreamCurrentDevice());
#endif
}
void
FlatIndex::reconstruct(Tensor<int, 2, true>& listIds,
Tensor<float, 3, true>& vecs) {
auto listIds1 = listIds.downcastOuter<1>();
auto vecs2 = vecs.downcastOuter<2>();
reconstruct(listIds1, vecs2);
}
void
FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) {
if (numVecs == 0) {
return;
}
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
// Make sure that `data` is on our device; we'll run the
// conversion on our device
auto devData = toDevice<float, 2>(resources_,
getCurrentDevice(),
(float*) data,
stream,
{numVecs, dim_});
auto devDataHalf =
convertTensor<float, half, 2>(resources_, stream, devData);
rawData_.append((char*) devDataHalf.data(),
devDataHalf.getSizeInBytes(),
stream,
true /* reserve exactly */);
} else {
rawData_.append((char*) data,
(size_t) dim_ * numVecs * sizeof(float),
stream,
true /* reserve exactly */);
}
#else
rawData_.append((char*) data,
(size_t) dim_ * numVecs * sizeof(float),
stream,
true /* reserve exactly */);
#endif
num_ += numVecs;
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
DeviceTensor<half, 2, true> vectorsHalf(
(half*) rawData_.data(), {(int) num_, dim_}, space_);
vectorsHalf_ = std::move(vectorsHalf);
} else {
DeviceTensor<float, 2, true> vectors(
(float*) rawData_.data(), {(int) num_, dim_}, space_);
vectors_ = std::move(vectors);
}
#else
DeviceTensor<float, 2, true> vectors(
(float*) rawData_.data(), {(int) num_, dim_}, space_);
vectors_ = std::move(vectors);
#endif
if (storeTransposed_) {
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
vectorsHalfTransposed_ =
std::move(DeviceTensor<half, 2, true>({dim_, (int) num_}, space_));
runTransposeAny(vectorsHalf_, 0, 1, vectorsHalfTransposed_, stream);
} else {
vectorsTransposed_ =
std::move(DeviceTensor<float, 2, true>({dim_, (int) num_}, space_));
runTransposeAny(vectors_, 0, 1, vectorsTransposed_, stream);
}
#else
vectorsTransposed_ =
std::move(DeviceTensor<float, 2, true>({dim_, (int) num_}, space_));
runTransposeAny(vectors_, 0, 1, vectorsTransposed_, stream);
#endif
}
// Precompute L2 norms of our database
#ifdef FAISS_USE_FLOAT16
if (useFloat16_) {
DeviceTensor<float, 1, true> norms({(int) num_}, space_);
runL2Norm(vectorsHalf_, true, norms, true, stream);
norms_ = std::move(norms);
} else {
DeviceTensor<float, 1, true> norms({(int) num_}, space_);
runL2Norm(vectors_, true, norms, true, stream);
norms_ = std::move(norms);
}
#else
DeviceTensor<float, 1, true> norms({(int) num_}, space_);
runL2Norm(vectors_, true, norms, true, stream);
norms_ = std::move(norms);
#endif
}
void
FlatIndex::reset() {
rawData_.clear();
vectors_ = std::move(DeviceTensor<float, 2, true>());
vectorsTransposed_ = std::move(DeviceTensor<float, 2, true>());
#ifdef FAISS_USE_FLOAT16
vectorsHalf_ = std::move(DeviceTensor<half, 2, true>());
vectorsHalfTransposed_ = std::move(DeviceTensor<half, 2, true>());
#endif
norms_ = std::move(DeviceTensor<float, 1, true>());
num_ = 0;
}
} }


@@ -1,139 +1,153 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/MetricType.h>
#include <faiss/gpu/utils/DeviceTensor.cuh>
#include <faiss/gpu/utils/DeviceVector.cuh>
#include <faiss/gpu/utils/MemorySpace.h>
namespace faiss { namespace gpu {
class GpuResources;
/// Holder of GPU resources for a particular flat index
class FlatIndex {
public:
FlatIndex(GpuResources* res,
int dim,
bool useFloat16,
bool storeTransposed,
MemorySpace space);
/// Whether or not this flat index primarily stores data in float16
bool getUseFloat16() const;
/// Returns the number of vectors we contain
int getSize() const;
/// Returns the dimensionality of the vectors
int getDim() const;
/// Reserve storage that can contain at least this many vectors
void reserve(size_t numVecs, cudaStream_t stream);
/// Returns the vectors based on the type desired; the FlatIndex must be of
/// the same type (float16 or float32) to not assert
template <typename T>
Tensor<T, 2, true>& getVectorsRef();
/// Returns a reference to our vectors currently in use
Tensor<float, 2, true>& getVectorsFloat32Ref();
/// Returns a reference to our vectors currently in use (useFloat16 mode)
#ifdef FAISS_USE_FLOAT16
Tensor<half, 2, true>& getVectorsFloat16Ref();
#endif
/// Performs a copy of the vectors on the given device, converting
/// as needed from float16
DeviceTensor<float, 2, true> getVectorsFloat32Copy(cudaStream_t stream);
/// Returns only a subset of the vectors
DeviceTensor<float, 2, true> getVectorsFloat32Copy(int from,
int num,
cudaStream_t stream);
void query(Tensor<float, 2, true>& vecs,
Tensor<uint8_t, 1, true>& bitset,
int k,
faiss::MetricType metric,
float metricArg,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance);
#ifdef FAISS_USE_FLOAT16
void query(Tensor<half, 2, true>& vecs,
Tensor<uint8_t, 1, true>& bitset,
int k,
faiss::MetricType metric,
float metricArg,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance);
#endif
/// Compute residual for set of vectors
void computeResidual(Tensor<float, 2, true>& vecs,
Tensor<int, 1, true>& listIds,
Tensor<float, 2, true>& residuals);
/// Gather vectors given the set of IDs
void reconstruct(Tensor<int, 1, true>& listIds,
Tensor<float, 2, true>& vecs);
void reconstruct(Tensor<int, 2, true>& listIds,
Tensor<float, 3, true>& vecs);
/// Add vectors to ourselves; the pointer passed can be on the host
/// or the device
void add(const float* data, int numVecs, cudaStream_t stream);
/// Free all storage
void reset();
private:
/// Collection of GPU resources that we use
GpuResources* resources_;
/// Dimensionality of our vectors
const int dim_;
/// Float16 data format
const bool useFloat16_;
/// Store vectors in transposed layout for speed; makes addition to
/// the index slower
const bool storeTransposed_;
/// Memory space for our allocations
MemorySpace space_;
/// How many vectors we have
int num_;
/// The underlying expandable storage
DeviceVector<char> rawData_;
/// Vectors currently in rawData_
DeviceTensor<float, 2, true> vectors_;
DeviceTensor<float, 2, true> vectorsTransposed_;
/// Vectors currently in rawData_, float16 form
#ifdef FAISS_USE_FLOAT16
DeviceTensor<half, 2, true> vectorsHalf_;
DeviceTensor<half, 2, true> vectorsHalfTransposed_;
#endif
/// Precomputed L2 norms
DeviceTensor<float, 1, true> norms_;
};
} } // namespace
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/MetricType.h>
#include <faiss/gpu/utils/DeviceTensor.cuh>
#include <faiss/gpu/utils/DeviceVector.cuh>
#include <faiss/gpu/utils/MemorySpace.h>
namespace faiss { namespace gpu {
class GpuResources;
/// Holder of GPU resources for a particular flat index
class FlatIndex {
public:
FlatIndex(GpuResources* res,
int dim,
bool useFloat16,
bool storeTransposed,
MemorySpace space);
/// Whether or not this flat index primarily stores data in float16
bool getUseFloat16() const;
/// Returns the number of vectors we contain
int getSize() const;
/// Returns the dimensionality of the vectors
int getDim() const;
/// Reserve storage that can contain at least this many vectors
void reserve(size_t numVecs, cudaStream_t stream);
/// Returns the vectors based on the type desired; the FlatIndex must be of
/// the same type (float16 or float32) to not assert
template <typename T>
Tensor<T, 2, true>& getVectorsRef();
/// Returns a reference to our vectors currently in use
Tensor<float, 2, true>& getVectorsFloat32Ref();
/// Returns a reference to our vectors currently in use (useFloat16 mode)
#ifdef FAISS_USE_FLOAT16
Tensor<half, 2, true>& getVectorsFloat16Ref();
#endif
/// Performs a copy of the vectors on the given device, converting
/// as needed from float16
DeviceTensor<float, 2, true> getVectorsFloat32Copy(cudaStream_t stream);
/// Returns only a subset of the vectors
DeviceTensor<float, 2, true> getVectorsFloat32Copy(int from,
int num,
cudaStream_t stream);
void query(Tensor<float, 2, true>& vecs,
Tensor<uint8_t, 1, true>& bitset,
int k,
faiss::MetricType metric,
float metricArg,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
float* hostOutDistances,
int* hostOutIndices,
int i,
int curTile,
int nprobe,
bool exactDistance);
void query(Tensor<float, 2, true>& vecs,
Tensor<uint8_t, 1, true>& bitset,
int k,
faiss::MetricType metric,
float metricArg,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance);
#ifdef FAISS_USE_FLOAT16
void query(Tensor<half, 2, true>& vecs,
Tensor<uint8_t, 1, true>& bitset,
int k,
faiss::MetricType metric,
float metricArg,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
bool exactDistance);
#endif
/// Compute residual for set of vectors
void computeResidual(Tensor<float, 2, true>& vecs,
Tensor<int, 1, true>& listIds,
Tensor<float, 2, true>& residuals);
/// Gather vectors given the set of IDs
void reconstruct(Tensor<int, 1, true>& listIds,
Tensor<float, 2, true>& vecs);
void reconstruct(Tensor<int, 2, true>& listIds,
Tensor<float, 3, true>& vecs);
/// Add vectors to ourselves; the pointer passed can be on the host
/// or the device
void add(const float* data, int numVecs, cudaStream_t stream);
/// Free all storage
void reset();
private:
/// Collection of GPU resources that we use
GpuResources* resources_;
/// Dimensionality of our vectors
const int dim_;
/// Float16 data format
const bool useFloat16_;
/// Store vectors in transposed layout for speed; makes addition to
/// the index slower
const bool storeTransposed_;
/// Memory space for our allocations
MemorySpace space_;
/// How many vectors we have
int num_;
/// The underlying expandable storage
DeviceVector<char> rawData_;
/// Vectors currently in rawData_
DeviceTensor<float, 2, true> vectors_;
DeviceTensor<float, 2, true> vectorsTransposed_;
/// Vectors currently in rawData_, float16 form
#ifdef FAISS_USE_FLOAT16
DeviceTensor<half, 2, true> vectorsHalf_;
DeviceTensor<half, 2, true> vectorsHalfTransposed_;
#endif
/// Precomputed L2 norms
DeviceTensor<float, 1, true> norms_;
};
} } // namespace


@@ -25,6 +25,9 @@
#include <unordered_map>
#include <numeric>
#include <string.h>
#include <iostream>
namespace faiss { namespace gpu {
IVFFlat::IVFFlat(GpuResources* resources,
@@ -410,4 +413,158 @@ IVFFlat::query(Tensor<float, 2, true>& queries,
}
}
// Simple in-place selection sort that orders distances ascending and keeps
// the index array in sync; `sz` is small (2 * k), so the quadratic cost is acceptable.
void Usort(float *dis, int *ind, int sz) {
for (int i = 0; i < sz; i++) {
for (int j = i + 1; j < sz; j++) {
if (dis[j] < dis[i]) {
std::swap(dis[i], dis[j]);
std::swap(ind[i], ind[j]);
}
}
}
}
void
IVFFlat::query(Tensor<float, 2, true>& queries,
Tensor<uint8_t, 1, true>& bitset,
int nprobe,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<long, 2, true>& outIndices,
float* distances,
Index::idx_t* labels) {
auto& mem = resources_->getMemoryManagerCurrentDevice();
auto stream = resources_->getDefaultStreamCurrentDevice();
// These are caught at a higher level
FAISS_ASSERT(nprobe <= GPU_MAX_SELECTION_K);
FAISS_ASSERT(k <= GPU_MAX_SELECTION_K);
nprobe = std::min(nprobe, quantizer_->getSize());
FAISS_ASSERT(queries.getSize(1) == dim_);
FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));
// Reserve space for the quantized information
DeviceTensor<float, 2, true>
coarseDistances(mem, {queries.getSize(0), nprobe}, stream);
DeviceTensor<int, 2, true>
coarseIndices(mem, {queries.getSize(0), nprobe}, stream);
DeviceTensor<uint8_t, 1, true> coarseBitset(mem, {0}, stream);
// Find the `nprobe` closest lists; we can use int indices both
// internally and externally
HostTensor<int, 2, true> hostOutStoreIndicesTensor ({queries.getSize(0), k * 2});
HostTensor<float, 2, true> hostOutStoreDistancesTensor ({queries.getSize(0), k * 2});
float* hostCoarseDistances = new float[queries.getSize(0) * nprobe];
int* hostCoarseIndices = new int[queries.getSize(0) * nprobe];
float* hostOutStoreDistances = hostOutStoreDistancesTensor.data();
int* hostOutStoreIndices = hostOutStoreIndicesTensor.data();
HostTensor<long, 2, true> hostOutIndices(outIndices, stream);
HostTensor<float, 2, true> hostOutDistances(outDistances, stream);
float* tmpDistances = hostOutDistances.data();
long* tmpIndices = hostOutIndices.data();
const int nprobeTile = 8;
for (int i = 0; i < nprobe; i += nprobeTile) {
int curTile = std::min(nprobeTile, nprobe - i);
quantizer_->query(queries,
coarseBitset,
curTile,
metric_,
metricArg_,
coarseDistances,
coarseIndices,
hostCoarseDistances,
hostCoarseIndices,
i,
curTile,
nprobe,
false);
DeviceTensor<float, 3, true>
residualBase(mem, {queries.getSize(0), nprobe, dim_}, stream);
if (useResidual_) {
// Reconstruct vectors from the quantizer
quantizer_->reconstruct(coarseIndices, residualBase);
}
runIVFFlatScan(queries,
coarseIndices,
bitset,
deviceListDataPointers_,
deviceListIndexPointers_,
indicesOptions_,
deviceListLengths_,
maxListLength_,
k,
metric_,
useResidual_,
residualBase,
scalarQ_.get(),
outDistances,
outIndices,
resources_);
fromDevice<float,2>(outDistances, tmpDistances, stream);
fromDevice<long,2>(outIndices, tmpIndices, stream);
if(i) {
for(int d = 0; d < queries.getSize(0); d ++) {
for(int m = 0; m < k; m ++) {
hostOutStoreDistances[d * 2 * k + k + m] = tmpDistances[d * k + m];
hostOutStoreIndices[d * 2 * k + k + m] = tmpIndices[d * k + m];
}
Usort(hostOutStoreDistances + k * 2 * d, hostOutStoreIndices + k * 2 * d, 2 * k);
}
}
else{
for(int d = 0; d < queries.getSize(0); d ++) {
for(int m = 0; m < k; m ++) {
hostOutStoreDistances[d * 2 * k + m] = tmpDistances[d * k + m];
hostOutStoreIndices[d * 2 * k + m] = tmpIndices[d * k + m];
}
}
}
}
for(int d = 0; d < queries.getSize(0); d ++) {
for(int m = 0; m < k; m ++) {
tmpDistances[d * k + m] = hostOutStoreDistances[d * 2 * k + m];
tmpIndices[d * k + m] = hostOutStoreIndices[d * 2 * k + m];
}
}
outIndices.copyFrom(hostOutIndices, stream);
outDistances.copyFrom(hostOutDistances, stream);
// If the GPU isn't storing indices (they are on the CPU side), we
// need to perform the re-mapping here
// FIXME: we might ultimately be calling this function with inputs
// from the CPU, these are unnecessary copies
if (indicesOptions_ == INDICES_CPU) {
HostTensor<long, 2, true> hostOutIndices(outIndices, stream);
ivfOffsetToUserIndex(hostOutIndices.data(),
numLists_,
hostOutIndices.getSize(0),
hostOutIndices.getSize(1),
listOffsetToUserIndex_);
// Copy back to GPU, since the input to this function is on the
// GPU
outIndices.copyFrom(hostOutIndices, stream);
}
delete [] hostCoarseDistances;
delete [] hostCoarseIndices;
}
} } // namespace


@@ -55,7 +55,15 @@ class IVFFlat : public IVFBase {
int k,
Tensor<float, 2, true>& outDistances,
Tensor<long, 2, true>& outIndices);
void query(Tensor<float, 2, true>& queries,
Tensor<uint8_t, 1, true>& bitset,
int nprobe,
int k,
Tensor<float, 2, true>& outDistances,
Tensor<long, 2, true>& outIndices,
float* distances,
Index::idx_t* labels);
private:
/// Returns the size of our stored vectors, in bytes
size_t getVectorMemorySize() const;


@@ -7,6 +7,13 @@
#include <faiss/gpu/impl/L2Select.cuh>
#include <stdio.h>
#include <cstring>
#include <faiss/gpu/utils/MemorySpace.h>
#include <faiss/gpu/StandardGpuResources.h>
#include <faiss/gpu/utils/DeviceTensor.cuh>
#include <faiss/gpu/utils/CopyUtils.cuh>
#include <faiss/impl/FaissAssert.h>
#include <faiss/gpu/utils/DeviceDefs.cuh>
@@ -181,6 +188,49 @@ __global__ void l2SelectMinK(Tensor<T, 2, true> productDistances,
}
}
template <typename T, int NumWarpQ, int NumThreadQ, int ThreadsPerBlock>
__global__ void l2SelectMinK(Tensor<T, 2, true> productDistances,
Tensor<T, 1, true> centroidDistances,
Tensor<T, 2, true> outDistances,
Tensor<int, 2, true> outIndices,
int k, T initK, int prev) {
// Each block handles a single row of the distances (results)
constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
__shared__ T smemK[kNumWarps * NumWarpQ];
__shared__ int smemV[kNumWarps * NumWarpQ];
BlockSelect<T, int, false, Comparator<T>,
NumWarpQ, NumThreadQ, ThreadsPerBlock>
heap(initK, -1, smemK, smemV, k);
int row = blockIdx.x;
// Whole warps must participate in the selection
int limit = utils::roundDown(productDistances.getSize(1), kWarpSize);
int i = threadIdx.x;
for (; i < limit; i += blockDim.x) {
T v = Math<T>::add(centroidDistances[i],
productDistances[row][i]);
heap.add(v, i);
}
if (i < productDistances.getSize(1)) {
T v = Math<T>::add(centroidDistances[i],
productDistances[row][i]);
heap.addThreadQ(v, i);
}
heap.reduce();
for (int i = threadIdx.x+prev; i < k+prev; i += blockDim.x) {
outDistances[row][i-prev] = smemK[i];
outIndices[row][i-prev] = smemV[i];
}
}
template <typename T>
void runL2SelectMin(Tensor<T, 2, true>& productDistances,
Tensor<T, 1, true>& centroidDistances,
@@ -245,6 +295,94 @@ void runL2SelectMin(Tensor<T, 2, true>& productDistances,
CUDA_TEST_ERROR();
}
void runL2SelMn(float* hostOutDistances,
int* hostOutIndices,
int startPos,
int curQuerySize,
int i,
int nprobe,
Tensor<float, 2, true>& productDistances,
Tensor<float, 1, true>& centroidDistances,
Tensor<uint8_t, 1, true>& bitset,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
int k,
cudaStream_t stream) {
FAISS_ASSERT(productDistances.getSize(0) == outDistances.getSize(0));
FAISS_ASSERT(productDistances.getSize(0) == outIndices.getSize(0));
FAISS_ASSERT(centroidDistances.getSize(0) == productDistances.getSize(1));
// FAISS_ASSERT(outDistances.getSize(1) == k);
// FAISS_ASSERT(outIndices.getSize(1) == k);
FAISS_ASSERT(k <= GPU_MAX_SELECTION_K);
if (k == 1) {
constexpr int kThreadsPerBlock = 256;
constexpr int kRowsPerBlock = 8;
auto block = dim3(kThreadsPerBlock);
auto grid = dim3(utils::divUp(outDistances.getSize(0), kRowsPerBlock));
l2SelectMin1<float, kRowsPerBlock, kThreadsPerBlock>
<<<grid, block, 0, stream>>>(productDistances, centroidDistances, bitset,
outDistances, outIndices);
} else {
auto grid = dim3(outDistances.getSize(0));
#define RUN_L2_SELECT(BLOCK, NUM_WARP_Q, NUM_THREAD_Q) \
do { \
l2SelectMinK<float, NUM_WARP_Q, NUM_THREAD_Q, BLOCK> \
<<<grid, BLOCK, 0, stream>>>(productDistances, centroidDistances, \
outDistances, outIndices, \
k, Limits<float>::getMax(), i); \
} while (0)
// block size 128 for everything <= 1024
if (k <= 32) {
RUN_L2_SELECT(128, 32, 2);
} else if (k <= 64) {
RUN_L2_SELECT(128, 64, 3);
} else if (k <= 128) {
RUN_L2_SELECT(128, 128, 3);
} else if (k <= 256) {
RUN_L2_SELECT(128, 256, 4);
} else if (k <= 512) {
RUN_L2_SELECT(128, 512, 8);
} else if (k <= 1024) {
RUN_L2_SELECT(128, 1024, 8);
#if GPU_MAX_SELECTION_K >= 2048
} else if (k <= 2048) {
// smaller block for less shared memory
RUN_L2_SELECT(64, 2048, 8);
#endif
} else {
FAISS_ASSERT(false);
}
float* tmpDistances = new float[outDistances.getSize(0) * outDistances.getSize(1)];
int* tmpIndices = new int[outDistances.getSize(0) * outDistances.getSize(1)];
fromDevice<float,2>(outDistances, tmpDistances, stream);
fromDevice<int,2>(outIndices, tmpIndices, stream);
for(int j = 0; j < curQuerySize; j ++) {
for(int m = 0; m < k; m ++) {
hostOutDistances[(startPos + j) * nprobe + i + m] = tmpDistances[k * j + m];
hostOutIndices[(startPos + j) * nprobe + i + m] = tmpIndices[k * j + m];
}
}
delete [] tmpDistances;
delete [] tmpIndices;
}
CUDA_TEST_ERROR();
}
void runL2SelectMin(Tensor<float, 2, true>& productDistances,
Tensor<float, 1, true>& centroidDistances,
Tensor<uint8_t, 1, true>& bitset,


@@ -20,4 +20,18 @@ void runL2SelectMin(Tensor<float, 2, true>& productDistances,
int k,
cudaStream_t stream);
void runL2SelMn(float* hostOutDistances,
int* hostOutIndices,
int startPos,
int curQuerySize,
int i,
int nprobe,
Tensor<float, 2, true>& productDistances,
Tensor<float, 1, true>& centroidDistances,
Tensor<uint8_t, 1, true>& bitset,
Tensor<float, 2, true>& outDistances,
Tensor<int, 2, true>& outIndices,
int k,
cudaStream_t stream);
} } // namespace