Merge remote-tracking branch 'upstream/0.6.0' into 0.6.0-yk-refactor-scheduler

pull/603/head
fishpenguin 2019-11-28 18:54:07 +08:00
commit 8fa80c1e88
498 changed files with 98914 additions and 122 deletions

View File

@ -22,6 +22,7 @@ Please mark all change in change log and use the ticket from JIRA.
- \#458 - Index data is not compatible between 0.5 and 0.6
- \#465 - Server hang caused by searching with nsg index
- \#486 - GPU not used during index building
- \#497 - CPU-version search performance decreased
- \#504 - The code coverage rate of core/src/scheduler/optimizer is too low
- \#509 - IVF_PQ index build trapped in dead loop caused by invalid params
- \#513 - Unittest DELETE_BY_RANGE sometimes failed
@ -31,7 +32,10 @@ Please mark all change in change log and use the ticket from JIRA.
- \#532 - assign value to `table_name` from conftest shell
- \#533 - NSG build failed with MetricType Inner Product
- \#543 - client raises exception in shards when search result is empty
- \#497 - CPU-version search performance decreased
- \#545 - Avoid dead loop of build index thread when error occurs
- \#552 - Server down during building index_type: IVF_PQ using GPU-edition
- \#561 - Milvus server should report exception/error message or terminate on mysql metadata backend error
- \#599 - Build index log is incorrect
## Feature
- \#12 - Pure CPU version for Milvus
@ -48,6 +52,7 @@ Please mark all change in change log and use the ticket from JIRA.
- \#255 - Add ivfsq8 test report detailed version
- \#260 - C++ SDK README
- \#266 - Rpc request source code refactor
- \#274 - Log the time cost during preloading data
- \#275 - Rename C++ SDK IndexType
- \#284 - Change C++ SDK to shared library
- \#306 - Use int64 for all config integer
@ -61,6 +66,7 @@ Please mark all change in change log and use the ticket from JIRA.
- \#433 - C++ SDK query result is not easy to use
- \#449 - Add ShowPartitions example for C++ SDK
- \#470 - Small raw files should not be built into index
- \#584 - Integrate internal FAISS
## Task

View File

@ -17,7 +17,7 @@ pipeline {
}
parameters{
choice choices: ['Release', 'Debug'], description: '', name: 'BUILD_TYPE'
choice choices: ['Release', 'Debug'], description: 'Build Type', name: 'BUILD_TYPE'
string defaultValue: 'registry.zilliz.com', description: 'DOCKER REGISTRY URL', name: 'DOKCER_REGISTRY_URL', trim: true
string defaultValue: 'ba070c98-c8cc-4f7c-b657-897715f359fc', description: 'DOCKER CREDENTIALS ID', name: 'DOCKER_CREDENTIALS_ID', trim: true
string defaultValue: 'http://192.168.1.202/artifactory/milvus', description: 'JFROG ARTFACTORY URL', name: 'JFROG_ARTFACTORY_URL', trim: true
@ -27,9 +27,8 @@ pipeline {
environment {
PROJECT_NAME = "milvus"
LOWER_BUILD_TYPE = params.BUILD_TYPE.toLowerCase()
SEMVER = "${BRANCH_NAME}"
JOBNAMES = env.JOB_NAME.split('/')
PIPELINE_NAME = "${JOBNAMES[0]}"
SEMVER = "${BRANCH_NAME.contains('/') ? BRANCH_NAME.substring(BRANCH_NAME.lastIndexOf('/') + 1) : BRANCH_NAME}"
PIPELINE_NAME = "${env.JOB_NAME.contains('/') ? env.JOB_NAME.getAt(0..(env.JOB_NAME.indexOf('/') - 1)) : env.JOB_NAME}"
}
stages {
@ -102,7 +101,7 @@ pipeline {
stages {
stage('Publish') {
steps {
container('publish-images'){
container('publish-images') {
script {
load "${env.WORKSPACE}/ci/jenkins/step/publishImages.groovy"
}

View File

@ -0,0 +1,477 @@
#!/usr/bin/env groovy
pipeline {
agent none
options {
timestamps()
}
parameters{
choice choices: ['Release', 'Debug'], description: 'Build Type', name: 'BUILD_TYPE'
string defaultValue: 'registry.zilliz.com', description: 'DOCKER REGISTRY URL', name: 'DOKCER_REGISTRY_URL', trim: true
string defaultValue: 'a54e38ef-c424-4ea9-9224-b25fc20e3924', description: 'DOCKER CREDENTIALS ID', name: 'DOCKER_CREDENTIALS_ID', trim: true
string defaultValue: 'http://192.168.1.201/artifactory/milvus', description: 'JFROG ARTFACTORY URL', name: 'JFROG_ARTFACTORY_URL', trim: true
string defaultValue: '76fd48ab-2b8e-4eed-834d-2eefd23bb3a6', description: 'JFROG CREDENTIALS ID', name: 'JFROG_CREDENTIALS_ID', trim: true
}
environment {
PROJECT_NAME = "milvus"
LOWER_BUILD_TYPE = params.BUILD_TYPE.toLowerCase()
SEMVER = "${BRANCH_NAME.contains('/') ? BRANCH_NAME.substring(BRANCH_NAME.lastIndexOf('/') + 1) : BRANCH_NAME}"
PIPELINE_NAME = "${env.JOB_NAME.contains('/') ? env.JOB_NAME.getAt(0..(env.JOB_NAME.indexOf('/') - 1)) : env.JOB_NAME}"
}
stages {
stage("Ubuntu 18.04 x86_64") {
environment {
OS_NAME = "ubuntu18.04"
CPU_ARCH = "amd64"
}
parallel {
stage ("GPU Version") {
environment {
BINRARY_VERSION = "gpu"
PACKAGE_VERSION = VersionNumber([
versionNumberString : '${SEMVER}-gpu-${OS_NAME}-${CPU_ARCH}-${LOWER_BUILD_TYPE}-${BUILD_DATE_FORMATTED, "yyyyMMdd"}-${BUILDS_TODAY}'
]);
DOCKER_VERSION = "${SEMVER}-gpu-${OS_NAME}-${LOWER_BUILD_TYPE}"
}
stages {
stage("Run Build") {
agent {
kubernetes {
label "${env.BINRARY_VERSION}-build"
defaultContainer 'jnlp'
yaml """
apiVersion: v1
kind: Pod
metadata:
name: milvus-gpu-build-env
labels:
app: milvus
componet: gpu-build-env
spec:
containers:
- name: milvus-gpu-build-env
image: registry.zilliz.com/milvus/milvus-gpu-build-env:v0.6.0-ubuntu18.04
env:
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: BUILD_ENV_IMAGE_ID
value: "da9023b0f858f072672f86483a869aa87e90a5140864f89e5a012ec766d96dea"
command:
- cat
tty: true
resources:
limits:
memory: "24Gi"
cpu: "8.0"
nvidia.com/gpu: 1
requests:
memory: "16Gi"
cpu: "4.0"
- name: milvus-mysql
image: mysql:5.6
env:
- name: MYSQL_ROOT_PASSWORD
value: 123456
ports:
- containerPort: 3306
name: mysql
"""
}
}
stages {
stage('Build') {
steps {
container("milvus-${env.BINRARY_VERSION}-build-env") {
script {
load "${env.WORKSPACE}/ci/jenkins/step/build.groovy"
}
}
}
}
stage('Code Coverage') {
steps {
container("milvus-${env.BINRARY_VERSION}-build-env") {
script {
load "${env.WORKSPACE}/ci/jenkins/step/internalCoverage.groovy"
}
}
}
}
stage('Upload Package') {
steps {
container("milvus-${env.BINRARY_VERSION}-build-env") {
script {
load "${env.WORKSPACE}/ci/jenkins/step/package.groovy"
}
}
}
}
}
}
stage("Publish docker images") {
agent {
kubernetes {
label "${env.BINRARY_VERSION}-publish"
defaultContainer 'jnlp'
yaml """
apiVersion: v1
kind: Pod
metadata:
labels:
app: publish
componet: docker
spec:
containers:
- name: publish-images
image: registry.zilliz.com/library/docker:v1.0.0
securityContext:
privileged: true
command:
- cat
tty: true
volumeMounts:
- name: docker-sock
mountPath: /var/run/docker.sock
volumes:
- name: docker-sock
hostPath:
path: /var/run/docker.sock
"""
}
}
stages {
stage('Publish') {
steps {
container('publish-images') {
script {
load "${env.WORKSPACE}/ci/jenkins/step/publishImages.groovy"
}
}
}
}
}
}
stage("Deploy to Development") {
environment {
FROMAT_SEMVER = "${env.SEMVER}".replaceAll("\\.", "-")
HELM_RELEASE_NAME = "${env.PIPELINE_NAME}-${env.FROMAT_SEMVER}-${env.BUILD_NUMBER}-single-${env.BINRARY_VERSION}".toLowerCase()
}
agent {
kubernetes {
label "${env.BINRARY_VERSION}-dev-test"
defaultContainer 'jnlp'
yaml """
apiVersion: v1
kind: Pod
metadata:
labels:
app: milvus
componet: test-env
spec:
containers:
- name: milvus-test-env
image: registry.zilliz.com/milvus/milvus-test-env:v0.1
command:
- cat
tty: true
volumeMounts:
- name: kubeconf
mountPath: /root/.kube/
readOnly: true
volumes:
- name: kubeconf
secret:
secretName: test-cluster-config
"""
}
}
stages {
stage("Deploy to Dev") {
steps {
container('milvus-test-env') {
script {
load "${env.WORKSPACE}/ci/jenkins/step/deploySingle2Dev.groovy"
}
}
}
}
stage("Dev Test") {
steps {
container('milvus-test-env') {
script {
boolean isNightlyTest = isTimeTriggeredBuild()
if (isNightlyTest) {
load "${env.WORKSPACE}/ci/jenkins/step/singleDevNightlyTest.groovy"
} else {
load "${env.WORKSPACE}/ci/jenkins/step/singleDevTest.groovy"
}
}
}
}
}
stage ("Cleanup Dev") {
steps {
container('milvus-test-env') {
script {
load "${env.WORKSPACE}/ci/jenkins/step/cleanupSingleDev.groovy"
}
}
}
}
}
post {
unsuccessful {
container('milvus-test-env') {
script {
load "${env.WORKSPACE}/ci/jenkins/step/cleanupSingleDev.groovy"
}
}
}
}
}
}
}
stage ("CPU Version") {
environment {
BINRARY_VERSION = "cpu"
PACKAGE_VERSION = VersionNumber([
versionNumberString : '${SEMVER}-cpu-${OS_NAME}-${CPU_ARCH}-${LOWER_BUILD_TYPE}-${BUILD_DATE_FORMATTED, "yyyyMMdd"}-${BUILDS_TODAY}'
]);
DOCKER_VERSION = "${SEMVER}-cpu-${OS_NAME}-${LOWER_BUILD_TYPE}"
}
stages {
stage("Run Build") {
agent {
kubernetes {
label "${env.BINRARY_VERSION}-build"
defaultContainer 'jnlp'
yaml """
apiVersion: v1
kind: Pod
metadata:
name: milvus-cpu-build-env
labels:
app: milvus
componet: cpu-build-env
spec:
containers:
- name: milvus-cpu-build-env
image: registry.zilliz.com/milvus/milvus-cpu-build-env:v0.6.0-ubuntu18.04
env:
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: BUILD_ENV_IMAGE_ID
value: "23476391bec80c64f10d44a6370c73c71f011a6b95114b10ff82a60e771e11c7"
command:
- cat
tty: true
resources:
limits:
memory: "24Gi"
cpu: "8.0"
requests:
memory: "16Gi"
cpu: "4.0"
- name: milvus-mysql
image: mysql:5.6
env:
- name: MYSQL_ROOT_PASSWORD
value: 123456
ports:
- containerPort: 3306
name: mysql
"""
}
}
stages {
stage('Build') {
steps {
container("milvus-${env.BINRARY_VERSION}-build-env") {
script {
load "${env.WORKSPACE}/ci/jenkins/step/build.groovy"
}
}
}
}
stage('Code Coverage') {
steps {
container("milvus-${env.BINRARY_VERSION}-build-env") {
script {
load "${env.WORKSPACE}/ci/jenkins/step/internalCoverage.groovy"
}
}
}
}
stage('Upload Package') {
steps {
container("milvus-${env.BINRARY_VERSION}-build-env") {
script {
load "${env.WORKSPACE}/ci/jenkins/step/package.groovy"
}
}
}
}
}
}
stage("Publish docker images") {
agent {
kubernetes {
label "${env.BINRARY_VERSION}-publish"
defaultContainer 'jnlp'
yaml """
apiVersion: v1
kind: Pod
metadata:
labels:
app: publish
componet: docker
spec:
containers:
- name: publish-images
image: registry.zilliz.com/library/docker:v1.0.0
securityContext:
privileged: true
command:
- cat
tty: true
volumeMounts:
- name: docker-sock
mountPath: /var/run/docker.sock
volumes:
- name: docker-sock
hostPath:
path: /var/run/docker.sock
"""
}
}
stages {
stage('Publish') {
steps {
container('publish-images'){
script {
load "${env.WORKSPACE}/ci/jenkins/step/publishImages.groovy"
}
}
}
}
}
}
stage("Deploy to Development") {
environment {
FROMAT_SEMVER = "${env.SEMVER}".replaceAll("\\.", "-")
HELM_RELEASE_NAME = "${env.PIPELINE_NAME}-${env.FROMAT_SEMVER}-${env.BUILD_NUMBER}-single-${env.BINRARY_VERSION}".toLowerCase()
}
agent {
kubernetes {
label "${env.BINRARY_VERSION}-dev-test"
defaultContainer 'jnlp'
yaml """
apiVersion: v1
kind: Pod
metadata:
labels:
app: milvus
componet: test-env
spec:
containers:
- name: milvus-test-env
image: registry.zilliz.com/milvus/milvus-test-env:v0.1
command:
- cat
tty: true
volumeMounts:
- name: kubeconf
mountPath: /root/.kube/
readOnly: true
volumes:
- name: kubeconf
secret:
secretName: test-cluster-config
"""
}
}
stages {
stage("Deploy to Dev") {
steps {
container('milvus-test-env') {
script {
load "${env.WORKSPACE}/ci/jenkins/step/deploySingle2Dev.groovy"
}
}
}
}
stage("Dev Test") {
steps {
container('milvus-test-env') {
script {
boolean isNightlyTest = isTimeTriggeredBuild()
if (isNightlyTest) {
load "${env.WORKSPACE}/ci/jenkins/step/singleDevNightlyTest.groovy"
} else {
load "${env.WORKSPACE}/ci/jenkins/step/singleDevTest.groovy"
}
}
}
}
}
stage ("Cleanup Dev") {
steps {
container('milvus-test-env') {
script {
load "${env.WORKSPACE}/ci/jenkins/step/cleanupSingleDev.groovy"
}
}
}
}
}
post {
unsuccessful {
container('milvus-test-env') {
script {
load "${env.WORKSPACE}/ci/jenkins/step/cleanupSingleDev.groovy"
}
}
}
}
}
}
}
}
}
}
}
boolean isTimeTriggeredBuild() {
if (currentBuild.getBuildCauses('hudson.triggers.TimerTrigger$TimerTriggerCause').size() != 0) {
return true
}
return false
}

View File

@ -3,9 +3,9 @@ timeout(time: 60, unit: 'MINUTES') {
withCredentials([usernamePassword(credentialsId: "${params.JFROG_CREDENTIALS_ID}", usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
def checkResult = sh(script: "./check_ccache.sh -l ${params.JFROG_ARTFACTORY_URL}/ccache", returnStatus: true)
if ("${env.BINRARY_VERSION}" == "gpu") {
sh ". ./before-install.sh && ./build.sh -t ${params.BUILD_TYPE} -o /opt/milvus -l -g -u -c"
sh ". ./before-install.sh && ./build.sh -t ${params.BUILD_TYPE} -o /opt/milvus -l -g -x -u -c"
} else {
sh ". ./before-install.sh && ./build.sh -t ${params.BUILD_TYPE} -o /opt/milvus -l -m -u -c"
sh ". ./before-install.sh && ./build.sh -t ${params.BUILD_TYPE} -o /opt/milvus -l -u -c"
}
sh "./update_ccache.sh -l ${params.JFROG_ARTFACTORY_URL}/ccache -u ${USERNAME} -p ${PASSWORD}"
}

View File

@ -0,0 +1,6 @@
timeout(time: 30, unit: 'MINUTES') {
dir ("ci/scripts") {
sh "./coverage.sh -o /opt/milvus -u root -p 123456 -t \$POD_IP"
}
}

View File

@ -46,7 +46,7 @@ check_ccache() {
echo "fetching ${BRANCH}/ccache-${OS_NAME}-${CODE_NAME}-${BUILD_ENV_DOCKER_IMAGE_ID}.tar.gz"
wget -q --method HEAD "${ARTIFACTORY_URL}/${BRANCH}/ccache-${OS_NAME}-${CODE_NAME}-${BUILD_ENV_DOCKER_IMAGE_ID}.tar.gz"
if [[ $? == 0 ]];then
wget "${ARTIFACTORY_URL}/${BRANCH}/ccache-${OS_NAME}-${CODE_NAME}-${BUILD_ENV_DOCKER_IMAGE_ID}.tar.gz" && \
wget -q "${ARTIFACTORY_URL}/${BRANCH}/ccache-${OS_NAME}-${CODE_NAME}-${BUILD_ENV_DOCKER_IMAGE_ID}.tar.gz" && \
mkdir -p ${CCACHE_DIRECTORY} && \
tar zxf ccache-${OS_NAME}-${CODE_NAME}-${BUILD_ENV_DOCKER_IMAGE_ID}.tar.gz -C ${CCACHE_DIRECTORY} && \
rm ccache-${OS_NAME}-${CODE_NAME}-${BUILD_ENV_DOCKER_IMAGE_ID}.tar.gz

View File

@ -35,15 +35,15 @@ if (NOT DEFINED CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build.")
endif ()
set (GIT_BRANCH_NAME_REGEX "[0-9]+\\.[0-9]+\\.[0-9]")
set(GIT_BRANCH_NAME_REGEX "[0-9]+\\.[0-9]+\\.[0-9]")
MACRO(GET_GIT_BRANCH_NAME GIT_BRANCH_NAME)
execute_process(COMMAND sh "-c" "git log --decorate | head -n 1 | sed 's/.*(\\(.*\\))/\\1/' | sed 's/.*, //' | sed 's=[a-zA-Z]*\/==g'"
OUTPUT_VARIABLE ${GIT_BRANCH_NAME})
if(NOT GIT_BRANCH_NAME MATCHES "${GIT_BRANCH_NAME_REGEX}")
if (NOT GIT_BRANCH_NAME MATCHES "${GIT_BRANCH_NAME_REGEX}")
execute_process(COMMAND "git" rev-parse --abbrev-ref HEAD OUTPUT_VARIABLE ${GIT_BRANCH_NAME})
endif ()
if(NOT GIT_BRANCH_NAME MATCHES "${GIT_BRANCH_NAME_REGEX}")
if (NOT GIT_BRANCH_NAME MATCHES "${GIT_BRANCH_NAME_REGEX}")
execute_process(COMMAND "git" symbolic-ref --short -q HEAD HEAD OUTPUT_VARIABLE ${GIT_BRANCH_NAME})
endif ()
ENDMACRO(GET_GIT_BRANCH_NAME)
@ -79,7 +79,7 @@ if (MILVUS_VERSION_MAJOR STREQUAL ""
OR MILVUS_VERSION_PATCH STREQUAL "")
message(WARNING "Failed to determine Milvus version from git branch name")
set(MILVUS_VERSION "0.6.0")
endif()
endif ()
message(STATUS "Build version = ${MILVUS_VERSION}")
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/src/version.h.in ${CMAKE_CURRENT_SOURCE_DIR}/src/version.h @ONLY)
@ -141,6 +141,11 @@ if (MILVUS_USE_CCACHE)
endif (CCACHE_FOUND)
endif ()
if (CUSTOMIZATION)
set(MILVUS_GPU_VERSION ON)
add_compile_definitions(CUSTOMIZATION)
endif ()
set(MILVUS_CPU_VERSION false)
if (MILVUS_GPU_VERSION)
message(STATUS "Building Milvus GPU version")
@ -170,10 +175,6 @@ else ()
endif ()
endif ()
if (CUSTOMIZATION)
add_definitions(-DCUSTOMIZATION)
endif (CUSTOMIZATION)
config_summary()
add_subdirectory(src)

View File

@ -41,10 +41,12 @@ macro(define_option_string name description default)
endmacro()
#----------------------------------------------------------------------
set_option_category("GPU version")
set_option_category("Milvus Build Option")
define_option(MILVUS_GPU_VERSION "Build GPU version" OFF)
define_option(CUSTOMIZATION "Build with customized FAISS library" OFF)
#----------------------------------------------------------------------
set_option_category("Thirdparty")

View File

@ -41,6 +41,7 @@
#include <iostream>
#include <set>
#include <thread>
#include <utility>
namespace milvus {
namespace engine {
@ -51,6 +52,8 @@ constexpr uint64_t METRIC_ACTION_INTERVAL = 1;
constexpr uint64_t COMPACT_ACTION_INTERVAL = 1;
constexpr uint64_t INDEX_ACTION_INTERVAL = 1;
constexpr uint64_t INDEX_FAILED_RETRY_TIME = 1;
static const Status SHUTDOWN_ERROR = Status(DB_ERROR, "Milvus server is shutdown!");
void
@ -179,7 +182,7 @@ DBImpl::PreloadTable(const std::string& table_id) {
return SHUTDOWN_ERROR;
}
// get all table files from parent table
// step 1: get all table files from parent table
meta::DatesT dates;
std::vector<size_t> ids;
meta::TableFilesSchema files_array;
@ -188,7 +191,7 @@ DBImpl::PreloadTable(const std::string& table_id) {
return status;
}
// get files from partition tables
// step 2: get files from partition tables
std::vector<meta::TableSchema> partiton_array;
status = meta_ptr_->ShowPartitions(table_id, partiton_array);
for (auto& schema : partiton_array) {
@ -200,6 +203,10 @@ DBImpl::PreloadTable(const std::string& table_id) {
int64_t cache_usage = cache::CpuCacheMgr::GetInstance()->CacheUsage();
int64_t available_size = cache_total - cache_usage;
// step 3: load file one by one
ENGINE_LOG_DEBUG << "Begin pre-load table:" + table_id + ", totally " << files_array.size()
<< " files need to be pre-loaded";
TimeRecorderAuto rc("Pre-load table:" + table_id);
for (auto& file : files_array) {
ExecutionEnginePtr engine = EngineFactory::Build(file.dimension_, file.location_, (EngineType)file.engine_type_,
(MetricType)file.metric_type_, file.nlist_);
@ -210,10 +217,12 @@ DBImpl::PreloadTable(const std::string& table_id) {
size += engine->PhysicalSize();
if (size > available_size) {
ENGINE_LOG_DEBUG << "Pre-load canceled since cache almost full";
return Status(SERVER_CACHE_FULL, "Cache is full");
} else {
try {
// step 1: load index
std::string msg = "Pre-loaded file: " + file.file_id_ + " size: " + std::to_string(file.file_size_);
TimeRecorderAuto rc_1(msg);
engine->Load(true);
} catch (std::exception& ex) {
std::string msg = "Pre-load table encounter exception: " + std::string(ex.what());
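The preload path above wraps both the whole table load and each per-file load in a TimeRecorderAuto, which from its usage reads as an RAII timer that logs its message together with the elapsed time when it goes out of scope. A minimal sketch of that pattern (the simplified class body below is an assumption for illustration, not the Milvus implementation):

#include <chrono>
#include <iostream>
#include <string>

// Hypothetical stand-in for TimeRecorderAuto: start a clock on construction,
// log "<message>: <elapsed> ms" on destruction, so a stack-scoped instance
// times whatever runs between the two.
class TimeRecorderAuto {
 public:
    explicit TimeRecorderAuto(std::string msg)
        : msg_(std::move(msg)), start_(std::chrono::steady_clock::now()) {}
    ~TimeRecorderAuto() {
        auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(
                      std::chrono::steady_clock::now() - start_).count();
        std::cout << msg_ << ": " << ms << " ms" << std::endl;
    }
 private:
    std::string msg_;
    std::chrono::steady_clock::time_point start_;
};

int main() {
    TimeRecorderAuto rc("Pre-load table:demo_table");  // times the full scope
    for (const char* file : {"file_1", "file_2"}) {
        TimeRecorderAuto rc_1(std::string("Pre-loaded file: ") + file);
        // engine->Load(true) would run here
    }  // rc_1 logs once per file; rc logs the total at end of main
    return 0;
}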
@ -361,6 +370,7 @@ DBImpl::CreateIndex(const std::string& table_id, const TableIndex& index) {
WaitMergeFileFinish();
// step 4: wait and build index
status = CleanFailedIndexFileOfTable(table_id);
status = BuildTableIndexRecursively(table_id, index);
return status;
@ -828,22 +838,35 @@ DBImpl::BackgroundBuildIndex() {
std::unique_lock<std::mutex> lock(build_index_mutex_);
meta::TableFilesSchema to_index_files;
meta_ptr_->FilesToIndex(to_index_files);
Status status;
Status status = IgnoreFailedIndexFiles(to_index_files);
if (!to_index_files.empty()) {
scheduler::BuildIndexJobPtr job = std::make_shared<scheduler::BuildIndexJob>(meta_ptr_, options_);
// step 2: put build index task to scheduler
std::map<scheduler::BuildIndexJobPtr, scheduler::TableFileSchemaPtr> job2file_map;
for (auto& file : to_index_files) {
scheduler::BuildIndexJobPtr job = std::make_shared<scheduler::BuildIndexJob>(meta_ptr_, options_);
scheduler::TableFileSchemaPtr file_ptr = std::make_shared<meta::TableFileSchema>(file);
job->AddToIndexFiles(file_ptr);
scheduler::JobMgrInst::GetInstance()->Put(job);
job2file_map.insert(std::make_pair(job, file_ptr));
}
scheduler::JobMgrInst::GetInstance()->Put(job);
job->WaitBuildIndexFinish();
if (!job->GetStatus().ok()) {
Status status = job->GetStatus();
ENGINE_LOG_ERROR << "Building index failed: " << status.ToString();
for (auto iter = job2file_map.begin(); iter != job2file_map.end(); ++iter) {
scheduler::BuildIndexJobPtr job = iter->first;
meta::TableFileSchema& file_schema = *(iter->second.get());
job->WaitBuildIndexFinish();
if (!job->GetStatus().ok()) {
Status status = job->GetStatus();
ENGINE_LOG_ERROR << "Building index job " << job->id() << " failed: " << status.ToString();
MarkFailedIndexFile(file_schema);
} else {
MarkSucceedIndexFile(file_schema);
ENGINE_LOG_DEBUG << "Building index job " << job->id() << " succeed.";
}
}
ENGINE_LOG_DEBUG << "Background build index thread finished";
}
// ENGINE_LOG_TRACE << "Background build index thread exit";
@ -911,6 +934,7 @@ DBImpl::DropTableRecursively(const std::string& table_id, const meta::DatesT& da
if (dates.empty()) {
status = mem_mgr_->EraseMemVector(table_id); // not allow insert
status = meta_ptr_->DropTable(table_id); // soft delete table
CleanFailedIndexFileOfTable(table_id);
// scheduler will determine when to delete table files
auto nres = scheduler::ResMgrInst::GetInstance()->GetNumOfComputeResource();
@ -989,6 +1013,8 @@ DBImpl::BuildTableIndexRecursively(const std::string& table_id, const TableIndex
std::this_thread::sleep_for(std::chrono::milliseconds(std::min(10 * 1000, times * 100)));
GetFilesToBuildIndex(table_id, file_types, table_files);
times++;
IgnoreFailedIndexFiles(table_files);
}
// build index for partition
@ -1001,12 +1027,27 @@ DBImpl::BuildTableIndexRecursively(const std::string& table_id, const TableIndex
}
}
// failed to build index for some files, return error
std::vector<std::string> failed_files;
GetFailedIndexFileOfTable(table_id, failed_files);
if (!failed_files.empty()) {
std::string msg = "Failed to build index for " + std::to_string(failed_files.size()) +
((failed_files.size() == 1) ? " file" : " files");
#ifdef MILVUS_CPU_VERSION
msg += ", please double check index parameters.";
#else
msg += ", file size is too large or gpu memory is not enough.";
#endif
return Status(DB_ERROR, msg);
}
return Status::OK();
}
Status
DBImpl::DropTableIndexRecursively(const std::string& table_id) {
ENGINE_LOG_DEBUG << "Drop index for table: " << table_id;
CleanFailedIndexFileOfTable(table_id);
auto status = meta_ptr_->DropTableIndex(table_id);
if (!status.ok()) {
return status;
@ -1049,5 +1090,86 @@ DBImpl::GetTableRowCountRecursively(const std::string& table_id, uint64_t& row_c
return Status::OK();
}
Status
DBImpl::CleanFailedIndexFileOfTable(const std::string& table_id) {
std::lock_guard<std::mutex> lck(index_failed_mutex_);
index_failed_files_.erase(table_id); // rebuild failed index files for this table
return Status::OK();
}
Status
DBImpl::GetFailedIndexFileOfTable(const std::string& table_id, std::vector<std::string>& failed_files) {
failed_files.clear();
std::lock_guard<std::mutex> lck(index_failed_mutex_);
auto iter = index_failed_files_.find(table_id);
if (iter != index_failed_files_.end()) {
FileID2FailedTimes& failed_map = iter->second;
for (auto it_file = failed_map.begin(); it_file != failed_map.end(); ++it_file) {
failed_files.push_back(it_file->first);
}
}
return Status::OK();
}
Status
DBImpl::MarkFailedIndexFile(const meta::TableFileSchema& file) {
std::lock_guard<std::mutex> lck(index_failed_mutex_);
auto iter = index_failed_files_.find(file.table_id_);
if (iter == index_failed_files_.end()) {
FileID2FailedTimes failed_files;
failed_files.insert(std::make_pair(file.file_id_, 1));
index_failed_files_.insert(std::make_pair(file.table_id_, failed_files));
} else {
auto it_failed_files = iter->second.find(file.file_id_);
if (it_failed_files != iter->second.end()) {
it_failed_files->second++;
} else {
iter->second.insert(std::make_pair(file.file_id_, 1));
}
}
return Status::OK();
}
Status
DBImpl::MarkSucceedIndexFile(const meta::TableFileSchema& file) {
std::lock_guard<std::mutex> lck(index_failed_mutex_);
auto iter = index_failed_files_.find(file.table_id_);
if (iter != index_failed_files_.end()) {
iter->second.erase(file.file_id_);
}
return Status::OK();
}
Status
DBImpl::IgnoreFailedIndexFiles(meta::TableFilesSchema& table_files) {
std::lock_guard<std::mutex> lck(index_failed_mutex_);
// there could be some failed files belonging to different tables.
// some files may have failed several times; no need to build index for those files.
// thus we avoid an endless loop in the build index operation
for (auto it_file = table_files.begin(); it_file != table_files.end();) {
auto it_failed_files = index_failed_files_.find((*it_file).table_id_);
if (it_failed_files != index_failed_files_.end()) {
auto it_failed_file = it_failed_files->second.find((*it_file).file_id_);
if (it_failed_file != it_failed_files->second.end()) {
if (it_failed_file->second >= INDEX_FAILED_RETRY_TIME) {
it_file = table_files.erase(it_file);
continue;
}
}
}
++it_file;
}
return Status::OK();
}
} // namespace engine
} // namespace milvus
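
Taken together, MarkFailedIndexFile, MarkSucceedIndexFile and IgnoreFailedIndexFiles above give each file a retry budget: failure counts live in a per-table map, and a file that has failed INDEX_FAILED_RETRY_TIME times is filtered out of later build rounds, so the background thread cannot spin forever on one bad file. A self-contained sketch of the same bookkeeping (names here are hypothetical; the real code keys on table_id_/file_id_ and serializes access with index_failed_mutex_):

#include <cassert>
#include <cstdint>
#include <map>
#include <string>
#include <vector>

constexpr uint64_t kRetryLimit = 1;  // mirrors INDEX_FAILED_RETRY_TIME

// table id -> (file id -> number of failed build attempts)
std::map<std::string, std::map<std::string, uint64_t>> failed_files;

void MarkFailed(const std::string& table, const std::string& file) {
    ++failed_files[table][file];  // operator[] default-initializes to 0
}

void MarkSucceeded(const std::string& table, const std::string& file) {
    auto it = failed_files.find(table);
    if (it != failed_files.end()) {
        it->second.erase(file);
    }
}

// Drop files whose failure count has reached the retry limit.
void IgnoreFailed(const std::string& table, std::vector<std::string>& files) {
    auto it = failed_files.find(table);
    if (it == failed_files.end()) {
        return;
    }
    for (auto f = files.begin(); f != files.end();) {
        auto rec = it->second.find(*f);
        if (rec != it->second.end() && rec->second >= kRetryLimit) {
            f = files.erase(f);  // give up on this file
        } else {
            ++f;
        }
    }
}

int main() {
    std::vector<std::string> to_index = {"f1", "f2"};
    MarkFailed("t1", "f1");        // one failure hits the limit of 1
    IgnoreFailed("t1", to_index);  // f1 is skipped from now on
    assert(to_index.size() == 1 && to_index[0] == "f2");
    return 0;
}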

View File

@ -25,6 +25,7 @@
#include <atomic>
#include <condition_variable>
#include <list>
#include <map>
#include <memory>
#include <mutex>
#include <set>
@ -35,8 +36,6 @@
namespace milvus {
namespace engine {
class Env;
namespace meta {
class Meta;
}
@ -179,6 +178,21 @@ class DBImpl : public DB {
Status
GetTableRowCountRecursively(const std::string& table_id, uint64_t& row_count);
Status
CleanFailedIndexFileOfTable(const std::string& table_id);
Status
GetFailedIndexFileOfTable(const std::string& table_id, std::vector<std::string>& failed_files);
Status
MarkFailedIndexFile(const meta::TableFileSchema& file);
Status
MarkSucceedIndexFile(const meta::TableFileSchema& file);
Status
IgnoreFailedIndexFiles(meta::TableFilesSchema& table_files);
private:
const DBOptions options_;
@ -200,7 +214,11 @@ class DBImpl : public DB {
std::list<std::future<void>> index_thread_results_;
std::mutex build_index_mutex_;
}; // DBImpl
std::mutex index_failed_mutex_;
using FileID2FailedTimes = std::map<std::string, uint64_t>;
using Table2FailedFiles = std::map<std::string, FileID2FailedTimes>;
Table2FailedFiles index_failed_files_; // file id mapping to failed times
}; // DBImpl
} // namespace engine
} // namespace milvus

View File

@ -154,7 +154,9 @@ GetTableFilePath(const DBMetaOptions& options, meta::TableFileSchema& table_file
}
std::string msg = "Table file doesn't exist: " + file_path;
ENGINE_LOG_ERROR << msg << " in path: " << options.path_ << " for table: " << table_file.table_id_;
if (table_file.file_size_ > 0) { // no need to pop error for empty file
ENGINE_LOG_ERROR << msg << " in path: " << options.path_ << " for table: " << table_file.table_id_;
}
return Status(DB_ERROR, msg);
}

View File

@ -290,45 +290,50 @@ MySQLMetaImpl::Initialize() {
// step 4: validate to avoid open old version schema
ValidateMetaSchema();
// step 5: create meta tables
try {
if (mode_ != DBOptions::MODE::CLUSTER_READONLY) {
CleanUpShadowFiles();
}
// step 5: clean shadow files
if (mode_ != DBOptions::MODE::CLUSTER_READONLY) {
CleanUpShadowFiles();
}
{
mysqlpp::ScopedConnection connectionPtr(*mysql_connection_pool_, safe_grab_);
// step 6: try connect mysql server
mysqlpp::ScopedConnection connectionPtr(*mysql_connection_pool_, safe_grab_);
if (connectionPtr == nullptr) {
return Status(DB_ERROR, "Failed to connect to meta server(mysql)");
}
if (connectionPtr == nullptr) {
std::string msg = "Failed to connect MySQL meta server: " + uri;
ENGINE_LOG_ERROR << msg;
throw Exception(DB_INVALID_META_URI, msg);
}
if (!connectionPtr->thread_aware()) {
ENGINE_LOG_ERROR << "MySQL++ wasn't built with thread awareness! Can't run without it.";
return Status(DB_ERROR, "MySQL++ wasn't built with thread awareness! Can't run without it.");
}
mysqlpp::Query InitializeQuery = connectionPtr->query();
if (!connectionPtr->thread_aware()) {
std::string msg =
"Failed to initialize MySQL meta backend: MySQL client component wasn't built with thread awareness";
ENGINE_LOG_ERROR << msg;
throw Exception(DB_INVALID_META_URI, msg);
}
InitializeQuery << "CREATE TABLE IF NOT EXISTS " << TABLES_SCHEMA.name() << " ("
<< TABLES_SCHEMA.ToString() + ");";
// step 7: create meta table Tables
mysqlpp::Query InitializeQuery = connectionPtr->query();
ENGINE_LOG_DEBUG << "MySQLMetaImpl::Initialize: " << InitializeQuery.str();
InitializeQuery << "CREATE TABLE IF NOT EXISTS " << TABLES_SCHEMA.name() << " (" << TABLES_SCHEMA.ToString() + ");";
if (!InitializeQuery.exec()) {
return HandleException("Initialization Error", InitializeQuery.error());
}
ENGINE_LOG_DEBUG << "MySQLMetaImpl::Initialize: " << InitializeQuery.str();
InitializeQuery << "CREATE TABLE IF NOT EXISTS " << TABLEFILES_SCHEMA.name() << " ("
<< TABLEFILES_SCHEMA.ToString() + ");";
if (!InitializeQuery.exec()) {
std::string msg = "Failed to create meta table 'Tables' in MySQL";
ENGINE_LOG_ERROR << msg;
throw Exception(DB_META_TRANSACTION_FAILED, msg);
}
ENGINE_LOG_DEBUG << "MySQLMetaImpl::Initialize: " << InitializeQuery.str();
// step 8: create meta table TableFiles
InitializeQuery << "CREATE TABLE IF NOT EXISTS " << TABLEFILES_SCHEMA.name() << " ("
<< TABLEFILES_SCHEMA.ToString() + ");";
if (!InitializeQuery.exec()) {
return HandleException("Initialization Error", InitializeQuery.error());
}
} // Scoped Connection
} catch (std::exception& e) {
return HandleException("GENERAL ERROR DURING INITIALIZATION", e.what());
ENGINE_LOG_DEBUG << "MySQLMetaImpl::Initialize: " << InitializeQuery.str();
if (!InitializeQuery.exec()) {
std::string msg = "Failed to create meta table 'TableFiles' in MySQL";
ENGINE_LOG_ERROR << msg;
throw Exception(DB_META_TRANSACTION_FAILED, msg);
}
return Status::OK();
@ -1610,10 +1615,34 @@ MySQLMetaImpl::FilesByType(const std::string& table_id, const std::vector<int>&
}
}
ENGINE_LOG_DEBUG << "Table " << table_id << " currently has raw files:" << raw_count
<< " new files:" << new_count << " new_merge files:" << new_merge_count
<< " new_index files:" << new_index_count << " to_index files:" << to_index_count
<< " index files:" << index_count << " backup files:" << backup_count;
std::string msg = "Get table files by type.";
for (int file_type : file_types) {
switch (file_type) {
case (int)TableFileSchema::RAW:
msg = msg + " raw files:" + std::to_string(raw_count);
break;
case (int)TableFileSchema::NEW:
msg = msg + " new files:" + std::to_string(new_count);
break;
case (int)TableFileSchema::NEW_MERGE:
msg = msg + " new_merge files:" + std::to_string(new_merge_count);
break;
case (int)TableFileSchema::NEW_INDEX:
msg = msg + " new_index files:" + std::to_string(new_index_count);
break;
case (int)TableFileSchema::TO_INDEX:
msg = msg + " to_index files:" + std::to_string(to_index_count);
break;
case (int)TableFileSchema::INDEX:
msg = msg + " index files:" + std::to_string(index_count);
break;
case (int)TableFileSchema::BACKUP:
msg = msg + " backup files:" + std::to_string(backup_count);
break;
default: break;
}
}
ENGINE_LOG_DEBUG << msg;
}
} catch (std::exception& e) {
return HandleException("GENERAL ERROR WHEN GET FILE BY TYPE", e.what());

View File

@ -1157,10 +1157,34 @@ SqliteMetaImpl::FilesByType(const std::string& table_id,
table_files.emplace_back(file_schema);
}
ENGINE_LOG_DEBUG << "Table " << table_id << " currently has raw files:" << raw_count
<< " new files:" << new_count << " new_merge files:" << new_merge_count
<< " new_index files:" << new_index_count << " to_index files:" << to_index_count
<< " index files:" << index_count << " backup files:" << backup_count;
std::string msg = "Get table files by type.";
for (int file_type : file_types) {
switch (file_type) {
case (int)TableFileSchema::RAW:
msg = msg + " raw files:" + std::to_string(raw_count);
break;
case (int)TableFileSchema::NEW:
msg = msg + " new files:" + std::to_string(new_count);
break;
case (int)TableFileSchema::NEW_MERGE:
msg = msg + " new_merge files:" + std::to_string(new_merge_count);
break;
case (int)TableFileSchema::NEW_INDEX:
msg = msg + " new_index files:" + std::to_string(new_index_count);
break;
case (int)TableFileSchema::TO_INDEX:
msg = msg + " to_index files:" + std::to_string(to_index_count);
break;
case (int)TableFileSchema::INDEX:
msg = msg + " index files:" + std::to_string(index_count);
break;
case (int)TableFileSchema::BACKUP:
msg = msg + " backup files:" + std::to_string(backup_count);
break;
default: break;
}
}
ENGINE_LOG_DEBUG << msg;
}
} catch (std::exception& e) {
return HandleException("Encounter exception when check non index files", e.what());

View File

@ -72,6 +72,11 @@ include(ExternalProject)
include(DefineOptionsCore)
include(BuildUtilsCore)
if (CUSTOMIZATION)
set(MILVUS_GPU_VERSION ON)
add_compile_definitions(CUSTOMIZATION)
endif ()
set(KNOWHERE_CPU_VERSION false)
if (MILVUS_GPU_VERSION OR KNOWHERE_GPU_VERSION)
message(STATUS "Building Knowhere GPU version")

View File

@ -49,6 +49,8 @@ else ()
define_option(KNOWHERE_GPU_VERSION "Build GPU version" OFF)
endif ()
define_option(CUSTOMIZATION "Build with customized FAISS library" OFF)
#----------------------------------------------------------------------
set_option_category("Thirdparty")

View File

@ -225,11 +225,11 @@ foreach (_VERSION_ENTRY ${TOOLCHAIN_VERSIONS_TXT})
set(${_LIB_NAME} "${_LIB_VERSION}")
endforeach ()
set(FAISS_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/faiss)
if (DEFINED ENV{FAISS_SOURCE_URL})
set(FAISS_SOURCE_URL "$ENV{FAISS_SOURCE_URL}")
else ()
set(FAISS_SOURCE_URL "https://github.com/JinHai-CN/faiss/archive/${FAISS_VERSION}.tar.gz")
set(FAISS_MD5 "b02c1a53234f5acc9bea1b0c55524f50")
endif ()
if (DEFINED ENV{KNOWHERE_ARROW_URL})
@ -737,12 +737,12 @@ macro(build_faiss)
set(FAISS_COMPUTE_TYPE "gpu")
else ()
set(FAISS_COMPUTE_TYPE "cpu")
endif()
endif ()
if (FAISS_WITH_MKL)
set(FAISS_CACHE_PACKAGE_NAME "faiss_${FAISS_COMPUTE_TYPE}_mkl_${FAISS_COMBINE_MD5}.tar.gz")
else ()
set(FAISS_CACHE_PACKAGE_NAME "faiss_${FAISS_COMPUTE_TYPE}_openblas_${FAISS_COMBINE_MD5}.tar.gz")
endif()
endif ()
set(FAISS_CACHE_URL "${JFROG_ARTFACTORY_CACHE_URL}/${FAISS_CACHE_PACKAGE_NAME}")
set(FAISS_CACHE_PACKAGE_PATH "${THIRDPARTY_PACKAGE_CACHE}/${FAISS_CACHE_PACKAGE_NAME}")
@ -779,21 +779,41 @@ macro(build_faiss)
endif ()
endif ()
else ()
externalproject_add(faiss_ep
URL
${FAISS_SOURCE_URL}
${EP_LOG_OPTIONS}
CONFIGURE_COMMAND
"./configure"
${FAISS_CONFIGURE_ARGS}
BUILD_COMMAND
${MAKE} ${MAKE_BUILD_ARGS} all
BUILD_IN_SOURCE
1
INSTALL_COMMAND
${MAKE} install
BUILD_BYPRODUCTS
${FAISS_STATIC_LIB})
if (CUSTOMIZATION)
externalproject_add(faiss_ep
DOWNLOAD_COMMAND
""
SOURCE_DIR
${FAISS_SOURCE_DIR}
${EP_LOG_OPTIONS}
CONFIGURE_COMMAND
"./configure"
${FAISS_CONFIGURE_ARGS}
BUILD_COMMAND
${MAKE} ${MAKE_BUILD_ARGS} all
BUILD_IN_SOURCE
1
INSTALL_COMMAND
${MAKE} install
BUILD_BYPRODUCTS
${FAISS_STATIC_LIB})
else ()
externalproject_add(faiss_ep
URL
${FAISS_SOURCE_URL}
${EP_LOG_OPTIONS}
CONFIGURE_COMMAND
"./configure"
${FAISS_CONFIGURE_ARGS}
BUILD_COMMAND
${MAKE} ${MAKE_BUILD_ARGS} all
BUILD_IN_SOURCE
1
INSTALL_COMMAND
${MAKE} install
BUILD_BYPRODUCTS
${FAISS_STATIC_LIB})
endif ()
if (NOT FAISS_WITH_MKL)
ExternalProject_Add_StepDependencies(faiss_ep build openblas_ep lapack_ep)

View File

@ -0,0 +1 @@
sift1M

View File

@ -0,0 +1,21 @@
*.swp
*.swo
*.o
*.a
*.dSYM
*.so
*.dylib
*.pyc
*~
.DS_Store
depend
/config.*
/aclocal.m4
/autom4te.cache/
/makefile.inc
/bin/
/c_api/bin/
/c_api/gpu/bin/
/tests/test
/tests/gtest/
include/

View File

@ -0,0 +1,719 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
/*
* implementation of Hyper-parameter auto-tuning
*/
#include <faiss/AutoTune.h>
#include <cmath>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/utils.h>
#include <faiss/utils/random.h>
#include <faiss/IndexFlat.h>
#include <faiss/VectorTransform.h>
#include <faiss/IndexPreTransform.h>
#include <faiss/IndexLSH.h>
#include <faiss/IndexPQ.h>
#include <faiss/IndexIVF.h>
#include <faiss/IndexIVFPQ.h>
#include <faiss/IndexIVFPQR.h>
#include <faiss/IndexIVFFlat.h>
#include <faiss/MetaIndexes.h>
#include <faiss/IndexScalarQuantizer.h>
#include <faiss/IndexHNSW.h>
#include <faiss/IndexBinaryFlat.h>
#include <faiss/IndexBinaryHNSW.h>
#include <faiss/IndexBinaryIVF.h>
namespace faiss {
AutoTuneCriterion::AutoTuneCriterion (idx_t nq, idx_t nnn):
nq (nq), nnn (nnn), gt_nnn (0)
{}
void AutoTuneCriterion::set_groundtruth (
int gt_nnn, const float *gt_D_in, const idx_t *gt_I_in)
{
this->gt_nnn = gt_nnn;
if (gt_D_in) { // allow null for this, as it is often not used
gt_D.resize (nq * gt_nnn);
memcpy (gt_D.data(), gt_D_in, sizeof (gt_D[0]) * nq * gt_nnn);
}
gt_I.resize (nq * gt_nnn);
memcpy (gt_I.data(), gt_I_in, sizeof (gt_I[0]) * nq * gt_nnn);
}
OneRecallAtRCriterion::OneRecallAtRCriterion (idx_t nq, idx_t R):
AutoTuneCriterion(nq, R), R(R)
{}
double OneRecallAtRCriterion::evaluate(const float* /*D*/, const idx_t* I)
const {
FAISS_THROW_IF_NOT_MSG(
(gt_I.size() == gt_nnn * nq && gt_nnn >= 1 && nnn >= R),
"ground truth not initialized");
idx_t n_ok = 0;
for (idx_t q = 0; q < nq; q++) {
idx_t gt_nn = gt_I[q * gt_nnn];
const idx_t* I_line = I + q * nnn;
for (int i = 0; i < R; i++) {
if (I_line[i] == gt_nn) {
n_ok++;
break;
}
}
}
return n_ok / double(nq);
}
IntersectionCriterion::IntersectionCriterion (idx_t nq, idx_t R):
AutoTuneCriterion(nq, R), R(R)
{}
double IntersectionCriterion::evaluate(const float* /*D*/, const idx_t* I)
const {
FAISS_THROW_IF_NOT_MSG(
(gt_I.size() == gt_nnn * nq && gt_nnn >= R && nnn >= R),
"ground truth not initialized");
int64_t n_ok = 0;
#pragma omp parallel for reduction(+: n_ok)
for (idx_t q = 0; q < nq; q++) {
n_ok += ranklist_intersection_size (
R, &gt_I [q * gt_nnn],
R, I + q * nnn);
}
return n_ok / double (nq * R);
}
/***************************************************************
* OperatingPoints
***************************************************************/
OperatingPoints::OperatingPoints ()
{
clear();
}
void OperatingPoints::clear ()
{
all_pts.clear();
optimal_pts.clear();
/// default point: doing nothing gives 0 performance and takes 0 time
OperatingPoint op = {0, 0, "", -1};
optimal_pts.push_back(op);
}
/// add a performance measure
bool OperatingPoints::add (double perf, double t, const std::string & key,
size_t cno)
{
OperatingPoint op = {perf, t, key, int64_t(cno)};
all_pts.push_back (op);
if (perf == 0) {
return false; // no method for 0 accuracy is faster than doing nothing
}
std::vector<OperatingPoint> & a = optimal_pts;
if (perf > a.back().perf) {
// keep unconditionally
a.push_back (op);
} else if (perf == a.back().perf) {
if (t < a.back ().t) {
a.back() = op;
} else {
return false;
}
} else {
int i;
// stricto sensu this should be a bisection
for (i = 0; i < a.size(); i++) {
if (a[i].perf >= perf) break;
}
assert (i < a.size());
if (t < a[i].t) {
if (a[i].perf == perf) {
a[i] = op;
} else {
a.insert (a.begin() + i, op);
}
} else {
return false;
}
}
{ // remove non-optimal points from array
int i = a.size() - 1;
while (i > 0) {
if (a[i].t < a[i - 1].t)
a.erase (a.begin() + (i - 1));
i--;
}
}
return true;
}
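// Usage sketch for the bookkeeping above (values are illustrative, not from
// any benchmark): optimal_pts stays a Pareto frontier over (perf, t), so a
// point survives only if no other point reaches at least its perf in no
// more time.
//
//     OperatingPoints ops;
//     ops.add(0.50, 1.0, "nprobe=4",  0);  // kept
//     ops.add(0.70, 2.0, "nprobe=16", 1);  // kept: more accurate but slower
//     ops.add(0.60, 3.0, "nprobe=8",  2);  // dropped: slower and less
//                                          // accurate than "nprobe=16"
//     ops.display();  // prints only the optimal points by default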
int OperatingPoints::merge_with (const OperatingPoints &other,
const std::string & prefix)
{
int n_add = 0;
for (int i = 0; i < other.all_pts.size(); i++) {
const OperatingPoint & op = other.all_pts[i];
if (add (op.perf, op.t, prefix + op.key, op.cno))
n_add++;
}
return n_add;
}
/// get time required to obtain a given performance measure
double OperatingPoints::t_for_perf (double perf) const
{
const std::vector<OperatingPoint> & a = optimal_pts;
if (perf > a.back().perf) return 1e50;
int i0 = -1, i1 = a.size() - 1;
while (i0 + 1 < i1) {
int imed = (i0 + i1 + 1) / 2;
if (a[imed].perf < perf) i0 = imed;
else i1 = imed;
}
return a[i1].t;
}
void OperatingPoints::all_to_gnuplot (const char *fname) const
{
FILE *f = fopen(fname, "w");
if (!f) {
fprintf (stderr, "cannot open %s", fname);
perror("");
abort();
}
for (int i = 0; i < all_pts.size(); i++) {
const OperatingPoint & op = all_pts[i];
fprintf (f, "%g %g %s\n", op.perf, op.t, op.key.c_str());
}
fclose(f);
}
void OperatingPoints::optimal_to_gnuplot (const char *fname) const
{
FILE *f = fopen(fname, "w");
if (!f) {
fprintf (stderr, "cannot open %s", fname);
perror("");
abort();
}
double prev_perf = 0.0;
for (int i = 0; i < optimal_pts.size(); i++) {
const OperatingPoint & op = optimal_pts[i];
fprintf (f, "%g %g\n", prev_perf, op.t);
fprintf (f, "%g %g %s\n", op.perf, op.t, op.key.c_str());
prev_perf = op.perf;
}
fclose(f);
}
void OperatingPoints::display (bool only_optimal) const
{
const std::vector<OperatingPoint> &pts =
only_optimal ? optimal_pts : all_pts;
printf("Tested %ld operating points, %ld ones are optimal:\n",
all_pts.size(), optimal_pts.size());
for (int i = 0; i < pts.size(); i++) {
const OperatingPoint & op = pts[i];
const char *star = "";
if (!only_optimal) {
for (int j = 0; j < optimal_pts.size(); j++) {
if (op.cno == optimal_pts[j].cno) {
star = "*";
break;
}
}
}
printf ("cno=%ld key=%s perf=%.4f t=%.3f %s\n",
op.cno, op.key.c_str(), op.perf, op.t, star);
}
}
/***************************************************************
* ParameterSpace
***************************************************************/
ParameterSpace::ParameterSpace ():
verbose (1), n_experiments (500),
batchsize (1<<30), thread_over_batches (false),
min_test_duration (0)
{
}
/* not keeping this constructor as inheritors will call the parent
initialize()
*/
#if 0
ParameterSpace::ParameterSpace (Index *index):
verbose (1), n_experiments (500),
batchsize (1<<30), thread_over_batches (false)
{
initialize(index);
}
#endif
size_t ParameterSpace::n_combinations () const
{
size_t n = 1;
for (int i = 0; i < parameter_ranges.size(); i++)
n *= parameter_ranges[i].values.size();
return n;
}
/// get string representation of the combination
std::string ParameterSpace::combination_name (size_t cno) const {
char buf[1000], *wp = buf;
*wp = 0;
for (int i = 0; i < parameter_ranges.size(); i++) {
const ParameterRange & pr = parameter_ranges[i];
size_t j = cno % pr.values.size();
cno /= pr.values.size();
wp += snprintf (
wp, buf + 1000 - wp, "%s%s=%g", i == 0 ? "" : ",",
pr.name.c_str(), pr.values[j]);
}
return std::string (buf);
}
bool ParameterSpace::combination_ge (size_t c1, size_t c2) const
{
for (int i = 0; i < parameter_ranges.size(); i++) {
int nval = parameter_ranges[i].values.size();
size_t j1 = c1 % nval;
size_t j2 = c2 % nval;
if (!(j1 >= j2)) return false;
c1 /= nval;
c2 /= nval;
}
return true;
}
#define DC(classname) \
const classname *ix = dynamic_cast<const classname *>(index)
static void init_pq_ParameterRange (const ProductQuantizer & pq,
ParameterRange & pr)
{
if (pq.code_size % 4 == 0) {
// Polysemous not supported for code sizes that are not a
// multiple of 4
for (int i = 2; i <= pq.code_size * 8 / 2; i+= 2)
pr.values.push_back(i);
}
pr.values.push_back (pq.code_size * 8);
}
ParameterRange &ParameterSpace::add_range(const char * name)
{
for (auto & pr : parameter_ranges) {
if (pr.name == name) {
return pr;
}
}
parameter_ranges.push_back (ParameterRange ());
parameter_ranges.back ().name = name;
return parameter_ranges.back ();
}
/// initialize with reasonable parameters for the index
void ParameterSpace::initialize (const Index * index)
{
if (DC (IndexPreTransform)) {
index = ix->index;
}
if (DC (IndexRefineFlat)) {
ParameterRange & pr = add_range("k_factor_rf");
for (int i = 0; i <= 6; i++) {
pr.values.push_back (1 << i);
}
index = ix->base_index;
}
if (DC (IndexPreTransform)) {
index = ix->index;
}
if (DC (IndexIVF)) {
{
ParameterRange & pr = add_range("nprobe");
for (int i = 0; i < 13; i++) {
size_t nprobe = 1 << i;
if (nprobe >= ix->nlist) break;
pr.values.push_back (nprobe);
}
}
if (dynamic_cast<const IndexHNSW*>(ix->quantizer)) {
ParameterRange & pr = add_range("efSearch");
for (int i = 2; i <= 9; i++) {
pr.values.push_back (1 << i);
}
}
}
if (DC (IndexPQ)) {
ParameterRange & pr = add_range("ht");
init_pq_ParameterRange (ix->pq, pr);
}
if (DC (IndexIVFPQ)) {
ParameterRange & pr = add_range("ht");
init_pq_ParameterRange (ix->pq, pr);
}
if (DC (IndexIVF)) {
const MultiIndexQuantizer *miq =
dynamic_cast<const MultiIndexQuantizer *> (ix->quantizer);
if (miq) {
ParameterRange & pr_max_codes = add_range("max_codes");
for (int i = 8; i < 20; i++) {
pr_max_codes.values.push_back (1 << i);
}
pr_max_codes.values.push_back (
std::numeric_limits<double>::infinity()
);
}
}
if (DC (IndexIVFPQR)) {
ParameterRange & pr = add_range("k_factor");
for (int i = 0; i <= 6; i++) {
pr.values.push_back (1 << i);
}
}
if (dynamic_cast<const IndexHNSW*>(index)) {
ParameterRange & pr = add_range("efSearch");
for (int i = 2; i <= 9; i++) {
pr.values.push_back (1 << i);
}
}
}
#undef DC
// non-const version
#define DC(classname) classname *ix = dynamic_cast<classname *>(index)
/// set a combination of parameters on an index
void ParameterSpace::set_index_parameters (Index *index, size_t cno) const
{
for (int i = 0; i < parameter_ranges.size(); i++) {
const ParameterRange & pr = parameter_ranges[i];
size_t j = cno % pr.values.size();
cno /= pr.values.size();
double val = pr.values [j];
set_index_parameter (index, pr.name, val);
}
}
/// set a combination of parameters on an index
void ParameterSpace::set_index_parameters (
Index *index, const char *description_in) const
{
char description[strlen(description_in) + 1];
char *ptr;
memcpy (description, description_in, strlen(description_in) + 1);
for (char *tok = strtok_r (description, " ,", &ptr);
tok;
tok = strtok_r (nullptr, " ,", &ptr)) {
char name[100];
double val;
int ret = sscanf (tok, "%100[^=]=%lf", name, &val);
FAISS_THROW_IF_NOT_FMT (
ret == 2, "could not interpret parameters %s", tok);
set_index_parameter (index, name, val);
}
}
void ParameterSpace::set_index_parameter (
Index * index, const std::string & name, double val) const
{
if (verbose > 1)
printf(" set %s=%g\n", name.c_str(), val);
if (name == "verbose") {
index->verbose = int(val);
// and fall through to also enable it on sub-indexes
}
if (DC (IndexPreTransform)) {
set_index_parameter (ix->index, name, val);
return;
}
if (DC (IndexShards)) {
// call on all sub-indexes
auto fn =
[this, name, val](int, Index* subIndex) {
set_index_parameter(subIndex, name, val);
};
ix->runOnIndex(fn);
return;
}
if (DC (IndexReplicas)) {
// call on all sub-indexes
auto fn =
[this, name, val](int, Index* subIndex) {
set_index_parameter(subIndex, name, val);
};
ix->runOnIndex(fn);
return;
}
if (DC (IndexRefineFlat)) {
if (name == "k_factor_rf") {
ix->k_factor = int(val);
return;
}
// otherwise it is for the sub-index
set_index_parameter (&ix->refine_index, name, val);
return;
}
if (name == "verbose") {
index->verbose = int(val);
return; // last verbose that we could find
}
if (name == "nprobe") {
if (DC (IndexIDMap)) {
set_index_parameter (ix->index, name, val);
return;
} else if (DC (IndexIVF)) {
ix->nprobe = int(val);
return;
}
}
if (name == "ht") {
if (DC (IndexPQ)) {
if (val >= ix->pq.code_size * 8) {
ix->search_type = IndexPQ::ST_PQ;
} else {
ix->search_type = IndexPQ::ST_polysemous;
ix->polysemous_ht = int(val);
}
return;
} else if (DC (IndexIVFPQ)) {
if (val >= ix->pq.code_size * 8) {
ix->polysemous_ht = 0;
} else {
ix->polysemous_ht = int(val);
}
return;
}
}
if (name == "k_factor") {
if (DC (IndexIVFPQR)) {
ix->k_factor = val;
return;
}
}
if (name == "max_codes") {
if (DC (IndexIVF)) {
ix->max_codes = std::isfinite(val) ? size_t(val) : 0;
return;
}
}
if (name == "efSearch") {
if (DC (IndexHNSW)) {
ix->hnsw.efSearch = int(val);
return;
}
if (DC (IndexIVF)) {
if (IndexHNSW *cq =
dynamic_cast<IndexHNSW *>(ix->quantizer)) {
cq->hnsw.efSearch = int(val);
return;
}
}
}
FAISS_THROW_FMT ("ParameterSpace::set_index_parameter:"
"could not set parameter %s",
name.c_str());
}
void ParameterSpace::display () const
{
printf ("ParameterSpace, %ld parameters, %ld combinations:\n",
parameter_ranges.size (), n_combinations ());
for (int i = 0; i < parameter_ranges.size(); i++) {
const ParameterRange & pr = parameter_ranges[i];
printf (" %s: ", pr.name.c_str ());
char sep = '[';
for (int j = 0; j < pr.values.size(); j++) {
printf ("%c %g", sep, pr.values [j]);
sep = ',';
}
printf ("]\n");
}
}
void ParameterSpace::update_bounds (size_t cno, const OperatingPoint & op,
double *upper_bound_perf,
double *lower_bound_t) const
{
if (combination_ge (cno, op.cno)) {
if (op.t > *lower_bound_t) *lower_bound_t = op.t;
}
if (combination_ge (op.cno, cno)) {
if (op.perf < *upper_bound_perf) *upper_bound_perf = op.perf;
}
}
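// Reading of the bounds above (inferred from the code, not stated in the
// source): since each parameter's values are ordered from least to most
// expensive/accurate,
//
//     c1 >= c2 componentwise  =>  perf(c1) >= perf(c2) and t(c1) >= t(c2)
//
// So for a candidate cno, every measured op with cno >= op.cno yields
// t(cno) >= op.t (a lower bound on time), and every op with op.cno >= cno
// yields perf(cno) <= op.perf (an upper bound on performance). explore()
// uses these to skip cno when an already-known point reaches the
// upper-bound performance within the lower-bound time.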
void ParameterSpace::explore (Index *index,
size_t nq, const float *xq,
const AutoTuneCriterion & crit,
OperatingPoints * ops) const
{
FAISS_THROW_IF_NOT_MSG (nq == crit.nq,
"criterion does not have the same nb of queries");
size_t n_comb = n_combinations ();
if (n_experiments == 0) {
for (size_t cno = 0; cno < n_comb; cno++) {
set_index_parameters (index, cno);
std::vector<Index::idx_t> I(nq * crit.nnn);
std::vector<float> D(nq * crit.nnn);
double t0 = getmillisecs ();
index->search (nq, xq, crit.nnn, D.data(), I.data());
double t_search = (getmillisecs() - t0) / 1e3;
double perf = crit.evaluate (D.data(), I.data());
bool keep = ops->add (perf, t_search, combination_name (cno), cno);
if (verbose)
printf(" %ld/%ld: %s perf=%.3f t=%.3f s %s\n", cno, n_comb,
combination_name (cno).c_str(), perf, t_search,
keep ? "*" : "");
}
return;
}
int n_exp = n_experiments;
if (n_exp > n_comb) n_exp = n_comb;
FAISS_THROW_IF_NOT (n_comb == 1 || n_exp > 2);
std::vector<int> perm (n_comb);
// make sure the slowest and fastest experiment are run
perm[0] = 0;
if (n_comb > 1) {
perm[1] = n_comb - 1;
rand_perm (&perm[2], n_comb - 2, 1234);
for (int i = 2; i < perm.size(); i++) perm[i] ++;
}
for (size_t xp = 0; xp < n_exp; xp++) {
size_t cno = perm[xp];
if (verbose)
printf(" %ld/%d: cno=%ld %s ", xp, n_exp, cno,
combination_name (cno).c_str());
{
double lower_bound_t = 0.0;
double upper_bound_perf = 1.0;
for (int i = 0; i < ops->all_pts.size(); i++) {
update_bounds (cno, ops->all_pts[i],
&upper_bound_perf, &lower_bound_t);
}
double best_t = ops->t_for_perf (upper_bound_perf);
if (verbose)
printf ("bounds [perf<=%.3f t>=%.3f] %s",
upper_bound_perf, lower_bound_t,
best_t <= lower_bound_t ? "skip\n" : "");
if (best_t <= lower_bound_t) continue;
}
set_index_parameters (index, cno);
std::vector<Index::idx_t> I(nq * crit.nnn);
std::vector<float> D(nq * crit.nnn);
double t0 = getmillisecs ();
int nrun = 0;
double t_search;
do {
if (thread_over_batches) {
#pragma omp parallel for
for (size_t q0 = 0; q0 < nq; q0 += batchsize) {
size_t q1 = q0 + batchsize;
if (q1 > nq) q1 = nq;
index->search (q1 - q0, xq + q0 * index->d,
crit.nnn,
D.data() + q0 * crit.nnn,
I.data() + q0 * crit.nnn);
}
} else {
for (size_t q0 = 0; q0 < nq; q0 += batchsize) {
size_t q1 = q0 + batchsize;
if (q1 > nq) q1 = nq;
index->search (q1 - q0, xq + q0 * index->d,
crit.nnn,
D.data() + q0 * crit.nnn,
I.data() + q0 * crit.nnn);
}
}
nrun ++;
t_search = (getmillisecs() - t0) / 1e3;
} while (t_search < min_test_duration);
t_search /= nrun;
double perf = crit.evaluate (D.data(), I.data());
bool keep = ops->add (perf, t_search, combination_name (cno), cno);
if (verbose)
printf(" perf %.3f t %.3f (%d runs) %s\n",
perf, t_search, nrun,
keep ? "*" : "");
}
}
} // namespace faiss

View File

@ -0,0 +1,212 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_AUTO_TUNE_H
#define FAISS_AUTO_TUNE_H
#include <vector>
#include <unordered_map>
#include <stdint.h>
#include <faiss/Index.h>
#include <faiss/IndexBinary.h>
namespace faiss {
/**
* Evaluation criterion. Returns a performance measure in [0,1],
* higher is better.
*/
struct AutoTuneCriterion {
typedef Index::idx_t idx_t;
idx_t nq; ///< nb of queries this criterion is evaluated on
idx_t nnn; ///< nb of NNs that the query should request
idx_t gt_nnn; ///< nb of GT NNs required to evaluate criterion
std::vector<float> gt_D; ///< Ground-truth distances (size nq * gt_nnn)
std::vector<idx_t> gt_I; ///< Ground-truth indexes (size nq * gt_nnn)
AutoTuneCriterion (idx_t nq, idx_t nnn);
/** Initializes the gt_D and gt_I vectors. Must be called before evaluating
*
* @param gt_D_in size nq * gt_nnn
* @param gt_I_in size nq * gt_nnn
*/
void set_groundtruth (int gt_nnn, const float *gt_D_in,
const idx_t *gt_I_in);
/** Evaluate the criterion.
*
* @param D size nq * nnn
* @param I size nq * nnn
* @return the criterion, between 0 and 1. Larger is better.
*/
virtual double evaluate (const float *D, const idx_t *I) const = 0;
virtual ~AutoTuneCriterion () {}
};
struct OneRecallAtRCriterion: AutoTuneCriterion {
idx_t R;
OneRecallAtRCriterion (idx_t nq, idx_t R);
double evaluate(const float* D, const idx_t* I) const override;
~OneRecallAtRCriterion() override {}
};
struct IntersectionCriterion: AutoTuneCriterion {
idx_t R;
IntersectionCriterion (idx_t nq, idx_t R);
double evaluate(const float* D, const idx_t* I) const override;
~IntersectionCriterion() override {}
};
/**
* Maintains a list of experimental results. Each operating point is a
* (perf, t, key) triplet, where higher perf and lower t is
* better. The key field is an arbitrary identifier for the operating point
*/
struct OperatingPoint {
double perf; ///< performance measure (output of a Criterion)
double t; ///< corresponding execution time (ms)
std::string key; ///< key that identifies this op pt
int64_t cno; ///< integer identifier
};
struct OperatingPoints {
/// all operating points
std::vector<OperatingPoint> all_pts;
/// optimal operating points, sorted by perf
std::vector<OperatingPoint> optimal_pts;
// begins with a single operating point: t=0, perf=0
OperatingPoints ();
/// add operating points from other to this, with a prefix to the keys
int merge_with (const OperatingPoints &other,
const std::string & prefix = "");
void clear ();
/// add a performance measure. Return whether it is an optimal point
bool add (double perf, double t, const std::string & key, size_t cno = 0);
/// get time required to obtain a given performance measure
double t_for_perf (double perf) const;
/// easy-to-read output
void display (bool only_optimal = true) const;
/// output to a format easy to digest by gnuplot
void all_to_gnuplot (const char *fname) const;
void optimal_to_gnuplot (const char *fname) const;
};
/// possible values of a parameter, sorted from least to most expensive/accurate
struct ParameterRange {
std::string name;
std::vector<double> values;
};
/** Uses a-priori knowledge on the Faiss indexes to extract tunable parameters.
*/
struct ParameterSpace {
/// all tunable parameters
std::vector<ParameterRange> parameter_ranges;
// exploration parameters
/// verbosity during exploration
int verbose;
/// nb of experiments during optimization (0 = try all combinations)
int n_experiments;
/// maximum number of queries to submit at a time.
size_t batchsize;
/// use multithreading over batches (useful to benchmark
/// independent single-searches)
bool thread_over_batches;
/// run tests several times until they reach at least this
/// duration (to avoid jittering in MT mode)
double min_test_duration;
ParameterSpace ();
/// nb of combinations, = product of values sizes
size_t n_combinations () const;
/// returns whether combinations c1 >= c2 in the tuple sense
bool combination_ge (size_t c1, size_t c2) const;
/// get string representation of the combination
std::string combination_name (size_t cno) const;
/// print a description on stdout
void display () const;
/// add a new parameter (or return it if it exists)
ParameterRange &add_range(const char * name);
/// initialize with reasonable parameters for the index
virtual void initialize (const Index * index);
/// set a combination of parameters on an index
void set_index_parameters (Index *index, size_t cno) const;
/// set a combination of parameters described by a string
void set_index_parameters (Index *index, const char *param_string) const;
/// set one of the parameters
virtual void set_index_parameter (
Index * index, const std::string & name, double val) const;
/** find an upper bound on the performance and a lower bound on t
* for configuration cno given another operating point op */
void update_bounds (size_t cno, const OperatingPoint & op,
double *upper_bound_perf,
double *lower_bound_t) const;
/** explore operating points
* @param index index to run on
* @param xq query vectors (size nq * index.d)
* @param crit selection criterion
* @param ops resulting operating points
*/
void explore (Index *index,
size_t nq, const float *xq,
const AutoTuneCriterion & crit,
OperatingPoints * ops) const;
virtual ~ParameterSpace () {}
};
} // namespace faiss
#endif
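
For context, the intended flow with these classes: build a criterion from ground-truth search results, let ParameterSpace discover the index's tunable parameters, and have explore fill an OperatingPoints set. A hedged end-to-end sketch against the API declared above (random data and parameter choices are placeholders; error handling omitted):

#include <faiss/AutoTune.h>
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFFlat.h>
#include <random>
#include <vector>

int main() {
    int d = 32;
    size_t nb = 10000, nq = 100;
    std::mt19937 rng(123);
    std::uniform_real_distribution<float> u(0.0f, 1.0f);
    std::vector<float> xb(nb * d), xq(nq * d);
    for (auto& v : xb) v = u(rng);
    for (auto& v : xq) v = u(rng);

    // index under test: IVF-Flat with 64 inverted lists
    faiss::IndexFlatL2 quantizer(d);
    faiss::IndexIVFFlat index(&quantizer, d, 64);
    index.train(nb, xb.data());
    index.add(nb, xb.data());

    // ground truth (1-NN) from brute-force search
    faiss::IndexFlatL2 flat(d);
    flat.add(nb, xb.data());
    std::vector<faiss::Index::idx_t> gt_I(nq);
    std::vector<float> gt_D(nq);
    flat.search(nq, xq.data(), 1, gt_D.data(), gt_I.data());

    faiss::OneRecallAtRCriterion crit(nq, 1);
    crit.set_groundtruth(1, gt_D.data(), gt_I.data());

    faiss::ParameterSpace ps;
    ps.initialize(&index);  // discovers e.g. the nprobe range
    faiss::OperatingPoints ops;
    ps.explore(&index, nq, xq.data(), crit, &ops);
    ops.display();  // Pareto-optimal (recall, time) operating points
    return 0;
}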

View File

@ -0,0 +1,2 @@
# Code of Conduct
Facebook has adopted a Code of Conduct that we expect project participants to adhere to. Please [read the full text](https://code.fb.com/codeofconduct) so that you can understand what actions will and will not be tolerated.

View File

@ -0,0 +1,53 @@
# Contributing to Faiss
We want to make contributing to this project as easy and transparent as
possible.
## Our Development Process
We mainly develop Faiss within Facebook. Sometimes, we will sync the
github version of Faiss with the internal state.
## Pull Requests
We welcome pull requests that add significant value to Faiss. If you plan to do
a major development and contribute it back to Faiss, please contact us first before
putting too much effort into it.
1. Fork the repo and create your branch from `master`.
2. If you've added code that should be tested, add tests.
3. If you've changed APIs, update the documentation.
4. Ensure the test suite passes.
5. Make sure your code lints.
6. If you haven't already, complete the Contributor License Agreement ("CLA").
There is a Facebook internal test suite for Faiss, and we need to run
all changes to Faiss through it.
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Facebook's open source projects.
Complete your CLA here: <https://code.facebook.com/cla>
## Issues
We use GitHub issues to track public bugs. Please ensure your description is
clear and has sufficient instructions to be able to reproduce the issue.
Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.
## Coding Style
* 4 or 2 spaces for indentation in C++ (no tabs)
* 80 character line length (both for C++ and Python)
* C++ language level: C++11
## License
By contributing to Faiss, you agree that your contributions will be licensed
under the LICENSE file in the root directory of this source tree.

View File

@ -0,0 +1,261 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/Clustering.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <faiss/utils/utils.h>
#include <faiss/utils/random.h>
#include <faiss/utils/distances.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/IndexFlat.h>
namespace faiss {
ClusteringParameters::ClusteringParameters ():
niter(25),
nredo(1),
verbose(false),
spherical(false),
int_centroids(false),
update_index(false),
frozen_centroids(false),
min_points_per_centroid(39),
max_points_per_centroid(256),
seed(1234)
{}
// 39 corresponds to 10000 / 256 -> to avoid warnings on PQ tests with randu10k
Clustering::Clustering (int d, int k):
d(d), k(k) {}
Clustering::Clustering (int d, int k, const ClusteringParameters &cp):
ClusteringParameters (cp), d(d), k(k) {}
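// ratio between the actual and ideal sum of squared inverted-list sizes:
// uf = k * sum_i hist[i]^2 / n^2; a perfectly balanced assignment gives 1.0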
static double imbalance_factor (int n, int k, int64_t *assign) {
std::vector<int> hist(k, 0);
for (int i = 0; i < n; i++)
hist[assign[i]]++;
double tot = 0, uf = 0;
for (int i = 0 ; i < k ; i++) {
tot += hist[i];
uf += hist[i] * (double) hist[i];
}
uf = uf * k / (tot * tot);
return uf;
}
void Clustering::post_process_centroids ()
{
if (spherical) {
fvec_renorm_L2 (d, k, centroids.data());
}
if (int_centroids) {
for (size_t i = 0; i < centroids.size(); i++)
centroids[i] = roundf (centroids[i]);
}
}
void Clustering::train (idx_t nx, const float *x_in, Index & index) {
FAISS_THROW_IF_NOT_FMT (nx >= k,
"Number of training points (%ld) should be at least "
"as large as number of clusters (%ld)", nx, k);
double t0 = getmillisecs();
// yes it is the user's responsibility, but it may spare us some
// hard-to-debug reports.
for (size_t i = 0; i < nx * d; i++) {
FAISS_THROW_IF_NOT_MSG (finite (x_in[i]),
"input contains NaN's or Inf's");
}
const float *x = x_in;
ScopeDeleter<float> del1;
if (nx > k * max_points_per_centroid) {
if (verbose)
printf("Sampling a subset of %ld / %ld for training\n",
k * max_points_per_centroid, nx);
std::vector<int> perm (nx);
rand_perm (perm.data (), nx, seed);
nx = k * max_points_per_centroid;
float * x_new = new float [nx * d];
for (idx_t i = 0; i < nx; i++)
memcpy (x_new + i * d, x + perm[i] * d, sizeof(x_new[0]) * d);
x = x_new;
del1.set (x);
} else if (nx < k * min_points_per_centroid) {
fprintf (stderr,
"WARNING clustering %ld points to %ld centroids: "
"please provide at least %ld training points\n",
nx, k, idx_t(k) * min_points_per_centroid);
}
if (nx == k) {
if (verbose) {
printf("Number of training points (%ld) same as number of "
"clusters, just copying\n", nx);
}
// this is a corner case, just copy training set to clusters
centroids.resize (d * k);
memcpy (centroids.data(), x_in, sizeof (*x_in) * d * k);
index.reset();
index.add(k, x_in);
return;
}
if (verbose)
printf("Clustering %d points in %ldD to %ld clusters, "
"redo %d times, %d iterations\n",
int(nx), d, k, nredo, niter);
idx_t * assign = new idx_t[nx];
ScopeDeleter<idx_t> del (assign);
float * dis = new float[nx];
ScopeDeleter<float> del2(dis);
// for redo
float best_err = HUGE_VALF;
std::vector<float> best_obj;
std::vector<float> best_centroids;
// support input centroids
FAISS_THROW_IF_NOT_MSG (
centroids.size() % d == 0,
"size of provided input centroids not a multiple of dimension");
size_t n_input_centroids = centroids.size() / d;
if (verbose && n_input_centroids > 0) {
printf (" Using %zd centroids provided as input (%sfrozen)\n",
n_input_centroids, frozen_centroids ? "" : "not ");
}
double t_search_tot = 0;
if (verbose) {
printf(" Preprocessing in %.2f s\n",
(getmillisecs() - t0) / 1000.);
}
t0 = getmillisecs();
for (int redo = 0; redo < nredo; redo++) {
if (verbose && nredo > 1) {
printf("Outer iteration %d / %d\n", redo, nredo);
}
// initialize remaining centroids with random points from the dataset
centroids.resize (d * k);
std::vector<int> perm (nx);
rand_perm (perm.data(), nx, seed + 1 + redo * 15486557L);
for (int i = n_input_centroids; i < k ; i++)
memcpy (&centroids[i * d], x + perm[i] * d,
d * sizeof (float));
post_process_centroids ();
if (index.ntotal != 0) {
index.reset();
}
if (!index.is_trained) {
index.train (k, centroids.data());
}
index.add (k, centroids.data());
float err = 0;
for (int i = 0; i < niter; i++) {
double t0s = getmillisecs();
index.search (nx, x, 1, dis, assign);
InterruptCallback::check();
t_search_tot += getmillisecs() - t0s;
err = 0;
for (int j = 0; j < nx; j++)
err += dis[j];
obj.push_back (err);
int nsplit = km_update_centroids (
x, centroids.data(),
assign, d, k, nx, frozen_centroids ? n_input_centroids : 0);
if (verbose) {
printf (" Iteration %d (%.2f s, search %.2f s): "
"objective=%g imbalance=%.3f nsplit=%d \r",
i, (getmillisecs() - t0) / 1000.0,
t_search_tot / 1000,
err, imbalance_factor (nx, k, assign),
nsplit);
fflush (stdout);
}
post_process_centroids ();
index.reset ();
if (update_index)
index.train (k, centroids.data());
assert (index.ntotal == 0);
index.add (k, centroids.data());
InterruptCallback::check ();
}
if (verbose) printf("\n");
if (nredo > 1) {
if (err < best_err) {
if (verbose)
printf ("Objective improved: keep new clusters\n");
best_centroids = centroids;
best_obj = obj;
best_err = err;
}
index.reset ();
}
}
if (nredo > 1) {
centroids = best_centroids;
obj = best_obj;
index.reset();
index.add(k, best_centroids.data());
}
}
float kmeans_clustering (size_t d, size_t n, size_t k,
const float *x,
float *centroids)
{
Clustering clus (d, k);
clus.verbose = d * n * k > (1L << 30);
// display logs if > 1Gflop per iteration
IndexFlatL2 index (d);
clus.train (n, x, index);
memcpy(centroids, clus.centroids.data(), sizeof(*centroids) * d * k);
return clus.obj.back();
}
} // namespace faiss

View File

@ -0,0 +1,101 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_CLUSTERING_H
#define FAISS_CLUSTERING_H
#include <faiss/Index.h>
#include <vector>
namespace faiss {
/** Class for the clustering parameters. Can be passed to the
* constructor of the Clustering object.
*/
struct ClusteringParameters {
int niter; ///< clustering iterations
int nredo; ///< redo clustering this many times and keep best
bool verbose;
bool spherical; ///< do we want normalized centroids?
bool int_centroids; ///< round centroids coordinates to integer
bool update_index; ///< update index after each iteration?
bool frozen_centroids; ///< use the centroids provided as input and do not change them during iterations
int min_points_per_centroid; ///< otherwise you get a warning
int max_points_per_centroid; ///< to limit size of dataset
int seed; ///< seed for the random number generator
/// sets reasonable defaults
ClusteringParameters ();
};
/** clustering based on assignment - centroid update iterations
*
* The clustering is based on an Index object that assigns training
* points to the centroids. Therefore, at each iteration the centroids
* are added to the index.
*
 * On output, the centroids table is set to the latest version
 * of the centroids and they are also added to the index. If the
 * centroids table is not empty on input, it is also used for
* initialization.
*
* To do several clusterings, just call train() several times on
* different training sets, clearing the centroid table in between.
*/
struct Clustering: ClusteringParameters {
typedef Index::idx_t idx_t;
size_t d; ///< dimension of the vectors
size_t k; ///< nb of centroids
/// centroids (k * d)
std::vector<float> centroids;
/// objective values (sum of distances reported by index) over
/// iterations
std::vector<float> obj;
/// the only mandatory parameters are k and d
Clustering (int d, int k);
Clustering (int d, int k, const ClusteringParameters &cp);
/// Index is used during the assignment stage
virtual void train (idx_t n, const float * x, faiss::Index & index);
/// Post-process the centroids after each centroid update.
/// includes optional L2 normalization and nearest integer rounding
void post_process_centroids ();
virtual ~Clustering() {}
};
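/* Illustrative k-means run (a sketch; x is assumed to hold n caller-owned
 * training vectors of dimension d, and IndexFlatL2 comes from IndexFlat.h):
 *
 *   Clustering clus (d, k);
 *   clus.niter = 20;
 *   IndexFlatL2 assigner (d);      // used for the assignment stage
 *   clus.train (n, x, assigner);
 *   // clus.centroids now holds k * d floats; clus.obj holds the
 *   // per-iteration objective values
 */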
/** simplified interface
*
* @param d dimension of the data
* @param n nb of training vectors
* @param k nb of output centroids
* @param x training set (size n * d)
* @param centroids output centroids (size k * d)
* @return final quantization error
*/
float kmeans_clustering (size_t d, size_t n, size_t k,
const float *x,
float *centroids);
}
#endif

View File

@ -0,0 +1,29 @@
FROM nvidia/cuda:8.0-devel-centos7
# Install MKL
RUN yum-config-manager --add-repo https://yum.repos.intel.com/mkl/setup/intel-mkl.repo
RUN rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB
RUN yum install -y intel-mkl-2019.3-062
ENV LD_LIBRARY_PATH /opt/intel/mkl/lib/intel64:$LD_LIBRARY_PATH
ENV LIBRARY_PATH /opt/intel/mkl/lib/intel64:$LIBRARY_PATH
ENV LD_PRELOAD /usr/lib64/libgomp.so.1:/opt/intel/mkl/lib/intel64/libmkl_def.so:\
/opt/intel/mkl/lib/intel64/libmkl_avx2.so:/opt/intel/mkl/lib/intel64/libmkl_core.so:\
/opt/intel/mkl/lib/intel64/libmkl_intel_lp64.so:/opt/intel/mkl/lib/intel64/libmkl_gnu_thread.so
# Install necessary build tools
RUN yum install -y gcc-c++ make swig3
# Install necessary headers/libs
RUN yum install -y python-devel numpy
COPY . /opt/faiss
WORKDIR /opt/faiss
# --with-cuda=/usr/local/cuda-8.0
RUN ./configure --prefix=/usr --libdir=/usr/lib64 --without-cuda
RUN make -j $(nproc)
RUN make -C python
RUN make test
RUN make install
RUN make -C demos demo_ivfpq_indexing && ./demos/demo_ivfpq_indexing

View File

@ -0,0 +1,353 @@
[//]: # "**********************************************************"
[//]: # "** INSTALL file for Faiss (Fair AI Similarity Search **"
[//]: # "**********************************************************"
INSTALL file for Faiss (Fair AI Similarity Search)
==================================================
Install via Conda
-----------------
The easiest way to install Faiss is from Anaconda. We regularly push stable releases to the pytorch conda channel.
Currently we support faiss-cpu on both Linux and OSX. We also provide faiss-gpu compiled with CUDA8/CUDA9/CUDA10 on Linux systems.
You can install it with:
```
# CPU version only
conda install faiss-cpu -c pytorch
# GPU version
conda install faiss-gpu cudatoolkit=8.0 -c pytorch # For CUDA8
conda install faiss-gpu cudatoolkit=9.0 -c pytorch # For CUDA9
conda install faiss-gpu cudatoolkit=10.0 -c pytorch # For CUDA10
```
Compile from source
-------------------
The Faiss compilation works in 2 steps:
1. compile the C++ core and examples
2. compile the Python interface
Step 2 depends on step 1.
It is also possible to build a pure C interface. This optional process is
described separately (please see the [C interface installation file](c_api/INSTALL.md))
General compilation instructions
================================
TL;DR: `./configure && make (&& make install)` for the C++ library, and then `cd python; make && make install` for the python interface.
1. `./configure`
This generates the system-dependent configuration for the `Makefile`, stored in
a file called `makefile.inc`.
A few useful options:
- `./configure --without-cuda` in order to build the CPU part only.
- `./configure --with-cuda=/path/to/cuda-10.1` in order to hint to the path of
the cudatoolkit.
- `./configure --with-cuda-arch="-gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_72,code=sm_72"` for specifying which GPU architectures to build against.
- `./configure --with-python=/path/to/python3.7` in order to build a python
interface for a different python than the default one.
- `LDFLAGS=-L/path_to_mkl/lib/ ./configure` so that configure detects the MKL BLAS implementation. Note that this may require setting LD_LIBRARY_PATH at runtime.
2. `make`
This builds the C++ library (the whole library if a suitable cuda toolkit was
found, or the CPU part only otherwise).
3. `make install` (optional)
This installs the headers and libraries.
4. `make -C python` (or `make py`)
This builds the python interface.
5. `make -C python install`
This installs the python library.
Faiss has been tested only on x86_64 machines on Linux and Mac OS.
Faiss requires a C++ compiler that understands:
- the Intel intrinsics for SSE instructions,
- the GCC intrinsic for the popcount instruction,
- basic OpenMP.
There are a few examples for makefile.inc in the example_makefiles/
subdirectory. There are also indications for specific configurations in the
troubleshooting section of the wiki.
https://github.com/facebookresearch/faiss/wiki/Troubleshooting
Faiss comes as a .a archive, that can be linked with executables or
dynamic libraries (useful for the Python wrapper).
BLAS/Lapack
-----------
The only variables that need to be configured for the C++ Faiss are
the BLAS/Lapack flags (a linear algebra software package). It needs a
flag telling whether BLAS/Lapack uses 32 or 64 bit integers and the
linking flags. Faiss uses the Fortran 77 interface of BLAS/Lapack and
thus does not need an include path.
There are several BLAS implementations, depending on the OS and
machine. To have reasonable performance, the BLAS library should be
multithreaded. See the example makefile.inc's for hints and examples
on how to set the flags, or simply run the configure script:
`./configure`
To check that the link flags are correct, and verify whether the
implementation uses 32 or 64 bit integers, you can
`make misc/test_blas`
and run
`./misc/test_blas`
Testing Faiss
-------------
A basic usage example is in
`demos/demo_ivfpq_indexing`
which you can build by calling
`make -C demos demo_ivfpq_indexing`
It makes a small index, stores it and performs some searches. A normal
runtime is around 20s. With a fast machine and Intel MKL's BLAS it
runs in 2.5s.
To run the whole test suite:
`make test` (for the CPU part)
`make test_gpu` (for the GPU part)
A real-life benchmark
---------------------
A bit longer example runs and evaluates Faiss on the SIFT1M
dataset. To run it, please download the ANN_SIFT1M dataset from
http://corpus-texmex.irisa.fr/
and unzip it to the subdirectory `sift1M` at the root of the source
directory for this repository.
Then compile and run the following (after ensuring you have installed faiss):
```
make demos
./demos/demo_sift1M
```
This is a demonstration of the high-level auto-tuning API. You can try
setting a different index_key to find the indexing structure that
gives the best performance.
The Python interface
======================================
The Python interface is compiled with
`make -C python` (or `make py`)
How it works
------------
The Python interface is provided via SWIG (Simplified Wrapper and
Interface Generator) and an additional level of manual wrappers (in python/faiss.py).
SWIG generates two wrapper files: a Python file (`python/swigfaiss.py`) and a
C++ file that must be compiled to a dynamic library (`python/_swigfaiss.so`).
Testing the Python wrapper
--------------------------
Often, a successful compile does not mean that the library works,
because missing symbols are detected only at runtime. You should be
able to load the Faiss dynamic library:
`python -c "import faiss"`
In case of failure, it reports the first missing symbol. To see all
missing symbols (on Linux), use
`ldd -r _swigfaiss.so`
Sometimes, problems (e.g. with BLAS libraries) appear only when actually
calling a BLAS function. A simple way to check this is:
```python
python -c "import faiss, numpy
faiss.Kmeans(10, 20).train(numpy.random.rand(1000, 10).astype('float32'))"
```
Real-life test
--------------
The following script extends the demo_sift1M test to several types of
indexes. This must be run from the root of the source directory for this
repository:
```
mkdir tmp # graphs of the output will be written here
PYTHONPATH=. python demos/demo_auto_tune.py
```
It will cycle through a few types of indexes and find optimal
operating points. You can play around with the types of indexes.
Compiling the GPU implementation
================================
The GPU version is a superset of the CPU version. In addition it
requires the CUDA compiler and related libraries (cuBLAS).
The nvcc-specific flags to pass to the compiler, based on your desired
compute capability, can be customized by providing `--with-cuda-arch` to
`./configure`. Only compute capability 3.5+ is supported. For example, we enable
by default:
```
-gencode=arch=compute_35,code=compute_35
-gencode=arch=compute_52,code=compute_52
-gencode=arch=compute_60,code=compute_60
-gencode=arch=compute_61,code=compute_61
-gencode=arch=compute_70,code=compute_70
-gencode=arch=compute_75,code=compute_75
```
However, look at https://developer.nvidia.com/cuda-gpus to determine
what compute capability you need to use, and replace our gencode
specifications with the one(s) you need.
Most other flags are related to the C++11 compiler used by nvcc to
compile the actual C++ code. They are normally passed through by
nvcc, except for flags it does not recognize, which should be
escaped by prefixing them with -Xcompiler. Likewise, link flags
prefixed with -Wl, should be passed with -Xlinker.
You may want to add `-j 10` to use 10 threads during compilation.
Testing the GPU implementation
------------------------------
Compile the example with
`make -C gpu/test demo_ivfpq_indexing_gpu`
This produces the GPU equivalent of the CPU demo_ivfpq_indexing
demo. It also shows how to transfer indexes to/from the GPU.
Python example with GPU support
-------------------------------
The auto-tuning example above also runs on the GPU. Edit
`demos/demo_auto_tune.py` at line 100 with the values
```python
keys_to_test = keys_gpu
use_gpu = True
```
and you can run
```
export PYTHONPATH=.
python demos/demo_auto_tune.py
```
to test the GPU code.
Docker instructions
===================
To use the GPU capabilities of Faiss, you'll need to run "nvidia-docker"
rather than "docker". Make sure that docker
(https://docs.docker.com/engine/installation/) and nvidia-docker
(https://github.com/NVIDIA/nvidia-docker) are installed on your system.
To build the "faiss" image, run
`nvidia-docker build -t faiss .`
or if you don't want/need to clone the sources, just run
`nvidia-docker build -t faiss github.com/facebookresearch/faiss`
If you want to run the tests during the docker build, uncomment the
last 3 "RUN" steps in the Dockerfile. But you might want to run the
tests by yourself, so just run
`nvidia-docker run -ti --name faiss faiss bash`
and run what you want. If you need a dataset (like sift1M), download it
inside the created container, or better, mount a directory from the host:

`nvidia-docker run -ti --name faiss -v /my/host/data/folder/ann_dataset/sift/:/opt/faiss/sift1M faiss bash`
How to use Faiss in your own projects
=====================================
C++
---
The makefile generates a static and a dynamic library
```
libfaiss.a
libfaiss.so (or libfaiss.dylib)
```
The executable should be linked to one of these. If you use
the static version (`.a`), add the LDFLAGS used in the Makefile.
For binary-only distributions, the headers should be under
a `faiss/` directory, so that they can be included as
```c++
#include <faiss/IndexIVFPQ.h>
#include <faiss/gpu/GpuIndexFlat.h>
```
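As a minimal sanity check that compilation and linking work, the following
sketch builds a flat L2 index, adds random vectors and queries them (an
illustrative example only; compile with something like
`g++ -std=c++11 demo.cpp -lfaiss`, depending on how the library was installed):

```c++
#include <faiss/IndexFlat.h>

#include <cstdio>
#include <cstdlib>
#include <vector>

int main() {
    int d = 64, nb = 1000, nq = 5, k = 4;
    std::vector<float> xb(nb * d), xq(nq * d);
    for (auto &v : xb) v = drand48();   // random database vectors
    for (auto &v : xq) v = drand48();   // random queries

    faiss::IndexFlatL2 index(d);        // exact L2 search, no training needed
    index.add(nb, xb.data());           // vectors get labels 0 .. nb - 1

    std::vector<faiss::Index::idx_t> labels(nq * k);
    std::vector<float> distances(nq * k);
    index.search(nq, xq.data(), k, distances.data(), labels.data());

    printf("nearest neighbor of query 0: %ld\n", (long)labels[0]);
    return 0;
}
```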
Python
------
To import Faiss in your own Python project, you need the files
```
__init__.py
swigfaiss.py
_swigfaiss.so
```
to be present in a `faiss/` directory visible in the PYTHONPATH or in the
current directory.
Then Faiss can be used in python with
```python
import faiss
```

View File

@ -0,0 +1,344 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IVFlib.h>
#include <memory>
#include <faiss/IndexPreTransform.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/utils.h>
namespace faiss { namespace ivflib {
void check_compatible_for_merge (const Index * index0,
const Index * index1)
{
const faiss::IndexPreTransform *pt0 =
dynamic_cast<const faiss::IndexPreTransform *>(index0);
if (pt0) {
const faiss::IndexPreTransform *pt1 =
dynamic_cast<const faiss::IndexPreTransform *>(index1);
FAISS_THROW_IF_NOT_MSG (pt1, "both indexes should be pretransforms");
FAISS_THROW_IF_NOT (pt0->chain.size() == pt1->chain.size());
for (int i = 0; i < pt0->chain.size(); i++) {
FAISS_THROW_IF_NOT (typeid(pt0->chain[i]) == typeid(pt1->chain[i]));
}
index0 = pt0->index;
index1 = pt1->index;
}
FAISS_THROW_IF_NOT (typeid(index0) == typeid(index1));
FAISS_THROW_IF_NOT (index0->d == index1->d &&
index0->metric_type == index1->metric_type);
const faiss::IndexIVF *ivf0 = dynamic_cast<const faiss::IndexIVF *>(index0);
if (ivf0) {
const faiss::IndexIVF *ivf1 =
dynamic_cast<const faiss::IndexIVF *>(index1);
FAISS_THROW_IF_NOT (ivf1);
ivf0->check_compatible_for_merge (*ivf1);
}
// TODO: check as thoroughly for other index types
}
const IndexIVF * extract_index_ivf (const Index * index)
{
if (auto *pt =
dynamic_cast<const IndexPreTransform *>(index)) {
index = pt->index;
}
auto *ivf = dynamic_cast<const IndexIVF *>(index);
FAISS_THROW_IF_NOT (ivf);
return ivf;
}
IndexIVF * extract_index_ivf (Index * index) {
return const_cast<IndexIVF*> (extract_index_ivf ((const Index*)(index)));
}
void merge_into(faiss::Index *index0, faiss::Index *index1, bool shift_ids) {
check_compatible_for_merge (index0, index1);
IndexIVF * ivf0 = extract_index_ivf (index0);
IndexIVF * ivf1 = extract_index_ivf (index1);
ivf0->merge_from (*ivf1, shift_ids ? ivf0->ntotal : 0);
// useful for IndexPreTransform
index0->ntotal = ivf0->ntotal;
index1->ntotal = ivf1->ntotal;
}
void search_centroid(faiss::Index *index,
const float* x, int n,
idx_t* centroid_ids)
{
std::unique_ptr<float[]> del;
if (auto index_pre = dynamic_cast<faiss::IndexPreTransform*>(index)) {
x = index_pre->apply_chain(n, x);
del.reset((float*)x);
index = index_pre->index;
}
faiss::IndexIVF* index_ivf = dynamic_cast<faiss::IndexIVF*>(index);
assert(index_ivf);
index_ivf->quantizer->assign(n, x, centroid_ids);
}
void search_and_return_centroids(faiss::Index *index,
size_t n,
const float* xin,
long k,
float *distances,
idx_t* labels,
idx_t* query_centroid_ids,
idx_t* result_centroid_ids)
{
const float *x = xin;
std::unique_ptr<float []> del;
if (auto index_pre = dynamic_cast<faiss::IndexPreTransform*>(index)) {
x = index_pre->apply_chain(n, x);
del.reset((float*)x);
index = index_pre->index;
}
faiss::IndexIVF* index_ivf = dynamic_cast<faiss::IndexIVF*>(index);
assert(index_ivf);
size_t nprobe = index_ivf->nprobe;
std::vector<idx_t> cent_nos (n * nprobe);
std::vector<float> cent_dis (n * nprobe);
index_ivf->quantizer->search(
n, x, nprobe, cent_dis.data(), cent_nos.data());
if (query_centroid_ids) {
for (size_t i = 0; i < n; i++)
query_centroid_ids[i] = cent_nos[i * nprobe];
}
index_ivf->search_preassigned (n, x, k,
cent_nos.data(), cent_dis.data(),
distances, labels, true);
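// with store_pairs=true (the last argument), each returned label packs
// the inverted-list number in its high 32 bits and the offset within
// that list in its low 32 bits; the loop below decodes it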
for (size_t i = 0; i < n * k; i++) {
idx_t label = labels[i];
if (label < 0) {
if (result_centroid_ids)
result_centroid_ids[i] = -1;
} else {
long list_no = label >> 32;
long list_index = label & 0xffffffff;
if (result_centroid_ids)
result_centroid_ids[i] = list_no;
labels[i] = index_ivf->invlists->get_single_id(list_no, list_index);
}
}
}
SlidingIndexWindow::SlidingIndexWindow (Index *index): index (index) {
n_slice = 0;
IndexIVF* index_ivf = const_cast<IndexIVF*>(extract_index_ivf (index));
ils = dynamic_cast<ArrayInvertedLists *> (index_ivf->invlists);
FAISS_THROW_IF_NOT_MSG (ils,
"only supports indexes with ArrayInvertedLists");
nlist = ils->nlist;
sizes.resize(nlist);
}
template<class T>
static void shift_and_add (std::vector<T> & dst,
size_t remove,
const std::vector<T> & src)
{
if (remove > 0)
memmove (dst.data(), dst.data() + remove,
(dst.size() - remove) * sizeof (T));
size_t insert_point = dst.size() - remove;
dst.resize (insert_point + src.size());
memcpy (dst.data() + insert_point, src.data (), src.size() * sizeof(T));
}
template<class T>
static void remove_from_begin (std::vector<T> & v,
size_t remove)
{
if (remove > 0)
v.erase (v.begin(), v.begin() + remove);
}
void SlidingIndexWindow::step(const Index *sub_index, bool remove_oldest) {
FAISS_THROW_IF_NOT_MSG (!remove_oldest || n_slice > 0,
"cannot remove slice: there is none");
const ArrayInvertedLists *ils2 = nullptr;
if(sub_index) {
check_compatible_for_merge (index, sub_index);
ils2 = dynamic_cast<const ArrayInvertedLists*>(
extract_index_ivf (sub_index)->invlists);
FAISS_THROW_IF_NOT_MSG (ils2, "supports only ArrayInvertedLists");
}
IndexIVF *index_ivf = extract_index_ivf (index);
if (remove_oldest && ils2) {
for (int i = 0; i < nlist; i++) {
std::vector<size_t> & sizesi = sizes[i];
size_t amount_to_remove = sizesi[0];
index_ivf->ntotal += ils2->ids[i].size() - amount_to_remove;
shift_and_add (ils->ids[i], amount_to_remove, ils2->ids[i]);
shift_and_add (ils->codes[i], amount_to_remove * ils->code_size,
ils2->codes[i]);
for (int j = 0; j + 1 < n_slice; j++) {
sizesi[j] = sizesi[j + 1] - amount_to_remove;
}
sizesi[n_slice - 1] = ils->ids[i].size();
}
} else if (ils2) {
for (int i = 0; i < nlist; i++) {
index_ivf->ntotal += ils2->ids[i].size();
shift_and_add (ils->ids[i], 0, ils2->ids[i]);
shift_and_add (ils->codes[i], 0, ils2->codes[i]);
sizes[i].push_back(ils->ids[i].size());
}
n_slice++;
} else if (remove_oldest) {
for (int i = 0; i < nlist; i++) {
size_t amount_to_remove = sizes[i][0];
index_ivf->ntotal -= amount_to_remove;
remove_from_begin (ils->ids[i], amount_to_remove);
remove_from_begin (ils->codes[i],
amount_to_remove * ils->code_size);
for (int j = 0; j + 1 < n_slice; j++) {
sizes[i][j] = sizes[i][j + 1] - amount_to_remove;
}
sizes[i].pop_back ();
}
n_slice--;
} else {
FAISS_THROW_MSG ("nothing to do???");
}
index->ntotal = index_ivf->ntotal;
}
// Get a subset of inverted lists [i0, i1). Works on IndexIVF's and
// IndexIVF's embedded in a IndexPreTransform
ArrayInvertedLists *
get_invlist_range (const Index *index, long i0, long i1)
{
const IndexIVF *ivf = extract_index_ivf (index);
FAISS_THROW_IF_NOT (0 <= i0 && i0 <= i1 && i1 <= ivf->nlist);
const InvertedLists *src = ivf->invlists;
ArrayInvertedLists * il = new ArrayInvertedLists(i1 - i0, src->code_size);
for (long i = i0; i < i1; i++) {
il->add_entries(i - i0, src->list_size(i),
InvertedLists::ScopedIds (src, i).get(),
InvertedLists::ScopedCodes (src, i).get());
}
return il;
}
void set_invlist_range (Index *index, long i0, long i1,
ArrayInvertedLists * src)
{
IndexIVF *ivf = extract_index_ivf (index);
FAISS_THROW_IF_NOT (0 <= i0 && i0 <= i1 && i1 <= ivf->nlist);
ArrayInvertedLists *dst = dynamic_cast<ArrayInvertedLists *>(ivf->invlists);
FAISS_THROW_IF_NOT_MSG (dst, "only ArrayInvertedLists supported");
FAISS_THROW_IF_NOT (src->nlist == i1 - i0 &&
dst->code_size == src->code_size);
size_t ntotal = index->ntotal;
for (long i = i0 ; i < i1; i++) {
ntotal -= dst->list_size (i);
ntotal += src->list_size (i - i0);
std::swap (src->codes[i - i0], dst->codes[i]);
std::swap (src->ids[i - i0], dst->ids[i]);
}
ivf->ntotal = index->ntotal = ntotal;
}
void search_with_parameters (const Index *index,
idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
IVFSearchParameters *params,
size_t *nb_dis_ptr)
{
FAISS_THROW_IF_NOT (params);
const float *prev_x = x;
ScopeDeleter<float> del;
if (auto ip = dynamic_cast<const IndexPreTransform *> (index)) {
x = ip->apply_chain (n, x);
if (x != prev_x) {
del.set(x);
}
index = ip->index;
}
std::vector<idx_t> Iq(params->nprobe * n);
std::vector<float> Dq(params->nprobe * n);
const IndexIVF *index_ivf = dynamic_cast<const IndexIVF *>(index);
FAISS_THROW_IF_NOT (index_ivf);
double t0 = getmillisecs();
index_ivf->quantizer->search(n, x, params->nprobe,
Dq.data(), Iq.data());
double t1 = getmillisecs();
indexIVF_stats.quantization_time += t1 - t0;
if (nb_dis_ptr) {
size_t nb_dis = 0;
const InvertedLists *il = index_ivf->invlists;
for (idx_t i = 0; i < n * params->nprobe; i++) {
if (Iq[i] >= 0) {
nb_dis += il->list_size(Iq[i]);
}
}
*nb_dis_ptr = nb_dis;
}
index_ivf->search_preassigned(n, x, k, Iq.data(), Dq.data(),
distances, labels,
false, params);
double t2 = getmillisecs();
indexIVF_stats.search_time += t2 - t1;
}
} } // namespace faiss::ivflib

core/src/index/thirdparty/faiss/IVFlib.h vendored Normal file
View File

@ -0,0 +1,132 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_IVFLIB_H
#define FAISS_IVFLIB_H
/** Since IVF (inverted file) indexes are so useful for
* large-scale use cases, we group a few functions related to them in
* this small library. Most functions work both on IndexIVFs and
* IndexIVFs embedded within an IndexPreTransform.
*/
#include <vector>
#include <faiss/IndexIVF.h>
namespace faiss { namespace ivflib {
/** check if two indexes have the same parameters and are trained in
* the same way, otherwise throw. */
void check_compatible_for_merge (const Index * index1,
const Index * index2);
/** get an IndexIVF from an index. The index may be an IndexIVF or
* some wrapper class that encloses an IndexIVF
*
* throws an exception if this is not the case.
*/
const IndexIVF * extract_index_ivf (const Index * index);
IndexIVF * extract_index_ivf (Index * index);
/** Merge index1 into index0. Works on IndexIVF's and IndexIVF's
 * embedded in an IndexPreTransform. On output, index1 is empty.
*
* @param shift_ids: translate the ids from index1 to index0->prev_ntotal
*/
void merge_into(Index *index0, Index *index1, bool shift_ids);
typedef Index::idx_t idx_t;
/* Returns the cluster the embeddings belong to.
*
* @param index Index, which should be an IVF index
* (otherwise there are no clusters)
 * @param x object descriptors for which the centroids should be found,
 * size n * d
 * @param centroid_ids
 * cluster id each object belongs to, size n
*/
void search_centroid(Index *index,
const float* x, int n,
idx_t* centroid_ids);
/* Returns the cluster the embeddings belong to.
*
* @param index Index, which should be an IVF index
* (otherwise there are no clusters)
* @param query_centroid_ids
* centroid ids corresponding to the query vectors (size n)
* @param result_centroid_ids
* centroid ids corresponding to the results (size n * k)
* other arguments are the same as the standard search function
*/
void search_and_return_centroids(Index *index,
size_t n,
const float* xin,
long k,
float *distances,
idx_t* labels,
idx_t* query_centroid_ids,
idx_t* result_centroid_ids);
/** A set of IndexIVFs concatenated together in a FIFO fashion.
 * At each "step", the oldest index slice is removed and a new index is added.
*/
struct SlidingIndexWindow {
/// common index that contains the sliding window
Index * index;
/// InvertedLists of index
ArrayInvertedLists *ils;
/// number of slices currently in index
int n_slice;
/// same as index->nlist
size_t nlist;
/// cumulative list sizes at each slice
std::vector<std::vector<size_t> > sizes;
/// index should be initially empty and trained
SlidingIndexWindow (Index *index);
/** Add one index to the current index and remove the oldest one.
*
* @param sub_index slice to swap in (can be NULL)
* @param remove_oldest if true, remove the oldest slices */
void step(const Index *sub_index, bool remove_oldest);
};
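/* Illustrative sliding-window loop (a sketch; make_slice is a hypothetical
 * helper returning a freshly filled index compatible with big_index):
 *
 *   SlidingIndexWindow window (big_index);
 *   for (int t = 0; t < nstep; t++) {
 *       Index *slice = make_slice (t);
 *       window.step (slice, window.n_slice >= max_slices);
 *       delete slice;
 *   }
 */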
/// Get a subset of inverted lists [i0, i1)
ArrayInvertedLists * get_invlist_range (const Index *index,
long i0, long i1);
/// Set a subset of inverted lists
void set_invlist_range (Index *index, long i0, long i1,
ArrayInvertedLists * src);
// search an IndexIVF, possibly embedded in an IndexPreTransform with
// given parameters. Optionally returns the number of distances
// computed
void search_with_parameters (const Index *index,
idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
IVFSearchParameters *params,
size_t *nb_dis = nullptr);
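/* Illustrative call (a sketch; ivf_index is assumed to be a trained and
 * populated IndexIVF, D and I pre-allocated to nq * k entries):
 *
 *   IVFSearchParameters params;
 *   params.nprobe = 16;
 *   params.max_codes = 0;            // 0 = no limit
 *   size_t ndis = 0;
 *   search_with_parameters (ivf_index, nq, xq, k, D, I, &params, &ndis);
 */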
} } // namespace faiss::ivflib
#endif

View File

@ -0,0 +1,171 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/Index.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/distances.h>
#include <cstring>
namespace faiss {
Index::~Index ()
{
}
void Index::train(idx_t /*n*/, const float* /*x*/) {
// does nothing by default
}
void Index::range_search (idx_t , const float *, float,
RangeSearchResult *) const
{
FAISS_THROW_MSG ("range search not implemented");
}
void Index::assign (idx_t n, const float * x, idx_t * labels, idx_t k)
{
float * distances = new float[n * k];
ScopeDeleter<float> del(distances);
search (n, x, k, distances, labels);
}
void Index::add_with_ids(
idx_t /*n*/,
const float* /*x*/,
const idx_t* /*xids*/) {
FAISS_THROW_MSG ("add_with_ids not implemented for this type of index");
}
size_t Index::remove_ids(const IDSelector& /*sel*/) {
FAISS_THROW_MSG ("remove_ids not implemented for this type of index");
return -1;
}
void Index::reconstruct (idx_t, float * ) const {
FAISS_THROW_MSG ("reconstruct not implemented for this type of index");
}
void Index::reconstruct_n (idx_t i0, idx_t ni, float *recons) const {
for (idx_t i = 0; i < ni; i++) {
reconstruct (i0 + i, recons + i * d);
}
}
void Index::search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *recons) const {
search (n, x, k, distances, labels);
for (idx_t i = 0; i < n; ++i) {
for (idx_t j = 0; j < k; ++j) {
idx_t ij = i * k + j;
idx_t key = labels[ij];
float* reconstructed = recons + ij * d;
if (key < 0) {
// Fill with NaNs
memset(reconstructed, -1, sizeof(*reconstructed) * d);
} else {
reconstruct (key, reconstructed);
}
}
}
}
void Index::compute_residual (const float * x,
float * residual, idx_t key) const {
reconstruct (key, residual);
for (size_t i = 0; i < d; i++) {
residual[i] = x[i] - residual[i];
}
}
void Index::compute_residual_n (idx_t n, const float* xs,
float* residuals,
const idx_t* keys) const {
#pragma omp parallel for
for (idx_t i = 0; i < n; ++i) {
compute_residual(&xs[i * d], &residuals[i * d], keys[i]);
}
}
size_t Index::sa_code_size () const
{
FAISS_THROW_MSG ("standalone codec not implemented for this type of index");
}
void Index::sa_encode (idx_t, const float *,
uint8_t *) const
{
FAISS_THROW_MSG ("standalone codec not implemented for this type of index");
}
void Index::sa_decode (idx_t, const uint8_t *,
float *) const
{
FAISS_THROW_MSG ("standalone codec not implemented for this type of index");
}
namespace {
// storage that explicitly reconstructs vectors before computing distances
struct GenericDistanceComputer : DistanceComputer {
size_t d;
const Index& storage;
std::vector<float> buf;
const float *q;
explicit GenericDistanceComputer(const Index& storage)
: storage(storage) {
d = storage.d;
buf.resize(d * 2);
}
float operator () (idx_t i) override {
storage.reconstruct(i, buf.data());
return fvec_L2sqr(q, buf.data(), d);
}
float symmetric_dis(idx_t i, idx_t j) override {
storage.reconstruct(i, buf.data());
storage.reconstruct(j, buf.data() + d);
return fvec_L2sqr(buf.data() + d, buf.data(), d);
}
void set_query(const float *x) override {
q = x;
}
};
} // namespace
DistanceComputer * Index::get_distance_computer() const {
if (metric_type == METRIC_L2) {
return new GenericDistanceComputer(*this);
} else {
FAISS_THROW_MSG ("get_distance_computer() not implemented");
}
}
}

core/src/index/thirdparty/faiss/Index.h vendored Normal file
View File

@ -0,0 +1,261 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_H
#define FAISS_INDEX_H
#include <cstdio>
#include <typeinfo>
#include <string>
#include <sstream>
#define FAISS_VERSION_MAJOR 1
#define FAISS_VERSION_MINOR 6
#define FAISS_VERSION_PATCH 0
/**
* @namespace faiss
*
* Throughout the library, vectors are provided as float * pointers.
* Most algorithms can be optimized when several vectors are processed
* (added/searched) together in a batch. In this case, they are passed
* in as a matrix. When n vectors of size d are provided as float * x,
* component j of vector i is
*
* x[ i * d + j ]
*
* where 0 <= i < n and 0 <= j < d. In other words, matrices are
* always compact. When specifying the size of the matrix, we call it
* an n*d matrix, which implies a row-major storage.
*/
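/* For example, with n = 2 vectors of dimension d = 3, x is the 6-float
 * array [x0_0, x0_1, x0_2, x1_0, x1_1, x1_2] (row-major, no padding). */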
namespace faiss {
/// Some algorithms support both an inner product version and an L2 search version.
enum MetricType {
METRIC_INNER_PRODUCT = 0, ///< maximum inner product search
METRIC_L2 = 1, ///< squared L2 search
METRIC_L1, ///< L1 (aka cityblock)
METRIC_Linf, ///< infinity distance
METRIC_Lp, ///< L_p distance, p is given by metric_arg
/// some additional metrics defined in scipy.spatial.distance
METRIC_Canberra = 20,
METRIC_BrayCurtis,
METRIC_JensenShannon,
};
/// Forward declarations see AuxIndexStructures.h
struct IDSelector;
struct RangeSearchResult;
struct DistanceComputer;
/** Abstract structure for an index
*
 * Supports adding vectors and searching them.
*
* Currently only asymmetric queries are supported:
* database-to-database queries are not implemented.
*/
struct Index {
using idx_t = int64_t; ///< all indices are this type
using component_t = float;
using distance_t = float;
int d; ///< vector dimension
idx_t ntotal; ///< total nb of indexed vectors
bool verbose; ///< verbosity level
/// set if the Index does not require training, or if training is
/// done already
bool is_trained;
/// type of metric this index uses for search
MetricType metric_type;
float metric_arg; ///< argument of the metric type
explicit Index (idx_t d = 0, MetricType metric = METRIC_L2):
d(d),
ntotal(0),
verbose(false),
is_trained(true),
metric_type (metric),
metric_arg(0) {}
virtual ~Index ();
/** Perform training on a representative set of vectors
*
* @param n nb of training vectors
 * @param x training vectors, size n * d
*/
virtual void train(idx_t n, const float* x);
/** Add n vectors of dimension d to the index.
*
* Vectors are implicitly assigned labels ntotal .. ntotal + n - 1
 * This function slices the input vectors into chunks smaller than
* blocksize_add and calls add_core.
* @param x input matrix, size n * d
*/
virtual void add (idx_t n, const float *x) = 0;
/** Same as add, but stores xids instead of sequential ids.
*
* The default implementation fails with an assertion, as it is
* not supported by all indexes.
*
* @param xids if non-null, ids to store for the vectors (size n)
*/
virtual void add_with_ids (idx_t n, const float * x, const idx_t *xids);
/** query n vectors of dimension d to the index.
*
* return at most k vectors. If there are not enough results for a
* query, the result array is padded with -1s.
*
* @param x input vectors to search, size n * d
* @param labels output labels of the NNs, size n*k
* @param distances output pairwise distances, size n*k
*/
virtual void search (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels) const = 0;
/** query n vectors of dimension d to the index.
*
* return all vectors with distance < radius. Note that many
* indexes do not implement the range_search (only the k-NN search
* is mandatory).
*
* @param x input vectors to search, size n * d
* @param radius search radius
* @param result result table
*/
virtual void range_search (idx_t n, const float *x, float radius,
RangeSearchResult *result) const;
/** return the indexes of the k vectors closest to the query x.
*
 * This function is identical to search but only returns labels of neighbors.
* @param x input vectors to search, size n * d
* @param labels output labels of the NNs, size n*k
*/
void assign (idx_t n, const float * x, idx_t * labels, idx_t k = 1);
/// removes all elements from the database.
virtual void reset() = 0;
/** removes IDs from the index. Not supported by all
* indexes. Returns the number of elements removed.
*/
virtual size_t remove_ids (const IDSelector & sel);
/** Reconstruct a stored vector (or an approximation if lossy coding)
*
* this function may not be defined for some indexes
* @param key id of the vector to reconstruct
 * @param recons reconstructed vector (size d)
*/
virtual void reconstruct (idx_t key, float * recons) const;
/** Reconstruct vectors i0 to i0 + ni - 1
*
* this function may not be defined for some indexes
 * @param recons reconstructed vector (size ni * d)
*/
virtual void reconstruct_n (idx_t i0, idx_t ni, float *recons) const;
/** Similar to search, but also reconstructs the stored vectors (or an
* approximation in the case of lossy coding) for the search results.
*
* If there are not enough results for a query, the resulting arrays
 * are padded with -1s.
*
* @param recons reconstructed vectors size (n, k, d)
**/
virtual void search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *recons) const;
/** Computes a residual vector after indexing encoding.
*
* The residual vector is the difference between a vector and the
* reconstruction that can be decoded from its representation in
* the index. The residual can be used for multiple-stage indexing
* methods, like IndexIVF's methods.
*
* @param x input vector, size d
* @param residual output residual vector, size d
* @param key encoded index, as returned by search and assign
*/
virtual void compute_residual (const float * x,
float * residual, idx_t key) const;
/** Computes a residual vector after indexing encoding (batch form).
* Equivalent to calling compute_residual for each vector.
*
* The residual vector is the difference between a vector and the
* reconstruction that can be decoded from its representation in
* the index. The residual can be used for multiple-stage indexing
* methods, like IndexIVF's methods.
*
* @param n number of vectors
* @param xs input vectors, size (n x d)
* @param residuals output residual vectors, size (n x d)
* @param keys encoded index, as returned by search and assign
*/
virtual void compute_residual_n (idx_t n, const float* xs,
float* residuals,
const idx_t* keys) const;
/** Get a DistanceComputer (defined in AuxIndexStructures) object
* for this kind of index.
*
* DistanceComputer is implemented for indexes that support random
* access of their vectors.
*/
virtual DistanceComputer * get_distance_computer() const;
/* The standalone codec interface */
/** size of the produced codes in bytes */
virtual size_t sa_code_size () const;
/** encode a set of vectors
*
* @param n number of vectors
* @param x input vectors, size n * d
* @param bytes output encoded vectors, size n * sa_code_size()
*/
virtual void sa_encode (idx_t n, const float *x,
uint8_t *bytes) const;
/** decode a set of vectors
*
* @param n number of vectors
* @param bytes input encoded vectors, size n * sa_code_size()
* @param x output vectors, size n * d
*/
virtual void sa_decode (idx_t n, const uint8_t *bytes,
float *x) const;
};
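/* Illustrative search call (a sketch; xb and xq are caller-owned row-major
 * arrays laid out as described at the top of this file):
 *
 *   index->add (nb, xb);                       // labels 0 .. nb - 1
 *   std::vector<Index::idx_t> I (nq * k);
 *   std::vector<float> D (nq * k);
 *   index->search (nq, xq, k, D.data(), I.data());
 *   // entries of I are -1 wherever fewer than k results exist
 */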
}
#endif

View File

@ -0,0 +1,437 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/Index2Layer.h>
#include <cmath>
#include <cstdio>
#include <cassert>
#include <stdint.h>
#ifdef __SSE__
#include <immintrin.h>
#endif
#include <algorithm>
#include <faiss/IndexIVFPQ.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/utils.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/IndexFlat.h>
#include <faiss/utils/distances.h>
/*
#include <faiss/utils/Heap.h>
#include <faiss/Clustering.h>
#include <faiss/utils/hamming.h>
*/
namespace faiss {
using idx_t = Index::idx_t;
/*************************************
* Index2Layer implementation
*************************************/
Index2Layer::Index2Layer (Index * quantizer, size_t nlist,
int M, int nbit,
MetricType metric):
Index (quantizer->d, metric),
q1 (quantizer, nlist),
pq (quantizer->d, M, nbit)
{
is_trained = false;
for (int nbyte = 0; nbyte < 7; nbyte++) {
if ((1L << (8 * nbyte)) >= nlist) {
code_size_1 = nbyte;
break;
}
}
code_size_2 = pq.code_size;
code_size = code_size_1 + code_size_2;
}
Index2Layer::Index2Layer ()
{
code_size = code_size_1 = code_size_2 = 0;
}
Index2Layer::~Index2Layer ()
{}
void Index2Layer::train(idx_t n, const float* x)
{
if (verbose) {
printf ("training level-1 quantizer %ld vectors in %dD\n",
n, d);
}
q1.train_q1 (n, x, verbose, metric_type);
if (verbose) {
printf("computing residuals\n");
}
const float * x_in = x;
x = fvecs_maybe_subsample (
d, (size_t*)&n, pq.cp.max_points_per_centroid * pq.ksub,
x, verbose, pq.cp.seed);
ScopeDeleter<float> del_x (x_in == x ? nullptr : x);
std::vector<idx_t> assign(n); // assignment to coarse centroids
q1.quantizer->assign (n, x, assign.data());
std::vector<float> residuals(n * d);
for (idx_t i = 0; i < n; i++) {
q1.quantizer->compute_residual (
x + i * d, residuals.data() + i * d, assign[i]);
}
if (verbose)
printf ("training %zdx%zd product quantizer on %ld vectors in %dD\n",
pq.M, pq.ksub, n, d);
pq.verbose = verbose;
pq.train (n, residuals.data());
is_trained = true;
}
void Index2Layer::add(idx_t n, const float* x)
{
idx_t bs = 32768;
if (n > bs) {
for (idx_t i0 = 0; i0 < n; i0 += bs) {
idx_t i1 = std::min(i0 + bs, n);
if (verbose) {
printf("Index2Layer::add: adding %ld:%ld / %ld\n",
i0, i1, n);
}
add (i1 - i0, x + i0 * d);
}
return;
}
std::vector<idx_t> codes1 (n);
q1.quantizer->assign (n, x, codes1.data());
std::vector<float> residuals(n * d);
for (idx_t i = 0; i < n; i++) {
q1.quantizer->compute_residual (
x + i * d, residuals.data() + i * d, codes1[i]);
}
std::vector<uint8_t> codes2 (n * code_size_2);
pq.compute_codes (residuals.data(), codes2.data(), n);
codes.resize ((ntotal + n) * code_size);
uint8_t *wp = &codes[ntotal * code_size];
{
int i = 0x11223344;
const char *ip = (char*)&i;
FAISS_THROW_IF_NOT_MSG (ip[0] == 0x44,
"works only on a little-endian CPU");
}
// copy to output table
for (idx_t i = 0; i < n; i++) {
memcpy (wp, &codes1[i], code_size_1);
wp += code_size_1;
memcpy (wp, &codes2[i * code_size_2], code_size_2);
wp += code_size_2;
}
ntotal += n;
}
void Index2Layer::search(
idx_t /*n*/,
const float* /*x*/,
idx_t /*k*/,
float* /*distances*/,
idx_t* /*labels*/) const {
FAISS_THROW_MSG("not implemented");
}
void Index2Layer::reconstruct_n(idx_t i0, idx_t ni, float* recons) const
{
float recons1[d];
FAISS_THROW_IF_NOT (i0 >= 0 && i0 + ni <= ntotal);
const uint8_t *rp = &codes[i0 * code_size];
for (idx_t i = 0; i < ni; i++) {
idx_t key = 0;
memcpy (&key, rp, code_size_1);
q1.quantizer->reconstruct (key, recons1);
rp += code_size_1;
pq.decode (rp, recons);
for (idx_t j = 0; j < d; j++) {
recons[j] += recons1[j];
}
rp += code_size_2;
recons += d;
}
}
void Index2Layer::transfer_to_IVFPQ (IndexIVFPQ & other) const
{
FAISS_THROW_IF_NOT (other.nlist == q1.nlist);
FAISS_THROW_IF_NOT (other.code_size == code_size_2);
FAISS_THROW_IF_NOT (other.ntotal == 0);
const uint8_t *rp = codes.data();
for (idx_t i = 0; i < ntotal; i++) {
idx_t key = 0;
memcpy (&key, rp, code_size_1);
rp += code_size_1;
other.invlists->add_entry (key, i, rp);
rp += code_size_2;
}
other.ntotal = ntotal;
}
void Index2Layer::reconstruct(idx_t key, float* recons) const
{
reconstruct_n (key, 1, recons);
}
void Index2Layer::reset()
{
ntotal = 0;
codes.clear ();
}
namespace {
struct Distance2Level : DistanceComputer {
size_t d;
const Index2Layer& storage;
std::vector<float> buf;
const float *q;
const float *pq_l1_tab, *pq_l2_tab;
explicit Distance2Level(const Index2Layer& storage)
: storage(storage) {
d = storage.d;
FAISS_ASSERT(storage.pq.dsub == 4);
pq_l2_tab = storage.pq.centroids.data();
buf.resize(2 * d);
}
float symmetric_dis(idx_t i, idx_t j) override {
storage.reconstruct(i, buf.data());
storage.reconstruct(j, buf.data() + d);
return fvec_L2sqr(buf.data() + d, buf.data(), d);
}
void set_query(const float *x) override {
q = x;
}
};
// well optimized for xNN+PQNN
struct DistanceXPQ4 : Distance2Level {
int M, k;
explicit DistanceXPQ4(const Index2Layer& storage)
: Distance2Level (storage) {
const IndexFlat *quantizer =
dynamic_cast<IndexFlat*> (storage.q1.quantizer);
FAISS_ASSERT(quantizer);
M = storage.pq.M;
pq_l1_tab = quantizer->xb.data();
}
float operator () (idx_t i) override {
#ifdef __SSE__
const uint8_t *code = storage.codes.data() + i * storage.code_size;
long key = 0;
memcpy (&key, code, storage.code_size_1);
code += storage.code_size_1;
// walking pointers
const float *qa = q;
const __m128 *l1_t = (const __m128 *)(pq_l1_tab + d * key);
const __m128 *pq_l2_t = (const __m128 *)pq_l2_tab;
__m128 accu = _mm_setzero_ps();
for (int m = 0; m < M; m++) {
__m128 qi = _mm_loadu_ps(qa);
__m128 recons = l1_t[m] + pq_l2_t[*code++];
__m128 diff = qi - recons;
accu += diff * diff;
pq_l2_t += 256;
qa += 4;
}
accu = _mm_hadd_ps (accu, accu);
accu = _mm_hadd_ps (accu, accu);
return _mm_cvtss_f32 (accu);
#else
FAISS_THROW_MSG("not implemented for non-x64 platforms");
#endif
}
};
// well optimized for 2xNN+PQNN
struct Distance2xXPQ4 : Distance2Level {
int M_2, mi_nbits;
explicit Distance2xXPQ4(const Index2Layer& storage)
: Distance2Level(storage) {
const MultiIndexQuantizer *mi =
dynamic_cast<MultiIndexQuantizer*> (storage.q1.quantizer);
FAISS_ASSERT(mi);
FAISS_ASSERT(storage.pq.M % 2 == 0);
M_2 = storage.pq.M / 2;
mi_nbits = mi->pq.nbits;
pq_l1_tab = mi->pq.centroids.data();
}
float operator () (idx_t i) override {
const uint8_t *code = storage.codes.data() + i * storage.code_size;
long key01 = 0;
memcpy (&key01, code, storage.code_size_1);
code += storage.code_size_1;
#ifdef __SSE__
// walking pointers
const float *qa = q;
const __m128 *pq_l1_t = (const __m128 *)pq_l1_tab;
const __m128 *pq_l2_t = (const __m128 *)pq_l2_tab;
__m128 accu = _mm_setzero_ps();
for (int mi_m = 0; mi_m < 2; mi_m++) {
long l1_idx = key01 & ((1L << mi_nbits) - 1);
const __m128 * pq_l1 = pq_l1_t + M_2 * l1_idx;
for (int m = 0; m < M_2; m++) {
__m128 qi = _mm_loadu_ps(qa);
__m128 recons = pq_l1[m] + pq_l2_t[*code++];
__m128 diff = qi - recons;
accu += diff * diff;
pq_l2_t += 256;
qa += 4;
}
pq_l1_t += M_2 << mi_nbits;
key01 >>= mi_nbits;
}
accu = _mm_hadd_ps (accu, accu);
accu = _mm_hadd_ps (accu, accu);
return _mm_cvtss_f32 (accu);
#else
FAISS_THROW_MSG("not implemented for non-x64 platforms");
#endif
}
};
} // namespace
DistanceComputer * Index2Layer::get_distance_computer() const {
#ifdef __SSE__
const MultiIndexQuantizer *mi =
dynamic_cast<MultiIndexQuantizer*> (q1.quantizer);
if (mi && pq.M % 2 == 0 && pq.dsub == 4) {
return new Distance2xXPQ4(*this);
}
const IndexFlat *fl =
dynamic_cast<IndexFlat*> (q1.quantizer);
if (fl && pq.dsub == 4) {
return new DistanceXPQ4(*this);
}
#endif
return Index::get_distance_computer();
}
/* The standalone codec interface */
size_t Index2Layer::sa_code_size () const
{
return code_size;
}
void Index2Layer::sa_encode (idx_t n, const float *x, uint8_t *bytes) const
{
FAISS_THROW_IF_NOT (is_trained);
std::unique_ptr<int64_t []> list_nos (new int64_t [n]);
q1.quantizer->assign (n, x, list_nos.get());
std::vector<float> residuals(n * d);
for (idx_t i = 0; i < n; i++) {
q1.quantizer->compute_residual (
x + i * d, residuals.data() + i * d, list_nos[i]);
}
pq.compute_codes (residuals.data(), bytes, n);
for (idx_t i = n - 1; i >= 0; i--) {
uint8_t * code = bytes + i * code_size;
memmove (code + code_size_1,
bytes + i * code_size_2, code_size_2);
q1.encode_listno (list_nos[i], code);
}
}
void Index2Layer::sa_decode (idx_t n, const uint8_t *bytes, float *x) const
{
#pragma omp parallel
{
std::vector<float> residual (d);
#pragma omp for
for (size_t i = 0; i < n; i++) {
const uint8_t *code = bytes + i * code_size;
int64_t list_no = q1.decode_listno (code);
float *xi = x + i * d;
pq.decode (code + code_size_1, xi);
q1.quantizer->reconstruct (list_no, residual.data());
for (size_t j = 0; j < d; j++) {
xi[j] += residual[j];
}
}
}
}
} // namespace faiss

View File

@ -0,0 +1,85 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <vector>
#include <faiss/IndexPQ.h>
#include <faiss/IndexIVF.h>
namespace faiss {
struct IndexIVFPQ;
/** Same as an IndexIVFPQ without the inverted lists: codes are stored sequentially
*
 * The class is mainly intended to store encoded vectors that can be
 * accessed randomly; the search function is not implemented.
*/
struct Index2Layer: Index {
/// first level quantizer
Level1Quantizer q1;
/// second level quantizer is always a PQ
ProductQuantizer pq;
/// Codes. Size ntotal * code_size.
std::vector<uint8_t> codes;
/// size of the code for the first level (ceil(log8(q1.nlist)))
size_t code_size_1;
/// size of the code for the second level
size_t code_size_2;
/// code_size_1 + code_size_2
size_t code_size;
Index2Layer (Index * quantizer, size_t nlist,
int M, int nbit = 8,
MetricType metric = METRIC_L2);
Index2Layer ();
~Index2Layer ();
void train(idx_t n, const float* x) override;
void add(idx_t n, const float* x) override;
/// not implemented
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
void reconstruct(idx_t key, float* recons) const override;
void reset() override;
DistanceComputer * get_distance_computer() const override;
/// transfer the flat codes to an IVFPQ index
void transfer_to_IVFPQ(IndexIVFPQ & other) const;
/* The standalone codec interface */
size_t sa_code_size () const override;
void sa_encode (idx_t n, const float *x, uint8_t *bytes) const override;
void sa_decode (idx_t n, const uint8_t *bytes, float *x) const override;
};
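/* Illustrative round-trip (a sketch; coarse is assumed to be a quantizer
 * over the same dimension d, e.g. an IndexFlatL2):
 *
 *   Index2Layer codec (&coarse, nlist, M);
 *   codec.train (n, x);
 *   codec.add (n, x);
 *   std::vector<float> recons (n * codec.d);
 *   codec.reconstruct_n (0, n, recons.data());  // approximate vectors
 */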
} // namespace faiss

View File

@ -0,0 +1,77 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexBinary.h>
#include <faiss/impl/FaissAssert.h>
#include <cstring>
namespace faiss {
IndexBinary::~IndexBinary() {}
void IndexBinary::train(idx_t, const uint8_t *) {
// Does nothing by default.
}
void IndexBinary::range_search(idx_t, const uint8_t *, int,
RangeSearchResult *) const {
FAISS_THROW_MSG("range search not implemented");
}
void IndexBinary::assign(idx_t n, const uint8_t *x, idx_t *labels, idx_t k) {
int *distances = new int[n * k];
ScopeDeleter<int> del(distances);
search(n, x, k, distances, labels);
}
void IndexBinary::add_with_ids(idx_t, const uint8_t *, const idx_t *) {
FAISS_THROW_MSG("add_with_ids not implemented for this type of index");
}
size_t IndexBinary::remove_ids(const IDSelector&) {
FAISS_THROW_MSG("remove_ids not implemented for this type of index");
return 0;
}
void IndexBinary::reconstruct(idx_t, uint8_t *) const {
FAISS_THROW_MSG("reconstruct not implemented for this type of index");
}
void IndexBinary::reconstruct_n(idx_t i0, idx_t ni, uint8_t *recons) const {
for (idx_t i = 0; i < ni; i++) {
reconstruct(i0 + i, recons + i * d);
}
}
void IndexBinary::search_and_reconstruct(idx_t n, const uint8_t *x, idx_t k,
int32_t *distances, idx_t *labels,
uint8_t *recons) const {
search(n, x, k, distances, labels);
for (idx_t i = 0; i < n; ++i) {
for (idx_t j = 0; j < k; ++j) {
idx_t ij = i * k + j;
idx_t key = labels[ij];
uint8_t *reconstructed = recons + ij * d;
if (key < 0) {
// Fill with 0xFF (binary codes have no NaN)
memset(reconstructed, -1, sizeof(*reconstructed) * d);
} else {
reconstruct(key, reconstructed);
}
}
}
}
void IndexBinary::display() const {
printf("Index: %s -> %ld elements\n", typeid (*this).name(), ntotal);
}
} // namespace faiss

@ -0,0 +1,163 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_BINARY_H
#define FAISS_INDEX_BINARY_H
#include <cstdio>
#include <typeinfo>
#include <string>
#include <sstream>
#include <faiss/impl/FaissAssert.h>
#include <faiss/Index.h>
namespace faiss {
/// Forward declarations see AuxIndexStructures.h
struct IDSelector;
struct RangeSearchResult;
/** Abstract structure for a binary index.
*
* Supports adding vectors and searching them.
*
* All queries are symmetric because there is no distinction between codes and
* vectors.
*/
struct IndexBinary {
using idx_t = Index::idx_t; ///< all indices are this type
using component_t = uint8_t;
using distance_t = int32_t;
int d; ///< vector dimension
int code_size; ///< number of bytes per vector ( = d / 8 )
idx_t ntotal; ///< total nb of indexed vectors
bool verbose; ///< verbosity level
/// set if the Index does not require training, or if training is done already
bool is_trained;
/// type of metric this index uses for search
MetricType metric_type;
explicit IndexBinary(idx_t d = 0, MetricType metric = METRIC_L2)
: d(d),
code_size(d / 8),
ntotal(0),
verbose(false),
is_trained(true),
metric_type(metric) {
FAISS_THROW_IF_NOT(d % 8 == 0);
}
virtual ~IndexBinary();
/** Perform training on a representative set of vectors.
*
* @param n nb of training vectors
* @param x training vectors, size n * d / 8
*/
virtual void train(idx_t n, const uint8_t *x);
/** Add n vectors of dimension d to the index.
*
* Vectors are implicitly assigned labels ntotal .. ntotal + n - 1
* @param x input matrix, size n * d / 8
*/
virtual void add(idx_t n, const uint8_t *x) = 0;
/** Same as add, but stores xids instead of sequential ids.
*
* The default implementation fails with an assertion, as it is
* not supported by all indexes.
*
* @param xids if non-null, ids to store for the vectors (size n)
*/
virtual void add_with_ids(idx_t n, const uint8_t *x, const idx_t *xids);
/** Query n vectors of dimension d to the index.
*
* return at most k vectors. If there are not enough results for a
* query, the result array is padded with -1s.
*
* @param x input vectors to search, size n * d / 8
* @param labels output labels of the NNs, size n*k
* @param distances output pairwise distances, size n*k
*/
virtual void search(idx_t n, const uint8_t *x, idx_t k,
int32_t *distances, idx_t *labels) const = 0;
/** Query n vectors of dimension d to the index.
*
* return all vectors with distance < radius. Note that many
* indexes do not implement the range_search (only the k-NN search
* is mandatory).
*
* @param x input vectors to search, size n * d / 8
* @param radius search radius
* @param result result table
*/
virtual void range_search(idx_t n, const uint8_t *x, int radius,
RangeSearchResult *result) const;
/** Return the indexes of the k vectors closest to the query x.
*
* This function is identical to search but only returns labels of neighbors.
* @param x input vectors to search, size n * d / 8
* @param labels output labels of the NNs, size n*k
*/
void assign(idx_t n, const uint8_t *x, idx_t *labels, idx_t k = 1);
/// Removes all elements from the database.
virtual void reset() = 0;
/** Removes IDs from the index. Not supported by all indexes.
*/
virtual size_t remove_ids(const IDSelector& sel);
/** Reconstruct a stored vector.
*
* This function may not be defined for some indexes.
* @param key id of the vector to reconstruct
* @param recons reconstructed vector (size d / 8)
*/
virtual void reconstruct(idx_t key, uint8_t *recons) const;
/** Reconstruct vectors i0 to i0 + ni - 1.
*
* This function may not be defined for some indexes.
* @param recons reconstructed vectors (size ni * d / 8)
*/
virtual void reconstruct_n(idx_t i0, idx_t ni, uint8_t *recons) const;
/** Similar to search, but also reconstructs the stored vectors (or an
* approximation in the case of lossy coding) for the search results.
*
* If there are not enough results for a query, the resulting array
* is padded with -1s.
*
* @param recons reconstructed vectors size (n, k, d)
**/
virtual void search_and_reconstruct(idx_t n, const uint8_t *x, idx_t k,
int32_t *distances, idx_t *labels,
uint8_t *recons) const;
/** Display the actual class name and some more info. */
void display() const;
};
} // namespace faiss
#endif // FAISS_INDEX_BINARY_H
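A brief sketch of driving this abstract API through a concrete subclass; the sizes and fill pattern are illustrative:

#include <cstdint>
#include <vector>
#include <faiss/IndexBinaryFlat.h>

void binary_api_sketch() {
    int d = 64;                                    // bits; must satisfy d % 8 == 0
    faiss::IndexBinaryFlat index(d);               // code_size == d / 8 == 8 bytes
    std::vector<uint8_t> codes(1000 * index.code_size, 0x5a);
    index.add(1000, codes.data());                 // ids 0 .. 999 assigned
    std::vector<faiss::IndexBinary::idx_t> nn(1000);
    index.assign(1000, codes.data(), nn.data());   // 1-NN id for each input
}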

@ -0,0 +1,83 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexBinaryFlat.h>
#include <cstring>
#include <faiss/utils/hamming.h>
#include <faiss/utils/utils.h>
#include <faiss/utils/Heap.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/impl/AuxIndexStructures.h>
namespace faiss {
IndexBinaryFlat::IndexBinaryFlat(idx_t d)
: IndexBinary(d) {}
void IndexBinaryFlat::add(idx_t n, const uint8_t *x) {
xb.insert(xb.end(), x, x + n * code_size);
ntotal += n;
}
void IndexBinaryFlat::reset() {
xb.clear();
ntotal = 0;
}
void IndexBinaryFlat::search(idx_t n, const uint8_t *x, idx_t k,
int32_t *distances, idx_t *labels) const {
const idx_t block_size = query_batch_size;
for (idx_t s = 0; s < n; s += block_size) {
idx_t nn = block_size;
if (s + block_size > n) {
nn = n - s;
}
if (use_heap) {
// We see the distances and labels as heaps.
int_maxheap_array_t res = {
size_t(nn), size_t(k), labels + s * k, distances + s * k
};
hammings_knn_hc(&res, x + s * code_size, xb.data(), ntotal, code_size,
/* ordered = */ true);
} else {
hammings_knn_mc(x + s * code_size, xb.data(), nn, ntotal, k, code_size,
distances + s * k, labels + s * k);
}
}
}
size_t IndexBinaryFlat::remove_ids(const IDSelector& sel) {
idx_t j = 0;
for (idx_t i = 0; i < ntotal; i++) {
if (sel.is_member(i)) {
// should be removed
} else {
if (i > j) {
memmove(&xb[code_size * j], &xb[code_size * i], sizeof(xb[0]) * code_size);
}
j++;
}
}
long nremove = ntotal - j;
if (nremove > 0) {
ntotal = j;
xb.resize(ntotal * code_size);
}
return nremove;
}
void IndexBinaryFlat::reconstruct(idx_t key, uint8_t *recons) const {
memcpy(recons, &(xb[code_size * key]), sizeof(*recons) * code_size);
}
} // namespace faiss

@ -0,0 +1,54 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef INDEX_BINARY_FLAT_H
#define INDEX_BINARY_FLAT_H
#include <vector>
#include <faiss/IndexBinary.h>
namespace faiss {
/** Index that stores the full vectors and performs exhaustive search. */
struct IndexBinaryFlat : IndexBinary {
/// database vectors, size ntotal * d / 8
std::vector<uint8_t> xb;
/** Select between using a heap or counting to select the k smallest values
* when scanning the database.
*/
bool use_heap = true;
size_t query_batch_size = 32;
explicit IndexBinaryFlat(idx_t d);
void add(idx_t n, const uint8_t *x) override;
void reset() override;
void search(idx_t n, const uint8_t *x, idx_t k,
int32_t *distances, idx_t *labels) const override;
void reconstruct(idx_t key, uint8_t *recons) const override;
/** Remove some ids. Note that because of the indexing structure,
* the semantics of this operation are different from the usual ones:
* the new ids are shifted. */
size_t remove_ids(const IDSelector& sel) override;
IndexBinaryFlat() {}
};
} // namespace faiss
#endif // INDEX_BINARY_FLAT_H
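A usage sketch for the flat binary index, including the counting variant toggled by use_heap; the numbers are illustrative:

#include <cstdint>
#include <vector>
#include <faiss/IndexBinaryFlat.h>

void flat_hamming_sketch(size_t nb, size_t nq,
                         const uint8_t* xb, const uint8_t* xq) {
    int d = 256;                              // 32-byte codes
    faiss::IndexBinaryFlat index(d);
    index.add(nb, xb);
    index.use_heap = false;                   // use the counting path instead
    int k = 5;
    std::vector<int32_t> dist(nq * k);        // Hamming distances
    std::vector<faiss::IndexBinary::idx_t> ids(nq * k);
    index.search(nq, xq, k, dist.data(), ids.data());
}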

@ -0,0 +1,78 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexBinaryFromFloat.h>
#include <algorithm>
#include <cmath>
#include <memory>
#include <faiss/utils/utils.h>
namespace faiss {
IndexBinaryFromFloat::IndexBinaryFromFloat() {}
IndexBinaryFromFloat::IndexBinaryFromFloat(Index *index)
: IndexBinary(index->d),
index(index),
own_fields(false) {
is_trained = index->is_trained;
ntotal = index->ntotal;
}
IndexBinaryFromFloat::~IndexBinaryFromFloat() {
if (own_fields) {
delete index;
}
}
void IndexBinaryFromFloat::add(idx_t n, const uint8_t *x) {
constexpr idx_t bs = 32768;
std::unique_ptr<float[]> xf(new float[bs * d]);
for (idx_t b = 0; b < n; b += bs) {
idx_t bn = std::min(bs, n - b);
binary_to_real(bn * d, x + b * code_size, xf.get());
index->add(bn, xf.get());
}
ntotal = index->ntotal;
}
void IndexBinaryFromFloat::reset() {
index->reset();
ntotal = index->ntotal;
}
void IndexBinaryFromFloat::search(idx_t n, const uint8_t *x, idx_t k,
int32_t *distances, idx_t *labels) const {
constexpr idx_t bs = 32768;
std::unique_ptr<float[]> xf(new float[bs * d]);
std::unique_ptr<float[]> df(new float[bs * k]);
for (idx_t b = 0; b < n; b += bs) {
idx_t bn = std::min(bs, n - b);
binary_to_real(bn * d, x + b * code_size, xf.get());
index->search(bn, xf.get(), k, df.get(), labels + b * k);
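// Note: binary_to_real maps bits to +/-1 values, so the squared-L2
// distances returned by the float index are 4x the Hamming distance;
// dividing by 4 recovers an integer-valued distance.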
for (int i = 0; i < bn * k; ++i) {
distances[b * k + i] = int32_t(std::round(df[i] / 4.0));
}
}
}
void IndexBinaryFromFloat::train(idx_t n, const uint8_t *x) {
std::unique_ptr<float[]> xf(new float[n * d]);
binary_to_real(n * d, x, xf.get());
index->train(n, xf.get());
is_trained = true;
ntotal = index->ntotal;
}
} // namespace faiss

@ -0,0 +1,52 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_BINARY_FROM_FLOAT_H
#define FAISS_INDEX_BINARY_FROM_FLOAT_H
#include <faiss/IndexBinary.h>
namespace faiss {
struct Index;
/** IndexBinary backed by a float Index.
*
* Supports adding vectors and searching them.
*
* All queries are symmetric because there is no distinction between codes and
* vectors.
*/
struct IndexBinaryFromFloat : IndexBinary {
Index *index = nullptr;
bool own_fields = false; ///< Whether object owns the index pointer.
IndexBinaryFromFloat();
explicit IndexBinaryFromFloat(Index *index);
~IndexBinaryFromFloat();
void add(idx_t n, const uint8_t *x) override;
void reset() override;
void search(idx_t n, const uint8_t *x, idx_t k,
int32_t *distances, idx_t *labels) const override;
void train(idx_t n, const uint8_t *x) override;
};
} // namespace faiss
#endif // FAISS_INDEX_BINARY_FROM_FLOAT_H
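A sketch of wrapping a float index; IndexFlatL2 is one possible backing index:

#include <faiss/IndexBinaryFromFloat.h>
#include <faiss/IndexFlat.h>

void wrap_float_sketch() {
    int d = 64;                                // bits
    faiss::IndexFlatL2 flat(d);                // float index of dimension d
    faiss::IndexBinaryFromFloat index(&flat);  // pointer is borrowed
    // add()/search() now take d/8-byte codes and return distances that
    // approximate Hamming distances (see the /4 rounding in the .cpp).
    // `flat` must outlive `index` unless own_fields is set to true.
}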

@ -0,0 +1,325 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexBinaryHNSW.h>
#include <memory>
#include <cstdlib>
#include <cassert>
#include <cstring>
#include <cstdio>
#include <cmath>
#include <omp.h>
#include <unordered_set>
#include <queue>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <stdint.h>
#include <faiss/utils/random.h>
#include <faiss/utils/Heap.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/IndexBinaryFlat.h>
#include <faiss/utils/hamming.h>
#include <faiss/impl/AuxIndexStructures.h>
namespace faiss {
/**************************************************************
* add / search blocks of descriptors
**************************************************************/
namespace {
void hnsw_add_vertices(IndexBinaryHNSW& index_hnsw,
size_t n0,
size_t n, const uint8_t *x,
bool verbose,
bool preset_levels = false) {
HNSW& hnsw = index_hnsw.hnsw;
size_t ntotal = n0 + n;
double t0 = getmillisecs();
if (verbose) {
printf("hnsw_add_vertices: adding %ld elements on top of %ld "
"(preset_levels=%d)\n",
n, n0, int(preset_levels));
}
int max_level = hnsw.prepare_level_tab(n, preset_levels);
if (verbose) {
printf(" max_level = %d\n", max_level);
}
std::vector<omp_lock_t> locks(ntotal);
for(int i = 0; i < ntotal; i++) {
omp_init_lock(&locks[i]);
}
// add vectors from highest to lowest level
std::vector<int> hist;
std::vector<int> order(n);
{ // make buckets with vectors of the same level
// build histogram
for (int i = 0; i < n; i++) {
HNSW::storage_idx_t pt_id = i + n0;
int pt_level = hnsw.levels[pt_id] - 1;
while (pt_level >= hist.size()) {
hist.push_back(0);
}
hist[pt_level] ++;
}
// accumulate
std::vector<int> offsets(hist.size() + 1, 0);
for (int i = 0; i < hist.size() - 1; i++) {
offsets[i + 1] = offsets[i] + hist[i];
}
// bucket sort
for (int i = 0; i < n; i++) {
HNSW::storage_idx_t pt_id = i + n0;
int pt_level = hnsw.levels[pt_id] - 1;
order[offsets[pt_level]++] = pt_id;
}
}
{ // perform add
RandomGenerator rng2(789);
int i1 = n;
for (int pt_level = hist.size() - 1; pt_level >= 0; pt_level--) {
int i0 = i1 - hist[pt_level];
if (verbose) {
printf("Adding %d elements at level %d\n",
i1 - i0, pt_level);
}
// random permutation to get rid of dataset order bias
for (int j = i0; j < i1; j++) {
std::swap(order[j], order[j + rng2.rand_int(i1 - j)]);
}
#pragma omp parallel
{
VisitedTable vt (ntotal);
std::unique_ptr<DistanceComputer> dis(
index_hnsw.get_distance_computer()
);
int prev_display = verbose && omp_get_thread_num() == 0 ? 0 : -1;
#pragma omp for schedule(dynamic)
for (int i = i0; i < i1; i++) {
HNSW::storage_idx_t pt_id = order[i];
dis->set_query((float *)(x + (pt_id - n0) * index_hnsw.code_size));
hnsw.add_with_locks(*dis, pt_level, pt_id, locks, vt);
if (prev_display >= 0 && i - i0 > prev_display + 10000) {
prev_display = i - i0;
printf(" %d / %d\r", i - i0, i1 - i0);
fflush(stdout);
}
}
}
i1 = i0;
}
FAISS_ASSERT(i1 == 0);
}
if (verbose) {
printf("Done in %.3f ms\n", getmillisecs() - t0);
}
for(int i = 0; i < ntotal; i++)
omp_destroy_lock(&locks[i]);
}
} // anonymous namespace
/**************************************************************
* IndexBinaryHNSW implementation
**************************************************************/
IndexBinaryHNSW::IndexBinaryHNSW()
{
is_trained = true;
}
IndexBinaryHNSW::IndexBinaryHNSW(int d, int M)
: IndexBinary(d),
hnsw(M),
own_fields(true),
storage(new IndexBinaryFlat(d))
{
is_trained = true;
}
IndexBinaryHNSW::IndexBinaryHNSW(IndexBinary *storage, int M)
: IndexBinary(storage->d),
hnsw(M),
own_fields(false),
storage(storage)
{
is_trained = true;
}
IndexBinaryHNSW::~IndexBinaryHNSW() {
if (own_fields) {
delete storage;
}
}
void IndexBinaryHNSW::train(idx_t n, const uint8_t *x)
{
// hnsw structure does not require training
storage->train(n, x);
is_trained = true;
}
void IndexBinaryHNSW::search(idx_t n, const uint8_t *x, idx_t k,
int32_t *distances, idx_t *labels) const
{
#pragma omp parallel
{
VisitedTable vt(ntotal);
std::unique_ptr<DistanceComputer> dis(get_distance_computer());
#pragma omp for
for(idx_t i = 0; i < n; i++) {
idx_t *idxi = labels + i * k;
float *simi = (float *)(distances + i * k);
dis->set_query((float *)(x + i * code_size));
maxheap_heapify(k, simi, idxi);
hnsw.search(*dis, k, idxi, simi, vt);
maxheap_reorder(k, simi, idxi);
}
}
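// The search above wrote float distances into the int32_t buffer through
// the reinterpreting casts; the loop below rounds them to integers in place.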
#pragma omp parallel for
for (int i = 0; i < n * k; ++i) {
distances[i] = std::round(((float *)distances)[i]);
}
}
void IndexBinaryHNSW::add(idx_t n, const uint8_t *x)
{
FAISS_THROW_IF_NOT(is_trained);
int n0 = ntotal;
storage->add(n, x);
ntotal = storage->ntotal;
hnsw_add_vertices(*this, n0, n, x, verbose,
hnsw.levels.size() == ntotal);
}
void IndexBinaryHNSW::reset()
{
hnsw.reset();
storage->reset();
ntotal = 0;
}
void IndexBinaryHNSW::reconstruct(idx_t key, uint8_t *recons) const
{
storage->reconstruct(key, recons);
}
namespace {
template<class HammingComputer>
struct FlatHammingDis : DistanceComputer {
const int code_size;
const uint8_t *b;
size_t ndis;
HammingComputer hc;
float operator () (idx_t i) override {
ndis++;
return hc.hamming(b + i * code_size);
}
float symmetric_dis(idx_t i, idx_t j) override {
return HammingComputerDefault(b + j * code_size, code_size)
.hamming(b + i * code_size);
}
explicit FlatHammingDis(const IndexBinaryFlat& storage)
: code_size(storage.code_size),
b(storage.xb.data()),
ndis(0),
hc() {}
// NOTE: Pointers are cast from float in order to reuse the floating-point
// DistanceComputer.
void set_query(const float *x) override {
hc.set((uint8_t *)x, code_size);
}
~FlatHammingDis() override {
#pragma omp critical
{
hnsw_stats.ndis += ndis;
}
}
};
} // namespace
DistanceComputer *IndexBinaryHNSW::get_distance_computer() const {
IndexBinaryFlat *flat_storage = dynamic_cast<IndexBinaryFlat *>(storage);
FAISS_ASSERT(flat_storage != nullptr);
switch(code_size) {
case 4:
return new FlatHammingDis<HammingComputer4>(*flat_storage);
case 8:
return new FlatHammingDis<HammingComputer8>(*flat_storage);
case 16:
return new FlatHammingDis<HammingComputer16>(*flat_storage);
case 20:
return new FlatHammingDis<HammingComputer20>(*flat_storage);
case 32:
return new FlatHammingDis<HammingComputer32>(*flat_storage);
case 64:
return new FlatHammingDis<HammingComputer64>(*flat_storage);
default:
if (code_size % 8 == 0) {
return new FlatHammingDis<HammingComputerM8>(*flat_storage);
} else if (code_size % 4 == 0) {
return new FlatHammingDis<HammingComputerM4>(*flat_storage);
}
}
return new FlatHammingDis<HammingComputerDefault>(*flat_storage);
}
} // namespace faiss

@ -0,0 +1,56 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <faiss/impl/HNSW.h>
#include <faiss/IndexBinaryFlat.h>
#include <faiss/utils/utils.h>
namespace faiss {
/** The HNSW index is a normal random-access index with a HNSW
* link structure built on top */
struct IndexBinaryHNSW : IndexBinary {
typedef HNSW::storage_idx_t storage_idx_t;
// the link structure
HNSW hnsw;
// the sequential storage
bool own_fields;
IndexBinary *storage;
explicit IndexBinaryHNSW();
explicit IndexBinaryHNSW(int d, int M = 32);
explicit IndexBinaryHNSW(IndexBinary *storage, int M = 32);
~IndexBinaryHNSW() override;
DistanceComputer *get_distance_computer() const;
void add(idx_t n, const uint8_t *x) override;
/// Trains the storage if needed
void train(idx_t n, const uint8_t* x) override;
/// entry point for search
void search(idx_t n, const uint8_t *x, idx_t k,
int32_t *distances, idx_t *labels) const override;
void reconstruct(idx_t key, uint8_t* recons) const override;
void reset() override;
};
} // namespace faiss
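A usage sketch; M = 16 and the efSearch setting are illustrative choices:

#include <cstdint>
#include <vector>
#include <faiss/IndexBinaryHNSW.h>

void binary_hnsw_sketch(size_t nb, size_t nq,
                        const uint8_t* xb, const uint8_t* xq) {
    int d = 128;                                 // bits per code
    faiss::IndexBinaryHNSW index(d, /*M=*/16);   // flat storage underneath
    index.add(nb, xb);                           // builds the graph
    index.hnsw.efSearch = 64;                    // wider candidate beam
    int k = 10;
    std::vector<int32_t> dist(nq * k);
    std::vector<faiss::IndexBinary::idx_t> ids(nq * k);
    index.search(nq, xq, k, dist.data(), ids.data());
}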

@ -0,0 +1,671 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// Copyright 2004-present Facebook. All Rights Reserved
// -*- c++ -*-
#include <faiss/IndexBinaryIVF.h>
#include <cstdio>
#include <cstring>
#include <limits>
#include <memory>
#include <faiss/utils/hamming.h>
#include <faiss/utils/utils.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/IndexFlat.h>
namespace faiss {
IndexBinaryIVF::IndexBinaryIVF(IndexBinary *quantizer, size_t d, size_t nlist)
: IndexBinary(d),
invlists(new ArrayInvertedLists(nlist, code_size)),
own_invlists(true),
nprobe(1),
max_codes(0),
maintain_direct_map(false),
quantizer(quantizer),
nlist(nlist),
own_fields(false),
clustering_index(nullptr)
{
FAISS_THROW_IF_NOT (d == quantizer->d);
is_trained = quantizer->is_trained && (quantizer->ntotal == nlist);
cp.niter = 10;
}
IndexBinaryIVF::IndexBinaryIVF()
: invlists(nullptr),
own_invlists(false),
nprobe(1),
max_codes(0),
maintain_direct_map(false),
quantizer(nullptr),
nlist(0),
own_fields(false),
clustering_index(nullptr)
{}
void IndexBinaryIVF::add(idx_t n, const uint8_t *x) {
add_with_ids(n, x, nullptr);
}
void IndexBinaryIVF::add_with_ids(idx_t n, const uint8_t *x, const idx_t *xids) {
add_core(n, x, xids, nullptr);
}
void IndexBinaryIVF::add_core(idx_t n, const uint8_t *x, const idx_t *xids,
const idx_t *precomputed_idx) {
FAISS_THROW_IF_NOT(is_trained);
assert(invlists);
FAISS_THROW_IF_NOT_MSG(!(maintain_direct_map && xids),
"cannot have direct map and add with ids");
const idx_t * idx;
std::unique_ptr<idx_t[]> scoped_idx;
if (precomputed_idx) {
idx = precomputed_idx;
} else {
scoped_idx.reset(new idx_t[n]);
quantizer->assign(n, x, scoped_idx.get());
idx = scoped_idx.get();
}
long n_add = 0;
for (size_t i = 0; i < n; i++) {
idx_t id = xids ? xids[i] : ntotal + i;
idx_t list_no = idx[i];
if (list_no < 0)
continue;
const uint8_t *xi = x + i * code_size;
size_t offset = invlists->add_entry(list_no, id, xi);
if (maintain_direct_map)
direct_map.push_back(list_no << 32 | offset);
n_add++;
}
if (verbose) {
printf("IndexBinaryIVF::add_with_ids: added %ld / %ld vectors\n",
n_add, n);
}
ntotal += n_add;
}
void IndexBinaryIVF::make_direct_map(bool new_maintain_direct_map) {
// nothing to do
if (new_maintain_direct_map == maintain_direct_map)
return;
if (new_maintain_direct_map) {
direct_map.resize(ntotal, -1);
for (size_t key = 0; key < nlist; key++) {
size_t list_size = invlists->list_size(key);
const idx_t *idlist = invlists->get_ids(key);
for (size_t ofs = 0; ofs < list_size; ofs++) {
FAISS_THROW_IF_NOT_MSG(0 <= idlist[ofs] && idlist[ofs] < ntotal,
"direct map supported only for sequential ids");
direct_map[idlist[ofs]] = key << 32 | ofs;
}
}
} else {
direct_map.clear();
}
maintain_direct_map = new_maintain_direct_map;
}
void IndexBinaryIVF::search(idx_t n, const uint8_t *x, idx_t k,
int32_t *distances, idx_t *labels) const {
std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
std::unique_ptr<int32_t[]> coarse_dis(new int32_t[n * nprobe]);
double t0 = getmillisecs();
quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get());
indexIVF_stats.quantization_time += getmillisecs() - t0;
t0 = getmillisecs();
invlists->prefetch_lists(idx.get(), n * nprobe);
search_preassigned(n, x, k, idx.get(), coarse_dis.get(),
distances, labels, false);
indexIVF_stats.search_time += getmillisecs() - t0;
}
void IndexBinaryIVF::reconstruct(idx_t key, uint8_t *recons) const {
FAISS_THROW_IF_NOT_MSG(direct_map.size() == ntotal,
"direct map is not initialized");
idx_t list_no = direct_map[key] >> 32;
idx_t offset = direct_map[key] & 0xffffffff;
reconstruct_from_offset(list_no, offset, recons);
}
void IndexBinaryIVF::reconstruct_n(idx_t i0, idx_t ni, uint8_t *recons) const {
FAISS_THROW_IF_NOT(ni == 0 || (i0 >= 0 && i0 + ni <= ntotal));
for (idx_t list_no = 0; list_no < nlist; list_no++) {
size_t list_size = invlists->list_size(list_no);
const Index::idx_t *idlist = invlists->get_ids(list_no);
for (idx_t offset = 0; offset < list_size; offset++) {
idx_t id = idlist[offset];
if (!(id >= i0 && id < i0 + ni)) {
continue;
}
uint8_t *reconstructed = recons + (id - i0) * d;
reconstruct_from_offset(list_no, offset, reconstructed);
}
}
}
void IndexBinaryIVF::search_and_reconstruct(idx_t n, const uint8_t *x, idx_t k,
int32_t *distances, idx_t *labels,
uint8_t *recons) const {
std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
std::unique_ptr<int32_t[]> coarse_dis(new int32_t[n * nprobe]);
quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get());
invlists->prefetch_lists(idx.get(), n * nprobe);
// search_preassigned() with `store_pairs` enabled to obtain the list_no
// and offset into `codes` for reconstruction
search_preassigned(n, x, k, idx.get(), coarse_dis.get(),
distances, labels, /* store_pairs */true);
for (idx_t i = 0; i < n; ++i) {
for (idx_t j = 0; j < k; ++j) {
idx_t ij = i * k + j;
idx_t key = labels[ij];
uint8_t *reconstructed = recons + ij * d;
if (key < 0) {
// Fill with 0xff bytes (binary codes have no NaN) to mark missing results
memset(reconstructed, -1, sizeof(*reconstructed) * d);
} else {
int list_no = key >> 32;
int offset = key & 0xffffffff;
// Update label to the actual id
labels[ij] = invlists->get_single_id(list_no, offset);
reconstruct_from_offset(list_no, offset, reconstructed);
}
}
}
}
void IndexBinaryIVF::reconstruct_from_offset(idx_t list_no, idx_t offset,
uint8_t *recons) const {
memcpy(recons, invlists->get_single_code(list_no, offset), code_size);
}
void IndexBinaryIVF::reset() {
direct_map.clear();
invlists->reset();
ntotal = 0;
}
size_t IndexBinaryIVF::remove_ids(const IDSelector& sel) {
FAISS_THROW_IF_NOT_MSG(!maintain_direct_map,
"direct map remove not implemented");
std::vector<idx_t> toremove(nlist);
#pragma omp parallel for
for (idx_t i = 0; i < nlist; i++) {
idx_t l0 = invlists->list_size (i), l = l0, j = 0;
const idx_t *idsi = invlists->get_ids(i);
while (j < l) {
if (sel.is_member(idsi[j])) {
l--;
invlists->update_entry(
i, j,
invlists->get_single_id(i, l),
invlists->get_single_code(i, l));
} else {
j++;
}
}
toremove[i] = l0 - l;
}
// this will not run well in parallel with on-disk inverted lists because of possible shrinks
size_t nremove = 0;
for (idx_t i = 0; i < nlist; i++) {
if (toremove[i] > 0) {
nremove += toremove[i];
invlists->resize(
i, invlists->list_size(i) - toremove[i]);
}
}
ntotal -= nremove;
return nremove;
}
void IndexBinaryIVF::train(idx_t n, const uint8_t *x) {
if (verbose) {
printf("Training quantizer\n");
}
if (quantizer->is_trained && (quantizer->ntotal == nlist)) {
if (verbose) {
printf("IVF quantizer does not need training.\n");
}
} else {
if (verbose) {
printf("Training quantizer on %ld vectors in %dD\n", n, d);
}
Clustering clus(d, nlist, cp);
quantizer->reset();
std::unique_ptr<float[]> x_f(new float[n * d]);
binary_to_real(n * d, x, x_f.get());
IndexFlatL2 index_tmp(d);
if (clustering_index && verbose) {
printf("using clustering_index of dimension %d to do the clustering\n",
clustering_index->d);
}
clus.train(n, x_f.get(), clustering_index ? *clustering_index : index_tmp);
std::unique_ptr<uint8_t[]> x_b(new uint8_t[clus.k * code_size]);
real_to_binary(d * clus.k, clus.centroids.data(), x_b.get());
quantizer->add(clus.k, x_b.get());
quantizer->is_trained = true;
}
is_trained = true;
}
void IndexBinaryIVF::merge_from(IndexBinaryIVF &other, idx_t add_id) {
// minimal sanity checks
FAISS_THROW_IF_NOT(other.d == d);
FAISS_THROW_IF_NOT(other.nlist == nlist);
FAISS_THROW_IF_NOT(other.code_size == code_size);
FAISS_THROW_IF_NOT_MSG((!maintain_direct_map &&
!other.maintain_direct_map),
"direct map copy not implemented");
FAISS_THROW_IF_NOT_MSG(typeid (*this) == typeid (other),
"can only merge indexes of the same type");
invlists->merge_from (other.invlists, add_id);
ntotal += other.ntotal;
other.ntotal = 0;
}
void IndexBinaryIVF::replace_invlists(InvertedLists *il, bool own) {
FAISS_THROW_IF_NOT(il->nlist == nlist &&
il->code_size == code_size);
if (own_invlists) {
delete invlists;
}
invlists = il;
own_invlists = own;
}
namespace {
using idx_t = Index::idx_t;
template<class HammingComputer, bool store_pairs>
struct IVFBinaryScannerL2: BinaryInvertedListScanner {
HammingComputer hc;
size_t code_size;
IVFBinaryScannerL2 (size_t code_size): code_size (code_size)
{}
void set_query (const uint8_t *query_vector) override {
hc.set (query_vector, code_size);
}
idx_t list_no;
void set_list (idx_t list_no, uint8_t /* coarse_dis */) override {
this->list_no = list_no;
}
uint32_t distance_to_code (const uint8_t *code) const override {
return hc.hamming (code);
}
size_t scan_codes (size_t n,
const uint8_t *codes,
const idx_t *ids,
int32_t *simi, idx_t *idxi,
size_t k) const override
{
using C = CMax<int32_t, idx_t>;
size_t nup = 0;
for (size_t j = 0; j < n; j++) {
uint32_t dis = hc.hamming (codes);
if (dis < simi[0]) {
heap_pop<C> (k, simi, idxi);
idx_t id = store_pairs ? (list_no << 32 | j) : ids[j];
heap_push<C> (k, simi, idxi, dis, id);
nup++;
}
codes += code_size;
}
return nup;
}
};
template <bool store_pairs>
BinaryInvertedListScanner *select_IVFBinaryScannerL2 (size_t code_size) {
switch (code_size) {
#define HANDLE_CS(cs) \
case cs: \
return new IVFBinaryScannerL2<HammingComputer ## cs, store_pairs> (cs);
HANDLE_CS(4);
HANDLE_CS(8);
HANDLE_CS(16);
HANDLE_CS(20);
HANDLE_CS(32);
HANDLE_CS(64);
#undef HANDLE_CS
default:
if (code_size % 8 == 0) {
return new IVFBinaryScannerL2<HammingComputerM8,
store_pairs> (code_size);
} else if (code_size % 4 == 0) {
return new IVFBinaryScannerL2<HammingComputerM4,
store_pairs> (code_size);
} else {
return new IVFBinaryScannerL2<HammingComputerDefault,
store_pairs> (code_size);
}
}
}
void search_knn_hamming_heap(const IndexBinaryIVF& ivf,
size_t n,
const uint8_t *x,
idx_t k,
const idx_t *keys,
const int32_t * coarse_dis,
int32_t *distances, idx_t *labels,
bool store_pairs,
const IVFSearchParameters *params)
{
long nprobe = params ? params->nprobe : ivf.nprobe;
long max_codes = params ? params->max_codes : ivf.max_codes;
MetricType metric_type = ivf.metric_type;
// almost verbatim copy from IndexIVF::search_preassigned
size_t nlistv = 0, ndis = 0, nheap = 0;
using HeapForIP = CMin<int32_t, idx_t>;
using HeapForL2 = CMax<int32_t, idx_t>;
#pragma omp parallel if(n > 1) reduction(+: nlistv, ndis, nheap)
{
std::unique_ptr<BinaryInvertedListScanner> scanner
(ivf.get_InvertedListScanner (store_pairs));
#pragma omp for
for (size_t i = 0; i < n; i++) {
const uint8_t *xi = x + i * ivf.code_size;
scanner->set_query(xi);
const idx_t * keysi = keys + i * nprobe;
int32_t * simi = distances + k * i;
idx_t * idxi = labels + k * i;
if (metric_type == METRIC_INNER_PRODUCT) {
heap_heapify<HeapForIP> (k, simi, idxi);
} else {
heap_heapify<HeapForL2> (k, simi, idxi);
}
size_t nscan = 0;
for (size_t ik = 0; ik < nprobe; ik++) {
idx_t key = keysi[ik]; /* select the list */
if (key < 0) {
// not enough centroids for multiprobe
continue;
}
FAISS_THROW_IF_NOT_FMT
(key < (idx_t) ivf.nlist,
"Invalid key=%ld at ik=%ld nlist=%ld\n",
key, ik, ivf.nlist);
scanner->set_list (key, coarse_dis[i * nprobe + ik]);
nlistv++;
size_t list_size = ivf.invlists->list_size(key);
InvertedLists::ScopedCodes scodes (ivf.invlists, key);
std::unique_ptr<InvertedLists::ScopedIds> sids;
const Index::idx_t * ids = nullptr;
if (!store_pairs) {
sids.reset (new InvertedLists::ScopedIds (ivf.invlists, key));
ids = sids->get();
}
nheap += scanner->scan_codes (list_size, scodes.get(),
ids, simi, idxi, k);
nscan += list_size;
if (max_codes && nscan >= max_codes)
break;
}
ndis += nscan;
if (metric_type == METRIC_INNER_PRODUCT) {
heap_reorder<HeapForIP> (k, simi, idxi);
} else {
heap_reorder<HeapForL2> (k, simi, idxi);
}
} // parallel for
} // parallel
indexIVF_stats.nq += n;
indexIVF_stats.nlist += nlistv;
indexIVF_stats.ndis += ndis;
indexIVF_stats.nheap_updates += nheap;
}
template<class HammingComputer, bool store_pairs>
void search_knn_hamming_count(const IndexBinaryIVF& ivf,
size_t nx,
const uint8_t *x,
const idx_t *keys,
int k,
int32_t *distances,
idx_t *labels,
const IVFSearchParameters *params) {
const int nBuckets = ivf.d + 1;
std::vector<int> all_counters(nx * nBuckets, 0);
std::unique_ptr<idx_t[]> all_ids_per_dis(new idx_t[nx * nBuckets * k]);
long nprobe = params ? params->nprobe : ivf.nprobe;
long max_codes = params ? params->max_codes : ivf.max_codes;
std::vector<HCounterState<HammingComputer>> cs;
for (size_t i = 0; i < nx; ++i) {
cs.push_back(HCounterState<HammingComputer>(
all_counters.data() + i * nBuckets,
all_ids_per_dis.get() + i * nBuckets * k,
x + i * ivf.code_size,
ivf.d,
k
));
}
size_t nlistv = 0, ndis = 0;
#pragma omp parallel for reduction(+: nlistv, ndis)
for (size_t i = 0; i < nx; i++) {
const idx_t * keysi = keys + i * nprobe;
HCounterState<HammingComputer>& csi = cs[i];
size_t nscan = 0;
for (size_t ik = 0; ik < nprobe; ik++) {
idx_t key = keysi[ik]; /* select the list */
if (key < 0) {
// not enough centroids for multiprobe
continue;
}
FAISS_THROW_IF_NOT_FMT (
key < (idx_t) ivf.nlist,
"Invalid key=%ld at ik=%ld nlist=%ld\n",
key, ik, ivf.nlist);
nlistv++;
size_t list_size = ivf.invlists->list_size(key);
InvertedLists::ScopedCodes scodes (ivf.invlists, key);
const uint8_t *list_vecs = scodes.get();
const Index::idx_t *ids = store_pairs
? nullptr
: ivf.invlists->get_ids(key);
for (size_t j = 0; j < list_size; j++) {
const uint8_t * yj = list_vecs + ivf.code_size * j;
idx_t id = store_pairs ? (key << 32 | j) : ids[j];
csi.update_counter(yj, id);
}
if (ids)
ivf.invlists->release_ids (key, ids);
nscan += list_size;
if (max_codes && nscan >= max_codes)
break;
}
ndis += nscan;
int nres = 0;
for (int b = 0; b < nBuckets && nres < k; b++) {
for (int l = 0; l < csi.counters[b] && nres < k; l++) {
labels[i * k + nres] = csi.ids_per_dis[b * k + l];
distances[i * k + nres] = b;
nres++;
}
}
while (nres < k) {
labels[i * k + nres] = -1;
distances[i * k + nres] = std::numeric_limits<int32_t>::max();
++nres;
}
}
indexIVF_stats.nq += nx;
indexIVF_stats.nlist += nlistv;
indexIVF_stats.ndis += ndis;
}
template<bool store_pairs>
void search_knn_hamming_count_1 (
const IndexBinaryIVF& ivf,
size_t nx,
const uint8_t *x,
const idx_t *keys,
int k,
int32_t *distances,
idx_t *labels,
const IVFSearchParameters *params) {
switch (ivf.code_size) {
#define HANDLE_CS(cs) \
case cs: \
search_knn_hamming_count<HammingComputer ## cs, store_pairs>( \
ivf, nx, x, keys, k, distances, labels, params); \
break;
HANDLE_CS(4);
HANDLE_CS(8);
HANDLE_CS(16);
HANDLE_CS(20);
HANDLE_CS(32);
HANDLE_CS(64);
#undef HANDLE_CS
default:
if (ivf.code_size % 8 == 0) {
search_knn_hamming_count<HammingComputerM8, store_pairs>
(ivf, nx, x, keys, k, distances, labels, params);
} else if (ivf.code_size % 4 == 0) {
search_knn_hamming_count<HammingComputerM4, store_pairs>
(ivf, nx, x, keys, k, distances, labels, params);
} else {
search_knn_hamming_count<HammingComputerDefault, store_pairs>
(ivf, nx, x, keys, k, distances, labels, params);
}
break;
}
}
} // namespace
BinaryInvertedListScanner *IndexBinaryIVF::get_InvertedListScanner
(bool store_pairs) const
{
if (store_pairs) {
return select_IVFBinaryScannerL2<true> (code_size);
} else {
return select_IVFBinaryScannerL2<false> (code_size);
}
}
void IndexBinaryIVF::search_preassigned(idx_t n, const uint8_t *x, idx_t k,
const idx_t *idx,
const int32_t * coarse_dis,
int32_t *distances, idx_t *labels,
bool store_pairs,
const IVFSearchParameters *params
) const {
if (use_heap) {
search_knn_hamming_heap (*this, n, x, k, idx, coarse_dis,
distances, labels, store_pairs,
params);
} else {
if (store_pairs) {
search_knn_hamming_count_1<true>
(*this, n, x, idx, k, distances, labels, params);
} else {
search_knn_hamming_count_1<false>
(*this, n, x, idx, k, distances, labels, params);
}
}
}
IndexBinaryIVF::~IndexBinaryIVF() {
if (own_invlists) {
delete invlists;
}
if (own_fields) {
delete quantizer;
}
}
} // namespace faiss
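Since search_preassigned() can return packed (list_no, offset) pairs when store_pairs is set, a small sketch of turning such a label back into a stored id:

#include <faiss/IndexBinaryIVF.h>

// Hypothetical helper mirroring the decoding done in search_and_reconstruct().
faiss::IndexBinary::idx_t unpack_stored_id(const faiss::IndexBinaryIVF& index,
                                           faiss::IndexBinary::idx_t label) {
    if (label < 0) return -1;                  // padding: not enough results
    faiss::IndexBinary::idx_t list_no = label >> 32;
    faiss::IndexBinary::idx_t offset = label & 0xffffffff;
    return index.invlists->get_single_id(list_no, offset);
}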

@ -0,0 +1,211 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_BINARY_IVF_H
#define FAISS_INDEX_BINARY_IVF_H
#include <vector>
#include <faiss/IndexBinary.h>
#include <faiss/IndexIVF.h>
#include <faiss/Clustering.h>
#include <faiss/utils/Heap.h>
namespace faiss {
struct BinaryInvertedListScanner;
/** Index based on an inverted file (IVF)
*
* In the inverted file, the quantizer (an IndexBinary instance) provides a
* quantization index for each vector to be added. The quantization
* index maps to a list (aka inverted list or posting list), where the
* id of the vector is stored.
*
* Otherwise the object is similar to the IndexIVF
*/
struct IndexBinaryIVF : IndexBinary {
/// Access to the actual data
InvertedLists *invlists;
bool own_invlists;
size_t nprobe; ///< number of probes at query time
size_t max_codes; ///< max nb of codes to visit to do a query
/** Select between using a heap or counting to select the k smallest values
* when scanning inverted lists.
*/
bool use_heap = true;
/// map for direct access to the elements. Enables reconstruct().
bool maintain_direct_map;
std::vector<idx_t> direct_map;
IndexBinary *quantizer; ///< quantizer that maps vectors to inverted lists
size_t nlist; ///< number of possible key values
bool own_fields; ///< whether object owns the quantizer
ClusteringParameters cp; ///< to override default clustering params
Index *clustering_index; ///< to override index used during clustering
/** The Inverted file takes a quantizer (an IndexBinary) on input,
* which implements the function mapping a vector to a list
* identifier. The pointer is borrowed: the quantizer should not
* be deleted while the IndexBinaryIVF is in use.
*/
IndexBinaryIVF(IndexBinary *quantizer, size_t d, size_t nlist);
IndexBinaryIVF();
~IndexBinaryIVF() override;
void reset() override;
/// Trains the quantizer
void train(idx_t n, const uint8_t *x) override;
void add(idx_t n, const uint8_t *x) override;
void add_with_ids(idx_t n, const uint8_t *x, const idx_t *xids) override;
/// same as add_with_ids, with precomputed coarse quantizer
void add_core (idx_t n, const uint8_t * x, const idx_t *xids,
const idx_t *precomputed_idx);
/** Search a set of vectors, that are pre-quantized by the IVF
* quantizer. Fill in the corresponding heaps with the query
* results. search() calls this.
*
* @param n nb of vectors to query
* @param x query vectors, size nx * d
* @param assign coarse quantization indices, size nx * nprobe
* @param centroid_dis
* distances to coarse centroids, size nx * nprobe
* @param distances
* output distances, size n * k
* @param labels output labels, size n * k
* @param store_pairs store inv list index + inv list offset in the
* upper/lower 32 bits of the result instead of ids
* (used for reranking).
* @param params used to override the object's search parameters
*/
void search_preassigned(idx_t n, const uint8_t *x, idx_t k,
const idx_t *assign,
const int32_t *centroid_dis,
int32_t *distances, idx_t *labels,
bool store_pairs,
const IVFSearchParameters *params=nullptr
) const;
virtual BinaryInvertedListScanner *get_InvertedListScanner (
bool store_pairs=false) const;
/** assign the vectors, then call search_preassigned */
virtual void search(idx_t n, const uint8_t *x, idx_t k,
int32_t *distances, idx_t *labels) const override;
void reconstruct(idx_t key, uint8_t *recons) const override;
/** Reconstruct a subset of the indexed vectors.
*
* Overrides default implementation to bypass reconstruct() which requires
* direct_map to be maintained.
*
* @param i0 first vector to reconstruct
* @param ni nb of vectors to reconstruct
* @param recons output array of reconstructed vectors, size ni * d / 8
*/
void reconstruct_n(idx_t i0, idx_t ni, uint8_t *recons) const override;
/** Similar to search, but also reconstructs the stored vectors (or an
* approximation in the case of lossy coding) for the search results.
*
* Overrides default implementation to avoid having to maintain direct_map
* and instead fetch the code offsets through the `store_pairs` flag in
* search_preassigned().
*
* @param recons reconstructed vectors size (n, k, d / 8)
*/
void search_and_reconstruct(idx_t n, const uint8_t *x, idx_t k,
int32_t *distances, idx_t *labels,
uint8_t *recons) const override;
/** Reconstruct a vector given the location in terms of (inv list index +
* inv list offset) instead of the id.
*
* Useful for reconstructing when the direct_map is not maintained and
* the inv list offset is computed by search_preassigned() with
* `store_pairs` set.
*/
virtual void reconstruct_from_offset(idx_t list_no, idx_t offset,
uint8_t* recons) const;
/// Dataset manipulation functions
size_t remove_ids(const IDSelector& sel) override;
/** moves the entries from another dataset to self. On output,
* other is empty. add_id is added to all moved ids (for
* sequential ids, this would be this->ntotal) */
virtual void merge_from(IndexBinaryIVF& other, idx_t add_id);
size_t get_list_size(size_t list_no) const
{ return invlists->list_size(list_no); }
/** initialize a direct map
*
* @param new_maintain_direct_map if true, create a direct map,
* else clear it
*/
void make_direct_map(bool new_maintain_direct_map=true);
void replace_invlists(InvertedLists *il, bool own=false);
};
struct BinaryInvertedListScanner {
using idx_t = Index::idx_t;
/// from now on we handle this query.
virtual void set_query (const uint8_t *query_vector) = 0;
/// following codes come from this inverted list
virtual void set_list (idx_t list_no, uint8_t coarse_dis) = 0;
/// compute a single query-to-code distance
virtual uint32_t distance_to_code (const uint8_t *code) const = 0;
/** compute the distances to codes. (distances, labels) should be
* organized as a min- or max-heap
*
* @param n number of codes to scan
* @param codes codes to scan (n * code_size)
* @param ids corresponding ids (ignored if store_pairs)
* @param distances heap distances (size k)
* @param labels heap labels (size k)
* @param k heap size
*/
virtual size_t scan_codes (size_t n,
const uint8_t *codes,
const idx_t *ids,
int32_t *distances, idx_t *labels,
size_t k) const = 0;
virtual ~BinaryInvertedListScanner () {}
};
} // namespace faiss
#endif // FAISS_INDEX_BINARY_IVF_H
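An end-to-end sketch for this class; nlist and nprobe are illustrative settings:

#include <cstdint>
#include <vector>
#include <faiss/IndexBinaryFlat.h>
#include <faiss/IndexBinaryIVF.h>

void binary_ivf_sketch(size_t nb, size_t nq,
                       const uint8_t* xb, const uint8_t* xq) {
    int d = 256;
    size_t nlist = 128;
    faiss::IndexBinaryFlat quantizer(d);       // coarse quantizer
    faiss::IndexBinaryIVF index(&quantizer, d, nlist);
    index.train(nb, xb);                       // clusters in the float domain
    index.add(nb, xb);
    index.nprobe = 8;                          // inverted lists visited per query
    int k = 10;
    std::vector<int32_t> dist(nq * k);
    std::vector<faiss::IndexBinary::idx_t> ids(nq * k);
    index.search(nq, xq, k, dist.data(), ids.data());
}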

@ -0,0 +1,508 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexFlat.h>
#include <cstring>
#include <limits>
#include <faiss/utils/distances.h>
#include <faiss/utils/extra_distances.h>
#include <faiss/utils/utils.h>
#include <faiss/utils/Heap.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/impl/AuxIndexStructures.h>
namespace faiss {
IndexFlat::IndexFlat (idx_t d, MetricType metric):
Index(d, metric)
{
}
void IndexFlat::add (idx_t n, const float *x) {
xb.insert(xb.end(), x, x + n * d);
ntotal += n;
}
void IndexFlat::reset() {
xb.clear();
ntotal = 0;
}
void IndexFlat::search (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels) const
{
// we see the distances and labels as heaps
if (metric_type == METRIC_INNER_PRODUCT) {
float_minheap_array_t res = {
size_t(n), size_t(k), labels, distances};
knn_inner_product (x, xb.data(), d, n, ntotal, &res);
} else if (metric_type == METRIC_L2) {
float_maxheap_array_t res = {
size_t(n), size_t(k), labels, distances};
knn_L2sqr (x, xb.data(), d, n, ntotal, &res);
} else {
float_maxheap_array_t res = {
size_t(n), size_t(k), labels, distances};
knn_extra_metrics (x, xb.data(), d, n, ntotal,
metric_type, metric_arg,
&res);
}
}
void IndexFlat::range_search (idx_t n, const float *x, float radius,
RangeSearchResult *result) const
{
switch (metric_type) {
case METRIC_INNER_PRODUCT:
range_search_inner_product (x, xb.data(), d, n, ntotal,
radius, result);
break;
case METRIC_L2:
range_search_L2sqr (x, xb.data(), d, n, ntotal, radius, result);
break;
default:
FAISS_THROW_MSG("metric type not supported");
}
}
void IndexFlat::compute_distance_subset (
idx_t n,
const float *x,
idx_t k,
float *distances,
const idx_t *labels) const
{
switch (metric_type) {
case METRIC_INNER_PRODUCT:
fvec_inner_products_by_idx (
distances,
x, xb.data(), labels, d, n, k);
break;
case METRIC_L2:
fvec_L2sqr_by_idx (
distances,
x, xb.data(), labels, d, n, k);
break;
default:
FAISS_THROW_MSG("metric type not supported");
}
}
size_t IndexFlat::remove_ids (const IDSelector & sel)
{
idx_t j = 0;
for (idx_t i = 0; i < ntotal; i++) {
if (sel.is_member (i)) {
// should be removed
} else {
if (i > j) {
memmove (&xb[d * j], &xb[d * i], sizeof(xb[0]) * d);
}
j++;
}
}
size_t nremove = ntotal - j;
if (nremove > 0) {
ntotal = j;
xb.resize (ntotal * d);
}
return nremove;
}
namespace {
struct FlatL2Dis : DistanceComputer {
size_t d;
Index::idx_t nb;
const float *q;
const float *b;
size_t ndis;
float operator () (idx_t i) override {
ndis++;
return fvec_L2sqr(q, b + i * d, d);
}
float symmetric_dis(idx_t i, idx_t j) override {
return fvec_L2sqr(b + j * d, b + i * d, d);
}
explicit FlatL2Dis(const IndexFlat& storage, const float *q = nullptr)
: d(storage.d),
nb(storage.ntotal),
q(q),
b(storage.xb.data()),
ndis(0) {}
void set_query(const float *x) override {
q = x;
}
};
struct FlatIPDis : DistanceComputer {
size_t d;
Index::idx_t nb;
const float *q;
const float *b;
size_t ndis;
float operator () (idx_t i) override {
ndis++;
return fvec_inner_product (q, b + i * d, d);
}
float symmetric_dis(idx_t i, idx_t j) override {
return fvec_inner_product (b + j * d, b + i * d, d);
}
explicit FlatIPDis(const IndexFlat& storage, const float *q = nullptr)
: d(storage.d),
nb(storage.ntotal),
q(q),
b(storage.xb.data()),
ndis(0) {}
void set_query(const float *x) override {
q = x;
}
};
} // namespace
DistanceComputer * IndexFlat::get_distance_computer() const {
if (metric_type == METRIC_L2) {
return new FlatL2Dis(*this);
} else if (metric_type == METRIC_INNER_PRODUCT) {
return new FlatIPDis(*this);
} else {
return get_extra_distance_computer (d, metric_type, metric_arg,
ntotal, xb.data());
}
}
void IndexFlat::reconstruct (idx_t key, float * recons) const
{
memcpy (recons, &(xb[key * d]), sizeof(*recons) * d);
}
/* The standalone codec interface */
size_t IndexFlat::sa_code_size () const
{
return sizeof(float) * d;
}
void IndexFlat::sa_encode (idx_t n, const float *x, uint8_t *bytes) const
{
memcpy (bytes, x, sizeof(float) * d * n);
}
void IndexFlat::sa_decode (idx_t n, const uint8_t *bytes, float *x) const
{
memcpy (x, bytes, sizeof(float) * d * n);
}
/***************************************************
* IndexFlatL2BaseShift
***************************************************/
IndexFlatL2BaseShift::IndexFlatL2BaseShift (idx_t d, size_t nshift, const float *shift):
IndexFlatL2 (d), shift (nshift)
{
memcpy (this->shift.data(), shift, sizeof(float) * nshift);
}
void IndexFlatL2BaseShift::search (
idx_t n,
const float *x,
idx_t k,
float *distances,
idx_t *labels) const
{
FAISS_THROW_IF_NOT (shift.size() == ntotal);
float_maxheap_array_t res = {
size_t(n), size_t(k), labels, distances};
knn_L2sqr_base_shift (x, xb.data(), d, n, ntotal, &res, shift.data());
}
/***************************************************
* IndexRefineFlat
***************************************************/
IndexRefineFlat::IndexRefineFlat (Index *base_index):
Index (base_index->d, base_index->metric_type),
refine_index (base_index->d, base_index->metric_type),
base_index (base_index), own_fields (false),
k_factor (1)
{
is_trained = base_index->is_trained;
FAISS_THROW_IF_NOT_MSG (base_index->ntotal == 0,
"base_index should be empty in the beginning");
}
IndexRefineFlat::IndexRefineFlat () {
base_index = nullptr;
own_fields = false;
k_factor = 1;
}
void IndexRefineFlat::train (idx_t n, const float *x)
{
base_index->train (n, x);
is_trained = true;
}
void IndexRefineFlat::add (idx_t n, const float *x) {
FAISS_THROW_IF_NOT (is_trained);
base_index->add (n, x);
refine_index.add (n, x);
ntotal = refine_index.ntotal;
}
void IndexRefineFlat::reset ()
{
base_index->reset ();
refine_index.reset ();
ntotal = 0;
}
namespace {
typedef faiss::Index::idx_t idx_t;
template<class C>
static void reorder_2_heaps (
idx_t n,
idx_t k, idx_t *labels, float *distances,
idx_t k_base, const idx_t *base_labels, const float *base_distances)
{
#pragma omp parallel for
for (idx_t i = 0; i < n; i++) {
idx_t *idxo = labels + i * k;
float *diso = distances + i * k;
const idx_t *idxi = base_labels + i * k_base;
const float *disi = base_distances + i * k_base;
heap_heapify<C> (k, diso, idxo, disi, idxi, k);
if (k_base != k) { // add remaining elements
heap_addn<C> (k, diso, idxo, disi + k, idxi + k, k_base - k);
}
heap_reorder<C> (k, diso, idxo);
}
}
}
void IndexRefineFlat::search (
idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels) const
{
FAISS_THROW_IF_NOT (is_trained);
idx_t k_base = idx_t (k * k_factor);
idx_t * base_labels = labels;
float * base_distances = distances;
ScopeDeleter<idx_t> del1;
ScopeDeleter<float> del2;
if (k != k_base) {
base_labels = new idx_t [n * k_base];
del1.set (base_labels);
base_distances = new float [n * k_base];
del2.set (base_distances);
}
base_index->search (n, x, k_base, base_distances, base_labels);
for (int i = 0; i < n * k_base; i++)
assert (base_labels[i] >= -1 &&
base_labels[i] < ntotal);
// compute refined distances
refine_index.compute_distance_subset (
n, x, k_base, base_distances, base_labels);
// sort and store result
if (metric_type == METRIC_L2) {
typedef CMax <float, idx_t> C;
reorder_2_heaps<C> (
n, k, labels, distances,
k_base, base_labels, base_distances);
} else if (metric_type == METRIC_INNER_PRODUCT) {
typedef CMin <float, idx_t> C;
reorder_2_heaps<C> (
n, k, labels, distances,
k_base, base_labels, base_distances);
} else {
FAISS_THROW_MSG("Metric type not supported");
}
}
IndexRefineFlat::~IndexRefineFlat ()
{
if (own_fields) delete base_index;
}
/***************************************************
* IndexFlat1D
***************************************************/
IndexFlat1D::IndexFlat1D (bool continuous_update):
IndexFlatL2 (1),
continuous_update (continuous_update)
{
}
/// if not continuous_update, call this between the last add and
/// the first search
void IndexFlat1D::update_permutation ()
{
perm.resize (ntotal);
if (ntotal < 1000000) {
fvec_argsort (ntotal, xb.data(), (size_t*)perm.data());
} else {
fvec_argsort_parallel (ntotal, xb.data(), (size_t*)perm.data());
}
}
void IndexFlat1D::add (idx_t n, const float *x)
{
IndexFlatL2::add (n, x);
if (continuous_update)
update_permutation();
}
void IndexFlat1D::reset()
{
IndexFlatL2::reset();
perm.clear();
}
void IndexFlat1D::search (
idx_t n,
const float *x,
idx_t k,
float *distances,
idx_t *labels) const
{
FAISS_THROW_IF_NOT_MSG (perm.size() == ntotal,
"Call update_permutation before search");
#pragma omp parallel for
for (idx_t i = 0; i < n; i++) {
float q = x[i]; // query
float *D = distances + i * k;
idx_t *I = labels + i * k;
// binary search
idx_t i0 = 0, i1 = ntotal;
idx_t wp = 0;
if (xb[perm[i0]] > q) {
i1 = 0;
goto finish_right;
}
if (xb[perm[i1 - 1]] <= q) {
i0 = i1 - 1;
goto finish_left;
}
while (i0 + 1 < i1) {
idx_t imed = (i0 + i1) / 2;
if (xb[perm[imed]] <= q) i0 = imed;
else i1 = imed;
}
// query is between xb[perm[i0]] and xb[perm[i1]]
// expand to nearest neighs
while (wp < k) {
float xleft = xb[perm[i0]];
float xright = xb[perm[i1]];
if (q - xleft < xright - q) {
D[wp] = q - xleft;
I[wp] = perm[i0];
i0--; wp++;
if (i0 < 0) { goto finish_right; }
} else {
D[wp] = xright - q;
I[wp] = perm[i1];
i1++; wp++;
if (i1 >= ntotal) { goto finish_left; }
}
}
goto done;
finish_right:
// grow to the right from i1
while (wp < k) {
if (i1 < ntotal) {
D[wp] = xb[perm[i1]] - q;
I[wp] = perm[i1];
i1++;
} else {
D[wp] = std::numeric_limits<float>::infinity();
I[wp] = -1;
}
wp++;
}
goto done;
finish_left:
// grow to the left from i0
while (wp < k) {
if (i0 >= 0) {
D[wp] = q - xb[perm[i0]];
I[wp] = perm[i0];
i0--;
} else {
D[wp] = std::numeric_limits<float>::infinity();
I[wp] = -1;
}
wp++;
}
done: ;
}
}
} // namespace faiss
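A sketch of the range_search() path implemented above; RangeSearchResult is declared in impl/AuxIndexStructures.h:

#include <vector>
#include <faiss/IndexFlat.h>
#include <faiss/impl/AuxIndexStructures.h>

void flat_range_sketch(size_t nb, size_t nq,
                       const float* xb, const float* xq) {
    int d = 64;
    faiss::IndexFlatL2 index(d);
    index.add(nb, xb);
    faiss::RangeSearchResult res(nq);          // owns lims/labels/distances
    index.range_search(nq, xq, /*radius=*/0.5f, &res);  // radius in squared L2
    // Hits of query i are res.labels[res.lims[i] .. res.lims[i+1]),
    // unordered within each query.
}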

@ -0,0 +1,175 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef INDEX_FLAT_H
#define INDEX_FLAT_H
#include <vector>
#include <faiss/Index.h>
namespace faiss {
/** Index that stores the full vectors and performs exhaustive search */
struct IndexFlat: Index {
/// database vectors, size ntotal * d
std::vector<float> xb;
explicit IndexFlat (idx_t d, MetricType metric = METRIC_L2);
void add(idx_t n, const float* x) override;
void reset() override;
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
void range_search(
idx_t n,
const float* x,
float radius,
RangeSearchResult* result) const override;
void reconstruct(idx_t key, float* recons) const override;
/** compute distance with a subset of vectors
*
* @param x query vectors, size n * d
* @param labels indices of the vectors that should be compared
* for each query vector, size n * k
* @param distances
* corresponding output distances, size n * k
*/
void compute_distance_subset (
idx_t n,
const float *x,
idx_t k,
float *distances,
const idx_t *labels) const;
/** Remove some ids. NB: because of the indexing structure, the
* semantics of this operation are different from the usual ones:
* the new ids are shifted. */
size_t remove_ids(const IDSelector& sel) override;
IndexFlat () {}
DistanceComputer * get_distance_computer() const override;
/* The standalone codec interface (just memcpy in this case) */
size_t sa_code_size () const override;
void sa_encode (idx_t n, const float *x,
uint8_t *bytes) const override;
void sa_decode (idx_t n, const uint8_t *bytes,
float *x) const override;
};
struct IndexFlatIP:IndexFlat {
explicit IndexFlatIP (idx_t d): IndexFlat (d, METRIC_INNER_PRODUCT) {}
IndexFlatIP () {}
};
struct IndexFlatL2:IndexFlat {
explicit IndexFlatL2 (idx_t d): IndexFlat (d, METRIC_L2) {}
IndexFlatL2 () {}
};
// same as an IndexFlatL2 but a value is subtracted from each distance
struct IndexFlatL2BaseShift: IndexFlatL2 {
std::vector<float> shift;
IndexFlatL2BaseShift (idx_t d, size_t nshift, const float *shift);
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
};
/** Index that queries in a base_index (a fast one) and refines the
* results with an exact search, hopefully improving the results.
*/
struct IndexRefineFlat: Index {
/// storage for full vectors
IndexFlat refine_index;
/// faster index to pre-select the vectors that should be filtered
Index *base_index;
bool own_fields; ///< should the base index be deallocated?
/// factor between k requested in search and the k requested from
/// the base_index (should be >= 1)
float k_factor;
explicit IndexRefineFlat (Index *base_index);
IndexRefineFlat ();
void train(idx_t n, const float* x) override;
void add(idx_t n, const float* x) override;
void reset() override;
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
~IndexRefineFlat() override;
};
/// optimized version for 1D "vectors"
struct IndexFlat1D:IndexFlatL2 {
bool continuous_update; ///< is the permutation updated continuously?
std::vector<idx_t> perm; ///< sorted database indices
explicit IndexFlat1D (bool continuous_update=true);
/// if not continuous_update, call this between the last add and
/// the first search
void update_permutation ();
void add(idx_t n, const float* x) override;
void reset() override;
/// Warn: the distances returned are L1 not L2
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
};
}
#endif
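A sketch for the 1D specialization declared above; note the update_permutation() requirement when continuous_update is disabled:

#include <vector>
#include <faiss/IndexFlat.h>

void flat_1d_sketch(size_t nb, const float* values) {
    faiss::IndexFlat1D index(/*continuous_update=*/false);
    index.add(nb, values);             // one float per "vector"
    index.update_permutation();        // required before the first search
    float q = 0.25f;
    int k = 3;
    std::vector<float> dist(k);        // NB: L1 distances, per the header
    std::vector<faiss::Index::idx_t> ids(k);
    index.search(1, &q, k, dist.data(), ids.data());
}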

File diff suppressed because it is too large
@ -0,0 +1,170 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <vector>
#include <faiss/impl/HNSW.h>
#include <faiss/IndexFlat.h>
#include <faiss/IndexPQ.h>
#include <faiss/IndexScalarQuantizer.h>
#include <faiss/utils/utils.h>
namespace faiss {
struct IndexHNSW;
struct ReconstructFromNeighbors {
typedef Index::idx_t idx_t;
typedef HNSW::storage_idx_t storage_idx_t;
const IndexHNSW & index;
size_t M; // number of neighbors
size_t k; // number of codebook entries
size_t nsq; // number of subvectors
size_t code_size;
int k_reorder; // nb to reorder. -1 = all
std::vector<float> codebook; // size nsq * k * (M + 1)
std::vector<uint8_t> codes; // size ntotal * code_size
size_t ntotal;
size_t d, dsub; // derived values
explicit ReconstructFromNeighbors(const IndexHNSW& index,
size_t k=256, size_t nsq=1);
/// codes must be added in the correct order and the IndexHNSW
/// must be populated and sorted
void add_codes(size_t n, const float *x);
size_t compute_distances(size_t n, const idx_t *shortlist,
const float *query, float *distances) const;
/// called by add_codes
void estimate_code(const float *x, storage_idx_t i, uint8_t *code) const;
/// called by compute_distances
void reconstruct(storage_idx_t i, float *x, float *tmp) const;
void reconstruct_n(storage_idx_t n0, storage_idx_t ni, float *x) const;
/// get the (M+1)-by-d table of neighbor coordinates for vector i
void get_neighbor_table(storage_idx_t i, float *out) const;
};
/** The HNSW index is a normal random-access index with a HNSW
* link structure built on top */
struct IndexHNSW : Index {
typedef HNSW::storage_idx_t storage_idx_t;
// the link structure
HNSW hnsw;
// the sequential storage
bool own_fields;
Index *storage;
ReconstructFromNeighbors *reconstruct_from_neighbors;
explicit IndexHNSW (int d = 0, int M = 32);
explicit IndexHNSW (Index *storage, int M = 32);
~IndexHNSW() override;
void add(idx_t n, const float *x) override;
/// Trains the storage if needed
void train(idx_t n, const float* x) override;
/// entry point for search
void search (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels) const override;
void reconstruct(idx_t key, float* recons) const override;
void reset () override;
void shrink_level_0_neighbors(int size);
/** Perform search only on level 0, given the starting points for
* each vertex.
*
* @param search_type 1:perform one search per nprobe, 2: enqueue
* all entry points
*/
void search_level_0(idx_t n, const float *x, idx_t k,
const storage_idx_t *nearest, const float *nearest_d,
float *distances, idx_t *labels, int nprobe = 1,
int search_type = 1) const;
/// alternative graph building
void init_level_0_from_knngraph(
int k, const float *D, const idx_t *I);
/// alternative graph building
void init_level_0_from_entry_points(
int npt, const storage_idx_t *points,
const storage_idx_t *nearests);
// reorder links from nearest to farthest
void reorder_links();
void link_singletons();
};
/** Flat index topped with an HNSW structure to access elements
* more efficiently.
*/
struct IndexHNSWFlat : IndexHNSW {
IndexHNSWFlat();
IndexHNSWFlat(int d, int M);
};
/** PQ index topped with an HNSW structure to access elements
* more efficiently.
*/
struct IndexHNSWPQ : IndexHNSW {
IndexHNSWPQ();
IndexHNSWPQ(int d, int pq_m, int M);
void train(idx_t n, const float* x) override;
};
/** SQ index topped with an HNSW structure to access elements
* more efficiently.
*/
struct IndexHNSWSQ : IndexHNSW {
IndexHNSWSQ();
IndexHNSWSQ(int d, ScalarQuantizer::QuantizerType qtype, int M);
};
/** 2-level code structure with fast random access
*/
struct IndexHNSW2Level : IndexHNSW {
IndexHNSW2Level();
IndexHNSW2Level(Index *quantizer, size_t nlist, int m_pq, int M);
void flip_to_ivf();
/// entry point for search
void search (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels) const override;
};
} // namespace faiss
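A minimal usage sketch for the HNSW wrappers declared above (standalone example; the dataset size, d = 64 and M = 32 are illustrative, and efConstruction/efSearch are the usual HNSW effort knobs):
#include <random>
#include <vector>
#include <faiss/IndexHNSW.h>

int main() {
    int d = 64;                          // vector dimension (illustrative)
    faiss::IndexHNSWFlat index(d, 32);   // M = 32 links per node
    index.hnsw.efConstruction = 200;     // build-time beam width
    std::mt19937 rng(123);
    std::uniform_real_distribution<float> u(0.f, 1.f);
    std::vector<float> xb(10000 * (size_t)d);
    for (auto& v : xb) v = u(rng);
    index.add(10000, xb.data());         // Flat storage needs no training
    index.hnsw.efSearch = 64;            // search-time beam width
    int k = 5;
    std::vector<float> dist(k);
    std::vector<faiss::Index::idx_t> ids(k);
    index.search(1, xb.data(), k, dist.data(), ids.data());
    return 0;
}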


@ -0,0 +1,966 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexIVF.h>
#include <omp.h>
#include <cstdio>
#include <memory>
#include <iostream>
#include <faiss/utils/utils.h>
#include <faiss/utils/hamming.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/IndexFlat.h>
#include <faiss/impl/AuxIndexStructures.h>
namespace faiss {
using ScopedIds = InvertedLists::ScopedIds;
using ScopedCodes = InvertedLists::ScopedCodes;
/*****************************************
* Level1Quantizer implementation
******************************************/
Level1Quantizer::Level1Quantizer (Index * quantizer, size_t nlist):
quantizer (quantizer),
nlist (nlist),
quantizer_trains_alone (0),
own_fields (false),
clustering_index (nullptr)
{
// here we set a low # iterations because this is typically used
// for large clusterings (nb this is not used for the MultiIndex,
// for which quantizer_trains_alone = true)
cp.niter = 10;
}
Level1Quantizer::Level1Quantizer ():
quantizer (nullptr),
nlist (0),
quantizer_trains_alone (0), own_fields (false),
clustering_index (nullptr)
{}
Level1Quantizer::~Level1Quantizer ()
{
if (own_fields) {
if(quantizer == quantizer_backup) {
if(quantizer != nullptr) {
delete quantizer;
}
} else {
if(quantizer != nullptr) {
delete quantizer;
}
if(quantizer_backup != nullptr) {
delete quantizer_backup;
}
}
quantizer = nullptr;
quantizer_backup = nullptr;
}
}
void Level1Quantizer::train_q1 (size_t n, const float *x, bool verbose, MetricType metric_type)
{
size_t d = quantizer->d;
if (quantizer->is_trained && (quantizer->ntotal == nlist)) {
if (verbose)
printf ("IVF quantizer does not need training.\n");
} else if (quantizer_trains_alone == 1) {
if (verbose)
printf ("IVF quantizer trains alone...\n");
quantizer->train (n, x);
quantizer->verbose = verbose;
FAISS_THROW_IF_NOT_MSG (quantizer->ntotal == nlist,
"nlist not consistent with quantizer size");
} else if (quantizer_trains_alone == 0) {
if (verbose)
printf ("Training level-1 quantizer on %ld vectors in %ldD\n",
n, d);
Clustering clus (d, nlist, cp);
quantizer->reset();
if (clustering_index) {
clus.train (n, x, *clustering_index);
quantizer->add (nlist, clus.centroids.data());
} else {
clus.train (n, x, *quantizer);
}
quantizer->is_trained = true;
} else if (quantizer_trains_alone == 2) {
if (verbose)
printf (
"Training L2 quantizer on %ld vectors in %ldD%s\n",
n, d,
clustering_index ? "(user provided index)" : "");
FAISS_THROW_IF_NOT (metric_type == METRIC_L2);
Clustering clus (d, nlist, cp);
if (!clustering_index) {
IndexFlatL2 assigner (d);
clus.train(n, x, assigner);
} else {
clus.train(n, x, *clustering_index);
}
if (verbose)
printf ("Adding centroids to quantizer\n");
quantizer->add (nlist, clus.centroids.data());
}
}
size_t Level1Quantizer::coarse_code_size () const
{
size_t nl = nlist - 1;
size_t nbyte = 0;
while (nl > 0) {
nbyte ++;
nl >>= 8;
}
return nbyte;
}
void Level1Quantizer::encode_listno (Index::idx_t list_no, uint8_t *code) const
{
// little endian
size_t nl = nlist - 1;
while (nl > 0) {
*code++ = list_no & 0xff;
list_no >>= 8;
nl >>= 8;
}
}
Index::idx_t Level1Quantizer::decode_listno (const uint8_t *code) const
{
size_t nl = nlist - 1;
int64_t list_no = 0;
int nbit = 0;
while (nl > 0) {
list_no |= int64_t(*code++) << nbit;
nbit += 8;
nl >>= 8;
}
FAISS_THROW_IF_NOT (list_no >= 0 && list_no < nlist);
return list_no;
}
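The codec above stores a list id in coarse_code_size() = ceil(log256(nlist)) bytes, little endian, so nlist = 1024 needs two bytes. A standalone round-trip sketch of the same logic:
#include <cassert>
#include <cstddef>
#include <cstdint>

int main() {
    size_t nlist = 1024;
    int64_t list_no = 700;
    uint8_t code[8];
    // encode, little endian (mirrors encode_listno)
    size_t nl = nlist - 1;
    int64_t v = list_no;
    uint8_t* p = code;
    while (nl > 0) { *p++ = v & 0xff; v >>= 8; nl >>= 8; }
    // decode (mirrors decode_listno)
    nl = nlist - 1;
    int64_t out = 0;
    int nbit = 0;
    p = code;
    while (nl > 0) { out |= int64_t(*p++) << nbit; nbit += 8; nl >>= 8; }
    assert(out == list_no);   // two bytes were written and read back
    return 0;
}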
/*****************************************
* IndexIVF implementation
******************************************/
IndexIVF::IndexIVF (Index * quantizer, size_t d,
size_t nlist, size_t code_size,
MetricType metric):
Index (d, metric),
Level1Quantizer (quantizer, nlist),
invlists (new ArrayInvertedLists (nlist, code_size)),
own_invlists (true),
code_size (code_size),
nprobe (1),
max_codes (0),
parallel_mode (0),
maintain_direct_map (false)
{
FAISS_THROW_IF_NOT (d == quantizer->d);
is_trained = quantizer->is_trained && (quantizer->ntotal == nlist);
// Spherical by default if the metric is inner_product
if (metric_type == METRIC_INNER_PRODUCT) {
cp.spherical = true;
}
}
IndexIVF::IndexIVF ():
invlists (nullptr), own_invlists (false),
code_size (0),
nprobe (1), max_codes (0), parallel_mode (0),
maintain_direct_map (false)
{}
void IndexIVF::add (idx_t n, const float * x)
{
add_with_ids (n, x, nullptr);
}
void IndexIVF::add_with_ids (idx_t n, const float * x, const idx_t *xids)
{
// do some blocking to avoid excessive allocs
idx_t bs = 65536;
if (n > bs) {
for (idx_t i0 = 0; i0 < n; i0 += bs) {
idx_t i1 = std::min (n, i0 + bs);
if (verbose) {
printf(" IndexIVF::add_with_ids %ld:%ld\n", i0, i1);
}
add_with_ids (i1 - i0, x + i0 * d,
xids ? xids + i0 : nullptr);
}
return;
}
FAISS_THROW_IF_NOT (is_trained);
std::unique_ptr<idx_t []> idx(new idx_t[n]);
quantizer->assign (n, x, idx.get());
size_t nadd = 0, nminus1 = 0;
for (size_t i = 0; i < n; i++) {
if (idx[i] < 0) nminus1++;
}
std::unique_ptr<uint8_t []> flat_codes(new uint8_t [n * code_size]);
encode_vectors (n, x, idx.get(), flat_codes.get());
#pragma omp parallel reduction(+: nadd)
{
int nt = omp_get_num_threads();
int rank = omp_get_thread_num();
// each thread takes care of a subset of lists
for (size_t i = 0; i < n; i++) {
idx_t list_no = idx [i];
if (list_no >= 0 && list_no % nt == rank) {
idx_t id = xids ? xids[i] : ntotal + i;
invlists->add_entry (list_no, id,
flat_codes.get() + i * code_size);
nadd++;
}
}
}
if (verbose) {
printf(" added %ld / %ld vectors (%ld -1s)\n", nadd, n, nminus1);
}
ntotal += n;
}
void IndexIVF::to_readonly() {
if (is_readonly()) return;
auto readonly_lists = this->invlists->to_readonly();
if (!readonly_lists) return;
this->replace_invlists(readonly_lists, true);
}
bool IndexIVF::is_readonly() const {
return this->invlists->is_readonly();
}
void IndexIVF::backup_quantizer() {
this->quantizer_backup = quantizer;
}
void IndexIVF::restore_quantizer() {
if(this->quantizer_backup != nullptr) {
quantizer = this->quantizer_backup;
}
}
void IndexIVF::make_direct_map (bool new_maintain_direct_map)
{
// nothing to do
if (new_maintain_direct_map == maintain_direct_map)
return;
if (new_maintain_direct_map) {
direct_map.resize (ntotal, -1);
for (size_t key = 0; key < nlist; key++) {
size_t list_size = invlists->list_size (key);
ScopedIds idlist (invlists, key);
for (long ofs = 0; ofs < list_size; ofs++) {
FAISS_THROW_IF_NOT_MSG (
0 <= idlist [ofs] && idlist[ofs] < ntotal,
"direct map supported only for seuquential ids");
direct_map [idlist [ofs]] = key << 32 | ofs;
}
}
} else {
direct_map.clear ();
}
maintain_direct_map = new_maintain_direct_map;
}
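Each direct_map entry above packs the location of a vector into one int64: list number in the upper 32 bits, offset within that list in the lower 32. A minimal pack/unpack sketch:
#include <cassert>
#include <cstdint>

int main() {
    int64_t list_no = 42, ofs = 7;
    int64_t packed = list_no << 32 | ofs;     // as stored in direct_map
    assert((packed >> 32) == list_no);        // recovered in reconstruct()
    assert((packed & 0xffffffff) == ofs);
    return 0;
}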
void IndexIVF::search (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels) const
{
std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);
double t0 = getmillisecs();
quantizer->search (n, x, nprobe, coarse_dis.get(), idx.get());
indexIVF_stats.quantization_time += getmillisecs() - t0;
t0 = getmillisecs();
invlists->prefetch_lists (idx.get(), n * nprobe);
search_preassigned (n, x, k, idx.get(), coarse_dis.get(),
distances, labels, false);
indexIVF_stats.search_time += getmillisecs() - t0;
}
void IndexIVF::search_preassigned (idx_t n, const float *x, idx_t k,
const idx_t *keys,
const float *coarse_dis ,
float *distances, idx_t *labels,
bool store_pairs,
const IVFSearchParameters *params) const
{
long nprobe = params ? params->nprobe : this->nprobe;
long max_codes = params ? params->max_codes : this->max_codes;
size_t nlistv = 0, ndis = 0, nheap = 0;
using HeapForIP = CMin<float, idx_t>;
using HeapForL2 = CMax<float, idx_t>;
bool interrupt = false;
// don't start parallel section if single query
bool do_parallel =
parallel_mode == 0 ? n > 1 :
parallel_mode == 1 ? nprobe > 1 :
nprobe * n > 1;
#pragma omp parallel if(do_parallel) reduction(+: nlistv, ndis, nheap)
{
InvertedListScanner *scanner = get_InvertedListScanner(store_pairs);
ScopeDeleter1<InvertedListScanner> del(scanner);
/*****************************************************
* Depending on parallel_mode, there are two possible ways
* to organize the search. Here we define local functions
* that are in common between the two
******************************************************/
// initialize + reorder a result heap
auto init_result = [&](float *simi, idx_t *idxi) {
if (metric_type == METRIC_INNER_PRODUCT) {
heap_heapify<HeapForIP> (k, simi, idxi);
} else {
heap_heapify<HeapForL2> (k, simi, idxi);
}
};
auto reorder_result = [&] (float *simi, idx_t *idxi) {
if (metric_type == METRIC_INNER_PRODUCT) {
heap_reorder<HeapForIP> (k, simi, idxi);
} else {
heap_reorder<HeapForL2> (k, simi, idxi);
}
};
// single list scan using the current scanner (with query
// set properly) and storing results in simi and idxi
auto scan_one_list = [&] (idx_t key, float coarse_dis_i,
float *simi, idx_t *idxi) {
if (key < 0) {
// not enough centroids for multiprobe
return (size_t)0;
}
FAISS_THROW_IF_NOT_FMT (key < (idx_t) nlist,
"Invalid key=%ld nlist=%ld\n",
key, nlist);
size_t list_size = invlists->list_size(key);
// don't waste time on empty lists
if (list_size == 0) {
return (size_t)0;
}
scanner->set_list (key, coarse_dis_i);
nlistv++;
InvertedLists::ScopedCodes scodes (invlists, key);
std::unique_ptr<InvertedLists::ScopedIds> sids;
const Index::idx_t * ids = nullptr;
if (!store_pairs) {
sids.reset (new InvertedLists::ScopedIds (invlists, key));
ids = sids->get();
}
nheap += scanner->scan_codes (list_size, scodes.get(),
ids, simi, idxi, k);
return list_size;
};
/****************************************************
* Actual loops, depending on parallel_mode
****************************************************/
if (parallel_mode == 0) {
#pragma omp for
for (size_t i = 0; i < n; i++) {
if (interrupt) {
continue;
}
// loop over queries
scanner->set_query (x + i * d);
float * simi = distances + i * k;
idx_t * idxi = labels + i * k;
init_result (simi, idxi);
long nscan = 0;
// loop over probes
for (size_t ik = 0; ik < nprobe; ik++) {
nscan += scan_one_list (
keys [i * nprobe + ik],
coarse_dis[i * nprobe + ik],
simi, idxi
);
if (max_codes && nscan >= max_codes) {
break;
}
}
ndis += nscan;
reorder_result (simi, idxi);
if (InterruptCallback::is_interrupted ()) {
interrupt = true;
}
} // parallel for
} else if (parallel_mode == 1) {
std::vector <idx_t> local_idx (k);
std::vector <float> local_dis (k);
for (size_t i = 0; i < n; i++) {
scanner->set_query (x + i * d);
init_result (local_dis.data(), local_idx.data());
#pragma omp for schedule(dynamic)
for (size_t ik = 0; ik < nprobe; ik++) {
ndis += scan_one_list
(keys [i * nprobe + ik],
coarse_dis[i * nprobe + ik],
local_dis.data(), local_idx.data());
// can't do the test on max_codes
}
// merge thread-local results
float * simi = distances + i * k;
idx_t * idxi = labels + i * k;
#pragma omp single
init_result (simi, idxi);
#pragma omp barrier
#pragma omp critical
{
if (metric_type == METRIC_INNER_PRODUCT) {
heap_addn<HeapForIP>
(k, simi, idxi,
local_dis.data(), local_idx.data(), k);
} else {
heap_addn<HeapForL2>
(k, simi, idxi,
local_dis.data(), local_idx.data(), k);
}
}
#pragma omp barrier
#pragma omp single
reorder_result (simi, idxi);
}
} else {
FAISS_THROW_FMT ("parallel_mode %d not supported\n",
parallel_mode);
}
} // parallel section
if (interrupt) {
FAISS_THROW_MSG ("computation interrupted");
}
indexIVF_stats.nq += n;
indexIVF_stats.nlist += nlistv;
indexIVF_stats.ndis += ndis;
indexIVF_stats.nheap_updates += nheap;
}
void IndexIVF::range_search (idx_t nx, const float *x, float radius,
RangeSearchResult *result) const
{
std::unique_ptr<idx_t[]> keys (new idx_t[nx * nprobe]);
std::unique_ptr<float []> coarse_dis (new float[nx * nprobe]);
double t0 = getmillisecs();
quantizer->search (nx, x, nprobe, coarse_dis.get (), keys.get ());
indexIVF_stats.quantization_time += getmillisecs() - t0;
t0 = getmillisecs();
invlists->prefetch_lists (keys.get(), nx * nprobe);
range_search_preassigned (nx, x, radius, keys.get (), coarse_dis.get (),
result);
indexIVF_stats.search_time += getmillisecs() - t0;
}
void IndexIVF::range_search_preassigned (
idx_t nx, const float *x, float radius,
const idx_t *keys, const float *coarse_dis,
RangeSearchResult *result) const
{
size_t nlistv = 0, ndis = 0;
bool store_pairs = false;
std::vector<RangeSearchPartialResult *> all_pres (omp_get_max_threads());
#pragma omp parallel reduction(+: nlistv, ndis)
{
RangeSearchPartialResult pres(result);
std::unique_ptr<InvertedListScanner> scanner
(get_InvertedListScanner(store_pairs));
FAISS_THROW_IF_NOT (scanner.get ());
all_pres[omp_get_thread_num()] = &pres;
// prepare the list scanning function
auto scan_list_func = [&](size_t i, size_t ik, RangeQueryResult &qres) {
idx_t key = keys[i * nprobe + ik]; /* select the list */
if (key < 0) return;
FAISS_THROW_IF_NOT_FMT (
key < (idx_t) nlist,
"Invalid key=%ld at ik=%ld nlist=%ld\n",
key, ik, nlist);
const size_t list_size = invlists->list_size(key);
if (list_size == 0) return;
InvertedLists::ScopedCodes scodes (invlists, key);
InvertedLists::ScopedIds ids (invlists, key);
scanner->set_list (key, coarse_dis[i * nprobe + ik]);
nlistv++;
ndis += list_size;
scanner->scan_codes_range (list_size, scodes.get(),
ids.get(), radius, qres);
};
if (parallel_mode == 0) {
#pragma omp for
for (size_t i = 0; i < nx; i++) {
scanner->set_query (x + i * d);
RangeQueryResult & qres = pres.new_result (i);
for (size_t ik = 0; ik < nprobe; ik++) {
scan_list_func (i, ik, qres);
}
}
} else if (parallel_mode == 1) {
for (size_t i = 0; i < nx; i++) {
scanner->set_query (x + i * d);
RangeQueryResult & qres = pres.new_result (i);
#pragma omp for schedule(dynamic)
for (size_t ik = 0; ik < nprobe; ik++) {
scan_list_func (i, ik, qres);
}
}
} else if (parallel_mode == 2) {
std::vector<RangeQueryResult *> all_qres (nx);
RangeQueryResult *qres = nullptr;
#pragma omp for schedule(dynamic)
for (size_t iik = 0; iik < nx * nprobe; iik++) {
size_t i = iik / nprobe;
size_t ik = iik % nprobe;
if (qres == nullptr || qres->qno != i) {
FAISS_ASSERT (!qres || i > qres->qno);
qres = &pres.new_result (i);
scanner->set_query (x + i * d);
}
scan_list_func (i, ik, *qres);
}
} else {
FAISS_THROW_FMT ("parallel_mode %d not supported\n", parallel_mode);
}
if (parallel_mode == 0) {
pres.finalize ();
} else {
#pragma omp barrier
#pragma omp single
RangeSearchPartialResult::merge (all_pres, false);
#pragma omp barrier
}
}
indexIVF_stats.nq += nx;
indexIVF_stats.nlist += nlistv;
indexIVF_stats.ndis += ndis;
}
InvertedListScanner *IndexIVF::get_InvertedListScanner (
bool /*store_pairs*/) const
{
return nullptr;
}
void IndexIVF::reconstruct (idx_t key, float* recons) const
{
FAISS_THROW_IF_NOT_MSG (direct_map.size() == ntotal,
"direct map is not initialized");
FAISS_THROW_IF_NOT_MSG (key >= 0 && key < direct_map.size(),
"invalid key");
idx_t list_no = direct_map[key] >> 32;
idx_t offset = direct_map[key] & 0xffffffff;
reconstruct_from_offset (list_no, offset, recons);
}
void IndexIVF::reconstruct_n (idx_t i0, idx_t ni, float* recons) const
{
FAISS_THROW_IF_NOT (ni == 0 || (i0 >= 0 && i0 + ni <= ntotal));
for (idx_t list_no = 0; list_no < nlist; list_no++) {
size_t list_size = invlists->list_size (list_no);
ScopedIds idlist (invlists, list_no);
for (idx_t offset = 0; offset < list_size; offset++) {
idx_t id = idlist[offset];
if (!(id >= i0 && id < i0 + ni)) {
continue;
}
float* reconstructed = recons + (id - i0) * d;
reconstruct_from_offset (list_no, offset, reconstructed);
}
}
}
/* standalone codec interface */
size_t IndexIVF::sa_code_size () const
{
size_t coarse_size = coarse_code_size();
return code_size + coarse_size;
}
void IndexIVF::sa_encode (idx_t n, const float *x,
uint8_t *bytes) const
{
FAISS_THROW_IF_NOT (is_trained);
std::unique_ptr<int64_t []> idx (new int64_t [n]);
quantizer->assign (n, x, idx.get());
encode_vectors (n, x, idx.get(), bytes, true);
}
void IndexIVF::search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *recons) const
{
idx_t * idx = new idx_t [n * nprobe];
ScopeDeleter<idx_t> del (idx);
float * coarse_dis = new float [n * nprobe];
ScopeDeleter<float> del2 (coarse_dis);
quantizer->search (n, x, nprobe, coarse_dis, idx);
invlists->prefetch_lists (idx, n * nprobe);
// search_preassigned() with `store_pairs` enabled to obtain the list_no
// and offset into `codes` for reconstruction
search_preassigned (n, x, k, idx, coarse_dis,
distances, labels, true /* store_pairs */);
for (idx_t i = 0; i < n; ++i) {
for (idx_t j = 0; j < k; ++j) {
idx_t ij = i * k + j;
idx_t key = labels[ij];
float* reconstructed = recons + ij * d;
if (key < 0) {
// fill with NaNs: memset with -1 sets every byte to 0xff,
// which is a quiet-NaN bit pattern for IEEE-754 floats
memset(reconstructed, -1, sizeof(*reconstructed) * d);
} else {
int list_no = key >> 32;
int offset = key & 0xffffffff;
// Update label to the actual id
labels[ij] = invlists->get_single_id (list_no, offset);
reconstruct_from_offset (list_no, offset, reconstructed);
}
}
}
}
void IndexIVF::reconstruct_from_offset(
int64_t /*list_no*/,
int64_t /*offset*/,
float* /*recons*/) const {
FAISS_THROW_MSG ("reconstruct_from_offset not implemented");
}
void IndexIVF::reset ()
{
direct_map.clear ();
invlists->reset ();
ntotal = 0;
}
size_t IndexIVF::remove_ids (const IDSelector & sel)
{
FAISS_THROW_IF_NOT_MSG (!maintain_direct_map,
"direct map remove not implemented");
std::vector<idx_t> toremove(nlist);
#pragma omp parallel for
for (idx_t i = 0; i < nlist; i++) {
idx_t l0 = invlists->list_size (i), l = l0, j = 0;
ScopedIds idsi (invlists, i);
while (j < l) {
if (sel.is_member (idsi[j])) {
l--;
invlists->update_entry (
i, j,
invlists->get_single_id (i, l),
ScopedCodes (invlists, i, l).get());
} else {
j++;
}
}
toremove[i] = l0 - l;
}
// this will not run well in parallel on ondisk because of possible shrinks
size_t nremove = 0;
for (idx_t i = 0; i < nlist; i++) {
if (toremove[i] > 0) {
nremove += toremove[i];
invlists->resize(
i, invlists->list_size(i) - toremove[i]);
}
}
ntotal -= nremove;
return nremove;
}
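A usage sketch for the two-phase removal above, using the IDSelectorRange selector from AuxIndexStructures.h (index parameters are illustrative):
#include <cassert>
#include <vector>
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFFlat.h>
#include <faiss/impl/AuxIndexStructures.h>

int main() {
    int d = 16;
    faiss::IndexFlatL2 quantizer(d);
    faiss::IndexIVFFlat index(&quantizer, d, 8);
    std::vector<float> xb(512 * (size_t)d);
    for (size_t i = 0; i < xb.size(); i++) xb[i] = float(i % 31);
    index.train(512, xb.data());
    index.add(512, xb.data());                // sequential ids 0..511
    faiss::IDSelectorRange sel(100, 200);     // selects ids in [100, 200)
    size_t nremoved = index.remove_ids(sel);  // compact lists, then shrink
    assert(nremoved == 100 && index.ntotal == 412);
    return 0;
}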
void IndexIVF::train (idx_t n, const float *x)
{
if (verbose)
printf ("Training level-1 quantizer\n");
train_q1 (n, x, verbose, metric_type);
if (verbose)
printf ("Training IVF residual\n");
train_residual (n, x);
is_trained = true;
}
void IndexIVF::train_residual(idx_t /*n*/, const float* /*x*/) {
if (verbose)
printf("IndexIVF: no residual training\n");
// does nothing by default
}
void IndexIVF::check_compatible_for_merge (const IndexIVF &other) const
{
// minimal sanity checks
FAISS_THROW_IF_NOT (other.d == d);
FAISS_THROW_IF_NOT (other.nlist == nlist);
FAISS_THROW_IF_NOT (other.code_size == code_size);
FAISS_THROW_IF_NOT_MSG (typeid (*this) == typeid (other),
"can only merge indexes of the same type");
}
void IndexIVF::merge_from (IndexIVF &other, idx_t add_id)
{
check_compatible_for_merge (other);
FAISS_THROW_IF_NOT_MSG ((!maintain_direct_map &&
!other.maintain_direct_map),
"direct map copy not implemented");
invlists->merge_from (other.invlists, add_id);
ntotal += other.ntotal;
other.ntotal = 0;
}
void IndexIVF::replace_invlists (InvertedLists *il, bool own)
{
if (own_invlists) {
delete invlists;
}
// FAISS_THROW_IF_NOT (ntotal == 0);
if (il) {
FAISS_THROW_IF_NOT (il->nlist == nlist &&
il->code_size == code_size);
}
invlists = il;
own_invlists = own;
}
void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,
idx_t a1, idx_t a2) const
{
FAISS_THROW_IF_NOT (nlist == other.nlist);
FAISS_THROW_IF_NOT (code_size == other.code_size);
FAISS_THROW_IF_NOT (!other.maintain_direct_map);
FAISS_THROW_IF_NOT_FMT (
subset_type == 0 || subset_type == 1 || subset_type == 2,
"subset type %d not implemented", subset_type);
size_t accu_n = 0;
size_t accu_a1 = 0;
size_t accu_a2 = 0;
InvertedLists *oivf = other.invlists;
for (idx_t list_no = 0; list_no < nlist; list_no++) {
size_t n = invlists->list_size (list_no);
ScopedIds ids_in (invlists, list_no);
if (subset_type == 0) {
for (idx_t i = 0; i < n; i++) {
idx_t id = ids_in[i];
if (a1 <= id && id < a2) {
oivf->add_entry (list_no,
invlists->get_single_id (list_no, i),
ScopedCodes (invlists, list_no, i).get());
other.ntotal++;
}
}
} else if (subset_type == 1) {
for (idx_t i = 0; i < n; i++) {
idx_t id = ids_in[i];
if (id % a1 == a2) {
oivf->add_entry (list_no,
invlists->get_single_id (list_no, i),
ScopedCodes (invlists, list_no, i).get());
other.ntotal++;
}
}
} else if (subset_type == 2) {
// see what is allocated to a1 and to a2
size_t next_accu_n = accu_n + n;
size_t next_accu_a1 = next_accu_n * a1 / ntotal;
size_t i1 = next_accu_a1 - accu_a1;
size_t next_accu_a2 = next_accu_n * a2 / ntotal;
size_t i2 = next_accu_a2 - accu_a2;
for (idx_t i = i1; i < i2; i++) {
oivf->add_entry (list_no,
invlists->get_single_id (list_no, i),
ScopedCodes (invlists, list_no, i).get());
}
other.ntotal += i2 - i1;
accu_a1 = next_accu_a1;
accu_a2 = next_accu_a2;
}
accu_n += n;
}
FAISS_ASSERT(accu_n == ntotal);
}
void
IndexIVF::dump() {
for (size_t i = 0; i < invlists->nlist; ++i) {
auto numVecs = invlists->list_size(i);
auto ids = invlists->get_ids(i);
auto codes = invlists->get_codes(i);
int code_size = invlists->code_size;
std::cout << "Bucket ID: " << i << ", with code size: " << code_size << ", vectors number: " << numVecs << std::endl;
if(code_size == 8) {
// int8 types
for (auto j=0; j < numVecs; ++j) {
std::cout << *(ids+j) << ": " << std::endl;
for(int k = 0; k < this->d; ++ k) {
printf("%u ", (uint8_t)(codes[j * d + k]));
}
std::cout << std::endl;
}
}
std::cout << "Bucket End." << std::endl;
}
}
IndexIVF::~IndexIVF()
{
if (own_invlists) {
delete invlists;
}
}
void IndexIVFStats::reset()
{
memset ((void*)this, 0, sizeof (*this));
}
IndexIVFStats indexIVF_stats;
void InvertedListScanner::scan_codes_range (size_t ,
const uint8_t *,
const idx_t *,
float ,
RangeQueryResult &) const
{
FAISS_THROW_MSG ("scan_codes_range not implemented");
}
} // namespace faiss


@ -0,0 +1,363 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_IVF_H
#define FAISS_INDEX_IVF_H
#include <vector>
#include <stdint.h>
#include <faiss/Index.h>
#include <faiss/InvertedLists.h>
#include <faiss/Clustering.h>
#include <faiss/utils/Heap.h>
namespace faiss {
/** Encapsulates a quantizer object for the IndexIVF
*
* The class isolates the fields that are independent of the storage
* of the lists (especially training)
*/
struct Level1Quantizer {
Index * quantizer = nullptr; ///< quantizer that maps vectors to inverted lists
Index * quantizer_backup = nullptr; ///< quantizer for backup
size_t nlist; ///< number of possible key values
/**
* = 0: use the quantizer as index in a kmeans training
* = 1: just pass on the training set to the train() of the quantizer
* = 2: kmeans training on a flat index + add the centroids to the quantizer
*/
char quantizer_trains_alone;
bool own_fields; ///< whether object owns the quantizer
ClusteringParameters cp; ///< to override default clustering params
Index *clustering_index; ///< to override index used during clustering
/// Trains the quantizer and calls train_residual to train sub-quantizers
void train_q1 (size_t n, const float *x, bool verbose,
MetricType metric_type);
/// compute the number of bytes required to store list ids
size_t coarse_code_size () const;
void encode_listno (Index::idx_t list_no, uint8_t *code) const;
Index::idx_t decode_listno (const uint8_t *code) const;
Level1Quantizer (Index * quantizer, size_t nlist);
Level1Quantizer ();
~Level1Quantizer ();
};
struct IVFSearchParameters {
size_t nprobe; ///< number of probes at query time
size_t max_codes; ///< max nb of codes to visit to do a query
virtual ~IVFSearchParameters () {}
};
struct InvertedListScanner;
/** Index based on an inverted file (IVF)
*
* In the inverted file, the quantizer (an Index instance) provides a
* quantization index for each vector to be added. The quantization
* index maps to a list (aka inverted list or posting list), where the
* id of the vector is stored.
*
* The inverted list object is required only after training. If none is
* set externally, an ArrayInvertedLists is used automatically.
*
* At search time, the vector to be searched is also quantized, and
* only the list corresponding to the quantization index is
* searched. This speeds up the search by making it
* non-exhaustive. This can be relaxed using multi-probe search: a few
* (nprobe) quantization indices are selected and several inverted
* lists are visited.
*
* Sub-classes implement a post-filtering of the index that refines
* the distance estimation from the query to database vectors.
*/
struct IndexIVF: Index, Level1Quantizer {
/// Access to the actual data
InvertedLists *invlists;
bool own_invlists;
size_t code_size; ///< code size per vector in bytes
size_t nprobe; ///< number of probes at query time
size_t max_codes; ///< max nb of codes to visit to do a query
/** Parallel mode determines how queries are parallelized with OpenMP
*
* 0 (default): parallelize over queries
* 1: parallelize over inverted lists
* 2: parallelize over both
*/
int parallel_mode;
/// map for direct access to the elements. Enables reconstruct().
bool maintain_direct_map;
std::vector <idx_t> direct_map;
/** The Inverted file takes a quantizer (an Index) on input,
* which implements the function mapping a vector to a list
* identifier. The pointer is borrowed: the quantizer should not
* be deleted while the IndexIVF is in use.
*/
IndexIVF (Index * quantizer, size_t d,
size_t nlist, size_t code_size,
MetricType metric = METRIC_L2);
void reset() override;
/// Trains the quantizer and calls train_residual to train sub-quantizers
void train(idx_t n, const float* x) override;
/// Calls add_with_ids with NULL ids
void add(idx_t n, const float* x) override;
/// default implementation that calls encode_vectors
void add_with_ids(idx_t n, const float* x, const idx_t* xids) override;
/** Encodes a set of vectors as they would appear in the inverted lists
*
* @param list_nos inverted list ids as returned by the
* quantizer (size n). -1s are ignored.
* @param codes output codes, size n * code_size
* @param include_listno
* include the list ids in the code (in this case add
* ceil(log8(nlist)) to the code size)
*/
virtual void encode_vectors(idx_t n, const float* x,
const idx_t *list_nos,
uint8_t * codes,
bool include_listno = false) const = 0;
/// Sub-classes that encode the residuals can train their encoders here
/// does nothing by default
virtual void train_residual (idx_t n, const float *x);
/** search a set of vectors, that are pre-quantized by the IVF
* quantizer. Fill in the corresponding heaps with the query
* results. The default implementation uses InvertedListScanners
* to do the search.
*
* @param n nb of vectors to query
* @param x query vectors, size nx * d
* @param assign coarse quantization indices, size nx * nprobe
* @param centroid_dis
* distances to coarse centroids, size nx * nprobe
* @param distances
* output distances, size n * k
* @param labels output labels, size n * k
* @param store_pairs store inv list index + inv list offset
* in the upper/lower 32 bits of the result
* instead of ids (used for reranking).
* @param params used to override the object's search parameters
*/
virtual void search_preassigned (idx_t n, const float *x, idx_t k,
const idx_t *assign,
const float *centroid_dis,
float *distances, idx_t *labels,
bool store_pairs,
const IVFSearchParameters *params=nullptr
) const;
/** assign the vectors, then call search_preassigned */
void search (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels) const override;
void range_search (idx_t n, const float* x, float radius,
RangeSearchResult* result) const override;
void range_search_preassigned(idx_t nx, const float *x, float radius,
const idx_t *keys, const float *coarse_dis,
RangeSearchResult *result) const;
/// get a scanner for this index (store_pairs means ignore labels)
virtual InvertedListScanner *get_InvertedListScanner (
bool store_pairs=false) const;
void reconstruct (idx_t key, float* recons) const override;
/** Reconstruct a subset of the indexed vectors.
*
* Overrides default implementation to bypass reconstruct() which requires
* direct_map to be maintained.
*
* @param i0 first vector to reconstruct
* @param ni nb of vectors to reconstruct
* @param recons output array of reconstructed vectors, size ni * d
*/
void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
/** Similar to search, but also reconstructs the stored vectors (or an
* approximation in the case of lossy coding) for the search results.
*
* Overrides default implementation to avoid having to maintain direct_map
* and instead fetch the code offsets through the `store_pairs` flag in
* search_preassigned().
*
* @param recons reconstructed vectors size (n, k, d)
*/
void search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *recons) const override;
/** Reconstruct a vector given the location in terms of (inv list index +
* inv list offset) instead of the id.
*
* Useful for reconstructing when the direct_map is not maintained and
* the inv list offset is computed by search_preassigned() with
* `store_pairs` set.
*/
virtual void reconstruct_from_offset (int64_t list_no, int64_t offset,
float* recons) const;
/// Dataset manipulation functions
size_t remove_ids(const IDSelector& sel) override;
/** check that the two indexes are compatible (ie, they are
* trained in the same way and have the same
* parameters). Otherwise throw. */
void check_compatible_for_merge (const IndexIVF &other) const;
/** moves the entries from another dataset to self. On output,
* other is empty. add_id is added to all moved ids (for
* sequential ids, this would be this->ntotal */
virtual void merge_from (IndexIVF &other, idx_t add_id);
/** copy a subset of the entries index to the other index
*
* if subset_type == 0: copies ids in [a1, a2)
* if subset_type == 1: copies ids if id % a1 == a2
* if subset_type == 2: copies inverted lists such that a1
* elements are left before and a2 elements are after
*/
virtual void copy_subset_to (IndexIVF & other, int subset_type,
idx_t a1, idx_t a2) const;
virtual void to_readonly();
virtual bool is_readonly() const;
virtual void backup_quantizer();
virtual void restore_quantizer();
~IndexIVF() override;
size_t get_list_size (size_t list_no) const
{ return invlists->list_size(list_no); }
/** initialize a direct map
*
* @param new_maintain_direct_map if true, create a direct map,
* else clear it
*/
void make_direct_map (bool new_maintain_direct_map=true);
/// replace the inverted lists, old one is deallocated if own_invlists
void replace_invlists (InvertedLists *il, bool own=false);
/* The standalone codec interface (except sa_decode that is specific) */
size_t sa_code_size () const override;
void sa_encode (idx_t n, const float *x,
uint8_t *bytes) const override;
void dump();
IndexIVF ();
};
struct RangeQueryResult;
/** Object that handles a query. The inverted lists to scan are
* provided externally. The object has a lot of state, but
* distance_to_code and scan_codes can be called in multiple
* threads */
struct InvertedListScanner {
using idx_t = Index::idx_t;
/// from now on we handle this query.
virtual void set_query (const float *query_vector) = 0;
/// following codes come from this inverted list
virtual void set_list (idx_t list_no, float coarse_dis) = 0;
/// compute a single query-to-code distance
virtual float distance_to_code (const uint8_t *code) const = 0;
/** scan a set of codes, compute distances to current query and
* update heap of results if necessary.
*
* @param n number of codes to scan
* @param codes codes to scan (n * code_size)
* @param ids corresponding ids (ignored if store_pairs)
* @param distances heap distances (size k)
* @param labels heap labels (size k)
* @param k heap size
* @return number of heap updates performed
*/
virtual size_t scan_codes (size_t n,
const uint8_t *codes,
const idx_t *ids,
float *distances, idx_t *labels,
size_t k) const = 0;
/** scan a set of codes, compute distances to current query and
* update results if distances are below radius
*
* (default implementation fails) */
virtual void scan_codes_range (size_t n,
const uint8_t *codes,
const idx_t *ids,
float radius,
RangeQueryResult &result) const;
virtual ~InvertedListScanner () {}
};
struct IndexIVFStats {
size_t nq; // nb of queries run
size_t nlist; // nb of inverted lists scanned
size_t ndis; // nb of distances computed
size_t nheap_updates; // nb of times the heap was updated
double quantization_time; // time spent quantizing vectors (in ms)
double search_time; // time spent searching lists (in ms)
IndexIVFStats () {reset (); }
void reset ();
};
// global var that collects them all
extern IndexIVFStats indexIVF_stats;
} // namespace faiss
#endif
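A sketch of the two-stage search flow described by this header: coarse quantization, then search_preassigned with per-call parameters. It uses IndexIVFFlat (declared in IndexIVFFlat.h further below); nlist/nprobe and the data are illustrative:
#include <vector>
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFFlat.h>

int main() {
    int d = 32;
    size_t nlist = 16, nprobe = 4;
    faiss::IndexFlatL2 quantizer(d);
    faiss::IndexIVFFlat index(&quantizer, d, nlist);
    std::vector<float> xb(1000 * (size_t)d);
    for (size_t i = 0; i < xb.size(); i++) xb[i] = float(i % 97) / 97.f;
    index.train(1000, xb.data());
    index.add(1000, xb.data());
    // stage 1: coarse quantization (what IndexIVF::search does internally)
    std::vector<faiss::Index::idx_t> keys(nprobe);
    std::vector<float> coarse_dis(nprobe);
    quantizer.search(1, xb.data(), nprobe, coarse_dis.data(), keys.data());
    // stage 2: scan only the preassigned lists, overriding parameters per call
    faiss::IVFSearchParameters params;
    params.nprobe = nprobe;
    params.max_codes = 0;                    // no cap on scanned codes
    int k = 5;
    std::vector<float> dis(k);
    std::vector<faiss::Index::idx_t> labels(k);
    index.search_preassigned(1, xb.data(), k, keys.data(), coarse_dis.data(),
                             dis.data(), labels.data(),
                             false /* store_pairs */, &params);
    return 0;
}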


@ -0,0 +1,502 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexIVFFlat.h>
#include <cstdio>
#include <faiss/IndexFlat.h>
#include <faiss/utils/distances.h>
#include <faiss/utils/utils.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/impl/AuxIndexStructures.h>
namespace faiss {
/*****************************************
* IndexIVFFlat implementation
******************************************/
IndexIVFFlat::IndexIVFFlat (Index * quantizer,
size_t d, size_t nlist, MetricType metric):
IndexIVF (quantizer, d, nlist, sizeof(float) * d, metric)
{
code_size = sizeof(float) * d;
}
void IndexIVFFlat::add_with_ids (idx_t n, const float * x, const idx_t *xids)
{
add_core (n, x, xids, nullptr);
}
void IndexIVFFlat::add_core (idx_t n, const float * x, const int64_t *xids,
const int64_t *precomputed_idx)
{
FAISS_THROW_IF_NOT (is_trained);
assert (invlists);
FAISS_THROW_IF_NOT_MSG (!(maintain_direct_map && xids),
"cannot have direct map and add with ids");
const int64_t * idx;
ScopeDeleter<int64_t> del;
if (precomputed_idx) {
idx = precomputed_idx;
} else {
int64_t * idx0 = new int64_t [n];
del.set (idx0);
quantizer->assign (n, x, idx0);
idx = idx0;
}
int64_t n_add = 0;
for (size_t i = 0; i < n; i++) {
int64_t id = xids ? xids[i] : ntotal + i;
int64_t list_no = idx [i];
if (list_no < 0)
continue;
const float *xi = x + i * d;
size_t offset = invlists->add_entry (
list_no, id, (const uint8_t*) xi);
if (maintain_direct_map)
direct_map.push_back (list_no << 32 | offset);
n_add++;
}
if (verbose) {
printf("IndexIVFFlat::add_core: added %ld / %ld vectors\n",
n_add, n);
}
ntotal += n;
}
void IndexIVFFlat::encode_vectors(idx_t n, const float* x,
const idx_t * list_nos,
uint8_t * codes,
bool include_listnos) const
{
if (!include_listnos) {
memcpy (codes, x, code_size * n);
} else {
size_t coarse_size = coarse_code_size ();
for (size_t i = 0; i < n; i++) {
int64_t list_no = list_nos [i];
uint8_t *code = codes + i * (code_size + coarse_size);
const float *xi = x + i * d;
if (list_no >= 0) {
encode_listno (list_no, code);
memcpy (code + coarse_size, xi, code_size);
} else {
memset (code, 0, code_size + coarse_size);
}
}
}
}
void IndexIVFFlat::sa_decode (idx_t n, const uint8_t *bytes,
float *x) const
{
size_t coarse_size = coarse_code_size ();
for (size_t i = 0; i < n; i++) {
const uint8_t *code = bytes + i * (code_size + coarse_size);
float *xi = x + i * d;
memcpy (xi, code + coarse_size, code_size);
}
}
namespace {
template<MetricType metric, class C>
struct IVFFlatScanner: InvertedListScanner {
size_t d;
bool store_pairs;
IVFFlatScanner(size_t d, bool store_pairs):
d(d), store_pairs(store_pairs) {}
const float *xi;
void set_query (const float *query) override {
this->xi = query;
}
idx_t list_no;
void set_list (idx_t list_no, float /* coarse_dis */) override {
this->list_no = list_no;
}
float distance_to_code (const uint8_t *code) const override {
const float *yj = (float*)code;
float dis = metric == METRIC_INNER_PRODUCT ?
fvec_inner_product (xi, yj, d) : fvec_L2sqr (xi, yj, d);
return dis;
}
size_t scan_codes (size_t list_size,
const uint8_t *codes,
const idx_t *ids,
float *simi, idx_t *idxi,
size_t k) const override
{
const float *list_vecs = (const float*)codes;
size_t nup = 0;
for (size_t j = 0; j < list_size; j++) {
const float * yj = list_vecs + d * j;
float dis = metric == METRIC_INNER_PRODUCT ?
fvec_inner_product (xi, yj, d) : fvec_L2sqr (xi, yj, d);
if (C::cmp (simi[0], dis)) {
heap_pop<C> (k, simi, idxi);
int64_t id = store_pairs ? (list_no << 32 | j) : ids[j];
heap_push<C> (k, simi, idxi, dis, id);
nup++;
}
}
return nup;
}
void scan_codes_range (size_t list_size,
const uint8_t *codes,
const idx_t *ids,
float radius,
RangeQueryResult & res) const override
{
const float *list_vecs = (const float*)codes;
for (size_t j = 0; j < list_size; j++) {
const float * yj = list_vecs + d * j;
float dis = metric == METRIC_INNER_PRODUCT ?
fvec_inner_product (xi, yj, d) : fvec_L2sqr (xi, yj, d);
if (C::cmp (radius, dis)) {
int64_t id = store_pairs ? (list_no << 32 | j) : ids[j];
res.add (dis, id);
}
}
}
};
} // anonymous namespace
InvertedListScanner* IndexIVFFlat::get_InvertedListScanner
(bool store_pairs) const
{
if (metric_type == METRIC_INNER_PRODUCT) {
return new IVFFlatScanner<
METRIC_INNER_PRODUCT, CMin<float, int64_t> > (d, store_pairs);
} else if (metric_type == METRIC_L2) {
return new IVFFlatScanner<
METRIC_L2, CMax<float, int64_t> >(d, store_pairs);
} else {
FAISS_THROW_MSG("metric type not supported");
}
return nullptr;
}
void IndexIVFFlat::update_vectors (int n, idx_t *new_ids, const float *x)
{
FAISS_THROW_IF_NOT (maintain_direct_map);
FAISS_THROW_IF_NOT (is_trained);
std::vector<idx_t> assign (n);
quantizer->assign (n, x, assign.data());
for (size_t i = 0; i < n; i++) {
idx_t id = new_ids[i];
FAISS_THROW_IF_NOT_MSG (0 <= id && id < ntotal,
"id to update out of range");
{ // remove old one
int64_t dm = direct_map[id];
int64_t ofs = dm & 0xffffffff;
int64_t il = dm >> 32;
size_t l = invlists->list_size (il);
if (ofs != l - 1) { // move l - 1 to ofs
int64_t id2 = invlists->get_single_id (il, l - 1);
direct_map[id2] = (il << 32) | ofs;
invlists->update_entry (il, ofs, id2,
invlists->get_single_code (il, l - 1));
}
invlists->resize (il, l - 1);
}
{ // insert new one
int64_t il = assign[i];
size_t l = invlists->list_size (il);
int64_t dm = (il << 32) | l;
direct_map[id] = dm;
invlists->add_entry (il, id, (const uint8_t*)(x + i * d));
}
}
}
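update_vectors above relies on the direct map to locate, evict, and re-insert an entry; a minimal sketch of an in-place update (sizes illustrative):
#include <vector>
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFFlat.h>

int main() {
    int d = 16;
    faiss::IndexFlatL2 quantizer(d);
    faiss::IndexIVFFlat index(&quantizer, d, 4);
    std::vector<float> xb(256 * (size_t)d);
    for (size_t i = 0; i < xb.size(); i++) xb[i] = float(i % 13);
    index.train(256, xb.data());
    index.add(256, xb.data());
    index.make_direct_map();              // required by update_vectors
    std::vector<float> xnew(d, 0.f);      // replacement vector
    faiss::Index::idx_t id = 3;           // id of the entry to overwrite
    index.update_vectors(1, &id, xnew.data());
    return 0;
}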
void IndexIVFFlat::reconstruct_from_offset (int64_t list_no, int64_t offset,
float* recons) const
{
memcpy (recons, invlists->get_single_code (list_no, offset), code_size);
}
/*****************************************
* IndexIVFFlatDedup implementation
******************************************/
IndexIVFFlatDedup::IndexIVFFlatDedup (
Index * quantizer, size_t d, size_t nlist_,
MetricType metric_type):
IndexIVFFlat (quantizer, d, nlist_, metric_type)
{}
void IndexIVFFlatDedup::train(idx_t n, const float* x)
{
std::unordered_map<uint64_t, idx_t> map;
float * x2 = new float [n * d];
ScopeDeleter<float> del (x2);
int64_t n2 = 0;
for (int64_t i = 0; i < n; i++) {
uint64_t hash = hash_bytes((uint8_t *)(x + i * d), code_size);
if (map.count(hash) &&
!memcmp (x2 + map[hash] * d, x + i * d, code_size)) {
// is duplicate, skip
} else {
map [hash] = n2;
memcpy (x2 + n2 * d, x + i * d, code_size);
n2 ++;
}
}
if (verbose) {
printf ("IndexIVFFlatDedup::train: train on %ld points after dedup "
"(was %ld points)\n", n2, n);
}
IndexIVFFlat::train (n2, x2);
}
void IndexIVFFlatDedup::add_with_ids(
idx_t na, const float* x, const idx_t* xids)
{
FAISS_THROW_IF_NOT (is_trained);
assert (invlists);
FAISS_THROW_IF_NOT_MSG (
!maintain_direct_map,
"IVFFlatDedup not implemented with direct_map");
int64_t * idx = new int64_t [na];
ScopeDeleter<int64_t> del (idx);
quantizer->assign (na, x, idx);
int64_t n_add = 0, n_dup = 0;
// TODO make an omp loop with this
for (size_t i = 0; i < na; i++) {
idx_t id = xids ? xids[i] : ntotal + i;
int64_t list_no = idx [i];
if (list_no < 0) {
continue;
}
const float *xi = x + i * d;
// check whether an identical vector is already stored in that list
InvertedLists::ScopedCodes codes (invlists, list_no);
int64_t n = invlists->list_size (list_no);
int64_t offset = -1;
for (int64_t o = 0; o < n; o++) {
if (!memcmp (codes.get() + o * code_size,
xi, code_size)) {
offset = o;
break;
}
}
if (offset == -1) { // not found
invlists->add_entry (list_no, id, (const uint8_t*) xi);
} else {
// mark equivalence
idx_t id2 = invlists->get_single_id (list_no, offset);
std::pair<idx_t, idx_t> pair (id2, id);
instances.insert (pair);
n_dup ++;
}
n_add++;
}
if (verbose) {
printf("IndexIVFFlat::add_with_ids: added %ld / %ld vectors"
" (out of which %ld are duplicates)\n",
n_add, na, n_dup);
}
ntotal += n_add;
}
void IndexIVFFlatDedup::search_preassigned (
idx_t n, const float *x, idx_t k,
const idx_t *assign,
const float *centroid_dis,
float *distances, idx_t *labels,
bool store_pairs,
const IVFSearchParameters *params) const
{
FAISS_THROW_IF_NOT_MSG (
!store_pairs, "store_pairs not supported in IVFDedup");
IndexIVFFlat::search_preassigned (n, x, k, assign, centroid_dis,
distances, labels, false,
params);
std::vector <idx_t> labels2 (k);
std::vector <float> dis2 (k);
for (int64_t i = 0; i < n; i++) {
idx_t *labels1 = labels + i * k;
float *dis1 = distances + i * k;
int64_t j = 0;
for (; j < k; j++) {
if (instances.find (labels1[j]) != instances.end ()) {
// a duplicate: special handling
break;
}
}
if (j < k) {
// there are duplicates, special handling
int64_t j0 = j;
int64_t rp = j;
while (j < k) {
auto range = instances.equal_range (labels1[rp]);
float dis = dis1[rp];
labels2[j] = labels1[rp];
dis2[j] = dis;
j ++;
for (auto it = range.first; j < k && it != range.second; ++it) {
labels2[j] = it->second;
dis2[j] = dis;
j++;
}
rp++;
}
memcpy (labels1 + j0, labels2.data() + j0,
sizeof(labels1[0]) * (k - j0));
memcpy (dis1 + j0, dis2.data() + j0,
sizeof(dis2[0]) * (k - j0));
}
}
}
size_t IndexIVFFlatDedup::remove_ids(const IDSelector& sel)
{
std::unordered_map<idx_t, idx_t> replace;
std::vector<std::pair<idx_t, idx_t> > toadd;
for (auto it = instances.begin(); it != instances.end(); ) {
if (sel.is_member(it->first)) {
// then we erase this entry
if (!sel.is_member(it->second)) {
// if the second is not erased
if (replace.count(it->first) == 0) {
replace[it->first] = it->second;
} else { // remember we should add an element
std::pair<idx_t, idx_t> new_entry (
replace[it->first], it->second);
toadd.push_back(new_entry);
}
}
it = instances.erase(it);
} else {
if (sel.is_member(it->second)) {
it = instances.erase(it);
} else {
++it;
}
}
}
instances.insert (toadd.begin(), toadd.end());
// mostly copied from IndexIVF.cpp
FAISS_THROW_IF_NOT_MSG (!maintain_direct_map,
"direct map remove not implemented");
std::vector<int64_t> toremove(nlist);
#pragma omp parallel for
for (int64_t i = 0; i < nlist; i++) {
int64_t l0 = invlists->list_size (i), l = l0, j = 0;
InvertedLists::ScopedIds idsi (invlists, i);
while (j < l) {
if (sel.is_member (idsi[j])) {
if (replace.count(idsi[j]) == 0) {
l--;
invlists->update_entry (
i, j,
invlists->get_single_id (i, l),
InvertedLists::ScopedCodes (invlists, i, l).get());
} else {
invlists->update_entry (
i, j,
replace[idsi[j]],
InvertedLists::ScopedCodes (invlists, i, j).get());
j++;
}
} else {
j++;
}
}
toremove[i] = l0 - l;
}
// this will not run well in parallel on ondisk because of possible shrinks
int64_t nremove = 0;
for (int64_t i = 0; i < nlist; i++) {
if (toremove[i] > 0) {
nremove += toremove[i];
invlists->resize(
i, invlists->list_size(i) - toremove[i]);
}
}
ntotal -= nremove;
return nremove;
}
void IndexIVFFlatDedup::range_search(
idx_t ,
const float* ,
float ,
RangeSearchResult* ) const
{
FAISS_THROW_MSG ("not implemented");
}
void IndexIVFFlatDedup::update_vectors (int , idx_t *, const float *)
{
FAISS_THROW_MSG ("not implemented");
}
void IndexIVFFlatDedup::reconstruct_from_offset (
int64_t , int64_t , float* ) const
{
FAISS_THROW_MSG ("not implemented");
}
} // namespace faiss


@ -0,0 +1,118 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_IVF_FLAT_H
#define FAISS_INDEX_IVF_FLAT_H
#include <unordered_map>
#include <stdint.h>
#include <faiss/IndexIVF.h>
namespace faiss {
/** Inverted file with stored vectors. Here the inverted file
* pre-selects the vectors to be searched, but they are not otherwise
* encoded, the code array just contains the raw float entries.
*/
struct IndexIVFFlat: IndexIVF {
IndexIVFFlat (
Index * quantizer, size_t d, size_t nlist_,
MetricType = METRIC_L2);
/// same as add_with_ids, with optional precomputed coarse quantization indices
virtual void add_core (idx_t n, const float * x, const int64_t *xids,
const int64_t *precomputed_idx);
/// implemented for all IndexIVF* classes
void add_with_ids(idx_t n, const float* x, const idx_t* xids) override;
void encode_vectors(idx_t n, const float* x,
const idx_t *list_nos,
uint8_t * codes,
bool include_listnos=false) const override;
InvertedListScanner *get_InvertedListScanner (bool store_pairs)
const override;
/** Update a subset of vectors.
*
* The index must have a direct_map
*
* @param nv nb of vectors to update
* @param idx vector indices to update, size nv
* @param v vectors of new values, size nv*d
*/
virtual void update_vectors (int nv, idx_t *idx, const float *v);
void reconstruct_from_offset (int64_t list_no, int64_t offset,
float* recons) const override;
void sa_decode (idx_t n, const uint8_t *bytes,
float *x) const override;
IndexIVFFlat () {}
};
struct IndexIVFFlatDedup: IndexIVFFlat {
/** Maps ids stored in the index to the ids of vectors that are
* the same. When a vector is unique, it does not appear in the
* instances map */
std::unordered_multimap <idx_t, idx_t> instances;
IndexIVFFlatDedup (
Index * quantizer, size_t d, size_t nlist_,
MetricType = METRIC_L2);
/// also dedups the training set
void train(idx_t n, const float* x) override;
/// implemented for all IndexIVF* classes
void add_with_ids(idx_t n, const float* x, const idx_t* xids) override;
void search_preassigned (idx_t n, const float *x, idx_t k,
const idx_t *assign,
const float *centroid_dis,
float *distances, idx_t *labels,
bool store_pairs,
const IVFSearchParameters *params=nullptr
) const override;
size_t remove_ids(const IDSelector& sel) override;
/// not implemented
void range_search(
idx_t n,
const float* x,
float radius,
RangeSearchResult* result) const override;
/// not implemented
void update_vectors (int nv, idx_t *idx, const float *v) override;
/// not implemented
void reconstruct_from_offset (int64_t list_no, int64_t offset,
float* recons) const override;
IndexIVFFlatDedup () {}
};
} // namespace faiss
#endif
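A sketch of the dedup behavior documented above: two ids that share one stored vector are both reported at search time (sizes and data are illustrative):
#include <algorithm>
#include <vector>
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFFlat.h>

int main() {
    int d = 8;
    faiss::IndexFlatL2 quantizer(d);
    faiss::IndexIVFFlatDedup index(&quantizer, d, 4);
    std::vector<float> xb(64 * (size_t)d);
    for (size_t i = 0; i < xb.size(); i++) xb[i] = float(i % 7);
    index.train(64, xb.data());
    // add the same vector under two explicit ids: one copy is stored,
    // the second id goes into the `instances` multimap
    std::vector<float> dup(2 * (size_t)d);
    std::copy(xb.begin(), xb.begin() + d, dup.begin());
    std::copy(xb.begin(), xb.begin() + d, dup.begin() + d);
    std::vector<faiss::Index::idx_t> ids = {100, 200};
    index.add_with_ids(2, dup.data(), ids.data());
    // searching at that vector reports both ids among the top results
    int k = 2;
    std::vector<float> dis(k);
    std::vector<faiss::Index::idx_t> labels(k);
    index.search(1, dup.data(), k, dis.data(), labels.data());
    return 0;
}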

File diff suppressed because it is too large


@ -0,0 +1,161 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_IVFPQ_H
#define FAISS_INDEX_IVFPQ_H
#include <vector>
#include <faiss/IndexIVF.h>
#include <faiss/IndexPQ.h>
namespace faiss {
struct IVFPQSearchParameters: IVFSearchParameters {
size_t scan_table_threshold; ///< use table computation or on-the-fly?
int polysemous_ht; ///< Hamming thresh for polysemous filtering
~IVFPQSearchParameters () {}
};
/** Inverted file with Product Quantizer encoding. Each residual
* vector is encoded as a product quantizer code.
*/
struct IndexIVFPQ: IndexIVF {
bool by_residual; ///< Encode residual or plain vector?
ProductQuantizer pq; ///< produces the codes
bool do_polysemous_training; ///< reorder PQ centroids after training?
PolysemousTraining *polysemous_training; ///< if NULL, use default
// search-time parameters
size_t scan_table_threshold; ///< use table computation or on-the-fly?
int polysemous_ht; ///< Hamming thresh for polysemous filtering
/** Precompute table that speeds up query preprocessing at some
* memory cost
* =-1: force disable
* =0: decide heuristically (default: use tables only if they are
* < precomputed_tables_max_bytes)
* =1: tables that work for all quantizers (size 256 * nlist * M)
* =2: specific version for MultiIndexQuantizer (much more compact)
*/
int use_precomputed_table; ///< if by_residual, build precompute tables
static size_t precomputed_table_max_bytes;
/// if use_precomputed_table
/// size nlist * pq.M * pq.ksub
std::vector <float> precomputed_table;
IndexIVFPQ (
Index * quantizer, size_t d, size_t nlist,
size_t M, size_t nbits_per_idx);
void add_with_ids(idx_t n, const float* x, const idx_t* xids = nullptr)
override;
void encode_vectors(idx_t n, const float* x,
const idx_t *list_nos,
uint8_t * codes,
bool include_listnos = false) const override;
void sa_decode (idx_t n, const uint8_t *bytes,
float *x) const override;
/// same as add_core, also:
/// - output 2nd level residuals if residuals_2 != NULL
/// - use precomputed list numbers if precomputed_idx != NULL
void add_core_o (idx_t n, const float *x,
const idx_t *xids, float *residuals_2,
const idx_t *precomputed_idx = nullptr);
/// trains the product quantizer
void train_residual(idx_t n, const float* x) override;
/// same as train_residual, also output 2nd level residuals
void train_residual_o (idx_t n, const float *x, float *residuals_2);
void reconstruct_from_offset (int64_t list_no, int64_t offset,
float* recons) const override;
/** Find exact duplicates in the dataset.
*
* the duplicates are returned in pre-allocated arrays (see the
* max sizes).
*
* @param lims limits between groups of duplicates
* (max size ntotal / 2 + 1)
* @param ids ids[lims[i]] : ids[lims[i+1]-1] is a group of
* duplicates (max size ntotal)
* @return n number of groups found
*/
size_t find_duplicates (idx_t *ids, size_t *lims) const;
// map a vector to a binary code knowing the index
void encode (idx_t key, const float * x, uint8_t * code) const;
/** Encode multiple vectors
*
* @param n nb vectors to encode
* @param keys posting list ids for those vectors (size n)
* @param x vectors (size n * d)
* @param codes output codes (size n * code_size)
* @param compute_keys if false, assume keys are precomputed,
* otherwise compute them
*/
void encode_multiple (size_t n, idx_t *keys,
const float * x, uint8_t * codes,
bool compute_keys = false) const;
/// inverse of encode_multiple
void decode_multiple (size_t n, const idx_t *keys,
const uint8_t * xcodes, float * x) const;
InvertedListScanner *get_InvertedListScanner (bool store_pairs)
const override;
/// build precomputed table
void precompute_table ();
IndexIVFPQ ();
};
/// statistics are robust to internal threading, but not if
/// IndexIVFPQ::search_preassigned is called by multiple threads
struct IndexIVFPQStats {
size_t nrefine; // nb of refines (IVFPQR)
size_t n_hamming_pass;
// nb of passed Hamming distance tests (for polysemous)
// timings measured with the CPU RTC
// on all threads
size_t search_cycles;
size_t refine_cycles; // only for IVFPQR
IndexIVFPQStats () {reset (); }
void reset ();
};
// global var that collects them all
extern IndexIVFPQStats indexIVFPQ_stats;
} // namespace faiss
#endif
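A construction sketch for the PQ variant above. It assumes d divisible by M (a PQ requirement) and builds the per-quantizer precomputed tables explicitly via precompute_table(); all sizes are illustrative:
#include <vector>
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFPQ.h>

int main() {
    int d = 64;                               // must be divisible by M
    size_t nlist = 32, M = 8, nbits = 8;      // 8 sub-quantizers, 256 centroids each
    faiss::IndexFlatL2 quantizer(d);
    faiss::IndexIVFPQ index(&quantizer, d, nlist, M, nbits);
    std::vector<float> xb(5000 * (size_t)d);
    for (size_t i = 0; i < xb.size(); i++)
        xb[i] = float((i * 2654435761u) % 1000) / 1000.f;
    index.train(5000, xb.data());             // trains coarse quantizer + PQ
    index.add(5000, xb.data());
    index.use_precomputed_table = 1;          // tables that work for all quantizers
    index.precompute_table();                 // build them explicitly
    index.nprobe = 8;
    int k = 4;
    std::vector<float> dis(k);
    std::vector<faiss::Index::idx_t> labels(k);
    index.search(1, xb.data(), k, dis.data(), labels.data());
    return 0;
}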


@ -0,0 +1,219 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexIVFPQR.h>
#include <faiss/utils/Heap.h>
#include <faiss/utils/utils.h>
#include <faiss/utils/distances.h>
#include <faiss/impl/FaissAssert.h>
namespace faiss {
/*****************************************
* IndexIVFPQR implementation
******************************************/
IndexIVFPQR::IndexIVFPQR (
Index * quantizer, size_t d, size_t nlist,
size_t M, size_t nbits_per_idx,
size_t M_refine, size_t nbits_per_idx_refine):
IndexIVFPQ (quantizer, d, nlist, M, nbits_per_idx),
refine_pq (d, M_refine, nbits_per_idx_refine),
k_factor (4)
{
by_residual = true;
}
IndexIVFPQR::IndexIVFPQR ():
k_factor (1)
{
by_residual = true;
}
void IndexIVFPQR::reset()
{
IndexIVFPQ::reset();
refine_codes.clear();
}
void IndexIVFPQR::train_residual (idx_t n, const float *x)
{
float * residual_2 = new float [n * d];
ScopeDeleter <float> del(residual_2);
train_residual_o (n, x, residual_2);
if (verbose)
printf ("training %zdx%zd 2nd level PQ quantizer on %ld %dD-vectors\n",
refine_pq.M, refine_pq.ksub, n, d);
refine_pq.cp.max_points_per_centroid = 1000;
refine_pq.cp.verbose = verbose;
refine_pq.train (n, residual_2);
}
void IndexIVFPQR::add_with_ids (idx_t n, const float *x, const idx_t *xids) {
add_core (n, x, xids, nullptr);
}
void IndexIVFPQR::add_core (idx_t n, const float *x, const idx_t *xids,
const idx_t *precomputed_idx) {
float * residual_2 = new float [n * d];
ScopeDeleter <float> del(residual_2);
idx_t n0 = ntotal;
add_core_o (n, x, xids, residual_2, precomputed_idx);
refine_codes.resize (ntotal * refine_pq.code_size);
refine_pq.compute_codes (
residual_2, &refine_codes[n0 * refine_pq.code_size], n);
}
#define TIC t0 = get_cycles()
#define TOC get_cycles () - t0
void IndexIVFPQR::search_preassigned (idx_t n, const float *x, idx_t k,
const idx_t *idx,
const float *L1_dis,
float *distances, idx_t *labels,
bool store_pairs,
const IVFSearchParameters *params
) const
{
uint64_t t0;
TIC;
size_t k_coarse = long(k * k_factor);
idx_t *coarse_labels = new idx_t [k_coarse * n];
ScopeDeleter<idx_t> del1 (coarse_labels);
{ // query with quantizer levels 1 and 2.
float *coarse_distances = new float [k_coarse * n];
ScopeDeleter<float> del(coarse_distances);
IndexIVFPQ::search_preassigned (
n, x, k_coarse,
idx, L1_dis, coarse_distances, coarse_labels,
true, params);
}
indexIVFPQ_stats.search_cycles += TOC;
TIC;
// 3rd level refinement
size_t n_refine = 0;
#pragma omp parallel reduction(+ : n_refine)
{
// tmp buffers
float *residual_1 = new float [2 * d];
ScopeDeleter<float> del (residual_1);
float *residual_2 = residual_1 + d;
#pragma omp for
for (idx_t i = 0; i < n; i++) {
const float *xq = x + i * d;
const idx_t * shortlist = coarse_labels + k_coarse * i;
float * heap_sim = distances + k * i;
idx_t * heap_ids = labels + k * i;
maxheap_heapify (k, heap_sim, heap_ids);
for (int j = 0; j < k_coarse; j++) {
idx_t sl = shortlist[j];
if (sl == -1) continue;
int list_no = sl >> 32;
int ofs = sl & 0xffffffff;
assert (list_no >= 0 && list_no < nlist);
assert (ofs >= 0 && ofs < invlists->list_size (list_no));
// 1st level residual
quantizer->compute_residual (xq, residual_1, list_no);
// 2nd level residual
const uint8_t * l2code =
invlists->get_single_code (list_no, ofs);
pq.decode (l2code, residual_2);
for (int l = 0; l < d; l++)
residual_2[l] = residual_1[l] - residual_2[l];
// 3rd level residual's approximation
idx_t id = invlists->get_single_id (list_no, ofs);
assert (0 <= id && id < ntotal);
refine_pq.decode (&refine_codes [id * refine_pq.code_size],
residual_1);
float dis = fvec_L2sqr (residual_1, residual_2, d);
if (dis < heap_sim[0]) {
maxheap_pop (k, heap_sim, heap_ids);
idx_t id_or_pair = store_pairs ? sl : id;
maxheap_push (k, heap_sim, heap_ids, dis, id_or_pair);
}
n_refine ++;
}
maxheap_reorder (k, heap_sim, heap_ids);
}
}
indexIVFPQ_stats.nrefine += n_refine;
indexIVFPQ_stats.refine_cycles += TOC;
}
void IndexIVFPQR::reconstruct_from_offset (int64_t list_no, int64_t offset,
float* recons) const
{
IndexIVFPQ::reconstruct_from_offset (list_no, offset, recons);
idx_t id = invlists->get_single_id (list_no, offset);
assert (0 <= id && id < ntotal);
std::vector<float> r3(d);
refine_pq.decode (&refine_codes [id * refine_pq.code_size], r3.data());
for (int i = 0; i < d; ++i) {
recons[i] += r3[i];
}
}
void IndexIVFPQR::merge_from (IndexIVF &other_in, idx_t add_id)
{
IndexIVFPQR *other = dynamic_cast<IndexIVFPQR *> (&other_in);
FAISS_THROW_IF_NOT(other);
IndexIVF::merge_from (other_in, add_id);
refine_codes.insert (refine_codes.end(),
other->refine_codes.begin(),
other->refine_codes.end());
other->refine_codes.clear();
}
size_t IndexIVFPQR::remove_ids(const IDSelector& /*sel*/) {
FAISS_THROW_MSG("not implemented");
return 0;
}
} // namespace faiss

View File

@ -0,0 +1,65 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <vector>
#include <faiss/IndexIVFPQ.h>
namespace faiss {
/** Index with an additional level of PQ refinement */
struct IndexIVFPQR: IndexIVFPQ {
ProductQuantizer refine_pq; ///< 3rd level quantizer
std::vector <uint8_t> refine_codes; ///< corresponding codes
/// factor between k requested in search and the k requested from the IVFPQ
float k_factor;
IndexIVFPQR (
Index * quantizer, size_t d, size_t nlist,
size_t M, size_t nbits_per_idx,
size_t M_refine, size_t nbits_per_idx_refine);
void reset() override;
size_t remove_ids(const IDSelector& sel) override;
/// trains the two product quantizers
void train_residual(idx_t n, const float* x) override;
void add_with_ids(idx_t n, const float* x, const idx_t* xids) override;
/// same as add_with_ids, but optionally use the precomputed list ids
void add_core (idx_t n, const float *x, const idx_t *xids,
const idx_t *precomputed_idx = nullptr);
void reconstruct_from_offset (int64_t list_no, int64_t offset,
float* recons) const override;
void merge_from (IndexIVF &other, idx_t add_id) override;
void search_preassigned (idx_t n, const float *x, idx_t k,
const idx_t *assign,
const float *centroid_dis,
float *distances, idx_t *labels,
bool store_pairs,
const IVFSearchParameters *params=nullptr
) const override;
IndexIVFPQR();
};
} // namespace faiss
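For orientation, a minimal usage sketch of the three-level index declared above: a flat coarse quantizer, an 8x8-bit IVFPQ, and a 16x8-bit refinement PQ. The header path and the toy parameters are illustrative assumptions, not part of this change.

#include <random>
#include <vector>
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFPQR.h>  // assumed header path in this tree

int main() {
    int d = 64;
    size_t nb = 10000, nq = 5, k = 10;
    std::vector<float> xb(nb * d), xq(nq * d);
    std::mt19937 rng(123);
    std::uniform_real_distribution<float> u(0, 1);
    for (auto& v : xb) v = u(rng);
    for (auto& v : xq) v = u(rng);

    faiss::IndexFlatL2 coarse(d);  // level-1 quantizer
    // level 2: 8 sub-quantizers x 8 bits; level 3: 16 refinement sub-quantizers
    faiss::IndexIVFPQR index(&coarse, d, /*nlist=*/256, 8, 8, 16, 8);
    index.k_factor = 4;  // rank 4*k IVFPQ candidates, keep the best k after refinement
    index.train(nb, xb.data());
    index.add(nb, xb.data());

    std::vector<float> D(nq * k);
    std::vector<faiss::Index::idx_t> I(nq * k);
    index.search(nq, xq.data(), k, D.data(), I.data());
    return 0;
}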

View File

@ -0,0 +1,331 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexIVFSpectralHash.h>
#include <memory>
#include <algorithm>
#include <stdint.h>
#include <faiss/utils/hamming.h>
#include <faiss/utils/utils.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/VectorTransform.h>
namespace faiss {
IndexIVFSpectralHash::IndexIVFSpectralHash (
Index * quantizer, size_t d, size_t nlist,
int nbit, float period):
IndexIVF (quantizer, d, nlist, (nbit + 7) / 8, METRIC_L2),
nbit (nbit), period (period), threshold_type (Thresh_global)
{
FAISS_THROW_IF_NOT (code_size % 4 == 0);
RandomRotationMatrix *rr = new RandomRotationMatrix (d, nbit);
rr->init (1234);
vt = rr;
own_fields = true;
is_trained = false;
}
IndexIVFSpectralHash::IndexIVFSpectralHash():
IndexIVF(), vt(nullptr), own_fields(false),
nbit(0), period(0), threshold_type(Thresh_global)
{}
IndexIVFSpectralHash::~IndexIVFSpectralHash ()
{
if (own_fields) {
delete vt;
}
}
namespace {
float median (size_t n, float *x) {
std::sort(x, x + n);
if (n % 2 == 1) {
return x [n / 2];
} else {
return (x [n / 2 - 1] + x [n / 2]) / 2;
}
}
}
void IndexIVFSpectralHash::train_residual (idx_t n, const float *x)
{
if (!vt->is_trained) {
vt->train (n, x);
}
if (threshold_type == Thresh_global) {
// nothing to do
return;
} else if (threshold_type == Thresh_centroid ||
threshold_type == Thresh_centroid_half) {
// convert all centroids with vt
std::vector<float> centroids (nlist * d);
quantizer->reconstruct_n (0, nlist, centroids.data());
trained.resize(nlist * nbit);
vt->apply_noalloc (nlist, centroids.data(), trained.data());
if (threshold_type == Thresh_centroid_half) {
for (size_t i = 0; i < nlist * nbit; i++) {
trained[i] -= 0.25 * period;
}
}
return;
}
// otherwise train medians
// assign
std::unique_ptr<idx_t []> idx (new idx_t [n]);
quantizer->assign (n, x, idx.get());
std::vector<size_t> sizes(nlist + 1);
for (size_t i = 0; i < n; i++) {
FAISS_THROW_IF_NOT (idx[i] >= 0);
sizes[idx[i]]++;
}
size_t ofs = 0;
for (int j = 0; j < nlist; j++) {
size_t o0 = ofs;
ofs += sizes[j];
sizes[j] = o0;
}
// transform
std::unique_ptr<float []> xt (vt->apply (n, x));
// transpose + reorder
std::unique_ptr<float []> xo (new float[n * nbit]);
for (size_t i = 0; i < n; i++) {
size_t idest = sizes[idx[i]]++;
for (size_t j = 0; j < nbit; j++) {
xo[idest + n * j] = xt[i * nbit + j];
}
}
trained.resize (nlist * nbit);
// compute medians, one per (inverted list, bit) pair
#pragma omp parallel for
for (int i = 0; i < nlist; i++) {
size_t i0 = i == 0 ? 0 : sizes[i - 1];
size_t i1 = sizes[i];
for (int j = 0; j < nbit; j++) {
float *xoi = xo.get() + i0 + n * j;
if (i0 == i1) { // nothing to train
trained[i * nbit + j] = 0.0;
} else if (i1 == i0 + 1) {
trained[i * nbit + j] = xoi[0];
} else {
trained[i * nbit + j] = median(i1 - i0, xoi);
}
}
}
}
namespace {
void binarize_with_freq(size_t nbit, float freq,
const float *x, const float *c,
uint8_t *codes)
{
memset (codes, 0, (nbit + 7) / 8);
for (size_t i = 0; i < nbit; i++) {
float xf = (x[i] - c[i]);
int xi = int(floor(xf * freq));
int bit = xi & 1;
codes[i >> 3] |= bit << (i & 7);
}
}
};
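// Illustrative note (not part of the patch): with period = 4, freq = 2 / 4
// = 0.5, so for a centered coordinate xf = x[i] - c[i]:
//   xf = 0.7 -> floor(0.7 * 0.5) = 0 -> bit 0
//   xf = 2.3 -> floor(2.3 * 0.5) = 1 -> bit 1
//   xf = 4.1 -> floor(4.1 * 0.5) = 2 -> bit 0 again
// i.e. the bit flips every half-period, which is what makes the code
// insensitive to the absolute offset of the data.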
void IndexIVFSpectralHash::encode_vectors(idx_t n, const float* x_in,
const idx_t *list_nos,
uint8_t * codes,
bool include_listnos) const
{
FAISS_THROW_IF_NOT (is_trained);
float freq = 2.0 / period;
FAISS_THROW_IF_NOT_MSG (!include_listnos, "listnos encoding not supported");
// transform with vt
std::unique_ptr<float []> x (vt->apply (n, x_in));
#pragma omp parallel
{
std::vector<float> zero (nbit);
// each thread takes care of a subset of the vectors
#pragma omp for
for (size_t i = 0; i < n; i++) {
int64_t list_no = list_nos [i];
if (list_no >= 0) {
const float *c;
if (threshold_type == Thresh_global) {
c = zero.data();
} else {
c = trained.data() + list_no * nbit;
}
binarize_with_freq (nbit, freq,
x.get() + i * nbit, c,
codes + i * code_size) ;
}
}
}
}
namespace {
template<class HammingComputer>
struct IVFScanner: InvertedListScanner {
// copied from index structure
const IndexIVFSpectralHash *index;
size_t code_size;
size_t nbit;
bool store_pairs;
float period, freq;
std::vector<float> q;
std::vector<float> zero;
std::vector<uint8_t> qcode;
HammingComputer hc;
using idx_t = Index::idx_t;
IVFScanner (const IndexIVFSpectralHash * index,
bool store_pairs):
index (index),
code_size(index->code_size),
nbit(index->nbit),
store_pairs(store_pairs),
period(index->period), freq(2.0 / index->period),
q(nbit), zero(nbit), qcode(code_size),
hc(qcode.data(), code_size)
{
}
void set_query (const float *query) override {
FAISS_THROW_IF_NOT(query);
FAISS_THROW_IF_NOT(q.size() == nbit);
index->vt->apply_noalloc (1, query, q.data());
if (index->threshold_type ==
IndexIVFSpectralHash::Thresh_global) {
binarize_with_freq
(nbit, freq, q.data(), zero.data(), qcode.data());
hc.set (qcode.data(), code_size);
}
}
idx_t list_no;
void set_list (idx_t list_no, float /*coarse_dis*/) override {
this->list_no = list_no;
if (index->threshold_type != IndexIVFSpectralHash::Thresh_global) {
const float *c = index->trained.data() + list_no * nbit;
binarize_with_freq (nbit, freq, q.data(), c, qcode.data());
hc.set (qcode.data(), code_size);
}
}
float distance_to_code (const uint8_t *code) const final {
return hc.hamming (code);
}
size_t scan_codes (size_t list_size,
const uint8_t *codes,
const idx_t *ids,
float *simi, idx_t *idxi,
size_t k) const override
{
size_t nup = 0;
for (size_t j = 0; j < list_size; j++) {
float dis = hc.hamming (codes);
if (dis < simi [0]) {
maxheap_pop (k, simi, idxi);
int64_t id = store_pairs ? (list_no << 32 | j) : ids[j];
maxheap_push (k, simi, idxi, dis, id);
nup++;
}
codes += code_size;
}
return nup;
}
void scan_codes_range (size_t list_size,
const uint8_t *codes,
const idx_t *ids,
float radius,
RangeQueryResult & res) const override
{
for (size_t j = 0; j < list_size; j++) {
float dis = hc.hamming (codes);
if (dis < radius) {
int64_t id = store_pairs ? (list_no << 32 | j) : ids[j];
res.add (dis, id);
}
codes += code_size;
}
}
};
} // anonymous namespace
InvertedListScanner* IndexIVFSpectralHash::get_InvertedListScanner
(bool store_pairs) const
{
switch (code_size) {
#define HANDLE_CODE_SIZE(cs) \
case cs: \
return new IVFScanner<HammingComputer ## cs> (this, store_pairs)
HANDLE_CODE_SIZE(4);
HANDLE_CODE_SIZE(8);
HANDLE_CODE_SIZE(16);
HANDLE_CODE_SIZE(20);
HANDLE_CODE_SIZE(32);
HANDLE_CODE_SIZE(64);
#undef HANDLE_CODE_SIZE
default:
if (code_size % 8 == 0) {
return new IVFScanner<HammingComputerM8>(this, store_pairs);
} else if (code_size % 4 == 0) {
return new IVFScanner<HammingComputerM4>(this, store_pairs);
} else {
FAISS_THROW_MSG("not supported");
}
}
}
} // namespace faiss

View File

@ -0,0 +1,75 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_IVFSH_H
#define FAISS_INDEX_IVFSH_H
#include <vector>
#include <faiss/IndexIVF.h>
namespace faiss {
struct VectorTransform;
/** Inverted list that stores binary codes of size nbit. Before the
* binary conversion, the dimension of the vectors is transformed from
* dim d into dim nbit by vt (a random rotation by default).
*
 * A per-coordinate threshold determined by threshold_type is
 * subtracted from each coordinate, which is then split into intervals
 * of size period. Half of each interval maps to a 0 bit, the other
 * half to a 1.
*/
struct IndexIVFSpectralHash: IndexIVF {
VectorTransform *vt; // transformation from d to nbit dim
bool own_fields;
int nbit;
float period;
enum ThresholdType {
Thresh_global,
Thresh_centroid,
Thresh_centroid_half,
Thresh_median
};
ThresholdType threshold_type;
// size nlist * nbit or 0 if Thresh_global
std::vector<float> trained;
IndexIVFSpectralHash (Index * quantizer, size_t d, size_t nlist,
int nbit, float period);
IndexIVFSpectralHash ();
void train_residual(idx_t n, const float* x) override;
void encode_vectors(idx_t n, const float* x,
const idx_t *list_nos,
uint8_t * codes,
bool include_listnos = false) const override;
InvertedListScanner *get_InvertedListScanner (bool store_pairs)
const override;
~IndexIVFSpectralHash () override;
};
}; // namespace faiss
#endif
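A minimal usage sketch of the spectral-hash index declared above. The toy parameters are assumptions; note that nbit must keep code_size a multiple of 4, hence a multiple of 32 bits here.

#include <random>
#include <vector>
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFSpectralHash.h>

int main() {
    int d = 32, nlist = 64, nbit = 64;
    float period = 10.0f;  // toy value; tune to the data scale
    size_t nb = 5000, nq = 3, k = 5;
    std::vector<float> xb(nb * d), xq(nq * d);
    std::mt19937 rng(42);
    std::uniform_real_distribution<float> u(-1, 1);
    for (auto& v : xb) v = u(rng);
    for (auto& v : xq) v = u(rng);

    faiss::IndexFlatL2 quantizer(d);
    faiss::IndexIVFSpectralHash index(&quantizer, d, nlist, nbit, period);
    index.threshold_type = faiss::IndexIVFSpectralHash::Thresh_centroid;
    index.train(nb, xb.data());  // trains the coarse quantizer and thresholds
    index.add(nb, xb.data());

    index.nprobe = 8;  // IndexIVF member
    std::vector<float> D(nq * k);
    std::vector<faiss::Index::idx_t> I(nq * k);
    index.search(nq, xq.data(), k, D.data(), I.data());
    return 0;
}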

View File

@ -0,0 +1,225 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexLSH.h>
#include <cstdio>
#include <cstring>
#include <algorithm>
#include <faiss/utils/utils.h>
#include <faiss/utils/hamming.h>
#include <faiss/impl/FaissAssert.h>
namespace faiss {
/***************************************************************
* IndexLSH
***************************************************************/
IndexLSH::IndexLSH (idx_t d, int nbits, bool rotate_data, bool train_thresholds):
Index(d), nbits(nbits), rotate_data(rotate_data),
train_thresholds (train_thresholds), rrot(d, nbits)
{
is_trained = !train_thresholds;
bytes_per_vec = (nbits + 7) / 8;
if (rotate_data) {
rrot.init(5);
} else {
FAISS_THROW_IF_NOT (d >= nbits);
}
}
IndexLSH::IndexLSH ():
nbits (0), bytes_per_vec(0), rotate_data (false), train_thresholds (false)
{
}
const float * IndexLSH::apply_preprocess (idx_t n, const float *x) const
{
float *xt = nullptr;
if (rotate_data) {
// also applies bias if exists
xt = rrot.apply (n, x);
} else if (d != nbits) {
assert (nbits < d);
xt = new float [nbits * n];
float *xp = xt;
for (idx_t i = 0; i < n; i++) {
const float *xl = x + i * d;
for (int j = 0; j < nbits; j++)
*xp++ = xl [j];
}
}
if (train_thresholds) {
if (xt == NULL) {
xt = new float [nbits * n];
memcpy (xt, x, sizeof(*x) * n * nbits);
}
float *xp = xt;
for (idx_t i = 0; i < n; i++)
for (int j = 0; j < nbits; j++)
*xp++ -= thresholds [j];
}
return xt ? xt : x;
}
void IndexLSH::train (idx_t n, const float *x)
{
if (train_thresholds) {
thresholds.resize (nbits);
train_thresholds = false;
const float *xt = apply_preprocess (n, x);
ScopeDeleter<float> del (xt == x ? nullptr : xt);
train_thresholds = true;
float * transposed_x = new float [n * nbits];
ScopeDeleter<float> del2 (transposed_x);
for (idx_t i = 0; i < n; i++)
for (idx_t j = 0; j < nbits; j++)
transposed_x [j * n + i] = xt [i * nbits + j];
for (idx_t i = 0; i < nbits; i++) {
float *xi = transposed_x + i * n;
// std::nth_element
std::sort (xi, xi + n);
if (n % 2 == 1)
thresholds [i] = xi [n / 2];
else
thresholds [i] = (xi [n / 2 - 1] + xi [n / 2]) / 2;
}
}
is_trained = true;
}
void IndexLSH::add (idx_t n, const float *x)
{
FAISS_THROW_IF_NOT (is_trained);
codes.resize ((ntotal + n) * bytes_per_vec);
sa_encode (n, x, &codes[ntotal * bytes_per_vec]);
ntotal += n;
}
void IndexLSH::search (
idx_t n,
const float *x,
idx_t k,
float *distances,
idx_t *labels) const
{
FAISS_THROW_IF_NOT (is_trained);
const float *xt = apply_preprocess (n, x);
ScopeDeleter<float> del (xt == x ? nullptr : xt);
uint8_t * qcodes = new uint8_t [n * bytes_per_vec];
ScopeDeleter<uint8_t> del2 (qcodes);
fvecs2bitvecs (xt, qcodes, nbits, n);
int * idistances = new int [n * k];
ScopeDeleter<int> del3 (idistances);
int_maxheap_array_t res = { size_t(n), size_t(k), labels, idistances};
hammings_knn_hc (&res, qcodes, codes.data(),
ntotal, bytes_per_vec, true);
// convert distances to floats
for (int i = 0; i < k * n; i++)
distances[i] = idistances[i];
}
void IndexLSH::transfer_thresholds (LinearTransform *vt) {
if (!train_thresholds) return;
FAISS_THROW_IF_NOT (nbits == vt->d_out);
if (!vt->have_bias) {
vt->b.resize (nbits, 0);
vt->have_bias = true;
}
for (int i = 0; i < nbits; i++)
vt->b[i] -= thresholds[i];
train_thresholds = false;
thresholds.clear();
}
void IndexLSH::reset() {
codes.clear();
ntotal = 0;
}
size_t IndexLSH::sa_code_size () const
{
return bytes_per_vec;
}
void IndexLSH::sa_encode (idx_t n, const float *x,
uint8_t *bytes) const
{
FAISS_THROW_IF_NOT (is_trained);
const float *xt = apply_preprocess (n, x);
ScopeDeleter<float> del (xt == x ? nullptr : xt);
fvecs2bitvecs (xt, bytes, nbits, n);
}
void IndexLSH::sa_decode (idx_t n, const uint8_t *bytes,
float *x) const
{
float *xt = x;
ScopeDeleter<float> del;
if (rotate_data || nbits != d) {
xt = new float [n * nbits];
del.set(xt);
}
bitvecs2fvecs (bytes, xt, nbits, n);
if (train_thresholds) {
float *xp = xt;
for (idx_t i = 0; i < n; i++) {
for (int j = 0; j < nbits; j++) {
*xp++ += thresholds [j];
}
}
}
if (rotate_data) {
rrot.reverse_transform (n, xt, x);
} else if (nbits != d) {
for (idx_t i = 0; i < n; i++) {
memcpy (x + i * d, xt + i * nbits,
nbits * sizeof(xt[0]));
}
}
}
} // namespace faiss

View File

@ -0,0 +1,87 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef INDEX_LSH_H
#define INDEX_LSH_H
#include <vector>
#include <faiss/Index.h>
#include <faiss/VectorTransform.h>
namespace faiss {
/** The sign of each vector component is put in a binary signature */
struct IndexLSH:Index {
typedef unsigned char uint8_t;
int nbits; ///< nb of bits per vector
int bytes_per_vec; ///< nb of 8-bits per encoded vector
bool rotate_data; ///< whether to apply a random rotation to input
bool train_thresholds; ///< whether we train thresholds or use 0
RandomRotationMatrix rrot; ///< optional random rotation
std::vector <float> thresholds; ///< thresholds to compare with
/// encoded dataset
std::vector<uint8_t> codes;
IndexLSH (
idx_t d, int nbits,
bool rotate_data = true,
bool train_thresholds = false);
/** Preprocesses and resizes the input to the size required to
* binarize the data
*
* @param x input vectors, size n * d
* @return output vectors, size n * nbits. May be the same pointer
* as x, otherwise it should be deleted by the caller
*/
const float *apply_preprocess (idx_t n, const float *x) const;
void train(idx_t n, const float* x) override;
void add(idx_t n, const float* x) override;
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
void reset() override;
/// transfer the thresholds to a pre-processing stage (and unset
/// train_thresholds)
void transfer_thresholds (LinearTransform * vt);
~IndexLSH() override {}
IndexLSH ();
/* standalone codec interface */
size_t sa_code_size () const override;
void sa_encode (idx_t n, const float *x,
uint8_t *bytes) const override;
void sa_decode (idx_t n, const uint8_t *bytes,
float *x) const override;
};
}
#endif
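A minimal usage sketch of IndexLSH (toy parameters assumed); note that the returned distances are Hamming distances cast to float.

#include <random>
#include <vector>
#include <faiss/IndexLSH.h>

int main() {
    int d = 64, nbits = 128;
    size_t nb = 2000, nq = 2, k = 4;
    std::vector<float> xb(nb * d), xq(nq * d);
    std::mt19937 rng(7);
    std::normal_distribution<float> g(0, 1);
    for (auto& v : xb) v = g(rng);
    for (auto& v : xq) v = g(rng);

    // random rotation on, learned per-bit thresholds on
    faiss::IndexLSH index(d, nbits, /*rotate_data=*/true,
                          /*train_thresholds=*/true);
    index.train(nb, xb.data());
    index.add(nb, xb.data());

    std::vector<float> D(nq * k);  // Hamming distances, as floats
    std::vector<faiss::Index::idx_t> I(nq * k);
    index.search(nq, xq.data(), k, D.data(), I.data());
    return 0;
}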

View File

@ -0,0 +1,143 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexLattice.h>
#include <faiss/utils/hamming.h> // for the bitstring routines
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/distances.h>
namespace faiss {
IndexLattice::IndexLattice (idx_t d, int nsq, int scale_nbit, int r2):
Index (d),
nsq (nsq),
dsq (d / nsq),
zn_sphere_codec (dsq, r2),
scale_nbit (scale_nbit)
{
FAISS_THROW_IF_NOT (d % nsq == 0);
lattice_nbit = 0;
while (!( ((uint64_t)1 << lattice_nbit) >= zn_sphere_codec.nv)) {
lattice_nbit++;
}
int total_nbit = (lattice_nbit + scale_nbit) * nsq;
code_size = (total_nbit + 7) / 8;
is_trained = false;
}
void IndexLattice::train(idx_t n, const float* x)
{
// compute ranges per sub-block
trained.resize (nsq * 2);
float * mins = trained.data();
float * maxs = trained.data() + nsq;
for (int sq = 0; sq < nsq; sq++) {
mins[sq] = HUGE_VAL;
maxs[sq] = -1;
}
for (idx_t i = 0; i < n; i++) {
for (int sq = 0; sq < nsq; sq++) {
float norm2 = fvec_norm_L2sqr (x + i * d + sq * dsq, dsq);
if (norm2 > maxs[sq]) maxs[sq] = norm2;
if (norm2 < mins[sq]) mins[sq] = norm2;
}
}
for (int sq = 0; sq < nsq; sq++) {
mins[sq] = sqrtf (mins[sq]);
maxs[sq] = sqrtf (maxs[sq]);
}
is_trained = true;
}
/* The standalone codec interface */
size_t IndexLattice::sa_code_size () const
{
return code_size;
}
void IndexLattice::sa_encode (idx_t n, const float *x, uint8_t *codes) const
{
const float * mins = trained.data();
const float * maxs = mins + nsq;
int64_t sc = int64_t(1) << scale_nbit;
#pragma omp parallel for
for (idx_t i = 0; i < n; i++) {
BitstringWriter wr(codes + i * code_size, code_size);
const float *xi = x + i * d;
for (int j = 0; j < nsq; j++) {
float nj =
(sqrtf(fvec_norm_L2sqr(xi, dsq)) - mins[j])
* sc / (maxs[j] - mins[j]);
if (nj < 0) nj = 0;
if (nj >= sc) nj = sc - 1;
wr.write((int64_t)nj, scale_nbit);
wr.write(zn_sphere_codec.encode(xi), lattice_nbit);
xi += dsq;
}
}
}
void IndexLattice::sa_decode (idx_t n, const uint8_t *codes, float *x) const
{
const float * mins = trained.data();
const float * maxs = mins + nsq;
float sc = int64_t(1) << scale_nbit;
float r = sqrtf(zn_sphere_codec.r2);
#pragma omp parallel for
for (idx_t i = 0; i < n; i++) {
BitstringReader rd(codes + i * code_size, code_size);
float *xi = x + i * d;
for (int j = 0; j < nsq; j++) {
float norm =
(rd.read (scale_nbit) + 0.5) *
(maxs[j] - mins[j]) / sc + mins[j];
norm /= r;
zn_sphere_codec.decode (rd.read (lattice_nbit), xi);
for (int l = 0; l < dsq; l++) {
xi[l] *= norm;
}
xi += dsq;
}
}
}
void IndexLattice::add(idx_t , const float* )
{
FAISS_THROW_MSG("not implemented");
}
void IndexLattice::search(idx_t , const float* , idx_t ,
float* , idx_t* ) const
{
FAISS_THROW_MSG("not implemented");
}
void IndexLattice::reset()
{
FAISS_THROW_MSG("not implemented");
}
} // namespace faiss

View File

@ -0,0 +1,68 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_LATTICE_H
#define FAISS_INDEX_LATTICE_H
#include <vector>
#include <faiss/IndexIVF.h>
#include <faiss/impl/lattice_Zn.h>
namespace faiss {
/** Index that encodes a vector with a series of Zn lattice quantizers
*/
struct IndexLattice: Index {
/// number of sub-vectors
int nsq;
/// dimension of sub-vectors
size_t dsq;
/// the lattice quantizer
ZnSphereCodecAlt zn_sphere_codec;
/// nb bits used to encode the scale, per subvector
int scale_nbit, lattice_nbit;
/// total, in bytes
size_t code_size;
/// mins and maxes of the vector norms, per subquantizer
std::vector<float> trained;
IndexLattice (idx_t d, int nsq, int scale_nbit, int r2);
void train(idx_t n, const float* x) override;
/* The standalone codec interface */
size_t sa_code_size () const override;
void sa_encode (idx_t n, const float *x,
uint8_t *bytes) const override;
void sa_decode (idx_t n, const uint8_t *bytes,
float *x) const override;
/// not implemented
void add(idx_t n, const float* x) override;
void search(idx_t n, const float* x, idx_t k,
float* distances, idx_t* labels) const override;
void reset() override;
};
} // namespace faiss
#endif
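Since add/search deliberately throw above, the index is driven through the standalone codec interface; a sketch under assumed toy parameters (whether a given d / nsq, r2 pair is accepted depends on the Zn sphere codec).

#include <cstdint>
#include <random>
#include <vector>
#include <faiss/IndexLattice.h>

int main() {
    int d = 32, nsq = 4, scale_nbit = 4, r2 = 10;  // sub-vectors of dim 8
    size_t n = 1000;
    std::vector<float> x(n * d);
    std::mt19937 rng(0);
    std::normal_distribution<float> g(0, 1);
    for (auto& v : x) v = g(rng);

    faiss::IndexLattice codec(d, nsq, scale_nbit, r2);
    codec.train(n, x.data());  // records per-sub-vector norm ranges

    std::vector<uint8_t> codes(n * codec.sa_code_size());
    codec.sa_encode(n, x.data(), codes.data());

    std::vector<float> x2(n * d);
    codec.sa_decode(n, codes.data(), x2.data());  // lossy reconstruction
    return 0;
}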

File diff suppressed because it is too large

View File

@ -0,0 +1,199 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_PQ_H
#define FAISS_INDEX_PQ_H
#include <stdint.h>
#include <vector>
#include <faiss/Index.h>
#include <faiss/impl/ProductQuantizer.h>
#include <faiss/impl/PolysemousTraining.h>
namespace faiss {
/** Index based on a product quantizer. Stored vectors are
* approximated by PQ codes. */
struct IndexPQ: Index {
/// The product quantizer used to encode the vectors
ProductQuantizer pq;
/// Codes. Size ntotal * pq.code_size
std::vector<uint8_t> codes;
/** Constructor.
*
* @param d dimensionality of the input vectors
* @param M number of subquantizers
* @param nbits number of bit per subvector index
*/
IndexPQ (int d, ///< dimensionality of the input vectors
size_t M, ///< number of subquantizers
size_t nbits, ///< number of bit per subvector index
MetricType metric = METRIC_L2);
IndexPQ ();
void train(idx_t n, const float* x) override;
void add(idx_t n, const float* x) override;
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
void reset() override;
void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
void reconstruct(idx_t key, float* recons) const override;
size_t remove_ids(const IDSelector& sel) override;
/* The standalone codec interface */
size_t sa_code_size () const override;
void sa_encode (idx_t n, const float *x,
uint8_t *bytes) const override;
void sa_decode (idx_t n, const uint8_t *bytes,
float *x) const override;
DistanceComputer * get_distance_computer() const override;
/******************************************************
* Polysemous codes implementation
******************************************************/
bool do_polysemous_training; ///< false = standard PQ
/// parameters used for the polysemous training
PolysemousTraining polysemous_training;
/// how to perform the search in search_core
enum Search_type_t {
ST_PQ, ///< asymmetric product quantizer (default)
ST_HE, ///< Hamming distance on codes
ST_generalized_HE, ///< generalized Hamming: nb of identical sub-codes
ST_SDC, ///< symmetric product quantizer (SDC)
ST_polysemous, ///< HE filter (using ht) + PQ combination
ST_polysemous_generalize, ///< Filter on generalized Hamming
};
Search_type_t search_type;
// just encode the sign of the components, instead of using the PQ encoder
// used only for the queries
bool encode_signs;
/// Hamming threshold used for polysemy
int polysemous_ht;
// actual polysemous search
void search_core_polysemous (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels) const;
/// prepare query for a polysemous search, but instead of
/// computing the result, just get the histogram of Hamming
/// distances. May be computed on a provided dataset if xb != NULL
/// @param dist_histogram (M * nbits + 1)
void hamming_distance_histogram (idx_t n, const float *x,
idx_t nb, const float *xb,
int64_t *dist_histogram);
/** compute pairwise distances between queries and database
*
* @param n nb of query vectors
* @param x query vector, size n * d
* @param dis output distances, size n * ntotal
*/
void hamming_distance_table (idx_t n, const float *x,
int32_t *dis) const;
};
/// statistics are robust to internal threading, but not if
/// IndexPQ::search is called by multiple threads
struct IndexPQStats {
size_t nq; // nb of queries run
size_t ncode; // nb of codes visited
size_t n_hamming_pass; // nb of passed Hamming distance tests (for polysemy)
IndexPQStats () {reset (); }
void reset ();
};
extern IndexPQStats indexPQ_stats;
/** Quantizer where centroids are virtual: they are the Cartesian
* product of sub-centroids. */
struct MultiIndexQuantizer: Index {
ProductQuantizer pq;
MultiIndexQuantizer (int d, ///< dimension of the input vectors
size_t M, ///< number of subquantizers
size_t nbits); ///< number of bit per subvector index
void train(idx_t n, const float* x) override;
void search(
idx_t n, const float* x, idx_t k,
float* distances, idx_t* labels) const override;
/// add and reset will crash at runtime
void add(idx_t n, const float* x) override;
void reset() override;
MultiIndexQuantizer () {}
void reconstruct(idx_t key, float* recons) const override;
};
/** MultiIndexQuantizer where the PQ assignment is performed by sub-indexes
*/
struct MultiIndexQuantizer2: MultiIndexQuantizer {
/// M Indexes on d / M dimensions
std::vector<Index*> assign_indexes;
bool own_fields;
MultiIndexQuantizer2 (
int d, size_t M, size_t nbits,
Index **indexes);
MultiIndexQuantizer2 (
int d, size_t nbits,
Index *assign_index_0,
Index *assign_index_1);
void train(idx_t n, const float* x) override;
void search(
idx_t n, const float* x, idx_t k,
float* distances, idx_t* labels) const override;
};
} // namespace faiss
#endif
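A sketch of a polysemous-codes round trip with the members declared above; the toy parameters and the Hamming threshold are illustrative assumptions.

#include <random>
#include <vector>
#include <faiss/IndexPQ.h>

int main() {
    int d = 64;
    size_t M = 8, nbits = 8, nb = 20000, nq = 4, k = 10;
    std::vector<float> xb(nb * d), xq(nq * d);
    std::mt19937 rng(1);
    std::uniform_real_distribution<float> u(0, 1);
    for (auto& v : xb) v = u(rng);
    for (auto& v : xq) v = u(rng);

    faiss::IndexPQ index(d, M, nbits);
    index.do_polysemous_training = true;  // reorder PQ centroids for Hamming filtering
    index.train(nb, xb.data());
    index.add(nb, xb.data());

    index.search_type = faiss::IndexPQ::ST_polysemous;
    index.polysemous_ht = 54;  // Hamming threshold, out of M * nbits = 64 bits
    std::vector<float> D(nq * k);
    std::vector<faiss::Index::idx_t> I(nq * k);
    index.search(nq, xq.data(), k, D.data(), I.data());
    return 0;
}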

View File

@ -0,0 +1,288 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexPreTransform.h>
#include <cstdio>
#include <cmath>
#include <cstring>
#include <memory>
#include <faiss/utils/utils.h>
#include <faiss/impl/FaissAssert.h>
namespace faiss {
/*********************************************
* IndexPreTransform
*********************************************/
IndexPreTransform::IndexPreTransform ():
index(nullptr), own_fields (false)
{
}
IndexPreTransform::IndexPreTransform (
Index * index):
Index (index->d, index->metric_type),
index (index), own_fields (false)
{
is_trained = index->is_trained;
ntotal = index->ntotal;
}
IndexPreTransform::IndexPreTransform (
VectorTransform * ltrans,
Index * index):
Index (index->d, index->metric_type),
index (index), own_fields (false)
{
is_trained = index->is_trained;
ntotal = index->ntotal;
prepend_transform (ltrans);
}
void IndexPreTransform::prepend_transform (VectorTransform *ltrans)
{
FAISS_THROW_IF_NOT (ltrans->d_out == d);
is_trained = is_trained && ltrans->is_trained;
chain.insert (chain.begin(), ltrans);
d = ltrans->d_in;
}
IndexPreTransform::~IndexPreTransform ()
{
if (own_fields) {
for (int i = 0; i < chain.size(); i++)
delete chain[i];
delete index;
}
}
void IndexPreTransform::train (idx_t n, const float *x)
{
int last_untrained = 0;
if (!index->is_trained) {
last_untrained = chain.size();
} else {
for (int i = chain.size() - 1; i >= 0; i--) {
if (!chain[i]->is_trained) {
last_untrained = i;
break;
}
}
}
const float *prev_x = x;
ScopeDeleter<float> del;
if (verbose) {
printf("IndexPreTransform::train: training chain 0 to %d\n",
last_untrained);
}
for (int i = 0; i <= last_untrained; i++) {
if (i < chain.size()) {
VectorTransform *ltrans = chain [i];
if (!ltrans->is_trained) {
if (verbose) {
printf(" Training chain component %d/%zd\n",
i, chain.size());
if (OPQMatrix *opqm = dynamic_cast<OPQMatrix*>(ltrans)) {
opqm->verbose = true;
}
}
ltrans->train (n, prev_x);
}
} else {
if (verbose) {
printf(" Training sub-index\n");
}
index->train (n, prev_x);
}
if (i == last_untrained) break;
if (verbose) {
printf(" Applying transform %d/%zd\n",
i, chain.size());
}
float * xt = chain[i]->apply (n, prev_x);
if (prev_x != x) delete [] prev_x;
prev_x = xt;
del.set(xt);
}
is_trained = true;
}
const float *IndexPreTransform::apply_chain (idx_t n, const float *x) const
{
const float *prev_x = x;
ScopeDeleter<float> del;
for (int i = 0; i < chain.size(); i++) {
float * xt = chain[i]->apply (n, prev_x);
ScopeDeleter<float> del2 (xt);
del2.swap (del);
prev_x = xt;
}
del.release ();
return prev_x;
}
void IndexPreTransform::reverse_chain (idx_t n, const float* xt, float* x) const
{
const float* next_x = xt;
ScopeDeleter<float> del;
for (int i = chain.size() - 1; i >= 0; i--) {
float* prev_x = (i == 0) ? x : new float [n * chain[i]->d_in];
ScopeDeleter<float> del2 ((prev_x == x) ? nullptr : prev_x);
chain [i]->reverse_transform (n, next_x, prev_x);
del2.swap (del);
next_x = prev_x;
}
}
void IndexPreTransform::add (idx_t n, const float *x)
{
FAISS_THROW_IF_NOT (is_trained);
const float *xt = apply_chain (n, x);
ScopeDeleter<float> del(xt == x ? nullptr : xt);
index->add (n, xt);
ntotal = index->ntotal;
}
void IndexPreTransform::add_with_ids (idx_t n, const float * x,
const idx_t *xids)
{
FAISS_THROW_IF_NOT (is_trained);
const float *xt = apply_chain (n, x);
ScopeDeleter<float> del(xt == x ? nullptr : xt);
index->add_with_ids (n, xt, xids);
ntotal = index->ntotal;
}
void IndexPreTransform::search (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels) const
{
FAISS_THROW_IF_NOT (is_trained);
const float *xt = apply_chain (n, x);
ScopeDeleter<float> del(xt == x ? nullptr : xt);
index->search (n, xt, k, distances, labels);
}
void IndexPreTransform::range_search (idx_t n, const float* x, float radius,
RangeSearchResult* result) const
{
FAISS_THROW_IF_NOT (is_trained);
const float *xt = apply_chain (n, x);
ScopeDeleter<float> del(xt == x ? nullptr : xt);
index->range_search (n, xt, radius, result);
}
void IndexPreTransform::reset () {
index->reset();
ntotal = 0;
}
size_t IndexPreTransform::remove_ids (const IDSelector & sel) {
size_t nremove = index->remove_ids (sel);
ntotal = index->ntotal;
return nremove;
}
void IndexPreTransform::reconstruct (idx_t key, float * recons) const
{
float *x = chain.empty() ? recons : new float [index->d];
ScopeDeleter<float> del (recons == x ? nullptr : x);
// Initial reconstruction
index->reconstruct (key, x);
// Revert transformations from last to first
reverse_chain (1, x, recons);
}
void IndexPreTransform::reconstruct_n (idx_t i0, idx_t ni, float *recons) const
{
float *x = chain.empty() ? recons : new float [ni * index->d];
ScopeDeleter<float> del (recons == x ? nullptr : x);
// Initial reconstruction
index->reconstruct_n (i0, ni, x);
// Revert transformations from last to first
reverse_chain (ni, x, recons);
}
void IndexPreTransform::search_and_reconstruct (
idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels, float* recons) const
{
FAISS_THROW_IF_NOT (is_trained);
const float* xt = apply_chain (n, x);
ScopeDeleter<float> del ((xt == x) ? nullptr : xt);
float* recons_temp = chain.empty() ? recons : new float [n * k * index->d];
ScopeDeleter<float> del2 ((recons_temp == recons) ? nullptr : recons_temp);
index->search_and_reconstruct (n, xt, k, distances, labels, recons_temp);
// Revert transformations from last to first
reverse_chain (n * k, recons_temp, recons);
}
size_t IndexPreTransform::sa_code_size () const
{
return index->sa_code_size ();
}
void IndexPreTransform::sa_encode (idx_t n, const float *x,
uint8_t *bytes) const
{
if (chain.empty()) {
index->sa_encode (n, x, bytes);
} else {
const float *xt = apply_chain (n, x);
ScopeDeleter<float> del(xt == x ? nullptr : xt);
index->sa_encode (n, xt, bytes);
}
}
void IndexPreTransform::sa_decode (idx_t n, const uint8_t *bytes,
float *x) const
{
if (chain.empty()) {
index->sa_decode (n, bytes, x);
} else {
std::unique_ptr<float []> x1 (new float [index->d * n]);
index->sa_decode (n, bytes, x1.get());
// Revert transformations from last to first
reverse_chain (n, x1.get(), x);
}
}
} // namespace faiss

View File

@ -0,0 +1,91 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <faiss/Index.h>
#include <faiss/VectorTransform.h>
namespace faiss {
/** Index that applies a LinearTransform transform on vectors before
* handing them over to a sub-index */
struct IndexPreTransform: Index {
std::vector<VectorTransform *> chain; ///! chain of transforms
Index * index; ///! the sub-index
bool own_fields; ///! whether pointers are deleted in destructor
explicit IndexPreTransform (Index *index);
IndexPreTransform ();
/// ltrans is the last transform before the index
IndexPreTransform (VectorTransform * ltrans, Index * index);
void prepend_transform (VectorTransform * ltrans);
void train(idx_t n, const float* x) override;
void add(idx_t n, const float* x) override;
void add_with_ids(idx_t n, const float* x, const idx_t* xids) override;
void reset() override;
/** removes IDs from the index. Not supported by all indexes.
*/
size_t remove_ids(const IDSelector& sel) override;
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
/* range search, no attempt is done to change the radius */
void range_search (idx_t n, const float* x, float radius,
RangeSearchResult* result) const override;
void reconstruct (idx_t key, float * recons) const override;
void reconstruct_n (idx_t i0, idx_t ni, float *recons)
const override;
void search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *recons) const override;
/// apply the transforms in the chain. The returned float * may be
/// equal to x, otherwise it should be deallocated.
const float * apply_chain (idx_t n, const float *x) const;
/// Reverse the transforms in the chain. May not be implemented for
/// all transforms in the chain or may return approximate results.
void reverse_chain (idx_t n, const float* xt, float* x) const;
/* standalone codec interface */
size_t sa_code_size () const override;
void sa_encode (idx_t n, const float *x,
uint8_t *bytes) const override;
void sa_decode (idx_t n, const uint8_t *bytes,
float *x) const override;
~IndexPreTransform() override;
};
} // namespace faiss
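A minimal sketch chaining a PCA down-projection in front of a flat sub-index, as described above; the dimensions and data are illustrative.

#include <random>
#include <vector>
#include <faiss/IndexFlat.h>
#include <faiss/IndexPreTransform.h>
#include <faiss/VectorTransform.h>

int main() {
    int d_in = 128, d_out = 32;
    size_t nb = 5000, nq = 2, k = 5;
    std::vector<float> xb(nb * d_in), xq(nq * d_in);
    std::mt19937 rng(3);
    std::normal_distribution<float> g(0, 1);
    for (auto& v : xb) v = g(rng);
    for (auto& v : xq) v = g(rng);

    // PCA to 32 dims, then exact L2 search in the reduced space
    auto* pca = new faiss::PCAMatrix(d_in, d_out);
    auto* sub = new faiss::IndexFlatL2(d_out);
    faiss::IndexPreTransform index(pca, sub);
    index.own_fields = true;     // wrapper deletes pca and sub
    index.train(nb, xb.data());  // trains the PCA, then the sub-index
    index.add(nb, xb.data());

    std::vector<float> D(nq * k);
    std::vector<faiss::Index::idx_t> I(nq * k);
    index.search(nq, xq.data(), k, D.data(), I.data());
    return 0;
}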

View File

@ -0,0 +1,123 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/IndexReplicas.h>
#include <faiss/impl/FaissAssert.h>
namespace faiss {
template <typename IndexT>
IndexReplicasTemplate<IndexT>::IndexReplicasTemplate(bool threaded)
: ThreadedIndex<IndexT>(threaded) {
}
template <typename IndexT>
IndexReplicasTemplate<IndexT>::IndexReplicasTemplate(idx_t d, bool threaded)
: ThreadedIndex<IndexT>(d, threaded) {
}
template <typename IndexT>
IndexReplicasTemplate<IndexT>::IndexReplicasTemplate(int d, bool threaded)
: ThreadedIndex<IndexT>(d, threaded) {
}
template <typename IndexT>
void
IndexReplicasTemplate<IndexT>::onAfterAddIndex(IndexT* index) {
// Make sure that the parameters are the same for all prior indices, unless
// we're the first index to be added
if (this->count() > 0 && this->at(0) != index) {
auto existing = this->at(0);
FAISS_THROW_IF_NOT_FMT(index->ntotal == existing->ntotal,
"IndexReplicas: newly added index does "
"not have same number of vectors as prior index; "
"prior index has %ld vectors, new index has %ld",
existing->ntotal, index->ntotal);
FAISS_THROW_IF_NOT_MSG(index->is_trained == existing->is_trained,
"IndexReplicas: newly added index does "
"not have same train status as prior index");
} else {
// Set our parameters based on the first index we're adding
// (dimension is handled in ThreadedIndex)
this->ntotal = index->ntotal;
this->verbose = index->verbose;
this->is_trained = index->is_trained;
this->metric_type = index->metric_type;
}
}
template <typename IndexT>
void
IndexReplicasTemplate<IndexT>::train(idx_t n, const component_t* x) {
this->runOnIndex([n, x](int, IndexT* index){ index->train(n, x); });
}
template <typename IndexT>
void
IndexReplicasTemplate<IndexT>::add(idx_t n, const component_t* x) {
this->runOnIndex([n, x](int, IndexT* index){ index->add(n, x); });
this->ntotal += n;
}
template <typename IndexT>
void
IndexReplicasTemplate<IndexT>::reconstruct(idx_t n, component_t* x) const {
FAISS_THROW_IF_NOT_MSG(this->count() > 0, "no replicas in index");
// Just pass to the first replica
this->at(0)->reconstruct(n, x);
}
template <typename IndexT>
void
IndexReplicasTemplate<IndexT>::search(idx_t n,
const component_t* x,
idx_t k,
distance_t* distances,
idx_t* labels) const {
FAISS_THROW_IF_NOT_MSG(this->count() > 0, "no replicas in index");
if (n == 0) {
return;
}
auto dim = this->d;
size_t componentsPerVec =
sizeof(component_t) == 1 ? (dim + 7) / 8 : dim;
// Partition the query by the number of indices we have
faiss::Index::idx_t queriesPerIndex =
(faiss::Index::idx_t) (n + this->count() - 1) /
(faiss::Index::idx_t) this->count();
FAISS_ASSERT(n / queriesPerIndex <= this->count());
auto fn =
[queriesPerIndex, componentsPerVec,
n, x, k, distances, labels](int i, const IndexT* index) {
faiss::Index::idx_t base = (faiss::Index::idx_t) i * queriesPerIndex;
if (base < n) {
auto numForIndex = std::min(queriesPerIndex, n - base);
index->search(numForIndex,
x + base * componentsPerVec,
k,
distances + base * k,
labels + base * k);
}
};
this->runOnIndex(fn);
}
// explicit instantiations
template struct IndexReplicasTemplate<Index>;
template struct IndexReplicasTemplate<IndexBinary>;
} // namespace

View File

@ -0,0 +1,76 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/Index.h>
#include <faiss/IndexBinary.h>
#include <faiss/impl/ThreadedIndex.h>
namespace faiss {
/// Takes individual faiss::Index instances, and splits queries for
/// sending to each Index instance, and joins the results together
/// when done.
/// Each index is managed by a separate CPU thread.
template <typename IndexT>
class IndexReplicasTemplate : public ThreadedIndex<IndexT> {
public:
using idx_t = typename IndexT::idx_t;
using component_t = typename IndexT::component_t;
using distance_t = typename IndexT::distance_t;
/// The dimension that all sub-indices must share will be the dimension of the
/// first sub-index added
/// @param threaded do we use one thread per sub-index or do queries
/// sequentially?
explicit IndexReplicasTemplate(bool threaded = true);
/// @param d the dimension that all sub-indices must share
/// @param threaded do we use one thread per sub index or do queries
/// sequentially?
explicit IndexReplicasTemplate(idx_t d, bool threaded = true);
/// int version due to the implicit bool conversion ambiguity of int as
/// dimension
explicit IndexReplicasTemplate(int d, bool threaded = true);
/// Alias for addIndex()
void add_replica(IndexT* index) { this->addIndex(index); }
/// Alias for removeIndex()
void remove_replica(IndexT* index) { this->removeIndex(index); }
/// faiss::Index API
/// All indices receive the same call
void train(idx_t n, const component_t* x) override;
/// faiss::Index API
/// All indices receive the same call
void add(idx_t n, const component_t* x) override;
/// faiss::Index API
/// Query is partitioned into a slice for each sub-index
/// split by ceil(n / #indices) for our sub-indices
void search(idx_t n,
const component_t* x,
idx_t k,
distance_t* distances,
idx_t* labels) const override;
/// reconstructs from the first index
void reconstruct(idx_t, component_t *v) const override;
protected:
/// Called just after an index is added
void onAfterAddIndex(IndexT* index) override;
};
using IndexReplicas = IndexReplicasTemplate<Index>;
using IndexBinaryReplicas = IndexReplicasTemplate<IndexBinary>;
} // namespace
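A minimal sketch with two flat replicas; all parameters are illustrative.

#include <random>
#include <vector>
#include <faiss/IndexFlat.h>
#include <faiss/IndexReplicas.h>

int main() {
    int d = 64;
    size_t nb = 1000, nq = 8, k = 5;
    std::vector<float> xb(nb * d), xq(nq * d);
    std::mt19937 rng(9);
    std::uniform_real_distribution<float> u(0, 1);
    for (auto& v : xb) v = u(rng);
    for (auto& v : xq) v = u(rng);

    faiss::IndexFlatL2 r0(d), r1(d);   // two identical replicas
    faiss::IndexReplicas replicas(d);  // one worker thread per replica
    replicas.add_replica(&r0);
    replicas.add_replica(&r1);
    replicas.add(nb, xb.data());       // every replica receives all vectors

    std::vector<float> D(nq * k);
    std::vector<faiss::Index::idx_t> I(nq * k);
    // queries are split across the replicas and run concurrently
    replicas.search(nq, xq.data(), k, D.data(), I.data());
    return 0;
}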

View File

@ -0,0 +1,183 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <cstdio>
#include <algorithm>
#include <omp.h>
#include <faiss/utils/utils.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/ScalarQuantizer.h>
#include <faiss/IndexSQHybrid.h>
namespace faiss {
/*******************************************************************
* IndexIVFSQHybrid implementation
********************************************************************/
IndexIVFSQHybrid::IndexIVFSQHybrid (
Index *quantizer, size_t d, size_t nlist,
ScalarQuantizer::QuantizerType qtype,
MetricType metric, bool encode_residual)
: IndexIVF(quantizer, d, nlist, 0, metric),
sq(d, qtype),
by_residual(encode_residual)
{
code_size = sq.code_size;
// was not known at construction time
invlists->code_size = code_size;
is_trained = false;
}
IndexIVFSQHybrid::IndexIVFSQHybrid ():
IndexIVF(),
by_residual(true)
{
}
void IndexIVFSQHybrid::train_residual (idx_t n, const float *x)
{
sq.train_residual(n, x, quantizer, by_residual, verbose);
}
void IndexIVFSQHybrid::encode_vectors(idx_t n, const float* x,
const idx_t *list_nos,
uint8_t * codes,
bool include_listnos) const
{
std::unique_ptr<ScalarQuantizer::Quantizer> squant (sq.select_quantizer ());
size_t coarse_size = include_listnos ? coarse_code_size () : 0;
memset(codes, 0, (code_size + coarse_size) * n);
#pragma omp parallel if(n > 1)
{
std::vector<float> residual (d);
#pragma omp for
for (size_t i = 0; i < n; i++) {
int64_t list_no = list_nos [i];
if (list_no >= 0) {
const float *xi = x + i * d;
uint8_t *code = codes + i * (code_size + coarse_size);
if (by_residual) {
quantizer->compute_residual (
xi, residual.data(), list_no);
xi = residual.data ();
}
if (coarse_size) {
encode_listno (list_no, code);
}
squant->encode_vector (xi, code + coarse_size);
}
}
}
}
void IndexIVFSQHybrid::sa_decode (idx_t n, const uint8_t *codes,
float *x) const
{
std::unique_ptr<ScalarQuantizer::Quantizer> squant (sq.select_quantizer ());
size_t coarse_size = coarse_code_size ();
#pragma omp parallel if(n > 1)
{
std::vector<float> residual (d);
#pragma omp for
for (size_t i = 0; i < n; i++) {
const uint8_t *code = codes + i * (code_size + coarse_size);
int64_t list_no = decode_listno (code);
float *xi = x + i * d;
squant->decode_vector (code + coarse_size, xi);
if (by_residual) {
quantizer->reconstruct (list_no, residual.data());
for (size_t j = 0; j < d; j++) {
xi[j] += residual[j];
}
}
}
}
}
void IndexIVFSQHybrid::add_with_ids
(idx_t n, const float * x, const idx_t *xids)
{
FAISS_THROW_IF_NOT (is_trained);
std::unique_ptr<int64_t []> idx (new int64_t [n]);
quantizer->assign (n, x, idx.get());
size_t nadd = 0;
std::unique_ptr<ScalarQuantizer::Quantizer> squant(sq.select_quantizer ());
#pragma omp parallel reduction(+: nadd)
{
std::vector<float> residual (d);
std::vector<uint8_t> one_code (code_size);
int nt = omp_get_num_threads();
int rank = omp_get_thread_num();
// each thread takes care of a subset of lists
for (size_t i = 0; i < n; i++) {
int64_t list_no = idx [i];
if (list_no >= 0 && list_no % nt == rank) {
int64_t id = xids ? xids[i] : ntotal + i;
const float * xi = x + i * d;
if (by_residual) {
quantizer->compute_residual (xi, residual.data(), list_no);
xi = residual.data();
}
memset (one_code.data(), 0, code_size);
squant->encode_vector (xi, one_code.data());
invlists->add_entry (list_no, id, one_code.data());
nadd++;
}
}
}
ntotal += n;
}
InvertedListScanner* IndexIVFSQHybrid::get_InvertedListScanner
(bool store_pairs) const
{
return sq.select_InvertedListScanner (metric_type, quantizer, store_pairs,
by_residual);
}
void IndexIVFSQHybrid::reconstruct_from_offset (int64_t list_no,
int64_t offset,
float* recons) const
{
std::vector<float> centroid(d);
quantizer->reconstruct (list_no, centroid.data());
const uint8_t* code = invlists->get_single_code (list_no, offset);
sq.decode (code, recons, 1);
for (int i = 0; i < d; ++i) {
recons[i] += centroid[i];
}
}
} // namespace faiss

View File

@ -0,0 +1,65 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_SQ_HYBRID_H
#define FAISS_INDEX_SQ_HYBRID_H
#include <stdint.h>
#include <vector>
#include <faiss/IndexIVF.h>
#include <faiss/impl/ScalarQuantizer.h>
namespace faiss {
/** An IVF implementation where the components of the residuals are
* encoded with a scalar uniform quantizer. All distance computations
* are asymmetric, so the encoded vectors are decoded and approximate
* distances are computed.
*/
struct IndexIVFSQHybrid: IndexIVF {
ScalarQuantizer sq;
bool by_residual;
IndexIVFSQHybrid(Index *quantizer, size_t d, size_t nlist,
ScalarQuantizer::QuantizerType qtype,
MetricType metric = METRIC_L2,
bool encode_residual = true);
IndexIVFSQHybrid();
void train_residual(idx_t n, const float* x) override;
void encode_vectors(idx_t n, const float* x,
const idx_t *list_nos,
uint8_t * codes,
bool include_listnos=false) const override;
void add_with_ids(idx_t n, const float* x, const idx_t* xids) override;
InvertedListScanner *get_InvertedListScanner (bool store_pairs)
const override;
void reconstruct_from_offset (int64_t list_no, int64_t offset,
float* recons) const override;
/* standalone codec interface */
void sa_decode (idx_t n, const uint8_t *bytes,
float *x) const override;
};
}
#endif
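The CPU-side API mirrors IndexIVFScalarQuantizer; a minimal construction sketch with toy parameters, for illustration only.

#include <faiss/IndexFlat.h>
#include <faiss/IndexSQHybrid.h>

int main() {
    // 64-d vectors, 100 coarse cells, 8-bit codes on the residuals;
    // train / add_with_ids / search then behave as in IndexIVFScalarQuantizer.
    faiss::IndexFlatL2 quantizer(64);
    faiss::IndexIVFSQHybrid index(&quantizer, 64, 100,
                                  faiss::ScalarQuantizer::QT_8bit);
    return 0;
}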

View File

@ -0,0 +1,317 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexScalarQuantizer.h>
#include <cstdio>
#include <algorithm>
#include <omp.h>
#include <faiss/utils/utils.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/ScalarQuantizer.h>
namespace faiss {
/*******************************************************************
* IndexScalarQuantizer implementation
********************************************************************/
IndexScalarQuantizer::IndexScalarQuantizer
(int d, ScalarQuantizer::QuantizerType qtype,
MetricType metric):
Index(d, metric),
sq (d, qtype)
{
is_trained =
qtype == ScalarQuantizer::QT_fp16 ||
qtype == ScalarQuantizer::QT_8bit_direct;
code_size = sq.code_size;
}
IndexScalarQuantizer::IndexScalarQuantizer ():
IndexScalarQuantizer(0, ScalarQuantizer::QT_8bit)
{}
void IndexScalarQuantizer::train(idx_t n, const float* x)
{
sq.train(n, x);
is_trained = true;
}
void IndexScalarQuantizer::add(idx_t n, const float* x)
{
FAISS_THROW_IF_NOT (is_trained);
codes.resize ((n + ntotal) * code_size);
sq.compute_codes (x, &codes[ntotal * code_size], n);
ntotal += n;
}
void IndexScalarQuantizer::search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const
{
FAISS_THROW_IF_NOT (is_trained);
FAISS_THROW_IF_NOT (metric_type == METRIC_L2 ||
metric_type == METRIC_INNER_PRODUCT);
#pragma omp parallel
{
InvertedListScanner* scanner = sq.select_InvertedListScanner
(metric_type, nullptr, true);
ScopeDeleter1<InvertedListScanner> del(scanner);
#pragma omp for
for (size_t i = 0; i < n; i++) {
float * D = distances + k * i;
idx_t * I = labels + k * i;
// re-order heap
if (metric_type == METRIC_L2) {
maxheap_heapify (k, D, I);
} else {
minheap_heapify (k, D, I);
}
scanner->set_query (x + i * d);
scanner->scan_codes (ntotal, codes.data(),
nullptr, D, I, k);
// re-order heap
if (metric_type == METRIC_L2) {
maxheap_reorder (k, D, I);
} else {
minheap_reorder (k, D, I);
}
}
}
}
DistanceComputer *IndexScalarQuantizer::get_distance_computer () const
{
ScalarQuantizer::SQDistanceComputer *dc =
sq.get_distance_computer (metric_type);
dc->code_size = sq.code_size;
dc->codes = codes.data();
return dc;
}
void IndexScalarQuantizer::reset()
{
codes.clear();
ntotal = 0;
}
void IndexScalarQuantizer::reconstruct_n(
idx_t i0, idx_t ni, float* recons) const
{
std::unique_ptr<ScalarQuantizer::Quantizer> squant(sq.select_quantizer ());
for (size_t i = 0; i < ni; i++) {
squant->decode_vector(&codes[(i + i0) * code_size], recons + i * d);
}
}
void IndexScalarQuantizer::reconstruct(idx_t key, float* recons) const
{
reconstruct_n(key, 1, recons);
}
/* Codec interface */
size_t IndexScalarQuantizer::sa_code_size () const
{
return sq.code_size;
}
void IndexScalarQuantizer::sa_encode (idx_t n, const float *x,
uint8_t *bytes) const
{
FAISS_THROW_IF_NOT (is_trained);
sq.compute_codes (x, bytes, n);
}
void IndexScalarQuantizer::sa_decode (idx_t n, const uint8_t *bytes,
float *x) const
{
FAISS_THROW_IF_NOT (is_trained);
sq.decode(bytes, x, n);
}
/*******************************************************************
* IndexIVFScalarQuantizer implementation
********************************************************************/
IndexIVFScalarQuantizer::IndexIVFScalarQuantizer (
Index *quantizer, size_t d, size_t nlist,
ScalarQuantizer::QuantizerType qtype,
MetricType metric, bool encode_residual)
: IndexIVF(quantizer, d, nlist, 0, metric),
sq(d, qtype),
by_residual(encode_residual)
{
code_size = sq.code_size;
// was not known at construction time
invlists->code_size = code_size;
is_trained = false;
}
IndexIVFScalarQuantizer::IndexIVFScalarQuantizer ():
IndexIVF(),
by_residual(true)
{
}
void IndexIVFScalarQuantizer::train_residual (idx_t n, const float *x)
{
sq.train_residual(n, x, quantizer, by_residual, verbose);
}
void IndexIVFScalarQuantizer::encode_vectors(idx_t n, const float* x,
const idx_t *list_nos,
uint8_t * codes,
bool include_listnos) const
{
std::unique_ptr<ScalarQuantizer::Quantizer> squant (sq.select_quantizer ());
size_t coarse_size = include_listnos ? coarse_code_size () : 0;
memset(codes, 0, (code_size + coarse_size) * n);
#pragma omp parallel if(n > 1)
{
std::vector<float> residual (d);
#pragma omp for
for (size_t i = 0; i < n; i++) {
int64_t list_no = list_nos [i];
if (list_no >= 0) {
const float *xi = x + i * d;
uint8_t *code = codes + i * (code_size + coarse_size);
if (by_residual) {
quantizer->compute_residual (
xi, residual.data(), list_no);
xi = residual.data ();
}
if (coarse_size) {
encode_listno (list_no, code);
}
squant->encode_vector (xi, code + coarse_size);
}
}
}
}
void IndexIVFScalarQuantizer::sa_decode (idx_t n, const uint8_t *codes,
float *x) const
{
std::unique_ptr<ScalarQuantizer::Quantizer> squant (sq.select_quantizer ());
size_t coarse_size = coarse_code_size ();
#pragma omp parallel if(n > 1)
{
std::vector<float> residual (d);
#pragma omp for
for (size_t i = 0; i < n; i++) {
const uint8_t *code = codes + i * (code_size + coarse_size);
int64_t list_no = decode_listno (code);
float *xi = x + i * d;
squant->decode_vector (code + coarse_size, xi);
if (by_residual) {
quantizer->reconstruct (list_no, residual.data());
for (size_t j = 0; j < d; j++) {
xi[j] += residual[j];
}
}
}
}
}
void IndexIVFScalarQuantizer::add_with_ids
(idx_t n, const float * x, const idx_t *xids)
{
FAISS_THROW_IF_NOT (is_trained);
std::unique_ptr<int64_t []> idx (new int64_t [n]);
quantizer->assign (n, x, idx.get());
size_t nadd = 0;
std::unique_ptr<ScalarQuantizer::Quantizer> squant(sq.select_quantizer ());
#pragma omp parallel reduction(+: nadd)
{
std::vector<float> residual (d);
std::vector<uint8_t> one_code (code_size);
int nt = omp_get_num_threads();
int rank = omp_get_thread_num();
// each thread takes care of a subset of lists
for (size_t i = 0; i < n; i++) {
int64_t list_no = idx [i];
if (list_no >= 0 && list_no % nt == rank) {
int64_t id = xids ? xids[i] : ntotal + i;
const float * xi = x + i * d;
if (by_residual) {
quantizer->compute_residual (xi, residual.data(), list_no);
xi = residual.data();
}
memset (one_code.data(), 0, code_size);
squant->encode_vector (xi, one_code.data());
invlists->add_entry (list_no, id, one_code.data());
nadd++;
}
}
}
ntotal += n;
}
InvertedListScanner* IndexIVFScalarQuantizer::get_InvertedListScanner
(bool store_pairs) const
{
return sq.select_InvertedListScanner (metric_type, quantizer, store_pairs,
by_residual);
}
void IndexIVFScalarQuantizer::reconstruct_from_offset (int64_t list_no,
int64_t offset,
float* recons) const
{
std::vector<float> centroid(d);
quantizer->reconstruct (list_no, centroid.data());
const uint8_t* code = invlists->get_single_code (list_no, offset);
sq.decode (code, recons, 1);
for (int i = 0; i < d; ++i) {
recons[i] += centroid[i];
}
}
} // namespace faiss

View File

@ -0,0 +1,127 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_SCALAR_QUANTIZER_H
#define FAISS_INDEX_SCALAR_QUANTIZER_H
#include <stdint.h>
#include <vector>
#include <faiss/IndexIVF.h>
#include <faiss/impl/ScalarQuantizer.h>
namespace faiss {
/**
* The uniform quantizer has a range [vmin, vmax]. The range can be
* the same for all dimensions (uniform) or specific per dimension
* (default).
*/
struct IndexScalarQuantizer: Index {
/// Used to encode the vectors
ScalarQuantizer sq;
/// Codes. Size ntotal * code_size
std::vector<uint8_t> codes;
size_t code_size;
/** Constructor.
*
* @param d dimensionality of the input vectors
* @param qtype type of scalar quantizer used for each component
* @param metric metric type used for search
*/
IndexScalarQuantizer (int d,
ScalarQuantizer::QuantizerType qtype,
MetricType metric = METRIC_L2);
IndexScalarQuantizer ();
void train(idx_t n, const float* x) override;
void add(idx_t n, const float* x) override;
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
void reset() override;
void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
void reconstruct(idx_t key, float* recons) const override;
DistanceComputer *get_distance_computer () const override;
/* standalone codec interface */
size_t sa_code_size () const override;
void sa_encode (idx_t n, const float *x,
uint8_t *bytes) const override;
void sa_decode (idx_t n, const uint8_t *bytes,
float *x) const override;
};
/** An IVF implementation where the components of the residuals are
* encoded with a scalar uniform quantizer. All distance computations
* are asymmetric, so the encoded vectors are decoded and approximate
* distances are computed.
*/
struct IndexIVFScalarQuantizer: IndexIVF {
ScalarQuantizer sq;
bool by_residual;
IndexIVFScalarQuantizer(Index *quantizer, size_t d, size_t nlist,
ScalarQuantizer::QuantizerType qtype,
MetricType metric = METRIC_L2,
bool encode_residual = true);
IndexIVFScalarQuantizer();
void train_residual(idx_t n, const float* x) override;
void encode_vectors(idx_t n, const float* x,
const idx_t *list_nos,
uint8_t * codes,
bool include_listnos=false) const override;
void add_with_ids(idx_t n, const float* x, const idx_t* xids) override;
InvertedListScanner *get_InvertedListScanner (bool store_pairs)
const override;
void reconstruct_from_offset (int64_t list_no, int64_t offset,
float* recons) const override;
/* standalone codec interface */
void sa_decode (idx_t n, const uint8_t *bytes,
float *x) const override;
};
}
#endif
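A sketch contrasting the flat and IVF scalar-quantizer variants declared above (toy parameters assumed).

#include <random>
#include <vector>
#include <faiss/IndexFlat.h>
#include <faiss/IndexScalarQuantizer.h>

int main() {
    int d = 64, nlist = 100;
    size_t nb = 10000, nq = 4, k = 10;
    std::vector<float> xb(nb * d), xq(nq * d);
    std::mt19937 rng(5);
    std::uniform_real_distribution<float> u(0, 1);
    for (auto& v : xb) v = u(rng);
    for (auto& v : xq) v = u(rng);

    // flat variant: every vector stored as d 8-bit codes
    faiss::IndexScalarQuantizer flat(d, faiss::ScalarQuantizer::QT_8bit);
    flat.train(nb, xb.data());
    flat.add(nb, xb.data());

    // IVF variant: residuals w.r.t. the coarse centroid are quantized
    faiss::IndexFlatL2 quantizer(d);
    faiss::IndexIVFScalarQuantizer ivf(&quantizer, d, nlist,
                                       faiss::ScalarQuantizer::QT_8bit);
    ivf.train(nb, xb.data());
    ivf.add(nb, xb.data());
    ivf.nprobe = 8;

    std::vector<float> D(nq * k);
    std::vector<faiss::Index::idx_t> I(nq * k);
    ivf.search(nq, xq.data(), k, D.data(), I.data());
    return 0;
}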

View File

@ -0,0 +1,317 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexShards.h>
#include <cstdio>
#include <functional>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/Heap.h>
#include <faiss/utils/WorkerThread.h>
namespace faiss {
// subroutines
namespace {
typedef Index::idx_t idx_t;
// add translation to all valid labels
void translate_labels (long n, idx_t *labels, long translation)
{
if (translation == 0) return;
for (long i = 0; i < n; i++) {
if(labels[i] < 0) continue;
labels[i] += translation;
}
}
/** merge result tables from several shards.
* @param all_distances size nshard * n * k
* @param all_labels idem
* @param translations label translations to apply, size nshard
*/
template <class IndexClass, class C>
void
merge_tables(long n, long k, long nshard,
typename IndexClass::distance_t *distances,
idx_t *labels,
const std::vector<typename IndexClass::distance_t>& all_distances,
const std::vector<idx_t>& all_labels,
const std::vector<long>& translations) {
if (k == 0) {
return;
}
using distance_t = typename IndexClass::distance_t;
long stride = n * k;
#pragma omp parallel
{
std::vector<int> buf (2 * nshard);
int * pointer = buf.data();
int * shard_ids = pointer + nshard;
std::vector<distance_t> buf2 (nshard);
distance_t * heap_vals = buf2.data();
#pragma omp for
for (long i = 0; i < n; i++) {
// the heap maps values to the shard where they are
// produced.
const distance_t *D_in = all_distances.data() + i * k;
const idx_t *I_in = all_labels.data() + i * k;
int heap_size = 0;
for (long s = 0; s < nshard; s++) {
pointer[s] = 0;
if (I_in[stride * s] >= 0) {
heap_push<C> (++heap_size, heap_vals, shard_ids,
D_in[stride * s], s);
}
}
distance_t *D = distances + i * k;
idx_t *I = labels + i * k;
for (int j = 0; j < k; j++) {
if (heap_size == 0) {
I[j] = -1;
D[j] = C::neutral();
} else {
// pop best element
int s = shard_ids[0];
int & p = pointer[s];
D[j] = heap_vals[0];
I[j] = I_in[stride * s + p] + translations[s];
heap_pop<C> (heap_size--, heap_vals, shard_ids);
p++;
if (p < k && I_in[stride * s + p] >= 0) {
heap_push<C> (++heap_size, heap_vals, shard_ids,
D_in[stride * s + p], s);
}
}
}
}
}
}
} // anonymous namespace
template <typename IndexT>
IndexShardsTemplate<IndexT>::IndexShardsTemplate(idx_t d,
bool threaded,
bool successive_ids)
: ThreadedIndex<IndexT>(d, threaded),
successive_ids(successive_ids) {
}
template <typename IndexT>
IndexShardsTemplate<IndexT>::IndexShardsTemplate(int d,
bool threaded,
bool successive_ids)
: ThreadedIndex<IndexT>(d, threaded),
successive_ids(successive_ids) {
}
template <typename IndexT>
IndexShardsTemplate<IndexT>::IndexShardsTemplate(bool threaded,
bool successive_ids)
: ThreadedIndex<IndexT>(threaded),
successive_ids(successive_ids) {
}
template <typename IndexT>
void
IndexShardsTemplate<IndexT>::onAfterAddIndex(IndexT* index /* unused */) {
sync_with_shard_indexes();
}
template <typename IndexT>
void
IndexShardsTemplate<IndexT>::onAfterRemoveIndex(IndexT* index /* unused */) {
sync_with_shard_indexes();
}
template <typename IndexT>
void
IndexShardsTemplate<IndexT>::sync_with_shard_indexes() {
if (!this->count()) {
this->is_trained = false;
this->ntotal = 0;
return;
}
auto firstIndex = this->at(0);
this->metric_type = firstIndex->metric_type;
this->is_trained = firstIndex->is_trained;
this->ntotal = firstIndex->ntotal;
for (int i = 1; i < this->count(); ++i) {
auto index = this->at(i);
FAISS_THROW_IF_NOT(this->metric_type == index->metric_type);
FAISS_THROW_IF_NOT(this->d == index->d);
this->ntotal += index->ntotal;
}
}
template <typename IndexT>
void
IndexShardsTemplate<IndexT>::train(idx_t n,
const component_t *x) {
auto fn =
[n, x](int no, IndexT *index) {
if (index->verbose) {
printf("begin train shard %d on %ld points\n", no, n);
}
index->train(n, x);
if (index->verbose) {
printf("end train shard %d\n", no);
}
};
this->runOnIndex(fn);
sync_with_shard_indexes();
}
template <typename IndexT>
void
IndexShardsTemplate<IndexT>::add(idx_t n,
const component_t *x) {
add_with_ids(n, x, nullptr);
}
template <typename IndexT>
void
IndexShardsTemplate<IndexT>::add_with_ids(idx_t n,
const component_t * x,
const idx_t *xids) {
FAISS_THROW_IF_NOT_MSG(!(successive_ids && xids),
"It makes no sense to pass in ids and "
"request them to be shifted");
if (successive_ids) {
FAISS_THROW_IF_NOT_MSG(!xids,
"It makes no sense to pass in ids and "
"request them to be shifted");
FAISS_THROW_IF_NOT_MSG(this->ntotal == 0,
"when adding to IndexShards with sucessive_ids, "
"only add() in a single pass is supported");
}
idx_t nshard = this->count();
const idx_t *ids = xids;
std::vector<idx_t> aids;
if (!ids && !successive_ids) {
aids.resize(n);
for (idx_t i = 0; i < n; i++) {
aids[i] = this->ntotal + i;
}
ids = aids.data();
}
size_t components_per_vec =
sizeof(component_t) == 1 ? (this->d + 7) / 8 : this->d;
auto fn =
[n, ids, x, nshard, components_per_vec](int no, IndexT *index) {
idx_t i0 = (idx_t) no * n / nshard;
idx_t i1 = ((idx_t) no + 1) * n / nshard;
auto x0 = x + i0 * components_per_vec;
if (index->verbose) {
printf ("begin add shard %d on %ld points\n", no, n);
}
if (ids) {
index->add_with_ids (i1 - i0, x0, ids + i0);
} else {
index->add (i1 - i0, x0);
}
if (index->verbose) {
printf ("end add shard %d on %ld points\n", no, i1 - i0);
}
};
this->runOnIndex(fn);
// This is safe to do here because the current thread controls execution in
// all threads, and nothing else is happening
this->ntotal += n;
}
template <typename IndexT>
void
IndexShardsTemplate<IndexT>::search(idx_t n,
const component_t *x,
idx_t k,
distance_t *distances,
idx_t *labels) const {
long nshard = this->count();
std::vector<distance_t> all_distances(nshard * k * n);
std::vector<idx_t> all_labels(nshard * k * n);
auto fn =
[n, k, x, &all_distances, &all_labels](int no, const IndexT *index) {
if (index->verbose) {
printf ("begin query shard %d on %ld points\n", no, n);
}
index->search (n, x, k,
all_distances.data() + no * k * n,
all_labels.data() + no * k * n);
if (index->verbose) {
printf ("end query shard %d\n", no);
}
};
this->runOnIndex(fn);
std::vector<long> translations(nshard, 0);
// Because we just called runOnIndex above, it is safe to access the sub-index
// ntotal here
if (successive_ids) {
translations[0] = 0;
for (int s = 0; s + 1 < nshard; s++) {
translations[s + 1] = translations[s] + this->at(s)->ntotal;
}
}
if (this->metric_type == METRIC_L2) {
merge_tables<IndexT, CMin<distance_t, int>>(
n, k, nshard, distances, labels,
all_distances, all_labels, translations);
} else {
merge_tables<IndexT, CMax<distance_t, int>>(
n, k, nshard, distances, labels,
all_distances, all_labels, translations);
}
}
// explicit instantiations
template struct IndexShardsTemplate<Index>;
template struct IndexShardsTemplate<IndexBinary>;
} // namespace faiss
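The heap logic inside merge_tables is a standard k-way merge of per-shard sorted result lists. A standalone sketch of the same idea using std::priority_queue (illustration only, not faiss code; shard contents are made up):

#include <cstdio>
#include <functional>
#include <queue>
#include <tuple>
#include <vector>

int main() {
    // two shards, k = 3, each (distance, id) list sorted; L2: smaller is better
    std::vector<std::vector<std::pair<float, long>>> shard = {
        {{0.1f, 4}, {0.5f, 2}, {0.9f, 0}},
        {{0.2f, 1}, {0.3f, 3}, {0.8f, 5}},
    };
    // min-heap over (distance, shard no, position within shard)
    using Entry = std::tuple<float, int, int>;
    std::priority_queue<Entry, std::vector<Entry>, std::greater<Entry>> heap;
    for (int s = 0; s < 2; s++) heap.emplace(shard[s][0].first, s, 0);
    for (int j = 0; j < 3; j++) {  // emit the merged top-3
        auto [dist, s, p] = heap.top();
        heap.pop();
        printf("rank %d: id %ld dist %g\n", j, shard[s][p].second, dist);
        // refill from the shard that just produced the best entry
        if (p + 1 < 3) heap.emplace(shard[s][p + 1].first, s, p + 1);
    }
    return 0;
}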

100
core/src/index/thirdparty/faiss/IndexShards.h vendored Normal file
View File

@ -0,0 +1,100 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/Index.h>
#include <faiss/IndexBinary.h>
#include <faiss/impl/ThreadedIndex.h>
namespace faiss {
/**
* Index that concatenates the results from several sub-indexes
*/
template <typename IndexT>
struct IndexShardsTemplate : public ThreadedIndex<IndexT> {
using idx_t = typename IndexT::idx_t;
using component_t = typename IndexT::component_t;
using distance_t = typename IndexT::distance_t;
/**
* The dimension that all sub-indices must share will be the dimension of the
* first sub-index added
*
* @param threaded do we use one thread per sub_index or do
* queries sequentially?
* @param successive_ids should we shift the returned ids by
* the size of each sub-index or return them
* as they are?
*/
explicit IndexShardsTemplate(bool threaded = false,
bool successive_ids = true);
/**
* @param threaded do we use one thread per sub_index or do
* queries sequentially?
* @param successive_ids should we shift the returned ids by
* the size of each sub-index or return them
* as they are?
*/
explicit IndexShardsTemplate(idx_t d,
bool threaded = false,
bool successive_ids = true);
/// int version, needed because an int dimension would otherwise be
/// ambiguous between the idx_t and bool overloads
explicit IndexShardsTemplate(int d,
bool threaded = false,
bool successive_ids = true);
/// Alias for addIndex()
void add_shard(IndexT* index) { this->addIndex(index); }
/// Alias for removeIndex()
void remove_shard(IndexT* index) { this->removeIndex(index); }
/// supported only for sub-indices that implement add_with_ids
void add(idx_t n, const component_t* x) override;
/**
* Cases (successive_ids, xids):
* - true, non-NULL ERROR: it makes no sense to pass in ids and
* request them to be shifted
* - true, NULL OK, but should be called only once (calls add()
* on sub-indexes).
* - false, non-NULL OK: will call add_with_ids with passed in xids
* distributed evenly over shards
* - false, NULL OK: will call add_with_ids on each sub-index,
* starting at ntotal
*/
void add_with_ids(idx_t n, const component_t* x, const idx_t* xids) override;
void search(idx_t n, const component_t* x, idx_t k,
distance_t* distances, idx_t* labels) const override;
void train(idx_t n, const component_t* x) override;
// update metric_type and ntotal. Call this if you change something in
// the shard indexes.
void sync_with_shard_indexes();
bool successive_ids;
protected:
/// Called just after an index is added
void onAfterAddIndex(IndexT* index) override;
/// Called just after an index is removed
void onAfterRemoveIndex(IndexT* index) override;
};
using IndexShards = IndexShardsTemplate<Index>;
using IndexBinaryShards = IndexShardsTemplate<IndexBinary>;
} // namespace faiss
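An end-to-end sketch of the semantics documented above, assuming a built libfaiss: two flat shards with successive_ids=true, so add() splits the input and search() returns labels translated into one global id space:

#include <faiss/IndexFlat.h>
#include <faiss/IndexShards.h>
#include <vector>

int main() {
    int d = 8;
    faiss::IndexShards shards(d, /*threaded=*/false, /*successive_ids=*/true);
    faiss::IndexFlatL2 shard0(d), shard1(d);  // not owned by `shards` here
    shards.add_shard(&shard0);
    shards.add_shard(&shard1);
    std::vector<float> xb(100 * d, 0.5f);
    shards.add(100, xb.data());  // single pass; vectors split across shards
    std::vector<float> D(4);
    std::vector<faiss::Index::idx_t> I(4);
    // per-shard results are merged by merge_tables(); shard-local labels
    // are shifted by the cumulative ntotal of the preceding shards
    shards.search(1, xb.data(), 4, D.data(), I.data());
    return 0;
}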

805
core/src/index/thirdparty/faiss/InvertedLists.cpp vendored Normal file
View File

@ -0,0 +1,805 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/InvertedLists.h>
#include <cstdio>
#include <numeric>
#include <faiss/utils/utils.h>
#include <faiss/impl/FaissAssert.h>
#include "gpu/utils/DeviceUtils.h"
#include "cuda.h"
#include "cuda_runtime.h"
namespace faiss {
PageLockMemory::PageLockMemory(size_t size) : nbytes(size) {
CUDA_VERIFY(cudaHostAlloc(&data, size, 0));
}
PageLockMemory::~PageLockMemory() {
CUDA_VERIFY(cudaFreeHost((void*)data));
}
PageLockMemory::PageLockMemory(const PageLockMemory& other) {
CUDA_VERIFY(cudaHostAlloc(&data, other.nbytes, 0));
memcpy(data, other.data, other.nbytes);
nbytes = other.nbytes;
}
PageLockMemory::PageLockMemory(PageLockMemory &&other) {
data = other.data;
nbytes = other.nbytes;
other.data = nullptr;
other.nbytes = 0;
}
}
namespace faiss {
using ScopedIds = InvertedLists::ScopedIds;
using ScopedCodes = InvertedLists::ScopedCodes;
/*****************************************
* InvertedLists implementation
******************************************/
InvertedLists::InvertedLists (size_t nlist, size_t code_size):
nlist (nlist), code_size (code_size)
{
}
InvertedLists::~InvertedLists ()
{}
InvertedLists::idx_t InvertedLists::get_single_id (
size_t list_no, size_t offset) const
{
assert (offset < list_size (list_no));
return get_ids(list_no)[offset];
}
void InvertedLists::release_codes (size_t, const uint8_t *) const
{}
void InvertedLists::release_ids (size_t, const idx_t *) const
{}
void InvertedLists::prefetch_lists (const idx_t *, int) const
{}
const uint8_t * InvertedLists::get_single_code (
size_t list_no, size_t offset) const
{
assert (offset < list_size (list_no));
return get_codes(list_no) + offset * code_size;
}
size_t InvertedLists::add_entry (size_t list_no, idx_t theid,
const uint8_t *code)
{
return add_entries (list_no, 1, &theid, code);
}
void InvertedLists::update_entry (size_t list_no, size_t offset,
idx_t id, const uint8_t *code)
{
update_entries (list_no, offset, 1, &id, code);
}
InvertedLists* InvertedLists::to_readonly() {
return nullptr;
}
bool InvertedLists::is_readonly() const {
return false;
}
void InvertedLists::reset () {
for (size_t i = 0; i < nlist; i++) {
resize (i, 0);
}
}
void InvertedLists::merge_from (InvertedLists *oivf, size_t add_id) {
#pragma omp parallel for
for (idx_t i = 0; i < nlist; i++) {
size_t list_size = oivf->list_size (i);
ScopedIds ids (oivf, i);
if (add_id == 0) {
add_entries (i, list_size, ids.get (),
ScopedCodes (oivf, i).get());
} else {
std::vector <idx_t> new_ids (list_size);
for (size_t j = 0; j < list_size; j++) {
new_ids [j] = ids[j] + add_id;
}
add_entries (i, list_size, new_ids.data(),
ScopedCodes (oivf, i).get());
}
oivf->resize (i, 0);
}
}
double InvertedLists::imbalance_factor () const {
std::vector<int> hist(nlist);
for (size_t i = 0; i < nlist; i++) {
hist[i] = list_size(i);
}
return faiss::imbalance_factor(nlist, hist.data());
}
void InvertedLists::print_stats () const {
std::vector<int> sizes(40);
for (size_t i = 0; i < nlist; i++) {
for (size_t j = 0; j < sizes.size(); j++) {
if ((list_size(i) >> j) == 0) {
sizes[j]++;
break;
}
}
}
for (size_t i = 0; i < sizes.size(); i++) {
if (sizes[i]) {
printf("list size in < %d: %d instances\n", 1 << i, sizes[i]);
}
}
}
size_t InvertedLists::compute_ntotal () const {
size_t tot = 0;
for (size_t i = 0; i < nlist; i++) {
tot += list_size(i);
}
return tot;
}
/*****************************************
* ArrayInvertedLists implementation
******************************************/
ArrayInvertedLists::ArrayInvertedLists (size_t nlist, size_t code_size):
InvertedLists (nlist, code_size)
{
ids.resize (nlist);
codes.resize (nlist);
}
size_t ArrayInvertedLists::add_entries (
size_t list_no, size_t n_entry,
const idx_t* ids_in, const uint8_t *code)
{
if (n_entry == 0) return 0;
assert (list_no < nlist);
size_t o = ids [list_no].size();
ids [list_no].resize (o + n_entry);
memcpy (&ids[list_no][o], ids_in, sizeof (ids_in[0]) * n_entry);
codes [list_no].resize ((o + n_entry) * code_size);
memcpy (&codes[list_no][o * code_size], code, code_size * n_entry);
return o;
}
size_t ArrayInvertedLists::list_size(size_t list_no) const
{
assert (list_no < nlist);
return ids[list_no].size();
}
const uint8_t * ArrayInvertedLists::get_codes (size_t list_no) const
{
assert (list_no < nlist);
return codes[list_no].data();
}
const InvertedLists::idx_t * ArrayInvertedLists::get_ids (size_t list_no) const
{
assert (list_no < nlist);
return ids[list_no].data();
}
void ArrayInvertedLists::resize (size_t list_no, size_t new_size)
{
ids[list_no].resize (new_size);
codes[list_no].resize (new_size * code_size);
}
void ArrayInvertedLists::update_entries (
size_t list_no, size_t offset, size_t n_entry,
const idx_t *ids_in, const uint8_t *codes_in)
{
assert (list_no < nlist);
assert (n_entry + offset <= ids[list_no].size());
memcpy (&ids[list_no][offset], ids_in, sizeof(ids_in[0]) * n_entry);
memcpy (&codes[list_no][offset * code_size], codes_in, code_size * n_entry);
}
InvertedLists* ArrayInvertedLists::to_readonly() {
ReadOnlyArrayInvertedLists* readonly = new ReadOnlyArrayInvertedLists(*this);
return readonly;
}
ArrayInvertedLists::~ArrayInvertedLists ()
{}
/*****************************************************************
* ReadOnlyArrayInvertedLists implementations
*****************************************************************/
ReadOnlyArrayInvertedLists::ReadOnlyArrayInvertedLists(size_t nlist,
size_t code_size, const std::vector<size_t>& list_length)
: InvertedLists (nlist, code_size),
readonly_length(list_length) {
valid = readonly_length.size() == nlist;
if (!valid) {
FAISS_THROW_MSG ("Invalid list_length");
return;
}
readonly_offset.reserve(nlist);
size_t offset = 0;
for (size_t i = 0; i < readonly_length.size(); ++i) {
readonly_offset.emplace_back(offset);
offset += readonly_length[i];
}
}
ReadOnlyArrayInvertedLists::ReadOnlyArrayInvertedLists(const ArrayInvertedLists& other)
: InvertedLists (other.nlist, other.code_size) {
std::vector <uint8_t> readonly_codes;
std::vector <idx_t> readonly_ids;
readonly_length.reserve(nlist);
size_t offset = 0;
for (auto& list_ids : other.ids) {
readonly_length.emplace_back(list_ids.size());
readonly_offset.emplace_back(offset);
offset += list_ids.size();
readonly_ids.insert(readonly_ids.end(), list_ids.begin(), list_ids.end());
}
for(auto& list_codes : other.codes) {
readonly_codes.insert(readonly_codes.end(), list_codes.begin(), list_codes.end());
}
// convert to page-lock memory
{
size_t size = readonly_codes.size() * sizeof(uint8_t);
pin_readonly_codes = std::make_shared<PageLockMemory>(size);
memcpy(pin_readonly_codes->data, readonly_codes.data(), size);
}
{
size_t size = readonly_ids.size() * sizeof(idx_t);
pin_readonly_ids = std::make_shared<PageLockMemory>(size);
memcpy(pin_readonly_ids->data, readonly_ids.data(), size);
}
valid = true;
}
//ReadOnlyArrayInvertedLists::ReadOnlyArrayInvertedLists(const ReadOnlyArrayInvertedLists &other)
// : InvertedLists (other.nlist, other.code_size) {
// readonly_length = other.readonly_length;
// readonly_offset = other.readonly_offset;
// pin_readonly_codes = std::make_shared<PageLockMemory>(*other.pin_readonly_codes);
// pin_readonly_ids = std::make_shared<PageLockMemory>(*other.pin_readonly_ids);
// valid = true;
//}
//ReadOnlyArrayInvertedLists::ReadOnlyArrayInvertedLists(ReadOnlyArrayInvertedLists &&other)
// : InvertedLists (other.nlist, other.code_size) {
// readonly_length = std::move(other.readonly_length);
// readonly_offset = std::move(other.readonly_offset);
// pin_readonly_codes = other.pin_readonly_codes;
// pin_readonly_ids = other.pin_readonly_ids;
//
// other.pin_readonly_codes = nullptr;
// other.pin_readonly_ids = nullptr;
// valid = true;
//}
ReadOnlyArrayInvertedLists::~ReadOnlyArrayInvertedLists() {
}
bool
ReadOnlyArrayInvertedLists::is_valid() {
return valid;
}
size_t ReadOnlyArrayInvertedLists::add_entries (
size_t , size_t ,
const idx_t* , const uint8_t *)
{
FAISS_THROW_MSG ("not implemented");
}
void ReadOnlyArrayInvertedLists::update_entries (size_t, size_t , size_t ,
const idx_t *, const uint8_t *)
{
FAISS_THROW_MSG ("not implemented");
}
void ReadOnlyArrayInvertedLists::resize (size_t , size_t )
{
FAISS_THROW_MSG ("not implemented");
}
size_t ReadOnlyArrayInvertedLists::list_size(size_t list_no) const
{
FAISS_ASSERT(list_no < nlist && valid);
return readonly_length[list_no];
}
const uint8_t * ReadOnlyArrayInvertedLists::get_codes (size_t list_no) const
{
FAISS_ASSERT(list_no < nlist && valid);
uint8_t *pcodes = (uint8_t *)(pin_readonly_codes->data);
return pcodes + readonly_offset[list_no] * code_size;
}
const InvertedLists::idx_t* ReadOnlyArrayInvertedLists::get_ids (size_t list_no) const
{
FAISS_ASSERT(list_no < nlist && valid);
idx_t *pids = (idx_t *)pin_readonly_ids->data;
return pids + readonly_offset[list_no];
}
const InvertedLists::idx_t* ReadOnlyArrayInvertedLists::get_all_ids() const {
FAISS_ASSERT(valid);
return (idx_t *)(pin_readonly_ids->data);
}
const uint8_t* ReadOnlyArrayInvertedLists::get_all_codes() const {
FAISS_ASSERT(valid);
return (uint8_t *)(pin_readonly_codes->data);
}
const std::vector<size_t>& ReadOnlyArrayInvertedLists::get_list_length() const {
FAISS_ASSERT(valid);
return readonly_length;
}
bool ReadOnlyArrayInvertedLists::is_readonly() const {
FAISS_ASSERT(valid);
return true;
}
/*****************************************************************
* Meta-inverted list implementations
*****************************************************************/
size_t ReadOnlyInvertedLists::add_entries (
size_t , size_t ,
const idx_t* , const uint8_t *)
{
FAISS_THROW_MSG ("not implemented");
}
void ReadOnlyInvertedLists::update_entries (size_t, size_t , size_t ,
const idx_t *, const uint8_t *)
{
FAISS_THROW_MSG ("not implemented");
}
void ReadOnlyInvertedLists::resize (size_t , size_t )
{
FAISS_THROW_MSG ("not implemented");
}
/*****************************************
* HStackInvertedLists implementation
******************************************/
HStackInvertedLists::HStackInvertedLists (
int nil, const InvertedLists **ils_in):
ReadOnlyInvertedLists (nil > 0 ? ils_in[0]->nlist : 0,
nil > 0 ? ils_in[0]->code_size : 0)
{
FAISS_THROW_IF_NOT (nil > 0);
for (int i = 0; i < nil; i++) {
ils.push_back (ils_in[i]);
FAISS_THROW_IF_NOT (ils_in[i]->code_size == code_size &&
ils_in[i]->nlist == nlist);
}
}
size_t HStackInvertedLists::list_size(size_t list_no) const
{
size_t sz = 0;
for (int i = 0; i < ils.size(); i++) {
const InvertedLists *il = ils[i];
sz += il->list_size (list_no);
}
return sz;
}
const uint8_t * HStackInvertedLists::get_codes (size_t list_no) const
{
uint8_t *codes = new uint8_t [code_size * list_size(list_no)], *c = codes;
for (int i = 0; i < ils.size(); i++) {
const InvertedLists *il = ils[i];
size_t sz = il->list_size(list_no) * code_size;
if (sz > 0) {
memcpy (c, ScopedCodes (il, list_no).get(), sz);
c += sz;
}
}
return codes;
}
const uint8_t * HStackInvertedLists::get_single_code (
size_t list_no, size_t offset) const
{
for (int i = 0; i < ils.size(); i++) {
const InvertedLists *il = ils[i];
size_t sz = il->list_size (list_no);
if (offset < sz) {
// here we have to copy the code, otherwise it will crash at dealloc
uint8_t * code = new uint8_t [code_size];
memcpy (code, ScopedCodes (il, list_no, offset).get(), code_size);
return code;
}
offset -= sz;
}
FAISS_THROW_FMT ("offset %ld unknown", offset);
}
void HStackInvertedLists::release_codes (size_t, const uint8_t *codes) const {
delete [] codes;
}
const Index::idx_t * HStackInvertedLists::get_ids (size_t list_no) const
{
idx_t *ids = new idx_t [list_size(list_no)], *c = ids;
for (int i = 0; i < ils.size(); i++) {
const InvertedLists *il = ils[i];
size_t sz = il->list_size(list_no);
if (sz > 0) {
memcpy (c, ScopedIds (il, list_no).get(), sz * sizeof(idx_t));
c += sz;
}
}
return ids;
}
Index::idx_t HStackInvertedLists::get_single_id (
size_t list_no, size_t offset) const
{
for (int i = 0; i < ils.size(); i++) {
const InvertedLists *il = ils[i];
size_t sz = il->list_size (list_no);
if (offset < sz) {
return il->get_single_id (list_no, offset);
}
offset -= sz;
}
FAISS_THROW_FMT ("offset %ld unknown", offset);
}
void HStackInvertedLists::release_ids (size_t, const idx_t *ids) const {
delete [] ids;
}
void HStackInvertedLists::prefetch_lists (const idx_t *list_nos, int nlist) const
{
for (int i = 0; i < ils.size(); i++) {
const InvertedLists *il = ils[i];
il->prefetch_lists (list_nos, nlist);
}
}
/*****************************************
* SliceInvertedLists implementation
******************************************/
namespace {
using idx_t = InvertedLists::idx_t;
idx_t translate_list_no (const SliceInvertedLists *sil,
idx_t list_no) {
FAISS_THROW_IF_NOT (list_no >= 0 && list_no < sil->nlist);
return list_no + sil->i0;
}
} // anonymous namespace
SliceInvertedLists::SliceInvertedLists (
const InvertedLists *il, idx_t i0, idx_t i1):
ReadOnlyInvertedLists (i1 - i0, il->code_size),
il (il), i0(i0), i1(i1)
{
}
size_t SliceInvertedLists::list_size(size_t list_no) const
{
return il->list_size (translate_list_no (this, list_no));
}
const uint8_t * SliceInvertedLists::get_codes (size_t list_no) const
{
return il->get_codes (translate_list_no (this, list_no));
}
const uint8_t * SliceInvertedLists::get_single_code (
size_t list_no, size_t offset) const
{
return il->get_single_code (translate_list_no (this, list_no), offset);
}
void SliceInvertedLists::release_codes (
size_t list_no, const uint8_t *codes) const {
return il->release_codes (translate_list_no (this, list_no), codes);
}
const Index::idx_t * SliceInvertedLists::get_ids (size_t list_no) const
{
return il->get_ids (translate_list_no (this, list_no));
}
Index::idx_t SliceInvertedLists::get_single_id (
size_t list_no, size_t offset) const
{
return il->get_single_id (translate_list_no (this, list_no), offset);
}
void SliceInvertedLists::release_ids (size_t list_no, const idx_t *ids) const {
return il->release_ids (translate_list_no (this, list_no), ids);
}
void SliceInvertedLists::prefetch_lists (const idx_t *list_nos, int nlist) const
{
std::vector<idx_t> translated_list_nos;
for (int j = 0; j < nlist; j++) {
idx_t list_no = list_nos[j];
if (list_no < 0) continue;
translated_list_nos.push_back (translate_list_no (this, list_no));
}
il->prefetch_lists (translated_list_nos.data(),
translated_list_nos.size());
}
/*****************************************
* VStackInvertedLists implementation
******************************************/
namespace {
using idx_t = InvertedLists::idx_t;
// find the invlist this number belongs to
int translate_list_no (const VStackInvertedLists *vil,
idx_t list_no) {
FAISS_THROW_IF_NOT (list_no >= 0 && list_no < vil->nlist);
int i0 = 0, i1 = vil->ils.size();
const idx_t *cumsz = vil->cumsz.data();
while (i0 + 1 < i1) {
int imed = (i0 + i1) / 2;
if (list_no >= cumsz[imed]) {
i0 = imed;
} else {
i1 = imed;
}
}
assert(list_no >= cumsz[i0] && list_no < cumsz[i0 + 1]);
return i0;
}
idx_t sum_il_sizes (int nil, const InvertedLists **ils_in) {
idx_t tot = 0;
for (int i = 0; i < nil; i++) {
tot += ils_in[i]->nlist;
}
return tot;
}
} // anonymous namespace
VStackInvertedLists::VStackInvertedLists (
int nil, const InvertedLists **ils_in):
ReadOnlyInvertedLists (sum_il_sizes(nil, ils_in),
nil > 0 ? ils_in[0]->code_size : 0)
{
FAISS_THROW_IF_NOT (nil > 0);
cumsz.resize (nil + 1);
for (int i = 0; i < nil; i++) {
ils.push_back (ils_in[i]);
FAISS_THROW_IF_NOT (ils_in[i]->code_size == code_size);
cumsz[i + 1] = cumsz[i] + ils_in[i]->nlist;
}
}
size_t VStackInvertedLists::list_size(size_t list_no) const
{
int i = translate_list_no (this, list_no);
list_no -= cumsz[i];
return ils[i]->list_size (list_no);
}
const uint8_t * VStackInvertedLists::get_codes (size_t list_no) const
{
int i = translate_list_no (this, list_no);
list_no -= cumsz[i];
return ils[i]->get_codes (list_no);
}
const uint8_t * VStackInvertedLists::get_single_code (
size_t list_no, size_t offset) const
{
int i = translate_list_no (this, list_no);
list_no -= cumsz[i];
return ils[i]->get_single_code (list_no, offset);
}
void VStackInvertedLists::release_codes (
size_t list_no, const uint8_t *codes) const {
int i = translate_list_no (this, list_no);
list_no -= cumsz[i];
return ils[i]->release_codes (list_no, codes);
}
const Index::idx_t * VStackInvertedLists::get_ids (size_t list_no) const
{
int i = translate_list_no (this, list_no);
list_no -= cumsz[i];
return ils[i]->get_ids (list_no);
}
Index::idx_t VStackInvertedLists::get_single_id (
size_t list_no, size_t offset) const
{
int i = translate_list_no (this, list_no);
list_no -= cumsz[i];
return ils[i]->get_single_id (list_no, offset);
}
void VStackInvertedLists::release_ids (size_t list_no, const idx_t *ids) const {
int i = translate_list_no (this, list_no);
list_no -= cumsz[i];
return ils[i]->release_ids (list_no, ids);
}
void VStackInvertedLists::prefetch_lists (
const idx_t *list_nos, int nlist) const
{
std::vector<int> ilno (nlist, -1);
std::vector<int> n_per_il (ils.size(), 0);
for (int j = 0; j < nlist; j++) {
idx_t list_no = list_nos[j];
if (list_no < 0) continue;
int i = ilno[j] = translate_list_no (this, list_no);
n_per_il[i]++;
}
std::vector<int> cum_n_per_il (ils.size() + 1, 0);
for (int j = 0; j < ils.size(); j++) {
cum_n_per_il[j + 1] = cum_n_per_il[j] + n_per_il[j];
}
std::vector<idx_t> sorted_list_nos (cum_n_per_il.back());
for (int j = 0; j < nlist; j++) {
idx_t list_no = list_nos[j];
if (list_no < 0) continue;
int i = ilno[j];
list_no -= cumsz[i];
sorted_list_nos[cum_n_per_il[i]++] = list_no;
}
int i0 = 0;
for (int j = 0; j < ils.size(); j++) {
int i1 = i0 + n_per_il[j];
if (i1 > i0) {
ils[j]->prefetch_lists (sorted_list_nos.data() + i0,
i1 - i0);
}
i0 = i1;
}
}
/*****************************************
* MaskedInvertedLists implementation
******************************************/
MaskedInvertedLists::MaskedInvertedLists (const InvertedLists *il0,
const InvertedLists *il1):
ReadOnlyInvertedLists (il0->nlist, il0->code_size),
il0 (il0), il1 (il1)
{
FAISS_THROW_IF_NOT (il1->nlist == nlist);
FAISS_THROW_IF_NOT (il1->code_size == code_size);
}
size_t MaskedInvertedLists::list_size(size_t list_no) const
{
size_t sz = il0->list_size(list_no);
return sz ? sz : il1->list_size(list_no);
}
const uint8_t * MaskedInvertedLists::get_codes (size_t list_no) const
{
size_t sz = il0->list_size(list_no);
return (sz ? il0 : il1)->get_codes(list_no);
}
const idx_t * MaskedInvertedLists::get_ids (size_t list_no) const
{
size_t sz = il0->list_size (list_no);
return (sz ? il0 : il1)->get_ids (list_no);
}
void MaskedInvertedLists::release_codes (
size_t list_no, const uint8_t *codes) const
{
size_t sz = il0->list_size (list_no);
(sz ? il0 : il1)->release_codes (list_no, codes);
}
void MaskedInvertedLists::release_ids (size_t list_no, const idx_t *ids) const
{
size_t sz = il0->list_size (list_no);
(sz ? il0 : il1)->release_ids (list_no, ids);
}
idx_t MaskedInvertedLists::get_single_id (size_t list_no, size_t offset) const
{
size_t sz = il0->list_size (list_no);
return (sz ? il0 : il1)->get_single_id (list_no, offset);
}
const uint8_t * MaskedInvertedLists::get_single_code (
size_t list_no, size_t offset) const
{
size_t sz = il0->list_size (list_no);
return (sz ? il0 : il1)->get_single_code (list_no, offset);
}
void MaskedInvertedLists::prefetch_lists (
const idx_t *list_nos, int nlist) const
{
std::vector<idx_t> list0, list1;
for (int i = 0; i < nlist; i++) {
idx_t list_no = list_nos[i];
if (list_no < 0) continue;
size_t sz = il0->list_size(list_no);
(sz ? list0 : list1).push_back (list_no);
}
il0->prefetch_lists (list0.data(), list0.size());
il1->prefetch_lists (list1.data(), list1.size());
}
} // namespace faiss
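A short sketch of the invlist API implemented above, assuming the GPU-enabled build this vendored copy targets (to_readonly() pins memory via cudaHostAlloc): entries go into an ArrayInvertedLists, are read back through the Scoped* RAII wrappers, then the lists are frozen into the read-only form:

#include <faiss/InvertedLists.h>
#include <cstdio>

int main() {
    size_t nlist = 4, code_size = 8;
    faiss::ArrayInvertedLists invlists(nlist, code_size);
    faiss::InvertedLists::idx_t id = 42;
    uint8_t code[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    invlists.add_entry(/*list_no=*/1, id, code);
    {
        // release_ids() runs automatically when `ids` leaves scope
        faiss::InvertedLists::ScopedIds ids(&invlists, 1);
        printf("list 1: %zu entries, first id %lld\n",
               invlists.list_size(1), (long long)ids[0]);
    }
    // freeze into the page-locked, read-only representation
    faiss::InvertedLists* ro = invlists.to_readonly();
    printf("readonly: %d\n", (int)ro->is_readonly());
    delete ro;
    return 0;
}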

402
core/src/index/thirdparty/faiss/InvertedLists.h vendored Normal file
View File

@ -0,0 +1,402 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INVERTEDLISTS_IVF_H
#define FAISS_INVERTEDLISTS_IVF_H
/**
* Definition of inverted lists + a few common classes that implement
* the interface.
*/
#include <memory>
#include <vector>
#include <faiss/Index.h>
namespace faiss {
struct PageLockMemory {
public:
PageLockMemory() : data(nullptr), nbytes(0) {}
PageLockMemory(size_t size);
~PageLockMemory();
PageLockMemory(const PageLockMemory& other);
PageLockMemory(PageLockMemory &&other);
inline size_t size() {
return nbytes;
}
void *data;
size_t nbytes;
};
using PageLockMemoryPtr = std::shared_ptr<PageLockMemory>;
}
namespace faiss {
/** Table of inverted lists
* multithreading rules:
* - concurrent read accesses are allowed
* - concurrent update accesses are allowed
* - for resize and add_entries, only concurrent access to different lists
* are allowed
*/
struct InvertedLists {
typedef Index::idx_t idx_t;
size_t nlist; ///< number of possible key values
size_t code_size; ///< code size per vector in bytes
InvertedLists (size_t nlist, size_t code_size);
/*************************
* Read only functions */
/// get the size of a list
virtual size_t list_size(size_t list_no) const = 0;
/** get the codes for an inverted list
* must be released by release_codes
*
* @return codes size list_size * code_size
*/
virtual const uint8_t * get_codes (size_t list_no) const = 0;
/** get the ids for an inverted list
* must be released by release_ids
*
* @return ids size list_size
*/
virtual const idx_t * get_ids (size_t list_no) const = 0;
/// release codes returned by get_codes (default implementation is a no-op)
virtual void release_codes (size_t list_no, const uint8_t *codes) const;
/// release ids returned by get_ids
virtual void release_ids (size_t list_no, const idx_t *ids) const;
/// @return a single id in an inverted list
virtual idx_t get_single_id (size_t list_no, size_t offset) const;
/// @return a single code in an inverted list
/// (should be deallocated with release_codes)
virtual const uint8_t * get_single_code (
size_t list_no, size_t offset) const;
/// prepare the following lists (default does nothing)
/// a list can be -1 hence the signed long
virtual void prefetch_lists (const idx_t *list_nos, int nlist) const;
/*************************
* writing functions */
/// add one entry to an inverted list
virtual size_t add_entry (size_t list_no, idx_t theid,
const uint8_t *code);
virtual size_t add_entries (
size_t list_no, size_t n_entry,
const idx_t* ids, const uint8_t *code) = 0;
virtual void update_entry (size_t list_no, size_t offset,
idx_t id, const uint8_t *code);
virtual void update_entries (size_t list_no, size_t offset, size_t n_entry,
const idx_t *ids, const uint8_t *code) = 0;
virtual void resize (size_t list_no, size_t new_size) = 0;
virtual void reset ();
virtual InvertedLists* to_readonly();
virtual bool is_readonly() const;
/// move all entries from oivf (empty on output)
void merge_from (InvertedLists *oivf, size_t add_id);
virtual ~InvertedLists ();
/*************************
* statistics */
/// 1= perfectly balanced, >1: imbalanced
double imbalance_factor () const;
/// display some stats about the inverted lists
void print_stats () const;
/// sum up list sizes
size_t compute_ntotal () const;
/**************************************
* Scoped inverted lists (for automatic deallocation)
*
* instead of writing:
*
* uint8_t * codes = invlists->get_codes (10);
* ... use codes
* invlists->release_codes(10, codes)
*
* write:
*
* ScopedCodes codes (invlists, 10);
* ... use codes.get()
* // release called automatically when codes goes out of scope
*
* the following function call also works:
*
* foo (123, ScopedCodes (invlists, 10).get(), 456);
*
*/
struct ScopedIds {
const InvertedLists *il;
const idx_t *ids;
size_t list_no;
ScopedIds (const InvertedLists *il, size_t list_no):
il (il), ids (il->get_ids (list_no)), list_no (list_no)
{}
const idx_t *get() {return ids; }
idx_t operator [] (size_t i) const {
return ids[i];
}
~ScopedIds () {
il->release_ids (list_no, ids);
}
};
struct ScopedCodes {
const InvertedLists *il;
const uint8_t *codes;
size_t list_no;
ScopedCodes (const InvertedLists *il, size_t list_no):
il (il), codes (il->get_codes (list_no)), list_no (list_no)
{}
ScopedCodes (const InvertedLists *il, size_t list_no, size_t offset):
il (il), codes (il->get_single_code (list_no, offset)),
list_no (list_no)
{}
const uint8_t *get() {return codes; }
~ScopedCodes () {
il->release_codes (list_no, codes);
}
};
};
/// simple (default) implementation as an array of inverted lists
struct ArrayInvertedLists: InvertedLists {
std::vector < std::vector<uint8_t> > codes; // binary codes, size nlist
std::vector < std::vector<idx_t> > ids; ///< Inverted lists for indexes
ArrayInvertedLists (size_t nlist, size_t code_size);
size_t list_size(size_t list_no) const override;
const uint8_t * get_codes (size_t list_no) const override;
const idx_t * get_ids (size_t list_no) const override;
size_t add_entries (
size_t list_no, size_t n_entry,
const idx_t* ids, const uint8_t *code) override;
void update_entries (size_t list_no, size_t offset, size_t n_entry,
const idx_t *ids, const uint8_t *code) override;
void resize (size_t list_no, size_t new_size) override;
InvertedLists* to_readonly() override;
virtual ~ArrayInvertedLists ();
};
struct ReadOnlyArrayInvertedLists: InvertedLists {
PageLockMemoryPtr pin_readonly_codes;
PageLockMemoryPtr pin_readonly_ids;
// std::vector <uint8_t> readonly_codes;
// std::vector <idx_t> readonly_ids;
std::vector <size_t> readonly_length;
std::vector <size_t> readonly_offset;
bool valid;
ReadOnlyArrayInvertedLists(size_t nlist, size_t code_size, const std::vector<size_t>& list_length);
explicit ReadOnlyArrayInvertedLists(const ArrayInvertedLists& other);
// Use the default copy constructor: it copies the shared_ptr members and
// DOES NOT deep-copy the pin_readonly_codes / pin_readonly_ids buffers
// explicit ReadOnlyArrayInvertedLists(const ReadOnlyArrayInvertedLists &);
// explicit ReadOnlyArrayInvertedLists(ReadOnlyArrayInvertedLists &&);
virtual ~ReadOnlyArrayInvertedLists();
size_t list_size(size_t list_no) const override;
const uint8_t * get_codes (size_t list_no) const override;
const idx_t * get_ids (size_t list_no) const override;
const uint8_t * get_all_codes() const;
const idx_t * get_all_ids() const;
const std::vector<size_t>& get_list_length() const;
size_t add_entries (
size_t list_no, size_t n_entry,
const idx_t* ids, const uint8_t *code) override;
void update_entries (size_t list_no, size_t offset, size_t n_entry,
const idx_t *ids, const uint8_t *code) override;
void resize (size_t list_no, size_t new_size) override;
bool is_readonly() const override;
bool is_valid();
};
/*****************************************************************
* Meta-inverted lists
*
* About terminology: the inverted lists are seen as a sparse matrix
* that can be stacked horizontally or vertically, and sliced.
*****************************************************************/
struct ReadOnlyInvertedLists: InvertedLists {
ReadOnlyInvertedLists (size_t nlist, size_t code_size):
InvertedLists (nlist, code_size) {}
size_t add_entries (
size_t list_no, size_t n_entry,
const idx_t* ids, const uint8_t *code) override;
void update_entries (size_t list_no, size_t offset, size_t n_entry,
const idx_t *ids, const uint8_t *code) override;
void resize (size_t list_no, size_t new_size) override;
};
/// Horizontal stack of inverted lists
struct HStackInvertedLists: ReadOnlyInvertedLists {
std::vector<const InvertedLists *>ils;
/// build InvertedLists by concatenating nil of them
HStackInvertedLists (int nil, const InvertedLists **ils);
size_t list_size(size_t list_no) const override;
const uint8_t * get_codes (size_t list_no) const override;
const idx_t * get_ids (size_t list_no) const override;
void prefetch_lists (const idx_t *list_nos, int nlist) const override;
void release_codes (size_t list_no, const uint8_t *codes) const override;
void release_ids (size_t list_no, const idx_t *ids) const override;
idx_t get_single_id (size_t list_no, size_t offset) const override;
const uint8_t * get_single_code (
size_t list_no, size_t offset) const override;
};
using ConcatenatedInvertedLists = HStackInvertedLists;
/// vertical slice of indexes in another InvertedLists
struct SliceInvertedLists: ReadOnlyInvertedLists {
const InvertedLists *il;
idx_t i0, i1;
SliceInvertedLists(const InvertedLists *il, idx_t i0, idx_t i1);
size_t list_size(size_t list_no) const override;
const uint8_t * get_codes (size_t list_no) const override;
const idx_t * get_ids (size_t list_no) const override;
void release_codes (size_t list_no, const uint8_t *codes) const override;
void release_ids (size_t list_no, const idx_t *ids) const override;
idx_t get_single_id (size_t list_no, size_t offset) const override;
const uint8_t * get_single_code (
size_t list_no, size_t offset) const override;
void prefetch_lists (const idx_t *list_nos, int nlist) const override;
};
struct VStackInvertedLists: ReadOnlyInvertedLists {
std::vector<const InvertedLists *>ils;
std::vector<idx_t> cumsz;
/// build InvertedLists by concatenating nil of them
VStackInvertedLists (int nil, const InvertedLists **ils);
size_t list_size(size_t list_no) const override;
const uint8_t * get_codes (size_t list_no) const override;
const idx_t * get_ids (size_t list_no) const override;
void release_codes (size_t list_no, const uint8_t *codes) const override;
void release_ids (size_t list_no, const idx_t *ids) const override;
idx_t get_single_id (size_t list_no, size_t offset) const override;
const uint8_t * get_single_code (
size_t list_no, size_t offset) const override;
void prefetch_lists (const idx_t *list_nos, int nlist) const override;
};
/** use the first inverted lists if they are non-empty otherwise use the second
*
* This is useful if il1 has a few inverted lists that are too long,
* and that il0 has replacement lists for those, with empty lists for
* the others. */
struct MaskedInvertedLists: ReadOnlyInvertedLists {
const InvertedLists *il0;
const InvertedLists *il1;
MaskedInvertedLists (const InvertedLists *il0,
const InvertedLists *il1);
size_t list_size(size_t list_no) const override;
const uint8_t * get_codes (size_t list_no) const override;
const idx_t * get_ids (size_t list_no) const override;
void release_codes (size_t list_no, const uint8_t *codes) const override;
void release_ids (size_t list_no, const idx_t *ids) const override;
idx_t get_single_id (size_t list_no, size_t offset) const override;
const uint8_t * get_single_code (
size_t list_no, size_t offset) const override;
void prefetch_lists (const idx_t *list_nos, int nlist) const override;
};
} // namespace faiss
#endif
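The meta-lists above are read-only views over other lists. A toy sketch, assuming a built libfaiss, that stacks two ArrayInvertedLists horizontally so each logical list is the concatenation of the per-source lists:

#include <faiss/InvertedLists.h>
#include <cstdio>

int main() {
    size_t nlist = 2, code_size = 4;
    faiss::ArrayInvertedLists a(nlist, code_size), b(nlist, code_size);
    uint8_t code[4] = {1, 2, 3, 4};
    faiss::InvertedLists::idx_t ida = 7, idb = 8;
    a.add_entry(0, ida, code);
    b.add_entry(0, idb, code);
    const faiss::InvertedLists* sources[] = {&a, &b};
    faiss::HStackInvertedLists stacked(2, sources);
    // list 0 of the view concatenates list 0 of `a` and list 0 of `b`
    printf("stacked list 0 size: %zu\n", stacked.list_size(0));   // 2
    printf("second id: %lld\n",
           (long long)stacked.get_single_id(0, 1));               // 8
    return 0;
}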

21
core/src/index/thirdparty/faiss/LICENSE vendored Normal file
View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) Facebook, Inc. and its affiliates.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

113
core/src/index/thirdparty/faiss/Makefile vendored Normal file
View File

@ -0,0 +1,113 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
-include makefile.inc
HEADERS = $(wildcard *.h impl/*.h utils/*.h)
SRC = $(wildcard *.cpp impl/*.cpp utils/*.cpp)
OBJ = $(SRC:.cpp=.o)
INSTALLDIRS = $(DESTDIR)$(libdir) $(DESTDIR)$(includedir)/faiss
GPU_HEADERS = $(wildcard gpu/*.h gpu/impl/*.h gpu/utils/*.h)
GPU_CPPSRC = $(wildcard gpu/*.cpp gpu/impl/*.cpp gpu/utils/*.cpp)
GPU_CUSRC = $(wildcard gpu/*.cu gpu/impl/*.cu gpu/utils/*.cu \
gpu/utils/nvidia/*.cu gpu/utils/blockselect/*.cu gpu/utils/warpselect/*.cu)
GPU_SRC = $(GPU_CPPSRC) $(GPU_CUSRC)
GPU_CPPOBJ = $(GPU_CPPSRC:.cpp=.o)
GPU_CUOBJ = $(GPU_CUSRC:.cu=.o)
GPU_OBJ = $(GPU_CPPOBJ) $(GPU_CUOBJ)
ifneq ($(strip $(NVCC)),)
OBJ += $(GPU_OBJ)
HEADERS += $(GPU_HEADERS)
endif
CPPFLAGS += -I.
NVCCFLAGS += -I.
############################
# Building
all: libfaiss.a libfaiss.$(SHAREDEXT)
libfaiss.a: $(OBJ)
$(AR) r $@ $^
libfaiss.$(SHAREDEXT): $(OBJ)
$(CXX) $(SHAREDFLAGS) $(LDFLAGS) -o $@ $^ $(LIBS)
%.o: %.cpp
$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -c $< -o $@
%.o: %.cu
$(NVCC) $(NVCCFLAGS) -c $< -o $@
clean:
rm -f libfaiss.a libfaiss.$(SHAREDEXT)
rm -f $(OBJ)
############################
# Installing
install: libfaiss.a libfaiss.$(SHAREDEXT) installdirs
cp libfaiss.a libfaiss.$(SHAREDEXT) $(DESTDIR)$(libdir)
tar cf - $(HEADERS) | tar xf - -C $(DESTDIR)$(includedir)/faiss/
installdirs:
$(MKDIR_P) $(INSTALLDIRS)
uninstall:
rm -f $(DESTDIR)$(libdir)/libfaiss.a \
$(DESTDIR)$(libdir)/libfaiss.$(SHAREDEXT)
rm -rf $(DESTDIR)$(includedir)/faiss
#############################
# Dependencies
-include depend
depend: $(SRC) $(GPU_SRC)
for i in $^; do \
$(CXXCPP) $(CPPFLAGS) -DCUDA_VERSION=7050 -x c++ -MM $$i; \
done > depend
#############################
# Python
py: libfaiss.a
$(MAKE) -C python
#############################
# Tests
test: libfaiss.a py
$(MAKE) -C tests run
PYTHONPATH=./python/build/`ls python/build | grep lib` \
$(PYTHON) -m unittest discover tests/ -v
test_gpu: libfaiss.a
$(MAKE) -C gpu/test run
PYTHONPATH=./python/build/`ls python/build | grep lib` \
$(PYTHON) -m unittest discover gpu/test/ -v
#############################
# Demos
demos: libfaiss.a
$(MAKE) -C demos
#############################
# Misc
misc/test_blas: misc/test_blas.cpp
$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $^ $(LIBS)
.PHONY: all clean demos install installdirs py test test_gpu uninstall

252
core/src/index/thirdparty/faiss/MatrixStats.cpp vendored Normal file
View File

@ -0,0 +1,252 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/MatrixStats.h>
#include <stdarg.h> /* va_list, va_start, va_arg, va_end */
#include <cmath>
#include <cstdio>
#include <faiss/utils/utils.h>
namespace faiss {
/*********************************************************************
* MatrixStats
*********************************************************************/
MatrixStats::PerDimStats::PerDimStats():
n(0), n_nan(0), n_inf(0), n0(0),
min(HUGE_VALF), max(-HUGE_VALF),
sum(0), sum2(0),
mean(NAN), stddev(NAN)
{}
void MatrixStats::PerDimStats::add (float x)
{
n++;
if (std::isnan(x)) {
n_nan++;
return;
}
if (!std::isfinite(x)) {
n_inf++;
return;
}
if (x == 0) n0++;
if (x < min) min = x;
if (x > max) max = x;
sum += x;
sum2 += (double)x * (double)x;
}
void MatrixStats::PerDimStats::compute_mean_std ()
{
n_valid = n - n_nan - n_inf;
mean = sum / n_valid;
double var = sum2 / n_valid - mean * mean;
if (var < 0) var = 0;
stddev = sqrt(var);
}
void MatrixStats::do_comment (const char *fmt, ...)
{
va_list ap;
/* Determine required size */
va_start(ap, fmt);
size_t size = vsnprintf(buf, nbuf, fmt, ap);
va_end(ap);
nbuf -= size;
buf += size;
}
MatrixStats::MatrixStats (size_t n, size_t d, const float *x):
n(n), d(d),
n_collision(0), n_valid(0), n0(0),
min_norm2(HUGE_VAL), max_norm2(0)
{
std::vector<char> comment_buf (10000);
buf = comment_buf.data ();
nbuf = comment_buf.size();
do_comment ("analyzing %ld vectors of size %ld\n", n, d);
if (d > 1024) {
do_comment (
"indexing this many dimensions is hard, "
"please consider dimensionality reducution (with PCAMatrix)\n");
}
size_t nbytes = sizeof (x[0]) * d;
per_dim_stats.resize (d);
for (size_t i = 0; i < n; i++) {
const float *xi = x + d * i;
double sum2 = 0;
for (size_t j = 0; j < d; j++) {
per_dim_stats[j].add (xi[j]);
sum2 += xi[j] * (double)xi[j];
}
if (std::isfinite (sum2)) {
n_valid++;
if (sum2 == 0) {
n0 ++;
} else {
if (sum2 < min_norm2) min_norm2 = sum2;
if (sum2 > max_norm2) max_norm2 = sum2;
}
}
{ // check hash
uint64_t hash = hash_bytes((const uint8_t*)xi, nbytes);
auto elt = occurrences.find (hash);
if (elt == occurrences.end()) {
Occurrence occ = {i, 1};
occurrences[hash] = occ;
} else {
if (!memcmp (xi, x + elt->second.first * d, nbytes)) {
elt->second.count ++;
} else {
n_collision ++;
// we should use a list of collisions but overkill
}
}
}
}
// invalid vector stats
if (n_valid == n) {
do_comment ("no NaN or Infs in data\n");
} else {
do_comment ("%ld vectors contain NaN or Inf "
"(or have too large components), "
"expect bad results with indexing!\n", n - n_valid);
}
// copies in dataset
if (occurrences.size() == n) {
do_comment ("all vectors are distinct\n");
} else {
do_comment ("%ld vectors are distinct (%.2f%%)\n",
occurrences.size(),
occurrences.size() * 100.0 / n);
if (n_collision > 0) {
do_comment ("%ld collisions in hash table, "
"counts may be invalid\n", n_collision);
}
Occurrence max = {0, 0};
for (auto it = occurrences.begin();
it != occurrences.end(); ++it) {
if (it->second.count > max.count) {
max = it->second;
}
}
do_comment ("vector %ld has %ld copies\n", max.first, max.count);
}
{ // norm stats
min_norm2 = sqrt (min_norm2);
max_norm2 = sqrt (max_norm2);
do_comment ("range of L2 norms=[%g, %g] (%ld null vectors)\n",
min_norm2, max_norm2, n0);
if (max_norm2 < min_norm2 * 1.0001) {
do_comment ("vectors are normalized, inner product and "
"L2 search are equivalent\n");
}
if (max_norm2 > min_norm2 * 100) {
do_comment ("vectors have very large differences in norms, "
"is this normal?\n");
}
}
{ // per dimension stats
double max_std = 0, min_std = HUGE_VAL;
size_t n_dangerous_range = 0, n_0_range = 0, n0 = 0;
for (size_t j = 0; j < d; j++) {
PerDimStats &st = per_dim_stats[j];
st.compute_mean_std ();
n0 += st.n0;
if (st.max == st.min) {
n_0_range ++;
} else if (st.max < 1.001 * st.min) {
n_dangerous_range ++;
}
if (st.stddev > max_std) max_std = st.stddev;
if (st.stddev < min_std) min_std = st.stddev;
}
if (n0 == 0) {
do_comment ("matrix contains no 0s\n");
} else {
do_comment ("matrix contains %.2f %% 0 entries\n",
n0 * 100.0 / (n * d));
}
if (n_0_range == 0) {
do_comment ("no constant dimensions\n");
} else {
do_comment ("%ld dimensions are constant: they can be removed\n",
n_0_range);
}
if (n_dangerous_range == 0) {
do_comment ("no dimension has a too large mean\n");
} else {
do_comment ("%ld dimensions are too large "
"wrt. their variance, may loose precision "
"in IndexFlatL2 (use CenteringTransform)\n",
n_dangerous_range);
}
do_comment ("stddevs per dimension are in [%g %g]\n", min_std, max_std);
size_t n_small_var = 0;
for (size_t j = 0; j < d; j++) {
const PerDimStats &st = per_dim_stats[j];
if (st.stddev < max_std * 1e-4) {
n_small_var++;
}
}
if (n_small_var > 0) {
do_comment ("%ld dimensions have negligible stddev wrt. "
"the largest dimension, they could be ignored",
n_small_var);
}
}
comments = comment_buf.data ();
buf = nullptr;
nbuf = 0;
}
} // namespace faiss

62
core/src/index/thirdparty/faiss/MatrixStats.h vendored Normal file
View File

@ -0,0 +1,62 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <vector>
#include <string>
#include <unordered_map>
#include <stdint.h>
namespace faiss {
/** Reports some statistics on a dataset and comments on them.
*
* It is a class rather than a function so that all stats can also be
* accessed from code */
struct MatrixStats {
MatrixStats (size_t n, size_t d, const float *x);
std::string comments;
// raw statistics
size_t n, d;
size_t n_collision, n_valid, n0;
double min_norm2, max_norm2;
struct PerDimStats {
size_t n, n_nan, n_inf, n0;
float min, max;
double sum, sum2;
size_t n_valid;
double mean, stddev;
PerDimStats();
void add (float x);
void compute_mean_std ();
};
std::vector<PerDimStats> per_dim_stats;
struct Occurrence {
size_t first;
size_t count;
};
std::unordered_map<uint64_t, Occurrence> occurrences;
char *buf;
size_t nbuf;
void do_comment (const char *fmt, ...);
};
} // namespace faiss
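A minimal sketch of how these stats are consumed, assuming a built libfaiss: construct MatrixStats over the raw matrix and print the generated comments:

#include <faiss/MatrixStats.h>
#include <cstdio>
#include <random>
#include <vector>

int main() {
    size_t n = 1000, d = 32;
    std::vector<float> x(n * d);
    std::mt19937 rng(0);
    std::normal_distribution<float> dist(0.f, 1.f);
    for (auto& v : x) v = dist(rng);
    faiss::MatrixStats stats(n, d, x.data());  // analysis runs in the ctor
    printf("%s", stats.comments.c_str());      // human-readable report
    return 0;
}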

351
core/src/index/thirdparty/faiss/MetaIndexes.cpp vendored Normal file
View File

@ -0,0 +1,351 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/MetaIndexes.h>
#include <cstdio>
#include <stdint.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/Heap.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/utils/WorkerThread.h>
namespace faiss {
namespace {
typedef Index::idx_t idx_t;
} // namespace
/*****************************************************
* IndexIDMap implementation
*******************************************************/
template <typename IndexT>
IndexIDMapTemplate<IndexT>::IndexIDMapTemplate (IndexT *index):
index (index),
own_fields (false)
{
FAISS_THROW_IF_NOT_MSG (index->ntotal == 0, "index must be empty on input");
this->is_trained = index->is_trained;
this->metric_type = index->metric_type;
this->verbose = index->verbose;
this->d = index->d;
}
template <typename IndexT>
void IndexIDMapTemplate<IndexT>::add
(idx_t, const typename IndexT::component_t *)
{
FAISS_THROW_MSG ("add does not make sense with IndexIDMap, "
"use add_with_ids");
}
template <typename IndexT>
void IndexIDMapTemplate<IndexT>::train
(idx_t n, const typename IndexT::component_t *x)
{
index->train (n, x);
this->is_trained = index->is_trained;
}
template <typename IndexT>
void IndexIDMapTemplate<IndexT>::reset ()
{
index->reset ();
id_map.clear();
this->ntotal = 0;
}
template <typename IndexT>
void IndexIDMapTemplate<IndexT>::add_with_ids
(idx_t n, const typename IndexT::component_t * x,
const typename IndexT::idx_t *xids)
{
index->add (n, x);
for (idx_t i = 0; i < n; i++)
id_map.push_back (xids[i]);
this->ntotal = index->ntotal;
}
template <typename IndexT>
void IndexIDMapTemplate<IndexT>::search
(idx_t n, const typename IndexT::component_t *x, idx_t k,
typename IndexT::distance_t *distances, typename IndexT::idx_t *labels) const
{
index->search (n, x, k, distances, labels);
idx_t *li = labels;
#pragma omp parallel for
for (idx_t i = 0; i < n * k; i++) {
li[i] = li[i] < 0 ? li[i] : id_map[li[i]];
}
}
template <typename IndexT>
void IndexIDMapTemplate<IndexT>::range_search
(typename IndexT::idx_t n, const typename IndexT::component_t *x,
typename IndexT::distance_t radius, RangeSearchResult *result) const
{
index->range_search(n, x, radius, result);
#pragma omp parallel for
for (idx_t i = 0; i < result->lims[result->nq]; i++) {
result->labels[i] = result->labels[i] < 0 ?
result->labels[i] : id_map[result->labels[i]];
}
}
namespace {
struct IDTranslatedSelector: IDSelector {
const std::vector <int64_t> & id_map;
const IDSelector & sel;
IDTranslatedSelector (const std::vector <int64_t> & id_map,
const IDSelector & sel):
id_map (id_map), sel (sel)
{}
bool is_member(idx_t id) const override {
return sel.is_member(id_map[id]);
}
};
}
template <typename IndexT>
size_t IndexIDMapTemplate<IndexT>::remove_ids (const IDSelector & sel)
{
// remove in sub-index first
IDTranslatedSelector sel2 (id_map, sel);
size_t nremove = index->remove_ids (sel2);
int64_t j = 0;
for (idx_t i = 0; i < this->ntotal; i++) {
if (sel.is_member (id_map[i])) {
// remove
} else {
id_map[j] = id_map[i];
j++;
}
}
FAISS_ASSERT (j == index->ntotal);
this->ntotal = j;
id_map.resize(this->ntotal);
return nremove;
}
template <typename IndexT>
IndexIDMapTemplate<IndexT>::~IndexIDMapTemplate ()
{
if (own_fields) delete index;
}
/*****************************************************
* IndexIDMap2 implementation
*******************************************************/
template <typename IndexT>
IndexIDMap2Template<IndexT>::IndexIDMap2Template (IndexT *index):
IndexIDMapTemplate<IndexT> (index)
{}
template <typename IndexT>
void IndexIDMap2Template<IndexT>::add_with_ids
(idx_t n, const typename IndexT::component_t* x,
const typename IndexT::idx_t* xids)
{
size_t prev_ntotal = this->ntotal;
IndexIDMapTemplate<IndexT>::add_with_ids (n, x, xids);
for (size_t i = prev_ntotal; i < this->ntotal; i++) {
rev_map [this->id_map [i]] = i;
}
}
template <typename IndexT>
void IndexIDMap2Template<IndexT>::construct_rev_map ()
{
rev_map.clear ();
for (size_t i = 0; i < this->ntotal; i++) {
rev_map [this->id_map [i]] = i;
}
}
template <typename IndexT>
size_t IndexIDMap2Template<IndexT>::remove_ids(const IDSelector& sel)
{
// This is quite inefficient
size_t nremove = IndexIDMapTemplate<IndexT>::remove_ids (sel);
construct_rev_map ();
return nremove;
}
template <typename IndexT>
void IndexIDMap2Template<IndexT>::reconstruct
(idx_t key, typename IndexT::component_t * recons) const
{
try {
this->index->reconstruct (rev_map.at (key), recons);
} catch (const std::out_of_range& e) {
FAISS_THROW_FMT ("key %ld not found", key);
}
}
// explicit template instantiations
template struct IndexIDMapTemplate<Index>;
template struct IndexIDMapTemplate<IndexBinary>;
template struct IndexIDMap2Template<Index>;
template struct IndexIDMap2Template<IndexBinary>;
/*****************************************************
* IndexSplitVectors implementation
*******************************************************/
IndexSplitVectors::IndexSplitVectors (idx_t d, bool threaded):
Index (d), own_fields (false),
threaded (threaded), sum_d (0)
{
}
void IndexSplitVectors::add_sub_index (Index *index)
{
sub_indexes.push_back (index);
sync_with_sub_indexes ();
}
void IndexSplitVectors::sync_with_sub_indexes ()
{
if (sub_indexes.empty()) return;
Index * index0 = sub_indexes[0];
sum_d = index0->d;
metric_type = index0->metric_type;
is_trained = index0->is_trained;
ntotal = index0->ntotal;
for (int i = 1; i < sub_indexes.size(); i++) {
Index * index = sub_indexes[i];
FAISS_THROW_IF_NOT (metric_type == index->metric_type);
FAISS_THROW_IF_NOT (ntotal == index->ntotal);
sum_d += index->d;
}
}
void IndexSplitVectors::add(idx_t /*n*/, const float* /*x*/) {
FAISS_THROW_MSG("not implemented");
}
void IndexSplitVectors::search (
idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels) const
{
FAISS_THROW_IF_NOT_MSG (k == 1,
"search implemented only for k=1");
FAISS_THROW_IF_NOT_MSG (sum_d == d,
"not enough indexes compared to # dimensions");
int64_t nshard = sub_indexes.size();
float *all_distances = new float [nshard * k * n];
idx_t *all_labels = new idx_t [nshard * k * n];
ScopeDeleter<float> del (all_distances);
ScopeDeleter<idx_t> del2 (all_labels);
auto query_func = [n, x, k, distances, labels, all_distances, all_labels, this]
(int no) {
const IndexSplitVectors *index = this;
float *distances1 = no == 0 ? distances : all_distances + no * k * n;
idx_t *labels1 = no == 0 ? labels : all_labels + no * k * n;
if (index->verbose)
printf ("begin query shard %d on %ld points\n", no, n);
const Index * sub_index = index->sub_indexes[no];
int64_t sub_d = sub_index->d, d = index->d;
idx_t ofs = 0;
for (int i = 0; i < no; i++) ofs += index->sub_indexes[i]->d;
float *sub_x = new float [sub_d * n];
ScopeDeleter<float> del1 (sub_x);
for (idx_t i = 0; i < n; i++)
memcpy (sub_x + i * sub_d, x + ofs + i * d, sub_d * sizeof (sub_x[0]));
sub_index->search (n, sub_x, k, distances1, labels1);
if (index->verbose)
printf ("end query shard %d\n", no);
};
if (!threaded) {
for (int i = 0; i < nshard; i++) {
query_func(i);
}
} else {
std::vector<std::unique_ptr<WorkerThread> > threads;
std::vector<std::future<bool>> v;
for (int i = 0; i < nshard; i++) {
threads.emplace_back(new WorkerThread());
WorkerThread *wt = threads.back().get();
v.emplace_back(wt->add([i, query_func](){query_func(i); }));
}
// Blocking wait for completion
for (auto& func : v) {
func.get();
}
}
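// Merge step: each shard returns ids local to its sub-index; the ids are
// combined as a mixed-radix number,
//   label = l_0 + l_1 * ntotal_0 + l_2 * ntotal_0 * ntotal_1 + ...
// and the per-shard distances are summed. If any shard returns -1, the
// combined entry is invalidated (label -1, NaN distance).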
int64_t factor = 1;
for (int i = 0; i < nshard; i++) {
if (i > 0) { // results of 0 are already in the table
const float *distances_i = all_distances + i * k * n;
const idx_t *labels_i = all_labels + i * k * n;
for (int64_t j = 0; j < n; j++) {
if (labels[j] >= 0 && labels_i[j] >= 0) {
labels[j] += labels_i[j] * factor;
distances[j] += distances_i[j];
} else {
labels[j] = -1;
distances[j] = 0.0 / 0.0;
}
}
}
factor *= sub_indexes[i]->ntotal;
}
}
void IndexSplitVectors::train(idx_t /*n*/, const float* /*x*/) {
FAISS_THROW_MSG("not implemented");
}
void IndexSplitVectors::reset ()
{
FAISS_THROW_MSG ("not implemented");
}
IndexSplitVectors::~IndexSplitVectors ()
{
if (own_fields) {
for (int s = 0; s < sub_indexes.size(); s++)
delete sub_indexes [s];
}
}
} // namespace faiss

View File

@ -0,0 +1,126 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef META_INDEXES_H
#define META_INDEXES_H
#include <vector>
#include <unordered_map>
#include <faiss/Index.h>
#include <faiss/IndexShards.h>
#include <faiss/IndexReplicas.h>
namespace faiss {
/** Index that translates search results to ids */
template <typename IndexT>
struct IndexIDMapTemplate : IndexT {
using idx_t = typename IndexT::idx_t;
using component_t = typename IndexT::component_t;
using distance_t = typename IndexT::distance_t;
IndexT * index; ///! the sub-index
bool own_fields; ///! whether pointers are deleted in destructor
std::vector<idx_t> id_map;
explicit IndexIDMapTemplate (IndexT *index);
/// @param xids if non-null, ids to store for the vectors (size n)
void add_with_ids(idx_t n, const component_t* x, const idx_t* xids) override;
/// this will fail. Use add_with_ids
void add(idx_t n, const component_t* x) override;
void search(
idx_t n, const component_t* x, idx_t k,
distance_t* distances,
idx_t* labels) const override;
void train(idx_t n, const component_t* x) override;
void reset() override;
/// remove ids adapted to IndexFlat
size_t remove_ids(const IDSelector& sel) override;
void range_search (idx_t n, const component_t *x, distance_t radius,
RangeSearchResult *result) const override;
~IndexIDMapTemplate () override;
IndexIDMapTemplate () {own_fields=false; index=nullptr; }
};
using IndexIDMap = IndexIDMapTemplate<Index>;
using IndexBinaryIDMap = IndexIDMapTemplate<IndexBinary>;
/** same as IndexIDMap but also provides an efficient reconstruction
* implementation via a 2-way index */
template <typename IndexT>
struct IndexIDMap2Template : IndexIDMapTemplate<IndexT> {
using idx_t = typename IndexT::idx_t;
using component_t = typename IndexT::component_t;
using distance_t = typename IndexT::distance_t;
std::unordered_map<idx_t, idx_t> rev_map;
explicit IndexIDMap2Template (IndexT *index);
/// make the rev_map from scratch
void construct_rev_map ();
void add_with_ids(idx_t n, const component_t* x, const idx_t* xids) override;
size_t remove_ids(const IDSelector& sel) override;
void reconstruct (idx_t key, component_t * recons) const override;
~IndexIDMap2Template() override {}
IndexIDMap2Template () {}
};
using IndexIDMap2 = IndexIDMap2Template<Index>;
using IndexBinaryIDMap2 = IndexIDMap2Template<IndexBinary>;
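/* Minimal usage sketch (hedged; assumes a flat base index and
 * caller-provided ids; all names below are illustrative):
 *
 *   faiss::IndexFlatL2 flat (d);
 *   faiss::IndexIDMap index (&flat);             // own_fields stays false
 *   index.add_with_ids (n, xb, custom_ids);      // plain add() would throw
 *   index.search (nq, xq, k, distances, labels); // labels are custom ids
 */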
/** splits input vectors in segments and assigns each segment to a sub-index
* used to distribute a MultiIndexQuantizer
*/
struct IndexSplitVectors: Index {
bool own_fields;
bool threaded;
std::vector<Index*> sub_indexes;
idx_t sum_d; /// sum of dimensions seen so far
explicit IndexSplitVectors (idx_t d, bool threaded = false);
void add_sub_index (Index *);
void sync_with_sub_indexes ();
void add(idx_t n, const float* x) override;
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
void train(idx_t n, const float* x) override;
void reset() override;
~IndexSplitVectors() override;
};
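/* Hedged usage sketch (assumes two sub-indexes, each trained on d/2
 * dimensions; variable names are illustrative):
 *
 *   faiss::IndexSplitVectors split (d, true);   // threaded queries
 *   split.add_sub_index (&sub0);                // covers dimensions [0, d/2)
 *   split.add_sub_index (&sub1);                // covers dimensions [d/2, d)
 *   split.search (n, x, 1, distances, labels);  // only k == 1 is supported
 */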
} // namespace faiss
#endif

View File

@ -0,0 +1,674 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/OnDiskInvertedLists.h>
#include <pthread.h>
#include <unordered_set>
#include <sys/mman.h>
#include <unistd.h>
#include <sys/types.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/utils.h>
namespace faiss {
/**********************************************
* LockLevels
**********************************************/
struct LockLevels {
/* There are n lock1(n) locks (one per inverted list), one lock2 and one lock3
* Invariants:
* a single thread can hold one lock1(n) for some n
* a single thread can hold lock2, if it holds lock1(n) for some n
* a single thread can hold lock3, if it holds lock1(n) for some n
* AND lock2 AND no other thread holds lock1(m) for m != n
*/
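/* In this file: per-list readers and writers take lock_1(list_no);
 * resize_locked() takes lock_2 to modify the slot free-list; and
 * allocate_slot() additionally takes lock_3 (while holding lock_2)
 * when the whole mmapped region must be grown and remapped. */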
pthread_mutex_t mutex1;
pthread_cond_t level1_cv;
pthread_cond_t level2_cv;
pthread_cond_t level3_cv;
std::unordered_set<int> level1_holders; // which level1 locks are held
int n_level2; // nb threads that wait on level2
bool level3_in_use; // a thread waits on level3
bool level2_in_use;
LockLevels() {
pthread_mutex_init(&mutex1, nullptr);
pthread_cond_init(&level1_cv, nullptr);
pthread_cond_init(&level2_cv, nullptr);
pthread_cond_init(&level3_cv, nullptr);
n_level2 = 0;
level2_in_use = false;
level3_in_use = false;
}
~LockLevels() {
pthread_cond_destroy(&level1_cv);
pthread_cond_destroy(&level2_cv);
pthread_cond_destroy(&level3_cv);
pthread_mutex_destroy(&mutex1);
}
void lock_1(int no) {
pthread_mutex_lock(&mutex1);
while (level3_in_use || level1_holders.count(no) > 0) {
pthread_cond_wait(&level1_cv, &mutex1);
}
level1_holders.insert(no);
pthread_mutex_unlock(&mutex1);
}
void unlock_1(int no) {
pthread_mutex_lock(&mutex1);
assert(level1_holders.count(no) == 1);
level1_holders.erase(no);
if (level3_in_use) { // a writer is waiting
pthread_cond_signal(&level3_cv);
} else {
pthread_cond_broadcast(&level1_cv);
}
pthread_mutex_unlock(&mutex1);
}
void lock_2() {
pthread_mutex_lock(&mutex1);
n_level2 ++;
if (level3_in_use) { // tell waiting level3 that we are blocked
pthread_cond_signal(&level3_cv);
}
while (level2_in_use) {
pthread_cond_wait(&level2_cv, &mutex1);
}
level2_in_use = true;
pthread_mutex_unlock(&mutex1);
}
void unlock_2() {
pthread_mutex_lock(&mutex1);
level2_in_use = false;
n_level2 --;
pthread_cond_signal(&level2_cv);
pthread_mutex_unlock(&mutex1);
}
void lock_3() {
pthread_mutex_lock(&mutex1);
level3_in_use = true;
// wait until there are no level1 holders anymore except the
// ones that are waiting on level2 (we are holding lock2)
while (level1_holders.size() > n_level2) {
pthread_cond_wait(&level3_cv, &mutex1);
}
// don't release the lock!
}
void unlock_3() {
level3_in_use = false;
// wake up all level1_holders
pthread_cond_broadcast(&level1_cv);
pthread_mutex_unlock(&mutex1);
}
void print () {
pthread_mutex_lock(&mutex1);
printf("State: level3_in_use=%d n_level2=%d level1_holders: [", level3_in_use, n_level2);
for (int k : level1_holders) {
printf("%d ", k);
}
printf("]\n");
pthread_mutex_unlock(&mutex1);
}
};
/**********************************************
* OngoingPrefetch
**********************************************/
struct OnDiskInvertedLists::OngoingPrefetch {
struct Thread {
pthread_t pth;
OngoingPrefetch *pf;
bool one_list () {
idx_t list_no = pf->get_next_list();
if(list_no == -1) return false;
const OnDiskInvertedLists *od = pf->od;
od->locks->lock_1 (list_no);
size_t n = od->list_size (list_no);
const Index::idx_t *idx = od->get_ids (list_no);
const uint8_t *codes = od->get_codes (list_no);
int cs = 0;
for (size_t i = 0; i < n;i++) {
cs += idx[i];
}
const idx_t *codes8 = (const idx_t*)codes;
idx_t n8 = n * od->code_size / 8;
for (size_t i = 0; i < n8;i++) {
cs += codes8[i];
}
od->locks->unlock_1(list_no);
global_cs += cs & 1;
return true;
}
};
std::vector<Thread> threads;
pthread_mutex_t list_ids_mutex;
std::vector<idx_t> list_ids;
int cur_list;
// mutex for the list of tasks
pthread_mutex_t mutex;
// pretext to keep the code below from being optimized out
static int global_cs;
const OnDiskInvertedLists *od;
explicit OngoingPrefetch (const OnDiskInvertedLists *od): od (od)
{
pthread_mutex_init (&mutex, nullptr);
pthread_mutex_init (&list_ids_mutex, nullptr);
cur_list = 0;
}
static void* prefetch_list (void * arg) {
Thread *th = static_cast<Thread*>(arg);
while (th->one_list()) ;
return nullptr;
}
idx_t get_next_list () {
idx_t list_no = -1;
pthread_mutex_lock (&list_ids_mutex);
if (cur_list >= 0 && cur_list < list_ids.size()) {
list_no = list_ids[cur_list++];
}
pthread_mutex_unlock (&list_ids_mutex);
return list_no;
}
void prefetch_lists (const idx_t *list_nos, int n) {
pthread_mutex_lock (&mutex);
pthread_mutex_lock (&list_ids_mutex);
list_ids.clear ();
pthread_mutex_unlock (&list_ids_mutex);
for (auto &th: threads) {
pthread_join (th.pth, nullptr);
}
threads.resize (0);
cur_list = 0;
int nt = std::min (n, od->prefetch_nthread);
if (nt > 0) {
// prepare tasks
for (int i = 0; i < n; i++) {
idx_t list_no = list_nos[i];
if (list_no >= 0 && od->list_size(list_no) > 0) {
list_ids.push_back (list_no);
}
}
// prepare threads
threads.resize (nt);
for (Thread &th: threads) {
th.pf = this;
pthread_create (&th.pth, nullptr, prefetch_list, &th);
}
}
pthread_mutex_unlock (&mutex);
}
~OngoingPrefetch () {
pthread_mutex_lock (&mutex);
for (auto &th: threads) {
pthread_join (th.pth, nullptr);
}
pthread_mutex_unlock (&mutex);
pthread_mutex_destroy (&mutex);
pthread_mutex_destroy (&list_ids_mutex);
}
};
int OnDiskInvertedLists::OngoingPrefetch::global_cs = 0;
void OnDiskInvertedLists::prefetch_lists (const idx_t *list_nos, int n) const
{
pf->prefetch_lists (list_nos, n);
}
/**********************************************
* OnDiskInvertedLists: mmapping
**********************************************/
void OnDiskInvertedLists::do_mmap ()
{
const char *rw_flags = read_only ? "r" : "r+";
int prot = read_only ? PROT_READ : PROT_WRITE | PROT_READ;
FILE *f = fopen (filename.c_str(), rw_flags);
FAISS_THROW_IF_NOT_FMT (f, "could not open %s in mode %s: %s",
filename.c_str(), rw_flags, strerror(errno));
uint8_t * ptro = (uint8_t*)mmap (nullptr, totsize,
prot, MAP_SHARED, fileno (f), 0);
FAISS_THROW_IF_NOT_FMT (ptro != MAP_FAILED,
"could not mmap %s: %s",
filename.c_str(),
strerror(errno));
ptr = ptro;
fclose (f);
}
void OnDiskInvertedLists::update_totsize (size_t new_size)
{
// unmap file
if (ptr != nullptr) {
int err = munmap (ptr, totsize);
FAISS_THROW_IF_NOT_FMT (err == 0, "munmap error: %s",
strerror(errno));
}
if (totsize == 0) {
// must create file before truncating it
FILE *f = fopen (filename.c_str(), "w");
FAISS_THROW_IF_NOT_FMT (f, "could not open %s in mode W: %s",
filename.c_str(), strerror(errno));
fclose (f);
}
if (new_size > totsize) {
if (!slots.empty() &&
slots.back().offset + slots.back().capacity == totsize) {
slots.back().capacity += new_size - totsize;
} else {
slots.push_back (Slot(totsize, new_size - totsize));
}
} else {
assert(!"not implemented");
}
totsize = new_size;
// create file
printf ("resizing %s to %ld bytes\n", filename.c_str(), totsize);
int err = truncate (filename.c_str(), totsize);
FAISS_THROW_IF_NOT_FMT (err == 0, "truncate %s to %ld: %s",
filename.c_str(), totsize,
strerror(errno));
do_mmap ();
}
/**********************************************
* OnDiskInvertedLists
**********************************************/
#define INVALID_OFFSET (size_t)(-1)
OnDiskInvertedLists::List::List ():
size (0), capacity (0), offset (INVALID_OFFSET)
{}
OnDiskInvertedLists::Slot::Slot (size_t offset, size_t capacity):
offset (offset), capacity (capacity)
{}
OnDiskInvertedLists::Slot::Slot ():
offset (0), capacity (0)
{}
OnDiskInvertedLists::OnDiskInvertedLists (
size_t nlist, size_t code_size,
const char *filename):
InvertedLists (nlist, code_size),
filename (filename),
totsize (0),
ptr (nullptr),
read_only (false),
locks (new LockLevels ()),
pf (new OngoingPrefetch (this)),
prefetch_nthread (32)
{
lists.resize (nlist);
// the slots list starts empty
}
OnDiskInvertedLists::OnDiskInvertedLists ():
OnDiskInvertedLists (0, 0, "")
{
}
OnDiskInvertedLists::~OnDiskInvertedLists ()
{
delete pf;
// unmap all lists
if (ptr != nullptr) {
int err = munmap (ptr, totsize);
if (err != 0) {
fprintf(stderr, "munmap error: %s\n",
strerror(errno));
}
}
delete locks;
}
size_t OnDiskInvertedLists::list_size(size_t list_no) const
{
return lists[list_no].size;
}
const uint8_t * OnDiskInvertedLists::get_codes (size_t list_no) const
{
if (lists[list_no].offset == INVALID_OFFSET) {
return nullptr;
}
return ptr + lists[list_no].offset;
}
const Index::idx_t * OnDiskInvertedLists::get_ids (size_t list_no) const
{
if (lists[list_no].offset == INVALID_OFFSET) {
return nullptr;
}
return (const idx_t*)(ptr + lists[list_no].offset +
code_size * lists[list_no].capacity);
}
void OnDiskInvertedLists::update_entries (
size_t list_no, size_t offset, size_t n_entry,
const idx_t *ids_in, const uint8_t *codes_in)
{
FAISS_THROW_IF_NOT (!read_only);
if (n_entry == 0) return;
const List & l = lists[list_no];
assert (n_entry + offset <= l.size);
idx_t *ids = const_cast<idx_t*>(get_ids (list_no));
memcpy (ids + offset, ids_in, sizeof(ids_in[0]) * n_entry);
uint8_t *codes = const_cast<uint8_t*>(get_codes (list_no));
memcpy (codes + offset * code_size, codes_in, code_size * n_entry);
}
size_t OnDiskInvertedLists::add_entries (
size_t list_no, size_t n_entry,
const idx_t* ids, const uint8_t *code)
{
FAISS_THROW_IF_NOT (!read_only);
locks->lock_1 (list_no);
size_t o = list_size (list_no);
resize_locked (list_no, n_entry + o);
update_entries (list_no, o, n_entry, ids, code);
locks->unlock_1 (list_no);
return o;
}
void OnDiskInvertedLists::resize (size_t list_no, size_t new_size)
{
FAISS_THROW_IF_NOT (!read_only);
locks->lock_1 (list_no);
resize_locked (list_no, new_size);
locks->unlock_1 (list_no);
}
void OnDiskInvertedLists::resize_locked (size_t list_no, size_t new_size)
{
List & l = lists[list_no];
if (new_size <= l.capacity &&
new_size > l.capacity / 2) {
l.size = new_size;
return;
}
// otherwise we release the current slot, and find a new one
locks->lock_2 ();
free_slot (l.offset, l.capacity);
List new_l;
if (new_size == 0) {
new_l = List();
} else {
new_l.size = new_size;
new_l.capacity = 1;
while (new_l.capacity < new_size) {
new_l.capacity *= 2;
}
new_l.offset = allocate_slot (
new_l.capacity * (sizeof(idx_t) + code_size));
}
// copy common data
if (l.offset != new_l.offset) {
size_t n = std::min (new_size, l.size);
if (n > 0) {
memcpy (ptr + new_l.offset, get_codes(list_no), n * code_size);
memcpy (ptr + new_l.offset + new_l.capacity * code_size,
get_ids (list_no), n * sizeof(idx_t));
}
}
lists[list_no] = new_l;
locks->unlock_2 ();
}
size_t OnDiskInvertedLists::allocate_slot (size_t capacity) {
// should hold lock2
auto it = slots.begin();
while (it != slots.end() && it->capacity < capacity) {
it++;
}
if (it == slots.end()) {
// not enough capacity
size_t new_size = totsize == 0 ? 32 : totsize * 2;
while (new_size - totsize < capacity)
new_size *= 2;
locks->lock_3 ();
update_totsize(new_size);
locks->unlock_3 ();
it = slots.begin();
while (it != slots.end() && it->capacity < capacity) {
it++;
}
assert (it != slots.end());
}
size_t o = it->offset;
if (it->capacity == capacity) {
slots.erase (it);
} else {
// take from beginning of slot
it->capacity -= capacity;
it->offset += capacity;
}
return o;
}
void OnDiskInvertedLists::free_slot (size_t offset, size_t capacity) {
// should hold lock2
if (capacity == 0) return;
auto it = slots.begin();
while (it != slots.end() && it->offset <= offset) {
it++;
}
size_t inf = 1UL << 60;
size_t end_prev = inf;
if (it != slots.begin()) {
auto prev = it;
prev--;
end_prev = prev->offset + prev->capacity;
}
size_t begin_next = inf;
if (it != slots.end()) {
begin_next = it->offset;
}
assert (end_prev == inf || offset >= end_prev);
assert (offset + capacity <= begin_next);
if (offset == end_prev) {
auto prev = it;
prev--;
if (offset + capacity == begin_next) {
prev->capacity += capacity + it->capacity;
slots.erase (it);
} else {
prev->capacity += capacity;
}
} else {
if (offset + capacity == begin_next) {
it->offset -= capacity;
it->capacity += capacity;
} else {
slots.insert (it, Slot (offset, capacity));
}
}
// TODO shrink global storage if needed
}
/*****************************************
* Compact form
*****************************************/
size_t OnDiskInvertedLists::merge_from (const InvertedLists **ils, int n_il,
bool verbose)
{
FAISS_THROW_IF_NOT_MSG (totsize == 0, "works only on an empty InvertedLists");
std::vector<size_t> sizes (nlist);
for (int i = 0; i < n_il; i++) {
const InvertedLists *il = ils[i];
FAISS_THROW_IF_NOT (il->nlist == nlist && il->code_size == code_size);
for (size_t j = 0; j < nlist; j++) {
sizes [j] += il->list_size(j);
}
}
size_t cums = 0;
size_t ntotal = 0;
for (size_t j = 0; j < nlist; j++) {
ntotal += sizes[j];
lists[j].size = 0;
lists[j].capacity = sizes[j];
lists[j].offset = cums;
cums += lists[j].capacity * (sizeof(idx_t) + code_size);
}
update_totsize (cums);
size_t nmerged = 0;
double t0 = getmillisecs(), last_t = t0;
#pragma omp parallel for
for (size_t j = 0; j < nlist; j++) {
List & l = lists[j];
for (int i = 0; i < n_il; i++) {
const InvertedLists *il = ils[i];
size_t n_entry = il->list_size(j);
l.size += n_entry;
update_entries (j, l.size - n_entry, n_entry,
ScopedIds(il, j).get(),
ScopedCodes(il, j).get());
}
assert (l.size == l.capacity);
if (verbose) {
#pragma omp critical
{
nmerged++;
double t1 = getmillisecs();
if (t1 - last_t > 500) {
printf("merged %ld lists in %.3f s\r",
nmerged, (t1 - t0) / 1000.0);
fflush(stdout);
last_t = t1;
}
}
}
}
if(verbose) {
printf("\n");
}
return ntotal;
}
void OnDiskInvertedLists::crop_invlists(size_t l0, size_t l1)
{
FAISS_THROW_IF_NOT(0 <= l0 && l0 <= l1 && l1 <= nlist);
std::vector<List> new_lists (l1 - l0);
memcpy (new_lists.data(), &lists[l0], (l1 - l0) * sizeof(List));
lists.swap(new_lists);
nlist = l1 - l0;
}
} // namespace faiss

View File

@ -0,0 +1,127 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_ON_DISK_INVERTED_LISTS_H
#define FAISS_ON_DISK_INVERTED_LISTS_H
#include <vector>
#include <list>
#include <faiss/IndexIVF.h>
namespace faiss {
struct LockLevels;
/** On-disk storage of inverted lists.
*
* The data is stored in a mmapped chunk of memory (base pointer ptr,
* size totsize). Each list is a range of that memory, described by a
* List object, that contains:
*
* - uint8_t codes[capacity * code_size]
* - followed by idx_t ids[capacity]
*
* in each of the arrays, only the first size <= capacity elements are
* used; the rest is not initialized.
*
* Addition and resize are supported by:
* - rounding up the capacity of the lists to a power of two
* - maintaining a list of empty slots, sorted by offset.
* - resizing the mmapped block as needed.
*
* An OnDiskInvertedLists is compact if size == capacity for all
* lists and there are no available slots.
*
* Addition to the invlists is slow. For incremental add it is better
* to use a default ArrayInvertedLists object and convert it to an
* OnDisk with merge_from.
*
* When it is known that a set of lists will be accessed, it is useful
* to call prefetch_lists, which launches a set of threads to read the
* lists in parallel.
*/
struct OnDiskInvertedLists: InvertedLists {
struct List {
size_t size; // size of inverted list (entries)
size_t capacity; // allocated size (entries)
size_t offset; // offset in buffer (bytes)
List ();
};
// size nlist
std::vector<List> lists;
struct Slot {
size_t offset; // bytes
size_t capacity; // bytes
Slot (size_t offset, size_t capacity);
Slot ();
};
// size whatever space remains
std::list<Slot> slots;
std::string filename;
size_t totsize;
uint8_t *ptr; // mmap base pointer
bool read_only; /// are inverted lists mapped read-only
OnDiskInvertedLists (size_t nlist, size_t code_size,
const char *filename);
size_t list_size(size_t list_no) const override;
const uint8_t * get_codes (size_t list_no) const override;
const idx_t * get_ids (size_t list_no) const override;
size_t add_entries (
size_t list_no, size_t n_entry,
const idx_t* ids, const uint8_t *code) override;
void update_entries (size_t list_no, size_t offset, size_t n_entry,
const idx_t *ids, const uint8_t *code) override;
void resize (size_t list_no, size_t new_size) override;
// copy all inverted lists into *this, in compact form (without
// allocating slots)
size_t merge_from (const InvertedLists **ils, int n_il, bool verbose=false);
/// restrict the inverted lists to l0:l1 without touching the mmapped region
void crop_invlists(size_t l0, size_t l1);
void prefetch_lists (const idx_t *list_nos, int nlist) const override;
virtual ~OnDiskInvertedLists ();
// private
LockLevels * locks;
// encapsulates the threads that are busy prefetching
struct OngoingPrefetch;
OngoingPrefetch *pf;
int prefetch_nthread;
void do_mmap ();
void update_totsize (size_t new_totsize);
void resize_locked (size_t list_no, size_t new_size);
size_t allocate_slot (size_t capacity);
void free_slot (size_t offset, size_t capacity);
// empty constructor for the I/O functions
OnDiskInvertedLists ();
};
} // namespace faiss
#endif

View File

@ -0,0 +1,87 @@
# Faiss
Faiss is a library for efficient similarity search and clustering of dense vectors. It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM. It also contains supporting code for evaluation and parameter tuning. Faiss is written in C++ with complete wrappers for Python/numpy. Some of the most useful algorithms are implemented on the GPU. It is developed by [Facebook AI Research](https://research.fb.com/category/facebook-ai-research-fair/).
## NEWS
*NEW: version 1.5.3 (2019-06-24) fix performance regression in IndexIVF.*
*NEW: version 1.5.2 (2019-05-27) the license was relaxed to MIT from BSD+Patents. Read LICENSE for details.*
*NEW: version 1.5.0 (2018-12-19) GPU binary flat index and binary HNSW index*
*NEW: version 1.4.0 (2018-08-30) no more crashes in pure Python code*
*NEW: version 1.3.0 (2018-07-12) support for binary indexes*
*NEW: latest commit (2018-02-22) supports on-disk storage of inverted indexes, see demos/demo_ondisk_ivf.py*
*NEW: latest commit (2018-01-09) includes an implementation of the HNSW indexing method, see benchs/bench_hnsw.py*
*NEW: there is now a Facebook public discussion group for Faiss users at https://www.facebook.com/groups/faissusers/*
*NEW: on 2017-07-30, the license on Faiss was relaxed to BSD from CC-BY-NC. Read LICENSE for details.*
## Introduction
Faiss contains several methods for similarity search. It assumes that the instances are represented as vectors and are identified by an integer, and that the vectors can be compared with L2 distances or dot products. Vectors that are similar to a query vector are those that have the lowest L2 distance or the highest dot product with the query vector. It also supports cosine similarity, since this is a dot product on normalized vectors.
Most of the methods, like those based on binary vectors and compact quantization codes, solely use a compressed representation of the vectors and do not require keeping the original vectors. This generally comes at the cost of a less precise search, but these methods can scale to billions of vectors in main memory on a single server.
The GPU implementation can accept input from either CPU or GPU memory. On a server with GPUs, the GPU indexes can be used as drop-in replacements for the CPU indexes (e.g., replace `IndexFlatL2` with `GpuIndexFlatL2`), and copies to/from GPU memory are handled automatically. Results will be faster, however, if both input and output remain resident on the GPU. Both single- and multi-GPU usage are supported.
## Building
The library is mostly implemented in C++, with optional GPU support provided via CUDA, and an optional Python interface. The CPU version requires a BLAS library. It compiles with a Makefile and can be packaged in a docker image. See [INSTALL.md](INSTALL.md) for details.
## How Faiss works
Faiss is built around an index type that stores a set of vectors, and provides a function to search in them with L2 and/or dot product vector comparison. Some index types are simple baselines, such as exact search. Most of the available indexing structures correspond to various trade-offs with respect to
- search time
- search quality
- memory used per index vector
- training time
- need for external data for unsupervised training
The optional GPU implementation provides what is likely (as of March 2017) the fastest exact and approximate (compressed-domain) nearest neighbor search implementation for high-dimensional vectors, fastest Lloyd's k-means, and fastest small k-selection algorithm known. [The implementation is detailed here](https://arxiv.org/abs/1702.08734).
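As a concrete starting point, here is a minimal C++ sketch of the simplest baseline, exact L2 search with `IndexFlatL2` (the database contents below are placeholders):
```
#include <faiss/IndexFlat.h>
#include <vector>

int main() {
    int d = 64;                                   // vector dimensionality
    int nb = 1000;                                // number of database vectors
    std::vector<float> xb((size_t)d * nb, 0.0f);  // placeholder database

    faiss::IndexFlatL2 index(d);                  // exact L2 search, no training step
    index.add(nb, xb.data());

    int k = 4;                                    // neighbors to retrieve
    std::vector<float> distances(k);
    std::vector<faiss::Index::idx_t> labels(k);
    index.search(1, xb.data(), k, distances.data(), labels.data());
    return 0;
}
```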
## Full documentation of Faiss
The following are entry points for documentation:
- the full documentation, including a [tutorial](https://github.com/facebookresearch/faiss/wiki/Getting-started), a [FAQ](https://github.com/facebookresearch/faiss/wiki/FAQ) and a [troubleshooting section](https://github.com/facebookresearch/faiss/wiki/Troubleshooting) can be found on the [wiki page](http://github.com/facebookresearch/faiss/wiki)
- the [doxygen documentation](http://rawgithub.com/facebookresearch/faiss/master/docs/html/annotated.html) gives per-class information
- to reproduce results from our research papers, [Polysemous codes](https://arxiv.org/abs/1609.01882) and [Billion-scale similarity search with GPUs](https://arxiv.org/abs/1702.08734), refer to the [benchmarks README](benchs/README.md). For [Link and code: Fast indexing with graphs and compact regression codes](https://arxiv.org/abs/1804.09996), see the [link_and_code README](benchs/link_and_code)
## Authors
The main authors of Faiss are:
- [Hervé Jégou](https://github.com/jegou) initiated the Faiss project and wrote its first implementation
- [Matthijs Douze](https://github.com/mdouze) implemented most of the CPU Faiss
- [Jeff Johnson](https://github.com/wickedfoo) implemented all of the GPU Faiss
- [Lucas Hosseini](https://github.com/beauby) implemented the binary indexes
## Reference
Reference to cite when you use Faiss in a research paper:
```
@article{JDH17,
title={Billion-scale similarity search with GPUs},
author={Johnson, Jeff and Douze, Matthijs and J{\'e}gou, Herv{\'e}},
journal={arXiv preprint arXiv:1702.08734},
year={2017}
}
```
## Join the Faiss community
For public discussion of Faiss or for questions, there is a Facebook public discussion group at https://www.facebook.com/groups/faissusers/
We monitor the [issues page](http://github.com/facebookresearch/faiss/issues) of the repository. You can report bugs, ask questions, etc.
## License
Faiss is MIT-licensed.

File diff suppressed because it is too large

View File

@ -0,0 +1,322 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_VECTOR_TRANSFORM_H
#define FAISS_VECTOR_TRANSFORM_H
/** Defines a few objects that apply transformations to a set of
* vectors. Often these are pre-processing steps.
*/
#include <vector>
#include <stdint.h>
#include <faiss/Index.h>
namespace faiss {
/** Any transformation applied on a set of vectors */
struct VectorTransform {
typedef Index::idx_t idx_t;
int d_in; ///! input dimension
int d_out; ///! output dimension
explicit VectorTransform (int d_in = 0, int d_out = 0):
d_in(d_in), d_out(d_out), is_trained(true)
{}
/// set if the VectorTransform does not require training, or if
/// training is done already
bool is_trained;
/** Perform training on a representative set of vectors. Does
* nothing by default.
*
* @param n nb of training vectors
* @param x training vectors, size n * d
*/
virtual void train (idx_t n, const float *x);
/** apply the transformation, return the result as a newly allocated matrix
* @param x size n * d_in
* @return size n * d_out
*/
float *apply (idx_t n, const float * x) const;
/// same as apply, but result is pre-allocated
virtual void apply_noalloc (idx_t n, const float * x,
float *xt) const = 0;
/// reverse transformation. May not be implemented or may return
/// approximate result
virtual void reverse_transform (idx_t n, const float * xt,
float *x) const;
virtual ~VectorTransform () {}
};
/** Generic linear transformation, with bias term applied on output
* y = A * x + b
*/
struct LinearTransform: VectorTransform {
bool have_bias; ///! whether to use the bias term
/// check if matrix A is orthonormal (enables reverse_transform)
bool is_orthonormal;
/// Transformation matrix, size d_out * d_in
std::vector<float> A;
/// bias vector, size d_out
std::vector<float> b;
/// both d_in > d_out and d_out > d_in are supported
explicit LinearTransform (int d_in = 0, int d_out = 0,
bool have_bias = false);
/// same as apply, but result is pre-allocated
void apply_noalloc(idx_t n, const float* x, float* xt) const override;
/// compute x = A^T * (x - b)
/// is reverse transform if A has orthonormal lines
void transform_transpose (idx_t n, const float * y,
float *x) const;
/// works only if is_orthonormal
void reverse_transform (idx_t n, const float * xt,
float *x) const override;
/// compute A^T * A to set the is_orthonormal flag
void set_is_orthonormal ();
bool verbose;
void print_if_verbose (const char*name, const std::vector<double> &mat,
int n, int d) const;
~LinearTransform() override {}
};
/// Randomly rotate a set of vectors
struct RandomRotationMatrix: LinearTransform {
/// both d_in > d_out and d_out > d_in are supported
RandomRotationMatrix (int d_in, int d_out):
LinearTransform(d_in, d_out, false) {}
/// must be called before the transform is used
void init(int seed);
// initializes with an arbitrary seed
void train(idx_t n, const float* x) override;
RandomRotationMatrix () {}
};
/** Applies a principal component analysis on a set of vectors,
* with optionally whitening and random rotation. */
struct PCAMatrix: LinearTransform {
/** after transformation the components are multiplied by
* eigenvalues^eigen_power
*
* =0: no whitening
* =-0.5: full whitening
*/
float eigen_power;
/// random rotation after PCA
bool random_rotation;
/// ratio between # training vectors and dimension
size_t max_points_per_d;
/// try to distribute output eigenvectors in this many bins
int balanced_bins;
/// Mean, size d_in
std::vector<float> mean;
/// eigenvalues of covariance matrix (= squared singular values)
std::vector<float> eigenvalues;
/// PCA matrix, size d_in * d_in
std::vector<float> PCAMat;
// the final matrix is computed after random rotation and/or whitening
explicit PCAMatrix (int d_in = 0, int d_out = 0,
float eigen_power = 0, bool random_rotation = false);
/// train on n vectors. If n < d_in then the eigenvector matrix
/// will be completed with 0s
void train(idx_t n, const float* x) override;
/// copy pre-trained PCA matrix
void copy_from (const PCAMatrix & other);
/// called after mean, PCAMat and eigenvalues are computed
void prepare_Ab();
};
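// Hedged sketch: reduce 128-d vectors to 64 dimensions with PCA (n and x
// stand for a representative training sample and are illustrative):
//
//   faiss::PCAMatrix pca (128, 64);
//   pca.train (n, x);               // x: n * 128 floats
//   float *xt = pca.apply (n, x);   // xt: n * 64 floats, free with delete [] xt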
/** ITQ implementation from
*
* Iterative quantization: A procrustean approach to learning binary codes
* for large-scale image retrieval,
*
* Yunchao Gong, Svetlana Lazebnik, Albert Gordo, Florent Perronnin,
* PAMI'12.
*/
struct ITQMatrix: LinearTransform {
int max_iter;
int seed;
// force initialization of the rotation (for debugging)
std::vector<double> init_rotation;
explicit ITQMatrix (int d = 0);
void train (idx_t n, const float* x) override;
};
/** The full ITQ transform, including normalizations and PCA transformation
*/
struct ITQTransform: VectorTransform {
std::vector<float> mean;
bool do_pca;
ITQMatrix itq;
/// max training points per dimension
int max_train_per_dim;
// concatenation of PCA + ITQ transformation
LinearTransform pca_then_itq;
explicit ITQTransform (int d_in = 0, int d_out = 0, bool do_pca = false);
void train (idx_t n, const float *x) override;
void apply_noalloc (idx_t n, const float* x, float* xt) const override;
};
struct ProductQuantizer;
/** Applies a rotation to align the dimensions with a PQ to minimize
* the reconstruction error. Can be used before an IndexPQ or an
* IndexIVFPQ. The method is the non-parametric version described in:
*
* "Optimized Product Quantization for Approximate Nearest Neighbor Search"
* Tiezheng Ge, Kaiming He, Qifa Ke, Jian Sun, CVPR'13
*
*/
struct OPQMatrix: LinearTransform {
int M; ///< nb of subquantizers
int niter; ///< Number of outer training iterations
int niter_pq; ///< Number of training iterations for the PQ
int niter_pq_0; ///< same, for the first outer iteration
/// if there are too many training points, resample
size_t max_train_points;
bool verbose;
/// if non-NULL, use this product quantizer for training
/// should be constructed with (d_out, M, _)
ProductQuantizer * pq;
/// if d2 != -1, output vectors of this dimension
explicit OPQMatrix (int d = 0, int M = 1, int d2 = -1);
void train(idx_t n, const float* x) override;
};
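// Hedged sketch: OPQ rotation in front of a PQ index via IndexPreTransform
// (assumes faiss/IndexPQ.h and faiss/IndexPreTransform.h are available):
//
//   faiss::OPQMatrix opq (d, M);
//   faiss::IndexPQ pq (d, M, 8);
//   faiss::IndexPreTransform index (&opq, &pq);
//   index.train (n, x);             // trains the rotation, then the PQ
//   index.add (n, x);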
/** remap dimensions for input vectors, possibly inserting 0s
* strictly speaking this is also a linear transform but we don't want
* to compute it with matrix multiplies */
struct RemapDimensionsTransform: VectorTransform {
/// map from output dimension to input, size d_out
/// -1 -> set output to 0
std::vector<int> map;
RemapDimensionsTransform (int d_in, int d_out, const int *map);
/// remap input to output, skipping or inserting dimensions as needed
/// if uniform: distribute dimensions uniformly
/// otherwise just take the d_out first ones.
RemapDimensionsTransform (int d_in, int d_out, bool uniform = true);
void apply_noalloc(idx_t n, const float* x, float* xt) const override;
/// reverse transform correct only when the mapping is a permutation
void reverse_transform(idx_t n, const float* xt, float* x) const override;
RemapDimensionsTransform () {}
};
/** per-vector normalization */
struct NormalizationTransform: VectorTransform {
float norm;
explicit NormalizationTransform (int d, float norm = 2.0);
NormalizationTransform ();
void apply_noalloc(idx_t n, const float* x, float* xt) const override;
/// Identity transform since the norm is not invertible
void reverse_transform(idx_t n, const float* xt, float* x) const override;
};
/** Subtract the mean of each component from the vectors. */
struct CenteringTransform: VectorTransform {
/// Mean, size d_in = d_out
std::vector<float> mean;
explicit CenteringTransform (int d = 0);
/// train on n vectors.
void train(idx_t n, const float* x) override;
/// subtract the mean
void apply_noalloc(idx_t n, const float* x, float* xt) const override;
/// add the mean
void reverse_transform (idx_t n, const float * xt,
float *x) const override;
};
} // namespace faiss
#endif

View File

@ -0,0 +1,234 @@
# ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_blas.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_BLAS([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
#
# DESCRIPTION
#
# This macro looks for a library that implements the BLAS linear-algebra
# interface (see http://www.netlib.org/blas/). On success, it sets the
# BLAS_LIBS output variable to hold the requisite library linkages.
#
# To link with BLAS, you should link with:
#
# $BLAS_LIBS $LIBS $FLIBS
#
# in that order. FLIBS is the output variable of the
# AC_F77_LIBRARY_LDFLAGS macro (called if necessary by AX_BLAS), and is
# sometimes necessary in order to link with F77 libraries. Users will also
# need to use AC_F77_DUMMY_MAIN (see the autoconf manual), for the same
# reason.
#
# Many libraries are searched for, from ATLAS to CXML to ESSL. The user
# may also use --with-blas=<lib> in order to use some specific BLAS
# library <lib>. In order to link successfully, however, be aware that you
# will probably need to use the same Fortran compiler (which can be set
# via the F77 env. var.) as was used to compile the BLAS library.
#
# ACTION-IF-FOUND is a list of shell commands to run if a BLAS library is
# found, and ACTION-IF-NOT-FOUND is a list of commands to run if it is
# not found. If ACTION-IF-FOUND is not specified, the default action will
# define HAVE_BLAS.
#
# LICENSE
#
# Copyright (c) 2008 Steven G. Johnson <stevenj@alum.mit.edu>
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
#
# As a special exception, the respective Autoconf Macro's copyright owner
# gives unlimited permission to copy, distribute and modify the configure
# scripts that are the output of Autoconf when processing the Macro. You
# need not follow the terms of the GNU General Public License when using
# or distributing such scripts, even though portions of the text of the
# Macro appear in them. The GNU General Public License (GPL) does govern
# all other use of the material that constitutes the Autoconf Macro.
#
# This special exception to the GPL applies to versions of the Autoconf
# Macro released by the Autoconf Archive. When you make and distribute a
# modified version of the Autoconf Macro, you may extend this special
# exception to the GPL to apply to your modified version as well.
#serial 15
AU_ALIAS([ACX_BLAS], [AX_BLAS])
AC_DEFUN([AX_BLAS], [
AC_PREREQ(2.50)
# AC_REQUIRE([AC_F77_LIBRARY_LDFLAGS])
AC_REQUIRE([AC_CANONICAL_HOST])
ax_blas_ok=no
AC_ARG_WITH(blas,
[AS_HELP_STRING([--with-blas=<lib>], [use BLAS library <lib>])])
case $with_blas in
yes | "") ;;
no) ax_blas_ok=disable ;;
-* | */* | *.a | *.so | *.so.* | *.o) BLAS_LIBS="$with_blas" ;;
*) BLAS_LIBS="-l$with_blas" ;;
esac
OPENMP_LDFLAGS="$OPENMP_CXXFLAGS"
# Get fortran linker names of BLAS functions to check for.
# AC_F77_FUNC(sgemm)
# AC_F77_FUNC(dgemm)
sgemm=sgemm_
dgemm=dgemm_
ax_blas_save_LIBS="$LIBS"
LIBS="$LIBS $FLIBS"
# First, check BLAS_LIBS environment variable
if test $ax_blas_ok = no; then
if test "x$BLAS_LIBS" != x; then
save_LIBS="$LIBS"; LIBS="$BLAS_LIBS $LIBS"
AC_MSG_CHECKING([for $sgemm in $BLAS_LIBS])
AC_TRY_LINK_FUNC($sgemm, [ax_blas_ok=yes], [BLAS_LIBS=""])
AC_MSG_RESULT($ax_blas_ok)
LIBS="$save_LIBS"
fi
fi
# BLAS linked to by default? (happens on some supercomputers)
if test $ax_blas_ok = no; then
save_LIBS="$LIBS"; LIBS="$LIBS"
AC_MSG_CHECKING([if $sgemm is being linked in already])
AC_TRY_LINK_FUNC($sgemm, [ax_blas_ok=yes])
AC_MSG_RESULT($ax_blas_ok)
LIBS="$save_LIBS"
fi
# BLAS in Intel MKL library?
if test $ax_blas_ok = no; then
case $host_os in
darwin*)
AC_CHECK_LIB(mkl_intel_lp64, $sgemm,
[ax_blas_ok=yes;BLAS_LIBS="-lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread"; OPENMP_LDFLAGS=""],,
[-lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread])
;;
*)
if test $host_cpu = x86_64; then
AC_CHECK_LIB(mkl_intel_lp64, $sgemm,
[ax_blas_ok=yes;BLAS_LIBS="-lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl"],,
[-lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl])
elif test $host_cpu = i686; then
AC_CHECK_LIB(mkl_intel, $sgemm,
[ax_blas_ok=yes;BLAS_LIBS="-lmkl_intel -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl"],,
[-lmkl_intel -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl])
fi
;;
esac
fi
# Old versions of MKL
if test $ax_blas_ok = no; then
AC_CHECK_LIB(mkl, $sgemm, [ax_blas_ok=yes;BLAS_LIBS="-lmkl -lguide -lpthread"],,[-lguide -lpthread])
fi
# BLAS in OpenBLAS library? (http://xianyi.github.com/OpenBLAS/)
if test $ax_blas_ok = no; then
AC_CHECK_LIB(openblas, $sgemm, [ax_blas_ok=yes
BLAS_LIBS="-lopenblas"])
fi
# BLAS in ATLAS library? (http://math-atlas.sourceforge.net/)
if test $ax_blas_ok = no; then
AC_CHECK_LIB(atlas, ATL_xerbla,
[AC_CHECK_LIB(f77blas, $sgemm,
[AC_CHECK_LIB(cblas, cblas_dgemm,
[ax_blas_ok=yes
BLAS_LIBS="-lcblas -lf77blas -latlas"],
[], [-lf77blas -latlas])],
[], [-latlas])])
fi
# BLAS in PhiPACK libraries? (requires generic BLAS lib, too)
if test $ax_blas_ok = no; then
AC_CHECK_LIB(blas, $sgemm,
[AC_CHECK_LIB(dgemm, $dgemm,
[AC_CHECK_LIB(sgemm, $sgemm,
[ax_blas_ok=yes; BLAS_LIBS="-lsgemm -ldgemm -lblas"],
[], [-lblas])],
[], [-lblas])])
fi
# BLAS in Apple vecLib library?
if test $ax_blas_ok = no; then
save_LIBS="$LIBS"; LIBS="-framework vecLib $LIBS"
AC_MSG_CHECKING([for $sgemm in -framework vecLib])
AC_TRY_LINK_FUNC($sgemm, [ax_blas_ok=yes;BLAS_LIBS="-framework vecLib"])
AC_MSG_RESULT($ax_blas_ok)
LIBS="$save_LIBS"
fi
# BLAS in Alpha CXML library?
if test $ax_blas_ok = no; then
AC_CHECK_LIB(cxml, $sgemm, [ax_blas_ok=yes;BLAS_LIBS="-lcxml"])
fi
# BLAS in Alpha DXML library? (now called CXML, see above)
if test $ax_blas_ok = no; then
AC_CHECK_LIB(dxml, $sgemm, [ax_blas_ok=yes;BLAS_LIBS="-ldxml"])
fi
# BLAS in Sun Performance library?
if test $ax_blas_ok = no; then
if test "x$GCC" != xyes; then # only works with Sun CC
AC_CHECK_LIB(sunmath, acosp,
[AC_CHECK_LIB(sunperf, $sgemm,
[BLAS_LIBS="-xlic_lib=sunperf -lsunmath"
ax_blas_ok=yes],[],[-lsunmath])])
fi
fi
# BLAS in SCSL library? (SGI/Cray Scientific Library)
if test $ax_blas_ok = no; then
AC_CHECK_LIB(scs, $sgemm, [ax_blas_ok=yes; BLAS_LIBS="-lscs"])
fi
# BLAS in SGIMATH library?
if test $ax_blas_ok = no; then
AC_CHECK_LIB(complib.sgimath, $sgemm,
[ax_blas_ok=yes; BLAS_LIBS="-lcomplib.sgimath"])
fi
# BLAS in IBM ESSL library? (requires generic BLAS lib, too)
if test $ax_blas_ok = no; then
AC_CHECK_LIB(blas, $sgemm,
[AC_CHECK_LIB(essl, $sgemm,
[ax_blas_ok=yes; BLAS_LIBS="-lessl -lblas"],
[], [-lblas $FLIBS])])
fi
# Generic BLAS library?
if test $ax_blas_ok = no; then
AC_CHECK_LIB(blas, $sgemm, [ax_blas_ok=yes; BLAS_LIBS="-lblas"])
fi
AC_SUBST(BLAS_LIBS)
AC_SUBST(OPENMP_LDFLAGS)
LIBS="$ax_blas_save_LIBS"
# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
if test x"$ax_blas_ok" = xyes; then
ifelse([$1],,AC_DEFINE(HAVE_BLAS,1,[Define if you have a BLAS library.]),[$1])
:
else
ax_blas_ok=no
$2
fi
])dnl AX_BLAS

View File

@ -0,0 +1,26 @@
# serial 1
AC_DEFUN([AX_CPU_ARCH], [
AC_MSG_CHECKING([for cpu arch])
AC_CANONICAL_TARGET
case $target in
amd64-* | x86_64-*)
ARCH_CPUFLAGS="-mavx2 -mf16c -msse4 -mpopcnt"
ARCH_CXXFLAGS="-m64"
;;
aarch64*-*)
dnl This is the arch for the Nvidia Xavier; a proper detection would be nice.
ARCH_CPUFLAGS="-march=armv8.2-a"
;;
*) ;;
esac
AC_MSG_RESULT([$target CPUFLAGS+="$ARCH_CPUFLAGS" CXXFLAGS+="$ARCH_CXXFLAGS"])
AC_SUBST(ARCH_CPUFLAGS)
AC_SUBST(ARCH_CXXFLAGS)
])dnl

View File

@ -0,0 +1,972 @@
# ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_CXX_COMPILE_STDCXX(VERSION, [ext|noext], [mandatory|optional])
#
# DESCRIPTION
#
# Check for baseline language coverage in the compiler for the specified
# version of the C++ standard. If necessary, add switches to CXX and
# CXXCPP to enable support. VERSION may be '11' (for the C++11 standard)
# or '14' (for the C++14 standard).
#
# The second argument, if specified, indicates whether you insist on an
# extended mode (e.g. -std=gnu++11) or a strict conformance mode (e.g.
# -std=c++11). If neither is specified, you get whatever works, with
# preference for an extended mode.
#
# The third argument, if specified 'mandatory' or if left unspecified,
# indicates that baseline support for the specified C++ standard is
# required and that the macro should error out if no mode with that
# support is found. If specified 'optional', then configuration proceeds
# regardless, after defining HAVE_CXX${VERSION} if and only if a
# supporting mode is found.
#
# LICENSE
#
# Copyright (c) 2008 Benjamin Kosnik <bkoz@redhat.com>
# Copyright (c) 2012 Zack Weinberg <zackw@panix.com>
# Copyright (c) 2013 Roy Stogner <roystgnr@ices.utexas.edu>
# Copyright (c) 2014, 2015 Google Inc.; contributed by Alexey Sokolov <sokolov@google.com>
# Copyright (c) 2015 Paul Norman <penorman@mac.com>
# Copyright (c) 2015 Moritz Klammler <moritz@klammler.eu>
# Copyright (c) 2016, 2018 Krzesimir Nowak <qdlacz@gmail.com>
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved. This file is offered as-is, without any
# warranty.
#serial 9
dnl This macro is based on the code from the AX_CXX_COMPILE_STDCXX_11 macro
dnl (serial version number 13).
AC_DEFUN([AX_CXX_COMPILE_STDCXX], [dnl
m4_if([$1], [11], [ax_cxx_compile_alternatives="11 0x"],
[$1], [14], [ax_cxx_compile_alternatives="14 1y"],
[$1], [17], [ax_cxx_compile_alternatives="17 1z"],
[m4_fatal([invalid first argument `$1' to AX_CXX_COMPILE_STDCXX])])dnl
m4_if([$2], [], [],
[$2], [ext], [],
[$2], [noext], [],
[m4_fatal([invalid second argument `$2' to AX_CXX_COMPILE_STDCXX])])dnl
m4_if([$3], [], [ax_cxx_compile_cxx$1_required=true],
[$3], [mandatory], [ax_cxx_compile_cxx$1_required=true],
[$3], [optional], [ax_cxx_compile_cxx$1_required=false],
[m4_fatal([invalid third argument `$3' to AX_CXX_COMPILE_STDCXX])])
AC_LANG_PUSH([C++])dnl
ac_success=no
m4_if([$2], [noext], [], [dnl
if test x$ac_success = xno; then
for alternative in ${ax_cxx_compile_alternatives}; do
switch="-std=gnu++${alternative}"
cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch])
AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch,
$cachevar,
[ac_save_CXX="$CXX"
CXX="$CXX $switch"
AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])],
[eval $cachevar=yes],
[eval $cachevar=no])
CXX="$ac_save_CXX"])
if eval test x\$$cachevar = xyes; then
CXX="$CXX $switch"
if test -n "$CXXCPP" ; then
CXXCPP="$CXXCPP $switch"
fi
ac_success=yes
break
fi
done
fi])
m4_if([$2], [ext], [], [dnl
if test x$ac_success = xno; then
dnl HP's aCC needs +std=c++11 according to:
dnl http://h21007.www2.hp.com/portal/download/files/unprot/aCxx/PDF_Release_Notes/769149-001.pdf
dnl Cray's crayCC needs "-h std=c++11"
for alternative in ${ax_cxx_compile_alternatives}; do
for switch in -std=c++${alternative} +std=c++${alternative} "-h std=c++${alternative}"; do
cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch])
AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch,
$cachevar,
[ac_save_CXX="$CXX"
CXX="$CXX $switch"
AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])],
[eval $cachevar=yes],
[eval $cachevar=no])
CXX="$ac_save_CXX"])
if eval test x\$$cachevar = xyes; then
CXX="$CXX $switch"
if test -n "$CXXCPP" ; then
CXXCPP="$CXXCPP $switch"
fi
ac_success=yes
break
fi
done
if test x$ac_success = xyes; then
break
fi
done
fi])
AC_LANG_POP([C++])
if test x$ax_cxx_compile_cxx$1_required = xtrue; then
if test x$ac_success = xno; then
AC_MSG_ERROR([*** A compiler with support for C++$1 language features is required.])
fi
fi
if test x$ac_success = xno; then
HAVE_CXX$1=0
AC_MSG_NOTICE([No compiler with C++$1 support was found])
else
HAVE_CXX$1=1
AC_DEFINE(HAVE_CXX$1,1,
[define if the compiler supports basic C++$1 syntax])
fi
AC_SUBST(HAVE_CXX$1)
])
dnl Test body for checking C++11 support
m4_define([_AX_CXX_COMPILE_STDCXX_testbody_11],
_AX_CXX_COMPILE_STDCXX_testbody_new_in_11
)
dnl Test body for checking C++14 support
m4_define([_AX_CXX_COMPILE_STDCXX_testbody_14],
_AX_CXX_COMPILE_STDCXX_testbody_new_in_11
_AX_CXX_COMPILE_STDCXX_testbody_new_in_14
)
m4_define([_AX_CXX_COMPILE_STDCXX_testbody_17],
_AX_CXX_COMPILE_STDCXX_testbody_new_in_11
_AX_CXX_COMPILE_STDCXX_testbody_new_in_14
_AX_CXX_COMPILE_STDCXX_testbody_new_in_17
)
dnl Tests for new features in C++11
m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_11], [[
// If the compiler admits that it is not ready for C++11, why torture it?
// Hopefully, this will speed up the test.
#ifndef __cplusplus
#error "This is not a C++ compiler"
#elif __cplusplus < 201103L
#error "This is not a C++11 compiler"
#else
namespace cxx11
{
namespace test_static_assert
{
template <typename T>
struct check
{
static_assert(sizeof(int) <= sizeof(T), "not big enough");
};
}
namespace test_final_override
{
struct Base
{
virtual void f() {}
};
struct Derived : public Base
{
virtual void f() override {}
};
}
namespace test_double_right_angle_brackets
{
template < typename T >
struct check {};
typedef check<void> single_type;
typedef check<check<void>> double_type;
typedef check<check<check<void>>> triple_type;
typedef check<check<check<check<void>>>> quadruple_type;
}
namespace test_decltype
{
int
f()
{
int a = 1;
decltype(a) b = 2;
return a + b;
}
}
namespace test_type_deduction
{
template < typename T1, typename T2 >
struct is_same
{
static const bool value = false;
};
template < typename T >
struct is_same<T, T>
{
static const bool value = true;
};
template < typename T1, typename T2 >
auto
add(T1 a1, T2 a2) -> decltype(a1 + a2)
{
return a1 + a2;
}
int
test(const int c, volatile int v)
{
static_assert(is_same<int, decltype(0)>::value == true, "");
static_assert(is_same<int, decltype(c)>::value == false, "");
static_assert(is_same<int, decltype(v)>::value == false, "");
auto ac = c;
auto av = v;
auto sumi = ac + av + 'x';
auto sumf = ac + av + 1.0;
static_assert(is_same<int, decltype(ac)>::value == true, "");
static_assert(is_same<int, decltype(av)>::value == true, "");
static_assert(is_same<int, decltype(sumi)>::value == true, "");
static_assert(is_same<int, decltype(sumf)>::value == false, "");
static_assert(is_same<int, decltype(add(c, v))>::value == true, "");
return (sumf > 0.0) ? sumi : add(c, v);
}
}
namespace test_noexcept
{
int f() { return 0; }
int g() noexcept { return 0; }
static_assert(noexcept(f()) == false, "");
static_assert(noexcept(g()) == true, "");
}
namespace test_constexpr
{
template < typename CharT >
unsigned long constexpr
strlen_c_r(const CharT *const s, const unsigned long acc) noexcept
{
return *s ? strlen_c_r(s + 1, acc + 1) : acc;
}
template < typename CharT >
unsigned long constexpr
strlen_c(const CharT *const s) noexcept
{
return strlen_c_r(s, 0UL);
}
static_assert(strlen_c("") == 0UL, "");
static_assert(strlen_c("1") == 1UL, "");
static_assert(strlen_c("example") == 7UL, "");
static_assert(strlen_c("another\0example") == 7UL, "");
}
namespace test_rvalue_references
{
template < int N >
struct answer
{
static constexpr int value = N;
};
answer<1> f(int&) { return answer<1>(); }
answer<2> f(const int&) { return answer<2>(); }
answer<3> f(int&&) { return answer<3>(); }
void
test()
{
int i = 0;
const int c = 0;
static_assert(decltype(f(i))::value == 1, "");
static_assert(decltype(f(c))::value == 2, "");
static_assert(decltype(f(0))::value == 3, "");
}
}
namespace test_uniform_initialization
{
struct test
{
static const int zero {};
static const int one {1};
};
static_assert(test::zero == 0, "");
static_assert(test::one == 1, "");
}
namespace test_lambdas
{
void
test1()
{
auto lambda1 = [](){};
auto lambda2 = lambda1;
lambda1();
lambda2();
}
int
test2()
{
auto a = [](int i, int j){ return i + j; }(1, 2);
auto b = []() -> int { return '0'; }();
auto c = [=](){ return a + b; }();
auto d = [&](){ return c; }();
auto e = [a, &b](int x) mutable {
const auto identity = [](int y){ return y; };
for (auto i = 0; i < a; ++i)
a += b--;
return x + identity(a + b);
}(0);
return a + b + c + d + e;
}
int
test3()
{
const auto nullary = [](){ return 0; };
const auto unary = [](int x){ return x; };
using nullary_t = decltype(nullary);
using unary_t = decltype(unary);
const auto higher1st = [](nullary_t f){ return f(); };
const auto higher2nd = [unary](nullary_t f1){
return [unary, f1](unary_t f2){ return f2(unary(f1())); };
};
return higher1st(nullary) + higher2nd(nullary)(unary);
}
}
namespace test_variadic_templates
{
template <int...>
struct sum;
template <int N0, int... N1toN>
struct sum<N0, N1toN...>
{
static constexpr auto value = N0 + sum<N1toN...>::value;
};
template <>
struct sum<>
{
static constexpr auto value = 0;
};
static_assert(sum<>::value == 0, "");
static_assert(sum<1>::value == 1, "");
static_assert(sum<23>::value == 23, "");
static_assert(sum<1, 2>::value == 3, "");
static_assert(sum<5, 5, 11>::value == 21, "");
static_assert(sum<2, 3, 5, 7, 11, 13>::value == 41, "");
}
// http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae
// Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function
// because of this.
namespace test_template_alias_sfinae
{
struct foo {};
template<typename T>
using member = typename T::member_type;
template<typename T>
void func(...) {}
template<typename T>
void func(member<T>*) {}
void test();
void test() { func<foo>(0); }
}
} // namespace cxx11
#endif // __cplusplus >= 201103L
]])
dnl Tests for new features in C++14
m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_14], [[
// If the compiler admits that it is not ready for C++14, why torture it?
// Hopefully, this will speed up the test.
#ifndef __cplusplus
#error "This is not a C++ compiler"
#elif __cplusplus < 201402L
#error "This is not a C++14 compiler"
#else
namespace cxx14
{
namespace test_polymorphic_lambdas
{
int
test()
{
const auto lambda = [](auto&&... args){
const auto istiny = [](auto x){
return (sizeof(x) == 1UL) ? 1 : 0;
};
const int aretiny[] = { istiny(args)... };
return aretiny[0];
};
return lambda(1, 1L, 1.0f, '1');
}
}
namespace test_binary_literals
{
constexpr auto ivii = 0b0000000000101010;
static_assert(ivii == 42, "wrong value");
}
namespace test_generalized_constexpr
{
template < typename CharT >
constexpr unsigned long
strlen_c(const CharT *const s) noexcept
{
auto length = 0UL;
for (auto p = s; *p; ++p)
++length;
return length;
}
static_assert(strlen_c("") == 0UL, "");
static_assert(strlen_c("x") == 1UL, "");
static_assert(strlen_c("test") == 4UL, "");
static_assert(strlen_c("another\0test") == 7UL, "");
}
namespace test_lambda_init_capture
{
int
test()
{
auto x = 0;
const auto lambda1 = [a = x](int b){ return a + b; };
const auto lambda2 = [a = lambda1(x)](){ return a; };
return lambda2();
}
}
namespace test_digit_separators
{
constexpr auto ten_million = 10'000'000;
static_assert(ten_million == 10000000, "wrong value");
}
namespace test_return_type_deduction
{
auto f(int& x) { return x; }
decltype(auto) g(int& x) { return x; }
template < typename T1, typename T2 >
struct is_same
{
static constexpr auto value = false;
};
template < typename T >
struct is_same<T, T>
{
static constexpr auto value = true;
};
int
test()
{
auto x = 0;
static_assert(is_same<int, decltype(f(x))>::value, "");
static_assert(is_same<int&, decltype(g(x))>::value, "");
return x;
}
}
} // namespace cxx14
#endif // __cplusplus >= 201402L
]])
dnl Tests for new features in C++17
m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_17], [[
// If the compiler admits that it is not ready for C++17, why torture it?
// Hopefully, this will speed up the test.
#ifndef __cplusplus
#error "This is not a C++ compiler"
#elif __cplusplus <= 201402L
#error "This is not a C++17 compiler"
#else
#if defined(__clang__)
#define REALLY_CLANG
#else
#if defined(__GNUC__)
#define REALLY_GCC
#endif
#endif
#include <initializer_list>
#include <utility>
#include <type_traits>
namespace cxx17
{
#if !defined(REALLY_CLANG)
namespace test_constexpr_lambdas
{
// TODO: test it with clang++ from git
constexpr int foo = [](){return 42;}();
}
#endif // !defined(REALLY_CLANG)
namespace test::nested_namespace::definitions
{
}
namespace test_fold_expression
{
template<typename... Args>
int multiply(Args... args)
{
return (args * ... * 1);
}
template<typename... Args>
bool all(Args... args)
{
return (args && ...);
}
}
namespace test_extended_static_assert
{
static_assert (true);
}
namespace test_auto_brace_init_list
{
auto foo = {5};
auto bar {5};
static_assert(std::is_same<std::initializer_list<int>, decltype(foo)>::value);
static_assert(std::is_same<int, decltype(bar)>::value);
}
namespace test_typename_in_template_template_parameter
{
template<template<typename> typename X> struct D;
}
namespace test_fallthrough_nodiscard_maybe_unused_attributes
{
int f1()
{
return 42;
}
[[nodiscard]] int f2()
{
[[maybe_unused]] auto unused = f1();
switch (f1())
{
case 17:
f1();
[[fallthrough]];
case 42:
f1();
}
return f1();
}
}
namespace test_extended_aggregate_initialization
{
struct base1
{
int b1, b2 = 42;
};
struct base2
{
base2() {
b3 = 42;
}
int b3;
};
struct derived : base1, base2
{
int d;
};
derived d1 {{1, 2}, {}, 4}; // full initialization
derived d2 {{}, {}, 4}; // value-initialized bases
}
namespace test_general_range_based_for_loop
{
struct iter
{
int i;
int& operator* ()
{
return i;
}
const int& operator* () const
{
return i;
}
iter& operator++()
{
++i;
return *this;
}
};
struct sentinel
{
int i;
};
bool operator== (const iter& i, const sentinel& s)
{
return i.i == s.i;
}
bool operator!= (const iter& i, const sentinel& s)
{
return !(i == s);
}
struct range
{
iter begin() const
{
return {0};
}
sentinel end() const
{
return {5};
}
};
void f()
{
range r {};
for (auto i : r)
{
[[maybe_unused]] auto v = i;
}
}
}
namespace test_lambda_capture_asterisk_this_by_value
{
struct t
{
int i;
int foo()
{
return [*this]()
{
return i;
}();
}
};
}
namespace test_enum_class_construction
{
enum class byte : unsigned char
{};
byte foo {42};
}
namespace test_constexpr_if
{
template <bool cond>
int f ()
{
if constexpr(cond)
{
return 13;
}
else
{
return 42;
}
}
}
namespace test_selection_statement_with_initializer
{
int f()
{
return 13;
}
int f2()
{
if (auto i = f(); i > 0)
{
return 3;
}
switch (auto i = f(); i + 4)
{
case 17:
return 2;
default:
return 1;
}
}
}
#if !defined(REALLY_CLANG)
namespace test_template_argument_deduction_for_class_templates
{
// TODO: test it with clang++ from git
template <typename T1, typename T2>
struct pair
{
pair (T1 p1, T2 p2)
: m1 {p1},
m2 {p2}
{}
T1 m1;
T2 m2;
};
void f()
{
[[maybe_unused]] auto p = pair{13, 42u};
}
}
#endif // !defined(REALLY_CLANG)
namespace test_non_type_auto_template_parameters
{
template <auto n>
struct B
{};
B<5> b1;
B<'a'> b2;
}
#if !defined(REALLY_CLANG)
namespace test_structured_bindings
{
// TODO: test it with clang++ from git
int arr[2] = { 1, 2 };
std::pair<int, int> pr = { 1, 2 };
auto f1() -> int(&)[2]
{
return arr;
}
auto f2() -> std::pair<int, int>&
{
return pr;
}
struct S
{
int x1 : 2;
volatile double y1;
};
S f3()
{
return {};
}
auto [ x1, y1 ] = f1();
auto& [ xr1, yr1 ] = f1();
auto [ x2, y2 ] = f2();
auto& [ xr2, yr2 ] = f2();
const auto [ x3, y3 ] = f3();
}
#endif // !defined(REALLY_CLANG)
#if !defined(REALLY_CLANG)
namespace test_exception_spec_type_system
{
// TODO: test it with clang++ from git
struct Good {};
struct Bad {};
void g1() noexcept;
void g2();
template<typename T>
Bad
f(T*, T*);
template<typename T1, typename T2>
Good
f(T1*, T2*);
static_assert (std::is_same_v<Good, decltype(f(g1, g2))>);
}
#endif // !defined(REALLY_CLANG)
namespace test_inline_variables
{
template<class T> void f(T)
{}
template<class T> inline T g(T)
{
return T{};
}
template<> inline void f<>(int)
{}
template<> int g<>(int)
{
return 5;
}
}
} // namespace cxx17
#endif // __cplusplus <= 201402L
]])


@ -0,0 +1,132 @@
# ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_lapack.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_LAPACK([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
#
# DESCRIPTION
#
# This macro looks for a library that implements the LAPACK linear-algebra
# interface (see http://www.netlib.org/lapack/). On success, it sets the
# LAPACK_LIBS output variable to hold the requisite library linkages.
#
# To link with LAPACK, you should link with:
#
# $LAPACK_LIBS $BLAS_LIBS $LIBS $FLIBS
#
# in that order. BLAS_LIBS is the output variable of the AX_BLAS macro,
# called automatically. FLIBS is the output variable of the
# AC_F77_LIBRARY_LDFLAGS macro (called if necessary by AX_BLAS), and is
# sometimes necessary in order to link with F77 libraries. Users will also
# need to use AC_F77_DUMMY_MAIN (see the autoconf manual), for the same
# reason.
#
# The user may also use --with-lapack=<lib> in order to use some specific
# LAPACK library <lib>. In order to link successfully, however, be aware
# that you will probably need to use the same Fortran compiler (which can
# be set via the F77 env. var.) as was used to compile the LAPACK and BLAS
# libraries.
#
# ACTION-IF-FOUND is a list of shell commands to run if a LAPACK library
# is found, and ACTION-IF-NOT-FOUND is a list of commands to run if it
# is not found. If ACTION-IF-FOUND is not specified, the default action
# will define HAVE_LAPACK.
#
# LICENSE
#
# Copyright (c) 2009 Steven G. Johnson <stevenj@alum.mit.edu>
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program. If not, see <https://www.gnu.org/licenses/>.
#
# As a special exception, the respective Autoconf Macro's copyright owner
# gives unlimited permission to copy, distribute and modify the configure
# scripts that are the output of Autoconf when processing the Macro. You
# need not follow the terms of the GNU General Public License when using
# or distributing such scripts, even though portions of the text of the
# Macro appear in them. The GNU General Public License (GPL) does govern
# all other use of the material that constitutes the Autoconf Macro.
#
# This special exception to the GPL applies to versions of the Autoconf
# Macro released by the Autoconf Archive. When you make and distribute a
# modified version of the Autoconf Macro, you may extend this special
# exception to the GPL to apply to your modified version as well.
#serial 8
AU_ALIAS([ACX_LAPACK], [AX_LAPACK])
AC_DEFUN([AX_LAPACK], [
AC_REQUIRE([AX_BLAS])
ax_lapack_ok=no
AC_ARG_WITH(lapack,
[AS_HELP_STRING([--with-lapack=<lib>], [use LAPACK library <lib>])])
case $with_lapack in
yes | "") ;;
no) ax_lapack_ok=disable ;;
-* | */* | *.a | *.so | *.so.* | *.o) LAPACK_LIBS="$with_lapack" ;;
*) LAPACK_LIBS="-l$with_lapack" ;;
esac
# Get fortran linker name of LAPACK function to check for.
# AC_F77_FUNC(cheev)
cheev=cheev_
# We cannot use LAPACK if BLAS is not found
if test "x$ax_blas_ok" != xyes; then
ax_lapack_ok=noblas
LAPACK_LIBS=""
fi
# First, check LAPACK_LIBS environment variable
if test "x$LAPACK_LIBS" != x; then
save_LIBS="$LIBS"; LIBS="$LAPACK_LIBS $BLAS_LIBS $LIBS $FLIBS"
AC_MSG_CHECKING([for $cheev in $LAPACK_LIBS])
AC_TRY_LINK_FUNC($cheev, [ax_lapack_ok=yes], [LAPACK_LIBS=""])
AC_MSG_RESULT($ax_lapack_ok)
LIBS="$save_LIBS"
if test $ax_lapack_ok = no; then
LAPACK_LIBS=""
fi
fi
# LAPACK linked to by default? (is sometimes included in BLAS lib)
if test $ax_lapack_ok = no; then
save_LIBS="$LIBS"; LIBS="$LIBS $BLAS_LIBS $FLIBS"
AC_CHECK_FUNC($cheev, [ax_lapack_ok=yes])
LIBS="$save_LIBS"
fi
# Generic LAPACK library?
for lapack in lapack lapack_rs6k; do
if test $ax_lapack_ok = no; then
save_LIBS="$LIBS"; LIBS="$BLAS_LIBS $LIBS"
AC_CHECK_LIB($lapack, $cheev,
[ax_lapack_ok=yes; LAPACK_LIBS="-l$lapack"], [], [$FLIBS])
LIBS="$save_LIBS"
fi
done
AC_SUBST(LAPACK_LIBS)
# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
if test x"$ax_lapack_ok" = xyes; then
ifelse([$1],,AC_DEFINE(HAVE_LAPACK,1,[Define if you have LAPACK library.]),[$1])
:
else
ax_lapack_ok=no
$2
fi
])dnl AX_LAPACK


@ -0,0 +1,67 @@
AC_DEFUN([FA_CHECK_CUDA], [
AC_ARG_WITH(cuda,
[AS_HELP_STRING([--with-cuda=<prefix>], [prefix of the CUDA installation])])
AC_ARG_WITH(cuda-arch,
[AS_HELP_STRING([--with-cuda-arch=<gencodes>], [device specific -gencode flags])],
[],
[with_cuda_arch=default])
if test x$with_cuda != xno; then
if test x$with_cuda != x; then
cuda_prefix=$with_cuda
AC_CHECK_PROG(NVCC, [nvcc], [$cuda_prefix/bin/nvcc], [], [$cuda_prefix/bin])
NVCC_CPPFLAGS="-I$cuda_prefix/include"
NVCC_LDFLAGS="-L$cuda_prefix/lib64"
else
AC_CHECK_PROGS(NVCC, [nvcc /usr/local/cuda/bin/nvcc], [])
if test "x$NVCC" == "x/usr/local/cuda/bin/nvcc"; then
cuda_prefix="/usr/local/cuda"
NVCC_CPPFLAGS="-I$cuda_prefix/include"
NVCC_LDFLAGS="-L$cuda_prefix/lib64"
else
cuda_prefix=""
NVCC_CPPFLAGS=""
NVCC_LDFLAGS=""
fi
fi
if test "x$NVCC" == x; then
AC_MSG_ERROR([Couldn't find nvcc])
fi
if test "x$with_cuda_arch" == xdefault; then
with_cuda_arch="-gencode=arch=compute_35,code=compute_35 \\
-gencode=arch=compute_52,code=compute_52 \\
-gencode=arch=compute_60,code=compute_60 \\
-gencode=arch=compute_61,code=compute_61 \\
-gencode=arch=compute_70,code=compute_70 \\
-gencode=arch=compute_75,code=compute_75"
fi
fa_save_CPPFLAGS="$CPPFLAGS"
fa_save_LDFLAGS="$LDFLAGS"
fa_save_LIBS="$LIBS"
CPPFLAGS="$NVCC_CPPFLAGS $CPPFLAGS"
LDFLAGS="$NVCC_LDFLAGS $LDFLAGS"
AC_CHECK_HEADER([cuda.h], [], AC_MSG_FAILURE([Couldn't find cuda.h]))
AC_CHECK_LIB([cublas], [cublasAlloc], [], AC_MSG_FAILURE([Couldn't find libcublas]))
AC_CHECK_LIB([cudart], [cudaSetDevice], [], AC_MSG_FAILURE([Couldn't find libcudart]))
NVCC_LIBS="$LIBS"
NVCC_CPPFLAGS="$CPPFLAGS"
NVCC_LDFLAGS="$LDFLAGS"
CPPFLAGS="$fa_save_CPPFLAGS"
LDFLAGS="$fa_save_LDFLAGS"
LIBS="$fa_save_LIBS"
fi
AC_SUBST(NVCC)
AC_SUBST(NVCC_CPPFLAGS)
AC_SUBST(NVCC_LDFLAGS)
AC_SUBST(NVCC_LIBS)
AC_SUBST(CUDA_PREFIX, $cuda_prefix)
AC_SUBST(CUDA_ARCH, $with_cuda_arch)
])


@ -0,0 +1,20 @@
AC_DEFUN([FA_NUMPY], [
AC_REQUIRE([FA_PYTHON])
AC_MSG_CHECKING([for numpy headers path])
fa_numpy_headers=`$PYTHON -c "import numpy; print(numpy.get_include())"`
if test $? == 0; then
if test x$fa_numpy_headers != x; then
AC_MSG_RESULT($fa_numpy_headers)
AC_SUBST(NUMPY_INCLUDE, $fa_numpy_headers)
else
AC_MSG_RESULT([not found])
AC_MSG_WARN([You won't be able to build the python interface.])
fi
else
AC_MSG_RESULT([not found])
AC_MSG_WARN([You won't be able to build the python interface.])
fi
])dnl


@ -0,0 +1,16 @@
dnl
dnl Check for an nm(1) utility.
dnl
AC_DEFUN([FA_PROG_NM],
[
case "${NM-unset}" in
unset) AC_CHECK_PROGS(NM, nm, nm) ;;
*) AC_CHECK_PROGS(NM, $NM nm, nm) ;;
esac
AC_MSG_CHECKING(nm flags)
case "${NMFLAGS-unset}" in
unset) NMFLAGS= ;;
esac
AC_MSG_RESULT($NMFLAGS)
AC_SUBST(NMFLAGS)
])


@ -0,0 +1,11 @@
AC_DEFUN([FA_PROG_SWIG], [
AC_ARG_WITH(swig,
[AS_HELP_STRING([--with-swig=<bin>], [use SWIG binary <bin>])])
case $with_swig in
"") AC_CHECK_PROG(SWIG, swig, swig);;
*) SWIG="$with_swig"
esac
AC_SUBST(SWIG)
])


@ -0,0 +1,21 @@
AC_DEFUN([FA_PYTHON], [
AC_ARG_WITH(python,
[AS_HELP_STRING([--with-python=<bin>], [use Python binary <bin>])])
case $with_python in
"") PYTHON_BIN=python ;;
*) PYTHON_BIN="$with_python"
esac
AC_CHECK_PROG(PYTHON, $PYTHON_BIN, $PYTHON_BIN)
fa_python_bin=$PYTHON
AC_MSG_CHECKING([for Python C flags])
fa_python_cflags=`$PYTHON -c "
import sysconfig
paths = [['-I' + sysconfig.get_path(p) for p in ['include', 'platinclude']]]
print(' '.join(paths))"`
AC_MSG_RESULT($fa_python_cflags)
AC_SUBST(PYTHON_CFLAGS, "$PYTHON_CFLAGS $fa_python_cflags")
])dnl FA_PYTHON


@ -0,0 +1,338 @@
# Benchmarking scripts
This directory contains benchmarking scripts that can reproduce the
numbers reported in the two papers
```
@inproceedings{DJP16,
Author = {Douze, Matthijs and J{\'e}gou, Herv{\'e} and Perronnin, Florent},
Booktitle = "ECCV",
Organization = {Springer},
Title = {Polysemous codes},
Year = {2016}
}
```
and
```
@article{JDJ17,
Author = {Jeff Johnson and Matthijs Douze and Herv{\'e} J{\'e}gou},
Journal = {arXiv:1702.08734},
Title = {Billion-scale similarity search with GPUs},
Year = {2017},
}
```
Note that the numbers (especially timings) change slightly due to changes in the implementation, different machines, etc.
The scripts are self-contained. They depend only on Faiss and external training data that should be stored in sub-directories.
## SIFT1M experiments
The script [`bench_polysemous_sift1m.py`](bench_polysemous_sift1m.py) reproduces the numbers in
Figure 3 from the "Polysemous" paper.
### Getting SIFT1M
To run it, please download the ANN_SIFT1M dataset from
http://corpus-texmex.irisa.fr/
and unzip it to the subdirectory sift1M.
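The files use the TexMex `.fvecs`/`.ivecs` layout, where every vector is stored as an int32 dimension followed by its components. A minimal sketch of a loader (it mirrors the `fvecs_read` helper that appears in `datasets.py` later in this diff; the path is an assumption about where the archive was unzipped):
```
import numpy as np

def fvecs_read(fname):
    # .fvecs layout: each row is [int32 d | d float32 components]
    a = np.fromfile(fname, dtype='int32')
    d = a[0]
    return a.reshape(-1, d + 1)[:, 1:].copy().view('float32')

xb = fvecs_read("sift1M/sift_base.fvecs")  # assumed unzip location
```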
### Result
The output looks like:
```
PQ training on 100000 points, remains 0 points: training polysemous on centroids
add vectors to index
PQ baseline 7.517 ms per query, R@1 0.4474
Polysemous 64 9.875 ms per query, R@1 0.4474
Polysemous 62 8.358 ms per query, R@1 0.4474
Polysemous 58 5.531 ms per query, R@1 0.4474
Polysemous 54 3.420 ms per query, R@1 0.4478
Polysemous 50 2.182 ms per query, R@1 0.4475
Polysemous 46 1.621 ms per query, R@1 0.4408
Polysemous 42 1.448 ms per query, R@1 0.4174
Polysemous 38 1.331 ms per query, R@1 0.3563
Polysemous 34 1.334 ms per query, R@1 0.2661
Polysemous 30 1.272 ms per query, R@1 0.1794
```
## Experiments on 1B elements dataset
The script [`bench_polysemous_1bn.py`](bench_polysemous_1bn.py) reproduces a few experiments on
two datasets of size 1B from the "Polysemous codes" paper.
### Getting BIGANN
Download the four files of ANN_SIFT1B from
http://corpus-texmex.irisa.fr/ to subdirectory bigann/
### Getting Deep1B
The ground-truth and queries are available here
https://yadi.sk/d/11eDCm7Dsn9GA
For the learning and database vectors, use the script
https://github.com/arbabenko/GNOIMI/blob/master/downloadDeep1B.py
to download the data to subdirectory deep1b/, then concatenate the
database files to base.fvecs and the training files to learn.fvecs
### Running the experiments
These experiments are quite long. To support resuming, the script
stores the result of training to a temporary directory, `/tmp/bench_polysemous`.
The script `bench_polysemous_1bn.py` takes at least two arguments:
- the dataset name: SIFT1000M (aka SIFT1B, aka BIGANN) or Deep1B. SIFT1M, SIFT2M,... are also supported to make subsets for small experiments (note that SIFT1M as a subset of SIFT1B is not the same as the SIFT1M above)
- the type of index to build, which should be a valid [index_factory key](https://github.com/facebookresearch/faiss/wiki/High-level-interface-and-auto-tuning#index-factory) (see the sketch after this list, and the examples below)
- the remaining arguments are parsed as search-time parameters.
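As a minimal sketch of what such a factory key means in code (the dimensionality here is an assumption for SIFT data):
```
import faiss

d = 128  # SIFT vectors are 128-dimensional
# "IMI2x12,PQ16": inverted multi-index coarse quantizer (2x12 bits)
# with 16-byte PQ codes -- the same key used in the command below
index = faiss.index_factory(d, "IMI2x12,PQ16")
```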
### Experiments of Table 2
The `IMI*+PolyD+ADC` results in Table 2 can be reproduced with (for 16 bytes):
```
python bench_polysemous_1bn.py SIFT1000M IMI2x12,PQ16 nprobe=16,max_codes={10000,30000},ht={44..54}
```
Training takes about 2 minutes and adding vectors to the dataset
takes 3.1 h. These operations are multithreaded. Note that in the command
above, we use bash's [brace expansion](https://www.gnu.org/software/bash/manual/html_node/Brace-Expansion.html) to set a grid of parameters.
The search is *not* multithreaded, and the output looks like:
```
R@1 R@10 R@100 time %pass
nprobe=16,max_codes=10000,ht=44 0.1779 0.2994 0.3139 0.194 12.45
nprobe=16,max_codes=10000,ht=45 0.1859 0.3183 0.3339 0.197 14.24
nprobe=16,max_codes=10000,ht=46 0.1930 0.3366 0.3543 0.202 16.22
nprobe=16,max_codes=10000,ht=47 0.1993 0.3550 0.3745 0.209 18.39
nprobe=16,max_codes=10000,ht=48 0.2033 0.3694 0.3917 0.640 20.77
nprobe=16,max_codes=10000,ht=49 0.2070 0.3839 0.4077 0.229 23.36
nprobe=16,max_codes=10000,ht=50 0.2101 0.3949 0.4205 0.232 26.17
nprobe=16,max_codes=10000,ht=51 0.2120 0.4042 0.4310 0.239 29.21
nprobe=16,max_codes=10000,ht=52 0.2134 0.4113 0.4402 0.245 32.47
nprobe=16,max_codes=10000,ht=53 0.2157 0.4184 0.4482 0.250 35.96
nprobe=16,max_codes=10000,ht=54 0.2170 0.4240 0.4546 0.256 39.66
nprobe=16,max_codes=30000,ht=44 0.1882 0.3327 0.3555 0.226 11.29
nprobe=16,max_codes=30000,ht=45 0.1964 0.3525 0.3771 0.231 13.05
nprobe=16,max_codes=30000,ht=46 0.2039 0.3713 0.3987 0.236 15.01
nprobe=16,max_codes=30000,ht=47 0.2103 0.3907 0.4202 0.245 17.19
nprobe=16,max_codes=30000,ht=48 0.2145 0.4055 0.4384 0.251 19.60
nprobe=16,max_codes=30000,ht=49 0.2179 0.4198 0.4550 0.257 22.25
nprobe=16,max_codes=30000,ht=50 0.2208 0.4305 0.4681 0.268 25.15
nprobe=16,max_codes=30000,ht=51 0.2227 0.4402 0.4791 0.275 28.30
nprobe=16,max_codes=30000,ht=52 0.2241 0.4473 0.4884 0.284 31.70
nprobe=16,max_codes=30000,ht=53 0.2265 0.4544 0.4965 0.294 35.34
nprobe=16,max_codes=30000,ht=54 0.2278 0.4601 0.5031 0.303 39.20
```
The result reported in table 2 is the one for which the %pass (percentage of code comparisons that pass the Hamming check) is around 20%, which occurs for Hamming threshold `ht=48`.
The 8-byte results can be reproduced with the factory key `IMI2x12,PQ8`.
### Experiments of the appendix
The experiments in the appendix are only in the ArXiv version of the paper (table 3).
```
python bench_polysemous_1bn.py SIFT1000M OPQ8_64,IMI2x13,PQ8 nprobe={1,2,4,8,16,32,64,128},ht={20,24,26,28,30}
R@1 R@10 R@100 time %pass
nprobe=1,ht=20 0.0351 0.0616 0.0751 0.158 19.01
...
nprobe=32,ht=28 0.1256 0.3563 0.5026 0.561 52.61
...
```
Here again the runs are not exactly the same, but the original result was obtained with `nprobe=32,ht=28`.
For Deep1B, we used a simple version of [auto-tuning](https://github.com/facebookresearch/faiss/wiki/High-level-interface-and-auto-tuning/_edit#auto-tuning-the-runtime-parameters) to sweep through the set of operating points:
```
python bench_polysemous_1bn.py Deep1B OPQ20_80,IMI2x14,PQ20 autotune
...
Done in 4067.555 s, available OPs:
Parameters 1-R@1 time
0.0000 0.000
nprobe=1,ht=22,max_codes=256 0.0215 3.115
nprobe=1,ht=30,max_codes=256 0.0381 3.120
...
nprobe=512,ht=68,max_codes=524288 0.4478 36.903
nprobe=1024,ht=80,max_codes=131072 0.4557 46.363
nprobe=1024,ht=78,max_codes=262144 0.4616 61.939
...
```
The original results were obtained with `nprobe=1024,ht=66,max_codes=262144`.
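The auto-tuning loop is driven by Faiss's `ParameterSpace`; a minimal sketch of the calls involved (the same API is used by `bench_all_ivf.py` later in this diff; `index`, `xq` and `gt` are assumed to be defined):
```
import faiss

# `index` is a built index, `xq` the query matrix, `gt` the ground truth
ps = faiss.ParameterSpace()
ps.initialize(index)
crit = faiss.OneRecallAtRCriterion(xq.shape[0], 1)  # optimize 1-recall@1
crit.set_groundtruth(None, gt.astype('int64'))
op = ps.explore(index, xq, crit)   # sweep the operating points
op.display()
```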
## GPU experiments
The benchmarks below run on 1 or 4 Titan X GPUs and reproduce the results of the "GPU paper". They are also a good starting point on how to use GPU Faiss.
### Search on SIFT1M
See above on how to get SIFT1M into subdirectory sift1M/. The script [`bench_gpu_sift1m.py`](bench_gpu_sift1m.py) reproduces the "exact k-NN time" plot in the ArXiv paper, and the SIFT1M numbers.
The output is:
```
============ Exact search
add vectors to index
warmup
benchmark
k=1 0.715 s, R@1 0.9914
k=2 0.729 s, R@1 0.9935
k=4 0.731 s, R@1 0.9935
k=8 0.732 s, R@1 0.9935
k=16 0.742 s, R@1 0.9935
k=32 0.737 s, R@1 0.9935
k=64 0.753 s, R@1 0.9935
k=128 0.761 s, R@1 0.9935
k=256 0.799 s, R@1 0.9935
k=512 0.975 s, R@1 0.9935
k=1024 1.424 s, R@1 0.9935
============ Approximate search
train
WARNING clustering 100000 points to 4096 centroids: please provide at least 159744 training points
add vectors to index
WARN: increase temp memory to avoid cudaMalloc, or decrease query/add size (alloc 256000000 B, highwater 256000000 B)
warmup
benchmark
nprobe= 1 0.043 s recalls= 0.3909 0.4312 0.4312
nprobe= 2 0.040 s recalls= 0.5041 0.5636 0.5636
nprobe= 4 0.048 s recalls= 0.6048 0.6897 0.6897
nprobe= 8 0.064 s recalls= 0.6879 0.8028 0.8028
nprobe= 16 0.088 s recalls= 0.7534 0.8940 0.8940
nprobe= 32 0.134 s recalls= 0.7957 0.9549 0.9550
nprobe= 64 0.224 s recalls= 0.8125 0.9833 0.9834
nprobe= 128 0.395 s recalls= 0.8205 0.9953 0.9954
nprobe= 256 0.717 s recalls= 0.8227 0.9993 0.9994
nprobe= 512 1.348 s recalls= 0.8228 0.9999 1.0000
```
The run produces two warnings:
- the clustering complains that it does not have enough training data; there is not much we can do about this.
- the add() function complains about an inefficient memory allocation, but this is a concern only when it happens often, and we are not benchmarking the add time anyway.
To index small datasets, it is more efficient to use a `GpuIVFFlat`, which just stores the full vectors in the inverted lists. We did not mention this in the paper because it is not as scalable. To experiment with this setting, change the `index_factory` string from "IVF4096,PQ64" to "IVF16384,Flat". This gives:
```
nprobe= 1 0.025 s recalls= 0.4084 0.4105 0.4105
nprobe= 2 0.033 s recalls= 0.5235 0.5264 0.5264
nprobe= 4 0.033 s recalls= 0.6332 0.6367 0.6367
nprobe= 8 0.040 s recalls= 0.7358 0.7403 0.7403
nprobe= 16 0.049 s recalls= 0.8273 0.8324 0.8324
nprobe= 32 0.068 s recalls= 0.8957 0.9024 0.9024
nprobe= 64 0.104 s recalls= 0.9477 0.9549 0.9549
nprobe= 128 0.174 s recalls= 0.9760 0.9837 0.9837
nprobe= 256 0.299 s recalls= 0.9866 0.9944 0.9944
nprobe= 512 0.527 s recalls= 0.9907 0.9987 0.9987
```
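A minimal sketch of building that flat-IVF index on GPU (assuming a GPU build of Faiss; training and adding data are elided):
```
import faiss

d = 128  # SIFT1M dimensionality
cpu_index = faiss.index_factory(d, "IVF16384,Flat")
# clone to all visible GPUs; train() and add() then run on the GPU(s)
gpu_index = faiss.index_cpu_to_all_gpus(cpu_index)
```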
### Clustering on MNIST8m
To get the "infinite MNIST dataset", follow the instructions on [Léon Bottou's website](http://leon.bottou.org/projects/infimnist). The script assumes the file `mnist8m-patterns-idx3-ubyte` is in subdirectory `mnist8m`.
The script [`kmeans_mnist.py`](kmeans_mnist.py) produces the following output:
```
python kmeans_mnist.py 1 256
...
Clustering 8100000 points in 784D to 256 clusters, redo 1 times, 20 iterations
Preprocessing in 7.94526 s
Iteration 19 (131.697 s, search 114.78 s): objective=1.44881e+13 imbalance=1.05963 nsplit=0
final objective: 1.449e+13
total runtime: 140.615 s
```
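The clustering itself can also be run through the `faiss.Kmeans` convenience wrapper; a toy sketch with random data standing in for MNIST8m:
```
import faiss
import numpy as np

x = np.random.rand(10000, 784).astype('float32')  # stand-in for mnist8m
kmeans = faiss.Kmeans(784, 256, niter=20, verbose=True)
kmeans.train(x)
print(kmeans.obj[-1])  # final clustering objective
```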
### Search on SIFT1B
The script [`bench_gpu_1bn.py`](bench_gpu_1bn.py) runs multi-gpu searches on the two 1-billion vector datasets we considered. It is more complex than the previous scripts, because it supports many search options and decomposes the dataset build process in Python to exploit the best possible CPU/GPU parallelism and GPU distribution.
Even on multiple GPUs, building the 1B datasets can last several hours. It is often a good idea to validate that everything is working fine on smaller datasets like SIFT1M, SIFT2M, etc.
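The core of the GPU distribution is sharding the database across devices; a minimal sketch (assuming a GPU build of Faiss and an already-trained `cpu_index`):
```
import faiss

co = faiss.GpuMultipleClonerOptions()
co.shard = True  # split the database across GPUs instead of replicating it
gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co)
```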
The search results on SIFT1B in the "GPU paper" can be obtained with
<!-- see P57124181 -->
```
python bench_gpu_1bn.py SIFT1000M OPQ8_32,IVF262144,PQ8 -nnn 10 -ngpu 1 -tempmem $[1536*1024*1024]
...
0/10000 (0.024 s) probe=1 : 0.161 s 1-R@1: 0.0752 1-R@10: 0.1924
0/10000 (0.005 s) probe=2 : 0.150 s 1-R@1: 0.0964 1-R@10: 0.2693
0/10000 (0.005 s) probe=4 : 0.153 s 1-R@1: 0.1102 1-R@10: 0.3328
0/10000 (0.005 s) probe=8 : 0.170 s 1-R@1: 0.1220 1-R@10: 0.3827
0/10000 (0.005 s) probe=16 : 0.196 s 1-R@1: 0.1290 1-R@10: 0.4151
0/10000 (0.006 s) probe=32 : 0.244 s 1-R@1: 0.1314 1-R@10: 0.4345
0/10000 (0.006 s) probe=64 : 0.353 s 1-R@1: 0.1332 1-R@10: 0.4461
0/10000 (0.005 s) probe=128: 0.587 s 1-R@1: 0.1341 1-R@10: 0.4502
0/10000 (0.006 s) probe=256: 1.160 s 1-R@1: 0.1342 1-R@10: 0.4511
```
We use the `-tempmem` option to reduce the temporary memory allocation to 1.5G; otherwise the dataset does not fit in GPU memory.
### Search on Deep1B
The same script generates the GPU search results on Deep1B.
```
python bench_gpu_1bn.py Deep1B OPQ20_80,IVF262144,PQ20 -nnn 10 -R 2 -ngpu 4 -altadd -noptables -tempmem $[1024*1024*1024]
...
0/10000 (0.115 s) probe=1 : 0.239 s 1-R@1: 0.2387 1-R@10: 0.3420
0/10000 (0.006 s) probe=2 : 0.103 s 1-R@1: 0.3110 1-R@10: 0.4623
0/10000 (0.005 s) probe=4 : 0.105 s 1-R@1: 0.3772 1-R@10: 0.5862
0/10000 (0.005 s) probe=8 : 0.116 s 1-R@1: 0.4235 1-R@10: 0.6889
0/10000 (0.005 s) probe=16 : 0.133 s 1-R@1: 0.4517 1-R@10: 0.7693
0/10000 (0.005 s) probe=32 : 0.168 s 1-R@1: 0.4713 1-R@10: 0.8281
0/10000 (0.005 s) probe=64 : 0.238 s 1-R@1: 0.4841 1-R@10: 0.8649
0/10000 (0.007 s) probe=128: 0.384 s 1-R@1: 0.4900 1-R@10: 0.8816
0/10000 (0.005 s) probe=256: 0.736 s 1-R@1: 0.4933 1-R@10: 0.8912
```
Here we are a bit tight on memory, so we disable precomputed tables (`-noptables`) and restrict the amount of temporary memory. The `-altadd` option avoids GPU memory overflows during add.
### kNN-graph on Deep1B
The same script generates the kNN-graph on Deep1B. Note that the inverted file from above will not be re-used because the training sets are different. For the kNN-graph, the script first does a pass over the whole dataset to compute the ground-truth kNN for a subset of 10k nodes, for evaluation.
```
python bench_gpu_1bn.py Deep1B OPQ20_80,IVF262144,PQ20 -nnn 10 -altadd -knngraph -R 2 -noptables -tempmem $[1<<30] -ngpu 4
...
CPU index contains 1000000000 vectors, move to GPU
Copy CPU index to 2 sharded GPU indexes
dispatch to GPUs 0:2
IndexShards shard 0 indices 0:500000000
IndexIVFPQ size 500000000 -> GpuIndexIVFPQ indicesOptions=0 usePrecomputed=0 useFloat16=0 reserveVecs=0
IndexShards shard 1 indices 500000000:1000000000
IndexIVFPQ size 500000000 -> GpuIndexIVFPQ indicesOptions=0 usePrecomputed=0 useFloat16=0 reserveVecs=0
dispatch to GPUs 2:4
IndexShards shard 0 indices 0:500000000
IndexIVFPQ size 500000000 -> GpuIndexIVFPQ indicesOptions=0 usePrecomputed=0 useFloat16=0 reserveVecs=0
IndexShards shard 1 indices 500000000:1000000000
IndexIVFPQ size 500000000 -> GpuIndexIVFPQ indicesOptions=0 usePrecomputed=0 useFloat16=0 reserveVecs=0
move to GPU done in 151.535 s
search...
999997440/1000000000 (8389.961 s, 0.3379) probe=1 : 8389.990 s rank-10 intersection results: 0.3379
999997440/1000000000 (9205.934 s, 0.4079) probe=2 : 9205.966 s rank-10 intersection results: 0.4079
999997440/1000000000 (9741.095 s, 0.4722) probe=4 : 9741.128 s rank-10 intersection results: 0.4722
999997440/1000000000 (10830.420 s, 0.5256) probe=8 : 10830.455 s rank-10 intersection results: 0.5256
999997440/1000000000 (12531.716 s, 0.5603) probe=16 : 12531.758 s rank-10 intersection results: 0.5603
999997440/1000000000 (15922.519 s, 0.5825) probe=32 : 15922.571 s rank-10 intersection results: 0.5825
999997440/1000000000 (22774.153 s, 0.5950) probe=64 : 22774.220 s rank-10 intersection results: 0.5950
999997440/1000000000 (36717.207 s, 0.6015) probe=128: 36717.309 s rank-10 intersection results: 0.6015
999997440/1000000000 (70616.392 s, 0.6047) probe=256: 70616.581 s rank-10 intersection results: 0.6047
```
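Conceptually, a kNN-graph is just a search of the database against itself; a toy brute-force sketch (the benchmark uses an IVFPQ index instead of `IndexFlatL2` to make this tractable at 1B scale):
```
import faiss
import numpy as np

xb = np.random.rand(1000, 96).astype('float32')  # Deep1B vectors are 96-d
index = faiss.IndexFlatL2(xb.shape[1])
index.add(xb)
D, I = index.search(xb, 11)  # 11 = self-match + 10 neighbours per point
knn_graph = I[:, 1:]         # drop the self-match in column 0
```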


@ -0,0 +1,20 @@
# Benchmark of IVF variants
This is a benchmark of IVF index variants, looking at compression vs. speed vs. accuracy.
The results are in [this wiki chapter](https://github.com/facebookresearch/faiss/wiki/Indexing-1G-vectors)
The code is organized as:
- `datasets.py`: code to access the datafiles, compute the ground-truth and report accuracies
- `bench_all_ivf.py`: evaluate one type of inverted file
- `run_on_cluster_generic.bash`: call `bench_all_ivf.py` for all tested types of indices.
Since the number of experiments is quite large, the script is structured so that the benchmark can be run on a cluster.
- `parse_bench_all_ivf.py`: make nice tradeoff plots from all the results.
The code depends on Faiss and can use 1 to 8 GPUs to do the k-means clustering for large vocabularies.
It was run in October 2018 for the results in the wiki.


@ -0,0 +1,308 @@
#!/usr/bin/env python2
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import sys
import time
import numpy as np
import faiss
import argparse
import datasets
from datasets import sanitize
######################################################
# Command-line parsing
######################################################
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('dataset options')
aa('--db', default='deep1M', help='dataset')
aa('--compute_gt', default=False, action='store_true',
help='compute and store the groundtruth')
group = parser.add_argument_group('index construction')
aa('--indexkey', default='HNSW32', help='index_factory type')
aa('--efConstruction', default=200, type=int,
help='HNSW construction factor')
aa('--M0', default=-1, type=int, help='size of base level')
aa('--maxtrain', default=256 * 256, type=int,
help='maximum number of training points (0 to set automatically)')
aa('--indexfile', default='', help='file to read or write index from')
aa('--add_bs', default=-1, type=int,
help='add elements index by batches of this size')
aa('--no_precomputed_tables', action='store_true', default=False,
help='disable precomputed tables (uses less memory)')
aa('--clustering_niter', default=-1, type=int,
help='number of clustering iterations (-1 = leave default)')
aa('--train_on_gpu', default=False, action='store_true',
help='do training on GPU')
aa('--get_centroids_from', default='',
help='get the centroids from this index (to speed up training)')
group = parser.add_argument_group('searching')
aa('--k', default=100, type=int, help='nb of nearest neighbors')
aa('--searchthreads', default=-1, type=int,
help='nb of threads to use at search time')
aa('--searchparams', nargs='+', default=['autotune'],
help="search parameters to use (can be autotune or a list of params)")
aa('--n_autotune', default=500, type=int,
help="max nb of autotune experiments")
aa('--autotune_max', default=[], nargs='*',
help='set max value for autotune variables format "var:val" (exclusive)')
aa('--autotune_range', default=[], nargs='*',
help='set complete autotune range, format "var:val1,val2,..."')
aa('--min_test_duration', default=0, type=float,
help='run test at least for so long to avoid jitter')
args = parser.parse_args()
print("args:", args)
os.system('echo -n "nb processors "; '
'cat /proc/cpuinfo | grep ^processor | wc -l; '
'cat /proc/cpuinfo | grep ^"model name" | tail -1')
######################################################
# Load dataset
######################################################
xt, xb, xq, gt = datasets.load_data(
dataset=args.db, compute_gt=args.compute_gt)
print("dataset sizes: train %s base %s query %s GT %s" % (
xt.shape, xb.shape, xq.shape, gt.shape))
nq, d = xq.shape
nb, d = xb.shape
######################################################
# Make index
######################################################
if args.indexfile and os.path.exists(args.indexfile):
print("reading", args.indexfile)
index = faiss.read_index(args.indexfile)
    if isinstance(index, faiss.IndexPreTransform):
        index_ivf = faiss.downcast_index(index.index)
        vec_transform = index.chain.at(0).apply_py
    else:
        index_ivf = index
        vec_transform = lambda x: x
    assert isinstance(index_ivf, faiss.IndexIVF)
else:
print("build index, key=", args.indexkey)
index = faiss.index_factory(d, args.indexkey)
if isinstance(index, faiss.IndexPreTransform):
index_ivf = faiss.downcast_index(index.index)
vec_transform = index.chain.at(0).apply_py
else:
index_ivf = index
        vec_transform = lambda x: x
assert isinstance(index_ivf, faiss.IndexIVF)
index_ivf.verbose = True
index_ivf.quantizer.verbose = True
index_ivf.cp.verbose = True
maxtrain = args.maxtrain
if maxtrain == 0:
if 'IMI' in args.indexkey:
maxtrain = int(256 * 2 ** (np.log2(index_ivf.nlist) / 2))
else:
maxtrain = 50 * index_ivf.nlist
print("setting maxtrain to %d" % maxtrain)
args.maxtrain = maxtrain
xt2 = sanitize(xt[:args.maxtrain])
assert np.all(np.isfinite(xt2))
print("train, size", xt2.shape)
if args.get_centroids_from == '':
if args.clustering_niter >= 0:
print(("setting nb of clustering iterations to %d" %
args.clustering_niter))
index_ivf.cp.niter = args.clustering_niter
if args.train_on_gpu:
print("add a training index on GPU")
train_index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d))
index_ivf.clustering_index = train_index
else:
print("Getting centroids from", args.get_centroids_from)
src_index = faiss.read_index(args.get_centroids_from)
src_quant = faiss.downcast_index(src_index.quantizer)
centroids = faiss.vector_to_array(src_quant.xb)
centroids = centroids.reshape(-1, d)
print(" centroid table shape", centroids.shape)
if isinstance(index, faiss.IndexPreTransform):
print(" training vector transform")
assert index.chain.size() == 1
vt = index.chain.at(0)
vt.train(xt2)
print(" transform centroids")
centroids = vt.apply_py(centroids)
print(" add centroids to quantizer")
index_ivf.quantizer.add(centroids)
del src_index
t0 = time.time()
index.train(xt2)
print(" train in %.3f s" % (time.time() - t0))
print("adding")
t0 = time.time()
if args.add_bs == -1:
index.add(sanitize(xb))
else:
for i0 in range(0, nb, args.add_bs):
i1 = min(nb, i0 + args.add_bs)
print(" adding %d:%d / %d" % (i0, i1, nb))
index.add(sanitize(xb[i0:i1]))
print(" add in %.3f s" % (time.time() - t0))
if args.indexfile:
print("storing", args.indexfile)
faiss.write_index(index, args.indexfile)
if args.no_precomputed_tables:
if isinstance(index_ivf, faiss.IndexIVFPQ):
print("disabling precomputed table")
index_ivf.use_precomputed_table = -1
index_ivf.precomputed_table.clear()
if args.indexfile:
print("index size on disk: ", os.stat(args.indexfile).st_size)
print("current RSS:", faiss.get_mem_usage_kb() * 1024)
precomputed_table_size = 0
if hasattr(index_ivf, 'precomputed_table'):
precomputed_table_size = index_ivf.precomputed_table.size() * 4
print("precomputed tables size:", precomputed_table_size)
#############################################################
# Index is ready
#############################################################
xq = sanitize(xq)
if args.searchthreads != -1:
print("Setting nb of threads to", args.searchthreads)
faiss.omp_set_num_threads(args.searchthreads)
ps = faiss.ParameterSpace()
ps.initialize(index)
parametersets = args.searchparams
header = '%-40s R@1 R@10 R@100 time(ms/q) nb distances #runs' % "parameters"
def eval_setting(index, xq, gt, min_time):
nq = xq.shape[0]
ivf_stats = faiss.cvar.indexIVF_stats
ivf_stats.reset()
nrun = 0
t0 = time.time()
while True:
D, I = index.search(xq, 100)
nrun += 1
t1 = time.time()
if t1 - t0 > min_time:
break
ms_per_query = ((t1 - t0) * 1000.0 / nq / nrun)
for rank in 1, 10, 100:
n_ok = (I[:, :rank] == gt[:, :1]).sum()
print("%.4f" % (n_ok / float(nq)), end=' ')
print(" %8.3f " % ms_per_query, end=' ')
print("%12d " % (ivf_stats.ndis / nrun), end=' ')
print(nrun)
if parametersets == ['autotune']:
ps.n_experiments = args.n_autotune
ps.min_test_duration = args.min_test_duration
for kv in args.autotune_max:
k, vmax = kv.split(':')
vmax = float(vmax)
print("limiting %s to %g" % (k, vmax))
pr = ps.add_range(k)
values = faiss.vector_to_array(pr.values)
values = np.array([v for v in values if v < vmax])
faiss.copy_array_to_vector(values, pr.values)
for kv in args.autotune_range:
k, vals = kv.split(':')
vals = np.fromstring(vals, sep=',')
print("setting %s to %s" % (k, vals))
pr = ps.add_range(k)
faiss.copy_array_to_vector(vals, pr.values)
# setup the Criterion object: optimize for 1-R@1
crit = faiss.OneRecallAtRCriterion(nq, 1)
# by default, the criterion will request only 1 NN
crit.nnn = 100
crit.set_groundtruth(None, gt.astype('int64'))
# then we let Faiss find the optimal parameters by itself
print("exploring operating points")
ps.display()
t0 = time.time()
op = ps.explore(index, xq, crit)
print("Done in %.3f s, available OPs:" % (time.time() - t0))
op.display()
print(header)
opv = op.optimal_pts
for i in range(opv.size()):
opt = opv.at(i)
ps.set_index_parameters(index, opt.key)
print("%-40s " % opt.key, end=' ')
sys.stdout.flush()
eval_setting(index, xq, gt, args.min_test_duration)
else:
print(header)
for param in parametersets:
print("%-40s " % param, end=' ')
sys.stdout.flush()
ps.set_index_parameters(index, param)
eval_setting(index, xq, gt, args.min_test_duration)


@ -0,0 +1,118 @@
#!/usr/bin/env python2
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import print_function
import os
import numpy as np
import faiss
import argparse
import datasets
from datasets import sanitize
######################################################
# Command-line parsing
######################################################
parser = argparse.ArgumentParser()
def aa(*args, **kwargs):
group.add_argument(*args, **kwargs)
group = parser.add_argument_group('dataset options')
aa('--db', default='deep1M', help='dataset')
aa('--nt', default=65536, type=int)
aa('--nb', default=100000, type=int)
aa('--nt_sample', default=0, type=int)
group = parser.add_argument_group('kmeans options')
aa('--k', default=256, type=int)
aa('--seed', default=12345, type=int)
aa('--pcadim', default=-1, type=int, help='PCA to this dimension')
aa('--niter', default=25, type=int)
aa('--eval_freq', default=100, type=int)
args = parser.parse_args()
print("args:", args)
os.system('echo -n "nb processors "; '
'cat /proc/cpuinfo | grep ^processor | wc -l; '
'cat /proc/cpuinfo | grep ^"model name" | tail -1')
ngpu = faiss.get_num_gpus()
print("nb GPUs:", ngpu)
######################################################
# Load dataset
######################################################
xt, xb, xq, gt = datasets.load_data(dataset=args.db)
if args.nt_sample == 0:
xt_pca = xt[args.nt:args.nt + 10000]
xt = xt[:args.nt]
else:
xt_pca = xt[args.nt_sample:args.nt_sample + 10000]
rs = np.random.RandomState(args.seed)
idx = rs.choice(args.nt_sample, size=args.nt, replace=False)
xt = xt[idx]
xb = xb[:args.nb]
d = xb.shape[1]
if args.pcadim != -1:
print("training PCA: %d -> %d" % (d, args.pcadim))
pca = faiss.PCAMatrix(d, args.pcadim)
pca.train(sanitize(xt_pca))
xt = pca.apply_py(sanitize(xt))
xb = pca.apply_py(sanitize(xb))
d = xb.shape[1]
######################################################
# Run clustering
######################################################
index = faiss.IndexFlatL2(d)
if ngpu > 0:
print("moving index to GPU")
index = faiss.index_cpu_to_all_gpus(index)
clustering = faiss.Clustering(d, args.k)
clustering.verbose = True
clustering.seed = args.seed
clustering.max_points_per_centroid = 10**6
clustering.min_points_per_centroid = 1
for iter0 in range(0, args.niter, args.eval_freq):
iter1 = min(args.niter, iter0 + args.eval_freq)
clustering.niter = iter1 - iter0
if iter0 > 0:
faiss.copy_array_to_vector(centroids.ravel(), clustering.centroids)
clustering.train(sanitize(xt), index)
index.reset()
centroids = faiss.vector_to_array(clustering.centroids).reshape(args.k, d)
index.add(centroids)
_, I = index.search(sanitize(xb), 1)
error = ((xb - centroids[I.ravel()]) ** 2).sum()
print("iter1=%d quantization error on test: %.4f" % (iter1, error))


@ -0,0 +1,234 @@
#!/usr/bin/env python2
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Common functions to load datasets and compute their ground-truth
"""
from __future__ import print_function
import time
import numpy as np
import faiss
import sys
# set this to the directory that contains the datafiles.
# deep1b data should be at simdir + 'deep1b'
# bigann data should be at simdir + 'bigann'
simdir = '/mnt/vol/gfsai-east/ai-group/datasets/simsearch/'
#################################################################
# Small I/O functions
#################################################################
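# The TexMex .ivecs/.fvecs formats store each vector as an int32
# dimension d followed by d int32/float32 components (hence the d + 1
# stride below); .bvecs stores an int32 d followed by d uint8 values
# (hence the d + 4 byte stride in bvecs_mmap).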
def ivecs_read(fname):
a = np.fromfile(fname, dtype='int32')
d = a[0]
return a.reshape(-1, d + 1)[:, 1:].copy()
def fvecs_read(fname):
return ivecs_read(fname).view('float32')
def ivecs_mmap(fname):
a = np.memmap(fname, dtype='int32', mode='r')
d = a[0]
return a.reshape(-1, d + 1)[:, 1:]
def fvecs_mmap(fname):
return ivecs_mmap(fname).view('float32')
def bvecs_mmap(fname):
x = np.memmap(fname, dtype='uint8', mode='r')
d = x[:4].view('int32')[0]
return x.reshape(-1, d + 4)[:, 4:]
def ivecs_write(fname, m):
n, d = m.shape
m1 = np.empty((n, d + 1), dtype='int32')
m1[:, 0] = d
m1[:, 1:] = m
m1.tofile(fname)
def fvecs_write(fname, m):
m = m.astype('float32')
ivecs_write(fname, m.view('int32'))
#################################################################
# Dataset
#################################################################
def sanitize(x):
return np.ascontiguousarray(x, dtype='float32')
class ResultHeap:
""" Combine query results from a sliced dataset """
def __init__(self, nq, k):
" nq: number of query vectors, k: number of results per query "
self.I = np.zeros((nq, k), dtype='int64')
self.D = np.zeros((nq, k), dtype='float32')
self.nq, self.k = nq, k
heaps = faiss.float_maxheap_array_t()
heaps.k = k
heaps.nh = nq
heaps.val = faiss.swig_ptr(self.D)
heaps.ids = faiss.swig_ptr(self.I)
heaps.heapify()
self.heaps = heaps
def add_batch_result(self, D, I, i0):
assert D.shape == (self.nq, self.k)
assert I.shape == (self.nq, self.k)
I += i0
self.heaps.addn_with_ids(
self.k, faiss.swig_ptr(D),
faiss.swig_ptr(I), self.k)
def finalize(self):
self.heaps.reorder()
def compute_GT_sliced(xb, xq, k):
print("compute GT")
t0 = time.time()
nb, d = xb.shape
nq, d = xq.shape
rh = ResultHeap(nq, k)
bs = 10 ** 5
xqs = sanitize(xq)
db_gt = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d))
# compute ground-truth by blocks of bs, and add to heaps
for i0 in range(0, nb, bs):
i1 = min(nb, i0 + bs)
xsl = sanitize(xb[i0:i1])
db_gt.add(xsl)
D, I = db_gt.search(xqs, k)
rh.add_batch_result(D, I, i0)
db_gt.reset()
print("\r %d/%d, %.3f s" % (i0, nb, time.time() - t0), end=' ')
sys.stdout.flush()
print()
rh.finalize()
gt_I = rh.I
print("GT time: %.3f s" % (time.time() - t0))
return gt_I
def do_compute_gt(xb, xq, k):
print("computing GT")
nb, d = xb.shape
index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d))
if nb < 100 * 1000:
print(" add")
index.add(np.ascontiguousarray(xb, dtype='float32'))
print(" search")
D, I = index.search(np.ascontiguousarray(xq, dtype='float32'), k)
else:
I = compute_GT_sliced(xb, xq, k)
return I.astype('int32')
def load_data(dataset='deep1M', compute_gt=False):
print("load data", dataset)
if dataset == 'sift1M':
basedir = simdir + 'sift1M/'
xt = fvecs_read(basedir + "sift_learn.fvecs")
xb = fvecs_read(basedir + "sift_base.fvecs")
xq = fvecs_read(basedir + "sift_query.fvecs")
gt = ivecs_read(basedir + "sift_groundtruth.ivecs")
elif dataset.startswith('bigann'):
basedir = simdir + 'bigann/'
dbsize = 1000 if dataset == "bigann1B" else int(dataset[6:-1])
xb = bvecs_mmap(basedir + 'bigann_base.bvecs')
xq = bvecs_mmap(basedir + 'bigann_query.bvecs')
xt = bvecs_mmap(basedir + 'bigann_learn.bvecs')
# trim xb to correct size
xb = xb[:dbsize * 1000 * 1000]
gt = ivecs_read(basedir + 'gnd/idx_%dM.ivecs' % dbsize)
elif dataset.startswith("deep"):
basedir = simdir + 'deep1b/'
szsuf = dataset[4:]
if szsuf[-1] == 'M':
dbsize = 10 ** 6 * int(szsuf[:-1])
elif szsuf == '1B':
dbsize = 10 ** 9
elif szsuf[-1] == 'k':
dbsize = 1000 * int(szsuf[:-1])
else:
assert False, "did not recognize suffix " + szsuf
xt = fvecs_mmap(basedir + "learn.fvecs")
xb = fvecs_mmap(basedir + "base.fvecs")
xq = fvecs_read(basedir + "deep1B_queries.fvecs")
xb = xb[:dbsize]
gt_fname = basedir + "%s_groundtruth.ivecs" % dataset
if compute_gt:
gt = do_compute_gt(xb, xq, 100)
print("store", gt_fname)
ivecs_write(gt_fname, gt)
gt = ivecs_read(gt_fname)
else:
assert False
print("dataset %s sizes: B %s Q %s T %s" % (
dataset, xb.shape, xq.shape, xt.shape))
return xt, xb, xq, gt
#################################################################
# Evaluation
#################################################################
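# R@rank below is the fraction of queries whose true nearest neighbor
# (gt[:, 0]) appears among the first `rank` results returned.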
def evaluate_DI(D, I, gt):
nq = gt.shape[0]
k = I.shape[1]
rank = 1
while rank <= k:
recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq)
print("R@%d: %.4f" % (rank, recall), end=' ')
rank *= 10
def evaluate(xq, gt, index, k=100, endl=True):
t0 = time.time()
D, I = index.search(xq, k)
t1 = time.time()
nq = xq.shape[0]
print("\t %8.4f ms per query, " % (
(t1 - t0) * 1000.0 / nq), end=' ')
rank = 1
while rank <= k:
recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq)
print("R@%d: %.4f" % (rank, recall), end=' ')
rank *= 10
if endl:
print()
return D, I


@ -0,0 +1,268 @@
#!/usr/bin/env python2
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import numpy as np
from matplotlib import pyplot
import re
from argparse import Namespace
# the directory used in run_on_cluster.bash
basedir = '/mnt/vol/gfsai-east/ai-group/users/matthijs/bench_all_ivf/'
logdir = basedir + 'logs/'
# which plot to output
db = 'bigann1B'
code_size = 8
def unitsize(indexkey):
""" size of one vector in the index """
mo = re.match('.*,PQ(\\d+)', indexkey)
if mo:
return int(mo.group(1))
if indexkey.endswith('SQ8'):
bits_per_d = 8
elif indexkey.endswith('SQ4'):
bits_per_d = 4
elif indexkey.endswith('SQfp16'):
bits_per_d = 16
else:
assert False
mo = re.match('PCAR(\\d+),.*', indexkey)
if mo:
return bits_per_d * int(mo.group(1)) / 8
mo = re.match('OPQ\\d+_(\\d+),.*', indexkey)
if mo:
return bits_per_d * int(mo.group(1)) / 8
mo = re.match('RR(\\d+),.*', indexkey)
if mo:
return bits_per_d * int(mo.group(1)) / 8
assert False
def dbsize_from_name(dbname):
sufs = {
'1B': 10**9,
'100M': 10**8,
'10M': 10**7,
'1M': 10**6,
}
for s in sufs:
if dbname.endswith(s):
return sufs[s]
else:
assert False
def keep_latest_stdout(fnames):
fnames = [fname for fname in fnames if fname.endswith('.stdout')]
fnames.sort()
n = len(fnames)
fnames2 = []
for i, fname in enumerate(fnames):
if i + 1 < n and fnames[i + 1][:-8] == fname[:-8]:
continue
fnames2.append(fname)
return fnames2
def parse_result_file(fname):
# print fname
st = 0
res = []
keys = []
stats = {}
stats['run_version'] = fname[-8]
for l in open(fname):
if st == 0:
if l.startswith('CHRONOS_JOB_INSTANCE_ID'):
stats['CHRONOS_JOB_INSTANCE_ID'] = l.split()[-1]
if l.startswith('index size on disk:'):
stats['index_size'] = int(l.split()[-1])
if l.startswith('current RSS:'):
stats['RSS'] = int(l.split()[-1])
if l.startswith('precomputed tables size:'):
stats['tables_size'] = int(l.split()[-1])
if l.startswith('Setting nb of threads to'):
stats['n_threads'] = int(l.split()[-1])
if l.startswith(' add in'):
stats['add_time'] = float(l.split()[-2])
if l.startswith('args:'):
args = eval(l[l.find(' '):])
indexkey = args.indexkey
elif 'R@1 R@10 R@100' in l:
st = 1
elif 'index size on disk:' in l:
index_size = int(l.split()[-1])
elif st == 1:
st = 2
elif st == 2:
fi = l.split()
keys.append(fi[0])
res.append([float(x) for x in fi[1:]])
return indexkey, np.array(res), keys, stats
# run parsing
allres = {}
allstats = {}
nts = []
missing = []
versions = {}
fnames = keep_latest_stdout(os.listdir(logdir))
# print fnames
# filenames are in the form <key>.x.stdout
# where x is a version number (from a to z)
# keep only latest version of each name
for fname in fnames:
if not ('db' + db in fname and fname.endswith('.stdout')):
continue
indexkey, res, _, stats = parse_result_file(logdir + fname)
if res.size == 0:
missing.append(fname)
errorline = open(
logdir + fname.replace('.stdout', '.stderr')).readlines()
if len(errorline) > 0:
errorline = errorline[-1]
else:
errorline = 'NO STDERR'
print fname, stats['CHRONOS_JOB_INSTANCE_ID'], errorline
else:
if indexkey in allres:
if allstats[indexkey]['run_version'] > stats['run_version']:
# don't use this run
continue
n_threads = stats.get('n_threads', 1)
nts.append(n_threads)
allres[indexkey] = res
allstats[indexkey] = stats
assert len(set(nts)) == 1
n_threads = nts[0]
def plot_tradeoffs(allres, code_size, recall_rank):
dbsize = dbsize_from_name(db)
recall_idx = int(np.log10(recall_rank))
bigtab = []
names = []
for k,v in sorted(allres.items()):
if v.ndim != 2: continue
us = unitsize(k)
if us != code_size: continue
perf = v[:, recall_idx]
times = v[:, 3]
bigtab.append(
np.vstack((
np.ones(times.size, dtype=int) * len(names),
perf, times
))
)
names.append(k)
bigtab = np.hstack(bigtab)
perm = np.argsort(bigtab[1, :])
bigtab = bigtab[:, perm]
times = np.minimum.accumulate(bigtab[2, ::-1])[::-1]
selection = np.where(bigtab[2, :] == times)
selected_methods = [names[i] for i in
np.unique(bigtab[0, selection].astype(int))]
not_selected = list(set(names) - set(selected_methods))
print "methods without an optimal OP: ", not_selected
nq = 10000
pyplot.title('database ' + db + ' code_size=%d' % code_size)
# grayed out lines
for k in not_selected:
v = allres[k]
if v.ndim != 2: continue
us = unitsize(k)
if us != code_size: continue
linestyle = (':' if 'PQ' in k else
'-.' if 'SQ4' in k else
'--' if 'SQ8' in k else '-')
pyplot.semilogy(v[:, recall_idx], v[:, 3], label=None,
linestyle=linestyle,
marker='o' if 'HNSW' in k else '+',
color='#cccccc', linewidth=0.2)
# important methods
for k in selected_methods:
v = allres[k]
if v.ndim != 2: continue
us = unitsize(k)
if us != code_size: continue
stats = allstats[k]
tot_size = stats['index_size'] + stats['tables_size']
id_size = 8 # 64 bit
addt = ''
if 'add_time' in stats:
add_time = stats['add_time']
if add_time > 7200:
add_min = add_time / 60
addt = ', %dh%02d' % (add_min / 60, add_min % 60)
else:
add_sec = int(add_time)
addt = ', %dm%02d' % (add_sec / 60, add_sec % 60)
label = k + ' (size+%.1f%%%s)' % (
tot_size / float((code_size + id_size) * dbsize) * 100 - 100,
addt)
linestyle = (':' if 'PQ' in k else
'-.' if 'SQ4' in k else
'--' if 'SQ8' in k else '-')
pyplot.semilogy(v[:, recall_idx], v[:, 3], label=label,
linestyle=linestyle,
marker='o' if 'HNSW' in k else '+')
if len(not_selected) == 0:
om = ''
else:
om = '\nomitted:'
nc = len(om)
for m in not_selected:
if nc > 80:
om += '\n'
nc = 0
om += ' ' + m
nc += len(m) + 1
pyplot.xlabel('1-recall at %d %s' % (recall_rank, om) )
pyplot.ylabel('search time per query (ms, %d threads)' % n_threads)
pyplot.legend()
pyplot.grid()
pyplot.savefig('figs/tradeoffs_%s_cs%d_r%d.png' % (
db, code_size, recall_rank))
return selected_methods, not_selected
pyplot.gcf().set_size_inches(15, 10)
plot_tradeoffs(allres, code_size=code_size, recall_rank=1)
