From 84c7701126a78cb2de15c91e141d1693282c5636 Mon Sep 17 00:00:00 2001
From: Xiaohai Xu
Date: Fri, 3 Jul 2020 19:27:11 +0800
Subject: [PATCH] #2689 Construct Knowhere Index Without Data (#2704)

* Offset for IVFFlat
Signed-off-by: sahuang
* modify segment reader
Signed-off-by: shengjun.li
* add index_flat_nm
Signed-off-by: shengjun.li
* fix compilation issue
Signed-off-by: sahuang
* fix segment reader
Signed-off-by: shengjun.li
* Fix issue
Signed-off-by: sahuang
* fix hnsw_nm
Signed-off-by: shengjun.li
* fix nsg
Signed-off-by: shengjun.li
* fix bug
Signed-off-by: sahuang
* NSG
Signed-off-by: sahuang
* Fix NSG issue
Signed-off-by: sahuang
* client test
Signed-off-by: sahuang
* fix index size
Signed-off-by: shengjun.li
* remove unnecessary things
Signed-off-by: sahuang
* add changelog
Signed-off-by: sahuang
* Fix clang format
Signed-off-by: sahuang
* add changelog
Signed-off-by: sahuang
* Fix compile error
Signed-off-by: sahuang
* Fix compile error
Signed-off-by: sahuang
* Fix compile error
Signed-off-by: sahuang
* Fix compile error
Signed-off-by: sahuang
* Fix issues
Signed-off-by: sahuang
* Change data to shared_ptr
Signed-off-by: sahuang
* fix hnsw and nsg ut
Signed-off-by: sahuang
* clang-format
Signed-off-by: sahuang
* not to insert cache immediately
Signed-off-by: shengjun.li
* Fix macro issue
Signed-off-by: sahuang

Co-authored-by: shengjun.li
---
 CHANGELOG.md | 1 +
 core/src/codecs/VectorIndexFormat.h | 5 +
 core/src/codecs/VectorsFormat.h | 4 +
 .../default/DefaultVectorIndexFormat.cpp | 25 +-
 .../codecs/default/DefaultVectorIndexFormat.h | 6 +-
 .../codecs/default/DefaultVectorsFormat.cpp | 52 +
 .../src/codecs/default/DefaultVectorsFormat.h | 7 +
 core/src/db/engine/ExecutionEngineImpl.cpp | 12 +-
 core/src/index/knowhere/CMakeLists.txt | 24 +-
 .../knowhere/index/vector_index/IndexNSG.cpp | 4 +-
 .../knowhere/index/vector_index/VecIndex.h | 2 +
 .../index/vector_index/VecIndexFactory.cpp | 14 +-
 .../index/vector_index/helpers/Cloner.cpp | 3 +
 .../index/vector_index/impl/nsg/NSG.cpp | 79 +-
 .../index/vector_index/impl/nsg/NSG.h | 35 +-
 .../index/vector_index/impl/nsg/NSGIO.cpp | 6 +-
 .../vector_offset_index/IndexHNSW_NM.cpp | 190 +++
 .../index/vector_offset_index/IndexHNSW_NM.h | 66 +
 .../index/vector_offset_index/IndexIVF_NM.cpp | 339 +++++
 .../index/vector_offset_index/IndexIVF_NM.h | 104 ++
 .../index/vector_offset_index/IndexNSG_NM.cpp | 163 +++
 .../index/vector_offset_index/IndexNSG_NM.h | 80 ++
 .../vector_offset_index/OffsetBaseIndex.cpp | 56 +
 .../vector_offset_index/OffsetBaseIndex.h | 45 +
 .../gpu/IndexGPUIVF_NM.cpp | 182 +++
 .../vector_offset_index/gpu/IndexGPUIVF_NM.h | 63 +
 core/src/index/thirdparty/faiss/AutoTune.h | 2 +-
 core/src/index/thirdparty/faiss/Index.cpp | 9 +
 core/src/index/thirdparty/faiss/Index.h | 13 +
 core/src/index/thirdparty/faiss/IndexIVF.cpp | 240 ++++
 core/src/index/thirdparty/faiss/IndexIVF.h | 24 +
 .../index/thirdparty/faiss/IndexIVFFlat.cpp | 34 +
 .../src/index/thirdparty/faiss/IndexIVFFlat.h | 3 +
 .../index/thirdparty/faiss/InvertedLists.cpp | 99 +-
 .../index/thirdparty/faiss/InvertedLists.h | 28 +
 .../index/thirdparty/faiss/gpu/GpuCloner.cpp | 59 +
 .../index/thirdparty/faiss/gpu/GpuCloner.h | 13 +
 .../thirdparty/faiss/gpu/GpuIndexIVFFlat.cu | 576 ++++----
 .../thirdparty/faiss/gpu/GpuIndexIVFFlat.h | 4 +
 .../thirdparty/faiss/impl/index_read.cpp | 131 +-
 .../thirdparty/faiss/impl/index_write.cpp | 98 ++
 core/src/index/thirdparty/faiss/index_io.h | 11 +
 core/src/index/thirdparty/hnswlib/hnswalg.h | 9 +
 .../src/index/thirdparty/hnswlib/hnswalg_nm.h | 1206
 +++++++++++++++++
 core/src/index/thirdparty/hnswlib/hnswlib.h | 5 +
 .../src/index/thirdparty/hnswlib/hnswlib_nm.h | 98 ++
 .../thirdparty/hnswlib/visited_list_pool.h | 1 +
 core/src/index/unittest/CMakeLists.txt | 23 +-
 core/src/index/unittest/Helper.h | 14 +
 core/src/index/unittest/test_hnsw.cpp | 38 +-
 core/src/index/unittest/test_ivf.cpp | 4 +-
 core/src/index/unittest/test_ivf_cpu_nm.cpp | 106 ++
 core/src/index/unittest/test_ivf_gpu_nm.cpp | 114 ++
 core/src/index/unittest/test_nsg.cpp | 82 +-
 core/src/scheduler/task/BuildIndexTask.cpp | 7 +-
 core/src/segment/SegmentReader.cpp | 16 +
 core/src/segment/SegmentReader.h | 3 +
 core/unittest/db/utils.cpp | 2 +-
 sdk/examples/simple/src/ClientTest.cpp | 2 +-
 sdk/examples/simple/src/ClientTest.h | 2 +-
 sdk/examples/utils/Utils.cpp | 2 +-
 sdk/examples/utils/Utils.h | 2 +-
 62 files changed, 4254 insertions(+), 393 deletions(-)
 create mode 100644 core/src/index/knowhere/knowhere/index/vector_offset_index/IndexHNSW_NM.cpp
 create mode 100644 core/src/index/knowhere/knowhere/index/vector_offset_index/IndexHNSW_NM.h
 create mode 100644 core/src/index/knowhere/knowhere/index/vector_offset_index/IndexIVF_NM.cpp
 create mode 100644 core/src/index/knowhere/knowhere/index/vector_offset_index/IndexIVF_NM.h
 create mode 100644 core/src/index/knowhere/knowhere/index/vector_offset_index/IndexNSG_NM.cpp
 create mode 100644 core/src/index/knowhere/knowhere/index/vector_offset_index/IndexNSG_NM.h
 create mode 100644 core/src/index/knowhere/knowhere/index/vector_offset_index/OffsetBaseIndex.cpp
 create mode 100644 core/src/index/knowhere/knowhere/index/vector_offset_index/OffsetBaseIndex.h
 create mode 100644 core/src/index/knowhere/knowhere/index/vector_offset_index/gpu/IndexGPUIVF_NM.cpp
 create mode 100644 core/src/index/knowhere/knowhere/index/vector_offset_index/gpu/IndexGPUIVF_NM.h
 create mode 100644 core/src/index/thirdparty/hnswlib/hnswalg_nm.h
 create mode 100644 core/src/index/thirdparty/hnswlib/hnswlib_nm.h
 create mode 100644 core/src/index/unittest/test_ivf_cpu_nm.cpp
 create mode 100644 core/src/index/unittest/test_ivf_gpu_nm.cpp

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b0237a55fa..afe7c631a7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,7 @@ Please mark all changes in change log and use the issue from GitHub
 - \#2509 Count up query statistics for debug ease
 - \#2572 Support structured data index
 - \#2585 Support IVF_PQ on GPU with using metric_type IP
+- \#2689 Construct Knowhere Index Without Data

 ## Improvement
 - \#2543 Remove secondary_path related code
diff --git a/core/src/codecs/VectorIndexFormat.h b/core/src/codecs/VectorIndexFormat.h
index d25fcd712c..5b0795f622 100644
--- a/core/src/codecs/VectorIndexFormat.h
+++ b/core/src/codecs/VectorIndexFormat.h
@@ -34,6 +34,11 @@ class VectorIndexFormat {
     virtual void
     write(const storage::FSHandlerPtr& fs_ptr, const std::string& location,
           const segment::VectorIndexPtr& vector_index) = 0;
+
+    virtual void
+    read(const storage::FSHandlerPtr& fs_ptr, const std::string& location, knowhere::BinaryPtr raw_data,
+         segment::VectorIndexPtr& vector_index) {
+    }
 };

 using VectorIndexFormatPtr = std::shared_ptr;
diff --git a/core/src/codecs/VectorsFormat.h b/core/src/codecs/VectorsFormat.h
index 5227f9a6a0..150e9309d6 100644
--- a/core/src/codecs/VectorsFormat.h
+++ b/core/src/codecs/VectorsFormat.h
@@ -20,6 +20,7 @@
 #include
 #include
+#include "index/knowhere/knowhere/common/BinarySet.h"
 #include "segment/Vectors.h"
 #include "storage/FSHandler.h"

@@ -37,6 +38,9 @@ class VectorsFormat {
     virtual void
read_uids(const storage::FSHandlerPtr& fs_ptr, std::vector& uids) = 0; + virtual void + read_vectors(const storage::FSHandlerPtr& fs_ptr, knowhere::BinaryPtr& raw_vectors) = 0; + virtual void read_vectors(const storage::FSHandlerPtr& fs_ptr, off_t offset, size_t num_bytes, std::vector& raw_vectors) = 0; diff --git a/core/src/codecs/default/DefaultVectorIndexFormat.cpp b/core/src/codecs/default/DefaultVectorIndexFormat.cpp index d543192dc4..dd35126651 100644 --- a/core/src/codecs/default/DefaultVectorIndexFormat.cpp +++ b/core/src/codecs/default/DefaultVectorIndexFormat.cpp @@ -31,7 +31,8 @@ namespace milvus { namespace codec { knowhere::VecIndexPtr -DefaultVectorIndexFormat::read_internal(const storage::FSHandlerPtr& fs_ptr, const std::string& path) { +DefaultVectorIndexFormat::read_internal(const storage::FSHandlerPtr& fs_ptr, const std::string& path, + knowhere::BinaryPtr raw_data) { milvus::TimeRecorder recorder("read_index"); knowhere::BinarySet load_data_list; @@ -91,6 +92,12 @@ DefaultVectorIndexFormat::read_internal(const storage::FSHandlerPtr& fs_ptr, con auto index = vec_index_factory.CreateVecIndex(knowhere::OldIndexTypeToStr(current_type), knowhere::IndexMode::MODE_CPU); if (index != nullptr) { + if (raw_data != nullptr) { + LOG_ENGINE_DEBUG_ << "load index with row data " << raw_data->size; + load_data_list.Append(RAW_DATA, raw_data); + length += raw_data->size; + } + index->Load(load_data_list); index->SetIndexSize(length); } else { @@ -116,6 +123,22 @@ DefaultVectorIndexFormat::read(const storage::FSHandlerPtr& fs_ptr, const std::s vector_index->SetVectorIndex(index); } +void +DefaultVectorIndexFormat::read(const storage::FSHandlerPtr& fs_ptr, const std::string& location, + knowhere::BinaryPtr raw_data, segment::VectorIndexPtr& vector_index) { + const std::lock_guard lock(mutex_); + + std::string dir_path = fs_ptr->operation_ptr_->GetDirectory(); + if (!boost::filesystem::is_directory(dir_path)) { + std::string err_msg = "Directory: " + dir_path + "does not exist"; + LOG_ENGINE_ERROR_ << err_msg; + throw Exception(SERVER_INVALID_ARGUMENT, err_msg); + } + + knowhere::VecIndexPtr index = read_internal(fs_ptr, location, raw_data); + vector_index->SetVectorIndex(index); +} + void DefaultVectorIndexFormat::write(const storage::FSHandlerPtr& fs_ptr, const std::string& location, const segment::VectorIndexPtr& vector_index) { diff --git a/core/src/codecs/default/DefaultVectorIndexFormat.h b/core/src/codecs/default/DefaultVectorIndexFormat.h index 945ff31f45..412b5c3e70 100644 --- a/core/src/codecs/default/DefaultVectorIndexFormat.h +++ b/core/src/codecs/default/DefaultVectorIndexFormat.h @@ -33,6 +33,10 @@ class DefaultVectorIndexFormat : public VectorIndexFormat { read(const storage::FSHandlerPtr& fs_ptr, const std::string& location, segment::VectorIndexPtr& vector_index) override; + void + read(const storage::FSHandlerPtr& fs_ptr, const std::string& location, knowhere::BinaryPtr raw_data, + segment::VectorIndexPtr& vector_index) override; + void write(const storage::FSHandlerPtr& fs_ptr, const std::string& location, const segment::VectorIndexPtr& vector_index) override; @@ -48,7 +52,7 @@ class DefaultVectorIndexFormat : public VectorIndexFormat { private: knowhere::VecIndexPtr - read_internal(const storage::FSHandlerPtr& fs_ptr, const std::string& path); + read_internal(const storage::FSHandlerPtr& fs_ptr, const std::string& path, knowhere::BinaryPtr raw_data = nullptr); private: std::mutex mutex_; diff --git a/core/src/codecs/default/DefaultVectorsFormat.cpp 
b/core/src/codecs/default/DefaultVectorsFormat.cpp index 9e6e50d2f0..3f2bb7b76d 100644 --- a/core/src/codecs/default/DefaultVectorsFormat.cpp +++ b/core/src/codecs/default/DefaultVectorsFormat.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -53,6 +54,30 @@ DefaultVectorsFormat::read_vectors_internal(const storage::FSHandlerPtr& fs_ptr, fs_ptr->reader_ptr_->close(); } +void +DefaultVectorsFormat::read_vectors_internal(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, + knowhere::BinaryPtr& raw_vectors) { + if (!fs_ptr->reader_ptr_->open(file_path.c_str())) { + std::string err_msg = "Failed to open file: " + file_path + ", error: " + std::strerror(errno); + LOG_ENGINE_ERROR_ << err_msg; + throw Exception(SERVER_CANNOT_OPEN_FILE, err_msg); + } + + size_t num_bytes; + fs_ptr->reader_ptr_->read(&num_bytes, sizeof(size_t)); + + raw_vectors = std::make_shared(); + raw_vectors->size = num_bytes; + raw_vectors->data = std::shared_ptr(new uint8_t[num_bytes]); + + // Beginning of file is num_bytes + fs_ptr->reader_ptr_->seekg(sizeof(size_t)); + + fs_ptr->reader_ptr_->read(raw_vectors->data.get(), num_bytes); + + fs_ptr->reader_ptr_->close(); +} + void DefaultVectorsFormat::read_uids_internal(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, std::vector& uids) { @@ -157,6 +182,32 @@ DefaultVectorsFormat::read_uids(const storage::FSHandlerPtr& fs_ptr, std::vector const auto& path = it->path(); if (path.extension().string() == user_id_extension_) { read_uids_internal(fs_ptr, path.string(), uids); + break; + } + } +} + +void +DefaultVectorsFormat::read_vectors(const storage::FSHandlerPtr& fs_ptr, knowhere::BinaryPtr& raw_vectors) { + const std::lock_guard lock(mutex_); + + std::string dir_path = fs_ptr->operation_ptr_->GetDirectory(); + if (!boost::filesystem::is_directory(dir_path)) { + std::string err_msg = "Directory: " + dir_path + "does not exist"; + LOG_ENGINE_ERROR_ << err_msg; + throw Exception(SERVER_INVALID_ARGUMENT, err_msg); + } + + boost::filesystem::path target_path(dir_path); + typedef boost::filesystem::directory_iterator d_it; + d_it it_end; + d_it it(target_path); + // for (auto& it : boost::filesystem::directory_iterator(dir_path)) { + for (; it != it_end; ++it) { + const auto& path = it->path(); + if (path.extension().string() == raw_vector_extension_) { + read_vectors_internal(fs_ptr, path.string(), raw_vectors); + break; } } } @@ -182,6 +233,7 @@ DefaultVectorsFormat::read_vectors(const storage::FSHandlerPtr& fs_ptr, off_t of const auto& path = it->path(); if (path.extension().string() == raw_vector_extension_) { read_vectors_internal(fs_ptr, path.string(), offset, num_bytes, raw_vectors); + break; } } } diff --git a/core/src/codecs/default/DefaultVectorsFormat.h b/core/src/codecs/default/DefaultVectorsFormat.h index ac5fc89a5a..28589f6f5f 100644 --- a/core/src/codecs/default/DefaultVectorsFormat.h +++ b/core/src/codecs/default/DefaultVectorsFormat.h @@ -40,6 +40,9 @@ class DefaultVectorsFormat : public VectorsFormat { void read_uids(const storage::FSHandlerPtr& fs_ptr, std::vector& uids) override; + void + read_vectors(const storage::FSHandlerPtr& fs_ptr, knowhere::BinaryPtr& raw_vectors) override; + void read_vectors(const storage::FSHandlerPtr& fs_ptr, off_t offset, size_t num_bytes, std::vector& raw_vectors) override; @@ -58,6 +61,10 @@ class DefaultVectorsFormat : public VectorsFormat { read_vectors_internal(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, off_t offset, size_t num, std::vector& 
raw_vectors); + void + read_vectors_internal(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, + knowhere::BinaryPtr& raw_vectors); + void read_uids_internal(const storage::FSHandlerPtr& fs_ptr, const std::string& file_path, std::vector& uids); diff --git a/core/src/db/engine/ExecutionEngineImpl.cpp b/core/src/db/engine/ExecutionEngineImpl.cpp index 9df036ae8a..6c0d4e0543 100644 --- a/core/src/db/engine/ExecutionEngineImpl.cpp +++ b/core/src/db/engine/ExecutionEngineImpl.cpp @@ -92,6 +92,11 @@ IsBinaryIndexType(knowhere::IndexType type) { return type == knowhere::IndexEnum::INDEX_FAISS_BIN_IDMAP || type == knowhere::IndexEnum::INDEX_FAISS_BIN_IVFFLAT; } +bool +IndexSupportOffset(EngineType type) { + return type == EngineType::FAISS_IVFFLAT || type == EngineType::HNSW || type == EngineType::NSG_MIX; +} + } // namespace #ifdef MILVUS_GPU_VERSION @@ -448,7 +453,12 @@ ExecutionEngineImpl::Load(bool to_cache) { try { segment::SegmentPtr segment_ptr; segment_reader_ptr->GetSegment(segment_ptr); - auto status = segment_reader_ptr->LoadVectorIndex(location_, segment_ptr->vector_index_ptr_); + if (IndexSupportOffset(index_type_)) { + auto status = + segment_reader_ptr->LoadVectorIndexWithRawData(location_, segment_ptr->vector_index_ptr_); + } else { + auto status = segment_reader_ptr->LoadVectorIndex(location_, segment_ptr->vector_index_ptr_); + } index_ = segment_ptr->vector_index_ptr_->GetVectorIndex(); if (index_ == nullptr) { diff --git a/core/src/index/knowhere/CMakeLists.txt b/core/src/index/knowhere/CMakeLists.txt index 282dd076a0..11c5492ef5 100644 --- a/core/src/index/knowhere/CMakeLists.txt +++ b/core/src/index/knowhere/CMakeLists.txt @@ -42,7 +42,7 @@ set(external_srcs knowhere/common/Timer.cpp ) -set(index_srcs +set(vector_index_srcs knowhere/index/vector_index/adapter/VectorAdapter.cpp knowhere/index/vector_index/helpers/FaissIO.cpp knowhere/index/vector_index/helpers/IndexParameter.cpp @@ -56,23 +56,28 @@ set(index_srcs knowhere/index/vector_index/FaissBaseIndex.cpp knowhere/index/vector_index/IndexBinaryIDMAP.cpp knowhere/index/vector_index/IndexBinaryIVF.cpp - knowhere/index/vector_index/IndexHNSW.cpp knowhere/index/vector_index/IndexIDMAP.cpp knowhere/index/vector_index/IndexIVF.cpp knowhere/index/vector_index/IndexIVFPQ.cpp knowhere/index/vector_index/IndexIVFSQ.cpp - knowhere/index/vector_index/IndexNSG.cpp knowhere/index/vector_index/IndexType.cpp knowhere/index/vector_index/VecIndexFactory.cpp knowhere/index/vector_index/IndexAnnoy.cpp ) +set(vector_offset_index_srcs + knowhere/index/vector_offset_index/OffsetBaseIndex.cpp + knowhere/index/vector_offset_index/IndexIVF_NM.cpp + knowhere/index/vector_offset_index/IndexHNSW_NM.cpp + knowhere/index/vector_offset_index/IndexNSG_NM.cpp + ) + if (MILVUS_SUPPORT_SPTAG) - set(index_srcs + set(vector_index_srcs knowhere/index/vector_index/adapter/SptagAdapter.cpp knowhere/index/vector_index/helpers/SPTAGParameterMgr.cpp knowhere/index/vector_index/IndexSPTAG.cpp - ${index_srcs} + ${vector_index_srcs} ) endif () @@ -117,7 +122,7 @@ if (MILVUS_GPU_VERSION) ${cuda_lib} ) - set(index_srcs ${index_srcs} + set(vector_index_srcs ${vector_index_srcs} knowhere/index/vector_index/gpu/IndexGPUIDMAP.cpp knowhere/index/vector_index/gpu/IndexGPUIVF.cpp knowhere/index/vector_index/gpu/IndexGPUIVFPQ.cpp @@ -126,13 +131,18 @@ if (MILVUS_GPU_VERSION) knowhere/index/vector_index/helpers/Cloner.cpp knowhere/index/vector_index/helpers/FaissGpuResourceMgr.cpp ) + + set(vector_offset_index_srcs ${vector_offset_index_srcs} + 
knowhere/index/vector_offset_index/gpu/IndexGPUIVF_NM.cpp + ) endif () if (NOT TARGET knowhere) add_library( knowhere STATIC ${external_srcs} - ${index_srcs} + ${vector_index_srcs} + ${vector_offset_index_srcs} ) endif () diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.cpp b/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.cpp index 301306b645..1a55701307 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.cpp +++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexNSG.cpp @@ -94,8 +94,8 @@ NSG::Query(const DatasetPtr& dataset_ptr, const Config& config) { s_params.k = config[meta::TOPK]; { std::lock_guard lk(mutex_); - index_->Search((float*)p_data, rows, dim, config[meta::TOPK].get(), p_dist, p_id, s_params, - blacklist); + index_->Search((float*)p_data, nullptr, rows, dim, config[meta::TOPK].get(), p_dist, p_id, + s_params, blacklist); } auto ret_ds = std::make_shared(); diff --git a/core/src/index/knowhere/knowhere/index/vector_index/VecIndex.h b/core/src/index/knowhere/knowhere/index/vector_index/VecIndex.h index 9f49166f5e..5dbfe9853b 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/VecIndex.h +++ b/core/src/index/knowhere/knowhere/index/vector_index/VecIndex.h @@ -25,6 +25,8 @@ namespace milvus { namespace knowhere { +#define RAW_DATA "RAW_DATA" + class VecIndex : public Index { public: virtual void diff --git a/core/src/index/knowhere/knowhere/index/vector_index/VecIndexFactory.cpp b/core/src/index/knowhere/knowhere/index/vector_index/VecIndexFactory.cpp index ae4400660e..7e967a260b 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/VecIndexFactory.cpp +++ b/core/src/index/knowhere/knowhere/index/vector_index/VecIndexFactory.cpp @@ -16,12 +16,13 @@ #include "knowhere/index/vector_index/IndexAnnoy.h" #include "knowhere/index/vector_index/IndexBinaryIDMAP.h" #include "knowhere/index/vector_index/IndexBinaryIVF.h" -#include "knowhere/index/vector_index/IndexHNSW.h" #include "knowhere/index/vector_index/IndexIDMAP.h" #include "knowhere/index/vector_index/IndexIVF.h" #include "knowhere/index/vector_index/IndexIVFPQ.h" #include "knowhere/index/vector_index/IndexIVFSQ.h" -#include "knowhere/index/vector_index/IndexNSG.h" +#include "knowhere/index/vector_offset_index/IndexHNSW_NM.h" +#include "knowhere/index/vector_offset_index/IndexIVF_NM.h" +#include "knowhere/index/vector_offset_index/IndexNSG_NM.h" #ifdef MILVUS_SUPPORT_SPTAG #include "knowhere/index/vector_index/IndexSPTAG.h" #endif @@ -34,6 +35,7 @@ #include "knowhere/index/vector_index/gpu/IndexGPUIVFSQ.h" #include "knowhere/index/vector_index/gpu/IndexIVFSQHybrid.h" #include "knowhere/index/vector_index/helpers/Cloner.h" +#include "knowhere/index/vector_offset_index/gpu/IndexGPUIVF_NM.h" #endif namespace milvus { @@ -47,10 +49,10 @@ VecIndexFactory::CreateVecIndex(const IndexType& type, const IndexMode mode) { } else if (type == IndexEnum::INDEX_FAISS_IVFFLAT) { #ifdef MILVUS_GPU_VERSION if (mode == IndexMode::MODE_GPU) { - return std::make_shared(gpu_device); + return std::make_shared(gpu_device); } #endif - return std::make_shared(); + return std::make_shared(); } else if (type == IndexEnum::INDEX_FAISS_IVFPQ) { #ifdef MILVUS_GPU_VERSION if (mode == IndexMode::MODE_GPU) { @@ -74,7 +76,7 @@ VecIndexFactory::CreateVecIndex(const IndexType& type, const IndexMode mode) { } else if (type == IndexEnum::INDEX_FAISS_BIN_IVFFLAT) { return std::make_shared(); } else if (type == IndexEnum::INDEX_NSG) { - return std::make_shared(-1); + return 
std::make_shared(-1); #ifdef MILVUS_SUPPORT_SPTAG } else if (type == IndexEnum::INDEX_SPTAG_KDT_RNT) { return std::make_shared("KDT"); @@ -82,7 +84,7 @@ VecIndexFactory::CreateVecIndex(const IndexType& type, const IndexMode mode) { return std::make_shared("BKT"); #endif } else if (type == IndexEnum::INDEX_HNSW) { - return std::make_shared(); + return std::make_shared(); } else if (type == IndexEnum::INDEX_ANNOY) { return std::make_shared(); } else { diff --git a/core/src/index/knowhere/knowhere/index/vector_index/helpers/Cloner.cpp b/core/src/index/knowhere/knowhere/index/vector_index/helpers/Cloner.cpp index 2ba189c513..cb1f6f306f 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/helpers/Cloner.cpp +++ b/core/src/index/knowhere/knowhere/index/vector_index/helpers/Cloner.cpp @@ -19,6 +19,7 @@ #include "knowhere/index/vector_index/gpu/GPUIndex.h" #include "knowhere/index/vector_index/gpu/IndexGPUIVF.h" #include "knowhere/index/vector_index/gpu/IndexIVFSQHybrid.h" +#include "knowhere/index/vector_offset_index/IndexIVF_NM.h" namespace milvus { namespace knowhere { @@ -50,6 +51,8 @@ CopyCpuToGpu(const VecIndexPtr& index, const int64_t device_id, const Config& co VecIndexPtr result; if (auto device_index = std::dynamic_pointer_cast(index)) { result = device_index->CopyCpuToGpu(device_id, config); + } else if (auto cpu_index = std::dynamic_pointer_cast(index)) { + result = cpu_index->CopyCpuToGpu(device_id, config); } else if (auto device_index = std::dynamic_pointer_cast(index)) { result = device_index->CopyGpuToGpu(device_id, config); } else if (auto cpu_index = std::dynamic_pointer_cast(index)) { diff --git a/core/src/index/knowhere/knowhere/index/vector_index/impl/nsg/NSG.cpp b/core/src/index/knowhere/knowhere/index/vector_index/impl/nsg/NSG.cpp index dbc4cd5bc5..5f92589c4b 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/impl/nsg/NSG.cpp +++ b/core/src/index/knowhere/knowhere/index/vector_index/impl/nsg/NSG.cpp @@ -41,17 +41,17 @@ NsgIndex::NsgIndex(const size_t& dimension, const size_t& n, std::string metric) } NsgIndex::~NsgIndex() { - delete[] ori_data_; + // delete[] ori_data_; delete[] ids_; delete distance_; } void -NsgIndex::Build_with_ids(size_t nb, const float* data, const int64_t* ids, const BuildParams& parameters) { +NsgIndex::Build_with_ids(size_t nb, float* data, const int64_t* ids, const BuildParams& parameters) { ntotal = nb; - ori_data_ = new float[ntotal * dimension]; + // ori_data_ = new float[ntotal * dimension]; ids_ = new int64_t[ntotal]; - memcpy((void*)ori_data_, (void*)data, sizeof(float) * ntotal * dimension); + // memcpy((void*)ori_data_, (void*)data, sizeof(float) * ntotal * dimension); memcpy((void*)ids_, (void*)ids, sizeof(int64_t) * ntotal); search_length = parameters.search_length; @@ -59,13 +59,13 @@ NsgIndex::Build_with_ids(size_t nb, const float* data, const int64_t* ids, const candidate_pool_size = parameters.candidate_pool_size; TimeRecorder rc("NSG", 1); - InitNavigationPoint(); + InitNavigationPoint(data); rc.RecordSection("init"); - Link(); + Link(data); rc.RecordSection("Link"); - CheckConnectivity(); + CheckConnectivity(data); rc.RecordSection("Connect"); rc.ElapseFromBegin("finish"); @@ -89,14 +89,14 @@ NsgIndex::Build_with_ids(size_t nb, const float* data, const int64_t* ids, const } void -NsgIndex::InitNavigationPoint() { +NsgIndex::InitNavigationPoint(float* data) { // calculate the center of vectors auto center = new float[dimension]; memset(center, 0, sizeof(float) * dimension); for (size_t i = 0; i < ntotal; 
i++) { for (size_t j = 0; j < dimension; j++) { - center[j] += ori_data_[i * dimension + j]; + center[j] += data[i * dimension + j]; } } for (size_t j = 0; j < dimension; j++) { @@ -106,7 +106,7 @@ NsgIndex::InitNavigationPoint() { // select navigation point std::vector resset; navigation_point = rand_r(&seed) % ntotal; // random initialize navigating point - GetNeighbors(center, resset, knng); + GetNeighbors(center, data, resset, knng); navigation_point = resset[0].id; // Debug code @@ -124,7 +124,7 @@ NsgIndex::InitNavigationPoint() { // Specify Link void -NsgIndex::GetNeighbors(const float* query, std::vector& resset, std::vector& fullset, +NsgIndex::GetNeighbors(const float* query, float* data, std::vector& resset, std::vector& fullset, boost::dynamic_bitset<>& has_calculated_dist) { auto& graph = knng; size_t buffer_size = search_length; @@ -174,7 +174,7 @@ NsgIndex::GetNeighbors(const float* query, std::vector& resset, std::v continue; } - float dist = distance_->Compare(ori_data_ + dimension * id, query, dimension); + float dist = distance_->Compare(data + dimension * id, query, dimension); resset[i] = Neighbor(id, dist, false); //// difference from other GetNeighbors @@ -199,7 +199,7 @@ NsgIndex::GetNeighbors(const float* query, std::vector& resset, std::v continue; has_calculated_dist[id] = true; - float dist = distance_->Compare(query, ori_data_ + dimension * id, dimension); + float dist = distance_->Compare(query, data + dimension * id, dimension); Neighbor nn(id, dist, false); fullset.push_back(nn); @@ -226,7 +226,7 @@ NsgIndex::GetNeighbors(const float* query, std::vector& resset, std::v // FindUnconnectedNode void -NsgIndex::GetNeighbors(const float* query, std::vector& resset, std::vector& fullset) { +NsgIndex::GetNeighbors(const float* query, float* data, std::vector& resset, std::vector& fullset) { auto& graph = nsg; size_t buffer_size = search_length; @@ -276,7 +276,7 @@ NsgIndex::GetNeighbors(const float* query, std::vector& resset, std::v continue; } - float dist = distance_->Compare(ori_data_ + id * dimension, query, dimension); + float dist = distance_->Compare(data + id * dimension, query, dimension); resset[i] = Neighbor(id, dist, false); } std::sort(resset.begin(), resset.end()); // sort by distance @@ -297,7 +297,7 @@ NsgIndex::GetNeighbors(const float* query, std::vector& resset, std::v continue; has_calculated_dist[id] = true; - float dist = distance_->Compare(ori_data_ + dimension * id, query, dimension); + float dist = distance_->Compare(data + dimension * id, query, dimension); Neighbor nn(id, dist, false); fullset.push_back(nn); @@ -323,7 +323,8 @@ NsgIndex::GetNeighbors(const float* query, std::vector& resset, std::v } void -NsgIndex::GetNeighbors(const float* query, std::vector& resset, Graph& graph, SearchParams* params) { +NsgIndex::GetNeighbors(const float* query, float* data, std::vector& resset, Graph& graph, + SearchParams* params) { size_t buffer_size = params ? 
params->search_length : search_length; if (buffer_size > ntotal) { @@ -367,7 +368,7 @@ NsgIndex::GetNeighbors(const float* query, std::vector& resset, Graph& KNOWHERE_THROW_MSG("Build Index Error, id > ntotal"); } - float dist = distance_->Compare(ori_data_ + id * dimension, query, dimension); + float dist = distance_->Compare(data + id * dimension, query, dimension); resset[i] = Neighbor(id, dist, false); } std::sort(resset.begin(), resset.end()); // sort by distance @@ -388,7 +389,7 @@ NsgIndex::GetNeighbors(const float* query, std::vector& resset, Graph& continue; has_calculated_dist[id] = true; - float dist = distance_->Compare(query, ori_data_ + dimension * id, dimension); + float dist = distance_->Compare(query, data + dimension * id, dimension); if (dist >= resset[buffer_size - 1].distance) continue; @@ -422,7 +423,7 @@ NsgIndex::GetNeighbors(const float* query, std::vector& resset, Graph& } void -NsgIndex::Link() { +NsgIndex::Link(float* data) { float* cut_graph_dist = new float[ntotal * out_degree]; nsg.resize(ntotal); @@ -437,8 +438,8 @@ NsgIndex::Link() { fullset.clear(); temp.clear(); flags.reset(); - GetNeighbors(ori_data_ + dimension * n, temp, fullset, flags); - SyncPrune(n, fullset, flags, cut_graph_dist); + GetNeighbors(data + dimension * n, data, temp, fullset, flags); + SyncPrune(data, n, fullset, flags, cut_graph_dist); } // Debug code @@ -464,20 +465,20 @@ NsgIndex::Link() { #pragma omp for schedule(dynamic, 100) for (unsigned n = 0; n < ntotal; ++n) { faiss::BuilderSuspend::check_wait(); - InterInsert(n, mutex_vec, cut_graph_dist); + InterInsert(data, n, mutex_vec, cut_graph_dist); } delete[] cut_graph_dist; } void -NsgIndex::SyncPrune(size_t n, std::vector& pool, boost::dynamic_bitset<>& has_calculated, +NsgIndex::SyncPrune(float* data, size_t n, std::vector& pool, boost::dynamic_bitset<>& has_calculated, float* cut_graph_dist) { // avoid lose nearest neighbor in knng for (size_t i = 0; i < knng[n].size(); ++i) { auto id = knng[n][i]; if (has_calculated[id]) continue; - float dist = distance_->Compare(ori_data_ + dimension * n, ori_data_ + dimension * id, dimension); + float dist = distance_->Compare(data + dimension * n, data + dimension * id, dimension); pool.emplace_back(Neighbor(id, dist, true)); } @@ -490,7 +491,7 @@ NsgIndex::SyncPrune(size_t n, std::vector& pool, boost::dynamic_bitset } result.push_back(pool[cursor]); // init result with nearest neighbor - SelectEdge(cursor, pool, result, true); + SelectEdge(data, cursor, pool, result, true); // filling the cut_graph auto& des_id_pool = nsg[n]; @@ -507,7 +508,7 @@ NsgIndex::SyncPrune(size_t n, std::vector& pool, boost::dynamic_bitset //>> Optimize: remove read-lock void -NsgIndex::InterInsert(unsigned n, std::vector& mutex_vec, float* cut_graph_dist) { +NsgIndex::InterInsert(float* data, unsigned n, std::vector& mutex_vec, float* cut_graph_dist) { auto& current = n; auto& neighbor_id_pool = nsg[current]; @@ -555,7 +556,7 @@ NsgIndex::InterInsert(unsigned n, std::vector& mutex_vec, float* cut std::sort(wait_for_link_pool.begin(), wait_for_link_pool.end()); result.push_back(wait_for_link_pool[start]); - SelectEdge(start, wait_for_link_pool, result); + SelectEdge(data, start, wait_for_link_pool, result); { LockGuard lk(mutex_vec[current_neighbor]); @@ -580,7 +581,8 @@ NsgIndex::InterInsert(unsigned n, std::vector& mutex_vec, float* cut } void -NsgIndex::SelectEdge(unsigned& cursor, std::vector& sort_pool, std::vector& result, bool limit) { +NsgIndex::SelectEdge(float* data, unsigned& cursor, std::vector& 
sort_pool, std::vector& result, + bool limit) { auto& pool = sort_pool; /* @@ -594,8 +596,7 @@ NsgIndex::SelectEdge(unsigned& cursor, std::vector& sort_pool, std::ve auto& p = pool[cursor]; bool should_link = true; for (size_t t = 0; t < result.size(); ++t) { - float dist = - distance_->Compare(ori_data_ + dimension * result[t].id, ori_data_ + dimension * p.id, dimension); + float dist = distance_->Compare(data + dimension * result[t].id, data + dimension * p.id, dimension); if (dist < p.distance) { should_link = false; @@ -608,7 +609,7 @@ NsgIndex::SelectEdge(unsigned& cursor, std::vector& sort_pool, std::ve } void -NsgIndex::CheckConnectivity() { +NsgIndex::CheckConnectivity(float* data) { auto root = navigation_point; boost::dynamic_bitset<> has_linked{ntotal, 0}; int64_t linked_count = 0; @@ -619,7 +620,7 @@ NsgIndex::CheckConnectivity() { if (linked_count >= static_cast(ntotal)) { break; } - FindUnconnectedNode(has_linked, root); + FindUnconnectedNode(data, has_linked, root); } } @@ -657,7 +658,7 @@ NsgIndex::DFS(size_t root, boost::dynamic_bitset<>& has_linked, int64_t& linked_ } void -NsgIndex::FindUnconnectedNode(boost::dynamic_bitset<>& has_linked, int64_t& root) { +NsgIndex::FindUnconnectedNode(float* data, boost::dynamic_bitset<>& has_linked, int64_t& root) { // find any of unlinked-node size_t id = ntotal; for (size_t i = 0; i < ntotal; i++) { // find not link @@ -672,7 +673,7 @@ NsgIndex::FindUnconnectedNode(boost::dynamic_bitset<>& has_linked, int64_t& root // search unlinked-node's neighbor std::vector tmp, pool; - GetNeighbors(ori_data_ + dimension * id, tmp, pool); + GetNeighbors(data + dimension * id, data, tmp, pool); std::sort(pool.begin(), pool.end()); size_t found = 0; @@ -831,18 +832,18 @@ NsgIndex::FindUnconnectedNode(boost::dynamic_bitset<>& has_linked, int64_t& root // } void -NsgIndex::Search(const float* query, const unsigned& nq, const unsigned& dim, const unsigned& k, float* dist, - int64_t* ids, SearchParams& params, faiss::ConcurrentBitsetPtr bitset) { +NsgIndex::Search(const float* query, float* data, const unsigned& nq, const unsigned& dim, const unsigned& k, + float* dist, int64_t* ids, SearchParams& params, faiss::ConcurrentBitsetPtr bitset) { std::vector> resset(nq); TimeRecorder rc("NsgIndex::search", 1); if (nq == 1) { - GetNeighbors(query, resset[0], nsg, ¶ms); + GetNeighbors(query, data, resset[0], nsg, ¶ms); } else { #pragma omp parallel for for (unsigned int i = 0; i < nq; ++i) { const float* single_query = query + i * dim; - GetNeighbors(single_query, resset[i], nsg, ¶ms); + GetNeighbors(single_query, data, resset[i], nsg, ¶ms); } } rc.RecordSection("search"); diff --git a/core/src/index/knowhere/knowhere/index/vector_index/impl/nsg/NSG.h b/core/src/index/knowhere/knowhere/index/vector_index/impl/nsg/NSG.h index 603af1417d..fc5deed322 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/impl/nsg/NSG.h +++ b/core/src/index/knowhere/knowhere/index/vector_index/impl/nsg/NSG.h @@ -48,7 +48,7 @@ class NsgIndex { std::string metric_type; // L2 | IP Distance* distance_; - float* ori_data_; + // float* ori_data_; int64_t* ids_; Graph nsg; // final graph Graph knng; // reset after build @@ -74,12 +74,12 @@ class NsgIndex { void SetKnnGraph(Graph& knng); - virtual void - Build_with_ids(size_t nb, const float* data, const int64_t* ids, const BuildParams& parameters); + void + Build_with_ids(size_t nb, float* data, const int64_t* ids, const BuildParams& parameters); void - Search(const float* query, const unsigned& nq, const unsigned& dim, const 
unsigned& k, float* dist, int64_t* ids, - SearchParams& params, faiss::ConcurrentBitsetPtr bitset = nullptr); + Search(const float* query, float* data, const unsigned& nq, const unsigned& dim, const unsigned& k, float* dist, + int64_t* ids, SearchParams& params, faiss::ConcurrentBitsetPtr bitset = nullptr); // Not support yet. // virtual void Add() = 0; @@ -95,46 +95,49 @@ class NsgIndex { // const BuildParam ¶meters); protected: - virtual void - InitNavigationPoint(); + void + InitNavigationPoint(float* data); // link specify void - GetNeighbors(const float* query, std::vector& resset, std::vector& fullset, + GetNeighbors(const float* query, float* data, std::vector& resset, std::vector& fullset, boost::dynamic_bitset<>& has_calculated_dist); // FindUnconnectedNode void - GetNeighbors(const float* query, std::vector& resset, std::vector& fullset); + GetNeighbors(const float* query, float* data, std::vector& resset, std::vector& fullset); // navigation-point void - GetNeighbors(const float* query, std::vector& resset, Graph& graph, SearchParams* param = nullptr); + GetNeighbors(const float* query, float* data, std::vector& resset, Graph& graph, + SearchParams* param = nullptr); // only for search // void // GetNeighbors(const float* query, node_t* I, float* D, SearchParams* params); void - Link(); + Link(float* data); void - SyncPrune(size_t q, std::vector& pool, boost::dynamic_bitset<>& has_calculated, float* cut_graph_dist); + SyncPrune(float* data, size_t q, std::vector& pool, boost::dynamic_bitset<>& has_calculated, + float* cut_graph_dist); void - SelectEdge(unsigned& cursor, std::vector& sort_pool, std::vector& result, bool limit = false); + SelectEdge(float* data, unsigned& cursor, std::vector& sort_pool, std::vector& result, + bool limit = false); void - InterInsert(unsigned n, std::vector& mutex_vec, float* dist); + InterInsert(float* data, unsigned n, std::vector& mutex_vec, float* dist); void - CheckConnectivity(); + CheckConnectivity(float* data); void DFS(size_t root, boost::dynamic_bitset<>& flags, int64_t& count); void - FindUnconnectedNode(boost::dynamic_bitset<>& flags, int64_t& root); + FindUnconnectedNode(float* data, boost::dynamic_bitset<>& flags, int64_t& root); }; } // namespace impl diff --git a/core/src/index/knowhere/knowhere/index/vector_index/impl/nsg/NSGIO.cpp b/core/src/index/knowhere/knowhere/index/vector_index/impl/nsg/NSGIO.cpp index 880cca71ea..31b32295b1 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/impl/nsg/NSGIO.cpp +++ b/core/src/index/knowhere/knowhere/index/vector_index/impl/nsg/NSGIO.cpp @@ -22,7 +22,7 @@ write_index(NsgIndex* index, MemoryIOWriter& writer) { writer(&index->ntotal, sizeof(index->ntotal), 1); writer(&index->dimension, sizeof(index->dimension), 1); writer(&index->navigation_point, sizeof(index->navigation_point), 1); - writer(index->ori_data_, sizeof(float) * index->ntotal * index->dimension, 1); + // writer(index->ori_data_, sizeof(float) * index->ntotal * index->dimension, 1); writer(index->ids_, sizeof(int64_t) * index->ntotal, 1); for (unsigned i = 0; i < index->ntotal; ++i) { @@ -41,9 +41,9 @@ read_index(MemoryIOReader& reader) { auto index = new NsgIndex(dimension, ntotal); reader(&index->navigation_point, sizeof(index->navigation_point), 1); - index->ori_data_ = new float[index->ntotal * index->dimension]; + // index->ori_data_ = new float[index->ntotal * index->dimension]; index->ids_ = new int64_t[index->ntotal]; - reader(index->ori_data_, sizeof(float) * index->ntotal * index->dimension, 1); + // 
reader(index->ori_data_, sizeof(float) * index->ntotal * index->dimension, 1); reader(index->ids_, sizeof(int64_t) * index->ntotal, 1); index->nsg.reserve(index->ntotal); diff --git a/core/src/index/knowhere/knowhere/index/vector_offset_index/IndexHNSW_NM.cpp b/core/src/index/knowhere/knowhere/index/vector_offset_index/IndexHNSW_NM.cpp new file mode 100644 index 0000000000..294e3b8a86 --- /dev/null +++ b/core/src/index/knowhere/knowhere/index/vector_offset_index/IndexHNSW_NM.cpp @@ -0,0 +1,190 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under the License. + +#include "knowhere/index/vector_offset_index/IndexHNSW_NM.h" + +#include +#include +#include +#include +#include + +#include "faiss/BuilderSuspend.h" +#include "hnswlib/space_ip.h" +#include "hnswlib/space_l2.h" +#include "knowhere/common/Exception.h" +#include "knowhere/common/Log.h" +#include "knowhere/index/vector_index/adapter/VectorAdapter.h" +#include "knowhere/index/vector_index/helpers/FaissIO.h" + +namespace milvus { +namespace knowhere { + +// void +// normalize_vector(float* data, float* norm_array, size_t dim) { +// float norm = 0.0f; +// for (int i = 0; i < dim; i++) norm += data[i] * data[i]; +// norm = 1.0f / (sqrtf(norm) + 1e-30f); +// for (int i = 0; i < dim; i++) norm_array[i] = data[i] * norm; +// } + +BinarySet +IndexHNSW_NM::Serialize(const Config& config) { + if (!index_) { + KNOWHERE_THROW_MSG("index not initialize or trained"); + } + + try { + MemoryIOWriter writer; + index_->saveIndex(writer); + std::shared_ptr data(writer.data_); + + BinarySet res_set; + res_set.Append("HNSW", data, writer.rp); + return res_set; + } catch (std::exception& e) { + KNOWHERE_THROW_MSG(e.what()); + } +} + +void +IndexHNSW_NM::Load(const BinarySet& index_binary) { + try { + auto binary = index_binary.GetByName("HNSW"); + + MemoryIOReader reader; + reader.total = binary->size; + reader.data_ = binary->data.get(); + + hnswlib::SpaceInterface* space; + index_ = std::make_shared>(space); + index_->loadIndex(reader); + + normalize = (index_->metric_type_ == 1); // 1 == InnerProduct + + data_ = index_binary.GetByName(RAW_DATA)->data; + } catch (std::exception& e) { + KNOWHERE_THROW_MSG(e.what()); + } +} + +void +IndexHNSW_NM::Train(const DatasetPtr& dataset_ptr, const Config& config) { + try { + GETTENSOR(dataset_ptr) + + hnswlib::SpaceInterface* space; + if (config[Metric::TYPE] == Metric::L2) { + space = new hnswlib::L2Space(dim); + } else if (config[Metric::TYPE] == Metric::IP) { + space = new hnswlib::InnerProductSpace(dim); + normalize = true; + } + index_ = std::make_shared>( + space, rows, config[IndexParams::M].get(), config[IndexParams::efConstruction].get()); + } catch (std::exception& e) { + KNOWHERE_THROW_MSG(e.what()); + } +} + +void +IndexHNSW_NM::Add(const DatasetPtr& dataset_ptr, const Config& config) { + // It will not call Query() just after Add() + // So, not to set 'data_' is allowed. 
+ + if (!index_) { + KNOWHERE_THROW_MSG("index not initialize"); + } + + std::lock_guard lk(mutex_); + + GETTENSORWITHIDS(dataset_ptr) + + auto base = index_->getCurrentElementCount(); + auto pp_data = const_cast(p_data); + index_->addPoint(pp_data, p_ids[0], base, 0); +#pragma omp parallel for + for (int i = 1; i < rows; ++i) { + faiss::BuilderSuspend::check_wait(); + index_->addPoint(pp_data, p_ids[i], base, i); + } +} + +DatasetPtr +IndexHNSW_NM::Query(const DatasetPtr& dataset_ptr, const Config& config) { + if (!index_) { + KNOWHERE_THROW_MSG("index not initialize or trained"); + } + GETTENSOR(dataset_ptr) + + size_t k = config[meta::TOPK].get(); + size_t id_size = sizeof(int64_t) * k; + size_t dist_size = sizeof(float) * k; + auto p_id = (int64_t*)malloc(id_size * rows); + auto p_dist = (float*)malloc(dist_size * rows); + + index_->setEf(config[IndexParams::ef]); + + using P = std::pair; + auto compare = [](const P& v1, const P& v2) { return v1.first < v2.first; }; + + faiss::ConcurrentBitsetPtr blacklist = GetBlacklist(); +#pragma omp parallel for + for (unsigned int i = 0; i < rows; ++i) { + std::vector

ret; + const float* single_query = (float*)p_data + i * dim; + + ret = index_->searchKnn_NM((void*)single_query, k, compare, blacklist, (float*)(data_.get())); + + while (ret.size() < k) { + ret.emplace_back(std::make_pair(-1, -1)); + } + std::vector dist; + std::vector ids; + + if (normalize) { + std::transform(ret.begin(), ret.end(), std::back_inserter(dist), + [](const std::pair& e) { return float(1 - e.first); }); + } else { + std::transform(ret.begin(), ret.end(), std::back_inserter(dist), + [](const std::pair& e) { return e.first; }); + } + std::transform(ret.begin(), ret.end(), std::back_inserter(ids), + [](const std::pair& e) { return e.second; }); + + memcpy(p_dist + i * k, dist.data(), dist_size); + memcpy(p_id + i * k, ids.data(), id_size); + } + + auto ret_ds = std::make_shared(); + ret_ds->Set(meta::IDS, p_id); + ret_ds->Set(meta::DISTANCE, p_dist); + return ret_ds; +} + +int64_t +IndexHNSW_NM::Count() { + if (!index_) { + KNOWHERE_THROW_MSG("index not initialize"); + } + return index_->cur_element_count; +} + +int64_t +IndexHNSW_NM::Dim() { + if (!index_) { + KNOWHERE_THROW_MSG("index not initialize"); + } + return (*(size_t*)index_->dist_func_param_); +} + +} // namespace knowhere +} // namespace milvus diff --git a/core/src/index/knowhere/knowhere/index/vector_offset_index/IndexHNSW_NM.h b/core/src/index/knowhere/knowhere/index/vector_offset_index/IndexHNSW_NM.h new file mode 100644 index 0000000000..815a99e05b --- /dev/null +++ b/core/src/index/knowhere/knowhere/index/vector_offset_index/IndexHNSW_NM.h @@ -0,0 +1,66 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under the License. 
+ +#pragma once + +#include +#include + +#include "hnswlib/hnswalg_nm.h" +#include "hnswlib/hnswlib.h" + +#include "knowhere/common/Exception.h" +#include "knowhere/index/vector_index/VecIndex.h" + +namespace milvus { +namespace knowhere { + +class IndexHNSW_NM : public VecIndex { + public: + IndexHNSW_NM() { + index_type_ = IndexEnum::INDEX_HNSW; + } + + BinarySet + Serialize(const Config& config = Config()) override; + + void + Load(const BinarySet& index_binary) override; + + void + Train(const DatasetPtr& dataset_ptr, const Config& config) override; + + void + Add(const DatasetPtr& dataset_ptr, const Config& config) override; + + void + AddWithoutIds(const DatasetPtr&, const Config&) override { + KNOWHERE_THROW_MSG("Incremental index is not supported"); + } + + DatasetPtr + Query(const DatasetPtr& dataset_ptr, const Config& config) override; + + int64_t + Count() override; + + int64_t + Dim() override; + + private: + bool normalize = false; + std::mutex mutex_; + std::shared_ptr> index_ = nullptr; + std::shared_ptr data_ = nullptr; +}; + +} // namespace knowhere +} // namespace milvus diff --git a/core/src/index/knowhere/knowhere/index/vector_offset_index/IndexIVF_NM.cpp b/core/src/index/knowhere/knowhere/index/vector_offset_index/IndexIVF_NM.cpp new file mode 100644 index 0000000000..370c25b022 --- /dev/null +++ b/core/src/index/knowhere/knowhere/index/vector_offset_index/IndexIVF_NM.cpp @@ -0,0 +1,339 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. 
See the License for the specific language governing permissions and limitations under the License + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef MILVUS_GPU_VERSION +#include +#include +#endif + +#include +#include +#include +#include +#include +#include + +#include "faiss/BuilderSuspend.h" +#include "knowhere/common/Exception.h" +#include "knowhere/common/Log.h" +#include "knowhere/index/vector_index/adapter/VectorAdapter.h" +#include "knowhere/index/vector_index/helpers/IndexParameter.h" +#include "knowhere/index/vector_offset_index/IndexIVF_NM.h" +#ifdef MILVUS_GPU_VERSION +#include "knowhere/index/vector_index/gpu/IndexGPUIVF.h" +#include "knowhere/index/vector_index/helpers/FaissGpuResourceMgr.h" +#endif + +namespace milvus { +namespace knowhere { + +using stdclock = std::chrono::high_resolution_clock; + +BinarySet +IVF_NM::Serialize(const Config& config) { + if (!index_ || !index_->is_trained) { + KNOWHERE_THROW_MSG("index not initialize or trained"); + } + + std::lock_guard lk(mutex_); + return SerializeImpl(index_type_); +} + +void +IVF_NM::Load(const BinarySet& binary_set) { + std::lock_guard lk(mutex_); + LoadImpl(binary_set, index_type_); + + // Construct arranged data from original data + auto binary = binary_set.GetByName(RAW_DATA); + const float* original_data = (const float*)binary->data.get(); + auto ivf_index = dynamic_cast(index_.get()); + auto invlists = ivf_index->invlists; + auto d = ivf_index->d; + auto nb = (size_t)(binary->size / invlists->code_size); + auto arranged_data = new uint8_t[d * sizeof(float) * nb]; + prefix_sum.resize(invlists->nlist); + size_t curr_index = 0; + +#ifndef MILVUS_GPU_VERSION + auto ails = dynamic_cast(invlists); + for (int i = 0; i < invlists->nlist; i++) { + auto list_size = ails->ids[i].size(); + for (int j = 0; j < list_size; j++) { + memcpy(arranged_data + d * sizeof(float) * (curr_index + j), original_data + d * ails->ids[i][j], + d * sizeof(float)); + } + prefix_sum[i] = curr_index; + curr_index += list_size; + } +#else + auto rol = dynamic_cast(invlists); + auto lengths = rol->readonly_length; + auto rol_ids = (const int64_t*)rol->pin_readonly_ids->data; + for (int i = 0; i < invlists->nlist; i++) { + auto list_size = lengths[i]; + for (int j = 0; j < list_size; j++) { + memcpy(arranged_data + d * sizeof(float) * (curr_index + j), original_data + d * rol_ids[curr_index + j], + d * sizeof(float)); + } + prefix_sum[i] = curr_index; + curr_index += list_size; + } +#endif + data_ = std::shared_ptr(arranged_data); +} + +void +IVF_NM::Train(const DatasetPtr& dataset_ptr, const Config& config) { + GETTENSOR(dataset_ptr) + + faiss::Index* coarse_quantizer = new faiss::IndexFlatL2(dim); + int64_t nlist = config[IndexParams::nlist].get(); + faiss::MetricType metric_type = GetMetricType(config[Metric::TYPE].get()); + auto index = std::make_shared(coarse_quantizer, dim, nlist, metric_type); + index->train(rows, (float*)p_data); + + index_.reset(faiss::clone_index(index.get())); +} + +void +IVF_NM::Add(const DatasetPtr& dataset_ptr, const Config& config) { + if (!index_ || !index_->is_trained) { + KNOWHERE_THROW_MSG("index not initialize or trained"); + } + + std::lock_guard lk(mutex_); + GETTENSORWITHIDS(dataset_ptr) + index_->add_with_ids_without_codes(rows, (float*)p_data, p_ids); +} + +void +IVF_NM::AddWithoutIds(const DatasetPtr& dataset_ptr, const Config& config) { + if (!index_ || !index_->is_trained) { + KNOWHERE_THROW_MSG("index not initialize or trained"); + } + + std::lock_guard 
lk(mutex_); + GETTENSOR(dataset_ptr) + index_->add_without_codes(rows, (float*)p_data); +} + +DatasetPtr +IVF_NM::Query(const DatasetPtr& dataset_ptr, const Config& config) { + if (!index_ || !index_->is_trained) { + KNOWHERE_THROW_MSG("index not initialize or trained"); + } + + GETTENSOR(dataset_ptr) + + try { + fiu_do_on("IVF_NM.Search.throw_std_exception", throw std::exception()); + fiu_do_on("IVF_NM.Search.throw_faiss_exception", throw faiss::FaissException("")); + int64_t k = config[meta::TOPK].get(); + auto elems = rows * k; + + size_t p_id_size = sizeof(int64_t) * elems; + size_t p_dist_size = sizeof(float) * elems; + auto p_id = (int64_t*)malloc(p_id_size); + auto p_dist = (float*)malloc(p_dist_size); + + QueryImpl(rows, (float*)p_data, k, p_dist, p_id, config); + + auto ret_ds = std::make_shared(); + ret_ds->Set(meta::IDS, p_id); + ret_ds->Set(meta::DISTANCE, p_dist); + return ret_ds; + } catch (faiss::FaissException& e) { + KNOWHERE_THROW_MSG(e.what()); + } catch (std::exception& e) { + KNOWHERE_THROW_MSG(e.what()); + } +} + +#if 0 +DatasetPtr +IVF_NM::QueryById(const DatasetPtr& dataset_ptr, const Config& config) { + if (!index_ || !index_->is_trained) { + KNOWHERE_THROW_MSG("index not initialize or trained"); + } + + auto rows = dataset_ptr->Get(meta::ROWS); + auto p_data = dataset_ptr->Get(meta::IDS); + + try { + int64_t k = config[meta::TOPK].get(); + auto elems = rows * k; + + size_t p_id_size = sizeof(int64_t) * elems; + size_t p_dist_size = sizeof(float) * elems; + auto p_id = (int64_t*)malloc(p_id_size); + auto p_dist = (float*)malloc(p_dist_size); + + // todo: enable search by id (zhiru) + // auto blacklist = dataset_ptr->Get("bitset"); + auto index_ivf = std::static_pointer_cast(index_); + index_ivf->search_by_id(rows, p_data, k, p_dist, p_id, bitset_); + + auto ret_ds = std::make_shared(); + ret_ds->Set(meta::IDS, p_id); + ret_ds->Set(meta::DISTANCE, p_dist); + return ret_ds; + } catch (faiss::FaissException& e) { + KNOWHERE_THROW_MSG(e.what()); + } catch (std::exception& e) { + KNOWHERE_THROW_MSG(e.what()); + } +} + +DatasetPtr +IVF_NM::GetVectorById(const DatasetPtr& dataset_ptr, const Config& config) { + if (!index_ || !index_->is_trained) { + KNOWHERE_THROW_MSG("index not initialize or trained"); + } + + auto p_data = dataset_ptr->Get(meta::IDS); + auto elems = dataset_ptr->Get(meta::DIM); + + try { + size_t p_x_size = sizeof(float) * elems; + auto p_x = (float*)malloc(p_x_size); + + auto index_ivf = std::static_pointer_cast(index_); + index_ivf->get_vector_by_id(1, p_data, p_x, bitset_); + + auto ret_ds = std::make_shared(); + ret_ds->Set(meta::TENSOR, p_x); + return ret_ds; + } catch (faiss::FaissException& e) { + KNOWHERE_THROW_MSG(e.what()); + } catch (std::exception& e) { + KNOWHERE_THROW_MSG(e.what()); + } +} +#endif + +void +IVF_NM::Seal() { + if (!index_ || !index_->is_trained) { + KNOWHERE_THROW_MSG("index not initialize or trained"); + } + SealImpl(); +} + +VecIndexPtr +IVF_NM::CopyCpuToGpu(const int64_t device_id, const Config& config) { +#ifdef MILVUS_GPU_VERSION + if (auto res = FaissGpuResourceMgr::GetInstance().GetRes(device_id)) { + ResScope rs(res, device_id, false); + auto gpu_index = + faiss::gpu::index_cpu_to_gpu_without_codes(res->faiss_res.get(), device_id, index_.get(), data_.get()); + + std::shared_ptr device_index; + device_index.reset(gpu_index); + return std::make_shared(device_index, device_id, res); + } else { + KNOWHERE_THROW_MSG("CopyCpuToGpu Error, can't get gpu_resource"); + } + +#else + KNOWHERE_THROW_MSG("Calling 
IVF_NM::CopyCpuToGpu when we are using CPU version"); +#endif +} + +void +IVF_NM::GenGraph(const float* data, const int64_t k, GraphType& graph, const Config& config) { + int64_t K = k + 1; + auto ntotal = Count(); + + size_t dim = config[meta::DIM]; + auto batch_size = 1000; + auto tail_batch_size = ntotal % batch_size; + auto batch_search_count = ntotal / batch_size; + auto total_search_count = tail_batch_size == 0 ? batch_search_count : batch_search_count + 1; + + std::vector res_dis(K * batch_size); + graph.resize(ntotal); + GraphType res_vec(total_search_count); + for (int i = 0; i < total_search_count; ++i) { + // it is usually used in NSG::train, to check BuilderSuspend + faiss::BuilderSuspend::check_wait(); + + auto b_size = (i == (total_search_count - 1)) && tail_batch_size != 0 ? tail_batch_size : batch_size; + + auto& res = res_vec[i]; + res.resize(K * b_size); + + auto xq = data + batch_size * dim * i; + QueryImpl(b_size, (float*)xq, K, res_dis.data(), res.data(), config); + + for (int j = 0; j < b_size; ++j) { + auto& node = graph[batch_size * i + j]; + node.resize(k); + auto start_pos = j * K + 1; + for (int m = 0, cursor = start_pos; m < k && cursor < start_pos + k; ++m, ++cursor) { + node[m] = res[cursor]; + } + } + } +} + +std::shared_ptr +IVF_NM::GenParams(const Config& config) { + auto params = std::make_shared(); + params->nprobe = config[IndexParams::nprobe]; + // params->max_codes = config["max_codes"]; + return params; +} + +void +IVF_NM::QueryImpl(int64_t n, const float* data, int64_t k, float* distances, int64_t* labels, const Config& config) { + auto params = GenParams(config); + auto ivf_index = dynamic_cast(index_.get()); + ivf_index->nprobe = params->nprobe; + stdclock::time_point before = stdclock::now(); + if (params->nprobe > 1 && n <= 4) { + ivf_index->parallel_mode = 1; + } else { + ivf_index->parallel_mode = 0; + } + ivf_index->search_without_codes(n, (float*)data, (const uint8_t*)data_.get(), prefix_sum, k, distances, labels, + bitset_); + stdclock::time_point after = stdclock::now(); + double search_cost = (std::chrono::duration(after - before)).count(); + LOG_KNOWHERE_DEBUG_ << "IVF_NM search cost: " << search_cost + << ", quantization cost: " << faiss::indexIVF_stats.quantization_time + << ", data search cost: " << faiss::indexIVF_stats.search_time; + faiss::indexIVF_stats.quantization_time = 0; + faiss::indexIVF_stats.search_time = 0; +} + +void +IVF_NM::SealImpl() { +#ifdef MILVUS_GPU_VERSION + faiss::Index* index = index_.get(); + auto idx = dynamic_cast(index); + if (idx != nullptr) { + idx->to_readonly_without_codes(); + } +#endif +} + +} // namespace knowhere +} // namespace milvus diff --git a/core/src/index/knowhere/knowhere/index/vector_offset_index/IndexIVF_NM.h b/core/src/index/knowhere/knowhere/index/vector_offset_index/IndexIVF_NM.h new file mode 100644 index 0000000000..4371388afc --- /dev/null +++ b/core/src/index/knowhere/knowhere/index/vector_offset_index/IndexIVF_NM.h @@ -0,0 +1,104 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. 
See the License for the specific language governing permissions and limitations under the License + +#pragma once + +#include +#include +#include +#include + +#include + +#include "knowhere/common/Typedef.h" +#include "knowhere/index/vector_index/VecIndex.h" +#include "knowhere/index/vector_offset_index/OffsetBaseIndex.h" + +namespace milvus { +namespace knowhere { + +class IVF_NM : public VecIndex, public OffsetBaseIndex { + public: + IVF_NM() : OffsetBaseIndex(nullptr) { + index_type_ = IndexEnum::INDEX_FAISS_IVFFLAT; + } + + explicit IVF_NM(std::shared_ptr index) : OffsetBaseIndex(std::move(index)) { + index_type_ = IndexEnum::INDEX_FAISS_IVFFLAT; + } + + BinarySet + Serialize(const Config& config = Config()) override; + + void + Load(const BinarySet&) override; + + void + Train(const DatasetPtr&, const Config&) override; + + void + Add(const DatasetPtr&, const Config&) override; + + void + AddWithoutIds(const DatasetPtr&, const Config&) override; + + DatasetPtr + Query(const DatasetPtr&, const Config&) override; + +#if 0 + DatasetPtr + QueryById(const DatasetPtr& dataset, const Config& config) override; +#endif + + int64_t + Count() override { + return index_->ntotal; + } + + int64_t + Dim() override { + return index_->d; + } + +#if 0 + DatasetPtr + GetVectorById(const DatasetPtr& dataset, const Config& config) override; +#endif + + virtual void + Seal(); + + virtual VecIndexPtr + CopyCpuToGpu(const int64_t, const Config&); + + virtual void + GenGraph(const float* data, const int64_t k, GraphType& graph, const Config& config); + + protected: + virtual std::shared_ptr + GenParams(const Config&); + + virtual void + QueryImpl(int64_t, const float*, int64_t, float*, int64_t*, const Config&); + + void + SealImpl() override; + + protected: + std::mutex mutex_; + std::shared_ptr data_ = nullptr; + std::vector prefix_sum; +}; + +using IVFNMPtr = std::shared_ptr; + +} // namespace knowhere +} // namespace milvus diff --git a/core/src/index/knowhere/knowhere/index/vector_offset_index/IndexNSG_NM.cpp b/core/src/index/knowhere/knowhere/index/vector_offset_index/IndexNSG_NM.cpp new file mode 100644 index 0000000000..c71ef81867 --- /dev/null +++ b/core/src/index/knowhere/knowhere/index/vector_offset_index/IndexNSG_NM.cpp @@ -0,0 +1,163 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. 
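// NSG_NM::Serialize / NSG_NM::Load below keep the graph and the raw vectors
// apart: Serialize emits only the NsgIndex structure under the "NSG_NM" key,
// while Load additionally pulls the original float vectors from the RAW_DATA
// entry of the same BinarySet. A caller therefore assembles something like the
// following (names other than "NSG_NM" and RAW_DATA are illustrative):
//
//   knowhere::BinarySet bs = nsg_nm->Serialize(conf);        // graph only
//   bs.Append(RAW_DATA, raw_vector_blob, nb * dim * sizeof(float));
//   other_nsg_nm->Load(bs);                                  // graph + external data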
See the License for the specific language governing permissions and limitations under the License + +#include +#include + +#include "knowhere/common/Exception.h" +#include "knowhere/common/Timer.h" +#include "knowhere/index/vector_index/IndexIDMAP.h" +#include "knowhere/index/vector_index/IndexIVF.h" +#include "knowhere/index/vector_index/IndexType.h" +#include "knowhere/index/vector_index/adapter/VectorAdapter.h" +#include "knowhere/index/vector_index/impl/nsg/NSGIO.h" +#include "knowhere/index/vector_offset_index/IndexNSG_NM.h" + +#ifdef MILVUS_GPU_VERSION +#include "knowhere/index/vector_index/gpu/IndexGPUIDMAP.h" +#include "knowhere/index/vector_index/gpu/IndexGPUIVF.h" +#include "knowhere/index/vector_index/helpers/Cloner.h" +#endif + +namespace milvus { +namespace knowhere { + +BinarySet +NSG_NM::Serialize(const Config& config) { + if (!index_ || !index_->is_trained) { + KNOWHERE_THROW_MSG("index not initialize or trained"); + } + + try { + fiu_do_on("NSG_NM.Serialize.throw_exception", throw std::exception()); + std::lock_guard lk(mutex_); + impl::NsgIndex* index = index_.get(); + + MemoryIOWriter writer; + impl::write_index(index, writer); + std::shared_ptr data(writer.data_); + + BinarySet res_set; + res_set.Append("NSG_NM", data, writer.rp); + return res_set; + } catch (std::exception& e) { + KNOWHERE_THROW_MSG(e.what()); + } +} + +void +NSG_NM::Load(const BinarySet& index_binary) { + try { + fiu_do_on("NSG_NM.Load.throw_exception", throw std::exception()); + std::lock_guard lk(mutex_); + auto binary = index_binary.GetByName("NSG_NM"); + + MemoryIOReader reader; + reader.total = binary->size; + reader.data_ = binary->data.get(); + + auto index = impl::read_index(reader); + index_.reset(index); + + data_ = index_binary.GetByName(RAW_DATA)->data; + } catch (std::exception& e) { + KNOWHERE_THROW_MSG(e.what()); + } +} + +DatasetPtr +NSG_NM::Query(const DatasetPtr& dataset_ptr, const Config& config) { + if (!index_ || !index_->is_trained) { + KNOWHERE_THROW_MSG("index not initialize or trained"); + } + + GETTENSOR(dataset_ptr) + + try { + auto topK = config[meta::TOPK].get(); + auto elems = rows * topK; + size_t p_id_size = sizeof(int64_t) * elems; + size_t p_dist_size = sizeof(float) * elems; + auto p_id = (int64_t*)malloc(p_id_size); + auto p_dist = (float*)malloc(p_dist_size); + + faiss::ConcurrentBitsetPtr blacklist = GetBlacklist(); + + impl::SearchParams s_params; + s_params.search_length = config[IndexParams::search_length]; + s_params.k = config[meta::TOPK]; + { + std::lock_guard lk(mutex_); + // index_->ori_data_ = (float*) data_.get(); + index_->Search((float*)p_data, (float*)data_.get(), rows, dim, topK, p_dist, p_id, s_params, blacklist); + } + + auto ret_ds = std::make_shared(); + ret_ds->Set(meta::IDS, p_id); + ret_ds->Set(meta::DISTANCE, p_dist); + return ret_ds; + } catch (std::exception& e) { + KNOWHERE_THROW_MSG(e.what()); + } +} + +void +NSG_NM::Train(const DatasetPtr& dataset_ptr, const Config& config) { + auto idmap = std::make_shared(); + idmap->Train(dataset_ptr, config); + idmap->AddWithoutIds(dataset_ptr, config); + impl::Graph knng; + const float* raw_data = idmap->GetRawVectors(); + const int64_t k = config[IndexParams::knng].get(); +#ifdef MILVUS_GPU_VERSION + const int64_t device_id = config[knowhere::meta::DEVICEID].get(); + if (device_id == -1) { + auto preprocess_index = std::make_shared(); + preprocess_index->Train(dataset_ptr, config); + preprocess_index->AddWithoutIds(dataset_ptr, config); + preprocess_index->GenGraph(raw_data, k, knng, config); + } 
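/* With device_id == -1 the CPU branch above builds the k-NN graph: a
   throw-away IVF index is trained on the same dataset and GenGraph fills
   knng. Otherwise the IDMAP index is cloned to the requested GPU and the
   graph is generated there: */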
else { + auto gpu_idx = cloner::CopyCpuToGpu(idmap, device_id, config); + auto gpu_idmap = std::dynamic_pointer_cast(gpu_idx); + gpu_idmap->GenGraph(raw_data, k, knng, config); + } +#else + auto preprocess_index = std::make_shared(); + preprocess_index->Train(dataset_ptr, config); + preprocess_index->AddWithoutIds(dataset_ptr, config); + preprocess_index->GenGraph(raw_data, k, knng, config); +#endif + + impl::BuildParams b_params; + b_params.candidate_pool_size = config[IndexParams::candidate]; + b_params.out_degree = config[IndexParams::out_degree]; + b_params.search_length = config[IndexParams::search_length]; + + auto p_ids = dataset_ptr->Get(meta::IDS); + + GETTENSOR(dataset_ptr) + index_ = std::make_shared(dim, rows, config[Metric::TYPE].get()); + index_->SetKnnGraph(knng); + index_->Build_with_ids(rows, (float*)p_data, (int64_t*)p_ids, b_params); +} + +int64_t +NSG_NM::Count() { + return index_->ntotal; +} + +int64_t +NSG_NM::Dim() { + return index_->dimension; +} + +} // namespace knowhere +} // namespace milvus diff --git a/core/src/index/knowhere/knowhere/index/vector_offset_index/IndexNSG_NM.h b/core/src/index/knowhere/knowhere/index/vector_offset_index/IndexNSG_NM.h new file mode 100644 index 0000000000..dba6cab117 --- /dev/null +++ b/core/src/index/knowhere/knowhere/index/vector_offset_index/IndexNSG_NM.h @@ -0,0 +1,80 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. 
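// The tail of NSG_NM::Train above wires everything together: the k-NN graph
// (knng) seeds the NsgIndex via SetKnnGraph, and Build_with_ids prunes it into
// the final navigating spreading-out graph using candidate_pool_size,
// out_degree and search_length. The index does not retain the raw vectors; at
// query time they are supplied from the separately loaded data_ blob, as
// NSG_NM::Query above shows.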
See the License for the specific language governing permissions and limitations under the License + +#pragma once + +#include +#include + +#include "knowhere/common/Exception.h" +#include "knowhere/common/Log.h" +#include "knowhere/index/vector_index/VecIndex.h" + +namespace milvus { +namespace knowhere { + +namespace impl { +class NsgIndex; +} + +class NSG_NM : public VecIndex { + public: + explicit NSG_NM(const int64_t gpu_num = -1) : gpu_(gpu_num) { + if (gpu_ >= 0) { + index_mode_ = IndexMode::MODE_GPU; + } + index_type_ = IndexEnum::INDEX_NSG; + } + + BinarySet + Serialize(const Config& config = Config()) override; + + void + Load(const BinarySet&) override; + + void + BuildAll(const DatasetPtr& dataset_ptr, const Config& config) override { + Train(dataset_ptr, config); + } + + void + Train(const DatasetPtr&, const Config&) override; + + void + Add(const DatasetPtr&, const Config&) override { + KNOWHERE_THROW_MSG("Incremental index is not supported"); + } + + void + AddWithoutIds(const DatasetPtr&, const Config&) override { + KNOWHERE_THROW_MSG("Addwithoutids is not supported"); + } + + DatasetPtr + Query(const DatasetPtr&, const Config&) override; + + int64_t + Count() override; + + int64_t + Dim() override; + + private: + std::mutex mutex_; + int64_t gpu_; + std::shared_ptr index_ = nullptr; + std::shared_ptr data_ = nullptr; +}; + +using NSG_NMIndexPtr = std::shared_ptr(); + +} // namespace knowhere +} // namespace milvus diff --git a/core/src/index/knowhere/knowhere/index/vector_offset_index/OffsetBaseIndex.cpp b/core/src/index/knowhere/knowhere/index/vector_offset_index/OffsetBaseIndex.cpp new file mode 100644 index 0000000000..6becbba0ad --- /dev/null +++ b/core/src/index/knowhere/knowhere/index/vector_offset_index/OffsetBaseIndex.cpp @@ -0,0 +1,56 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. 
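// OffsetBaseIndex below factors out the "ids and offsets only" persistence
// shared by the *_NM indexes: SerializeImpl streams the faiss index through
// write_index_nm (no code bytes are written) and LoadImpl restores it with
// read_index_nm and then seals it. The raw vectors are expected to arrive
// separately, e.g. through the RAW_DATA binary as NSG_NM::Load does above.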
See the License for the specific language governing permissions and limitations under the License + +#include +#include + +#include "knowhere/common/Exception.h" +#include "knowhere/index/vector_index/IndexType.h" +#include "knowhere/index/vector_index/helpers/FaissIO.h" +#include "knowhere/index/vector_offset_index/OffsetBaseIndex.h" + +namespace milvus { +namespace knowhere { + +BinarySet +OffsetBaseIndex::SerializeImpl(const IndexType& type) { + try { + fiu_do_on("OffsetBaseIndex.SerializeImpl.throw_exception", throw std::exception()); + faiss::Index* index = index_.get(); + + MemoryIOWriter writer; + faiss::write_index_nm(index, &writer); + std::shared_ptr data(writer.data_); + + BinarySet res_set; + res_set.Append("IVF", data, writer.rp); + return res_set; + } catch (std::exception& e) { + KNOWHERE_THROW_MSG(e.what()); + } +} + +void +OffsetBaseIndex::LoadImpl(const BinarySet& binary_set, const IndexType& type) { + auto binary = binary_set.GetByName("IVF"); + + MemoryIOReader reader; + reader.total = binary->size; + reader.data_ = binary->data.get(); + + faiss::Index* index = faiss::read_index_nm(&reader); + index_.reset(index); + + SealImpl(); +} + +} // namespace knowhere +} // namespace milvus diff --git a/core/src/index/knowhere/knowhere/index/vector_offset_index/OffsetBaseIndex.h b/core/src/index/knowhere/knowhere/index/vector_offset_index/OffsetBaseIndex.h new file mode 100644 index 0000000000..19908ce345 --- /dev/null +++ b/core/src/index/knowhere/knowhere/index/vector_offset_index/OffsetBaseIndex.h @@ -0,0 +1,45 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under the License + +#pragma once + +#include +#include + +#include + +#include "knowhere/common/BinarySet.h" +#include "knowhere/index/vector_index/IndexType.h" + +namespace milvus { +namespace knowhere { + +class OffsetBaseIndex { + protected: + explicit OffsetBaseIndex(std::shared_ptr index) : index_(std::move(index)) { + } + + virtual BinarySet + SerializeImpl(const IndexType& type); + + virtual void + LoadImpl(const BinarySet&, const IndexType& type); + + virtual void + SealImpl() { /* do nothing */ + } + + public: + std::shared_ptr index_ = nullptr; +}; + +} // namespace knowhere +} // namespace milvus diff --git a/core/src/index/knowhere/knowhere/index/vector_offset_index/gpu/IndexGPUIVF_NM.cpp b/core/src/index/knowhere/knowhere/index/vector_offset_index/gpu/IndexGPUIVF_NM.cpp new file mode 100644 index 0000000000..b5109e3ecb --- /dev/null +++ b/core/src/index/knowhere/knowhere/index/vector_offset_index/gpu/IndexGPUIVF_NM.cpp @@ -0,0 +1,182 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under the License + +#include + +#include +#include +#include +#include +#include +#include + +#include "knowhere/common/Exception.h" +#include "knowhere/index/vector_index/adapter/VectorAdapter.h" +#include "knowhere/index/vector_index/helpers/Cloner.h" +#include "knowhere/index/vector_index/helpers/FaissIO.h" +#include "knowhere/index/vector_index/helpers/IndexParameter.h" +#include "knowhere/index/vector_offset_index/IndexIVF_NM.h" +#include "knowhere/index/vector_offset_index/gpu/IndexGPUIVF_NM.h" + +namespace milvus { +namespace knowhere { + +void +GPUIVF_NM::Train(const DatasetPtr& dataset_ptr, const Config& config) { + GETTENSOR(dataset_ptr) + gpu_id_ = config[knowhere::meta::DEVICEID]; + + auto gpu_res = FaissGpuResourceMgr::GetInstance().GetRes(gpu_id_); + if (gpu_res != nullptr) { + ResScope rs(gpu_res, gpu_id_, true); + faiss::gpu::GpuIndexIVFFlatConfig idx_config; + idx_config.device = gpu_id_; + int32_t nlist = config[IndexParams::nlist]; + faiss::MetricType metric_type = GetMetricType(config[Metric::TYPE].get()); + faiss::gpu::GpuIndexIVFFlat device_index(gpu_res->faiss_res.get(), dim, nlist, metric_type, idx_config); + device_index.train(rows, (float*)p_data); + + std::shared_ptr host_index = nullptr; + host_index.reset(faiss::gpu::index_gpu_to_cpu(&device_index)); + + auto device_index1 = faiss::gpu::index_cpu_to_gpu(gpu_res->faiss_res.get(), gpu_id_, host_index.get()); + index_.reset(device_index1); + res_ = gpu_res; + } else { + KNOWHERE_THROW_MSG("Build IVF can't get gpu resource"); + } +} + +void +GPUIVF_NM::Add(const DatasetPtr& dataset_ptr, const Config& config) { + if (auto spt = res_.lock()) { + ResScope rs(res_, gpu_id_); + IVF::Add(dataset_ptr, config); + } else { + KNOWHERE_THROW_MSG("Add IVF can't get gpu resource"); + } +} + +void +GPUIVF_NM::Load(const BinarySet& binary_set) { + /* + std::lock_guard lk(mutex_); + auto binary = binary_set.GetByName("IVF"); + MemoryIOReader reader; + reader.total = binary->size; + reader.data_ = binary->data.get(); + faiss::Index* index = faiss::read_index_nm(&reader); + index_.reset(index); + // Construct arranged data from original data + auto binary_data = binary_set.GetByName(RAW_DATA); + const float* original_data = (const float*) binary_data->data.get(); + auto ivf_index = dynamic_cast(index_.get()); + auto invlists = ivf_index->invlists; + auto ails = dynamic_cast(invlists); + auto d = ivf_index->d; + auto nb = (size_t) (binary_data->size / ails->code_size); + arranged_data = new uint8_t[d * sizeof(float) * nb]; + size_t curr_index = 0; + for (int i = 0; i < ails->nlist; i++) { + auto list_size = ails->ids[i].size(); + for (int j = 0; j < list_size; j++) { + memcpy(arranged_data + d * sizeof(float) * (curr_index + j), original_data + d * ails->ids[i][j], + d * sizeof(float)); + } + curr_index += list_size; + } + if (auto temp_res = FaissGpuResourceMgr::GetInstance().GetRes(gpu_id_)) { + ResScope rs(temp_res, gpu_id_, false); + auto device_index = + faiss::gpu::index_cpu_to_gpu_without_codes(temp_res->faiss_res.get(), gpu_id_, index, arranged_data); + index_.reset(device_index); + res_ = temp_res; + } else { + KNOWHERE_THROW_MSG("Load 
error, can't get gpu resource"); + } + delete index; + */ + + // not supported +} + +VecIndexPtr +GPUIVF_NM::CopyGpuToCpu(const Config& config) { + std::lock_guard lk(mutex_); + + if (auto device_idx = std::dynamic_pointer_cast(index_)) { + faiss::Index* device_index = index_.get(); + faiss::Index* host_index = faiss::gpu::index_gpu_to_cpu_without_codes(device_index); + + std::shared_ptr new_index; + new_index.reset(host_index); + return std::make_shared(new_index); + } else { + return std::make_shared(index_); + } +} + +VecIndexPtr +GPUIVF_NM::CopyGpuToGpu(const int64_t device_id, const Config& config) { + auto host_index = CopyGpuToCpu(config); + return std::static_pointer_cast(host_index)->CopyCpuToGpu(device_id, config); +} + +BinarySet +GPUIVF_NM::SerializeImpl(const IndexType& type) { + if (!index_ || !index_->is_trained) { + KNOWHERE_THROW_MSG("index not initialize or trained"); + } + + try { + fiu_do_on("GPUIVF_NM.SerializeImpl.throw_exception", throw std::exception()); + MemoryIOWriter writer; + { + faiss::Index* index = index_.get(); + faiss::Index* host_index = faiss::gpu::index_gpu_to_cpu_without_codes(index); + faiss::write_index_nm(host_index, &writer); + delete host_index; + } + std::shared_ptr data(writer.data_); + + BinarySet res_set; + res_set.Append("IVF", data, writer.rp); + + return res_set; + } catch (std::exception& e) { + KNOWHERE_THROW_MSG(e.what()); + } +} + +void +GPUIVF_NM::QueryImpl(int64_t n, const float* data, int64_t k, float* distances, int64_t* labels, const Config& config) { + std::lock_guard lk(mutex_); + + auto device_index = std::dynamic_pointer_cast(index_); + fiu_do_on("GPUIVF_NM.search_impl.invald_index", device_index = nullptr); + if (device_index) { + device_index->nprobe = config[IndexParams::nprobe]; + ResScope rs(res_, gpu_id_); + + // if query size > 2048 we search by blocks to avoid malloc issue + const int64_t block_size = 2048; + int64_t dim = device_index->d; + for (int64_t i = 0; i < n; i += block_size) { + int64_t search_size = (n - i > block_size) ? block_size : (n - i); + device_index->search(search_size, (float*)data + i * dim, k, distances + i * k, labels + i * k, bitset_); + } + } else { + KNOWHERE_THROW_MSG("Not a GpuIndexIVF type."); + } +} + +} // namespace knowhere +} // namespace milvus diff --git a/core/src/index/knowhere/knowhere/index/vector_offset_index/gpu/IndexGPUIVF_NM.h b/core/src/index/knowhere/knowhere/index/vector_offset_index/gpu/IndexGPUIVF_NM.h new file mode 100644 index 0000000000..7b4254f200 --- /dev/null +++ b/core/src/index/knowhere/knowhere/index/vector_offset_index/gpu/IndexGPUIVF_NM.h @@ -0,0 +1,63 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. 
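// The commented-out GPUIVF_NM::Load above documents the data layout this patch
// relies on: before the vectors are handed to the GPU index they are
// re-arranged so that each inverted list's members become contiguous, which is
// what makes the prefix_sum offsets meaningful. A simplified CPU-side sketch of
// that rearrangement; ArrangeByLists is a hypothetical helper and list_ids
// stands in for ails->ids:

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

static std::vector<float> ArrangeByLists(const float* original, size_t dim,
                                         const std::vector<std::vector<int64_t>>& list_ids) {
    size_t total = 0;
    for (const auto& ids : list_ids) {
        total += ids.size();
    }
    std::vector<float> arranged(total * dim);
    size_t cursor = 0;  // running vector offset; equals prefix_sum[list] at each list start
    for (const auto& ids : list_ids) {
        for (int64_t id : ids) {
            std::memcpy(arranged.data() + cursor * dim, original + id * dim, dim * sizeof(float));
            ++cursor;
        }
    }
    return arranged;
}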
See the License for the specific language governing permissions and limitations under the License + +#pragma once + +#include +#include + +#include "knowhere/index/vector_index/IndexIVF.h" +#include "knowhere/index/vector_index/gpu/GPUIndex.h" + +namespace milvus { +namespace knowhere { + +class GPUIVF_NM : public IVF, public GPUIndex { + public: + explicit GPUIVF_NM(const int& device_id) : IVF(), GPUIndex(device_id) { + index_mode_ = IndexMode::MODE_GPU; + } + + explicit GPUIVF_NM(std::shared_ptr index, const int64_t device_id, ResPtr& res) + : IVF(std::move(index)), GPUIndex(device_id, res) { + index_mode_ = IndexMode::MODE_GPU; + } + + void + Train(const DatasetPtr&, const Config&) override; + + void + Add(const DatasetPtr&, const Config&) override; + + void + Load(const BinarySet&) override; + + VecIndexPtr + CopyGpuToCpu(const Config&) override; + + VecIndexPtr + CopyGpuToGpu(const int64_t, const Config&) override; + + protected: + BinarySet + SerializeImpl(const IndexType&) override; + + void + QueryImpl(int64_t, const float*, int64_t, float*, int64_t*, const Config&) override; + + protected: + uint8_t* arranged_data; +}; + +using GPUIVFNMPtr = std::shared_ptr; + +} // namespace knowhere +} // namespace milvus diff --git a/core/src/index/thirdparty/faiss/AutoTune.h b/core/src/index/thirdparty/faiss/AutoTune.h index d7eff14e64..d755844d6d 100644 --- a/core/src/index/thirdparty/faiss/AutoTune.h +++ b/core/src/index/thirdparty/faiss/AutoTune.h @@ -28,7 +28,7 @@ struct AutoTuneCriterion { typedef Index::idx_t idx_t; idx_t nq; ///< nb of queries this criterion is evaluated on idx_t nnn; ///< nb of NNs that the query should request - idx_t gt_nnn; ///< nb of GT NNs required to evaluate crterion + idx_t gt_nnn; ///< nb of GT NNs required to evaluate criterion std::vector gt_D; ///< Ground-truth distances (size nq * gt_nnn) std::vector gt_I; ///< Ground-truth indexes (size nq * gt_nnn) diff --git a/core/src/index/thirdparty/faiss/Index.cpp b/core/src/index/thirdparty/faiss/Index.cpp index d5748f719f..b11cfb2683 100644 --- a/core/src/index/thirdparty/faiss/Index.cpp +++ b/core/src/index/thirdparty/faiss/Index.cpp @@ -52,6 +52,15 @@ void Index::add_with_ids( FAISS_THROW_MSG ("add_with_ids not implemented for this type of index"); } + +void Index::add_without_codes(idx_t n, const float* x) { + FAISS_THROW_MSG ("add_without_codes not implemented for this type of index"); +} + +void Index::add_with_ids_without_codes(idx_t n, const float* x, const idx_t* xids) { + FAISS_THROW_MSG ("add_with_ids_without_codes not implemented for this type of index"); +} + #if 0 void Index::get_vector_by_id (idx_t n, const idx_t *xid, float *x, ConcurrentBitsetPtr bitset) { FAISS_THROW_MSG ("get_vector_by_id not implemented for this type of index"); diff --git a/core/src/index/thirdparty/faiss/Index.h b/core/src/index/thirdparty/faiss/Index.h index 9a0967962e..9e8d22dba4 100644 --- a/core/src/index/thirdparty/faiss/Index.h +++ b/core/src/index/thirdparty/faiss/Index.h @@ -94,6 +94,13 @@ struct Index { */ virtual void add (idx_t n, const float *x) = 0; + /** Same as add, but only add ids, not codes + * + * @param n nb of training vectors + * @param x training vecors, size n * d + */ + virtual void add_without_codes(idx_t n, const float* x); + /** Same as add, but stores xids instead of sequential ids. 
* * The default implementation fails with an assertion, as it is @@ -103,6 +110,12 @@ struct Index { */ virtual void add_with_ids (idx_t n, const float * x, const idx_t *xids); + /** Same as add_with_ids, but only add ids, not codes + * + * @param xids if non-null, ids to store for the vectors (size n) + */ + virtual void add_with_ids_without_codes(idx_t n, const float* x, const idx_t* xids); + /** query n vectors of dimension d to the index. * * return at most k vectors. If there are not enough results for a diff --git a/core/src/index/thirdparty/faiss/IndexIVF.cpp b/core/src/index/thirdparty/faiss/IndexIVF.cpp index fa6f050ca6..6805cf1542 100644 --- a/core/src/index/thirdparty/faiss/IndexIVF.cpp +++ b/core/src/index/thirdparty/faiss/IndexIVF.cpp @@ -196,6 +196,16 @@ void IndexIVF::add (idx_t n, const float * x) add_with_ids (n, x, nullptr); } +void IndexIVF::add_without_codes (idx_t n, const float * x) +{ + add_with_ids_without_codes (n, x, nullptr); +} + +void IndexIVF::add_with_ids_without_codes (idx_t n, const float * x, const idx_t *xids) +{ + // will be overriden + FAISS_THROW_MSG ("add_with_ids_without_codes not implemented for this type of index"); +} void IndexIVF::add_with_ids (idx_t n, const float * x, const idx_t *xids) { @@ -268,6 +278,13 @@ void IndexIVF::to_readonly() { this->replace_invlists(readonly_lists, true); } +void IndexIVF::to_readonly_without_codes() { + if (is_readonly()) return; + auto readonly_lists = this->invlists->to_readonly_without_codes(); + if (!readonly_lists) return; + this->replace_invlists(readonly_lists, true); +} + bool IndexIVF::is_readonly() const { return this->invlists->is_readonly(); } @@ -316,6 +333,26 @@ void IndexIVF::search (idx_t n, const float *x, idx_t k, indexIVF_stats.search_time += getmillisecs() - t0; } +void IndexIVF::search_without_codes (idx_t n, const float *x, + const uint8_t *arranged_codes, std::vector prefix_sum, + idx_t k, float *distances, idx_t *labels, + ConcurrentBitsetPtr bitset) +{ + std::unique_ptr idx(new idx_t[n * nprobe]); + std::unique_ptr coarse_dis(new float[n * nprobe]); + + double t0 = getmillisecs(); + quantizer->search (n, x, nprobe, coarse_dis.get(), idx.get()); + indexIVF_stats.quantization_time += getmillisecs() - t0; + + t0 = getmillisecs(); + invlists->prefetch_lists (idx.get(), n * nprobe); + + search_preassigned_without_codes (n, x, arranged_codes, prefix_sum, k, idx.get(), coarse_dis.get(), + distances, labels, false, nullptr, bitset); + indexIVF_stats.search_time += getmillisecs() - t0; +} + #if 0 void IndexIVF::get_vector_by_id (idx_t n, const idx_t *xid, float *x, ConcurrentBitsetPtr bitset) { make_direct_map(true); @@ -545,7 +582,210 @@ void IndexIVF::search_preassigned (idx_t n, const float *x, idx_t k, } +void IndexIVF::search_preassigned_without_codes (idx_t n, const float *x, + const uint8_t *arranged_codes, + std::vector prefix_sum, idx_t k, + const idx_t *keys, + const float *coarse_dis , + float *distances, idx_t *labels, + bool store_pairs, + const IVFSearchParameters *params, + ConcurrentBitsetPtr bitset) +{ + long nprobe = params ? params->nprobe : this->nprobe; + long max_codes = params ? params->max_codes : this->max_codes; + size_t nlistv = 0, ndis = 0, nheap = 0; + + using HeapForIP = CMin; + using HeapForL2 = CMax; + + bool interrupt = false; + + int pmode = this->parallel_mode & ~PARALLEL_MODE_NO_HEAP_INIT; + bool do_heap_init = !(this->parallel_mode & PARALLEL_MODE_NO_HEAP_INIT); + + // don't start parallel section if single query + bool do_parallel = + pmode == 0 ? 
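/* do_parallel is worth enabling only when there is enough work for the chosen
   parallel_mode: more than one query for mode 0, more than one probe for
   mode 1, and otherwise more than one (query, probe) pair: */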
n > 1 : + pmode == 1 ? nprobe > 1 : + nprobe * n > 1; + +#pragma omp parallel if(do_parallel) reduction(+: nlistv, ndis, nheap) + { + InvertedListScanner *scanner = get_InvertedListScanner(store_pairs); + ScopeDeleter1 del(scanner); + + /***************************************************** + * Depending on parallel_mode, there are two possible ways + * to organize the search. Here we define local functions + * that are in common between the two + ******************************************************/ + + // intialize + reorder a result heap + + auto init_result = [&](float *simi, idx_t *idxi) { + if (!do_heap_init) return; + if (metric_type == METRIC_INNER_PRODUCT) { + heap_heapify (k, simi, idxi); + } else { + heap_heapify (k, simi, idxi); + } + }; + + auto reorder_result = [&] (float *simi, idx_t *idxi) { + if (!do_heap_init) return; + if (metric_type == METRIC_INNER_PRODUCT) { + heap_reorder (k, simi, idxi); + } else { + heap_reorder (k, simi, idxi); + } + }; + + // single list scan using the current scanner (with query + // set porperly) and storing results in simi and idxi + auto scan_one_list = [&] (idx_t key, float coarse_dis_i, const uint8_t *arranged_codes, + float *simi, idx_t *idxi, ConcurrentBitsetPtr bitset) { + + if (key < 0) { + // not enough centroids for multiprobe + return (size_t)0; + } + FAISS_THROW_IF_NOT_FMT (key < (idx_t) nlist, + "Invalid key=%ld nlist=%ld\n", + key, nlist); + + size_t list_size = invlists->list_size(key); + size_t offset = prefix_sum[key]; + + // don't waste time on empty lists + if (list_size == 0) { + return (size_t)0; + } + + scanner->set_list (key, coarse_dis_i); + + nlistv++; + + InvertedLists::ScopedCodes scodes (invlists, key, arranged_codes); + + std::unique_ptr sids; + const Index::idx_t * ids = nullptr; + + if (!store_pairs) { + sids.reset (new InvertedLists::ScopedIds (invlists, key)); + ids = sids->get(); + } + + nheap += scanner->scan_codes (list_size, (const uint8_t *) ((const float *)scodes.get() + d * offset), + ids, simi, idxi, k, bitset); + + return list_size; + }; + + /**************************************************** + * Actual loops, depending on parallel_mode + ****************************************************/ + + if (pmode == 0) { + +#pragma omp for + for (size_t i = 0; i < n; i++) { + + if (interrupt) { + continue; + } + + // loop over queries + scanner->set_query (x + i * d); + float * simi = distances + i * k; + idx_t * idxi = labels + i * k; + + init_result (simi, idxi); + + long nscan = 0; + + // loop over probes + for (size_t ik = 0; ik < nprobe; ik++) { + + nscan += scan_one_list ( + keys [i * nprobe + ik], + coarse_dis[i * nprobe + ik], + arranged_codes, + simi, idxi, bitset + ); + + if (max_codes && nscan >= max_codes) { + break; + } + } + + ndis += nscan; + reorder_result (simi, idxi); + + if (InterruptCallback::is_interrupted ()) { + interrupt = true; + } + + } // parallel for + } else if (pmode == 1) { + std::vector local_idx (k); + std::vector local_dis (k); + + for (size_t i = 0; i < n; i++) { + scanner->set_query (x + i * d); + init_result (local_dis.data(), local_idx.data()); + +#pragma omp for schedule(dynamic) + for (size_t ik = 0; ik < nprobe; ik++) { + ndis += scan_one_list + (keys [i * nprobe + ik], + coarse_dis[i * nprobe + ik], + arranged_codes, + local_dis.data(), local_idx.data(), bitset); + + // can't do the test on max_codes + } + // merge thread-local results + + float * simi = distances + i * k; + idx_t * idxi = labels + i * k; +#pragma omp single + init_result (simi, idxi); + +#pragma 
omp barrier +#pragma omp critical + { + if (metric_type == METRIC_INNER_PRODUCT) { + heap_addn + (k, simi, idxi, + local_dis.data(), local_idx.data(), k); + } else { + heap_addn + (k, simi, idxi, + local_dis.data(), local_idx.data(), k); + } + } +#pragma omp barrier +#pragma omp single + reorder_result (simi, idxi); + } + } else { + FAISS_THROW_FMT ("parallel_mode %d not supported\n", + pmode); + } + } // parallel section + + if (interrupt) { + FAISS_THROW_MSG ("computation interrupted"); + } + + indexIVF_stats.nq += n; + indexIVF_stats.nlist += nlistv; + indexIVF_stats.ndis += ndis; + indexIVF_stats.nheap_updates += nheap; + +} void IndexIVF::range_search (idx_t nx, const float *x, float radius, RangeSearchResult *result, diff --git a/core/src/index/thirdparty/faiss/IndexIVF.h b/core/src/index/thirdparty/faiss/IndexIVF.h index 744f27f333..099067e411 100644 --- a/core/src/index/thirdparty/faiss/IndexIVF.h +++ b/core/src/index/thirdparty/faiss/IndexIVF.h @@ -139,9 +139,15 @@ struct IndexIVF: Index, Level1Quantizer { /// Calls add_with_ids with NULL ids void add(idx_t n, const float* x) override; + /// Calls add_with_ids_without_codes + void add_without_codes(idx_t n, const float* x) override; + /// default implementation that calls encode_vectors void add_with_ids(idx_t n, const float* x, const idx_t* xids) override; + /// Implementation for adding without original vector data + void add_with_ids_without_codes(idx_t n, const float* x, const idx_t* xids) override; + /** Encodes a set of vectors as they would appear in the inverted lists * * @param list_nos inverted list ids as returned by the @@ -187,11 +193,28 @@ struct IndexIVF: Index, Level1Quantizer { ConcurrentBitsetPtr bitset = nullptr ) const; + /** Similar to search_preassigned, but does not store codes **/ + virtual void search_preassigned_without_codes (idx_t n, const float *x, + const uint8_t *arranged_codes, + std::vector prefix_sum, idx_t k, + const idx_t *assign, + const float *centroid_dis, + float *distances, idx_t *labels, + bool store_pairs, + const IVFSearchParameters *params = nullptr, + ConcurrentBitsetPtr bitset = nullptr); + /** assign the vectors, then call search_preassign */ void search (idx_t n, const float *x, idx_t k, float *distances, idx_t *labels, ConcurrentBitsetPtr bitset = nullptr) const override; + /** Similar to search, but does not store codes **/ + void search_without_codes (idx_t n, const float *x, + const uint8_t *arranged_codes, std::vector prefix_sum, + idx_t k, float *distances, idx_t *labels, + ConcurrentBitsetPtr bitset = nullptr); + #if 0 /** get raw vectors by ids */ void get_vector_by_id (idx_t n, const idx_t *xid, float *x, ConcurrentBitsetPtr bitset = nullptr) override; @@ -286,6 +309,7 @@ struct IndexIVF: Index, Level1Quantizer { idx_t a1, idx_t a2) const; virtual void to_readonly(); + virtual void to_readonly_without_codes(); virtual bool is_readonly() const; virtual void backup_quantizer(); diff --git a/core/src/index/thirdparty/faiss/IndexIVFFlat.cpp b/core/src/index/thirdparty/faiss/IndexIVFFlat.cpp index 2846990f9f..147263750f 100644 --- a/core/src/index/thirdparty/faiss/IndexIVFFlat.cpp +++ b/core/src/index/thirdparty/faiss/IndexIVFFlat.cpp @@ -39,6 +39,40 @@ void IndexIVFFlat::add_with_ids (idx_t n, const float * x, const idx_t *xids) add_core (n, x, xids, nullptr); } +// Add ids only, vectors not added to Index. 
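// The coarse quantizer is still run so that every id is appended to the correct
// inverted list and the direct map stays consistent, but no code bytes are
// copied into the lists; the raw vectors remain outside the index.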
+void IndexIVFFlat::add_with_ids_without_codes(idx_t n, const float* x, const idx_t* xids) +{ + FAISS_THROW_IF_NOT (is_trained); + assert (invlists); + direct_map.check_can_add (xids); + const int64_t * idx; + ScopeDeleter del; + + int64_t * idx0 = new int64_t [n]; + del.set (idx0); + quantizer->assign (n, x, idx0); + idx = idx0; + + int64_t n_add = 0; + for (size_t i = 0; i < n; i++) { + idx_t id = xids ? xids[i] : ntotal + i; + idx_t list_no = idx [i]; + size_t offset; + + if (list_no >= 0) { + const float *xi = x + i * d; + offset = invlists->add_entry_without_codes ( + list_no, id); + n_add++; + } else { + offset = 0; + } + direct_map.add_single_id (id, list_no, offset); + } + + ntotal += n; +} + void IndexIVFFlat::add_core (idx_t n, const float * x, const int64_t *xids, const int64_t *precomputed_idx) diff --git a/core/src/index/thirdparty/faiss/IndexIVFFlat.h b/core/src/index/thirdparty/faiss/IndexIVFFlat.h index 3c5777a1c2..74b0b4c0ec 100644 --- a/core/src/index/thirdparty/faiss/IndexIVFFlat.h +++ b/core/src/index/thirdparty/faiss/IndexIVFFlat.h @@ -35,6 +35,9 @@ struct IndexIVFFlat: IndexIVF { /// implemented for all IndexIVF* classes void add_with_ids(idx_t n, const float* x, const idx_t* xids) override; + /// implemented for all IndexIVF* classes + void add_with_ids_without_codes(idx_t n, const float* x, const idx_t* xids) override; + void encode_vectors(idx_t n, const float* x, const idx_t *list_nos, uint8_t * codes, diff --git a/core/src/index/thirdparty/faiss/InvertedLists.cpp b/core/src/index/thirdparty/faiss/InvertedLists.cpp index 59f5d1e7cb..cc79ab47dc 100644 --- a/core/src/index/thirdparty/faiss/InvertedLists.cpp +++ b/core/src/index/thirdparty/faiss/InvertedLists.cpp @@ -108,6 +108,15 @@ size_t InvertedLists::add_entry (size_t list_no, idx_t theid, return add_entries (list_no, 1, &theid, code); } +size_t InvertedLists::add_entry_without_codes (size_t list_no, idx_t theid) +{ + return add_entries_without_codes (list_no, 1, &theid); +} + +size_t InvertedLists::add_entries_without_codes (size_t list_no, size_t n_entry, + const idx_t* ids) +{} + void InvertedLists::update_entry (size_t list_no, size_t offset, idx_t id, const uint8_t *code) { @@ -118,6 +127,10 @@ InvertedLists* InvertedLists::to_readonly() { return nullptr; } +InvertedLists* InvertedLists::to_readonly_without_codes() { + return nullptr; +} + bool InvertedLists::is_readonly() const { return false; } @@ -210,6 +223,18 @@ size_t ArrayInvertedLists::add_entries ( return o; } +size_t ArrayInvertedLists::add_entries_without_codes ( + size_t list_no, size_t n_entry, + const idx_t* ids_in) +{ + if (n_entry == 0) return 0; + assert (list_no < nlist); + size_t o = ids [list_no].size(); + ids [list_no].resize (o + n_entry); + memcpy (&ids[list_no][o], ids_in, sizeof (ids_in[0]) * n_entry); + return o; +} + size_t ArrayInvertedLists::list_size(size_t list_no) const { assert (list_no < nlist); @@ -250,6 +275,11 @@ InvertedLists* ArrayInvertedLists::to_readonly() { return readonly; } +InvertedLists* ArrayInvertedLists::to_readonly_without_codes() { + ReadOnlyArrayInvertedLists* readonly = new ReadOnlyArrayInvertedLists(*this, true); + return readonly; +} + ArrayInvertedLists::~ArrayInvertedLists () {} @@ -325,26 +355,43 @@ ReadOnlyArrayInvertedLists::ReadOnlyArrayInvertedLists(const ArrayInvertedLists& valid = true; } -//ReadOnlyArrayInvertedLists::ReadOnlyArrayInvertedLists(const ReadOnlyArrayInvertedLists &other) -// : InvertedLists (other.nlist, other.code_size) { -// readonly_length = other.readonly_length; -// 
readonly_offset = other.readonly_offset; -// pin_readonly_codes = std::make_shared(*other.pin_readonly_codes); -// pin_readonly_ids = std::make_shared(*other.pin_readonly_ids); -// valid = true; -//} +ReadOnlyArrayInvertedLists::ReadOnlyArrayInvertedLists(const ArrayInvertedLists& other, bool offset_only) + : InvertedLists (other.nlist, other.code_size) { + readonly_length.resize(nlist); + readonly_offset.resize(nlist); + size_t offset = 0; + for (auto i = 0; i < other.ids.size(); i++) { + auto& list_ids = other.ids[i]; + readonly_length[i] = list_ids.size(); + readonly_offset[i] = offset; + offset += list_ids.size(); + } + +#ifdef USE_CPU + for (auto i = 0; i < other.ids.size(); i++) { + auto& list_ids = other.ids[i]; + readonly_ids.insert(readonly_ids.end(), list_ids.begin(), list_ids.end()); + } +#else + size_t ids_size = offset * sizeof(idx_t); + size_t codes_size = offset * (this->code_size) * sizeof(uint8_t); + pin_readonly_codes = std::make_shared(codes_size); + pin_readonly_ids = std::make_shared(ids_size); + + offset = 0; + for (auto i = 0; i < other.ids.size(); i++) { + auto& list_ids = other.ids[i]; + + uint8_t* ids_ptr = (uint8_t*)(pin_readonly_ids->data) + offset * sizeof(idx_t); + memcpy(ids_ptr, list_ids.data(), list_ids.size() * sizeof(idx_t)); + + offset += list_ids.size(); + } +#endif + + valid = true; +} -//ReadOnlyArrayInvertedLists::ReadOnlyArrayInvertedLists(ReadOnlyArrayInvertedLists &&other) -// : InvertedLists (other.nlist, other.code_size) { -// readonly_length = std::move(other.readonly_length); -// readonly_offset = std::move(other.readonly_offset); -// pin_readonly_codes = other.pin_readonly_codes; -// pin_readonly_ids = other.pin_readonly_ids; -// -// other.pin_readonly_codes = nullptr; -// other.pin_readonly_ids = nullptr; -// valid = true; -//} ReadOnlyArrayInvertedLists::~ReadOnlyArrayInvertedLists() { } @@ -361,6 +408,13 @@ size_t ReadOnlyArrayInvertedLists::add_entries ( FAISS_THROW_MSG ("not implemented"); } +size_t ReadOnlyArrayInvertedLists::add_entries_without_codes ( + size_t , size_t , + const idx_t*) +{ + FAISS_THROW_MSG ("not implemented"); +} + void ReadOnlyArrayInvertedLists::update_entries (size_t, size_t , size_t , const idx_t *, const uint8_t *) { @@ -440,6 +494,13 @@ size_t ReadOnlyInvertedLists::add_entries ( FAISS_THROW_MSG ("not implemented"); } +size_t ReadOnlyInvertedLists::add_entries_without_codes ( + size_t , size_t , + const idx_t*) +{ + FAISS_THROW_MSG ("not implemented"); +} + void ReadOnlyInvertedLists::update_entries (size_t, size_t , size_t , const idx_t *, const uint8_t *) { diff --git a/core/src/index/thirdparty/faiss/InvertedLists.h b/core/src/index/thirdparty/faiss/InvertedLists.h index ec77d2cb18..c57b7b6961 100644 --- a/core/src/index/thirdparty/faiss/InvertedLists.h +++ b/core/src/index/thirdparty/faiss/InvertedLists.h @@ -111,6 +111,12 @@ struct InvertedLists { size_t list_no, size_t n_entry, const idx_t* ids, const uint8_t *code) = 0; + /// add one entry to an inverted list without codes + virtual size_t add_entry_without_codes (size_t list_no, idx_t theid); + + virtual size_t add_entries_without_codes ( size_t list_no, size_t n_entry, + const idx_t* ids); + virtual void update_entry (size_t list_no, size_t offset, idx_t id, const uint8_t *code); @@ -123,6 +129,8 @@ struct InvertedLists { virtual InvertedLists* to_readonly(); + virtual InvertedLists* to_readonly_without_codes(); + virtual bool is_readonly() const; /// move all entries from oivf (empty on output) @@ -197,6 +205,11 @@ struct InvertedLists { list_no 
(list_no) {} + // For codes outside + ScopedCodes (const InvertedLists *il, size_t list_no, const uint8_t *original_codes): + il (il), codes (original_codes), list_no (list_no) + {} + const uint8_t *get() {return codes; } ~ScopedCodes () { @@ -223,6 +236,10 @@ struct ArrayInvertedLists: InvertedLists { size_t list_no, size_t n_entry, const idx_t* ids, const uint8_t *code) override; + size_t add_entries_without_codes ( + size_t list_no, size_t n_entry, + const idx_t* ids) override; + void update_entries (size_t list_no, size_t offset, size_t n_entry, const idx_t *ids, const uint8_t *code) override; @@ -230,6 +247,8 @@ struct ArrayInvertedLists: InvertedLists { InvertedLists* to_readonly() override; + InvertedLists* to_readonly_without_codes() override; + virtual ~ArrayInvertedLists (); }; @@ -248,6 +267,7 @@ struct ReadOnlyArrayInvertedLists: InvertedLists { ReadOnlyArrayInvertedLists(size_t nlist, size_t code_size, const std::vector& list_length); explicit ReadOnlyArrayInvertedLists(const ArrayInvertedLists& other); + explicit ReadOnlyArrayInvertedLists(const ArrayInvertedLists& other, bool offset); // Use default copy construct, just copy pointer, DON'T COPY pin_readonly_codes AND pin_readonly_ids // explicit ReadOnlyArrayInvertedLists(const ReadOnlyArrayInvertedLists &); @@ -266,6 +286,10 @@ struct ReadOnlyArrayInvertedLists: InvertedLists { size_t list_no, size_t n_entry, const idx_t* ids, const uint8_t *code) override; + size_t add_entries_without_codes ( + size_t list_no, size_t n_entry, + const idx_t* ids) override; + void update_entries (size_t list_no, size_t offset, size_t n_entry, const idx_t *ids, const uint8_t *code) override; @@ -292,6 +316,10 @@ struct ReadOnlyInvertedLists: InvertedLists { size_t list_no, size_t n_entry, const idx_t* ids, const uint8_t *code) override; + size_t add_entries_without_codes ( + size_t list_no, size_t n_entry, + const idx_t* ids) override; + void update_entries (size_t list_no, size_t offset, size_t n_entry, const idx_t *ids, const uint8_t *code) override; diff --git a/core/src/index/thirdparty/faiss/gpu/GpuCloner.cpp b/core/src/index/thirdparty/faiss/gpu/GpuCloner.cpp index 192c02db42..57e716e6a1 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuCloner.cpp +++ b/core/src/index/thirdparty/faiss/gpu/GpuCloner.cpp @@ -108,12 +108,28 @@ Index *ToCPUCloner::clone_Index(const Index *index) } } +Index *ToCPUCloner::clone_Index_Without_Codes(const Index *index) +{ + if(auto ifl = dynamic_cast(index)) { + IndexIVFFlat *res = new IndexIVFFlat(); + ifl->copyToWithoutCodes(res); + return res; + } else { + return Cloner::clone_Index(index); + } +} + faiss::Index * index_gpu_to_cpu(const faiss::Index *gpu_index) { ToCPUCloner cl; return cl.clone_Index(gpu_index); } +faiss::Index * index_gpu_to_cpu_without_codes(const faiss::Index *gpu_index) +{ + ToCPUCloner cl; + return cl.clone_Index_Without_Codes(gpu_index); +} @@ -256,6 +272,38 @@ Index *ToGpuCloner::clone_Index(const Index *index) return res; } else { return Cloner::clone_Index(index); + + } + +} + + +Index *ToGpuCloner::clone_Index_Without_Codes(const Index *index, const uint8_t *arranged_data) +{ + auto ivf_sqh = dynamic_cast(index); + if(ivf_sqh) { + // should not happen + } else if(auto ifl = dynamic_cast(index)) { + GpuIndexIVFFlatConfig config; + config.device = device; + config.indicesOptions = indicesOptions; + config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; + config.flatConfig.storeTransposed = storeTransposed; + + GpuIndexIVFFlat *res = + new GpuIndexIVFFlat(resources, + ifl->d, + 
ifl->nlist, + ifl->metric_type, + config); + if(reserveVecs > 0 && ifl->ntotal == 0) { + res->reserveMemory(reserveVecs); + } + + res->copyFromWithoutCodes(ifl, arranged_data); + return res; + } else { + return Cloner::clone_Index(index); } } @@ -270,6 +318,17 @@ faiss::Index * index_cpu_to_gpu( return cl.clone_Index(index); } +faiss::Index * index_cpu_to_gpu_without_codes( + GpuResources* resources, int device, + const faiss::Index *index, + const uint8_t *arranged_data, + const GpuClonerOptions *options) +{ + GpuClonerOptions defaults; + ToGpuCloner cl(resources, device, options ? *options : defaults); + return cl.clone_Index_Without_Codes(index, arranged_data); +} + faiss::Index * index_cpu_to_gpu( GpuResources* resources, int device, IndexComposition* index_composition, diff --git a/core/src/index/thirdparty/faiss/gpu/GpuCloner.h b/core/src/index/thirdparty/faiss/gpu/GpuCloner.h index f2c5388d93..c01029279e 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuCloner.h +++ b/core/src/index/thirdparty/faiss/gpu/GpuCloner.h @@ -23,7 +23,10 @@ class GpuResources; /// Cloner specialized for GPU -> CPU struct ToCPUCloner: faiss::Cloner { void merge_index(Index *dst, Index *src, bool successive_ids); + Index *clone_Index(const Index *index) override; + + Index *clone_Index_Without_Codes(const Index *index); }; @@ -38,6 +41,8 @@ struct ToGpuCloner: faiss::Cloner, GpuClonerOptions { Index *clone_Index(const Index *index) override; Index *clone_Index (IndexComposition* index_composition) override; + + Index *clone_Index_Without_Codes(const Index *index, const uint8_t *arranged_data); }; /// Cloner specialized for CPU -> multiple GPUs @@ -66,12 +71,20 @@ struct ToGpuClonerMultiple: faiss::Cloner, GpuMultipleClonerOptions { /// converts any GPU index inside gpu_index to a CPU index faiss::Index * index_gpu_to_cpu(const faiss::Index *gpu_index); +faiss::Index * index_gpu_to_cpu_without_codes(const faiss::Index *gpu_index); + /// converts any CPU index that can be converted to GPU faiss::Index * index_cpu_to_gpu( GpuResources* resources, int device, const faiss::Index *index, const GpuClonerOptions *options = nullptr); +faiss::Index * index_cpu_to_gpu_without_codes( + GpuResources* resources, int device, + const faiss::Index *index, + const uint8_t *arranged_data, + const GpuClonerOptions *options = nullptr); + faiss::Index * index_cpu_to_gpu( GpuResources* resources, int device, IndexComposition* index_composition, diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.cu index 6ca7c70ffb..20f019f0b4 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.cu @@ -6,258 +6,324 @@ */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace faiss { namespace gpu { - -GpuIndexIVFFlat::GpuIndexIVFFlat(GpuResources* resources, - const faiss::IndexIVFFlat* index, - GpuIndexIVFFlatConfig config) : - GpuIndexIVF(resources, - index->d, - index->metric_type, - index->metric_arg, - index->nlist, - config), - ivfFlatConfig_(config), - reserveMemoryVecs_(0), - index_(nullptr) { - copyFrom(index); -} - -GpuIndexIVFFlat::GpuIndexIVFFlat(GpuResources* resources, - int dims, - int nlist, - faiss::MetricType metric, - GpuIndexIVFFlatConfig config) : - GpuIndexIVF(resources, dims, metric, 0, nlist, config), - ivfFlatConfig_(config), - reserveMemoryVecs_(0), - index_(nullptr) { - - // faiss::Index params - this->is_trained = 
false; - - // We haven't trained ourselves, so don't construct the IVFFlat - // index yet -} - -GpuIndexIVFFlat::~GpuIndexIVFFlat() { - delete index_; -} - -void -GpuIndexIVFFlat::reserveMemory(size_t numVecs) { - reserveMemoryVecs_ = numVecs; - if (index_) { - DeviceScope scope(device_); - index_->reserveMemory(numVecs); - } -} - -void -GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { - DeviceScope scope(device_); - - GpuIndexIVF::copyFrom(index); - - // Clear out our old data - delete index_; - index_ = nullptr; - - // The other index might not be trained - if (!index->is_trained) { - FAISS_ASSERT(!is_trained); - return; - } - - // Otherwise, we can populate ourselves from the other index - FAISS_ASSERT(is_trained); - - // Copy our lists as well - index_ = new IVFFlat(resources_, - quantizer->getGpuData(), - index->metric_type, - index->metric_arg, - false, // no residual - nullptr, // no scalar quantizer - ivfFlatConfig_.indicesOptions, - memorySpace_); - InvertedLists *ivf = index->invlists; - - if (ReadOnlyArrayInvertedLists* rol = dynamic_cast(ivf)) { - index_->copyCodeVectorsFromCpu((const float* )(rol->pin_readonly_codes->data), - (const long *)(rol->pin_readonly_ids->data), rol->readonly_length); - /* double t0 = getmillisecs(); */ - /* std::cout << "Readonly Takes " << getmillisecs() - t0 << " ms" << std::endl; */ - } else { - for (size_t i = 0; i < ivf->nlist; ++i) { - auto numVecs = ivf->list_size(i); - - // GPU index can only support max int entries per list - FAISS_THROW_IF_NOT_FMT(numVecs <= - (size_t) std::numeric_limits::max(), - "GPU inverted list can only support " - "%zu entries; %zu found", - (size_t) std::numeric_limits::max(), - numVecs); - - index_->addCodeVectorsFromCpu(i, - (const unsigned char*)(ivf->get_codes(i)), - ivf->get_ids(i), - numVecs); - } - } -} - -void -GpuIndexIVFFlat::copyTo(faiss::IndexIVFFlat* index) const { - DeviceScope scope(device_); - - // We must have the indices in order to copy to ourselves - FAISS_THROW_IF_NOT_MSG(ivfFlatConfig_.indicesOptions != INDICES_IVF, - "Cannot copy to CPU as GPU index doesn't retain " - "indices (INDICES_IVF)"); - - GpuIndexIVF::copyTo(index); - index->code_size = this->d * sizeof(float); - - InvertedLists *ivf = new ArrayInvertedLists(nlist, index->code_size); - index->replace_invlists(ivf, true); - - // Copy the inverted lists - if (index_) { - for (int i = 0; i < nlist; ++i) { - auto listIndices = index_->getListIndices(i); - auto listData = index_->getListVectors(i); - - ivf->add_entries(i, - listIndices.size(), - listIndices.data(), - (const uint8_t*) listData.data()); - } - } -} - -size_t -GpuIndexIVFFlat::reclaimMemory() { - if (index_) { - DeviceScope scope(device_); - - return index_->reclaimMemory(); - } - - return 0; -} - -void -GpuIndexIVFFlat::reset() { - if (index_) { - DeviceScope scope(device_); - - index_->reset(); - this->ntotal = 0; - } else { - FAISS_ASSERT(this->ntotal == 0); - } -} - -void -GpuIndexIVFFlat::train(Index::idx_t n, const float* x) { - DeviceScope scope(device_); - - if (this->is_trained) { - FAISS_ASSERT(quantizer->is_trained); - FAISS_ASSERT(quantizer->ntotal == nlist); - FAISS_ASSERT(index_); - return; - } - - FAISS_ASSERT(!index_); - - trainQuantizer_(n, x); - - // The quantizer is now trained; construct the IVF index - index_ = new IVFFlat(resources_, - quantizer->getGpuData(), - this->metric_type, - this->metric_arg, - false, // no residual - nullptr, // no scalar quantizer - ivfFlatConfig_.indicesOptions, - memorySpace_); - - if (reserveMemoryVecs_) { - 
index_->reserveMemory(reserveMemoryVecs_); - } - - this->is_trained = true; -} - -void -GpuIndexIVFFlat::addImpl_(int n, - const float* x, - const Index::idx_t* xids) { - // Device is already set in GpuIndex::add - FAISS_ASSERT(index_); - FAISS_ASSERT(n > 0); - - auto stream = resources_->getDefaultStream(device_); - - // Data is already resident on the GPU - Tensor data(const_cast(x), {n, (int) this->d}); - - static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); - Tensor labels(const_cast(xids), {n}); - - // Not all vectors may be able to be added (some may contain NaNs etc) - index_->classifyAndAddVectors(data, labels); - - // but keep the ntotal based on the total number of vectors that we attempted - // to add - ntotal += n; -} - -void -GpuIndexIVFFlat::searchImpl_(int n, - const float* x, - int k, - float* distances, - Index::idx_t* labels, - ConcurrentBitsetPtr bitset) const { - // Device is already set in GpuIndex::search - FAISS_ASSERT(index_); - FAISS_ASSERT(n > 0); - - auto stream = resources_->getDefaultStream(device_); - - // Data is already resident on the GPU - Tensor queries(const_cast(x), {n, (int) this->d}); - Tensor outDistances(distances, {n, k}); - - static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); - Tensor outLabels(const_cast(labels), {n, k}); - - if (!bitset) { - auto bitsetDevice = toDevice(resources_, device_, nullptr, stream, {0}); - index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels); - } else { - auto bitsetDevice = toDevice(resources_, device_, - const_cast(bitset->data()), stream, - {(int) bitset->size()}); - index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels); - } -} - - -} } // namespace + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #include + + namespace faiss { namespace gpu { + + GpuIndexIVFFlat::GpuIndexIVFFlat(GpuResources* resources, + const faiss::IndexIVFFlat* index, + GpuIndexIVFFlatConfig config) : + GpuIndexIVF(resources, + index->d, + index->metric_type, + index->metric_arg, + index->nlist, + config), + ivfFlatConfig_(config), + reserveMemoryVecs_(0), + index_(nullptr) { + copyFrom(index); + } + + GpuIndexIVFFlat::GpuIndexIVFFlat(GpuResources* resources, + int dims, + int nlist, + faiss::MetricType metric, + GpuIndexIVFFlatConfig config) : + GpuIndexIVF(resources, dims, metric, 0, nlist, config), + ivfFlatConfig_(config), + reserveMemoryVecs_(0), + index_(nullptr) { + + // faiss::Index params + this->is_trained = false; + + // We haven't trained ourselves, so don't construct the IVFFlat + // index yet + } + + GpuIndexIVFFlat::~GpuIndexIVFFlat() { + delete index_; + } + + void + GpuIndexIVFFlat::reserveMemory(size_t numVecs) { + reserveMemoryVecs_ = numVecs; + if (index_) { + DeviceScope scope(device_); + index_->reserveMemory(numVecs); + } + } + + void + GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { + DeviceScope scope(device_); + + GpuIndexIVF::copyFrom(index); + + // Clear out our old data + delete index_; + index_ = nullptr; + + // The other index might not be trained + if (!index->is_trained) { + FAISS_ASSERT(!is_trained); + return; + } + + // Otherwise, we can populate ourselves from the other index + FAISS_ASSERT(is_trained); + + // Copy our lists as well + index_ = new IVFFlat(resources_, + quantizer->getGpuData(), + index->metric_type, + index->metric_arg, + false, // no residual + nullptr, // no scalar quantizer + ivfFlatConfig_.indicesOptions, + memorySpace_); + InvertedLists *ivf = 
index->invlists; + + if (ReadOnlyArrayInvertedLists* rol = dynamic_cast(ivf)) { + index_->copyCodeVectorsFromCpu((const float* )(rol->pin_readonly_codes->data), + (const long *)(rol->pin_readonly_ids->data), rol->readonly_length); + /* double t0 = getmillisecs(); */ + /* std::cout << "Readonly Takes " << getmillisecs() - t0 << " ms" << std::endl; */ + } else { + for (size_t i = 0; i < ivf->nlist; ++i) { + auto numVecs = ivf->list_size(i); + + // GPU index can only support max int entries per list + FAISS_THROW_IF_NOT_FMT(numVecs <= + (size_t) std::numeric_limits::max(), + "GPU inverted list can only support " + "%zu entries; %zu found", + (size_t) std::numeric_limits::max(), + numVecs); + + index_->addCodeVectorsFromCpu(i, + (const unsigned char*)(ivf->get_codes(i)), + ivf->get_ids(i), + numVecs); + } + } + } + + void + GpuIndexIVFFlat::copyFromWithoutCodes(const faiss::IndexIVFFlat* index, const uint8_t* arranged_data) { + DeviceScope scope(device_); + + GpuIndexIVF::copyFrom(index); + + // Clear out our old data + delete index_; + index_ = nullptr; + + // The other index might not be trained + if (!index->is_trained) { + FAISS_ASSERT(!is_trained); + return; + } + + // Otherwise, we can populate ourselves from the other index + FAISS_ASSERT(is_trained); + + // Copy our lists as well + index_ = new IVFFlat(resources_, + quantizer->getGpuData(), + index->metric_type, + index->metric_arg, + false, // no residual + nullptr, // no scalar quantizer + ivfFlatConfig_.indicesOptions, + memorySpace_); + InvertedLists *ivf = index->invlists; + + if (ReadOnlyArrayInvertedLists* rol = dynamic_cast(ivf)) { + index_->copyCodeVectorsFromCpu((const float *) arranged_data, + (const long *)(rol->pin_readonly_ids->data), rol->readonly_length); + } else { + // should not happen + } + } + + void + GpuIndexIVFFlat::copyTo(faiss::IndexIVFFlat* index) const { + DeviceScope scope(device_); + + // We must have the indices in order to copy to ourselves + FAISS_THROW_IF_NOT_MSG(ivfFlatConfig_.indicesOptions != INDICES_IVF, + "Cannot copy to CPU as GPU index doesn't retain " + "indices (INDICES_IVF)"); + + GpuIndexIVF::copyTo(index); + index->code_size = this->d * sizeof(float); + + InvertedLists *ivf = new ArrayInvertedLists(nlist, index->code_size); + index->replace_invlists(ivf, true); + + // Copy the inverted lists + if (index_) { + for (int i = 0; i < nlist; ++i) { + auto listIndices = index_->getListIndices(i); + auto listData = index_->getListVectors(i); + + ivf->add_entries(i, + listIndices.size(), + listIndices.data(), + (const uint8_t*) listData.data()); + } + } + } + + void + GpuIndexIVFFlat::copyToWithoutCodes(faiss::IndexIVFFlat* index) const { + DeviceScope scope(device_); + + // We must have the indices in order to copy to ourselves + FAISS_THROW_IF_NOT_MSG(ivfFlatConfig_.indicesOptions != INDICES_IVF, + "Cannot copy to CPU as GPU index doesn't retain " + "indices (INDICES_IVF)"); + + GpuIndexIVF::copyTo(index); + index->code_size = this->d * sizeof(float); + + InvertedLists *ivf = new ArrayInvertedLists(nlist, index->code_size); + index->replace_invlists(ivf, true); + + // Copy the inverted lists + if (index_) { + for (int i = 0; i < nlist; ++i) { + auto listIndices = index_->getListIndices(i); + + ivf->add_entries_without_codes(i, + listIndices.size(), + listIndices.data()); + } + } + } + + size_t + GpuIndexIVFFlat::reclaimMemory() { + if (index_) { + DeviceScope scope(device_); + + return index_->reclaimMemory(); + } + + return 0; + } + + void + GpuIndexIVFFlat::reset() { + if (index_) { + 
DeviceScope scope(device_); + + index_->reset(); + this->ntotal = 0; + } else { + FAISS_ASSERT(this->ntotal == 0); + } + } + + void + GpuIndexIVFFlat::train(Index::idx_t n, const float* x) { + DeviceScope scope(device_); + + if (this->is_trained) { + FAISS_ASSERT(quantizer->is_trained); + FAISS_ASSERT(quantizer->ntotal == nlist); + FAISS_ASSERT(index_); + return; + } + + FAISS_ASSERT(!index_); + + trainQuantizer_(n, x); + + // The quantizer is now trained; construct the IVF index + index_ = new IVFFlat(resources_, + quantizer->getGpuData(), + this->metric_type, + this->metric_arg, + false, // no residual + nullptr, // no scalar quantizer + ivfFlatConfig_.indicesOptions, + memorySpace_); + + if (reserveMemoryVecs_) { + index_->reserveMemory(reserveMemoryVecs_); + } + + this->is_trained = true; + } + + void + GpuIndexIVFFlat::addImpl_(int n, + const float* x, + const Index::idx_t* xids) { + // Device is already set in GpuIndex::add + FAISS_ASSERT(index_); + FAISS_ASSERT(n > 0); + + auto stream = resources_->getDefaultStream(device_); + + // Data is already resident on the GPU + Tensor data(const_cast(x), {n, (int) this->d}); + + static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); + Tensor labels(const_cast(xids), {n}); + + // Not all vectors may be able to be added (some may contain NaNs etc) + index_->classifyAndAddVectors(data, labels); + + // but keep the ntotal based on the total number of vectors that we attempted + // to add + ntotal += n; + } + + void + GpuIndexIVFFlat::searchImpl_(int n, + const float* x, + int k, + float* distances, + Index::idx_t* labels, + ConcurrentBitsetPtr bitset) const { + // Device is already set in GpuIndex::search + FAISS_ASSERT(index_); + FAISS_ASSERT(n > 0); + + auto stream = resources_->getDefaultStream(device_); + + // Data is already resident on the GPU + Tensor queries(const_cast(x), {n, (int) this->d}); + Tensor outDistances(distances, {n, k}); + + static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); + Tensor outLabels(const_cast(labels), {n, k}); + + if (!bitset) { + auto bitsetDevice = toDevice(resources_, device_, nullptr, stream, {0}); + index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels); + } else { + auto bitsetDevice = toDevice(resources_, device_, + const_cast(bitset->data()), stream, + {(int) bitset->size()}); + index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels); + } + } + + + } } // namespace + \ No newline at end of file diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.h b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.h index a7328c31e3..e0b79aaee1 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.h +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.h @@ -48,10 +48,14 @@ class GpuIndexIVFFlat : public GpuIndexIVF { /// all data in ourselves void copyFrom(const faiss::IndexIVFFlat* index); + void copyFromWithoutCodes(const faiss::IndexIVFFlat* index, const uint8_t* arranged_data); + /// Copy ourselves to the given CPU index; will overwrite all data /// in the index instance void copyTo(faiss::IndexIVFFlat* index) const; + void copyToWithoutCodes(faiss::IndexIVFFlat* index) const; + /// After adding vectors, one can call this to reclaim device memory /// to exactly the amount needed. 
Returns space reclaimed in bytes size_t reclaimMemory(); diff --git a/core/src/index/thirdparty/faiss/impl/index_read.cpp b/core/src/index/thirdparty/faiss/impl/index_read.cpp index 85cec7d39f..24556606ee 100644 --- a/core/src/index/thirdparty/faiss/impl/index_read.cpp +++ b/core/src/index/thirdparty/faiss/impl/index_read.cpp @@ -343,6 +343,89 @@ static void read_InvertedLists ( ivf->own_invlists = true; } +InvertedLists *read_InvertedLists_nm (IOReader *f, int io_flags) { + uint32_t h; + READ1 (h); + if (h == fourcc ("il00")) { + fprintf(stderr, "read_InvertedLists:" + " WARN! inverted lists not stored with IVF object\n"); + return nullptr; + } else if (h == fourcc ("iloa") && !(io_flags & IO_FLAG_MMAP)) { + // not going to happen + return nullptr; + } else if (h == fourcc ("ilar") && !(io_flags & IO_FLAG_MMAP)) { + auto ails = new ArrayInvertedLists (0, 0); + READ1 (ails->nlist); + READ1 (ails->code_size); + ails->ids.resize (ails->nlist); + std::vector sizes (ails->nlist); + read_ArrayInvertedLists_sizes (f, sizes); + for (size_t i = 0; i < ails->nlist; i++) { + ails->ids[i].resize (sizes[i]); + } + for (size_t i = 0; i < ails->nlist; i++) { + size_t n = ails->ids[i].size(); + if (n > 0) { + READANDCHECK (ails->ids[i].data(), n); + } + } + return ails; + } else if (h == fourcc ("ilar") && (io_flags & IO_FLAG_MMAP)) { + // then we load it as an OnDiskInvertedLists + FileIOReader *reader = dynamic_cast(f); + FAISS_THROW_IF_NOT_MSG(reader, "mmap only supported for File objects"); + FILE *fdesc = reader->f; + + auto ails = new OnDiskInvertedLists (); + READ1 (ails->nlist); + READ1 (ails->code_size); + ails->read_only = true; + ails->lists.resize (ails->nlist); + std::vector sizes (ails->nlist); + read_ArrayInvertedLists_sizes (f, sizes); + size_t o0 = ftell(fdesc), o = o0; + { // do the mmap + struct stat buf; + int ret = fstat (fileno(fdesc), &buf); + FAISS_THROW_IF_NOT_FMT (ret == 0, + "fstat failed: %s", strerror(errno)); + ails->totsize = buf.st_size; + ails->ptr = (uint8_t*)mmap (nullptr, ails->totsize, + PROT_READ, MAP_SHARED, + fileno(fdesc), 0); + FAISS_THROW_IF_NOT_FMT (ails->ptr != MAP_FAILED, + "could not mmap: %s", + strerror(errno)); + } + + for (size_t i = 0; i < ails->nlist; i++) { + OnDiskInvertedLists::List & l = ails->lists[i]; + l.size = l.capacity = sizes[i]; + l.offset = o; + o += l.size * (sizeof(OnDiskInvertedLists::idx_t) + + ails->code_size); + } + FAISS_THROW_IF_NOT(o <= ails->totsize); + // resume normal reading of file + fseek (fdesc, o, SEEK_SET); + return ails; + } else if (h == fourcc ("ilod")) { + // not going to happen + return nullptr; + } else { + FAISS_THROW_MSG ("read_InvertedLists: unsupported invlist type"); + } +} + +static void read_InvertedLists_nm ( + IndexIVF *ivf, IOReader *f, int io_flags) { + InvertedLists *ils = read_InvertedLists_nm (f, io_flags); + FAISS_THROW_IF_NOT (!ils || (ils->nlist == ivf->nlist && + ils->code_size == ivf->code_size)); + ivf->invlists = ils; + ivf->own_invlists = true; +} + static void read_ProductQuantizer (ProductQuantizer *pq, IOReader *f) { READ1 (pq->d); READ1 (pq->M); @@ -736,6 +819,52 @@ Index *read_index (const char *fname, int io_flags) { return idx; } +// read offset-only index +Index *read_index_nm (IOReader *f, int io_flags) { + Index * idx = nullptr; + uint32_t h; + READ1 (h); + if (h == fourcc ("IwFl")) { + IndexIVFFlat * ivfl = new IndexIVFFlat (); + read_ivf_header (ivfl, f); + ivfl->code_size = ivfl->d * sizeof(float); + read_InvertedLists_nm (ivfl, f, io_flags); + idx = ivfl; + } else if(h == fourcc 
("IwSq")) { + IndexIVFScalarQuantizer * ivsc = new IndexIVFScalarQuantizer(); + read_ivf_header (ivsc, f); + read_ScalarQuantizer (&ivsc->sq, f); + READ1 (ivsc->code_size); + READ1 (ivsc->by_residual); + read_InvertedLists_nm (ivsc, f, io_flags); + idx = ivsc; + } else if (h == fourcc("ISqH")) { + IndexIVFSQHybrid *ivfsqhbyrid = new IndexIVFSQHybrid(); + read_ivf_header(ivfsqhbyrid, f); + read_ScalarQuantizer(&ivfsqhbyrid->sq, f); + READ1 (ivfsqhbyrid->code_size); + READ1 (ivfsqhbyrid->by_residual); + read_InvertedLists_nm(ivfsqhbyrid, f, io_flags); + idx = ivfsqhbyrid; + } else { + FAISS_THROW_FMT("Index type 0x%08x not supported\n", h); + idx = nullptr; + } + return idx; +} + + +Index *read_index_nm (FILE * f, int io_flags) { + FileIOReader reader(f); + return read_index_nm(&reader, io_flags); +} + +Index *read_index_nm (const char *fname, int io_flags) { + FileIOReader reader(fname); + Index *idx = read_index_nm (&reader, io_flags); + return idx; +} + VectorTransform *read_VectorTransform (const char *fname) { FileIOReader reader(fname); VectorTransform *vt = read_VectorTransform (&reader); @@ -917,4 +1046,4 @@ IndexBinary *read_index_binary (const char *fname, int io_flags) { } -} // namespace faiss +} // namespace faiss \ No newline at end of file diff --git a/core/src/index/thirdparty/faiss/impl/index_write.cpp b/core/src/index/thirdparty/faiss/impl/index_write.cpp index 54fce2fc46..ef7720a273 100644 --- a/core/src/index/thirdparty/faiss/impl/index_write.cpp +++ b/core/src/index/thirdparty/faiss/impl/index_write.cpp @@ -286,6 +286,63 @@ void write_InvertedLists (const InvertedLists *ils, IOWriter *f) { } } +// write inverted lists for offset-only index +void write_InvertedLists_nm (const InvertedLists *ils, IOWriter *f) { + if (ils == nullptr) { + uint32_t h = fourcc ("il00"); + WRITE1 (h); + } else if (const auto & ails = + dynamic_cast(ils)) { + uint32_t h = fourcc ("ilar"); + WRITE1 (h); + WRITE1 (ails->nlist); + WRITE1 (ails->code_size); + // here we store either as a full or a sparse data buffer + size_t n_non0 = 0; + for (size_t i = 0; i < ails->nlist; i++) { + if (ails->ids[i].size() > 0) + n_non0++; + } + if (n_non0 > ails->nlist / 2) { + uint32_t list_type = fourcc("full"); + WRITE1 (list_type); + std::vector sizes; + for (size_t i = 0; i < ails->nlist; i++) { + sizes.push_back (ails->ids[i].size()); + } + WRITEVECTOR (sizes); + } else { + int list_type = fourcc("sprs"); // sparse + WRITE1 (list_type); + std::vector sizes; + for (size_t i = 0; i < ails->nlist; i++) { + size_t n = ails->ids[i].size(); + if (n > 0) { + sizes.push_back (i); + sizes.push_back (n); + } + } + WRITEVECTOR (sizes); + } + // make a single contiguous data buffer (useful for mmapping) + for (size_t i = 0; i < ails->nlist; i++) { + size_t n = ails->ids[i].size(); + if (n > 0) { + // WRITEANDCHECK (ails->codes[i].data(), n * ails->code_size); + WRITEANDCHECK (ails->ids[i].data(), n); + } + } + } else if (const auto & oa = + dynamic_cast(ils)) { + // not going to happen + } else { + fprintf(stderr, "WARN! 
write_InvertedLists: unsupported invlist type, " + "saving null invlist\n"); + uint32_t h = fourcc ("il00"); + WRITE1 (h); + } +} + void write_ProductQuantizer (const ProductQuantizer*pq, const char *fname) { FileIOWriter writer(fname); @@ -518,6 +575,47 @@ void write_index (const Index *idx, const char *fname) { write_index (idx, &writer); } +// write index for offset-only index +void write_index_nm (const Index *idx, IOWriter *f) { + if(const IndexIVFFlat * ivfl = + dynamic_cast (idx)) { + uint32_t h = fourcc ("IwFl"); + WRITE1 (h); + write_ivf_header (ivfl, f); + write_InvertedLists_nm (ivfl->invlists, f); + } else if(const IndexIVFScalarQuantizer * ivsc = + dynamic_cast (idx)) { + uint32_t h = fourcc ("IwSq"); + WRITE1 (h); + write_ivf_header (ivsc, f); + write_ScalarQuantizer (&ivsc->sq, f); + WRITE1 (ivsc->code_size); + WRITE1 (ivsc->by_residual); + write_InvertedLists_nm (ivsc->invlists, f); + } else if(const IndexIVFSQHybrid *ivfsqhbyrid = + dynamic_cast(idx)) { + uint32_t h = fourcc ("ISqH"); + WRITE1 (h); + write_ivf_header (ivfsqhbyrid, f); + write_ScalarQuantizer (&ivfsqhbyrid->sq, f); + WRITE1 (ivfsqhbyrid->code_size); + WRITE1 (ivfsqhbyrid->by_residual); + write_InvertedLists_nm (ivfsqhbyrid->invlists, f); + } else { + FAISS_THROW_MSG ("don't know how to serialize this type of index"); + } +} + +void write_index_nm (const Index *idx, FILE *f) { + FileIOWriter writer(f); + write_index_nm (idx, &writer); +} + +void write_index_nm (const Index *idx, const char *fname) { + FileIOWriter writer(fname); + write_index_nm (idx, &writer); +} + void write_VectorTransform (const VectorTransform *vt, const char *fname) { FileIOWriter writer(fname); write_VectorTransform (vt, &writer); diff --git a/core/src/index/thirdparty/faiss/index_io.h b/core/src/index/thirdparty/faiss/index_io.h index 5aef62c87b..ac686da71c 100644 --- a/core/src/index/thirdparty/faiss/index_io.h +++ b/core/src/index/thirdparty/faiss/index_io.h @@ -37,6 +37,10 @@ void write_index (const Index *idx, const char *fname); void write_index (const Index *idx, FILE *f); void write_index (const Index *idx, IOWriter *writer); +void write_index_nm (const Index *idx, const char *fname); +void write_index_nm (const Index *idx, FILE *f); +void write_index_nm (const Index *idx, IOWriter *writer); + void write_index_binary (const IndexBinary *idx, const char *fname); void write_index_binary (const IndexBinary *idx, FILE *f); void write_index_binary (const IndexBinary *idx, IOWriter *writer); @@ -52,6 +56,10 @@ Index *read_index (const char *fname, int io_flags = 0); Index *read_index (FILE * f, int io_flags = 0); Index *read_index (IOReader *reader, int io_flags = 0); +Index *read_index_nm (const char *fname, int io_flags = 0); +Index *read_index_nm (FILE * f, int io_flags = 0); +Index *read_index_nm (IOReader *reader, int io_flags = 0); + IndexBinary *read_index_binary (const char *fname, int io_flags = 0); IndexBinary *read_index_binary (FILE * f, int io_flags = 0); IndexBinary *read_index_binary (IOReader *reader, int io_flags = 0); @@ -68,6 +76,9 @@ void write_ProductQuantizer (const ProductQuantizer*pq, IOWriter *f); void write_InvertedLists (const InvertedLists *ils, IOWriter *f); InvertedLists *read_InvertedLists (IOReader *reader, int io_flags = 0); +void write_InvertedLists_nm (const InvertedLists *ils, IOWriter *f); +InvertedLists *read_InvertedLists_nm (IOReader *reader, int io_flags = 0); + } // namespace faiss diff --git a/core/src/index/thirdparty/hnswlib/hnswalg.h b/core/src/index/thirdparty/hnswlib/hnswalg.h index 
8c54c8c39f..481722a5d4 100644 --- a/core/src/index/thirdparty/hnswlib/hnswalg.h +++ b/core/src/index/thirdparty/hnswlib/hnswalg.h @@ -1130,6 +1130,15 @@ class HierarchicalNSW : public AlgorithmInterface { return result; } + + void addPoint(void *datapoint, labeltype label, size_t base, size_t offset) { + return; + } + + std::priority_queue> searchKnn_NM(const void* query_data, size_t k, faiss::ConcurrentBitsetPtr bitset, dist_t *pdata) const { + std::priority_queue> ret; + return ret; + } }; } diff --git a/core/src/index/thirdparty/hnswlib/hnswalg_nm.h b/core/src/index/thirdparty/hnswlib/hnswalg_nm.h new file mode 100644 index 0000000000..81fbe2a2a4 --- /dev/null +++ b/core/src/index/thirdparty/hnswlib/hnswalg_nm.h @@ -0,0 +1,1206 @@ +#pragma once + +#include "visited_list_pool.h" +#include "hnswlib.h" +#include +#include +#include +#include + +#include "knowhere/index/vector_index/helpers/FaissIO.h" + +namespace hnswlib { + + typedef unsigned int tableint; + typedef unsigned int linklistsizeint; + + template + class HierarchicalNSW_NM : public AlgorithmInterface { + public: + HierarchicalNSW_NM(SpaceInterface *s) { + } + + HierarchicalNSW_NM(SpaceInterface *s, const std::string &location, bool nmslib = false, size_t max_elements=0) { + loadIndex(location, s, max_elements); + } + + HierarchicalNSW_NM(SpaceInterface *s, size_t max_elements, size_t M = 16, size_t ef_construction = 200, size_t random_seed = 100) : + link_list_locks_(max_elements), element_levels_(max_elements) { + // linxj + space = s; + if (auto x = dynamic_cast(s)) { + metric_type_ = 0; + } else if (auto x = dynamic_cast(s)) { + metric_type_ = 1; + } else { + metric_type_ = 100; + } + + max_elements_ = max_elements; + mem_stats_ += max_elements * sizeof(int); + mem_stats_ += max_elements * sizeof(std::mutex); + + has_deletions_=false; + data_size_ = s->get_data_size(); + fstdistfunc_ = s->get_dist_func(); + dist_func_param_ = s->get_dist_func_param(); + M_ = M; + maxM_ = M_; + cmli_cnt_ = 0; + cmli_cnt2_ = 0; + cmli_time_ = 0.0; + maxM0_ = M_ * 2; + ef_construction_ = std::max(ef_construction,M_); + ef_ = 10; + + level_generator_.seed(random_seed); + + size_links_level0_ = maxM0_ * sizeof(tableint) + sizeof(linklistsizeint); + size_data_per_element_ = size_links_level0_; // + sizeof(labeltype); + data_size_;; +// label_offset_ = size_links_level0_; + + data_level0_memory_ = (char *) malloc(max_elements_ * size_data_per_element_); + mem_stats_ += max_elements_ * size_data_per_element_; + if (data_level0_memory_ == nullptr) + throw std::runtime_error("Not enough memory"); + + cur_element_count = 0; + + visited_list_pool_ = new VisitedListPool(1, max_elements); + mem_stats_ += (max_elements + 2) * sizeof(short int); + + + + //initializations for special treatment of the first node + enterpoint_node_ = -1; + maxlevel_ = -1; + + linkLists_ = (char **) malloc(sizeof(void *) * max_elements_); + mem_stats_ += sizeof(void *) * max_elements_; + if (linkLists_ == nullptr) + throw std::runtime_error("Not enough memory: HierarchicalNSW_NM failed to allocate linklists"); + size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint); + mult_ = 1 / log(1.0 * M_); + revSize_ = 1.0 / mult_; + level_stats_.resize(10); + } + + struct CompareByFirst { + constexpr bool operator()(std::pair const &a, + std::pair const &b) const noexcept { + return a.first < b.first; + } + }; + + ~HierarchicalNSW_NM() { + + free(data_level0_memory_); + for (tableint i = 0; i < cur_element_count; i++) { + if (element_levels_[i] > 0) + 
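+ // only elements placed on levels above 0 own a separately malloc'ed upper-layer link list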
free(linkLists_[i]); + } + free(linkLists_); + delete visited_list_pool_; + + // linxj: delete + delete space; + } + + // linxj: use for free resource + SpaceInterface *space; + size_t metric_type_; // 0:l2, 1:ip + + // cmli: statistics of memory usage + long long int mem_stats_ = 0; + // cmli: statistics of levels + std::vector level_stats_; + + + size_t max_elements_; + size_t cur_element_count; + size_t size_data_per_element_; + size_t size_links_per_element_; + + size_t M_; + size_t maxM_; + size_t maxM0_; + size_t cmli_cnt_; + size_t cmli_cnt2_; + double cmli_time_; + size_t ef_construction_; + + double mult_, revSize_; + int maxlevel_; + + + VisitedListPool *visited_list_pool_; + std::mutex cur_element_count_guard_; + + std::vector link_list_locks_; + tableint enterpoint_node_; + + + size_t size_links_level0_; + + + char *data_level0_memory_; + char **linkLists_; + std::vector element_levels_; + + size_t data_size_; + + bool has_deletions_; + + + DISTFUNC fstdistfunc_; + void *dist_func_param_; +// std::unordered_map label_lookup_; + + std::default_random_engine level_generator_; + + /* + inline labeltype getExternalLabel(tableint internal_id) const { + labeltype return_label; + memcpy(&return_label,(data_level0_memory_ + internal_id * size_data_per_element_ + label_offset_), sizeof(labeltype)); + return return_label; + } + + inline void setExternalLabel(tableint internal_id, labeltype label) const { + memcpy((data_level0_memory_ + internal_id * size_data_per_element_ + label_offset_), &label, sizeof(labeltype)); + } + + inline labeltype *getExternalLabeLp(tableint internal_id) const { + return (labeltype *) (data_level0_memory_ + internal_id * size_data_per_element_ + label_offset_); + } + + inline char *getDataByInternalId(tableint internal_id) const { + return (data_level0_memory_ + internal_id * size_data_per_element_ + offsetData_); + } + */ + + inline char *getDataByInternalId(void *pdata, tableint offset) const { + return ((char*)pdata + offset * data_size_); + } + + int getRandomLevel(double reverse_size) { + std::uniform_real_distribution distribution(0.0, 1.0); + double r = -log(distribution(level_generator_)) * reverse_size; + return (int) r; + } + + std::priority_queue, std::vector>, CompareByFirst> + searchBaseLayer(tableint ep_id, const void *data_point, int layer, void *pdata) { + VisitedList *vl = visited_list_pool_->getFreeVisitedList(); + vl_type *visited_array = vl->mass; + vl_type visited_array_tag = vl->curV; + + std::priority_queue, std::vector>, CompareByFirst> top_candidates; + std::priority_queue, std::vector>, CompareByFirst> candidateSet; + + dist_t lowerBound; + if (!isMarkedDeleted(ep_id)) { + dist_t dist = fstdistfunc_(data_point, getDataByInternalId(pdata, ep_id), dist_func_param_); + top_candidates.emplace(dist, ep_id); + lowerBound = dist; + candidateSet.emplace(-dist, ep_id); + } else { + lowerBound = std::numeric_limits::max(); + candidateSet.emplace(-lowerBound, ep_id); + } + visited_array[ep_id] = visited_array_tag; + + while (!candidateSet.empty()) { + std::pair curr_el_pair = candidateSet.top(); + if ((-curr_el_pair.first) > lowerBound) { + break; + } + candidateSet.pop(); + + tableint curNodeNum = curr_el_pair.second; + + std::unique_lock lock(link_list_locks_[curNodeNum]); + + int *data;// = (int *)(linkList0_ + curNodeNum * size_links_per_element0_); + if (layer == 0) { + data = (int*)get_linklist0(curNodeNum); + } else { + data = (int*)get_linklist(curNodeNum, layer); + // data = (int *) (linkLists_[curNodeNum] + (layer - 1) * 
size_links_per_element_); + } + size_t size = getListCount((linklistsizeint*)data); + tableint *datal = (tableint *) (data + 1); +#ifdef USE_SSE + _mm_prefetch((char *) (visited_array + *(data + 1)), _MM_HINT_T0); + _mm_prefetch((char *) (visited_array + *(data + 1) + 64), _MM_HINT_T0); + _mm_prefetch(getDataByInternalId(pdata, *datal), _MM_HINT_T0); + _mm_prefetch(getDataByInternalId(pdata, *(datal + 1)), _MM_HINT_T0); +#endif + + for (size_t j = 0; j < size; j++) { + tableint candidate_id = *(datal + j); + // if (candidate_id == 0) continue; +#ifdef USE_SSE + _mm_prefetch((char *) (visited_array + *(datal + j + 1)), _MM_HINT_T0); + _mm_prefetch(getDataByInternalId(pdata, *(datal + j + 1)), _MM_HINT_T0); +#endif + if (visited_array[candidate_id] == visited_array_tag) continue; + visited_array[candidate_id] = visited_array_tag; + char *currObj1 = (getDataByInternalId(pdata, candidate_id)); + + dist_t dist1 = fstdistfunc_(data_point, currObj1, dist_func_param_); + if (top_candidates.size() < ef_construction_ || lowerBound > dist1) { + candidateSet.emplace(-dist1, candidate_id); +#ifdef USE_SSE + _mm_prefetch(getDataByInternalId(pdata, candidateSet.top().second), _MM_HINT_T0); +#endif + + if (!isMarkedDeleted(candidate_id)) + top_candidates.emplace(dist1, candidate_id); + + if (top_candidates.size() > ef_construction_) + top_candidates.pop(); + + if (!top_candidates.empty()) + lowerBound = top_candidates.top().first; + } + } + } + visited_list_pool_->releaseVisitedList(vl); + + return top_candidates; + } + + template + std::priority_queue, std::vector>, CompareByFirst> + searchBaseLayerST(tableint ep_id, const void *data_point, size_t ef, faiss::ConcurrentBitsetPtr bitset, void *pdata) const { + VisitedList *vl = visited_list_pool_->getFreeVisitedList(); + vl_type *visited_array = vl->mass; + vl_type visited_array_tag = vl->curV; + + std::priority_queue, std::vector>, CompareByFirst> top_candidates; + std::priority_queue, std::vector>, CompareByFirst> candidate_set; + + dist_t lowerBound; +// if (!has_deletions || !isMarkedDeleted(ep_id)) { + if (!has_deletions || !bitset->test((faiss::ConcurrentBitset::id_type_t)(ep_id))) { + dist_t dist = fstdistfunc_(data_point, getDataByInternalId(pdata, ep_id), dist_func_param_); + lowerBound = dist; + top_candidates.emplace(dist, ep_id); + candidate_set.emplace(-dist, ep_id); + } else { + lowerBound = std::numeric_limits::max(); + candidate_set.emplace(-lowerBound, ep_id); + } + + visited_array[ep_id] = visited_array_tag; + + while (!candidate_set.empty()) { + + std::pair current_node_pair = candidate_set.top(); + + if ((-current_node_pair.first) > lowerBound) { + break; + } + candidate_set.pop(); + + tableint current_node_id = current_node_pair.second; + int *data = (int *) get_linklist0(current_node_id); + size_t size = getListCount((linklistsizeint*)data); + // bool cur_node_deleted = isMarkedDeleted(current_node_id); + +#ifdef USE_SSE + _mm_prefetch((char *) (visited_array + *(data + 1)), _MM_HINT_T0); + _mm_prefetch((char *) (visited_array + *(data + 1) + 64), _MM_HINT_T0); +// _mm_prefetch(data_level0_memory_ + (*(data + 1)) * size_data_per_element_ + offsetData_, _MM_HINT_T0); + _mm_prefetch(getDataByInternalId(pdata, *(data + 1)), _MM_HINT_T0); + _mm_prefetch((char *) (data + 2), _MM_HINT_T0); +#endif + + for (size_t j = 1; j <= size; j++) { + int candidate_id = *(data + j); + // if (candidate_id == 0) continue; +#ifdef USE_SSE + _mm_prefetch((char *) (visited_array + *(data + j + 1)), _MM_HINT_T0); + _mm_prefetch(getDataByInternalId(pdata, 
*(data + j + 1)), + _MM_HINT_T0);//////////// +#endif + if (!(visited_array[candidate_id] == visited_array_tag)) { + + visited_array[candidate_id] = visited_array_tag; + + char *currObj1 = (getDataByInternalId(pdata, candidate_id)); + dist_t dist = fstdistfunc_(data_point, currObj1, dist_func_param_); + + if (top_candidates.size() < ef || lowerBound > dist) { + candidate_set.emplace(-dist, candidate_id); +#ifdef USE_SSE + _mm_prefetch(data_level0_memory_ + candidate_set.top().second * size_data_per_element_,/////////// + _MM_HINT_T0);//////////////////////// +#endif + +// if (!has_deletions || !isMarkedDeleted(candidate_id)) + if (!has_deletions || (!bitset->test((faiss::ConcurrentBitset::id_type_t)(candidate_id)))) + top_candidates.emplace(dist, candidate_id); + + if (top_candidates.size() > ef) + top_candidates.pop(); + + if (!top_candidates.empty()) + lowerBound = top_candidates.top().first; + } + } + } + } + + visited_list_pool_->releaseVisitedList(vl); + return top_candidates; + } + + void getNeighborsByHeuristic2( + std::priority_queue, std::vector>, CompareByFirst> &top_candidates, + const size_t M, tableint *ret, size_t &ret_len, void *pdata) { + if (top_candidates.size() < M) { + while (top_candidates.size() > 0) { + ret[ret_len ++] = top_candidates.top().second; + top_candidates.pop(); + } + return; + } + std::priority_queue> queue_closest; + std::vector> return_list; + while (top_candidates.size() > 0) { + queue_closest.emplace(-top_candidates.top().first, top_candidates.top().second); + top_candidates.pop(); + } + + while (queue_closest.size()) { + if (return_list.size() >= M) + break; + std::pair curent_pair = queue_closest.top(); + dist_t dist_to_query = -curent_pair.first; + queue_closest.pop(); + bool good = true; + for (std::pair second_pair : return_list) { + dist_t curdist = + fstdistfunc_(getDataByInternalId(pdata, second_pair.second), + getDataByInternalId(pdata, curent_pair.second), + dist_func_param_);; + if (curdist < dist_to_query) { + good = false; + break; + } + } + if (good) { + return_list.push_back(curent_pair); + ret[ret_len ++] = curent_pair.second; + } + + + } + +// for (std::pair curent_pair : return_list) { +// +// top_candidates.emplace(-curent_pair.first, curent_pair.second); +// } + } + + + linklistsizeint *get_linklist0(tableint internal_id) const { + return (linklistsizeint *) (data_level0_memory_ + internal_id * size_data_per_element_); + }; + + linklistsizeint *get_linklist0(tableint internal_id, char *data_level0_memory_) const { + return (linklistsizeint *) (data_level0_memory_ + internal_id * size_data_per_element_); + }; + + linklistsizeint *get_linklist(tableint internal_id, int level) const { + return (linklistsizeint *) (linkLists_[internal_id] + (level - 1) * size_links_per_element_); + }; + + void mutuallyConnectNewElement(const void *data_point, tableint cur_c, + std::priority_queue, std::vector>, CompareByFirst> &top_candidates, + int level, void *pdata) { + + size_t Mcurmax = level ? 
maxM_ : maxM0_; +// std::vector selectedNeighbors; +// selectedNeighbors.reserve(M_); + tableint *selectedNeighbors = (tableint*)malloc(sizeof(tableint) * M_); + size_t selectedNeighbors_size = 0; + getNeighborsByHeuristic2(top_candidates, M_, selectedNeighbors, selectedNeighbors_size, pdata); + if (selectedNeighbors_size > M_) + throw std::runtime_error("Should be not be more than M_ candidates returned by the heuristic"); + +// while (top_candidates.size() > 0) { +// selectedNeighbors.push_back(top_candidates.top().second); +// top_candidates.pop(); +// } + + { + linklistsizeint *ll_cur; + if (level == 0) + ll_cur = get_linklist0(cur_c); + else + ll_cur = get_linklist(cur_c, level); + + if (*ll_cur) { + throw std::runtime_error("The newly inserted element should have blank link list"); + } + setListCount(ll_cur,(unsigned short)selectedNeighbors_size); + tableint *data = (tableint *) (ll_cur + 1); + + + for (size_t idx = 0; idx < selectedNeighbors_size; idx++) { + if (data[idx]) + throw std::runtime_error("Possible memory corruption"); + if (level > element_levels_[selectedNeighbors[idx]]) + throw std::runtime_error("Trying to make a link on a non-existent level"); + + data[idx] = selectedNeighbors[idx]; + + } + } + for (size_t idx = 0; idx < selectedNeighbors_size; idx++) { + + std::unique_lock lock(link_list_locks_[selectedNeighbors[idx]]); + + + linklistsizeint *ll_other; + if (level == 0) + ll_other = get_linklist0(selectedNeighbors[idx]); + else + ll_other = get_linklist(selectedNeighbors[idx], level); + + size_t sz_link_list_other = getListCount(ll_other); + + if (sz_link_list_other > Mcurmax) + throw std::runtime_error("Bad value of sz_link_list_other"); + if (selectedNeighbors[idx] == cur_c) + throw std::runtime_error("Trying to connect an element to itself"); + if (level > element_levels_[selectedNeighbors[idx]]) + throw std::runtime_error("Trying to make a link on a non-existent level"); + + tableint *data = (tableint *) (ll_other + 1); + if (sz_link_list_other < Mcurmax) { + data[sz_link_list_other] = cur_c; + setListCount(ll_other, sz_link_list_other + 1); + } else { + // finding the "weakest" element to replace it with the new one + dist_t d_max = fstdistfunc_(getDataByInternalId(pdata, cur_c), getDataByInternalId(pdata, selectedNeighbors[idx]), + dist_func_param_); + // Heuristic: + std::priority_queue, std::vector>, CompareByFirst> candidates; + candidates.emplace(d_max, cur_c); + + for (size_t j = 0; j < sz_link_list_other; j++) { + candidates.emplace( + fstdistfunc_(getDataByInternalId(pdata, data[j]), getDataByInternalId(pdata, selectedNeighbors[idx]), + dist_func_param_), data[j]); + } + + if (candidates.size() >= Mcurmax) cmli_cnt2_ ++; + size_t indx = 0; + auto t0 = std::chrono::high_resolution_clock::now(); + getNeighborsByHeuristic2(candidates, Mcurmax, data, indx, pdata); + auto t1 = std::chrono::high_resolution_clock::now(); + cmli_time_ += (double)std::chrono::duration_cast( t1 - t0 ).count(); + cmli_cnt_ ++; + +// while (candidates.size() > 0) { +// data[indx] = candidates.top().second; +// candidates.pop(); +// indx++; +// } + setListCount(ll_other, indx); + // Nearest K: + /*int indx = -1; + for (int j = 0; j < sz_link_list_other; j++) { + dist_t d = fstdistfunc_(getDataByInternalId(data[j]), getDataByInternalId(rez[idx]), dist_func_param_); + if (d > d_max) { + indx = j; + d_max = d; + } + } + if (indx >= 0) { + data[indx] = cur_c; + } */ + } + + } + } + + std::mutex global; + size_t ef_; + + void setEf(size_t ef) { + ef_ = ef; + } + + + std::priority_queue> 
searchKnnInternal(void *query_data, int k, dist_t *pdata) { + std::priority_queue> top_candidates; + if (cur_element_count == 0) return top_candidates; + tableint currObj = enterpoint_node_; + dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(pdata, enterpoint_node_), dist_func_param_); + + for (size_t level = maxlevel_; level > 0; level--) { + bool changed = true; + while (changed) { + changed = false; + int *data; + data = (int *) get_linklist(currObj,level); + int size = getListCount(data); + tableint *datal = (tableint *) (data + 1); + for (int i = 0; i < size; i++) { + tableint cand = datal[i]; + if (cand < 0 || cand > max_elements_) + throw std::runtime_error("cand error"); + dist_t d = fstdistfunc_(query_data, getDataByInternalId(pdata, cand), dist_func_param_); + + if (d < curdist) { + curdist = d; + currObj = cand; + changed = true; + } + } + } + } + + if (has_deletions_) { + std::priority_queue> top_candidates1=searchBaseLayerST(currObj, query_data, + ef_, pdata); + top_candidates.swap(top_candidates1); + } + else{ + std::priority_queue> top_candidates1=searchBaseLayerST(currObj, query_data, + ef_, pdata); + top_candidates.swap(top_candidates1); + } + + while (top_candidates.size() > k) { + top_candidates.pop(); + } + return top_candidates; + }; + + void resizeIndex(size_t new_max_elements){ + if (new_max_elements(new_max_elements).swap(link_list_locks_); + mem_stats_ += new_max_elements * sizeof(link_list_locks_[0]); + + + // Reallocate base layer + char * data_level0_memory_new = (char *) malloc(new_max_elements * size_data_per_element_); + mem_stats_ += new_max_elements * size_data_per_element_; + if (data_level0_memory_new == nullptr) + throw std::runtime_error("Not enough memory: resizeIndex failed to allocate base layer"); + memcpy(data_level0_memory_new, data_level0_memory_,cur_element_count * size_data_per_element_); + free(data_level0_memory_); + data_level0_memory_=data_level0_memory_new; + + // Reallocate all other layers + char ** linkLists_new = (char **) malloc(sizeof(void *) * new_max_elements); + mem_stats_ += sizeof(void *) * new_max_elements; + if (linkLists_new == nullptr) + throw std::runtime_error("Not enough memory: resizeIndex failed to allocate other layers"); + memcpy(linkLists_new, linkLists_,cur_element_count * sizeof(void *)); + free(linkLists_); + linkLists_=linkLists_new; + + max_elements_=new_max_elements; + + } + + void saveIndex(milvus::knowhere::MemoryIOWriter& output) { + // write l2/ip calculator + writeBinaryPOD(output, metric_type_); + writeBinaryPOD(output, data_size_); + writeBinaryPOD(output, *((size_t *) dist_func_param_)); + +// writeBinaryPOD(output, offsetLevel0_); + writeBinaryPOD(output, max_elements_); + writeBinaryPOD(output, cur_element_count); + writeBinaryPOD(output, size_data_per_element_); +// writeBinaryPOD(output, label_offset_); +// writeBinaryPOD(output, offsetData_); + writeBinaryPOD(output, maxlevel_); + writeBinaryPOD(output, enterpoint_node_); + writeBinaryPOD(output, maxM_); + + writeBinaryPOD(output, maxM0_); + writeBinaryPOD(output, M_); + writeBinaryPOD(output, mult_); + writeBinaryPOD(output, ef_construction_); + + output.write(data_level0_memory_, cur_element_count * size_data_per_element_); + + for (size_t i = 0; i < cur_element_count; i++) { + unsigned int linkListSize = element_levels_[i] > 0 ? 
size_links_per_element_ * element_levels_[i] : 0; + writeBinaryPOD(output, linkListSize); + if (linkListSize) + output.write(linkLists_[i], linkListSize); + } + // output.close(); + } + + void loadIndex(milvus::knowhere::MemoryIOReader& input, size_t max_elements_i = 0) { + // linxj: init with metrictype + size_t dim = 100; + readBinaryPOD(input, metric_type_); + readBinaryPOD(input, data_size_); + readBinaryPOD(input, dim); + if (metric_type_ == 0) { + space = new hnswlib::L2Space(dim); + } else if (metric_type_ == 1) { + space = new hnswlib::InnerProductSpace(dim); + } else { + // throw exception + } + fstdistfunc_ = space->get_dist_func(); + dist_func_param_ = space->get_dist_func_param(); + +// readBinaryPOD(input, offsetLevel0_); + readBinaryPOD(input, max_elements_); + readBinaryPOD(input, cur_element_count); + + size_t max_elements=max_elements_i; + if(max_elements < cur_element_count) + max_elements = max_elements_; + max_elements_ = max_elements; + readBinaryPOD(input, size_data_per_element_); +// readBinaryPOD(input, label_offset_); +// readBinaryPOD(input, offsetData_); + readBinaryPOD(input, maxlevel_); + readBinaryPOD(input, enterpoint_node_); + + readBinaryPOD(input, maxM_); + readBinaryPOD(input, maxM0_); + readBinaryPOD(input, M_); + readBinaryPOD(input, mult_); + readBinaryPOD(input, ef_construction_); + + + // data_size_ = s->get_data_size(); + // fstdistfunc_ = s->get_dist_func(); + // dist_func_param_ = s->get_dist_func_param(); + + // auto pos= input.rp; + + + // /// Optional - check if index is ok: + // + // input.seekg(cur_element_count * size_data_per_element_,input.cur); + // for (size_t i = 0; i < cur_element_count; i++) { + // if(input.tellg() < 0 || input.tellg()>=total_filesize){ + // throw std::runtime_error("Index seems to be corrupted or unsupported"); + // } + // + // unsigned int linkListSize; + // readBinaryPOD(input, linkListSize); + // if (linkListSize != 0) { + // input.seekg(linkListSize,input.cur); + // } + // } + // + // // throw exception if it either corrupted or old index + // if(input.tellg()!=total_filesize) + // throw std::runtime_error("Index seems to be corrupted or unsupported"); + // + // input.clear(); + // + // /// Optional check end + // + // input.seekg(pos,input.beg); + + + data_level0_memory_ = (char *) malloc(max_elements * size_data_per_element_); + mem_stats_ += max_elements * size_data_per_element_; + if (data_level0_memory_ == nullptr) + throw std::runtime_error("Not enough memory: loadIndex failed to allocate level0"); + input.read(data_level0_memory_, cur_element_count * size_data_per_element_); + + + + + size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint); + + + size_links_level0_ = maxM0_ * sizeof(tableint) + sizeof(linklistsizeint); + std::vector(max_elements).swap(link_list_locks_); + + + visited_list_pool_ = new VisitedListPool(1, max_elements); + + + linkLists_ = (char **) malloc(sizeof(void *) * max_elements); + mem_stats_ += sizeof(void *) * max_elements; + if (linkLists_ == nullptr) + throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklists"); + element_levels_ = std::vector(max_elements); + revSize_ = 1.0 / mult_; + ef_ = 10; + for (size_t i = 0; i < cur_element_count; i++) { +// label_lookup_[getExternalLabel(i)]=i; + unsigned int linkListSize; + readBinaryPOD(input, linkListSize); + if (linkListSize == 0) { + element_levels_[i] = 0; + + linkLists_[i] = nullptr; + } else { + element_levels_[i] = linkListSize / size_links_per_element_; + linkLists_[i] = (char *) 
malloc(linkListSize); + mem_stats_ += linkListSize; + if (linkLists_[i] == nullptr) + throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklist"); + input.read(linkLists_[i], linkListSize); + } + } + + has_deletions_=false; + + for (size_t i = 0; i < cur_element_count; i++) { + if(isMarkedDeleted(i)) + has_deletions_=true; + } + + return; + } + + void saveIndex(const std::string &location) { + std::ofstream output(location, std::ios::binary); + std::streampos position; + +// writeBinaryPOD(output, offsetLevel0_); + writeBinaryPOD(output, max_elements_); + writeBinaryPOD(output, cur_element_count); + writeBinaryPOD(output, size_data_per_element_); +// writeBinaryPOD(output, label_offset_); +// writeBinaryPOD(output, offsetData_); + writeBinaryPOD(output, maxlevel_); + writeBinaryPOD(output, enterpoint_node_); + writeBinaryPOD(output, maxM_); + + writeBinaryPOD(output, maxM0_); + writeBinaryPOD(output, M_); + writeBinaryPOD(output, mult_); + writeBinaryPOD(output, ef_construction_); + + output.write(data_level0_memory_, cur_element_count * size_data_per_element_); + + for (size_t i = 0; i < cur_element_count; i++) { + unsigned int linkListSize = element_levels_[i] > 0 ? size_links_per_element_ * element_levels_[i] : 0; + writeBinaryPOD(output, linkListSize); + if (linkListSize) + output.write(linkLists_[i], linkListSize); + } + output.close(); + } + + void loadIndex(const std::string &location, SpaceInterface *s, size_t max_elements_i=0) { + std::ifstream input(location, std::ios::binary); + + if (!input.is_open()) + throw std::runtime_error("Cannot open file"); + + // get file size: + input.seekg(0,input.end); + std::streampos total_filesize=input.tellg(); + input.seekg(0,input.beg); + +// readBinaryPOD(input, offsetLevel0_); + readBinaryPOD(input, max_elements_); + readBinaryPOD(input, cur_element_count); + + size_t max_elements=max_elements_i; + if(max_elements < cur_element_count) + max_elements = max_elements_; + max_elements_ = max_elements; + readBinaryPOD(input, size_data_per_element_); +// readBinaryPOD(input, label_offset_); +// readBinaryPOD(input, offsetData_); + readBinaryPOD(input, maxlevel_); + readBinaryPOD(input, enterpoint_node_); + + readBinaryPOD(input, maxM_); + readBinaryPOD(input, maxM0_); + readBinaryPOD(input, M_); + readBinaryPOD(input, mult_); + readBinaryPOD(input, ef_construction_); + + data_size_ = s->get_data_size(); + fstdistfunc_ = s->get_dist_func(); + dist_func_param_ = s->get_dist_func_param(); + + auto pos=input.tellg(); + + /// Optional - check if index is ok: + + input.seekg(cur_element_count * size_data_per_element_,input.cur); + for (size_t i = 0; i < cur_element_count; i++) { + if(input.tellg() < 0 || input.tellg()>=total_filesize){ + throw std::runtime_error("Index seems to be corrupted or unsupported"); + } + + unsigned int linkListSize; + readBinaryPOD(input, linkListSize); + if (linkListSize != 0) { + input.seekg(linkListSize,input.cur); + } + } + + // throw exception if it either corrupted or old index + if(input.tellg()!=total_filesize) + throw std::runtime_error("Index seems to be corrupted or unsupported"); + + input.clear(); + + /// Optional check end + + input.seekg(pos,input.beg); + + data_level0_memory_ = (char *) malloc(max_elements * size_data_per_element_); + mem_stats_ += max_elements * size_data_per_element_; + if (data_level0_memory_ == nullptr) + throw std::runtime_error("Not enough memory: loadIndex failed to allocate level0"); + input.read(data_level0_memory_, cur_element_count * size_data_per_element_); 
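+ // NM layout: the level-0 block stores only neighbour links; raw vectors live outside the index and are passed in as pdata at search time.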
+ + size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint); + + size_links_level0_ = maxM0_ * sizeof(tableint) + sizeof(linklistsizeint); + std::vector(max_elements).swap(link_list_locks_); + + visited_list_pool_ = new VisitedListPool(1, max_elements); + + linkLists_ = (char **) malloc(sizeof(void *) * max_elements); + mem_stats_ += sizeof(void *) * max_elements; + if (linkLists_ == nullptr) + throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklists"); + element_levels_ = std::vector(max_elements); + revSize_ = 1.0 / mult_; + ef_ = 10; + for (size_t i = 0; i < cur_element_count; i++) { +// label_lookup_[getExternalLabel(i)]=i; + unsigned int linkListSize; + readBinaryPOD(input, linkListSize); + if (linkListSize == 0) { + element_levels_[i] = 0; + linkLists_[i] = nullptr; + } else { + element_levels_[i] = linkListSize / size_links_per_element_; + linkLists_[i] = (char *) malloc(linkListSize); + mem_stats_ += linkListSize; + if (linkLists_[i] == nullptr) + throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklist"); + input.read(linkLists_[i], linkListSize); + } + } + + has_deletions_=false; + + for (size_t i = 0; i < cur_element_count; i++) { + if(isMarkedDeleted(i)) + has_deletions_=true; + } + + input.close(); + return; + } + + /* + template + std::vector getDataByLabel(tableint internal_id, dist_t *pdata) { + // tableint label_c; + // auto search = label_lookup_.find(label); + // if (search == label_lookup_.end() || isMarkedDeleted(search->second)) { + // throw std::runtime_error("Label not found"); + // } + // label_c = search->second; + + char* data_ptrv = getDataByInternalId(pdata, internal_id); + size_t dim = *((size_t *) dist_func_param_); + std::vector data; + data_t* data_ptr = (data_t*) data_ptrv; + for (int i = 0; i < dim; i++) { + data.push_back(*data_ptr); + data_ptr += 1; + } + return data; + } + */ + + static const unsigned char DELETE_MARK = 0x01; + // static const unsigned char REUSE_MARK = 0x10; + /** + * Marks an element with the given label deleted, does NOT really change the current graph. + * @param label + */ + void markDelete(labeltype label) + { + has_deletions_=true; +// auto search = label_lookup_.find(label); +// if (search == label_lookup_.end()) { +// throw std::runtime_error("Label not found"); +// } +// markDeletedInternal(search->second); + markDeletedInternal(label); + } + + /** + * Uses the first 8 bits of the memory for the linked list to store the mark, + * whereas maxM0_ has to be limited to the lower 24 bits, however, still large enough in almost all cases. + * @param internalId + */ + void markDeletedInternal(tableint internalId) { + unsigned char *ll_cur = ((unsigned char *)get_linklist0(internalId))+2; + *ll_cur |= DELETE_MARK; + } + + /** + * Remove the deleted mark of the node. + * @param internalId + */ + void unmarkDeletedInternal(tableint internalId) { + unsigned char *ll_cur = ((unsigned char *)get_linklist0(internalId))+2; + *ll_cur &= ~DELETE_MARK; + } + + /** + * Checks the first 8 bits of the memory to see if the element is marked deleted. 
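+ * (the delete mark is the byte immediately after the 16-bit neighbour count in the level-0 link-list header)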
+ * @param internalId + * @return + */ + bool isMarkedDeleted(tableint internalId) const { + unsigned char *ll_cur = ((unsigned char*)get_linklist0(internalId))+2; + return *ll_cur & DELETE_MARK; + } + + unsigned short int getListCount(linklistsizeint * ptr) const { + return *((unsigned short int *)ptr); + } + + void setListCount(linklistsizeint * ptr, unsigned short int size) const { + *((unsigned short int*)(ptr))=*((unsigned short int *)&size); + } + + size_t getCurrentElementCount() { + return cur_element_count; + } + + void addPoint(void *data_point, labeltype label, size_t base, size_t offset) { + addPoint(data_point, label,-1, base, offset); + } + + tableint addPoint(void *data_point, labeltype label, int level, size_t base, size_t offset) { + tableint cur_c = 0; + { + std::unique_lock lock(cur_element_count_guard_); + if (cur_element_count >= max_elements_) { + throw std::runtime_error("The number of elements exceeds the specified limit"); + }; + +// cur_c = cur_element_count; + cur_c = tableint(base + offset); + cur_element_count++; + +// auto search = label_lookup_.find(label); +// if (search != label_lookup_.end()) { +// std::unique_lock lock_el(link_list_locks_[search->second]); +// has_deletions_ = true; +// markDeletedInternal(search->second); +// } +// label_lookup_[label] = cur_c; + } + + std::unique_lock lock_el(link_list_locks_[cur_c]); + int curlevel = getRandomLevel(mult_); + if (level > 0) + curlevel = level; + + element_levels_[cur_c] = curlevel; + + // prepose non-concurrent operation + memset(data_level0_memory_ + cur_c * size_data_per_element_, 0, size_data_per_element_); +// setExternalLabel(cur_c, label); +// memcpy(getDataByInternalId(cur_c), data_point, data_size_); + if (curlevel) { + linkLists_[cur_c] = (char *) malloc(size_links_per_element_ * curlevel + 1); + if (linkLists_[cur_c] == nullptr) + throw std::runtime_error("Not enough memory: addPoint failed to allocate linklist"); + memset(linkLists_[cur_c], 0, size_links_per_element_ * curlevel + 1); + } + + + std::unique_lock templock(global); + int maxlevelcopy = maxlevel_; + if (curlevel <= maxlevelcopy) + templock.unlock(); + tableint currObj = enterpoint_node_; + tableint enterpoint_copy = enterpoint_node_; + if (curlevel >= level_stats_.size()) { + level_stats_.resize(curlevel << 1); + } + level_stats_[curlevel] ++; + + // Initialisation of the data and label + + if (curlevel) { + mem_stats_ += size_links_per_element_ * curlevel + 1; + } + + if ((signed)currObj != -1) { + + if (curlevel < maxlevelcopy) { + + dist_t curdist = fstdistfunc_(getDataByInternalId(data_point, (tableint)offset), getDataByInternalId(data_point, currObj), dist_func_param_); + for (int level = maxlevelcopy; level > curlevel; level--) { + bool changed = true; + while (changed) { + changed = false; + unsigned int *data; + std::unique_lock lock(link_list_locks_[currObj]); + data = get_linklist(currObj,level); + int size = getListCount(data); + + tableint *datal = (tableint *) (data + 1); + for (int i = 0; i < size; i++) { + tableint cand = datal[i]; + if (cand < 0 || cand > max_elements_) + throw std::runtime_error("cand error"); + dist_t d = fstdistfunc_(getDataByInternalId(data_point, tableint(offset)), getDataByInternalId(data_point, cand), dist_func_param_); + if (d < curdist) { + curdist = d; + currObj = cand; + changed = true; + } + } + } + } + } + + bool epDeleted = isMarkedDeleted(enterpoint_copy); + for (int level = std::min(curlevel, maxlevelcopy); level >= 0; level--) { + if (level > maxlevelcopy || level < 0) // possible? 
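+ // defensive check: level starts at std::min(curlevel, maxlevelcopy) and only decreases, so this should not fire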
+ throw std::runtime_error("Level error"); + + std::priority_queue, std::vector>, CompareByFirst> top_candidates = searchBaseLayer( + currObj, getDataByInternalId(data_point, (tableint)offset), level, data_point); + if (epDeleted) { + top_candidates.emplace(fstdistfunc_(getDataByInternalId(data_point, (tableint)offset), getDataByInternalId(data_point, enterpoint_copy), dist_func_param_), enterpoint_copy); + if (top_candidates.size() > ef_construction_) + top_candidates.pop(); + } + currObj = top_candidates.top().second; + + mutuallyConnectNewElement(getDataByInternalId(data_point, (tableint)offset), cur_c, top_candidates, level, data_point); + } + } else { + // Do nothing for the first element + enterpoint_node_ = 0; + maxlevel_ = curlevel; + } + + //Releasing lock for the maximum level + if (curlevel > maxlevelcopy) { + enterpoint_node_ = cur_c; + maxlevel_ = curlevel; + } + return cur_c; + }; + + std::priority_queue> + searchKnn_NM(const void *query_data, size_t k, faiss::ConcurrentBitsetPtr bitset, dist_t *pdata) const { + std::priority_queue> result; + if (cur_element_count == 0) return result; + + tableint currObj = enterpoint_node_; + dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(pdata, enterpoint_node_), dist_func_param_); + + for (int level = maxlevel_; level > 0; level--) { + bool changed = true; + while (changed) { + changed = false; + unsigned int *data; + + data = (unsigned int *) get_linklist(currObj, level); + int size = getListCount(data); + tableint *datal = (tableint *) (data + 1); + for (int i = 0; i < size; i++) { + tableint cand = datal[i]; + if (cand < 0 || cand > max_elements_) + throw std::runtime_error("cand error"); + dist_t d = fstdistfunc_(query_data, getDataByInternalId(pdata, cand), dist_func_param_); + + if (d < curdist) { + curdist = d; + currObj = cand; + changed = true; + } + } + } + } + + std::priority_queue, std::vector>, CompareByFirst> top_candidates; + if (bitset != nullptr) { + std::priority_queue, std::vector>, CompareByFirst> + top_candidates1 = searchBaseLayerST(currObj, query_data, std::max(ef_, k), bitset, pdata); + top_candidates.swap(top_candidates1); + } + else{ + std::priority_queue, std::vector>, CompareByFirst> + top_candidates1 = searchBaseLayerST(currObj, query_data, std::max(ef_, k), bitset, pdata); + top_candidates.swap(top_candidates1); + } + while (top_candidates.size() > k) { + top_candidates.pop(); + } + while (top_candidates.size() > 0) { + std::pair rez = top_candidates.top(); +// result.push(std::pair(rez.first, getExternalLabel(rez.second))); + result.push(std::pair(rez.first, rez.second)); + top_candidates.pop(); + } + return result; + }; + + template + std::vector> + searchKnn_NM(const void* query_data, size_t k, Comp comp, faiss::ConcurrentBitsetPtr bitset, dist_t *pdata) { + std::vector> result; + if (cur_element_count == 0) return result; + + auto ret = searchKnn_NM(query_data, k, bitset, pdata); + + while (!ret.empty()) { + result.push_back(ret.top()); + ret.pop(); + } + + std::sort(result.begin(), result.end(), comp); + + return result; + } + + void addPoint(const void *datapoint, labeltype label) { + return; + } + + std::priority_queue> searchKnn(const void *query_data, size_t k, faiss::ConcurrentBitsetPtr bitset) const { + std::priority_queue> ret; + return ret; + } + }; + +} \ No newline at end of file diff --git a/core/src/index/thirdparty/hnswlib/hnswlib.h b/core/src/index/thirdparty/hnswlib/hnswlib.h index 89d8d423a7..dab1afaa98 100644 --- a/core/src/index/thirdparty/hnswlib/hnswlib.h +++ 
b/core/src/index/thirdparty/hnswlib/hnswlib.h @@ -82,10 +82,15 @@ namespace hnswlib { class AlgorithmInterface { public: virtual void addPoint(const void *datapoint, labeltype label)=0; + virtual void addPoint(void *datapoint, labeltype label, size_t base, size_t offset)=0; virtual std::priority_queue> searchKnn(const void *, size_t, faiss::ConcurrentBitsetPtr bitset) const = 0; template std::vector> searchKnn(const void*, size_t, Comp, faiss::ConcurrentBitsetPtr bitset) { } + virtual std::priority_queue> searchKnn_NM(const void *, size_t, faiss::ConcurrentBitsetPtr bitset, dist_t *pdata) const = 0; + template + std::vector> searchKnn_NM(const void*, size_t, Comp, faiss::ConcurrentBitsetPtr bitset, dist_t *pdata) { + } virtual void saveIndex(const std::string &location)=0; virtual ~AlgorithmInterface(){ } diff --git a/core/src/index/thirdparty/hnswlib/hnswlib_nm.h b/core/src/index/thirdparty/hnswlib/hnswlib_nm.h new file mode 100644 index 0000000000..019890f681 --- /dev/null +++ b/core/src/index/thirdparty/hnswlib/hnswlib_nm.h @@ -0,0 +1,98 @@ +#pragma once +#ifndef NO_MANUAL_VECTORIZATION +#ifdef __SSE__ +#define USE_SSE +#ifdef __AVX__ +#define USE_AVX +#endif +#endif +#endif + +#if defined(USE_AVX) || defined(USE_SSE) +#ifdef _MSC_VER +#include +#include +#else +#include +#endif + +#if defined(__GNUC__) +#define PORTABLE_ALIGN32 __attribute__((aligned(32))) +#else +#define PORTABLE_ALIGN32 __declspec(align(32)) +#endif +#endif + +#include +#include +#include + +#include +#include + +namespace hnswlib { + typedef int64_t labeltype; + + template + class pairGreater { + public: + bool operator()(const T& p1, const T& p2) { + return p1.first > p2.first; + } + }; + + template + static void writeBinaryPOD(std::ostream &out, const T &podRef) { + out.write((char *) &podRef, sizeof(T)); + } + + template + static void readBinaryPOD(std::istream &in, T &podRef) { + in.read((char *) &podRef, sizeof(T)); + } + + template + static void writeBinaryPOD(W &out, const T &podRef) { + out.write((char *) &podRef, sizeof(T)); + } + + template + static void readBinaryPOD(R &in, T &podRef) { + in.read((char *) &podRef, sizeof(T)); + } + + template + using DISTFUNC = MTYPE(*)(const void *, const void *, const void *); + + + template + class SpaceInterface { + public: + //virtual void search(void *); + virtual size_t get_data_size() = 0; + + virtual DISTFUNC get_dist_func() = 0; + + virtual void *get_dist_func_param() = 0; + + virtual ~SpaceInterface() {} + }; + + template + class AlgorithmInterface { + public: + virtual void addPoint(void *datapoint, labeltype label, size_t base, size_t offset)=0; + virtual std::priority_queue> searchKnn(const void *, size_t, faiss::ConcurrentBitsetPtr bitset, dist_t *pdata) const = 0; + template + std::vector> searchKnn(const void*, size_t, Comp, faiss::ConcurrentBitsetPtr bitset, dist_t *pdata) { + } + virtual void saveIndex(const std::string &location)=0; + virtual ~AlgorithmInterface(){ + } + }; +} + +#include "space_l2.h" +#include "space_ip.h" +#include "bruteforce.h" +#include "hnswalg_nm.h" \ No newline at end of file diff --git a/core/src/index/thirdparty/hnswlib/visited_list_pool.h b/core/src/index/thirdparty/hnswlib/visited_list_pool.h index 457f73433d..93b6478f68 100644 --- a/core/src/index/thirdparty/hnswlib/visited_list_pool.h +++ b/core/src/index/thirdparty/hnswlib/visited_list_pool.h @@ -2,6 +2,7 @@ #include #include +#include namespace hnswlib { typedef unsigned short int vl_type; diff --git a/core/src/index/unittest/CMakeLists.txt 
b/core/src/index/unittest/CMakeLists.txt index f29ca1af06..8b41fb835e 100644 --- a/core/src/index/unittest/CMakeLists.txt +++ b/core/src/index/unittest/CMakeLists.txt @@ -62,6 +62,8 @@ set(faiss_srcs ${INDEX_SOURCE_DIR}/knowhere/knowhere/index/vector_index/IndexIVF.cpp ${INDEX_SOURCE_DIR}/knowhere/knowhere/index/vector_index/IndexIVFSQ.cpp ${INDEX_SOURCE_DIR}/knowhere/knowhere/index/vector_index/IndexIVFPQ.cpp + ${INDEX_SOURCE_DIR}/knowhere/knowhere/index/vector_offset_index/OffsetBaseIndex.cpp + ${INDEX_SOURCE_DIR}/knowhere/knowhere/index/vector_offset_index/IndexIVF_NM.cpp ) if (MILVUS_GPU_VERSION) set(faiss_srcs ${faiss_srcs} @@ -71,6 +73,7 @@ set(faiss_srcs ${faiss_srcs} ${INDEX_SOURCE_DIR}/knowhere/knowhere/index/vector_index/gpu/IndexGPUIVFSQ.cpp ${INDEX_SOURCE_DIR}/knowhere/knowhere/index/vector_index/gpu/IndexGPUIVFPQ.cpp ${INDEX_SOURCE_DIR}/knowhere/knowhere/index/vector_index/gpu/IndexIVFSQHybrid.cpp + ${INDEX_SOURCE_DIR}/knowhere/knowhere/index/vector_offset_index/gpu/IndexGPUIVF_NM.cpp ) endif () @@ -120,6 +123,22 @@ endif () target_link_libraries(test_ivf ${depend_libs} ${unittest_libs} ${basic_libs}) install(TARGETS test_ivf DESTINATION unittest) +################################################################################ +# +if (NOT TARGET test_ivf_cpu_nm) + add_executable(test_ivf_cpu_nm test_ivf_cpu_nm.cpp ${faiss_srcs} ${util_srcs}) +endif () +target_link_libraries(test_ivf_cpu_nm ${depend_libs} ${unittest_libs} ${basic_libs}) +install(TARGETS test_ivf_cpu_nm DESTINATION unittest) + +################################################################################ +# +if (NOT TARGET test_ivf_gpu_nm) + add_executable(test_ivf_gpu_nm test_ivf_gpu_nm.cpp ${faiss_srcs} ${util_srcs}) +endif () +target_link_libraries(test_ivf_gpu_nm ${depend_libs} ${unittest_libs} ${basic_libs}) +install(TARGETS test_ivf_gpu_nm DESTINATION unittest) + ################################################################################ # if (NOT TARGET test_binaryidmap) @@ -152,7 +171,7 @@ endif () include_directories(${INDEX_SOURCE_DIR}/knowhere/knowhere/index/vector_index/impl/nsg) aux_source_directory(${INDEX_SOURCE_DIR}/knowhere/knowhere/index/vector_index/impl/nsg nsg_src) set(interface_src - ${INDEX_SOURCE_DIR}/knowhere/knowhere/index/vector_index/IndexNSG.cpp + ${INDEX_SOURCE_DIR}/knowhere/knowhere/index/vector_offset_index/IndexNSG_NM.cpp ) if (NOT TARGET test_nsg) add_executable(test_nsg test_nsg.cpp ${interface_src} ${nsg_src} ${util_srcs} ${faiss_srcs}) @@ -163,7 +182,7 @@ install(TARGETS test_nsg DESTINATION unittest) ################################################################################ # set(hnsw_srcs - ${INDEX_SOURCE_DIR}/knowhere/knowhere/index/vector_index/IndexHNSW.cpp + ${INDEX_SOURCE_DIR}/knowhere/knowhere/index/vector_offset_index/IndexHNSW_NM.cpp ) if (NOT TARGET test_hnsw) add_executable(test_hnsw test_hnsw.cpp ${hnsw_srcs} ${util_srcs}) diff --git a/core/src/index/unittest/Helper.h b/core/src/index/unittest/Helper.h index 7ab2810875..d2605eb463 100644 --- a/core/src/index/unittest/Helper.h +++ b/core/src/index/unittest/Helper.h @@ -17,12 +17,14 @@ #include "knowhere/index/vector_index/IndexIVFSQ.h" #include "knowhere/index/vector_index/IndexType.h" #include "knowhere/index/vector_index/helpers/IndexParameter.h" +#include "knowhere/index/vector_offset_index/IndexIVF_NM.h" #ifdef MILVUS_GPU_VERSION #include "knowhere/index/vector_index/gpu/IndexGPUIVF.h" #include "knowhere/index/vector_index/gpu/IndexGPUIVFPQ.h" #include 
"knowhere/index/vector_index/gpu/IndexGPUIVFSQ.h" #include "knowhere/index/vector_index/gpu/IndexIVFSQHybrid.h" +#include "knowhere/index/vector_offset_index/gpu/IndexGPUIVF_NM.h" #endif int DEVICEID = 0; @@ -66,6 +68,18 @@ IndexFactory(const milvus::knowhere::IndexType& type, const milvus::knowhere::In return nullptr; } +milvus::knowhere::IVFNMPtr +IndexFactoryNM(const milvus::knowhere::IndexType& type, const milvus::knowhere::IndexMode mode) { + if (mode == milvus::knowhere::IndexMode::MODE_CPU) { + if (type == milvus::knowhere::IndexEnum::INDEX_FAISS_IVFFLAT) { + return std::make_shared(); + } else { + std::cout << "Invalid IndexType " << type << std::endl; + } + } + return nullptr; +} + class ParamGenerator { public: static ParamGenerator& diff --git a/core/src/index/unittest/test_hnsw.cpp b/core/src/index/unittest/test_hnsw.cpp index 48f458fca9..4504c48fbc 100644 --- a/core/src/index/unittest/test_hnsw.cpp +++ b/core/src/index/unittest/test_hnsw.cpp @@ -10,7 +10,7 @@ // or implied. See the License for the specific language governing permissions and limitations under the License. #include -#include +#include #include #include #include @@ -28,7 +28,7 @@ class HNSWTest : public DataGen, public TestWithParam { IndexType = GetParam(); std::cout << "IndexType from GetParam() is: " << IndexType << std::endl; Generate(64, 10000, 10); // dim = 64, nb = 10000, nq = 10 - index_ = std::make_shared(); + index_ = std::make_shared(); conf = milvus::knowhere::Config{ {milvus::knowhere::meta::DIM, 64}, {milvus::knowhere::meta::TOPK, 10}, {milvus::knowhere::IndexParams::M, 16}, {milvus::knowhere::IndexParams::efConstruction, 200}, @@ -38,7 +38,7 @@ class HNSWTest : public DataGen, public TestWithParam { protected: milvus::knowhere::Config conf; - std::shared_ptr index_ = nullptr; + std::shared_ptr index_ = nullptr; std::string IndexType; }; @@ -62,6 +62,19 @@ TEST_P(HNSWTest, HNSW_basic) { EXPECT_EQ(index_->Count(), nb); EXPECT_EQ(index_->Dim(), dim); + // Serialize and Load before Query + milvus::knowhere::BinarySet bs = index_->Serialize(); + + int64_t dim = base_dataset->Get(milvus::knowhere::meta::DIM); + int64_t rows = base_dataset->Get(milvus::knowhere::meta::ROWS); + auto raw_data = base_dataset->Get(milvus::knowhere::meta::TENSOR); + milvus::knowhere::BinaryPtr bptr = std::make_shared(); + bptr->data = std::shared_ptr((uint8_t*)raw_data, [&](uint8_t*) {}); + bptr->size = dim * rows * sizeof(float); + bs.Append(RAW_DATA, bptr); + + index_->Load(bs); + auto result = index_->Query(query_dataset, conf); AssertAnns(result, nq, k); } @@ -78,6 +91,20 @@ TEST_P(HNSWTest, HNSW_delete) { for (auto i = 0; i < nq; ++i) { bitset->set(i); } + + // Serialize and Load before Query + milvus::knowhere::BinarySet bs = index_->Serialize(); + + int64_t dim = base_dataset->Get(milvus::knowhere::meta::DIM); + int64_t rows = base_dataset->Get(milvus::knowhere::meta::ROWS); + auto raw_data = base_dataset->Get(milvus::knowhere::meta::TENSOR); + milvus::knowhere::BinaryPtr bptr = std::make_shared(); + bptr->data = std::shared_ptr((uint8_t*)raw_data, [&](uint8_t*) {}); + bptr->size = dim * rows * sizeof(float); + bs.Append(RAW_DATA, bptr); + + index_->Load(bs); + auto result1 = index_->Query(query_dataset, conf); AssertAnns(result1, nq, k); @@ -107,6 +134,7 @@ TEST_P(HNSWTest, HNSW_delete) { */ } +/* TEST_P(HNSWTest, HNSW_serialize) { auto serialize = [](const std::string& filename, milvus::knowhere::BinaryPtr& bin, uint8_t* ret) { { @@ -138,7 +166,7 @@ TEST_P(HNSWTest, HNSW_serialize) { auto result = 
index_->Query(query_dataset, conf); AssertAnns(result, nq, conf[milvus::knowhere::meta::TOPK]); } -} +}*/ /* * faiss style test @@ -181,7 +209,7 @@ main() { int k = 4; int m = 16; int ef = 200; - milvus::knowhere::IndexHNSW index; + milvus::knowhere::IndexHNSW_NM index; milvus::knowhere::DatasetPtr base_dataset = generate_dataset(nb, d, (const void*)xb, ids); // base_dataset->Set(milvus::knowhere::meta::ROWS, nb); // base_dataset->Set(milvus::knowhere::meta::DIM, d); diff --git a/core/src/index/unittest/test_ivf.cpp b/core/src/index/unittest/test_ivf.cpp index 1827d0b4d1..8fc3cd4289 100644 --- a/core/src/index/unittest/test_ivf.cpp +++ b/core/src/index/unittest/test_ivf.cpp @@ -81,12 +81,12 @@ INSTANTIATE_TEST_CASE_P( IVFParameters, IVFTest, Values( #ifdef MILVUS_GPU_VERSION - std::make_tuple(milvus::knowhere::IndexEnum::INDEX_FAISS_IVFFLAT, milvus::knowhere::IndexMode::MODE_GPU), + // std::make_tuple(milvus::knowhere::IndexEnum::INDEX_FAISS_IVFFLAT, milvus::knowhere::IndexMode::MODE_GPU), std::make_tuple(milvus::knowhere::IndexEnum::INDEX_FAISS_IVFPQ, milvus::knowhere::IndexMode::MODE_GPU), std::make_tuple(milvus::knowhere::IndexEnum::INDEX_FAISS_IVFSQ8, milvus::knowhere::IndexMode::MODE_GPU), std::make_tuple(milvus::knowhere::IndexEnum::INDEX_FAISS_IVFSQ8H, milvus::knowhere::IndexMode::MODE_GPU), #endif - std::make_tuple(milvus::knowhere::IndexEnum::INDEX_FAISS_IVFFLAT, milvus::knowhere::IndexMode::MODE_CPU), + // std::make_tuple(milvus::knowhere::IndexEnum::INDEX_FAISS_IVFFLAT, milvus::knowhere::IndexMode::MODE_CPU), std::make_tuple(milvus::knowhere::IndexEnum::INDEX_FAISS_IVFPQ, milvus::knowhere::IndexMode::MODE_CPU), std::make_tuple(milvus::knowhere::IndexEnum::INDEX_FAISS_IVFSQ8, milvus::knowhere::IndexMode::MODE_CPU))); diff --git a/core/src/index/unittest/test_ivf_cpu_nm.cpp b/core/src/index/unittest/test_ivf_cpu_nm.cpp new file mode 100644 index 0000000000..61cf7d1a72 --- /dev/null +++ b/core/src/index/unittest/test_ivf_cpu_nm.cpp @@ -0,0 +1,106 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under the License. 
+ +#include + +#include +#include +#include +#include + +#ifdef MILVUS_GPU_VERSION +#include +#endif + +#include "knowhere/common/Exception.h" +#include "knowhere/common/Timer.h" +#include "knowhere/index/vector_index/IndexType.h" +#include "knowhere/index/vector_index/adapter/VectorAdapter.h" +#include "knowhere/index/vector_offset_index/IndexIVF_NM.h" + +#ifdef MILVUS_GPU_VERSION +#include "knowhere/index/vector_index/helpers/Cloner.h" +#include "knowhere/index/vector_index/helpers/FaissGpuResourceMgr.h" +#include "knowhere/index/vector_offset_index/gpu/IndexGPUIVF_NM.h" +#endif + +#include "unittest/Helper.h" +#include "unittest/utils.h" + +using ::testing::Combine; +using ::testing::TestWithParam; +using ::testing::Values; + +class IVFNMCPUTest : public DataGen, + public TestWithParam<::std::tuple> { + protected: + void + SetUp() override { + std::tie(index_type_, index_mode_) = GetParam(); + Generate(DIM, NB, NQ); + index_ = IndexFactoryNM(index_type_, index_mode_); + conf_ = ParamGenerator::GetInstance().Gen(index_type_); + } + + void + TearDown() override { + } + + protected: + milvus::knowhere::IndexType index_type_; + milvus::knowhere::IndexMode index_mode_; + milvus::knowhere::Config conf_; + milvus::knowhere::IVFNMPtr index_ = nullptr; +}; + +INSTANTIATE_TEST_CASE_P(IVFParameters, IVFNMCPUTest, + Values(std::make_tuple(milvus::knowhere::IndexEnum::INDEX_FAISS_IVFFLAT, + milvus::knowhere::IndexMode::MODE_CPU))); + +TEST_P(IVFNMCPUTest, ivf_basic_cpu) { + assert(!xb.empty()); + + if (index_mode_ != milvus::knowhere::IndexMode::MODE_CPU) { + return; + } + + // null faiss index + ASSERT_ANY_THROW(index_->Add(base_dataset, conf_)); + ASSERT_ANY_THROW(index_->AddWithoutIds(base_dataset, conf_)); + + index_->Train(base_dataset, conf_); + index_->AddWithoutIds(base_dataset, conf_); + EXPECT_EQ(index_->Count(), nb); + EXPECT_EQ(index_->Dim(), dim); + + milvus::knowhere::BinarySet bs = index_->Serialize(conf_); + + int64_t dim = base_dataset->Get(milvus::knowhere::meta::DIM); + int64_t rows = base_dataset->Get(milvus::knowhere::meta::ROWS); + auto raw_data = base_dataset->Get(milvus::knowhere::meta::TENSOR); + milvus::knowhere::BinaryPtr bptr = std::make_shared(); + bptr->data = std::shared_ptr((uint8_t*)raw_data, [&](uint8_t*) {}); + bptr->size = dim * rows * sizeof(float); + bs.Append(RAW_DATA, bptr); + index_->Load(bs); + + auto result = index_->Query(query_dataset, conf_); + AssertAnns(result, nq, k); + + faiss::ConcurrentBitsetPtr concurrent_bitset_ptr = std::make_shared(nb); + for (int64_t i = 0; i < nq; ++i) { + concurrent_bitset_ptr->set(i); + } + index_->SetBlacklist(concurrent_bitset_ptr); + + auto result_bs_1 = index_->Query(query_dataset, conf_); + AssertAnns(result_bs_1, nq, k, CheckMode::CHECK_NOT_EQUAL); +} diff --git a/core/src/index/unittest/test_ivf_gpu_nm.cpp b/core/src/index/unittest/test_ivf_gpu_nm.cpp new file mode 100644 index 0000000000..ec7087a264 --- /dev/null +++ b/core/src/index/unittest/test_ivf_gpu_nm.cpp @@ -0,0 +1,114 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. 
See the License for the specific language governing permissions and limitations under the License. + +#include + +#include +#include +#include +#include + +#ifdef MILVUS_GPU_VERSION +#include +#endif + +#include "knowhere/common/Exception.h" +#include "knowhere/common/Timer.h" +#include "knowhere/index/vector_index/IndexType.h" +#include "knowhere/index/vector_index/adapter/VectorAdapter.h" +#include "knowhere/index/vector_offset_index/IndexIVF_NM.h" + +#ifdef MILVUS_GPU_VERSION +#include "knowhere/index/vector_index/helpers/Cloner.h" +#include "knowhere/index/vector_index/helpers/FaissGpuResourceMgr.h" +#include "knowhere/index/vector_offset_index/gpu/IndexGPUIVF_NM.h" +#endif + +#include "unittest/Helper.h" +#include "unittest/utils.h" + +using ::testing::Combine; +using ::testing::TestWithParam; +using ::testing::Values; + +class IVFNMGPUTest : public DataGen, + public TestWithParam<::std::tuple> { + protected: + void + SetUp() override { +#ifdef MILVUS_GPU_VERSION + milvus::knowhere::FaissGpuResourceMgr::GetInstance().InitDevice(DEVICEID, PINMEM, TEMPMEM, RESNUM); +#endif + index_type_ = milvus::knowhere::IndexEnum::INDEX_FAISS_IVFFLAT; + index_mode_ = milvus::knowhere::IndexMode::MODE_GPU; + Generate(DIM, NB, NQ); +#ifdef MILVUS_GPU_VERSION + index_ = std::make_shared(DEVICEID); +#endif + conf_ = ParamGenerator::GetInstance().Gen(index_type_); + } + + void + TearDown() override { +#ifdef MILVUS_GPU_VERSION + milvus::knowhere::FaissGpuResourceMgr::GetInstance().Free(); +#endif + } + + protected: + milvus::knowhere::IndexType index_type_; + milvus::knowhere::IndexMode index_mode_; + milvus::knowhere::Config conf_; + milvus::knowhere::IVFPtr index_ = nullptr; +}; + +#ifdef MILVUS_GPU_VERSION +TEST_F(IVFNMGPUTest, ivf_basic_gpu) { + assert(!xb.empty()); + + if (index_mode_ != milvus::knowhere::IndexMode::MODE_GPU) { + return; + } + + // null faiss index + ASSERT_ANY_THROW(index_->Add(base_dataset, conf_)); + ASSERT_ANY_THROW(index_->AddWithoutIds(base_dataset, conf_)); + + index_->BuildAll(base_dataset, conf_); + EXPECT_EQ(index_->Count(), nb); + EXPECT_EQ(index_->Dim(), dim); + + milvus::knowhere::BinarySet bs = index_->Serialize(conf_); + + int64_t dim = base_dataset->Get(milvus::knowhere::meta::DIM); + int64_t rows = base_dataset->Get(milvus::knowhere::meta::ROWS); + auto raw_data = base_dataset->Get(milvus::knowhere::meta::TENSOR); + milvus::knowhere::BinaryPtr bptr = std::make_shared(); + bptr->data = std::shared_ptr((uint8_t*)raw_data, [&](uint8_t*) {}); + bptr->size = dim * rows * sizeof(float); + bs.Append(RAW_DATA, bptr); + index_->Load(bs); + + auto result = index_->Query(query_dataset, conf_); + AssertAnns(result, nq, k); + + faiss::ConcurrentBitsetPtr concurrent_bitset_ptr = std::make_shared(nb); + for (int64_t i = 0; i < nq; ++i) { + concurrent_bitset_ptr->set(i); + } + index_->SetBlacklist(concurrent_bitset_ptr); + + auto result_bs_1 = index_->Query(query_dataset, conf_); + AssertAnns(result_bs_1, nq, k, CheckMode::CHECK_NOT_EQUAL); + + milvus::knowhere::FaissGpuResourceMgr::GetInstance().Dump(); +} +#endif diff --git a/core/src/index/unittest/test_nsg.cpp b/core/src/index/unittest/test_nsg.cpp index c4dc46a869..350e4096b5 100644 --- a/core/src/index/unittest/test_nsg.cpp +++ b/core/src/index/unittest/test_nsg.cpp @@ -15,9 +15,8 @@ #include #include "knowhere/common/Exception.h" -#include "knowhere/index/vector_index/FaissBaseIndex.h" -#include "knowhere/index/vector_index/IndexNSG.h" #include "knowhere/index/vector_index/helpers/IndexParameter.h" +#include 
"knowhere/index/vector_offset_index/IndexNSG_NM.h" #ifdef MILVUS_GPU_VERSION #include "knowhere/index/vector_index/gpu/IndexGPUIDMAP.h" #include "knowhere/index/vector_index/helpers/Cloner.h" @@ -45,7 +44,7 @@ class NSGInterfaceTest : public DataGen, public ::testing::Test { #endif int nsg_dim = 256; Generate(nsg_dim, 20000, nq); - index_ = std::make_shared(); + index_ = std::make_shared(); train_conf = milvus::knowhere::Config{{milvus::knowhere::meta::DIM, 256}, {milvus::knowhere::IndexParams::nlist, 163}, @@ -70,7 +69,7 @@ class NSGInterfaceTest : public DataGen, public ::testing::Test { } protected: - std::shared_ptr index_; + std::shared_ptr index_; milvus::knowhere::Config train_conf; milvus::knowhere::Config search_conf; }; @@ -88,35 +87,44 @@ TEST_F(NSGInterfaceTest, basic_test) { train_conf[milvus::knowhere::meta::DEVICEID] = -1; index_->BuildAll(base_dataset, train_conf); + + // Serialize and Load before Query + milvus::knowhere::BinarySet bs = index_->Serialize(); + + int64_t dim = base_dataset->Get(milvus::knowhere::meta::DIM); + int64_t rows = base_dataset->Get(milvus::knowhere::meta::ROWS); + auto raw_data = base_dataset->Get(milvus::knowhere::meta::TENSOR); + milvus::knowhere::BinaryPtr bptr = std::make_shared(); + bptr->data = std::shared_ptr((uint8_t*)raw_data, [&](uint8_t*) {}); + bptr->size = dim * rows * sizeof(float); + bs.Append(RAW_DATA, bptr); + + index_->Load(bs); + auto result = index_->Query(query_dataset, search_conf); AssertAnns(result, nq, k); - auto binaryset = index_->Serialize(); - { - fiu_enable("NSG.Serialize.throw_exception", 1, nullptr, 0); - ASSERT_ANY_THROW(index_->Serialize()); - fiu_disable("NSG.Serialize.throw_exception"); - } - /* test NSG GPU train */ - auto new_index_1 = std::make_shared(DEVICE_GPU0); + auto new_index_1 = std::make_shared(DEVICE_GPU0); train_conf[milvus::knowhere::meta::DEVICEID] = DEVICE_GPU0; new_index_1->BuildAll(base_dataset, train_conf); + + // Serialize and Load before Query + bs = new_index_1->Serialize(); + + dim = base_dataset->Get(milvus::knowhere::meta::DIM); + rows = base_dataset->Get(milvus::knowhere::meta::ROWS); + raw_data = base_dataset->Get(milvus::knowhere::meta::TENSOR); + bptr = std::make_shared(); + bptr->data = std::shared_ptr((uint8_t*)raw_data, [&](uint8_t*) {}); + bptr->size = dim * rows * sizeof(float); + bs.Append(RAW_DATA, bptr); + + new_index_1->Load(bs); + auto new_result_1 = new_index_1->Query(query_dataset, search_conf); AssertAnns(new_result_1, nq, k); - /* test NSG index load */ - auto new_index_2 = std::make_shared(); - new_index_2->Load(binaryset); - { - fiu_enable("NSG.Load.throw_exception", 1, nullptr, 0); - ASSERT_ANY_THROW(new_index_2->Load(binaryset)); - fiu_disable("NSG.Load.throw_exception"); - } - - auto new_result_2 = new_index_2->Query(query_dataset, search_conf); - AssertAnns(new_result_2, nq, k); - ASSERT_EQ(index_->Count(), nb); ASSERT_EQ(index_->Dim(), dim); } @@ -142,6 +150,19 @@ TEST_F(NSGInterfaceTest, delete_test) { train_conf[milvus::knowhere::meta::DEVICEID] = DEVICE_GPU0; index_->Train(base_dataset, train_conf); + // Serialize and Load before Query + milvus::knowhere::BinarySet bs = index_->Serialize(); + + int64_t dim = base_dataset->Get(milvus::knowhere::meta::DIM); + int64_t rows = base_dataset->Get(milvus::knowhere::meta::ROWS); + auto raw_data = base_dataset->Get(milvus::knowhere::meta::TENSOR); + milvus::knowhere::BinaryPtr bptr = std::make_shared(); + bptr->data = std::shared_ptr((uint8_t*)raw_data, [&](uint8_t*) {}); + bptr->size = dim * rows * sizeof(float); + 
bs.Append(RAW_DATA, bptr); + + index_->Load(bs); + auto result = index_->Query(query_dataset, search_conf); AssertAnns(result, nq, k); @@ -157,6 +178,19 @@ TEST_F(NSGInterfaceTest, delete_test) { // search xq with delete index_->SetBlacklist(bitset); + + // Serialize and Load before Query + bs = index_->Serialize(); + + dim = base_dataset->Get(milvus::knowhere::meta::DIM); + rows = base_dataset->Get(milvus::knowhere::meta::ROWS); + raw_data = base_dataset->Get(milvus::knowhere::meta::TENSOR); + bptr = std::make_shared(); + bptr->data = std::shared_ptr((uint8_t*)raw_data, [&](uint8_t*) {}); + bptr->size = dim * rows * sizeof(float); + bs.Append(RAW_DATA, bptr); + + index_->Load(bs); auto result_after = index_->Query(query_dataset, search_conf); AssertAnns(result_after, nq, k, CheckMode::CHECK_NOT_EQUAL); auto I_after = result_after->Get(milvus::knowhere::meta::IDS); diff --git a/core/src/scheduler/task/BuildIndexTask.cpp b/core/src/scheduler/task/BuildIndexTask.cpp index 68cab7c2b3..2a3be72553 100644 --- a/core/src/scheduler/task/BuildIndexTask.cpp +++ b/core/src/scheduler/task/BuildIndexTask.cpp @@ -220,9 +220,10 @@ XBuildIndexTask::Execute() { LOG_ENGINE_DEBUG_ << "New index file " << table_file.file_id_ << " of size " << table_file.file_size_ << " bytes" << " from file " << origin_file.file_id_; - if (build_index_job->options().insert_cache_immediately_) { - index->Cache(); - } + // XXX_Index_NM doesn't support it yet. + // if (build_index_job->options().insert_cache_immediately_) { + // index->Cache(); + // } } else { // failed to update meta, mark the new file as to_delete, don't delete old file origin_file.file_type_ = engine::meta::SegmentSchema::TO_INDEX; diff --git a/core/src/segment/SegmentReader.cpp b/core/src/segment/SegmentReader.cpp index 62a1358b17..888f086093 100644 --- a/core/src/segment/SegmentReader.cpp +++ b/core/src/segment/SegmentReader.cpp @@ -123,6 +123,22 @@ SegmentReader::LoadVectorIndex(const std::string& location, segment::VectorIndex return Status::OK(); } +Status +SegmentReader::LoadVectorIndexWithRawData(const std::string& location, segment::VectorIndexPtr& vector_index_ptr) { + codec::DefaultCodec default_codec; + try { + fs_ptr_->operation_ptr_->CreateDirectory(); + knowhere::BinaryPtr raw_data = nullptr; + default_codec.GetVectorsFormat()->read_vectors(fs_ptr_, raw_data); + default_codec.GetVectorIndexFormat()->read(fs_ptr_, location, raw_data, vector_index_ptr); + } catch (std::exception& e) { + std::string err_msg = "Failed to load vector index with raw data: " + std::string(e.what()); + LOG_ENGINE_ERROR_ << err_msg; + return Status(DB_ERROR, err_msg); + } + return Status::OK(); +} + Status SegmentReader::LoadBloomFilter(segment::IdBloomFilterPtr& id_bloom_filter_ptr) { codec::DefaultCodec default_codec; diff --git a/core/src/segment/SegmentReader.h b/core/src/segment/SegmentReader.h index a69b1f3685..83e2201165 100644 --- a/core/src/segment/SegmentReader.h +++ b/core/src/segment/SegmentReader.h @@ -51,6 +51,9 @@ class SegmentReader { Status LoadVectorIndex(const std::string& location, segment::VectorIndexPtr& vector_index_ptr); + Status + LoadVectorIndexWithRawData(const std::string& location, segment::VectorIndexPtr& vector_index_ptr); + Status LoadBloomFilter(segment::IdBloomFilterPtr& id_bloom_filter_ptr); diff --git a/core/unittest/db/utils.cpp b/core/unittest/db/utils.cpp index 97458572e1..d4a47c8bbb 100644 --- a/core/unittest/db/utils.cpp +++ b/core/unittest/db/utils.cpp @@ -189,7 +189,7 @@ DBTest::SetUp() { 
milvus::scheduler::CPUBuilderInst::GetInstance()->Start(); auto options = GetOptions(); - options.insert_cache_immediately_ = true; + // options.insert_cache_immediately_ = true; BuildDB(options); std::string config_path(options.meta_.path_ + CONFIG_FILE); diff --git a/sdk/examples/simple/src/ClientTest.cpp b/sdk/examples/simple/src/ClientTest.cpp index 5c99483ead..138f3c870d 100644 --- a/sdk/examples/simple/src/ClientTest.cpp +++ b/sdk/examples/simple/src/ClientTest.cpp @@ -319,4 +319,4 @@ ClientTest::Test() { DropIndex(collection_name); DropCollection(collection_name); -} +} \ No newline at end of file diff --git a/sdk/examples/simple/src/ClientTest.h b/sdk/examples/simple/src/ClientTest.h index 1d41e12325..620cbf1e3d 100644 --- a/sdk/examples/simple/src/ClientTest.h +++ b/sdk/examples/simple/src/ClientTest.h @@ -85,4 +85,4 @@ class ClientTest { std::shared_ptr conn_; std::vector> search_entity_array_; std::vector search_id_array_; -}; +}; \ No newline at end of file diff --git a/sdk/examples/utils/Utils.cpp b/sdk/examples/utils/Utils.cpp index 2562c1a4bf..83fb1a4e1d 100644 --- a/sdk/examples/utils/Utils.cpp +++ b/sdk/examples/utils/Utils.cpp @@ -365,4 +365,4 @@ Utils::PrintTopKHybridQueryResult(milvus::TopKHybridQueryResult& topk_query_resu } } -} // namespace milvus_sdk +} // namespace milvus_sdk \ No newline at end of file diff --git a/sdk/examples/utils/Utils.h b/sdk/examples/utils/Utils.h index e0c0cb7888..cdbeccf0ac 100644 --- a/sdk/examples/utils/Utils.h +++ b/sdk/examples/utils/Utils.h @@ -84,4 +84,4 @@ class Utils { PrintTopKHybridQueryResult(milvus::TopKHybridQueryResult& topk_query_result); }; -} // namespace milvus_sdk +} // namespace milvus_sdk \ No newline at end of file
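
Note on the recurring test pattern above: because the new *_NM indexes no longer serialize the raw vectors, every Query() in the updated unit tests is preceded by Serialize(), an explicit Append() of the original vectors under the RAW_DATA key, and a Load(). The sketch below distills that pattern from the IVF_NM CPU test; it is illustrative only and not part of the patch. It assumes the test-fixture members used above (index_, conf_, base_dataset, query_dataset), and the template arguments that were dropped in this diff rendering (Get<int64_t>, Get<const void*>, shared_ptr<uint8_t[]>, make_shared<milvus::knowhere::Binary>) are reconstructed as assumptions rather than quoted verbatim from the source.

    // Illustrative sketch: Serialize -> Append(RAW_DATA) -> Load, as required before
    // querying an *_NM index that does not store the raw vectors itself.
    milvus::knowhere::BinarySet bs = index_->Serialize(conf_);   // index without raw vectors

    // Fetch the raw vectors from the dataset that was used to build the index.
    int64_t dim = base_dataset->Get<int64_t>(milvus::knowhere::meta::DIM);
    int64_t rows = base_dataset->Get<int64_t>(milvus::knowhere::meta::ROWS);
    auto raw_data = base_dataset->Get<const void*>(milvus::knowhere::meta::TENSOR);

    // Wrap them in a non-owning Binary: the no-op deleter leaves ownership with the dataset.
    milvus::knowhere::BinaryPtr bptr = std::make_shared<milvus::knowhere::Binary>();
    bptr->data = std::shared_ptr<uint8_t[]>((uint8_t*)raw_data, [](uint8_t*) {});
    bptr->size = dim * rows * sizeof(float);
    bs.Append(RAW_DATA, bptr);

    // Load() rebuilds the in-memory index together with the raw vectors it no longer
    // persists; only after this step is Query() expected to succeed.
    index_->Load(bs);
    auto result = index_->Query(query_dataset, conf_);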