Structured index (#2418)

* Support Structured Index Based on Sort

Signed-off-by: cmli <chengming.li@zilliz.com>

* fix lint errors and make clang-format

Signed-off-by: cmli <chengming.li@zilliz.com>

* pick some advices from laojin

Signed-off-by: cmli <chengming.li@zilliz.com>

* make clang-format

Signed-off-by: cmli <chengming.li@zilliz.com>

* rename n_ to size_, replace self-implemented lower_bound and upper_bound by std implemention as laojin requires

Signed-off-by: cmli <chengming.li@zilliz.com>

* fix lint error

Signed-off-by: cmli <chengming.li@zilliz.com>

* update implementation by shengjun's advice

Signed-off-by: cmli <chengming.li@zilliz.com>

* remove index member size_

Signed-off-by: cmli <chengming.li@zilliz.com>

Co-authored-by: cmli <chengming.li@zilliz.com>
pull/2435/head^2
op-hunter 2020-05-26 14:41:21 +08:00 committed by GitHub
parent 9aa26d4e9c
commit 098f5a534d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 615 additions and 2 deletions

View File

@ -56,6 +56,7 @@ Please mark all change in change log and use the issue from GitHub
- \#2240 Obtain running rpc requests information
- \#2268 Intelligently detect openblas library in system to avoid installing from source code every time
- \#2283 Suspend the building tasks when any query comand arrives.
- \#2417 Support Structured Index Based on Sort
## Improvement
- \#221 Refactor LOG macro

View File

@ -0,0 +1,45 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include <memory>
#include "faiss/utils/ConcurrentBitset.h"
#include "knowhere/index/Index.h"
namespace milvus {
namespace knowhere {
enum OperatorType { LT = 0, LE, GT, GE };
template <typename T>
class StructuredIndex : public Index {
public:
virtual void
Build(const size_t n, const T* values) = 0;
virtual const faiss::ConcurrentBitsetPtr
In(const size_t n, const T* values) = 0;
virtual const faiss::ConcurrentBitsetPtr
NotIn(const size_t n, const T* values) = 0;
virtual const faiss::ConcurrentBitsetPtr
Range(const T value, const OperatorType op) = 0;
virtual const faiss::ConcurrentBitsetPtr
Range(const T lower_bound_value, bool lb_inclusive, const T upper_bound_value, bool ub_inclusive) = 0;
};
template <typename T>
using StructuredIndexPtr = std::shared_ptr<StructuredIndex<T>>;
} // namespace knowhere
} // namespace milvus

View File

@ -0,0 +1,199 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <src/index/knowhere/knowhere/common/Log.h>
#include <algorithm>
#include <memory>
#include <utility>
#include "knowhere/index/structured_index/StructuredIndexSort.h"
namespace milvus {
namespace knowhere {
template <typename T>
StructuredIndexSort<T>::StructuredIndexSort() : is_built_(false), data_(nullptr) {
}
template <typename T>
StructuredIndexSort<T>::StructuredIndexSort(const size_t n, const T* values) : is_built_(false) {
Build(n, values);
}
template <typename T>
StructuredIndexSort<T>::~StructuredIndexSort() {
}
template <typename T>
void
StructuredIndexSort<T>::Build(const size_t n, const T* values) {
data_.reserve(n);
T* p = const_cast<T*>(values);
for (size_t i = 0; i < n; ++i) {
data_.emplace_back(IndexStructure(*p++, i));
}
build();
}
template <typename T>
void
StructuredIndexSort<T>::build() {
if (is_built_)
return;
if (data_.size() == 0) {
// todo: throw an exception
KNOWHERE_THROW_MSG("StructuredIndexSort cannot build null values!");
}
std::sort(data_.begin(), data_.end());
is_built_ = true;
}
template <typename T>
BinarySet
StructuredIndexSort<T>::Serialize(const milvus::knowhere::Config& config) {
if (!is_built_) {
build();
}
auto index_data_size = data_.size() * sizeof(IndexStructure<T>);
std::shared_ptr<uint8_t[]> index_data(new uint8_t[index_data_size]);
memcpy(index_data.get(), data_.data(), index_data_size);
std::shared_ptr<uint8_t[]> index_length(new uint8_t[sizeof(size_t)]);
auto index_size = data_.size();
memcpy(index_length.get(), &index_size, sizeof(size_t));
BinarySet res_set;
res_set.Append("index_data", index_data, index_data_size);
res_set.Append("index_length", index_length, sizeof(size_t));
return res_set;
}
template <typename T>
void
StructuredIndexSort<T>::Load(const milvus::knowhere::BinarySet& index_binary) {
try {
size_t index_size;
auto index_length = index_binary.GetByName("index_length");
memcpy(&index_size, index_length->data.get(), (size_t)index_length->size);
auto index_data = index_binary.GetByName("index_data");
data_.resize(index_size);
memcpy(data_.data(), index_data->data.get(), (size_t)index_data->size);
is_built_ = true;
} catch (...) {
KNOHWERE_ERROR_MSG("StructuredIndexSort Load failed!");
}
}
template <typename T>
const faiss::ConcurrentBitsetPtr
StructuredIndexSort<T>::In(const size_t n, const T* values) {
if (!is_built_) {
build();
}
faiss::ConcurrentBitsetPtr bitset = std::make_shared<faiss::ConcurrentBitset>(data_.size());
for (size_t i = 0; i < n; ++i) {
auto lb = std::lower_bound(data_.begin(), data_.end(), IndexStructure<T>(*(values + i)));
auto ub = std::upper_bound(data_.begin(), data_.end(), IndexStructure<T>(*(values + i)));
for (; lb < ub; ++lb) {
if (lb->a_ != *(values + i)) {
LOG_KNOWHERE_ERROR_ << "error happens in StructuredIndexSort<T>::In, experted value is: "
<< *(values + i) << ", but real value is: " << lb->a_;
}
bitset->set(lb->idx_);
}
}
return bitset;
}
template <typename T>
const faiss::ConcurrentBitsetPtr
StructuredIndexSort<T>::NotIn(const size_t n, const T* values) {
if (!is_built_) {
build();
}
faiss::ConcurrentBitsetPtr bitset = std::make_shared<faiss::ConcurrentBitset>(data_.size(), 0xff);
for (size_t i = 0; i < n; ++i) {
auto lb = std::lower_bound(data_.begin(), data_.end(), IndexStructure<T>(*(values + i)));
auto ub = std::upper_bound(data_.begin(), data_.end(), IndexStructure<T>(*(values + i)));
for (; lb < ub; ++lb) {
if (lb->a_ != *(values + i)) {
LOG_KNOWHERE_ERROR_ << "error happens in StructuredIndexSort<T>::NotIn, experted value is: "
<< *(values + i) << ", but real value is: " << lb->a_;
}
bitset->clear(lb->idx_);
}
}
return bitset;
}
template <typename T>
const faiss::ConcurrentBitsetPtr
StructuredIndexSort<T>::Range(const T value, const OperatorType op) {
if (!is_built_) {
build();
}
faiss::ConcurrentBitsetPtr bitset = std::make_shared<faiss::ConcurrentBitset>(data_.size());
auto lb = data_.begin();
auto ub = data_.end();
switch (op) {
case OperatorType::LT:
ub = std::lower_bound(data_.begin(), data_.end(), IndexStructure<T>(value));
break;
case OperatorType::LE:
ub = std::upper_bound(data_.begin(), data_.end(), IndexStructure<T>(value));
break;
case OperatorType::GT:
lb = std::upper_bound(data_.begin(), data_.end(), IndexStructure<T>(value));
break;
case OperatorType::GE:
lb = std::lower_bound(data_.begin(), data_.end(), IndexStructure<T>(value));
break;
default:
KNOWHERE_THROW_MSG("Invalid OperatorType:" + std::to_string((int)op) + "!");
}
for (; lb < ub; ++lb) {
bitset->set(lb->idx_);
}
return bitset;
}
template <typename T>
const faiss::ConcurrentBitsetPtr
StructuredIndexSort<T>::Range(T lower_bound_value, bool lb_inclusive, T upper_bound_value, bool ub_inclusive) {
if (!is_built_) {
build();
}
faiss::ConcurrentBitsetPtr bitset = std::make_shared<faiss::ConcurrentBitset>(data_.size());
if (lower_bound_value > upper_bound_value) {
std::swap(lower_bound_value, upper_bound_value);
std::swap(lb_inclusive, ub_inclusive);
}
auto lb = data_.begin();
auto ub = data_.end();
if (lb_inclusive) {
lb = std::lower_bound(data_.begin(), data_.end(), IndexStructure<T>(lower_bound_value));
} else {
lb = std::upper_bound(data_.begin(), data_.end(), IndexStructure<T>(lower_bound_value));
}
if (ub_inclusive) {
ub = std::upper_bound(data_.begin(), data_.end(), IndexStructure<T>(upper_bound_value));
} else {
ub = std::lower_bound(data_.begin(), data_.end(), IndexStructure<T>(upper_bound_value));
}
for (; lb < ub; ++lb) {
bitset->set(lb->idx_);
}
return bitset;
}
} // namespace knowhere
} // namespace milvus

View File

@ -0,0 +1,100 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include <algorithm>
#include <memory>
#include <utility>
#include <vector>
#include "knowhere/common/Exception.h"
#include "knowhere/index/structured_index/StructuredIndex.h"
namespace milvus {
namespace knowhere {
template <typename T>
struct IndexStructure {
IndexStructure() : a_(0), idx_(0) {
}
explicit IndexStructure(const T a) : a_(a), idx_(0) {
}
IndexStructure(const T a, const size_t idx) : a_(a), idx_(idx) {
}
bool
operator<(const IndexStructure& b) const {
return a_ < b.a_;
}
bool
operator==(const IndexStructure& b) const {
return a_ == b.a_;
}
T a_;
size_t idx_;
};
template <typename T>
class StructuredIndexSort : public StructuredIndex<T> {
public:
StructuredIndexSort();
StructuredIndexSort(const size_t n, const T* values);
~StructuredIndexSort();
BinarySet
Serialize(const Config& config = Config()) override;
void
Load(const BinarySet& index_binary) override;
void
Build(const size_t n, const T* values) override;
void
build();
const faiss::ConcurrentBitsetPtr
In(const size_t n, const T* values) override;
const faiss::ConcurrentBitsetPtr
NotIn(const size_t n, const T* values) override;
const faiss::ConcurrentBitsetPtr
Range(const T value, const OperatorType op) override;
const faiss::ConcurrentBitsetPtr
Range(T lower_bound_value, bool lb_inclusive, T upper_bound_value, bool ub_inclusive) override;
const std::vector<IndexStructure<T>>&
GetData() {
return data_;
}
int64_t
Size() override {
return (int64_t)data_.size();
}
bool
IsBuilt() const {
return is_built_;
}
private:
bool is_built_;
std::vector<IndexStructure<T>> data_;
};
template <typename T>
using StructuredIndexSortPtr = std::shared_ptr<StructuredIndexSort<T>>;
} // namespace knowhere
} // namespace milvus
#include "knowhere/index/structured_index/StructuredIndexSort-inl.h"

View File

@ -15,11 +15,15 @@
// specific language governing permissions and limitations
// under the License.
#include <cstring>
#include "ConcurrentBitset.h"
namespace faiss {
ConcurrentBitset::ConcurrentBitset(id_type_t capacity) : capacity_(capacity), bitset_((capacity + 8 - 1) >> 3) {
ConcurrentBitset::ConcurrentBitset(id_type_t capacity, uint8_t init_value) : capacity_(capacity), bitset_(((capacity + 8 - 1) >> 3)) {
if (init_value) {
memset(mutable_data(), init_value, (capacity + 8 - 1) >> 3);
}
}
std::vector<std::atomic<uint8_t>>&
@ -202,4 +206,8 @@ ConcurrentBitset::data() {
return reinterpret_cast<const uint8_t*>(bitset_.data());
}
uint8_t*
ConcurrentBitset::mutable_data() {
return reinterpret_cast<uint8_t*>(bitset_.data());
}
} // namespace faiss

View File

@ -27,7 +27,7 @@ class ConcurrentBitset {
public:
using id_type_t = int64_t;
explicit ConcurrentBitset(id_type_t size);
explicit ConcurrentBitset(id_type_t size, uint8_t init_value = 0);
// ConcurrentBitset(const ConcurrentBitset&) = delete;
// ConcurrentBitset&
@ -69,6 +69,9 @@ class ConcurrentBitset {
const uint8_t*
data();
uint8_t*
mutable_data();
private:
size_t capacity_;
std::vector<std::atomic<uint8_t>> bitset_;

View File

@ -198,6 +198,16 @@ endif ()
target_link_libraries(test_annoy ${depend_libs} ${unittest_libs} ${basic_libs})
install(TARGETS test_annoy DESTINATION unittest)
################################################################################
#<STRUCTURED-INDEX-SORT-TEST>
set(structured_index_sort_srcs
${INDEX_SOURCE_DIR}/knowhere/knowhere/index/structured_index/StructuredIndexSort-inl.h
)
if (NOT TARGET test_structured_index_sort)
add_executable(test_structured_index_sort test_structured_index_sort.cpp ${structured_index_sort_srcs} ${util_srcs})
endif ()
target_link_libraries(test_structured_index_sort ${depend_libs} ${unittest_libs} ${basic_libs})
install(TARGETS test_structured_index_sort DESTINATION unittest)
#add_subdirectory(faiss_ori)
#add_subdirectory(faiss_benchmark)

View File

@ -0,0 +1,247 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License.
#include <gtest/gtest.h>
#include <algorithm>
#include <iostream>
#include <sstream>
#include "knowhere/index/structured_index/StructuredIndexSort.h"
#include "unittest/utils.h"
void
gen_rand_data(int range, int n, int*& p) {
srand((unsigned int)time(nullptr));
p = (int*)malloc(n * sizeof(int));
int* q = p;
for (auto i = 0; i < n; ++i) {
*q++ = (int)random() % range;
}
}
TEST(STRUCTUREDINDEXSORT_TEST, test_build) {
int range = 100, n = 1000, *p = nullptr;
gen_rand_data(range, n, p);
milvus::knowhere::StructuredIndexSort<int> structuredIndexSort((size_t)n, p); // Build default
std::sort(p, p + n);
const std::vector<milvus::knowhere::IndexStructure<int>> index_data = structuredIndexSort.GetData();
for (auto i = 0; i < n; ++i) {
ASSERT_EQ(*(p + i), index_data[i].a_);
}
free(p);
}
TEST(STRUCTUREDINDEXSORT_TEST, test_serialize_and_load) {
auto serialize = [](const std::string& filename, milvus::knowhere::BinaryPtr& bin, uint8_t* ret) {
{
// write and flush
FileIOWriter writer(filename);
writer(static_cast<void*>(bin->data.get()), bin->size);
}
FileIOReader reader(filename);
reader(ret, bin->size);
};
int range = 100, n = 1000, *p = nullptr;
gen_rand_data(range, n, p);
milvus::knowhere::StructuredIndexSort<int> structuredIndexSort((size_t)n, p); // Build default
auto binaryset = structuredIndexSort.Serialize();
auto bin_data = binaryset.GetByName("index_data");
std::string data_file = "/tmp/sort_test_data_serialize.bin";
auto load_data = new uint8_t[bin_data->size];
serialize(data_file, bin_data, load_data);
auto bin_length = binaryset.GetByName("index_length");
std::string length_file = "/tmp/sort_test_length_serialize.bin";
auto load_length = new uint8_t[bin_length->size];
serialize(length_file, bin_length, load_length);
binaryset.clear();
std::shared_ptr<uint8_t[]> index_data(load_data);
binaryset.Append("index_data", index_data, bin_data->size);
std::shared_ptr<uint8_t[]> length_data(load_length);
binaryset.Append("index_length", length_data, bin_length->size);
structuredIndexSort.Load(binaryset);
EXPECT_EQ(n, (int)structuredIndexSort.Size());
EXPECT_EQ(true, structuredIndexSort.IsBuilt());
std::sort(p, p + n);
const std::vector<milvus::knowhere::IndexStructure<int>> const_index_data = structuredIndexSort.GetData();
for (auto i = 0; i < n; ++i) {
ASSERT_EQ(*(p + i), const_index_data[i].a_);
}
free(p);
}
TEST(STRUCTUREDINDEXSORT_TEST, test_in) {
int range = 1000, n = 1000, *p = nullptr;
gen_rand_data(range, n, p);
milvus::knowhere::StructuredIndexSort<int> structuredIndexSort((size_t)n, p); // Build default
size_t test_times = 10;
std::vector<int> test_vals, test_off;
test_vals.reserve(test_times);
test_off.reserve(test_times);
// std::cout << "STRUCTUREDINDEXSORT_TEST test_in" << std::endl;
for (auto i = 0; i < test_times; ++i) {
auto off = random() % n;
test_vals.emplace_back(*(p + off));
test_off.emplace_back(off);
// std::cout << "val: " << *(p + off) << ", off: " << off << std::endl;
}
auto res = structuredIndexSort.In(test_times, test_vals.data());
for (auto i = 0; i < test_times; ++i) {
// std::cout << test_off[i] << " ";
ASSERT_EQ(true, res->test(test_off[i]));
}
free(p);
}
TEST(STRUCTUREDINDEXSORT_TEST, test_not_in) {
int range = 10000, n = 1000, *p = nullptr;
gen_rand_data(range, n, p);
milvus::knowhere::StructuredIndexSort<int> structuredIndexSort((size_t)n, p); // Build default
size_t test_times = 10;
std::vector<int> test_vals, test_off;
test_vals.reserve(test_times);
test_off.reserve(test_times);
// std::cout << "STRUCTUREDINDEXSORT_TEST test_notin" << std::endl;
for (auto i = 0; i < test_times; ++i) {
auto off = random() % n;
test_vals.emplace_back(*(p + off));
test_off.emplace_back(off);
// std::cout << off << " ";
}
// std::cout << std::endl;
auto res = structuredIndexSort.NotIn(test_times, test_vals.data());
// std::cout << "assert values: " << std::endl;
for (auto i = 0; i < test_times; ++i) {
// std::cout << test_off[i] << " ";
ASSERT_EQ(false, res->test(test_off[i]));
}
// std::cout << std::endl;
free(p);
}
TEST(STRUCTUREDINDEXSORT_TEST, test_single_border_range) {
int range = 100, n = 1000, *p = nullptr;
gen_rand_data(range, n, p);
milvus::knowhere::StructuredIndexSort<int> structuredIndexSort((size_t)n, p); // Build default
srand((unsigned int)time(nullptr));
int val;
// test LT
val = (int)random() % 100;
auto lt_res = structuredIndexSort.Range(val, milvus::knowhere::OperatorType::LT);
for (auto i = 0; i < n; ++i) {
if (*(p + i) < val)
ASSERT_EQ(true, lt_res->test(i));
else
ASSERT_EQ(false, lt_res->test(i));
}
// test LE
val = (int)random() % 100;
auto le_res = structuredIndexSort.Range(val, milvus::knowhere::OperatorType::LE);
for (auto i = 0; i < n; ++i) {
if (*(p + i) <= val)
ASSERT_EQ(true, le_res->test(i));
else
ASSERT_EQ(false, le_res->test(i));
}
// test GE
val = (int)random() % 100;
auto ge_res = structuredIndexSort.Range(val, milvus::knowhere::OperatorType::GE);
for (auto i = 0; i < n; ++i) {
if (*(p + i) >= val)
ASSERT_EQ(true, ge_res->test(i));
else
ASSERT_EQ(false, ge_res->test(i));
}
// test GT
val = (int)random() % 100;
auto gt_res = structuredIndexSort.Range(val, milvus::knowhere::OperatorType::GT);
for (auto i = 0; i < n; ++i) {
if (*(p + i) > val)
ASSERT_EQ(true, gt_res->test(i));
else
ASSERT_EQ(false, gt_res->test(i));
}
free(p);
}
TEST(STRUCTUREDINDEXSORT_TEST, test_double_border_range) {
int range = 100, n = 1000, *p = nullptr;
gen_rand_data(range, n, p);
milvus::knowhere::StructuredIndexSort<int> structuredIndexSort((size_t)n, p); // Build default
srand((unsigned int)time(nullptr));
int lb, ub;
// []
lb = (int)random() % 100;
ub = (int)random() % 100;
if (lb > ub)
std::swap(lb, ub);
auto res1 = structuredIndexSort.Range(lb, true, ub, true);
for (auto i = 0; i < n; ++i) {
if (*(p + i) >= lb && *(p + i) <= ub)
ASSERT_EQ(true, res1->test(i));
else
ASSERT_EQ(false, res1->test(i));
}
// [)
lb = (int)random() % 100;
ub = (int)random() % 100;
if (lb > ub)
std::swap(lb, ub);
auto res2 = structuredIndexSort.Range(lb, true, ub, false);
for (auto i = 0; i < n; ++i) {
if (*(p + i) >= lb && *(p + i) < ub)
ASSERT_EQ(true, res2->test(i));
else
ASSERT_EQ(false, res2->test(i));
}
// (]
lb = (int)random() % 100;
ub = (int)random() % 100;
if (lb > ub)
std::swap(lb, ub);
auto res3 = structuredIndexSort.Range(lb, false, ub, true);
for (auto i = 0; i < n; ++i) {
if (*(p + i) > lb && *(p + i) <= ub)
ASSERT_EQ(true, res3->test(i));
else
ASSERT_EQ(false, res3->test(i));
}
// ()
lb = (int)random() % 100;
ub = (int)random() % 100;
if (lb > ub)
std::swap(lb, ub);
auto res4 = structuredIndexSort.Range(lb, false, ub, false);
for (auto i = 0; i < n; ++i) {
if (*(p + i) > lb && *(p + i) < ub)
ASSERT_EQ(true, res4->test(i));
else
ASSERT_EQ(false, res4->test(i));
}
free(p);
}