enhance: support null value in index (#35238)

#31728

---------

Signed-off-by: lixinguo <xinguo.li@zilliz.com>
Co-authored-by: lixinguo <xinguo.li@zilliz.com>
pull/35525/head
smellthemoon 2024-08-16 15:30:54 +08:00 committed by GitHub
parent f87af9bc54
commit 80dbe87759
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
28 changed files with 921 additions and 158 deletions

View File

@ -69,3 +69,5 @@ const int64_t DEFAULT_MAX_OUTPUT_SIZE = 67108864; // bytes, 64MB
const int64_t DEFAULT_CHUNK_MANAGER_REQUEST_TIMEOUT_MS = 10000;
const int64_t DEFAULT_BITMAP_INDEX_CARDINALITY_BOUND = 500;
const size_t MARISA_NULL_KEY_ID = -1;

View File

@ -69,8 +69,8 @@ FieldDataImpl<Type, is_type_entire_row>::FillFieldData(
ssize_t byte_count = (element_count + 7) / 8;
// Note: if 'nullable == true` and valid_data is nullptr
// means null_count == 0, will fill it with 0xFF
if (valid_data == nullptr) {
valid_data_.resize(byte_count, 0xFF);
if (!valid_data) {
valid_data_.assign(byte_count, 0xFF);
} else {
std::copy_n(valid_data, byte_count, valid_data_.data());
}

View File

@ -476,7 +476,7 @@ class FieldDataJsonImpl : public FieldDataImpl<Json, true> {
if (IsNullable()) {
auto valid_data = array->null_bitmap_data();
if (valid_data == nullptr) {
valid_data_.resize((n + 7) / 8, 0xFF);
valid_data_.assign((n + 7) / 8, 0xFF);
} else {
std::copy_n(valid_data, (n + 7) / 8, valid_data_.data());
}

View File

@ -69,11 +69,14 @@ BitmapIndex<T>::Build(size_t n, const T* data) {
PanicInfo(DataIsEmpty, "BitmapIndex can not build null values");
}
total_num_rows_ = n;
valid_bitset = TargetBitmap(total_num_rows_, false);
T* p = const_cast<T*>(data);
for (int i = 0; i < n; ++i, ++p) {
data_[*p].add(i);
valid_bitset.set(i);
}
total_num_rows_ = n;
if (data_.size() < DEFAULT_BITMAP_INDEX_CARDINALITY_BOUND) {
for (auto it = data_.begin(); it != data_.end(); ++it) {
@ -95,8 +98,11 @@ BitmapIndex<T>::BuildPrimitiveField(
for (const auto& data : field_datas) {
auto slice_row_num = data->get_num_rows();
for (size_t i = 0; i < slice_row_num; ++i) {
auto val = reinterpret_cast<const T*>(data->RawValue(i));
data_[*val].add(offset);
if (data->is_valid(i)) {
auto val = reinterpret_cast<const T*>(data->RawValue(i));
data_[*val].add(offset);
valid_bitset.set(offset);
}
offset++;
}
}
@ -114,6 +120,7 @@ BitmapIndex<T>::BuildWithFieldData(
PanicInfo(DataIsEmpty, "scalar bitmap index can not build null values");
}
total_num_rows_ = total_num_rows;
valid_bitset = TargetBitmap(total_num_rows_, false);
switch (schema_.data_type()) {
case proto::schema::DataType::Bool:
@ -151,12 +158,14 @@ BitmapIndex<T>::BuildArrayField(const std::vector<FieldDataPtr>& field_datas) {
for (const auto& data : field_datas) {
auto slice_row_num = data->get_num_rows();
for (size_t i = 0; i < slice_row_num; ++i) {
auto array =
reinterpret_cast<const milvus::Array*>(data->RawValue(i));
for (size_t j = 0; j < array->length(); ++j) {
auto val = static_cast<T>(array->template get_data<GetType>(j));
data_[val].add(offset);
if (data->is_valid(i)) {
auto array =
reinterpret_cast<const milvus::Array*>(data->RawValue(i));
for (size_t j = 0; j < array->length(); ++j) {
auto val = array->template get_data<T>(j);
data_[val].add(offset);
}
valid_bitset.set(offset);
}
offset++;
}
@ -330,6 +339,9 @@ BitmapIndex<T>::DeserializeIndexData(const uint8_t* data_ptr,
} else {
data_[key] = value;
}
for (const auto& v : value) {
valid_bitset.set(v);
}
}
}
@ -355,6 +367,9 @@ BitmapIndex<std::string>::DeserializeIndexData(const uint8_t* data_ptr,
} else {
data_[key] = value;
}
for (const auto& v : value) {
valid_bitset.set(v);
}
}
}
@ -367,6 +382,7 @@ BitmapIndex<T>::LoadWithoutAssemble(const BinarySet& binary_set,
index_meta_buffer->size);
auto index_length = index_meta.first;
total_num_rows_ = index_meta.second;
valid_bitset = TargetBitmap(total_num_rows_, false);
auto index_data_buffer = binary_set.GetByName(BITMAP_INDEX_DATA);
DeserializeIndexData(index_data_buffer->data.get(), index_length);
@ -389,7 +405,7 @@ BitmapIndex<T>::Load(milvus::tracer::TraceContext ctx, const Config& config) {
AssembleIndexDatas(index_datas);
BinarySet binary_set;
for (auto& [key, data] : index_datas) {
auto size = data->Size();
auto size = data->DataSize();
auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction
auto buf = std::shared_ptr<uint8_t[]>(
(uint8_t*)const_cast<void*>(data->Data()), deleter);
@ -442,6 +458,8 @@ BitmapIndex<T>::NotIn(const size_t n, const T* values) {
}
}
}
// NotIn(null) and In(null) is both false, need to mask with IsNotNull operate
res &= valid_bitset;
return res;
} else {
TargetBitmap res(total_num_rows_, false);
@ -452,10 +470,31 @@ BitmapIndex<T>::NotIn(const size_t n, const T* values) {
}
}
res.flip();
// NotIn(null) and In(null) is both false, need to mask with IsNotNull operate
res &= valid_bitset;
return res;
}
}
// Returns a bitmap with bit i set iff row i holds a null value.
// valid_bitset (maintained during Build/Load) marks rows that carry a
// real value, so null rows are exactly its complement.
template <typename T>
const TargetBitmap
BitmapIndex<T>::IsNull() {
    AssertInfo(is_built_, "index has not been built");
    // Start all-true, mask down to the valid rows, then flip so that
    // only the invalid (null) rows remain set.
    TargetBitmap res(total_num_rows_, true);
    res &= valid_bitset;
    res.flip();
    return res;
}
// Returns a bitmap with bit i set iff row i holds a non-null value,
// i.e. a copy of valid_bitset sized to the full row count.
template <typename T>
const TargetBitmap
BitmapIndex<T>::IsNotNull() {
    AssertInfo(is_built_, "index has not been built");
    // All-true AND valid_bitset leaves exactly the valid rows set.
    TargetBitmap res(total_num_rows_, true);
    res &= valid_bitset;
    return res;
}
template <typename T>
TargetBitmap
BitmapIndex<T>::RangeForBitset(const T value, const OpType op) {

View File

@ -82,6 +82,12 @@ class BitmapIndex : public ScalarIndex<T> {
const TargetBitmap
NotIn(size_t n, const T* values) override;
const TargetBitmap
IsNull() override;
const TargetBitmap
IsNotNull() override;
const TargetBitmap
Range(T value, OpType op) override;
@ -205,6 +211,9 @@ class BitmapIndex : public ScalarIndex<T> {
size_t total_num_rows_{0};
proto::schema::FieldSchema schema_;
std::shared_ptr<storage::MemFileManagerImpl> file_manager_;
// generate valid_bitset to speed up NotIn and IsNull and IsNotNull operate
TargetBitmap valid_bitset;
};
} // namespace index

View File

@ -358,7 +358,7 @@ HybridScalarIndex<T>::Load(milvus::tracer::TraceContext ctx,
AssembleIndexDatas(index_datas);
BinarySet binary_set;
for (auto& [key, data] : index_datas) {
auto size = data->Size();
auto size = data->DataSize();
auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction
auto buf = std::shared_ptr<uint8_t[]>(
(uint8_t*)const_cast<void*>(data->Data()), deleter);

View File

@ -87,6 +87,16 @@ class HybridScalarIndex : public ScalarIndex<T> {
return internal_index_->NotIn(n, values);
}
const TargetBitmap
IsNull() override {
return internal_index_->IsNull();
}
const TargetBitmap
IsNotNull() override {
return internal_index_->IsNotNull();
}
const TargetBitmap
Query(const DatasetPtr& dataset) override {
return internal_index_->Query(dataset);

View File

@ -21,6 +21,8 @@
#include <boost/filesystem.hpp>
#include <boost/uuid/random_generator.hpp>
#include <boost/uuid/uuid_io.hpp>
#include <cstddef>
#include <vector>
#include "InvertedIndexTantivy.h"
namespace milvus::index {
@ -105,8 +107,14 @@ InvertedIndexTantivy<T>::finish() {
template <typename T>
BinarySet
InvertedIndexTantivy<T>::Serialize(const Config& config) {
auto index_valid_data_length = null_offset.size() * sizeof(size_t);
std::shared_ptr<uint8_t[]> index_valid_data(
new uint8_t[index_valid_data_length]);
memcpy(index_valid_data.get(), null_offset.data(), index_valid_data_length);
BinarySet res_set;
res_set.Append(
"index_null_offset", index_valid_data, index_valid_data_length);
milvus::Disassemble(res_set);
return res_set;
}
@ -137,7 +145,8 @@ InvertedIndexTantivy<T>::Upload(const Config& config) {
for (auto& file : remote_paths_to_size) {
ret.Append(file.first, nullptr, file.second);
}
auto binary_set = Serialize(config);
mem_file_manager_->AddFile(binary_set);
return ret;
}
@ -173,6 +182,26 @@ InvertedIndexTantivy<T>::Load(milvus::tracer::TraceContext ctx,
files_value.end());
disk_file_manager_->CacheIndexToDisk(files_value);
wrapper_ = std::make_shared<TantivyIndexWrapper>(prefix.c_str());
auto index_valid_data_file =
mem_file_manager_->GetRemoteIndexObjectPrefix() +
std::string("/index_null_offset");
std::vector<std::string> file;
file.push_back(index_valid_data_file);
auto index_datas = mem_file_manager_->LoadIndexToMemory(file);
AssembleIndexDatas(index_datas);
BinarySet binary_set;
for (auto& [key, data] : index_datas) {
auto size = data->DataSize();
auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction
auto buf = std::shared_ptr<uint8_t[]>(
(uint8_t*)const_cast<void*>(data->Data()), deleter);
binary_set.Append(key, buf, size);
}
auto index_valid_data = binary_set.GetByName("index_null_offset");
null_offset.resize((size_t)index_valid_data->size / sizeof(size_t));
memcpy(null_offset.data(),
index_valid_data->data.get(),
(size_t)index_valid_data->size);
}
inline void
@ -212,6 +241,27 @@ InvertedIndexTantivy<T>::In(size_t n, const T* values) {
return bitset;
}
// Produce a bitmap whose set bits are exactly the null rows.
// Null rows are not stored in the tantivy index itself; their offsets
// were recorded in null_offset during the build, so we just replay them.
template <typename T>
const TargetBitmap
InvertedIndexTantivy<T>::IsNull() {
    TargetBitmap bitset(Count());
    for (const auto row_offset : null_offset) {
        bitset.set(row_offset);
    }
    return bitset;
}
// Produce a bitmap whose set bits are exactly the non-null rows:
// start with every row marked valid, then clear each recorded null offset.
template <typename T>
const TargetBitmap
InvertedIndexTantivy<T>::IsNotNull() {
    TargetBitmap bitset(Count(), true);
    for (const auto row_offset : null_offset) {
        bitset.reset(row_offset);
    }
    return bitset;
}
template <typename T>
const TargetBitmap
InvertedIndexTantivy<T>::InApplyFilter(
@ -242,6 +292,9 @@ InvertedIndexTantivy<T>::NotIn(size_t n, const T* values) {
auto array = wrapper_->term_query(values[i]);
apply_hits(bitset, array, false);
}
for (size_t i = 0; i < null_offset.size(); ++i) {
bitset.reset(null_offset[i]);
}
return bitset;
}
@ -378,6 +431,13 @@ template <typename T>
void
InvertedIndexTantivy<T>::BuildWithFieldData(
const std::vector<std::shared_ptr<FieldDataBase>>& field_datas) {
if (schema_.nullable()) {
int64_t total = 0;
for (const auto& data : field_datas) {
total += data->get_null_count();
}
null_offset.reserve(total);
}
switch (schema_.data_type()) {
case proto::schema::DataType::Bool:
case proto::schema::DataType::Int8:
@ -390,6 +450,17 @@ InvertedIndexTantivy<T>::BuildWithFieldData(
case proto::schema::DataType::VarChar: {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
if (schema_.nullable()) {
for (int i = 0; i < n; i++) {
if (!data->is_valid(i)) {
null_offset.push_back(i);
}
wrapper_->add_multi_data<T>(
static_cast<const T*>(data->RawValue(i)),
data->is_valid(i));
}
continue;
}
wrapper_->add_data<T>(static_cast<const T*>(data->Data()), n);
}
break;
@ -417,9 +488,12 @@ InvertedIndexTantivy<T>::build_index_for_array(
for (int64_t i = 0; i < n; i++) {
assert(array_column[i].get_element_type() ==
static_cast<DataType>(schema_.element_type()));
if (schema_.nullable() && !data->is_valid(i)) {
null_offset.push_back(i);
}
auto length = data->is_valid(i) ? array_column[i].length() : 0;
wrapper_->template add_multi_data(
reinterpret_cast<const T*>(array_column[i].data()),
array_column[i].length());
reinterpret_cast<const T*>(array_column[i].data()), length);
}
}
}
@ -435,12 +509,16 @@ InvertedIndexTantivy<std::string>::build_index_for_array(
Assert(IsStringDataType(array_column[i].get_element_type()));
Assert(IsStringDataType(
static_cast<DataType>(schema_.element_type())));
if (schema_.nullable() && !data->is_valid(i)) {
null_offset.push_back(i);
}
std::vector<std::string> output;
for (int64_t j = 0; j < array_column[i].length(); j++) {
output.push_back(
array_column[i].template get_data<std::string>(j));
}
wrapper_->template add_multi_data(output.data(), output.size());
auto length = data->is_valid(i) ? output.size() : 0;
wrapper_->template add_multi_data(output.data(), length);
}
}
}

View File

@ -11,6 +11,8 @@
#pragma once
#include <cstddef>
#include <vector>
#include "common/RegexQuery.h"
#include "index/Index.h"
#include "storage/FileManager.h"
@ -80,12 +82,8 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
const void* values,
const Config& config = {}) override;
/*
* deprecated.
* TODO: why not remove this?
*/
BinarySet
Serialize(const Config& config /* not used */) override;
Serialize(const Config& config) override;
BinarySet
Upload(const Config& config = {}) override;
@ -101,6 +99,12 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
const TargetBitmap
In(size_t n, const T* values) override;
const TargetBitmap
IsNull() override;
const TargetBitmap
IsNotNull() override;
const TargetBitmap
InApplyFilter(
size_t n,
@ -193,5 +197,9 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
*/
MemFileManagerPtr mem_file_manager_;
DiskFileManagerPtr disk_file_manager_;
// all data need to be built to align the offset
// so need to store null_offset in inverted index additionally
std::vector<size_t> null_offset{};
};
} // namespace milvus::index

View File

@ -82,6 +82,12 @@ class ScalarIndex : public IndexBase {
virtual const TargetBitmap
In(size_t n, const T* values) = 0;
virtual const TargetBitmap
IsNull() = 0;
virtual const TargetBitmap
IsNotNull() = 0;
virtual const TargetBitmap
InApplyFilter(size_t n,
const T* values,

View File

@ -68,10 +68,13 @@ ScalarIndexSort<T>::Build(size_t n, const T* values) {
PanicInfo(DataIsEmpty, "ScalarIndexSort cannot build null values!");
}
data_.reserve(n);
total_num_rows_ = n;
valid_bitset = TargetBitmap(total_num_rows_, false);
idx_to_offsets_.resize(n);
T* p = const_cast<T*>(values);
for (size_t i = 0; i < n; ++i) {
data_.emplace_back(IndexStructure(*p++, i));
valid_bitset.set(i);
}
std::sort(data_.begin(), data_.end());
for (size_t i = 0; i < data_.size(); ++i) {
@ -84,28 +87,33 @@ template <typename T>
void
ScalarIndexSort<T>::BuildWithFieldData(
const std::vector<milvus::FieldDataPtr>& field_datas) {
int64_t total_num_rows = 0;
int64_t length = 0;
for (const auto& data : field_datas) {
total_num_rows += data->get_num_rows();
total_num_rows_ += data->get_num_rows();
length += data->get_num_rows() - data->get_null_count();
}
if (total_num_rows == 0) {
if (length == 0) {
PanicInfo(DataIsEmpty, "ScalarIndexSort cannot build null values!");
}
data_.reserve(total_num_rows);
data_.reserve(length);
valid_bitset = TargetBitmap(total_num_rows_, false);
int64_t offset = 0;
for (const auto& data : field_datas) {
auto slice_num = data->get_num_rows();
for (size_t i = 0; i < slice_num; ++i) {
auto value = reinterpret_cast<const T*>(data->RawValue(i));
data_.emplace_back(IndexStructure(*value, offset));
if (data->is_valid(i)) {
auto value = reinterpret_cast<const T*>(data->RawValue(i));
data_.emplace_back(IndexStructure(*value, offset));
valid_bitset.set(offset);
}
offset++;
}
}
std::sort(data_.begin(), data_.end());
idx_to_offsets_.resize(total_num_rows);
for (size_t i = 0; i < total_num_rows; ++i) {
idx_to_offsets_.resize(total_num_rows_);
for (size_t i = 0; i < length; ++i) {
idx_to_offsets_[data_[i].idx_] = i;
}
is_built_ = true;
@ -124,9 +132,13 @@ ScalarIndexSort<T>::Serialize(const Config& config) {
auto index_size = data_.size();
memcpy(index_length.get(), &index_size, sizeof(size_t));
std::shared_ptr<uint8_t[]> index_num_rows(new uint8_t[sizeof(size_t)]);
memcpy(index_num_rows.get(), &total_num_rows_, sizeof(size_t));
BinarySet res_set;
res_set.Append("index_data", index_data, index_data_size);
res_set.Append("index_length", index_length, sizeof(size_t));
res_set.Append("index_num_rows", index_num_rows, sizeof(size_t));
milvus::Disassemble(res_set);
@ -158,11 +170,18 @@ ScalarIndexSort<T>::LoadWithoutAssemble(const BinarySet& index_binary,
auto index_data = index_binary.GetByName("index_data");
data_.resize(index_size);
idx_to_offsets_.resize(index_size);
auto index_num_rows = index_binary.GetByName("index_num_rows");
memcpy(&total_num_rows_,
index_num_rows->data.get(),
(size_t)index_num_rows->size);
idx_to_offsets_.resize(total_num_rows_);
valid_bitset = TargetBitmap(total_num_rows_, false);
memcpy(data_.data(), index_data->data.get(), (size_t)index_data->size);
for (size_t i = 0; i < data_.size(); ++i) {
idx_to_offsets_[data_[i].idx_] = i;
valid_bitset.set(data_[i].idx_);
}
is_built_ = true;
}
@ -185,7 +204,7 @@ ScalarIndexSort<T>::Load(milvus::tracer::TraceContext ctx,
AssembleIndexDatas(index_datas);
BinarySet binary_set;
for (auto& [key, data] : index_datas) {
auto size = data->Size();
auto size = data->DataSize();
auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction
auto buf = std::shared_ptr<uint8_t[]>(
(uint8_t*)const_cast<void*>(data->Data()), deleter);
@ -199,7 +218,7 @@ template <typename T>
const TargetBitmap
ScalarIndexSort<T>::In(const size_t n, const T* values) {
AssertInfo(is_built_, "index has not been built");
TargetBitmap bitset(data_.size());
TargetBitmap bitset(Count());
for (size_t i = 0; i < n; ++i) {
auto lb = std::lower_bound(
data_.begin(), data_.end(), IndexStructure<T>(*(values + i)));
@ -221,7 +240,7 @@ template <typename T>
const TargetBitmap
ScalarIndexSort<T>::NotIn(const size_t n, const T* values) {
AssertInfo(is_built_, "index has not been built");
TargetBitmap bitset(data_.size(), true);
TargetBitmap bitset(Count(), true);
for (size_t i = 0; i < n; ++i) {
auto lb = std::lower_bound(
data_.begin(), data_.end(), IndexStructure<T>(*(values + i)));
@ -236,6 +255,27 @@ ScalarIndexSort<T>::NotIn(const size_t n, const T* values) {
bitset[lb->idx_] = false;
}
}
// NotIn(null) and In(null) is both false, need to mask with IsNotNull operate
bitset &= valid_bitset;
return bitset;
}
// Returns a bitmap with bit i set iff row i holds a null value.
// valid_bitset is populated in Build/BuildWithFieldData/LoadWithoutAssemble
// for every row that has a real value; nulls are its complement.
template <typename T>
const TargetBitmap
ScalarIndexSort<T>::IsNull() {
    AssertInfo(is_built_, "index has not been built");
    // All-true AND valid_bitset leaves the valid rows; flip to get nulls.
    TargetBitmap bitset(total_num_rows_, true);
    bitset &= valid_bitset;
    bitset.flip();
    return bitset;
}
// Returns a bitmap with bit i set iff row i holds a non-null value —
// effectively valid_bitset widened to total_num_rows_ bits.
template <typename T>
const TargetBitmap
ScalarIndexSort<T>::IsNotNull() {
    AssertInfo(is_built_, "index has not been built");
    TargetBitmap bitset(total_num_rows_, true);
    bitset &= valid_bitset;
    return bitset;
}
@ -243,7 +283,7 @@ template <typename T>
const TargetBitmap
ScalarIndexSort<T>::Range(const T value, const OpType op) {
AssertInfo(is_built_, "index has not been built");
TargetBitmap bitset(data_.size());
TargetBitmap bitset(Count());
auto lb = data_.begin();
auto ub = data_.end();
if (ShouldSkip(value, value, op)) {
@ -283,7 +323,7 @@ ScalarIndexSort<T>::Range(T lower_bound_value,
T upper_bound_value,
bool ub_inclusive) {
AssertInfo(is_built_, "index has not been built");
TargetBitmap bitset(data_.size());
TargetBitmap bitset(Count());
if (lower_bound_value > upper_bound_value ||
(lower_bound_value == upper_bound_value &&
!(lb_inclusive && ub_inclusive))) {

View File

@ -47,7 +47,7 @@ class ScalarIndexSort : public ScalarIndex<T> {
int64_t
Count() override {
return data_.size();
return total_num_rows_;
}
ScalarIndexType
@ -67,6 +67,12 @@ class ScalarIndexSort : public ScalarIndex<T> {
const TargetBitmap
NotIn(size_t n, const T* values) override;
const TargetBitmap
IsNull() override;
const TargetBitmap
IsNotNull() override;
const TargetBitmap
Range(T value, OpType op) override;
@ -120,6 +126,9 @@ class ScalarIndexSort : public ScalarIndex<T> {
std::vector<int32_t> idx_to_offsets_; // used to retrieve.
std::vector<IndexStructure<T>> data_;
std::shared_ptr<storage::MemFileManagerImpl> file_manager_;
size_t total_num_rows_{0};
// generate valid_bitset to speed up NotIn and IsNull and IsNotNull operate
TargetBitmap valid_bitset;
};
template <typename T>

View File

@ -33,67 +33,74 @@ SkipIndex::LoadPrimitive(milvus::FieldId field_id,
int64_t chunk_id,
milvus::DataType data_type,
const void* chunk_data,
const bool* valid_data,
int64_t count) {
auto chunkMetrics = std::make_unique<FieldChunkMetrics>();
if (count > 0) {
chunkMetrics->hasValue_ = true;
switch (data_type) {
case DataType::INT8: {
const int8_t* typedData =
static_cast<const int8_t*>(chunk_data);
std::pair<int8_t, int8_t> minMax =
ProcessFieldMetrics<int8_t>(typedData, count);
chunkMetrics->min_ = Metrics(minMax.first);
chunkMetrics->max_ = Metrics(minMax.second);
auto info =
ProcessFieldMetrics<int8_t>(typedData, valid_data, count);
chunkMetrics->min_ = Metrics(info.min_);
chunkMetrics->max_ = Metrics(info.max_);
chunkMetrics->null_count_ = info.null_count_;
break;
}
case DataType::INT16: {
const int16_t* typedData =
static_cast<const int16_t*>(chunk_data);
std::pair<int16_t, int16_t> minMax =
ProcessFieldMetrics<int16_t>(typedData, count);
chunkMetrics->min_ = Metrics(minMax.first);
chunkMetrics->max_ = Metrics(minMax.second);
auto info =
ProcessFieldMetrics<int16_t>(typedData, valid_data, count);
chunkMetrics->min_ = Metrics(info.min_);
chunkMetrics->max_ = Metrics(info.max_);
chunkMetrics->null_count_ = info.null_count_;
break;
}
case DataType::INT32: {
const int32_t* typedData =
static_cast<const int32_t*>(chunk_data);
std::pair<int32_t, int32_t> minMax =
ProcessFieldMetrics<int32_t>(typedData, count);
chunkMetrics->min_ = Metrics(minMax.first);
chunkMetrics->max_ = Metrics(minMax.second);
auto info =
ProcessFieldMetrics<int32_t>(typedData, valid_data, count);
chunkMetrics->min_ = Metrics(info.min_);
chunkMetrics->max_ = Metrics(info.max_);
chunkMetrics->null_count_ = info.null_count_;
break;
}
case DataType::INT64: {
const int64_t* typedData =
static_cast<const int64_t*>(chunk_data);
std::pair<int64_t, int64_t> minMax =
ProcessFieldMetrics<int64_t>(typedData, count);
chunkMetrics->min_ = Metrics(minMax.first);
chunkMetrics->max_ = Metrics(minMax.second);
auto info =
ProcessFieldMetrics<int64_t>(typedData, valid_data, count);
chunkMetrics->min_ = Metrics(info.min_);
chunkMetrics->max_ = Metrics(info.max_);
chunkMetrics->null_count_ = info.null_count_;
break;
}
case DataType::FLOAT: {
const float* typedData = static_cast<const float*>(chunk_data);
std::pair<float, float> minMax =
ProcessFieldMetrics<float>(typedData, count);
chunkMetrics->min_ = Metrics(minMax.first);
chunkMetrics->max_ = Metrics(minMax.second);
auto info =
ProcessFieldMetrics<float>(typedData, valid_data, count);
chunkMetrics->min_ = Metrics(info.min_);
chunkMetrics->max_ = Metrics(info.max_);
chunkMetrics->null_count_ = info.null_count_;
break;
}
case DataType::DOUBLE: {
const double* typedData =
static_cast<const double*>(chunk_data);
std::pair<double, double> minMax =
ProcessFieldMetrics<double>(typedData, count);
chunkMetrics->min_ = Metrics(minMax.first);
chunkMetrics->max_ = Metrics(minMax.second);
auto info =
ProcessFieldMetrics<double>(typedData, valid_data, count);
chunkMetrics->min_ = Metrics(info.min_);
chunkMetrics->max_ = Metrics(info.max_);
chunkMetrics->null_count_ = info.null_count_;
break;
}
}
}
chunkMetrics->hasValue_ = chunkMetrics->null_count_ == count ? false : true;
std::unique_lock lck(mutex_);
if (fieldChunkMetrics_.count(field_id) == 0) {
fieldChunkMetrics_.insert(std::make_pair(
@ -111,21 +118,15 @@ SkipIndex::LoadString(milvus::FieldId field_id,
int num_rows = var_column.NumRows();
auto chunkMetrics = std::make_unique<FieldChunkMetrics>();
if (num_rows > 0) {
chunkMetrics->hasValue_ = true;
std::string_view min_string = var_column.RawAt(0);
std::string_view max_string = var_column.RawAt(0);
for (size_t i = 1; i < num_rows; i++) {
const auto& val = var_column.RawAt(i);
if (val < min_string) {
min_string = val;
}
if (val > max_string) {
max_string = val;
}
}
chunkMetrics->min_ = Metrics(min_string);
chunkMetrics->max_ = Metrics(max_string);
auto info = ProcessStringFieldMetrics(var_column);
chunkMetrics->min_ = Metrics(info.min_);
chunkMetrics->max_ = Metrics(info.max_);
chunkMetrics->null_count_ = info.null_count_;
}
chunkMetrics->hasValue_ =
chunkMetrics->null_count_ == num_rows ? false : true;
std::unique_lock lck(mutex_);
if (fieldChunkMetrics_.count(field_id) == 0) {
fieldChunkMetrics_.insert(std::make_pair(

View File

@ -10,6 +10,7 @@
// or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include <cstddef>
#include <unordered_map>
#include "common/Types.h"
@ -29,6 +30,7 @@ struct FieldChunkMetrics {
Metrics min_;
Metrics max_;
bool hasValue_;
int64_t null_count_;
FieldChunkMetrics() : hasValue_(false){};
};
@ -73,6 +75,7 @@ class SkipIndex {
int64_t chunk_id,
milvus::DataType data_type,
const void* chunk_data,
const bool* valid_data,
int64_t count);
void
@ -217,17 +220,43 @@ class SkipIndex {
return should_skip;
}
// todo: support some null_count_ skip
template <typename T>
std::pair<T, T>
ProcessFieldMetrics(const T* data, int64_t count) {
struct metricInfo {
T min_;
T max_;
int64_t null_count_;
};
template <typename T>
metricInfo<T>
ProcessFieldMetrics(const T* data, const bool* valid_data, int64_t count) {
//double check to avoid crush
if (data == nullptr || count == 0) {
return {T(), T()};
}
T minValue = data[0];
T maxValue = data[0];
for (size_t i = 0; i < count; i++) {
// find first not null value
int64_t start = 0;
for (int64_t i = start; i < count; i++) {
if (valid_data != nullptr && !valid_data[i]) {
start++;
continue;
}
break;
}
if (start > count - 1) {
return {T(), T(), count};
}
T minValue = data[start];
T maxValue = data[start];
int64_t null_count = start;
for (int64_t i = start; i < count; i++) {
T value = data[i];
if (valid_data != nullptr && !valid_data[i]) {
null_count++;
continue;
}
if (value < minValue) {
minValue = value;
}
@ -235,7 +264,42 @@ class SkipIndex {
maxValue = value;
}
}
return {minValue, maxValue};
return {minValue, maxValue, null_count};
}
// Computes min/max string (as views into the column's storage) and the
// null count for one variable-length string column, honoring per-row
// validity. Returned string_views alias var_column's buffer — the caller
// must keep the column alive while using them.
metricInfo<std::string_view>
ProcessStringFieldMetrics(
const milvus::VariableColumn<std::string>& var_column) {
int num_rows = var_column.NumRows();
// find first not null value
int64_t start = 0;
for (int64_t i = start; i < num_rows; i++) {
if (!var_column.IsValid(i)) {
start++;
continue;
}
break;
}
// Every row was null: return empty views and num_rows as the null count.
if (start > num_rows - 1) {
return {std::string_view(), std::string_view(), num_rows};
}
// Seed min/max from the first valid row found above.
std::string_view min_string = var_column.RawAt(start);
std::string_view max_string = var_column.RawAt(start);
// Rows [0, start) were all null, so they seed the null counter.
int64_t null_count = start;
for (int64_t i = start; i < num_rows; i++) {
const auto& val = var_column.RawAt(i);
// Nulls past the first valid row only bump the counter.
if (!var_column.IsValid(i)) {
null_count++;
continue;
}
if (val < min_string) {
min_string = val;
}
if (val > max_string) {
max_string = val;
}
}
return {min_string, max_string, null_count};
}
private:

View File

@ -83,23 +83,29 @@ StringIndexMarisa::BuildWithFieldData(
for (const auto& data : field_datas) {
auto slice_num = data->get_num_rows();
for (int64_t i = 0; i < slice_num; ++i) {
keyset.push_back(
(*static_cast<const std::string*>(data->RawValue(i))).c_str());
if (data->is_valid(i)) {
keyset.push_back(
(*static_cast<const std::string*>(data->RawValue(i)))
.c_str());
}
}
total_num_rows += slice_num;
}
trie_.build(keyset);
// fill str_ids_
str_ids_.resize(total_num_rows);
str_ids_.resize(total_num_rows, MARISA_NULL_KEY_ID);
int64_t offset = 0;
for (const auto& data : field_datas) {
auto slice_num = data->get_num_rows();
for (int64_t i = 0; i < slice_num; ++i) {
auto str_id =
lookup(*static_cast<const std::string*>(data->RawValue(i)));
AssertInfo(valid_str_id(str_id), "invalid marisa key");
str_ids_[offset++] = str_id;
if (data->is_valid(offset)) {
auto str_id =
lookup(*static_cast<const std::string*>(data->RawValue(i)));
AssertInfo(valid_str_id(str_id), "invalid marisa key");
str_ids_[offset] = str_id;
}
offset++;
}
}
@ -228,7 +234,7 @@ StringIndexMarisa::Load(milvus::tracer::TraceContext ctx,
AssembleIndexDatas(index_datas);
BinarySet binary_set;
for (auto& [key, data] : index_datas) {
auto size = data->Size();
auto size = data->DataSize();
auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction
auto buf = std::shared_ptr<uint8_t[]>(
(uint8_t*)const_cast<void*>(data->Data()), deleter);
@ -267,6 +273,32 @@ StringIndexMarisa::NotIn(size_t n, const std::string* values) {
}
}
}
// NotIn(null) and In(null) is both false, need to mask with IsNotNull operate
auto offsets = str_ids_to_offsets_[MARISA_NULL_KEY_ID];
for (size_t i = 0; i < offsets.size(); i++) {
bitset.reset(offsets[i]);
}
return bitset;
}
// Returns a bitmap with bit i set iff row i holds a null value.
// Null rows were bucketed under MARISA_NULL_KEY_ID when
// str_ids_to_offsets_ was built (str_ids_ defaults to MARISA_NULL_KEY_ID
// for rows that were never assigned a trie key).
const TargetBitmap
StringIndexMarisa::IsNull() {
    TargetBitmap bitset(str_ids_.size());
    // Use find() rather than operator[]: operator[] default-inserts an
    // empty entry when no null rows exist (mutating the member on a pure
    // query), and the original `auto offsets = ...` copied the whole
    // offset vector on every call.
    auto iter = str_ids_to_offsets_.find(MARISA_NULL_KEY_ID);
    if (iter != str_ids_to_offsets_.end()) {
        for (const auto offset : iter->second) {
            bitset.set(offset);
        }
    }
    return bitset;
}
// Returns a bitmap with bit i set iff row i holds a non-null value.
// Equivalent to the original set-nulls-then-flip, but starts all-true and
// clears the null offsets directly, saving the final flip() pass.
const TargetBitmap
StringIndexMarisa::IsNotNull() {
    TargetBitmap bitset(str_ids_.size(), true);
    // find() avoids operator[]'s default-insertion of an empty entry and
    // avoids copying the offset vector (the original bound it by value).
    auto iter = str_ids_to_offsets_.find(MARISA_NULL_KEY_ID);
    if (iter != str_ids_to_offsets_.end()) {
        for (const auto offset : iter->second) {
            bitset.reset(offset);
        }
    }
    return bitset;
}

View File

@ -69,6 +69,12 @@ class StringIndexMarisa : public StringIndex {
const TargetBitmap
NotIn(size_t n, const std::string* values) override;
const TargetBitmap
IsNull() override;
const TargetBitmap
IsNotNull() override;
const TargetBitmap
Range(std::string value, OpType op) override;

View File

@ -242,14 +242,15 @@ void
AssembleIndexDatas(std::map<std::string, FieldDataPtr>& index_datas) {
if (index_datas.find(INDEX_FILE_SLICE_META) != index_datas.end()) {
auto slice_meta = index_datas.at(INDEX_FILE_SLICE_META);
Config meta_data = Config::parse(std::string(
static_cast<const char*>(slice_meta->Data()), slice_meta->Size()));
Config meta_data = Config::parse(
std::string(static_cast<const char*>(slice_meta->Data()),
slice_meta->DataSize()));
for (auto& item : meta_data[META]) {
std::string prefix = item[NAME];
int slice_num = item[SLICE_NUM];
auto total_len = static_cast<size_t>(item[TOTAL_LEN]);
// todo: support nullable index
// build index skip null value, so not need to set nullable == true
auto new_field_data =
storage::CreateFieldData(DataType::INT8, false, 1, total_len);
@ -258,7 +259,7 @@ AssembleIndexDatas(std::map<std::string, FieldDataPtr>& index_datas) {
AssertInfo(index_datas.find(file_name) != index_datas.end(),
"lost index slice data");
auto data = index_datas.at(file_name);
auto len = data->Size();
auto len = data->DataSize();
new_field_data->FillFieldData(data->Data(), len);
index_datas.erase(file_name);
}
@ -282,13 +283,13 @@ AssembleIndexDatas(std::map<std::string, FieldDataChannelPtr>& index_datas,
index_datas.erase(INDEX_FILE_SLICE_META);
Config metadata = Config::parse(
std::string(static_cast<const char*>(raw_metadata->Data()),
raw_metadata->Size()));
raw_metadata->DataSize()));
for (auto& item : metadata[META]) {
std::string prefix = item[NAME];
int slice_num = item[SLICE_NUM];
auto total_len = static_cast<size_t>(item[TOTAL_LEN]);
// todo: support nullable index
// build index skip null value, so not need to set nullable == true
auto new_field_data =
storage::CreateFieldData(DataType::INT8, false, 1, total_len);
@ -299,7 +300,7 @@ AssembleIndexDatas(std::map<std::string, FieldDataChannelPtr>& index_datas,
auto& channel = it->second;
auto data_array = storage::CollectFieldDataChannel(channel);
auto data = storage::MergeFieldData(data_array);
auto len = data->Size();
auto len = data->DataSize();
new_field_data->FillFieldData(data->Data(), len);
index_datas.erase(file_name);
}

View File

@ -195,7 +195,6 @@ VectorMemIndex<T>::Load(milvus::tracer::TraceContext ctx,
std::string prefix = item[NAME];
int slice_num = item[SLICE_NUM];
auto total_len = static_cast<size_t>(item[TOTAL_LEN]);
// todo: support nullable index
auto new_field_data = milvus::storage::CreateFieldData(
DataType::INT8, false, 1, total_len);

View File

@ -245,7 +245,10 @@ class ColumnBase {
bool
IsValid(size_t offset) const {
return valid_data_[offset];
if (nullable_) {
return valid_data_[offset];
}
return true;
}
bool

View File

@ -357,8 +357,10 @@ SegmentInternalInterface::LoadPrimitiveSkipIndex(milvus::FieldId field_id,
int64_t chunk_id,
milvus::DataType data_type,
const void* chunk_data,
const bool* valid_data,
int64_t count) {
skip_index_.LoadPrimitive(field_id, chunk_id, data_type, chunk_data, count);
skip_index_.LoadPrimitive(
field_id, chunk_id, data_type, chunk_data, valid_data, count);
}
void

View File

@ -248,6 +248,7 @@ class SegmentInternalInterface : public SegmentInterface {
int64_t chunk_id,
DataType data_type,
const void* chunk_data,
const bool* valid_data,
int64_t count);
void

View File

@ -423,8 +423,12 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) {
column->AppendBatch(field_data);
stats_.mem_size += field_data->Size();
}
LoadPrimitiveSkipIndex(
field_id, 0, data_type, column->Span().data(), num_rows);
LoadPrimitiveSkipIndex(field_id,
0,
data_type,
column->Span().data(),
column->Span().valid_data(),
num_rows);
}
AssertInfo(column->NumRows() == num_rows,

View File

@ -213,7 +213,7 @@ DiskFileManagerImpl::CacheIndexToDisk(
auto index_chunks = GetObjectData(rcm_.get(), batch_remote_files);
for (auto& chunk : index_chunks) {
auto index_data = chunk.get()->GetFieldData();
auto index_size = index_data->Size();
auto index_size = index_data->DataSize();
auto chunk_data = reinterpret_cast<uint8_t*>(
const_cast<void*>(index_data->Data()));
file.Write(chunk_data, index_size);

View File

@ -528,6 +528,7 @@ EncodeAndUploadIndexSlice(ChunkManager* chunk_manager,
IndexMeta index_meta,
FieldDataMeta field_meta,
std::string object_key) {
// the index does not use valid_data, so there is no need to set nullable == true
auto field_data = CreateFieldData(DataType::INT8, false);
field_data->FillFieldData(buf, batch_size);
auto indexData = std::make_shared<IndexData>(field_data);
@ -551,8 +552,8 @@ EncodeAndUploadFieldSlice(ChunkManager* chunk_manager,
auto dim = IsSparseFloatVectorDataType(field_meta.get_data_type())
? -1
: field_meta.get_dim();
auto field_data = CreateFieldData(
field_meta.get_data_type(), field_meta.is_nullable(), dim, 0);
auto field_data =
CreateFieldData(field_meta.get_data_type(), false, dim, 0);
field_data->FillFieldData(buf, element_count);
auto insertData = std::make_shared<InsertData>(field_data);
insertData->SetFieldDataMeta(field_data_meta);

View File

@ -162,6 +162,7 @@ class ArrayBitmapIndexTest : public testing::Test {
int64_t index_version) {
proto::schema::FieldSchema field_schema;
field_schema.set_data_type(proto::schema::DataType::Array);
field_schema.set_nullable(nullable_);
proto::schema::DataType element_type;
if constexpr (std::is_same_v<int8_t, T>) {
element_type = proto::schema::DataType::Int8;
@ -185,9 +186,26 @@ class ArrayBitmapIndexTest : public testing::Test {
segment_id, field_id, index_build_id, index_version};
data_ = GenerateArrayData(element_type, cardinality_, nb_, 10);
auto field_data = storage::CreateFieldData(DataType::ARRAY);
field_data->FillFieldData(data_.data(), data_.size());
auto field_data = storage::CreateFieldData(DataType::ARRAY, nullable_);
if (nullable_) {
valid_data_.reserve(nb_);
uint8_t* ptr = new uint8_t[(nb_ + 7) / 8];
for (int i = 0; i < nb_; i++) {
int byteIndex = i / 8;
int bitIndex = i % 8;
if (i % 2 == 0) {
valid_data_.push_back(true);
ptr[byteIndex] |= (1 << bitIndex);
} else {
valid_data_.push_back(false);
ptr[byteIndex] &= ~(1 << bitIndex);
}
}
field_data->FillFieldData(data_.data(), ptr, data_.size());
delete[] ptr;
} else {
field_data->FillFieldData(data_.data(), data_.size());
}
storage::InsertData insert_data(field_data);
insert_data.SetFieldDataMeta(field_meta);
insert_data.SetTimestamps(0, 100);
@ -237,6 +255,7 @@ class ArrayBitmapIndexTest : public testing::Test {
SetParam() {
nb_ = 10000;
cardinality_ = 30;
nullable_ = false;
}
void
@ -293,6 +312,9 @@ class ArrayBitmapIndexTest : public testing::Test {
for (size_t i = 0; i < bitset.size(); i++) {
auto ref = [&]() -> bool {
milvus::Array array = data_[i];
if (nullable_ && !valid_data_[i]) {
return false;
}
for (size_t j = 0; j < array.length(); ++j) {
auto val = array.template get_data<T>(j);
if (s.find(val) != s.end()) {
@ -313,7 +335,9 @@ class ArrayBitmapIndexTest : public testing::Test {
IndexBasePtr index_;
size_t nb_;
size_t cardinality_;
bool nullable_;
std::vector<milvus::Array> data_;
FixedVector<bool> valid_data_;
};
TYPED_TEST_SUITE_P(ArrayBitmapIndexTest);
@ -350,6 +374,7 @@ class ArrayBitmapIndexTestV1 : public ArrayBitmapIndexTest<T> {
SetParam() override {
this->nb_ = 10000;
this->cardinality_ = 200;
this->nullable_ = false;
}
virtual ~ArrayBitmapIndexTestV1() {
@ -363,10 +388,36 @@ TYPED_TEST_P(ArrayBitmapIndexTestV1, CountFuncTest) {
EXPECT_EQ(count, this->nb_);
}
// Nullable variant of the array bitmap index fixture: same row count and
// cardinality as the default case, but with nullable_ enabled so the base
// fixture marks every other row as null.
template <typename T>
class ArrayBitmapIndexTestNullable : public ArrayBitmapIndexTest<T> {
 public:
    void
    SetParam() override {
        this->nb_ = 10000;
        this->cardinality_ = 30;
        this->nullable_ = true;
    }

    ~ArrayBitmapIndexTestNullable() override = default;
};
TYPED_TEST_SUITE_P(ArrayBitmapIndexTestNullable);

// The nullable build must still index every row, so Count() equals nb_.
TYPED_TEST_P(ArrayBitmapIndexTestNullable, CountFuncTest) {
    auto count = this->index_->Count();
    EXPECT_EQ(count, this->nb_);
}

using BitmapTypeV1 = testing::Types<int32_t, int64_t, std::string>;

REGISTER_TYPED_TEST_SUITE_P(ArrayBitmapIndexTestV1, CountFuncTest);

REGISTER_TYPED_TEST_SUITE_P(ArrayBitmapIndexTestNullable, CountFuncTest);

INSTANTIATE_TYPED_TEST_SUITE_P(ArrayBitmapE2ECheckV1,
                               ArrayBitmapIndexTestV1,
                               BitmapTypeV1);

// Fix: the nullable suite was instantiated under the copy-pasted prefix
// "ArrayBitmapE2ECheckV1", so its generated test names read as belonging to
// the non-nullable V1 check. Give it a distinct, descriptive prefix.
INSTANTIATE_TYPED_TEST_SUITE_P(ArrayBitmapE2ECheckNullable,
                               ArrayBitmapIndexTestNullable,
                               BitmapTypeV1);

View File

@ -72,6 +72,7 @@ class HybridIndexTestV1 : public testing::Test {
int64_t index_build_id,
int64_t index_version) {
proto::schema::FieldSchema field_schema;
field_schema.set_nullable(nullable_);
if constexpr (std::is_same_v<int8_t, T>) {
field_schema.set_data_type(proto::schema::DataType::Int8);
} else if constexpr (std::is_same_v<int16_t, T>) {
@ -98,8 +99,26 @@ class HybridIndexTestV1 : public testing::Test {
data_.push_back(x);
}
auto field_data = storage::CreateFieldData(type_);
field_data->FillFieldData(data_.data(), data_.size());
auto field_data = storage::CreateFieldData(type_, nullable_);
if (nullable_) {
valid_data_.reserve(nb_);
uint8_t* ptr = new uint8_t[(nb_ + 7) / 8];
for (int i = 0; i < nb_; i++) {
int byteIndex = i / 8;
int bitIndex = i % 8;
if (i % 2 == 0) {
valid_data_.push_back(true);
ptr[byteIndex] |= (1 << bitIndex);
} else {
valid_data_.push_back(false);
ptr[byteIndex] &= ~(1 << bitIndex);
}
}
field_data->FillFieldData(data_.data(), ptr, data_.size());
delete[] ptr;
} else {
field_data->FillFieldData(data_.data(), data_.size());
}
storage::InsertData insert_data(field_data);
insert_data.SetFieldDataMeta(field_meta);
insert_data.SetTimestamps(0, 100);
@ -149,6 +168,7 @@ class HybridIndexTestV1 : public testing::Test {
SetParam() {
nb_ = 10000;
cardinality_ = 30;
nullable_ = false;
}
void
SetUp() override {
@ -171,7 +191,7 @@ class HybridIndexTestV1 : public testing::Test {
int64_t field_id = 101;
int64_t index_build_id = 1000;
int64_t index_version = 10000;
std::string root_path = "/tmp/test-bitmap-index/";
std::string root_path = "/tmp/test-bitmap-index";
storage::StorageConfig storage_config;
storage_config.storage_type = "local";
@ -204,7 +224,11 @@ class HybridIndexTestV1 : public testing::Test {
dynamic_cast<index::HybridScalarIndex<T>*>(index_.get());
auto bitset = index_ptr->In(test_data.size(), test_data.data());
for (size_t i = 0; i < bitset.size(); i++) {
ASSERT_EQ(bitset[i], s.find(data_[i]) != s.end());
if (nullable_ && !valid_data_[i]) {
ASSERT_EQ(bitset[i], false);
} else {
ASSERT_EQ(bitset[i], s.find(data_[i]) != s.end());
}
}
}
@ -221,7 +245,39 @@ class HybridIndexTestV1 : public testing::Test {
dynamic_cast<index::HybridScalarIndex<T>*>(index_.get());
auto bitset = index_ptr->NotIn(test_data.size(), test_data.data());
for (size_t i = 0; i < bitset.size(); i++) {
ASSERT_EQ(bitset[i], s.find(data_[i]) == s.end());
if (nullable_ && !valid_data_[i]) {
ASSERT_EQ(bitset[i], false);
} else {
ASSERT_NE(bitset[i], s.find(data_[i]) != s.end());
}
}
}
void
TestIsNullFunc() {
auto index_ptr =
dynamic_cast<index::HybridScalarIndex<T>*>(index_.get());
auto bitset = index_ptr->IsNull();
for (size_t i = 0; i < bitset.size(); i++) {
if (nullable_ && !valid_data_[i]) {
ASSERT_EQ(bitset[i], true);
} else {
ASSERT_EQ(bitset[i], false);
}
}
}
void
TestIsNotNullFunc() {
auto index_ptr =
dynamic_cast<index::HybridScalarIndex<T>*>(index_.get());
auto bitset = index_ptr->IsNotNull();
for (size_t i = 0; i < bitset.size(); i++) {
if (nullable_ && !valid_data_[i]) {
ASSERT_EQ(bitset[i], false);
} else {
ASSERT_EQ(bitset[i], true);
}
}
}
@ -250,9 +306,15 @@ class HybridIndexTestV1 : public testing::Test {
for (size_t i = 0; i < bitset.size(); i++) {
auto ans = bitset[i];
auto should = ref(i);
ASSERT_EQ(ans, should)
<< "op: " << op << ", @" << i << ", ans: " << ans
<< ", ref: " << should;
if (nullable_ && !valid_data_[i]) {
ASSERT_EQ(ans, false)
<< "op: " << op << ", @" << i << ", ans: " << ans
<< ", ref: " << should;
} else {
ASSERT_EQ(ans, should)
<< "op: " << op << ", @" << i << ", ans: " << ans
<< ", ref: " << should;
}
}
}
}
@ -309,10 +371,17 @@ class HybridIndexTestV1 : public testing::Test {
for (size_t i = 0; i < bitset.size(); i++) {
auto ans = bitset[i];
auto should = test_case.ref(i);
ASSERT_EQ(ans, should)
<< "lower:" << test_case.lower_val
<< "upper:" << test_case.upper_val << ", @" << i
<< ", ans: " << ans << ", ref: " << should;
if (nullable_ && !valid_data_[i]) {
ASSERT_EQ(ans, false)
<< "lower:" << test_case.lower_val
<< "upper:" << test_case.upper_val << ", @" << i
<< ", ans: " << ans << ", ref: " << false;
} else {
ASSERT_EQ(ans, should)
<< "lower:" << test_case.lower_val
<< "upper:" << test_case.upper_val << ", @" << i
<< ", ans: " << ans << ", ref: " << should;
}
}
}
}
@ -325,6 +394,8 @@ class HybridIndexTestV1 : public testing::Test {
size_t cardinality_;
boost::container::vector<T> data_;
std::shared_ptr<storage::ChunkManager> chunk_manager_;
bool nullable_;
FixedVector<bool> valid_data_;
};
TYPED_TEST_SUITE_P(HybridIndexTestV1);
@ -342,6 +413,14 @@ TYPED_TEST_P(HybridIndexTestV1, NotINFuncTest) {
this->TestNotInFunc();
}
// IsNull must mark exactly the rows whose valid bit is unset.
TYPED_TEST_P(HybridIndexTestV1, IsNullFuncTest) {
    this->TestIsNullFunc();
}

// IsNotNull is the complement of IsNull over all rows.
TYPED_TEST_P(HybridIndexTestV1, IsNotNullFuncTest) {
    this->TestIsNotNullFunc();
}

// Unary comparisons checked against references computed from the raw data;
// null rows must never match.
TYPED_TEST_P(HybridIndexTestV1, CompareValFuncTest) {
    this->TestCompareValueFunc();
}
@ -356,6 +435,8 @@ using BitmapType =
REGISTER_TYPED_TEST_SUITE_P(HybridIndexTestV1,
CountFuncTest,
INFuncTest,
IsNullFuncTest,
IsNotNullFuncTest,
NotINFuncTest,
CompareValFuncTest,
TestRangeCompareFuncTest);
@ -371,6 +452,7 @@ class HybridIndexTestV2 : public HybridIndexTestV1<T> {
SetParam() override {
this->nb_ = 10000;
this->cardinality_ = 2000;
this->nullable_ = false;
}
virtual ~HybridIndexTestV2() {
@ -392,6 +474,14 @@ TYPED_TEST_P(HybridIndexTestV2, NotINFuncTest) {
this->TestNotInFunc();
}
// IsNull must mark exactly the rows whose valid bit is unset.
TYPED_TEST_P(HybridIndexTestV2, IsNullFuncTest) {
    this->TestIsNullFunc();
}

// IsNotNull is the complement of IsNull over all rows.
TYPED_TEST_P(HybridIndexTestV2, IsNotNullFuncTest) {
    this->TestIsNotNullFunc();
}

// Unary comparisons checked against references computed from the raw data;
// null rows must never match.
TYPED_TEST_P(HybridIndexTestV2, CompareValFuncTest) {
    this->TestCompareValueFunc();
}
@ -400,12 +490,68 @@ TYPED_TEST_P(HybridIndexTestV2, TestRangeCompareFuncTest) {
this->TestRangeCompareFunc();
}
// Nullable variant of the hybrid scalar index fixture: same high cardinality
// as V2, but with nullable_ enabled so the base fixture marks every other
// row as null.
template <typename T>
class HybridIndexTestNullable : public HybridIndexTestV1<T> {
 public:
    void
    SetParam() override {
        this->nb_ = 10000;
        this->cardinality_ = 2000;
        this->nullable_ = true;
    }

    ~HybridIndexTestNullable() override = default;
};
TYPED_TEST_SUITE_P(HybridIndexTestNullable);

// Null rows are still counted: Count() reports all rows, valid or not.
TYPED_TEST_P(HybridIndexTestNullable, CountFuncTest) {
    EXPECT_EQ(this->index_->Count(), this->nb_);
}

// In/NotIn/compare checks: null rows must never match any predicate.
TYPED_TEST_P(HybridIndexTestNullable, INFuncTest) {
    this->TestInFunc();
}

TYPED_TEST_P(HybridIndexTestNullable, NotINFuncTest) {
    this->TestNotInFunc();
}

// IsNull / IsNotNull must agree exactly with the validity bitmap.
TYPED_TEST_P(HybridIndexTestNullable, IsNullFuncTest) {
    this->TestIsNullFunc();
}

TYPED_TEST_P(HybridIndexTestNullable, IsNotNullFuncTest) {
    this->TestIsNotNullFunc();
}

TYPED_TEST_P(HybridIndexTestNullable, CompareValFuncTest) {
    this->TestCompareValueFunc();
}

TYPED_TEST_P(HybridIndexTestNullable, TestRangeCompareFuncTest) {
    this->TestRangeCompareFunc();
}
// Element types exercised by every hybrid-index suite below.
using BitmapType =
    testing::Types<int8_t, int16_t, int32_t, int64_t, std::string>;

// Register the full check set for the high-cardinality (non-nullable) suite.
REGISTER_TYPED_TEST_SUITE_P(HybridIndexTestV2,
                            CountFuncTest,
                            INFuncTest,
                            IsNullFuncTest,
                            IsNotNullFuncTest,
                            NotINFuncTest,
                            CompareValFuncTest,
                            TestRangeCompareFuncTest);

// The nullable suite registers the identical check set; the fixture's
// nullable_ flag is what changes the expected results.
REGISTER_TYPED_TEST_SUITE_P(HybridIndexTestNullable,
                            CountFuncTest,
                            INFuncTest,
                            IsNullFuncTest,
                            IsNotNullFuncTest,
                            NotINFuncTest,
                            CompareValFuncTest,
                            TestRangeCompareFuncTest);
@ -413,3 +559,7 @@ REGISTER_TYPED_TEST_SUITE_P(HybridIndexTestV2,
// End-to-end instantiation for the high-cardinality (non-nullable) suite.
INSTANTIATE_TYPED_TEST_SUITE_P(HybridIndexE2ECheck_HighCardinality,
                               HybridIndexTestV2,
                               BitmapType);

// End-to-end instantiation for the nullable suite over the same types.
INSTANTIATE_TYPED_TEST_SUITE_P(HybridIndexE2ECheck_Nullable,
                               HybridIndexTestNullable,
                               BitmapType);

View File

@ -32,8 +32,8 @@ gen_field_meta(int64_t collection_id = 1,
int64_t segment_id = 3,
int64_t field_id = 101,
DataType data_type = DataType::NONE,
DataType element_type = DataType::NONE)
-> storage::FieldDataMeta {
DataType element_type = DataType::NONE,
bool nullable = false) -> storage::FieldDataMeta {
auto meta = storage::FieldDataMeta{
.collection_id = collection_id,
.partition_id = partition_id,
@ -44,6 +44,7 @@ gen_field_meta(int64_t collection_id = 1,
static_cast<proto::schema::DataType>(data_type));
meta.field_schema.set_element_type(
static_cast<proto::schema::DataType>(element_type));
meta.field_schema.set_nullable(nullable);
return meta;
}
@ -92,7 +93,10 @@ struct ChunkManagerWrapper {
};
} // namespace milvus::test
template <typename T, DataType dtype, DataType element_type = DataType::NONE>
template <typename T,
DataType dtype,
DataType element_type = DataType::NONE,
bool nullable = false>
void
test_run() {
int64_t collection_id = 1;
@ -102,8 +106,13 @@ test_run() {
int64_t index_build_id = 1000;
int64_t index_version = 10000;
auto field_meta = test::gen_field_meta(
collection_id, partition_id, segment_id, field_id, dtype, element_type);
auto field_meta = test::gen_field_meta(collection_id,
partition_id,
segment_id,
field_id,
dtype,
element_type,
nullable);
auto index_meta = test::gen_index_meta(
segment_id, field_id, index_build_id, index_version);
@ -114,6 +123,7 @@ test_run() {
size_t nb = 10000;
std::vector<T> data_gen;
boost::container::vector<T> data;
FixedVector<bool> valid_data;
if constexpr (!std::is_same_v<T, bool>) {
data_gen = GenSortedArr<T>(nb);
} else {
@ -121,12 +131,36 @@ test_run() {
data_gen.push_back(rand() % 2 == 0);
}
}
if (nullable) {
valid_data.reserve(nb);
for (size_t i = 0; i < nb; i++) {
valid_data.push_back(rand() % 2 == 0);
}
}
for (auto x : data_gen) {
data.push_back(x);
}
auto field_data = storage::CreateFieldData(dtype);
field_data->FillFieldData(data.data(), data.size());
auto field_data = storage::CreateFieldData(dtype, nullable);
if (nullable) {
int byteSize = (nb + 7) / 8;
uint8_t* valid_data_ = new uint8_t[byteSize];
for (int i = 0; i < nb; i++) {
bool value = valid_data[i];
int byteIndex = i / 8;
int bitIndex = i % 8;
if (value) {
valid_data_[byteIndex] |= (1 << bitIndex);
} else {
valid_data_[byteIndex] &= ~(1 << bitIndex);
}
}
field_data->FillFieldData(data.data(), valid_data_, data.size());
delete[] valid_data_;
} else {
field_data->FillFieldData(data.data(), data.size());
}
// std::cout << "length:" << field_data->get_num_rows() << std::endl;
storage::InsertData insert_data(field_data);
insert_data.SetFieldDataMeta(field_meta);
insert_data.SetTimestamps(0, 100);
@ -197,7 +231,11 @@ test_run() {
real_index->In(test_data.size(), test_data.data());
ASSERT_EQ(cnt, bitset.size());
for (size_t i = 0; i < bitset.size(); i++) {
ASSERT_EQ(bitset[i], s.find(data[i]) != s.end());
if (nullable && !valid_data[i]) {
ASSERT_EQ(bitset[i], false);
} else {
ASSERT_EQ(bitset[i], s.find(data[i]) != s.end());
}
}
}
@ -213,7 +251,35 @@ test_run() {
real_index->NotIn(test_data.size(), test_data.data());
ASSERT_EQ(cnt, bitset.size());
for (size_t i = 0; i < bitset.size(); i++) {
ASSERT_NE(bitset[i], s.find(data[i]) != s.end());
if (nullable && !valid_data[i]) {
ASSERT_EQ(bitset[i], false);
} else {
ASSERT_NE(bitset[i], s.find(data[i]) != s.end());
}
}
}
{
auto bitset = real_index->IsNull();
ASSERT_EQ(cnt, bitset.size());
for (size_t i = 0; i < bitset.size(); i++) {
if (nullable && !valid_data[i]) {
ASSERT_EQ(bitset[i], true);
} else {
ASSERT_EQ(bitset[i], false);
}
}
}
{
auto bitset = real_index->IsNotNull();
ASSERT_EQ(cnt, bitset.size());
for (size_t i = 0; i < bitset.size(); i++) {
if (nullable && !valid_data[i]) {
ASSERT_EQ(bitset[i], false);
} else {
ASSERT_EQ(bitset[i], true);
}
}
}
}
@ -241,12 +307,16 @@ test_run() {
for (const auto& [test_value, op, ref] : test_cases) {
auto bitset = real_index->Range(test_value, op);
ASSERT_EQ(cnt, bitset.size());
for (size_t i = 0; i < bitset.size(); i++) {
for (size_t i = 0; i < nb; i++) {
auto ans = bitset[i];
auto should = ref(i);
ASSERT_EQ(ans, should)
<< "op: " << op << ", @" << i << ", ans: " << ans
<< ", ref: " << should;
if (nullable && !valid_data[i]) {
ASSERT_EQ(ans, false);
} else {
ASSERT_EQ(ans, should)
<< "op: " << op << ", @" << i
<< ", ans: " << ans << ", ref: " << should;
}
}
}
}
@ -287,11 +357,16 @@ test_run() {
auto bitset =
real_index->Range(lb, lb_inclusive, ub, ub_inclusive);
ASSERT_EQ(cnt, bitset.size());
for (size_t i = 0; i < bitset.size(); i++) {
for (size_t i = 0; i < nb; i++) {
auto ans = bitset[i];
auto should = ref(i);
ASSERT_EQ(ans, should) << "@" << i << ", ans: " << ans
<< ", ref: " << should;
if (nullable && !valid_data[i]) {
ASSERT_EQ(ans, false);
} else {
ASSERT_EQ(ans, should)
<< "@" << i << ", ans: " << ans
<< ", ref: " << should;
}
}
}
}
@ -299,6 +374,7 @@ test_run() {
}
}
template <bool nullable = false>
void
test_string() {
using T = std::string;
@ -316,7 +392,8 @@ test_string() {
segment_id,
field_id,
dtype,
DataType::NONE);
DataType::NONE,
nullable);
auto index_meta = test::gen_index_meta(
segment_id, field_id, index_build_id, index_version);
@ -326,12 +403,36 @@ test_string() {
size_t nb = 10000;
boost::container::vector<T> data;
FixedVector<bool> valid_data;
for (size_t i = 0; i < nb; i++) {
data.push_back(std::to_string(rand()));
}
if (nullable) {
valid_data.reserve(nb);
for (size_t i = 0; i < nb; i++) {
valid_data.push_back(rand() % 2 == 0);
}
}
auto field_data = storage::CreateFieldData(dtype, false);
field_data->FillFieldData(data.data(), data.size());
auto field_data = storage::CreateFieldData(dtype, nullable);
if (nullable) {
int byteSize = (nb + 7) / 8;
uint8_t* valid_data_ = new uint8_t[byteSize];
for (int i = 0; i < nb; i++) {
bool value = valid_data[i];
int byteIndex = i / 8;
int bitIndex = i % 8;
if (value) {
valid_data_[byteIndex] |= (1 << bitIndex);
} else {
valid_data_[byteIndex] &= ~(1 << bitIndex);
}
}
field_data->FillFieldData(data.data(), valid_data_, data.size());
delete[] valid_data_;
} else {
field_data->FillFieldData(data.data(), data.size());
}
storage::InsertData insert_data(field_data);
insert_data.SetFieldDataMeta(field_meta);
insert_data.SetTimestamps(0, 100);
@ -399,7 +500,11 @@ test_string() {
auto bitset = real_index->In(test_data.size(), test_data.data());
ASSERT_EQ(cnt, bitset.size());
for (size_t i = 0; i < bitset.size(); i++) {
ASSERT_EQ(bitset[i], s.find(data[i]) != s.end());
if (nullable && !valid_data[i]) {
ASSERT_EQ(bitset[i], false);
} else {
ASSERT_EQ(bitset[i], s.find(data[i]) != s.end());
}
}
}
@ -414,7 +519,11 @@ test_string() {
auto bitset = real_index->NotIn(test_data.size(), test_data.data());
ASSERT_EQ(cnt, bitset.size());
for (size_t i = 0; i < bitset.size(); i++) {
ASSERT_NE(bitset[i], s.find(data[i]) != s.end());
if (nullable && !valid_data[i]) {
ASSERT_EQ(bitset[i], false);
} else {
ASSERT_NE(bitset[i], s.find(data[i]) != s.end());
}
}
}
@ -441,9 +550,13 @@ test_string() {
for (size_t i = 0; i < bitset.size(); i++) {
auto ans = bitset[i];
auto should = ref(i);
ASSERT_EQ(ans, should)
<< "op: " << op << ", @" << i << ", ans: " << ans
<< ", ref: " << should;
if (nullable && !valid_data[i]) {
ASSERT_EQ(ans, false);
} else {
ASSERT_EQ(ans, should)
<< "op: " << op << ", @" << i << ", ans: " << ans
<< ", ref: " << should;
}
}
}
}
@ -484,11 +597,15 @@ test_string() {
auto bitset =
real_index->Range(lb, lb_inclusive, ub, ub_inclusive);
ASSERT_EQ(cnt, bitset.size());
for (size_t i = 0; i < bitset.size(); i++) {
for (size_t i = 0; i < nb; i++) {
auto ans = bitset[i];
auto should = ref(i);
ASSERT_EQ(ans, should)
<< "@" << i << ", ans: " << ans << ", ref: " << should;
if (nullable && !valid_data[i]) {
ASSERT_EQ(ans, false);
} else {
ASSERT_EQ(ans, should) << "@" << i << ", ans: " << ans
<< ", ref: " << should;
}
}
}
}
@ -501,7 +618,11 @@ test_string() {
auto bitset = real_index->Query(dataset);
ASSERT_EQ(cnt, bitset.size());
for (size_t i = 0; i < bitset.size(); i++) {
ASSERT_EQ(bitset[i], boost::starts_with(data[i], prefix));
auto should = boost::starts_with(data[i], prefix);
if (nullable && !valid_data[i]) {
should = false;
}
ASSERT_EQ(bitset[i], should);
}
}
@ -511,7 +632,11 @@ test_string() {
auto bitset = real_index->RegexQuery(prefix + "(.|\n)*");
ASSERT_EQ(cnt, bitset.size());
for (size_t i = 0; i < bitset.size(); i++) {
ASSERT_EQ(bitset[i], boost::starts_with(data[i], prefix));
auto should = boost::starts_with(data[i], prefix);
if (nullable && !valid_data[i]) {
should = false;
}
ASSERT_EQ(bitset[i], should);
}
}
}
@ -529,4 +654,15 @@ TEST(InvertedIndex, Naive) {
test_run<double, DataType::DOUBLE>();
test_string();
test_run<int8_t, DataType::INT8, DataType::NONE, true>();
test_run<int16_t, DataType::INT16, DataType::NONE, true>();
test_run<int32_t, DataType::INT32, DataType::NONE, true>();
test_run<int64_t, DataType::INT64, DataType::NONE, true>();
test_run<bool, DataType::BOOL, DataType::NONE, true>();
test_run<float, DataType::FLOAT, DataType::NONE, true>();
test_run<double, DataType::DOUBLE, DataType::NONE, true>();
test_string<true>();
}

View File

@ -1872,7 +1872,7 @@ TEST(Sealed, SkipIndexSkipUnaryRange) {
storage::CreateFieldData(DataType::INT64, false, 1, 10);
pk_field_data->FillFieldData(pks.data(), N);
segment->LoadPrimitiveSkipIndex(
pk_fid, 0, DataType::INT64, pk_field_data->Data(), N);
pk_fid, 0, DataType::INT64, pk_field_data->Data(), nullptr, N);
auto& skip_index = segment->GetSkipIndex();
bool equal_5_skip =
skip_index.CanSkipUnaryRange<int64_t>(pk_fid, 0, OpType::Equal, 5);
@ -1914,7 +1914,7 @@ TEST(Sealed, SkipIndexSkipUnaryRange) {
storage::CreateFieldData(DataType::INT32, false, 1, 10);
int32_field_data->FillFieldData(int32s.data(), N);
segment->LoadPrimitiveSkipIndex(
i32_fid, 0, DataType::INT32, int32_field_data->Data(), N);
i32_fid, 0, DataType::INT32, int32_field_data->Data(), nullptr, N);
less_than_1_skip =
skip_index.CanSkipUnaryRange<int32_t>(i32_fid, 0, OpType::LessThan, 1);
ASSERT_TRUE(less_than_1_skip);
@ -1925,7 +1925,7 @@ TEST(Sealed, SkipIndexSkipUnaryRange) {
storage::CreateFieldData(DataType::INT16, false, 1, 10);
int16_field_data->FillFieldData(int16s.data(), N);
segment->LoadPrimitiveSkipIndex(
i16_fid, 0, DataType::INT16, int16_field_data->Data(), N);
i16_fid, 0, DataType::INT16, int16_field_data->Data(), nullptr, N);
bool less_than_12_skip =
skip_index.CanSkipUnaryRange<int16_t>(i16_fid, 0, OpType::LessThan, 12);
ASSERT_FALSE(less_than_12_skip);
@ -1936,7 +1936,7 @@ TEST(Sealed, SkipIndexSkipUnaryRange) {
storage::CreateFieldData(DataType::INT8, false, 1, 10);
int8_field_data->FillFieldData(int8s.data(), N);
segment->LoadPrimitiveSkipIndex(
i8_fid, 0, DataType::INT8, int8_field_data->Data(), N);
i8_fid, 0, DataType::INT8, int8_field_data->Data(), nullptr, N);
bool greater_than_12_skip = skip_index.CanSkipUnaryRange<int8_t>(
i8_fid, 0, OpType::GreaterThan, 12);
ASSERT_TRUE(greater_than_12_skip);
@ -1948,7 +1948,7 @@ TEST(Sealed, SkipIndexSkipUnaryRange) {
storage::CreateFieldData(DataType::FLOAT, false, 1, 10);
float_field_data->FillFieldData(floats.data(), N);
segment->LoadPrimitiveSkipIndex(
float_fid, 0, DataType::FLOAT, float_field_data->Data(), N);
float_fid, 0, DataType::FLOAT, float_field_data->Data(), nullptr, N);
greater_than_10_skip = skip_index.CanSkipUnaryRange<float>(
float_fid, 0, OpType::GreaterThan, 10.0);
ASSERT_TRUE(greater_than_10_skip);
@ -1960,7 +1960,7 @@ TEST(Sealed, SkipIndexSkipUnaryRange) {
storage::CreateFieldData(DataType::DOUBLE, false, 1, 10);
double_field_data->FillFieldData(doubles.data(), N);
segment->LoadPrimitiveSkipIndex(
double_fid, 0, DataType::DOUBLE, double_field_data->Data(), N);
double_fid, 0, DataType::DOUBLE, double_field_data->Data(), nullptr, N);
greater_than_10_skip = skip_index.CanSkipUnaryRange<double>(
double_fid, 0, OpType::GreaterThan, 10.0);
ASSERT_TRUE(greater_than_10_skip);
@ -1984,7 +1984,7 @@ TEST(Sealed, SkipIndexSkipBinaryRange) {
storage::CreateFieldData(DataType::INT64, false, 1, 10);
pk_field_data->FillFieldData(pks.data(), N);
segment->LoadPrimitiveSkipIndex(
pk_fid, 0, DataType::INT64, pk_field_data->Data(), N);
pk_fid, 0, DataType::INT64, pk_field_data->Data(), nullptr, N);
auto& skip_index = segment->GetSkipIndex();
ASSERT_FALSE(
skip_index.CanSkipBinaryRange<int64_t>(pk_fid, 0, -3, 1, true, true));
@ -2002,6 +2002,117 @@ TEST(Sealed, SkipIndexSkipBinaryRange) {
skip_index.CanSkipBinaryRange<int64_t>(pk_fid, 0, 10, 12, true, true));
}
TEST(Sealed, SkipIndexSkipUnaryRangeNullable) {
    // Load a 5-row nullable int64 chunk where only rows 0 and 1 are valid
    // (bitmap 0x03): the skip index must derive its min/max from the valid
    // values {1, 2} only, ignoring the null rows {3, 4, 5}.
    auto schema = std::make_shared<Schema>();
    auto dim = 128;
    auto metrics_type = "L2";
    auto fake_vec_fid = schema->AddDebugField(
        "fakeVec", DataType::VECTOR_FLOAT, dim, metrics_type);
    auto i64_fid = schema->AddDebugField("int64_field", DataType::INT64, true);
    auto dataset = DataGen(schema, 5);
    auto segment = CreateSealedSegment(schema);

    //test for int64
    std::vector<int64_t> int64s = {1, 2, 3, 4, 5};
    // Fix: stack storage instead of `new uint8_t[1]` that was never deleted
    // (memory leak in the original test body).
    uint8_t valid_data[1] = {0x03};
    FixedVector<bool> valid_data_ = {true, true, false, false, false};
    auto int64s_field_data =
        storage::CreateFieldData(DataType::INT64, true, 1, 5);
    int64s_field_data->FillFieldData(int64s.data(), valid_data, 5);
    segment->LoadPrimitiveSkipIndex(i64_fid,
                                    0,
                                    DataType::INT64,
                                    int64s_field_data->Data(),
                                    valid_data_.data(),
                                    5);
    auto& skip_index = segment->GetSkipIndex();

    // Equal: any value outside the valid range [1, 2] is skippable.
    bool equal_5_skip =
        skip_index.CanSkipUnaryRange<int64_t>(i64_fid, 0, OpType::Equal, 5);
    bool equal_4_skip =
        skip_index.CanSkipUnaryRange<int64_t>(i64_fid, 0, OpType::Equal, 4);
    bool equal_2_skip =
        skip_index.CanSkipUnaryRange<int64_t>(i64_fid, 0, OpType::Equal, 2);
    bool equal_1_skip =
        skip_index.CanSkipUnaryRange<int64_t>(i64_fid, 0, OpType::Equal, 1);
    ASSERT_TRUE(equal_5_skip);
    ASSERT_TRUE(equal_4_skip);
    ASSERT_FALSE(equal_2_skip);
    ASSERT_FALSE(equal_1_skip);

    // LessThan: nothing valid is < 1; values < 5 do exist among valid rows.
    bool less_than_1_skip =
        skip_index.CanSkipUnaryRange<int64_t>(i64_fid, 0, OpType::LessThan, 1);
    bool less_than_5_skip =
        skip_index.CanSkipUnaryRange<int64_t>(i64_fid, 0, OpType::LessThan, 5);
    ASSERT_TRUE(less_than_1_skip);
    ASSERT_FALSE(less_than_5_skip);

    bool less_equal_than_1_skip =
        skip_index.CanSkipUnaryRange<int64_t>(i64_fid, 0, OpType::LessEqual, 1);
    bool less_equal_than_15_skip =
        skip_index.CanSkipUnaryRange<int64_t>(i64_fid, 0, OpType::LessThan, 15);
    ASSERT_FALSE(less_equal_than_1_skip);
    ASSERT_FALSE(less_equal_than_15_skip);

    // GreaterThan: the valid maximum is 2, so > 2 (and above) is skippable.
    bool greater_than_10_skip = skip_index.CanSkipUnaryRange<int64_t>(
        i64_fid, 0, OpType::GreaterThan, 10);
    bool greater_than_5_skip = skip_index.CanSkipUnaryRange<int64_t>(
        i64_fid, 0, OpType::GreaterThan, 5);
    bool greater_than_2_skip = skip_index.CanSkipUnaryRange<int64_t>(
        i64_fid, 0, OpType::GreaterThan, 2);
    bool greater_than_1_skip = skip_index.CanSkipUnaryRange<int64_t>(
        i64_fid, 0, OpType::GreaterThan, 1);
    ASSERT_TRUE(greater_than_10_skip);
    ASSERT_TRUE(greater_than_5_skip);
    ASSERT_TRUE(greater_than_2_skip);
    ASSERT_FALSE(greater_than_1_skip);

    bool greater_equal_than_3_skip = skip_index.CanSkipUnaryRange<int64_t>(
        i64_fid, 0, OpType::GreaterEqual, 3);
    bool greater_equal_than_2_skip = skip_index.CanSkipUnaryRange<int64_t>(
        i64_fid, 0, OpType::GreaterEqual, 2);
    ASSERT_TRUE(greater_equal_than_3_skip);
    ASSERT_FALSE(greater_equal_than_2_skip);
}
TEST(Sealed, SkipIndexSkipBinaryRangeNullable) {
    // Load a 5-row nullable int64 chunk where only rows 0 and 1 are valid
    // (bitmap 0x03): binary-range skipping must be decided against the valid
    // value range [1, 2] only.
    auto schema = std::make_shared<Schema>();
    auto dim = 128;
    auto metrics_type = "L2";
    auto fake_vec_fid = schema->AddDebugField(
        "fakeVec", DataType::VECTOR_FLOAT, dim, metrics_type);
    auto i64_fid = schema->AddDebugField("int64_field", DataType::INT64, true);
    auto dataset = DataGen(schema, 5);
    auto segment = CreateSealedSegment(schema);

    //test for int64
    std::vector<int64_t> int64s = {1, 2, 3, 4, 5};
    // Fix: stack storage instead of `new uint8_t[1]` that was never deleted
    // (memory leak in the original test body).
    uint8_t valid_data[1] = {0x03};
    FixedVector<bool> valid_data_ = {true, true, false, false, false};
    auto int64s_field_data =
        storage::CreateFieldData(DataType::INT64, true, 1, 5);
    int64s_field_data->FillFieldData(int64s.data(), valid_data, 5);
    segment->LoadPrimitiveSkipIndex(i64_fid,
                                    0,
                                    DataType::INT64,
                                    int64s_field_data->Data(),
                                    valid_data_.data(),
                                    5);
    auto& skip_index = segment->GetSkipIndex();
    // [-3, 1] touches the valid minimum 1 only when the upper end is closed.
    ASSERT_FALSE(
        skip_index.CanSkipBinaryRange<int64_t>(i64_fid, 0, -3, 1, true, true));
    ASSERT_TRUE(
        skip_index.CanSkipBinaryRange<int64_t>(i64_fid, 0, -3, 1, true, false));
    ASSERT_FALSE(
        skip_index.CanSkipBinaryRange<int64_t>(i64_fid, 0, 1, 3, true, true));
    ASSERT_FALSE(
        skip_index.CanSkipBinaryRange<int64_t>(i64_fid, 0, 1, 2, true, false));
    // (2, 3] excludes the valid maximum 2, so it is skippable; [2, 3] is not.
    ASSERT_TRUE(
        skip_index.CanSkipBinaryRange<int64_t>(i64_fid, 0, 2, 3, false, true));
    ASSERT_FALSE(
        skip_index.CanSkipBinaryRange<int64_t>(i64_fid, 0, 2, 3, true, true));
}
TEST(Sealed, SkipIndexSkipStringRange) {
auto schema = std::make_shared<Schema>();
auto dim = 128;