enhance: all op(Null) is false in expr (#35527)

#31728

---------

Signed-off-by: lixinguo <xinguo.li@zilliz.com>
Co-authored-by: lixinguo <xinguo.li@zilliz.com>
smellthemoon 2024-10-17 21:14:30 +08:00 committed by GitHub
parent 04c306e63f
commit eb3e4583ec
39 changed files with 8876 additions and 397 deletions

View File

@ -69,7 +69,7 @@ FieldDataImpl<Type, is_type_entire_row>::FillFieldData(
ssize_t byte_count = (element_count + 7) / 8;
// Note: if `nullable == true` and valid_data is nullptr,
// it means null_count == 0, so fill valid_data_ with 0xFF
if (!valid_data) {
if (valid_data == nullptr) {
valid_data_.assign(byte_count, 0xFF);
} else {
std::copy_n(valid_data, byte_count, valid_data_.data());
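For readers unfamiliar with the byte-packed validity convention this hunk relies on, here is a minimal, self-contained sketch: validity occupies (element_count + 7) / 8 bytes, and a missing valid_data pointer means "no nulls", hence the 0xFF fill. LSB-first bit order within each byte is an assumption for illustration, not taken from this diff.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    const size_t element_count = 10;
    const size_t byte_count = (element_count + 7) / 8;  // 2 bytes cover 10 rows

    // valid_data == nullptr: fill with 0xFF, i.e. null_count == 0.
    std::vector<uint8_t> valid_bytes(byte_count, 0xFF);

    // Mark row 3 as null by clearing its bit, then probe a few rows.
    valid_bytes[3 >> 3] &= ~(uint8_t(1) << (3 & 7));
    for (size_t i : {0u, 3u, 9u}) {
        bool valid = (valid_bytes[i >> 3] >> (i & 7)) & 1;
        std::cout << "row " << i << (valid ? ": valid\n" : ": null\n");
    }
}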

View File

@ -19,6 +19,8 @@
#include <memory>
#include <string>
#include "EasyAssert.h"
#include "Types.h"
#include "common/FieldData.h"
namespace milvus {
@ -50,6 +52,7 @@ class BaseVector {
protected:
DataType type_kind_;
size_t length_;
// TODO: use null_count to skip some bitset operations
std::optional<size_t> null_count_;
};
@ -65,8 +68,8 @@ class ColumnVector final : public BaseVector {
size_t length,
std::optional<size_t> null_count = std::nullopt)
: BaseVector(data_type, length, null_count) {
//todo: support null expr
values_ = InitScalarFieldData(data_type, false, length);
valid_values_ = InitScalarFieldData(data_type, false, length);
}
// ColumnVector(FixedVector<bool>&& data)
@ -75,15 +78,25 @@ class ColumnVector final : public BaseVector {
// std::make_shared<FieldData<bool>>(DataType::BOOL, std::move(data));
// }
// // the size is the number of bits
// ColumnVector(TargetBitmap&& bitmap)
// : BaseVector(DataType::INT8, bitmap.size()) {
// values_ = std::make_shared<FieldDataImpl<uint8_t, false>>(
// bitmap.size(), DataType::INT8, false, std::move(bitmap).into());
// }
// the size is the number of bits
ColumnVector(TargetBitmap&& bitmap)
ColumnVector(TargetBitmap&& bitmap, TargetBitmap&& valid_bitmap)
: BaseVector(DataType::INT8, bitmap.size()) {
values_ = std::make_shared<FieldBitsetImpl<uint8_t>>(DataType::INT8,
std::move(bitmap));
valid_values_ = std::make_shared<FieldBitsetImpl<uint8_t>>(
DataType::INT8, std::move(valid_bitmap));
}
virtual ~ColumnVector() override {
values_.reset();
valid_values_.reset();
}
void*
@ -91,6 +104,11 @@ class ColumnVector final : public BaseVector {
return values_->Data();
}
void*
GetValidRawData() {
return valid_values_->Data();
}
template <typename As>
const As*
RawAsValues() const {
@ -99,6 +117,7 @@ class ColumnVector final : public BaseVector {
private:
FieldDataPtr values_;
FieldDataPtr valid_values_;
};
using ColumnVectorPtr = std::shared_ptr<ColumnVector>;
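As a rough illustration of the two-bitmap result a ColumnVector now carries (values_ holds the filter outcome, valid_values_ records whether that outcome is meaningful), here is a hedged mock using std::vector<bool> in place of TargetBitmap. It is not the milvus API, only the convention:

#include <cstddef>
#include <iostream>
#include <vector>

struct MockColumnVector {
    std::vector<bool> res;    // per-row outcome of op(x)
    std::vector<bool> valid;  // false where the input row was NULL
};

int main() {
    const int data[] = {1, 5, 0};
    const bool valid_data[] = {true, true, false};  // row 2 is NULL

    MockColumnVector rv{std::vector<bool>(3), std::vector<bool>(3, true)};
    for (size_t i = 0; i < 3; ++i) {
        if (!valid_data[i]) {
            rv.res[i] = rv.valid[i] = false;  // all op(Null) is false
            continue;
        }
        rv.res[i] = data[i] > 2;  // any concrete op
    }
    for (size_t i = 0; i < 3; ++i)
        std::cout << i << ": res=" << rv.res[i] << " valid=" << rv.valid[i] << "\n";
}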

View File

@ -25,16 +25,19 @@ PhyAlwaysTrueExpr::Eval(EvalCtx& context, VectorPtr& result) {
? active_count_ - current_pos_
: batch_size_;
// always true, so there is no need to skip nulls
if (real_batch_size == 0) {
result = nullptr;
return;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
res.set();
valid_res.set();
result = res_vec;
current_pos_ += real_batch_size;

View File

@ -113,9 +113,11 @@ PhyBinaryArithOpEvalRangeExpr::ExecRangeVisitorImplForJson() {
if (real_batch_size == 0) {
return nullptr;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
auto op_type = expr_->op_type_;
@ -129,6 +131,11 @@ PhyBinaryArithOpEvalRangeExpr::ExecRangeVisitorImplForJson() {
#define BinaryArithRangeJSONCompare(cmp) \
do { \
for (size_t i = 0; i < size; ++i) { \
if (valid_data != nullptr && !valid_data[i]) { \
res[i] = false; \
valid_res[i] = false; \
continue; \
} \
auto x = data[i].template at<GetType>(pointer); \
if (x.error()) { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
@ -146,6 +153,11 @@ PhyBinaryArithOpEvalRangeExpr::ExecRangeVisitorImplForJson() {
#define BinaryArithRangeJSONCompareNotEqual(cmp) \
do { \
for (size_t i = 0; i < size; ++i) { \
if (valid_data != nullptr && !valid_data[i]) { \
res[i] = false; \
valid_res[i] = false; \
continue; \
} \
auto x = data[i].template at<GetType>(pointer); \
if (x.error()) { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
@ -161,8 +173,10 @@ PhyBinaryArithOpEvalRangeExpr::ExecRangeVisitorImplForJson() {
} while (false)
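Roughly, one instantiation of the macro above expands to the loop below, written as a plain function for the Add/Equal case. The accessor is a hypothetical stand-in for Json::at<GetType>(pointer); note the two distinct "false" outcomes: a lookup error keeps the row valid, while a NULL row clears both bitmaps.

#include <cstddef>
#include <cstdint>
#include <functional>
#include <optional>
#include <vector>

void
EvalAddEqualOnJson(
    const std::function<std::optional<int64_t>(size_t)>& at,  // nullopt == x.error()
    const bool* valid_data,  // may be nullptr, meaning every row is valid
    size_t size,
    int64_t right_operand,
    int64_t val,
    std::vector<bool>& res,
    std::vector<bool>& valid_res) {
    for (size_t i = 0; i < size; ++i) {
        if (valid_data != nullptr && !valid_data[i]) {
            res[i] = false;        // op(Null) is false ...
            valid_res[i] = false;  // ... and the row is marked invalid
            continue;
        }
        auto x = at(i);
        if (!x.has_value()) {  // JSON lookup failed: false, but still valid
            res[i] = false;
            continue;
        }
        res[i] = (x.value() + right_operand) == val;
    }
}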
auto execute_sub_batch = [op_type, arith_type](const milvus::Json* data,
const bool* valid_data,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
ValueType val,
ValueType right_operand,
const std::string& pointer) {
@ -197,6 +211,11 @@ PhyBinaryArithOpEvalRangeExpr::ExecRangeVisitorImplForJson() {
}
case proto::plan::ArithOpType::ArrayLength: {
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = false;
valid_res[i] = false;
continue;
}
int array_length = 0;
auto doc = data[i].doc();
auto array = doc.at_pointer(pointer).get_array();
@ -246,6 +265,11 @@ PhyBinaryArithOpEvalRangeExpr::ExecRangeVisitorImplForJson() {
}
case proto::plan::ArithOpType::ArrayLength: {
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = false;
valid_res[i] = false;
continue;
}
int array_length = 0;
auto doc = data[i].doc();
auto array = doc.at_pointer(pointer).get_array();
@ -295,6 +319,11 @@ PhyBinaryArithOpEvalRangeExpr::ExecRangeVisitorImplForJson() {
}
case proto::plan::ArithOpType::ArrayLength: {
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = false;
valid_res[i] = false;
continue;
}
int array_length = 0;
auto doc = data[i].doc();
auto array = doc.at_pointer(pointer).get_array();
@ -344,6 +373,11 @@ PhyBinaryArithOpEvalRangeExpr::ExecRangeVisitorImplForJson() {
}
case proto::plan::ArithOpType::ArrayLength: {
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = false;
valid_res[i] = false;
continue;
}
int array_length = 0;
auto doc = data[i].doc();
auto array = doc.at_pointer(pointer).get_array();
@ -393,6 +427,11 @@ PhyBinaryArithOpEvalRangeExpr::ExecRangeVisitorImplForJson() {
}
case proto::plan::ArithOpType::ArrayLength: {
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = false;
valid_res[i] = false;
continue;
}
int array_length = 0;
auto doc = data[i].doc();
auto array = doc.at_pointer(pointer).get_array();
@ -442,6 +481,11 @@ PhyBinaryArithOpEvalRangeExpr::ExecRangeVisitorImplForJson() {
}
case proto::plan::ArithOpType::ArrayLength: {
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = false;
valid_res[i] = false;
continue;
}
int array_length = 0;
auto doc = data[i].doc();
auto array = doc.at_pointer(pointer).get_array();
@ -471,6 +515,7 @@ PhyBinaryArithOpEvalRangeExpr::ExecRangeVisitorImplForJson() {
int64_t processed_size = ProcessDataChunks<milvus::Json>(execute_sub_batch,
std::nullptr_t{},
res,
valid_res,
value,
right_operand,
pointer);
@ -492,9 +537,11 @@ PhyBinaryArithOpEvalRangeExpr::ExecRangeVisitorImplForArray() {
if (real_batch_size == 0) {
return nullptr;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
int index = -1;
if (expr_->column_.nested_path_.size() > 0) {
@ -511,6 +558,11 @@ PhyBinaryArithOpEvalRangeExpr::ExecRangeVisitorImplForArray() {
#define BinaryArithRangeArrayCompare(cmp) \
do { \
for (size_t i = 0; i < size; ++i) { \
if (valid_data != nullptr && !valid_data[i]) { \
res[i] = false; \
valid_res[i] = false; \
continue; \
} \
if (index >= data[i].length()) { \
res[i] = false; \
continue; \
@ -521,8 +573,10 @@ PhyBinaryArithOpEvalRangeExpr::ExecRangeVisitorImplForArray() {
} while (false)
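The array flavor layers a bounds check on top of the null check, and the two are deliberately different: a NULL row is false and invalid, while an in-range miss (index >= length) is false but stays valid. A minimal sketch with illustrative types:

#include <cstddef>
#include <vector>

void
EvalArrayElemEqual(const std::vector<std::vector<int>>& data,
                   const bool* valid_data,  // may be nullptr
                   int index,
                   int val,
                   std::vector<bool>& res,
                   std::vector<bool>& valid_res) {
    for (size_t i = 0; i < data.size(); ++i) {
        if (valid_data != nullptr && !valid_data[i]) {
            res[i] = valid_res[i] = false;  // NULL row: false and invalid
            continue;
        }
        if (index >= static_cast<int>(data[i].size())) {
            res[i] = false;  // out of range: false, but still valid
            continue;
        }
        res[i] = data[i][index] == val;
    }
}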
auto execute_sub_batch = [op_type, arith_type](const ArrayView* data,
const bool* valid_data,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
ValueType val,
ValueType right_operand,
int index) {
@ -558,6 +612,10 @@ PhyBinaryArithOpEvalRangeExpr::ExecRangeVisitorImplForArray() {
}
case proto::plan::ArithOpType::ArrayLength: {
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
res[i] = data[i].length() == val;
}
break;
@ -601,6 +659,10 @@ PhyBinaryArithOpEvalRangeExpr::ExecRangeVisitorImplForArray() {
}
case proto::plan::ArithOpType::ArrayLength: {
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
res[i] = data[i].length() != val;
}
break;
@ -644,6 +706,10 @@ PhyBinaryArithOpEvalRangeExpr::ExecRangeVisitorImplForArray() {
}
case proto::plan::ArithOpType::ArrayLength: {
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
res[i] = data[i].length() > val;
}
break;
@ -687,6 +753,10 @@ PhyBinaryArithOpEvalRangeExpr::ExecRangeVisitorImplForArray() {
}
case proto::plan::ArithOpType::ArrayLength: {
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
res[i] = data[i].length() >= val;
}
break;
@ -730,6 +800,10 @@ PhyBinaryArithOpEvalRangeExpr::ExecRangeVisitorImplForArray() {
}
case proto::plan::ArithOpType::ArrayLength: {
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
res[i] = data[i].length() < val;
}
break;
@ -773,6 +847,10 @@ PhyBinaryArithOpEvalRangeExpr::ExecRangeVisitorImplForArray() {
}
case proto::plan::ArithOpType::ArrayLength: {
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
res[i] = data[i].length() <= val;
}
break;
@ -794,8 +872,14 @@ PhyBinaryArithOpEvalRangeExpr::ExecRangeVisitorImplForArray() {
}
};
int64_t processed_size = ProcessDataChunks<milvus::ArrayView>(
execute_sub_batch, std::nullptr_t{}, res, value, right_operand, index);
int64_t processed_size =
ProcessDataChunks<milvus::ArrayView>(execute_sub_batch,
std::nullptr_t{},
res,
valid_res,
value,
right_operand,
index);
AssertInfo(processed_size == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",
@ -1185,12 +1269,13 @@ PhyBinaryArithOpEvalRangeExpr::ExecRangeVisitorImplForIndex() {
return res;
};
auto res = ProcessIndexChunks<T>(execute_sub_batch, value, right_operand);
AssertInfo(res.size() == real_batch_size,
AssertInfo(res->size() == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",
res.size(),
res->size(),
real_batch_size);
return std::make_shared<ColumnVector>(std::move(res));
return res;
}
template <typename T>
@ -1209,16 +1294,20 @@ PhyBinaryArithOpEvalRangeExpr::ExecRangeVisitorImplForData() {
auto value = GetValueFromProto<HighPrecisionType>(expr_->value_);
auto right_operand =
GetValueFromProto<HighPrecisionType>(expr_->right_operand_);
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
auto op_type = expr_->op_type_;
auto arith_type = expr_->arith_op_type_;
auto execute_sub_batch = [op_type, arith_type](
const T* data,
const bool* valid_data,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
HighPrecisionType value,
HighPrecisionType right_operand) {
switch (op_type) {
@ -1534,9 +1623,23 @@ PhyBinaryArithOpEvalRangeExpr::ExecRangeVisitorImplForData() {
"arithmetic eval expr: {}",
op_type);
}
// ArithOpElementFunc performs a batch operation, so we do not split the
// data around nulls (that could hurt performance when the null
// distribution is scattered); instead, mask res with valid_data after
// the batch operation.
if (valid_data != nullptr) {
for (int i = 0; i < size; i++) {
if (!valid_data[i]) {
res[i] = valid_res[i] = false;
}
}
}
};
int64_t processed_size = ProcessDataChunks<T>(
execute_sub_batch, std::nullptr_t{}, res, value, right_operand);
int64_t processed_size = ProcessDataChunks<T>(execute_sub_batch,
std::nullptr_t{},
res,
valid_res,
value,
right_operand);
AssertInfo(processed_size == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",

View File

@ -239,7 +239,6 @@ struct ArithOpElementFunc {
}
}
*/
if constexpr (!std::is_same_v<decltype(CmpOpHelper<cmp_op>::op),
void>) {
constexpr auto cmp_op_cvt = CmpOpHelper<cmp_op>::op;
@ -282,22 +281,26 @@ struct ArithOpIndexFunc {
HighPrecisonType right_operand) {
TargetBitmap res(size);
for (size_t i = 0; i < size; ++i) {
auto raw = index->Reverse_Lookup(i);
if (!raw.has_value()) {
res[i] = false;
continue;
}
if constexpr (cmp_op == proto::plan::OpType::Equal) {
if constexpr (arith_op == proto::plan::ArithOpType::Add) {
res[i] = (index->Reverse_Lookup(i) + right_operand) == val;
res[i] = (raw.value() + right_operand) == val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Sub) {
res[i] = (index->Reverse_Lookup(i) - right_operand) == val;
res[i] = (raw.value() - right_operand) == val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Mul) {
res[i] = (index->Reverse_Lookup(i) * right_operand) == val;
res[i] = (raw.value() * right_operand) == val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Div) {
res[i] = (index->Reverse_Lookup(i) / right_operand) == val;
res[i] = (raw.value() / right_operand) == val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Mod) {
res[i] =
(fmod(index->Reverse_Lookup(i), right_operand)) == val;
res[i] = (fmod(raw.value(), right_operand)) == val;
} else {
PanicInfo(
OpTypeInvalid,
@ -307,20 +310,19 @@ struct ArithOpIndexFunc {
}
} else if constexpr (cmp_op == proto::plan::OpType::NotEqual) {
if constexpr (arith_op == proto::plan::ArithOpType::Add) {
res[i] = (index->Reverse_Lookup(i) + right_operand) != val;
res[i] = (raw.value() + right_operand) != val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Sub) {
res[i] = (index->Reverse_Lookup(i) - right_operand) != val;
res[i] = (raw.value() - right_operand) != val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Mul) {
res[i] = (index->Reverse_Lookup(i) * right_operand) != val;
res[i] = (raw.value() * right_operand) != val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Div) {
res[i] = (index->Reverse_Lookup(i) / right_operand) != val;
res[i] = (raw.value() / right_operand) != val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Mod) {
res[i] =
(fmod(index->Reverse_Lookup(i), right_operand)) != val;
res[i] = (fmod(raw.value(), right_operand)) != val;
} else {
PanicInfo(
OpTypeInvalid,
@ -330,20 +332,19 @@ struct ArithOpIndexFunc {
}
} else if constexpr (cmp_op == proto::plan::OpType::GreaterThan) {
if constexpr (arith_op == proto::plan::ArithOpType::Add) {
res[i] = (index->Reverse_Lookup(i) + right_operand) > val;
res[i] = (raw.value() + right_operand) > val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Sub) {
res[i] = (index->Reverse_Lookup(i) - right_operand) > val;
res[i] = (raw.value() - right_operand) > val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Mul) {
res[i] = (index->Reverse_Lookup(i) * right_operand) > val;
res[i] = (raw.value() * right_operand) > val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Div) {
res[i] = (index->Reverse_Lookup(i) / right_operand) > val;
res[i] = (raw.value() / right_operand) > val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Mod) {
res[i] =
(fmod(index->Reverse_Lookup(i), right_operand)) > val;
res[i] = (fmod(raw.value(), right_operand)) > val;
} else {
PanicInfo(
OpTypeInvalid,
@ -353,20 +354,19 @@ struct ArithOpIndexFunc {
}
} else if constexpr (cmp_op == proto::plan::OpType::GreaterEqual) {
if constexpr (arith_op == proto::plan::ArithOpType::Add) {
res[i] = (index->Reverse_Lookup(i) + right_operand) >= val;
res[i] = (raw.value() + right_operand) >= val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Sub) {
res[i] = (index->Reverse_Lookup(i) - right_operand) >= val;
res[i] = (raw.value() - right_operand) >= val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Mul) {
res[i] = (index->Reverse_Lookup(i) * right_operand) >= val;
res[i] = (raw.value() * right_operand) >= val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Div) {
res[i] = (index->Reverse_Lookup(i) / right_operand) >= val;
res[i] = (raw.value() / right_operand) >= val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Mod) {
res[i] =
(fmod(index->Reverse_Lookup(i), right_operand)) >= val;
res[i] = (fmod(raw.value(), right_operand)) >= val;
} else {
PanicInfo(
OpTypeInvalid,
@ -376,20 +376,19 @@ struct ArithOpIndexFunc {
}
} else if constexpr (cmp_op == proto::plan::OpType::LessThan) {
if constexpr (arith_op == proto::plan::ArithOpType::Add) {
res[i] = (index->Reverse_Lookup(i) + right_operand) < val;
res[i] = (raw.value() + right_operand) < val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Sub) {
res[i] = (index->Reverse_Lookup(i) - right_operand) < val;
res[i] = (raw.value() - right_operand) < val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Mul) {
res[i] = (index->Reverse_Lookup(i) * right_operand) < val;
res[i] = (raw.value() * right_operand) < val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Div) {
res[i] = (index->Reverse_Lookup(i) / right_operand) < val;
res[i] = (raw.value() / right_operand) < val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Mod) {
res[i] =
(fmod(index->Reverse_Lookup(i), right_operand)) < val;
res[i] = (fmod(raw.value(), right_operand)) < val;
} else {
PanicInfo(
OpTypeInvalid,
@ -399,20 +398,19 @@ struct ArithOpIndexFunc {
}
} else if constexpr (cmp_op == proto::plan::OpType::LessEqual) {
if constexpr (arith_op == proto::plan::ArithOpType::Add) {
res[i] = (index->Reverse_Lookup(i) + right_operand) <= val;
res[i] = (raw.value() + right_operand) <= val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Sub) {
res[i] = (index->Reverse_Lookup(i) - right_operand) <= val;
res[i] = (raw.value() - right_operand) <= val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Mul) {
res[i] = (index->Reverse_Lookup(i) * right_operand) <= val;
res[i] = (raw.value() * right_operand) <= val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Div) {
res[i] = (index->Reverse_Lookup(i) / right_operand) <= val;
res[i] = (raw.value() / right_operand) <= val;
} else if constexpr (arith_op ==
proto::plan::ArithOpType::Mod) {
res[i] =
(fmod(index->Reverse_Lookup(i), right_operand)) <= val;
res[i] = (fmod(raw.value(), right_operand)) <= val;
} else {
PanicInfo(
OpTypeInvalid,

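The mechanical change throughout this file, Reverse_Lookup(i) now returning an optional, is easiest to see in isolation. A self-contained sketch with a map-backed lookup standing in for the real index:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <optional>
#include <unordered_map>

int main() {
    // Map-backed lookup: row 1 has no entry, i.e. it is NULL.
    std::unordered_map<size_t, int64_t> index{{0, 10}, {2, 3}};
    auto reverse_lookup = [&](size_t i) -> std::optional<int64_t> {
        auto it = index.find(i);
        if (it == index.end()) return std::nullopt;
        return it->second;
    };

    const int64_t right_operand = 2, val = 12;
    for (size_t i = 0; i < 3; ++i) {
        auto raw = reverse_lookup(i);
        // A missing value short-circuits to false, as in the hunk above.
        bool hit = raw.has_value() && (raw.value() + right_operand) == val;
        std::cout << "row " << i << ": " << (hit ? "match" : "no match") << "\n";
    }
}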
View File

@ -15,6 +15,7 @@
// limitations under the License.
#include "BinaryRangeExpr.h"
#include <utility>
#include "query/Utils.h"
@ -150,8 +151,12 @@ PhyBinaryRangeFilterExpr::PreCheckOverflow(HighPrecisionType& val1,
cached_overflow_res_->size() == batch_size) {
return cached_overflow_res_;
}
auto res = std::make_shared<ColumnVector>(TargetBitmap(batch_size));
return res;
auto valid_res = ProcessChunksForValid<T>(is_index_mode_);
auto res_vec = std::make_shared<ColumnVector>(TargetBitmap(batch_size),
std::move(valid_res));
cached_overflow_res_ = res_vec;
return res_vec;
};
if constexpr (std::is_integral_v<T> && !std::is_same_v<bool, T>) {
@ -207,12 +212,12 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForIndex() {
func(index_ptr, val1, val2, lower_inclusive, upper_inclusive));
};
auto res = ProcessIndexChunks<T>(execute_sub_batch, val1, val2);
AssertInfo(res.size() == real_batch_size,
AssertInfo(res->size() == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",
res.size(),
res->size(),
real_batch_size);
return std::make_shared<ColumnVector>(std::move(res));
return res;
}
template <typename T>
@ -240,14 +245,18 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForData() {
PreCheckOverflow<T>(val1, val2, lower_inclusive, upper_inclusive)) {
return res;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
auto execute_sub_batch = [lower_inclusive, upper_inclusive](
const T* data,
const bool* valid_data,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
HighPrecisionType val1,
HighPrecisionType val2) {
if (lower_inclusive && upper_inclusive) {
@ -263,6 +272,16 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForData() {
BinaryRangeElementFunc<T, false, false> func;
func(val1, val2, data, size, res);
}
// BinaryRangeElementFunc performs a batch operation, so we do not split
// the data around nulls (that could hurt performance when the null
// distribution is scattered); instead, mask res with valid_data after
// the batch operation.
if (valid_data != nullptr) {
for (int i = 0; i < size; i++) {
if (!valid_data[i]) {
res[i] = valid_res[i] = false;
}
}
}
};
auto skip_index_func =
[val1, val2, lower_inclusive, upper_inclusive](
@ -282,7 +301,7 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForData() {
}
};
int64_t processed_size = ProcessDataChunks<T>(
execute_sub_batch, skip_index_func, res, val1, val2);
execute_sub_batch, skip_index_func, res, valid_res, val1, val2);
AssertInfo(processed_size == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",
@ -301,9 +320,11 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForJson() {
if (real_batch_size == 0) {
return nullptr;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
bool lower_inclusive = expr_->lower_inclusive_;
bool upper_inclusive = expr_->upper_inclusive_;
@ -313,26 +334,28 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForJson() {
auto execute_sub_batch = [lower_inclusive, upper_inclusive, pointer](
const milvus::Json* data,
const bool* valid_data,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
ValueType val1,
ValueType val2) {
if (lower_inclusive && upper_inclusive) {
BinaryRangeElementFuncForJson<ValueType, true, true> func;
func(val1, val2, pointer, data, size, res);
func(val1, val2, pointer, data, valid_data, size, res, valid_res);
} else if (lower_inclusive && !upper_inclusive) {
BinaryRangeElementFuncForJson<ValueType, true, false> func;
func(val1, val2, pointer, data, size, res);
func(val1, val2, pointer, data, valid_data, size, res, valid_res);
} else if (!lower_inclusive && upper_inclusive) {
BinaryRangeElementFuncForJson<ValueType, false, true> func;
func(val1, val2, pointer, data, size, res);
func(val1, val2, pointer, data, valid_data, size, res, valid_res);
} else {
BinaryRangeElementFuncForJson<ValueType, false, false> func;
func(val1, val2, pointer, data, size, res);
func(val1, val2, pointer, data, valid_data, size, res, valid_res);
}
};
int64_t processed_size = ProcessDataChunks<milvus::Json>(
execute_sub_batch, std::nullptr_t{}, res, val1, val2);
execute_sub_batch, std::nullptr_t{}, res, valid_res, val1, val2);
AssertInfo(processed_size == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",
@ -351,9 +374,11 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForArray() {
if (real_batch_size == 0) {
return nullptr;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
bool lower_inclusive = expr_->lower_inclusive_;
bool upper_inclusive = expr_->upper_inclusive_;
@ -366,27 +391,29 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForArray() {
auto execute_sub_batch = [lower_inclusive, upper_inclusive](
const milvus::ArrayView* data,
const bool* valid_data,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
ValueType val1,
ValueType val2,
int index) {
if (lower_inclusive && upper_inclusive) {
BinaryRangeElementFuncForArray<ValueType, true, true> func;
func(val1, val2, index, data, size, res);
func(val1, val2, index, data, valid_data, size, res, valid_res);
} else if (lower_inclusive && !upper_inclusive) {
BinaryRangeElementFuncForArray<ValueType, true, false> func;
func(val1, val2, index, data, size, res);
func(val1, val2, index, data, valid_data, size, res, valid_res);
} else if (!lower_inclusive && upper_inclusive) {
BinaryRangeElementFuncForArray<ValueType, false, true> func;
func(val1, val2, index, data, size, res);
func(val1, val2, index, data, valid_data, size, res, valid_res);
} else {
BinaryRangeElementFuncForArray<ValueType, false, false> func;
func(val1, val2, index, data, size, res);
func(val1, val2, index, data, valid_data, size, res, valid_res);
}
};
int64_t processed_size = ProcessDataChunks<milvus::ArrayView>(
execute_sub_batch, std::nullptr_t{}, res, val1, val2, index);
execute_sub_batch, std::nullptr_t{}, res, valid_res, val1, val2, index);
AssertInfo(processed_size == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",

View File

@ -54,6 +54,10 @@ struct BinaryRangeElementFunc {
#define BinaryRangeJSONCompare(cmp) \
do { \
if (valid_data != nullptr && !valid_data[i]) { \
res[i] = valid_res[i] = false; \
break; \
} \
auto x = src[i].template at<GetType>(pointer); \
if (x.error()) { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
@ -81,8 +85,10 @@ struct BinaryRangeElementFuncForJson {
ValueType val2,
const std::string& pointer,
const milvus::Json* src,
const bool* valid_data,
size_t n,
TargetBitmapView res) {
TargetBitmapView res,
TargetBitmapView valid_res) {
for (size_t i = 0; i < n; ++i) {
if constexpr (lower_inclusive && upper_inclusive) {
BinaryRangeJSONCompare(val1 <= value && value <= val2);
@ -107,9 +113,15 @@ struct BinaryRangeElementFuncForArray {
ValueType val2,
int index,
const milvus::ArrayView* src,
const bool* valid_data,
size_t n,
TargetBitmapView res) {
TargetBitmapView res,
TargetBitmapView valid_res) {
for (size_t i = 0; i < n; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
if constexpr (lower_inclusive && upper_inclusive) {
if (index >= src[i].length()) {
res[i] = false;

View File

@ -16,6 +16,7 @@
#include "CompareExpr.h"
#include "common/type_c.h"
#include <optional>
#include "query/Relational.h"
namespace milvus {
@ -58,12 +59,19 @@ PhyCompareFilterExpr::GetChunkData(FieldId field_id,
segment_->chunk_scalar_index<T>(field_id,
current_chunk_id));
}
return indexing.Reverse_Lookup(current_chunk_pos++);
auto raw = indexing.Reverse_Lookup(current_chunk_pos);
current_chunk_pos++;
if (!raw.has_value()) {
return std::nullopt;
}
return raw.value();
};
}
}
auto chunk_data =
segment_->chunk_data<T>(field_id, current_chunk_id).data();
auto chunk_valid_data =
segment_->chunk_data<T>(field_id, current_chunk_id).valid_data();
auto current_chunk_size = segment_->chunk_size(field_id, current_chunk_id);
return
[=, &current_chunk_id, &current_chunk_pos]() mutable -> const number {
@ -72,10 +80,16 @@ PhyCompareFilterExpr::GetChunkData(FieldId field_id,
current_chunk_pos = 0;
chunk_data =
segment_->chunk_data<T>(field_id, current_chunk_id).data();
chunk_valid_data =
segment_->chunk_data<T>(field_id, current_chunk_id)
.valid_data();
current_chunk_size =
segment_->chunk_size(field_id, current_chunk_id);
}
if (chunk_valid_data && !chunk_valid_data[current_chunk_pos]) {
current_chunk_pos++;
return std::nullopt;
}
return chunk_data[current_chunk_pos++];
};
}
@ -103,7 +117,12 @@ PhyCompareFilterExpr::GetChunkData<std::string>(FieldId field_id,
segment_->chunk_scalar_index<std::string>(
field_id, current_chunk_id));
}
return indexing.Reverse_Lookup(current_chunk_pos++);
auto raw = indexing.Reverse_Lookup(current_chunk_pos);
current_chunk_pos++;
if (!raw.has_value()) {
return std::nullopt;
}
return raw.value();
};
}
}
@ -114,6 +133,9 @@ PhyCompareFilterExpr::GetChunkData<std::string>(FieldId field_id,
auto chunk_data =
segment_->chunk_data<std::string>(field_id, current_chunk_id)
.data();
auto chunk_valid_data =
segment_->chunk_data<std::string>(field_id, current_chunk_id)
.valid_data();
auto current_chunk_size =
segment_->chunk_size(field_id, current_chunk_id);
return [=,
@ -126,16 +148,26 @@ PhyCompareFilterExpr::GetChunkData<std::string>(FieldId field_id,
segment_
->chunk_data<std::string>(field_id, current_chunk_id)
.data();
chunk_valid_data =
segment_
->chunk_data<std::string>(field_id, current_chunk_id)
.valid_data();
current_chunk_size =
segment_->chunk_size(field_id, current_chunk_id);
}
if (chunk_valid_data && !chunk_valid_data[current_chunk_pos]) {
current_chunk_pos++;
return std::nullopt;
}
return chunk_data[current_chunk_pos++];
};
} else {
auto chunk_data =
segment_->chunk_view<std::string_view>(field_id, current_chunk_id)
.first.data();
auto chunk_valid_data =
segment_->chunk_data<std::string_view>(field_id, current_chunk_id)
.valid_data();
auto current_chunk_size =
segment_->chunk_size(field_id, current_chunk_id);
return [=,
@ -148,9 +180,17 @@ PhyCompareFilterExpr::GetChunkData<std::string>(FieldId field_id,
->chunk_view<std::string_view>(
field_id, current_chunk_id)
.first.data();
chunk_valid_data = segment_
->chunk_data<std::string_view>(
field_id, current_chunk_id)
.valid_data();
current_chunk_size =
segment_->chunk_size(field_id, current_chunk_id);
}
if (chunk_valid_data && !chunk_valid_data[current_chunk_pos]) {
current_chunk_pos++;
return std::nullopt;
}
return std::string(chunk_data[current_chunk_pos++]);
};
@ -203,9 +243,11 @@ PhyCompareFilterExpr::ExecCompareExprDispatcher(OpType op) {
return nullptr;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
auto left = GetChunkData(expr_->left_data_type_,
expr_->left_field_id_,
@ -218,8 +260,15 @@ PhyCompareFilterExpr::ExecCompareExprDispatcher(OpType op) {
right_current_chunk_id_,
right_current_chunk_pos_);
for (int i = 0; i < real_batch_size; ++i) {
res[i] = boost::apply_visitor(
milvus::query::Relational<decltype(op)>{}, left(), right());
// each accessor call advances an internal cursor, so fetch once per row
auto left_value = left();
auto right_value = right();
if (!left_value.has_value() || !right_value.has_value()) {
res[i] = false;
valid_res[i] = false;
continue;
}
res[i] =
boost::apply_visitor(milvus::query::Relational<decltype(op)>{},
left_value.value(),
right_value.value());
}
return res_vec;
} else {
@ -228,9 +277,11 @@ PhyCompareFilterExpr::ExecCompareExprDispatcher(OpType op) {
return nullptr;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
auto left_data_barrier =
segment_->num_chunk_data(expr_->left_field_id_);
@ -255,10 +306,16 @@ PhyCompareFilterExpr::ExecCompareExprDispatcher(OpType op) {
for (int i = chunk_id == current_chunk_id_ ? current_chunk_pos_ : 0;
i < chunk_size;
++i) {
res[processed_rows++] = boost::apply_visitor(
milvus::query::Relational<decltype(op)>{},
left(i),
right(i));
if (!left(i).has_value() || !right(i).has_value()) {
res[processed_rows] = false;
valid_res[processed_rows] = false;
} else {
res[processed_rows] = boost::apply_visitor(
milvus::query::Relational<decltype(op)>{},
left(i).value(),
right(i).value());
}
processed_rows++;
if (processed_rows >= batch_size_) {
current_chunk_id_ = chunk_id;
@ -280,12 +337,23 @@ PhyCompareFilterExpr::GetChunkData(FieldId field_id,
auto& indexing = segment_->chunk_scalar_index<T>(field_id, chunk_id);
if (indexing.HasRawData()) {
return [&indexing](int i) -> const number {
return indexing.Reverse_Lookup(i);
auto raw = indexing.Reverse_Lookup(i);
if (!raw.has_value()) {
return std::nullopt;
}
return raw.value();
};
}
}
auto chunk_data = segment_->chunk_data<T>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
auto chunk_valid_data =
segment_->chunk_data<T>(field_id, chunk_id).valid_data();
return [chunk_data, chunk_valid_data](int i) -> const number {
if (chunk_valid_data && !chunk_valid_data[i]) {
return std::nullopt;
}
return chunk_data[i];
};
}
template <>
@ -297,8 +365,12 @@ PhyCompareFilterExpr::GetChunkData<std::string>(FieldId field_id,
auto& indexing =
segment_->chunk_scalar_index<std::string>(field_id, chunk_id);
if (indexing.HasRawData()) {
return [&indexing](int i) -> const std::string {
return indexing.Reverse_Lookup(i);
return [&indexing](int i) -> const number {
auto raw = indexing.Reverse_Lookup(i);
if (!raw.has_value()) {
return std::nullopt;
}
return raw.value();
};
}
}
@ -308,12 +380,23 @@ PhyCompareFilterExpr::GetChunkData<std::string>(FieldId field_id,
.growing_enable_mmap) {
auto chunk_data =
segment_->chunk_data<std::string>(field_id, chunk_id).data();
return [chunk_data](int i) -> const number { return chunk_data[i]; };
auto chunk_valid_data =
segment_->chunk_data<std::string>(field_id, chunk_id).valid_data();
return [chunk_data, chunk_valid_data](int i) -> const number {
if (chunk_valid_data && !chunk_valid_data[i]) {
return std::nullopt;
}
return chunk_data[i];
};
} else {
auto chunk_data =
segment_->chunk_view<std::string_view>(field_id, chunk_id)
.first.data();
return [chunk_data](int i) -> const number {
auto chunk_info =
segment_->chunk_view<std::string_view>(field_id, chunk_id);
auto chunk_data = chunk_info.first.data();
auto chunk_valid_data = chunk_info.second.data();
return [chunk_data, chunk_valid_data](int i) -> const number {
if (chunk_valid_data && !chunk_valid_data[i]) {
return std::nullopt;
}
return std::string(chunk_data[i]);
};
}
@ -450,9 +533,11 @@ PhyCompareFilterExpr::ExecCompareRightType() {
return nullptr;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
auto expr_type = expr_->op_type_;
auto execute_sub_batch = [expr_type](const T* left,
@ -491,15 +576,14 @@ PhyCompareFilterExpr::ExecCompareRightType() {
break;
}
default:
PanicInfo(
OpTypeInvalid,
fmt::format(
"unsupported operator type for compare column expr: {}",
expr_type));
PanicInfo(OpTypeInvalid,
fmt::format("unsupported operator type for "
"compare column expr: {}",
expr_type));
}
};
int64_t processed_size =
ProcessBothDataChunks<T, U>(execute_sub_batch, res);
ProcessBothDataChunks<T, U>(execute_sub_batch, res, valid_res);
AssertInfo(processed_size == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",

View File

@ -18,6 +18,7 @@
#include <fmt/core.h>
#include <boost/variant.hpp>
#include <optional>
#include "common/EasyAssert.h"
#include "common/Types.h"
@ -29,14 +30,17 @@
namespace milvus {
namespace exec {
using number = boost::variant<bool,
int8_t,
int16_t,
int32_t,
int64_t,
float,
double,
std::string>;
using number_type = boost::variant<bool,
int8_t,
int16_t,
int32_t,
int64_t,
float,
double,
std::string>;
using number = std::optional<number_type>;
using ChunkDataAccessor = std::function<const number(int)>;
using MultipleChunkDataAccessor = std::function<const number()>;
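A hedged sketch of the accessor shape these aliases define: number wraps the variant in std::optional, so a NULL row surfaces as std::nullopt instead of a value. std::variant stands in for boost::variant to keep the example self-contained:

#include <cstdint>
#include <functional>
#include <iostream>
#include <optional>
#include <string>
#include <variant>

using number_type_t = std::variant<bool, int64_t, double, std::string>;
using chunk_accessor_t = std::function<std::optional<number_type_t>(int)>;

int main() {
    const int64_t data[] = {7, 0, 42};
    const bool valid[] = {true, false, true};  // row 1 is NULL
    chunk_accessor_t get = [&](int i) -> std::optional<number_type_t> {
        if (!valid[i]) return std::nullopt;  // NULL row yields no value
        return number_type_t{data[i]};
    };
    for (int i = 0; i < 3; ++i)
        std::cout << "row " << i << (get(i).has_value() ? ": value\n" : ": null\n");
}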
@ -264,16 +268,19 @@ class PhyCompareFilterExpr : public Expr {
template <typename T, typename U, typename FUNC, typename... ValTypes>
int64_t
ProcessBothDataChunks(FUNC func, TargetBitmapView res, ValTypes... values) {
ProcessBothDataChunks(FUNC func,
TargetBitmapView res,
TargetBitmapView valid_res,
ValTypes... values) {
if (segment_->is_chunked()) {
return ProcessBothDataChunksForMultipleChunk<T,
U,
FUNC,
ValTypes...>(
func, res, values...);
func, res, valid_res, values...);
} else {
return ProcessBothDataChunksForSingleChunk<T, U, FUNC, ValTypes...>(
func, res, values...);
func, res, valid_res, values...);
}
}
@ -281,6 +288,7 @@ class PhyCompareFilterExpr : public Expr {
int64_t
ProcessBothDataChunksForSingleChunk(FUNC func,
TargetBitmapView res,
TargetBitmapView valid_res,
ValTypes... values) {
int64_t processed_size = 0;
@ -304,6 +312,20 @@ class PhyCompareFilterExpr : public Expr {
const T* left_data = left_chunk.data() + data_pos;
const U* right_data = right_chunk.data() + data_pos;
func(left_data, right_data, size, res + processed_size, values...);
const bool* left_valid_data = left_chunk.valid_data();
const bool* right_valid_data = right_chunk.valid_data();
// mask with valid_data
for (int i = 0; i < size; ++i) {
if (left_valid_data && !left_valid_data[i + data_pos]) {
res[processed_size + i] = false;
valid_res[processed_size + i] = false;
continue;
}
if (right_valid_data && !right_valid_data[i + data_pos]) {
res[processed_size + i] = false;
valid_res[processed_size + i] = false;
}
}
processed_size += size;
if (processed_size >= batch_size_) {
@ -320,6 +342,7 @@ class PhyCompareFilterExpr : public Expr {
int64_t
ProcessBothDataChunksForMultipleChunk(FUNC func,
TargetBitmapView res,
TargetBitmapView valid_res,
ValTypes... values) {
int64_t processed_size = 0;
@ -347,6 +370,20 @@ class PhyCompareFilterExpr : public Expr {
const T* left_data = left_chunk.data() + data_pos;
const U* right_data = right_chunk.data() + data_pos;
func(left_data, right_data, size, res + processed_size, values...);
const bool* left_valid_data = left_chunk.valid_data();
const bool* right_valid_data = right_chunk.valid_data();
// mask with valid_data
for (int i = 0; i < size; ++i) {
if (left_valid_data && !left_valid_data[i + data_pos]) {
res[processed_size + i] = false;
valid_res[processed_size + i] = false;
continue;
}
if (right_valid_data && !right_valid_data[i + data_pos]) {
res[processed_size + i] = false;
valid_res[processed_size + i] = false;
}
}
processed_size += size;
if (processed_size >= batch_size_) {

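The masking pass added to both ProcessBothDataChunks variants follows the same shape; a minimal sketch with plain arrays standing in for the chunk types, where a row that is NULL on either side becomes false and invalid:

#include <cstddef>
#include <vector>

void
MaskBothSides(const bool* left_valid,   // may be nullptr
              const bool* right_valid,  // may be nullptr
              size_t data_pos,          // offset into the current chunk
              size_t size,
              size_t processed_size,    // offset into the batch result
              std::vector<bool>& res,
              std::vector<bool>& valid_res) {
    for (size_t i = 0; i < size; ++i) {
        bool null_row = (left_valid && !left_valid[i + data_pos]) ||
                        (right_valid && !right_valid[i + data_pos]);
        if (null_row)
            res[processed_size + i] = valid_res[processed_size + i] = false;
    }
}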
View File

@ -44,22 +44,30 @@ PhyExistsFilterExpr::EvalJsonExistsForDataSegment() {
if (real_batch_size == 0) {
return nullptr;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
auto execute_sub_batch = [](const milvus::Json* data,
const bool* valid_data,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
const std::string& pointer) {
for (int i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
res[i] = data[i].exist(pointer);
}
};
int64_t processed_size = ProcessDataChunks<Json>(
execute_sub_batch, std::nullptr_t{}, res, pointer);
execute_sub_batch, std::nullptr_t{}, res, valid_res, pointer);
AssertInfo(processed_size == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",

View File

@ -16,6 +16,7 @@
#pragma once
#include <algorithm>
#include <memory>
#include <string>
@ -248,6 +249,7 @@ class SegmentExpr : public Expr {
FUNC func,
std::function<bool(const milvus::SkipIndex&, FieldId, int)> skip_func,
TargetBitmapView res,
TargetBitmapView valid_res,
ValTypes... values) {
// For sealed segment, only single chunk
Assert(num_data_chunk_ == 1);
@ -256,13 +258,16 @@ class SegmentExpr : public Expr {
auto& skip_index = segment_->GetSkipIndex();
if (!skip_func || !skip_func(skip_index, field_id_, 0)) {
auto data_vec =
segment_
->get_batch_views<T>(
field_id_, 0, current_data_chunk_pos_, need_size)
.first;
func(data_vec.data(), need_size, res, values...);
auto views_info = segment_->get_batch_views<T>(
field_id_, 0, current_data_chunk_pos_, need_size);
// first is the raw data, second is the valid data;
// use valid_data to check whether a raw value is null
func(views_info.first.data(),
views_info.second.data(),
need_size,
res,
valid_res,
values...);
}
current_data_chunk_pos_ += need_size;
return need_size;
@ -274,6 +279,7 @@ class SegmentExpr : public Expr {
FUNC func,
std::function<bool(const milvus::SkipIndex&, FieldId, int)> skip_func,
TargetBitmapView res,
TargetBitmapView valid_res,
ValTypes... values) {
int64_t processed_size = 0;
@ -281,7 +287,7 @@ class SegmentExpr : public Expr {
std::is_same_v<T, Json>) {
if (segment_->type() == SegmentType::Sealed) {
return ProcessChunkForSealedSeg<T>(
func, skip_func, res, values...);
func, skip_func, res, valid_res, values...);
}
}
@ -303,7 +309,16 @@ class SegmentExpr : public Expr {
if (!skip_func || !skip_func(skip_index, field_id_, i)) {
auto chunk = segment_->chunk_data<T>(field_id_, i);
const T* data = chunk.data() + data_pos;
func(data, size, res + processed_size, values...);
const bool* valid_data = chunk.valid_data();
if (valid_data != nullptr) {
valid_data += data_pos;
}
func(data,
valid_data,
size,
res + processed_size,
valid_res + processed_size,
values...);
}
processed_size += size;
@ -322,6 +337,7 @@ class SegmentExpr : public Expr {
FUNC func,
std::function<bool(const milvus::SkipIndex&, FieldId, int)> skip_func,
TargetBitmapView res,
TargetBitmapView valid_res,
ValTypes... values) {
int64_t processed_size = 0;
@ -356,13 +372,21 @@ class SegmentExpr : public Expr {
if constexpr (std::is_same_v<T, std::string_view> ||
std::is_same_v<T, Json>) {
if (segment_->type() == SegmentType::Sealed) {
// first is the raw data, second is the valid data;
// use valid_data to check whether a raw value is null
auto views_info = segment_->get_batch_views<T>(
field_id_, i, data_pos, size);
auto data_vec = views_info.first;
auto valid_data = views_info.second;
func(data_vec.data(),
valid_data.data(),
size,
res + processed_size,
valid_res + processed_size,
values...);
is_seal = true;
}
@ -370,7 +394,16 @@ class SegmentExpr : public Expr {
if (!is_seal) {
auto chunk = segment_->chunk_data<T>(field_id_, i);
const T* data = chunk.data() + data_pos;
func(data, size, res + processed_size, values...);
const bool* valid_data = chunk.valid_data();
if (valid_data != nullptr) {
valid_data += data_pos;
}
func(data,
valid_data,
size,
res + processed_size,
valid_res + processed_size,
values...);
}
}
@ -403,8 +436,10 @@ class SegmentExpr : public Expr {
int
ProcessIndexOneChunk(TargetBitmap& result,
TargetBitmap& valid_result,
size_t chunk_id,
const TargetBitmap& chunk_res,
const TargetBitmap& chunk_valid_res,
int processed_rows) {
auto data_pos =
chunk_id == current_index_chunk_ ? current_index_chunk_pos_ : 0;
@ -416,33 +451,41 @@ class SegmentExpr : public Expr {
// chunk_res.begin() + data_pos,
// chunk_res.begin() + data_pos + size);
result.append(chunk_res, data_pos, size);
valid_result.append(chunk_valid_res, data_pos, size);
return size;
}
template <typename T, typename FUNC, typename... ValTypes>
TargetBitmap
VectorPtr
ProcessIndexChunks(FUNC func, ValTypes... values) {
typedef std::
conditional_t<std::is_same_v<T, std::string_view>, std::string, T>
IndexInnerType;
using Index = index::ScalarIndex<IndexInnerType>;
TargetBitmap result;
TargetBitmap valid_result;
int processed_rows = 0;
for (size_t i = current_index_chunk_; i < num_index_chunk_; i++) {
// This cached result serves every batch loop. It avoids executing the
// index for every batch, since index execution costs quite a lot of time.
if (cached_index_chunk_id_ != i) {
const Index& index =
segment_->chunk_scalar_index<IndexInnerType>(field_id_, i);
auto* index_ptr = const_cast<Index*>(&index);
cached_index_chunk_res_ = std::move(func(index_ptr, values...));
auto valid_result = index_ptr->IsNotNull();
cached_index_chunk_valid_res_ = std::move(valid_result);
cached_index_chunk_id_ = i;
}
auto size = ProcessIndexOneChunk(
result, i, cached_index_chunk_res_, processed_rows);
auto size = ProcessIndexOneChunk(result,
valid_result,
i,
cached_index_chunk_res_,
cached_index_chunk_valid_res_,
processed_rows);
if (processed_rows + size >= batch_size_) {
current_index_chunk_ = i;
@ -454,23 +497,136 @@ class SegmentExpr : public Expr {
processed_rows += size;
}
return result;
return std::make_shared<ColumnVector>(std::move(result),
std::move(valid_result));
}
template <typename T>
TargetBitmap
ProcessChunksForValid(bool use_index) {
if (use_index) {
return ProcessIndexChunksForValid<T>();
} else {
return ProcessDataChunksForValid<T>();
}
}
template <typename T>
TargetBitmap
ProcessDataChunksForValid() {
TargetBitmap valid_result(batch_size_);
valid_result.set();
int64_t processed_size = 0;
for (size_t i = current_data_chunk_; i < num_data_chunk_; i++) {
auto data_pos =
(i == current_data_chunk_) ? current_data_chunk_pos_ : 0;
auto size =
(i == (num_data_chunk_ - 1))
? (segment_->type() == SegmentType::Growing
? (active_count_ % size_per_chunk_ == 0
? size_per_chunk_ - data_pos
: active_count_ % size_per_chunk_ - data_pos)
: active_count_ - data_pos)
: size_per_chunk_ - data_pos;
size = std::min(size, batch_size_ - processed_size);
auto chunk = segment_->chunk_data<T>(field_id_, i);
const bool* valid_data = chunk.valid_data();
if (valid_data == nullptr) {
return valid_result;
}
valid_data += data_pos;
for (int j = 0; j < size; j++) {
if (!valid_data[j]) {
valid_result[j + data_pos] = false;
}
}
processed_size += size;
if (processed_size >= batch_size_) {
current_data_chunk_ = i;
current_data_chunk_pos_ = data_pos + size;
break;
}
}
return valid_result;
}
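A simplified sketch of the chunk walk ProcessDataChunksForValid performs, assuming fixed-size chunks and sealed-segment sizing (the real code also handles the growing-segment tail via active_count_ % size_per_chunk_); here the batch result is indexed by processed + i:

#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<bool>
ScanValidity(const std::vector<std::vector<bool>>& chunks,
             size_t start_chunk,
             int64_t start_pos,
             int64_t batch_size) {
    std::vector<bool> valid(batch_size, true);  // default: everything valid
    int64_t processed = 0;
    for (size_t c = start_chunk; c < chunks.size() && processed < batch_size;
         ++c) {
        int64_t pos = (c == start_chunk) ? start_pos : 0;
        int64_t size = std::min<int64_t>(
            static_cast<int64_t>(chunks[c].size()) - pos,
            batch_size - processed);
        for (int64_t i = 0; i < size; ++i)
            if (!chunks[c][pos + i])
                valid[processed + i] = false;  // batch-relative position
        processed += size;
    }
    return valid;
}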
int
ProcessIndexOneChunkForValid(TargetBitmap& valid_result,
size_t chunk_id,
const TargetBitmap& chunk_valid_res,
int processed_rows) {
auto data_pos =
chunk_id == current_index_chunk_ ? current_index_chunk_pos_ : 0;
auto size = std::min(
std::min(size_per_chunk_ - data_pos, batch_size_ - processed_rows),
int64_t(chunk_valid_res.size()));
valid_result.append(chunk_valid_res, data_pos, size);
return size;
}
template <typename T>
TargetBitmap
ProcessIndexChunksForValid() {
typedef std::
conditional_t<std::is_same_v<T, std::string_view>, std::string, T>
IndexInnerType;
using Index = index::ScalarIndex<IndexInnerType>;
int processed_rows = 0;
TargetBitmap valid_result;
valid_result.set();
for (size_t i = current_index_chunk_; i < num_index_chunk_; i++) {
// This cached result serves every batch loop. It avoids executing the
// index for every batch, since index execution costs quite a lot of time.
if (cached_index_chunk_id_ != i) {
const Index& index =
segment_->chunk_scalar_index<IndexInnerType>(field_id_, i);
auto* index_ptr = const_cast<Index*>(&index);
auto execute_sub_batch = [](Index* index_ptr) {
TargetBitmap res = index_ptr->IsNotNull();
return res;
};
cached_index_chunk_valid_res_ = execute_sub_batch(index_ptr);
cached_index_chunk_id_ = i;
}
auto size = ProcessIndexOneChunkForValid(
valid_result, i, cached_index_chunk_valid_res_, processed_rows);
if (processed_rows + size >= batch_size_) {
current_index_chunk_ = i;
current_index_chunk_pos_ = i == current_index_chunk_
? current_index_chunk_pos_ + size
: size;
break;
}
processed_rows += size;
}
return valid_result;
}
template <typename FUNC, typename... ValTypes>
TargetBitmap
VectorPtr
ProcessTextMatchIndex(FUNC func, ValTypes... values) {
TargetBitmap result;
TargetBitmap valid_result;
if (cached_match_res_ == nullptr) {
auto index = segment_->GetTextIndex(field_id_);
auto res = std::move(func(index, values...));
auto valid_res = index->IsNotNull();
cached_match_res_ = std::make_shared<TargetBitmap>(std::move(res));
cached_index_chunk_valid_res_ = std::move(valid_res);
if (cached_match_res_->size() < active_count_) {
// some entities are not visible in the inverted index;
// this only happens on growing segments.
TargetBitmap tail(active_count_ - cached_match_res_->size());
cached_match_res_->append(tail);
cached_index_chunk_valid_res_.append(tail);
}
}
@ -481,9 +637,13 @@ class SegmentExpr : public Expr {
: batch_size_;
result.append(
*cached_match_res_, current_data_chunk_pos_, real_batch_size);
valid_result.append(cached_index_chunk_valid_res_,
current_data_chunk_pos_,
real_batch_size);
current_data_chunk_pos_ += real_batch_size;
return result;
return std::make_shared<ColumnVector>(std::move(result),
std::move(valid_result));
}
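The growing-segment tail padding above (rows inserted after the text index was built are invisible to it) amounts to extending both bitmaps with zero bits up to the active count; a minimal sketch:

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    const size_t active_count = 8;
    std::vector<bool> match = {true, false, true, false, true};  // index covers 5 rows
    std::vector<bool> valid = {true, true, true, true, true};

    if (match.size() < active_count) {  // pad the unseen tail with zeros
        match.resize(active_count, false);
        valid.resize(active_count, false);
    }
    std::cout << "rows=" << match.size() << "\n";
}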
template <typename T, typename FUNC, typename... ValTypes>
@ -581,6 +741,8 @@ class SegmentExpr : public Expr {
// Cache for index scan to avoid search index every batch
int64_t cached_index_chunk_id_{-1};
TargetBitmap cached_index_chunk_res_{};
// Cache for chunk valid res.
TargetBitmap cached_index_chunk_valid_res_{};
// Cache for text match.
std::shared_ptr<TargetBitmap> cached_match_res_{nullptr};

View File

@ -15,6 +15,7 @@
// limitations under the License.
#include "JsonContainsExpr.h"
#include <utility>
#include "common/Types.h"
namespace milvus {
@ -173,17 +174,21 @@ PhyJsonContainsFilterExpr::ExecArrayContains() {
AssertInfo(expr_->column_.nested_path_.size() == 0,
"[ExecArrayContains]nested path must be null");
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
std::unordered_set<GetType> elements;
for (auto const& element : expr_->vals_) {
elements.insert(GetValueFromProto<GetType>(element));
}
auto execute_sub_batch = [](const milvus::ArrayView* data,
const bool* valid_data,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
const std::unordered_set<GetType>& elements) {
auto executor = [&](size_t i) {
const auto& array = data[i];
@ -195,12 +200,16 @@ PhyJsonContainsFilterExpr::ExecArrayContains() {
return false;
};
for (int i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
res[i] = executor(i);
}
};
int64_t processed_size = ProcessDataChunks<milvus::ArrayView>(
execute_sub_batch, std::nullptr_t{}, res, elements);
execute_sub_batch, std::nullptr_t{}, res, valid_res, elements);
AssertInfo(processed_size == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",
@ -221,9 +230,11 @@ PhyJsonContainsFilterExpr::ExecJsonContains() {
return nullptr;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
std::unordered_set<GetType> elements;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
@ -231,8 +242,10 @@ PhyJsonContainsFilterExpr::ExecJsonContains() {
elements.insert(GetValueFromProto<GetType>(element));
}
auto execute_sub_batch = [](const milvus::Json* data,
const bool* valid_data,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
const std::string& pointer,
const std::unordered_set<GetType>& elements) {
auto executor = [&](size_t i) {
@ -253,12 +266,16 @@ PhyJsonContainsFilterExpr::ExecJsonContains() {
return false;
};
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
res[i] = executor(i);
}
};
int64_t processed_size = ProcessDataChunks<Json>(
execute_sub_batch, std::nullptr_t{}, res, pointer, elements);
execute_sub_batch, std::nullptr_t{}, res, valid_res, pointer, elements);
AssertInfo(processed_size == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",
@ -274,9 +291,11 @@ PhyJsonContainsFilterExpr::ExecJsonContainsArray() {
return nullptr;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
std::vector<proto::plan::Array> elements;
@ -285,8 +304,10 @@ PhyJsonContainsFilterExpr::ExecJsonContainsArray() {
}
auto execute_sub_batch =
[](const milvus::Json* data,
const bool* valid_data,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
const std::string& pointer,
const std::vector<proto::plan::Array>& elements) {
auto executor = [&](size_t i) -> bool {
@ -316,12 +337,16 @@ PhyJsonContainsFilterExpr::ExecJsonContainsArray() {
return false;
};
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
res[i] = executor(i);
}
};
int64_t processed_size = ProcessDataChunks<milvus::Json>(
execute_sub_batch, std::nullptr_t{}, res, pointer, elements);
execute_sub_batch, std::nullptr_t{}, res, valid_res, pointer, elements);
AssertInfo(processed_size == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",
@ -344,9 +369,11 @@ PhyJsonContainsFilterExpr::ExecArrayContainsAll() {
return nullptr;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
std::unordered_set<GetType> elements;
for (auto const& element : expr_->vals_) {
@ -354,8 +381,10 @@ PhyJsonContainsFilterExpr::ExecArrayContainsAll() {
}
auto execute_sub_batch = [](const milvus::ArrayView* data,
const bool* valid_data,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
const std::unordered_set<GetType>& elements) {
auto executor = [&](size_t i) {
std::unordered_set<GetType> tmp_elements(elements);
@ -369,12 +398,16 @@ PhyJsonContainsFilterExpr::ExecArrayContainsAll() {
return tmp_elements.size() == 0;
};
for (int i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
res[i] = executor(i);
}
};
int64_t processed_size = ProcessDataChunks<milvus::ArrayView>(
execute_sub_batch, std::nullptr_t{}, res, elements);
execute_sub_batch, std::nullptr_t{}, res, valid_res, elements);
AssertInfo(processed_size == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",
@ -395,9 +428,11 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAll() {
return nullptr;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
std::unordered_set<GetType> elements;
@ -406,8 +441,10 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAll() {
}
auto execute_sub_batch = [](const milvus::Json* data,
const bool* valid_data,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
const std::string& pointer,
const std::unordered_set<GetType>& elements) {
auto executor = [&](const size_t i) -> bool {
@ -431,12 +468,16 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAll() {
return tmp_elements.size() == 0;
};
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
res[i] = executor(i);
}
};
int64_t processed_size = ProcessDataChunks<Json>(
execute_sub_batch, std::nullptr_t{}, res, pointer, elements);
execute_sub_batch, std::nullptr_t{}, res, valid_res, pointer, elements);
AssertInfo(processed_size == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",
@ -451,9 +492,11 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffType() {
if (real_batch_size == 0) {
return nullptr;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
@ -467,8 +510,10 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffType() {
auto execute_sub_batch =
[](const milvus::Json* data,
const bool* valid_data,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
const std::string& pointer,
const std::vector<proto::plan::GenericValue>& elements,
const std::unordered_set<int> elements_index) {
@ -553,6 +598,10 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffType() {
return tmp_elements_index.size() == 0;
};
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
res[i] = executor(i);
}
};
@ -560,6 +609,7 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffType() {
int64_t processed_size = ProcessDataChunks<Json>(execute_sub_batch,
std::nullptr_t{},
res,
valid_res,
pointer,
elements,
elements_index);
@ -578,9 +628,11 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllArray() {
return nullptr;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
@ -590,8 +642,10 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllArray() {
}
auto execute_sub_batch =
[](const milvus::Json* data,
const bool* valid_data,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
const std::string& pointer,
const std::vector<proto::plan::Array>& elements) {
auto executor = [&](const size_t i) {
@ -625,12 +679,16 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllArray() {
return exist_elements_index.size() == elements.size();
};
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
res[i] = executor(i);
}
};
int64_t processed_size = ProcessDataChunks<Json>(
execute_sub_batch, std::nullptr_t{}, res, pointer, elements);
execute_sub_batch, std::nullptr_t{}, res, valid_res, pointer, elements);
AssertInfo(processed_size == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",
@ -646,9 +704,11 @@ PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffType() {
return nullptr;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
@ -662,8 +722,10 @@ PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffType() {
auto execute_sub_batch =
[](const milvus::Json* data,
const bool* valid_data,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
const std::string& pointer,
const std::vector<proto::plan::GenericValue>& elements) {
auto executor = [&](const size_t i) {
@ -739,12 +801,16 @@ PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffType() {
return false;
};
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
res[i] = executor(i);
}
};
int64_t processed_size = ProcessDataChunks<Json>(
execute_sub_batch, std::nullptr_t{}, res, pointer, elements);
execute_sub_batch, std::nullptr_t{}, res, valid_res, pointer, elements);
AssertInfo(processed_size == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",
@ -832,12 +898,12 @@ PhyJsonContainsFilterExpr::ExecArrayContainsForIndexSegmentImpl() {
}
};
auto res = ProcessIndexChunks<GetType>(execute_sub_batch, elems);
AssertInfo(res.size() == real_batch_size,
AssertInfo(res->size() == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",
res.size(),
res->size(),
real_batch_size);
return std::make_shared<ColumnVector>(std::move(res));
return res;
}
} //namespace exec


@ -45,6 +45,10 @@ PhyLogicalBinaryExpr::Eval(EvalCtx& context, VectorPtr& result) {
"unsupported logical operator: {}",
expr_->GetOpTypeString());
}
TargetBitmapView lvalid_view(lflat->GetValidRawData(), size);
TargetBitmapView rvalid_view(rflat->GetValidRawData(), size);
LogicalElementFunc<LogicalOpType::Or> func;
func(lvalid_view, rvalid_view, size);
result = std::move(left);
}
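Note: as a hedged illustration of why the two valid bitmaps are merged with OR above: under the `op(Null) is false` convention, a null operand already contributes a definite false to the result bitmap, so the combined row can stay valid when either side had a value. A simplified standalone model (plain struct, not the Milvus vector types):

#include <cassert>

// value: what the row evaluated to (null rows evaluate to false);
// valid: whether at least one side had a definite value.
struct Row {
    bool value;
    bool valid;
};

// AND under "op(Null) is false": values AND together, validity merges with
// OR, mirroring the LogicalElementFunc<Or> applied to the valid views above.
Row And(Row l, Row r) {
    return Row{l.value && r.value, l.valid || r.valid};
}

int main() {
    Row null_row{false, false};  // a null operand already carries false
    Row true_row{true, true};
    Row out = And(true_row, null_row);
    assert(out.value == false);  // true AND null -> false
    assert(out.valid == true);   // one side was definite, row stays valid
    return 0;
}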


@ -30,6 +30,9 @@ PhyLogicalUnaryExpr::Eval(EvalCtx& context, VectorPtr& result) {
auto flat_vec = GetColumnVector(result);
TargetBitmapView data(flat_vec->GetRawData(), flat_vec->size());
data.flip();
TargetBitmapView valid_data(flat_vec->GetValidRawData(),
flat_vec->size());
data &= valid_data;
}
}
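Note: a standalone sketch of the NOT-with-null rule above, with std::vector<bool> as a stand-in for the bitmap types: flipping alone would turn null rows (which evaluated to false) into hits, so the flipped result must be ANDed with the valid bitmap.

#include <cassert>
#include <cstddef>
#include <vector>

// res[i] is the filter result; valid[i] is false for null rows. After NOT,
// null rows must stay false, so the flipped value is masked with valid.
std::vector<bool> LogicalNot(std::vector<bool> res,
                             const std::vector<bool>& valid) {
    for (std::size_t i = 0; i < res.size(); ++i) {
        res[i] = !res[i] && valid[i];  // flip, then force nulls back to false
    }
    return res;
}

int main() {
    std::vector<bool> res{true, false, false};  // row 2 is null, so false
    std::vector<bool> valid{true, true, false};
    auto out = LogicalNot(res, valid);
    assert(!out[0] && out[1] && !out[2]);  // NOT(null) is still false
    return 0;
}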


@ -15,6 +15,8 @@
// limitations under the License.
#include "TermExpr.h"
#include <memory>
#include <utility>
#include "query/Utils.h"
namespace milvus {
namespace exec {
@ -199,9 +201,12 @@ PhyTermFilterExpr::ExecPkTermImpl() {
return nullptr;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
// pk valid_bitmap is always all true
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
for (size_t i = 0; i < real_batch_size; ++i) {
res[i] = cached_bits_[current_data_chunk_pos_++];
@ -241,17 +246,21 @@ PhyTermFilterExpr::ExecTermArrayVariableInField() {
return nullptr;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
AssertInfo(expr_->vals_.size() == 1,
"element length in json array must be one");
ValueType target_val = GetValueFromProto<ValueType>(expr_->vals_[0]);
auto execute_sub_batch = [](const ArrayView* data,
const bool* valid_data,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
const ValueType& target_val) {
auto executor = [&](size_t i) {
for (int i = 0; i < data[i].length(); i++) {
@ -263,12 +272,16 @@ PhyTermFilterExpr::ExecTermArrayVariableInField() {
return false;
};
for (int i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
res[i] = executor(i);
}
};
int64_t processed_size = ProcessDataChunks<milvus::ArrayView>(
execute_sub_batch, std::nullptr_t{}, res, target_val);
execute_sub_batch, std::nullptr_t{}, res, valid_res, target_val);
AssertInfo(processed_size == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",
@ -289,9 +302,11 @@ PhyTermFilterExpr::ExecTermArrayFieldInVariable() {
return nullptr;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
int index = -1;
if (expr_->column_.nested_path_.size() > 0) {
@ -309,12 +324,18 @@ PhyTermFilterExpr::ExecTermArrayFieldInVariable() {
}
auto execute_sub_batch = [](const ArrayView* data,
const bool* valid_data,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
int index,
const std::unordered_set<ValueType>& term_set) {
for (int i = 0; i < size; ++i) {
if (index >= data[i].length()) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
if (term_set.empty() || index >= data[i].length()) {
res[i] = false;
continue;
}
@ -324,7 +345,7 @@ PhyTermFilterExpr::ExecTermArrayFieldInVariable() {
};
int64_t processed_size = ProcessDataChunks<milvus::ArrayView>(
execute_sub_batch, std::nullptr_t{}, res, index, term_set);
execute_sub_batch, std::nullptr_t{}, res, valid_res, index, term_set);
AssertInfo(processed_size == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",
@ -344,9 +365,11 @@ PhyTermFilterExpr::ExecTermJsonVariableInField() {
return nullptr;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
AssertInfo(expr_->vals_.size() == 1,
"element length in json array must be one");
@ -354,8 +377,10 @@ PhyTermFilterExpr::ExecTermJsonVariableInField() {
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
auto execute_sub_batch = [](const Json* data,
const bool* valid_data,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
const std::string pointer,
const ValueType& target_val) {
auto executor = [&](size_t i) {
@ -375,11 +400,15 @@ PhyTermFilterExpr::ExecTermJsonVariableInField() {
return false;
};
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
res[i] = executor(i);
}
};
int64_t processed_size = ProcessDataChunks<milvus::Json>(
execute_sub_batch, std::nullptr_t{}, res, pointer, val);
execute_sub_batch, std::nullptr_t{}, res, valid_res, pointer, val);
AssertInfo(processed_size == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",
@ -399,9 +428,11 @@ PhyTermFilterExpr::ExecTermJsonFieldInVariable() {
return nullptr;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
std::unordered_set<ValueType> term_set;
@ -416,8 +447,10 @@ PhyTermFilterExpr::ExecTermJsonFieldInVariable() {
}
auto execute_sub_batch = [](const Json* data,
const bool* valid_data,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
const std::string pointer,
const std::unordered_set<ValueType>& terms) {
auto executor = [&](size_t i) {
@ -439,11 +472,19 @@ PhyTermFilterExpr::ExecTermJsonFieldInVariable() {
return terms.find(ValueType(x.value())) != terms.end();
};
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
if (terms.empty()) {
res[i] = false;
continue;
}
res[i] = executor(i);
}
};
int64_t processed_size = ProcessDataChunks<milvus::Json>(
execute_sub_batch, std::nullptr_t{}, res, pointer, term_set);
execute_sub_batch, std::nullptr_t{}, res, valid_res, pointer, term_set);
AssertInfo(processed_size == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",
@ -489,12 +530,12 @@ PhyTermFilterExpr::ExecVisitorImplForIndex() {
return func(index_ptr, vals.size(), vals.data());
};
auto res = ProcessIndexChunks<T>(execute_sub_batch, vals);
AssertInfo(res.size() == real_batch_size,
AssertInfo(res->size() == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",
res.size(),
res->size(),
real_batch_size);
return std::make_shared<ColumnVector>(std::move(res));
return res;
}
template <>
@ -516,7 +557,7 @@ PhyTermFilterExpr::ExecVisitorImplForIndex<bool>() {
return std::move(func(index_ptr, vals.size(), (bool*)vals.data()));
};
auto res = ProcessIndexChunks<bool>(execute_sub_batch, vals);
return std::make_shared<ColumnVector>(std::move(res));
return res;
}
template <typename T>
@ -527,9 +568,11 @@ PhyTermFilterExpr::ExecVisitorImplForData() {
return nullptr;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
std::vector<T> vals;
for (auto& val : expr_->vals_) {
@ -542,16 +585,22 @@ PhyTermFilterExpr::ExecVisitorImplForData() {
}
std::unordered_set<T> vals_set(vals.begin(), vals.end());
auto execute_sub_batch = [](const T* data,
const bool* valid_data,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
const std::unordered_set<T>& vals) {
TermElementFuncSet<T> func;
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
res[i] = func(vals, data[i]);
}
};
int64_t processed_size = ProcessDataChunks<T>(
execute_sub_batch, std::nullptr_t{}, res, vals_set);
execute_sub_batch, std::nullptr_t{}, res, valid_res, vals_set);
AssertInfo(processed_size == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",


@ -15,6 +15,7 @@
// limitations under the License.
#include "UnaryExpr.h"
#include <optional>
#include "common/Json.h"
namespace milvus {
@ -260,9 +261,11 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplArray() {
if (real_batch_size == 0) {
return nullptr;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
ValueType val = GetValueFromProto<ValueType>(expr_->val_);
auto op_type = expr_->op_type_;
@ -271,48 +274,50 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplArray() {
index = std::stoi(expr_->column_.nested_path_[0]);
}
auto execute_sub_batch = [op_type](const milvus::ArrayView* data,
const bool* valid_data,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
ValueType val,
int index) {
switch (op_type) {
case proto::plan::GreaterThan: {
UnaryElementFuncForArray<ValueType, proto::plan::GreaterThan>
func;
func(data, size, val, index, res);
func(data, valid_data, size, val, index, res, valid_res);
break;
}
case proto::plan::GreaterEqual: {
UnaryElementFuncForArray<ValueType, proto::plan::GreaterEqual>
func;
func(data, size, val, index, res);
func(data, valid_data, size, val, index, res, valid_res);
break;
}
case proto::plan::LessThan: {
UnaryElementFuncForArray<ValueType, proto::plan::LessThan> func;
func(data, size, val, index, res);
func(data, valid_data, size, val, index, res, valid_res);
break;
}
case proto::plan::LessEqual: {
UnaryElementFuncForArray<ValueType, proto::plan::LessEqual>
func;
func(data, size, val, index, res);
func(data, valid_data, size, val, index, res, valid_res);
break;
}
case proto::plan::Equal: {
UnaryElementFuncForArray<ValueType, proto::plan::Equal> func;
func(data, size, val, index, res);
func(data, valid_data, size, val, index, res, valid_res);
break;
}
case proto::plan::NotEqual: {
UnaryElementFuncForArray<ValueType, proto::plan::NotEqual> func;
func(data, size, val, index, res);
func(data, valid_data, size, val, index, res, valid_res);
break;
}
case proto::plan::PrefixMatch: {
UnaryElementFuncForArray<ValueType, proto::plan::PrefixMatch>
func;
func(data, size, val, index, res);
func(data, valid_data, size, val, index, res, valid_res);
break;
}
default:
@ -323,7 +328,7 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplArray() {
}
};
int64_t processed_size = ProcessDataChunks<milvus::ArrayView>(
execute_sub_batch, std::nullptr_t{}, res, val, index);
execute_sub_batch, std::nullptr_t{}, res, valid_res, val, index);
AssertInfo(processed_size == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",
@ -432,14 +437,14 @@ PhyUnaryRangeFilterExpr::ExecArrayEqualForIndex(bool reverse) {
}
return res;
});
AssertInfo(batch_res.size() == real_batch_size,
AssertInfo(batch_res->size() == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",
batch_res.size(),
batch_res->size(),
real_batch_size);
// return the result.
return std::make_shared<ColumnVector>(std::move(batch_res));
return batch_res;
}
template <typename ExprValueType>
@ -455,9 +460,11 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJson() {
}
ExprValueType val = GetValueFromProto<ExprValueType>(expr_->val_);
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
auto op_type = expr_->op_type_;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
@ -492,12 +499,18 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJson() {
} while (false)
auto execute_sub_batch = [op_type, pointer](const milvus::Json* data,
const bool* valid_data,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
ExprValueType val) {
switch (op_type) {
case proto::plan::GreaterThan: {
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
if constexpr (std::is_same_v<GetType, proto::plan::Array>) {
res[i] = false;
} else {
@ -508,6 +521,10 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJson() {
}
case proto::plan::GreaterEqual: {
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
if constexpr (std::is_same_v<GetType, proto::plan::Array>) {
res[i] = false;
} else {
@ -518,6 +535,10 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJson() {
}
case proto::plan::LessThan: {
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
if constexpr (std::is_same_v<GetType, proto::plan::Array>) {
res[i] = false;
} else {
@ -528,6 +549,10 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJson() {
}
case proto::plan::LessEqual: {
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
if constexpr (std::is_same_v<GetType, proto::plan::Array>) {
res[i] = false;
} else {
@ -538,6 +563,10 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJson() {
}
case proto::plan::Equal: {
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
if constexpr (std::is_same_v<GetType, proto::plan::Array>) {
auto doc = data[i].doc();
auto array = doc.at_pointer(pointer).get_array();
@ -554,6 +583,10 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJson() {
}
case proto::plan::NotEqual: {
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
if constexpr (std::is_same_v<GetType, proto::plan::Array>) {
auto doc = data[i].doc();
auto array = doc.at_pointer(pointer).get_array();
@ -570,6 +603,10 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJson() {
}
case proto::plan::PrefixMatch: {
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
if constexpr (std::is_same_v<GetType, proto::plan::Array>) {
res[i] = false;
} else {
@ -584,6 +621,10 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJson() {
auto regex_pattern = translator(val);
RegexMatcher matcher(regex_pattern);
for (size_t i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
if constexpr (std::is_same_v<GetType, proto::plan::Array>) {
res[i] = false;
} else {
@ -601,7 +642,7 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJson() {
}
};
int64_t processed_size = ProcessDataChunks<milvus::Json>(
execute_sub_batch, std::nullptr_t{}, res, val);
execute_sub_batch, std::nullptr_t{}, res, valid_res, val);
AssertInfo(processed_size == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",
@ -693,12 +734,12 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplForIndex() {
};
auto val = GetValueFromProto<IndexInnerType>(expr_->val_);
auto res = ProcessIndexChunks<T>(execute_sub_batch, val);
AssertInfo(res.size() == real_batch_size,
AssertInfo(res->size() == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}",
res.size(),
res->size(),
real_batch_size);
return std::make_shared<ColumnVector>(std::move(res));
return res;
}
template <typename T>
@ -720,10 +761,11 @@ PhyUnaryRangeFilterExpr::PreCheckOverflow() {
switch (expr_->op_type_) {
case proto::plan::GreaterThan:
case proto::plan::GreaterEqual: {
auto valid_res = ProcessChunksForValid<T>(CanUseIndex<T>());
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(batch_size));
cached_overflow_res_ = res_vec;
TargetBitmap(batch_size), std::move(valid_res));
TargetBitmapView res(res_vec->GetRawData(), batch_size);
cached_overflow_res_ = res_vec;
if (milvus::query::lt_lb<T>(val)) {
res.set();
@ -733,10 +775,11 @@ PhyUnaryRangeFilterExpr::PreCheckOverflow() {
}
case proto::plan::LessThan:
case proto::plan::LessEqual: {
auto valid_res = ProcessChunksForValid<T>(CanUseIndex<T>());
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(batch_size));
cached_overflow_res_ = res_vec;
TargetBitmap(batch_size), std::move(valid_res));
TargetBitmapView res(res_vec->GetRawData(), batch_size);
cached_overflow_res_ = res_vec;
if (milvus::query::gt_ub<T>(val)) {
res.set();
@ -745,19 +788,21 @@ PhyUnaryRangeFilterExpr::PreCheckOverflow() {
return res_vec;
}
case proto::plan::Equal: {
auto valid_res = ProcessChunksForValid<T>(CanUseIndex<T>());
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(batch_size));
cached_overflow_res_ = res_vec;
TargetBitmap(batch_size), std::move(valid_res));
TargetBitmapView res(res_vec->GetRawData(), batch_size);
cached_overflow_res_ = res_vec;
res.reset();
return res_vec;
}
case proto::plan::NotEqual: {
auto valid_res = ProcessChunksForValid<T>(CanUseIndex<T>());
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(batch_size));
cached_overflow_res_ = res_vec;
TargetBitmap(batch_size), std::move(valid_res));
TargetBitmapView res(res_vec->GetRawData(), batch_size);
cached_overflow_res_ = res_vec;
res.set();
return res_vec;
@ -788,13 +833,17 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplForData() {
return nullptr;
}
IndexInnerType val = GetValueFromProto<IndexInnerType>(expr_->val_);
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size));
auto res_vec = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size), TargetBitmap(real_batch_size));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
valid_res.set();
auto expr_type = expr_->op_type_;
auto execute_sub_batch = [expr_type](const T* data,
const bool* valid_data,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
IndexInnerType val) {
switch (expr_type) {
case proto::plan::GreaterThan: {
@ -843,6 +892,16 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplForData() {
fmt::format("unsupported operator type for unary expr: {}",
expr_type));
}
// the element functions above operate on the whole batch, so do not split
// the data around nulls (that may hurt performance when the null
// distribution is scattered); instead, mask res with valid_data after the
// batch operation.
if (valid_data != nullptr) {
for (int i = 0; i < size; i++) {
if (!valid_data[i]) {
res[i] = valid_res[i] = false;
}
}
}
};
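Note: a self-contained sketch of the batch-then-mask pattern the comment above describes, with plain vectors in place of the SIMD bitmap types and a single comparison standing in for the element functions.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// First pass compares the whole batch (a tight, vectorizable loop);
// second pass masks null rows to false in both result and valid bitmaps.
void GreaterThanBatch(const std::int64_t* data, const bool* valid,
                      std::size_t size, std::int64_t val,
                      std::vector<bool>& res, std::vector<bool>& valid_res) {
    for (std::size_t i = 0; i < size; ++i) {
        res[i] = data[i] > val;  // batch compare, no per-row null branch
    }
    if (valid != nullptr) {
        for (std::size_t i = 0; i < size; ++i) {
            if (!valid[i]) {
                res[i] = valid_res[i] = false;  // op(Null) is false
            }
        }
    }
}

int main() {
    const std::int64_t data[] = {5, 10, 7};
    const bool valid[] = {true, false, true};  // row 1 is null
    std::vector<bool> res(3), valid_res(3, true);
    GreaterThanBatch(data, valid, 3, 6, res, valid_res);
    assert(!res[0] && !res[1] && res[2] && !valid_res[1]);
    return 0;
}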
auto skip_index_func = [expr_type, val](const SkipIndex& skip_index,
FieldId field_id,
@ -850,8 +909,8 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplForData() {
return skip_index.CanSkipUnaryRange<T>(
field_id, chunk_id, expr_type, val);
};
int64_t processed_size =
ProcessDataChunks<T>(execute_sub_batch, skip_index_func, res, val);
int64_t processed_size = ProcessDataChunks<T>(
execute_sub_batch, skip_index_func, res, valid_res, val);
AssertInfo(processed_size == real_batch_size,
"internal error: expr processed rows {} not equal "
"expect batch size {}, related params[active_count:{}, "
@ -881,7 +940,7 @@ PhyUnaryRangeFilterExpr::ExecTextMatch() {
return index->MatchQuery(query);
};
auto res = ProcessTextMatchIndex(func, query);
return std::make_shared<ColumnVector>(std::move(res));
return res;
};
} // namespace exec


@ -148,11 +148,17 @@ struct UnaryElementFuncForArray {
ValueType>;
void
operator()(const ArrayView* src,
const bool* valid_data,
size_t size,
ValueType val,
int index,
TargetBitmapView res) {
TargetBitmapView res,
TargetBitmapView valid_res) {
for (int i = 0; i < size; ++i) {
if (valid_data != nullptr && !valid_data[i]) {
res[i] = valid_res[i] = false;
continue;
}
if constexpr (op == proto::plan::OpType::Equal) {
if constexpr (std::is_same_v<GetType, proto::plan::Array>) {
res[i] = src[i].is_same_array(val);
@ -224,7 +230,11 @@ struct UnaryIndexFuncForMatch {
RegexMatcher matcher(regex_pattern);
for (int64_t i = 0; i < cnt; i++) {
auto raw = index->Reverse_Lookup(i);
res[i] = matcher(raw);
if (!raw.has_value()) {
res[i] = false;
continue;
}
res[i] = matcher(raw.value());
}
return res;
}


@ -68,6 +68,7 @@ PhyFilterBitsNode::GetOutput() {
operator_context_->get_exec_context(), exprs_.get(), input_.get());
TargetBitmap bitset;
TargetBitmap valid_bitset;
while (num_processed_rows_ < need_process_rows_) {
exprs_->Eval(0, 1, true, eval_ctx, results_);
@ -79,13 +80,17 @@ PhyFilterBitsNode::GetOutput() {
auto col_vec_size = col_vec->size();
TargetBitmapView view(col_vec->GetRawData(), col_vec_size);
bitset.append(view);
TargetBitmapView valid_view(col_vec->GetValidRawData(), col_vec_size);
valid_bitset.append(valid_view);
num_processed_rows_ += col_vec_size;
}
bitset.flip();
Assert(bitset.size() == need_process_rows_);
Assert(valid_bitset.size() == need_process_rows_);
// num_processed_rows_ = need_process_rows_;
std::vector<VectorPtr> col_res;
col_res.push_back(std::make_shared<ColumnVector>(std::move(bitset)));
col_res.push_back(std::make_shared<ColumnVector>(std::move(bitset),
std::move(valid_bitset)));
std::chrono::high_resolution_clock::time_point scalar_end =
std::chrono::high_resolution_clock::now();
double scalar_cost =
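Note: a minimal model of the accumulation in GetOutput above, assuming std::vector<bool> in place of TargetBitmap/TargetBitmapView: per-batch result and valid views are appended, then only the match bitmap is inverted; the valid bitmap describes nullness and is never flipped with it.

#include <cassert>
#include <vector>

int main() {
    std::vector<bool> bitset, valid_bitset;
    const std::vector<bool> batch_res{true, false};
    const std::vector<bool> batch_valid{true, false};  // row 1 is null
    bitset.insert(bitset.end(), batch_res.begin(), batch_res.end());
    valid_bitset.insert(valid_bitset.end(), batch_valid.begin(),
                        batch_valid.end());
    bitset.flip();  // invert matches only, as GetOutput does above
    assert(bitset[0] == false && bitset[1] == true);
    assert(valid_bitset[0] == true && valid_bitset[1] == false);
    return 0;
}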


@ -51,13 +51,15 @@ PhyMvccNode::GetOutput() {
is_finished_ = true;
return nullptr;
}
auto col_input =
is_source_node_
? std::make_shared<ColumnVector>(TargetBitmap(active_count_))
: GetColumnVector(input_);
// the first bitmap is the filtering result and the second is the valid bitmap;
// valid_bitset[i] == false means result[i] is null
auto col_input = is_source_node_ ? std::make_shared<ColumnVector>(
TargetBitmap(active_count_),
TargetBitmap(active_count_))
: GetColumnVector(input_);
TargetBitmapView data(col_input->GetRawData(), col_input->size());
// need to expose null?
segment_->mask_with_timestamps(data, query_timestamp_);
segment_->mask_with_delete(data, active_count_, query_timestamp_);
is_finished_ = true;


@ -100,7 +100,9 @@ class SealedDataGetter : public DataGetter<T> {
}
return field_data_->operator[](idx);
} else {
return (*field_index_).Reverse_Lookup(idx);
auto raw = (*field_index_).Reverse_Lookup(idx);
AssertInfo(raw.has_value(), "field data not found");
return raw.value();
}
}
};


@ -80,7 +80,7 @@ BitmapIndex<T>::Build(const Config& config) {
template <typename T>
void
BitmapIndex<T>::Build(size_t n, const T* data) {
BitmapIndex<T>::Build(size_t n, const T* data, const bool* valid_data) {
if (is_built_) {
return;
}
@ -89,12 +89,14 @@ BitmapIndex<T>::Build(size_t n, const T* data) {
}
total_num_rows_ = n;
valid_bitset = TargetBitmap(total_num_rows_, false);
valid_bitset_ = TargetBitmap(total_num_rows_, false);
T* p = const_cast<T*>(data);
for (int i = 0; i < n; ++i, ++p) {
data_[*p].add(i);
valid_bitset.set(i);
if (valid_data == nullptr || valid_data[i]) {
data_[*p].add(i);
valid_bitset_.set(i);
}
}
if (data_.size() < DEFAULT_BITMAP_INDEX_BUILD_MODE_BOUND) {
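Note: a minimal model of the build rule above, with std::map/std::set as stand-ins for the roaring bitmap containers: null rows are excluded from both the value-to-postings map and the valid bitset.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <map>
#include <set>
#include <vector>

int main() {
    const std::int64_t values[] = {7, 7, 9};
    const bool valid[] = {true, false, true};  // row 1 is null
    std::map<std::int64_t, std::set<std::size_t>> postings;
    std::vector<bool> valid_bitset(3, false);
    for (std::size_t i = 0; i < 3; ++i) {
        if (valid[i]) {  // skip null rows entirely
            postings[values[i]].insert(i);
            valid_bitset[i] = true;
        }
    }
    assert(postings[7].count(0) == 1 && postings[7].count(1) == 0);
    assert(!valid_bitset[1] && valid_bitset[2]);
    return 0;
}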
@ -120,7 +122,7 @@ BitmapIndex<T>::BuildPrimitiveField(
if (data->is_valid(i)) {
auto val = reinterpret_cast<const T*>(data->RawValue(i));
data_[*val].add(offset);
valid_bitset.set(offset);
valid_bitset_.set(offset);
}
offset++;
}
@ -139,7 +141,7 @@ BitmapIndex<T>::BuildWithFieldData(
PanicInfo(DataIsEmpty, "scalar bitmap index can not build null values");
}
total_num_rows_ = total_num_rows;
valid_bitset = TargetBitmap(total_num_rows_, false);
valid_bitset_ = TargetBitmap(total_num_rows_, false);
switch (schema_.data_type()) {
case proto::schema::DataType::Bool:
@ -184,7 +186,7 @@ BitmapIndex<T>::BuildArrayField(const std::vector<FieldDataPtr>& field_datas) {
auto val = array->template get_data<T>(j);
data_[val].add(offset);
}
valid_bitset.set(offset);
valid_bitset_.set(offset);
}
offset++;
}
@ -359,7 +361,7 @@ BitmapIndex<T>::DeserializeIndexData(const uint8_t* data_ptr,
data_[key] = value;
}
for (const auto& v : value) {
valid_bitset.set(v);
valid_bitset_.set(v);
}
}
}
@ -422,7 +424,7 @@ BitmapIndex<std::string>::DeserializeIndexData(const uint8_t* data_ptr,
data_[key] = value;
}
for (const auto& v : value) {
valid_bitset.set(v);
valid_bitset_.set(v);
}
}
}
@ -516,7 +518,7 @@ BitmapIndex<T>::LoadWithoutAssemble(const BinarySet& binary_set,
index_meta_buffer->size);
auto index_length = index_meta.first;
total_num_rows_ = index_meta.second;
valid_bitset = TargetBitmap(total_num_rows_, false);
valid_bitset_ = TargetBitmap(total_num_rows_, false);
auto index_data_buffer = binary_set.GetByName(BITMAP_INDEX_DATA);
@ -645,7 +647,7 @@ BitmapIndex<T>::NotIn(const size_t n, const T* values) {
}
}
// NotIn(null) and In(null) are both false, so mask with the IsNotNull bitset
res &= valid_bitset;
res &= valid_bitset_;
return res;
} else {
TargetBitmap res(total_num_rows_, false);
@ -657,7 +659,7 @@ BitmapIndex<T>::NotIn(const size_t n, const T* values) {
}
res.flip();
// NotIn(null) and In(null) are both false, so mask with the IsNotNull bitset
res &= valid_bitset;
res &= valid_bitset_;
return res;
}
}
@ -667,7 +669,7 @@ const TargetBitmap
BitmapIndex<T>::IsNull() {
AssertInfo(is_built_, "index has not been built");
TargetBitmap res(total_num_rows_, true);
res &= valid_bitset;
res &= valid_bitset_;
res.flip();
return res;
}
@ -677,7 +679,7 @@ const TargetBitmap
BitmapIndex<T>::IsNotNull() {
AssertInfo(is_built_, "index has not been built");
TargetBitmap res(total_num_rows_, true);
res &= valid_bitset;
res &= valid_bitset_;
return res;
}
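Note: a minimal model of how the pre-built valid bitset above serves IsNull/IsNotNull, assuming std::vector<bool> in place of TargetBitmap: IsNotNull is a single AND, IsNull is the same plus a flip, with no per-row null checks.

#include <cassert>
#include <cstddef>
#include <vector>

std::vector<bool> IsNotNull(const std::vector<bool>& valid_bitset) {
    std::vector<bool> res(valid_bitset.size(), true);
    for (std::size_t i = 0; i < res.size(); ++i) {
        res[i] = res[i] && valid_bitset[i];  // res &= valid_bitset_
    }
    return res;
}

std::vector<bool> IsNull(const std::vector<bool>& valid_bitset) {
    auto res = IsNotNull(valid_bitset);
    res.flip();  // null rows are exactly the non-valid rows
    return res;
}

int main() {
    const std::vector<bool> valid{true, false, true};
    assert(!IsNull(valid)[0] && IsNull(valid)[1]);
    assert(IsNotNull(valid)[0] && !IsNotNull(valid)[1]);
    return 0;
}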
@ -1086,11 +1088,15 @@ BitmapIndex<T>::Reverse_Lookup_InCache(size_t idx) const {
}
template <typename T>
T
std::optional<T>
BitmapIndex<T>::Reverse_Lookup(size_t idx) const {
AssertInfo(is_built_, "index has not been built");
AssertInfo(idx < total_num_rows_, "out of range of total count");
if (!valid_bitset_[idx]) {
return std::nullopt;
}
if (use_offset_cache_) {
return Reverse_Lookup_InCache(idx);
}
@ -1125,6 +1131,7 @@ BitmapIndex<T>::Reverse_Lookup(size_t idx) const {
fmt::format(
"scalar bitmap index can not lookup target value of index {}",
idx));
return std::nullopt;
}
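Note: callers elsewhere in this change follow the same handling of the new optional return; a hypothetical stub (not the actual ScalarIndex API) to illustrate the caller-side pattern:

#include <cassert>
#include <cstddef>
#include <optional>
#include <string>

// Stand-in for an index whose Reverse_Lookup now returns std::nullopt
// for null rows instead of assuming every offset maps back to a value.
std::optional<std::string> ReverseLookupStub(std::size_t idx) {
    if (idx == 1) {
        return std::nullopt;  // pretend row 1 is null
    }
    return "value_" + std::to_string(idx);
}

int main() {
    for (std::size_t i = 0; i < 3; ++i) {
        auto raw = ReverseLookupStub(i);
        if (!raw.has_value()) {
            continue;  // skip null rows, as the callers in this change do
        }
        assert(!raw.value().empty());
    }
    return 0;
}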
template <typename T>


@ -77,7 +77,7 @@ class BitmapIndex : public ScalarIndex<T> {
}
void
Build(size_t n, const T* values) override;
Build(size_t n, const T* values, const bool* valid_data = nullptr) override;
void
Build(const Config& config = {}) override;
@ -106,7 +106,7 @@ class BitmapIndex : public ScalarIndex<T> {
T upper_bound_value,
bool ub_inclusive) override;
T
std::optional<T>
Reverse_Lookup(size_t offset) const override;
int64_t
@ -267,7 +267,7 @@ class BitmapIndex : public ScalarIndex<T> {
std::shared_ptr<storage::MemFileManagerImpl> file_manager_;
// generate valid_bitset to speed up NotIn and IsNull and IsNotNull operate
TargetBitmap valid_bitset;
TargetBitmap valid_bitset_;
};
} // namespace index


@ -67,10 +67,12 @@ class HybridScalarIndex : public ScalarIndex<T> {
}
void
Build(size_t n, const T* values) override {
Build(size_t n,
const T* values,
const bool* valid_data = nullptr) override {
SelectIndexBuildType(n, values);
auto index = GetInternalIndex();
index->Build(n, values);
index->Build(n, values, valid_data);
is_built_ = true;
}
@ -133,7 +135,7 @@ class HybridScalarIndex : public ScalarIndex<T> {
lower_bound_value, lb_inclusive, upper_bound_value, ub_inclusive);
}
T
std::optional<T>
Reverse_Lookup(size_t offset) const override {
return internal_index_->Reverse_Lookup(offset);
}


@ -94,7 +94,7 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
* deprecated, only used in small chunk index.
*/
void
Build(size_t n, const T* values) override {
Build(size_t n, const T* values, const bool* valid_data) override {
PanicInfo(ErrorCode::NotImplemented, "Build should not be called");
}
@ -136,7 +136,7 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
return false;
}
T
std::optional<T>
Reverse_Lookup(size_t offset) const override {
PanicInfo(ErrorCode::NotImplemented,
"Reverse_Lookup should not be handled by inverted index");


@ -80,7 +80,7 @@ class ScalarIndex : public IndexBase {
GetIndexType() const = 0;
virtual void
Build(size_t n, const T* values) = 0;
Build(size_t n, const T* values, const bool* valid_data = nullptr) = 0;
virtual const TargetBitmap
In(size_t n, const T* values) = 0;
@ -117,7 +117,7 @@ class ScalarIndex : public IndexBase {
T upper_bound_value,
bool ub_inclusive) = 0;
virtual T
virtual std::optional<T>
Reverse_Lookup(size_t offset) const = 0;
virtual const TargetBitmap


@ -16,6 +16,7 @@
#include <algorithm>
#include <memory>
#include <optional>
#include <utility>
#include <pb/schema.pb.h>
#include <vector>
@ -61,7 +62,7 @@ ScalarIndexSort<T>::Build(const Config& config) {
template <typename T>
void
ScalarIndexSort<T>::Build(size_t n, const T* values) {
ScalarIndexSort<T>::Build(size_t n, const T* values, const bool* valid_data) {
if (is_built_)
return;
if (n == 0) {
@ -69,13 +70,17 @@ ScalarIndexSort<T>::Build(size_t n, const T* values) {
}
data_.reserve(n);
total_num_rows_ = n;
valid_bitset = TargetBitmap(total_num_rows_, false);
valid_bitset_ = TargetBitmap(total_num_rows_, false);
idx_to_offsets_.resize(n);
T* p = const_cast<T*>(values);
for (size_t i = 0; i < n; ++i) {
data_.emplace_back(IndexStructure(*p++, i));
valid_bitset.set(i);
for (size_t i = 0; i < n; ++i, ++p) {
if (!valid_data || valid_data[i]) {
data_.emplace_back(IndexStructure(*p, i));
valid_bitset_.set(i);
}
}
std::sort(data_.begin(), data_.end());
for (size_t i = 0; i < data_.size(); ++i) {
idx_to_offsets_[data_[i].idx_] = i;
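Note: a compact model of the sorted-index build above: only valid rows become (value, original index) entries, so null rows never enter the sorted data at all.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Simplified stand-in for IndexStructure<T>.
struct Entry {
    std::int64_t value;
    std::size_t idx;
    bool operator<(const Entry& other) const { return value < other.value; }
};

int main() {
    const std::int64_t values[] = {30, 10, 20};
    const bool valid[] = {true, false, true};  // row 1 is null and skipped
    std::vector<Entry> data;
    for (std::size_t i = 0; i < 3; ++i) {
        if (valid[i]) {
            data.push_back({values[i], i});
        }
    }
    std::sort(data.begin(), data.end());
    assert(data.size() == 2 && data[0].value == 20 && data[1].value == 30);
    return 0;
}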
@ -97,7 +102,7 @@ ScalarIndexSort<T>::BuildWithFieldData(
}
data_.reserve(length);
valid_bitset = TargetBitmap(total_num_rows_, false);
valid_bitset_ = TargetBitmap(total_num_rows_, false);
int64_t offset = 0;
for (const auto& data : field_datas) {
auto slice_num = data->get_num_rows();
@ -105,7 +110,7 @@ ScalarIndexSort<T>::BuildWithFieldData(
if (data->is_valid(i)) {
auto value = reinterpret_cast<const T*>(data->RawValue(i));
data_.emplace_back(IndexStructure(*value, offset));
valid_bitset.set(offset);
valid_bitset_.set(offset);
}
offset++;
}
@ -175,11 +180,11 @@ ScalarIndexSort<T>::LoadWithoutAssemble(const BinarySet& index_binary,
index_num_rows->data.get(),
(size_t)index_num_rows->size);
idx_to_offsets_.resize(total_num_rows_);
valid_bitset = TargetBitmap(total_num_rows_, false);
valid_bitset_ = TargetBitmap(total_num_rows_, false);
memcpy(data_.data(), index_data->data.get(), (size_t)index_data->size);
for (size_t i = 0; i < data_.size(); ++i) {
idx_to_offsets_[data_[i].idx_] = i;
valid_bitset.set(data_[i].idx_);
valid_bitset_.set(data_[i].idx_);
}
is_built_ = true;
@ -256,7 +261,7 @@ ScalarIndexSort<T>::NotIn(const size_t n, const T* values) {
}
}
// NotIn(null) and In(null) are both false, so mask with the IsNotNull bitset
bitset &= valid_bitset;
bitset &= valid_bitset_;
return bitset;
}
@ -265,7 +270,7 @@ const TargetBitmap
ScalarIndexSort<T>::IsNull() {
AssertInfo(is_built_, "index has not been built");
TargetBitmap bitset(total_num_rows_, true);
bitset &= valid_bitset;
bitset &= valid_bitset_;
bitset.flip();
return bitset;
}
@ -275,7 +280,7 @@ const TargetBitmap
ScalarIndexSort<T>::IsNotNull() {
AssertInfo(is_built_, "index has not been built");
TargetBitmap bitset(total_num_rows_, true);
bitset &= valid_bitset;
bitset &= valid_bitset_;
return bitset;
}
@ -355,11 +360,14 @@ ScalarIndexSort<T>::Range(T lower_bound_value,
}
template <typename T>
T
std::optional<T>
ScalarIndexSort<T>::Reverse_Lookup(size_t idx) const {
AssertInfo(idx < idx_to_offsets_.size(), "out of range of total count");
AssertInfo(is_built_, "index has not been built");
if (!valid_bitset_[idx]) {
return std::nullopt;
}
auto offset = idx_to_offsets_[idx];
return data_[offset].a_;
}


@ -56,7 +56,7 @@ class ScalarIndexSort : public ScalarIndex<T> {
}
void
Build(size_t n, const T* values) override;
Build(size_t n, const T* values, const bool* valid_data = nullptr) override;
void
Build(const Config& config = {}) override;
@ -82,7 +82,7 @@ class ScalarIndexSort : public ScalarIndex<T> {
T upper_bound_value,
bool ub_inclusive) override;
T
std::optional<T>
Reverse_Lookup(size_t offset) const override;
int64_t
@ -127,8 +127,8 @@ class ScalarIndexSort : public ScalarIndex<T> {
std::vector<IndexStructure<T>> data_;
std::shared_ptr<storage::MemFileManagerImpl> file_manager_;
size_t total_num_rows_{0};
// generate valid_bitset to speed up NotIn and IsNull and IsNotNull operate
TargetBitmap valid_bitset;
// generate valid_bitset_ to speed up NotIn and IsNull and IsNotNull operate
TargetBitmap valid_bitset_;
};
template <typename T>


@ -19,6 +19,7 @@
#include <boost/uuid/uuid_generators.hpp>
#include <cstring>
#include <memory>
#include <optional>
#include <stdlib.h>
#include <stdio.h>
#include <fcntl.h>
@ -118,7 +119,9 @@ StringIndexMarisa::BuildWithFieldData(
}
void
StringIndexMarisa::Build(size_t n, const std::string* values) {
StringIndexMarisa::Build(size_t n,
const std::string* values,
const bool* valid_data) {
if (built_) {
PanicInfo(IndexAlreadyBuild, "index has been built");
}
@ -127,12 +130,14 @@ StringIndexMarisa::Build(size_t n, const std::string* values) {
{
// fill key set.
for (size_t i = 0; i < n; i++) {
keyset.push_back(values[i].c_str());
if (valid_data == nullptr || valid_data[i]) {
keyset.push_back(values[i].c_str());
}
}
}
trie_.build(keyset, MARISA_LABEL_ORDER);
fill_str_ids(n, values);
fill_str_ids(n, values, valid_data);
fill_offsets();
built_ = true;
@ -213,7 +218,7 @@ StringIndexMarisa::LoadWithoutAssemble(const BinarySet& set,
auto str_ids = set.GetByName(MARISA_STR_IDS);
auto str_ids_len = str_ids->size;
str_ids_.resize(str_ids_len / sizeof(size_t));
str_ids_.resize(str_ids_len / sizeof(size_t), MARISA_NULL_KEY_ID);
memcpy(str_ids_.data(), str_ids->data.get(), str_ids_len);
fill_offsets();
@ -491,9 +496,14 @@ StringIndexMarisa::PrefixMatch(std::string_view prefix) {
}
void
StringIndexMarisa::fill_str_ids(size_t n, const std::string* values) {
str_ids_.resize(n);
StringIndexMarisa::fill_str_ids(size_t n,
const std::string* values,
const bool* valid_data) {
str_ids_.resize(n, MARISA_NULL_KEY_ID);
for (size_t i = 0; i < n; i++) {
if (valid_data != nullptr && !valid_data[i]) {
continue;
}
auto str = values[i];
auto str_id = lookup(str);
AssertInfo(valid_str_id(str_id), "invalid marisa key");
@ -534,11 +544,13 @@ StringIndexMarisa::prefix_match(const std::string_view prefix) {
}
return ret;
}
std::string
std::optional<std::string>
StringIndexMarisa::Reverse_Lookup(size_t offset) const {
AssertInfo(offset < str_ids_.size(), "out of range of total count");
marisa::Agent agent;
if (str_ids_[offset] < 0) {
return std::nullopt;
}
agent.set_query(str_ids_[offset]);
trie_.reverse_lookup(agent);
return std::string(agent.key().ptr(), agent.key().length());
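Note: a sketch of the sentinel scheme above, with kNullKeyId as an illustrative stand-in for MARISA_NULL_KEY_ID: str_ids_ is now signed so a negative id can mark null rows, and lookups on them come back empty instead of querying the trie with a bogus key.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <optional>
#include <vector>

constexpr std::int64_t kNullKeyId = -1;  // stand-in for MARISA_NULL_KEY_ID

std::optional<std::size_t> TrieKeyFor(const std::vector<std::int64_t>& str_ids,
                                      std::size_t offset) {
    if (str_ids[offset] < 0) {
        return std::nullopt;  // null row, no trie key was ever assigned
    }
    return static_cast<std::size_t>(str_ids[offset]);
}

int main() {
    const std::vector<std::int64_t> str_ids{0, kNullKeyId, 2};
    assert(!TrieKeyFor(str_ids, 1).has_value());
    assert(TrieKeyFor(str_ids, 2).value() == 2u);
    return 0;
}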


@ -55,7 +55,9 @@ class StringIndexMarisa : public StringIndex {
}
void
Build(size_t n, const std::string* values) override;
Build(size_t n,
const std::string* values,
const bool* valid_data = nullptr) override;
void
Build(const Config& config = {}) override;
@ -87,7 +89,7 @@ class StringIndexMarisa : public StringIndex {
const TargetBitmap
PrefixMatch(const std::string_view prefix) override;
std::string
std::optional<std::string>
Reverse_Lookup(size_t offset) const override;
BinarySet
@ -100,7 +102,7 @@ class StringIndexMarisa : public StringIndex {
private:
void
fill_str_ids(size_t n, const std::string* values);
fill_str_ids(size_t n, const std::string* values, const bool* valid_data);
void
fill_offsets();
@ -122,7 +124,7 @@ class StringIndexMarisa : public StringIndex {
private:
Config config_;
marisa::Trie trie_;
std::vector<size_t> str_ids_; // used to retrieve.
std::vector<int64_t> str_ids_; // used to retrieve.
std::map<size_t, std::vector<size_t>> str_ids_to_offsets_;
bool built_ = false;
std::shared_ptr<storage::MemFileManagerImpl> file_manager_;


@ -26,7 +26,7 @@ template <typename T>
inline index::ScalarIndexPtr<T>
generate_scalar_index(Span<T> data) {
auto indexing = std::make_unique<index::ScalarIndexSort<T>>();
indexing->Build(data.row_count(), data.data());
indexing->Build(data.row_count(), data.data(), data.valid_data());
return indexing;
}
@ -34,7 +34,7 @@ template <>
inline index::ScalarIndexPtr<std::string>
generate_scalar_index(Span<std::string> data) {
auto indexing = index::CreateStringIndexSort();
indexing->Build(data.row_count(), data.data());
indexing->Build(data.row_count(), data.data(), data.valid_data());
return indexing;
}


@ -196,8 +196,9 @@ ChunkedSegmentSealedImpl::LoadScalarIndex(const LoadIndexInfo& info) {
if (!is_sorted_by_pk_ && insert_record_.empty_pks() &&
int64_index->HasRawData()) {
for (int i = 0; i < row_count; ++i) {
insert_record_.insert_pk(int64_index->Reverse_Lookup(i),
i);
auto raw = int64_index->Reverse_Lookup(i);
AssertInfo(raw.has_value(), "pk not found");
insert_record_.insert_pk(raw.value(), i);
}
insert_record_.seal_pks();
}
@ -210,8 +211,9 @@ ChunkedSegmentSealedImpl::LoadScalarIndex(const LoadIndexInfo& info) {
if (!is_sorted_by_pk_ && insert_record_.empty_pks() &&
string_index->HasRawData()) {
for (int i = 0; i < row_count; ++i) {
insert_record_.insert_pk(
string_index->Reverse_Lookup(i), i);
auto raw = string_index->Reverse_Lookup(i);
AssertInfo(raw.has_value(), "pk not found");
insert_record_.insert_pk(raw.value(), i);
}
insert_record_.seal_pks();
}
@ -1630,7 +1632,11 @@ ChunkedSegmentSealedImpl::CreateTextIndex(FieldId field_id) {
"converted to string index");
auto n = impl->Size();
for (size_t i = 0; i < n; i++) {
index->AddText(impl->Reverse_Lookup(i), i);
auto raw = impl->Reverse_Lookup(i);
if (!raw.has_value()) {
continue;
}
index->AddText(raw.value(), i);
}
}
}


@ -299,6 +299,7 @@ ScalarFieldIndexing<T>::BuildIndexRange(int64_t ack_beg,
for (int chunk_id = ack_beg; chunk_id < ack_end; chunk_id++) {
auto chunk_data = source->get_chunk_data(chunk_id);
// build index for chunk
// valid_data is not passed here yet
// TODO
if constexpr (std::is_same_v<T, std::string>) {
auto indexing = index::CreateStringIndexSort();


@ -198,8 +198,9 @@ SegmentSealedImpl::LoadScalarIndex(const LoadIndexInfo& info) {
if (!is_sorted_by_pk_ && insert_record_.empty_pks() &&
int64_index->HasRawData()) {
for (int i = 0; i < row_count; ++i) {
insert_record_.insert_pk(int64_index->Reverse_Lookup(i),
i);
auto raw = int64_index->Reverse_Lookup(i);
AssertInfo(raw.has_value(), "Primary key not found");
insert_record_.insert_pk(raw.value(), i);
}
insert_record_.seal_pks();
}
@ -212,8 +213,9 @@ SegmentSealedImpl::LoadScalarIndex(const LoadIndexInfo& info) {
if (!is_sorted_by_pk_ && insert_record_.empty_pks() &&
string_index->HasRawData()) {
for (int i = 0; i < row_count; ++i) {
insert_record_.insert_pk(
string_index->Reverse_Lookup(i), i);
auto raw = string_index->Reverse_Lookup(i);
AssertInfo(raw.has_value(), "Primary key not found");
insert_record_.insert_pk(raw.value(), i);
}
insert_record_.seal_pks();
}
@ -2108,7 +2110,11 @@ SegmentSealedImpl::CreateTextIndex(FieldId field_id) {
"converted to string index");
auto n = impl->Size();
for (size_t i = 0; i < n; i++) {
index->AddText(impl->Reverse_Lookup(i), i);
auto raw = impl->Reverse_Lookup(i);
if (!raw.has_value()) {
continue;
}
index->AddText(raw.value(), i);
}
}
}

View File

@ -683,6 +683,11 @@ ReverseDataFromIndex(const index::IndexBase* index,
data_array->set_field_id(field_meta.get_id().get());
data_array->set_type(static_cast<milvus::proto::schema::DataType>(
field_meta.get_data_type()));
auto nullable = field_meta.is_nullable();
std::vector<bool> valid_data;
if (nullable) {
valid_data.resize(count);
}
auto scalar_array = data_array->mutable_scalars();
switch (data_type) {
@ -691,7 +696,16 @@ ReverseDataFromIndex(const index::IndexBase* index,
auto ptr = dynamic_cast<const IndexType*>(index);
std::vector<bool> raw_data(count);
for (int64_t i = 0; i < count; ++i) {
raw_data[i] = ptr->Reverse_Lookup(seg_offsets[i]);
auto raw = ptr->Reverse_Lookup(seg_offsets[i]);
// a missing value implies the field is nullable; no need to re-check nullable here
if (!raw.has_value()) {
valid_data[i] = false;
continue;
}
if (nullable) {
valid_data[i] = true;
}
raw_data[i] = raw.value();
}
auto obj = scalar_array->mutable_bool_data();
*(obj->mutable_data()) = {raw_data.begin(), raw_data.end()};
@ -702,7 +716,16 @@ ReverseDataFromIndex(const index::IndexBase* index,
auto ptr = dynamic_cast<const IndexType*>(index);
std::vector<int8_t> raw_data(count);
for (int64_t i = 0; i < count; ++i) {
raw_data[i] = ptr->Reverse_Lookup(seg_offsets[i]);
auto raw = ptr->Reverse_Lookup(seg_offsets[i]);
// a missing value implies the field is nullable; no need to re-check nullable here
if (!raw.has_value()) {
valid_data[i] = false;
continue;
}
if (nullable) {
valid_data[i] = true;
}
raw_data[i] = raw.value();
}
auto obj = scalar_array->mutable_int_data();
*(obj->mutable_data()) = {raw_data.begin(), raw_data.end()};
@ -713,7 +736,16 @@ ReverseDataFromIndex(const index::IndexBase* index,
auto ptr = dynamic_cast<const IndexType*>(index);
std::vector<int16_t> raw_data(count);
for (int64_t i = 0; i < count; ++i) {
raw_data[i] = ptr->Reverse_Lookup(seg_offsets[i]);
auto raw = ptr->Reverse_Lookup(seg_offsets[i]);
// a missing value implies the field is nullable; no need to re-check nullable here
if (!raw.has_value()) {
valid_data[i] = false;
continue;
}
if (nullable) {
valid_data[i] = true;
}
raw_data[i] = raw.value();
}
auto obj = scalar_array->mutable_int_data();
*(obj->mutable_data()) = {raw_data.begin(), raw_data.end()};
@ -724,7 +756,16 @@ ReverseDataFromIndex(const index::IndexBase* index,
auto ptr = dynamic_cast<const IndexType*>(index);
std::vector<int32_t> raw_data(count);
for (int64_t i = 0; i < count; ++i) {
raw_data[i] = ptr->Reverse_Lookup(seg_offsets[i]);
auto raw = ptr->Reverse_Lookup(seg_offsets[i]);
// a missing value implies the field is nullable; no need to re-check nullable here
if (!raw.has_value()) {
valid_data[i] = false;
continue;
}
if (nullable) {
valid_data[i] = true;
}
raw_data[i] = raw.value();
}
auto obj = scalar_array->mutable_int_data();
*(obj->mutable_data()) = {raw_data.begin(), raw_data.end()};
@ -735,7 +776,16 @@ ReverseDataFromIndex(const index::IndexBase* index,
auto ptr = dynamic_cast<const IndexType*>(index);
std::vector<int64_t> raw_data(count);
for (int64_t i = 0; i < count; ++i) {
raw_data[i] = ptr->Reverse_Lookup(seg_offsets[i]);
auto raw = ptr->Reverse_Lookup(seg_offsets[i]);
// a missing value implies the field is nullable; no need to re-check nullable here
if (!raw.has_value()) {
valid_data[i] = false;
continue;
}
if (nullable) {
valid_data[i] = true;
}
raw_data[i] = raw.value();
}
auto obj = scalar_array->mutable_long_data();
*(obj->mutable_data()) = {raw_data.begin(), raw_data.end()};
@ -746,7 +796,16 @@ ReverseDataFromIndex(const index::IndexBase* index,
auto ptr = dynamic_cast<const IndexType*>(index);
std::vector<float> raw_data(count);
for (int64_t i = 0; i < count; ++i) {
raw_data[i] = ptr->Reverse_Lookup(seg_offsets[i]);
auto raw = ptr->Reverse_Lookup(seg_offsets[i]);
// a missing value implies the field is nullable; no need to re-check nullable here
if (!raw.has_value()) {
valid_data[i] = false;
continue;
}
if (nullable) {
valid_data[i] = true;
}
raw_data[i] = raw.value();
}
auto obj = scalar_array->mutable_float_data();
*(obj->mutable_data()) = {raw_data.begin(), raw_data.end()};
@ -757,7 +816,16 @@ ReverseDataFromIndex(const index::IndexBase* index,
auto ptr = dynamic_cast<const IndexType*>(index);
std::vector<double> raw_data(count);
for (int64_t i = 0; i < count; ++i) {
raw_data[i] = ptr->Reverse_Lookup(seg_offsets[i]);
auto raw = ptr->Reverse_Lookup(seg_offsets[i]);
// a missing value implies the field is nullable; no need to re-check nullable here
if (!raw.has_value()) {
valid_data[i] = false;
continue;
}
if (nullable) {
valid_data[i] = true;
}
raw_data[i] = raw.value();
}
auto obj = scalar_array->mutable_double_data();
*(obj->mutable_data()) = {raw_data.begin(), raw_data.end()};
@ -768,7 +836,16 @@ ReverseDataFromIndex(const index::IndexBase* index,
auto ptr = dynamic_cast<const IndexType*>(index);
std::vector<std::string> raw_data(count);
for (int64_t i = 0; i < count; ++i) {
raw_data[i] = ptr->Reverse_Lookup(seg_offsets[i]);
auto raw = ptr->Reverse_Lookup(seg_offsets[i]);
// a missing value implies the field is nullable; no need to re-check nullable here
if (!raw.has_value()) {
valid_data[i] = false;
continue;
}
if (nullable) {
valid_data[i] = true;
}
raw_data[i] = raw.value();
}
auto obj = scalar_array->mutable_string_data();
*(obj->mutable_data()) = {raw_data.begin(), raw_data.end()};
@ -780,6 +857,11 @@ ReverseDataFromIndex(const index::IndexBase* index,
}
}
if (nullable) {
*(data_array->mutable_valid_data()) = {valid_data.begin(),
valid_data.end()};
}
return data_array;
}
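Note: each type branch above repeats the same pull-through-optional pattern; a condensed, self-contained version for int64 (names are illustrative, not the segcore API):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <optional>
#include <vector>

// Pull each value through the optional lookup, recording validity
// alongside the raw data for nullable fields.
void FillColumn(const std::vector<std::optional<std::int64_t>>& lookups,
                bool nullable,
                std::vector<std::int64_t>& raw_data,
                std::vector<bool>& valid_data) {
    for (std::size_t i = 0; i < lookups.size(); ++i) {
        if (!lookups[i].has_value()) {  // only possible for nullable fields
            valid_data[i] = false;
            continue;
        }
        if (nullable) {
            valid_data[i] = true;
        }
        raw_data[i] = lookups[i].value();
    }
}

int main() {
    const std::vector<std::optional<std::int64_t>> lookups{1, std::nullopt, 3};
    std::vector<std::int64_t> raw(3, 0);
    std::vector<bool> valid(3, false);
    FillColumn(lookups, /*nullable=*/true, raw, valid);
    assert(valid[0] && !valid[1] && valid[2] && raw[2] == 3);
    return 0;
}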

File diff suppressed because it is too large


@ -166,7 +166,8 @@ GenAlwaysFalseExpr(const FieldMeta& fvec_meta, const FieldMeta& str_meta) {
}
auto
GenAlwaysTrueExpr(const FieldMeta& fvec_meta, const FieldMeta& str_meta) {
GenAlwaysTrueExprIfValid(const FieldMeta& fvec_meta,
const FieldMeta& str_meta) {
auto always_false_expr = GenAlwaysFalseExpr(fvec_meta, str_meta);
auto not_expr = GenNotExpr();
not_expr->set_allocated_child(always_false_expr);
@ -196,7 +197,7 @@ GenAlwaysFalsePlan(const FieldMeta& fvec_meta, const FieldMeta& str_meta) {
auto
GenAlwaysTruePlan(const FieldMeta& fvec_meta, const FieldMeta& str_meta) {
auto always_true_expr = GenAlwaysTrueExpr(fvec_meta, str_meta);
auto always_true_expr = GenAlwaysTrueExprIfValid(fvec_meta, str_meta);
proto::plan::VectorType vector_type;
if (fvec_meta.get_data_type() == DataType::VECTOR_FLOAT) {
vector_type = proto::plan::VectorType::FloatVector;
@ -299,6 +300,82 @@ TEST(StringExpr, Term) {
}
}
TEST(StringExpr, TermNullable) {
auto schema = std::make_shared<Schema>();
schema->AddDebugField("str", DataType::VARCHAR, true);
schema->AddDebugField("another_str", DataType::VARCHAR);
schema->AddDebugField(
"fvec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2);
auto pk = schema->AddDebugField("int64", DataType::INT64);
schema->set_primary_field_id(pk);
const auto& fvec_meta = schema->operator[](FieldName("fvec"));
const auto& str_meta = schema->operator[](FieldName("str"));
auto vec_2k_3k = []() -> std::vector<std::string> {
std::vector<std::string> ret;
for (int i = 2000; i < 3000; i++) {
ret.push_back(std::to_string(i));
}
return ret;
}();
std::map<int, std::vector<std::string>> terms = {
{0, {"2000", "3000"}},
{1, {"2000"}},
{2, {"3000"}},
{3, {}},
{4, {vec_2k_3k}},
};
auto seg = CreateGrowingSegment(schema, empty_index_meta);
int N = 1000;
std::vector<std::string> str_col;
FixedVector<bool> valid_data;
int num_iters = 100;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter);
auto new_str_col = raw_data.get_col(str_meta.get_id());
auto begin = FIELD_DATA(new_str_col, string).begin();
auto end = FIELD_DATA(new_str_col, string).end();
str_col.insert(str_col.end(), begin, end);
auto new_str_valid_col = raw_data.get_col_valid(str_meta.get_id());
valid_data.insert(valid_data.end(),
new_str_valid_col.begin(),
new_str_valid_col.end());
seg->PreInsert(N);
seg->Insert(iter * N,
N,
raw_data.row_ids_.data(),
raw_data.timestamps_.data(),
raw_data.raw_);
}
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
for (const auto& [_, term] : terms) {
auto plan_proto = GenTermPlan(fvec_meta, str_meta, term);
auto plan = ProtoParser(*schema).CreatePlan(*plan_proto);
query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
BitsetType final;
final = ExecuteQueryExpr(
plan->plan_node_->plannodes_->sources()[0]->sources()[0],
seg_promote,
N * num_iters,
MAX_TIMESTAMP);
EXPECT_EQ(final.size(), N * num_iters);
for (int i = 0; i < N * num_iters; ++i) {
auto ans = final[i];
if (!valid_data[i]) {
ASSERT_EQ(ans, false);
continue;
}
auto val = str_col[i];
auto ref = std::find(term.begin(), term.end(), val) != term.end();
ASSERT_EQ(ans, ref) << "@" << i << "!!" << val;
}
}
}
TEST(StringExpr, Compare) {
auto schema = GenTestSchema();
const auto& fvec_meta = schema->operator[](FieldName("fvec"));
@ -395,6 +472,7 @@ TEST(StringExpr, Compare) {
for (const auto& [op, ref_func] : testcases) {
auto plan_proto = gen_compare_plan(op);
auto plan = ProtoParser(*schema).CreatePlan(*plan_proto);
query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
BitsetType final;
final = ExecuteQueryExpr(
plan->plan_node_->plannodes_->sources()[0]->sources()[0],
@ -414,6 +492,269 @@ TEST(StringExpr, Compare) {
}
}
TEST(StringExpr, CompareNullable) {
auto schema = std::make_shared<Schema>();
schema->AddDebugField("str", DataType::VARCHAR, true);
schema->AddDebugField("another_str", DataType::VARCHAR);
schema->AddDebugField(
"fvec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2);
auto pk = schema->AddDebugField("int64", DataType::INT64);
schema->set_primary_field_id(pk);
const auto& fvec_meta = schema->operator[](FieldName("fvec"));
const auto& str_meta = schema->operator[](FieldName("str"));
const auto& another_str_meta = schema->operator[](FieldName("another_str"));
auto gen_compare_plan =
[&, fvec_meta, str_meta, another_str_meta](
proto::plan::OpType op) -> std::unique_ptr<proto::plan::PlanNode> {
auto str_col_info =
test::GenColumnInfo(str_meta.get_id().get(),
proto::schema::DataType::VarChar,
false,
false);
auto another_str_col_info =
test::GenColumnInfo(another_str_meta.get_id().get(),
proto::schema::DataType::VarChar,
false,
false);
auto compare_expr = GenCompareExpr(op);
compare_expr->set_allocated_left_column_info(str_col_info);
compare_expr->set_allocated_right_column_info(another_str_col_info);
auto expr = test::GenExpr().release();
expr->set_allocated_compare_expr(compare_expr);
proto::plan::VectorType vector_type;
if (fvec_meta.get_data_type() == DataType::VECTOR_FLOAT) {
vector_type = proto::plan::VectorType::FloatVector;
} else if (fvec_meta.get_data_type() == DataType::VECTOR_BINARY) {
vector_type = proto::plan::VectorType::BinaryVector;
} else if (fvec_meta.get_data_type() == DataType::VECTOR_FLOAT16) {
vector_type = proto::plan::VectorType::Float16Vector;
}
auto anns = GenAnns(expr, vector_type, fvec_meta.get_id().get(), "$0");
auto plan_node = std::make_unique<proto::plan::PlanNode>();
plan_node->set_allocated_vector_anns(anns);
return plan_node;
};
std::vector<std::tuple<proto::plan::OpType,
std::function<bool(std::string&, std::string&)>>>
testcases{
{proto::plan::OpType::GreaterThan,
[](std::string& v1, std::string& v2) { return v1 > v2; }},
{proto::plan::OpType::GreaterEqual,
[](std::string& v1, std::string& v2) { return v1 >= v2; }},
{proto::plan::OpType::LessThan,
[](std::string& v1, std::string& v2) { return v1 < v2; }},
{proto::plan::OpType::LessEqual,
[](std::string& v1, std::string& v2) { return v1 <= v2; }},
{proto::plan::OpType::Equal,
[](std::string& v1, std::string& v2) { return v1 == v2; }},
{proto::plan::OpType::NotEqual,
[](std::string& v1, std::string& v2) { return v1 != v2; }},
{proto::plan::OpType::PrefixMatch,
[](std::string& v1, std::string& v2) {
return PrefixMatch(v1, v2);
}},
};
auto seg = CreateGrowingSegment(schema, empty_index_meta);
int N = 1000;
std::vector<std::string> str_col;
std::vector<std::string> another_str_col;
FixedVector<bool> valid_data;
int num_iters = 100;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter);
auto reserve_col = [&, raw_data](const FieldMeta& field_meta,
std::vector<std::string>& str_col) {
auto new_str_col = raw_data.get_col(field_meta.get_id());
auto begin = FIELD_DATA(new_str_col, string).begin();
auto end = FIELD_DATA(new_str_col, string).end();
str_col.insert(str_col.end(), begin, end);
};
auto new_str_valid_col = raw_data.get_col_valid(str_meta.get_id());
valid_data.insert(valid_data.end(),
new_str_valid_col.begin(),
new_str_valid_col.end());
reserve_col(str_meta, str_col);
reserve_col(another_str_meta, another_str_col);
{
seg->PreInsert(N);
seg->Insert(iter * N,
N,
raw_data.row_ids_.data(),
raw_data.timestamps_.data(),
raw_data.raw_);
}
}
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
for (const auto& [op, ref_func] : testcases) {
auto plan_proto = gen_compare_plan(op);
auto plan = ProtoParser(*schema).CreatePlan(*plan_proto);
query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
BitsetType final;
final = ExecuteQueryExpr(
plan->plan_node_->plannodes_->sources()[0]->sources()[0],
seg_promote,
N * num_iters,
MAX_TIMESTAMP);
EXPECT_EQ(final.size(), N * num_iters);
for (int i = 0; i < N * num_iters; ++i) {
auto ans = final[i];
if (!valid_data[i]) {
ASSERT_EQ(ans, false);
continue;
}
auto val = str_col[i];
auto another_val = another_str_col[i];
auto ref = ref_func(val, another_val);
ASSERT_EQ(ans, ref) << "@" << op << "@" << i << "!!" << val;
}
}
}
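// CompareNullable2 mirrors the case above with the nullable column
// ("another_str") on the right-hand side of the comparison.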
TEST(StringExpr, CompareNullable2) {
auto schema = std::make_shared<Schema>();
schema->AddDebugField("str", DataType::VARCHAR);
schema->AddDebugField("another_str", DataType::VARCHAR, true);
schema->AddDebugField(
"fvec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2);
auto pk = schema->AddDebugField("int64", DataType::INT64);
schema->set_primary_field_id(pk);
const auto& fvec_meta = schema->operator[](FieldName("fvec"));
const auto& str_meta = schema->operator[](FieldName("str"));
const auto& another_str_meta = schema->operator[](FieldName("another_str"));
auto gen_compare_plan =
[&, fvec_meta, str_meta, another_str_meta](
proto::plan::OpType op) -> std::unique_ptr<proto::plan::PlanNode> {
auto str_col_info =
test::GenColumnInfo(str_meta.get_id().get(),
proto::schema::DataType::VarChar,
false,
false);
auto another_str_col_info =
test::GenColumnInfo(another_str_meta.get_id().get(),
proto::schema::DataType::VarChar,
false,
false);
auto compare_expr = GenCompareExpr(op);
compare_expr->set_allocated_left_column_info(str_col_info);
compare_expr->set_allocated_right_column_info(another_str_col_info);
auto expr = test::GenExpr().release();
expr->set_allocated_compare_expr(compare_expr);
proto::plan::VectorType vector_type;
if (fvec_meta.get_data_type() == DataType::VECTOR_FLOAT) {
vector_type = proto::plan::VectorType::FloatVector;
} else if (fvec_meta.get_data_type() == DataType::VECTOR_BINARY) {
vector_type = proto::plan::VectorType::BinaryVector;
} else if (fvec_meta.get_data_type() == DataType::VECTOR_FLOAT16) {
vector_type = proto::plan::VectorType::Float16Vector;
}
auto anns = GenAnns(expr, vector_type, fvec_meta.get_id().get(), "$0");
auto plan_node = std::make_unique<proto::plan::PlanNode>();
plan_node->set_allocated_vector_anns(anns);
return plan_node;
};
std::vector<std::tuple<proto::plan::OpType,
std::function<bool(std::string&, std::string&)>>>
testcases{
{proto::plan::OpType::GreaterThan,
[](std::string& v1, std::string& v2) { return v1 > v2; }},
{proto::plan::OpType::GreaterEqual,
[](std::string& v1, std::string& v2) { return v1 >= v2; }},
{proto::plan::OpType::LessThan,
[](std::string& v1, std::string& v2) { return v1 < v2; }},
{proto::plan::OpType::LessEqual,
[](std::string& v1, std::string& v2) { return v1 <= v2; }},
{proto::plan::OpType::Equal,
[](std::string& v1, std::string& v2) { return v1 == v2; }},
{proto::plan::OpType::NotEqual,
[](std::string& v1, std::string& v2) { return v1 != v2; }},
{proto::plan::OpType::PrefixMatch,
[](std::string& v1, std::string& v2) {
return PrefixMatch(v1, v2);
}},
};
auto seg = CreateGrowingSegment(schema, empty_index_meta);
int N = 1000;
std::vector<std::string> str_col;
std::vector<std::string> another_str_col;
FixedVector<bool> valid_data;
int num_iters = 100;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter);
auto reserve_col = [&, raw_data](const FieldMeta& field_meta,
std::vector<std::string>& str_col) {
auto new_str_col = raw_data.get_col(field_meta.get_id());
auto begin = FIELD_DATA(new_str_col, string).begin();
auto end = FIELD_DATA(new_str_col, string).end();
str_col.insert(str_col.end(), begin, end);
};
auto new_str_valid_col =
raw_data.get_col_valid(another_str_meta.get_id());
valid_data.insert(valid_data.end(),
new_str_valid_col.begin(),
new_str_valid_col.end());
reserve_col(str_meta, str_col);
reserve_col(another_str_meta, another_str_col);
{
seg->PreInsert(N);
seg->Insert(iter * N,
N,
raw_data.row_ids_.data(),
raw_data.timestamps_.data(),
raw_data.raw_);
}
}
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
for (const auto& [op, ref_func] : testcases) {
auto plan_proto = gen_compare_plan(op);
auto plan = ProtoParser(*schema).CreatePlan(*plan_proto);
query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
BitsetType final;
final = ExecuteQueryExpr(
plan->plan_node_->plannodes_->sources()[0]->sources()[0],
seg_promote,
N * num_iters,
MAX_TIMESTAMP);
EXPECT_EQ(final.size(), N * num_iters);
for (int i = 0; i < N * num_iters; ++i) {
auto ans = final[i];
if (!valid_data[i]) {
ASSERT_EQ(ans, false);
continue;
}
auto val = str_col[i];
auto another_val = another_str_col[i];
auto ref = ref_func(val, another_val);
ASSERT_EQ(ans, ref) << "@" << op << "@" << i << "!!" << val;
}
}
}
TEST(StringExpr, UnaryRange) {
auto schema = GenTestSchema();
const auto& fvec_meta = schema->operator[](FieldName("fvec"));
@ -510,6 +851,116 @@ TEST(StringExpr, UnaryRange) {
}
}
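// UnaryRangeNullable: unary range predicates over a nullable VARCHAR must
// return false for null rows.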
TEST(StringExpr, UnaryRangeNullable) {
auto schema = std::make_shared<Schema>();
schema->AddDebugField("str", DataType::VARCHAR, true);
schema->AddDebugField("another_str", DataType::VARCHAR);
schema->AddDebugField(
"fvec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2);
auto pk = schema->AddDebugField("int64", DataType::INT64);
schema->set_primary_field_id(pk);
const auto& fvec_meta = schema->operator[](FieldName("fvec"));
const auto& str_meta = schema->operator[](FieldName("str"));
auto gen_unary_range_plan =
[&, fvec_meta, str_meta](
proto::plan::OpType op,
std::string value) -> std::unique_ptr<proto::plan::PlanNode> {
auto column_info = test::GenColumnInfo(str_meta.get_id().get(),
proto::schema::DataType::VarChar,
false,
false);
auto unary_range_expr = test::GenUnaryRangeExpr(op, value);
unary_range_expr->set_allocated_column_info(column_info);
auto expr = test::GenExpr().release();
expr->set_allocated_unary_range_expr(unary_range_expr);
proto::plan::VectorType vector_type;
if (fvec_meta.get_data_type() == DataType::VECTOR_FLOAT) {
vector_type = proto::plan::VectorType::FloatVector;
} else if (fvec_meta.get_data_type() == DataType::VECTOR_BINARY) {
vector_type = proto::plan::VectorType::BinaryVector;
} else if (fvec_meta.get_data_type() == DataType::VECTOR_FLOAT16) {
vector_type = proto::plan::VectorType::Float16Vector;
}
auto anns = GenAnns(expr, vector_type, fvec_meta.get_id().get(), "$0");
auto plan_node = std::make_unique<proto::plan::PlanNode>();
plan_node->set_allocated_vector_anns(anns);
return plan_node;
};
std::vector<std::tuple<proto::plan::OpType,
std::string,
std::function<bool(std::string&)>>>
testcases{
{proto::plan::OpType::GreaterThan,
"2000",
[](std::string& val) { return val > "2000"; }},
{proto::plan::OpType::GreaterEqual,
"2000",
[](std::string& val) { return val >= "2000"; }},
{proto::plan::OpType::LessThan,
"3000",
[](std::string& val) { return val < "3000"; }},
{proto::plan::OpType::LessEqual,
"3000",
[](std::string& val) { return val <= "3000"; }},
{proto::plan::OpType::PrefixMatch,
"a",
[](std::string& val) { return PrefixMatch(val, "a"); }},
};
auto seg = CreateGrowingSegment(schema, empty_index_meta);
int N = 1000;
std::vector<std::string> str_col;
FixedVector<bool> valid_data;
int num_iters = 100;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter);
auto new_str_col = raw_data.get_col(str_meta.get_id());
auto begin = FIELD_DATA(new_str_col, string).begin();
auto end = FIELD_DATA(new_str_col, string).end();
str_col.insert(str_col.end(), begin, end);
auto new_str_valid_col = raw_data.get_col_valid(str_meta.get_id());
valid_data.insert(valid_data.end(),
new_str_valid_col.begin(),
new_str_valid_col.end());
seg->PreInsert(N);
seg->Insert(iter * N,
N,
raw_data.row_ids_.data(),
raw_data.timestamps_.data(),
raw_data.raw_);
}
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
for (const auto& [op, value, ref_func] : testcases) {
auto plan_proto = gen_unary_range_plan(op, value);
auto plan = ProtoParser(*schema).CreatePlan(*plan_proto);
BitsetType final;
final = ExecuteQueryExpr(
plan->plan_node_->plannodes_->sources()[0]->sources()[0],
seg_promote,
N * num_iters,
MAX_TIMESTAMP);
EXPECT_EQ(final.size(), N * num_iters);
for (int i = 0; i < N * num_iters; ++i) {
auto ans = final[i];
if (!valid_data[i]) {
ASSERT_EQ(ans, false);
continue;
}
auto val = str_col[i];
auto ref = ref_func(val);
ASSERT_EQ(ans, ref)
<< "@" << op << "@" << value << "@" << i << "!!" << val;
}
}
}
TEST(StringExpr, BinaryRange) {
auto schema = GenTestSchema();
const auto& fvec_meta = schema->operator[](FieldName("fvec"));
@ -625,6 +1076,136 @@ TEST(StringExpr, BinaryRange) {
}
}
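// BinaryRangeNullable: binary range predicates likewise treat null rows as
// non-matching, regardless of the bounds.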
TEST(StringExpr, BinaryRangeNullable) {
auto schema = std::make_shared<Schema>();
schema->AddDebugField("str", DataType::VARCHAR, true);
schema->AddDebugField("another_str", DataType::VARCHAR);
schema->AddDebugField(
"fvec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2);
auto pk = schema->AddDebugField("int64", DataType::INT64);
schema->set_primary_field_id(pk);
const auto& fvec_meta = schema->operator[](FieldName("fvec"));
const auto& str_meta = schema->operator[](FieldName("str"));
auto gen_binary_range_plan =
[&, fvec_meta, str_meta](
bool lb_inclusive,
bool ub_inclusive,
std::string lb,
std::string ub) -> std::unique_ptr<proto::plan::PlanNode> {
auto column_info = test::GenColumnInfo(str_meta.get_id().get(),
proto::schema::DataType::VarChar,
false,
false);
auto binary_range_expr =
GenBinaryRangeExpr(lb_inclusive, ub_inclusive, lb, ub);
binary_range_expr->set_allocated_column_info(column_info);
auto expr = test::GenExpr().release();
expr->set_allocated_binary_range_expr(binary_range_expr);
proto::plan::VectorType vector_type;
if (fvec_meta.get_data_type() == DataType::VECTOR_FLOAT) {
vector_type = proto::plan::VectorType::FloatVector;
} else if (fvec_meta.get_data_type() == DataType::VECTOR_BINARY) {
vector_type = proto::plan::VectorType::BinaryVector;
} else if (fvec_meta.get_data_type() == DataType::VECTOR_FLOAT16) {
vector_type = proto::plan::VectorType::Float16Vector;
}
auto anns = GenAnns(expr, vector_type, fvec_meta.get_id().get(), "$0");
auto plan_node = std::make_unique<proto::plan::PlanNode>();
plan_node->set_allocated_vector_anns(anns);
return plan_node;
};
// bool lb_inclusive, bool ub_inclusive, std::string lb, std::string ub
std::vector<std::tuple<bool,
bool,
std::string,
std::string,
std::function<bool(std::string&)>>>
testcases{
{false,
false,
"2000",
"3000",
[](std::string& val) { return val > "2000" && val < "3000"; }},
{false,
true,
"2000",
"3000",
[](std::string& val) { return val > "2000" && val <= "3000"; }},
{true,
false,
"2000",
"3000",
[](std::string& val) { return val >= "2000" && val < "3000"; }},
{true,
true,
"2000",
"3000",
[](std::string& val) { return val >= "2000" && val <= "3000"; }},
{true,
true,
"2000",
"1000",
[](std::string& val) { return false; }},
};
auto seg = CreateGrowingSegment(schema, empty_index_meta);
int N = 1000;
std::vector<std::string> str_col;
FixedVector<bool> valid_data;
int num_iters = 100;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter);
auto new_str_col = raw_data.get_col(str_meta.get_id());
auto begin = FIELD_DATA(new_str_col, string).begin();
auto end = FIELD_DATA(new_str_col, string).end();
str_col.insert(str_col.end(), begin, end);
auto new_str_valid_col = raw_data.get_col_valid(str_meta.get_id());
valid_data.insert(valid_data.end(),
new_str_valid_col.begin(),
new_str_valid_col.end());
seg->PreInsert(N);
seg->Insert(iter * N,
N,
raw_data.row_ids_.data(),
raw_data.timestamps_.data(),
raw_data.raw_);
}
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
for (const auto& [lb_inclusive, ub_inclusive, lb, ub, ref_func] :
testcases) {
auto plan_proto =
gen_binary_range_plan(lb_inclusive, ub_inclusive, lb, ub);
auto plan = ProtoParser(*schema).CreatePlan(*plan_proto);
query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
BitsetType final;
final = ExecuteQueryExpr(
plan->plan_node_->plannodes_->sources()[0]->sources()[0],
seg_promote,
N * num_iters,
MAX_TIMESTAMP);
EXPECT_EQ(final.size(), N * num_iters);
for (int i = 0; i < N * num_iters; ++i) {
auto ans = final[i];
if (!valid_data[i]) {
ASSERT_EQ(ans, false);
continue;
}
auto val = str_col[i];
auto ref = ref_func(val);
ASSERT_EQ(ans, ref)
<< "@" << lb_inclusive << "@" << ub_inclusive << "@" << lb
<< "@" << ub << "@" << i << "!!" << val;
}
}
}
TEST(AlwaysTrueStringPlan, SearchWithOutputFields) {
auto schema = GenStrPKSchema();
const auto& fvec_meta = schema->operator[](FieldName("fvec"));
@ -718,7 +1299,7 @@ TEST(AlwaysTrueStringPlan, QueryWithOutputFields) {
dataset.timestamps_.data(),
dataset.raw_);
auto expr_proto = GenAlwaysTrueExpr(fvec_meta, str_meta);
auto expr_proto = GenAlwaysTrueExprIfValid(fvec_meta, str_meta);
auto plan_proto = GenPlanNode();
plan_proto->mutable_query()->set_allocated_predicates(expr_proto);
SetTargetEntry(plan_proto, {str_meta.get_id().get()});
@ -733,4 +1314,47 @@ TEST(AlwaysTrueStringPlan, QueryWithOutputFields) {
ASSERT_EQ(retrieved->fields_data().size(), 1);
ASSERT_EQ(retrieved->fields_data(0).scalars().string_data().data().size(),
N);
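// A non-nullable output field carries no validity bitmap in the result.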
ASSERT_EQ(retrieved->fields_data(0).valid_data_size(), 0);
}
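// Nullable variant: DataGen leaves half of the nullable "str" rows null, so
// the valid-aware always-true predicate should retrieve only the valid half.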
TEST(AlwaysTrueStringPlan, QueryWithOutputFieldsNullable) {
auto schema = std::make_shared<Schema>();
schema->AddDebugField("str", DataType::VARCHAR, true);
schema->AddDebugField("another_str", DataType::VARCHAR);
schema->AddDebugField(
"fvec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2);
auto pk = schema->AddDebugField("int64", DataType::INT64);
schema->set_primary_field_id(pk);
const auto& fvec_meta = schema->operator[](FieldName("fvec"));
const auto& str_meta = schema->operator[](FieldName("str"));
auto N = 10000;
auto dataset = DataGen(schema, N);
auto vec_col = dataset.get_col<float>(fvec_meta.get_id());
auto str_col =
dataset.get_col(str_meta.get_id())->scalars().string_data().data();
auto valid_data = dataset.get_col_valid(str_meta.get_id());
auto segment = CreateGrowingSegment(schema, empty_index_meta);
segment->PreInsert(N);
segment->Insert(0,
N,
dataset.row_ids_.data(),
dataset.timestamps_.data(),
dataset.raw_);
auto expr_proto = GenAlwaysTrueExprIfValid(fvec_meta, str_meta);
auto plan_proto = GenPlanNode();
plan_proto->mutable_query()->set_allocated_predicates(expr_proto);
SetTargetEntry(plan_proto, {str_meta.get_id().get()});
auto plan = ProtoParser(*schema).CreateRetrievePlan(*plan_proto);
Timestamp time = MAX_TIMESTAMP;
auto retrieved = segment->Retrieve(
nullptr, plan.get(), time, DEFAULT_MAX_OUTPUT_SIZE, false);
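// Only the N / 2 valid rows survive the predicate.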
ASSERT_EQ(retrieved->offset().size(), N / 2);
ASSERT_EQ(retrieved->fields_data().size(), 1);
ASSERT_EQ(retrieved->fields_data(0).scalars().string_data().data().size(),
N / 2);
ASSERT_EQ(retrieved->fields_data(0).valid_data().size(), N / 2);
}
View File
@ -139,7 +139,9 @@ template <typename T>
inline void
assert_reverse(ScalarIndex<T>* index, const std::vector<T>& arr) {
for (size_t offset = 0; offset < arr.size(); ++offset) {
ASSERT_EQ(index->Reverse_Lookup(offset), arr[offset]);
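// Reverse_Lookup now returns an optional value, so validity is checked
// before the lookup result is compared.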
auto raw = index->Reverse_Lookup(offset);
ASSERT_TRUE(raw.has_value());
ASSERT_EQ(raw.value(), arr[offset]);
}
}
@ -147,7 +149,9 @@ template <>
inline void
assert_reverse(ScalarIndex<float>* index, const std::vector<float>& arr) {
for (size_t offset = 0; offset < arr.size(); ++offset) {
ASSERT_TRUE(compare_float(index->Reverse_Lookup(offset), arr[offset]));
auto raw = index->Reverse_Lookup(offset);
ASSERT_TRUE(raw.has_value());
ASSERT_TRUE(compare_float(raw.value(), arr[offset]));
}
}
@ -155,7 +159,9 @@ template <>
inline void
assert_reverse(ScalarIndex<double>* index, const std::vector<double>& arr) {
for (size_t offset = 0; offset < arr.size(); ++offset) {
ASSERT_TRUE(compare_double(index->Reverse_Lookup(offset), arr[offset]));
auto raw = index->Reverse_Lookup(offset);
ASSERT_TRUE(raw.has_value());
ASSERT_TRUE(compare_double(raw.value(), arr[offset]));
}
}
@ -164,7 +170,9 @@ inline void
assert_reverse(ScalarIndex<std::string>* index,
const std::vector<std::string>& arr) {
for (size_t offset = 0; offset < arr.size(); ++offset) {
ASSERT_TRUE(arr[offset].compare(index->Reverse_Lookup(offset)) == 0);
auto raw = index->Reverse_Lookup(offset);
ASSERT_TRUE(raw.has_value());
ASSERT_TRUE(arr[offset].compare(raw.value()) == 0);
}
}
View File
@ -667,8 +667,14 @@ DataGenForJsonArray(SchemaPtr schema,
auto insert_data = std::make_unique<InsertRecordProto>();
auto insert_cols = [&insert_data](
auto& data, int64_t count, auto& field_meta) {
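// For nullable fields, mark even rows valid and odd rows null.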
FixedVector<bool> valid_data(count);
if (field_meta.is_nullable()) {
for (int i = 0; i < count; ++i) {
valid_data[i] = (i % 2 == 0);
}
}
auto array = milvus::segcore::CreateDataArrayFrom(
data.data(), nullptr, count, field_meta);
data.data(), valid_data.data(), count, field_meta);
insert_data->mutable_fields_data()->AddAllocated(array.release());
};
for (auto field_id : schema->get_field_ids()) {
View File
@ -13019,7 +13019,6 @@ class TestCollectionSearchNoneAndDefaultData(TestcaseBase):
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.tags(CaseLabel.GPU)
@pytest.mark.skip(reason="issue #36184")
def test_search_after_different_index_with_params_none_default_data(self, varchar_scalar_index, numeric_scalar_index,
null_data_percent, _async):
"""