enhance: Add json key inverted index in stats for optimization (#38039)

Add json key inverted index in stats for optimization
https://github.com/milvus-io/milvus/issues/36995

---------

Signed-off-by: Xianhui.Lin <xianhui.lin@zilliz.com>
Co-authored-by: luzhang <luzhang@zilliz.com>
Xianhui Lin 2025-04-10 15:20:28 +08:00 committed by GitHub
parent a308d2c886
commit 3bc24c264f
133 changed files with 10969 additions and 4913 deletions


@ -414,6 +414,7 @@ queryNode:
buildParallelRate: 0.5 # the ratio of building interim index parallel matched with cpu num
multipleChunkedEnable: true # Enable multiple chunked search
knowhereScoreConsistency: false # Enable knowhere strong consistency score computation logic
jsonKeyStatsCommitInterval: 200 # the commit interval for JSON key stats
loadMemoryUsageFactor: 1 # The multiply factor of calculating the memory usage while loading segments
enableDisk: false # enable querynode load disk index, and search on disk index
maxDiskUsagePercentage: 95
@ -636,6 +637,10 @@ dataCoord:
indexTaskSlotUsage: 64 # slot usage of index task per 512mb
statsTaskSlotUsage: 8 # slot usage of stats task per 512mb
analyzeTaskSlotUsage: 65535 # slot usage of analyze task
jsonStatsTriggerCount: 10 # JSON key stats task count per trigger
jsonStatsTriggerInterval: 10 # JSON key stats task trigger interval
enabledJSONKeyStatsInSort: false # Indicates whether to enable the JSON key stats task with sort
jsonKeyStatsMemoryBudgetInTantivy: 16777216 # the memory budget, in bytes, for the JSON key index in Tantivy
ip: # TCP/IP address of dataCoord. If not specified, use the first unicastable address
port: 13333 # TCP port of dataCoord
grpc:
@ -891,6 +896,8 @@ common:
sync:
taskPoolReleaseTimeoutSeconds: 60 # The maximum time to wait for the task to finish and release resources in the pool
enabledOptimizeExpr: true # Indicates whether to enable optimize expr
enabledJSONKeyStats: false # Indicates whether to enable JSON key stats for sealed segments
enabledGrowingSegmentJSONKeyStats: false # Indicates whether to enable JSON key stats for growing segments
# QuotaConfig, configurations of Milvus quota and limits.
# By default, we enable:


@ -30,6 +30,9 @@ int CPU_NUM = DEFAULT_CPU_NUM;
int64_t EXEC_EVAL_EXPR_BATCH_SIZE = DEFAULT_EXEC_EVAL_EXPR_BATCH_SIZE;
bool OPTIMIZE_EXPR_ENABLED = DEFAULT_OPTIMIZE_EXPR_ENABLED;
int64_t JSON_KEY_STATS_COMMIT_INTERVAL = DEFAULT_JSON_KEY_STATS_COMMIT_INTERVAL;
bool GROWING_JSON_KEY_STATS_ENABLED = DEFAULT_GROWING_JSON_KEY_STATS_ENABLED;
void
SetIndexSliceSize(const int64_t size) {
FILE_SLICE_SIZE = size << 20;
@ -74,4 +77,18 @@ SetDefaultOptimizeExprEnable(bool val) {
LOG_INFO("set default optimize expr enabled: {}", OPTIMIZE_EXPR_ENABLED);
}
void
SetDefaultJSONKeyStatsCommitInterval(int64_t val) {
JSON_KEY_STATS_COMMIT_INTERVAL = val;
LOG_INFO("set default json key Stats commit interval: {}",
JSON_KEY_STATS_COMMIT_INTERVAL);
}
void
SetDefaultGrowingJSONKeyStatsEnable(bool val) {
GROWING_JSON_KEY_STATS_ENABLED = val;
LOG_INFO("set default growing json key index enable: {}",
GROWING_JSON_KEY_STATS_ENABLED);
}
} // namespace milvus


@ -29,8 +29,9 @@ extern int64_t MIDDLE_PRIORITY_THREAD_CORE_COEFFICIENT;
extern int64_t LOW_PRIORITY_THREAD_CORE_COEFFICIENT;
extern int CPU_NUM;
extern int64_t EXEC_EVAL_EXPR_BATCH_SIZE;
extern int64_t JSON_KEY_STATS_COMMIT_INTERVAL;
extern bool OPTIMIZE_EXPR_ENABLED;
extern bool GROWING_JSON_KEY_STATS_ENABLED;
void
SetIndexSliceSize(const int64_t size);
@ -52,6 +53,12 @@ SetDefaultExecEvalExprBatchSize(int64_t val);
void
SetDefaultOptimizeExprEnable(bool val);
void
SetDefaultJSONKeyStatsCommitInterval(int64_t val);
void
SetDefaultGrowingJSONKeyStatsEnable(bool val);
struct BufferView {
struct Element {
const char* data_;


@ -49,6 +49,7 @@ const char PAGE_RETAIN_ORDER[] = "page_retain_order";
const char TEXT_LOG_ROOT_PATH[] = "text_log";
const char ITERATIVE_FILTER[] = "iterative_filter";
const char HINTS[] = "hints";
const char JSON_KEY_INDEX_LOG_ROOT_PATH[] = "json_key_index_log";
const char DEFAULT_PLANNODE_ID[] = "0";
const char DEAFULT_QUERY_ID[] = "0";
@ -82,3 +83,6 @@ const std::string JSON_CAST_TYPE = "json_cast_type";
const std::string JSON_PATH = "json_path";
const bool DEFAULT_OPTIMIZE_EXPR_ENABLED = true;
const int64_t DEFAULT_CONVERT_OR_TO_IN_NUMERIC_LIMIT = 150;
const int64_t DEFAULT_JSON_INDEX_MEMORY_BUDGET = 16777216; // bytes, 16MB
const bool DEFAULT_GROWING_JSON_KEY_STATS_ENABLED = false;
const int64_t DEFAULT_JSON_KEY_STATS_COMMIT_INTERVAL = 200;


@ -12,7 +12,7 @@
#include "common/FieldMeta.h"
#include "common/SystemProperty.h"
#include "common/protobuf_utils.h"
#include "common/Common.h"
#include <boost/lexical_cast.hpp>
#include <optional>
@ -39,6 +39,11 @@ FieldMeta::enable_match() const {
return string_info_->enable_match;
}
bool
FieldMeta::enable_growing_jsonStats() const {
return IsJsonDataType(type_) && GROWING_JSON_KEY_STATS_ENABLED;
}
bool
FieldMeta::enable_analyzer() const {
if (!IsStringDataType(type_)) {


@ -148,6 +148,9 @@ class FieldMeta {
bool
enable_analyzer() const;
bool
enable_growing_jsonStats() const;
TokenizerParams
get_analyzer_params() const;


@ -149,6 +149,25 @@ class Json {
return doc;
}
value_result<document>
doc(uint16_t offset, uint16_t length) const {
thread_local simdjson::ondemand::parser parser;
// it's always safe to add the padding,
// as we have allocated the memory with this padding
auto doc = parser.iterate(
data_.data() + offset, length, length + simdjson::SIMDJSON_PADDING);
AssertInfo(doc.error() == simdjson::SUCCESS,
"failed to parse the json {} offset {}, length {}: {}, "
"total_json:{}",
std::string(data_.data() + offset, length),
offset,
length,
simdjson::error_message(doc.error()),
data_);
return doc;
}
value_result<simdjson::dom::element>
dom_doc() const {
if (data_.size() == 0) {
@ -166,6 +185,20 @@ class Json {
return doc;
}
value_result<simdjson::dom::element>
dom_doc(uint16_t offset, uint16_t length) const {
thread_local simdjson::dom::parser parser;
// it's always safe to add the padding,
// as we have allocated the memory with this padding
auto doc = parser.parse(data_.data() + offset, length);
AssertInfo(doc.error() == simdjson::SUCCESS,
"failed to parse the json {}: {}",
std::string(data_.data() + offset, length),
simdjson::error_message(doc.error()));
return doc;
}
bool
exist(std::string_view pointer) const {
auto doc = this->doc();
@ -207,6 +240,22 @@ class Json {
return doc().at_pointer(pointer).get<T>();
}
template <typename T>
value_result<T>
at(uint16_t offset, uint16_t length) const {
return doc(offset, length).get<T>();
}
std::string_view
at_string(uint16_t offset, uint16_t length) const {
return std::string_view(data_.data() + offset, length);
}
value_result<simdjson::dom::array>
array_at(uint16_t offset, uint16_t length) const {
return dom_doc(offset, length).get_array();
}
// get dom array by JSON pointer,
// call `size()` to get array size,
// call `at()` to get array element by index,
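A minimal usage sketch of the new (offset, length) accessors; the literal offsets are hypothetical stand-ins for what the JSON key stats index records per row, and the include path is assumed:

#include <cassert>
#include <string>
#include "common/Json.h"

void JsonSpanAccessSketch() {
    std::string raw = R"({"price": 42, "name": "milvus"})";
    // Real buffers are allocated with simdjson padding; mimic that here.
    raw.reserve(raw.size() + simdjson::SIMDJSON_PADDING);
    milvus::Json json(raw.data(), raw.size());   // non-owning view, as used in the expr code
    auto price = json.at<int64_t>(10, 2);        // re-parses only the bytes of `42`
    assert(!price.error() && price.value() == 42);
    auto name = json.at_string(23, 6);           // raw view of `milvus`, no parsing at all
    assert(name == "milvus");
}

The point of the span-based overloads is that a filter never has to re-parse the whole document: the index supplies the byte range of the leaf value, and only those bytes are handed to simdjson.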


@ -25,7 +25,7 @@
#include "common/Tracer.h"
#include "log/Log.h"
std::once_flag flag1, flag2, flag3, flag4, flag5, flag6, flag7;
std::once_flag flag1, flag2, flag3, flag4, flag5, flag6, flag7, flag8, flag9;
std::once_flag traceFlag;
void
@ -86,6 +86,22 @@ InitDefaultOptimizeExprEnable(bool val) {
val);
}
void
InitDefaultJSONKeyStatsCommitInterval(int64_t val) {
std::call_once(
flag8,
[](int val) { milvus::SetDefaultJSONKeyStatsCommitInterval(val); },
val);
}
void
InitDefaultGrowingJSONKeyStatsEnable(bool val) {
std::call_once(
flag9,
[](bool val) { milvus::SetDefaultGrowingJSONKeyStatsEnable(val); },
val);
}
void
InitTrace(CTraceConfig* config) {
auto traceConfig = milvus::tracer::TraceConfig{config->exporter,


@ -51,6 +51,12 @@ SetTrace(CTraceConfig* config);
void
InitDefaultOptimizeExprEnable(bool val);
void
InitDefaultJSONKeyStatsCommitInterval(int64_t val);
void
InitDefaultGrowingJSONKeyStatsEnable(bool val);
#ifdef __cplusplus
};
#endif
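For orientation, a short sketch of how these entry points are expected to be driven at startup; the Go side presumably forwards the milvus.yaml values through cgo, and the header name below is an assumption:

#include "segcore/segcore_init_c.h"  // assumed header declaring the Init* functions above

void ApplyJsonKeyStatsDefaults() {
    // Values mirror the defaults added to milvus.yaml in this PR.
    InitDefaultJSONKeyStatsCommitInterval(200);   // queryNode.jsonKeyStatsCommitInterval
    InitDefaultGrowingJSONKeyStatsEnable(false);  // common.enabledGrowingSegmentJSONKeyStats
}

Each wrapper is guarded by std::call_once, so repeated calls are harmless.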


@ -0,0 +1,509 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
/*
* MIT License
*
* Copyright (c) 2010 Serge Zaitsev
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef JSMN_H
#define JSMN_H
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
#define JSMN_STATIC
#ifdef JSMN_STATIC
#define JSMN_API static
#else
#define JSMN_API extern
#endif
/**
* JSON type identifier. Basic types are:
* o Object
* o Array
* o String
* o Other primitive: number, boolean (true/false) or null
*/
typedef enum {
JSMN_UNDEFINED = 0,
JSMN_OBJECT = 1 << 0,
JSMN_ARRAY = 1 << 1,
JSMN_STRING = 1 << 2,
JSMN_PRIMITIVE = 1 << 3
} jsmntype_t;
enum jsmnerr {
/* Not enough tokens were provided */
JSMN_ERROR_NOMEM = -1,
/* Invalid character inside JSON string */
JSMN_ERROR_INVAL = -2,
/* The string is not a full JSON packet, more bytes expected */
JSMN_ERROR_PART = -3
};
/**
* JSON token description.
* type type (object, array, string etc.)
* start start position in JSON data string
* end end position in JSON data string
*/
typedef struct jsmntok {
jsmntype_t type;
int start;
int end;
int size;
#ifdef JSMN_PARENT_LINKS
int parent;
#endif
} jsmntok_t;
/**
* JSON parser. Contains an array of token blocks available. Also stores
* the string being parsed now and current position in that string.
*/
typedef struct jsmn_parser {
unsigned int pos; /* offset in the JSON string */
unsigned int toknext; /* next token to allocate */
int toksuper; /* superior token node, e.g. parent object or array */
} jsmn_parser;
/**
* Create JSON parser over an array of tokens
*/
JSMN_API void
jsmn_init(jsmn_parser* parser);
/**
* Run JSON parser. It parses a JSON data string into an array of tokens, each
* describing a single JSON object.
*/
JSMN_API int
jsmn_parse(jsmn_parser* parser,
const char* js,
const size_t len,
jsmntok_t* tokens,
const unsigned int num_tokens);
#ifndef JSMN_HEADER
/**
* Allocates a fresh unused token from the token pool.
*/
static jsmntok_t*
jsmn_alloc_token(jsmn_parser* parser,
jsmntok_t* tokens,
const size_t num_tokens) {
jsmntok_t* tok;
if (parser->toknext >= num_tokens) {
return NULL;
}
tok = &tokens[parser->toknext++];
tok->start = tok->end = -1;
tok->size = 0;
#ifdef JSMN_PARENT_LINKS
tok->parent = -1;
#endif
return tok;
}
/**
* Fills token type and boundaries.
*/
static void
jsmn_fill_token(jsmntok_t* token,
const jsmntype_t type,
const int start,
const int end) {
token->type = type;
token->start = start;
token->end = end;
token->size = 0;
}
/**
* Fills next available token with JSON primitive.
*/
static int
jsmn_parse_primitive(jsmn_parser* parser,
const char* js,
const size_t len,
jsmntok_t* tokens,
const size_t num_tokens) {
jsmntok_t* token;
int start;
start = parser->pos;
for (; parser->pos < len && js[parser->pos] != '\0'; parser->pos++) {
switch (js[parser->pos]) {
#ifndef JSMN_STRICT
/* In strict mode primitive must be followed by "," or "}" or "]" */
case ':':
#endif
case '\t':
case '\r':
case '\n':
case ' ':
case ',':
case ']':
case '}':
goto found;
default:
/* to quiet a warning from gcc*/
break;
}
if (js[parser->pos] < 32 || js[parser->pos] >= 127) {
parser->pos = start;
return JSMN_ERROR_INVAL;
}
}
#ifdef JSMN_STRICT
/* In strict mode primitive must be followed by a comma/object/array */
parser->pos = start;
return JSMN_ERROR_PART;
#endif
found:
if (tokens == NULL) {
parser->pos--;
return 0;
}
token = jsmn_alloc_token(parser, tokens, num_tokens);
if (token == NULL) {
parser->pos = start;
return JSMN_ERROR_NOMEM;
}
jsmn_fill_token(token, JSMN_PRIMITIVE, start, parser->pos);
#ifdef JSMN_PARENT_LINKS
token->parent = parser->toksuper;
#endif
parser->pos--;
return 0;
}
/**
* Fills next token with JSON string.
*/
static int
jsmn_parse_string(jsmn_parser* parser,
const char* js,
const size_t len,
jsmntok_t* tokens,
const size_t num_tokens) {
jsmntok_t* token;
int start = parser->pos;
/* Skip starting quote */
parser->pos++;
for (; parser->pos < len && js[parser->pos] != '\0'; parser->pos++) {
char c = js[parser->pos];
/* Quote: end of string */
if (c == '\"') {
if (tokens == NULL) {
return 0;
}
token = jsmn_alloc_token(parser, tokens, num_tokens);
if (token == NULL) {
parser->pos = start;
return JSMN_ERROR_NOMEM;
}
jsmn_fill_token(token, JSMN_STRING, start + 1, parser->pos);
#ifdef JSMN_PARENT_LINKS
token->parent = parser->toksuper;
#endif
return 0;
}
/* Backslash: Quoted symbol expected */
if (c == '\\' && parser->pos + 1 < len) {
int i;
parser->pos++;
switch (js[parser->pos]) {
/* Allowed escaped symbols */
case '\"':
case '/':
case '\\':
case 'b':
case 'f':
case 'r':
case 'n':
case 't':
break;
/* Allows escaped symbol \uXXXX */
case 'u':
parser->pos++;
for (i = 0;
i < 4 && parser->pos < len && js[parser->pos] != '\0';
i++) {
/* If it isn't a hex character we have an error */
if (!((js[parser->pos] >= 48 &&
js[parser->pos] <= 57) || /* 0-9 */
(js[parser->pos] >= 65 &&
js[parser->pos] <= 70) || /* A-F */
(js[parser->pos] >= 97 &&
js[parser->pos] <= 102))) { /* a-f */
parser->pos = start;
return JSMN_ERROR_INVAL;
}
parser->pos++;
}
parser->pos--;
break;
/* Unexpected symbol */
default:
parser->pos = start;
return JSMN_ERROR_INVAL;
}
}
}
parser->pos = start;
return JSMN_ERROR_PART;
}
/**
* Parse JSON string and fill tokens.
*/
JSMN_API int
jsmn_parse(jsmn_parser* parser,
const char* js,
const size_t len,
jsmntok_t* tokens,
const unsigned int num_tokens) {
int r;
int i;
jsmntok_t* token;
int count = parser->toknext;
for (; parser->pos < len && js[parser->pos] != '\0'; parser->pos++) {
char c;
jsmntype_t type;
c = js[parser->pos];
switch (c) {
case '{':
case '[':
count++;
if (tokens == NULL) {
break;
}
token = jsmn_alloc_token(parser, tokens, num_tokens);
if (token == NULL) {
return JSMN_ERROR_NOMEM;
}
if (parser->toksuper != -1) {
jsmntok_t* t = &tokens[parser->toksuper];
#ifdef JSMN_STRICT
/* In strict mode an object or array can't become a key */
if (t->type == JSMN_OBJECT) {
return JSMN_ERROR_INVAL;
}
#endif
t->size++;
#ifdef JSMN_PARENT_LINKS
token->parent = parser->toksuper;
#endif
}
token->type = (c == '{' ? JSMN_OBJECT : JSMN_ARRAY);
token->start = parser->pos;
parser->toksuper = parser->toknext - 1;
break;
case '}':
case ']':
if (tokens == NULL) {
break;
}
type = (c == '}' ? JSMN_OBJECT : JSMN_ARRAY);
#ifdef JSMN_PARENT_LINKS
if (parser->toknext < 1) {
return JSMN_ERROR_INVAL;
}
token = &tokens[parser->toknext - 1];
for (;;) {
if (token->start != -1 && token->end == -1) {
if (token->type != type) {
return JSMN_ERROR_INVAL;
}
token->end = parser->pos + 1;
parser->toksuper = token->parent;
break;
}
if (token->parent == -1) {
if (token->type != type || parser->toksuper == -1) {
return JSMN_ERROR_INVAL;
}
break;
}
token = &tokens[token->parent];
}
#else
for (i = parser->toknext - 1; i >= 0; i--) {
token = &tokens[i];
if (token->start != -1 && token->end == -1) {
if (token->type != type) {
return JSMN_ERROR_INVAL;
}
parser->toksuper = -1;
token->end = parser->pos + 1;
break;
}
}
/* Error if unmatched closing bracket */
if (i == -1) {
return JSMN_ERROR_INVAL;
}
for (; i >= 0; i--) {
token = &tokens[i];
if (token->start != -1 && token->end == -1) {
parser->toksuper = i;
break;
}
}
#endif
break;
case '\"':
r = jsmn_parse_string(parser, js, len, tokens, num_tokens);
if (r < 0) {
return r;
}
count++;
if (parser->toksuper != -1 && tokens != NULL) {
tokens[parser->toksuper].size++;
}
break;
case '\t':
case '\r':
case '\n':
case ' ':
break;
case ':':
parser->toksuper = parser->toknext - 1;
break;
case ',':
if (tokens != NULL && parser->toksuper != -1 &&
tokens[parser->toksuper].type != JSMN_ARRAY &&
tokens[parser->toksuper].type != JSMN_OBJECT) {
#ifdef JSMN_PARENT_LINKS
parser->toksuper = tokens[parser->toksuper].parent;
#else
for (i = parser->toknext - 1; i >= 0; i--) {
if (tokens[i].type == JSMN_ARRAY ||
tokens[i].type == JSMN_OBJECT) {
if (tokens[i].start != -1 && tokens[i].end == -1) {
parser->toksuper = i;
break;
}
}
}
#endif
}
break;
#ifdef JSMN_STRICT
/* In strict mode primitives are: numbers and booleans */
case '-':
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case 't':
case 'f':
case 'n':
/* And they must not be keys of the object */
if (tokens != NULL && parser->toksuper != -1) {
const jsmntok_t* t = &tokens[parser->toksuper];
if (t->type == JSMN_OBJECT ||
(t->type == JSMN_STRING && t->size != 0)) {
return JSMN_ERROR_INVAL;
}
}
#else
/* In non-strict mode every unquoted value is a primitive */
default:
#endif
r = jsmn_parse_primitive(parser, js, len, tokens, num_tokens);
if (r < 0) {
return r;
}
count++;
if (parser->toksuper != -1 && tokens != NULL) {
tokens[parser->toksuper].size++;
}
break;
#ifdef JSMN_STRICT
/* Unexpected char in strict mode */
default:
return JSMN_ERROR_INVAL;
#endif
}
}
if (tokens != NULL) {
for (i = parser->toknext - 1; i >= 0; i--) {
/* Unmatched opened object or array */
if (tokens[i].start != -1 && tokens[i].end == -1) {
return JSMN_ERROR_PART;
}
}
}
return count;
}
/**
* Creates a new parser based over a given buffer with an array of tokens
* available.
*/
JSMN_API void
jsmn_init(jsmn_parser* parser) {
parser->pos = 0;
parser->toknext = 0;
parser->toksuper = -1;
}
#endif /* JSMN_HEADER */
#ifdef __cplusplus
}
#endif
#endif /* JSMN_H */
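The vendored jsmn tokenizer reports byte offsets into the input rather than decoded values, which is presumably why it is used by the stats builder: a token's [start, end) range matches the (offset, length) spans the key index stores. A small illustrative sketch, not the builder's actual code, assuming the declarations above are in scope:

#include <cstdio>
#include <cstring>

int JsmnOffsetsSketch() {
    const char* js = R"({"a": {"b": 42}})";
    jsmn_parser parser;
    jsmntok_t tokens[16];
    jsmn_init(&parser);
    int n = jsmn_parse(&parser, js, std::strlen(js), tokens, 16);
    if (n < 0) {
        return n;  // JSMN_ERROR_* on malformed input or when 16 tokens are not enough
    }
    for (int i = 0; i < n; i++) {
        // For the primitive 42 the printed span is the byte range a key index would record.
        std::printf("type=%d span=[%d,%d)\n",
                    (int)tokens[i].type, tokens[i].start, tokens[i].end);
    }
    return n;
}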


@ -176,6 +176,7 @@ class QueryContext : public Context {
const milvus::segcore::SegmentInternalInterface* segment,
int64_t active_count,
milvus::Timestamp timestamp,
int32_t consistency_level = 0,
std::shared_ptr<QueryConfig> query_config =
std::make_shared<QueryConfig>(),
folly::Executor* executor = nullptr,
@ -187,7 +188,8 @@ class QueryContext : public Context {
active_count_(active_count),
query_timestamp_(timestamp),
query_config_(query_config),
executor_(executor) {
executor_(executor),
consistency_level_(consistency_level) {
}
folly::Executor*
@ -270,6 +272,11 @@ class QueryContext : public Context {
return std::move(retrieve_result_);
}
int32_t
get_consistency_level() {
return consistency_level_;
}
private:
folly::Executor* executor_;
//folly::Executor::KeepAlive<> executor_keepalive_;
@ -291,6 +298,8 @@ class QueryContext : public Context {
// used for store segment search/retrieve result
milvus::SearchResult search_result_;
milvus::RetrieveResult retrieve_result_;
int32_t consistency_level_ = 0;
};
// Represent the state of one thread of query execution.


@ -449,7 +449,8 @@ class PhyBinaryArithOpEvalRangeExpr : public SegmentExpr {
const std::string& name,
const segcore::SegmentInternalInterface* segment,
int64_t active_count,
int64_t batch_size)
int64_t batch_size,
int32_t consistency_level)
: SegmentExpr(std::move(input),
name,
segment,
@ -457,7 +458,8 @@ class PhyBinaryArithOpEvalRangeExpr : public SegmentExpr {
expr->column_.nested_path_,
DataType::NONE,
active_count,
batch_size),
batch_size,
consistency_level),
expr_(expr) {
}


@ -385,6 +385,9 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForJson(EvalCtx& context) {
const auto& bitmap_input = context.get_bitmap_input();
auto* input = context.get_offset_input();
FieldId field_id = expr_->column_.field_id_;
if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
return ExecRangeVisitorImplForJsonForIndex<ValueType>();
}
auto real_batch_size =
has_offset_input_ ? input->size() : GetNextBatchSize();
if (real_batch_size == 0) {
@ -505,6 +508,246 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForJson(EvalCtx& context) {
return res_vec;
}
template <typename ValueType>
VectorPtr
PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForJsonForIndex() {
using GetType = std::conditional_t<std::is_same_v<ValueType, std::string>,
std::string_view,
ValueType>;
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
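// The three helpers below cover the ways a value can be materialized:
// BinaryRangeJSONIndexCompare re-parses the (offset, size) span with simdjson,
// falling back to double when an int64 read fails;
// BinaryRangeJSONTypeCompare interprets the raw text of the span according to
// the `type` byte recorded by the index (string view, stoll or stod);
// BinaryRangeJSONTypeCompareWithValue skips the JSON entirely and compares the
// 32-bit `value` stored inline by the index, reinterpreted as float or bool
// when `type` says so.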
#define BinaryRangeJSONIndexCompare(cmp) \
do { \
auto val = json.at<GetType>(offset, size); \
if (val.error()) { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
auto val = json.at<double>(offset, size); \
return !val.error() && (cmp); \
} \
return false; \
} \
return (cmp); \
} while (false)
#define BinaryRangeJSONTypeCompare(cmp) \
do { \
if constexpr (std::is_same_v<GetType, std::string_view>) { \
if (type == uint8_t(milvus::index::JSONType::STRING)) { \
auto val = json.at_string(offset, size); \
return (cmp); \
} else { \
return false; \
} \
} else if constexpr (std::is_same_v<GetType, double>) { \
if (type == uint8_t(milvus::index::JSONType::INT64)) { \
auto val = \
std::stoll(std::string(json.at_string(offset, size))); \
return (cmp); \
} else if (type == uint8_t(milvus::index::JSONType::DOUBLE)) { \
auto val = \
std::stod(std::string(json.at_string(offset, size))); \
return (cmp); \
} else { \
return false; \
} \
} else if constexpr (std::is_same_v<GetType, int64_t>) { \
if (type == uint8_t(milvus::index::JSONType::INT64)) { \
auto val = \
std::stoll(std::string(json.at_string(offset, size))); \
return (cmp); \
} else if (type == uint8_t(milvus::index::JSONType::DOUBLE)) { \
auto val = \
std::stod(std::string(json.at_string(offset, size))); \
return (cmp); \
} else { \
return false; \
} \
} \
} while (false)
#define BinaryRangeJSONTypeCompareWithValue(cmp) \
do { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
if (type == uint8_t(milvus::index::JSONType::FLOAT)) { \
float val = *reinterpret_cast<float*>(&value); \
return (cmp); \
} else { \
int64_t val = value; \
return (cmp); \
} \
} else if constexpr (std::is_same_v<GetType, double>) { \
if (type == uint8_t(milvus::index::JSONType::FLOAT)) { \
float val = *reinterpret_cast<float*>(&value); \
return (cmp); \
} else { \
int64_t val = value; \
return (cmp); \
} \
} else if constexpr (std::is_same_v<GetType, bool>) { \
bool val = *reinterpret_cast<bool*>(&value); \
return (cmp); \
} \
} while (false)
bool lower_inclusive = expr_->lower_inclusive_;
bool upper_inclusive = expr_->upper_inclusive_;
ValueType val1 = GetValueFromProto<ValueType>(expr_->lower_val_);
ValueType val2 = GetValueFromProto<ValueType>(expr_->upper_val_);
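// The bitmap for the whole segment is built once through the JSON key index
// and cached in cached_index_chunk_res_; subsequent batches only slice it.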
if (cached_index_chunk_id_ != 0) {
const segcore::SegmentInternalInterface* segment = nullptr;
if (segment_->type() == SegmentType::Growing) {
segment =
dynamic_cast<const segcore::SegmentGrowingImpl*>(segment_);
} else if (segment_->type() == SegmentType::Sealed) {
segment = dynamic_cast<const segcore::SegmentSealed*>(segment_);
}
auto field_id = expr_->column_.field_id_;
auto* index = segment->GetJsonKeyIndex(field_id);
Assert(index != nullptr);
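// FilterByPath calls this back for each index entry under `pointer`.
// When `valid` is set, the entry carries the leaf's `type` and a packed 32-bit
// `value`, so the range check needs no JSON access at all; otherwise the raw
// JSON of `row_id` is fetched via GetJsonData and the bytes at (offset, size)
// are re-parsed on demand.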
auto filter_func = [segment,
&field_id,
val1,
val2,
lower_inclusive,
upper_inclusive](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
if (valid) {
if constexpr (std::is_same_v<GetType, int64_t>) {
if (type != uint8_t(milvus::index::JSONType::INT32) &&
type != uint8_t(milvus::index::JSONType::INT64) &&
type != uint8_t(milvus::index::JSONType::FLOAT) &&
type != uint8_t(milvus::index::JSONType::DOUBLE)) {
return false;
}
} else if constexpr (std::is_same_v<GetType,
std::string_view>) {
if (type != uint8_t(milvus::index::JSONType::STRING) &&
type !=
uint8_t(milvus::index::JSONType::STRING_ESCAPE)) {
return false;
}
} else if constexpr (std::is_same_v<GetType, double>) {
if (type != uint8_t(milvus::index::JSONType::INT32) &&
type != uint8_t(milvus::index::JSONType::INT64) &&
type != uint8_t(milvus::index::JSONType::FLOAT) &&
type != uint8_t(milvus::index::JSONType::DOUBLE)) {
return false;
}
} else if constexpr (std::is_same_v<GetType, bool>) {
if (type != uint8_t(milvus::index::JSONType::BOOL)) {
return false;
}
}
if (lower_inclusive && upper_inclusive) {
if (type == uint8_t(milvus::index::JSONType::FLOAT)) {
BinaryRangeJSONTypeCompareWithValue(
static_cast<float>(val1) <= val &&
val <= static_cast<float>(val2));
} else {
BinaryRangeJSONTypeCompareWithValue(val1 <= val &&
val <= val2);
}
} else if (lower_inclusive && !upper_inclusive) {
if (type == uint8_t(milvus::index::JSONType::FLOAT)) {
BinaryRangeJSONTypeCompareWithValue(
static_cast<float>(val1) <= val &&
val < static_cast<float>(val2));
} else {
BinaryRangeJSONTypeCompareWithValue(val1 <= val &&
val < val2);
}
} else if (!lower_inclusive && upper_inclusive) {
if (type == uint8_t(milvus::index::JSONType::FLOAT)) {
BinaryRangeJSONTypeCompareWithValue(
static_cast<float>(val1) < val &&
val <= static_cast<float>(val2));
} else {
BinaryRangeJSONTypeCompareWithValue(val1 < val &&
val <= val2);
}
} else {
if (type == uint8_t(milvus::index::JSONType::FLOAT)) {
BinaryRangeJSONTypeCompareWithValue(
static_cast<float>(val1) < val &&
val < static_cast<float>(val2));
} else {
BinaryRangeJSONTypeCompareWithValue(val1 < val &&
val < val2);
}
}
} else {
auto json_pair = segment->GetJsonData(field_id, row_id);
if (!json_pair.second) {
return false;
}
auto json = milvus::Json(json_pair.first.data(),
json_pair.first.size());
if (lower_inclusive && upper_inclusive) {
if (type == uint8_t(milvus::index::JSONType::STRING) ||
type == uint8_t(milvus::index::JSONType::DOUBLE) ||
type == uint8_t(milvus::index::JSONType::INT64)) {
BinaryRangeJSONTypeCompare(val1 <= val && val <= val2);
} else {
BinaryRangeJSONIndexCompare(
val1 <= ValueType(val.value()) &&
ValueType(val.value()) <= val2);
}
} else if (lower_inclusive && !upper_inclusive) {
if (type == uint8_t(milvus::index::JSONType::STRING) ||
type == uint8_t(milvus::index::JSONType::DOUBLE) ||
type == uint8_t(milvus::index::JSONType::INT64)) {
BinaryRangeJSONTypeCompare(val1 <= val && val < val2);
} else {
BinaryRangeJSONIndexCompare(
val1 <= ValueType(val.value()) &&
ValueType(val.value()) < val2);
}
} else if (!lower_inclusive && upper_inclusive) {
if (type == uint8_t(milvus::index::JSONType::STRING) ||
type == uint8_t(milvus::index::JSONType::DOUBLE) ||
type == uint8_t(milvus::index::JSONType::INT64)) {
BinaryRangeJSONTypeCompare(val1 < val && val <= val2);
} else {
BinaryRangeJSONIndexCompare(
val1 < ValueType(val.value()) &&
ValueType(val.value()) <= val2);
}
} else {
if (type == uint8_t(milvus::index::JSONType::STRING) ||
type == uint8_t(milvus::index::JSONType::DOUBLE) ||
type == uint8_t(milvus::index::JSONType::INT64)) {
BinaryRangeJSONTypeCompare(val1 < val && val < val2);
} else {
BinaryRangeJSONIndexCompare(
val1 < ValueType(val.value()) &&
ValueType(val.value()) < val2);
}
}
}
};
bool is_growing = segment_->type() == SegmentType::Growing;
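// ConsistencyLevel::Strong is 0 in milvus.proto.common; the flag tells the
// index whether strong-consistency semantics are required for this query.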
bool is_strong_consistency = consistency_level_ == 0;
cached_index_chunk_res_ = index
->FilterByPath(pointer,
active_count_,
is_growing,
is_strong_consistency,
filter_func)
.clone();
cached_index_chunk_id_ = 0;
}
TargetBitmap result;
result.append(
cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size);
current_data_chunk_pos_ += real_batch_size;
return std::make_shared<ColumnVector>(std::move(result),
TargetBitmap(real_batch_size, true));
}
template <typename ValueType>
VectorPtr
PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForArray(EvalCtx& context) {


@ -245,7 +245,8 @@ class PhyBinaryRangeFilterExpr : public SegmentExpr {
const std::string& name,
const segcore::SegmentInternalInterface* segment,
int64_t active_count,
int64_t batch_size)
int64_t batch_size,
int32_t consistency_level)
: SegmentExpr(std::move(input),
name,
segment,
@ -253,7 +254,8 @@ class PhyBinaryRangeFilterExpr : public SegmentExpr {
expr->column_.nested_path_,
DataType::NONE,
active_count,
batch_size),
batch_size,
consistency_level),
expr_(expr) {
}
@ -308,6 +310,10 @@ class PhyBinaryRangeFilterExpr : public SegmentExpr {
VectorPtr
ExecRangeVisitorImplForJson(EvalCtx& context);
template <typename ValueType>
VectorPtr
ExecRangeVisitorImplForJsonForIndex();
template <typename ValueType>
VectorPtr
ExecRangeVisitorImplForArray(EvalCtx& context);


@ -98,6 +98,9 @@ PhyExistsFilterExpr::EvalJsonExistsForDataSegment(EvalCtx& context) {
auto* input = context.get_offset_input();
const auto& bitmap_input = context.get_bitmap_input();
FieldId field_id = expr_->column_.field_id_;
if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
return EvalJsonExistsForDataSegmentForIndex();
}
auto real_batch_size =
has_offset_input_ ? input->size() : GetNextBatchSize();
if (real_batch_size == 0) {
@ -159,5 +162,49 @@ PhyExistsFilterExpr::EvalJsonExistsForDataSegment(EvalCtx& context) {
return res_vec;
}
VectorPtr
PhyExistsFilterExpr::EvalJsonExistsForDataSegmentForIndex() {
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
if (cached_index_chunk_id_ != 0) {
const segcore::SegmentInternalInterface* segment = nullptr;
if (segment_->type() == SegmentType::Growing) {
segment =
dynamic_cast<const segcore::SegmentGrowingImpl*>(segment_);
} else if (segment_->type() == SegmentType::Sealed) {
segment = dynamic_cast<const segcore::SegmentSealed*>(segment_);
}
auto field_id = expr_->column_.field_id_;
auto* index = segment->GetJsonKeyIndex(field_id);
Assert(index != nullptr);
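// Existence is decided by the index alone: any entry recorded under this
// path marks the row as containing the key, so the callback accepts it
// unconditionally.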
auto filter_func = [segment, field_id, pointer](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
uint32_t value) {
return true;
};
bool is_growing = segment_->type() == SegmentType::Growing;
bool is_strong_consistency = consistency_level_ == 0;
cached_index_chunk_res_ = index
->FilterByPath(pointer,
active_count_,
is_growing,
is_strong_consistency,
filter_func)
.clone();
cached_index_chunk_id_ = 0;
}
TargetBitmap result;
result.append(
cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size);
current_data_chunk_pos_ += real_batch_size;
return std::make_shared<ColumnVector>(std::move(result),
TargetBitmap(real_batch_size, true));
}
} //namespace exec
} // namespace milvus


@ -42,7 +42,8 @@ class PhyExistsFilterExpr : public SegmentExpr {
const std::string& name,
const segcore::SegmentInternalInterface* segment,
int64_t active_count,
int64_t batch_size)
int64_t batch_size,
int32_t consistency_level)
: SegmentExpr(std::move(input),
name,
segment,
@ -51,7 +52,8 @@ class PhyExistsFilterExpr : public SegmentExpr {
DataType::NONE,
active_count,
batch_size,
true),
true,
consistency_level),
expr_(expr) {
}
@ -80,6 +82,9 @@ class PhyExistsFilterExpr : public SegmentExpr {
VectorPtr
EvalJsonExistsForIndex();
VectorPtr
EvalJsonExistsForDataSegmentForIndex();
private:
std::shared_ptr<const milvus::expr::ExistsExpr> expr_;
};


@ -154,7 +154,6 @@ CompileExpression(const expr::TypedExprPtr& expr,
const std::unordered_set<std::string>& flatten_candidates,
bool enable_constant_folding) {
ExprPtr result;
auto compiled_inputs = CompileInputs(expr, context, flatten_candidates);
auto GetTypes = [](const std::vector<ExprPtr>& exprs) {
@ -183,7 +182,8 @@ CompileExpression(const expr::TypedExprPtr& expr,
"PhyUnaryRangeFilterExpr",
context->get_segment(),
context->get_active_count(),
context->query_config()->get_expr_batch_size());
context->query_config()->get_expr_batch_size(),
context->get_consistency_level());
} else if (auto casted_expr = std::dynamic_pointer_cast<
const milvus::expr::LogicalUnaryExpr>(expr)) {
result = std::make_shared<PhyLogicalUnaryExpr>(
@ -197,7 +197,8 @@ CompileExpression(const expr::TypedExprPtr& expr,
context->get_segment(),
context->get_active_count(),
context->get_query_timestamp(),
context->query_config()->get_expr_batch_size());
context->query_config()->get_expr_batch_size(),
context->get_consistency_level());
} else if (auto casted_expr = std::dynamic_pointer_cast<
const milvus::expr::LogicalBinaryExpr>(expr)) {
if (casted_expr->op_type_ ==
@ -220,7 +221,8 @@ CompileExpression(const expr::TypedExprPtr& expr,
"PhyBinaryRangeFilterExpr",
context->get_segment(),
context->get_active_count(),
context->query_config()->get_expr_batch_size());
context->query_config()->get_expr_batch_size(),
context->get_consistency_level());
} else if (auto casted_expr = std::dynamic_pointer_cast<
const milvus::expr::AlwaysTrueExpr>(expr)) {
result = std::make_shared<PhyAlwaysTrueExpr>(
@ -238,7 +240,8 @@ CompileExpression(const expr::TypedExprPtr& expr,
"PhyBinaryArithOpEvalRangeExpr",
context->get_segment(),
context->get_active_count(),
context->query_config()->get_expr_batch_size());
context->query_config()->get_expr_batch_size(),
context->get_consistency_level());
} else if (auto casted_expr =
std::dynamic_pointer_cast<const milvus::expr::CompareExpr>(
expr)) {
@ -258,7 +261,8 @@ CompileExpression(const expr::TypedExprPtr& expr,
"PhyExistsFilterExpr",
context->get_segment(),
context->get_active_count(),
context->query_config()->get_expr_batch_size());
context->query_config()->get_expr_batch_size(),
context->get_consistency_level());
} else if (auto casted_expr = std::dynamic_pointer_cast<
const milvus::expr::JsonContainsExpr>(expr)) {
result = std::make_shared<PhyJsonContainsFilterExpr>(
@ -267,7 +271,8 @@ CompileExpression(const expr::TypedExprPtr& expr,
"PhyJsonContainsFilterExpr",
context->get_segment(),
context->get_active_count(),
context->query_config()->get_expr_batch_size());
context->query_config()->get_expr_batch_size(),
context->get_consistency_level());
} else if (auto value_expr =
std::dynamic_pointer_cast<const milvus::expr::ValueExpr>(
expr)) {
@ -298,7 +303,8 @@ CompileExpression(const expr::TypedExprPtr& expr,
"PhyNullExpr",
context->get_segment(),
context->get_active_count(),
context->query_config()->get_expr_batch_size());
context->query_config()->get_expr_batch_size(),
context->get_consistency_level());
} else {
PanicInfo(ExprInvalid, "unsupport expr: ", expr->ToString());
}
@ -481,7 +487,8 @@ ConvertMultiOrToInExpr(std::vector<std::shared_ptr<Expr>>& exprs,
query_context->get_segment(),
query_context->get_active_count(),
query_context->get_query_timestamp(),
query_context->query_config()->get_expr_batch_size());
query_context->query_config()->get_expr_batch_size(),
query_context->get_consistency_level());
}
inline void


@ -31,7 +31,9 @@
#include "expr/ITypeExpr.h"
#include "log/Log.h"
#include "query/PlanProto.h"
#include "segcore/SegmentSealedImpl.h"
#include "segcore/SegmentInterface.h"
#include "segcore/SegmentGrowingImpl.h"
namespace milvus {
namespace exec {
@ -138,7 +140,9 @@ class SegmentExpr : public Expr {
const DataType value_type,
int64_t active_count,
int64_t batch_size,
int32_t consistency_level,
bool allow_any_json_cast_type = false)
: Expr(DataType::BOOL, std::move(input), name),
segment_(segment),
field_id_(field_id),
@ -146,7 +150,8 @@ class SegmentExpr : public Expr {
value_type_(value_type),
allow_any_json_cast_type_(allow_any_json_cast_type),
active_count_(active_count),
batch_size_(batch_size) {
batch_size_(batch_size),
consistency_level_(consistency_level) {
size_per_chunk_ = segment_->size_per_chunk();
AssertInfo(
batch_size_ > 0,
@ -1219,6 +1224,23 @@ class SegmentExpr : public Expr {
use_index_ = false;
}
bool
CanUseJsonKeyIndex(FieldId field_id) const {
if (segment_->type() == SegmentType::Sealed) {
auto sealed_seg =
dynamic_cast<const segcore::SegmentSealed*>(segment_);
Assert(sealed_seg != nullptr);
if (sealed_seg->GetJsonKeyIndex(field_id) != nullptr) {
return true;
}
} else if (segment_->type() == SegmentType::Growing) {
if (segment_->GetJsonKeyIndex(field_id) != nullptr) {
return true;
}
}
return false;
}
protected:
const segcore::SegmentInternalInterface* segment_;
const FieldId field_id_;
@ -1255,6 +1277,7 @@ class SegmentExpr : public Expr {
// Cache for text match.
std::shared_ptr<TargetBitmap> cached_match_res_{nullptr};
int32_t consistency_level_{0};
};
bool


@ -259,6 +259,11 @@ PhyJsonContainsFilterExpr::ExecJsonContains(EvalCtx& context) {
auto* input = context.get_offset_input();
const auto& bitmap_input = context.get_bitmap_input();
FieldId field_id = expr_->column_.field_id_;
if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
return ExecJsonContainsByKeyIndex<ExprValueType>();
}
auto real_batch_size =
has_offset_input_ ? input->size() : GetNextBatchSize();
if (real_batch_size == 0) {
@ -349,10 +354,99 @@ PhyJsonContainsFilterExpr::ExecJsonContains(EvalCtx& context) {
return res_vec;
}
template <typename ExprValueType>
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsByKeyIndex() {
using GetType =
std::conditional_t<std::is_same_v<ExprValueType, std::string>,
std::string_view,
ExprValueType>;
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
std::unordered_set<GetType> elements;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
if (!arg_inited_) {
arg_set_ = std::make_shared<SortVectorElement<GetType>>(expr_->vals_);
arg_inited_ = true;
}
if (arg_set_->Empty()) {
MoveCursor();
return std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size, false),
TargetBitmap(real_batch_size, true));
}
if (cached_index_chunk_id_ != 0) {
const segcore::SegmentInternalInterface* segment = nullptr;
if (segment_->type() == SegmentType::Growing) {
segment =
dynamic_cast<const segcore::SegmentGrowingImpl*>(segment_);
} else if (segment_->type() == SegmentType::Sealed) {
segment = dynamic_cast<const segcore::SegmentSealed*>(segment_);
}
auto field_id = expr_->column_.field_id_;
auto* index = segment->GetJsonKeyIndex(field_id);
Assert(index != nullptr);
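// Entries flagged `valid` are skipped here; for the rest, the raw JSON of
// `row_id` is fetched and the span at (offset, size) is re-parsed as an
// array whose elements are matched against arg_set_.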
auto filter_func = [this, segment, &field_id](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
if (valid) {
return false;
} else {
auto json_pair = segment->GetJsonData(field_id, row_id);
if (!json_pair.second) {
return false;
}
auto json = milvus::Json(json_pair.first.data(),
json_pair.first.size());
auto array = json.array_at(offset, size);
if (array.error()) {
return false;
}
for (auto&& it : array) {
auto val = it.template get<GetType>();
if (val.error()) {
continue;
}
if (this->arg_set_->In(val.value())) {
return true;
}
}
return false;
}
};
bool is_growing = segment_->type() == SegmentType::Growing;
bool is_strong_consistency = consistency_level_ == 0;
cached_index_chunk_res_ = index
->FilterByPath(pointer,
active_count_,
is_growing,
is_strong_consistency,
filter_func)
.clone();
cached_index_chunk_id_ = 0;
}
TargetBitmap result;
result.append(
cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size);
current_data_chunk_pos_ += real_batch_size;
return std::make_shared<ColumnVector>(std::move(result),
TargetBitmap(real_batch_size, true));
}
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsArray(EvalCtx& context) {
auto* input = context.get_offset_input();
const auto& bitmap_input = context.get_bitmap_input();
FieldId field_id = expr_->column_.field_id_;
if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
return ExecJsonContainsArrayByKeyIndex();
}
auto real_batch_size =
has_offset_input_ ? input->size() : GetNextBatchSize();
if (real_batch_size == 0) {
@ -452,6 +546,85 @@ PhyJsonContainsFilterExpr::ExecJsonContainsArray(EvalCtx& context) {
return res_vec;
}
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsArrayByKeyIndex() {
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
std::vector<proto::plan::Array> elements;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
for (auto const& element : expr_->vals_) {
elements.emplace_back(GetValueFromProto<proto::plan::Array>(element));
}
if (elements.empty()) {
MoveCursor();
return std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size, false),
TargetBitmap(real_batch_size, true));
}
if (cached_index_chunk_id_ != 0) {
const segcore::SegmentInternalInterface* segment = nullptr;
if (segment_->type() == SegmentType::Growing) {
segment =
dynamic_cast<const segcore::SegmentGrowingImpl*>(segment_);
} else if (segment_->type() == SegmentType::Sealed) {
segment = dynamic_cast<const segcore::SegmentSealed*>(segment_);
}
auto field_id = expr_->column_.field_id_;
auto* index = segment->GetJsonKeyIndex(field_id);
Assert(index != nullptr);
auto filter_func = [segment, &elements, &field_id](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
if (valid) {
return false;
} else {
auto json_pair = segment->GetJsonData(field_id, row_id);
if (!json_pair.second) {
return false;
}
auto json = milvus::Json(json_pair.first.data(),
json_pair.first.size());
auto array = json.array_at(offset, size);
if (array.error()) {
return false;
}
for (auto&& it : array) {
auto val = it.get_array();
if (val.error()) {
continue;
}
for (auto const& element : elements) {
if (CompareTwoJsonArray(val, element)) {
return true;
}
}
}
return false;
}
};
bool is_growing = segment_->type() == SegmentType::Growing;
bool is_strong_consistency = consistency_level_ == 0;
cached_index_chunk_res_ = index
->FilterByPath(pointer,
active_count_,
is_growing,
is_strong_consistency,
filter_func)
.clone();
cached_index_chunk_id_ = 0;
}
TargetBitmap result;
result.append(
cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size);
current_data_chunk_pos_ += real_batch_size;
return std::make_shared<ColumnVector>(std::move(result),
TargetBitmap(real_batch_size, true));
}
template <typename ExprValueType>
VectorPtr
PhyJsonContainsFilterExpr::ExecArrayContainsAll(EvalCtx& context) {
@ -519,7 +692,6 @@ PhyJsonContainsFilterExpr::ExecArrayContainsAll(EvalCtx& context) {
}
processed_cursor += size;
};
int64_t processed_size;
if (has_offset_input_) {
processed_size =
@ -550,6 +722,11 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAll(EvalCtx& context) {
ExprValueType>;
auto* input = context.get_offset_input();
const auto& bitmap_input = context.get_bitmap_input();
FieldId field_id = expr_->column_.field_id_;
if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
return ExecJsonContainsAllByKeyIndex<ExprValueType>();
}
auto real_batch_size =
has_offset_input_ ? input->size() : GetNextBatchSize();
if (real_batch_size == 0) {
@ -643,10 +820,98 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAll(EvalCtx& context) {
return res_vec;
}
template <typename ExprValueType>
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsAllByKeyIndex() {
using GetType =
std::conditional_t<std::is_same_v<ExprValueType, std::string>,
std::string_view,
ExprValueType>;
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
std::set<GetType> elements;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
for (auto const& element : expr_->vals_) {
elements.insert(GetValueFromProto<GetType>(element));
}
if (elements.empty()) {
MoveCursor();
return std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size, false),
TargetBitmap(real_batch_size, true));
}
if (cached_index_chunk_id_ != 0) {
const segcore::SegmentInternalInterface* segment = nullptr;
if (segment_->type() == SegmentType::Growing) {
segment =
dynamic_cast<const segcore::SegmentGrowingImpl*>(segment_);
} else if (segment_->type() == SegmentType::Sealed) {
segment = dynamic_cast<const segcore::SegmentSealed*>(segment_);
}
auto field_id = expr_->column_.field_id_;
auto* index = segment->GetJsonKeyIndex(field_id);
Assert(index != nullptr);
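// A per-row copy of `elements` is eroded as matching array elements are
// found; the row qualifies once the copy is empty, i.e. every requested
// element occurs in the array at this path.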
auto filter_func = [segment, &elements, &field_id](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
if (valid) {
return false;
} else {
auto json_pair = segment->GetJsonData(field_id, row_id);
if (!json_pair.second) {
return false;
}
auto json = milvus::Json(json_pair.first.data(),
json_pair.first.size());
auto array = json.array_at(offset, size);
if (array.error()) {
return false;
}
std::set<GetType> tmp_elements(elements);
for (auto&& it : array) {
auto val = it.template get<GetType>();
if (val.error()) {
continue;
}
tmp_elements.erase(val.value());
if (tmp_elements.size() == 0) {
return true;
}
}
return tmp_elements.empty();
}
};
bool is_growing = segment_->type() == SegmentType::Growing;
bool is_strong_consistency = consistency_level_ == 0;
cached_index_chunk_res_ = index
->FilterByPath(pointer,
active_count_,
is_growing,
is_strong_consistency,
filter_func)
.clone();
cached_index_chunk_id_ = 0;
}
TargetBitmap result;
result.append(
cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size);
current_data_chunk_pos_ += real_batch_size;
return std::make_shared<ColumnVector>(std::move(result),
TargetBitmap(real_batch_size, true));
}
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffType(EvalCtx& context) {
auto* input = context.get_offset_input();
const auto& bitmap_input = context.get_bitmap_input();
FieldId field_id = expr_->column_.field_id_;
if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
return ExecJsonContainsAllWithDiffTypeByKeyIndex();
}
auto real_batch_size =
has_offset_input_ ? input->size() : GetNextBatchSize();
if (real_batch_size == 0) {
@ -805,10 +1070,157 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffType(EvalCtx& context) {
return res_vec;
}
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffTypeByKeyIndex() {
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
auto elements = expr_->vals_;
std::set<int> elements_index;
int i = 0;
for (auto& element : elements) {
elements_index.insert(i);
i++;
}
if (elements.empty()) {
MoveCursor();
return std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size, false),
TargetBitmap(real_batch_size, true));
}
if (cached_index_chunk_id_ != 0) {
const segcore::SegmentInternalInterface* segment = nullptr;
if (segment_->type() == SegmentType::Growing) {
segment =
dynamic_cast<const segcore::SegmentGrowingImpl*>(segment_);
} else if (segment_->type() == SegmentType::Sealed) {
segment = dynamic_cast<const segcore::SegmentSealed*>(segment_);
}
auto field_id = expr_->column_.field_id_;
auto* index = segment->GetJsonKeyIndex(field_id);
Assert(index != nullptr);
auto filter_func = [segment, &elements, &elements_index, &field_id](
bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
if (valid) {
return false;
} else {
auto json_pair = segment->GetJsonData(field_id, row_id);
if (!json_pair.second) {
return false;
}
auto json = milvus::Json(json_pair.first.data(),
json_pair.first.size());
std::set<int> tmp_elements_index(elements_index);
auto array = json.array_at(offset, size);
if (array.error()) {
return false;
}
for (auto&& it : array) {
int i = -1;
for (auto& element : elements) {
i++;
switch (element.val_case()) {
case proto::plan::GenericValue::kBoolVal: {
auto val = it.template get<bool>();
if (val.error()) {
continue;
}
if (val.value() == element.bool_val()) {
tmp_elements_index.erase(i);
}
break;
}
case proto::plan::GenericValue::kInt64Val: {
auto val = it.template get<int64_t>();
if (val.error()) {
continue;
}
if (val.value() == element.int64_val()) {
tmp_elements_index.erase(i);
}
break;
}
case proto::plan::GenericValue::kFloatVal: {
auto val = it.template get<double>();
if (val.error()) {
continue;
}
if (val.value() == element.float_val()) {
tmp_elements_index.erase(i);
}
break;
}
case proto::plan::GenericValue::kStringVal: {
auto val = it.template get<std::string_view>();
if (val.error()) {
continue;
}
if (val.value() == element.string_val()) {
tmp_elements_index.erase(i);
}
break;
}
case proto::plan::GenericValue::kArrayVal: {
auto val = it.get_array();
if (val.error()) {
continue;
}
if (CompareTwoJsonArray(val,
element.array_val())) {
tmp_elements_index.erase(i);
}
break;
}
default:
PanicInfo(
DataTypeInvalid,
fmt::format("unsupported data type {}",
element.val_case()));
}
if (tmp_elements_index.size() == 0) {
return true;
}
}
if (tmp_elements_index.size() == 0) {
return true;
}
}
return tmp_elements_index.size() == 0;
}
};
bool is_growing = segment_->type() == SegmentType::Growing;
bool is_strong_consistency = consistency_level_ == 0;
cached_index_chunk_res_ = index
->FilterByPath(pointer,
active_count_,
is_growing,
is_strong_consistency,
filter_func)
.clone();
cached_index_chunk_id_ = 0;
}
TargetBitmap result;
result.append(
cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size);
current_data_chunk_pos_ += real_batch_size;
return std::make_shared<ColumnVector>(std::move(result),
TargetBitmap(real_batch_size, true));
}
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsAllArray(EvalCtx& context) {
auto* input = context.get_offset_input();
const auto& bitmap_input = context.get_bitmap_input();
FieldId field_id = expr_->column_.field_id_;
if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
return ExecJsonContainsAllArrayByKeyIndex();
}
auto real_batch_size =
has_offset_input_ ? input->size() : GetNextBatchSize();
if (real_batch_size == 0) {
@ -914,10 +1326,97 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllArray(EvalCtx& context) {
return res_vec;
}
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsAllArrayByKeyIndex() {
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
std::vector<proto::plan::Array> elements;
for (auto const& element : expr_->vals_) {
elements.emplace_back(GetValueFromProto<proto::plan::Array>(element));
}
if (elements.empty()) {
MoveCursor();
return std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size, false),
TargetBitmap(real_batch_size, true));
}
if (cached_index_chunk_id_ != 0) {
const segcore::SegmentInternalInterface* segment = nullptr;
if (segment_->type() == SegmentType::Growing) {
segment =
dynamic_cast<const segcore::SegmentGrowingImpl*>(segment_);
} else if (segment_->type() == SegmentType::Sealed) {
segment = dynamic_cast<const segcore::SegmentSealed*>(segment_);
}
auto field_id = expr_->column_.field_id_;
auto* index = segment->GetJsonKeyIndex(field_id);
Assert(index != nullptr);
auto filter_func = [segment, &elements, &field_id](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
if (valid) {
return false;
} else {
auto json_pair = segment->GetJsonData(field_id, row_id);
if (!json_pair.second) {
return false;
}
auto json = milvus::Json(json_pair.first.data(),
json_pair.first.size());
auto array = json.array_at(offset, size);
if (array.error()) {
return false;
}
std::set<int> exist_elements_index;
for (auto&& it : array) {
auto json_array = it.get_array();
if (json_array.error()) {
continue;
}
for (int index = 0; index < elements.size(); ++index) {
if (CompareTwoJsonArray(json_array, elements[index])) {
exist_elements_index.insert(index);
}
}
if (exist_elements_index.size() == elements.size()) {
return true;
}
}
return exist_elements_index.size() == elements.size();
}
};
bool is_growing = segment_->type() == SegmentType::Growing;
bool is_strong_consistency = consistency_level_ == 0;
cached_index_chunk_res_ = index
->FilterByPath(pointer,
active_count_,
is_growing,
is_strong_consistency,
filter_func)
.clone();
cached_index_chunk_id_ = 0;
}
TargetBitmap result;
result.append(
cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size);
current_data_chunk_pos_ += real_batch_size;
return std::make_shared<ColumnVector>(std::move(result),
TargetBitmap(real_batch_size, true));
}
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffType(EvalCtx& context) {
auto* input = context.get_offset_input();
const auto& bitmap_input = context.get_bitmap_input();
FieldId field_id = expr_->column_.field_id_;
if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
return ExecJsonContainsWithDiffTypeByKeyIndex();
}
auto real_batch_size =
has_offset_input_ ? input->size() : GetNextBatchSize();
if (real_batch_size == 0) {
@ -1066,6 +1565,134 @@ PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffType(EvalCtx& context) {
return res_vec;
}
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffTypeByKeyIndex() {
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
auto elements = expr_->vals_;
if (elements.empty()) {
MoveCursor();
return std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size, false),
TargetBitmap(real_batch_size, true));
}
if (cached_index_chunk_id_ != 0) {
const segcore::SegmentInternalInterface* segment = nullptr;
if (segment_->type() == SegmentType::Growing) {
segment =
dynamic_cast<const segcore::SegmentGrowingImpl*>(segment_);
} else if (segment_->type() == SegmentType::Sealed) {
segment = dynamic_cast<const segcore::SegmentSealed*>(segment_);
}
auto field_id = expr_->column_.field_id_;
auto* index = segment->GetJsonKeyIndex(field_id);
Assert(index != nullptr);
auto filter_func = [segment, &elements, &field_id](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
if (valid) {
return false;
} else {
auto json_pair = segment->GetJsonData(field_id, row_id);
if (!json_pair.second) {
return false;
}
auto json = milvus::Json(json_pair.first.data(),
json_pair.first.size());
auto array = json.array_at(offset, size);
if (array.error()) {
return false;
}
// Note: array can only be iterated once
for (auto&& it : array) {
for (auto const& element : elements) {
switch (element.val_case()) {
case proto::plan::GenericValue::kBoolVal: {
auto val = it.template get<bool>();
if (val.error()) {
continue;
}
if (val.value() == element.bool_val()) {
return true;
}
break;
}
case proto::plan::GenericValue::kInt64Val: {
auto val = it.template get<int64_t>();
if (val.error()) {
continue;
}
if (val.value() == element.int64_val()) {
return true;
}
break;
}
case proto::plan::GenericValue::kFloatVal: {
auto val = it.template get<double>();
if (val.error()) {
continue;
}
if (val.value() == element.float_val()) {
return true;
}
break;
}
case proto::plan::GenericValue::kStringVal: {
auto val = it.template get<std::string_view>();
if (val.error()) {
continue;
}
if (val.value() == element.string_val()) {
return true;
}
break;
}
case proto::plan::GenericValue::kArrayVal: {
auto val = it.get_array();
if (val.error()) {
continue;
}
if (CompareTwoJsonArray(val,
element.array_val())) {
return true;
}
break;
}
default:
PanicInfo(
DataTypeInvalid,
fmt::format("unsupported data type {}",
element.val_case()));
}
}
}
return false;
}
};
bool is_growing = segment_->type() == SegmentType::Growing;
bool is_strong_consistency = consistency_level_ == 0;
cached_index_chunk_res_ = index
->FilterByPath(pointer,
active_count_,
is_growing,
is_strong_consistency,
filter_func)
.clone();
cached_index_chunk_id_ = 0;
}
TargetBitmap result;
result.append(
cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size);
current_data_chunk_pos_ += real_batch_size;
return std::make_shared<ColumnVector>(std::move(result),
TargetBitmap(real_batch_size, true));
}
VectorPtr
PhyJsonContainsFilterExpr::EvalArrayContainsForIndexSegment() {
switch (expr_->column_.element_type_) {


@ -36,7 +36,8 @@ class PhyJsonContainsFilterExpr : public SegmentExpr {
const std::string& name,
const segcore::SegmentInternalInterface* segment,
int64_t active_count,
int64_t batch_size)
int64_t batch_size,
int32_t consistency_level)
: SegmentExpr(std::move(input),
name,
segment,
@ -44,7 +45,8 @@ class PhyJsonContainsFilterExpr : public SegmentExpr {
expr->column_.nested_path_,
DataType::NONE,
active_count,
batch_size),
batch_size,
consistency_level),
expr_(expr) {
}
@ -74,6 +76,10 @@ class PhyJsonContainsFilterExpr : public SegmentExpr {
VectorPtr
ExecJsonContains(EvalCtx& context);
template <typename ExprValueType>
VectorPtr
ExecJsonContainsByKeyIndex();
template <typename ExprValueType>
VectorPtr
ExecArrayContains(EvalCtx& context);
@ -82,6 +88,10 @@ class PhyJsonContainsFilterExpr : public SegmentExpr {
VectorPtr
ExecJsonContainsAll(EvalCtx& context);
template <typename ExprValueType>
VectorPtr
ExecJsonContainsAllByKeyIndex();
template <typename ExprValueType>
VectorPtr
ExecArrayContainsAll(EvalCtx& context);
@ -89,15 +99,27 @@ class PhyJsonContainsFilterExpr : public SegmentExpr {
VectorPtr
ExecJsonContainsArray(EvalCtx& context);
VectorPtr
ExecJsonContainsArrayByKeyIndex();
VectorPtr
ExecJsonContainsAllArray(EvalCtx& context);
VectorPtr
ExecJsonContainsAllArrayByKeyIndex();
VectorPtr
ExecJsonContainsAllWithDiffType(EvalCtx& context);
VectorPtr
ExecJsonContainsAllWithDiffTypeByKeyIndex();
VectorPtr
ExecJsonContainsWithDiffType(EvalCtx& context);
VectorPtr
ExecJsonContainsWithDiffTypeByKeyIndex();
VectorPtr
EvalArrayContainsForIndexSegment();

View File

@ -35,7 +35,8 @@ class PhyNullExpr : public SegmentExpr {
const std::string& name,
const segcore::SegmentInternalInterface* segment,
int64_t active_count,
int64_t batch_size)
int64_t batch_size,
int32_t consistency_level)
: SegmentExpr(std::move(input),
name,
segment,
@ -43,7 +44,8 @@ class PhyNullExpr : public SegmentExpr {
expr->column_.nested_path_,
DataType::NONE,
active_count,
batch_size),
batch_size,
consistency_level),
expr_(expr) {
}

View File

@ -539,6 +539,153 @@ PhyTermFilterExpr::ExecTermJsonVariableInField(EvalCtx& context) {
return res_vec;
}
template <typename ValueType>
VectorPtr
PhyTermFilterExpr::ExecJsonInVariableByKeyIndex() {
using GetType = std::conditional_t<std::is_same_v<ValueType, std::string>,
std::string_view,
ValueType>;
auto real_batch_size = GetNextBatchSize();
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
if (!arg_inited_) {
arg_set_ = std::make_shared<SortVectorElement<ValueType>>(expr_->vals_);
if constexpr (std::is_same_v<GetType, double>) {
arg_set_float_ =
std::make_shared<SortVectorElement<float>>(expr_->vals_);
}
arg_inited_ = true;
}
if (arg_set_->Empty()) {
MoveCursor();
return std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size, false),
TargetBitmap(real_batch_size, true));
}
if (cached_index_chunk_id_ != 0) {
const segcore::SegmentInternalInterface* segment = nullptr;
if (segment_->type() == SegmentType::Growing) {
segment =
dynamic_cast<const segcore::SegmentGrowingImpl*>(segment_);
} else if (segment_->type() == SegmentType::Sealed) {
segment = dynamic_cast<const segcore::SegmentSealed*>(segment_);
}
auto field_id = expr_->column_.field_id_;
auto* index = segment->GetJsonKeyIndex(field_id);
auto vals = expr_->vals_;
Assert(index != nullptr);
auto filter_func = [this, segment, &field_id](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
if (valid) {
if constexpr (std::is_same_v<GetType, int64_t>) {
if (type != uint8_t(milvus::index::JSONType::INT32) &&
type != uint8_t(milvus::index::JSONType::INT64) &&
type != uint8_t(milvus::index::JSONType::FLOAT) &&
type != uint8_t(milvus::index::JSONType::DOUBLE)) {
return false;
}
} else if constexpr (std::is_same_v<GetType,
std::string_view>) {
if (type != uint8_t(milvus::index::JSONType::STRING) &&
type !=
uint8_t(milvus::index::JSONType::STRING_ESCAPE)) {
return false;
}
} else if constexpr (std::is_same_v<GetType, double>) {
if (type != uint8_t(milvus::index::JSONType::INT32) &&
type != uint8_t(milvus::index::JSONType::INT64) &&
type != uint8_t(milvus::index::JSONType::FLOAT) &&
type != uint8_t(milvus::index::JSONType::DOUBLE)) {
return false;
}
} else if constexpr (std::is_same_v<GetType, bool>) {
if (type != uint8_t(milvus::index::JSONType::BOOL)) {
return false;
}
}
if constexpr (std::is_same_v<GetType, int64_t>) {
return this->arg_set_->In(value);
} else if constexpr (std::is_same_v<GetType, double>) {
float restoredValue = *reinterpret_cast<float*>(&value);
return this->arg_set_float_->In(restoredValue);
} else if constexpr (std::is_same_v<GetType, bool>) {
bool restoredValue = *reinterpret_cast<bool*>(&value);
return this->arg_set_->In(restoredValue);
}
} else {
auto json_pair = segment->GetJsonData(field_id, row_id);
if (!json_pair.second) {
return false;
}
auto json = milvus::Json(json_pair.first.data(),
json_pair.first.size());
if (type == uint8_t(milvus::index::JSONType::STRING) ||
type == uint8_t(milvus::index::JSONType::DOUBLE) ||
type == uint8_t(milvus::index::JSONType::INT64)) {
if (type == uint8_t(milvus::index::JSONType::STRING)) {
if constexpr (std::is_same_v<GetType,
std::string_view>) {
auto val = json.at_string(offset, size);
return this->arg_set_->In(ValueType(val));
} else {
return false;
}
} else if (type ==
uint8_t(milvus::index::JSONType::DOUBLE)) {
if constexpr (std::is_same_v<GetType, double>) {
auto val = std::stod(
std::string(json.at_string(offset, size)));
return this->arg_set_->In(ValueType(val));
} else {
return false;
}
} else if (type ==
uint8_t(milvus::index::JSONType::INT64)) {
if constexpr (std::is_same_v<GetType, int64_t>) {
auto val = std::stoll(
std::string(json.at_string(offset, size)));
return this->arg_set_->In(ValueType(val));
} else {
return false;
}
}
} else {
auto val = json.at<GetType>(offset, size);
if (val.error()) {
return false;
}
return this->arg_set_->In(ValueType(val.value()));
}
}
};
bool is_growing = segment_->type() == SegmentType::Growing;
bool is_strong_consistency = consistency_level_ == 0;
cached_index_chunk_res_ = index
->FilterByPath(pointer,
active_count_,
is_growing,
is_strong_consistency,
filter_func)
.clone();
cached_index_chunk_id_ = 0;
}
TargetBitmap result;
result.append(
cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size);
current_data_chunk_pos_ += real_batch_size;
return std::make_shared<ColumnVector>(std::move(result),
TargetBitmap(real_batch_size, true));
}
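The valid branch above recovers float terms from the int32 payload via reinterpret_cast. A standalone sketch of that bit-level round trip, using std::memcpy as the strictly aliasing-safe spelling of the same copy (EncodeFloatBits/DecodeFloatBits are illustrative helpers, not part of the change):
#include <cassert>
#include <cstdint>
#include <cstring>
int32_t
EncodeFloatBits(float f) {
    int32_t bits = 0;
    std::memcpy(&bits, &f, sizeof(bits));  // store the IEEE-754 bit pattern
    return bits;
}
float
DecodeFloatBits(int32_t bits) {
    float f = 0.0f;
    std::memcpy(&f, &bits, sizeof(f));  // restore the original float exactly
    return f;
}
int
main() {
    float original = 3.5f;
    assert(DecodeFloatBits(EncodeFloatBits(original)) == original);
    return 0;
}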
template <typename ValueType>
VectorPtr
PhyTermFilterExpr::ExecTermJsonFieldInVariable(EvalCtx& context) {
@ -548,6 +695,9 @@ PhyTermFilterExpr::ExecTermJsonFieldInVariable(EvalCtx& context) {
auto* input = context.get_offset_input();
const auto& bitmap_input = context.get_bitmap_input();
FieldId field_id = expr_->column_.field_id_;
if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
return ExecJsonInVariableByKeyIndex<ValueType>();
}
auto real_batch_size =
has_offset_input_ ? input->size() : GetNextBatchSize();

View File

@ -57,7 +57,8 @@ class PhyTermFilterExpr : public SegmentExpr {
const segcore::SegmentInternalInterface* segment,
int64_t active_count,
milvus::Timestamp timestamp,
int64_t batch_size)
int64_t batch_size,
int32_t consistency_level)
: SegmentExpr(std::move(input),
name,
segment,
@ -67,7 +68,8 @@ class PhyTermFilterExpr : public SegmentExpr {
? DataType::NONE
: FromValCase(expr->vals_[0].val_case()),
active_count,
batch_size),
batch_size,
consistency_level),
expr_(expr),
query_timestamp_(timestamp) {
}
@ -137,6 +139,10 @@ class PhyTermFilterExpr : public SegmentExpr {
VectorPtr
ExecTermArrayFieldInVariable(EvalCtx& context);
template <typename ValueType>
VectorPtr
ExecJsonInVariableByKeyIndex();
private:
std::shared_ptr<const milvus::expr::TermFilterExpr> expr_;
milvus::Timestamp query_timestamp_;
@ -144,7 +150,9 @@ class PhyTermFilterExpr : public SegmentExpr {
TargetBitmap cached_bits_;
bool arg_inited_{false};
std::shared_ptr<MultiElement> arg_set_;
std::shared_ptr<MultiElement> arg_set_float_;
SingleElement arg_val_;
int32_t consistency_level_ = 0;
};
} //namespace exec
} // namespace milvus

View File

@ -21,9 +21,9 @@
#include "common/type_c.h"
#include "log/Log.h"
#include <boost/regex.hpp>
namespace milvus {
namespace exec {
template <typename T>
bool
PhyUnaryRangeFilterExpr::CanUseIndexForArray() {
@ -617,6 +617,11 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJson(EvalCtx& context) {
auto* input = context.get_offset_input();
const auto& bitmap_input = context.get_bitmap_input();
FieldId field_id = expr_->column_.field_id_;
if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
return ExecRangeVisitorImplJsonForIndex<ExprValueType>();
}
auto real_batch_size =
has_offset_input_ ? input->size() : GetNextBatchSize();
if (real_batch_size == 0) {
@ -898,6 +903,506 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJson(EvalCtx& context) {
return res_vec;
}
std::pair<std::string, std::string>
PhyUnaryRangeFilterExpr::SplitAtFirstSlashDigit(std::string input) {
boost::regex rgx("/\\d+");
boost::smatch match;
if (boost::regex_search(input, match, rgx)) {
std::string firstPart = input.substr(0, match.position());
std::string secondPart = input.substr(match.position());
return {firstPart, secondPart};
} else {
return {input, ""};
}
}
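A standalone usage sketch of the split above: a JSON pointer that contains a numeric segment is divided into the key prefix that is looked up in the stats index and the array-index suffix that is applied to the decoded array afterwards (the main function and sample pointers are illustrative only):
#include <boost/regex.hpp>
#include <cassert>
#include <string>
#include <utility>
static std::pair<std::string, std::string>
SplitAtFirstSlashDigit(std::string input) {
    boost::regex rgx("/\\d+");
    boost::smatch match;
    if (boost::regex_search(input, match, rgx)) {
        return {input.substr(0, match.position()), input.substr(match.position())};
    }
    return {input, ""};
}
int
main() {
    auto split = SplitAtFirstSlashDigit("/a/b/0/c");
    assert(split.first == "/a/b");   // prefix used for the index key lookup
    assert(split.second == "/0/c");  // array index plus any deeper path
    assert(SplitAtFirstSlashDigit("/a/b").second.empty());  // no numeric segment
    return 0;
}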
template <typename ExprValueType>
VectorPtr
PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJsonForIndex() {
using GetType =
std::conditional_t<std::is_same_v<ExprValueType, std::string>,
std::string_view,
ExprValueType>;
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
auto pointerpath = milvus::Json::pointer(expr_->column_.nested_path_);
auto pointerpair = SplitAtFirstSlashDigit(pointerpath);
std::string pointer = pointerpair.first;
std::string arrayIndex = pointerpair.second;
#define UnaryRangeJSONIndexCompare(cmp) \
do { \
auto x = json.at<GetType>(offset, size); \
if (x.error()) { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
auto x = json.at<double>(offset, size); \
return !x.error() && (cmp); \
} \
return false; \
} \
return (cmp); \
} while (false)
#define UnaryJSONTypeCompare(cmp) \
do { \
if constexpr (std::is_same_v<GetType, std::string_view>) { \
if (type == uint8_t(milvus::index::JSONType::STRING)) { \
auto x = json.at_string(offset, size); \
return (cmp); \
} else { \
return false; \
} \
} else if constexpr (std::is_same_v<GetType, double>) { \
if (type == uint8_t(milvus::index::JSONType::INT64)) { \
auto x = \
std::stoll(std::string(json.at_string(offset, size))); \
return (cmp); \
} else if (type == uint8_t(milvus::index::JSONType::DOUBLE)) { \
auto x = std::stod(std::string(json.at_string(offset, size))); \
return (cmp); \
} else { \
return false; \
} \
} else if constexpr (std::is_same_v<GetType, int64_t>) { \
if (type == uint8_t(milvus::index::JSONType::INT64)) { \
auto x = \
std::stoll(std::string(json.at_string(offset, size))); \
return (cmp); \
} else if (type == uint8_t(milvus::index::JSONType::DOUBLE)) { \
auto x = std::stod(std::string(json.at_string(offset, size))); \
return (cmp); \
} else { \
return false; \
} \
} \
} while (false)
#define UnaryJSONTypeCompareWithValue(cmp) \
do { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
if (type == uint8_t(milvus::index::JSONType::FLOAT)) { \
float x = *reinterpret_cast<float*>(&value); \
return (cmp); \
} else { \
int64_t x = value; \
return (cmp); \
} \
} else if constexpr (std::is_same_v<GetType, double>) { \
if (type == uint8_t(milvus::index::JSONType::FLOAT)) { \
float x = *reinterpret_cast<float*>(&value); \
return (cmp); \
} else { \
int64_t x = value; \
return (cmp); \
} \
} else if constexpr (std::is_same_v<GetType, bool>) { \
bool x = *reinterpret_cast<bool*>(&value); \
return (cmp); \
} \
} while (false)
#define CompareValueWithOpType(type, value, val, op_type) \
switch (op_type) { \
case proto::plan::GreaterThan: \
if (type == uint8_t(milvus::index::JSONType::FLOAT)) { \
UnaryJSONTypeCompareWithValue(x > static_cast<float>(val)); \
} else { \
UnaryJSONTypeCompareWithValue(x > val); \
} \
break; \
case proto::plan::GreaterEqual: \
if (type == uint8_t(milvus::index::JSONType::FLOAT)) { \
UnaryJSONTypeCompareWithValue(x >= static_cast<float>(val)); \
} else { \
UnaryJSONTypeCompareWithValue(x >= val); \
} \
break; \
case proto::plan::LessThan: \
if (type == uint8_t(milvus::index::JSONType::FLOAT)) { \
UnaryJSONTypeCompareWithValue(x < static_cast<float>(val)); \
} else { \
UnaryJSONTypeCompareWithValue(x < val); \
} \
break; \
case proto::plan::LessEqual: \
if (type == uint8_t(milvus::index::JSONType::FLOAT)) { \
UnaryJSONTypeCompareWithValue(x <= static_cast<float>(val)); \
} else { \
UnaryJSONTypeCompareWithValue(x <= val); \
} \
break; \
case proto::plan::Equal: \
if (type == uint8_t(milvus::index::JSONType::FLOAT)) { \
UnaryJSONTypeCompareWithValue(x == static_cast<float>(val)); \
} else { \
UnaryJSONTypeCompareWithValue(x == val); \
} \
break; \
case proto::plan::NotEqual: \
if (type == uint8_t(milvus::index::JSONType::FLOAT)) { \
UnaryJSONTypeCompareWithValue(x != static_cast<float>(val)); \
} else { \
UnaryJSONTypeCompareWithValue(x != val); \
} \
break; \
default: \
return false; \
}
#define UnaryRangeJSONIndexCompareWithArrayIndex(cmp) \
do { \
if (type != uint8_t(milvus::index::JSONType::UNKNOWN)) { \
return false; \
} \
auto array = json.array_at(offset, size); \
if (array.error()) { \
return false; \
} \
auto value = array.at_pointer(arrayIndex); \
if (value.error()) { \
return false; \
} \
if constexpr (std::is_same_v<GetType, int64_t> || \
std::is_same_v<GetType, double>) { \
if (!value.is_number()) { \
return false; \
} \
} else if constexpr (std::is_same_v<GetType, std::string_view>) { \
if (!value.is_string()) { \
return false; \
} \
} else if constexpr (std::is_same_v<GetType, bool>) { \
if (!value.is_bool()) { \
return false; \
} \
} \
auto x = value.get<GetType>(); \
if (x.error()) { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
auto x = value.get<double>(); \
return !x.error() && (cmp); \
} \
} \
return (cmp); \
} while (false)
#define UnaryRangeJSONIndexCompareNotEqual(cmp) \
do { \
auto x = json.at<GetType>(offset, size); \
if (x.error()) { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
auto x = json.at<double>(offset, size); \
return x.error() || (cmp); \
} \
return true; \
} \
return (cmp); \
} while (false)
#define UnaryRangeJSONIndexCompareNotEqualWithArrayIndex(cmp) \
do { \
auto array = json.array_at(offset, size); \
if (array.error()) { \
return false; \
} \
auto value = array.at_pointer(arrayIndex); \
if (value.error()) { \
return false; \
} \
if constexpr (std::is_same_v<GetType, int64_t> || \
std::is_same_v<GetType, double>) { \
if (!value.is_number()) { \
return false; \
} \
} else if constexpr (std::is_same_v<GetType, std::string_view>) { \
if (!value.is_string()) { \
return false; \
} \
} else if constexpr (std::is_same_v<GetType, bool>) { \
if (!value.is_bool()) { \
return false; \
} \
} \
auto x = value.get<GetType>(); \
if (x.error()) { \
if constexpr (std::is_same_v<GetType, int64_t>) { \
auto x = value.get<double>(); \
return x.error() || (cmp); \
} \
} \
return (cmp); \
} while (false)
#define CHECKISJSONTYPEWITHOFFSET(type) \
(type == uint8_t(milvus::index::JSONType::STRING) || \
type == uint8_t(milvus::index::JSONType::DOUBLE) || \
type == uint8_t(milvus::index::JSONType::INT64))
#define CHECKJSONTYPEISNUMBER(type) \
if ((type != uint8_t(milvus::index::JSONType::INT32)) && \
(type != uint8_t(milvus::index::JSONType::INT64)) && \
(type != uint8_t(milvus::index::JSONType::FLOAT)) && \
(type != uint8_t(milvus::index::JSONType::DOUBLE))) { \
return false; \
}
#define ISVALIDJSONTYPE(type, GetType) \
if constexpr (std::is_same_v<GetType, int64_t>) { \
CHECKJSONTYPEISNUMBER(type) \
} else if constexpr (std::is_same_v<GetType, std::string_view>) { \
if ((type != uint8_t(milvus::index::JSONType::STRING)) && \
(type != uint8_t(milvus::index::JSONType::STRING_ESCAPE))) { \
return false; \
} \
} else if constexpr (std::is_same_v<GetType, double>) { \
CHECKJSONTYPEISNUMBER(type) \
} else if constexpr (std::is_same_v<GetType, bool>) { \
if (type != uint8_t(milvus::index::JSONType::BOOL)) { \
return false; \
} \
}
ExprValueType val = GetValueFromProto<ExprValueType>(expr_->val_);
auto op_type = expr_->op_type_;
if (cached_index_chunk_id_ != 0) {
const segcore::SegmentInternalInterface* segment = nullptr;
if (segment_->type() == SegmentType::Growing) {
segment =
dynamic_cast<const segcore::SegmentGrowingImpl*>(segment_);
} else if (segment_->type() == SegmentType::Sealed) {
segment = dynamic_cast<const segcore::SegmentSealed*>(segment_);
}
auto field_id = expr_->column_.field_id_;
auto* index = segment->GetJsonKeyIndex(field_id);
Assert(index != nullptr);
Assert(segment != nullptr);
auto filter_func = [segment,
field_id,
op_type,
val,
arrayIndex,
pointer](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
if (valid) {
if (type == uint8_t(milvus::index::JSONType::UNKNOWN) ||
!arrayIndex.empty()) {
return false;
}
ISVALIDJSONTYPE(type, GetType);
switch (op_type) {
case proto::plan::GreaterThan:
CompareValueWithOpType(type, value, val, op_type);
case proto::plan::GreaterEqual:
CompareValueWithOpType(type, value, val, op_type);
case proto::plan::LessThan:
CompareValueWithOpType(type, value, val, op_type);
case proto::plan::LessEqual:
CompareValueWithOpType(type, value, val, op_type);
case proto::plan::Equal:
CompareValueWithOpType(type, value, val, op_type);
case proto::plan::NotEqual:
CompareValueWithOpType(type, value, val, op_type);
case proto::plan::PrefixMatch:
case proto::plan::Match:
default:
return false;
}
} else {
auto json_pair = segment->GetJsonData(field_id, row_id);
if (!json_pair.second) {
return false;
}
auto json = milvus::Json(json_pair.first.data(),
json_pair.first.size());
switch (op_type) {
case proto::plan::GreaterThan:
if constexpr (std::is_same_v<GetType,
proto::plan::Array>) {
return false;
} else {
if (!arrayIndex.empty()) {
UnaryRangeJSONIndexCompareWithArrayIndex(
ExprValueType(x.value()) > val);
} else {
if (CHECKISJSONTYPEWITHOFFSET(type)) {
UnaryJSONTypeCompare(x > val);
} else {
UnaryRangeJSONIndexCompare(
ExprValueType(x.value()) > val);
}
}
}
case proto::plan::GreaterEqual:
if constexpr (std::is_same_v<GetType,
proto::plan::Array>) {
return false;
} else {
if (!arrayIndex.empty()) {
UnaryRangeJSONIndexCompareWithArrayIndex(
ExprValueType(x.value()) >= val);
} else {
if (CHECKISJSONTYPEWITHOFFSET(type)) {
UnaryJSONTypeCompare(x >= val);
} else {
UnaryRangeJSONIndexCompare(
ExprValueType(x.value()) >= val);
}
}
}
case proto::plan::LessThan:
if constexpr (std::is_same_v<GetType,
proto::plan::Array>) {
return false;
} else {
if (!arrayIndex.empty()) {
UnaryRangeJSONIndexCompareWithArrayIndex(
ExprValueType(x.value()) < val);
} else {
if (CHECKISJSONTYPEWITHOFFSET(type)) {
UnaryJSONTypeCompare(x < val);
} else {
UnaryRangeJSONIndexCompare(
ExprValueType(x.value()) < val);
}
}
}
case proto::plan::LessEqual:
if constexpr (std::is_same_v<GetType,
proto::plan::Array>) {
return false;
} else {
if (!arrayIndex.empty()) {
UnaryRangeJSONIndexCompareWithArrayIndex(
ExprValueType(x.value()) <= val);
} else {
if (CHECKISJSONTYPEWITHOFFSET(type)) {
UnaryJSONTypeCompare(x <= val);
} else {
UnaryRangeJSONIndexCompare(
ExprValueType(x.value()) <= val);
}
}
}
case proto::plan::Equal:
if constexpr (std::is_same_v<GetType,
proto::plan::Array>) {
if (type !=
uint8_t(milvus::index::JSONType::UNKNOWN)) {
return false;
}
auto array = json.array_at(offset, size);
if (array.error()) {
return false;
}
return CompareTwoJsonArray(array.value(), val);
} else {
if (!arrayIndex.empty()) {
UnaryRangeJSONIndexCompareWithArrayIndex(
ExprValueType(x.value()) == val);
} else {
if (CHECKISJSONTYPEWITHOFFSET(type)) {
UnaryJSONTypeCompare(x == val);
} else {
UnaryRangeJSONIndexCompare(
ExprValueType(x.value()) == val);
}
}
}
case proto::plan::NotEqual:
if constexpr (std::is_same_v<GetType,
proto::plan::Array>) {
if (type !=
uint8_t(milvus::index::JSONType::UNKNOWN)) {
return false;
}
auto array = json.array_at(offset, size);
if (array.error()) {
return false;
}
return !CompareTwoJsonArray(array.value(), val);
} else {
if (!arrayIndex.empty()) {
UnaryRangeJSONIndexCompareNotEqualWithArrayIndex(
ExprValueType(x.value()) != val);
} else {
if (CHECKISJSONTYPEWITHOFFSET(type)) {
UnaryJSONTypeCompare(x != val);
} else {
UnaryRangeJSONIndexCompareNotEqual(
ExprValueType(x.value()) != val);
}
}
}
case proto::plan::PrefixMatch:
if constexpr (std::is_same_v<GetType,
proto::plan::Array>) {
return false;
} else {
if (!arrayIndex.empty()) {
UnaryRangeJSONIndexCompareWithArrayIndex(
milvus::query::Match(
ExprValueType(x.value()),
val,
op_type));
} else {
if (CHECKISJSONTYPEWITHOFFSET(type)) {
UnaryJSONTypeCompare(
milvus::query::Match(x, val, op_type));
} else {
UnaryRangeJSONIndexCompare(
milvus::query::Match(
ExprValueType(x.value()),
val,
op_type));
}
}
}
case proto::plan::Match:
if constexpr (std::is_same_v<GetType,
proto::plan::Array>) {
return false;
} else {
PatternMatchTranslator translator;
auto regex_pattern = translator(val);
RegexMatcher matcher(regex_pattern);
if (!arrayIndex.empty()) {
UnaryRangeJSONIndexCompareWithArrayIndex(
matcher(ExprValueType(x.value())));
} else {
UnaryRangeJSONIndexCompare(
matcher(ExprValueType(x.value())));
}
}
default:
return false;
}
}
};
bool is_growing = segment_->type() == SegmentType::Growing;
bool is_strong_consistency = consistency_level_ == 0;
cached_index_chunk_res_ = index
->FilterByPath(pointer,
active_count_,
is_growing,
is_strong_consistency,
filter_func)
.clone();
cached_index_chunk_id_ = 0;
}
TargetBitmap result;
result.append(
cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size);
current_data_chunk_pos_ += real_batch_size;
return std::make_shared<ColumnVector>(std::move(result),
TargetBitmap(real_batch_size, true));
}
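The macros above collapse to a simple dispatch on the stored type tag: when the inline payload is tagged FLOAT, its int32 bits are reinterpreted as a float and compared in float precision; otherwise the payload compares as an integer. A standalone sketch of that dispatch with an illustrative tag enum (the real JSONType values differ):
#include <cassert>
#include <cstdint>
#include <cstring>
enum class TagType : uint8_t { kInt = 0, kFloat = 1 };  // illustrative tags only
bool
GreaterThan(TagType tag, int32_t payload, double operand) {
    if (tag == TagType::kFloat) {
        float x = 0.0f;
        std::memcpy(&x, &payload, sizeof(x));        // recover the stored float bits
        return x > static_cast<float>(operand);      // compare in float, as the macro does
    }
    return static_cast<int64_t>(payload) > operand;  // integer payloads compare directly
}
int
main() {
    float stored = 2.5f;
    int32_t bits = 0;
    std::memcpy(&bits, &stored, sizeof(bits));
    assert(GreaterThan(TagType::kFloat, bits, 2.0));
    assert(!GreaterThan(TagType::kInt, 1, 2.0));
    return 0;
}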
template <typename T>
VectorPtr
PhyUnaryRangeFilterExpr::ExecRangeVisitorImpl(EvalCtx& context) {

View File

@ -335,7 +335,8 @@ class PhyUnaryRangeFilterExpr : public SegmentExpr {
const std::string& name,
const segcore::SegmentInternalInterface* segment,
int64_t active_count,
int64_t batch_size)
int64_t batch_size,
int32_t consistency_level)
: SegmentExpr(std::move(input),
name,
segment,
@ -343,7 +344,8 @@ class PhyUnaryRangeFilterExpr : public SegmentExpr {
expr->column_.nested_path_,
FromValCase(expr->val_.val_case()),
active_count,
batch_size),
batch_size,
consistency_level),
expr_(expr) {
}
@ -411,6 +413,10 @@ class PhyUnaryRangeFilterExpr : public SegmentExpr {
VectorPtr
ExecRangeVisitorImplJson(EvalCtx& context);
template <typename ExprValueType>
VectorPtr
ExecRangeVisitorImplJsonForIndex();
template <typename ExprValueType>
VectorPtr
ExecRangeVisitorImplArray(EvalCtx& context);
@ -442,6 +448,9 @@ class PhyUnaryRangeFilterExpr : public SegmentExpr {
VectorPtr
ExecTextMatch();
std::pair<std::string, std::string>
SplitAtFirstSlashDigit(std::string input);
private:
std::shared_ptr<const milvus::expr::UnaryRangeFilterExpr> expr_;
int64_t overflow_check_pos_{0};

View File

@ -64,6 +64,16 @@ CompareTwoJsonArray(T arr1, const proto::plan::Array& arr2) {
simdjson::ondemand::value>>>) {
json_array_length = arr1.size();
}
if constexpr (std::is_same_v<
T,
simdjson::simdjson_result<simdjson::dom::array>>) {
json_array_length = arr1.size();
}
if constexpr (std::is_same_v<T, simdjson::dom::array>) {
json_array_length = arr1.size();
}
if (arr2.array_size() != json_array_length) {
return false;
}

View File

@ -218,7 +218,7 @@ InvertedIndexTantivy<T>::Load(milvus::tracer::TraceContext ctx,
std::vector<std::string> null_offset_files;
std::shared_ptr<FieldDataBase> null_offset_data;
auto find_file = [&](const std::string& target) -> auto{
auto find_file = [&](const std::string& target) -> auto {
return std::find_if(inverted_index_files.begin(),
inverted_index_files.end(),
[&](const std::string& filename) {

View File

@ -0,0 +1,476 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <boost/uuid/random_generator.hpp>
#include <boost/uuid/uuid_io.hpp>
#include "index/JsonKeyStatsInvertedIndex.h"
#include "index/InvertedIndexUtil.h"
#include "index/Utils.h"
#include "storage/MmapManager.h"
namespace milvus::index {
constexpr const char* TMP_JSON_INVERTED_LOG_PREFIX =
"/tmp/milvus/json-key-inverted-index-log/";
void
JsonKeyStatsInvertedIndex::AddJSONEncodeValue(
const std::vector<std::string>& paths,
uint8_t flag,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t length,
int32_t value,
std::map<std::string, std::vector<int64_t>>& mp) {
std::string key = "";
if (!paths.empty()) {
key = std::string("/") + Join(paths, "/");
}
LOG_DEBUG(
"insert inverted key: {}, flag: {}, type: {}, row_id: {}, offset: "
"{}, length:{}, value:{}",
key,
flag,
type,
row_id,
offset,
length,
value);
int64_t combine_id = 0;
if (flag) {
combine_id = EncodeValue(flag, type, row_id, value);
} else {
combine_id = EncodeOffset(flag, type, row_id, offset, length);
}
mp[key].push_back(combine_id);
}
void
JsonKeyStatsInvertedIndex::AddInvertedRecord(
std::map<std::string, std::vector<int64_t>>& mp) {
for (auto& iter : mp) {
for (auto value : iter.second) {
wrapper_->add_array_data<std::string>(&iter.first, 1, value);
}
}
}
void
JsonKeyStatsInvertedIndex::TravelJson(
const char* json,
jsmntok* tokens,
int& index,
std::vector<std::string>& path,
int32_t offset,
std::map<std::string, std::vector<int64_t>>& mp) {
jsmntok current = tokens[0];
Assert(current.type != JSMN_UNDEFINED);
if (current.type == JSMN_OBJECT) {
if (!path.empty()) {
AddJSONEncodeValue(path,
0,
0,
offset,
current.start,
current.end - current.start,
0,
mp);
}
int j = 1;
for (int i = 0; i < current.size; i++) {
Assert(tokens[j].type == JSMN_STRING && tokens[j].size != 0);
std::string key(json + tokens[j].start,
tokens[j].end - tokens[j].start);
path.push_back(key);
j++;
int consumed = 0;
TravelJson(json, tokens + j, consumed, path, offset, mp);
path.pop_back();
j += consumed;
}
index = j;
} else if (current.type == JSMN_PRIMITIVE) {
std::string value(json + current.start, current.end - current.start);
auto type = getType(value);
if (type == JSONType::INT32) {
AddJSONEncodeValue(path,
1,
static_cast<uint8_t>(JSONType::INT32),
offset,
current.start,
current.end - current.start,
stoi(value),
mp);
} else if (type == JSONType::INT64) {
AddJSONEncodeValue(path,
0,
static_cast<uint8_t>(JSONType::INT64),
offset,
current.start,
current.end - current.start,
0,
mp);
} else if (type == JSONType::FLOAT) {
auto fvalue = stof(value);
int32_t valueBits = *reinterpret_cast<int32_t*>(&fvalue);
AddJSONEncodeValue(path,
1,
static_cast<uint8_t>(JSONType::FLOAT),
offset,
current.start,
current.end - current.start,
valueBits,
mp);
} else if (type == JSONType::DOUBLE) {
AddJSONEncodeValue(path,
0,
static_cast<uint8_t>(JSONType::DOUBLE),
offset,
current.start,
current.end - current.start,
0,
mp);
} else if (type == JSONType::BOOL) {
AddJSONEncodeValue(path,
1,
static_cast<uint8_t>(JSONType::BOOL),
offset,
current.start,
current.end - current.start,
value == "true" ? 1 : 0,
mp);
}
index++;
} else if (current.type == JSMN_ARRAY) {
AddJSONEncodeValue(path,
0,
static_cast<uint8_t>(JSONType::UNKNOWN),
offset,
current.start,
current.end - current.start,
0,
mp);
// skip array parse: advance past every nested token belonging to this array
int count = current.size;
int j = 1;
while (count > 0) {
count--;
if (tokens[j].size != 0) {
count += tokens[j].size;
}
j++;
}
index = j;
} else if (current.type == JSMN_STRING) {
Assert(current.size == 0);
std::string value(json + current.start, current.end - current.start);
if (has_escape_sequence(value)) {
AddJSONEncodeValue(path,
0,
static_cast<uint8_t>(JSONType::STRING_ESCAPE),
offset,
current.start - 1,
current.end - current.start + 2,
0,
mp);
} else {
AddJSONEncodeValue(path,
0,
static_cast<uint8_t>(JSONType::STRING),
offset,
current.start,
current.end - current.start,
0,
mp);
}
index++;
}
}
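TravelJson walks the flat token stream produced by jsmn. A standalone sketch of that stream for a small document (the include path is illustrative; the index pulls the same parser in via common/jsmn.h). For {"a": 1, "b": "x", "c": [1, 2]} the object token carries size 3 (its key count), each key string carries size 1, scalar values carry size 0, and the array token carries size 2, which is exactly what the skip-array loop above relies on:
#include <cstdio>
#include <cstring>
#include "jsmn.h"  // illustrative path; the index includes it as common/jsmn.h
int
main() {
    const char* js = "{\"a\": 1, \"b\": \"x\", \"c\": [1, 2]}";
    jsmn_parser parser;
    jsmn_init(&parser);
    jsmntok_t tokens[16];
    int n = jsmn_parse(&parser, js, strlen(js), tokens, 16);
    for (int i = 0; i < n; i++) {
        // prints type, size and the raw text span of every token
        printf("type=%d size=%d text=%.*s\n",
               static_cast<int>(tokens[i].type),
               tokens[i].size,
               tokens[i].end - tokens[i].start,
               js + tokens[i].start);
    }
    return 0;
}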
void
JsonKeyStatsInvertedIndex::AddJson(
const char* json,
int64_t offset,
std::map<std::string, std::vector<int64_t>>& mp) {
jsmn_parser parser;
jsmntok_t* tokens = (jsmntok_t*)malloc(16 * sizeof(jsmntok_t));
if (!tokens) {
PanicInfo(ErrorCode::UnexpectedError, "alloc jsmn token failed");
return;
}
int num_tokens = 0;
int token_capacity = 16;
jsmn_init(&parser);
while (1) {
int r = jsmn_parse(&parser, json, strlen(json), tokens, token_capacity);
if (r < 0) {
if (r == JSMN_ERROR_NOMEM) {
// Reallocate tokens array if not enough space
token_capacity *= 2;
tokens = (jsmntok_t*)realloc(
tokens, token_capacity * sizeof(jsmntok_t));
if (!tokens) {
PanicInfo(ErrorCode::UnexpectedError, "realloc failed");
}
continue;
} else {
free(tokens);
PanicInfo(ErrorCode::UnexpectedError,
"Failed to parse Json: {}, error: {}",
json,
int(r));
}
}
num_tokens = r;
break;
}
int index = 0;
std::vector<std::string> paths;
TravelJson(json, tokens, index, paths, offset, mp);
free(tokens);
}
JsonKeyStatsInvertedIndex::JsonKeyStatsInvertedIndex(
const storage::FileManagerContext& ctx,
bool is_load,
int64_t json_stats_tantivy_memory_budget,
uint32_t tantivy_index_version)
: commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
last_commit_time_(stdclock::now()) {
LOG_INFO("json_stats_tantivy_memory_budget:{}",
json_stats_tantivy_memory_budget);
schema_ = ctx.fieldDataMeta.field_schema;
field_id_ = ctx.fieldDataMeta.field_id;
mem_file_manager_ = std::make_shared<MemFileManager>(ctx);
disk_file_manager_ = std::make_shared<DiskFileManager>(ctx);
if (is_load) {
auto prefix = disk_file_manager_->GetLocalJsonKeyIndexPrefix();
path_ = prefix;
} else {
auto prefix = disk_file_manager_->GetJsonKeyIndexIdentifier();
path_ = std::string(TMP_JSON_INVERTED_LOG_PREFIX) + prefix;
boost::filesystem::create_directories(path_);
std::string field_name =
std::to_string(disk_file_manager_->GetFieldDataMeta().field_id);
d_type_ = TantivyDataType::Keyword;
wrapper_ = std::make_shared<TantivyIndexWrapper>(
field_name.c_str(),
d_type_,
path_.c_str(),
tantivy_index_version,
false,
false,
1,
json_stats_tantivy_memory_budget);
}
}
JsonKeyStatsInvertedIndex::JsonKeyStatsInvertedIndex(
int64_t commit_interval_in_ms, const char* unique_id)
: commit_interval_in_ms_(commit_interval_in_ms),
last_commit_time_(stdclock::now()) {
d_type_ = TantivyDataType::Keyword;
wrapper_ = std::make_shared<TantivyIndexWrapper>(
unique_id, d_type_, "", TANTIVY_INDEX_LATEST_VERSION, false, true);
}
JsonKeyStatsInvertedIndex::JsonKeyStatsInvertedIndex(
int64_t commit_interval_in_ms,
const char* unique_id,
const std::string& path)
: commit_interval_in_ms_(commit_interval_in_ms),
last_commit_time_(stdclock::now()) {
boost::filesystem::path prefix = path;
boost::filesystem::path sub_path = unique_id;
path_ = (prefix / sub_path).string();
boost::filesystem::create_directories(path_);
d_type_ = TantivyDataType::Keyword;
wrapper_ = std::make_shared<TantivyIndexWrapper>(
unique_id, d_type_, path_.c_str(), TANTIVY_INDEX_LATEST_VERSION);
}
IndexStatsPtr
JsonKeyStatsInvertedIndex::Upload(const Config& config) {
finish();
boost::filesystem::path p(path_);
boost::filesystem::directory_iterator end_iter;
for (boost::filesystem::directory_iterator iter(p); iter != end_iter;
iter++) {
if (boost::filesystem::is_directory(*iter)) {
LOG_WARN("{} is a directory", iter->path().string());
} else {
LOG_INFO("trying to add json key inverted index log: {}",
iter->path().string());
AssertInfo(
disk_file_manager_->AddJsonKeyIndexLog(iter->path().string()),
"failed to add json key inverted index log: {}",
iter->path().string());
LOG_INFO("json key inverted index log: {} added",
iter->path().string());
}
}
auto remote_paths_to_size = disk_file_manager_->GetRemotePathsToFileSize();
auto binary_set = Serialize(config);
mem_file_manager_->AddFile(binary_set);
auto remote_mem_path_to_size =
mem_file_manager_->GetRemotePathsToFileSize();
std::vector<SerializedIndexFileInfo> index_files;
index_files.reserve(remote_paths_to_size.size() +
remote_mem_path_to_size.size());
for (auto& file : remote_paths_to_size) {
index_files.emplace_back(disk_file_manager_->GetFileName(file.first),
file.second);
}
for (auto& file : remote_mem_path_to_size) {
index_files.emplace_back(file.first, file.second);
}
return IndexStats::New(mem_file_manager_->GetAddedTotalMemSize() +
disk_file_manager_->GetAddedTotalFileSize(),
std::move(index_files));
}
void
JsonKeyStatsInvertedIndex::Load(milvus::tracer::TraceContext ctx,
const Config& config) {
auto index_files =
GetValueFromConfig<std::vector<std::string>>(config, "index_files");
AssertInfo(index_files.has_value(),
"index file paths is empty when load json key index");
for (auto& index_file : index_files.value()) {
boost::filesystem::path p(index_file);
if (!p.has_parent_path()) {
auto remote_prefix =
disk_file_manager_->GetRemoteJsonKeyLogPrefix();
index_file = remote_prefix + "/" + index_file;
}
}
disk_file_manager_->CacheJsonKeyIndexToDisk(index_files.value());
AssertInfo(
tantivy_index_exist(path_.c_str()), "index not exist: {}", path_);
wrapper_ = std::make_shared<TantivyIndexWrapper>(path_.c_str());
LOG_INFO("load json key index done for field id:{} with dir:{}",
field_id_,
path_);
}
void
JsonKeyStatsInvertedIndex::BuildWithFieldData(
const std::vector<FieldDataPtr>& field_datas) {
AssertInfo(schema_.data_type() == proto::schema::DataType::JSON,
"schema data type is {}",
schema_.data_type());
BuildWithFieldData(field_datas, schema_.nullable());
}
void
JsonKeyStatsInvertedIndex::BuildWithFieldData(
const std::vector<FieldDataPtr>& field_datas, bool nullable) {
int64_t offset = 0;
std::map<std::string, std::vector<int64_t>> mp;
if (nullable) {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
for (int i = 0; i < n; i++) {
if (!data->is_valid(i)) {
continue;
}
AddJson(static_cast<const milvus::Json*>(data->RawValue(i))
->data()
.data(),
offset++,
mp);
}
}
} else {
for (const auto& data : field_datas) {
auto n = data->get_num_rows();
for (int i = 0; i < n; i++) {
AddJson(static_cast<const milvus::Json*>(data->RawValue(i))
->data()
.data(),
offset++,
mp);
}
}
}
AddInvertedRecord(mp);
LOG_INFO("build json key index done for field id:{}", field_id_);
}
void
JsonKeyStatsInvertedIndex::AddJSONDatas(size_t n,
const std::string* jsonDatas,
const bool* valids,
int64_t offset_begin) {
std::map<std::string, std::vector<int64_t>> mp;
for (int i = 0; i < n; i++) {
auto offset = i + offset_begin;
if (valids != nullptr && !valids[i]) {
continue;
}
AddJson(jsonDatas[i].c_str(), offset, mp);
}
AddInvertedRecord(mp);
is_data_uncommitted_ = true;
LOG_INFO("build json key index done for AddJSONDatas");
if (shouldTriggerCommit()) {
Commit();
}
}
void
JsonKeyStatsInvertedIndex::Finish() {
finish();
}
bool
JsonKeyStatsInvertedIndex::shouldTriggerCommit() {
auto span = (std::chrono::duration<double, std::milli>(
stdclock::now() - last_commit_time_.load()))
.count();
return span > commit_interval_in_ms_;
}
void
JsonKeyStatsInvertedIndex::Commit() {
std::unique_lock<std::mutex> lck(mtx_, std::defer_lock);
if (lck.try_lock()) {
is_data_uncommitted_ = false;
wrapper_->commit();
last_commit_time_.store(stdclock::now());
}
}
void
JsonKeyStatsInvertedIndex::Reload() {
std::unique_lock<std::mutex> lck(mtx_, std::defer_lock);
if (lck.try_lock()) {
wrapper_->reload();
}
}
void
JsonKeyStatsInvertedIndex::CreateReader() {
wrapper_->create_reader();
}
} // namespace milvus::index
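The growing-segment query path combines the pieces above (shouldTriggerCommit, Commit, Reload) into a small consistency decision before the posting list is scanned. A distilled sketch with a hypothetical Index type (Commit/Reload/Scan stand in for the wrapper calls); sealed segments skip straight to the scan:
struct Index {
    bool has_uncommitted = false;
    void Commit() { has_uncommitted = false; }  // flush buffered postings to a new commit
    void Reload() {}                            // reopen the reader on that commit
    void Scan() {}                              // walk postings and run the filter callback
};
inline void
QueryGrowing(Index& index, bool strong_consistency, bool commit_interval_elapsed) {
    if (strong_consistency || commit_interval_elapsed) {
        if (index.has_uncommitted) {
            index.Commit();  // a strongly consistent read must see rows added since the last commit
        }
        index.Reload();
    }
    index.Scan();  // bounded-staleness reads scan whatever the last commit made visible
}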

View File

@ -0,0 +1,298 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include <string>
#include <boost/filesystem.hpp>
#include "index/InvertedIndexTantivy.h"
#include "common/jsmn.h"
namespace milvus::index {
enum class JSONType {
UNKNOWN,
BOOL,
INT32,
INT64,
FLOAT,
DOUBLE,
STRING,
STRING_ESCAPE
};
using stdclock = std::chrono::high_resolution_clock;
class JsonKeyStatsInvertedIndex : public InvertedIndexTantivy<std::string> {
public:
explicit JsonKeyStatsInvertedIndex(
const storage::FileManagerContext& ctx,
bool is_load,
int64_t json_stats_tantivy_memory_budget = 16777216,
uint32_t tantivy_index_version = TANTIVY_INDEX_LATEST_VERSION);
explicit JsonKeyStatsInvertedIndex(int64_t commit_interval_in_ms,
const char* unique_id);
explicit JsonKeyStatsInvertedIndex(int64_t commit_interval_in_ms,
const char* unique_id,
const std::string& path);
~JsonKeyStatsInvertedIndex() override{};
public:
IndexStatsPtr
Upload(const Config& config) override;
void
Load(milvus::tracer::TraceContext ctx, const Config& config) override;
void
BuildWithFieldData(const std::vector<FieldDataPtr>& datas) override;
void
BuildWithFieldData(const std::vector<FieldDataPtr>& datas, bool nullable);
const TargetBitmap
FilterByPath(
const std::string& path,
int32_t row,
bool is_growing,
bool is_strong_consistency,
std::function<bool(
bool, uint8_t, uint32_t, uint16_t, uint16_t, int32_t)> filter) {
auto processArray = [this, &path, row, &filter]() {
TargetBitmap bitset(row);
auto array = wrapper_->term_query_i64(path);
LOG_INFO("json key filter size:{}", array.array_.len);
for (size_t j = 0; j < array.array_.len; j++) {
auto the_offset = array.array_.array[j];
if (DecodeValid(the_offset)) {
auto tuple = DecodeValue(the_offset);
auto row_id = std::get<1>(tuple);
if (row_id >= row) {
continue;
}
bitset[row_id] = filter(true,
std::get<0>(tuple),
std::get<1>(tuple),
0,
0,
std::get<2>(tuple));
} else {
auto tuple = DecodeOffset(the_offset);
auto row_id = std::get<1>(tuple);
if (row_id >= row) {
continue;
}
bitset[row_id] = filter(false,
std::get<0>(tuple),
std::get<1>(tuple),
std::get<2>(tuple),
std::get<3>(tuple),
0);
}
}
return bitset;
};
if (is_growing) {
if (shouldTriggerCommit() || is_strong_consistency) {
if (is_data_uncommitted_) {
Commit();
}
Reload();
return processArray();
} else {
return processArray();
}
} else {
return processArray();
}
}
void
AddJSONDatas(size_t n,
const std::string* jsonDatas,
const bool* valids,
int64_t offset_begin);
void
Finish();
void
Commit();
void
Reload();
void
CreateReader();
bool
has_escape_sequence(const std::string& str) {
for (size_t i = 0; i < str.size(); ++i) {
if (str[i] == '\\' && i + 1 < str.size()) {
char next = str[i + 1];
if (next == 'n' || next == 't' || next == 'r' || next == 'b' ||
next == 'f' || next == 'v' || next == '\\' ||
next == '\"' || next == '\'' || next == '0' ||
next == 'u' || next == '/') {
return true;
}
}
}
return false;
}
private:
void
AddJson(const char* json,
int64_t offset,
std::map<std::string, std::vector<int64_t>>& mp);
void
TravelJson(const char* json,
jsmntok* tokens,
int& index,
std::vector<std::string>& path,
int32_t offset,
std::map<std::string, std::vector<int64_t>>& mp);
void
AddJSONEncodeValue(const std::vector<std::string>& paths,
uint8_t flag,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t length,
int32_t value,
std::map<std::string, std::vector<int64_t>>& mp);
int64_t
EncodeOffset(uint8_t flag,
uint8_t type,
uint32_t row_id,
uint16_t row_offset,
uint16_t size) {
row_id &= 0x0FFFFFFF;
return static_cast<int64_t>(flag) << 63 |
static_cast<int64_t>(type) << 60 |
static_cast<int64_t>(row_id) << 32 |
static_cast<int64_t>(row_offset) << 16 |
static_cast<int64_t>(size);
}
int64_t
EncodeValue(uint8_t flag, uint8_t type, uint32_t row_id, int32_t value) {
row_id &= 0x0FFFFFFF;
return static_cast<int64_t>(flag) << 63 |
static_cast<int64_t>(type) << 60 |
static_cast<int64_t>(row_id) << 32 |
static_cast<uint32_t>(value);
}
bool
DecodeValid(int64_t encode_offset) {
return (encode_offset >> 63) & 1;
}
std::tuple<uint8_t, uint32_t, int32_t>
DecodeValue(int64_t encode_offset) {
uint8_t type = (encode_offset >> 60) & 0x7;
uint32_t row_id = (encode_offset >> 32) & 0x0FFFFFFF;
int32_t value = static_cast<int32_t>(encode_offset & 0xFFFFFFFF);
return std::make_tuple(type, row_id, value);
}
std::tuple<uint8_t, uint32_t, uint16_t, uint16_t>
DecodeOffset(int64_t encode_offset) {
uint8_t type = (encode_offset >> 60) & 0x7;
uint32_t row_id = (encode_offset >> 32) & 0x0FFFFFFF;
uint16_t row_offset = (encode_offset >> 16) & 0xFFFF;
uint16_t size = encode_offset & 0xFFFF;
return std::make_tuple(type, row_id, row_offset, size);
}
bool
shouldTriggerCommit();
bool
isBoolean(const std::string& str) {
return str == "true" || str == "false";
}
bool
isInt32(const std::string& str) {
std::istringstream iss(str);
int64_t num;
iss >> num;
return !iss.fail() && iss.eof() &&
num >= std::numeric_limits<int32_t>::min() &&
num <= std::numeric_limits<int32_t>::max();
}
bool
isInt64(const std::string& str) {
std::istringstream iss(str);
int64_t num;
iss >> num;
return !iss.fail() && iss.eof();
}
bool
isFloat(const std::string& str) {
try {
float d = std::stof(str);
return true;
} catch (...) {
return false;
}
}
bool
isDouble(const std::string& str) {
try {
double d = std::stod(str);
return true;
} catch (...) {
return false;
}
}
JSONType
getType(const std::string& str) {
if (isBoolean(str)) {
return JSONType::BOOL;
} else if (isInt32(str)) {
return JSONType::INT32;
} else if (isInt64(str)) {
return JSONType::INT64;
} else if (isFloat(str)) {
return JSONType::FLOAT;
} else if (isDouble(str)) {
return JSONType::DOUBLE;
}
return JSONType::UNKNOWN;
}
void
AddInvertedRecord(std::map<std::string, std::vector<int64_t>>& mp);
private:
int64_t field_id_;
mutable std::mutex mtx_;
std::atomic<stdclock::time_point> last_commit_time_;
int64_t commit_interval_in_ms_;
std::atomic<bool> is_data_uncommitted_ = false;
};
} // namespace milvus::index
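The encoding above packs one posting into a single int64: bit 63 is the inline-value flag, bits 60-62 hold the JSON type tag, bits 32-59 hold the row id (28 bits), and the low 32 bits hold either row_offset(16)/size(16) or the inline int32 value. A standalone round-trip check of the offset variant, copied from EncodeOffset/DecodeOffset above (sample values are arbitrary; tag 6 corresponds to STRING in the enum above):
#include <cassert>
#include <cstdint>
#include <tuple>
int64_t
EncodeOffset(uint8_t flag, uint8_t type, uint32_t row_id, uint16_t row_offset, uint16_t size) {
    row_id &= 0x0FFFFFFF;
    return static_cast<int64_t>(flag) << 63 | static_cast<int64_t>(type) << 60 |
           static_cast<int64_t>(row_id) << 32 |
           static_cast<int64_t>(row_offset) << 16 | static_cast<int64_t>(size);
}
std::tuple<uint8_t, uint32_t, uint16_t, uint16_t>
DecodeOffset(int64_t encoded) {
    uint8_t type = (encoded >> 60) & 0x7;
    uint32_t row_id = (encoded >> 32) & 0x0FFFFFFF;
    uint16_t row_offset = (encoded >> 16) & 0xFFFF;
    uint16_t size = encoded & 0xFFFF;
    return {type, row_id, row_offset, size};
}
int
main() {
    auto encoded = EncodeOffset(/*flag=*/0, /*type=*/6, /*row_id=*/123456,
                                /*row_offset=*/789, /*size=*/42);
    auto [type, row_id, row_offset, size] = DecodeOffset(encoded);
    assert(type == 6 && row_id == 123456 && row_offset == 789 && size == 42);
    return 0;
}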

View File

@ -34,6 +34,7 @@
#include "pb/index_cgo_msg.pb.h"
#include "storage/Util.h"
#include "index/Meta.h"
#include "index/JsonKeyStatsInvertedIndex.h"
using namespace milvus;
CStatus
@ -225,6 +226,81 @@ CreateIndex(CIndex* res_index,
}
}
CStatus
BuildJsonKeyIndex(ProtoLayoutInterface result,
const uint8_t* serialized_build_index_info,
const uint64_t len) {
try {
auto build_index_info =
std::make_unique<milvus::proto::indexcgo::BuildIndexInfo>();
auto res =
build_index_info->ParseFromArray(serialized_build_index_info, len);
AssertInfo(res, "Unmarshall build index info failed");
auto field_type =
static_cast<DataType>(build_index_info->field_schema().data_type());
auto storage_config =
get_storage_config(build_index_info->storage_config());
auto config = get_config(build_index_info);
// init file manager
milvus::storage::FieldDataMeta field_meta{
build_index_info->collectionid(),
build_index_info->partitionid(),
build_index_info->segmentid(),
build_index_info->field_schema().fieldid(),
build_index_info->field_schema()};
milvus::storage::IndexMeta index_meta{
build_index_info->segmentid(),
build_index_info->field_schema().fieldid(),
build_index_info->buildid(),
build_index_info->index_version(),
"",
build_index_info->field_schema().name(),
field_type,
build_index_info->dim(),
};
uint32_t tantivy_index_version =
milvus::index::GetValueFromConfig<int32_t>(
config, milvus::index::TANTIVY_INDEX_VERSION)
.value_or(milvus::index::TANTIVY_INDEX_LATEST_VERSION);
auto chunk_manager =
milvus::storage::CreateChunkManager(storage_config);
milvus::storage::FileManagerContext fileManagerContext(
field_meta, index_meta, chunk_manager);
auto field_schema =
FieldMeta::ParseFrom(build_index_info->field_schema());
auto index = std::make_unique<index::JsonKeyStatsInvertedIndex>(
fileManagerContext,
false,
build_index_info->json_key_stats_tantivy_memory(),
tantivy_index_version);
index->Build(config);
auto create_index_result = index->Upload(config);
create_index_result->SerializeAt(
reinterpret_cast<milvus::ProtoLayout*>(result));
auto status = CStatus();
status.error_code = Success;
status.error_msg = "";
return status;
} catch (SegcoreError& e) {
auto status = CStatus();
status.error_code = e.get_error_code();
status.error_msg = strdup(e.what());
return status;
} catch (std::exception& e) {
auto status = CStatus();
status.error_code = UnexpectedError;
status.error_msg = strdup(e.what());
return status;
}
}
CStatus
BuildTextIndex(ProtoLayoutInterface result,
const uint8_t* serialized_build_index_info,

View File

@ -36,6 +36,11 @@ CreateIndex(CIndex* res_index,
CStatus
DeleteIndex(CIndex index);
CStatus
BuildJsonKeyIndex(ProtoLayoutInterface c_binary_set,
const uint8_t* serialized_build_index_info,
const uint64_t len);
CStatus
BuildTextIndex(ProtoLayoutInterface c_binary_set,
const uint8_t* serialized_build_index_info,

View File

@ -147,6 +147,12 @@ class ChunkedColumnBase : public ColumnBase {
"GetBatchBuffer only supported for VariableColumn");
}
virtual std::string_view
RawAt(const size_t i) const {
PanicInfo(ErrorCode::Unsupported,
"RawAt only supported for VariableColumn");
}
virtual std::pair<std::vector<std::string_view>, FixedVector<bool>>
StringViews(int64_t chunk_id,
std::optional<std::pair<int64_t, int64_t>> offset_len) const {
@ -387,7 +393,7 @@ class ChunkedVariableColumn : public ChunkedColumnBase {
}
std::string_view
RawAt(const int i) const {
RawAt(const size_t i) const {
return std::string_view((*this)[i]);
}
};

View File

@ -333,6 +333,12 @@ class SingleChunkColumnBase : public ColumnBase {
"viewsbyoffsets only supported for VariableColumn");
}
virtual std::string_view
RawAt(const size_t i) const {
PanicInfo(ErrorCode::Unsupported,
"RawAt only supported for VariableColumn");
}
virtual void
AppendBatch(const FieldDataPtr data) override {
size_t required_size = data_size_ + data->DataSize();
@ -801,7 +807,7 @@ class SingleChunkVariableColumn : public SingleChunkColumnBase {
}
std::string_view
RawAt(const int i) const {
RawAt(const size_t i) const {
return std::string_view((*this)[i]);
}

View File

@ -33,10 +33,12 @@ class ExecPlanNodeVisitor : PlanNodeVisitor {
public:
ExecPlanNodeVisitor(const segcore::SegmentInterface& segment,
Timestamp timestamp,
const PlaceholderGroup& placeholder_group)
const PlaceholderGroup& placeholder_group,
int32_t consystency_level)
: segment_(segment),
timestamp_(timestamp),
placeholder_group_(placeholder_group) {
placeholder_group_(placeholder_group),
consystency_level_(consystency_level) {
}
SearchResult
@ -60,6 +62,7 @@ class ExecPlanNodeVisitor : PlanNodeVisitor {
const PlaceholderGroup& placeholder_group_;
SearchResultOpt search_result_opt_;
int32_t consystency_level_ = 0;
};
} // namespace impl
@ -80,7 +83,6 @@ ExecPlanNodeVisitor::ExecuteTask(
plan.plan_node_->ToString(),
query_context->get_active_count(),
query_context->get_query_timestamp());
auto task =
milvus::exec::Task::Create(DEFAULT_TASK_ID, plan, 0, query_context);
int64_t processed_num = 0;
@ -127,8 +129,12 @@ ExecPlanNodeVisitor::VectorVisitorImpl(VectorPlanNode& node) {
auto plan = plan::PlanFragment(node.plannodes_);
// Set query context
auto query_context = std::make_shared<milvus::exec::QueryContext>(
DEAFULT_QUERY_ID, segment, active_count, timestamp_);
auto query_context =
std::make_shared<milvus::exec::QueryContext>(DEAFULT_QUERY_ID,
segment,
active_count,
timestamp_,
consystency_level_);
query_context->set_search_info(node.search_info_);
query_context->set_placeholder_group(placeholder_group_);
@ -178,8 +184,12 @@ ExecPlanNodeVisitor::visit(RetrievePlanNode& node) {
auto plan = plan::PlanFragment(node.plannodes_);
// Set query context
auto query_context = std::make_shared<milvus::exec::QueryContext>(
DEAFULT_QUERY_ID, segment, active_count, timestamp_);
auto query_context =
std::make_shared<milvus::exec::QueryContext>(DEAFULT_QUERY_ID,
segment,
active_count,
timestamp_,
consystency_level_);
// Do task execution
auto bitset_holder = ExecuteTask(plan, query_context);

View File

@ -46,15 +46,20 @@ class ExecPlanNodeVisitor : public PlanNodeVisitor {
public:
ExecPlanNodeVisitor(const segcore::SegmentInterface& segment,
Timestamp timestamp,
const PlaceholderGroup* placeholder_group)
const PlaceholderGroup* placeholder_group,
int32_t consystency_level = 0)
: segment_(segment),
timestamp_(timestamp),
placeholder_group_(placeholder_group) {
placeholder_group_(placeholder_group),
consystency_level_(consystency_level) {
}
ExecPlanNodeVisitor(const segcore::SegmentInterface& segment,
Timestamp timestamp)
: segment_(segment), timestamp_(timestamp) {
Timestamp timestamp,
int32_t consystency_level = 0)
: segment_(segment),
timestamp_(timestamp),
consystency_level_(consystency_level) {
placeholder_group_ = nullptr;
}
@ -108,6 +113,7 @@ class ExecPlanNodeVisitor : public PlanNodeVisitor {
SearchResultOpt search_result_opt_;
RetrieveResultOpt retrieve_result_opt_;
bool expr_use_pk_index_ = false;
int32_t consystency_level_ = 0;
};
// for test use only

View File

@ -97,6 +97,31 @@ class ChunkedSegmentSealedImpl : public SegmentSealed {
void
LoadTextIndex(FieldId field_id,
std::unique_ptr<index::TextMatchIndex> index) override;
void
LoadJsonKeyIndex(
FieldId field_id,
std::unique_ptr<index::JsonKeyStatsInvertedIndex> index) override {
std::unique_lock lck(mutex_);
const auto& field_meta = schema_->operator[](field_id);
json_key_indexes_[field_id] = std::move(index);
}
index::JsonKeyStatsInvertedIndex*
GetJsonKeyIndex(FieldId field_id) const override {
std::shared_lock lck(mutex_);
auto iter = json_key_indexes_.find(field_id);
if (iter == json_key_indexes_.end()) {
return nullptr;
}
return iter->second.get();
}
std::pair<std::string_view, bool>
GetJsonData(FieldId field_id, size_t offset) const override {
auto column = fields_.at(field_id);
bool is_valid = column->IsValid(offset);
return std::make_pair(std::move(column->RawAt(offset)), is_valid);
}
public:
size_t
@ -406,6 +431,10 @@ class ChunkedSegmentSealedImpl : public SegmentSealed {
// whether the segment is sorted by the pk
bool is_sorted_by_pk_ = false;
// used for json expr optimization
std::unordered_map<FieldId,
std::unique_ptr<index::JsonKeyStatsInvertedIndex>>
json_key_indexes_;
};
} // namespace milvus::segcore

View File

@ -24,6 +24,7 @@
#include "common/EasyAssert.h"
#include "common/FieldData.h"
#include "common/Types.h"
#include "common/Common.h"
#include "fmt/format.h"
#include "log/Log.h"
#include "nlohmann/json.hpp"
@ -170,6 +171,33 @@ SegmentGrowingImpl::Insert(int64_t reserved_offset,
reserved_offset);
}
// build json key stats index for the growing segment on this insert batch.
if (field_meta.enable_growing_jsonStats()) {
std::vector<std::string> jsonDatas(
insert_record_proto->fields_data(data_offset)
.scalars()
.json_data()
.data()
.begin(),
insert_record_proto->fields_data(data_offset)
.scalars()
.json_data()
.data()
.end());
FixedVector<bool> jsonDatas_valid_data(
insert_record_proto->fields_data(data_offset)
.valid_data()
.begin(),
insert_record_proto->fields_data(data_offset)
.valid_data()
.end());
AddJSONDatas(field_id,
jsonDatas.data(),
jsonDatas_valid_data.data(),
num_rows,
reserved_offset);
}
// update average row data size
auto field_data_size = GetRawDataSizeOfDataArray(
&insert_record_proto->fields_data(data_offset),
@ -318,6 +346,15 @@ SegmentGrowingImpl::LoadFieldData(const LoadFieldDataInfo& infos) {
index->Reload();
}
// build json key stats index from the loaded field data
if (field_meta.enable_growing_jsonStats()) {
auto index = GetJsonKeyIndex(field_id);
index->BuildWithFieldData(field_data, field_meta.is_nullable());
index->Commit();
// Reload reader so that the index can be read immediately
index->Reload();
}
// update the mem size
stats_.mem_size += storage::GetByteSizeOfFieldDatas(field_data);
@ -939,4 +976,56 @@ SegmentGrowingImpl::AddTexts(milvus::FieldId field_id,
iter->second->AddTexts(n, texts, texts_valid_data, offset_begin);
}
void
SegmentGrowingImpl::AddJSONDatas(FieldId field_id,
const std::string* jsondatas,
const bool* jsondatas_valid_data,
size_t n,
int64_t offset_begin) {
std::unique_lock lock(mutex_);
auto iter = json_indexes_.find(field_id);
AssertInfo(iter != json_indexes_.end(), "json index not found");
iter->second->AddJSONDatas(
n, jsondatas, jsondatas_valid_data, offset_begin);
}
void
SegmentGrowingImpl::CreateJSONIndexes() {
for (auto [field_id, field_meta] : schema_->get_fields()) {
if (field_meta.enable_growing_jsonStats()) {
CreateJSONIndex(FieldId(field_id));
}
}
}
void
SegmentGrowingImpl::CreateJSONIndex(FieldId field_id) {
std::unique_lock lock(mutex_);
const auto& field_meta = schema_->operator[](field_id);
AssertInfo(IsJsonDataType(field_meta.get_data_type()),
"cannot create json index on non-json type");
std::string unique_id = GetUniqueFieldId(field_meta.get_id().get());
auto index = std::make_unique<index::JsonKeyStatsInvertedIndex>(
JSON_KEY_STATS_COMMIT_INTERVAL, unique_id.c_str());
index->Commit();
index->CreateReader();
json_indexes_[field_id] = std::move(index);
}
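A usage sketch of this growing-segment flow end to end, built from the constructors and methods introduced above; the 200 ms interval, the unique id and the /a filter are illustrative only:
#include <memory>
#include <string>
#include <vector>
#include "index/JsonKeyStatsInvertedIndex.h"
void
GrowingJsonKeyStatsExample() {
    auto index = std::make_unique<milvus::index::JsonKeyStatsInvertedIndex>(
        /*commit_interval_in_ms=*/200, /*unique_id=*/"field-101-example");
    index->Commit();        // create the first (empty) commit
    index->CreateReader();  // so queries can open a reader immediately
    std::vector<std::string> rows = {R"({"a": 1})", R"({"a": 2})"};
    index->AddJSONDatas(rows.size(), rows.data(), /*valids=*/nullptr, /*offset_begin=*/0);
    // A strongly consistent read forces commit + reload inside FilterByPath,
    // so both freshly inserted rows are visible to the filter.
    auto hits = index
                    ->FilterByPath("/a", /*row=*/2, /*is_growing=*/true,
                                   /*is_strong_consistency=*/true,
                                   [](bool valid, uint8_t, uint32_t, uint16_t,
                                      uint16_t, int32_t value) {
                                       return valid && value >= 1;
                                   })
                    .clone();
    (void)hits;  // rows 0 and 1 end up set in the bitmap
}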
std::pair<std::string_view, bool>
SegmentGrowingImpl::GetJsonData(FieldId field_id, size_t offset) const {
auto vec_ptr = dynamic_cast<const ConcurrentVector<Json>*>(
insert_record_.get_data_base(field_id));
auto& src = *vec_ptr;
auto& field_meta = schema_->operator[](field_id);
if (field_meta.is_nullable()) {
auto valid_data_ptr = insert_record_.get_valid_data(field_id);
return std::make_pair(std::string_view(src[offset]),
valid_data_ptr->is_valid(offset));
}
return std::make_pair(std::string_view(src[offset]), true);
}
} // namespace milvus::segcore

View File

@ -226,6 +226,9 @@ class SegmentGrowingImpl : public SegmentGrowing {
int64_t count,
const std::vector<std::string>& dynamic_field_names) const override;
virtual std::pair<std::string_view, bool>
GetJsonData(FieldId field_id, size_t offset) const override;
public:
friend std::unique_ptr<SegmentGrowing>
CreateGrowingSegment(SchemaPtr schema,
@ -264,6 +267,7 @@ class SegmentGrowingImpl : public SegmentGrowing {
mcm->Register(mmap_descriptor_);
}
this->CreateTextIndexes();
this->CreateJSONIndexes();
}
~SegmentGrowingImpl() {
@ -414,6 +418,19 @@ class SegmentGrowingImpl : public SegmentGrowing {
void
CreateTextIndexes();
void
AddJSONDatas(FieldId field_id,
const std::string* jsondatas,
const bool* jsondatas_valid_data,
size_t n,
int64_t offset_begin);
void
CreateJSONIndexes();
void
CreateJSONIndex(FieldId field_id);
private:
storage::MmapChunkDescriptorPtr mmap_descriptor_ = nullptr;
SegcoreConfig segcore_config_;

View File

@ -83,11 +83,13 @@ std::unique_ptr<SearchResult>
SegmentInternalInterface::Search(
const query::Plan* plan,
const query::PlaceholderGroup* placeholder_group,
Timestamp timestamp) const {
Timestamp timestamp,
int32_t consistency_level) const {
std::shared_lock lck(mutex_);
milvus::tracer::AddEvent("obtained_segment_lock_mutex");
check_search(plan);
query::ExecPlanNodeVisitor visitor(*this, timestamp, placeholder_group);
query::ExecPlanNodeVisitor visitor(
*this, timestamp, placeholder_group, consistency_level);
auto results = std::make_unique<SearchResult>();
*results = visitor.get_moved_result(*plan->plan_node_);
results->segment_ = (void*)this;
@ -99,11 +101,12 @@ SegmentInternalInterface::Retrieve(tracer::TraceContext* trace_ctx,
const query::RetrievePlan* plan,
Timestamp timestamp,
int64_t limit_size,
bool ignore_non_pk) const {
bool ignore_non_pk,
int32_t consistency_level) const {
std::shared_lock lck(mutex_);
tracer::AutoSpan span("Retrieve", tracer::GetRootSpan());
auto results = std::make_unique<proto::segcore::RetrieveResults>();
query::ExecPlanNodeVisitor visitor(*this, timestamp);
query::ExecPlanNodeVisitor visitor(*this, timestamp, consistency_level);
auto retrieve_results = visitor.get_retrieve_result(*plan->plan_node_);
retrieve_results.segment_ = (void*)this;
results->set_has_more_result(retrieve_results.has_more_result);
@ -292,7 +295,8 @@ SegmentInternalInterface::get_real_count() const {
milvus::plan::GetNextPlanNodeId(), sources);
plan->plan_node_->plannodes_ = plannode;
plan->plan_node_->is_count_ = true;
auto res = Retrieve(nullptr, plan.get(), MAX_TIMESTAMP, INT64_MAX, false);
auto res =
Retrieve(nullptr, plan.get(), MAX_TIMESTAMP, INT64_MAX, false, 0);
AssertInfo(res->fields_data().size() == 1,
"count result should only have one column");
AssertInfo(res->fields_data()[0].has_scalars(),
@ -528,4 +532,13 @@ SegmentInternalInterface::bulk_subscript_not_exist_field(
return result;
}
index::JsonKeyStatsInvertedIndex*
SegmentInternalInterface::GetJsonKeyIndex(FieldId field_id) const {
std::shared_lock lock(mutex_);
auto iter = json_indexes_.find(field_id);
if (iter == json_indexes_.end()) {
return nullptr;
}
return iter->second.get();
}
} // namespace milvus::segcore

View File

@ -38,6 +38,7 @@
#include "index/SkipIndex.h"
#include "mmap/Column.h"
#include "index/TextMatchIndex.h"
#include "index/JsonKeyStatsInvertedIndex.h"
namespace milvus::segcore {
@ -64,14 +65,16 @@ class SegmentInterface {
virtual std::unique_ptr<SearchResult>
Search(const query::Plan* Plan,
const query::PlaceholderGroup* placeholder_group,
Timestamp timestamp) const = 0;
Timestamp timestamp,
int32_t consistency_level = 0) const = 0;
virtual std::unique_ptr<proto::segcore::RetrieveResults>
Retrieve(tracer::TraceContext* trace_ctx,
const query::RetrievePlan* Plan,
Timestamp timestamp,
int64_t limit_size,
bool ignore_non_pk) const = 0;
bool ignore_non_pk,
int32_t consistency_level = 0) const = 0;
virtual std::unique_ptr<proto::segcore::RetrieveResults>
Retrieve(tracer::TraceContext* trace_ctx,
@ -139,6 +142,11 @@ class SegmentInterface {
GetJsonIndex(FieldId field_id, std::string path) const {
return nullptr;
}
virtual index::JsonKeyStatsInvertedIndex*
GetJsonKeyIndex(FieldId field_id) const = 0;
virtual std::pair<std::string_view, bool>
GetJsonData(FieldId field_id, size_t offset) const = 0;
};
// internal API for DSL calculation
@ -247,7 +255,8 @@ class SegmentInternalInterface : public SegmentInterface {
std::unique_ptr<SearchResult>
Search(const query::Plan* Plan,
const query::PlaceholderGroup* placeholder_group,
Timestamp timestamp) const override;
Timestamp timestamp,
int32_t consistency_level = 0) const override;
void
FillPrimaryKeys(const query::Plan* plan,
@ -262,7 +271,8 @@ class SegmentInternalInterface : public SegmentInterface {
const query::RetrievePlan* Plan,
Timestamp timestamp,
int64_t limit_size,
bool ignore_non_pk) const override;
bool ignore_non_pk,
int32_t consistency_level = 0) const override;
std::unique_ptr<proto::segcore::RetrieveResults>
Retrieve(tracer::TraceContext* trace_ctx,
@ -325,6 +335,9 @@ class SegmentInternalInterface : public SegmentInterface {
index::TextMatchIndex*
GetTextIndex(FieldId field_id) const override;
virtual index::JsonKeyStatsInvertedIndex*
GetJsonKeyIndex(FieldId field_id) const override;
public:
virtual void
vector_search(SearchInfo& search_info,
@ -519,6 +532,10 @@ class SegmentInternalInterface : public SegmentInterface {
// text-indexes used to do match.
std::unordered_map<FieldId, std::unique_ptr<index::TextMatchIndex>>
text_indexes_;
std::unordered_map<FieldId,
std::unique_ptr<index::JsonKeyStatsInvertedIndex>>
json_indexes_;
};
} // namespace milvus::segcore

View File

@ -69,6 +69,17 @@ class SegmentSealed : public SegmentInternalInterface {
return index->second.get();
}
virtual void
LoadJsonKeyIndex(
FieldId field_id,
std::unique_ptr<index::JsonKeyStatsInvertedIndex> index) = 0;
virtual index::JsonKeyStatsInvertedIndex*
GetJsonKeyIndex(FieldId field_id) const = 0;
virtual std::pair<std::string_view, bool>
GetJsonData(FieldId field_id, size_t offset) const = 0;
SegmentType
type() const override {
return SegmentType::Sealed;

View File

@ -2147,4 +2147,29 @@ SegmentSealedImpl::LoadTextIndex(FieldId field_id,
text_indexes_[field_id] = std::move(index);
}
void
SegmentSealedImpl::LoadJsonKeyIndex(
FieldId field_id, std::unique_ptr<index::JsonKeyStatsInvertedIndex> index) {
std::unique_lock lck(mutex_);
const auto& field_meta = schema_->operator[](field_id);
json_key_indexes_[field_id] = std::move(index);
}
index::JsonKeyStatsInvertedIndex*
SegmentSealedImpl::GetJsonKeyIndex(FieldId field_id) const {
std::shared_lock lck(mutex_);
auto iter = json_key_indexes_.find(field_id);
if (iter == json_key_indexes_.end()) {
return nullptr;
}
return iter->second.get();
}
std::pair<std::string_view, bool>
SegmentSealedImpl::GetJsonData(FieldId field_id, size_t offset) const {
auto column = fields_.at(field_id);
bool is_valid = column->IsValid(offset);
return std::make_pair(std::move(column->RawAt(offset)), is_valid);
}
} // namespace milvus::segcore

View File

@ -36,6 +36,7 @@
#include "common/Types.h"
#include "common/IndexMeta.h"
#include "index/TextMatchIndex.h"
#include "index/JsonKeyStatsInvertedIndex.h"
namespace milvus::segcore {
@ -100,6 +101,17 @@ class SegmentSealedImpl : public SegmentSealed {
LoadTextIndex(FieldId field_id,
std::unique_ptr<index::TextMatchIndex> index) override;
void
LoadJsonKeyIndex(
FieldId field_id,
std::unique_ptr<index::JsonKeyStatsInvertedIndex> index) override;
index::JsonKeyStatsInvertedIndex*
GetJsonKeyIndex(FieldId field_id) const override;
std::pair<std::string_view, bool>
GetJsonData(FieldId field_id, size_t offset) const override;
public:
size_t
GetMemoryUsageInBytes() const override {
@ -412,6 +424,11 @@ class SegmentSealedImpl : public SegmentSealed {
// whether the segment is sorted by the pk
bool is_sorted_by_pk_ = false;
// used for json expr optimization
std::unordered_map<FieldId,
std::unique_ptr<index::JsonKeyStatsInvertedIndex>>
json_key_indexes_;
};
inline SegmentSealedUPtr

View File

@ -111,7 +111,8 @@ AsyncSearch(CTraceContext c_trace,
CSegmentInterface c_segment,
CSearchPlan c_plan,
CPlaceholderGroup c_placeholder_group,
uint64_t timestamp) {
uint64_t timestamp,
int32_t consistency_level) {
auto segment = (milvus::segcore::SegmentInterface*)c_segment;
auto plan = (milvus::query::Plan*)c_plan;
auto phg_ptr = reinterpret_cast<const milvus::query::PlaceholderGroup*>(
@ -120,7 +121,7 @@ AsyncSearch(CTraceContext c_trace,
auto future = milvus::futures::Future<milvus::SearchResult>::async(
milvus::futures::getGlobalCPUExecutor(),
milvus::futures::ExecutePriority::HIGH,
[c_trace, segment, plan, phg_ptr, timestamp](
[c_trace, segment, plan, phg_ptr, timestamp, consistency_level](
milvus::futures::CancellationToken cancel_token) {
// save trace context into search_info
auto& trace_ctx = plan->plan_node_->search_info_.trace_ctx_;
@ -131,7 +132,8 @@ AsyncSearch(CTraceContext c_trace,
auto span = milvus::tracer::StartSpan("SegCoreSearch", &trace_ctx);
milvus::tracer::SetRootSpan(span);
auto search_result = segment->Search(plan, phg_ptr, timestamp);
auto search_result =
segment->Search(plan, phg_ptr, timestamp, consistency_level);
if (!milvus::PositivelyRelated(
plan->plan_node_->search_info_.metric_type_)) {
for (auto& dis : search_result->distances_) {
@ -179,21 +181,31 @@ AsyncRetrieve(CTraceContext c_trace,
CRetrievePlan c_plan,
uint64_t timestamp,
int64_t limit_size,
bool ignore_non_pk) {
bool ignore_non_pk,
int32_t consistency_level) {
auto segment = static_cast<milvus::segcore::SegmentInterface*>(c_segment);
auto plan = static_cast<const milvus::query::RetrievePlan*>(c_plan);
auto future = milvus::futures::Future<CRetrieveResult>::async(
milvus::futures::getGlobalCPUExecutor(),
milvus::futures::ExecutePriority::HIGH,
[c_trace, segment, plan, timestamp, limit_size, ignore_non_pk](
milvus::futures::CancellationToken cancel_token) {
[c_trace,
segment,
plan,
timestamp,
limit_size,
ignore_non_pk,
consistency_level](milvus::futures::CancellationToken cancel_token) {
auto trace_ctx = milvus::tracer::TraceContext{
c_trace.traceID, c_trace.spanID, c_trace.traceFlags};
milvus::tracer::AutoSpan span("SegCoreRetrieve", &trace_ctx, true);
auto retrieve_result = segment->Retrieve(
&trace_ctx, plan, timestamp, limit_size, ignore_non_pk);
auto retrieve_result = segment->Retrieve(&trace_ctx,
plan,
timestamp,
limit_size,
ignore_non_pk,
consistency_level);
return CreateLeakedCRetrieveResultFromProto(
std::move(retrieve_result));
@ -479,6 +491,60 @@ LoadTextIndex(CSegmentInterface c_segment,
}
}
CStatus
LoadJsonKeyIndex(CTraceContext c_trace,
CSegmentInterface c_segment,
const uint8_t* serialized_load_json_key_index_info,
const uint64_t len) {
try {
auto ctx = milvus::tracer::TraceContext{
c_trace.traceID, c_trace.spanID, c_trace.traceFlags};
auto segment_interface =
reinterpret_cast<milvus::segcore::SegmentInterface*>(c_segment);
auto segment =
dynamic_cast<milvus::segcore::SegmentSealed*>(segment_interface);
AssertInfo(segment != nullptr, "segment conversion failed");
auto info_proto =
std::make_unique<milvus::proto::indexcgo::LoadJsonKeyIndexInfo>();
info_proto->ParseFromArray(serialized_load_json_key_index_info, len);
milvus::storage::FieldDataMeta field_meta{info_proto->collectionid(),
info_proto->partitionid(),
segment->get_segment_id(),
info_proto->fieldid(),
info_proto->schema()};
milvus::storage::IndexMeta index_meta{segment->get_segment_id(),
info_proto->fieldid(),
info_proto->buildid(),
info_proto->version()};
auto remote_chunk_manager =
milvus::storage::RemoteChunkManagerSingleton::GetInstance()
.GetRemoteChunkManager();
milvus::Config config;
std::vector<std::string> files;
for (const auto& f : info_proto->files()) {
files.push_back(f);
}
config["index_files"] = files;
milvus::storage::FileManagerContext file_ctx(
field_meta, index_meta, remote_chunk_manager);
auto index = std::make_unique<milvus::index::JsonKeyStatsInvertedIndex>(
file_ctx, true);
index->Load(ctx, config);
segment->LoadJsonKeyIndex(milvus::FieldId(info_proto->fieldid()),
std::move(index));
return milvus::SuccessCStatus();
} catch (std::exception& e) {
return milvus::FailureCStatus(&e);
}
}
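A minimal caller sketch for the new C API follows, for illustration only. The protobuf setter names are inferred from the getters used above (generated code typically mirrors them), the include paths are assumptions, the ids are placeholders, and the schema field is left at its default for brevity.

// Hypothetical caller of LoadJsonKeyIndex; not part of this change.
#include <string>
#include <vector>

#include "segcore/segment_c.h"    // assumed location of the C API declarations
#include "pb/index_cgo_msg.pb.h"  // assumed location of LoadJsonKeyIndexInfo

CStatus
LoadJsonKeyIndexForSegment(CSegmentInterface segment,
                           const std::vector<std::string>& index_files) {
    milvus::proto::indexcgo::LoadJsonKeyIndexInfo info;
    info.set_collectionid(1);  // placeholder ids
    info.set_partitionid(2);
    info.set_fieldid(101);
    info.set_buildid(1000);
    info.set_version(1);
    for (const auto& f : index_files) {
        info.add_files(f);
    }
    std::string payload;
    info.SerializeToString(&payload);
    CTraceContext trace{};  // empty trace context
    return LoadJsonKeyIndex(trace,
                            segment,
                            reinterpret_cast<const uint8_t*>(payload.data()),
                            payload.size());
}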
CStatus
UpdateFieldRawDataSize(CSegmentInterface c_segment,
int64_t field_id,

View File

@ -50,7 +50,8 @@ AsyncSearch(CTraceContext c_trace,
CSegmentInterface c_segment,
CSearchPlan c_plan,
CPlaceholderGroup c_placeholder_group,
uint64_t timestamp);
uint64_t timestamp,
int32_t consistency_level);
void
DeleteRetrieveResult(CRetrieveResult* retrieve_result);
@ -61,7 +62,8 @@ AsyncRetrieve(CTraceContext c_trace,
CRetrievePlan c_plan,
uint64_t timestamp,
int64_t limit_size,
bool ignore_non_pk);
bool ignore_non_pk,
int32_t consistency_level);
CFuture* // Future<CRetrieveResult>
AsyncRetrieveByOffsets(CTraceContext c_trace,
@ -122,6 +124,12 @@ LoadTextIndex(CSegmentInterface c_segment,
const uint8_t* serialized_load_text_index_info,
const uint64_t len);
CStatus
LoadJsonKeyIndex(CTraceContext c_trace,
CSegmentInterface c_segment,
const uint8_t* serialized_load_json_key_index_info,
const uint64_t len);
CStatus
UpdateFieldRawDataSize(CSegmentInterface c_segment,
int64_t field_id,

View File

@ -79,8 +79,18 @@ DiskFileManagerImpl::GetRemoteTextLogPath(const std::string& file_name,
return remote_prefix + "/" + file_name + "_" + std::to_string(slice_num);
}
std::string
DiskFileManagerImpl::GetRemoteJsonKeyIndexPath(const std::string& file_name,
int64_t slice_num) {
auto remote_prefix = GetRemoteJsonKeyLogPrefix();
return remote_prefix + "/" + file_name + "_" + std::to_string(slice_num);
}
bool
DiskFileManagerImpl::AddFile(const std::string& file) noexcept {
DiskFileManagerImpl::AddFileInternal(
const std::string& file,
const std::function<std::string(const std::string&, int)>&
get_remote_path) noexcept {
auto local_chunk_manager =
LocalChunkManagerSingleton::GetInstance().GetChunkManager();
FILEMANAGER_TRY
@ -116,8 +126,7 @@ DiskFileManagerImpl::AddFile(const std::string& file) noexcept {
}
auto batch_size = std::min(FILE_SLICE_SIZE, int64_t(fileSize) - offset);
batch_remote_files.emplace_back(
GetRemoteIndexPath(fileName, slice_num));
batch_remote_files.emplace_back(get_remote_path(fileName, slice_num));
remote_file_sizes.emplace_back(batch_size);
local_file_offsets.emplace_back(offset);
offset += batch_size;
@ -132,58 +141,29 @@ DiskFileManagerImpl::AddFile(const std::string& file) noexcept {
return true;
} // namespace knowhere
bool
DiskFileManagerImpl::AddFile(const std::string& file) noexcept {
return AddFileInternal(file,
[this](const std::string& file_name, int slice_num) {
return GetRemoteIndexPath(file_name, slice_num);
});
}
bool
DiskFileManagerImpl::AddJsonKeyIndexLog(const std::string& file) noexcept {
return AddFileInternal(
file, [this](const std::string& file_name, int slice_num) {
return GetRemoteJsonKeyIndexPath(file_name, slice_num);
});
}
bool
DiskFileManagerImpl::AddTextLog(const std::string& file) noexcept {
auto local_chunk_manager =
LocalChunkManagerSingleton::GetInstance().GetChunkManager();
FILEMANAGER_TRY
if (!local_chunk_manager->Exist(file)) {
LOG_ERROR("local file {} not exists", file);
return false;
}
// record local file path
local_paths_.emplace_back(file);
auto fileName = GetFileName(file);
auto fileSize = local_chunk_manager->Size(file);
added_total_file_size_ += fileSize;
std::vector<std::string> batch_remote_files;
std::vector<int64_t> remote_file_sizes;
std::vector<int64_t> local_file_offsets;
int slice_num = 0;
auto parallel_degree =
uint64_t(DEFAULT_FIELD_MAX_MEMORY_LIMIT / FILE_SLICE_SIZE);
for (int64_t offset = 0; offset < fileSize; slice_num++) {
if (batch_remote_files.size() >= parallel_degree) {
AddBatchIndexFiles(file,
local_file_offsets,
batch_remote_files,
remote_file_sizes);
batch_remote_files.clear();
remote_file_sizes.clear();
local_file_offsets.clear();
}
auto batch_size = std::min(FILE_SLICE_SIZE, int64_t(fileSize) - offset);
batch_remote_files.emplace_back(
GetRemoteTextLogPath(fileName, slice_num));
remote_file_sizes.emplace_back(batch_size);
local_file_offsets.emplace_back(offset);
offset += batch_size;
}
if (batch_remote_files.size() > 0) {
AddBatchIndexFiles(
file, local_file_offsets, batch_remote_files, remote_file_sizes);
}
FILEMANAGER_CATCH
FILEMANAGER_END
return true;
} // namespace knowhere
return AddFileInternal(
file, [this](const std::string& file_name, int slice_num) {
return GetRemoteTextLogPath(file_name, slice_num);
});
}
void
DiskFileManagerImpl::AddBatchIndexFiles(
@ -238,8 +218,9 @@ DiskFileManagerImpl::AddBatchIndexFiles(
}
void
DiskFileManagerImpl::CacheIndexToDisk(
const std::vector<std::string>& remote_files) {
DiskFileManagerImpl::CacheIndexToDiskInternal(
const std::vector<std::string>& remote_files,
const std::function<std::string()>& get_local_index_prefix) noexcept {
auto local_chunk_manager =
LocalChunkManagerSingleton::GetInstance().GetChunkManager();
@ -265,7 +246,7 @@ DiskFileManagerImpl::CacheIndexToDisk(
for (auto& slices : index_slices) {
auto prefix = slices.first;
auto local_index_file_name =
GetLocalIndexObjectPrefix() +
get_local_index_prefix() +
prefix.substr(prefix.find_last_of('/') + 1);
local_chunk_manager->CreateFile(local_index_file_name);
auto file =
@ -305,58 +286,25 @@ DiskFileManagerImpl::CacheIndexToDisk(
}
}
void
DiskFileManagerImpl::CacheIndexToDisk(
const std::vector<std::string>& remote_files) {
return CacheIndexToDiskInternal(
remote_files, [this]() { return GetLocalIndexObjectPrefix(); });
}
void
DiskFileManagerImpl::CacheTextLogToDisk(
const std::vector<std::string>& remote_files) {
auto local_chunk_manager =
LocalChunkManagerSingleton::GetInstance().GetChunkManager();
return CacheIndexToDiskInternal(
remote_files, [this]() { return GetLocalTextIndexPrefix(); });
}
std::map<std::string, std::vector<int>> index_slices;
for (auto& file_path : remote_files) {
auto pos = file_path.find_last_of("_");
AssertInfo(pos > 0, "invalided index file path:{}", file_path);
try {
auto idx = std::stoi(file_path.substr(pos + 1));
index_slices[file_path.substr(0, pos)].emplace_back(idx);
} catch (const std::logic_error& e) {
auto err_message = fmt::format(
"invalided text log path:{}, error:{}", file_path, e.what());
LOG_ERROR(err_message);
throw std::logic_error(err_message);
}
}
for (auto& slices : index_slices) {
std::sort(slices.second.begin(), slices.second.end());
}
for (auto& slices : index_slices) {
auto prefix = slices.first;
auto local_index_file_name =
GetLocalTextIndexPrefix() + "/" +
prefix.substr(prefix.find_last_of('/') + 1);
local_chunk_manager->CreateFile(local_index_file_name);
auto file =
File::Open(local_index_file_name, O_CREAT | O_RDWR | O_TRUNC);
// Get the remote files
std::vector<std::string> batch_remote_files;
batch_remote_files.reserve(slices.second.size());
for (int& iter : slices.second) {
auto origin_file = prefix + "_" + std::to_string(iter);
batch_remote_files.push_back(origin_file);
}
auto index_chunks = GetObjectData(rcm_.get(), batch_remote_files);
for (auto& chunk : index_chunks) {
auto index_data = chunk.get()->GetFieldData();
auto index_size = index_data->Size();
auto chunk_data = reinterpret_cast<uint8_t*>(
const_cast<void*>(index_data->Data()));
file.Write(chunk_data, index_size);
}
local_paths_.emplace_back(local_index_file_name);
}
void
DiskFileManagerImpl::CacheJsonKeyIndexToDisk(
const std::vector<std::string>& remote_files) {
return CacheIndexToDiskInternal(
remote_files, [this]() { return GetLocalJsonKeyIndexPrefix(); });
}
template <typename DataType>
@ -649,6 +597,12 @@ DiskFileManagerImpl::GetFileName(const std::string& localfile) {
return localPath.filename().string();
}
std::string
DiskFileManagerImpl::GetIndexIdentifier() {
return GenIndexPathIdentifier(index_meta_.build_id,
index_meta_.index_version);
}
std::string
DiskFileManagerImpl::GetLocalIndexObjectPrefix() {
auto local_chunk_manager =
@ -657,6 +611,14 @@ DiskFileManagerImpl::GetLocalIndexObjectPrefix() {
local_chunk_manager, index_meta_.build_id, index_meta_.index_version);
}
std::string
DiskFileManagerImpl::GetTextIndexIdentifier() {
return std::to_string(index_meta_.build_id) + "/" +
std::to_string(index_meta_.index_version) + "/" +
std::to_string(field_meta_.segment_id) + "/" +
std::to_string(field_meta_.field_id);
}
std::string
DiskFileManagerImpl::GetLocalTextIndexPrefix() {
auto local_chunk_manager =
@ -669,17 +631,37 @@ DiskFileManagerImpl::GetLocalTextIndexPrefix() {
}
std::string
DiskFileManagerImpl::GetIndexIdentifier() {
return GenIndexPathIdentifier(index_meta_.build_id,
index_meta_.index_version);
DiskFileManagerImpl::GetJsonKeyIndexIdentifier() {
return GenJsonKeyIndexPathIdentifier(index_meta_.build_id,
index_meta_.index_version,
field_meta_.collection_id,
field_meta_.partition_id,
field_meta_.segment_id,
field_meta_.field_id);
}
std::string
DiskFileManagerImpl::GetTextIndexIdentifier() {
return std::to_string(index_meta_.build_id) + "/" +
std::to_string(index_meta_.index_version) + "/" +
std::to_string(field_meta_.segment_id) +
std::to_string(field_meta_.field_id);
DiskFileManagerImpl::GetLocalJsonKeyIndexPrefix() {
auto local_chunk_manager =
LocalChunkManagerSingleton::GetInstance().GetChunkManager();
return GenJsonKeyIndexPathPrefix(local_chunk_manager,
index_meta_.build_id,
index_meta_.index_version,
field_meta_.collection_id,
field_meta_.partition_id,
field_meta_.segment_id,
field_meta_.field_id);
}
std::string
DiskFileManagerImpl::GetRemoteJsonKeyLogPrefix() {
return GenJsonKeyIndexPathPrefix(rcm_,
index_meta_.build_id,
index_meta_.index_version,
field_meta_.collection_id,
field_meta_.partition_id,
field_meta_.segment_id,
field_meta_.field_id);
}
std::string

View File

@ -51,28 +51,43 @@ class DiskFileManagerImpl : public FileManagerImpl {
bool
AddTextLog(const std::string& filename) noexcept;
bool
AddJsonKeyIndexLog(const std::string& filename) noexcept;
public:
std::string
GetName() const override {
return "DiskFileManagerImpl";
}
std::string
GetLocalIndexObjectPrefix();
// Similar to GetTextIndexIdentifier, segment_id and field_id is also required.
std::string
GetLocalTextIndexPrefix();
std::string
GetIndexIdentifier();
std::string
GetLocalIndexObjectPrefix();
// Different from a user index, a text index task may have multiple text fields sharing the same build_id/task_id. So
// segment_id and field_id are required to identify a unique text index, in case we later support multiple index tasks
// in the same indexnode at the same time.
std::string
GetTextIndexIdentifier();
// Similar to GetTextIndexIdentifier, segment_id and field_id are also required.
std::string
GetLocalTextIndexPrefix();
// Used when building the index: this identifier is used to construct the temporary building-index directory.
std::string
GetJsonKeyIndexIdentifier();
// Used when loading the index: this prefix is the local directory where the index is stored.
std::string
GetLocalJsonKeyIndexPrefix();
// Used when uploading the index to remote storage: this prefix is the remote storage directory.
std::string
GetRemoteJsonKeyLogPrefix();
std::string
GetLocalRawDataObjectPrefix();
@ -92,6 +107,9 @@ class DiskFileManagerImpl : public FileManagerImpl {
void
CacheTextLogToDisk(const std::vector<std::string>& remote_files);
void
CacheJsonKeyIndexToDisk(const std::vector<std::string>& remote_files);
void
AddBatchIndexFiles(const std::string& local_file_name,
const std::vector<int64_t>& local_file_offsets,
@ -115,21 +133,34 @@ class DiskFileManagerImpl : public FileManagerImpl {
return added_total_file_size_;
}
std::string
GetFileName(const std::string& localfile);
private:
int64_t
GetIndexBuildId() {
return index_meta_.build_id;
}
std::string
GetFileName(const std::string& localfile);
std::string
GetRemoteIndexPath(const std::string& file_name, int64_t slice_num) const;
std::string
GetRemoteTextLogPath(const std::string& file_name, int64_t slice_num) const;
std::string
GetRemoteJsonKeyIndexPath(const std::string& file_name, int64_t slice_num);
bool
AddFileInternal(const std::string& file_name,
const std::function<std::string(const std::string&, int)>&
get_remote_path) noexcept;
void
CacheIndexToDiskInternal(
const std::vector<std::string>& remote_files,
const std::function<std::string()>& get_local_index_prefix) noexcept;
private:
// local file path (abs path)
std::vector<std::string> local_paths_;

View File

@ -549,6 +549,37 @@ GenTextIndexPathPrefix(ChunkManagerPtr cm,
return (prefix / path / path1).string();
}
std::string
GenJsonKeyIndexPathIdentifier(int64_t build_id,
int64_t index_version,
int64_t collection_id,
int64_t partition_id,
int64_t segment_id,
int64_t field_id) {
return std::to_string(build_id) + "/" + std::to_string(index_version) +
"/" + std::to_string(collection_id) + "/" +
std::to_string(partition_id) + "/" + std::to_string(segment_id) +
"/" + std::to_string(field_id) + "/";
}
std::string
GenJsonKeyIndexPathPrefix(ChunkManagerPtr cm,
int64_t build_id,
int64_t index_version,
int64_t collection_id,
int64_t partition_id,
int64_t segment_id,
int64_t field_id) {
return cm->GetRootPath() + "/" + std::string(JSON_KEY_INDEX_LOG_ROOT_PATH) +
"/" +
GenJsonKeyIndexPathIdentifier(build_id,
index_version,
collection_id,
partition_id,
segment_id,
field_id);
}
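To make the resulting layout concrete, a tiny self-contained sketch follows; the root path and the JSON_KEY_INDEX_LOG_ROOT_PATH value are made-up placeholders, while the identifier order matches the function above.

// Illustrative only: "files" and "json_key_index_log" are assumed placeholder values.
#include <cassert>
#include <string>

int main() {
    // GenJsonKeyIndexPathIdentifier(build_id=1000, index_version=1,
    //                               collection_id=1, partition_id=2,
    //                               segment_id=3, field_id=101)
    const std::string identifier = "1000/1/1/2/3/101/";
    // GenJsonKeyIndexPathPrefix prepends "<root path>/<JSON_KEY_INDEX_LOG_ROOT_PATH>/"
    const std::string remote_prefix = "files/json_key_index_log/" + identifier;
    assert(remote_prefix == "files/json_key_index_log/1000/1/1/2/3/101/");
    return 0;
}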
std::string
GetIndexPathPrefixWithBuildID(ChunkManagerPtr cm, int64_t build_id) {
boost::filesystem::path prefix = cm->GetRootPath();

View File

@ -92,6 +92,23 @@ GenTextIndexPathPrefix(ChunkManagerPtr cm,
int64_t segment_id,
int64_t field_id);
std::string
GenJsonKeyIndexPathIdentifier(int64_t build_id,
int64_t index_version,
int64_t collection_id,
int64_t partition_id,
int64_t segment_id,
int64_t field_id);
std::string
GenJsonKeyIndexPathPrefix(ChunkManagerPtr cm,
int64_t build_id,
int64_t index_version,
int64_t collection_id,
int64_t partition_id,
int64_t segment_id,
int64_t field_id);
std::string
GenFieldRawDataPathPrefix(ChunkManagerPtr cm,
int64_t segment_id,

View File

@ -69,6 +69,68 @@ struct RustArrayWrapper {
}
}
};
struct RustArrayI64Wrapper {
NO_COPY_OR_ASSIGN(RustArrayI64Wrapper);
explicit RustArrayI64Wrapper(RustArrayI64&& array) {
array_.array = array.array;
array_.len = array.len;
array_.cap = array.cap;
array.array = nullptr;
array.len = 0;
array.cap = 0;
}
RustArrayI64Wrapper(RustArrayI64Wrapper&& other) noexcept {
array_.array = other.array_.array;
array_.len = other.array_.len;
array_.cap = other.array_.cap;
other.array_.array = nullptr;
other.array_.len = 0;
other.array_.cap = 0;
}
RustArrayI64Wrapper&
operator=(RustArrayI64Wrapper&& other) noexcept {
if (this != &other) {
free();
array_.array = other.array_.array;
array_.len = other.array_.len;
array_.cap = other.array_.cap;
other.array_.array = nullptr;
other.array_.len = 0;
other.array_.cap = 0;
}
return *this;
}
~RustArrayI64Wrapper() {
free();
}
void
debug() {
std::stringstream ss;
ss << "[ ";
for (int i = 0; i < array_.len; i++) {
ss << array_.array[i] << " ";
}
ss << "]";
std::cout << ss.str() << std::endl;
}
RustArrayI64 array_;
private:
void
free() {
if (array_.array != nullptr) {
free_rust_array_i64(array_);
}
}
};
struct RustResultWrapper {
NO_COPY_OR_ASSIGN(RustResultWrapper);

View File

@ -149,6 +149,8 @@ RustResult tantivy_term_query_bool(void *ptr, bool term);
RustResult tantivy_term_query_keyword(void *ptr, const char *term);
RustResult tantivy_term_query_keyword_i64(void *ptr, const char *term);
RustResult tantivy_lower_bound_range_query_keyword(void *ptr,
const char *lower_bound,
bool inclusive);
@ -180,7 +182,8 @@ RustResult tantivy_create_index(const char *field_name,
const char *path,
uint32_t tantivy_index_version,
uintptr_t num_threads,
uintptr_t overall_memory_budget_in_bytes);
uintptr_t overall_memory_budget_in_bytes,
bool in_ram);
RustResult tantivy_create_index_with_single_segment(const char *field_name,
TantivyDataType data_type,

View File

@ -120,7 +120,7 @@ macro_rules! impl_from_for_enum {
};
}
impl_from_for_enum!(Value, None => (), RustArrayI64 => RustArrayI64, RustArray => RustArray, RustArray => Vec<u32>, U32 => u32, Ptr => *mut c_void);
impl_from_for_enum!(Value, None => (), RustArrayI64 => RustArrayI64, RustArrayI64 => Vec<i64>, RustArray => RustArray, RustArray => Vec<u32>, U32 => u32, Ptr => *mut c_void);
#[repr(C)]
pub struct RustResult {
@ -202,7 +202,7 @@ macro_rules! cstr_to_str {
#[no_mangle]
pub extern "C" fn test_enum_with_array() -> RustResult {
let array = vec![1, 2, 3];
let array: Vec<u32> = vec![1, 2, 3];
RustResult::from(Result::Ok(array))
}

View File

@ -162,7 +162,7 @@ impl IndexReaderWrapper {
}
pub fn term_query_f64(&self, term: f64) -> Result<Vec<u32>> {
let q = TermQuery::new(
let q: TermQuery = TermQuery::new(
Term::from_field_f64(self.field, term),
IndexRecordOption::Basic,
);
@ -235,6 +235,14 @@ impl IndexReaderWrapper {
self.search(&q)
}
pub fn term_query_keyword_i64(&self, term: &str) -> Result<Vec<i64>> {
let q = TermQuery::new(
Term::from_field_text(self.field, term),
IndexRecordOption::Basic,
);
self.search_i64(&q)
}
pub fn lower_bound_range_query_keyword(
&self,
lower_bound: &str,

View File

@ -192,6 +192,13 @@ pub extern "C" fn tantivy_term_query_keyword(ptr: *mut c_void, term: *const c_ch
unsafe { (*real).term_query_keyword(term).into() }
}
#[no_mangle]
pub extern "C" fn tantivy_term_query_keyword_i64(ptr: *mut c_void, term: *const c_char) -> RustResult {
let real = ptr as *mut IndexReaderWrapper;
let term = cstr_to_str!(term);
unsafe { (*real).term_query_keyword_i64(term).into() }
}
#[no_mangle]
pub extern "C" fn tantivy_lower_bound_range_query_keyword(
ptr: *mut c_void,

View File

@ -11,6 +11,7 @@ impl IndexReaderWrapper {
// split the query string into multiple tokens using index's default tokenizer,
// and then execute the disjunction of the term queries.
pub(crate) fn match_query(&self, q: &str) -> Result<Vec<u32>> {
// clone the tokenizer to make `match_query` thread-safe.
let mut tokenizer = self
.index
.tokenizer_for_field(self.field)

View File

@ -29,6 +29,7 @@ impl IndexWriterWrapper {
num_threads: usize,
overall_memory_budget_in_bytes: usize,
tanviy_index_version: TantivyIndexVersion,
in_ram: bool,
) -> Result<IndexWriterWrapper> {
init_log();
match tanviy_index_version {
@ -39,6 +40,7 @@ impl IndexWriterWrapper {
path,
num_threads,
overall_memory_budget_in_bytes,
in_ram,
)?;
Ok(IndexWriterWrapper::V5(writer))
}
@ -49,12 +51,12 @@ impl IndexWriterWrapper {
path,
num_threads,
overall_memory_budget_in_bytes,
in_ram,
)?;
Ok(IndexWriterWrapper::V7(writer))
}
}
}
pub fn new_with_single_segment(
field_name: &str,
data_type: TantivyDataType,

View File

@ -28,6 +28,7 @@ pub extern "C" fn tantivy_create_index(
tantivy_index_version: u32,
num_threads: usize,
overall_memory_budget_in_bytes: usize,
in_ram: bool,
) -> RustResult {
let field_name_str = cstr_to_str!(field_name);
let path_str = cstr_to_str!(path);
@ -44,6 +45,7 @@ pub extern "C" fn tantivy_create_index(
num_threads,
overall_memory_budget_in_bytes,
tantivy_index_version,
in_ram,
) {
Ok(wrapper) => RustResult::from_ptr(create_binding(wrapper)),
Err(e) => RustResult::from_error(e.to_string()),

View File

@ -104,6 +104,7 @@ impl IndexWriterWrapperImpl {
path: String,
num_threads: usize,
overall_memory_budget_in_bytes: usize,
in_ram: bool,
) -> Result<IndexWriterWrapperImpl> {
info!(
"create index writer, field_name: {}, data_type: {:?}, tantivy_index_version 5",
@ -114,7 +115,11 @@ impl IndexWriterWrapperImpl {
// We cannot build direct connection from rows in multi-segments to milvus row data. So we have this doc_id field.
let id_field = schema_builder.add_i64_field("doc_id", FAST);
let schema = schema_builder.build();
let index = Index::create_in_dir(path.clone(), schema)?;
let index = if in_ram {
Index::create_in_ram(schema)
} else {
Index::create_in_dir(path.clone(), schema)?
};
let index_writer =
index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?;
Ok(IndexWriterWrapperImpl {

View File

@ -103,6 +103,7 @@ impl IndexWriterWrapperImpl {
path: String,
num_threads: usize,
overall_memory_budget_in_bytes: usize,
in_ram: bool,
) -> Result<IndexWriterWrapperImpl> {
info!(
"create index writer, field_name: {}, data_type: {:?}, tantivy_index_version 7",
@ -113,7 +114,11 @@ impl IndexWriterWrapperImpl {
// We cannot build direct connection from rows in multi-segments to milvus row data. So we have this doc_id field.
let id_field = schema_builder.add_i64_field("doc_id", FAST);
let schema = schema_builder.build();
let index = Index::create_in_dir(path.clone(), schema)?;
let index = if in_ram {
Index::create_in_ram(schema)
} else {
Index::create_in_dir(path.clone(), schema)?
};
let index_writer =
index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?;
Ok(IndexWriterWrapperImpl {

View File

@ -84,6 +84,7 @@ struct TantivyIndexWrapper {
const char* path,
uint32_t tantivy_index_version,
bool inverted_single_semgnent = false,
bool in_ram = false,
uintptr_t num_threads = DEFAULT_NUM_THREADS,
uintptr_t overall_memory_budget_in_bytes =
DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES) {
@ -101,7 +102,8 @@ struct TantivyIndexWrapper {
path,
tantivy_index_version,
num_threads,
overall_memory_budget_in_bytes));
overall_memory_budget_in_bytes,
in_ram));
}
AssertInfo(res.result_->success,
"failed to create index: {}",
@ -146,7 +148,6 @@ struct TantivyIndexWrapper {
writer_ = res.result_->value.ptr._0;
path_ = std::string(path);
}
// create reader.
void
create_reader() {
@ -626,6 +627,22 @@ struct TantivyIndexWrapper {
return RustArrayWrapper(std::move(res.result_->value.rust_array._0));
}
RustArrayI64Wrapper
term_query_i64(std::string term) {
auto array = [&]() {
return tantivy_term_query_keyword_i64(reader_, term.c_str());
}();
auto res = RustResultWrapper(array);
AssertInfo(res.result_->success,
"TantivyIndexWrapper.term_query_i64: {}",
res.result_->error);
AssertInfo(res.result_->value.tag == Value::Tag::RustArrayI64,
"TantivyIndexWrapper.term_query_i64: invalid result type");
return RustArrayI64Wrapper(
std::move(res.result_->value.rust_array_i64._0));
}
template <typename T>
RustArrayWrapper
lower_bound_range_query(T lower_bound, bool inclusive) {

View File

@ -96,6 +96,7 @@ set(MILVUS_TEST_FILES
test_cached_search_iterator.cpp
test_random_sample.cpp
test_json_index.cpp
test_json_key_stats_index.cpp
)
if ( INDEX_ENGINE STREQUAL "cardinal" )

View File

@ -93,7 +93,7 @@ Search_GrowingIndex(benchmark::State& state) {
Timestamp ts = 10000000;
for (auto _ : state) {
auto qr = segment->Search(search_plan.get(), ph_group.get(), ts);
auto qr = segment->Search(search_plan.get(), ph_group.get(), ts, 0);
}
}
@ -130,7 +130,7 @@ Search_Sealed(benchmark::State& state) {
Timestamp ts = 10000000;
for (auto _ : state) {
auto qr = segment->Search(search_plan.get(), ph_group.get(), ts);
auto qr = segment->Search(search_plan.get(), ph_group.get(), ts, 0);
}
}

View File

@ -229,7 +229,7 @@ TEST_P(BinlogIndexTest, AccuracyWithLoadFieldData) {
ph_group.get()};
auto nlist = segcore_config.get_nlist();
auto binlog_index_sr =
segment->Search(plan.get(), ph_group.get(), 1L << 63);
segment->Search(plan.get(), ph_group.get(), 1L << 63, 0);
ASSERT_EQ(binlog_index_sr->total_nq_, num_queries);
EXPECT_EQ(binlog_index_sr->unity_topK_, topk);
EXPECT_EQ(binlog_index_sr->distances_.size(), num_queries * topk);
@ -262,7 +262,7 @@ TEST_P(BinlogIndexTest, AccuracyWithLoadFieldData) {
EXPECT_TRUE(segment->HasIndex(vec_field_id));
EXPECT_EQ(segment->get_row_count(), data_n);
EXPECT_FALSE(segment->HasFieldData(vec_field_id));
auto ivf_sr = segment->Search(plan.get(), ph_group.get(), 1L << 63);
auto ivf_sr = segment->Search(plan.get(), ph_group.get(), 1L << 63, 0);
auto similary = GetKnnSearchRecall(num_queries,
binlog_index_sr->seg_offsets_.data(),
topk,
@ -328,7 +328,7 @@ TEST_P(BinlogIndexTest, AccuracyWithMapFieldData) {
ph_group.get()};
auto nlist = segcore_config.get_nlist();
auto binlog_index_sr =
segment->Search(plan.get(), ph_group.get(), 1L << 63);
segment->Search(plan.get(), ph_group.get(), 1L << 63, 0);
ASSERT_EQ(binlog_index_sr->total_nq_, num_queries);
EXPECT_EQ(binlog_index_sr->unity_topK_, topk);
EXPECT_EQ(binlog_index_sr->distances_.size(), num_queries * topk);

View File

@ -46,6 +46,7 @@
#include "segcore/load_index_c.h"
#include "test_utils/c_api_test_utils.h"
#include "segcore/vector_index_c.h"
#include "common/jsmn.h"
namespace chrono = std::chrono;
@ -69,7 +70,7 @@ CRetrieve(CSegmentInterface c_segment,
uint64_t timestamp,
CRetrieveResult** result) {
auto future = AsyncRetrieve(
{}, c_segment, c_plan, timestamp, DEFAULT_MAX_OUTPUT_SIZE, false);
{}, c_segment, c_plan, timestamp, DEFAULT_MAX_OUTPUT_SIZE, false, 0);
auto futurePtr = static_cast<milvus::futures::IFuture*>(
static_cast<void*>(static_cast<CFuture*>(future)));

View File

@ -157,6 +157,7 @@ TEST_P(TaskTest, CallExprEmpty) {
segment_.get(),
100000,
MAX_TIMESTAMP,
0,
std::make_shared<milvus::exec::QueryConfig>(
std::unordered_map<std::string, std::string>{}));
@ -194,6 +195,7 @@ TEST_P(TaskTest, UnaryExpr) {
segment_.get(),
100000,
MAX_TIMESTAMP,
0,
std::make_shared<milvus::exec::QueryConfig>(
std::unordered_map<std::string, std::string>{}));
@ -240,6 +242,7 @@ TEST_P(TaskTest, LogicalExpr) {
segment_.get(),
100000,
MAX_TIMESTAMP,
0,
std::make_shared<milvus::exec::QueryConfig>(
std::unordered_map<std::string, std::string>{}));

View File

@ -59,14 +59,18 @@ using namespace milvus;
using namespace milvus::query;
using namespace milvus::segcore;
class ExprTest : public ::testing::TestWithParam<
std::pair<milvus::DataType, knowhere::MetricType>> {
class ExprTest
: public ::testing::TestWithParam<
std::tuple<std::pair<milvus::DataType, knowhere::MetricType>, bool>> {
public:
void
SetUp() override {
auto param = GetParam();
data_type = param.first;
metric_type = param.second;
data_type = std::get<0>(param).first; // Get the DataType from the pair
metric_type =
std::get<0>(param).second; // Get the MetricType from the pair
GROWING_JSON_KEY_STATS_ENABLED =
std::get<1>(param); // Get the bool parameter
}
// replace the metric type in the plan string with the proper type
@ -81,13 +85,29 @@ class ExprTest : public ::testing::TestWithParam<
knowhere::MetricType metric_type;
};
// Instantiate test suite with new bool parameter
INSTANTIATE_TEST_SUITE_P(
ExprTestSuite,
ExprTest,
::testing::Values(
std::pair(milvus::DataType::VECTOR_FLOAT, knowhere::metric::L2),
std::pair(milvus::DataType::VECTOR_SPARSE_FLOAT, knowhere::metric::IP),
std::pair(milvus::DataType::VECTOR_BINARY, knowhere::metric::JACCARD)));
std::make_tuple(std::pair(milvus::DataType::VECTOR_FLOAT,
knowhere::metric::L2),
false),
std::make_tuple(std::pair(milvus::DataType::VECTOR_SPARSE_FLOAT,
knowhere::metric::IP),
false),
std::make_tuple(std::pair(milvus::DataType::VECTOR_BINARY,
knowhere::metric::JACCARD),
false),
std::make_tuple(std::pair(milvus::DataType::VECTOR_FLOAT,
knowhere::metric::L2),
true),
std::make_tuple(std::pair(milvus::DataType::VECTOR_SPARSE_FLOAT,
knowhere::metric::IP),
true),
std::make_tuple(std::pair(milvus::DataType::VECTOR_BINARY,
knowhere::metric::JACCARD),
true)));
TEST_P(ExprTest, Range) {
SUCCEED();
@ -842,7 +862,7 @@ TEST_P(ExprTest, TestBinaryRangeJSON) {
raw_data.timestamps_.data(),
raw_data.raw_);
}
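// give the growing JSON key stats time to commit (the commit interval defaults to 200 ms)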
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
for (auto testcase : testcases) {
auto check = [&](int64_t value) {
@ -966,7 +986,7 @@ TEST_P(ExprTest, TestBinaryRangeJSONNullable) {
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
for (auto testcase : testcases) {
auto check = [&](int64_t value, bool valid) {
@ -1085,7 +1105,7 @@ TEST_P(ExprTest, TestExistsJson) {
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
for (auto testcase : testcases) {
@ -1162,7 +1182,7 @@ TEST_P(ExprTest, TestExistsJsonNullable) {
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
for (auto testcase : testcases) {
@ -1245,16 +1265,13 @@ TEST_P(ExprTest, TestUnaryRangeJson) {
int64_t val;
std::vector<std::string> nested_path;
};
std::vector<Testcase> testcases{
{10, {"int"}},
{20, {"int"}},
{30, {"int"}},
{40, {"int"}},
{10, {"double"}},
{20, {"double"}},
{30, {"double"}},
{40, {"double"}},
};
std::vector<Testcase> testcases{{10, {"int"}},
{20, {"int"}},
{30, {"int"}},
{40, {"int"}},
{1, {"array", "0"}},
{2, {"array", "1"}},
{3, {"array", "2"}}};
auto schema = std::make_shared<Schema>();
auto i64_fid = schema->AddDebugField("id", DataType::INT64);
@ -1278,7 +1295,7 @@ TEST_P(ExprTest, TestUnaryRangeJson) {
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
std::vector<OpType> ops{
@ -1356,13 +1373,16 @@ TEST_P(ExprTest, TestUnaryRangeJson) {
for (int i = 0; i < N * num_iters; ++i) {
auto ans = final[i];
if (testcase.nested_path[0] == "int") {
if (testcase.nested_path[0] == "int" ||
testcase.nested_path[0] == "array") {
auto val =
milvus::Json(simdjson::padded_string(json_col[i]))
.template at<int64_t>(pointer)
.value();
auto ref = f(val);
ASSERT_EQ(ans, ref);
ASSERT_EQ(ans, ref) << "@" << i << "op" << op;
if (i % 2 == 0) {
ASSERT_EQ(view[int(i / 2)], ref);
}
@ -1381,6 +1401,272 @@ TEST_P(ExprTest, TestUnaryRangeJson) {
}
}
{
struct Testcase {
double val;
std::vector<std::string> nested_path;
};
std::vector<Testcase> testcases{{1.1, {"double"}},
{2.2, {"double"}},
{3.3, {"double"}},
{4.4, {"double"}},
{1e40, {"double"}}};
auto schema = std::make_shared<Schema>();
auto i64_fid = schema->AddDebugField("id", DataType::INT64);
auto json_fid = schema->AddDebugField("json", DataType::JSON);
schema->set_primary_field_id(i64_fid);
auto seg = CreateGrowingSegment(schema, empty_index_meta);
int N = 1000;
std::vector<std::string> json_col;
int num_iters = 1;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter);
auto new_json_col = raw_data.get_col<std::string>(json_fid);
json_col.insert(
json_col.end(), new_json_col.begin(), new_json_col.end());
seg->PreInsert(N);
seg->Insert(iter * N,
N,
raw_data.row_ids_.data(),
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
std::vector<OpType> ops{
OpType::Equal,
OpType::NotEqual,
OpType::GreaterThan,
OpType::GreaterEqual,
OpType::LessThan,
OpType::LessEqual,
};
for (const auto& testcase : testcases) {
auto check = [&](double value) { return value == testcase.val; };
std::function<bool(double)> f = check;
for (auto& op : ops) {
switch (op) {
case OpType::Equal: {
f = [&](double value) { return value == testcase.val; };
break;
}
case OpType::NotEqual: {
f = [&](double value) { return value != testcase.val; };
break;
}
case OpType::GreaterEqual: {
f = [&](double value) { return value >= testcase.val; };
break;
}
case OpType::GreaterThan: {
f = [&](double value) { return value > testcase.val; };
break;
}
case OpType::LessEqual: {
f = [&](double value) { return value <= testcase.val; };
break;
}
case OpType::LessThan: {
f = [&](double value) { return value < testcase.val; };
break;
}
default: {
PanicInfo(Unsupported, "unsupported range node");
}
}
auto pointer = milvus::Json::pointer(testcase.nested_path);
proto::plan::GenericValue value;
value.set_float_val(testcase.val);
auto expr =
std::make_shared<milvus::expr::UnaryRangeFilterExpr>(
milvus::expr::ColumnInfo(
json_fid, DataType::JSON, testcase.nested_path),
op,
value,
std::vector<proto::plan::GenericValue>{});
auto plan = std::make_shared<plan::FilterBitsNode>(
DEFAULT_PLANNODE_ID, expr);
auto final = ExecuteQueryExpr(
plan, seg_promote, N * num_iters, MAX_TIMESTAMP);
EXPECT_EQ(final.size(), N * num_iters);
// specify some offsets and do scalar filtering on these offsets
milvus::exec::OffsetVector offsets;
offsets.reserve(N * num_iters / 2);
for (auto i = 0; i < N * num_iters; ++i) {
if (i % 2 == 0) {
offsets.emplace_back(i);
}
}
auto col_vec = milvus::test::gen_filter_res(plan.get(),
seg_promote,
N * num_iters,
MAX_TIMESTAMP,
&offsets);
BitsetTypeView view(col_vec->GetRawData(), col_vec->size());
EXPECT_EQ(view.size(), N * num_iters / 2);
for (int i = 0; i < N * num_iters; ++i) {
auto ans = final[i];
auto val =
milvus::Json(simdjson::padded_string(json_col[i]))
.template at<double>(pointer)
.value();
auto ref = f(val);
ASSERT_EQ(ans, ref);
if (i % 2 == 0) {
ASSERT_EQ(view[int(i / 2)], ref);
}
}
}
}
}
{
struct Testcase {
std::string val;
std::vector<std::string> nested_path;
};
std::vector<Testcase> testcases{
{"abc", {"string"}},
{"This is a line break\\nThis is a new line!", {"string"}}};
auto schema = std::make_shared<Schema>();
auto i64_fid = schema->AddDebugField("id", DataType::INT64);
auto json_fid = schema->AddDebugField("json", DataType::JSON);
schema->set_primary_field_id(i64_fid);
auto seg = CreateGrowingSegment(schema, empty_index_meta);
int N = 1000;
std::vector<std::string> json_col;
int num_iters = 1;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter);
auto new_json_col = raw_data.get_col<std::string>(json_fid);
json_col.insert(
json_col.end(), new_json_col.begin(), new_json_col.end());
seg->PreInsert(N);
seg->Insert(iter * N,
N,
raw_data.row_ids_.data(),
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
std::vector<OpType> ops{
OpType::Equal,
OpType::NotEqual,
OpType::GreaterThan,
OpType::GreaterEqual,
OpType::LessThan,
OpType::LessEqual,
};
for (const auto& testcase : testcases) {
auto check = [&](std::string_view value) {
return value == testcase.val;
};
std::function<bool(std::string_view)> f = check;
for (auto& op : ops) {
switch (op) {
case OpType::Equal: {
f = [&](std::string_view value) {
return value == testcase.val;
};
break;
}
case OpType::NotEqual: {
f = [&](std::string_view value) {
return value != testcase.val;
};
break;
}
case OpType::GreaterEqual: {
f = [&](std::string_view value) {
return value >= testcase.val;
};
break;
}
case OpType::GreaterThan: {
f = [&](std::string_view value) {
return value > testcase.val;
};
break;
}
case OpType::LessEqual: {
f = [&](std::string_view value) {
return value <= testcase.val;
};
break;
}
case OpType::LessThan: {
f = [&](std::string_view value) {
return value < testcase.val;
};
break;
}
default: {
PanicInfo(Unsupported, "unsupported range node");
}
}
auto pointer = milvus::Json::pointer(testcase.nested_path);
proto::plan::GenericValue value;
value.set_string_val(testcase.val);
auto expr =
std::make_shared<milvus::expr::UnaryRangeFilterExpr>(
milvus::expr::ColumnInfo(
json_fid, DataType::JSON, testcase.nested_path),
op,
value,
std::vector<proto::plan::GenericValue>{});
auto plan = std::make_shared<plan::FilterBitsNode>(
DEFAULT_PLANNODE_ID, expr);
auto final = ExecuteQueryExpr(
plan, seg_promote, N * num_iters, MAX_TIMESTAMP);
EXPECT_EQ(final.size(), N * num_iters);
// specify some offsets and do scalar filtering on these offsets
milvus::exec::OffsetVector offsets;
offsets.reserve(N * num_iters / 2);
for (auto i = 0; i < N * num_iters; ++i) {
if (i % 2 == 0) {
offsets.emplace_back(i);
}
}
auto col_vec = milvus::test::gen_filter_res(plan.get(),
seg_promote,
N * num_iters,
MAX_TIMESTAMP,
&offsets);
BitsetTypeView view(col_vec->GetRawData(), col_vec->size());
EXPECT_EQ(view.size(), N * num_iters / 2);
for (int i = 0; i < N * num_iters; ++i) {
auto ans = final[i];
auto val =
milvus::Json(simdjson::padded_string(json_col[i]))
.template at<std::string_view>(pointer)
.value();
auto ref = f(val);
ASSERT_EQ(ans, ref);
if (i % 2 == 0) {
ASSERT_EQ(view[int(i / 2)], ref);
}
}
}
}
}
struct TestArrayCase {
proto::plan::GenericValue val;
std::vector<std::string> nested_path;
@ -1457,16 +1743,13 @@ TEST_P(ExprTest, TestUnaryRangeJsonNullable) {
int64_t val;
std::vector<std::string> nested_path;
};
std::vector<Testcase> testcases{
{10, {"int"}},
{20, {"int"}},
{30, {"int"}},
{40, {"int"}},
{10, {"double"}},
{20, {"double"}},
{30, {"double"}},
{40, {"double"}},
};
std::vector<Testcase> testcases{{10, {"int"}},
{20, {"int"}},
{30, {"int"}},
{40, {"int"}},
{1, {"array", "0"}},
{2, {"array", "1"}},
{3, {"array", "2"}}};
auto schema = std::make_shared<Schema>();
auto i64_fid = schema->AddDebugField("id", DataType::INT64);
@ -1492,7 +1775,7 @@ TEST_P(ExprTest, TestUnaryRangeJsonNullable) {
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
std::vector<OpType> ops{
@ -1717,7 +2000,7 @@ TEST_P(ExprTest, TestTermJson) {
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
for (auto testcase : testcases) {
@ -1810,7 +2093,7 @@ TEST_P(ExprTest, TestTermJsonNullable) {
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
for (auto testcase : testcases) {
@ -11573,7 +11856,7 @@ TEST_P(ExprTest, TestUnaryRangeWithJSON) {
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
int offset = 0;
@ -11833,7 +12116,7 @@ TEST_P(ExprTest, TestUnaryRangeWithJSONNullable) {
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
int offset = 0;
@ -12139,7 +12422,7 @@ TEST_P(ExprTest, TestTermWithJSON) {
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
int offset = 0;
@ -12372,7 +12655,7 @@ TEST_P(ExprTest, TestTermWithJSONNullable) {
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
int offset = 0;
@ -12550,7 +12833,7 @@ TEST_P(ExprTest, TestExistsWithJSON) {
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
int offset = 0;
@ -12778,7 +13061,7 @@ TEST_P(ExprTest, TestExistsWithJSONNullable) {
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
int offset = 0;
@ -13661,7 +13944,7 @@ TEST_P(ExprTest, TestJsonContainsAny) {
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
@ -13951,7 +14234,7 @@ TEST_P(ExprTest, TestJsonContainsAnyNullable) {
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
@ -14252,7 +14535,7 @@ TEST_P(ExprTest, TestJsonContainsAll) {
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
std::vector<Testcase<bool>> bool_testcases{{{true, true}, {"bool"}},
@ -14566,7 +14849,7 @@ TEST_P(ExprTest, TestJsonContainsAllNullable) {
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
std::vector<Testcase<bool>> bool_testcases{{{true, true}, {"bool"}},
@ -14890,7 +15173,7 @@ TEST_P(ExprTest, TestJsonContainsArray) {
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
@ -15278,7 +15561,7 @@ TEST_P(ExprTest, TestJsonContainsArrayNullable) {
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
proto::plan::GenericValue generic_a;
@ -15702,7 +15985,7 @@ TEST_P(ExprTest, TestJsonContainsDiffTypeArray) {
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
proto::plan::GenericValue int_value;
@ -15833,7 +16116,7 @@ TEST_P(ExprTest, TestJsonContainsDiffTypeArrayNullable) {
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);
@ -15968,7 +16251,7 @@ TEST_P(ExprTest, TestJsonContainsDiffType) {
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
proto::plan::GenericValue int_val;
@ -16103,7 +16386,7 @@ TEST_P(ExprTest, TestJsonContainsDiffTypeNullable) {
raw_data.timestamps_.data(),
raw_data.raw_);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
query::ExecPlanNodeVisitor visitor(*seg_promote, MAX_TIMESTAMP);

View File

@ -0,0 +1,588 @@
// Copyright(C) 2019 - 2020 Zilliz.All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <gtest/gtest.h>
#include <functional>
#include <boost/filesystem.hpp>
#include <unordered_set>
#include <memory>
#include "common/Tracer.h"
#include "index/BitmapIndex.h"
#include "storage/Util.h"
#include "storage/InsertData.h"
#include "indexbuilder/IndexFactory.h"
#include "index/IndexFactory.h"
#include "test_utils/indexbuilder_test_utils.h"
#include "index/Meta.h"
#include "index/JsonKeyStatsInvertedIndex.h"
#include "common/Json.h"
#include "common/Types.h"
using namespace milvus::index;
using namespace milvus::indexbuilder;
using namespace milvus;
using namespace milvus::index;
static std::vector<milvus::Json>
GenerateJsons(int size) {
std::vector<Json> jsons;
std::default_random_engine random(42);
std::normal_distribution<> distr(0, 1);
for (int i = 0; i < size; i++) {
auto str = R"({"int":)" + std::to_string(random()) + R"(,"double":)" +
std::to_string(static_cast<double>(random())) +
R"(,"string":")" + std::to_string(random()) +
R"(","bool": true)" + R"(, "array": [1,2,3])" + "}";
jsons.push_back(milvus::Json(simdjson::padded_string(str)));
}
return jsons;
}
class JsonKeyStatsIndexTest : public ::testing::TestWithParam<bool> {
protected:
void
Init(int64_t collection_id,
int64_t partition_id,
int64_t segment_id,
int64_t field_id,
int64_t index_build_id,
int64_t index_version,
int64_t size) {
proto::schema::FieldSchema field_schema;
field_schema.set_data_type(proto::schema::DataType::JSON);
field_schema.set_nullable(nullable_);
auto field_meta = storage::FieldDataMeta{
collection_id, partition_id, segment_id, field_id, field_schema};
auto index_meta = storage::IndexMeta{
segment_id, field_id, index_build_id, index_version};
data_ = std::move(GenerateJsons(size));
auto field_data = storage::CreateFieldData(DataType::JSON, nullable_);
if (nullable_) {
valid_data.reserve(size_);
for (size_t i = 0; i < size_; i++) {
valid_data.push_back(false);
}
}
if (nullable_) {
int byteSize = (size_ + 7) / 8;
uint8_t* valid_data_ = new uint8_t[byteSize];
for (int i = 0; i < size_; i++) {
bool value = valid_data[i];
int byteIndex = i / 8;
int bitIndex = i % 8;
if (value) {
valid_data_[byteIndex] |= (1 << bitIndex);
} else {
valid_data_[byteIndex] &= ~(1 << bitIndex);
}
}
field_data->FillFieldData(data_.data(), valid_data_, data_.size());
delete[] valid_data_;
} else {
field_data->FillFieldData(data_.data(), data_.size());
}
storage::InsertData insert_data(field_data);
insert_data.SetFieldDataMeta(field_meta);
insert_data.SetTimestamps(0, 100);
auto serialized_bytes = insert_data.Serialize(storage::Remote);
auto log_path = fmt::format("/{}/{}/{}/{}/{}/{}",
"/tmp/test-jsonkey-index/",
collection_id,
partition_id,
segment_id,
field_id,
0);
chunk_manager_->Write(
log_path, serialized_bytes.data(), serialized_bytes.size());
storage::FileManagerContext ctx(field_meta, index_meta, chunk_manager_);
std::vector<std::string> index_files;
Config config;
config["insert_files"] = std::vector<std::string>{log_path};
auto build_index =
std::make_shared<JsonKeyStatsInvertedIndex>(ctx, false);
build_index->Build(config);
auto create_index_result = build_index->Upload(config);
auto memSize = create_index_result->GetMemSize();
auto serializedSize = create_index_result->GetSerializedSize();
ASSERT_GT(memSize, 0);
ASSERT_GT(serializedSize, 0);
index_files = create_index_result->GetIndexFiles();
index::CreateIndexInfo index_info{};
config["index_files"] = index_files;
index_ = std::make_shared<JsonKeyStatsInvertedIndex>(ctx, true);
index_->Load(milvus::tracer::TraceContext{}, config);
}
void
SetUp() override {
nullable_ = GetParam();
type_ = DataType::JSON;
int64_t collection_id = 1;
int64_t partition_id = 2;
int64_t segment_id = 3;
int64_t field_id = 101;
int64_t index_build_id = 1000;
int64_t index_version = 10000;
size_ = 1;
std::string root_path = "/tmp/test-jsonkey-index/";
storage::StorageConfig storage_config;
storage_config.storage_type = "local";
storage_config.root_path = root_path;
chunk_manager_ = storage::CreateChunkManager(storage_config);
Init(collection_id,
partition_id,
segment_id,
field_id,
index_build_id,
index_version,
size_);
}
virtual ~JsonKeyStatsIndexTest() override {
boost::filesystem::remove_all(chunk_manager_->GetRootPath());
}
public:
std::shared_ptr<JsonKeyStatsInvertedIndex> index_;
DataType type_;
bool nullable_;
size_t size_;
FixedVector<bool> valid_data;
std::vector<milvus::Json> data_;
std::vector<std::string> json_col;
std::shared_ptr<storage::ChunkManager> chunk_manager_;
};
INSTANTIATE_TEST_SUITE_P(JsonKeyStatsIndexTestSuite,
JsonKeyStatsIndexTest,
::testing::Values(true, false));
TEST_P(JsonKeyStatsIndexTest, HasEscapeSequence) {
EXPECT_TRUE(index_->has_escape_sequence("Hello\\nWorld"));
EXPECT_TRUE(index_->has_escape_sequence("Tab\\tCharacter"));
EXPECT_TRUE(index_->has_escape_sequence("Carriage\\rReturn"));
EXPECT_TRUE(index_->has_escape_sequence("Backspace\\bTest"));
EXPECT_TRUE(index_->has_escape_sequence("FormFeed\\fTest"));
EXPECT_TRUE(index_->has_escape_sequence("Vertical\\vTab"));
EXPECT_TRUE(index_->has_escape_sequence("Backslash\\\\Test"));
EXPECT_TRUE(index_->has_escape_sequence("Quote\\\"Test"));
EXPECT_TRUE(index_->has_escape_sequence("SingleQuote\\'Test"));
EXPECT_FALSE(index_->has_escape_sequence("No escape sequence here"));
EXPECT_FALSE(index_->has_escape_sequence("Just a backslash \\"));
EXPECT_FALSE(index_->has_escape_sequence(""));
}
TEST_P(JsonKeyStatsIndexTest, TestTermInFunc) {
struct Testcase {
std::vector<int64_t> term;
std::vector<std::string> nested_path;
};
std::vector<Testcase> testcases{
{{1, 2, 3, 4}, {"int"}},
{{10, 100, 1000, 10000}, {"int"}},
{{100, 10000, 9999, 444}, {"int"}},
{{23, 42, 66, 17, 25}, {"int"}},
};
for (auto testcase : testcases) {
auto check = [&](int64_t value) {
std::unordered_set<int64_t> term_set(testcase.term.begin(),
testcase.term.end());
return term_set.find(value) != term_set.end();
};
std::unordered_set<int64_t> term_set(testcase.term.begin(),
testcase.term.end());
auto filter_func = [&term_set, this](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
return term_set.find(int64_t(value)) != term_set.end();
};
auto pointer = milvus::Json::pointer(testcase.nested_path);
auto bitset =
index_->FilterByPath(pointer, size_, false, true, filter_func);
ASSERT_EQ(bitset.size(), size_);
for (int i = 0; i < bitset.size(); ++i) {
if (nullable_ && !valid_data[i]) {
ASSERT_EQ(bitset[i], false);
} else {
auto val = data_[i].template at<int64_t>(pointer).value();
auto ans = bitset[i];
auto ref = check(val);
ASSERT_EQ(ans, ref);
}
}
}
}
TEST_P(JsonKeyStatsIndexTest, TestUnaryRangeInFunc) {
struct Testcase {
int64_t val;
std::vector<std::string> nested_path;
};
std::vector<Testcase> testcases{
{10, {"int"}},
{20, {"int"}},
{30, {"int"}},
{40, {"int"}},
};
std::vector<OpType> ops{
OpType::Equal,
OpType::NotEqual,
OpType::GreaterThan,
OpType::GreaterEqual,
OpType::LessThan,
OpType::LessEqual,
};
for (const auto& testcase : testcases) {
auto check = [&](int64_t value) { return value == testcase.val; };
std::function<bool(int64_t)> f = check;
for (auto& op : ops) {
switch (op) {
case OpType::Equal: {
f = [&](int64_t value) { return value == testcase.val; };
break;
}
case OpType::NotEqual: {
f = [&](int64_t value) { return value != testcase.val; };
break;
}
case OpType::GreaterEqual: {
f = [&](int64_t value) { return value >= testcase.val; };
break;
}
case OpType::GreaterThan: {
f = [&](int64_t value) { return value > testcase.val; };
break;
}
case OpType::LessEqual: {
f = [&](int64_t value) { return value <= testcase.val; };
break;
}
case OpType::LessThan: {
f = [&](int64_t value) { return value < testcase.val; };
break;
}
default: {
PanicInfo(Unsupported, "unsupported range node");
}
}
auto filter_func = [&op, &testcase, this](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
switch (op) {
case OpType::GreaterThan:
return int64_t(value) > testcase.val;
case OpType::GreaterEqual:
return int64_t(value) >= testcase.val;
case OpType::LessThan:
return int64_t(value) < testcase.val;
case OpType::LessEqual:
return int64_t(value) <= testcase.val;
case OpType::Equal:
return int64_t(value) == testcase.val;
case OpType::NotEqual:
return int64_t(value) != testcase.val;
default:
return false;
}
};
auto pointer = milvus::Json::pointer(testcase.nested_path);
auto bitset =
index_->FilterByPath(pointer, size_, false, true, filter_func);
ASSERT_EQ(bitset.size(), size_);
for (int i = 0; i < bitset.size(); ++i) {
if (nullable_ && !valid_data[i]) {
ASSERT_EQ(bitset[i], false);
} else {
auto ans = bitset[i];
if (testcase.nested_path[0] == "int") {
auto val =
data_[i].template at<int64_t>(pointer).value();
auto ref = f(val);
ASSERT_EQ(ans, ref);
} else {
auto val =
data_[i].template at<double>(pointer).value();
auto ref = f(val);
ASSERT_EQ(ans, ref);
}
}
}
}
}
}
TEST_P(JsonKeyStatsIndexTest, TestBinaryRangeInFunc) {
struct Testcase {
bool lower_inclusive;
bool upper_inclusive;
int64_t lower;
int64_t upper;
std::vector<std::string> nested_path;
};
std::vector<Testcase> testcases{
{true, false, 10, 20, {"int"}},
{true, true, 20, 30, {"int"}},
{false, true, 30, 40, {"int"}},
{false, false, 40, 50, {"int"}},
{true, false, 10, 20, {"double"}},
{true, true, 20, 30, {"double"}},
{false, true, 30, 40, {"double"}},
{false, false, 40, 50, {"double"}},
};
for (const auto& testcase : testcases) {
auto check = [&](int64_t value) {
if (testcase.lower_inclusive && testcase.upper_inclusive) {
return testcase.lower <= value && value <= testcase.upper;
} else if (testcase.lower_inclusive && !testcase.upper_inclusive) {
return testcase.lower <= value && value < testcase.upper;
} else if (!testcase.lower_inclusive && testcase.upper_inclusive) {
return testcase.lower < value && value <= testcase.upper;
} else {
return testcase.lower < value && value < testcase.upper;
}
};
auto filter_func = [&testcase, this](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
if (valid) {
if (testcase.lower_inclusive && testcase.upper_inclusive) {
return testcase.lower <= int64_t(value) &&
int64_t(value) <= testcase.upper;
} else if (testcase.lower_inclusive &&
!testcase.upper_inclusive) {
return testcase.lower <= int64_t(value) &&
int64_t(value) < testcase.upper;
} else if (!testcase.lower_inclusive &&
testcase.upper_inclusive) {
return testcase.lower < int64_t(value) &&
int64_t(value) <= testcase.upper;
} else {
return testcase.lower < int64_t(value) &&
int64_t(value) < testcase.upper;
}
} else {
auto val =
this->data_[row_id].template at<int64_t>(offset, size);
if (val.error()) {
return false;
}
if (testcase.lower_inclusive && testcase.upper_inclusive) {
return testcase.lower <= int64_t(val.value()) &&
int64_t(val.value()) <= testcase.upper;
} else if (testcase.lower_inclusive &&
!testcase.upper_inclusive) {
return testcase.lower <= int64_t(val.value()) &&
int64_t(val.value()) < testcase.upper;
} else if (!testcase.lower_inclusive &&
testcase.upper_inclusive) {
return testcase.lower < int64_t(val.value()) &&
int64_t(val.value()) <= testcase.upper;
} else {
return testcase.lower < int64_t(val.value()) &&
int64_t(val.value()) < testcase.upper;
}
}
};
auto pointer = milvus::Json::pointer(testcase.nested_path);
auto bitset =
index_->FilterByPath(pointer, size_, false, true, filter_func);
ASSERT_EQ(bitset.size(), size_);
for (int i = 0; i < bitset.size(); ++i) {
if (nullable_ && !valid_data[i]) {
ASSERT_EQ(bitset[i], false);
} else {
auto ans = bitset[i];
if (testcase.nested_path[0] == "int") {
auto val = data_[i].template at<int64_t>(pointer).value();
auto ref = check(val);
ASSERT_EQ(ans, ref);
} else {
auto val = data_[i].template at<double>(pointer).value();
auto ref = check(val);
ASSERT_EQ(ans, ref);
}
}
}
}
}
TEST_P(JsonKeyStatsIndexTest, TestExistInFunc) {
struct Testcase {
std::vector<std::string> nested_path;
};
std::vector<Testcase> testcases{
{{"A"}},
{{"int"}},
{{"double"}},
{{"B"}},
};
for (const auto& testcase : testcases) {
auto pointer = milvus::Json::pointer(testcase.nested_path);
auto filter_func = [&pointer, this](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
return this->data_[row_id].exist(pointer);
};
auto bitset =
index_->FilterByPath(pointer, size_, false, true, filter_func);
ASSERT_EQ(bitset.size(), size_);
for (int i = 0; i < bitset.size(); ++i) {
if (nullable_ && !valid_data[i]) {
ASSERT_EQ(bitset[i], false);
} else {
auto ans = bitset[i];
auto val = data_[i].exist(pointer);
ASSERT_EQ(ans, val);
}
}
}
}
TEST_P(JsonKeyStatsIndexTest, TestJsonContainsAllFunc) {
struct Testcase {
std::vector<int64_t> term;
std::vector<std::string> nested_path;
};
{
std::vector<Testcase> testcases{
{{1, 2, 3}, {"array"}},
{{10, 100}, {"array"}},
{{100, 1000}, {"array"}},
};
for (const auto& testcase : testcases) {
auto check = [&](const std::vector<int64_t>& values) {
for (auto const& e : testcase.term) {
if (std::find(values.begin(), values.end(), e) ==
values.end()) {
return false;
}
}
return true;
};
auto pointer = milvus::Json::pointer(testcase.nested_path);
std::unordered_set<int64_t> elements;
for (auto const& element : testcase.term) {
elements.insert(element);
}
auto filter_func = [&elements, this](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
auto array = this->data_[row_id].array_at(offset, size);
std::unordered_set<int64_t> tmp_elements(elements);
for (auto&& it : array) {
auto val = it.template get<int64_t>();
if (val.error()) {
continue;
}
tmp_elements.erase(val.value());
if (tmp_elements.size() == 0) {
return true;
}
}
return tmp_elements.empty();
};
auto bitset =
index_->FilterByPath(pointer, size_, false, true, filter_func);
ASSERT_EQ(bitset.size(), size_);
for (int i = 0; i < bitset.size(); ++i) {
if (nullable_ && !valid_data[i]) {
ASSERT_EQ(bitset[i], false);
} else {
auto ans = bitset[i];
auto array = data_[i].array_at(pointer);
std::vector<int64_t> res;
for (const auto& element : array) {
res.push_back(element.template get<int64_t>());
}
ASSERT_EQ(ans, check(res));
}
}
}
}
}
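// Exercises the growing-segment path: JSON rows are appended with AddJSONDatas,
// committed and reloaded in place, then filtered by path without the sealed-segment
// build/upload round trip.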
TEST(GrowingJsonKeyStatsIndexTest, GrowingIndex) {
using Index = index::JsonKeyStatsInvertedIndex;
auto index = std::make_unique<Index>(std::numeric_limits<int64_t>::max(),
"json",
"/tmp/test-jsonkey-index/");
auto str = R"({"int":)" + std::to_string(1) + R"(,"double":)" +
std::to_string(static_cast<double>(1)) + R"(,"string":")" +
std::to_string(1) + R"(","bool": true)" +
R"(, "array": [1,2,3])" + "}";
auto str1 = R"({"int":)" + std::to_string(2) + "}";
auto str2 = R"({"int":)" + std::to_string(3) + "}";
std::vector<std::string> jsonDatas;
jsonDatas.push_back(str);
jsonDatas.push_back(str1);
jsonDatas.push_back(str2);
std::vector<milvus::Json> jsons;
for (const auto& jsonData : jsonDatas) {
jsons.push_back(milvus::Json(simdjson::padded_string(jsonData)));
}
index->CreateReader();
index->AddJSONDatas(jsonDatas.size(), jsonDatas.data(), nullptr, 0);
index->Commit();
index->Reload();
int64_t checkVal = 1;
auto filter_func = [jsons, checkVal](bool valid,
uint8_t type,
uint32_t row_id,
uint16_t offset,
uint16_t size,
int32_t value) {
if (value == checkVal) {
return true;
}
return false;
};
auto pointer = milvus::Json::pointer({"int"});
auto bitset =
index->FilterByPath(pointer, jsonDatas.size(), true, true, filter_func);
ASSERT_EQ(bitset.size(), jsonDatas.size());
for (int i = 0; i < bitset.size(); ++i) {
auto val = jsons[i].template at<int64_t>(pointer).value();
auto ans = bitset[i];
auto ref = val == checkVal;
ASSERT_EQ(ans, ref);
}
}

View File

@ -171,7 +171,7 @@ TEST_F(GrowingSegmentRegexQueryTest, RegexQueryOnJsonField) {
auto typed_expr = parser.ParseExprs(*expr);
auto parsed =
std::make_shared<plan::FilterBitsNode>(DEFAULT_PLANNODE_ID, typed_expr);
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto segpromote = dynamic_cast<SegmentGrowingImpl*>(seg.get());
BitsetType final;
final = ExecuteQueryExpr(parsed, segpromote, N, MAX_TIMESTAMP);

View File

@ -187,7 +187,7 @@ CSearch(CSegmentInterface c_segment,
uint64_t timestamp,
CSearchResult* result) {
auto future =
AsyncSearch({}, c_segment, c_plan, c_placeholder_group, timestamp);
AsyncSearch({}, c_segment, c_plan, c_placeholder_group, timestamp, 0);
auto futurePtr = static_cast<milvus::futures::IFuture*>(
static_cast<void*>(static_cast<CFuture*>(future)));

View File

@ -54,7 +54,8 @@ get_default_mmap_config() {
.disk_limit =
uint64_t(2) * uint64_t(1024) * uint64_t(1024) * uint64_t(1024),
.fix_file_size = uint64_t(4) * uint64_t(1024) * uint64_t(1024),
.growing_enable_mmap = false};
.growing_enable_mmap = false,
};
return mmap_config;
}

View File

@ -164,6 +164,7 @@ func (gc *garbageCollector) work(ctx context.Context) {
gc.recycleUnusedSegIndexes(ctx)
gc.recycleUnusedAnalyzeFiles(ctx)
gc.recycleUnusedTextIndexFiles(ctx)
gc.recycleUnusedJSONIndexFiles(ctx)
})
}()
go func() {
@ -470,11 +471,16 @@ func (gc *garbageCollector) recycleDroppedSegments(ctx context.Context) {
logs[key] = struct{}{}
}
for key := range getJSONKeyLogs(segment, gc) {
logs[key] = struct{}{}
}
log.Info("GC segment start...", zap.Int("insert_logs", len(segment.GetBinlogs())),
zap.Int("delta_logs", len(segment.GetDeltalogs())),
zap.Int("stats_logs", len(segment.GetStatslogs())),
zap.Int("bm25_logs", len(segment.GetBm25Statslogs())),
zap.Int("text_logs", len(segment.GetTextStatsLogs())))
zap.Int("text_logs", len(segment.GetTextStatsLogs())),
zap.Int("json_key_logs", len(segment.GetJsonKeyStats())))
if err := gc.removeObjectFiles(ctx, logs); err != nil {
log.Warn("GC segment remove logs failed", zap.Error(err))
continue
@ -585,6 +591,20 @@ func getTextLogs(sinfo *SegmentInfo) map[string]struct{} {
return textLogs
}
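// getJSONKeyLogs returns the full object-storage paths of every JSON key stats file
// recorded in the segment meta, so they can be removed together with the segment's
// other logs.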
func getJSONKeyLogs(sinfo *SegmentInfo, gc *garbageCollector) map[string]struct{} {
jsonkeyLogs := make(map[string]struct{})
for _, flog := range sinfo.GetJsonKeyStats() {
for _, file := range flog.GetFiles() {
prefix := fmt.Sprintf("%s/%s/%d/%d/%d/%d/%d/%d", gc.option.cli.RootPath(), common.JSONIndexPath,
flog.GetBuildID(), flog.GetVersion(), sinfo.GetCollectionID(), sinfo.GetPartitionID(), sinfo.GetID(), flog.GetFieldID())
file = path.Join(prefix, file)
jsonkeyLogs[file] = struct{}{}
}
}
return jsonkeyLogs
}
// removeObjectFiles remove file from oss storage, return error if any log failed to remove.
func (gc *garbageCollector) removeObjectFiles(ctx context.Context, filePaths map[string]struct{}) error {
futures := make([]*conc.Future[struct{}], 0)
@ -904,3 +924,64 @@ func (gc *garbageCollector) recycleUnusedTextIndexFiles(ctx context.Context) {
metrics.GarbageCollectorRunCount.WithLabelValues(fmt.Sprint(paramtable.GetNodeID())).Add(1)
}
// recycleUnusedJSONIndexFiles loads the JSON key stats meta and walks the matching OSS prefixes;
// files that belong to task versions older than the current one are removed.
func (gc *garbageCollector) recycleUnusedJSONIndexFiles(ctx context.Context) {
start := time.Now()
log := log.Ctx(ctx).With(zap.String("gcName", "recycleUnusedJSONIndexFiles"), zap.Time("startAt", start))
log.Info("start recycleUnusedJSONIndexFiles...")
defer func() { log.Info("recycleUnusedJSONIndexFiles done", zap.Duration("timeCost", time.Since(start))) }()
hasJSONIndexSegments := gc.meta.SelectSegments(ctx, SegmentFilterFunc(func(info *SegmentInfo) bool {
return len(info.GetJsonKeyStats()) != 0
}))
fileNum := 0
deletedFilesNum := atomic.NewInt32(0)
for _, seg := range hasJSONIndexSegments {
for _, fieldStats := range seg.GetJsonKeyStats() {
log := log.With(zap.Int64("segmentID", seg.GetID()), zap.Int64("fieldID", fieldStats.GetFieldID()))
// clear low version task
for i := int64(1); i < fieldStats.GetVersion(); i++ {
prefix := fmt.Sprintf("%s/%s/%d/%d/%d/%d/%d/%d", gc.option.cli.RootPath(), common.JSONIndexPath,
fieldStats.GetBuildID(), i, seg.GetCollectionID(), seg.GetPartitionID(), seg.GetID(), fieldStats.GetFieldID())
futures := make([]*conc.Future[struct{}], 0)
err := gc.option.cli.WalkWithPrefix(ctx, prefix, true, func(files *storage.ChunkObjectInfo) bool {
file := files.FilePath
future := gc.option.removeObjectPool.Submit(func() (struct{}, error) {
log := log.With(zap.String("file", file))
log.Info("garbageCollector recycleUnusedJSONIndexFiles remove file...")
if err := gc.option.cli.Remove(ctx, file); err != nil {
log.Warn("garbageCollector recycleUnusedJSONIndexFiles remove file failed", zap.Error(err))
return struct{}{}, err
}
deletedFilesNum.Inc()
log.Info("garbageCollector recycleUnusedJSONIndexFiles remove file success")
return struct{}{}, nil
})
futures = append(futures, future)
return true
})
// Wait for all remove tasks done.
if err := conc.BlockOnAll(futures...); err != nil {
// error is logged, and can be ignored here.
log.Warn("some task failure in remove object pool", zap.Error(err))
}
log = log.With(zap.Int("deleteJSONKeyIndexNum", int(deletedFilesNum.Load())), zap.Int("walkFileNum", fileNum))
if err != nil {
log.Warn("json index files recycle failed when walk with prefix", zap.Error(err))
return
}
}
}
}
log.Info("json index files recycle done")
metrics.GarbageCollectorRunCount.WithLabelValues(fmt.Sprint(paramtable.GetNodeID())).Add(1)
}

View File

@ -440,7 +440,6 @@ func newSegmentIndexMeta(catalog metastore.DataCoordCatalog) *indexMeta {
}
func TestMeta_CreateIndex(t *testing.T) {
indexParams := []*commonpb.KeyValuePair{
{
Key: common.IndexTypeKey,

View File

@ -73,6 +73,9 @@ func (jm *statsJobManager) triggerStatsTaskLoop() {
ticker := time.NewTicker(Params.DataCoordCfg.TaskCheckInterval.GetAsDuration(time.Second))
defer ticker.Stop()
lastJSONStatsLastTrigger := time.Now().Unix()
maxJSONStatsTaskCount := 0
for {
select {
case <-jm.ctx.Done():
@ -82,6 +85,7 @@ func (jm *statsJobManager) triggerStatsTaskLoop() {
jm.triggerSortStatsTask()
jm.triggerTextStatsTask()
jm.triggerBM25StatsTask()
lastJSONStatsLastTrigger, maxJSONStatsTaskCount = jm.triggerJsonKeyIndexStatsTask(lastJSONStatsLastTrigger, maxJSONStatsTaskCount)
case segID := <-getStatsTaskChSingleton():
log.Info("receive new segment to trigger stats task", zap.Int64("segmentID", segID))
@ -141,10 +145,21 @@ func needDoTextIndex(segment *SegmentInfo, fieldIDs []UniqueID) bool {
}
for _, fieldID := range fieldIDs {
if segment.GetTextStatsLogs() == nil {
if segment.GetTextStatsLogs()[fieldID] == nil {
return true
}
		if segment.GetTextStatsLogs()[fieldID] == nil {
			return true
		}
	}
	return false
}
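// needDoJsonKeyIndex reports whether a flushed, sorted, non-L0 segment still lacks
// JSON key stats for any of the given fields.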
func needDoJsonKeyIndex(segment *SegmentInfo, fieldIDs []UniqueID) bool {
if !(isFlush(segment) && segment.GetLevel() != datapb.SegmentLevel_L0 &&
segment.GetIsSorted()) {
return false
}
for _, fieldID := range fieldIDs {
if segment.GetJsonKeyStats()[fieldID] == nil {
return true
}
}
@ -182,6 +197,38 @@ func (jm *statsJobManager) triggerTextStatsTask() {
}
}
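// triggerJsonKeyIndexStatsTask submits JSON key stats tasks for segments that still
// miss them, capped at jsonStatsTriggerCount submissions per jsonStatsTriggerInterval,
// and returns the updated trigger timestamp and counter.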
func (jm *statsJobManager) triggerJsonKeyIndexStatsTask(lastJSONStatsLastTrigger int64, maxJSONStatsTaskCount int) (int64, int) {
collections := jm.mt.GetCollections()
for _, collection := range collections {
needTriggerFieldIDs := make([]UniqueID, 0)
for _, field := range collection.Schema.GetFields() {
h := typeutil.CreateFieldSchemaHelper(field)
if h.EnableJSONKeyStatsIndex() && Params.CommonCfg.EnabledJSONKeyStats.GetAsBool() {
needTriggerFieldIDs = append(needTriggerFieldIDs, field.GetFieldID())
}
}
segments := jm.mt.SelectSegments(jm.ctx, WithCollection(collection.ID), SegmentFilterFunc(func(seg *SegmentInfo) bool {
return needDoJsonKeyIndex(seg, needTriggerFieldIDs)
}))
if time.Now().Unix()-lastJSONStatsLastTrigger > int64(Params.DataCoordCfg.JSONStatsTriggerInterval.GetAsDuration(time.Minute).Seconds()) {
lastJSONStatsLastTrigger = time.Now().Unix()
maxJSONStatsTaskCount = 0
}
for _, segment := range segments {
if maxJSONStatsTaskCount >= Params.DataCoordCfg.JSONStatsTriggerCount.GetAsInt() {
break
}
if err := jm.SubmitStatsTask(segment.GetID(), segment.GetID(), indexpb.StatsSubJob_JsonKeyIndexJob, true); err != nil {
log.Warn("create stats task with json key index for segment failed, wait for retry:",
zap.Int64("segmentID", segment.GetID()), zap.Error(err))
continue
}
maxJSONStatsTaskCount++
}
}
return lastJSONStatsLastTrigger, maxJSONStatsTaskCount
}
func (jm *statsJobManager) triggerBM25StatsTask() {
collections := jm.mt.GetCollections()
for _, collection := range collections {

View File

@ -2158,6 +2158,7 @@ func (m *meta) SaveStatsResultSegment(oldSegmentID int64, result *workerpb.Stats
Statslogs: result.GetStatsLogs(),
TextStatsLogs: result.GetTextStatsLogs(),
Bm25Statslogs: result.GetBm25Logs(),
JsonKeyStats: result.GetJsonKeyStatsLogs(),
Deltalogs: nil,
CompactionFrom: []int64{oldSegmentID},
IsSorted: true,

View File

@ -43,6 +43,18 @@ func SetTextIndexLogs(textIndexLogs map[int64]*datapb.TextIndexStats) SegmentOpe
}
}
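// SetJsonKeyIndexLogs returns a SegmentOperator that merges the given per-field
// JSON key stats into the segment meta.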
func SetJsonKeyIndexLogs(jsonKeyIndexLogs map[int64]*datapb.JsonKeyStats) SegmentOperator {
return func(segment *SegmentInfo) bool {
if segment.JsonKeyStats == nil {
segment.JsonKeyStats = make(map[int64]*datapb.JsonKeyStats)
}
for field, logs := range jsonKeyIndexLogs {
segment.JsonKeyStats[field] = logs
}
return true
}
}
type segmentCriterion struct {
collectionID int64
channel string

View File

@ -2,7 +2,6 @@ package datacoord
import (
"context"
"github.com/milvus-io/milvus/pkg/v2/proto/indexpb"
"testing"
"time"
@ -33,6 +32,7 @@ import (
"github.com/milvus-io/milvus/pkg/v2/log"
"github.com/milvus-io/milvus/pkg/v2/mq/msgstream"
"github.com/milvus-io/milvus/pkg/v2/proto/datapb"
"github.com/milvus-io/milvus/pkg/v2/proto/indexpb"
"github.com/milvus-io/milvus/pkg/v2/proto/internalpb"
"github.com/milvus-io/milvus/pkg/v2/proto/workerpb"
"github.com/milvus-io/milvus/pkg/v2/util/merr"

View File

@ -244,10 +244,14 @@ func (st *statsTask) PreCheck(ctx context.Context, dependency *taskScheduler) bo
CollectionTtl: collTtl.Nanoseconds(),
CurrentTs: tsoutil.GetCurrentTime(),
// update version after check
TaskVersion: statsMeta.GetVersion() + 1,
BinlogMaxSize: Params.DataNodeCfg.BinLogMaxSize.GetAsUint64(),
StorageVersion: segment.StorageVersion,
TaskSlot: st.taskSlot,
TaskVersion: statsMeta.GetVersion() + 1,
BinlogMaxSize: Params.DataNodeCfg.BinLogMaxSize.GetAsUint64(),
StorageVersion: segment.StorageVersion,
TaskSlot: st.taskSlot,
EnableJsonKeyStats: Params.CommonCfg.EnabledJSONKeyStats.GetAsBool(),
JsonKeyStatsTantivyMemory: Params.DataCoordCfg.JSONKeyStatsMemoryBudgetInTantivy.GetAsInt64(),
JsonKeyStatsDataFormat: 1,
EnableJsonKeyStatsInSort: Params.DataCoordCfg.EnabledJSONKeyStatsInSort.GetAsBool(),
}
log.Info("stats task pre check successfully", zap.String("subJobType", st.subJobType.String()),
@ -373,6 +377,13 @@ func (st *statsTask) SetJobInfo(meta *meta) error {
zap.Int64("segmentID", st.segmentID), zap.Error(err))
return err
}
case indexpb.StatsSubJob_JsonKeyIndexJob:
err := meta.UpdateSegment(st.taskInfo.GetSegmentID(), SetJsonKeyIndexLogs(st.taskInfo.GetJsonKeyStatsLogs()))
if err != nil {
log.Warn("save json key index stats result failed", zap.Int64("taskId", st.taskID),
zap.Int64("segmentID", st.segmentID), zap.Error(err))
return err
}
case indexpb.StatsSubJob_BM25Job:
// TODO: support bm25 job
}

View File

@ -22,10 +22,9 @@ import (
"testing"
"time"
"go.uber.org/atomic"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/suite"
"go.uber.org/atomic"
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"

View File

@ -312,18 +312,19 @@ func (m *TaskManager) WaitTaskFinish() {
}
type StatsTaskInfo struct {
Cancel context.CancelFunc
State indexpb.JobState
FailReason string
CollID typeutil.UniqueID
PartID typeutil.UniqueID
SegID typeutil.UniqueID
InsertChannel string
NumRows int64
InsertLogs []*datapb.FieldBinlog
StatsLogs []*datapb.FieldBinlog
TextStatsLogs map[int64]*datapb.TextIndexStats
Bm25Logs []*datapb.FieldBinlog
Cancel context.CancelFunc
State indexpb.JobState
FailReason string
CollID typeutil.UniqueID
PartID typeutil.UniqueID
SegID typeutil.UniqueID
InsertChannel string
NumRows int64
InsertLogs []*datapb.FieldBinlog
StatsLogs []*datapb.FieldBinlog
TextStatsLogs map[int64]*datapb.TextIndexStats
Bm25Logs []*datapb.FieldBinlog
JSONKeyStatsLogs map[int64]*datapb.JsonKeyStats
}
func (m *TaskManager) LoadOrStoreStatsTask(clusterID string, taskID typeutil.UniqueID, info *StatsTaskInfo) *StatsTaskInfo {
@ -410,24 +411,46 @@ func (m *TaskManager) StoreStatsTextIndexResult(
}
}
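// StoreJSONKeyStatsResult records the JSON key stats build result, together with the
// segment identity, on the in-memory stats task info keyed by cluster ID and task ID.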
func (m *TaskManager) StoreJSONKeyStatsResult(
clusterID string,
taskID typeutil.UniqueID,
collID typeutil.UniqueID,
partID typeutil.UniqueID,
segID typeutil.UniqueID,
channel string,
jsonKeyIndexLogs map[int64]*datapb.JsonKeyStats,
) {
key := Key{ClusterID: clusterID, TaskID: taskID}
m.stateLock.Lock()
defer m.stateLock.Unlock()
if info, ok := m.statsTasks[key]; ok {
info.JSONKeyStatsLogs = jsonKeyIndexLogs
info.SegID = segID
info.CollID = collID
info.PartID = partID
info.InsertChannel = channel
}
}
func (m *TaskManager) GetStatsTaskInfo(clusterID string, taskID typeutil.UniqueID) *StatsTaskInfo {
m.stateLock.Lock()
defer m.stateLock.Unlock()
if info, ok := m.statsTasks[Key{ClusterID: clusterID, TaskID: taskID}]; ok {
return &StatsTaskInfo{
Cancel: info.Cancel,
State: info.State,
FailReason: info.FailReason,
CollID: info.CollID,
PartID: info.PartID,
SegID: info.SegID,
InsertChannel: info.InsertChannel,
NumRows: info.NumRows,
InsertLogs: info.InsertLogs,
StatsLogs: info.StatsLogs,
TextStatsLogs: info.TextStatsLogs,
Bm25Logs: info.Bm25Logs,
Cancel: info.Cancel,
State: info.State,
FailReason: info.FailReason,
CollID: info.CollID,
PartID: info.PartID,
SegID: info.SegID,
InsertChannel: info.InsertChannel,
NumRows: info.NumRows,
InsertLogs: info.InsertLogs,
StatsLogs: info.StatsLogs,
TextStatsLogs: info.TextStatsLogs,
Bm25Logs: info.Bm25Logs,
JSONKeyStatsLogs: info.JSONKeyStatsLogs,
}
}
return nil

View File

@ -93,6 +93,20 @@ func (s *statsTaskInfoSuite) Test_Methods() {
})
})
s.Run("storeStatsJsonIndexResult", func() {
s.manager.StoreJSONKeyStatsResult(s.cluster, s.taskID, 1, 2, 3, "ch1",
map[int64]*datapb.JsonKeyStats{
100: {
FieldID: 100,
Version: 1,
Files: []string{"file1"},
LogSize: 1024,
MemorySize: 1024,
JsonKeyStatsDataFormat: 1,
},
})
})
s.Run("getStatsTaskInfo", func() {
taskInfo := s.manager.GetStatsTaskInfo(s.cluster, s.taskID)

View File

@ -38,12 +38,14 @@ import (
"github.com/milvus-io/milvus/internal/util/indexcgowrapper"
"github.com/milvus-io/milvus/pkg/v2/common"
"github.com/milvus-io/milvus/pkg/v2/log"
"github.com/milvus-io/milvus/pkg/v2/metrics"
"github.com/milvus-io/milvus/pkg/v2/proto/datapb"
"github.com/milvus-io/milvus/pkg/v2/proto/indexcgopb"
"github.com/milvus-io/milvus/pkg/v2/proto/indexpb"
"github.com/milvus-io/milvus/pkg/v2/proto/workerpb"
_ "github.com/milvus-io/milvus/pkg/v2/util/funcutil"
"github.com/milvus-io/milvus/pkg/v2/util/metautil"
"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
"github.com/milvus-io/milvus/pkg/v2/util/timerecord"
"github.com/milvus-io/milvus/pkg/v2/util/tsoutil"
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
@ -311,6 +313,26 @@ func (st *statsTask) Execute(ctx context.Context) error {
return err
}
}
if (st.req.EnableJsonKeyStatsInSort && st.req.GetSubJobType() == indexpb.StatsSubJob_Sort) || st.req.GetSubJobType() == indexpb.StatsSubJob_JsonKeyIndexJob {
if !st.req.GetEnableJsonKeyStats() {
return nil
}
err = st.createJSONKeyStats(ctx,
st.req.GetStorageConfig(),
st.req.GetCollectionID(),
st.req.GetPartitionID(),
st.req.GetTargetSegmentID(),
st.req.GetTaskVersion(),
st.req.GetTaskID(),
st.req.GetJsonKeyStatsTantivyMemory(),
st.req.GetJsonKeyStatsDataFormat(),
insertLogs)
if err != nil {
log.Warn("stats wrong, failed to create json index", zap.Error(err))
return err
}
}
return nil
}
@ -466,3 +488,108 @@ func (st *statsTask) createTextIndex(ctx context.Context,
textIndexLogs)
return nil
}
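// createJSONKeyStats builds a Tantivy-backed JSON key stats index for every JSON field
// that enables it, uploads the index files through the CGO wrapper, and stores the
// resulting logs on the task manager.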
func (st *statsTask) createJSONKeyStats(ctx context.Context,
storageConfig *indexpb.StorageConfig,
collectionID int64,
partitionID int64,
segmentID int64,
version int64,
taskID int64,
tantivyMemory int64,
jsonKeyStatsDataFormat int64,
insertBinlogs []*datapb.FieldBinlog,
) error {
log := log.Ctx(ctx).With(
zap.String("clusterID", st.req.GetClusterID()),
zap.Int64("taskID", st.req.GetTaskID()),
zap.Int64("collectionID", st.req.GetCollectionID()),
zap.Int64("partitionID", st.req.GetPartitionID()),
zap.Int64("segmentID", st.req.GetSegmentID()),
zap.Any("statsJobType", st.req.GetSubJobType()),
zap.Int64("jsonKeyStatsDataFormat", jsonKeyStatsDataFormat),
)
if jsonKeyStatsDataFormat != 1 {
log.Info("create json key index failed dataformat invalid")
return nil
}
fieldBinlogs := lo.GroupBy(insertBinlogs, func(binlog *datapb.FieldBinlog) int64 {
return binlog.GetFieldID()
})
getInsertFiles := func(fieldID int64) ([]string, error) {
binlogs, ok := fieldBinlogs[fieldID]
if !ok {
return nil, fmt.Errorf("field binlog not found for field %d", fieldID)
}
result := make([]string, 0, len(binlogs))
for _, binlog := range binlogs {
for _, file := range binlog.GetBinlogs() {
result = append(result, metautil.BuildInsertLogPath(storageConfig.GetRootPath(), collectionID, partitionID, segmentID, fieldID, file.GetLogID()))
}
}
return result, nil
}
newStorageConfig, err := ParseStorageConfig(storageConfig)
if err != nil {
return err
}
jsonKeyIndexStats := make(map[int64]*datapb.JsonKeyStats)
for _, field := range st.req.GetSchema().GetFields() {
h := typeutil.CreateFieldSchemaHelper(field)
if !h.EnableJSONKeyStatsIndex() {
continue
}
log.Info("field enable json key index, ready to create json key index", zap.Int64("field id", field.GetFieldID()))
files, err := getInsertFiles(field.GetFieldID())
if err != nil {
return err
}
buildIndexParams := &indexcgopb.BuildIndexInfo{
BuildID: taskID,
CollectionID: collectionID,
PartitionID: partitionID,
SegmentID: segmentID,
IndexVersion: version,
InsertFiles: files,
FieldSchema: field,
StorageConfig: newStorageConfig,
JsonKeyStatsTantivyMemory: tantivyMemory,
}
uploaded, err := indexcgowrapper.CreateJSONKeyStats(ctx, buildIndexParams)
if err != nil {
return err
}
jsonKeyIndexStats[field.GetFieldID()] = &datapb.JsonKeyStats{
FieldID: field.GetFieldID(),
Version: version,
BuildID: taskID,
Files: lo.Keys(uploaded),
JsonKeyStatsDataFormat: jsonKeyStatsDataFormat,
}
log.Info("field enable json key index, create json key index done",
zap.Int64("field id", field.GetFieldID()),
zap.Strings("files", lo.Keys(uploaded)),
)
}
totalElapse := st.tr.RecordSpan()
st.manager.StoreJSONKeyStatsResult(st.req.GetClusterID(),
st.req.GetTaskID(),
st.req.GetCollectionID(),
st.req.GetPartitionID(),
st.req.GetTargetSegmentID(),
st.req.GetInsertChannel(),
jsonKeyIndexStats)
metrics.DataNodeBuildJSONStatsLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10)).Observe(totalElapse.Seconds())
log.Info("create json key index done",
zap.Int64("target segmentID", st.req.GetTargetSegmentID()),
zap.Duration("total elapse", totalElapse))
return nil
}

View File

@ -29,12 +29,12 @@ import "C"
import (
"github.com/cockroachdb/errors"
"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/pkg/v2/common"
"github.com/milvus-io/milvus/pkg/v2/util/hardware"
"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
)
func getCurrentIndexVersion(v int32) int32 {

View File

@ -462,18 +462,19 @@ func (node *DataNode) QueryJobsV2(ctx context.Context, req *workerpb.QueryJobsV2
info := node.taskManager.GetStatsTaskInfo(req.GetClusterID(), taskID)
if info != nil {
results = append(results, &workerpb.StatsResult{
TaskID: taskID,
State: info.State,
FailReason: info.FailReason,
CollectionID: info.CollID,
PartitionID: info.PartID,
SegmentID: info.SegID,
Channel: info.InsertChannel,
InsertLogs: info.InsertLogs,
StatsLogs: info.StatsLogs,
TextStatsLogs: info.TextStatsLogs,
Bm25Logs: info.Bm25Logs,
NumRows: info.NumRows,
TaskID: taskID,
State: info.State,
FailReason: info.FailReason,
CollectionID: info.CollID,
PartitionID: info.PartID,
SegmentID: info.SegID,
Channel: info.InsertChannel,
InsertLogs: info.InsertLogs,
StatsLogs: info.StatsLogs,
TextStatsLogs: info.TextStatsLogs,
Bm25Logs: info.Bm25Logs,
NumRows: info.NumRows,
JsonKeyStatsLogs: info.JSONKeyStatsLogs,
})
}
}

View File

@ -540,22 +540,23 @@ func (s *IndexServiceSuite) Test_CreateStatsTask() {
s.Run("normal case", func() {
taskID := int64(100)
req := &workerpb.CreateStatsRequest{
ClusterID: "cluster2",
TaskID: taskID,
CollectionID: s.collID,
PartitionID: s.partID,
InsertChannel: "ch1",
SegmentID: s.segID,
InsertLogs: fieldBinlogs,
DeltaLogs: nil,
StorageConfig: s.storageConfig,
Schema: generateTestSchema(),
TargetSegmentID: s.segID + 1,
StartLogID: s.logID + 100,
EndLogID: s.logID + 200,
NumRows: s.numRows,
BinlogMaxSize: 131000,
SubJobType: indexpb.StatsSubJob_Sort,
ClusterID: "cluster2",
TaskID: taskID,
CollectionID: s.collID,
PartitionID: s.partID,
InsertChannel: "ch1",
SegmentID: s.segID,
InsertLogs: fieldBinlogs,
DeltaLogs: nil,
StorageConfig: s.storageConfig,
Schema: generateTestSchema(),
TargetSegmentID: s.segID + 1,
StartLogID: s.logID + 100,
EndLogID: s.logID + 200,
NumRows: s.numRows,
BinlogMaxSize: 131000,
SubJobType: indexpb.StatsSubJob_Sort,
EnableJsonKeyStats: false,
}
status, err := s.in.CreateJobV2(ctx, &workerpb.CreateJobV2Request{

View File

@ -1240,7 +1240,7 @@ func GenSimpleRetrievePlan(collection *segcore.CCollection) (*segcore.RetrievePl
return nil, err
}
plan, err2 := segcore.NewRetrievePlan(collection, planBytes, timestamp, 100)
plan, err2 := segcore.NewRetrievePlan(collection, planBytes, timestamp, 100, 0)
return plan, err2
}

View File

@ -3825,7 +3825,8 @@ func (node *Proxy) Query(ctx context.Context, request *milvuspb.QueryRequest) (*
commonpbutil.WithMsgType(commonpb.MsgType_Retrieve),
commonpbutil.WithSourceID(paramtable.GetNodeID()),
),
ReqID: paramtable.GetNodeID(),
ReqID: paramtable.GetNodeID(),
ConsistencyLevel: request.ConsistencyLevel,
},
request: request,
qc: node.queryCoord,

View File

@ -599,7 +599,7 @@ func (t *queryTask) queryShard(ctx context.Context, nodeID int64, qn types.Query
retrieveReq.MvccTimestamp = mvccTs
retrieveReq.GuaranteeTimestamp = mvccTs
}
retrieveReq.ConsistencyLevel = t.ConsistencyLevel
req := &querypb.QueryRequest{
Req: retrieveReq,
DmlChannels: []string{channel},

View File

@ -969,8 +969,9 @@ func (t *searchTask) Requery(span trace.Span) error {
commonpbutil.WithMsgType(commonpb.MsgType_Retrieve),
commonpbutil.WithSourceID(paramtable.GetNodeID()),
),
ReqID: paramtable.GetNodeID(),
PartitionIDs: t.GetPartitionIDs(), // use search partitionIDs
ReqID: paramtable.GetNodeID(),
PartitionIDs: t.GetPartitionIDs(), // use search partitionIDs
ConsistencyLevel: t.ConsistencyLevel,
},
request: queryReq,
plan: plan,

View File

@ -23,6 +23,7 @@ import (
"github.com/samber/lo"
"go.uber.org/zap"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/querycoordv2/meta"
"github.com/milvus-io/milvus/internal/querycoordv2/params"
"github.com/milvus-io/milvus/internal/querycoordv2/session"
@ -31,6 +32,7 @@ import (
"github.com/milvus-io/milvus/pkg/v2/log"
"github.com/milvus-io/milvus/pkg/v2/proto/indexpb"
"github.com/milvus-io/milvus/pkg/v2/proto/querypb"
"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
)
@ -89,20 +91,28 @@ func (c *IndexChecker) Check(ctx context.Context) []task.Task {
}
collection := c.meta.CollectionManager.GetCollection(ctx, collectionID)
schema := c.meta.CollectionManager.GetCollectionSchema(ctx, collectionID)
if collection == nil {
log.Warn("collection released during check index", zap.Int64("collection", collectionID))
continue
}
if schema == nil && paramtable.Get().CommonCfg.EnabledJSONKeyStats.GetAsBool() {
collectionSchema, err1 := c.broker.DescribeCollection(ctx, collectionID)
if err1 == nil {
schema = collectionSchema.GetSchema()
c.meta.PutCollectionSchema(ctx, collectionID, collectionSchema.GetSchema())
}
}
replicas := c.meta.ReplicaManager.GetByCollection(ctx, collectionID)
for _, replica := range replicas {
tasks = append(tasks, c.checkReplica(ctx, collection, replica, indexInfos)...)
tasks = append(tasks, c.checkReplica(ctx, collection, replica, indexInfos, schema)...)
}
}
return tasks
}
func (c *IndexChecker) checkReplica(ctx context.Context, collection *meta.Collection, replica *meta.Replica, indexInfos []*indexpb.IndexInfo) []task.Task {
func (c *IndexChecker) checkReplica(ctx context.Context, collection *meta.Collection, replica *meta.Replica, indexInfos []*indexpb.IndexInfo, schema *schemapb.CollectionSchema) []task.Task {
log := log.Ctx(ctx).With(
zap.Int64("collectionID", collection.GetCollectionID()),
)
@ -113,6 +123,9 @@ func (c *IndexChecker) checkReplica(ctx context.Context, collection *meta.Collec
roNodeSet := typeutil.NewUniqueSet(replica.GetRONodes()...)
targets := make(map[int64][]int64) // segmentID => FieldID
idSegmentsStats := make(map[int64]*meta.Segment)
targetsStats := make(map[int64][]int64) // segmentID => FieldID
for _, segment := range segments {
// skip update index in read only node
if roNodeSet.Contain(segment.Node) {
@ -120,9 +133,13 @@ func (c *IndexChecker) checkReplica(ctx context.Context, collection *meta.Collec
}
missing := c.checkSegment(segment, indexInfos)
missingStats := c.checkSegmentStats(segment, schema, collection.LoadFields)
if len(missing) > 0 {
targets[segment.GetID()] = missing
idSegments[segment.GetID()] = segment
} else if len(missingStats) > 0 {
targetsStats[segment.GetID()] = missingStats
idSegmentsStats[segment.GetID()] = segment
}
}
@ -150,6 +167,29 @@ func (c *IndexChecker) checkReplica(ctx context.Context, collection *meta.Collec
return c.createSegmentUpdateTask(ctx, idSegments[segmentID], replica)
})
segmentsStatsToUpdate := typeutil.NewSet[int64]()
for _, segmentIDs := range lo.Chunk(lo.Keys(idSegmentsStats), MaxSegmentNumPerGetIndexInfoRPC) {
segmentInfos, err := c.broker.GetSegmentInfo(ctx, segmentIDs...)
if err != nil {
log.Warn("failed to get SegmentInfo for segments", zap.Int64s("segmentIDs", segmentIDs), zap.Error(err))
continue
}
for _, segmentInfo := range segmentInfos {
fields := targetsStats[segmentInfo.ID]
missingFields := typeutil.NewSet(fields...)
for field := range segmentInfo.GetJsonKeyStats() {
if missingFields.Contain(field) {
segmentsStatsToUpdate.Insert(segmentInfo.ID)
}
}
}
}
tasksStats := lo.FilterMap(segmentsStatsToUpdate.Collect(), func(segmentID int64, _ int) (task.Task, bool) {
return c.createSegmentStatsUpdateTask(ctx, idSegmentsStats[segmentID], replica)
})
tasks = append(tasks, tasksStats...)
return tasks
}
@ -193,3 +233,58 @@ func (c *IndexChecker) createSegmentUpdateTask(ctx context.Context, segment *met
t.SetReason("missing index")
return t, true
}
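// checkSegmentStats returns the JSON fields that are requested for load and enable
// JSON key stats but are not yet present on the distributed segment.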
func (c *IndexChecker) checkSegmentStats(segment *meta.Segment, schema *schemapb.CollectionSchema, loadField []int64) (missFieldIDs []int64) {
var result []int64
if paramtable.Get().CommonCfg.EnabledJSONKeyStats.GetAsBool() {
if schema == nil {
log.Warn("schema released during check index", zap.Int64("collection", segment.GetCollectionID()))
return result
}
loadFieldMap := make(map[int64]struct{})
for _, v := range loadField {
loadFieldMap[v] = struct{}{}
}
jsonStatsFieldMap := make(map[int64]struct{})
for _, v := range segment.JSONIndexField {
jsonStatsFieldMap[v] = struct{}{}
}
for _, field := range schema.GetFields() {
// Check if the field exists in both loadFieldMap and jsonStatsFieldMap
h := typeutil.CreateFieldSchemaHelper(field)
if h.EnableJSONKeyStatsIndex() {
if _, ok := loadFieldMap[field.FieldID]; ok {
if _, ok := jsonStatsFieldMap[field.FieldID]; !ok {
result = append(result, field.FieldID)
}
}
}
}
}
return result
}
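// createSegmentStatsUpdateTask builds a low-priority StatsUpdate segment task that
// asks the query node to load the missing JSON key stats.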
func (c *IndexChecker) createSegmentStatsUpdateTask(ctx context.Context, segment *meta.Segment, replica *meta.Replica) (task.Task, bool) {
action := task.NewSegmentActionWithScope(segment.Node, task.ActionTypeStatsUpdate, segment.GetInsertChannel(), segment.GetID(), querypb.DataScope_Historical, int(segment.GetNumOfRows()))
t, err := task.NewSegmentTask(
ctx,
params.Params.QueryCoordCfg.SegmentTaskTimeout.GetAsDuration(time.Millisecond),
c.ID(),
segment.GetCollectionID(),
replica,
action,
)
if err != nil {
log.Warn("create segment stats update task failed",
zap.Int64("collection", segment.GetCollectionID()),
zap.String("channel", segment.GetInsertChannel()),
zap.Int64("node", segment.Node),
zap.Error(err),
)
return nil, false
}
t.SetPriority(task.TaskPriorityLow)
t.SetReason("missing json stats")
return t, true
}

View File

@ -24,6 +24,7 @@ import (
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/suite"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
etcdkv "github.com/milvus-io/milvus/internal/kv/etcd"
"github.com/milvus-io/milvus/internal/metastore/kv/querycoord"
"github.com/milvus-io/milvus/internal/querycoordv2/meta"
@ -97,6 +98,12 @@ func (suite *IndexCheckerSuite) TestLoadIndex() {
// meta
coll := utils.CreateTestCollection(1, 1)
coll.FieldIndexID = map[int64]int64{101: 1000}
coll.Schema = &schemapb.CollectionSchema{
Name: "test_loadJsonIndex",
Fields: []*schemapb.FieldSchema{
{FieldID: 101, DataType: schemapb.DataType_JSON, Name: "JSON"},
},
}
checker.meta.CollectionManager.PutCollection(ctx, coll)
checker.meta.ReplicaManager.Put(ctx, utils.CreateTestReplica(200, 1, []int64{1, 2}))
suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
@ -133,6 +140,8 @@ func (suite *IndexCheckerSuite) TestLoadIndex() {
},
}, nil)
suite.broker.EXPECT().GetSegmentInfo(mock.Anything, mock.Anything).
Return([]*datapb.SegmentInfo{}, nil).Maybe()
tasks := checker.Check(context.Background())
suite.Require().Len(tasks, 1)
@ -162,6 +171,12 @@ func (suite *IndexCheckerSuite) TestIndexInfoNotMatch() {
// meta
coll := utils.CreateTestCollection(1, 1)
coll.FieldIndexID = map[int64]int64{101: 1000}
coll.Schema = &schemapb.CollectionSchema{
Name: "test_loadJsonIndex",
Fields: []*schemapb.FieldSchema{
{FieldID: 101, DataType: schemapb.DataType_JSON, Name: "JSON"},
},
}
checker.meta.CollectionManager.PutCollection(ctx, coll)
checker.meta.ReplicaManager.Put(ctx, utils.CreateTestReplica(200, 1, []int64{1, 2}))
suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
@ -211,7 +226,8 @@ func (suite *IndexCheckerSuite) TestIndexInfoNotMatch() {
IndexID: 1000,
},
}, nil)
suite.broker.EXPECT().GetSegmentInfo(mock.Anything, mock.Anything).
Return([]*datapb.SegmentInfo{}, nil).Maybe()
tasks := checker.Check(context.Background())
suite.Require().Len(tasks, 0)
}
@ -223,6 +239,12 @@ func (suite *IndexCheckerSuite) TestGetIndexInfoFailed() {
// meta
coll := utils.CreateTestCollection(1, 1)
coll.FieldIndexID = map[int64]int64{101: 1000}
coll.Schema = &schemapb.CollectionSchema{
Name: "test_loadJsonIndex",
Fields: []*schemapb.FieldSchema{
{FieldID: 101, DataType: schemapb.DataType_JSON, Name: "JSON"},
},
}
checker.meta.CollectionManager.PutCollection(ctx, coll)
checker.meta.ReplicaManager.Put(ctx, utils.CreateTestReplica(200, 1, []int64{1, 2}))
suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
@ -251,7 +273,8 @@ func (suite *IndexCheckerSuite) TestGetIndexInfoFailed() {
IndexID: 1000,
},
}, nil)
suite.broker.EXPECT().GetSegmentInfo(mock.Anything, mock.Anything).
Return([]*datapb.SegmentInfo{}, nil).Maybe()
tasks := checker.Check(context.Background())
suite.Require().Len(tasks, 0)
}
@ -263,6 +286,12 @@ func (suite *IndexCheckerSuite) TestCreateNewIndex() {
// meta
coll := utils.CreateTestCollection(1, 1)
coll.FieldIndexID = map[int64]int64{101: 1000}
coll.Schema = &schemapb.CollectionSchema{
Name: "test_loadJsonIndex",
Fields: []*schemapb.FieldSchema{
{FieldID: 101, DataType: schemapb.DataType_JSON, Name: "JSON"},
},
}
checker.meta.CollectionManager.PutCollection(ctx, coll)
checker.meta.ReplicaManager.Put(ctx, utils.CreateTestReplica(200, 1, []int64{1, 2}))
suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
@ -317,13 +346,234 @@ func (suite *IndexCheckerSuite) TestCreateNewIndex() {
IndexFilePaths: []string{"index"},
},
}}, nil)
suite.broker.EXPECT().GetSegmentInfo(mock.Anything, mock.Anything).
Return([]*datapb.SegmentInfo{}, nil).Maybe()
tasks := checker.Check(context.Background())
suite.Len(tasks, 1)
suite.Len(tasks[0].Actions(), 1)
suite.Equal(tasks[0].Actions()[0].(*task.SegmentAction).Type(), task.ActionTypeUpdate)
}
func (suite *IndexCheckerSuite) TestLoadJsonIndex() {
checker := suite.checker
ctx := context.Background()
paramtable.Get().Save(paramtable.Get().CommonCfg.EnabledJSONKeyStats.Key, "true")
defer paramtable.Get().Reset(paramtable.Get().CommonCfg.EnabledJSONKeyStats.Key)
// meta
coll := utils.CreateTestCollection(1, 1)
coll.FieldIndexID = map[int64]int64{101: 1000}
coll.Schema = &schemapb.CollectionSchema{
Name: "test_loadJsonIndex",
Fields: []*schemapb.FieldSchema{
{FieldID: 101, DataType: schemapb.DataType_JSON, Name: "JSON"},
},
}
coll.LoadFields = []int64{101}
checker.meta.CollectionManager.PutCollection(ctx, coll)
checker.meta.ReplicaManager.Put(ctx, utils.CreateTestReplica(200, 1, []int64{1, 2}))
suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
NodeID: 1,
Address: "localhost",
Hostname: "localhost",
}))
suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
NodeID: 2,
Address: "localhost",
Hostname: "localhost",
}))
checker.meta.ResourceManager.HandleNodeUp(ctx, 1)
checker.meta.ResourceManager.HandleNodeUp(ctx, 2)
// dist
fieldIndexInfo := &querypb.FieldIndexInfo{
FieldID: 101,
IndexID: 1000,
EnableIndex: true,
}
indexInfo := make(map[int64]*querypb.FieldIndexInfo)
indexInfo[fieldIndexInfo.IndexID] = fieldIndexInfo
segment := utils.CreateTestSegment(1, 1, 2, 1, 1, "test-insert-channel")
segment.IndexInfo = indexInfo
checker.dist.SegmentDistManager.Update(1, segment)
// broker
suite.broker.EXPECT().ListIndexes(mock.Anything, mock.Anything).Call.Return(
func(ctx context.Context, collectionID int64) ([]*indexpb.IndexInfo, error) {
return []*indexpb.IndexInfo{
{
FieldID: 101,
IndexID: 1000,
},
}, nil
},
)
mockJSONKeyStats := map[int64]*datapb.JsonKeyStats{
101: {
FieldID: 101,
},
}
suite.broker.EXPECT().GetSegmentInfo(mock.Anything, mock.Anything).
Return([]*datapb.SegmentInfo{
{
ID: 2,
JsonKeyStats: mockJSONKeyStats,
},
}, nil).Maybe()
tasks := checker.Check(context.Background())
suite.Require().Len(tasks, 1)
t := tasks[0]
suite.Require().Len(t.Actions(), 1)
action, ok := t.Actions()[0].(*task.SegmentAction)
suite.Require().True(ok)
suite.EqualValues(200, t.ReplicaID())
suite.Equal(task.ActionTypeStatsUpdate, action.Type())
suite.EqualValues(2, action.GetSegmentID())
// test skip load json index for read only node
suite.nodeMgr.Stopping(1)
suite.nodeMgr.Stopping(2)
suite.meta.ResourceManager.HandleNodeStopping(ctx, 1)
suite.meta.ResourceManager.HandleNodeStopping(ctx, 2)
utils.RecoverAllCollection(suite.meta)
tasks = checker.Check(context.Background())
suite.Require().Len(tasks, 0)
}
func (suite *IndexCheckerSuite) TestJsonIndexNotMatch() {
checker := suite.checker
ctx := context.Background()
paramtable.Get().Save(paramtable.Get().CommonCfg.EnabledJSONKeyStats.Key, "true")
defer paramtable.Get().Reset(paramtable.Get().CommonCfg.EnabledJSONKeyStats.Key)
// meta
coll := utils.CreateTestCollection(1, 1)
coll.FieldIndexID = map[int64]int64{101: 1000}
coll.Schema = &schemapb.CollectionSchema{
Name: "test_loadJsonIndex",
Fields: []*schemapb.FieldSchema{
{FieldID: 101, DataType: schemapb.DataType_JSON, Name: "JSON"},
},
}
checker.meta.CollectionManager.PutCollection(ctx, coll)
checker.meta.ReplicaManager.Put(ctx, utils.CreateTestReplica(200, 1, []int64{1, 2}))
suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
NodeID: 1,
Address: "localhost",
Hostname: "localhost",
}))
suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
NodeID: 2,
Address: "localhost",
Hostname: "localhost",
}))
checker.meta.ResourceManager.HandleNodeUp(ctx, 1)
checker.meta.ResourceManager.HandleNodeUp(ctx, 2)
// dist
checker.dist.SegmentDistManager.Update(1, utils.CreateTestSegment(1, 1, 2, 1, 1, "test-insert-channel"))
// broker
suite.broker.EXPECT().ListIndexes(mock.Anything, mock.Anything).Call.Return(
func(ctx context.Context, collectionID int64) ([]*indexpb.IndexInfo, error) {
return []*indexpb.IndexInfo{
{
FieldID: 101,
IndexID: 1000,
},
}, nil
},
)
suite.broker.EXPECT().GetIndexInfo(mock.Anything, mock.Anything, mock.AnythingOfType("int64")).
Return(map[int64][]*querypb.FieldIndexInfo{2: {
{
FieldID: 101,
IndexID: 1000,
EnableIndex: false,
IndexFilePaths: []string{"index"},
},
}}, nil)
suite.broker.EXPECT().GetSegmentInfo(mock.Anything, mock.Anything).
Return([]*datapb.SegmentInfo{
{},
}, nil).Maybe()
tasks := checker.Check(context.Background())
suite.Require().Len(tasks, 0)
}
func (suite *IndexCheckerSuite) TestCreateNewJsonIndex() {
checker := suite.checker
ctx := context.Background()
paramtable.Get().Save(paramtable.Get().CommonCfg.EnabledJSONKeyStats.Key, "true")
defer paramtable.Get().Reset(paramtable.Get().CommonCfg.EnabledJSONKeyStats.Key)
// meta
coll := utils.CreateTestCollection(1, 1)
coll.FieldIndexID = map[int64]int64{101: 1000}
coll.LoadFields = []int64{101}
coll.Schema = &schemapb.CollectionSchema{
Name: "test_loadJsonIndex",
Fields: []*schemapb.FieldSchema{
{FieldID: 101, DataType: schemapb.DataType_JSON, Name: "JSON"},
},
}
checker.meta.CollectionManager.PutCollection(ctx, coll)
checker.meta.ReplicaManager.Put(ctx, utils.CreateTestReplica(200, 1, []int64{1, 2}))
suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
NodeID: 1,
Address: "localhost",
Hostname: "localhost",
}))
suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
NodeID: 2,
Address: "localhost",
Hostname: "localhost",
}))
checker.meta.ResourceManager.HandleNodeUp(ctx, 1)
checker.meta.ResourceManager.HandleNodeUp(ctx, 2)
// dist
fieldIndexInfo := &querypb.FieldIndexInfo{
FieldID: 101,
IndexID: 1000,
EnableIndex: true,
}
indexInfo := make(map[int64]*querypb.FieldIndexInfo)
indexInfo[fieldIndexInfo.IndexID] = fieldIndexInfo
segment := utils.CreateTestSegment(1, 1, 2, 1, 1, "test-insert-channel")
segment.IndexInfo = indexInfo
checker.dist.SegmentDistManager.Update(1, segment)
// broker
suite.broker.EXPECT().ListIndexes(mock.Anything, mock.Anything).Call.Return(
func(ctx context.Context, collectionID int64) ([]*indexpb.IndexInfo, error) {
return []*indexpb.IndexInfo{
{
FieldID: 101,
IndexID: 1000,
},
}, nil
},
)
mockJSONKeyStats := map[int64]*datapb.JsonKeyStats{
101: {
FieldID: 101,
},
}
suite.broker.EXPECT().GetSegmentInfo(mock.Anything, mock.Anything).
Return([]*datapb.SegmentInfo{
{
ID: 2,
JsonKeyStats: mockJSONKeyStats,
},
}, nil).Maybe()
tasks := checker.Check(context.Background())
suite.Len(tasks, 1)
suite.Len(tasks[0].Actions(), 1)
suite.Equal(tasks[0].Actions()[0].(*task.SegmentAction).Type(), task.ActionTypeStatsUpdate)
}
func TestIndexChecker(t *testing.T) {
suite.Run(t, new(IndexCheckerSuite))
}

View File

@ -173,6 +173,7 @@ func (dh *distHandler) updateSegmentsDistribution(ctx context.Context, resp *que
Version: s.GetVersion(),
LastDeltaTimestamp: s.GetLastDeltaTimestamp(),
IndexInfo: s.GetIndexInfo(),
JSONIndexField: s.GetFieldJsonIndexStats(),
})
}

View File

@ -222,6 +222,7 @@ func (job *LoadCollectionJob) Execute() error {
},
CreatedAt: time.Now(),
LoadSpan: sp,
Schema: job.collInfo.GetSchema(),
}
job.undo.IsNewCollection = true
err = job.meta.CollectionManager.PutCollection(job.ctx, collection, partitions...)
@ -426,6 +427,7 @@ func (job *LoadPartitionJob) Execute() error {
},
CreatedAt: time.Now(),
LoadSpan: sp,
Schema: job.collInfo.GetSchema(),
}
err = job.meta.CollectionManager.PutCollection(job.ctx, collection, partitions...)
if err != nil {

View File

@ -50,6 +50,7 @@ type Collection struct {
mut sync.RWMutex
refreshNotifier chan struct{}
LoadSpan trace.Span
Schema *schemapb.CollectionSchema
}
func (collection *Collection) SetRefreshNotifier(notifier chan struct{}) {
@ -85,6 +86,7 @@ func (collection *Collection) Clone() *Collection {
UpdatedAt: collection.UpdatedAt,
refreshNotifier: collection.refreshNotifier,
LoadSpan: collection.LoadSpan,
Schema: collection.Schema,
}
}
@ -238,6 +240,7 @@ func (m *CollectionManager) upgradeLoadFields(ctx context.Context, collection *q
err = m.putCollection(ctx, true, &Collection{
CollectionLoadInfo: collection,
LoadPercentage: 100,
Schema: resp.GetSchema(),
})
if err != nil {
return err
@ -253,6 +256,27 @@ func (m *CollectionManager) GetCollection(ctx context.Context, collectionID type
return m.collections[collectionID]
}
func (m *CollectionManager) GetCollectionSchema(ctx context.Context, collectionID typeutil.UniqueID) *schemapb.CollectionSchema {
m.rwmutex.RLock()
defer m.rwmutex.RUnlock()
collection, ok := m.collections[collectionID]
if !ok {
return nil
}
return collection.Schema
}
func (m *CollectionManager) PutCollectionSchema(ctx context.Context, collectionID typeutil.UniqueID, schema *schemapb.CollectionSchema) {
m.rwmutex.Lock()
defer m.rwmutex.Unlock()
collection, ok := m.collections[collectionID]
if !ok {
return
}
collection.Schema = schema
}
func (m *CollectionManager) GetPartition(ctx context.Context, partitionID typeutil.UniqueID) *Partition {
m.rwmutex.RLock()
defer m.rwmutex.RUnlock()

View File

@ -125,6 +125,7 @@ type Segment struct {
Version int64 // Version is the timestamp of loading segment
LastDeltaTimestamp uint64 // The timestamp of the last delta record
IndexInfo map[int64]*querypb.FieldIndexInfo // index info of loaded segment, indexID -> FieldIndexInfo
JSONIndexField []int64 // json index info of loaded segment
}
func SegmentFromInfo(info *datapb.SegmentInfo) *Segment {

View File

@ -33,12 +33,14 @@ const (
ActionTypeGrow ActionType = iota + 1
ActionTypeReduce
ActionTypeUpdate
ActionTypeStatsUpdate
)
var ActionTypeName = map[ActionType]string{
ActionTypeGrow: "Grow",
ActionTypeReduce: "Reduce",
ActionTypeUpdate: "Update",
ActionTypeGrow: "Grow",
ActionTypeReduce: "Reduce",
ActionTypeUpdate: "Update",
ActionTypeStatsUpdate: "StatsUpdate",
}
func (t ActionType) String() string {

View File

@ -156,7 +156,7 @@ func (ex *Executor) removeTask(task Task, step int) {
func (ex *Executor) executeSegmentAction(task *SegmentTask, step int) {
switch task.Actions()[step].Type() {
case ActionTypeGrow, ActionTypeUpdate:
case ActionTypeGrow, ActionTypeUpdate, ActionTypeStatsUpdate:
ex.loadSegment(task, step)
case ActionTypeReduce:
@ -469,6 +469,9 @@ func (ex *Executor) executeLeaderAction(task *LeaderTask, step int) {
case ActionTypeUpdate:
ex.updatePartStatsVersions(task, step)
case ActionTypeStatsUpdate:
ex.updatePartStatsVersions(task, step)
}
}

View File

@ -49,13 +49,15 @@ const (
TaskTypeReduce
TaskTypeMove
TaskTypeUpdate
TaskTypeStatsUpdate
)
var TaskTypeName = map[Type]string{
TaskTypeGrow: "Grow",
TaskTypeReduce: "Reduce",
TaskTypeMove: "Move",
TaskTypeUpdate: "Update",
TaskTypeGrow: "Grow",
TaskTypeReduce: "Reduce",
TaskTypeMove: "Move",
TaskTypeUpdate: "Update",
TaskTypeStatsUpdate: "StatsUpdate",
}
type Type int32

View File

@ -95,6 +95,8 @@ func GetTaskType(task Task) Type {
return TaskTypeReduce
case task.Actions()[0].Type() == ActionTypeUpdate:
return TaskTypeUpdate
case task.Actions()[0].Type() == ActionTypeStatsUpdate:
return TaskTypeStatsUpdate
}
return 0
}
@ -132,6 +134,10 @@ func packLoadSegmentRequest(
loadScope = querypb.LoadScope_Index
}
if action.Type() == ActionTypeStatsUpdate {
loadScope = querypb.LoadScope_Stats
}
if task.Source() == utils.LeaderChecker {
loadScope = querypb.LoadScope_Delta
}

View File

@ -74,22 +74,23 @@ func PackSegmentLoadInfo(segment *datapb.SegmentInfo, channelCheckpoint *msgpb.M
zap.Duration("tsLag", tsLag))
}
loadInfo := &querypb.SegmentLoadInfo{
SegmentID: segment.ID,
PartitionID: segment.PartitionID,
CollectionID: segment.CollectionID,
BinlogPaths: segment.Binlogs,
NumOfRows: segment.NumOfRows,
Statslogs: segment.Statslogs,
Deltalogs: segment.Deltalogs,
Bm25Logs: segment.Bm25Statslogs,
InsertChannel: segment.InsertChannel,
IndexInfos: indexes,
StartPosition: segment.GetStartPosition(),
DeltaPosition: channelCheckpoint,
Level: segment.GetLevel(),
StorageVersion: segment.GetStorageVersion(),
IsSorted: segment.GetIsSorted(),
TextStatsLogs: segment.GetTextStatsLogs(),
SegmentID: segment.ID,
PartitionID: segment.PartitionID,
CollectionID: segment.CollectionID,
BinlogPaths: segment.Binlogs,
NumOfRows: segment.NumOfRows,
Statslogs: segment.Statslogs,
Deltalogs: segment.Deltalogs,
Bm25Logs: segment.Bm25Statslogs,
InsertChannel: segment.InsertChannel,
IndexInfos: indexes,
StartPosition: segment.GetStartPosition(),
DeltaPosition: channelCheckpoint,
Level: segment.GetLevel(),
StorageVersion: segment.GetStorageVersion(),
IsSorted: segment.GetIsSorted(),
TextStatsLogs: segment.GetTextStatsLogs(),
JsonKeyStatsLogs: segment.GetJsonKeyStats(),
}
return loadInfo
}

View File

@ -175,6 +175,45 @@ func (node *QueryNode) loadIndex(ctx context.Context, req *querypb.LoadSegmentsR
return status
}
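// loadStats loads JSON key stats for the sealed segments referenced by the request;
// lazy-load segments are only marked for reload on their next access.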
func (node *QueryNode) loadStats(ctx context.Context, req *querypb.LoadSegmentsRequest) *commonpb.Status {
log := log.Ctx(ctx).With(
zap.Int64("collectionID", req.GetCollectionID()),
zap.Int64s("segmentIDs", lo.Map(req.GetInfos(), func(info *querypb.SegmentLoadInfo, _ int) int64 { return info.GetSegmentID() })),
)
status := merr.Success()
log.Info("start to load stats")
for _, info := range req.GetInfos() {
log := log.With(zap.Int64("segmentID", info.GetSegmentID()))
segment := node.manager.Segment.GetSealed(info.GetSegmentID())
if segment == nil {
log.Warn("segment not found for load stats operation")
continue
}
localSegment, ok := segment.(*segments.LocalSegment)
if !ok {
log.Warn("segment not local for load stats opeartion")
continue
}
if localSegment.IsLazyLoad() {
localSegment.SetLoadInfo(info)
localSegment.SetNeedUpdatedVersion(req.GetVersion())
node.manager.DiskCache.MarkItemNeedReload(ctx, localSegment.ID())
return nil
}
err := node.loader.LoadJSONIndex(ctx, localSegment, info)
if err != nil {
log.Warn("failed to load stats", zap.Error(err))
status = merr.Status(err)
break
}
}
return status
}
func (node *QueryNode) queryChannel(ctx context.Context, req *querypb.QueryRequest, channel string) (*internalpb.RetrieveResults, error) {
msgID := req.Req.Base.GetMsgID()
traceID := trace.SpanFromContext(ctx).SpanContext().TraceID()

Some files were not shown because too many files have changed in this diff.