mirror of https://github.com/milvus-io/milvus.git
enhance: rename tokenizer to analyzer and check analyzer params (#37478)
relate: https://github.com/milvus-io/milvus/issues/35853

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>

branch: pull/37565/head
parent: ff00a12805
commit: 12951f0abb
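For context, the user-facing effect of this change is that VARCHAR text fields are now declared with enable_analyzer / analyzer_params instead of enable_tokenizer / tokenizer_params, and analyzer params are checked when the collection is created. The short sketch below mirrors the updated pytest fixtures in this PR; it is an illustrative example rather than part of the commit, it assumes a pymilvus client around the version pinned in the updated requirements (2.5.0rc108), and the field names and dimension are made up.

# Minimal sketch of the renamed schema options (pymilvus-style API; names are illustrative).
from pymilvus import CollectionSchema, FieldSchema, DataType

analyzer_params = {"tokenizer": "standard"}  # previously passed as tokenizer_params

fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
    FieldSchema(
        name="text",
        dtype=DataType.VARCHAR,
        max_length=65535,
        enable_analyzer=True,   # previously enable_tokenizer=True
        enable_match=True,
        analyzer_params=analyzer_params,
    ),
    FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=128),
]
schema = CollectionSchema(fields, description="text match demo")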
@@ -20,7 +20,7 @@
namespace milvus {
TokenizerParams
ParseTokenizerParams(const TypeParams& params) {
auto iter = params.find("tokenizer_params");
auto iter = params.find("analyzer_params");
if (iter == params.end()) {
return "{}";
}

@@ -39,19 +39,19 @@ FieldMeta::enable_match() const {
}

bool
FieldMeta::enable_tokenizer() const {
FieldMeta::enable_analyzer() const {
if (!IsStringDataType(type_)) {
return false;
}
if (!string_info_.has_value()) {
return false;
}
return string_info_->enable_tokenizer;
return string_info_->enable_analyzer;
}

TokenizerParams
FieldMeta::get_tokenizer_params() const {
Assert(enable_tokenizer());
FieldMeta::get_analyzer_params() const {
Assert(enable_analyzer());
auto params = string_info_->params;
return ParseTokenizerParams(params);
}

@@ -109,7 +109,7 @@ FieldMeta::ParseFrom(const milvus::proto::schema::FieldSchema& schema_proto) {
return b;
};

bool enable_tokenizer = get_bool_value("enable_tokenizer");
bool enable_analyzer = get_bool_value("enable_analyzer");
bool enable_match = get_bool_value("enable_match");

return FieldMeta{name,

@@ -118,7 +118,7 @@ FieldMeta::ParseFrom(const milvus::proto::schema::FieldSchema& schema_proto) {
max_len,
nullable,
enable_match,
enable_tokenizer,
enable_analyzer,
type_map};
}

@@ -64,13 +64,13 @@ class FieldMeta {
int64_t max_length,
bool nullable,
bool enable_match,
bool enable_tokenizer,
bool enable_analyzer,
std::map<std::string, std::string>& params)
: name_(name),
id_(id),
type_(type),
string_info_(StringInfo{
max_length, enable_match, enable_tokenizer, std::move(params)}),
max_length, enable_match, enable_analyzer, std::move(params)}),
nullable_(nullable) {
Assert(IsStringDataType(type_));
}

@@ -125,10 +125,10 @@ class FieldMeta {
enable_match() const;

bool
enable_tokenizer() const;
enable_analyzer() const;

TokenizerParams
get_tokenizer_params() const;
get_analyzer_params() const;

std::optional<knowhere::MetricType>
get_metric_type() const {

@@ -203,7 +203,7 @@ class FieldMeta {
struct StringInfo {
int64_t max_length;
bool enable_match;
bool enable_tokenizer;
bool enable_analyzer;
std::map<std::string, std::string> params;
};
FieldName name_;

@@ -121,7 +121,7 @@ class Schema {
int64_t max_length,
bool nullable,
bool enable_match,
bool enable_tokenizer,
bool enable_analyzer,
std::map<std::string, std::string>& params) {
auto field_meta = FieldMeta(name,
id,

@@ -129,7 +129,7 @@ class Schema {
max_length,
nullable,
enable_match,
enable_tokenizer,
enable_analyzer,
params);
this->AddField(std::move(field_meta));
}
@@ -21,18 +21,18 @@ constexpr const char* TMP_TEXT_LOG_PREFIX = "/tmp/milvus/text-log/";

TextMatchIndex::TextMatchIndex(int64_t commit_interval_in_ms,
const char* tokenizer_name,
const char* tokenizer_params)
const char* analyzer_params)
: commit_interval_in_ms_(commit_interval_in_ms),
last_commit_time_(stdclock::now()) {
d_type_ = TantivyDataType::Text;
std::string field_name = "tmp_text_index";
wrapper_ = std::make_shared<TantivyIndexWrapper>(
field_name.c_str(), true, "", tokenizer_name, tokenizer_params);
field_name.c_str(), true, "", tokenizer_name, analyzer_params);
}

TextMatchIndex::TextMatchIndex(const std::string& path,
const char* tokenizer_name,
const char* tokenizer_params)
const char* analyzer_params)
: commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
last_commit_time_(stdclock::now()) {
path_ = path;

@@ -42,12 +42,12 @@ TextMatchIndex::TextMatchIndex(const std::string& path,
false,
path_.c_str(),
tokenizer_name,
tokenizer_params);
analyzer_params);
}

TextMatchIndex::TextMatchIndex(const storage::FileManagerContext& ctx,
const char* tokenizer_name,
const char* tokenizer_params)
const char* analyzer_params)
: commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
last_commit_time_(stdclock::now()) {
schema_ = ctx.fieldDataMeta.field_schema;

@@ -65,7 +65,7 @@ TextMatchIndex::TextMatchIndex(const storage::FileManagerContext& ctx,
false,
path_.c_str(),
tokenizer_name,
tokenizer_params);
analyzer_params);
}

TextMatchIndex::TextMatchIndex(const storage::FileManagerContext& ctx)

@@ -172,8 +172,8 @@ TextMatchIndex::CreateReader() {

void
TextMatchIndex::RegisterTokenizer(const char* tokenizer_name,
const char* tokenizer_params) {
wrapper_->register_tokenizer(tokenizer_name, tokenizer_params);
const char* analyzer_params) {
wrapper_->register_tokenizer(tokenizer_name, analyzer_params);
}

TargetBitmap

@@ -24,15 +24,15 @@ class TextMatchIndex : public InvertedIndexTantivy<std::string> {
// for growing segment.
explicit TextMatchIndex(int64_t commit_interval_in_ms,
const char* tokenizer_name,
const char* tokenizer_params);
const char* analyzer_params);
// for sealed segment.
explicit TextMatchIndex(const std::string& path,
const char* tokenizer_name,
const char* tokenizer_params);
const char* analyzer_params);
// for building index.
explicit TextMatchIndex(const storage::FileManagerContext& ctx,
const char* tokenizer_name,
const char* tokenizer_params);
const char* analyzer_params);
// for loading index
explicit TextMatchIndex(const storage::FileManagerContext& ctx);

@@ -64,7 +64,7 @@ class TextMatchIndex : public InvertedIndexTantivy<std::string> {
CreateReader();

void
RegisterTokenizer(const char* tokenizer_name, const char* tokenizer_params);
RegisterTokenizer(const char* tokenizer_name, const char* analyzer_params);

TargetBitmap
MatchQuery(const std::string& query);

@@ -284,7 +284,7 @@ BuildTextIndex(CBinarySet* c_binary_set,
auto index = std::make_unique<index::TextMatchIndex>(
fileManagerContext,
"milvus_tokenizer",
field_schema.get_tokenizer_params().c_str());
field_schema.get_analyzer_params().c_str());
index->Build(config);
auto binary =
std::make_unique<knowhere::BinarySet>(index->Upload(config));
@@ -1516,13 +1516,13 @@ ChunkedSegmentSealedImpl::CreateTextIndex(FieldId field_id) {
index = std::make_unique<index::TextMatchIndex>(
std::numeric_limits<int64_t>::max(),
"milvus_tokenizer",
field_meta.get_tokenizer_params().c_str());
field_meta.get_analyzer_params().c_str());
} else {
// build text index using mmap.
index = std::make_unique<index::TextMatchIndex>(
cfg.GetMmapPath(),
"milvus_tokenizer",
field_meta.get_tokenizer_params().c_str());
field_meta.get_analyzer_params().c_str());
}

{

@@ -1572,7 +1572,7 @@ ChunkedSegmentSealedImpl::CreateTextIndex(FieldId field_id) {
index->Reload();

index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params().c_str());
field_meta.get_analyzer_params().c_str());

text_indexes_[field_id] = std::move(index);
}

@@ -1583,7 +1583,7 @@ ChunkedSegmentSealedImpl::LoadTextIndex(
std::unique_lock lck(mutex_);
const auto& field_meta = schema_->operator[](field_id);
index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params().c_str());
field_meta.get_analyzer_params().c_str());
text_indexes_[field_id] = std::move(index);
}

@@ -859,11 +859,11 @@ SegmentGrowingImpl::CreateTextIndex(FieldId field_id) {
"cannot create text index on non-string type");
// todo: make this(200) configurable.
auto index = std::make_unique<index::TextMatchIndex>(
200, "milvus_tokenizer", field_meta.get_tokenizer_params().c_str());
200, "milvus_tokenizer", field_meta.get_analyzer_params().c_str());
index->Commit();
index->CreateReader();
index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params().c_str());
field_meta.get_analyzer_params().c_str());
text_indexes_[field_id] = std::move(index);
}

@@ -2014,13 +2014,13 @@ SegmentSealedImpl::CreateTextIndex(FieldId field_id) {
index = std::make_unique<index::TextMatchIndex>(
std::numeric_limits<int64_t>::max(),
"milvus_tokenizer",
field_meta.get_tokenizer_params().c_str());
field_meta.get_analyzer_params().c_str());
} else {
// build text index using mmap.
index = std::make_unique<index::TextMatchIndex>(
cfg.GetMmapPath(),
"milvus_tokenizer",
field_meta.get_tokenizer_params().c_str());
field_meta.get_analyzer_params().c_str());
}

{

@@ -2069,7 +2069,7 @@ SegmentSealedImpl::CreateTextIndex(FieldId field_id) {
index->Reload();

index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params().c_str());
field_meta.get_analyzer_params().c_str());

text_indexes_[field_id] = std::move(index);
}

@@ -2080,7 +2080,7 @@ SegmentSealedImpl::LoadTextIndex(FieldId field_id,
std::unique_lock lck(mutex_);
const auto& field_meta = schema_->operator[](field_id);
index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params().c_str());
field_meta.get_analyzer_params().c_str());
text_indexes_[field_id] = std::move(index);
}
@@ -16,11 +16,11 @@ to_set(const RustArrayWrapper& w) {
int
main(int argc, char* argv[]) {
std::string tokenizer_name = "jieba";
std::map<std::string, std::string> tokenizer_params;
tokenizer_params["tokenizer"] = tokenizer_name;
std::map<std::string, std::string> analyzer_params;
analyzer_params["tokenizer"] = tokenizer_name;

auto text_index = TantivyIndexWrapper(
"text_demo", true, "", tokenizer_name.c_str(), tokenizer_params);
"text_demo", true, "", tokenizer_name.c_str(), analyzer_params);
auto write_single_text = [&text_index](const std::string& s,
int64_t offset) {
text_index.add_data(&s, 1, offset);

@@ -38,7 +38,7 @@ main(int argc, char* argv[]) {
}

text_index.create_reader();
text_index.register_tokenizer(tokenizer_name.c_str(), tokenizer_params);
text_index.register_tokenizer(tokenizer_name.c_str(), analyzer_params);

{
auto result = to_set(text_index.match_query("北京"));

@@ -88,9 +88,7 @@ RustArray tantivy_regex_query(void *ptr, const char *pattern);

RustArray tantivy_match_query(void *ptr, const char *query);

void tantivy_register_tokenizer(void *ptr,
const char *tokenizer_name,
const char *tokenizer_params);
void tantivy_register_tokenizer(void *ptr, const char *tokenizer_name, const char *analyzer_params);

void *tantivy_create_index(const char *field_name,
TantivyDataType data_type,

@@ -144,7 +142,7 @@ void tantivy_index_add_multi_keywords(void *ptr,
void *tantivy_create_text_writer(const char *field_name,
const char *path,
const char *tokenizer_name,
const char *tokenizer_params,
const char *analyzer_params,
uintptr_t num_threads,
uintptr_t overall_memory_budget_in_bytes,
bool in_ram);

@@ -159,7 +157,7 @@ bool tantivy_token_stream_advance(void *token_stream);

const char *tantivy_token_stream_get_token(void *token_stream);

void *tantivy_create_tokenizer(const char *tokenizer_params);
void *tantivy_create_tokenizer(const char *analyzer_params);

void *tantivy_clone_tokenizer(void *ptr);

@@ -24,12 +24,12 @@ pub extern "C" fn tantivy_match_query(ptr: *mut c_void, query: *const c_char) ->
pub extern "C" fn tantivy_register_tokenizer(
ptr: *mut c_void,
tokenizer_name: *const c_char,
tokenizer_params: *const c_char,
analyzer_params: *const c_char,
) {
init_log();
let real = ptr as *mut IndexReaderWrapper;
let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name) };
let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
let params = unsafe{c_str_to_str(analyzer_params).to_string()};
let analyzer = create_tokenizer(&params);
match analyzer {
Ok(text_analyzer) => unsafe {

@@ -13,7 +13,7 @@ pub extern "C" fn tantivy_create_text_writer(
field_name: *const c_char,
path: *const c_char,
tokenizer_name: *const c_char,
tokenizer_params: *const c_char,
analyzer_params: *const c_char,
num_threads: usize,
overall_memory_budget_in_bytes: usize,
in_ram: bool,

@@ -22,7 +22,7 @@ pub extern "C" fn tantivy_create_text_writer(
let field_name_str = unsafe { CStr::from_ptr(field_name).to_str().unwrap() };
let path_str = unsafe { CStr::from_ptr(path).to_str().unwrap() };
let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name).to_str().unwrap() };
let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
let params = unsafe{c_str_to_str(analyzer_params).to_string()};
let analyzer = create_tokenizer(&params);
match analyzer {
Ok(text_analyzer) => {

@@ -9,9 +9,9 @@ use crate::{
};

#[no_mangle]
pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *const c_char) -> *mut c_void {
pub extern "C" fn tantivy_create_tokenizer(analyzer_params: *const c_char) -> *mut c_void {
init_log();
let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
let params = unsafe{c_str_to_str(analyzer_params).to_string()};
let analyzer = create_tokenizer(&params);
match analyzer {
Ok(text_analyzer) => create_binding(text_analyzer),
@@ -14,7 +14,7 @@ namespace milvus::tantivy {
using Map = std::map<std::string, std::string>;

static constexpr const char* DEFAULT_TOKENIZER_NAME = "milvus_tokenizer";
static const char* DEFAULT_TOKENIZER_PARAMS = "{}";
static const char* DEFAULT_analyzer_params = "{}";
static constexpr uintptr_t DEFAULT_NUM_THREADS = 4;
static constexpr uintptr_t DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES =
DEFAULT_NUM_THREADS * 15 * 1024 * 1024;

@@ -101,14 +101,14 @@ struct TantivyIndexWrapper {
bool in_ram,
const char* path,
const char* tokenizer_name = DEFAULT_TOKENIZER_NAME,
const char* tokenizer_params = DEFAULT_TOKENIZER_PARAMS,
const char* analyzer_params = DEFAULT_analyzer_params,
uintptr_t num_threads = DEFAULT_NUM_THREADS,
uintptr_t overall_memory_budget_in_bytes =
DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES) {
writer_ = tantivy_create_text_writer(field_name,
path,
tokenizer_name,
tokenizer_params,
analyzer_params,
num_threads,
overall_memory_budget_in_bytes,
in_ram);

@@ -132,10 +132,10 @@ struct TantivyIndexWrapper {

void
register_tokenizer(const char* tokenizer_name,
const char* tokenizer_params) {
const char* analyzer_params) {
if (reader_ != nullptr) {
tantivy_register_tokenizer(
reader_, tokenizer_name, tokenizer_params);
reader_, tokenizer_name, analyzer_params);
}
}

@@ -31,7 +31,7 @@ TEST(ValidateTextSchema, JieBa) {
milvus::proto::schema::FieldSchema schema;
{
auto kv = schema.add_type_params();
kv->set_key("tokenizer_params");
kv->set_key("analyzer_params");
kv->set_value(R"({"tokenizer": "jieba"})");
}

@@ -47,10 +47,10 @@ set_cmap(CMap m, const std::string& key, const std::string& value) {
}

TEST(CTokenizer, Default) {
auto tokenizer_params = R"({"tokenizer": "standard"})";
auto analyzer_params = R"({"tokenizer": "standard"})";
CTokenizer tokenizer;
{
auto status = create_tokenizer(tokenizer_params, &tokenizer);
auto status = create_tokenizer(analyzer_params, &tokenizer);
ASSERT_EQ(milvus::ErrorCode::Success, status.error_code);
}

@@ -84,9 +84,9 @@ TEST(ParseTokenizerParams, NoTokenizerParams) {
}

TEST(ParseTokenizerParams, Default) {
TypeParams params{{"tokenizer_params", R"({"tokenizer": "standard"})"}};
TypeParams params{{"analyzer_params", R"({"tokenizer": "standard"})"}};
auto p = ParseTokenizerParams(params);
ASSERT_EQ(params.at("tokenizer_params"), p);
ASSERT_EQ(params.at("analyzer_params"), p);
}

TEST(TextMatch, Index) {

@@ -248,8 +248,8 @@ TEST(TextMatch, SealedNaive) {
TEST(TextMatch, GrowingJieBa) {
auto schema = GenTestSchema({
{"enable_match", "true"},
{"enable_tokenizer", "true"},
{"tokenizer_params", R"({"tokenizer": "jieba"})"},
{"enable_analyzer", "true"},
{"analyzer_params", R"({"tokenizer": "jieba"})"},
});
auto seg = CreateGrowingSegment(schema, empty_index_meta);
std::vector<std::string> raw_str = {"青铜时代", "黄金时代"};

@@ -325,8 +325,8 @@ TEST(TextMatch, GrowingJieBa) {
TEST(TextMatch, SealedJieBa) {
auto schema = GenTestSchema({
{"enable_match", "true"},
{"enable_tokenizer", "true"},
{"tokenizer_params", R"({"tokenizer": "jieba"})"},
{"enable_analyzer", "true"},
{"analyzer_params", R"({"tokenizer": "jieba"})"},
});
auto seg = CreateSealedSegment(schema, empty_index_meta);
std::vector<std::string> raw_str = {"青铜时代", "黄金时代"};
@@ -58,7 +58,7 @@ func (s *jobManagerSuite) TestJobManager_triggerStatsTaskLoop() {
Key: "enable_match", Value: "true",
},
{
Key: "enable_tokenizer", Value: "true",
Key: "enable_analyzer", Value: "true",
},
},
},

@@ -1328,7 +1328,11 @@ func (h *HandlersV2) createCollection(ctx context.Context, c *gin.Context, anyRe
}
}
for key, fieldParam := range field.ElementTypeParams {
fieldSchema.TypeParams = append(fieldSchema.TypeParams, &commonpb.KeyValuePair{Key: key, Value: fmt.Sprintf("%v", fieldParam)})
value, err := getElementTypeParams(fieldParam)
if err != nil {
return nil, err
}
fieldSchema.TypeParams = append(fieldSchema.TypeParams, &commonpb.KeyValuePair{Key: key, Value: value})
}
if lo.Contains(allOutputFields, field.FieldName) {
fieldSchema.IsFunctionOutput = true

@@ -1510,6 +1510,18 @@ func convertToExtraParams(indexParam IndexParam) ([]*commonpb.KeyValuePair, erro
return params, nil
}

func getElementTypeParams(param interface{}) (string, error) {
if str, ok := param.(string); ok {
return str, nil
}

jsonBytes, err := json.Marshal(param)
if err != nil {
return "", err
}
return string(jsonBytes), nil
}

func MetricsHandlerFunc(c *gin.Context) {
path := c.Request.URL.Path
metrics.RestfulFunctionCall.WithLabelValues(

@@ -47,7 +47,7 @@ func TestEmbeddingNode_BM25_Operator(t *testing.T) {
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{
Key: "enable_tokenizer",
Key: "enable_analyzer",
Value: "true",
},
},

@@ -1250,7 +1250,7 @@ func TestCatalog_CreateCollection(t *testing.T) {
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{
Key: "enable_tokenizer",
Key: "enable_analyzer",
Value: "true",
},
},

@@ -1354,7 +1354,7 @@ func TestCatalog_DropCollection(t *testing.T) {
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{
Key: "enable_tokenizer",
Key: "enable_analyzer",
Value: "true",
},
},

@@ -407,7 +407,7 @@ func (t *createCollectionTask) PreExecute(ctx context.Context) error {
return err
}

if err := ctokenizer.ValidateTextSchema(field); err != nil {
if err := ctokenizer.ValidateTextSchema(field, wasBm25FunctionInputField(t.schema, field)); err != nil {
return err
}
}

@@ -3133,7 +3133,7 @@ func TestCreateCollectionTaskWithPartitionKey(t *testing.T) {
Value: strconv.Itoa(testMaxVarCharLength),
},
{
Key: "enable_tokenizer",
Key: "enable_analyzer",
Value: "true",
},
},

@@ -697,6 +697,15 @@ func checkFunctionOutputField(function *schemapb.FunctionSchema, fields []*schem
return nil
}

func wasBm25FunctionInputField(coll *schemapb.CollectionSchema, field *schemapb.FieldSchema) bool {
for _, fun := range coll.GetFunctions() {
if fun.GetType() == schemapb.FunctionType_BM25 && field.GetName() == fun.GetInputFieldNames()[0] {
return true
}
}
return false
}

func checkFunctionInputField(function *schemapb.FunctionSchema, fields []*schemapb.FieldSchema) error {
switch function.GetType() {
case schemapb.FunctionType_BM25:

@@ -705,8 +714,8 @@ func checkFunctionInputField(function *schemapb.FunctionSchema, fields []*schema
len(fields), fields[0].DataType.String())
}
h := typeutil.CreateFieldSchemaHelper(fields[0])
if !h.EnableTokenizer() {
return fmt.Errorf("BM25 function input field must set enable_tokenizer to true")
if !h.EnableAnalyzer() {
return fmt.Errorf("BM25 function input field must set enable_analyzer to true")
}

default:
@@ -2638,7 +2638,7 @@ func TestValidateFunction(t *testing.T) {
t.Run("Valid function schema", func(t *testing.T) {
schema := &schemapb.CollectionSchema{
Fields: []*schemapb.FieldSchema{
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_tokenizer", Value: "true"}}},
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "true"}}},
{Name: "output_field", DataType: schemapb.DataType_SparseFloatVector},
},
Functions: []*schemapb.FunctionSchema{

@@ -2657,7 +2657,7 @@ func TestValidateFunction(t *testing.T) {
t.Run("Invalid function schema - duplicate function names", func(t *testing.T) {
schema := &schemapb.CollectionSchema{
Fields: []*schemapb.FieldSchema{
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_tokenizer", Value: "true"}}},
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "true"}}},
{Name: "output_field", DataType: schemapb.DataType_SparseFloatVector},
},
Functions: []*schemapb.FunctionSchema{

@@ -2702,7 +2702,7 @@ func TestValidateFunction(t *testing.T) {
t.Run("Invalid function schema - output field not found", func(t *testing.T) {
schema := &schemapb.CollectionSchema{
Fields: []*schemapb.FieldSchema{
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_tokenizer", Value: "true"}}},
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "true"}}},
},
Functions: []*schemapb.FunctionSchema{
{

@@ -2721,7 +2721,7 @@ func TestValidateFunction(t *testing.T) {
t.Run("Invalid function schema - nullable input field", func(t *testing.T) {
schema := &schemapb.CollectionSchema{
Fields: []*schemapb.FieldSchema{
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_tokenizer", Value: "true"}}, Nullable: true},
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "true"}}, Nullable: true},
{Name: "output_field", DataType: schemapb.DataType_SparseFloatVector},
},
Functions: []*schemapb.FunctionSchema{

@@ -2741,7 +2741,7 @@ func TestValidateFunction(t *testing.T) {
t.Run("Invalid function schema - output field is primary key", func(t *testing.T) {
schema := &schemapb.CollectionSchema{
Fields: []*schemapb.FieldSchema{
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_tokenizer", Value: "true"}}},
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "true"}}},
{Name: "output_field", DataType: schemapb.DataType_SparseFloatVector, IsPrimaryKey: true},
},
Functions: []*schemapb.FunctionSchema{

@@ -2761,7 +2761,7 @@ func TestValidateFunction(t *testing.T) {
t.Run("Invalid function schema - output field is partition key", func(t *testing.T) {
schema := &schemapb.CollectionSchema{
Fields: []*schemapb.FieldSchema{
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_tokenizer", Value: "true"}}},
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "true"}}},
{Name: "output_field", DataType: schemapb.DataType_SparseFloatVector, IsPartitionKey: true},
},
Functions: []*schemapb.FunctionSchema{

@@ -2781,7 +2781,7 @@ func TestValidateFunction(t *testing.T) {
t.Run("Invalid function schema - output field is clustering key", func(t *testing.T) {
schema := &schemapb.CollectionSchema{
Fields: []*schemapb.FieldSchema{
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_tokenizer", Value: "true"}}},
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "true"}}},
{Name: "output_field", DataType: schemapb.DataType_SparseFloatVector, IsClusteringKey: true},
},
Functions: []*schemapb.FunctionSchema{

@@ -2801,7 +2801,7 @@ func TestValidateFunction(t *testing.T) {
t.Run("Invalid function schema - nullable output field", func(t *testing.T) {
schema := &schemapb.CollectionSchema{
Fields: []*schemapb.FieldSchema{
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_tokenizer", Value: "true"}}},
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "true"}}},
{Name: "output_field", DataType: schemapb.DataType_SparseFloatVector, Nullable: true},
},
Functions: []*schemapb.FunctionSchema{

@@ -2827,7 +2827,7 @@ func TestValidateFunctionInputField(t *testing.T) {
fields := []*schemapb.FieldSchema{
{
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{{Key: "enable_tokenizer", Value: "true"}},
TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "true"}},
},
}
err := checkFunctionInputField(function, fields)

@@ -2854,7 +2854,7 @@ func TestValidateFunctionInputField(t *testing.T) {
fields := []*schemapb.FieldSchema{
{
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{{Key: "enable_tokenizer", Value: "false"}},
TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "false"}},
},
}
err := checkFunctionInputField(function, fields)

@@ -2868,11 +2868,11 @@ func TestValidateFunctionInputField(t *testing.T) {
fields := []*schemapb.FieldSchema{
{
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{{Key: "enable_tokenizer", Value: "true"}},
TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "true"}},
},
{
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{{Key: "enable_tokenizer", Value: "true"}},
TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "true"}},
},
}
err := checkFunctionInputField(function, fields)
@@ -17,14 +17,14 @@ import (
"github.com/milvus-io/milvus/pkg/util/typeutil"
)

func ValidateTextSchema(fieldSchema *schemapb.FieldSchema) error {
func ValidateTextSchema(fieldSchema *schemapb.FieldSchema, EnableBM25 bool) error {
h := typeutil.CreateFieldSchemaHelper(fieldSchema)
if !h.EnableMatch() {
if !h.EnableMatch() && !EnableBM25 {
return nil
}

if !h.EnableTokenizer() {
return fmt.Errorf("field %s is set to enable match but not enable tokenizer", fieldSchema.Name)
if !h.EnableAnalyzer() {
return fmt.Errorf("field %s is set to enable match or bm25 function but not enable analyzer", fieldSchema.Name)
}

bs, err := proto.Marshal(fieldSchema)

@@ -16,7 +16,7 @@ func TestValidateEmptyTextSchema(t *testing.T) {
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{},
}
assert.Nil(t, ValidateTextSchema(fs))
assert.Nil(t, ValidateTextSchema(fs, false))
}

func TestValidateTextSchema(t *testing.T) {

@@ -33,7 +33,7 @@ func TestValidateTextSchema(t *testing.T) {
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{Key: "enable_match", Value: "true"},
{Key: "tokenizer_params", Value: `{"tokenizer": "standard"}`},
{Key: "analyzer_params", Value: `{"tokenizer": "standard"}`},
},
},
{

@@ -41,32 +41,32 @@ func TestValidateTextSchema(t *testing.T) {
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{Key: "enable_match", Value: "true"},
{Key: "tokenizer_params", Value: `{"tokenizer": "standard"}`},
{Key: "analyzer_params", Value: `{"tokenizer": "standard"}`},
},
},
}

for idx, tt := range tests {
t.Run(fmt.Sprintf("enable_tokenizer not set %d", idx), func(t *testing.T) {
err := ValidateTextSchema(tt)
t.Run(fmt.Sprintf("enable_analyzer not set %d", idx), func(t *testing.T) {
err := ValidateTextSchema(tt, false)
assert.NotNil(t, err)
})
}

for idx, tt := range tests {
t.Run(fmt.Sprintf("enable_tokenizer set to false %d", idx), func(t *testing.T) {
t.Run(fmt.Sprintf("enable_analyzer set to false %d", idx), func(t *testing.T) {
tt.TypeParams = append(tt.TypeParams, &commonpb.KeyValuePair{
Key: "enable_tokenizer",
Key: "enable_analyzer",
Value: "false",
})
err := ValidateTextSchema(tt)
err := ValidateTextSchema(tt, false)
assert.NotNil(t, err)
})
}
for idx, tt := range tests {
t.Run(fmt.Sprintf("enable_tokenizer set to true %d", idx), func(t *testing.T) {
t.Run(fmt.Sprintf("enable_analyzer set to true %d", idx), func(t *testing.T) {
tt.TypeParams[len(tt.TypeParams)-1].Value = "true"
err := ValidateTextSchema(tt)
err := ValidateTextSchema(tt, false)
assert.Nil(t, err)
})
}

@@ -30,6 +30,8 @@ import (
"github.com/milvus-io/milvus/pkg/util/typeutil"
)

const analyzerParams = "analyzer_params"

// BM25 Runner
// Input: string
// Output: map[uint32]float32

@@ -40,9 +42,9 @@ type BM25FunctionRunner struct {
concurrency int
}

func getTokenizerParams(field *schemapb.FieldSchema) string {
func getAnalyzerParams(field *schemapb.FieldSchema) string {
for _, param := range field.GetTypeParams() {
if param.Key == "tokenizer_params" {
if param.Key == analyzerParams {
return param.Value
}
}

@@ -66,7 +68,7 @@ func NewBM25FunctionRunner(coll *schemapb.CollectionSchema, schema *schemapb.Fun
}

if field.GetFieldID() == schema.GetInputFieldIds()[0] {
params = getTokenizerParams(field)
params = getAnalyzerParams(field)
}
}

@@ -53,11 +53,11 @@ func (h *FieldSchemaHelper) EnableMatch() bool {
return err == nil && enable
}

func (h *FieldSchemaHelper) EnableTokenizer() bool {
func (h *FieldSchemaHelper) EnableAnalyzer() bool {
if !IsStringType(h.schema.GetDataType()) {
return false
}
s, err := h.typeParams.Get("enable_tokenizer")
s, err := h.typeParams.Get("enable_analyzer")
if err != nil {
return false
}

@@ -777,15 +777,15 @@ def gen_default_collection_schema(description=ct.default_desc, primary_field=ct.

def gen_all_datatype_collection_schema(description=ct.default_desc, primary_field=ct.default_int64_field_name,
auto_id=False, dim=ct.default_dim, enable_dynamic_field=True, **kwargs):
tokenizer_params = {
analyzer_params = {
"tokenizer": "standard",
}
fields = [
gen_int64_field(),
gen_float_field(),
gen_string_field(),
gen_string_field(name="text", max_length=2000, enable_tokenizer=True, enable_match=True,
tokenizer_params=tokenizer_params),
gen_string_field(name="text", max_length=2000, enable_analyzer=True, enable_match=True,
analyzer_params=analyzer_params),
gen_json_field(),
gen_array_field(name="array_int", element_type=DataType.INT64),
gen_array_field(name="array_float", element_type=DataType.FLOAT),

@@ -1799,7 +1799,7 @@ def get_text_field_name(schema=None):
schema = gen_default_collection_schema()
fields = schema.fields
for field in fields:
if field.dtype == DataType.VARCHAR and field.params.get("enable_tokenizer", False):
if field.dtype == DataType.VARCHAR and field.params.get("enable_analyzer", False):
return field.name
return None

@@ -1900,7 +1900,7 @@ def gen_varchar_data(length: int, nb: int, text_mode=False):
def gen_data_by_collection_field(field, nb=None, start=None):
# if nb is None, return one data, else return a list of data
data_type = field.dtype
enable_tokenizer = field.params.get("enable_tokenizer", False)
enable_analyzer = field.params.get("enable_analyzer", False)
if data_type == DataType.BOOL:
if nb is None:
return random.choice([True, False])

@@ -1936,8 +1936,8 @@ def gen_data_by_collection_field(field, nb=None, start=None):
max_length = min(20, max_length-1)
length = random.randint(0, max_length)
if nb is None:
return gen_varchar_data(length=length, nb=1, text_mode=enable_tokenizer)[0]
return gen_varchar_data(length=length, nb=nb, text_mode=enable_tokenizer)
return gen_varchar_data(length=length, nb=1, text_mode=enable_analyzer)[0]
return gen_varchar_data(length=length, nb=nb, text_mode=enable_analyzer)
if data_type == DataType.JSON:
if nb is None:
return {"name": fake.name(), "address": fake.address()}

@@ -27,8 +27,8 @@ pytest-parallel
pytest-random-order

# pymilvus
pymilvus==2.5.0rc106
pymilvus[bulk_writer]==2.5.0rc106
pymilvus==2.5.0rc108
pymilvus[bulk_writer]==2.5.0rc108

# for customize config test
python-benedict==0.24.3

@@ -771,7 +771,7 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
cf.gen_int64_field(name=df.int_field, nullable=nullable),
cf.gen_float_field(name=df.float_field, nullable=nullable),
cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key, nullable=nullable),
cf.gen_string_field(name=df.text_field, enable_tokenizer=True, enable_match=True, nullable=nullable),
cf.gen_string_field(name=df.text_field, enable_analyzer=True, enable_match=True, nullable=nullable),
cf.gen_json_field(name=df.json_field, nullable=nullable),
cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64, nullable=nullable),
cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),

@@ -938,7 +938,7 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
cf.gen_int64_field(name=df.int_field, nullable=nullable),
cf.gen_float_field(name=df.float_field),
cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key),
cf.gen_string_field(name=df.text_field, enable_tokenizer=True, enable_match=True, nullable=nullable),
cf.gen_string_field(name=df.text_field, enable_analyzer=True, enable_match=True, nullable=nullable),
cf.gen_json_field(name=df.json_field),
cf.gen_float_vec_field(name=df.float_vec_field, dim=float_vec_field_dim),
cf.gen_binary_vec_field(name=df.binary_vec_field, dim=binary_vec_field_dim),

@@ -1091,7 +1091,7 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
cf.gen_int64_field(name=df.int_field, nullable=nullable),
cf.gen_float_field(name=df.float_field, nullable=nullable),
cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key, nullable=nullable),
cf.gen_string_field(name=df.text_field, enable_tokenizer=True, enable_match=True, nullable=nullable),
cf.gen_string_field(name=df.text_field, enable_analyzer=True, enable_match=True, nullable=nullable),
cf.gen_json_field(name=df.json_field, nullable=nullable),
cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64, nullable=nullable),
cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),
File diff suppressed because it is too large.
@@ -4452,7 +4452,7 @@ class TestQueryTextMatch(TestcaseBase):
3. verify the result
expected: text match successfully and result is correct
"""
tokenizer_params = {
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128

@@ -4462,34 +4462,34 @@ class TestQueryTextMatch(TestcaseBase):
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
is_partition_key=enable_partition_key,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
]

@@ -4589,7 +4589,7 @@ class TestQueryTextMatch(TestcaseBase):
3. verify the result
expected: text match successfully and result is correct
"""
tokenizer_params = {
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128

@@ -4599,34 +4599,34 @@ class TestQueryTextMatch(TestcaseBase):
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
is_partition_key=enable_partition_key,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
@@ -4723,7 +4723,7 @@ class TestQueryTextMatch(TestcaseBase):
3. verify the result
expected: get the correct token, text match successfully and result is correct
"""
tokenizer_params = {
analyzer_params = {
"tokenizer": "standard",
# "lowercase", "asciifolding", "alphanumonly" was system filter
"filter":["lowercase", "asciifolding", "alphanumonly",

@@ -4742,33 +4742,33 @@ class TestQueryTextMatch(TestcaseBase):
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
@@ -4843,7 +4843,7 @@ class TestQueryTextMatch(TestcaseBase):
3. verify the result
expected: query successfully and result is correct
"""
tokenizer_params = {
analyzer_params = {
"tokenizer": "standard",
}
# 1. initialize with data

@@ -4854,33 +4854,33 @@ class TestQueryTextMatch(TestcaseBase):
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
@@ -4957,7 +4957,7 @@ class TestQueryTextMatch(TestcaseBase):
3. verify the result
expected: query successfully and result is correct
"""
tokenizer_params = {
analyzer_params = {
"tokenizer": "standard",
}
# 1. initialize with data

@@ -4968,33 +4968,33 @@ class TestQueryTextMatch(TestcaseBase):
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
@@ -5100,7 +5100,7 @@ class TestQueryTextMatch(TestcaseBase):
"""

# 1. initialize with data
tokenizer_params = {
analyzer_params = {
"tokenizer": "standard",
}
# 1. initialize with data

@@ -5111,33 +5111,33 @@ class TestQueryTextMatch(TestcaseBase):
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
@@ -5245,7 +5245,7 @@ class TestQueryTextMatch(TestcaseBase):
"""
# 1. initialize with data
fake_en = Faker("en_US")
tokenizer_params = {
analyzer_params = {
"tokenizer": "standard",
}
dim = 128

@@ -5255,33 +5255,33 @@ class TestQueryTextMatch(TestcaseBase):
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
@@ -5362,7 +5362,7 @@ class TestQueryTextMatch(TestcaseBase):
# 1. initialize with data
fake_en = Faker("en_US")
analyzer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
dim = 128
default_fields = [

@@ -5375,7 +5375,7 @@ class TestQueryTextMatch(TestcaseBase):
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),

@@ -5383,7 +5383,7 @@ class TestQueryTextMatch(TestcaseBase):
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),

@@ -5391,7 +5391,7 @@ class TestQueryTextMatch(TestcaseBase):
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),

@@ -5399,7 +5399,7 @@ class TestQueryTextMatch(TestcaseBase):
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
@@ -5472,7 +5472,7 @@ class TestQueryTextMatch(TestcaseBase):
expected: text match successfully and result is correct
"""
# 1. initialize with data
tokenizer_params = {
analyzer_params = {
"tokenizer": "standard",
}
# 1. initialize with data

@@ -5483,33 +5483,33 @@ class TestQueryTextMatch(TestcaseBase):
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
@@ -5614,7 +5614,7 @@ class TestQueryTextMatch(TestcaseBase):
"""
# 1. initialize with data
analyzer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
# 1. initialize with data
dim = 128

@@ -5624,7 +5624,7 @@ class TestQueryTextMatch(TestcaseBase):
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
nullable=True,

@@ -5633,7 +5633,7 @@ class TestQueryTextMatch(TestcaseBase):
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
nullable=True,

@@ -5642,7 +5642,7 @@ class TestQueryTextMatch(TestcaseBase):
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
nullable=True,

@@ -5651,7 +5651,7 @@ class TestQueryTextMatch(TestcaseBase):
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
nullable=True,
@@ -5731,7 +5731,7 @@ class TestQueryTextMatchNegative(TestcaseBase):
 2. create collection
 expected: create collection failed and return error
 """
-tokenizer_params = {
+analyzer_params = {
 "tokenizer": "Unsupported",
 }
 dim = 128
@@ -5741,41 +5741,41 @@ class TestQueryTextMatchNegative(TestcaseBase):
 name="title",
 dtype=DataType.VARCHAR,
 max_length=65535,
-enable_tokenizer=True,
+enable_analyzer=True,
 enable_match=True,
-tokenizer_params=tokenizer_params,
+analyzer_params=analyzer_params,
 ),
 FieldSchema(
 name="overview",
 dtype=DataType.VARCHAR,
 max_length=65535,
-enable_tokenizer=True,
+enable_analyzer=True,
 enable_match=True,
-tokenizer_params=tokenizer_params,
+analyzer_params=analyzer_params,
 ),
 FieldSchema(
 name="genres",
 dtype=DataType.VARCHAR,
 max_length=65535,
-enable_tokenizer=True,
+enable_analyzer=True,
 enable_match=True,
-tokenizer_params=tokenizer_params,
+analyzer_params=analyzer_params,
 ),
 FieldSchema(
 name="producer",
 dtype=DataType.VARCHAR,
 max_length=65535,
-enable_tokenizer=True,
+enable_analyzer=True,
 enable_match=True,
-tokenizer_params=tokenizer_params,
+analyzer_params=analyzer_params,
 ),
 FieldSchema(
 name="cast",
 dtype=DataType.VARCHAR,
 max_length=65535,
-enable_tokenizer=True,
+enable_analyzer=True,
 enable_match=True,
-tokenizer_params=tokenizer_params,
+analyzer_params=analyzer_params,
 ),
 FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
 ]
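The negative test above exercises the parameter checking introduced by this change: with the rename, analyzer params are validated, so an unsupported tokenizer should be rejected when the collection is created. A hedged sketch of the scenario (the exact error surface is not shown in the diff):

from pymilvus import DataType, FieldSchema

# Sketch only: a schema carrying an unknown tokenizer name; creating a
# collection from it is expected to fail now that analyzer params are checked.
bad_field = FieldSchema(
    name="title",
    dtype=DataType.VARCHAR,
    max_length=65535,
    enable_analyzer=True,
    enable_match=True,
    analyzer_params={"tokenizer": "Unsupported"},
)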
@@ -5856,7 +5856,7 @@ class TestQueryFunction(TestcaseBase):
 expected: create collection failed and return error
 """
 analyzer_params = {
-"tokenizer": "default",
+"tokenizer": "standard",
 }
 dim = 128
 default_fields = [
@@ -5865,7 +5865,7 @@ class TestQueryFunction(TestcaseBase):
 name="title",
 dtype=DataType.VARCHAR,
 max_length=65535,
-enable_tokenizer=True,
+enable_analyzer=True,
 enable_match=True,
 analyzer_params=analyzer_params,
 ),
@@ -5873,14 +5873,14 @@ class TestQueryFunction(TestcaseBase):
 name="overview",
 dtype=DataType.VARCHAR,
 max_length=65535,
-enable_tokenizer=True,
+enable_analyzer=True,
 enable_match=True,
 analyzer_params=analyzer_params,
 ),
 FieldSchema(
 name="age",
 dtype=DataType.INT64,
-enable_tokenizer=True,
+enable_analyzer=True,
 enable_match=True,
 analyzer_params=analyzer_params,
 ),
@@ -13301,7 +13301,7 @@ class TestSearchWithTextMatchFilter(TestcaseBase):
 3. verify the result
 expected: text match successfully and result is correct
 """
-tokenizer_params = {
+analyzer_params = {
 "tokenizer": tokenizer,
 }
 dim = 128
@@ -13311,34 +13311,34 @@ class TestSearchWithTextMatchFilter(TestcaseBase):
 name="word",
 dtype=DataType.VARCHAR,
 max_length=65535,
-enable_tokenizer=True,
+enable_analyzer=True,
 enable_match=True,
 is_partition_key=enable_partition_key,
-tokenizer_params=tokenizer_params,
+analyzer_params=analyzer_params,
 ),
 FieldSchema(
 name="sentence",
 dtype=DataType.VARCHAR,
 max_length=65535,
-enable_tokenizer=True,
+enable_analyzer=True,
 enable_match=True,
-tokenizer_params=tokenizer_params,
+analyzer_params=analyzer_params,
 ),
 FieldSchema(
 name="paragraph",
 dtype=DataType.VARCHAR,
 max_length=65535,
-enable_tokenizer=True,
+enable_analyzer=True,
 enable_match=True,
-tokenizer_params=tokenizer_params,
+analyzer_params=analyzer_params,
 ),
 FieldSchema(
 name="text",
 dtype=DataType.VARCHAR,
 max_length=65535,
-enable_tokenizer=True,
+enable_analyzer=True,
 enable_match=True,
-tokenizer_params=tokenizer_params,
+analyzer_params=analyzer_params,
 ),
 FieldSchema(name="float32_emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
 FieldSchema(name="sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
@@ -13462,7 +13462,7 @@ class TestSearchWithTextMatchFilter(TestcaseBase):
 3. verify the result
 expected: text match successfully and result is correct
 """
-tokenizer_params = {
+analyzer_params = {
 "tokenizer": tokenizer,
 }
 dim = 128
@@ -13472,34 +13472,34 @@ class TestSearchWithTextMatchFilter(TestcaseBase):
 name="word",
 dtype=DataType.VARCHAR,
 max_length=65535,
-enable_tokenizer=True,
+enable_analyzer=True,
 enable_match=True,
 is_partition_key=enable_partition_key,
-tokenizer_params=tokenizer_params,
+analyzer_params=analyzer_params,
 ),
 FieldSchema(
 name="sentence",
 dtype=DataType.VARCHAR,
 max_length=65535,
-enable_tokenizer=True,
+enable_analyzer=True,
 enable_match=True,
-tokenizer_params=tokenizer_params,
+analyzer_params=analyzer_params,
 ),
 FieldSchema(
 name="paragraph",
 dtype=DataType.VARCHAR,
 max_length=65535,
-enable_tokenizer=True,
+enable_analyzer=True,
 enable_match=True,
-tokenizer_params=tokenizer_params,
+analyzer_params=analyzer_params,
 ),
 FieldSchema(
 name="text",
 dtype=DataType.VARCHAR,
 max_length=65535,
-enable_tokenizer=True,
+enable_analyzer=True,
 enable_match=True,
-tokenizer_params=tokenizer_params,
+analyzer_params=analyzer_params,
 ),
 FieldSchema(name="float32_emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
 FieldSchema(name="sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
@@ -6,7 +6,7 @@ pyyaml~=6.0
 numpy~=1.24.3
 allure-pytest>=2.8.18
 Faker==19.2.0
-pymilvus==2.4.0rc39
+pymilvus==2.5.0rc108
 scikit-learn~=1.1.3
 pytest-xdist==2.5.0
 minio==7.1.14
@@ -205,9 +205,10 @@ class TestCreateCollection(TestBase):
 "isClusteringKey": clustering_key_field == "book_category", "elementTypeParams": {}},
 {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
 {"fieldName": "document_content", "dataType": "VarChar",
-"elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
+"elementTypeParams": {"max_length": "1000",
+"enable_analyzer": True,
 "analyzer_params": {
-"tokenizer": "default"
+"tokenizer": "standard"
 },
 "enable_match": True}},
 {"fieldName": "json", "dataType": "JSON", "elementTypeParams": {}},
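For the RESTful v2 test above, a hedged sketch of the field entry after the rename; the surrounding request payload (collection name, other fields) is assumed rather than copied from the diff:

# Sketch only: the VarChar field definition sent to the v2 create-collection API,
# with the analyzer options carried by enable_analyzer / analyzer_params.
document_content_field = {
    "fieldName": "document_content",
    "dataType": "VarChar",
    "elementTypeParams": {
        "max_length": "1000",
        "enable_analyzer": True,                       # formerly enable_tokenizer
        "analyzer_params": {"tokenizer": "standard"},  # formerly "default"
        "enable_match": True,
    },
}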
@@ -255,7 +255,7 @@ class TestCreateIndex(TestBase):
 @pytest.mark.parametrize("enable_dynamic_schema", [True])
 @pytest.mark.parametrize("nb", [3000])
 @pytest.mark.parametrize("dim", [128])
-@pytest.mark.parametrize("tokenizer", ['default', 'jieba'])
+@pytest.mark.parametrize("tokenizer", ['standard', 'jieba'])
 @pytest.mark.parametrize("index_type", ['SPARSE_INVERTED_INDEX', 'SPARSE_WAND'])
 @pytest.mark.parametrize("bm25_k1", [1.2, 1.5])
 @pytest.mark.parametrize("bm25_b", [0.7, 0.5])
@@ -279,7 +279,7 @@ class TestCreateIndex(TestBase):
 {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
 {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
 {"fieldName": "document_content", "dataType": "VarChar",
-"elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
+"elementTypeParams": {"max_length": "1000", "enable_analyzer": True,
 "analyzer_params": {
 "tokenizer": tokenizer,
 },
@@ -302,7 +302,7 @@ class TestCreateIndex(TestBase):
 rsp = self.collection_client.collection_describe(name)
 logger.info(f"rsp: {rsp}")
 assert rsp['code'] == 0
-if tokenizer == 'default':
+if tokenizer == 'standard':
 fake = fake_en
 elif tokenizer == 'jieba':
 fake = fake_zh
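The last hunk above switches the data-generation branch from 'default' to 'standard'. A hedged reconstruction of that branch: the "standard" analyzer is driven with English text and "jieba" with Chinese text; the Faker locales are assumptions consistent with the fake_en / fake_zh names in the test.

from faker import Faker

fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")

def pick_faker(tokenizer: str) -> Faker:
    # "standard" (formerly "default") is exercised with English; "jieba" with Chinese.
    if tokenizer == "standard":
        return fake_en
    if tokenizer == "jieba":
        return fake_zh
    raise ValueError(f"unsupported tokenizer: {tokenizer}")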
@@ -1204,7 +1204,7 @@ class TestSearchVector(TestBase):
 @pytest.mark.parametrize("nb", [3000])
 @pytest.mark.parametrize("dim", [128])
 @pytest.mark.parametrize("groupingField", ['user_id', None])
-@pytest.mark.parametrize("tokenizer", ['default'])
+@pytest.mark.parametrize("tokenizer", ['standard'])
 def test_search_vector_for_en_full_text_search(self, nb, dim, insert_round, auto_id,
 is_partition_key, enable_dynamic_schema, groupingField, tokenizer):
 """
@@ -1224,7 +1224,7 @@ class TestSearchVector(TestBase):
 {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
 {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
 {"fieldName": "document_content", "dataType": "VarChar",
-"elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
+"elementTypeParams": {"max_length": "1000", "enable_analyzer": True,
 "analyzer_params": {
 "tokenizer": tokenizer,
 },
@@ -1252,7 +1252,7 @@ class TestSearchVector(TestBase):
 rsp = self.collection_client.collection_describe(name)
 logger.info(f"rsp: {rsp}")
 assert rsp['code'] == 0
-if tokenizer == 'default':
+if tokenizer == 'standard':
 fake = fake_en
 elif tokenizer == 'jieba':
 fake = fake_zh
@@ -1339,7 +1339,7 @@ class TestSearchVector(TestBase):
 {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
 {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
 {"fieldName": "document_content", "dataType": "VarChar",
-"elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
+"elementTypeParams": {"max_length": "1000", "enable_analyzer": True,
 "analyzer_params": {
 "tokenizer": tokenizer,
 },
@@ -1367,7 +1367,7 @@ class TestSearchVector(TestBase):
 rsp = self.collection_client.collection_describe(name)
 logger.info(f"rsp: {rsp}")
 assert rsp['code'] == 0
-if tokenizer == 'default':
+if tokenizer == 'standard':
 fake = fake_en
 elif tokenizer == 'jieba':
 fake = fake_zh
@@ -1893,7 +1893,7 @@ class TestSearchVector(TestBase):
 language = "zh"
 # create a collection
 dim = 128
-tokenizer_params = {
+analyzer_params = {
 "tokenizer": tokenizer,
 }
 name = gen_collection_name()
@@ -1903,34 +1903,34 @@ class TestSearchVector(TestBase):
 name="word",
 dtype=DataType.VARCHAR,
 max_length=65535,
-enable_tokenizer=True,
+enable_analyzer=True,
 enable_match=True,
 is_partition_key=True,
-tokenizer_params=tokenizer_params,
+analyzer_params=analyzer_params,
 ),
 FieldSchema(
 name="sentence",
 dtype=DataType.VARCHAR,
 max_length=65535,
-enable_tokenizer=True,
+enable_analyzer=True,
 enable_match=True,
-tokenizer_params=tokenizer_params,
+analyzer_params=analyzer_params,
 ),
 FieldSchema(
 name="paragraph",
 dtype=DataType.VARCHAR,
 max_length=65535,
-enable_tokenizer=True,
+enable_analyzer=True,
 enable_match=True,
-tokenizer_params=tokenizer_params,
+analyzer_params=analyzer_params,
 ),
 FieldSchema(
 name="text",
 dtype=DataType.VARCHAR,
 max_length=65535,
-enable_tokenizer=True,
+enable_analyzer=True,
 enable_match=True,
-tokenizer_params=tokenizer_params,
+analyzer_params=analyzer_params,
 ),
 FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
 ]
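The schemas above feed text-match filtering at query time. A minimal sketch of such a filter, assuming the Milvus 2.5 TEXT_MATCH expression syntax, a running local server, and an already-loaded collection named "demo" (all assumptions, not taken from the diff):

from pymilvus import Collection, connections

connections.connect(uri="http://localhost:19530")  # assumed local Milvus
collection = Collection("demo")                    # hypothetical collection name

# Match rows whose analyzer-enabled "word" field contains the token "hello".
hits = collection.query(
    expr='TEXT_MATCH(word, "hello")',
    output_fields=["word"],
    limit=10,
)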
@@ -2730,7 +2730,7 @@ class TestQueryVector(TestBase):
 language = "zh"
 # create a collection
 dim = 128
-tokenizer_params = {
+analyzer_params = {
 "tokenizer": tokenizer,
 }
 name = gen_collection_name()
@@ -2740,34 +2740,34 @@ class TestQueryVector(TestBase):
 name="word",
 dtype=DataType.VARCHAR,
 max_length=65535,
-enable_tokenizer=True,
+enable_analyzer=True,
 enable_match=True,
 is_partition_key=True,
-tokenizer_params=tokenizer_params,
+analyzer_params=analyzer_params,
 ),
 FieldSchema(
 name="sentence",
 dtype=DataType.VARCHAR,
 max_length=65535,
-enable_tokenizer=True,
+enable_analyzer=True,
 enable_match=True,
-tokenizer_params=tokenizer_params,
+analyzer_params=analyzer_params,
 ),
 FieldSchema(
 name="paragraph",
 dtype=DataType.VARCHAR,
 max_length=65535,
-enable_tokenizer=True,
+enable_analyzer=True,
 enable_match=True,
-tokenizer_params=tokenizer_params,
+analyzer_params=analyzer_params,
 ),
 FieldSchema(
 name="text",
 dtype=DataType.VARCHAR,
 max_length=65535,
-enable_tokenizer=True,
+enable_analyzer=True,
 enable_match=True,
-tokenizer_params=tokenizer_params,
+analyzer_params=analyzer_params,
 ),
 FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
 ]