enhance: rename tokenizer to analyzer and check analyzer params (#37478)

Related issue: https://github.com/milvus-io/milvus/issues/35853

---------

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
pull/37565/head
aoiasd 2024-11-10 16:12:26 +08:00 committed by GitHub
parent ff00a12805
commit 12951f0abb
40 changed files with 514 additions and 488 deletions
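
For orientation, the user-visible effect of this rename is that VARCHAR fields now take "enable_analyzer" and "analyzer_params" in place of "enable_tokenizer" and "tokenizer_params". A minimal pymilvus sketch, assuming pymilvus >= 2.5.0rc108 (the version this PR's tests pin); collection and field names are illustrative, not taken from this diff:

# Sketch only: declare a match-enabled VARCHAR field with the renamed options.
from pymilvus import CollectionSchema, DataType, FieldSchema

fields = [
    FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True),
    FieldSchema(
        name="text",
        dtype=DataType.VARCHAR,
        max_length=65535,
        enable_analyzer=True,                        # was: enable_tokenizer=True
        enable_match=True,
        analyzer_params={"tokenizer": "standard"},   # was: tokenizer_params={...}
    ),
    FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=128),
]
schema = CollectionSchema(fields=fields, description="analyzer rename sketch")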

View File

@ -20,7 +20,7 @@
namespace milvus {
TokenizerParams
ParseTokenizerParams(const TypeParams& params) {
auto iter = params.find("tokenizer_params");
auto iter = params.find("analyzer_params");
if (iter == params.end()) {
return "{}";
}
@ -39,19 +39,19 @@ FieldMeta::enable_match() const {
}
bool
FieldMeta::enable_tokenizer() const {
FieldMeta::enable_analyzer() const {
if (!IsStringDataType(type_)) {
return false;
}
if (!string_info_.has_value()) {
return false;
}
return string_info_->enable_tokenizer;
return string_info_->enable_analyzer;
}
TokenizerParams
FieldMeta::get_tokenizer_params() const {
Assert(enable_tokenizer());
FieldMeta::get_analyzer_params() const {
Assert(enable_analyzer());
auto params = string_info_->params;
return ParseTokenizerParams(params);
}
@ -109,7 +109,7 @@ FieldMeta::ParseFrom(const milvus::proto::schema::FieldSchema& schema_proto) {
return b;
};
bool enable_tokenizer = get_bool_value("enable_tokenizer");
bool enable_analyzer = get_bool_value("enable_analyzer");
bool enable_match = get_bool_value("enable_match");
return FieldMeta{name,
@ -118,7 +118,7 @@ FieldMeta::ParseFrom(const milvus::proto::schema::FieldSchema& schema_proto) {
max_len,
nullable,
enable_match,
enable_tokenizer,
enable_analyzer,
type_map};
}

View File

@ -64,13 +64,13 @@ class FieldMeta {
int64_t max_length,
bool nullable,
bool enable_match,
bool enable_tokenizer,
bool enable_analyzer,
std::map<std::string, std::string>& params)
: name_(name),
id_(id),
type_(type),
string_info_(StringInfo{
max_length, enable_match, enable_tokenizer, std::move(params)}),
max_length, enable_match, enable_analyzer, std::move(params)}),
nullable_(nullable) {
Assert(IsStringDataType(type_));
}
@ -125,10 +125,10 @@ class FieldMeta {
enable_match() const;
bool
enable_tokenizer() const;
enable_analyzer() const;
TokenizerParams
get_tokenizer_params() const;
get_analyzer_params() const;
std::optional<knowhere::MetricType>
get_metric_type() const {
@ -203,7 +203,7 @@ class FieldMeta {
struct StringInfo {
int64_t max_length;
bool enable_match;
bool enable_tokenizer;
bool enable_analyzer;
std::map<std::string, std::string> params;
};
FieldName name_;

View File

@ -121,7 +121,7 @@ class Schema {
int64_t max_length,
bool nullable,
bool enable_match,
bool enable_tokenizer,
bool enable_analyzer,
std::map<std::string, std::string>& params) {
auto field_meta = FieldMeta(name,
id,
@ -129,7 +129,7 @@ class Schema {
max_length,
nullable,
enable_match,
enable_tokenizer,
enable_analyzer,
params);
this->AddField(std::move(field_meta));
}

View File

@ -21,18 +21,18 @@ constexpr const char* TMP_TEXT_LOG_PREFIX = "/tmp/milvus/text-log/";
TextMatchIndex::TextMatchIndex(int64_t commit_interval_in_ms,
const char* tokenizer_name,
const char* tokenizer_params)
const char* analyzer_params)
: commit_interval_in_ms_(commit_interval_in_ms),
last_commit_time_(stdclock::now()) {
d_type_ = TantivyDataType::Text;
std::string field_name = "tmp_text_index";
wrapper_ = std::make_shared<TantivyIndexWrapper>(
field_name.c_str(), true, "", tokenizer_name, tokenizer_params);
field_name.c_str(), true, "", tokenizer_name, analyzer_params);
}
TextMatchIndex::TextMatchIndex(const std::string& path,
const char* tokenizer_name,
const char* tokenizer_params)
const char* analyzer_params)
: commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
last_commit_time_(stdclock::now()) {
path_ = path;
@ -42,12 +42,12 @@ TextMatchIndex::TextMatchIndex(const std::string& path,
false,
path_.c_str(),
tokenizer_name,
tokenizer_params);
analyzer_params);
}
TextMatchIndex::TextMatchIndex(const storage::FileManagerContext& ctx,
const char* tokenizer_name,
const char* tokenizer_params)
const char* analyzer_params)
: commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
last_commit_time_(stdclock::now()) {
schema_ = ctx.fieldDataMeta.field_schema;
@ -65,7 +65,7 @@ TextMatchIndex::TextMatchIndex(const storage::FileManagerContext& ctx,
false,
path_.c_str(),
tokenizer_name,
tokenizer_params);
analyzer_params);
}
TextMatchIndex::TextMatchIndex(const storage::FileManagerContext& ctx)
@ -172,8 +172,8 @@ TextMatchIndex::CreateReader() {
void
TextMatchIndex::RegisterTokenizer(const char* tokenizer_name,
const char* tokenizer_params) {
wrapper_->register_tokenizer(tokenizer_name, tokenizer_params);
const char* analyzer_params) {
wrapper_->register_tokenizer(tokenizer_name, analyzer_params);
}
TargetBitmap

View File

@ -24,15 +24,15 @@ class TextMatchIndex : public InvertedIndexTantivy<std::string> {
// for growing segment.
explicit TextMatchIndex(int64_t commit_interval_in_ms,
const char* tokenizer_name,
const char* tokenizer_params);
const char* analyzer_params);
// for sealed segment.
explicit TextMatchIndex(const std::string& path,
const char* tokenizer_name,
const char* tokenizer_params);
const char* analyzer_params);
// for building index.
explicit TextMatchIndex(const storage::FileManagerContext& ctx,
const char* tokenizer_name,
const char* tokenizer_params);
const char* analyzer_params);
// for loading index
explicit TextMatchIndex(const storage::FileManagerContext& ctx);
@ -64,7 +64,7 @@ class TextMatchIndex : public InvertedIndexTantivy<std::string> {
CreateReader();
void
RegisterTokenizer(const char* tokenizer_name, const char* tokenizer_params);
RegisterTokenizer(const char* tokenizer_name, const char* analyzer_params);
TargetBitmap
MatchQuery(const std::string& query);

View File

@ -284,7 +284,7 @@ BuildTextIndex(CBinarySet* c_binary_set,
auto index = std::make_unique<index::TextMatchIndex>(
fileManagerContext,
"milvus_tokenizer",
field_schema.get_tokenizer_params().c_str());
field_schema.get_analyzer_params().c_str());
index->Build(config);
auto binary =
std::make_unique<knowhere::BinarySet>(index->Upload(config));

View File

@ -1516,13 +1516,13 @@ ChunkedSegmentSealedImpl::CreateTextIndex(FieldId field_id) {
index = std::make_unique<index::TextMatchIndex>(
std::numeric_limits<int64_t>::max(),
"milvus_tokenizer",
field_meta.get_tokenizer_params().c_str());
field_meta.get_analyzer_params().c_str());
} else {
// build text index using mmap.
index = std::make_unique<index::TextMatchIndex>(
cfg.GetMmapPath(),
"milvus_tokenizer",
field_meta.get_tokenizer_params().c_str());
field_meta.get_analyzer_params().c_str());
}
{
@ -1572,7 +1572,7 @@ ChunkedSegmentSealedImpl::CreateTextIndex(FieldId field_id) {
index->Reload();
index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params().c_str());
field_meta.get_analyzer_params().c_str());
text_indexes_[field_id] = std::move(index);
}
@ -1583,7 +1583,7 @@ ChunkedSegmentSealedImpl::LoadTextIndex(
std::unique_lock lck(mutex_);
const auto& field_meta = schema_->operator[](field_id);
index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params().c_str());
field_meta.get_analyzer_params().c_str());
text_indexes_[field_id] = std::move(index);
}

View File

@ -859,11 +859,11 @@ SegmentGrowingImpl::CreateTextIndex(FieldId field_id) {
"cannot create text index on non-string type");
// todo: make this(200) configurable.
auto index = std::make_unique<index::TextMatchIndex>(
200, "milvus_tokenizer", field_meta.get_tokenizer_params().c_str());
200, "milvus_tokenizer", field_meta.get_analyzer_params().c_str());
index->Commit();
index->CreateReader();
index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params().c_str());
field_meta.get_analyzer_params().c_str());
text_indexes_[field_id] = std::move(index);
}

View File

@ -2014,13 +2014,13 @@ SegmentSealedImpl::CreateTextIndex(FieldId field_id) {
index = std::make_unique<index::TextMatchIndex>(
std::numeric_limits<int64_t>::max(),
"milvus_tokenizer",
field_meta.get_tokenizer_params().c_str());
field_meta.get_analyzer_params().c_str());
} else {
// build text index using mmap.
index = std::make_unique<index::TextMatchIndex>(
cfg.GetMmapPath(),
"milvus_tokenizer",
field_meta.get_tokenizer_params().c_str());
field_meta.get_analyzer_params().c_str());
}
{
@ -2069,7 +2069,7 @@ SegmentSealedImpl::CreateTextIndex(FieldId field_id) {
index->Reload();
index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params().c_str());
field_meta.get_analyzer_params().c_str());
text_indexes_[field_id] = std::move(index);
}
@ -2080,7 +2080,7 @@ SegmentSealedImpl::LoadTextIndex(FieldId field_id,
std::unique_lock lck(mutex_);
const auto& field_meta = schema_->operator[](field_id);
index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params().c_str());
field_meta.get_analyzer_params().c_str());
text_indexes_[field_id] = std::move(index);
}

View File

@ -16,11 +16,11 @@ to_set(const RustArrayWrapper& w) {
int
main(int argc, char* argv[]) {
std::string tokenizer_name = "jieba";
std::map<std::string, std::string> tokenizer_params;
tokenizer_params["tokenizer"] = tokenizer_name;
std::map<std::string, std::string> analyzer_params;
analyzer_params["tokenizer"] = tokenizer_name;
auto text_index = TantivyIndexWrapper(
"text_demo", true, "", tokenizer_name.c_str(), tokenizer_params);
"text_demo", true, "", tokenizer_name.c_str(), analyzer_params);
auto write_single_text = [&text_index](const std::string& s,
int64_t offset) {
text_index.add_data(&s, 1, offset);
@ -38,7 +38,7 @@ main(int argc, char* argv[]) {
}
text_index.create_reader();
text_index.register_tokenizer(tokenizer_name.c_str(), tokenizer_params);
text_index.register_tokenizer(tokenizer_name.c_str(), analyzer_params);
{
auto result = to_set(text_index.match_query("北京"));

View File

@ -88,9 +88,7 @@ RustArray tantivy_regex_query(void *ptr, const char *pattern);
RustArray tantivy_match_query(void *ptr, const char *query);
void tantivy_register_tokenizer(void *ptr,
const char *tokenizer_name,
const char *tokenizer_params);
void tantivy_register_tokenizer(void *ptr, const char *tokenizer_name, const char *analyzer_params);
void *tantivy_create_index(const char *field_name,
TantivyDataType data_type,
@ -144,7 +142,7 @@ void tantivy_index_add_multi_keywords(void *ptr,
void *tantivy_create_text_writer(const char *field_name,
const char *path,
const char *tokenizer_name,
const char *tokenizer_params,
const char *analyzer_params,
uintptr_t num_threads,
uintptr_t overall_memory_budget_in_bytes,
bool in_ram);
@ -159,7 +157,7 @@ bool tantivy_token_stream_advance(void *token_stream);
const char *tantivy_token_stream_get_token(void *token_stream);
void *tantivy_create_tokenizer(const char *tokenizer_params);
void *tantivy_create_tokenizer(const char *analyzer_params);
void *tantivy_clone_tokenizer(void *ptr);

View File

@ -24,12 +24,12 @@ pub extern "C" fn tantivy_match_query(ptr: *mut c_void, query: *const c_char) ->
pub extern "C" fn tantivy_register_tokenizer(
ptr: *mut c_void,
tokenizer_name: *const c_char,
tokenizer_params: *const c_char,
analyzer_params: *const c_char,
) {
init_log();
let real = ptr as *mut IndexReaderWrapper;
let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name) };
let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
let params = unsafe{c_str_to_str(analyzer_params).to_string()};
let analyzer = create_tokenizer(&params);
match analyzer {
Ok(text_analyzer) => unsafe {

View File

@ -13,7 +13,7 @@ pub extern "C" fn tantivy_create_text_writer(
field_name: *const c_char,
path: *const c_char,
tokenizer_name: *const c_char,
tokenizer_params: *const c_char,
analyzer_params: *const c_char,
num_threads: usize,
overall_memory_budget_in_bytes: usize,
in_ram: bool,
@ -22,7 +22,7 @@ pub extern "C" fn tantivy_create_text_writer(
let field_name_str = unsafe { CStr::from_ptr(field_name).to_str().unwrap() };
let path_str = unsafe { CStr::from_ptr(path).to_str().unwrap() };
let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name).to_str().unwrap() };
let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
let params = unsafe{c_str_to_str(analyzer_params).to_string()};
let analyzer = create_tokenizer(&params);
match analyzer {
Ok(text_analyzer) => {

View File

@ -9,9 +9,9 @@ use crate::{
};
#[no_mangle]
pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *const c_char) -> *mut c_void {
pub extern "C" fn tantivy_create_tokenizer(analyzer_params: *const c_char) -> *mut c_void {
init_log();
let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
let params = unsafe{c_str_to_str(analyzer_params).to_string()};
let analyzer = create_tokenizer(&params);
match analyzer {
Ok(text_analyzer) => create_binding(text_analyzer),

View File

@ -14,7 +14,7 @@ namespace milvus::tantivy {
using Map = std::map<std::string, std::string>;
static constexpr const char* DEFAULT_TOKENIZER_NAME = "milvus_tokenizer";
static const char* DEFAULT_TOKENIZER_PARAMS = "{}";
static const char* DEFAULT_analyzer_params = "{}";
static constexpr uintptr_t DEFAULT_NUM_THREADS = 4;
static constexpr uintptr_t DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES =
DEFAULT_NUM_THREADS * 15 * 1024 * 1024;
@ -101,14 +101,14 @@ struct TantivyIndexWrapper {
bool in_ram,
const char* path,
const char* tokenizer_name = DEFAULT_TOKENIZER_NAME,
const char* tokenizer_params = DEFAULT_TOKENIZER_PARAMS,
const char* analyzer_params = DEFAULT_analyzer_params,
uintptr_t num_threads = DEFAULT_NUM_THREADS,
uintptr_t overall_memory_budget_in_bytes =
DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES) {
writer_ = tantivy_create_text_writer(field_name,
path,
tokenizer_name,
tokenizer_params,
analyzer_params,
num_threads,
overall_memory_budget_in_bytes,
in_ram);
@ -132,10 +132,10 @@ struct TantivyIndexWrapper {
void
register_tokenizer(const char* tokenizer_name,
const char* tokenizer_params) {
const char* analyzer_params) {
if (reader_ != nullptr) {
tantivy_register_tokenizer(
reader_, tokenizer_name, tokenizer_params);
reader_, tokenizer_name, analyzer_params);
}
}

View File

@ -31,7 +31,7 @@ TEST(ValidateTextSchema, JieBa) {
milvus::proto::schema::FieldSchema schema;
{
auto kv = schema.add_type_params();
kv->set_key("tokenizer_params");
kv->set_key("analyzer_params");
kv->set_value(R"({"tokenizer": "jieba"})");
}
@ -47,10 +47,10 @@ set_cmap(CMap m, const std::string& key, const std::string& value) {
}
TEST(CTokenizer, Default) {
auto tokenizer_params = R"({"tokenizer": "standard"})";
auto analyzer_params = R"({"tokenizer": "standard"})";
CTokenizer tokenizer;
{
auto status = create_tokenizer(tokenizer_params, &tokenizer);
auto status = create_tokenizer(analyzer_params, &tokenizer);
ASSERT_EQ(milvus::ErrorCode::Success, status.error_code);
}

View File

@ -84,9 +84,9 @@ TEST(ParseTokenizerParams, NoTokenizerParams) {
}
TEST(ParseTokenizerParams, Default) {
TypeParams params{{"tokenizer_params", R"({"tokenizer": "standard"})"}};
TypeParams params{{"analyzer_params", R"({"tokenizer": "standard"})"}};
auto p = ParseTokenizerParams(params);
ASSERT_EQ(params.at("tokenizer_params"), p);
ASSERT_EQ(params.at("analyzer_params"), p);
}
TEST(TextMatch, Index) {
@ -248,8 +248,8 @@ TEST(TextMatch, SealedNaive) {
TEST(TextMatch, GrowingJieBa) {
auto schema = GenTestSchema({
{"enable_match", "true"},
{"enable_tokenizer", "true"},
{"tokenizer_params", R"({"tokenizer": "jieba"})"},
{"enable_analyzer", "true"},
{"analyzer_params", R"({"tokenizer": "jieba"})"},
});
auto seg = CreateGrowingSegment(schema, empty_index_meta);
std::vector<std::string> raw_str = {"青铜时代", "黄金时代"};
@ -325,8 +325,8 @@ TEST(TextMatch, GrowingJieBa) {
TEST(TextMatch, SealedJieBa) {
auto schema = GenTestSchema({
{"enable_match", "true"},
{"enable_tokenizer", "true"},
{"tokenizer_params", R"({"tokenizer": "jieba"})"},
{"enable_analyzer", "true"},
{"analyzer_params", R"({"tokenizer": "jieba"})"},
});
auto seg = CreateSealedSegment(schema, empty_index_meta);
std::vector<std::string> raw_str = {"青铜时代", "黄金时代"};
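
The GrowingJieBa and SealedJieBa cases above exercise the renamed parameters end to end with the jieba tokenizer. A hedged pymilvus sketch of the same flow, assuming a running Milvus and the 2.5-style MilvusClient; client calls, collection name, and data are illustrative, not taken from this diff:

# Sketch only: jieba-analyzed VARCHAR field queried through TEXT_MATCH.
from pymilvus import CollectionSchema, DataType, FieldSchema, MilvusClient

fields = [
    FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True),
    FieldSchema(
        name="text",
        dtype=DataType.VARCHAR,
        max_length=65535,
        enable_analyzer=True,
        enable_match=True,
        analyzer_params={"tokenizer": "jieba"},
    ),
    FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=8),
]
schema = CollectionSchema(fields=fields)

client = MilvusClient(uri="http://localhost:19530")
client.create_collection(collection_name="jieba_demo", schema=schema)

index_params = client.prepare_index_params()
index_params.add_index(field_name="emb", index_type="AUTOINDEX", metric_type="L2")
client.create_index("jieba_demo", index_params)
client.load_collection("jieba_demo")

client.insert("jieba_demo", [
    {"pk": 1, "text": "青铜时代", "emb": [0.0] * 8},
    {"pk": 2, "text": "黄金时代", "emb": [0.0] * 8},
])
# TEXT_MATCH tokenizes the query string with the field's analyzer (jieba here).
hits = client.query("jieba_demo", filter="TEXT_MATCH(text, '黄金时代')", output_fields=["text"])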

View File

@ -58,7 +58,7 @@ func (s *jobManagerSuite) TestJobManager_triggerStatsTaskLoop() {
Key: "enable_match", Value: "true",
},
{
Key: "enable_tokenizer", Value: "true",
Key: "enable_analyzer", Value: "true",
},
},
},

View File

@ -1328,7 +1328,11 @@ func (h *HandlersV2) createCollection(ctx context.Context, c *gin.Context, anyRe
}
}
for key, fieldParam := range field.ElementTypeParams {
fieldSchema.TypeParams = append(fieldSchema.TypeParams, &commonpb.KeyValuePair{Key: key, Value: fmt.Sprintf("%v", fieldParam)})
value, err := getElementTypeParams(fieldParam)
if err != nil {
return nil, err
}
fieldSchema.TypeParams = append(fieldSchema.TypeParams, &commonpb.KeyValuePair{Key: key, Value: value})
}
if lo.Contains(allOutputFields, field.FieldName) {
fieldSchema.IsFunctionOutput = true

View File

@ -1510,6 +1510,18 @@ func convertToExtraParams(indexParam IndexParam) ([]*commonpb.KeyValuePair, erro
return params, nil
}
func getElementTypeParams(param interface{}) (string, error) {
if str, ok := param.(string); ok {
return str, nil
}
jsonBytes, err := json.Marshal(param)
if err != nil {
return "", err
}
return string(jsonBytes), nil
}
func MetricsHandlerFunc(c *gin.Context) {
path := c.Request.URL.Path
metrics.RestfulFunctionCall.WithLabelValues(
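
The new getElementTypeParams helper above lets RESTful v2 clients pass analyzer_params as a nested JSON object inside elementTypeParams; non-string values are now JSON-encoded instead of being stringified with fmt.Sprintf. A hedged sketch of such a create-collection request; the endpoint path, host, and any payload keys not shown in this PR's tests are assumptions:

# Sketch only: RESTful v2 payload with analyzer_params as a nested object.
import json
import requests

payload = {
    "collectionName": "doc_demo",
    "schema": {
        "fields": [
            {"fieldName": "id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
            {
                "fieldName": "document_content",
                "dataType": "VarChar",
                "elementTypeParams": {
                    "max_length": "1000",
                    "enable_analyzer": True,
                    "enable_match": True,
                    # An object, not a pre-serialized string: the handler JSON-encodes it.
                    "analyzer_params": {"tokenizer": "standard"},
                },
            },
            {"fieldName": "emb", "dataType": "FloatVector", "elementTypeParams": {"dim": "128"}},
        ]
    },
}
resp = requests.post(
    "http://localhost:19530/v2/vectordb/collections/create",  # assumed endpoint path
    data=json.dumps(payload),
    headers={"Content-Type": "application/json"},
)
print(resp.json())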

View File

@ -47,7 +47,7 @@ func TestEmbeddingNode_BM25_Operator(t *testing.T) {
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{
Key: "enable_tokenizer",
Key: "enable_analyzer",
Value: "true",
},
},

View File

@ -1250,7 +1250,7 @@ func TestCatalog_CreateCollection(t *testing.T) {
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{
Key: "enable_tokenizer",
Key: "enable_analyzer",
Value: "true",
},
},
@ -1354,7 +1354,7 @@ func TestCatalog_DropCollection(t *testing.T) {
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{
Key: "enable_tokenizer",
Key: "enable_analyzer",
Value: "true",
},
},

View File

@ -407,7 +407,7 @@ func (t *createCollectionTask) PreExecute(ctx context.Context) error {
return err
}
if err := ctokenizer.ValidateTextSchema(field); err != nil {
if err := ctokenizer.ValidateTextSchema(field, wasBm25FunctionInputField(t.schema, field)); err != nil {
return err
}
}

View File

@ -3133,7 +3133,7 @@ func TestCreateCollectionTaskWithPartitionKey(t *testing.T) {
Value: strconv.Itoa(testMaxVarCharLength),
},
{
Key: "enable_tokenizer",
Key: "enable_analyzer",
Value: "true",
},
},

View File

@ -697,6 +697,15 @@ func checkFunctionOutputField(function *schemapb.FunctionSchema, fields []*schem
return nil
}
func wasBm25FunctionInputField(coll *schemapb.CollectionSchema, field *schemapb.FieldSchema) bool {
for _, fun := range coll.GetFunctions() {
if fun.GetType() == schemapb.FunctionType_BM25 && field.GetName() == fun.GetInputFieldNames()[0] {
return true
}
}
return false
}
func checkFunctionInputField(function *schemapb.FunctionSchema, fields []*schemapb.FieldSchema) error {
switch function.GetType() {
case schemapb.FunctionType_BM25:
@ -705,8 +714,8 @@ func checkFunctionInputField(function *schemapb.FunctionSchema, fields []*schema
len(fields), fields[0].DataType.String())
}
h := typeutil.CreateFieldSchemaHelper(fields[0])
if !h.EnableTokenizer() {
return fmt.Errorf("BM25 function input field must set enable_tokenizer to true")
if !h.EnableAnalyzer() {
return fmt.Errorf("BM25 function input field must set enable_analyzer to true")
}
default:
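
Together with the tightened checkFunctionInputField above, wasBm25FunctionInputField means a BM25 function's input field must set enable_analyzer even when enable_match is left off. A hedged pymilvus sketch of a schema that satisfies the check, assuming the pymilvus 2.5 Function/FunctionType API; names are illustrative:

# Sketch only: BM25 function whose VarChar input sets enable_analyzer=True.
from pymilvus import CollectionSchema, DataType, FieldSchema, Function, FunctionType

fields = [
    FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True),
    FieldSchema(
        name="text",
        dtype=DataType.VARCHAR,
        max_length=65535,
        enable_analyzer=True,   # required for the BM25 input field
        analyzer_params={"tokenizer": "standard"},
    ),
    FieldSchema(name="sparse", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields)
schema.add_function(Function(
    name="text_bm25",
    function_type=FunctionType.BM25,
    input_field_names=["text"],
    output_field_names=["sparse"],
))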

View File

@ -2638,7 +2638,7 @@ func TestValidateFunction(t *testing.T) {
t.Run("Valid function schema", func(t *testing.T) {
schema := &schemapb.CollectionSchema{
Fields: []*schemapb.FieldSchema{
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_tokenizer", Value: "true"}}},
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "true"}}},
{Name: "output_field", DataType: schemapb.DataType_SparseFloatVector},
},
Functions: []*schemapb.FunctionSchema{
@ -2657,7 +2657,7 @@ func TestValidateFunction(t *testing.T) {
t.Run("Invalid function schema - duplicate function names", func(t *testing.T) {
schema := &schemapb.CollectionSchema{
Fields: []*schemapb.FieldSchema{
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_tokenizer", Value: "true"}}},
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "true"}}},
{Name: "output_field", DataType: schemapb.DataType_SparseFloatVector},
},
Functions: []*schemapb.FunctionSchema{
@ -2702,7 +2702,7 @@ func TestValidateFunction(t *testing.T) {
t.Run("Invalid function schema - output field not found", func(t *testing.T) {
schema := &schemapb.CollectionSchema{
Fields: []*schemapb.FieldSchema{
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_tokenizer", Value: "true"}}},
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "true"}}},
},
Functions: []*schemapb.FunctionSchema{
{
@ -2721,7 +2721,7 @@ func TestValidateFunction(t *testing.T) {
t.Run("Invalid function schema - nullable input field", func(t *testing.T) {
schema := &schemapb.CollectionSchema{
Fields: []*schemapb.FieldSchema{
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_tokenizer", Value: "true"}}, Nullable: true},
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "true"}}, Nullable: true},
{Name: "output_field", DataType: schemapb.DataType_SparseFloatVector},
},
Functions: []*schemapb.FunctionSchema{
@ -2741,7 +2741,7 @@ func TestValidateFunction(t *testing.T) {
t.Run("Invalid function schema - output field is primary key", func(t *testing.T) {
schema := &schemapb.CollectionSchema{
Fields: []*schemapb.FieldSchema{
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_tokenizer", Value: "true"}}},
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "true"}}},
{Name: "output_field", DataType: schemapb.DataType_SparseFloatVector, IsPrimaryKey: true},
},
Functions: []*schemapb.FunctionSchema{
@ -2761,7 +2761,7 @@ func TestValidateFunction(t *testing.T) {
t.Run("Invalid function schema - output field is partition key", func(t *testing.T) {
schema := &schemapb.CollectionSchema{
Fields: []*schemapb.FieldSchema{
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_tokenizer", Value: "true"}}},
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "true"}}},
{Name: "output_field", DataType: schemapb.DataType_SparseFloatVector, IsPartitionKey: true},
},
Functions: []*schemapb.FunctionSchema{
@ -2781,7 +2781,7 @@ func TestValidateFunction(t *testing.T) {
t.Run("Invalid function schema - output field is clustering key", func(t *testing.T) {
schema := &schemapb.CollectionSchema{
Fields: []*schemapb.FieldSchema{
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_tokenizer", Value: "true"}}},
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "true"}}},
{Name: "output_field", DataType: schemapb.DataType_SparseFloatVector, IsClusteringKey: true},
},
Functions: []*schemapb.FunctionSchema{
@ -2801,7 +2801,7 @@ func TestValidateFunction(t *testing.T) {
t.Run("Invalid function schema - nullable output field", func(t *testing.T) {
schema := &schemapb.CollectionSchema{
Fields: []*schemapb.FieldSchema{
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_tokenizer", Value: "true"}}},
{Name: "input_field", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "true"}}},
{Name: "output_field", DataType: schemapb.DataType_SparseFloatVector, Nullable: true},
},
Functions: []*schemapb.FunctionSchema{
@ -2827,7 +2827,7 @@ func TestValidateFunctionInputField(t *testing.T) {
fields := []*schemapb.FieldSchema{
{
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{{Key: "enable_tokenizer", Value: "true"}},
TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "true"}},
},
}
err := checkFunctionInputField(function, fields)
@ -2854,7 +2854,7 @@ func TestValidateFunctionInputField(t *testing.T) {
fields := []*schemapb.FieldSchema{
{
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{{Key: "enable_tokenizer", Value: "false"}},
TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "false"}},
},
}
err := checkFunctionInputField(function, fields)
@ -2868,11 +2868,11 @@ func TestValidateFunctionInputField(t *testing.T) {
fields := []*schemapb.FieldSchema{
{
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{{Key: "enable_tokenizer", Value: "true"}},
TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "true"}},
},
{
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{{Key: "enable_tokenizer", Value: "true"}},
TypeParams: []*commonpb.KeyValuePair{{Key: "enable_analyzer", Value: "true"}},
},
}
err := checkFunctionInputField(function, fields)

View File

@ -17,14 +17,14 @@ import (
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
func ValidateTextSchema(fieldSchema *schemapb.FieldSchema) error {
func ValidateTextSchema(fieldSchema *schemapb.FieldSchema, EnableBM25 bool) error {
h := typeutil.CreateFieldSchemaHelper(fieldSchema)
if !h.EnableMatch() {
if !h.EnableMatch() && !EnableBM25 {
return nil
}
if !h.EnableTokenizer() {
return fmt.Errorf("field %s is set to enable match but not enable tokenizer", fieldSchema.Name)
if !h.EnableAnalyzer() {
return fmt.Errorf("field %s is set to enable match or bm25 function but not enable analyzer", fieldSchema.Name)
}
bs, err := proto.Marshal(fieldSchema)
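
Conversely, a field like the following would be expected to fail collection creation with the error above, since match is requested without an analyzer (a hedged sketch; how the error surfaces on the client side is not shown in this diff):

# Sketch only: enable_match without enable_analyzer should be rejected by
# ValidateTextSchema ("... set to enable match or bm25 function but not enable analyzer").
from pymilvus import DataType, FieldSchema

bad_field = FieldSchema(
    name="text",
    dtype=DataType.VARCHAR,
    max_length=65535,
    enable_match=True,
    # enable_analyzer=True,  # missing: this is what the server-side check requires
)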

View File

@ -16,7 +16,7 @@ func TestValidateEmptyTextSchema(t *testing.T) {
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{},
}
assert.Nil(t, ValidateTextSchema(fs))
assert.Nil(t, ValidateTextSchema(fs, false))
}
func TestValidateTextSchema(t *testing.T) {
@ -33,7 +33,7 @@ func TestValidateTextSchema(t *testing.T) {
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{Key: "enable_match", Value: "true"},
{Key: "tokenizer_params", Value: `{"tokenizer": "standard"}`},
{Key: "analyzer_params", Value: `{"tokenizer": "standard"}`},
},
},
{
@ -41,32 +41,32 @@ func TestValidateTextSchema(t *testing.T) {
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{Key: "enable_match", Value: "true"},
{Key: "tokenizer_params", Value: `{"tokenizer": "standard"}`},
{Key: "analyzer_params", Value: `{"tokenizer": "standard"}`},
},
},
}
for idx, tt := range tests {
t.Run(fmt.Sprintf("enable_tokenizer not set %d", idx), func(t *testing.T) {
err := ValidateTextSchema(tt)
t.Run(fmt.Sprintf("enable_analyzer not set %d", idx), func(t *testing.T) {
err := ValidateTextSchema(tt, false)
assert.NotNil(t, err)
})
}
for idx, tt := range tests {
t.Run(fmt.Sprintf("enable_tokenizer set to false %d", idx), func(t *testing.T) {
t.Run(fmt.Sprintf("enable_analyzer set to false %d", idx), func(t *testing.T) {
tt.TypeParams = append(tt.TypeParams, &commonpb.KeyValuePair{
Key: "enable_tokenizer",
Key: "enable_analyzer",
Value: "false",
})
err := ValidateTextSchema(tt)
err := ValidateTextSchema(tt, false)
assert.NotNil(t, err)
})
}
for idx, tt := range tests {
t.Run(fmt.Sprintf("enable_tokenizer set to true %d", idx), func(t *testing.T) {
t.Run(fmt.Sprintf("enable_analyzer set to true %d", idx), func(t *testing.T) {
tt.TypeParams[len(tt.TypeParams)-1].Value = "true"
err := ValidateTextSchema(tt)
err := ValidateTextSchema(tt, false)
assert.Nil(t, err)
})
}

View File

@ -30,6 +30,8 @@ import (
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
const analyzerParams = "analyzer_params"
// BM25 Runner
// Input: string
// Output: map[uint32]float32
@ -40,9 +42,9 @@ type BM25FunctionRunner struct {
concurrency int
}
func getTokenizerParams(field *schemapb.FieldSchema) string {
func getAnalyzerParams(field *schemapb.FieldSchema) string {
for _, param := range field.GetTypeParams() {
if param.Key == "tokenizer_params" {
if param.Key == analyzerParams {
return param.Value
}
}
@ -66,7 +68,7 @@ func NewBM25FunctionRunner(coll *schemapb.CollectionSchema, schema *schemapb.Fun
}
if field.GetFieldID() == schema.GetInputFieldIds()[0] {
params = getTokenizerParams(field)
params = getAnalyzerParams(field)
}
}
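
The BM25 runner now reads the analyzer configuration from the analyzer_params type param of its input field. In Python terms the lookup it performs is roughly the following (illustrative only; the Go code's behavior when the key is absent is elided in the excerpt above):

# Sketch only: mirror of getAnalyzerParams over a field's type params.
def get_analyzer_params(type_params: dict) -> str:
    # Scan for "analyzer_params" (formerly "tokenizer_params") and return
    # its raw JSON string value.
    return type_params.get("analyzer_params", "")

print(get_analyzer_params({
    "enable_analyzer": "true",
    "analyzer_params": '{"tokenizer": "standard"}',
}))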

View File

@ -53,11 +53,11 @@ func (h *FieldSchemaHelper) EnableMatch() bool {
return err == nil && enable
}
func (h *FieldSchemaHelper) EnableTokenizer() bool {
func (h *FieldSchemaHelper) EnableAnalyzer() bool {
if !IsStringType(h.schema.GetDataType()) {
return false
}
s, err := h.typeParams.Get("enable_tokenizer")
s, err := h.typeParams.Get("enable_analyzer")
if err != nil {
return false
}

View File

@ -777,15 +777,15 @@ def gen_default_collection_schema(description=ct.default_desc, primary_field=ct.
def gen_all_datatype_collection_schema(description=ct.default_desc, primary_field=ct.default_int64_field_name,
auto_id=False, dim=ct.default_dim, enable_dynamic_field=True, **kwargs):
tokenizer_params = {
analyzer_params = {
"tokenizer": "standard",
}
fields = [
gen_int64_field(),
gen_float_field(),
gen_string_field(),
gen_string_field(name="text", max_length=2000, enable_tokenizer=True, enable_match=True,
tokenizer_params=tokenizer_params),
gen_string_field(name="text", max_length=2000, enable_analyzer=True, enable_match=True,
analyzer_params=analyzer_params),
gen_json_field(),
gen_array_field(name="array_int", element_type=DataType.INT64),
gen_array_field(name="array_float", element_type=DataType.FLOAT),
@ -1799,7 +1799,7 @@ def get_text_field_name(schema=None):
schema = gen_default_collection_schema()
fields = schema.fields
for field in fields:
if field.dtype == DataType.VARCHAR and field.params.get("enable_tokenizer", False):
if field.dtype == DataType.VARCHAR and field.params.get("enable_analyzer", False):
return field.name
return None
@ -1900,7 +1900,7 @@ def gen_varchar_data(length: int, nb: int, text_mode=False):
def gen_data_by_collection_field(field, nb=None, start=None):
# if nb is None, return one data, else return a list of data
data_type = field.dtype
enable_tokenizer = field.params.get("enable_tokenizer", False)
enable_analyzer = field.params.get("enable_analyzer", False)
if data_type == DataType.BOOL:
if nb is None:
return random.choice([True, False])
@ -1936,8 +1936,8 @@ def gen_data_by_collection_field(field, nb=None, start=None):
max_length = min(20, max_length-1)
length = random.randint(0, max_length)
if nb is None:
return gen_varchar_data(length=length, nb=1, text_mode=enable_tokenizer)[0]
return gen_varchar_data(length=length, nb=nb, text_mode=enable_tokenizer)
return gen_varchar_data(length=length, nb=1, text_mode=enable_analyzer)[0]
return gen_varchar_data(length=length, nb=nb, text_mode=enable_analyzer)
if data_type == DataType.JSON:
if nb is None:
return {"name": fake.name(), "address": fake.address()}

View File

@ -27,8 +27,8 @@ pytest-parallel
pytest-random-order
# pymilvus
pymilvus==2.5.0rc106
pymilvus[bulk_writer]==2.5.0rc106
pymilvus==2.5.0rc108
pymilvus[bulk_writer]==2.5.0rc108
# for customize config test
python-benedict==0.24.3

View File

@ -771,7 +771,7 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
cf.gen_int64_field(name=df.int_field, nullable=nullable),
cf.gen_float_field(name=df.float_field, nullable=nullable),
cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key, nullable=nullable),
cf.gen_string_field(name=df.text_field, enable_tokenizer=True, enable_match=True, nullable=nullable),
cf.gen_string_field(name=df.text_field, enable_analyzer=True, enable_match=True, nullable=nullable),
cf.gen_json_field(name=df.json_field, nullable=nullable),
cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64, nullable=nullable),
cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),
@ -938,7 +938,7 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
cf.gen_int64_field(name=df.int_field, nullable=nullable),
cf.gen_float_field(name=df.float_field),
cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key),
cf.gen_string_field(name=df.text_field, enable_tokenizer=True, enable_match=True, nullable=nullable),
cf.gen_string_field(name=df.text_field, enable_analyzer=True, enable_match=True, nullable=nullable),
cf.gen_json_field(name=df.json_field),
cf.gen_float_vec_field(name=df.float_vec_field, dim=float_vec_field_dim),
cf.gen_binary_vec_field(name=df.binary_vec_field, dim=binary_vec_field_dim),
@ -1091,7 +1091,7 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
cf.gen_int64_field(name=df.int_field, nullable=nullable),
cf.gen_float_field(name=df.float_field, nullable=nullable),
cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key, nullable=nullable),
cf.gen_string_field(name=df.text_field, enable_tokenizer=True, enable_match=True, nullable=nullable),
cf.gen_string_field(name=df.text_field, enable_analyzer=True, enable_match=True, nullable=nullable),
cf.gen_json_field(name=df.json_field, nullable=nullable),
cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64, nullable=nullable),
cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),

File diff suppressed because it is too large.

View File

@ -4452,7 +4452,7 @@ class TestQueryTextMatch(TestcaseBase):
3. verify the result
expected: text match successfully and result is correct
"""
tokenizer_params = {
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
@ -4462,34 +4462,34 @@ class TestQueryTextMatch(TestcaseBase):
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
is_partition_key=enable_partition_key,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
@ -4589,7 +4589,7 @@ class TestQueryTextMatch(TestcaseBase):
3. verify the result
expected: text match successfully and result is correct
"""
tokenizer_params = {
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
@ -4599,34 +4599,34 @@ class TestQueryTextMatch(TestcaseBase):
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
is_partition_key=enable_partition_key,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
@ -4723,7 +4723,7 @@ class TestQueryTextMatch(TestcaseBase):
3. verify the result
expected: get the correct token, text match successfully and result is correct
"""
tokenizer_params = {
analyzer_params = {
"tokenizer": "standard",
# "lowercase", "asciifolding", "alphanumonly" was system filter
"filter":["lowercase", "asciifolding", "alphanumonly",
@ -4742,33 +4742,33 @@ class TestQueryTextMatch(TestcaseBase):
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
@ -4843,7 +4843,7 @@ class TestQueryTextMatch(TestcaseBase):
3. verify the result
expected: query successfully and result is correct
"""
tokenizer_params = {
analyzer_params = {
"tokenizer": "standard",
}
# 1. initialize with data
@ -4854,33 +4854,33 @@ class TestQueryTextMatch(TestcaseBase):
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
@ -4957,7 +4957,7 @@ class TestQueryTextMatch(TestcaseBase):
3. verify the result
expected: query successfully and result is correct
"""
tokenizer_params = {
analyzer_params = {
"tokenizer": "standard",
}
# 1. initialize with data
@ -4968,33 +4968,33 @@ class TestQueryTextMatch(TestcaseBase):
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
@ -5100,7 +5100,7 @@ class TestQueryTextMatch(TestcaseBase):
"""
# 1. initialize with data
tokenizer_params = {
analyzer_params = {
"tokenizer": "standard",
}
# 1. initialize with data
@ -5111,33 +5111,33 @@ class TestQueryTextMatch(TestcaseBase):
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
@ -5245,7 +5245,7 @@ class TestQueryTextMatch(TestcaseBase):
"""
# 1. initialize with data
fake_en = Faker("en_US")
tokenizer_params = {
analyzer_params = {
"tokenizer": "standard",
}
dim = 128
@ -5255,33 +5255,33 @@ class TestQueryTextMatch(TestcaseBase):
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
@ -5362,7 +5362,7 @@ class TestQueryTextMatch(TestcaseBase):
# 1. initialize with data
fake_en = Faker("en_US")
analyzer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
dim = 128
default_fields = [
@ -5375,7 +5375,7 @@ class TestQueryTextMatch(TestcaseBase):
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
@ -5383,7 +5383,7 @@ class TestQueryTextMatch(TestcaseBase):
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
@ -5391,7 +5391,7 @@ class TestQueryTextMatch(TestcaseBase):
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
@ -5399,7 +5399,7 @@ class TestQueryTextMatch(TestcaseBase):
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
@ -5472,7 +5472,7 @@ class TestQueryTextMatch(TestcaseBase):
expected: text match successfully and result is correct
"""
# 1. initialize with data
tokenizer_params = {
analyzer_params = {
"tokenizer": "standard",
}
# 1. initialize with data
@ -5483,33 +5483,33 @@ class TestQueryTextMatch(TestcaseBase):
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
@ -5614,7 +5614,7 @@ class TestQueryTextMatch(TestcaseBase):
"""
# 1. initialize with data
analyzer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
# 1. initialize with data
dim = 128
@ -5624,7 +5624,7 @@ class TestQueryTextMatch(TestcaseBase):
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
nullable=True,
@ -5633,7 +5633,7 @@ class TestQueryTextMatch(TestcaseBase):
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
nullable=True,
@ -5642,7 +5642,7 @@ class TestQueryTextMatch(TestcaseBase):
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
nullable=True,
@ -5651,7 +5651,7 @@ class TestQueryTextMatch(TestcaseBase):
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
nullable=True,
@ -5731,7 +5731,7 @@ class TestQueryTextMatchNegative(TestcaseBase):
2. create collection
expected: create collection failed and return error
"""
tokenizer_params = {
analyzer_params = {
"tokenizer": "Unsupported",
}
dim = 128
@ -5741,41 +5741,41 @@ class TestQueryTextMatchNegative(TestcaseBase):
name="title",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="overview",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="genres",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="producer",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="cast",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
@ -5856,7 +5856,7 @@ class TestQueryFunction(TestcaseBase):
expected: create collection failed and return error
"""
analyzer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
dim = 128
default_fields = [
@ -5865,7 +5865,7 @@ class TestQueryFunction(TestcaseBase):
name="title",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
@ -5873,14 +5873,14 @@ class TestQueryFunction(TestcaseBase):
name="overview",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="age",
dtype=DataType.INT64,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),

View File

@ -13301,7 +13301,7 @@ class TestSearchWithTextMatchFilter(TestcaseBase):
3. verify the result
expected: text match successfully and result is correct
"""
tokenizer_params = {
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
@ -13311,34 +13311,34 @@ class TestSearchWithTextMatchFilter(TestcaseBase):
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
is_partition_key=enable_partition_key,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(name="float32_emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
@ -13462,7 +13462,7 @@ class TestSearchWithTextMatchFilter(TestcaseBase):
3. verify the result
expected: text match successfully and result is correct
"""
tokenizer_params = {
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
@ -13472,34 +13472,34 @@ class TestSearchWithTextMatchFilter(TestcaseBase):
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
is_partition_key=enable_partition_key,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(name="float32_emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),

View File

@ -6,7 +6,7 @@ pyyaml~=6.0
numpy~=1.24.3
allure-pytest>=2.8.18
Faker==19.2.0
pymilvus==2.4.0rc39
pymilvus==2.5.0rc108
scikit-learn~=1.1.3
pytest-xdist==2.5.0
minio==7.1.14

View File

@ -205,9 +205,10 @@ class TestCreateCollection(TestBase):
"isClusteringKey": clustering_key_field == "book_category", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "document_content", "dataType": "VarChar",
"elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
"elementTypeParams": {"max_length": "1000",
"enable_analyzer": True,
"analyzer_params": {
"tokenizer": "default"
"tokenizer": "standard"
},
"enable_match": True}},
{"fieldName": "json", "dataType": "JSON", "elementTypeParams": {}},

View File

@ -255,7 +255,7 @@ class TestCreateIndex(TestBase):
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.parametrize("tokenizer", ['default', 'jieba'])
@pytest.mark.parametrize("tokenizer", ['standard', 'jieba'])
@pytest.mark.parametrize("index_type", ['SPARSE_INVERTED_INDEX', 'SPARSE_WAND'])
@pytest.mark.parametrize("bm25_k1", [1.2, 1.5])
@pytest.mark.parametrize("bm25_b", [0.7, 0.5])
@ -279,7 +279,7 @@ class TestCreateIndex(TestBase):
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "document_content", "dataType": "VarChar",
"elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
"elementTypeParams": {"max_length": "1000", "enable_analyzer": True,
"analyzer_params": {
"tokenizer": tokenizer,
},
@ -302,7 +302,7 @@ class TestCreateIndex(TestBase):
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
if tokenizer == 'default':
if tokenizer == 'standard':
fake = fake_en
elif tokenizer == 'jieba':
fake = fake_zh

View File

@ -1204,7 +1204,7 @@ class TestSearchVector(TestBase):
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.parametrize("groupingField", ['user_id', None])
@pytest.mark.parametrize("tokenizer", ['default'])
@pytest.mark.parametrize("tokenizer", ['standard'])
def test_search_vector_for_en_full_text_search(self, nb, dim, insert_round, auto_id,
is_partition_key, enable_dynamic_schema, groupingField, tokenizer):
"""
@ -1224,7 +1224,7 @@ class TestSearchVector(TestBase):
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "document_content", "dataType": "VarChar",
"elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
"elementTypeParams": {"max_length": "1000", "enable_analyzer": True,
"analyzer_params": {
"tokenizer": tokenizer,
},
@ -1252,7 +1252,7 @@ class TestSearchVector(TestBase):
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
if tokenizer == 'default':
if tokenizer == 'standard':
fake = fake_en
elif tokenizer == 'jieba':
fake = fake_zh
@ -1339,7 +1339,7 @@ class TestSearchVector(TestBase):
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "document_content", "dataType": "VarChar",
"elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
"elementTypeParams": {"max_length": "1000", "enable_analyzer": True,
"analyzer_params": {
"tokenizer": tokenizer,
},
@ -1367,7 +1367,7 @@ class TestSearchVector(TestBase):
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
if tokenizer == 'default':
if tokenizer == 'standard':
fake = fake_en
elif tokenizer == 'jieba':
fake = fake_zh
@ -1893,7 +1893,7 @@ class TestSearchVector(TestBase):
language = "zh"
# create a collection
dim = 128
tokenizer_params = {
analyzer_params = {
"tokenizer": tokenizer,
}
name = gen_collection_name()
@ -1903,34 +1903,34 @@ class TestSearchVector(TestBase):
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
is_partition_key=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
@ -2730,7 +2730,7 @@ class TestQueryVector(TestBase):
language = "zh"
# create a collection
dim = 128
tokenizer_params = {
analyzer_params = {
"tokenizer": tokenizer,
}
name = gen_collection_name()
@ -2740,34 +2740,34 @@ class TestQueryVector(TestBase):
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
is_partition_key=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_tokenizer=True,
enable_analyzer=True,
enable_match=True,
tokenizer_params=tokenizer_params,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
]