test: [cherry-pick] remove xfail for text match testcases (#39932)

pr: https://github.com/milvus-io/milvus/pull/39920

/kind improvement

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
pull/39994/head
zhuwenxing 2025-02-19 14:15:11 +08:00 committed by GitHub
parent 750b70ac00
commit 02198e14af
1 changed file with 29 additions and 28 deletions
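The change named in the title is deleting the @pytest.mark.xfail marker, so the growing-segment text match test is enforced again instead of being tolerated as an expected failure; the diff below also tracks the enable_tokenizer to enable_analyzer rename. As a reminder of what the marker does, a minimal self-contained sketch (flaky_query is a hypothetical stand-in for the real check):

    import pytest

    def flaky_query() -> bool:
        return True  # stand-in for the formerly unstable text match check

    @pytest.mark.xfail(reason="unstable case, tracking issue open")
    def test_example():
        # With the marker, a failure is reported as XFAIL (CI stays green)
        # and a pass as XPASS; deleting the marker restores strict pass/fail.
        assert flaky_query()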


@@ -4800,13 +4800,10 @@ class TestQueryTextMatch(TestcaseBase):
             assert any(
                 [token in r[field] for token in top_10_tokens]), f"top 10 tokens {top_10_tokens} not in {r[field]}"
 
     @pytest.mark.tags(CaseLabel.L0)
     @pytest.mark.parametrize("enable_partition_key", [True])
     @pytest.mark.parametrize("enable_inverted_index", [True])
     @pytest.mark.parametrize("tokenizer", ["jieba", "standard"])
-    @pytest.mark.xfail(reason="unstable case, issue: https://github.com/milvus-io/milvus/issues/36962")
     def test_query_text_match_with_growing_segment(
         self, tokenizer, enable_inverted_index, enable_partition_key
     ):
@@ -4817,7 +4814,7 @@ class TestQueryTextMatch(TestcaseBase):
         3. verify the result
         expected: text match successfully and result is correct
         """
-        tokenizer_params = {
+        analyzer_params = {
             "tokenizer": tokenizer,
         }
         dim = 128
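The test runs once per value of the tokenizer parametrize marker above, so after the rename the dict takes one of two shapes (values come straight from the diff; the comments are informal glosses):

    analyzer_params = {"tokenizer": "standard"}  # plain word splitting on spaces/punctuation
    analyzer_params = {"tokenizer": "jieba"}     # dictionary-based Chinese word segmentation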
@@ -4827,34 +4824,34 @@ class TestQueryTextMatch(TestcaseBase):
                 name="word",
                 dtype=DataType.VARCHAR,
                 max_length=65535,
-                enable_tokenizer=True,
+                enable_analyzer=True,
                 enable_match=True,
                 is_partition_key=enable_partition_key,
-                tokenizer_params=tokenizer_params,
+                analyzer_params=analyzer_params,
             ),
             FieldSchema(
                 name="sentence",
                 dtype=DataType.VARCHAR,
                 max_length=65535,
-                enable_tokenizer=True,
+                enable_analyzer=True,
                 enable_match=True,
-                tokenizer_params=tokenizer_params,
+                analyzer_params=analyzer_params,
             ),
             FieldSchema(
                 name="paragraph",
                 dtype=DataType.VARCHAR,
                 max_length=65535,
-                enable_tokenizer=True,
+                enable_analyzer=True,
                 enable_match=True,
-                tokenizer_params=tokenizer_params,
+                analyzer_params=analyzer_params,
             ),
             FieldSchema(
                 name="text",
                 dtype=DataType.VARCHAR,
                 max_length=65535,
-                enable_tokenizer=True,
+                enable_analyzer=True,
                 enable_match=True,
-                tokenizer_params=tokenizer_params,
+                analyzer_params=analyzer_params,
             ),
             FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
         ]
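Collected in one place, the renamed schema flags look like this when used standalone (a minimal sketch against the pymilvus ORM; the field name is illustrative):

    from pymilvus import DataType, FieldSchema

    analyzer_params = {"tokenizer": "standard"}
    sentence_field = FieldSchema(
        name="sentence",                  # illustrative field name
        dtype=DataType.VARCHAR,
        max_length=65535,
        enable_analyzer=True,             # replaces the old enable_tokenizer flag
        enable_match=True,                # allow text_match() filtering on this field
        analyzer_params=analyzer_params,  # replaces the old tokenizer_params kwarg
    )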
@@ -4897,6 +4894,7 @@ class TestQueryTextMatch(TestcaseBase):
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
+        time.sleep(3)
         # analyze the corpus
         text_fields = ["word", "sentence", "paragraph", "text"]
         wf_map = {}
@@ -4905,22 +4903,12 @@ class TestQueryTextMatch(TestcaseBase):
         # query single field for one token
         for field in text_fields:
             token = wf_map[field].most_common()[0][0]
-            expr = f"TextMatch({field}, '{token}')"
+            expr = f"text_match({field}, '{token}')"
             log.info(f"expr: {expr}")
             res, _ = collection_w.query(expr=expr, output_fields=["id", field])
+            assert len(res) > 0
             log.info(f"res len {len(res)}")
             for r in res:
                 assert token in r[field]
-            # verify inverted index
-            if enable_inverted_index:
-                if field == "word":
-                    expr = f"{field} == '{token}'"
-                    log.info(f"expr: {expr}")
-                    res, _ = collection_w.query(expr=expr, output_fields=["id", field])
-                    log.info(f"res len {len(res)}")
-                    for r in res:
-                        assert r[field] == token
-                    assert len(res) > 0
         # query single field for multi-word
         for field in text_fields:
             # match top 10 most common words
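For reference, the lowercase text_match() spelling the diff switches to can be exercised directly (a sketch assuming a running Milvus at the default local address; collection and field names are illustrative):

    from pymilvus import MilvusClient

    client = MilvusClient(uri="http://localhost:19530")  # assumed local deployment
    rows = client.query(
        collection_name="text_match_demo",             # illustrative collection
        filter="text_match(sentence, 'hello world')",  # matches rows containing either token
        output_fields=["id", "sentence"],
    )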
@@ -4928,12 +4916,25 @@ class TestQueryTextMatch(TestcaseBase):
             for word, count in wf_map[field].most_common(10):
                 top_10_tokens.append(word)
             string_of_top_10_words = " ".join(top_10_tokens)
-            expr = f"TextMatch({field}, '{string_of_top_10_words}')"
+            expr = f"text_match({field}, '{string_of_top_10_words}')"
             log.info(f"expr {expr}")
             res, _ = collection_w.query(expr=expr, output_fields=["id", field])
             log.info(f"res len {len(res)}")
             for r in res:
                 assert any([token in r[field] for token in top_10_tokens])
+            assert len(res) > 0
+        # flush and then query again
+        collection_w.flush()
+        for field in text_fields:
+            # match top 10 most common words
+            top_10_tokens = []
+            for word, count in wf_map[field].most_common(10):
+                top_10_tokens.append(word)
+            string_of_top_10_words = " ".join(top_10_tokens)
+            expr = f"text_match({field}, '{string_of_top_10_words}')"
+            log.info(f"expr {expr}")
+            res, _ = collection_w.query(expr=expr, output_fields=["id", field])
+            log.info(f"res len {len(res)}")
+            assert len(res) > 0
 
     @pytest.mark.tags(CaseLabel.L0)
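The new tail of the test flushes and queries a second time so text match is checked in both data states: before flush the rows still sit in a growing segment, while after flush the segment is sealed and served from the built index (the time.sleep(3) added earlier in the diff presumably gives just-inserted rows a moment to become visible before the first pass). The same pattern outside the test harness, as a sketch reusing the illustrative names from above:

    from pymilvus import Collection

    coll = Collection("text_match_demo")  # schema as sketched earlier
    coll.flush()                          # seal the growing segment(s)
    rows = coll.query(                    # the filter should still match post-flush
        expr="text_match(sentence, 'hello world')",
        output_fields=["id", "sentence"],
    )
    assert len(rows) > 0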