# milvus/tests/python_client/milvus_client/test_milvus_client_analyzer.py

import pytest
from base.client_v2_base import TestMilvusClientV2Base
from common.common_type import CaseLabel
from common.phrase_match_generator import generate_text_by_analyzer
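
# These cases exercise Milvus's run_analyzer across several language-specific
# tokenizers (standard, jieba, lindera) and verify the tokens it emits.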


class TestMilvusClientAnalyzer(TestMilvusClientV2Base):
    analyzer_params_list = [
        # English: standard tokenizer plus a stop-word filter
        {
            "tokenizer": "standard",
            "filter": [
                {
                    "type": "stop",
                    "stop_words": ["is", "the", "this", "a", "an", "and", "or"],
                }
            ],
        },
        # Chinese: jieba tokenizer
        {
            "tokenizer": "jieba",
        },
        # Japanese: lindera tokenizer with the ipadic dictionary plus a stop-word filter
        {
            "tokenizer": {"type": "lindera", "dict_kind": "ipadic"},
            "filter": [
                {
                    "type": "stop",
                    # Common Japanese particles used here as a representative stop-word set
                    "stop_words": ["の", "に", "は", "を", "た", "が", "で", "と"],
                }
            ],
        },
        # Korean: lindera tokenizer with the ko-dic dictionary
        {"tokenizer": {"type": "lindera", "dict_kind": "ko-dic"}},
        # Chinese: lindera tokenizer with the cc-cedict dictionary
        {"tokenizer": {"type": "lindera", "dict_kind": "cc-cedict"}},
    ]
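
    # For context: the framework's run_analyzer wrapper presumably delegates to
    # MilvusClient.run_analyzer. A minimal standalone sketch, assuming
    # pymilvus >= 2.5 and a Milvus server at the illustrative URI below:
    #
    #     from pymilvus import MilvusClient
    #
    #     client = MilvusClient(uri="http://localhost:19530")
    #     res = client.run_analyzer("this is a test", analyzer_params_list[0])
    #     print(res.tokens)  # stop words such as "this" and "is" are filtered out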

    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("analyzer_params", analyzer_params_list)
    def test_analyzer(self, analyzer_params):
        """
        target: test the analyzer with different tokenizer and filter configurations
        method: generate text suited to each analyzer, run the analyzer, and collect the tokens
        expected: tokens are non-empty, appear in the input text, and exclude configured stop words
        """
        client = self._client()
        text = generate_text_by_analyzer(analyzer_params)
        res, result = self.run_analyzer(client, text, analyzer_params)
        tokens = res.tokens
        # Check that tokens were produced
        assert len(tokens) > 0, "No tokens were generated"
        # Check that tokens relate to the input text (every token should be a substring of it)
        assert all(
            token.lower() in text.lower() for token in tokens
        ), "some of the tokens do not appear in the original text"
        # If a stop filter is configured, no emitted token may be a stop word
        if "filter" in analyzer_params:
            # "filter_config" avoids shadowing the builtin filter()
            for filter_config in analyzer_params["filter"]:
                if filter_config["type"] == "stop":
                    stop_words = filter_config["stop_words"]
                    assert not any(
                        token in stop_words for token in tokens
                    ), "some of the tokens are stop words"
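
# Note: the same analyzer_params shape is also used when enabling text analysis
# on a VARCHAR field at schema definition time, e.g. for BM25 full-text search.
# A sketch, assuming pymilvus >= 2.5; the field name is illustrative:
#
#     schema.add_field(
#         field_name="text",
#         datatype=DataType.VARCHAR,
#         max_length=65535,
#         enable_analyzer=True,
#         analyzer_params={"tokenizer": "standard"},
#     )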