# milvus/tests/python_client/milvus_client/test_milvus_client_analyzer.py
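# Tests for MilvusClient analyzer (run_analyzer): standard/jieba/icu tokenizers,
# custom jieba dictionaries (dict/mode/hmm), stop filters, and invalid-parameter
# handling.
#
# A typical call made through the test base class looks like this (a sketch based
# on the calls below; self.run_analyzer is the TestMilvusClientV2Base helper used
# throughout this file):
#
#     res, _ = self.run_analyzer(client, "some text", {"tokenizer": "standard"},
#                                with_detail=True, with_hash=True)
#     tokens = res.tokens  # each entry carries token/hash/offset/position fields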


import pytest
from typing import Any, List, Dict, Protocol, cast
import uuid
from base.client_v2_base import TestMilvusClientV2Base
from common.common_type import CaseLabel
from common.text_generator import generate_text_by_analyzer
class AnalyzerResult(Protocol):
"""Protocol for analyzer result to help with type inference"""
tokens: List[Dict[str, Any]]
class TestMilvusClientAnalyzer(TestMilvusClientV2Base):
@staticmethod
def get_expected_jieba_tokens(text, analyzer_params):
"""
Generate expected tokens using jieba library based on analyzer parameters
"""
import jieba
import importlib
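        # Reload jieba so dictionary state added by earlier parametrized cases
        # does not leak into this one.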
importlib.reload(jieba)
tokenizer_config = analyzer_params.get("tokenizer", {})
# Set up custom dictionary if provided
if "dict" in tokenizer_config:
custom_dict = tokenizer_config["dict"]
if "_default_" in custom_dict:
# jieba.dt.initialize()
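                # "_default_" keeps jieba's built-in dictionary; the remaining
                # entries are simply added on top of it via jieba.add_word().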
for word in custom_dict:
if word != "_default_":
jieba.add_word(word)
print(f"dict length: {len(jieba.dt.FREQ)}")
print(jieba.dt.FREQ)
else:
file_path = f"/tmp/{uuid.uuid4()}.txt"
# add custom words to jieba_dict.txt
for word in custom_dict:
with open(file_path, "w") as f:
f.write(f"{word} 1")
jieba.set_dictionary(file_path)
jieba.dt.tmp_dir = None
jieba.dt.cache_file = None
jieba.dt.FREQ = {}
jieba.dt.initialize()
# Configure mode
mode = tokenizer_config.get("mode", "search")
hmm = tokenizer_config.get("hmm", True)
if mode == "exact":
tokens = list(jieba.cut(text, HMM=hmm))
elif mode == "search":
tokens = list(jieba.cut_for_search(text, HMM=hmm))
else:
tokens = list(jieba.cut(text, HMM=hmm))
# Filter out empty tokens
tokens = [token for token in tokens if token.strip()]
return tokens
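    # Analyzer configurations exercised by test_analyzer: standard, jieba and icu
    # tokenizers, optionally combined with a stop filter.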
analyzer_params_list = [
{
"tokenizer": "standard",
"filter": [
{
"type": "stop",
"stop_words": ["is", "the", "this", "a", "an", "and", "or"],
}
],
},
{
"tokenizer": "jieba",
"filter": [
{
"type": "stop",
"stop_words": ["is", "the", "this", "a", "an", "and", "or", "", "", "", "一个", "", ""],
}
],
},
{
"tokenizer": "icu"
}
# {
# "tokenizer": {"type": "lindera", "dict_kind": "ipadic"},
# "filter": [
# {
# "type": "stop",
# "stop_words": ["は", "が", "の", "に", "を", "で", "と", "た"],
# }
# ],
# },
# {"tokenizer": {"type": "lindera", "dict_kind": "ko-dic"}},
# {"tokenizer": {"type": "lindera", "dict_kind": "cc-cedict"}},
]
jieba_custom_analyzer_params_list = [
        # Test dict parameter with custom dictionary
{
"tokenizer": {
"type": "jieba",
"dict": ["结巴分词器"],
"mode": "exact",
"hmm": False
}
},
# Test dict parameter with default dict and custom dict
{
"tokenizer": {
"type": "jieba",
"dict": ["_default_", "结巴分词器"],
"mode": "search",
"hmm": False
}
},
# Test exact mode with hmm enabled
{
"tokenizer": {
"type": "jieba",
"dict": ["结巴分词器"],
"mode": "exact",
"hmm": True
}
},
# Test search mode with hmm enabled
{
"tokenizer": {
"type": "jieba",
"dict": ["结巴分词器"],
"mode": "search",
"hmm": True
}
},
# Test with only mode configuration
{
"tokenizer": {
"type": "jieba",
"mode": "exact"
}
},
# Test with only hmm configuration
{
"tokenizer": {
"type": "jieba",
"hmm": False
}
}
]
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("analyzer_params", analyzer_params_list)
def test_analyzer(self, analyzer_params):
"""
target: test analyzer
method: use different analyzer params, then run analyzer to get the tokens
expected: verify the tokens
"""
client = self._client()
text = generate_text_by_analyzer(analyzer_params)
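        # Run the analyzer twice on the same text so determinism of tokens,
        # hashes and offsets can be checked below.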
res, _ = self.run_analyzer(client, text, analyzer_params, with_detail=True, with_hash=True)
res_2, _ = self.run_analyzer(client, text, analyzer_params, with_detail=True, with_hash=True)
# Cast to help type inference for gRPC response
analyzer_res = cast(AnalyzerResult, res)
analyzer_res_2 = cast(AnalyzerResult, res_2)
        # verify the results are identical when the analyzer is run twice
        assert len(analyzer_res.tokens) == len(analyzer_res_2.tokens)
        for i in range(len(analyzer_res.tokens)):
assert analyzer_res.tokens[i]["token"] == analyzer_res_2.tokens[i]["token"]
assert analyzer_res.tokens[i]["hash"] == analyzer_res_2.tokens[i]["hash"]
assert analyzer_res.tokens[i]["start_offset"] == analyzer_res_2.tokens[i]["start_offset"]
assert analyzer_res.tokens[i]["end_offset"] == analyzer_res_2.tokens[i]["end_offset"]
assert analyzer_res.tokens[i]["position"] == analyzer_res_2.tokens[i]["position"]
assert analyzer_res.tokens[i]["position_length"] == analyzer_res_2.tokens[i]["position_length"]
tokens = analyzer_res.tokens
token_list = [r["token"] for r in tokens]
# Check tokens are not empty
assert len(token_list) > 0, "No tokens were generated"
        # Check tokens are related to the input text (every token should be a substring of the text)
assert all(
token.lower() in text.lower() for token in token_list
), "some of the tokens do not appear in the original text"
if "filter" in analyzer_params:
for filter in analyzer_params["filter"]:
if filter["type"] == "stop":
stop_words = filter["stop_words"]
assert not any(
token in stop_words for token in tokens
), "some of the tokens are stop words"
# Check hash value and detail
for r in tokens:
assert isinstance(r["hash"], int)
assert isinstance(r["start_offset"], int)
assert isinstance(r["end_offset"], int)
assert isinstance(r["position"], int)
assert isinstance(r["position_length"], int)
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("analyzer_params", jieba_custom_analyzer_params_list)
def test_jieba_custom_analyzer(self, analyzer_params):
"""
target: test jieba analyzer with custom configurations
method: use different jieba analyzer params with dict, mode, and hmm configurations
expected: verify the tokens are generated correctly based on configuration
"""
client = self._client()
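        # The text mixes an ASCII term, the custom dictionary word "结巴分词器",
        # and additional Chinese so the effect of the custom configuration shows
        # up in the resulting tokens.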
text = "milvus结巴分词器中文测试"
res, _ = self.run_analyzer(client, text, analyzer_params, with_detail=True)
analyzer_res = cast(AnalyzerResult, res)
tokens = analyzer_res.tokens
token_list = [r["token"] for r in tokens]
# Check tokens are not empty
assert len(token_list) > 0, "No tokens were generated"
# Generate expected tokens using jieba library and compare
expected_tokens = self.get_expected_jieba_tokens(text, analyzer_params)
assert sorted(token_list) == sorted(expected_tokens), f"Expected {expected_tokens}, but got {token_list}"
# Verify token details
for r in tokens:
assert isinstance(r["token"], str)
assert isinstance(r["start_offset"], int)
assert isinstance(r["end_offset"], int)
assert isinstance(r["position"], int)
assert isinstance(r["position_length"], int)
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("invalid_analyzer_params", [
{"tokenizer": "invalid_tokenizer"},
{"tokenizer": 123},
{"tokenizer": None},
{"tokenizer": []},
{"tokenizer": {"type": "invalid_type"}},
{"tokenizer": {"type": None}},
{"filter": "invalid_filter"},
{"filter": [{"type": None}]},
{"filter": [{"invalid_key": "value"}]},
])
def test_analyzer_with_invalid_params(self, invalid_analyzer_params):
"""
target: test analyzer with invalid parameters
method: use invalid analyzer params and expect errors
expected: analyzer should raise appropriate exceptions
"""
client = self._client()
text = "test text for invalid analyzer"
with pytest.raises(Exception):
self.run_analyzer(client, text, invalid_analyzer_params)
@pytest.mark.tags(CaseLabel.L1)
def test_analyzer_with_empty_params(self):
"""
target: test analyzer with empty parameters (uses default)
method: use empty analyzer params
expected: analyzer should use default configuration and work normally
"""
client = self._client()
text = "test text for empty analyzer"
# Empty params should use default configuration
res, _ = self.run_analyzer(client, text, {})
analyzer_res = cast(AnalyzerResult, res)
assert len(analyzer_res.tokens) > 0
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("invalid_text", [
None,
123,
True,
False,
])
def test_analyzer_with_invalid_text(self, invalid_text):
"""
target: test analyzer with invalid text input
method: use valid analyzer params but invalid text
expected: analyzer should handle invalid text appropriately
"""
client = self._client()
analyzer_params = {"tokenizer": "standard"}
with pytest.raises(Exception):
self.run_analyzer(client, invalid_text, analyzer_params)
@pytest.mark.tags(CaseLabel.L1)
def test_analyzer_with_empty_text(self):
"""
target: test analyzer with empty text
method: use empty text input
expected: analyzer should return empty tokens
"""
client = self._client()
analyzer_params = {"tokenizer": "standard"}
res, _ = self.run_analyzer(client, "", analyzer_params)
analyzer_res = cast(AnalyzerResult, res)
assert len(analyzer_res.tokens) == 0
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("text_input", [
[],
{},
["list", "of", "strings"],
{"key": "value"},
])
def test_analyzer_with_structured_text(self, text_input):
"""
target: test analyzer with structured text input (list/dict)
method: use list or dict as text input
expected: analyzer should handle structured input and return tokens
"""
client = self._client()
analyzer_params = {"tokenizer": "standard"}
res, _ = self.run_analyzer(client, text_input, analyzer_params)
        # For structured (list/dict) input, the API returns a plain list rather than a single result object
assert isinstance(res, list)
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("invalid_jieba_params", [
{"tokenizer": {"type": "jieba", "dict": "not_a_list"}},
{"tokenizer": {"type": "jieba", "dict": [123, 456]}},
{"tokenizer": {"type": "jieba", "mode": "invalid_mode"}},
{"tokenizer": {"type": "jieba", "mode": 123}},
{"tokenizer": {"type": "jieba", "hmm": "not_boolean"}},
{"tokenizer": {"type": "jieba", "hmm": 123}},
])
def test_jieba_analyzer_with_invalid_config(self, invalid_jieba_params):
"""
target: test jieba analyzer with invalid configurations
method: use jieba analyzer with invalid dict, mode, or hmm values
expected: analyzer should raise appropriate exceptions
"""
client = self._client()
text = "测试文本 for jieba analyzer"
with pytest.raises(Exception):
self.run_analyzer(client, text, invalid_jieba_params)
@pytest.mark.tags(CaseLabel.L1)
def test_jieba_analyzer_with_empty_dict(self):
"""
target: test jieba analyzer with empty dictionary
method: use jieba analyzer with empty dict list
expected: analyzer should work with empty dict (uses default)
"""
client = self._client()
text = "测试文本 for jieba analyzer"
jieba_params = {"tokenizer": {"type": "jieba", "dict": []}}
res, _ = self.run_analyzer(client, text, jieba_params)
analyzer_res = cast(AnalyzerResult, res)
assert len(analyzer_res.tokens) > 0
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("invalid_dict_config", [
{"tokenizer": {"type": "jieba", "dict": None}},
{"tokenizer": {"type": "jieba", "dict": "invalid_string"}},
{"tokenizer": {"type": "jieba", "dict": 123}},
{"tokenizer": {"type": "jieba", "dict": True}},
{"tokenizer": {"type": "jieba", "dict": {"invalid": "dict"}}},
])
def test_jieba_analyzer_with_invalid_dict_values(self, invalid_dict_config):
"""
target: test jieba analyzer with invalid dict configurations
method: use jieba analyzer with invalid dict values
expected: analyzer should raise appropriate exceptions
"""
client = self._client()
text = "测试文本 for jieba analyzer"
with pytest.raises(Exception):
self.run_analyzer(client, text, invalid_dict_config)
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("edge_case_dict_config", [
{"tokenizer": {"type": "jieba", "dict": ["", "valid_word"]}}, # Empty string in list
{"tokenizer": {"type": "jieba", "dict": ["valid_word", "valid_word"]}}, # Duplicate words
{"tokenizer": {"type": "jieba", "dict": ["_default_"]}}, # Only default dict
])
def test_jieba_analyzer_with_edge_case_dict_values(self, edge_case_dict_config):
"""
target: test jieba analyzer with edge case dict configurations
method: use jieba analyzer with edge case dict values
expected: analyzer should handle these cases gracefully
"""
client = self._client()
text = "测试文本 for jieba analyzer"
res, _ = self.run_analyzer(client, text, edge_case_dict_config, with_detail=True)
analyzer_res = cast(AnalyzerResult, res)
# These should work but might not be recommended usage
assert len(analyzer_res.tokens) >= 0
@pytest.mark.tags(CaseLabel.L1)
def test_jieba_analyzer_with_unknown_param(self):
"""
target: test jieba analyzer with unknown parameter
method: use jieba analyzer with invalid parameter name
expected: analyzer should ignore unknown parameters and work normally
"""
client = self._client()
text = "测试文本 for jieba analyzer"
jieba_params = {"tokenizer": {"type": "jieba", "invalid_param": "value"}}
res, _ = self.run_analyzer(client, text, jieba_params)
analyzer_res = cast(AnalyzerResult, res)
assert len(analyzer_res.tokens) > 0
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("invalid_filter_params", [
{"tokenizer": "standard", "filter": [{"type": "stop"}]},
{"tokenizer": "standard", "filter": [{"type": "stop", "stop_words": "not_a_list"}]},
{"tokenizer": "standard", "filter": [{"type": "stop", "stop_words": [123, 456]}]},
{"tokenizer": "standard", "filter": [{"type": "invalid_filter_type"}]},
{"tokenizer": "standard", "filter": [{"type": "stop", "stop_words": None}]},
])
def test_analyzer_with_invalid_filter(self, invalid_filter_params):
"""
target: test analyzer with invalid filter configurations
method: use analyzer with invalid filter parameters
expected: analyzer should handle invalid filters appropriately
"""
client = self._client()
text = "This is a test text with stop words"
with pytest.raises(Exception):
self.run_analyzer(client, text, invalid_filter_params)
@pytest.mark.tags(CaseLabel.L1)
def test_analyzer_with_empty_stop_words(self):
"""
target: test analyzer with empty stop words list
method: use stop filter with empty stop_words list
expected: analyzer should work normally with empty stop words (no filtering)
"""
client = self._client()
text = "This is a test text with stop words"
filter_params = {"tokenizer": "standard", "filter": [{"type": "stop", "stop_words": []}]}
res, _ = self.run_analyzer(client, text, filter_params, with_detail=True)
analyzer_res = cast(AnalyzerResult, res)
tokens = analyzer_res.tokens
token_list = [r["token"] for r in tokens]
assert len(token_list) > 0
# With empty stop words, no filtering should occur
assert "is" in token_list # Common stop word should still be present