import uuid
from typing import Any, List, Dict, Protocol, cast

import pytest

from base.client_v2_base import TestMilvusClientV2Base
from common.common_type import CaseLabel
from common.text_generator import generate_text_by_analyzer


class AnalyzerResult(Protocol):
    """Protocol for the analyzer result, to help with type inference"""

    tokens: List[Dict[str, Any]]


class TestMilvusClientAnalyzer(TestMilvusClientV2Base):
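    """Tests for the MilvusClient run_analyzer API covering built-in analyzers and custom jieba configurations."""
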
    @staticmethod
    def get_expected_jieba_tokens(text, analyzer_params):
        """
        Generate expected tokens using the jieba library, based on the analyzer parameters.
        """
        import jieba
        import importlib
        importlib.reload(jieba)
        tokenizer_config = analyzer_params.get("tokenizer", {})
        # Set up a custom dictionary if one is provided
        if "dict" in tokenizer_config:
            custom_dict = tokenizer_config["dict"]
            if "_default_" in custom_dict:
                # Keep the default dictionary and add the custom words on top of it
                for word in custom_dict:
                    if word != "_default_":
                        jieba.add_word(word)
                print(f"dict length: {len(jieba.dt.FREQ)}")
                print(jieba.dt.FREQ)
            else:
                # Replace the default dictionary with a temporary file containing only the custom words
                file_path = f"/tmp/{uuid.uuid4()}.txt"
                with open(file_path, "w") as f:
                    for word in custom_dict:
                        f.write(f"{word} 1\n")
                jieba.set_dictionary(file_path)
                jieba.dt.tmp_dir = None
                jieba.dt.cache_file = None
                jieba.dt.FREQ = {}
                jieba.dt.initialize()

        # Configure the cut mode and HMM setting
        mode = tokenizer_config.get("mode", "search")
        hmm = tokenizer_config.get("hmm", True)

        if mode == "exact":
            tokens = list(jieba.cut(text, HMM=hmm))
        elif mode == "search":
            tokens = list(jieba.cut_for_search(text, HMM=hmm))
        else:
            tokens = list(jieba.cut(text, HMM=hmm))

        # Filter out empty tokens
        tokens = [token for token in tokens if token.strip()]

        return tokens
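    # Illustrative only: how the helper above builds the expectation that
    # test_jieba_custom_analyzer compares against. The token values shown are
    # hypothetical and depend on the installed jieba version and dictionary.
    #
    #   params = {"tokenizer": {"type": "jieba", "dict": ["结巴分词器"], "mode": "exact", "hmm": False}}
    #   expected = TestMilvusClientAnalyzer.get_expected_jieba_tokens("milvus结巴分词器", params)
    #   # e.g. expected ~= ["milvus", "结巴分词器"]
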
    analyzer_params_list = [
        {
            "tokenizer": "standard",
            "filter": [
                {
                    "type": "stop",
                    "stop_words": ["is", "the", "this", "a", "an", "and", "or"],
                }
            ],
        },
        {
            "tokenizer": "jieba",
            "filter": [
                {
                    "type": "stop",
                    "stop_words": ["is", "the", "this", "a", "an", "and", "or", "是", "的", "这", "一个", "和", "或"],
                }
            ],
        },
        {
            "tokenizer": "icu"
        },
        # {
        #     "tokenizer": {"type": "lindera", "dict_kind": "ipadic"},
        #     "filter": [
        #         {
        #             "type": "stop",
        #             "stop_words": ["は", "が", "の", "に", "を", "で", "と", "た"],
        #         }
        #     ],
        # },
        # {"tokenizer": {"type": "lindera", "dict_kind": "ko-dic"}},
        # {"tokenizer": {"type": "lindera", "dict_kind": "cc-cedict"}},
    ]
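    # Custom jieba configurations exercising the "dict", "mode", and "hmm" tokenizer options.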
    jieba_custom_analyzer_params_list = [
        # Test dict parameter with custom dictionary
        {
            "tokenizer": {
                "type": "jieba",
                "dict": ["结巴分词器"],
                "mode": "exact",
                "hmm": False
            }
        },
        # Test dict parameter with default dict and custom dict
        {
            "tokenizer": {
                "type": "jieba",
                "dict": ["_default_", "结巴分词器"],
                "mode": "search",
                "hmm": False
            }
        },
        # Test exact mode with hmm enabled
        {
            "tokenizer": {
                "type": "jieba",
                "dict": ["结巴分词器"],
                "mode": "exact",
                "hmm": True
            }
        },
        # Test search mode with hmm enabled
        {
            "tokenizer": {
                "type": "jieba",
                "dict": ["结巴分词器"],
                "mode": "search",
                "hmm": True
            }
        },
        # Test with only mode configuration
        {
            "tokenizer": {
                "type": "jieba",
                "mode": "exact"
            }
        },
        # Test with only hmm configuration
        {
            "tokenizer": {
                "type": "jieba",
                "hmm": False
            }
        }
    ]
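    # run_analyzer tokenizes the text with the given analyzer params and returns the token stream;
    # with_detail adds per-token offset/position fields and with_hash adds a per-token hash,
    # as exercised by the assertions below.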
    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("analyzer_params", analyzer_params_list)
    def test_analyzer(self, analyzer_params):
        """
        target: test analyzer
        method: use different analyzer params, then run the analyzer to get the tokens
        expected: verify the tokens
        """
        client = self._client()
        text = generate_text_by_analyzer(analyzer_params)
        res, _ = self.run_analyzer(client, text, analyzer_params, with_detail=True, with_hash=True)
        res_2, _ = self.run_analyzer(client, text, analyzer_params, with_detail=True, with_hash=True)

        # Cast to help type inference for the gRPC response
        analyzer_res = cast(AnalyzerResult, res)
        analyzer_res_2 = cast(AnalyzerResult, res_2)

        # Verify the results are identical when the analyzer is run twice
        assert len(analyzer_res.tokens) == len(analyzer_res_2.tokens)
        for i in range(len(analyzer_res.tokens)):
            assert analyzer_res.tokens[i]["token"] == analyzer_res_2.tokens[i]["token"]
            assert analyzer_res.tokens[i]["hash"] == analyzer_res_2.tokens[i]["hash"]
            assert analyzer_res.tokens[i]["start_offset"] == analyzer_res_2.tokens[i]["start_offset"]
            assert analyzer_res.tokens[i]["end_offset"] == analyzer_res_2.tokens[i]["end_offset"]
            assert analyzer_res.tokens[i]["position"] == analyzer_res_2.tokens[i]["position"]
            assert analyzer_res.tokens[i]["position_length"] == analyzer_res_2.tokens[i]["position_length"]

        tokens = analyzer_res.tokens
        token_list = [r["token"] for r in tokens]
        # Check tokens are not empty
        assert len(token_list) > 0, "No tokens were generated"

        # Check tokens are related to the input text (every token should be a substring of the text)
        assert all(
            token.lower() in text.lower() for token in token_list
        ), "some of the tokens do not appear in the original text"

        # Check that stop words configured in the filter were removed
        if "filter" in analyzer_params:
            for filter_config in analyzer_params["filter"]:
                if filter_config["type"] == "stop":
                    stop_words = filter_config["stop_words"]
                    assert not any(
                        token in stop_words for token in token_list
                    ), "some of the tokens are stop words"

        # Check the hash value and detail fields
        for r in tokens:
            assert isinstance(r["hash"], int)
            assert isinstance(r["start_offset"], int)
            assert isinstance(r["end_offset"], int)
            assert isinstance(r["position"], int)
            assert isinstance(r["position_length"], int)
    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("analyzer_params", jieba_custom_analyzer_params_list)
    def test_jieba_custom_analyzer(self, analyzer_params):
        """
        target: test jieba analyzer with custom configurations
        method: use different jieba analyzer params with dict, mode, and hmm configurations
        expected: verify the tokens are generated correctly based on configuration
        """
        client = self._client()
        text = "milvus结巴分词器中文测试"
        res, _ = self.run_analyzer(client, text, analyzer_params, with_detail=True)

        analyzer_res = cast(AnalyzerResult, res)
        tokens = analyzer_res.tokens
        token_list = [r["token"] for r in tokens]

        # Check tokens are not empty
        assert len(token_list) > 0, "No tokens were generated"

        # Generate expected tokens using the jieba library and compare
        expected_tokens = self.get_expected_jieba_tokens(text, analyzer_params)
        assert sorted(token_list) == sorted(expected_tokens), f"Expected {expected_tokens}, but got {token_list}"

        # Verify token details
        for r in tokens:
            assert isinstance(r["token"], str)
            assert isinstance(r["start_offset"], int)
            assert isinstance(r["end_offset"], int)
            assert isinstance(r["position"], int)
            assert isinstance(r["position_length"], int)
    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("invalid_analyzer_params", [
        {"tokenizer": "invalid_tokenizer"},
        {"tokenizer": 123},
        {"tokenizer": None},
        {"tokenizer": []},
        {"tokenizer": {"type": "invalid_type"}},
        {"tokenizer": {"type": None}},
        {"filter": "invalid_filter"},
        {"filter": [{"type": None}]},
        {"filter": [{"invalid_key": "value"}]},
    ])
    def test_analyzer_with_invalid_params(self, invalid_analyzer_params):
        """
        target: test analyzer with invalid parameters
        method: use invalid analyzer params and expect errors
        expected: analyzer should raise appropriate exceptions
        """
        client = self._client()
        text = "test text for invalid analyzer"

        with pytest.raises(Exception):
            self.run_analyzer(client, text, invalid_analyzer_params)
    @pytest.mark.tags(CaseLabel.L1)
    def test_analyzer_with_empty_params(self):
        """
        target: test analyzer with empty parameters (uses default)
        method: use empty analyzer params
        expected: analyzer should use default configuration and work normally
        """
        client = self._client()
        text = "test text for empty analyzer"

        # Empty params should use default configuration
        res, _ = self.run_analyzer(client, text, {})
        analyzer_res = cast(AnalyzerResult, res)
        assert len(analyzer_res.tokens) > 0
    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("invalid_text", [
        None,
        123,
        True,
        False,
    ])
    def test_analyzer_with_invalid_text(self, invalid_text):
        """
        target: test analyzer with invalid text input
        method: use valid analyzer params but invalid text
        expected: analyzer should handle invalid text appropriately
        """
        client = self._client()
        analyzer_params = {"tokenizer": "standard"}

        with pytest.raises(Exception):
            self.run_analyzer(client, invalid_text, analyzer_params)
    @pytest.mark.tags(CaseLabel.L1)
    def test_analyzer_with_empty_text(self):
        """
        target: test analyzer with empty text
        method: use empty text input
        expected: analyzer should return empty tokens
        """
        client = self._client()
        analyzer_params = {"tokenizer": "standard"}

        res, _ = self.run_analyzer(client, "", analyzer_params)
        analyzer_res = cast(AnalyzerResult, res)
        assert len(analyzer_res.tokens) == 0
    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("text_input", [
        [],
        {},
        ["list", "of", "strings"],
        {"key": "value"},
    ])
    def test_analyzer_with_structured_text(self, text_input):
        """
        target: test analyzer with structured text input (list/dict)
        method: use list or dict as text input
        expected: analyzer should handle structured input and return tokens
        """
        client = self._client()
        analyzer_params = {"tokenizer": "standard"}

        res, _ = self.run_analyzer(client, text_input, analyzer_params)
        # For structured input, the API returns a direct list format
        assert isinstance(res, list)
    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("invalid_jieba_params", [
        {"tokenizer": {"type": "jieba", "dict": "not_a_list"}},
        {"tokenizer": {"type": "jieba", "dict": [123, 456]}},
        {"tokenizer": {"type": "jieba", "mode": "invalid_mode"}},
        {"tokenizer": {"type": "jieba", "mode": 123}},
        {"tokenizer": {"type": "jieba", "hmm": "not_boolean"}},
        {"tokenizer": {"type": "jieba", "hmm": 123}},
    ])
    def test_jieba_analyzer_with_invalid_config(self, invalid_jieba_params):
        """
        target: test jieba analyzer with invalid configurations
        method: use jieba analyzer with invalid dict, mode, or hmm values
        expected: analyzer should raise appropriate exceptions
        """
        client = self._client()
        text = "测试文本 for jieba analyzer"

        with pytest.raises(Exception):
            self.run_analyzer(client, text, invalid_jieba_params)
    @pytest.mark.tags(CaseLabel.L1)
    def test_jieba_analyzer_with_empty_dict(self):
        """
        target: test jieba analyzer with empty dictionary
        method: use jieba analyzer with empty dict list
        expected: analyzer should work with empty dict (uses default)
        """
        client = self._client()
        text = "测试文本 for jieba analyzer"
        jieba_params = {"tokenizer": {"type": "jieba", "dict": []}}

        res, _ = self.run_analyzer(client, text, jieba_params)
        analyzer_res = cast(AnalyzerResult, res)
        assert len(analyzer_res.tokens) > 0
    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("invalid_dict_config", [
        {"tokenizer": {"type": "jieba", "dict": None}},
        {"tokenizer": {"type": "jieba", "dict": "invalid_string"}},
        {"tokenizer": {"type": "jieba", "dict": 123}},
        {"tokenizer": {"type": "jieba", "dict": True}},
        {"tokenizer": {"type": "jieba", "dict": {"invalid": "dict"}}},
    ])
    def test_jieba_analyzer_with_invalid_dict_values(self, invalid_dict_config):
        """
        target: test jieba analyzer with invalid dict configurations
        method: use jieba analyzer with invalid dict values
        expected: analyzer should raise appropriate exceptions
        """
        client = self._client()
        text = "测试文本 for jieba analyzer"

        with pytest.raises(Exception):
            self.run_analyzer(client, text, invalid_dict_config)
    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("edge_case_dict_config", [
        {"tokenizer": {"type": "jieba", "dict": ["", "valid_word"]}},  # Empty string in list
        {"tokenizer": {"type": "jieba", "dict": ["valid_word", "valid_word"]}},  # Duplicate words
        {"tokenizer": {"type": "jieba", "dict": ["_default_"]}},  # Only default dict
    ])
    def test_jieba_analyzer_with_edge_case_dict_values(self, edge_case_dict_config):
        """
        target: test jieba analyzer with edge case dict configurations
        method: use jieba analyzer with edge case dict values
        expected: analyzer should handle these cases gracefully
        """
        client = self._client()
        text = "测试文本 for jieba analyzer"

        res, _ = self.run_analyzer(client, text, edge_case_dict_config, with_detail=True)
        analyzer_res = cast(AnalyzerResult, res)
        # These should work but might not be recommended usage
        assert len(analyzer_res.tokens) >= 0
    @pytest.mark.tags(CaseLabel.L1)
    def test_jieba_analyzer_with_unknown_param(self):
        """
        target: test jieba analyzer with unknown parameter
        method: use jieba analyzer with invalid parameter name
        expected: analyzer should ignore unknown parameters and work normally
        """
        client = self._client()
        text = "测试文本 for jieba analyzer"
        jieba_params = {"tokenizer": {"type": "jieba", "invalid_param": "value"}}

        res, _ = self.run_analyzer(client, text, jieba_params)
        analyzer_res = cast(AnalyzerResult, res)
        assert len(analyzer_res.tokens) > 0
    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("invalid_filter_params", [
        {"tokenizer": "standard", "filter": [{"type": "stop"}]},
        {"tokenizer": "standard", "filter": [{"type": "stop", "stop_words": "not_a_list"}]},
        {"tokenizer": "standard", "filter": [{"type": "stop", "stop_words": [123, 456]}]},
        {"tokenizer": "standard", "filter": [{"type": "invalid_filter_type"}]},
        {"tokenizer": "standard", "filter": [{"type": "stop", "stop_words": None}]},
    ])
    def test_analyzer_with_invalid_filter(self, invalid_filter_params):
        """
        target: test analyzer with invalid filter configurations
        method: use analyzer with invalid filter parameters
        expected: analyzer should handle invalid filters appropriately
        """
        client = self._client()
        text = "This is a test text with stop words"

        with pytest.raises(Exception):
            self.run_analyzer(client, text, invalid_filter_params)
    @pytest.mark.tags(CaseLabel.L1)
    def test_analyzer_with_empty_stop_words(self):
        """
        target: test analyzer with empty stop words list
        method: use stop filter with empty stop_words list
        expected: analyzer should work normally with empty stop words (no filtering)
        """
        client = self._client()
        text = "This is a test text with stop words"
        filter_params = {"tokenizer": "standard", "filter": [{"type": "stop", "stop_words": []}]}

        res, _ = self.run_analyzer(client, text, filter_params, with_detail=True)
        analyzer_res = cast(AnalyzerResult, res)
        tokens = analyzer_res.tokens
        token_list = [r["token"] for r in tokens]

        assert len(token_list) > 0
        # With empty stop words, no filtering should occur
        assert "is" in token_list  # Common stop word should still be present