# milvus/tests/python_client/milvus_client/test_milvus_client_analyzer.py
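# Tests for MilvusClient analyzer (run_analyzer): standard/jieba/icu tokenizers,
# custom jieba dictionaries (dict/mode/hmm), stop filters, and invalid-parameter
# handling.
#
# A typical call made through the test base class looks like this (a sketch based
# on the calls below; self.run_analyzer is the TestMilvusClientV2Base helper used
# throughout this file):
#
#     res, _ = self.run_analyzer(client, "some text", {"tokenizer": "standard"},
#                                with_detail=True, with_hash=True)
#     tokens = res.tokens  # each entry carries token/hash/offset/position fields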


import pytest
from typing import Any, List, Dict, Protocol, cast
import uuid
from base.client_v2_base import TestMilvusClientV2Base
from common.common_type import CaseLabel
from common.text_generator import generate_text_by_analyzer
class AnalyzerResult(Protocol):
"""Protocol for analyzer result to help with type inference"""
tokens: List[Dict[str, Any]]
class TestMilvusClientAnalyzer(TestMilvusClientV2Base):
@staticmethod
def get_expected_jieba_tokens(text, analyzer_params):
"""
Generate expected tokens using jieba library based on analyzer parameters
"""
import jieba
import importlib
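        # Reload jieba so dictionary state added by earlier parametrized cases
        # does not leak into this one.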
importlib.reload(jieba)
tokenizer_config = analyzer_params.get("tokenizer", {})
# Set up custom dictionary if provided
if "dict" in tokenizer_config:
custom_dict = tokenizer_config["dict"]
if "_default_" in custom_dict:
# jieba.dt.initialize()
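                # "_default_" keeps jieba's built-in dictionary; the remaining
                # entries are simply added on top of it via jieba.add_word().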
for word in custom_dict:
if word != "_default_":
jieba.add_word(word)
print(f"dict length: {len(jieba.dt.FREQ)}")
print(jieba.dt.FREQ)
else:
file_path = f"/tmp/{uuid.uuid4()}.txt"
# add custom words to jieba_dict.txt
for word in custom_dict:
with open(file_path, "w") as f:
f.write(f"{word} 1")
jieba.set_dictionary(file_path)
jieba.dt.tmp_dir = None
jieba.dt.cache_file = None
jieba.dt.FREQ = {}
jieba.dt.initialize()
# Configure mode
mode = tokenizer_config.get("mode", "search")
hmm = tokenizer_config.get("hmm", True)
if mode == "exact":
tokens = list(jieba.cut(text, HMM=hmm))
elif mode == "search":
tokens = list(jieba.cut_for_search(text, HMM=hmm))
else:
tokens = list(jieba.cut(text, HMM=hmm))
# Filter out empty tokens
tokens = [token for token in tokens if token.strip()]
return tokens
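    # Analyzer configurations exercised by test_analyzer: standard, jieba and icu
    # tokenizers, optionally combined with a stop filter.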
analyzer_params_list = [
{
"tokenizer": "standard",
"filter": [
{
"type": "stop",
"stop_words": ["is", "the", "this", "a", "an", "and", "or"],
}
],
},
{
"tokenizer": "jieba",
"filter": [
{
"type": "stop",
"stop_words": ["is", "the", "this", "a", "an", "and", "or", "", "", "", "一个", "", ""],
}
],
},
{
"tokenizer": "icu"
}
# {
# "tokenizer": {"type": "lindera", "dict_kind": "ipadic"},
# "filter": [
# {
# "type": "stop",
# "stop_words": ["は", "が", "の", "に", "を", "で", "と", "た"],
# }
# ],
# },
# {"tokenizer": {"type": "lindera", "dict_kind": "ko-dic"}},
# {"tokenizer": {"type": "lindera", "dict_kind": "cc-cedict"}},
]
jieba_custom_analyzer_params_list = [
        # Test dict parameter with custom dictionary
{
"tokenizer": {
"type": "jieba",
"dict": ["结巴分词器"],
"mode": "exact",
"hmm": False
}
},
# Test dict parameter with default dict and custom dict
{
"tokenizer": {
"type": "jieba",
"dict": ["_default_", "结巴分词器"],
"mode": "search",
"hmm": False
}
},
# Test exact mode with hmm enabled
{
"tokenizer": {
"type": "jieba",
"dict": ["结巴分词器"],
"mode": "exact",
"hmm": True
}
},
# Test search mode with hmm enabled
{
"tokenizer": {
"type": "jieba",
"dict": ["结巴分词器"],
"mode": "search",
"hmm": True
}
},
# Test with only mode configuration
{
"tokenizer": {
"type": "jieba",
"mode": "exact"
}
},
# Test with only hmm configuration
{
"tokenizer": {
"type": "jieba",
"hmm": False
}
}
]
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("analyzer_params", analyzer_params_list)
def test_analyzer(self, analyzer_params):
"""
target: test analyzer
method: use different analyzer params, then run analyzer to get the tokens
expected: verify the tokens
"""
client = self._client()
text = generate_text_by_analyzer(analyzer_params)
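        # Run the analyzer twice on the same text so determinism of tokens,
        # hashes and offsets can be checked below.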
res, _ = self.run_analyzer(client, text, analyzer_params, with_detail=True, with_hash=True)
res_2, _ = self.run_analyzer(client, text, analyzer_params, with_detail=True, with_hash=True)
# Cast to help type inference for gRPC response
analyzer_res = cast(AnalyzerResult, res)
analyzer_res_2 = cast(AnalyzerResult, res_2)
        # verify the results are identical when the analyzer is run twice
        assert len(analyzer_res.tokens) == len(analyzer_res_2.tokens)
        for i in range(len(analyzer_res.tokens)):
assert analyzer_res.tokens[i]["token"] == analyzer_res_2.tokens[i]["token"]
assert analyzer_res.tokens[i]["hash"] == analyzer_res_2.tokens[i]["hash"]
assert analyzer_res.tokens[i]["start_offset"] == analyzer_res_2.tokens[i]["start_offset"]
assert analyzer_res.tokens[i]["end_offset"] == analyzer_res_2.tokens[i]["end_offset"]
assert analyzer_res.tokens[i]["position"] == analyzer_res_2.tokens[i]["position"]
assert analyzer_res.tokens[i]["position_length"] == analyzer_res_2.tokens[i]["position_length"]
tokens = analyzer_res.tokens
token_list = [r["token"] for r in tokens]
# Check tokens are not empty
assert len(token_list) > 0, "No tokens were generated"
        # Check tokens are related to the input text (every token should be a substring of the text)
assert all(
token.lower() in text.lower() for token in token_list
), "some of the tokens do not appear in the original text"
if "filter" in analyzer_params:
for filter in analyzer_params["filter"]:
if filter["type"] == "stop":
stop_words = filter["stop_words"]
assert not any(
token in stop_words for token in tokens
), "some of the tokens are stop words"
# Check hash value and detail
for r in tokens:
assert isinstance(r["hash"], int)
assert isinstance(r["start_offset"], int)
assert isinstance(r["end_offset"], int)
assert isinstance(r["position"], int)
assert isinstance(r["position_length"], int)
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("analyzer_params", jieba_custom_analyzer_params_list)
def test_jieba_custom_analyzer(self, analyzer_params):
"""
target: test jieba analyzer with custom configurations
method: use different jieba analyzer params with dict, mode, and hmm configurations
expected: verify the tokens are generated correctly based on configuration
"""
client = self._client()
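        # The text mixes an ASCII term, the custom dictionary word "结巴分词器",
        # and additional Chinese so the effect of the custom configuration shows
        # up in the resulting tokens.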
text = "milvus结巴分词器中文测试"
res, _ = self.run_analyzer(client, text, analyzer_params, with_detail=True)
analyzer_res = cast(AnalyzerResult, res)
tokens = analyzer_res.tokens
token_list = [r["token"] for r in tokens]
# Check tokens are not empty
assert len(token_list) > 0, "No tokens were generated"
# Generate expected tokens using jieba library and compare
expected_tokens = self.get_expected_jieba_tokens(text, analyzer_params)
assert sorted(token_list) == sorted(expected_tokens), f"Expected {expected_tokens}, but got {token_list}"
# Verify token details
for r in tokens:
assert isinstance(r["token"], str)
assert isinstance(r["start_offset"], int)
assert isinstance(r["end_offset"], int)
assert isinstance(r["position"], int)
assert isinstance(r["position_length"], int)
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("invalid_analyzer_params", [
{"tokenizer": "invalid_tokenizer"},
{"tokenizer": 123},
{"tokenizer": None},
{"tokenizer": []},
{"tokenizer": {"type": "invalid_type"}},
{"tokenizer": {"type": None}},
{"filter": "invalid_filter"},
{"filter": [{"type": None}]},
{"filter": [{"invalid_key": "value"}]},
])
def test_analyzer_with_invalid_params(self, invalid_analyzer_params):
"""
target: test analyzer with invalid parameters
method: use invalid analyzer params and expect errors
expected: analyzer should raise appropriate exceptions
"""
client = self._client()
text = "test text for invalid analyzer"
with pytest.raises(Exception):
self.run_analyzer(client, text, invalid_analyzer_params)
@pytest.mark.tags(CaseLabel.L1)
def test_analyzer_with_empty_params(self):
"""
target: test analyzer with empty parameters (uses default)
method: use empty analyzer params
expected: analyzer should use default configuration and work normally
"""
client = self._client()
text = "test text for empty analyzer"
# Empty params should use default configuration
res, _ = self.run_analyzer(client, text, {})
analyzer_res = cast(AnalyzerResult, res)
assert len(analyzer_res.tokens) > 0
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("invalid_text", [
None,
123,
True,
False,
])
def test_analyzer_with_invalid_text(self, invalid_text):
"""
target: test analyzer with invalid text input
method: use valid analyzer params but invalid text
expected: analyzer should handle invalid text appropriately
"""
client = self._client()
analyzer_params = {"tokenizer": "standard"}
with pytest.raises(Exception):
self.run_analyzer(client, invalid_text, analyzer_params)
@pytest.mark.tags(CaseLabel.L1)
def test_analyzer_with_empty_text(self):
"""
target: test analyzer with empty text
method: use empty text input
expected: analyzer should return empty tokens
"""
client = self._client()
analyzer_params = {"tokenizer": "standard"}
res, _ = self.run_analyzer(client, "", analyzer_params)
analyzer_res = cast(AnalyzerResult, res)
assert len(analyzer_res.tokens) == 0
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("text_input", [
[],
{},
["list", "of", "strings"],
{"key": "value"},
])
def test_analyzer_with_structured_text(self, text_input):
"""
target: test analyzer with structured text input (list/dict)
method: use list or dict as text input
expected: analyzer should handle structured input and return tokens
"""
client = self._client()
analyzer_params = {"tokenizer": "standard"}
res, _ = self.run_analyzer(client, text_input, analyzer_params)
        # For structured (list/dict) input, the API returns a plain list rather than a single result object
assert isinstance(res, list)
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("invalid_jieba_params", [
{"tokenizer": {"type": "jieba", "dict": "not_a_list"}},
{"tokenizer": {"type": "jieba", "dict": [123, 456]}},
{"tokenizer": {"type": "jieba", "mode": "invalid_mode"}},
{"tokenizer": {"type": "jieba", "mode": 123}},
{"tokenizer": {"type": "jieba", "hmm": "not_boolean"}},
{"tokenizer": {"type": "jieba", "hmm": 123}},
])
def test_jieba_analyzer_with_invalid_config(self, invalid_jieba_params):
"""
target: test jieba analyzer with invalid configurations
method: use jieba analyzer with invalid dict, mode, or hmm values
expected: analyzer should raise appropriate exceptions
"""
client = self._client()
text = "测试文本 for jieba analyzer"
with pytest.raises(Exception):
self.run_analyzer(client, text, invalid_jieba_params)
@pytest.mark.tags(CaseLabel.L1)
def test_jieba_analyzer_with_empty_dict(self):
"""
target: test jieba analyzer with empty dictionary
method: use jieba analyzer with empty dict list
expected: analyzer should work with empty dict (uses default)
"""
client = self._client()
text = "测试文本 for jieba analyzer"
jieba_params = {"tokenizer": {"type": "jieba", "dict": []}}
res, _ = self.run_analyzer(client, text, jieba_params)
analyzer_res = cast(AnalyzerResult, res)
assert len(analyzer_res.tokens) > 0
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("invalid_dict_config", [
{"tokenizer": {"type": "jieba", "dict": None}},
{"tokenizer": {"type": "jieba", "dict": "invalid_string"}},
{"tokenizer": {"type": "jieba", "dict": 123}},
{"tokenizer": {"type": "jieba", "dict": True}},
{"tokenizer": {"type": "jieba", "dict": {"invalid": "dict"}}},
])
def test_jieba_analyzer_with_invalid_dict_values(self, invalid_dict_config):
"""
target: test jieba analyzer with invalid dict configurations
method: use jieba analyzer with invalid dict values
expected: analyzer should raise appropriate exceptions
"""
client = self._client()
text = "测试文本 for jieba analyzer"
with pytest.raises(Exception):
self.run_analyzer(client, text, invalid_dict_config)
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("edge_case_dict_config", [
{"tokenizer": {"type": "jieba", "dict": ["", "valid_word"]}}, # Empty string in list
{"tokenizer": {"type": "jieba", "dict": ["valid_word", "valid_word"]}}, # Duplicate words
{"tokenizer": {"type": "jieba", "dict": ["_default_"]}}, # Only default dict
])
def test_jieba_analyzer_with_edge_case_dict_values(self, edge_case_dict_config):
"""
target: test jieba analyzer with edge case dict configurations
method: use jieba analyzer with edge case dict values
expected: analyzer should handle these cases gracefully
"""
client = self._client()
text = "测试文本 for jieba analyzer"
res, _ = self.run_analyzer(client, text, edge_case_dict_config, with_detail=True)
analyzer_res = cast(AnalyzerResult, res)
# These should work but might not be recommended usage
assert len(analyzer_res.tokens) >= 0
@pytest.mark.tags(CaseLabel.L1)
def test_jieba_analyzer_with_unknown_param(self):
"""
target: test jieba analyzer with unknown parameter
method: use jieba analyzer with invalid parameter name
expected: analyzer should ignore unknown parameters and work normally
"""
client = self._client()
text = "测试文本 for jieba analyzer"
jieba_params = {"tokenizer": {"type": "jieba", "invalid_param": "value"}}
res, _ = self.run_analyzer(client, text, jieba_params)
analyzer_res = cast(AnalyzerResult, res)
assert len(analyzer_res.tokens) > 0
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("invalid_filter_params", [
{"tokenizer": "standard", "filter": [{"type": "stop"}]},
{"tokenizer": "standard", "filter": [{"type": "stop", "stop_words": "not_a_list"}]},
{"tokenizer": "standard", "filter": [{"type": "stop", "stop_words": [123, 456]}]},
{"tokenizer": "standard", "filter": [{"type": "invalid_filter_type"}]},
{"tokenizer": "standard", "filter": [{"type": "stop", "stop_words": None}]},
])
def test_analyzer_with_invalid_filter(self, invalid_filter_params):
"""
target: test analyzer with invalid filter configurations
method: use analyzer with invalid filter parameters
expected: analyzer should handle invalid filters appropriately
"""
client = self._client()
text = "This is a test text with stop words"
with pytest.raises(Exception):
self.run_analyzer(client, text, invalid_filter_params)
@pytest.mark.tags(CaseLabel.L1)
def test_analyzer_with_empty_stop_words(self):
"""
target: test analyzer with empty stop words list
method: use stop filter with empty stop_words list
expected: analyzer should work normally with empty stop words (no filtering)
"""
client = self._client()
text = "This is a test text with stop words"
filter_params = {"tokenizer": "standard", "filter": [{"type": "stop", "stop_words": []}]}
res, _ = self.run_analyzer(client, text, filter_params, with_detail=True)
analyzer_res = cast(AnalyzerResult, res)
tokens = analyzer_res.tokens
token_list = [r["token"] for r in tokens]
assert len(token_list) > 0
# With empty stop words, no filtering should occur
assert "is" in token_list # Common stop word should still be present