# milvus/tests/python_client/testcases/test_phrase_match.py
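"""
Tests for Milvus phrase_match expressions: tokenizer coverage (standard and jieba),
phrase match as a filter in vector search, slop-distance behavior, varied phrase
patterns, and rejection of invalid slop values. Expected results come from the
Tantivy-based reference implementation inside PhraseMatchTestGenerator.
"""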

import pandas as pd
import pytest
from pymilvus import FieldSchema, CollectionSchema, DataType

from base.client_base import TestcaseBase
from common import common_func as cf
from common.common_type import CaseLabel, CheckTasks
from common.phrase_match_generator import PhraseMatchTestGenerator
from utils.util_log import test_log as log

prefix = "phrase_match"


def init_collection_schema(
    dim: int, tokenizer: str, enable_partition_key: bool
) -> CollectionSchema:
"""Initialize collection schema with specified parameters"""
analyzer_params = {"tokenizer": tokenizer}
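    # enable_analyzer tokenizes the VARCHAR field and enable_match builds the
    # term-level match index that phrase_match queries rely on; the "standard"
    # tokenizer suits English text while "jieba" segments Chinese.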
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=True,
is_partition_key=enable_partition_key,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
    return CollectionSchema(fields=fields, description="phrase match test collection")


@pytest.mark.tags(CaseLabel.L0)
class TestQueryPhraseMatch(TestcaseBase):
    """
    Test cases for phrase match functionality in Milvus using PhraseMatchTestGenerator.
    This class verifies phrase matching behavior across different configurations,
    including various tokenizers, partition keys, and index settings.
    """

@pytest.mark.parametrize("enable_partition_key", [True])
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("tokenizer", ["standard", "jieba"])
def test_query_phrase_match_with_different_tokenizer(
self, tokenizer, enable_inverted_index, enable_partition_key
):
"""
target: Verify phrase match functionality with different tokenizers (standard, jieba)
method: 1. Generate test data using PhraseMatchTestGenerator with language-specific content
2. Create collection with appropriate schema (primary key, text field with analyzer, vector field)
3. Build both vector (IVF_SQ8) and inverted indexes
4. Execute phrase match queries with various slop values
5. Compare results against Tantivy reference implementation
expected: Milvus phrase match results should exactly match the reference implementation
results for all queries and slop values
note: Test is marked to xfail for jieba tokenizer due to known issues
"""
if tokenizer == "jieba":
            pytest.xfail("Jieba tokenizer has known issues with phrase matching")
# Initialize parameters
dim = 128
data_size = 3000
num_queries = 10
# Initialize generator based on tokenizer
language = "zh" if tokenizer == "jieba" else "en"
generator = PhraseMatchTestGenerator(language=language)
# Create collection
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix),
schema=init_collection_schema(dim, tokenizer, enable_partition_key),
)
# Generate test data
test_data = generator.generate_test_data(data_size, dim)
df = pd.DataFrame(test_data)
log.info(f"Test data: \n{df['text']}")
# Insert data into collection
insert_data = [
{"id": d["id"], "text": d["text"], "emb": d["emb"]} for d in test_data
]
collection_w.insert(insert_data)
collection_w.flush()
# Create indexes
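        # IVF_SQ8 is a quantized IVF vector index; nlist=64 sets the number of
        # coarse clusters. The vector index choice is incidental here, since the
        # test exercises the text match path.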
collection_w.create_index(
"emb",
{"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
)
if enable_inverted_index:
collection_w.create_index(
"text", {"index_type": "INVERTED", "params": {"tokenizer": tokenizer}}
)
collection_w.load()
# Generate and execute test queries
test_queries = generator.generate_test_queries(num_queries)
for query in test_queries:
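            # phrase_match(field, phrase, slop): slop is the maximum positional
            # distance allowed between the phrase tokens, with 0 requiring the
            # exact consecutive phrase.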
expr = f"phrase_match(text, '{query['query']}', {query['slop']})"
log.info(f"Testing query: {expr}")
# Execute query
results, _ = collection_w.query(expr=expr, output_fields=["id", "text"])
# Get expected matches using Tantivy
expected_matches = generator.get_query_results(
query["query"], query["slop"]
)
# Get actual matches from Milvus
actual_matches = [r["id"] for r in results]
if set(actual_matches) != set(expected_matches):
log.info(f"collection schema: {collection_w.schema}")
for match_id in expected_matches:
# query by id to get text
res, _ = collection_w.query(
expr=f"id == {match_id}", output_fields=["text"]
)
text = res[0]["text"]
log.info(f"Expected match: {match_id}, text: {text}")
for match_id in actual_matches:
# query by id to get text
res, _ = collection_w.query(
expr=f"id == {match_id}", output_fields=["text"]
)
text = res[0]["text"]
log.info(f"Matched document: {match_id}, text: {text}")
# Assert results match
assert (
set(actual_matches) == set(expected_matches)
            ), f"Mismatch in results for query '{query['query']}' with slop {query['slop']}"

    @pytest.mark.parametrize("enable_partition_key", [True])
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_phrase_match_as_filter_in_vector_search(
self, tokenizer, enable_inverted_index, enable_partition_key
):
"""
target: Verify phrase match functionality when used as a filter in vector search
method: 1. Generate test data with both text content and vector embeddings
2. Create collection with vector field (128d) and text field
3. Build both vector index (IVF_SQ8) and text inverted index
4. Perform vector search with phrase match as a filter condition
5. Verify the combined search results maintain accuracy
expected: The system should correctly combine vector search with phrase match filtering
while maintaining both search accuracy and performance
"""
# Initialize parameters
dim = 128
data_size = 3000
num_queries = 10
# Initialize generator based on tokenizer
language = "zh" if tokenizer == "jieba" else "en"
generator = PhraseMatchTestGenerator(language=language)
# Create collection
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix),
schema=init_collection_schema(dim, tokenizer, enable_partition_key),
)
# Generate test data
test_data = generator.generate_test_data(data_size, dim)
df = pd.DataFrame(test_data)
log.info(f"Test data: \n{df['text']}")
# Insert data into collection
insert_data = [
{"id": d["id"], "text": d["text"], "emb": d["emb"]} for d in test_data
]
collection_w.insert(insert_data)
collection_w.flush()
# Create indexes
collection_w.create_index(
"emb",
{"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
)
if enable_inverted_index:
collection_w.create_index(
"text", {"index_type": "INVERTED", "params": {"tokenizer": tokenizer}}
)
collection_w.load()
# Generate and execute test queries
test_queries = generator.generate_test_queries(num_queries)
for query in test_queries:
expr = f"phrase_match(text, '{query['query']}', {query['slop']})"
log.info(f"Testing query: {expr}")
# Execute filter search
data = [generator.generate_embedding(dim) for _ in range(10)]
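            # phrase_match here acts as a boolean pre-filter on the vector
            # search: every hit returned must satisfy the expression regardless
            # of its vector distance.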
results, _ = collection_w.search(
data,
anns_field="emb",
param={},
limit=10,
expr=expr,
output_fields=["id", "text"],
)
# Get expected matches using Tantivy
expected_matches = generator.get_query_results(
query["query"], query["slop"]
)
# assert results satisfy the filter
for hits in results:
for hit in hits:
                    assert hit.id in expected_matches

    @pytest.mark.parametrize("slop_value", [0, 1, 2, 5, 10])
def test_slop_parameter(self, slop_value):
"""
target: Verify phrase matching behavior with varying slop values
method: 1. Create collection with standard tokenizer
2. Generate and insert data with controlled word gaps between terms
3. Test phrase matching with specific slop values (0, 1, 2, etc.)
4. Verify matches at different word distances
5. Compare results with Tantivy reference implementation
expected: Results should only match phrases where words are within the specified
slop distance, validating the slop parameter's distance control
"""
dim = 128
data_size = 3000
num_queries = 2
tokenizer = "standard"
enable_partition_key = True
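        # Illustrative slop semantics: for stored text "machine deep learning",
        # the query "machine learning" has one intervening token, so it fails
        # at slop=0 but matches at slop >= 1.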
# Initialize generator based on tokenizer
language = "zh" if tokenizer == "jieba" else "en"
generator = PhraseMatchTestGenerator(language=language)
# Create collection
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix),
schema=init_collection_schema(dim, tokenizer, enable_partition_key),
)
# Generate test data
test_data = generator.generate_test_data(data_size, dim)
df = pd.DataFrame(test_data)
log.info(f"Test data: {df['text']}")
# Insert data into collection
insert_data = [
{"id": d["id"], "text": d["text"], "emb": d["emb"]} for d in test_data
]
collection_w.insert(insert_data)
collection_w.flush()
# Create indexes
collection_w.create_index(
"emb",
{"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
)
collection_w.create_index("text", {"index_type": "INVERTED"})
collection_w.load()
# Generate and execute test queries
test_queries = generator.generate_test_queries(num_queries)
for query in test_queries:
expr = f"phrase_match(text, '{query['query']}', {slop_value})"
log.info(f"Testing query: {expr}")
# Execute query
results, _ = collection_w.query(expr=expr, output_fields=["id", "text"])
# Get expected matches using Tantivy
expected_matches = generator.get_query_results(query["query"], slop_value)
# Get actual matches from Milvus
actual_matches = [r["id"] for r in results]
if set(actual_matches) != set(expected_matches):
log.info(f"collection schema: {collection_w.schema}")
for match_id in expected_matches:
# query by id to get text
res, _ = collection_w.query(
expr=f"id == {match_id}", output_fields=["text"]
)
text = res[0]["text"]
log.info(f"Expected match: {match_id}, text: {text}")
for match_id in actual_matches:
# query by id to get text
res, _ = collection_w.query(
expr=f"id == {match_id}", output_fields=["text"]
)
text = res[0]["text"]
log.info(f"Matched document: {match_id}, text: {text}")
# Assert results match
assert (
set(actual_matches) == set(expected_matches)
            ), f"Mismatch in results for query '{query['query']}' with slop {slop_value}"

    def test_query_phrase_match_with_different_patterns(self):
"""
target: Verify phrase matching with various text patterns and complexities
method: 1. Create collection with standard tokenizer
2. Generate and insert data with diverse phrase patterns:
- Exact phrases ("love swimming and running")
- Phrases with gaps ("enjoy very basketball")
- Complex phrases ("practice tennis seriously often")
- Multiple term phrases ("swimming running cycling")
3. Test each pattern with appropriate slop values
4. Verify minimum match count for each pattern
expected: System should correctly identify and match each pattern type
with the specified number of matches per pattern
"""
dim = 128
collection_name = f"{prefix}_patterns"
schema = init_collection_schema(dim, "standard", False)
collection = self.init_collection_wrap(name=collection_name, schema=schema)
# Generate data with various patterns
generator = PhraseMatchTestGenerator(language="en")
data = generator.generate_test_data(3000, dim)
collection.insert(data)
# Test various patterns
test_patterns = [
("love swimming and running", 0), # Exact phrase
("enjoy very basketball", 1), # Phrase with gap
("practice tennis seriously often", 2), # Complex phrase
("swimming running cycling", 5), # Multiple activities
]
# Generate and insert documents that match the patterns
num_docs_per_pattern = 100
pattern_documents = generator.generate_pattern_documents(
test_patterns, dim, num_docs_per_pattern=num_docs_per_pattern
)
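        # Each (pattern, slop) pair is given num_docs_per_pattern matching
        # documents, which backs the minimum-match assertion at the end of
        # this test.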
collection.insert(pattern_documents)
df = pd.DataFrame(pattern_documents)[["id", "text"]]
log.info(f"Test data:\n {df}")
collection.create_index(
field_name="text", index_params={"index_type": "INVERTED"}
)
collection.create_index(
field_name="emb",
index_params={
"index_type": "IVF_SQ8",
"metric_type": "L2",
"params": {"nlist": 64},
},
)
collection.load()
for pattern, slop in test_patterns:
results, _ = collection.query(
expr=f'phrase_match(text, "{pattern}", {slop})', output_fields=["text"]
)
log.info(
f"Pattern '{pattern}' with slop {slop} found {len(results)} matches"
)
            assert len(results) >= num_docs_per_pattern


@pytest.mark.tags(CaseLabel.L1)
class TestQueryPhraseMatchNegative(TestcaseBase):
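    """Negative cases: phrase_match with invalid arguments should be rejected."""
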
def test_query_phrase_match_with_invalid_slop(self):
"""
target: Verify error handling for invalid slop values in phrase matching
method: 1. Create collection with standard test data
2. Test phrase matching with invalid slop values:
- Negative slop values (-1)
- Extremely large slop values (10^31)
3. Verify error handling and response
expected: System should:
1. Reject queries with invalid slop values
2. Return appropriate error responses
3. Maintain system stability after invalid queries
"""
dim = 128
collection_name = f"{prefix}_invalid_slop"
schema = init_collection_schema(dim, "standard", False)
collection = self.init_collection_wrap(name=collection_name, schema=schema)
# Insert some test data
generator = PhraseMatchTestGenerator(language="en")
data = generator.generate_test_data(100, dim)
collection.insert(data)
collection.create_index(
field_name="text", index_params={"index_type": "INVERTED"}
)
collection.create_index(
field_name="emb",
index_params={
"index_type": "IVF_SQ8",
"metric_type": "L2",
"params": {"nlist": 64},
},
)
collection.load()
# Test invalid inputs
invalid_cases = [
("valid query", -1), # Negative slop
            ("valid query", 10**31),  # Very large slop
]
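        # With check_task=CheckTasks.check_nothing the wrapper performs no
        # response validation and returns the raw result plus a success flag,
        # so the test can assert that the server rejected each invalid slop.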
for query, slop in invalid_cases:
res, result = collection.query(
expr=f'phrase_match(text, "{query}", {slop})',
output_fields=["text"],
check_task=CheckTasks.check_nothing,
)
log.info(f"Query: '{query[:10]}' with slop {slop} returned {res}")
assert result is False