# milvus/tests/python_client/testcases/test_phrase_match.py
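"""
Tests for Milvus phrase_match expressions: tokenizer coverage (standard and jieba),
phrase match as a filter in vector search, slop-distance behavior, varied phrase
patterns, and rejection of invalid slop values. Expected results come from the
Tantivy-based reference implementation inside PhraseMatchTestGenerator.
"""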

import pandas as pd
import pytest
from pymilvus import FieldSchema, CollectionSchema, DataType

from base.client_base import TestcaseBase
from common import common_func as cf
from common.common_type import CaseLabel, CheckTasks
from common.phrase_match_generator import PhraseMatchTestGenerator
from utils.util_log import test_log as log

prefix = "phrase_match"


def init_collection_schema(
    dim: int, tokenizer: str, enable_partition_key: bool
) -> CollectionSchema:
"""Initialize collection schema with specified parameters"""
analyzer_params = {"tokenizer": tokenizer}
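    # enable_analyzer tokenizes the VARCHAR field and enable_match builds the
    # term-level match index that phrase_match queries rely on; the "standard"
    # tokenizer suits English text while "jieba" segments Chinese.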
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=True,
is_partition_key=enable_partition_key,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
    return CollectionSchema(fields=fields, description="phrase match test collection")


@pytest.mark.tags(CaseLabel.L0)
class TestQueryPhraseMatch(TestcaseBase):
    """
    Test cases for phrase match functionality in Milvus using PhraseMatchTestGenerator.
    This class verifies phrase matching behavior across different configurations,
    including various tokenizers, partition keys, and index settings.
    """

@pytest.mark.parametrize("enable_partition_key", [True])
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("tokenizer", ["standard", "jieba"])
def test_query_phrase_match_with_different_tokenizer(
self, tokenizer, enable_inverted_index, enable_partition_key
):
"""
target: Verify phrase match functionality with different tokenizers (standard, jieba)
method: 1. Generate test data using PhraseMatchTestGenerator with language-specific content
2. Create collection with appropriate schema (primary key, text field with analyzer, vector field)
3. Build both vector (IVF_SQ8) and inverted indexes
4. Execute phrase match queries with various slop values
5. Compare results against Tantivy reference implementation
expected: Milvus phrase match results should exactly match the reference implementation
results for all queries and slop values
note: Test is marked to xfail for jieba tokenizer due to known issues
"""
if tokenizer == "jieba":
            pytest.xfail("Jieba tokenizer has known issues with phrase matching")
# Initialize parameters
dim = 128
data_size = 3000
num_queries = 10
# Initialize generator based on tokenizer
language = "zh" if tokenizer == "jieba" else "en"
generator = PhraseMatchTestGenerator(language=language)
# Create collection
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix),
schema=init_collection_schema(dim, tokenizer, enable_partition_key),
)
# Generate test data
test_data = generator.generate_test_data(data_size, dim)
df = pd.DataFrame(test_data)
log.info(f"Test data: \n{df['text']}")
# Insert data into collection
insert_data = [
{"id": d["id"], "text": d["text"], "emb": d["emb"]} for d in test_data
]
collection_w.insert(insert_data)
collection_w.flush()
# Create indexes
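        # IVF_SQ8 is a quantized IVF vector index; nlist=64 sets the number of
        # coarse clusters. The vector index choice is incidental here, since the
        # test exercises the text match path.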
collection_w.create_index(
"emb",
{"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
)
if enable_inverted_index:
collection_w.create_index(
"text", {"index_type": "INVERTED", "params": {"tokenizer": tokenizer}}
)
collection_w.load()
# Generate and execute test queries
test_queries = generator.generate_test_queries(num_queries)
for query in test_queries:
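            # phrase_match(field, phrase, slop): slop is the maximum positional
            # distance allowed between the phrase tokens, with 0 requiring the
            # exact consecutive phrase.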
expr = f"phrase_match(text, '{query['query']}', {query['slop']})"
log.info(f"Testing query: {expr}")
# Execute query
results, _ = collection_w.query(expr=expr, output_fields=["id", "text"])
# Get expected matches using Tantivy
expected_matches = generator.get_query_results(
query["query"], query["slop"]
)
# Get actual matches from Milvus
actual_matches = [r["id"] for r in results]
if set(actual_matches) != set(expected_matches):
log.info(f"collection schema: {collection_w.schema}")
for match_id in expected_matches:
# query by id to get text
res, _ = collection_w.query(
expr=f"id == {match_id}", output_fields=["text"]
)
text = res[0]["text"]
log.info(f"Expected match: {match_id}, text: {text}")
for match_id in actual_matches:
# query by id to get text
res, _ = collection_w.query(
expr=f"id == {match_id}", output_fields=["text"]
)
text = res[0]["text"]
log.info(f"Matched document: {match_id}, text: {text}")
# Assert results match
assert (
set(actual_matches) == set(expected_matches)
            ), f"Mismatch in results for query '{query['query']}' with slop {query['slop']}"

    @pytest.mark.parametrize("enable_partition_key", [True])
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_phrase_match_as_filter_in_vector_search(
self, tokenizer, enable_inverted_index, enable_partition_key
):
"""
target: Verify phrase match functionality when used as a filter in vector search
method: 1. Generate test data with both text content and vector embeddings
2. Create collection with vector field (128d) and text field
3. Build both vector index (IVF_SQ8) and text inverted index
4. Perform vector search with phrase match as a filter condition
5. Verify the combined search results maintain accuracy
expected: The system should correctly combine vector search with phrase match filtering
while maintaining both search accuracy and performance
"""
# Initialize parameters
dim = 128
data_size = 3000
num_queries = 10
# Initialize generator based on tokenizer
language = "zh" if tokenizer == "jieba" else "en"
generator = PhraseMatchTestGenerator(language=language)
# Create collection
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix),
schema=init_collection_schema(dim, tokenizer, enable_partition_key),
)
# Generate test data
test_data = generator.generate_test_data(data_size, dim)
df = pd.DataFrame(test_data)
log.info(f"Test data: \n{df['text']}")
# Insert data into collection
insert_data = [
{"id": d["id"], "text": d["text"], "emb": d["emb"]} for d in test_data
]
collection_w.insert(insert_data)
collection_w.flush()
# Create indexes
collection_w.create_index(
"emb",
{"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
)
if enable_inverted_index:
collection_w.create_index(
"text", {"index_type": "INVERTED", "params": {"tokenizer": tokenizer}}
)
collection_w.load()
# Generate and execute test queries
test_queries = generator.generate_test_queries(num_queries)
for query in test_queries:
expr = f"phrase_match(text, '{query['query']}', {query['slop']})"
log.info(f"Testing query: {expr}")
# Execute filter search
data = [generator.generate_embedding(dim) for _ in range(10)]
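            # phrase_match here acts as a boolean pre-filter on the vector
            # search: every hit returned must satisfy the expression regardless
            # of its vector distance.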
results, _ = collection_w.search(
data,
anns_field="emb",
param={},
limit=10,
expr=expr,
output_fields=["id", "text"],
)
# Get expected matches using Tantivy
expected_matches = generator.get_query_results(
query["query"], query["slop"]
)
# assert results satisfy the filter
for hits in results:
for hit in hits:
                    assert hit.id in expected_matches

    @pytest.mark.parametrize("slop_value", [0, 1, 2, 5, 10])
def test_slop_parameter(self, slop_value):
"""
target: Verify phrase matching behavior with varying slop values
method: 1. Create collection with standard tokenizer
2. Generate and insert data with controlled word gaps between terms
3. Test phrase matching with specific slop values (0, 1, 2, etc.)
4. Verify matches at different word distances
5. Compare results with Tantivy reference implementation
expected: Results should only match phrases where words are within the specified
slop distance, validating the slop parameter's distance control
"""
dim = 128
data_size = 3000
num_queries = 2
tokenizer = "standard"
enable_partition_key = True
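        # Illustrative slop semantics: for stored text "machine deep learning",
        # the query "machine learning" has one intervening token, so it fails
        # at slop=0 but matches at slop >= 1.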
# Initialize generator based on tokenizer
language = "zh" if tokenizer == "jieba" else "en"
generator = PhraseMatchTestGenerator(language=language)
# Create collection
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix),
schema=init_collection_schema(dim, tokenizer, enable_partition_key),
)
# Generate test data
test_data = generator.generate_test_data(data_size, dim)
df = pd.DataFrame(test_data)
log.info(f"Test data: {df['text']}")
# Insert data into collection
insert_data = [
{"id": d["id"], "text": d["text"], "emb": d["emb"]} for d in test_data
]
collection_w.insert(insert_data)
collection_w.flush()
# Create indexes
collection_w.create_index(
"emb",
{"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
)
collection_w.create_index("text", {"index_type": "INVERTED"})
collection_w.load()
# Generate and execute test queries
test_queries = generator.generate_test_queries(num_queries)
for query in test_queries:
expr = f"phrase_match(text, '{query['query']}', {slop_value})"
log.info(f"Testing query: {expr}")
# Execute query
results, _ = collection_w.query(expr=expr, output_fields=["id", "text"])
# Get expected matches using Tantivy
expected_matches = generator.get_query_results(query["query"], slop_value)
# Get actual matches from Milvus
actual_matches = [r["id"] for r in results]
if set(actual_matches) != set(expected_matches):
log.info(f"collection schema: {collection_w.schema}")
for match_id in expected_matches:
# query by id to get text
res, _ = collection_w.query(
expr=f"id == {match_id}", output_fields=["text"]
)
text = res[0]["text"]
log.info(f"Expected match: {match_id}, text: {text}")
for match_id in actual_matches:
# query by id to get text
res, _ = collection_w.query(
expr=f"id == {match_id}", output_fields=["text"]
)
text = res[0]["text"]
log.info(f"Matched document: {match_id}, text: {text}")
# Assert results match
assert (
set(actual_matches) == set(expected_matches)
            ), f"Mismatch in results for query '{query['query']}' with slop {slop_value}"

    def test_query_phrase_match_with_different_patterns(self):
"""
target: Verify phrase matching with various text patterns and complexities
method: 1. Create collection with standard tokenizer
2. Generate and insert data with diverse phrase patterns:
- Exact phrases ("love swimming and running")
- Phrases with gaps ("enjoy very basketball")
- Complex phrases ("practice tennis seriously often")
- Multiple term phrases ("swimming running cycling")
3. Test each pattern with appropriate slop values
4. Verify minimum match count for each pattern
expected: System should correctly identify and match each pattern type
with the specified number of matches per pattern
"""
dim = 128
collection_name = f"{prefix}_patterns"
schema = init_collection_schema(dim, "standard", False)
collection = self.init_collection_wrap(name=collection_name, schema=schema)
# Generate data with various patterns
generator = PhraseMatchTestGenerator(language="en")
data = generator.generate_test_data(3000, dim)
collection.insert(data)
# Test various patterns
test_patterns = [
("love swimming and running", 0), # Exact phrase
("enjoy very basketball", 1), # Phrase with gap
("practice tennis seriously often", 2), # Complex phrase
("swimming running cycling", 5), # Multiple activities
]
# Generate and insert documents that match the patterns
num_docs_per_pattern = 100
pattern_documents = generator.generate_pattern_documents(
test_patterns, dim, num_docs_per_pattern=num_docs_per_pattern
)
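        # Each (pattern, slop) pair is given num_docs_per_pattern matching
        # documents, which backs the minimum-match assertion at the end of
        # this test.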
collection.insert(pattern_documents)
df = pd.DataFrame(pattern_documents)[["id", "text"]]
log.info(f"Test data:\n {df}")
collection.create_index(
field_name="text", index_params={"index_type": "INVERTED"}
)
collection.create_index(
field_name="emb",
index_params={
"index_type": "IVF_SQ8",
"metric_type": "L2",
"params": {"nlist": 64},
},
)
collection.load()
for pattern, slop in test_patterns:
results, _ = collection.query(
expr=f'phrase_match(text, "{pattern}", {slop})', output_fields=["text"]
)
log.info(
f"Pattern '{pattern}' with slop {slop} found {len(results)} matches"
)
            assert len(results) >= num_docs_per_pattern


@pytest.mark.tags(CaseLabel.L1)
class TestQueryPhraseMatchNegative(TestcaseBase):
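    """Negative cases: phrase_match with invalid arguments should be rejected."""
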
def test_query_phrase_match_with_invalid_slop(self):
"""
target: Verify error handling for invalid slop values in phrase matching
method: 1. Create collection with standard test data
2. Test phrase matching with invalid slop values:
- Negative slop values (-1)
- Extremely large slop values (10^31)
3. Verify error handling and response
expected: System should:
1. Reject queries with invalid slop values
2. Return appropriate error responses
3. Maintain system stability after invalid queries
"""
dim = 128
collection_name = f"{prefix}_invalid_slop"
schema = init_collection_schema(dim, "standard", False)
collection = self.init_collection_wrap(name=collection_name, schema=schema)
# Insert some test data
generator = PhraseMatchTestGenerator(language="en")
data = generator.generate_test_data(100, dim)
collection.insert(data)
collection.create_index(
field_name="text", index_params={"index_type": "INVERTED"}
)
collection.create_index(
field_name="emb",
index_params={
"index_type": "IVF_SQ8",
"metric_type": "L2",
"params": {"nlist": 64},
},
)
collection.load()
# Test invalid inputs
invalid_cases = [
("valid query", -1), # Negative slop
            ("valid query", 10**31),  # Very large slop
]
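        # With check_task=CheckTasks.check_nothing the wrapper performs no
        # response validation and returns the raw result plus a success flag,
        # so the test can assert that the server rejected each invalid slop.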
for query, slop in invalid_cases:
res, result = collection.query(
expr=f'phrase_match(text, "{query}", {slop})',
output_fields=["text"],
check_task=CheckTasks.check_nothing,
)
log.info(f"Query: '{query[:10]}' with slop {slop} returned {res}")
assert result is False