from common.common_type import CaseLabel
from common.phrase_match_generator import PhraseMatchTestGenerator
import pytest
import pandas as pd
from pymilvus import FieldSchema, CollectionSchema, DataType

from common.common_type import CheckTasks
from utils.util_log import test_log as log
from common import common_func as cf
from base.client_base import TestcaseBase

prefix = "phrase_match"

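
# NOTE: phrase_match() requires the target VARCHAR field to be defined with
# enable_analyzer=True and enable_match=True (see init_collection_schema below)
# so that the text is tokenized and a match index can be built over it.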
def init_collection_schema(
    dim: int, tokenizer: str, enable_partition_key: bool
) -> CollectionSchema:
    """Initialize collection schema with specified parameters"""
    analyzer_params = {"tokenizer": tokenizer}
    fields = [
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
        FieldSchema(
            name="text",
            dtype=DataType.VARCHAR,
            max_length=65535,
            enable_analyzer=True,
            enable_match=True,
            is_partition_key=enable_partition_key,
            analyzer_params=analyzer_params,
        ),
        FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
    ]
    return CollectionSchema(fields=fields, description="phrase match test collection")


@pytest.mark.tags(CaseLabel.L0)
class TestQueryPhraseMatch(TestcaseBase):
    """
    Test cases for phrase match functionality in Milvus using PhraseMatchTestGenerator.
    This class verifies the phrase matching capabilities with different configurations
    including various tokenizers, partition keys, and index settings.
    """

@pytest.mark.parametrize("enable_partition_key", [True])
|
|
@pytest.mark.parametrize("enable_inverted_index", [True])
|
|
@pytest.mark.parametrize("tokenizer", ["standard", "jieba"])
|
|
def test_query_phrase_match_with_different_tokenizer(
|
|
self, tokenizer, enable_inverted_index, enable_partition_key
|
|
):
|
|
"""
|
|
target: Verify phrase match functionality with different tokenizers (standard, jieba)
|
|
method: 1. Generate test data using PhraseMatchTestGenerator with language-specific content
|
|
2. Create collection with appropriate schema (primary key, text field with analyzer, vector field)
|
|
3. Build both vector (IVF_SQ8) and inverted indexes
|
|
4. Execute phrase match queries with various slop values
|
|
5. Compare results against Tantivy reference implementation
|
|
expected: Milvus phrase match results should exactly match the reference implementation
|
|
results for all queries and slop values
|
|
note: Test is marked to xfail for jieba tokenizer due to known issues
|
|
"""
|
|
if tokenizer == "jieba":
|
|
pytest.xfail("Jieba tokenizer has known issues with phrase matching ")
|
|
|
|
        # Initialize parameters
        dim = 128
        data_size = 3000
        num_queries = 10

        # Initialize generator based on tokenizer
        language = "zh" if tokenizer == "jieba" else "en"
        generator = PhraseMatchTestGenerator(language=language)

        # Create collection
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix),
            schema=init_collection_schema(dim, tokenizer, enable_partition_key),
        )

        # Generate test data
        test_data = generator.generate_test_data(data_size, dim)
        df = pd.DataFrame(test_data)
        log.info(f"Test data: \n{df['text']}")
        # Insert data into collection
        insert_data = [
            {"id": d["id"], "text": d["text"], "emb": d["emb"]} for d in test_data
        ]
        collection_w.insert(insert_data)
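        # flush() seals the growing segment so that the indexes created below
        # cover all of the rows just inserted.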
        collection_w.flush()

        # Create indexes
        collection_w.create_index(
            "emb",
            {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
        )
        if enable_inverted_index:
            collection_w.create_index(
                "text", {"index_type": "INVERTED", "params": {"tokenizer": tokenizer}}
            )

        collection_w.load()

        # Generate and execute test queries
        test_queries = generator.generate_test_queries(num_queries)

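        # Expression syntax: phrase_match(<varchar field>, <phrase>, <slop>).
        # slop is the maximum number of token-position moves allowed between the
        # query terms; slop=0 requires the terms to appear adjacent and in order.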
        for query in test_queries:
            expr = f"phrase_match(text, '{query['query']}', {query['slop']})"
            log.info(f"Testing query: {expr}")

            # Execute query
            results, _ = collection_w.query(expr=expr, output_fields=["id", "text"])

            # Get expected matches using Tantivy
            expected_matches = generator.get_query_results(
                query["query"], query["slop"]
            )
            # Get actual matches from Milvus
            actual_matches = [r["id"] for r in results]
            if set(actual_matches) != set(expected_matches):
                log.info(f"collection schema: {collection_w.schema}")
                for match_id in expected_matches:
                    # query by id to get text
                    res, _ = collection_w.query(
                        expr=f"id == {match_id}", output_fields=["text"]
                    )
                    text = res[0]["text"]
                    log.info(f"Expected match: {match_id}, text: {text}")

                for match_id in actual_matches:
                    # query by id to get text
                    res, _ = collection_w.query(
                        expr=f"id == {match_id}", output_fields=["text"]
                    )
                    text = res[0]["text"]
                    log.info(f"Matched document: {match_id}, text: {text}")
            # Assert results match
            assert (
                set(actual_matches) == set(expected_matches)
            ), f"Mismatch in results for query '{query['query']}' with slop {query['slop']}"

@pytest.mark.parametrize("enable_partition_key", [True])
|
|
@pytest.mark.parametrize("enable_inverted_index", [True])
|
|
@pytest.mark.parametrize("tokenizer", ["standard"])
|
|
def test_phrase_match_as_filter_in_vector_search(
|
|
self, tokenizer, enable_inverted_index, enable_partition_key
|
|
):
|
|
"""
|
|
target: Verify phrase match functionality when used as a filter in vector search
|
|
method: 1. Generate test data with both text content and vector embeddings
|
|
2. Create collection with vector field (128d) and text field
|
|
3. Build both vector index (IVF_SQ8) and text inverted index
|
|
4. Perform vector search with phrase match as a filter condition
|
|
5. Verify the combined search results maintain accuracy
|
|
expected: The system should correctly combine vector search with phrase match filtering
|
|
while maintaining both search accuracy and performance
|
|
"""
|
|
        # Initialize parameters
        dim = 128
        data_size = 3000
        num_queries = 10

        # Initialize generator based on tokenizer
        language = "zh" if tokenizer == "jieba" else "en"
        generator = PhraseMatchTestGenerator(language=language)

        # Create collection
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix),
            schema=init_collection_schema(dim, tokenizer, enable_partition_key),
        )

        # Generate test data
        test_data = generator.generate_test_data(data_size, dim)
        df = pd.DataFrame(test_data)
        log.info(f"Test data: \n{df['text']}")
        # Insert data into collection
        insert_data = [
            {"id": d["id"], "text": d["text"], "emb": d["emb"]} for d in test_data
        ]
        collection_w.insert(insert_data)
        collection_w.flush()

        # Create indexes
        collection_w.create_index(
            "emb",
            {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
        )
        if enable_inverted_index:
            collection_w.create_index(
                "text", {"index_type": "INVERTED", "params": {"tokenizer": tokenizer}}
            )

        collection_w.load()

        # Generate and execute test queries
        test_queries = generator.generate_test_queries(num_queries)

        for query in test_queries:
            expr = f"phrase_match(text, '{query['query']}', {query['slop']})"
            log.info(f"Testing query: {expr}")

            # Execute vector search with phrase match as the filter
            data = [generator.generate_embedding(dim) for _ in range(10)]
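            # The phrase_match expression is passed via `expr`, so it acts as a
            # boolean filter on the ANN search; param={} falls back to default
            # search parameters for the IVF_SQ8 index.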
            results, _ = collection_w.search(
                data,
                anns_field="emb",
                param={},
                limit=10,
                expr=expr,
                output_fields=["id", "text"],
            )

            # Get expected matches using Tantivy
            expected_matches = generator.get_query_results(
                query["query"], query["slop"]
            )
            # assert results satisfy the filter
            for hits in results:
                for hit in hits:
                    assert hit.id in expected_matches

@pytest.mark.parametrize("slop_value", [0, 1, 2, 5, 10])
|
|
def test_slop_parameter(self, slop_value):
|
|
"""
|
|
target: Verify phrase matching behavior with varying slop values
|
|
method: 1. Create collection with standard tokenizer
|
|
2. Generate and insert data with controlled word gaps between terms
|
|
3. Test phrase matching with specific slop values (0, 1, 2, etc.)
|
|
4. Verify matches at different word distances
|
|
5. Compare results with Tantivy reference implementation
|
|
expected: Results should only match phrases where words are within the specified
|
|
slop distance, validating the slop parameter's distance control
|
|
"""
|
|
        dim = 128
        data_size = 3000
        num_queries = 2
        tokenizer = "standard"
        enable_partition_key = True
        # Initialize generator based on tokenizer
        language = "zh" if tokenizer == "jieba" else "en"
        generator = PhraseMatchTestGenerator(language=language)

        # Create collection
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix),
            schema=init_collection_schema(dim, tokenizer, enable_partition_key),
        )

        # Generate test data
        test_data = generator.generate_test_data(data_size, dim)
        df = pd.DataFrame(test_data)
        log.info(f"Test data: {df['text']}")
        # Insert data into collection
        insert_data = [
            {"id": d["id"], "text": d["text"], "emb": d["emb"]} for d in test_data
        ]
        collection_w.insert(insert_data)
        collection_w.flush()

        # Create indexes
        collection_w.create_index(
            "emb",
            {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
        )

        collection_w.create_index("text", {"index_type": "INVERTED"})

        collection_w.load()

        # Generate and execute test queries
        test_queries = generator.generate_test_queries(num_queries)

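        # Expected slop behavior: for the phrase "machine learning", slop=0 only
        # matches "... machine learning ...", while slop=2 also matches texts like
        # "... machine deep statistical learning ..." where up to two extra token
        # positions separate the query terms.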
        for query in test_queries:
            expr = f"phrase_match(text, '{query['query']}', {slop_value})"
            log.info(f"Testing query: {expr}")

            # Execute query
            results, _ = collection_w.query(expr=expr, output_fields=["id", "text"])

            # Get expected matches using Tantivy
            expected_matches = generator.get_query_results(query["query"], slop_value)
            # Get actual matches from Milvus
            actual_matches = [r["id"] for r in results]
            if set(actual_matches) != set(expected_matches):
                log.info(f"collection schema: {collection_w.schema}")
                for match_id in expected_matches:
                    # query by id to get text
                    res, _ = collection_w.query(
                        expr=f"id == {match_id}", output_fields=["text"]
                    )
                    text = res[0]["text"]
                    log.info(f"Expected match: {match_id}, text: {text}")

                for match_id in actual_matches:
                    # query by id to get text
                    res, _ = collection_w.query(
                        expr=f"id == {match_id}", output_fields=["text"]
                    )
                    text = res[0]["text"]
                    log.info(f"Matched document: {match_id}, text: {text}")
            # Assert results match
            assert (
                set(actual_matches) == set(expected_matches)
            ), f"Mismatch in results for query '{query['query']}' with slop {slop_value}"

    def test_query_phrase_match_with_different_patterns(self):
        """
        target: Verify phrase matching with various text patterns and complexities
        method: 1. Create collection with standard tokenizer
                2. Generate and insert data with diverse phrase patterns:
                   - Exact phrases ("love swimming and running")
                   - Phrases with gaps ("enjoy very basketball")
                   - Complex phrases ("practice tennis seriously often")
                   - Multiple term phrases ("swimming running cycling")
                3. Test each pattern with appropriate slop values
                4. Verify minimum match count for each pattern
        expected: System should correctly identify and match each pattern type
                  with the specified number of matches per pattern
        """
        dim = 128
        collection_name = f"{prefix}_patterns"
        schema = init_collection_schema(dim, "standard", False)
        collection = self.init_collection_wrap(name=collection_name, schema=schema)

        # Generate data with various patterns
        generator = PhraseMatchTestGenerator(language="en")
        data = generator.generate_test_data(3000, dim)
        collection.insert(data)
        # Test various patterns
        test_patterns = [
            ("love swimming and running", 0),  # Exact phrase
            ("enjoy very basketball", 1),  # Phrase with gap
            ("practice tennis seriously often", 2),  # Complex phrase
            ("swimming running cycling", 5),  # Multiple activities
        ]

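        # The random baseline data above is not guaranteed to contain these phrases;
        # the pattern documents generated below add at least num_docs_per_pattern
        # genuine matches per (pattern, slop) pair, which the final assertion relies on.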
        # Generate and insert documents that match the patterns
        num_docs_per_pattern = 100
        pattern_documents = generator.generate_pattern_documents(
            test_patterns, dim, num_docs_per_pattern=num_docs_per_pattern
        )
        collection.insert(pattern_documents)
        df = pd.DataFrame(pattern_documents)[["id", "text"]]
        log.info(f"Test data:\n {df}")

        collection.create_index(
            field_name="text", index_params={"index_type": "INVERTED"}
        )
        collection.create_index(
            field_name="emb",
            index_params={
                "index_type": "IVF_SQ8",
                "metric_type": "L2",
                "params": {"nlist": 64},
            },
        )
        collection.load()

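        # Use >= rather than ==: random baseline documents may also happen to match
        # a pattern, so only the lower bound from the constructed documents is asserted.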
        for pattern, slop in test_patterns:
            results, _ = collection.query(
                expr=f'phrase_match(text, "{pattern}", {slop})', output_fields=["text"]
            )
            log.info(
                f"Pattern '{pattern}' with slop {slop} found {len(results)} matches"
            )
            assert len(results) >= num_docs_per_pattern


@pytest.mark.tags(CaseLabel.L1)
class TestQueryPhraseMatchNegative(TestcaseBase):
    def test_query_phrase_match_with_invalid_slop(self):
        """
        target: Verify error handling for invalid slop values in phrase matching
        method: 1. Create collection with standard test data
                2. Test phrase matching with invalid slop values:
                   - Negative slop values (-1)
                   - Extremely large slop values (10^31)
                3. Verify error handling and response
        expected: System should:
                  1. Reject queries with invalid slop values
                  2. Return appropriate error responses
                  3. Maintain system stability after invalid queries
        """
        dim = 128
        collection_name = f"{prefix}_invalid_slop"
        schema = init_collection_schema(dim, "standard", False)
        collection = self.init_collection_wrap(name=collection_name, schema=schema)

        # Insert some test data
        generator = PhraseMatchTestGenerator(language="en")
        data = generator.generate_test_data(100, dim)
        collection.insert(data)

        collection.create_index(
            field_name="text", index_params={"index_type": "INVERTED"}
        )
        collection.create_index(
            field_name="emb",
            index_params={
                "index_type": "IVF_SQ8",
                "metric_type": "L2",
                "params": {"nlist": 64},
            },
        )
        collection.load()

        # Test invalid inputs
        invalid_cases = [
            ("valid query", -1),  # Negative slop
            ("valid query", 10**31),  # Very large slop
        ]

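        # check_task=CheckTasks.check_nothing skips the wrapper's built-in response
        # validation so the returned status flag can be asserted directly; both
        # cases are expected to be rejected.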
        for query, slop in invalid_cases:
            res, result = collection.query(
                expr=f'phrase_match(text, "{query}", {slop})',
                output_fields=["text"],
                check_task=CheckTasks.check_nothing,
            )
            log.info(f"Query: '{query[:10]}' with slop {slop} returned {res}")
            assert result is False