milvus/tests/python_client/common/phrase_match_generator.py


import re
import jieba
from faker import Faker
from tantivy import SchemaBuilder, Document, Index, Query
from typing import List, Dict
import numpy as np
import random
class PhraseMatchTestGenerator:
def __init__(self, language="en"):
"""
Initialize the test data generator
Args:
language: Language for text generation ('en' for English, 'zh' for Chinese)
"""
self.language = language
self.index = None
self.documents = []
# English vocabulary
self.en_activities = [
"swimming",
"football",
"basketball",
"tennis",
"volleyball",
"baseball",
"golf",
"rugby",
"cricket",
"boxing",
"running",
"cycling",
"skating",
"skiing",
"surfing",
"diving",
"climbing",
"yoga",
"dancing",
"hiking",
]
self.en_verbs = [
"love",
"like",
"enjoy",
"play",
"practice",
"prefer",
"do",
"learn",
"teach",
"watch",
"start",
"begin",
"continue",
"finish",
"master",
"try",
]
self.en_connectors = [
"and",
"or",
"but",
"while",
"after",
"before",
"then",
"also",
"plus",
"with",
]
self.en_modifiers = [
"very much",
"a lot",
"seriously",
"casually",
"professionally",
"regularly",
"often",
"sometimes",
"daily",
"weekly",
]
# Chinese vocabulary
self.zh_activities = [
"游泳",
"足球",
"篮球",
"网球",
"排球",
"棒球",
"高尔夫",
"橄榄球",
"板球",
"拳击",
"跑步",
"骑行",
"滑冰",
"滑雪",
"冲浪",
"潜水",
"攀岩",
"瑜伽",
"跳舞",
"徒步",
]
self.zh_verbs = [
"喜欢",
"热爱",
"享受",
"",
"练习",
"偏好",
"",
"学习",
"",
"观看",
"开始",
"开启",
"继续",
"完成",
"掌握",
"尝试",
]
self.zh_connectors = [
"",
"或者",
"但是",
"同时",
"之后",
"之前",
"然后",
"",
"加上",
"",
]
self.zh_modifiers = [
"非常",
"很多",
"认真地",
"随意地",
"专业地",
"定期地",
"经常",
"有时候",
"每天",
"每周",
]
# Set vocabulary based on language
self.activities = self.zh_activities if language == "zh" else self.en_activities
self.verbs = self.zh_verbs if language == "zh" else self.en_verbs
self.connectors = self.zh_connectors if language == "zh" else self.en_connectors
self.modifiers = self.zh_modifiers if language == "zh" else self.en_modifiers
def tokenize_text(self, text: str) -> List[str]:
"""Tokenize text using jieba tokenizer"""
text = text.strip()
text = re.sub(r"[^\w\s]", " ", text)
text = text.replace("\n", " ")
if self.language == "zh":
text = text.replace(" ", "")
return list(jieba.cut_for_search(text))
else:
return list(text.split())
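# Illustrative behavior (not asserted anywhere in this module):
#   en: tokenize_text("swimming and football!") -> ["swimming", "and", "football"]
#   zh: tokenize_text("游泳 足球") strips the spaces and typically yields ["游泳", "足球"]
#       via jieba's search-mode segmentation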
def generate_embedding(self, dim: int) -> List[float]:
"""Generate random embedding vector"""
return list(np.random.random(dim))
def generate_text_pattern(self) -> str:
"""Generate test document text with various patterns"""
patterns = [
# Simple pattern with two activities
lambda: f"{random.choice(self.activities)} {random.choice(self.activities)}",
# Pattern with connector between activities
lambda: f"{random.choice(self.activities)} {random.choice(self.connectors)} {random.choice(self.activities)}",
# Pattern with modifier between activities
lambda: f"{random.choice(self.activities)} {random.choice(self.modifiers)} {random.choice(self.activities)}",
# Complex pattern with verb and activities
lambda: f"{random.choice(self.verbs)} {random.choice(self.activities)} {random.choice(self.activities)}",
# Pattern with multiple gaps
lambda: f"{random.choice(self.activities)} {random.choice(self.modifiers)} {random.choice(self.connectors)} {random.choice(self.activities)}",
]
return random.choice(patterns)()
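# Example outputs (illustrative): "swimming tennis", "yoga and hiking",
# "love golf surfing", "tennis often but cycling" -- two activities with an
# optional verb/connector/modifier between them, which phrase-match queries
# with different slop values are later run against.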
def generate_test_data(self, num_documents: int, dim: int) -> List[Dict]:
"""
Generate test documents with text and embeddings
Args:
num_documents: Number of documents to generate
dim: Dimension of embedding vectors
Returns:
List of dictionaries containing document data
"""
# Generate documents
self.documents = []
for i in range(num_documents):
self.documents.append(
{
"id": i,
"text": self.generate_text_pattern()
if self.language == "en"
else self.generate_text_pattern().replace(" ", ""),
"emb": self.generate_embedding(dim),
}
)
# Initialize Tantivy index
schema_builder = SchemaBuilder()
schema_builder.add_text_field("text", stored=True)
schema_builder.add_unsigned_field("doc_id", stored=True)
schema = schema_builder.build()
self.index = Index(schema=schema, path=None)
writer = self.index.writer()
# Index all documents
for doc in self.documents:
document = Document()
new_text = " ".join(self.tokenize_text(doc["text"]))
document.add_text("text", new_text)
document.add_unsigned("doc_id", doc["id"])
writer.add_document(document)
writer.commit()
self.index.reload()
return self.documents
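# Usage sketch (illustrative; the numbers are placeholders): build the corpus
# once, insert it into the collection under test, and reuse the tantivy index
# for ground truth, e.g.
#   gen = PhraseMatchTestGenerator(language="en")
#   docs = gen.generate_test_data(num_documents=3000, dim=128)
#   expected_ids = gen.get_query_results("swimming football", slop=2)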
def _generate_random_word(self, exclude_words: List[str]) -> str:
"""
Generate a random word that is not in the exclude_words list using Faker
"""
fake = Faker()
while True:
word = fake.word()
if word not in exclude_words:
return word
def generate_pattern_documents(self, patterns: List[tuple], dim: int, num_docs_per_pattern: int = 1) -> List[Dict]:
"""
Generate documents that match specific test patterns with their corresponding slop values
Args:
patterns: List of tuples containing (pattern, slop) pairs
dim: Dimension of embedding vectors
num_docs_per_pattern: Number of documents to generate for each pattern
Returns:
List of dictionaries containing document data with text and embeddings
"""
pattern_documents = []
for pattern, slop in patterns:
# Split pattern into components
pattern_words = pattern.split()
# Generate multiple documents for each pattern
if slop == 0: # Exact phrase
text = " ".join(pattern_words)
pattern_documents.append({
"id": random.randint(0, 1000000), "text": text, "emb": self.generate_embedding(dim)})
else: # Pattern with gaps
# Generate slop number of unique words
insert_words = []
for _ in range(slop):
new_word = self._generate_random_word(pattern_words + insert_words)
insert_words.append(new_word)
# Insert the words randomly between the pattern words
all_words = pattern_words.copy()
for word in insert_words:
# Random position between pattern words
pos = random.randint(1, len(all_words))
all_words.insert(pos, word)
text = " ".join(all_words)
pattern_documents.append({
"id": random.randint(0, 1000000),
"text": text,
"emb": self.generate_embedding(dim)})
new_pattern_documents = []
start = 1000000
for i in range(num_docs_per_pattern):
for doc in pattern_documents:
new_doc = dict(doc)
new_doc["id"] = start + len(new_pattern_documents)
new_pattern_documents.append(new_doc)
return new_pattern_documents
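# Illustrative example: for patterns=[("machine learning", 2)] a generated text
# could be "machine quickly really learning" -- the pattern words keep their
# order and `slop` Faker-generated filler words are inserted between/after
# them, so the document still matches the phrase query at that slop value.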
def generate_test_queries(self, num_queries: int) -> List[Dict]:
"""
Generate test queries with varying slop values
Args:
num_queries: Number of queries to generate
Returns:
List of dictionaries containing query information
"""
queries = []
slop_values = [0, 1, 2, 3] # Common slop values
for i in range(num_queries):
# Randomly select two or three words for the query
num_words = random.choice([2, 3])
words = random.sample(self.activities, num_words)
queries.append(
{
"id": i,
"query": " ".join(words)
if self.language == "en"
else "".join(words),
"slop": random.choice(slop_values),
"type": f"{num_words}_words",
}
)
return queries
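# Example query dict (illustrative):
#   {"id": 0, "query": "swimming football", "slop": 2, "type": "2_words"}
# For Chinese the sampled words are joined without spaces, matching how the
# documents themselves are generated above.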
def get_query_results(self, query: str, slop: int) -> List[int]:
"""
Get the ids of all documents that match the phrase query
Args:
query: Query phrase
slop: Maximum allowed word gap
Returns:
List[int]: IDs of the matching documents
"""
if self.index is None:
raise RuntimeError("No documents indexed. Call generate_test_data first.")
# Clean and normalize query
query_terms = self.tokenize_text(query)
# Create phrase query
searcher = self.index.searcher()
phrase_query = Query.phrase_query(self.index.schema, "text", query_terms, slop)
# Search for matches
results = searcher.search(phrase_query, limit=len(self.documents))
# Extract all matching documents
matched_docs = []
for _, doc_address in results.hits:
doc = searcher.doc(doc_address)
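# tantivy's Document.to_dict() returns each field's values as a list, so
# "doc_id" below is a one-element list and extend() flattens it into ints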
doc_id = doc.to_dict()["doc_id"]
matched_docs.extend(doc_id)
return matched_docs
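# Typical use in a test (sketch): the ids returned here serve as ground truth
# for a phrase-match filter run against the same corpus in Milvus, e.g.
#   expected = set(gen.get_query_results(q["query"], q["slop"]))
#   assert actual_ids_from_milvus == expected
# (the exact filter expression on the Milvus side depends on the test case)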
class KoreanTextGenerator:
def __init__(self):
# Sports/Activities (Nouns)
self.activities = [
"수영", "축구", "농구", "테니스",
"배구", "야구", "골프", "럭비",
"달리기", "자전거", "스케이트", "스키",
"서핑", "다이빙", "등산", "요가",
"", "하이킹", "독서", "요리"
]
# Verbs (Base Form)
self.verbs = [
"좋아하다", "즐기다", "하다", "배우다",
"가르치다", "보다", "시작하다", "계속하다",
"연습하다", "선호하다", "마스터하다", "도전하다"
]
# Connectors
self.connectors = [
"그리고", "또는", "하지만", "그런데",
"그래서", "또한", "게다가", "그러면서",
"동시에", "함께"
]
# Modifiers (Frequency/Degree)
self.modifiers = [
"매우", "자주", "가끔", "열심히",
"전문적으로", "규칙적으로", "매일", "일주일에 한 번",
"취미로", "진지하게"
]
def conjugate_verb(self, verb):
# Simple Korean verb conjugation (using informal style "-아/어요")
if verb.endswith("하다"):
return verb.replace("하다", "해요")
elif verb.endswith(""):
return verb[:-1] + "아요"
return verb
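# Examples of this simplified rule (illustrative): "좋아하다" -> "좋아해요",
# "배우다" -> "배우아요". The bare "-아요" attachment is deliberately naive
# (natural Korean would contract to "배워요"); it only needs to produce
# tokenizable sentences for analyzer tests.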
def sentence(self):
# Build basic sentence structure
activity = random.choice(self.activities)
verb = random.choice(self.verbs)
modifier = random.choice(self.modifiers)
# Conjugate verb
conjugated_verb = self.conjugate_verb(verb)
# Build sentence (Korean word order: Subject + Object + Modifier + Verb)
sentence = f"저는 {activity}를/을 {modifier} {conjugated_verb}"
# Randomly add connector and another activity
if random.choice([True, False]):
connector = random.choice(self.connectors)
second_activity = random.choice(self.activities)
second_verb = self.conjugate_verb(random.choice(self.verbs))
sentence += f" {connector} {second_activity}{second_verb}"
return sentence + "."
def paragraph(self, num_sentences=3):
return '\n'.join([self.sentence() for _ in range(num_sentences)])
def text(self, num_sentences=5):
return '\n'.join([self.sentence() for _ in range(num_sentences)])
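# Illustrative usage: KoreanTextGenerator().paragraph(2) might produce
#   저는 수영를/을 매우 좋아해요 그리고 요가배우아요.
#   저는 골프를/을 자주 해요.
# The particle handling ("를/을") and spacing are intentionally rough; the
# output only has to give the lindera/ko-dic analyzer realistic tokens.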
def generate_text_by_analyzer(analyzer_params):
"""
Generate text data based on the given analyzer parameters
Args:
analyzer_params: Dictionary containing the analyzer parameters
Returns:
str: Generated text data
"""
if analyzer_params["tokenizer"] == "standard":
fake = Faker("en_US")
elif analyzer_params["tokenizer"] == "jieba":
fake = Faker("zh_CN")
elif analyzer_params["tokenizer"]["type"] == "lindera":
# Generate random Japanese text
if analyzer_params["tokenizer"]["dict_kind"] == "ipadic":
fake = Faker("ja_JP")
elif analyzer_params["tokenizer"]["dict_kind"] == "ko-dic":
fake = KoreanTextGenerator()
elif analyzer_params["tokenizer"]["dict_kind"] == "cc-cedict":
fake = Faker("zh_CN")
else:
raise ValueError("Invalid dict_kind")
else:
raise ValueError("Invalid analyzer parameters")
text = fake.text()
stop_words = []
if "filter" in analyzer_params:
for filter in analyzer_params["filter"]:
if filter["type"] == "stop":
stop_words.extend(filter["stop_words"])
# add stop words to the text
text += " " + " ".join(stop_words)
return text
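
# Minimal smoke test (illustrative sketch, not part of the test suite): runs the
# generators end to end, assuming tantivy, jieba and Faker are installed.
if __name__ == "__main__":
    gen = PhraseMatchTestGenerator(language="en")
    gen.generate_test_data(num_documents=50, dim=8)
    for q in gen.generate_test_queries(num_queries=3):
        # ground-truth ids from the tantivy index built above
        print(q["query"], q["slop"], gen.get_query_results(q["query"], q["slop"]))
    # analyzer-driven text generation, e.g. for a jieba-tokenized field with a stop filter
    print(generate_text_by_analyzer({"tokenizer": "jieba", "filter": [{"type": "stop", "stop_words": ["的", "了"]}]}))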