# milvus/tests/python_client/testcases/test_full_text_search.py

from pymilvus import (
FieldSchema, CollectionSchema, DataType, Function, FunctionType, AnnSearchRequest, WeightedRanker
)
from common.common_type import CaseLabel, CheckTasks
from common import common_func as cf
from common import common_type as ct
from utils.util_log import test_log as log
from base.client_base import TestcaseBase
import random
import pytest
import pandas as pd
from faker import Faker
Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")
# patch faker to generate text with specific distribution
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
pd.set_option("expand_frame_repr", False)
prefix = "full_text_search_collection"
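# The cases below exercise the typical full-text-search flow (a sketch of the
# steps each test performs in detail):
#   1. schema: a VARCHAR field with enable_analyzer=True plus a
#      SPARSE_FLOAT_VECTOR field to receive the generated embeddings
#   2. function: Function(function_type=FunctionType.BM25,
#      input_field_names=["text"], output_field_names=["text_sparse_emb"])
#      registered on the schema
#   3. index: SPARSE_INVERTED_INDEX (or SPARSE_WAND) with metric_type="BM25"
#   4. search: raw query strings are passed as `data` against the sparse field;
#      the server analyzes them and ranks hits by BM25 score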
class TestCreateCollectionWithFullTextSearch(TestcaseBase):
"""
******************************************************************
The following cases are used to test create collection with full text search
******************************************************************
"""
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_create_collection_for_full_text_search(self, tokenizer):
"""
target: test create collection with full text search
method: create collection with full text search, use bm25 function
expected: create collection successfully
"""
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=True,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
FieldSchema(name="paragraph_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
text_fields = ["text", "paragraph"]
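        # one BM25 function per text field: the VARCHAR field is the input and the
        # matching SPARSE_FLOAT_VECTOR field receives the server-computed embedding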
for field in text_fields:
bm25_function = Function(
name=f"{field}_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=[field],
output_field_names=[f"{field}_sparse_emb"],
params={},
)
schema.add_function(bm25_function)
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
res, _ = collection_w.describe()
assert len(res["functions"]) == len(text_fields)
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_create_collection_for_full_text_search_twice_with_same_schema(self, tokenizer):
"""
target: test create collection with full text search twice with same schema
method: create collection with full text search, use bm25 function, then create again
expected: create collection successfully and create again successfully
"""
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=True,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
FieldSchema(name="paragraph_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
text_fields = ["text", "paragraph"]
for field in text_fields:
bm25_function = Function(
name=f"{field}_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=[field],
output_field_names=[f"{field}_sparse_emb"],
params={},
)
schema.add_function(bm25_function)
c_name = cf.gen_unique_str(prefix)
self.init_collection_wrap(
name=c_name, schema=schema
)
collection_w = self.init_collection_wrap(
name=c_name, schema=schema
)
res, _ = collection_w.describe()
assert len(res["functions"]) == len(text_fields)
# @pytest.mark.skip("skip")
class TestCreateCollectionWithFullTextSearchNegative(TestcaseBase):
"""
******************************************************************
The following cases are used to test create collection with full text search negative
******************************************************************
"""
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("tokenizer", ["unsupported"])
@pytest.mark.skip(reason="check not implement may cause panic")
def test_create_collection_for_full_text_search_with_unsupported_tokenizer(self, tokenizer):
"""
target: test create collection with full text search with unsupported tokenizer
method: create collection with full text search, use bm25 function and unsupported tokenizer
expected: create collection failed
"""
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=True,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
FieldSchema(name="paragraph_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
text_fields = ["text", "paragraph"]
for field in text_fields:
bm25_function = Function(
name=f"{field}_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=[field],
output_field_names=[f"{field}_sparse_emb"],
params={},
)
schema.add_function(bm25_function)
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
res, result = collection_w.describe()
log.info(f"collection describe {res}")
        assert not result, "create collection with unsupported tokenizer should fail"
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("valid_output", [True, False])
@pytest.mark.parametrize("valid_input", [True, False])
def test_create_collection_for_full_text_search_with_invalid_input_output(self, valid_output, valid_input):
"""
target: test create collection with full text search with invalid input/output in bm25 function
method: create collection with full text search, use bm25 function and invalid input/output
expected: create collection failed
"""
analyzer_params = {
"tokenizer": "standard",
}
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=True,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
FieldSchema(name="paragraph_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
if valid_input:
input_field_names = ["text"]
else:
input_field_names = ["invalid_inout"]
if valid_output:
output_field_names = ["text_sparse_emb"]
else:
output_field_names = ["invalid_output"]
bm25_function = Function(
name=f"text_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=input_field_names,
output_field_names=output_field_names,
params={},
)
schema.add_function(bm25_function)
if (not valid_output) or (not valid_input):
self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema,
check_task=CheckTasks.err_res,
check_items={ct.err_code: 1, ct.err_msg: "field not found in collection"}
)
else:
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
res, result = collection_w.describe()
log.info(f"collection describe {res}")
assert result, "create collection with valid input/output should be successful"
@pytest.mark.tags(CaseLabel.L1)
def test_create_collection_for_full_text_search_with_field_not_tokenized(self):
"""
target: test create collection with full text search with field not tokenized
method: create collection with full text search, use bm25 function and input field not tokenized
expected: create collection failed
"""
analyzer_params = {
"tokenizer": "standard",
}
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=True,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=False,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
FieldSchema(name="paragraph_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
        bm25_function = Function(
            name="text_bm25_emb",
            function_type=FunctionType.BM25,
            input_field_names=["text"],
            output_field_names=["text_sparse_emb"],
            params={},
        )
schema.add_function(bm25_function)
check_task = CheckTasks.err_res
check_items = {ct.err_code: 65535, ct.err_msg: "BM25 function input field must set enable_analyzer to true"}
self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema,
check_task=check_task,
check_items=check_items
)
# @pytest.mark.skip("skip")
class TestInsertWithFullTextSearch(TestcaseBase):
"""
******************************************************************
The following cases are used to test insert with full text search
******************************************************************
"""
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("nullable", [False, True])
@pytest.mark.parametrize("text_lang", ["en", "zh", "hybrid"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_insert_for_full_text_search_default(self, tokenizer, text_lang, nullable):
"""
target: test insert data with full text search
method: 1. insert data with varchar in different language
2. query count and verify the result
expected: insert successfully and count is correct
"""
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=True,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
nullable=nullable,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
nullable=nullable,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
bm25_function = Function(
name="text_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=["text"],
output_field_names=["text_sparse_emb"],
params={},
)
schema.add_function(bm25_function)
data_size = 5000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
fake = fake_en
if text_lang == "zh":
fake = fake_zh
elif text_lang == "hybrid":
fake = Faker()
if nullable:
data = [
{
"id": i,
"word": fake.word().lower(),
"sentence": fake.sentence().lower() if random.random() < 0.5 else None,
"paragraph": fake.paragraph().lower() if random.random() < 0.5 else None,
"text": fake.text().lower(), # function input should not be None
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
else:
data = [
{
"id": i,
"word": fake.word().lower(),
"sentence": fake.sentence().lower(),
"paragraph": fake.paragraph().lower(),
"text": fake.text().lower(),
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
if text_lang == "hybrid":
hybrid_data = []
for i in range(data_size):
fake = random.choice([fake_en, fake_zh, Faker("de_DE")])
tmp = {
"id": i,
"word": fake.word().lower(),
"sentence": fake.sentence().lower(),
"paragraph": fake.paragraph().lower(),
"text": fake.text().lower(),
"emb": [random.random() for _ in range(dim)],
}
hybrid_data.append(tmp)
data = hybrid_data + data
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
data[i: i + batch_size]
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
)
collection_w.create_index(
"text_sparse_emb",
{
"index_type": "SPARSE_INVERTED_INDEX",
"metric_type": "BM25",
"params": {
"drop_ratio_build": 0.3,
"bm25_k1": 1.5,
"bm25_b": 0.75,
}
}
)
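        # BM25 index parameters: bm25_k1 controls term-frequency saturation and
        # bm25_b controls document-length normalization (standard BM25 semantics);
        # drop_ratio_build is expected to prune a fraction of the smallest sparse
        # values at build time to shrink the index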
collection_w.create_index("text", {"index_type": "INVERTED"})
collection_w.load()
num_entities = collection_w.num_entities
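        # num_entities reflects the rows written to the collection, while count(*)
        # below is evaluated at query time; both should equal the inserted row count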
res, _ = collection_w.query(
expr="",
output_fields=["count(*)"]
)
count = res[0]["count(*)"]
assert len(data) == num_entities
assert len(data) == count
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("enable_dynamic_field", [True])
@pytest.mark.parametrize("nullable", [False])
@pytest.mark.parametrize("text_lang", ["en"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_insert_for_full_text_search_enable_dynamic_field(self, tokenizer, text_lang, nullable, enable_dynamic_field):
"""
target: test insert data with full text search and enable dynamic field
method: 1. create collection with full text search and enable dynamic field
2. insert data with varchar
3. query count and verify the result
expected: insert successfully and count is correct
"""
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=True,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
nullable=nullable,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
nullable=nullable,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection", enable_dynamic_field=enable_dynamic_field)
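        # with enable_dynamic_field=True, keys not declared in the schema (the
        # per-row dynamic_field_{i} below) are accepted and stored in the dynamic field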
bm25_function = Function(
name="text_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=["text"],
output_field_names=["text_sparse_emb"],
params={},
)
schema.add_function(bm25_function)
data_size = 5000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
fake = fake_en
if text_lang == "zh":
fake = fake_zh
elif text_lang == "de":
fake = Faker("de_DE")
elif text_lang == "hybrid":
fake = Faker()
if nullable:
data = [
{
"id": i,
"word": fake.word().lower(),
"sentence": fake.sentence().lower() if random.random() < 0.5 else None,
"paragraph": fake.paragraph().lower() if random.random() < 0.5 else None,
"text": fake.text().lower(), # function input should not be None
"emb": [random.random() for _ in range(dim)],
f"dynamic_field_{i}": f"dynamic_value_{i}"
}
for i in range(data_size)
]
else:
data = [
{
"id": i,
"word": fake.word().lower(),
"sentence": fake.sentence().lower(),
"paragraph": fake.paragraph().lower(),
"text": fake.text().lower(),
"emb": [random.random() for _ in range(dim)],
f"dynamic_field_{i}": f"dynamic_value_{i}"
}
for i in range(data_size)
]
if text_lang == "hybrid":
hybrid_data = []
for i in range(data_size):
fake = random.choice([fake_en, fake_zh, Faker("de_DE")])
tmp = {
"id": i,
"word": fake.word().lower(),
"sentence": fake.sentence().lower(),
"paragraph": fake.paragraph().lower(),
"text": fake.text().lower(),
"emb": [random.random() for _ in range(dim)],
f"dynamic_field_{i}": f"dynamic_value_{i}"
}
hybrid_data.append(tmp)
data = hybrid_data + data
# df = pd.DataFrame(data)
# log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(data), batch_size):
collection_w.insert(
data[i: i + batch_size]
if i + batch_size < len(data)
else data[i: len(data)]
)
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
)
collection_w.create_index(
"text_sparse_emb",
{
"index_type": "SPARSE_INVERTED_INDEX",
"metric_type": "BM25",
"params": {
"drop_ratio_build": 0.3,
"bm25_k1": 1.5,
"bm25_b": 0.75,
}
}
)
collection_w.create_index("text", {"index_type": "INVERTED"})
collection_w.load()
num_entities = collection_w.num_entities
res, _ = collection_w.query(
expr="",
output_fields=["count(*)"]
)
count = res[0]["count(*)"]
assert len(data) == num_entities
assert len(data) == count
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("nullable", [True])
@pytest.mark.parametrize("text_lang", ["en"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_insert_for_full_text_search_with_dataframe(self, tokenizer, text_lang, nullable):
"""
target: test insert data for full text search with dataframe
method: 1. insert data with varchar in dataframe format
                2. query count and verify the result
        expected: insert successfully and count is correct
        """
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=True,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
nullable=nullable,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
nullable=nullable,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
bm25_function = Function(
name="text_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=["text"],
output_field_names=["text_sparse_emb"],
params={},
)
schema.add_function(bm25_function)
data_size = 5000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
fake = fake_en
if text_lang == "zh":
fake = fake_zh
elif text_lang == "de":
fake = Faker("de_DE")
elif text_lang == "hybrid":
fake = Faker()
if nullable:
data = [
{
"id": i,
"word": fake.word().lower(),
"sentence": fake.sentence().lower() if random.random() < 0.5 else None,
"paragraph": fake.paragraph().lower() if random.random() < 0.5 else None,
"text": fake.text().lower(), # function input should not be None
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
else:
data = [
{
"id": i,
"word": fake.word().lower(),
"sentence": fake.sentence().lower(),
"paragraph": fake.paragraph().lower(),
"text": fake.text().lower(),
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
if text_lang == "hybrid":
hybrid_data = []
for i in range(data_size):
fake = random.choice([fake_en, fake_zh, Faker("de_DE")])
tmp = {
"id": i,
"word": fake.word().lower(),
"sentence": fake.sentence().lower(),
"paragraph": fake.paragraph().lower(),
"text": fake.text().lower(),
"emb": [random.random() for _ in range(dim)],
}
hybrid_data.append(tmp)
data = hybrid_data + data
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(df[i: i + batch_size])
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
)
collection_w.create_index(
"text_sparse_emb",
{
"index_type": "SPARSE_INVERTED_INDEX",
"metric_type": "BM25",
"params": {
"drop_ratio_build": 0.3,
"bm25_k1": 1.5,
"bm25_b": 0.75,
}
}
)
collection_w.create_index("text", {"index_type": "INVERTED"})
collection_w.load()
num_entities = collection_w.num_entities
res, _ = collection_w.query(
expr="",
output_fields=["count(*)"]
)
count = res[0]["count(*)"]
assert len(data) == num_entities
assert len(data) == count
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_insert_for_full_text_search_with_part_of_empty_string(self, tokenizer):
"""
target: test insert data with full text search with part of empty string
method: 1. insert data with part of empty string
2. query count and verify the result
3. search with text
expected: insert successfully, count is correct, and search result is correct
"""
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=True,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
bm25_function = Function(
name="text_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=["text"],
output_field_names=["text_sparse_emb"],
params={},
)
schema.add_function(bm25_function)
data_size = 5000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
log.info(f"collection describe {collection_w.describe()}")
fake = fake_en
language = "en"
if tokenizer == "jieba":
fake = fake_zh
language = "zh"
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
)
collection_w.create_index(
"text_sparse_emb",
{
"index_type": "SPARSE_INVERTED_INDEX",
"metric_type": "BM25",
"params": {
"drop_ratio_build": 0.3,
"bm25_k1": 1.5,
"bm25_b": 0.75,
}
}
)
collection_w.create_index("text", {"index_type": "INVERTED"})
collection_w.load()
data = [
{
"id": i,
"word": fake.word().lower() if random.random() < 0.5 else "",
"sentence": fake.sentence().lower() if random.random() < 0.5 else "",
"paragraph": fake.paragraph().lower() if random.random() < 0.5 else "",
"text": fake.text().lower() if random.random() < 0.5 else "",
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
log.info(f"analyze documents")
texts = df["text"].to_list()
word_freq = cf.analyze_documents(texts, language=language)
tokens = list(word_freq.keys())
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
data[i: i + batch_size]
if i + batch_size < len(df)
else data[i: len(df)]
)
num_entities = collection_w.num_entities
# query with count(*)
res, _ = collection_w.query(
expr="",
output_fields=["count(*)"]
)
count = res[0]["count(*)"]
assert len(data) == num_entities
assert len(data) == count
# query with expr
res, _ = collection_w.query(
expr="id >= 0",
output_fields=["text"]
)
assert len(res) == len(data)
# search with text
nq = 2
limit = 100
search_data = [fake.text().lower() + random.choice(tokens) for _ in range(nq)]
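        # queries are raw text salted with a known token; the server analyzes each
        # query with the same BM25 function before scoring, so no client-side
        # embedding is needed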
res_list, _ = collection_w.search(
data=search_data,
anns_field="text_sparse_emb",
param={},
limit=limit,
output_fields=["id", "text"])
assert len(res_list) == nq
for i in range(nq):
assert len(res_list[i]) == limit
search_text = search_data[i]
log.info(f"res: {res_list[i]}")
res = res_list[i]
for j in range(len(res)):
r = res[j]
result_text = r.text
overlap, word_freq_a, word_freq_b = cf.check_token_overlap(search_text, result_text, language=language)
assert len(
overlap) > 0, f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
# @pytest.mark.skip("skip")
class TestInsertWithFullTextSearchNegative(TestcaseBase):
"""
******************************************************************
The following cases are used to test insert with full text search negative
******************************************************************
"""
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("nullable", [True])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_insert_with_full_text_search_with_non_varchar_data(self, tokenizer, nullable):
"""
target: test insert data with full text search with non varchar data
method: 1. insert data with non varchar data
expected: insert failed
"""
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=True,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
nullable=nullable,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
nullable=nullable,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
bm25_function = Function(
name="text_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=["text"],
output_field_names=["text_sparse_emb"],
params={},
)
schema.add_function(bm25_function)
data_size = 5000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
fake = fake_en
if tokenizer == "jieba":
fake = fake_zh
data = [
{
"id": i,
"word": fake.word().lower(),
"sentence": fake.sentence().lower(),
"paragraph": fake.paragraph().lower(),
"text": fake.text().lower() if random.random() < 0.5 else 1, # mix some int data
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
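        # rows whose "text" value is an int violate the VARCHAR schema, so the
        # insert below is expected to be rejected with a schema-mismatch error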
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
data[i: i + batch_size]
if i + batch_size < len(df)
else data[i: len(df)],
check_task=CheckTasks.err_res,
check_items={ct.err_code: 1, ct.err_msg: "inconsistent with defined schema"},
)
# @pytest.mark.skip("skip")
class TestUpsertWithFullTextSearch(TestcaseBase):
"""
******************************************************************
The following cases are used to test upsert with full text search
******************************************************************
"""
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("nullable", [False, True])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_upsert_for_full_text_search(self, tokenizer, nullable):
"""
target: test upsert data for full text search
method: 1. insert data with varchar
                2. upsert the first half of the data
3. check the data
expected: upsert successfully and data is updated
"""
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=True,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
nullable=nullable,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
nullable=nullable,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
bm25_function = Function(
name="text_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=["text"],
output_field_names=["text_sparse_emb"],
params={},
)
schema.add_function(bm25_function)
data_size = 5000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
fake = fake_en
language = "en"
if tokenizer == "jieba":
fake = fake_zh
language = "zh"
if nullable:
data = [
{
"id": i,
"word": fake.word().lower(),
"sentence": fake.sentence().lower() if random.random() < 0.5 else None,
"paragraph": fake.paragraph().lower() if random.random() < 0.5 else None,
"text": fake.text().lower(), # function input should not be None
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
else:
data = [
{
"id": i,
"word": fake.word().lower(),
"sentence": fake.sentence().lower(),
"paragraph": fake.paragraph().lower(),
"text": fake.text().lower(),
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
data[i: i + batch_size]
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
)
collection_w.create_index(
"text_sparse_emb",
{
"index_type": "SPARSE_INVERTED_INDEX",
"metric_type": "BM25",
"params": {
"drop_ratio_build": 0.3,
"bm25_k1": 1.5,
"bm25_b": 0.75,
}
}
)
collection_w.create_index("text", {"index_type": "INVERTED"})
collection_w.load()
num_entities = collection_w.num_entities
res, _ = collection_w.query(
expr="",
output_fields=["count(*)"]
)
count = res[0]["count(*)"]
assert len(data) == num_entities
assert len(data) == count
        # upsert the first half of the data with newly generated rows
upsert_data = [
{
"id": i,
"word": fake.word().lower(),
"sentence": fake.sentence().lower(),
"paragraph": fake.paragraph().lower(),
"text": fake.text().lower(),
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size // 2)
]
upsert_data += data[data_size // 2:]
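        # the second half is re-upserted unchanged, so the collection still holds
        # exactly data_size entities; the query below verifies the refreshed rows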
for i in range(0, len(upsert_data), batch_size):
collection_w.upsert(
upsert_data[i: i + batch_size]
if i + batch_size < len(upsert_data)
else upsert_data[i: len(upsert_data)]
)
res, _ = collection_w.query(
expr="id >= 0",
output_fields=["*"]
)
upsert_data_map = {}
for d in upsert_data:
upsert_data_map[d["id"]] = d
for r in res:
_id = r["id"]
word = r["word"]
assert word == upsert_data_map[_id]["word"]
# @pytest.mark.skip("skip")
class TestUpsertWithFullTextSearchNegative(TestcaseBase):
"""
******************************************************************
The following cases are used to test upsert data in full text search with negative condition
******************************************************************
"""
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("nullable", [False])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_upsert_for_full_text_search_with_no_varchar_data(self, tokenizer, nullable):
"""
target: test upsert data for full text search with no varchar data
method: 1. insert data with varchar data
                2. upsert with some rows whose text value is an int
expected: upsert failed
"""
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=True,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
nullable=nullable,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
nullable=nullable,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
bm25_function = Function(
name="text_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=["text"],
output_field_names=["text_sparse_emb"],
params={},
)
schema.add_function(bm25_function)
data_size = 5000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
fake = fake_en
language = "en"
if tokenizer == "jieba":
fake = fake_zh
language = "zh"
data = [
{
"id": i,
"word": fake.word().lower(),
"sentence": fake.sentence().lower(),
"paragraph": fake.paragraph().lower(),
"text": fake.text().lower(),
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
data[i: i + batch_size]
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
)
collection_w.create_index(
"text_sparse_emb",
{
"index_type": "SPARSE_INVERTED_INDEX",
"metric_type": "BM25",
"params": {
"drop_ratio_build": 0.3,
"bm25_k1": 1.5,
"bm25_b": 0.75,
}
}
)
collection_w.create_index("text", {"index_type": "INVERTED"})
collection_w.load()
num_entities = collection_w.num_entities
res, _ = collection_w.query(
expr="",
output_fields=["count(*)"]
)
count = res[0]["count(*)"]
assert len(data) == num_entities
assert len(data) == count
        # upsert over the full id range, mixing int values into the text field
upsert_data = [
{
"id": i,
"word": fake.word().lower(),
"sentence": fake.sentence().lower(),
"paragraph": fake.paragraph().lower(),
"text": fake.text().lower() if random.random() < 0.5 else 1, # mix some int data
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
check_items = {ct.err_code: 1, ct.err_msg: "inconsistent with defined schema"}
check_task = CheckTasks.err_res
collection_w.upsert(upsert_data,
check_task=check_task,
check_items=check_items)
class TestDeleteWithFullTextSearch(TestcaseBase):
"""
******************************************************************
The following cases are used to test delete data in full text search
******************************************************************
"""
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_delete_for_full_text_search(self, tokenizer):
"""
target: test delete data for full text search
method: 1. insert data with varchar
2. delete half of the data
3. check the data
expected: delete successfully and data is deleted
"""
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=True,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
bm25_function = Function(
name="text_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=["text"],
output_field_names=["text_sparse_emb"],
params={},
)
schema.add_function(bm25_function)
data_size = 5000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
fake = fake_en
if tokenizer == "jieba":
fake = fake_zh
data = [
{
"id": i,
"word": fake.word().lower(),
"sentence": fake.sentence().lower(),
"paragraph": fake.paragraph().lower(),
"text": fake.text().lower(),
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
data[i: i + batch_size]
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
)
collection_w.create_index(
"text_sparse_emb",
{
"index_type": "SPARSE_INVERTED_INDEX",
"metric_type": "BM25",
"params": {
"drop_ratio_build": 0.3,
"bm25_k1": 1.5,
"bm25_b": 0.75,
}
}
)
collection_w.create_index("text", {"index_type": "INVERTED"})
collection_w.load()
num_entities = collection_w.num_entities
res, _ = collection_w.query(
expr="",
output_fields=["count(*)"]
)
count = res[0]["count(*)"]
assert len(data) == num_entities
assert len(data) == count
# delete half of the data
delete_ids = [i for i in range(data_size // 2)]
collection_w.delete(
expr=f"id in {delete_ids}"
)
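        # delete by a primary-key list expression; the count(*) query below
        # should reflect the removed rows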
res, _ = collection_w.query(
expr="",
output_fields=["count(*)"]
)
count = res[0]["count(*)"]
assert count == data_size // 2
        # query with the deleted ids and expect an empty result
res, _ = collection_w.query(
expr=f"id in {delete_ids}",
output_fields=["*"]
)
assert len(res) == 0
        # search with texts that have been deleted; they should not appear in the results
search_data = df["text"].to_list()[:data_size // 2]
res_list, _ = collection_w.search(
data=search_data,
anns_field="text_sparse_emb",
param={},
limit=100,
output_fields=["id", "text"])
for i in range(len(res_list)):
query_text = search_data[i]
result_texts = [r.text for r in res_list[i]]
assert query_text not in result_texts
class TestDeleteWithFullTextSearchNegative(TestcaseBase):
"""
todo: add some negative cases
"""
pass
# @pytest.mark.skip("skip")
class TestCreateIndexWithFullTextSearch(TestcaseBase):
"""
******************************************************************
The following cases are used to test full text search in index creation
******************************************************************
"""
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("b", [0.1])
@pytest.mark.parametrize("k", [1.2])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX", "SPARSE_WAND"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_create_index_for_full_text_search_default(
self, tokenizer, index_type, k, b
):
"""
target: test create index for full text search
method: 1. enable full text search and insert data with varchar
2. create index for full text search with different index type
3. verify the index info by describe index
expected: create index successfully and index info is correct
"""
analyzer_params = {
"tokenizer": tokenizer,
}
empty_percent = 0.0
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=True,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
bm25_function = Function(
name="text_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=["text"],
output_field_names=["text_sparse_emb"],
params={},
)
schema.add_function(bm25_function)
data_size = 5000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
fake = fake_en
data = [
{
"id": i,
"word": fake.word().lower() if random.random() >= empty_percent else "",
"sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
"paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
"text": fake.text().lower() if random.random() >= empty_percent else "",
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
data[i: i + batch_size]
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
)
collection_w.create_index(
"text_sparse_emb",
{
"index_type": index_type,
"metric_type": "BM25",
"params": {
"bm25_k1": k,
"bm25_b": b,
}
}
)
# describe index info to verify
res = collection_w.indexes
index_info = [r.to_dict() for r in res]
log.info(f"index info: {index_info}")
for info in index_info:
if info["index_name"] == "text_sparse_emb":
assert info["index_param"]["index_type"] == index_type
assert info["index_param"]["metric_type"] == "BM25"
assert info["index_param"]["params"]["bm25_k1"] == k
assert info["index_param"]["params"]["bm25_b"] == b
break
class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
"""
******************************************************************
The following cases are used to test full text search in index creation negative
******************************************************************
"""
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("b", [0.5])
@pytest.mark.parametrize("k", [1.5])
@pytest.mark.parametrize("index_type", ["HNSW", "INVALID_INDEX_TYPE"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_create_full_text_search_with_invalid_index_type(
self, tokenizer, index_type, k, b
):
"""
target: test create index for full text search with invalid index type
method: 1. enable full text search and insert data with varchar
2. create index for full text search with invalid index type
expected: create index failed
"""
analyzer_params = {
"tokenizer": tokenizer,
}
empty_percent = 0.0
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=True,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
bm25_function = Function(
name="text_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=["text"],
output_field_names=["text_sparse_emb"],
params={},
)
schema.add_function(bm25_function)
data_size = 5000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
fake = fake_en
data = [
{
"id": i,
"word": fake.word().lower() if random.random() >= empty_percent else "",
"sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
"paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
"text": fake.text().lower() if random.random() >= empty_percent else "",
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
data[i: i + batch_size]
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
)
error = {"err_code": 1100, "err_msg": "invalid"}
collection_w.create_index(
"text_sparse_emb",
{
"index_type": index_type,
"metric_type": "BM25",
"params": {
"bm25_k1": k,
"bm25_b": b,
}
},
check_task=CheckTasks.err_res,
check_items=error
)
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("b", [0.5])
@pytest.mark.parametrize("k", [1.5])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("metric_type", ["COSINE", "L2", "IP"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_create_full_text_search_index_with_invalid_metric_type(
self, tokenizer, index_type, metric_type, k, b
):
"""
target: test create index for full text search with invalid metric type
method: 1. enable full text search and insert data with varchar
2. create index for full text search with invalid metric type
expected: create index failed
"""
analyzer_params = {
"tokenizer": tokenizer,
}
empty_percent = 0.0
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=True,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
bm25_function = Function(
name="text_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=["text"],
output_field_names=["text_sparse_emb"],
params={},
)
schema.add_function(bm25_function)
data_size = 5000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
fake = fake_en
data = [
{
"id": i,
"word": fake.word().lower() if random.random() >= empty_percent else "",
"sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
"paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
"text": fake.text().lower() if random.random() >= empty_percent else "",
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
data[i: i + batch_size]
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
)
error = {ct.err_code: 65535, ct.err_msg: "index metric type of BM25 function output field must be BM25"}
collection_w.create_index(
"text_sparse_emb",
{
"index_type": index_type,
"metric_type": metric_type,
"params": {
"bm25_k1": k,
"bm25_b": b,
}
},
check_task=CheckTasks.err_res,
check_items=error
)
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("b", [0.5])
@pytest.mark.parametrize("k", [1.5])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_create_index_using_bm25_metric_type_for_non_bm25_output_field(
self, tokenizer, index_type, k, b
):
"""
target: test create index using bm25 metric type for non bm25 output field (dense float vector or
sparse float vector not for bm25)
method: 1. enable full text search and insert data with varchar
2. create index using bm25 metric type for non bm25 output field
expected: create index failed
"""
analyzer_params = {
"tokenizer": tokenizer,
}
empty_percent = 0.0
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=True,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
bm25_function = Function(
name="text_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=["text"],
output_field_names=["text_sparse_emb"],
params={},
)
schema.add_function(bm25_function)
data_size = 5000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
fake = fake_en
data = [
{
"id": i,
"word": fake.word().lower() if random.random() >= empty_percent else "",
"sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
"paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
"text": fake.text().lower() if random.random() >= empty_percent else "",
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
data[i: i + batch_size]
if i + batch_size < len(df)
else data[i: len(df)]
)
error = {ct.err_code: 1100, ct.err_msg: "float vector index does not support metric type: BM25"}
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "BM25", "params": {"M": 16, "efConstruction": 500}},
check_task=CheckTasks.err_res,
check_items=error
)
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("b", [-1])
@pytest.mark.parametrize("k", [-1])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_create_full_text_search_with_invalid_bm25_params(
self, tokenizer, index_type, k, b
):
"""
target: test create index for full text search with invalid bm25 params
method: 1. enable full text search and insert data with varchar
2. create index for full text search with invalid bm25 params
expected: create index failed
"""
analyzer_params = {
"tokenizer": tokenizer,
}
empty_percent = 0.0
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=True,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
bm25_function = Function(
name="text_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=["text"],
output_field_names=["text_sparse_emb"],
params={},
)
schema.add_function(bm25_function)
data_size = 5000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
fake = fake_en
data = [
{
"id": i,
"word": fake.word().lower() if random.random() >= empty_percent else "",
"sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
"paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
"text": fake.text().lower() if random.random() >= empty_percent else "",
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
data[i: i + batch_size]
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
)
check_task = CheckTasks.err_res
error = {"err_code": 1100, "err_msg": "invalid"} # todo, update error code and message
collection_w.create_index(
"text_sparse_emb",
{
"index_type": index_type,
"metric_type": "BM25",
"params": {
"bm25_k1": k,
"bm25_b": b,
}
},
check_task=check_task,
check_items=error
)
# @pytest.mark.skip("skip")
class TestSearchWithFullTextSearch(TestcaseBase):
"""
******************************************************************
The following cases are used to test search for full text search
******************************************************************
"""
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("nq", [2])
@pytest.mark.parametrize("empty_percent", [0.5])
@pytest.mark.parametrize("enable_partition_key", [True])
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX", "SPARSE_WAND"])
@pytest.mark.parametrize("expr", ["text_match", "id_range"])
@pytest.mark.parametrize("tokenizer", ["standard"])
@pytest.mark.parametrize("offset", [10, 0])
def test_full_text_search_default(
self, offset, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq
):
"""
target: test full text search
method: 1. enable full text search and insert data with varchar
2. search with text
3. verify the result
expected: full text search successfully and result is correct
"""
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=enable_partition_key,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
bm25_function = Function(
name="text_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=["text"],
output_field_names=["text_sparse_emb"],
params={},
)
schema.add_function(bm25_function)
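        # with the BM25 function registered, Milvus derives text_sparse_emb from the
        # text field on the server side, so inserted rows never supply it themselves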
data_size = 5000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
fake = fake_en
if tokenizer == "jieba":
language = "zh"
fake = fake_zh
else:
language = "en"
data = [
{
"id": i,
"word": fake.word().lower() if random.random() >= empty_percent else "",
"sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
"paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
"text": fake.text().lower() if random.random() >= empty_percent else "",
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
texts = df["text"].to_list()
word_freq = cf.analyze_documents(texts, language=language)
most_freq_word = word_freq.most_common(10)
tokens = [item[0] for item in most_freq_word]
if len(tokens) == 0:
log.info(f"empty tokens, add a dummy token")
tokens = ["dummy"]
batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(data[i: i + batch_size])
collection_w.flush()
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
)
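        # BM25 hyperparameters: bm25_k1 controls term-frequency saturation (typically
        # 1.2-2.0) and bm25_b controls document-length normalization (0 = none, 1 = full)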
collection_w.create_index(
"text_sparse_emb",
{
"index_type": index_type,
"metric_type": "BM25",
"params": {
"bm25_k1": 1.5,
"bm25_b": 0.75,
}
}
)
if enable_inverted_index:
collection_w.create_index("text", {"index_type": "INVERTED"})
collection_w.load()
limit = 100
token = random.choice(tokens)
search_data = [fake.text().lower() + f" {token} " for _ in range(nq)]
if expr == "text_match":
filter = f"TEXT_MATCH(text, '{token}')"
res, _ = collection_w.query(
expr=filter,
)
elif expr == "id_range":
filter = f"id < {data_size // 2}"
else:
filter = ""
res, _ = collection_w.query(
expr=filter,
limit=limit,
)
candidates_num = len(res)
log.info(f"search data: {search_data}")
        # fetch limit + offset results with offset=0 to serve as the ground-truth ranking
full_res_list, _ = collection_w.search(
data=search_data,
anns_field="text_sparse_emb",
expr=filter,
param={},
limit=limit + offset,
offset=0,
output_fields=["id", "text"])
        full_res_id_list = [[r.id for r in full_res_list[i]] for i in range(nq)]
res_list, _ = collection_w.search(
data=search_data,
anns_field="text_sparse_emb",
expr=filter,
param={},
limit=limit,
offset=offset,
output_fields=["id", "text"])
# verify correctness
for i in range(nq):
assert 0 < len(res_list[i]) <= min(limit, candidates_num)
search_text = search_data[i]
log.info(f"res: {res_list[i]}")
res = res_list[i]
for j in range(len(res)):
r = res[j]
_id = r.id
                # the first returned id should appear near position `offset`
                # in the offset-0 ground-truth ranking (20% tolerance)
                if j == 0:
                    first_id = _id
                    p = full_res_id_list[i].index(first_id)
                    assert 1.2 * offset >= p >= offset * 0.8
result_text = r.text
# verify search result satisfies the filter
if expr == "text_match":
assert token in result_text
if expr == "id_range":
assert _id < data_size // 2
# verify search result has overlap with search text
overlap, word_freq_a, word_freq_b = cf.check_token_overlap(search_text, result_text, language=language)
log.info(f"overlap {overlap}")
                assert len(overlap) > 0, (
                    f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n "
                    f"word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
                )
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("nq", [2])
@pytest.mark.parametrize("empty_percent", [0.5])
@pytest.mark.parametrize("enable_partition_key", [True])
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("expr", ["text_match"])
@pytest.mark.parametrize("offset", [10])
@pytest.mark.parametrize("tokenizer", ["jieba"])
@pytest.mark.parametrize("inverted_index_algo", ct.inverted_index_algo)
def test_full_text_search_with_jieba_tokenizer(
self, offset, tokenizer, expr, enable_inverted_index, enable_partition_key,
empty_percent, index_type, nq, inverted_index_algo):
"""
target: test full text search
method: 1. enable full text search with jieba tokenizer and insert data with varchar
2. search with text
3. verify the result
expected: full text search successfully and result is correct
"""
if tokenizer == "jieba":
lang_type = "chinese"
else:
lang_type = "english"
analyzer_params = {
"type": lang_type,
}
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=enable_partition_key,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
bm25_function = Function(
name="text_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=["text"],
output_field_names=["text_sparse_emb"],
params={},
)
schema.add_function(bm25_function)
data_size = 5000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
fake = fake_en
if tokenizer == "jieba":
language = "zh"
fake = fake_zh
else:
language = "en"
data = [
{
"id": i,
"word": fake.word().lower() if random.random() >= empty_percent else "",
"sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
"paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
"text": fake.text().lower() if random.random() >= empty_percent else "",
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
texts = df["text"].to_list()
word_freq = cf.analyze_documents(texts, language=language)
        tokens = []
        # jieba mostly yields two-character Chinese words; prefer those as query tokens
        for item in word_freq.most_common(20):
            if len(item[0]) == 2:
                tokens.append(item[0])
if len(tokens) == 0:
log.info(f"empty tokens, add a dummy token")
tokens = ["dummy"]
batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(data[i: i + batch_size])
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
)
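        # inverted_index_algo selects the query-evaluation strategy of the sparse index
        # (e.g. DAAT_MAXSCORE, DAAT_WAND or TAAT_NAIVE, as enumerated in ct.inverted_index_algo)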
collection_w.create_index(
"text_sparse_emb",
{
"index_type": index_type,
"metric_type": "BM25",
"params": {
"bm25_k1": 1.5,
"bm25_b": 0.75,
"inverted_index_algo": inverted_index_algo
}
}
)
if enable_inverted_index:
collection_w.create_index("text", {"index_type": "INVERTED"})
collection_w.load()
limit = 100
token = random.choice(tokens)
search_data = [fake.text().lower() + " " + token for _ in range(nq)]
if expr == "text_match":
filter = f"text_match(text, '{token}')"
res, _ = collection_w.query(
expr=filter,
)
elif expr == "id_range":
filter = f"id < {data_size // 2}"
else:
filter = ""
res, _ = collection_w.query(
expr=filter,
limit=limit,
)
candidates_num = len(res)
log.info(f"search data: {search_data}")
        # fetch limit + offset results with offset=0 to serve as the ground-truth ranking
full_res_list, _ = collection_w.search(
data=search_data,
anns_field="text_sparse_emb",
expr=filter,
param={},
limit=limit + offset,
offset=0,
output_fields=["id", "text"])
        full_res_id_list = [[r.id for r in full_res_list[i]] for i in range(nq)]
res_list, _ = collection_w.search(
data=search_data,
anns_field="text_sparse_emb",
expr=filter,
param={},
limit=limit,
offset=offset,
output_fields=["id", "text"])
# verify correctness
for i in range(nq):
assert 0 < len(res_list[i]) <= min(limit, candidates_num)
search_text = search_data[i]
log.info(f"res: {res_list[i]}")
res = res_list[i]
for j in range(len(res)):
r = res[j]
_id = r.id
                # the first returned id should appear near position `offset`
                # in the offset-0 ground-truth ranking (20% tolerance)
                if j == 0:
                    first_id = _id
                    p = full_res_id_list[i].index(first_id)
                    assert 1.2 * offset >= p >= offset * 0.8
result_text = r.text
# verify search result satisfies the filter
if expr == "text_match":
assert token in result_text
if expr == "id_range":
assert _id < data_size // 2
# verify search result has overlap with search text
overlap, word_freq_a, word_freq_b = cf.check_token_overlap(search_text, result_text, language=language)
log.info(f"overlap {overlap}")
                assert len(overlap) > 0, (
                    f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n "
                    f"word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
                )
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("nq", [2])
@pytest.mark.parametrize("empty_percent", [0.5])
@pytest.mark.parametrize("enable_partition_key", [True])
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("expr", ["id_range"])
@pytest.mark.parametrize("tokenizer", ["standard"])
@pytest.mark.parametrize("offset", [0])
def test_full_text_search_for_growing_segment(
self, offset, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq
):
"""
target: test full text search
method: 1. enable full text search and insert data with varchar
2. search with text
3. verify the result
expected: full text search successfully and result is correct
"""
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=enable_partition_key,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
bm25_function = Function(
name="text_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=["text"],
output_field_names=["text_sparse_emb"],
params={},
)
schema.add_function(bm25_function)
data_size = 5000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
fake = fake_en
if tokenizer == "jieba":
language = "zh"
fake = fake_zh
else:
language = "en"
data = [
{
"id": i,
"word": fake.word().lower() if random.random() >= empty_percent else "",
"sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
"paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
"text": fake.text().lower() if random.random() >= empty_percent else "",
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
texts = df["text"].to_list()
word_freq = cf.analyze_documents(texts, language=language)
most_freq_word = word_freq.most_common(10)
tokens = [item[0] for item in most_freq_word]
if len(tokens) == 0:
log.info(f"empty tokens, add a dummy token")
tokens = ["dummy"]
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
)
collection_w.create_index(
"text_sparse_emb",
{
"index_type": index_type,
"metric_type": "BM25",
"params": {
"bm25_k1": 1.5,
"bm25_b": 0.75,
}
}
)
if enable_inverted_index:
collection_w.create_index("text", {"index_type": "INVERTED"})
collection_w.load()
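        # insert after load so the rows stay in growing segments and the search
        # below exercises the growing-segment path instead of sealed-segment indexes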
batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(data[i: i + batch_size])
limit = 100
search_data = [fake.text().lower() + " " + random.choice(tokens) for _ in range(nq)]
if expr == "text_match":
filter = f"TextMatch(text, '{tokens[0]}')"
res, _ = collection_w.query(
expr=filter,
)
elif expr == "id_range":
filter = f"id < {data_size // 2}"
else:
filter = ""
res, _ = collection_w.query(
expr=filter,
limit=limit,
)
candidates_num = len(res)
log.info(f"search data: {search_data}")
        # fetch limit + offset results with offset=0 to serve as the ground-truth ranking
full_res_list, _ = collection_w.search(
data=search_data,
anns_field="text_sparse_emb",
expr=filter,
param={},
limit=limit + offset,
offset=0,
output_fields=["id", "text"])
        full_res_id_list = [[r.id for r in full_res_list[i]] for i in range(nq)]
res_list, _ = collection_w.search(
data=search_data,
anns_field="text_sparse_emb",
expr=filter,
param={},
limit=limit,
offset=offset,
output_fields=["id", "text"])
# verify correctness
for i in range(nq):
assert 0 < len(res_list[i]) <= min(limit, candidates_num)
search_text = search_data[i]
log.info(f"res: {res_list[i]}")
res = res_list[i]
for j in range(len(res)):
r = res[j]
_id = r.id
                # the first returned id should appear near position `offset`
                # in the offset-0 ground-truth ranking (20% tolerance)
                if j == 0:
                    first_id = _id
                    p = full_res_id_list[i].index(first_id)
                    assert 1.2 * offset >= p >= offset * 0.8
result_text = r.text
# verify search result satisfies the filter
if expr == "text_match":
assert tokens[0] in result_text
if expr == "id_range":
assert _id < data_size // 2
# verify search result has overlap with search text
overlap, word_freq_a, word_freq_b = cf.check_token_overlap(search_text, result_text, language=language)
log.info(f"overlap {overlap}")
                assert len(overlap) > 0, (
                    f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n "
                    f"word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
                )
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("nq", [2])
@pytest.mark.parametrize("empty_percent", [0])
@pytest.mark.parametrize("enable_partition_key", [True])
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("expr", [None])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_full_text_search_with_range_search(
self, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq
):
"""
target: test full text search
method: 1. enable full text search and insert data with varchar
2. range search with text
3. verify the result
expected: full text search successfully and result is correct
"""
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=enable_partition_key,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
bm25_function = Function(
name="text_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=["text"],
output_field_names=["text_sparse_emb"],
params={},
)
schema.add_function(bm25_function)
data_size = 5000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
fake = fake_en
if tokenizer == "jieba":
language = "zh"
fake = fake_zh
else:
language = "en"
data = [
{
"id": i,
"word": fake.word().lower() if random.random() >= empty_percent else "",
"sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
"paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
"text": fake.text().lower() if random.random() >= empty_percent else "",
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
texts = df["text"].to_list()
word_freq = cf.analyze_documents(texts, language=language)
tokens = list(word_freq.keys())
if len(tokens) == 0:
log.info(f"empty tokens, add a dummy token")
tokens = ["dummy"]
batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(data[i: i + batch_size])
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
)
collection_w.create_index(
"text_sparse_emb",
{
"index_type": index_type,
"metric_type": "BM25",
"params": {
"bm25_k1": 1.5,
"bm25_b": 0.75,
}
}
)
if enable_inverted_index:
collection_w.create_index("text", {"index_type": "INVERTED"})
collection_w.load()
limit = 1000
        search_data = [fake.text().lower() + " " + random.choice(tokens) for _ in range(nq)]
log.info(f"search data: {search_data}")
        # search with a large limit first to collect a distance distribution for the search data
        res_list, _ = collection_w.search(
            data=search_data,
            anns_field="text_sparse_emb",
            param={},
            limit=limit,
            output_fields=["id", "text"])
        distance_list = sorted(r.distance for hits in res_list for r in hits)
        # take the 30th and 70th percentiles of the observed distances as the range bounds
        low = distance_list[int(len(distance_list) * 0.3)]
        high = distance_list[int(len(distance_list) * 0.7)]
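        # BM25 distances are similarity scores (higher = closer), so radius acts as the
        # lower bound and range_filter as the upper bound of the returned scores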
res_list, _ = collection_w.search(
data=search_data,
anns_field="text_sparse_emb",
param={
"params": {
"radius": low, "range_filter": high
}
},
limit=limit,
output_fields=["id", "text"])
# verify correctness
for i in range(nq):
log.info(f"res: {len(res_list[i])}")
            assert len(res_list[i]) < limit  # fewer than limit because the range filter narrows the results
res = res_list[i]
for j in range(len(res)):
r = res[j]
tmp_distance = r.distance
assert low <= tmp_distance <= high
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("nq", [1])
@pytest.mark.parametrize("empty_percent", [0])
@pytest.mark.parametrize("enable_partition_key", [True])
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("expr", [None])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_full_text_search_with_search_iterator(
self, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq
):
"""
target: test full text search
method: 1. enable full text search and insert data with varchar
2. iterator search with text
3. verify the result
expected: full text search successfully and result is correct
"""
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=enable_partition_key,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
bm25_function = Function(
name="text_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=["text"],
output_field_names=["text_sparse_emb"],
params={},
)
schema.add_function(bm25_function)
data_size = 5000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
fake = fake_en
if tokenizer == "jieba":
language = "zh"
fake = fake_zh
else:
language = "en"
data = [
{
"id": i,
"word": fake.word().lower() if random.random() >= empty_percent else "",
"sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
"paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
"text": fake.text().lower() if random.random() >= empty_percent else "",
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
texts = df["text"].to_list()
word_freq = cf.analyze_documents(texts, language=language)
tokens = list(word_freq.keys())
if len(tokens) == 0:
log.info(f"empty tokens, add a dummy token")
tokens = ["dummy"]
batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(data[i: i + batch_size])
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
)
collection_w.create_index(
"text_sparse_emb",
{
"index_type": index_type,
"metric_type": "BM25",
"params": {
"bm25_k1": 1.5,
"bm25_b": 0.75,
}
}
)
if enable_inverted_index:
collection_w.create_index("text", {"index_type": "INVERTED"})
collection_w.load()
search_data = [fake.text().lower() + " " + random.choice(tokens) for _ in range(nq)]
log.info(f"search data: {search_data}")
        # page through the results with a search iterator
batch_size = 100
limit = batch_size * 10
iterator, _ = collection_w.search_iterator(
data=search_data,
anns_field="text_sparse_emb",
            batch_size=batch_size,
param={
"metric_type": "BM25",
},
output_fields=["id", "text"],
limit=limit
)
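        # drain the iterator: every page except possibly the last one should be a full batch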
iter_result = []
while True:
result = iterator.next()
if not result:
iterator.close()
break
else:
iter_result.append(len(result))
for r in iter_result[:-1]:
assert r == batch_size
class TestSearchWithFullTextSearchNegative(TestcaseBase):
"""
******************************************************************
    The following cases are used to test negative cases of search with full text search
******************************************************************
"""
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("empty_percent", [0])
@pytest.mark.parametrize("enable_partition_key", [True])
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("invalid_search_data", ["empty_text"])
@pytest.mark.parametrize("tokenizer", ["standard"])
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/37022")
def test_search_for_full_text_search_with_empty_string_search_data(
self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type, invalid_search_data
):
"""
target: test full text search
method: 1. enable full text search and insert data with varchar
2. search with empty text
3. verify the result
expected: full text search successfully but result is empty
"""
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=enable_partition_key,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
bm25_function = Function(
name="text_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=["text"],
output_field_names=["text_sparse_emb"],
params={},
)
schema.add_function(bm25_function)
data_size = 5000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
fake = fake_en
if tokenizer == "jieba":
language = "zh"
fake = fake_zh
else:
language = "en"
data = [
{
"id": i,
"word": fake.word().lower() if random.random() >= empty_percent else "",
"sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
"paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
"text": fake.text().lower() if random.random() >= empty_percent else "",
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(data[i: i + batch_size])
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
)
collection_w.create_index(
"text_sparse_emb",
{
"index_type": index_type,
"metric_type": "BM25",
"params": {
"bm25_k1": 1.5,
"bm25_b": 0.75,
}
}
)
if enable_inverted_index:
collection_w.create_index("text", {"index_type": "INVERTED"})
collection_w.load()
nq = 2
limit = 100
search_data = ["" for _ in range(nq)]
log.info(f"search data: {search_data}")
res, _ = collection_w.search(
data=search_data,
anns_field="text_sparse_emb",
param={},
limit=limit,
output_fields=["id", "text"],
)
assert len(res) == nq
for r in res:
assert len(r) == 0
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("empty_percent", [0])
@pytest.mark.parametrize("enable_partition_key", [True])
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX", "SPARSE_WAND"])
@pytest.mark.parametrize("invalid_search_data", ["sparse_vector", "dense_vector"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_search_for_full_text_search_with_invalid_search_data(
self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type, invalid_search_data
):
"""
target: test full text search
method: 1. enable full text search and insert data with varchar
2. search with sparse vector or dense vector
3. verify the result
expected: full text search failed and return error
"""
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=enable_partition_key,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
bm25_function = Function(
name="text_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=["text"],
output_field_names=["text_sparse_emb"],
params={},
)
schema.add_function(bm25_function)
data_size = 5000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
fake = fake_en
if tokenizer == "jieba":
language = "zh"
fake = fake_zh
else:
language = "en"
data = [
{
"id": i,
"word": fake.word().lower() if random.random() >= empty_percent else "",
"sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
"paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
"text": fake.text().lower() if random.random() >= empty_percent else "",
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
texts = df["text"].to_list()
word_freq = cf.analyze_documents(texts, language=language)
tokens = list(word_freq.keys())
if len(tokens) == 0:
log.info(f"empty tokens, add a dummy token")
tokens = ["dummy"]
batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(data[i: i + batch_size])
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
)
collection_w.create_index(
"text_sparse_emb",
{
"index_type": index_type,
"metric_type": "BM25",
"params": {
"bm25_k1": 1.5,
"bm25_b": 0.75,
}
}
)
if enable_inverted_index:
collection_w.create_index("text", {"index_type": "INVERTED"})
collection_w.load()
nq = 2
limit = 100
if invalid_search_data == "sparse_vector":
search_data = cf.gen_vectors(nb=nq, dim=1000, vector_data_type="SPARSE_FLOAT_VECTOR")
else:
search_data = cf.gen_vectors(nb=nq, dim=1000, vector_data_type="FLOAT_VECTOR")
log.info(f"search data: {search_data}")
error = {ct.err_code: 65535,
ct.err_msg: "please provide varchar/text for BM25 Function based search"}
collection_w.search(
data=search_data,
anns_field="text_sparse_emb",
param={},
limit=limit,
output_fields=["id", "text"],
check_task=CheckTasks.err_res,
check_items=error
)
# @pytest.mark.skip("skip")
class TestHybridSearchWithFullTextSearch(TestcaseBase):
"""
******************************************************************
The following cases are used to test hybrid search with full text search
******************************************************************
"""
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("empty_percent", [0])
@pytest.mark.parametrize("enable_partition_key", [True])
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("tokenizer", ["standard"])
@pytest.mark.parametrize("inverted_index_algo", ct.inverted_index_algo)
def test_hybrid_search_with_full_text_search(
self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type, inverted_index_algo
):
"""
target: test full text search
method: 1. enable full text search and insert data with varchar
                2. hybrid search with text, sparse vector and dense vector
3. verify the result
expected: hybrid search successfully and result is correct
"""
analyzer_params = {
"tokenizer": tokenizer,
}
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="word",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
is_partition_key=enable_partition_key,
),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="paragraph",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
analyzer_params=analyzer_params,
),
FieldSchema(
name="text",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="dense_emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
FieldSchema(name="neural_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
]
schema = CollectionSchema(fields=fields, description="test collection")
bm25_function = Function(
name="text_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=["text"],
output_field_names=["text_sparse_emb"],
params={},
)
schema.add_function(bm25_function)
data_size = 5000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
fake = fake_en
data = [
{
"id": i,
"word": fake.word().lower() if random.random() >= empty_percent else "",
"sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
"paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
"text": fake.text().lower() if random.random() >= empty_percent else "",
"dense_emb": [random.random() for _ in range(dim)],
"neural_sparse_emb": cf.gen_vectors(nb=1, dim=1000, vector_data_type="SPARSE_FLOAT_VECTOR")[0],
}
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(data[i: i + batch_size])
collection_w.create_index(
"dense_emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
)
collection_w.create_index(
"neural_sparse_emb",
{"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"},
)
collection_w.create_index(
"text_sparse_emb",
{
"index_type": index_type,
"metric_type": "BM25",
"params": {
"bm25_k1": 1.5,
"bm25_b": 0.75,
"inverted_index_algo": inverted_index_algo
}
}
)
if enable_inverted_index:
collection_w.create_index("text", {"index_type": "INVERTED"})
collection_w.load()
nq = 2
limit = 100
bm25_search = AnnSearchRequest(
data=[fake.text().lower() for _ in range(nq)],
anns_field="text_sparse_emb",
param={},
limit=limit,
)
dense_search = AnnSearchRequest(
data=[[random.random() for _ in range(dim)] for _ in range(nq)],
anns_field="dense_emb",
param={},
limit=limit,
)
sparse_search = AnnSearchRequest(
data=cf.gen_vectors(nb=nq, dim=dim, vector_data_type="SPARSE_FLOAT_VECTOR"),
anns_field="neural_sparse_emb",
param={},
limit=limit,
)
# hybrid search
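        # WeightedRanker normalizes each request's scores and combines them with the
        # corresponding weights (0.5 each here) to produce the final ranking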
res_list, _ = collection_w.hybrid_search(
reqs=[bm25_search, dense_search, sparse_search],
rerank=WeightedRanker(0.5, 0.5, 0.5),
limit=limit,
output_fields=["id", "text"]
)
assert len(res_list) == nq
# check the result correctness
for i in range(nq):
log.info(f"res length: {len(res_list[i])}")
assert len(res_list[i]) == limit