from pymilvus import (
    FieldSchema, CollectionSchema, DataType, Function, FunctionType, AnnSearchRequest, WeightedRanker
)
from common.common_type import CaseLabel, CheckTasks
from common import common_func as cf
from common import common_type as ct
from utils.util_log import test_log as log
from base.client_base import TestcaseBase

import random
import pytest
import pandas as pd
from faker import Faker
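
# Seed the fakers so the generated corpora are deterministic and the BM25
# relevance assertions in these tests are reproducible across runs.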
Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")

# patch faker to generate text with a specific word distribution
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)

pd.set_option("expand_frame_repr", False)

prefix = "full_text_search_collection"

class TestCreateCollectionWithFullTextSearch(TestcaseBase):
    """
    ******************************************************************
      The following cases are used to test create collection with full text search
    ******************************************************************
    """

    @pytest.mark.tags(CaseLabel.L0)
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_create_collection_for_full_text_search(self, tokenizer):
        """
        target: test create collection with full text search
        method: create collection with full text search, use bm25 function
        expected: create collection successfully
        """
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=True,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
            FieldSchema(name="paragraph_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
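        # Each BM25 function maps one analyzed VARCHAR input field to a
        # SPARSE_FLOAT_VECTOR output field; the sparse embedding is generated
        # server-side from the text at insert time.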
        text_fields = ["text", "paragraph"]
        for field in text_fields:
            bm25_function = Function(
                name=f"{field}_bm25_emb",
                function_type=FunctionType.BM25,
                input_field_names=[field],
                output_field_names=[f"{field}_sparse_emb"],
                params={},
            )
            schema.add_function(bm25_function)
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        res, _ = collection_w.describe()
        assert len(res["functions"]) == len(text_fields)

    @pytest.mark.tags(CaseLabel.L0)
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_create_collection_for_full_text_search_twice_with_same_schema(self, tokenizer):
        """
        target: test create collection with full text search twice with the same schema
        method: create collection with full text search using a bm25 function, then create it again
        expected: both create calls succeed
        """
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=True,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
            FieldSchema(name="paragraph_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        text_fields = ["text", "paragraph"]
        for field in text_fields:
            bm25_function = Function(
                name=f"{field}_bm25_emb",
                function_type=FunctionType.BM25,
                input_field_names=[field],
                output_field_names=[f"{field}_sparse_emb"],
                params={},
            )
            schema.add_function(bm25_function)
        c_name = cf.gen_unique_str(prefix)
        self.init_collection_wrap(
            name=c_name, schema=schema
        )
        collection_w = self.init_collection_wrap(
            name=c_name, schema=schema
        )
        res, _ = collection_w.describe()
        assert len(res["functions"]) == len(text_fields)


# @pytest.mark.skip("skip")
class TestCreateCollectionWithFullTextSearchNegative(TestcaseBase):
    """
    ******************************************************************
      The following cases are used to test create collection with full text search negative
    ******************************************************************
    """

    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("tokenizer", ["unsupported"])
    @pytest.mark.skip(reason="check not implemented; may cause panic")
    def test_create_collection_for_full_text_search_with_unsupported_tokenizer(self, tokenizer):
        """
        target: test create collection with full text search with an unsupported tokenizer
        method: create collection with full text search, use bm25 function and an unsupported tokenizer
        expected: create collection failed
        """
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=True,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
            FieldSchema(name="paragraph_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        text_fields = ["text", "paragraph"]
        for field in text_fields:
            bm25_function = Function(
                name=f"{field}_bm25_emb",
                function_type=FunctionType.BM25,
                input_field_names=[field],
                output_field_names=[f"{field}_sparse_emb"],
                params={},
            )
            schema.add_function(bm25_function)
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        res, result = collection_w.describe()
        log.info(f"collection describe {res}")
        assert not result, "create collection with unsupported tokenizer should fail"

    @pytest.mark.tags(CaseLabel.L2)
    @pytest.mark.parametrize("valid_output", [True, False])
    @pytest.mark.parametrize("valid_input", [True, False])
    def test_create_collection_for_full_text_search_with_invalid_input_output(self, valid_output, valid_input):
        """
        target: test create collection with full text search with invalid input/output in the bm25 function
        method: create collection with full text search, use bm25 function with invalid input/output fields
        expected: create collection failed
        """
        analyzer_params = {
            "tokenizer": "standard",
        }
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=True,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
            FieldSchema(name="paragraph_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        if valid_input:
            input_field_names = ["text"]
        else:
            input_field_names = ["invalid_input"]
        if valid_output:
            output_field_names = ["text_sparse_emb"]
        else:
            output_field_names = ["invalid_output"]

        bm25_function = Function(
            name="text_bm25_emb",
            function_type=FunctionType.BM25,
            input_field_names=input_field_names,
            output_field_names=output_field_names,
            params={},
        )
        schema.add_function(bm25_function)
        if (not valid_output) or (not valid_input):
            self.init_collection_wrap(
                name=cf.gen_unique_str(prefix), schema=schema,
                check_task=CheckTasks.err_res,
                check_items={ct.err_code: 1, ct.err_msg: "field not found in collection"}
            )
        else:
            collection_w = self.init_collection_wrap(
                name=cf.gen_unique_str(prefix), schema=schema
            )
            res, result = collection_w.describe()
            log.info(f"collection describe {res}")
            assert result, "create collection with valid input/output should be successful"

    @pytest.mark.tags(CaseLabel.L1)
    def test_create_collection_for_full_text_search_with_field_not_tokenized(self):
        """
        target: test create collection with full text search with the input field not tokenized
        method: create collection with full text search, use bm25 function with an input field that has no analyzer
        expected: create collection failed
        """
        analyzer_params = {
            "tokenizer": "standard",
        }
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=True,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=False,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
            FieldSchema(name="paragraph_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")

        bm25_function = Function(
            name="text_bm25_emb",
            function_type=FunctionType.BM25,
            input_field_names=["text"],
            output_field_names=["text_sparse_emb"],
            params={},
        )
        schema.add_function(bm25_function)
        check_task = CheckTasks.err_res
        check_items = {ct.err_code: 65535, ct.err_msg: "BM25 function input field must set enable_analyzer to true"}
        self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema,
            check_task=check_task,
            check_items=check_items
        )


# @pytest.mark.skip("skip")
class TestInsertWithFullTextSearch(TestcaseBase):
    """
    ******************************************************************
      The following cases are used to test insert with full text search
    ******************************************************************
    """

    @pytest.mark.tags(CaseLabel.L0)
    @pytest.mark.parametrize("nullable", [False, True])
    @pytest.mark.parametrize("text_lang", ["en", "zh", "hybrid"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_insert_for_full_text_search_default(self, tokenizer, text_lang, nullable):
        """
        target: test insert data with full text search
        method: 1. insert data with varchar in different languages
                2. query count and verify the result
        expected: insert successfully and count is correct
        """
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=True,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                nullable=nullable,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                nullable=nullable,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        bm25_function = Function(
            name="text_bm25_emb",
            function_type=FunctionType.BM25,
            input_field_names=["text"],
            output_field_names=["text_sparse_emb"],
            params={},
        )
        schema.add_function(bm25_function)
        data_size = 5000
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        fake = fake_en
        if text_lang == "zh":
            fake = fake_zh
        elif text_lang == "hybrid":
            fake = Faker()

        if nullable:
            data = [
                {
                    "id": i,
                    "word": fake.word().lower(),
                    "sentence": fake.sentence().lower() if random.random() < 0.5 else None,
                    "paragraph": fake.paragraph().lower() if random.random() < 0.5 else None,
                    "text": fake.text().lower(),  # function input should not be None
                    "emb": [random.random() for _ in range(dim)],
                }
                for i in range(data_size)
            ]
        else:
            data = [
                {
                    "id": i,
                    "word": fake.word().lower(),
                    "sentence": fake.sentence().lower(),
                    "paragraph": fake.paragraph().lower(),
                    "text": fake.text().lower(),
                    "emb": [random.random() for _ in range(dim)],
                }
                for i in range(data_size)
            ]
        if text_lang == "hybrid":
            hybrid_data = []
            for i in range(data_size):
                fake = random.choice([fake_en, fake_zh, Faker("de_DE")])
                tmp = {
                    "id": i,
                    "word": fake.word().lower(),
                    "sentence": fake.sentence().lower(),
                    "paragraph": fake.paragraph().lower(),
                    "text": fake.text().lower(),
                    "emb": [random.random() for _ in range(dim)],
                }
                hybrid_data.append(tmp)
            data = hybrid_data + data
        df = pd.DataFrame(data)
        log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(
                data[i: i + batch_size]
                if i + batch_size < len(df)
                else data[i: len(df)]
            )
        collection_w.create_index(
            "emb",
            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
        )
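        # BM25-specific index params: bm25_k1 controls term-frequency
        # saturation, bm25_b controls document-length normalization, and
        # drop_ratio_build drops the smallest sparse values at build time.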
        collection_w.create_index(
            "text_sparse_emb",
            {
                "index_type": "SPARSE_INVERTED_INDEX",
                "metric_type": "BM25",
                "params": {
                    "drop_ratio_build": 0.3,
                    "bm25_k1": 1.5,
                    "bm25_b": 0.75,
                }
            }
        )
        collection_w.create_index("text", {"index_type": "INVERTED"})
        collection_w.load()
        num_entities = collection_w.num_entities
        res, _ = collection_w.query(
            expr="",
            output_fields=["count(*)"]
        )
        count = res[0]["count(*)"]
        assert len(data) == num_entities
        assert len(data) == count

    @pytest.mark.tags(CaseLabel.L0)
    @pytest.mark.parametrize("enable_dynamic_field", [True])
    @pytest.mark.parametrize("nullable", [False])
    @pytest.mark.parametrize("text_lang", ["en"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_insert_for_full_text_search_enable_dynamic_field(self, tokenizer, text_lang, nullable, enable_dynamic_field):
        """
        target: test insert data with full text search and enable dynamic field
        method: 1. create collection with full text search and enable dynamic field
                2. insert data with varchar
                3. query count and verify the result
        expected: insert successfully and count is correct
        """
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=True,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                nullable=nullable,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                nullable=nullable,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection", enable_dynamic_field=enable_dynamic_field)
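        # With enable_dynamic_field=True, keys that are not declared in the
        # schema (the dynamic_field_{i} entries below) are accepted on insert
        # and stored in the hidden $meta JSON field.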
        bm25_function = Function(
            name="text_bm25_emb",
            function_type=FunctionType.BM25,
            input_field_names=["text"],
            output_field_names=["text_sparse_emb"],
            params={},
        )
        schema.add_function(bm25_function)
        data_size = 5000
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        fake = fake_en
        if text_lang == "zh":
            fake = fake_zh
        elif text_lang == "de":
            fake = Faker("de_DE")
        elif text_lang == "hybrid":
            fake = Faker()

        if nullable:
            data = [
                {
                    "id": i,
                    "word": fake.word().lower(),
                    "sentence": fake.sentence().lower() if random.random() < 0.5 else None,
                    "paragraph": fake.paragraph().lower() if random.random() < 0.5 else None,
                    "text": fake.text().lower(),  # function input should not be None
                    "emb": [random.random() for _ in range(dim)],
                    f"dynamic_field_{i}": f"dynamic_value_{i}"
                }
                for i in range(data_size)
            ]
        else:
            data = [
                {
                    "id": i,
                    "word": fake.word().lower(),
                    "sentence": fake.sentence().lower(),
                    "paragraph": fake.paragraph().lower(),
                    "text": fake.text().lower(),
                    "emb": [random.random() for _ in range(dim)],
                    f"dynamic_field_{i}": f"dynamic_value_{i}"
                }
                for i in range(data_size)
            ]
        if text_lang == "hybrid":
            hybrid_data = []
            for i in range(data_size):
                fake = random.choice([fake_en, fake_zh, Faker("de_DE")])
                tmp = {
                    "id": i,
                    "word": fake.word().lower(),
                    "sentence": fake.sentence().lower(),
                    "paragraph": fake.paragraph().lower(),
                    "text": fake.text().lower(),
                    "emb": [random.random() for _ in range(dim)],
                    f"dynamic_field_{i}": f"dynamic_value_{i}"
                }
                hybrid_data.append(tmp)
            data = hybrid_data + data
        # df = pd.DataFrame(data)
        # log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(data), batch_size):
            collection_w.insert(
                data[i: i + batch_size]
                if i + batch_size < len(data)
                else data[i: len(data)]
            )
        collection_w.create_index(
            "emb",
            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
        )
        collection_w.create_index(
            "text_sparse_emb",
            {
                "index_type": "SPARSE_INVERTED_INDEX",
                "metric_type": "BM25",
                "params": {
                    "drop_ratio_build": 0.3,
                    "bm25_k1": 1.5,
                    "bm25_b": 0.75,
                }
            }
        )
        collection_w.create_index("text", {"index_type": "INVERTED"})
        collection_w.load()
        num_entities = collection_w.num_entities
        res, _ = collection_w.query(
            expr="",
            output_fields=["count(*)"]
        )
        count = res[0]["count(*)"]
        assert len(data) == num_entities
        assert len(data) == count

    @pytest.mark.tags(CaseLabel.L0)
    @pytest.mark.parametrize("nullable", [True])
    @pytest.mark.parametrize("text_lang", ["en"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_insert_for_full_text_search_with_dataframe(self, tokenizer, text_lang, nullable):
        """
        target: test insert data for full text search with dataframe
        method: 1. insert data with varchar in dataframe format
                2. query count and verify the result
        expected: insert successfully and count is correct
        """
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=True,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                nullable=nullable,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                nullable=nullable,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        bm25_function = Function(
            name="text_bm25_emb",
            function_type=FunctionType.BM25,
            input_field_names=["text"],
            output_field_names=["text_sparse_emb"],
            params={},
        )
        schema.add_function(bm25_function)
        data_size = 5000
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        fake = fake_en
        if text_lang == "zh":
            fake = fake_zh
        elif text_lang == "de":
            fake = Faker("de_DE")
        elif text_lang == "hybrid":
            fake = Faker()

        if nullable:
            data = [
                {
                    "id": i,
                    "word": fake.word().lower(),
                    "sentence": fake.sentence().lower() if random.random() < 0.5 else None,
                    "paragraph": fake.paragraph().lower() if random.random() < 0.5 else None,
                    "text": fake.text().lower(),  # function input should not be None
                    "emb": [random.random() for _ in range(dim)],
                }
                for i in range(data_size)
            ]
        else:
            data = [
                {
                    "id": i,
                    "word": fake.word().lower(),
                    "sentence": fake.sentence().lower(),
                    "paragraph": fake.paragraph().lower(),
                    "text": fake.text().lower(),
                    "emb": [random.random() for _ in range(dim)],
                }
                for i in range(data_size)
            ]
        if text_lang == "hybrid":
            hybrid_data = []
            for i in range(data_size):
                fake = random.choice([fake_en, fake_zh, Faker("de_DE")])
                tmp = {
                    "id": i,
                    "word": fake.word().lower(),
                    "sentence": fake.sentence().lower(),
                    "paragraph": fake.paragraph().lower(),
                    "text": fake.text().lower(),
                    "emb": [random.random() for _ in range(dim)],
                }
                hybrid_data.append(tmp)
            data = hybrid_data + data
        df = pd.DataFrame(data)
        log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(df[i: i + batch_size])
        collection_w.create_index(
            "emb",
            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
        )
        collection_w.create_index(
            "text_sparse_emb",
            {
                "index_type": "SPARSE_INVERTED_INDEX",
                "metric_type": "BM25",
                "params": {
                    "drop_ratio_build": 0.3,
                    "bm25_k1": 1.5,
                    "bm25_b": 0.75,
                }
            }
        )
        collection_w.create_index("text", {"index_type": "INVERTED"})
        collection_w.load()
        num_entities = collection_w.num_entities
        res, _ = collection_w.query(
            expr="",
            output_fields=["count(*)"]
        )
        count = res[0]["count(*)"]
        assert len(data) == num_entities
        assert len(data) == count

    @pytest.mark.tags(CaseLabel.L2)
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_insert_for_full_text_search_with_part_of_empty_string(self, tokenizer):
        """
        target: test insert data with full text search where part of the strings are empty
        method: 1. insert data with some empty strings
                2. query count and verify the result
                3. search with text
        expected: insert successfully, count is correct, and search result is correct
        """
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=True,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        bm25_function = Function(
            name="text_bm25_emb",
            function_type=FunctionType.BM25,
            input_field_names=["text"],
            output_field_names=["text_sparse_emb"],
            params={},
        )
        schema.add_function(bm25_function)
        data_size = 5000
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        log.info(f"collection describe {collection_w.describe()}")
        fake = fake_en
        language = "en"
        if tokenizer == "jieba":
            fake = fake_zh
            language = "zh"
        collection_w.create_index(
            "emb",
            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
        )
        collection_w.create_index(
            "text_sparse_emb",
            {
                "index_type": "SPARSE_INVERTED_INDEX",
                "metric_type": "BM25",
                "params": {
                    "drop_ratio_build": 0.3,
                    "bm25_k1": 1.5,
                    "bm25_b": 0.75,
                }
            }
        )
        collection_w.create_index("text", {"index_type": "INVERTED"})
        collection_w.load()
        data = [
            {
                "id": i,
                "word": fake.word().lower() if random.random() < 0.5 else "",
                "sentence": fake.sentence().lower() if random.random() < 0.5 else "",
                "paragraph": fake.paragraph().lower() if random.random() < 0.5 else "",
                "text": fake.text().lower() if random.random() < 0.5 else "",
                "emb": [random.random() for _ in range(dim)],
            }
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
        log.info(f"dataframe\n{df}")
        log.info("analyze documents")
        texts = df["text"].to_list()
        word_freq = cf.analyze_documents(texts, language=language)
        tokens = list(word_freq.keys())
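        # analyze_documents tokenizes the corpus client-side, so the search
        # texts below can be salted with tokens known to occur in the data.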
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(
                data[i: i + batch_size]
                if i + batch_size < len(df)
                else data[i: len(df)]
            )
        num_entities = collection_w.num_entities
        # query with count(*)
        res, _ = collection_w.query(
            expr="",
            output_fields=["count(*)"]
        )
        count = res[0]["count(*)"]
        assert len(data) == num_entities
        assert len(data) == count
        # query with expr
        res, _ = collection_w.query(
            expr="id >= 0",
            output_fields=["text"]
        )
        assert len(res) == len(data)

        # search with text
        nq = 2
        limit = 100
        search_data = [fake.text().lower() + random.choice(tokens) for _ in range(nq)]
        res_list, _ = collection_w.search(
            data=search_data,
            anns_field="text_sparse_emb",
            param={},
            limit=limit,
            output_fields=["id", "text"])
        assert len(res_list) == nq
        for i in range(nq):
            assert len(res_list[i]) == limit
            search_text = search_data[i]
            log.info(f"res: {res_list[i]}")
            res = res_list[i]
            for j in range(len(res)):
                r = res[j]
                result_text = r.text
                overlap, word_freq_a, word_freq_b = cf.check_token_overlap(search_text, result_text, language=language)
                assert len(overlap) > 0, (
                    f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} "
                    f"\n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
                )


# @pytest.mark.skip("skip")
class TestInsertWithFullTextSearchNegative(TestcaseBase):
    """
    ******************************************************************
      The following cases are used to test insert with full text search negative
    ******************************************************************
    """

    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("nullable", [True])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_insert_with_full_text_search_with_non_varchar_data(self, tokenizer, nullable):
        """
        target: test insert data with full text search with non-varchar data
        method: insert data with non-varchar values in the text field
        expected: insert failed
        """
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=True,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                nullable=nullable,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                nullable=nullable,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        bm25_function = Function(
            name="text_bm25_emb",
            function_type=FunctionType.BM25,
            input_field_names=["text"],
            output_field_names=["text_sparse_emb"],
            params={},
        )
        schema.add_function(bm25_function)
        data_size = 5000
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        fake = fake_en
        if tokenizer == "jieba":
            fake = fake_zh

        data = [
            {
                "id": i,
                "word": fake.word().lower(),
                "sentence": fake.sentence().lower(),
                "paragraph": fake.paragraph().lower(),
                "text": fake.text().lower() if random.random() < 0.5 else 1,  # mix some int data
                "emb": [random.random() for _ in range(dim)],
            }
            for i in range(data_size)
        ]
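        # Mixing int values into the VARCHAR "text" field must be rejected by
        # schema validation at insert time.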
        df = pd.DataFrame(data)
        log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(
                data[i: i + batch_size]
                if i + batch_size < len(df)
                else data[i: len(df)],
                check_task=CheckTasks.err_res,
                check_items={ct.err_code: 1, ct.err_msg: "inconsistent with defined schema"},
            )


# @pytest.mark.skip("skip")
class TestUpsertWithFullTextSearch(TestcaseBase):
    """
    ******************************************************************
      The following cases are used to test upsert with full text search
    ******************************************************************
    """

    @pytest.mark.tags(CaseLabel.L0)
    @pytest.mark.parametrize("nullable", [False, True])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_upsert_for_full_text_search(self, tokenizer, nullable):
        """
        target: test upsert data for full text search
        method: 1. insert data with varchar
                2. upsert half of the data
                3. check the data
        expected: upsert successfully and the data is updated
        """
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=True,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                nullable=nullable,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                nullable=nullable,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        bm25_function = Function(
            name="text_bm25_emb",
            function_type=FunctionType.BM25,
            input_field_names=["text"],
            output_field_names=["text_sparse_emb"],
            params={},
        )
        schema.add_function(bm25_function)
        data_size = 5000
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        fake = fake_en
        language = "en"
        if tokenizer == "jieba":
            fake = fake_zh
            language = "zh"

        if nullable:
            data = [
                {
                    "id": i,
                    "word": fake.word().lower(),
                    "sentence": fake.sentence().lower() if random.random() < 0.5 else None,
                    "paragraph": fake.paragraph().lower() if random.random() < 0.5 else None,
                    "text": fake.text().lower(),  # function input should not be None
                    "emb": [random.random() for _ in range(dim)],
                }
                for i in range(data_size)
            ]
        else:
            data = [
                {
                    "id": i,
                    "word": fake.word().lower(),
                    "sentence": fake.sentence().lower(),
                    "paragraph": fake.paragraph().lower(),
                    "text": fake.text().lower(),
                    "emb": [random.random() for _ in range(dim)],
                }
                for i in range(data_size)
            ]
        df = pd.DataFrame(data)
        log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(
                data[i: i + batch_size]
                if i + batch_size < len(df)
                else data[i: len(df)]
            )
        collection_w.create_index(
            "emb",
            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
        )
        collection_w.create_index(
            "text_sparse_emb",
            {
                "index_type": "SPARSE_INVERTED_INDEX",
                "metric_type": "BM25",
                "params": {
                    "drop_ratio_build": 0.3,
                    "bm25_k1": 1.5,
                    "bm25_b": 0.75,
                }
            }
        )
        collection_w.create_index("text", {"index_type": "INVERTED"})
        collection_w.load()
        num_entities = collection_w.num_entities
        res, _ = collection_w.query(
            expr="",
            output_fields=["count(*)"]
        )
        count = res[0]["count(*)"]
        assert len(data) == num_entities
        assert len(data) == count

        # upsert the first half of the data with regenerated values
        upsert_data = [
            {
                "id": i,
                "word": fake.word().lower(),
                "sentence": fake.sentence().lower(),
                "paragraph": fake.paragraph().lower(),
                "text": fake.text().lower(),
                "emb": [random.random() for _ in range(dim)],
            }
            for i in range(data_size // 2)
        ]
        upsert_data += data[data_size // 2:]
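        # The combined list now holds regenerated rows for ids < data_size // 2
        # and the original rows for the rest; the query below verifies both halves.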
        for i in range(0, len(upsert_data), batch_size):
            collection_w.upsert(
                upsert_data[i: i + batch_size]
                if i + batch_size < len(upsert_data)
                else upsert_data[i: len(upsert_data)]
            )
        res, _ = collection_w.query(
            expr="id >= 0",
            output_fields=["*"]
        )
        upsert_data_map = {}
        for d in upsert_data:
            upsert_data_map[d["id"]] = d
        for r in res:
            _id = r["id"]
            word = r["word"]
            assert word == upsert_data_map[_id]["word"]


# @pytest.mark.skip("skip")
class TestUpsertWithFullTextSearchNegative(TestcaseBase):
    """
    ******************************************************************
      The following cases are used to test upsert data in full text search with negative conditions
    ******************************************************************
    """

    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("nullable", [False])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_upsert_for_full_text_search_with_no_varchar_data(self, tokenizer, nullable):
        """
        target: test upsert data for full text search with non-varchar data
        method: 1. insert varchar data
                2. upsert the data with some int values mixed into the text field
        expected: upsert failed
        """
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=True,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                nullable=nullable,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                nullable=nullable,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        bm25_function = Function(
            name="text_bm25_emb",
            function_type=FunctionType.BM25,
            input_field_names=["text"],
            output_field_names=["text_sparse_emb"],
            params={},
        )
        schema.add_function(bm25_function)
        data_size = 5000
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        fake = fake_en
        language = "en"
        if tokenizer == "jieba":
            fake = fake_zh
            language = "zh"

        data = [
            {
                "id": i,
                "word": fake.word().lower(),
                "sentence": fake.sentence().lower(),
                "paragraph": fake.paragraph().lower(),
                "text": fake.text().lower(),
                "emb": [random.random() for _ in range(dim)],
            }
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
        log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(
                data[i: i + batch_size]
                if i + batch_size < len(df)
                else data[i: len(df)]
            )
        collection_w.create_index(
            "emb",
            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
        )
        collection_w.create_index(
            "text_sparse_emb",
            {
                "index_type": "SPARSE_INVERTED_INDEX",
                "metric_type": "BM25",
                "params": {
                    "drop_ratio_build": 0.3,
                    "bm25_k1": 1.5,
                    "bm25_b": 0.75,
                }
            }
        )
        collection_w.create_index("text", {"index_type": "INVERTED"})
        collection_w.load()
        num_entities = collection_w.num_entities
        res, _ = collection_w.query(
            expr="",
            output_fields=["count(*)"]
        )
        count = res[0]["count(*)"]
        assert len(data) == num_entities
        assert len(data) == count

        # upsert with some invalid (int) values mixed into the text field
        upsert_data = [
            {
                "id": i,
                "word": fake.word().lower(),
                "sentence": fake.sentence().lower(),
                "paragraph": fake.paragraph().lower(),
                "text": fake.text().lower() if random.random() < 0.5 else 1,  # mix some int data
                "emb": [random.random() for _ in range(dim)],
            }
            for i in range(data_size)
        ]
        check_items = {ct.err_code: 1, ct.err_msg: "inconsistent with defined schema"}
        check_task = CheckTasks.err_res
        collection_w.upsert(upsert_data,
                            check_task=check_task,
                            check_items=check_items)


class TestDeleteWithFullTextSearch(TestcaseBase):
    """
    ******************************************************************
      The following cases are used to test delete data in full text search
    ******************************************************************
    """

    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_delete_for_full_text_search(self, tokenizer):
        """
        target: test delete data for full text search
        method: 1. insert data with varchar
                2. delete half of the data
                3. check the data
        expected: delete successfully and the data is deleted
        """
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=True,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        bm25_function = Function(
            name="text_bm25_emb",
            function_type=FunctionType.BM25,
            input_field_names=["text"],
            output_field_names=["text_sparse_emb"],
            params={},
        )
        schema.add_function(bm25_function)
        data_size = 5000
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        fake = fake_en
        if tokenizer == "jieba":
            fake = fake_zh
        data = [
            {
                "id": i,
                "word": fake.word().lower(),
                "sentence": fake.sentence().lower(),
                "paragraph": fake.paragraph().lower(),
                "text": fake.text().lower(),
                "emb": [random.random() for _ in range(dim)],
            }
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
        log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(
                data[i: i + batch_size]
                if i + batch_size < len(df)
                else data[i: len(df)]
            )
        collection_w.create_index(
            "emb",
            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
        )
        collection_w.create_index(
            "text_sparse_emb",
            {
                "index_type": "SPARSE_INVERTED_INDEX",
                "metric_type": "BM25",
                "params": {
                    "drop_ratio_build": 0.3,
                    "bm25_k1": 1.5,
                    "bm25_b": 0.75,
                }
            }
        )
        collection_w.create_index("text", {"index_type": "INVERTED"})
        collection_w.load()
        num_entities = collection_w.num_entities
        res, _ = collection_w.query(
            expr="",
            output_fields=["count(*)"]
        )
        count = res[0]["count(*)"]
        assert len(data) == num_entities
        assert len(data) == count

        # delete half of the data
        delete_ids = [i for i in range(data_size // 2)]
        collection_w.delete(
            expr=f"id in {delete_ids}"
        )
        res, _ = collection_w.query(
            expr="",
            output_fields=["count(*)"]
        )
        count = res[0]["count(*)"]
        assert count == data_size // 2

        # query with the delete expr and expect an empty result
        res, _ = collection_w.query(
            expr=f"id in {delete_ids}",
            output_fields=["*"]
        )
        assert len(res) == 0

        # search with texts that have been deleted; they must not appear in the results
        search_data = df["text"].to_list()[:data_size // 2]
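        # The deleted texts are used verbatim as queries; BM25 may still return
        # similar surviving documents, but never the deleted rows themselves.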
        res_list, _ = collection_w.search(
            data=search_data,
            anns_field="text_sparse_emb",
            param={},
            limit=100,
            output_fields=["id", "text"])
        for i in range(len(res_list)):
            query_text = search_data[i]
            result_texts = [r.text for r in res_list[i]]
            assert query_text not in result_texts


class TestDeleteWithFullTextSearchNegative(TestcaseBase):
    """
    todo: add some negative cases
    """
    pass


# @pytest.mark.skip("skip")
class TestCreateIndexWithFullTextSearch(TestcaseBase):
    """
    ******************************************************************
      The following cases are used to test full text search in index creation
    ******************************************************************
    """

    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("b", [0.1])
    @pytest.mark.parametrize("k", [1.2])
    @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX", "SPARSE_WAND"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_create_index_for_full_text_search_default(
        self, tokenizer, index_type, k, b
    ):
        """
        target: test create index for full text search
        method: 1. enable full text search and insert data with varchar
                2. create index for full text search with different index types
                3. verify the index info by describe index
        expected: create index successfully and index info is correct
        """
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        empty_percent = 0.0
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=True,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        bm25_function = Function(
            name="text_bm25_emb",
            function_type=FunctionType.BM25,
            input_field_names=["text"],
            output_field_names=["text_sparse_emb"],
            params={},
        )
        schema.add_function(bm25_function)
        data_size = 5000
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        fake = fake_en
        data = [
            {
                "id": i,
                "word": fake.word().lower() if random.random() >= empty_percent else "",
                "sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
                "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
                "text": fake.text().lower() if random.random() >= empty_percent else "",
                "emb": [random.random() for _ in range(dim)],
            }
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
        log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(
                data[i: i + batch_size]
                if i + batch_size < len(df)
                else data[i: len(df)]
            )
        collection_w.create_index(
            "emb",
            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
        )
        collection_w.create_index(
            "text_sparse_emb",
            {
                "index_type": index_type,
                "metric_type": "BM25",
                "params": {
                    "bm25_k1": k,
                    "bm25_b": b,
                }
            }
        )
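        # The parametrized k1/b values are round-tripped through the index
        # description below to confirm the BM25 build params were persisted.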
        # describe index info to verify
        res = collection_w.indexes
        index_info = [r.to_dict() for r in res]
        log.info(f"index info: {index_info}")
        for info in index_info:
            if info["index_name"] == "text_sparse_emb":
                assert info["index_param"]["index_type"] == index_type
                assert info["index_param"]["metric_type"] == "BM25"
                assert info["index_param"]["params"]["bm25_k1"] == k
                assert info["index_param"]["params"]["bm25_b"] == b
                break


class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
    """
    ******************************************************************
      The following cases are used to test full text search in index creation negative
    ******************************************************************
    """

    @pytest.mark.tags(CaseLabel.L2)
    @pytest.mark.parametrize("b", [0.5])
    @pytest.mark.parametrize("k", [1.5])
    @pytest.mark.parametrize("index_type", ["HNSW", "INVALID_INDEX_TYPE"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_create_full_text_search_with_invalid_index_type(
        self, tokenizer, index_type, k, b
    ):
        """
        target: test create index for full text search with an invalid index type
        method: 1. enable full text search and insert data with varchar
                2. create index for full text search with an invalid index type
        expected: create index failed
        """
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        empty_percent = 0.0
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=True,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        bm25_function = Function(
            name="text_bm25_emb",
            function_type=FunctionType.BM25,
            input_field_names=["text"],
            output_field_names=["text_sparse_emb"],
            params={},
        )
        schema.add_function(bm25_function)
        data_size = 5000
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        fake = fake_en
        data = [
            {
                "id": i,
                "word": fake.word().lower() if random.random() >= empty_percent else "",
                "sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
                "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
                "text": fake.text().lower() if random.random() >= empty_percent else "",
                "emb": [random.random() for _ in range(dim)],
            }
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
        log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(
                data[i: i + batch_size]
                if i + batch_size < len(df)
                else data[i: len(df)]
            )
        collection_w.create_index(
            "emb",
            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
        )
        error = {"err_code": 1100, "err_msg": "invalid"}
        collection_w.create_index(
            "text_sparse_emb",
            {
                "index_type": index_type,
                "metric_type": "BM25",
                "params": {
                    "bm25_k1": k,
                    "bm25_b": b,
                }
            },
            check_task=CheckTasks.err_res,
            check_items=error
        )

    @pytest.mark.tags(CaseLabel.L2)
    @pytest.mark.parametrize("b", [0.5])
    @pytest.mark.parametrize("k", [1.5])
    @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
    @pytest.mark.parametrize("metric_type", ["COSINE", "L2", "IP"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_create_full_text_search_index_with_invalid_metric_type(
        self, tokenizer, index_type, metric_type, k, b
    ):
        """
        target: test create index for full text search with an invalid metric type
        method: 1. enable full text search and insert data with varchar
                2. create index for full text search with an invalid metric type
        expected: create index failed
        """
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        empty_percent = 0.0
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=True,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        bm25_function = Function(
            name="text_bm25_emb",
            function_type=FunctionType.BM25,
            input_field_names=["text"],
            output_field_names=["text_sparse_emb"],
            params={},
        )
        schema.add_function(bm25_function)
        data_size = 5000
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        fake = fake_en
        data = [
            {
                "id": i,
                "word": fake.word().lower() if random.random() >= empty_percent else "",
                "sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
                "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
                "text": fake.text().lower() if random.random() >= empty_percent else "",
                "emb": [random.random() for _ in range(dim)],
            }
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
        log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(
                data[i: i + batch_size]
                if i + batch_size < len(df)
                else data[i: len(df)]
            )
        collection_w.create_index(
            "emb",
            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
        )
        error = {ct.err_code: 65535, ct.err_msg: "index metric type of BM25 function output field must be BM25"}
        collection_w.create_index(
            "text_sparse_emb",
            {
                "index_type": index_type,
                "metric_type": metric_type,
                "params": {
                    "bm25_k1": k,
                    "bm25_b": b,
                }
            },
            check_task=CheckTasks.err_res,
            check_items=error
        )

    @pytest.mark.tags(CaseLabel.L2)
    @pytest.mark.parametrize("b", [0.5])
    @pytest.mark.parametrize("k", [1.5])
    @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_create_index_using_bm25_metric_type_for_non_bm25_output_field(
        self, tokenizer, index_type, k, b
    ):
        """
        target: test create index using BM25 metric type for a non-BM25 output field
                (a dense or sparse float vector field that is not a BM25 function output)
        method: 1. enable full text search and insert data with varchar
                2. create index using bm25 metric type for non bm25 output field
        expected: create index failed
        """
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        empty_percent = 0.0
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=True,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        bm25_function = Function(
            name="text_bm25_emb",
            function_type=FunctionType.BM25,
            input_field_names=["text"],
            output_field_names=["text_sparse_emb"],
            params={},
        )
        schema.add_function(bm25_function)
        data_size = 5000
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        fake = fake_en
        data = [
            {
                "id": i,
                "word": fake.word().lower() if random.random() >= empty_percent else "",
                "sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
                "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
                "text": fake.text().lower() if random.random() >= empty_percent else "",
                "emb": [random.random() for _ in range(dim)],
            }
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
        log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(
                data[i: i + batch_size]
                if i + batch_size < len(df)
                else data[i: len(df)]
            )
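        # BM25 is only meaningful for sparse vectors produced by a BM25 function;
        # using it on a dense float vector index should fail with the error below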
        error = {ct.err_code: 1100, ct.err_msg: "float vector index does not support metric type: BM25"}
        collection_w.create_index(
            "emb",
            {"index_type": "HNSW", "metric_type": "BM25", "params": {"M": 16, "efConstruction": 500}},
            check_task=CheckTasks.err_res,
            check_items=error
        )

    @pytest.mark.tags(CaseLabel.L0)
    @pytest.mark.parametrize("b", [-1])
    @pytest.mark.parametrize("k", [-1])
    @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_create_full_text_search_with_invalid_bm25_params(
        self, tokenizer, index_type, k, b
    ):
        """
        target: test create index for full text search with invalid bm25 params
        method: 1. enable full text search and insert data with varchar
                2. create index for full text search with invalid bm25 params
        expected: create index failed
        """
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        empty_percent = 0.0
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=True,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        bm25_function = Function(
            name="text_bm25_emb",
            function_type=FunctionType.BM25,
            input_field_names=["text"],
            output_field_names=["text_sparse_emb"],
            params={},
        )
        schema.add_function(bm25_function)
        data_size = 5000
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        fake = fake_en
        data = [
            {
                "id": i,
                "word": fake.word().lower() if random.random() >= empty_percent else "",
                "sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
                "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
                "text": fake.text().lower() if random.random() >= empty_percent else "",
                "emb": [random.random() for _ in range(dim)],
            }
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
        log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(
                data[i: i + batch_size]
                if i + batch_size < len(df)
                else data[i: len(df)]
            )
        collection_w.create_index(
            "emb",
            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
        )
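        # negative k/b values are outside the accepted BM25 parameter space
        # (bm25_k1 is assumed to be validated within [0, 3] and bm25_b within
        # [0, 1], per common BM25 conventions), so index creation should fail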
        check_task = CheckTasks.err_res
        error = {ct.err_code: 1100, ct.err_msg: "invalid"}  # todo, update error code and message
        collection_w.create_index(
            "text_sparse_emb",
            {
                "index_type": index_type,
                "metric_type": "BM25",
                "params": {
                    "bm25_k1": k,
                    "bm25_b": b,
                }
            },
            check_task=check_task,
            check_items=error
        )


class TestSearchWithFullTextSearch(TestcaseBase):
    """
    ******************************************************************
    The following cases are used to test search with full text search
    ******************************************************************
    """

    @pytest.mark.tags(CaseLabel.L0)
    @pytest.mark.parametrize("nq", [2])
    @pytest.mark.parametrize("empty_percent", [0.5])
    @pytest.mark.parametrize("enable_partition_key", [True])
    @pytest.mark.parametrize("enable_inverted_index", [True])
    @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX", "SPARSE_WAND"])
    @pytest.mark.parametrize("expr", ["text_match", "id_range"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    @pytest.mark.parametrize("offset", [10, 0])
    def test_full_text_search_default(
        self, offset, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq
    ):
        """
        target: test full text search
        method: 1. enable full text search and insert data with varchar
                2. search with text
                3. verify the result
        expected: full text search successfully and result is correct
        """
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=enable_partition_key,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        bm25_function = Function(
            name="text_bm25_emb",
            function_type=FunctionType.BM25,
            input_field_names=["text"],
            output_field_names=["text_sparse_emb"],
            params={},
        )
        schema.add_function(bm25_function)
        data_size = 5000
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        fake = fake_en
        if tokenizer == "jieba":
            language = "zh"
            fake = fake_zh
        else:
            language = "en"

        data = [
            {
                "id": i,
                "word": fake.word().lower() if random.random() >= empty_percent else "",
                "sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
                "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
                "text": fake.text().lower() if random.random() >= empty_percent else "",
                "emb": [random.random() for _ in range(dim)],
            }
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
        log.info(f"dataframe\n{df}")
        texts = df["text"].to_list()
        word_freq = cf.analyze_documents(texts, language=language)
        most_freq_word = word_freq.most_common(10)
        tokens = [item[0] for item in most_freq_word]
        if len(tokens) == 0:
            log.info("empty tokens, add a dummy token")
            tokens = ["dummy"]
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(
                data[i: i + batch_size]
                if i + batch_size < len(df)
                else data[i: len(df)]
            )
        collection_w.flush()
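        # flush seals the growing segment so that the BM25 index below is built
        # over sealed data; searching growing segments is covered by a separate case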
        collection_w.create_index(
            "emb",
            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
        )
        collection_w.create_index(
            "text_sparse_emb",
            {
                "index_type": index_type,
                "metric_type": "BM25",
                "params": {
                    "bm25_k1": 1.5,
                    "bm25_b": 0.75,
                }
            }
        )
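        # bm25_k1 controls term-frequency saturation and bm25_b controls document
        # length normalization; 1.5 / 0.75 are the classic BM25 defaults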
        if enable_inverted_index:
            collection_w.create_index("text", {"index_type": "INVERTED"})
        collection_w.load()
        limit = 100
        token = random.choice(tokens)
        search_data = [fake.text().lower() + f" {token} " for _ in range(nq)]
        if expr == "text_match":
            filter = f"TEXT_MATCH(text, '{token}')"
            res, _ = collection_w.query(
                expr=filter,
            )
        elif expr == "id_range":
            filter = f"id < {data_size // 2}"
        else:
            filter = ""
        res, _ = collection_w.query(
            expr=filter,
            limit=limit,
        )
        candidates_num = len(res)
        log.info(f"search data: {search_data}")
        # use offset = 0 to get all the results
        full_res_list, _ = collection_w.search(
            data=search_data,
            anns_field="text_sparse_emb",
            expr=filter,
            param={},
            limit=limit + offset,
            offset=0,
            output_fields=["id", "text"])
        full_res_id_list = []
        for i in range(nq):
            res = full_res_list[i]
            tmp = []
            for r in res:
                tmp.append(r.id)
            full_res_id_list.append(tmp)

        res_list, _ = collection_w.search(
            data=search_data,
            anns_field="text_sparse_emb",
            expr=filter,
            param={},
            limit=limit,
            offset=offset,
            output_fields=["id", "text"])
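        # verification strategy: the first hit of the offset search should appear
        # near position `offset` in the offset-0 result list (a tolerance is used
        # because equal BM25 scores may be returned in a different order)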

        # verify correctness
        for i in range(nq):
            assert 0 < len(res_list[i]) <= min(limit, candidates_num)
            search_text = search_data[i]
            log.info(f"res: {res_list[i]}")
            res = res_list[i]
            for j in range(len(res)):
                r = res[j]
                _id = r.id
                # the first id of the offset search should sit near position `offset`
                # in the offset-0 result list
                if j == 0:
                    first_id = _id
                    p = full_res_id_list[i].index(first_id)
                    assert 1.2 * offset >= p >= offset * 0.8
                result_text = r.text
                # verify search result satisfies the filter
                if expr == "text_match":
                    assert token in result_text
                if expr == "id_range":
                    assert _id < data_size // 2
                # verify search result has overlap with search text
                overlap, word_freq_a, word_freq_b = cf.check_token_overlap(search_text, result_text, language=language)
                log.info(f"overlap {overlap}")
                assert len(overlap) > 0, \
                    f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"

    @pytest.mark.tags(CaseLabel.L0)
    @pytest.mark.parametrize("nq", [2])
    @pytest.mark.parametrize("empty_percent", [0.5])
    @pytest.mark.parametrize("enable_partition_key", [True])
    @pytest.mark.parametrize("enable_inverted_index", [True])
    @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
    @pytest.mark.parametrize("expr", ["text_match"])
    @pytest.mark.parametrize("offset", [10])
    @pytest.mark.parametrize("tokenizer", ["jieba"])
    @pytest.mark.parametrize("inverted_index_algo", ct.inverted_index_algo)
    def test_full_text_search_with_jieba_tokenizer(
        self, offset, tokenizer, expr, enable_inverted_index, enable_partition_key,
        empty_percent, index_type, nq, inverted_index_algo):
        """
        target: test full text search
        method: 1. enable full text search with jieba tokenizer and insert data with varchar
                2. search with text
                3. verify the result
        expected: full text search successfully and result is correct
        """
        if tokenizer == "jieba":
            lang_type = "chinese"
        else:
            lang_type = "english"

        analyzer_params = {
            "type": lang_type,
        }
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=enable_partition_key,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        bm25_function = Function(
            name="text_bm25_emb",
            function_type=FunctionType.BM25,
            input_field_names=["text"],
            output_field_names=["text_sparse_emb"],
            params={},
        )
        schema.add_function(bm25_function)
        data_size = 5000
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        fake = fake_en
        if tokenizer == "jieba":
            language = "zh"
            fake = fake_zh
        else:
            language = "en"

        data = [
            {
                "id": i,
                "word": fake.word().lower() if random.random() >= empty_percent else "",
                "sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
                "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
                "text": fake.text().lower() if random.random() >= empty_percent else "",
                "emb": [random.random() for _ in range(dim)],
            }
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
        log.info(f"dataframe\n{df}")
        texts = df["text"].to_list()
        word_freq = cf.analyze_documents(texts, language=language)
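        # jieba emits multi-character tokens; keep only two-character words from
        # the most frequent tokens since they are typical Chinese terms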
        tokens = []
        for item in word_freq.most_common(20):
            if len(item[0]) == 2:
                tokens.append(item[0])
        if len(tokens) == 0:
            log.info("empty tokens, add a dummy token")
            tokens = ["dummy"]
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(
                data[i: i + batch_size]
                if i + batch_size < len(df)
                else data[i: len(df)]
            )
        collection_w.create_index(
            "emb",
            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
        )
        collection_w.create_index(
            "text_sparse_emb",
            {
                "index_type": index_type,
                "metric_type": "BM25",
                "params": {
                    "bm25_k1": 1.5,
                    "bm25_b": 0.75,
                    "inverted_index_algo": inverted_index_algo
                }
            }
        )
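        # inverted_index_algo selects the build/search algorithm of the sparse
        # inverted index; the candidate values are taken from ct.inverted_index_algo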
        if enable_inverted_index:
            collection_w.create_index("text", {"index_type": "INVERTED"})
        collection_w.load()
        limit = 100
        token = random.choice(tokens)
        search_data = [fake.text().lower() + " " + token for _ in range(nq)]
        if expr == "text_match":
            filter = f"TEXT_MATCH(text, '{token}')"
            res, _ = collection_w.query(
                expr=filter,
            )
        elif expr == "id_range":
            filter = f"id < {data_size // 2}"
        else:
            filter = ""
        res, _ = collection_w.query(
            expr=filter,
            limit=limit,
        )
        candidates_num = len(res)
        log.info(f"search data: {search_data}")
        # use offset = 0 to get all the results
        full_res_list, _ = collection_w.search(
            data=search_data,
            anns_field="text_sparse_emb",
            expr=filter,
            param={},
            limit=limit + offset,
            offset=0,
            output_fields=["id", "text"])
        full_res_id_list = []
        for i in range(nq):
            res = full_res_list[i]
            tmp = []
            for r in res:
                tmp.append(r.id)
            full_res_id_list.append(tmp)

        res_list, _ = collection_w.search(
            data=search_data,
            anns_field="text_sparse_emb",
            expr=filter,
            param={},
            limit=limit,
            offset=offset,
            output_fields=["id", "text"])

        # verify correctness
        for i in range(nq):
            assert 0 < len(res_list[i]) <= min(limit, candidates_num)
            search_text = search_data[i]
            log.info(f"res: {res_list[i]}")
            res = res_list[i]
            for j in range(len(res)):
                r = res[j]
                _id = r.id
                # the first id of the offset search should sit near position `offset`
                # in the offset-0 result list
                if j == 0:
                    first_id = _id
                    p = full_res_id_list[i].index(first_id)
                    assert 1.2 * offset >= p >= offset * 0.8
                result_text = r.text
                # verify search result satisfies the filter
                if expr == "text_match":
                    assert token in result_text
                if expr == "id_range":
                    assert _id < data_size // 2
                # verify search result has overlap with search text
                overlap, word_freq_a, word_freq_b = cf.check_token_overlap(search_text, result_text, language=language)
                log.info(f"overlap {overlap}")
                assert len(overlap) > 0, \
                    f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"

    @pytest.mark.tags(CaseLabel.L0)
    @pytest.mark.parametrize("nq", [2])
    @pytest.mark.parametrize("empty_percent", [0.5])
    @pytest.mark.parametrize("enable_partition_key", [True])
    @pytest.mark.parametrize("enable_inverted_index", [True])
    @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
    @pytest.mark.parametrize("expr", ["id_range"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    @pytest.mark.parametrize("offset", [0])
    def test_full_text_search_for_growing_segment(
        self, offset, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq
    ):
        """
        target: test full text search
        method: 1. enable full text search and insert data with varchar
                2. search with text
                3. verify the result
        expected: full text search successfully and result is correct
        """
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=enable_partition_key,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        bm25_function = Function(
            name="text_bm25_emb",
            function_type=FunctionType.BM25,
            input_field_names=["text"],
            output_field_names=["text_sparse_emb"],
            params={},
        )
        schema.add_function(bm25_function)
        data_size = 5000
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        fake = fake_en
        if tokenizer == "jieba":
            language = "zh"
            fake = fake_zh
        else:
            language = "en"

        data = [
            {
                "id": i,
                "word": fake.word().lower() if random.random() >= empty_percent else "",
                "sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
                "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
                "text": fake.text().lower() if random.random() >= empty_percent else "",
                "emb": [random.random() for _ in range(dim)],
            }
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
        log.info(f"dataframe\n{df}")
        texts = df["text"].to_list()
        word_freq = cf.analyze_documents(texts, language=language)
        most_freq_word = word_freq.most_common(10)
        tokens = [item[0] for item in most_freq_word]
        if len(tokens) == 0:
            log.info("empty tokens, add a dummy token")
            tokens = ["dummy"]
        collection_w.create_index(
            "emb",
            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
        )
        collection_w.create_index(
            "text_sparse_emb",
            {
                "index_type": index_type,
                "metric_type": "BM25",
                "params": {
                    "bm25_k1": 1.5,
                    "bm25_b": 0.75,
                }
            }
        )
        if enable_inverted_index:
            collection_w.create_index("text", {"index_type": "INVERTED"})
        collection_w.load()
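        # indexes are created and the collection is loaded before any insert, so
        # the inserted rows stay in growing segments and this case exercises full
        # text search over unsealed data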
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(
                data[i: i + batch_size]
                if i + batch_size < len(df)
                else data[i: len(df)]
            )
        limit = 100
        search_data = [fake.text().lower() + " " + random.choice(tokens) for _ in range(nq)]
        if expr == "text_match":
            filter = f"TEXT_MATCH(text, '{tokens[0]}')"
            res, _ = collection_w.query(
                expr=filter,
            )
        elif expr == "id_range":
            filter = f"id < {data_size // 2}"
        else:
            filter = ""
        res, _ = collection_w.query(
            expr=filter,
            limit=limit,
        )
        candidates_num = len(res)
        log.info(f"search data: {search_data}")
        # use offset = 0 to get all the results
        full_res_list, _ = collection_w.search(
            data=search_data,
            anns_field="text_sparse_emb",
            expr=filter,
            param={},
            limit=limit + offset,
            offset=0,
            output_fields=["id", "text"])
        full_res_id_list = []
        for i in range(nq):
            res = full_res_list[i]
            tmp = []
            for r in res:
                tmp.append(r.id)
            full_res_id_list.append(tmp)

        res_list, _ = collection_w.search(
            data=search_data,
            anns_field="text_sparse_emb",
            expr=filter,
            param={},
            limit=limit,
            offset=offset,
            output_fields=["id", "text"])

        # verify correctness
        for i in range(nq):
            assert 0 < len(res_list[i]) <= min(limit, candidates_num)
            search_text = search_data[i]
            log.info(f"res: {res_list[i]}")
            res = res_list[i]
            for j in range(len(res)):
                r = res[j]
                _id = r.id
                # the first id of the offset search should sit near position `offset`
                # in the offset-0 result list
                if j == 0:
                    first_id = _id
                    p = full_res_id_list[i].index(first_id)
                    assert 1.2 * offset >= p >= offset * 0.8
                result_text = r.text
                # verify search result satisfies the filter
                if expr == "text_match":
                    assert tokens[0] in result_text
                if expr == "id_range":
                    assert _id < data_size // 2
                # verify search result has overlap with search text
                overlap, word_freq_a, word_freq_b = cf.check_token_overlap(search_text, result_text, language=language)
                log.info(f"overlap {overlap}")
                assert len(overlap) > 0, \
                    f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"

    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("nq", [2])
    @pytest.mark.parametrize("empty_percent", [0])
    @pytest.mark.parametrize("enable_partition_key", [True])
    @pytest.mark.parametrize("enable_inverted_index", [True])
    @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
    @pytest.mark.parametrize("expr", [None])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_full_text_search_with_range_search(
        self, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq
    ):
        """
        target: test full text search
        method: 1. enable full text search and insert data with varchar
                2. range search with text
                3. verify the result
        expected: full text search successfully and result is correct
        """
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=enable_partition_key,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        bm25_function = Function(
            name="text_bm25_emb",
            function_type=FunctionType.BM25,
            input_field_names=["text"],
            output_field_names=["text_sparse_emb"],
            params={},
        )
        schema.add_function(bm25_function)
        data_size = 5000
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        fake = fake_en
        if tokenizer == "jieba":
            language = "zh"
            fake = fake_zh
        else:
            language = "en"

        data = [
            {
                "id": i,
                "word": fake.word().lower() if random.random() >= empty_percent else "",
                "sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
                "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
                "text": fake.text().lower() if random.random() >= empty_percent else "",
                "emb": [random.random() for _ in range(dim)],
            }
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
        log.info(f"dataframe\n{df}")
        texts = df["text"].to_list()
        word_freq = cf.analyze_documents(texts, language=language)
        tokens = list(word_freq.keys())
        if len(tokens) == 0:
            log.info("empty tokens, add a dummy token")
            tokens = ["dummy"]
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(
                data[i: i + batch_size]
                if i + batch_size < len(df)
                else data[i: len(df)]
            )
        collection_w.create_index(
            "emb",
            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
        )
        collection_w.create_index(
            "text_sparse_emb",
            {
                "index_type": index_type,
                "metric_type": "BM25",
                "params": {
                    "bm25_k1": 1.5,
                    "bm25_b": 0.75,
                }
            }
        )
        if enable_inverted_index:
            collection_w.create_index("text", {"index_type": "INVERTED"})
        collection_w.load()
        limit = 1000
        search_data = [fake.text().lower() + " " + random.choice(tokens) for _ in range(nq)]
        log.info(f"search data: {search_data}")
        # get distance with search data
        res_list, _ = collection_w.search(
            data=search_data,
            anns_field="text_sparse_emb",
            param={},
            limit=limit,  # get a wider range of search result
            output_fields=["id", "text"])

        distance_list = []
        for i in range(nq):
            res = res_list[i]
            for j in range(len(res)):
                r = res[j]
                distance = r.distance
                distance_list.append(distance)
        distance_list = sorted(distance_list)
        # take the 30% ~ 70% range of the observed distances
        low = distance_list[int(len(distance_list) * 0.3)]
        high = distance_list[int(len(distance_list) * 0.7)]
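        # for BM25 (higher score is better, IP-like), `radius` is the lower bound
        # and `range_filter` the upper bound of the accepted score range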

        res_list, _ = collection_w.search(
            data=search_data,
            anns_field="text_sparse_emb",
            param={
                "params": {
                    "radius": low, "range_filter": high
                }
            },
            limit=limit,
            output_fields=["id", "text"])
        # verify correctness
        for i in range(nq):
            log.info(f"res: {len(res_list[i])}")
            assert len(res_list[i]) < limit  # less than limit, because the range is set
            res = res_list[i]
            for j in range(len(res)):
                r = res[j]
                tmp_distance = r.distance
                assert low <= tmp_distance <= high

    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("nq", [1])
    @pytest.mark.parametrize("empty_percent", [0])
    @pytest.mark.parametrize("enable_partition_key", [True])
    @pytest.mark.parametrize("enable_inverted_index", [True])
    @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
    @pytest.mark.parametrize("expr", [None])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_full_text_search_with_search_iterator(
        self, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq
    ):
        """
        target: test full text search
        method: 1. enable full text search and insert data with varchar
                2. search with text using the search iterator
                3. verify the result
        expected: full text search successfully and result is correct
        """
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=enable_partition_key,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        bm25_function = Function(
            name="text_bm25_emb",
            function_type=FunctionType.BM25,
            input_field_names=["text"],
            output_field_names=["text_sparse_emb"],
            params={},
        )
        schema.add_function(bm25_function)
        data_size = 5000
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        fake = fake_en
        if tokenizer == "jieba":
            language = "zh"
            fake = fake_zh
        else:
            language = "en"

        data = [
            {
                "id": i,
                "word": fake.word().lower() if random.random() >= empty_percent else "",
                "sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
                "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
                "text": fake.text().lower() if random.random() >= empty_percent else "",
                "emb": [random.random() for _ in range(dim)],
            }
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
        log.info(f"dataframe\n{df}")
        texts = df["text"].to_list()
        word_freq = cf.analyze_documents(texts, language=language)
        tokens = list(word_freq.keys())
        if len(tokens) == 0:
            log.info("empty tokens, add a dummy token")
            tokens = ["dummy"]
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(
                data[i: i + batch_size]
                if i + batch_size < len(df)
                else data[i: len(df)]
            )
        collection_w.create_index(
            "emb",
            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
        )
        collection_w.create_index(
            "text_sparse_emb",
            {
                "index_type": index_type,
                "metric_type": "BM25",
                "params": {
                    "bm25_k1": 1.5,
                    "bm25_b": 0.75,
                }
            }
        )
        if enable_inverted_index:
            collection_w.create_index("text", {"index_type": "INVERTED"})
        collection_w.load()
        search_data = [fake.text().lower() + " " + random.choice(tokens) for _ in range(nq)]
        log.info(f"search data: {search_data}")
        # search with iterator and collect the batch sizes
        batch_size = 100
        limit = batch_size * 10
        iterator, _ = collection_w.search_iterator(
            data=search_data,
            anns_field="text_sparse_emb",
            batch_size=batch_size,
            param={
                "metric_type": "BM25",
            },
            output_fields=["id", "text"],
            limit=limit
        )
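        # every next() call should return a full batch until the result set is
        # exhausted; only the final batch may be smaller than batch_size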
        iter_result = []
        while True:
            result = iterator.next()
            if not result:
                iterator.close()
                break
            else:
                iter_result.append(len(result))
        for r in iter_result[:-1]:
            assert r == batch_size


class TestSearchWithFullTextSearchNegative(TestcaseBase):
    """
    ******************************************************************
    The following cases are negative cases for search with full text search
    ******************************************************************
    """

    @pytest.mark.tags(CaseLabel.L2)
    @pytest.mark.parametrize("empty_percent", [0])
    @pytest.mark.parametrize("enable_partition_key", [True])
    @pytest.mark.parametrize("enable_inverted_index", [True])
    @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
    @pytest.mark.parametrize("invalid_search_data", ["empty_text"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    @pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/37022")
    def test_search_for_full_text_search_with_empty_string_search_data(
        self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type, invalid_search_data
    ):
        """
        target: test full text search
        method: 1. enable full text search and insert data with varchar
                2. search with empty text
                3. verify the result
        expected: full text search successfully but result is empty
        """
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=enable_partition_key,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        bm25_function = Function(
            name="text_bm25_emb",
            function_type=FunctionType.BM25,
            input_field_names=["text"],
            output_field_names=["text_sparse_emb"],
            params={},
        )
        schema.add_function(bm25_function)
        data_size = 5000
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        fake = fake_en
        if tokenizer == "jieba":
            language = "zh"
            fake = fake_zh
        else:
            language = "en"

        data = [
            {
                "id": i,
                "word": fake.word().lower() if random.random() >= empty_percent else "",
                "sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
                "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
                "text": fake.text().lower() if random.random() >= empty_percent else "",
                "emb": [random.random() for _ in range(dim)],
            }
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
        log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(
                data[i: i + batch_size]
                if i + batch_size < len(df)
                else data[i: len(df)]
            )
        collection_w.create_index(
            "emb",
            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
        )
        collection_w.create_index(
            "text_sparse_emb",
            {
                "index_type": index_type,
                "metric_type": "BM25",
                "params": {
                    "bm25_k1": 1.5,
                    "bm25_b": 0.75,
                }
            }
        )
        if enable_inverted_index:
            collection_w.create_index("text", {"index_type": "INVERTED"})
        collection_w.load()
        nq = 2
        limit = 100
        search_data = ["" for _ in range(nq)]
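        # an empty string analyzes to zero tokens, i.e. an empty sparse query
        # vector, so each of the nq result sets is expected to be empty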
        log.info(f"search data: {search_data}")
        res, _ = collection_w.search(
            data=search_data,
            anns_field="text_sparse_emb",
            param={},
            limit=limit,
            output_fields=["id", "text"],
        )
        assert len(res) == nq
        for r in res:
            assert len(r) == 0

    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("empty_percent", [0])
    @pytest.mark.parametrize("enable_partition_key", [True])
    @pytest.mark.parametrize("enable_inverted_index", [True])
    @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX", "SPARSE_WAND"])
    @pytest.mark.parametrize("invalid_search_data", ["sparse_vector", "dense_vector"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_search_for_full_text_search_with_invalid_search_data(
        self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type, invalid_search_data
    ):
        """
        target: test full text search
        method: 1. enable full text search and insert data with varchar
                2. search with sparse vector or dense vector
                3. verify the result
        expected: full text search failed and return error
        """
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=enable_partition_key,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        bm25_function = Function(
            name="text_bm25_emb",
            function_type=FunctionType.BM25,
            input_field_names=["text"],
            output_field_names=["text_sparse_emb"],
            params={},
        )
        schema.add_function(bm25_function)
        data_size = 5000
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        fake = fake_en
        if tokenizer == "jieba":
            language = "zh"
            fake = fake_zh
        else:
            language = "en"

        data = [
            {
                "id": i,
                "word": fake.word().lower() if random.random() >= empty_percent else "",
                "sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
                "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
                "text": fake.text().lower() if random.random() >= empty_percent else "",
                "emb": [random.random() for _ in range(dim)],
            }
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
        log.info(f"dataframe\n{df}")
        texts = df["text"].to_list()
        word_freq = cf.analyze_documents(texts, language=language)
        tokens = list(word_freq.keys())
        if len(tokens) == 0:
            log.info("empty tokens, add a dummy token")
            tokens = ["dummy"]
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(
                data[i: i + batch_size]
                if i + batch_size < len(df)
                else data[i: len(df)]
            )
        collection_w.create_index(
            "emb",
            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
        )
        collection_w.create_index(
            "text_sparse_emb",
            {
                "index_type": index_type,
                "metric_type": "BM25",
                "params": {
                    "bm25_k1": 1.5,
                    "bm25_b": 0.75,
                }
            }
        )
        if enable_inverted_index:
            collection_w.create_index("text", {"index_type": "INVERTED"})
        collection_w.load()
        nq = 2
        limit = 100
        if invalid_search_data == "sparse_vector":
            search_data = cf.gen_vectors(nb=nq, dim=1000, vector_data_type="SPARSE_FLOAT_VECTOR")
        else:
            search_data = cf.gen_vectors(nb=nq, dim=1000, vector_data_type="FLOAT_VECTOR")
        log.info(f"search data: {search_data}")
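        # a BM25-function anns field expects raw text queries; passing sparse or
        # dense vectors directly should be rejected with the error below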
        error = {ct.err_code: 65535,
                 ct.err_msg: "please provide varchar/text for BM25 Function based search"}
        collection_w.search(
            data=search_data,
            anns_field="text_sparse_emb",
            param={},
            limit=limit,
            output_fields=["id", "text"],
            check_task=CheckTasks.err_res,
            check_items=error
        )


class TestHybridSearchWithFullTextSearch(TestcaseBase):
    """
    ******************************************************************
    The following cases are used to test hybrid search with full text search
    ******************************************************************
    """

    @pytest.mark.tags(CaseLabel.L0)
    @pytest.mark.parametrize("empty_percent", [0])
    @pytest.mark.parametrize("enable_partition_key", [True])
    @pytest.mark.parametrize("enable_inverted_index", [True])
    @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    @pytest.mark.parametrize("inverted_index_algo", ct.inverted_index_algo)
    def test_hybrid_search_with_full_text_search(
        self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type, inverted_index_algo
    ):
        """
        target: test full text search
        method: 1. enable full text search and insert data with varchar
                2. hybrid search with text, sparse vector and dense vector
                3. verify the result
        expected: hybrid search successfully and result is correct
        """
        analyzer_params = {
            "tokenizer": tokenizer,
        }
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="word",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
                is_partition_key=enable_partition_key,
            ),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="paragraph",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(
                name="text",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="dense_emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
            FieldSchema(name="neural_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
            FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        bm25_function = Function(
            name="text_bm25_emb",
            function_type=FunctionType.BM25,
            input_field_names=["text"],
            output_field_names=["text_sparse_emb"],
            params={},
        )
        schema.add_function(bm25_function)
        data_size = 5000
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        fake = fake_en
        data = [
            {
                "id": i,
                "word": fake.word().lower() if random.random() >= empty_percent else "",
                "sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
                "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
                "text": fake.text().lower() if random.random() >= empty_percent else "",
                "dense_emb": [random.random() for _ in range(dim)],
                "neural_sparse_emb": cf.gen_vectors(nb=1, dim=1000, vector_data_type="SPARSE_FLOAT_VECTOR")[0],
            }
            for i in range(data_size)
        ]
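        # note: "text_sparse_emb" is not supplied in the rows above -- the BM25
        # function derives it from the "text" field on the server side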
        df = pd.DataFrame(data)
        log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(
                data[i: i + batch_size]
                if i + batch_size < len(df)
                else data[i: len(df)]
            )
        collection_w.create_index(
            "dense_emb",
            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
        )
        collection_w.create_index(
            "neural_sparse_emb",
            {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"},
        )
        collection_w.create_index(
            "text_sparse_emb",
            {
                "index_type": index_type,
                "metric_type": "BM25",
                "params": {
                    "bm25_k1": 1.5,
                    "bm25_b": 0.75,
                    "inverted_index_algo": inverted_index_algo
                }
            }
        )
        if enable_inverted_index:
            collection_w.create_index("text", {"index_type": "INVERTED"})
        collection_w.load()
        nq = 2
        limit = 100
        bm25_search = AnnSearchRequest(
            data=[fake.text().lower() for _ in range(nq)],
            anns_field="text_sparse_emb",
            param={},
            limit=limit,
        )
        dense_search = AnnSearchRequest(
            data=[[random.random() for _ in range(dim)] for _ in range(nq)],
            anns_field="dense_emb",
            param={},
            limit=limit,
        )
        sparse_search = AnnSearchRequest(
            data=cf.gen_vectors(nb=nq, dim=dim, vector_data_type="SPARSE_FLOAT_VECTOR"),
            anns_field="neural_sparse_emb",
            param={},
            limit=limit,
        )
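        # WeightedRanker fuses the three request result lists with one weight per
        # request (BM25 text, dense, and neural sparse) before the final top-k cut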
        # hybrid search
        res_list, _ = collection_w.hybrid_search(
            reqs=[bm25_search, dense_search, sparse_search],
            rerank=WeightedRanker(0.5, 0.5, 0.5),
            limit=limit,
            output_fields=["id", "text"]
        )
        assert len(res_list) == nq
        # check the result correctness
        for i in range(nq):
            log.info(f"res length: {len(res_list[i])}")
            assert len(res_list[i]) == limit