test: Add more sparse test cases (#33916)

issue: https://github.com/milvus-io/milvus/issues/31483

Signed-off-by: elstic <hao.wang@zilliz.com>
pull/33860/head
elstic 2024-06-19 15:24:09 +08:00 committed by GitHub
parent 6d5747cb3e
commit 1216a4bcd8
8 changed files with 433 additions and 23 deletions

View File

@ -262,6 +262,11 @@ class TestcaseBase(Base):
if is_binary:
default_schema = cf.gen_default_binary_collection_schema(auto_id=auto_id, dim=dim,
primary_field=primary_field)
if vector_data_type == ct.sparse_vector:
default_schema = cf.gen_default_sparse_schema(auto_id=auto_id, primary_field=primary_field,
enable_dynamic_field=enable_dynamic_field,
with_json=with_json,
multiple_dim_array=multiple_dim_array)
if is_all_data_type:
default_schema = cf.gen_collection_schema_all_datatype(auto_id=auto_id, dim=dim,
primary_field=primary_field,
@ -289,6 +294,9 @@ class TestcaseBase(Base):
# This condition will be removed after auto index feature
if is_binary:
collection_w.create_index(ct.default_binary_vec_field_name, ct.default_bin_flat_index)
elif vector_data_type == ct.sparse_vector:
for vector_name in vector_name_list:
collection_w.create_index(vector_name, ct.default_sparse_inverted_index)
else:
if len(multiple_dim_array) == 0 or is_all_data_type == False:
vector_name_list.append(ct.default_float_vec_field_name)
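Aside for reviewers: ct.default_sparse_inverted_index is referenced here but defined outside this diff. A plausible sketch of its shape, inferred from the index params the new tests build explicitly (IP is the only metric sparse indexes accept, and drop_ratio_build must lie in [0, 1)):

# a sketch only; the real constant lives in common_type.py and may differ
default_sparse_inverted_index = {
    "index_type": "SPARSE_INVERTED_INDEX",
    "metric_type": "IP",                  # sparse indexes support IP only
    "params": {"drop_ratio_build": 0.2},  # valid range is [0, 1)
}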

View File

@ -145,6 +145,12 @@ def gen_double_field(name=ct.default_double_field_name, is_primary=False, descri
def gen_float_vec_field(name=ct.default_float_vec_field_name, is_primary=False, dim=ct.default_dim,
description=ct.default_desc, vector_data_type="FLOAT_VECTOR", **kwargs):
if vector_data_type == "SPARSE_FLOAT_VECTOR":
dtype = DataType.SPARSE_FLOAT_VECTOR
float_vec_field, _ = ApiFieldSchemaWrapper().init_field_schema(name=name, dtype=dtype,
description=description,
is_primary=is_primary, **kwargs)
return float_vec_field
if vector_data_type == "FLOAT_VECTOR":
dtype = DataType.FLOAT_VECTOR
elif vector_data_type == "FLOAT16_VECTOR":
@ -358,9 +364,14 @@ def gen_collection_schema_all_datatype(description=ct.default_desc,
else:
multiple_dim_array.insert(0, dim)
for i in range(len(multiple_dim_array)):
fields.append(gen_float_vec_field(name=f"multiple_vector_{ct.all_float_vector_types[i%3]}",
if ct.all_float_vector_types[i%3] != ct.sparse_vector:
fields.append(gen_float_vec_field(name=f"multiple_vector_{ct.all_float_vector_types[i%3]}",
dim=multiple_dim_array[i],
vector_data_type=ct.all_float_vector_types[i%3]))
else:
# sparse vector fields do not take a dim parameter
fields.append(gen_float_vec_field(name=f"multiple_vector_{ct.sparse_vector}",
vector_data_type=ct.sparse_vector))
schema, _ = ApiCollectionSchemaWrapper().init_collection_schema(fields=fields, description=description,
primary_field=primary_field, auto_id=auto_id,
@ -384,8 +395,17 @@ def gen_default_binary_collection_schema(description=ct.default_desc, primary_fi
def gen_default_sparse_schema(description=ct.default_desc, primary_field=ct.default_int64_field_name,
auto_id=False, **kwargs):
auto_id=False, with_json=False, multiple_dim_array=[], **kwargs):
fields = [gen_int64_field(), gen_float_field(), gen_string_field(), gen_sparse_vec_field()]
if with_json:
fields.insert(-1, gen_json_field())
if len(multiple_dim_array) != 0:
for i in range(len(multiple_dim_array)):
vec_name = ct.default_sparse_vec_field_name + "_" + str(i)
vec_field = gen_sparse_vec_field(name=vec_name)
fields.append(vec_field)
sparse_schema, _ = ApiCollectionSchemaWrapper().init_collection_schema(fields=fields, description=description,
primary_field=primary_field,
auto_id=auto_id, **kwargs)
@ -418,7 +438,7 @@ def gen_vectors(nb, dim, vector_data_type="FLOAT_VECTOR"):
vectors = gen_fp16_vectors(nb, dim)[1]
elif vector_data_type == "BFLOAT16_VECTOR":
vectors = gen_bf16_vectors(nb, dim)[1]
elif vector_data_type == "SPARSE_VECTOR":
elif vector_data_type == "SPARSE_FLOAT_VECTOR":
vectors = gen_sparse_vectors(nb, dim)
if dim > 1:
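gen_sparse_vectors is called here but its body is outside this diff. A minimal sketch of such a generator, assuming the dict-of-{index: value} row format that pymilvus accepts for SPARSE_FLOAT_VECTOR fields (the sparsity knob is an assumption):

import random

def gen_sparse_vectors_sketch(nb, dim, sparsity=0.1):
    # each row maps unique dimension indices to float weights
    rows = []
    for _ in range(nb):
        nnz = max(1, int(dim * sparsity))  # non-zeros per row
        indices = random.sample(range(dim), nnz)
        rows.append({idx: random.random() for idx in sorted(indices)})
    return rows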
@ -508,10 +528,10 @@ def gen_general_default_list_data(nb=ct.default_nb, dim=ct.default_dim, start=0,
index = 2
del insert_list[index]
if len(multiple_dim_array) != 0:
if len(multiple_vector_field_name) != len(multiple_dim_array):
log.error("multiple vector feature is enabled, please input the vector field name list "
"not including the default vector field")
assert len(multiple_vector_field_name) == len(multiple_dim_array)
# if len(multiple_vector_field_name) != len(multiple_dim_array):
# log.error("multiple vector feature is enabled, please input the vector field name list "
# "not including the default vector field")
# assert len(multiple_vector_field_name) == len(multiple_dim_array)
for i in range(len(multiple_dim_array)):
new_float_vec_values = gen_vectors(nb, multiple_dim_array[i], vector_data_type=vector_data_type)
insert_list.append(new_float_vec_values)
@ -832,7 +852,7 @@ def gen_default_list_sparse_data(nb=ct.default_nb, dim=ct.default_dim, start=0,
string_values = [str(i) for i in range(start, start + nb)]
json_values = [{"number": i, "string": str(i), "bool": bool(i), "list": [j for j in range(0, i)]}
for i in range(start, start + nb)]
sparse_vec_values = gen_vectors(nb, dim, vector_data_type="SPARSE_VECTOR")
sparse_vec_values = gen_vectors(nb, dim, vector_data_type="SPARSE_FLOAT_VECTOR")
if with_json:
data = [int_values, float_values, string_values, json_values, sparse_vec_values]
else:
@ -1772,7 +1792,7 @@ def insert_data(collection_w, nb=ct.default_nb, is_binary=False, is_all_data_typ
multiple_vector_field_name=vector_name_list,
vector_data_type=vector_data_type,
auto_id=auto_id, primary_field=primary_field)
elif vector_data_type == "FLOAT16_VECTOR" or "BFLOAT16_VECTOR":
elif vector_data_type in ct.all_float_vector_types:
default_data = gen_general_default_list_data(nb // num, dim=dim, start=start, with_json=with_json,
random_primary_key=random_primary_key,
multiple_dim_array=multiple_dim_array,
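Note for reviewers: the replaced condition was always true, because the bare string literal "BFLOAT16_VECTOR" is truthy on its own in an or-expression; membership in ct.all_float_vector_types is the check that was intended.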
@ -1972,14 +1992,10 @@ def extract_vector_field_name_list(collection_w):
fields = schema_dict.get('fields')
vector_name_list = []
for field in fields:
if str(field['type']) in ["101", "102", "103"]:
if field['name'] != ct.default_float_vec_field_name:
vector_name_list.append(field['name'])
for field in fields:
if str(field['type']) == 'DataType.FLOAT_VECTOR' \
or str(field['type']) == 'DataType.FLOAT16_VECTOR' \
or str(field['type']) == 'DataType.BFLOAT16_VECTOR':
if field['type'] == DataType.FLOAT_VECTOR \
or field['type'] == DataType.FLOAT16_VECTOR \
or field['type'] == DataType.BFLOAT16_VECTOR \
or field['type'] == DataType.SPARSE_FLOAT_VECTOR:
if field['name'] != ct.default_float_vec_field_name:
vector_name_list.append(field['name'])
@ -2120,11 +2136,13 @@ def gen_vectors_based_on_vector_type(num, dim, vector_data_type):
fp16_vectors: the bytes used for insert
return: raw_vectors and fp16_vectors
"""
if vector_data_type == "FLOAT_VECTOR":
if vector_data_type == ct.float_type:
vectors = [[random.random() for _ in range(dim)] for _ in range(num)]
elif vector_data_type == "FLOAT16_VECTOR":
elif vector_data_type == ct.float16_type:
vectors = gen_fp16_vectors(num, dim)[1]
elif vector_data_type == "BFLOAT16_VECTOR":
elif vector_data_type == ct.bfloat16_type:
vectors = gen_bf16_vectors(num, dim)[1]
elif vector_data_type == ct.sparse_vector:
vectors = gen_sparse_vectors(num, dim)
return vectors

View File

@ -44,7 +44,8 @@ default_binary_vec_field_name = "binary_vector"
float_type = "FLOAT_VECTOR"
float16_type = "FLOAT16_VECTOR"
bfloat16_type = "BFLOAT16_VECTOR"
all_float_vector_types = [float_type, float16_type, bfloat16_type]
sparse_vector = "SPARSE_FLOAT_VECTOR"
all_float_vector_types = [float16_type, bfloat16_type, sparse_vector]
default_sparse_vec_field_name = "sparse_vector"
default_partition_name = "_default"
default_resource_group_name = '__default_resource_group'

View File

@ -50,6 +50,7 @@ vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_
default_search_field = ct.default_float_vec_field_name
default_search_params = ct.default_search_params
max_vector_field_num = ct.max_vector_field_num
sparse_vector_data_type = "SPARSE_FLOAT_VECTOR"
class TestCollectionParams(TestcaseBase):
@ -1047,6 +1048,24 @@ class TestCollectionParams(TestcaseBase):
error = {ct.err_code: 65535, ct.err_msg: "maximum field's number should be limited to 64"}
self.collection_wrap.init_collection(c_name, schema=schema, check_task=CheckTasks.err_res, check_items=error)
@pytest.mark.tags(CaseLabel.L2)
def test_collection_multi_sparse_vectors(self):
"""
target: test multiple sparse vector fields in a collection
method: create a collection with two sparse vector fields
expected: collection created successfully
"""
# 1. connect
self._connect()
# 2. create collection with multiple vectors
c_name = cf.gen_unique_str(prefix)
fields = [cf.gen_int64_field(is_primary=True), cf.gen_float_field(),
cf.gen_float_vec_field(vector_data_type=sparse_vector_data_type),
cf.gen_float_vec_field(name="tmp", vector_data_type=sparse_vector_data_type)]
schema = cf.gen_collection_schema(fields=fields)
self.collection_wrap.init_collection(c_name, schema=schema,
check_task=CheckTasks.check_collection_property,
check_items={exp_name: c_name, exp_schema: schema})
class TestCollectionOperation(TestcaseBase):
"""

View File

@ -1,5 +1,7 @@
import random
from time import sleep
import numpy as np
import pytest
import copy
@ -1442,6 +1444,47 @@ class TestIndexInvalid(TestcaseBase):
check_items={ct.err_code: 1,
ct.err_msg: f"<'int' object has no attribute 'items'"})
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("metric_type", ["L2", "COSINE", " ", "invalid"])
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
def test_invalid_sparse_metric_type(self, metric_type, index):
"""
target: test create index with unsupported metric_type
method: create a sparse index with each unsupported metric_type
expected: raise exception
"""
c_name = cf.gen_unique_str(prefix)
schema = cf.gen_default_sparse_schema()
collection_w = self.init_collection_wrap(name=c_name, schema=schema)
data = cf.gen_default_list_sparse_data()
collection_w.insert(data=data)
param = cf.get_index_params_params(index)
params = {"index_type": index, "metric_type": metric_type, "params": param}
error = {ct.err_code: 65535, ct.err_msg: "only IP is the supported metric type for sparse index"}
index, _ = self.index_wrap.init_index(collection_w.collection, ct.default_sparse_vec_field_name, params,
check_task=CheckTasks.err_res,
check_items=error)
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("ratio", [-0.5, 1, 3])
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
def test_invalid_sparse_ratio(self, ratio, index):
"""
target: test create index with invalid drop_ratio_build values
method: create a sparse index with each out-of-range ratio
expected: raise exception
"""
c_name = cf.gen_unique_str(prefix)
schema = cf.gen_default_sparse_schema()
collection_w = self.init_collection_wrap(name=c_name, schema=schema)
data = cf.gen_default_list_sparse_data()
collection_w.insert(data=data)
params = {"index_type": index, "metric_type": "IP", "params": {"drop_ratio_build": ratio}}
error = {ct.err_code: 1100, ct.err_msg: f"invalid drop_ratio_build: {ratio}, must be in range [0, 1): invalid parameter[expected=valid index params"}
index, _ = self.index_wrap.init_index(collection_w.collection, ct.default_sparse_vec_field_name, params,
check_task=CheckTasks.err_res,
check_items=error)
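For contrast with the failing cases above, any ratio inside [0, 1) builds cleanly. A minimal sketch, mirroring test_search_sparse_ratio further down:

# valid drop_ratio_build, reusing the index/collection_w names from this test
params = {"index_type": index, "metric_type": "IP", "params": {"drop_ratio_build": 0.2}}
collection_w.create_index(ct.default_sparse_vec_field_name, params, index_name=index)
assert collection_w.has_index(index_name=index)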
@pytest.mark.tags(CaseLabel.GPU)
class TestNewIndexAsync(TestcaseBase):

View File

@ -1348,6 +1348,25 @@ class TestInsertInvalid(TestcaseBase):
error = {ct.err_code: 65535, ct.err_msg: "value '+Inf' is not a number or infinity"}
collection_w.insert(data=data, check_task=CheckTasks.err_res, check_items=error)
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
@pytest.mark.parametrize("invalid_vector_type", ["FLOAT_VECTOR", "FLOAT16_VECTOR", "BFLOAT16_VECTOR"])
def test_invalid_sparse_vector_data(self, index, invalid_vector_type):
"""
target: test inserting dense vector data into a sparse vector field
method: generate dense vectors of each float type and insert them as the sparse column
expected: raise exception
"""
c_name = cf.gen_unique_str(prefix)
schema = cf.gen_default_sparse_schema()
collection_w = self.init_collection_wrap(name=c_name, schema=schema)
nb = 100
data = cf.gen_default_list_sparse_data(nb=nb)[:-1]
invalid_vec = cf.gen_vectors(nb, dim=128, vector_data_type=invalid_vector_type)
data.append(invalid_vec)
error = {ct.err_code: 1, ct.err_msg: 'input must be a sparse matrix in supported format'}
collection_w.insert(data=data, check_task=CheckTasks.err_res, check_items=error)
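The rejection above is a row-format issue: sparse fields expect one of the pymilvus sparse representations, most simply one dict per row, while gen_vectors emits dense lists. Roughly:

valid_sparse_row = {12: 0.5, 907: 0.12, 4096: 0.8}  # {dimension_index: weight}
invalid_dense_row = [0.1, 0.2, 0.3]  # dense layout, fails with "input must be a sparse matrix in supported format"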
class TestInsertInvalidBinary(TestcaseBase):
"""
@ -1872,6 +1891,30 @@ class TestUpsertValid(TestcaseBase):
collection_w.upsert(df)
assert collection_w.num_entities == ct.default_nb
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
def test_upsert_sparse_data(self, index):
"""
target: test upsert on a sparse collection
method: upsert the same sparse data several times, then count(*)
expected: entity count stays at ct.default_nb
"""
c_name = cf.gen_unique_str(prefix)
schema = cf.gen_default_sparse_schema()
collection_w = self.init_collection_wrap(name=c_name, schema=schema)
data = cf.gen_default_list_sparse_data(nb=ct.default_nb)
collection_w.upsert(data=data)
assert collection_w.num_entities == ct.default_nb
params = cf.get_index_params_params(index)
index_params = {"index_type": index, "metric_type": "IP", "params": params}
collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
collection_w.load()
for i in range(5):
collection_w.upsert(data=data)
collection_w.query(expr=f'{ct.default_int64_field_name} >= 0', output_fields=[ct.default_count_output],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": ct.default_nb}]})
class TestUpsertInvalid(TestcaseBase):
""" Invalid test case of Upsert interface """

View File

@ -3691,6 +3691,37 @@ class TestQueryCount(TestcaseBase):
check_task=CheckTasks.check_query_results,
check_items={exp_res: [{count: res}]})
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
def test_counts_expression_sparse_vectors(self, index):
"""
target: test count(*) with expressions on a sparse collection
method: insert sparse data, build a sparse index, then count with several expressions
expected: counts match the inserted data
"""
self._connect()
c_name = cf.gen_unique_str(prefix)
schema = cf.gen_default_sparse_schema()
collection_w, _ = self.collection_wrap.init_collection(c_name, schema=schema)
data = cf.gen_default_list_sparse_data()
collection_w.insert(data)
params = cf.get_index_params_params(index)
index_params = {"index_type": index, "metric_type": "IP", "params": params}
collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
collection_w.load()
collection_w.query(expr=default_expr, output_fields=[count],
check_task=CheckTasks.check_query_results,
check_items={exp_res: [{count: ct.default_nb}]})
expr = "int64 > 50 && int64 < 100 && float < 75"
collection_w.query(expr=expr, output_fields=[count],
check_task=CheckTasks.check_query_results,
check_items={exp_res: [{count: 24}]})
batch_size = 100
collection_w.query_iterator(batch_size=batch_size, expr=default_expr,
check_task=CheckTasks.check_query_iterator,
check_items={"count": ct.default_nb,
"batch_size": batch_size})
class TestQueryIterator(TestcaseBase):
"""

View File

@ -6380,6 +6380,35 @@ class TestSearchPagination(TestcaseBase):
default_limit, offset=offset)[0]
assert res1[0].ids == res2[0].ids
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("offset", [1, 5, 20])
def test_search_sparse_with_pagination(self, offset):
"""
target: test search sparse with pagination
method: 1. connect and create a collection
2. search with pagination (offset)
3. search again with limit = offset + limit
4. compare the two result sets; the overlapping ids should be the same
expected: search successfully and ids are correct
"""
# 1. create a collection
auto_id = False
collection_w, _, _, insert_ids = \
self.init_collection_general(
prefix, True, auto_id=auto_id, vector_data_type=ct.sparse_vector)[0:4]
# 2. search with offset+limit
search_param = {"metric_type": "IP", "params": {"drop_ratio_search": "0.2"}, "offset": offset}
search_vectors = cf.gen_default_list_sparse_data()[-1][-2:]
search_res = collection_w.search(search_vectors, ct.default_sparse_vec_field_name,
search_param, default_limit)[0]
# 3. search
_search_param = {"metric_type": "IP", "params": {"drop_ratio_search": "0.2"}}
res = collection_w.search(search_vectors[:default_nq], ct.default_sparse_vec_field_name, _search_param,
default_limit + offset)[0]
assert len(search_res[0].ids) == len(res[0].ids[offset:])
assert sorted(search_res[0].distances, key=numpy.float32) == sorted(
res[0].distances[offset:], key=numpy.float32)
class TestSearchPaginationInvalid(TestcaseBase):
""" Test case of search pagination """
@ -6932,7 +6961,7 @@ class TestCollectionRangeSearch(TestcaseBase):
******************************************************************
"""
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("vector_data_type", ct.all_float_vector_types)
@pytest.mark.parametrize("vector_data_type", ["FLOAT_VECTOR", "FLOAT16_VECTOR", "BFLOAT16_VECTOR"])
def test_range_search_default(self, index_type, metric, vector_data_type):
"""
target: verify the range search returns correct results
@ -8346,6 +8375,33 @@ class TestCollectionRangeSearch(TestcaseBase):
"limit": nb_old + nb_new,
"_async": _async})
@pytest.mark.tags(CaseLabel.L2)
def test_range_search_sparse(self):
"""
target: test sparse index normal range search
method: create connection, collection, insert and range search
expected: range search successfully
"""
# 1. initialize with data
collection_w = self.init_collection_general(prefix, True, nb=5000,
with_json=True,
vector_data_type=ct.sparse_vector)[0]
range_filter = random.uniform(0.5, 1)
radius = random.uniform(0, 0.5)
# 2. range search
range_search_params = {"metric_type": "IP",
"params": {"radius": radius, "range_filter": range_filter}}
d = cf.gen_default_list_sparse_data(nb=1)
search_res = collection_w.search(d[-1][-1:], ct.default_sparse_vec_field_name,
range_search_params, default_limit,
default_search_exp)[0]
# 3. check the results: for IP, a larger distance means a closer match, so every hit must fall in (radius, range_filter]
for hits in search_res:
for distance in hits.distances:
assert range_filter >= distance > radius
class TestCollectionLoadOperation(TestcaseBase):
""" Test case of search combining load and other functions """
@ -10656,6 +10712,53 @@ class TestSearchGroupBy(TestcaseBase):
check_task=CheckTasks.check_search_results,
check_items={"nq": nq, "limit": limit})
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
def test_sparse_vectors_group_by(self, index):
"""
target: test search group by works on a collection with sparse vectors
method: 1. create a collection
2. create an index
3. search with group_by_field set
expected: search succeeds and group_by deduplicates the varchar values
"""
self._connect()
c_name = cf.gen_unique_str(prefix)
schema = cf.gen_default_sparse_schema()
collection_w, _ = self.collection_wrap.init_collection(c_name, schema=schema)
nb = 5000
data = cf.gen_default_list_sparse_data(nb=nb)
# overwrite the varchar field with repeated values so group_by has duplicates to collapse
_data = [random.randint(1, 100) for _ in range(nb)]
str_data = [str(i) for i in _data]
data[2] = str_data
collection_w.insert(data)
params = cf.get_index_params_params(index)
index_params = {"index_type": index, "metric_type": "IP", "params": params}
collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
collection_w.load()
nq = 2
limit = 20
search_params = ct.default_sparse_search_params
search_vectors = cf.gen_default_list_sparse_data(nb=nq)[-1][-2:]
# search with group_by on the varchar field; duplicates should be collapsed (verified below)
res = collection_w.search(data=search_vectors, anns_field=ct.default_sparse_vec_field_name,
param=search_params, limit=limit,
group_by_field="varchar",
output_fields=["varchar"],
check_task=CheckTasks.check_search_results,
check_items={"nq": nq, "limit": limit})
hit = res[0]
set_varchar = set()
for item in hit:
a = list(item.fields.values())
set_varchar.add(a[0])
# group by is in effect, so there are no duplicate varchar values
assert len(hit) == len(set_varchar)
class TestCollectionHybridSearchValid(TestcaseBase):
""" Test case of search interface """
@ -12534,6 +12637,64 @@ class TestCollectionHybridSearchValid(TestcaseBase):
for i in range(nq):
assert is_sorted_descend(res[i].distances)
@pytest.mark.tags(CaseLabel.L2)
def test_hybrid_search_sparse_normal(self):
"""
target: test hybrid search on multiple sparse vector fields
method: search each sparse field separately, build the RRF baseline, then hybrid search
expected: hybrid search succeeds with limit (topK) and matches the baseline
"""
nb, auto_id, dim, enable_dynamic_field = 20000, False, 768, False
# 1. init collection
collection_w, insert_vectors, _, insert_ids = self.init_collection_general(prefix, True, nb=nb,
multiple_dim_array=[dim, dim*2], with_json=False,
vector_data_type="SPARSE_FLOAT_VECTOR")[0:4]
# 2. extract vector field name
vector_name_list = cf.extract_vector_field_name_list(collection_w)
# 3. prepare search params
req_list = []
search_res_dict_array = []
k = 60
for i in range(len(vector_name_list)):
# vector = cf.gen_sparse_vectors(1, dim)
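# insert_vectors[0] is column-ordered as [int64, float, varchar, vec0, vec1, ...] with with_json=False, hence the i + 3 offset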
vector = insert_vectors[0][i+3][-1:]
search_res_dict = {}
search_param = {
"data": vector,
"anns_field": vector_name_list[i],
"param": {"metric_type": "IP", "offset": 0},
"limit": default_limit,
"expr": "int64 > 0"}
req = AnnSearchRequest(**search_param)
req_list.append(req)
# search for get the base line of hybrid_search
search_res = collection_w.search(vector, vector_name_list[i],
default_search_params, default_limit,
default_search_exp,
check_task=CheckTasks.check_search_results,
check_items={"nq": 1,
"ids": insert_ids,
# "limit": default_limit
}
)[0]
ids = search_res[0].ids
for j in range(len(ids)):
search_res_dict[ids[j]] = 1 / (j + k + 1)
search_res_dict_array.append(search_res_dict)
# 4. calculate hybrid search base line for RRFRanker
ids_answer, score_answer = cf.get_hybrid_search_base_results_rrf(search_res_dict_array)
# 5. hybrid search
hybrid_res = collection_w.hybrid_search(req_list, RRFRanker(k), default_limit,
check_task=CheckTasks.check_search_results,
check_items={"nq": 1,
"ids": insert_ids,
"limit": default_limit})[0]
# 6. compare results through the re-calculated distances
for i in range(len(score_answer[:default_limit])):
delta = math.fabs(score_answer[i] - hybrid_res[0].distances[i])
assert delta < hybrid_search_epsilon
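cf.get_hybrid_search_base_results_rrf is not part of this diff; given the 1 / (j + k + 1) per-hit scores accumulated above, it presumably reduces the dicts along these lines:

def rrf_baseline(search_res_dict_array, limit=10):
    # sum reciprocal-rank scores per id across the single-field result lists,
    # then rank ids by total score, mirroring RRFRanker(k)
    totals = {}
    for res_dict in search_res_dict_array:
        for pk, score in res_dict.items():
            totals[pk] = totals.get(pk, 0.0) + score
    ranked = sorted(totals.items(), key=lambda kv: kv[1], reverse=True)[:limit]
    return [pk for pk, _ in ranked], [score for _, score in ranked]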
class TestSparseSearch(TestcaseBase):
""" Add some test cases for the sparse vector """
@ -12550,7 +12711,7 @@ class TestSparseSearch(TestcaseBase):
c_name = cf.gen_unique_str(prefix)
schema = cf.gen_default_sparse_schema(auto_id=False)
collection_w, _ = self.collection_wrap.init_collection(c_name, schema=schema)
data = cf.gen_default_list_sparse_data()
data = cf.gen_default_list_sparse_data(nb=10000)
collection_w.insert(data)
params = cf.get_index_params_params(index)
index_params = {"index_type": index, "metric_type": "IP", "params": params}
@ -12562,6 +12723,12 @@ class TestSparseSearch(TestcaseBase):
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"limit": default_limit})
expr = "int64 < 100 "
collection_w.search(data[-1][-1:], ct.default_sparse_vec_field_name,
ct.default_sparse_search_params, default_limit,
expr,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq})
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
@ -12624,3 +12791,83 @@ class TestSparseSearch(TestcaseBase):
term_expr = f'{ct.default_int64_field_name} in [0, 1, 10, 100]'
res = collection_w.query(term_expr)
assert len(res) == 4
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("ratio", [0.01, 0.1, 0.5, 0.9])
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
def test_search_sparse_ratio(self, ratio, index):
"""
target: test search on sparse indexes built with different ratio parameters
method: create a sparse index with each drop_ratio_build value, then search with the matching drop_ratio_search
expected: search successfully
"""
self._connect()
c_name = cf.gen_unique_str(prefix)
schema = cf.gen_default_sparse_schema(auto_id=False)
collection_w, _ = self.collection_wrap.init_collection(c_name, schema=schema)
data = cf.gen_default_list_sparse_data(nb=10000)
collection_w.insert(data)
params = {"index_type": index, "metric_type": "IP", "params": {"drop_ratio_build": ratio}}
collection_w.create_index(ct.default_sparse_vec_field_name, params, index_name=index)
collection_w.load()
assert collection_w.has_index(index_name=index)
search_params = {"metric_type": "IP", "params": {"drop_ratio_search": ratio}}
collection_w.search(data[-1][-1:], ct.default_sparse_vec_field_name,
search_params, default_limit,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"limit": default_limit})
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
def test_sparse_vector_search_output_field(self, index):
"""
target: test sparse vector search with output_fields
method: insert sparse vectors, build an index, then search with output_fields set
expected: search succeeds and returns the requested fields
"""
self._connect()
c_name = cf.gen_unique_str(prefix)
schema = cf.gen_default_sparse_schema()
collection_w, _ = self.collection_wrap.init_collection(c_name, schema=schema)
data = cf.gen_default_list_sparse_data(nb=10000)
collection_w.insert(data)
params = cf.get_index_params_params(index)
index_params = {"index_type": index, "metric_type": "IP", "params": params}
collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
collection_w.load()
d = cf.gen_default_list_sparse_data(nb=1)
collection_w.search(d[-1][-1:], ct.default_sparse_vec_field_name,
ct.default_sparse_search_params, default_limit,
output_fields=["float", "sparse_vector"],
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"limit": default_limit,
"output_fields": ["float", "sparse_vector"]
})
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
def test_sparse_vector_search_iterator(self, index):
"""
target: test search_iterator on sparse vectors
method: insert sparse vectors, build an index, then iterate over search results in batches
expected: iterator returns results normally
"""
self._connect()
c_name = cf.gen_unique_str(prefix)
schema = cf.gen_default_sparse_schema()
collection_w, _ = self.collection_wrap.init_collection(c_name, schema=schema)
data = cf.gen_default_list_sparse_data(nb=10000)
collection_w.insert(data)
params = cf.get_index_params_params(index)
index_params = {"index_type": index, "metric_type": "IP", "params": params}
collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
collection_w.load()
batch_size = 10
collection_w.search_iterator(data[-1][-1:], ct.default_sparse_vec_field_name,
ct.default_sparse_search_params, batch_size,
check_task=CheckTasks.check_search_iterator,
check_items={"batch_size": batch_size})