mirror of https://github.com/milvus-io/milvus.git
test: Add more sparse test cases (#33916)
issue: https://github.com/milvus-io/milvus/issues/31483

Signed-off-by: elstic <hao.wang@zilliz.com>

parent 6d5747cb3e
commit 1216a4bcd8
@@ -262,6 +262,11 @@ class TestcaseBase(Base):
         if is_binary:
             default_schema = cf.gen_default_binary_collection_schema(auto_id=auto_id, dim=dim,
                                                                      primary_field=primary_field)
+        if vector_data_type == ct.sparse_vector:
+            default_schema = cf.gen_default_sparse_schema(auto_id=auto_id, primary_field=primary_field,
+                                                          enable_dynamic_field=enable_dynamic_field,
+                                                          with_json=with_json,
+                                                          multiple_dim_array=multiple_dim_array)
         if is_all_data_type:
             default_schema = cf.gen_collection_schema_all_datatype(auto_id=auto_id, dim=dim,
                                                                    primary_field=primary_field,
@@ -289,6 +294,9 @@ class TestcaseBase(Base):
             # This condition will be removed after auto index feature
             if is_binary:
                 collection_w.create_index(ct.default_binary_vec_field_name, ct.default_bin_flat_index)
+            elif vector_data_type == ct.sparse_vector:
+                for vector_name in vector_name_list:
+                    collection_w.create_index(vector_name, ct.default_sparse_inverted_index)
             else:
                 if len(multiple_dim_array) == 0 or is_all_data_type == False:
                     vector_name_list.append(ct.default_float_vec_field_name)
@@ -145,6 +145,12 @@ def gen_double_field(name=ct.default_double_field_name, is_primary=False, descri
 
 def gen_float_vec_field(name=ct.default_float_vec_field_name, is_primary=False, dim=ct.default_dim,
                         description=ct.default_desc, vector_data_type="FLOAT_VECTOR", **kwargs):
+    if vector_data_type == "SPARSE_FLOAT_VECTOR":
+        dtype = DataType.SPARSE_FLOAT_VECTOR
+        float_vec_field, _ = ApiFieldSchemaWrapper().init_field_schema(name=name, dtype=dtype,
+                                                                       description=description,
+                                                                       is_primary=is_primary, **kwargs)
+        return float_vec_field
     if vector_data_type == "FLOAT_VECTOR":
         dtype = DataType.FLOAT_VECTOR
     elif vector_data_type == "FLOAT16_VECTOR":
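Side note, not part of the diff: the early return above exists because a sparse field carries no dim. A minimal sketch of the field this wrapper builds, using the public pymilvus API (assuming pymilvus 2.4+, which introduced SPARSE_FLOAT_VECTOR):

    from pymilvus import FieldSchema, DataType

    # no dim argument: the dimensionality of a sparse vector field is implied
    # by the largest index that appears in the inserted data
    sparse_field = FieldSchema(name="sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR)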
@@ -358,9 +364,14 @@ def gen_collection_schema_all_datatype(description=ct.default_desc,
     else:
         multiple_dim_array.insert(0, dim)
         for i in range(len(multiple_dim_array)):
-            fields.append(gen_float_vec_field(name=f"multiple_vector_{ct.all_float_vector_types[i%3]}",
-                                              dim=multiple_dim_array[i],
-                                              vector_data_type=ct.all_float_vector_types[i%3]))
+            if ct.all_float_vector_types[i%3] != ct.sparse_vector:
+                fields.append(gen_float_vec_field(name=f"multiple_vector_{ct.all_float_vector_types[i%3]}",
+                                                  dim=multiple_dim_array[i],
+                                                  vector_data_type=ct.all_float_vector_types[i%3]))
+            else:
+                # a sparse vector field does not take a dim parameter
+                fields.append(gen_float_vec_field(name=f"multiple_vector_{ct.sparse_vector}",
+                                                  vector_data_type=ct.sparse_vector))
 
     schema, _ = ApiCollectionSchemaWrapper().init_collection_schema(fields=fields, description=description,
                                                                     primary_field=primary_field, auto_id=auto_id,
@@ -384,8 +395,17 @@ def gen_default_binary_collection_schema(description=ct.default_desc, primary_fi
 
 
 def gen_default_sparse_schema(description=ct.default_desc, primary_field=ct.default_int64_field_name,
-                              auto_id=False, **kwargs):
+                              auto_id=False, with_json=False, multiple_dim_array=[], **kwargs):
     fields = [gen_int64_field(), gen_float_field(), gen_string_field(), gen_sparse_vec_field()]
+    if with_json:
+        fields.insert(-1, gen_json_field())
+
+    if len(multiple_dim_array) != 0:
+        for i in range(len(multiple_dim_array)):
+            vec_name = ct.default_sparse_vec_field_name + "_" + str(i)
+            vec_field = gen_sparse_vec_field(name=vec_name)
+            fields.append(vec_field)
+
     sparse_schema, _ = ApiCollectionSchemaWrapper().init_collection_schema(fields=fields, description=description,
                                                                            primary_field=primary_field,
                                                                            auto_id=auto_id, **kwargs)
@@ -418,7 +438,7 @@ def gen_vectors(nb, dim, vector_data_type="FLOAT_VECTOR"):
         vectors = gen_fp16_vectors(nb, dim)[1]
     elif vector_data_type == "BFLOAT16_VECTOR":
         vectors = gen_bf16_vectors(nb, dim)[1]
-    elif vector_data_type == "SPARSE_VECTOR":
+    elif vector_data_type == "SPARSE_FLOAT_VECTOR":
         vectors = gen_sparse_vectors(nb, dim)
 
     if dim > 1:
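Side note, not part of the diff: gen_sparse_vectors is the helper this branch dispatches to. Its body is not shown in this commit; a plausible sketch of the row format it produces (one {index: value} dict per entity) would be:

    import random

    def gen_sparse_vectors_sketch(nb, dim):
        # one dict per entity, mapping a sparse dimension index to a float weight
        return [{d: random.random() for d in random.sample(range(dim), k=min(10, dim))}
                for _ in range(nb)]

    rows = gen_sparse_vectors_sketch(3, 1000)  # e.g. [{17: 0.42, 305: 0.91, ...}, ...]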
@@ -508,10 +528,10 @@ def gen_general_default_list_data(nb=ct.default_nb, dim=ct.default_dim, start=0,
             index = 2
         del insert_list[index]
     if len(multiple_dim_array) != 0:
-        if len(multiple_vector_field_name) != len(multiple_dim_array):
-            log.error("multiple vector feature is enabled, please input the vector field name list "
-                      "not including the default vector field")
-            assert len(multiple_vector_field_name) == len(multiple_dim_array)
+        # if len(multiple_vector_field_name) != len(multiple_dim_array):
+        #     log.error("multiple vector feature is enabled, please input the vector field name list "
+        #               "not including the default vector field")
+        #     assert len(multiple_vector_field_name) == len(multiple_dim_array)
         for i in range(len(multiple_dim_array)):
             new_float_vec_values = gen_vectors(nb, multiple_dim_array[i], vector_data_type=vector_data_type)
             insert_list.append(new_float_vec_values)
@@ -832,7 +852,7 @@ def gen_default_list_sparse_data(nb=ct.default_nb, dim=ct.default_dim, start=0,
     string_values = [str(i) for i in range(start, start + nb)]
     json_values = [{"number": i, "string": str(i), "bool": bool(i), "list": [j for j in range(0, i)]}
                    for i in range(start, start + nb)]
-    sparse_vec_values = gen_vectors(nb, dim, vector_data_type="SPARSE_VECTOR")
+    sparse_vec_values = gen_vectors(nb, dim, vector_data_type="SPARSE_FLOAT_VECTOR")
     if with_json:
         data = [int_values, float_values, string_values, json_values, sparse_vec_values]
     else:
@@ -1772,7 +1792,7 @@ def insert_data(collection_w, nb=ct.default_nb, is_binary=False, is_all_data_typ
                                                      multiple_vector_field_name=vector_name_list,
                                                      vector_data_type=vector_data_type,
                                                      auto_id=auto_id, primary_field=primary_field)
-    elif vector_data_type == "FLOAT16_VECTOR" or "BFLOAT16_VECTOR":
+    elif vector_data_type in ct.all_float_vector_types:
         default_data = gen_general_default_list_data(nb // num, dim=dim, start=start, with_json=with_json,
                                                      random_primary_key=random_primary_key,
                                                      multiple_dim_array=multiple_dim_array,
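Side note, not part of the diff: the replaced condition was a genuine bug, not a style change. In Python, `x == "A" or "B"` parses as `(x == "A") or "B"`, and a non-empty string literal is always truthy, so the old branch matched every vector type:

    vector_data_type = "FLOAT_VECTOR"
    # parses as (vector_data_type == "FLOAT16_VECTOR") or "BFLOAT16_VECTOR"
    print(vector_data_type == "FLOAT16_VECTOR" or "BFLOAT16_VECTOR")   # "BFLOAT16_VECTOR", truthy
    print(vector_data_type in ("FLOAT16_VECTOR", "BFLOAT16_VECTOR"))   # False, the intended check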
@@ -1972,14 +1992,10 @@ def extract_vector_field_name_list(collection_w):
     fields = schema_dict.get('fields')
     vector_name_list = []
-    for field in fields:
-        if str(field['type']) in ["101", "102", "103"]:
-            if field['name'] != ct.default_float_vec_field_name:
-                vector_name_list.append(field['name'])
-
     for field in fields:
-        if str(field['type']) == 'DataType.FLOAT_VECTOR' \
-                or str(field['type']) == 'DataType.FLOAT16_VECTOR' \
-                or str(field['type']) == 'DataType.BFLOAT16_VECTOR':
+        if field['type'] == DataType.FLOAT_VECTOR \
+                or field['type'] == DataType.FLOAT16_VECTOR \
+                or field['type'] == DataType.BFLOAT16_VECTOR \
+                or field['type'] == DataType.SPARSE_FLOAT_VECTOR:
             if field['name'] != ct.default_float_vec_field_name:
                 vector_name_list.append(field['name'])
 
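Side note, not part of the diff: comparing the DataType member directly also sidesteps a Python version difference that likely explains why the old code checked both the numeric codes ("101", "102", "103") and the "DataType.*" strings. pymilvus DataType is an IntEnum, and IntEnum.__str__ changed to int.__str__ in Python 3.11:

    from enum import IntEnum

    class DataType(IntEnum):  # stand-in for pymilvus.DataType
        FLOAT_VECTOR = 101

    # str(DataType.FLOAT_VECTOR) is "DataType.FLOAT_VECTOR" before Python 3.11
    # and "101" from 3.11 on; the member comparison is stable on every version.
    print(DataType.FLOAT_VECTOR == 101)  # True everywhere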
@@ -2120,11 +2136,13 @@ def gen_vectors_based_on_vector_type(num, dim, vector_data_type):
     fp16_vectors: the bytes used for insert
     return: raw_vectors and fp16_vectors
     """
-    if vector_data_type == "FLOAT_VECTOR":
+    if vector_data_type == ct.float_type:
         vectors = [[random.random() for _ in range(dim)] for _ in range(num)]
-    elif vector_data_type == "FLOAT16_VECTOR":
+    elif vector_data_type == ct.float16_type:
         vectors = gen_fp16_vectors(num, dim)[1]
-    elif vector_data_type == "BFLOAT16_VECTOR":
+    elif vector_data_type == ct.bfloat16_type:
         vectors = gen_bf16_vectors(num, dim)[1]
+    elif vector_data_type == ct.sparse_vector:
+        vectors = gen_sparse_vectors(num, dim)
 
     return vectors
@@ -44,7 +44,8 @@ default_binary_vec_field_name = "binary_vector"
 float_type = "FLOAT_VECTOR"
 float16_type = "FLOAT16_VECTOR"
 bfloat16_type = "BFLOAT16_VECTOR"
-all_float_vector_types = [float_type, float16_type, bfloat16_type]
+sparse_vector = "SPARSE_FLOAT_VECTOR"
+all_float_vector_types = [float16_type, bfloat16_type, sparse_vector]
 default_sparse_vec_field_name = "sparse_vector"
 default_partition_name = "_default"
 default_resource_group_name = '__default_resource_group'
@@ -50,6 +50,7 @@ vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_
 default_search_field = ct.default_float_vec_field_name
 default_search_params = ct.default_search_params
 max_vector_field_num = ct.max_vector_field_num
+SPARSE_FLOAT_VECTOR_data_type = "SPARSE_FLOAT_VECTOR"
 
 
 class TestCollectionParams(TestcaseBase):
@@ -1047,6 +1048,24 @@ class TestCollectionParams(TestcaseBase):
         error = {ct.err_code: 65535, ct.err_msg: "maximum field's number should be limited to 64"}
         self.collection_wrap.init_collection(c_name, schema=schema, check_task=CheckTasks.err_res, check_items=error)
 
+    @pytest.mark.tags(CaseLabel.L2)
+    def test_collection_multi_sparse_vectors(self):
+        """
+        target: test multiple sparse vector fields in one collection
+        method: create a collection with two sparse vector fields
+        expected: the collection is created successfully
+        """
+        # 1. connect
+        self._connect()
+        # 2. create a collection with multiple sparse vector fields
+        c_name = cf.gen_unique_str(prefix)
+        fields = [cf.gen_int64_field(is_primary=True), cf.gen_float_field(),
+                  cf.gen_float_vec_field(vector_data_type=ct.sparse_vector), cf.gen_float_vec_field(name="tmp", vector_data_type=ct.sparse_vector)]
+        schema = cf.gen_collection_schema(fields=fields)
+        self.collection_wrap.init_collection(c_name, schema=schema,
+                                             check_task=CheckTasks.check_collection_property,
+                                             check_items={exp_name: c_name, exp_schema: schema})
+
 
 class TestCollectionOperation(TestcaseBase):
     """
@@ -1,5 +1,7 @@
 import random
+from time import sleep
+
 import numpy as np
 import pytest
 import copy
@@ -1442,6 +1444,47 @@ class TestIndexInvalid(TestcaseBase):
                                  check_items={ct.err_code: 1,
                                               ct.err_msg: f"<'int' object has no attribute 'items'"})
 
+    @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.parametrize("metric_type", ["L2", "COSINE", " ", "invalid"])
+    @pytest.mark.parametrize("index", ct.all_index_types[9:11])
+    def test_invalid_sparse_metric_type(self, metric_type, index):
+        """
+        target: create a sparse index with an unsupported metric_type
+        method: create an index with each unsupported metric_type
+        expected: raise exception
+        """
+        c_name = cf.gen_unique_str(prefix)
+        schema = cf.gen_default_sparse_schema()
+        collection_w = self.init_collection_wrap(name=c_name, schema=schema)
+        data = cf.gen_default_list_sparse_data()
+        collection_w.insert(data=data)
+        param = cf.get_index_params_params(index)
+        params = {"index_type": index, "metric_type": metric_type, "params": param}
+        error = {ct.err_code: 65535, ct.err_msg: "only IP is the supported metric type for sparse index"}
+        index, _ = self.index_wrap.init_index(collection_w.collection, ct.default_sparse_vec_field_name, params,
+                                              check_task=CheckTasks.err_res,
+                                              check_items=error)
+
+    @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.parametrize("ratio", [-0.5, 1, 3])
+    @pytest.mark.parametrize("index", ct.all_index_types[9:11])
+    def test_invalid_sparse_ratio(self, ratio, index):
+        """
+        target: create a sparse index with an out-of-range drop ratio
+        method: create an index with each invalid drop_ratio_build value
+        expected: raise exception
+        """
+        c_name = cf.gen_unique_str(prefix)
+        schema = cf.gen_default_sparse_schema()
+        collection_w = self.init_collection_wrap(name=c_name, schema=schema)
+        data = cf.gen_default_list_sparse_data()
+        collection_w.insert(data=data)
+        params = {"index_type": index, "metric_type": "IP", "params": {"drop_ratio_build": ratio}}
+        error = {ct.err_code: 1100, ct.err_msg: f"invalid drop_ratio_build: {ratio}, must be in range [0, 1): invalid parameter[expected=valid index params"}
+        index, _ = self.index_wrap.init_index(collection_w.collection, ct.default_sparse_vec_field_name, params,
+                                              check_task=CheckTasks.err_res,
+                                              check_items=error)
+
 
 @pytest.mark.tags(CaseLabel.GPU)
 class TestNewIndexAsync(TestcaseBase):
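Side note, not part of the diff: for contrast with the two negative cases above, this is roughly what a valid sparse index definition looks like (index and parameter names per the Milvus 2.4 docs; treat the concrete values as assumptions):

    valid_sparse_index = {
        "index_type": "SPARSE_INVERTED_INDEX",  # or "SPARSE_WAND"
        "metric_type": "IP",                    # the only metric sparse indexes accept
        "params": {"drop_ratio_build": 0.2},    # must lie in [0, 1)
    }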
@@ -1348,6 +1348,25 @@ class TestInsertInvalid(TestcaseBase):
         error = {ct.err_code: 65535, ct.err_msg: "value '+Inf' is not a number or infinity"}
         collection_w.insert(data=data, check_task=CheckTasks.err_res, check_items=error)
 
+    @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.parametrize("index", ct.all_index_types[9:11])
+    @pytest.mark.parametrize("invalid_vector_type", ["FLOAT_VECTOR", "FLOAT16_VECTOR", "BFLOAT16_VECTOR"])
+    def test_invalid_sparse_vector_data(self, index, invalid_vector_type):
+        """
+        target: insert dense vector data into a sparse vector field
+        method: replace the sparse column with dense vectors of each type
+        expected: raise exception
+        """
+        c_name = cf.gen_unique_str(prefix)
+        schema = cf.gen_default_sparse_schema()
+        collection_w = self.init_collection_wrap(name=c_name, schema=schema)
+        nb = 100
+        data = cf.gen_default_list_sparse_data(nb=nb)[:-1]
+        invalid_vec = cf.gen_vectors(nb, dim=128, vector_data_type=invalid_vector_type)
+        data.append(invalid_vec)
+        error = {ct.err_code: 1, ct.err_msg: 'input must be a sparse matrix in supported format'}
+        collection_w.insert(data=data, check_task=CheckTasks.err_res, check_items=error)
+
 
 class TestInsertInvalidBinary(TestcaseBase):
     """
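Side note, not part of the diff: the error message refers to a "sparse matrix in supported format". Per the pymilvus 2.4 docs, inputs like these pass validation, while the dense float lists generated above do not:

    from scipy.sparse import csr_matrix

    rows_as_dicts = [{1: 0.2, 7: 0.9}, {3: 1.0}]        # one {index: value} dict per entity
    rows_as_matrix = csr_matrix([[0.0, 0.2, 0.0, 0.9],  # scipy sparse matrices also work
                                 [0.0, 0.0, 1.0, 0.0]])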
@@ -1872,6 +1891,30 @@ class TestUpsertValid(TestcaseBase):
         collection_w.upsert(df)
         assert collection_w.num_entities == ct.default_nb
 
+    @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.parametrize("index", ct.all_index_types[9:11])
+    def test_upsert_sparse_data(self, index):
+        """
+        target: verify repeated upserts keep the row count stable
+        method: upsert the same sparse data several times, then run count(*)
+        expected: the entity count stays at the inserted number
+        """
+        c_name = cf.gen_unique_str(prefix)
+        schema = cf.gen_default_sparse_schema()
+        collection_w = self.init_collection_wrap(name=c_name, schema=schema)
+        data = cf.gen_default_list_sparse_data(nb=ct.default_nb)
+        collection_w.upsert(data=data)
+        assert collection_w.num_entities == ct.default_nb
+        params = cf.get_index_params_params(index)
+        index_params = {"index_type": index, "metric_type": "IP", "params": params}
+        collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
+        collection_w.load()
+        for i in range(5):
+            collection_w.upsert(data=data)
+        collection_w.query(expr=f'{ct.default_int64_field_name} >= 0', output_fields=[ct.default_count_output],
+                           check_task=CheckTasks.check_query_results,
+                           check_items={"exp_res": [{"count(*)": ct.default_nb}]})
+
 
 class TestUpsertInvalid(TestcaseBase):
     """ Invalid test case of Upsert interface """
@@ -3691,6 +3691,37 @@ class TestQueryCount(TestcaseBase):
                            check_task=CheckTasks.check_query_results,
                            check_items={exp_res: [{count: res}]})
 
+    @pytest.mark.tags(CaseLabel.L1)
+    @pytest.mark.parametrize("index", ct.all_index_types[9:11])
+    def test_counts_expression_sparse_vectors(self, index):
+        """
+        target: test count(*) with expressions on a sparse vector collection
+        method: insert sparse data, then count with and without a filter expression
+        expected: the counts match the number of matching entities
+        """
+        self._connect()
+        c_name = cf.gen_unique_str(prefix)
+        schema = cf.gen_default_sparse_schema()
+        collection_w, _ = self.collection_wrap.init_collection(c_name, schema=schema)
+        data = cf.gen_default_list_sparse_data()
+        collection_w.insert(data)
+        params = cf.get_index_params_params(index)
+        index_params = {"index_type": index, "metric_type": "IP", "params": params}
+        collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
+        collection_w.load()
+        collection_w.query(expr=default_expr, output_fields=[count],
+                           check_task=CheckTasks.check_query_results,
+                           check_items={exp_res: [{count: ct.default_nb}]})
+        expr = "int64 > 50 && int64 < 100 && float < 75"
+        collection_w.query(expr=expr, output_fields=[count],
+                           check_task=CheckTasks.check_query_results,
+                           check_items={exp_res: [{count: 24}]})
+        batch_size = 100
+        collection_w.query_iterator(batch_size=batch_size, expr=default_expr,
+                                    check_task=CheckTasks.check_query_iterator,
+                                    check_items={"count": ct.default_nb,
+                                                 "batch_size": batch_size})
+
 
 class TestQueryIterator(TestcaseBase):
     """
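Side note, not part of the diff: the expected count of 24 follows from the data generator. Assuming gen_default_list_sparse_data fills the int64 and float columns with the same running index (as the other default generators in common_func do), the filter keeps i with 50 < i < 100 and i < 75, i.e. 51 through 74:

    matches = [i for i in range(3000) if 50 < i < 100 and i < 75]
    assert len(matches) == 24  # 51..74 inclusive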
@@ -6380,6 +6380,35 @@ class TestSearchPagination(TestcaseBase):
                                          default_limit, offset=offset)[0]
         assert res1[0].ids == res2[0].ids
 
+    @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.parametrize("offset", [1, 5, 20])
+    def test_search_sparse_with_pagination(self, offset):
+        """
+        target: test sparse vector search with pagination
+        method: 1. connect and create a collection
+                2. search with a pagination offset
+                3. search again with limit + offset
+                4. compare the two result sets; the corresponding ids should match
+        expected: search succeeds and the ids are correct
+        """
+        # 1. create a collection
+        auto_id = False
+        collection_w, _, _, insert_ids = \
+            self.init_collection_general(
+                prefix, True, auto_id=auto_id, vector_data_type=ct.sparse_vector)[0:4]
+        # 2. search with an offset
+        search_param = {"metric_type": "IP", "params": {"drop_ratio_search": "0.2"}, "offset": offset}
+        search_vectors = cf.gen_default_list_sparse_data()[-1][-2:]
+        search_res = collection_w.search(search_vectors, ct.default_sparse_vec_field_name,
+                                         search_param, default_limit)[0]
+        # 3. search with limit + offset
+        _search_param = {"metric_type": "IP", "params": {"drop_ratio_search": "0.2"}}
+        res = collection_w.search(search_vectors[:default_nq], ct.default_sparse_vec_field_name, _search_param,
+                                  default_limit + offset)[0]
+        assert len(search_res[0].ids) == len(res[0].ids[offset:])
+        assert sorted(search_res[0].distances, key=numpy.float32) == sorted(
+            res[0].distances[offset:], key=numpy.float32)
+
 
 class TestSearchPaginationInvalid(TestcaseBase):
     """ Test case of search pagination """
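Side note, not part of the diff: the invariant this test relies on, stated compactly with hypothetical ids, is that paging with an offset is equivalent to slicing a deeper search:

    full = [11, 42, 7, 99, 3]  # search(limit=5).ids
    paged = full[2:]           # search(limit=3, offset=2).ids should equal this
    assert paged == [7, 99, 3]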
@@ -6932,7 +6961,7 @@ class TestCollectionRangeSearch(TestcaseBase):
     ******************************************************************
     """
     @pytest.mark.tags(CaseLabel.L0)
-    @pytest.mark.parametrize("vector_data_type", ct.all_float_vector_types)
+    @pytest.mark.parametrize("vector_data_type", ["FLOAT_VECTOR", "FLOAT16_VECTOR", "BFLOAT16_VECTOR"])
     def test_range_search_default(self, index_type, metric, vector_data_type):
         """
         target: verify the range search returns correct results
@@ -8346,6 +8375,33 @@ class TestCollectionRangeSearch(TestcaseBase):
                                          "limit": nb_old + nb_new,
                                          "_async": _async})
 
+    @pytest.mark.tags(CaseLabel.L2)
+    def test_range_search_sparse(self):
+        """
+        target: test normal range search on a sparse index
+        method: create connection, collection, insert and range search
+        expected: range search successfully
+        """
+        # 1. initialize with data
+        collection_w = self.init_collection_general(prefix, True, nb=5000,
+                                                    with_json=True,
+                                                    vector_data_type=ct.sparse_vector)[0]
+        range_filter = random.uniform(0.5, 1)
+        radius = random.uniform(0, 0.5)
+
+        # 2. range search
+        range_search_params = {"metric_type": "IP",
+                               "params": {"radius": radius, "range_filter": range_filter}}
+        d = cf.gen_default_list_sparse_data(nb=1)
+        search_res = collection_w.search(d[-1][-1:], ct.default_sparse_vec_field_name,
+                                         range_search_params, default_limit,
+                                         default_search_exp)[0]
+
+        # 3. check search results
+        for hits in search_res:
+            for distance in hits.distances:
+                assert range_filter >= distance > radius
+
 
 class TestCollectionLoadOperation(TestcaseBase):
     """ Test case of search combining load and other functions """
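Side note, not part of the diff: the direction of the final assertion is deliberate. IP is a similarity, so higher scores are better, and per the Milvus range-search docs the bounds invert relative to L2: radius is the exclusive lower bound and range_filter the inclusive upper bound.

    # keep hits with radius < score <= range_filter under IP
    range_search_params = {
        "metric_type": "IP",
        "params": {"radius": 0.3, "range_filter": 0.9},
    }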
@@ -10656,6 +10712,53 @@ class TestSearchGroupBy(TestcaseBase):
                             check_task=CheckTasks.check_search_results,
                             check_items={"nq": nq, "limit": limit})
 
+    @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.parametrize("index", ct.all_index_types[9:11])
+    def test_sparse_vectors_group_by(self, index):
+        """
+        target: test that grouping search works on a collection with sparse vectors
+        method: 1. create a collection
+                2. create an index
+                3. search with group_by_field
+        verify: search successfully
+        """
+        self._connect()
+        c_name = cf.gen_unique_str(prefix)
+        schema = cf.gen_default_sparse_schema()
+        collection_w, _ = self.collection_wrap.init_collection(c_name, schema=schema)
+        nb = 5000
+        data = cf.gen_default_list_sparse_data(nb=nb)
+        # overwrite the varchar column with repeated values so grouping has duplicates
+        _data = [random.randint(1, 100) for _ in range(nb)]
+        str_data = [str(i) for i in _data]
+        data[2] = str_data
+        collection_w.insert(data)
+        params = cf.get_index_params_params(index)
+        index_params = {"index_type": index, "metric_type": "IP", "params": params}
+        collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
+        collection_w.load()
+
+        nq = 2
+        limit = 20
+        search_params = ct.default_sparse_search_params
+
+        search_vectors = cf.gen_default_list_sparse_data(nb=nq)[-1][-2:]
+        # verify the results within each query are deduplicated by the group field
+        res = collection_w.search(data=search_vectors, anns_field=ct.default_sparse_vec_field_name,
+                                  param=search_params, limit=limit,
+                                  group_by_field="varchar",
+                                  output_fields=["varchar"],
+                                  check_task=CheckTasks.check_search_results,
+                                  check_items={"nq": nq, "limit": limit})
+
+        hit = res[0]
+        set_varchar = set()
+        for item in hit:
+            a = list(item.fields.values())
+            set_varchar.add(a[0])
+        # if group by is in effect, there are no duplicate varchar values
+        assert len(hit) == len(set_varchar)
+
 
 class TestCollectionHybridSearchValid(TestcaseBase):
     """ Test case of search interface """
@@ -12534,6 +12637,64 @@ class TestCollectionHybridSearchValid(TestcaseBase):
         for i in range(nq):
             assert is_sorted_descend(res[i].distances)
 
+    @pytest.mark.tags(CaseLabel.L2)
+    def test_hybrid_search_sparse_normal(self):
+        """
+        target: test hybrid search on multiple sparse vector fields
+        method: run per-field searches to build an RRF baseline, then hybrid search with RRFRanker
+        expected: hybrid search succeeds with limit(topK) and matches the recomputed scores
+        """
+        nb, auto_id, dim, enable_dynamic_field = 20000, False, 768, False
+        # 1. init collection
+        collection_w, insert_vectors, _, insert_ids = self.init_collection_general(prefix, True, nb=nb,
+                                                                                   multiple_dim_array=[dim, dim*2], with_json=False,
+                                                                                   vector_data_type="SPARSE_FLOAT_VECTOR")[0:4]
+        # 2. extract the vector field names
+        vector_name_list = cf.extract_vector_field_name_list(collection_w)
+        # 3. prepare search params
+        req_list = []
+        search_res_dict_array = []
+        k = 60
+
+        for i in range(len(vector_name_list)):
+            # vector = cf.gen_sparse_vectors(1, dim)
+            vector = insert_vectors[0][i+3][-1:]
+            search_res_dict = {}
+            search_param = {
+                "data": vector,
+                "anns_field": vector_name_list[i],
+                "param": {"metric_type": "IP", "offset": 0},
+                "limit": default_limit,
+                "expr": "int64 > 0"}
+            req = AnnSearchRequest(**search_param)
+            req_list.append(req)
+            # plain search to get the baseline for hybrid_search
+            search_res = collection_w.search(vector, vector_name_list[i],
+                                             default_search_params, default_limit,
+                                             default_search_exp,
+                                             check_task=CheckTasks.check_search_results,
+                                             check_items={"nq": 1,
+                                                          "ids": insert_ids,
+                                                          # "limit": default_limit
+                                                          }
+                                             )[0]
+            ids = search_res[0].ids
+            for j in range(len(ids)):
+                search_res_dict[ids[j]] = 1 / (j + k + 1)
+            search_res_dict_array.append(search_res_dict)
+
+        # 4. calculate the hybrid search baseline for RRFRanker
+        ids_answer, score_answer = cf.get_hybrid_search_base_results_rrf(search_res_dict_array)
+        # 5. hybrid search
+        hybrid_res = collection_w.hybrid_search(req_list, RRFRanker(k), default_limit,
+                                                check_task=CheckTasks.check_search_results,
+                                                check_items={"nq": 1,
+                                                             "ids": insert_ids,
+                                                             "limit": default_limit})[0]
+        # 6. compare the results via the recomputed distances
+        for i in range(len(score_answer[:default_limit])):
+            delta = math.fabs(score_answer[i] - hybrid_res[0].distances[i])
+            assert delta < hybrid_search_epsilon
+
 
 class TestSparseSearch(TestcaseBase):
     """ Add some test cases for the sparse vector """
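Side note, not part of the diff: the baseline recomputation above is Reciprocal Rank Fusion. Each per-field result list contributes 1/(k + rank) for a document, and with the 0-based rank j used here that is 1/(j + k + 1). A self-contained sketch:

    def rrf_score(zero_based_ranks, k=60):
        # sum the reciprocal-rank contributions from every search the document appears in
        return sum(1.0 / (j + k + 1) for j in zero_based_ranks)

    print(rrf_score([0, 2]))  # top hit in one search, third in another: 1/61 + 1/63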
@@ -12550,7 +12711,7 @@ class TestSparseSearch(TestcaseBase):
         c_name = cf.gen_unique_str(prefix)
         schema = cf.gen_default_sparse_schema(auto_id=False)
         collection_w, _ = self.collection_wrap.init_collection(c_name, schema=schema)
-        data = cf.gen_default_list_sparse_data()
+        data = cf.gen_default_list_sparse_data(nb=10000)
         collection_w.insert(data)
         params = cf.get_index_params_params(index)
         index_params = {"index_type": index, "metric_type": "IP", "params": params}
@@ -12562,6 +12723,12 @@ class TestSparseSearch(TestcaseBase):
                             check_task=CheckTasks.check_search_results,
                             check_items={"nq": default_nq,
                                          "limit": default_limit})
+        expr = "int64 < 100 "
+        collection_w.search(data[-1][-1:], ct.default_sparse_vec_field_name,
+                            ct.default_sparse_search_params, default_limit,
+                            expr,
+                            check_task=CheckTasks.check_search_results,
+                            check_items={"nq": default_nq})
 
     @pytest.mark.tags(CaseLabel.L2)
     @pytest.mark.parametrize("index", ct.all_index_types[9:11])
@@ -12624,3 +12791,83 @@ class TestSparseSearch(TestcaseBase):
         term_expr = f'{ct.default_int64_field_name} in [0, 1, 10, 100]'
         res = collection_w.query(term_expr)
         assert len(res) == 4
+
+    @pytest.mark.tags(CaseLabel.L1)
+    @pytest.mark.parametrize("ratio", [0.01, 0.1, 0.5, 0.9])
+    @pytest.mark.parametrize("index", ct.all_index_types[9:11])
+    def test_search_sparse_ratio(self, ratio, index):
+        """
+        target: create a sparse index with different ratio parameters and search
+        method: build the index with each drop_ratio_build value, then search with the matching drop_ratio_search
+        expected: search successfully
+        """
+        self._connect()
+        c_name = cf.gen_unique_str(prefix)
+        schema = cf.gen_default_sparse_schema(auto_id=False)
+        collection_w, _ = self.collection_wrap.init_collection(c_name, schema=schema)
+        data = cf.gen_default_list_sparse_data(nb=10000)
+        collection_w.insert(data)
+        params = {"index_type": index, "metric_type": "IP", "params": {"drop_ratio_build": ratio}}
+        collection_w.create_index(ct.default_sparse_vec_field_name, params, index_name=index)
+        collection_w.load()
+        assert collection_w.has_index(index_name=index) == True
+        search_params = {"metric_type": "IP", "params": {"drop_ratio_search": ratio}}
+        collection_w.search(data[-1][-1:], ct.default_sparse_vec_field_name,
+                            search_params, default_limit,
+                            check_task=CheckTasks.check_search_results,
+                            check_items={"nq": default_nq,
+                                         "limit": default_limit})
+
+    @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.parametrize("index", ct.all_index_types[9:11])
+    def test_sparse_vector_search_output_field(self, index):
+        """
+        target: search sparse vectors with output fields
+        method: create sparse vectors, search, and request output fields
+        expected: search succeeds and returns the requested fields
+        """
+        self._connect()
+        c_name = cf.gen_unique_str(prefix)
+        schema = cf.gen_default_sparse_schema()
+        collection_w, _ = self.collection_wrap.init_collection(c_name, schema=schema)
+        data = cf.gen_default_list_sparse_data(nb=10000)
+        collection_w.insert(data)
+        params = cf.get_index_params_params(index)
+        index_params = {"index_type": index, "metric_type": "IP", "params": params}
+        collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
+
+        collection_w.load()
+        d = cf.gen_default_list_sparse_data(nb=1)
+        collection_w.search(d[-1][-1:], ct.default_sparse_vec_field_name,
+                            ct.default_sparse_search_params, 5,
+                            output_fields=["float", "sparse_vector"],
+                            check_task=CheckTasks.check_search_results,
+                            check_items={"nq": default_nq,
+                                         "limit": default_limit,
+                                         "output_fields": ["float", "sparse_vector"]
+                                         })
+
+    @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.parametrize("index", ct.all_index_types[9:11])
+    def test_sparse_vector_search_iterator(self, index):
+        """
+        target: search sparse vectors with the search iterator
+        method: create sparse vectors and iterate over the search results
+        expected: normal search
+        """
+        self._connect()
+        c_name = cf.gen_unique_str(prefix)
+        schema = cf.gen_default_sparse_schema()
+        collection_w, _ = self.collection_wrap.init_collection(c_name, schema=schema)
+        data = cf.gen_default_list_sparse_data(nb=10000)
+        collection_w.insert(data)
+        params = cf.get_index_params_params(index)
+        index_params = {"index_type": index, "metric_type": "IP", "params": params}
+        collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
+
+        collection_w.load()
+        batch_size = 10
+        collection_w.search_iterator(data[-1][-1:], ct.default_sparse_vec_field_name,
+                                     ct.default_sparse_search_params, batch_size,
+                                     check_task=CheckTasks.check_search_iterator,
+                                     check_items={"batch_size": batch_size})
+