From 1216a4bcd87ab30f973ba62b1bfbbdc392727539 Mon Sep 17 00:00:00 2001 From: elstic Date: Wed, 19 Jun 2024 15:24:09 +0800 Subject: [PATCH] test: Add more sparse test cases (#33916) issue: https://github.com/milvus-io/milvus/issues/31483 Signed-off-by: elstic --- tests/python_client/base/client_base.py | 8 + tests/python_client/common/common_func.py | 58 ++-- tests/python_client/common/common_type.py | 3 +- .../testcases/test_collection.py | 19 ++ tests/python_client/testcases/test_index.py | 43 +++ tests/python_client/testcases/test_insert.py | 43 +++ tests/python_client/testcases/test_query.py | 31 +++ tests/python_client/testcases/test_search.py | 251 +++++++++++++++++- 8 files changed, 433 insertions(+), 23 deletions(-) diff --git a/tests/python_client/base/client_base.py b/tests/python_client/base/client_base.py index 0b52845885..7f6b56b1fd 100644 --- a/tests/python_client/base/client_base.py +++ b/tests/python_client/base/client_base.py @@ -262,6 +262,11 @@ class TestcaseBase(Base): if is_binary: default_schema = cf.gen_default_binary_collection_schema(auto_id=auto_id, dim=dim, primary_field=primary_field) + if vector_data_type == ct.sparse_vector: + default_schema = cf.gen_default_sparse_schema(auto_id=auto_id, primary_field=primary_field, + enable_dynamic_field=enable_dynamic_field, + with_json=with_json, + multiple_dim_array=multiple_dim_array) if is_all_data_type: default_schema = cf.gen_collection_schema_all_datatype(auto_id=auto_id, dim=dim, primary_field=primary_field, @@ -289,6 +294,9 @@ class TestcaseBase(Base): # This condition will be removed after auto index feature if is_binary: collection_w.create_index(ct.default_binary_vec_field_name, ct.default_bin_flat_index) + elif vector_data_type == ct.sparse_vector: + for vector_name in vector_name_list: + collection_w.create_index(vector_name, ct.default_sparse_inverted_index) else: if len(multiple_dim_array) == 0 or is_all_data_type == False: vector_name_list.append(ct.default_float_vec_field_name) diff --git a/tests/python_client/common/common_func.py b/tests/python_client/common/common_func.py index 71d3328ced..c425c6e350 100644 --- a/tests/python_client/common/common_func.py +++ b/tests/python_client/common/common_func.py @@ -145,6 +145,12 @@ def gen_double_field(name=ct.default_double_field_name, is_primary=False, descri def gen_float_vec_field(name=ct.default_float_vec_field_name, is_primary=False, dim=ct.default_dim, description=ct.default_desc, vector_data_type="FLOAT_VECTOR", **kwargs): + if vector_data_type == "SPARSE_FLOAT_VECTOR": + dtype = DataType.SPARSE_FLOAT_VECTOR + float_vec_field, _ = ApiFieldSchemaWrapper().init_field_schema(name=name, dtype=dtype, + description=description, + is_primary=is_primary, **kwargs) + return float_vec_field if vector_data_type == "FLOAT_VECTOR": dtype = DataType.FLOAT_VECTOR elif vector_data_type == "FLOAT16_VECTOR": @@ -358,9 +364,14 @@ def gen_collection_schema_all_datatype(description=ct.default_desc, else: multiple_dim_array.insert(0, dim) for i in range(len(multiple_dim_array)): - fields.append(gen_float_vec_field(name=f"multiple_vector_{ct.all_float_vector_types[i%3]}", + if ct.all_float_vector_types[i%3] != ct.sparse_vector: + fields.append(gen_float_vec_field(name=f"multiple_vector_{ct.all_float_vector_types[i%3]}", dim=multiple_dim_array[i], vector_data_type=ct.all_float_vector_types[i%3])) + else: + # The field of a sparse vector cannot be dimensioned + fields.append(gen_float_vec_field(name=f"multiple_vector_{ct.sparse_vector}", + vector_data_type=ct.sparse_vector)) 
schema, _ = ApiCollectionSchemaWrapper().init_collection_schema(fields=fields, description=description, primary_field=primary_field, auto_id=auto_id, @@ -384,8 +395,17 @@ def gen_default_binary_collection_schema(description=ct.default_desc, primary_fi def gen_default_sparse_schema(description=ct.default_desc, primary_field=ct.default_int64_field_name, - auto_id=False, **kwargs): + auto_id=False, with_json=False, multiple_dim_array=[], **kwargs): + fields = [gen_int64_field(), gen_float_field(), gen_string_field(), gen_sparse_vec_field()] + if with_json: + fields.insert(-1, gen_json_field()) + + if len(multiple_dim_array) != 0: + for i in range(len(multiple_dim_array)): + vec_name = ct.default_sparse_vec_field_name + "_" + str(i) + vec_field = gen_sparse_vec_field(name=vec_name) + fields.append(vec_field) sparse_schema, _ = ApiCollectionSchemaWrapper().init_collection_schema(fields=fields, description=description, primary_field=primary_field, auto_id=auto_id, **kwargs) @@ -418,7 +438,7 @@ def gen_vectors(nb, dim, vector_data_type="FLOAT_VECTOR"): vectors = gen_fp16_vectors(nb, dim)[1] elif vector_data_type == "BFLOAT16_VECTOR": vectors = gen_bf16_vectors(nb, dim)[1] - elif vector_data_type == "SPARSE_VECTOR": + elif vector_data_type == "SPARSE_FLOAT_VECTOR": vectors = gen_sparse_vectors(nb, dim) if dim > 1: @@ -508,10 +528,10 @@ def gen_general_default_list_data(nb=ct.default_nb, dim=ct.default_dim, start=0, index = 2 del insert_list[index] if len(multiple_dim_array) != 0: - if len(multiple_vector_field_name) != len(multiple_dim_array): - log.error("multiple vector feature is enabled, please input the vector field name list " - "not including the default vector field") - assert len(multiple_vector_field_name) == len(multiple_dim_array) + # if len(multiple_vector_field_name) != len(multiple_dim_array): + # log.error("multiple vector feature is enabled, please input the vector field name list " + # "not including the default vector field") + # assert len(multiple_vector_field_name) == len(multiple_dim_array) for i in range(len(multiple_dim_array)): new_float_vec_values = gen_vectors(nb, multiple_dim_array[i], vector_data_type=vector_data_type) insert_list.append(new_float_vec_values) @@ -832,7 +852,7 @@ def gen_default_list_sparse_data(nb=ct.default_nb, dim=ct.default_dim, start=0, string_values = [str(i) for i in range(start, start + nb)] json_values = [{"number": i, "string": str(i), "bool": bool(i), "list": [j for j in range(0, i)]} for i in range(start, start + nb)] - sparse_vec_values = gen_vectors(nb, dim, vector_data_type="SPARSE_VECTOR") + sparse_vec_values = gen_vectors(nb, dim, vector_data_type="SPARSE_FLOAT_VECTOR") if with_json: data = [int_values, float_values, string_values, json_values, sparse_vec_values] else: @@ -1772,7 +1792,7 @@ def insert_data(collection_w, nb=ct.default_nb, is_binary=False, is_all_data_typ multiple_vector_field_name=vector_name_list, vector_data_type=vector_data_type, auto_id=auto_id, primary_field=primary_field) - elif vector_data_type == "FLOAT16_VECTOR" or "BFLOAT16_VECTOR": + elif vector_data_type in ct.all_float_vector_types: default_data = gen_general_default_list_data(nb // num, dim=dim, start=start, with_json=with_json, random_primary_key=random_primary_key, multiple_dim_array=multiple_dim_array, @@ -1972,14 +1992,10 @@ def extract_vector_field_name_list(collection_w): fields = schema_dict.get('fields') vector_name_list = [] for field in fields: - if str(field['type']) in ["101", "102", "103"]: - if field['name'] != 
ct.default_float_vec_field_name: - vector_name_list.append(field['name']) - - for field in fields: - if str(field['type']) == 'DataType.FLOAT_VECTOR' \ - or str(field['type']) == 'DataType.FLOAT16_VECTOR' \ - or str(field['type']) == 'DataType.BFLOAT16_VECTOR': + if field['type'] == DataType.FLOAT_VECTOR \ + or field['type'] == DataType.FLOAT16_VECTOR \ + or field['type'] == DataType.BFLOAT16_VECTOR \ + or field['type'] == DataType.SPARSE_FLOAT_VECTOR: if field['name'] != ct.default_float_vec_field_name: vector_name_list.append(field['name']) @@ -2120,11 +2136,13 @@ def gen_vectors_based_on_vector_type(num, dim, vector_data_type): fp16_vectors: the bytes used for insert return: raw_vectors and fp16_vectors """ - if vector_data_type == "FLOAT_VECTOR": + if vector_data_type == ct.float_type: vectors = [[random.random() for _ in range(dim)] for _ in range(num)] - elif vector_data_type == "FLOAT16_VECTOR": + elif vector_data_type == ct.float16_type: vectors = gen_fp16_vectors(num, dim)[1] - elif vector_data_type == "BFLOAT16_VECTOR": + elif vector_data_type == ct.bfloat16_type: vectors = gen_bf16_vectors(num, dim)[1] + elif vector_data_type == ct.sparse_vector: + vectors = gen_sparse_vectors(num, dim) return vectors diff --git a/tests/python_client/common/common_type.py b/tests/python_client/common/common_type.py index 45ad4b985b..33145b9b24 100644 --- a/tests/python_client/common/common_type.py +++ b/tests/python_client/common/common_type.py @@ -44,7 +44,8 @@ default_binary_vec_field_name = "binary_vector" float_type = "FLOAT_VECTOR" float16_type = "FLOAT16_VECTOR" bfloat16_type = "BFLOAT16_VECTOR" -all_float_vector_types = [float_type, float16_type, bfloat16_type] +sparse_vector = "SPARSE_FLOAT_VECTOR" +all_float_vector_types = [float16_type, bfloat16_type, sparse_vector] default_sparse_vec_field_name = "sparse_vector" default_partition_name = "_default" default_resource_group_name = '__default_resource_group' diff --git a/tests/python_client/testcases/test_collection.py b/tests/python_client/testcases/test_collection.py index 71084a0791..a90daff717 100644 --- a/tests/python_client/testcases/test_collection.py +++ b/tests/python_client/testcases/test_collection.py @@ -50,6 +50,7 @@ vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_ default_search_field = ct.default_float_vec_field_name default_search_params = ct.default_search_params max_vector_field_num = ct.max_vector_field_num +SPARSE_FLOAT_VECTOR_data_type = "SPARSE_FLOAT_VECTOR" class TestCollectionParams(TestcaseBase): @@ -1047,6 +1048,24 @@ class TestCollectionParams(TestcaseBase): error = {ct.err_code: 65535, ct.err_msg: "maximum field's number should be limited to 64"} self.collection_wrap.init_collection(c_name, schema=schema, check_task=CheckTasks.err_res, check_items=error) + @pytest.mark.tags(CaseLabel.L2) + def test_collection_multi_sparse_vectors(self): + """ + target: Test multiple sparse vectors in a collection + method: create 2 sparse vectors in a collection + expected: successful creation of a collection + """ + # 1. connect + self._connect() + # 2. 
create a collection with multiple sparse vector fields + c_name = cf.gen_unique_str(prefix) + fields = [cf.gen_int64_field(is_primary=True), cf.gen_float_field(), + cf.gen_float_vec_field(vector_data_type=ct.sparse_vector), cf.gen_float_vec_field(name="tmp", vector_data_type=ct.sparse_vector)] + schema = cf.gen_collection_schema(fields=fields) + self.collection_wrap.init_collection(c_name, schema=schema, + check_task=CheckTasks.check_collection_property, + check_items={exp_name: c_name, exp_schema: schema}) + class TestCollectionOperation(TestcaseBase): """ diff --git a/tests/python_client/testcases/test_index.py b/tests/python_client/testcases/test_index.py index 753fb28cd5..a341282280 100644 --- a/tests/python_client/testcases/test_index.py +++ b/tests/python_client/testcases/test_index.py @@ -1,5 +1,7 @@ import random from time import sleep + +import numpy as np import pytest import copy @@ -1442,6 +1444,47 @@ class TestIndexInvalid(TestcaseBase): check_items={ct.err_code: 1, ct.err_msg: f"<'int' object has no attribute 'items'"}) + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("metric_type", ["L2", "COSINE", " ", "invalid"]) + @pytest.mark.parametrize("index", ct.all_index_types[9:11]) + def test_invalid_sparse_metric_type(self, metric_type, index): + """ + target: verify that creating a sparse index with an unsupported metric_type fails + method: create a sparse index with an unsupported metric_type + expected: raise exception + """ + c_name = cf.gen_unique_str(prefix) + schema = cf.gen_default_sparse_schema() + collection_w = self.init_collection_wrap(name=c_name, schema=schema) + data = cf.gen_default_list_sparse_data() + collection_w.insert(data=data) + param = cf.get_index_params_params(index) + params = {"index_type": index, "metric_type": metric_type, "params": param} + error = {ct.err_code: 65535, ct.err_msg: "only IP is the supported metric type for sparse index"} + index, _ = self.index_wrap.init_index(collection_w.collection, ct.default_sparse_vec_field_name, params, + check_task=CheckTasks.err_res, + check_items=error) + + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("ratio", [-0.5, 1, 3]) + @pytest.mark.parametrize("index", ct.all_index_types[9:11]) + def test_invalid_sparse_ratio(self, ratio, index): + """ + target: verify that creating a sparse index with an out-of-range drop_ratio_build fails + method: create a sparse index with an invalid drop_ratio_build value + expected: raise exception + """ + c_name = cf.gen_unique_str(prefix) + schema = cf.gen_default_sparse_schema() + collection_w = self.init_collection_wrap(name=c_name, schema=schema) + data = cf.gen_default_list_sparse_data() + collection_w.insert(data=data) + params = {"index_type": index, "metric_type": "IP", "params": {"drop_ratio_build": ratio}} + error = {ct.err_code: 1100, ct.err_msg: f"invalid drop_ratio_build: {ratio}, must be in range [0, 1): invalid parameter[expected=valid index params"} + index, _ = self.index_wrap.init_index(collection_w.collection, ct.default_sparse_vec_field_name, params, + check_task=CheckTasks.err_res, + check_items=error) + @pytest.mark.tags(CaseLabel.GPU) class TestNewIndexAsync(TestcaseBase): diff --git a/tests/python_client/testcases/test_insert.py b/tests/python_client/testcases/test_insert.py index 8d0b3e7cec..94f2fc8535 100644 --- a/tests/python_client/testcases/test_insert.py +++ b/tests/python_client/testcases/test_insert.py @@ -1348,6 +1348,25 @@ class TestInsertInvalid(TestcaseBase): error = {ct.err_code: 65535, ct.err_msg: "value '+Inf' is not a number or infinity"} collection_w.insert(data=data, check_task=CheckTasks.err_res,
check_items=error) + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("index ", ct.all_index_types[9:11]) + @pytest.mark.parametrize("invalid_vector_type ", ["FLOAT_VECTOR", "FLOAT16_VECTOR", "BFLOAT16_VECTOR"]) + def test_invalid_sparse_vector_data(self, index, invalid_vector_type): + """ + target: insert illegal data type + method: insert illegal data type + expected: raise exception + """ + c_name = cf.gen_unique_str(prefix) + schema = cf.gen_default_sparse_schema() + collection_w = self.init_collection_wrap(name=c_name, schema=schema) + nb = 100 + data = cf.gen_default_list_sparse_data(nb=nb)[:-1] + invalid_vec = cf.gen_vectors(nb, dim=128, vector_data_type=invalid_vector_type) + data.append(invalid_vec) + error = {ct.err_code: 1, ct.err_msg: 'input must be a sparse matrix in supported format'} + collection_w.insert(data=data, check_task=CheckTasks.err_res, check_items=error) + class TestInsertInvalidBinary(TestcaseBase): """ @@ -1872,6 +1891,30 @@ class TestUpsertValid(TestcaseBase): collection_w.upsert(df) assert collection_w.num_entities == ct.default_nb + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("index ", ct.all_index_types[9:11]) + def test_upsert_sparse_data(self, index): + """ + target: multiple upserts and counts(*) + method: multiple upserts and counts(*) + expected: number of data entries normal + """ + c_name = cf.gen_unique_str(prefix) + schema = cf.gen_default_sparse_schema() + collection_w = self.init_collection_wrap(name=c_name, schema=schema) + data = cf.gen_default_list_sparse_data(nb=ct.default_nb) + collection_w.upsert(data=data) + assert collection_w.num_entities == ct.default_nb + params = cf.get_index_params_params(index) + index_params = {"index_type": index, "metric_type": "IP", "params": params} + collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index) + collection_w.load() + for i in range(5): + collection_w.upsert(data=data) + collection_w.query(expr=f'{ct.default_int64_field_name} >= 0', output_fields=[ct.default_count_output] + , check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": ct.default_nb}]}) + class TestUpsertInvalid(TestcaseBase): """ Invalid test case of Upsert interface """ diff --git a/tests/python_client/testcases/test_query.py b/tests/python_client/testcases/test_query.py index d1e25351c5..23b53c83af 100644 --- a/tests/python_client/testcases/test_query.py +++ b/tests/python_client/testcases/test_query.py @@ -3691,6 +3691,37 @@ class TestQueryCount(TestcaseBase): check_task=CheckTasks.check_query_results, check_items={exp_res: [{count: res}]}) + @pytest.mark.tags(CaseLabel.L1) + @pytest.mark.parametrize("index", ct.all_index_types[9:11]) + def test_counts_expression_sparse_vectors(self, index): + """ + target: test count with expr + method: count with expr + expected: verify count + """ + self._connect() + c_name = cf.gen_unique_str(prefix) + schema = cf.gen_default_sparse_schema() + collection_w, _ = self.collection_wrap.init_collection(c_name, schema=schema) + data = cf.gen_default_list_sparse_data() + collection_w.insert(data) + params = cf.get_index_params_params(index) + index_params = {"index_type": index, "metric_type": "IP", "params": params} + collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index) + collection_w.load() + collection_w.query(expr=default_expr, output_fields=[count], + check_task=CheckTasks.check_query_results, + check_items={exp_res: [{count: ct.default_nb}]}) + expr = "int64 > 50 && int64 < 
100 && float < 75" + collection_w.query(expr=expr, output_fields=[count], + check_task=CheckTasks.check_query_results, + check_items={exp_res: [{count: 24}]}) + batch_size = 100 + collection_w.query_iterator(batch_size=batch_size, expr=default_expr, + check_task=CheckTasks.check_query_iterator, + check_items={"count": ct.default_nb, + "batch_size": batch_size}) + class TestQueryIterator(TestcaseBase): """ diff --git a/tests/python_client/testcases/test_search.py b/tests/python_client/testcases/test_search.py index e749d9fa8b..8b9627dc7e 100644 --- a/tests/python_client/testcases/test_search.py +++ b/tests/python_client/testcases/test_search.py @@ -6380,6 +6380,35 @@ class TestSearchPagination(TestcaseBase): default_limit, offset=offset)[0] assert res1[0].ids == res2[0].ids + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("offset", [1, 5, 20]) + def test_search_sparse_with_pagination(self, offset): + """ + target: test search sparse with pagination + method: 1. connect and create a collection + 2. search pagination with offset + 3. search with offset+limit + 4. compare with the search results whose corresponding ids should be the same + expected: search successfully and ids is correct + """ + # 1. create a collection + auto_id = False + collection_w, _, _, insert_ids = \ + self.init_collection_general( + prefix, True, auto_id=auto_id, vector_data_type=ct.sparse_vector)[0:4] + # 2. search with offset+limit + search_param = {"metric_type": "IP", "params": {"drop_ratio_search": "0.2"}, "offset": offset} + search_vectors = cf.gen_default_list_sparse_data()[-1][-2:] + search_res = collection_w.search(search_vectors, ct.default_sparse_vec_field_name, + search_param, default_limit)[0] + # 3. search + _search_param = {"metric_type": "IP", "params": {"drop_ratio_search": "0.2"}} + res = collection_w.search(search_vectors[:default_nq], ct.default_sparse_vec_field_name, _search_param, + default_limit + offset)[0] + assert len(search_res[0].ids) == len(res[0].ids[offset:]) + assert sorted(search_res[0].distances, key=numpy.float32) == sorted( + res[0].distances[offset:], key=numpy.float32) + class TestSearchPaginationInvalid(TestcaseBase): """ Test case of search pagination """ @@ -6932,7 +6961,7 @@ class TestCollectionRangeSearch(TestcaseBase): ****************************************************************** """ @pytest.mark.tags(CaseLabel.L0) - @pytest.mark.parametrize("vector_data_type", ct.all_float_vector_types) + @pytest.mark.parametrize("vector_data_type", ["FLOAT_VECTOR", "FLOAT16_VECTOR", "BFLOAT16_VECTOR"]) def test_range_search_default(self, index_type, metric, vector_data_type): """ target: verify the range search returns correct results @@ -8346,6 +8375,33 @@ class TestCollectionRangeSearch(TestcaseBase): "limit": nb_old + nb_new, "_async": _async}) + @pytest.mark.tags(CaseLabel.L2) + def test_range_search_sparse(self): + """ + target: test sparse index normal range search + method: create connection, collection, insert and range search + expected: range search successfully + """ + # 1. initialize with data + collection_w = self.init_collection_general(prefix, True, nb=5000, + with_json=True, + vector_data_type=ct.sparse_vector)[0] + range_filter = random.uniform(0.5, 1) + radius = random.uniform(0, 0.5) + + # 2. 
range search + range_search_params = {"metric_type": "IP", + "params": {"radius": radius, "range_filter": range_filter}} + d = cf.gen_default_list_sparse_data(nb=1) + search_res = collection_w.search(d[-1][-1:], ct.default_sparse_vec_field_name, + range_search_params, default_limit, + default_search_exp)[0] + + # 3. check search results + for hits in search_res: + for distance in hits.distances: + assert range_filter >= distance > radius + class TestCollectionLoadOperation(TestcaseBase): """ Test case of search combining load and other functions """ @@ -10656,6 +10712,53 @@ class TestSearchGroupBy(TestcaseBase): check_task=CheckTasks.check_search_results, check_items={"nq": nq, "limit": limit}) + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("index", ct.all_index_types[9:11]) + def test_sparse_vectors_group_by(self, index): + """ + target: test search group by works on a collection with sparse vector + method: 1. create a collection + 2. create index + 3. grouping search + expected: search successfully + """ + self._connect() + c_name = cf.gen_unique_str(prefix) + schema = cf.gen_default_sparse_schema() + collection_w, _ = self.collection_wrap.init_collection(c_name, schema=schema) + nb = 5000 + data = cf.gen_default_list_sparse_data(nb=nb) + # overwrite the varchar field with repeated values so group_by has duplicates + _data = [random.randint(1, 100) for _ in range(nb)] + str_data = [str(i) for i in _data] + data[2] = str_data + collection_w.insert(data) + params = cf.get_index_params_params(index) + index_params = {"index_type": index, "metric_type": "IP", "params": params} + collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index) + collection_w.load() + + nq = 2 + limit = 20 + search_params = ct.default_sparse_search_params + + search_vectors = cf.gen_default_list_sparse_data(nb=nq)[-1][-2:] + # group by the varchar field + res = collection_w.search(data=search_vectors, anns_field=ct.default_sparse_vec_field_name, + param=search_params, limit=limit, + group_by_field="varchar", + output_fields=["varchar"], + check_task=CheckTasks.check_search_results, + check_items={"nq": nq, "limit": limit}) + + hit = res[0] + set_varchar = set() + for item in hit: + a = list(item.fields.values()) + set_varchar.add(a[0]) + # group by is in effect, so there are no duplicate varchar values + assert len(hit) == len(set_varchar) + class TestCollectionHybridSearchValid(TestcaseBase): """ Test case of search interface """ @@ -12534,6 +12637,64 @@ class TestCollectionHybridSearchValid(TestcaseBase): for i in range(nq): assert is_sorted_descend(res[i].distances) + @pytest.mark.tags(CaseLabel.L2) + def test_hybrid_search_sparse_normal(self): + """ + target: test hybrid search after loading sparse vectors + method: hybrid search over multiple sparse vector fields and compare with the RRF baseline + expected: hybrid search successfully with limit(topK) + """ + nb, auto_id, dim, enable_dynamic_field = 20000, False, 768, False + # 1. init collection + collection_w, insert_vectors, _, insert_ids = self.init_collection_general(prefix, True, nb=nb, + multiple_dim_array=[dim, dim*2], with_json=False, + vector_data_type="SPARSE_FLOAT_VECTOR")[0:4] + # 2. extract vector field name + vector_name_list = cf.extract_vector_field_name_list(collection_w) + # 3. 
prepare search params + req_list = [] + search_res_dict_array = [] + k = 60 + + for i in range(len(vector_name_list)): + # vector = cf.gen_sparse_vectors(1, dim) + vector = insert_vectors[0][i+3][-1:] + search_res_dict = {} + search_param = { + "data": vector, + "anns_field": vector_name_list[i], + "param": {"metric_type": "IP", "offset": 0}, + "limit": default_limit, + "expr": "int64 > 0"} + req = AnnSearchRequest(**search_param) + req_list.append(req) + # search for get the base line of hybrid_search + search_res = collection_w.search(vector, vector_name_list[i], + default_search_params, default_limit, + default_search_exp, + check_task=CheckTasks.check_search_results, + check_items={"nq": 1, + "ids": insert_ids, + # "limit": default_limit + } + )[0] + ids = search_res[0].ids + for j in range(len(ids)): + search_res_dict[ids[j]] = 1/(j + k +1) + search_res_dict_array.append(search_res_dict) + # 4. calculate hybrid search base line for RRFRanker + ids_answer, score_answer = cf.get_hybrid_search_base_results_rrf(search_res_dict_array) + # 5. hybrid search + hybrid_res = collection_w.hybrid_search(req_list, RRFRanker(k), default_limit, + check_task=CheckTasks.check_search_results, + check_items={"nq": 1, + "ids": insert_ids, + "limit": default_limit})[0] + # 6. compare results through the re-calculated distances + for i in range(len(score_answer[:default_limit])): + delta = math.fabs(score_answer[i] - hybrid_res[0].distances[i]) + assert delta < hybrid_search_epsilon + class TestSparseSearch(TestcaseBase): """ Add some test cases for the sparse vector """ @@ -12550,7 +12711,7 @@ class TestSparseSearch(TestcaseBase): c_name = cf.gen_unique_str(prefix) schema = cf.gen_default_sparse_schema(auto_id=False) collection_w, _ = self.collection_wrap.init_collection(c_name, schema=schema) - data = cf.gen_default_list_sparse_data() + data = cf.gen_default_list_sparse_data(nb=10000) collection_w.insert(data) params = cf.get_index_params_params(index) index_params = {"index_type": index, "metric_type": "IP", "params": params} @@ -12562,6 +12723,12 @@ class TestSparseSearch(TestcaseBase): check_task=CheckTasks.check_search_results, check_items={"nq": default_nq, "limit": default_limit}) + expr = "int64 < 100 " + collection_w.search(data[-1][-1:], ct.default_sparse_vec_field_name, + ct.default_sparse_search_params, default_limit, + expr, + check_task=CheckTasks.check_search_results, + check_items={"nq": default_nq}) @pytest.mark.tags(CaseLabel.L2) @pytest.mark.parametrize("index", ct.all_index_types[9:11]) @@ -12624,3 +12791,83 @@ class TestSparseSearch(TestcaseBase): term_expr = f'{ct.default_int64_field_name} in [0, 1, 10, 100]' res = collection_w.query(term_expr) assert len(res) == 4 + + @pytest.mark.tags(CaseLabel.L1) + @pytest.mark.parametrize("ratio", [0.01, 0.1, 0.5, 0.9]) + @pytest.mark.parametrize("index", ct.all_index_types[9:11]) + def test_search_sparse_ratio(self, ratio, index): + """ + target: create a sparse index by adjusting the ratio parameter. + method: create a sparse index by adjusting the ratio parameter. 
+ expected: search successfully + """ + self._connect() + c_name = cf.gen_unique_str(prefix) + schema = cf.gen_default_sparse_schema(auto_id=False) + collection_w, _ = self.collection_wrap.init_collection(c_name, schema=schema) + data = cf.gen_default_list_sparse_data(nb=10000) + collection_w.insert(data) + params = {"index_type": index, "metric_type": "IP", "params": {"drop_ratio_build": ratio}} + collection_w.create_index(ct.default_sparse_vec_field_name, params, index_name=index) + collection_w.load() + assert collection_w.has_index(index_name=index) == True + search_params = {"metric_type": "IP", "params": {"drop_ratio_search": ratio}} + collection_w.search(data[-1][-1:], ct.default_sparse_vec_field_name, + search_params, default_limit, + check_task=CheckTasks.check_search_results, + check_items={"nq": default_nq, + "limit": default_limit}) + + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("index", ct.all_index_types[9:11]) + def test_sparse_vector_search_output_field(self, index): + """ + target: search sparse vectors and return output fields + method: search a sparse collection with output_fields specified + expected: search succeeds and the requested fields are returned + """ + self._connect() + c_name = cf.gen_unique_str(prefix) + schema = cf.gen_default_sparse_schema() + collection_w, _ = self.collection_wrap.init_collection(c_name, schema=schema) + data = cf.gen_default_list_sparse_data(nb=10000) + collection_w.insert(data) + params = cf.get_index_params_params(index) + index_params = {"index_type": index, "metric_type": "IP", "params": params} + collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index) + + collection_w.load() + d = cf.gen_default_list_sparse_data(nb=1) + collection_w.search(d[-1][-1:], ct.default_sparse_vec_field_name, + ct.default_sparse_search_params, default_limit, + output_fields=["float", "sparse_vector"], + check_task=CheckTasks.check_search_results, + check_items={"nq": default_nq, + "limit": default_limit, + "output_fields": ["float", "sparse_vector"] + }) + + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("index", ct.all_index_types[9:11]) + def test_sparse_vector_search_iterator(self, index): + """ + target: search iterator over sparse vectors + method: insert sparse vectors, build index, then iterate with search_iterator + expected: iterator returns results batch by batch + """ + self._connect() + c_name = cf.gen_unique_str(prefix) + schema = cf.gen_default_sparse_schema() + collection_w, _ = self.collection_wrap.init_collection(c_name, schema=schema) + data = cf.gen_default_list_sparse_data(nb=10000) + collection_w.insert(data) + params = cf.get_index_params_params(index) + index_params = {"index_type": index, "metric_type": "IP", "params": params} + collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index) + + collection_w.load() + batch_size = 10 + collection_w.search_iterator(data[-1][-1:], ct.default_sparse_vec_field_name, + ct.default_sparse_search_params, batch_size, + check_task=CheckTasks.check_search_iterator, + check_items={"batch_size": batch_size}) \ No newline at end of file
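Note on the data format these cases exercise: the rows produced by helpers such as cf.gen_sparse_vectors and cf.gen_default_list_sparse_data are sparse-matrix rows, and pymilvus accepts a plain dict of {dimension_index: weight} per row for a DataType.SPARSE_FLOAT_VECTOR field (the "sparse matrix in supported format" referenced by the insert error message above). The snippet below is a minimal illustrative sketch, not the repository's implementation; the helper name gen_sparse_rows and the chosen sparsity are assumptions.

import random

def gen_sparse_rows(nb: int, dim: int = 1000, nnz: int = 30):
    # Each row is a dict mapping a dimension index to a float weight, one of the
    # sparse formats accepted for SPARSE_FLOAT_VECTOR fields (per pymilvus docs).
    rows = []
    for _ in range(nb):
        indices = random.sample(range(dim), nnz)  # nnz distinct non-zero dimensions
        rows.append({i: random.random() for i in indices})
    return rows

# Usage sketch: gen_sparse_rows(10) could serve as the sparse column on insert, and
# gen_sparse_rows(1) as the query data for an IP-metric sparse search with params
# such as {"metric_type": "IP", "params": {"drop_ratio_search": 0.2}}.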