From 54150253e797c8cae3da1f9d5d6b407a7c14749f Mon Sep 17 00:00:00 2001
From: yanliang567 <82361606+yanliang567@users.noreply.github.com>
Date: Wed, 31 Jan 2024 17:23:03 +0800
Subject: [PATCH] enhance: Add more tests for groupby (#30346)

Related issue: #30033
skip the tests before bug fixes

---------

Signed-off-by: yanliang567
---
 tests/python_client/common/common_func.py    |   8 +-
 tests/python_client/testcases/test_search.py | 393 ++++++++++++++++++-
 2 files changed, 397 insertions(+), 4 deletions(-)

diff --git a/tests/python_client/common/common_func.py b/tests/python_client/common/common_func.py
index 4c5b77a97a..804897c0c1 100644
--- a/tests/python_client/common/common_func.py
+++ b/tests/python_client/common/common_func.py
@@ -540,7 +540,7 @@ def gen_default_rows_data_all_data_type(nb=ct.default_nb, dim=ct.default_dim, st
     return array
 
 
-def gen_default_binary_dataframe_data(nb=ct.default_nb, dim=ct.default_dim, start=0):
+def gen_default_binary_dataframe_data(nb=ct.default_nb, dim=ct.default_dim, start=0, auto_id=False):
     int_values = pd.Series(data=[i for i in range(start, start + nb)])
     float_values = pd.Series(data=[np.float32(i) for i in range(start, start + nb)], dtype="float32")
     string_values = pd.Series(data=[str(i) for i in range(start, start + nb)], dtype="string")
@@ -551,6 +551,12 @@ def gen_default_binary_dataframe_data(nb=ct.default_nb, dim=ct.default_dim, star
         ct.default_string_field_name: string_values,
         ct.default_binary_vec_field_name: binary_vec_values
     })
+    # when auto_id is enabled, rebuild the dataframe without the int64 pk column
+    # so that the server assigns the primary keys
+    if auto_id is True:
+        df = pd.DataFrame({
+            ct.default_float_field_name: float_values,
+            ct.default_string_field_name: string_values,
+            ct.default_binary_vec_field_name: binary_vec_values
+        })
     return df, binary_raw_values
 
 
diff --git a/tests/python_client/testcases/test_search.py b/tests/python_client/testcases/test_search.py
index cbbcbd86ea..7d86c2038d 100644
--- a/tests/python_client/testcases/test_search.py
+++ b/tests/python_client/testcases/test_search.py
@@ -9577,10 +9577,10 @@ class TestSearchIterator(TestcaseBase):
 class TestSearchGroupBy(TestcaseBase):
     """ Test case of search group by """
 
-    @pytest.mark.tags(CaseLabel.L1)
+    @pytest.mark.tags(CaseLabel.L0)
     @pytest.mark.parametrize("metric", ct.float_metrics)
-    @pytest.mark.xfail(reason="issue #29883")
-    def test_search_group_by(self, metric):
+    @pytest.mark.skip(reason="issue #29883")
+    def test_search_group_by_default(self, metric):
         """
         target: test search group by
         method: 1. create a collection with data
@@ -9647,3 +9647,390 @@ class TestSearchGroupBy(TestcaseBase):
             # verify no dup values of the group_by_field in results
             assert len(grpby_values) == len(set(grpby_values))
 
+    @pytest.mark.tags(CaseLabel.L1)
+    @pytest.mark.parametrize("metric", ["JACCARD", "HAMMING"])
+    @pytest.mark.skip(reason="issue #29883")
+    def test_search_binary_vec_group_by(self, metric):
+        """
+        target: test search group by with binary vectors
+        method: 1. create a collection with binary vectors
+                2. create index with different metric types
+                3. search with group by
+                   verify no duplicate values for group_by_field
+                4. search with filtering every value of group_by_field
+        verify: every record in the group-by results is the top1 for its value of the group_by_field
+        """
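+        # Note on the metrics: for binary vectors JACCARD and HAMMING are
+        # computed on bits. A minimal sketch of both distances between two
+        # packed uint8 arrays (illustrative only, not used by this test):
+        #   a, b = np.unpackbits(raw_a), np.unpackbits(raw_b)
+        #   hamming = np.sum(a != b)
+        #   jaccard = 1 - np.sum(a & b) / np.sum(a | b)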
+        collection_w = self.init_collection_general(prefix, auto_id=True, insert_data=False, is_index=False,
+                                                    is_binary=True)[0]
+        _index = {"index_type": "HNSW", "metric_type": metric, "params": {"M": 16, "efConstruction": 128}}
+        collection_w.create_index(ct.default_binary_vec_field_name, index_params=_index)
+        # insert with the same values for scalar fields
+        for _ in range(30):
+            data = cf.gen_default_binary_dataframe_data(nb=100, auto_id=True)[0]
+            collection_w.insert(data)
+
+        collection_w.flush()
+        collection_w.create_index(ct.default_binary_vec_field_name, index_params=_index)
+        time.sleep(30)
+        collection_w.load()
+
+        search_params = {"metric_type": metric, "params": {"ef": 128}}
+        nq = 2
+        limit = 10
+        search_vectors = cf.gen_binary_vectors(nq, dim=ct.default_dim)[1]
+
+        # verify the results are the same if grouping by the pk
+        res1 = collection_w.search(data=search_vectors, anns_field=ct.default_binary_vec_field_name,
+                                   param=search_params, limit=limit, consistency_level=CONSISTENCY_STRONG,
+                                   group_by_field=ct.default_int64_field_name)[0]
+        res2 = collection_w.search(data=search_vectors, anns_field=ct.default_binary_vec_field_name,
+                                   param=search_params, limit=limit, consistency_level=CONSISTENCY_STRONG)[0]
+        # for i in range(nq):
+        #     assert res1[i].ids == res2[i].ids
+
+        # verify that every record in the group-by results is the top1 for that value of the group_by_field
+        supported_grpby_fields = [ct.default_string_field_name]
+        for grpby_field in supported_grpby_fields:
+            res1 = collection_w.search(data=search_vectors, anns_field=ct.default_binary_vec_field_name,
+                                       param=search_params, limit=limit,
+                                       group_by_field=grpby_field,
+                                       output_fields=[grpby_field])[0]
+            for i in range(nq):
+                grpby_values = []
+                results_num = 2 if grpby_field == ct.default_bool_field_name else limit
+                for l in range(results_num):
+                    top1 = res1[i][l]
+                    top1_grpby_pk = top1.id
+                    top1_grpby_value = top1.fields.get(grpby_field)
+                    expr = f"{grpby_field}=={top1_grpby_value}"
+                    if grpby_field == ct.default_string_field_name:
+                        expr = f"{grpby_field}=='{top1_grpby_value}'"
+                    grpby_values.append(top1_grpby_value)
+                    res_tmp = collection_w.search(data=[search_vectors[i]], anns_field=ct.default_binary_vec_field_name,
+                                                  param=search_params, limit=1,
+                                                  expr=expr,
+                                                  output_fields=[grpby_field])[0]
+                    top1_expr_pk = res_tmp[0][0].id
+                    assert top1_grpby_pk == top1_expr_pk
+                # verify no dup values of the group_by_field in results
+                assert len(grpby_values) == len(set(grpby_values))
+
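+    # The group-by checks in this class share one pattern: for each returned
+    # entity, re-run the search filtered to that entity's group value and expect
+    # the same pk back as the top1 hit. A minimal sketch of the idea
+    # (hypothetical helper, not part of the test suite):
+    #   def assert_top1_of_group(coll, vec, anns_field, params, hit, grpby_field):
+    #       expr = f"{grpby_field} == {hit.fields.get(grpby_field)!r}"
+    #       res = coll.search([vec], anns_field, params, limit=1, expr=expr)[0]
+    #       assert hit.id == res[0][0].id
+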
+    @pytest.mark.skip(reason="issue #29883")
+    @pytest.mark.tags(CaseLabel.L0)
+    @pytest.mark.parametrize("grpby_field", [ct.default_string_field_name, ct.default_int8_field_name])
+    def test_search_group_by_with_field_indexed(self, grpby_field):
+        """
+        target: test search group by with the group field indexed
+        method: 1. create a collection with data
+                2. create index for the vector field and the groupby field
+                3. search with group by
+                4. search with filtering every value of group_by_field
+        verify: every record in the group-by results is the top1 for its value of the group_by_field
+        """
+        metric = "COSINE"
+        collection_w = self.init_collection_general(prefix, auto_id=True, insert_data=False, is_index=False,
+                                                    is_all_data_type=True, with_json=False)[0]
+        _index = {"index_type": "HNSW", "metric_type": metric, "params": {"M": 16, "efConstruction": 128}}
+        collection_w.create_index(ct.default_float_vec_field_name, index_params=_index)
+        # insert with the same values (per insert round) for scalar fields
+        for _ in range(100):
+            data = cf.gen_dataframe_all_data_type(nb=100, auto_id=True, with_json=False)
+            collection_w.insert(data)
+
+        collection_w.flush()
+        collection_w.create_index(ct.default_float_vec_field_name, index_params=_index)
+        collection_w.create_index(grpby_field)
+        time.sleep(30)
+        collection_w.load()
+
+        search_params = {"metric_type": metric, "params": {"ef": 128}}
+        nq = 2
+        limit = 20
+        search_vectors = cf.gen_vectors(nq, dim=ct.default_dim)
+
+        # verify that every record in the group-by results is the top1 for that value of the group_by_field
+        res1 = collection_w.search(data=search_vectors, anns_field=ct.default_float_vec_field_name,
+                                   param=search_params, limit=limit,
+                                   group_by_field=grpby_field,
+                                   output_fields=[grpby_field])[0]
+        for i in range(nq):
+            grpby_values = []
+            results_num = 2 if grpby_field == ct.default_bool_field_name else limit
+            for l in range(results_num):
+                top1 = res1[i][l]
+                top1_grpby_pk = top1.id
+                top1_grpby_value = top1.fields.get(grpby_field)
+                expr = f"{grpby_field}=={top1_grpby_value}"
+                if grpby_field == ct.default_string_field_name:
+                    expr = f"{grpby_field}=='{top1_grpby_value}'"
+                grpby_values.append(top1_grpby_value)
+                res_tmp = collection_w.search(data=[search_vectors[i]], anns_field=ct.default_float_vec_field_name,
+                                              param=search_params, limit=1,
+                                              expr=expr,
+                                              output_fields=[grpby_field])[0]
+                top1_expr_pk = res_tmp[0][0].id
+                log.info(f"nq={i}, limit={l}")
+                assert top1_grpby_pk == top1_expr_pk
+            # verify no dup values of the group_by_field in results
+            assert len(grpby_values) == len(set(grpby_values))
+
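+    # Note: collection_w.create_index(grpby_field) above relies on the default
+    # scalar index type; group-by results are expected to be identical whether
+    # or not the group field carries a scalar index, which is what this case
+    # checks against the unindexed cases above.
+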
+    @pytest.mark.tags(CaseLabel.L1)
+    @pytest.mark.skip(reason="issue #29967")
+    @pytest.mark.parametrize("grpby_unsupported_field", [ct.default_float_field_name, ct.default_json_field_name,
+                                                         ct.default_double_field_name, ct.default_float_vec_field_name])
+    def test_search_group_by_unsupported_field(self, grpby_unsupported_field):
+        """
+        target: test search group by with an unsupported field
+        method: 1. create a collection with data
+                2. create index
+                3. search with group by on the unsupported fields
+        verify: the error code and msg
+        """
+        metric = "IP"
+        collection_w = self.init_collection_general(prefix, insert_data=True, is_index=False,
+                                                    is_all_data_type=True, with_json=True)[0]
+        _index = {"index_type": "HNSW", "metric_type": metric, "params": {"M": 16, "efConstruction": 128}}
+        collection_w.create_index(ct.default_float_vec_field_name, index_params=_index)
+        collection_w.load()
+
+        search_params = {"metric_type": metric, "params": {"ef": 128}}
+        nq = 1
+        limit = 1
+        search_vectors = cf.gen_vectors(nq, dim=ct.default_dim)
+
+        # search with groupby
+        err_code = 999
+        err_msg = "unsupported"
+        collection_w.search(data=search_vectors, anns_field=ct.default_float_vec_field_name,
+                            param=search_params, limit=limit,
+                            group_by_field=grpby_unsupported_field,
+                            check_task=CheckTasks.err_res,
+                            check_items={"err_code": err_code, "err_msg": err_msg})
+
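+    # As of this patch only HNSW is expected to support group_by: the case
+    # below asserts that IVF_FLAT/IVF_SQ8/IVF_PQ/SCANN fail with
+    # "not supported for current index type" and the remaining index types
+    # with "Unexpected index".
+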
+    @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.parametrize("index, params",
+                             zip(ct.all_index_types[:7],
+                                 ct.default_index_params[:7]))
+    def test_search_group_by_unsupported_index(self, index, params):
+        """
+        target: test search group by with an unsupported vector index
+        method: 1. create a collection with data
+                2. create an index that does not support group by
+                3. search with group by
+        verify: the error code and msg
+        """
+        if index != "HNSW":  # HNSW is the only index type supported for group by
+            metric = "L2"
+            collection_w = self.init_collection_general(prefix, insert_data=True, is_index=False,
+                                                        is_all_data_type=True, with_json=False)[0]
+            index_params = {"index_type": index, "params": params, "metric_type": metric}
+            collection_w.create_index(ct.default_float_vec_field_name, index_params)
+            collection_w.load()
+
+            search_params = {"params": {}}
+            nq = 1
+            limit = 1
+            search_vectors = cf.gen_vectors(nq, dim=ct.default_dim)
+
+            # search with groupby
+            err_code = 999
+            err_msg = "Unexpected index"
+            if index in ["IVF_FLAT", "IVF_SQ8", "IVF_PQ", "SCANN"]:
+                err_msg = "not supported for current index type"
+            collection_w.search(data=search_vectors, anns_field=ct.default_float_vec_field_name,
+                                param=search_params, limit=limit,
+                                group_by_field=ct.default_int8_field_name,
+                                check_task=CheckTasks.err_res,
+                                check_items={"err_code": err_code, "err_msg": err_msg})
+
+    @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.parametrize("grpby_nonexistent_field", ["nonexistent_field", 100])
+    def test_search_group_by_nonexistent_field(self, grpby_nonexistent_field):
+        """
+        target: test search group by with a nonexistent field
+        method: 1. create a collection with data
+                2. create index
+                3. search with group by a nonexistent field
+        verify: the error code and msg
+        """
+        metric = "IP"
+        collection_w = self.init_collection_general(prefix, insert_data=True, is_index=False,
+                                                    is_all_data_type=True, with_json=True)[0]
+        _index = {"index_type": "HNSW", "metric_type": metric, "params": {"M": 16, "efConstruction": 128}}
+        collection_w.create_index(ct.default_float_vec_field_name, index_params=_index)
+        collection_w.load()
+
+        search_params = {"metric_type": metric, "params": {"ef": 128}}
+        nq = 1
+        limit = 1
+        search_vectors = cf.gen_vectors(nq, dim=ct.default_dim)
+
+        # search with groupby
+        err_code = 1700
+        err_msg = f"groupBy field not found in schema: field not found[field={grpby_nonexistent_field}]"
+        collection_w.search(data=search_vectors, anns_field=ct.default_float_vec_field_name,
+                            param=search_params, limit=limit,
+                            group_by_field=grpby_nonexistent_field,
+                            check_task=CheckTasks.err_res,
+                            check_items={"err_code": err_code, "err_msg": err_msg})
+
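+    # Pagination invariant exercised below: concatenating the ids of
+    # page_rounds searches (limit=10, offset=0/10/20) should equal the ids of
+    # one search with limit=limit * page_rounds and the same group_by_field.
+    # Sketch of the check (pseudo-code, not an extra test):
+    #   pages = [search(limit=10, offset=10 * r).ids for r in range(3)]
+    #   assert sum(pages, []) == search(limit=30).ids
+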
+    @pytest.mark.tags(CaseLabel.L1)
+    @pytest.mark.skip(reason="issue #30033")
+    def test_search_pagination_group_by(self):
+        """
+        target: test search group by with pagination
+        method: 1. create a collection with data
+                2. create index HNSW
+                3. search with group by and pagination
+                4. search with group by and limit=limit * page_rounds
+        verify: search with group by and pagination returns correct results
+        """
+        # 1. create a collection
+        metric = "COSINE"
+        collection_w = self.init_collection_general(prefix, auto_id=True, insert_data=False, is_index=False,
+                                                    is_all_data_type=True, with_json=False)[0]
+        # insert with the same values for scalar fields
+        for _ in range(50):
+            data = cf.gen_dataframe_all_data_type(nb=100, auto_id=True, with_json=False)
+            collection_w.insert(data)
+        _index = {"index_type": "HNSW", "metric_type": metric, "params": {"M": 16, "efConstruction": 128}}
+        collection_w.create_index(ct.default_float_vec_field_name, index_params=_index)
+        collection_w.load()
+        # 2. search pagination with offset
+        limit = 10
+        page_rounds = 3
+        search_param = {"metric_type": metric}
+        grpby_field = ct.default_string_field_name
+        search_vectors = cf.gen_vectors(1, dim=ct.default_dim)
+        all_pages_ids = []
+        for r in range(page_rounds):
+            page_res = collection_w.search(search_vectors, anns_field=default_search_field,
+                                           param=search_param, limit=limit, offset=limit * r,
+                                           expr=default_search_exp, group_by_field=grpby_field,
+                                           check_task=CheckTasks.check_search_results,
+                                           check_items={"nq": 1, "limit": limit},
+                                           )[0]
+            all_pages_ids += page_res[0].ids
+
+        total_res = collection_w.search(search_vectors, anns_field=default_search_field,
+                                        param=search_param, limit=limit * page_rounds,
+                                        expr=default_search_exp, group_by_field=grpby_field,
+                                        output_fields=[grpby_field],
+                                        check_task=CheckTasks.check_search_results,
+                                        check_items={"nq": 1, "limit": limit * page_rounds}
+                                        )[0]
+        assert total_res[0].ids == all_pages_ids
+        grpby_field_values = []
+        for i in range(limit * page_rounds):
+            grpby_field_values.append(total_res[0][i].fields.get(grpby_field))
+        assert len(grpby_field_values) == len(set(grpby_field_values))
+
+    @pytest.mark.tags(CaseLabel.L1)
+    @pytest.mark.skip(reason="issue #30033")
+    def test_search_iterator_group_by(self):
+        """
+        target: test search iterator with group by
+        method: 1. create a collection with data
+                2. create index HNSW
+                3. search iterator with group by
+                4. search with filtering every value of group_by_field
+        verify: iteration completes successfully and the number of pages is as expected
+        """
+        metric = "COSINE"
+        collection_w = self.init_collection_general(prefix, auto_id=True, insert_data=False, is_index=False,
+                                                    is_all_data_type=True, with_json=False)[0]
+        # insert with the same values for scalar fields
+        value_num = 50
+        for _ in range(value_num):
+            data = cf.gen_dataframe_all_data_type(nb=100, auto_id=True, with_json=False)
+            collection_w.insert(data)
+        _index = {"index_type": "HNSW", "metric_type": metric, "params": {"M": 16, "efConstruction": 128}}
+        collection_w.create_index(ct.default_float_vec_field_name, index_params=_index)
+        collection_w.load()
+
+        grpby_field = ct.default_int64_field_name
+        search_vectors = cf.gen_vectors(1, dim=ct.default_dim)
+        search_params = {"metric_type": metric}
+        batch_size = 10
+
+        # res = collection_w.search(search_vectors, ct.default_float_vec_field_name,
+        #                           search_params, group_by_field=grpby_field, limit=10)[0]
+
+        ite_res = collection_w.search_iterator(search_vectors, ct.default_float_vec_field_name,
+                                               search_params, batch_size, group_by_field=grpby_field
+                                               )[0]
+        iterators = 0
+        while True:
+            res = ite_res.next()  # turn to the next page
+            if len(res) == 0:
+                ite_res.close()  # close the iterator
+                break
+            iterators += 1
+        assert iterators == value_num / batch_size
+
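+    # Range-search note for the case below: with COSINE (higher score means
+    # closer), "radius" is the lower similarity bound and "range_filter" the
+    # upper one, so radius=0.1 / range_filter=0.5 keeps hits whose similarity
+    # falls in (0.1, 0.5].
+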
+    @pytest.mark.tags(CaseLabel.L2)
+    def test_range_search_group_by(self):
+        """
+        target: test range search with group by
+        method: 1. create a collection with data
+                2. create index hnsw
+                3. range search with group by
+        verify: range search returns results with no duplicate values of the group_by_field
+        """
+        metric = "COSINE"
+        collection_w = self.init_collection_general(prefix, auto_id=True, insert_data=False, is_index=False,
+                                                    is_all_data_type=True, with_json=False)[0]
+        _index = {"index_type": "HNSW", "metric_type": metric, "params": {"M": 16, "efConstruction": 128}}
+        collection_w.create_index(ct.default_float_vec_field_name, index_params=_index)
+        # insert with the same values for scalar fields
+        for _ in range(30):
+            data = cf.gen_dataframe_all_data_type(nb=100, auto_id=True, with_json=False)
+            collection_w.insert(data)
+
+        collection_w.flush()
+        collection_w.create_index(ct.default_float_vec_field_name, index_params=_index)
+        time.sleep(10)
+        collection_w.load()
+
+        nq = 1
+        limit = 10
+        search_vectors = cf.gen_vectors(nq, dim=ct.default_dim)
+        grpby_field = ct.default_int32_field_name
+        range_search_params = {"metric_type": "COSINE", "params": {"radius": 0.1,
+                                                                   "range_filter": 0.5}}
+        res = collection_w.search(search_vectors, ct.default_float_vec_field_name,
+                                  range_search_params, limit,
+                                  default_search_exp, group_by_field=grpby_field,
+                                  output_fields=[grpby_field],
+                                  check_task=CheckTasks.check_search_results,
+                                  check_items={"nq": nq, "limit": limit})[0]
+        grpby_field_values = []
+        for i in range(limit):
+            grpby_field_values.append(res[0][i].fields.get(grpby_field))
+        assert len(grpby_field_values) == len(set(grpby_field_values))
+
+    @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.skip(reason="not completed")
+    def test_hybrid_search_group_by(self):
+        """
+        target: test hybrid search with group by
+        method: 1. create a collection with multiple vector fields
+                2. create index hnsw and hnsw
+                3. hybrid_search with group by
+        verify: the error code and msg
+        """
+        pass
+
+    @pytest.mark.tags(CaseLabel.L1)
+    @pytest.mark.skip(reason="not completed")
+    def test_multi_vectors_search_one_vector_group_by(self):
+        """
+        target: test search group by on one vector field of multiple
+        method: 1. create a collection with multiple vector fields
+                2. create index hnsw and ivfflat
+                3. search on the vector with hnsw index with group by
+        verify: search successfully
+        """
+        pass
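+
+    # Sketch of what the two placeholders above are expected to cover once
+    # completed (vector field names "vec_hnsw"/"vec_ivf" are hypothetical):
+    #   - hybrid search: build one AnnSearchRequest per vector field, pass the
+    #     list to collection.hybrid_search(reqs, ranker, limit, ...) together
+    #     with group_by_field, then check the returned error (or results).
+    #   - multi-vector search: search(anns_field="vec_hnsw", ...,
+    #     group_by_field=...) against only the HNSW-indexed field and expect
+    #     the usual group-by behavior.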