import os
import random

import pytest

from base.client_base import TestcaseBase
from common import common_func as cf
from common import common_type as ct
from common.common_type import CaseLabel, CheckTasks
from utils.util_log import test_log as log

prefix = "query_iter_"


class TestQueryIterator(TestcaseBase):

    @pytest.mark.tags(CaseLabel.L0)
    @pytest.mark.parametrize("primary_field", [ct.default_string_field_name, ct.default_int64_field_name])
    @pytest.mark.parametrize("with_growing", [False, True])
    def test_query_iterator_normal(self, primary_field, with_growing):
        """
        target: test query iterator normal
        method: 1. query iterator
                2. check the result, expect pk
                   verify: no pk lost in iterator results
                3. query iterator with checkpoint file
                4. iterator.next() for 10 times
                5. delete and insert some entities before calling a new query iterator
                6. call a new query iterator with the same checkpoint file, with diff batch_size and output_fields
                7. iterator.next() until the end
                   verify: 1. no pk lost in iterator results for the 2 iterators
                           2. no dup pk in the 2 iterators
        expected: query iterators successfully
        """
        # 1. initialize with data
        nb = 4000
        batch_size = 200
        collection_w, _, _, insert_ids, _ = \
            self.init_collection_general(prefix, True, is_index=False, nb=nb,
                                         is_flush=True, auto_id=False, primary_field=primary_field)
        collection_w.create_index(ct.default_float_vec_field_name, {"metric_type": "L2"})
        collection_w.load()
        # 2. query iterator
        expr = "float >= 0"
        collection_w.query_iterator(batch_size, expr=expr,
                                    check_task=CheckTasks.check_query_iterator,
                                    check_items={"count": nb, "batch_size": batch_size})
        # 3. query iterator with checkpoint file
        iterator_cp_file = f"/tmp/it_{collection_w.name}_cp"
        iterator = collection_w.query_iterator(batch_size, expr=expr,
                                               iterator_cp_file=iterator_cp_file)[0]
        iter_times = 0
        first_iter_times = nb // batch_size // 2    # only iterate half of the data the 1st time
        pk_list1 = []
        while iter_times < first_iter_times:
            iter_times += 1
            res = iterator.next()
            if len(res) == 0:
                iterator.close()
                pytest.fail(f"The iterator ended after {iter_times} iterations, "
                            f"before the expected {first_iter_times}")
            for i in range(len(res)):
                pk_list1.append(res[i][primary_field])
        file_exist = os.path.isfile(iterator_cp_file)
        assert file_exist is True, "The checkpoint file should exist before the iterator is closed"
        # 4. delete and insert some entities before calling a new query iterator
        delete_ids = random.sample(insert_ids[:nb // 2], 101) + random.sample(insert_ids[nb // 2:], 101)
        del_res, _ = collection_w.delete(expr=f"{primary_field} in {delete_ids}")
        assert del_res.delete_count == len(delete_ids)
        data = cf.gen_default_list_data(nb=333, start=nb)
        collection_w.insert(data)
        if not with_growing:
            collection_w.flush()
        # 5. call a new query iterator with the same checkpoint file to continue the first iterator
        iterator2 = collection_w.query_iterator(batch_size * 2, expr=expr,
                                                output_fields=[primary_field, ct.default_float_field_name],
                                                iterator_cp_file=iterator_cp_file)[0]
        while True:
            res = iterator2.next()
            if len(res) == 0:
                iterator2.close()
                break
            for i in range(len(res)):
                pk_list1.append(res[i][primary_field])
        # 6. verify: no pk lost and no pk duplicated across the 2 iterators
        assert len(pk_list1) == len(set(pk_list1)) == nb
        file_exist = os.path.isfile(iterator_cp_file)
        assert file_exist is False, "The checkpoint file should be deleted after the iterator closes"
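    # Checkpoint-file semantics exercised by the test above (as asserted there,
    # not taken from pymilvus docs): the file appears once iteration starts; a
    # new iterator created with the same iterator_cp_file resumes where the
    # previous one stopped, even with a different batch_size and output_fields;
    # entities deleted or inserted after the first iterator was created do not
    # change the combined result set; and close() after exhaustion removes the file.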
    @pytest.mark.tags(CaseLabel.L1)
    def test_query_iterator_using_default_batch_size(self):
        """
        target: test query iterator normal
        method: 1. query iterator
                2. check the result, expect pk
        expected: query successfully
        """
        # 1. initialize with data
        collection_w = self.init_collection_general(prefix, True)[0]
        # 2. query iterator
        collection_w.query_iterator(check_task=CheckTasks.check_query_iterator,
                                    check_items={"count": ct.default_nb,
                                                 "batch_size": ct.default_batch_size})

    @pytest.mark.tags(CaseLabel.L2)
    @pytest.mark.parametrize("offset", [500, 1000, 1777])
    def test_query_iterator_with_offset(self, offset):
        """
        target: test query iterator normal
        method: 1. query iterator
                2. check the result, expect pk
        expected: query successfully
        """
        # 1. initialize with data
        batch_size = 300
        collection_w = self.init_collection_general(prefix, True, is_index=False)[0]
        collection_w.create_index(ct.default_float_vec_field_name, {"metric_type": "L2"})
        collection_w.load()
        # 2. query iterator
        expr = "int64 >= 0"
        collection_w.query_iterator(batch_size, expr=expr, offset=offset,
                                    check_task=CheckTasks.check_query_iterator,
                                    check_items={"count": ct.default_nb - offset,
                                                 "batch_size": batch_size})

    @pytest.mark.tags(CaseLabel.L2)
    @pytest.mark.parametrize("vector_data_type", ["FLOAT_VECTOR", "FLOAT16_VECTOR", "BFLOAT16_VECTOR"])
    def test_query_iterator_output_different_vector_type(self, vector_data_type):
        """
        target: test query iterator with output fields
        method: 1. query iterator output different vector type
                2. check the result, expect pk
        expected: query successfully
        """
        # 1. initialize with data
        batch_size = 400
        collection_w = self.init_collection_general(prefix, True,
                                                    vector_data_type=vector_data_type)[0]
        # 2. query iterator
        expr = "int64 >= 0"
        collection_w.query_iterator(batch_size, expr=expr,
                                    output_fields=[ct.default_float_vec_field_name],
                                    check_task=CheckTasks.check_query_iterator,
                                    check_items={"count": ct.default_nb,
                                                 "batch_size": batch_size})

    @pytest.mark.tags(CaseLabel.L2)
    @pytest.mark.parametrize("batch_size", [10, 777, 2000])
    def test_query_iterator_with_different_batch_size(self, batch_size):
        """
        target: test query iterator normal
        method: 1. query iterator
                2. check the result, expect pk
        expected: query successfully
        """
        # 1. initialize with data
        offset = 500
        collection_w = self.init_collection_general(prefix, True, is_index=False)[0]
        collection_w.create_index(ct.default_float_vec_field_name, {"metric_type": "L2"})
        collection_w.load()
        # 2. query iterator
        expr = "int64 >= 0"
        collection_w.query_iterator(batch_size=batch_size, expr=expr, offset=offset,
                                    check_task=CheckTasks.check_query_iterator,
                                    check_items={"count": ct.default_nb - offset,
                                                 "batch_size": batch_size})

    @pytest.mark.tags(CaseLabel.L2)
    @pytest.mark.parametrize("offset", [0, 10, 1000])
    @pytest.mark.parametrize("limit", [0, 100, 10000])
    def test_query_iterator_with_different_limit(self, limit, offset):
        """
        target: test query iterator normal
        method: 1. query iterator
                2. check the result, expect pk
        expected: query successfully
        """
        # 1. initialize with data
        collection_w = self.init_collection_general(prefix, True)[0]
        # 2. query iterator: at most default_nb - offset entities can be returned
        count = limit if limit + offset <= ct.default_nb else ct.default_nb - offset
        collection_w.query_iterator(limit=limit, expr="", offset=offset,
                                    check_task=CheckTasks.check_query_iterator,
                                    check_items={"count": max(count, 0),
                                                 "batch_size": ct.default_batch_size})
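    # The expected-count arithmetic above, factored out as a sketch; the helper
    # name expected_count is hypothetical and unused by the suite. For instance,
    # taking nb=3000: limit=10000 with offset=1000 can return at most 2000
    # entities, while limit=100 with offset=10 returns the full 100.
    @staticmethod
    def expected_count(nb, limit, offset):
        """Hypothetical sketch: how many entities a limited, offset iterator returns."""
        return max(min(limit, nb - offset), 0)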
    @pytest.mark.tags(CaseLabel.L2)
    def test_query_iterator_invalid_batch_size(self):
        """
        target: test query iterator with an invalid batch size
        method: query iterator using a negative batch size
        expected: raise exception
        """
        # 1. initialize with data
        nb = 17000  # set nb > 16384
        collection_w = self.init_collection_general(prefix, True, nb=nb)[0]
        # 2. query iterator with an invalid batch size
        expr = "int64 >= 0"
        error = {"err_code": 1, "err_msg": "batch size cannot be less than zero"}
        collection_w.query_iterator(batch_size=-1, expr=expr,
                                    check_task=CheckTasks.err_res, check_items=error)

    @pytest.mark.tags(CaseLabel.L0)
    @pytest.mark.parametrize("batch_size", [500])
    @pytest.mark.parametrize("auto_id", [False])
    def test_query_iterator_empty_expr_with_cp_file_for_times(self, auto_id, batch_size):
        """
        target: verify 2 query iterators with/out checkpoint file work independently
        method: 1. create a collection
                2. query the 1st iterator with empty expr and checkpoint file
                3. iterator.next() for some times
                4. call a new query iterator with the same checkpoint file
        expected: verify the 2nd iterator can get the whole results
        """
        # 0. initialize with data
        collection_w, _, _, insert_ids = self.init_collection_general(prefix, True, auto_id=auto_id)[0:4]
        # 1. call a new query iterator and iterate for some times
        iterator_cp_file = f"/tmp/it_{collection_w.name}_cp"
        iterator = collection_w.query_iterator(batch_size=batch_size // 2,
                                               iterator_cp_file=iterator_cp_file)[0]
        iter_times = 0
        first_iter_times = ct.default_nb // batch_size // 2 // 2    # iterate only part of the data the 1st time
        while iter_times < first_iter_times:
            iter_times += 1
            res = iterator.next()
            if len(res) == 0:
                iterator.close()
                pytest.fail(f"The iterator ended after {iter_times} iterations, "
                            f"before the expected {first_iter_times}")
        # 2. call a new query iterator to get all the results of the collection
        collection_w.query_iterator(batch_size=batch_size,
                                    check_task=CheckTasks.check_query_iterator,
                                    check_items={"batch_size": batch_size,
                                                 "count": ct.default_nb,
                                                 "exp_ids": insert_ids})
        file_exist = os.path.isfile(iterator_cp_file)
        assert file_exist is True, "The checkpoint file should exist until iterator.close() is called"
        iterator.close()
        file_exist = os.path.isfile(iterator_cp_file)
        assert file_exist is False, "The checkpoint file should be deleted after the iterator closes"

    @pytest.mark.tags(CaseLabel.L2)
    @pytest.mark.parametrize("offset", [1000])
    @pytest.mark.parametrize("batch_size", [500, 1000])
    def test_query_iterator_expr_empty_with_random_pk_pagination(self, batch_size, offset):
        """
        target: test query iterator with empty expression
        method: create a collection using random pks, query an empty expression with a limit
        expected: return topK results by order
        """
        # 1. initialize with data
        collection_w, _, _, insert_ids = self.init_collection_general(prefix, True,
                                                                      random_primary_key=True)[0:4]
        # 2. query with empty expr and check the result
        exp_ids = sorted(insert_ids)
        collection_w.query_iterator(batch_size, output_fields=[ct.default_string_field_name],
                                    check_task=CheckTasks.check_query_iterator,
                                    check_items={"batch_size": batch_size,
                                                 "count": ct.default_nb,
                                                 "exp_ids": exp_ids})
        # 3. query with pagination
        exp_ids = sorted(insert_ids)[offset:]
        collection_w.query_iterator(batch_size, offset=offset,
                                    output_fields=[ct.default_string_field_name],
                                    check_task=CheckTasks.check_query_iterator,
                                    check_items={"batch_size": batch_size,
                                                 "count": ct.default_nb - offset,
                                                 "exp_ids": exp_ids})
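    # The bounded "iterate n times or fail" loop used in the checkpoint tests
    # above (and in test_query_iterator_normal) could be shared across them;
    # a minimal sketch, with the hypothetical helper name next_n_times:
    @staticmethod
    def next_n_times(iterator, n):
        """Hypothetical sketch: call iterator.next() n times, failing if exhausted early."""
        for i in range(n):
            res = iterator.next()
            if len(res) == 0:
                iterator.close()
                pytest.fail(f"iterator exhausted after {i} of {n} expected batches")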
    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("primary_field", [ct.default_string_field_name, ct.default_int64_field_name])
    def test_query_iterator_with_dup_pk(self, primary_field):
        """
        target: test query iterator with duplicate pks
        method: 1. insert entities with duplicate pks
                2. query iterator
                3. check the result, expect pk
        expected: query successfully
        """
        # 1. initialize with data
        nb = 3000
        collection_w = self.init_collection_general(prefix, insert_data=False, is_index=False,
                                                    auto_id=False, primary_field=primary_field)[0]
        # insert entities with duplicate pks
        data = cf.gen_default_list_data(nb=nb)
        for _ in range(3):
            collection_w.insert(data)
        collection_w.flush()
        # create index
        index_type = "HNSW"
        index_params = {"index_type": index_type, "metric_type": ct.default_L0_metric,
                        "params": cf.get_index_params_params(index_type)}
        collection_w.create_index(ct.default_float_vec_field_name, index_params)
        collection_w.load()
        # 2. query iterator; the 3 inserts share pks, so only nb unique entities are expected
        collection_w.query_iterator(check_task=CheckTasks.check_query_iterator,
                                    check_items={"count": nb,
                                                 "batch_size": ct.default_batch_size})

    @pytest.mark.tags(CaseLabel.L2)
    @pytest.mark.skip("issue #37109, needs debugging pending resolution of the issue")
    def test_query_iterator_on_two_collections(self):
        """
        target: test query iterators on two collections sharing one checkpoint file
        method: 1. create two collections
                2. query iterator on the second collection with a checkpoint file
                3. query iterator on the first collection with the same checkpoint file
        expected: query successfully
        """
        # 1. initialize with data
        collection_w = self.init_collection_general(prefix, True)[0]
        collection_w2 = self.init_collection_general(prefix, False,
                                                     primary_field=ct.default_string_field_name)[0]
        data = cf.gen_default_list_data(nb=ct.default_nb, primary_field=ct.default_string_field_name)
        string_values = [cf.gen_str_by_length(20) for _ in range(ct.default_nb)]
        data[2] = string_values
        collection_w2.insert(data)
        # 2. call a query iterator on the second collection and iterate for some times
        batch_size = 150
        iterator_cp_file = f"/tmp/it_{collection_w.name}_cp"
        iterator2 = collection_w2.query_iterator(batch_size=batch_size // 2,
                                                 iterator_cp_file=iterator_cp_file)[0]
        iter_times = 0
        first_iter_times = ct.default_nb // batch_size // 2 // 2    # iterate only part of the data the 1st time
        while iter_times < first_iter_times:
            iter_times += 1
            res = iterator2.next()
            if len(res) == 0:
                iterator2.close()
                pytest.fail(f"The iterator ended after {iter_times} iterations, "
                            f"before the expected {first_iter_times}")
        # 3. query iterator on the first collection with the same checkpoint file
        iterator = collection_w.query_iterator(batch_size=batch_size,
                                               iterator_cp_file=iterator_cp_file)[0]
        log.info(iterator.next())
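
# For reference: the wrapper calls above roughly correspond to this pattern with
# the raw pymilvus ORM. A sketch only, unused by the tests: Collection.query_iterator
# and the QueryIterator next()/close() calls are the standard pymilvus API, while
# the iterator_cp_file kwarg mirrors the wrapper used in this module and may
# differ across pymilvus versions.
def _raw_query_iterator_sketch(collection, expr="", batch_size=100):
    """Illustrative helper (hypothetical): drain a pymilvus query iterator."""
    cp_file = f"/tmp/it_{collection.name}_cp"    # per-collection checkpoint file
    it = collection.query_iterator(batch_size=batch_size, expr=expr,
                                   iterator_cp_file=cp_file)
    rows = []
    while True:
        batch = it.next()
        if len(batch) == 0:
            it.close()               # closing also removes the checkpoint file
            break
        rows.extend(batch)
    return rows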