test: Add nullable test cases for bulk writer (#37572)

issue: #36129

Signed-off-by: binbin lv <binbin.lv@zilliz.com>
binbin 2024-11-12 09:46:28 +08:00 committed by GitHub
parent c1eccce2fa
commit 21b68029a0
2 changed files with 178 additions and 37 deletions
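The pattern used throughout the new cases: when nullable is parametrized as True, each nullable field's value is independently replaced by None with probability 0.5, so a single batch exercises both the null and non-null code paths. A minimal sketch of that row generator (field names here are illustrative, not the suite's df.* constants):

    import random

    def gen_row(i: int, nullable: bool) -> dict:
        def maybe(value):
            # ~50% of values become None when the field is nullable
            return value if not (nullable and random.random() < 0.5) else None
        return {
            "pk": i,
            "int_scalar": maybe(1),
            "float_scalar": maybe(1.0),
            "string_scalar": maybe("string"),
            "array_int": maybe([1, 2]),
        }

    rows = [gen_row(i, nullable=True) for i in range(1000)]

The diff below repeats the conditional inline per field; the helper is just a compressed view of the same logic.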

View File

@@ -1477,7 +1477,8 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
     @pytest.mark.parametrize("entities", [1000]) # 1000
     @pytest.mark.parametrize("enable_dynamic_field", [True, False])
     @pytest.mark.parametrize("sparse_format", ["doc", "coo"])
-    def test_with_all_field_json_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, sparse_format):
+    @pytest.mark.parametrize("nullable", [True, False])
+    def test_with_all_field_json_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, sparse_format, nullable):
         """
         collection schema 1: [pk, int64, float64, string float_vector]
         data file: vectors.npy and uid.npy,
@@ -1489,14 +1490,14 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
         self._connect()
         fields = [
             cf.gen_int64_field(name=df.pk_field, is_primary=True, auto_id=auto_id),
-            cf.gen_int64_field(name=df.int_field),
-            cf.gen_float_field(name=df.float_field),
-            cf.gen_string_field(name=df.string_field),
-            cf.gen_json_field(name=df.json_field),
-            cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64),
-            cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT),
-            cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100),
-            cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL),
+            cf.gen_int64_field(name=df.int_field, nullable=nullable),
+            cf.gen_float_field(name=df.float_field, nullable=nullable),
+            cf.gen_string_field(name=df.string_field, nullable=nullable),
+            cf.gen_json_field(name=df.json_field, nullable=nullable),
+            cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64, nullable=nullable),
+            cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),
+            cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100, nullable=nullable),
+            cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL, nullable=nullable),
             cf.gen_float_vec_field(name=df.float_vec_field, dim=dim),
             cf.gen_float16_vec_field(name=df.fp16_vec_field, dim=dim),
             cf.gen_bfloat16_vec_field(name=df.bf16_vec_field, dim=dim),
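Only the scalar and array fields take the nullable flag; the vector fields (and the primary key) keep the default, since Milvus does not allow them to be nullable. At the pymilvus level the schema switch looks roughly like this (a sketch assuming a 2.5-style FieldSchema that accepts nullable):

    from pymilvus import FieldSchema, DataType

    # A scalar field whose per-row value may be None
    int_field = FieldSchema(name="int_scalar", dtype=DataType.INT64, nullable=True)

    # Vector fields cannot be nullable; every row must supply a vector
    vec_field = FieldSchema(name="float_vector", dtype=DataType.FLOAT_VECTOR, dim=128)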
@@ -1528,14 +1529,14 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
         for i in range(entities):
             row = {
                 df.pk_field: i,
-                df.int_field: 1,
-                df.float_field: 1.0,
-                df.string_field: "string",
-                df.json_field: json_value[i%len(json_value)],
-                df.array_int_field: [1, 2],
-                df.array_float_field: [1.0, 2.0],
-                df.array_string_field: ["string1", "string2"],
-                df.array_bool_field: [True, False],
+                df.int_field: 1 if not (nullable and random.random() < 0.5) else None,
+                df.float_field: 1.0 if not (nullable and random.random() < 0.5) else None,
+                df.string_field: "string" if not (nullable and random.random() < 0.5) else None,
+                df.json_field: json_value[i%len(json_value)] if not (nullable and random.random() < 0.5) else None,
+                df.array_int_field: [1, 2] if not (nullable and random.random() < 0.5) else None,
+                df.array_float_field: [1.0, 2.0] if not (nullable and random.random() < 0.5) else None,
+                df.array_string_field: ["string1", "string2"] if not (nullable and random.random() < 0.5) else None,
+                df.array_bool_field: [True, False] if not (nullable and random.random() < 0.5) else None,
                 df.float_vec_field: cf.gen_vectors(1, dim)[0],
                 df.fp16_vec_field: cf.gen_vectors(1, dim, vector_data_type="FLOAT16_VECTOR")[0],
                 df.bf16_vec_field: cf.gen_vectors(1, dim, vector_data_type="BFLOAT16_VECTOR")[0],
@@ -1606,13 +1607,17 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
     @pytest.mark.parametrize("dim", [128]) # 128
     @pytest.mark.parametrize("entities", [1000]) # 1000
     @pytest.mark.parametrize("enable_dynamic_field", [True, False])
-    def test_with_all_field_numpy_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field):
+    @pytest.mark.parametrize("nullable", [True, False])
+    def test_with_all_field_numpy_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, nullable):
         """
         """
+        if nullable is True:
+            pytest.skip("not support bulk writer numpy files in field(int_scalar) which has 'None' data")
         self._connect()
         fields = [
             cf.gen_int64_field(name=df.pk_field, is_primary=True, auto_id=auto_id),
-            cf.gen_int64_field(name=df.int_field),
+            cf.gen_int64_field(name=df.int_field, nullable=nullable),
             cf.gen_float_field(name=df.float_field),
             cf.gen_string_field(name=df.string_field),
             cf.gen_json_field(name=df.json_field),
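The skip above is a file-format limitation rather than a nullable bug: a NumPy int64 column has no encoding for a missing value, so a row whose INT64 scalar is None cannot be written to an .npy data file. A quick illustration of why:

    import numpy as np

    print(np.array([1, 2, 3]).dtype)     # int64 (the platform default int)
    print(np.array([1, None, 3]).dtype)  # object -- no longer a valid INT64 column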
@@ -1646,7 +1651,7 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
         for i in range(entities):
             row = {
                 df.pk_field: i,
-                df.int_field: 1,
+                df.int_field: 1 if not (nullable and random.random() < 0.5) else None,
                 df.float_field: 1.0,
                 df.string_field: "string",
                 df.json_field: json_value[i%len(json_value)],
@@ -1720,20 +1725,21 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
     @pytest.mark.parametrize("entities", [1000]) # 1000
     @pytest.mark.parametrize("enable_dynamic_field", [True, False])
     @pytest.mark.parametrize("sparse_format", ["doc", "coo"])
-    def test_with_all_field_parquet_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, sparse_format):
+    @pytest.mark.parametrize("nullable", [True, False])
+    def test_with_all_field_parquet_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, sparse_format, nullable):
         """
         """
         self._connect()
         fields = [
             cf.gen_int64_field(name=df.pk_field, is_primary=True, auto_id=auto_id),
-            cf.gen_int64_field(name=df.int_field),
-            cf.gen_float_field(name=df.float_field),
-            cf.gen_string_field(name=df.string_field),
-            cf.gen_json_field(name=df.json_field),
-            cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64),
-            cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT),
-            cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100),
-            cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL),
+            cf.gen_int64_field(name=df.int_field, nullable=nullable),
+            cf.gen_float_field(name=df.float_field, nullable=nullable),
+            cf.gen_string_field(name=df.string_field, nullable=nullable),
+            cf.gen_json_field(name=df.json_field, nullable=nullable),
+            cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64, nullable=nullable),
+            cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),
+            cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100, nullable=nullable),
+            cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL, nullable=nullable),
             cf.gen_float_vec_field(name=df.float_vec_field, dim=dim),
             cf.gen_float16_vec_field(name=df.fp16_vec_field, dim=dim),
             cf.gen_bfloat16_vec_field(name=df.bf16_vec_field, dim=dim),
@@ -1765,14 +1771,14 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
         for i in range(entities):
             row = {
                 df.pk_field: i,
-                df.int_field: 1,
-                df.float_field: 1.0,
-                df.string_field: "string",
-                df.json_field: json_value[i%len(json_value)],
-                df.array_int_field: [1, 2],
-                df.array_float_field: [1.0, 2.0],
-                df.array_string_field: ["string1", "string2"],
-                df.array_bool_field: [True, False],
+                df.int_field: 1 if not (nullable and random.random() < 0.5) else None,
+                df.float_field: 1.0 if not (nullable and random.random() < 0.5) else None,
+                df.string_field: "string" if not (nullable and random.random() < 0.5) else None,
+                df.json_field: json_value[i%len(json_value)] if not (nullable and random.random() < 0.5) else None,
+                df.array_int_field: [1, 2] if not (nullable and random.random() < 0.5) else None,
+                df.array_float_field: [1.0, 2.0] if not (nullable and random.random() < 0.5) else None,
+                df.array_string_field: ["string1", "string2"] if not (nullable and random.random() < 0.5) else None,
+                df.array_bool_field: [True, False] if not (nullable and random.random() < 0.5) else None,
                 df.float_vec_field: cf.gen_vectors(1, dim)[0],
                 df.fp16_vec_field: cf.gen_vectors(1, dim, vector_data_type="FLOAT16_VECTOR")[0],
                 df.bf16_vec_field: cf.gen_vectors(1, dim, vector_data_type="BFLOAT16_VECTOR")[0],
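The JSON and Parquet variants are additionally parametrized over sparse_format. In Milvus bulk-import data a sparse vector can be written in two shapes: a dict keyed by dimension index ("doc") or parallel index/value lists ("coo"). The exact spelling below follows the convention used elsewhere in these tests and should be read as an assumption:

    # "doc" style: dimension index (as a string key) -> weight
    sparse_doc = {"7": 0.42, "1024": 0.13}

    # "coo" style: coordinate lists, indices paired with values
    sparse_coo = {"indices": [7, 1024], "values": [0.42, 0.13]}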

View File

@@ -4809,6 +4809,73 @@ class TestCollectionSearch(TestcaseBase):
                             check_task=CheckTasks.err_res,
                             check_items={"err_code": 999, "err_msg": f"invalid dimension: {dim}."})
 
+    @pytest.mark.tags(CaseLabel.L1)
+    @pytest.mark.skip(reason="issue #37547")
+    def test_search_verify_expr_cache(self, is_flush):
+        """
+        target: test search case to test expr cache
+        method: 1. create collection with a double datatype field
+                2. search with expr "doubleField == 0"
+                3. drop this collection
+                4. create collection with same collection name and same field name but modify the type of double field
+                   as varchar datatype
+                5. search with expr "doubleField == 0" again
+        expected: 1. search successfully with limit(topK) for the first collection
+                  2. report error for the second collection with the same name
+        """
+        # 1. initialize with data
+        collection_w, _, _, insert_ids, time_stamp = \
+            self.init_collection_general(prefix, True, is_flush=is_flush)[0:5]
+        collection_name = collection_w.name
+        # 2. generate search data
+        vectors = cf.gen_vectors_based_on_vector_type(default_nq, default_dim)
+        # 3. search with expr "float == 0"
+        search_exp = f"{ct.default_float_field_name} == 0"
+        output_fields = [default_int64_field_name, default_float_field_name]
+        collection_w.search(vectors[:default_nq], default_search_field,
+                            default_search_params, default_limit,
+                            search_exp,
+                            output_fields=output_fields,
+                            check_task=CheckTasks.check_search_results,
+                            check_items={"nq": default_nq,
+                                         "ids": insert_ids,
+                                         "limit": 1,
+                                         "output_fields": output_fields})
+        # 4. drop collection
+        collection_w.drop()
+        # 5. create the same collection name with same field name but varchar field type
+        int64_field = cf.gen_int64_field(is_primary=True)
+        string_field = cf.gen_string_field(ct.default_float_field_name)
+        json_field = cf.gen_json_field()
+        float_vector_field = cf.gen_float_vec_field()
+        fields = [int64_field, string_field, json_field, float_vector_field]
+        schema = cf.gen_collection_schema(fields)
+        collection_w = self.init_collection_wrap(name=collection_name, schema=schema)
+        int64_values = pd.Series(data=[i for i in range(default_nb)])
+        string_values = pd.Series(data=[str(i) for i in range(default_nb)], dtype="string")
+        json_values = [{"number": i, "string": str(i), "bool": bool(i),
+                        "list": [j for j in range(i, i + ct.default_json_list_length)]} for i in range(default_nb)]
+        float_vec_values = cf.gen_vectors(default_nb, default_dim)
+        df = pd.DataFrame({
+            ct.default_int64_field_name: int64_values,
+            ct.default_float_field_name: string_values,
+            ct.default_json_field_name: json_values,
+            ct.default_float_vec_field_name: float_vec_values
+        })
+        collection_w.insert(df)
+        collection_w.create_index(ct.default_float_vec_field_name, ct.default_flat_index)
+        collection_w.load()
+        collection_w.flush()
+        collection_w.search(vectors[:default_nq], default_search_field,
+                            default_search_params, default_limit,
+                            search_exp,
+                            output_fields=output_fields,
+                            check_task=CheckTasks.err_res,
+                            check_items={"err_code": 1100,
+                                         "err_msg": "failed to create query plan: cannot parse expression: float == 0, "
+                                                    "error: comparisons between VarChar and Int64 are not supported: "
+                                                    "invalid parameter"})
+
 class TestSearchBase(TestcaseBase):
     @pytest.fixture(
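Both expr-cache cases (this one and the nullable variant below) target the same scenario: a parsed filter plan cached for a collection name must be invalidated when the collection is dropped and recreated with a same-named field of a different type. They are skipped pending issue #37547. A rough standalone reproduction with the pymilvus MilvusClient; the URI and collection name are illustrative assumptions:

    from pymilvus import MilvusClient, DataType

    client = MilvusClient(uri="http://localhost:19530")  # assumed local deployment
    name = "expr_cache_demo"  # illustrative collection name

    # 1. "float" is a numeric field, so the filter "float == 0" parses and its plan is cached
    schema = client.create_schema()
    schema.add_field("id", DataType.INT64, is_primary=True)
    schema.add_field("float", DataType.FLOAT)
    schema.add_field("vec", DataType.FLOAT_VECTOR, dim=8)
    client.create_collection(name, schema=schema)
    # ... insert data, build an index, load, then search once:
    # client.search(name, data=[[0.0] * 8], filter="float == 0", limit=10)

    # 2. drop and recreate the SAME name, with "float" now a VARCHAR field
    client.drop_collection(name)
    schema2 = client.create_schema()
    schema2.add_field("id", DataType.INT64, is_primary=True)
    schema2.add_field("float", DataType.VARCHAR, max_length=100)
    schema2.add_field("vec", DataType.FLOAT_VECTOR, dim=8)
    client.create_collection(name, schema=schema2)

    # 3. the same filter must now be rejected ("comparisons between VarChar and
    #    Int64 are not supported"); answering it from the stale cached plan is the bug.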
@@ -13279,6 +13346,74 @@ class TestCollectionSearchNoneAndDefaultData(TestcaseBase):
                                          "limit": default_limit,
                                          "output_fields": output_fields})
 
+    @pytest.mark.tags(CaseLabel.L1)
+    @pytest.mark.skip(reason="issue #37547")
+    def test_search_none_data_expr_cache(self, is_flush):
+        """
+        target: test search case with none data to test expr cache
+        method: 1. create collection with double datatype as nullable field
+                2. search with expr "nullableFid == 0"
+                3. drop this collection
+                4. create collection with same collection name and same field name but modify the type of nullable field
+                   as varchar datatype
+                5. search with expr "nullableFid == 0" again
+        expected: 1. search successfully with limit(topK) for the first collection
+                  2. report error for the second collection with the same name
+        """
+        # 1. initialize with data
+        collection_w, _, _, insert_ids, time_stamp = \
+            self.init_collection_general(prefix, True, is_flush=is_flush,
+                                         nullable_fields={ct.default_float_field_name: 0.5})[0:5]
+        collection_name = collection_w.name
+        # 2. generate search data
+        vectors = cf.gen_vectors_based_on_vector_type(default_nq, default_dim)
+        # 3. search with expr "nullableFid == 0"
+        search_exp = f"{ct.default_float_field_name} == 0"
+        output_fields = [default_int64_field_name, default_float_field_name]
+        collection_w.search(vectors[:default_nq], default_search_field,
+                            default_search_params, default_limit,
+                            search_exp,
+                            output_fields=output_fields,
+                            check_task=CheckTasks.check_search_results,
+                            check_items={"nq": default_nq,
+                                         "ids": insert_ids,
+                                         "limit": 1,
+                                         "output_fields": output_fields})
+        # 4. drop collection
+        collection_w.drop()
+        # 5. create the same collection name with same field name but varchar field type
+        int64_field = cf.gen_int64_field(is_primary=True)
+        string_field = cf.gen_string_field(ct.default_float_field_name, nullable=True)
+        json_field = cf.gen_json_field()
+        float_vector_field = cf.gen_float_vec_field()
+        fields = [int64_field, string_field, json_field, float_vector_field]
+        schema = cf.gen_collection_schema(fields)
+        collection_w = self.init_collection_wrap(name=collection_name, schema=schema)
+        int64_values = pd.Series(data=[i for i in range(default_nb)])
+        string_values = pd.Series(data=[str(i) for i in range(default_nb)], dtype="string")
+        json_values = [{"number": i, "string": str(i), "bool": bool(i),
+                        "list": [j for j in range(i, i + ct.default_json_list_length)]} for i in range(default_nb)]
+        float_vec_values = cf.gen_vectors(default_nb, default_dim)
+        df = pd.DataFrame({
+            ct.default_int64_field_name: int64_values,
+            ct.default_float_field_name: None,
+            ct.default_json_field_name: json_values,
+            ct.default_float_vec_field_name: float_vec_values
+        })
+        collection_w.insert(df)
+        collection_w.create_index(ct.default_float_vec_field_name, ct.default_flat_index)
+        collection_w.load()
+        collection_w.flush()
+        collection_w.search(vectors[:default_nq], default_search_field,
+                            default_search_params, default_limit,
+                            search_exp,
+                            output_fields=output_fields,
+                            check_task=CheckTasks.err_res,
+                            check_items={"err_code": 1100,
+                                         "err_msg": "failed to create query plan: cannot parse expression: float == 0, "
+                                                    "error: comparisons between VarChar and Int64 are not supported: "
+                                                    "invalid parameter"})
+
 class TestSearchWithTextMatchFilter(TestcaseBase):
     """