test: Add nullable test cases for bulk writer (#37572)

issue: #36129

Signed-off-by: binbin lv <binbin.lv@zilliz.com>
binbin 2024-11-12 09:46:28 +08:00 committed by GitHub
parent c1eccce2fa
commit 21b68029a0
2 changed files with 178 additions and 37 deletions
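The pattern used throughout the new cases: when nullable is parametrized as True, each nullable field's value is independently replaced by None with probability 0.5, so a single batch exercises both the null and non-null code paths. A minimal sketch of that row generator (field names here are illustrative, not the suite's df.* constants):

    import random

    def gen_row(i: int, nullable: bool) -> dict:
        def maybe(value):
            # ~50% of values become None when the field is nullable
            return value if not (nullable and random.random() < 0.5) else None
        return {
            "pk": i,
            "int_scalar": maybe(1),
            "float_scalar": maybe(1.0),
            "string_scalar": maybe("string"),
            "array_int": maybe([1, 2]),
        }

    rows = [gen_row(i, nullable=True) for i in range(1000)]

The diff below repeats the conditional inline per field; the helper is just a compressed view of the same logic.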

View File

@@ -1477,7 +1477,8 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
     @pytest.mark.parametrize("entities", [1000]) # 1000
     @pytest.mark.parametrize("enable_dynamic_field", [True, False])
     @pytest.mark.parametrize("sparse_format", ["doc", "coo"])
-    def test_with_all_field_json_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, sparse_format):
+    @pytest.mark.parametrize("nullable", [True, False])
+    def test_with_all_field_json_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, sparse_format, nullable):
         """
         collection schema 1: [pk, int64, float64, string float_vector]
         data file: vectors.npy and uid.npy,
@@ -1489,14 +1490,14 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
         self._connect()
         fields = [
             cf.gen_int64_field(name=df.pk_field, is_primary=True, auto_id=auto_id),
-            cf.gen_int64_field(name=df.int_field),
-            cf.gen_float_field(name=df.float_field),
-            cf.gen_string_field(name=df.string_field),
-            cf.gen_json_field(name=df.json_field),
-            cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64),
-            cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT),
-            cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100),
-            cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL),
+            cf.gen_int64_field(name=df.int_field, nullable=nullable),
+            cf.gen_float_field(name=df.float_field, nullable=nullable),
+            cf.gen_string_field(name=df.string_field, nullable=nullable),
+            cf.gen_json_field(name=df.json_field, nullable=nullable),
+            cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64, nullable=nullable),
+            cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),
+            cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100, nullable=nullable),
+            cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL, nullable=nullable),
             cf.gen_float_vec_field(name=df.float_vec_field, dim=dim),
             cf.gen_float16_vec_field(name=df.fp16_vec_field, dim=dim),
             cf.gen_bfloat16_vec_field(name=df.bf16_vec_field, dim=dim),
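Only the scalar and array fields take the nullable flag; the vector fields (and the primary key) keep the default, since Milvus does not allow them to be nullable. At the pymilvus level the schema switch looks roughly like this (a sketch assuming a 2.5-style FieldSchema that accepts nullable):

    from pymilvus import FieldSchema, DataType

    # A scalar field whose per-row value may be None
    int_field = FieldSchema(name="int_scalar", dtype=DataType.INT64, nullable=True)

    # Vector fields cannot be nullable; every row must supply a vector
    vec_field = FieldSchema(name="float_vector", dtype=DataType.FLOAT_VECTOR, dim=128)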
@@ -1528,14 +1529,14 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
         for i in range(entities):
             row = {
                 df.pk_field: i,
-                df.int_field: 1,
-                df.float_field: 1.0,
-                df.string_field: "string",
-                df.json_field: json_value[i%len(json_value)],
-                df.array_int_field: [1, 2],
-                df.array_float_field: [1.0, 2.0],
-                df.array_string_field: ["string1", "string2"],
-                df.array_bool_field: [True, False],
+                df.int_field: 1 if not (nullable and random.random() < 0.5) else None,
+                df.float_field: 1.0 if not (nullable and random.random() < 0.5) else None,
+                df.string_field: "string" if not (nullable and random.random() < 0.5) else None,
+                df.json_field: json_value[i%len(json_value)] if not (nullable and random.random() < 0.5) else None,
+                df.array_int_field: [1, 2] if not (nullable and random.random() < 0.5) else None,
+                df.array_float_field: [1.0, 2.0] if not (nullable and random.random() < 0.5) else None,
+                df.array_string_field: ["string1", "string2"] if not (nullable and random.random() < 0.5) else None,
+                df.array_bool_field: [True, False] if not (nullable and random.random() < 0.5) else None,
                 df.float_vec_field: cf.gen_vectors(1, dim)[0],
                 df.fp16_vec_field: cf.gen_vectors(1, dim, vector_data_type="FLOAT16_VECTOR")[0],
                 df.bf16_vec_field: cf.gen_vectors(1, dim, vector_data_type="BFLOAT16_VECTOR")[0],
@@ -1606,13 +1607,17 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
     @pytest.mark.parametrize("dim", [128]) # 128
     @pytest.mark.parametrize("entities", [1000]) # 1000
     @pytest.mark.parametrize("enable_dynamic_field", [True, False])
-    def test_with_all_field_numpy_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field):
+    @pytest.mark.parametrize("nullable", [True, False])
+    def test_with_all_field_numpy_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, nullable):
         """
         """
+        if nullable is True:
+            pytest.skip("not support bulk writer numpy files in field(int_scalar) which has 'None' data")
         self._connect()
         fields = [
             cf.gen_int64_field(name=df.pk_field, is_primary=True, auto_id=auto_id),
-            cf.gen_int64_field(name=df.int_field),
+            cf.gen_int64_field(name=df.int_field, nullable=nullable),
             cf.gen_float_field(name=df.float_field),
             cf.gen_string_field(name=df.string_field),
             cf.gen_json_field(name=df.json_field),
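The skip above is a file-format limitation rather than a nullable bug: a NumPy int64 column has no encoding for a missing value, so a row whose INT64 scalar is None cannot be written to an .npy data file. A quick illustration of why:

    import numpy as np

    print(np.array([1, 2, 3]).dtype)     # int64 (the platform default int)
    print(np.array([1, None, 3]).dtype)  # object -- no longer a valid INT64 column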
@@ -1646,7 +1651,7 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
         for i in range(entities):
             row = {
                 df.pk_field: i,
-                df.int_field: 1,
+                df.int_field: 1 if not (nullable and random.random() < 0.5) else None,
                 df.float_field: 1.0,
                 df.string_field: "string",
                 df.json_field: json_value[i%len(json_value)],
@@ -1720,20 +1725,21 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
     @pytest.mark.parametrize("entities", [1000]) # 1000
     @pytest.mark.parametrize("enable_dynamic_field", [True, False])
     @pytest.mark.parametrize("sparse_format", ["doc", "coo"])
-    def test_with_all_field_parquet_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, sparse_format):
+    @pytest.mark.parametrize("nullable", [True, False])
+    def test_with_all_field_parquet_with_bulk_writer(self, auto_id, dim, entities, enable_dynamic_field, sparse_format, nullable):
         """
         """
         self._connect()
         fields = [
             cf.gen_int64_field(name=df.pk_field, is_primary=True, auto_id=auto_id),
-            cf.gen_int64_field(name=df.int_field),
-            cf.gen_float_field(name=df.float_field),
-            cf.gen_string_field(name=df.string_field),
-            cf.gen_json_field(name=df.json_field),
-            cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64),
-            cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT),
-            cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100),
-            cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL),
+            cf.gen_int64_field(name=df.int_field, nullable=nullable),
+            cf.gen_float_field(name=df.float_field, nullable=nullable),
+            cf.gen_string_field(name=df.string_field, nullable=nullable),
+            cf.gen_json_field(name=df.json_field, nullable=nullable),
+            cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64, nullable=nullable),
+            cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),
+            cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100, nullable=nullable),
+            cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL, nullable=nullable),
             cf.gen_float_vec_field(name=df.float_vec_field, dim=dim),
             cf.gen_float16_vec_field(name=df.fp16_vec_field, dim=dim),
             cf.gen_bfloat16_vec_field(name=df.bf16_vec_field, dim=dim),
@@ -1765,14 +1771,14 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
         for i in range(entities):
             row = {
                 df.pk_field: i,
-                df.int_field: 1,
-                df.float_field: 1.0,
-                df.string_field: "string",
-                df.json_field: json_value[i%len(json_value)],
-                df.array_int_field: [1, 2],
-                df.array_float_field: [1.0, 2.0],
-                df.array_string_field: ["string1", "string2"],
-                df.array_bool_field: [True, False],
+                df.int_field: 1 if not (nullable and random.random() < 0.5) else None,
+                df.float_field: 1.0 if not (nullable and random.random() < 0.5) else None,
+                df.string_field: "string" if not (nullable and random.random() < 0.5) else None,
+                df.json_field: json_value[i%len(json_value)] if not (nullable and random.random() < 0.5) else None,
+                df.array_int_field: [1, 2] if not (nullable and random.random() < 0.5) else None,
+                df.array_float_field: [1.0, 2.0] if not (nullable and random.random() < 0.5) else None,
+                df.array_string_field: ["string1", "string2"] if not (nullable and random.random() < 0.5) else None,
+                df.array_bool_field: [True, False] if not (nullable and random.random() < 0.5) else None,
                 df.float_vec_field: cf.gen_vectors(1, dim)[0],
                 df.fp16_vec_field: cf.gen_vectors(1, dim, vector_data_type="FLOAT16_VECTOR")[0],
                 df.bf16_vec_field: cf.gen_vectors(1, dim, vector_data_type="BFLOAT16_VECTOR")[0],
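The JSON and Parquet variants are additionally parametrized over sparse_format. In Milvus bulk-import data a sparse vector can be written in two shapes: a dict keyed by dimension index ("doc") or parallel index/value lists ("coo"). The exact spelling below follows the convention used elsewhere in these tests and should be read as an assumption:

    # "doc" style: dimension index (as a string key) -> weight
    sparse_doc = {"7": 0.42, "1024": 0.13}

    # "coo" style: coordinate lists, indices paired with values
    sparse_coo = {"indices": [7, 1024], "values": [0.42, 0.13]}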

View File

@@ -4809,6 +4809,73 @@ class TestCollectionSearch(TestcaseBase):
                             check_task=CheckTasks.err_res,
                             check_items={"err_code": 999, "err_msg": f"invalid dimension: {dim}."})
 
+    @pytest.mark.tags(CaseLabel.L1)
+    @pytest.mark.skip(reason="issue #37547")
+    def test_search_verify_expr_cache(self, is_flush):
+        """
+        target: test search case to test expr cache
+        method: 1. create collection with a double datatype field
+                2. search with expr "doubleField == 0"
+                3. drop this collection
+                4. create collection with same collection name and same field name but modify the type of double field
+                   as varchar datatype
+                5. search with expr "doubleField == 0" again
+        expected: 1. search successfully with limit(topK) for the first collection
+                  2. report error for the second collection with the same name
+        """
+        # 1. initialize with data
+        collection_w, _, _, insert_ids, time_stamp = \
+            self.init_collection_general(prefix, True, is_flush=is_flush)[0:5]
+        collection_name = collection_w.name
+        # 2. generate search data
+        vectors = cf.gen_vectors_based_on_vector_type(default_nq, default_dim)
+        # 3. search with expr "float == 0"
+        search_exp = f"{ct.default_float_field_name} == 0"
+        output_fields = [default_int64_field_name, default_float_field_name]
+        collection_w.search(vectors[:default_nq], default_search_field,
+                            default_search_params, default_limit,
+                            search_exp,
+                            output_fields=output_fields,
+                            check_task=CheckTasks.check_search_results,
+                            check_items={"nq": default_nq,
+                                         "ids": insert_ids,
+                                         "limit": 1,
+                                         "output_fields": output_fields})
+        # 4. drop collection
+        collection_w.drop()
+        # 5. create the same collection name with same field name but varchar field type
+        int64_field = cf.gen_int64_field(is_primary=True)
+        string_field = cf.gen_string_field(ct.default_float_field_name)
+        json_field = cf.gen_json_field()
+        float_vector_field = cf.gen_float_vec_field()
+        fields = [int64_field, string_field, json_field, float_vector_field]
+        schema = cf.gen_collection_schema(fields)
+        collection_w = self.init_collection_wrap(name=collection_name, schema=schema)
+        int64_values = pd.Series(data=[i for i in range(default_nb)])
+        string_values = pd.Series(data=[str(i) for i in range(default_nb)], dtype="string")
+        json_values = [{"number": i, "string": str(i), "bool": bool(i),
+                        "list": [j for j in range(i, i + ct.default_json_list_length)]} for i in range(default_nb)]
+        float_vec_values = cf.gen_vectors(default_nb, default_dim)
+        df = pd.DataFrame({
+            ct.default_int64_field_name: int64_values,
+            ct.default_float_field_name: string_values,
+            ct.default_json_field_name: json_values,
+            ct.default_float_vec_field_name: float_vec_values
+        })
+        collection_w.insert(df)
+        collection_w.create_index(ct.default_float_vec_field_name, ct.default_flat_index)
+        collection_w.load()
+        collection_w.flush()
+        collection_w.search(vectors[:default_nq], default_search_field,
+                            default_search_params, default_limit,
+                            search_exp,
+                            output_fields=output_fields,
+                            check_task=CheckTasks.err_res,
+                            check_items={"err_code": 1100,
+                                         "err_msg": "failed to create query plan: cannot parse expression: float == 0, "
+                                                    "error: comparisons between VarChar and Int64 are not supported: "
+                                                    "invalid parameter"})
+
 class TestSearchBase(TestcaseBase):
     @pytest.fixture(
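Both expr-cache cases (this one and the nullable variant below) target the same scenario: a parsed filter plan cached for a collection name must be invalidated when the collection is dropped and recreated with a same-named field of a different type. They are skipped pending issue #37547. A rough standalone reproduction with the pymilvus MilvusClient; the URI and collection name are illustrative assumptions:

    from pymilvus import MilvusClient, DataType

    client = MilvusClient(uri="http://localhost:19530")  # assumed local deployment
    name = "expr_cache_demo"  # illustrative collection name

    # 1. "float" is a numeric field, so the filter "float == 0" parses and its plan is cached
    schema = client.create_schema()
    schema.add_field("id", DataType.INT64, is_primary=True)
    schema.add_field("float", DataType.FLOAT)
    schema.add_field("vec", DataType.FLOAT_VECTOR, dim=8)
    client.create_collection(name, schema=schema)
    # ... insert data, build an index, load, then search once:
    # client.search(name, data=[[0.0] * 8], filter="float == 0", limit=10)

    # 2. drop and recreate the SAME name, with "float" now a VARCHAR field
    client.drop_collection(name)
    schema2 = client.create_schema()
    schema2.add_field("id", DataType.INT64, is_primary=True)
    schema2.add_field("float", DataType.VARCHAR, max_length=100)
    schema2.add_field("vec", DataType.FLOAT_VECTOR, dim=8)
    client.create_collection(name, schema=schema2)

    # 3. the same filter must now be rejected ("comparisons between VarChar and
    #    Int64 are not supported"); answering it from the stale cached plan is the bug.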
@@ -13279,6 +13346,74 @@ class TestCollectionSearchNoneAndDefaultData(TestcaseBase):
                                          "limit": default_limit,
                                          "output_fields": output_fields})
 
+    @pytest.mark.tags(CaseLabel.L1)
+    @pytest.mark.skip(reason="issue #37547")
+    def test_search_none_data_expr_cache(self, is_flush):
+        """
+        target: test search case with none data to test expr cache
+        method: 1. create collection with double datatype as nullable field
+                2. search with expr "nullableFid == 0"
+                3. drop this collection
+                4. create collection with same collection name and same field name but modify the type of nullable field
+                   as varchar datatype
+                5. search with expr "nullableFid == 0" again
+        expected: 1. search successfully with limit(topK) for the first collection
+                  2. report error for the second collection with the same name
+        """
+        # 1. initialize with data
+        collection_w, _, _, insert_ids, time_stamp = \
+            self.init_collection_general(prefix, True, is_flush=is_flush,
+                                         nullable_fields={ct.default_float_field_name: 0.5})[0:5]
+        collection_name = collection_w.name
+        # 2. generate search data
+        vectors = cf.gen_vectors_based_on_vector_type(default_nq, default_dim)
+        # 3. search with expr "nullableFid == 0"
+        search_exp = f"{ct.default_float_field_name} == 0"
+        output_fields = [default_int64_field_name, default_float_field_name]
+        collection_w.search(vectors[:default_nq], default_search_field,
+                            default_search_params, default_limit,
+                            search_exp,
+                            output_fields=output_fields,
+                            check_task=CheckTasks.check_search_results,
+                            check_items={"nq": default_nq,
+                                         "ids": insert_ids,
+                                         "limit": 1,
+                                         "output_fields": output_fields})
+        # 4. drop collection
+        collection_w.drop()
+        # 5. create the same collection name with same field name but varchar field type
+        int64_field = cf.gen_int64_field(is_primary=True)
+        string_field = cf.gen_string_field(ct.default_float_field_name, nullable=True)
+        json_field = cf.gen_json_field()
+        float_vector_field = cf.gen_float_vec_field()
+        fields = [int64_field, string_field, json_field, float_vector_field]
+        schema = cf.gen_collection_schema(fields)
+        collection_w = self.init_collection_wrap(name=collection_name, schema=schema)
+        int64_values = pd.Series(data=[i for i in range(default_nb)])
+        string_values = pd.Series(data=[str(i) for i in range(default_nb)], dtype="string")
+        json_values = [{"number": i, "string": str(i), "bool": bool(i),
+                        "list": [j for j in range(i, i + ct.default_json_list_length)]} for i in range(default_nb)]
+        float_vec_values = cf.gen_vectors(default_nb, default_dim)
+        df = pd.DataFrame({
+            ct.default_int64_field_name: int64_values,
+            ct.default_float_field_name: None,
+            ct.default_json_field_name: json_values,
+            ct.default_float_vec_field_name: float_vec_values
+        })
+        collection_w.insert(df)
+        collection_w.create_index(ct.default_float_vec_field_name, ct.default_flat_index)
+        collection_w.load()
+        collection_w.flush()
+        collection_w.search(vectors[:default_nq], default_search_field,
+                            default_search_params, default_limit,
+                            search_exp,
+                            output_fields=output_fields,
+                            check_task=CheckTasks.err_res,
+                            check_items={"err_code": 1100,
+                                         "err_msg": "failed to create query plan: cannot parse expression: float == 0, "
+                                                    "error: comparisons between VarChar and Int64 are not supported: "
+                                                    "invalid parameter"})
+
 class TestSearchWithTextMatchFilter(TestcaseBase):
     """