test: Add bulk insert related test cases for default and null support (#36219)

issue: #36129

Signed-off-by: binbin lv <binbin.lv@zilliz.com>
pull/36266/head
binbin 2024-09-18 19:33:17 +08:00 committed by GitHub
parent 23b95aeba3
commit 5ca4d5977a
4 changed files with 180 additions and 64 deletions
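
For orientation, the feature these tests exercise is scalar fields that may be declared nullable and then left null (or filled by a default) during bulk import. Below is a minimal, hypothetical sketch of that scenario, not code from this commit: it assumes a pymilvus version whose FieldSchema accepts a nullable argument, and all field names are illustrative.

# Hypothetical illustration of the scenario under test; not part of this patch.
# Assumes FieldSchema(..., nullable=True) is available in the installed pymilvus.
import json

from pymilvus import CollectionSchema, DataType, FieldSchema

fields = [
    FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True),
    FieldSchema(name="int_scalar", dtype=DataType.INT64, nullable=True),
    FieldSchema(name="string_scalar", dtype=DataType.VARCHAR, max_length=100, nullable=True),
    FieldSchema(name="float_vector", dtype=DataType.FLOAT_VECTOR, dim=8),
]
schema = CollectionSchema(fields=fields, description="nullable bulk insert demo")

# A JSON-format bulk-import row can simply carry null for the nullable scalars.
row = {"pk": 0, "int_scalar": None, "string_scalar": None, "float_vector": [0.1] * 8}
print(json.dumps([row]))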

View File

@@ -323,6 +323,7 @@ class ResponseChecker:
         for hits in search_res:
             searched_original_vectors = []
             ids = []
+            vector_id = 0
             if enable_milvus_client_api:
                 for hit in hits:
                     ids.append(hit['id'])
@@ -349,12 +350,13 @@ class ResponseChecker:
                     raise Exception("inserted vectors are needed for distance check")
                 for id in hits.ids:
                     searched_original_vectors.append(check_items["original_vectors"][id])
-                cf.compare_distance_vector_and_vector_list(check_items["vector_nq"][i],
+                cf.compare_distance_vector_and_vector_list(check_items["vector_nq"][vector_id],
                                                            searched_original_vectors,
                                                            check_items["metric"], hits.distances)
                 log.info("search_results_check: Checked the distances for one nq: OK")
             else:
                 pass # just check nq and topk, not specific ids need check
+            vector_id += 1
         log.info("search_results_check: limit (topK) and "
                  "ids searched for %d queries are correct" % len(search_res))

View File

@@ -14,6 +14,7 @@ from sklearn import preprocessing
 from common.common_func import gen_unique_str
 from common.minio_comm import copy_files_to_minio
 from utils.util_log import test_log as log
+import pyarrow as pa
 
 data_source = "/tmp/bulk_insert_data"
 fake = Faker()
@@ -444,7 +445,7 @@ def gen_json_in_numpy_file(dir, data_field, rows, start=0, force=False):
     return file_name
 
 
-def gen_int_or_float_in_numpy_file(dir, data_field, rows, start=0, force=False):
+def gen_int_or_float_in_numpy_file(dir, data_field, rows, start=0, force=False, nullable=False):
     file_name = f"{data_field}.npy"
     file = f"{dir}/{file_name}"
     if not os.path.exists(file) or force:
@@ -459,7 +460,10 @@ def gen_int_or_float_in_numpy_file(dir, data_field, rows, start=0, force=False):
         elif data_field == DataField.pk_field:
             data = [i for i in range(start, start + rows)]
         elif data_field == DataField.int_field:
-            data = [random.randint(-999999, 9999999) for _ in range(rows)]
+            if not nullable:
+                data = [random.randint(-999999, 9999999) for _ in range(rows)]
+            else:
+                data = [None for _ in range(rows)]
         arr = np.array(data)
         log.info(f"file_name: {file_name} data type: {arr.dtype} data shape: {arr.shape}")
         np.save(file, arr)
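
Worth noting for the nullable branch added above: np.save on a list of None values produces an object-dtype array rather than a numeric column. A quick standalone illustration, not part of the patch (the file path is made up):

# The nullable branch above hands np.save a list of None values; numpy stores
# that as an object-dtype (pickled) array, not an int64 column.
import numpy as np

data = [None for _ in range(3)]
arr = np.array(data)
print(arr.dtype)                        # object
np.save("/tmp/int_scalar_nullable_demo.npy", arr)
loaded = np.load("/tmp/int_scalar_nullable_demo.npy", allow_pickle=True)
print(loaded)                           # [None None None]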
@@ -496,11 +500,14 @@ def gen_data_by_data_field(data_field, rows, start=0, float_vector=True, dim=128
     array_length = random.randint(0, 10)
     schema = kwargs.get("schema", None)
     schema = schema.to_dict() if schema is not None else None
+    nullable = False
     if schema is not None:
         fields = schema.get("fields", [])
         for field in fields:
-            if data_field == field["name"] and "params" in field:
-                dim = field["params"].get("dim", dim)
+            if data_field == field["name"]:
+                if "params" in field:
+                    dim = field["params"].get("dim", dim)
+                nullable = field.get("nullable", False)
     data = []
     if rows > 0:
         if "vec" in data_field:
@@ -522,37 +529,75 @@ def gen_data_by_data_field(data_field, rows, start=0, float_vector=True, dim=128
             else:
                 data = gen_vectors(float_vector=float_vector, rows=rows, dim=dim)
         elif data_field == DataField.float_field:
-            data = [np.float32(random.random()) for _ in range(rows)]
+            if not nullable:
+                data = [np.float32(random.random()) for _ in range(rows)]
+            else:
+                data = [None for _ in range(rows)]
         elif data_field == DataField.double_field:
-            data = [np.float64(random.random()) for _ in range(rows)]
+            if not nullable:
+                data = [np.float64(random.random()) for _ in range(rows)]
+            else:
+                data = [None for _ in range(rows)]
         elif data_field == DataField.pk_field:
-            data = [np.int64(i) for i in range(start, start + rows)]
+            if not nullable:
+                data = [np.int64(i) for i in range(start, start + rows)]
+            else:
+                data = [None for _ in range(start, start + rows)]
         elif data_field == DataField.int_field:
-            data = [np.int64(random.randint(-999999, 9999999)) for _ in range(rows)]
+            if not nullable:
+                data = [np.int64(random.randint(-999999, 9999999)) for _ in range(rows)]
+            else:
+                data = [None for _ in range(rows)]
         elif data_field == DataField.string_field:
-            data = [gen_unique_str(str(i)) for i in range(start, rows + start)]
+            if not nullable:
+                data = [gen_unique_str(str(i)) for i in range(start, rows + start)]
+            else:
+                data = [None for _ in range(start, rows + start)]
         elif data_field == DataField.bool_field:
-            data = [random.choice([True, False]) for i in range(start, rows + start)]
+            if not nullable:
+                data = [random.choice([True, False]) for i in range(start, rows + start)]
+            else:
+                data = [None for _ in range(start, rows + start)]
         elif data_field == DataField.json_field:
-            data = pd.Series([json.dumps({
-                gen_unique_str(): random.randint(-999999, 9999999)
-            }) for i in range(start, rows + start)], dtype=np.dtype("str"))
+            if not nullable:
+                data = pd.Series([json.dumps({
+                    gen_unique_str(): random.randint(-999999, 9999999)
+                }) for i in range(start, rows + start)], dtype=np.dtype("str"))
+            else:
+                data = pd.Series([json.dumps({
+                    gen_unique_str(): None}) for _ in range(start, rows + start)])
         elif data_field == DataField.array_bool_field:
-            data = pd.Series(
-                [np.array([random.choice([True, False]) for _ in range(array_length)], dtype=np.dtype("bool"))
-                 for i in range(start, rows + start)])
+            if not nullable:
+                data = pd.Series(
+                    [np.array([random.choice([True, False]) for _ in range(array_length)], dtype=np.dtype("bool"))
+                     for i in range(start, rows + start)])
+            else:
+                data = pd.Series(
+                    [np.array(None) for i in range(start, rows + start)])
         elif data_field == DataField.array_int_field:
-            data = pd.Series(
-                [np.array([random.randint(-999999, 9999999) for _ in range(array_length)], dtype=np.dtype("int64"))
-                 for i in range(start, rows + start)])
+            if not nullable:
+                data = pd.Series(
+                    [np.array([random.randint(-999999, 9999999) for _ in range(array_length)], dtype=np.dtype("int64"))
+                     for i in range(start, rows + start)])
+            else:
+                data = pd.Series(
+                    [np.array(None) for i in range(start, rows + start)])
         elif data_field == DataField.array_float_field:
-            data = pd.Series(
-                [np.array([random.random() for _ in range(array_length)], dtype=np.dtype("float32"))
-                 for i in range(start, rows + start)])
+            if not nullable:
+                data = pd.Series(
+                    [np.array([random.random() for _ in range(array_length)], dtype=np.dtype("float32"))
+                     for i in range(start, rows + start)])
+            else:
+                data = pd.Series(
+                    [np.array(None) for i in range(start, rows + start)])
         elif data_field == DataField.array_string_field:
-            data = pd.Series(
-                [np.array([gen_unique_str(str(i)) for _ in range(array_length)], dtype=np.dtype("str"))
-                 for i in range(start, rows + start)])
+            if not nullable:
+                data = pd.Series(
+                    [np.array([gen_unique_str(str(i)) for _ in range(array_length)], dtype=np.dtype("str"))
+                     for i in range(start, rows + start)])
+            else:
+                data = pd.Series(
+                    [np.array(None) for i in range(start, rows + start)])
     return data
@@ -627,14 +672,18 @@ def gen_dict_data_by_data_field(data_fields, rows, start=0, float_vector=True, d
     schema = kwargs.get("schema", None)
     schema = schema.to_dict() if schema is not None else None
     data = []
+    nullable = False
     for r in range(rows):
         d = {}
         for data_field in data_fields:
+            d[data_field] = None
            if schema is not None:
                 fields = schema.get("fields", [])
                 for field in fields:
-                    if data_field == field["name"] and "params" in field:
-                        dim = field["params"].get("dim", dim)
+                    if data_field == field["name"]:
+                        if "params" in field:
+                            dim = field["params"].get("dim", dim)
+                        nullable = field.get("nullable", False)
             if "vec" in data_field:
                 if "float" in data_field:
@@ -651,31 +700,52 @@ def gen_dict_data_by_data_field(data_fields, rows, start=0, float_vector=True, d
                 if "fp16" in data_field:
                     d[data_field] = gen_fp16_vectors(1, dim, True)[1][0]
             elif data_field == DataField.float_field:
-                d[data_field] = random.random()
+                if not nullable:
+                    d[data_field] = random.random()
             elif data_field == DataField.double_field:
-                d[data_field] = random.random()
+                if not nullable:
+                    d[data_field] = random.random()
             elif data_field == DataField.pk_field:
-                d[data_field] = r+start
+                if not nullable:
+                    d[data_field] = r+start
             elif data_field == DataField.int_field:
-                d[data_field] =random.randint(-999999, 9999999)
+                if not nullable:
+                    d[data_field] = random.randint(-999999, 9999999)
             elif data_field == DataField.string_field:
-                d[data_field] = gen_unique_str(str(r + start))
+                if not nullable:
+                    d[data_field] = gen_unique_str(str(r + start))
             elif data_field == DataField.bool_field:
-                d[data_field] = random.choice([True, False])
+                if not nullable:
+                    d[data_field] = random.choice([True, False])
             elif data_field == DataField.json_field:
-                d[data_field] = {str(r+start): r+start}
+                if not nullable:
+                    d[data_field] = {str(r+start): r+start}
+                else:
+                    d[data_field] = {str(r + start): None}
             elif data_field == DataField.array_bool_field:
                 array_length = random.randint(0, 10) if array_length is None else array_length
-                d[data_field] = [random.choice([True, False]) for _ in range(array_length)]
+                if not nullable:
+                    d[data_field] = [random.choice([True, False]) for _ in range(array_length)]
+                else:
+                    d[data_field] = None
             elif data_field == DataField.array_int_field:
                 array_length = random.randint(0, 10) if array_length is None else array_length
-                d[data_field] = [random.randint(-999999, 9999999) for _ in range(array_length)]
+                if not nullable:
+                    d[data_field] = [random.randint(-999999, 9999999) for _ in range(array_length)]
+                else:
+                    d[data_field] = None
             elif data_field == DataField.array_float_field:
                 array_length = random.randint(0, 10) if array_length is None else array_length
-                d[data_field] = [random.random() for _ in range(array_length)]
+                if not nullable:
+                    d[data_field] = [random.random() for _ in range(array_length)]
+                else:
+                    d[data_field] = None
             elif data_field == DataField.array_string_field:
                 array_length = random.randint(0, 10) if array_length is None else array_length
-                d[data_field] = [gen_unique_str(str(i)) for i in range(array_length)]
+                if not nullable:
+                    d[data_field] = [gen_unique_str(str(i)) for i in range(array_length)]
+                else:
+                    d[data_field] = None
         if enable_dynamic_field:
             d[str(r+start)] = r+start
             d["name"] = fake.name()
@@ -685,7 +755,8 @@ def gen_dict_data_by_data_field(data_fields, rows, start=0, float_vector=True, d
     return data
 
 
-def gen_new_json_files(float_vector, rows, dim, data_fields, file_nums=1, array_length=None, file_size=None, err_type="", enable_dynamic_field=False, **kwargs):
+def gen_new_json_files(float_vector, rows, dim, data_fields, file_nums=1, array_length=None, file_size=None,
+                       err_type="", enable_dynamic_field=False, **kwargs):
     schema = kwargs.get("schema", None)
     dir_prefix = f"json-{uuid.uuid4()}"
     data_source_new = f"{data_source}/{dir_prefix}"
@@ -703,7 +774,9 @@ def gen_new_json_files(float_vector, rows, dim, data_fields, file_nums=1, array_
         file_name = f"data-fields-{len(data_fields)}-rows-{rows}-dim-{dim}-file-num-{i}-{int(time.time())}.json"
         file = f"{data_source_new}/{file_name}"
         Path(file).parent.mkdir(parents=True, exist_ok=True)
-        data = gen_dict_data_by_data_field(data_fields=data_fields, rows=rows, start=start_uid, float_vector=float_vector, dim=dim, array_length=array_length, enable_dynamic_field=enable_dynamic_field, **kwargs)
+        data = gen_dict_data_by_data_field(data_fields=data_fields, rows=rows, start=start_uid,
+                                           float_vector=float_vector, dim=dim, array_length=array_length,
+                                           enable_dynamic_field=enable_dynamic_field, **kwargs)
         # log.info(f"data: {data}")
         with open(file, "w") as f:
             json.dump(data, f)
@@ -742,14 +815,17 @@ def gen_npy_files(float_vector, rows, dim, data_fields, file_size=None, file_num
             json.dump(schema, f)
     files = []
     start_uid = 0
+    nullable = False
     if file_nums == 1:
         # gen the numpy file without subfolders if only one set of files
         for data_field in data_fields:
             if schema is not None:
                 fields = schema.get("fields", [])
                 for field in fields:
-                    if data_field == field["name"] and "params" in field:
-                        dim = field["params"].get("dim", dim)
+                    if data_field == field["name"]:
+                        if "params" in field:
+                            dim = field["params"].get("dim", dim)
+                        nullable = field.get("nullable", False)
             if "vec" in data_field:
                 vector_type = "float32"
                 if "float" in data_field:
@@ -775,7 +851,7 @@ def gen_npy_files(float_vector, rows, dim, data_fields, file_size=None, file_num
                 file_name = gen_json_in_numpy_file(dir=data_source_new, data_field=data_field, rows=rows, force=force)
             else:
                 file_name = gen_int_or_float_in_numpy_file(dir=data_source_new, data_field=data_field,
-                                                           rows=rows, force=force)
+                                                           rows=rows, force=force, nullable=nullable)
             files.append(file_name)
         if enable_dynamic_field and include_meta:
             file_name = gen_dynamic_field_in_numpy_file(dir=data_source_new, rows=rows, force=force)
@@ -827,7 +903,9 @@ def gen_dynamic_field_data_in_parquet_file(rows, start=0):
     return data
 
 
-def gen_parquet_files(float_vector, rows, dim, data_fields, file_size=None, row_group_size=None, file_nums=1, array_length=None, err_type="", enable_dynamic_field=False, include_meta=True, sparse_format="doc", **kwargs):
+def gen_parquet_files(float_vector, rows, dim, data_fields, file_size=None, row_group_size=None, file_nums=1,
+                      array_length=None, err_type="", enable_dynamic_field=False, include_meta=True,
+                      sparse_format="doc", **kwargs):
     schema = kwargs.get("schema", None)
     u_id = f"parquet-{uuid.uuid4()}"
     data_source_new = f"{data_source}/{u_id}"
@@ -850,7 +928,8 @@ def gen_parquet_files(float_vector, rows, dim, data_fields, file_size=None, row_
         all_field_data = {}
         for data_field in data_fields:
             data = gen_data_by_data_field(data_field=data_field, rows=rows, start=0,
-                                          float_vector=float_vector, dim=dim, array_length=array_length, sparse_format=sparse_format, **kwargs)
+                                          float_vector=float_vector, dim=dim, array_length=array_length,
+                                          sparse_format=sparse_format, **kwargs)
             all_field_data[data_field] = data
         if enable_dynamic_field and include_meta:
             all_field_data["$meta"] = gen_dynamic_field_data_in_parquet_file(rows=rows, start=0)
@@ -1023,8 +1102,10 @@ def prepare_bulk_insert_numpy_files(minio_endpoint="", bucket_name="milvus-bucke
     return files
 
 
-def prepare_bulk_insert_parquet_files(minio_endpoint="", bucket_name="milvus-bucket", rows=100, dim=128, array_length=None, file_size=None, row_group_size=None,
-                                      enable_dynamic_field=False, data_fields=[DataField.vec_field], float_vector=True, file_nums=1, force=False, include_meta=True, sparse_format="doc", **kwargs):
+def prepare_bulk_insert_parquet_files(minio_endpoint="", bucket_name="milvus-bucket", rows=100, dim=128, array_length=None,
+                                      file_size=None, row_group_size=None, enable_dynamic_field=False,
+                                      data_fields=[DataField.vec_field], float_vector=True, file_nums=1, force=False,
+                                      include_meta=True, sparse_format="doc", **kwargs):
     """
     Generate column based files based on params in parquet format and copy them to the minio
    Note: each field in data_fields would be generated one parquet file.
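
The module now imports pyarrow as pa, and the parquet helpers above generate per-field columns that may contain None. As a rough sketch of what a parquet file with null scalar values looks like when written through the public pyarrow API (column names and the row_group_size value are illustrative; this is not the helper's actual code):

# Hypothetical example of a parquet file whose scalar columns contain nulls;
# written with the public pyarrow API, not taken from gen_parquet_files.
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({
    "int_scalar": pa.array([1, None, 3], type=pa.int64()),
    "string_scalar": pa.array(["a", None, "c"], type=pa.string()),
})
pq.write_table(table, "/tmp/nullable_demo.parquet", row_group_size=2)
print(pq.read_table("/tmp/nullable_demo.parquet").to_pandas())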

View File

@@ -748,7 +748,9 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
     @pytest.mark.parametrize("entities", [2000])
     @pytest.mark.parametrize("enable_dynamic_field", [True])
     @pytest.mark.parametrize("enable_partition_key", [True, False])
-    def test_bulk_insert_all_field_with_new_json_format(self, auto_id, dim, entities, enable_dynamic_field, enable_partition_key):
+    @pytest.mark.parametrize("nullable", [True, False])
+    def test_bulk_insert_all_field_with_new_json_format(self, auto_id, dim, entities, enable_dynamic_field,
+                                                        enable_partition_key, nullable):
        """
        collection schema 1: [pk, int64, float64, string float_vector]
        data file: vectors.npy and uid.npy,
@@ -757,20 +759,22 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
        2. import data
        3. verify
        """
+        if enable_partition_key is True and nullable is True:
+            pytest.skip("partition key field not support nullable")
         float_vec_field_dim = dim
         binary_vec_field_dim = ((dim+random.randint(-16, 32)) // 8) * 8
         bf16_vec_field_dim = dim+random.randint(-16, 32)
         fp16_vec_field_dim = dim+random.randint(-16, 32)
         fields = [
             cf.gen_int64_field(name=df.pk_field, is_primary=True, auto_id=auto_id),
-            cf.gen_int64_field(name=df.int_field),
-            cf.gen_float_field(name=df.float_field),
-            cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key),
-            cf.gen_json_field(name=df.json_field),
-            cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64),
-            cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT),
-            cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100),
-            cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL),
+            cf.gen_int64_field(name=df.int_field, nullable=nullable),
+            cf.gen_float_field(name=df.float_field, nullable=nullable),
+            cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key, nullable=nullable),
+            cf.gen_json_field(name=df.json_field, nullable=nullable),
+            cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64, nullable=nullable),
+            cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),
+            cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100, nullable=nullable),
+            cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL, nullable=nullable),
             cf.gen_float_vec_field(name=df.float_vec_field, dim=float_vec_field_dim),
             cf.gen_binary_vec_field(name=df.binary_vec_field, dim=binary_vec_field_dim),
             cf.gen_bfloat16_vec_field(name=df.bf16_vec_field, dim=bf16_vec_field_dim),
@@ -878,10 +882,18 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
         assert "name" in fields_from_search
         assert "address" in fields_from_search
         # query data
-        res, _ = self.collection_wrap.query(expr=f"{df.string_field} >= '0'", output_fields=[df.string_field])
+        if not nullable:
+            expr_field = df.string_field
+            expr = f"{expr_field} >= '0'"
+        else:
+            expr_field = df.pk_field
+            expr = f"{expr_field} >= 0"
+        res, _ = self.collection_wrap.query(expr=f"{expr}", output_fields=[expr_field, df.int_field])
         assert len(res) == entities
-        query_data = [r[df.string_field] for r in res][:len(self.collection_wrap.partitions)]
-        res, _ = self.collection_wrap.query(expr=f"{df.string_field} in {query_data}", output_fields=[df.string_field])
+        log.info(res)
+        query_data = [r[expr_field] for r in res][:len(self.collection_wrap.partitions)]
+        res, _ = self.collection_wrap.query(expr=f"{expr_field} in {query_data}", output_fields=[expr_field])
         assert len(res) == len(query_data)
         if enable_partition_key:
             assert len(self.collection_wrap.partitions) > 1
@@ -893,7 +905,8 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
     @pytest.mark.parametrize("enable_dynamic_field", [True, False])
     @pytest.mark.parametrize("enable_partition_key", [True, False])
     @pytest.mark.parametrize("include_meta", [True, False])
-    def test_bulk_insert_all_field_with_numpy(self, auto_id, dim, entities, enable_dynamic_field, enable_partition_key, include_meta):
+    @pytest.mark.parametrize("nullable", [True, False])
+    def test_bulk_insert_all_field_with_numpy(self, auto_id, dim, entities, enable_dynamic_field, enable_partition_key, include_meta, nullable):
        """
        collection schema 1: [pk, int64, float64, string float_vector]
        data file: vectors.npy and uid.npy,
@@ -905,13 +918,15 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
        """
         if enable_dynamic_field is False and include_meta is True:
             pytest.skip("include_meta only works with enable_dynamic_field")
+        if nullable is True:
+            pytest.skip("issue #36241")
         float_vec_field_dim = dim
         binary_vec_field_dim = ((dim+random.randint(-16, 32)) // 8) * 8
         bf16_vec_field_dim = dim+random.randint(-16, 32)
         fp16_vec_field_dim = dim+random.randint(-16, 32)
         fields = [
             cf.gen_int64_field(name=df.pk_field, is_primary=True, auto_id=auto_id),
-            cf.gen_int64_field(name=df.int_field),
+            cf.gen_int64_field(name=df.int_field, nullable=nullable),
             cf.gen_float_field(name=df.float_field),
             cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key),
             cf.gen_json_field(name=df.json_field),
@@ -1037,7 +1052,9 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
     @pytest.mark.parametrize("enable_dynamic_field", [True, False])
     @pytest.mark.parametrize("enable_partition_key", [True, False])
     @pytest.mark.parametrize("include_meta", [True, False])
-    def test_bulk_insert_all_field_with_parquet(self, auto_id, dim, entities, enable_dynamic_field, enable_partition_key, include_meta):
+    @pytest.mark.parametrize("nullable", [True, False])
+    def test_bulk_insert_all_field_with_parquet(self, auto_id, dim, entities, enable_dynamic_field,
+                                                enable_partition_key, include_meta, nullable):
        """
        collection schema 1: [pk, int64, float64, string float_vector]
        data file: vectors.parquet and uid.parquet,
@@ -1048,20 +1065,24 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
        """
         if enable_dynamic_field is False and include_meta is True:
             pytest.skip("include_meta only works with enable_dynamic_field")
+        if nullable is True:
+            pytest.skip("issue #36252")
+        if enable_partition_key is True and nullable is True:
+            pytest.skip("partition key field not support nullable")
         float_vec_field_dim = dim
         binary_vec_field_dim = ((dim+random.randint(-16, 32)) // 8) * 8
         bf16_vec_field_dim = dim+random.randint(-16, 32)
         fp16_vec_field_dim = dim+random.randint(-16, 32)
         fields = [
             cf.gen_int64_field(name=df.pk_field, is_primary=True, auto_id=auto_id),
-            cf.gen_int64_field(name=df.int_field),
-            cf.gen_float_field(name=df.float_field),
-            cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key),
-            cf.gen_json_field(name=df.json_field),
-            cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64),
-            cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT),
-            cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100),
-            cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL),
+            cf.gen_int64_field(name=df.int_field, nullable=nullable),
+            cf.gen_float_field(name=df.float_field, nullable=nullable),
+            cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key, nullable=nullable),
+            cf.gen_json_field(name=df.json_field, nullable=nullable),
+            cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64, nullable=nullable),
+            cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),
+            cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100, nullable=nullable),
+            cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL, nullable=nullable),
             cf.gen_float_vec_field(name=df.float_vec_field, dim=float_vec_field_dim),
             cf.gen_binary_vec_field(name=df.binary_vec_field, dim=binary_vec_field_dim),
             cf.gen_bfloat16_vec_field(name=df.bf16_vec_field, dim=bf16_vec_field_dim),

View File

@@ -3804,7 +3804,8 @@ class TestCollectionSearch(TestcaseBase):
                                                       enable_dynamic_field=enable_dynamic_field)[:2]
         # search with output field vector
-        output_fields = [default_float_field_name, default_string_field_name, default_search_field]
+        output_fields = [default_float_field_name, default_string_field_name,
+                         default_json_field_name, default_search_field]
         original_entities = []
         if enable_dynamic_field:
             entities = []
@@ -3812,6 +3813,7 @@ class TestCollectionSearch(TestcaseBase):
                 entities.append({default_int64_field_name: vector[default_int64_field_name],
                                  default_float_field_name: vector[default_float_field_name],
                                  default_string_field_name: vector[default_string_field_name],
+                                 default_json_field_name: vector[default_json_field_name],
                                  default_search_field: vector[default_search_field]})
             original_entities.append(pd.DataFrame(entities))
         else:
@@ -3824,6 +3826,15 @@ class TestCollectionSearch(TestcaseBase):
                                          "limit": default_limit,
                                          "original_entities": original_entities,
                                          "output_fields": output_fields})
+        if enable_dynamic_field:
+            collection_w.search(vectors[:1], default_search_field,
+                                default_search_params, default_limit, default_search_exp,
+                                output_fields=["$meta", default_search_field],
+                                check_task=CheckTasks.check_search_results,
+                                check_items={"nq": 1,
+                                             "limit": default_limit,
+                                             "original_entities": original_entities,
+                                             "output_fields": output_fields})
 
     @pytest.mark.tags(CaseLabel.L2)
     def test_search_output_vector_field_and_pk_field(self, enable_dynamic_field):
@@ -13432,6 +13443,7 @@ class TestCollectionSearchNoneAndDefaultData(TestcaseBase):
                                 check_items={"batch_size": batch_size})
 
     @pytest.mark.tags(CaseLabel.L1)
+    @pytest.mark.skip(reason="issue #36213")
     def test_search_normal_none_data_partition_key(self, is_flush, enable_dynamic_field, vector_data_type, null_data_percent):
        """
        target: test search normal case with none data inserted