mirror of https://github.com/milvus-io/milvus.git
test: Add bulk insert related test cases for default and null support (#36219)
issue: #36129
Signed-off-by: binbin lv <binbin.lv@zilliz.com>
Branch: pull/36266/head

parent 23b95aeba3
commit 5ca4d5977a
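
The patch threads a nullable flag from the collection schema into the bulk-insert data generators, so the JSON, numpy, and parquet test files can carry missing values, and the bulk-insert tests add a nullable axis to their parametrization. A minimal standalone sketch of that pattern follows (illustrative names, not code from the patch):

    import random

    def gen_column(data_field, rows, schema=None):
        # Read the field's "nullable" flag from a schema dict and emit None
        # values instead of random data when it is set.
        nullable = False
        if schema is not None:
            for field in schema.get("fields", []):
                if field["name"] == data_field:
                    nullable = field.get("nullable", False)
        if nullable:
            return [None for _ in range(rows)]
        return [random.randint(-999999, 9999999) for _ in range(rows)]

    schema = {"fields": [{"name": "int_scalar", "nullable": True}]}
    print(gen_column("int_scalar", 3, schema))  # [None, None, None]
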
@@ -323,6 +323,7 @@ class ResponseChecker:
         for hits in search_res:
             searched_original_vectors = []
             ids = []
+            vector_id = 0
             if enable_milvus_client_api:
                 for hit in hits:
                     ids.append(hit['id'])
@@ -349,12 +350,13 @@ class ResponseChecker:
                     raise Exception("inserted vectors are needed for distance check")
                 for id in hits.ids:
                     searched_original_vectors.append(check_items["original_vectors"][id])
-                cf.compare_distance_vector_and_vector_list(check_items["vector_nq"][i],
+                cf.compare_distance_vector_and_vector_list(check_items["vector_nq"][vector_id],
                                                            searched_original_vectors,
                                                            check_items["metric"], hits.distances)
                 log.info("search_results_check: Checked the distances for one nq: OK")
             else:
                 pass # just check nq and topk, not specific ids need check
+            vector_id += 1
         log.info("search_results_check: limit (topK) and "
                  "ids searched for %d queries are correct" % len(search_res))
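
The two ResponseChecker hunks above replace the previous `i` index into check_items["vector_nq"] with a `vector_id` counter that is explicitly advanced once per result set, keeping each hit list paired with the query vector it was searched with. A hedged sketch of that bookkeeping (the Hits stand-in below is illustrative, not the pymilvus type):

    from collections import namedtuple

    Hits = namedtuple("Hits", ["ids", "distances"])  # stand-in for one per-query result set

    def check_distances(search_res, query_vectors, original_vectors, compare):
        vector_id = 0
        for hits in search_res:
            # look up the inserted vectors for the returned ids of this query
            searched = [original_vectors[_id] for _id in hits.ids]
            # compare against the query vector that produced this result set
            compare(query_vectors[vector_id], searched, hits.distances)
            vector_id += 1
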
@@ -14,6 +14,7 @@ from sklearn import preprocessing
 from common.common_func import gen_unique_str
 from common.minio_comm import copy_files_to_minio
 from utils.util_log import test_log as log
+import pyarrow as pa

 data_source = "/tmp/bulk_insert_data"
 fake = Faker()
@@ -444,7 +445,7 @@ def gen_json_in_numpy_file(dir, data_field, rows, start=0, force=False):
     return file_name


-def gen_int_or_float_in_numpy_file(dir, data_field, rows, start=0, force=False):
+def gen_int_or_float_in_numpy_file(dir, data_field, rows, start=0, force=False, nullable=False):
     file_name = f"{data_field}.npy"
     file = f"{dir}/{file_name}"
     if not os.path.exists(file) or force:
@@ -459,7 +460,10 @@ def gen_int_or_float_in_numpy_file(dir, data_field, rows, start=0, force=False):
         elif data_field == DataField.pk_field:
             data = [i for i in range(start, start + rows)]
         elif data_field == DataField.int_field:
-            data = [random.randint(-999999, 9999999) for _ in range(rows)]
+            if not nullable:
+                data = [random.randint(-999999, 9999999) for _ in range(rows)]
+            else:
+                data = [None for _ in range(rows)]
         arr = np.array(data)
         log.info(f"file_name: {file_name} data type: {arr.dtype} data shape: {arr.shape}")
         np.save(file, arr)
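
With nullable=True the hunk above makes gen_int_or_float_in_numpy_file save a column of None values. A hedged usage sketch of what that branch produces (file path illustrative): numpy stores the Nones as an object-dtype array, and reading it back requires allow_pickle=True.

    import numpy as np

    data = [None for _ in range(3)]
    arr = np.array(data)
    print(arr.dtype)                        # object
    np.save("/tmp/int_scalar.npy", arr)     # illustrative path
    loaded = np.load("/tmp/int_scalar.npy", allow_pickle=True)
    print(loaded.tolist())                  # [None, None, None]
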
@@ -496,11 +500,14 @@ def gen_data_by_data_field(data_field, rows, start=0, float_vector=True, dim=128
         array_length = random.randint(0, 10)
     schema = kwargs.get("schema", None)
     schema = schema.to_dict() if schema is not None else None
+    nullable = False
     if schema is not None:
         fields = schema.get("fields", [])
         for field in fields:
-            if data_field == field["name"] and "params" in field:
-                dim = field["params"].get("dim", dim)
+            if data_field == field["name"]:
+                if "params" in field:
+                    dim = field["params"].get("dim", dim)
+                nullable = field.get("nullable", False)
     data = []
     if rows > 0:
         if "vec" in data_field:
@@ -522,37 +529,75 @@ def gen_data_by_data_field(data_field, rows, start=0, float_vector=True, dim=128
             else:
                 data = gen_vectors(float_vector=float_vector, rows=rows, dim=dim)
         elif data_field == DataField.float_field:
-            data = [np.float32(random.random()) for _ in range(rows)]
+            if not nullable:
+                data = [np.float32(random.random()) for _ in range(rows)]
+            else:
+                data = [None for _ in range(rows)]
         elif data_field == DataField.double_field:
-            data = [np.float64(random.random()) for _ in range(rows)]
+            if not nullable:
+                data = [np.float64(random.random()) for _ in range(rows)]
+            else:
+                data = [None for _ in range(rows)]
         elif data_field == DataField.pk_field:
-            data = [np.int64(i) for i in range(start, start + rows)]
+            if not nullable:
+                data = [np.int64(i) for i in range(start, start + rows)]
+            else:
+                data = [None for _ in range(start, start + rows)]
         elif data_field == DataField.int_field:
-            data = [np.int64(random.randint(-999999, 9999999)) for _ in range(rows)]
+            if not nullable:
+                data = [np.int64(random.randint(-999999, 9999999)) for _ in range(rows)]
+            else:
+                data = [None for _ in range(rows)]
         elif data_field == DataField.string_field:
-            data = [gen_unique_str(str(i)) for i in range(start, rows + start)]
+            if not nullable:
+                data = [gen_unique_str(str(i)) for i in range(start, rows + start)]
+            else:
+                data = [None for _ in range(start, rows + start)]
         elif data_field == DataField.bool_field:
-            data = [random.choice([True, False]) for i in range(start, rows + start)]
+            if not nullable:
+                data = [random.choice([True, False]) for i in range(start, rows + start)]
+            else:
+                data = [None for _ in range(start, rows + start)]
         elif data_field == DataField.json_field:
-            data = pd.Series([json.dumps({
-                gen_unique_str(): random.randint(-999999, 9999999)
-            }) for i in range(start, rows + start)], dtype=np.dtype("str"))
+            if not nullable:
+                data = pd.Series([json.dumps({
+                    gen_unique_str(): random.randint(-999999, 9999999)
+                }) for i in range(start, rows + start)], dtype=np.dtype("str"))
+            else:
+                data = pd.Series([json.dumps({
+                    gen_unique_str(): None}) for _ in range(start, rows + start)])
         elif data_field == DataField.array_bool_field:
-            data = pd.Series(
+            if not nullable:
+                data = pd.Series(
                 [np.array([random.choice([True, False]) for _ in range(array_length)], dtype=np.dtype("bool"))
                  for i in range(start, rows + start)])
+            else:
+                data = pd.Series(
+                    [np.array(None) for i in range(start, rows + start)])
         elif data_field == DataField.array_int_field:
-            data = pd.Series(
+            if not nullable:
+                data = pd.Series(
                 [np.array([random.randint(-999999, 9999999) for _ in range(array_length)], dtype=np.dtype("int64"))
                  for i in range(start, rows + start)])
+            else:
+                data = pd.Series(
+                    [np.array(None) for i in range(start, rows + start)])
         elif data_field == DataField.array_float_field:
-            data = pd.Series(
+            if not nullable:
+                data = pd.Series(
                 [np.array([random.random() for _ in range(array_length)], dtype=np.dtype("float32"))
                  for i in range(start, rows + start)])
+            else:
+                data = pd.Series(
+                    [np.array(None) for i in range(start, rows + start)])
         elif data_field == DataField.array_string_field:
-            data = pd.Series(
+            if not nullable:
+                data = pd.Series(
                 [np.array([gen_unique_str(str(i)) for _ in range(array_length)], dtype=np.dtype("str"))
                  for i in range(start, rows + start)])
+            else:
+                data = pd.Series(
+                    [np.array(None) for i in range(start, rows + start)])
     return data

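
For the JSON column, gen_data_by_data_field now emits objects whose values are null when the field is nullable. A short hedged sketch of the two shapes that branch generates (key names illustrative, not the gen_unique_str() keys used by the patch):

    import json
    import pandas as pd

    rows = 3
    filled = pd.Series([json.dumps({f"k{i}": i}) for i in range(rows)])
    nulled = pd.Series([json.dumps({f"k{i}": None}) for i in range(rows)])
    print(filled.tolist())  # ['{"k0": 0}', '{"k1": 1}', '{"k2": 2}']
    print(nulled.tolist())  # ['{"k0": null}', '{"k1": null}', '{"k2": null}']
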
@@ -627,14 +672,18 @@ def gen_dict_data_by_data_field(data_fields, rows, start=0, float_vector=True, d
     schema = kwargs.get("schema", None)
     schema = schema.to_dict() if schema is not None else None
     data = []
+    nullable = False
     for r in range(rows):
         d = {}
         for data_field in data_fields:
+            d[data_field] = None
             if schema is not None:
                 fields = schema.get("fields", [])
                 for field in fields:
-                    if data_field == field["name"] and "params" in field:
-                        dim = field["params"].get("dim", dim)
+                    if data_field == field["name"]:
+                        if "params" in field:
+                            dim = field["params"].get("dim", dim)
+                        nullable = field.get("nullable", False)

             if "vec" in data_field:
                 if "float" in data_field:
@@ -651,31 +700,52 @@ def gen_dict_data_by_data_field(data_fields, rows, start=0, float_vector=True, d
                 if "fp16" in data_field:
                     d[data_field] = gen_fp16_vectors(1, dim, True)[1][0]
             elif data_field == DataField.float_field:
-                d[data_field] = random.random()
+                if not nullable:
+                    d[data_field] = random.random()
             elif data_field == DataField.double_field:
-                d[data_field] = random.random()
+                if not nullable:
+                    d[data_field] = random.random()
             elif data_field == DataField.pk_field:
-                d[data_field] = r+start
+                if not nullable:
+                    d[data_field] = r+start
             elif data_field == DataField.int_field:
-                d[data_field] =random.randint(-999999, 9999999)
+                if not nullable:
+                    d[data_field] = random.randint(-999999, 9999999)
             elif data_field == DataField.string_field:
-                d[data_field] = gen_unique_str(str(r + start))
+                if not nullable:
+                    d[data_field] = gen_unique_str(str(r + start))
            elif data_field == DataField.bool_field:
-                d[data_field] = random.choice([True, False])
+                if not nullable:
+                    d[data_field] = random.choice([True, False])
             elif data_field == DataField.json_field:
-                d[data_field] = {str(r+start): r+start}
+                if not nullable:
+                    d[data_field] = {str(r+start): r+start}
+                else:
+                    d[data_field] = {str(r + start): None}
             elif data_field == DataField.array_bool_field:
                 array_length = random.randint(0, 10) if array_length is None else array_length
-                d[data_field] = [random.choice([True, False]) for _ in range(array_length)]
+                if not nullable:
+                    d[data_field] = [random.choice([True, False]) for _ in range(array_length)]
+                else:
+                    d[data_field] = None
             elif data_field == DataField.array_int_field:
                 array_length = random.randint(0, 10) if array_length is None else array_length
-                d[data_field] = [random.randint(-999999, 9999999) for _ in range(array_length)]
+                if not nullable:
+                    d[data_field] = [random.randint(-999999, 9999999) for _ in range(array_length)]
+                else:
+                    d[data_field] = None
             elif data_field == DataField.array_float_field:
                 array_length = random.randint(0, 10) if array_length is None else array_length
-                d[data_field] = [random.random() for _ in range(array_length)]
+                if not nullable:
+                    d[data_field] = [random.random() for _ in range(array_length)]
+                else:
+                    d[data_field] = None
             elif data_field == DataField.array_string_field:
                 array_length = random.randint(0, 10) if array_length is None else array_length
-                d[data_field] = [gen_unique_str(str(i)) for i in range(array_length)]
+                if not nullable:
+                    d[data_field] = [gen_unique_str(str(i)) for i in range(array_length)]
+                else:
+                    d[data_field] = None
             if enable_dynamic_field:
                 d[str(r+start)] = r+start
                 d["name"] = fake.name()
@@ -685,7 +755,8 @@ def gen_dict_data_by_data_field(data_fields, rows, start=0, float_vector=True, d
     return data


-def gen_new_json_files(float_vector, rows, dim, data_fields, file_nums=1, array_length=None, file_size=None, err_type="", enable_dynamic_field=False, **kwargs):
+def gen_new_json_files(float_vector, rows, dim, data_fields, file_nums=1, array_length=None, file_size=None,
+                       err_type="", enable_dynamic_field=False, **kwargs):
     schema = kwargs.get("schema", None)
     dir_prefix = f"json-{uuid.uuid4()}"
     data_source_new = f"{data_source}/{dir_prefix}"
@@ -703,7 +774,9 @@ def gen_new_json_files(float_vector, rows, dim, data_fields, file_nums=1, array_
         file_name = f"data-fields-{len(data_fields)}-rows-{rows}-dim-{dim}-file-num-{i}-{int(time.time())}.json"
         file = f"{data_source_new}/{file_name}"
         Path(file).parent.mkdir(parents=True, exist_ok=True)
-        data = gen_dict_data_by_data_field(data_fields=data_fields, rows=rows, start=start_uid, float_vector=float_vector, dim=dim, array_length=array_length, enable_dynamic_field=enable_dynamic_field, **kwargs)
+        data = gen_dict_data_by_data_field(data_fields=data_fields, rows=rows, start=start_uid,
+                                           float_vector=float_vector, dim=dim, array_length=array_length,
+                                           enable_dynamic_field=enable_dynamic_field, **kwargs)
         # log.info(f"data: {data}")
         with open(file, "w") as f:
             json.dump(data, f)
@@ -742,14 +815,17 @@ def gen_npy_files(float_vector, rows, dim, data_fields, file_size=None, file_num
             json.dump(schema, f)
     files = []
     start_uid = 0
+    nullable = False
     if file_nums == 1:
         # gen the numpy file without subfolders if only one set of files
         for data_field in data_fields:
             if schema is not None:
                 fields = schema.get("fields", [])
                 for field in fields:
-                    if data_field == field["name"] and "params" in field:
-                        dim = field["params"].get("dim", dim)
+                    if data_field == field["name"]:
+                        if "params" in field:
+                            dim = field["params"].get("dim", dim)
+                        nullable = field.get("nullable", False)
             if "vec" in data_field:
                 vector_type = "float32"
                 if "float" in data_field:
@@ -775,7 +851,7 @@ def gen_npy_files(float_vector, rows, dim, data_fields, file_size=None, file_num
                 file_name = gen_json_in_numpy_file(dir=data_source_new, data_field=data_field, rows=rows, force=force)
             else:
                 file_name = gen_int_or_float_in_numpy_file(dir=data_source_new, data_field=data_field,
-                                                           rows=rows, force=force)
+                                                           rows=rows, force=force, nullable=nullable)
             files.append(file_name)
         if enable_dynamic_field and include_meta:
             file_name = gen_dynamic_field_in_numpy_file(dir=data_source_new, rows=rows, force=force)
@@ -827,7 +903,9 @@ def gen_dynamic_field_data_in_parquet_file(rows, start=0):
     return data


-def gen_parquet_files(float_vector, rows, dim, data_fields, file_size=None, row_group_size=None, file_nums=1, array_length=None, err_type="", enable_dynamic_field=False, include_meta=True, sparse_format="doc", **kwargs):
+def gen_parquet_files(float_vector, rows, dim, data_fields, file_size=None, row_group_size=None, file_nums=1,
+                      array_length=None, err_type="", enable_dynamic_field=False, include_meta=True,
+                      sparse_format="doc", **kwargs):
     schema = kwargs.get("schema", None)
     u_id = f"parquet-{uuid.uuid4()}"
     data_source_new = f"{data_source}/{u_id}"
@@ -850,7 +928,8 @@ def gen_parquet_files(float_vector, rows, dim, data_fields, file_size=None, row_
         all_field_data = {}
         for data_field in data_fields:
             data = gen_data_by_data_field(data_field=data_field, rows=rows, start=0,
-                                          float_vector=float_vector, dim=dim, array_length=array_length, sparse_format=sparse_format, **kwargs)
+                                          float_vector=float_vector, dim=dim, array_length=array_length,
+                                          sparse_format=sparse_format, **kwargs)
             all_field_data[data_field] = data
         if enable_dynamic_field and include_meta:
             all_field_data["$meta"] = gen_dynamic_field_data_in_parquet_file(rows=rows, start=0)
@@ -1023,8 +1102,10 @@ def prepare_bulk_insert_numpy_files(minio_endpoint="", bucket_name="milvus-bucke
     return files


-def prepare_bulk_insert_parquet_files(minio_endpoint="", bucket_name="milvus-bucket", rows=100, dim=128, array_length=None, file_size=None, row_group_size=None,
-                                      enable_dynamic_field=False, data_fields=[DataField.vec_field], float_vector=True, file_nums=1, force=False, include_meta=True, sparse_format="doc", **kwargs):
+def prepare_bulk_insert_parquet_files(minio_endpoint="", bucket_name="milvus-bucket", rows=100, dim=128, array_length=None,
+                                      file_size=None, row_group_size=None, enable_dynamic_field=False,
+                                      data_fields=[DataField.vec_field], float_vector=True, file_nums=1, force=False,
+                                      include_meta=True, sparse_format="doc", **kwargs):
     """
     Generate column based files based on params in parquet format and copy them to the minio
     Note: each field in data_fields would be generated one parquet file.
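
The data-generation module now imports pyarrow (as pa); parquet suits the nullable cases because nulls are first-class in Arrow. A hedged sketch, not taken from gen_parquet_files, of how a fully-null int64 column round-trips through a parquet file:

    import pyarrow as pa
    import pyarrow.parquet as pq

    table = pa.table({
        "uid": pa.array([0, 1, 2], type=pa.int64()),
        "int_scalar": pa.array([None, None, None], type=pa.int64()),  # nullable column
    })
    pq.write_table(table, "/tmp/nullable-demo.parquet")   # illustrative path
    back = pq.read_table("/tmp/nullable-demo.parquet")
    print(back.column("int_scalar").null_count)           # 3
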
@@ -748,7 +748,9 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
     @pytest.mark.parametrize("entities", [2000])
     @pytest.mark.parametrize("enable_dynamic_field", [True])
     @pytest.mark.parametrize("enable_partition_key", [True, False])
-    def test_bulk_insert_all_field_with_new_json_format(self, auto_id, dim, entities, enable_dynamic_field, enable_partition_key):
+    @pytest.mark.parametrize("nullable", [True, False])
+    def test_bulk_insert_all_field_with_new_json_format(self, auto_id, dim, entities, enable_dynamic_field,
+                                                        enable_partition_key, nullable):
         """
         collection schema 1: [pk, int64, float64, string float_vector]
         data file: vectors.npy and uid.npy,
@@ -757,20 +759,22 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
         2. import data
         3. verify
         """
+        if enable_partition_key is True and nullable is True:
+            pytest.skip("partition key field not support nullable")
         float_vec_field_dim = dim
         binary_vec_field_dim = ((dim+random.randint(-16, 32)) // 8) * 8
         bf16_vec_field_dim = dim+random.randint(-16, 32)
         fp16_vec_field_dim = dim+random.randint(-16, 32)
         fields = [
             cf.gen_int64_field(name=df.pk_field, is_primary=True, auto_id=auto_id),
-            cf.gen_int64_field(name=df.int_field),
-            cf.gen_float_field(name=df.float_field),
-            cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key),
-            cf.gen_json_field(name=df.json_field),
-            cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64),
-            cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT),
-            cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100),
-            cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL),
+            cf.gen_int64_field(name=df.int_field, nullable=nullable),
+            cf.gen_float_field(name=df.float_field, nullable=nullable),
+            cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key, nullable=nullable),
+            cf.gen_json_field(name=df.json_field, nullable=nullable),
+            cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64, nullable=nullable),
+            cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),
+            cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100, nullable=nullable),
+            cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL, nullable=nullable),
             cf.gen_float_vec_field(name=df.float_vec_field, dim=float_vec_field_dim),
             cf.gen_binary_vec_field(name=df.binary_vec_field, dim=binary_vec_field_dim),
             cf.gen_bfloat16_vec_field(name=df.bf16_vec_field, dim=bf16_vec_field_dim),
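
The new nullable axis doubles the parametrized matrix for these bulk-insert tests, and combinations the server does not support (for example a nullable partition-key field) are skipped at the top of the test rather than removed from the parametrize lists. A minimal pytest sketch of that pattern (the test body is a placeholder):

    import pytest

    @pytest.mark.parametrize("enable_partition_key", [True, False])
    @pytest.mark.parametrize("nullable", [True, False])
    def test_matrix(enable_partition_key, nullable):
        if enable_partition_key and nullable:
            pytest.skip("partition key field not support nullable")
        assert True  # placeholder body
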
@@ -878,10 +882,18 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
                 assert "name" in fields_from_search
                 assert "address" in fields_from_search
         # query data
-        res, _ = self.collection_wrap.query(expr=f"{df.string_field} >= '0'", output_fields=[df.string_field])
+        if not nullable:
+            expr_field = df.string_field
+            expr = f"{expr_field} >= '0'"
+        else:
+            expr_field = df.pk_field
+            expr = f"{expr_field} >= 0"
+
+        res, _ = self.collection_wrap.query(expr=f"{expr}", output_fields=[expr_field, df.int_field])
         assert len(res) == entities
-        query_data = [r[df.string_field] for r in res][:len(self.collection_wrap.partitions)]
-        res, _ = self.collection_wrap.query(expr=f"{df.string_field} in {query_data}", output_fields=[df.string_field])
+        log.info(res)
+        query_data = [r[expr_field] for r in res][:len(self.collection_wrap.partitions)]
+        res, _ = self.collection_wrap.query(expr=f"{expr_field} in {query_data}", output_fields=[expr_field])
         assert len(res) == len(query_data)
         if enable_partition_key:
             assert len(self.collection_wrap.partitions) > 1
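
The verification query above switches expressions based on nullable: a nullable varchar column may hold missing values, so the row-count check filters on the always-populated primary key instead. A hedged helper-style sketch of that selection (field names illustrative):

    def build_verify_expr(nullable, string_field="varchar_scalar", pk_field="uid"):
        if not nullable:
            return string_field, f"{string_field} >= '0'"
        return pk_field, f"{pk_field} >= 0"

    print(build_verify_expr(False))  # ('varchar_scalar', "varchar_scalar >= '0'")
    print(build_verify_expr(True))   # ('uid', 'uid >= 0')
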
@@ -893,7 +905,8 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
     @pytest.mark.parametrize("enable_dynamic_field", [True, False])
     @pytest.mark.parametrize("enable_partition_key", [True, False])
     @pytest.mark.parametrize("include_meta", [True, False])
-    def test_bulk_insert_all_field_with_numpy(self, auto_id, dim, entities, enable_dynamic_field, enable_partition_key, include_meta):
+    @pytest.mark.parametrize("nullable", [True, False])
+    def test_bulk_insert_all_field_with_numpy(self, auto_id, dim, entities, enable_dynamic_field, enable_partition_key, include_meta, nullable):
         """
         collection schema 1: [pk, int64, float64, string float_vector]
         data file: vectors.npy and uid.npy,
@@ -905,13 +918,15 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
         """
         if enable_dynamic_field is False and include_meta is True:
             pytest.skip("include_meta only works with enable_dynamic_field")
+        if nullable is True:
+            pytest.skip("issue #36241")
         float_vec_field_dim = dim
         binary_vec_field_dim = ((dim+random.randint(-16, 32)) // 8) * 8
         bf16_vec_field_dim = dim+random.randint(-16, 32)
         fp16_vec_field_dim = dim+random.randint(-16, 32)
         fields = [
             cf.gen_int64_field(name=df.pk_field, is_primary=True, auto_id=auto_id),
-            cf.gen_int64_field(name=df.int_field),
+            cf.gen_int64_field(name=df.int_field, nullable=nullable),
             cf.gen_float_field(name=df.float_field),
             cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key),
             cf.gen_json_field(name=df.json_field),
@@ -1037,7 +1052,9 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
     @pytest.mark.parametrize("enable_dynamic_field", [True, False])
     @pytest.mark.parametrize("enable_partition_key", [True, False])
     @pytest.mark.parametrize("include_meta", [True, False])
-    def test_bulk_insert_all_field_with_parquet(self, auto_id, dim, entities, enable_dynamic_field, enable_partition_key, include_meta):
+    @pytest.mark.parametrize("nullable", [True, False])
+    def test_bulk_insert_all_field_with_parquet(self, auto_id, dim, entities, enable_dynamic_field,
+                                                enable_partition_key, include_meta, nullable):
         """
         collection schema 1: [pk, int64, float64, string float_vector]
         data file: vectors.parquet and uid.parquet,
@@ -1048,20 +1065,24 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
         """
         if enable_dynamic_field is False and include_meta is True:
             pytest.skip("include_meta only works with enable_dynamic_field")
+        if nullable is True:
+            pytest.skip("issue #36252")
+        if enable_partition_key is True and nullable is True:
+            pytest.skip("partition key field not support nullable")
         float_vec_field_dim = dim
         binary_vec_field_dim = ((dim+random.randint(-16, 32)) // 8) * 8
         bf16_vec_field_dim = dim+random.randint(-16, 32)
         fp16_vec_field_dim = dim+random.randint(-16, 32)
         fields = [
             cf.gen_int64_field(name=df.pk_field, is_primary=True, auto_id=auto_id),
-            cf.gen_int64_field(name=df.int_field),
-            cf.gen_float_field(name=df.float_field),
-            cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key),
-            cf.gen_json_field(name=df.json_field),
-            cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64),
-            cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT),
-            cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100),
-            cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL),
+            cf.gen_int64_field(name=df.int_field, nullable=nullable),
+            cf.gen_float_field(name=df.float_field, nullable=nullable),
+            cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key, nullable=nullable),
+            cf.gen_json_field(name=df.json_field, nullable=nullable),
+            cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64, nullable=nullable),
+            cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),
+            cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100, nullable=nullable),
+            cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL, nullable=nullable),
             cf.gen_float_vec_field(name=df.float_vec_field, dim=float_vec_field_dim),
             cf.gen_binary_vec_field(name=df.binary_vec_field, dim=binary_vec_field_dim),
             cf.gen_bfloat16_vec_field(name=df.bf16_vec_field, dim=bf16_vec_field_dim),
@@ -3804,7 +3804,8 @@ class TestCollectionSearch(TestcaseBase):
                                                              enable_dynamic_field=enable_dynamic_field)[:2]

         # search with output field vector
-        output_fields = [default_float_field_name, default_string_field_name, default_search_field]
+        output_fields = [default_float_field_name, default_string_field_name,
+                         default_json_field_name, default_search_field]
         original_entities = []
         if enable_dynamic_field:
             entities = []
@@ -3812,6 +3813,7 @@ class TestCollectionSearch(TestcaseBase):
                 entities.append({default_int64_field_name: vector[default_int64_field_name],
                                  default_float_field_name: vector[default_float_field_name],
                                  default_string_field_name: vector[default_string_field_name],
+                                 default_json_field_name: vector[default_json_field_name],
                                  default_search_field: vector[default_search_field]})
             original_entities.append(pd.DataFrame(entities))
         else:
@@ -3824,6 +3826,15 @@ class TestCollectionSearch(TestcaseBase):
                                          "limit": default_limit,
                                          "original_entities": original_entities,
                                          "output_fields": output_fields})
+        if enable_dynamic_field:
+            collection_w.search(vectors[:1], default_search_field,
+                                default_search_params, default_limit, default_search_exp,
+                                output_fields=["$meta", default_search_field],
+                                check_task=CheckTasks.check_search_results,
+                                check_items={"nq": 1,
+                                             "limit": default_limit,
+                                             "original_entities": original_entities,
+                                             "output_fields": output_fields})

     @pytest.mark.tags(CaseLabel.L2)
     def test_search_output_vector_field_and_pk_field(self, enable_dynamic_field):
@@ -13432,6 +13443,7 @@ class TestCollectionSearchNoneAndDefaultData(TestcaseBase):
                             check_items={"batch_size": batch_size})

     @pytest.mark.tags(CaseLabel.L1)
+    @pytest.mark.skip(reason="issue #36213")
     def test_search_normal_none_data_partition_key(self, is_flush, enable_dynamic_field, vector_data_type, null_data_percent):
         """
         target: test search normal case with none data inserted