test: add bf/f16 bulk insert testcase (#32506)

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
pull/32346/head^2
zhuwenxing 2024-04-23 14:05:23 +08:00 committed by GitHub
parent fef7812254
commit a5f0fc4373
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 100 additions and 14 deletions

View File

@ -4,6 +4,7 @@ import os
import time import time
import numpy as np import numpy as np
import jax.numpy as jnp
import pandas as pd import pandas as pd
import random import random
from faker import Faker from faker import Faker
@ -25,6 +26,8 @@ class DataField:
image_float_vec_field = "image_float_vec_field" image_float_vec_field = "image_float_vec_field"
text_float_vec_field = "text_float_vec_field" text_float_vec_field = "text_float_vec_field"
binary_vec_field = "binary_vec_field" binary_vec_field = "binary_vec_field"
bf16_vec_field = "bf16_vec_field"
fp16_vec_field = "fp16_vec_field"
int_field = "int_scalar" int_field = "int_scalar"
string_field = "string_scalar" string_field = "string_scalar"
bool_field = "bool_scalar" bool_field = "bool_scalar"
@ -90,6 +93,42 @@ def gen_binary_vectors(nb, dim):
return vectors return vectors
def gen_fp16_vectors(num, dim):
"""
generate float16 vector data
raw_vectors : the vectors
fp16_vectors: the bytes used for insert
return: raw_vectors and fp16_vectors
"""
raw_vectors = []
fp16_vectors = []
for _ in range(num):
raw_vector = [random.random() for _ in range(dim)]
raw_vectors.append(raw_vector)
fp16_vector = np.array(raw_vector, dtype=np.float16).view(np.uint8).tolist()
fp16_vectors.append(fp16_vector)
return raw_vectors, fp16_vectors
def gen_bf16_vectors(num, dim):
"""
generate brain float16 vector data
raw_vectors : the vectors
bf16_vectors: the bytes used for insert
return: raw_vectors and bf16_vectors
"""
raw_vectors = []
bf16_vectors = []
for _ in range(num):
raw_vector = [random.random() for _ in range(dim)]
raw_vectors.append(raw_vector)
bf16_vector = np.array(jnp.array(raw_vector, dtype=jnp.bfloat16)).view(np.uint8).tolist()
bf16_vectors.append(bf16_vector)
return raw_vectors, bf16_vectors
def gen_row_based_json_file(row_file, str_pk, data_fields, float_vect, def gen_row_based_json_file(row_file, str_pk, data_fields, float_vect,
rows, dim, start_uid=0, err_type="", enable_dynamic_field=False, **kwargs): rows, dim, start_uid=0, err_type="", enable_dynamic_field=False, **kwargs):
@ -311,7 +350,7 @@ def gen_column_base_json_file(col_file, str_pk, data_fields, float_vect,
f.write("\n") f.write("\n")
def gen_vectors_in_numpy_file(dir, data_field, float_vector, rows, dim, force=False): def gen_vectors_in_numpy_file(dir, data_field, float_vector, rows, dim, vector_type="float32", force=False):
file_name = f"{data_field}.npy" file_name = f"{data_field}.npy"
file = f'{dir}/{file_name}' file = f'{dir}/{file_name}'
@ -319,9 +358,18 @@ def gen_vectors_in_numpy_file(dir, data_field, float_vector, rows, dim, force=Fa
# vector columns # vector columns
vectors = [] vectors = []
if rows > 0: if rows > 0:
if float_vector: if vector_type == "float32":
vectors = gen_float_vectors(rows, dim) vectors = gen_float_vectors(rows, dim)
arr = np.array(vectors) arr = np.array(vectors)
elif vector_type == "fp16":
vectors = gen_fp16_vectors(rows, dim)[1]
arr = np.array(vectors, dtype=np.dtype("uint8"))
elif vector_type == "bf16":
vectors = gen_bf16_vectors(rows, dim)[1]
arr = np.array(vectors, dtype=np.dtype("uint8"))
elif vector_type == "binary":
vectors = gen_binary_vectors(rows, (dim // 8))
arr = np.array(vectors, dtype=np.dtype("uint8"))
else: else:
vectors = gen_binary_vectors(rows, (dim // 8)) vectors = gen_binary_vectors(rows, (dim // 8))
arr = np.array(vectors, dtype=np.dtype("uint8")) arr = np.array(vectors, dtype=np.dtype("uint8"))
@ -429,6 +477,12 @@ def gen_data_by_data_field(data_field, rows, start=0, float_vector=True, dim=128
if "float" in data_field: if "float" in data_field:
data = gen_vectors(float_vector=True, rows=rows, dim=dim) data = gen_vectors(float_vector=True, rows=rows, dim=dim)
data = pd.Series([np.array(x, dtype=np.dtype("float32")) for x in data]) data = pd.Series([np.array(x, dtype=np.dtype("float32")) for x in data])
elif "fp16" in data_field:
data = gen_fp16_vectors(rows, dim)[1]
data = pd.Series([np.array(x, dtype=np.dtype("uint8")) for x in data])
elif "bf16" in data_field:
data = gen_bf16_vectors(rows, dim)[1]
data = pd.Series([np.array(x, dtype=np.dtype("uint8")) for x in data])
elif "binary" in data_field: elif "binary" in data_field:
data = gen_vectors(float_vector=False, rows=rows, dim=dim) data = gen_vectors(float_vector=False, rows=rows, dim=dim)
data = pd.Series([np.array(x, dtype=np.dtype("uint8")) for x in data]) data = pd.Series([np.array(x, dtype=np.dtype("uint8")) for x in data])
@ -544,9 +598,14 @@ def gen_dict_data_by_data_field(data_fields, rows, start=0, float_vector=True, d
if "vec" in data_field: if "vec" in data_field:
if "float" in data_field: if "float" in data_field:
float_vector = True float_vector = True
d[data_field] = gen_vectors(float_vector=float_vector, rows=1, dim=dim)[0]
if "binary" in data_field: if "binary" in data_field:
float_vector = False float_vector = False
d[data_field] = gen_vectors(float_vector=float_vector, rows=1, dim=dim)[0] d[data_field] = gen_vectors(float_vector=float_vector, rows=1, dim=dim)[0]
if "bf16" in data_field:
d[data_field] = gen_bf16_vectors(1, dim)[1][0]
if "fp16" in data_field:
d[data_field] = gen_fp16_vectors(1, dim)[1][0]
elif data_field == DataField.float_field: elif data_field == DataField.float_field:
d[data_field] = random.random() d[data_field] = random.random()
elif data_field == DataField.double_field: elif data_field == DataField.double_field:
@ -623,12 +682,21 @@ def gen_npy_files(float_vector, rows, dim, data_fields, file_size=None, file_num
# gen the numpy file without subfolders if only one set of files # gen the numpy file without subfolders if only one set of files
for data_field in data_fields: for data_field in data_fields:
if "vec" in data_field: if "vec" in data_field:
vector_type = "float32"
if "float" in data_field: if "float" in data_field:
float_vector = True float_vector = True
vector_type = "float32"
if "binary" in data_field: if "binary" in data_field:
float_vector = False float_vector = False
vector_type = "binary"
if "bf16" in data_field:
float_vector = True
vector_type = "bf16"
if "fp16" in data_field:
float_vector = True
vector_type = "fp16"
file_name = gen_vectors_in_numpy_file(dir=data_source, data_field=data_field, float_vector=float_vector, file_name = gen_vectors_in_numpy_file(dir=data_source, data_field=data_field, float_vector=float_vector,
rows=rows, dim=dim, force=force) vector_type=vector_type, rows=rows, dim=dim, force=force)
elif data_field == DataField.string_field: # string field for numpy not supported yet at 2022-10-17 elif data_field == DataField.string_field: # string field for numpy not supported yet at 2022-10-17
file_name = gen_string_in_numpy_file(dir=data_source, data_field=data_field, rows=rows, force=force) file_name = gen_string_in_numpy_file(dir=data_source, data_field=data_field, rows=rows, force=force)
elif data_field == DataField.bool_field: elif data_field == DataField.bool_field:

View File

@ -848,9 +848,11 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100), cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100),
cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL), cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL),
cf.gen_float_vec_field(name=df.float_vec_field, dim=dim), cf.gen_float_vec_field(name=df.float_vec_field, dim=dim),
cf.gen_float_vec_field(name=df.image_float_vec_field, dim=dim), # cf.gen_float_vec_field(name=df.image_float_vec_field, dim=dim),
cf.gen_float_vec_field(name=df.text_float_vec_field, dim=dim), # cf.gen_float_vec_field(name=df.text_float_vec_field, dim=dim),
cf.gen_binary_vec_field(name=df.binary_vec_field, dim=dim) cf.gen_binary_vec_field(name=df.binary_vec_field, dim=dim),
cf.gen_bfloat16_vec_field(name=df.bf16_vec_field, dim=dim),
cf.gen_float16_vec_field(name=df.fp16_vec_field, dim=dim)
] ]
data_fields = [f.name for f in fields if not f.to_dict().get("auto_id", False)] data_fields = [f.name for f in fields if not f.to_dict().get("auto_id", False)]
files = prepare_bulk_insert_new_json_files( files = prepare_bulk_insert_new_json_files(
@ -890,6 +892,10 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
self.collection_wrap.create_index( self.collection_wrap.create_index(
field_name=f, index_params=index_params field_name=f, index_params=index_params
) )
for f in [df.bf16_vec_field, df.fp16_vec_field]:
self.collection_wrap.create_index(
field_name=f, index_params={"index_type": "FLAT", "metric_type": "COSINE"}
)
for f in binary_vec_fields: for f in binary_vec_fields:
self.collection_wrap.create_index( self.collection_wrap.create_index(
field_name=f, index_params=ct.default_binary_index field_name=f, index_params=ct.default_binary_index
@ -964,9 +970,11 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
cf.gen_string_field(name=df.string_field), cf.gen_string_field(name=df.string_field),
cf.gen_json_field(name=df.json_field), cf.gen_json_field(name=df.json_field),
cf.gen_float_vec_field(name=df.float_vec_field, dim=dim), cf.gen_float_vec_field(name=df.float_vec_field, dim=dim),
cf.gen_float_vec_field(name=df.image_float_vec_field, dim=dim), # cf.gen_float_vec_field(name=df.image_float_vec_field, dim=dim),
cf.gen_float_vec_field(name=df.text_float_vec_field, dim=dim), # cf.gen_float_vec_field(name=df.text_float_vec_field, dim=dim),
cf.gen_binary_vec_field(name=df.binary_vec_field, dim=dim) cf.gen_binary_vec_field(name=df.binary_vec_field, dim=dim),
cf.gen_bfloat16_vec_field(name=df.bf16_vec_field, dim=dim),
cf.gen_float16_vec_field(name=df.fp16_vec_field, dim=dim)
] ]
data_fields = [f.name for f in fields if not f.to_dict().get("auto_id", False)] data_fields = [f.name for f in fields if not f.to_dict().get("auto_id", False)]
files = prepare_bulk_insert_numpy_files( files = prepare_bulk_insert_numpy_files(
@ -1006,6 +1014,10 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
self.collection_wrap.create_index( self.collection_wrap.create_index(
field_name=f, index_params=index_params field_name=f, index_params=index_params
) )
for f in [df.bf16_vec_field, df.fp16_vec_field]:
self.collection_wrap.create_index(
field_name=f, index_params={"index_type": "FLAT", "metric_type": "COSINE"}
)
for f in binary_vec_fields: for f in binary_vec_fields:
self.collection_wrap.create_index( self.collection_wrap.create_index(
field_name=f, index_params=ct.default_binary_index field_name=f, index_params=ct.default_binary_index
@ -1083,11 +1095,13 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100), cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100),
cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL), cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL),
cf.gen_float_vec_field(name=df.float_vec_field, dim=dim), cf.gen_float_vec_field(name=df.float_vec_field, dim=dim),
cf.gen_float_vec_field(name=df.image_float_vec_field, dim=dim), # cf.gen_float_vec_field(name=df.image_float_vec_field, dim=dim),
cf.gen_float_vec_field(name=df.text_float_vec_field, dim=dim), # cf.gen_float_vec_field(name=df.text_float_vec_field, dim=dim),
cf.gen_binary_vec_field(name=df.binary_vec_field, dim=dim) cf.gen_binary_vec_field(name=df.binary_vec_field, dim=dim),
cf.gen_bfloat16_vec_field(name=df.bf16_vec_field, dim=dim),
cf.gen_float16_vec_field(name=df.fp16_vec_field, dim=dim)
] ]
data_fields = [f.name for f in fields if not f.to_dict().get("auto_id", False)] data_fields = [f.name for f in fields if not f.to_dict().get("auto_id", False)]
files = prepare_bulk_insert_parquet_files( files = prepare_bulk_insert_parquet_files(
minio_endpoint=self.minio_endpoint, minio_endpoint=self.minio_endpoint,
bucket_name=self.bucket_name, bucket_name=self.bucket_name,
@ -1125,6 +1139,10 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
self.collection_wrap.create_index( self.collection_wrap.create_index(
field_name=f, index_params=index_params field_name=f, index_params=index_params
) )
for f in [df.bf16_vec_field, df.fp16_vec_field]:
self.collection_wrap.create_index(
field_name=f, index_params={"index_type": "FLAT", "metric_type": "COSINE"}
)
for f in binary_vec_fields: for f in binary_vec_fields:
self.collection_wrap.create_index( self.collection_wrap.create_index(
field_name=f, index_params=ct.default_binary_index field_name=f, index_params=ct.default_binary_index