# mirror of https://github.com/milvus-io/milvus.git
import time
import os
from minio import Minio
from minio.error import S3Error
import numpy as np
import random
from sklearn import preprocessing
from common.common_func import gen_unique_str


minio = "10.96.1.23:9000"  # TODO update hardcode
bucket_name = "yanliang-bulk-load"  # TODO update hardcode

data_source = "/tmp/bulk_load_data"

BINARY = "binary"
FLOAT = "float"


def gen_file_prefix(row_based=True, auto_id=True, prefix=""):
    if row_based:
        if auto_id:
            return f"{prefix}row_auto"
        else:
            return f"{prefix}row_cust"
    else:
        if auto_id:
            return f"{prefix}col_auto"
        else:
            return f"{prefix}col_cust"


def entity_suffix(rows):
    if rows // 1000000 > 0:
        suffix = f"{rows // 1000000}m"
    elif rows // 1000 > 0:
        suffix = f"{rows // 1000}k"
    else:
        suffix = f"{rows}"
    return suffix


def gen_float_vectors(nb, dim):
    vectors = [[random.random() for _ in range(dim)] for _ in range(nb)]
    vectors = preprocessing.normalize(vectors, axis=1, norm='l2')
    return vectors.tolist()


def gen_binary_vectors(nb, dim):
    # binary: each int represents 8 dimensions,
    # so a 16-dimensional binary vector is stored as [x, y], where x and y are ints between 0 and 255
    vectors = [[random.randint(0, 255) for _ in range(dim)] for _ in range(nb)]
    return vectors
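

# A minimal illustrative sketch (not part of the original tooling): it shows how one packed
# vector from gen_binary_vectors maps back to individual bit-level dimensions via numpy.
def _unpack_binary_vector(packed):
    """Expand a list of 0-255 ints into the bit dimensions they encode (8 bits per int)."""
    return np.unpackbits(np.array(packed, dtype=np.uint8)).tolist()

# e.g. a 16-dimensional binary vector packed as [3, 128] expands to
# [0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]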


def gen_row_based_json_file(row_file, str_pk, multi_scalars, float_vect, rows, dim, autoid):
    with open(row_file, "w") as f:
        f.write("{")
        f.write("\n")
        f.write('"rows":[')
        f.write("\n")
        for i in range(rows):
            if i > 0:
                f.write(",")
                f.write("\n")
            # pk fields
            if not autoid:
                if str_pk:
                    f.write('{"uid":"' + str(gen_unique_str()) + '",')
                else:
                    f.write('{"uid":' + str(i) + ',')
            else:
                f.write('{')

            # scalar fields
            if multi_scalars:
                f.write('"int_scalar":' + str(random.randint(-999999, 9999999)) + ',')
                f.write('"float_scalar":' + str(random.random()) + ',')
                f.write('"string_scalar":"' + str(gen_unique_str()) + '",')
                f.write('"bool_scalar":' + str(random.choice(["true", "false"])) + ',')

            # vector field
            if float_vect:
                vectors = gen_float_vectors(1, dim)
            else:
                vectors = gen_binary_vectors(1, (dim // 8))
            f.write('"vectors":' + ",".join(str(x) for x in vectors) + "}")
        f.write("\n")
        f.write("]")
        f.write("\n")
        f.write("}")
        f.write("\n")
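
# For reference, a row-based file produced with autoid=False, str_pk=False, multi_scalars=True and
# float vectors of dim=2 looks roughly like this (values are illustrative, not real output):
# {
# "rows":[
# {"uid":0,"int_scalar":12345,"float_scalar":0.42,"string_scalar":"abc123","bool_scalar":true,"vectors":[0.6, 0.8]},
# {"uid":1,"int_scalar":-678,"float_scalar":0.11,"string_scalar":"def456","bool_scalar":false,"vectors":[1.0, 0.0]}
# ]
# }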


def gen_column_base_json_file(col_file, str_pk, float_vect, multi_scalars, rows, dim, autoid):
    with open(col_file, "w") as f:
        f.write("{")
        f.write("\n")
        # pk columns
        if not autoid:
            if str_pk:
                f.write('"uid":["' + ',"'.join(str(gen_unique_str()) + '"' for i in range(rows)) + '],')
                f.write("\n")
            else:
                f.write('"uid":[' + ",".join(str(i) for i in range(rows)) + "],")
                f.write("\n")

        # scalar columns
        if multi_scalars:
            f.write('"int_scalar":[' + ",".join(str(random.randint(-999999, 9999999)) for i in range(rows)) + "],")
            f.write("\n")
            f.write('"float_scalar":[' + ",".join(str(random.random()) for i in range(rows)) + "],")
            f.write("\n")
            f.write('"string_scalar":["' + ',"'.join(str(gen_unique_str()) + '"' for i in range(rows)) + '],')
            f.write("\n")
            f.write('"bool_scalar":[' + ",".join(str(random.choice(["true", "false"])) for i in range(rows)) + "],")
            f.write("\n")

        # vector columns
        if float_vect:
            vectors = gen_float_vectors(rows, dim)
        else:
            vectors = gen_binary_vectors(rows, (dim // 8))
        f.write('"vectors":[' + ",".join(str(x) for x in vectors) + "]")
        f.write("\n")

        f.write("}")
        f.write("\n")
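
# For reference, a column-based file produced with autoid=False, str_pk=False, multi_scalars=False and
# float vectors of dim=2 for two rows looks roughly like this (values are illustrative):
# {
# "uid":[0,1],
# "vectors":[[0.6, 0.8],[1.0, 0.0]]
# }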


def gen_vectors_in_numpy_file(dir, vector_type, rows, dim, num):
    # vector columns
    if vector_type == FLOAT:
        vectors = gen_float_vectors(rows, dim)
    else:
        vectors = gen_binary_vectors(rows, (dim // 8))

    suffix = entity_suffix(rows)
    arr = np.array(vectors)
    path = f"{dir}/{dim}d_{suffix}_{num}"
    if not os.path.isdir(path):
        os.mkdir(path)
    file = f'{path}/vectors_{dim}d_{suffix}.npy'
    np.save(file, arr)
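
# For reference, gen_vectors_in_numpy_file(data_source, FLOAT, rows=1000, dim=128, num=0)
# writes the array to /tmp/bulk_load_data/128d_1k_0/vectors_128d_1k.npy (the path is derived
# from the dim/rows/num arguments; data_source is the module-level constant above).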


def gen_scalars_in_numpy_file(dir, vector_type, rows, dim, num, start):
    # scalar columns; vector_type here selects the scalar type of the pk column
    if vector_type == FLOAT:
        data = [random.random() for i in range(rows)]
    elif vector_type == "int":
        data = [i for i in range(start, start + rows)]
    else:
        raise ValueError(f"unsupported scalar type: {vector_type}")

    suffix = entity_suffix(rows)
    # the folder is expected to be created by gen_vectors_in_numpy_file first
    path = f"{dir}/{dim}d_{suffix}_{num}"
    arr = np.array(data)
    file = f'{path}/uid.npy'
    np.save(file, arr)


def gen_json_file_name(row_based, rows, dim, auto_id, str_pk, float_vector, multi_scalars, file_num):
    suffix = entity_suffix(rows)
    scalars = "only"
    if multi_scalars:
        scalars = "multi_scalars"
    vt = FLOAT
    if not float_vector:
        vt = BINARY
    pk = ""
    if str_pk:
        pk = "str_pk_"
    prefix = gen_file_prefix(row_based=row_based, auto_id=auto_id)
    return f"{prefix}_{pk}{vt}_vectors_{scalars}_{dim}d_{suffix}_{file_num}.json"
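
# For reference, gen_json_file_name(row_based=True, rows=1000, dim=128, auto_id=False, str_pk=False,
#                                   float_vector=True, multi_scalars=True, file_num=0)
# returns "row_cust_float_vectors_multi_scalars_128d_1k_0.json".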


def gen_json_files(row_based, rows, dim, auto_id, str_pk, float_vector, multi_scalars, file_nums, force=False):
    """
    row_based: Boolean
        generate row-based json files if True
        generate column-based json files if False
    rows: number of entities per file
    dim: dimension of the vector data
    auto_id: Boolean
        skip generating primary key data if True, else generate it
    str_pk: Boolean
        generate string primary keys if True, else generate INT64 primary keys
    float_vector: Boolean
        generate float vectors if True, else binary vectors
    multi_scalars: Boolean
        only generate vector data (and pk data, depending on auto_id) if False
        besides vector data, also generate INT, STRING, BOOLEAN, etc. scalar data if True
    file_nums: number of files to be generated
    force: regenerate a file even if it already exists
    """

    # gen json files
    for i in range(file_nums):
        file_name = gen_json_file_name(row_based=row_based, rows=rows, dim=dim,
                                       auto_id=auto_id, str_pk=str_pk, float_vector=float_vector,
                                       multi_scalars=multi_scalars, file_num=i)
        file = f"{data_source}/{file_name}"
        if not os.path.exists(file) or force:
            if row_based:
                gen_row_based_json_file(row_file=file, str_pk=str_pk,
                                        float_vect=float_vector, multi_scalars=multi_scalars,
                                        rows=rows, dim=dim, autoid=auto_id)
            else:
                gen_column_base_json_file(col_file=file, str_pk=str_pk,
                                          float_vect=float_vector, multi_scalars=multi_scalars,
                                          rows=rows, dim=dim, autoid=auto_id)
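
# Example usage (illustrative values): generate two row-based files of 1000 entities with 128-dim
# float vectors, INT64 primary keys and multiple scalar fields under data_source:
#   gen_json_files(row_based=True, rows=1000, dim=128, auto_id=False, str_pk=False,
#                  float_vector=True, multi_scalars=True, file_nums=2)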


def gen_npy_files():
    # TODO: generate numpy files, e.g.:
    # uid = 0
    # for i in range(file_nums):
    #     gen_vectors_in_numpy_file(data_dir, FLOAT, rows=rows_list[0], dim=dim_list[0], num=i)
    #     gen_scalars_in_numpy_file(data_dir, "int", rows=rows_list[0], dim=dim_list[0], num=i, start=uid)
    #     uid += rows_list[0]
    pass


def copy_files_to_bucket(client, r_source, bucket_name, force=False):
    # create the bucket if it does not exist yet
    found = client.bucket_exists(bucket_name)
    if not found:
        print(f"Bucket {bucket_name} not found, create it.")
        client.make_bucket(bucket_name)

    # copy files from the root source folder
    os.chdir(r_source)
    onlyfiles = [f for f in os.listdir(r_source) if
                 os.path.isfile(os.path.join(r_source, f))]
    for file in onlyfiles:
        if not file.startswith("."):
            found = False
            try:
                client.stat_object(bucket_name, file)
                found = True
            except S3Error:
                pass

            # upload if forced, or if the object is not in the bucket yet
            if force or not found:
                res = client.fput_object(bucket_name, file, f"{r_source}/{file}")
                print(res.object_name)

    # copy subfolders
    sub_folders = [f.name for f in os.scandir(r_source) if f.is_dir()]
    for sub_folder in sub_folders:
        if sub_folder not in ["backup", "tested"]:
            source = f"{r_source}/{sub_folder}"
            os.chdir(source)
            onlyfiles = [f for f in os.listdir(source) if
                         os.path.isfile(os.path.join(source, f))]
            for file in onlyfiles:
                if not file.startswith("."):
                    found = False
                    try:
                        client.stat_object(bucket_name, f"{sub_folder}/{file}")
                        found = True
                    except S3Error:
                        pass

                    # upload if forced, or if the object is not in the bucket yet
                    if force or not found:
                        res = client.fput_object(bucket_name, f"{sub_folder}/{file}", f"{source}/{file}")
                        print(res.object_name)


def copy_files_to_minio(host, bucket_name, access_key="minioadmin", secret_key="minioadmin", secure=False):
    client = Minio(
        host,
        access_key=access_key,
        secret_key=secret_key,
        secure=secure,
    )
    try:
        # TODO: do not copy all the files, just copy the newly generated files
        copy_files_to_bucket(client, r_source=data_source, bucket_name=bucket_name, force=False)
    except S3Error as exc:
        print("error occurred.", exc)


def parpar_bulk_load_data(json_file, row_based, rows, dim, auto_id, str_pk, float_vector, multi_scalars, file_nums, force=False):
    """Generate the bulk load data files and upload them to minio.

    json_file: Boolean, generate json files if True; npy file generation is not implemented yet.
    """
    if json_file:
        gen_json_files(row_based=row_based, rows=rows, dim=dim,
                       auto_id=auto_id, str_pk=str_pk, float_vector=float_vector,
                       multi_scalars=multi_scalars, file_nums=file_nums, force=force)

        copy_files_to_minio(host=minio, bucket_name=bucket_name)
    else:
        # TODO: for npy files
        # gen_npy_files()
        # copy()
        pass


# if __name__ == '__main__':
#     gen_json_files(row_based=True, rows=10,
#                    dim=4, auto_id=False, str_pk=False,
#                    float_vector=True, multi_scalars=False, file_nums=2)
#
#     copy_files_to_minio(host=minio, bucket_name=bucket_name)