milvus/tests/restful_client_v2/testcases/test_jobs_operation.py

1731 lines
69 KiB
Python

import random
import json
import subprocess
import time
from sklearn import preprocessing
from pathlib import Path
import pandas as pd
import numpy as np
from pymilvus import Collection, utility
from utils.utils import gen_collection_name
from utils.util_log import test_log as logger
import pytest
from base.testbase import TestBase
from uuid import uuid4
IMPORT_TIMEOUT = 360
class NumpyEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, np.float32):
return float(obj)
return super(NumpyEncoder, self).default(obj)
@pytest.mark.BulkInsert
class TestCreateImportJob(TestBase):
@pytest.mark.parametrize("insert_num", [3000])
@pytest.mark.parametrize("import_task_num", [2])
@pytest.mark.parametrize("auto_id", [True, False])
@pytest.mark.parametrize("is_partition_key", [True, False])
@pytest.mark.parametrize("enable_dynamic_field", [True, False])
def test_job_e2e(self, insert_num, import_task_num, auto_id, is_partition_key, enable_dynamic_field):
# create collection
name = gen_collection_name()
dim = 128
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_field,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "isPartitionKey": is_partition_key, "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
}
self.collection_client.collection_create(payload)
self.wait_load_completed(name)
# upload file to storage
data = []
for i in range(insert_num):
tmp = {
"word_count": i,
"book_describe": f"book_{i}",
"book_intro": [np.float32(random.random()) for _ in range(dim)]
}
if not auto_id:
tmp["book_id"] = i
if enable_dynamic_field:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
# dump data to file
file_name = f"bulk_insert_data_{uuid4()}.json"
file_path = f"/tmp/{file_name}"
with open(file_path, "w") as f:
json.dump(data, f, cls=NumpyEncoder)
# upload file to minio storage
self.storage_client.upload_file(file_path, file_name)
# create import job
payload = {
"collectionName": name,
"files": [[file_name]],
}
for i in range(import_task_num):
rsp = self.import_job_client.create_import_jobs(payload)
# list import job
payload = {
"collectionName": name,
}
rsp = self.import_job_client.list_import_jobs(payload)
# get import job progress
for task in rsp['data']["records"]:
task_id = task['jobId']
finished = False
t0 = time.time()
while not finished:
rsp = self.import_job_client.get_import_job_progress(task_id)
if rsp['data']['state'] == "Completed":
finished = True
time.sleep(5)
if time.time() - t0 > IMPORT_TIMEOUT:
assert False, "import job timeout"
c = Collection(name)
c.load(_refresh=True, timeou=120)
res = c.query(
expr="",
output_fields=["count(*)"],
)
assert res[0]["count(*)"] == insert_num * import_task_num
# query data
payload = {
"collectionName": name,
"filter": "book_id > 0",
"outputFields": ["*"],
}
rsp = self.vector_client.vector_query(payload)
assert rsp['code'] == 0
@pytest.mark.parametrize("insert_num", [5000])
@pytest.mark.parametrize("import_task_num", [1])
@pytest.mark.parametrize("auto_id", [True])
@pytest.mark.parametrize("is_partition_key", [True])
@pytest.mark.parametrize("enable_dynamic_field", [True])
def test_import_job_with_db(self, insert_num, import_task_num, auto_id, is_partition_key, enable_dynamic_field):
self.create_database(db_name="test_job")
self.update_database(db_name="test_job")
# create collection
name = gen_collection_name()
dim = 128
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_field,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "isPartitionKey": is_partition_key, "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
}
self.collection_client.collection_create(payload)
self.wait_load_completed(name)
# upload file to storage
data = []
for i in range(insert_num):
tmp = {
"word_count": i,
"book_describe": f"book_{i}",
"book_intro": [np.float32(random.random()) for _ in range(dim)]
}
if not auto_id:
tmp["book_id"] = i
if enable_dynamic_field:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
# dump data to file
file_name = f"bulk_insert_data_{uuid4()}.json"
file_path = f"/tmp/{file_name}"
with open(file_path, "w") as f:
json.dump(data, f, cls=NumpyEncoder)
# upload file to minio storage
self.storage_client.upload_file(file_path, file_name)
# create import job
payload = {
"collectionName": name,
"files": [[file_name]],
}
for i in range(import_task_num):
rsp = self.import_job_client.create_import_jobs(payload)
# list import job
payload = {
"collectionName": name,
}
rsp = self.import_job_client.list_import_jobs(payload)
# get import job progress
for task in rsp['data']["records"]:
task_id = task['jobId']
finished = False
t0 = time.time()
while not finished:
rsp = self.import_job_client.get_import_job_progress(task_id)
if rsp['data']['state'] == "Completed":
finished = True
time.sleep(5)
if time.time() - t0 > IMPORT_TIMEOUT:
assert False, "import job timeout"
c = Collection(name)
c.load(_refresh=True, timeou=120)
res = c.query(
expr="",
output_fields=["count(*)"],
)
assert res[0]["count(*)"] == insert_num * import_task_num
# query data
payload = {
"collectionName": name,
"filter": "book_id > 0",
"outputFields": ["*"],
}
rsp = self.vector_client.vector_query(payload)
assert rsp['code'] == 0
@pytest.mark.parametrize("insert_num", [5000])
@pytest.mark.parametrize("import_task_num", [1])
@pytest.mark.parametrize("auto_id", [True])
@pytest.mark.parametrize("is_partition_key", [False])
@pytest.mark.parametrize("enable_dynamic_field", [True])
def test_import_job_with_partition(self, insert_num, import_task_num, auto_id, is_partition_key, enable_dynamic_field):
# create collection
name = gen_collection_name()
dim = 128
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_field,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
}
self.collection_client.collection_create(payload)
self.wait_load_completed(name)
# upload file to storage
data = []
for i in range(insert_num):
tmp = {
"word_count": i,
"book_describe": f"book_{i}",
"book_intro": [np.float32(random.random()) for _ in range(dim)]
}
if not auto_id:
tmp["book_id"] = i
if enable_dynamic_field:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
# dump data to file
file_name = f"bulk_insert_data_{uuid4()}.json"
file_path = f"/tmp/{file_name}"
with open(file_path, "w") as f:
json.dump(data, f, cls=NumpyEncoder)
# upload file to minio storage
self.storage_client.upload_file(file_path, file_name)
# create partition
partition_name = "test_partition"
rsp = self.partition_client.partition_create(collection_name=name, partition_name=partition_name)
# create import job
payload = {
"collectionName": name,
"partitionName": partition_name,
"files": [[file_name]],
}
for i in range(import_task_num):
rsp = self.import_job_client.create_import_jobs(payload)
# list import job
payload = {
"collectionName": name,
}
rsp = self.import_job_client.list_import_jobs(payload)
# get import job progress
for task in rsp['data']["records"]:
task_id = task['jobId']
finished = False
t0 = time.time()
while not finished:
rsp = self.import_job_client.get_import_job_progress(task_id)
if rsp['data']['state'] == "Completed":
finished = True
time.sleep(5)
if time.time() - t0 > IMPORT_TIMEOUT:
assert False, "import job timeout"
c = Collection(name)
c.load(_refresh=True, timeou=120)
res = c.query(
expr="",
output_fields=["count(*)"],
)
logger.info(f"count in collection: {res}")
assert res[0]["count(*)"] == insert_num * import_task_num
res = c.query(
expr="",
partition_names=[partition_name],
output_fields=["count(*)"],
)
logger.info(f"count in partition {[partition_name]}: {res}")
assert res[0]["count(*)"] == insert_num * import_task_num
# query data
payload = {
"collectionName": name,
"filter": "book_id > 0",
"outputFields": ["*"],
}
rsp = self.vector_client.vector_query(payload)
assert rsp['code'] == 0
def test_job_import_multi_json_file(self):
# create collection
name = gen_collection_name()
dim = 128
payload = {
"collectionName": name,
"schema": {
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
}
self.collection_client.collection_create(payload)
self.wait_load_completed(name)
# upload file to storage
file_nums = 2
file_names = []
for file_num in range(file_nums):
data = [{
"book_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"book_intro": [np.float32(random.random()) for _ in range(dim)]}
for i in range(1000*file_num, 1000*(file_num+1))]
# dump data to file
file_name = f"bulk_insert_data_{file_num}_{uuid4()}.json"
file_path = f"/tmp/{file_name}"
# create dir for file path
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
with open(file_path, "w") as f:
json.dump(data, f, cls=NumpyEncoder)
# upload file to minio storage
self.storage_client.upload_file(file_path, file_name)
file_names.append([file_name])
# create import job
payload = {
"collectionName": name,
"files": file_names,
}
rsp = self.import_job_client.create_import_jobs(payload)
# list import job
payload = {
"collectionName": name,
}
rsp = self.import_job_client.list_import_jobs(payload)
# get import job progress
for job in rsp['data']["records"]:
job_id = job['jobId']
finished = False
t0 = time.time()
while not finished:
rsp = self.import_job_client.get_import_job_progress(job_id)
if rsp['data']['state'] == "Completed":
finished = True
time.sleep(5)
if time.time() - t0 > IMPORT_TIMEOUT:
assert False, "import job timeout"
time.sleep(10)
# assert data count
c = Collection(name)
c.load(_refresh=True, timeou=120)
assert c.num_entities == 2000
# assert import data can be queried
payload = {
"collectionName": name,
"filter": f"book_id in {[i for i in range(1000)]}",
"limit": 100,
"offset": 0,
"outputFields": ["*"]
}
rsp = self.vector_client.vector_query(payload)
assert len(rsp['data']) == 100
def test_job_import_multi_parquet_file(self):
# create collection
name = gen_collection_name()
dim = 128
payload = {
"collectionName": name,
"schema": {
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
}
self.collection_client.collection_create(payload)
self.wait_load_completed(name)
# upload file to storage
file_nums = 2
file_names = []
for file_num in range(file_nums):
data = [{
"book_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"book_intro": [np.float32(random.random()) for _ in range(dim)]}
for i in range(1000*file_num, 1000*(file_num+1))]
# dump data to file
file_name = f"bulk_insert_data_{file_num}_{uuid4()}.parquet"
file_path = f"/tmp/{file_name}"
# create dir for file path
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
df = pd.DataFrame(data)
df.to_parquet(file_path, index=False)
# upload file to minio storage
self.storage_client.upload_file(file_path, file_name)
file_names.append([file_name])
# create import job
payload = {
"collectionName": name,
"files": file_names,
}
rsp = self.import_job_client.create_import_jobs(payload)
# list import job
payload = {
"collectionName": name,
}
rsp = self.import_job_client.list_import_jobs(payload)
# get import job progress
for job in rsp['data']["records"]:
job_id = job['jobId']
finished = False
t0 = time.time()
while not finished:
rsp = self.import_job_client.get_import_job_progress(job_id)
if rsp['data']['state'] == "Completed":
finished = True
time.sleep(5)
if time.time() - t0 > IMPORT_TIMEOUT:
assert False, "import job timeout"
time.sleep(10)
# assert data count
c = Collection(name)
c.load(_refresh=True, timeou=120)
assert c.num_entities == 2000
# assert import data can be queried
payload = {
"collectionName": name,
"filter": f"book_id in {[i for i in range(1000)]}",
"limit": 100,
"offset": 0,
"outputFields": ["*"]
}
rsp = self.vector_client.vector_query(payload)
assert len(rsp['data']) == 100
def test_job_import_multi_numpy_file(self):
# create collection
name = gen_collection_name()
dim = 128
payload = {
"collectionName": name,
"schema": {
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
}
self.collection_client.collection_create(payload)
self.wait_load_completed(name)
# upload file to storage
file_nums = 2
file_names = []
for file_num in range(file_nums):
data = [{
"book_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"book_intro": [np.float32(random.random()) for _ in range(dim)]}
for i in range(1000*file_num, 1000*(file_num+1))]
file_list = []
# dump data to file
file_dir = f"bulk_insert_data_{file_num}_{uuid4()}"
base_file_path = f"/tmp/{file_dir}"
df = pd.DataFrame(data)
# each column is a list and convert to a npy file
for column in df.columns:
file_path = f"{base_file_path}/{column}.npy"
# create dir for file path
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
file_name = f"{file_dir}/{column}.npy"
np.save(file_path, np.array(df[column].values.tolist()))
# upload file to minio storage
self.storage_client.upload_file(file_path, file_name)
file_list.append(file_name)
file_names.append(file_list)
# create import job
payload = {
"collectionName": name,
"files": file_names,
}
rsp = self.import_job_client.create_import_jobs(payload)
# list import job
payload = {
"collectionName": name,
}
rsp = self.import_job_client.list_import_jobs(payload)
# get import job progress
for job in rsp['data']["records"]:
job_id = job['jobId']
finished = False
t0 = time.time()
while not finished:
rsp = self.import_job_client.get_import_job_progress(job_id)
if rsp['data']['state'] == "Completed":
finished = True
time.sleep(5)
if time.time() - t0 > IMPORT_TIMEOUT:
assert False, "import job timeout"
time.sleep(10)
# assert data count
c = Collection(name)
c.load(_refresh=True, timeou=120)
assert c.num_entities == 2000
# assert import data can be queried
payload = {
"collectionName": name,
"filter": f"book_id in {[i for i in range(1000)]}",
"limit": 100,
"offset": 0,
"outputFields": ["*"]
}
rsp = self.vector_client.vector_query(payload)
assert len(rsp['data']) == 100
def test_job_import_multi_file_type(self):
# create collection
name = gen_collection_name()
dim = 128
payload = {
"collectionName": name,
"schema": {
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
}
self.collection_client.collection_create(payload)
self.wait_load_completed(name)
# upload file to storage
file_nums = 2
file_names = []
# numpy file
for file_num in range(file_nums):
data = [{
"book_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"book_intro": [np.float32(random.random()) for _ in range(dim)]}
for i in range(1000*file_num, 1000*(file_num+1))]
file_list = []
# dump data to file
file_dir = f"bulk_insert_data_{file_num}_{uuid4()}"
base_file_path = f"/tmp/{file_dir}"
df = pd.DataFrame(data)
# each column is a list and convert to a npy file
for column in df.columns:
file_path = f"{base_file_path}/{column}.npy"
# create dir for file path
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
file_name = f"{file_dir}/{column}.npy"
np.save(file_path, np.array(df[column].values.tolist()))
# upload file to minio storage
self.storage_client.upload_file(file_path, file_name)
file_list.append(file_name)
file_names.append(file_list)
# parquet file
for file_num in range(2,file_nums+2):
data = [{
"book_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"book_intro": [np.float32(random.random()) for _ in range(dim)]}
for i in range(1000*file_num, 1000*(file_num+1))]
# dump data to file
file_name = f"bulk_insert_data_{file_num}_{uuid4()}.parquet"
file_path = f"/tmp/{file_name}"
# create dir for file path
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
df = pd.DataFrame(data)
df.to_parquet(file_path, index=False)
# upload file to minio storage
self.storage_client.upload_file(file_path, file_name)
file_names.append([file_name])
# json file
for file_num in range(4, file_nums+4):
data = [{
"book_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"book_intro": [np.float32(random.random()) for _ in range(dim)]}
for i in range(1000*file_num, 1000*(file_num+1))]
# dump data to file
file_name = f"bulk_insert_data_{file_num}_{uuid4()}.json"
file_path = f"/tmp/{file_name}"
with open(file_path, "w") as f:
json.dump(data, f, cls=NumpyEncoder)
# upload file to minio storage
self.storage_client.upload_file(file_path, file_name)
file_names.append([file_name])
# create import job
payload = {
"collectionName": name,
"files": file_names,
}
rsp = self.import_job_client.create_import_jobs(payload)
# list import job
payload = {
"collectionName": name,
}
rsp = self.import_job_client.list_import_jobs(payload)
# get import job progress
for job in rsp['data']["records"]:
job_id = job['jobId']
finished = False
t0 = time.time()
while not finished:
rsp = self.import_job_client.get_import_job_progress(job_id)
if rsp['data']['state'] == "Completed":
finished = True
time.sleep(5)
if time.time() - t0 > IMPORT_TIMEOUT:
assert False, "import job timeout"
time.sleep(10)
# assert data count
c = Collection(name)
c.load(_refresh=True, timeou=120)
assert c.num_entities == 6000
# assert import data can be queried
payload = {
"collectionName": name,
"filter": f"book_id in {[i for i in range(1000)]}",
"limit": 100,
"offset": 0,
"outputFields": ["*"]
}
rsp = self.vector_client.vector_query(payload)
assert len(rsp['data']) == 100
@pytest.mark.parametrize("insert_round", [2])
@pytest.mark.parametrize("auto_id", [True])
@pytest.mark.parametrize("is_partition_key", [True])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.skip("stats task will generate a new segment, "
"using collectionID as prefix will import twice as much data")
def test_job_import_binlog_file_type(self, nb, dim, insert_round, auto_id,
is_partition_key, enable_dynamic_schema, bucket_name, root_path):
# todo: copy binlog file to backup bucket
"""
Insert a vector with a simple payload
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "bool", "dataType": "Bool", "elementTypeParams": {}},
{"fieldName": "json", "dataType": "JSON", "elementTypeParams": {}},
{"fieldName": "int_array", "dataType": "Array", "elementDataType": "Int64",
"elementTypeParams": {"max_capacity": "1024"}},
{"fieldName": "varchar_array", "dataType": "Array", "elementDataType": "VarChar",
"elementTypeParams": {"max_capacity": "1024", "max_length": "256"}},
{"fieldName": "bool_array", "dataType": "Array", "elementDataType": "Bool",
"elementTypeParams": {"max_capacity": "1024"}},
{"fieldName": "text_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
{"fieldName": "image_emb", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
]
},
"indexParams": [
{"fieldName": "text_emb", "indexName": "text_emb", "metricType": "L2"},
{"fieldName": "image_emb", "indexName": "image_emb", "metricType": "L2"}
]
}
self.collection_client.collection_create(payload)
self.wait_load_completed(name)
# create restore collection
restore_collection_name = f"{name}_restore"
payload["collectionName"] = restore_collection_name
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
# insert data
for i in range(insert_round):
data = []
for i in range(nb):
if auto_id:
tmp = {
"user_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"bool": random.choice([True, False]),
"json": {"key": i},
"int_array": [i],
"varchar_array": [f"varchar_{i}"],
"bool_array": [random.choice([True, False])],
"text_emb": preprocessing.normalize([np.array([np.float32(random.random()) for _ in range(dim)])])[
0].tolist(),
"image_emb": preprocessing.normalize([np.array([np.float32(random.random()) for _ in range(dim)])])[
0].tolist(),
}
else:
tmp = {
"book_id": i,
"user_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"bool": random.choice([True, False]),
"json": {"key": i},
"int_array": [i],
"varchar_array": [f"varchar_{i}"],
"bool_array": [random.choice([True, False])],
"text_emb": preprocessing.normalize([np.array([np.float32(random.random()) for _ in range(dim)])])[
0].tolist(),
"image_emb": preprocessing.normalize([np.array([np.float32(random.random()) for _ in range(dim)])])[
0].tolist(),
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
# flush data to generate binlog file
c = Collection(name)
c.flush()
time.sleep(2)
# query data to make sure the data is inserted
rsp = self.vector_client.vector_query({"collectionName": name, "filter": "user_id > 0", "limit": 50})
assert rsp['code'] == 0
assert len(rsp['data']) == 50
# get collection id
c = Collection(name)
res = c.describe()
collection_id = res["collection_id"]
# create import job
payload = {
"collectionName": restore_collection_name,
"files": [[f"/{root_path}/insert_log/{collection_id}/",
# f"{bucket_name}/{root_path}/delta_log/{collection_id}/"
]],
"options": {
"backup": "true"
}
}
if is_partition_key:
payload["partitionName"] = "_default_0"
rsp = self.import_job_client.create_import_jobs(payload)
assert rsp['code'] == 0
# list import job
payload = {
"collectionName": restore_collection_name,
}
rsp = self.import_job_client.list_import_jobs(payload)
# get import job progress
for job in rsp['data']["records"]:
job_id = job['jobId']
finished = False
t0 = time.time()
while not finished:
rsp = self.import_job_client.get_import_job_progress(job_id)
if rsp['data']['state'] == "Completed":
finished = True
time.sleep(5)
if time.time() - t0 > IMPORT_TIMEOUT:
assert False, "import job timeout"
time.sleep(10)
c_restore = Collection(restore_collection_name)
assert c.num_entities == c_restore.num_entities
@pytest.mark.L2
class TestImportJobAdvance(TestBase):
def test_job_import_recovery_after_chaos(self, release_name):
# create collection
name = gen_collection_name()
dim = 128
payload = {
"collectionName": name,
"schema": {
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
}
self.collection_client.collection_create(payload)
self.wait_load_completed(name)
# upload file to storage
file_nums = 10
batch_size = 1000
file_names = []
for file_num in range(file_nums):
data = [{
"book_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"book_intro": [np.float32(random.random()) for _ in range(dim)]}
for i in range(batch_size*file_num, batch_size*(file_num+1))]
# dump data to file
file_name = f"bulk_insert_data_{file_num}_{uuid4()}.json"
file_path = f"/tmp/{file_name}"
# create dir for file path
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
with open(file_path, "w") as f:
json.dump(data, f, cls=NumpyEncoder)
# upload file to minio storage
self.storage_client.upload_file(file_path, file_name)
file_names.append([file_name])
# create import job
payload = {
"collectionName": name,
"files": file_names,
}
rsp = self.import_job_client.create_import_jobs(payload)
job_id = rsp['data']['jobId']
# list import job
payload = {
"collectionName": name,
}
rsp = self.import_job_client.list_import_jobs(payload)
assert job_id in [job["jobId"] for job in rsp['data']["records"]]
rsp = self.import_job_client.list_import_jobs(payload)
# kill milvus by deleting pod
cmd = f"kubectl delete pod -l 'app.kubernetes.io/instance={release_name}, app.kubernetes.io/name=milvus' "
result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
output = result.stdout
return_code = result.returncode
logger.info(f"output: {output}, return_code, {return_code}")
# get import job progress
for job in rsp['data']["records"]:
job_id = job['jobId']
finished = False
t0 = time.time()
while not finished:
try:
rsp = self.import_job_client.get_import_job_progress(job_id)
if rsp['data']['state'] == "Completed":
finished = True
time.sleep(5)
if time.time() - t0 > IMPORT_TIMEOUT:
assert False, "import job timeout"
except Exception as e:
logger.error(f"get import job progress failed: {e}")
time.sleep(5)
time.sleep(10)
rsp = self.import_job_client.list_import_jobs(payload)
# assert data count
c = Collection(name)
c.load(_refresh=True, timeou=120)
assert c.num_entities == file_nums * batch_size
# assert import data can be queried
payload = {
"collectionName": name,
"filter": f"book_id in {[i for i in range(1000)]}",
"limit": 100,
"offset": 0,
"outputFields": ["*"]
}
rsp = self.vector_client.vector_query(payload)
assert len(rsp['data']) == 100
@pytest.mark.L2
class TestCreateImportJobAdvance(TestBase):
def test_job_import_with_multi_task_and_datanode(self, release_name):
# create collection
name = gen_collection_name()
dim = 128
payload = {
"collectionName": name,
"schema": {
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
}
self.collection_client.collection_create(payload)
self.wait_load_completed(name)
# upload file to storage
task_num = 48
file_nums = 1
batch_size = 100000
file_names = []
for file_num in range(file_nums):
data = [{
"book_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"book_intro": [np.float32(random.random()) for _ in range(dim)]}
for i in range(batch_size*file_num, batch_size*(file_num+1))]
# dump data to file
file_name = f"bulk_insert_data_{file_num}_{uuid4()}.json"
file_path = f"/tmp/{file_name}"
# create dir for file path
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
with open(file_path, "w") as f:
json.dump(data, f, cls=NumpyEncoder)
# upload file to minio storage
self.storage_client.upload_file(file_path, file_name)
file_names.append([file_name])
for i in range(task_num):
# create import job
payload = {
"collectionName": name,
"files": file_names,
}
rsp = self.import_job_client.create_import_jobs(payload)
job_id = rsp['data']['jobId']
# list import job
payload = {
"collectionName": name,
}
rsp = self.import_job_client.list_import_jobs(payload)
assert job_id in [job["jobId"] for job in rsp['data']["records"]]
rsp = self.import_job_client.list_import_jobs(payload)
# get import job progress
for job in rsp['data']["records"]:
job_id = job['jobId']
finished = False
t0 = time.time()
while not finished:
try:
rsp = self.import_job_client.get_import_job_progress(job_id)
if rsp['data']['state'] == "Completed":
finished = True
time.sleep(5)
if time.time() - t0 > IMPORT_TIMEOUT:
assert False, "import job timeout"
except Exception as e:
logger.error(f"get import job progress failed: {e}")
time.sleep(5)
time.sleep(10)
rsp = self.import_job_client.list_import_jobs(payload)
# assert data count
c = Collection(name)
c.load(_refresh=True, timeou=120)
assert c.num_entities == file_nums * batch_size * task_num
# assert import data can be queried
payload = {
"collectionName": name,
"filter": f"book_id in {[i for i in range(1000)]}",
"limit": 100,
"offset": 0,
"outputFields": ["*"]
}
rsp = self.vector_client.vector_query(payload)
assert len(rsp['data']) == 100
def test_job_import_with_extremely_large_task_num(self, release_name):
# create collection
name = gen_collection_name()
dim = 128
payload = {
"collectionName": name,
"schema": {
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
}
self.collection_client.collection_create(payload)
self.wait_load_completed(name)
# upload file to storage
task_num = 1000
file_nums = 2
batch_size = 10
file_names = []
for file_num in range(file_nums):
data = [{
"book_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"book_intro": [np.float32(random.random()) for _ in range(dim)]}
for i in range(batch_size*file_num, batch_size*(file_num+1))]
# dump data to file
file_name = f"bulk_insert_data_{file_num}_{uuid4()}.json"
file_path = f"/tmp/{file_name}"
# create dir for file path
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
with open(file_path, "w") as f:
json.dump(data, f, cls=NumpyEncoder)
# upload file to minio storage
self.storage_client.upload_file(file_path, file_name)
file_names.append([file_name])
for i in range(task_num):
# create import job
payload = {
"collectionName": name,
"files": file_names,
}
rsp = self.import_job_client.create_import_jobs(payload)
job_id = rsp['data']['jobId']
# list import job
payload = {
"collectionName": name,
}
rsp = self.import_job_client.list_import_jobs(payload)
assert job_id in [job["jobId"] for job in rsp['data']["records"]]
rsp = self.import_job_client.list_import_jobs(payload)
# get import job progress
for job in rsp['data']["records"]:
job_id = job['jobId']
finished = False
t0 = time.time()
while not finished:
try:
rsp = self.import_job_client.get_import_job_progress(job_id)
if rsp['data']['state'] == "Completed":
finished = True
time.sleep(5)
if time.time() - t0 > IMPORT_TIMEOUT:
assert False, "import job timeout"
except Exception as e:
logger.error(f"get import job progress failed: {e}")
time.sleep(5)
time.sleep(10)
rsp = self.import_job_client.list_import_jobs(payload)
# assert data count
c = Collection(name)
c.load(_refresh=True, timeou=120)
assert c.num_entities == file_nums * batch_size * task_num
# assert import data can be queried
payload = {
"collectionName": name,
"filter": f"book_id in {[i for i in range(1000)]}",
"limit": 100,
"offset": 0,
"outputFields": ["*"]
}
rsp = self.vector_client.vector_query(payload)
assert len(rsp['data']) == 100
@pytest.mark.L1
class TestCreateImportJobNegative(TestBase):
@pytest.mark.parametrize("insert_num", [2])
@pytest.mark.parametrize("import_task_num", [1])
@pytest.mark.parametrize("auto_id", [True])
@pytest.mark.parametrize("is_partition_key", [True])
@pytest.mark.parametrize("enable_dynamic_field", [True])
@pytest.mark.BulkInsert
def test_create_import_job_with_json_dup_dynamic_key(self, insert_num, import_task_num, auto_id, is_partition_key, enable_dynamic_field):
# create collection
name = gen_collection_name()
dim = 16
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_field,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "isPartitionKey": is_partition_key, "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
}
self.collection_client.collection_create(payload)
self.wait_load_completed(name)
# upload file to storage
data = []
for i in range(insert_num):
tmp = {
"word_count": i,
"book_describe": f"book_{i}",
"dynamic_key": i,
"book_intro": [random.random() for _ in range(dim)]
}
if not auto_id:
tmp["book_id"] = i
if enable_dynamic_field:
tmp.update({f"$meta": {"dynamic_key": i+1}})
data.append(tmp)
# dump data to file
file_name = f"bulk_insert_data_{uuid4()}.json"
file_path = f"/tmp/{file_name}"
logger.info(f"data: {data}")
with open(file_path, "w") as f:
json.dump(data, f)
# upload file to minio storage
self.storage_client.upload_file(file_path, file_name)
# create import job
payload = {
"collectionName": name,
"files": [[file_name]],
}
for i in range(import_task_num):
rsp = self.import_job_client.create_import_jobs(payload)
# list import job
payload = {
"collectionName": name,
}
rsp = self.import_job_client.list_import_jobs(payload)
# get import job progress
for task in rsp['data']["records"]:
task_id = task['jobId']
finished = False
t0 = time.time()
while not finished:
rsp = self.import_job_client.get_import_job_progress(task_id)
if rsp['data']['state'] == "Failed":
assert True
finished = True
if rsp['data']['state'] == "Completed":
assert False
time.sleep(5)
if time.time() - t0 > IMPORT_TIMEOUT:
assert False, "import job timeout"
def test_import_job_with_empty_files(self):
# create collection
name = gen_collection_name()
dim = 128
payload = {
"collectionName": name,
"schema": {
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
}
self.collection_client.collection_create(payload)
self.wait_load_completed(name)
# create import job
payload = {
"collectionName": name,
"files": [[]],
}
rsp = self.import_job_client.create_import_jobs(payload)
assert rsp['code'] == 1100 and "empty" in rsp['message']
def test_import_job_with_non_exist_files(self):
# create collection
name = gen_collection_name()
dim = 128
payload = {
"collectionName": name,
"schema": {
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
}
rsp = self.collection_client.collection_create(payload)
# create import job
payload = {
"collectionName": name,
"files": [["invalid_file.json"]],
}
rsp = self.import_job_client.create_import_jobs(payload)
time.sleep(5)
rsp = self.import_job_client.get_import_job_progress(rsp['data']['jobId'])
assert rsp["data"]["state"] == "Failed"
def test_import_job_with_non_exist_binlog_files(self):
# create collection
name = gen_collection_name()
dim = 128
payload = {
"collectionName": name,
"schema": {
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
}
self.collection_client.collection_create(payload)
self.wait_load_completed(name)
# create import job
payload = {
"collectionName": name,
"files": [[
f"invalid_bucket/invalid_root_path/insert_log/invalid_id/",
]],
"options": {
"backup": "true"
}
}
rsp = self.import_job_client.create_import_jobs(payload)
assert rsp['code'] == 1100 and "invalid" in rsp['message']
def test_import_job_with_wrong_file_type(self):
# create collection
name = gen_collection_name()
dim = 128
payload = {
"collectionName": name,
"schema": {
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
}
rsp = self.collection_client.collection_create(payload)
# upload file to storage
data = [{
"book_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"book_intro": [np.float32(random.random()) for _ in range(dim)]}
for i in range(10000)]
# dump data to file
file_name = f"bulk_insert_data_{uuid4()}.txt"
file_path = f"/tmp/{file_name}"
json_data = json.dumps(data, cls=NumpyEncoder)
# 将JSON数据保存到txt文件
with open(file_path, 'w') as file:
file.write(json_data)
# upload file to minio storage
self.storage_client.upload_file(file_path, file_name)
# create import job
payload = {
"collectionName": name,
"files": [[file_name]],
}
rsp = self.import_job_client.create_import_jobs(payload)
assert rsp['code'] == 2100 and "unexpected file type" in rsp['message']
def test_import_job_with_empty_rows(self):
# create collection
name = gen_collection_name()
dim = 128
payload = {
"collectionName": name,
"schema": {
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
}
rsp = self.collection_client.collection_create(payload)
# upload file to storage
data = [{
"book_id": i,
"word_count": i,
"book_describe": f"book_{i}",
"book_intro": [np.float32(random.random()) for _ in range(dim)]}
for i in range(0)]
# dump data to file
file_name = "bulk_insert_empty_data.json"
file_path = f"/tmp/{file_name}"
with open(file_path, "w") as f:
json.dump(data, f, cls=NumpyEncoder)
# upload file to minio storage
self.storage_client.upload_file(file_path, file_name)
# create import job
payload = {
"collectionName": name,
"files": [[file_name]],
}
rsp = self.import_job_client.create_import_jobs(payload)
job_id = rsp['data']['jobId']
# list import job
payload = {
"collectionName": name,
}
rsp = self.import_job_client.list_import_jobs(payload)
# wait import job to be completed
res, result = self.import_job_client.wait_import_job_completed(job_id)
assert result
c = Collection(name)
assert c.num_entities == 0
def test_create_import_job_with_new_user(self):
# create collection
name = gen_collection_name()
dim = 128
payload = {
"collectionName": name,
"schema": {
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
}
rsp = self.collection_client.collection_create(payload)
# create new user
username = "test_user"
password = "12345678"
payload = {
"userName": username,
"password": password
}
self.user_client.user_create(payload)
# try to describe collection with new user
self.collection_client.api_key = f"{username}:{password}"
try:
rsp = self.collection_client.collection_describe(collection_name=name)
logger.info(f"describe collection: {rsp}")
except Exception as e:
logger.error(f"describe collection failed: {e}")
# upload file to storage
data = []
auto_id = True
enable_dynamic_field = True
for i in range(1):
tmp = {
"word_count": i,
"book_describe": f"book_{i}",
"book_intro": [np.float32(random.random()) for _ in range(dim)]
}
if not auto_id:
tmp["book_id"] = i
if enable_dynamic_field:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
# dump data to file
file_name = f"bulk_insert_data_{uuid4()}.json"
file_path = f"/tmp/{file_name}"
with open(file_path, "w") as f:
json.dump(data, f, cls=NumpyEncoder)
# upload file to minio storage
self.storage_client.upload_file(file_path, file_name)
# create import job
payload = {
"collectionName": name,
"files": [[file_name]],
}
self.import_job_client.api_key = f"{username}:{password}"
rsp = self.import_job_client.create_import_jobs(payload)
assert rsp['code'] == 1100 and "empty" in rsp['message']
@pytest.mark.parametrize("insert_num", [5000])
@pytest.mark.parametrize("import_task_num", [2])
@pytest.mark.parametrize("auto_id", [True, False])
@pytest.mark.parametrize("is_partition_key", [True, False])
@pytest.mark.parametrize("enable_dynamic_field", [True, False])
def test_get_job_progress_with_mismatch_db_name(self, insert_num, import_task_num, auto_id, is_partition_key, enable_dynamic_field):
# create collection
name = gen_collection_name()
dim = 128
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_field,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "isPartitionKey": is_partition_key, "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
}
rsp = self.collection_client.collection_create(payload)
# upload file to storage
data = []
for i in range(insert_num):
tmp = {
"word_count": i,
"book_describe": f"book_{i}",
"book_intro": [np.float32(random.random()) for _ in range(dim)]
}
if not auto_id:
tmp["book_id"] = i
if enable_dynamic_field:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
# dump data to file
file_name = f"bulk_insert_data_{uuid4()}.json"
file_path = f"/tmp/{file_name}"
with open(file_path, "w") as f:
json.dump(data, f, cls=NumpyEncoder)
# upload file to minio storage
self.storage_client.upload_file(file_path, file_name)
# create import job
payload = {
"collectionName": name,
"files": [[file_name]],
}
for i in range(import_task_num):
rsp = self.import_job_client.create_import_jobs(payload)
# list import job
payload = {
"collectionName": name,
}
rsp = self.import_job_client.list_import_jobs(payload)
# get import job progress
for task in rsp['data']["records"]:
task_id = task['jobId']
finished = False
t0 = time.time()
while not finished:
rsp = self.import_job_client.get_import_job_progress(task_id)
if rsp['data']['state'] == "Completed":
finished = True
time.sleep(5)
if time.time() - t0 > IMPORT_TIMEOUT:
assert False, "import job timeout"
c = Collection(name)
time.sleep(10)
c.load(_refresh=True, timeou=120)
res = c.query(
expr="",
output_fields=["count(*)"],
)
assert res[0]["count(*)"] == insert_num * import_task_num
# query data
payload = {
"collectionName": name,
"filter": "book_id > 0",
"outputFields": ["*"],
}
rsp = self.vector_client.vector_query(payload)
assert rsp['code'] == 0
@pytest.mark.L1
class TestListImportJob(TestBase):
def test_list_job_e2e(self):
# create two db
self.create_database(db_name="db1")
self.create_database(db_name="db2")
# create collection
insert_num = 5000
import_task_num = 2
auto_id = True
is_partition_key = True
enable_dynamic_field = True
name = gen_collection_name()
dim = 128
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_field,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "isPartitionKey": is_partition_key, "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
}
for db_name in ["db1", "db2"]:
rsp = self.collection_client.collection_create(payload, db_name=db_name)
# upload file to storage
data = []
for i in range(insert_num):
tmp = {
"word_count": i,
"book_describe": f"book_{i}",
"book_intro": [np.float32(random.random()) for _ in range(dim)]
}
if not auto_id:
tmp["book_id"] = i
if enable_dynamic_field:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
# dump data to file
file_name = f"bulk_insert_data_{uuid4()}.json"
file_path = f"/tmp/{file_name}"
with open(file_path, "w") as f:
json.dump(data, f, cls=NumpyEncoder)
# upload file to minio storage
self.storage_client.upload_file(file_path, file_name)
# create import job
for db in ["db1", "db2"]:
payload = {
"collectionName": name,
"files": [[file_name]],
}
for i in range(import_task_num):
rsp = self.import_job_client.create_import_jobs(payload, db_name=db)
# list import job
payload = {
}
for db_name in [None, "db1", "db2", "default"]:
try:
rsp = self.import_job_client.list_import_jobs(payload, db_name=db_name)
logger.info(f"job num: {len(rsp['data']['records'])}")
except Exception as e:
logger.error(f"list import job failed: {e}")
@pytest.mark.L1
class TestGetImportJobProgress(TestBase):
def test_list_job_e2e(self):
# create two db
self.create_database(db_name="db1")
self.create_database(db_name="db2")
# create collection
insert_num = 5000
import_task_num = 2
auto_id = True
is_partition_key = True
enable_dynamic_field = True
name = gen_collection_name()
dim = 128
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_field,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "isPartitionKey": is_partition_key, "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
]
},
"indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
}
for db_name in ["db1", "db2"]:
rsp = self.collection_client.collection_create(payload, db_name=db_name)
# upload file to storage
data = []
for i in range(insert_num):
tmp = {
"word_count": i,
"book_describe": f"book_{i}",
"book_intro": [np.float32(random.random()) for _ in range(dim)]
}
if not auto_id:
tmp["book_id"] = i
if enable_dynamic_field:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
# dump data to file
file_name = f"bulk_insert_data_{uuid4()}.json"
file_path = f"/tmp/{file_name}"
with open(file_path, "w") as f:
json.dump(data, f, cls=NumpyEncoder)
# upload file to minio storage
self.storage_client.upload_file(file_path, file_name)
job_id_list = []
# create import job
for db in ["db1", "db2"]:
payload = {
"collectionName": name,
"files": [[file_name]],
}
for i in range(import_task_num):
rsp = self.import_job_client.create_import_jobs(payload, db_name=db)
job_id_list.append(rsp['data']['jobId'])
time.sleep(5)
# get import job progress
for job_id in job_id_list:
try:
rsp = self.import_job_client.get_import_job_progress(job_id)
logger.info(f"job progress: {rsp}")
except Exception as e:
logger.error(f"get import job progress failed: {e}")
@pytest.mark.L1
class TestGetImportJobProgressNegative(TestBase):
def test_list_job_with_invalid_job_id(self):
# get import job progress with invalid job id
job_id_list = ["invalid_job_id", None]
for job_id in job_id_list:
try:
rsp = self.import_job_client.get_import_job_progress(job_id)
logger.info(f"job progress: {rsp}")
except Exception as e:
logger.error(f"get import job progress failed: {e}")
def test_list_job_with_job_id(self):
# get import job progress with invalid job id
job_id_list = ["invalid_job_id", None]
for job_id in job_id_list:
try:
rsp = self.import_job_client.get_import_job_progress(job_id)
logger.info(f"job progress: {rsp}")
except Exception as e:
logger.error(f"get import job progress failed: {e}")
def test_list_job_with_new_user(self):
# create new user
user_name = "test_user"
password = "12345678"
self.user_client.user_create({
"userName": user_name,
"password": password,
})