mirror of https://github.com/milvus-io/milvus.git
Only allow one file for row-based import (#20234)
Signed-off-by: yhmo <yihua.mo@zilliz.com>
pull/20261/head
parent de45db0475
commit cd3b3d3da6
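Note on the change below: with this commit a row-based (JSON) import request may carry only one file, so each request maps to exactly one import task. The following is a minimal caller-side sketch, not part of the commit, assuming the milvuspb/commonpb types shown in the diff, a hypothetical helper package name, and Go proto import paths that may differ between Milvus versions:

// Package importhelper is a hypothetical helper package; it is not part of the milvus repo.
package importhelper

import (
    "context"
    "fmt"

    // proto packages as used around Milvus v2.2; adjust the paths to your version
    "github.com/milvus-io/milvus-proto/go-api/commonpb"
    "github.com/milvus-io/milvus-proto/go-api/milvuspb"
)

// importer is a hypothetical stand-in for whatever exposes the Import RPC
// (the proxy in this repo, or an SDK client with the same signature).
type importer interface {
    Import(ctx context.Context, req *milvuspb.ImportRequest) (*milvuspb.ImportResponse, error)
}

// importRowBasedFiles submits each JSON file as its own ImportRequest, since a
// single row-based request carrying multiple files is now rejected.
func importRowBasedFiles(ctx context.Context, c importer, collection string, jsonFiles []string) error {
    for _, f := range jsonFiles {
        req := &milvuspb.ImportRequest{
            CollectionName: collection,
            Files:          []string{f}, // exactly one JSON file per request
        }
        resp, err := c.Import(ctx, req)
        if err != nil {
            return err
        }
        if resp.GetStatus().GetErrorCode() != commonpb.ErrorCode_Success {
            return fmt.Errorf("import of %s failed: %s", f, resp.GetStatus().GetReason())
        }
    }
    return nil
}

Column-based (numpy) imports are not affected by this limit; only files with the .json extension put a request onto the row-based path.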
@@ -1698,7 +1698,7 @@ func TestProxy(t *testing.T) {
     defer wg.Done()
     req := &milvuspb.ImportRequest{
         CollectionName: collectionName,
-        Files:          []string{"f1.json", "f2.json", "f3.json"},
+        Files:          []string{"f1.json"},
     }
     proxy.stateCode.Store(commonpb.StateCode_Healthy)
     resp, err := proxy.Import(context.TODO(), req)
@@ -1713,7 +1713,7 @@ func TestProxy(t *testing.T) {
     defer wg.Done()
     req := &milvuspb.ImportRequest{
         CollectionName: "bad_collection_name",
-        Files:          []string{"f1", "f2", "f3"},
+        Files:          []string{"f1.json"},
     }
     proxy.stateCode.Store(commonpb.StateCode_Healthy)
     resp, err := proxy.Import(context.TODO(), req)
@@ -1726,7 +1726,7 @@ func TestProxy(t *testing.T) {
     defer wg.Done()
     req := &milvuspb.ImportRequest{
         CollectionName: "bad_collection_name",
-        Files:          []string{"f1", "f2", "f3"},
+        Files:          []string{"f1.json"},
     }
     proxy.stateCode.Store(commonpb.StateCode_Healthy)
     resp, err := proxy.Import(context.TODO(), req)
@@ -394,6 +394,12 @@ func (m *importManager) isRowbased(files []string) (bool, error) {
         }
     }
 
+    // for row_based, we only allow one file so that each invocation only generate a task
+    if isRowBased && len(files) > 1 {
+        log.Error("row-based import, only allow one JSON file each time", zap.Strings("files", files))
+        return isRowBased, fmt.Errorf("row-based import, only allow one JSON file each time")
+    }
+
     return isRowBased, nil
 }
 
@@ -459,6 +465,7 @@ func (m *importManager) importJob(ctx context.Context, req *milvuspb.ImportReque
     for i := 0; i < len(req.Files); i++ {
         tID, _, err := m.idAllocator(1)
         if err != nil {
+            log.Error("failed to allocate ID for import task", zap.Error(err))
             return err
         }
         newTask := &datapb.ImportTaskInfo{
@@ -976,22 +983,29 @@ func (m *importManager) listAllTasks(colName string, limit int64) []*milvuspb.Ge
         log.Error("failed to load from task store", zap.Error(err))
         return tasks
     }
-    taskCount := int64(0)
+
+    // filter tasks by collection name
     // TODO: how to handle duplicated collection name? for example: a new collection has same name with a dropped collection
     for _, task := range importTasks {
         if colName != "" && task.GetCollectionName() != colName {
             continue
         }
-        taskCount++
-        if limit > 0 && taskCount > limit {
-            break
-        }
+
         currTask := &milvuspb.GetImportStateResponse{}
         m.copyTaskInfo(task, currTask)
         tasks = append(tasks, currTask)
     }
+
     // arrange tasks by id with ascending order, actually, id is the create time of a task
     rearrangeTasks(tasks)
-    return tasks
+
+    // if limit is 0 or larger than length of tasks, return all tasks
+    if limit <= 0 || limit >= int64(len(tasks)) {
+        return tasks
+    }
+
+    // return the newly tasks from the tail
+    return tasks[len(tasks)-int(limit):]
 }
+
 // removeBadImportSegments marks segments of a failed import task as `dropped`.
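Side note on the listAllTasks rework above: tasks are first sorted by ID in ascending order (task IDs follow creation time) and the limit is then applied by slicing from the tail, so the newest tasks are returned. A small self-contained sketch of that idiom, using plain int64 IDs instead of the actual task structs:

package main

import (
    "fmt"
    "sort"
)

// lastN sorts ids in ascending order and returns at most limit of the largest
// (newest) ones, mirroring the tail slice used by listAllTasks.
func lastN(ids []int64, limit int) []int64 {
    sort.Slice(ids, func(i, j int) bool { return ids[i] < ids[j] })
    if limit <= 0 || limit >= len(ids) {
        return ids
    }
    return ids[len(ids)-limit:]
}

func main() {
    fmt.Println(lastN([]int64{5, 1, 4, 2, 3}, 2)) // [4 5]
}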
@@ -19,7 +19,6 @@ package rootcoord
 import (
     "context"
     "errors"
-    "strconv"
     "sync"
     "testing"
     "time"
@@ -590,6 +589,10 @@ func TestImportManager_ImportJob(t *testing.T) {
     resp = mgr.importJob(context.TODO(), rowReq, colID, 0)
     assert.NotEqual(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
+
+    // row-based import not allow multiple files
+    resp = mgr.importJob(context.TODO(), rowReq, colID, 0)
+    assert.NotEqual(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
 
     importServiceFunc := func(ctx context.Context, req *datapb.ImportTaskRequest) (*datapb.ImportTaskResponse, error) {
         return &datapb.ImportTaskResponse{
             Status: &commonpb.Status{
@@ -600,6 +603,7 @@ func TestImportManager_ImportJob(t *testing.T) {
 
     // row-based case, task count equal to file count
     // since the importServiceFunc return error, tasks will be kept in pending list
+    rowReq.Files = []string{"f1.json"}
     mgr = newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
     resp = mgr.importJob(context.TODO(), rowReq, colID, 0)
     assert.Equal(t, len(rowReq.Files), len(mgr.pendingTasks))
@@ -626,13 +630,13 @@ func TestImportManager_ImportJob(t *testing.T) {
         }, nil
     }
 
-    // row-based case, since the importServiceFunc return success, tasks will be sent to woring list
+    // row-based case, since the importServiceFunc return success, tasks will be sent to working list
     mgr = newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
     resp = mgr.importJob(context.TODO(), rowReq, colID, 0)
     assert.Equal(t, 0, len(mgr.pendingTasks))
     assert.Equal(t, len(rowReq.Files), len(mgr.workingTasks))
 
-    // column-based case, since the importServiceFunc return success, tasks will be sent to woring list
+    // column-based case, since the importServiceFunc return success, tasks will be sent to working list
     mgr = newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
     resp = mgr.importJob(context.TODO(), colReq, colID, 0)
     assert.Equal(t, 0, len(mgr.pendingTasks))
@@ -640,7 +644,7 @@ func TestImportManager_ImportJob(t *testing.T) {
 
     count := 0
     importServiceFunc = func(ctx context.Context, req *datapb.ImportTaskRequest) (*datapb.ImportTaskResponse, error) {
-        if count >= 2 {
+        if count >= 1 {
             return &datapb.ImportTaskResponse{
                 Status: &commonpb.Status{
                     ErrorCode: commonpb.ErrorCode_UnexpectedError,
@@ -655,19 +659,26 @@ func TestImportManager_ImportJob(t *testing.T) {
         }, nil
     }
 
-    // row-based case, since the importServiceFunc return success for 2 tasks
-    // the 2 tasks are sent to working list, and 1 task left in pending list
+    // row-based case, since the importServiceFunc return success for 1 task
+    // the first task is sent to working list, and 1 task left in pending list
     mgr = newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
     resp = mgr.importJob(context.TODO(), rowReq, colID, 0)
-    assert.Equal(t, len(rowReq.Files)-2, len(mgr.pendingTasks))
-    assert.Equal(t, 2, len(mgr.workingTasks))
-
-    // files count exceed MaxPendingCount, return error
-    for i := 0; i <= MaxPendingCount; i++ {
-        rowReq.Files = append(rowReq.Files, strconv.Itoa(i)+".json")
-    }
+    assert.Equal(t, 0, len(mgr.pendingTasks))
+    assert.Equal(t, 1, len(mgr.workingTasks))
     resp = mgr.importJob(context.TODO(), rowReq, colID, 0)
-    assert.NotEqual(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
+    assert.Equal(t, 1, len(mgr.pendingTasks))
+    assert.Equal(t, 1, len(mgr.workingTasks))
+
+    // the pending list already has one task
+    // once task count exceeds MaxPendingCount, return error
+    for i := 0; i <= MaxPendingCount; i++ {
+        resp = mgr.importJob(context.TODO(), rowReq, colID, 0)
+        if i < MaxPendingCount-1 {
+            assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
+        } else {
+            assert.NotEqual(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
+        }
+    }
 }
 
 func TestImportManager_AllDataNodesBusy(t *testing.T) {
@@ -687,7 +698,7 @@ func TestImportManager_AllDataNodesBusy(t *testing.T) {
     rowReq := &milvuspb.ImportRequest{
         CollectionName: "c1",
         PartitionName:  "p1",
-        Files:          []string{"f1.json", "f2.json", "f3.json"},
+        Files:          []string{"f1.json"},
     }
     colReq := &milvuspb.ImportRequest{
         CollectionName: "c1",
@@ -703,7 +714,7 @@ func TestImportManager_AllDataNodesBusy(t *testing.T) {
 
     dnList := []int64{1, 2, 3}
     count := 0
-    fn := func(ctx context.Context, req *datapb.ImportTaskRequest) (*datapb.ImportTaskResponse, error) {
+    importServiceFunc := func(ctx context.Context, req *datapb.ImportTaskRequest) (*datapb.ImportTaskResponse, error) {
         if count < len(dnList) {
             count++
             return &datapb.ImportTaskResponse{
@@ -725,32 +736,44 @@ func TestImportManager_AllDataNodesBusy(t *testing.T) {
             ErrorCode: commonpb.ErrorCode_Success,
         }, nil
     }
-    mgr := newImportManager(context.TODO(), mockKv, idAlloc, fn, callMarkSegmentsDropped, nil, nil, nil, nil)
-    mgr.importJob(context.TODO(), rowReq, colID, 0)
-    assert.Equal(t, 0, len(mgr.pendingTasks))
-    assert.Equal(t, len(rowReq.Files), len(mgr.workingTasks))
-
-    mgr = newImportManager(context.TODO(), mockKv, idAlloc, fn, callMarkSegmentsDropped, nil, nil, nil, nil)
-    mgr.importJob(context.TODO(), rowReq, colID, 0)
+    // each data node owns one task
+    mgr := newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
+    for i := 0; i < len(dnList); i++ {
+        resp := mgr.importJob(context.TODO(), rowReq, colID, 0)
+        assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
+        assert.Equal(t, 0, len(mgr.pendingTasks))
+        assert.Equal(t, i+1, len(mgr.workingTasks))
+    }
+
+    // all data nodes are busy, new task waiting in pending list
+    mgr = newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
+    resp := mgr.importJob(context.TODO(), rowReq, colID, 0)
+    assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
+    assert.Equal(t, len(rowReq.Files), len(mgr.pendingTasks))
+    assert.Equal(t, 0, len(mgr.workingTasks))
 
-    // Reset count.
+    // now all data nodes are free again, new task is executed instantly
     count = 0
-    mgr = newImportManager(context.TODO(), mockKv, idAlloc, fn, callMarkSegmentsDropped, nil, nil, nil, nil)
-    mgr.importJob(context.TODO(), colReq, colID, 0)
+    mgr = newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
+    resp = mgr.importJob(context.TODO(), colReq, colID, 0)
+    assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
     assert.Equal(t, 0, len(mgr.pendingTasks))
     assert.Equal(t, 1, len(mgr.workingTasks))
 
-    mgr.importJob(context.TODO(), colReq, colID, 0)
+    resp = mgr.importJob(context.TODO(), colReq, colID, 0)
+    assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
     assert.Equal(t, 0, len(mgr.pendingTasks))
     assert.Equal(t, 2, len(mgr.workingTasks))
 
-    mgr.importJob(context.TODO(), colReq, colID, 0)
+    resp = mgr.importJob(context.TODO(), colReq, colID, 0)
+    assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
     assert.Equal(t, 0, len(mgr.pendingTasks))
     assert.Equal(t, 3, len(mgr.workingTasks))
 
-    mgr.importJob(context.TODO(), colReq, colID, 0)
+    // all data nodes are busy now, new task is pending
+    resp = mgr.importJob(context.TODO(), colReq, colID, 0)
+    assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
     assert.Equal(t, 1, len(mgr.pendingTasks))
     assert.Equal(t, 3, len(mgr.workingTasks))
 }
@@ -769,7 +792,7 @@ func TestImportManager_TaskState(t *testing.T) {
     colID := int64(100)
     mockKv := &kv.MockMetaKV{}
     mockKv.InMemKv = sync.Map{}
-    fn := func(ctx context.Context, req *datapb.ImportTaskRequest) (*datapb.ImportTaskResponse, error) {
+    importServiceFunc := func(ctx context.Context, req *datapb.ImportTaskRequest) (*datapb.ImportTaskResponse, error) {
         return &datapb.ImportTaskResponse{
             Status: &commonpb.Status{
                 ErrorCode: commonpb.ErrorCode_Success,
@@ -780,7 +803,7 @@ func TestImportManager_TaskState(t *testing.T) {
     rowReq := &milvuspb.ImportRequest{
         CollectionName: "c1",
         PartitionName:  "p1",
-        Files:          []string{"f1.json", "f2.json", "f3.json"},
+        Files:          []string{"f1.json"},
     }
 
     callMarkSegmentsDropped := func(ctx context.Context, segIDs []typeutil.UniqueID) (*commonpb.Status, error) {
@@ -789,7 +812,12 @@ func TestImportManager_TaskState(t *testing.T) {
         }, nil
     }
 
-    mgr := newImportManager(context.TODO(), mockKv, idAlloc, fn, callMarkSegmentsDropped, nil, nil, nil, nil)
+    // add 3 tasks, their ID is 10000, 10001, 10002, make sure updateTaskInfo() works correctly
+    mgr := newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
     mgr.importJob(context.TODO(), rowReq, colID, 0)
+    rowReq.Files = []string{"f2.json"}
+    mgr.importJob(context.TODO(), rowReq, colID, 0)
+    rowReq.Files = []string{"f3.json"}
+    mgr.importJob(context.TODO(), rowReq, colID, 0)
 
     info := &rootcoordpb.ImportResult{
@@ -865,7 +893,7 @@ func TestImportManager_AllocFail(t *testing.T) {
     colID := int64(100)
     mockKv := &kv.MockMetaKV{}
     mockKv.InMemKv = sync.Map{}
-    fn := func(ctx context.Context, req *datapb.ImportTaskRequest) (*datapb.ImportTaskResponse, error) {
+    importServiceFunc := func(ctx context.Context, req *datapb.ImportTaskRequest) (*datapb.ImportTaskResponse, error) {
         return &datapb.ImportTaskResponse{
             Status: &commonpb.Status{
                 ErrorCode: commonpb.ErrorCode_Success,
@@ -876,7 +904,7 @@ func TestImportManager_AllocFail(t *testing.T) {
     rowReq := &milvuspb.ImportRequest{
         CollectionName: "c1",
         PartitionName:  "p1",
-        Files:          []string{"f1.json", "f2.json", "f3.json"},
+        Files:          []string{"f1.json"},
     }
 
     callMarkSegmentsDropped := func(ctx context.Context, segIDs []typeutil.UniqueID) (*commonpb.Status, error) {
@@ -884,8 +912,10 @@ func TestImportManager_AllocFail(t *testing.T) {
             ErrorCode: commonpb.ErrorCode_Success,
         }, nil
     }
-    mgr := newImportManager(context.TODO(), mockKv, idAlloc, fn, callMarkSegmentsDropped, nil, nil, nil, nil)
-    mgr.importJob(context.TODO(), rowReq, colID, 0)
+    mgr := newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
+    resp := mgr.importJob(context.TODO(), rowReq, colID, 0)
+    assert.NotEqual(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
+    assert.Equal(t, 0, len(mgr.pendingTasks))
 }
 
 func TestImportManager_ListAllTasks(t *testing.T) {
@@ -916,7 +946,7 @@ func TestImportManager_ListAllTasks(t *testing.T) {
     rowReq := &milvuspb.ImportRequest{
         CollectionName: "c1",
         PartitionName:  "p1",
-        Files:          []string{"f1.json", "f2.json", "f3.json"},
+        Files:          []string{"f1.json"},
     }
     callMarkSegmentsDropped := func(ctx context.Context, segIDs []typeutil.UniqueID) (*commonpb.Status, error) {
         return &commonpb.Status{
@@ -924,10 +954,25 @@ func TestImportManager_ListAllTasks(t *testing.T) {
         }, nil
     }
     mgr := newImportManager(context.TODO(), mockKv, idAlloc, fn, callMarkSegmentsDropped, nil, nil, nil, nil)
-    mgr.importJob(context.TODO(), rowReq, colID, 0)
+    repeat := 10
+    for i := 0; i < repeat; i++ {
+        mgr.importJob(context.TODO(), rowReq, colID, 0)
+    }
 
-    tasks := mgr.listAllTasks("", 100)
-    assert.Equal(t, len(rowReq.Files), len(tasks))
+    // list all tasks
+    tasks := mgr.listAllTasks("", int64(repeat))
+    assert.Equal(t, repeat, len(tasks))
+    for i := 0; i < repeat; i++ {
+        assert.Equal(t, int64(i+1), tasks[i].Id)
+    }
+
+    // list few tasks
+    limit := 3
+    tasks = mgr.listAllTasks("", int64(limit))
+    assert.Equal(t, limit, len(tasks))
+    for i := 0; i < limit; i++ {
+        assert.Equal(t, int64(i+repeat-limit+1), tasks[i].Id)
+    }
 
     resp := mgr.getTaskState(1)
     assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
@@ -943,13 +988,10 @@ func TestImportManager_ListAllTasks(t *testing.T) {
         }, nil
     }
 
+    // there are 10 tasks in working list, and 1 task in pending list, totally 11 tasks
     mgr.importJob(context.TODO(), rowReq, colID, 0)
     tasks = mgr.listAllTasks("", 100)
-    assert.Equal(t, len(rowReq.Files)*2, len(tasks))
-    tasks = mgr.listAllTasks("", 1)
-    assert.Equal(t, 1, len(tasks))
-    tasks = mgr.listAllTasks("bad-collection-name", 1)
-    assert.Equal(t, 0, len(tasks))
+    assert.Equal(t, repeat+1, len(tasks))
 
     // the id of tasks must be 1,2,3,4,5,6(sequence not guaranteed)
     ids := make(map[int64]struct{})
@@ -960,6 +1002,14 @@ func TestImportManager_ListAllTasks(t *testing.T) {
         delete(ids, tasks[i].Id)
     }
     assert.Equal(t, 0, len(ids))
+
+    // list few tasks
+    tasks = mgr.listAllTasks("", 1)
+    assert.Equal(t, 1, len(tasks))
+
+    // invliad collection name, returns empty
+    tasks = mgr.listAllTasks("bad-collection-name", 1)
+    assert.Equal(t, 0, len(tasks))
 }
 
 func TestImportManager_setCollectionPartitionName(t *testing.T) {
@@ -1009,11 +1059,16 @@ func TestImportManager_rearrangeTasks(t *testing.T) {
 func TestImportManager_isRowbased(t *testing.T) {
     mgr := &importManager{}
 
-    files := []string{"1.json", "2.json"}
+    files := []string{"1.json"}
     rb, err := mgr.isRowbased(files)
     assert.Nil(t, err)
     assert.True(t, rb)
 
+    files = []string{"1.json", "2.json"}
+    rb, err = mgr.isRowbased(files)
+    assert.NotNil(t, err)
+    assert.True(t, rb)
+
     files = []string{"1.json", "2.npy"}
     rb, err = mgr.isRowbased(files)
     assert.NotNil(t, err)
@@ -419,13 +419,14 @@ func tryFlushBlocks(ctx context.Context,
     // if segment size is larger than predefined blockSize, flush to create a new binlog file
     // initialize a new FieldData list for next round batch read
     if size > int(blockSize) && rowCount > 0 {
-        printFieldsDataInfo(blockData, "import util: prepare to flush block larger than maxBlockSize", nil)
+        printFieldsDataInfo(blockData, "import util: prepare to flush block larger than blockSize", nil)
         err := callFlushFunc(blockData, i)
         if err != nil {
             log.Error("Import util: failed to flush block data", zap.Int("shardID", i))
             return err
         }
-        log.Info("Import util: block size exceed limit and flush", zap.Int("rowCount", rowCount), zap.Int("size", size), zap.Int("shardID", i))
+        log.Info("Import util: block size exceed limit and flush", zap.Int("rowCount", rowCount),
+            zap.Int("size", size), zap.Int("shardID", i), zap.Int64("blockSize", blockSize))
 
         blocksData[i] = initSegmentData(collectionSchema)
         if blocksData[i] == nil {
@@ -264,11 +264,10 @@ func (p *ImportWrapper) fileValidation(filePaths []string) (bool, error) {
     fileNames[name] = struct{}{}
 
     // check file size, single file size cannot exceed MaxFileSize
-    // TODO add context
-    size, err := p.chunkManager.Size(context.TODO(), filePath)
+    size, err := p.chunkManager.Size(p.ctx, filePath)
     if err != nil {
         log.Error("import wrapper: failed to get file size", zap.String("filePath", filePath), zap.Error(err))
-        return rowBased, fmt.Errorf("import wrapper: failed to get file size of '%s'", filePath)
+        return rowBased, fmt.Errorf("import wrapper: failed to get file size of '%s', make sure the input path is related path, error:%w", filePath, err)
     }
 
     // empty file
@@ -312,7 +311,6 @@ func (p *ImportWrapper) Import(filePaths []string, options ImportOptions) error
     // data restore function to import milvus native binlog files(for backup/restore tools)
     // the backup/restore tool provide two paths for a partition, the first path is binlog path, the second is deltalog path
     if p.isBinlogImport(filePaths) {
-        // TODO: handle the timestamp end point passed from client side, currently use math.MaxUint64
         return p.doBinlogImport(filePaths, options.TsStartPoint, options.TsEndPoint)
     }
 
@@ -454,23 +452,51 @@ func (p *ImportWrapper) reportPersisted() error {
 // isBinlogImport is to judge whether it is binlog import operation
 // For internal usage by the restore tool: https://github.com/zilliztech/milvus-backup
 // This tool exports data from a milvus service, and call bulkload interface to import native data into another milvus service.
-// This tool provides two paths: one is data log path of a partition,the other is delta log path of this partition.
+// This tool provides two paths: one is insert log path of a partition,the other is delta log path of this partition.
 // This method checks the filePaths, if the file paths is exist and not a file, we say it is native import.
 func (p *ImportWrapper) isBinlogImport(filePaths []string) bool {
-    // must contains the insert log path, and the delta log path is optional
-    if len(filePaths) != 1 && len(filePaths) != 2 {
-        log.Info("import wrapper: paths count is not 1 or 2, not binlog import", zap.Int("len", len(filePaths)))
+    // must contains the insert log path, and the delta log path is optional to be empty string
+    if len(filePaths) != 2 {
+        log.Info("import wrapper: paths count is not 2, not binlog import", zap.Int("len", len(filePaths)))
         return false
     }
 
-    for i := 0; i < len(filePaths); i++ {
-        filePath := filePaths[i]
-        _, fileType := GetFileNameAndExt(filePath)
+    checkFunc := func(filePath string) bool {
+        // contains file extension, is not a path
+        _, fileType := GetFileNameAndExt(filePath)
         if len(fileType) != 0 {
             log.Info("import wrapper: not a path, not binlog import", zap.String("filePath", filePath), zap.String("fileType", fileType))
             return false
         }
+
+        // check path existence
+        exist, err := p.chunkManager.Exist(p.ctx, filePath)
+        if err != nil {
+            log.Error("import wrapper: failed to check the path existence, not binlog import", zap.String("filePath", filePath), zap.Error(err))
+            return false
+        }
+        if !exist {
+            log.Info("import wrapper: the input path doesn't exist, not binlog import", zap.String("filePath", filePath))
+            return false
+        }
+        return true
     }
 
+    // the first path is insert log path
+    filePath := filePaths[0]
+    if len(filePath) == 0 {
+        log.Info("import wrapper: the first path is empty string, not binlog import")
+        return false
+    }
+
+    if !checkFunc(filePath) {
+        return false
+    }
+
+    // the second path is delta log path
+    filePath = filePaths[1]
+    if len(filePath) > 0 && !checkFunc(filePath) {
+        return false
+    }
 
     log.Info("import wrapper: do binlog import")
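Side note on the isBinlogImport rework above: a binlog (backup/restore) import now always passes exactly two paths, the partition's insert log path first and its delta log path second, where the delta log path may be an empty string. A small sketch of a caller-side check mirroring that convention; the function and paths are illustrative, not part of the repo:

package main

import (
    "errors"
    "fmt"
)

// validateBinlogPaths mirrors, on the client side, the convention the import
// wrapper now enforces: exactly two entries, a non-empty insert log path first,
// then a delta log path that may be "".
func validateBinlogPaths(paths []string) error {
    if len(paths) != 2 {
        return errors.New("binlog import expects exactly two paths: insert log path and delta log path")
    }
    if paths[0] == "" {
        return errors.New("insert log path must not be empty")
    }
    return nil
}

func main() {
    fmt.Println(validateBinlogPaths([]string{"backup/insert_log/partition1", ""})) // <nil>
    fmt.Println(validateBinlogPaths([]string{"backup/insert_log/partition1"}))     // error: expects exactly two paths
}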
@@ -501,12 +527,9 @@ func (p *ImportWrapper) doBinlogImport(filePaths []string, tsStartPoint uint64,
 func (p *ImportWrapper) parseRowBasedJSON(filePath string, onlyValidate bool) error {
     tr := timerecord.NewTimeRecorder("json row-based parser: " + filePath)
 
-    ctx, cancel := context.WithCancel(context.Background())
-    defer cancel()
-
     // for minio storage, chunkManager will download file into local memory
     // for local storage, chunkManager open the file directly
-    file, err := p.chunkManager.Reader(ctx, filePath)
+    file, err := p.chunkManager.Reader(p.ctx, filePath)
     if err != nil {
         return err
     }
@@ -553,13 +576,11 @@ func (p *ImportWrapper) parseColumnBasedNumpy(filePath string, onlyValidate bool
     combineFunc func(fields map[storage.FieldID]storage.FieldData) error) error {
     tr := timerecord.NewTimeRecorder("numpy parser: " + filePath)
 
-    ctx, cancel := context.WithCancel(context.Background())
-    defer cancel()
     fileName, _ := GetFileNameAndExt(filePath)
 
     // for minio storage, chunkManager will download file into local memory
     // for local storage, chunkManager open the file directly
-    file, err := p.chunkManager.Reader(ctx, filePath)
+    file, err := p.chunkManager.Reader(p.ctx, filePath)
     if err != nil {
         return err
     }
@@ -721,7 +742,7 @@ func (p *ImportWrapper) splitFieldsData(fieldsData map[storage.FieldID]storage.F
             return fmt.Errorf("import wrapper: field '%s' row count %d is not equal to other fields row count: %d", name, count, rowCount)
         }
     }
-    log.Info("import wrapper: try to split a block with row count", zap.Int("rowCount", rowCount), zap.Any("rowCountOfEachField", rowCounter))
+    log.Info("import wrapper: try to split a block with row count", zap.Int("rowCount", rowCount))
 
     primaryData, ok := fieldsData[primaryKey.GetFieldID()]
     if !ok {
@@ -37,7 +37,6 @@ import (
     "github.com/milvus-io/milvus/internal/proto/datapb"
     "github.com/milvus-io/milvus/internal/proto/rootcoordpb"
     "github.com/milvus-io/milvus/internal/storage"
-    "github.com/milvus-io/milvus/internal/util/dependency"
     "github.com/milvus-io/milvus/internal/util/timerecord"
 )
 
@@ -173,7 +172,9 @@ func createMockCallbackFunctions(t *testing.T, rowCounter *rowCounterTest) (Assi
 }
 
 func Test_NewImportWrapper(t *testing.T) {
-    f := dependency.NewDefaultFactory(true)
+    // NewDefaultFactory() use "/tmp/milvus" as default root path, and cannot specify root path
+    // NewChunkManagerFactory() can specify the root path
+    f := storage.NewChunkManagerFactory("local", storage.RootPath(TempFilesPath))
     ctx := context.Background()
     cm, err := f.NewPersistentStorageChunkManager(ctx)
     assert.NoError(t, err)
@@ -226,7 +227,9 @@ func Test_ImportWrapperRowBased(t *testing.T) {
     assert.Nil(t, err)
     defer os.RemoveAll(TempFilesPath)
 
-    f := dependency.NewDefaultFactory(true)
+    // NewDefaultFactory() use "/tmp/milvus" as default root path, and cannot specify root path
+    // NewChunkManagerFactory() can specify the root path
+    f := storage.NewChunkManagerFactory("local", storage.RootPath(TempFilesPath))
     ctx := context.Background()
     cm, err := f.NewPersistentStorageChunkManager(ctx)
     assert.NoError(t, err)
@@ -388,7 +391,9 @@ func Test_ImportWrapperColumnBased_numpy(t *testing.T) {
     assert.Nil(t, err)
     defer os.RemoveAll(TempFilesPath)
 
-    f := dependency.NewDefaultFactory(true)
+    // NewDefaultFactory() use "/tmp/milvus" as default root path, and cannot specify root path
+    // NewChunkManagerFactory() can specify the root path
+    f := storage.NewChunkManagerFactory("local", storage.RootPath(TempFilesPath))
     ctx := context.Background()
     cm, err := f.NewPersistentStorageChunkManager(ctx)
     assert.NoError(t, err)
@@ -485,7 +490,9 @@ func Test_ImportWrapperRowBased_perf(t *testing.T) {
     assert.Nil(t, err)
    defer os.RemoveAll(TempFilesPath)
 
-    f := dependency.NewDefaultFactory(true)
+    // NewDefaultFactory() use "/tmp/milvus" as default root path, and cannot specify root path
+    // NewChunkManagerFactory() can specify the root path
+    f := storage.NewChunkManagerFactory("local", storage.RootPath(TempFilesPath))
     ctx := context.Background()
     cm, err := f.NewPersistentStorageChunkManager(ctx)
     assert.NoError(t, err)
@@ -764,7 +771,9 @@ func Test_ImportWrapperReportFailRowBased(t *testing.T) {
     assert.Nil(t, err)
     defer os.RemoveAll(TempFilesPath)
 
-    f := dependency.NewDefaultFactory(true)
+    // NewDefaultFactory() use "/tmp/milvus" as default root path, and cannot specify root path
+    // NewChunkManagerFactory() can specify the root path
+    f := storage.NewChunkManagerFactory("local", storage.RootPath(TempFilesPath))
     ctx := context.Background()
     cm, err := f.NewPersistentStorageChunkManager(ctx)
     assert.NoError(t, err)
@@ -824,7 +833,9 @@ func Test_ImportWrapperReportFailColumnBased_numpy(t *testing.T) {
     assert.Nil(t, err)
     defer os.RemoveAll(TempFilesPath)
 
-    f := dependency.NewDefaultFactory(true)
+    // NewDefaultFactory() use "/tmp/milvus" as default root path, and cannot specify root path
+    // NewChunkManagerFactory() can specify the root path
+    f := storage.NewChunkManagerFactory("local", storage.RootPath(TempFilesPath))
     ctx := context.Background()
     cm, err := f.NewPersistentStorageChunkManager(ctx)
     assert.NoError(t, err)
@@ -867,11 +878,16 @@ func Test_ImportWrapperReportFailColumnBased_numpy(t *testing.T) {
 }
 
 func Test_ImportWrapperIsBinlogImport(t *testing.T) {
-    ctx := context.Background()
+    err := os.MkdirAll(TempFilesPath, os.ModePerm)
+    assert.Nil(t, err)
+    defer os.RemoveAll(TempFilesPath)
 
-    cm := &MockChunkManager{
-        size: 1,
-    }
+    // NewDefaultFactory() use "/tmp/milvus" as default root path, and cannot specify root path
+    // NewChunkManagerFactory() can specify the root path
+    f := storage.NewChunkManagerFactory("local", storage.RootPath(TempFilesPath))
+    ctx := context.Background()
+    cm, err := f.NewPersistentStorageChunkManager(ctx)
+    assert.NoError(t, err)
 
     idAllocator := newIDAllocator(ctx, t, nil)
     schema := perfSchema(128)
@@ -902,13 +918,43 @@ func Test_ImportWrapperIsBinlogImport(t *testing.T) {
     b = wrapper.isBinlogImport(paths)
     assert.False(t, b)
 
-    // success
+    // path doesn't exist
     paths = []string{
-        "/tmp",
-        "/tmp",
+        "path1",
+        "path2",
     }
     b = wrapper.isBinlogImport(paths)
+    assert.False(t, b)
+
+    // insert log path is created, but delta log path doesn't exist
+    err = os.MkdirAll(TempFilesPath+paths[0], os.ModePerm)
+    assert.NoError(t, err)
+
+    b = wrapper.isBinlogImport(paths)
+    assert.False(t, b)
+
+    // both the two path are created, success
+    err = os.MkdirAll(TempFilesPath+paths[1], os.ModePerm)
+    assert.NoError(t, err)
+
+    b = wrapper.isBinlogImport(paths)
     assert.True(t, b)
+
+    // the delta log path is empty, success
+    paths = []string{
+        "path1",
+        "",
+    }
+    b = wrapper.isBinlogImport(paths)
+    assert.True(t, b)
+
+    // path is empty string
+    paths = []string{
+        "",
+        "",
+    }
+    b = wrapper.isBinlogImport(paths)
+    assert.False(t, b)
 }
 
 func Test_ImportWrapperDoBinlogImport(t *testing.T) {
@@ -1022,7 +1068,7 @@ func Test_ImportWrapperSplitFieldsData(t *testing.T) {
     err := wrapper.splitFieldsData(nil, 0)
     assert.NotNil(t, err)
 
-    // split 100 rows to 4 blocks
+    // split 100 rows to 4 blocks, success
     rowCount := 100
     input := initSegmentData(schema)
     for j := 0; j < rowCount; j++ {
@@ -1039,6 +1085,12 @@ func Test_ImportWrapperSplitFieldsData(t *testing.T) {
     assert.Equal(t, 4, rowCounter.callTime)
     assert.Equal(t, rowCount, rowCounter.rowCount)
 
+    // alloc id failed
+    wrapper.rowIDAllocator = newIDAllocator(ctx, t, errors.New("error"))
+    err = wrapper.splitFieldsData(input, 512)
+    assert.NotNil(t, err)
+    wrapper.rowIDAllocator = newIDAllocator(ctx, t, nil)
+
     // row count of fields are unequal
     schema.Fields[0].AutoID = false
     input = initSegmentData(schema)
@@ -1053,4 +1105,29 @@ func Test_ImportWrapperSplitFieldsData(t *testing.T) {
     }
     err = wrapper.splitFieldsData(input, 512)
     assert.NotNil(t, err)
+
+    // primary key not found
+    wrapper.collectionSchema.Fields[0].IsPrimaryKey = false
+    err = wrapper.splitFieldsData(input, 512)
+    assert.NotNil(t, err)
+    wrapper.collectionSchema.Fields[0].IsPrimaryKey = true
+
+    // primary key is varchar, success
+    wrapper.collectionSchema.Fields[0].DataType = schemapb.DataType_VarChar
+    input = initSegmentData(schema)
+    for j := 0; j < rowCount; j++ {
+        pkField := input[101].(*storage.StringFieldData)
+        pkField.Data = append(pkField.Data, strconv.FormatInt(int64(j), 10))
+
+        flagField := input[102].(*storage.BoolFieldData)
+        flagField.Data = append(flagField.Data, true)
+    }
+    rowCounter.callTime = 0
+    rowCounter.rowCount = 0
+    importResult.AutoIds = []int64{}
+    err = wrapper.splitFieldsData(input, 1024)
+    assert.Nil(t, err)
+    assert.Equal(t, 0, len(importResult.AutoIds))
+    assert.Equal(t, 2, rowCounter.callTime)
+    assert.Equal(t, rowCount, rowCounter.rowCount)
 }