mirror of https://github.com/milvus-io/milvus.git
Only allow one file for row-based import (#20234)
Signed-off-by: yhmo <yihua.mo@zilliz.com>
pull/20261/head
parent de45db0475
commit cd3b3d3da6
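Note on the change below: with this commit a row-based (JSON) import request may carry only one file, so each request maps to exactly one import task. The following is a minimal caller-side sketch, not part of the commit, assuming the milvuspb/commonpb types shown in the diff, a hypothetical helper package name, and Go proto import paths that may differ between Milvus versions:

// Package importhelper is a hypothetical helper package; it is not part of the milvus repo.
package importhelper

import (
    "context"
    "fmt"

    // proto packages as used around Milvus v2.2; adjust the paths to your version
    "github.com/milvus-io/milvus-proto/go-api/commonpb"
    "github.com/milvus-io/milvus-proto/go-api/milvuspb"
)

// importer is a hypothetical stand-in for whatever exposes the Import RPC
// (the proxy in this repo, or an SDK client with the same signature).
type importer interface {
    Import(ctx context.Context, req *milvuspb.ImportRequest) (*milvuspb.ImportResponse, error)
}

// importRowBasedFiles submits each JSON file as its own ImportRequest, since a
// single row-based request carrying multiple files is now rejected.
func importRowBasedFiles(ctx context.Context, c importer, collection string, jsonFiles []string) error {
    for _, f := range jsonFiles {
        req := &milvuspb.ImportRequest{
            CollectionName: collection,
            Files:          []string{f}, // exactly one JSON file per request
        }
        resp, err := c.Import(ctx, req)
        if err != nil {
            return err
        }
        if resp.GetStatus().GetErrorCode() != commonpb.ErrorCode_Success {
            return fmt.Errorf("import of %s failed: %s", f, resp.GetStatus().GetReason())
        }
    }
    return nil
}

Column-based (numpy) imports are not affected by this limit; only files with the .json extension put a request onto the row-based path.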
@@ -1698,7 +1698,7 @@ func TestProxy(t *testing.T) {
     defer wg.Done()
     req := &milvuspb.ImportRequest{
         CollectionName: collectionName,
-        Files:          []string{"f1.json", "f2.json", "f3.json"},
+        Files:          []string{"f1.json"},
     }
     proxy.stateCode.Store(commonpb.StateCode_Healthy)
     resp, err := proxy.Import(context.TODO(), req)
@@ -1713,7 +1713,7 @@ func TestProxy(t *testing.T) {
     defer wg.Done()
     req := &milvuspb.ImportRequest{
         CollectionName: "bad_collection_name",
-        Files:          []string{"f1", "f2", "f3"},
+        Files:          []string{"f1.json"},
     }
     proxy.stateCode.Store(commonpb.StateCode_Healthy)
     resp, err := proxy.Import(context.TODO(), req)
@@ -1726,7 +1726,7 @@ func TestProxy(t *testing.T) {
     defer wg.Done()
     req := &milvuspb.ImportRequest{
         CollectionName: "bad_collection_name",
-        Files:          []string{"f1", "f2", "f3"},
+        Files:          []string{"f1.json"},
     }
     proxy.stateCode.Store(commonpb.StateCode_Healthy)
     resp, err := proxy.Import(context.TODO(), req)
@@ -394,6 +394,12 @@ func (m *importManager) isRowbased(files []string) (bool, error) {
         }
     }
 
+    // for row_based, we only allow one file so that each invocation only generate a task
+    if isRowBased && len(files) > 1 {
+        log.Error("row-based import, only allow one JSON file each time", zap.Strings("files", files))
+        return isRowBased, fmt.Errorf("row-based import, only allow one JSON file each time")
+    }
+
     return isRowBased, nil
 }
 
@@ -459,6 +465,7 @@ func (m *importManager) importJob(ctx context.Context, req *milvuspb.ImportReque
     for i := 0; i < len(req.Files); i++ {
         tID, _, err := m.idAllocator(1)
         if err != nil {
+            log.Error("failed to allocate ID for import task", zap.Error(err))
             return err
         }
         newTask := &datapb.ImportTaskInfo{
@@ -976,22 +983,29 @@ func (m *importManager) listAllTasks(colName string, limit int64) []*milvuspb.Ge
         log.Error("failed to load from task store", zap.Error(err))
         return tasks
     }
-    taskCount := int64(0)
+
+    // filter tasks by collection name
     // TODO: how to handle duplicated collection name? for example: a new collection has same name with a dropped collection
     for _, task := range importTasks {
         if colName != "" && task.GetCollectionName() != colName {
             continue
         }
-        taskCount++
-        if limit > 0 && taskCount > limit {
-            break
-        }
+
         currTask := &milvuspb.GetImportStateResponse{}
         m.copyTaskInfo(task, currTask)
         tasks = append(tasks, currTask)
     }
+
     // arrange tasks by id with ascending order, actually, id is the create time of a task
     rearrangeTasks(tasks)
-    return tasks
+
+    // if limit is 0 or larger than length of tasks, return all tasks
+    if limit <= 0 || limit >= int64(len(tasks)) {
+        return tasks
+    }
+
+    // return the newly tasks from the tail
+    return tasks[len(tasks)-int(limit):]
 }
+
 // removeBadImportSegments marks segments of a failed import task as `dropped`.
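Side note on the listAllTasks rework above: tasks are first sorted by ID in ascending order (task IDs follow creation time) and the limit is then applied by slicing from the tail, so the newest tasks are returned. A small self-contained sketch of that idiom, using plain int64 IDs instead of the actual task structs:

package main

import (
    "fmt"
    "sort"
)

// lastN sorts ids in ascending order and returns at most limit of the largest
// (newest) ones, mirroring the tail slice used by listAllTasks.
func lastN(ids []int64, limit int) []int64 {
    sort.Slice(ids, func(i, j int) bool { return ids[i] < ids[j] })
    if limit <= 0 || limit >= len(ids) {
        return ids
    }
    return ids[len(ids)-limit:]
}

func main() {
    fmt.Println(lastN([]int64{5, 1, 4, 2, 3}, 2)) // [4 5]
}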
@@ -19,7 +19,6 @@ package rootcoord
 import (
     "context"
     "errors"
-    "strconv"
     "sync"
     "testing"
     "time"
@@ -590,6 +589,10 @@ func TestImportManager_ImportJob(t *testing.T) {
     resp = mgr.importJob(context.TODO(), rowReq, colID, 0)
     assert.NotEqual(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
+
+    // row-based import not allow multiple files
+    resp = mgr.importJob(context.TODO(), rowReq, colID, 0)
+    assert.NotEqual(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
 
     importServiceFunc := func(ctx context.Context, req *datapb.ImportTaskRequest) (*datapb.ImportTaskResponse, error) {
         return &datapb.ImportTaskResponse{
             Status: &commonpb.Status{
@@ -600,6 +603,7 @@ func TestImportManager_ImportJob(t *testing.T) {
 
     // row-based case, task count equal to file count
     // since the importServiceFunc return error, tasks will be kept in pending list
+    rowReq.Files = []string{"f1.json"}
     mgr = newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
     resp = mgr.importJob(context.TODO(), rowReq, colID, 0)
     assert.Equal(t, len(rowReq.Files), len(mgr.pendingTasks))
@@ -626,13 +630,13 @@ func TestImportManager_ImportJob(t *testing.T) {
         }, nil
     }
 
-    // row-based case, since the importServiceFunc return success, tasks will be sent to woring list
+    // row-based case, since the importServiceFunc return success, tasks will be sent to working list
     mgr = newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
     resp = mgr.importJob(context.TODO(), rowReq, colID, 0)
     assert.Equal(t, 0, len(mgr.pendingTasks))
     assert.Equal(t, len(rowReq.Files), len(mgr.workingTasks))
 
-    // column-based case, since the importServiceFunc return success, tasks will be sent to woring list
+    // column-based case, since the importServiceFunc return success, tasks will be sent to working list
     mgr = newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
     resp = mgr.importJob(context.TODO(), colReq, colID, 0)
     assert.Equal(t, 0, len(mgr.pendingTasks))
@@ -640,7 +644,7 @@ func TestImportManager_ImportJob(t *testing.T) {
 
     count := 0
     importServiceFunc = func(ctx context.Context, req *datapb.ImportTaskRequest) (*datapb.ImportTaskResponse, error) {
-        if count >= 2 {
+        if count >= 1 {
             return &datapb.ImportTaskResponse{
                 Status: &commonpb.Status{
                     ErrorCode: commonpb.ErrorCode_UnexpectedError,
@@ -655,19 +659,26 @@ func TestImportManager_ImportJob(t *testing.T) {
         }, nil
     }
 
-    // row-based case, since the importServiceFunc return success for 2 tasks
-    // the 2 tasks are sent to working list, and 1 task left in pending list
+    // row-based case, since the importServiceFunc return success for 1 task
+    // the first task is sent to working list, and 1 task left in pending list
     mgr = newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
     resp = mgr.importJob(context.TODO(), rowReq, colID, 0)
-    assert.Equal(t, len(rowReq.Files)-2, len(mgr.pendingTasks))
-    assert.Equal(t, 2, len(mgr.workingTasks))
-
-    // files count exceed MaxPendingCount, return error
-    for i := 0; i <= MaxPendingCount; i++ {
-        rowReq.Files = append(rowReq.Files, strconv.Itoa(i)+".json")
-    }
+    assert.Equal(t, 0, len(mgr.pendingTasks))
+    assert.Equal(t, 1, len(mgr.workingTasks))
     resp = mgr.importJob(context.TODO(), rowReq, colID, 0)
-    assert.NotEqual(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
+    assert.Equal(t, 1, len(mgr.pendingTasks))
+    assert.Equal(t, 1, len(mgr.workingTasks))
+
+    // the pending list already has one task
+    // once task count exceeds MaxPendingCount, return error
+    for i := 0; i <= MaxPendingCount; i++ {
+        resp = mgr.importJob(context.TODO(), rowReq, colID, 0)
+        if i < MaxPendingCount-1 {
+            assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
+        } else {
+            assert.NotEqual(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
+        }
+    }
 }
 
 func TestImportManager_AllDataNodesBusy(t *testing.T) {
@@ -687,7 +698,7 @@ func TestImportManager_AllDataNodesBusy(t *testing.T) {
     rowReq := &milvuspb.ImportRequest{
         CollectionName: "c1",
         PartitionName:  "p1",
-        Files:          []string{"f1.json", "f2.json", "f3.json"},
+        Files:          []string{"f1.json"},
     }
     colReq := &milvuspb.ImportRequest{
         CollectionName: "c1",
@@ -703,7 +714,7 @@ func TestImportManager_AllDataNodesBusy(t *testing.T) {
 
     dnList := []int64{1, 2, 3}
     count := 0
-    fn := func(ctx context.Context, req *datapb.ImportTaskRequest) (*datapb.ImportTaskResponse, error) {
+    importServiceFunc := func(ctx context.Context, req *datapb.ImportTaskRequest) (*datapb.ImportTaskResponse, error) {
         if count < len(dnList) {
             count++
             return &datapb.ImportTaskResponse{
@@ -725,32 +736,44 @@ func TestImportManager_AllDataNodesBusy(t *testing.T) {
             ErrorCode: commonpb.ErrorCode_Success,
         }, nil
     }
-    mgr := newImportManager(context.TODO(), mockKv, idAlloc, fn, callMarkSegmentsDropped, nil, nil, nil, nil)
-    mgr.importJob(context.TODO(), rowReq, colID, 0)
-    assert.Equal(t, 0, len(mgr.pendingTasks))
-    assert.Equal(t, len(rowReq.Files), len(mgr.workingTasks))
-
-    mgr = newImportManager(context.TODO(), mockKv, idAlloc, fn, callMarkSegmentsDropped, nil, nil, nil, nil)
-    mgr.importJob(context.TODO(), rowReq, colID, 0)
+    // each data node owns one task
+    mgr := newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
+    for i := 0; i < len(dnList); i++ {
+        resp := mgr.importJob(context.TODO(), rowReq, colID, 0)
+        assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
+        assert.Equal(t, 0, len(mgr.pendingTasks))
+        assert.Equal(t, i+1, len(mgr.workingTasks))
+    }
+
+    // all data nodes are busy, new task waiting in pending list
+    mgr = newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
+    resp := mgr.importJob(context.TODO(), rowReq, colID, 0)
+    assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
+    assert.Equal(t, len(rowReq.Files), len(mgr.pendingTasks))
+    assert.Equal(t, 0, len(mgr.workingTasks))
 
-    // Reset count.
+    // now all data nodes are free again, new task is executed instantly
     count = 0
-    mgr = newImportManager(context.TODO(), mockKv, idAlloc, fn, callMarkSegmentsDropped, nil, nil, nil, nil)
-    mgr.importJob(context.TODO(), colReq, colID, 0)
+    mgr = newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
+    resp = mgr.importJob(context.TODO(), colReq, colID, 0)
+    assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
     assert.Equal(t, 0, len(mgr.pendingTasks))
     assert.Equal(t, 1, len(mgr.workingTasks))
 
-    mgr.importJob(context.TODO(), colReq, colID, 0)
+    resp = mgr.importJob(context.TODO(), colReq, colID, 0)
+    assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
     assert.Equal(t, 0, len(mgr.pendingTasks))
     assert.Equal(t, 2, len(mgr.workingTasks))
 
-    mgr.importJob(context.TODO(), colReq, colID, 0)
+    resp = mgr.importJob(context.TODO(), colReq, colID, 0)
+    assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
     assert.Equal(t, 0, len(mgr.pendingTasks))
     assert.Equal(t, 3, len(mgr.workingTasks))
 
-    mgr.importJob(context.TODO(), colReq, colID, 0)
+    // all data nodes are busy now, new task is pending
+    resp = mgr.importJob(context.TODO(), colReq, colID, 0)
+    assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
     assert.Equal(t, 1, len(mgr.pendingTasks))
     assert.Equal(t, 3, len(mgr.workingTasks))
 }
@@ -769,7 +792,7 @@ func TestImportManager_TaskState(t *testing.T) {
     colID := int64(100)
     mockKv := &kv.MockMetaKV{}
     mockKv.InMemKv = sync.Map{}
-    fn := func(ctx context.Context, req *datapb.ImportTaskRequest) (*datapb.ImportTaskResponse, error) {
+    importServiceFunc := func(ctx context.Context, req *datapb.ImportTaskRequest) (*datapb.ImportTaskResponse, error) {
         return &datapb.ImportTaskResponse{
             Status: &commonpb.Status{
                 ErrorCode: commonpb.ErrorCode_Success,
@@ -780,7 +803,7 @@ func TestImportManager_TaskState(t *testing.T) {
     rowReq := &milvuspb.ImportRequest{
         CollectionName: "c1",
         PartitionName:  "p1",
-        Files:          []string{"f1.json", "f2.json", "f3.json"},
+        Files:          []string{"f1.json"},
     }
 
     callMarkSegmentsDropped := func(ctx context.Context, segIDs []typeutil.UniqueID) (*commonpb.Status, error) {
@@ -789,7 +812,12 @@ func TestImportManager_TaskState(t *testing.T) {
         }, nil
     }
 
-    mgr := newImportManager(context.TODO(), mockKv, idAlloc, fn, callMarkSegmentsDropped, nil, nil, nil, nil)
+    // add 3 tasks, their ID is 10000, 10001, 10002, make sure updateTaskInfo() works correctly
+    mgr := newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
     mgr.importJob(context.TODO(), rowReq, colID, 0)
+    rowReq.Files = []string{"f2.json"}
+    mgr.importJob(context.TODO(), rowReq, colID, 0)
+    rowReq.Files = []string{"f3.json"}
+    mgr.importJob(context.TODO(), rowReq, colID, 0)
 
     info := &rootcoordpb.ImportResult{
@@ -865,7 +893,7 @@ func TestImportManager_AllocFail(t *testing.T) {
     colID := int64(100)
     mockKv := &kv.MockMetaKV{}
     mockKv.InMemKv = sync.Map{}
-    fn := func(ctx context.Context, req *datapb.ImportTaskRequest) (*datapb.ImportTaskResponse, error) {
+    importServiceFunc := func(ctx context.Context, req *datapb.ImportTaskRequest) (*datapb.ImportTaskResponse, error) {
         return &datapb.ImportTaskResponse{
             Status: &commonpb.Status{
                 ErrorCode: commonpb.ErrorCode_Success,
@@ -876,7 +904,7 @@ func TestImportManager_AllocFail(t *testing.T) {
     rowReq := &milvuspb.ImportRequest{
         CollectionName: "c1",
         PartitionName:  "p1",
-        Files:          []string{"f1.json", "f2.json", "f3.json"},
+        Files:          []string{"f1.json"},
     }
 
     callMarkSegmentsDropped := func(ctx context.Context, segIDs []typeutil.UniqueID) (*commonpb.Status, error) {
@@ -884,8 +912,10 @@ func TestImportManager_AllocFail(t *testing.T) {
             ErrorCode: commonpb.ErrorCode_Success,
         }, nil
     }
-    mgr := newImportManager(context.TODO(), mockKv, idAlloc, fn, callMarkSegmentsDropped, nil, nil, nil, nil)
-    mgr.importJob(context.TODO(), rowReq, colID, 0)
+    mgr := newImportManager(context.TODO(), mockKv, idAlloc, importServiceFunc, callMarkSegmentsDropped, nil, nil, nil, nil)
+    resp := mgr.importJob(context.TODO(), rowReq, colID, 0)
+    assert.NotEqual(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
+    assert.Equal(t, 0, len(mgr.pendingTasks))
 }
 
 func TestImportManager_ListAllTasks(t *testing.T) {
@@ -916,7 +946,7 @@ func TestImportManager_ListAllTasks(t *testing.T) {
     rowReq := &milvuspb.ImportRequest{
         CollectionName: "c1",
         PartitionName:  "p1",
-        Files:          []string{"f1.json", "f2.json", "f3.json"},
+        Files:          []string{"f1.json"},
     }
     callMarkSegmentsDropped := func(ctx context.Context, segIDs []typeutil.UniqueID) (*commonpb.Status, error) {
         return &commonpb.Status{
@@ -924,10 +954,25 @@ func TestImportManager_ListAllTasks(t *testing.T) {
         }, nil
     }
     mgr := newImportManager(context.TODO(), mockKv, idAlloc, fn, callMarkSegmentsDropped, nil, nil, nil, nil)
-    mgr.importJob(context.TODO(), rowReq, colID, 0)
+    repeat := 10
+    for i := 0; i < repeat; i++ {
+        mgr.importJob(context.TODO(), rowReq, colID, 0)
+    }
 
-    tasks := mgr.listAllTasks("", 100)
-    assert.Equal(t, len(rowReq.Files), len(tasks))
+    // list all tasks
+    tasks := mgr.listAllTasks("", int64(repeat))
+    assert.Equal(t, repeat, len(tasks))
+    for i := 0; i < repeat; i++ {
+        assert.Equal(t, int64(i+1), tasks[i].Id)
+    }
+
+    // list few tasks
+    limit := 3
+    tasks = mgr.listAllTasks("", int64(limit))
+    assert.Equal(t, limit, len(tasks))
+    for i := 0; i < limit; i++ {
+        assert.Equal(t, int64(i+repeat-limit+1), tasks[i].Id)
+    }
 
     resp := mgr.getTaskState(1)
     assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
@@ -943,13 +988,10 @@ func TestImportManager_ListAllTasks(t *testing.T) {
         }, nil
     }
 
+    // there are 10 tasks in working list, and 1 task in pending list, totally 11 tasks
     mgr.importJob(context.TODO(), rowReq, colID, 0)
     tasks = mgr.listAllTasks("", 100)
-    assert.Equal(t, len(rowReq.Files)*2, len(tasks))
-    tasks = mgr.listAllTasks("", 1)
-    assert.Equal(t, 1, len(tasks))
-    tasks = mgr.listAllTasks("bad-collection-name", 1)
-    assert.Equal(t, 0, len(tasks))
+    assert.Equal(t, repeat+1, len(tasks))
 
     // the id of tasks must be 1,2,3,4,5,6(sequence not guaranteed)
     ids := make(map[int64]struct{})
@@ -960,6 +1002,14 @@ func TestImportManager_ListAllTasks(t *testing.T) {
         delete(ids, tasks[i].Id)
     }
     assert.Equal(t, 0, len(ids))
+
+    // list few tasks
+    tasks = mgr.listAllTasks("", 1)
+    assert.Equal(t, 1, len(tasks))
+
+    // invliad collection name, returns empty
+    tasks = mgr.listAllTasks("bad-collection-name", 1)
+    assert.Equal(t, 0, len(tasks))
 }
 
 func TestImportManager_setCollectionPartitionName(t *testing.T) {
@@ -1009,11 +1059,16 @@ func TestImportManager_rearrangeTasks(t *testing.T) {
 func TestImportManager_isRowbased(t *testing.T) {
     mgr := &importManager{}
 
-    files := []string{"1.json", "2.json"}
+    files := []string{"1.json"}
     rb, err := mgr.isRowbased(files)
     assert.Nil(t, err)
     assert.True(t, rb)
 
+    files = []string{"1.json", "2.json"}
+    rb, err = mgr.isRowbased(files)
+    assert.NotNil(t, err)
+    assert.True(t, rb)
+
     files = []string{"1.json", "2.npy"}
     rb, err = mgr.isRowbased(files)
     assert.NotNil(t, err)
@@ -419,13 +419,14 @@ func tryFlushBlocks(ctx context.Context,
     // if segment size is larger than predefined blockSize, flush to create a new binlog file
     // initialize a new FieldData list for next round batch read
     if size > int(blockSize) && rowCount > 0 {
-        printFieldsDataInfo(blockData, "import util: prepare to flush block larger than maxBlockSize", nil)
+        printFieldsDataInfo(blockData, "import util: prepare to flush block larger than blockSize", nil)
         err := callFlushFunc(blockData, i)
         if err != nil {
             log.Error("Import util: failed to flush block data", zap.Int("shardID", i))
             return err
         }
-        log.Info("Import util: block size exceed limit and flush", zap.Int("rowCount", rowCount), zap.Int("size", size), zap.Int("shardID", i))
+        log.Info("Import util: block size exceed limit and flush", zap.Int("rowCount", rowCount),
+            zap.Int("size", size), zap.Int("shardID", i), zap.Int64("blockSize", blockSize))
 
         blocksData[i] = initSegmentData(collectionSchema)
         if blocksData[i] == nil {
@@ -264,11 +264,10 @@ func (p *ImportWrapper) fileValidation(filePaths []string) (bool, error) {
     fileNames[name] = struct{}{}
 
     // check file size, single file size cannot exceed MaxFileSize
-    // TODO add context
-    size, err := p.chunkManager.Size(context.TODO(), filePath)
+    size, err := p.chunkManager.Size(p.ctx, filePath)
     if err != nil {
         log.Error("import wrapper: failed to get file size", zap.String("filePath", filePath), zap.Error(err))
-        return rowBased, fmt.Errorf("import wrapper: failed to get file size of '%s'", filePath)
+        return rowBased, fmt.Errorf("import wrapper: failed to get file size of '%s', make sure the input path is related path, error:%w", filePath, err)
     }
 
     // empty file
@@ -312,7 +311,6 @@ func (p *ImportWrapper) Import(filePaths []string, options ImportOptions) error
     // data restore function to import milvus native binlog files(for backup/restore tools)
     // the backup/restore tool provide two paths for a partition, the first path is binlog path, the second is deltalog path
     if p.isBinlogImport(filePaths) {
-        // TODO: handle the timestamp end point passed from client side, currently use math.MaxUint64
         return p.doBinlogImport(filePaths, options.TsStartPoint, options.TsEndPoint)
     }
 
@@ -454,23 +452,51 @@ func (p *ImportWrapper) reportPersisted() error {
 // isBinlogImport is to judge whether it is binlog import operation
 // For internal usage by the restore tool: https://github.com/zilliztech/milvus-backup
 // This tool exports data from a milvus service, and call bulkload interface to import native data into another milvus service.
-// This tool provides two paths: one is data log path of a partition,the other is delta log path of this partition.
+// This tool provides two paths: one is insert log path of a partition,the other is delta log path of this partition.
 // This method checks the filePaths, if the file paths is exist and not a file, we say it is native import.
 func (p *ImportWrapper) isBinlogImport(filePaths []string) bool {
-    // must contains the insert log path, and the delta log path is optional
-    if len(filePaths) != 1 && len(filePaths) != 2 {
-        log.Info("import wrapper: paths count is not 1 or 2, not binlog import", zap.Int("len", len(filePaths)))
+    // must contains the insert log path, and the delta log path is optional to be empty string
+    if len(filePaths) != 2 {
+        log.Info("import wrapper: paths count is not 2, not binlog import", zap.Int("len", len(filePaths)))
         return false
     }
 
-    for i := 0; i < len(filePaths); i++ {
-        filePath := filePaths[i]
-        _, fileType := GetFileNameAndExt(filePath)
+    checkFunc := func(filePath string) bool {
+        // contains file extension, is not a path
+        _, fileType := GetFileNameAndExt(filePath)
         if len(fileType) != 0 {
             log.Info("import wrapper: not a path, not binlog import", zap.String("filePath", filePath), zap.String("fileType", fileType))
             return false
         }
+
+        // check path existence
+        exist, err := p.chunkManager.Exist(p.ctx, filePath)
+        if err != nil {
+            log.Error("import wrapper: failed to check the path existence, not binlog import", zap.String("filePath", filePath), zap.Error(err))
+            return false
+        }
+        if !exist {
+            log.Info("import wrapper: the input path doesn't exist, not binlog import", zap.String("filePath", filePath))
+            return false
+        }
+        return true
     }
 
+    // the first path is insert log path
+    filePath := filePaths[0]
+    if len(filePath) == 0 {
+        log.Info("import wrapper: the first path is empty string, not binlog import")
+        return false
+    }
+
+    if !checkFunc(filePath) {
+        return false
+    }
+
+    // the second path is delta log path
+    filePath = filePaths[1]
+    if len(filePath) > 0 && !checkFunc(filePath) {
+        return false
+    }
 
     log.Info("import wrapper: do binlog import")
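Side note on the isBinlogImport rework above: a binlog (backup/restore) import now always passes exactly two paths, the partition's insert log path first and its delta log path second, where the delta log path may be an empty string. A small sketch of a caller-side check mirroring that convention; the function and paths are illustrative, not part of the repo:

package main

import (
    "errors"
    "fmt"
)

// validateBinlogPaths mirrors, on the client side, the convention the import
// wrapper now enforces: exactly two entries, a non-empty insert log path first,
// then a delta log path that may be "".
func validateBinlogPaths(paths []string) error {
    if len(paths) != 2 {
        return errors.New("binlog import expects exactly two paths: insert log path and delta log path")
    }
    if paths[0] == "" {
        return errors.New("insert log path must not be empty")
    }
    return nil
}

func main() {
    fmt.Println(validateBinlogPaths([]string{"backup/insert_log/partition1", ""})) // <nil>
    fmt.Println(validateBinlogPaths([]string{"backup/insert_log/partition1"}))     // error: expects exactly two paths
}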
@@ -501,12 +527,9 @@ func (p *ImportWrapper) doBinlogImport(filePaths []string, tsStartPoint uint64,
 func (p *ImportWrapper) parseRowBasedJSON(filePath string, onlyValidate bool) error {
     tr := timerecord.NewTimeRecorder("json row-based parser: " + filePath)
 
-    ctx, cancel := context.WithCancel(context.Background())
-    defer cancel()
-
     // for minio storage, chunkManager will download file into local memory
     // for local storage, chunkManager open the file directly
-    file, err := p.chunkManager.Reader(ctx, filePath)
+    file, err := p.chunkManager.Reader(p.ctx, filePath)
     if err != nil {
         return err
     }
@@ -553,13 +576,11 @@ func (p *ImportWrapper) parseColumnBasedNumpy(filePath string, onlyValidate bool
     combineFunc func(fields map[storage.FieldID]storage.FieldData) error) error {
     tr := timerecord.NewTimeRecorder("numpy parser: " + filePath)
 
-    ctx, cancel := context.WithCancel(context.Background())
-    defer cancel()
     fileName, _ := GetFileNameAndExt(filePath)
 
     // for minio storage, chunkManager will download file into local memory
     // for local storage, chunkManager open the file directly
-    file, err := p.chunkManager.Reader(ctx, filePath)
+    file, err := p.chunkManager.Reader(p.ctx, filePath)
     if err != nil {
         return err
     }
@@ -721,7 +742,7 @@ func (p *ImportWrapper) splitFieldsData(fieldsData map[storage.FieldID]storage.F
             return fmt.Errorf("import wrapper: field '%s' row count %d is not equal to other fields row count: %d", name, count, rowCount)
         }
     }
-    log.Info("import wrapper: try to split a block with row count", zap.Int("rowCount", rowCount), zap.Any("rowCountOfEachField", rowCounter))
+    log.Info("import wrapper: try to split a block with row count", zap.Int("rowCount", rowCount))
 
     primaryData, ok := fieldsData[primaryKey.GetFieldID()]
     if !ok {
@@ -37,7 +37,6 @@ import (
     "github.com/milvus-io/milvus/internal/proto/datapb"
     "github.com/milvus-io/milvus/internal/proto/rootcoordpb"
     "github.com/milvus-io/milvus/internal/storage"
-    "github.com/milvus-io/milvus/internal/util/dependency"
     "github.com/milvus-io/milvus/internal/util/timerecord"
 )
 
@@ -173,7 +172,9 @@ func createMockCallbackFunctions(t *testing.T, rowCounter *rowCounterTest) (Assi
 }
 
 func Test_NewImportWrapper(t *testing.T) {
-    f := dependency.NewDefaultFactory(true)
+    // NewDefaultFactory() use "/tmp/milvus" as default root path, and cannot specify root path
+    // NewChunkManagerFactory() can specify the root path
+    f := storage.NewChunkManagerFactory("local", storage.RootPath(TempFilesPath))
     ctx := context.Background()
     cm, err := f.NewPersistentStorageChunkManager(ctx)
     assert.NoError(t, err)
@@ -226,7 +227,9 @@ func Test_ImportWrapperRowBased(t *testing.T) {
     assert.Nil(t, err)
     defer os.RemoveAll(TempFilesPath)
 
-    f := dependency.NewDefaultFactory(true)
+    // NewDefaultFactory() use "/tmp/milvus" as default root path, and cannot specify root path
+    // NewChunkManagerFactory() can specify the root path
+    f := storage.NewChunkManagerFactory("local", storage.RootPath(TempFilesPath))
     ctx := context.Background()
     cm, err := f.NewPersistentStorageChunkManager(ctx)
     assert.NoError(t, err)
@@ -388,7 +391,9 @@ func Test_ImportWrapperColumnBased_numpy(t *testing.T) {
     assert.Nil(t, err)
     defer os.RemoveAll(TempFilesPath)
 
-    f := dependency.NewDefaultFactory(true)
+    // NewDefaultFactory() use "/tmp/milvus" as default root path, and cannot specify root path
+    // NewChunkManagerFactory() can specify the root path
+    f := storage.NewChunkManagerFactory("local", storage.RootPath(TempFilesPath))
     ctx := context.Background()
     cm, err := f.NewPersistentStorageChunkManager(ctx)
     assert.NoError(t, err)
@@ -485,7 +490,9 @@ func Test_ImportWrapperRowBased_perf(t *testing.T) {
     assert.Nil(t, err)
    defer os.RemoveAll(TempFilesPath)
 
-    f := dependency.NewDefaultFactory(true)
+    // NewDefaultFactory() use "/tmp/milvus" as default root path, and cannot specify root path
+    // NewChunkManagerFactory() can specify the root path
+    f := storage.NewChunkManagerFactory("local", storage.RootPath(TempFilesPath))
     ctx := context.Background()
     cm, err := f.NewPersistentStorageChunkManager(ctx)
     assert.NoError(t, err)
@@ -764,7 +771,9 @@ func Test_ImportWrapperReportFailRowBased(t *testing.T) {
     assert.Nil(t, err)
     defer os.RemoveAll(TempFilesPath)
 
-    f := dependency.NewDefaultFactory(true)
+    // NewDefaultFactory() use "/tmp/milvus" as default root path, and cannot specify root path
+    // NewChunkManagerFactory() can specify the root path
+    f := storage.NewChunkManagerFactory("local", storage.RootPath(TempFilesPath))
     ctx := context.Background()
     cm, err := f.NewPersistentStorageChunkManager(ctx)
     assert.NoError(t, err)
@@ -824,7 +833,9 @@ func Test_ImportWrapperReportFailColumnBased_numpy(t *testing.T) {
     assert.Nil(t, err)
     defer os.RemoveAll(TempFilesPath)
 
-    f := dependency.NewDefaultFactory(true)
+    // NewDefaultFactory() use "/tmp/milvus" as default root path, and cannot specify root path
+    // NewChunkManagerFactory() can specify the root path
+    f := storage.NewChunkManagerFactory("local", storage.RootPath(TempFilesPath))
     ctx := context.Background()
     cm, err := f.NewPersistentStorageChunkManager(ctx)
     assert.NoError(t, err)
@@ -867,11 +878,16 @@ func Test_ImportWrapperReportFailColumnBased_numpy(t *testing.T) {
 }
 
 func Test_ImportWrapperIsBinlogImport(t *testing.T) {
-    ctx := context.Background()
+    err := os.MkdirAll(TempFilesPath, os.ModePerm)
+    assert.Nil(t, err)
+    defer os.RemoveAll(TempFilesPath)
 
-    cm := &MockChunkManager{
-        size: 1,
-    }
+    // NewDefaultFactory() use "/tmp/milvus" as default root path, and cannot specify root path
+    // NewChunkManagerFactory() can specify the root path
+    f := storage.NewChunkManagerFactory("local", storage.RootPath(TempFilesPath))
+    ctx := context.Background()
+    cm, err := f.NewPersistentStorageChunkManager(ctx)
+    assert.NoError(t, err)
 
     idAllocator := newIDAllocator(ctx, t, nil)
     schema := perfSchema(128)
@@ -902,13 +918,43 @@ func Test_ImportWrapperIsBinlogImport(t *testing.T) {
     b = wrapper.isBinlogImport(paths)
     assert.False(t, b)
 
-    // success
+    // path doesn't exist
     paths = []string{
-        "/tmp",
-        "/tmp",
+        "path1",
+        "path2",
     }
     b = wrapper.isBinlogImport(paths)
+    assert.False(t, b)
+
+    // insert log path is created, but delta log path doesn't exist
+    err = os.MkdirAll(TempFilesPath+paths[0], os.ModePerm)
+    assert.NoError(t, err)
+
+    b = wrapper.isBinlogImport(paths)
+    assert.False(t, b)
+
+    // both the two path are created, success
+    err = os.MkdirAll(TempFilesPath+paths[1], os.ModePerm)
+    assert.NoError(t, err)
+
+    b = wrapper.isBinlogImport(paths)
     assert.True(t, b)
+
+    // the delta log path is empty, success
+    paths = []string{
+        "path1",
+        "",
+    }
+    b = wrapper.isBinlogImport(paths)
+    assert.True(t, b)
+
+    // path is empty string
+    paths = []string{
+        "",
+        "",
+    }
+    b = wrapper.isBinlogImport(paths)
+    assert.False(t, b)
 }
 
 func Test_ImportWrapperDoBinlogImport(t *testing.T) {
@@ -1022,7 +1068,7 @@ func Test_ImportWrapperSplitFieldsData(t *testing.T) {
     err := wrapper.splitFieldsData(nil, 0)
     assert.NotNil(t, err)
 
-    // split 100 rows to 4 blocks
+    // split 100 rows to 4 blocks, success
     rowCount := 100
     input := initSegmentData(schema)
     for j := 0; j < rowCount; j++ {
@@ -1039,6 +1085,12 @@ func Test_ImportWrapperSplitFieldsData(t *testing.T) {
     assert.Equal(t, 4, rowCounter.callTime)
     assert.Equal(t, rowCount, rowCounter.rowCount)
 
+    // alloc id failed
+    wrapper.rowIDAllocator = newIDAllocator(ctx, t, errors.New("error"))
+    err = wrapper.splitFieldsData(input, 512)
+    assert.NotNil(t, err)
+    wrapper.rowIDAllocator = newIDAllocator(ctx, t, nil)
+
     // row count of fields are unequal
     schema.Fields[0].AutoID = false
     input = initSegmentData(schema)
@@ -1053,4 +1105,29 @@ func Test_ImportWrapperSplitFieldsData(t *testing.T) {
     }
     err = wrapper.splitFieldsData(input, 512)
     assert.NotNil(t, err)
+
+    // primary key not found
+    wrapper.collectionSchema.Fields[0].IsPrimaryKey = false
+    err = wrapper.splitFieldsData(input, 512)
+    assert.NotNil(t, err)
+    wrapper.collectionSchema.Fields[0].IsPrimaryKey = true
+
+    // primary key is varchar, success
+    wrapper.collectionSchema.Fields[0].DataType = schemapb.DataType_VarChar
+    input = initSegmentData(schema)
+    for j := 0; j < rowCount; j++ {
+        pkField := input[101].(*storage.StringFieldData)
+        pkField.Data = append(pkField.Data, strconv.FormatInt(int64(j), 10))
+
+        flagField := input[102].(*storage.BoolFieldData)
+        flagField.Data = append(flagField.Data, true)
+    }
+    rowCounter.callTime = 0
+    rowCounter.rowCount = 0
+    importResult.AutoIds = []int64{}
+    err = wrapper.splitFieldsData(input, 1024)
+    assert.Nil(t, err)
+    assert.Equal(t, 0, len(importResult.AutoIds))
+    assert.Equal(t, 2, rowCounter.callTime)
+    assert.Equal(t, rowCount, rowCounter.rowCount)
 }