mirror of https://github.com/milvus-io/milvus.git
enhance: Make import-related error message clearer (#28978)
issue: #28976

Signed-off-by: Cai Zhang <cai.zhang@zilliz.com>

branch: pull/29077/head
parent 464bc9e8f4
commit 2b05460ef9
@@ -103,8 +103,8 @@ func NewBinlogAdapter(ctx context.Context,
 	}

 	// amend the segment size to avoid portential OOM risk
-	if adapter.blockSize > MaxSegmentSizeInMemory {
-		adapter.blockSize = MaxSegmentSizeInMemory
+	if adapter.blockSize > Params.DataCoordCfg.SegmentMaxSize.GetAsInt64() {
+		adapter.blockSize = Params.DataCoordCfg.SegmentMaxSize.GetAsInt64()
 	}

 	return adapter, nil
@@ -28,6 +28,7 @@ import (
 	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
 	"github.com/milvus-io/milvus/internal/storage"
 	"github.com/milvus-io/milvus/pkg/common"
+	"github.com/milvus-io/milvus/pkg/util/paramtable"
 )

 const (
@@ -69,6 +70,7 @@ func createDeltalogBuf(t *testing.T, deleteList interface{}, varcharType bool) [

 func Test_BinlogAdapterNew(t *testing.T) {
 	ctx := context.Background()
+	paramtable.Init()

 	// nil schema
 	adapter, err := NewBinlogAdapter(ctx, nil, 1024, 2048, nil, nil, 0, math.MaxUint64)
@@ -103,10 +105,10 @@ func Test_BinlogAdapterNew(t *testing.T) {
 	assert.NoError(t, err)

 	// amend blockSize, blockSize should less than MaxSegmentSizeInMemory
-	adapter, err = NewBinlogAdapter(ctx, collectionInfo, MaxSegmentSizeInMemory+1, 1024, &MockChunkManager{}, flushFunc, 0, math.MaxUint64)
+	adapter, err = NewBinlogAdapter(ctx, collectionInfo, Params.DataCoordCfg.SegmentMaxSize.GetAsInt64()+1, 1024, &MockChunkManager{}, flushFunc, 0, math.MaxUint64)
 	assert.NotNil(t, adapter)
 	assert.NoError(t, err)
-	assert.Equal(t, int64(MaxSegmentSizeInMemory), adapter.blockSize)
+	assert.Equal(t, Params.DataCoordCfg.SegmentMaxSize.GetAsInt64(), adapter.blockSize)
 }

 func Test_BinlogAdapterVerify(t *testing.T) {
@@ -225,7 +225,7 @@ func (p *BinlogParser) parseSegmentFiles(segmentHolder *SegmentFilesHolder) erro
 	}

 	adapter, err := NewBinlogAdapter(p.ctx, p.collectionInfo, p.blockSize,
-		MaxTotalSizeInMemory, p.chunkManager, p.callFlushFunc, p.tsStartPoint, p.tsEndPoint)
+		Params.DataNodeCfg.BulkInsertMaxMemorySize.GetAsInt64(), p.chunkManager, p.callFlushFunc, p.tsStartPoint, p.tsEndPoint)
 	if err != nil {
 		log.Warn("Binlog parser: failed to create binlog adapter", zap.Error(err))
 		return merr.WrapErrImportFailed(fmt.Sprintf("failed to create binlog adapter, error: %v", err))
@@ -28,10 +28,10 @@ import (
 	"github.com/milvus-io/milvus/internal/allocator"
 	"github.com/milvus-io/milvus/internal/proto/datapb"
 	"github.com/milvus-io/milvus/internal/proto/rootcoordpb"
-	"github.com/milvus-io/milvus/internal/querycoordv2/params"
 	"github.com/milvus-io/milvus/internal/storage"
 	"github.com/milvus-io/milvus/pkg/log"
 	"github.com/milvus-io/milvus/pkg/util/merr"
+	"github.com/milvus-io/milvus/pkg/util/paramtable"
 	"github.com/milvus-io/milvus/pkg/util/retry"
 	"github.com/milvus-io/milvus/pkg/util/timerecord"
 )
@@ -41,20 +41,6 @@ const (
 	NumpyFileExt   = ".npy"
 	ParquetFileExt = ".parquet"

-	// parsers read JSON/Numpy/CSV files buffer by buffer, this limitation is to define the buffer size.
-	ReadBufferSize = 16 * 1024 * 1024 // 16MB
-
-	// this limitation is to avoid this OOM risk:
-	// simetimes system segment max size is a large number, a single segment fields data might cause OOM.
-	// flush the segment when its data reach this limitation, let the compaction to compact it later.
-	MaxSegmentSizeInMemory = 512 * 1024 * 1024 // 512MB
-
-	// this limitation is to avoid this OOM risk:
-	// if the shard number is a large number, although single segment size is small, but there are lot of in-memory segments,
-	// the total memory size might cause OOM.
-	// TODO: make it configurable.
-	MaxTotalSizeInMemory = 6 * 1024 * 1024 * 1024 // 6GB
-
 	// progress percent value of persist state
 	ProgressValueForPersist = 90
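Editor's note: the three constants removed above are superseded by runtime parameters used throughout this diff. A minimal sketch of where the equivalent values now come from (assumes the milvus module is importable and the param table is initialized; the `main` wrapper is only for illustration):

```go
package main

import (
	"fmt"

	"github.com/milvus-io/milvus/pkg/util/paramtable"
)

func main() {
	paramtable.Init()
	params := paramtable.Get()

	// replaces ReadBufferSize = 16 * 1024 * 1024 (16MB)
	fmt.Println(params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt64())
	// replaces MaxTotalSizeInMemory = 6 * 1024 * 1024 * 1024 (6GB)
	fmt.Println(params.DataNodeCfg.BulkInsertMaxMemorySize.GetAsInt64())
	// the block-size cap formerly MaxSegmentSizeInMemory now follows the DataCoord segment limit
	fmt.Println(params.DataCoordCfg.SegmentMaxSize.GetAsInt64())
}
```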
@@ -67,6 +53,8 @@ const (
 	ProgressPercent = "progress_percent"
 )

+var Params *paramtable.ComponentParam = paramtable.Get()
+
 // ReportImportAttempts is the maximum # of attempts to retry when import fails.
 var ReportImportAttempts uint = 10
@@ -126,8 +114,8 @@ func NewImportWrapper(ctx context.Context, collectionInfo *CollectionInfo, segme
 	// average binlogSize is expected to be half of the maxBinlogSize
 	// and avoid binlogSize to be a tiny value
 	binlogSize := int64(float32(maxBinlogSize) * 0.5)
-	if binlogSize < ReadBufferSize {
-		binlogSize = ReadBufferSize
+	if binlogSize < Params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt64() {
+		binlogSize = Params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt64()
 	}

 	wrapper := &ImportWrapper{
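For readers skimming the hunk above: the sizing heuristic itself is unchanged; only the lower bound became configurable. A standalone sketch of that heuristic (illustrative helper name, not from the diff):

```go
package main

import "fmt"

// effectiveBinlogSize mirrors the logic above: target half of the max binlog
// size, but never drop below the configured read buffer size.
func effectiveBinlogSize(maxBinlogSize, readBufferSize int64) int64 {
	binlogSize := int64(float32(maxBinlogSize) * 0.5)
	if binlogSize < readBufferSize {
		binlogSize = readBufferSize
	}
	return binlogSize
}

func main() {
	fmt.Println(effectiveBinlogSize(1024*1024*1024, 16*1024*1024)) // 536870912 (512MB)
	fmt.Println(effectiveBinlogSize(8*1024*1024, 16*1024*1024))    // 16777216 (clamped up to 16MB)
}
```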
@@ -234,11 +222,11 @@ func (p *ImportWrapper) fileValidation(filePaths []string) (bool, error) {
 			return rowBased, merr.WrapErrImportFailed(fmt.Sprintf("the file '%s' size is zero", filePath))
 		}

-		if size > params.Params.CommonCfg.ImportMaxFileSize.GetAsInt64() {
+		if size > Params.CommonCfg.ImportMaxFileSize.GetAsInt64() {
 			log.Warn("import wrapper: file size exceeds the maximum size", zap.String("filePath", filePath),
-				zap.Int64("fileSize", size), zap.String("MaxFileSize", params.Params.CommonCfg.ImportMaxFileSize.GetValue()))
+				zap.Int64("fileSize", size), zap.String("MaxFileSize", Params.CommonCfg.ImportMaxFileSize.GetValue()))
 			return rowBased, merr.WrapErrImportFailed(fmt.Sprintf("the file '%s' size exceeds the maximum size: %s bytes",
-				filePath, params.Params.CommonCfg.ImportMaxFileSize.GetValue()))
+				filePath, Params.CommonCfg.ImportMaxFileSize.GetValue()))
 		}
 		totalSize += size
 	}
@@ -37,7 +37,6 @@ import (
 	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
 	"github.com/milvus-io/milvus/internal/proto/datapb"
 	"github.com/milvus-io/milvus/internal/proto/rootcoordpb"
-	"github.com/milvus-io/milvus/internal/querycoordv2/params"
 	"github.com/milvus-io/milvus/internal/storage"
 	"github.com/milvus-io/milvus/pkg/common"
 	"github.com/milvus-io/milvus/pkg/util/merr"
@@ -190,7 +189,7 @@ func Test_ImportWrapperNew(t *testing.T) {
 	ctx := context.Background()
 	cm, err := f.NewPersistentStorageChunkManager(ctx)
 	assert.NoError(t, err)
-	wrapper := NewImportWrapper(ctx, nil, 1, ReadBufferSize, nil, cm, nil, nil)
+	wrapper := NewImportWrapper(ctx, nil, 1, Params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt64(), nil, cm, nil, nil)
 	assert.Nil(t, wrapper)

 	schema := &schemapb.CollectionSchema{
@@ -210,7 +209,7 @@ func Test_ImportWrapperNew(t *testing.T) {
 	})
 	collectionInfo, err := NewCollectionInfo(schema, 2, []int64{1})
 	assert.NoError(t, err)
-	wrapper = NewImportWrapper(ctx, collectionInfo, 1, ReadBufferSize, nil, cm, nil, nil)
+	wrapper = NewImportWrapper(ctx, collectionInfo, 1, Params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt64(), nil, cm, nil, nil)
 	assert.NotNil(t, wrapper)

 	assignSegFunc := func(shardID int, partID int64) (int64, string, error) {
@@ -287,7 +286,7 @@ func Test_ImportWrapperRowBased(t *testing.T) {
 	assert.NoError(t, err)

 	t.Run("success case", func(t *testing.T) {
-		wrapper := NewImportWrapper(ctx, collectionInfo, 1, ReadBufferSize, idAllocator, cm, importResult, reportFunc)
+		wrapper := NewImportWrapper(ctx, collectionInfo, 1, Params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt64(), idAllocator, cm, importResult, reportFunc)
 		wrapper.SetCallbackFunctions(assignSegmentFunc, flushFunc, saveSegmentFunc)
 		files := make([]string, 0)
 		files = append(files, filePath)
@@ -313,7 +312,7 @@ func Test_ImportWrapperRowBased(t *testing.T) {
 		assert.NoError(t, err)

 		importResult.State = commonpb.ImportState_ImportStarted
-		wrapper := NewImportWrapper(ctx, collectionInfo, 1, ReadBufferSize, idAllocator, cm, importResult, reportFunc)
+		wrapper := NewImportWrapper(ctx, collectionInfo, 1, Params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt64(), idAllocator, cm, importResult, reportFunc)
 		wrapper.SetCallbackFunctions(assignSegmentFunc, flushFunc, saveSegmentFunc)
 		files := make([]string, 0)
 		files = append(files, filePath)
@@ -325,7 +324,7 @@ func Test_ImportWrapperRowBased(t *testing.T) {
 	t.Run("file doesn't exist", func(t *testing.T) {
 		files := make([]string, 0)
 		files = append(files, "/dummy/dummy.json")
-		wrapper := NewImportWrapper(ctx, collectionInfo, 1, ReadBufferSize, idAllocator, cm, importResult, reportFunc)
+		wrapper := NewImportWrapper(ctx, collectionInfo, 1, Params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt64(), idAllocator, cm, importResult, reportFunc)
 		err = wrapper.Import(files, ImportOptions{OnlyValidate: true})
 		assert.Error(t, err)
 	})
@@ -368,7 +367,7 @@ func Test_ImportWrapperColumnBased_numpy(t *testing.T) {
 	files := createSampleNumpyFiles(t, cm)

 	t.Run("success case", func(t *testing.T) {
-		wrapper := NewImportWrapper(ctx, collectionInfo, 1, ReadBufferSize, idAllocator, cm, importResult, reportFunc)
+		wrapper := NewImportWrapper(ctx, collectionInfo, 1, Params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt64(), idAllocator, cm, importResult, reportFunc)
 		wrapper.SetCallbackFunctions(assignSegmentFunc, flushFunc, saveSegmentFunc)

 		err = wrapper.Import(files, DefaultImportOptions())
@@ -386,7 +385,7 @@ func Test_ImportWrapperColumnBased_numpy(t *testing.T) {
 		files[1] = filePath

 		importResult.State = commonpb.ImportState_ImportStarted
-		wrapper := NewImportWrapper(ctx, collectionInfo, 1, ReadBufferSize, idAllocator, cm, importResult, reportFunc)
+		wrapper := NewImportWrapper(ctx, collectionInfo, 1, Params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt64(), idAllocator, cm, importResult, reportFunc)
 		wrapper.SetCallbackFunctions(assignSegmentFunc, flushFunc, saveSegmentFunc)

 		err = wrapper.Import(files, DefaultImportOptions())
@@ -397,7 +396,7 @@ func Test_ImportWrapperColumnBased_numpy(t *testing.T) {
 	t.Run("file doesn't exist", func(t *testing.T) {
 		files := make([]string, 0)
 		files = append(files, "/dummy/dummy.npy")
-		wrapper := NewImportWrapper(ctx, collectionInfo, 1, ReadBufferSize, idAllocator, cm, importResult, reportFunc)
+		wrapper := NewImportWrapper(ctx, collectionInfo, 1, Params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt64(), idAllocator, cm, importResult, reportFunc)
 		err = wrapper.Import(files, DefaultImportOptions())
 		assert.Error(t, err)
 	})
@@ -517,7 +516,7 @@ func Test_ImportWrapperRowBased_perf(t *testing.T) {
 	}
 	collectionInfo, err := NewCollectionInfo(schema, int32(shardNum), []int64{1})
 	assert.NoError(t, err)
-	wrapper := NewImportWrapper(ctx, collectionInfo, int64(segmentSize), ReadBufferSize, idAllocator, cm, importResult, reportFunc)
+	wrapper := NewImportWrapper(ctx, collectionInfo, int64(segmentSize), Params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt64(), idAllocator, cm, importResult, reportFunc)
 	wrapper.SetCallbackFunctions(assignSegmentFunc, flushFunc, saveSegmentFunc)

 	files := make([]string, 0)
@@ -561,7 +560,7 @@ func Test_ImportWrapperFileValidation(t *testing.T) {

 	collectionInfo, err := NewCollectionInfo(schema, int32(shardNum), []int64{1})
 	assert.NoError(t, err)
-	wrapper := NewImportWrapper(ctx, collectionInfo, int64(segmentSize), ReadBufferSize, idAllocator, cm, nil, nil)
+	wrapper := NewImportWrapper(ctx, collectionInfo, int64(segmentSize), Params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt64(), idAllocator, cm, nil, nil)

 	t.Run("unsupported file type", func(t *testing.T) {
 		files := []string{"uid.txt"}
@@ -611,7 +610,7 @@ func Test_ImportWrapperFileValidation(t *testing.T) {
 	t.Run("empty file list", func(t *testing.T) {
 		files := []string{}
 		cm.size = 0
-		wrapper = NewImportWrapper(ctx, collectionInfo, int64(segmentSize), ReadBufferSize, idAllocator, cm, nil, nil)
+		wrapper = NewImportWrapper(ctx, collectionInfo, int64(segmentSize), Params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt64(), idAllocator, cm, nil, nil)
 		rowBased, err := wrapper.fileValidation(files)
 		assert.NoError(t, err)
 		assert.False(t, rowBased)
@@ -619,8 +618,8 @@ func Test_ImportWrapperFileValidation(t *testing.T) {

 	t.Run("file size exceed MaxFileSize limit", func(t *testing.T) {
 		files := []string{"a/1.json"}
-		cm.size = params.Params.CommonCfg.ImportMaxFileSize.GetAsInt64() + 1
-		wrapper = NewImportWrapper(ctx, collectionInfo, int64(segmentSize), ReadBufferSize, idAllocator, cm, nil, nil)
+		cm.size = Params.CommonCfg.ImportMaxFileSize.GetAsInt64() + 1
+		wrapper = NewImportWrapper(ctx, collectionInfo, int64(segmentSize), Params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt64(), idAllocator, cm, nil, nil)
 		rowBased, err := wrapper.fileValidation(files)
 		assert.Error(t, err)
 		assert.True(t, rowBased)
@@ -691,7 +690,7 @@ func Test_ImportWrapperReportFailRowBased(t *testing.T) {
 	}
 	collectionInfo, err := NewCollectionInfo(sampleSchema(), 2, []int64{1})
 	assert.NoError(t, err)
-	wrapper := NewImportWrapper(ctx, collectionInfo, 1, ReadBufferSize, idAllocator, cm, importResult, reportFunc)
+	wrapper := NewImportWrapper(ctx, collectionInfo, 1, Params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt64(), idAllocator, cm, importResult, reportFunc)
 	wrapper.SetCallbackFunctions(assignSegmentFunc, flushFunc, saveSegmentFunc)

 	files := []string{filePath}
@@ -738,7 +737,7 @@ func Test_ImportWrapperReportFailColumnBased_numpy(t *testing.T) {
 	}
 	collectionInfo, err := NewCollectionInfo(createNumpySchema(), 2, []int64{1})
 	assert.NoError(t, err)
-	wrapper := NewImportWrapper(ctx, collectionInfo, 1, ReadBufferSize, idAllocator, cm, importResult, reportFunc)
+	wrapper := NewImportWrapper(ctx, collectionInfo, 1, Params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt64(), idAllocator, cm, importResult, reportFunc)
 	wrapper.SetCallbackFunctions(assignSegmentFunc, flushFunc, saveSegmentFunc)

 	wrapper.reportImportAttempts = 2
@@ -773,7 +772,7 @@ func Test_ImportWrapperIsBinlogImport(t *testing.T) {

 	collectionInfo, err := NewCollectionInfo(schema, int32(shardNum), []int64{1})
 	assert.NoError(t, err)
-	wrapper := NewImportWrapper(ctx, collectionInfo, int64(segmentSize), ReadBufferSize, idAllocator, cm, nil, nil)
+	wrapper := NewImportWrapper(ctx, collectionInfo, int64(segmentSize), Params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt64(), idAllocator, cm, nil, nil)

 	// empty paths
 	paths := []string{}
@@ -837,7 +836,7 @@ func Test_ImportWrapperDoBinlogImport(t *testing.T) {

 	collectionInfo, err := NewCollectionInfo(schema, int32(shardNum), []int64{1})
 	assert.NoError(t, err)
-	wrapper := NewImportWrapper(ctx, collectionInfo, int64(segmentSize), ReadBufferSize, idAllocator, cm, nil, nil)
+	wrapper := NewImportWrapper(ctx, collectionInfo, int64(segmentSize), Params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt64(), idAllocator, cm, nil, nil)
 	paths := []string{
 		"/tmp",
 		"/tmp",
@@ -900,7 +899,7 @@ func Test_ImportWrapperReportPersisted(t *testing.T) {
 	}
 	collectionInfo, err := NewCollectionInfo(sampleSchema(), 2, []int64{1})
 	assert.NoError(t, err)
-	wrapper := NewImportWrapper(ctx, collectionInfo, int64(1024), ReadBufferSize, nil, nil, importResult, reportFunc)
+	wrapper := NewImportWrapper(ctx, collectionInfo, int64(1024), Params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt64(), nil, nil, importResult, reportFunc)
 	assert.NotNil(t, wrapper)

 	rowCounter := &rowCounterTest{}
@@ -943,7 +942,7 @@ func Test_ImportWrapperUpdateProgressPercent(t *testing.T) {

 	collectionInfo, err := NewCollectionInfo(sampleSchema(), 2, []int64{1})
 	assert.NoError(t, err)
-	wrapper := NewImportWrapper(ctx, collectionInfo, 1, ReadBufferSize, nil, nil, nil, nil)
+	wrapper := NewImportWrapper(ctx, collectionInfo, 1, Params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt64(), nil, nil, nil, nil)
 	assert.NotNil(t, wrapper)
 	assert.Equal(t, int64(0), wrapper.progressPercent)

@@ -982,7 +981,7 @@ func Test_ImportWrapperFlushFunc(t *testing.T) {
 	schema := sampleSchema()
 	collectionInfo, err := NewCollectionInfo(schema, 2, []int64{1})
 	assert.NoError(t, err)
-	wrapper := NewImportWrapper(ctx, collectionInfo, 1, ReadBufferSize, nil, nil, importResult, reportFunc)
+	wrapper := NewImportWrapper(ctx, collectionInfo, 1, Params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt64(), nil, nil, importResult, reportFunc)
 	assert.NotNil(t, wrapper)
 	wrapper.SetCallbackFunctions(assignSegmentFunc, flushFunc, saveSegmentFunc)

@@ -130,7 +130,7 @@ func (v *JSONRowConsumer) Handle(rows []map[storage.FieldID]interface{}) error {

 	// if rows is nil, that means read to end of file, force flush all data
 	if rows == nil {
-		err := tryFlushBlocks(v.ctx, v.shardsData, v.collectionInfo.Schema, v.callFlushFunc, v.blockSize, MaxTotalSizeInMemory, true)
+		err := tryFlushBlocks(v.ctx, v.shardsData, v.collectionInfo.Schema, v.callFlushFunc, v.blockSize, Params.DataNodeCfg.BulkInsertMaxMemorySize.GetAsInt64(), true)
 		log.Info("JSON row consumer finished")
 		return err
 	}
@@ -138,7 +138,7 @@ func (v *JSONRowConsumer) Handle(rows []map[storage.FieldID]interface{}) error {
 	// rows is not nil, flush in necessary:
 	// 1. data block size larger than v.blockSize will be flushed
 	// 2. total data size exceeds MaxTotalSizeInMemory, the largest data block will be flushed
-	err := tryFlushBlocks(v.ctx, v.shardsData, v.collectionInfo.Schema, v.callFlushFunc, v.blockSize, MaxTotalSizeInMemory, false)
+	err := tryFlushBlocks(v.ctx, v.shardsData, v.collectionInfo.Schema, v.callFlushFunc, v.blockSize, Params.DataNodeCfg.BulkInsertMaxMemorySize.GetAsInt64(), false)
 	if err != nil {
 		log.Warn("JSON row consumer: try flush data but failed", zap.Error(err))
 		return merr.WrapErrImportFailed(fmt.Sprintf("try flush data but failed, error: %v", err))
@@ -73,7 +73,7 @@ func adjustBufSize(parser *JSONParser, collectionSchema *schemapb.CollectionSche
 	// for low dimensional vector, the bufSize is a large value, read more rows each time
 	bufRowCount := parser.bufRowCount
 	for {
-		if bufRowCount*sizePerRecord > ReadBufferSize {
+		if bufRowCount*sizePerRecord > Params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt() {
 			bufRowCount--
 		} else {
 			break
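The loop above shrinks the JSON parser's per-read row count until one read fits in the configured buffer. A compact sketch of the same idea (hypothetical helper name, with an explicit lower-bound guard added for the standalone example):

```go
package main

import "fmt"

// adjustRowCount decrements the row count until rowCount*sizePerRecord fits
// within readBufferSize, mirroring adjustBufSize above.
func adjustRowCount(rowCount, sizePerRecord, readBufferSize int) int {
	for rowCount > 1 && rowCount*sizePerRecord > readBufferSize {
		rowCount--
	}
	return rowCount
}

func main() {
	// 64KB per record against a 16MB buffer -> 256 rows per read
	fmt.Println(adjustRowCount(1000, 64*1024, 16*1024*1024))
}
```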
@@ -563,9 +563,9 @@ func (n *NumpyAdapter) ReadString(count int) ([]string, error) {
 	// read string one by one is not efficient, here we read strings batch by batch, each bach size is no more than 16MB
 	batchRead := 1 // rows of each batch, make sure this value is equal or greater than 1
 	if utf {
-		batchRead += ReadBufferSize / (utf8.UTFMax * maxLen)
+		batchRead += Params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt() / (utf8.UTFMax * maxLen)
 	} else {
-		batchRead += ReadBufferSize / maxLen
+		batchRead += Params.DataNodeCfg.BulkInsertReadBufferSize.GetAsInt() / maxLen
 	}

 	log.Info("Numpy adapter: prepare to read varchar batch by batch",
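The batch size above is derived from the read buffer and the numpy string width. A sketch of the same arithmetic, detached from the adapter (hypothetical helper name):

```go
package main

import (
	"fmt"
	"unicode/utf8"
)

// stringBatchRows returns how many varchar rows fit in one read batch:
// at least one row, plus as many as the buffer can hold at maxLen width.
func stringBatchRows(readBufferSize, maxLen int, utf bool) int {
	batchRead := 1
	if utf {
		batchRead += readBufferSize / (utf8.UTFMax * maxLen)
	} else {
		batchRead += readBufferSize / maxLen
	}
	return batchRead
}

func main() {
	fmt.Println(stringBatchRows(16*1024*1024, 256, true))  // UTF-encoded strings: 16385
	fmt.Println(stringBatchRows(16*1024*1024, 256, false)) // fixed-width byte strings: 65537
}
```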
@@ -439,7 +439,7 @@ func (p *NumpyParser) consume(columnReaders []*NumpyColumnReader) error {
 	}
 	tr.Record("splitFieldsData")
 	// when the estimated size is close to blockSize, save to binlog
-	err = tryFlushBlocks(p.ctx, shards, p.collectionInfo.Schema, p.callFlushFunc, p.blockSize, MaxTotalSizeInMemory, false)
+	err = tryFlushBlocks(p.ctx, shards, p.collectionInfo.Schema, p.callFlushFunc, p.blockSize, Params.DataNodeCfg.BulkInsertMaxMemorySize.GetAsInt64(), false)
 	if err != nil {
 		return err
 	}
@@ -447,7 +447,7 @@ func (p *NumpyParser) consume(columnReaders []*NumpyColumnReader) error {
 	}

 	// force flush at the end
-	return tryFlushBlocks(p.ctx, shards, p.collectionInfo.Schema, p.callFlushFunc, p.blockSize, MaxTotalSizeInMemory, true)
+	return tryFlushBlocks(p.ctx, shards, p.collectionInfo.Schema, p.callFlushFunc, p.blockSize, Params.DataNodeCfg.BulkInsertMaxMemorySize.GetAsInt64(), true)
 }

 // readData method reads numpy data section into a storage.FieldData
@@ -94,7 +94,7 @@ func NewParquetParser(ctx context.Context,
 		return nil, err
 	}

-	fileReader, err := pqarrow.NewFileReader(reader, pqarrow.ArrowReadProperties{BatchSize: 1}, memory.DefaultAllocator)
+	fileReader, err := pqarrow.NewFileReader(reader, pqarrow.ArrowReadProperties{}, memory.DefaultAllocator)
 	if err != nil {
 		log.Warn("create arrow parquet file reader failed", zap.Error(err))
 		return nil, err
@@ -165,10 +165,22 @@ func (p *ParquetParser) createReaders() error {
 			return merr.WrapErrImportFailed(fmt.Sprintf("there is multi field of fieldName: %s", field.GetName()))
 		}
 		if !verifyFieldSchema(field.GetDataType(), field.GetElementType(), fields[0]) {
+			if fields[0].Type.ID() == arrow.LIST {
+				log.Warn("field schema is not match",
+					zap.String("fieldName", field.GetName()),
+					zap.String("collection schema", field.GetDataType().String()),
+					zap.String("file schema", fields[0].Type.Name()),
+					zap.String("collection schema element type", field.GetElementType().String()),
+					zap.String("file list element type", fields[0].Type.(*arrow.ListType).ElemField().Type.Name()))
+				return merr.WrapErrImportFailed(fmt.Sprintf("array field schema is not match of field: %s, collection field element dataType: %s, file field element dataType:%s",
+					field.GetName(), field.GetElementType().String(), fields[0].Type.(*arrow.ListType).ElemField().Type.Name()))
+			}
 			log.Warn("field schema is not match",
 				zap.String("fieldName", field.GetName()),
 				zap.String("collection schema", field.GetDataType().String()),
 				zap.String("file schema", fields[0].Type.Name()))
-			return merr.WrapErrImportFailed(fmt.Sprintf("field schema is not match, collection field dataType: %s, file field dataType:%s", field.GetDataType().String(), fields[0].Type.Name()))
+			return merr.WrapErrImportFailed(fmt.Sprintf("schema is not match of field: %s, collection field dataType: %s, file field dataType:%s",
+				field.GetName(), field.GetDataType().String(), fields[0].Type.Name()))
 		}
 		indices := schema.FieldIndices(field.GetName())
 		if len(indices) != 1 {
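The clearer error path above inspects the Arrow LIST element type before reporting the mismatch. A small sketch of that inspection (the arrow module version is an assumption for illustration; use whatever the repo pins):

```go
package main

import (
	"fmt"

	"github.com/apache/arrow/go/v12/arrow" // version assumed, not taken from the diff
)

func main() {
	// Build a LIST<float32> type and pull out the pieces used by the new log fields.
	listType := arrow.ListOf(arrow.PrimitiveTypes.Float32)
	fmt.Println(listType.ID() == arrow.LIST)      // true: this branch takes the array-specific message
	fmt.Println(listType.Name())                  // "list": what the generic message would have printed
	fmt.Println(listType.ElemField().Type.Name()) // "float32": the element type named in the new message
}
```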
@@ -315,7 +327,7 @@ func (p *ParquetParser) consume() error {
 	}
 	tr.Record("splitFieldsData")
 	// when the estimated size is close to blockSize, save to binlog
-	err = tryFlushBlocks(p.ctx, shards, p.collectionInfo.Schema, p.callFlushFunc, p.blockSize, MaxTotalSizeInMemory, false)
+	err = tryFlushBlocks(p.ctx, shards, p.collectionInfo.Schema, p.callFlushFunc, p.blockSize, Params.DataNodeCfg.BulkInsertMaxMemorySize.GetAsInt64(), false)
 	if err != nil {
 		return err
 	}
@@ -323,7 +335,7 @@ func (p *ParquetParser) consume() error {
 	}

 	// force flush at the end
-	return tryFlushBlocks(p.ctx, shards, p.collectionInfo.Schema, p.callFlushFunc, p.blockSize, MaxTotalSizeInMemory, true)
+	return tryFlushBlocks(p.ctx, shards, p.collectionInfo.Schema, p.callFlushFunc, p.blockSize, Params.DataNodeCfg.BulkInsertMaxMemorySize.GetAsInt64(), true)
 }

 // readData method reads Parquet data section into a storage.FieldData
@@ -332,13 +344,13 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 	case schemapb.DataType_Bool:
 		data, err := ReadData(columnReader, rowCount, func(chunk arrow.Array) ([]bool, error) {
 			boolReader, ok := chunk.(*array.Boolean)
-			boolData := make([]bool, 0)
 			if !ok {
 				log.Warn("the column data in parquet is not bool", zap.String("fieldName", columnReader.fieldName))
 				return nil, merr.WrapErrImportFailed(fmt.Sprintf("the column data in parquet is not bool of field: %s", columnReader.fieldName))
 			}
+			boolData := make([]bool, boolReader.Data().Len())
 			for i := 0; i < boolReader.Data().Len(); i++ {
-				boolData = append(boolData, boolReader.Value(i))
+				boolData[i] = boolReader.Value(i)
 			}
 			return boolData, nil
 		})
@@ -353,13 +365,13 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 	case schemapb.DataType_Int8:
 		data, err := ReadData(columnReader, rowCount, func(chunk arrow.Array) ([]int8, error) {
 			int8Reader, ok := chunk.(*array.Int8)
-			int8Data := make([]int8, 0)
 			if !ok {
 				log.Warn("the column data in parquet is not int8", zap.String("fieldName", columnReader.fieldName))
 				return nil, merr.WrapErrImportFailed(fmt.Sprintf("the column data in parquet is not int8 of field: %s", columnReader.fieldName))
 			}
+			int8Data := make([]int8, int8Reader.Data().Len())
 			for i := 0; i < int8Reader.Data().Len(); i++ {
-				int8Data = append(int8Data, int8Reader.Value(i))
+				int8Data[i] = int8Reader.Value(i)
 			}
 			return int8Data, nil
 		})
@@ -374,13 +386,13 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 	case schemapb.DataType_Int16:
 		data, err := ReadData(columnReader, rowCount, func(chunk arrow.Array) ([]int16, error) {
 			int16Reader, ok := chunk.(*array.Int16)
-			int16Data := make([]int16, 0)
 			if !ok {
 				log.Warn("the column data in parquet is not int16", zap.String("fieldName", columnReader.fieldName))
 				return nil, merr.WrapErrImportFailed(fmt.Sprintf("the column data in parquet is not int16 of field: %s", columnReader.fieldName))
 			}
+			int16Data := make([]int16, int16Reader.Data().Len())
 			for i := 0; i < int16Reader.Data().Len(); i++ {
-				int16Data = append(int16Data, int16Reader.Value(i))
+				int16Data[i] = int16Reader.Value(i)
 			}
 			return int16Data, nil
 		})
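All of the scalar branches in this file switch from appending into an empty slice to sizing the slice once and assigning by index. A minimal illustration of the pattern, detached from Arrow (hypothetical helper):

```go
package main

import "fmt"

// copyValues allocates the destination once and writes by index,
// avoiding repeated growth of an initially zero-length slice.
func copyValues(src []int16) []int16 {
	dst := make([]int16, len(src)) // sized up front
	for i := 0; i < len(src); i++ {
		dst[i] = src[i]
	}
	return dst
}

func main() {
	fmt.Println(copyValues([]int16{1, 2, 3})) // [1 2 3]
}
```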
@@ -395,13 +407,13 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 	case schemapb.DataType_Int32:
 		data, err := ReadData(columnReader, rowCount, func(chunk arrow.Array) ([]int32, error) {
 			int32Reader, ok := chunk.(*array.Int32)
-			int32Data := make([]int32, 0)
 			if !ok {
 				log.Warn("the column data in parquet is not int32", zap.String("fieldName", columnReader.fieldName))
 				return nil, merr.WrapErrImportFailed(fmt.Sprintf("the column data in parquet is not int32 of field: %s", columnReader.fieldName))
 			}
+			int32Data := make([]int32, int32Reader.Data().Len())
 			for i := 0; i < int32Reader.Data().Len(); i++ {
-				int32Data = append(int32Data, int32Reader.Value(i))
+				int32Data[i] = int32Reader.Value(i)
 			}
 			return int32Data, nil
 		})
@@ -416,13 +428,13 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 	case schemapb.DataType_Int64:
 		data, err := ReadData(columnReader, rowCount, func(chunk arrow.Array) ([]int64, error) {
 			int64Reader, ok := chunk.(*array.Int64)
-			int64Data := make([]int64, 0)
 			if !ok {
 				log.Warn("the column data in parquet is not int64", zap.String("fieldName", columnReader.fieldName))
 				return nil, merr.WrapErrImportFailed(fmt.Sprintf("the column data in parquet is not int64 of field: %s", columnReader.fieldName))
 			}
+			int64Data := make([]int64, int64Reader.Data().Len())
 			for i := 0; i < int64Reader.Data().Len(); i++ {
-				int64Data = append(int64Data, int64Reader.Value(i))
+				int64Data[i] = int64Reader.Value(i)
 			}
 			return int64Data, nil
 		})
@@ -437,13 +449,13 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 	case schemapb.DataType_Float:
 		data, err := ReadData(columnReader, rowCount, func(chunk arrow.Array) ([]float32, error) {
 			float32Reader, ok := chunk.(*array.Float32)
-			float32Data := make([]float32, 0)
 			if !ok {
 				log.Warn("the column data in parquet is not float", zap.String("fieldName", columnReader.fieldName))
 				return nil, merr.WrapErrImportFailed(fmt.Sprintf("the column data in parquet is not float of field: %s", columnReader.fieldName))
 			}
+			float32Data := make([]float32, float32Reader.Data().Len())
 			for i := 0; i < float32Reader.Data().Len(); i++ {
-				float32Data = append(float32Data, float32Reader.Value(i))
+				float32Data[i] = float32Reader.Value(i)
 			}
 			return float32Data, nil
 		})
@@ -464,13 +476,13 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 	case schemapb.DataType_Double:
 		data, err := ReadData(columnReader, rowCount, func(chunk arrow.Array) ([]float64, error) {
 			float64Reader, ok := chunk.(*array.Float64)
-			float64Data := make([]float64, 0)
 			if !ok {
 				log.Warn("the column data in parquet is not double", zap.String("fieldName", columnReader.fieldName))
 				return nil, merr.WrapErrImportFailed(fmt.Sprintf("the column data in parquet is not double of field: %s", columnReader.fieldName))
 			}
+			float64Data := make([]float64, float64Reader.Data().Len())
 			for i := 0; i < float64Reader.Data().Len(); i++ {
-				float64Data = append(float64Data, float64Reader.Value(i))
+				float64Data[i] = float64Reader.Value(i)
 			}
 			return float64Data, nil
 		})
@@ -491,13 +503,13 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 	case schemapb.DataType_VarChar, schemapb.DataType_String:
 		data, err := ReadData(columnReader, rowCount, func(chunk arrow.Array) ([]string, error) {
 			stringReader, ok := chunk.(*array.String)
-			stringData := make([]string, 0)
 			if !ok {
 				log.Warn("the column data in parquet is not string", zap.String("fieldName", columnReader.fieldName))
 				return nil, merr.WrapErrImportFailed(fmt.Sprintf("the column data in parquet is not string of field: %s", columnReader.fieldName))
 			}
+			stringData := make([]string, stringReader.Data().Len())
 			for i := 0; i < stringReader.Data().Len(); i++ {
-				stringData = append(stringData, stringReader.Value(i))
+				stringData[i] = stringReader.Value(i)
 			}
 			return stringData, nil
 		})
@@ -513,13 +525,13 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 		// JSON field read data from string array Parquet
 		data, err := ReadData(columnReader, rowCount, func(chunk arrow.Array) ([]string, error) {
 			stringReader, ok := chunk.(*array.String)
-			stringData := make([]string, 0)
 			if !ok {
 				log.Warn("the column data in parquet is not json string", zap.String("fieldName", columnReader.fieldName))
 				return nil, merr.WrapErrImportFailed(fmt.Sprintf("the column data in parquet is not json string of field: %s", columnReader.fieldName))
 			}
+			stringData := make([]string, stringReader.Data().Len())
 			for i := 0; i < stringReader.Data().Len(); i++ {
-				stringData = append(stringData, stringReader.Value(i))
+				stringData[i] = stringReader.Value(i)
 			}
 			return stringData, nil
 		})
@@ -545,7 +557,7 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 		}, nil
 	case schemapb.DataType_BinaryVector:
 		data, err := ReadArrayData(columnReader, rowCount, func(offsets []int32, reader arrow.Array) ([][]uint8, error) {
-			arrayData := make([][]uint8, 0)
+			arrayData := make([][]uint8, 0, len(offsets))
 			uint8Reader, ok := reader.(*array.Uint8)
 			if !ok {
 				log.Warn("the column element data of array in parquet is not binary", zap.String("fieldName", columnReader.fieldName))
@@ -553,7 +565,7 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 			}
 			for i := 1; i < len(offsets); i++ {
 				start, end := offsets[i-1], offsets[i]
-				elementData := make([]uint8, 0)
+				elementData := make([]uint8, 0, end-start)
 				for j := start; j < end; j++ {
 					elementData = append(elementData, uint8Reader.Value(int(j)))
 				}
@@ -586,7 +598,7 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 		rowNum := 0
 		if columnReader.columnReader.Field().Type.(*arrow.ListType).Elem().ID() == arrow.FLOAT32 {
 			arrayData, err := ReadArrayData(columnReader, rowCount, func(offsets []int32, reader arrow.Array) ([][]float32, error) {
-				arrayData := make([][]float32, 0)
+				arrayData := make([][]float32, 0, len(offsets))
 				float32Reader, ok := reader.(*array.Float32)
 				if !ok {
 					log.Warn("the column element data of array in parquet is not float", zap.String("fieldName", columnReader.fieldName))
@@ -594,7 +606,7 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 				}
 				for i := 1; i < len(offsets); i++ {
 					start, end := offsets[i-1], offsets[i]
-					elementData := make([]float32, 0)
+					elementData := make([]float32, 0, end-start)
 					for j := start; j < end; j++ {
 						elementData = append(elementData, float32Reader.Value(int(j)))
 					}
@@ -617,7 +629,7 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 			rowNum = len(arrayData)
 		} else if columnReader.columnReader.Field().Type.(*arrow.ListType).Elem().ID() == arrow.FLOAT64 {
 			arrayData, err := ReadArrayData(columnReader, rowCount, func(offsets []int32, reader arrow.Array) ([][]float64, error) {
-				arrayData := make([][]float64, 0)
+				arrayData := make([][]float64, 0, len(offsets))
 				float64Reader, ok := reader.(*array.Float64)
 				if !ok {
 					log.Warn("the column element data of array in parquet is not double", zap.String("fieldName", columnReader.fieldName))
@@ -625,7 +637,7 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 				}
 				for i := 1; i < len(offsets); i++ {
 					start, end := offsets[i-1], offsets[i]
-					elementData := make([]float64, 0)
+					elementData := make([]float64, 0, end-start)
 					for j := start; j < end; j++ {
 						elementData = append(elementData, float64Reader.Value(int(j)))
 					}
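The vector and array branches keep append but now pass capacity hints derived from the Arrow offsets. A self-contained sketch of that offsets-based splitting (hypothetical helper, plain slices instead of Arrow readers):

```go
package main

import "fmt"

// splitByOffsets groups a flat value slice into rows using Arrow-style offsets,
// pre-reserving capacity the way the updated code does.
func splitByOffsets(values []float32, offsets []int32) [][]float32 {
	out := make([][]float32, 0, len(offsets)) // one entry per offset interval
	for i := 1; i < len(offsets); i++ {
		start, end := offsets[i-1], offsets[i]
		elem := make([]float32, 0, end-start) // exact per-row capacity
		for j := start; j < end; j++ {
			elem = append(elem, values[j])
		}
		out = append(out, elem)
	}
	return out
}

func main() {
	fmt.Println(splitByOffsets([]float32{1, 2, 3, 4, 5}, []int32{0, 2, 5})) // [[1 2] [3 4 5]]
}
```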
@@ -671,7 +683,7 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 		switch columnReader.elementType {
 		case schemapb.DataType_Bool:
 			boolArray, err := ReadArrayData(columnReader, rowCount, func(offsets []int32, reader arrow.Array) ([][]bool, error) {
-				arrayData := make([][]bool, 0)
+				arrayData := make([][]bool, 0, len(offsets))
 				boolReader, ok := reader.(*array.Boolean)
 				if !ok {
 					log.Warn("the column element data of array in parquet is not bool", zap.String("fieldName", columnReader.fieldName))
@@ -679,7 +691,7 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 				}
 				for i := 1; i < len(offsets); i++ {
 					start, end := offsets[i-1], offsets[i]
-					elementData := make([]bool, 0)
+					elementData := make([]bool, 0, end-start)
 					for j := start; j < end; j++ {
 						elementData = append(elementData, boolReader.Value(int(j)))
 					}
@@ -701,7 +713,7 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 			}
 		case schemapb.DataType_Int8:
 			int8Array, err := ReadArrayData(columnReader, rowCount, func(offsets []int32, reader arrow.Array) ([][]int32, error) {
-				arrayData := make([][]int32, 0)
+				arrayData := make([][]int32, 0, len(offsets))
 				int8Reader, ok := reader.(*array.Int8)
 				if !ok {
 					log.Warn("the column element data of array in parquet is not int8", zap.String("fieldName", columnReader.fieldName))
@@ -709,7 +721,7 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 				}
 				for i := 1; i < len(offsets); i++ {
 					start, end := offsets[i-1], offsets[i]
-					elementData := make([]int32, 0)
+					elementData := make([]int32, 0, end-start)
 					for j := start; j < end; j++ {
 						elementData = append(elementData, int32(int8Reader.Value(int(j))))
 					}
@@ -731,7 +743,7 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 			}
 		case schemapb.DataType_Int16:
 			int16Array, err := ReadArrayData(columnReader, rowCount, func(offsets []int32, reader arrow.Array) ([][]int32, error) {
-				arrayData := make([][]int32, 0)
+				arrayData := make([][]int32, 0, len(offsets))
 				int16Reader, ok := reader.(*array.Int16)
 				if !ok {
 					log.Warn("the column element data of array in parquet is not int16", zap.String("fieldName", columnReader.fieldName))
@@ -739,7 +751,7 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 				}
 				for i := 1; i < len(offsets); i++ {
 					start, end := offsets[i-1], offsets[i]
-					elementData := make([]int32, 0)
+					elementData := make([]int32, 0, end-start)
 					for j := start; j < end; j++ {
 						elementData = append(elementData, int32(int16Reader.Value(int(j))))
 					}
@@ -762,7 +774,7 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int

 		case schemapb.DataType_Int32:
 			int32Array, err := ReadArrayData(columnReader, rowCount, func(offsets []int32, reader arrow.Array) ([][]int32, error) {
-				arrayData := make([][]int32, 0)
+				arrayData := make([][]int32, 0, len(offsets))
 				int32Reader, ok := reader.(*array.Int32)
 				if !ok {
 					log.Warn("the column element data of array in parquet is not int32", zap.String("fieldName", columnReader.fieldName))
@@ -770,7 +782,7 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 				}
 				for i := 1; i < len(offsets); i++ {
 					start, end := offsets[i-1], offsets[i]
-					elementData := make([]int32, 0)
+					elementData := make([]int32, 0, end-start)
 					for j := start; j < end; j++ {
 						elementData = append(elementData, int32Reader.Value(int(j)))
 					}
@@ -793,7 +805,7 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int

 		case schemapb.DataType_Int64:
 			int64Array, err := ReadArrayData(columnReader, rowCount, func(offsets []int32, reader arrow.Array) ([][]int64, error) {
-				arrayData := make([][]int64, 0)
+				arrayData := make([][]int64, 0, len(offsets))
 				int64Reader, ok := reader.(*array.Int64)
 				if !ok {
 					log.Warn("the column element data of array in parquet is not int64", zap.String("fieldName", columnReader.fieldName))
@@ -801,7 +813,7 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 				}
 				for i := 1; i < len(offsets); i++ {
 					start, end := offsets[i-1], offsets[i]
-					elementData := make([]int64, 0)
+					elementData := make([]int64, 0, end-start)
 					for j := start; j < end; j++ {
 						elementData = append(elementData, int64Reader.Value(int(j)))
 					}
@@ -824,7 +836,7 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int

 		case schemapb.DataType_Float:
 			float32Array, err := ReadArrayData(columnReader, rowCount, func(offsets []int32, reader arrow.Array) ([][]float32, error) {
-				arrayData := make([][]float32, 0)
+				arrayData := make([][]float32, 0, len(offsets))
 				float32Reader, ok := reader.(*array.Float32)
 				if !ok {
 					log.Warn("the column element data of array in parquet is not float", zap.String("fieldName", columnReader.fieldName))
@@ -832,7 +844,7 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 				}
 				for i := 1; i < len(offsets); i++ {
 					start, end := offsets[i-1], offsets[i]
-					elementData := make([]float32, 0)
+					elementData := make([]float32, 0, end-start)
 					for j := start; j < end; j++ {
 						elementData = append(elementData, float32Reader.Value(int(j)))
 					}
@@ -855,7 +867,7 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int

 		case schemapb.DataType_Double:
 			float64Array, err := ReadArrayData(columnReader, rowCount, func(offsets []int32, reader arrow.Array) ([][]float64, error) {
-				arrayData := make([][]float64, 0)
+				arrayData := make([][]float64, 0, len(offsets))
 				float64Reader, ok := reader.(*array.Float64)
 				if !ok {
 					log.Warn("the column element data of array in parquet is not double", zap.String("fieldName", columnReader.fieldName))
@@ -863,7 +875,7 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 				}
 				for i := 1; i < len(offsets); i++ {
 					start, end := offsets[i-1], offsets[i]
-					elementData := make([]float64, 0)
+					elementData := make([]float64, 0, end-start)
 					for j := start; j < end; j++ {
 						elementData = append(elementData, float64Reader.Value(int(j)))
 					}
@@ -886,7 +898,7 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int

 		case schemapb.DataType_VarChar, schemapb.DataType_String:
 			stringArray, err := ReadArrayData(columnReader, rowCount, func(offsets []int32, reader arrow.Array) ([][]string, error) {
-				arrayData := make([][]string, 0)
+				arrayData := make([][]string, 0, len(offsets))
 				stringReader, ok := reader.(*array.String)
 				if !ok {
 					log.Warn("the column element data of array in parquet is not string", zap.String("fieldName", columnReader.fieldName))
@@ -894,7 +906,7 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 				}
 				for i := 1; i < len(offsets); i++ {
 					start, end := offsets[i-1], offsets[i]
-					elementData := make([]string, 0)
+					elementData := make([]string, 0, end-start)
 					for j := start; j < end; j++ {
 						elementData = append(elementData, stringReader.Value(int(j)))
 					}
@@ -917,7 +929,7 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 		default:
 			log.Warn("unsupported element type", zap.String("element type", columnReader.elementType.String()),
 				zap.String("fieldName", columnReader.fieldName))
-			return nil, merr.WrapErrImportFailed(fmt.Sprintf("unsupported data type: %s of array", columnReader.elementType.String()))
+			return nil, merr.WrapErrImportFailed(fmt.Sprintf("unsupported data type: %s of array field: %s", columnReader.elementType.String(), columnReader.fieldName))
 		}
 		return &storage.ArrayFieldData{
 			ElementType: columnReader.elementType,
@@ -927,6 +939,6 @@ func (p *ParquetParser) readData(columnReader *ParquetColumnReader, rowCount int
 		log.Warn("Parquet parser: unsupported data type of field",
 			zap.String("dataType", columnReader.dataType.String()),
 			zap.String("fieldName", columnReader.fieldName))
-		return nil, merr.WrapErrImportFailed(fmt.Sprintf("unsupported data type: %s", columnReader.elementType.String()))
+		return nil, merr.WrapErrImportFailed(fmt.Sprintf("unsupported data type: %s of field: %s", columnReader.elementType.String(), columnReader.fieldName))
 	}
 }
@@ -284,7 +284,7 @@ func convertMilvusSchemaToArrowSchema(schema *schemapb.CollectionSchema) *arrow.
 	return arrow.NewSchema(fields, nil)
 }

-func buildArrayData(dataType, elementType schemapb.DataType, dim, rows int) arrow.Array {
+func buildArrayData(dataType, elementType schemapb.DataType, dim, rows, arrLen int) arrow.Array {
 	mem := memory.NewGoAllocator()
 	switch dataType {
 	case schemapb.DataType_Bool:
@@ -372,11 +372,11 @@ func buildArrayData(dataType, elementType schemapb.DataType, dim, rows int) arro
 		valid := make([]bool, 0, rows)
 		index := 0
 		for i := 0; i < rows; i++ {
-			index += i
+			index += arrLen
 			offsets = append(offsets, int32(index))
 			valid = append(valid, true)
 		}
-		index += rows
+		index += arrLen
 		switch elementType {
 		case schemapb.DataType_Bool:
 			builder := array.NewListBuilder(mem, &arrow.BooleanType{})
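With the new arrLen parameter, every generated array row has a fixed element count, so the list offsets advance by arrLen per row. A small sketch of the resulting offsets (hypothetical helper; the initial zero offset is an assumption, not shown in the hunk):

```go
package main

import "fmt"

// listOffsets shows how the offsets grow when each of `rows` rows
// contains exactly arrLen elements.
func listOffsets(rows, arrLen int) []int32 {
	offsets := []int32{0} // assumed starting offset
	index := 0
	for i := 0; i < rows; i++ {
		index += arrLen
		offsets = append(offsets, int32(index))
	}
	return offsets
}

func main() {
	fmt.Println(listOffsets(4, 10)) // [0 10 20 30 40]
}
```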
@@ -449,23 +449,27 @@

 func writeParquet(w io.Writer, milvusSchema *schemapb.CollectionSchema, numRows int) error {
 	schema := convertMilvusSchemaToArrowSchema(milvusSchema)
-	columns := make([]arrow.Array, 0, len(milvusSchema.Fields))
-	for _, field := range milvusSchema.Fields {
-		dim, _ := getFieldDimension(field)
-		columnData := buildArrayData(field.DataType, field.ElementType, dim, numRows)
-		columns = append(columns, columnData)
-	}
-	recordBatch := array.NewRecord(schema, columns, int64(numRows))
 	fw, err := pqarrow.NewFileWriter(schema, w, parquet.NewWriterProperties(), pqarrow.DefaultWriterProps())
 	if err != nil {
 		return err
 	}
 	defer fw.Close()

-	err = fw.Write(recordBatch)
-	if err != nil {
-		return err
+	batch := 1000
+	for i := 0; i <= numRows/batch; i++ {
+		columns := make([]arrow.Array, 0, len(milvusSchema.Fields))
+		for _, field := range milvusSchema.Fields {
+			dim, _ := getFieldDimension(field)
+			columnData := buildArrayData(field.DataType, field.ElementType, dim, batch, 10)
+			columns = append(columns, columnData)
+		}
+		recordBatch := array.NewRecord(schema, columns, int64(batch))
+		err = fw.Write(recordBatch)
+		if err != nil {
+			return err
+		}
 	}

 	return nil
 }
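A note on the rewritten test helper: numRows no longer sets the exact row count; it only controls how many fixed-size batches the loop emits, and each iteration writes exactly `batch` rows. A tiny sketch of that loop count (hypothetical helper):

```go
package main

import "fmt"

// batchesWritten counts the iterations of `for i := 0; i <= numRows/batch; i++`.
func batchesWritten(numRows, batch int) int {
	count := 0
	for i := 0; i <= numRows/batch; i++ {
		count++
	}
	return count
}

func main() {
	fmt.Println(batchesWritten(100, 1000))  // 1 batch  -> 1000 rows written
	fmt.Println(batchesWritten(2500, 1000)) // 3 batches -> 3000 rows written
}
```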
@@ -2590,6 +2590,8 @@ type dataNodeConfig struct {

 	// timeout for bulkinsert
 	BulkInsertTimeoutSeconds ParamItem `refreshable:"true"`
+	BulkInsertReadBufferSize ParamItem `refreshable:"true"`
+	BulkInsertMaxMemorySize  ParamItem `refreshable:"true"`

 	// Skip BF
 	SkipBFStatsLoad ParamItem `refreshable:"true"`
@@ -2781,6 +2783,22 @@ func (p *dataNodeConfig) init(base *BaseTable) {
 	}
 	p.BulkInsertTimeoutSeconds.Init(base.mgr)

+	p.BulkInsertReadBufferSize = ParamItem{
+		Key:          "datanode.bulkinsert.readBufferSize",
+		Version:      "2.3.4",
+		PanicIfEmpty: false,
+		DefaultValue: "16777216",
+	}
+	p.BulkInsertReadBufferSize.Init(base.mgr)
+
+	p.BulkInsertMaxMemorySize = ParamItem{
+		Key:          "datanode.bulkinsert.maxMemorySize",
+		Version:      "2.3.4",
+		PanicIfEmpty: false,
+		DefaultValue: "6442450944",
+	}
+	p.BulkInsertMaxMemorySize.Init(base.mgr)
+
 	p.ChannelWorkPoolSize = ParamItem{
 		Key:     "datanode.channel.workPoolSize",
 		Version: "2.3.2",
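The DefaultValue strings above are the old hard-coded limits expressed in bytes; a quick check:

```go
package main

import "fmt"

func main() {
	fmt.Println(16*1024*1024 == 16777216)       // datanode.bulkinsert.readBufferSize default = old 16MB ReadBufferSize
	fmt.Println(6*1024*1024*1024 == 6442450944) // datanode.bulkinsert.maxMemorySize default = old 6GB MaxTotalSizeInMemory
}
```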