Fix bug of bulkinsert for dynamic field (#24569)

Signed-off-by: yhmo <yihua.mo@zilliz.com>
groot 2023-06-01 14:14:31 +08:00 committed by GitHub
parent 5caa654622
commit 3022e37298
3 changed files with 135 additions and 25 deletions

@@ -401,24 +401,33 @@ func fillDynamicData(blockData map[storage.FieldID]storage.FieldData, collection
 	rowCount := 0
 	if len(blockData) > 0 {
-		for _, v := range blockData {
+		for id, v := range blockData {
+			if id == dynamicFieldID {
+				continue
+			}
 			rowCount = v.RowNum()
 		}
 	}

-	_, ok := blockData[dynamicFieldID]
-	if !ok {
-		data := &storage.JSONFieldData{
+	dynamicData, ok := blockData[dynamicFieldID]
+	if !ok || dynamicData == nil {
+		// dynamic field data is not provided, create new one
+		dynamicData = &storage.JSONFieldData{
 			Data: make([][]byte, 0),
 		}
+	}

+	if dynamicData.RowNum() == 0 {
+		// fill the dynamic data by row count
+		data := dynamicData.(*storage.JSONFieldData)
 		bs := []byte("{}")
 		for i := 0; i < rowCount; i++ {
 			data.Data = append(data.Data, bs)
 		}
-		blockData[dynamicFieldID] = data
 	}

+	blockData[dynamicFieldID] = dynamicData
 	return nil
 }
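
The fill rule above can be read outside the diff. The following is a minimal, self-contained sketch with a simplified stand-in type (jsonField is hypothetical, not storage.JSONFieldData): a missing or empty dynamic field is padded with one "{}" per row, while caller-provided dynamic data passes through untouched.

package main

import "fmt"

// jsonField is a simplified stand-in for storage.JSONFieldData in this sketch.
type jsonField struct {
	Data [][]byte
}

// fillDynamic mirrors the fixed logic: create the field if absent,
// pad with "{}" only when it holds no rows, otherwise keep it as-is.
func fillDynamic(dynamic *jsonField, rowCount int) *jsonField {
	if dynamic == nil {
		// dynamic field data is not provided, create new one
		dynamic = &jsonField{Data: make([][]byte, 0)}
	}
	if len(dynamic.Data) == 0 {
		// fill the dynamic data by row count
		for i := 0; i < rowCount; i++ {
			dynamic.Data = append(dynamic.Data, []byte("{}"))
		}
	}
	return dynamic
}

func main() {
	filled := fillDynamic(nil, 3)
	fmt.Println(len(filled.Data), string(filled.Data[0])) // prints: 3 {}
}
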
@@ -446,6 +455,12 @@ func tryFlushBlocks(ctx context.Context,
 		}
 		blockData := blocksData[i]
+		err := fillDynamicData(blockData, collectionSchema)
+		if err != nil {
+			log.Error("Import util: failed to fill dynamic field", zap.Error(err))
+			return fmt.Errorf("failed to fill dynamic field, error: %w", err)
+		}
+
 		// Note: even rowCount is 0, the size is still non-zero
 		size := 0
 		rowCount := 0
@@ -457,12 +472,7 @@ func tryFlushBlocks(ctx context.Context,
 		// force to flush, called at the end of Read()
 		if force && rowCount > 0 {
 			printFieldsDataInfo(blockData, "import util: prepare to force flush a block", nil)
-			err := fillDynamicData(blockData, collectionSchema)
-			if err != nil {
-				log.Error("Import util: failed to fill dynamic field", zap.Error(err))
-				return fmt.Errorf("failed to fill dynamic field, error: %w", err)
-			}
-			err = callFlushFunc(blockData, i)
+			err := callFlushFunc(blockData, i)
 			if err != nil {
 				log.Error("Import util: failed to force flush block data", zap.Int("shardID", i), zap.Error(err))
 				return fmt.Errorf("failed to force flush block data for shard id %d, error: %w", i, err)
@@ -481,12 +491,7 @@ func tryFlushBlocks(ctx context.Context,
 		// initialize a new FieldData list for next round batch read
 		if size > int(blockSize) && rowCount > 0 {
 			printFieldsDataInfo(blockData, "import util: prepare to flush block larger than blockSize", nil)
-			err := fillDynamicData(blockData, collectionSchema)
-			if err != nil {
-				log.Error("Import util: failed to fill dynamic field", zap.Error(err))
-				return fmt.Errorf("failed to fill dynamic field, error: %w", err)
-			}
-			err = callFlushFunc(blockData, i)
+			err := callFlushFunc(blockData, i)
 			if err != nil {
 				log.Error("Import util: failed to flush block data", zap.Int("shardID", i), zap.Error(err))
 				return fmt.Errorf("failed to flush block data for shard id %d, error: %w", i, err)
@@ -520,6 +525,12 @@ func tryFlushBlocks(ctx context.Context,
 	}
 	blockData := blocksData[biggestItem]
+	err := fillDynamicData(blockData, collectionSchema)
+	if err != nil {
+		log.Error("Import util: failed to fill dynamic field", zap.Error(err))
+		return fmt.Errorf("failed to fill dynamic field, error: %w", err)
+	}
+
 	// Note: even rowCount is 0, the size is still non-zero
 	size := 0
 	rowCount := 0
@@ -530,11 +541,6 @@ func tryFlushBlocks(ctx context.Context,
 	if rowCount > 0 {
 		printFieldsDataInfo(blockData, "import util: prepare to flush biggest block", nil)
-		err := fillDynamicData(blockData, collectionSchema)
-		if err != nil {
-			log.Error("Import util: failed to fill dynamic field", zap.Error(err))
-			return fmt.Errorf("failed to fill dynamic field, error: %w", err)
-		}
 		err = callFlushFunc(blockData, biggestItem)
 		if err != nil {
 			log.Error("Import util: failed to flush biggest block data", zap.Int("shardID", biggestItem))

@@ -738,6 +738,10 @@ func (p *NumpyParser) checkRowCount(fieldsData map[storage.FieldID]storage.Field
 		if !schema.GetAutoID() {
 			v, ok := fieldsData[schema.GetFieldID()]
 			if !ok {
+				if schema.GetIsDynamic() {
+					// user might not provide numpy file for dynamic field, skip it, will auto-generate later
+					continue
+				}
 				log.Error("Numpy parser: field not provided", zap.String("fieldName", schema.GetName()))
 				return 0, nil, fmt.Errorf("field '%s' not provided", schema.GetName())
 			}
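
In other words, a missing numpy file remains an error for ordinary fields but is now tolerated for the dynamic field, whose data fillDynamicData generates later. A minimal sketch of this rule with simplified stand-in types (not the real NumpyParser API):

package main

import "fmt"

// fieldSchema is a simplified stand-in for schemapb.FieldSchema in this sketch.
type fieldSchema struct {
	Name      string
	IsDynamic bool
}

// checkProvided returns an error only when a non-dynamic field has no data.
func checkProvided(fields []fieldSchema, provided map[string]bool) error {
	for _, f := range fields {
		if !provided[f.Name] {
			if f.IsDynamic {
				continue // will be auto-generated later, as in fillDynamicData
			}
			return fmt.Errorf("field '%s' not provided", f.Name)
		}
	}
	return nil
}

func main() {
	fields := []fieldSchema{
		{Name: "FieldInt64"},
		{Name: "FieldDynamic", IsDynamic: true},
	}
	err := checkProvided(fields, map[string]bool{"FieldInt64": true})
	fmt.Println(err) // prints: <nil>
}
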
@@ -852,12 +856,16 @@ func (p *NumpyParser) splitFieldsData(fieldsData map[storage.FieldID]storage.Fie
 			schema := p.collectionSchema.Fields[k]
 			srcData := fieldsData[schema.GetFieldID()]
 			targetData := shards[shard][schema.GetFieldID()]
+			if srcData == nil && schema.GetIsDynamic() {
+				// user might not provide numpy file for dynamic field, skip it, will auto-generate later
+				continue
+			}
 			if srcData == nil || targetData == nil {
 				log.Error("Numpy parser: cannot append data since source or target field data is nil",
 					zap.String("FieldName", schema.GetName()),
 					zap.Bool("sourceNil", srcData == nil), zap.Bool("targetNil", targetData == nil))
-				return fmt.Errorf("cannot append data for field '%s' since source or target field data is nil",
-					primaryKey.GetName())
+				return fmt.Errorf("cannot append data for field '%s', possibly no any fields corresponding to this numpy file, or a required numpy file is not provided",
+					schema.GetName())
 			}
 			appendFunc := appendFunctions[schema.GetName()]
 			err := appendFunc(srcData, i, targetData)
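
splitFieldsData gets the same tolerance: a nil source is skipped only for the dynamic field, and the error for any other nil field now names the offending field (schema.GetName()) instead of the primary key. A compact sketch of that branch, again with hypothetical simplified types:

package main

import "fmt"

// field is a simplified stand-in for the schema type in this sketch.
type field struct {
	Name      string
	IsDynamic bool
}

// appendRow copies row i from src into dst, tolerating a missing
// dynamic column; any other missing column is reported by field name.
func appendRow(f field, src []string, dst []string, i int) ([]string, error) {
	if src == nil && f.IsDynamic {
		return dst, nil // dynamic data is generated later; nothing to copy
	}
	if src == nil || dst == nil {
		return nil, fmt.Errorf("cannot append data for field '%s'", f.Name)
	}
	return append(dst, src[i]), nil
}

func main() {
	dynamic := field{Name: "FieldDynamic", IsDynamic: true}
	dst, err := appendRow(dynamic, nil, []string{}, 0)
	fmt.Println(len(dst), err) // prints: 0 <nil>
}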

@@ -125,10 +125,40 @@ func Test_NumpyParserValidateFileNames(t *testing.T) {
 	err = parser.validateFileNames(fileNames)
 	assert.Error(t, err)

-	//valid
+	// valid
 	fileNames = append(fileNames, "FieldFloatVector.npy")
 	err = parser.validateFileNames(fileNames)
 	assert.NoError(t, err)
+
+	// has dynamic field
+	parser.collectionSchema = &schemapb.CollectionSchema{
+		Name:               "schema",
+		Description:        "schema",
+		AutoID:             true,
+		EnableDynamicField: true,
+		Fields: []*schemapb.FieldSchema{
+			{
+				FieldID:      101,
+				Name:         "FieldInt64",
+				IsPrimaryKey: true,
+				AutoID:       false,
+				DataType:     schemapb.DataType_Int64,
+			},
+			{
+				FieldID:   102,
+				Name:      "FieldDynamic",
+				IsDynamic: true,
+				DataType:  schemapb.DataType_JSON,
+			},
+		},
+	}
+	fileNames = []string{"FieldInt64.npy"}
+	err = parser.validateFileNames(fileNames)
+	assert.NoError(t, err)
+
+	fileNames = append(fileNames, "FieldDynamic.npy")
+	err = parser.validateFileNames(fileNames)
+	assert.NoError(t, err)
 }

 func Test_NumpyParserValidateHeader(t *testing.T) {
@@ -641,6 +671,37 @@ func Test_NumpyParserCheckRowCount(t *testing.T) {
 	assert.Error(t, err)
 	assert.Zero(t, rowCount)
 	assert.Nil(t, primaryKey)
+
+	// has dynamic field
+	parser.collectionSchema = &schemapb.CollectionSchema{
+		Name:               "schema",
+		Description:        "schema",
+		AutoID:             true,
+		EnableDynamicField: true,
+		Fields: []*schemapb.FieldSchema{
+			{
+				FieldID:      101,
+				Name:         "FieldInt64",
+				IsPrimaryKey: true,
+				AutoID:       false,
+				DataType:     schemapb.DataType_Int64,
+			},
+			{
+				FieldID:   102,
+				Name:      "FieldDynamic",
+				IsDynamic: true,
+				DataType:  schemapb.DataType_JSON,
+			},
+		},
+	}
+	segmentData[101] = &storage.Int64FieldData{
+		Data: []int64{1, 2, 4},
+	}
+	rowCount, primaryKey, err = parser.checkRowCount(segmentData)
+	assert.NoError(t, err)
+	assert.Equal(t, 3, rowCount)
+	assert.NotNil(t, primaryKey)
 }

 func Test_NumpyParserSplitFieldsData(t *testing.T) {
@@ -729,6 +790,41 @@ func Test_NumpyParserSplitFieldsData(t *testing.T) {
 		schema.AutoID = false
 	})
+
+	t.Run("has dynamic field", func(t *testing.T) {
+		parser.collectionSchema = &schemapb.CollectionSchema{
+			Name:               "schema",
+			Description:        "schema",
+			AutoID:             true,
+			EnableDynamicField: true,
+			Fields: []*schemapb.FieldSchema{
+				{
+					FieldID:      101,
+					Name:         "FieldInt64",
+					IsPrimaryKey: true,
+					AutoID:       false,
+					DataType:     schemapb.DataType_Int64,
+				},
+				{
+					FieldID:   102,
+					Name:      "FieldDynamic",
+					IsDynamic: true,
+					DataType:  schemapb.DataType_JSON,
+				},
+			},
+		}
+		shards = make([]map[storage.FieldID]storage.FieldData, 0, parser.shardNum)
+		for i := 0; i < int(parser.shardNum); i++ {
+			segmentData := initSegmentData(parser.collectionSchema)
+			shards = append(shards, segmentData)
+		}
+		segmentData = make(map[storage.FieldID]storage.FieldData)
+		segmentData[101] = &storage.Int64FieldData{
+			Data: []int64{1, 2, 4},
+		}
+		err = parser.splitFieldsData(segmentData, shards)
+		assert.NoError(t, err)
+	})
 }

 func Test_NumpyParserCalcRowCountPerBlock(t *testing.T) {