// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package importutil

import (
	"context"
	"encoding/json"
	"fmt"
	"path"
	"runtime/debug"
	"strconv"
	"strings"

	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"

	"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
	"github.com/milvus-io/milvus/internal/allocator"
	"github.com/milvus-io/milvus/internal/storage"
	"github.com/milvus-io/milvus/pkg/common"
	"github.com/milvus-io/milvus/pkg/log"
	"github.com/milvus-io/milvus/pkg/util/merr"
	"github.com/milvus-io/milvus/pkg/util/typeutil"
)

type (
	BlockData map[storage.FieldID]storage.FieldData // a map of field ID to field data
	ShardData map[int64]BlockData                   // a map of partition ID to block data
)

func isCanceled(ctx context.Context) bool {
	// canceled?
	select {
	case <-ctx.Done():
		return true
	default:
		break
	}
	return false
}

func initBlockData(collectionSchema *schemapb.CollectionSchema) BlockData {
	blockData := make(BlockData)
	// rowID field is a hidden field with fieldID=0, it is always auto-generated by IDAllocator
	// if primary key is int64 and autoID=true, primary key field is equal to rowID field
	blockData[common.RowIDField] = &storage.Int64FieldData{
		Data: make([]int64, 0),
	}

	for i := 0; i < len(collectionSchema.Fields); i++ {
		schema := collectionSchema.Fields[i]
		switch schema.DataType {
		case schemapb.DataType_Bool:
			blockData[schema.GetFieldID()] = &storage.BoolFieldData{
				Data: make([]bool, 0),
			}
		case schemapb.DataType_Float:
			blockData[schema.GetFieldID()] = &storage.FloatFieldData{
				Data: make([]float32, 0),
			}
		case schemapb.DataType_Double:
			blockData[schema.GetFieldID()] = &storage.DoubleFieldData{
				Data: make([]float64, 0),
			}
		case schemapb.DataType_Int8:
			blockData[schema.GetFieldID()] = &storage.Int8FieldData{
				Data: make([]int8, 0),
			}
		case schemapb.DataType_Int16:
			blockData[schema.GetFieldID()] = &storage.Int16FieldData{
				Data: make([]int16, 0),
			}
		case schemapb.DataType_Int32:
			blockData[schema.GetFieldID()] = &storage.Int32FieldData{
				Data: make([]int32, 0),
			}
		case schemapb.DataType_Int64:
			blockData[schema.GetFieldID()] = &storage.Int64FieldData{
				Data: make([]int64, 0),
			}
		case schemapb.DataType_BinaryVector:
			dim, _ := getFieldDimension(schema)
			blockData[schema.GetFieldID()] = &storage.BinaryVectorFieldData{
				Data: make([]byte, 0),
				Dim:  dim,
			}
		case schemapb.DataType_FloatVector:
			dim, _ := getFieldDimension(schema)
			blockData[schema.GetFieldID()] = &storage.FloatVectorFieldData{
				Data: make([]float32, 0),
				Dim:  dim,
			}
		case schemapb.DataType_String, schemapb.DataType_VarChar:
			blockData[schema.GetFieldID()] = &storage.StringFieldData{
				Data: make([]string, 0),
			}
		case schemapb.DataType_JSON:
			blockData[schema.GetFieldID()] = &storage.JSONFieldData{
				Data: make([][]byte, 0),
			}
		case schemapb.DataType_Array:
			blockData[schema.GetFieldID()] = &storage.ArrayFieldData{
				Data:        make([]*schemapb.ScalarField, 0),
				ElementType: schema.GetElementType(),
			}
		default:
			log.Warn("Import util: unsupported data type", zap.String("DataType", getTypeName(schema.DataType)))
			return nil
		}
	}

	return blockData
}

func initShardData(collectionSchema *schemapb.CollectionSchema, partitionIDs []int64) ShardData {
	shardData := make(ShardData)
	for i := 0; i < len(partitionIDs); i++ {
		blockData := initBlockData(collectionSchema)
		if blockData == nil {
			return nil
		}
		shardData[partitionIDs[i]] = blockData
	}

	return shardData
}

func parseFloat(s string, bitsize int, fieldName string) (float64, error) {
	value, err := strconv.ParseFloat(s, bitsize)
	if err != nil {
		return 0, merr.WrapErrImportFailed(fmt.Sprintf("failed to parse value '%s' for field '%s', error: %v", s, fieldName, err))
	}

	err = typeutil.VerifyFloat(value)
	if err != nil {
		return 0, merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%s' for field '%s', error: %v", s, fieldName, err))
	}

	return value, nil
}
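// exampleInitShardDataLayout is an illustrative sketch only (it is not referenced by the rest
// of this package): it shows how initShardData lays data out as partitionID -> fieldID -> FieldData
// for a minimal, hypothetical collection schema with an int64 primary key and a 4-dim float vector.
func exampleInitShardDataLayout() {
	schema := &schemapb.CollectionSchema{
		Name: "demo_collection", // hypothetical collection name
		Fields: []*schemapb.FieldSchema{
			{FieldID: 100, Name: "id", IsPrimaryKey: true, DataType: schemapb.DataType_Int64},
			{
				FieldID: 101, Name: "vec", DataType: schemapb.DataType_FloatVector,
				TypeParams: []*commonpb.KeyValuePair{{Key: common.DimKey, Value: "4"}},
			},
		},
	}

	// two hypothetical partition IDs; each partition gets its own empty BlockData
	shard := initShardData(schema, []int64{500, 501})
	for partitionID, block := range shard {
		// block holds one entry for the hidden rowID field plus one entry per schema field
		fmt.Println(partitionID, len(block), block[100].RowNum())
	}
}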
// Validator is field value validator
type Validator struct {
	convertFunc func(obj interface{}, field storage.FieldData) error // convert data function
	primaryKey  bool                                                 // true for primary key
	autoID      bool                                                 // only for primary key field
	isString    bool                                                 // for string field
	dimension   int                                                  // only for vector field
	fieldName   string                                               // field name
	fieldID     int64                                                // field ID
}

// initValidators constructs validator methods and data conversion methods
func initValidators(collectionSchema *schemapb.CollectionSchema, validators map[storage.FieldID]*Validator) error {
	if collectionSchema == nil {
		return merr.WrapErrImportFailed("collection schema is nil")
	}

	for i := 0; i < len(collectionSchema.Fields); i++ {
		schema := collectionSchema.Fields[i]

		validators[schema.GetFieldID()] = &Validator{}
		validators[schema.GetFieldID()].primaryKey = schema.GetIsPrimaryKey()
		validators[schema.GetFieldID()].autoID = schema.GetAutoID()
		validators[schema.GetFieldID()].fieldName = schema.GetName()
		validators[schema.GetFieldID()].fieldID = schema.GetFieldID()
		validators[schema.GetFieldID()].isString = false

		switch schema.DataType {
		case schemapb.DataType_Bool:
			validators[schema.GetFieldID()].convertFunc = func(obj interface{}, field storage.FieldData) error {
				if value, ok := obj.(bool); ok {
					field.(*storage.BoolFieldData).Data = append(field.(*storage.BoolFieldData).Data, value)
				} else {
					return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for bool type field '%s'", obj, schema.GetName()))
				}
				return nil
			}
		case schemapb.DataType_Float:
			validators[schema.GetFieldID()].convertFunc = func(obj interface{}, field storage.FieldData) error {
				if num, ok := obj.(json.Number); ok {
					value, err := parseFloat(string(num), 32, schema.GetName())
					if err != nil {
						return err
					}
					field.(*storage.FloatFieldData).Data = append(field.(*storage.FloatFieldData).Data, float32(value))
				} else {
					return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for float type field '%s'", obj, schema.GetName()))
				}
				return nil
			}
		case schemapb.DataType_Double:
			validators[schema.GetFieldID()].convertFunc = func(obj interface{}, field storage.FieldData) error {
				if num, ok := obj.(json.Number); ok {
					value, err := parseFloat(string(num), 64, schema.GetName())
					if err != nil {
						return err
					}
					field.(*storage.DoubleFieldData).Data = append(field.(*storage.DoubleFieldData).Data, value)
				} else {
					return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for double type field '%s'", obj, schema.GetName()))
				}
				return nil
			}
		case schemapb.DataType_Int8:
			validators[schema.GetFieldID()].convertFunc = func(obj interface{}, field storage.FieldData) error {
				if num, ok := obj.(json.Number); ok {
					value, err := strconv.ParseInt(string(num), 0, 8)
					if err != nil {
						return merr.WrapErrImportFailed(fmt.Sprintf("failed to parse value '%v' for int8 field '%s', error: %v", num, schema.GetName(), err))
					}
					field.(*storage.Int8FieldData).Data = append(field.(*storage.Int8FieldData).Data, int8(value))
				} else {
					return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for int8 type field '%s'", obj, schema.GetName()))
				}
				return nil
			}
		case schemapb.DataType_Int16:
			validators[schema.GetFieldID()].convertFunc = func(obj interface{}, field storage.FieldData) error {
				if num, ok := obj.(json.Number); ok {
					value, err := strconv.ParseInt(string(num), 0, 16)
					if err != nil {
						return merr.WrapErrImportFailed(fmt.Sprintf("failed to parse value '%v' for int16 field '%s', error: %v", num, schema.GetName(), err))
					}
					field.(*storage.Int16FieldData).Data = append(field.(*storage.Int16FieldData).Data, int16(value))
				} else {
					return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for int16 type field '%s'", obj, schema.GetName()))
				}
				return nil
			}
		case schemapb.DataType_Int32:
			validators[schema.GetFieldID()].convertFunc = func(obj interface{}, field storage.FieldData) error {
				if num, ok := obj.(json.Number); ok {
					value, err := strconv.ParseInt(string(num), 0, 32)
					if err != nil {
						return merr.WrapErrImportFailed(fmt.Sprintf("failed to parse value '%v' for int32 field '%s', error: %v", num, schema.GetName(), err))
					}
					field.(*storage.Int32FieldData).Data = append(field.(*storage.Int32FieldData).Data, int32(value))
				} else {
					return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for int32 type field '%s'", obj, schema.GetName()))
				}
				return nil
			}
		case schemapb.DataType_Int64:
			validators[schema.GetFieldID()].convertFunc = func(obj interface{}, field storage.FieldData) error {
				if num, ok := obj.(json.Number); ok {
					value, err := strconv.ParseInt(string(num), 0, 64)
					if err != nil {
						return merr.WrapErrImportFailed(fmt.Sprintf("failed to parse value '%v' for int64 field '%s', error: %v", num, schema.GetName(), err))
					}
					field.(*storage.Int64FieldData).Data = append(field.(*storage.Int64FieldData).Data, value)
				} else {
					return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for int64 type field '%s'", obj, schema.GetName()))
				}
				return nil
			}
		case schemapb.DataType_BinaryVector:
			dim, err := getFieldDimension(schema)
			if err != nil {
				return err
			}
			validators[schema.GetFieldID()].dimension = dim

			validators[schema.GetFieldID()].convertFunc = func(obj interface{}, field storage.FieldData) error {
				arr, ok := obj.([]interface{})
				if !ok {
					return merr.WrapErrImportFailed(fmt.Sprintf("'%v' is not an array for binary vector field '%s'", obj, schema.GetName()))
				}
				// we use uint8 to represent binary vector in json file, each uint8 value represents 8 dimensions.
				if len(arr)*8 != dim {
					return merr.WrapErrImportFailed(fmt.Sprintf("bit size %d doesn't equal to vector dimension %d of field '%s'", len(arr)*8, dim, schema.GetName()))
				}

				for i := 0; i < len(arr); i++ {
					if num, ok := arr[i].(json.Number); ok {
						value, err := strconv.ParseUint(string(num), 0, 8)
						if err != nil {
							return merr.WrapErrImportFailed(fmt.Sprintf("failed to parse value '%v' for binary vector field '%s', error: %v", num, schema.GetName(), err))
						}
						field.(*storage.BinaryVectorFieldData).Data = append(field.(*storage.BinaryVectorFieldData).Data, byte(value))
					} else {
						return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for binary vector field '%s'", obj, schema.GetName()))
					}
				}

				return nil
			}
		case schemapb.DataType_FloatVector:
			dim, err := getFieldDimension(schema)
			if err != nil {
				return err
			}
			validators[schema.GetFieldID()].dimension = dim

			validators[schema.GetFieldID()].convertFunc = func(obj interface{}, field storage.FieldData) error {
				arr, ok := obj.([]interface{})
				if !ok {
					return merr.WrapErrImportFailed(fmt.Sprintf("'%v' is not an array for float vector field '%s'", obj, schema.GetName()))
				}
				if len(arr) != dim {
					return merr.WrapErrImportFailed(fmt.Sprintf("array size %d doesn't equal to vector dimension %d of field '%s'", len(arr), dim, schema.GetName()))
				}

				for i := 0; i < len(arr); i++ {
					if num, ok := arr[i].(json.Number); ok {
						value, err := parseFloat(string(num), 32, schema.GetName())
						if err != nil {
							return err
						}
						field.(*storage.FloatVectorFieldData).Data = append(field.(*storage.FloatVectorFieldData).Data, float32(value))
					} else {
						return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for float vector field '%s'", obj, schema.GetName()))
					}
				}

				return nil
			}
		case schemapb.DataType_String, schemapb.DataType_VarChar:
			validators[schema.GetFieldID()].isString = true

			validators[schema.GetFieldID()].convertFunc = func(obj interface{}, field storage.FieldData) error {
				if value, ok := obj.(string); ok {
					field.(*storage.StringFieldData).Data = append(field.(*storage.StringFieldData).Data, value)
				} else {
					return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for varchar type field '%s'", obj, schema.GetName()))
				}
				return nil
			}
		case schemapb.DataType_JSON:
			validators[schema.GetFieldID()].convertFunc = func(obj interface{}, field storage.FieldData) error {
				// for JSON data, we accept two kinds of input: string and map[string]interface
				// user can write JSON content as {"FieldJSON": "{\"x\": 8}"} or {"FieldJSON": {"x": 8}}
				if value, ok := obj.(string); ok {
					var dummy interface{}
					err := json.Unmarshal([]byte(value), &dummy)
					if err != nil {
						return merr.WrapErrImportFailed(fmt.Sprintf("failed to parse value '%v' for JSON field '%s', error: %v", value, schema.GetName(), err))
					}
					field.(*storage.JSONFieldData).Data = append(field.(*storage.JSONFieldData).Data, []byte(value))
				} else if mp, ok := obj.(map[string]interface{}); ok {
					bs, err := json.Marshal(mp)
					if err != nil {
						return merr.WrapErrImportFailed(fmt.Sprintf("failed to parse value for JSON field '%s', error: %v", schema.GetName(), err))
					}
					field.(*storage.JSONFieldData).Data = append(field.(*storage.JSONFieldData).Data, bs)
				} else {
					return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for JSON type field '%s'", obj, schema.GetName()))
				}
				return nil
			}
		case schemapb.DataType_Array:
			validators[schema.GetFieldID()].convertFunc = func(obj interface{}, field storage.FieldData) error {
				arr, ok := obj.([]interface{})
				if !ok {
					return merr.WrapErrImportFailed(fmt.Sprintf("'%v' is not an array for array field '%s'", obj, schema.GetName()))
				}

				return getArrayElementData(schema, arr, field)
			}
		default:
			return merr.WrapErrImportFailed(fmt.Sprintf("unsupported data type: %s", getTypeName(collectionSchema.Fields[i].DataType)))
		}
	}

	return nil
}
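// exampleConvertWithValidator is an illustrative sketch only (not used by this package): it wires
// up validators for a hypothetical one-field schema and converts a single decoded JSON value with
// the generated convertFunc. Numeric values decoded with json.Decoder.UseNumber() arrive as
// json.Number, which is the input form the numeric convert functions above expect.
func exampleConvertWithValidator() error {
	schema := &schemapb.CollectionSchema{
		Name: "demo_collection", // hypothetical
		Fields: []*schemapb.FieldSchema{
			{FieldID: 102, Name: "count", DataType: schemapb.DataType_Int64},
		},
	}

	validators := make(map[storage.FieldID]*Validator)
	if err := initValidators(schema, validators); err != nil {
		return err
	}

	// the target FieldData must match the field type, here an int64 column
	field := &storage.Int64FieldData{Data: make([]int64, 0)}
	if err := validators[102].convertFunc(json.Number("42"), field); err != nil {
		return err
	}
	fmt.Println(field.RowNum()) // 1 row appended
	return nil
}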
func getArrayElementData(schema *schemapb.FieldSchema, arr []interface{}, field storage.FieldData) error {
	switch schema.GetElementType() {
	case schemapb.DataType_Bool:
		boolData := make([]bool, 0)
		for i := 0; i < len(arr); i++ {
			if value, ok := arr[i].(bool); ok {
				boolData = append(boolData, value)
			} else {
				return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for bool array field '%s'", arr, schema.GetName()))
			}
		}
		field.(*storage.ArrayFieldData).Data = append(field.(*storage.ArrayFieldData).Data, &schemapb.ScalarField{
			Data: &schemapb.ScalarField_BoolData{
				BoolData: &schemapb.BoolArray{
					Data: boolData,
				},
			},
		})
	case schemapb.DataType_Int8:
		int8Data := make([]int32, 0)
		for i := 0; i < len(arr); i++ {
			if num, ok := arr[i].(json.Number); ok {
				value, err := strconv.ParseInt(string(num), 0, 8)
				if err != nil {
					return err
				}
				int8Data = append(int8Data, int32(value))
			} else {
				return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for int array field '%s'", arr, schema.GetName()))
			}
		}
		field.(*storage.ArrayFieldData).Data = append(field.(*storage.ArrayFieldData).Data, &schemapb.ScalarField{
			Data: &schemapb.ScalarField_IntData{
				IntData: &schemapb.IntArray{
					Data: int8Data,
				},
			},
		})
	case schemapb.DataType_Int16:
		int16Data := make([]int32, 0)
		for i := 0; i < len(arr); i++ {
			if num, ok := arr[i].(json.Number); ok {
				value, err := strconv.ParseInt(string(num), 0, 16)
				if err != nil {
					return err
				}
				int16Data = append(int16Data, int32(value))
			} else {
				return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for int array field '%s'", arr, schema.GetName()))
			}
		}
		field.(*storage.ArrayFieldData).Data = append(field.(*storage.ArrayFieldData).Data, &schemapb.ScalarField{
			Data: &schemapb.ScalarField_IntData{
				IntData: &schemapb.IntArray{
					Data: int16Data,
				},
			},
		})
	case schemapb.DataType_Int32:
		intData := make([]int32, 0)
		for i := 0; i < len(arr); i++ {
			if num, ok := arr[i].(json.Number); ok {
				value, err := strconv.ParseInt(string(num), 0, 32)
				if err != nil {
					return err
				}
				intData = append(intData, int32(value))
			} else {
				return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for int array field '%s'", arr, schema.GetName()))
			}
		}
		field.(*storage.ArrayFieldData).Data = append(field.(*storage.ArrayFieldData).Data, &schemapb.ScalarField{
			Data: &schemapb.ScalarField_IntData{
				IntData: &schemapb.IntArray{
					Data: intData,
				},
			},
		})
	case schemapb.DataType_Int64:
		longData := make([]int64, 0)
		for i := 0; i < len(arr); i++ {
			if num, ok := arr[i].(json.Number); ok {
				value, err := strconv.ParseInt(string(num), 0, 64)
				if err != nil {
					return err
				}
				longData = append(longData, value)
			} else {
				return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for long array field '%s'", arr, schema.GetName()))
			}
		}
		field.(*storage.ArrayFieldData).Data = append(field.(*storage.ArrayFieldData).Data, &schemapb.ScalarField{
			Data: &schemapb.ScalarField_LongData{
				LongData: &schemapb.LongArray{
					Data: longData,
				},
			},
		})
	case schemapb.DataType_Float:
		floatData := make([]float32, 0)
		for i := 0; i < len(arr); i++ {
			if num, ok := arr[i].(json.Number); ok {
				value, err := parseFloat(string(num), 32, schema.GetName())
				if err != nil {
					return err
				}
				floatData = append(floatData, float32(value))
			} else {
				return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for float array field '%s'", arr, schema.GetName()))
			}
		}
		field.(*storage.ArrayFieldData).Data = append(field.(*storage.ArrayFieldData).Data, &schemapb.ScalarField{
			Data: &schemapb.ScalarField_FloatData{
				FloatData: &schemapb.FloatArray{
					Data: floatData,
				},
			},
		})
	case schemapb.DataType_Double:
		doubleData := make([]float64, 0)
		for i := 0; i < len(arr); i++ {
			if num, ok := arr[i].(json.Number); ok {
				value, err := parseFloat(string(num), 64, schema.GetName())
				if err != nil {
					return err
				}
				doubleData = append(doubleData, value)
			} else {
				return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for double array field '%s'", arr, schema.GetName()))
			}
		}
		field.(*storage.ArrayFieldData).Data = append(field.(*storage.ArrayFieldData).Data, &schemapb.ScalarField{
			Data: &schemapb.ScalarField_DoubleData{
				DoubleData: &schemapb.DoubleArray{
					Data: doubleData,
				},
			},
		})
	case schemapb.DataType_String, schemapb.DataType_VarChar:
		stringFieldData := &schemapb.ScalarField{
			Data: &schemapb.ScalarField_StringData{
				StringData: &schemapb.StringArray{
					Data: make([]string, 0),
				},
			},
		}
		for i := 0; i < len(arr); i++ {
			if str, ok := arr[i].(string); ok {
				stringFieldData.GetStringData().Data = append(stringFieldData.GetStringData().Data, str)
			} else {
				return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for string array field '%s'", arr, schema.GetName()))
			}
		}
		field.(*storage.ArrayFieldData).Data = append(field.(*storage.ArrayFieldData).Data, stringFieldData)
	default:
		return merr.WrapErrImportFailed(fmt.Sprintf("unsupported element type: %v", getTypeName(schema.GetElementType())))
	}
	return nil
}
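// exampleArrayElementConversion is an illustrative sketch only (not used elsewhere): it converts
// one decoded JSON array value, e.g. [1, 2, 3] whose elements arrive as json.Number, into a
// schemapb.ScalarField appended to a storage.ArrayFieldData via getArrayElementData above.
// The field name and ID are hypothetical.
func exampleArrayElementConversion() error {
	schema := &schemapb.FieldSchema{
		FieldID:     103,
		Name:        "tags",
		DataType:    schemapb.DataType_Array,
		ElementType: schemapb.DataType_Int32,
	}
	field := &storage.ArrayFieldData{Data: make([]*schemapb.ScalarField, 0)}

	row := []interface{}{json.Number("1"), json.Number("2"), json.Number("3")}
	if err := getArrayElementData(schema, row, field); err != nil {
		return err
	}
	fmt.Println(field.Data[0].GetIntData().GetData()) // [1 2 3]
	return nil
}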
func printFieldsDataInfo(fieldsData BlockData, msg string, files []string) {
	stats := make([]zapcore.Field, 0)
	for k, v := range fieldsData {
		stats = append(stats, zap.Int(strconv.FormatInt(k, 10), v.RowNum()))
	}

	if len(files) > 0 {
		stats = append(stats, zap.Any(Files, files))
	}
	log.Info(msg, stats...)
}

// GetFileNameAndExt extracts file name and extension
// for example: "/a/b/c.ttt" returns "c" and ".ttt"
func GetFileNameAndExt(filePath string) (string, string) {
	fileName := path.Base(filePath)
	fileType := path.Ext(fileName)
	fileNameWithoutExt := strings.TrimSuffix(fileName, fileType)
	return fileNameWithoutExt, fileType
}

// getFieldDimension gets dimension of vector field
func getFieldDimension(schema *schemapb.FieldSchema) (int, error) {
	for _, kvPair := range schema.GetTypeParams() {
		key, value := kvPair.GetKey(), kvPair.GetValue()
		if key == common.DimKey {
			dim, err := strconv.Atoi(value)
			if err != nil {
				return 0, merr.WrapErrImportFailed(fmt.Sprintf("illegal vector dimension '%s' for field '%s', error: %v", value, schema.GetName(), err))
			}
			return dim, nil
		}
	}

	return 0, merr.WrapErrImportFailed(fmt.Sprintf("vector dimension is not defined for field '%s'", schema.GetName()))
}

// triggerGC triggers golang gc to return all free memory back to the underlying system at once,
// Note: this operation is expensive, and can lead to latency spikes as it holds the heap lock through the whole process
func triggerGC() {
	debug.FreeOSMemory()
}

// if user didn't provide dynamic data, fill the dynamic field by "{}"
func fillDynamicData(blockData BlockData, collectionSchema *schemapb.CollectionSchema) error {
	if !collectionSchema.GetEnableDynamicField() {
		return nil
	}

	dynamicFieldID := int64(-1)
	for i := 0; i < len(collectionSchema.Fields); i++ {
		schema := collectionSchema.Fields[i]
		if schema.GetIsDynamic() {
			dynamicFieldID = schema.GetFieldID()
			break
		}
	}

	if dynamicFieldID < 0 {
		return merr.WrapErrImportFailed("the collection schema is dynamic but dynamic field is not found")
	}

	rowCount := 0
	if len(blockData) > 0 {
		for id, v := range blockData {
			if id == dynamicFieldID {
				continue
			}
			rowCount = v.RowNum()
		}
	}

	dynamicData, ok := blockData[dynamicFieldID]
	if !ok || dynamicData == nil {
		// dynamic field data is not provided, create new one
		dynamicData = &storage.JSONFieldData{
			Data: make([][]byte, 0),
		}
	}

	if dynamicData.RowNum() < rowCount {
		// fill the dynamic data by an empty JSON object, make sure the row count is equal to other fields
		data := dynamicData.(*storage.JSONFieldData)
		bs := []byte("{}")
		dynamicRowCount := dynamicData.RowNum()
		for i := 0; i < rowCount-dynamicRowCount; i++ {
			data.Data = append(data.Data, bs)
		}
	}
	blockData[dynamicFieldID] = dynamicData
	return nil
}
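// exampleFillDynamicData is an illustrative sketch only (not used elsewhere): for a hypothetical
// dynamic-enabled schema, fillDynamicData pads the dynamic JSON field with "{}" rows until its
// row count matches the other fields, so an input file that omits dynamic values still imports.
func exampleFillDynamicData() error {
	schema := &schemapb.CollectionSchema{
		Name:               "demo_collection", // hypothetical
		EnableDynamicField: true,
		Fields: []*schemapb.FieldSchema{
			{FieldID: 100, Name: "id", IsPrimaryKey: true, DataType: schemapb.DataType_Int64},
			{FieldID: 104, Name: "$meta", IsDynamic: true, DataType: schemapb.DataType_JSON},
		},
	}

	block := BlockData{
		100: &storage.Int64FieldData{Data: []int64{1, 2, 3}}, // 3 rows provided by the user
	}
	if err := fillDynamicData(block, schema); err != nil {
		return err
	}
	fmt.Println(block[104].RowNum()) // 3, padded with "{}" rows
	return nil
}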
// tryFlushBlocks does two things:
// 1. if the accumulated data of a block exceeds blockSize, call callFlushFunc to generate a new binlog file
// 2. if the total accumulated data exceeds maxTotalSize, call callFlushFunc to flush the biggest block
func tryFlushBlocks(ctx context.Context,
	shardsData []ShardData,
	collectionSchema *schemapb.CollectionSchema,
	callFlushFunc ImportFlushFunc,
	blockSize int64,
	maxTotalSize int64,
	force bool,
) error {
	totalSize := 0
	biggestSize := 0
	biggestItem := -1
	biggestPartition := int64(-1)

	// 1. if the accumulated data of a block exceeds blockSize, call callFlushFunc to generate a new binlog file
	for i := 0; i < len(shardsData); i++ {
		// outside context might be canceled(service stop, or future enhancement for canceling import task)
		if isCanceled(ctx) {
			log.Warn("Import util: import task was canceled")
			return merr.WrapErrImportFailed("import task was canceled")
		}

		shardData := shardsData[i]
		for partitionID, blockData := range shardData {
			err := fillDynamicData(blockData, collectionSchema)
			if err != nil {
				log.Warn("Import util: failed to fill dynamic field", zap.Error(err))
				return merr.WrapErrImportFailed(fmt.Sprintf("failed to fill dynamic field, error: %v", err))
			}

			// Note: even rowCount is 0, the size is still non-zero
			size := 0
			rowCount := 0
			for _, fieldData := range blockData {
				size += fieldData.GetMemorySize()
				rowCount = fieldData.RowNum()
			}

			// force to flush, called at the end of Read()
			if force && rowCount > 0 {
				printFieldsDataInfo(blockData, "import util: prepare to force flush a block", nil)
				err := callFlushFunc(blockData, i, partitionID)
				if err != nil {
					log.Warn("Import util: failed to force flush block data", zap.Int("shardID", i),
						zap.Int64("partitionID", partitionID), zap.Error(err))
					return merr.WrapErrImportFailed(fmt.Sprintf("failed to force flush block data for shard id %d to partition %d, error: %v", i, partitionID, err))
				}
				log.Info("Import util: force flush", zap.Int("rowCount", rowCount), zap.Int("size", size),
					zap.Int("shardID", i), zap.Int64("partitionID", partitionID))

				shardData[partitionID] = initBlockData(collectionSchema)
				if shardData[partitionID] == nil {
					log.Warn("Import util: failed to initialize FieldData list", zap.Int("shardID", i),
						zap.Int64("partitionID", partitionID))
					return merr.WrapErrImportFailed(fmt.Sprintf("failed to initialize FieldData list for shard id %d to partition %d", i, partitionID))
				}
				continue
			}

			// if segment size is larger than predefined blockSize, flush to create a new binlog file
			// initialize a new FieldData list for next round batch read
			if size > int(blockSize) && rowCount > 0 {
				printFieldsDataInfo(blockData, "import util: prepare to flush block larger than blockSize", nil)
				err := callFlushFunc(blockData, i, partitionID)
				if err != nil {
					log.Warn("Import util: failed to flush block data", zap.Int("shardID", i),
						zap.Int64("partitionID", partitionID), zap.Error(err))
					return merr.WrapErrImportFailed(fmt.Sprintf("failed to flush block data for shard id %d to partition %d, error: %v", i, partitionID, err))
				}
				log.Info("Import util: block size exceeds limit and flush", zap.Int("rowCount", rowCount), zap.Int("size", size),
					zap.Int("shardID", i), zap.Int64("partitionID", partitionID), zap.Int64("blockSize", blockSize))

				shardData[partitionID] = initBlockData(collectionSchema)
				if shardData[partitionID] == nil {
					log.Warn("Import util: failed to initialize FieldData list", zap.Int("shardID", i),
						zap.Int64("partitionID", partitionID))
					return merr.WrapErrImportFailed(fmt.Sprintf("failed to initialize FieldData list for shard id %d to partition %d", i, partitionID))
				}
				continue
			}

			// calculate the total size(ignore the flushed blocks)
			// find out the biggest block for the step 2
			totalSize += size
			if size > biggestSize {
				biggestSize = size
				biggestItem = i
				biggestPartition = partitionID
			}
		}
	}

	// 2. if the total accumulated data exceeds maxTotalSize, call callFlushFunc to flush the biggest block
	if totalSize > int(maxTotalSize) && biggestItem >= 0 && biggestPartition >= 0 {
		// outside context might be canceled(service stop, or future enhancement for canceling import task)
		if isCanceled(ctx) {
			log.Warn("Import util: import task was canceled")
			return merr.WrapErrImportFailed("import task was canceled")
		}

		blockData := shardsData[biggestItem][biggestPartition]
		err := fillDynamicData(blockData, collectionSchema)
		if err != nil {
			log.Warn("Import util: failed to fill dynamic field", zap.Error(err))
			return merr.WrapErrImportFailed(fmt.Sprintf("failed to fill dynamic field, error: %v", err))
		}

		// Note: even rowCount is 0, the size is still non-zero
		size := 0
		rowCount := 0
		for _, fieldData := range blockData {
			size += fieldData.GetMemorySize()
			rowCount = fieldData.RowNum()
		}

		if rowCount > 0 {
			printFieldsDataInfo(blockData, "import util: prepare to flush biggest block", nil)
			err = callFlushFunc(blockData, biggestItem, biggestPartition)
			if err != nil {
				log.Warn("Import util: failed to flush biggest block data", zap.Int("shardID", biggestItem),
					zap.Int64("partitionID", biggestPartition))
				return merr.WrapErrImportFailed(fmt.Sprintf("failed to flush biggest block data for shard id %d to partition %d, error: %v", biggestItem, biggestPartition, err))
			}
			log.Info("Import util: total size exceeds limit and flush", zap.Int("rowCount", rowCount),
				zap.Int("size", size), zap.Int("totalSize", totalSize), zap.Int("shardID", biggestItem))

			shardsData[biggestItem][biggestPartition] = initBlockData(collectionSchema)
			if shardsData[biggestItem][biggestPartition] == nil {
				log.Warn("Import util: failed to initialize FieldData list", zap.Int("shardID", biggestItem),
					zap.Int64("partitionID", biggestPartition))
				return merr.WrapErrImportFailed(fmt.Sprintf("failed to initialize FieldData list for shard id %d to partition %d", biggestItem, biggestPartition))
			}
		}
	}

	return nil
}

func getTypeName(dt schemapb.DataType) string {
	switch dt {
	case schemapb.DataType_Bool:
		return "Bool"
	case schemapb.DataType_Int8:
		return "Int8"
	case schemapb.DataType_Int16:
		return "Int16"
	case schemapb.DataType_Int32:
		return "Int32"
	case schemapb.DataType_Int64:
		return "Int64"
	case schemapb.DataType_Float:
		return "Float"
	case schemapb.DataType_Double:
		return "Double"
	case schemapb.DataType_VarChar:
		return "Varchar"
	case schemapb.DataType_String:
		return "String"
	case schemapb.DataType_BinaryVector:
		return "BinaryVector"
	case schemapb.DataType_FloatVector:
		return "FloatVector"
	case schemapb.DataType_JSON:
		return "JSON"
	default:
		return "InvalidType"
	}
}

func pkToShard(pk interface{}, shardNum uint32) (uint32, error) {
	var shard uint32
	strPK, ok := pk.(string)
	if ok {
		hash := typeutil.HashString2Uint32(strPK)
		shard = hash % shardNum
	} else {
		intPK, ok := pk.(int64)
		if !ok {
			log.Warn("parser: primary key field must be int64 or varchar")
			return 0, merr.WrapErrImportFailed("primary key field must be int64 or varchar")
		}
		hash, _ := typeutil.Hash32Int64(intPK)
		shard = hash % shardNum
	}

	return shard, nil
}

func UpdateKVInfo(infos *[]*commonpb.KeyValuePair, k string, v string) error {
	if infos == nil {
		return merr.WrapErrImportFailed("Import util: kv array pointer is nil")
	}

	found := false
	for _, kv := range *infos {
		if kv.GetKey() == k {
			kv.Value = v
			found = true
		}
	}
	if !found {
		*infos = append(*infos, &commonpb.KeyValuePair{Key: k, Value: v})
	}

	return nil
}
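// examplePkToShard is an illustrative sketch only (not used elsewhere): pkToShard hashes an int64
// or varchar primary key and takes the hash modulo the shard count, so the same key always lands
// in the same shard. The key values below are arbitrary examples.
func examplePkToShard() error {
	shardNum := uint32(4)

	intShard, err := pkToShard(int64(12345), shardNum)
	if err != nil {
		return err
	}
	strShard, err := pkToShard("user_12345", shardNum)
	if err != nil {
		return err
	}
	fmt.Println(intShard, strShard) // both values fall in [0, shardNum)
	return nil
}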
// appendFunc defines the methods to append data to storage.FieldData
func appendFunc(schema *schemapb.FieldSchema) func(src storage.FieldData, n int, target storage.FieldData) error {
	switch schema.DataType {
	case schemapb.DataType_Bool:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.BoolFieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(bool))
			return nil
		}
	case schemapb.DataType_Float:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.FloatFieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(float32))
			return nil
		}
	case schemapb.DataType_Double:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.DoubleFieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(float64))
			return nil
		}
	case schemapb.DataType_Int8:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.Int8FieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(int8))
			return nil
		}
	case schemapb.DataType_Int16:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.Int16FieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(int16))
			return nil
		}
	case schemapb.DataType_Int32:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.Int32FieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(int32))
			return nil
		}
	case schemapb.DataType_Int64:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.Int64FieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(int64))
			return nil
		}
	case schemapb.DataType_BinaryVector:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.BinaryVectorFieldData)
			arr.Data = append(arr.Data, src.GetRow(n).([]byte)...)
			return nil
		}
	case schemapb.DataType_FloatVector:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.FloatVectorFieldData)
			arr.Data = append(arr.Data, src.GetRow(n).([]float32)...)
			return nil
		}
	case schemapb.DataType_String, schemapb.DataType_VarChar:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.StringFieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(string))
			return nil
		}
	case schemapb.DataType_JSON:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.JSONFieldData)
			arr.Data = append(arr.Data, src.GetRow(n).([]byte))
			return nil
		}
	case schemapb.DataType_Array:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.ArrayFieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(*schemapb.ScalarField))
			return nil
		}
	default:
		return nil
	}
}

func prepareAppendFunctions(collectionInfo *CollectionInfo) (map[string]func(src storage.FieldData, n int, target storage.FieldData) error, error) {
	appendFunctions := make(map[string]func(src storage.FieldData, n int, target storage.FieldData) error)
	for i := 0; i < len(collectionInfo.Schema.Fields); i++ {
		schema := collectionInfo.Schema.Fields[i]
		appendFuncErr := appendFunc(schema)
		if appendFuncErr == nil {
			log.Warn("parser: unsupported field data type")
			return nil, fmt.Errorf("unsupported field data type: %d", schema.GetDataType())
		}
		appendFunctions[schema.GetName()] = appendFuncErr
	}
	return appendFunctions, nil
}

// checkRowCount checks the row count of each field; all fields must have the same row count
func checkRowCount(collectionInfo *CollectionInfo, fieldsData BlockData) (int, error) {
	rowCount := 0
	rowCounter := make(map[string]int)
	for i := 0; i < len(collectionInfo.Schema.Fields); i++ {
		schema := collectionInfo.Schema.Fields[i]
		if !schema.GetAutoID() {
			v, ok := fieldsData[schema.GetFieldID()]
			if !ok {
				if schema.GetIsDynamic() {
					// user might not provide numpy file for dynamic field, skip it, will auto-generate later
					continue
				}
				log.Warn("field not provided", zap.String("fieldName", schema.GetName()))
				return 0, fmt.Errorf("field '%s' not provided", schema.GetName())
			}
			rowCounter[schema.GetName()] = v.RowNum()
			if v.RowNum() > rowCount {
				rowCount = v.RowNum()
			}
		}
	}

	for name, count := range rowCounter {
		if count != rowCount {
			log.Warn("field row count is not equal to other fields row count", zap.String("fieldName", name),
				zap.Int("rowCount", count), zap.Int("otherRowCount", rowCount))
			return 0, fmt.Errorf("field '%s' row count %d is not equal to other fields row count: %d", name, count, rowCount)
		}
	}

	return rowCount, nil
}

// hashToPartition hashes the partition key to get a partition ID, and returns the first partition ID if no partition key exists
// CollectionInfo ensures only one partition ID in the PartitionIDs if no partition key exists
func hashToPartition(collectionInfo *CollectionInfo, fieldsData BlockData, rowNumber int) (int64, error) {
	if collectionInfo.PartitionKey == nil {
		// no partition key, directly return the target partition id
		if len(collectionInfo.PartitionIDs) != 1 {
			return 0, fmt.Errorf("collection '%s' partition list is empty", collectionInfo.Schema.Name)
		}
		return collectionInfo.PartitionIDs[0], nil
	}

	partitionKeyID := collectionInfo.PartitionKey.GetFieldID()
	fieldData := fieldsData[partitionKeyID]
	value := fieldData.GetRow(rowNumber)
	index, err := pkToShard(value, uint32(len(collectionInfo.PartitionIDs)))
	if err != nil {
		return 0, err
	}

	return collectionInfo.PartitionIDs[index], nil
}
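// exampleAppendRow is an illustrative sketch only (not used elsewhere): splitFieldsData below
// copies rows with the per-field closures returned by appendFunc; this shows a single row being
// copied from a source column into a target column. The field name and ID are hypothetical.
func exampleAppendRow() error {
	schema := &schemapb.FieldSchema{FieldID: 105, Name: "age", DataType: schemapb.DataType_Int64}
	src := &storage.Int64FieldData{Data: []int64{10, 20, 30}}
	target := &storage.Int64FieldData{Data: make([]int64, 0)}

	appender := appendFunc(schema)
	if appender == nil {
		return fmt.Errorf("unsupported data type")
	}

	// copy the second row (index 1) from src into target
	if err := appender(src, 1, target); err != nil {
		return err
	}
	fmt.Println(target.Data) // [20]
	return nil
}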
// splitFieldsData is to split the in-memory data(parsed from column-based files) into shards
func splitFieldsData(collectionInfo *CollectionInfo, fieldsData BlockData, shards []ShardData, rowIDAllocator *allocator.IDAllocator) ([]int64, error) {
	if len(fieldsData) == 0 {
		log.Warn("fields data to split is empty")
		return nil, fmt.Errorf("fields data to split is empty")
	}

	if len(shards) != int(collectionInfo.ShardNum) {
		log.Warn("block count is not equal to collection shard number", zap.Int("shardsLen", len(shards)),
			zap.Int32("shardNum", collectionInfo.ShardNum))
		return nil, fmt.Errorf("block count %d is not equal to collection shard number %d", len(shards), collectionInfo.ShardNum)
	}

	rowCount, err := checkRowCount(collectionInfo, fieldsData)
	if err != nil {
		return nil, err
	}

	// generate auto id for primary key and rowid field
	rowIDBegin, rowIDEnd, err := rowIDAllocator.Alloc(uint32(rowCount))
	if err != nil {
		log.Warn("failed to alloc row ID", zap.Int("rowCount", rowCount), zap.Error(err))
		return nil, fmt.Errorf("failed to alloc %d rows ID, error: %w", rowCount, err)
	}

	rowIDField, ok := fieldsData[common.RowIDField]
	if !ok {
		rowIDField = &storage.Int64FieldData{
			Data: make([]int64, 0, rowCount),
		}
		fieldsData[common.RowIDField] = rowIDField
	}
	rowIDFieldArr := rowIDField.(*storage.Int64FieldData)
	for i := rowIDBegin; i < rowIDEnd; i++ {
		rowIDFieldArr.Data = append(rowIDFieldArr.Data, i)
	}

	// reset the primary keys, as we know, only int64 pk can be auto-generated
	primaryKey := collectionInfo.PrimaryKey
	autoIDRange := make([]int64, 0)
	if primaryKey.GetAutoID() {
		log.Info("generating auto-id", zap.Int("rowCount", rowCount), zap.Int64("rowIDBegin", rowIDBegin))
		if primaryKey.GetDataType() != schemapb.DataType_Int64 {
			log.Warn("primary key field is auto-generated but the field type is not int64")
			return nil, fmt.Errorf("primary key field is auto-generated but the field type is not int64")
		}

		primaryDataArr := &storage.Int64FieldData{
			Data: make([]int64, 0, rowCount),
		}
		for i := rowIDBegin; i < rowIDEnd; i++ {
			primaryDataArr.Data = append(primaryDataArr.Data, i)
		}

		fieldsData[primaryKey.GetFieldID()] = primaryDataArr
		autoIDRange = append(autoIDRange, rowIDBegin, rowIDEnd)
	}

	// if the primary key is not auto-generated and the user doesn't provide it, return an error
	primaryData, ok := fieldsData[primaryKey.GetFieldID()]
	if !ok || primaryData.RowNum() <= 0 {
		log.Warn("primary key field is not provided", zap.String("keyName", primaryKey.GetName()))
		return nil, fmt.Errorf("primary key '%s' field data is not provided", primaryKey.GetName())
	}

	// prepare append functions
	appendFunctions, err := prepareAppendFunctions(collectionInfo)
	if err != nil {
		return nil, err
	}

	// split data into shards
	for i := 0; i < rowCount; i++ {
		// hash to a shard number and partition
		pk := primaryData.GetRow(i)
		shard, err := pkToShard(pk, uint32(collectionInfo.ShardNum))
		if err != nil {
			return nil, err
		}

		partitionID, err := hashToPartition(collectionInfo, fieldsData, i)
		if err != nil {
			return nil, err
		}

		// set rowID field
		rowIDField := shards[shard][partitionID][common.RowIDField].(*storage.Int64FieldData)
		rowIDField.Data = append(rowIDField.Data, rowIDFieldArr.GetRow(i).(int64))

		// append row to shard
		for k := 0; k < len(collectionInfo.Schema.Fields); k++ {
			schema := collectionInfo.Schema.Fields[k]
			srcData := fieldsData[schema.GetFieldID()]
			targetData := shards[shard][partitionID][schema.GetFieldID()]
			if srcData == nil && schema.GetIsDynamic() {
				// user might not provide numpy file for dynamic field, skip it, will auto-generate later
				continue
			}
			if srcData == nil || targetData == nil {
				log.Warn("cannot append data since source or target field data is nil",
					zap.String("FieldName", schema.GetName()),
					zap.Bool("sourceNil", srcData == nil), zap.Bool("targetNil", targetData == nil))
srcData == nil), zap.Bool("targetNil", targetData == nil)) return nil, fmt.Errorf("cannot append data for field '%s', possibly no any fields corresponding to this numpy file, or a required numpy file is not provided", schema.GetName()) } appendFunc := appendFunctions[schema.GetName()] err := appendFunc(srcData, i, targetData) if err != nil { return nil, err } } } return autoIDRange, nil }