enhance: Support readable JSON file import for Float16/BFloat16/SparseFloat (#33064)

Issue: #22837 Signed-off-by: Cai Yudong <yudong.cai@zilliz.com>
2024-05-16 14:47:35 +08:00 · 2024-05-16 14:47:35 +08:00 · 4ef163fb70
parent 892fe66b57
commit 4ef163fb70
10 changed files with 246 additions and 125 deletions
--- a/internal/util/importutilv2/json/reader_test.go
+++ b/internal/util/importutilv2/json/reader_test.go
@ -128,13 +128,29 @@ func (suite *ReaderSuite) run(dataType schemapb.DataType, elemType schemapb.Data
 				}
 			case schemapb.DataType_JSON:
 				data[fieldID] = string(v.GetRow(i).([]byte))
-			case schemapb.DataType_BinaryVector, schemapb.DataType_Float16Vector, schemapb.DataType_BFloat16Vector, schemapb.DataType_SparseFloatVector:
+			case schemapb.DataType_BinaryVector:
 				bytes := v.GetRow(i).([]byte)
 				ints := make([]int, 0, len(bytes))
 				for _, b := range bytes {
 					ints = append(ints, int(b))
 				}
 				data[fieldID] = ints
+			case schemapb.DataType_Float16Vector:
+				bytes := v.GetRow(i).([]byte)
+				data[fieldID] = typeutil.Float16BytesToFloat32Vector(bytes)
+			case schemapb.DataType_BFloat16Vector:
+				bytes := v.GetRow(i).([]byte)
+				data[fieldID] = typeutil.BFloat16BytesToFloat32Vector(bytes)
+			case schemapb.DataType_SparseFloatVector:
+				bytes := v.GetRow(i).([]byte)
+				elemCount := len(bytes) / 8
+				values := make(map[uint32]float32)
+				for j := 0; j < elemCount; j++ {
+					idx := common.Endian.Uint32(bytes[j*8:])
+					f := typeutil.BytesToFloat32(bytes[j*8+4:])
+					values[idx] = f
+				}
+				data[fieldID] = values
 			default:
 				data[fieldID] = v.GetRow(i)
 			}
--- a/internal/util/importutilv2/json/row_parser.go
+++ b/internal/util/importutilv2/json/row_parser.go
@ -305,46 +305,58 @@ func (r *rowParser) parseEntity(fieldID int64, obj any) (any, error) {
 			vec[i] = float32(num)
 		}
 		return vec, nil
-	case schemapb.DataType_Float16Vector, schemapb.DataType_BFloat16Vector:
+	case schemapb.DataType_Float16Vector:
+		// parse each JSON number as a float32 and encode it as Float16 bytes
 		arr, ok := obj.([]interface{})
 		if !ok {
 			return nil, r.wrapTypeError(obj, fieldID)
 		}
-		if len(arr) != r.dim*2 {
-			return nil, r.wrapDimError(len(arr)/2, fieldID)
+		if len(arr) != r.dim {
+			return nil, r.wrapDimError(len(arr), fieldID)
 		}
-		vec := make([]byte, len(arr))
+		vec := make([]byte, len(arr)*2)
 		for i := 0; i < len(arr); i++ {
 			value, ok := arr[i].(json.Number)
 			if !ok {
 				return nil, r.wrapTypeError(arr[i], fieldID)
 			}
-			num, err := strconv.ParseUint(value.String(), 0, 8)
+			num, err := strconv.ParseFloat(value.String(), 32)
 			if err != nil {
 				return nil, err
 			}
-			vec[i] = byte(num)
+			copy(vec[i*2:], typeutil.Float32ToFloat16Bytes(float32(num)))
+		}
+		return vec, nil
+	case schemapb.DataType_BFloat16Vector:
+		// parse each JSON number as a float32 and encode it as BFloat16 bytes
+		arr, ok := obj.([]interface{})
+		if !ok {
+			return nil, r.wrapTypeError(obj, fieldID)
+		}
+		if len(arr) != r.dim {
+			return nil, r.wrapDimError(len(arr), fieldID)
+		}
+		vec := make([]byte, len(arr)*2)
+		for i := 0; i < len(arr); i++ {
+			value, ok := arr[i].(json.Number)
+			if !ok {
+				return nil, r.wrapTypeError(arr[i], fieldID)
+			}
+			num, err := strconv.ParseFloat(value.String(), 32)
+			if err != nil {
+				return nil, err
+			}
+			copy(vec[i*2:], typeutil.Float32ToBFloat16Bytes(float32(num)))
 		}
 		return vec, nil
 	case schemapb.DataType_SparseFloatVector:
-		arr, ok := obj.([]interface{})
+		arr, ok := obj.(map[string]interface{})
 		if !ok {
 			return nil, r.wrapTypeError(obj, fieldID)
 		}
-		if len(arr)%8 != 0 {
-			return nil, r.wrapDimError(len(arr), fieldID)
-		}
-		vec := make([]byte, len(arr))
-		for i := 0; i < len(arr); i++ {
-			value, ok := arr[i].(json.Number)
-			if !ok {
-				return nil, r.wrapTypeError(arr[i], fieldID)
-			}
-			num, err := strconv.ParseUint(value.String(), 0, 8)
-			if err != nil {
-				return nil, err
-			}
-			vec[i] = byte(num)
+		vec, err := typeutil.CreateSparseFloatRowFromJSON(arr)
+		if err != nil {
+			return nil, err
 		}
 		return vec, nil
 	case schemapb.DataType_String, schemapb.DataType_VarChar:
--- a/pkg/util/testutils/gen_data.go
+++ b/pkg/util/testutils/gen_data.go
@ -248,27 +248,20 @@ func GenerateFloatVectors(numRows, dim int) []float32 {

 func GenerateFloat16Vectors(numRows, dim int) []byte {
 	total := numRows * dim
-	ret := make([]byte, total*2)
+	ret := make([]byte, 0, total*2)
 	for i := 0; i < total; i++ {
-		v := float16.Fromfloat32(rand.Float32()).Bits()
-		binary.LittleEndian.PutUint16(ret[i*2:], v)
+		f := (rand.Float32() - 0.5) * 100
+		ret = append(ret, typeutil.Float32ToFloat16Bytes(f)...)
 	}
 	return ret
 }

 func GenerateBFloat16Vectors(numRows, dim int) []byte {
 	total := numRows * dim
-	ret16 := make([]uint16, 0, total)
+	ret := make([]byte, 0, total*2)
 	for i := 0; i < total; i++ {
-		f := rand.Float32()
-		bits := math.Float32bits(f)
-		bits >>= 16
-		bits &= 0x7FFF
-		ret16 = append(ret16, uint16(bits))
-	}
-	ret := make([]byte, len(ret16)*2)
-	for i, value := range ret16 {
-		binary.LittleEndian.PutUint16(ret[i*2:], value)
+		f := (rand.Float32() - 0.5) * 100
+		ret = append(ret, typeutil.Float32ToBFloat16Bytes(f)...)
 	}
 	return ret
 }
--- a/pkg/util/typeutil/convension.go
+++ b/pkg/util/typeutil/convension.go
@ -23,6 +23,7 @@ import (
 	"reflect"

 	"github.com/golang/protobuf/proto"
+	"github.com/x448/float16"
 	"go.uber.org/zap"

 	"github.com/milvus-io/milvus/pkg/common"
@ -115,3 +116,41 @@ func SliceRemoveDuplicate(a interface{}) (ret []interface{}) {

 	return ret
 }
+
+func Float32ToFloat16Bytes(f float32) []byte {
+	ret := make([]byte, 2)
+	common.Endian.PutUint16(ret[:], float16.Fromfloat32(f).Bits())
+	return ret
+}
+
+func Float16BytesToFloat32(b []byte) float32 {
+	return float16.Frombits(common.Endian.Uint16(b)).Float32()
+}
+
+func Float16BytesToFloat32Vector(b []byte) []float32 {
+	dim := len(b) / 2
+	vec := make([]float32, 0, dim)
+	for j := 0; j < dim; j++ {
+		vec = append(vec, Float16BytesToFloat32(b[j*2:]))
+	}
+	return vec
+}
+
+func Float32ToBFloat16Bytes(f float32) []byte {
+	ret := make([]byte, 2)
+	common.Endian.PutUint16(ret[:], uint16(math.Float32bits(f)>>16))
+	return ret
+}
+
+func BFloat16BytesToFloat32(b []byte) float32 {
+	return math.Float32frombits(uint32(common.Endian.Uint16(b)) << 16)
+}
+
+func BFloat16BytesToFloat32Vector(b []byte) []float32 {
+	dim := len(b) / 2
+	vec := make([]float32, 0, dim)
+	for j := 0; j < dim; j++ {
+		vec = append(vec, BFloat16BytesToFloat32(b[j*2:]))
+	}
+	return vec
+}
--- a/pkg/util/typeutil/conversion_test.go
+++ b/pkg/util/typeutil/conversion_test.go
@ -18,9 +18,13 @@ package typeutil

 import (
 	"math"
+	"math/rand"
 	"testing"

 	"github.com/stretchr/testify/assert"
+	"go.uber.org/zap"
+
+	"github.com/milvus-io/milvus/pkg/log"
 )

 func TestConversion(t *testing.T) {
@ -94,4 +98,24 @@ func TestConversion(t *testing.T) {
 		ret1 := SliceRemoveDuplicate(arr)
 		assert.Equal(t, 3, len(ret1))
 	})
+
+	t.Run("TestFloat16", func(t *testing.T) {
+		for i := 0; i < 100; i++ {
+			v := (rand.Float32() - 0.5) * 100
+			b := Float32ToFloat16Bytes(v)
+			v2 := Float16BytesToFloat32(b)
+			log.Info("float16", zap.Float32("v", v), zap.Float32("v2", v2))
+			assert.Less(t, math.Abs(float64(v2/v-1)), 0.001)
+		}
+	})
+
+	t.Run("TestBFloat16", func(t *testing.T) {
+		for i := 0; i < 100; i++ {
+			v := (rand.Float32() - 0.5) * 100
+			b := Float32ToBFloat16Bytes(v)
+			v2 := BFloat16BytesToFloat32(b)
+			log.Info("bfloat16", zap.Float32("v", v), zap.Float32("v2", v2))
+			assert.Less(t, math.Abs(float64(v2/v-1)), 0.01)
+		}
+	})
 }
--- a/pkg/util/typeutil/schema.go
+++ b/pkg/util/typeutil/schema.go
@ -17,11 +17,11 @@
 package typeutil

 import (
-	"bytes"
 	"encoding/binary"
 	"encoding/json"
 	"fmt"
 	"math"
+	"reflect"
 	"sort"
 	"strconv"
 	"unsafe"
@ -1505,6 +1505,28 @@ func SparseFloatRowSetAt(row []byte, pos int, idx uint32, value float32) {
 	binary.LittleEndian.PutUint32(row[pos*8+4:], math.Float32bits(value))
 }

+func SortSparseFloatRow(indices []uint32, values []float32) ([]uint32, []float32) {
+	elemCount := len(indices)
+
+	indexOrder := make([]int, elemCount)
+	for i := range indexOrder {
+		indexOrder[i] = i
+	}
+
+	sort.Slice(indexOrder, func(i, j int) bool {
+		return indices[indexOrder[i]] < indices[indexOrder[j]]
+	})
+
+	sortedIndices := make([]uint32, elemCount)
+	sortedValues := make([]float32, elemCount)
+	for i, index := range indexOrder {
+		sortedIndices[i] = indices[index]
+		sortedValues[i] = values[index]
+	}
+
+	return sortedIndices, sortedValues
+}
+
 func CreateSparseFloatRow(indices []uint32, values []float32) []byte {
 	row := make([]byte, len(indices)*8)
 	for i := 0; i < len(indices); i++ {
@ -1519,59 +1541,59 @@ type sparseFloatVectorJSONRepresentation struct {
 }

 // accepted format:
-//   - {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}
-//   - {"1": 0.1, "2": 0.2, "3": 0.3}
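+//   - {"indices": [1, 2, 3], "values": [0.1, 0.2, 0.3]}    # format1; NOTE(review): only matched when the slices are typed []uint32 / []float32 - confirm that JSON-decoded input can take this form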
+//   - {"1": 0.1, "2": 0.2, "3": 0.3}                       # format2
 //
 // we don't require the indices to be sorted from user input, but the returned
 // byte representation must have indices sorted
-func CreateSparseFloatRowFromJSON(input []byte) ([]byte, error) {
+func CreateSparseFloatRowFromJSON(input map[string]interface{}) ([]byte, error) {
 	var indices []uint32
 	var values []float32

-	var vec sparseFloatVectorJSONRepresentation
-	decoder := json.NewDecoder(bytes.NewReader(input))
-	decoder.DisallowUnknownFields()
-	err := decoder.Decode(&vec)
-	if err == nil {
-		if len(vec.Indices) != len(vec.Values) {
-			return nil, fmt.Errorf("indices and values length mismatch")
-		}
-		if len(vec.Indices) == 0 {
-			return nil, fmt.Errorf("empty indices/values in JSON input")
-		}
-		indices = vec.Indices
-		values = vec.Values
-	} else {
-		var vec2 map[uint32]float32
-		decoder = json.NewDecoder(bytes.NewReader(input))
-		decoder.DisallowUnknownFields()
-		err = decoder.Decode(&vec2)
-		if err != nil {
-			return nil, fmt.Errorf("failed to parse JSON input: %v", err)
-		}
-
-		for idx, val := range vec2 {
-			indices = append(indices, idx)
-			values = append(values, val)
-		}
+	if len(input) == 0 {
+		return nil, fmt.Errorf("empty JSON input")
 	}

-	indexOrder := make([]int, len(indices))
-	for i := range indexOrder {
-		indexOrder[i] = i
+	// try format1
+	indices, ok1 := input["indices"].([]uint32)
+	values, ok2 := input["values"].([]float32)
+
+	// try format2
+	if !ok1 && !ok2 {
+		for k, v := range input {
+			idx, err := strconv.ParseUint(k, 0, 32)
+			if err != nil {
+				return nil, err
+			}
+
+			var val float64
+			val, ok := v.(float64)
+			if !ok {
+				num, ok := v.(json.Number)
+				if !ok {
+					return nil, fmt.Errorf("invalid value type in JSON: %s", reflect.TypeOf(v))
+				}
+				val, err = strconv.ParseFloat(num.String(), 32)
+				if err != nil {
+					return nil, err
+				}
+			}
+
+			indices = append(indices, uint32(idx))
+			values = append(values, float32(val))
+		}
+	} else if ok1 != ok2 {
+		return nil, fmt.Errorf("invalid JSON input")
 	}

-	sort.Slice(indexOrder, func(i, j int) bool {
-		return indices[indexOrder[i]] < indices[indexOrder[j]]
-	})
-
-	sortedIndices := make([]uint32, len(indices))
-	sortedValues := make([]float32, len(values))
-	for i, index := range indexOrder {
-		sortedIndices[i] = indices[index]
-		sortedValues[i] = values[index]
+	if len(indices) != len(values) {
+		return nil, fmt.Errorf("indices and values length mismatch")
+	}
+	if len(indices) == 0 {
+		return nil, fmt.Errorf("empty indices/values in JSON input")
 	}

+	sortedIndices, sortedValues := SortSparseFloatRow(indices, values)
 	row := CreateSparseFloatRow(sortedIndices, sortedValues)
 	if err := ValidateSparseFloatRows(row); err != nil {
 		return nil, err
--- a/pkg/util/typeutil/schema_test.go
+++ b/pkg/util/typeutil/schema_test.go
@ -2120,107 +2120,89 @@ func TestValidateSparseFloatRows(t *testing.T) {

 func TestParseJsonSparseFloatRow(t *testing.T) {
 	t.Run("valid row 1", func(t *testing.T) {
-		row := []byte(`{"indices":[1,3,5],"values":[1.0,2.0,3.0]}`)
+		row := map[string]interface{}{"indices": []uint32{1, 3, 5}, "values": []float32{1.0, 2.0, 3.0}}
 		res, err := CreateSparseFloatRowFromJSON(row)
 		assert.NoError(t, err)
 		assert.Equal(t, CreateSparseFloatRow([]uint32{1, 3, 5}, []float32{1.0, 2.0, 3.0}), res)
 	})

 	t.Run("valid row 2", func(t *testing.T) {
-		row := []byte(`{"indices":[3,1,5],"values":[1.0,2.0,3.0]}`)
+		row := map[string]interface{}{"indices": []uint32{3, 1, 5}, "values": []float32{1.0, 2.0, 3.0}}
 		res, err := CreateSparseFloatRowFromJSON(row)
 		assert.NoError(t, err)
 		assert.Equal(t, CreateSparseFloatRow([]uint32{1, 3, 5}, []float32{2.0, 1.0, 3.0}), res)
 	})

 	t.Run("invalid row 1", func(t *testing.T) {
-		row := []byte(`{"indices":[1,3,5],"values":[1.0,2.0,3.0`)
+		row := map[string]interface{}{"indices": []uint32{1, 3, 5}, "values": []float32{1.0, 2.0}}
 		_, err := CreateSparseFloatRowFromJSON(row)
 		assert.Error(t, err)
 	})

 	t.Run("invalid row 2", func(t *testing.T) {
-		row := []byte(`{"indices":[1,3,5],"values":[1.0,2.0]`)
+		row := map[string]interface{}{"indices": []uint32{1}, "values": []float32{1.0, 2.0}}
 		_, err := CreateSparseFloatRowFromJSON(row)
 		assert.Error(t, err)
 	})

 	t.Run("invalid row 3", func(t *testing.T) {
-		row := []byte(`{"indices":[1],"values":[1.0,2.0]`)
+		row := map[string]interface{}{"indices": []uint32{}, "values": []float32{}}
 		_, err := CreateSparseFloatRowFromJSON(row)
 		assert.Error(t, err)
 	})

 	t.Run("invalid row 4", func(t *testing.T) {
-		row := []byte(`{"indices":[],"values":[]`)
-		_, err := CreateSparseFloatRowFromJSON(row)
-		assert.Error(t, err)
-	})
-
-	t.Run("invalid row 5", func(t *testing.T) {
-		row := []byte(`{"indices":[-3],"values":[0.2]`)
-		_, err := CreateSparseFloatRowFromJSON(row)
-		assert.Error(t, err)
-	})
-
-	t.Run("invalid row 6", func(t *testing.T) {
-		row := []byte(`{"indices":[3],"values":[-0.2]`)
+		row := map[string]interface{}{"indices": []uint32{3}, "values": []float32{-0.2}}
 		_, err := CreateSparseFloatRowFromJSON(row)
 		assert.Error(t, err)
 	})

 	t.Run("valid dict row 1", func(t *testing.T) {
-		row := []byte(`{"1": 1.0, "3": 2.0, "5": 3.0}`)
+		row := map[string]interface{}{"1": 1.0, "3": 2.0, "5": 3.0}
 		res, err := CreateSparseFloatRowFromJSON(row)
 		assert.NoError(t, err)
 		assert.Equal(t, CreateSparseFloatRow([]uint32{1, 3, 5}, []float32{1.0, 2.0, 3.0}), res)
 	})

 	t.Run("valid dict row 2", func(t *testing.T) {
-		row := []byte(`{"3": 1.0, "1": 2.0, "5": 3.0}`)
+		row := map[string]interface{}{"3": 1.0, "1": 2.0, "5": 3.0}
 		res, err := CreateSparseFloatRowFromJSON(row)
 		assert.NoError(t, err)
 		assert.Equal(t, CreateSparseFloatRow([]uint32{1, 3, 5}, []float32{2.0, 1.0, 3.0}), res)
 	})

 	t.Run("invalid dict row 1", func(t *testing.T) {
-		row := []byte(`{"a": 1.0, "3": 2.0, "5": 3.0}`)
+		row := map[string]interface{}{"a": 1.0, "3": 2.0, "5": 3.0}
 		_, err := CreateSparseFloatRowFromJSON(row)
 		assert.Error(t, err)
 	})

 	t.Run("invalid dict row 2", func(t *testing.T) {
-		row := []byte(`{"1": "a", "3": 2.0, "5": 3.0}`)
+		row := map[string]interface{}{"1": "a", "3": 2.0, "5": 3.0}
 		_, err := CreateSparseFloatRowFromJSON(row)
 		assert.Error(t, err)
 	})

 	t.Run("invalid dict row 3", func(t *testing.T) {
-		row := []byte(`{"1": "1.0", "3": 2.0, "5": 3.0}`)
+		row := map[string]interface{}{"1": "1.0", "3": 2.0, "5": 3.0}
 		_, err := CreateSparseFloatRowFromJSON(row)
 		assert.Error(t, err)
 	})

 	t.Run("invalid dict row 4", func(t *testing.T) {
-		row := []byte(`{"1": 1.0, "3": 2.0, "5": }`)
+		row := map[string]interface{}{"-1": 1.0, "3": 2.0, "5": 3.0}
 		_, err := CreateSparseFloatRowFromJSON(row)
 		assert.Error(t, err)
 	})

 	t.Run("invalid dict row 5", func(t *testing.T) {
-		row := []byte(`{"-1": 1.0, "3": 2.0, "5": 3.0}`)
+		row := map[string]interface{}{"1": -1.0, "3": 2.0, "5": 3.0}
 		_, err := CreateSparseFloatRowFromJSON(row)
 		assert.Error(t, err)
 	})

 	t.Run("invalid dict row 6", func(t *testing.T) {
-		row := []byte(`{"1": -1.0, "3": 2.0, "5": 3.0}`)
-		_, err := CreateSparseFloatRowFromJSON(row)
-		assert.Error(t, err)
-	})
-
-	t.Run("invalid dict row 7", func(t *testing.T) {
-		row := []byte(`{}`)
+		row := map[string]interface{}{}
 		_, err := CreateSparseFloatRowFromJSON(row)
 		assert.Error(t, err)
 	})
--- a/tests/integration/import/import_test.go
+++ b/tests/integration/import/import_test.go
@ -81,11 +81,17 @@ func (s *BulkInsertSuite) run() {

 	collectionName := "TestBulkInsert" + funcutil.GenRandomStr()

-	schema := integration.ConstructSchema(collectionName, dim, s.autoID,
-		&schemapb.FieldSchema{FieldID: 100, Name: "id", DataType: s.pkType, TypeParams: []*commonpb.KeyValuePair{{Key: common.MaxLengthKey, Value: "128"}}, IsPrimaryKey: true, AutoID: s.autoID},
-		&schemapb.FieldSchema{FieldID: 101, Name: "image_path", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: common.MaxLengthKey, Value: "65535"}}},
-		&schemapb.FieldSchema{FieldID: 102, Name: "embeddings", DataType: s.vecType, TypeParams: []*commonpb.KeyValuePair{{Key: common.DimKey, Value: "128"}}},
-	)
+	var schema *schemapb.CollectionSchema
+	fieldSchema1 := &schemapb.FieldSchema{FieldID: 100, Name: "id", DataType: s.pkType, TypeParams: []*commonpb.KeyValuePair{{Key: common.MaxLengthKey, Value: "128"}}, IsPrimaryKey: true, AutoID: s.autoID}
+	fieldSchema2 := &schemapb.FieldSchema{FieldID: 101, Name: "image_path", DataType: schemapb.DataType_VarChar, TypeParams: []*commonpb.KeyValuePair{{Key: common.MaxLengthKey, Value: "65535"}}}
+	fieldSchema3 := &schemapb.FieldSchema{FieldID: 102, Name: "embeddings", DataType: s.vecType, TypeParams: []*commonpb.KeyValuePair{{Key: common.DimKey, Value: "128"}}}
+	fieldSchema4 := &schemapb.FieldSchema{FieldID: 103, Name: "embeddings", DataType: s.vecType, TypeParams: []*commonpb.KeyValuePair{}}
+	if s.vecType != schemapb.DataType_SparseFloatVector {
+		schema = integration.ConstructSchema(collectionName, dim, s.autoID, fieldSchema1, fieldSchema2, fieldSchema3)
+	} else {
+		schema = integration.ConstructSchema(collectionName, dim, s.autoID, fieldSchema1, fieldSchema2, fieldSchema4)
+	}
+
 	marshaledSchema, err := proto.Marshal(schema)
 	s.NoError(err)

@ -214,10 +220,13 @@ func (s *BulkInsertSuite) TestMultiFileTypes() {
 		s.metricType = metric.L2
 		s.run()

-		// s.vecType = schemapb.DataType_SparseFloatVector
-		// s.indexType = indexparamcheck.IndexSparseWand
-		// s.metricType = metric.IP
-		// s.run()
+		// TODO: Numpy files are not yet supported for SparseFloatVector
+		if fileType != importutilv2.Numpy {
+			s.vecType = schemapb.DataType_SparseFloatVector
+			s.indexType = indexparamcheck.IndexSparseWand
+			s.metricType = metric.IP
+			s.run()
+		}
 	}
 }

--- a/tests/integration/import/util_test.go
+++ b/tests/integration/import/util_test.go
@ -37,8 +37,10 @@ import (
 	"github.com/milvus-io/milvus/internal/storage"
 	pq "github.com/milvus-io/milvus/internal/util/importutilv2/parquet"
 	"github.com/milvus-io/milvus/internal/util/testutil"
+	"github.com/milvus-io/milvus/pkg/common"
 	"github.com/milvus-io/milvus/pkg/log"
 	"github.com/milvus-io/milvus/pkg/util/merr"
+	"github.com/milvus-io/milvus/pkg/util/typeutil"
 	"github.com/milvus-io/milvus/tests/integration"
 )

@ -217,13 +219,29 @@ func GenerateJSONFile(t *testing.T, filePath string, schema *schemapb.Collection
 				data[fieldID] = v.GetRow(i).(*schemapb.ScalarField).GetIntData().GetData()
 			case schemapb.DataType_JSON:
 				data[fieldID] = string(v.GetRow(i).([]byte))
-			case schemapb.DataType_BinaryVector, schemapb.DataType_Float16Vector, schemapb.DataType_BFloat16Vector, schemapb.DataType_SparseFloatVector:
+			case schemapb.DataType_BinaryVector:
 				bytes := v.GetRow(i).([]byte)
 				ints := make([]int, 0, len(bytes))
 				for _, b := range bytes {
 					ints = append(ints, int(b))
 				}
 				data[fieldID] = ints
+			case schemapb.DataType_Float16Vector:
+				bytes := v.GetRow(i).([]byte)
+				data[fieldID] = typeutil.Float16BytesToFloat32Vector(bytes)
+			case schemapb.DataType_BFloat16Vector:
+				bytes := v.GetRow(i).([]byte)
+				data[fieldID] = typeutil.BFloat16BytesToFloat32Vector(bytes)
+			case schemapb.DataType_SparseFloatVector:
+				bytes := v.GetRow(i).([]byte)
+				elemCount := len(bytes) / 8
+				values := make(map[uint32]float32)
+				for j := 0; j < elemCount; j++ {
+					idx := common.Endian.Uint32(bytes[j*8:])
+					f := typeutil.BytesToFloat32(bytes[j*8+4:])
+					values[idx] = f
+				}
+				data[fieldID] = values
 			default:
 				data[fieldID] = v.GetRow(i)
 			}
--- a/tests/python_client/common/bulk_insert_data.py
+++ b/tests/python_client/common/bulk_insert_data.py
@ -93,7 +93,7 @@ def gen_binary_vectors(nb, dim):
    return vectors


-def gen_fp16_vectors(num, dim):
+def gen_fp16_vectors(num, dim, for_json=False):
    """
    generate float16 vector data
    raw_vectors : the vectors
@ -105,13 +105,16 @@ def gen_fp16_vectors(num, dim):
    for _ in range(num):
        raw_vector = [random.random() for _ in range(dim)]
        raw_vectors.append(raw_vector)
-        fp16_vector = np.array(raw_vector, dtype=np.float16).view(np.uint8).tolist()
+        if for_json:
+            fp16_vector = np.array(raw_vector, dtype=np.float16).tolist()
+        else:
+            fp16_vector = np.array(raw_vector, dtype=np.float16).view(np.uint8).tolist()
        fp16_vectors.append(fp16_vector)

    return raw_vectors, fp16_vectors


-def gen_bf16_vectors(num, dim):
+def gen_bf16_vectors(num, dim, for_json=False):
    """
    generate brain float16 vector data
    raw_vectors : the vectors
@ -123,7 +126,10 @@ def gen_bf16_vectors(num, dim):
    for _ in range(num):
        raw_vector = [random.random() for _ in range(dim)]
        raw_vectors.append(raw_vector)
-        bf16_vector = np.array(jnp.array(raw_vector, dtype=jnp.bfloat16)).view(np.uint8).tolist()
+        if for_json:
+            bf16_vector = np.array(jnp.array(raw_vector, dtype=jnp.bfloat16)).tolist()
+        else:
+            bf16_vector = np.array(jnp.array(raw_vector, dtype=jnp.bfloat16)).view(np.uint8).tolist()
        bf16_vectors.append(bf16_vector)

    return raw_vectors, bf16_vectors
@ -603,9 +609,9 @@ def gen_dict_data_by_data_field(data_fields, rows, start=0, float_vector=True, d
                    float_vector = False
                    d[data_field] = gen_vectors(float_vector=float_vector, rows=1, dim=dim)[0]
                if "bf16" in data_field:
-                    d[data_field] = gen_bf16_vectors(1, dim)[1][0]
+                    d[data_field] = gen_bf16_vectors(1, dim, True)[1][0]
                if "fp16" in data_field:
-                    d[data_field] = gen_fp16_vectors(1, dim)[1][0]
+                    d[data_field] = gen_fp16_vectors(1, dim, True)[1][0]
            elif data_field == DataField.float_field:
                d[data_field] = random.random()
            elif data_field == DataField.double_field: