enhance: Add get_vector unittest for float16 & bfloat16 (#32153)

Issue: #22837

Signed-off-by: Cai Yudong <yudong.cai@zilliz.com>
pull/32536/head
Cai Yudong 2024-04-23 16:15:23 +08:00 committed by GitHub
parent a5f0fc4373
commit 16b8b7b35d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 107 additions and 250 deletions

View File

@ -13,13 +13,11 @@ import "C"
import (
"bytes"
"context"
"encoding/binary"
"fmt"
"io"
"strconv"
"github.com/golang/protobuf/proto"
"go.uber.org/zap"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
@ -29,7 +27,6 @@ import (
"github.com/milvus-io/milvus/pkg/common"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/mq/msgstream"
"github.com/milvus-io/milvus/pkg/util/funcutil"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
@ -160,176 +157,6 @@ func getPKsFromColumnBasedInsertMsg(msg *msgstream.InsertMsg, schema *schemapb.C
return pks, nil
}
// fillBinVecFieldData loads one binary vector from the file at dataPath and
// stores it in the i-th row slot of fieldData's binary-vector buffer.
//
// A row is dim/8 bytes wide; the row to load starts at byte offset*(dim/8) of
// the file. The endian argument is not used by this function. An error from
// the chunk manager read is returned unchanged.
func fillBinVecFieldData(ctx context.Context, vcm storage.ChunkManager, dataPath string, fieldData *schemapb.FieldData, i int, offset int64, endian binary.ByteOrder) error {
	vectors := fieldData.GetVectors()
	bytesPerRow := vectors.GetDim() / 8

	raw, err := vcm.ReadAt(ctx, dataPath, offset*bytesPerRow, bytesPerRow)
	if err != nil {
		return err
	}

	dst := vectors.GetData().(*schemapb.VectorField_BinaryVector).BinaryVector
	width := int(bytesPerRow)
	copy(dst[i*width:(i+1)*width], raw)
	return nil
}
// fillFloatVecFieldData loads one float32 vector from the file at dataPath and
// stores it in the i-th row slot of fieldData's float-vector buffer.
//
// A row is dim*4 bytes wide; the row to load starts at byte offset*(dim*4) of
// the file. The raw bytes are decoded into float32 values using endian. An
// error from the read or from decoding is returned unchanged.
func fillFloatVecFieldData(ctx context.Context, vcm storage.ChunkManager, dataPath string, fieldData *schemapb.FieldData, i int, offset int64, endian binary.ByteOrder) error {
	vectors := fieldData.GetVectors()
	dim := vectors.GetDim()
	bytesPerRow := dim * 4

	raw, err := vcm.ReadAt(ctx, dataPath, offset*bytesPerRow, bytesPerRow)
	if err != nil {
		return err
	}

	dst := vectors.GetData().(*schemapb.VectorField_FloatVector)

	// Decode into a scratch row first so the destination is only touched
	// after the whole row has been decoded successfully.
	row := make([]float32, dim)
	if err := binary.Read(bytes.NewReader(raw), endian, &row); err != nil {
		return err
	}

	width := int(dim)
	copy(dst.FloatVector.Data[i*width:(i+1)*width], row)
	return nil
}
// fillSparseFloatVecFieldData is a stub: it ignores all of its arguments,
// performs no read, and always returns a "not implemented" error.
func fillSparseFloatVecFieldData(ctx context.Context, vcm storage.ChunkManager, dataPath string, fieldData *schemapb.FieldData, i int, offset int64, endian binary.ByteOrder) error {
return fmt.Errorf("fillSparseFloatVecFieldData not implemented")
}
// fillBoolFieldData reads the whole file at dataPath, decodes it as a
// schemapb.BoolArray, and stores element `offset` of that array at index i of
// fieldData's bool data.
//
// It returns the read or unmarshal error unchanged, and returns an error
// (instead of panicking) when offset is outside the decoded array or i is
// outside the destination slice. The endian argument is not used.
func fillBoolFieldData(ctx context.Context, vcm storage.ChunkManager, dataPath string, fieldData *schemapb.FieldData, i int, offset int64, endian binary.ByteOrder) error {
	// read whole file.
	// TODO: optimize here.
	content, err := vcm.Read(ctx, dataPath)
	if err != nil {
		return err
	}

	var arr schemapb.BoolArray
	if err := proto.Unmarshal(content, &arr); err != nil {
		return err
	}

	// Validate both indexes up front; an out-of-range index would otherwise
	// crash the process rather than fail this request.
	if offset < 0 || offset >= int64(len(arr.Data)) {
		return fmt.Errorf("bool row offset %d out of range [0, %d) in %s", offset, len(arr.Data), dataPath)
	}
	dst := fieldData.GetScalars().GetBoolData().GetData()
	if i < 0 || i >= len(dst) {
		return fmt.Errorf("bool result index %d out of range [0, %d)", i, len(dst))
	}

	dst[i] = arr.Data[offset]
	return nil
}
// fillStringFieldData reads the whole file at dataPath, decodes it as a
// schemapb.StringArray, and stores element `offset` of that array at index i
// of fieldData's string data.
//
// It returns the read or unmarshal error unchanged, and returns an error
// (instead of panicking) when offset is outside the decoded array or i is
// outside the destination slice. The endian argument is not used.
func fillStringFieldData(ctx context.Context, vcm storage.ChunkManager, dataPath string, fieldData *schemapb.FieldData, i int, offset int64, endian binary.ByteOrder) error {
	// read whole file.
	// TODO: optimize here.
	content, err := vcm.Read(ctx, dataPath)
	if err != nil {
		return err
	}

	var arr schemapb.StringArray
	if err := proto.Unmarshal(content, &arr); err != nil {
		return err
	}

	// Validate both indexes up front; an out-of-range index would otherwise
	// crash the process rather than fail this request.
	if offset < 0 || offset >= int64(len(arr.Data)) {
		return fmt.Errorf("string row offset %d out of range [0, %d) in %s", offset, len(arr.Data), dataPath)
	}
	dst := fieldData.GetScalars().GetStringData().GetData()
	if i < 0 || i >= len(dst) {
		return fmt.Errorf("string result index %d out of range [0, %d)", i, len(dst))
	}

	dst[i] = arr.Data[offset]
	return nil
}
// fillInt8FieldData reads the single byte at position `offset` of the file at
// dataPath, decodes it as an int8, and stores it (widened to int32) at index i
// of fieldData's int data.
//
// An error from the read or from decoding is returned unchanged.
func fillInt8FieldData(ctx context.Context, vcm storage.ChunkManager, dataPath string, fieldData *schemapb.FieldData, i int, offset int64, endian binary.ByteOrder) error {
	const valueSize int64 = 1

	// read by offset.
	raw, err := vcm.ReadAt(ctx, dataPath, offset*valueSize, valueSize)
	if err != nil {
		return err
	}

	var value int8
	if err = funcutil.ReadBinary(endian, raw, &value); err != nil {
		return err
	}

	fieldData.GetScalars().GetIntData().GetData()[i] = int32(value)
	return nil
}
// fillInt16FieldData reads the two bytes starting at byte offset*2 of the file
// at dataPath, decodes them as an int16 using endian, and stores the value
// (widened to int32) at index i of fieldData's int data.
//
// An error from the read or from decoding is returned unchanged.
func fillInt16FieldData(ctx context.Context, vcm storage.ChunkManager, dataPath string, fieldData *schemapb.FieldData, i int, offset int64, endian binary.ByteOrder) error {
	const valueSize int64 = 2

	// read by offset.
	raw, err := vcm.ReadAt(ctx, dataPath, offset*valueSize, valueSize)
	if err != nil {
		return err
	}

	var value int16
	if err = funcutil.ReadBinary(endian, raw, &value); err != nil {
		return err
	}

	fieldData.GetScalars().GetIntData().GetData()[i] = int32(value)
	return nil
}
// fillInt32FieldData reads the four bytes starting at byte offset*4 of the
// file at dataPath and decodes them, using endian, directly into element i of
// fieldData's int data.
//
// An error from the read or from decoding is returned unchanged.
func fillInt32FieldData(ctx context.Context, vcm storage.ChunkManager, dataPath string, fieldData *schemapb.FieldData, i int, offset int64, endian binary.ByteOrder) error {
	const valueSize int64 = 4

	// read by offset.
	raw, err := vcm.ReadAt(ctx, dataPath, offset*valueSize, valueSize)
	if err != nil {
		return err
	}

	// Decode in place: the target is the i-th slot of the result slice.
	slot := &fieldData.GetScalars().GetIntData().GetData()[i]
	return funcutil.ReadBinary(endian, raw, slot)
}
// fillInt64FieldData reads the eight bytes starting at byte offset*8 of the
// file at dataPath and decodes them, using endian, directly into element i of
// fieldData's long data.
//
// An error from the read or from decoding is returned unchanged.
func fillInt64FieldData(ctx context.Context, vcm storage.ChunkManager, dataPath string, fieldData *schemapb.FieldData, i int, offset int64, endian binary.ByteOrder) error {
	const valueSize int64 = 8

	// read by offset.
	raw, err := vcm.ReadAt(ctx, dataPath, offset*valueSize, valueSize)
	if err != nil {
		return err
	}

	// Decode in place: the target is the i-th slot of the result slice.
	slot := &fieldData.GetScalars().GetLongData().GetData()[i]
	return funcutil.ReadBinary(endian, raw, slot)
}
// fillFloatFieldData reads the four bytes starting at byte offset*4 of the
// file at dataPath and decodes them, using endian, directly into element i of
// fieldData's float data.
//
// An error from the read or from decoding is returned unchanged.
func fillFloatFieldData(ctx context.Context, vcm storage.ChunkManager, dataPath string, fieldData *schemapb.FieldData, i int, offset int64, endian binary.ByteOrder) error {
	const valueSize int64 = 4

	// read by offset.
	raw, err := vcm.ReadAt(ctx, dataPath, offset*valueSize, valueSize)
	if err != nil {
		return err
	}

	// Decode in place: the target is the i-th slot of the result slice.
	slot := &fieldData.GetScalars().GetFloatData().GetData()[i]
	return funcutil.ReadBinary(endian, raw, slot)
}
// fillDoubleFieldData reads the eight bytes starting at byte offset*8 of the
// file at dataPath and decodes them, using endian, directly into element i of
// fieldData's double data.
//
// An error from the read or from decoding is returned unchanged.
func fillDoubleFieldData(ctx context.Context, vcm storage.ChunkManager, dataPath string, fieldData *schemapb.FieldData, i int, offset int64, endian binary.ByteOrder) error {
	const valueSize int64 = 8

	// read by offset.
	raw, err := vcm.ReadAt(ctx, dataPath, offset*valueSize, valueSize)
	if err != nil {
		return err
	}

	// Decode in place: the target is the i-th slot of the result slice.
	slot := &fieldData.GetScalars().GetDoubleData().GetData()[i]
	return funcutil.ReadBinary(endian, raw, slot)
}
// fillFieldData fills slot i of fieldData with the value stored at row
// `offset` of the file at dataPath, dispatching on fieldData.Type to the
// type-specific fill helper.
//
// It returns whatever error the selected helper returns, and an error for
// data types that have no usable helper. Float16Vector and BFloat16Vector are
// reported as not implemented: the float32 helper asserts the vector payload
// to *schemapb.VectorField_FloatVector without a check, which panics for the
// byte-backed half-precision payloads.
func fillFieldData(ctx context.Context, vcm storage.ChunkManager, dataPath string, fieldData *schemapb.FieldData, i int, offset int64, endian binary.ByteOrder) error {
	switch fieldData.Type {
	case schemapb.DataType_BinaryVector:
		return fillBinVecFieldData(ctx, vcm, dataPath, fieldData, i, offset, endian)
	case schemapb.DataType_FloatVector:
		return fillFloatVecFieldData(ctx, vcm, dataPath, fieldData, i, offset, endian)
	case schemapb.DataType_Float16Vector, schemapb.DataType_BFloat16Vector:
		// Fail explicitly instead of routing to fillFloatVecFieldData, which
		// would panic on the type assertion for these payloads.
		return fmt.Errorf("fillFieldData not implemented for data type: %s", fieldData.Type.String())
	case schemapb.DataType_SparseFloatVector:
		return fillSparseFloatVecFieldData(ctx, vcm, dataPath, fieldData, i, offset, endian)
	case schemapb.DataType_Bool:
		return fillBoolFieldData(ctx, vcm, dataPath, fieldData, i, offset, endian)
	case schemapb.DataType_String, schemapb.DataType_VarChar:
		return fillStringFieldData(ctx, vcm, dataPath, fieldData, i, offset, endian)
	case schemapb.DataType_Int8:
		return fillInt8FieldData(ctx, vcm, dataPath, fieldData, i, offset, endian)
	case schemapb.DataType_Int16:
		return fillInt16FieldData(ctx, vcm, dataPath, fieldData, i, offset, endian)
	case schemapb.DataType_Int32:
		return fillInt32FieldData(ctx, vcm, dataPath, fieldData, i, offset, endian)
	case schemapb.DataType_Int64:
		return fillInt64FieldData(ctx, vcm, dataPath, fieldData, i, offset, endian)
	case schemapb.DataType_Float:
		return fillFloatFieldData(ctx, vcm, dataPath, fieldData, i, offset, endian)
	case schemapb.DataType_Double:
		return fillDoubleFieldData(ctx, vcm, dataPath, fieldData, i, offset, endian)
	default:
		return fmt.Errorf("invalid data type: %s", fieldData.Type.String())
	}
}
// mergeRequestCost merges the costs of a request. The costs may come from different workers in the same channel
// or from different channels in the same collection; for now we just choose the one with the highest response time.
func mergeRequestCost(requestCosts []*internalpb.CostAggregation) *internalpb.CostAggregation {

View File

@ -128,8 +128,8 @@ func (s *TestGetVectorSuite) run() {
vecFieldData = integration.NewFloatVectorFieldData(vecFieldName, NB, dim)
} else if s.vecType == schemapb.DataType_Float16Vector {
vecFieldData = integration.NewFloat16VectorFieldData(vecFieldName, NB, dim)
// } else if s.vecType == schemapb.DataType_BFloat16Vector {
// vecFieldData = integration.NewBFloat16VectorFieldData(vecFieldName, NB, dim)
} else if s.vecType == schemapb.DataType_BFloat16Vector {
vecFieldData = integration.NewBFloat16VectorFieldData(vecFieldName, NB, dim)
} else if typeutil.IsSparseFloatVectorType(s.vecType) {
vecFieldData = integration.NewSparseFloatVectorFieldData(vecFieldName, NB)
} else {
@ -240,25 +240,45 @@ func (s *TestGetVectorSuite) run() {
}
}
} else if s.vecType == schemapb.DataType_Float16Vector {
// s.Require().Len(result.GetFieldsData()[vecFieldIndex].GetVectors().GetFloat16Vector(), nq*topk*dim*2)
// rawData := vecFieldData.GetVectors().GetFloat16Vector()
// resData := result.GetFieldsData()[vecFieldIndex].GetVectors().GetFloat16Vector()
// if s.pkType == schemapb.DataType_Int64 {
// for i, id := range result.GetIds().GetIntId().GetData() {
// expect := rawData[int(id)*dim : (int(id)+1)*dim]
// actual := resData[i*dim : (i+1)*dim]
// s.Require().ElementsMatch(expect, actual)
// }
// } else {
// for i, idStr := range result.GetIds().GetStrId().GetData() {
// id, err := strconv.Atoi(idStr)
// s.Require().NoError(err)
// expect := rawData[id*dim : (id+1)*dim]
// actual := resData[i*dim : (i+1)*dim]
// s.Require().ElementsMatch(expect, actual)
// }
// }
s.Require().Len(result.GetFieldsData()[vecFieldIndex].GetVectors().GetFloat16Vector(), nq*topk*dim*2)
rawData := vecFieldData.GetVectors().GetFloat16Vector()
resData := result.GetFieldsData()[vecFieldIndex].GetVectors().GetFloat16Vector()
rowBytes := dim * 2
if s.pkType == schemapb.DataType_Int64 {
for i, id := range result.GetIds().GetIntId().GetData() {
expect := rawData[int(id)*rowBytes : (int(id)+1)*rowBytes]
actual := resData[i*rowBytes : (i+1)*rowBytes]
s.Require().ElementsMatch(expect, actual)
}
} else {
for i, idStr := range result.GetIds().GetStrId().GetData() {
id, err := strconv.Atoi(idStr)
s.Require().NoError(err)
expect := rawData[id*rowBytes : (id+1)*rowBytes]
actual := resData[i*rowBytes : (i+1)*rowBytes]
s.Require().ElementsMatch(expect, actual)
}
}
} else if s.vecType == schemapb.DataType_BFloat16Vector {
s.Require().Len(result.GetFieldsData()[vecFieldIndex].GetVectors().GetBfloat16Vector(), nq*topk*dim*2)
rawData := vecFieldData.GetVectors().GetBfloat16Vector()
resData := result.GetFieldsData()[vecFieldIndex].GetVectors().GetBfloat16Vector()
rowBytes := dim * 2
if s.pkType == schemapb.DataType_Int64 {
for i, id := range result.GetIds().GetIntId().GetData() {
expect := rawData[int(id)*rowBytes : (int(id)+1)*rowBytes]
actual := resData[i*rowBytes : (i+1)*rowBytes]
s.Require().ElementsMatch(expect, actual)
}
} else {
for i, idStr := range result.GetIds().GetStrId().GetData() {
id, err := strconv.Atoi(idStr)
s.Require().NoError(err)
expect := rawData[id*rowBytes : (id+1)*rowBytes]
actual := resData[i*rowBytes : (i+1)*rowBytes]
s.Require().ElementsMatch(expect, actual)
}
}
} else if s.vecType == schemapb.DataType_SparseFloatVector {
s.Require().Len(result.GetFieldsData()[vecFieldIndex].GetVectors().GetSparseFloatVector().GetContents(), nq*topk)
rawData := vecFieldData.GetVectors().GetSparseFloatVector().GetContents()
@ -278,23 +298,22 @@ func (s *TestGetVectorSuite) run() {
s.Require().Len(result.GetFieldsData()[vecFieldIndex].GetVectors().GetBinaryVector(), nq*topk*dim/8)
rawData := vecFieldData.GetVectors().GetBinaryVector()
resData := result.GetFieldsData()[vecFieldIndex].GetVectors().GetBinaryVector()
rowBytes := dim / 8
if s.pkType == schemapb.DataType_Int64 {
for i, id := range result.GetIds().GetIntId().GetData() {
dataBytes := dim / 8
for j := 0; j < dataBytes; j++ {
expect := rawData[int(id)*dataBytes+j]
actual := resData[i*dataBytes+j]
for j := 0; j < rowBytes; j++ {
expect := rawData[int(id)*rowBytes+j]
actual := resData[i*rowBytes+j]
s.Require().Equal(expect, actual)
}
}
} else {
for i, idStr := range result.GetIds().GetStrId().GetData() {
dataBytes := dim / 8
id, err := strconv.Atoi(idStr)
s.Require().NoError(err)
for j := 0; j < dataBytes; j++ {
expect := rawData[id*dataBytes+j]
actual := resData[i*dataBytes+j]
for j := 0; j < rowBytes; j++ {
expect := rawData[id*rowBytes+j]
actual := resData[i*rowBytes+j]
s.Require().Equal(expect, actual)
}
}
@ -319,26 +338,6 @@ func (s *TestGetVectorSuite) TestGetVector_FLAT() {
s.run()
}
// TestGetVector_Float16Vector configures the suite for a Float16Vector field
// with an Int64 primary key, the IndexFaissIDMap index type and the L2 metric
// (nq=10, topK=10), then hands off to the shared s.run() scenario.
func (s *TestGetVectorSuite) TestGetVector_Float16Vector() {
s.nq = 10
s.topK = 10
s.indexType = integration.IndexFaissIDMap
s.metricType = metric.L2
s.pkType = schemapb.DataType_Int64
s.vecType = schemapb.DataType_Float16Vector
s.run()
}
// func (s *TestGetVectorSuite) TestGetVector_BFloat16Vector() {
// s.nq = 10
// s.topK = 10
// s.indexType = integration.IndexFaissIDMap
// s.metricType = metric.L2
// s.pkType = schemapb.DataType_Int64
// s.vecType = schemapb.DataType_BFloat16Vector
// s.run()
// }
func (s *TestGetVectorSuite) TestGetVector_IVF_FLAT() {
s.nq = 10
s.topK = 10
@ -429,6 +428,26 @@ func (s *TestGetVectorSuite) TestGetVector_BinaryVector() {
s.run()
}
// TestGetVector_Float16Vector configures the suite for a Float16Vector field
// with an Int64 primary key, the IndexHNSW index type and the L2 metric
// (nq=10, topK=10), then hands off to the shared s.run() scenario.
func (s *TestGetVectorSuite) TestGetVector_Float16Vector() {
s.nq = 10
s.topK = 10
s.indexType = integration.IndexHNSW
s.metricType = metric.L2
s.pkType = schemapb.DataType_Int64
s.vecType = schemapb.DataType_Float16Vector
s.run()
}
// TestGetVector_BFloat16Vector configures the suite for a BFloat16Vector field
// with an Int64 primary key, the IndexHNSW index type and the L2 metric
// (nq=10, topK=10), then hands off to the shared s.run() scenario.
func (s *TestGetVectorSuite) TestGetVector_BFloat16Vector() {
s.nq = 10
s.topK = 10
s.indexType = integration.IndexHNSW
s.metricType = metric.L2
s.pkType = schemapb.DataType_Int64
s.vecType = schemapb.DataType_BFloat16Vector
s.run()
}
func (s *TestGetVectorSuite) TestGetVector_Big_NQ_TOPK() {
s.T().Skip("skip big NQ Top due to timeout")
s.nq = 10000

View File

@ -18,10 +18,14 @@ package integration
import (
"context"
"encoding/binary"
"fmt"
"math"
"math/rand"
"time"
"github.com/x448/float16"
"github.com/milvus-io/milvus-proto/go-api/v2/milvuspb"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/pkg/util/testutils"
@ -163,20 +167,20 @@ func NewFloat16VectorFieldData(fieldName string, numRows, dim int) *schemapb.Fie
}
}
// func NewBFloat16VectorFieldData(fieldName string, numRows, dim int) *schemapb.FieldData {
// return &schemapb.FieldData{
// Type: schemapb.DataType_BFloat16Vector,
// FieldName: fieldName,
// Field: &schemapb.FieldData_Vectors{
// Vectors: &schemapb.VectorField{
// Dim: int64(dim),
// Data: &schemapb.VectorField_Bfloat16Vector{
// Bfloat16Vector: GenerateBFloat16Vectors(numRows, dim),
// },
// },
// },
// }
// }
// NewBFloat16VectorFieldData builds a FieldData of type BFloat16Vector named
// fieldName. Its vector payload has dimension dim and is filled with the
// bytes produced by GenerateBFloat16Vectors(numRows, dim).
func NewBFloat16VectorFieldData(fieldName string, numRows, dim int) *schemapb.FieldData {
	payload := &schemapb.VectorField_Bfloat16Vector{
		Bfloat16Vector: GenerateBFloat16Vectors(numRows, dim),
	}
	vectors := &schemapb.VectorField{
		Dim:  int64(dim),
		Data: payload,
	}
	return &schemapb.FieldData{
		Type:      schemapb.DataType_BFloat16Vector,
		FieldName: fieldName,
		Field:     &schemapb.FieldData_Vectors{Vectors: vectors},
	}
}
func NewBinaryVectorFieldData(fieldName string, numRows, dim int) *schemapb.FieldData {
return &schemapb.FieldData{
@ -261,11 +265,28 @@ func GenerateBinaryVectors(numRows, dim int) []byte {
}
func GenerateFloat16Vectors(numRows, dim int) []byte {
total := numRows * dim * 2
ret := make([]byte, total)
_, err := rand.Read(ret)
if err != nil {
panic(err)
total := numRows * dim
ret := make([]byte, total*2)
for i := 0; i < total; i++ {
v := float16.Fromfloat32(rand.Float32()).Bits()
binary.LittleEndian.PutUint16(ret[i*2:], v)
}
return ret
}
// GenerateBFloat16Vectors returns numRows*dim random bfloat16 values encoded
// as little-endian 2-byte words, i.e. a slice of numRows*dim*2 bytes.
//
// Each value is derived from rand.Float32() by keeping the upper 16 bits of
// its IEEE-754 float32 bit pattern (truncation, no rounding) with the sign bit
// masked off.
func GenerateBFloat16Vectors(numRows, dim int) []byte {
	const wordSize = 2

	count := numRows * dim
	out := make([]byte, count*wordSize)
	for idx := 0; idx < count; idx++ {
		// The high half of a float32 is its bfloat16 truncation.
		word := uint16(math.Float32bits(rand.Float32())>>16) & 0x7FFF
		binary.LittleEndian.PutUint16(out[idx*wordSize:], word)
	}
	return out
}
@ -274,16 +295,6 @@ func GenerateSparseFloatArray(numRows int) *schemapb.SparseFloatArray {
return testutils.GenerateSparseFloatVectors(numRows)
}
// func GenerateBFloat16Vectors(numRows, dim int) []byte {
// total := numRows * dim * 2
// ret := make([]byte, total)
// _, err := rand.Read(ret)
// if err != nil {
// panic(err)
// }
// return ret
// }
func GenerateHashKeys(numRows int) []uint32 {
ret := make([]uint32, 0, numRows)
for i := 0; i < numRows; i++ {