enhance: Add get_vector unittest for float16 & bfloat16 ()

Issue: 

Signed-off-by: Cai Yudong <yudong.cai@zilliz.com>
pull/32536/head
Cai Yudong 2024-04-23 16:15:23 +08:00 committed by GitHub
parent a5f0fc4373
commit 16b8b7b35d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 107 additions and 250 deletions
internal/querynodev2/segments
tests/integration

View File

@ -13,13 +13,11 @@ import "C"
import ( import (
"bytes" "bytes"
"context"
"encoding/binary" "encoding/binary"
"fmt" "fmt"
"io" "io"
"strconv" "strconv"
"github.com/golang/protobuf/proto"
"go.uber.org/zap" "go.uber.org/zap"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb" "github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
@ -29,7 +27,6 @@ import (
"github.com/milvus-io/milvus/pkg/common" "github.com/milvus-io/milvus/pkg/common"
"github.com/milvus-io/milvus/pkg/log" "github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/mq/msgstream" "github.com/milvus-io/milvus/pkg/mq/msgstream"
"github.com/milvus-io/milvus/pkg/util/funcutil"
"github.com/milvus-io/milvus/pkg/util/typeutil" "github.com/milvus-io/milvus/pkg/util/typeutil"
) )
@ -160,176 +157,6 @@ func getPKsFromColumnBasedInsertMsg(msg *msgstream.InsertMsg, schema *schemapb.C
return pks, nil return pks, nil
} }
func fillBinVecFieldData(ctx context.Context, vcm storage.ChunkManager, dataPath string, fieldData *schemapb.FieldData, i int, offset int64, endian binary.ByteOrder) error {
dim := fieldData.GetVectors().GetDim()
rowBytes := dim / 8
content, err := vcm.ReadAt(ctx, dataPath, offset*rowBytes, rowBytes)
if err != nil {
return err
}
x := fieldData.GetVectors().GetData().(*schemapb.VectorField_BinaryVector)
resultLen := dim / 8
copy(x.BinaryVector[i*int(resultLen):(i+1)*int(resultLen)], content)
return nil
}
func fillFloatVecFieldData(ctx context.Context, vcm storage.ChunkManager, dataPath string, fieldData *schemapb.FieldData, i int, offset int64, endian binary.ByteOrder) error {
dim := fieldData.GetVectors().GetDim()
rowBytes := dim * 4
content, err := vcm.ReadAt(ctx, dataPath, offset*rowBytes, rowBytes)
if err != nil {
return err
}
x := fieldData.GetVectors().GetData().(*schemapb.VectorField_FloatVector)
floatResult := make([]float32, dim)
buf := bytes.NewReader(content)
if err = binary.Read(buf, endian, &floatResult); err != nil {
return err
}
resultLen := dim
copy(x.FloatVector.Data[i*int(resultLen):(i+1)*int(resultLen)], floatResult)
return nil
}
func fillSparseFloatVecFieldData(ctx context.Context, vcm storage.ChunkManager, dataPath string, fieldData *schemapb.FieldData, i int, offset int64, endian binary.ByteOrder) error {
return fmt.Errorf("fillSparseFloatVecFieldData not implemented")
}
func fillBoolFieldData(ctx context.Context, vcm storage.ChunkManager, dataPath string, fieldData *schemapb.FieldData, i int, offset int64, endian binary.ByteOrder) error {
// read whole file.
// TODO: optimize here.
content, err := vcm.Read(ctx, dataPath)
if err != nil {
return err
}
var arr schemapb.BoolArray
err = proto.Unmarshal(content, &arr)
if err != nil {
return err
}
fieldData.GetScalars().GetBoolData().GetData()[i] = arr.Data[offset]
return nil
}
func fillStringFieldData(ctx context.Context, vcm storage.ChunkManager, dataPath string, fieldData *schemapb.FieldData, i int, offset int64, endian binary.ByteOrder) error {
// read whole file.
// TODO: optimize here.
content, err := vcm.Read(ctx, dataPath)
if err != nil {
return err
}
var arr schemapb.StringArray
err = proto.Unmarshal(content, &arr)
if err != nil {
return err
}
fieldData.GetScalars().GetStringData().GetData()[i] = arr.Data[offset]
return nil
}
func fillInt8FieldData(ctx context.Context, vcm storage.ChunkManager, dataPath string, fieldData *schemapb.FieldData, i int, offset int64, endian binary.ByteOrder) error {
// read by offset.
rowBytes := int64(1)
content, err := vcm.ReadAt(ctx, dataPath, offset*rowBytes, rowBytes)
if err != nil {
return err
}
var i8 int8
if err := funcutil.ReadBinary(endian, content, &i8); err != nil {
return err
}
fieldData.GetScalars().GetIntData().GetData()[i] = int32(i8)
return nil
}
func fillInt16FieldData(ctx context.Context, vcm storage.ChunkManager, dataPath string, fieldData *schemapb.FieldData, i int, offset int64, endian binary.ByteOrder) error {
// read by offset.
rowBytes := int64(2)
content, err := vcm.ReadAt(ctx, dataPath, offset*rowBytes, rowBytes)
if err != nil {
return err
}
var i16 int16
if err := funcutil.ReadBinary(endian, content, &i16); err != nil {
return err
}
fieldData.GetScalars().GetIntData().GetData()[i] = int32(i16)
return nil
}
func fillInt32FieldData(ctx context.Context, vcm storage.ChunkManager, dataPath string, fieldData *schemapb.FieldData, i int, offset int64, endian binary.ByteOrder) error {
// read by offset.
rowBytes := int64(4)
content, err := vcm.ReadAt(ctx, dataPath, offset*rowBytes, rowBytes)
if err != nil {
return err
}
return funcutil.ReadBinary(endian, content, &(fieldData.GetScalars().GetIntData().GetData()[i]))
}
func fillInt64FieldData(ctx context.Context, vcm storage.ChunkManager, dataPath string, fieldData *schemapb.FieldData, i int, offset int64, endian binary.ByteOrder) error {
// read by offset.
rowBytes := int64(8)
content, err := vcm.ReadAt(ctx, dataPath, offset*rowBytes, rowBytes)
if err != nil {
return err
}
return funcutil.ReadBinary(endian, content, &(fieldData.GetScalars().GetLongData().GetData()[i]))
}
func fillFloatFieldData(ctx context.Context, vcm storage.ChunkManager, dataPath string, fieldData *schemapb.FieldData, i int, offset int64, endian binary.ByteOrder) error {
// read by offset.
rowBytes := int64(4)
content, err := vcm.ReadAt(ctx, dataPath, offset*rowBytes, rowBytes)
if err != nil {
return err
}
return funcutil.ReadBinary(endian, content, &(fieldData.GetScalars().GetFloatData().GetData()[i]))
}
func fillDoubleFieldData(ctx context.Context, vcm storage.ChunkManager, dataPath string, fieldData *schemapb.FieldData, i int, offset int64, endian binary.ByteOrder) error {
// read by offset.
rowBytes := int64(8)
content, err := vcm.ReadAt(ctx, dataPath, offset*rowBytes, rowBytes)
if err != nil {
return err
}
return funcutil.ReadBinary(endian, content, &(fieldData.GetScalars().GetDoubleData().GetData()[i]))
}
func fillFieldData(ctx context.Context, vcm storage.ChunkManager, dataPath string, fieldData *schemapb.FieldData, i int, offset int64, endian binary.ByteOrder) error {
switch fieldData.Type {
case schemapb.DataType_BinaryVector:
return fillBinVecFieldData(ctx, vcm, dataPath, fieldData, i, offset, endian)
case schemapb.DataType_FloatVector:
return fillFloatVecFieldData(ctx, vcm, dataPath, fieldData, i, offset, endian)
case schemapb.DataType_Float16Vector:
return fillFloatVecFieldData(ctx, vcm, dataPath, fieldData, i, offset, endian)
case schemapb.DataType_BFloat16Vector:
return fillFloatVecFieldData(ctx, vcm, dataPath, fieldData, i, offset, endian)
case schemapb.DataType_SparseFloatVector:
return fillSparseFloatVecFieldData(ctx, vcm, dataPath, fieldData, i, offset, endian)
case schemapb.DataType_Bool:
return fillBoolFieldData(ctx, vcm, dataPath, fieldData, i, offset, endian)
case schemapb.DataType_String, schemapb.DataType_VarChar:
return fillStringFieldData(ctx, vcm, dataPath, fieldData, i, offset, endian)
case schemapb.DataType_Int8:
return fillInt8FieldData(ctx, vcm, dataPath, fieldData, i, offset, endian)
case schemapb.DataType_Int16:
return fillInt16FieldData(ctx, vcm, dataPath, fieldData, i, offset, endian)
case schemapb.DataType_Int32:
return fillInt32FieldData(ctx, vcm, dataPath, fieldData, i, offset, endian)
case schemapb.DataType_Int64:
return fillInt64FieldData(ctx, vcm, dataPath, fieldData, i, offset, endian)
case schemapb.DataType_Float:
return fillFloatFieldData(ctx, vcm, dataPath, fieldData, i, offset, endian)
case schemapb.DataType_Double:
return fillDoubleFieldData(ctx, vcm, dataPath, fieldData, i, offset, endian)
default:
return fmt.Errorf("invalid data type: %s", fieldData.Type.String())
}
}
// mergeRequestCost merge the costs of request, the cost may came from different worker in same channel // mergeRequestCost merge the costs of request, the cost may came from different worker in same channel
// or different channel in same collection, for now we just choose the part with the highest response time // or different channel in same collection, for now we just choose the part with the highest response time
func mergeRequestCost(requestCosts []*internalpb.CostAggregation) *internalpb.CostAggregation { func mergeRequestCost(requestCosts []*internalpb.CostAggregation) *internalpb.CostAggregation {

View File

@ -128,8 +128,8 @@ func (s *TestGetVectorSuite) run() {
vecFieldData = integration.NewFloatVectorFieldData(vecFieldName, NB, dim) vecFieldData = integration.NewFloatVectorFieldData(vecFieldName, NB, dim)
} else if s.vecType == schemapb.DataType_Float16Vector { } else if s.vecType == schemapb.DataType_Float16Vector {
vecFieldData = integration.NewFloat16VectorFieldData(vecFieldName, NB, dim) vecFieldData = integration.NewFloat16VectorFieldData(vecFieldName, NB, dim)
// } else if s.vecType == schemapb.DataType_BFloat16Vector { } else if s.vecType == schemapb.DataType_BFloat16Vector {
// vecFieldData = integration.NewBFloat16VectorFieldData(vecFieldName, NB, dim) vecFieldData = integration.NewBFloat16VectorFieldData(vecFieldName, NB, dim)
} else if typeutil.IsSparseFloatVectorType(s.vecType) { } else if typeutil.IsSparseFloatVectorType(s.vecType) {
vecFieldData = integration.NewSparseFloatVectorFieldData(vecFieldName, NB) vecFieldData = integration.NewSparseFloatVectorFieldData(vecFieldName, NB)
} else { } else {
@ -240,25 +240,45 @@ func (s *TestGetVectorSuite) run() {
} }
} }
} else if s.vecType == schemapb.DataType_Float16Vector { } else if s.vecType == schemapb.DataType_Float16Vector {
// s.Require().Len(result.GetFieldsData()[vecFieldIndex].GetVectors().GetFloat16Vector(), nq*topk*dim*2) s.Require().Len(result.GetFieldsData()[vecFieldIndex].GetVectors().GetFloat16Vector(), nq*topk*dim*2)
// rawData := vecFieldData.GetVectors().GetFloat16Vector() rawData := vecFieldData.GetVectors().GetFloat16Vector()
// resData := result.GetFieldsData()[vecFieldIndex].GetVectors().GetFloat16Vector() resData := result.GetFieldsData()[vecFieldIndex].GetVectors().GetFloat16Vector()
// if s.pkType == schemapb.DataType_Int64 { rowBytes := dim * 2
// for i, id := range result.GetIds().GetIntId().GetData() { if s.pkType == schemapb.DataType_Int64 {
// expect := rawData[int(id)*dim : (int(id)+1)*dim] for i, id := range result.GetIds().GetIntId().GetData() {
// actual := resData[i*dim : (i+1)*dim] expect := rawData[int(id)*rowBytes : (int(id)+1)*rowBytes]
// s.Require().ElementsMatch(expect, actual) actual := resData[i*rowBytes : (i+1)*rowBytes]
// } s.Require().ElementsMatch(expect, actual)
// } else { }
// for i, idStr := range result.GetIds().GetStrId().GetData() { } else {
// id, err := strconv.Atoi(idStr) for i, idStr := range result.GetIds().GetStrId().GetData() {
// s.Require().NoError(err) id, err := strconv.Atoi(idStr)
// expect := rawData[id*dim : (id+1)*dim] s.Require().NoError(err)
// actual := resData[i*dim : (i+1)*dim] expect := rawData[id*rowBytes : (id+1)*rowBytes]
// s.Require().ElementsMatch(expect, actual) actual := resData[i*rowBytes : (i+1)*rowBytes]
// } s.Require().ElementsMatch(expect, actual)
// } }
}
} else if s.vecType == schemapb.DataType_BFloat16Vector { } else if s.vecType == schemapb.DataType_BFloat16Vector {
s.Require().Len(result.GetFieldsData()[vecFieldIndex].GetVectors().GetBfloat16Vector(), nq*topk*dim*2)
rawData := vecFieldData.GetVectors().GetBfloat16Vector()
resData := result.GetFieldsData()[vecFieldIndex].GetVectors().GetBfloat16Vector()
rowBytes := dim * 2
if s.pkType == schemapb.DataType_Int64 {
for i, id := range result.GetIds().GetIntId().GetData() {
expect := rawData[int(id)*rowBytes : (int(id)+1)*rowBytes]
actual := resData[i*rowBytes : (i+1)*rowBytes]
s.Require().ElementsMatch(expect, actual)
}
} else {
for i, idStr := range result.GetIds().GetStrId().GetData() {
id, err := strconv.Atoi(idStr)
s.Require().NoError(err)
expect := rawData[id*rowBytes : (id+1)*rowBytes]
actual := resData[i*rowBytes : (i+1)*rowBytes]
s.Require().ElementsMatch(expect, actual)
}
}
} else if s.vecType == schemapb.DataType_SparseFloatVector { } else if s.vecType == schemapb.DataType_SparseFloatVector {
s.Require().Len(result.GetFieldsData()[vecFieldIndex].GetVectors().GetSparseFloatVector().GetContents(), nq*topk) s.Require().Len(result.GetFieldsData()[vecFieldIndex].GetVectors().GetSparseFloatVector().GetContents(), nq*topk)
rawData := vecFieldData.GetVectors().GetSparseFloatVector().GetContents() rawData := vecFieldData.GetVectors().GetSparseFloatVector().GetContents()
@ -278,23 +298,22 @@ func (s *TestGetVectorSuite) run() {
s.Require().Len(result.GetFieldsData()[vecFieldIndex].GetVectors().GetBinaryVector(), nq*topk*dim/8) s.Require().Len(result.GetFieldsData()[vecFieldIndex].GetVectors().GetBinaryVector(), nq*topk*dim/8)
rawData := vecFieldData.GetVectors().GetBinaryVector() rawData := vecFieldData.GetVectors().GetBinaryVector()
resData := result.GetFieldsData()[vecFieldIndex].GetVectors().GetBinaryVector() resData := result.GetFieldsData()[vecFieldIndex].GetVectors().GetBinaryVector()
rowBytes := dim / 8
if s.pkType == schemapb.DataType_Int64 { if s.pkType == schemapb.DataType_Int64 {
for i, id := range result.GetIds().GetIntId().GetData() { for i, id := range result.GetIds().GetIntId().GetData() {
dataBytes := dim / 8 for j := 0; j < rowBytes; j++ {
for j := 0; j < dataBytes; j++ { expect := rawData[int(id)*rowBytes+j]
expect := rawData[int(id)*dataBytes+j] actual := resData[i*rowBytes+j]
actual := resData[i*dataBytes+j]
s.Require().Equal(expect, actual) s.Require().Equal(expect, actual)
} }
} }
} else { } else {
for i, idStr := range result.GetIds().GetStrId().GetData() { for i, idStr := range result.GetIds().GetStrId().GetData() {
dataBytes := dim / 8
id, err := strconv.Atoi(idStr) id, err := strconv.Atoi(idStr)
s.Require().NoError(err) s.Require().NoError(err)
for j := 0; j < dataBytes; j++ { for j := 0; j < rowBytes; j++ {
expect := rawData[id*dataBytes+j] expect := rawData[id*rowBytes+j]
actual := resData[i*dataBytes+j] actual := resData[i*rowBytes+j]
s.Require().Equal(expect, actual) s.Require().Equal(expect, actual)
} }
} }
@ -319,26 +338,6 @@ func (s *TestGetVectorSuite) TestGetVector_FLAT() {
s.run() s.run()
} }
func (s *TestGetVectorSuite) TestGetVector_Float16Vector() {
s.nq = 10
s.topK = 10
s.indexType = integration.IndexFaissIDMap
s.metricType = metric.L2
s.pkType = schemapb.DataType_Int64
s.vecType = schemapb.DataType_Float16Vector
s.run()
}
// func (s *TestGetVectorSuite) TestGetVector_BFloat16Vector() {
// s.nq = 10
// s.topK = 10
// s.indexType = integration.IndexFaissIDMap
// s.metricType = metric.L2
// s.pkType = schemapb.DataType_Int64
// s.vecType = schemapb.DataType_BFloat16Vector
// s.run()
// }
func (s *TestGetVectorSuite) TestGetVector_IVF_FLAT() { func (s *TestGetVectorSuite) TestGetVector_IVF_FLAT() {
s.nq = 10 s.nq = 10
s.topK = 10 s.topK = 10
@ -429,6 +428,26 @@ func (s *TestGetVectorSuite) TestGetVector_BinaryVector() {
s.run() s.run()
} }
func (s *TestGetVectorSuite) TestGetVector_Float16Vector() {
s.nq = 10
s.topK = 10
s.indexType = integration.IndexHNSW
s.metricType = metric.L2
s.pkType = schemapb.DataType_Int64
s.vecType = schemapb.DataType_Float16Vector
s.run()
}
func (s *TestGetVectorSuite) TestGetVector_BFloat16Vector() {
s.nq = 10
s.topK = 10
s.indexType = integration.IndexHNSW
s.metricType = metric.L2
s.pkType = schemapb.DataType_Int64
s.vecType = schemapb.DataType_BFloat16Vector
s.run()
}
func (s *TestGetVectorSuite) TestGetVector_Big_NQ_TOPK() { func (s *TestGetVectorSuite) TestGetVector_Big_NQ_TOPK() {
s.T().Skip("skip big NQ Top due to timeout") s.T().Skip("skip big NQ Top due to timeout")
s.nq = 10000 s.nq = 10000

View File

@ -18,10 +18,14 @@ package integration
import ( import (
"context" "context"
"encoding/binary"
"fmt" "fmt"
"math"
"math/rand" "math/rand"
"time" "time"
"github.com/x448/float16"
"github.com/milvus-io/milvus-proto/go-api/v2/milvuspb" "github.com/milvus-io/milvus-proto/go-api/v2/milvuspb"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb" "github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/pkg/util/testutils" "github.com/milvus-io/milvus/pkg/util/testutils"
@ -163,20 +167,20 @@ func NewFloat16VectorFieldData(fieldName string, numRows, dim int) *schemapb.Fie
} }
} }
// func NewBFloat16VectorFieldData(fieldName string, numRows, dim int) *schemapb.FieldData { func NewBFloat16VectorFieldData(fieldName string, numRows, dim int) *schemapb.FieldData {
// return &schemapb.FieldData{ return &schemapb.FieldData{
// Type: schemapb.DataType_BFloat16Vector, Type: schemapb.DataType_BFloat16Vector,
// FieldName: fieldName, FieldName: fieldName,
// Field: &schemapb.FieldData_Vectors{ Field: &schemapb.FieldData_Vectors{
// Vectors: &schemapb.VectorField{ Vectors: &schemapb.VectorField{
// Dim: int64(dim), Dim: int64(dim),
// Data: &schemapb.VectorField_Bfloat16Vector{ Data: &schemapb.VectorField_Bfloat16Vector{
// Bfloat16Vector: GenerateBFloat16Vectors(numRows, dim), Bfloat16Vector: GenerateBFloat16Vectors(numRows, dim),
// }, },
// }, },
// }, },
// } }
// } }
func NewBinaryVectorFieldData(fieldName string, numRows, dim int) *schemapb.FieldData { func NewBinaryVectorFieldData(fieldName string, numRows, dim int) *schemapb.FieldData {
return &schemapb.FieldData{ return &schemapb.FieldData{
@ -261,11 +265,28 @@ func GenerateBinaryVectors(numRows, dim int) []byte {
} }
func GenerateFloat16Vectors(numRows, dim int) []byte { func GenerateFloat16Vectors(numRows, dim int) []byte {
total := numRows * dim * 2 total := numRows * dim
ret := make([]byte, total) ret := make([]byte, total*2)
_, err := rand.Read(ret) for i := 0; i < total; i++ {
if err != nil { v := float16.Fromfloat32(rand.Float32()).Bits()
panic(err) binary.LittleEndian.PutUint16(ret[i*2:], v)
}
return ret
}
func GenerateBFloat16Vectors(numRows, dim int) []byte {
total := numRows * dim
ret16 := make([]uint16, 0, total)
for i := 0; i < total; i++ {
f := rand.Float32()
bits := math.Float32bits(f)
bits >>= 16
bits &= 0x7FFF
ret16 = append(ret16, uint16(bits))
}
ret := make([]byte, total*2)
for i, value := range ret16 {
binary.LittleEndian.PutUint16(ret[i*2:], value)
} }
return ret return ret
} }
@ -274,16 +295,6 @@ func GenerateSparseFloatArray(numRows int) *schemapb.SparseFloatArray {
return testutils.GenerateSparseFloatVectors(numRows) return testutils.GenerateSparseFloatVectors(numRows)
} }
// func GenerateBFloat16Vectors(numRows, dim int) []byte {
// total := numRows * dim * 2
// ret := make([]byte, total)
// _, err := rand.Read(ret)
// if err != nil {
// panic(err)
// }
// return ret
// }
func GenerateHashKeys(numRows int) []uint32 { func GenerateHashKeys(numRows int) []uint32 {
ret := make([]uint32, 0, numRows) ret := make([]uint32, 0, numRows)
for i := 0; i < numRows; i++ { for i := 0; i < numRows; i++ {