feat: Support bulk insert for Int8Vector (#39499)

Issue: #38666

Signed-off-by: Cai Yudong <yudong.cai@zilliz.com>
pull/39526/head
Cai Yudong 2025-01-23 10:19:06 +08:00 committed by GitHub
parent f070af67f7
commit 7476eb3625
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 106 additions and 1 deletions

View File

@ -167,6 +167,10 @@ func createBinlogBuf(t *testing.T, field *schemapb.FieldSchema, data storage.Fie
vectors := data.(*storage.SparseFloatVectorFieldData)
err = evt.AddSparseFloatVectorToPayload(vectors)
assert.NoError(t, err)
case schemapb.DataType_Int8Vector:
vectors := data.(*storage.Int8VectorFieldData).Data
err = evt.AddInt8VectorToPayload(vectors, int(dim))
assert.NoError(t, err)
default:
assert.True(t, false)
return nil
@ -420,6 +424,8 @@ func (suite *ReaderSuite) TestVector() {
suite.run(schemapb.DataType_Int32, schemapb.DataType_None, false)
suite.vecDataType = schemapb.DataType_SparseFloatVector
suite.run(schemapb.DataType_Int32, schemapb.DataType_None, false)
suite.vecDataType = schemapb.DataType_Int8Vector
suite.run(schemapb.DataType_Int32, schemapb.DataType_None, false)
}
func TestUtil(t *testing.T) {

View File

@ -229,6 +229,8 @@ func (suite *ReaderSuite) TestVector() {
suite.run(schemapb.DataType_Int32, schemapb.DataType_None, false)
suite.vecDataType = schemapb.DataType_SparseFloatVector
suite.run(schemapb.DataType_Int32, schemapb.DataType_None, false)
suite.vecDataType = schemapb.DataType_Int8Vector
suite.run(schemapb.DataType_Int32, schemapb.DataType_None, false)
}
func TestUtil(t *testing.T) {

View File

@ -337,6 +337,19 @@ func (r *rowParser) parseEntity(field *schemapb.FieldSchema, obj string) (any, e
return nil, err
}
return vec2, nil
case schemapb.DataType_Int8Vector:
if nullable && obj == r.nullkey {
return nil, merr.WrapErrParameterInvalidMsg("not support nullable in vector")
}
var vec []int8
err := json.Unmarshal([]byte(obj), &vec)
if err != nil {
return nil, r.wrapTypeError(obj, field)
}
if len(vec) != r.name2Dim[field.GetName()] {
return nil, r.wrapDimError(len(vec), field)
}
return vec, nil
case schemapb.DataType_Array:
if nullable && obj == r.nullkey {
return nil, nil

View File

@ -309,6 +309,8 @@ func (suite *ReaderSuite) TestVector() {
suite.run(schemapb.DataType_Int32, schemapb.DataType_None, false)
suite.vecDataType = schemapb.DataType_SparseFloatVector
suite.run(schemapb.DataType_Int32, schemapb.DataType_None, false)
suite.vecDataType = schemapb.DataType_Int8Vector
suite.run(schemapb.DataType_Int32, schemapb.DataType_None, false)
}
func TestUtil(t *testing.T) {

View File

@ -377,6 +377,27 @@ func (r *rowParser) parseEntity(fieldID int64, obj any) (any, error) {
return nil, err
}
return vec, nil
case schemapb.DataType_Int8Vector:
arr, ok := obj.([]interface{})
if !ok {
return nil, r.wrapTypeError(obj, fieldID)
}
if len(arr) != r.id2Dim[fieldID] {
return nil, r.wrapDimError(len(arr), fieldID)
}
vec := make([]int8, len(arr))
for i := 0; i < len(arr); i++ {
value, ok := arr[i].(json.Number)
if !ok {
return nil, r.wrapTypeError(arr[i], fieldID)
}
num, err := strconv.ParseInt(value.String(), 10, 8)
if err != nil {
return nil, err
}
vec[i] = int8(num)
}
return vec, nil
case schemapb.DataType_String, schemapb.DataType_VarChar:
value, ok := obj.(string)
if !ok {
@ -521,7 +542,7 @@ func (r *rowParser) parseNullableEntity(fieldID int64, obj any) (any, error) {
return nil, err
}
return num, nil
case schemapb.DataType_BinaryVector, schemapb.DataType_FloatVector, schemapb.DataType_Float16Vector, schemapb.DataType_BFloat16Vector, schemapb.DataType_SparseFloatVector:
case schemapb.DataType_BinaryVector, schemapb.DataType_FloatVector, schemapb.DataType_Float16Vector, schemapb.DataType_BFloat16Vector, schemapb.DataType_SparseFloatVector, schemapb.DataType_Int8Vector:
return nil, merr.WrapErrParameterInvalidMsg("not support nullable in vector")
case schemapb.DataType_String, schemapb.DataType_VarChar:
if obj == nil {

View File

@ -104,6 +104,8 @@ func (c *FieldReader) getCount(count int64) int64 {
count *= c.dim
case schemapb.DataType_Float16Vector, schemapb.DataType_BFloat16Vector:
count *= c.dim * 2
case schemapb.DataType_Int8Vector:
count *= c.dim
}
if int(count) > (total - c.readPosition) {
return int64(total - c.readPosition)
@ -203,6 +205,12 @@ func (c *FieldReader) Next(count int64) (any, error) {
return nil, err
}
c.readPosition += int(readCount)
case schemapb.DataType_Int8Vector:
data, err = ReadN[int8](c.reader, c.order, readCount)
if err != nil {
return nil, err
}
c.readPosition += int(readCount)
case schemapb.DataType_FloatVector:
var elementType schemapb.DataType
elementType, err = convertNumpyType(c.npyReader.Header.Descr.Type)

View File

@ -190,6 +190,14 @@ func (suite *ReaderSuite) run(dt schemapb.DataType) {
copy(chunkedRows[i][:], innerSlice)
}
data = chunkedRows
case schemapb.DataType_Int8Vector:
rows := fieldData.GetDataRows().([]int8)
chunked := lo.Chunk(rows, dim)
chunkedRows := make([][dim]int8, len(chunked))
for i, innerSlice := range chunked {
copy(chunkedRows[i][:], innerSlice)
}
data = chunkedRows
default:
data = fieldData.GetDataRows()
}
@ -324,6 +332,14 @@ func (suite *ReaderSuite) failRun(dt schemapb.DataType, isDynamic bool) {
copy(chunkedRows[i][:], innerSlice)
}
data = chunkedRows
case schemapb.DataType_Int8Vector:
rows := fieldData.GetDataRows().([]int8)
chunked := lo.Chunk(rows, dim)
chunkedRows := make([][dim]int8, len(chunked))
for i, innerSlice := range chunked {
copy(chunkedRows[i][:], innerSlice)
}
data = chunkedRows
default:
data = fieldData.GetDataRows()
}
@ -432,6 +448,8 @@ func (suite *ReaderSuite) TestVector() {
suite.run(schemapb.DataType_Int32)
// suite.vecDataType = schemapb.DataType_SparseFloatVector
// suite.run(schemapb.DataType_Int32)
suite.vecDataType = schemapb.DataType_Int8Vector
suite.run(schemapb.DataType_Int32)
}
func TestUtil(t *testing.T) {

View File

@ -223,6 +223,16 @@ func validateHeader(npyReader *npy.Reader, field *schemapb.FieldSchema, dim int)
if shape[1] != dim/8 {
return wrapDimError(shape[1]*8, dim, field)
}
case schemapb.DataType_Int8Vector:
if elementType != schemapb.DataType_Int8 {
return wrapElementTypeError(elementType, field)
}
if len(shape) != 2 {
return wrapShapeError(len(shape), 2, field)
}
if shape[1] != dim {
return wrapDimError(shape[1], dim, field)
}
case schemapb.DataType_VarChar, schemapb.DataType_JSON:
if len(shape) != 1 {
return wrapShapeError(len(shape), 1, field)

View File

@ -176,6 +176,19 @@ func (c *FieldReader) Next(count int64) (any, any, error) {
}
data, err := ReadSparseFloatVectorData(c, count)
return data, nil, err
case schemapb.DataType_Int8Vector:
if c.field.GetNullable() {
return nil, nil, merr.WrapErrParameterInvalidMsg("not support nullable in vector")
}
arrayData, err := ReadIntegerOrFloatArrayData[int8](c, count)
if err != nil {
return nil, nil, err
}
if arrayData == nil {
return nil, nil, nil
}
vectors := lo.Flatten(arrayData.([][]int8))
return vectors, nil, nil
case schemapb.DataType_Array:
// array has not support default_value
if c.field.GetNullable() {
@ -708,6 +721,8 @@ func checkVectorAligned(offsets []int32, dim int, dataType schemapb.DataType) er
case schemapb.DataType_SparseFloatVector:
// JSON format, skip alignment check
return nil
case schemapb.DataType_Int8Vector:
return checkVectorAlignWithDim(offsets, int32(dim))
default:
return fmt.Errorf("unexpected vector data type %s", dataType.String())
}

View File

@ -495,6 +495,8 @@ func (s *ReaderSuite) TestVector() {
s.run(schemapb.DataType_Int32, schemapb.DataType_None, false, 0)
s.vecDataType = schemapb.DataType_SparseFloatVector
s.run(schemapb.DataType_Int32, schemapb.DataType_None, false, 0)
s.vecDataType = schemapb.DataType_Int8Vector
s.run(schemapb.DataType_Int32, schemapb.DataType_None, false, 0)
}
func TestUtil(t *testing.T) {

View File

@ -202,6 +202,13 @@ func convertToArrowDataType(field *schemapb.FieldSchema, isArray bool) (arrow.Da
}), nil
case schemapb.DataType_SparseFloatVector:
return &arrow.StringType{}, nil
case schemapb.DataType_Int8Vector:
return arrow.ListOfField(arrow.Field{
Name: "item",
Type: &arrow.Int8Type{},
Nullable: true,
Metadata: arrow.Metadata{},
}), nil
default:
return nil, merr.WrapErrParameterInvalidMsg("unsupported data type %v", dataType.String())
}

View File

@ -1409,6 +1409,7 @@ class TestCollectionWithAuth(TestBase):
@pytest.mark.L0
@pytest.mark.skip("skip temporarily, need fix")
class TestCollectionProperties(TestBase):
"""Test collection property operations"""