enhance: Return `DeltaData` for `DeleteCodec.Deserialize` (#37214)

Related to #35303 #30404

This PR changes the return type of `DeleteCodec.Deserialize` from
`storage.DeleteData` to `storage.DeltaData`, which stores primary keys
in typed columns instead of per-row interface values and thus reduces
the memory overhead of interface headers.

Also refines the `storage.DeltaData` methods to make them easier to use.
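For reference, the new call pattern looks roughly like this (a minimal
sketch; `deleteCodec` and `blobs` stand in for an existing codec instance
and the deltalog blobs to decode):

	pid, sid, deltaData, err := deleteCodec.Deserialize(blobs)
	if err != nil {
		return err
	}
	// typed accessors replace the old exported fields
	pks := deltaData.DeletePks()        // typed primary-key column
	tss := deltaData.DeleteTimestamps() // []storage.Timestamp
	for i := 0; i < int(deltaData.DeleteRowCount()); i++ {
		_ = pks.Get(i) // a PrimaryKey interface value is built only on access
		_ = tss[i]
	}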

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>
congqixia 2024-10-29 12:04:24 +08:00 committed by GitHub
parent 0f59bfdf30
commit 3106384fc4
11 changed files with 269 additions and 58 deletions


@@ -20,9 +20,10 @@ import (
 	"fmt"
 	"testing"
 
-	"github.com/stretchr/testify/assert"
 	"github.com/milvus-io/milvus/pkg/config"
 	"github.com/milvus-io/milvus/pkg/util/paramtable"
+
+	"github.com/stretchr/testify/assert"
 )
 
 func TestResizePools(t *testing.T) {


@@ -17,7 +17,7 @@ type DeltalogIterator struct {
 	disposedOnce sync.Once
 	disposed     atomic.Bool
-	data         *storage.DeleteData
+	data         *storage.DeltaData
 	blobs        []*storage.Blob
 	label        *Label
 	pos          int
@@ -50,8 +50,8 @@ func (d *DeltalogIterator) Next() (*LabeledRowData, error) {
 	}
 	row := &DeltalogRow{
-		Pk:        d.data.Pks[d.pos],
-		Timestamp: d.data.Tss[d.pos],
+		Pk:        d.data.DeletePks().Get(d.pos),
+		Timestamp: d.data.DeleteTimestamps()[d.pos],
 	}
 	d.pos++
@@ -76,7 +76,7 @@ func (d *DeltalogIterator) hasNext() bool {
 		d.data = dData
 		d.blobs = nil
 	}
-	return int64(d.pos) < d.data.RowCount
+	return int64(d.pos) < d.data.DeleteRowCount()
 }
 
 func (d *DeltalogIterator) isDisposed() bool {


@@ -1509,18 +1509,15 @@ func (s *DelegatorDataSuite) TestSyncTargetVersion() {
 func (s *DelegatorDataSuite) TestLevel0Deletions() {
 	delegator := s.delegator
 	partitionID := int64(10)
-	partitionDelPks := storage.NewInt64PrimaryKeys(1)
-	partitionDelPks.AppendRaw(1)
-	allPartitionDelPks := storage.NewInt64PrimaryKeys(1)
-	allPartitionDelPks.AppendRaw(2)
-	partitionDeleteData := &storage.DeltaData{
-		DeletePks:        partitionDelPks,
-		DeleteTimestamps: []storage.Timestamp{100},
-	}
-	allPartitionDeleteData := &storage.DeltaData{
-		DeletePks:        allPartitionDelPks,
-		DeleteTimestamps: []storage.Timestamp{101},
-	}
+	partitionDeleteData, err := storage.NewDeltaDataWithPkType(1, schemapb.DataType_Int64)
+	s.Require().NoError(err)
+	err = partitionDeleteData.Append(storage.NewInt64PrimaryKey(1), 100)
+	s.Require().NoError(err)
+
+	allPartitionDeleteData, err := storage.NewDeltaDataWithPkType(1, schemapb.DataType_Int64)
+	s.Require().NoError(err)
+	err = allPartitionDeleteData.Append(storage.NewInt64PrimaryKey(2), 101)
+	s.Require().NoError(err)
 
 	schema := segments.GenTestCollectionSchema("test_stop", schemapb.DataType_Int64, true)
 	collection := segments.NewCollection(1, schema, nil, &querypb.LoadMetaInfo{
@@ -1549,29 +1546,29 @@ func (s *DelegatorDataSuite) TestLevel0Deletions() {
 	l0Global.LoadDeltaData(context.TODO(), allPartitionDeleteData)
 
 	pks, _ := delegator.GetLevel0Deletions(partitionID, pkoracle.NewCandidateKey(l0.ID(), l0.Partition(), segments.SegmentTypeGrowing))
-	s.True(pks[0].EQ(partitionDeleteData.DeletePks.Get(0)))
+	s.True(pks[0].EQ(partitionDeleteData.DeletePks().Get(0)))
 
 	pks, _ = delegator.GetLevel0Deletions(partitionID+1, pkoracle.NewCandidateKey(l0.ID(), l0.Partition(), segments.SegmentTypeGrowing))
 	s.Empty(pks)
 
 	delegator.segmentManager.Put(context.TODO(), segments.SegmentTypeSealed, l0Global)
 	pks, _ = delegator.GetLevel0Deletions(partitionID, pkoracle.NewCandidateKey(l0.ID(), l0.Partition(), segments.SegmentTypeGrowing))
-	s.ElementsMatch(pks, []storage.PrimaryKey{partitionDeleteData.DeletePks.Get(0), allPartitionDeleteData.DeletePks.Get(0)})
+	s.ElementsMatch(pks, []storage.PrimaryKey{partitionDeleteData.DeletePks().Get(0), allPartitionDeleteData.DeletePks().Get(0)})
 
 	bfs := pkoracle.NewBloomFilterSet(3, l0.Partition(), commonpb.SegmentState_Sealed)
-	bfs.UpdateBloomFilter([]storage.PrimaryKey{allPartitionDeleteData.DeletePks.Get(0)})
+	bfs.UpdateBloomFilter([]storage.PrimaryKey{allPartitionDeleteData.DeletePks().Get(0)})
 	pks, _ = delegator.GetLevel0Deletions(partitionID, bfs)
 	// bf filtered segment
 	s.Equal(len(pks), 1)
-	s.True(pks[0].EQ(allPartitionDeleteData.DeletePks.Get(0)))
+	s.True(pks[0].EQ(allPartitionDeleteData.DeletePks().Get(0)))
 
 	delegator.segmentManager.Remove(context.TODO(), l0.ID(), querypb.DataScope_All)
 	pks, _ = delegator.GetLevel0Deletions(partitionID, pkoracle.NewCandidateKey(l0.ID(), l0.Partition(), segments.SegmentTypeGrowing))
-	s.True(pks[0].EQ(allPartitionDeleteData.DeletePks.Get(0)))
+	s.True(pks[0].EQ(allPartitionDeleteData.DeletePks().Get(0)))
 
 	pks, _ = delegator.GetLevel0Deletions(partitionID+1, pkoracle.NewCandidateKey(l0.ID(), l0.Partition(), segments.SegmentTypeGrowing))
-	s.True(pks[0].EQ(allPartitionDeleteData.DeletePks.Get(0)))
+	s.True(pks[0].EQ(allPartitionDeleteData.DeletePks().Get(0)))
 
 	delegator.segmentManager.Remove(context.TODO(), l0Global.ID(), querypb.DataScope_All)
 	pks, _ = delegator.GetLevel0Deletions(partitionID+1, pkoracle.NewCandidateKey(l0.ID(), l0.Partition(), segments.SegmentTypeGrowing))


@@ -1019,8 +1019,8 @@ func (s *LocalSegment) AddFieldDataInfo(ctx context.Context, rowCount int64, fie
 }
 
 func (s *LocalSegment) LoadDeltaData(ctx context.Context, deltaData *storage.DeltaData) error {
-	pks, tss := deltaData.DeletePks, deltaData.DeleteTimestamps
-	rowNum := deltaData.DelRowCount
+	pks, tss := deltaData.DeletePks(), deltaData.DeleteTimestamps()
+	rowNum := deltaData.DeleteRowCount()
 
 	if !s.ptrLock.RLockIf(state.IsNotReleased) {
 		return merr.WrapErrSegmentNotLoaded(s.ID(), "segment released")

@@ -155,10 +155,10 @@ func (s *L0Segment) LoadDeltaData(ctx context.Context, deltaData *storage.DeltaD
 	s.dataGuard.Lock()
 	defer s.dataGuard.Unlock()
 
-	for i := 0; i < deltaData.DeletePks.Len(); i++ {
-		s.pks = append(s.pks, deltaData.DeletePks.Get(i))
+	for i := 0; i < int(deltaData.DeleteRowCount()); i++ {
+		s.pks = append(s.pks, deltaData.DeletePks().Get(i))
 	}
-	s.tss = append(s.tss, deltaData.DeleteTimestamps...)
+	s.tss = append(s.tss, deltaData.DeleteTimestamps()...)
 	return nil
 }


@@ -1212,22 +1212,13 @@ func (loader *segmentLoader) loadDeltalogs(ctx context.Context, segment Segment,
 		return blob.RowNum
 	})
 
-	var deltaData *storage.DeltaData
 	collection := loader.manager.Collection.Get(segment.Collection())
 	helper, _ := typeutil.CreateSchemaHelper(collection.Schema())
 	pkField, _ := helper.GetPrimaryKeyField()
-	switch pkField.DataType {
-	case schemapb.DataType_Int64:
-		deltaData = &storage.DeltaData{
-			DeletePks:        storage.NewInt64PrimaryKeys(int(rowNums)),
-			DeleteTimestamps: make([]uint64, 0, rowNums),
-		}
-	case schemapb.DataType_VarChar:
-		deltaData = &storage.DeltaData{
-			DeletePks:        storage.NewVarcharPrimaryKeys(int(rowNums)),
-			DeleteTimestamps: make([]uint64, 0, rowNums),
-		}
-	}
+	deltaData, err := storage.NewDeltaDataWithPkType(rowNums, pkField.DataType)
+	if err != nil {
+		return err
+	}
 
 	reader, err := storage.CreateDeltalogReader(blobs)
@@ -1244,9 +1235,10 @@ func (loader *segmentLoader) loadDeltalogs(ctx context.Context, segment Segment,
 			return err
 		}
 
 		dl := reader.Value()
-		deltaData.DeletePks.MustAppend(dl.Pk)
-		deltaData.DeleteTimestamps = append(deltaData.DeleteTimestamps, dl.Ts)
-		deltaData.DelRowCount++
+		err = deltaData.Append(dl.Pk, dl.Ts)
+		if err != nil {
+			return err
+		}
 	}
 
 	err = segment.LoadDeltaData(ctx, deltaData)
@@ -1254,7 +1246,7 @@ func (loader *segmentLoader) loadDeltalogs(ctx context.Context, segment Segment,
 		return err
 	}
 
-	log.Info("load delta logs done", zap.Int64("deleteCount", deltaData.DelRowCount))
+	log.Info("load delta logs done", zap.Int64("deleteCount", deltaData.DeleteRowCount()))
 	return nil
 }


@@ -23,6 +23,8 @@ import (
 	"math"
 	"sort"
 
+	"github.com/samber/lo"
+
 	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
 	"github.com/milvus-io/milvus/internal/proto/etcdpb"
 	"github.com/milvus-io/milvus/pkg/common"
@@ -761,13 +763,17 @@ func (deleteCodec *DeleteCodec) Serialize(collectionID UniqueID, partitionID Uni
 }
 
 // Deserialize deserializes the deltalog blobs into DeleteData
-func (deleteCodec *DeleteCodec) Deserialize(blobs []*Blob) (partitionID UniqueID, segmentID UniqueID, data *DeleteData, err error) {
+func (deleteCodec *DeleteCodec) Deserialize(blobs []*Blob) (partitionID UniqueID, segmentID UniqueID, data *DeltaData, err error) {
 	if len(blobs) == 0 {
 		return InvalidUniqueID, InvalidUniqueID, nil, fmt.Errorf("blobs is empty")
 	}
 
+	rowNum := lo.SumBy(blobs, func(blob *Blob) int64 {
+		return blob.RowNum
+	})
+
 	var pid, sid UniqueID
-	result := &DeleteData{}
+	result := NewDeltaData(rowNum)
 
 	deserializeBlob := func(blob *Blob) error {
 		binlogReader, err := NewBinlogReader(blob.Value)
@@ -801,7 +807,10 @@ func (deleteCodec *DeleteCodec) Deserialize(blobs []*Blob) (partitionID UniqueID
 			if err != nil {
 				return err
 			}
-			result.Append(deleteLog.Pk, deleteLog.Ts)
+			err = result.Append(deleteLog.Pk, deleteLog.Ts)
+			if err != nil {
+				return err
+			}
 		}
 		return nil
 	}


@@ -574,9 +574,11 @@ func TestDeleteCodec(t *testing.T) {
 		pid, sid, data, err := deleteCodec.Deserialize([]*Blob{blob})
 		assert.NoError(t, err)
+		intPks, ok := data.DeletePks().(*Int64PrimaryKeys)
+		require.True(t, ok)
 		assert.Equal(t, pid, int64(1))
 		assert.Equal(t, sid, int64(1))
-		assert.Equal(t, data, deleteData)
+		assert.Equal(t, []int64{1, 2}, intPks.values)
 	})
 
 	t.Run("string pk", func(t *testing.T) {
@@ -591,9 +593,11 @@ func TestDeleteCodec(t *testing.T) {
 		pid, sid, data, err := deleteCodec.Deserialize([]*Blob{blob})
 		assert.NoError(t, err)
+		strPks, ok := data.DeletePks().(*VarcharPrimaryKeys)
+		require.True(t, ok)
 		assert.Equal(t, pid, int64(1))
 		assert.Equal(t, sid, int64(1))
-		assert.Equal(t, data, deleteData)
+		assert.Equal(t, []string{"test1", "test2"}, strPks.values)
 	})
 }
@@ -633,8 +637,10 @@ func TestUpgradeDeleteLog(t *testing.T) {
 		assert.NoError(t, err)
 		assert.Equal(t, int64(1), parID)
 		assert.Equal(t, int64(1), segID)
-		assert.ElementsMatch(t, dData.Pks, deleteData.Pks)
-		assert.ElementsMatch(t, dData.Tss, deleteData.Tss)
+		intPks, ok := deleteData.DeletePks().(*Int64PrimaryKeys)
+		require.True(t, ok)
+		assert.ElementsMatch(t, []int64{1, 2}, intPks.values)
+		assert.ElementsMatch(t, dData.Tss, deleteData.DeleteTimestamps())
 	})
 
 	t.Run("with split lenth error", func(t *testing.T) {


@@ -21,11 +21,13 @@ import (
 	"fmt"
 	"strconv"
 	"strings"
+	"sync"
 
 	"github.com/samber/lo"
 	"github.com/valyala/fastjson"
 
 	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
+	"github.com/milvus-io/milvus/pkg/util/merr"
 )
 
 // parserPool use object pooling to reduce fastjson.Parser allocation.
@@ -34,14 +36,84 @@ var parserPool = &fastjson.ParserPool{}
 
 // DeltaData stores delta data
 // currently only delete tuples are stored
 type DeltaData struct {
-	PkType schemapb.DataType
+	pkType schemapb.DataType
 	// delete tuples
-	DeletePks        PrimaryKeys
-	DeleteTimestamps []Timestamp
+	deletePks        PrimaryKeys
+	deleteTimestamps []Timestamp
 
 	// stats
-	DelRowCount int64
-	MemSize     int64
+	delRowCount int64
+
+	initCap      int64
+	typeInitOnce sync.Once
+}
+
+func (dd *DeltaData) initPkType(pkType schemapb.DataType) error {
+	var err error
+	dd.typeInitOnce.Do(func() {
+		switch pkType {
+		case schemapb.DataType_Int64:
+			dd.deletePks = NewInt64PrimaryKeys(dd.initCap)
+		case schemapb.DataType_VarChar:
+			dd.deletePks = NewVarcharPrimaryKeys(dd.initCap)
+		default:
+			err = merr.WrapErrServiceInternal("unsupported pk type", pkType.String())
+		}
+		dd.pkType = pkType
+	})
+	return err
+}
+
+func (dd *DeltaData) PkType() schemapb.DataType {
+	return dd.pkType
+}
+
+func (dd *DeltaData) DeletePks() PrimaryKeys {
+	return dd.deletePks
+}
+
+func (dd *DeltaData) DeleteTimestamps() []Timestamp {
+	return dd.deleteTimestamps
+}
+
+func (dd *DeltaData) Append(pk PrimaryKey, ts Timestamp) error {
+	dd.initPkType(pk.Type())
+	err := dd.deletePks.Append(pk)
+	if err != nil {
+		return err
+	}
+	dd.deleteTimestamps = append(dd.deleteTimestamps, ts)
+	dd.delRowCount++
+	return nil
+}
+
+func (dd *DeltaData) DeleteRowCount() int64 {
+	return dd.delRowCount
+}
+
+func (dd *DeltaData) MemSize() int64 {
+	var result int64
+	if dd.deletePks != nil {
+		result += dd.deletePks.Size()
+	}
+	result += int64(len(dd.deleteTimestamps) * 8)
+	return result
+}
+
+func NewDeltaData(cap int64) *DeltaData {
+	return &DeltaData{
+		deleteTimestamps: make([]Timestamp, 0, cap),
+		initCap:          cap,
+	}
+}
+
+func NewDeltaDataWithPkType(cap int64, pkType schemapb.DataType) (*DeltaData, error) {
+	result := NewDeltaData(cap)
+	err := result.initPkType(pkType)
+	if err != nil {
+		return nil, err
+	}
+	return result, nil
+}
 
 type DeleteLog struct {
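Note that `Append` pins the pk type on first use via `typeInitOnce`, so a
caller that does not know the pk type up front can let the first appended
key decide it. A minimal sketch of that behavior, using the constructors
and methods defined in the hunk above:

	dd := storage.NewDeltaData(2)                       // pk type not pinned yet
	_ = dd.Append(storage.NewInt64PrimaryKey(100), 100) // first Append pins pkType to Int64
	err := dd.Append(storage.NewVarCharPrimaryKey("x"), 101)
	// err != nil: once pinned, the typed column rejects mismatched pk types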


@@ -20,6 +20,8 @@ import (
 	"testing"
 
 	"github.com/stretchr/testify/suite"
+
+	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
 )
 
 type DeleteLogSuite struct {
@@ -150,3 +152,135 @@ func (s *DeleteLogSuite) TestUnmarshalJSON() {
 func TestDeleteLog(t *testing.T) {
 	suite.Run(t, new(DeleteLogSuite))
 }
+
+type DeltaDataSuite struct {
+	suite.Suite
+}
+
+func (s *DeltaDataSuite) TestCreateWithPkType() {
+	s.Run("int_pks", func() {
+		dd, err := NewDeltaDataWithPkType(10, schemapb.DataType_Int64)
+		s.Require().NoError(err)
+		s.Equal(schemapb.DataType_Int64, dd.PkType())
+		intPks, ok := dd.DeletePks().(*Int64PrimaryKeys)
+		s.Require().True(ok)
+		s.EqualValues(10, cap(intPks.values))
+	})
+
+	s.Run("varchar_pks", func() {
+		dd, err := NewDeltaDataWithPkType(10, schemapb.DataType_VarChar)
+		s.Require().NoError(err)
+		s.Equal(schemapb.DataType_VarChar, dd.PkType())
+		intPks, ok := dd.DeletePks().(*VarcharPrimaryKeys)
+		s.Require().True(ok)
+		s.EqualValues(10, cap(intPks.values))
+	})
+
+	s.Run("unsupport_pk_type", func() {
+		_, err := NewDeltaDataWithPkType(10, schemapb.DataType_Bool)
+		s.Error(err)
+	})
+}
+
+func (s *DeltaDataSuite) TestAppend() {
+	s.Run("normal_same_type", func() {
+		s.Run("int64_pk", func() {
+			dd, err := NewDeltaDataWithPkType(10, schemapb.DataType_Int64)
+			s.Require().NoError(err)
+			err = dd.Append(NewInt64PrimaryKey(100), 100)
+			s.NoError(err)
+			err = dd.Append(NewInt64PrimaryKey(200), 200)
+			s.NoError(err)
+			s.EqualValues(2, dd.DeleteRowCount())
+			s.Equal([]Timestamp{100, 200}, dd.DeleteTimestamps())
+			intPks, ok := dd.DeletePks().(*Int64PrimaryKeys)
+			s.Require().True(ok)
+			s.Equal([]int64{100, 200}, intPks.values)
+			s.EqualValues(32, dd.MemSize())
+		})
+
+		s.Run("varchar_pk", func() {
+			dd, err := NewDeltaDataWithPkType(10, schemapb.DataType_VarChar)
+			s.Require().NoError(err)
+			err = dd.Append(NewVarCharPrimaryKey("100"), 100)
+			s.NoError(err)
+			err = dd.Append(NewVarCharPrimaryKey("200"), 200)
+			s.NoError(err)
+			s.EqualValues(2, dd.DeleteRowCount())
+			s.Equal([]Timestamp{100, 200}, dd.DeleteTimestamps())
+			varcharPks, ok := dd.DeletePks().(*VarcharPrimaryKeys)
+			s.Require().True(ok)
+			s.Equal([]string{"100", "200"}, varcharPks.values)
+			s.EqualValues(54, dd.MemSize())
+		})
+	})
+
+	s.Run("deduct_pk_type", func() {
+		s.Run("int64_pk", func() {
+			dd := NewDeltaData(10)
+			err := dd.Append(NewInt64PrimaryKey(100), 100)
+			s.NoError(err)
+			s.Equal(schemapb.DataType_Int64, dd.PkType())
+			err = dd.Append(NewInt64PrimaryKey(200), 200)
+			s.NoError(err)
+			s.Equal(schemapb.DataType_Int64, dd.PkType())
+			s.EqualValues(2, dd.DeleteRowCount())
+			s.Equal([]Timestamp{100, 200}, dd.DeleteTimestamps())
+			intPks, ok := dd.DeletePks().(*Int64PrimaryKeys)
+			s.Require().True(ok)
+			s.Equal([]int64{100, 200}, intPks.values)
+		})
+
+		s.Run("varchar_pk", func() {
+			dd := NewDeltaData(10)
+			err := dd.Append(NewVarCharPrimaryKey("100"), 100)
+			s.NoError(err)
+			s.Equal(schemapb.DataType_VarChar, dd.PkType())
+			err = dd.Append(NewVarCharPrimaryKey("200"), 200)
+			s.NoError(err)
+			s.Equal(schemapb.DataType_VarChar, dd.PkType())
+			s.EqualValues(2, dd.DeleteRowCount())
+			s.Equal([]Timestamp{100, 200}, dd.DeleteTimestamps())
+			varcharPks, ok := dd.DeletePks().(*VarcharPrimaryKeys)
+			s.Require().True(ok)
+			s.Equal([]string{"100", "200"}, varcharPks.values)
+		})
+	})
+
+	// expect errors
+	s.Run("mixed_pk_type", func() {
+		s.Run("intpks_append_varchar", func() {
+			dd, err := NewDeltaDataWithPkType(10, schemapb.DataType_Int64)
+			s.Require().NoError(err)
+			err = dd.Append(NewVarCharPrimaryKey("100"), 100)
+			s.Error(err)
+		})
+
+		s.Run("varcharpks_append_int", func() {
+			dd, err := NewDeltaDataWithPkType(10, schemapb.DataType_VarChar)
+			s.Require().NoError(err)
+			err = dd.Append(NewInt64PrimaryKey(100), 100)
+			s.Error(err)
+		})
+	})
+}
+
+func TestDeltaData(t *testing.T) {
+	suite.Run(t, new(DeltaDataSuite))
+}


@@ -38,7 +38,7 @@ type Int64PrimaryKeys struct {
 	values []int64
 }
 
-func NewInt64PrimaryKeys(cap int) *Int64PrimaryKeys {
+func NewInt64PrimaryKeys(cap int64) *Int64PrimaryKeys {
 	return &Int64PrimaryKeys{values: make([]int64, 0, cap)}
 }
@@ -97,7 +97,7 @@ type VarcharPrimaryKeys struct {
 	size int64
 }
 
-func NewVarcharPrimaryKeys(cap int) *VarcharPrimaryKeys {
+func NewVarcharPrimaryKeys(cap int64) *VarcharPrimaryKeys {
 	return &VarcharPrimaryKeys{
 		values: make([]string, 0, cap),
 	}