enhance: Replace PrimaryKey slice with PrimaryKeys saving memory (#37127)

Related to #35303

A slice of `storage.PrimaryKey` carries extra interface overhead for each
element, which may cause notable memory usage when the delta row count
is large.

This PR replaces the `PrimaryKey` slice with the `PrimaryKeys` interface,
saving that per-element interface cost.
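
For context, here is a minimal sketch (not part of the changeset) contrasting the old and new construction of delete/delta data, using only identifiers visible in the hunks below; the import path and exact field set are assumptions based on this diff.

package main

import (
	"github.com/milvus-io/milvus/internal/storage"
)

func buildDeltaDataSketch() *storage.DeltaData {
	// Before this PR: every key is boxed behind the PrimaryKey interface, so each
	// element pays an interface header plus a separate heap allocation.
	old := storage.NewDeleteData(
		[]storage.PrimaryKey{storage.NewInt64PrimaryKey(1), storage.NewInt64PrimaryKey(2)},
		[]storage.Timestamp{100, 101},
	)
	_ = old

	// After this PR: a typed PrimaryKeys implementation keeps the raw values in a
	// single slice, avoiding the per-element interface overhead.
	delPks := storage.NewInt64PrimaryKeys(2) // pre-sized with the expected row count
	delPks.AppendRaw(1, 2)
	return &storage.DeltaData{
		DeletePks:        delPks,
		DeleteTimestamps: []storage.Timestamp{100, 101},
		DelRowCount:      2,
	}
}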

---------

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>
congqixia 2024-10-28 10:29:30 +08:00 committed by GitHub
parent 9d16b972ea
commit 7774b7275e
9 changed files with 124 additions and 56 deletions

View File

@@ -1509,8 +1509,18 @@ func (s *DelegatorDataSuite) TestSyncTargetVersion() {
func (s *DelegatorDataSuite) TestLevel0Deletions() {
delegator := s.delegator
partitionID := int64(10)
partitionDeleteData := storage.NewDeleteData([]storage.PrimaryKey{storage.NewInt64PrimaryKey(1)}, []storage.Timestamp{100})
allPartitionDeleteData := storage.NewDeleteData([]storage.PrimaryKey{storage.NewInt64PrimaryKey(2)}, []storage.Timestamp{101})
partitionDelPks := storage.NewInt64PrimaryKeys(1)
partitionDelPks.AppendRaw(1)
allPartitionDelPks := storage.NewInt64PrimaryKeys(1)
allPartitionDelPks.AppendRaw(2)
partitionDeleteData := &storage.DeltaData{
DeletePks: partitionDelPks,
DeleteTimestamps: []storage.Timestamp{100},
}
allPartitionDeleteData := &storage.DeltaData{
DeletePks: allPartitionDelPks,
DeleteTimestamps: []storage.Timestamp{101},
}
schema := segments.GenTestCollectionSchema("test_stop", schemapb.DataType_Int64, true)
collection := segments.NewCollection(1, schema, nil, &querypb.LoadMetaInfo{
@@ -1539,29 +1549,29 @@ func (s *DelegatorDataSuite) TestLevel0Deletions() {
l0Global.LoadDeltaData(context.TODO(), allPartitionDeleteData)
pks, _ := delegator.GetLevel0Deletions(partitionID, pkoracle.NewCandidateKey(l0.ID(), l0.Partition(), segments.SegmentTypeGrowing))
s.True(pks[0].EQ(partitionDeleteData.Pks[0]))
s.True(pks[0].EQ(partitionDeleteData.DeletePks.Get(0)))
pks, _ = delegator.GetLevel0Deletions(partitionID+1, pkoracle.NewCandidateKey(l0.ID(), l0.Partition(), segments.SegmentTypeGrowing))
s.Empty(pks)
delegator.segmentManager.Put(context.TODO(), segments.SegmentTypeSealed, l0Global)
pks, _ = delegator.GetLevel0Deletions(partitionID, pkoracle.NewCandidateKey(l0.ID(), l0.Partition(), segments.SegmentTypeGrowing))
s.ElementsMatch(pks, []storage.PrimaryKey{partitionDeleteData.Pks[0], allPartitionDeleteData.Pks[0]})
s.ElementsMatch(pks, []storage.PrimaryKey{partitionDeleteData.DeletePks.Get(0), allPartitionDeleteData.DeletePks.Get(0)})
bfs := pkoracle.NewBloomFilterSet(3, l0.Partition(), commonpb.SegmentState_Sealed)
bfs.UpdateBloomFilter(allPartitionDeleteData.Pks)
bfs.UpdateBloomFilter([]storage.PrimaryKey{allPartitionDeleteData.DeletePks.Get(0)})
pks, _ = delegator.GetLevel0Deletions(partitionID, bfs)
// bf filtered segment
s.Equal(len(pks), 1)
s.True(pks[0].EQ(allPartitionDeleteData.Pks[0]))
s.True(pks[0].EQ(allPartitionDeleteData.DeletePks.Get(0)))
delegator.segmentManager.Remove(context.TODO(), l0.ID(), querypb.DataScope_All)
pks, _ = delegator.GetLevel0Deletions(partitionID, pkoracle.NewCandidateKey(l0.ID(), l0.Partition(), segments.SegmentTypeGrowing))
s.True(pks[0].EQ(allPartitionDeleteData.Pks[0]))
s.True(pks[0].EQ(allPartitionDeleteData.DeletePks.Get(0)))
pks, _ = delegator.GetLevel0Deletions(partitionID+1, pkoracle.NewCandidateKey(l0.ID(), l0.Partition(), segments.SegmentTypeGrowing))
s.True(pks[0].EQ(allPartitionDeleteData.Pks[0]))
s.True(pks[0].EQ(allPartitionDeleteData.DeletePks.Get(0)))
delegator.segmentManager.Remove(context.TODO(), l0Global.ID(), querypb.DataScope_All)
pks, _ = delegator.GetLevel0Deletions(partitionID+1, pkoracle.NewCandidateKey(l0.ID(), l0.Partition(), segments.SegmentTypeGrowing))

View File

@@ -822,7 +822,7 @@ func (_c *MockSegment_Level_Call) RunAndReturn(run func() datapb.SegmentLevel) *
}
// LoadDeltaData provides a mock function with given fields: ctx, deltaData
func (_m *MockSegment) LoadDeltaData(ctx context.Context, deltaData *storage.DeleteData) error {
func (_m *MockSegment) LoadDeltaData(ctx context.Context, deltaData *storage.DeltaData) error {
ret := _m.Called(ctx, deltaData)
if len(ret) == 0 {
@@ -830,7 +830,7 @@ func (_m *MockSegment) LoadDeltaData(ctx context.Context, deltaData *storage.Del
}
var r0 error
if rf, ok := ret.Get(0).(func(context.Context, *storage.DeleteData) error); ok {
if rf, ok := ret.Get(0).(func(context.Context, *storage.DeltaData) error); ok {
r0 = rf(ctx, deltaData)
} else {
r0 = ret.Error(0)
@@ -846,14 +846,14 @@ type MockSegment_LoadDeltaData_Call struct {
// LoadDeltaData is a helper method to define mock.On call
// - ctx context.Context
// - deltaData *storage.DeleteData
// - deltaData *storage.DeltaData
func (_e *MockSegment_Expecter) LoadDeltaData(ctx interface{}, deltaData interface{}) *MockSegment_LoadDeltaData_Call {
return &MockSegment_LoadDeltaData_Call{Call: _e.mock.On("LoadDeltaData", ctx, deltaData)}
}
func (_c *MockSegment_LoadDeltaData_Call) Run(run func(ctx context.Context, deltaData *storage.DeleteData)) *MockSegment_LoadDeltaData_Call {
func (_c *MockSegment_LoadDeltaData_Call) Run(run func(ctx context.Context, deltaData *storage.DeltaData)) *MockSegment_LoadDeltaData_Call {
_c.Call.Run(func(args mock.Arguments) {
run(args[0].(context.Context), args[1].(*storage.DeleteData))
run(args[0].(context.Context), args[1].(*storage.DeltaData))
})
return _c
}
@@ -863,7 +863,7 @@ func (_c *MockSegment_LoadDeltaData_Call) Return(_a0 error) *MockSegment_LoadDel
return _c
}
func (_c *MockSegment_LoadDeltaData_Call) RunAndReturn(run func(context.Context, *storage.DeleteData) error) *MockSegment_LoadDeltaData_Call {
func (_c *MockSegment_LoadDeltaData_Call) RunAndReturn(run func(context.Context, *storage.DeltaData) error) *MockSegment_LoadDeltaData_Call {
_c.Call.Return(run)
return _c
}

View File

@@ -1018,9 +1018,9 @@ func (s *LocalSegment) AddFieldDataInfo(ctx context.Context, rowCount int64, fie
return nil
}
func (s *LocalSegment) LoadDeltaData(ctx context.Context, deltaData *storage.DeleteData) error {
pks, tss := deltaData.Pks, deltaData.Tss
rowNum := deltaData.RowCount
func (s *LocalSegment) LoadDeltaData(ctx context.Context, deltaData *storage.DeltaData) error {
pks, tss := deltaData.DeletePks, deltaData.DeleteTimestamps
rowNum := deltaData.DelRowCount
if !s.ptrLock.RLockIf(state.IsNotReleased) {
return merr.WrapErrSegmentNotLoaded(s.ID(), "segment released")
@@ -1033,31 +1033,9 @@ func (s *LocalSegment) LoadDeltaData(ctx context.Context, deltaData *storage.Del
zap.Int64("segmentID", s.ID()),
)
pkType := pks[0].Type()
ids := &schemapb.IDs{}
switch pkType {
case schemapb.DataType_Int64:
int64Pks := make([]int64, len(pks))
for index, pk := range pks {
int64Pks[index] = pk.(*storage.Int64PrimaryKey).Value
}
ids.IdField = &schemapb.IDs_IntId{
IntId: &schemapb.LongArray{
Data: int64Pks,
},
}
case schemapb.DataType_VarChar:
varCharPks := make([]string, len(pks))
for index, pk := range pks {
varCharPks[index] = pk.(*storage.VarCharPrimaryKey).Value
}
ids.IdField = &schemapb.IDs_StrId{
StrId: &schemapb.StringArray{
Data: varCharPks,
},
}
default:
return fmt.Errorf("invalid data type of primary keys")
ids, err := storage.ParsePrimaryKeysBatch2IDs(pks)
if err != nil {
return err
}
idsBlob, err := proto.Marshal(ids)

View File

@@ -78,7 +78,7 @@ type Segment interface {
// Modification related
Insert(ctx context.Context, rowIDs []int64, timestamps []typeutil.Timestamp, record *segcorepb.InsertRecord) error
Delete(ctx context.Context, primaryKeys []storage.PrimaryKey, timestamps []typeutil.Timestamp) error
LoadDeltaData(ctx context.Context, deltaData *storage.DeleteData) error
LoadDeltaData(ctx context.Context, deltaData *storage.DeltaData) error
LastDeltaTimestamp() uint64
Release(ctx context.Context, opts ...releaseOption)

View File

@@ -151,12 +151,14 @@ func (s *L0Segment) Delete(ctx context.Context, primaryKeys []storage.PrimaryKey
return merr.WrapErrIoFailedReason("delete not supported for L0 segment")
}
func (s *L0Segment) LoadDeltaData(ctx context.Context, deltaData *storage.DeleteData) error {
func (s *L0Segment) LoadDeltaData(ctx context.Context, deltaData *storage.DeltaData) error {
s.dataGuard.Lock()
defer s.dataGuard.Unlock()
s.pks = append(s.pks, deltaData.Pks...)
s.tss = append(s.tss, deltaData.Tss...)
for i := 0; i < deltaData.DeletePks.Len(); i++ {
s.pks = append(s.pks, deltaData.DeletePks.Get(i))
}
s.tss = append(s.tss, deltaData.DeleteTimestamps...)
return nil
}

View File

@@ -1207,9 +1207,23 @@ func (loader *segmentLoader) LoadDeltaLogs(ctx context.Context, segment Segment,
rowNums := lo.SumBy(blobs, func(blob *storage.Blob) int64 {
return blob.RowNum
})
deltaData := &storage.DeleteData{
Pks: make([]storage.PrimaryKey, 0, rowNums),
Tss: make([]uint64, 0, rowNums),
var deltaData *storage.DeltaData
collection := loader.manager.Collection.Get(segment.Collection())
helper, _ := typeutil.CreateSchemaHelper(collection.Schema())
pkField, _ := helper.GetPrimaryKeyField()
switch pkField.DataType {
case schemapb.DataType_Int64:
deltaData = &storage.DeltaData{
DeletePks: storage.NewInt64PrimaryKeys(int(rowNums)),
DeleteTimestamps: make([]uint64, 0, rowNums),
}
case schemapb.DataType_VarChar:
deltaData = &storage.DeltaData{
DeletePks: storage.NewVarcharPrimaryKeys(int(rowNums)),
DeleteTimestamps: make([]uint64, 0, rowNums),
}
}
reader, err := storage.CreateDeltalogReader(blobs)
@@ -1226,7 +1240,9 @@ func (loader *segmentLoader) LoadDeltaLogs(ctx context.Context, segment Segment,
return err
}
dl := reader.Value()
deltaData.Append(dl.Pk, dl.Ts)
deltaData.DeletePks.MustAppend(dl.Pk)
deltaData.DeleteTimestamps = append(deltaData.DeleteTimestamps, dl.Ts)
deltaData.DelRowCount++
}
err = segment.LoadDeltaData(ctx, deltaData)
@@ -1234,7 +1250,7 @@ func (loader *segmentLoader) LoadDeltaLogs(ctx context.Context, segment Segment,
return err
}
log.Info("load delta logs done", zap.Int64("deleteCount", deltaData.RowCount))
log.Info("load delta logs done", zap.Int64("deleteCount", deltaData.DelRowCount))
return nil
}

View File

@@ -34,14 +34,14 @@ var parserPool = &fastjson.ParserPool{}
// DeltaData stores delta data
// currently only delete tuples are stored
type DeltaData struct {
pkType schemapb.DataType
PkType schemapb.DataType
// delete tuples
delPks PrimaryKeys
delTss []Timestamp
DeletePks PrimaryKeys
DeleteTimestamps []Timestamp
// stats
delRowCount int64
memSize int64
DelRowCount int64
MemSize int64
}
type DeleteLog struct {

View File

@@ -23,6 +23,7 @@ import (
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/util/merr"
)
type PrimaryKey interface {
@@ -350,6 +351,33 @@ func ParseIDs2PrimaryKeys(ids *schemapb.IDs) []PrimaryKey {
return ret
}
func ParsePrimaryKeysBatch2IDs(pks PrimaryKeys) (*schemapb.IDs, error) {
ret := &schemapb.IDs{}
if pks.Len() == 0 {
return ret, nil
}
switch pks.Type() {
case schemapb.DataType_Int64:
int64Pks := pks.(*Int64PrimaryKeys)
ret.IdField = &schemapb.IDs_IntId{
IntId: &schemapb.LongArray{
Data: int64Pks.values,
},
}
case schemapb.DataType_VarChar:
varcharPks := pks.(*VarcharPrimaryKeys)
ret.IdField = &schemapb.IDs_StrId{
StrId: &schemapb.StringArray{
Data: varcharPks.values,
},
}
default:
return nil, merr.WrapErrServiceInternal("parsing unsupported pk type", pks.Type().String())
}
return ret, nil
}
func ParsePrimaryKeys2IDs(pks []PrimaryKey) *schemapb.IDs {
ret := &schemapb.IDs{}
if len(pks) == 0 {

View File

@@ -177,3 +177,37 @@ func TestParsePrimaryKeysAndIDs(t *testing.T) {
assert.ElementsMatch(t, c.pks, testPks)
}
}
type badPks struct {
PrimaryKeys
}
func (pks *badPks) Type() schemapb.DataType {
return schemapb.DataType_None
}
func TestParsePrimaryKeysBatch2IDs(t *testing.T) {
t.Run("success_cases", func(t *testing.T) {
intPks := NewInt64PrimaryKeys(3)
intPks.AppendRaw(1, 2, 3)
ids, err := ParsePrimaryKeysBatch2IDs(intPks)
assert.NoError(t, err)
assert.ElementsMatch(t, []int64{1, 2, 3}, ids.GetIntId().GetData())
strPks := NewVarcharPrimaryKeys(3)
strPks.AppendRaw("1", "2", "3")
ids, err = ParsePrimaryKeysBatch2IDs(strPks)
assert.NoError(t, err)
assert.ElementsMatch(t, []string{"1", "2", "3"}, ids.GetStrId().GetData())
})
t.Run("unsupport_type", func(t *testing.T) {
intPks := NewInt64PrimaryKeys(3)
intPks.AppendRaw(1, 2, 3)
_, err := ParsePrimaryKeysBatch2IDs(&badPks{PrimaryKeys: intPks})
assert.Error(t, err)
})
}