fix: mistaken deletions may occur during GC channel checkpoints (#35707)

issue: #35706

Signed-off-by: jaime <yun.zhang@zilliz.com>
pull/35765/head
jaime 2024-08-28 10:11:05 +08:00 committed by GitHub
parent 7de210ff28
commit b7ea1defd3
GPG Key ID: B5690EEEBB952194
3 changed files with 40 additions and 12 deletions

View File

@@ -29,6 +29,7 @@ import (
     "go.uber.org/zap"
     "github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
+    "github.com/milvus-io/milvus/internal/datacoord/broker"
     "github.com/milvus-io/milvus/internal/metastore/kv/binlog"
     "github.com/milvus-io/milvus/internal/metastore/model"
     "github.com/milvus-io/milvus/internal/proto/datapb"
@@ -53,6 +54,7 @@ type GcOption struct {
     dropTolerance    time.Duration // dropped segment related key tolerance time
     scanInterval     time.Duration // interval for scan residue for interupted log wrttien
+    broker           broker.Broker
     removeObjectPool *conc.Pool[struct{}]
 }
@@ -489,7 +491,7 @@ func (gc *garbageCollector) recycleChannelCPMeta(ctx context.Context) {
     collectionID2GcStatus := make(map[int64]bool)
     skippedCnt := 0
-    log.Info("start to GC channel cp", zap.Int("vchannelCnt", len(channelCPs)))
+    log.Info("start to GC channel cp", zap.Int("vchannelCPCnt", len(channelCPs)))
     for vChannel := range channelCPs {
         collectionID := funcutil.GetCollectionIDFromVChannel(vChannel)
@@ -500,8 +502,20 @@ func (gc *garbageCollector) recycleChannelCPMeta(ctx context.Context) {
             continue
         }
-        if _, ok := collectionID2GcStatus[collectionID]; !ok {
-            collectionID2GcStatus[collectionID] = gc.meta.catalog.GcConfirm(ctx, collectionID, -1)
+        _, ok := collectionID2GcStatus[collectionID]
+        if !ok {
+            ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
+            defer cancel()
+            has, err := gc.option.broker.HasCollection(ctx, collectionID)
+            if err == nil && !has {
+                collectionID2GcStatus[collectionID] = gc.meta.catalog.GcConfirm(ctx, collectionID, -1)
+            } else {
+                // skip checkpoints GC of this cycle if describe collection fails or the collection state is available.
+                log.Debug("skip channel cp GC, the collection state is available",
+                    zap.Int64("collectionID", collectionID),
+                    zap.Bool("dropped", has), zap.Error(err))
+                collectionID2GcStatus[collectionID] = false
+            }
         }
         // Skip to GC if all segments meta of the corresponding collection are not removed
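The hunk above is the heart of the fix: before recycling a vchannel checkpoint, the collector now asks the broker whether the collection still exists, and only a successful check that reports the collection as gone falls through to GcConfirm and checkpoint removal; any error or a live collection keeps the checkpoint for a later cycle. Below is a minimal, self-contained sketch of that guard pattern. The CollectionChecker interface, recycleCheckpoints function, and fakeChecker type are illustrative stand-ins I introduced for the example, not Milvus APIs.

package main

import (
    "context"
    "errors"
    "fmt"
    "time"
)

// CollectionChecker is a hypothetical stand-in for the broker's existence check.
type CollectionChecker interface {
    HasCollection(ctx context.Context, collectionID int64) (bool, error)
}

// recycleCheckpoints drops checkpoints only for collections that are confirmed gone.
// Any error, or a collection that still exists, keeps the checkpoint, which is the
// conservative behavior the patch above is after.
func recycleCheckpoints(checker CollectionChecker, checkpoints map[string]int64) {
    decided := make(map[int64]bool) // collectionID -> safe to GC?
    for vchannel, collectionID := range checkpoints {
        safe, ok := decided[collectionID]
        if !ok {
            ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
            has, err := checker.HasCollection(ctx, collectionID)
            cancel()
            // Only GC when the check succeeded AND the collection is gone.
            safe = err == nil && !has
            decided[collectionID] = safe
        }
        if !safe {
            continue // keep the checkpoint; deleting it here is the mistaken deletion
        }
        delete(checkpoints, vchannel)
        fmt.Printf("dropped checkpoint for %s\n", vchannel)
    }
}

// fakeChecker returns a fixed answer per collection and errors for unknown IDs.
type fakeChecker struct{ existing map[int64]bool }

func (f fakeChecker) HasCollection(_ context.Context, id int64) (bool, error) {
    has, ok := f.existing[id]
    if !ok {
        return false, errors.New("describe collection failed")
    }
    return has, nil
}

func main() {
    cps := map[string]int64{
        "dm_0_123v0": 123, // collection still exists -> checkpoint must survive
        "dm_0_124v0": 124, // collection dropped -> checkpoint can be recycled
        "dm_0_125v0": 125, // check errors out -> keep it, retry next GC cycle
    }
    recycleCheckpoints(fakeChecker{existing: map[int64]bool{123: true, 124: false}}, cps)
    fmt.Println("remaining:", cps)
}

One design note on the sketch: cancel() is called right after each check instead of being deferred inside the loop, so per-iteration contexts are released immediately rather than at function return; the decision logic itself mirrors the patched recycleChannelCPMeta.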

View File

@@ -39,6 +39,7 @@ import (
     "github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
     "github.com/milvus-io/milvus-proto/go-api/v2/msgpb"
     "github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
+    broker2 "github.com/milvus-io/milvus/internal/datacoord/broker"
     kvmocks "github.com/milvus-io/milvus/internal/kv/mocks"
     "github.com/milvus-io/milvus/internal/metastore"
     "github.com/milvus-io/milvus/internal/metastore/kv/datacoord"
@@ -1486,22 +1487,28 @@ func TestGarbageCollector_recycleChannelMeta(t *testing.T) {
     m.channelCPs.checkpoints = map[string]*msgpb.MsgPosition{
         "cluster-id-rootcoord-dm_0_123v0": nil,
+        "cluster-id-rootcoord-dm_1_123v0": nil,
         "cluster-id-rootcoord-dm_0_124v0": nil,
     }
-    gc := newGarbageCollector(m, newMockHandlerWithMeta(m), GcOption{})
+    broker := broker2.NewMockBroker(t)
+    broker.EXPECT().HasCollection(mock.Anything, mock.Anything).Return(true, nil).Twice()
+    gc := newGarbageCollector(m, newMockHandlerWithMeta(m), GcOption{broker: broker})
     t.Run("list channel cp fail", func(t *testing.T) {
         catalog.EXPECT().ListChannelCheckpoint(mock.Anything).Return(nil, errors.New("mock error")).Once()
         gc.recycleChannelCPMeta(context.TODO())
-        assert.Equal(t, 2, len(m.channelCPs.checkpoints))
+        assert.Equal(t, 3, len(m.channelCPs.checkpoints))
     })
+    catalog.EXPECT().ListChannelCheckpoint(mock.Anything).Unset()
     catalog.EXPECT().ListChannelCheckpoint(mock.Anything).Return(map[string]*msgpb.MsgPosition{
         "cluster-id-rootcoord-dm_0_123v0": nil,
+        "cluster-id-rootcoord-dm_1_123v0": nil,
         "cluster-id-rootcoord-dm_0_invalidedCollectionIDv0": nil,
         "cluster-id-rootcoord-dm_0_124v0": nil,
-    }, nil).Twice()
+    }, nil).Times(3)
     catalog.EXPECT().GcConfirm(mock.Anything, mock.Anything, mock.Anything).
         RunAndReturn(func(ctx context.Context, collectionID int64, i2 int64) bool {
@@ -1509,16 +1516,22 @@ func TestGarbageCollector_recycleChannelMeta(t *testing.T) {
                 return true
             }
             return false
-        })
+        }).Maybe()
-    t.Run("drop channel cp fail", func(t *testing.T) {
-        catalog.EXPECT().DropChannelCheckpoint(mock.Anything, mock.Anything).Return(errors.New("mock error")).Once()
+    t.Run("skip drop channel due to collection is available", func(t *testing.T) {
         gc.recycleChannelCPMeta(context.TODO())
-        assert.Equal(t, 2, len(m.channelCPs.checkpoints))
+        assert.Equal(t, 3, len(m.channelCPs.checkpoints))
     })
-    t.Run("gc ok", func(t *testing.T) {
-        catalog.EXPECT().DropChannelCheckpoint(mock.Anything, mock.Anything).Return(nil).Once()
+    broker.EXPECT().HasCollection(mock.Anything, mock.Anything).Return(false, nil).Times(4)
+    t.Run("drop channel cp fail", func(t *testing.T) {
+        catalog.EXPECT().DropChannelCheckpoint(mock.Anything, mock.Anything).Return(errors.New("mock error")).Twice()
+        gc.recycleChannelCPMeta(context.TODO())
+        assert.Equal(t, 3, len(m.channelCPs.checkpoints))
+    })
+    t.Run("channel cp gc ok", func(t *testing.T) {
+        catalog.EXPECT().DropChannelCheckpoint(mock.Anything, mock.Anything).Return(nil).Twice()
         gc.recycleChannelCPMeta(context.TODO())
         assert.Equal(t, 1, len(m.channelCPs.checkpoints))
     })
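The mock broker above drives a three-row decision table: HasCollection returning (true, nil) keeps the checkpoints, (false, nil) lets GC proceed, and an error keeps them for a later cycle. For readers without the Milvus test scaffolding, a throwaway table-driven test of that decision is sketched below; shouldRecycle is a hypothetical helper that mirrors the condition added in the patch, not part of the project's API.

package gcguard

import (
    "errors"
    "testing"
)

// shouldRecycle mirrors the guard added in the patch: recycle a collection's
// channel checkpoints only when the existence check succeeded and the
// collection is confirmed gone.
func shouldRecycle(has bool, err error) bool {
    return err == nil && !has
}

func TestShouldRecycle(t *testing.T) {
    cases := []struct {
        name string
        has  bool
        err  error
        want bool
    }{
        {"collection still exists", true, nil, false},
        {"collection dropped", false, nil, true},
        {"describe collection failed", false, errors.New("rpc error"), false},
    }
    for _, tc := range cases {
        t.Run(tc.name, func(t *testing.T) {
            if got := shouldRecycle(tc.has, tc.err); got != tc.want {
                t.Fatalf("shouldRecycle(%v, %v) = %v, want %v", tc.has, tc.err, got, tc.want)
            }
        })
    }
}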

View File

@@ -533,6 +533,7 @@ func (s *Server) newChunkManagerFactory() (storage.ChunkManager, error) {
 func (s *Server) initGarbageCollection(cli storage.ChunkManager) {
     s.garbageCollector = newGarbageCollector(s.meta, s.handler, GcOption{
         cli:           cli,
+        broker:        s.broker,
         enabled:       Params.DataCoordCfg.EnableGarbageCollection.GetAsBool(),
         checkInterval: Params.DataCoordCfg.GCInterval.GetAsDuration(time.Second),
         scanInterval:  Params.DataCoordCfg.GCScanIntervalInHour.GetAsDuration(time.Hour),
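This last hunk simply injects the broker into the GC options next to the timing knobs, which is what makes the mocked-broker tests above possible. A generic sketch of that wiring style is below; gcOptions, newCollector, and alwaysThere are illustrative names I chose, not Milvus's GcOption or server code, and the nil-checker validation is an added assumption rather than something the patch does.

package main

import (
    "context"
    "fmt"
    "time"
)

// CollectionChecker is the same hypothetical existence-check interface used in
// the earlier sketch.
type CollectionChecker interface {
    HasCollection(ctx context.Context, collectionID int64) (bool, error)
}

// gcOptions mirrors the shape of the wiring above: the checker sits alongside
// the interval settings, so production code passes the real broker and tests
// pass a mock.
type gcOptions struct {
    checker       CollectionChecker
    enabled       bool
    checkInterval time.Duration
    scanInterval  time.Duration
}

type collector struct{ opt gcOptions }

// newCollector fails fast if GC is enabled without an existence checker, since
// checkpoint GC without that double-check is what allowed the mistaken deletions.
func newCollector(opt gcOptions) (*collector, error) {
    if opt.enabled && opt.checker == nil {
        return nil, fmt.Errorf("gc enabled but no collection checker configured")
    }
    return &collector{opt: opt}, nil
}

// alwaysThere is a trivial checker used only to exercise the constructor.
type alwaysThere struct{}

func (alwaysThere) HasCollection(context.Context, int64) (bool, error) { return true, nil }

func main() {
    c, err := newCollector(gcOptions{
        checker:       alwaysThere{},
        enabled:       true,
        checkInterval: time.Hour,
        scanInterval:  24 * time.Hour,
    })
    fmt.Println(c != nil, err)
}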