skip handoff event on unloaded partition (#20306)

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
wei liu 2022-11-07 17:05:06 +08:00 committed by GitHub
parent 14c1182418
commit d7ebb25701
6 changed files with 177 additions and 49 deletions
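
Summary: HandoffObserver now takes a meta.Broker so that, before accepting a handoff event, it can verify through the new checkLoadStatus that the segment's partition is actually loaded; events for unloaded or dropped partitions are skipped and later cleaned up. Handoff submit orders are re-keyed from partition ID to collection ID, and Unregister now also drops the pending events and submit orders of the unregistered collections.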

--- file 1 of 6 ---

@@ -132,6 +132,7 @@ func (suite *JobSuite) SetupTest() {
 		suite.meta,
 		suite.dist,
 		suite.targetMgr,
+		suite.broker,
 	)
 	suite.scheduler = NewScheduler()

--- file 2 of 6 ---

@@ -18,6 +18,7 @@ package observers

 import (
 	"context"
+	"strings"
 	"sync"
 	"time"
@@ -64,33 +65,40 @@ type HandoffObserver struct {
 	meta     *meta.Meta
 	dist     *meta.DistributionManager
 	target   *meta.TargetManager
+	broker   meta.Broker
 	revision int64

 	collectionStatus map[int64]CollectionHandoffStatus
 	handoffEventLock sync.RWMutex
 	handoffEvents    map[int64]*HandoffEvent
-	// partition id -> queue
+	// collection id -> queue
 	handoffSubmitOrders map[int64]queue
+	// collectionID -> loaded partitionIDs, only for the load-collection case
+	loadedPartitions map[int64]typeutil.UniqueSet

 	stopOnce sync.Once
 }

-func NewHandoffObserver(store meta.Store, meta *meta.Meta, dist *meta.DistributionManager, target *meta.TargetManager) *HandoffObserver {
+func NewHandoffObserver(store meta.Store, meta *meta.Meta, dist *meta.DistributionManager, target *meta.TargetManager, broker meta.Broker) *HandoffObserver {
 	return &HandoffObserver{
 		store:  store,
 		c:      make(chan struct{}),
 		meta:   meta,
 		dist:   dist,
 		target: target,
+		broker: broker,

 		collectionStatus:    map[int64]CollectionHandoffStatus{},
 		handoffEvents:       map[int64]*HandoffEvent{},
 		handoffSubmitOrders: map[int64]queue{},
+		loadedPartitions:    map[int64]typeutil.UniqueSet{},
 	}
 }

 func (ob *HandoffObserver) Register(collectionIDs ...int64) {
 	ob.handoffEventLock.Lock()
 	defer ob.handoffEventLock.Unlock()
+	log.Info("Register handoff for collection",
+		zap.Int64s("collectionIDs", collectionIDs))

 	for _, collectionID := range collectionIDs {
 		ob.collectionStatus[collectionID] = CollectionHandoffStatusRegistered
@@ -100,9 +108,19 @@ func (ob *HandoffObserver) Register(collectionIDs ...int64) {
 func (ob *HandoffObserver) Unregister(ctx context.Context, collectionIDs ...int64) {
 	ob.handoffEventLock.Lock()
 	defer ob.handoffEventLock.Unlock()
+	log.Info("Unregister handoff for collection",
+		zap.Int64s("collectionIDs", collectionIDs))

 	for _, collectionID := range collectionIDs {
 		delete(ob.collectionStatus, collectionID)
+		delete(ob.handoffSubmitOrders, collectionID)
+	}
+
+	collectionSet := typeutil.NewUniqueSet(collectionIDs...)
+	for segmentID, event := range ob.handoffEvents {
+		if collectionSet.Contain(event.Segment.GetCollectionID()) {
+			delete(ob.handoffEvents, segmentID)
+		}
 	}
 }
@@ -115,6 +133,13 @@ func (ob *HandoffObserver) StartHandoff(collectionIDs ...int64) {
 	}
 }

+func (ob *HandoffObserver) GetEventNum() int {
+	ob.handoffEventLock.Lock()
+	defer ob.handoffEventLock.Unlock()
+
+	return len(ob.handoffEvents)
+}
+
 func (ob *HandoffObserver) consumeOutdatedHandoffEvent(ctx context.Context) error {
 	_, handoffReqValues, revision, err := ob.store.LoadHandoffWithRevision()
 	if err != nil {
@@ -231,18 +256,19 @@ func (ob *HandoffObserver) tryHandoff(ctx context.Context, segment *querypb.SegmentInfo) {
 	)
 	log.Info("try handoff segment...")

-	status, ok := ob.collectionStatus[segment.GetCollectionID()]
+	status, collectionRegistered := ob.collectionStatus[segment.GetCollectionID()]
 	if Params.QueryCoordCfg.AutoHandoff &&
-		ok &&
+		collectionRegistered &&
+		ob.checkLoadStatus(segment) &&
 		(segment.GetIsFake() || ob.meta.CollectionManager.ContainAnyIndex(segment.GetCollectionID(), indexIDs...)) {
 		event := ob.handoffEvents[segment.SegmentID]
 		if event == nil {
 			// record submit order
-			_, ok := ob.handoffSubmitOrders[segment.GetPartitionID()]
+			_, ok := ob.handoffSubmitOrders[segment.GetCollectionID()]
 			if !ok {
-				ob.handoffSubmitOrders[segment.GetPartitionID()] = make([]int64, 0)
+				ob.handoffSubmitOrders[segment.GetCollectionID()] = make([]int64, 0)
 			}
-			ob.handoffSubmitOrders[segment.GetPartitionID()] = append(ob.handoffSubmitOrders[segment.GetPartitionID()], segment.GetSegmentID())
+			ob.handoffSubmitOrders[segment.GetCollectionID()] = append(ob.handoffSubmitOrders[segment.GetCollectionID()], segment.GetSegmentID())
 		}

 		if status == CollectionHandoffStatusRegistered {
@@ -272,6 +298,48 @@ func (ob *HandoffObserver) tryHandoff(ctx context.Context, segment *querypb.SegmentInfo) {
 	}
 }

+func (ob *HandoffObserver) checkLoadStatus(segment *querypb.SegmentInfo) bool {
+	if ob.meta.GetCollection(segment.GetCollectionID()) != nil {
+		// if the collection is loaded, check whether the partition has been dropped
+		if ob.loadedPartitions[segment.GetCollectionID()] == nil {
+			ob.loadedPartitions[segment.GetCollectionID()] = typeutil.NewUniqueSet()
+		}
+
+		// refresh the loaded partitions when an unknown partitionID shows up
+		if !ob.loadedPartitions[segment.GetCollectionID()].Contain(segment.GetPartitionID()) {
+			ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
+			defer cancel()
+			err := retry.Do(ctx, func() error {
+				partitionIDs, err := ob.broker.GetPartitions(ctx, segment.GetCollectionID())
+				if err == nil {
+					ob.loadedPartitions[segment.GetCollectionID()].Insert(partitionIDs...)
+					return nil
+				}
+				return err
+			}, retry.Attempts(5))
+
+			if err != nil {
+				// the collection has been dropped or released
+				if strings.Contains(err.Error(), "CollectionNotExists") ||
+					ob.meta.GetCollection(segment.GetCollectionID()) == nil {
+					return false
+				}
+				// the collection is not released, but the partition list could not be fetched to check the handoff
+				log.Warn("handoff check load status failed due to get partitions failed",
+					zap.Int64("collectionID", segment.GetCollectionID()),
+					zap.Int64("partitionID", segment.GetPartitionID()),
+					zap.String("channel", segment.GetDmChannel()),
+					zap.Int64("segmentID", segment.GetSegmentID()))
+				return false
+			}
+		}
+		return ob.loadedPartitions[segment.GetCollectionID()].Contain(segment.GetPartitionID())
+	}
+
+	return ob.meta.GetPartition(segment.GetPartitionID()) != nil
+}
+
 func (ob *HandoffObserver) handoff(segment *querypb.SegmentInfo) {
 	targets := ob.target.GetSegmentsByCollection(segment.GetCollectionID(), segment.GetPartitionID())
 	// when a handoff event loads a segment, it should remove all recursive handoff compact from
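
Note on the new gate: checkLoadStatus distinguishes the load-collection case, where the partition list is lazily fetched from the broker (with retry) and cached in loadedPartitions, from the load-partition case, where a metadata lookup suffices. A condensed restatement of that decision, using the names from the hunk above (a sketch, not the committed code):

	// collection loaded as a whole: consult the lazily refreshed partition cache
	if ob.meta.GetCollection(segment.GetCollectionID()) != nil {
		return ob.loadedPartitions[segment.GetCollectionID()].Contain(segment.GetPartitionID())
	}
	// partition-level load: the partition must still exist in the load metadata
	return ob.meta.GetPartition(segment.GetPartitionID()) != nil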
@@ -378,7 +446,7 @@ func (ob *HandoffObserver) tryClean(ctx context.Context) {
 	ob.handoffEventLock.Lock()
 	defer ob.handoffEventLock.Unlock()

-	for partitionID, partitionSubmitOrder := range ob.handoffSubmitOrders {
+	for collectionID, partitionSubmitOrder := range ob.handoffSubmitOrders {
 		pos := 0
 		for _, segmentID := range partitionSubmitOrder {
 			event, ok := ob.handoffEvents[segmentID]
@@ -403,7 +471,7 @@ func (ob *HandoffObserver) tryClean(ctx context.Context) {
 				break
 			}
 		}
-		ob.handoffSubmitOrders[partitionID] = partitionSubmitOrder[pos:]
+		ob.handoffSubmitOrders[collectionID] = partitionSubmitOrder[pos:]
 	}
 }
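
With handoffSubmitOrders keyed by collection ID, tryClean trims each collection's queue in submit order, and Unregister (above) can drop a collection's whole queue with a single delete; the old partition-ID keying gave Unregister, which only receives collection IDs, no direct way to find the queues to remove.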

--- file 3 of 6 ---

@@ -24,9 +24,6 @@ import (
 	"time"

 	"github.com/golang/protobuf/proto"
-	"github.com/stretchr/testify/suite"
-	clientv3 "go.etcd.io/etcd/client/v3"
-
 	"github.com/milvus-io/milvus-proto/go-api/commonpb"
 	etcdkv "github.com/milvus-io/milvus/internal/kv/etcd"
 	"github.com/milvus-io/milvus/internal/log"
@@ -37,6 +34,9 @@ import (
 	"github.com/milvus-io/milvus/internal/util"
 	"github.com/milvus-io/milvus/internal/util/etcd"
 	"github.com/milvus-io/milvus/internal/util/typeutil"
+	"github.com/stretchr/testify/mock"
+	"github.com/stretchr/testify/suite"
+	clientv3 "go.etcd.io/etcd/client/v3"
 )

 const (
@@ -65,6 +65,7 @@ type HandoffObserverTestSuit struct {
 	meta   *meta.Meta
 	dist   *meta.DistributionManager
 	target *meta.TargetManager
+	broker *meta.MockBroker

 	// Test Object
 	observer *HandoffObserver
@@ -117,9 +118,10 @@ func (suite *HandoffObserverTestSuit) SetupTest() {
 	suite.meta = meta.NewMeta(suite.idAllocator, suite.store)
 	suite.dist = meta.NewDistributionManager()
 	suite.target = meta.NewTargetManager()
+	suite.broker = meta.NewMockBroker(suite.T())

 	// Test Object
-	suite.observer = NewHandoffObserver(suite.store, suite.meta, suite.dist, suite.target)
+	suite.observer = NewHandoffObserver(suite.store, suite.meta, suite.dist, suite.target, suite.broker)
 	suite.observer.Register(suite.collection)
 	suite.observer.StartHandoff(suite.collection)
 	suite.load()
@@ -143,6 +145,7 @@ func (suite *HandoffObserverTestSuit) TestFlushingHandoff() {
 	Params.QueryCoordCfg.CheckHandoffInterval = 1 * time.Second
 	err := suite.observer.Start(context.Background())
 	suite.NoError(err)
+	suite.broker.EXPECT().GetPartitions(mock.Anything, mock.Anything).Return([]int64{suite.partition}, nil)

 	flushingSegment := &querypb.SegmentInfo{
 		SegmentID: 3,
@@ -197,7 +200,7 @@ func (suite *HandoffObserverTestSuit) TestCompactHandoff() {
 	Params.QueryCoordCfg.CheckHandoffInterval = 1 * time.Second
 	err := suite.observer.Start(context.Background())
 	suite.NoError(err)
-
+	suite.broker.EXPECT().GetPartitions(mock.Anything, mock.Anything).Return([]int64{suite.partition}, nil)
 	compactSegment := &querypb.SegmentInfo{
 		SegmentID:    3,
 		CollectionID: suite.collection,
@@ -249,6 +252,7 @@ func (suite *HandoffObserverTestSuit) TestRecursiveHandoff() {
 		Segments:        map[int64]*querypb.SegmentDist{1: {NodeID: 1, Version: 0}, 2: {NodeID: 2, Version: 0}},
 		GrowingSegments: typeutil.NewUniqueSet(3),
 	})
+	suite.broker.EXPECT().GetPartitions(mock.Anything, mock.Anything).Return([]int64{suite.partition}, nil)

 	Params.QueryCoordCfg.CheckHandoffInterval = 1 * time.Second
 	err := suite.observer.Start(context.Background())
@@ -464,57 +468,107 @@ func (suite *HandoffObserverTestSuit) load() {
 	suite.target.AddSegment(suite.sealedSegments...)
 }

-func (suite *HandoffObserverTestSuit) TestHandoffOnUnLoadedPartition() {
-	const (
-		collectionID        = 111
-		loadedPartitionID   = 1
-		unloadedPartitionID = 2
-	)
-	err := suite.meta.PutPartition(&meta.Partition{
-		PartitionLoadInfo: &querypb.PartitionLoadInfo{
-			CollectionID:  collectionID,
-			PartitionID:   loadedPartitionID,
-			ReplicaNumber: suite.replicaNumber,
-			Status:        querypb.LoadStatus_Loaded,
-		},
-	})
-	suite.NoError(err)
-
-	// init leader view
-	suite.dist.LeaderViewManager.Update(2, &meta.LeaderView{
-		ID:           1,
-		CollectionID: collectionID,
-		Channel:      suite.channel.ChannelName,
-		Segments:     map[int64]*querypb.SegmentDist{1: {NodeID: 1, Version: 0}, 2: {NodeID: 2, Version: 0}},
-	})
-
-	Params.QueryCoordCfg.CheckHandoffInterval = 1 * time.Second
-	err = suite.observer.Start(context.Background())
-	suite.NoError(err)
-
-	compactSegment := &querypb.SegmentInfo{
-		SegmentID:           3,
-		CollectionID:        collectionID,
-		PartitionID:         unloadedPartitionID,
-		SegmentState:        commonpb.SegmentState_Sealed,
-		CompactionFrom:      []int64{2},
-		CreatedByCompaction: true,
-		IndexInfos:          []*querypb.FieldIndexInfo{{IndexID: defaultIndexID}},
-	}
-	suite.produceHandOffEvent(compactSegment)
-
-	suite.Eventually(func() bool {
-		return suite.target.ContainSegment(1) && suite.target.ContainSegment(2)
-	}, 3*time.Second, 1*time.Second)
-	suite.Eventually(func() bool {
-		return !suite.target.ContainSegment(3)
-	}, 3*time.Second, 1*time.Second)
-	suite.Eventually(func() bool {
-		key := fmt.Sprintf("%s/%d/%d/%d", util.HandoffSegmentPrefix, suite.collection, suite.partition, 3)
-		value, err := suite.kv.Load(key)
-		return len(value) == 0 && err != nil
-	}, 3*time.Second, 1*time.Second)
-}
+func (suite *HandoffObserverTestSuit) TestHandoffOnUnloadedPartition() {
+	Params.QueryCoordCfg.CheckHandoffInterval = 1 * time.Second
+	err := suite.observer.Start(context.Background())
+	suite.NoError(err)
+
+	suite.dist.LeaderViewManager.Update(1, &meta.LeaderView{
+		ID:           1,
+		CollectionID: suite.collection,
+		Channel:      suite.channel.ChannelName,
+		Segments:     map[int64]*querypb.SegmentDist{1: {NodeID: 1, Version: 0}, 2: {NodeID: 2, Version: 0}},
+	})
+	suite.dist.LeaderViewManager.Update(1, &meta.LeaderView{
+		ID:           2,
+		CollectionID: suite.collection,
+		Channel:      suite.channel.ChannelName,
+		Segments:     map[int64]*querypb.SegmentDist{1: {NodeID: 1, Version: 0}, 2: {NodeID: 2, Version: 0}},
+	})
+
+	suite.broker.EXPECT().GetPartitions(mock.Anything, mock.Anything).Return([]int64{2222}, nil)
+	suite.observer.Register(suite.collection)
+	suite.observer.StartHandoff(suite.collection)
+	defer suite.observer.Unregister(context.TODO(), suite.collection)
+
+	compactSegment1 := &querypb.SegmentInfo{
+		SegmentID:           111,
+		CollectionID:        suite.collection,
+		PartitionID:         1111,
+		SegmentState:        commonpb.SegmentState_Sealed,
+		CompactionFrom:      []int64{1},
+		CreatedByCompaction: true,
+		IndexInfos:          []*querypb.FieldIndexInfo{{IndexID: defaultIndexID}},
+	}
+	compactSegment2 := &querypb.SegmentInfo{
+		SegmentID:           222,
+		CollectionID:        suite.collection,
+		PartitionID:         2222,
+		SegmentState:        commonpb.SegmentState_Sealed,
+		CompactionFrom:      []int64{1},
+		CreatedByCompaction: true,
+		IndexInfos:          []*querypb.FieldIndexInfo{{IndexID: defaultIndexID}},
+	}
+	suite.produceHandOffEvent(compactSegment1)
+	suite.produceHandOffEvent(compactSegment2)
+
+	suite.Eventually(func() bool {
+		return !suite.target.ContainSegment(111) && suite.target.ContainSegment(222)
+	}, 3*time.Second, 1*time.Second)
+}
+
+func (suite *HandoffObserverTestSuit) TestUnRegisterHandoff() {
+	Params.QueryCoordCfg.CheckHandoffInterval = 1 * time.Second
+	err := suite.observer.Start(context.Background())
+	suite.NoError(err)
+
+	suite.dist.LeaderViewManager.Update(1, &meta.LeaderView{
+		ID:           1,
+		CollectionID: suite.collection,
+		Channel:      suite.channel.ChannelName,
+		Segments:     map[int64]*querypb.SegmentDist{1: {NodeID: 1, Version: 0}, 2: {NodeID: 2, Version: 0}},
+	})
+	suite.dist.LeaderViewManager.Update(1, &meta.LeaderView{
+		ID:           2,
+		CollectionID: suite.collection,
+		Channel:      suite.channel.ChannelName,
+		Segments:     map[int64]*querypb.SegmentDist{1: {NodeID: 1, Version: 0}, 2: {NodeID: 2, Version: 0}},
+	})
+
+	suite.broker.EXPECT().GetPartitions(mock.Anything, mock.Anything).Return([]int64{1111, 2222}, nil)
+	suite.observer.Register(suite.collection)
+	compactSegment1 := &querypb.SegmentInfo{
+		SegmentID:           111,
+		CollectionID:        suite.collection,
+		PartitionID:         1111,
+		SegmentState:        commonpb.SegmentState_Sealed,
+		CompactionFrom:      []int64{1},
+		CreatedByCompaction: true,
+		IndexInfos:          []*querypb.FieldIndexInfo{{IndexID: defaultIndexID}},
+	}
+	suite.produceHandOffEvent(compactSegment1)
+	suite.Eventually(func() bool {
+		return suite.observer.GetEventNum() == 1
+	}, 3*time.Second, 1*time.Second)
+
+	suite.observer.Unregister(context.TODO(), suite.collection)
+	suite.observer.Register(suite.collection)
+	defer suite.observer.Unregister(context.TODO(), suite.collection)
+	compactSegment2 := &querypb.SegmentInfo{
+		SegmentID:           222,
+		CollectionID:        suite.collection,
+		PartitionID:         2222,
+		SegmentState:        commonpb.SegmentState_Sealed,
+		CompactionFrom:      []int64{2},
+		CreatedByCompaction: true,
+		IndexInfos:          []*querypb.FieldIndexInfo{{IndexID: defaultIndexID}},
+	}
+	suite.produceHandOffEvent(compactSegment2)
+
+	suite.Eventually(func() bool {
+		return suite.observer.GetEventNum() == 1
+	}, 3*time.Second, 1*time.Second)
+}
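
The rewritten TestHandoffOnUnloadedPartition stubs the broker to report only partition 2222, then asserts that the event for partition 1111 is skipped (segment 111 never reaches the target) while the event for partition 2222 is handed off (segment 222 does); TestUnRegisterHandoff asserts that Unregister discards the pending event, so only the event produced after re-registering remains.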
@@ -526,6 +580,7 @@ func (suite *HandoffObserverTestSuit) TestFilterOutEventByIndexID() {
 		Channel:  suite.channel.ChannelName,
 		Segments: map[int64]*querypb.SegmentDist{1: {NodeID: 1, Version: 0}, 2: {NodeID: 2, Version: 0}},
 	})
+	suite.broker.EXPECT().GetPartitions(mock.Anything, mock.Anything).Return([]int64{suite.partition}, nil)

 	Params.QueryCoordCfg.CheckHandoffInterval = 1 * time.Second
 	err := suite.observer.Start(context.Background())
@@ -556,6 +611,7 @@ func (suite *HandoffObserverTestSuit) TestFakedSegmentHandoff() {
 		Channel:  suite.channel.ChannelName,
 		Segments: map[int64]*querypb.SegmentDist{1: {NodeID: 1, Version: 0}},
 	})
+	suite.broker.EXPECT().GetPartitions(mock.Anything, mock.Anything).Return([]int64{suite.partition}, nil)

 	Params.QueryCoordCfg.CheckHandoffInterval = 200 * time.Millisecond
 	err := suite.observer.Start(context.Background())

--- file 4 of 6 ---

@@ -291,6 +291,7 @@ func (s *Server) initObserver() {
 		s.meta,
 		s.dist,
 		s.targetMgr,
+		s.broker,
 	)
 }

--- file 5 of 6 ---

@@ -360,6 +360,7 @@ func (suite *ServerSuite) hackServer() {
 		suite.server.meta,
 		suite.server.dist,
 		suite.server.targetMgr,
+		suite.server.broker,
 	)
 	suite.server.distController = dist.NewDistController(
 		suite.server.cluster,

--- file 6 of 6 ---

@@ -132,6 +132,7 @@ func (suite *ServiceSuite) SetupTest() {
 		suite.meta,
 		suite.dist,
 		suite.targetMgr,
+		suite.broker,
 	)
 	suite.balancer = balance.NewRowCountBasedBalancer(
 		suite.taskScheduler,