fix: Check stale should check leader task's leader id (#31962)

issue: #30816

check stale rules for leader task:
1. for reduce leader task, it should keep executing until leader's node
become offline.
2. for grow leader task,it should keep executing until leader's node
become stopping.

This PR check leader node's stopping state for grow leader task

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
pull/32064/head
wei liu 2024-04-09 15:33:25 +08:00 committed by GitHub
parent 5b693c466d
commit 177ddda47f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 6 additions and 8 deletions

View File

@ -20,7 +20,6 @@ import (
"context"
"time"
"github.com/samber/lo"
"go.uber.org/zap"
"github.com/milvus-io/milvus/internal/proto/datapb"
@ -122,12 +121,7 @@ func (c *LeaderChecker) findNeedLoadedSegments(ctx context.Context, replica *met
)
ret := make([]task.Task, 0)
// skip set segment on stopping node to leader view
aliveNodeDist := lo.Filter(dist, func(s *meta.Segment, _ int) bool {
nodeInfo := c.nodeMgr.Get(s.Node)
return nodeInfo != nil && nodeInfo.GetState() != session.NodeStateStopping
})
latestNodeDist := utils.FindMaxVersionSegments(aliveNodeDist)
latestNodeDist := utils.FindMaxVersionSegments(dist)
for _, s := range latestNodeDist {
segment := c.target.GetSealedSegment(leaderView.CollectionID, s.GetID(), meta.CurrentTargetFirst)
existInTarget := segment != nil

View File

@ -191,6 +191,10 @@ func (action *LeaderAction) Version() typeutil.UniqueID {
return action.version
}
func (action *LeaderAction) GetLeaderID() typeutil.UniqueID {
return action.leaderID
}
func (action *LeaderAction) IsFinished(distMgr *meta.DistributionManager) bool {
views := distMgr.LeaderViewManager.GetLeaderView(action.leaderID)
view := views[action.Shard()]

View File

@ -966,7 +966,7 @@ func (scheduler *taskScheduler) checkLeaderTaskStale(task *LeaderTask) error {
for _, action := range task.Actions() {
switch action.Type() {
case ActionTypeGrow:
if ok, _ := scheduler.nodeMgr.IsStoppingNode(action.Node()); ok {
if ok, _ := scheduler.nodeMgr.IsStoppingNode(action.(*LeaderAction).GetLeaderID()); ok {
log.Warn("task stale due to node offline", zap.Int64("segment", task.segmentID))
return merr.WrapErrNodeOffline(action.Node())
}