fix: querycoord panic after node down (#31831)

issue: #30519

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
pull/22134/head
wei liu 2024-04-03 10:03:22 +08:00 committed by GitHub
parent 0feee53631
commit 7471a8005f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 16 additions and 2 deletions

View File

@ -956,8 +956,10 @@ func (s *Server) GetShardLeaders(ctx context.Context, req *querypb.GetShardLeade
addrs := make([]string, 0, len(leaders)) addrs := make([]string, 0, len(leaders))
for _, leader := range readableLeaders { for _, leader := range readableLeaders {
info := s.nodeMgr.Get(leader.ID) info := s.nodeMgr.Get(leader.ID)
ids = append(ids, info.ID()) if info != nil {
addrs = append(addrs, info.Addr()) ids = append(ids, info.ID())
addrs = append(addrs, info.Addr())
}
} }
resp.Shards = append(resp.Shards, &querypb.ShardLeadersList{ resp.Shards = append(resp.Shards, &querypb.ShardLeadersList{

View File

@ -198,6 +198,12 @@ func (ex *Executor) loadSegment(task *SegmentTask, step int) error {
// Get shard leader for the given replica and segment // Get shard leader for the given replica and segment
replica := ex.meta.ReplicaManager.GetByCollectionAndNode(task.CollectionID(), action.Node()) replica := ex.meta.ReplicaManager.GetByCollectionAndNode(task.CollectionID(), action.Node())
if replica == nil {
msg := "node doesn't belong to any replica"
err := merr.WrapErrNodeNotAvailable(action.Node())
log.Warn(msg, zap.Error(err))
return err
}
view := ex.dist.LeaderViewManager.GetLatestLeadersByReplicaShard(replica, action.Shard()) view := ex.dist.LeaderViewManager.GetLatestLeadersByReplicaShard(replica, action.Shard())
if view == nil { if view == nil {
msg := "no shard leader for the segment to execute loading" msg := "no shard leader for the segment to execute loading"
@ -255,6 +261,12 @@ func (ex *Executor) releaseSegment(task *SegmentTask, step int) {
// return // return
// } // }
replica := ex.meta.ReplicaManager.GetByCollectionAndNode(task.CollectionID(), action.Node()) replica := ex.meta.ReplicaManager.GetByCollectionAndNode(task.CollectionID(), action.Node())
if replica == nil {
msg := "node doesn't belong to any replica"
err := merr.WrapErrNodeNotAvailable(action.Node())
log.Warn(msg, zap.Error(err))
return
}
view := ex.dist.LeaderViewManager.GetLatestLeadersByReplicaShard(replica, action.Shard()) view := ex.dist.LeaderViewManager.GetLatestLeadersByReplicaShard(replica, action.Shard())
if view == nil { if view == nil {
msg := "no shard leader for the segment to execute releasing" msg := "no shard leader for the segment to execute releasing"