From 303470fc3516f08890e7623f8720182867c593b9 Mon Sep 17 00:00:00 2001
From: wei liu <wei.liu@zilliz.com>
Date: Wed, 22 May 2024 10:03:40 +0800
Subject: [PATCH] fix: Clean offline node from resource group after qc restart
 (#33232)

issue: #33200 #33207
pr#33104 causes the offline node will be kept in resource group after qc
recover, and offline node will be assign to new replica as rwNode, then
request send to those node will fail by NodeNotFound.

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
---
 internal/querycoordv2/server.go | 33 ++++++++++-----------------------
 1 file changed, 10 insertions(+), 23 deletions(-)

diff --git a/internal/querycoordv2/server.go b/internal/querycoordv2/server.go
index 0f54d5e2f8..da16b65fb4 100644
--- a/internal/querycoordv2/server.go
+++ b/internal/querycoordv2/server.go
@@ -456,7 +456,7 @@ func (s *Server) startQueryCoord() error {
 			s.nodeMgr.Stopping(node.ServerID)
 		}
 	}
-	s.checkReplicas()
+	s.checkNodeStateInRG()
 	for _, node := range sessions {
 		s.handleNodeUp(node.ServerID)
 	}
@@ -778,28 +778,15 @@ func (s *Server) handleNodeDown(node int64) {
 	s.meta.ResourceManager.HandleNodeDown(node)
 }
 
-// checkReplicas checks whether replica contains offline node, and remove those nodes
-func (s *Server) checkReplicas() {
-	for _, collection := range s.meta.CollectionManager.GetAll() {
-		log := log.With(zap.Int64("collectionID", collection))
-		replicas := s.meta.ReplicaManager.GetByCollection(collection)
-		for _, replica := range replicas {
-			toRemove := make([]int64, 0)
-			for _, node := range replica.GetNodes() {
-				if s.nodeMgr.Get(node) == nil {
-					toRemove = append(toRemove, node)
-				}
-			}
-
-			if len(toRemove) > 0 {
-				log := log.With(
-					zap.Int64("replicaID", replica.GetID()),
-					zap.Int64s("offlineNodes", toRemove),
-				)
-				log.Info("some nodes are offline, remove them from replica", zap.Any("toRemove", toRemove))
-				if err := s.meta.ReplicaManager.RemoveNode(replica.GetID(), toRemove...); err != nil {
-					log.Warn("failed to remove offline nodes from replica")
-				}
+func (s *Server) checkNodeStateInRG() {
+	for _, rgName := range s.meta.ListResourceGroups() {
+		rg := s.meta.ResourceManager.GetResourceGroup(rgName)
+		for _, node := range rg.GetNodes() {
+			info := s.nodeMgr.Get(node)
+			if info == nil {
+				s.meta.ResourceManager.HandleNodeDown(node)
+			} else if info.IsStoppingState() {
+				s.meta.ResourceManager.HandleNodeStopping(node)
 			}
 		}
 	}