fix: Remove heartbeat lag logic during get shard leaders (#29999) (#30085)

issue: #29677 #29838 pr: #29999 During get shard leaders, if a querynode doesn't ack the heartbeat for more than 10s, querycoord will treat it as unavailable and won't return the shard leader on it. But when a querynode has full CPU usage, it can easily get stuck for more than 10s without acking the heartbeat, which causes no shard leader to be available for search/query. This PR removes the heartbeat lag logic during get shard leaders. Signed-off-by: Wei Liu <wei.liu@zilliz.com>
2024-01-18 17:48:55 +08:00 · 2024-01-18 17:48:55 +08:00 · 71e24f0a7f
parent 7f32576f36
commit 71e24f0a7f
2 changed files with 0 additions and 8 deletions
--- a/internal/querycoordv2/handlers.go
+++ b/internal/querycoordv2/handlers.go
@@ -359,8 +359,6 @@ func (s *Server) fillReplicaInfo(replica *meta.Replica, withShardNodes bool) (*m
 func checkNodeAvailable(nodeID int64, info *session.NodeInfo) error {
 	if info == nil {
 		return merr.WrapErrNodeOffline(nodeID)
-	} else if time.Since(info.LastHeartbeat()) > Params.QueryCoordCfg.HeartbeatAvailableInterval.GetAsDuration(time.Millisecond) {
-		return merr.WrapErrNodeOffline(nodeID, fmt.Sprintf("lastHB=%v", info.LastHeartbeat()))
 	}
 	return nil
 }
--- a/internal/querycoordv2/services_test.go
+++ b/internal/querycoordv2/services_test.go
@@ -1521,12 +1521,6 @@ func (suite *ServiceSuite) TestGetShardLeadersFailed() {
 			suite.nodeMgr.Add(session.NewNodeInfo(node, "localhost"))
 		}

-		// Last heartbeat response time too old
-		suite.fetchHeartbeats(time.Now().Add(-Params.QueryCoordCfg.HeartbeatAvailableInterval.GetAsDuration(time.Millisecond) - 1))
-		resp, err = server.GetShardLeaders(ctx, req)
-		suite.NoError(err)
-		suite.Equal(commonpb.ErrorCode_NoReplicaAvailable, resp.GetStatus().GetErrorCode())
-
 		// Segment not fully loaded
 		for _, node := range suite.nodes {
 			suite.dist.SegmentDistManager.Update(node)