fix: Remove heartbeat lag logic during get shard leaders (#29999) (#30085)

issue: #29677 #29838
pr: #29999
During get shard leaders, if a querynode doesn't ack the heartbeat for more
than 10s, querycoord will treat it as unavailable and won't return a shard
leader on it. But when a querynode is at full CPU usage, it can easily be
stuck for more than 10s without acking the heartbeat, which leaves no shard
leader available for search/query.

This PR removes the heartbeat lag logic during get shard leaders.

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
pull/30116/head
wei liu 2024-01-18 17:48:55 +08:00 committed by GitHub
parent 7f32576f36
commit 71e24f0a7f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 0 additions and 8 deletions

View File

@ -359,8 +359,6 @@ func (s *Server) fillReplicaInfo(replica *meta.Replica, withShardNodes bool) (*m
// checkNodeAvailable decides, from the given session info, whether the node
// identified by nodeID may be treated as available.
//
// It returns merr.WrapErrNodeOffline(nodeID) when info is nil. It returns the
// same kind of error, annotated with the last heartbeat time, when the time
// elapsed since info.LastHeartbeat() exceeds
// Params.QueryCoordCfg.HeartbeatAvailableInterval. Otherwise it returns nil.
func checkNodeAvailable(nodeID int64, info *session.NodeInfo) error {
if info == nil {
// No session info was supplied for this node.
return merr.WrapErrNodeOffline(nodeID)
} else if time.Since(info.LastHeartbeat()) > Params.QueryCoordCfg.HeartbeatAvailableInterval.GetAsDuration(time.Millisecond) {
// NOTE(review): this looks like the heartbeat-lag check that the commit
// message says is being removed; confirm against the hunk header.
return merr.WrapErrNodeOffline(nodeID, fmt.Sprintf("lastHB=%v", info.LastHeartbeat()))
}
return nil
}

View File

@ -1521,12 +1521,6 @@ func (suite *ServiceSuite) TestGetShardLeadersFailed() {
suite.nodeMgr.Add(session.NewNodeInfo(node, "localhost"))
}
// Last heartbeat response time too old
suite.fetchHeartbeats(time.Now().Add(-Params.QueryCoordCfg.HeartbeatAvailableInterval.GetAsDuration(time.Millisecond) - 1))
resp, err = server.GetShardLeaders(ctx, req)
suite.NoError(err)
suite.Equal(commonpb.ErrorCode_NoReplicaAvailable, resp.GetStatus().GetErrorCode())
// Segment not fully loaded
for _, node := range suite.nodes {
suite.dist.SegmentDistManager.Update(node)