fix: Set node unreachable when get shard client failed (#31277)

issue: #30531

Failing to get a client from `shardClientMgr` doesn't mean the query node
is unavailable: because of the ref counter policy in `shardClientMgr`,
the client is cleaned up when no collection uses the qn as a shard leader.

This PR fixes the bug where a node was set unreachable when getting its shard client failed.

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
pull/31240/head
wei liu 2024-03-15 10:23:03 +08:00 committed by GitHub
parent a1386bae7f
commit ca8eee2c47
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 15 additions and 3 deletions

View File

@ -221,9 +221,8 @@ func (b *LookAsideBalancer) checkQueryNodeHealthLoop(ctx context.Context) {
qn, err := b.clientMgr.GetClient(ctx, node)
if err != nil {
if b.trySetQueryNodeUnReachable(node, err) {
log.Warn("get client failed, set node unreachable", zap.Int64("node", node), zap.Error(err))
}
// get client from clientMgr failed, which means this qn isn't a shard leader anymore, skip its health check
log.RatedInfo(10, "get client failed", zap.Int64("node", node), zap.Error(err))
return struct{}{}, nil
}

View File

@ -334,6 +334,19 @@ func (suite *LookAsideBalancerSuite) TestCheckHealthLoop() {
}, 5*time.Second, 100*time.Millisecond)
}
// TestGetClientFailed covers the case where the client manager cannot hand
// out a shard client for a node: the node's failed-heartbeat counter is
// expected to read zero.
func (suite *LookAsideBalancerSuite) TestGetClientFailed() {
	// Node under test; the same ID keys the balancer's maps and the mock expectation.
	const nodeID = 2

	// Register a metrics-update timestamp of "now" for the node.
	// NOTE(review): presumably this makes the node visible to the health check loop — confirm.
	suite.balancer.metricsUpdateTs.Insert(nodeID, time.Now().UnixMilli())

	// Discard every expectation registered earlier, then make GetClient for
	// this node return no client together with an error.
	suite.clientMgr.ExpectedCalls = nil
	suite.clientMgr.EXPECT().
		GetClient(mock.Anything, int64(nodeID)).
		Return(nil, errors.New("shard client not found"))

	// Install a fresh counter so the assertion below observes only what
	// happens after this point.
	heartBeatFailures := atomic.NewInt64(0)
	suite.balancer.failedHeartBeatCounter.Insert(nodeID, heartBeatFailures)

	// Poll once per second, for at most ten seconds, until the counter reads zero.
	counterIsZero := func() bool {
		return heartBeatFailures.Load() == 0
	}
	suite.Eventually(counterIsZero, 10*time.Second, time.Second)
}
func (suite *LookAsideBalancerSuite) TestNodeRecover() {
// mock qn down for a while and then recover
qn3 := mocks.NewMockQueryNodeClient(suite.T())