From c056620899b21200f650013ea6f4b5eb0f9f77c0 Mon Sep 17 00:00:00 2001 From: wei liu Date: Thu, 26 Sep 2024 18:13:15 +0800 Subject: [PATCH] fix: Skip unnecessary query node health check in proxy (#36491) issue: #36490 After the query node changes from a delegator to a worker, proxy should skip this querynode's health check. Signed-off-by: Wei Liu --- internal/proxy/look_aside_balancer.go | 1 + internal/proxy/look_aside_balancer_test.go | 11 ++++------- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/internal/proxy/look_aside_balancer.go b/internal/proxy/look_aside_balancer.go index dd1115d8b0..3bd7c3d63e 100644 --- a/internal/proxy/look_aside_balancer.go +++ b/internal/proxy/look_aside_balancer.go @@ -247,6 +247,7 @@ func (b *LookAsideBalancer) checkQueryNodeHealthLoop(ctx context.Context) { qn, err := b.clientMgr.GetClient(ctx, node) if err != nil { // get client from clientMgr failed, which means this qn isn't a shard leader anymore, skip it's health check + b.trySetQueryNodeUnReachable(node, err) log.RatedInfo(10, "get client failed", zap.Int64("node", node), zap.Error(err)) return struct{}{}, nil } diff --git a/internal/proxy/look_aside_balancer_test.go b/internal/proxy/look_aside_balancer_test.go index e80d31fc07..69d57c5f55 100644 --- a/internal/proxy/look_aside_balancer_test.go +++ b/internal/proxy/look_aside_balancer_test.go @@ -25,7 +25,6 @@ import ( "github.com/cockroachdb/errors" "github.com/stretchr/testify/mock" "github.com/stretchr/testify/suite" - "go.uber.org/atomic" "github.com/milvus-io/milvus-proto/go-api/v2/commonpb" "github.com/milvus-io/milvus-proto/go-api/v2/milvuspb" @@ -344,12 +343,10 @@ func (suite *LookAsideBalancerSuite) TestGetClientFailed() { // test get shard client from client mgr return nil suite.clientMgr.ExpectedCalls = nil suite.clientMgr.EXPECT().GetClient(mock.Anything, int64(2)).Return(nil, errors.New("shard client not found")) - failCounter := atomic.NewInt64(0) - suite.balancer.failedHeartBeatCounter.Insert(2, 
failCounter) - - // slepp 10s, wait for checkNodeHealth execute for more than one round - time.Sleep(10 * time.Second) - suite.True(failCounter.Load() == 0) + // expect the health check to stop once the failure count reaches the limit + suite.Eventually(func() bool { + return !suite.balancer.metricsMap.Contain(2) + }, 30*time.Second, 1*time.Second) } func (suite *LookAsideBalancerSuite) TestNodeRecover() {