enhance: Force to reset coord connection for unavailable error (#33910)

pr: #33908

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
pull/33900/head
wei liu 2024-06-18 11:15:58 +08:00 committed by GitHub
parent 4513569207
commit aafa86095f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 36 additions and 23 deletions

View File

@ -218,13 +218,13 @@ func (c *ClientBase[T]) GetGrpcClient(ctx context.Context) (*clientConnWrapper[T
return c.grpcClient, nil return c.grpcClient, nil
} }
func (c *ClientBase[T]) resetConnection(wrapper *clientConnWrapper[T]) { func (c *ClientBase[T]) resetConnection(wrapper *clientConnWrapper[T], forceReset bool) {
if time.Since(c.lastReset.Load()) < c.minResetInterval { if !forceReset && time.Since(c.lastReset.Load()) < c.minResetInterval {
return return
} }
c.grpcClientMtx.Lock() c.grpcClientMtx.Lock()
defer c.grpcClientMtx.Unlock() defer c.grpcClientMtx.Unlock()
if time.Since(c.lastReset.Load()) < c.minResetInterval { if !forceReset && time.Since(c.lastReset.Load()) < c.minResetInterval {
return return
} }
if generic.IsZero(c.grpcClient) { if generic.IsZero(c.grpcClient) {
@ -396,12 +396,12 @@ func (c *ClientBase[T]) needResetCancel() (needReset bool) {
return false return false
} }
func (c *ClientBase[T]) checkGrpcErr(ctx context.Context, err error) (needRetry, needReset bool, retErr error) { func (c *ClientBase[T]) checkGrpcErr(ctx context.Context, err error) (needRetry, needReset, forceReset bool, retErr error) {
log := log.Ctx(ctx).With(zap.String("clientRole", c.GetRole())) log := log.Ctx(ctx).With(zap.String("clientRole", c.GetRole()))
// Unknown err // Unknown err
if !funcutil.IsGrpcErr(err) { if !funcutil.IsGrpcErr(err) {
log.Warn("fail to grpc call because of unknown error", zap.Error(err)) log.Warn("fail to grpc call because of unknown error", zap.Error(err))
return false, false, err return false, false, false, err
} }
// grpc err // grpc err
@ -409,22 +409,25 @@ func (c *ClientBase[T]) checkGrpcErr(ctx context.Context, err error) (needRetry,
switch { switch {
case funcutil.IsGrpcErr(err, codes.Canceled, codes.DeadlineExceeded): case funcutil.IsGrpcErr(err, codes.Canceled, codes.DeadlineExceeded):
// canceled or deadline exceeded // canceled or deadline exceeded
return true, c.needResetCancel(), err return true, c.needResetCancel(), false, err
case funcutil.IsGrpcErr(err, codes.Unimplemented): case funcutil.IsGrpcErr(err, codes.Unimplemented):
// for unimplemented error, reset coord connection to avoid old coord's side effect. // for unimplemented error, reset coord connection to avoid old coord's side effect.
// old coord's side effect: when coord changed, the connection in coord's client won't reset automatically. // old coord's side effect: when coord changed, the connection in coord's client won't reset automatically.
// so if new interface appear in new coord, will got a unimplemented error // so if new interface appear in new coord, will got a unimplemented error
return false, true, merr.WrapErrServiceUnimplemented(err) return false, true, true, merr.WrapErrServiceUnimplemented(err)
case IsServerIDMismatchErr(err): case IsServerIDMismatchErr(err):
if ok := c.checkNodeSessionExist(ctx); !ok { if ok := c.checkNodeSessionExist(ctx); !ok {
// if session doesn't exist, no need to retry for datanode/indexnode/querynode/proxy // if session doesn't exist, no need to retry for datanode/indexnode/querynode/proxy
return false, false, err return false, false, false, err
} }
return true, true, err return true, true, true, err
case IsCrossClusterRoutingErr(err): case IsCrossClusterRoutingErr(err):
return true, true, err return true, true, true, err
case funcutil.IsGrpcErr(err, codes.Unavailable):
// for unavailable error in coord, force to reset coord connection
return true, true, !c.isNode, err
default: default:
return true, true, err return true, true, false, err
} }
} }
@ -454,8 +457,8 @@ func (c *ClientBase[T]) call(ctx context.Context, caller func(client T) (any, er
log.Warn("fail to get grpc client", zap.Error(clientErr)) log.Warn("fail to get grpc client", zap.Error(clientErr))
} }
resetClientFunc := func() { resetClientFunc := func(forceReset bool) {
c.resetConnection(wrapper) c.resetConnection(wrapper, forceReset)
wrapper, clientErr = c.GetGrpcClient(ctx) wrapper, clientErr = c.GetGrpcClient(ctx)
if clientErr != nil { if clientErr != nil {
log.Warn("fail to get grpc client in the retry state", zap.Error(clientErr)) log.Warn("fail to get grpc client in the retry state", zap.Error(clientErr))
@ -473,7 +476,7 @@ func (c *ClientBase[T]) call(ctx context.Context, caller func(client T) (any, er
err := errors.Wrap(clientErr, "empty grpc client") err := errors.Wrap(clientErr, "empty grpc client")
log.Warn("grpc client is nil, maybe fail to get client in the retry state", zap.Error(err)) log.Warn("grpc client is nil, maybe fail to get client in the retry state", zap.Error(err))
resetClientFunc() resetClientFunc(false)
return true, err return true, err
} }
@ -483,17 +486,17 @@ func (c *ClientBase[T]) call(ctx context.Context, caller func(client T) (any, er
wrapper.Unpin() wrapper.Unpin()
if err != nil { if err != nil {
var needRetry, needReset bool var needRetry, needReset, forceReset bool
needRetry, needReset, err = c.checkGrpcErr(ctx, err) needRetry, needReset, forceReset, err = c.checkGrpcErr(ctx, err)
if needReset { if needReset {
log.Warn("start to reset connection because of specific reasons", zap.Error(err)) log.Warn("start to reset connection because of specific reasons", zap.Error(err))
resetClientFunc() resetClientFunc(forceReset)
} else { } else {
// err occurs but no need to reset connection, try to verify session // err occurs but no need to reset connection, try to verify session
err := c.verifySession(ctx) err := c.verifySession(ctx)
if err != nil { if err != nil {
log.Warn("failed to verify session, reset connection", zap.Error(err)) log.Warn("failed to verify session, reset connection", zap.Error(err))
resetClientFunc() resetClientFunc(forceReset)
} }
} }
return needRetry, err return needRetry, err

View File

@ -370,28 +370,38 @@ func TestClientBase_CheckGrpcError(t *testing.T) {
base.MaxAttempts = 1 base.MaxAttempts = 1
ctx := context.Background() ctx := context.Background()
retry, reset, _ := base.checkGrpcErr(ctx, status.Errorf(codes.Canceled, "fake context canceled")) retry, reset, forceReset, _ := base.checkGrpcErr(ctx, status.Errorf(codes.Canceled, "fake context canceled"))
assert.True(t, retry) assert.True(t, retry)
assert.True(t, reset) assert.True(t, reset)
assert.False(t, forceReset)
retry, reset, _ = base.checkGrpcErr(ctx, status.Errorf(codes.Unimplemented, "fake context canceled")) retry, reset, forceReset, _ = base.checkGrpcErr(ctx, status.Errorf(codes.Unimplemented, "fake context canceled"))
assert.False(t, retry) assert.False(t, retry)
assert.True(t, reset) assert.True(t, reset)
assert.True(t, forceReset)
retry, reset, forceReset, _ = base.checkGrpcErr(ctx, status.Errorf(codes.Unavailable, "fake context canceled"))
assert.True(t, retry)
assert.True(t, reset)
assert.True(t, forceReset)
// test serverId mismatch // test serverId mismatch
retry, reset, _ = base.checkGrpcErr(ctx, status.Errorf(codes.Unknown, merr.ErrNodeNotMatch.Error())) retry, reset, forceReset, _ = base.checkGrpcErr(ctx, status.Errorf(codes.Unknown, merr.ErrNodeNotMatch.Error()))
assert.True(t, retry) assert.True(t, retry)
assert.True(t, reset) assert.True(t, reset)
assert.True(t, forceReset)
// test cross cluster // test cross cluster
retry, reset, _ = base.checkGrpcErr(ctx, status.Errorf(codes.Unknown, merr.ErrServiceCrossClusterRouting.Error())) retry, reset, forceReset, _ = base.checkGrpcErr(ctx, status.Errorf(codes.Unknown, merr.ErrServiceCrossClusterRouting.Error()))
assert.True(t, retry) assert.True(t, retry)
assert.True(t, reset) assert.True(t, reset)
assert.True(t, forceReset)
// test default // test default
retry, reset, _ = base.checkGrpcErr(ctx, status.Errorf(codes.Unknown, merr.ErrNodeNotFound.Error())) retry, reset, forceReset, _ = base.checkGrpcErr(ctx, status.Errorf(codes.Unknown, merr.ErrNodeNotFound.Error()))
assert.True(t, retry) assert.True(t, retry)
assert.True(t, reset) assert.True(t, reset)
assert.False(t, forceReset)
} }
type server struct { type server struct {