mirror of https://github.com/milvus-io/milvus.git
enhance: Force to reset coord connection for unavailable error (#33910)
pr: #33908 Signed-off-by: Wei Liu <wei.liu@zilliz.com>pull/33900/head
parent
4513569207
commit
aafa86095f
|
@ -218,13 +218,13 @@ func (c *ClientBase[T]) GetGrpcClient(ctx context.Context) (*clientConnWrapper[T
|
||||||
return c.grpcClient, nil
|
return c.grpcClient, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *ClientBase[T]) resetConnection(wrapper *clientConnWrapper[T]) {
|
func (c *ClientBase[T]) resetConnection(wrapper *clientConnWrapper[T], forceReset bool) {
|
||||||
if time.Since(c.lastReset.Load()) < c.minResetInterval {
|
if !forceReset && time.Since(c.lastReset.Load()) < c.minResetInterval {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
c.grpcClientMtx.Lock()
|
c.grpcClientMtx.Lock()
|
||||||
defer c.grpcClientMtx.Unlock()
|
defer c.grpcClientMtx.Unlock()
|
||||||
if time.Since(c.lastReset.Load()) < c.minResetInterval {
|
if !forceReset && time.Since(c.lastReset.Load()) < c.minResetInterval {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if generic.IsZero(c.grpcClient) {
|
if generic.IsZero(c.grpcClient) {
|
||||||
|
@ -396,12 +396,12 @@ func (c *ClientBase[T]) needResetCancel() (needReset bool) {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *ClientBase[T]) checkGrpcErr(ctx context.Context, err error) (needRetry, needReset bool, retErr error) {
|
func (c *ClientBase[T]) checkGrpcErr(ctx context.Context, err error) (needRetry, needReset, forceReset bool, retErr error) {
|
||||||
log := log.Ctx(ctx).With(zap.String("clientRole", c.GetRole()))
|
log := log.Ctx(ctx).With(zap.String("clientRole", c.GetRole()))
|
||||||
// Unknown err
|
// Unknown err
|
||||||
if !funcutil.IsGrpcErr(err) {
|
if !funcutil.IsGrpcErr(err) {
|
||||||
log.Warn("fail to grpc call because of unknown error", zap.Error(err))
|
log.Warn("fail to grpc call because of unknown error", zap.Error(err))
|
||||||
return false, false, err
|
return false, false, false, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// grpc err
|
// grpc err
|
||||||
|
@ -409,22 +409,25 @@ func (c *ClientBase[T]) checkGrpcErr(ctx context.Context, err error) (needRetry,
|
||||||
switch {
|
switch {
|
||||||
case funcutil.IsGrpcErr(err, codes.Canceled, codes.DeadlineExceeded):
|
case funcutil.IsGrpcErr(err, codes.Canceled, codes.DeadlineExceeded):
|
||||||
// canceled or deadline exceeded
|
// canceled or deadline exceeded
|
||||||
return true, c.needResetCancel(), err
|
return true, c.needResetCancel(), false, err
|
||||||
case funcutil.IsGrpcErr(err, codes.Unimplemented):
|
case funcutil.IsGrpcErr(err, codes.Unimplemented):
|
||||||
// for unimplemented error, reset coord connection to avoid old coord's side effect.
|
// for unimplemented error, reset coord connection to avoid old coord's side effect.
|
||||||
// old coord's side effect: when coord changed, the connection in coord's client won't reset automatically.
|
// old coord's side effect: when coord changed, the connection in coord's client won't reset automatically.
|
||||||
// so if new interface appear in new coord, will got a unimplemented error
|
// so if new interface appear in new coord, will got a unimplemented error
|
||||||
return false, true, merr.WrapErrServiceUnimplemented(err)
|
return false, true, true, merr.WrapErrServiceUnimplemented(err)
|
||||||
case IsServerIDMismatchErr(err):
|
case IsServerIDMismatchErr(err):
|
||||||
if ok := c.checkNodeSessionExist(ctx); !ok {
|
if ok := c.checkNodeSessionExist(ctx); !ok {
|
||||||
// if session doesn't exist, no need to retry for datanode/indexnode/querynode/proxy
|
// if session doesn't exist, no need to retry for datanode/indexnode/querynode/proxy
|
||||||
return false, false, err
|
return false, false, false, err
|
||||||
}
|
}
|
||||||
return true, true, err
|
return true, true, true, err
|
||||||
case IsCrossClusterRoutingErr(err):
|
case IsCrossClusterRoutingErr(err):
|
||||||
return true, true, err
|
return true, true, true, err
|
||||||
|
case funcutil.IsGrpcErr(err, codes.Unavailable):
|
||||||
|
// for unavailable error in coord, force to reset coord connection
|
||||||
|
return true, true, !c.isNode, err
|
||||||
default:
|
default:
|
||||||
return true, true, err
|
return true, true, false, err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -454,8 +457,8 @@ func (c *ClientBase[T]) call(ctx context.Context, caller func(client T) (any, er
|
||||||
log.Warn("fail to get grpc client", zap.Error(clientErr))
|
log.Warn("fail to get grpc client", zap.Error(clientErr))
|
||||||
}
|
}
|
||||||
|
|
||||||
resetClientFunc := func() {
|
resetClientFunc := func(forceReset bool) {
|
||||||
c.resetConnection(wrapper)
|
c.resetConnection(wrapper, forceReset)
|
||||||
wrapper, clientErr = c.GetGrpcClient(ctx)
|
wrapper, clientErr = c.GetGrpcClient(ctx)
|
||||||
if clientErr != nil {
|
if clientErr != nil {
|
||||||
log.Warn("fail to get grpc client in the retry state", zap.Error(clientErr))
|
log.Warn("fail to get grpc client in the retry state", zap.Error(clientErr))
|
||||||
|
@ -473,7 +476,7 @@ func (c *ClientBase[T]) call(ctx context.Context, caller func(client T) (any, er
|
||||||
|
|
||||||
err := errors.Wrap(clientErr, "empty grpc client")
|
err := errors.Wrap(clientErr, "empty grpc client")
|
||||||
log.Warn("grpc client is nil, maybe fail to get client in the retry state", zap.Error(err))
|
log.Warn("grpc client is nil, maybe fail to get client in the retry state", zap.Error(err))
|
||||||
resetClientFunc()
|
resetClientFunc(false)
|
||||||
return true, err
|
return true, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -483,17 +486,17 @@ func (c *ClientBase[T]) call(ctx context.Context, caller func(client T) (any, er
|
||||||
wrapper.Unpin()
|
wrapper.Unpin()
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
var needRetry, needReset bool
|
var needRetry, needReset, forceReset bool
|
||||||
needRetry, needReset, err = c.checkGrpcErr(ctx, err)
|
needRetry, needReset, forceReset, err = c.checkGrpcErr(ctx, err)
|
||||||
if needReset {
|
if needReset {
|
||||||
log.Warn("start to reset connection because of specific reasons", zap.Error(err))
|
log.Warn("start to reset connection because of specific reasons", zap.Error(err))
|
||||||
resetClientFunc()
|
resetClientFunc(forceReset)
|
||||||
} else {
|
} else {
|
||||||
// err occurs but no need to reset connection, try to verify session
|
// err occurs but no need to reset connection, try to verify session
|
||||||
err := c.verifySession(ctx)
|
err := c.verifySession(ctx)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("failed to verify session, reset connection", zap.Error(err))
|
log.Warn("failed to verify session, reset connection", zap.Error(err))
|
||||||
resetClientFunc()
|
resetClientFunc(forceReset)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return needRetry, err
|
return needRetry, err
|
||||||
|
|
|
@ -370,28 +370,38 @@ func TestClientBase_CheckGrpcError(t *testing.T) {
|
||||||
base.MaxAttempts = 1
|
base.MaxAttempts = 1
|
||||||
|
|
||||||
ctx := context.Background()
|
ctx := context.Background()
|
||||||
retry, reset, _ := base.checkGrpcErr(ctx, status.Errorf(codes.Canceled, "fake context canceled"))
|
retry, reset, forceReset, _ := base.checkGrpcErr(ctx, status.Errorf(codes.Canceled, "fake context canceled"))
|
||||||
assert.True(t, retry)
|
assert.True(t, retry)
|
||||||
assert.True(t, reset)
|
assert.True(t, reset)
|
||||||
|
assert.False(t, forceReset)
|
||||||
|
|
||||||
retry, reset, _ = base.checkGrpcErr(ctx, status.Errorf(codes.Unimplemented, "fake context canceled"))
|
retry, reset, forceReset, _ = base.checkGrpcErr(ctx, status.Errorf(codes.Unimplemented, "fake context canceled"))
|
||||||
assert.False(t, retry)
|
assert.False(t, retry)
|
||||||
assert.True(t, reset)
|
assert.True(t, reset)
|
||||||
|
assert.True(t, forceReset)
|
||||||
|
|
||||||
|
retry, reset, forceReset, _ = base.checkGrpcErr(ctx, status.Errorf(codes.Unavailable, "fake context canceled"))
|
||||||
|
assert.True(t, retry)
|
||||||
|
assert.True(t, reset)
|
||||||
|
assert.True(t, forceReset)
|
||||||
|
|
||||||
// test serverId mismatch
|
// test serverId mismatch
|
||||||
retry, reset, _ = base.checkGrpcErr(ctx, status.Errorf(codes.Unknown, merr.ErrNodeNotMatch.Error()))
|
retry, reset, forceReset, _ = base.checkGrpcErr(ctx, status.Errorf(codes.Unknown, merr.ErrNodeNotMatch.Error()))
|
||||||
assert.True(t, retry)
|
assert.True(t, retry)
|
||||||
assert.True(t, reset)
|
assert.True(t, reset)
|
||||||
|
assert.True(t, forceReset)
|
||||||
|
|
||||||
// test cross cluster
|
// test cross cluster
|
||||||
retry, reset, _ = base.checkGrpcErr(ctx, status.Errorf(codes.Unknown, merr.ErrServiceCrossClusterRouting.Error()))
|
retry, reset, forceReset, _ = base.checkGrpcErr(ctx, status.Errorf(codes.Unknown, merr.ErrServiceCrossClusterRouting.Error()))
|
||||||
assert.True(t, retry)
|
assert.True(t, retry)
|
||||||
assert.True(t, reset)
|
assert.True(t, reset)
|
||||||
|
assert.True(t, forceReset)
|
||||||
|
|
||||||
// test default
|
// test default
|
||||||
retry, reset, _ = base.checkGrpcErr(ctx, status.Errorf(codes.Unknown, merr.ErrNodeNotFound.Error()))
|
retry, reset, forceReset, _ = base.checkGrpcErr(ctx, status.Errorf(codes.Unknown, merr.ErrNodeNotFound.Error()))
|
||||||
assert.True(t, retry)
|
assert.True(t, retry)
|
||||||
assert.True(t, reset)
|
assert.True(t, reset)
|
||||||
|
assert.False(t, forceReset)
|
||||||
}
|
}
|
||||||
|
|
||||||
type server struct {
|
type server struct {
|
||||||
|
|
Loading…
Reference in New Issue