Add datanode client retry (#5394)

Add datanode client retry.

Signed-off-by: godchen <qingxiang.chen@zilliz.com>
pull/5402/head^2
godchen 2021-05-25 15:47:08 +08:00 committed by GitHub
parent c28c34e852
commit 17cabfbed3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 157 additions and 47 deletions

View File

@ -24,8 +24,9 @@ func TestDataNodeClusterRegister(t *testing.T) {
dataNodeNum := 3
ids := make([]int64, 0, dataNodeNum)
for i := 0; i < dataNodeNum; i++ {
c := newMockDataNodeClient(int64(i))
err := c.Init()
c, err := newMockDataNodeClient(int64(i))
assert.Nil(t, err)
err = c.Init()
assert.Nil(t, err)
err = c.Start()
assert.Nil(t, err)
@ -74,8 +75,9 @@ func TestWatchChannels(t *testing.T) {
cluster := newDataNodeCluster()
for _, c := range cases {
for i := 0; i < dataNodeNum; i++ {
c := newMockDataNodeClient(int64(i))
err := c.Init()
c, err := newMockDataNodeClient(int64(i))
assert.Nil(t, err)
err = c.Init()
assert.Nil(t, err)
err = c.Start()
assert.Nil(t, err)

View File

@ -68,11 +68,11 @@ type mockDataNodeClient struct {
state internalpb.StateCode
}
func newMockDataNodeClient(id int64) *mockDataNodeClient {
func newMockDataNodeClient(id int64) (*mockDataNodeClient, error) {
return &mockDataNodeClient{
id: id,
state: internalpb.StateCode_Initializing,
}
}, nil
}
func (c *mockDataNodeClient) Init() error {

View File

@ -76,7 +76,7 @@ type Server struct {
insertChannels []string
msFactory msgstream.Factory
ttBarrier timesync.TimeTickBarrier
createDataNodeClient func(addr string) types.DataNode
createDataNodeClient func(addr string, serverID int64) (types.DataNode, error)
}
func CreateServer(ctx context.Context, factory msgstream.Factory) (*Server, error) {
@ -87,8 +87,12 @@ func CreateServer(ctx context.Context, factory msgstream.Factory) (*Server, erro
msFactory: factory,
}
s.insertChannels = s.getInsertChannels()
s.createDataNodeClient = func(addr string) types.DataNode {
return grpcdatanodeclient.NewClient(addr)
s.createDataNodeClient = func(addr string, serverID int64) (types.DataNode, error) {
node, err := grpcdatanodeclient.NewClient(addr, serverID, []string{Params.EtcdAddress}, 10)
if err != nil {
return nil, err
}
return node, nil
}
s.UpdateStateCode(internalpb.StateCode_Abnormal)
return s, nil
@ -566,7 +570,10 @@ func (s *Server) RegisterNode(ctx context.Context, req *datapb.RegisterNodeReque
}
func (s *Server) newDataNode(ip string, port int64, id UniqueID) (*dataNode, error) {
client := s.createDataNodeClient(fmt.Sprintf("%s:%d", ip, port))
client, err := s.createDataNodeClient(fmt.Sprintf("%s:%d", ip, port), id)
if err != nil {
return nil, err
}
if err := client.Init(); err != nil {
return nil, err
}

View File

@ -231,8 +231,9 @@ func TestFlush(t *testing.T) {
func TestGetComponentStates(t *testing.T) {
svr := newTestServer(t)
defer closeTestServer(t, svr)
cli := newMockDataNodeClient(1)
err := cli.Init()
cli, err := newMockDataNodeClient(1)
assert.Nil(t, err)
err = cli.Init()
assert.Nil(t, err)
err = cli.Start()
assert.Nil(t, err)
@ -800,7 +801,7 @@ func newTestServer(t *testing.T) *Server {
assert.Nil(t, err)
defer ms.Stop()
svr.SetMasterClient(ms)
svr.createDataNodeClient = func(addr string) types.DataNode {
svr.createDataNodeClient = func(addr string, serverID int64) (types.DataNode, error) {
return newMockDataNodeClient(0)
}
assert.Nil(t, err)

View File

@ -13,6 +13,8 @@ package grpcdatanodeclient
import (
"context"
"fmt"
"strconv"
"time"
"github.com/milvus-io/milvus/internal/log"
@ -25,26 +27,97 @@ import (
"github.com/milvus-io/milvus/internal/proto/internalpb"
"github.com/milvus-io/milvus/internal/proto/milvuspb"
"github.com/milvus-io/milvus/internal/util/sessionutil"
"github.com/milvus-io/milvus/internal/util/typeutil"
"go.uber.org/zap"
"google.golang.org/grpc"
)
type Client struct {
ctx context.Context
grpc datapb.DataNodeClient
conn *grpc.ClientConn
address string
ctx context.Context
grpc datapb.DataNodeClient
conn *grpc.ClientConn
address string
serverID int64
sess *sessionutil.Session
timeout time.Duration
reconnTry int
recallTry int
}
func NewClient(address string) *Client {
return &Client{
address: address,
ctx: context.Background(),
// getDataNodeAddress returns the address registered in the session store for
// the data node identified by serverID.
//
// The lookup key is typeutil.DataNodeRole + "-" + the decimal serverID. The
// key is passed to sess.GetSessions and the address is read from the entry
// stored under that same key in the returned map.
//
// An error is returned when sess is nil, when GetSessions fails, or when the
// returned map has no entry under the key.
func getDataNodeAddress(sess *sessionutil.Session, serverID int64) (string, error) {
	if sess == nil {
		return "", fmt.Errorf("session is nil, can not look up data node %d", serverID)
	}
	key := typeutil.DataNodeRole + "-" + strconv.FormatInt(serverID, 10)
	msess, err := sess.GetSessions(key)
	if err != nil {
		return "", err
	}
	ms, ok := msess[key]
	if !ok {
		// The previous message was copied from the master service client and
		// wrongly blamed the master service; name the data node and key instead.
		return "", fmt.Errorf("session of data node %d not found under key %q, got %d sessions", serverID, key, len(msess))
	}
	return ms.Address, nil
}
// NewClient creates a data node client without connecting to it; call Init
// to establish the gRPC connection.
//
// address is the data node's gRPC address. When it is empty, Init resolves
// the address through the session store via reconnect.
// serverID identifies the data node; reconnect uses it to look the node's
// address up in the session store.
// etcdAddr is handed to sessionutil.NewSession to create that session.
// timeout is stored on the client. NOTE(review): no visible code reads it
// yet, and it is a time.Duration, so callers should pass a value with a unit
// (e.g. 10*time.Second) rather than a bare integer.
//
// The client retries a failed call up to 3 times (recallTry) and a failed
// connection attempt up to 10 times (reconnTry).
//
// An error is returned when the session can not be created.
func NewClient(address string, serverID int64, etcdAddr []string, timeout time.Duration) (*Client, error) {
	sess := sessionutil.NewSession(context.Background(), etcdAddr)
	if sess == nil {
		// Same guard and message as the master service client uses.
		return nil, fmt.Errorf("new session error, maybe can not connect to etcd")
	}
	return &Client{
		grpc:    nil,
		conn:    nil,
		address: address,
		// serverID must be kept: reconnect looks the node up by this ID, and
		// leaving it at zero would resolve the wrong data node.
		serverID:  serverID,
		ctx:       context.Background(),
		sess:      sess,
		timeout:   timeout,
		recallTry: 3,
		reconnTry: 10,
	}, nil
}
// Init connects the client to its data node.
//
// With an empty address the whole job is delegated to reconnect, which
// resolves the address from the session store first. Otherwise the configured
// address is dialed (blocking dial, OpenTracing interceptors installed),
// retrying up to reconnTry times with 500ms passed to retry.Retry as the
// interval, and the gRPC stub is created on the resulting connection.
func (c *Client) Init() error {
	tracer := opentracing.GlobalTracer()

	// No explicit address: let reconnect discover it and connect.
	if c.address == "" {
		return c.reconnect()
	}

	dial := func() error {
		log.Debug("DataNode connect ", zap.String("address", c.address))
		conn, dialErr := grpc.DialContext(
			c.ctx,
			c.address,
			grpc.WithInsecure(),
			grpc.WithBlock(),
			grpc.WithUnaryInterceptor(otgrpc.OpenTracingClientInterceptor(tracer)),
			grpc.WithStreamInterceptor(otgrpc.OpenTracingStreamClientInterceptor(tracer)),
		)
		if dialErr != nil {
			return dialErr
		}
		c.conn = conn
		return nil
	}

	if err := retry.Retry(c.reconnTry, time.Millisecond*500, dial); err != nil {
		return err
	}

	c.grpc = datapb.NewDataNodeClient(c.conn)
	return nil
}
func (c *Client) reconnect() error {
tracer := opentracing.GlobalTracer()
var err error
getDataNodeAddressFn := func() error {
c.address, err = getDataNodeAddress(c.sess, c.serverID)
if err != nil {
return err
}
return nil
}
err = retry.Retry(c.reconnTry, 3*time.Second, getDataNodeAddressFn)
if err != nil {
return err
}
connectGrpcFunc := func() error {
log.Debug("DataNode connect ", zap.String("address", c.address))
conn, err := grpc.DialContext(c.ctx, c.address, grpc.WithInsecure(), grpc.WithBlock(),
@ -59,7 +132,7 @@ func (c *Client) Init() error {
return nil
}
err := retry.Retry(100000, time.Millisecond*200, connectGrpcFunc)
err = retry.Retry(c.reconnTry, 500*time.Millisecond, connectGrpcFunc)
if err != nil {
return err
}
@ -67,6 +140,23 @@ func (c *Client) Init() error {
return nil
}
// recall runs caller and, if it fails, retries it after reconnecting.
//
// caller is invoked once up front. While the last step reported an error and
// fewer than recallTry rounds have been used, the client reconnects; caller
// is invoked again only when that reconnect succeeded. The first successful
// caller result is returned with a nil error. If every round fails, the
// value from the most recent caller invocation is returned together with the
// last error seen (which may come from reconnect rather than from caller).
func (c *Client) recall(caller func() (interface{}, error)) (interface{}, error) {
	result, lastErr := caller()
	for round := 0; lastErr != nil && round < c.recallTry; round++ {
		if lastErr = c.reconnect(); lastErr != nil {
			// Reconnect failed; spend the round and try reconnecting again.
			continue
		}
		result, lastErr = caller()
	}
	return result, lastErr
}
func (c *Client) Start() error {
return nil
}
@ -81,17 +171,29 @@ func (c *Client) Register() error {
}
func (c *Client) GetComponentStates(ctx context.Context) (*internalpb.ComponentStates, error) {
return c.grpc.GetComponentStates(ctx, &internalpb.GetComponentStatesRequest{})
ret, err := c.recall(func() (interface{}, error) {
return c.grpc.GetComponentStates(ctx, &internalpb.GetComponentStatesRequest{})
})
return ret.(*internalpb.ComponentStates), err
}
func (c *Client) GetStatisticsChannel(ctx context.Context) (*milvuspb.StringResponse, error) {
return c.grpc.GetStatisticsChannel(ctx, &internalpb.GetStatisticsChannelRequest{})
ret, err := c.recall(func() (interface{}, error) {
return c.grpc.GetStatisticsChannel(ctx, &internalpb.GetStatisticsChannelRequest{})
})
return ret.(*milvuspb.StringResponse), err
}
func (c *Client) WatchDmChannels(ctx context.Context, req *datapb.WatchDmChannelsRequest) (*commonpb.Status, error) {
return c.grpc.WatchDmChannels(ctx, req)
ret, err := c.recall(func() (interface{}, error) {
return c.grpc.WatchDmChannels(ctx, req)
})
return ret.(*commonpb.Status), err
}
func (c *Client) FlushSegments(ctx context.Context, req *datapb.FlushSegmentsRequest) (*commonpb.Status, error) {
return c.grpc.FlushSegments(ctx, req)
ret, err := c.recall(func() (interface{}, error) {
return c.grpc.FlushSegments(ctx, req)
})
return ret.(*commonpb.Status), err
}

View File

@ -14,7 +14,6 @@ package grpcmasterserviceclient
import (
"context"
"fmt"
"path"
"time"
"github.com/milvus-io/milvus/internal/proto/commonpb"
@ -43,14 +42,13 @@ type GrpcClient struct {
}
func getMasterServiceAddr(sess *sessionutil.Session) (string, error) {
msess, err := sess.GetSessions(typeutil.MasterServiceRole)
key := typeutil.MasterServiceRole
msess, err := sess.GetSessions(key)
if err != nil {
return "", err
}
key := path.Join(sessionutil.DefaultServiceRoot, typeutil.MasterServiceRole)
var ms *sessionutil.Session
var ok bool
if ms, ok = msess[key]; !ok {
ms, ok := msess[key]
if !ok {
return "", fmt.Errorf("number of master service is incorrect, %d", len(msess))
}
return ms.Address, nil
@ -58,12 +56,8 @@ func getMasterServiceAddr(sess *sessionutil.Session) (string, error) {
func NewClient(addr string, etcdAddr []string, timeout time.Duration) (*GrpcClient, error) {
sess := sessionutil.NewSession(context.Background(), etcdAddr)
if addr == "" {
var err error
if addr, err = getMasterServiceAddr(sess); err != nil {
return nil, err
}
if sess == nil {
return nil, fmt.Errorf("new session error, maybe can not connect to etcd")
}
return &GrpcClient{
@ -106,16 +100,20 @@ func (c *GrpcClient) Init() error {
ctx, cancel := context.WithTimeout(context.Background(), c.timeout)
defer cancel()
var err error
for i := 0; i < c.reconnTry; i++ {
if c.conn, err = grpc.DialContext(ctx, c.addr, grpc.WithInsecure(), grpc.WithBlock(),
grpc.WithUnaryInterceptor(
otgrpc.OpenTracingClientInterceptor(tracer)),
grpc.WithStreamInterceptor(
otgrpc.OpenTracingStreamClientInterceptor(tracer))); err == nil {
break
if c.addr != "" {
for i := 0; i < c.reconnTry; i++ {
if c.conn, err = grpc.DialContext(ctx, c.addr, grpc.WithInsecure(), grpc.WithBlock(),
grpc.WithUnaryInterceptor(
otgrpc.OpenTracingClientInterceptor(tracer)),
grpc.WithStreamInterceptor(
otgrpc.OpenTracingStreamClientInterceptor(tracer))); err == nil {
break
}
}
}
if err != nil {
if err != nil {
return fmt.Errorf("connect to specific address gprc error")
}
} else {
return c.reconnect()
}
c.grpcClient = masterpb.NewMasterServiceClient(c.conn)