mirror of https://github.com/milvus-io/milvus.git
Add datanode client retry (#5394)
Add datanode client retry.
Signed-off-by: godchen <qingxiang.chen@zilliz.com>
pull/5402/head^2
parent
c28c34e852
commit
17cabfbed3
|
@ -24,8 +24,9 @@ func TestDataNodeClusterRegister(t *testing.T) {
|
|||
dataNodeNum := 3
|
||||
ids := make([]int64, 0, dataNodeNum)
|
||||
for i := 0; i < dataNodeNum; i++ {
|
||||
c := newMockDataNodeClient(int64(i))
|
||||
err := c.Init()
|
||||
c, err := newMockDataNodeClient(int64(i))
|
||||
assert.Nil(t, err)
|
||||
err = c.Init()
|
||||
assert.Nil(t, err)
|
||||
err = c.Start()
|
||||
assert.Nil(t, err)
|
||||
|
@ -74,8 +75,9 @@ func TestWatchChannels(t *testing.T) {
|
|||
cluster := newDataNodeCluster()
|
||||
for _, c := range cases {
|
||||
for i := 0; i < dataNodeNum; i++ {
|
||||
c := newMockDataNodeClient(int64(i))
|
||||
err := c.Init()
|
||||
c, err := newMockDataNodeClient(int64(i))
|
||||
assert.Nil(t, err)
|
||||
err = c.Init()
|
||||
assert.Nil(t, err)
|
||||
err = c.Start()
|
||||
assert.Nil(t, err)
|
||||
|
|
|
@ -68,11 +68,11 @@ type mockDataNodeClient struct {
|
|||
state internalpb.StateCode
|
||||
}
|
||||
|
||||
func newMockDataNodeClient(id int64) *mockDataNodeClient {
|
||||
func newMockDataNodeClient(id int64) (*mockDataNodeClient, error) {
|
||||
return &mockDataNodeClient{
|
||||
id: id,
|
||||
state: internalpb.StateCode_Initializing,
|
||||
}
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (c *mockDataNodeClient) Init() error {
|
||||
|
|
|
@ -76,7 +76,7 @@ type Server struct {
|
|||
insertChannels []string
|
||||
msFactory msgstream.Factory
|
||||
ttBarrier timesync.TimeTickBarrier
|
||||
createDataNodeClient func(addr string) types.DataNode
|
||||
createDataNodeClient func(addr string, serverID int64) (types.DataNode, error)
|
||||
}
|
||||
|
||||
func CreateServer(ctx context.Context, factory msgstream.Factory) (*Server, error) {
|
||||
|
@ -87,8 +87,12 @@ func CreateServer(ctx context.Context, factory msgstream.Factory) (*Server, erro
|
|||
msFactory: factory,
|
||||
}
|
||||
s.insertChannels = s.getInsertChannels()
|
||||
s.createDataNodeClient = func(addr string) types.DataNode {
|
||||
return grpcdatanodeclient.NewClient(addr)
|
||||
s.createDataNodeClient = func(addr string, serverID int64) (types.DataNode, error) {
|
||||
node, err := grpcdatanodeclient.NewClient(addr, serverID, []string{Params.EtcdAddress}, 10)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return node, nil
|
||||
}
|
||||
s.UpdateStateCode(internalpb.StateCode_Abnormal)
|
||||
return s, nil
|
||||
|
@ -566,7 +570,10 @@ func (s *Server) RegisterNode(ctx context.Context, req *datapb.RegisterNodeReque
|
|||
}
|
||||
|
||||
func (s *Server) newDataNode(ip string, port int64, id UniqueID) (*dataNode, error) {
|
||||
client := s.createDataNodeClient(fmt.Sprintf("%s:%d", ip, port))
|
||||
client, err := s.createDataNodeClient(fmt.Sprintf("%s:%d", ip, port), id)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := client.Init(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
|
|
@ -231,8 +231,9 @@ func TestFlush(t *testing.T) {
|
|||
func TestGetComponentStates(t *testing.T) {
|
||||
svr := newTestServer(t)
|
||||
defer closeTestServer(t, svr)
|
||||
cli := newMockDataNodeClient(1)
|
||||
err := cli.Init()
|
||||
cli, err := newMockDataNodeClient(1)
|
||||
assert.Nil(t, err)
|
||||
err = cli.Init()
|
||||
assert.Nil(t, err)
|
||||
err = cli.Start()
|
||||
assert.Nil(t, err)
|
||||
|
@ -800,7 +801,7 @@ func newTestServer(t *testing.T) *Server {
|
|||
assert.Nil(t, err)
|
||||
defer ms.Stop()
|
||||
svr.SetMasterClient(ms)
|
||||
svr.createDataNodeClient = func(addr string) types.DataNode {
|
||||
svr.createDataNodeClient = func(addr string, serverID int64) (types.DataNode, error) {
|
||||
return newMockDataNodeClient(0)
|
||||
}
|
||||
assert.Nil(t, err)
|
||||
|
|
|
@ -13,6 +13,8 @@ package grpcdatanodeclient
|
|||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/milvus-io/milvus/internal/log"
|
||||
|
@ -25,26 +27,97 @@ import (
|
|||
"github.com/milvus-io/milvus/internal/proto/internalpb"
|
||||
"github.com/milvus-io/milvus/internal/proto/milvuspb"
|
||||
|
||||
"github.com/milvus-io/milvus/internal/util/sessionutil"
|
||||
"github.com/milvus-io/milvus/internal/util/typeutil"
|
||||
"go.uber.org/zap"
|
||||
"google.golang.org/grpc"
|
||||
)
|
||||
|
||||
type Client struct {
|
||||
ctx context.Context
|
||||
grpc datapb.DataNodeClient
|
||||
conn *grpc.ClientConn
|
||||
address string
|
||||
ctx context.Context
|
||||
|
||||
grpc datapb.DataNodeClient
|
||||
conn *grpc.ClientConn
|
||||
|
||||
address string
|
||||
serverID int64
|
||||
|
||||
sess *sessionutil.Session
|
||||
timeout time.Duration
|
||||
reconnTry int
|
||||
recallTry int
|
||||
}
|
||||
|
||||
func NewClient(address string) *Client {
|
||||
return &Client{
|
||||
address: address,
|
||||
ctx: context.Background(),
|
||||
func getDataNodeAddress(sess *sessionutil.Session, serverID int64) (string, error) {
|
||||
key := typeutil.DataNodeRole + "-" + strconv.FormatInt(serverID, 10)
|
||||
msess, err := sess.GetSessions(key)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
ms, ok := msess[key]
|
||||
if !ok {
|
||||
return "", fmt.Errorf("number of master service is incorrect, %d", len(msess))
|
||||
}
|
||||
return ms.Address, nil
|
||||
}
|
||||
|
||||
func NewClient(address string, serverID int64, etcdAddr []string, timeout time.Duration) (*Client, error) {
|
||||
sess := sessionutil.NewSession(context.Background(), etcdAddr)
|
||||
|
||||
return &Client{
|
||||
grpc: nil,
|
||||
conn: nil,
|
||||
address: address,
|
||||
ctx: context.Background(),
|
||||
sess: sess,
|
||||
timeout: timeout,
|
||||
recallTry: 3,
|
||||
reconnTry: 10,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (c *Client) Init() error {
|
||||
tracer := opentracing.GlobalTracer()
|
||||
if c.address != "" {
|
||||
connectGrpcFunc := func() error {
|
||||
log.Debug("DataNode connect ", zap.String("address", c.address))
|
||||
conn, err := grpc.DialContext(c.ctx, c.address, grpc.WithInsecure(), grpc.WithBlock(),
|
||||
grpc.WithUnaryInterceptor(
|
||||
otgrpc.OpenTracingClientInterceptor(tracer)),
|
||||
grpc.WithStreamInterceptor(
|
||||
otgrpc.OpenTracingStreamClientInterceptor(tracer)))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
c.conn = conn
|
||||
return nil
|
||||
}
|
||||
|
||||
err := retry.Retry(c.reconnTry, time.Millisecond*500, connectGrpcFunc)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
return c.reconnect()
|
||||
}
|
||||
c.grpc = datapb.NewDataNodeClient(c.conn)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Client) reconnect() error {
|
||||
tracer := opentracing.GlobalTracer()
|
||||
var err error
|
||||
getDataNodeAddressFn := func() error {
|
||||
c.address, err = getDataNodeAddress(c.sess, c.serverID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
err = retry.Retry(c.reconnTry, 3*time.Second, getDataNodeAddressFn)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
connectGrpcFunc := func() error {
|
||||
log.Debug("DataNode connect ", zap.String("address", c.address))
|
||||
conn, err := grpc.DialContext(c.ctx, c.address, grpc.WithInsecure(), grpc.WithBlock(),
|
||||
|
@ -59,7 +132,7 @@ func (c *Client) Init() error {
|
|||
return nil
|
||||
}
|
||||
|
||||
err := retry.Retry(100000, time.Millisecond*200, connectGrpcFunc)
|
||||
err = retry.Retry(c.reconnTry, 500*time.Millisecond, connectGrpcFunc)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -67,6 +140,23 @@ func (c *Client) Init() error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (c *Client) recall(caller func() (interface{}, error)) (interface{}, error) {
|
||||
ret, err := caller()
|
||||
if err == nil {
|
||||
return ret, nil
|
||||
}
|
||||
for i := 0; i < c.recallTry; i++ {
|
||||
err = c.reconnect()
|
||||
if err == nil {
|
||||
ret, err = caller()
|
||||
if err == nil {
|
||||
return ret, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret, err
|
||||
}
|
||||
|
||||
func (c *Client) Start() error {
|
||||
return nil
|
||||
}
|
||||
|
@ -81,17 +171,29 @@ func (c *Client) Register() error {
|
|||
}
|
||||
|
||||
func (c *Client) GetComponentStates(ctx context.Context) (*internalpb.ComponentStates, error) {
|
||||
return c.grpc.GetComponentStates(ctx, &internalpb.GetComponentStatesRequest{})
|
||||
ret, err := c.recall(func() (interface{}, error) {
|
||||
return c.grpc.GetComponentStates(ctx, &internalpb.GetComponentStatesRequest{})
|
||||
})
|
||||
return ret.(*internalpb.ComponentStates), err
|
||||
}
|
||||
|
||||
func (c *Client) GetStatisticsChannel(ctx context.Context) (*milvuspb.StringResponse, error) {
|
||||
return c.grpc.GetStatisticsChannel(ctx, &internalpb.GetStatisticsChannelRequest{})
|
||||
ret, err := c.recall(func() (interface{}, error) {
|
||||
return c.grpc.GetStatisticsChannel(ctx, &internalpb.GetStatisticsChannelRequest{})
|
||||
})
|
||||
return ret.(*milvuspb.StringResponse), err
|
||||
}
|
||||
|
||||
func (c *Client) WatchDmChannels(ctx context.Context, req *datapb.WatchDmChannelsRequest) (*commonpb.Status, error) {
|
||||
return c.grpc.WatchDmChannels(ctx, req)
|
||||
ret, err := c.recall(func() (interface{}, error) {
|
||||
return c.grpc.WatchDmChannels(ctx, req)
|
||||
})
|
||||
return ret.(*commonpb.Status), err
|
||||
}
|
||||
|
||||
func (c *Client) FlushSegments(ctx context.Context, req *datapb.FlushSegmentsRequest) (*commonpb.Status, error) {
|
||||
return c.grpc.FlushSegments(ctx, req)
|
||||
ret, err := c.recall(func() (interface{}, error) {
|
||||
return c.grpc.FlushSegments(ctx, req)
|
||||
})
|
||||
return ret.(*commonpb.Status), err
|
||||
}
|
||||
|
|
|
@ -14,7 +14,6 @@ package grpcmasterserviceclient
|
|||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"path"
|
||||
"time"
|
||||
|
||||
"github.com/milvus-io/milvus/internal/proto/commonpb"
|
||||
|
@ -43,14 +42,13 @@ type GrpcClient struct {
|
|||
}
|
||||
|
||||
func getMasterServiceAddr(sess *sessionutil.Session) (string, error) {
|
||||
msess, err := sess.GetSessions(typeutil.MasterServiceRole)
|
||||
key := typeutil.MasterServiceRole
|
||||
msess, err := sess.GetSessions(key)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
key := path.Join(sessionutil.DefaultServiceRoot, typeutil.MasterServiceRole)
|
||||
var ms *sessionutil.Session
|
||||
var ok bool
|
||||
if ms, ok = msess[key]; !ok {
|
||||
ms, ok := msess[key]
|
||||
if !ok {
|
||||
return "", fmt.Errorf("number of master service is incorrect, %d", len(msess))
|
||||
}
|
||||
return ms.Address, nil
|
||||
|
@ -58,12 +56,8 @@ func getMasterServiceAddr(sess *sessionutil.Session) (string, error) {
|
|||
|
||||
func NewClient(addr string, etcdAddr []string, timeout time.Duration) (*GrpcClient, error) {
|
||||
sess := sessionutil.NewSession(context.Background(), etcdAddr)
|
||||
|
||||
if addr == "" {
|
||||
var err error
|
||||
if addr, err = getMasterServiceAddr(sess); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if sess == nil {
|
||||
return nil, fmt.Errorf("new session error, maybe can not connect to etcd")
|
||||
}
|
||||
|
||||
return &GrpcClient{
|
||||
|
@ -106,16 +100,20 @@ func (c *GrpcClient) Init() error {
|
|||
ctx, cancel := context.WithTimeout(context.Background(), c.timeout)
|
||||
defer cancel()
|
||||
var err error
|
||||
for i := 0; i < c.reconnTry; i++ {
|
||||
if c.conn, err = grpc.DialContext(ctx, c.addr, grpc.WithInsecure(), grpc.WithBlock(),
|
||||
grpc.WithUnaryInterceptor(
|
||||
otgrpc.OpenTracingClientInterceptor(tracer)),
|
||||
grpc.WithStreamInterceptor(
|
||||
otgrpc.OpenTracingStreamClientInterceptor(tracer))); err == nil {
|
||||
break
|
||||
if c.addr != "" {
|
||||
for i := 0; i < c.reconnTry; i++ {
|
||||
if c.conn, err = grpc.DialContext(ctx, c.addr, grpc.WithInsecure(), grpc.WithBlock(),
|
||||
grpc.WithUnaryInterceptor(
|
||||
otgrpc.OpenTracingClientInterceptor(tracer)),
|
||||
grpc.WithStreamInterceptor(
|
||||
otgrpc.OpenTracingStreamClientInterceptor(tracer))); err == nil {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
if err != nil {
|
||||
return fmt.Errorf("connect to specific address gprc error")
|
||||
}
|
||||
} else {
|
||||
return c.reconnect()
|
||||
}
|
||||
c.grpcClient = masterpb.NewMasterServiceClient(c.conn)
|
||||
|
|
Loading…
Reference in New Issue