Register after start to prevent there are tow coordinator at the same time (#21641) (#21707)

Signed-off-by: cai.zhang <cai.zhang@zilliz.com>
pull/21868/head
cai.zhang 2023-01-30 11:11:50 +08:00 committed by GitHub
parent bb086f47e4
commit bcae97b125
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 207 additions and 80 deletions

View File

@ -238,6 +238,7 @@ func (s *Server) Register() error {
s.icSession.Register()
s.session.Register()
if s.enableActiveStandBy {
s.icSession.ProcessActiveStandBy(nil)
s.session.ProcessActiveStandBy(s.activateFunc)
}
go s.session.LivenessCheck(s.serverLoopCtx, func() {
@ -261,6 +262,7 @@ func (s *Server) initSession() error {
return errors.New("failed to initialize IndexCoord session")
}
s.icSession.Init(typeutil.IndexCoordRole, s.address, true, true)
s.icSession.SetEnableActiveStandBy(s.enableActiveStandBy)
s.session = sessionutil.NewSession(s.ctx, Params.EtcdCfg.MetaRootPath.GetValue(), s.etcdCli)
if s.session == nil {
@ -276,7 +278,30 @@ func (s *Server) Init() error {
var err error
s.stateCode.Store(commonpb.StateCode_Initializing)
s.factory.Init(Params)
if err = s.initSession(); err != nil {
return err
}
if s.enableActiveStandBy {
s.activateFunc = func() {
log.Info("DataCoord switch from standby to active, activating")
if err := s.initDataCoord(); err != nil {
log.Warn("DataCoord init failed", zap.Error(err))
// TODO: panic if error occurred?
}
s.startDataCoord()
s.stateCode.Store(commonpb.StateCode_Healthy)
log.Info("DataCoord startup success")
}
s.stateCode.Store(commonpb.StateCode_StandBy)
log.Info("DataCoord enter standby mode successfully")
return nil
}
return s.initDataCoord()
}
func (s *Server) initDataCoord() error {
var err error
if err = s.initRootCoordClient(); err != nil {
return err
}
@ -316,6 +341,8 @@ func (s *Server) Init() error {
s.initGarbageCollection(storageCli)
s.initIndexBuilder(storageCli)
s.serverLoopCtx, s.serverLoopCancel = context.WithCancel(s.ctx)
return nil
}
@ -327,35 +354,27 @@ func (s *Server) Init() error {
// datanodes etcd watch, etcd alive check and flush completed status check
// 4. set server state to Healthy
func (s *Server) Start() error {
if !s.enableActiveStandBy {
s.startDataCoord()
s.stateCode.Store(commonpb.StateCode_Healthy)
log.Info("DataCoord startup successfully")
}
return nil
}
func (s *Server) startDataCoord() {
if Params.DataCoordCfg.EnableCompaction.GetAsBool() {
s.compactionHandler.start()
s.compactionTrigger.start()
}
if s.enableActiveStandBy {
s.activateFunc = func() {
// todo complete the activateFunc
log.Info("datacoord switch from standby to active, activating")
s.startServerLoop()
s.stateCode.Store(commonpb.StateCode_Healthy)
logutil.Logger(s.ctx).Info("startup success")
}
s.stateCode.Store(commonpb.StateCode_StandBy)
logutil.Logger(s.ctx).Info("DataCoord enter standby mode successfully")
} else {
s.startServerLoop()
s.stateCode.Store(commonpb.StateCode_Healthy)
logutil.Logger(s.ctx).Info("DataCoord startup successfully")
}
s.startServerLoop()
// DataCoord (re)starts successfully and starts to collection segment stats
// data from all DataNode.
// This will prevent DataCoord from missing out any important segment stats
// data while offline.
log.Info("DataCoord (re)starts successfully and re-collecting segment stats from DataNodes")
s.reCollectSegmentStats(s.ctx)
return nil
}
func (s *Server) initCluster() error {
@ -516,7 +535,6 @@ func (s *Server) initIndexNodeManager() {
}
func (s *Server) startServerLoop() {
s.serverLoopCtx, s.serverLoopCancel = context.WithCancel(s.ctx)
s.serverLoopWg.Add(3)
s.startDataNodeTtLoop(s.serverLoopCtx)
s.startWatchService(s.serverLoopCtx)

View File

@ -164,16 +164,17 @@ func (s *Server) startGrpcLoop(grpcPort int) {
}
func (s *Server) start() error {
err := s.dataCoord.Start()
if err != nil {
log.Error("DataCoord start failed", zap.Error(err))
return err
}
err = s.dataCoord.Register()
err := s.dataCoord.Register()
if err != nil {
log.Debug("DataCoord register service failed", zap.Error(err))
return err
}
err = s.dataCoord.Start()
if err != nil {
log.Error("DataCoord start failed", zap.Error(err))
return err
}
return nil
}

View File

@ -241,11 +241,11 @@ func (s *Server) startGrpcLoop(grpcPort int) {
// start starts QueryCoord's grpc service.
func (s *Server) start() error {
err := s.queryCoord.Start()
err := s.queryCoord.Register()
if err != nil {
return err
}
return s.queryCoord.Register()
return s.queryCoord.Start()
}
// Stop stops QueryCoord's grpc service.

View File

@ -256,16 +256,17 @@ func (s *Server) startGrpcLoop(port int) {
}
func (s *Server) start() error {
log.Debug("RootCoord Core start ...")
if err := s.rootCoord.Start(); err != nil {
log.Error(err.Error())
return err
}
log.Info("RootCoord Core start ...")
if err := s.rootCoord.Register(); err != nil {
log.Error("RootCoord registers service failed", zap.Error(err))
return err
}
if err := s.rootCoord.Start(); err != nil {
log.Error("RootCoord start service failed", zap.Error(err))
return err
}
return nil
}

View File

@ -144,11 +144,7 @@ func (s *Server) Register() error {
return nil
}
func (s *Server) Init() error {
log.Info("QueryCoord start init",
zap.String("meta-root-path", Params.EtcdCfg.MetaRootPath.GetValue()),
zap.String("address", s.address))
func (s *Server) initSession() error {
// Init QueryCoord session
s.session = sessionutil.NewSession(s.ctx, Params.EtcdCfg.MetaRootPath.GetValue(), s.etcdCli)
if s.session == nil {
@ -157,7 +153,39 @@ func (s *Server) Init() error {
s.session.Init(typeutil.QueryCoordRole, s.address, true, true)
s.enableActiveStandBy = Params.QueryCoordCfg.EnableActiveStandby.GetAsBool()
s.session.SetEnableActiveStandBy(s.enableActiveStandBy)
return nil
}
func (s *Server) Init() error {
log.Info("QueryCoord start init",
zap.String("meta-root-path", Params.EtcdCfg.MetaRootPath.GetValue()),
zap.String("address", s.address))
if err := s.initSession(); err != nil {
return err
}
if s.enableActiveStandBy {
s.activateFunc = func() {
log.Info("QueryCoord switch from standby to active, activating")
if err := s.initQueryCoord(); err != nil {
log.Warn("QueryCoord init failed", zap.Error(err))
}
if err := s.startQueryCoord(); err != nil {
log.Warn("QueryCoord init failed", zap.Error(err))
}
s.UpdateStateCode(commonpb.StateCode_Healthy)
log.Info("QueryCoord startup success")
}
s.UpdateStateCode(commonpb.StateCode_StandBy)
log.Info("QueryCoord enter standby mode successfully")
return nil
}
return s.initQueryCoord()
}
func (s *Server) initQueryCoord() error {
// Init KV
etcdKV := etcdkv.NewEtcdKV(s.etcdCli, Params.EtcdCfg.MetaRootPath.GetValue())
s.kv = etcdKV
@ -318,6 +346,17 @@ func (s *Server) afterStart() {
}
func (s *Server) Start() error {
if !s.enableActiveStandBy {
if err := s.startQueryCoord(); err != nil {
return err
}
s.UpdateStateCode(commonpb.StateCode_Healthy)
log.Info("QueryCoord started")
}
return nil
}
func (s *Server) startQueryCoord() error {
log.Info("start watcher...")
sessions, revision, err := s.session.GetSessions(typeutil.QueryNodeRole)
if err != nil {
@ -339,22 +378,8 @@ func (s *Server) Start() error {
if err != nil {
return err
}
if s.enableActiveStandBy {
s.activateFunc = func() {
log.Info("querycoord switch from standby to active, activating")
s.startServerLoop()
s.UpdateStateCode(commonpb.StateCode_Healthy)
}
s.UpdateStateCode(commonpb.StateCode_StandBy)
} else {
s.startServerLoop()
s.UpdateStateCode(commonpb.StateCode_Healthy)
}
log.Info("QueryCoord started")
s.startServerLoop()
s.afterStart()
return nil
}

View File

@ -27,8 +27,10 @@ import (
"github.com/stretchr/testify/suite"
"github.com/milvus-io/milvus-proto/go-api/commonpb"
"github.com/milvus-io/milvus-proto/go-api/milvuspb"
"github.com/milvus-io/milvus-proto/go-api/schemapb"
"github.com/milvus-io/milvus/internal/log"
coordMocks "github.com/milvus-io/milvus/internal/mocks"
"github.com/milvus-io/milvus/internal/proto/datapb"
"github.com/milvus-io/milvus/internal/proto/querypb"
"github.com/milvus-io/milvus/internal/querycoordv2/checkers"
@ -39,6 +41,7 @@ import (
"github.com/milvus-io/milvus/internal/querycoordv2/params"
"github.com/milvus-io/milvus/internal/querycoordv2/session"
"github.com/milvus-io/milvus/internal/querycoordv2/task"
"github.com/milvus-io/milvus/internal/util/commonpbutil"
"github.com/milvus-io/milvus/internal/util/etcd"
"github.com/milvus-io/milvus/internal/util/paramtable"
)
@ -244,14 +247,41 @@ func (suite *ServerSuite) TestEnableActiveStandby() {
suite.server, err = newQueryCoord()
suite.NoError(err)
suite.hackServer()
err = suite.server.Start()
mockRootCoord := coordMocks.NewRootCoord(suite.T())
mockDataCoord := coordMocks.NewDataCoord(suite.T())
mockRootCoord.EXPECT().DescribeCollection(mock.Anything, mock.Anything).Return(&milvuspb.DescribeCollectionResponse{
Status: successStatus,
Schema: &schemapb.CollectionSchema{},
}, nil).Maybe()
for _, collection := range suite.collections {
if suite.loadTypes[collection] == querypb.LoadType_LoadCollection {
req := &milvuspb.ShowPartitionsRequest{
Base: commonpbutil.NewMsgBase(
commonpbutil.WithMsgType(commonpb.MsgType_ShowPartitions),
),
CollectionID: collection,
}
mockRootCoord.EXPECT().ShowPartitionsInternal(mock.Anything, req).Return(&milvuspb.ShowPartitionsResponse{
Status: successStatus,
PartitionIDs: suite.partitions[collection],
}, nil).Maybe()
}
suite.expectGetRecoverInfoByMockDataCoord(collection, mockDataCoord)
}
err = suite.server.SetRootCoord(mockRootCoord)
suite.NoError(err)
err = suite.server.SetDataCoord(mockDataCoord)
suite.NoError(err)
//suite.hackServer()
states1, err := suite.server.GetComponentStates(context.Background())
suite.NoError(err)
suite.Equal(commonpb.StateCode_StandBy, states1.GetState().GetStateCode())
err = suite.server.Register()
suite.NoError(err)
err = suite.server.Start()
suite.NoError(err)
states2, err := suite.server.GetComponentStates(context.Background())
suite.NoError(err)
@ -351,6 +381,43 @@ func (suite *ServerSuite) expectGetRecoverInfo(collection int64) {
}
}
func (suite *ServerSuite) expectGetRecoverInfoByMockDataCoord(collection int64, dataCoord *coordMocks.DataCoord) {
var (
vChannels []*datapb.VchannelInfo
segmentBinlogs []*datapb.SegmentBinlogs
)
for partition, segments := range suite.segments[collection] {
segments := segments
getRecoveryInfoRequest := &datapb.GetRecoveryInfoRequest{
Base: commonpbutil.NewMsgBase(
commonpbutil.WithMsgType(commonpb.MsgType_GetRecoveryInfo),
),
CollectionID: collection,
PartitionID: partition,
}
vChannels = []*datapb.VchannelInfo{}
for _, channel := range suite.channels[collection] {
vChannels = append(vChannels, &datapb.VchannelInfo{
CollectionID: collection,
ChannelName: channel,
})
}
segmentBinlogs = []*datapb.SegmentBinlogs{}
for _, segment := range segments {
segmentBinlogs = append(segmentBinlogs, &datapb.SegmentBinlogs{
SegmentID: segment,
InsertChannel: suite.channels[collection][segment%2],
})
}
dataCoord.EXPECT().GetRecoveryInfo(mock.Anything, getRecoveryInfoRequest).Maybe().Return(&datapb.GetRecoveryInfoResponse{
Status: successStatus,
Channels: vChannels,
Binlogs: segmentBinlogs,
}, nil)
}
}
func (suite *ServerSuite) updateCollectionStatus(collectionID int64, status querypb.LoadStatus) {
collection := suite.server.meta.GetCollection(collectionID)
if collection != nil {

View File

@ -405,10 +405,6 @@ func (c *Core) initImportManager() error {
}
func (c *Core) initInternal() error {
if err := c.initSession(); err != nil {
return err
}
c.initKVCreator()
if err := c.initMetaTable(); err != nil {
@ -426,7 +422,6 @@ func (c *Core) initInternal() error {
c.scheduler = newScheduler(c.ctx, c.idAllocator, c.tsoAllocator)
c.factory.Init(Params)
chanMap := c.meta.ListCollectionPhysicalChannels()
c.chanTimeTick = newTimeTickSync(c.ctx, c.session.ServerID, c.factory, chanMap)
c.proxyClientManager = newProxyClientManager(c.proxyCreator)
@ -468,9 +463,36 @@ func (c *Core) initInternal() error {
// Init initialize routine
func (c *Core) Init() error {
var initError error
c.initOnce.Do(func() {
initError = c.initInternal()
})
c.factory.Init(Params)
if err := c.initSession(); err != nil {
return err
}
if c.enableActiveStandBy {
c.activateFunc = func() {
log.Info("RootCoord switch from standby to active, activating")
c.initOnce.Do(func() {
if err := c.initInternal(); err != nil {
log.Warn("RootCoord init failed", zap.Error(err))
}
})
c.startOnce.Do(func() {
if err := c.startInternal(); err != nil {
log.Warn("RootCoord start failed", zap.Error(err))
}
})
c.UpdateStateCode(commonpb.StateCode_Healthy)
log.Info("RootCoord startup success")
}
c.UpdateStateCode(commonpb.StateCode_StandBy)
log.Info("RootCoord enter standby mode successfully")
} else {
c.initOnce.Do(func() {
initError = c.initInternal()
})
}
return initError
}
@ -603,20 +625,10 @@ func (c *Core) startInternal() error {
c.scheduler.Start()
c.stepExecutor.Start()
if c.enableActiveStandBy {
c.activateFunc = func() {
// todo to complete
log.Info("rootcoord switch from standby to active, activating")
c.startServerLoop()
c.UpdateStateCode(commonpb.StateCode_Healthy)
}
c.UpdateStateCode(commonpb.StateCode_StandBy)
logutil.Logger(c.ctx).Info("rootcoord enter standby mode successfully")
} else {
c.startServerLoop()
c.UpdateStateCode(commonpb.StateCode_Healthy)
logutil.Logger(c.ctx).Info("rootcoord startup successfully")
}
c.startServerLoop()
c.UpdateStateCode(commonpb.StateCode_Healthy)
logutil.Logger(c.ctx).Info("rootcoord startup successfully")
return nil
}
@ -633,9 +645,12 @@ func (c *Core) startServerLoop() {
// Start starts RootCoord.
func (c *Core) Start() error {
var err error
c.startOnce.Do(func() {
err = c.startInternal()
})
if !c.enableActiveStandBy {
c.startOnce.Do(func() {
err = c.startInternal()
})
}
return err
}