fix: [10kcp] Fix standby mixcoord start failed (#38324)

When a standby coordinator transitions to active, its component state
changes to Initializing. If initialization takes too long (exceeding the
liveness probe's maximum retries), the standby pod is stopped and fails to
start. This PR removes the Initializing state from the standby-to-active
transition during rolling upgrades. The state now switches directly from
standby to healthy, preventing health check failures.

issue: https://github.com/milvus-io/milvus/issues/37630

pr: https://github.com/milvus-io/milvus/pull/38308

Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
pull/38327/head
yihao.dai 2024-12-10 10:53:50 +08:00 committed by GitHub
parent 24a055996b
commit 15b01daec5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 13 additions and 8 deletions

View File

@ -333,16 +333,22 @@ func (s *Server) Init() error {
log.Info("DataCoord startup success")
return nil
}
s.stateCode.Store(commonpb.StateCode_StandBy)
s.UpdateStateCode(commonpb.StateCode_StandBy)
log.Info("DataCoord enter standby mode successfully")
return nil
}
s.UpdateStateCode(commonpb.StateCode_Initializing)
return s.initDataCoord()
}
// UpdateStateCode records code as the DataCoord server's current state
// (e.g. StandBy, Initializing, Healthy, Abnormal) and logs the new state.
func (s *Server) UpdateStateCode(code commonpb.StateCode) {
s.stateCode.Store(code)
// Log every transition so state changes can be traced in the server log.
log.Info("update datacoord state", zap.String("state", code.String()))
}
func (s *Server) initDataCoord() error {
s.stateCode.Store(commonpb.StateCode_Initializing)
var err error
if err = s.initRootCoordClient(); err != nil {
return err
@ -472,7 +478,7 @@ func (s *Server) startDataCoord() {
// })
s.afterStart()
s.stateCode.Store(commonpb.StateCode_Healthy)
s.UpdateStateCode(commonpb.StateCode_Healthy)
sessionutil.SaveServerInfo(typeutil.DataCoordRole, s.session.GetServerID())
}
@ -1109,9 +1115,7 @@ func (s *Server) initRootCoordClient() error {
//
// stop message stream client and stop server loops
func (s *Server) Stop() error {
if !s.stateCode.CompareAndSwap(commonpb.StateCode_Healthy, commonpb.StateCode_Abnormal) {
return nil
}
s.UpdateStateCode(commonpb.StateCode_Abnormal)
logutil.Logger(s.ctx).Info("datacoord server shutdown")
s.garbageCollector.close()
logutil.Logger(s.ctx).Info("datacoord garbage collector stopped")

View File

@ -215,6 +215,7 @@ func (s *Server) Init() error {
return nil
}
s.UpdateStateCode(commonpb.StateCode_Initializing)
return s.initQueryCoord()
}
@ -235,7 +236,6 @@ func (s *Server) initQueryCoord() error {
}
log.Info("QueryCoord report DataCoord ready")
s.UpdateStateCode(commonpb.StateCode_Initializing)
log.Info("start init querycoord", zap.Any("State", commonpb.StateCode_Initializing))
// Init KV and ID allocator
metaType := Params.MetaStoreCfg.MetaStoreType.GetValue()
@ -599,6 +599,7 @@ func (s *Server) Stop() error {
// UpdateStateCode records code as the QueryCoord server's current state
// (e.g. Initializing, Healthy) and logs the new state.
func (s *Server) UpdateStateCode(code commonpb.StateCode) {
// The status field holds the state as an int32, so the enum is converted before storing.
s.status.Store(int32(code))
// Log every transition so state changes can be traced in the server log.
log.Info("update querycoord state", zap.String("state", code.String()))
}
func (s *Server) State() commonpb.StateCode {

View File

@ -428,7 +428,6 @@ func (c *Core) initTSOAllocator() error {
}
func (c *Core) initInternal() error {
c.UpdateStateCode(commonpb.StateCode_Initializing)
c.initKVCreator()
if err := c.initIDAllocator(); err != nil {
@ -517,6 +516,7 @@ func (c *Core) Init() error {
log.Info("RootCoord enter standby mode successfully")
} else {
c.initOnce.Do(func() {
c.UpdateStateCode(commonpb.StateCode_Initializing)
initError = c.initInternal()
})
}