mirror of https://github.com/milvus-io/milvus.git
fix: [10kcp] Fix standby mixcoord start failed (#38324)
When a standby coordinator transitions to active, its component state changes to Initializing. If the initialization takes too long (exceeding the liveness probe's maximum retries), the standby pod is stopped and fails to start. This PR removes the Initializing state from the standby-to-active transition during rolling upgrades: the state is set to Initializing only on the non-standby Init path, so a standby now switches directly from StandBy to Healthy, preventing health check failures.
issue: https://github.com/milvus-io/milvus/issues/37630
pr: https://github.com/milvus-io/milvus/pull/38308
Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
pull/38327/head
parent
24a055996b
commit
15b01daec5
|
@ -333,16 +333,22 @@ func (s *Server) Init() error {
|
|||
log.Info("DataCoord startup success")
|
||||
return nil
|
||||
}
|
||||
s.stateCode.Store(commonpb.StateCode_StandBy)
|
||||
s.UpdateStateCode(commonpb.StateCode_StandBy)
|
||||
log.Info("DataCoord enter standby mode successfully")
|
||||
return nil
|
||||
}
|
||||
|
||||
s.UpdateStateCode(commonpb.StateCode_Initializing)
|
||||
return s.initDataCoord()
|
||||
}
|
||||
|
||||
// UpdateStateCode records code as the DataCoord server's current state by
// storing it in s.stateCode, then logs the new state name at Info level.
|
||||
func (s *Server) UpdateStateCode(code commonpb.StateCode) {
|
||||
s.stateCode.Store(code)
|
||||
log.Info("update datacoord state", zap.String("state", code.String()))
|
||||
}
|
||||
|
||||
func (s *Server) initDataCoord() error {
|
||||
s.stateCode.Store(commonpb.StateCode_Initializing)
|
||||
var err error
|
||||
if err = s.initRootCoordClient(); err != nil {
|
||||
return err
|
||||
|
@ -472,7 +478,7 @@ func (s *Server) startDataCoord() {
|
|||
// })
|
||||
|
||||
s.afterStart()
|
||||
s.stateCode.Store(commonpb.StateCode_Healthy)
|
||||
s.UpdateStateCode(commonpb.StateCode_Healthy)
|
||||
sessionutil.SaveServerInfo(typeutil.DataCoordRole, s.session.GetServerID())
|
||||
}
|
||||
|
||||
|
@ -1109,9 +1115,7 @@ func (s *Server) initRootCoordClient() error {
|
|||
//
|
||||
// stop message stream client and stop server loops
|
||||
func (s *Server) Stop() error {
|
||||
if !s.stateCode.CompareAndSwap(commonpb.StateCode_Healthy, commonpb.StateCode_Abnormal) {
|
||||
return nil
|
||||
}
|
||||
s.UpdateStateCode(commonpb.StateCode_Abnormal)
|
||||
logutil.Logger(s.ctx).Info("datacoord server shutdown")
|
||||
s.garbageCollector.close()
|
||||
logutil.Logger(s.ctx).Info("datacoord garbage collector stopped")
|
||||
|
|
|
@ -215,6 +215,7 @@ func (s *Server) Init() error {
|
|||
return nil
|
||||
}
|
||||
|
||||
s.UpdateStateCode(commonpb.StateCode_Initializing)
|
||||
return s.initQueryCoord()
|
||||
}
|
||||
|
||||
|
@ -235,7 +236,6 @@ func (s *Server) initQueryCoord() error {
|
|||
}
|
||||
log.Info("QueryCoord report DataCoord ready")
|
||||
|
||||
s.UpdateStateCode(commonpb.StateCode_Initializing)
|
||||
log.Info("start init querycoord", zap.Any("State", commonpb.StateCode_Initializing))
|
||||
// Init KV and ID allocator
|
||||
metaType := Params.MetaStoreCfg.MetaStoreType.GetValue()
|
||||
|
@ -599,6 +599,7 @@ func (s *Server) Stop() error {
|
|||
// UpdateStateCode records code as the QueryCoord server's current state by
// storing its int32 value in s.status, then logs the new state name at Info level.
|
||||
func (s *Server) UpdateStateCode(code commonpb.StateCode) {
|
||||
s.status.Store(int32(code))
|
||||
log.Info("update querycoord state", zap.String("state", code.String()))
|
||||
}
|
||||
|
||||
func (s *Server) State() commonpb.StateCode {
|
||||
|
|
|
@ -428,7 +428,6 @@ func (c *Core) initTSOAllocator() error {
|
|||
}
|
||||
|
||||
func (c *Core) initInternal() error {
|
||||
c.UpdateStateCode(commonpb.StateCode_Initializing)
|
||||
c.initKVCreator()
|
||||
|
||||
if err := c.initIDAllocator(); err != nil {
|
||||
|
@ -517,6 +516,7 @@ func (c *Core) Init() error {
|
|||
log.Info("RootCoord enter standby mode successfully")
|
||||
} else {
|
||||
c.initOnce.Do(func() {
|
||||
c.UpdateStateCode(commonpb.StateCode_Initializing)
|
||||
initError = c.initInternal()
|
||||
})
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue