mirror of https://github.com/milvus-io/milvus.git
enhance: Add logs for check health failed (#39208)
Signed-off-by: Wei Liu <wei.liu@zilliz.com>
pull/39295/head
parent
27a99f6b9d
commit
d2834a1812
|
@ -318,7 +318,7 @@ func (s *Server) Init() error {
|
||||||
log.Info("DataCoord startup success")
|
log.Info("DataCoord startup success")
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
s.stateCode.Store(commonpb.StateCode_StandBy)
|
s.UpdateStateCode(commonpb.StateCode_StandBy)
|
||||||
log.Info("DataCoord enter standby mode successfully")
|
log.Info("DataCoord enter standby mode successfully")
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
@ -328,7 +328,7 @@ func (s *Server) Init() error {
|
||||||
|
|
||||||
func (s *Server) initDataCoord() error {
|
func (s *Server) initDataCoord() error {
|
||||||
log := log.Ctx(s.ctx)
|
log := log.Ctx(s.ctx)
|
||||||
s.stateCode.Store(commonpb.StateCode_Initializing)
|
s.UpdateStateCode(commonpb.StateCode_Initializing)
|
||||||
var err error
|
var err error
|
||||||
if err = s.initRootCoordClient(); err != nil {
|
if err = s.initRootCoordClient(); err != nil {
|
||||||
return err
|
return err
|
||||||
|
@ -463,7 +463,7 @@ func (s *Server) startDataCoord() {
|
||||||
// })
|
// })
|
||||||
|
|
||||||
s.afterStart()
|
s.afterStart()
|
||||||
s.stateCode.Store(commonpb.StateCode_Healthy)
|
s.UpdateStateCode(commonpb.StateCode_Healthy)
|
||||||
sessionutil.SaveServerInfo(typeutil.DataCoordRole, s.session.GetServerID())
|
sessionutil.SaveServerInfo(typeutil.DataCoordRole, s.session.GetServerID())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -688,6 +688,12 @@ func (s *Server) GetStateCode() commonpb.StateCode {
|
||||||
return code.(commonpb.StateCode)
|
return code.(commonpb.StateCode)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// UpdateStateCode update state code
|
||||||
|
func (s *Server) UpdateStateCode(code commonpb.StateCode) {
|
||||||
|
s.stateCode.Store(code)
|
||||||
|
log.Ctx(s.ctx).Info("update datacoord state", zap.String("state", code.String()))
|
||||||
|
}
|
||||||
|
|
||||||
// GetComponentStates returns DataCoord's current state
|
// GetComponentStates returns DataCoord's current state
|
||||||
func (s *Server) GetComponentStates(ctx context.Context, req *milvuspb.GetComponentStatesRequest) (*milvuspb.ComponentStates, error) {
|
func (s *Server) GetComponentStates(ctx context.Context, req *milvuspb.GetComponentStatesRequest) (*milvuspb.ComponentStates, error) {
|
||||||
code := s.GetStateCode()
|
code := s.GetStateCode()
|
||||||
|
|
|
@ -90,14 +90,14 @@ func (handler *HealthHandler) ServeHTTP(w http.ResponseWriter, r *http.Request)
|
||||||
resp := &HealthResponse{
|
resp := &HealthResponse{
|
||||||
State: "OK",
|
State: "OK",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
unhealthyComponent := make([]string, 0)
|
||||||
ctx := context.Background()
|
ctx := context.Background()
|
||||||
healthNum := 0
|
|
||||||
for _, in := range handler.indicators {
|
for _, in := range handler.indicators {
|
||||||
handler.unregisterLock.RLock()
|
handler.unregisterLock.RLock()
|
||||||
_, unregistered := handler.unregisteredRoles[in.GetName()]
|
_, unregistered := handler.unregisteredRoles[in.GetName()]
|
||||||
handler.unregisterLock.RUnlock()
|
handler.unregisterLock.RUnlock()
|
||||||
if unregistered {
|
if unregistered {
|
||||||
healthNum++
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
code := in.Health(ctx)
|
code := in.Health(ctx)
|
||||||
|
@ -105,13 +105,15 @@ func (handler *HealthHandler) ServeHTTP(w http.ResponseWriter, r *http.Request)
|
||||||
Name: in.GetName(),
|
Name: in.GetName(),
|
||||||
Code: code,
|
Code: code,
|
||||||
})
|
})
|
||||||
if code == commonpb.StateCode_Healthy || code == commonpb.StateCode_StandBy {
|
|
||||||
healthNum++
|
if code != commonpb.StateCode_Healthy && code != commonpb.StateCode_StandBy {
|
||||||
|
unhealthyComponent = append(unhealthyComponent, in.GetName())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if healthNum != handler.indicatorNum {
|
if len(unhealthyComponent) > 0 {
|
||||||
resp.State = fmt.Sprintf("Not all components are healthy, %d/%d", healthNum, handler.indicatorNum)
|
resp.State = fmt.Sprintf("Not all components are healthy, %d/%d", handler.indicatorNum-len(unhealthyComponent), handler.indicatorNum)
|
||||||
|
log.Info("check health failed", zap.Strings("UnhealthyComponent", unhealthyComponent))
|
||||||
}
|
}
|
||||||
|
|
||||||
if resp.State == "OK" {
|
if resp.State == "OK" {
|
||||||
|
|
|
@ -671,6 +671,7 @@ func (s *Server) Stop() error {
|
||||||
// UpdateStateCode updates the status of the coord, including healthy, unhealthy
|
// UpdateStateCode updates the status of the coord, including healthy, unhealthy
|
||||||
func (s *Server) UpdateStateCode(code commonpb.StateCode) {
|
func (s *Server) UpdateStateCode(code commonpb.StateCode) {
|
||||||
s.status.Store(int32(code))
|
s.status.Store(int32(code))
|
||||||
|
log.Ctx(s.ctx).Info("update querycoord state", zap.String("state", code.String()))
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Server) State() commonpb.StateCode {
|
func (s *Server) State() commonpb.StateCode {
|
||||||
|
|
Loading…
Reference in New Issue