enhance: Add logs for failed health checks (#39208)

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
pull/39295/head
wei liu 2025-01-15 17:31:00 +08:00 committed by GitHub
parent 27a99f6b9d
commit d2834a1812
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 18 additions and 9 deletions

View File

@ -318,7 +318,7 @@ func (s *Server) Init() error {
log.Info("DataCoord startup success")
return nil
}
s.stateCode.Store(commonpb.StateCode_StandBy)
s.UpdateStateCode(commonpb.StateCode_StandBy)
log.Info("DataCoord enter standby mode successfully")
return nil
}
@ -328,7 +328,7 @@ func (s *Server) Init() error {
func (s *Server) initDataCoord() error {
log := log.Ctx(s.ctx)
s.stateCode.Store(commonpb.StateCode_Initializing)
s.UpdateStateCode(commonpb.StateCode_Initializing)
var err error
if err = s.initRootCoordClient(); err != nil {
return err
@ -463,7 +463,7 @@ func (s *Server) startDataCoord() {
// })
s.afterStart()
s.stateCode.Store(commonpb.StateCode_Healthy)
s.UpdateStateCode(commonpb.StateCode_Healthy)
sessionutil.SaveServerInfo(typeutil.DataCoordRole, s.session.GetServerID())
}

View File

@ -688,6 +688,12 @@ func (s *Server) GetStateCode() commonpb.StateCode {
return code.(commonpb.StateCode)
}
// UpdateStateCode stores code as the server's current state code and then
// emits an info-level log entry carrying the state's string name.
func (s *Server) UpdateStateCode(code commonpb.StateCode) {
	s.stateCode.Store(code)

	// Log after the store so the entry reflects the value already published.
	logger := log.Ctx(s.ctx)
	logger.Info("update datacoord state", zap.String("state", code.String()))
}
// GetComponentStates returns DataCoord's current state
func (s *Server) GetComponentStates(ctx context.Context, req *milvuspb.GetComponentStatesRequest) (*milvuspb.ComponentStates, error) {
code := s.GetStateCode()

View File

@ -90,14 +90,14 @@ func (handler *HealthHandler) ServeHTTP(w http.ResponseWriter, r *http.Request)
resp := &HealthResponse{
State: "OK",
}
unhealthyComponent := make([]string, 0)
ctx := context.Background()
healthNum := 0
for _, in := range handler.indicators {
handler.unregisterLock.RLock()
_, unregistered := handler.unregisteredRoles[in.GetName()]
handler.unregisterLock.RUnlock()
if unregistered {
healthNum++
continue
}
code := in.Health(ctx)
@ -105,13 +105,15 @@ func (handler *HealthHandler) ServeHTTP(w http.ResponseWriter, r *http.Request)
Name: in.GetName(),
Code: code,
})
if code == commonpb.StateCode_Healthy || code == commonpb.StateCode_StandBy {
healthNum++
if code != commonpb.StateCode_Healthy && code != commonpb.StateCode_StandBy {
unhealthyComponent = append(unhealthyComponent, in.GetName())
}
}
if healthNum != handler.indicatorNum {
resp.State = fmt.Sprintf("Not all components are healthy, %d/%d", healthNum, handler.indicatorNum)
if len(unhealthyComponent) > 0 {
resp.State = fmt.Sprintf("Not all components are healthy, %d/%d", handler.indicatorNum-len(unhealthyComponent), handler.indicatorNum)
log.Info("check health failed", zap.Strings("UnhealthyComponent", unhealthyComponent))
}
if resp.State == "OK" {

View File

@ -671,6 +671,7 @@ func (s *Server) Stop() error {
// UpdateStateCode stores code, converted to int32, as the server's status
// and then emits an info-level log entry carrying the state's string name.
func (s *Server) UpdateStateCode(code commonpb.StateCode) {
	raw := int32(code)
	s.status.Store(raw)

	// Log after the store so the entry reflects the value already published.
	stateField := zap.String("state", code.String())
	log.Ctx(s.ctx).Info("update querycoord state", stateField)
}
func (s *Server) State() commonpb.StateCode {