limit the frequency of GetMetrics() log (#21514) (#21519)

Signed-off-by: yah01 <yang.cen@zilliz.com>
pull/21472/head
yah01 2023-01-04 20:03:35 +08:00 committed by GitHub
parent d88846b20c
commit 989ea16a20
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 82 additions and 93 deletions

View File

@ -831,7 +831,7 @@ func (s *Server) ShowConfigurations(ctx context.Context, req *internalpb.ShowCon
func (s *Server) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest) (*milvuspb.GetMetricsResponse, error) {
if s.isClosed() {
log.Warn("DataCoord.GetMetrics failed",
zap.Int64("node_id", Params.DataCoordCfg.GetNodeID()),
zap.Int64("nodeID", s.session.ServerID),
zap.String("req", req.Request),
zap.Error(errDataCoordIsUnhealthy(Params.DataCoordCfg.GetNodeID())))
@ -848,7 +848,7 @@ func (s *Server) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest
metricType, err := metricsinfo.ParseMetricType(req.Request)
if err != nil {
log.Warn("DataCoord.GetMetrics failed to parse metric type",
zap.Int64("node_id", Params.DataCoordCfg.GetNodeID()),
zap.Int64("nodeID", s.session.ServerID),
zap.String("req", req.Request),
zap.Error(err))
@ -874,10 +874,10 @@ func (s *Server) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest
}, nil
}
log.Debug("DataCoord.GetMetrics",
zap.Int64("node_id", Params.DataCoordCfg.GetNodeID()),
log.RatedDebug(60, "DataCoord.GetMetrics",
zap.Int64("nodeID", s.session.ServerID),
zap.String("req", req.Request),
zap.String("metric_type", metricType),
zap.String("metricType", metricType),
zap.Any("metrics", metrics), // TODO(dragondriver): necessary? may be very large
zap.Error(err))
@ -885,9 +885,9 @@ func (s *Server) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest
}
log.RatedWarn(60.0, "DataCoord.GetMetrics failed, request metric type is not implemented yet",
zap.Int64("node_id", Params.DataCoordCfg.GetNodeID()),
zap.Int64("nodeID", s.session.ServerID),
zap.String("req", req.Request),
zap.String("metric_type", metricType))
zap.String("metricType", metricType))
return &milvuspb.GetMetricsResponse{
ComponentName: metricsinfo.ConstructComponentName(typeutil.DataCoordRole, Params.DataCoordCfg.GetNodeID()),

View File

@ -760,14 +760,14 @@ func (node *DataNode) ShowConfigurations(ctx context.Context, req *internalpb.Sh
func (node *DataNode) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest) (*milvuspb.GetMetricsResponse, error) {
if !node.isHealthy() {
log.Warn("DataNode.GetMetrics failed",
zap.Int64("node_id", Params.DataNodeCfg.GetNodeID()),
zap.Int64("nodeID", node.session.ServerID),
zap.String("req", req.Request),
zap.Error(errDataNodeIsUnhealthy(Params.DataNodeCfg.GetNodeID())))
zap.Error(errDataNodeIsUnhealthy(node.session.ServerID)))
return &milvuspb.GetMetricsResponse{
Status: &commonpb.Status{
ErrorCode: commonpb.ErrorCode_UnexpectedError,
Reason: msgDataNodeIsUnhealthy(Params.DataNodeCfg.GetNodeID()),
Reason: msgDataNodeIsUnhealthy(node.session.ServerID),
},
}, nil
}
@ -775,14 +775,14 @@ func (node *DataNode) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRe
metricType, err := metricsinfo.ParseMetricType(req.Request)
if err != nil {
log.Warn("DataNode.GetMetrics failed to parse metric type",
zap.Int64("node_id", Params.DataNodeCfg.GetNodeID()),
zap.Int64("nodeID", node.session.ServerID),
zap.String("req", req.Request),
zap.Error(err))
return &milvuspb.GetMetricsResponse{
Status: &commonpb.Status{
ErrorCode: commonpb.ErrorCode_UnexpectedError,
Reason: fmt.Sprintf("datanode GetMetrics failed, nodeID=%d, err=%s", Params.DataNodeCfg.GetNodeID(), err.Error()),
Reason: fmt.Sprintf("datanode GetMetrics failed, nodeID=%d, err=%s", node.session.ServerID, err.Error()),
},
}, nil
}
@ -790,11 +790,11 @@ func (node *DataNode) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRe
if metricType == metricsinfo.SystemInfoMetrics {
systemInfoMetrics, err := node.getSystemInfoMetrics(ctx, req)
if err != nil {
log.Warn("DataNode GetMetrics failed", zap.Int64("nodeID", Params.DataNodeCfg.GetNodeID()), zap.Error(err))
log.Warn("DataNode GetMetrics failed", zap.Int64("nodeID", node.session.ServerID), zap.Error(err))
return &milvuspb.GetMetricsResponse{
Status: &commonpb.Status{
ErrorCode: commonpb.ErrorCode_UnexpectedError,
Reason: fmt.Sprintf("datanode GetMetrics failed, nodeID=%d, err=%s", Params.DataNodeCfg.GetNodeID(), err.Error()),
Reason: fmt.Sprintf("datanode GetMetrics failed, nodeID=%d, err=%s", node.session.ServerID, err.Error()),
},
}, nil
}
@ -802,8 +802,8 @@ func (node *DataNode) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRe
return systemInfoMetrics, nil
}
log.Debug("DataNode.GetMetrics failed, request metric type is not implemented yet",
zap.Int64("node_id", Params.DataNodeCfg.GetNodeID()),
log.RatedWarn(60, "DataNode.GetMetrics failed, request metric type is not implemented yet",
zap.Int64("nodeID", node.session.ServerID),
zap.String("req", req.Request),
zap.String("metric_type", metricType))

View File

@ -944,7 +944,7 @@ func (i *IndexCoord) ShowConfigurations(ctx context.Context, req *internalpb.Sho
// GetMetrics gets the metrics info of IndexCoord.
func (i *IndexCoord) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest) (*milvuspb.GetMetricsResponse, error) {
log.Debug("IndexCoord.GetMetrics", zap.Int64("node id", i.serverID), zap.String("req", req.Request))
log.RatedInfo(60, "IndexCoord.GetMetrics", zap.Int64("nodeID", i.serverID), zap.String("req", req.Request))
if !i.isHealthy() {
log.Warn(msgIndexCoordIsUnhealthy(i.serverID))
@ -961,7 +961,7 @@ func (i *IndexCoord) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsReq
metricType, err := metricsinfo.ParseMetricType(req.Request)
if err != nil {
log.Error("IndexCoord.GetMetrics failed to parse metric type",
zap.Int64("node id", i.session.ServerID),
zap.Int64("nodeID", i.session.ServerID),
zap.String("req", req.Request),
zap.Error(err))
@ -974,35 +974,30 @@ func (i *IndexCoord) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsReq
}, nil
}
log.Debug("IndexCoord.GetMetrics",
zap.String("metric type", metricType))
if metricType == metricsinfo.SystemInfoMetrics {
ret, err := i.metricsCacheManager.GetSystemInfoMetrics()
if err == nil && ret != nil {
return ret, nil
metrics, err := i.metricsCacheManager.GetSystemInfoMetrics()
if err != nil {
// Miss cache
metrics, err = getSystemInfoMetrics(ctx, req, i)
}
log.Warn("failed to get system info metrics from cache, recompute instead",
zap.Error(err))
metrics, err := getSystemInfoMetrics(ctx, req, i)
log.Debug("IndexCoord.GetMetrics",
zap.Int64("node id", i.session.ServerID),
log.RatedDebug(60, "IndexCoord.GetMetrics",
zap.Int64("nodeID", i.session.ServerID),
zap.String("req", req.Request),
zap.String("metric type", metricType),
zap.String("metricType", metricType),
zap.String("metrics", metrics.Response), // TODO(dragondriver): necessary? may be very large
zap.Error(err))
zap.Error(err),
)
i.metricsCacheManager.UpdateSystemInfoMetrics(metrics)
return metrics, nil
}
log.Debug("IndexCoord.GetMetrics failed, request metric type is not implemented yet",
zap.Int64("node id", i.session.ServerID),
log.RatedWarn(60, "IndexCoord.GetMetrics failed, request metric type is not implemented yet",
zap.Int64("nodeID", i.session.ServerID),
zap.String("req", req.Request),
zap.String("metric type", metricType))
zap.String("metricType", metricType))
return &milvuspb.GetMetricsResponse{
Status: &commonpb.Status{

View File

@ -213,7 +213,7 @@ func (i *IndexNode) GetJobStats(ctx context.Context, req *indexpb.GetJobStatsReq
func (i *IndexNode) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest) (*milvuspb.GetMetricsResponse, error) {
if !i.isHealthy() {
log.Ctx(ctx).Warn("IndexNode.GetMetrics failed",
zap.Int64("node_id", Params.IndexNodeCfg.GetNodeID()),
zap.Int64("nodeID", i.GetNodeID()),
zap.String("req", req.Request),
zap.Error(errIndexNodeIsUnhealthy(Params.IndexNodeCfg.GetNodeID())))
@ -229,7 +229,7 @@ func (i *IndexNode) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequ
metricType, err := metricsinfo.ParseMetricType(req.Request)
if err != nil {
log.Ctx(ctx).Warn("IndexNode.GetMetrics failed to parse metric type",
zap.Int64("node_id", Params.IndexNodeCfg.GetNodeID()),
zap.Int64("nodeID", i.GetNodeID()),
zap.String("req", req.Request),
zap.Error(err))
@ -245,8 +245,8 @@ func (i *IndexNode) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequ
if metricType == metricsinfo.SystemInfoMetrics {
metrics, err := getSystemInfoMetrics(ctx, req, i)
log.Ctx(ctx).Debug("IndexNode.GetMetrics",
zap.Int64("node_id", Params.IndexNodeCfg.GetNodeID()),
log.Ctx(ctx).RatedDebug(60, "IndexNode.GetMetrics",
zap.Int64("nodeID", i.GetNodeID()),
zap.String("req", req.Request),
zap.String("metric_type", metricType),
zap.Error(err))
@ -254,8 +254,8 @@ func (i *IndexNode) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequ
return metrics, nil
}
log.Ctx(ctx).Warn("IndexNode.GetMetrics failed, request metric type is not implemented yet",
zap.Int64("node_id", Params.IndexNodeCfg.GetNodeID()),
log.Ctx(ctx).RatedWarn(60, "IndexNode.GetMetrics failed, request metric type is not implemented yet",
zap.Int64("nodeID", i.GetNodeID()),
zap.String("req", req.Request),
zap.String("metric_type", metricType))

View File

@ -3658,13 +3658,18 @@ func (node *Proxy) RegisterLink(ctx context.Context, req *milvuspb.RegisterLinkR
// GetMetrics gets the metrics of the proxy.
// TODO(dragondriver): cache the metrics and set a retention period for the cache.
func (node *Proxy) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest) (*milvuspb.GetMetricsResponse, error) {
log.Debug("Proxy.GetMetrics",
zap.Int64("node_id", Params.ProxyCfg.GetNodeID()),
sp, ctx := trace.StartSpanFromContextWithOperationName(ctx, "Proxy-GetMetrics")
defer sp.Finish()
log := log.Ctx(ctx)
log.RatedDebug(60, "Proxy.GetMetrics",
zap.Int64("nodeID", node.session.ServerID),
zap.String("req", req.Request))
if !node.checkHealthy() {
log.Warn("Proxy.GetMetrics failed",
zap.Int64("node_id", Params.ProxyCfg.GetNodeID()),
zap.Int64("nodeID", node.session.ServerID),
zap.String("req", req.Request),
zap.Error(errProxyIsUnhealthy(Params.ProxyCfg.GetNodeID())))
@ -3680,7 +3685,7 @@ func (node *Proxy) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsReque
metricType, err := metricsinfo.ParseMetricType(req.Request)
if err != nil {
log.Warn("Proxy.GetMetrics failed to parse metric type",
zap.Int64("node_id", Params.ProxyCfg.GetNodeID()),
zap.Int64("nodeID", node.session.ServerID),
zap.String("req", req.Request),
zap.Error(err))
@ -3693,9 +3698,6 @@ func (node *Proxy) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsReque
}, nil
}
log.Debug("Proxy.GetMetrics",
zap.String("metric_type", metricType))
req.Base = commonpbutil.NewMsgBase(
commonpbutil.WithMsgType(commonpb.MsgType_SystemInfo),
commonpbutil.WithMsgID(0),
@ -3703,19 +3705,15 @@ func (node *Proxy) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsReque
commonpbutil.WithSourceID(Params.ProxyCfg.GetNodeID()),
)
if metricType == metricsinfo.SystemInfoMetrics {
ret, err := node.metricsCacheManager.GetSystemInfoMetrics()
if err == nil && ret != nil {
return ret, nil
metrics, err := node.metricsCacheManager.GetSystemInfoMetrics()
if err != nil {
metrics, err = getSystemInfoMetrics(ctx, req, node)
}
log.Debug("failed to get system info metrics from cache, recompute instead",
zap.Error(err))
metrics, err := getSystemInfoMetrics(ctx, req, node)
log.Debug("Proxy.GetMetrics",
zap.Int64("node_id", Params.ProxyCfg.GetNodeID()),
log.RatedDebug(60, "Proxy.GetMetrics",
zap.Int64("nodeID", node.session.ServerID),
zap.String("req", req.Request),
zap.String("metric_type", metricType),
zap.String("metricType", metricType),
zap.Any("metrics", metrics), // TODO(dragondriver): necessary? may be very large
zap.Error(err))
@ -3724,10 +3722,10 @@ func (node *Proxy) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsReque
return metrics, nil
}
log.Warn("Proxy.GetMetrics failed, request metric type is not implemented yet",
zap.Int64("node_id", Params.ProxyCfg.GetNodeID()),
log.RatedWarn(60, "Proxy.GetMetrics failed, request metric type is not implemented yet",
zap.Int64("nodeID", node.session.ServerID),
zap.String("req", req.Request),
zap.String("metric_type", metricType))
zap.String("metricType", metricType))
return &milvuspb.GetMetricsResponse{
Status: &commonpb.Status{
@ -3741,6 +3739,13 @@ func (node *Proxy) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsReque
// GetProxyMetrics gets the metrics of the proxy. It is an internal interface that differs from the GetMetrics interface
// in that it only obtains the metrics of the Proxy itself, not the topological metrics of the Query cluster and Data cluster.
func (node *Proxy) GetProxyMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest) (*milvuspb.GetMetricsResponse, error) {
sp, ctx := trace.StartSpanFromContextWithOperationName(ctx, "Proxy-GetProxyMetrics")
defer sp.Finish()
log := log.Ctx(ctx).With(
zap.Int64("nodeID", node.session.ServerID),
zap.String("req", req.Request))
if !node.checkHealthy() {
log.Warn("Proxy.GetProxyMetrics failed",
zap.Int64("node_id", Params.ProxyCfg.GetNodeID()),
@ -3794,17 +3799,13 @@ func (node *Proxy) GetProxyMetrics(ctx context.Context, req *milvuspb.GetMetrics
}
log.Debug("Proxy.GetProxyMetrics",
zap.Int64("node_id", Params.ProxyCfg.GetNodeID()),
zap.String("req", req.Request),
zap.String("metric_type", metricType))
zap.String("metricType", metricType))
return proxyMetrics, nil
}
log.Warn("Proxy.GetProxyMetrics failed, request metric type is not implemented yet",
zap.Int64("node_id", Params.ProxyCfg.GetNodeID()),
zap.String("req", req.Request),
zap.String("metric_type", metricType))
zap.String("metricType", metricType))
return &milvuspb.GetMetricsResponse{
Status: &commonpb.Status{

View File

@ -540,9 +540,7 @@ func (s *Server) ShowConfigurations(ctx context.Context, req *internalpb.ShowCon
}
func (s *Server) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest) (*milvuspb.GetMetricsResponse, error) {
log := log.With(zap.Int64("msgID", req.Base.GetMsgID()))
log.Debug("get metrics request received",
log.RatedDebug(60, "get metrics request received",
zap.String("metricType", req.GetRequest()))
if s.status.Load() != commonpb.StateCode_Healthy {

View File

@ -1175,7 +1175,7 @@ func (node *QueryNode) SyncReplicaSegments(ctx context.Context, req *querypb.Syn
func (node *QueryNode) ShowConfigurations(ctx context.Context, req *internalpb.ShowConfigurationsRequest) (*internalpb.ShowConfigurationsResponse, error) {
if !node.isHealthy() {
log.Warn("QueryNode.ShowConfigurations failed",
zap.Int64("nodeId", Params.QueryNodeCfg.GetNodeID()),
zap.Int64("nodeID", node.session.ServerID),
zap.String("req", req.Pattern),
zap.Error(errQueryNodeIsUnhealthy(Params.QueryNodeCfg.GetNodeID())))
@ -1195,7 +1195,7 @@ func (node *QueryNode) ShowConfigurations(ctx context.Context, req *internalpb.S
func (node *QueryNode) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest) (*milvuspb.GetMetricsResponse, error) {
if !node.isHealthy() {
log.Ctx(ctx).Warn("QueryNode.GetMetrics failed",
zap.Int64("nodeId", Params.QueryNodeCfg.GetNodeID()),
zap.Int64("nodeID", node.session.ServerID),
zap.String("req", req.Request),
zap.Error(errQueryNodeIsUnhealthy(Params.QueryNodeCfg.GetNodeID())))
@ -1211,7 +1211,7 @@ func (node *QueryNode) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsR
metricType, err := metricsinfo.ParseMetricType(req.Request)
if err != nil {
log.Ctx(ctx).Warn("QueryNode.GetMetrics failed to parse metric type",
zap.Int64("nodeId", Params.QueryNodeCfg.GetNodeID()),
zap.Int64("nodeID", node.session.ServerID),
zap.String("req", req.Request),
zap.Error(err))
@ -1227,7 +1227,7 @@ func (node *QueryNode) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsR
queryNodeMetrics, err := getSystemInfoMetrics(ctx, req, node)
if err != nil {
log.Ctx(ctx).Warn("QueryNode.GetMetrics failed",
zap.Int64("NodeId", Params.QueryNodeCfg.GetNodeID()),
zap.Int64("nodeID", node.session.ServerID),
zap.String("req", req.Request),
zap.String("metricType", metricType),
zap.Error(err))
@ -1241,8 +1241,8 @@ func (node *QueryNode) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsR
return queryNodeMetrics, nil
}
log.Ctx(ctx).Debug("QueryNode.GetMetrics failed, request metric type is not implemented yet",
zap.Int64("NodeId", Params.QueryNodeCfg.GetNodeID()),
log.Ctx(ctx).RatedDebug(60, "QueryNode.GetMetrics failed, request metric type is not implemented yet",
zap.Int64("nodeID", node.session.ServerID),
zap.String("req", req.Request),
zap.String("metricType", metricType))

View File

@ -1403,41 +1403,36 @@ func (c *Core) GetMetrics(ctx context.Context, in *milvuspb.GetMetricsRequest) (
metricType, err := metricsinfo.ParseMetricType(in.Request)
if err != nil {
log.Warn("ParseMetricType failed", zap.String("role", typeutil.RootCoordRole),
zap.Int64("node_id", c.session.ServerID), zap.String("req", in.Request), zap.Error(err))
zap.Int64("nodeID", c.session.ServerID), zap.String("req", in.Request), zap.Error(err))
return &milvuspb.GetMetricsResponse{
Status: failStatus(commonpb.ErrorCode_UnexpectedError, "ParseMetricType failed: "+err.Error()),
Response: "",
}, nil
}
log.Debug("GetMetrics success", zap.String("role", typeutil.RootCoordRole),
zap.String("metric_type", metricType), zap.Int64("msgID", in.GetBase().GetMsgID()))
if metricType == metricsinfo.SystemInfoMetrics {
ret, err := c.metricsCacheManager.GetSystemInfoMetrics()
if err == nil && ret != nil {
return ret, nil
metrics, err := c.metricsCacheManager.GetSystemInfoMetrics()
if err != nil {
metrics, err = c.getSystemInfoMetrics(ctx, in)
}
log.Debug("GetSystemInfoMetrics from cache failed, recompute instead", zap.String("role", typeutil.RootCoordRole),
zap.Int64("msgID", in.GetBase().GetMsgID()), zap.Error(err))
systemInfoMetrics, err := c.getSystemInfoMetrics(ctx, in)
if err != nil {
log.Warn("GetSystemInfoMetrics failed", zap.String("role", typeutil.RootCoordRole),
zap.String("metric_type", metricType), zap.Int64("msgID", in.GetBase().GetMsgID()), zap.Error(err))
log.Warn("GetSystemInfoMetrics failed",
zap.String("role", typeutil.RootCoordRole),
zap.String("metricType", metricType),
zap.Error(err))
return &milvuspb.GetMetricsResponse{
Status: failStatus(commonpb.ErrorCode_UnexpectedError, fmt.Sprintf("getSystemInfoMetrics failed: %s", err.Error())),
Response: "",
}, nil
}
c.metricsCacheManager.UpdateSystemInfoMetrics(systemInfoMetrics)
return systemInfoMetrics, err
c.metricsCacheManager.UpdateSystemInfoMetrics(metrics)
return metrics, err
}
log.Warn("GetMetrics failed, metric type not implemented", zap.String("role", typeutil.RootCoordRole),
zap.String("metric_type", metricType), zap.Int64("msgID", in.GetBase().GetMsgID()))
log.RatedWarn(60, "GetMetrics failed, metric type not implemented", zap.String("role", typeutil.RootCoordRole),
zap.String("metricType", metricType))
return &milvuspb.GetMetricsResponse{
Status: failStatus(commonpb.ErrorCode_UnexpectedError, metricsinfo.MsgUnimplementedMetric),