Merge pull request #5779 from allenxu404/issue-matrics

add prometheus metrics to record warning total and last status of backups
pull/5812/head
qiuming 2023-01-31 16:02:30 +08:00 committed by GitHub
commit 51568525cb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 53 additions and 0 deletions

View File

@ -0,0 +1,3 @@
Add File system backup related metrics to Grafana dashboard
Add metric backup_warning_total to record the total number of backups that finished with warnings
Add metric backup_last_status to record the last status of the backup

View File

@ -303,12 +303,16 @@ func (c *backupController) processBackup(key string) error {
switch request.Status.Phase {
case velerov1api.BackupPhaseCompleted:
c.metrics.RegisterBackupSuccess(backupScheduleName)
c.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusSucc)
case velerov1api.BackupPhasePartiallyFailed:
c.metrics.RegisterBackupPartialFailure(backupScheduleName)
c.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusFailure)
case velerov1api.BackupPhaseFailed:
c.metrics.RegisterBackupFailed(backupScheduleName)
c.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusFailure)
case velerov1api.BackupPhaseFailedValidation:
c.metrics.RegisterBackupValidationFailure(backupScheduleName)
c.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusFailure)
}
log.Debug("Updating backup's final status")
@ -789,6 +793,10 @@ func recordBackupMetrics(log logrus.FieldLogger, backup *velerov1api.Backup, bac
serverMetrics.RegisterBackupItemsTotalGauge(backupScheduleName, backup.Status.Progress.TotalItems)
}
serverMetrics.RegisterBackupItemsErrorsGauge(backupScheduleName, backup.Status.Errors)
if backup.Status.Warnings > 0 {
serverMetrics.RegisterBackupWarning(backupScheduleName)
}
}
func persistBackup(backup *pkgbackup.Request,

View File

@ -45,6 +45,8 @@ const (
backupLastSuccessfulTimestamp = "backup_last_successful_timestamp"
backupItemsTotalGauge = "backup_items_total"
backupItemsErrorsGauge = "backup_items_errors"
backupWarningTotal = "backup_warning_total"
backupLastStatus = "backup_last_status"
restoreTotal = "restore_total"
restoreAttemptTotal = "restore_attempt_total"
restoreValidationFailedTotal = "restore_validation_failed_total"
@ -70,6 +72,10 @@ const (
pvbNameLabel = "pod_volume_backup"
scheduleLabel = "schedule"
backupNameLabel = "backupName"
// metrics values
BackupLastStatusSucc int64 = 1
BackupLastStatusFailure int64 = 0
)
// NewServerMetrics returns new ServerMetrics
@ -198,6 +204,22 @@ func NewServerMetrics() *ServerMetrics {
},
[]string{scheduleLabel},
),
backupWarningTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: backupWarningTotal,
Help: "Total number of warned backups",
},
[]string{scheduleLabel},
),
backupLastStatus: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: metricNamespace,
Name: backupLastStatus,
Help: "Last status of the backup. A value of 1 is success, 0 is failure",
},
[]string{scheduleLabel},
),
restoreTotal: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: metricNamespace,
@ -386,6 +408,12 @@ func (m *ServerMetrics) InitSchedule(scheduleName string) {
if c, ok := m.metrics[backupItemsErrorsGauge].(*prometheus.GaugeVec); ok {
c.WithLabelValues(scheduleName).Add(0)
}
if c, ok := m.metrics[backupWarningTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Add(0)
}
if c, ok := m.metrics[backupLastStatus].(*prometheus.GaugeVec); ok {
c.WithLabelValues(scheduleName).Add(0)
}
if c, ok := m.metrics[restoreAttemptTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Add(0)
}
@ -559,6 +587,20 @@ func (m *ServerMetrics) RegisterBackupItemsErrorsGauge(backupSchedule string, it
}
}
// RegisterBackupWarning increments the backup_warning_total counter for the
// series labelled with backupSchedule. It does nothing when the metric is
// missing from m.metrics or is not a *prometheus.CounterVec.
func (m *ServerMetrics) RegisterBackupWarning(backupSchedule string) {
	warnings, ok := m.metrics[backupWarningTotal].(*prometheus.CounterVec)
	if !ok {
		return
	}
	warnings.WithLabelValues(backupSchedule).Inc()
}
// RegisterBackupLastStatus sets the backup_last_status gauge for the series
// labelled with backupSchedule to lastStatus (converted to float64). It does
// nothing when the metric is missing from m.metrics or is not a
// *prometheus.GaugeVec.
func (m *ServerMetrics) RegisterBackupLastStatus(backupSchedule string, lastStatus int64) {
	statusGauge, ok := m.metrics[backupLastStatus].(*prometheus.GaugeVec)
	if !ok {
		return
	}
	value := float64(lastStatus)
	statusGauge.WithLabelValues(backupSchedule).Set(value)
}
// toSeconds translates a time.Duration value into a float64
// representing the number of seconds in that duration.
func toSeconds(d time.Duration) float64 {