diff --git a/changelogs/unreleased/5779-allenxu404 b/changelogs/unreleased/5779-allenxu404 new file mode 100644 index 000000000..d05bf4364 --- /dev/null +++ b/changelogs/unreleased/5779-allenxu404 @@ -0,0 +1,3 @@ +Add File system backup related metrics to Grafana dashboard +Add metrics backup_warning_total for record of total warnings +Add metrics backup_last_status for record of last status of the backup \ No newline at end of file diff --git a/pkg/controller/backup_controller.go b/pkg/controller/backup_controller.go index 67874c7e5..be1c1872e 100644 --- a/pkg/controller/backup_controller.go +++ b/pkg/controller/backup_controller.go @@ -303,12 +303,16 @@ func (c *backupController) processBackup(key string) error { switch request.Status.Phase { case velerov1api.BackupPhaseCompleted: c.metrics.RegisterBackupSuccess(backupScheduleName) + c.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusSucc) case velerov1api.BackupPhasePartiallyFailed: c.metrics.RegisterBackupPartialFailure(backupScheduleName) + c.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusFailure) case velerov1api.BackupPhaseFailed: c.metrics.RegisterBackupFailed(backupScheduleName) + c.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusFailure) case velerov1api.BackupPhaseFailedValidation: c.metrics.RegisterBackupValidationFailure(backupScheduleName) + c.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusFailure) } log.Debug("Updating backup's final status") @@ -789,6 +793,10 @@ func recordBackupMetrics(log logrus.FieldLogger, backup *velerov1api.Backup, bac serverMetrics.RegisterBackupItemsTotalGauge(backupScheduleName, backup.Status.Progress.TotalItems) } serverMetrics.RegisterBackupItemsErrorsGauge(backupScheduleName, backup.Status.Errors) + + if backup.Status.Warnings > 0 { + serverMetrics.RegisterBackupWarning(backupScheduleName) + } } func persistBackup(backup *pkgbackup.Request, diff --git 
a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index eb74072ad..75d2e79d5 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -45,6 +45,8 @@ const ( backupLastSuccessfulTimestamp = "backup_last_successful_timestamp" backupItemsTotalGauge = "backup_items_total" backupItemsErrorsGauge = "backup_items_errors" + backupWarningTotal = "backup_warning_total" + backupLastStatus = "backup_last_status" restoreTotal = "restore_total" restoreAttemptTotal = "restore_attempt_total" restoreValidationFailedTotal = "restore_validation_failed_total" @@ -70,6 +72,10 @@ const ( pvbNameLabel = "pod_volume_backup" scheduleLabel = "schedule" backupNameLabel = "backupName" + + // metrics values + BackupLastStatusSucc int64 = 1 + BackupLastStatusFailure int64 = 0 ) // NewServerMetrics returns new ServerMetrics @@ -198,6 +204,22 @@ func NewServerMetrics() *ServerMetrics { }, []string{scheduleLabel}, ), + backupWarningTotal: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: metricNamespace, + Name: backupWarningTotal, + Help: "Total number of warned backups", + }, + []string{scheduleLabel}, + ), + backupLastStatus: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: metricNamespace, + Name: backupLastStatus, + Help: "Last status of the backup. 
A value of 1 is success, 0 is failure", + }, + []string{scheduleLabel}, + ), restoreTotal: prometheus.NewGauge( prometheus.GaugeOpts{ Namespace: metricNamespace, @@ -386,6 +408,12 @@ func (m *ServerMetrics) InitSchedule(scheduleName string) { if c, ok := m.metrics[backupItemsErrorsGauge].(*prometheus.GaugeVec); ok { c.WithLabelValues(scheduleName).Add(0) } + if c, ok := m.metrics[backupWarningTotal].(*prometheus.CounterVec); ok { + c.WithLabelValues(scheduleName).Add(0) + } + if c, ok := m.metrics[backupLastStatus].(*prometheus.GaugeVec); ok { + c.WithLabelValues(scheduleName).Add(0) + } if c, ok := m.metrics[restoreAttemptTotal].(*prometheus.CounterVec); ok { c.WithLabelValues(scheduleName).Add(0) } @@ -559,6 +587,20 @@ func (m *ServerMetrics) RegisterBackupItemsErrorsGauge(backupSchedule string, it } } +// RegisterBackupWarning records a warned backup. +func (m *ServerMetrics) RegisterBackupWarning(backupSchedule string) { + if c, ok := m.metrics[backupWarningTotal].(*prometheus.CounterVec); ok { + c.WithLabelValues(backupSchedule).Inc() + } +} + +// RegisterBackupLastStatus records the last status of the backup. +func (m *ServerMetrics) RegisterBackupLastStatus(backupSchedule string, lastStatus int64) { + if g, ok := m.metrics[backupLastStatus].(*prometheus.GaugeVec); ok { + g.WithLabelValues(backupSchedule).Set(float64(lastStatus)) + } +} + // toSeconds translates a time.Duration value into a float64 // representing the number of seconds in that duration. func toSeconds(d time.Duration) float64 {