diff --git a/changelogs/unreleased/2196-skriss b/changelogs/unreleased/2196-skriss
new file mode 100644
index 000000000..14a301d16
--- /dev/null
+++ b/changelogs/unreleased/2196-skriss
@@ -0,0 +1 @@
+repopulate backup_last_successful_timestamp metrics for each schedule after server restart
diff --git a/pkg/builder/backup_builder.go b/pkg/builder/backup_builder.go
index f7f17ed31..6d0b1dc68 100644
--- a/pkg/builder/backup_builder.go
+++ b/pkg/builder/backup_builder.go
@@ -164,6 +164,12 @@ func (b *BackupBuilder) StartTimestamp(val time.Time) *BackupBuilder {
 	return b
 }
 
+// CompletionTimestamp sets the Backup's completion timestamp.
+func (b *BackupBuilder) CompletionTimestamp(val time.Time) *BackupBuilder {
+	b.object.Status.CompletionTimestamp = &metav1.Time{Time: val}
+	return b
+}
+
 // Hooks sets the Backup's hooks.
 func (b *BackupBuilder) Hooks(hooks velerov1api.BackupHooks) *BackupBuilder {
 	b.object.Spec.Hooks = hooks
diff --git a/pkg/controller/backup_controller.go b/pkg/controller/backup_controller.go
index 1b2a7e689..60545b8a5 100644
--- a/pkg/controller/backup_controller.go
+++ b/pkg/controller/backup_controller.go
@@ -148,12 +148,44 @@ func NewBackupController(
 }
 
 func (c *backupController) resync() {
+	// recompute backup_total metric
 	backups, err := c.lister.List(labels.Everything())
 	if err != nil {
 		c.logger.Error(err, "Error computing backup_total metric")
 	} else {
 		c.metrics.SetBackupTotal(int64(len(backups)))
 	}
+
+	// recompute backup_last_successful_timestamp metric for each
+	// schedule (including the empty schedule, i.e. ad-hoc backups)
+	for schedule, timestamp := range getLastSuccessBySchedule(backups) {
+		c.metrics.SetBackupLastSuccessfulTimestamp(schedule, timestamp)
+	}
+}
+
+// getLastSuccessBySchedule finds the most recent completed backup for each schedule
+// and returns a map of schedule name -> completion time of the most recent completed
+// backup. This map includes an entry for ad-hoc/non-scheduled backups, where the key
+// is the empty string.
+func getLastSuccessBySchedule(backups []*velerov1api.Backup) map[string]time.Time {
+	lastSuccessBySchedule := map[string]time.Time{}
+	for _, backup := range backups {
+		if backup.Status.Phase != velerov1api.BackupPhaseCompleted {
+			continue
+		}
+		if backup.Status.CompletionTimestamp == nil {
+			continue
+		}
+
+		schedule := backup.Labels[velerov1api.ScheduleNameLabel]
+		timestamp := backup.Status.CompletionTimestamp.Time
+
+		if timestamp.After(lastSuccessBySchedule[schedule]) {
+			lastSuccessBySchedule[schedule] = timestamp
+		}
+	}
+
+	return lastSuccessBySchedule
 }
 
 func (c *backupController) processBackup(key string) error {
diff --git a/pkg/controller/backup_controller_test.go b/pkg/controller/backup_controller_test.go
index ebb3beccf..e1899abaa 100644
--- a/pkg/controller/backup_controller_test.go
+++ b/pkg/controller/backup_controller_test.go
@@ -738,3 +738,119 @@ func TestValidateAndGetSnapshotLocations(t *testing.T) {
 		})
 	}
 }
+
+// Test_getLastSuccessBySchedule verifies that the getLastSuccessBySchedule helper function correctly returns
+// the completion timestamp of the most recent completed backup for each schedule, including an entry for ad-hoc
+// or non-scheduled backups.
+func Test_getLastSuccessBySchedule(t *testing.T) {
+	buildBackup := func(phase velerov1api.BackupPhase, completion time.Time, schedule string) *velerov1api.Backup {
+		b := builder.ForBackup("", "").
+			ObjectMeta(builder.WithLabels(velerov1api.ScheduleNameLabel, schedule)).
+			Phase(phase)
+
+		if !completion.IsZero() {
+			b.CompletionTimestamp(completion)
+		}
+
+		return b.Result()
+	}
+
+	// create a static "base time" that can be used to easily construct completion timestamps
+	// by using the .Add(...) method.
+	baseTime, err := time.Parse(time.RFC1123, time.RFC1123)
+	require.NoError(t, err)
+
+	tests := []struct {
+		name    string
+		backups []*velerov1api.Backup
+		want    map[string]time.Time
+	}{
+		{
+			name:    "when backups is nil, an empty map is returned",
+			backups: nil,
+			want:    map[string]time.Time{},
+		},
+		{
+			name:    "when backups is empty, an empty map is returned",
+			backups: []*velerov1api.Backup{},
+			want:    map[string]time.Time{},
+		},
+		{
+			name: "when multiple completed backups for a schedule exist, the latest one is returned",
+			backups: []*velerov1api.Backup{
+				buildBackup(velerov1api.BackupPhaseCompleted, baseTime, "schedule-1"),
+				buildBackup(velerov1api.BackupPhaseCompleted, baseTime.Add(time.Second), "schedule-1"),
+				buildBackup(velerov1api.BackupPhaseCompleted, baseTime.Add(-time.Second), "schedule-1"),
+			},
+			want: map[string]time.Time{
+				"schedule-1": baseTime.Add(time.Second),
+			},
+		},
+		{
+			name: "when the most recent backup for a schedule is Failed, the timestamp of the most recent Completed one is returned",
+			backups: []*velerov1api.Backup{
+				buildBackup(velerov1api.BackupPhaseCompleted, baseTime, "schedule-1"),
+				buildBackup(velerov1api.BackupPhaseFailed, baseTime.Add(time.Second), "schedule-1"),
+				buildBackup(velerov1api.BackupPhaseCompleted, baseTime.Add(-time.Second), "schedule-1"),
+			},
+			want: map[string]time.Time{
+				"schedule-1": baseTime,
+			},
+		},
+		{
+			name: "when there are no Completed backups for a schedule, it's not returned",
+			backups: []*velerov1api.Backup{
+				buildBackup(velerov1api.BackupPhaseInProgress, baseTime, "schedule-1"),
+				buildBackup(velerov1api.BackupPhaseFailed, baseTime.Add(time.Second), "schedule-1"),
+				buildBackup(velerov1api.BackupPhasePartiallyFailed, baseTime.Add(-time.Second), "schedule-1"),
+			},
+			want: map[string]time.Time{},
+		},
+		{
+			name: "when backups exist without a schedule, the most recent Completed one is returned",
+			backups: []*velerov1api.Backup{
+				buildBackup(velerov1api.BackupPhaseCompleted, baseTime, ""),
+				buildBackup(velerov1api.BackupPhaseFailed, baseTime.Add(time.Second), ""),
+				buildBackup(velerov1api.BackupPhaseCompleted, baseTime.Add(-time.Second), ""),
+			},
+			want: map[string]time.Time{
+				"": baseTime,
+			},
+		},
+		{
+			name: "when backups exist for multiple schedules, the most recent Completed timestamp for each schedule is returned",
+			backups: []*velerov1api.Backup{
+				// ad-hoc backups (no schedule)
+				buildBackup(velerov1api.BackupPhaseCompleted, baseTime.Add(30*time.Minute), ""),
+				buildBackup(velerov1api.BackupPhaseFailed, baseTime.Add(time.Hour), ""),
+				buildBackup(velerov1api.BackupPhaseCompleted, baseTime.Add(-time.Second), ""),
+
+				// schedule-1
+				buildBackup(velerov1api.BackupPhaseCompleted, baseTime, "schedule-1"),
+				buildBackup(velerov1api.BackupPhaseFailed, baseTime.Add(time.Second), "schedule-1"),
+				buildBackup(velerov1api.BackupPhaseCompleted, baseTime.Add(-time.Second), "schedule-1"),
+
+				// schedule-2
+				buildBackup(velerov1api.BackupPhaseCompleted, baseTime.Add(24*time.Hour), "schedule-2"),
+				buildBackup(velerov1api.BackupPhaseCompleted, baseTime.Add(48*time.Hour), "schedule-2"),
+				buildBackup(velerov1api.BackupPhaseCompleted, baseTime.Add(72*time.Hour), "schedule-2"),
+
+				// schedule-3
+				buildBackup(velerov1api.BackupPhaseNew, baseTime, "schedule-3"),
+				buildBackup(velerov1api.BackupPhaseInProgress, baseTime.Add(time.Minute), "schedule-3"),
+				buildBackup(velerov1api.BackupPhasePartiallyFailed, baseTime.Add(2*time.Minute), "schedule-3"),
+			},
+			want: map[string]time.Time{
+				"":           baseTime.Add(30 * time.Minute),
+				"schedule-1": baseTime,
+				"schedule-2": baseTime.Add(72 * time.Hour),
+			},
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			assert.Equal(t, tc.want, getLastSuccessBySchedule(tc.backups))
+		})
+	}
+}
diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
index 716c19d91..d3ed0cbee 100644
--- a/pkg/metrics/metrics.go
+++ b/pkg/metrics/metrics.go
@@ -297,9 +297,9 @@ func (m *ServerMetrics) SetBackupTarballSizeBytesGauge(backupSchedule string, si
 }
 
-// SetBackupLastSuccessfulTimestamp records the last time a backup ran successfully, Unix timestamp in seconds
-func (m *ServerMetrics) SetBackupLastSuccessfulTimestamp(backupSchedule string) {
+// SetBackupLastSuccessfulTimestamp records timestamp as the last time a backup for backupSchedule ran successfully, as a Unix timestamp in seconds
+func (m *ServerMetrics) SetBackupLastSuccessfulTimestamp(backupSchedule string, timestamp time.Time) {
 	if g, ok := m.metrics[backupLastSuccessfulTimestamp].(*prometheus.GaugeVec); ok {
-		g.WithLabelValues(backupSchedule).Set(float64(time.Now().Unix()))
+		g.WithLabelValues(backupSchedule).Set(float64(timestamp.Unix()))
 	}
 }
 
@@ -322,7 +322,7 @@ func (m *ServerMetrics) RegisterBackupSuccess(backupSchedule string) {
 	if c, ok := m.metrics[backupSuccessTotal].(*prometheus.CounterVec); ok {
 		c.WithLabelValues(backupSchedule).Inc()
 	}
-	m.SetBackupLastSuccessfulTimestamp(backupSchedule)
+	m.SetBackupLastSuccessfulTimestamp(backupSchedule, time.Now())
 }
 
 // RegisterBackupPartialFailure records a partially failed backup.