move restores to PartiallyFailed if >=1 error in restore results

Signed-off-by: Steve Kriss <krisss@vmware.com>
pull/1389/head
Steve Kriss 2019-04-23 16:26:16 -06:00
parent e7e666306c
commit eb30ec0666
4 changed files with 31 additions and 3 deletions

View File

@ -82,10 +82,14 @@ const (
// RestorePhaseInProgress means the restore is currently executing.
RestorePhaseInProgress RestorePhase = "InProgress"
// RestorePhaseCompleted means the restore has finished executing.
// Any relevant warnings or errors will be captured in the Status.
// RestorePhaseCompleted means the restore has run successfully
// without errors.
RestorePhaseCompleted RestorePhase = "Completed"
// RestorePhasePartiallyFailed means the restore has run to completion
// but encountered 1+ errors restoring individual items.
RestorePhasePartiallyFailed RestorePhase = "PartiallyFailed"
// RestorePhaseFailed means the restore was unable to execute.
// The failing error is recorded in status.FailureReason.
RestorePhaseFailed RestorePhase = "Failed"

View File

@ -259,6 +259,10 @@ func (c *restoreController) processRestore(restore *api.Restore) error {
restore.Status.Phase = api.RestorePhaseFailed
restore.Status.FailureReason = err.Error()
c.metrics.RegisterRestoreFailed(backupScheduleName)
} else if restore.Status.Errors > 0 {
c.logger.Debug("Restore partially failed")
restore.Status.Phase = api.RestorePhasePartiallyFailed
c.metrics.RegisterRestorePartialFailure(backupScheduleName)
} else {
c.logger.Debug("Restore completed")
restore.Status.Phase = api.RestorePhaseCompleted

View File

@ -300,6 +300,7 @@ func TestProcessQueueItem(t *testing.T) {
restorerError: errors.New("blarg"),
expectedErr: false,
expectedPhase: string(api.RestorePhaseInProgress),
expectedFinalPhase: string(api.RestorePhasePartiallyFailed),
expectedRestoreErrors: 1,
expectedRestorerCall: NewRestore("foo", "bar", "backup-1", "ns-1", "", api.RestorePhaseInProgress).Restore,
},
@ -595,7 +596,7 @@ func TestProcessQueueItem(t *testing.T) {
if test.expectedFinalPhase != "" {
expected = Patch{
Status: StatusPatch{
Phase: api.RestorePhaseCompleted,
Phase: api.RestorePhase(test.expectedFinalPhase),
Errors: test.expectedRestoreErrors,
},
}

View File

@ -42,6 +42,7 @@ const (
restoreAttemptTotal = "restore_attempt_total"
restoreValidationFailedTotal = "restore_validation_failed_total"
restoreSuccessTotal = "restore_success_total"
restorePartialFailureTotal = "restore_partial_failure_total"
restoreFailedTotal = "restore_failed_total"
volumeSnapshotAttemptTotal = "volume_snapshot_attempt_total"
volumeSnapshotSuccessTotal = "volume_snapshot_success_total"
@ -162,6 +163,14 @@ func NewServerMetrics() *ServerMetrics {
},
[]string{scheduleLabel},
),
restorePartialFailureTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: restorePartialFailureTotal,
Help: "Total number of partially failed restores",
},
[]string{scheduleLabel},
),
restoreFailedTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
@ -236,6 +245,9 @@ func (m *ServerMetrics) InitSchedule(scheduleName string) {
if c, ok := m.metrics[restoreAttemptTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
if c, ok := m.metrics[restorePartialFailureTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
if c, ok := m.metrics[restoreFailedTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
@ -346,6 +358,13 @@ func (m *ServerMetrics) RegisterRestoreSuccess(backupSchedule string) {
}
}
// RegisterRestorePartialFailure records a restore that partially failed.
func (m *ServerMetrics) RegisterRestorePartialFailure(backupSchedule string) {
if c, ok := m.metrics[restorePartialFailureTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(backupSchedule).Inc()
}
}
// RegisterRestoreFailed records a restore that failed.
func (m *ServerMetrics) RegisterRestoreFailed(backupSchedule string) {
if c, ok := m.metrics[restoreFailedTotal].(*prometheus.CounterVec); ok {