Merge pull request #1389 from skriss/restore-partially-failed

Add PartiallyFailed phase for restores
Nolan Brubaker 2019-04-25 17:06:26 -04:00 committed by GitHub
commit bf19623e82
6 changed files with 47 additions and 5 deletions


@@ -82,10 +82,14 @@ const (
 // RestorePhaseInProgress means the restore is currently executing.
 RestorePhaseInProgress RestorePhase = "InProgress"
-// RestorePhaseCompleted means the restore has finished executing.
-// Any relevant warnings or errors will be captured in the Status.
+// RestorePhaseCompleted means the restore has run successfully
+// without errors.
 RestorePhaseCompleted RestorePhase = "Completed"
+// RestorePhasePartiallyFailed means the restore has run to completion
+// but encountered 1+ errors restoring individual items.
+RestorePhasePartiallyFailed RestorePhase = "PartiallyFailed"
 // RestorePhaseFailed means the restore was unable to execute.
 // The failing error is recorded in status.FailureReason.
 RestorePhaseFailed RestorePhase = "Failed"
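
With the new constant in place, a restore now has three terminal phases instead of two. Below is a minimal sketch of how a caller polling restore status might treat them; the phase type is copied locally so the example stands alone, and the isTerminal helper is illustrative, not part of this change:

package main

import "fmt"

// RestorePhase mirrors the API type above; defined locally so the
// sketch compiles without importing the Velero API package.
type RestorePhase string

const (
	RestorePhaseNew             RestorePhase = "New"
	RestorePhaseInProgress      RestorePhase = "InProgress"
	RestorePhaseCompleted       RestorePhase = "Completed"
	RestorePhasePartiallyFailed RestorePhase = "PartiallyFailed"
	RestorePhaseFailed          RestorePhase = "Failed"
)

// isTerminal reports whether a restore has finished processing.
// PartiallyFailed counts as terminal: the restore ran to completion,
// it just hit errors on individual items.
func isTerminal(p RestorePhase) bool {
	switch p {
	case RestorePhaseCompleted, RestorePhasePartiallyFailed, RestorePhaseFailed:
		return true
	default:
		return false
	}
}

func main() {
	fmt.Println(isTerminal(RestorePhasePartiallyFailed)) // true
	fmt.Println(isTerminal(RestorePhaseInProgress))      // false
}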


@@ -50,7 +50,10 @@ func NewLogsCommand(f client.Factory) *cobra.Command {
 cmd.Exit("Error checking for restore %q: %v", restoreName, err)
 }
-if restore.Status.Phase != v1.RestorePhaseCompleted && restore.Status.Phase != v1.RestorePhaseFailed {
+switch restore.Status.Phase {
+case v1.RestorePhaseCompleted, v1.RestorePhaseFailed, v1.RestorePhasePartiallyFailed:
+// terminal phases, don't exit.
+default:
 cmd.Exit("Logs for restore %q are not available until it's finished processing. Please wait "+
 "until the restore has a phase of Completed or Failed and try again.", restoreName)
 }


@@ -19,6 +19,7 @@ package output
 import (
 "bytes"
 "encoding/json"
+"fmt"
 "sort"
 "strings"
@@ -35,7 +36,17 @@ func DescribeRestore(restore *v1.Restore, podVolumeRestores []v1.PodVolumeRestor
 d.DescribeMetadata(restore.ObjectMeta)
 d.Println()
-d.Printf("Phase:\t%s\n", restore.Status.Phase)
+phase := restore.Status.Phase
+if phase == "" {
+phase = v1.RestorePhaseNew
+}
+resultsNote := ""
+if phase == v1.RestorePhaseFailed || phase == v1.RestorePhasePartiallyFailed {
+resultsNote = fmt.Sprintf(" (run 'velero restore logs %s' for more information)", restore.Name)
+}
+d.Printf("Phase:\t%s%s\n", restore.Status.Phase, resultsNote)
 if len(restore.Status.ValidationErrors) > 0 {
 d.Println()
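
The user-visible effect is a hint appended to the Phase line of the restore describe output whenever the phase is Failed or PartiallyFailed. A small standalone sketch of that formatting, with test-restore as a made-up restore name:

package main

import "fmt"

func main() {
	phase := "PartiallyFailed"
	name := "test-restore"

	resultsNote := ""
	if phase == "Failed" || phase == "PartiallyFailed" {
		resultsNote = fmt.Sprintf(" (run 'velero restore logs %s' for more information)", name)
	}

	// Prints: Phase:  PartiallyFailed (run 'velero restore logs test-restore' for more information)
	fmt.Printf("Phase:\t%s%s\n", phase, resultsNote)
}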


@@ -259,6 +259,10 @@ func (c *restoreController) processRestore(restore *api.Restore) error {
 restore.Status.Phase = api.RestorePhaseFailed
 restore.Status.FailureReason = err.Error()
 c.metrics.RegisterRestoreFailed(backupScheduleName)
+} else if restore.Status.Errors > 0 {
+c.logger.Debug("Restore partially failed")
+restore.Status.Phase = api.RestorePhasePartiallyFailed
+c.metrics.RegisterRestorePartialFailure(backupScheduleName)
 } else {
 c.logger.Debug("Restore completed")
 restore.Status.Phase = api.RestorePhaseCompleted
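
The controller now chooses among three outcomes: a fatal error fails the restore outright, item-level errors mark it PartiallyFailed, and anything else completes cleanly. Sketched below as a hypothetical pure function (the real code sets fields on the Restore object inline rather than returning a phase):

package main

import (
	"errors"
	"fmt"
)

// finalPhase is a hypothetical pure-function version of the branch in
// processRestore: a fatal error wins, then item-level errors, then success.
func finalPhase(runErr error, itemErrors int) string {
	switch {
	case runErr != nil:
		return "Failed"
	case itemErrors > 0:
		return "PartiallyFailed"
	default:
		return "Completed"
	}
}

func main() {
	fmt.Println(finalPhase(errors.New("boom"), 0)) // Failed
	fmt.Println(finalPhase(nil, 3))                // PartiallyFailed
	fmt.Println(finalPhase(nil, 0))                // Completed
}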


@@ -300,6 +300,7 @@ func TestProcessQueueItem(t *testing.T) {
 restorerError: errors.New("blarg"),
 expectedErr: false,
 expectedPhase: string(api.RestorePhaseInProgress),
+expectedFinalPhase: string(api.RestorePhasePartiallyFailed),
 expectedRestoreErrors: 1,
 expectedRestorerCall: NewRestore("foo", "bar", "backup-1", "ns-1", "", api.RestorePhaseInProgress).Restore,
 },
@@ -595,7 +596,7 @@
 if test.expectedFinalPhase != "" {
 expected = Patch{
 Status: StatusPatch{
-Phase: api.RestorePhaseCompleted,
+Phase: api.RestorePhase(test.expectedFinalPhase),
 Errors: test.expectedRestoreErrors,
 },
 }


@@ -42,6 +42,7 @@ const (
 restoreAttemptTotal = "restore_attempt_total"
 restoreValidationFailedTotal = "restore_validation_failed_total"
 restoreSuccessTotal = "restore_success_total"
+restorePartialFailureTotal = "restore_partial_failure_total"
 restoreFailedTotal = "restore_failed_total"
 volumeSnapshotAttemptTotal = "volume_snapshot_attempt_total"
 volumeSnapshotSuccessTotal = "volume_snapshot_success_total"
@@ -162,6 +163,14 @@ func NewServerMetrics() *ServerMetrics {
 },
 []string{scheduleLabel},
 ),
+restorePartialFailureTotal: prometheus.NewCounterVec(
+prometheus.CounterOpts{
+Namespace: metricNamespace,
+Name: restorePartialFailureTotal,
+Help: "Total number of partially failed restores",
+},
+[]string{scheduleLabel},
+),
 restoreFailedTotal: prometheus.NewCounterVec(
 prometheus.CounterOpts{
 Namespace: metricNamespace,
@@ -236,6 +245,9 @@ func (m *ServerMetrics) InitSchedule(scheduleName string) {
 if c, ok := m.metrics[restoreAttemptTotal].(*prometheus.CounterVec); ok {
 c.WithLabelValues(scheduleName).Set(0)
 }
+if c, ok := m.metrics[restorePartialFailureTotal].(*prometheus.CounterVec); ok {
+c.WithLabelValues(scheduleName).Set(0)
+}
 if c, ok := m.metrics[restoreFailedTotal].(*prometheus.CounterVec); ok {
 c.WithLabelValues(scheduleName).Set(0)
 }
@@ -346,6 +358,13 @@ func (m *ServerMetrics) RegisterRestoreSuccess(backupSchedule string) {
 }
 }
+// RegisterRestorePartialFailure records a restore that partially failed.
+func (m *ServerMetrics) RegisterRestorePartialFailure(backupSchedule string) {
+if c, ok := m.metrics[restorePartialFailureTotal].(*prometheus.CounterVec); ok {
+c.WithLabelValues(backupSchedule).Inc()
+}
+}
 // RegisterRestoreFailed records a restore that failed.
 func (m *ServerMetrics) RegisterRestoreFailed(backupSchedule string) {
 if c, ok := m.metrics[restoreFailedTotal].(*prometheus.CounterVec); ok {
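
Like the existing restore counters, the new one is a per-schedule Prometheus CounterVec. Here is a minimal client_golang sketch of defining and incrementing such a counter; the velero namespace and schedule label name are assumptions, since the diff only shows the metricNamespace and scheduleLabel constants:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Mirrors the restorePartialFailureTotal definition above; namespace
	// and label name are assumed values for illustration.
	partialFailures := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: "velero",
			Name:      "restore_partial_failure_total",
			Help:      "Total number of partially failed restores",
		},
		[]string{"schedule"},
	)

	reg := prometheus.NewRegistry()
	reg.MustRegister(partialFailures)

	// What RegisterRestorePartialFailure effectively does for one restore.
	partialFailures.WithLabelValues("daily-backup").Inc()

	mfs, _ := reg.Gather()
	for _, mf := range mfs {
		fmt.Println(mf.GetName(), "=", mf.GetMetric()[0].GetCounter().GetValue())
	}
}

Under those assumed names, the scraped series would appear as restore_partial_failure_total with the namespace prefix and a schedule label, alongside the existing success and failure counters.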