Add restore failed phase and metrics

Signed-off-by: Nolan Brubaker <nolan@heptio.com>
pull/607/head
Nolan Brubaker 2018-07-05 16:49:47 -04:00
parent 7cebfe2df0
commit 74dbf38793
5 changed files with 102 additions and 55 deletions

View File

@ -85,6 +85,10 @@ const (
// RestorePhaseCompleted means the restore has finished executing.
// Any relevant warnings or errors will be captured in the Status.
RestorePhaseCompleted RestorePhase = "Completed"
// RestorePhaseFailed means the restore was unable to execute.
// The failing error is recorded in status.FailureReason.
RestorePhaseFailed RestorePhase = "Failed"
)
// RestoreStatus captures the current status of an Ark restore
@ -103,6 +107,9 @@ type RestoreStatus struct {
// Errors is a count of all error messages that were generated during
// execution of the restore. The actual errors are stored in object storage.
Errors int `json:"errors"`
// FailureReason is an error that caused the entire restore to fail.
FailureReason string `json:"failureReason"`
}
// RestoreResult is a collection of messages that were generated

View File

@ -259,28 +259,17 @@ func (c *restoreController) processRestore(key string) error {
// don't modify items in the cache
restore = restore.DeepCopy()
excludedResources := sets.NewString(restore.Spec.ExcludedResources...)
for _, nonrestorable := range nonRestorableResources {
if !excludedResources.Has(nonrestorable) {
restore.Spec.ExcludedResources = append(restore.Spec.ExcludedResources, nonrestorable)
}
}
backup, fetchErr := c.fetchBackup(c.bucket, restore.Spec.BackupName)
backupScheduleName := ""
if backup != nil {
backupScheduleName = backup.GetLabels()["ark-schedule"]
}
// Register attempts before we do validation so we can get better tracking
c.metrics.RegisterRestoreAttempt(backupScheduleName)
// validation
if restore.Status.ValidationErrors = c.completeAndValidate(restore, fetchErr); len(restore.Status.ValidationErrors) > 0 {
if restore.Status.ValidationErrors = c.completeAndValidate(restore); len(restore.Status.ValidationErrors) > 0 {
restore.Status.Phase = api.RestorePhaseFailedValidation
} else {
restore.Status.Phase = api.RestorePhaseInProgress
}
backupScheduleName := restore.Spec.ScheduleName
// Register attempts after validation so we don't have to fetch the backup multiple times
c.metrics.RegisterRestoreAttempt(backupScheduleName)
// update status
updatedRestore, err := patchRestore(original, restore, c.restoreClient)
if err != nil {
@ -296,7 +285,7 @@ func (c *restoreController) processRestore(key string) error {
}
logContext.Debug("Running restore")
// execution & upload of restore
restoreWarnings, restoreErrors := c.runRestore(restore, c.bucket, backup)
restoreWarnings, restoreErrors, restoreFailure := c.runRestore(restore, c.bucket)
restore.Status.Warnings = len(restoreWarnings.Ark) + len(restoreWarnings.Cluster)
for _, w := range restoreWarnings.Namespaces {
@ -307,15 +296,19 @@ func (c *restoreController) processRestore(key string) error {
for _, e := range restoreErrors.Namespaces {
restore.Status.Errors += len(e)
}
if restore.Status.Errors > 0 {
c.metrics.RegisterRestoreIncomplete(backupScheduleName)
if restoreFailure != nil {
logContext.Debug("restore failed")
restore.Status.Phase = api.RestorePhaseFailed
restore.Status.FailureReason = restoreFailure.Error()
c.metrics.RegisterRestoreFailed(backupScheduleName)
} else {
logContext.Debug("restore completed")
// We got through the restore process without failing validation or restore execution
restore.Status.Phase = api.RestorePhaseCompleted
c.metrics.RegisterRestoreSuccess(backupScheduleName)
}
logContext.Debug("restore completed")
restore.Status.Phase = api.RestorePhaseCompleted
logContext.Debug("Updating Restore final status")
if _, err = patchRestore(original, restore, c.restoreClient); err != nil {
logContext.WithError(errors.WithStack(err)).Info("Error updating Restore final status")
@ -324,7 +317,7 @@ func (c *restoreController) processRestore(key string) error {
return nil
}
func (c *restoreController) completeAndValidate(restore *api.Restore, fetchErr error) []string {
func (c *restoreController) completeAndValidate(restore *api.Restore) []string {
// add non-restorable resources to restore's excluded resources
excludedResources := sets.NewString(restore.Spec.ExcludedResources...)
for _, nonrestorable := range nonRestorableResources {
@ -334,12 +327,6 @@ func (c *restoreController) completeAndValidate(restore *api.Restore, fetchErr e
}
var validationErrors []string
if restore.Spec.BackupName == "" {
validationErrors = append(validationErrors, "BackupName must be non-empty and correspond to the name of a backup in object storage.")
} else if fetchErr != nil {
validationErrors = append(validationErrors, fmt.Sprintf("Error retrieving backup: %v", fetchErr))
}
// validate that included resources don't contain any non-restorable resources
includedResources := sets.NewString(restore.Spec.IncludedResources...)
for _, nonRestorableResource := range nonRestorableResources {
@ -390,11 +377,19 @@ func (c *restoreController) completeAndValidate(restore *api.Restore, fetchErr e
}
}
// validate that we can fetch the source backup
if _, err := c.fetchBackup(c.bucket, restore.Spec.BackupName); err != nil {
var (
backup *api.Backup
err error
)
if backup, err = c.fetchBackup(c.bucket, restore.Spec.BackupName); err != nil {
return append(validationErrors, fmt.Sprintf("Error retrieving backup: %v", err))
}
// Fill in the ScheduleName so it's easier to consume for metrics.
if restore.Spec.ScheduleName == "" {
restore.Spec.ScheduleName = backup.GetLabels()["ark-schedule"]
}
return validationErrors
}
@ -462,7 +457,7 @@ func (c *restoreController) fetchBackup(bucket, name string) (*api.Backup, error
return backup, nil
}
func (c *restoreController) runRestore(restore *api.Restore, bucket string, backup *api.Backup) (restoreWarnings, restoreErrors api.RestoreResult) {
func (c *restoreController) runRestore(restore *api.Restore, bucket string) (restoreWarnings, restoreErrors api.RestoreResult, restoreFailure error) {
logContext := c.logger.WithFields(
logrus.Fields{
"restore": kubeutil.NamespaceAndName(restore),
@ -482,6 +477,7 @@ func (c *restoreController) runRestore(restore *api.Restore, bucket string, back
if err != nil {
logContext.WithError(err).Error("Error downloading backup")
restoreErrors.Ark = append(restoreErrors.Ark, err.Error())
restoreFailure = err
return
}
tempFiles = append(tempFiles, backupFile)
@ -490,6 +486,7 @@ func (c *restoreController) runRestore(restore *api.Restore, bucket string, back
if err != nil {
logContext.WithError(errors.WithStack(err)).Error("Error creating log temp file")
restoreErrors.Ark = append(restoreErrors.Ark, err.Error())
restoreFailure = err
return
}
tempFiles = append(tempFiles, logFile)
@ -498,6 +495,7 @@ func (c *restoreController) runRestore(restore *api.Restore, bucket string, back
if err != nil {
logContext.WithError(errors.WithStack(err)).Error("Error creating results temp file")
restoreErrors.Ark = append(restoreErrors.Ark, err.Error())
restoreFailure = err
return
}
tempFiles = append(tempFiles, resultsFile)
@ -506,10 +504,12 @@ func (c *restoreController) runRestore(restore *api.Restore, bucket string, back
for _, file := range tempFiles {
if err := file.Close(); err != nil {
logContext.WithError(errors.WithStack(err)).WithField("file", file.Name()).Error("Error closing file")
restoreFailure = err
}
if err := os.Remove(file.Name()); err != nil {
logContext.WithError(errors.WithStack(err)).WithField("file", file.Name()).Error("Error removing file")
restoreFailure = err
}
}
}()
@ -521,6 +521,8 @@ func (c *restoreController) runRestore(restore *api.Restore, bucket string, back
}
defer c.pluginManager.CloseRestoreItemActions(restore.Name)
// Any return statement above this line means a total restore failure
// Some failures after this line *may* be a total restore failure
logContext.Info("starting restore")
restoreWarnings, restoreErrors = c.restorer.Restore(restore, backup, backupFile, logFile, actions)
logContext.Info("restore completed")

View File

@ -120,19 +120,21 @@ func TestFetchBackup(t *testing.T) {
func TestProcessRestore(t *testing.T) {
tests := []struct {
name string
restoreKey string
restore *api.Restore
backup *api.Backup
restorerError error
allowRestoreSnapshots bool
expectedErr bool
expectedPhase string
expectedValidationErrors []string
expectedRestoreErrors int
expectedRestorerCall *api.Restore
backupServiceGetBackupError error
uploadLogError error
name string
restoreKey string
restore *api.Restore
backup *api.Backup
restorerError error
allowRestoreSnapshots bool
expectedErr bool
expectedPhase string
expectedValidationErrors []string
expectedRestoreErrors int
expectedRestorerCall *api.Restore
backupServiceGetBackupError error
uploadLogError error
backupServiceDownloadBackupError error
expectedFinalPhase string
}{
{
name: "invalid key returns error",
@ -300,6 +302,14 @@ func TestProcessRestore(t *testing.T) {
"Invalid included/excluded resource lists: excludes list cannot contain an item in the includes list: restores.ark.heptio.com",
},
},
{
name: "backup download error results in failed restore",
restore: NewRestore("foo", "bar", "backup-1", "ns-1", "", api.RestorePhaseNew).Restore,
expectedPhase: string(api.RestorePhaseInProgress),
expectedFinalPhase: string(api.RestorePhaseFailed),
backupServiceDownloadBackupError: errors.New("Couldn't download backup"),
backup: arktest.NewTestBackup().WithName("backup-1").Backup,
},
}
for _, test := range tests {
@ -403,6 +413,10 @@ func TestProcessRestore(t *testing.T) {
backupSvc.On("GetBackup", "bucket", mock.Anything).Return(nil, test.backupServiceGetBackupError)
}
if test.backupServiceDownloadBackupError != nil {
backupSvc.On("DownloadBackup", "bucket", test.restore.Spec.BackupName).Return(nil, test.backupServiceDownloadBackupError)
}
if test.restore != nil {
pluginManager.On("GetRestoreItemActions", test.restore.Name).Return(nil, nil)
pluginManager.On("CloseRestoreItemActions", test.restore.Name).Return(nil)
@ -477,6 +491,15 @@ func TestProcessRestore(t *testing.T) {
Errors: test.expectedRestoreErrors,
},
}
// Override our default expectations if the case requires it
if test.expectedFinalPhase != "" {
expected = Patch{
Status: StatusPatch{
Phase: api.RestorePhaseCompleted,
Errors: test.expectedRestoreErrors,
},
}
}
arktest.ValidatePatch(t, actions[1], expected, decode)
@ -507,6 +530,7 @@ func TestCompleteAndValidateWhenScheduleNameSpecified(t *testing.T) {
false,
logger,
nil,
nil,
).(*restoreController)
restore := &api.Restore{

View File

@ -38,7 +38,7 @@ const (
restoreAttemptTotal = "restore_attempt_total"
restoreValidationFailedTotal = "restore_validation_failed_total"
restoreSuccessTotal = "restore_success_total"
restoreIncompleteTotal = "restore_incomplete_total"
restoreFailedTotal = "restore_failed_total"
scheduleLabel = "schedule"
backupNameLabel = "backupName"
@ -117,11 +117,11 @@ func NewServerMetrics() *ServerMetrics {
},
[]string{scheduleLabel},
),
restoreIncompleteTotal: prometheus.NewCounterVec(
restoreFailedTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: restoreIncompleteTotal,
Help: "Total number of incomplete restores",
Name: restoreFailedTotal,
Help: "Total number of failed restores",
},
[]string{scheduleLabel},
),
@ -129,7 +129,7 @@ func NewServerMetrics() *ServerMetrics {
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: restoreValidationFailedTotal,
Help: "Total number of failed restore validations",
Help: "Total number of failed restores failing validations",
},
[]string{scheduleLabel},
),
@ -154,6 +154,18 @@ func (m *ServerMetrics) InitSchedule(scheduleName string) {
if c, ok := m.metrics[backupFailureCount].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
if c, ok := m.metrics[restoreAttemptTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
if c, ok := m.metrics[restoreFailedTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
if c, ok := m.metrics[restoreSuccessTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
if c, ok := m.metrics[restoreValidationFailedTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
}
// SetBackupTarballSizeBytesGauge records the size, in bytes, of a backup tarball.
@ -204,21 +216,21 @@ func (m *ServerMetrics) RegisterRestoreAttempt(backupSchedule string) {
}
}
// RegisterRestoreSuccess records a successful completion of a restore.
// RegisterRestoreSuccess records a successful (maybe partial) completion of a restore.
func (m *ServerMetrics) RegisterRestoreSuccess(backupSchedule string) {
if c, ok := m.metrics[restoreSuccessTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(backupSchedule).Inc()
}
}
// RegisterRestoreIncomplete records a restore that finished with errors.
func (m *ServerMetrics) RegisterRestoreIncomplete(backupSchedule string) {
if c, ok := m.metrics[restoreIncompleteTotal].(*prometheus.CounterVec); ok {
// RegisterRestoreFailed records a restore that failed.
func (m *ServerMetrics) RegisterRestoreFailed(backupSchedule string) {
if c, ok := m.metrics[restoreFailedTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(backupSchedule).Inc()
}
}
// RegisterRestoreValidationFailed records a failed restore.
// RegisterRestoreValidationFailed records a restore that failed validation.
func (m *ServerMetrics) RegisterRestoreValidationFailed(backupSchedule string) {
if c, ok := m.metrics[restoreValidationFailedTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(backupSchedule).Inc()

View File

@ -103,6 +103,8 @@ func AssertDeepEqual(t *testing.T, expected, actual interface{}) bool {
}
if !equality.Semantic.DeepEqual(expected, actual) {
fmt.Printf("expected = %+v\n", expected)
fmt.Printf("actual = %+v\n", actual)
return assert.Fail(t, fmt.Sprintf("Objects not equal"))
}