Add restore failed phase and metrics
Signed-off-by: Nolan Brubaker <nolan@heptio.com>pull/607/head
parent
7cebfe2df0
commit
74dbf38793
|
@ -85,6 +85,10 @@ const (
|
|||
// RestorePhaseCompleted means the restore has finished executing.
|
||||
// Any relevant warnings or errors will be captured in the Status.
|
||||
RestorePhaseCompleted RestorePhase = "Completed"
|
||||
|
||||
// RestorePhaseFailed means the restore was unable to execute.
|
||||
// The failing error is recorded in status.FailureReason.
|
||||
RestorePhaseFailed RestorePhase = "Failed"
|
||||
)
|
||||
|
||||
// RestoreStatus captures the current status of an Ark restore
|
||||
|
@ -103,6 +107,9 @@ type RestoreStatus struct {
|
|||
// Errors is a count of all error messages that were generated during
|
||||
// execution of the restore. The actual errors are stored in object storage.
|
||||
Errors int `json:"errors"`
|
||||
|
||||
// FailureReason is an error that caused the entire restore to fail.
|
||||
FailureReason string `json:"failureReason"`
|
||||
}
|
||||
|
||||
// RestoreResult is a collection of messages that were generated
|
||||
|
|
|
@ -259,28 +259,17 @@ func (c *restoreController) processRestore(key string) error {
|
|||
// don't modify items in the cache
|
||||
restore = restore.DeepCopy()
|
||||
|
||||
excludedResources := sets.NewString(restore.Spec.ExcludedResources...)
|
||||
for _, nonrestorable := range nonRestorableResources {
|
||||
if !excludedResources.Has(nonrestorable) {
|
||||
restore.Spec.ExcludedResources = append(restore.Spec.ExcludedResources, nonrestorable)
|
||||
}
|
||||
}
|
||||
|
||||
backup, fetchErr := c.fetchBackup(c.bucket, restore.Spec.BackupName)
|
||||
backupScheduleName := ""
|
||||
if backup != nil {
|
||||
backupScheduleName = backup.GetLabels()["ark-schedule"]
|
||||
}
|
||||
// Register attempts before we do validation so we can get better tracking
|
||||
c.metrics.RegisterRestoreAttempt(backupScheduleName)
|
||||
|
||||
// validation
|
||||
if restore.Status.ValidationErrors = c.completeAndValidate(restore, fetchErr); len(restore.Status.ValidationErrors) > 0 {
|
||||
if restore.Status.ValidationErrors = c.completeAndValidate(restore); len(restore.Status.ValidationErrors) > 0 {
|
||||
restore.Status.Phase = api.RestorePhaseFailedValidation
|
||||
} else {
|
||||
restore.Status.Phase = api.RestorePhaseInProgress
|
||||
}
|
||||
|
||||
backupScheduleName := restore.Spec.ScheduleName
|
||||
// Register attempts after validation so we don't have to fetch the backup multiple times
|
||||
c.metrics.RegisterRestoreAttempt(backupScheduleName)
|
||||
|
||||
// update status
|
||||
updatedRestore, err := patchRestore(original, restore, c.restoreClient)
|
||||
if err != nil {
|
||||
|
@ -296,7 +285,7 @@ func (c *restoreController) processRestore(key string) error {
|
|||
}
|
||||
logContext.Debug("Running restore")
|
||||
// execution & upload of restore
|
||||
restoreWarnings, restoreErrors := c.runRestore(restore, c.bucket, backup)
|
||||
restoreWarnings, restoreErrors, restoreFailure := c.runRestore(restore, c.bucket)
|
||||
|
||||
restore.Status.Warnings = len(restoreWarnings.Ark) + len(restoreWarnings.Cluster)
|
||||
for _, w := range restoreWarnings.Namespaces {
|
||||
|
@ -307,15 +296,19 @@ func (c *restoreController) processRestore(key string) error {
|
|||
for _, e := range restoreErrors.Namespaces {
|
||||
restore.Status.Errors += len(e)
|
||||
}
|
||||
if restore.Status.Errors > 0 {
|
||||
c.metrics.RegisterRestoreIncomplete(backupScheduleName)
|
||||
|
||||
if restoreFailure != nil {
|
||||
logContext.Debug("restore failed")
|
||||
restore.Status.Phase = api.RestorePhaseFailed
|
||||
restore.Status.FailureReason = restoreFailure.Error()
|
||||
c.metrics.RegisterRestoreFailed(backupScheduleName)
|
||||
} else {
|
||||
logContext.Debug("restore completed")
|
||||
// We got through the restore process without failing validation or restore execution
|
||||
restore.Status.Phase = api.RestorePhaseCompleted
|
||||
c.metrics.RegisterRestoreSuccess(backupScheduleName)
|
||||
}
|
||||
|
||||
logContext.Debug("restore completed")
|
||||
restore.Status.Phase = api.RestorePhaseCompleted
|
||||
|
||||
logContext.Debug("Updating Restore final status")
|
||||
if _, err = patchRestore(original, restore, c.restoreClient); err != nil {
|
||||
logContext.WithError(errors.WithStack(err)).Info("Error updating Restore final status")
|
||||
|
@ -324,7 +317,7 @@ func (c *restoreController) processRestore(key string) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (c *restoreController) completeAndValidate(restore *api.Restore, fetchErr error) []string {
|
||||
func (c *restoreController) completeAndValidate(restore *api.Restore) []string {
|
||||
// add non-restorable resources to restore's excluded resources
|
||||
excludedResources := sets.NewString(restore.Spec.ExcludedResources...)
|
||||
for _, nonrestorable := range nonRestorableResources {
|
||||
|
@ -334,12 +327,6 @@ func (c *restoreController) completeAndValidate(restore *api.Restore, fetchErr e
|
|||
}
|
||||
var validationErrors []string
|
||||
|
||||
if restore.Spec.BackupName == "" {
|
||||
validationErrors = append(validationErrors, "BackupName must be non-empty and correspond to the name of a backup in object storage.")
|
||||
} else if fetchErr != nil {
|
||||
validationErrors = append(validationErrors, fmt.Sprintf("Error retrieving backup: %v", fetchErr))
|
||||
}
|
||||
|
||||
// validate that included resources don't contain any non-restorable resources
|
||||
includedResources := sets.NewString(restore.Spec.IncludedResources...)
|
||||
for _, nonRestorableResource := range nonRestorableResources {
|
||||
|
@ -390,11 +377,19 @@ func (c *restoreController) completeAndValidate(restore *api.Restore, fetchErr e
|
|||
}
|
||||
}
|
||||
|
||||
// validate that we can fetch the source backup
|
||||
if _, err := c.fetchBackup(c.bucket, restore.Spec.BackupName); err != nil {
|
||||
var (
|
||||
backup *api.Backup
|
||||
err error
|
||||
)
|
||||
if backup, err = c.fetchBackup(c.bucket, restore.Spec.BackupName); err != nil {
|
||||
return append(validationErrors, fmt.Sprintf("Error retrieving backup: %v", err))
|
||||
}
|
||||
|
||||
// Fill in the ScheduleName so it's easier to consume for metrics.
|
||||
if restore.Spec.ScheduleName == "" {
|
||||
restore.Spec.ScheduleName = backup.GetLabels()["ark-schedule"]
|
||||
}
|
||||
|
||||
return validationErrors
|
||||
}
|
||||
|
||||
|
@ -462,7 +457,7 @@ func (c *restoreController) fetchBackup(bucket, name string) (*api.Backup, error
|
|||
return backup, nil
|
||||
}
|
||||
|
||||
func (c *restoreController) runRestore(restore *api.Restore, bucket string, backup *api.Backup) (restoreWarnings, restoreErrors api.RestoreResult) {
|
||||
func (c *restoreController) runRestore(restore *api.Restore, bucket string) (restoreWarnings, restoreErrors api.RestoreResult, restoreFailure error) {
|
||||
logContext := c.logger.WithFields(
|
||||
logrus.Fields{
|
||||
"restore": kubeutil.NamespaceAndName(restore),
|
||||
|
@ -482,6 +477,7 @@ func (c *restoreController) runRestore(restore *api.Restore, bucket string, back
|
|||
if err != nil {
|
||||
logContext.WithError(err).Error("Error downloading backup")
|
||||
restoreErrors.Ark = append(restoreErrors.Ark, err.Error())
|
||||
restoreFailure = err
|
||||
return
|
||||
}
|
||||
tempFiles = append(tempFiles, backupFile)
|
||||
|
@ -490,6 +486,7 @@ func (c *restoreController) runRestore(restore *api.Restore, bucket string, back
|
|||
if err != nil {
|
||||
logContext.WithError(errors.WithStack(err)).Error("Error creating log temp file")
|
||||
restoreErrors.Ark = append(restoreErrors.Ark, err.Error())
|
||||
restoreFailure = err
|
||||
return
|
||||
}
|
||||
tempFiles = append(tempFiles, logFile)
|
||||
|
@ -498,6 +495,7 @@ func (c *restoreController) runRestore(restore *api.Restore, bucket string, back
|
|||
if err != nil {
|
||||
logContext.WithError(errors.WithStack(err)).Error("Error creating results temp file")
|
||||
restoreErrors.Ark = append(restoreErrors.Ark, err.Error())
|
||||
restoreFailure = err
|
||||
return
|
||||
}
|
||||
tempFiles = append(tempFiles, resultsFile)
|
||||
|
@ -506,10 +504,12 @@ func (c *restoreController) runRestore(restore *api.Restore, bucket string, back
|
|||
for _, file := range tempFiles {
|
||||
if err := file.Close(); err != nil {
|
||||
logContext.WithError(errors.WithStack(err)).WithField("file", file.Name()).Error("Error closing file")
|
||||
restoreFailure = err
|
||||
}
|
||||
|
||||
if err := os.Remove(file.Name()); err != nil {
|
||||
logContext.WithError(errors.WithStack(err)).WithField("file", file.Name()).Error("Error removing file")
|
||||
restoreFailure = err
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
@ -521,6 +521,8 @@ func (c *restoreController) runRestore(restore *api.Restore, bucket string, back
|
|||
}
|
||||
defer c.pluginManager.CloseRestoreItemActions(restore.Name)
|
||||
|
||||
// Any return statement above this line means a total restore failure
|
||||
// Some failures after this line *may* be a total restore failure
|
||||
logContext.Info("starting restore")
|
||||
restoreWarnings, restoreErrors = c.restorer.Restore(restore, backup, backupFile, logFile, actions)
|
||||
logContext.Info("restore completed")
|
||||
|
|
|
@ -133,6 +133,8 @@ func TestProcessRestore(t *testing.T) {
|
|||
expectedRestorerCall *api.Restore
|
||||
backupServiceGetBackupError error
|
||||
uploadLogError error
|
||||
backupServiceDownloadBackupError error
|
||||
expectedFinalPhase string
|
||||
}{
|
||||
{
|
||||
name: "invalid key returns error",
|
||||
|
@ -300,6 +302,14 @@ func TestProcessRestore(t *testing.T) {
|
|||
"Invalid included/excluded resource lists: excludes list cannot contain an item in the includes list: restores.ark.heptio.com",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "backup download error results in failed restore",
|
||||
restore: NewRestore("foo", "bar", "backup-1", "ns-1", "", api.RestorePhaseNew).Restore,
|
||||
expectedPhase: string(api.RestorePhaseInProgress),
|
||||
expectedFinalPhase: string(api.RestorePhaseFailed),
|
||||
backupServiceDownloadBackupError: errors.New("Couldn't download backup"),
|
||||
backup: arktest.NewTestBackup().WithName("backup-1").Backup,
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
|
@ -403,6 +413,10 @@ func TestProcessRestore(t *testing.T) {
|
|||
backupSvc.On("GetBackup", "bucket", mock.Anything).Return(nil, test.backupServiceGetBackupError)
|
||||
}
|
||||
|
||||
if test.backupServiceDownloadBackupError != nil {
|
||||
backupSvc.On("DownloadBackup", "bucket", test.restore.Spec.BackupName).Return(nil, test.backupServiceDownloadBackupError)
|
||||
}
|
||||
|
||||
if test.restore != nil {
|
||||
pluginManager.On("GetRestoreItemActions", test.restore.Name).Return(nil, nil)
|
||||
pluginManager.On("CloseRestoreItemActions", test.restore.Name).Return(nil)
|
||||
|
@ -477,6 +491,15 @@ func TestProcessRestore(t *testing.T) {
|
|||
Errors: test.expectedRestoreErrors,
|
||||
},
|
||||
}
|
||||
// Override our default expectations if the case requires it
|
||||
if test.expectedFinalPhase != "" {
|
||||
expected = Patch{
|
||||
Status: StatusPatch{
|
||||
Phase: api.RestorePhaseCompleted,
|
||||
Errors: test.expectedRestoreErrors,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
arktest.ValidatePatch(t, actions[1], expected, decode)
|
||||
|
||||
|
@ -507,6 +530,7 @@ func TestCompleteAndValidateWhenScheduleNameSpecified(t *testing.T) {
|
|||
false,
|
||||
logger,
|
||||
nil,
|
||||
nil,
|
||||
).(*restoreController)
|
||||
|
||||
restore := &api.Restore{
|
||||
|
|
|
@ -38,7 +38,7 @@ const (
|
|||
restoreAttemptTotal = "restore_attempt_total"
|
||||
restoreValidationFailedTotal = "restore_validation_failed_total"
|
||||
restoreSuccessTotal = "restore_success_total"
|
||||
restoreIncompleteTotal = "restore_incomplete_total"
|
||||
restoreFailedTotal = "restore_failed_total"
|
||||
|
||||
scheduleLabel = "schedule"
|
||||
backupNameLabel = "backupName"
|
||||
|
@ -117,11 +117,11 @@ func NewServerMetrics() *ServerMetrics {
|
|||
},
|
||||
[]string{scheduleLabel},
|
||||
),
|
||||
restoreIncompleteTotal: prometheus.NewCounterVec(
|
||||
restoreFailedTotal: prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: metricNamespace,
|
||||
Name: restoreIncompleteTotal,
|
||||
Help: "Total number of incomplete restores",
|
||||
Name: restoreFailedTotal,
|
||||
Help: "Total number of failed restores",
|
||||
},
|
||||
[]string{scheduleLabel},
|
||||
),
|
||||
|
@ -129,7 +129,7 @@ func NewServerMetrics() *ServerMetrics {
|
|||
prometheus.CounterOpts{
|
||||
Namespace: metricNamespace,
|
||||
Name: restoreValidationFailedTotal,
|
||||
Help: "Total number of failed restore validations",
|
||||
Help: "Total number of failed restores failing validations",
|
||||
},
|
||||
[]string{scheduleLabel},
|
||||
),
|
||||
|
@ -154,6 +154,18 @@ func (m *ServerMetrics) InitSchedule(scheduleName string) {
|
|||
if c, ok := m.metrics[backupFailureCount].(*prometheus.CounterVec); ok {
|
||||
c.WithLabelValues(scheduleName).Set(0)
|
||||
}
|
||||
if c, ok := m.metrics[restoreAttemptTotal].(*prometheus.CounterVec); ok {
|
||||
c.WithLabelValues(scheduleName).Set(0)
|
||||
}
|
||||
if c, ok := m.metrics[restoreFailedTotal].(*prometheus.CounterVec); ok {
|
||||
c.WithLabelValues(scheduleName).Set(0)
|
||||
}
|
||||
if c, ok := m.metrics[restoreSuccessTotal].(*prometheus.CounterVec); ok {
|
||||
c.WithLabelValues(scheduleName).Set(0)
|
||||
}
|
||||
if c, ok := m.metrics[restoreValidationFailedTotal].(*prometheus.CounterVec); ok {
|
||||
c.WithLabelValues(scheduleName).Set(0)
|
||||
}
|
||||
}
|
||||
|
||||
// SetBackupTarballSizeBytesGauge records the size, in bytes, of a backup tarball.
|
||||
|
@ -204,21 +216,21 @@ func (m *ServerMetrics) RegisterRestoreAttempt(backupSchedule string) {
|
|||
}
|
||||
}
|
||||
|
||||
// RegisterRestoreSuccess records a successful completion of a restore.
|
||||
// RegisterRestoreSuccess records a successful (maybe partial) completion of a restore.
|
||||
func (m *ServerMetrics) RegisterRestoreSuccess(backupSchedule string) {
|
||||
if c, ok := m.metrics[restoreSuccessTotal].(*prometheus.CounterVec); ok {
|
||||
c.WithLabelValues(backupSchedule).Inc()
|
||||
}
|
||||
}
|
||||
|
||||
// RegisterRestoreIncomplete records a restore that finished with errors.
|
||||
func (m *ServerMetrics) RegisterRestoreIncomplete(backupSchedule string) {
|
||||
if c, ok := m.metrics[restoreIncompleteTotal].(*prometheus.CounterVec); ok {
|
||||
// RegisterRestoreFailed records a restore that failed.
|
||||
func (m *ServerMetrics) RegisterRestoreFailed(backupSchedule string) {
|
||||
if c, ok := m.metrics[restoreFailedTotal].(*prometheus.CounterVec); ok {
|
||||
c.WithLabelValues(backupSchedule).Inc()
|
||||
}
|
||||
}
|
||||
|
||||
// RegisterRestoreValidationFailed records a failed restore.
|
||||
// RegisterRestoreValidationFailed records a restore that failed validation.
|
||||
func (m *ServerMetrics) RegisterRestoreValidationFailed(backupSchedule string) {
|
||||
if c, ok := m.metrics[restoreValidationFailedTotal].(*prometheus.CounterVec); ok {
|
||||
c.WithLabelValues(backupSchedule).Inc()
|
||||
|
|
|
@ -103,6 +103,8 @@ func AssertDeepEqual(t *testing.T, expected, actual interface{}) bool {
|
|||
}
|
||||
|
||||
if !equality.Semantic.DeepEqual(expected, actual) {
|
||||
fmt.Printf("expected = %+v\n", expected)
|
||||
fmt.Printf("actual = %+v\n", actual)
|
||||
return assert.Fail(t, fmt.Sprintf("Objects not equal"))
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue