Merge pull request #4833 from ywk253100/220413_restart

Mark in-progress backup/restore as failed when doing the reconcile
commit 58a8371ca2

@@ -0,0 +1 @@
+Make in-progress backup/restore as failed when doing the reconcile to avoid hanging in in-progress status
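In short, both controllers gain a reconcile-time guard: an object found in the InProgress phase is assumed to have been orphaned (for example, the controller restarted mid-operation, or the final status update never landed) and is patched to Failed instead of being left hanging. The snippet below is a minimal, self-contained sketch of that guard; the type and constant names are invented for illustration and are not the velerov1api types used in the diffs that follow.

```go
package main

import (
	"fmt"
	"time"
)

// Phase is an illustrative stand-in for the Velero backup/restore phases.
type Phase string

const (
	PhaseNew        Phase = ""
	PhaseInProgress Phase = "InProgress"
	PhaseFailed     Phase = "Failed"
)

// Status is a stripped-down stand-in for BackupStatus/RestoreStatus.
type Status struct {
	Phase               Phase
	FailureReason       string
	CompletionTimestamp *time.Time
}

// reconcile applies the guard this PR introduces: New objects are processed,
// InProgress objects are assumed to be orphaned and are marked Failed, and
// everything else is skipped.
func reconcile(s *Status, now time.Time) string {
	switch s.Phase {
	case PhaseNew:
		return "process"
	case PhaseInProgress:
		s.Phase = PhaseFailed
		s.FailureReason = fmt.Sprintf(
			"got an object with unexpected status %q, this may be due to a restart of the controller, mark it as %q",
			PhaseInProgress, PhaseFailed)
		s.CompletionTimestamp = &now
		return "marked failed"
	default:
		return "skip"
	}
}

func main() {
	s := &Status{Phase: PhaseInProgress}
	fmt.Println(reconcile(s, time.Now())) // marked failed
	fmt.Println(s.Phase, "/", s.FailureReason)
}
```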
@@ -372,6 +372,10 @@ spec:
               format: date-time
               nullable: true
               type: string
+            failureReason:
+              description: FailureReason is an error that caused the entire backup
+                to fail.
+              type: string
             formatVersion:
               description: FormatVersion is the backup format version, including
                 major, minor, and patch version.
File diff suppressed because one or more lines are too long
@@ -292,6 +292,10 @@ type BackupStatus struct {
     // +optional
     VolumeSnapshotsCompleted int `json:"volumeSnapshotsCompleted,omitempty"`

+    // FailureReason is an error that caused the entire backup to fail.
+    // +optional
+    FailureReason string `json:"failureReason,omitempty"`
+
     // Warnings is a count of all warning messages that were generated during
     // execution of the backup. The actual warnings are in the backup's log
     // file in object storage.
@@ -147,13 +147,12 @@ func NewBackupController(
             backup := obj.(*velerov1api.Backup)

             switch backup.Status.Phase {
-            case "", velerov1api.BackupPhaseNew:
-                // only process new backups
+            case "", velerov1api.BackupPhaseNew, velerov1api.BackupPhaseInProgress:
             default:
                 c.logger.WithFields(logrus.Fields{
                     "backup": kubeutil.NamespaceAndName(backup),
                     "phase":  backup.Status.Phase,
-                }).Debug("Backup is not new, skipping")
+                }).Debug("Backup is not new or in-progress, skipping")
                 return
             }

@@ -241,7 +240,22 @@ func (c *backupController) processBackup(key string) error {
     // this key (even though it was a no-op).
     switch original.Status.Phase {
     case "", velerov1api.BackupPhaseNew:
-        // only process new backups
+    case velerov1api.BackupPhaseInProgress:
+        // A backup may stay in-progress forever because of
+        // 1) the controller restarts during the processing of a backup
+        // 2) the backup with in-progress status isn't updated to completed or failed status successfully
+        // So we try to mark such Backups as failed to avoid it
+        updated := original.DeepCopy()
+        updated.Status.Phase = velerov1api.BackupPhaseFailed
+        updated.Status.FailureReason = fmt.Sprintf("got a Backup with unexpected status %q, this may be due to a restart of the controller during the backing up, mark it as %q",
+            velerov1api.BackupPhaseInProgress, updated.Status.Phase)
+        updated.Status.CompletionTimestamp = &metav1.Time{Time: c.clock.Now()}
+        _, err = patchBackup(original, updated, c.client)
+        if err != nil {
+            return errors.Wrapf(err, "error updating Backup status to %s", updated.Status.Phase)
+        }
+        log.Warn(updated.Status.FailureReason)
+        return nil
     default:
         return nil
     }
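The `DeepCopy`-then-`patchBackup(original, updated, c.client)` pairing above follows the common Kubernetes pattern of sending a merge patch computed from the before/after copies rather than a full object update. The snippet below is a minimal sketch of that idea using a toy status struct; the struct, the field values, and the choice of `github.com/evanphx/json-patch` are illustrative assumptions, not a reproduction of Velero's own `patchBackup` helper.

```go
package main

import (
	"encoding/json"
	"fmt"

	jsonpatch "github.com/evanphx/json-patch"
)

// status is an illustrative stand-in for a Backup/Restore status block.
type status struct {
	Phase         string `json:"phase,omitempty"`
	FailureReason string `json:"failureReason,omitempty"`
}

func main() {
	original := status{Phase: "InProgress"}

	// Copy, then mutate only what should change.
	updated := original
	updated.Phase = "Failed"
	updated.FailureReason = "controller restarted while the backup was in progress"

	origJSON, _ := json.Marshal(original)
	updatedJSON, _ := json.Marshal(updated)

	// The merge patch contains only the changed fields; this is the payload a
	// Patch call with a merge-patch content type would send to the API server.
	patch, err := jsonpatch.CreateMergePatch(origJSON, updatedJSON)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(patch))
	// {"failureReason":"controller restarted while the backup was in progress","phase":"Failed"}
}
```

Because only the changed fields appear in the patch, the status update stays small and does not clobber unrelated fields that may have changed concurrently.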
@@ -261,6 +275,7 @@ func (c *backupController) processBackup(key string) error {
     if err != nil {
         return errors.Wrapf(err, "error updating Backup status to %s", request.Status.Phase)
     }
+
     // store ref to just-updated item for creating patch
     original = updatedBackup
     request.Backup = updatedBackup.DeepCopy()
@@ -287,6 +302,7 @@ func (c *backupController) processBackup(key string) error {
         // result in the backup being Failed.
         log.WithError(err).Error("backup failed")
         request.Status.Phase = velerov1api.BackupPhaseFailed
+        request.Status.FailureReason = err.Error()
     }

     switch request.Status.Phase {
@@ -94,11 +94,6 @@ func TestProcessBackupNonProcessedItems(t *testing.T) {
             key:    "velero/backup-1",
             backup: defaultBackup().Phase(velerov1api.BackupPhaseFailedValidation).Result(),
         },
-        {
-            name:   "InProgress backup is not processed",
-            key:    "velero/backup-1",
-            backup: defaultBackup().Phase(velerov1api.BackupPhaseInProgress).Result(),
-        },
         {
             name: "Completed backup is not processed",
             key:  "velero/backup-1",
@@ -140,6 +135,28 @@ func TestProcessBackupNonProcessedItems(t *testing.T) {
     }
 }

+func TestMarkInProgressBackupAsFailed(t *testing.T) {
+    backup := defaultBackup().Phase(velerov1api.BackupPhaseInProgress).Result()
+    clientset := fake.NewSimpleClientset(backup)
+    sharedInformers := informers.NewSharedInformerFactory(clientset, 0)
+    logger := logging.DefaultLogger(logrus.DebugLevel, logging.FormatText)
+
+    c := &backupController{
+        genericController: newGenericController("backup-test", logger),
+        client:            clientset.VeleroV1(),
+        lister:            sharedInformers.Velero().V1().Backups().Lister(),
+        clock:             &clock.RealClock{},
+    }
+    require.NoError(t, sharedInformers.Velero().V1().Backups().Informer().GetStore().Add(backup))
+
+    err := c.processBackup(fmt.Sprintf("%s/%s", backup.Namespace, backup.Name))
+    require.Nil(t, err)
+
+    res, err := clientset.VeleroV1().Backups(backup.Namespace).Get(context.TODO(), backup.Name, metav1.GetOptions{})
+    require.NoError(t, err)
+    assert.Equal(t, velerov1api.BackupPhaseFailed, res.Status.Phase)
+}
+
 func TestProcessBackupValidationFailures(t *testing.T) {
     defaultBackupLocation := builder.ForBackupStorageLocation("velero", "loc-1").Result()

@@ -729,6 +746,7 @@ func TestProcessBackupCompletions(t *testing.T) {
             },
             Status: velerov1api.BackupStatus{
                 Phase:          velerov1api.BackupPhaseFailed,
+                FailureReason:  "backup already exists in object storage",
                 Version:        1,
                 FormatVersion:  "1.1.0",
                 StartTimestamp: &timestamp,
@@ -766,6 +784,7 @@ func TestProcessBackupCompletions(t *testing.T) {
             },
             Status: velerov1api.BackupStatus{
                 Phase:          velerov1api.BackupPhaseFailed,
+                FailureReason:  "error checking if backup already exists in object storage: Backup already exists in object storage",
                 Version:        1,
                 FormatVersion:  "1.1.0",
                 StartTimestamp: &timestamp,
@@ -144,13 +144,12 @@ func NewRestoreController(
             restore := obj.(*api.Restore)

             switch restore.Status.Phase {
-            case "", api.RestorePhaseNew:
-                // only process new restores
+            case "", api.RestorePhaseNew, api.RestorePhaseInProgress:
             default:
                 c.logger.WithFields(logrus.Fields{
                     "restore": kubeutil.NamespaceAndName(restore),
                     "phase":   restore.Status.Phase,
-                }).Debug("Restore is not new, skipping")
+                }).Debug("Restore is not new or in-progress, skipping")
                 return
             }

@@ -202,7 +201,21 @@ func (c *restoreController) processQueueItem(key string) error {
     // is ("" | New)
     switch restore.Status.Phase {
     case "", api.RestorePhaseNew:
-        // only process new restores
+    case api.RestorePhaseInProgress:
+        // A restore may stay in-progress forever because of
+        // 1) the controller restarts during the processing of a restore
+        // 2) the restore with in-progress status isn't updated to completed or failed status successfully
+        // So we try to mark such restores as failed to avoid it
+        updated := restore.DeepCopy()
+        updated.Status.Phase = api.RestorePhaseFailed
+        updated.Status.FailureReason = fmt.Sprintf("got a Restore with unexpected status %q, this may be due to a restart of the controller during the restore, mark it as %q",
+            api.RestorePhaseInProgress, updated.Status.Phase)
+        _, err = patchRestore(restore, updated, c.restoreClient)
+        if err != nil {
+            return errors.Wrapf(err, "error updating Restore status to %s", updated.Status.Phase)
+        }
+        log.Warn(updated.Status.FailureReason)
+        return nil
     default:
         return nil
     }
@@ -20,6 +20,7 @@ import (
     "bytes"
+    "context"
     "encoding/json"
     "fmt"
     "io/ioutil"
     "testing"
     "time"
@@ -170,11 +171,6 @@ func TestProcessQueueItemSkips(t *testing.T) {
             restoreKey:  "foo/bar",
             expectError: true,
         },
-        {
-            name:       "restore with phase InProgress does not get processed",
-            restoreKey: "foo/bar",
-            restore:    builder.ForRestore("foo", "bar").Phase(velerov1api.RestorePhaseInProgress).Result(),
-        },
         {
             name:       "restore with phase Completed does not get processed",
             restoreKey: "foo/bar",
@@ -226,6 +222,31 @@ func TestProcessQueueItemSkips(t *testing.T) {
     }
 }

+func TestMarkInProgressRestoreAsFailed(t *testing.T) {
+    var (
+        restore         = builder.ForRestore("velero", "bar").Phase(velerov1api.RestorePhaseInProgress).Result()
+        client          = fake.NewSimpleClientset(restore)
+        sharedInformers = informers.NewSharedInformerFactory(client, 0)
+        logger          = velerotest.NewLogger()
+    )
+
+    c := restoreController{
+        genericController: newGenericController("restore-test", logger),
+        restoreClient:     client.VeleroV1(),
+        restoreLister:     sharedInformers.Velero().V1().Restores().Lister(),
+    }
+
+    err := sharedInformers.Velero().V1().Restores().Informer().GetStore().Add(restore)
+    require.Nil(t, err)
+
+    err = c.processQueueItem(fmt.Sprintf("%s/%s", restore.Namespace, restore.Name))
+    require.Nil(t, err)
+
+    res, err := c.restoreClient.Restores(restore.Namespace).Get(context.Background(), restore.Name, metav1.GetOptions{})
+    require.Nil(t, err)
+    assert.Equal(t, velerov1api.RestorePhaseFailed, res.Status.Phase)
+}
+
 func TestProcessQueueItem(t *testing.T) {

     defaultStorageLocation := builder.ForBackupStorageLocation("velero", "default").Provider("myCloud").Bucket("bucket").Result()
@@ -144,5 +144,7 @@ status:
   warnings: 2
   # Number of errors that were logged by the backup.
   errors: 0
+  # An error that caused the entire backup to fail.
+  failureReason: ""

 ```
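For reference, a backup that this guard catches would surface with a status roughly like the following; the values are illustrative, with the failureReason wording taken from the controller change above.

```yaml
# Illustrative example: a backup abandoned in InProgress, failed by the reconcile guard.
status:
  phase: Failed
  failureReason: got a Backup with unexpected status "InProgress", this may be due to a restart of the controller during the backing up, mark it as "Failed"
  completionTimestamp: "2022-04-13T08:30:00Z"
```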