From 8392e6d83fdeb944ecd2fa1fe408f63ac8fd13f5 Mon Sep 17 00:00:00 2001 From: Steve Kriss Date: Fri, 26 Apr 2019 10:14:26 -0600 Subject: [PATCH] add PartiallyFailed phase for backups, log+continue on errors (#1386) Signed-off-by: Steve Kriss --- changelogs/unreleased/1386-skriss | 1 + pkg/apis/velero/v1/backup.go | 14 +++ pkg/backup/backup.go | 25 ++--- pkg/backup/backup_test.go | 70 +++++++++++++- pkg/backup/group_backupper.go | 37 ++++--- pkg/backup/item_backupper.go | 18 +--- pkg/backup/resource_backupper.go | 123 +++++++++++++----------- pkg/cmd/cli/backup/logs.go | 5 +- pkg/cmd/util/output/backup_describer.go | 16 ++- pkg/cmd/util/output/backup_printer.go | 14 ++- pkg/controller/backup_controller.go | 97 +++++++++++++------ pkg/metrics/metrics.go | 19 ++++ pkg/util/logging/log_counter_hook.go | 61 ++++++++++++ 13 files changed, 355 insertions(+), 145 deletions(-) create mode 100644 changelogs/unreleased/1386-skriss create mode 100644 pkg/util/logging/log_counter_hook.go diff --git a/changelogs/unreleased/1386-skriss b/changelogs/unreleased/1386-skriss new file mode 100644 index 000000000..436b16e13 --- /dev/null +++ b/changelogs/unreleased/1386-skriss @@ -0,0 +1 @@ +add PartiallyFailed phase for backups, log + continue on errors during backup process diff --git a/pkg/apis/velero/v1/backup.go b/pkg/apis/velero/v1/backup.go index 36c3e198f..a649f7b09 100644 --- a/pkg/apis/velero/v1/backup.go +++ b/pkg/apis/velero/v1/backup.go @@ -149,6 +149,10 @@ const ( // errors. BackupPhaseCompleted BackupPhase = "Completed" + // BackupPhasePartiallyFailed means the backup has run to completion + // but encountered 1+ errors backing up individual items. + BackupPhasePartiallyFailed BackupPhase = "PartiallyFailed" + // BackupPhaseFailed means the backup ran but encountered an error that // prevented it from completing successfully. 
BackupPhaseFailed BackupPhase = "Failed" @@ -191,6 +195,16 @@ type BackupStatus struct { // VolumeSnapshotsCompleted is the total number of successfully // completed volume snapshots for this backup. VolumeSnapshotsCompleted int `json:"volumeSnapshotsCompleted"` + + // Warnings is a count of all warning messages that were generated during + // execution of the backup. The actual warnings are in the backup's log + // file in object storage. + Warnings int `json:"warnings"` + + // Errors is a count of all error messages that were generated during + // execution of the backup. The actual errors are in the backup's log + // file in object storage. + Errors int `json:"errors"` } // +genclient diff --git a/pkg/backup/backup.go b/pkg/backup/backup.go index e0974995a..d244f764e 100644 --- a/pkg/backup/backup.go +++ b/pkg/backup/backup.go @@ -30,7 +30,6 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime/schema" - kuberrs "k8s.io/apimachinery/pkg/util/errors" api "github.com/heptio/velero/pkg/apis/velero/v1" "github.com/heptio/velero/pkg/client" @@ -39,7 +38,6 @@ import ( "github.com/heptio/velero/pkg/podexec" "github.com/heptio/velero/pkg/restic" "github.com/heptio/velero/pkg/util/collections" - kubeutil "github.com/heptio/velero/pkg/util/kube" ) // BackupVersion is the current backup version for Velero. @@ -207,17 +205,18 @@ type VolumeSnapshotterGetter interface { } // Backup backs up the items specified in the Backup, placing them in a gzip-compressed tar file -// written to backupFile. The finalized api.Backup is written to metadata. -func (kb *kubernetesBackupper) Backup(logger logrus.FieldLogger, backupRequest *Request, backupFile io.Writer, actions []velero.BackupItemAction, volumeSnapshotterGetter VolumeSnapshotterGetter) error { +// written to backupFile. The finalized api.Backup is written to metadata. Any error that represents +// a complete backup failure is returned. 
Errors that constitute partial failures (i.e. failures to +// back up individual resources that don't prevent the backup from continuing to be processed) are logged +// to the backup log. +func (kb *kubernetesBackupper) Backup(log logrus.FieldLogger, backupRequest *Request, backupFile io.Writer, actions []velero.BackupItemAction, volumeSnapshotterGetter VolumeSnapshotterGetter) error { gzippedData := gzip.NewWriter(backupFile) defer gzippedData.Close() tw := tar.NewWriter(gzippedData) defer tw.Close() - log := logger.WithField("backup", kubeutil.NamespaceAndName(backupRequest)) - log.Info("Starting backup") - + log.Info("Writing backup version file") if err := kb.writeBackupVersion(tw); err != nil { return errors.WithStack(err) } @@ -276,21 +275,13 @@ func (kb *kubernetesBackupper) Backup(logger logrus.FieldLogger, backupRequest * volumeSnapshotterGetter, ) - var errs []error for _, group := range kb.discoveryHelper.Resources() { if err := gb.backupGroup(group); err != nil { - errs = append(errs, err) + log.WithError(err).WithField("apiGroup", group.String()).Error("Error backing up API group") } } - err = kuberrs.Flatten(kuberrs.NewAggregate(errs)) - if err == nil { - log.Infof("Backup completed successfully") - } else { - log.Infof("Backup completed with errors: %v", err) - } - - return err + return nil } func (kb *kubernetesBackupper) writeBackupVersion(tw *tar.Writer) error { diff --git a/pkg/backup/backup_test.go b/pkg/backup/backup_test.go index 862973bc2..3ba3e4c6b 100644 --- a/pkg/backup/backup_test.go +++ b/pkg/backup/backup_test.go @@ -373,12 +373,64 @@ func TestBackup(t *testing.T) { tests := []struct { name string backup *v1.Backup + actions []velero.BackupItemAction expectedNamespaces *collections.IncludesExcludes expectedResources *collections.IncludesExcludes expectedHooks []resourceHook backupGroupErrors map[*metav1.APIResourceList]error expectedError error }{ + { + name: "error resolving actions returns an error", + backup: &v1.Backup{ + Spec: 
v1.BackupSpec{ + // cm - shortcut in legacy api group + // csr - shortcut in certificates.k8s.io api group + // roles - fully qualified in rbac.authorization.k8s.io api group + IncludedResources: []string{"cm", "csr", "roles"}, + IncludedNamespaces: []string{"a", "b"}, + ExcludedNamespaces: []string{"c", "d"}, + }, + }, + actions: []velero.BackupItemAction{new(appliesToErrorAction)}, + expectedNamespaces: collections.NewIncludesExcludes().Includes("a", "b").Excludes("c", "d"), + expectedResources: collections.NewIncludesExcludes().Includes("configmaps", "certificatesigningrequests.certificates.k8s.io", "roles.rbac.authorization.k8s.io"), + expectedHooks: []resourceHook{}, + expectedError: errors.New("error calling AppliesTo"), + }, + { + name: "error resolving hooks returns an error", + backup: &v1.Backup{ + Spec: v1.BackupSpec{ + // cm - shortcut in legacy api group + // csr - shortcut in certificates.k8s.io api group + // roles - fully qualified in rbac.authorization.k8s.io api group + IncludedResources: []string{"cm", "csr", "roles"}, + IncludedNamespaces: []string{"a", "b"}, + ExcludedNamespaces: []string{"c", "d"}, + Hooks: v1.BackupHooks{ + Resources: []v1.BackupResourceHookSpec{ + { + Name: "hook-with-invalid-label-selector", + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "foo", + Operator: metav1.LabelSelectorOperator("nonexistent-operator"), + Values: []string{"bar"}, + }, + }, + }, + }, + }, + }, + }, + }, + expectedNamespaces: collections.NewIncludesExcludes().Includes("a", "b").Excludes("c", "d"), + expectedResources: collections.NewIncludesExcludes().Includes("configmaps", "certificatesigningrequests.certificates.k8s.io", "roles.rbac.authorization.k8s.io"), + expectedHooks: []resourceHook{}, + expectedError: errors.New("\"nonexistent-operator\" is not a valid pod selector operator"), + }, { name: "happy path, no actions, no hooks, no errors", backup: &v1.Backup{ @@ -411,7 +463,7 @@ func 
TestBackup(t *testing.T) { certificatesGroup: nil, rbacGroup: errors.New("rbac error"), }, - expectedError: errors.New("[v1 error, rbac error]"), + expectedError: nil, }, { name: "hooks", @@ -509,7 +561,7 @@ func TestBackup(t *testing.T) { mock.Anything, // restic backupper mock.Anything, // pvc snapshot tracker mock.Anything, // volume snapshotter getter - ).Return(groupBackupper) + ).Maybe().Return(groupBackupper) for group, err := range test.backupGroupErrors { groupBackupper.On("backupGroup", group).Return(err) @@ -522,7 +574,7 @@ func TestBackup(t *testing.T) { groupBackupperFactory: groupBackupperFactory, } - err := kb.Backup(logging.DefaultLogger(logrus.DebugLevel), req, new(bytes.Buffer), nil, nil) + err := kb.Backup(logging.DefaultLogger(logrus.DebugLevel), req, new(bytes.Buffer), test.actions, nil) assert.Equal(t, test.expectedNamespaces, req.NamespaceIncludesExcludes) assert.Equal(t, test.expectedResources, req.ResourceIncludesExcludes) @@ -538,6 +590,18 @@ func TestBackup(t *testing.T) { } } +// appliesToErrorAction is a backup item action that always returns +// an error when AppliesTo() is called. 
+type appliesToErrorAction struct{} + +func (a *appliesToErrorAction) AppliesTo() (velero.ResourceSelector, error) { + return velero.ResourceSelector{}, errors.New("error calling AppliesTo") +} + +func (a *appliesToErrorAction) Execute(item runtime.Unstructured, backup *v1.Backup) (runtime.Unstructured, []velero.ResourceIdentifier, error) { + panic("not implemented") +} + func TestBackupUsesNewCohabitatingResourcesForEachBackup(t *testing.T) { groupBackupperFactory := &mockGroupBackupperFactory{} kb := &kubernetesBackupper{ diff --git a/pkg/backup/group_backupper.go b/pkg/backup/group_backupper.go index 82da5613d..237f96e64 100644 --- a/pkg/backup/group_backupper.go +++ b/pkg/backup/group_backupper.go @@ -24,7 +24,6 @@ import ( "github.com/sirupsen/logrus" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" - kuberrs "k8s.io/apimachinery/pkg/util/errors" "github.com/heptio/velero/pkg/client" "github.com/heptio/velero/pkg/discovery" @@ -101,23 +100,7 @@ type defaultGroupBackupper struct { // backupGroup backs up a single API group. 
func (gb *defaultGroupBackupper) backupGroup(group *metav1.APIResourceList) error { - var ( - errs []error - log = gb.log.WithField("group", group.GroupVersion) - rb = gb.resourceBackupperFactory.newResourceBackupper( - log, - gb.backupRequest, - gb.dynamicFactory, - gb.discoveryHelper, - gb.backedUpItems, - gb.cohabitatingResources, - gb.podCommandExecutor, - gb.tarWriter, - gb.resticBackupper, - gb.resticSnapshotTracker, - gb.volumeSnapshotterGetter, - ) - ) + log := gb.log.WithField("group", group.GroupVersion) log.Infof("Backing up group") @@ -132,13 +115,27 @@ func (gb *defaultGroupBackupper) backupGroup(group *metav1.APIResourceList) erro sortCoreGroup(group) } + rb := gb.resourceBackupperFactory.newResourceBackupper( + log, + gb.backupRequest, + gb.dynamicFactory, + gb.discoveryHelper, + gb.backedUpItems, + gb.cohabitatingResources, + gb.podCommandExecutor, + gb.tarWriter, + gb.resticBackupper, + gb.resticSnapshotTracker, + gb.volumeSnapshotterGetter, + ) + for _, resource := range group.APIResources { if err := rb.backupResource(group, resource); err != nil { - errs = append(errs, err) + log.WithError(err).WithField("resource", resource.String()).Error("Error backing up API resource") } } - return kuberrs.NewAggregate(errs) + return nil } // sortCoreGroup sorts group as a coreGroup. 
diff --git a/pkg/backup/item_backupper.go b/pkg/backup/item_backupper.go index 9ddf649a2..6b4b83698 100644 --- a/pkg/backup/item_backupper.go +++ b/pkg/backup/item_backupper.go @@ -160,7 +160,7 @@ func (ib *defaultItemBackupper) backupItem(logger logrus.FieldLogger, obj runtim } ib.backedUpItems[key] = struct{}{} - log.Info("Backing up resource") + log.Info("Backing up item") log.Debug("Executing pre hooks") if err := ib.itemHookHandler.handleHooks(log, groupResource, obj, ib.backupRequest.ResourceHooks, hookPhasePre); err != nil { @@ -192,7 +192,6 @@ func (ib *defaultItemBackupper) backupItem(logger logrus.FieldLogger, obj runtim updatedObj, err := ib.executeActions(log, obj, groupResource, name, namespace, metadata) if err != nil { - log.WithError(err).Error("Error executing item actions") backupErrs = append(backupErrs, err) // if there was an error running actions, execute post hooks and return @@ -309,11 +308,6 @@ func (ib *defaultItemBackupper) executeActions( updatedItem, additionalItemIdentifiers, err := action.Execute(obj, ib.backupRequest.Backup) if err != nil { - // We want this to show up in the log file at the place where the error occurs. When we return - // the error, it get aggregated with all the other ones at the end of the backup, making it - // harder to tell when it happened. - log.WithError(err).Error("error executing custom action") - return nil, errors.Wrapf(err, "error executing custom action (groupResource=%s, namespace=%s, name=%s)", groupResource.String(), namespace, name) } obj = updatedItem @@ -331,7 +325,7 @@ func (ib *defaultItemBackupper) executeActions( additionalItem, err := client.Get(additionalItem.Name, metav1.GetOptions{}) if err != nil { - return nil, err + return nil, errors.WithStack(err) } if err = ib.additionalItemBackupper.backupItem(log, additionalItem, gvr.GroupResource()); err != nil { @@ -393,7 +387,7 @@ func (ib *defaultItemBackupper) takePVSnapshot(obj runtime.Unstructured, log log // of this PV. 
If so, don't take a snapshot. if pv.Spec.ClaimRef != nil { if ib.resticSnapshotTracker.Has(pv.Spec.ClaimRef.Namespace, pv.Spec.ClaimRef.Name) { - log.Info("Skipping Persistent Volume snapshot because volume has already been backed up.") + log.Info("Skipping persistent volume snapshot because volume has already been backed up with restic.") return nil } } @@ -433,7 +427,7 @@ func (ib *defaultItemBackupper) takePVSnapshot(obj runtime.Unstructured, log log } if volumeSnapshotter == nil { - log.Info("PersistentVolume is not a supported volume type for snapshots, skipping.") + log.Info("Persistent volume is not a supported volume type for snapshots, skipping.") return nil } @@ -447,17 +441,15 @@ func (ib *defaultItemBackupper) takePVSnapshot(obj runtime.Unstructured, log log log.Info("Getting volume information") volumeType, iops, err := volumeSnapshotter.GetVolumeInfo(volumeID, pvFailureDomainZone) if err != nil { - log.WithError(err).Error("error getting volume info") return errors.WithMessage(err, "error getting volume info") } - log.Info("Snapshotting PersistentVolume") + log.Info("Snapshotting persistent volume") snapshot := volumeSnapshot(ib.backupRequest.Backup, pv.Name, volumeID, volumeType, pvFailureDomainZone, location, iops) var errs []error snapshotID, err := volumeSnapshotter.CreateSnapshot(snapshot.Spec.ProviderVolumeID, snapshot.Spec.VolumeAZ, tags) if err != nil { - log.WithError(err).Error("error creating snapshot") errs = append(errs, errors.Wrap(err, "error taking snapshot of volume")) snapshot.Status.Phase = volume.SnapshotPhaseFailed } else { diff --git a/pkg/backup/resource_backupper.go b/pkg/backup/resource_backupper.go index 296ad4511..86d1ca8a4 100644 --- a/pkg/backup/resource_backupper.go +++ b/pkg/backup/resource_backupper.go @@ -24,7 +24,7 @@ import ( "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" - kuberrs "k8s.io/apimachinery/pkg/util/errors" + kubeerrs 
"k8s.io/apimachinery/pkg/util/errors" "github.com/heptio/velero/pkg/client" "github.com/heptio/velero/pkg/discovery" @@ -102,22 +102,16 @@ type defaultResourceBackupper struct { } // backupResource backs up all the objects for a given group-version-resource. -func (rb *defaultResourceBackupper) backupResource( - group *metav1.APIResourceList, - resource metav1.APIResource, -) error { - var errs []error +func (rb *defaultResourceBackupper) backupResource(group *metav1.APIResourceList, resource metav1.APIResource) error { + log := rb.log.WithField("resource", resource.Name) + + log.Info("Backing up resource") gv, err := schema.ParseGroupVersion(group.GroupVersion) if err != nil { return errors.Wrapf(err, "error parsing GroupVersion %s", group.GroupVersion) } gr := schema.GroupResource{Group: gv.Group, Resource: resource.Name} - grString := gr.String() - - log := rb.log.WithField("groupResource", grString) - - log.Info("Evaluating resource") clusterScoped := !resource.Namespaced @@ -142,8 +136,8 @@ func (rb *defaultResourceBackupper) backupResource( } } - if !rb.backupRequest.ResourceIncludesExcludes.ShouldInclude(grString) { - log.Infof("Resource is excluded") + if !rb.backupRequest.ResourceIncludesExcludes.ShouldInclude(gr.String()) { + log.Infof("Skipping resource because it's excluded") return nil } @@ -178,38 +172,39 @@ func (rb *defaultResourceBackupper) backupResource( if gr == kuberesource.Namespaces && namespacesToList[0] != "" { resourceClient, err := rb.dynamicFactory.ClientForGroupVersionResource(gv, resource, "") if err != nil { - return err + log.WithError(err).Error("Error getting dynamic client") + } else { + var labelSelector labels.Selector + if rb.backupRequest.Spec.LabelSelector != nil { + labelSelector, err = metav1.LabelSelectorAsSelector(rb.backupRequest.Spec.LabelSelector) + if err != nil { + // This should never happen... 
+ return errors.Wrap(err, "invalid label selector") + } + } + + for _, ns := range namespacesToList { + log = log.WithField("namespace", ns) + log.Info("Getting namespace") + unstructured, err := resourceClient.Get(ns, metav1.GetOptions{}) + if err != nil { + log.WithError(errors.WithStack(err)).Error("Error getting namespace") + continue + } + + labels := labels.Set(unstructured.GetLabels()) + if labelSelector != nil && !labelSelector.Matches(labels) { + log.Info("Skipping namespace because it does not match the backup's label selector") + continue + } + + if err := itemBackupper.backupItem(log, unstructured, gr); err != nil { + log.WithError(errors.WithStack(err)).Error("Error backing up namespace") + } + } + + return nil } - - var labelSelector labels.Selector - if rb.backupRequest.Spec.LabelSelector != nil { - labelSelector, err = metav1.LabelSelectorAsSelector(rb.backupRequest.Spec.LabelSelector) - if err != nil { - // This should never happen... - return errors.Wrap(err, "invalid label selector") - } - } - - for _, ns := range namespacesToList { - log.WithField("namespace", ns).Info("Getting namespace") - unstructured, err := resourceClient.Get(ns, metav1.GetOptions{}) - if err != nil { - errs = append(errs, errors.Wrap(err, "error getting namespace")) - continue - } - - labels := labels.Set(unstructured.GetLabels()) - if labelSelector != nil && !labelSelector.Matches(labels) { - log.WithField("name", unstructured.GetName()).Info("skipping item because it does not match the backup's label selector") - continue - } - - if err := itemBackupper.backupItem(log, unstructured, gr); err != nil { - errs = append(errs, err) - } - } - - return kuberrs.NewAggregate(errs) } // If we get here, we're backing up something other than namespaces @@ -218,9 +213,12 @@ func (rb *defaultResourceBackupper) backupResource( } for _, namespace := range namespacesToList { + log = log.WithField("namespace", namespace) + resourceClient, err := 
rb.dynamicFactory.ClientForGroupVersionResource(gv, resource, namespace) if err != nil { - return err + log.WithError(err).Error("Error getting dynamic client") + continue } var labelSelector string @@ -228,44 +226,59 @@ func (rb *defaultResourceBackupper) backupResource( labelSelector = metav1.FormatLabelSelector(selector) } - log.WithField("namespace", namespace).Info("Listing items") + log.Info("Listing items") unstructuredList, err := resourceClient.List(metav1.ListOptions{LabelSelector: labelSelector}) if err != nil { - return errors.WithStack(err) + log.WithError(errors.WithStack(err)).Error("Error listing items") + continue } // do the backup items, err := meta.ExtractList(unstructuredList) if err != nil { - return errors.WithStack(err) + log.WithError(errors.WithStack(err)).Error("Error extracting list") + continue } - log.WithField("namespace", namespace).Infof("Retrieved %d items", len(items)) + log.Infof("Retrieved %d items", len(items)) + for _, item := range items { unstructured, ok := item.(runtime.Unstructured) if !ok { - errs = append(errs, errors.Errorf("unexpected type %T", item)) + log.Errorf("Unexpected type %T", item) continue } metadata, err := meta.Accessor(unstructured) if err != nil { - errs = append(errs, errors.Wrapf(err, "unable to get a metadata accessor")) + log.WithError(errors.WithStack(err)).Error("Error getting a metadata accessor") continue } if gr == kuberesource.Namespaces && !rb.backupRequest.NamespaceIncludesExcludes.ShouldInclude(metadata.GetName()) { - log.WithField("name", metadata.GetName()).Info("skipping namespace because it is excluded") + log.WithField("name", metadata.GetName()).Info("Skipping namespace because it's excluded") continue } - if err := itemBackupper.backupItem(log, unstructured, gr); err != nil { - errs = append(errs, err) + err = itemBackupper.backupItem(log, unstructured, gr) + if aggregate, ok := err.(kubeerrs.Aggregate); ok { + log.WithField("name", metadata.GetName()).Infof("%d errors encountered 
backing up item", len(aggregate.Errors())) + // log each error separately so we get error location info in the log, and an + // accurate count of errors + for _, err = range aggregate.Errors() { + log.WithError(err).WithField("name", metadata.GetName()).Error("Error backing up item") + } + + continue + } + if err != nil { + log.WithError(err).WithField("name", metadata.GetName()).Error("Error backing up item") + continue + } } } - return kuberrs.NewAggregate(errs) + return nil } // getNamespacesToList examines ie and resolves the includes and excludes to a full list of diff --git a/pkg/cmd/cli/backup/logs.go b/pkg/cmd/cli/backup/logs.go index 960c785a8..16753536d 100644 --- a/pkg/cmd/cli/backup/logs.go +++ b/pkg/cmd/cli/backup/logs.go @@ -50,7 +50,10 @@ func NewLogsCommand(f client.Factory) *cobra.Command { cmd.Exit("Error checking for backup %q: %v", backupName, err) } - if backup.Status.Phase != v1.BackupPhaseCompleted && backup.Status.Phase != v1.BackupPhaseFailed { + switch backup.Status.Phase { + case v1.BackupPhaseCompleted, v1.BackupPhasePartiallyFailed, v1.BackupPhaseFailed: + // terminal phases, do nothing. + default: cmd.Exit("Logs for backup %q are not available until it's finished processing. Please wait "+ "until the backup has a phase of Completed or Failed and try again.", backupName) } diff --git a/pkg/cmd/util/output/backup_describer.go b/pkg/cmd/util/output/backup_describer.go index eb9b81b29..0bfdb2be5 100644 --- a/pkg/cmd/util/output/backup_describer.go +++ b/pkg/cmd/util/output/backup_describer.go @@ -1,5 +1,5 @@ /* -Copyright 2017 the Velero contributors. +Copyright 2017, 2019 the Velero contributors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -47,7 +47,13 @@ func DescribeBackup( if phase == "" { phase = velerov1api.BackupPhaseNew } - d.Printf("Phase:\t%s\n", phase) + + logsNote := "" + if backup.Status.Phase == velerov1api.BackupPhaseFailed || backup.Status.Phase == velerov1api.BackupPhasePartiallyFailed { + logsNote = fmt.Sprintf(" (run `velero backup logs %s` for more information)", backup.Name) + } + + d.Printf("Phase:\t%s%s\n", phase, logsNote) status := backup.Status if len(status.ValidationErrors) > 0 { @@ -58,6 +64,12 @@ func DescribeBackup( } } + if status.Phase == velerov1api.BackupPhasePartiallyFailed { + d.Println() + d.Printf("Errors:\t%d\n", status.Errors) + d.Printf("Warnings:\t%d\n", status.Warnings) + } + d.Println() DescribeBackupSpec(d, backup.Spec) diff --git a/pkg/cmd/util/output/backup_printer.go b/pkg/cmd/util/output/backup_printer.go index 6e7111b23..336bb3bf3 100644 --- a/pkg/cmd/util/output/backup_printer.go +++ b/pkg/cmd/util/output/backup_printer.go @@ -1,5 +1,5 @@ /* -Copyright 2017 the Velero contributors. +Copyright 2017, 2019 the Velero contributors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -85,13 +85,21 @@ func printBackup(backup *velerov1api.Backup, w io.Writer, options printers.Print expiration = backup.CreationTimestamp.Add(backup.Spec.TTL.Duration) } - status := backup.Status.Phase + status := string(backup.Status.Phase) if status == "" { - status = velerov1api.BackupPhaseNew + status = string(velerov1api.BackupPhaseNew) } if backup.DeletionTimestamp != nil && !backup.DeletionTimestamp.Time.IsZero() { status = "Deleting" } + if status == string(velerov1api.BackupPhasePartiallyFailed) { + if backup.Status.Errors == 1 { + status = fmt.Sprintf("%s (1 error)", status) + } else { + status = fmt.Sprintf("%s (%d errors)", status, backup.Status.Errors) + } + + } location := backup.Spec.StorageLocation diff --git a/pkg/controller/backup_controller.go b/pkg/controller/backup_controller.go index 1b91157d6..c516fdec3 100644 --- a/pkg/controller/backup_controller.go +++ b/pkg/controller/backup_controller.go @@ -1,5 +1,5 @@ /* -Copyright 2017 the Velero contributors. +Copyright 2017, 2019 the Velero contributors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -214,16 +214,29 @@ func (c *backupController) processBackup(key string) error { defer c.backupTracker.Delete(request.Namespace, request.Name) log.Debug("Running backup") - // execution & upload of backup + backupScheduleName := request.GetLabels()[velerov1api.ScheduleNameLabel] c.metrics.RegisterBackupAttempt(backupScheduleName) + // execution & upload of backup if err := c.runBackup(request); err != nil { + // even though runBackup sets the backup's phase prior + // to uploading artifacts to object storage, we have to + // check for an error again here and update the phase if + // one is found, because there could've been an error + // while uploading artifacts to object storage, which would + // result in the backup being Failed. 
log.WithError(err).Error("backup failed") request.Status.Phase = velerov1api.BackupPhaseFailed - c.metrics.RegisterBackupFailed(backupScheduleName) - } else { + } + + switch request.Status.Phase { + case velerov1api.BackupPhaseCompleted: c.metrics.RegisterBackupSuccess(backupScheduleName) + case velerov1api.BackupPhasePartiallyFailed: + c.metrics.RegisterBackupPartialFailure(backupScheduleName) + case velerov1api.BackupPhaseFailed: + c.metrics.RegisterBackupFailed(backupScheduleName) } log.Debug("Updating backup's final status") @@ -411,9 +424,11 @@ func (c *backupController) validateAndGetSnapshotLocations(backup *velerov1api.B return providerLocations, nil } +// runBackup runs and uploads a validated backup. Any error returned from this function +// causes the backup to be Failed; if no error is returned, the backup's status's Errors +// field is checked to see if the backup was a partial failure. func (c *backupController) runBackup(backup *pkgbackup.Request) error { - log := c.logger.WithField("backup", kubeutil.NamespaceAndName(backup)) - log.Info("Starting backup") + c.logger.WithField("backup", kubeutil.NamespaceAndName(backup)).Info("Setting up backup log") logFile, err := ioutil.TempFile("", "") if err != nil { @@ -429,25 +444,31 @@ func (c *backupController) runBackup(backup *pkgbackup.Request) error { // backup log failed for whatever reason. 
logger := logging.DefaultLogger(c.backupLogLevel) logger.Out = io.MultiWriter(os.Stdout, gzippedLogFile) - log = logger.WithField("backup", kubeutil.NamespaceAndName(backup)) - log.Info("Starting backup") + logCounter := logging.NewLogCounterHook() + logger.Hooks.Add(logCounter) + backupLog := logger.WithField("backup", kubeutil.NamespaceAndName(backup)) + + backupLog.Info("Setting up backup temp file") backupFile, err := ioutil.TempFile("", "") if err != nil { return errors.Wrap(err, "error creating temp file for backup") } - defer closeAndRemoveFile(backupFile, log) + defer closeAndRemoveFile(backupFile, backupLog) - pluginManager := c.newPluginManager(log) + backupLog.Info("Setting up plugin manager") + pluginManager := c.newPluginManager(backupLog) defer pluginManager.CleanupClients() + backupLog.Info("Getting backup item actions") actions, err := pluginManager.GetBackupItemActions() if err != nil { return err } - backupStore, err := c.newBackupStore(backup.StorageLocation, pluginManager, log) + backupLog.Info("Setting up backup store") + backupStore, err := c.newBackupStore(backup.StorageLocation, pluginManager, backupLog) if err != nil { return err } @@ -462,17 +483,9 @@ func (c *backupController) runBackup(backup *pkgbackup.Request) error { return errors.Errorf("backup already exists in object storage") } - // Do the actual backup - var errs []error - if err := c.backupper.Backup(log, backup, backupFile, actions, pluginManager); err != nil { - errs = append(errs, err) - backup.Status.Phase = velerov1api.BackupPhaseFailed - } else { - backup.Status.Phase = velerov1api.BackupPhaseCompleted - } - - if err := gzippedLogFile.Close(); err != nil { - c.logger.WithError(err).Error("error closing gzippedLogFile") + var fatalErrs []error + if err := c.backupper.Backup(backupLog, backup, backupFile, actions, pluginManager); err != nil { + fatalErrs = append(fatalErrs, err) } // Mark completion timestamp before serializing and uploading. 
@@ -486,21 +499,45 @@ func (c *backupController) runBackup(backup *pkgbackup.Request) error { } } - errs = append(errs, persistBackup(backup, backupFile, logFile, backupStore, c.logger)...) - errs = append(errs, recordBackupMetrics(backup.Backup, backupFile, c.metrics)) + recordBackupMetrics(backupLog, backup.Backup, backupFile, c.metrics) - log.Info("Backup completed") + if err := gzippedLogFile.Close(); err != nil { + c.logger.WithError(err).Error("error closing gzippedLogFile") + } - return kerrors.NewAggregate(errs) + backup.Status.Warnings = logCounter.GetCount(logrus.WarnLevel) + backup.Status.Errors = logCounter.GetCount(logrus.ErrorLevel) + + // Assign finalize phase as close to end as possible so that any errors + // logged to backupLog are captured. This is done before uploading the + // artifacts to object storage so that the JSON representation of the + // backup in object storage has the terminal phase set. + switch { + case len(fatalErrs) > 0: + backup.Status.Phase = velerov1api.BackupPhaseFailed + case logCounter.GetCount(logrus.ErrorLevel) > 0: + backup.Status.Phase = velerov1api.BackupPhasePartiallyFailed + default: + backup.Status.Phase = velerov1api.BackupPhaseCompleted + } + + if errs := persistBackup(backup, backupFile, logFile, backupStore, c.logger); len(errs) > 0 { + fatalErrs = append(fatalErrs, errs...) + } + + c.logger.Info("Backup completed") + + // if we return a non-nil error, the calling function will update + // the backup's phase to Failed. 
+ return kerrors.NewAggregate(fatalErrs) } -func recordBackupMetrics(backup *velerov1api.Backup, backupFile *os.File, serverMetrics *metrics.ServerMetrics) error { +func recordBackupMetrics(log logrus.FieldLogger, backup *velerov1api.Backup, backupFile *os.File, serverMetrics *metrics.ServerMetrics) { backupScheduleName := backup.GetLabels()[velerov1api.ScheduleNameLabel] var backupSizeBytes int64 - var err error if backupFileStat, err := backupFile.Stat(); err != nil { - err = errors.Wrap(err, "error getting file info") + log.WithError(errors.WithStack(err)).Error("Error getting backup file info") } else { backupSizeBytes = backupFileStat.Size() } @@ -512,8 +549,6 @@ func recordBackupMetrics(backup *velerov1api.Backup, backupFile *os.File, server serverMetrics.RegisterVolumeSnapshotAttempts(backupScheduleName, backup.Status.VolumeSnapshotsAttempted) serverMetrics.RegisterVolumeSnapshotSuccesses(backupScheduleName, backup.Status.VolumeSnapshotsCompleted) serverMetrics.RegisterVolumeSnapshotFailures(backupScheduleName, backup.Status.VolumeSnapshotsAttempted-backup.Status.VolumeSnapshotsCompleted) - - return err } func persistBackup(backup *pkgbackup.Request, backupContents, backupLog *os.File, backupStore persistence.BackupStore, log logrus.FieldLogger) []error { diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 3b4565e8a..330783947 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -33,6 +33,7 @@ const ( backupTotal = "backup_total" backupAttemptTotal = "backup_attempt_total" backupSuccessTotal = "backup_success_total" + backupPartialFailureTotal = "backup_partial_failure_total" backupFailureTotal = "backup_failure_total" backupDurationSeconds = "backup_duration_seconds" backupDeletionAttemptTotal = "backup_deletion_attempt_total" @@ -89,6 +90,14 @@ func NewServerMetrics() *ServerMetrics { }, []string{scheduleLabel}, ), + backupPartialFailureTotal: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: metricNamespace, + 
Name: backupPartialFailureTotal, + Help: "Total number of partially failed backups", + }, + []string{scheduleLabel}, + ), backupFailureTotal: prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: metricNamespace, @@ -230,6 +239,9 @@ func (m *ServerMetrics) InitSchedule(scheduleName string) { if c, ok := m.metrics[backupSuccessTotal].(*prometheus.CounterVec); ok { c.WithLabelValues(scheduleName).Set(0) } + if c, ok := m.metrics[backupPartialFailureTotal].(*prometheus.CounterVec); ok { + c.WithLabelValues(scheduleName).Set(0) + } if c, ok := m.metrics[backupFailureTotal].(*prometheus.CounterVec); ok { c.WithLabelValues(scheduleName).Set(0) } @@ -296,6 +308,13 @@ func (m *ServerMetrics) RegisterBackupSuccess(backupSchedule string) { } } +// RegisterBackupPartialFailure records a partially failed backup. +func (m *ServerMetrics) RegisterBackupPartialFailure(backupSchedule string) { + if c, ok := m.metrics[backupPartialFailureTotal].(*prometheus.CounterVec); ok { + c.WithLabelValues(backupSchedule).Inc() + } +} + // RegisterBackupFailed records a failed backup. func (m *ServerMetrics) RegisterBackupFailed(backupSchedule string) { if c, ok := m.metrics[backupFailureTotal].(*prometheus.CounterVec); ok { diff --git a/pkg/util/logging/log_counter_hook.go b/pkg/util/logging/log_counter_hook.go new file mode 100644 index 000000000..d8d53359c --- /dev/null +++ b/pkg/util/logging/log_counter_hook.go @@ -0,0 +1,61 @@ +/* +Copyright 2019 the Velero contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package logging + +import ( + "sync" + + "github.com/sirupsen/logrus" +) + +// LogCounterHook is a logrus hook that counts the number of log +// statements that have been written at each logrus level. +type LogCounterHook struct { + mu sync.RWMutex + counts map[logrus.Level]int +} + +// NewLogCounterHook returns a pointer to an initialized LogCounterHook. +func NewLogCounterHook() *LogCounterHook { + return &LogCounterHook{ + counts: make(map[logrus.Level]int), + } +} + +// Levels returns the logrus levels that the hook should be fired for. +func (h *LogCounterHook) Levels() []logrus.Level { + return logrus.AllLevels +} + +// Fire executes the hook's logic. +func (h *LogCounterHook) Fire(entry *logrus.Entry) error { + h.mu.Lock() + defer h.mu.Unlock() + + h.counts[entry.Level]++ + + return nil +} + +// GetCount returns the number of log statements that have been +// written at the specific level provided. +func (h *LogCounterHook) GetCount(level logrus.Level) int { + h.mu.RLock() + defer h.mu.RUnlock() + + return h.counts[level] +}