add PartiallyFailed phase for backups, log+continue on errors (#1386)

Signed-off-by: Steve Kriss <krisss@vmware.com>
pull/1415/head
Steve Kriss 2019-04-26 10:14:26 -06:00 committed by KubeKween
parent 42f351b000
commit 8392e6d83f
13 changed files with 355 additions and 145 deletions

@ -0,0 +1 @@
add PartiallyFailed phase for backups, log + continue on errors during backup process

@ -149,6 +149,10 @@ const (
// errors.
BackupPhaseCompleted BackupPhase = "Completed"
// BackupPhasePartiallyFailed means the backup has run to completion
// but encountered 1+ errors backing up individual items.
BackupPhasePartiallyFailed BackupPhase = "PartiallyFailed"
// BackupPhaseFailed means the backup ran but encountered an error that
// prevented it from completing successfully.
BackupPhaseFailed BackupPhase = "Failed"
@ -191,6 +195,16 @@ type BackupStatus struct {
// VolumeSnapshotsCompleted is the total number of successfully
// completed volume snapshots for this backup.
VolumeSnapshotsCompleted int `json:"volumeSnapshotsCompleted"`
// Warnings is a count of all warning messages that were generated during
// execution of the backup. The actual warnings are in the backup's log
// file in object storage.
Warnings int `json:"warnings"`
// Errors is a count of all error messages that were generated during
// execution of the backup. The actual errors are in the backup's log
// file in object storage.
Errors int `json:"errors"`
}
// +genclient
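
For orientation, here is a minimal sketch, not part of this change, of how a client of the API might interpret the new phase and counters. It reuses the api import alias seen elsewhere in this PR; the summarize helper is hypothetical.

package backupstatus

import (
	"fmt"

	api "github.com/heptio/velero/pkg/apis/velero/v1"
)

// summarize is a hypothetical helper returning a one-line outcome for a backup.
func summarize(b *api.Backup) string {
	switch b.Status.Phase {
	case api.BackupPhaseCompleted:
		return "backup completed with no errors"
	case api.BackupPhasePartiallyFailed:
		// Errors and Warnings are counts only; the messages themselves are in
		// the backup's log file in object storage.
		return fmt.Sprintf("backup partially failed: %d errors, %d warnings (see the backup log)",
			b.Status.Errors, b.Status.Warnings)
	case api.BackupPhaseFailed:
		return "backup failed before running to completion"
	default:
		return fmt.Sprintf("backup is still in phase %q", b.Status.Phase)
	}
}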

@ -30,7 +30,6 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime/schema"
kuberrs "k8s.io/apimachinery/pkg/util/errors"
api "github.com/heptio/velero/pkg/apis/velero/v1"
"github.com/heptio/velero/pkg/client"
@ -39,7 +38,6 @@ import (
"github.com/heptio/velero/pkg/podexec"
"github.com/heptio/velero/pkg/restic"
"github.com/heptio/velero/pkg/util/collections"
kubeutil "github.com/heptio/velero/pkg/util/kube"
)
// BackupVersion is the current backup version for Velero.
@ -207,17 +205,18 @@ type VolumeSnapshotterGetter interface {
}
// Backup backs up the items specified in the Backup, placing them in a gzip-compressed tar file
// written to backupFile. The finalized api.Backup is written to metadata.
func (kb *kubernetesBackupper) Backup(logger logrus.FieldLogger, backupRequest *Request, backupFile io.Writer, actions []velero.BackupItemAction, volumeSnapshotterGetter VolumeSnapshotterGetter) error {
// written to backupFile. The finalized api.Backup is written to metadata. Any error that represents
// a complete backup failure is returned. Errors that constitute partial failures (i.e. failures to
// back up individual resources that don't prevent the backup from continuing to be processed) are logged
// to the backup log.
func (kb *kubernetesBackupper) Backup(log logrus.FieldLogger, backupRequest *Request, backupFile io.Writer, actions []velero.BackupItemAction, volumeSnapshotterGetter VolumeSnapshotterGetter) error {
gzippedData := gzip.NewWriter(backupFile)
defer gzippedData.Close()
tw := tar.NewWriter(gzippedData)
defer tw.Close()
log := logger.WithField("backup", kubeutil.NamespaceAndName(backupRequest))
log.Info("Starting backup")
log.Info("Writing backup version file")
if err := kb.writeBackupVersion(tw); err != nil {
return errors.WithStack(err)
}
@ -276,21 +275,13 @@ func (kb *kubernetesBackupper) Backup(logger logrus.FieldLogger, backupRequest *
volumeSnapshotterGetter,
)
var errs []error
for _, group := range kb.discoveryHelper.Resources() {
if err := gb.backupGroup(group); err != nil {
errs = append(errs, err)
log.WithError(err).WithField("apiGroup", group.String()).Error("Error backing up API group")
}
}
err = kuberrs.Flatten(kuberrs.NewAggregate(errs))
if err == nil {
log.Infof("Backup completed successfully")
} else {
log.Infof("Backup completed with errors: %v", err)
}
return err
return nil
}
func (kb *kubernetesBackupper) writeBackupVersion(tw *tar.Writer) error {
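
The key change above is that per-group failures are now logged and skipped rather than aggregated into the return value, so Backup only returns errors that are fatal to the whole run. A minimal sketch of that pattern follows; the names backupAllGroups and backupOneGroup are placeholders, and only the sirupsen/logrus dependency already used here is assumed.

package backupsketch

import "github.com/sirupsen/logrus"

// backupAllGroups illustrates the pattern: log and continue on per-group errors,
// and return nil so the caller doesn't treat them as a total failure.
func backupAllGroups(log logrus.FieldLogger, groups []string, backupOneGroup func(string) error) error {
	for _, g := range groups {
		if err := backupOneGroup(g); err != nil {
			// Each logged error is later counted by the controller's log hook
			// and surfaces as the PartiallyFailed phase.
			log.WithError(err).WithField("apiGroup", g).Error("Error backing up API group")
		}
	}
	return nil
}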

@ -373,12 +373,64 @@ func TestBackup(t *testing.T) {
tests := []struct {
name string
backup *v1.Backup
actions []velero.BackupItemAction
expectedNamespaces *collections.IncludesExcludes
expectedResources *collections.IncludesExcludes
expectedHooks []resourceHook
backupGroupErrors map[*metav1.APIResourceList]error
expectedError error
}{
{
name: "error resolving actions returns an error",
backup: &v1.Backup{
Spec: v1.BackupSpec{
// cm - shortcut in legacy api group
// csr - shortcut in certificates.k8s.io api group
// roles - fully qualified in rbac.authorization.k8s.io api group
IncludedResources: []string{"cm", "csr", "roles"},
IncludedNamespaces: []string{"a", "b"},
ExcludedNamespaces: []string{"c", "d"},
},
},
actions: []velero.BackupItemAction{new(appliesToErrorAction)},
expectedNamespaces: collections.NewIncludesExcludes().Includes("a", "b").Excludes("c", "d"),
expectedResources: collections.NewIncludesExcludes().Includes("configmaps", "certificatesigningrequests.certificates.k8s.io", "roles.rbac.authorization.k8s.io"),
expectedHooks: []resourceHook{},
expectedError: errors.New("error calling AppliesTo"),
},
{
name: "error resolving hooks returns an error",
backup: &v1.Backup{
Spec: v1.BackupSpec{
// cm - shortcut in legacy api group
// csr - shortcut in certificates.k8s.io api group
// roles - fully qualified in rbac.authorization.k8s.io api group
IncludedResources: []string{"cm", "csr", "roles"},
IncludedNamespaces: []string{"a", "b"},
ExcludedNamespaces: []string{"c", "d"},
Hooks: v1.BackupHooks{
Resources: []v1.BackupResourceHookSpec{
{
Name: "hook-with-invalid-label-selector",
LabelSelector: &metav1.LabelSelector{
MatchExpressions: []metav1.LabelSelectorRequirement{
{
Key: "foo",
Operator: metav1.LabelSelectorOperator("nonexistent-operator"),
Values: []string{"bar"},
},
},
},
},
},
},
},
},
expectedNamespaces: collections.NewIncludesExcludes().Includes("a", "b").Excludes("c", "d"),
expectedResources: collections.NewIncludesExcludes().Includes("configmaps", "certificatesigningrequests.certificates.k8s.io", "roles.rbac.authorization.k8s.io"),
expectedHooks: []resourceHook{},
expectedError: errors.New("\"nonexistent-operator\" is not a valid pod selector operator"),
},
{
name: "happy path, no actions, no hooks, no errors",
backup: &v1.Backup{
@ -411,7 +463,7 @@ func TestBackup(t *testing.T) {
certificatesGroup: nil,
rbacGroup: errors.New("rbac error"),
},
expectedError: errors.New("[v1 error, rbac error]"),
expectedError: nil,
},
{
name: "hooks",
@ -509,7 +561,7 @@ func TestBackup(t *testing.T) {
mock.Anything, // restic backupper
mock.Anything, // pvc snapshot tracker
mock.Anything, // volume snapshotter getter
).Return(groupBackupper)
).Maybe().Return(groupBackupper)
for group, err := range test.backupGroupErrors {
groupBackupper.On("backupGroup", group).Return(err)
@ -522,7 +574,7 @@ func TestBackup(t *testing.T) {
groupBackupperFactory: groupBackupperFactory,
}
err := kb.Backup(logging.DefaultLogger(logrus.DebugLevel), req, new(bytes.Buffer), nil, nil)
err := kb.Backup(logging.DefaultLogger(logrus.DebugLevel), req, new(bytes.Buffer), test.actions, nil)
assert.Equal(t, test.expectedNamespaces, req.NamespaceIncludesExcludes)
assert.Equal(t, test.expectedResources, req.ResourceIncludesExcludes)
@ -538,6 +590,18 @@ func TestBackup(t *testing.T) {
}
}
// appliesToErrorAction is a backup item action that always returns
// an error when AppliesTo() is called.
type appliesToErrorAction struct{}
func (a *appliesToErrorAction) AppliesTo() (velero.ResourceSelector, error) {
return velero.ResourceSelector{}, errors.New("error calling AppliesTo")
}
func (a *appliesToErrorAction) Execute(item runtime.Unstructured, backup *v1.Backup) (runtime.Unstructured, []velero.ResourceIdentifier, error) {
panic("not implemented")
}
func TestBackupUsesNewCohabitatingResourcesForEachBackup(t *testing.T) {
groupBackupperFactory := &mockGroupBackupperFactory{}
kb := &kubernetesBackupper{

@ -24,7 +24,6 @@ import (
"github.com/sirupsen/logrus"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
kuberrs "k8s.io/apimachinery/pkg/util/errors"
"github.com/heptio/velero/pkg/client"
"github.com/heptio/velero/pkg/discovery"
@ -101,23 +100,7 @@ type defaultGroupBackupper struct {
// backupGroup backs up a single API group.
func (gb *defaultGroupBackupper) backupGroup(group *metav1.APIResourceList) error {
var (
errs []error
log = gb.log.WithField("group", group.GroupVersion)
rb = gb.resourceBackupperFactory.newResourceBackupper(
log,
gb.backupRequest,
gb.dynamicFactory,
gb.discoveryHelper,
gb.backedUpItems,
gb.cohabitatingResources,
gb.podCommandExecutor,
gb.tarWriter,
gb.resticBackupper,
gb.resticSnapshotTracker,
gb.volumeSnapshotterGetter,
)
)
log := gb.log.WithField("group", group.GroupVersion)
log.Infof("Backing up group")
@ -132,13 +115,27 @@ func (gb *defaultGroupBackupper) backupGroup(group *metav1.APIResourceList) erro
sortCoreGroup(group)
}
rb := gb.resourceBackupperFactory.newResourceBackupper(
log,
gb.backupRequest,
gb.dynamicFactory,
gb.discoveryHelper,
gb.backedUpItems,
gb.cohabitatingResources,
gb.podCommandExecutor,
gb.tarWriter,
gb.resticBackupper,
gb.resticSnapshotTracker,
gb.volumeSnapshotterGetter,
)
for _, resource := range group.APIResources {
if err := rb.backupResource(group, resource); err != nil {
errs = append(errs, err)
log.WithError(err).WithField("resource", resource.String()).Error("Error backing up API resource")
}
}
return kuberrs.NewAggregate(errs)
return nil
}
// sortCoreGroup sorts group as a coreGroup.

@ -160,7 +160,7 @@ func (ib *defaultItemBackupper) backupItem(logger logrus.FieldLogger, obj runtim
}
ib.backedUpItems[key] = struct{}{}
log.Info("Backing up resource")
log.Info("Backing up item")
log.Debug("Executing pre hooks")
if err := ib.itemHookHandler.handleHooks(log, groupResource, obj, ib.backupRequest.ResourceHooks, hookPhasePre); err != nil {
@ -192,7 +192,6 @@ func (ib *defaultItemBackupper) backupItem(logger logrus.FieldLogger, obj runtim
updatedObj, err := ib.executeActions(log, obj, groupResource, name, namespace, metadata)
if err != nil {
log.WithError(err).Error("Error executing item actions")
backupErrs = append(backupErrs, err)
// if there was an error running actions, execute post hooks and return
@ -309,11 +308,6 @@ func (ib *defaultItemBackupper) executeActions(
updatedItem, additionalItemIdentifiers, err := action.Execute(obj, ib.backupRequest.Backup)
if err != nil {
// We want this to show up in the log file at the place where the error occurs. When we return
// the error, it get aggregated with all the other ones at the end of the backup, making it
// harder to tell when it happened.
log.WithError(err).Error("error executing custom action")
return nil, errors.Wrapf(err, "error executing custom action (groupResource=%s, namespace=%s, name=%s)", groupResource.String(), namespace, name)
}
obj = updatedItem
@ -331,7 +325,7 @@ func (ib *defaultItemBackupper) executeActions(
additionalItem, err := client.Get(additionalItem.Name, metav1.GetOptions{})
if err != nil {
return nil, err
return nil, errors.WithStack(err)
}
if err = ib.additionalItemBackupper.backupItem(log, additionalItem, gvr.GroupResource()); err != nil {
@ -393,7 +387,7 @@ func (ib *defaultItemBackupper) takePVSnapshot(obj runtime.Unstructured, log log
// of this PV. If so, don't take a snapshot.
if pv.Spec.ClaimRef != nil {
if ib.resticSnapshotTracker.Has(pv.Spec.ClaimRef.Namespace, pv.Spec.ClaimRef.Name) {
log.Info("Skipping Persistent Volume snapshot because volume has already been backed up.")
log.Info("Skipping persistent volume snapshot because volume has already been backed up with restic.")
return nil
}
}
@ -433,7 +427,7 @@ func (ib *defaultItemBackupper) takePVSnapshot(obj runtime.Unstructured, log log
}
if volumeSnapshotter == nil {
log.Info("PersistentVolume is not a supported volume type for snapshots, skipping.")
log.Info("Persistent volume is not a supported volume type for snapshots, skipping.")
return nil
}
@ -447,17 +441,15 @@ func (ib *defaultItemBackupper) takePVSnapshot(obj runtime.Unstructured, log log
log.Info("Getting volume information")
volumeType, iops, err := volumeSnapshotter.GetVolumeInfo(volumeID, pvFailureDomainZone)
if err != nil {
log.WithError(err).Error("error getting volume info")
return errors.WithMessage(err, "error getting volume info")
}
log.Info("Snapshotting PersistentVolume")
log.Info("Snapshotting persistent volume")
snapshot := volumeSnapshot(ib.backupRequest.Backup, pv.Name, volumeID, volumeType, pvFailureDomainZone, location, iops)
var errs []error
snapshotID, err := volumeSnapshotter.CreateSnapshot(snapshot.Spec.ProviderVolumeID, snapshot.Spec.VolumeAZ, tags)
if err != nil {
log.WithError(err).Error("error creating snapshot")
errs = append(errs, errors.Wrap(err, "error taking snapshot of volume"))
snapshot.Status.Phase = volume.SnapshotPhaseFailed
} else {

@ -24,7 +24,7 @@ import (
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
kuberrs "k8s.io/apimachinery/pkg/util/errors"
kubeerrs "k8s.io/apimachinery/pkg/util/errors"
"github.com/heptio/velero/pkg/client"
"github.com/heptio/velero/pkg/discovery"
@ -102,22 +102,16 @@ type defaultResourceBackupper struct {
}
// backupResource backs up all the objects for a given group-version-resource.
func (rb *defaultResourceBackupper) backupResource(
group *metav1.APIResourceList,
resource metav1.APIResource,
) error {
var errs []error
func (rb *defaultResourceBackupper) backupResource(group *metav1.APIResourceList, resource metav1.APIResource) error {
log := rb.log.WithField("resource", resource.Name)
log.Info("Backing up resource")
gv, err := schema.ParseGroupVersion(group.GroupVersion)
if err != nil {
return errors.Wrapf(err, "error parsing GroupVersion %s", group.GroupVersion)
}
gr := schema.GroupResource{Group: gv.Group, Resource: resource.Name}
grString := gr.String()
log := rb.log.WithField("groupResource", grString)
log.Info("Evaluating resource")
clusterScoped := !resource.Namespaced
@ -142,8 +136,8 @@ func (rb *defaultResourceBackupper) backupResource(
}
}
if !rb.backupRequest.ResourceIncludesExcludes.ShouldInclude(grString) {
log.Infof("Resource is excluded")
if !rb.backupRequest.ResourceIncludesExcludes.ShouldInclude(gr.String()) {
log.Infof("Skipping resource because it's excluded")
return nil
}
@ -178,9 +172,8 @@ func (rb *defaultResourceBackupper) backupResource(
if gr == kuberesource.Namespaces && namespacesToList[0] != "" {
resourceClient, err := rb.dynamicFactory.ClientForGroupVersionResource(gv, resource, "")
if err != nil {
return err
}
log.WithError(err).Error("Error getting dynamic client")
} else {
var labelSelector labels.Selector
if rb.backupRequest.Spec.LabelSelector != nil {
labelSelector, err = metav1.LabelSelectorAsSelector(rb.backupRequest.Spec.LabelSelector)
@ -191,25 +184,27 @@ func (rb *defaultResourceBackupper) backupResource(
}
for _, ns := range namespacesToList {
log.WithField("namespace", ns).Info("Getting namespace")
log = log.WithField("namespace", ns)
log.Info("Getting namespace")
unstructured, err := resourceClient.Get(ns, metav1.GetOptions{})
if err != nil {
errs = append(errs, errors.Wrap(err, "error getting namespace"))
log.WithError(errors.WithStack(err)).Error("Error getting namespace")
continue
}
labels := labels.Set(unstructured.GetLabels())
if labelSelector != nil && !labelSelector.Matches(labels) {
log.WithField("name", unstructured.GetName()).Info("skipping item because it does not match the backup's label selector")
log.Info("Skipping namespace because it does not match the backup's label selector")
continue
}
if err := itemBackupper.backupItem(log, unstructured, gr); err != nil {
errs = append(errs, err)
log.WithError(errors.WithStack(err)).Error("Error backing up namespace")
}
}
return kuberrs.NewAggregate(errs)
return nil
}
}
// If we get here, we're backing up something other than namespaces
@ -218,9 +213,12 @@ func (rb *defaultResourceBackupper) backupResource(
}
for _, namespace := range namespacesToList {
log = log.WithField("namespace", namespace)
resourceClient, err := rb.dynamicFactory.ClientForGroupVersionResource(gv, resource, namespace)
if err != nil {
return err
log.WithError(err).Error("Error getting dynamic client")
continue
}
var labelSelector string
@ -228,44 +226,59 @@ func (rb *defaultResourceBackupper) backupResource(
labelSelector = metav1.FormatLabelSelector(selector)
}
log.WithField("namespace", namespace).Info("Listing items")
log.Info("Listing items")
unstructuredList, err := resourceClient.List(metav1.ListOptions{LabelSelector: labelSelector})
if err != nil {
return errors.WithStack(err)
log.WithError(errors.WithStack(err)).Error("Error listing items")
continue
}
// do the backup
items, err := meta.ExtractList(unstructuredList)
if err != nil {
return errors.WithStack(err)
log.WithError(errors.WithStack(err)).Error("Error extracting list")
continue
}
log.WithField("namespace", namespace).Infof("Retrieved %d items", len(items))
log.Infof("Retrieved %d items", len(items))
for _, item := range items {
unstructured, ok := item.(runtime.Unstructured)
if !ok {
errs = append(errs, errors.Errorf("unexpected type %T", item))
log.Errorf("Unexpected type %T", item)
continue
}
metadata, err := meta.Accessor(unstructured)
if err != nil {
errs = append(errs, errors.Wrapf(err, "unable to get a metadata accessor"))
log.WithError(errors.WithStack(err)).Error("Error getting a metadata accessor")
continue
}
if gr == kuberesource.Namespaces && !rb.backupRequest.NamespaceIncludesExcludes.ShouldInclude(metadata.GetName()) {
log.WithField("name", metadata.GetName()).Info("skipping namespace because it is excluded")
log.WithField("name", metadata.GetName()).Info("Skipping namespace because it's excluded")
continue
}
if err := itemBackupper.backupItem(log, unstructured, gr); err != nil {
errs = append(errs, err)
err = itemBackupper.backupItem(log, unstructured, gr)
if aggregate, ok := err.(kubeerrs.Aggregate); ok {
log.WithField("name", metadata.GetName()).Infof("%d errors encountered backup up item", len(aggregate.Errors()))
// log each error separately so we get error location info in the log, and an
// accurate count of errors
for _, err = range aggregate.Errors() {
log.WithError(err).WithField("name", metadata.GetName()).Error("Error backing up item")
}
continue
}
if err != nil {
log.WithError(err).WithField("name", metadata.GetName()).Error("Error backing up item")
continue
}
}
}
return kuberrs.NewAggregate(errs)
return nil
}
// getNamespacesToList examines ie and resolves the includes and excludes to a full list of
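
Note how backupItem's error is unwrapped above: when it is a kubeerrs.Aggregate, each underlying error is logged on its own so the log carries per-error location info and the error counter stays accurate. A condensed sketch of that unwrapping, using a hypothetical logEachError helper:

package backupsketch

import (
	"github.com/sirupsen/logrus"

	kubeerrs "k8s.io/apimachinery/pkg/util/errors"
)

// logEachError mirrors the unwrapping above: an Aggregate is split so every
// underlying error gets its own log entry (and therefore its own increment of
// the error counter); any other error is logged once.
func logEachError(log logrus.FieldLogger, err error) {
	if err == nil {
		return
	}
	if aggregate, ok := err.(kubeerrs.Aggregate); ok {
		for _, e := range aggregate.Errors() {
			log.WithError(e).Error("Error backing up item")
		}
		return
	}
	log.WithError(err).Error("Error backing up item")
}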

@ -50,7 +50,10 @@ func NewLogsCommand(f client.Factory) *cobra.Command {
cmd.Exit("Error checking for backup %q: %v", backupName, err)
}
if backup.Status.Phase != v1.BackupPhaseCompleted && backup.Status.Phase != v1.BackupPhaseFailed {
switch backup.Status.Phase {
case v1.BackupPhaseCompleted, v1.BackupPhasePartiallyFailed, v1.BackupPhaseFailed:
// terminal phases, do nothing.
default:
cmd.Exit("Logs for backup %q are not available until it's finished processing. Please wait "+
"until the backup has a phase of Completed or Failed and try again.", backupName)
}

@ -1,5 +1,5 @@
/*
Copyright 2017 the Velero contributors.
Copyright 2017, 2019 the Velero contributors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -47,7 +47,13 @@ func DescribeBackup(
if phase == "" {
phase = velerov1api.BackupPhaseNew
}
d.Printf("Phase:\t%s\n", phase)
logsNote := ""
if backup.Status.Phase == velerov1api.BackupPhaseFailed || backup.Status.Phase == velerov1api.BackupPhasePartiallyFailed {
logsNote = fmt.Sprintf(" (run `velero backup logs %s` for more information)", backup.Name)
}
d.Printf("Phase:\t%s%s\n", phase, logsNote)
status := backup.Status
if len(status.ValidationErrors) > 0 {
@ -58,6 +64,12 @@ func DescribeBackup(
}
}
if status.Phase == velerov1api.BackupPhasePartiallyFailed {
d.Println()
d.Printf("Errors:\t%d\n", status.Errors)
d.Printf("Warnings:\t%d\n", status.Warnings)
}
d.Println()
DescribeBackupSpec(d, backup.Spec)

@ -1,5 +1,5 @@
/*
Copyright 2017 the Velero contributors.
Copyright 2017, 2019 the Velero contributors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -85,13 +85,21 @@ func printBackup(backup *velerov1api.Backup, w io.Writer, options printers.Print
expiration = backup.CreationTimestamp.Add(backup.Spec.TTL.Duration)
}
status := backup.Status.Phase
status := string(backup.Status.Phase)
if status == "" {
status = velerov1api.BackupPhaseNew
status = string(velerov1api.BackupPhaseNew)
}
if backup.DeletionTimestamp != nil && !backup.DeletionTimestamp.Time.IsZero() {
status = "Deleting"
}
if status == string(velerov1api.BackupPhasePartiallyFailed) {
if backup.Status.Errors == 1 {
status = fmt.Sprintf("%s (1 error)", status)
} else {
status = fmt.Sprintf("%s (%d errors)", status, backup.Status.Errors)
}
}
location := backup.Spec.StorageLocation

@ -1,5 +1,5 @@
/*
Copyright 2017 the Velero contributors.
Copyright 2017, 2019 the Velero contributors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -214,16 +214,29 @@ func (c *backupController) processBackup(key string) error {
defer c.backupTracker.Delete(request.Namespace, request.Name)
log.Debug("Running backup")
// execution & upload of backup
backupScheduleName := request.GetLabels()[velerov1api.ScheduleNameLabel]
c.metrics.RegisterBackupAttempt(backupScheduleName)
// execution & upload of backup
if err := c.runBackup(request); err != nil {
// even though runBackup sets the backup's phase prior
// to uploading artifacts to object storage, we have to
// check for an error again here and update the phase if
// one is found, because there could've been an error
// while uploading artifacts to object storage, which would
// result in the backup being Failed.
log.WithError(err).Error("backup failed")
request.Status.Phase = velerov1api.BackupPhaseFailed
c.metrics.RegisterBackupFailed(backupScheduleName)
} else {
}
switch request.Status.Phase {
case velerov1api.BackupPhaseCompleted:
c.metrics.RegisterBackupSuccess(backupScheduleName)
case velerov1api.BackupPhasePartiallyFailed:
c.metrics.RegisterBackupPartialFailure(backupScheduleName)
case velerov1api.BackupPhaseFailed:
c.metrics.RegisterBackupFailed(backupScheduleName)
}
log.Debug("Updating backup's final status")
@ -411,9 +424,11 @@ func (c *backupController) validateAndGetSnapshotLocations(backup *velerov1api.B
return providerLocations, nil
}
// runBackup runs and uploads a validated backup. Any error returned from this function
// causes the backup to be Failed; if no error is returned, the backup's status's Errors
// field is checked to see if the backup was a partial failure.
func (c *backupController) runBackup(backup *pkgbackup.Request) error {
log := c.logger.WithField("backup", kubeutil.NamespaceAndName(backup))
log.Info("Starting backup")
c.logger.WithField("backup", kubeutil.NamespaceAndName(backup)).Info("Setting up backup log")
logFile, err := ioutil.TempFile("", "")
if err != nil {
@ -429,25 +444,31 @@ func (c *backupController) runBackup(backup *pkgbackup.Request) error {
// backup log failed for whatever reason.
logger := logging.DefaultLogger(c.backupLogLevel)
logger.Out = io.MultiWriter(os.Stdout, gzippedLogFile)
log = logger.WithField("backup", kubeutil.NamespaceAndName(backup))
log.Info("Starting backup")
logCounter := logging.NewLogCounterHook()
logger.Hooks.Add(logCounter)
backupLog := logger.WithField("backup", kubeutil.NamespaceAndName(backup))
backupLog.Info("Setting up backup temp file")
backupFile, err := ioutil.TempFile("", "")
if err != nil {
return errors.Wrap(err, "error creating temp file for backup")
}
defer closeAndRemoveFile(backupFile, log)
defer closeAndRemoveFile(backupFile, backupLog)
pluginManager := c.newPluginManager(log)
backupLog.Info("Setting up plugin manager")
pluginManager := c.newPluginManager(backupLog)
defer pluginManager.CleanupClients()
backupLog.Info("Getting backup item actions")
actions, err := pluginManager.GetBackupItemActions()
if err != nil {
return err
}
backupStore, err := c.newBackupStore(backup.StorageLocation, pluginManager, log)
backupLog.Info("Setting up backup store")
backupStore, err := c.newBackupStore(backup.StorageLocation, pluginManager, backupLog)
if err != nil {
return err
}
@ -462,17 +483,9 @@ func (c *backupController) runBackup(backup *pkgbackup.Request) error {
return errors.Errorf("backup already exists in object storage")
}
// Do the actual backup
var errs []error
if err := c.backupper.Backup(log, backup, backupFile, actions, pluginManager); err != nil {
errs = append(errs, err)
backup.Status.Phase = velerov1api.BackupPhaseFailed
} else {
backup.Status.Phase = velerov1api.BackupPhaseCompleted
}
if err := gzippedLogFile.Close(); err != nil {
c.logger.WithError(err).Error("error closing gzippedLogFile")
var fatalErrs []error
if err := c.backupper.Backup(backupLog, backup, backupFile, actions, pluginManager); err != nil {
fatalErrs = append(fatalErrs, err)
}
// Mark completion timestamp before serializing and uploading.
@ -486,21 +499,45 @@ func (c *backupController) runBackup(backup *pkgbackup.Request) error {
}
}
errs = append(errs, persistBackup(backup, backupFile, logFile, backupStore, c.logger)...)
errs = append(errs, recordBackupMetrics(backup.Backup, backupFile, c.metrics))
recordBackupMetrics(backupLog, backup.Backup, backupFile, c.metrics)
log.Info("Backup completed")
return kerrors.NewAggregate(errs)
if err := gzippedLogFile.Close(); err != nil {
c.logger.WithError(err).Error("error closing gzippedLogFile")
}
func recordBackupMetrics(backup *velerov1api.Backup, backupFile *os.File, serverMetrics *metrics.ServerMetrics) error {
backup.Status.Warnings = logCounter.GetCount(logrus.WarnLevel)
backup.Status.Errors = logCounter.GetCount(logrus.ErrorLevel)
// Assign finalize phase as close to end as possible so that any errors
// logged to backupLog are captured. This is done before uploading the
// artifacts to object storage so that the JSON representation of the
// backup in object storage has the terminal phase set.
switch {
case len(fatalErrs) > 0:
backup.Status.Phase = velerov1api.BackupPhaseFailed
case logCounter.GetCount(logrus.ErrorLevel) > 0:
backup.Status.Phase = velerov1api.BackupPhasePartiallyFailed
default:
backup.Status.Phase = velerov1api.BackupPhaseCompleted
}
if errs := persistBackup(backup, backupFile, logFile, backupStore, c.logger); len(errs) > 0 {
fatalErrs = append(fatalErrs, errs...)
}
c.logger.Info("Backup completed")
// if we return a non-nil error, the calling function will update
// the backup's phase to Failed.
return kerrors.NewAggregate(fatalErrs)
}
func recordBackupMetrics(log logrus.FieldLogger, backup *velerov1api.Backup, backupFile *os.File, serverMetrics *metrics.ServerMetrics) {
backupScheduleName := backup.GetLabels()[velerov1api.ScheduleNameLabel]
var backupSizeBytes int64
var err error
if backupFileStat, err := backupFile.Stat(); err != nil {
err = errors.Wrap(err, "error getting file info")
log.WithError(errors.WithStack(err)).Error("Error getting backup file info")
} else {
backupSizeBytes = backupFileStat.Size()
}
@ -512,8 +549,6 @@ func recordBackupMetrics(backup *velerov1api.Backup, backupFile *os.File, server
serverMetrics.RegisterVolumeSnapshotAttempts(backupScheduleName, backup.Status.VolumeSnapshotsAttempted)
serverMetrics.RegisterVolumeSnapshotSuccesses(backupScheduleName, backup.Status.VolumeSnapshotsCompleted)
serverMetrics.RegisterVolumeSnapshotFailures(backupScheduleName, backup.Status.VolumeSnapshotsAttempted-backup.Status.VolumeSnapshotsCompleted)
return err
}
func persistBackup(backup *pkgbackup.Request, backupContents, backupLog *os.File, backupStore persistence.BackupStore, log logrus.FieldLogger) []error {
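
The phase decision above reduces to: any fatal error means Failed, otherwise any logged errors mean PartiallyFailed, otherwise Completed. A condensed sketch of just that decision, assuming the logging package lives at pkg/util/logging; finalPhase is a hypothetical helper, not a function in this PR.

package controllersketch

import (
	"github.com/sirupsen/logrus"

	velerov1api "github.com/heptio/velero/pkg/apis/velero/v1"
	"github.com/heptio/velero/pkg/util/logging"
)

// finalPhase condenses the switch above: fatal errors win, otherwise any
// logged errors mean PartiallyFailed, otherwise Completed.
func finalPhase(fatalErrs []error, logCounter *logging.LogCounterHook) velerov1api.BackupPhase {
	switch {
	case len(fatalErrs) > 0:
		return velerov1api.BackupPhaseFailed
	case logCounter.GetCount(logrus.ErrorLevel) > 0:
		return velerov1api.BackupPhasePartiallyFailed
	default:
		return velerov1api.BackupPhaseCompleted
	}
}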

@ -33,6 +33,7 @@ const (
backupTotal = "backup_total"
backupAttemptTotal = "backup_attempt_total"
backupSuccessTotal = "backup_success_total"
backupPartialFailureTotal = "backup_partial_failure_total"
backupFailureTotal = "backup_failure_total"
backupDurationSeconds = "backup_duration_seconds"
backupDeletionAttemptTotal = "backup_deletion_attempt_total"
@ -89,6 +90,14 @@ func NewServerMetrics() *ServerMetrics {
},
[]string{scheduleLabel},
),
backupPartialFailureTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: backupPartialFailureTotal,
Help: "Total number of partially failed backups",
},
[]string{scheduleLabel},
),
backupFailureTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
@ -230,6 +239,9 @@ func (m *ServerMetrics) InitSchedule(scheduleName string) {
if c, ok := m.metrics[backupSuccessTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
if c, ok := m.metrics[backupPartialFailureTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
if c, ok := m.metrics[backupFailureTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
@ -296,6 +308,13 @@ func (m *ServerMetrics) RegisterBackupSuccess(backupSchedule string) {
}
}
// RegisterBackupPartialFailure records a partially failed backup.
func (m *ServerMetrics) RegisterBackupPartialFailure(backupSchedule string) {
if c, ok := m.metrics[backupPartialFailureTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(backupSchedule).Inc()
}
}
// RegisterBackupFailed records a failed backup.
func (m *ServerMetrics) RegisterBackupFailed(backupSchedule string) {
if c, ok := m.metrics[backupFailureTotal].(*prometheus.CounterVec); ok {
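
Usage is symmetric with the existing counters: InitSchedule seeds the new per-schedule series to zero and RegisterBackupPartialFailure increments it when a backup ends PartiallyFailed. A small usage sketch, assuming the package path pkg/metrics and a hypothetical "daily-backup" schedule name:

package main

import "github.com/heptio/velero/pkg/metrics"

func main() {
	m := metrics.NewServerMetrics()

	// Seed the per-schedule series (including the new partial-failure counter)
	// to zero so they exist before the first backup runs.
	m.InitSchedule("daily-backup")

	// Incremented by the backup controller when a backup for this schedule
	// ends in the PartiallyFailed phase.
	m.RegisterBackupPartialFailure("daily-backup")
}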

@ -0,0 +1,61 @@
/*
Copyright 2019 the Velero contributors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package logging
import (
"sync"
"github.com/sirupsen/logrus"
)
// LogCounterHook is a logrus hook that counts the number of log
// statements that have been written at each logrus level.
type LogCounterHook struct {
mu sync.RWMutex
counts map[logrus.Level]int
}
// NewLogCounterHook returns a pointer to an initialized LogCounterHook.
func NewLogCounterHook() *LogCounterHook {
return &LogCounterHook{
counts: make(map[logrus.Level]int),
}
}
// Levels returns the logrus levels that the hook should be fired for.
func (h *LogCounterHook) Levels() []logrus.Level {
return logrus.AllLevels
}
// Fire executes the hook's logic.
func (h *LogCounterHook) Fire(entry *logrus.Entry) error {
h.mu.Lock()
defer h.mu.Unlock()
h.counts[entry.Level]++
return nil
}
// GetCount returns the number of log statements that have been
// written at the specific level provided.
func (h *LogCounterHook) GetCount(level logrus.Level) int {
h.mu.RLock()
defer h.mu.RUnlock()
return h.counts[level]
}
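
A short usage sketch of the hook, assuming the package path pkg/util/logging: attach it to a logrus logger, write some entries, and read back per-level counts. This is the mechanism the backup controller uses to derive Status.Warnings and Status.Errors.

package main

import (
	"fmt"

	"github.com/sirupsen/logrus"

	"github.com/heptio/velero/pkg/util/logging"
)

func main() {
	logger := logrus.New()

	// Every entry written through logger (or an Entry derived from it)
	// increments the hook's per-level counters.
	counter := logging.NewLogCounterHook()
	logger.Hooks.Add(counter)

	logger.Warn("first warning")
	logger.Error("something went wrong")
	logger.WithField("backup", "ns/name").Error("something else went wrong")

	fmt.Println("warnings:", counter.GetCount(logrus.WarnLevel)) // 1
	fmt.Println("errors:", counter.GetCount(logrus.ErrorLevel))  // 2
}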