add PartiallyFailed phase for backups, log+continue on errors (#1386)
Signed-off-by: Steve Kriss <krisss@vmware.com>
pull/1415/head
parent 42f351b000
commit 8392e6d83f
@@ -0,0 +1 @@
add PartiallyFailed phase for backups, log + continue on errors during backup process
@@ -149,6 +149,10 @@ const (
// errors.
BackupPhaseCompleted BackupPhase = "Completed"

// BackupPhasePartiallyFailed means the backup has run to completion
// but encountered 1+ errors backing up individual items.
BackupPhasePartiallyFailed BackupPhase = "PartiallyFailed"

// BackupPhaseFailed means the backup ran but encountered an error that
// prevented it from completing successfully.
BackupPhaseFailed BackupPhase = "Failed"

@@ -191,6 +195,16 @@ type BackupStatus struct {
// VolumeSnapshotsCompleted is the total number of successfully
// completed volume snapshots for this backup.
VolumeSnapshotsCompleted int `json:"volumeSnapshotsCompleted"`

// Warnings is a count of all warning messages that were generated during
// execution of the backup. The actual warnings are in the backup's log
// file in object storage.
Warnings int `json:"warnings"`

// Errors is a count of all error messages that were generated during
// execution of the backup. The actual errors are in the backup's log
// file in object storage.
Errors int `json:"errors"`
}

// +genclient
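
Illustrative sketch (an editor's addition, not part of this commit): how a consumer of the API above might surface the new PartiallyFailed phase together with the Errors/Warnings counts. The BackupPhase values mirror the constants in the diff; the backupSummary helper is hypothetical.

    package main

    import "fmt"

    // BackupPhase mirrors the string-typed phase declared above.
    type BackupPhase string

    const (
        BackupPhaseCompleted       BackupPhase = "Completed"
        BackupPhasePartiallyFailed BackupPhase = "PartiallyFailed"
        BackupPhaseFailed          BackupPhase = "Failed"
    )

    // backupSummary (hypothetical) renders a one-line summary from the phase plus
    // the Errors/Warnings counts now carried in BackupStatus.
    func backupSummary(phase BackupPhase, errs, warnings int) string {
        switch phase {
        case BackupPhasePartiallyFailed:
            return fmt.Sprintf("%s (%d errors, %d warnings; details are in the backup log in object storage)", phase, errs, warnings)
        case BackupPhaseCompleted, BackupPhaseFailed:
            return string(phase)
        default:
            return string(phase) + " (still processing)"
        }
    }

    func main() {
        fmt.Println(backupSummary(BackupPhasePartiallyFailed, 3, 1))
    }
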
@@ -30,7 +30,6 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime/schema"
kuberrs "k8s.io/apimachinery/pkg/util/errors"

api "github.com/heptio/velero/pkg/apis/velero/v1"
"github.com/heptio/velero/pkg/client"

@@ -39,7 +38,6 @@ import (
"github.com/heptio/velero/pkg/podexec"
"github.com/heptio/velero/pkg/restic"
"github.com/heptio/velero/pkg/util/collections"
kubeutil "github.com/heptio/velero/pkg/util/kube"
)

// BackupVersion is the current backup version for Velero.

@@ -207,17 +205,18 @@ type VolumeSnapshotterGetter interface {
}

// Backup backs up the items specified in the Backup, placing them in a gzip-compressed tar file
// written to backupFile. The finalized api.Backup is written to metadata.
func (kb *kubernetesBackupper) Backup(logger logrus.FieldLogger, backupRequest *Request, backupFile io.Writer, actions []velero.BackupItemAction, volumeSnapshotterGetter VolumeSnapshotterGetter) error {
// written to backupFile. The finalized api.Backup is written to metadata. Any error that represents
// a complete backup failure is returned. Errors that constitute partial failures (i.e. failures to
// back up individual resources that don't prevent the backup from continuing to be processed) are logged
// to the backup log.
func (kb *kubernetesBackupper) Backup(log logrus.FieldLogger, backupRequest *Request, backupFile io.Writer, actions []velero.BackupItemAction, volumeSnapshotterGetter VolumeSnapshotterGetter) error {
gzippedData := gzip.NewWriter(backupFile)
defer gzippedData.Close()

tw := tar.NewWriter(gzippedData)
defer tw.Close()

log := logger.WithField("backup", kubeutil.NamespaceAndName(backupRequest))
log.Info("Starting backup")

log.Info("Writing backup version file")
if err := kb.writeBackupVersion(tw); err != nil {
return errors.WithStack(err)
}

@@ -276,21 +275,13 @@ func (kb *kubernetesBackupper) Backup(logger logrus.FieldLogger, backupRequest *
volumeSnapshotterGetter,
)

var errs []error
for _, group := range kb.discoveryHelper.Resources() {
if err := gb.backupGroup(group); err != nil {
errs = append(errs, err)
log.WithError(err).WithField("apiGroup", group.String()).Error("Error backing up API group")
}
}

err = kuberrs.Flatten(kuberrs.NewAggregate(errs))
if err == nil {
log.Infof("Backup completed successfully")
} else {
log.Infof("Backup completed with errors: %v", err)
}

return err
return nil
}

func (kb *kubernetesBackupper) writeBackupVersion(tw *tar.Writer) error {
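
Illustrative sketch (an editor's addition, not part of this commit) of the log-and-continue behavior the new Backup doc comment describes: per-group failures are logged and processing continues, rather than being aggregated into the returned error. The names here (backupGroup, the group list) are stand-ins, not Velero APIs.

    package main

    import (
        "errors"
        "fmt"
        "log"
    )

    // backupGroup stands in for the per-API-group backup call; "batch" fails
    // purely for demonstration.
    func backupGroup(name string) error {
        if name == "batch" {
            return errors.New("simulated failure")
        }
        return nil
    }

    func main() {
        groups := []string{"", "apps", "batch", "rbac.authorization.k8s.io"}

        // Log and continue: an individual group failure is recorded in the log
        // (where it will be counted) but no longer aborts the backup or bubbles
        // up as a returned error.
        for _, g := range groups {
            if err := backupGroup(g); err != nil {
                log.Printf("error backing up API group %q: %v", g, err)
                continue
            }
            fmt.Printf("backed up API group %q\n", g)
        }
    }
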
@@ -373,12 +373,64 @@ func TestBackup(t *testing.T) {
tests := []struct {
name string
backup *v1.Backup
actions []velero.BackupItemAction
expectedNamespaces *collections.IncludesExcludes
expectedResources *collections.IncludesExcludes
expectedHooks []resourceHook
backupGroupErrors map[*metav1.APIResourceList]error
expectedError error
}{
{
name: "error resolving actions returns an error",
backup: &v1.Backup{
Spec: v1.BackupSpec{
// cm - shortcut in legacy api group
// csr - shortcut in certificates.k8s.io api group
// roles - fully qualified in rbac.authorization.k8s.io api group
IncludedResources: []string{"cm", "csr", "roles"},
IncludedNamespaces: []string{"a", "b"},
ExcludedNamespaces: []string{"c", "d"},
},
},
actions: []velero.BackupItemAction{new(appliesToErrorAction)},
expectedNamespaces: collections.NewIncludesExcludes().Includes("a", "b").Excludes("c", "d"),
expectedResources: collections.NewIncludesExcludes().Includes("configmaps", "certificatesigningrequests.certificates.k8s.io", "roles.rbac.authorization.k8s.io"),
expectedHooks: []resourceHook{},
expectedError: errors.New("error calling AppliesTo"),
},
{
name: "error resolving hooks returns an error",
backup: &v1.Backup{
Spec: v1.BackupSpec{
// cm - shortcut in legacy api group
// csr - shortcut in certificates.k8s.io api group
// roles - fully qualified in rbac.authorization.k8s.io api group
IncludedResources: []string{"cm", "csr", "roles"},
IncludedNamespaces: []string{"a", "b"},
ExcludedNamespaces: []string{"c", "d"},
Hooks: v1.BackupHooks{
Resources: []v1.BackupResourceHookSpec{
{
Name: "hook-with-invalid-label-selector",
LabelSelector: &metav1.LabelSelector{
MatchExpressions: []metav1.LabelSelectorRequirement{
{
Key: "foo",
Operator: metav1.LabelSelectorOperator("nonexistent-operator"),
Values: []string{"bar"},
},
},
},
},
},
},
},
},
expectedNamespaces: collections.NewIncludesExcludes().Includes("a", "b").Excludes("c", "d"),
expectedResources: collections.NewIncludesExcludes().Includes("configmaps", "certificatesigningrequests.certificates.k8s.io", "roles.rbac.authorization.k8s.io"),
expectedHooks: []resourceHook{},
expectedError: errors.New("\"nonexistent-operator\" is not a valid pod selector operator"),
},
{
name: "happy path, no actions, no hooks, no errors",
backup: &v1.Backup{

@@ -411,7 +463,7 @@ func TestBackup(t *testing.T) {
certificatesGroup: nil,
rbacGroup: errors.New("rbac error"),
},
expectedError: errors.New("[v1 error, rbac error]"),
expectedError: nil,
},
{
name: "hooks",

@@ -509,7 +561,7 @@ func TestBackup(t *testing.T) {
mock.Anything, // restic backupper
mock.Anything, // pvc snapshot tracker
mock.Anything, // volume snapshotter getter
).Return(groupBackupper)
).Maybe().Return(groupBackupper)

for group, err := range test.backupGroupErrors {
groupBackupper.On("backupGroup", group).Return(err)

@@ -522,7 +574,7 @@ func TestBackup(t *testing.T) {
groupBackupperFactory: groupBackupperFactory,
}

err := kb.Backup(logging.DefaultLogger(logrus.DebugLevel), req, new(bytes.Buffer), nil, nil)
err := kb.Backup(logging.DefaultLogger(logrus.DebugLevel), req, new(bytes.Buffer), test.actions, nil)

assert.Equal(t, test.expectedNamespaces, req.NamespaceIncludesExcludes)
assert.Equal(t, test.expectedResources, req.ResourceIncludesExcludes)

@@ -538,6 +590,18 @@ func TestBackup(t *testing.T) {
}
}

// appliesToErrorAction is a backup item action that always returns
// an error when AppliesTo() is called.
type appliesToErrorAction struct{}

func (a *appliesToErrorAction) AppliesTo() (velero.ResourceSelector, error) {
return velero.ResourceSelector{}, errors.New("error calling AppliesTo")
}

func (a *appliesToErrorAction) Execute(item runtime.Unstructured, backup *v1.Backup) (runtime.Unstructured, []velero.ResourceIdentifier, error) {
panic("not implemented")
}

func TestBackupUsesNewCohabitatingResourcesForEachBackup(t *testing.T) {
groupBackupperFactory := &mockGroupBackupperFactory{}
kb := &kubernetesBackupper{
@@ -24,7 +24,6 @@ import (
"github.com/sirupsen/logrus"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
kuberrs "k8s.io/apimachinery/pkg/util/errors"

"github.com/heptio/velero/pkg/client"
"github.com/heptio/velero/pkg/discovery"

@@ -101,23 +100,7 @@ type defaultGroupBackupper struct {

// backupGroup backs up a single API group.
func (gb *defaultGroupBackupper) backupGroup(group *metav1.APIResourceList) error {
var (
errs []error
log = gb.log.WithField("group", group.GroupVersion)
rb = gb.resourceBackupperFactory.newResourceBackupper(
log,
gb.backupRequest,
gb.dynamicFactory,
gb.discoveryHelper,
gb.backedUpItems,
gb.cohabitatingResources,
gb.podCommandExecutor,
gb.tarWriter,
gb.resticBackupper,
gb.resticSnapshotTracker,
gb.volumeSnapshotterGetter,
)
)
log := gb.log.WithField("group", group.GroupVersion)

log.Infof("Backing up group")

@@ -132,13 +115,27 @@ func (gb *defaultGroupBackupper) backupGroup(group *metav1.APIResourceList) erro
sortCoreGroup(group)
}

rb := gb.resourceBackupperFactory.newResourceBackupper(
log,
gb.backupRequest,
gb.dynamicFactory,
gb.discoveryHelper,
gb.backedUpItems,
gb.cohabitatingResources,
gb.podCommandExecutor,
gb.tarWriter,
gb.resticBackupper,
gb.resticSnapshotTracker,
gb.volumeSnapshotterGetter,
)

for _, resource := range group.APIResources {
if err := rb.backupResource(group, resource); err != nil {
errs = append(errs, err)
log.WithError(err).WithField("resource", resource.String()).Error("Error backing up API resource")
}
}

return kuberrs.NewAggregate(errs)
return nil
}

// sortCoreGroup sorts group as a coreGroup.
@@ -160,7 +160,7 @@ func (ib *defaultItemBackupper) backupItem(logger logrus.FieldLogger, obj runtim
}
ib.backedUpItems[key] = struct{}{}

log.Info("Backing up resource")
log.Info("Backing up item")

log.Debug("Executing pre hooks")
if err := ib.itemHookHandler.handleHooks(log, groupResource, obj, ib.backupRequest.ResourceHooks, hookPhasePre); err != nil {

@@ -192,7 +192,6 @@ func (ib *defaultItemBackupper) backupItem(logger logrus.FieldLogger, obj runtim

updatedObj, err := ib.executeActions(log, obj, groupResource, name, namespace, metadata)
if err != nil {
log.WithError(err).Error("Error executing item actions")
backupErrs = append(backupErrs, err)

// if there was an error running actions, execute post hooks and return

@@ -309,11 +308,6 @@ func (ib *defaultItemBackupper) executeActions(

updatedItem, additionalItemIdentifiers, err := action.Execute(obj, ib.backupRequest.Backup)
if err != nil {
// We want this to show up in the log file at the place where the error occurs. When we return
// the error, it get aggregated with all the other ones at the end of the backup, making it
// harder to tell when it happened.
log.WithError(err).Error("error executing custom action")

return nil, errors.Wrapf(err, "error executing custom action (groupResource=%s, namespace=%s, name=%s)", groupResource.String(), namespace, name)
}
obj = updatedItem

@@ -331,7 +325,7 @@ func (ib *defaultItemBackupper) executeActions(

additionalItem, err := client.Get(additionalItem.Name, metav1.GetOptions{})
if err != nil {
return nil, err
return nil, errors.WithStack(err)
}

if err = ib.additionalItemBackupper.backupItem(log, additionalItem, gvr.GroupResource()); err != nil {

@@ -393,7 +387,7 @@ func (ib *defaultItemBackupper) takePVSnapshot(obj runtime.Unstructured, log log
// of this PV. If so, don't take a snapshot.
if pv.Spec.ClaimRef != nil {
if ib.resticSnapshotTracker.Has(pv.Spec.ClaimRef.Namespace, pv.Spec.ClaimRef.Name) {
log.Info("Skipping Persistent Volume snapshot because volume has already been backed up.")
log.Info("Skipping persistent volume snapshot because volume has already been backed up with restic.")
return nil
}
}

@@ -433,7 +427,7 @@ func (ib *defaultItemBackupper) takePVSnapshot(obj runtime.Unstructured, log log
}

if volumeSnapshotter == nil {
log.Info("PersistentVolume is not a supported volume type for snapshots, skipping.")
log.Info("Persistent volume is not a supported volume type for snapshots, skipping.")
return nil
}

@@ -447,17 +441,15 @@ func (ib *defaultItemBackupper) takePVSnapshot(obj runtime.Unstructured, log log
log.Info("Getting volume information")
volumeType, iops, err := volumeSnapshotter.GetVolumeInfo(volumeID, pvFailureDomainZone)
if err != nil {
log.WithError(err).Error("error getting volume info")
return errors.WithMessage(err, "error getting volume info")
}

log.Info("Snapshotting PersistentVolume")
log.Info("Snapshotting persistent volume")
snapshot := volumeSnapshot(ib.backupRequest.Backup, pv.Name, volumeID, volumeType, pvFailureDomainZone, location, iops)

var errs []error
snapshotID, err := volumeSnapshotter.CreateSnapshot(snapshot.Spec.ProviderVolumeID, snapshot.Spec.VolumeAZ, tags)
if err != nil {
log.WithError(err).Error("error creating snapshot")
errs = append(errs, errors.Wrap(err, "error taking snapshot of volume"))
snapshot.Status.Phase = volume.SnapshotPhaseFailed
} else {
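
A minimal stdlib sketch (an editor's addition, not part of this commit) of the pattern the comment above describes: log the error at the point where it occurs so it lands in the backup log with its context, then wrap it with identifying details for the caller. It approximates the pkg/errors Wrapf/WithStack usage in the diff with fmt.Errorf and %w; the function names are stand-ins.

    package main

    import (
        "errors"
        "fmt"
        "log"
    )

    // runCustomAction stands in for a backup item action's Execute call.
    func runCustomAction() error {
        return errors.New("plugin failed")
    }

    func executeActions(groupResource, namespace, name string) error {
        if err := runCustomAction(); err != nil {
            // Log at the point of failure so the message appears in the backup
            // log where it happened, then wrap with identifying context.
            log.Printf("error executing custom action: %v", err)
            return fmt.Errorf("error executing custom action (groupResource=%s, namespace=%s, name=%s): %w",
                groupResource, namespace, name, err)
        }
        return nil
    }

    func main() {
        if err := executeActions("pods", "default", "nginx"); err != nil {
            fmt.Println(err)
        }
    }
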
@@ -24,7 +24,7 @@ import (
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
kuberrs "k8s.io/apimachinery/pkg/util/errors"
kubeerrs "k8s.io/apimachinery/pkg/util/errors"

"github.com/heptio/velero/pkg/client"
"github.com/heptio/velero/pkg/discovery"

@@ -102,22 +102,16 @@ type defaultResourceBackupper struct {
}

// backupResource backs up all the objects for a given group-version-resource.
func (rb *defaultResourceBackupper) backupResource(
group *metav1.APIResourceList,
resource metav1.APIResource,
) error {
var errs []error
func (rb *defaultResourceBackupper) backupResource(group *metav1.APIResourceList, resource metav1.APIResource) error {
log := rb.log.WithField("resource", resource.Name)

log.Info("Backing up resource")

gv, err := schema.ParseGroupVersion(group.GroupVersion)
if err != nil {
return errors.Wrapf(err, "error parsing GroupVersion %s", group.GroupVersion)
}
gr := schema.GroupResource{Group: gv.Group, Resource: resource.Name}
grString := gr.String()

log := rb.log.WithField("groupResource", grString)

log.Info("Evaluating resource")

clusterScoped := !resource.Namespaced

@@ -142,8 +136,8 @@ func (rb *defaultResourceBackupper) backupResource(
}
}

if !rb.backupRequest.ResourceIncludesExcludes.ShouldInclude(grString) {
log.Infof("Resource is excluded")
if !rb.backupRequest.ResourceIncludesExcludes.ShouldInclude(gr.String()) {
log.Infof("Skipping resource because it's excluded")
return nil
}

@@ -178,9 +172,8 @@ func (rb *defaultResourceBackupper) backupResource(
if gr == kuberesource.Namespaces && namespacesToList[0] != "" {
resourceClient, err := rb.dynamicFactory.ClientForGroupVersionResource(gv, resource, "")
if err != nil {
return err
}

log.WithError(err).Error("Error getting dynamic client")
} else {
var labelSelector labels.Selector
if rb.backupRequest.Spec.LabelSelector != nil {
labelSelector, err = metav1.LabelSelectorAsSelector(rb.backupRequest.Spec.LabelSelector)

@@ -191,25 +184,27 @@ func (rb *defaultResourceBackupper) backupResource(
}

for _, ns := range namespacesToList {
log.WithField("namespace", ns).Info("Getting namespace")
log = log.WithField("namespace", ns)
log.Info("Getting namespace")
unstructured, err := resourceClient.Get(ns, metav1.GetOptions{})
if err != nil {
errs = append(errs, errors.Wrap(err, "error getting namespace"))
log.WithError(errors.WithStack(err)).Error("Error getting namespace")
continue
}

labels := labels.Set(unstructured.GetLabels())
if labelSelector != nil && !labelSelector.Matches(labels) {
log.WithField("name", unstructured.GetName()).Info("skipping item because it does not match the backup's label selector")
log.Info("Skipping namespace because it does not match the backup's label selector")
continue
}

if err := itemBackupper.backupItem(log, unstructured, gr); err != nil {
errs = append(errs, err)
log.WithError(errors.WithStack(err)).Error("Error backing up namespace")
}
}

return kuberrs.NewAggregate(errs)
return nil
}
}

// If we get here, we're backing up something other than namespaces

@@ -218,9 +213,12 @@ func (rb *defaultResourceBackupper) backupResource(
}

for _, namespace := range namespacesToList {
log = log.WithField("namespace", namespace)

resourceClient, err := rb.dynamicFactory.ClientForGroupVersionResource(gv, resource, namespace)
if err != nil {
return err
log.WithError(err).Error("Error getting dynamic client")
continue
}

var labelSelector string

@@ -228,44 +226,59 @@ func (rb *defaultResourceBackupper) backupResource(
labelSelector = metav1.FormatLabelSelector(selector)
}

log.WithField("namespace", namespace).Info("Listing items")
log.Info("Listing items")
unstructuredList, err := resourceClient.List(metav1.ListOptions{LabelSelector: labelSelector})
if err != nil {
return errors.WithStack(err)
log.WithError(errors.WithStack(err)).Error("Error listing items")
continue
}

// do the backup
items, err := meta.ExtractList(unstructuredList)
if err != nil {
return errors.WithStack(err)
log.WithError(errors.WithStack(err)).Error("Error extracting list")
continue
}

log.WithField("namespace", namespace).Infof("Retrieved %d items", len(items))
log.Infof("Retrieved %d items", len(items))

for _, item := range items {
unstructured, ok := item.(runtime.Unstructured)
if !ok {
errs = append(errs, errors.Errorf("unexpected type %T", item))
log.Errorf("Unexpected type %T", item)
continue
}

metadata, err := meta.Accessor(unstructured)
if err != nil {
errs = append(errs, errors.Wrapf(err, "unable to get a metadata accessor"))
log.WithError(errors.WithStack(err)).Error("Error getting a metadata accessor")
continue
}

if gr == kuberesource.Namespaces && !rb.backupRequest.NamespaceIncludesExcludes.ShouldInclude(metadata.GetName()) {
log.WithField("name", metadata.GetName()).Info("skipping namespace because it is excluded")
log.WithField("name", metadata.GetName()).Info("Skipping namespace because it's excluded")
continue
}

if err := itemBackupper.backupItem(log, unstructured, gr); err != nil {
errs = append(errs, err)
err = itemBackupper.backupItem(log, unstructured, gr)
if aggregate, ok := err.(kubeerrs.Aggregate); ok {
log.WithField("name", metadata.GetName()).Infof("%d errors encountered backup up item", len(aggregate.Errors()))
// log each error separately so we get error location info in the log, and an
// accurate count of errors
for _, err = range aggregate.Errors() {
log.WithError(err).WithField("name", metadata.GetName()).Error("Error backing up item")
}

continue
}
if err != nil {
log.WithError(err).WithField("name", metadata.GetName()).Error("Error backing up item")
continue
}
}
}

return kuberrs.NewAggregate(errs)
return nil
}

// getNamespacesToList examines ie and resolves the includes and excludes to a full list of
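
Illustrative sketch (an editor's addition, not part of this commit) of the Aggregate-unwrapping shown above: when backing up an item returns a k8s.io/apimachinery Aggregate, each wrapped error is logged on its own line so the log-based error count stays accurate. backupItem here is a stand-in that fabricates an aggregate.

    package main

    import (
        "errors"
        "log"

        kubeerrs "k8s.io/apimachinery/pkg/util/errors"
    )

    // backupItem stands in for itemBackupper.backupItem and simulates an item
    // whose backup hit several independent problems.
    func backupItem() error {
        return kubeerrs.NewAggregate([]error{
            errors.New("snapshot failed"),
            errors.New("pre hook timed out"),
        })
    }

    func main() {
        err := backupItem()

        // Unwrap an Aggregate and log each error separately for an accurate count.
        if aggregate, ok := err.(kubeerrs.Aggregate); ok {
            log.Printf("%d errors encountered backing up item", len(aggregate.Errors()))
            for _, e := range aggregate.Errors() {
                log.Printf("error backing up item: %v", e)
            }
            return
        }
        if err != nil {
            log.Printf("error backing up item: %v", err)
        }
    }
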
@@ -50,7 +50,10 @@ func NewLogsCommand(f client.Factory) *cobra.Command {
cmd.Exit("Error checking for backup %q: %v", backupName, err)
}

if backup.Status.Phase != v1.BackupPhaseCompleted && backup.Status.Phase != v1.BackupPhaseFailed {
switch backup.Status.Phase {
case v1.BackupPhaseCompleted, v1.BackupPhasePartiallyFailed, v1.BackupPhaseFailed:
// terminal phases, do nothing.
default:
cmd.Exit("Logs for backup %q are not available until it's finished processing. Please wait "+
"until the backup has a phase of Completed or Failed and try again.", backupName)
}
@@ -1,5 +1,5 @@
/*
Copyright 2017 the Velero contributors.
Copyright 2017, 2019 the Velero contributors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

@@ -47,7 +47,13 @@ func DescribeBackup(
if phase == "" {
phase = velerov1api.BackupPhaseNew
}
d.Printf("Phase:\t%s\n", phase)

logsNote := ""
if backup.Status.Phase == velerov1api.BackupPhaseFailed || backup.Status.Phase == velerov1api.BackupPhasePartiallyFailed {
logsNote = fmt.Sprintf(" (run `velero backup logs %s` for more information)", backup.Name)
}

d.Printf("Phase:\t%s%s\n", phase, logsNote)

status := backup.Status
if len(status.ValidationErrors) > 0 {

@@ -58,6 +64,12 @@ func DescribeBackup(
}
}

if status.Phase == velerov1api.BackupPhasePartiallyFailed {
d.Println()
d.Printf("Errors:\t%d\n", status.Errors)
d.Printf("Warnings:\t%d\n", status.Warnings)
}

d.Println()
DescribeBackupSpec(d, backup.Spec)
@@ -1,5 +1,5 @@
/*
Copyright 2017 the Velero contributors.
Copyright 2017, 2019 the Velero contributors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

@@ -85,13 +85,21 @@ func printBackup(backup *velerov1api.Backup, w io.Writer, options printers.Print
expiration = backup.CreationTimestamp.Add(backup.Spec.TTL.Duration)
}

status := backup.Status.Phase
status := string(backup.Status.Phase)
if status == "" {
status = velerov1api.BackupPhaseNew
status = string(velerov1api.BackupPhaseNew)
}
if backup.DeletionTimestamp != nil && !backup.DeletionTimestamp.Time.IsZero() {
status = "Deleting"
}
if status == string(velerov1api.BackupPhasePartiallyFailed) {
if backup.Status.Errors == 1 {
status = fmt.Sprintf("%s (1 error)", status)
} else {
status = fmt.Sprintf("%s (%d errors)", status, backup.Status.Errors)
}
}

location := backup.Spec.StorageLocation
@@ -1,5 +1,5 @@
/*
Copyright 2017 the Velero contributors.
Copyright 2017, 2019 the Velero contributors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

@@ -214,16 +214,29 @@ func (c *backupController) processBackup(key string) error {
defer c.backupTracker.Delete(request.Namespace, request.Name)

log.Debug("Running backup")
// execution & upload of backup

backupScheduleName := request.GetLabels()[velerov1api.ScheduleNameLabel]
c.metrics.RegisterBackupAttempt(backupScheduleName)

// execution & upload of backup
if err := c.runBackup(request); err != nil {
// even though runBackup sets the backup's phase prior
// to uploading artifacts to object storage, we have to
// check for an error again here and update the phase if
// one is found, because there could've been an error
// while uploading artifacts to object storage, which would
// result in the backup being Failed.
log.WithError(err).Error("backup failed")
request.Status.Phase = velerov1api.BackupPhaseFailed
c.metrics.RegisterBackupFailed(backupScheduleName)
} else {
}

switch request.Status.Phase {
case velerov1api.BackupPhaseCompleted:
c.metrics.RegisterBackupSuccess(backupScheduleName)
case velerov1api.BackupPhasePartiallyFailed:
c.metrics.RegisterBackupPartialFailure(backupScheduleName)
case velerov1api.BackupPhaseFailed:
c.metrics.RegisterBackupFailed(backupScheduleName)
}

log.Debug("Updating backup's final status")

@@ -411,9 +424,11 @@ func (c *backupController) validateAndGetSnapshotLocations(backup *velerov1api.B
return providerLocations, nil
}

// runBackup runs and uploads a validated backup. Any error returned from this function
// causes the backup to be Failed; if no error is returned, the backup's status's Errors
// field is checked to see if the backup was a partial failure.
func (c *backupController) runBackup(backup *pkgbackup.Request) error {
log := c.logger.WithField("backup", kubeutil.NamespaceAndName(backup))
log.Info("Starting backup")
c.logger.WithField("backup", kubeutil.NamespaceAndName(backup)).Info("Setting up backup log")

logFile, err := ioutil.TempFile("", "")
if err != nil {

@@ -429,25 +444,31 @@ func (c *backupController) runBackup(backup *pkgbackup.Request) error {
// backup log failed for whatever reason.
logger := logging.DefaultLogger(c.backupLogLevel)
logger.Out = io.MultiWriter(os.Stdout, gzippedLogFile)
log = logger.WithField("backup", kubeutil.NamespaceAndName(backup))

log.Info("Starting backup")
logCounter := logging.NewLogCounterHook()
logger.Hooks.Add(logCounter)

backupLog := logger.WithField("backup", kubeutil.NamespaceAndName(backup))

backupLog.Info("Setting up backup temp file")
backupFile, err := ioutil.TempFile("", "")
if err != nil {
return errors.Wrap(err, "error creating temp file for backup")
}
defer closeAndRemoveFile(backupFile, log)
defer closeAndRemoveFile(backupFile, backupLog)

pluginManager := c.newPluginManager(log)
backupLog.Info("Setting up plugin manager")
pluginManager := c.newPluginManager(backupLog)
defer pluginManager.CleanupClients()

backupLog.Info("Getting backup item actions")
actions, err := pluginManager.GetBackupItemActions()
if err != nil {
return err
}

backupStore, err := c.newBackupStore(backup.StorageLocation, pluginManager, log)
backupLog.Info("Setting up backup store")
backupStore, err := c.newBackupStore(backup.StorageLocation, pluginManager, backupLog)
if err != nil {
return err
}

@@ -462,17 +483,9 @@ func (c *backupController) runBackup(backup *pkgbackup.Request) error {
return errors.Errorf("backup already exists in object storage")
}

// Do the actual backup
var errs []error
if err := c.backupper.Backup(log, backup, backupFile, actions, pluginManager); err != nil {
errs = append(errs, err)
backup.Status.Phase = velerov1api.BackupPhaseFailed
} else {
backup.Status.Phase = velerov1api.BackupPhaseCompleted
}

if err := gzippedLogFile.Close(); err != nil {
c.logger.WithError(err).Error("error closing gzippedLogFile")
var fatalErrs []error
if err := c.backupper.Backup(backupLog, backup, backupFile, actions, pluginManager); err != nil {
fatalErrs = append(fatalErrs, err)
}

// Mark completion timestamp before serializing and uploading.

@@ -486,21 +499,45 @@ func (c *backupController) runBackup(backup *pkgbackup.Request) error {
}
}

errs = append(errs, persistBackup(backup, backupFile, logFile, backupStore, c.logger)...)
errs = append(errs, recordBackupMetrics(backup.Backup, backupFile, c.metrics))
recordBackupMetrics(backupLog, backup.Backup, backupFile, c.metrics)

log.Info("Backup completed")

return kerrors.NewAggregate(errs)
if err := gzippedLogFile.Close(); err != nil {
c.logger.WithError(err).Error("error closing gzippedLogFile")
}

func recordBackupMetrics(backup *velerov1api.Backup, backupFile *os.File, serverMetrics *metrics.ServerMetrics) error {
backup.Status.Warnings = logCounter.GetCount(logrus.WarnLevel)
backup.Status.Errors = logCounter.GetCount(logrus.ErrorLevel)

// Assign finalize phase as close to end as possible so that any errors
// logged to backupLog are captured. This is done before uploading the
// artifacts to object storage so that the JSON representation of the
// backup in object storage has the terminal phase set.
switch {
case len(fatalErrs) > 0:
backup.Status.Phase = velerov1api.BackupPhaseFailed
case logCounter.GetCount(logrus.ErrorLevel) > 0:
backup.Status.Phase = velerov1api.BackupPhasePartiallyFailed
default:
backup.Status.Phase = velerov1api.BackupPhaseCompleted
}

if errs := persistBackup(backup, backupFile, logFile, backupStore, c.logger); len(errs) > 0 {
fatalErrs = append(fatalErrs, errs...)
}

c.logger.Info("Backup completed")

// if we return a non-nil error, the calling function will update
// the backup's phase to Failed.
return kerrors.NewAggregate(fatalErrs)
}

func recordBackupMetrics(log logrus.FieldLogger, backup *velerov1api.Backup, backupFile *os.File, serverMetrics *metrics.ServerMetrics) {
backupScheduleName := backup.GetLabels()[velerov1api.ScheduleNameLabel]

var backupSizeBytes int64
var err error
if backupFileStat, err := backupFile.Stat(); err != nil {
err = errors.Wrap(err, "error getting file info")
log.WithError(errors.WithStack(err)).Error("Error getting backup file info")
} else {
backupSizeBytes = backupFileStat.Size()
}

@@ -512,8 +549,6 @@ func recordBackupMetrics(backup *velerov1api.Backup, backupFile *os.File, server
serverMetrics.RegisterVolumeSnapshotAttempts(backupScheduleName, backup.Status.VolumeSnapshotsAttempted)
serverMetrics.RegisterVolumeSnapshotSuccesses(backupScheduleName, backup.Status.VolumeSnapshotsCompleted)
serverMetrics.RegisterVolumeSnapshotFailures(backupScheduleName, backup.Status.VolumeSnapshotsAttempted-backup.Status.VolumeSnapshotsCompleted)

return err
}

func persistBackup(backup *pkgbackup.Request, backupContents, backupLog *os.File, backupStore persistence.BackupStore, log logrus.FieldLogger) []error {
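
Editor's sketch (not part of this commit) of how the controller changes above fit together: runBackup derives the terminal phase from fatal errors versus errors counted in the backup log, and processBackup downgrades the result to Failed if uploading artifacts to object storage fails afterwards. The finalPhase helper is hypothetical.

    package main

    import "fmt"

    // finalPhase condenses the two-stage decision: fatal errors => Failed,
    // otherwise logged errors => PartiallyFailed, otherwise Completed; an
    // upload failure afterwards forces Failed.
    func finalPhase(fatalErrs []error, loggedErrors int, uploadFailed bool) string {
        phase := "Completed"
        switch {
        case len(fatalErrs) > 0:
            phase = "Failed"
        case loggedErrors > 0:
            phase = "PartiallyFailed"
        }
        if uploadFailed {
            phase = "Failed"
        }
        return phase
    }

    func main() {
        fmt.Println(finalPhase(nil, 0, false)) // Completed
        fmt.Println(finalPhase(nil, 2, false)) // PartiallyFailed
        fmt.Println(finalPhase(nil, 2, true))  // Failed
    }
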
@@ -33,6 +33,7 @@ const (
backupTotal = "backup_total"
backupAttemptTotal = "backup_attempt_total"
backupSuccessTotal = "backup_success_total"
backupPartialFailureTotal = "backup_partial_failure_total"
backupFailureTotal = "backup_failure_total"
backupDurationSeconds = "backup_duration_seconds"
backupDeletionAttemptTotal = "backup_deletion_attempt_total"

@@ -89,6 +90,14 @@ func NewServerMetrics() *ServerMetrics {
},
[]string{scheduleLabel},
),
backupPartialFailureTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,
Name: backupPartialFailureTotal,
Help: "Total number of partially failed backups",
},
[]string{scheduleLabel},
),
backupFailureTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: metricNamespace,

@@ -230,6 +239,9 @@ func (m *ServerMetrics) InitSchedule(scheduleName string) {
if c, ok := m.metrics[backupSuccessTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
if c, ok := m.metrics[backupPartialFailureTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}
if c, ok := m.metrics[backupFailureTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(scheduleName).Set(0)
}

@@ -296,6 +308,13 @@ func (m *ServerMetrics) RegisterBackupSuccess(backupSchedule string) {
}
}

// RegisterBackupPartialFailure records a partially failed backup.
func (m *ServerMetrics) RegisterBackupPartialFailure(backupSchedule string) {
if c, ok := m.metrics[backupPartialFailureTotal].(*prometheus.CounterVec); ok {
c.WithLabelValues(backupSchedule).Inc()
}
}

// RegisterBackupFailed records a failed backup.
func (m *ServerMetrics) RegisterBackupFailed(backupSchedule string) {
if c, ok := m.metrics[backupFailureTotal].(*prometheus.CounterVec); ok {
@@ -0,0 +1,61 @@
/*
Copyright 2019 the Velero contributors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package logging

import (
"sync"

"github.com/sirupsen/logrus"
)

// LogCounterHook is a logrus hook that counts the number of log
// statements that have been written at each logrus level.
type LogCounterHook struct {
mu sync.RWMutex
counts map[logrus.Level]int
}

// NewLogCounterHook returns a pointer to an initialized LogCounterHook.
func NewLogCounterHook() *LogCounterHook {
return &LogCounterHook{
counts: make(map[logrus.Level]int),
}
}

// Levels returns the logrus levels that the hook should be fired for.
func (h *LogCounterHook) Levels() []logrus.Level {
return logrus.AllLevels
}

// Fire executes the hook's logic.
func (h *LogCounterHook) Fire(entry *logrus.Entry) error {
h.mu.Lock()
defer h.mu.Unlock()

h.counts[entry.Level]++

return nil
}

// GetCount returns the number of log statements that have been
// written at the specific level provided.
func (h *LogCounterHook) GetCount(level logrus.Level) int {
h.mu.RLock()
defer h.mu.RUnlock()

return h.counts[level]
}
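
Usage sketch (an editor's addition, not part of this commit) for the new LogCounterHook: attach it to a logrus logger and read the per-level counts afterwards, which is how runBackup fills Status.Warnings and Status.Errors. It assumes the import path github.com/heptio/velero/pkg/util/logging used elsewhere in this diff.

    package main

    import (
        "fmt"
        "io/ioutil"

        "github.com/heptio/velero/pkg/util/logging"
        "github.com/sirupsen/logrus"
    )

    func main() {
        logger := logrus.New()
        logger.Out = ioutil.Discard // keep the example quiet; counts still accumulate

        counter := logging.NewLogCounterHook()
        logger.Hooks.Add(counter)

        logger.Info("backing up item")
        logger.Warn("volume has no snapshot location")
        logger.Error("error backing up item")
        logger.Error("error executing custom action")

        fmt.Println("warnings:", counter.GetCount(logrus.WarnLevel)) // 1
        fmt.Println("errors:", counter.GetCount(logrus.ErrorLevel))  // 2
    }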