Merge pull request #5104 from blackpiglet/5048-CSI-snapshot-timeout-configurable

Make CSI snapshot creation timeout configurable for backup and schedule.
pull/5203/head
Daniel Jiang 2022-08-08 19:51:38 +08:00 committed by GitHub
commit 6951875053
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 71 additions and 9 deletions

View File

@ -0,0 +1 @@
Make CSI snapshot creation timeout configurable.

View File

@ -37,6 +37,11 @@ spec:
spec:
description: BackupSpec defines the specification for a Velero backup.
properties:
csiSnapshotTimeout:
description: CSISnapshotTimeout specifies the time used to wait for
CSI VolumeSnapshot status turns to ReadyToUse during creation, before
returning error as timeout. The default value is 10 minute.
type: string
defaultVolumesToRestic:
description: DefaultVolumesToRestic specifies whether restic should
be used to take a backup of all pod volumes by default.

View File

@ -61,6 +61,11 @@ spec:
description: Template is the definition of the Backup to be run on
the provided schedule
properties:
csiSnapshotTimeout:
description: CSISnapshotTimeout specifies the time used to wait
for CSI VolumeSnapshot status turns to ReadyToUse during creation,
before returning error as timeout. The default value is 10 minute.
type: string
defaultVolumesToRestic:
description: DefaultVolumesToRestic specifies whether restic should
be used to take a backup of all pod volumes by default.

File diff suppressed because one or more lines are too long

View File

@ -110,6 +110,12 @@ type BackupSpec struct {
// +optional
// +nullable
OrderedResources map[string]string `json:"orderedResources,omitempty"`
// CSISnapshotTimeout specifies the time used to wait for CSI VolumeSnapshot status turns to
// ReadyToUse during creation, before returning error as timeout.
// The default value is 10 minute.
// +optional
CSISnapshotTimeout metav1.Duration `json:"csiSnapshotTimeout,omitempty"`
}
// BackupHooks contains custom behaviors that should be executed at different phases of the backup.

View File

@ -344,6 +344,7 @@ func (in *BackupSpec) DeepCopyInto(out *BackupSpec) {
(*out)[key] = val
}
}
out.CSISnapshotTimeout = in.CSISnapshotTimeout
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new BackupSpec.

View File

@ -233,3 +233,9 @@ func (b *BackupBuilder) OrderedResources(orders map[string]string) *BackupBuilde
b.object.Spec.OrderedResources = orders
return b
}
// CSISnapshotTimeout sets the Backup's CSISnapshotTimeout
func (b *BackupBuilder) CSISnapshotTimeout(timeout time.Duration) *BackupBuilder {
b.object.Spec.CSISnapshotTimeout.Duration = timeout
return b
}

View File

@ -98,6 +98,7 @@ type CreateOptions struct {
SnapshotLocations []string
FromSchedule string
OrderedResources string
CSISnapshotTimeout time.Duration
client veleroclient.Interface
}
@ -122,6 +123,7 @@ func (o *CreateOptions) BindFlags(flags *pflag.FlagSet) {
flags.StringSliceVar(&o.SnapshotLocations, "volume-snapshot-locations", o.SnapshotLocations, "List of locations (at most one per provider) where volume snapshots should be stored.")
flags.VarP(&o.Selector, "selector", "l", "Only back up resources matching this label selector.")
flags.StringVar(&o.OrderedResources, "ordered-resources", "", "Mapping Kinds to an ordered list of specific resources of that Kind. Resource names are separated by commas and their names are in format 'namespace/resourcename'. For cluster scope resource, simply use resource name. Key-value pairs in the mapping are separated by semi-colon. Example: 'pods=ns1/pod1,ns1/pod2;persistentvolumeclaims=ns1/pvc4,ns1/pvc8'. Optional.")
flags.DurationVar(&o.CSISnapshotTimeout, "csi-snapshot-timeout", o.CSISnapshotTimeout, "How long to wait for CSI snapshot creation before timeout.")
f := flags.VarPF(&o.SnapshotVolumes, "snapshot-volumes", "", "Take snapshots of PersistentVolumes as part of the backup.")
// this allows the user to just specify "--snapshot-volumes" as shorthand for "--snapshot-volumes=true"
// like a normal bool flag
@ -332,7 +334,8 @@ func (o *CreateOptions) BuildBackup(namespace string) (*velerov1api.Backup, erro
LabelSelector(o.Selector.LabelSelector).
TTL(o.TTL).
StorageLocation(o.StorageLocation).
VolumeSnapshotLocations(o.SnapshotLocations...)
VolumeSnapshotLocations(o.SnapshotLocations...).
CSISnapshotTimeout(o.CSISnapshotTimeout)
if len(o.OrderedResources) > 0 {
orders, err := ParseOrderedResources(o.OrderedResources)
if err != nil {

View File

@ -19,6 +19,7 @@ package backup
import (
"context"
"testing"
"time"
"github.com/stretchr/testify/assert"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@ -35,6 +36,7 @@ func TestCreateOptions_BuildBackup(t *testing.T) {
o.Labels.Set("velero.io/test=true")
o.OrderedResources = "pods=p1,p2;persistentvolumeclaims=pvc1,pvc2"
orders, err := ParseOrderedResources(o.OrderedResources)
o.CSISnapshotTimeout = 20 * time.Minute
assert.NoError(t, err)
backup, err := o.BuildBackup(testNamespace)
@ -46,6 +48,7 @@ func TestCreateOptions_BuildBackup(t *testing.T) {
SnapshotVolumes: o.SnapshotVolumes.Value,
IncludeClusterResources: o.IncludeClusterResources.Value,
OrderedResources: orders,
CSISnapshotTimeout: metav1.Duration{Duration: o.CSISnapshotTimeout},
}, backup.Spec)
assert.Equal(t, map[string]string{

View File

@ -145,6 +145,7 @@ func (o *CreateOptions) Run(c *cobra.Command, f client.Factory) error {
VolumeSnapshotLocations: o.BackupOptions.SnapshotLocations,
DefaultVolumesToRestic: o.BackupOptions.DefaultVolumesToRestic.Value,
OrderedResources: orders,
CSISnapshotTimeout: metav1.Duration{Duration: o.BackupOptions.CSISnapshotTimeout},
},
Schedule: o.Schedule,
UseOwnerReferencesInBackup: &o.UseOwnerReferencesInBackup,

View File

@ -103,6 +103,8 @@ const (
// the default TTL for a backup
defaultBackupTTL = 30 * 24 * time.Hour
defaultCSISnapshotTimeout = 10 * time.Minute
// defaultCredentialsDirectory is the path on disk where credential
// files will be written to
defaultCredentialsDirectory = "/tmp/credentials"
@ -112,7 +114,7 @@ type serverConfig struct {
// TODO(2.0) Deprecate defaultBackupLocation
pluginDir, metricsAddress, defaultBackupLocation string
backupSyncPeriod, podVolumeOperationTimeout, resourceTerminatingTimeout time.Duration
defaultBackupTTL, storeValidationFrequency time.Duration
defaultBackupTTL, storeValidationFrequency, defaultCSISnapshotTimeout time.Duration
restoreResourcePriorities []string
defaultVolumeSnapshotLocations map[string]string
restoreOnly bool
@ -143,6 +145,7 @@ func NewCommand(f client.Factory) *cobra.Command {
defaultVolumeSnapshotLocations: make(map[string]string),
backupSyncPeriod: defaultBackupSyncPeriod,
defaultBackupTTL: defaultBackupTTL,
defaultCSISnapshotTimeout: defaultCSISnapshotTimeout,
storeValidationFrequency: defaultStoreValidationFrequency,
podVolumeOperationTimeout: defaultPodVolumeOperationTimeout,
restoreResourcePriorities: defaultRestorePriorities,
@ -651,6 +654,7 @@ func (s *server) runControllers(defaultVolumeSnapshotLocations map[string]string
s.config.defaultBackupLocation,
s.config.defaultVolumesToRestic,
s.config.defaultBackupTTL,
s.config.defaultCSISnapshotTimeout,
s.sharedInformerFactory.Velero().V1().VolumeSnapshotLocations().Lister(),
defaultVolumeSnapshotLocations,
s.metrics,

View File

@ -88,6 +88,7 @@ type backupController struct {
defaultBackupLocation string
defaultVolumesToRestic bool
defaultBackupTTL time.Duration
defaultCSISnapshotTimeout time.Duration
snapshotLocationLister velerov1listers.VolumeSnapshotLocationLister
defaultSnapshotLocations map[string]string
metrics *metrics.ServerMetrics
@ -112,6 +113,7 @@ func NewBackupController(
defaultBackupLocation string,
defaultVolumesToRestic bool,
defaultBackupTTL time.Duration,
defaultCSISnapshotTimeout time.Duration,
volumeSnapshotLocationLister velerov1listers.VolumeSnapshotLocationLister,
defaultSnapshotLocations map[string]string,
metrics *metrics.ServerMetrics,
@ -136,6 +138,7 @@ func NewBackupController(
defaultBackupLocation: defaultBackupLocation,
defaultVolumesToRestic: defaultVolumesToRestic,
defaultBackupTTL: defaultBackupTTL,
defaultCSISnapshotTimeout: defaultCSISnapshotTimeout,
snapshotLocationLister: volumeSnapshotLocationLister,
defaultSnapshotLocations: defaultSnapshotLocations,
metrics: metrics,
@ -359,6 +362,11 @@ func (c *backupController) prepareBackupRequest(backup *velerov1api.Backup) *pkg
request.Spec.TTL.Duration = c.defaultBackupTTL
}
if request.Spec.CSISnapshotTimeout.Duration == 0 {
// set default CSI VolumeSnapshot timeout
request.Spec.CSISnapshotTimeout.Duration = c.defaultCSISnapshotTimeout
}
// calculate expiration
request.Status.Expiration = &metav1.Time{Time: c.clock.Now().Add(request.Spec.TTL.Duration)}
@ -649,7 +657,7 @@ func (c *backupController) runBackup(backup *pkgbackup.Request) error {
backupLog.Error(err)
}
err = c.checkVolumeSnapshotReadyToUse(context.Background(), volumeSnapshots)
err = c.checkVolumeSnapshotReadyToUse(context.Background(), volumeSnapshots, backup.Spec.CSISnapshotTimeout.Duration)
if err != nil {
backupLog.Errorf("fail to wait VolumeSnapshot change to Ready: %s", err.Error())
}
@ -890,9 +898,10 @@ func encodeToJSONGzip(data interface{}, desc string) (*bytes.Buffer, []error) {
// using goroutine here instead of waiting in CSI plugin, because it's not easy to make BackupItemAction
// parallel by now. After BackupItemAction parallel is implemented, this logic should be moved to CSI plugin
// as https://github.com/vmware-tanzu/velero-plugin-for-csi/pull/100
func (c *backupController) checkVolumeSnapshotReadyToUse(ctx context.Context, volumesnapshots []*snapshotv1api.VolumeSnapshot) error {
func (c *backupController) checkVolumeSnapshotReadyToUse(ctx context.Context, volumesnapshots []*snapshotv1api.VolumeSnapshot,
csiSnapshotTimeout time.Duration) error {
eg, _ := errgroup.WithContext(ctx)
timeout := 10 * time.Minute
timeout := csiSnapshotTimeout
interval := 5 * time.Second
for _, vs := range volumesnapshots {

View File

@ -29,6 +29,10 @@ metadata:
namespace: velero
# Parameters about the backup. Required.
spec:
# CSISnapshotTimeout specifies the time used to wait for
# CSI VolumeSnapshot status turns to ReadyToUse during creation, before
# returning error as timeout. The default value is 10 minute.
csiSnapshotTimeout: 10m
# Array of namespaces to include in the backup. If unspecified, all namespaces are included.
# Optional.
includedNamespaces:

View File

@ -34,6 +34,10 @@ spec:
schedule: 0 7 * * *
# Template is the spec that should be used for each backup triggered by this schedule.
template:
# CSISnapshotTimeout specifies the time used to wait for
# CSI VolumeSnapshot status turns to ReadyToUse during creation, before
# returning error as timeout. The default value is 10 minute.
csiSnapshotTimeout: 10m
# Array of namespaces to include in the scheduled backup. If unspecified, all namespaces are included.
# Optional.
includedNamespaces:

View File

@ -47,10 +47,10 @@ cd velero
kubectl apply -f examples/nginx-app/with-pv.yaml
```
1. Create a backup with PV snapshotting:
1. Create a backup with PV snapshotting. `--csi-snapshot-timeout` is used to setup time to wait before CSI snapshot creation timeout. The default value is 10 minutes:
```bash
velero backup create nginx-backup --include-namespaces nginx-example
velero backup create nginx-backup --include-namespaces nginx-example --csi-snapshot-timeout=20m
```
1. Simulate a disaster:

View File

@ -29,6 +29,11 @@ metadata:
namespace: velero
# Parameters about the backup. Required.
spec:
# Available since v1.9.1.
# CSISnapshotTimeout specifies the time used to wait for
# CSI VolumeSnapshot status turns to ReadyToUse during creation, before
# returning error as timeout. The default value is 10 minute.
csiSnapshotTimeout: 10m
# Array of namespaces to include in the backup. If unspecified, all namespaces are included.
# Optional.
includedNamespaces:

View File

@ -34,6 +34,11 @@ spec:
schedule: 0 7 * * *
# Template is the spec that should be used for each backup triggered by this schedule.
template:
# Available since v1.9.1.
# CSISnapshotTimeout specifies the time used to wait for
# CSI VolumeSnapshot status turns to ReadyToUse during creation, before
# returning error as timeout. The default value is 10 minute.
csiSnapshotTimeout: 10m
# Array of namespaces to include in the scheduled backup. If unspecified, all namespaces are included.
# Optional.
includedNamespaces: