From 1c372893ecafda6dad6b9229ae68a2e547d02cc9 Mon Sep 17 00:00:00 2001 From: Daniel Jiang Date: Wed, 22 Jan 2025 15:41:39 +0800 Subject: [PATCH] Clean up leaked CSI snapshot for incomplete backup This commit makes sure when a backup is deleted the controller will delete the CSI snapshot even when the bakckup tarball is not uploaded. fixes #8160 Signed-off-by: Daniel Jiang --- .github/workflows/pr-codespell.yml | 2 +- .golangci.yaml | 2 +- changelogs/unreleased/8637-raesonerjt | 1 + pkg/controller/backup_deletion_controller.go | 30 +++++++++++++++++-- .../backup_deletion_controller_test.go | 17 ++++++++++- pkg/util/csi/volume_snapshot.go | 5 +++- 6 files changed, 51 insertions(+), 6 deletions(-) create mode 100644 changelogs/unreleased/8637-raesonerjt diff --git a/.github/workflows/pr-codespell.yml b/.github/workflows/pr-codespell.yml index 0d3138e40..900371ca2 100644 --- a/.github/workflows/pr-codespell.yml +++ b/.github/workflows/pr-codespell.yml @@ -13,7 +13,7 @@ jobs: - name: Codespell uses: codespell-project/actions-codespell@master with: - # ignore the config/.../crd.go file as it's generated binary data that is edited elswhere. + # ignore the config/.../crd.go file as it's generated binary data that is edited elsewhere. skip: .git,*.png,*.jpg,*.woff,*.ttf,*.gif,*.ico,./config/crd/v1beta1/crds/crds.go,./config/crd/v1/crds/crds.go,./config/crd/v2alpha1/crds/crds.go,./go.sum,./LICENSE ignore_words_list: iam,aks,ist,bridget,ue,shouldnot,atleast,notin,sme,optin check_filenames: true diff --git a/.golangci.yaml b/.golangci.yaml index 8d2b6c75d..3d43c0d2d 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -333,7 +333,7 @@ linters-settings: force-case-trailing-whitespace: 0 # Force cuddling of err checks with err var assignment force-err-cuddling: false - # Allow leading comments to be separated with empty liens + # Allow leading comments to be separated with empty lines allow-separated-leading-comment: false linters: diff --git a/changelogs/unreleased/8637-raesonerjt b/changelogs/unreleased/8637-raesonerjt new file mode 100644 index 000000000..5588fe513 --- /dev/null +++ b/changelogs/unreleased/8637-raesonerjt @@ -0,0 +1 @@ +Clean up leaked CSI snapshot for incomplete backup \ No newline at end of file diff --git a/pkg/controller/backup_deletion_controller.go b/pkg/controller/backup_deletion_controller.go index 9a09e85d3..31874412d 100644 --- a/pkg/controller/backup_deletion_controller.go +++ b/pkg/controller/backup_deletion_controller.go @@ -22,6 +22,8 @@ import ( "fmt" "time" + "github.com/vmware-tanzu/velero/pkg/util/csi" + jsonpatch "github.com/evanphx/json-patch/v5" "github.com/pkg/errors" "github.com/sirupsen/logrus" @@ -35,6 +37,8 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v7/apis/volumesnapshot/v1" + "github.com/vmware-tanzu/velero/internal/credentials" "github.com/vmware-tanzu/velero/internal/delete" "github.com/vmware-tanzu/velero/internal/volume" @@ -251,12 +255,18 @@ func (r *backupDeletionReconciler) Reconcile(ctx context.Context, req ctrl.Reque } // don't defer CleanupClients here, since it was already called above. + var errs []string + if len(actions) > 0 { // Download the tarball backupFile, err := downloadToTempFile(backup.Name, backupStore, log) if err != nil { log.WithError(err).Errorf("Unable to download tarball for backup %s, skipping associated DeleteItemAction plugins", backup.Name) + log.Info("Cleaning up CSI volumesnapshots") + if err := r.deleteCSIVolumeSnapshots(ctx, backup, log); err != nil { + errs = append(errs, err.Error()) + } } else { defer closeAndRemoveFile(backupFile, r.logger) deleteCtx := &delete.Context{ @@ -279,8 +289,6 @@ func (r *backupDeletionReconciler) Reconcile(ctx context.Context, req ctrl.Reque } } - var errs []string - if backupStore != nil { log.Info("Removing PV snapshots") @@ -495,6 +503,24 @@ func (r *backupDeletionReconciler) deleteExistingDeletionRequests(ctx context.Co return errs } +// deleteCSIVolumeSnapshots clean up the CSI snapshots created by the backup, this should be called when the backup is failed +// when it's running, e.g. due to velero pod restart, and the backup.tar is failed to be downloaded from storage. +func (r *backupDeletionReconciler) deleteCSIVolumeSnapshots(ctx context.Context, backup *velerov1api.Backup, log logrus.FieldLogger) error { + vsList := snapshotv1api.VolumeSnapshotList{} + if err := r.Client.List(ctx, &vsList, &client.ListOptions{ + LabelSelector: labels.SelectorFromSet(map[string]string{ + velerov1api.BackupNameLabel: label.GetValidName(backup.Name), + }), + }); err != nil { + return errors.Wrap(err, "error listing volume snapshots") + } + for _, item := range vsList.Items { + vs := item + csi.CleanupVolumeSnapshot(&vs, r.Client, log) + } + return nil +} + func (r *backupDeletionReconciler) deletePodVolumeSnapshots(ctx context.Context, backup *velerov1api.Backup) []error { if r.repoMgr == nil { return nil diff --git a/pkg/controller/backup_deletion_controller_test.go b/pkg/controller/backup_deletion_controller_test.go index 9a85a1348..d99a5c3c5 100644 --- a/pkg/controller/backup_deletion_controller_test.go +++ b/pkg/controller/backup_deletion_controller_test.go @@ -25,6 +25,8 @@ import ( "reflect" "time" + snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v7/apis/volumesnapshot/v1" + "context" "github.com/sirupsen/logrus" @@ -609,7 +611,13 @@ func TestBackupDeletionControllerReconcile(t *testing.T) { Provider: "provider-1", }, } - td := setupBackupDeletionControllerTest(t, defaultTestDbr(), backup, location, snapshotLocation) + + csiSnapshot := builder.ForVolumeSnapshot("user-ns", "vs-1").ObjectMeta( + builder.WithLabelsMap(map[string]string{ + "velero.io/backup-name": "foo", + })).SourcePVC("some-pvc").Result() + + td := setupBackupDeletionControllerTest(t, defaultTestDbr(), backup, location, snapshotLocation, csiSnapshot) td.volumeSnapshotter.SnapshotsTaken.Insert("snap-1") snapshots := []*volume.Snapshot{ @@ -654,6 +662,13 @@ func TestBackupDeletionControllerReconcile(t *testing.T) { }, &velerov1api.Backup{}) assert.True(t, apierrors.IsNotFound(err), "Expected not found error, but actual value of error: %v", err) + // leaked CSI snapshot should be deleted + err = td.fakeClient.Get(context.TODO(), types.NamespacedName{ + Namespace: "user-ns", + Name: "vs-1", + }, &snapshotv1api.VolumeSnapshot{}) + assert.True(t, apierrors.IsNotFound(err), "Expected not found error for the leaked CSI snapshot, but actual value of error: %v", err) + // Make sure snapshot was deleted assert.Equal(t, 0, td.volumeSnapshotter.SnapshotsTaken.Len()) }) diff --git a/pkg/util/csi/volume_snapshot.go b/pkg/util/csi/volume_snapshot.go index 53796ab97..bcb656e3c 100644 --- a/pkg/util/csi/volume_snapshot.go +++ b/pkg/util/csi/volume_snapshot.go @@ -490,6 +490,8 @@ func SetVolumeSnapshotContentDeletionPolicy( return crClient.Patch(context.TODO(), vsc, crclient.MergeFrom(originVSC)) } +// CleanupVolumeSnapshot deletes the VolumeSnapshot and the associated VolumeSnapshotContent. It will make sure the +// physical snapshot is also deleted. func CleanupVolumeSnapshot( volSnap *snapshotv1api.VolumeSnapshot, crClient crclient.Client, @@ -528,7 +530,8 @@ func CleanupVolumeSnapshot( } } -// DeleteVolumeSnapshot handles the VolumeSnapshot instance deletion. +// DeleteVolumeSnapshot handles the VolumeSnapshot instance deletion. It will make sure the VolumeSnapshotContent is +// recreated so that the physical snapshot is retained. func DeleteVolumeSnapshot( vs snapshotv1api.VolumeSnapshot, vsc snapshotv1api.VolumeSnapshotContent,