Clean up leaked CSI snapshot for incomplete backup

This commit makes sure when a backup is deleted the controller will
delete the CSI snapshot even when the bakckup tarball is not uploaded.

fixes #8160

Signed-off-by: Daniel Jiang <daniel.jiang@broadcom.com>
pull/8637/head
Daniel Jiang 2025-01-22 15:41:39 +08:00
parent 5b1738abf8
commit 1c372893ec
6 changed files with 51 additions and 6 deletions

View File

@ -13,7 +13,7 @@ jobs:
- name: Codespell
uses: codespell-project/actions-codespell@master
with:
# ignore the config/.../crd.go file as it's generated binary data that is edited elswhere.
# ignore the config/.../crd.go file as it's generated binary data that is edited elsewhere.
skip: .git,*.png,*.jpg,*.woff,*.ttf,*.gif,*.ico,./config/crd/v1beta1/crds/crds.go,./config/crd/v1/crds/crds.go,./config/crd/v2alpha1/crds/crds.go,./go.sum,./LICENSE
ignore_words_list: iam,aks,ist,bridget,ue,shouldnot,atleast,notin,sme,optin
check_filenames: true

View File

@ -333,7 +333,7 @@ linters-settings:
force-case-trailing-whitespace: 0
# Force cuddling of err checks with err var assignment
force-err-cuddling: false
# Allow leading comments to be separated with empty liens
# Allow leading comments to be separated with empty lines
allow-separated-leading-comment: false
linters:

View File

@ -0,0 +1 @@
Clean up leaked CSI snapshot for incomplete backup

View File

@ -22,6 +22,8 @@ import (
"fmt"
"time"
"github.com/vmware-tanzu/velero/pkg/util/csi"
jsonpatch "github.com/evanphx/json-patch/v5"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
@ -35,6 +37,8 @@ import (
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v7/apis/volumesnapshot/v1"
"github.com/vmware-tanzu/velero/internal/credentials"
"github.com/vmware-tanzu/velero/internal/delete"
"github.com/vmware-tanzu/velero/internal/volume"
@ -251,12 +255,18 @@ func (r *backupDeletionReconciler) Reconcile(ctx context.Context, req ctrl.Reque
}
// don't defer CleanupClients here, since it was already called above.
var errs []string
if len(actions) > 0 {
// Download the tarball
backupFile, err := downloadToTempFile(backup.Name, backupStore, log)
if err != nil {
log.WithError(err).Errorf("Unable to download tarball for backup %s, skipping associated DeleteItemAction plugins", backup.Name)
log.Info("Cleaning up CSI volumesnapshots")
if err := r.deleteCSIVolumeSnapshots(ctx, backup, log); err != nil {
errs = append(errs, err.Error())
}
} else {
defer closeAndRemoveFile(backupFile, r.logger)
deleteCtx := &delete.Context{
@ -279,8 +289,6 @@ func (r *backupDeletionReconciler) Reconcile(ctx context.Context, req ctrl.Reque
}
}
var errs []string
if backupStore != nil {
log.Info("Removing PV snapshots")
@ -495,6 +503,24 @@ func (r *backupDeletionReconciler) deleteExistingDeletionRequests(ctx context.Co
return errs
}
// deleteCSIVolumeSnapshots clean up the CSI snapshots created by the backup, this should be called when the backup is failed
// when it's running, e.g. due to velero pod restart, and the backup.tar is failed to be downloaded from storage.
func (r *backupDeletionReconciler) deleteCSIVolumeSnapshots(ctx context.Context, backup *velerov1api.Backup, log logrus.FieldLogger) error {
vsList := snapshotv1api.VolumeSnapshotList{}
if err := r.Client.List(ctx, &vsList, &client.ListOptions{
LabelSelector: labels.SelectorFromSet(map[string]string{
velerov1api.BackupNameLabel: label.GetValidName(backup.Name),
}),
}); err != nil {
return errors.Wrap(err, "error listing volume snapshots")
}
for _, item := range vsList.Items {
vs := item
csi.CleanupVolumeSnapshot(&vs, r.Client, log)
}
return nil
}
func (r *backupDeletionReconciler) deletePodVolumeSnapshots(ctx context.Context, backup *velerov1api.Backup) []error {
if r.repoMgr == nil {
return nil

View File

@ -25,6 +25,8 @@ import (
"reflect"
"time"
snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v7/apis/volumesnapshot/v1"
"context"
"github.com/sirupsen/logrus"
@ -609,7 +611,13 @@ func TestBackupDeletionControllerReconcile(t *testing.T) {
Provider: "provider-1",
},
}
td := setupBackupDeletionControllerTest(t, defaultTestDbr(), backup, location, snapshotLocation)
csiSnapshot := builder.ForVolumeSnapshot("user-ns", "vs-1").ObjectMeta(
builder.WithLabelsMap(map[string]string{
"velero.io/backup-name": "foo",
})).SourcePVC("some-pvc").Result()
td := setupBackupDeletionControllerTest(t, defaultTestDbr(), backup, location, snapshotLocation, csiSnapshot)
td.volumeSnapshotter.SnapshotsTaken.Insert("snap-1")
snapshots := []*volume.Snapshot{
@ -654,6 +662,13 @@ func TestBackupDeletionControllerReconcile(t *testing.T) {
}, &velerov1api.Backup{})
assert.True(t, apierrors.IsNotFound(err), "Expected not found error, but actual value of error: %v", err)
// leaked CSI snapshot should be deleted
err = td.fakeClient.Get(context.TODO(), types.NamespacedName{
Namespace: "user-ns",
Name: "vs-1",
}, &snapshotv1api.VolumeSnapshot{})
assert.True(t, apierrors.IsNotFound(err), "Expected not found error for the leaked CSI snapshot, but actual value of error: %v", err)
// Make sure snapshot was deleted
assert.Equal(t, 0, td.volumeSnapshotter.SnapshotsTaken.Len())
})

View File

@ -490,6 +490,8 @@ func SetVolumeSnapshotContentDeletionPolicy(
return crClient.Patch(context.TODO(), vsc, crclient.MergeFrom(originVSC))
}
// CleanupVolumeSnapshot deletes the VolumeSnapshot and the associated VolumeSnapshotContent. It will make sure the
// physical snapshot is also deleted.
func CleanupVolumeSnapshot(
volSnap *snapshotv1api.VolumeSnapshot,
crClient crclient.Client,
@ -528,7 +530,8 @@ func CleanupVolumeSnapshot(
}
}
// DeleteVolumeSnapshot handles the VolumeSnapshot instance deletion.
// DeleteVolumeSnapshot handles the VolumeSnapshot instance deletion. It will make sure the VolumeSnapshotContent is
// recreated so that the physical snapshot is retained.
func DeleteVolumeSnapshot(
vs snapshotv1api.VolumeSnapshot,
vsc snapshotv1api.VolumeSnapshotContent,