enhance uninstall and backup deletion for restore finalizer
Signed-off-by: allenxu404 <qix2@vmware.com>pull/6591/head
parent
3e631ca466
commit
846f0de178
|
@ -25,23 +25,31 @@ import (
|
|||
"github.com/spf13/cobra"
|
||||
"github.com/spf13/pflag"
|
||||
|
||||
appsv1api "k8s.io/api/apps/v1"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
apiextv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
|
||||
apiextv1beta1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1beta1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
"k8s.io/apimachinery/pkg/api/meta"
|
||||
"k8s.io/apimachinery/pkg/labels"
|
||||
kubeerrs "k8s.io/apimachinery/pkg/util/errors"
|
||||
"k8s.io/apimachinery/pkg/util/wait"
|
||||
kbclient "sigs.k8s.io/controller-runtime/pkg/client"
|
||||
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
|
||||
|
||||
velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
|
||||
"github.com/vmware-tanzu/velero/pkg/client"
|
||||
"github.com/vmware-tanzu/velero/pkg/cmd"
|
||||
"github.com/vmware-tanzu/velero/pkg/cmd/cli"
|
||||
"github.com/vmware-tanzu/velero/pkg/controller"
|
||||
"github.com/vmware-tanzu/velero/pkg/install"
|
||||
kubeutil "github.com/vmware-tanzu/velero/pkg/util/kube"
|
||||
)
|
||||
|
||||
var gracefulDeletionMaximumDuration = 1 * time.Minute
|
||||
|
||||
// uninstallOptions collects all the options for uninstalling Velero from a Kubernetes cluster.
|
||||
type uninstallOptions struct {
|
||||
wait bool // deprecated
|
||||
|
@ -117,7 +125,6 @@ func Run(ctx context.Context, kbClient kbclient.Client, namespace string) error
|
|||
}
|
||||
|
||||
// CRDs
|
||||
|
||||
veleroLabelSelector := labels.SelectorFromSet(install.Labels())
|
||||
opts := []kbclient.DeleteAllOfOption{
|
||||
kbclient.InNamespace(namespace),
|
||||
|
@ -160,8 +167,7 @@ func Run(ctx context.Context, kbClient kbclient.Client, namespace string) error
|
|||
}
|
||||
|
||||
func deleteNamespace(ctx context.Context, kbClient kbclient.Client, namespace string) error {
|
||||
// delete resources with finalizer attached first to ensure finalizer can be handled by corresponding controller and resources can be deleted successfully before controller's pod is deleted.
|
||||
// otherwise the process of deleting namespace will get stuck in deleting those resources forever.
|
||||
// Deal with resources with attached finalizers to ensure proper handling of those finalizers.
|
||||
if err := deleteResourcesWithFinalizer(ctx, kbClient, namespace); err != nil {
|
||||
return errors.Wrap(err, "Fail to remove finalizer from restores")
|
||||
}
|
||||
|
@ -211,58 +217,136 @@ func deleteNamespace(ctx context.Context, kbClient kbclient.Client, namespace st
|
|||
return nil
|
||||
}
|
||||
|
||||
// A few things needed to be noticed here:
|
||||
// 1. When we delete resources with attached finalizers, the corresponding controller will deal with the finalizer then resources can be deleted successfully.
|
||||
// So it is important to delete these resources before deleting the pod that runs that controller.
|
||||
// 2. The controller may encounter errors while handling the finalizer, in such case, the controller will keep trying until it succeeds.
|
||||
// So it is important to set a timeout, once the process exceed the timeout, we will forcedly delete the resources by removing the finalizer from them,
|
||||
// otherwise the deletion process may get stuck indefinitely.
|
||||
// 3. There is only restore finalizer supported as of v1.12. If any new finalizers are added in the future, the corresponding deletion logic can be
|
||||
// incorporated into this function.
|
||||
func deleteResourcesWithFinalizer(ctx context.Context, kbClient kbclient.Client, namespace string) error {
|
||||
//check if restore crd exists
|
||||
fmt.Println("Waiting for resource with attached finalizer to be deleted")
|
||||
return deleteRestore(ctx, kbClient, namespace)
|
||||
}
|
||||
|
||||
func deleteRestore(ctx context.Context, kbClient kbclient.Client, namespace string) error {
|
||||
// Check if restore crd exists, if it does not exist, return immediately.
|
||||
var err error
|
||||
v1crd := &apiextv1.CustomResourceDefinition{}
|
||||
key := kbclient.ObjectKey{Name: "restores.velero.io"}
|
||||
if err := kbClient.Get(ctx, key, v1crd); err != nil {
|
||||
if err = kbClient.Get(ctx, key, v1crd); err != nil {
|
||||
if apierrors.IsNotFound(err) {
|
||||
return nil
|
||||
} else {
|
||||
return err
|
||||
return errors.Wrap(err, "Error getting restore crd")
|
||||
}
|
||||
}
|
||||
|
||||
// delete all the restores
|
||||
restoreList := &velerov1api.RestoreList{}
|
||||
if err := kbClient.List(ctx, restoreList, &kbclient.ListOptions{Namespace: namespace}); err != nil {
|
||||
return err
|
||||
// First attempt to gracefully delete all the restore within a specified time frame, If the process exceeds the timeout limit,
|
||||
// it is likely that there may be errors during the finalization of restores. In such cases, we should proceed with forcefully deleting the restores.
|
||||
err = gracefullyDeleteRestore(ctx, kbClient, namespace)
|
||||
if err != nil && err != wait.ErrWaitTimeout {
|
||||
return errors.Wrap(err, "Error deleting restores")
|
||||
}
|
||||
|
||||
for i := range restoreList.Items {
|
||||
if err := kbClient.Delete(ctx, &restoreList.Items[i]); err != nil {
|
||||
if apierrors.IsNotFound(err) {
|
||||
continue
|
||||
}
|
||||
return err
|
||||
if err == wait.ErrWaitTimeout {
|
||||
err = forcedlyDeleteRestore(ctx, kbClient, namespace)
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "Error deleting restores")
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func gracefullyDeleteRestore(ctx context.Context, kbClient kbclient.Client, namespace string) error {
|
||||
var err error
|
||||
restoreList := &velerov1api.RestoreList{}
|
||||
if err = kbClient.List(ctx, restoreList, &kbclient.ListOptions{Namespace: namespace}); err != nil {
|
||||
return errors.Wrap(err, "Error getting restores during graceful deletion")
|
||||
}
|
||||
|
||||
for i := range restoreList.Items {
|
||||
if err = kbClient.Delete(ctx, &restoreList.Items[i]); err != nil {
|
||||
if apierrors.IsNotFound(err) {
|
||||
continue
|
||||
}
|
||||
return errors.Wrap(err, "Error deleting restores during graceful deletion")
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for the deletion of all the restores within a specified time frame
|
||||
err = wait.PollImmediate(time.Second, gracefulDeletionMaximumDuration, func() (bool, error) {
|
||||
restoreList := &velerov1api.RestoreList{}
|
||||
if errList := kbClient.List(ctx, restoreList, &kbclient.ListOptions{Namespace: namespace}); errList != nil {
|
||||
return false, errList
|
||||
}
|
||||
|
||||
if len(restoreList.Items) > 0 {
|
||||
fmt.Print(".")
|
||||
return false, nil
|
||||
} else {
|
||||
return true, nil
|
||||
}
|
||||
})
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
func forcedlyDeleteRestore(ctx context.Context, kbClient kbclient.Client, namespace string) error {
|
||||
// Delete velero deployment first in case:
|
||||
// 1. finalizers will be added back by restore controller after they are removed at next step;
|
||||
// 2. new restores attached with finalizer will be created by restore controller after we remove all the restores' finalizer at next step;
|
||||
deploy := &appsv1api.Deployment{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Namespace: "velero",
|
||||
Name: namespace,
|
||||
},
|
||||
}
|
||||
|
||||
err := kbClient.Delete(ctx, deploy)
|
||||
if err != nil && !apierrors.IsNotFound(err) {
|
||||
return errors.Wrap(err, "Error deleting velero deployment during force deletion")
|
||||
}
|
||||
|
||||
ctxc, cancel := context.WithCancel(ctx)
|
||||
defer cancel()
|
||||
|
||||
checkFunc := func() {
|
||||
deploy := &appsv1api.Deployment{}
|
||||
key := kbclient.ObjectKey{Namespace: namespace, Name: "velero"}
|
||||
|
||||
if err = kbClient.Get(ctxc, key, deploy); err != nil {
|
||||
if apierrors.IsNotFound(err) {
|
||||
err = nil
|
||||
}
|
||||
cancel()
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// Wait until velero deployment are deleted.
|
||||
wait.Until(checkFunc, 100*time.Millisecond, ctxc.Done())
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "Error deleting velero deployment during force deletion")
|
||||
}
|
||||
|
||||
// Remove all the restores' finalizer so they can be deleted during the deletion of velero namespace.
|
||||
restoreList := &velerov1api.RestoreList{}
|
||||
if err := kbClient.List(ctx, restoreList, &kbclient.ListOptions{Namespace: namespace}); err != nil {
|
||||
return errors.Wrap(err, "Error getting restores during force deletion")
|
||||
}
|
||||
|
||||
for i := range restoreList.Items {
|
||||
if controllerutil.ContainsFinalizer(&restoreList.Items[i], controller.ExternalResourcesFinalizer) {
|
||||
update := &restoreList.Items[i]
|
||||
original := update.DeepCopy()
|
||||
controllerutil.RemoveFinalizer(update, controller.ExternalResourcesFinalizer)
|
||||
if err := kubeutil.PatchResource(original, update, kbClient); err != nil {
|
||||
return errors.Wrap(err, "Error removing restore finalizer during force deletion")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Println("Waiting for resource with finalizer attached to be deleted")
|
||||
ctx, cancel := context.WithCancel(ctx)
|
||||
defer cancel()
|
||||
|
||||
var err error
|
||||
checkFunc := func() {
|
||||
restoreList := &velerov1api.RestoreList{}
|
||||
if err = kbClient.List(ctx, restoreList, &kbclient.ListOptions{Namespace: namespace}); err != nil {
|
||||
cancel()
|
||||
return
|
||||
}
|
||||
|
||||
if len(restoreList.Items) > 0 {
|
||||
fmt.Print(".")
|
||||
} else {
|
||||
cancel()
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// wait until all the restores are deleted
|
||||
wait.Until(checkFunc, 100*time.Millisecond, ctx.Done())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -34,7 +34,9 @@ import (
|
|||
"k8s.io/apimachinery/pkg/labels"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
kubeerrs "k8s.io/apimachinery/pkg/util/errors"
|
||||
"k8s.io/apimachinery/pkg/util/wait"
|
||||
"k8s.io/utils/clock"
|
||||
|
||||
ctrl "sigs.k8s.io/controller-runtime"
|
||||
|
||||
"github.com/vmware-tanzu/velero/internal/credentials"
|
||||
|
@ -363,24 +365,45 @@ func (r *backupDeletionReconciler) Reconcile(ctx context.Context, req ctrl.Reque
|
|||
}); err != nil {
|
||||
log.WithError(errors.WithStack(err)).Error("Error listing restore API objects")
|
||||
} else {
|
||||
// Restore files in object storage will be handled by restore finalizer, so we simply need to initiate a delete request on restores here.
|
||||
for i, restore := range restoreList.Items {
|
||||
if restore.Spec.BackupName != backup.Name {
|
||||
continue
|
||||
}
|
||||
restoreLog := log.WithField("restore", kube.NamespaceAndName(&restoreList.Items[i]))
|
||||
|
||||
restoreLog.Info("Deleting restore log/results from backup storage")
|
||||
if err := backupStore.DeleteRestore(restore.Name); err != nil {
|
||||
errs = append(errs, err.Error())
|
||||
// if we couldn't delete the restore files, don't delete the API object
|
||||
continue
|
||||
}
|
||||
|
||||
restoreLog.Info("Deleting restore referencing backup")
|
||||
if err := r.Delete(ctx, &restoreList.Items[i]); err != nil {
|
||||
errs = append(errs, errors.Wrapf(err, "error deleting restore %s", kube.NamespaceAndName(&restoreList.Items[i])).Error())
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for the deletion of restores within certain amount of time.
|
||||
// Notice that there could be potential errors during the finalization process, which may result in the failure to delete the restore.
|
||||
// Therefore, it is advisable to set a timeout period for waiting.
|
||||
err := wait.PollImmediate(time.Second, time.Minute, func() (bool, error) {
|
||||
restoreList := &velerov1api.RestoreList{}
|
||||
if err := r.List(ctx, restoreList, &client.ListOptions{Namespace: backup.Namespace, LabelSelector: selector}); err != nil {
|
||||
return false, err
|
||||
}
|
||||
cnt := 0
|
||||
for _, restore := range restoreList.Items {
|
||||
if restore.Spec.BackupName != backup.Name {
|
||||
continue
|
||||
}
|
||||
cnt++
|
||||
}
|
||||
|
||||
if cnt > 0 {
|
||||
return false, nil
|
||||
} else {
|
||||
return true, nil
|
||||
}
|
||||
})
|
||||
if err != nil {
|
||||
log.WithError(err).Error("Error polling for deletion of restores")
|
||||
errs = append(errs, errors.Wrapf(err, "error deleting restore %s", err).Error())
|
||||
}
|
||||
}
|
||||
|
||||
if len(errs) == 0 {
|
||||
|
|
|
@ -322,8 +322,6 @@ func TestBackupDeletionControllerReconcile(t *testing.T) {
|
|||
td.backupStore.On("GetBackupVolumeSnapshots", input.Spec.BackupName).Return(snapshots, nil)
|
||||
td.backupStore.On("GetBackupContents", input.Spec.BackupName).Return(io.NopCloser(bytes.NewReader([]byte("hello world"))), nil)
|
||||
td.backupStore.On("DeleteBackup", input.Spec.BackupName).Return(nil)
|
||||
td.backupStore.On("DeleteRestore", "restore-1").Return(nil)
|
||||
td.backupStore.On("DeleteRestore", "restore-2").Return(nil)
|
||||
|
||||
_, err := td.controller.Reconcile(context.TODO(), td.req)
|
||||
require.NoError(t, err)
|
||||
|
@ -363,8 +361,6 @@ func TestBackupDeletionControllerReconcile(t *testing.T) {
|
|||
assert.Nil(t, err)
|
||||
|
||||
td.backupStore.AssertCalled(t, "DeleteBackup", input.Spec.BackupName)
|
||||
td.backupStore.AssertCalled(t, "DeleteRestore", "restore-1")
|
||||
td.backupStore.AssertCalled(t, "DeleteRestore", "restore-2")
|
||||
|
||||
// Make sure snapshot was deleted
|
||||
assert.Equal(t, 0, td.volumeSnapshotter.SnapshotsTaken.Len())
|
||||
|
|
Loading…
Reference in New Issue