Merge pull request #7022 from allenxu404/i6721

Fix inconsistent behavior of Backup and Restore hook execution
pull/6553/head
Anshul Ahuja 2023-11-06 14:01:30 +05:30 committed by GitHub
commit 6b7ce6655d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 31 additions and 26 deletions

View File

@ -0,0 +1 @@
Fix inconsistent behavior of Backup and Restore hook execution

View File

@ -50,6 +50,11 @@ type DefaultListWatchFactory struct {
PodsGetter cache.Getter
}
// HookErrInfo associates a hook execution error with the namespace of the
// pod the hook ran in, so that callers (e.g. the restore error collector)
// can attribute the error to a specific namespace instead of a flat list.
type HookErrInfo struct {
// Namespace is the namespace of the pod whose hook produced Err.
Namespace string
// Err is the error returned by the hook execution.
Err error
}
// NewListWatch returns a ListerWatcher that lists and watches pods in the
// given namespace, restricted by the supplied field selector, using the
// factory's configured pods getter.
func (f *DefaultListWatchFactory) NewListWatch(namespace string, selector fields.Selector) cache.ListerWatcher {
	listWatch := cache.NewListWatchFromClient(f.PodsGetter, "pods", namespace, selector)
	return listWatch
}
@ -158,8 +163,8 @@ func (e *DefaultWaitExecHookHandler) HandleHooks(
if hook.Hook.WaitTimeout.Duration != 0 && time.Since(waitStart) > hook.Hook.WaitTimeout.Duration {
err := fmt.Errorf("hook %s in container %s expired before executing", hook.HookName, hook.Hook.Container)
hookLog.Error(err)
errors = append(errors, err)
if hook.Hook.OnError == velerov1api.HookErrorModeFail {
errors = append(errors, err)
cancel()
return
}
@ -172,8 +177,9 @@ func (e *DefaultWaitExecHookHandler) HandleHooks(
}
if err := e.PodCommandExecutor.ExecutePodCommand(hookLog, podMap, pod.Namespace, pod.Name, hook.HookName, eh); err != nil {
hookLog.WithError(err).Error("Error executing hook")
err = fmt.Errorf("hook %s in container %s failed to execute, err: %v", hook.HookName, hook.Hook.Container, err)
errors = append(errors, err)
if hook.Hook.OnError == velerov1api.HookErrorModeFail {
errors = append(errors, err)
cancel()
return
}
@ -204,10 +210,9 @@ func (e *DefaultWaitExecHookHandler) HandleHooks(
podWatcher.Run(ctx.Done())
// There are some cases where this function could return with unexecuted hooks: the pod may
// be deleted, a hook with OnError mode Fail could fail, or it may timeout waiting for
// be deleted, a hook could fail, or it may timeout waiting for
// containers to become ready.
// Each unexecuted hook is logged as an error but only hooks with OnError mode Fail return
// an error from this function.
// Each unexecuted hook is logged as an error and this error will be returned from this function.
for _, hooks := range byContainer {
for _, hook := range hooks {
if hook.executed {
@ -222,9 +227,7 @@ func (e *DefaultWaitExecHookHandler) HandleHooks(
},
)
hookLog.Error(err)
if hook.Hook.OnError == velerov1api.HookErrorModeFail {
errors = append(errors, err)
}
errors = append(errors, err)
}
}

View File

@ -209,10 +209,10 @@ func TestWaitExecHandleHooks(t *testing.T) {
Result(),
},
},
expectedErrors: []error{errors.New("pod hook error")},
expectedErrors: []error{errors.New("hook <from-annotation> in container container1 failed to execute, err: pod hook error")},
},
{
name: "should return no error when hook from annotation fails with on error mode continue",
name: "should return error when hook from annotation fails with on error mode continue",
initialPod: builder.ForPod("default", "my-pod").
ObjectMeta(builder.WithAnnotations(
podRestoreHookCommandAnnotationKey, "/usr/bin/foo",
@ -278,7 +278,7 @@ func TestWaitExecHandleHooks(t *testing.T) {
Result(),
},
},
expectedErrors: nil,
expectedErrors: []error{errors.New("hook <from-annotation> in container container1 failed to execute, err: pod hook error")},
},
{
name: "should return no error when hook from annotation executes after 10ms wait for container to start",
@ -422,7 +422,7 @@ func TestWaitExecHandleHooks(t *testing.T) {
},
},
{
name: "should return no error when spec hook with wait timeout expires with OnError mode Continue",
name: "should return error when spec hook with wait timeout expires with OnError mode Continue",
groupResource: "pods",
initialPod: builder.ForPod("default", "my-pod").
Containers(&v1.Container{
@ -435,7 +435,7 @@ func TestWaitExecHandleHooks(t *testing.T) {
},
}).
Result(),
expectedErrors: nil,
expectedErrors: []error{errors.New("hook my-hook-1 in container container1 in pod default/my-pod not executed: context deadline exceeded")},
byContainer: map[string][]PodExecRestoreHook{
"container1": {
{
@ -515,8 +515,8 @@ func TestWaitExecHandleHooks(t *testing.T) {
sharedHooksContextTimeout: time.Millisecond,
},
{
name: "should return no error when shared hooks context is canceled before spec hook with OnError mode Continue executes",
expectedErrors: nil,
name: "should return error when shared hooks context is canceled before spec hook with OnError mode Continue executes",
expectedErrors: []error{errors.New("hook my-hook-1 in container container1 in pod default/my-pod not executed: context deadline exceeded")},
groupResource: "pods",
initialPod: builder.ForPod("default", "my-pod").
Containers(&v1.Container{

View File

@ -261,12 +261,12 @@ type ExecHook struct {
type HookErrorMode string
const (
// HookErrorModeContinue means that an error from a hook is acceptable, and the backup can
// proceed.
// HookErrorModeContinue means that an error from a hook is acceptable, and the backup/restore can
// proceed with the rest of the hooks' execution. The backup/restore will end up in `PartiallyFailed` status.
HookErrorModeContinue HookErrorMode = "Continue"
// HookErrorModeFail means that an error from a hook is problematic, and the backup should be in
// error.
// HookErrorModeFail means that an error from a hook is problematic, and Velero should stop executing the remaining hooks.
// The backup/restore will end up in `PartiallyFailed` status.
HookErrorModeFail HookErrorMode = "Fail"
)

View File

@ -311,7 +311,7 @@ func (kr *kubernetesRestorer) RestoreWithResolvers(
discoveryHelper: kr.discoveryHelper,
resourcePriorities: kr.resourcePriorities,
resourceRestoreHooks: resourceRestoreHooks,
hooksErrs: make(chan error),
hooksErrs: make(chan hook.HookErrInfo),
waitExecHookHandler: waitExecHookHandler,
hooksContext: hooksCtx,
hooksCancelFunc: hooksCancelFunc,
@ -360,7 +360,7 @@ type restoreContext struct {
discoveryHelper discovery.Helper
resourcePriorities Priorities
hooksWaitGroup sync.WaitGroup
hooksErrs chan error
hooksErrs chan hook.HookErrInfo
resourceRestoreHooks []hook.ResourceRestoreHook
waitExecHookHandler hook.WaitExecHookHandler
hooksContext go_context.Context
@ -655,8 +655,8 @@ func (ctx *restoreContext) execute() (results.Result, results.Result) {
ctx.hooksWaitGroup.Wait()
close(ctx.hooksErrs)
}()
for err := range ctx.hooksErrs {
errs.Velero = append(errs.Velero, err.Error())
for errInfo := range ctx.hooksErrs {
errs.Add(errInfo.Namespace, errInfo.Err)
}
ctx.log.Info("Done waiting for all post-restore exec hooks to complete")
@ -1898,10 +1898,11 @@ func (ctx *restoreContext) waitExec(createdObj *unstructured.Unstructured) {
// on the ctx.podVolumeErrs channel.
defer ctx.hooksWaitGroup.Done()
podNs := createdObj.GetNamespace()
pod := new(v1.Pod)
if err := runtime.DefaultUnstructuredConverter.FromUnstructured(createdObj.UnstructuredContent(), &pod); err != nil {
ctx.log.WithError(err).Error("error converting unstructured pod")
ctx.hooksErrs <- err
ctx.hooksErrs <- hook.HookErrInfo{Namespace: podNs, Err: err}
return
}
execHooksByContainer, err := hook.GroupRestoreExecHooks(
@ -1911,7 +1912,7 @@ func (ctx *restoreContext) waitExec(createdObj *unstructured.Unstructured) {
)
if err != nil {
ctx.log.WithError(err).Errorf("error getting exec hooks for pod %s/%s", pod.Namespace, pod.Name)
ctx.hooksErrs <- err
ctx.hooksErrs <- hook.HookErrInfo{Namespace: podNs, Err: err}
return
}
@ -1921,7 +1922,7 @@ func (ctx *restoreContext) waitExec(createdObj *unstructured.Unstructured) {
for _, err := range errs {
// Errors are already logged in the HandleHooks method.
ctx.hooksErrs <- err
ctx.hooksErrs <- hook.HookErrInfo{Namespace: podNs, Err: err}
}
}
}()