Merge pull request #7022 from allenxu404/i6721

Fix inconsistent behavior of Backup and Restore hook execution
pull/6553/head
Anshul Ahuja 2023-11-06 14:01:30 +05:30 committed by GitHub
commit 6b7ce6655d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 31 additions and 26 deletions

View File

@ -0,0 +1 @@
Fix inconsistent behavior of Backup and Restore hook execution

View File

@ -50,6 +50,11 @@ type DefaultListWatchFactory struct {
PodsGetter cache.Getter
}
// HookErrInfo associates a hook execution error with the namespace of the
// pod the hook ran in, so that callers (e.g. the restore error collector)
// can attribute the error to a specific namespace instead of a flat list.
type HookErrInfo struct {
// Namespace is the namespace of the pod whose hook produced Err.
Namespace string
// Err is the error returned by the hook execution.
Err error
}
// NewListWatch returns a ListerWatcher that lists and watches pods in the
// given namespace, restricted by the supplied field selector, using the
// factory's configured pods getter.
func (f *DefaultListWatchFactory) NewListWatch(namespace string, selector fields.Selector) cache.ListerWatcher {
	listWatch := cache.NewListWatchFromClient(f.PodsGetter, "pods", namespace, selector)
	return listWatch
}
@ -158,8 +163,8 @@ func (e *DefaultWaitExecHookHandler) HandleHooks(
if hook.Hook.WaitTimeout.Duration != 0 && time.Since(waitStart) > hook.Hook.WaitTimeout.Duration {
err := fmt.Errorf("hook %s in container %s expired before executing", hook.HookName, hook.Hook.Container)
hookLog.Error(err)
errors = append(errors, err)
if hook.Hook.OnError == velerov1api.HookErrorModeFail {
errors = append(errors, err)
cancel()
return
}
@ -172,8 +177,9 @@ func (e *DefaultWaitExecHookHandler) HandleHooks(
}
if err := e.PodCommandExecutor.ExecutePodCommand(hookLog, podMap, pod.Namespace, pod.Name, hook.HookName, eh); err != nil {
hookLog.WithError(err).Error("Error executing hook")
err = fmt.Errorf("hook %s in container %s failed to execute, err: %v", hook.HookName, hook.Hook.Container, err)
errors = append(errors, err)
if hook.Hook.OnError == velerov1api.HookErrorModeFail {
errors = append(errors, err)
cancel()
return
}
@ -204,10 +210,9 @@ func (e *DefaultWaitExecHookHandler) HandleHooks(
podWatcher.Run(ctx.Done())
// There are some cases where this function could return with unexecuted hooks: the pod may
// be deleted, a hook with OnError mode Fail could fail, or it may timeout waiting for
// be deleted, a hook could fail, or it may timeout waiting for
// containers to become ready.
// Each unexecuted hook is logged as an error but only hooks with OnError mode Fail return
// an error from this function.
// Each unexecuted hook is logged as an error and this error will be returned from this function.
for _, hooks := range byContainer {
for _, hook := range hooks {
if hook.executed {
@ -222,9 +227,7 @@ func (e *DefaultWaitExecHookHandler) HandleHooks(
},
)
hookLog.Error(err)
if hook.Hook.OnError == velerov1api.HookErrorModeFail {
errors = append(errors, err)
}
errors = append(errors, err)
}
}

View File

@ -209,10 +209,10 @@ func TestWaitExecHandleHooks(t *testing.T) {
Result(),
},
},
expectedErrors: []error{errors.New("pod hook error")},
expectedErrors: []error{errors.New("hook <from-annotation> in container container1 failed to execute, err: pod hook error")},
},
{
name: "should return no error when hook from annotation fails with on error mode continue",
name: "should return error when hook from annotation fails with on error mode continue",
initialPod: builder.ForPod("default", "my-pod").
ObjectMeta(builder.WithAnnotations(
podRestoreHookCommandAnnotationKey, "/usr/bin/foo",
@ -278,7 +278,7 @@ func TestWaitExecHandleHooks(t *testing.T) {
Result(),
},
},
expectedErrors: nil,
expectedErrors: []error{errors.New("hook <from-annotation> in container container1 failed to execute, err: pod hook error")},
},
{
name: "should return no error when hook from annotation executes after 10ms wait for container to start",
@ -422,7 +422,7 @@ func TestWaitExecHandleHooks(t *testing.T) {
},
},
{
name: "should return no error when spec hook with wait timeout expires with OnError mode Continue",
name: "should return error when spec hook with wait timeout expires with OnError mode Continue",
groupResource: "pods",
initialPod: builder.ForPod("default", "my-pod").
Containers(&v1.Container{
@ -435,7 +435,7 @@ func TestWaitExecHandleHooks(t *testing.T) {
},
}).
Result(),
expectedErrors: nil,
expectedErrors: []error{errors.New("hook my-hook-1 in container container1 in pod default/my-pod not executed: context deadline exceeded")},
byContainer: map[string][]PodExecRestoreHook{
"container1": {
{
@ -515,8 +515,8 @@ func TestWaitExecHandleHooks(t *testing.T) {
sharedHooksContextTimeout: time.Millisecond,
},
{
name: "should return no error when shared hooks context is canceled before spec hook with OnError mode Continue executes",
expectedErrors: nil,
name: "should return error when shared hooks context is canceled before spec hook with OnError mode Continue executes",
expectedErrors: []error{errors.New("hook my-hook-1 in container container1 in pod default/my-pod not executed: context deadline exceeded")},
groupResource: "pods",
initialPod: builder.ForPod("default", "my-pod").
Containers(&v1.Container{

View File

@ -261,12 +261,12 @@ type ExecHook struct {
type HookErrorMode string
const (
// HookErrorModeContinue means that an error from a hook is acceptable, and the backup can
// proceed.
// HookErrorModeContinue means that an error from a hook is acceptable, and the backup/restore can
// proceed with the rest of the hooks' execution. The backup/restore will end up in `PartiallyFailed` status.
HookErrorModeContinue HookErrorMode = "Continue"
// HookErrorModeFail means that an error from a hook is problematic, and the backup should be in
// error.
// HookErrorModeFail means that an error from a hook is problematic, and Velero should stop executing the remaining hooks.
// The backup/restore will end up in `PartiallyFailed` status.
HookErrorModeFail HookErrorMode = "Fail"
)

View File

@ -311,7 +311,7 @@ func (kr *kubernetesRestorer) RestoreWithResolvers(
discoveryHelper: kr.discoveryHelper,
resourcePriorities: kr.resourcePriorities,
resourceRestoreHooks: resourceRestoreHooks,
hooksErrs: make(chan error),
hooksErrs: make(chan hook.HookErrInfo),
waitExecHookHandler: waitExecHookHandler,
hooksContext: hooksCtx,
hooksCancelFunc: hooksCancelFunc,
@ -360,7 +360,7 @@ type restoreContext struct {
discoveryHelper discovery.Helper
resourcePriorities Priorities
hooksWaitGroup sync.WaitGroup
hooksErrs chan error
hooksErrs chan hook.HookErrInfo
resourceRestoreHooks []hook.ResourceRestoreHook
waitExecHookHandler hook.WaitExecHookHandler
hooksContext go_context.Context
@ -655,8 +655,8 @@ func (ctx *restoreContext) execute() (results.Result, results.Result) {
ctx.hooksWaitGroup.Wait()
close(ctx.hooksErrs)
}()
for err := range ctx.hooksErrs {
errs.Velero = append(errs.Velero, err.Error())
for errInfo := range ctx.hooksErrs {
errs.Add(errInfo.Namespace, errInfo.Err)
}
ctx.log.Info("Done waiting for all post-restore exec hooks to complete")
@ -1898,10 +1898,11 @@ func (ctx *restoreContext) waitExec(createdObj *unstructured.Unstructured) {
// on the ctx.podVolumeErrs channel.
defer ctx.hooksWaitGroup.Done()
podNs := createdObj.GetNamespace()
pod := new(v1.Pod)
if err := runtime.DefaultUnstructuredConverter.FromUnstructured(createdObj.UnstructuredContent(), &pod); err != nil {
ctx.log.WithError(err).Error("error converting unstructured pod")
ctx.hooksErrs <- err
ctx.hooksErrs <- hook.HookErrInfo{Namespace: podNs, Err: err}
return
}
execHooksByContainer, err := hook.GroupRestoreExecHooks(
@ -1911,7 +1912,7 @@ func (ctx *restoreContext) waitExec(createdObj *unstructured.Unstructured) {
)
if err != nil {
ctx.log.WithError(err).Errorf("error getting exec hooks for pod %s/%s", pod.Namespace, pod.Name)
ctx.hooksErrs <- err
ctx.hooksErrs <- hook.HookErrInfo{Namespace: podNs, Err: err}
return
}
@ -1921,7 +1922,7 @@ func (ctx *restoreContext) waitExec(createdObj *unstructured.Unstructured) {
for _, err := range errs {
// Errors are already logged in the HandleHooks method.
ctx.hooksErrs <- err
ctx.hooksErrs <- hook.HookErrInfo{Namespace: podNs, Err: err}
}
}
}()