add change log

Signed-off-by: Lyndon-Li <lyonghui@vmware.com>
pull/5760/head
Lyndon-Li 2023-01-12 17:47:42 +08:00
parent fc98268181
commit d23418b5b5
3 changed files with 40 additions and 25 deletions


@@ -0,0 +1 @@
+Fix issue 5043: after the restore pod is scheduled, check if the node-agent pod is running in the same node.
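The changelog entry summarizes the fix: first wait until the restore pod lands on a node, then verify that a node-agent pod is running on that same node. Below is a minimal sketch of the polling step, using a fake clientset so it runs standalone; waitForPodScheduled and the namespace/pod names are illustrative, not Velero's API.

package main

import (
	"context"
	"fmt"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/fake"
)

// waitForPodScheduled polls until the pod has a node assigned, mirroring the
// checkFunc pattern in the diff (this helper is a hypothetical name, not Velero's).
func waitForPodScheduled(ctx context.Context, client kubernetes.Interface, ns, name string) (string, error) {
	nodeName := ""
	checkFunc := func(ctx context.Context) (bool, error) {
		pod, err := client.CoreV1().Pods(ns).Get(ctx, name, metav1.GetOptions{})
		if err != nil {
			return false, err
		}
		if pod.Spec.NodeName == "" {
			return false, nil // not scheduled yet; keep polling
		}
		nodeName = pod.Spec.NodeName
		return true, nil
	}
	// Same cadence as the diff: poll every 500ms, give up after 10 minutes.
	err := wait.PollWithContext(ctx, 500*time.Millisecond, 10*time.Minute, checkFunc)
	return nodeName, err
}

func main() {
	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{Namespace: "ns", Name: "restore-pod"},
		Spec:       corev1.PodSpec{NodeName: "node-1"}, // already scheduled
	}
	client := fake.NewSimpleClientset(pod)
	node, err := waitForPodScheduled(context.Background(), client, "ns", "restore-pod")
	fmt.Println(node, err) // "node-1 <nil>"
}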


@@ -63,7 +63,7 @@ type restorer struct {
 	resultsLock    sync.Mutex
 	results        map[string]chan *velerov1api.PodVolumeRestore
-	nodeAgentCheck chan struct{}
+	nodeAgentCheck chan error
 	log            logrus.FieldLogger
 }
@@ -145,7 +145,7 @@ func (r *restorer) RestorePodVolumes(data RestoreData) []error {
 	r.results[resultsKey(data.Pod.Namespace, data.Pod.Name)] = resultsChan
 	r.resultsLock.Unlock()
 
-	r.nodeAgentCheck = make(chan struct{})
+	r.nodeAgentCheck = make(chan error)
 
 	var (
 		errs []error
@@ -179,11 +179,12 @@ func (r *restorer) RestorePodVolumes(data RestoreData) []error {
 		numRestores++
 	}
 
+	checkCtx, checkCancel := context.WithCancel(context.Background())
 	go func() {
 		nodeName := ""
 
 		checkFunc := func(ctx context.Context) (bool, error) {
-			newObj, err := r.kubeClient.CoreV1().Pods(data.Pod.Namespace).Get(context.TODO(), data.Pod.Name, metav1.GetOptions{})
+			newObj, err := r.kubeClient.CoreV1().Pods(data.Pod.Namespace).Get(ctx, data.Pod.Name, metav1.GetOptions{})
 			if err != nil {
 				return false, err
 			}
@@ -198,16 +199,16 @@ func (r *restorer) RestorePodVolumes(data RestoreData) []error {
 			}
 		}
 
-		err := wait.PollWithContext(r.ctx, time.Millisecond*500, time.Minute*10, checkFunc)
+		err := wait.PollWithContext(checkCtx, time.Millisecond*500, time.Minute*10, checkFunc)
 		if err == wait.ErrWaitTimeout {
-			r.log.WithError(err).Error("Restoring pod is not scheduled until timeout, disengage")
+			r.log.WithError(err).Error("Restoring pod is not scheduled until timeout or cancel, disengage")
 		} else if err != nil {
 			r.log.WithError(err).Error("Failed to check node-agent pod status, disengage")
 		} else {
-			err = nodeagent.IsRunningInNode(r.ctx, data.Restore.Namespace, nodeName, r.podClient)
+			err = nodeagent.IsRunningInNode(checkCtx, data.Restore.Namespace, nodeName, r.podClient)
 			if err != nil {
-				r.log.WithField("node", nodeName).WithError(err).Error("node-agent pod is not running on node, abort the restore")
-				r.nodeAgentCheck <- struct{}{}
+				r.log.WithField("node", nodeName).WithError(err).Error("node-agent pod is not running in node, abort the restore")
+				r.nodeAgentCheck <- errors.Wrapf(err, "node-agent pod is not running in node %s", nodeName)
 			}
 		}
 	}()
@@ -222,12 +223,17 @@ ForEachVolume:
 			if res.Status.Phase == velerov1api.PodVolumeRestorePhaseFailed {
 				errs = append(errs, errors.Errorf("pod volume restore failed: %s", res.Status.Message))
 			}
-		case <-r.nodeAgentCheck:
-			errs = append(errs, errors.New("node agent pod is not running in node"))
+		case err := <-r.nodeAgentCheck:
+			errs = append(errs, err)
 			break ForEachVolume
 		}
 	}
 
+	// This is to prevent the case that resultsChan is signaled before nodeAgentCheck, though this is unlikely.
+	// One possible case is that the CR is edited and set to an ending state manually, either completed or failed.
+	// In this case, we must notify the check routine to stop.
+	checkCancel()
+
 	r.resultsLock.Lock()
 	delete(r.results, resultsKey(data.Pod.Namespace, data.Pod.Name))
 	r.resultsLock.Unlock()
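Changing nodeAgentCheck from chan struct{} to chan error lets the checker goroutine hand the wrapped cause directly to RestorePodVolumes, and checkCancel() stops the checker once the result loop exits. A standalone sketch of that pattern follows, assuming nothing from Velero; the select-on-send in the goroutine is a defensive variant (not the diff's exact code) so the sender also cannot block after the consumer leaves.

package main

import (
	"context"
	"errors"
	"fmt"
)

func main() {
	checkCtx, checkCancel := context.WithCancel(context.Background())
	nodeAgentCheck := make(chan error)
	results := make(chan string, 1)
	results <- "volume-1 restored"

	// Checker goroutine: reports its finding as an error value.
	go func() {
		err := errors.New("node-agent pod is not running in node node-1")
		select {
		case nodeAgentCheck <- err: // consumer is listening
		case <-checkCtx.Done(): // consumer already left; exit quietly
		}
	}()

	var errs []error
ForEachVolume:
	for i := 0; i < 2; i++ {
		select {
		case res := <-results:
			fmt.Println(res)
		case err := <-nodeAgentCheck:
			errs = append(errs, err)
			break ForEachVolume // labeled break leaves the for loop, not just the select
		}
	}
	checkCancel() // release the checker if it is still waiting to send
	fmt.Println(errs)
}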


@@ -23,30 +23,38 @@ import (
 // IsPodRunning does a well-rounded check to make sure the specified pod is running stably.
 // If not, return the error found
 func IsPodRunning(pod *corev1api.Pod) error {
-	if pod.Spec.NodeName == "" {
-		return errors.Errorf("pod is not scheduled, name=%s, namespace=%s, phase=%s", pod.Name, pod.Namespace, pod.Status.Phase)
-	}
+	return isPodScheduledInStatus(pod, func(pod *corev1api.Pod) error {
+		if pod.Status.Phase != corev1api.PodRunning {
+			return errors.New("pod is not running")
+		}
 
-	if pod.Status.Phase != corev1api.PodRunning {
-		return errors.Errorf("pod is not running, name=%s, namespace=%s, phase=%s", pod.Name, pod.Namespace, pod.Status.Phase)
-	}
-
-	if pod.DeletionTimestamp != nil {
-		return errors.Errorf("pod is being terminated, name=%s, namespace=%s, phase=%s", pod.Name, pod.Namespace, pod.Status.Phase)
-	}
-
-	return nil
+		return nil
+	})
 }
 
-// IsPodRunning does a well-rounded check to make sure the specified pod has been scheduled into a node and in a stable status.
+// IsPodScheduled does a well-rounded check to make sure the specified pod has been scheduled into a node and in a stable status.
 // If not, return the error found
 func IsPodScheduled(pod *corev1api.Pod) error {
+	return isPodScheduledInStatus(pod, func(pod *corev1api.Pod) error {
+		if pod.Status.Phase != corev1api.PodRunning && pod.Status.Phase != corev1api.PodPending {
+			return errors.New("pod is not running or pending")
+		}
+
+		return nil
+	})
+}
+
+func isPodScheduledInStatus(pod *corev1api.Pod, statusCheckFunc func(*corev1api.Pod) error) error {
 	if pod == nil {
 		return errors.New("invalid input pod")
 	}
 
 	if pod.Spec.NodeName == "" {
 		return errors.Errorf("pod is not scheduled, name=%s, namespace=%s, phase=%s", pod.Name, pod.Namespace, pod.Status.Phase)
 	}
 
-	if pod.Status.Phase != corev1api.PodRunning && pod.Status.Phase != corev1api.PodPending {
-		return errors.Errorf("pod is not in a stable status, name=%s, namespace=%s, phase=%s", pod.Name, pod.Namespace, pod.Status.Phase)
+	if err := statusCheckFunc(pod); err != nil {
+		return errors.Wrapf(err, "pod is not in the expected status, name=%s, namespace=%s, phase=%s", pod.Name, pod.Namespace, pod.Status.Phase)
 	}
 
 	if pod.DeletionTimestamp != nil {
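The hunk is truncated here; the net effect of the refactor is that IsPodRunning and IsPodScheduled now share isPodScheduledInStatus and differ only in the phase predicate they pass in. A hedged usage sketch follows, assuming these helpers are exported from Velero's pkg/util/kube package; the import path is inferred from the code, not confirmed by the diff.

package main

import (
	"fmt"

	corev1api "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/vmware-tanzu/velero/pkg/util/kube" // assumed import path
)

func main() {
	// A pod that is scheduled (NodeName set) but still Pending.
	pod := &corev1api.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "restore-pod", Namespace: "ns"},
		Spec:       corev1api.PodSpec{NodeName: "node-1"},
		Status:     corev1api.PodStatus{Phase: corev1api.PodPending},
	}

	// Passes: Pending counts as a stable status for a scheduled pod.
	fmt.Println(kube.IsPodScheduled(pod)) // <nil>

	// Fails: the IsPodRunning predicate requires PodRunning, so the error is
	// wrapped as "pod is not in the expected status, ...: pod is not running".
	fmt.Println(kube.IsPodRunning(pod))
}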