parent fc98268181
commit d23418b5b5

@@ -0,0 +1 @@
+Fix issue 5043, after the restore pod is scheduled, check if the node-agent pod is running in the same node.
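For context, a check like the one this fix describes can be expressed directly against the Kubernetes API. The sketch below is a minimal standalone illustration, not Velero's nodeagent.IsRunningInNode: it assumes the node-agent DaemonSet pods carry a name=node-agent label and simply looks for a running, non-terminating pod of that DaemonSet on the target node.

// Minimal standalone sketch of a "node-agent is running on this node" check.
// The name=node-agent label selector is an assumption for illustration.
package example

import (
    "context"
    "fmt"

    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
)

func isNodeAgentRunningOnNode(ctx context.Context, client kubernetes.Interface, namespace, nodeName string) error {
    pods, err := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
        LabelSelector: "name=node-agent",           // assumed label of the node-agent DaemonSet pods
        FieldSelector: "spec.nodeName=" + nodeName, // only pods scheduled to this node
    })
    if err != nil {
        return fmt.Errorf("listing node-agent pods: %w", err)
    }
    for _, p := range pods.Items {
        if p.Status.Phase == corev1.PodRunning && p.DeletionTimestamp == nil {
            return nil // found a healthy node-agent pod on the node
        }
    }
    return fmt.Errorf("no running node-agent pod found on node %s", nodeName)
}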
@@ -63,7 +63,7 @@ type restorer struct {
     resultsLock    sync.Mutex
     results        map[string]chan *velerov1api.PodVolumeRestore
-    nodeAgentCheck chan struct{}
+    nodeAgentCheck chan error
     log            logrus.FieldLogger
 }
@@ -145,7 +145,7 @@ func (r *restorer) RestorePodVolumes(data RestoreData) []error {
     r.results[resultsKey(data.Pod.Namespace, data.Pod.Name)] = resultsChan
     r.resultsLock.Unlock()

-    r.nodeAgentCheck = make(chan struct{})
+    r.nodeAgentCheck = make(chan error)

     var (
         errs []error
@@ -179,11 +179,12 @@ func (r *restorer) RestorePodVolumes(data RestoreData) []error {
         numRestores++
     }

+    checkCtx, checkCancel := context.WithCancel(context.Background())
     go func() {
         nodeName := ""

         checkFunc := func(ctx context.Context) (bool, error) {
-            newObj, err := r.kubeClient.CoreV1().Pods(data.Pod.Namespace).Get(context.TODO(), data.Pod.Name, metav1.GetOptions{})
+            newObj, err := r.kubeClient.CoreV1().Pods(data.Pod.Namespace).Get(ctx, data.Pod.Name, metav1.GetOptions{})
             if err != nil {
                 return false, err
             }
@@ -198,16 +199,16 @@ func (r *restorer) RestorePodVolumes(data RestoreData) []error {
             }
         }

-        err := wait.PollWithContext(r.ctx, time.Millisecond*500, time.Minute*10, checkFunc)
+        err := wait.PollWithContext(checkCtx, time.Millisecond*500, time.Minute*10, checkFunc)
         if err == wait.ErrWaitTimeout {
-            r.log.WithError(err).Error("Restoring pod is not scheduled until timeout, disengage")
+            r.log.WithError(err).Error("Restoring pod is not scheduled until timeout or cancel, disengage")
         } else if err != nil {
             r.log.WithError(err).Error("Failed to check node-agent pod status, disengage")
         } else {
-            err = nodeagent.IsRunningInNode(r.ctx, data.Restore.Namespace, nodeName, r.podClient)
+            err = nodeagent.IsRunningInNode(checkCtx, data.Restore.Namespace, nodeName, r.podClient)
             if err != nil {
-                r.log.WithField("node", nodeName).WithError(err).Error("node-agent pod is not running on node, abort the restore")
-                r.nodeAgentCheck <- struct{}{}
+                r.log.WithField("node", nodeName).WithError(err).Error("node-agent pod is not running in node, abort the restore")
+                r.nodeAgentCheck <- errors.Wrapf(err, "node-agent pod is not running in node %s", nodeName)
             }
         }
     }()
@@ -222,12 +223,17 @@ ForEachVolume:
             if res.Status.Phase == velerov1api.PodVolumeRestorePhaseFailed {
                 errs = append(errs, errors.Errorf("pod volume restore failed: %s", res.Status.Message))
             }
-        case <-r.nodeAgentCheck:
-            errs = append(errs, errors.New("node agent pod is not running in node"))
+        case err := <-r.nodeAgentCheck:
+            errs = append(errs, err)
             break ForEachVolume
         }
     }

+    // This is to prevent the case that resultsChan is signaled before nodeAgentCheck though this is unlikely possible.
+    // One possible case is that the CR is edited and set to an ending state manually, either completed or failed.
+    // In this case, we must notify the check routine to stop.
+    checkCancel()
+
     r.resultsLock.Lock()
     delete(r.results, resultsKey(data.Pod.Namespace, data.Pod.Name))
     r.resultsLock.Unlock()
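The added comment and checkCancel() above implement a common pattern: a side goroutine polls for a failure condition and reports it over an error channel, while the caller cancels the goroutine's context once its own wait loop finishes so the checker never keeps polling or blocks in the background. A minimal standalone sketch of that pattern (all names are illustrative, not Velero's) could look like this:

// Standalone sketch of the cancellable side-check pattern used above. The side
// channel is buffered here so the checker can never block on send; the defer
// mirrors the explicit checkCancel() call in the hunk.
package example

import (
    "context"
    "errors"
    "fmt"
    "time"

    "k8s.io/apimachinery/pkg/util/wait"
)

func waitWithSideCheck(check func(ctx context.Context) (bool, error), results <-chan string) error {
    checkCtx, checkCancel := context.WithCancel(context.Background())
    defer checkCancel() // stop the checker once the main wait loop is done

    sideErr := make(chan error, 1)

    go func() {
        if err := wait.PollWithContext(checkCtx, 500*time.Millisecond, 10*time.Minute, check); err != nil {
            sideErr <- fmt.Errorf("side check failed: %w", err)
        }
    }()

    select {
    case res := <-results:
        fmt.Println("result:", res)
        return nil
    case err := <-sideErr:
        return err // the side check reported a failure, abort early
    case <-time.After(30 * time.Minute):
        return errors.New("timed out waiting for results")
    }
}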
@@ -23,30 +23,38 @@ import (
 // IsPodRunning does a well-rounded check to make sure the specified pod is running stably.
 // If not, return the error found
 func IsPodRunning(pod *corev1api.Pod) error {
-    if pod.Spec.NodeName == "" {
-        return errors.Errorf("pod is not scheduled, name=%s, namespace=%s, phase=%s", pod.Name, pod.Namespace, pod.Status.Phase)
-    }
-
-    if pod.Status.Phase != corev1api.PodRunning {
-        return errors.Errorf("pod is not running, name=%s, namespace=%s, phase=%s", pod.Name, pod.Namespace, pod.Status.Phase)
-    }
-
-    if pod.DeletionTimestamp != nil {
-        return errors.Errorf("pod is being terminated, name=%s, namespace=%s, phase=%s", pod.Name, pod.Namespace, pod.Status.Phase)
-    }
-
-    return nil
+    return isPodScheduledInStatus(pod, func(pod *corev1api.Pod) error {
+        if pod.Status.Phase != corev1api.PodRunning {
+            return errors.New("pod is not running")
+        }
+
+        return nil
+    })
 }

-// IsPodRunning does a well-rounded check to make sure the specified pod has been scheduled into a node and in a stable status.
+// IsPodScheduled does a well-rounded check to make sure the specified pod has been scheduled into a node and in a stable status.
 // If not, return the error found
 func IsPodScheduled(pod *corev1api.Pod) error {
+    return isPodScheduledInStatus(pod, func(pod *corev1api.Pod) error {
+        if pod.Status.Phase != corev1api.PodRunning && pod.Status.Phase != corev1api.PodPending {
+            return errors.New("pod is not running or pending")
+        }
+
+        return nil
+    })
+}
+
+func isPodScheduledInStatus(pod *corev1api.Pod, statusCheckFunc func(*corev1api.Pod) error) error {
     if pod == nil {
         return errors.New("invalid input pod")
     }

     if pod.Spec.NodeName == "" {
         return errors.Errorf("pod is not scheduled, name=%s, namespace=%s, phase=%s", pod.Name, pod.Namespace, pod.Status.Phase)
     }

-    if pod.Status.Phase != corev1api.PodRunning && pod.Status.Phase != corev1api.PodPending {
-        return errors.Errorf("pod is not in a stable status, name=%s, namespace=%s, phase=%s", pod.Name, pod.Namespace, pod.Status.Phase)
+    if err := statusCheckFunc(pod); err != nil {
+        return errors.Wrapf(err, "pod is not in the expected status, name=%s, namespace=%s, phase=%s", pod.Name, pod.Namespace, pod.Status.Phase)
     }

     if pod.DeletionTimestamp != nil {
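A brief usage sketch for the refactored helpers, assuming they live in a package imported here as kube (the import path and the surrounding wiring are assumptions for illustration, not part of this commit): poll until the pod is scheduled, treating an IsPodScheduled error as "not ready yet" rather than fatal, and capture the node name once it is.

// Usage sketch: wait for a pod to be scheduled using the refactored helper.
package example

import (
    "context"
    "time"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/util/wait"
    "k8s.io/client-go/kubernetes"

    "github.com/vmware-tanzu/velero/pkg/util/kube" // assumed import path for illustration
)

func waitUntilScheduled(ctx context.Context, client kubernetes.Interface, ns, name string) (string, error) {
    nodeName := ""
    err := wait.PollWithContext(ctx, 500*time.Millisecond, 10*time.Minute, func(ctx context.Context) (bool, error) {
        pod, err := client.CoreV1().Pods(ns).Get(ctx, name, metav1.GetOptions{})
        if err != nil {
            return false, err // give up on API errors
        }
        if err := kube.IsPodScheduled(pod); err != nil {
            return false, nil // not scheduled (or not stable) yet; keep polling
        }
        nodeName = pod.Spec.NodeName
        return true, nil
    })
    return nodeName, err
}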