parent fc98268181
commit d23418b5b5

@@ -0,0 +1 @@
+Fix issue 5043, after the restore pod is scheduled, check if the node-agent pod is running in the same node.
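For context, a check like the one this fix describes can be expressed directly against the Kubernetes API. The sketch below is a minimal standalone illustration, not Velero's nodeagent.IsRunningInNode: it assumes the node-agent DaemonSet pods carry a name=node-agent label and simply looks for a running, non-terminating pod of that DaemonSet on the target node.

// Minimal standalone sketch of a "node-agent is running on this node" check.
// The name=node-agent label selector is an assumption for illustration.
package example

import (
    "context"
    "fmt"

    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
)

func isNodeAgentRunningOnNode(ctx context.Context, client kubernetes.Interface, namespace, nodeName string) error {
    pods, err := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
        LabelSelector: "name=node-agent",           // assumed label of the node-agent DaemonSet pods
        FieldSelector: "spec.nodeName=" + nodeName, // only pods scheduled to this node
    })
    if err != nil {
        return fmt.Errorf("listing node-agent pods: %w", err)
    }
    for _, p := range pods.Items {
        if p.Status.Phase == corev1.PodRunning && p.DeletionTimestamp == nil {
            return nil // found a healthy node-agent pod on the node
        }
    }
    return fmt.Errorf("no running node-agent pod found on node %s", nodeName)
}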
@@ -63,7 +63,7 @@ type restorer struct {
     resultsLock    sync.Mutex
     results        map[string]chan *velerov1api.PodVolumeRestore
-    nodeAgentCheck chan struct{}
+    nodeAgentCheck chan error
     log            logrus.FieldLogger
 }
@@ -145,7 +145,7 @@ func (r *restorer) RestorePodVolumes(data RestoreData) []error {
     r.results[resultsKey(data.Pod.Namespace, data.Pod.Name)] = resultsChan
     r.resultsLock.Unlock()

-    r.nodeAgentCheck = make(chan struct{})
+    r.nodeAgentCheck = make(chan error)

     var (
         errs []error
@@ -179,11 +179,12 @@ func (r *restorer) RestorePodVolumes(data RestoreData) []error {
         numRestores++
     }

+    checkCtx, checkCancel := context.WithCancel(context.Background())
     go func() {
         nodeName := ""

         checkFunc := func(ctx context.Context) (bool, error) {
-            newObj, err := r.kubeClient.CoreV1().Pods(data.Pod.Namespace).Get(context.TODO(), data.Pod.Name, metav1.GetOptions{})
+            newObj, err := r.kubeClient.CoreV1().Pods(data.Pod.Namespace).Get(ctx, data.Pod.Name, metav1.GetOptions{})
             if err != nil {
                 return false, err
             }
@@ -198,16 +199,16 @@ func (r *restorer) RestorePodVolumes(data RestoreData) []error {
             }
         }

-        err := wait.PollWithContext(r.ctx, time.Millisecond*500, time.Minute*10, checkFunc)
+        err := wait.PollWithContext(checkCtx, time.Millisecond*500, time.Minute*10, checkFunc)
         if err == wait.ErrWaitTimeout {
-            r.log.WithError(err).Error("Restoring pod is not scheduled until timeout, disengage")
+            r.log.WithError(err).Error("Restoring pod is not scheduled until timeout or cancel, disengage")
         } else if err != nil {
             r.log.WithError(err).Error("Failed to check node-agent pod status, disengage")
         } else {
-            err = nodeagent.IsRunningInNode(r.ctx, data.Restore.Namespace, nodeName, r.podClient)
+            err = nodeagent.IsRunningInNode(checkCtx, data.Restore.Namespace, nodeName, r.podClient)
             if err != nil {
-                r.log.WithField("node", nodeName).WithError(err).Error("node-agent pod is not running on node, abort the restore")
-                r.nodeAgentCheck <- struct{}{}
+                r.log.WithField("node", nodeName).WithError(err).Error("node-agent pod is not running in node, abort the restore")
+                r.nodeAgentCheck <- errors.Wrapf(err, "node-agent pod is not running in node %s", nodeName)
             }
         }
     }()
@@ -222,12 +223,17 @@ ForEachVolume:
             if res.Status.Phase == velerov1api.PodVolumeRestorePhaseFailed {
                 errs = append(errs, errors.Errorf("pod volume restore failed: %s", res.Status.Message))
             }
-        case <-r.nodeAgentCheck:
-            errs = append(errs, errors.New("node agent pod is not running in node"))
+        case err := <-r.nodeAgentCheck:
+            errs = append(errs, err)
             break ForEachVolume
         }
     }

+    // This is to prevent the case that resultsChan is signaled before nodeAgentCheck though this is unlikely possible.
+    // One possible case is that the CR is edited and set to an ending state manually, either completed or failed.
+    // In this case, we must notify the check routine to stop.
+    checkCancel()
+
     r.resultsLock.Lock()
     delete(r.results, resultsKey(data.Pod.Namespace, data.Pod.Name))
     r.resultsLock.Unlock()
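The added comment and checkCancel() above implement a common pattern: a side goroutine polls for a failure condition and reports it over an error channel, while the caller cancels the goroutine's context once its own wait loop finishes so the checker never keeps polling or blocks in the background. A minimal standalone sketch of that pattern (all names are illustrative, not Velero's) could look like this:

// Standalone sketch of the cancellable side-check pattern used above. The side
// channel is buffered here so the checker can never block on send; the defer
// mirrors the explicit checkCancel() call in the hunk.
package example

import (
    "context"
    "errors"
    "fmt"
    "time"

    "k8s.io/apimachinery/pkg/util/wait"
)

func waitWithSideCheck(check func(ctx context.Context) (bool, error), results <-chan string) error {
    checkCtx, checkCancel := context.WithCancel(context.Background())
    defer checkCancel() // stop the checker once the main wait loop is done

    sideErr := make(chan error, 1)

    go func() {
        if err := wait.PollWithContext(checkCtx, 500*time.Millisecond, 10*time.Minute, check); err != nil {
            sideErr <- fmt.Errorf("side check failed: %w", err)
        }
    }()

    select {
    case res := <-results:
        fmt.Println("result:", res)
        return nil
    case err := <-sideErr:
        return err // the side check reported a failure, abort early
    case <-time.After(30 * time.Minute):
        return errors.New("timed out waiting for results")
    }
}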
@@ -23,30 +23,38 @@ import (
 // IsPodRunning does a well-rounded check to make sure the specified pod is running stably.
 // If not, return the error found
 func IsPodRunning(pod *corev1api.Pod) error {
-    if pod.Spec.NodeName == "" {
-        return errors.Errorf("pod is not scheduled, name=%s, namespace=%s, phase=%s", pod.Name, pod.Namespace, pod.Status.Phase)
-    }
-
-    if pod.Status.Phase != corev1api.PodRunning {
-        return errors.Errorf("pod is not running, name=%s, namespace=%s, phase=%s", pod.Name, pod.Namespace, pod.Status.Phase)
-    }
-
-    if pod.DeletionTimestamp != nil {
-        return errors.Errorf("pod is being terminated, name=%s, namespace=%s, phase=%s", pod.Name, pod.Namespace, pod.Status.Phase)
-    }
-
-    return nil
+    return isPodScheduledInStatus(pod, func(pod *corev1api.Pod) error {
+        if pod.Status.Phase != corev1api.PodRunning {
+            return errors.New("pod is not running")
+        }
+
+        return nil
+    })
 }

-// IsPodRunning does a well-rounded check to make sure the specified pod has been scheduled into a node and in a stable status.
+// IsPodScheduled does a well-rounded check to make sure the specified pod has been scheduled into a node and in a stable status.
 // If not, return the error found
 func IsPodScheduled(pod *corev1api.Pod) error {
+    return isPodScheduledInStatus(pod, func(pod *corev1api.Pod) error {
+        if pod.Status.Phase != corev1api.PodRunning && pod.Status.Phase != corev1api.PodPending {
+            return errors.New("pod is not running or pending")
+        }
+
+        return nil
+    })
+}
+
+func isPodScheduledInStatus(pod *corev1api.Pod, statusCheckFunc func(*corev1api.Pod) error) error {
     if pod == nil {
         return errors.New("invalid input pod")
     }

     if pod.Spec.NodeName == "" {
         return errors.Errorf("pod is not scheduled, name=%s, namespace=%s, phase=%s", pod.Name, pod.Namespace, pod.Status.Phase)
     }

-    if pod.Status.Phase != corev1api.PodRunning && pod.Status.Phase != corev1api.PodPending {
-        return errors.Errorf("pod is not in a stable status, name=%s, namespace=%s, phase=%s", pod.Name, pod.Namespace, pod.Status.Phase)
+    if err := statusCheckFunc(pod); err != nil {
+        return errors.Wrapf(err, "pod is not in the expected status, name=%s, namespace=%s, phase=%s", pod.Name, pod.Namespace, pod.Status.Phase)
     }

     if pod.DeletionTimestamp != nil {
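A brief usage sketch for the refactored helpers, assuming they live in a package imported here as kube (the import path and the surrounding wiring are assumptions for illustration, not part of this commit): poll until the pod is scheduled, treating an IsPodScheduled error as "not ready yet" rather than fatal, and capture the node name once it is.

// Usage sketch: wait for a pod to be scheduled using the refactored helper.
package example

import (
    "context"
    "time"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/util/wait"
    "k8s.io/client-go/kubernetes"

    "github.com/vmware-tanzu/velero/pkg/util/kube" // assumed import path for illustration
)

func waitUntilScheduled(ctx context.Context, client kubernetes.Interface, ns, name string) (string, error) {
    nodeName := ""
    err := wait.PollWithContext(ctx, 500*time.Millisecond, 10*time.Minute, func(ctx context.Context) (bool, error) {
        pod, err := client.CoreV1().Pods(ns).Get(ctx, name, metav1.GetOptions{})
        if err != nil {
            return false, err // give up on API errors
        }
        if err := kube.IsPodScheduled(pod); err != nil {
            return false, nil // not scheduled (or not stable) yet; keep polling
        }
        nodeName = pod.Spec.NodeName
        return true, nil
    })
    return nodeName, err
}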