fix WaitForPod by waiting for component Ready instead of pod Running status
parent ee6283ecf1
commit 3c7d2e0351
@@ -60,6 +60,15 @@ var (
 		"kube-proxy",
 		"kube-scheduler",
 	}
+	// SystemPodsList is a list of essential pods for running kubernetes to wait for them to be Ready
+	SystemPodsList = []string{
+		"kube-dns", // coredns
+		"etcd",
+		"kube-apiserver",
+		"kube-controller-manager",
+		"kube-proxy",
+		"kube-scheduler",
+	}
 )
 
 // ShouldWait will return true if the config says need to wait
@@ -0,0 +1,139 @@
+/*
+Copyright 2021 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package kverify verifies a running Kubernetes cluster is healthy
+package kverify
+
+import (
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/pkg/errors"
+	core "k8s.io/api/core/v1"
+	meta "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/wait"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/klog/v2"
+	kconst "k8s.io/kubernetes/cmd/kubeadm/app/constants"
+)
+
+// WaitForPodReadyByLabel waits for pod with label ([key:]val) in a namespace to be in Ready condition.
+// If namespace is not provided, it defaults to "kube-system".
+// If label key is not provided, it will try with "component" and "k8s-app".
+func WaitForPodReadyByLabel(cs *kubernetes.Clientset, label, namespace string, timeout time.Duration) error {
+	klog.Infof("waiting %v for pod with %q label in %q namespace to be Ready ...", timeout, label, namespace)
+	start := time.Now()
+	defer func() {
+		klog.Infof("duration metric: took %v to run WaitForPodReadyByLabel for pod with %q label in %q namespace ...", time.Since(start), label, namespace)
+	}()
+
+	if namespace == "" {
+		namespace = "kube-system"
+	}
+
+	lkey := ""
+	lval := ""
+	l := strings.Split(label, ":")
+	switch len(l) {
+	case 1: // treat as no label key provided, just val
+		lval = strings.TrimSpace(l[0])
+	case 2:
+		lkey = strings.TrimSpace(l[0])
+		lval = strings.TrimSpace(l[1])
+	default:
+		return fmt.Errorf("pod label %q is malformed", label)
+	}
+
+	checkReady := func() (bool, error) {
+		if time.Since(start) > timeout {
+			return false, fmt.Errorf("wait for pod with %q label in %q namespace to be Ready timed out", label, namespace)
+		}
+
+		pods, err := cs.CoreV1().Pods(namespace).List(meta.ListOptions{})
+		if err != nil {
+			klog.Infof("error listing pods in %q namespace, will retry: %v", namespace, err)
+			return false, nil
+		}
+		for _, pod := range pods.Items {
+			for k, v := range pod.ObjectMeta.Labels {
+				if ((lkey == "" && (k == "component" || k == "k8s-app")) || lkey == k) && v == lval {
+					return checkPodStatus(&pod)
+				}
+			}
+		}
+		klog.Infof("pod with %q label in %q namespace was not found, will retry", label, namespace)
+		return false, nil
+	}
+
+	if err := wait.PollImmediate(kconst.APICallRetryInterval, kconst.DefaultControlPlaneTimeout, checkReady); err != nil {
+		return errors.Wrapf(err, "wait pod Ready")
+	}
+
+	return nil
+}
+
+// WaitForPodReadyByName waits for pod with name in a namespace to be in Ready condition.
+// If namespace is not provided, it defaults to "kube-system".
+func WaitForPodReadyByName(cs *kubernetes.Clientset, name, namespace string, timeout time.Duration) error {
+	klog.Infof("waiting %v for pod %q in %q namespace to be Ready ...", timeout, name, namespace)
+	start := time.Now()
+	defer func() {
+		klog.Infof("duration metric: took %v to run WaitForPodReadyByName for pod %q in %q namespace ...", time.Since(start), name, namespace)
+	}()
+
+	if namespace == "" {
+		namespace = "kube-system"
+	}
+
+	checkReady := func() (bool, error) {
+		if time.Since(start) > timeout {
+			return false, fmt.Errorf("wait for pod %q in %q namespace to be Ready timed out", name, namespace)
+		}
+
+		pod, err := cs.CoreV1().Pods(namespace).Get(name, meta.GetOptions{})
+		if err != nil {
+			klog.Infof("error getting pod %q in %q namespace, will retry: %v", name, namespace, err)
+			return false, nil
+		}
+		return checkPodStatus(pod)
+	}
+
+	if err := wait.PollImmediate(kconst.APICallRetryInterval, kconst.DefaultControlPlaneTimeout, checkReady); err != nil {
+		return errors.Wrapf(err, "wait pod Ready")
+	}
+
+	return nil
+}
+
+// checkPodStatus returns whether the pod is Ready and any error that occurred.
+func checkPodStatus(pod *core.Pod) (bool, error) {
+	if pod.Status.Phase != core.PodRunning {
+		klog.Infof("pod %q in %q namespace is not Running, will retry: %+v", pod.Name, pod.Namespace, pod.Status)
+		return false, nil
+	}
+	for _, c := range pod.Status.Conditions {
+		if c.Type == core.PodReady {
+			if c.Status != core.ConditionTrue {
+				klog.Infof("pod %q in %q namespace is not Ready, will retry: %+v", pod.Name, pod.Namespace, c)
+				return false, nil
+			}
+			klog.Infof("pod %q in %q namespace is Ready ...", pod.Name, pod.Namespace)
+			return true, nil
+		}
+	}
+	return false, fmt.Errorf("pod %q in %q namespace does not have %q status: %+v", pod.Name, pod.Namespace, core.PodReady, pod.Status)
+}
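For orientation, here is a minimal sketch (not part of the commit) of how the two new helpers might be called from other code. It assumes a clientset built via client-go from the default kubeconfig, that the package lives at its usual k8s.io/minikube/pkg/minikube/bootstrapper/bsutil/kverify path, and a hypothetical pod name "etcd-minikube":

package main

import (
	"log"
	"time"

	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
	"k8s.io/minikube/pkg/minikube/bootstrapper/bsutil/kverify"
)

func main() {
	// Assumption: a kubeconfig at the default location; any client setup works.
	config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		log.Fatal(err)
	}
	cs, err := kubernetes.NewForConfig(config)
	if err != nil {
		log.Fatal(err)
	}

	// A bare value like "etcd" is tried under both the "component" and
	// "k8s-app" label keys; "component: etcd" would pin the key explicitly.
	if err := kverify.WaitForPodReadyByLabel(cs, "etcd", "kube-system", 2*time.Minute); err != nil {
		log.Fatal(err)
	}

	// Or wait on an exact pod name; an empty namespace defaults to "kube-system".
	// "etcd-minikube" is a hypothetical pod name for illustration.
	if err := kverify.WaitForPodReadyByName(cs, "etcd-minikube", "", 2*time.Minute); err != nil {
		log.Fatal(err)
	}
}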
@@ -36,40 +36,25 @@ import (
 	"k8s.io/minikube/pkg/util/retry"
 )
 
-// WaitForSystemPods verifies essential pods for running kubernetes is running
+// WaitForSystemPods verifies essential pods for running kubernetes are Ready
 func WaitForSystemPods(r cruntime.Manager, bs bootstrapper.Bootstrapper, cfg config.ClusterConfig, cr command.Runner, client *kubernetes.Clientset, start time.Time, timeout time.Duration) error {
-	klog.Info("waiting for kube-system pods to appear ...")
+	klog.Info("waiting for kube-system pods to be Ready ...")
 	pStart := time.Now()
+	defer func() {
+		klog.Infof("duration metric: took %s for waiting for kube-system pods to be Ready ...", time.Since(pStart))
+	}()
 
-	podList := func() error {
-		if time.Since(start) > minLogCheckTime {
-			announceProblems(r, bs, cfg, cr)
-			time.Sleep(kconst.APICallRetryInterval * 5)
-		}
+	if time.Since(start) > minLogCheckTime {
+		announceProblems(r, bs, cfg, cr)
+		time.Sleep(kconst.APICallRetryInterval * 5)
+	}
 
-		// Wait for any system pod, as waiting for apiserver may block until etcd
-		pods, err := client.CoreV1().Pods("kube-system").List(meta.ListOptions{})
-		if err != nil {
-			klog.Warningf("pod list returned error: %v", err)
-			return err
-		}
-
-		klog.Infof("%d kube-system pods found", len(pods.Items))
-		for _, pod := range pods.Items {
-			klog.Infof(podStatusMsg(pod))
-		}
-
-		if len(pods.Items) < 2 {
-			return fmt.Errorf("only %d pod(s) have shown up", len(pods.Items))
-		}
-
-		return nil
-	}
-
-	if err := retry.Local(podList, timeout); err != nil {
-		return fmt.Errorf("apiserver never returned a pod list")
+	for _, label := range SystemPodsList {
+		if err := WaitForPodReadyByLabel(client, label, "kube-system", timeout); err != nil {
+			return err
+		}
 	}
-	klog.Infof("duration metric: took %s to wait for pod list to return data ...", time.Since(pStart))
+
 	return nil
 }
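Worth noting: the SystemPodsList entries iterated above are label values, not pod names. In a kubeadm-provisioned cluster the static control-plane pods carry "component" labels while kube-proxy and CoreDNS are labeled under "k8s-app", which is why WaitForPodReadyByLabel tries both keys when no key is given. A small sketch of the expected mapping (an assumption based on upstream kubeadm defaults, not part of the commit):

package main

import "fmt"

func main() {
	// Label key each SystemPodsList value is expected under in a kubeadm
	// cluster (assumption from upstream defaults; the helper tries both keys).
	expected := map[string]string{
		"kube-dns":                "k8s-app", // coredns pods keep the legacy kube-dns label value
		"etcd":                    "component",
		"kube-apiserver":          "component",
		"kube-controller-manager": "component",
		"kube-proxy":              "k8s-app",
		"kube-scheduler":          "component",
	}
	for val, key := range expected {
		fmt.Printf("wait for a Ready pod labeled %s=%s\n", key, val)
	}
}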
@@ -477,12 +477,8 @@ func (k *Bootstrapper) WaitForNode(cfg config.ClusterConfig, n config.Node, time
 
 	if n.ControlPlane {
 		if cfg.VerifyComponents[kverify.APIServerWaitKey] {
-			if err := kverify.WaitForAPIServerProcess(cr, k, cfg, k.c, start, timeout); err != nil {
-				return errors.Wrap(err, "wait for apiserver proc")
-			}
-
-			if err := kverify.WaitForHealthyAPIServer(cr, k, cfg, k.c, client, start, hostname, port, timeout); err != nil {
-				return errors.Wrap(err, "wait for healthy API server")
+			if err := kverify.WaitForPodReadyByLabel(client, "component: kube-apiserver", "kube-system", timeout); err != nil {
+				return errors.Wrapf(err, "waiting for API server pod to be Ready")
 			}
 		}
 
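The "component: kube-apiserver" argument above exercises the key:val form of the label parameter: the switch in WaitForPodReadyByLabel splits on ":" and trims both parts, so the pod must carry exactly component=kube-apiserver. A tiny standalone sketch of that parsing (illustrative only, mirroring the helper's logic):

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Mirrors the label parsing in WaitForPodReadyByLabel.
	label := "component: kube-apiserver"
	parts := strings.Split(label, ":")
	key := strings.TrimSpace(parts[0]) // "component"
	val := strings.TrimSpace(parts[1]) // "kube-apiserver"
	fmt.Printf("waiting for pod with label %s=%s\n", key, val)
}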
@@ -474,12 +474,22 @@ func validateComponentHealth(ctx context.Context, t *testing.T, profile string)
 
 	for _, i := range cs.Items {
 		for _, l := range i.Labels {
-			t.Logf("%s phase: %s", l, i.Status.Phase)
-			_, ok := found[l]
-			if ok {
+			if _, ok := found[l]; ok { // skip irrelevant (eg, repeating/redundant '"tier": "control-plane"') labels
 				found[l] = true
-				if i.Status.Phase != "Running" {
+				t.Logf("%s phase: %s", l, i.Status.Phase)
+				if i.Status.Phase != api.PodRunning {
 					t.Errorf("%s is not Running: %+v", l, i.Status)
+					continue
 				}
+				for _, c := range i.Status.Conditions {
+					if c.Type == api.PodReady {
+						if c.Status != api.ConditionTrue {
+							t.Errorf("%s is not Ready: %+v", l, i.Status)
+						} else {
+							t.Logf("%s status: %s", l, c.Type)
+						}
+						break
+					}
+				}
 			}
 		}