fix WaitForPod by waiting for component Ready instead of pod Running status

pull/10424/head
Predrag Rogic 2021-02-09 19:51:26 +00:00
parent ee6283ecf1
commit 3c7d2e0351
No known key found for this signature in database
GPG Key ID: F1FF5748C4855229
5 changed files with 175 additions and 36 deletions

View File

@ -60,6 +60,15 @@ var (
"kube-proxy",
"kube-scheduler",
}
// SystemPodsList is a list of essential pods for running Kubernetes, which are waited on until they are Ready
SystemPodsList = []string{
"kube-dns", // coredns
"etcd",
"kube-apiserver",
"kube-controller-manager",
"kube-proxy",
"kube-scheduler",
}
)
// ShouldWait will return true if the config says need to wait

View File

@ -0,0 +1,139 @@
/*
Copyright 2021 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package kverify verifies a running Kubernetes cluster is healthy
package kverify
import (
"fmt"
"strings"
"time"
"github.com/pkg/errors"
core "k8s.io/api/core/v1"
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
kconst "k8s.io/kubernetes/cmd/kubeadm/app/constants"
)
// WaitForPodReadyByLabel waits for pod with label ([key:]val) in a namespace to be in Ready condition.
// If namespace is not provided, it defaults to "kube-system".
// If label key is not provided, it will try with "component" and "k8s-app".
//
// The label is split on ":" and both parts are trimmed of surrounding whitespace;
// a label containing more than one ":" is rejected as malformed.
// Only the first pod carrying a matching label is examined on each poll.
// Errors from listing pods are logged and retried rather than returned.
//
// timeout bounds the whole wait, measured from the moment the function is called.
// It returns nil once the pod is Ready, or an error wrapped with "wait pod Ready"
// if the wait timed out or checkPodStatus reported an error.
func WaitForPodReadyByLabel(cs *kubernetes.Clientset, label, namespace string, timeout time.Duration) error {
	// Apply the default before anything is logged, so that every message names
	// the namespace that is actually queried instead of an empty string.
	if namespace == "" {
		namespace = "kube-system"
	}

	klog.Infof("waiting %v for pod with %q label in %q namespace to be Ready ...", timeout, label, namespace)
	start := time.Now()
	defer func() {
		klog.Infof("duration metric: took %v to run WaitForPodReadyByLabel for pod with %q label in %q namespace ...", time.Since(start), label, namespace)
	}()

	lkey := ""
	lval := ""
	l := strings.Split(label, ":")
	switch len(l) {
	case 1: // treat as no label key provided, just val
		lval = strings.TrimSpace(l[0])
	case 2:
		lkey = strings.TrimSpace(l[0])
		lval = strings.TrimSpace(l[1])
	default:
		return fmt.Errorf("pod label %q is malformed", label)
	}

	checkReady := func() (bool, error) {
		// The deadline is enforced here, which also yields a descriptive error.
		if time.Since(start) > timeout {
			return false, fmt.Errorf("wait for pod with %q label in %q namespace to be Ready timed out", label, namespace)
		}

		pods, err := cs.CoreV1().Pods(namespace).List(meta.ListOptions{})
		if err != nil {
			klog.Infof("error listing pods in %q namespace, will retry: %v", namespace, err)
			return false, nil
		}

		// Index into the slice instead of taking the address of the range variable.
		for i := range pods.Items {
			pod := &pods.Items[i]
			for k, v := range pod.ObjectMeta.Labels {
				if ((lkey == "" && (k == "component" || k == "k8s-app")) || lkey == k) && v == lval {
					return checkPodStatus(pod)
				}
			}
		}

		klog.Infof("pod with %q label in %q namespace was not found, will retry", label, namespace)
		return false, nil
	}

	// The poll itself has no deadline: checkReady stops it once the caller's
	// timeout has elapsed. Passing a fixed poll timeout here would silently cap
	// any longer timeout requested by the caller.
	if err := wait.PollImmediateInfinite(kconst.APICallRetryInterval, checkReady); err != nil {
		return errors.Wrapf(err, "wait pod Ready")
	}

	return nil
}
// WaitForPodReadyByName waits for pod with name in a namespace to be in Ready condition.
// If namespace is not provided, it defaults to "kube-system".
//
// Errors from getting the pod are logged and retried rather than returned.
//
// timeout bounds the whole wait, measured from the moment the function is called.
// It returns nil once the pod is Ready, or an error wrapped with "wait pod Ready"
// if the wait timed out or checkPodStatus reported an error.
func WaitForPodReadyByName(cs *kubernetes.Clientset, name, namespace string, timeout time.Duration) error {
	// Apply the default before anything is logged, so that every message names
	// the namespace that is actually queried instead of an empty string.
	if namespace == "" {
		namespace = "kube-system"
	}

	klog.Infof("waiting %v for pod %q in %q namespace to be Ready ...", timeout, name, namespace)
	start := time.Now()
	defer func() {
		klog.Infof("duration metric: took %v to run WaitForPodReadyByName for pod %q in %q namespace ...", time.Since(start), name, namespace)
	}()

	checkReady := func() (bool, error) {
		// The deadline is enforced here, which also yields a descriptive error.
		if time.Since(start) > timeout {
			return false, fmt.Errorf("wait for pod %q in %q namespace to be Ready timed out", name, namespace)
		}

		pod, err := cs.CoreV1().Pods(namespace).Get(name, meta.GetOptions{})
		if err != nil {
			klog.Infof("error getting pod %q in %q namespace, will retry: %v", name, namespace, err)
			return false, nil
		}

		return checkPodStatus(pod)
	}

	// The poll itself has no deadline: checkReady stops it once the caller's
	// timeout has elapsed. Passing a fixed poll timeout here would silently cap
	// any longer timeout requested by the caller.
	if err := wait.PollImmediateInfinite(kconst.APICallRetryInterval, checkReady); err != nil {
		return errors.Wrapf(err, "wait pod Ready")
	}

	return nil
}
// checkPodStatus reports whether pod is Ready.
//
// It returns (false, nil) when the pod is not in the Running phase or when its
// Ready condition is present but not true, and (true, nil) when the Ready
// condition is true. A Running pod with no Ready condition at all yields an error.
func checkPodStatus(pod *core.Pod) (bool, error) {
	name, ns := pod.Name, pod.Namespace

	if pod.Status.Phase != core.PodRunning {
		klog.Infof("pod %q in %q namespace is not Running, will retry: %+v", name, ns, pod.Status)
		return false, nil
	}

	for _, cond := range pod.Status.Conditions {
		// Only the Ready condition matters; skip every other condition type.
		if cond.Type != core.PodReady {
			continue
		}

		if cond.Status == core.ConditionTrue {
			klog.Infof("pod %q in %q namespace is Ready ...", name, ns)
			return true, nil
		}

		klog.Infof("pod %q in %q namespace is not Ready, will retry: %+v", name, ns, cond)
		return false, nil
	}

	return false, fmt.Errorf("pod %q in %q namespace does not have %q status: %+v", name, ns, core.PodReady, pod.Status)
}

View File

@ -36,40 +36,25 @@ import (
"k8s.io/minikube/pkg/util/retry"
)
// WaitForSystemPods verifies essential pods for running kurnetes is running
// WaitForSystemPods verifies that the essential pods for running Kubernetes are Ready
func WaitForSystemPods(r cruntime.Manager, bs bootstrapper.Bootstrapper, cfg config.ClusterConfig, cr command.Runner, client *kubernetes.Clientset, start time.Time, timeout time.Duration) error {
klog.Info("waiting for kube-system pods to appear ...")
klog.Info("waiting for kube-system pods to be Ready ...")
pStart := time.Now()
defer func() {
klog.Infof("duration metric: took %s for waiting for kube-system pods to be Ready ...", time.Since(pStart))
}()
podList := func() error {
if time.Since(start) > minLogCheckTime {
announceProblems(r, bs, cfg, cr)
time.Sleep(kconst.APICallRetryInterval * 5)
}
if time.Since(start) > minLogCheckTime {
announceProblems(r, bs, cfg, cr)
time.Sleep(kconst.APICallRetryInterval * 5)
}
// Wait for any system pod, as waiting for apiserver may block until etcd
pods, err := client.CoreV1().Pods("kube-system").List(meta.ListOptions{})
if err != nil {
klog.Warningf("pod list returned error: %v", err)
for _, label := range SystemPodsList {
if err := WaitForPodReadyByLabel(client, label, "kube-system", timeout); err != nil {
return err
}
klog.Infof("%d kube-system pods found", len(pods.Items))
for _, pod := range pods.Items {
klog.Infof(podStatusMsg(pod))
}
if len(pods.Items) < 2 {
return fmt.Errorf("only %d pod(s) have shown up", len(pods.Items))
}
return nil
}
if err := retry.Local(podList, timeout); err != nil {
return fmt.Errorf("apiserver never returned a pod list")
}
klog.Infof("duration metric: took %s to wait for pod list to return data ...", time.Since(pStart))
return nil
}

View File

@ -477,12 +477,8 @@ func (k *Bootstrapper) WaitForNode(cfg config.ClusterConfig, n config.Node, time
if n.ControlPlane {
if cfg.VerifyComponents[kverify.APIServerWaitKey] {
if err := kverify.WaitForAPIServerProcess(cr, k, cfg, k.c, start, timeout); err != nil {
return errors.Wrap(err, "wait for apiserver proc")
}
if err := kverify.WaitForHealthyAPIServer(cr, k, cfg, k.c, client, start, hostname, port, timeout); err != nil {
return errors.Wrap(err, "wait for healthy API server")
if err := kverify.WaitForPodReadyByLabel(client, "component: kube-apiserver", "kube-system", timeout); err != nil {
return errors.Wrapf(err, "waiting for API server pod to be Ready")
}
}

View File

@ -474,12 +474,22 @@ func validateComponentHealth(ctx context.Context, t *testing.T, profile string)
for _, i := range cs.Items {
for _, l := range i.Labels {
t.Logf("%s phase: %s", l, i.Status.Phase)
_, ok := found[l]
if ok {
if _, ok := found[l]; ok { // skip irrelevant (eg, repeating/redundant '"tier": "control-plane"') labels
found[l] = true
if i.Status.Phase != "Running" {
t.Logf("%s phase: %s", l, i.Status.Phase)
if i.Status.Phase != api.PodRunning {
t.Errorf("%s is not Running: %+v", l, i.Status)
continue
}
for _, c := range i.Status.Conditions {
if c.Type == api.PodReady {
if c.Status != api.ConditionTrue {
t.Errorf("%s is not Ready: %+v", l, i.Status)
} else {
t.Logf("%s status: %s", l, c.Type)
}
break
}
}
}
}