replace sleep with retry.Expo()

pull/10424/head
Predrag Rogic 2021-02-16 22:27:26 +00:00
parent 90cd9c3a60
commit b8052fe33d
No known key found for this signature in database
GPG Key ID: F1FF5748C4855229
4 changed files with 40 additions and 24 deletions

View File

@ -37,8 +37,8 @@ const (
NodeReadyKey = "node_ready"
// KubeletKey is the name used in the flags for waiting for the kubelet status to be ready
KubeletKey = "kubelet"
// OperationalKey is the name used for waiting for pods in CorePodsList to be Ready
OperationalKey = "operational"
// ExtraKey is the name used for extra waiting for pods in CorePodsList to be Ready
ExtraKey = "extra"
)
// vars related to the --wait flag
@ -46,9 +46,9 @@ var (
// DefaultComponents is a map of the default components to wait for
DefaultComponents = map[string]bool{APIServerWaitKey: true, SystemPodsWaitKey: true}
// NoComponents is a map of components to wait for if 'none' or 'false' is specified
NoComponents = map[string]bool{APIServerWaitKey: false, SystemPodsWaitKey: false, DefaultSAWaitKey: false, AppsRunningKey: false, NodeReadyKey: false, KubeletKey: false, OperationalKey: false}
NoComponents = map[string]bool{APIServerWaitKey: false, SystemPodsWaitKey: false, DefaultSAWaitKey: false, AppsRunningKey: false, NodeReadyKey: false, KubeletKey: false, ExtraKey: false}
// AllComponents is map for waiting for all components.
AllComponents = map[string]bool{APIServerWaitKey: true, SystemPodsWaitKey: true, DefaultSAWaitKey: true, AppsRunningKey: true, NodeReadyKey: true, KubeletKey: true, OperationalKey: true}
AllComponents = map[string]bool{APIServerWaitKey: true, SystemPodsWaitKey: true, DefaultSAWaitKey: true, AppsRunningKey: true, NodeReadyKey: true, KubeletKey: true, ExtraKey: true}
// DefaultWaitList is list of all default components to wait for. only names to be used for start flags.
DefaultWaitList = []string{APIServerWaitKey, SystemPodsWaitKey}
// AllComponentsList is a list of all valid component keys to wait for. only names to be used for start flags.
@ -62,7 +62,7 @@ var (
"kube-proxy",
"kube-scheduler",
}
// CorePodsList is a list of essential pods for running kurnetes to wait for them to be operational ("Ready")
// CorePodsList is a list of essential pods for running Kubernetes, for which we additionally wait to be Ready
CorePodsList = []string{
"kube-dns", // coredns
"etcd",

View File

@ -31,12 +31,12 @@ import (
kconst "k8s.io/kubernetes/cmd/kubeadm/app/constants"
)
// WaitOperational calls WaitForPodReadyByLabel for each pod in labels list and returns any errors occurred.
func WaitOperational(cs *kubernetes.Clientset, labels []string, timeout time.Duration) error {
klog.Info("waiting for kube-system core pods %s to be Ready ...", labels)
pStart := time.Now()
// WaitExtra calls WaitForPodReadyByLabel for each pod in labels list and returns any errors that occurred.
func WaitExtra(cs *kubernetes.Clientset, labels []string, timeout time.Duration) error {
klog.Infof("extra waiting for kube-system core pods %s to be Ready ...", labels)
start := time.Now()
defer func() {
klog.Infof("duration metric: took %s for waiting for kube-system core pods to be Ready ...", time.Since(pStart))
klog.Infof("duration metric: took %s for extra waiting for kube-system core pods to be Ready ...", time.Since(start))
}()
var errs []string
@ -84,7 +84,6 @@ func WaitForPodReadyByLabel(cs *kubernetes.Clientset, label, namespace string, t
if time.Since(start) > timeout {
return false, fmt.Errorf("wait for pod with %q label in %q namespace to be Ready timed out", label, namespace)
}
pods, err := cs.CoreV1().Pods(namespace).List(meta.ListOptions{})
if err != nil {
klog.Infof("error listing pods in %q namespace, will retry: %v", namespace, err)
@ -110,7 +109,6 @@ func WaitForPodReadyByLabel(cs *kubernetes.Clientset, label, namespace string, t
klog.Infof("pod with %q label in %q namespace was not found, will retry", label, namespace)
return false, nil
}
if err := wait.PollImmediate(kconst.APICallRetryInterval, kconst.DefaultControlPlaneTimeout, checkReady); err != nil {
return errors.Wrapf(err, "wait pod Ready")
}
@ -136,7 +134,6 @@ func WaitForPodReadyByName(cs *kubernetes.Clientset, name, namespace string, tim
if time.Since(start) > timeout {
return false, fmt.Errorf("wait for pod %q in %q namespace to be Ready timed out", name, namespace)
}
pod, err := cs.CoreV1().Pods(namespace).Get(name, meta.GetOptions{})
if err != nil {
klog.Infof("error getting pod %q in %q namespace, will retry: %v", name, namespace, err)
@ -154,7 +151,6 @@ func WaitForPodReadyByName(cs *kubernetes.Clientset, name, namespace string, tim
}
return false, nil
}
if err := wait.PollImmediate(kconst.APICallRetryInterval, kconst.DefaultControlPlaneTimeout, checkReady); err != nil {
return errors.Wrapf(err, "wait pod Ready")
}

View File

@ -36,6 +36,7 @@ import (
"github.com/docker/machine/libmachine"
"github.com/docker/machine/libmachine/state"
"github.com/pkg/errors"
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
@ -470,9 +471,9 @@ func (k *Bootstrapper) WaitForNode(cfg config.ClusterConfig, n config.Node, time
return nil
}
if cfg.VerifyComponents[kverify.OperationalKey] {
if err := kverify.WaitOperational(client, kverify.CorePodsList, timeout); err != nil {
return errors.Wrap(err, "waiting for operational status")
if cfg.VerifyComponents[kverify.ExtraKey] {
if err := kverify.WaitExtra(client, kverify.CorePodsList, timeout); err != nil {
return errors.Wrap(err, "extra waiting")
}
}
@ -664,14 +665,32 @@ func (k *Bootstrapper) restartControlPlane(cfg config.ClusterConfig) error {
}
}
if cfg.VerifyComponents[kverify.OperationalKey] {
if cfg.VerifyComponents[kverify.ExtraKey] {
// after kubelet is restarted (with 'kubeadm init phase kubelet-start' above),
// it appears to be immediately Ready as are all kube-system pods
// then (after ~10sec) it realises it has some changes to apply, implying also pods restarts
// so we wait for kubelet to initialise itself...
time.Sleep(10 * time.Second)
if err := kverify.WaitOperational(client, kverify.CorePodsList, kconst.DefaultControlPlaneTimeout); err != nil {
return errors.Wrap(err, "operational status")
// it appears to be immediately Ready, as do all kube-system pods,
// then (after ~10sec) it realises it has some changes to apply, which also implies pod restarts,
// and by that time we would have exited completely, so we wait until kubelet begins restarting pods
klog.Info("waiting for restarted kubelet to initialise ...")
start := time.Now()
wait := func() error {
pods, err := client.CoreV1().Pods("kube-system").List(meta.ListOptions{})
if err != nil {
return err
}
for _, pod := range pods.Items {
if pod.Labels["tier"] == "control-plane" {
if ready, _ := kverify.IsPodReady(&pod); !ready {
return nil
}
}
}
return fmt.Errorf("kubelet not initialised")
}
_ = retry.Expo(wait, 250*time.Millisecond, 1*time.Minute)
klog.Infof("kubelet initialised")
klog.Infof("duration metric: took %s waiting for restarted kubelet to initialise ...", time.Since(start))
if err := kverify.WaitExtra(client, kverify.CorePodsList, kconst.DefaultControlPlaneTimeout); err != nil {
return errors.Wrap(err, "extra")
}
}

View File

@ -448,6 +448,7 @@ func imageID(image string) string {
}
// validateComponentHealth asserts that all Kubernetes components are healthy
// note: it expects all components to be Ready, so it only makes sense to run it right after tests that include the '--wait=all' start flag (i.e., with extra wait)
func validateComponentHealth(ctx context.Context, t *testing.T, profile string) {
defer PostMortemLogs(t, profile)