From e9220557140fe957ad74c990a474d4b9c3db86a1 Mon Sep 17 00:00:00 2001 From: Steven Powell Date: Thu, 27 Jan 2022 14:02:53 -0800 Subject: [PATCH] revert "Delete and init kubeadm on subsequent starts" --- pkg/minikube/bootstrapper/kubeadm/kubeadm.go | 217 ++++++++++++++++--- pkg/minikube/constants/constants.go | 3 - test/integration/pause_test.go | 2 +- translations/de.json | 1 + translations/es.json | 1 + translations/ko.json | 1 + translations/pl.json | 1 + translations/ru.json | 1 + translations/strings.txt | 1 + translations/zh-CN.json | 1 + 10 files changed, 197 insertions(+), 32 deletions(-) diff --git a/pkg/minikube/bootstrapper/kubeadm/kubeadm.go b/pkg/minikube/bootstrapper/kubeadm/kubeadm.go index c7f57b9ea5..ee9a77fe1b 100644 --- a/pkg/minikube/bootstrapper/kubeadm/kubeadm.go +++ b/pkg/minikube/bootstrapper/kubeadm/kubeadm.go @@ -37,6 +37,7 @@ import ( "github.com/docker/machine/libmachine/state" "github.com/pkg/errors" core "k8s.io/api/core/v1" + meta "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/kubernetes" "k8s.io/klog/v2" @@ -61,6 +62,7 @@ import ( "k8s.io/minikube/pkg/minikube/sysinit" "k8s.io/minikube/pkg/minikube/vmpath" "k8s.io/minikube/pkg/util" + "k8s.io/minikube/pkg/util/retry" "k8s.io/minikube/pkg/version" kconst "k8s.io/minikube/third_party/kubeadm/app/constants" ) @@ -396,10 +398,13 @@ func (k *Bootstrapper) StartCluster(cfg config.ClusterConfig) error { } if err := bsutil.ExistingConfig(k.c); err == nil { - if reconfigure := k.needsReconfigure(cfg); !reconfigure { + klog.Infof("found existing configuration files, will attempt cluster restart") + rerr := k.restartControlPlane(cfg) + if rerr == nil { return nil } + out.ErrT(style.Embarrassed, "Unable to restart cluster, will reset it: {{.error}}", out.V{"error": rerr}) if err := k.DeleteCluster(cfg.KubernetesConfig); err != nil { klog.Warningf("delete failed: %v", err) } @@ -558,21 +563,71 @@ func (k *Bootstrapper) ensureServiceStarted(svc string) 
error { } // needsReconfigure returns whether or not the cluster needs to be reconfigured -func (k *Bootstrapper) needsReconfigure(cfg config.ClusterConfig) bool { +func (k *Bootstrapper) needsReconfigure(conf string, hostname string, port int, client *kubernetes.Clientset, version string) bool { + if rr, err := k.c.RunCmd(exec.Command("sudo", "diff", "-u", conf, conf+".new")); err != nil { + klog.Infof("needs reconfigure: configs differ:\n%s", rr.Output()) + return true + } + // cruntime.Enable() may restart kube-apiserver but does not wait for it to return back + apiStatusTimeout := 3000 * time.Millisecond + st, err := kverify.WaitForAPIServerStatus(k.c, apiStatusTimeout, hostname, port) + if err != nil { + klog.Infof("needs reconfigure: apiserver error: %v", err) + return true + } + if st != state.Running { + klog.Infof("needs reconfigure: apiserver in state %s", st) + return true + } + + if err := kverify.ExpectAppsRunning(client, kverify.AppsRunningList); err != nil { + klog.Infof("needs reconfigure: %v", err) + return true + } + + if err := kverify.APIServerVersionMatch(client, version); err != nil { + klog.Infof("needs reconfigure: %v", err) + return true + } + + // DANGER: This log message is hard-coded in an integration test! 
+ klog.Infof("The running cluster does not require reconfiguration: %s", hostname) + return false +} + +// restartControlPlane restarts the Kubernetes cluster configured by kubeadm +func (k *Bootstrapper) restartControlPlane(cfg config.ClusterConfig) error { + klog.Infof("restartCluster start") + + start := time.Now() + defer func() { + klog.Infof("restartCluster took %s", time.Since(start)) + }() + + k8sVersion, err := util.ParseKubernetesVersion(cfg.KubernetesConfig.KubernetesVersion) + if err != nil { + return errors.Wrap(err, "parsing Kubernetes version") + } + + phase := "alpha" + controlPlane := "controlplane" + if k8sVersion.GTE(semver.MustParse("1.13.0")) { + phase = "init" + controlPlane = "control-plane" + } + if err := k.createCompatSymlinks(); err != nil { klog.Errorf("failed to create compat symlinks: %v", err) } cp, err := config.PrimaryControlPlane(&cfg) if err != nil { - klog.Warningf("needs reconfigure: primary control plane error: %v", err) - return true + return errors.Wrap(err, "primary control plane") } hostname, _, port, err := driver.ControlPlaneEndpoint(&cfg, &cp, cfg.Driver) if err != nil { - klog.Warningf("needs reconfigure: control plane error: %v", err) - return true + return errors.Wrap(err, "control plane") } // Save the costly tax of reinstalling Kubernetes if the only issue is a missing kube context @@ -583,40 +638,125 @@ func (k *Bootstrapper) needsReconfigure(cfg config.ClusterConfig) bool { client, err := k.client(hostname, port) if err != nil { - klog.Warningf("needs reconfigure: getting k8s client error: %v", err) - return true + return errors.Wrap(err, "getting k8s client") } + // If the cluster is running, check if we have any work to do.
conf := bsutil.KubeadmYamlPath - if rr, err := k.c.RunCmd(exec.Command("sudo", "diff", "-u", conf, conf+".new")); err != nil { - klog.Infof("needs reconfigure: configs differ:\n%s", rr.Output()) - return true + if !k.needsReconfigure(conf, hostname, port, client, cfg.KubernetesConfig.KubernetesVersion) { + klog.Infof("Taking a shortcut, as the cluster seems to be properly configured") + return nil } - // cruntime.Enable() may restart kube-apiserver but does not wait for it to return back - apiStatusTimeout := 3 * time.Second - st, err := kverify.WaitForAPIServerStatus(k.c, apiStatusTimeout, hostname, port) + + if err := k.stopKubeSystem(cfg); err != nil { + klog.Warningf("Failed to stop kube-system containers: port conflicts may arise: %v", err) + } + + if err := sysinit.New(k.c).Stop("kubelet"); err != nil { + klog.Warningf("Failed to stop kubelet, this might cause upgrade errors: %v", err) + } + + if err := k.clearStaleConfigs(cfg); err != nil { + return errors.Wrap(err, "clearing stale configs") + } + + if _, err := k.c.RunCmd(exec.Command("sudo", "cp", conf+".new", conf)); err != nil { + return errors.Wrap(err, "cp") + } + + baseCmd := fmt.Sprintf("%s %s", bsutil.InvokeKubeadm(cfg.KubernetesConfig.KubernetesVersion), phase) + cmds := []string{ + fmt.Sprintf("%s phase certs all --config %s", baseCmd, conf), + fmt.Sprintf("%s phase kubeconfig all --config %s", baseCmd, conf), + fmt.Sprintf("%s phase kubelet-start --config %s", baseCmd, conf), + fmt.Sprintf("%s phase %s all --config %s", baseCmd, controlPlane, conf), + fmt.Sprintf("%s phase etcd local --config %s", baseCmd, conf), + } + + klog.Infof("reconfiguring cluster from %s", conf) + // Run commands one at a time so that it is easier to root cause failures. 
+ for _, c := range cmds { + if _, err := k.c.RunCmd(exec.Command("/bin/bash", "-c", c)); err != nil { + klog.Errorf("%s failed - will try once more: %v", c, err) + + if _, err := k.c.RunCmd(exec.Command("/bin/bash", "-c", c)); err != nil { + return errors.Wrap(err, "run") + } + } + } + + cr, err := cruntime.New(cruntime.Config{Type: cfg.KubernetesConfig.ContainerRuntime, Runner: k.c}) if err != nil { - klog.Warningf("needs reconfigure: apiserver error: %v", err) - return true - } - if st != state.Running { - klog.Warningf("needs reconfigure: apiserver in state %s", st.String()) - return true + return errors.Wrap(err, "runtime") } - if err := kverify.ExpectAppsRunning(client, kverify.AppsRunningList); err != nil { - klog.Warningf("needs reconfigure: %v", err) - return true + // We must ensure that the apiserver is healthy before proceeding + if err := kverify.WaitForAPIServerProcess(cr, k, cfg, k.c, time.Now(), kconst.DefaultControlPlaneTimeout); err != nil { + return errors.Wrap(err, "apiserver healthz") } - if err := kverify.APIServerVersionMatch(client, cfg.KubernetesConfig.KubernetesVersion); err != nil { - klog.Warningf("needs reconfigure: %v", err) - return true + if err := kverify.WaitForHealthyAPIServer(cr, k, cfg, k.c, client, time.Now(), hostname, port, kconst.DefaultControlPlaneTimeout); err != nil { + return errors.Wrap(err, "apiserver health") } - klog.Infof("%s: %s", constants.ReconfigurationNotRequired, hostname) - return false + // because reboots clear /etc/cni + if err := k.applyCNI(cfg); err != nil { + return errors.Wrap(err, "apply cni") + } + + if err := kverify.WaitForSystemPods(cr, k, cfg, k.c, client, time.Now(), kconst.DefaultControlPlaneTimeout); err != nil { + return errors.Wrap(err, "system pods") + } + + if err := kverify.NodePressure(client); err != nil { + adviseNodePressure(err, cfg.Name, cfg.Driver) + } + + // This can fail during upgrades if the old pods have not shut down yet + addonPhase := func() error { + _, err := 
k.c.RunCmd(exec.Command("/bin/bash", "-c", fmt.Sprintf("%s phase addon all --config %s", baseCmd, conf))) + return err + } + if err = retry.Expo(addonPhase, 100*time.Microsecond, 30*time.Second); err != nil { + klog.Warningf("addon install failed, wil retry: %v", err) + return errors.Wrap(err, "addons") + } + + // must be called after applyCNI and `kubeadm phase addon all` (ie, coredns redeploy) + if cfg.VerifyComponents[kverify.ExtraKey] { + // after kubelet is restarted (with 'kubeadm init phase kubelet-start' above), + // it appears as to be immediately Ready as well as all kube-system pods (last observed state), + // then (after ~10sec) it realises it has some changes to apply, implying also pods restarts, + // and by that time we would exit completely, so we wait until kubelet begins restarting pods + klog.Info("waiting for restarted kubelet to initialise ...") + start := time.Now() + wait := func() error { + pods, err := client.CoreV1().Pods(meta.NamespaceSystem).List(context.Background(), meta.ListOptions{LabelSelector: "tier=control-plane"}) + if err != nil { + return err + } + for _, pod := range pods.Items { + if ready, _ := kverify.IsPodReady(&pod); !ready { + return nil + } + } + return fmt.Errorf("kubelet not initialised") + } + _ = retry.Expo(wait, 250*time.Millisecond, 1*time.Minute) + klog.Infof("kubelet initialised") + klog.Infof("duration metric: took %s waiting for restarted kubelet to initialise ...", time.Since(start)) + + if err := kverify.WaitExtra(client, kverify.CorePodsLabels, kconst.DefaultControlPlaneTimeout); err != nil { + return errors.Wrap(err, "extra") + } + } + + if err := bsutil.AdjustResourceLimits(k.c); err != nil { + klog.Warningf("unable to adjust resource limits: %v", err) + } + + return nil } // JoinCluster adds new node to an existing cluster. 
@@ -909,6 +1049,27 @@ func (k *Bootstrapper) elevateKubeSystemPrivileges(cfg config.ClusterConfig) err return nil } +// stopKubeSystem stops all the containers in the kube-system namespace to prevent #8740 when doing a hot upgrade +func (k *Bootstrapper) stopKubeSystem(cfg config.ClusterConfig) error { + klog.Info("stopping kube-system containers ...") + cr, err := cruntime.New(cruntime.Config{Type: cfg.KubernetesConfig.ContainerRuntime, Runner: k.c}) + if err != nil { + return errors.Wrap(err, "new cruntime") + } + + ids, err := cr.ListContainers(cruntime.ListContainersOptions{Namespaces: []string{"kube-system"}}) + if err != nil { + return errors.Wrap(err, "list") + } + + if len(ids) > 0 { + if err := cr.StopContainers(ids); err != nil { + return errors.Wrap(err, "stop") + } + } + return nil +} + // adviseNodePressure will advise the user what to do with difference pressure errors based on their environment func adviseNodePressure(err error, name string, drv string) { if diskErr, ok := err.(*kverify.ErrDiskPressure); ok { diff --git a/pkg/minikube/constants/constants.go b/pkg/minikube/constants/constants.go index 1300e2aa9b..7fd53a87e0 100644 --- a/pkg/minikube/constants/constants.go +++ b/pkg/minikube/constants/constants.go @@ -141,9 +141,6 @@ const ( MountTypeFlag = "type" // MountUIDFlag is the flag used to set the mount UID MountUIDFlag = "uid" - - // ReconfigurationNotRequired is the message logged when reconfiguration is not required - ReconfigurationNotRequired = "The running cluster does not require reconfiguration" ) var ( diff --git a/test/integration/pause_test.go b/test/integration/pause_test.go index 4909fd9aa4..93cccbdced 100644 --- a/test/integration/pause_test.go +++ b/test/integration/pause_test.go @@ -97,7 +97,7 @@ func validateStartNoReconfigure(ctx context.Context, t *testing.T, profile strin } if !NoneDriver() { - softLog := constants.ReconfigurationNotRequired + softLog := "The running cluster does not require reconfiguration" if
!strings.Contains(rr.Output(), softLog) { t.Errorf("expected the second start log output to include %q but got: %s", softLog, rr.Output()) } diff --git a/translations/de.json b/translations/de.json index 31c9eeb96a..7fd39c63f6 100644 --- a/translations/de.json +++ b/translations/de.json @@ -766,6 +766,7 @@ "Unable to pull images, which may be OK: {{.error}}": "Bilder können nicht abgerufen werden, was möglicherweise kein Problem darstellt: {{.error}}", "Unable to push cached images: {{.error}}": "Kann gecachete Image nicht veröffentlichen (push): {{.error}}", "Unable to remove machine directory": "Kann Maschinen Verzeichnis nicht entfernen", + "Unable to restart cluster, will reset it: {{.error}}": "", "Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "Kann existierenden Kubernetes v{{.old}} Cluster nicht auf Version v{{.new}} downgraden", "Unable to stop VM": "Kann VM nicht stoppen", "Unable to update {{.driver}} driver: {{.error}}": "Kann Treiber {{.driver}} nicht aktualisieren: {{.error}}", diff --git a/translations/es.json b/translations/es.json index 982d6c0384..ff76dd3f3c 100644 --- a/translations/es.json +++ b/translations/es.json @@ -774,6 +774,7 @@ "Unable to pull images, which may be OK: {{.error}}": "No se ha podido recuperar imágenes, que podrían estar en buen estado: {{.error}}", "Unable to push cached images: {{.error}}": "", "Unable to remove machine directory": "", + "Unable to restart cluster, will reset it: {{.error}}": "", "Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "", "Unable to stop VM": "", "Unable to update {{.driver}} driver: {{.error}}": "", diff --git a/translations/ko.json b/translations/ko.json index e49b7d5577..9b3b442003 100644 --- a/translations/ko.json +++ b/translations/ko.json @@ -774,6 +774,7 @@ "Unable to push cached images: {{.error}}": "", "Unable to remove machine directory": "", "Unable to remove machine directory: %v": "머신 디렉토리를 제거할 수 없습니다: %v", + "Unable 
to restart cluster, will reset it: {{.error}}": "", "Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "", "Unable to start VM. Please investigate and run 'minikube delete' if possible": "가상 머신을 시작할 수 없습니다. 확인 후 가능하면 'minikube delete' 를 실행하세요", "Unable to stop VM": "가상 머신을 중지할 수 없습니다", diff --git a/translations/pl.json b/translations/pl.json index c953b643e3..190080827d 100644 --- a/translations/pl.json +++ b/translations/pl.json @@ -782,6 +782,7 @@ "Unable to pick a default driver. Here is what was considered, in preference order:": "", "Unable to push cached images: {{.error}}": "", "Unable to remove machine directory": "", + "Unable to restart cluster, will reset it: {{.error}}": "", "Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "", "Unable to start VM": "Nie można uruchomić maszyny wirtualnej", "Unable to stop VM": "Nie można zatrzymać maszyny wirtualnej", diff --git a/translations/ru.json b/translations/ru.json index ffa3d3329f..a140db3819 100644 --- a/translations/ru.json +++ b/translations/ru.json @@ -713,6 +713,7 @@ "Unable to pick a default driver. Here is what was considered, in preference order:": "", "Unable to push cached images: {{.error}}": "", "Unable to remove machine directory": "", + "Unable to restart cluster, will reset it: {{.error}}": "", "Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "", "Unable to stop VM": "", "Unable to update {{.driver}} driver: {{.error}}": "", diff --git a/translations/strings.txt b/translations/strings.txt index fa0a3ba36d..086c0a510b 100644 --- a/translations/strings.txt +++ b/translations/strings.txt @@ -713,6 +713,7 @@ "Unable to pick a default driver. 
Here is what was considered, in preference order:": "", "Unable to push cached images: {{.error}}": "", "Unable to remove machine directory": "", + "Unable to restart cluster, will reset it: {{.error}}": "", "Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "", "Unable to stop VM": "", "Unable to update {{.driver}} driver: {{.error}}": "", diff --git a/translations/zh-CN.json b/translations/zh-CN.json index c339981ab4..95ada5356b 100644 --- a/translations/zh-CN.json +++ b/translations/zh-CN.json @@ -881,6 +881,7 @@ "Unable to pull images, which may be OK: {{.error}}": "无法拉取镜像,有可能是正常状况:{{.error}}", "Unable to push cached images: {{.error}}": "", "Unable to remove machine directory": "", + "Unable to restart cluster, will reset it: {{.error}}": "", "Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "", "Unable to start VM. Please investigate and run 'minikube delete' if possible": "无法启动虚拟机。可能的话请检查后执行 'minikube delete'", "Unable to stop VM": "无法停止虚拟机",