Merge pull request #7752 from medyagh/node_pressure_redo

Check node pressure & new option "node_ready" for --wait flag
pull/7769/head
Medya Ghazizadeh 2020-04-17 19:45:40 -07:00 committed by GitHub
commit fe4fdeec6c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 328 additions and 18 deletions

View File

@ -333,7 +333,9 @@ func Start(wg *sync.WaitGroup, cc *config.ClusterConfig, toEnable map[string]boo
var awg sync.WaitGroup
out.T(out.AddonEnable, "Enabling addons: {{.addons}}", out.V{"addons": strings.Join(toEnableList, ", ")})
defer func() { // making it show after verifications( not perfect till #7613 is closed)
out.T(out.AddonEnable, "Enabled addons: {{.addons}}", out.V{"addons": strings.Join(toEnableList, ", ")})
}()
for _, a := range toEnableList {
awg.Add(1)
go func(name string) {

View File

@ -32,7 +32,9 @@ const (
// DefaultSAWaitKey is the name used in the flags for default service account
DefaultSAWaitKey = "default_sa"
// AppsRunning is the name used in the flags for waiting for k8s-apps to be running
AppsRunning = "apps_running"
AppsRunningKey = "apps_running"
// NodeReadyKey is the name used in the flags for waiting for the node status to be ready
NodeReadyKey = "node_ready"
)
// vars related to the --wait flag
@ -40,13 +42,13 @@ var (
// DefaultComponents is a map of the default components to wait for
DefaultComponents = map[string]bool{APIServerWaitKey: true, SystemPodsWaitKey: true}
// NoComponents is a map of components to wait for if 'none' or 'false' is specified
NoComponents = map[string]bool{APIServerWaitKey: false, SystemPodsWaitKey: false, DefaultSAWaitKey: false, AppsRunning: false}
NoComponents = map[string]bool{APIServerWaitKey: false, SystemPodsWaitKey: false, DefaultSAWaitKey: false, AppsRunningKey: false, NodeReadyKey: false}
// AllComponents is map for waiting for all components.
AllComponents = map[string]bool{APIServerWaitKey: true, SystemPodsWaitKey: true, DefaultSAWaitKey: true, AppsRunning: true}
AllComponents = map[string]bool{APIServerWaitKey: true, SystemPodsWaitKey: true, DefaultSAWaitKey: true, AppsRunningKey: true}
// DefaultWaitList is list of all default components to wait for. only names to be used for start flags.
DefaultWaitList = []string{APIServerWaitKey, SystemPodsWaitKey}
// AllComponentsList is the list of all valid component keys to wait for. only names to be used for start flags.
AllComponentsList = []string{APIServerWaitKey, SystemPodsWaitKey, DefaultSAWaitKey, AppsRunning}
AllComponentsList = []string{APIServerWaitKey, SystemPodsWaitKey, DefaultSAWaitKey, AppsRunningKey, NodeReadyKey}
// AppsRunningList running list are valid k8s-app components to wait for them to be running
AppsRunningList = []string{
"kube-dns", // coredns

View File

@ -0,0 +1,142 @@
/*
Copyright 2020 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package kverify verifies a running kubernetes cluster is healthy
package kverify
import (
"fmt"
"time"
"github.com/golang/glog"
"github.com/pkg/errors"
v1 "k8s.io/api/core/v1"
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
)
// NodeCondition is a snapshot of a single condition reported on a node.
// The condition may be favorable or unfavorable; the predicate methods on
// this type classify it.
type NodeCondition struct {
	// Type is the kind of condition being reported.
	Type v1.NodeConditionType
	// Status is the reported status of the condition.
	Status v1.ConditionStatus
	// Reason and Message carry the text reported alongside the condition.
	Reason, Message string
}
// DiskPressure reports whether this condition is an active disk pressure
// condition: its type is NodeDiskPressure and its status is true.
func (pc *NodeCondition) DiskPressure() bool {
	if pc.Type != v1.NodeDiskPressure {
		return false
	}
	return pc.Status == v1.ConditionTrue
}
// MemoryPressure reports whether this condition is an active memory pressure
// condition: its type is NodeMemoryPressure and its status is true.
func (pc *NodeCondition) MemoryPressure() bool {
	if pc.Type != v1.NodeMemoryPressure {
		return false
	}
	return pc.Status == v1.ConditionTrue
}
// PIDPressure reports whether this condition is an active PID pressure
// condition: its type is NodePIDPressure and its status is true.
func (pc *NodeCondition) PIDPressure() bool {
	if pc.Type != v1.NodePIDPressure {
		return false
	}
	return pc.Status == v1.ConditionTrue
}
// NetworkUnavailable reports whether this condition says the node network is
// unavailable: its type is NodeNetworkUnavailable and its status is true.
func (pc *NodeCondition) NetworkUnavailable() bool {
	if pc.Type != v1.NodeNetworkUnavailable {
		return false
	}
	return pc.Status == v1.ConditionTrue
}
// errTextFormat is the message format shared by the Error methods of the node
// condition error types in this file; its verbs are filled with the
// condition's Type, Reason and Message, in that order.
const errTextFormat = "node has unwanted condition %q : Reason %q Message: %q"
// ErrMemoryPressure is the error returned by NodePressure when a node reports
// a memory pressure condition. It embeds the offending NodeCondition.
type ErrMemoryPressure struct {
	NodeCondition
}

// Error implements the error interface by formatting the embedded condition's
// type, reason and message.
func (e *ErrMemoryPressure) Error() string {
	cond := e.NodeCondition
	return fmt.Sprintf(errTextFormat, cond.Type, cond.Reason, cond.Message)
}
// ErrDiskPressure is the error returned by NodePressure when a node reports a
// disk pressure condition. It embeds the offending NodeCondition.
type ErrDiskPressure struct {
	NodeCondition
}

// Error implements the error interface by formatting the embedded condition's
// type, reason and message.
func (e *ErrDiskPressure) Error() string {
	cond := e.NodeCondition
	return fmt.Sprintf(errTextFormat, cond.Type, cond.Reason, cond.Message)
}
// ErrPIDPressure is the error returned by NodePressure when a node reports a
// PID pressure condition. It embeds the offending NodeCondition.
type ErrPIDPressure struct {
	NodeCondition
}

// Error implements the error interface by formatting the embedded condition's
// type, reason and message.
func (e *ErrPIDPressure) Error() string {
	cond := e.NodeCondition
	return fmt.Sprintf(errTextFormat, cond.Type, cond.Reason, cond.Message)
}
// ErrNetworkNotReady is the error returned by NodePressure when a node reports
// that its network is unavailable. It embeds the offending NodeCondition.
type ErrNetworkNotReady struct {
	NodeCondition
}

// Error implements the error interface by formatting the embedded condition's
// type, reason and message.
func (e *ErrNetworkNotReady) Error() string {
	cond := e.NodeCondition
	return fmt.Sprintf(errTextFormat, cond.Type, cond.Reason, cond.Message)
}
// NodePressure verifies that no node is under disk, memory or PID pressure and
// that no node reports its network as unavailable.
//
// It lists all nodes through cs and inspects each reported condition. The
// first unwanted condition found is returned as one of *ErrDiskPressure,
// *ErrMemoryPressure, *ErrPIDPressure or *ErrNetworkNotReady. A failure to
// list nodes is returned wrapped with "list nodes". It returns nil when no
// unwanted condition is found.
func NodePressure(cs *kubernetes.Clientset) error {
	glog.Info("verifying NodePressure condition ...")
	began := time.Now()
	defer func() {
		glog.Infof("duration metric: took %s to run NodePressure ...", time.Since(began))
	}()

	nodeList, err := cs.CoreV1().Nodes().List(meta.ListOptions{})
	if err != nil {
		return errors.Wrap(err, "list nodes")
	}

	for _, node := range nodeList.Items {
		// Capacity is logged for diagnostics only; it does not affect the result.
		glog.Infof("node storage ephemeral capacity is %s", node.Status.Capacity.StorageEphemeral())
		glog.Infof("node cpu capacity is %s", node.Status.Capacity.Cpu().AsDec())

		for _, cond := range node.Status.Conditions {
			nc := NodeCondition{
				Type:    cond.Type,
				Status:  cond.Status,
				Reason:  cond.Reason,
				Message: cond.Message,
			}
			// Cases are checked in the same order as the condition predicates
			// are documented: disk, memory, PID, network.
			switch {
			case nc.DiskPressure():
				return &ErrDiskPressure{NodeCondition: nc}
			case nc.MemoryPressure():
				return &ErrMemoryPressure{NodeCondition: nc}
			case nc.PIDPressure():
				return &ErrPIDPressure{NodeCondition: nc}
			case nc.NetworkUnavailable():
				return &ErrNetworkNotReady{NodeCondition: nc}
			}
		}
	}
	return nil
}

View File

@ -0,0 +1,64 @@
/*
Copyright 2020 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package kverify verifies a running kubernetes cluster is healthy
package kverify
import (
"fmt"
"time"
"github.com/golang/glog"
"github.com/pkg/errors"
v1 "k8s.io/api/core/v1"
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
kconst "k8s.io/kubernetes/cmd/kubeadm/app/constants"
)
// WaitForNodeReady waits until the kube client reports every listed node as
// "ready", or until timeout elapses.
//
// It polls every kconst.APICallRetryInterval. A node counts as not ready when
// it has a NodeReady condition whose status is not true. Errors from listing
// nodes are logged and retried rather than treated as fatal.
//
// It returns nil once a poll finds no not-ready node, and otherwise an error
// wrapped with "wait node ready".
func WaitForNodeReady(cs *kubernetes.Clientset, timeout time.Duration) error {
	glog.Info("waiting for node status to be ready ...")
	start := time.Now()
	defer func() {
		glog.Infof("duration metric: took %s to wait for WaitForNodeReady...", time.Since(start))
	}()

	checkReady := func() (bool, error) {
		// Explicit deadline check, kept alongside the poll timeout below: it
		// yields a descriptive error and also stops the wait when timeout is
		// zero or negative.
		// NOTE(review): wait.PollImmediate appears to treat a zero timeout as
		// "no timeout" — confirm against the vendored apimachinery version.
		if time.Since(start) > timeout {
			return false, fmt.Errorf("wait for node to be ready timed out")
		}
		ns, err := cs.CoreV1().Nodes().List(meta.ListOptions{})
		if err != nil {
			glog.Infof("error listing nodes will retry: %v", err)
			return false, nil
		}

		for _, n := range ns.Items {
			for _, c := range n.Status.Conditions {
				if c.Type == v1.NodeReady && c.Status != v1.ConditionTrue {
					glog.Infof("node %q has unwanted condition %q : Reason %q Message: %q. will try. ", n.Name, c.Type, c.Reason, c.Message)
					return false, nil
				}
			}
		}
		return true, nil
	}

	// Bound the poll by the caller's timeout, not a fixed constant, so that a
	// timeout longer than kconst.DefaultControlPlaneTimeout is honored.
	if err := wait.PollImmediate(kconst.APICallRetryInterval, timeout, checkReady); err != nil {
		return errors.Wrap(err, "wait node ready")
	}
	return nil
}

View File

@ -21,6 +21,7 @@ import (
"context"
"os/exec"
"path"
"runtime"
"sync"
"fmt"
@ -41,6 +42,7 @@ import (
"k8s.io/client-go/kubernetes"
kconst "k8s.io/kubernetes/cmd/kubeadm/app/constants"
"k8s.io/minikube/pkg/drivers/kic"
"k8s.io/minikube/pkg/drivers/kic/oci"
"k8s.io/minikube/pkg/kapi"
"k8s.io/minikube/pkg/minikube/assets"
"k8s.io/minikube/pkg/minikube/bootstrapper"
@ -325,16 +327,37 @@ func (k *Bootstrapper) client(ip string, port int) (*kubernetes.Clientset, error
}
// WaitForNode blocks until the node appears to be healthy
func (k *Bootstrapper) WaitForNode(cfg config.ClusterConfig, n config.Node, timeout time.Duration) error {
func (k *Bootstrapper) WaitForNode(cfg config.ClusterConfig, n config.Node, timeout time.Duration) (waitErr error) {
start := time.Now()
if !n.ControlPlane {
glog.Infof("%s is not a control plane, nothing to wait for", n.Name)
return nil
}
out.T(out.HealthCheck, "Verifying Kubernetes Components:")
out.T(out.IndentVerify, "verifying node conditions ...")
// TODO: #7706: for better performance we could use k.client inside minikube to avoid asking for external IP:PORT
hostname, _, port, err := driver.ControlPaneEndpoint(&cfg, &n, cfg.Driver)
if err != nil {
return errors.Wrap(err, "get control plane endpoint")
}
defer func() { // run pressure verification after all other checks, so there be an api server to talk to.
client, err := k.client(hostname, port)
if err != nil {
waitErr = errors.Wrap(err, "get k8s client")
}
if err := kverify.NodePressure(client); err != nil {
adviseNodePressure(err, cfg.Name, cfg.Driver)
waitErr = errors.Wrap(err, "node pressure")
}
}()
if !kverify.ShouldWait(cfg.VerifyComponents) {
glog.Infof("skip waiting for components based on config.")
return nil
return waitErr
}
cr, err := cruntime.New(cruntime.Config{Type: cfg.KubernetesConfig.ContainerRuntime, Runner: k.c})
@ -342,12 +365,8 @@ func (k *Bootstrapper) WaitForNode(cfg config.ClusterConfig, n config.Node, time
return errors.Wrapf(err, "create runtme-manager %s", cfg.KubernetesConfig.ContainerRuntime)
}
hostname, _, port, err := driver.ControlPaneEndpoint(&cfg, &n, cfg.Driver)
if err != nil {
return errors.Wrap(err, "get control plane endpoint")
}
if cfg.VerifyComponents[kverify.APIServerWaitKey] {
out.T(out.IndentVerify, "verifying api server ...")
client, err := k.client(hostname, port)
if err != nil {
return errors.Wrap(err, "get k8s client")
@ -362,6 +381,7 @@ func (k *Bootstrapper) WaitForNode(cfg config.ClusterConfig, n config.Node, time
}
if cfg.VerifyComponents[kverify.SystemPodsWaitKey] {
out.T(out.IndentVerify, "verifying system pods ...")
client, err := k.client(hostname, port)
if err != nil {
return errors.Wrap(err, "get k8s client")
@ -372,6 +392,7 @@ func (k *Bootstrapper) WaitForNode(cfg config.ClusterConfig, n config.Node, time
}
if cfg.VerifyComponents[kverify.DefaultSAWaitKey] {
out.T(out.IndentVerify, "verifying default service account ...")
client, err := k.client(hostname, port)
if err != nil {
return errors.Wrap(err, "get k8s client")
@ -381,7 +402,8 @@ func (k *Bootstrapper) WaitForNode(cfg config.ClusterConfig, n config.Node, time
}
}
if cfg.VerifyComponents[kverify.AppsRunning] {
if cfg.VerifyComponents[kverify.AppsRunningKey] {
out.T(out.IndentVerify, "verifying apps running ...")
client, err := k.client(hostname, port)
if err != nil {
return errors.Wrap(err, "get k8s client")
@ -391,8 +413,19 @@ func (k *Bootstrapper) WaitForNode(cfg config.ClusterConfig, n config.Node, time
}
}
if cfg.VerifyComponents[kverify.NodeReadyKey] {
out.T(out.IndentVerify, "verifying node ready")
client, err := k.client(hostname, port)
if err != nil {
return errors.Wrap(err, "get k8s client")
}
if err := kverify.WaitForNodeReady(client, timeout); err != nil {
return errors.Wrap(err, "waiting for node to be ready")
}
}
glog.Infof("duration metric: took %s to wait for : %+v ...", time.Since(start), cfg.VerifyComponents)
return nil
return waitErr
}
// needsReset returns whether or not the cluster needs to be reconfigured
@ -517,6 +550,10 @@ func (k *Bootstrapper) restartCluster(cfg config.ClusterConfig) error {
return errors.Wrap(err, "system pods")
}
if err := kverify.NodePressure(client); err != nil {
adviseNodePressure(err, cfg.Name, cfg.Driver)
}
// This can fail during upgrades if the old pods have not shut down yet
addonPhase := func() error {
_, err := k.c.RunCmd(exec.Command("/bin/bash", "-c", fmt.Sprintf("%s phase addon all --config %s", baseCmd, conf)))
@ -860,3 +897,63 @@ func (k *Bootstrapper) elevateKubeSystemPrivileges(cfg config.ClusterConfig) err
}
return nil
}
// adviseNodePressure advises the user what to do about the different node
// pressure errors, based on their environment.
//
// err is expected to be one of the error types returned by
// kverify.NodePressure; any other error (including nil) prints nothing.
// name is the name shown to the user in the warning, and drv is the driver
// name used to pick driver-specific advice.
func adviseNodePressure(err error, name string, drv string) {
	switch e := err.(type) {
	case *kverify.ErrDiskPressure:
		out.ErrLn("")
		glog.Warning(e)
		out.WarningT("The node {{.name}} has run out of disk space.", out.V{"name": name})
		// generic advice for all drivers
		out.T(out.Tip, "Please free up disk or prune images.")
		if driver.IsVM(drv) {
			out.T(out.Stopped, "Please create a cluster with bigger disk size: `minikube start --disk SIZE_MB` ")
		} else if drv == oci.Docker && runtime.GOOS != "linux" {
			// The docker driver is only advised about Docker Desktop when
			// the host is not Linux.
			out.T(out.Stopped, "Please increase Docker Desktop's disk size.")
			adviseDockerDesktopDocs()
		}
		out.ErrLn("")

	case *kverify.ErrMemoryPressure:
		out.ErrLn("")
		glog.Warning(e)
		out.WarningT("The node {{.name}} has run out of memory.", out.V{"name": name})
		out.T(out.Tip, "Check if you have unnecessary pods running by running 'kubectl get po -A'")
		if driver.IsVM(drv) {
			out.T(out.Stopped, "Consider creating a cluster with larger memory size using `minikube start --memory SIZE_MB` ")
		} else if drv == oci.Docker && runtime.GOOS != "linux" {
			out.T(out.Stopped, "Consider increasing Docker Desktop's memory size.")
			adviseDockerDesktopDocs()
		}
		out.ErrLn("")

	case *kverify.ErrPIDPressure:
		glog.Warning(e)
		out.ErrLn("")
		out.WarningT("The node {{.name}} has run out of available PIDs.", out.V{"name": name})
		out.ErrLn("")

	case *kverify.ErrNetworkNotReady:
		glog.Warning(e)
		out.ErrLn("")
		out.WarningT("The node {{.name}} network is not available. Please verify network settings.", out.V{"name": name})
		out.ErrLn("")
	}
}

// adviseDockerDesktopDocs prints the Docker Desktop documentation link that
// matches the host operating system; it prints nothing on other systems.
func adviseDockerDesktopDocs() {
	switch runtime.GOOS {
	case "darwin":
		out.T(out.Documentation, "Documentation: {{.url}}", out.V{"url": "https://docs.docker.com/docker-for-mac/space/"})
	case "windows":
		out.T(out.Documentation, "Documentation: {{.url}}", out.V{"url": "https://docs.docker.com/docker-for-windows/"})
	}
}

View File

@ -37,7 +37,6 @@ import (
cmdcfg "k8s.io/minikube/cmd/minikube/cmd/config"
"k8s.io/minikube/pkg/addons"
"k8s.io/minikube/pkg/minikube/bootstrapper"
"k8s.io/minikube/pkg/minikube/bootstrapper/bsutil/kverify"
"k8s.io/minikube/pkg/minikube/bootstrapper/images"
"k8s.io/minikube/pkg/minikube/cluster"
"k8s.io/minikube/pkg/minikube/command"
@ -145,8 +144,8 @@ func Start(starter Starter, apiServer bool) (*kubeconfig.Settings, error) {
prepareNone()
}
// Skip pre-existing, because we already waited for health
if kverify.ShouldWait(starter.Cfg.VerifyComponents) && !starter.PreExists {
// TODO: existing cluster should wait for health #7597
if !starter.PreExists {
if err := bs.WaitForNode(*starter.Cfg, *starter.Node, viper.GetDuration(waitTimeout)); err != nil {
return nil, errors.Wrap(err, "Wait failed")
}

View File

@ -70,6 +70,7 @@ var styles = map[StyleEnum]style{
ThumbsUp: {Prefix: "👍 "},
ThumbsDown: {Prefix: "👎 "},
Option: {Prefix: " ▪ ", LowPrefix: lowIndent}, // Indented bullet
IndentVerify: {Prefix: " 🔎 ", LowPrefix: lowIndent}, // Indented verifying icon, it needs extra space to make it work
Command: {Prefix: " ▪ ", LowPrefix: lowIndent}, // Indented bullet
LogEntry: {Prefix: " "}, // Indent
Deleted: {Prefix: "💀 "},
@ -108,6 +109,7 @@ var styles = map[StyleEnum]style{
Enabling: {Prefix: "🔌 "},
Shutdown: {Prefix: "🛑 "},
Pulling: {Prefix: "🚜 "},
HealthCheck: {Prefix: "🕵️ "}, // mac needed extra space for right tabbing
Verifying: {Prefix: "🤔 "},
VerifyingNoLine: {Prefix: "🤔 ", OmitNewline: true},
Kubectl: {Prefix: "💗 "},

View File

@ -43,6 +43,7 @@ const (
ThumbsUp
ThumbsDown
Option
IndentVerify
Command
LogEntry
Deleted
@ -73,6 +74,7 @@ const (
Enabling
Shutdown
Pulling
HealthCheck
Verifying
VerifyingNoLine
Kubectl

View File

@ -86,7 +86,7 @@ minikube start [flags]
--uuid string Provide VM UUID to restore MAC address (hyperkit driver only)
--vm Filter to use only VM Drivers
--vm-driver driver DEPRECATED, use driver instead.
--wait strings comma separated list of kubernetes components to verify and wait for after starting a cluster. defaults to "apiserver,system_pods", available options: "apiserver,system_pods,default_sa,apps_running" . other acceptable values are 'all' or 'none', 'true' and 'false' (default [apiserver,system_pods])
--wait strings comma separated list of kubernetes components to verify and wait for after starting a cluster. defaults to "apiserver,system_pods", available options: "apiserver,system_pods,default_sa,apps_running,node_ready" . other acceptable values are 'all' or 'none', 'true' and 'false' (default [apiserver,system_pods])
--wait-timeout duration max time to wait per Kubernetes core services to be healthy. (default 6m0s)
```