From d0a21be5325f74cbee334ce8b6e6a106608c3a57 Mon Sep 17 00:00:00 2001 From: VerlorenerReisender <46831212+ComradeProgrammer@users.noreply.github.com> Date: Sat, 9 Aug 2025 23:49:55 +0200 Subject: [PATCH] improve docker service reliability, update docker systemd files (#21174) * docker: update template for docker.service * wait for docker * fix unit test --- pkg/minikube/cruntime/cruntime_test.go | 2 +- pkg/minikube/cruntime/docker.go | 37 +++++++++++++++----------- pkg/provision/buildroot.go | 34 +++++++++++------------ pkg/provision/ubuntu.go | 23 ++++++++++++---- 4 files changed, 57 insertions(+), 39 deletions(-) diff --git a/pkg/minikube/cruntime/cruntime_test.go b/pkg/minikube/cruntime/cruntime_test.go index 4d056fdc33..c65ff4b930 100644 --- a/pkg/minikube/cruntime/cruntime_test.go +++ b/pkg/minikube/cruntime/cruntime_test.go @@ -554,7 +554,7 @@ func (f *FakeRunner) systemctl(args []string, root bool) (string, error) { // no f.t.Logf("fake systemctl: SvcRestarted %s", svc) case "is-active": f.t.Logf("fake systemctl: %s is-status: %v", svc, state) - if state == SvcRunning { + if state == SvcRunning || state == SvcRestarted { return out, nil } return out, fmt.Errorf("%s in state: %v", svc, state) diff --git a/pkg/minikube/cruntime/docker.go b/pkg/minikube/cruntime/docker.go index 78993fb628..a708c160ff 100644 --- a/pkg/minikube/cruntime/docker.go +++ b/pkg/minikube/cruntime/docker.go @@ -168,8 +168,7 @@ func (r *Docker) Enable(disOthers bool, cgroupDriver string, inUserNamespace boo return err } - _ = r.Init.ResetFailed("docker") - if err := r.Init.Restart("docker"); err != nil { + if err := r.restartServiceWithExpoRetry("docker"); err != nil { return err } @@ -199,20 +198,9 @@ func (r *Docker) Enable(disOthers bool, cgroupDriver string, inUserNamespace boo return err } - _ = r.Init.ResetFailed(service) - _ = r.Init.Restart(service) - // try to restart service if stopped, restart until it works - retry.Expo( - func() error { - if !r.Init.Active(service) { - r.Init.ResetFailed(service) - r.Init.Restart(service) - return fmt.Errorf("cri-docker not running") - } - return nil - }, - 1*time.Second, time.Minute, 5, - ) + if err := r.restartServiceWithExpoRetry(service); err != nil { + return err + } } return nil @@ -824,3 +812,20 @@ ExecStart={{.ExecPath}} --container-runtime-endpoint fd:// --pod-infra-container } return nil } + +func (r *Docker) restartServiceWithExpoRetry(service string) error { + _ = r.Init.ResetFailed(service) + _ = r.Init.Restart(service) + // try to restart service if stopped, restart until it works + return retry.Expo( + func() error { + if !r.Init.Active(service) { + r.Init.ResetFailed(service) + r.Init.Restart(service) + return fmt.Errorf("%s not running", service) + } + return nil + }, + 1*time.Second, time.Minute, 5, + ) +} diff --git a/pkg/provision/buildroot.go b/pkg/provision/buildroot.go index f15cd579e9..d063e30d0e 100644 --- a/pkg/provision/buildroot.go +++ b/pkg/provision/buildroot.go @@ -70,18 +70,21 @@ func (p *BuildrootProvisioner) GenerateDockerOptions(dockerPort int) (*provision klog.Infof("root file system type: %s", fstype) noPivot = fstype == "rootfs" } - + /* Template is copied from https://github.com/moby/moby/blob/44bca1adf3c806c4ed6688d67c6f995653b09b26/contrib/init/systemd/docker.service#L2 with the following changes: + After: minikube-automount.service is added + ExecStart: additional flags are added + */ engineConfigTmpl := `[Unit] Description=Docker Application Container Engine Documentation=https://docs.docker.com -After=network.target minikube-automount.service docker.socket -Requires= minikube-automount.service docker.socket +After=network-online.target minikube-automount.service nss-lookup.target docker.socket firewalld.service containerd.service time-set.target +Wants=network-online.target containerd.service +Requires=docker.socket StartLimitBurst=3 StartLimitIntervalSec=60 - [Service] Type=notify -Restart=on-failure +Restart=always ` if noPivot { klog.Warning("Using fundamentally insecure --no-pivot option") @@ -94,18 +97,15 @@ Environment=DOCKER_RAMDISK=yes {{range .EngineOptions.Env}}Environment={{.}} {{end}} -# This file is a systemd drop-in unit that inherits from the base dockerd configuration. -# The base configuration already specifies an 'ExecStart=...' command. The first directive -# here is to clear out that command inherited from the base configuration. Without this, -# the command from the base configuration and the command specified here are treated as -# a sequence of commands, which is not the desired behavior, nor is it valid -- systemd -# will catch this invalid input and refuse to start the service with an error like: -# Service has more than one ExecStart= setting, which is only allowed for Type=oneshot services. - -# NOTE: default-ulimit=nofile is set to an arbitrary number for consistency with other -# container runtimes. If left unlimited, it may result in OOM issues with MySQL. ExecStart= -ExecStart=/usr/bin/dockerd -H tcp://0.0.0.0:2376 -H unix:///var/run/docker.sock --default-ulimit=nofile=1048576:1048576 --tlsverify --tlscacert {{.AuthOptions.CaCertRemotePath}} --tlscert {{.AuthOptions.ServerCertRemotePath}} --tlskey {{.AuthOptions.ServerKeyRemotePath}} {{ range .EngineOptions.Labels }}--label {{.}} {{ end }}{{ range .EngineOptions.InsecureRegistry }}--insecure-registry {{.}} {{ end }}{{ range .EngineOptions.RegistryMirror }}--registry-mirror {{.}} {{ end }}{{ range .EngineOptions.ArbitraryFlags }}--{{.}} {{ end }} +ExecStart=/usr/bin/dockerd -H tcp://0.0.0.0:2376 \ + -H fd:// --containerd=/run/containerd/containerd.sock \ + -H unix:///var/run/docker.sock \ + --default-ulimit=nofile=1048576:1048576 \ + --tlsverify \ + --tlscacert {{.AuthOptions.CaCertRemotePath}} \ + --tlscert {{.AuthOptions.ServerCertRemotePath}} \ + --tlskey {{.AuthOptions.ServerKeyRemotePath}} {{ range .EngineOptions.Labels }}--label {{.}} {{ end }}{{ range .EngineOptions.InsecureRegistry }}--insecure-registry {{.}} {{ end }}{{ range .EngineOptions.RegistryMirror }}--registry-mirror {{.}} {{ end }}{{ range .EngineOptions.ArbitraryFlags }}--{{.}} {{ end }} ExecReload=/bin/kill -s HUP \$MAINPID # Having non-zero Limit*s causes performance problems due to accounting overhead @@ -117,13 +117,13 @@ LimitCORE=infinity # Uncomment TasksMax if your systemd version supports it. # Only systemd 226 and above support this version. TasksMax=infinity -TimeoutStartSec=0 # set delegate yes so that systemd does not reset the cgroups of docker containers Delegate=yes # kill only the docker process, not all processes in the cgroup KillMode=process +OOMScoreAdjust=-500 [Install] WantedBy=multi-user.target diff --git a/pkg/provision/ubuntu.go b/pkg/provision/ubuntu.go index 91c5386480..ea18c7bd7f 100644 --- a/pkg/provision/ubuntu.go +++ b/pkg/provision/ubuntu.go @@ -71,20 +71,25 @@ func (p *UbuntuProvisioner) GenerateDockerOptions(dockerPort int) (*provision.Do klog.Infof("root file system type: %s", fstype) noPivot = fstype == "rootfs" } + /* Template is copied from https://github.com/moby/moby/blob/44bca1adf3c806c4ed6688d67c6f995653b09b26/contrib/init/systemd/docker.service#L2 with the following changes: + ExecStart: additional flags are added + StartLimitBurst=3 : The service can be restarted up to 3 times within the time window defined by StartLimitIntervalSec. + + StartLimitIntervalSec=60 : The time window is 60 seconds. If the service fails and restarts more than 3 times in 60 seconds, systemd will stop trying to restart it automatically. + */ engineConfigTmpl := `[Unit] Description=Docker Application Container Engine Documentation=https://docs.docker.com -BindsTo=containerd.service -After=network-online.target firewalld.service containerd.service -Wants=network-online.target +After=network-online.target nss-lookup.target docker.socket firewalld.service containerd.service time-set.target +Wants=network-online.target containerd.service Requires=docker.socket StartLimitBurst=3 StartLimitIntervalSec=60 [Service] Type=notify -Restart=on-failure +Restart=always ` if noPivot { klog.Warning("Using fundamentally insecure --no-pivot option") @@ -108,7 +113,14 @@ Environment=DOCKER_RAMDISK=yes # NOTE: default-ulimit=nofile is set to an arbitrary number for consistency with other # container runtimes. If left unlimited, it may result in OOM issues with MySQL. ExecStart= -ExecStart=/usr/bin/dockerd -H tcp://0.0.0.0:2376 -H unix:///var/run/docker.sock --default-ulimit=nofile=1048576:1048576 --tlsverify --tlscacert {{.AuthOptions.CaCertRemotePath}} --tlscert {{.AuthOptions.ServerCertRemotePath}} --tlskey {{.AuthOptions.ServerKeyRemotePath}} {{ range .EngineOptions.Labels }}--label {{.}} {{ end }}{{ range .EngineOptions.InsecureRegistry }}--insecure-registry {{.}} {{ end }}{{ range .EngineOptions.RegistryMirror }}--registry-mirror {{.}} {{ end }}{{ range .EngineOptions.ArbitraryFlags }}--{{.}} {{ end }} +ExecStart=/usr/bin/dockerd -H tcp://0.0.0.0:2376 \ + -H fd:// --containerd=/run/containerd/containerd.sock \ + -H unix:///var/run/docker.sock \ + --default-ulimit=nofile=1048576:1048576 \ + --tlsverify \ + --tlscacert {{.AuthOptions.CaCertRemotePath}} \ + --tlscert {{.AuthOptions.ServerCertRemotePath}} \ + --tlskey {{.AuthOptions.ServerKeyRemotePath}} {{ range .EngineOptions.Labels }}--label {{.}} {{ end }}{{ range .EngineOptions.InsecureRegistry }}--insecure-registry {{.}} {{ end }}{{ range .EngineOptions.RegistryMirror }}--registry-mirror {{.}} {{ end }}{{ range .EngineOptions.ArbitraryFlags }}--{{.}} {{ end }} ExecReload=/bin/kill -s HUP \$MAINPID # Having non-zero Limit*s causes performance problems due to accounting overhead @@ -127,6 +139,7 @@ Delegate=yes # kill only the docker process, not all processes in the cgroup KillMode=process +OOMScoreAdjust=-500 [Install] WantedBy=multi-user.target