improve docker service reliability, update docker systemd files (#21174)

* docker: update template for docker.service

* wait for docker

* fix unit test
pull/21273/head
VerlorenerReisender 2025-08-09 23:49:55 +02:00 committed by GitHub
parent c432003f04
commit d0a21be532
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 57 additions and 39 deletions

View File

@ -554,7 +554,7 @@ func (f *FakeRunner) systemctl(args []string, root bool) (string, error) { // no
f.t.Logf("fake systemctl: SvcRestarted %s", svc)
case "is-active":
f.t.Logf("fake systemctl: %s is-status: %v", svc, state)
if state == SvcRunning {
if state == SvcRunning || state == SvcRestarted {
return out, nil
}
return out, fmt.Errorf("%s in state: %v", svc, state)

View File

@ -168,8 +168,7 @@ func (r *Docker) Enable(disOthers bool, cgroupDriver string, inUserNamespace boo
return err
}
_ = r.Init.ResetFailed("docker")
if err := r.Init.Restart("docker"); err != nil {
if err := r.restartServiceWithExpoRetry("docker"); err != nil {
return err
}
@ -199,20 +198,9 @@ func (r *Docker) Enable(disOthers bool, cgroupDriver string, inUserNamespace boo
return err
}
_ = r.Init.ResetFailed(service)
_ = r.Init.Restart(service)
// try to restart service if stopped, restart until it works
retry.Expo(
func() error {
if !r.Init.Active(service) {
r.Init.ResetFailed(service)
r.Init.Restart(service)
return fmt.Errorf("cri-docker not running")
}
return nil
},
1*time.Second, time.Minute, 5,
)
if err := r.restartServiceWithExpoRetry(service); err != nil {
return err
}
}
return nil
@ -824,3 +812,20 @@ ExecStart={{.ExecPath}} --container-runtime-endpoint fd:// --pod-infra-container
}
return nil
}
func (r *Docker) restartServiceWithExpoRetry(service string) error {
_ = r.Init.ResetFailed(service)
_ = r.Init.Restart(service)
// try to restart service if stopped, restart until it works
return retry.Expo(
func() error {
if !r.Init.Active(service) {
r.Init.ResetFailed(service)
r.Init.Restart(service)
return fmt.Errorf("%s not running", service)
}
return nil
},
1*time.Second, time.Minute, 5,
)
}

View File

@ -70,18 +70,21 @@ func (p *BuildrootProvisioner) GenerateDockerOptions(dockerPort int) (*provision
klog.Infof("root file system type: %s", fstype)
noPivot = fstype == "rootfs"
}
/* Template is copied from https://github.com/moby/moby/blob/44bca1adf3c806c4ed6688d67c6f995653b09b26/contrib/init/systemd/docker.service#L2 with the following changes:
After: minikube-automount.service is added
ExecStart: additional flags are added
*/
engineConfigTmpl := `[Unit]
Description=Docker Application Container Engine
Documentation=https://docs.docker.com
After=network.target minikube-automount.service docker.socket
Requires= minikube-automount.service docker.socket
After=network-online.target minikube-automount.service nss-lookup.target docker.socket firewalld.service containerd.service time-set.target
Wants=network-online.target containerd.service
Requires=docker.socket
StartLimitBurst=3
StartLimitIntervalSec=60
[Service]
Type=notify
Restart=on-failure
Restart=always
`
if noPivot {
klog.Warning("Using fundamentally insecure --no-pivot option")
@ -94,18 +97,15 @@ Environment=DOCKER_RAMDISK=yes
{{range .EngineOptions.Env}}Environment={{.}}
{{end}}
# This file is a systemd drop-in unit that inherits from the base dockerd configuration.
# The base configuration already specifies an 'ExecStart=...' command. The first directive
# here is to clear out that command inherited from the base configuration. Without this,
# the command from the base configuration and the command specified here are treated as
# a sequence of commands, which is not the desired behavior, nor is it valid -- systemd
# will catch this invalid input and refuse to start the service with an error like:
# Service has more than one ExecStart= setting, which is only allowed for Type=oneshot services.
# NOTE: default-ulimit=nofile is set to an arbitrary number for consistency with other
# container runtimes. If left unlimited, it may result in OOM issues with MySQL.
ExecStart=
ExecStart=/usr/bin/dockerd -H tcp://0.0.0.0:2376 -H unix:///var/run/docker.sock --default-ulimit=nofile=1048576:1048576 --tlsverify --tlscacert {{.AuthOptions.CaCertRemotePath}} --tlscert {{.AuthOptions.ServerCertRemotePath}} --tlskey {{.AuthOptions.ServerKeyRemotePath}} {{ range .EngineOptions.Labels }}--label {{.}} {{ end }}{{ range .EngineOptions.InsecureRegistry }}--insecure-registry {{.}} {{ end }}{{ range .EngineOptions.RegistryMirror }}--registry-mirror {{.}} {{ end }}{{ range .EngineOptions.ArbitraryFlags }}--{{.}} {{ end }}
ExecStart=/usr/bin/dockerd -H tcp://0.0.0.0:2376 \
-H fd:// --containerd=/run/containerd/containerd.sock \
-H unix:///var/run/docker.sock \
--default-ulimit=nofile=1048576:1048576 \
--tlsverify \
--tlscacert {{.AuthOptions.CaCertRemotePath}} \
--tlscert {{.AuthOptions.ServerCertRemotePath}} \
--tlskey {{.AuthOptions.ServerKeyRemotePath}} {{ range .EngineOptions.Labels }}--label {{.}} {{ end }}{{ range .EngineOptions.InsecureRegistry }}--insecure-registry {{.}} {{ end }}{{ range .EngineOptions.RegistryMirror }}--registry-mirror {{.}} {{ end }}{{ range .EngineOptions.ArbitraryFlags }}--{{.}} {{ end }}
ExecReload=/bin/kill -s HUP \$MAINPID
# Having non-zero Limit*s causes performance problems due to accounting overhead
@ -117,13 +117,13 @@ LimitCORE=infinity
# Uncomment TasksMax if your systemd version supports it.
# Only systemd 226 and above support this version.
TasksMax=infinity
TimeoutStartSec=0
# set delegate yes so that systemd does not reset the cgroups of docker containers
Delegate=yes
# kill only the docker process, not all processes in the cgroup
KillMode=process
OOMScoreAdjust=-500
[Install]
WantedBy=multi-user.target

View File

@ -71,20 +71,25 @@ func (p *UbuntuProvisioner) GenerateDockerOptions(dockerPort int) (*provision.Do
klog.Infof("root file system type: %s", fstype)
noPivot = fstype == "rootfs"
}
/* Template is copied from https://github.com/moby/moby/blob/44bca1adf3c806c4ed6688d67c6f995653b09b26/contrib/init/systemd/docker.service#L2 with the following changes:
ExecStart: additional flags are added
StartLimitBurst=3 : The service can be restarted up to 3 times within the time window defined by StartLimitIntervalSec.
StartLimitIntervalSec=60 : The time window is 60 seconds. If the service fails and restarts more than 3 times in 60 seconds, systemd will stop trying to restart it automatically.
*/
engineConfigTmpl := `[Unit]
Description=Docker Application Container Engine
Documentation=https://docs.docker.com
BindsTo=containerd.service
After=network-online.target firewalld.service containerd.service
Wants=network-online.target
After=network-online.target nss-lookup.target docker.socket firewalld.service containerd.service time-set.target
Wants=network-online.target containerd.service
Requires=docker.socket
StartLimitBurst=3
StartLimitIntervalSec=60
[Service]
Type=notify
Restart=on-failure
Restart=always
`
if noPivot {
klog.Warning("Using fundamentally insecure --no-pivot option")
@ -108,7 +113,14 @@ Environment=DOCKER_RAMDISK=yes
# NOTE: default-ulimit=nofile is set to an arbitrary number for consistency with other
# container runtimes. If left unlimited, it may result in OOM issues with MySQL.
ExecStart=
ExecStart=/usr/bin/dockerd -H tcp://0.0.0.0:2376 -H unix:///var/run/docker.sock --default-ulimit=nofile=1048576:1048576 --tlsverify --tlscacert {{.AuthOptions.CaCertRemotePath}} --tlscert {{.AuthOptions.ServerCertRemotePath}} --tlskey {{.AuthOptions.ServerKeyRemotePath}} {{ range .EngineOptions.Labels }}--label {{.}} {{ end }}{{ range .EngineOptions.InsecureRegistry }}--insecure-registry {{.}} {{ end }}{{ range .EngineOptions.RegistryMirror }}--registry-mirror {{.}} {{ end }}{{ range .EngineOptions.ArbitraryFlags }}--{{.}} {{ end }}
ExecStart=/usr/bin/dockerd -H tcp://0.0.0.0:2376 \
-H fd:// --containerd=/run/containerd/containerd.sock \
-H unix:///var/run/docker.sock \
--default-ulimit=nofile=1048576:1048576 \
--tlsverify \
--tlscacert {{.AuthOptions.CaCertRemotePath}} \
--tlscert {{.AuthOptions.ServerCertRemotePath}} \
--tlskey {{.AuthOptions.ServerKeyRemotePath}} {{ range .EngineOptions.Labels }}--label {{.}} {{ end }}{{ range .EngineOptions.InsecureRegistry }}--insecure-registry {{.}} {{ end }}{{ range .EngineOptions.RegistryMirror }}--registry-mirror {{.}} {{ end }}{{ range .EngineOptions.ArbitraryFlags }}--{{.}} {{ end }}
ExecReload=/bin/kill -s HUP \$MAINPID
# Having non-zero Limit*s causes performance problems due to accounting overhead
@ -127,6 +139,7 @@ Delegate=yes
# kill only the docker process, not all processes in the cgroup
KillMode=process
OOMScoreAdjust=-500
[Install]
WantedBy=multi-user.target