improve docker service reliability, update docker systemd files (#21174)
* docker: update template for docker.service * wait for docker * fix unit testpull/21273/head
parent
c432003f04
commit
d0a21be532
|
@ -554,7 +554,7 @@ func (f *FakeRunner) systemctl(args []string, root bool) (string, error) { // no
|
|||
f.t.Logf("fake systemctl: SvcRestarted %s", svc)
|
||||
case "is-active":
|
||||
f.t.Logf("fake systemctl: %s is-status: %v", svc, state)
|
||||
if state == SvcRunning {
|
||||
if state == SvcRunning || state == SvcRestarted {
|
||||
return out, nil
|
||||
}
|
||||
return out, fmt.Errorf("%s in state: %v", svc, state)
|
||||
|
|
|
@ -168,8 +168,7 @@ func (r *Docker) Enable(disOthers bool, cgroupDriver string, inUserNamespace boo
|
|||
return err
|
||||
}
|
||||
|
||||
_ = r.Init.ResetFailed("docker")
|
||||
if err := r.Init.Restart("docker"); err != nil {
|
||||
if err := r.restartServiceWithExpoRetry("docker"); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
|
@ -199,20 +198,9 @@ func (r *Docker) Enable(disOthers bool, cgroupDriver string, inUserNamespace boo
|
|||
return err
|
||||
}
|
||||
|
||||
_ = r.Init.ResetFailed(service)
|
||||
_ = r.Init.Restart(service)
|
||||
// try to restart service if stopped, restart until it works
|
||||
retry.Expo(
|
||||
func() error {
|
||||
if !r.Init.Active(service) {
|
||||
r.Init.ResetFailed(service)
|
||||
r.Init.Restart(service)
|
||||
return fmt.Errorf("cri-docker not running")
|
||||
}
|
||||
return nil
|
||||
},
|
||||
1*time.Second, time.Minute, 5,
|
||||
)
|
||||
if err := r.restartServiceWithExpoRetry(service); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
|
@ -824,3 +812,20 @@ ExecStart={{.ExecPath}} --container-runtime-endpoint fd:// --pod-infra-container
|
|||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *Docker) restartServiceWithExpoRetry(service string) error {
|
||||
_ = r.Init.ResetFailed(service)
|
||||
_ = r.Init.Restart(service)
|
||||
// try to restart service if stopped, restart until it works
|
||||
return retry.Expo(
|
||||
func() error {
|
||||
if !r.Init.Active(service) {
|
||||
r.Init.ResetFailed(service)
|
||||
r.Init.Restart(service)
|
||||
return fmt.Errorf("%s not running", service)
|
||||
}
|
||||
return nil
|
||||
},
|
||||
1*time.Second, time.Minute, 5,
|
||||
)
|
||||
}
|
||||
|
|
|
@ -70,18 +70,21 @@ func (p *BuildrootProvisioner) GenerateDockerOptions(dockerPort int) (*provision
|
|||
klog.Infof("root file system type: %s", fstype)
|
||||
noPivot = fstype == "rootfs"
|
||||
}
|
||||
|
||||
/* Template is copied from https://github.com/moby/moby/blob/44bca1adf3c806c4ed6688d67c6f995653b09b26/contrib/init/systemd/docker.service#L2 with the following changes:
|
||||
After: minikube-automount.service is added
|
||||
ExecStart: additional flags are added
|
||||
*/
|
||||
engineConfigTmpl := `[Unit]
|
||||
Description=Docker Application Container Engine
|
||||
Documentation=https://docs.docker.com
|
||||
After=network.target minikube-automount.service docker.socket
|
||||
Requires= minikube-automount.service docker.socket
|
||||
After=network-online.target minikube-automount.service nss-lookup.target docker.socket firewalld.service containerd.service time-set.target
|
||||
Wants=network-online.target containerd.service
|
||||
Requires=docker.socket
|
||||
StartLimitBurst=3
|
||||
StartLimitIntervalSec=60
|
||||
|
||||
[Service]
|
||||
Type=notify
|
||||
Restart=on-failure
|
||||
Restart=always
|
||||
`
|
||||
if noPivot {
|
||||
klog.Warning("Using fundamentally insecure --no-pivot option")
|
||||
|
@ -94,18 +97,15 @@ Environment=DOCKER_RAMDISK=yes
|
|||
{{range .EngineOptions.Env}}Environment={{.}}
|
||||
{{end}}
|
||||
|
||||
# This file is a systemd drop-in unit that inherits from the base dockerd configuration.
|
||||
# The base configuration already specifies an 'ExecStart=...' command. The first directive
|
||||
# here is to clear out that command inherited from the base configuration. Without this,
|
||||
# the command from the base configuration and the command specified here are treated as
|
||||
# a sequence of commands, which is not the desired behavior, nor is it valid -- systemd
|
||||
# will catch this invalid input and refuse to start the service with an error like:
|
||||
# Service has more than one ExecStart= setting, which is only allowed for Type=oneshot services.
|
||||
|
||||
# NOTE: default-ulimit=nofile is set to an arbitrary number for consistency with other
|
||||
# container runtimes. If left unlimited, it may result in OOM issues with MySQL.
|
||||
ExecStart=
|
||||
ExecStart=/usr/bin/dockerd -H tcp://0.0.0.0:2376 -H unix:///var/run/docker.sock --default-ulimit=nofile=1048576:1048576 --tlsverify --tlscacert {{.AuthOptions.CaCertRemotePath}} --tlscert {{.AuthOptions.ServerCertRemotePath}} --tlskey {{.AuthOptions.ServerKeyRemotePath}} {{ range .EngineOptions.Labels }}--label {{.}} {{ end }}{{ range .EngineOptions.InsecureRegistry }}--insecure-registry {{.}} {{ end }}{{ range .EngineOptions.RegistryMirror }}--registry-mirror {{.}} {{ end }}{{ range .EngineOptions.ArbitraryFlags }}--{{.}} {{ end }}
|
||||
ExecStart=/usr/bin/dockerd -H tcp://0.0.0.0:2376 \
|
||||
-H fd:// --containerd=/run/containerd/containerd.sock \
|
||||
-H unix:///var/run/docker.sock \
|
||||
--default-ulimit=nofile=1048576:1048576 \
|
||||
--tlsverify \
|
||||
--tlscacert {{.AuthOptions.CaCertRemotePath}} \
|
||||
--tlscert {{.AuthOptions.ServerCertRemotePath}} \
|
||||
--tlskey {{.AuthOptions.ServerKeyRemotePath}} {{ range .EngineOptions.Labels }}--label {{.}} {{ end }}{{ range .EngineOptions.InsecureRegistry }}--insecure-registry {{.}} {{ end }}{{ range .EngineOptions.RegistryMirror }}--registry-mirror {{.}} {{ end }}{{ range .EngineOptions.ArbitraryFlags }}--{{.}} {{ end }}
|
||||
ExecReload=/bin/kill -s HUP \$MAINPID
|
||||
|
||||
# Having non-zero Limit*s causes performance problems due to accounting overhead
|
||||
|
@ -117,13 +117,13 @@ LimitCORE=infinity
|
|||
# Uncomment TasksMax if your systemd version supports it.
|
||||
# Only systemd 226 and above support this version.
|
||||
TasksMax=infinity
|
||||
TimeoutStartSec=0
|
||||
|
||||
# set delegate yes so that systemd does not reset the cgroups of docker containers
|
||||
Delegate=yes
|
||||
|
||||
# kill only the docker process, not all processes in the cgroup
|
||||
KillMode=process
|
||||
OOMScoreAdjust=-500
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
|
|
@ -71,20 +71,25 @@ func (p *UbuntuProvisioner) GenerateDockerOptions(dockerPort int) (*provision.Do
|
|||
klog.Infof("root file system type: %s", fstype)
|
||||
noPivot = fstype == "rootfs"
|
||||
}
|
||||
/* Template is copied from https://github.com/moby/moby/blob/44bca1adf3c806c4ed6688d67c6f995653b09b26/contrib/init/systemd/docker.service#L2 with the following changes:
|
||||
ExecStart: additional flags are added
|
||||
|
||||
StartLimitBurst=3 : The service can be restarted up to 3 times within the time window defined by StartLimitIntervalSec.
|
||||
|
||||
StartLimitIntervalSec=60 : The time window is 60 seconds. If the service fails and restarts more than 3 times in 60 seconds, systemd will stop trying to restart it automatically.
|
||||
*/
|
||||
engineConfigTmpl := `[Unit]
|
||||
Description=Docker Application Container Engine
|
||||
Documentation=https://docs.docker.com
|
||||
BindsTo=containerd.service
|
||||
After=network-online.target firewalld.service containerd.service
|
||||
Wants=network-online.target
|
||||
After=network-online.target nss-lookup.target docker.socket firewalld.service containerd.service time-set.target
|
||||
Wants=network-online.target containerd.service
|
||||
Requires=docker.socket
|
||||
StartLimitBurst=3
|
||||
StartLimitIntervalSec=60
|
||||
|
||||
[Service]
|
||||
Type=notify
|
||||
Restart=on-failure
|
||||
Restart=always
|
||||
`
|
||||
if noPivot {
|
||||
klog.Warning("Using fundamentally insecure --no-pivot option")
|
||||
|
@ -108,7 +113,14 @@ Environment=DOCKER_RAMDISK=yes
|
|||
# NOTE: default-ulimit=nofile is set to an arbitrary number for consistency with other
|
||||
# container runtimes. If left unlimited, it may result in OOM issues with MySQL.
|
||||
ExecStart=
|
||||
ExecStart=/usr/bin/dockerd -H tcp://0.0.0.0:2376 -H unix:///var/run/docker.sock --default-ulimit=nofile=1048576:1048576 --tlsverify --tlscacert {{.AuthOptions.CaCertRemotePath}} --tlscert {{.AuthOptions.ServerCertRemotePath}} --tlskey {{.AuthOptions.ServerKeyRemotePath}} {{ range .EngineOptions.Labels }}--label {{.}} {{ end }}{{ range .EngineOptions.InsecureRegistry }}--insecure-registry {{.}} {{ end }}{{ range .EngineOptions.RegistryMirror }}--registry-mirror {{.}} {{ end }}{{ range .EngineOptions.ArbitraryFlags }}--{{.}} {{ end }}
|
||||
ExecStart=/usr/bin/dockerd -H tcp://0.0.0.0:2376 \
|
||||
-H fd:// --containerd=/run/containerd/containerd.sock \
|
||||
-H unix:///var/run/docker.sock \
|
||||
--default-ulimit=nofile=1048576:1048576 \
|
||||
--tlsverify \
|
||||
--tlscacert {{.AuthOptions.CaCertRemotePath}} \
|
||||
--tlscert {{.AuthOptions.ServerCertRemotePath}} \
|
||||
--tlskey {{.AuthOptions.ServerKeyRemotePath}} {{ range .EngineOptions.Labels }}--label {{.}} {{ end }}{{ range .EngineOptions.InsecureRegistry }}--insecure-registry {{.}} {{ end }}{{ range .EngineOptions.RegistryMirror }}--registry-mirror {{.}} {{ end }}{{ range .EngineOptions.ArbitraryFlags }}--{{.}} {{ end }}
|
||||
ExecReload=/bin/kill -s HUP \$MAINPID
|
||||
|
||||
# Having non-zero Limit*s causes performance problems due to accounting overhead
|
||||
|
@ -127,6 +139,7 @@ Delegate=yes
|
|||
|
||||
# kill only the docker process, not all processes in the cgroup
|
||||
KillMode=process
|
||||
OOMScoreAdjust=-500
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
|
Loading…
Reference in New Issue