diff --git a/pkg/agent/config/config.go b/pkg/agent/config/config.go
index 8de77b6a9e..5758333e0d 100644
--- a/pkg/agent/config/config.go
+++ b/pkg/agent/config/config.go
@@ -19,12 +19,11 @@ import (
 	"strings"
 	"time"
 
-	"github.com/containerd/containerd/snapshots/overlay"
-	fuseoverlayfs "github.com/containerd/fuse-overlayfs-snapshotter"
 	"github.com/pkg/errors"
 	"github.com/rancher/k3s/pkg/agent/proxy"
 	"github.com/rancher/k3s/pkg/cli/cmds"
 	"github.com/rancher/k3s/pkg/clientaccess"
+	"github.com/rancher/k3s/pkg/containerd"
 	"github.com/rancher/k3s/pkg/daemons/config"
 	"github.com/rancher/k3s/pkg/daemons/control/deps"
 	"github.com/rancher/k3s/pkg/util"
@@ -447,12 +446,12 @@ func get(ctx context.Context, envInfo *cmds.Agent, proxy proxy.Proxy) (*config.N
 	if !nodeConfig.Docker && nodeConfig.ContainerRuntimeEndpoint == "" {
 		switch nodeConfig.AgentConfig.Snapshotter {
 		case "overlayfs":
-			if err := overlay.Supported(nodeConfig.Containerd.Root); err != nil {
+			if err := containerd.OverlaySupported(nodeConfig.Containerd.Root); err != nil {
 				return nil, errors.Wrapf(err, "\"overlayfs\" snapshotter cannot be enabled for %q, try using \"fuse-overlayfs\" or \"native\"",
 					nodeConfig.Containerd.Root)
 			}
 		case "fuse-overlayfs":
-			if err := fuseoverlayfs.Supported(nodeConfig.Containerd.Root); err != nil {
+			if err := containerd.FuseoverlayfsSupported(nodeConfig.Containerd.Root); err != nil {
 				return nil, errors.Wrapf(err, "\"fuse-overlayfs\" snapshotter cannot be enabled for %q, try using \"native\"",
 					nodeConfig.Containerd.Root)
 			}
diff --git a/pkg/agent/containerd/config_linux.go b/pkg/agent/containerd/config_linux.go
new file mode 100644
index 0000000000..3f5e475696
--- /dev/null
+++ b/pkg/agent/containerd/config_linux.go
@@ -0,0 +1,73 @@
+// +build linux
+
+package containerd
+
+import (
+	"context"
+	"io/ioutil"
+	"os"
+
+	"github.com/opencontainers/runc/libcontainer/system"
+	"github.com/pkg/errors"
+	"github.com/rancher/k3s/pkg/agent/templates"
+	util2 "github.com/rancher/k3s/pkg/agent/util"
+	"github.com/rancher/k3s/pkg/cgroups"
+	"github.com/rancher/k3s/pkg/daemons/config"
+	"github.com/rancher/k3s/pkg/version"
+	"github.com/rancher/wharfie/pkg/registries"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+)
+
+// setupContainerdConfig generates the containerd.toml, using a template combined with various
+// runtime configurations and registry mirror settings provided by the administrator.
+func setupContainerdConfig(ctx context.Context, cfg *config.Node) error {
+	privRegistries, err := registries.GetPrivateRegistries(cfg.AgentConfig.PrivateRegistry)
+	if err != nil {
+		return err
+	}
+
+	isRunningInUserNS := system.RunningInUserNS()
+	_, _, hasCFS, hasPIDs := cgroups.CheckCgroups()
+	// "/sys/fs/cgroup" is namespaced
+	cgroupfsWritable := unix.Access("/sys/fs/cgroup", unix.W_OK) == nil
+	disableCgroup := isRunningInUserNS && (!hasCFS || !hasPIDs || !cgroupfsWritable)
+	if disableCgroup {
+		logrus.Warn("cgroup v2 controllers are not delegated for rootless. Disabling cgroup.")
+	}
+
+	var containerdTemplate string
+	containerdConfig := templates.ContainerdConfig{
+		NodeConfig:            cfg,
+		DisableCgroup:         disableCgroup,
+		IsRunningInUserNS:     isRunningInUserNS,
+		PrivateRegistryConfig: privRegistries.Registry(),
+	}
+
+	selEnabled, selConfigured, err := selinuxStatus()
+	if err != nil {
+		return errors.Wrap(err, "failed to detect selinux")
+	}
+	switch {
+	case !cfg.SELinux && selEnabled:
+		logrus.Warn("SELinux is enabled on this host, but " + version.Program + " has not been started with --selinux - containerd SELinux support is disabled")
+	case cfg.SELinux && !selConfigured:
+		logrus.Warnf("SELinux is enabled for "+version.Program+" but process is not running in context '%s', "+version.Program+"-selinux policy may need to be applied", SELinuxContextType)
+	}
+
+	containerdTemplateBytes, err := ioutil.ReadFile(cfg.Containerd.Template)
+	if err == nil {
+		logrus.Infof("Using containerd template at %s", cfg.Containerd.Template)
+		containerdTemplate = string(containerdTemplateBytes)
+	} else if os.IsNotExist(err) {
+		containerdTemplate = templates.ContainerdConfigTemplate
+	} else {
+		return err
+	}
+	parsedTemplate, err := templates.ParseTemplateFromConfig(containerdTemplate, containerdConfig)
+	if err != nil {
+		return err
+	}
+
+	return util2.WriteFile(cfg.Containerd.Config, parsedTemplate)
+}
diff --git a/pkg/agent/containerd/config_windows.go b/pkg/agent/containerd/config_windows.go
new file mode 100644
index 0000000000..9d5dbc5a33
--- /dev/null
+++ b/pkg/agent/containerd/config_windows.go
@@ -0,0 +1,16 @@
+// +build windows
+
+package containerd
+
+import (
+	"context"
+
+	"github.com/rancher/k3s/pkg/daemons/config"
+)
+
+// setupContainerdConfig generates the containerd.toml, using a template combined with various
+// runtime configurations and registry mirror settings provided by the administrator.
+func setupContainerdConfig(ctx context.Context, cfg *config.Node) error {
+	// TODO: Create windows config setup.
+	return nil
+}
diff --git a/pkg/agent/containerd/containerd.go b/pkg/agent/containerd/containerd.go
index cc1a947728..ad5be86ac0 100644
--- a/pkg/agent/containerd/containerd.go
+++ b/pkg/agent/containerd/containerd.go
@@ -21,19 +21,13 @@ import (
 	"github.com/containerd/containerd/reference/docker"
 	"github.com/klauspost/compress/zstd"
 	"github.com/natefinch/lumberjack"
-	"github.com/opencontainers/runc/libcontainer/system"
 	"github.com/pierrec/lz4"
 	"github.com/pkg/errors"
-	"github.com/rancher/k3s/pkg/agent/templates"
 	util2 "github.com/rancher/k3s/pkg/agent/util"
-	"github.com/rancher/k3s/pkg/daemons/agent"
 	"github.com/rancher/k3s/pkg/daemons/config"
 	"github.com/rancher/k3s/pkg/untar"
-	"github.com/rancher/k3s/pkg/version"
-	"github.com/rancher/wharfie/pkg/registries"
 	"github.com/rancher/wrangler/pkg/merr"
 	"github.com/sirupsen/logrus"
-	"golang.org/x/sys/unix"
 	"google.golang.org/grpc"
 	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2"
 	"k8s.io/kubernetes/pkg/kubelet/util"
@@ -330,56 +324,3 @@ func prePullImages(ctx context.Context, conn *grpc.ClientConn, images io.Reader)
 	}
 	return nil
 }
-
-// setupContainerdConfig generates the containerd.toml, using a template combined with various
-// runtime configurations and registry mirror settings provided by the administrator.
-func setupContainerdConfig(ctx context.Context, cfg *config.Node) error {
-	privRegistries, err := registries.GetPrivateRegistries(cfg.AgentConfig.PrivateRegistry)
-	if err != nil {
-		return err
-	}
-
-	isRunningInUserNS := system.RunningInUserNS()
-	_, _, hasCFS, hasPIDs := agent.CheckCgroups()
-	// "/sys/fs/cgroup" is namespaced
-	cgroupfsWritable := unix.Access("/sys/fs/cgroup", unix.W_OK) == nil
-	disableCgroup := isRunningInUserNS && (!hasCFS || !hasPIDs || !cgroupfsWritable)
-	if disableCgroup {
-		logrus.Warn("cgroup v2 controllers are not delegated for rootless. Disabling cgroup.")
-	}
-
-	var containerdTemplate string
-	containerdConfig := templates.ContainerdConfig{
-		NodeConfig:            cfg,
-		DisableCgroup:         disableCgroup,
-		IsRunningInUserNS:     isRunningInUserNS,
-		PrivateRegistryConfig: privRegistries.Registry(),
-	}
-
-	selEnabled, selConfigured, err := selinuxStatus()
-	if err != nil {
-		return errors.Wrap(err, "failed to detect selinux")
-	}
-	switch {
-	case !cfg.SELinux && selEnabled:
-		logrus.Warn("SELinux is enabled on this host, but " + version.Program + " has not been started with --selinux - containerd SELinux support is disabled")
-	case cfg.SELinux && !selConfigured:
-		logrus.Warnf("SELinux is enabled for "+version.Program+" but process is not running in context '%s', "+version.Program+"-selinux policy may need to be applied", SELinuxContextType)
-	}
-
-	containerdTemplateBytes, err := ioutil.ReadFile(cfg.Containerd.Template)
-	if err == nil {
-		logrus.Infof("Using containerd template at %s", cfg.Containerd.Template)
-		containerdTemplate = string(containerdTemplateBytes)
-	} else if os.IsNotExist(err) {
-		containerdTemplate = templates.ContainerdConfigTemplate
-	} else {
-		return err
-	}
-	parsedTemplate, err := templates.ParseTemplateFromConfig(containerdTemplate, containerdConfig)
-	if err != nil {
-		return err
-	}
-
-	return util2.WriteFile(cfg.Containerd.Config, parsedTemplate)
-}
diff --git a/pkg/agent/loadbalancer/loadbalancer.go b/pkg/agent/loadbalancer/loadbalancer.go
index 457dc2ac4c..39dec7ef68 100644
--- a/pkg/agent/loadbalancer/loadbalancer.go
+++ b/pkg/agent/loadbalancer/loadbalancer.go
@@ -7,12 +7,10 @@ import (
 	"path/filepath"
 	"strconv"
 	"sync"
-	"syscall"
 
 	"github.com/google/tcpproxy"
 	"github.com/rancher/k3s/pkg/version"
 	"github.com/sirupsen/logrus"
-	"golang.org/x/sys/unix"
 )
 
 type LoadBalancer struct {
@@ -156,9 +154,3 @@ func onDialError(src net.Conn, dstDialErr error) {
 	logrus.Debugf("Incoming conn %v, error dialing load balancer servers: %v", src.RemoteAddr().String(), dstDialErr)
 	src.Close()
 }
-
-func reusePort(network, address string, conn syscall.RawConn) error {
-	return conn.Control(func(descriptor uintptr) {
-		syscall.SetsockoptInt(int(descriptor), unix.SOL_SOCKET, unix.SO_REUSEPORT, 1)
-	})
-}
diff --git a/pkg/agent/loadbalancer/utility_windows.go b/pkg/agent/loadbalancer/utility_windows.go
new file mode 100644
index 0000000000..4648f8fd25
--- /dev/null
+++ b/pkg/agent/loadbalancer/utility_windows.go
@@ -0,0 +1,9 @@
+// +build windows
+
+package loadbalancer
+
+import "syscall"
+
+func reusePort(network, address string, conn syscall.RawConn) error {
+	return nil
+}
diff --git a/pkg/agent/loadbalancer/utlity_linux.go b/pkg/agent/loadbalancer/utlity_linux.go
new file mode 100644
index 0000000000..bd71f14c80
--- /dev/null
+++ b/pkg/agent/loadbalancer/utlity_linux.go
@@ -0,0 +1,15 @@
+// +build linux
+
+package loadbalancer
+
+import (
+	"syscall"
+
+	"golang.org/x/sys/unix"
+)
+
+func reusePort(network, address string, conn syscall.RawConn) error {
+	return conn.Control(func(descriptor uintptr) {
+		syscall.SetsockoptInt(int(descriptor), unix.SOL_SOCKET, unix.SO_REUSEPORT, 1)
+	})
+}
diff --git a/pkg/agent/run.go b/pkg/agent/run.go
index ee36846200..3cbf3c6240 100644
--- a/pkg/agent/run.go
+++ b/pkg/agent/run.go
@@ -10,8 +10,6 @@ import (
 	"strings"
 	"time"
 
-	"github.com/containerd/cgroups"
-	cgroupsv2 "github.com/containerd/cgroups/v2"
 	systemd "github.com/coreos/go-systemd/daemon"
 	"github.com/pkg/errors"
 	"github.com/rancher/k3s/pkg/agent/config"
@@ -21,6 +19,7 @@ import (
 	"github.com/rancher/k3s/pkg/agent/proxy"
 	"github.com/rancher/k3s/pkg/agent/syssetup"
 	"github.com/rancher/k3s/pkg/agent/tunnel"
+	"github.com/rancher/k3s/pkg/cgroups"
 	"github.com/rancher/k3s/pkg/cli/cmds"
 	"github.com/rancher/k3s/pkg/clientaccess"
 	cp "github.com/rancher/k3s/pkg/cloudprovider"
@@ -199,7 +198,7 @@ func coreClient(cfg string) (kubernetes.Interface, error) {
 }
 
 func Run(ctx context.Context, cfg cmds.Agent) error {
-	if err := validate(); err != nil {
+	if err := cgroups.Validate(); err != nil {
 		return err
 	}
 
@@ -237,53 +236,6 @@ func Run(ctx context.Context, cfg cmds.Agent) error {
 	return run(ctx, cfg, proxy)
 }
 
-func validate() error {
-	if cgroups.Mode() == cgroups.Unified {
-		return validateCgroupsV2()
-	}
-	return validateCgroupsV1()
-}
-
-func validateCgroupsV1() error {
-	cgroups, err := ioutil.ReadFile("/proc/self/cgroup")
-	if err != nil {
-		return err
-	}
-
-	if !strings.Contains(string(cgroups), "cpuset") {
-		logrus.Warn(`Failed to find cpuset cgroup, you may need to add "cgroup_enable=cpuset" to your linux cmdline (/boot/cmdline.txt on a Raspberry Pi)`)
-	}
-
-	if !strings.Contains(string(cgroups), "memory") {
-		msg := "ailed to find memory cgroup, you may need to add \"cgroup_memory=1 cgroup_enable=memory\" to your linux cmdline (/boot/cmdline.txt on a Raspberry Pi)"
-		logrus.Error("F" + msg)
-		return errors.New("f" + msg)
-	}
-
-	return nil
-}
-
-func validateCgroupsV2() error {
-	manager, err := cgroupsv2.LoadManager("/sys/fs/cgroup", "/")
-	if err != nil {
-		return err
-	}
-	controllers, err := manager.RootControllers()
-	if err != nil {
-		return err
-	}
-	m := make(map[string]struct{})
-	for _, controller := range controllers {
-		m[controller] = struct{}{}
-	}
-	for _, controller := range []string{"cpu", "cpuset", "memory"} {
-		if _, ok := m[controller]; !ok {
-			return fmt.Errorf("failed to find %s cgroup (v2)", controller)
-		}
-	}
-	return nil
-}
-
 func configureNode(ctx context.Context, agentConfig *daemonconfig.Agent, nodes v1.NodeInterface) error {
 	count := 0
 	for {
diff --git a/pkg/agent/syssetup/setup_windows.go b/pkg/agent/syssetup/setup_windows.go
index 4053e2a901..9159978cf1 100644
--- a/pkg/agent/syssetup/setup_windows.go
+++ b/pkg/agent/syssetup/setup_windows.go
@@ -1,3 +1,7 @@
 package syssetup
 
-func Configure() {}
+import kubeproxyconfig "k8s.io/kubernetes/pkg/proxy/apis/config"
+
+func Configure(enableIPv6 bool, config *kubeproxyconfig.KubeProxyConntrackConfiguration) {
+
+}
diff --git a/pkg/cgroups/cgroups_linux.go b/pkg/cgroups/cgroups_linux.go
new file mode 100644
index 0000000000..3f97ba915c
--- /dev/null
+++ b/pkg/cgroups/cgroups_linux.go
@@ -0,0 +1,173 @@
+// +build linux
+
+package cgroups
+
+import (
+	"bufio"
+	"errors"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/containerd/cgroups"
+	cgroupsv2 "github.com/containerd/cgroups/v2"
+	"github.com/rancher/k3s/pkg/version"
+	"github.com/sirupsen/logrus"
+)
+
+func Validate() error {
+	if cgroups.Mode() == cgroups.Unified {
+		return validateCgroupsV2()
+	}
+	return validateCgroupsV1()
+}
+
+func validateCgroupsV1() error {
+	cgroups, err := ioutil.ReadFile("/proc/self/cgroup")
+	if err != nil {
+		return err
+	}
+
+	if !strings.Contains(string(cgroups), "cpuset") {
+		logrus.Warn(`Failed to find cpuset cgroup, you may need to add "cgroup_enable=cpuset" to your linux cmdline (/boot/cmdline.txt on a Raspberry Pi)`)
+	}
+
+	if !strings.Contains(string(cgroups), "memory") {
+		msg := "ailed to find memory cgroup, you may need to add \"cgroup_memory=1 cgroup_enable=memory\" to your linux cmdline (/boot/cmdline.txt on a Raspberry Pi)"
+		logrus.Error("F" + msg)
+		return errors.New("f" + msg)
+	}
+
+	return nil
+}
+
+func validateCgroupsV2() error {
+	manager, err := cgroupsv2.LoadManager("/sys/fs/cgroup", "/")
+	if err != nil {
+		return err
+	}
+	controllers, err := manager.RootControllers()
+	if err != nil {
+		return err
+	}
+	m := make(map[string]struct{})
+	for _, controller := range controllers {
+		m[controller] = struct{}{}
+	}
+	for _, controller := range []string{"cpu", "cpuset", "memory"} {
+		if _, ok := m[controller]; !ok {
+			return fmt.Errorf("failed to find %s cgroup (v2)", controller)
+		}
+	}
+	return nil
+}
+
+func CheckCgroups() (kubeletRoot, runtimeRoot string, hasCFS, hasPIDs bool) {
+	cgroupsModeV2 := cgroups.Mode() == cgroups.Unified
+
+	// For Unified (v2) cgroups we can directly check to see what controllers are mounted
+	// under the unified hierarchy.
+	if cgroupsModeV2 {
+		m, err := cgroupsv2.LoadManager("/sys/fs/cgroup", "/")
+		if err != nil {
+			return "", "", false, false
+		}
+		controllers, err := m.Controllers()
+		if err != nil {
+			return "", "", false, false
+		}
+		// Intentionally using an expressionless switch to match the logic below
+		for _, controller := range controllers {
+			switch {
+			case controller == "cpu":
+				hasCFS = true
+			case controller == "pids":
+				hasPIDs = true
+			}
+		}
+	}
+
+	f, err := os.Open("/proc/self/cgroup")
+	if err != nil {
+		return "", "", false, false
+	}
+	defer f.Close()
+
+	scan := bufio.NewScanner(f)
+	for scan.Scan() {
+		parts := strings.Split(scan.Text(), ":")
+		if len(parts) < 3 {
+			continue
+		}
+		controllers := strings.Split(parts[1], ",")
+		// For v1 or hybrid, controller can be a single value {"blkio"}, or a comounted set {"cpu","cpuacct"}
+		// For v2, controllers = {""} (only contains a single empty string)
+		for _, controller := range controllers {
+			switch {
+			case controller == "name=systemd" || cgroupsModeV2:
+				// If we detect that we are running under a `.scope` unit with systemd
+				// we can assume we are being directly invoked from the command line
+				// and thus need to set our kubelet root to something out of the context
+				// of `/user.slice` to ensure that `CPUAccounting` and `MemoryAccounting`
+				// are enabled, as they are generally disabled by default for `user.slice`
+				// Note that we are not setting the `runtimeRoot` as if we are running with
+				// `--docker`, we will inadvertently move the cgroup `dockerd` lives in
+				// which is not ideal and causes dockerd to become unmanageable by systemd.
+				last := parts[len(parts)-1]
+				i := strings.LastIndex(last, ".scope")
+				if i > 0 {
+					kubeletRoot = "/" + version.Program
+				}
+			case controller == "cpu":
+				// It is common for this to show up multiple times in /sys/fs/cgroup if the controllers are comounted:
+				// as "cpu" and "cpuacct", symlinked to the actual hierarchy at "cpu,cpuacct". Unfortunately the order
+				// listed in /proc/self/cgroups may not be the same order used in /sys/fs/cgroup, so this check
+				// can fail if we use the comma-separated name. Instead, we check for the controller using the symlink.
+				p := filepath.Join("/sys/fs/cgroup", controller, parts[2], "cpu.cfs_period_us")
+				if _, err := os.Stat(p); err == nil {
+					hasCFS = true
+				}
+			case controller == "pids":
+				hasPIDs = true
+			}
+		}
+	}
+
+	// If we're running with v1 and didn't find a scope assigned by systemd, we need to create our own root cgroup to avoid
+	// just inheriting from the parent process. The kubelet will take care of moving us into it when we start it up later.
+	if kubeletRoot == "" {
+		// Examine process ID 1 to see if there is a cgroup assigned to it.
+		// When we are not in a container, process 1 is likely to be systemd or some other service manager.
+		// It either lives at `/` or `/init.scope` according to https://man7.org/linux/man-pages/man7/systemd.special.7.html
+		// When containerized, process 1 will generally be in a cgroup, otherwise, we may be running in
+		// a host PID scenario but we don't support this.
+		g, err := os.Open("/proc/1/cgroup")
+		if err != nil {
+			return "", "", false, false
+		}
+		defer g.Close()
+		scan = bufio.NewScanner(g)
+		for scan.Scan() {
+			parts := strings.Split(scan.Text(), ":")
+			if len(parts) < 3 {
+				continue
+			}
+			controllers := strings.Split(parts[1], ",")
+			// For v1 or hybrid, controller can be a single value {"blkio"}, or a comounted set {"cpu","cpuacct"}
+			// For v2, controllers = {""} (only contains a single empty string)
+			for _, controller := range controllers {
+				switch {
+				case controller == "name=systemd" || cgroupsModeV2:
+					last := parts[len(parts)-1]
+					if last != "/" && last != "/init.scope" {
+						kubeletRoot = "/" + version.Program
+						runtimeRoot = "/" + version.Program
+					}
+				}
+			}
+		}
+	}
+	return kubeletRoot, runtimeRoot, hasCFS, hasPIDs
+}
diff --git a/pkg/cgroups/cgroups_windows.go b/pkg/cgroups/cgroups_windows.go
new file mode 100644
index 0000000000..f5c11dd38f
--- /dev/null
+++ b/pkg/cgroups/cgroups_windows.go
@@ -0,0 +1,11 @@
+// +build windows
+
+package cgroups
+
+func Validate() error {
+	return nil
+}
+
+func CheckCgroups() (kubeletRoot, runtimeRoot string, hasCFS, hasPIDs bool) {
+	return "", "", false, false
+}
diff --git a/pkg/cli/cmds/root.go b/pkg/cli/cmds/root.go
index 13d523b430..ebd536685c 100644
--- a/pkg/cli/cmds/root.go
+++ b/pkg/cli/cmds/root.go
@@ -1,7 +1,6 @@
 package cmds
 
 import (
-	"errors"
 	"fmt"
 	"os"
 	"runtime"
@@ -21,8 +20,6 @@ var (
 	}
 )
 
-var ErrCommandNoArgs = errors.New("this command does not take any arguments")
-
 func init() {
 	// hack - force "file,dns" lookup order if go dns is used
 	if os.Getenv("RES_OPTIONS") == "" {
diff --git a/pkg/cli/etcdsnapshot/etcd_snapshot.go b/pkg/cli/etcdsnapshot/etcd_snapshot.go
index 72038862a5..0b738c0e0f 100644
--- a/pkg/cli/etcdsnapshot/etcd_snapshot.go
+++ b/pkg/cli/etcdsnapshot/etcd_snapshot.go
@@ -15,6 +15,7 @@ import (
 	"github.com/rancher/k3s/pkg/daemons/config"
 	"github.com/rancher/k3s/pkg/etcd"
 	"github.com/rancher/k3s/pkg/server"
+	util2 "github.com/rancher/k3s/pkg/util"
 	"github.com/rancher/wrangler/pkg/signals"
 	"github.com/urfave/cli"
 )
@@ -69,7 +70,7 @@ func run(app *cli.Context, cfg *cmds.Server) error {
 	}
 
 	if len(app.Args()) > 0 {
-		return cmds.ErrCommandNoArgs
+		return util2.ErrCommandNoArgs
 	}
 
 	serverConfig.ControlConfig.DataDir = dataDir
diff --git a/pkg/containerd/utility_linux.go b/pkg/containerd/utility_linux.go
new file mode 100644
index 0000000000..e4d66c8d9a
--- /dev/null
+++ b/pkg/containerd/utility_linux.go
@@ -0,0 +1,16 @@
+// +build linux
+
+package containerd
+
+import (
+	"github.com/containerd/containerd/snapshots/overlay"
+	fuseoverlayfs "github.com/containerd/fuse-overlayfs-snapshotter"
+)
+
+func OverlaySupported(root string) error {
+	return overlay.Supported(root)
+}
+
+func FuseoverlayfsSupported(root string) error {
+	return fuseoverlayfs.Supported(root)
+}
diff --git a/pkg/containerd/utility_windows.go b/pkg/containerd/utility_windows.go
new file mode 100644
index 0000000000..c2172b91e4
--- /dev/null
+++ b/pkg/containerd/utility_windows.go
@@ -0,0 +1,16 @@
+// +build windows
+
+package containerd
+
+import (
+	"github.com/pkg/errors"
+	util2 "github.com/rancher/k3s/pkg/util"
+)
+
+func OverlaySupported(root string) error {
+	return errors.Wrapf(util2.ErrUnsupportedPlatform, "overlayfs is not supported")
+}
+
+func FuseoverlayfsSupported(root string) error {
+	return errors.Wrapf(util2.ErrUnsupportedPlatform, "fuse-overlayfs is not supported")
+}
diff --git a/pkg/daemons/agent/agent.go b/pkg/daemons/agent/agent.go
index b8fa6e2449..cb2d1aac20 100644
--- a/pkg/daemons/agent/agent.go
+++ b/pkg/daemons/agent/agent.go
@@ -1,28 +1,23 @@
 package agent
 
 import (
-	"bufio"
 	"math/rand"
 	"os"
 	"path/filepath"
 	"strings"
 	"time"
 
-	"github.com/containerd/cgroups"
-	cgroupsv2 "github.com/containerd/cgroups/v2"
 	"github.com/opencontainers/runc/libcontainer/system"
+	"github.com/rancher/k3s/pkg/cgroups"
 	"github.com/rancher/k3s/pkg/daemons/config"
 	"github.com/rancher/k3s/pkg/daemons/executor"
 	"github.com/rancher/k3s/pkg/util"
-	"github.com/rancher/k3s/pkg/version"
 	"github.com/sirupsen/logrus"
-	"golang.org/x/sys/unix"
 	"k8s.io/apimachinery/pkg/util/net"
 	"k8s.io/component-base/logs"
-	"k8s.io/kubernetes/pkg/kubeapiserver/authorizer/modes"
-
 	_ "k8s.io/component-base/metrics/prometheus/restclient" // for client metric registration
 	_ "k8s.io/component-base/metrics/prometheus/version"    // for version metric registration
+	"k8s.io/kubernetes/pkg/kubeapiserver/authorizer/modes"
 )
 
 const unixPrefix = "unix://"
@@ -133,7 +128,7 @@ func startKubelet(cfg *config.Agent) error {
 	if err != nil || defaultIP.String() != cfg.NodeIP {
 		argsMap["node-ip"] = cfg.NodeIP
 	}
-	kubeletRoot, runtimeRoot, hasCFS, hasPIDs := CheckCgroups()
+	kubeletRoot, runtimeRoot, hasCFS, hasPIDs := cgroups.CheckCgroups()
 	if !hasCFS {
 		logrus.Warn("Disabling CPU quotas due to missing cpu.cfs_period_us")
 		argsMap["cpu-cfs-quota"] = "false"
@@ -170,20 +165,7 @@ func startKubelet(cfg *config.Agent) error {
 	}
 
 	if cfg.Rootless {
-		// "/sys/fs/cgroup" is namespaced
-		cgroupfsWritable := unix.Access("/sys/fs/cgroup", unix.W_OK) == nil
-		if hasCFS && hasPIDs && cgroupfsWritable {
-			logrus.Info("cgroup v2 controllers are delegated for rootless.")
-			// cgroupfs v2, delegated for rootless by systemd
-			argsMap["cgroup-driver"] = "cgroupfs"
-		} else {
-			logrus.Warn("cgroup v2 controllers are not delegated for rootless. Setting cgroup driver to \"none\".")
-			// flags are from https://github.com/rootless-containers/usernetes/blob/v20190826.0/boot/kubelet.sh
-			argsMap["cgroup-driver"] = "none"
-			argsMap["feature-gates=SupportNoneCgroupDriver"] = "true"
-			argsMap["cgroups-per-qos"] = "false"
-			argsMap["enforce-node-allocatable"] = ""
-		}
+		createRootlessConfig(argsMap, hasCFS, hasPIDs)
 	}
 
 	if cfg.ProtectKernelDefaults {
@@ -216,111 +198,3 @@ func ImageCredProvAvailable(cfg *config.Agent) bool {
 	}
 	return true
 }
-
-func CheckCgroups() (kubeletRoot, runtimeRoot string, hasCFS, hasPIDs bool) {
-	cgroupsModeV2 := cgroups.Mode() == cgroups.Unified
-
-	// For Unified (v2) cgroups we can directly check to see what controllers are mounted
-	// under the unified hierarchy.
-	if cgroupsModeV2 {
-		m, err := cgroupsv2.LoadManager("/sys/fs/cgroup", "/")
-		if err != nil {
-			return "", "", false, false
-		}
-		controllers, err := m.Controllers()
-		if err != nil {
-			return "", "", false, false
-		}
-		// Intentionally using an expressionless switch to match the logic below
-		for _, controller := range controllers {
-			switch {
-			case controller == "cpu":
-				hasCFS = true
-			case controller == "pids":
-				hasPIDs = true
-			}
-		}
-	}
-
-	f, err := os.Open("/proc/self/cgroup")
-	if err != nil {
-		return "", "", false, false
-	}
-	defer f.Close()
-
-	scan := bufio.NewScanner(f)
-	for scan.Scan() {
-		parts := strings.Split(scan.Text(), ":")
-		if len(parts) < 3 {
-			continue
-		}
-		controllers := strings.Split(parts[1], ",")
-		// For v1 or hybrid, controller can be a single value {"blkio"}, or a comounted set {"cpu","cpuacct"}
-		// For v2, controllers = {""} (only contains a single empty string)
-		for _, controller := range controllers {
-			switch {
-			case controller == "name=systemd" || cgroupsModeV2:
-				// If we detect that we are running under a `.scope` unit with systemd
-				// we can assume we are being directly invoked from the command line
-				// and thus need to set our kubelet root to something out of the context
-				// of `/user.slice` to ensure that `CPUAccounting` and `MemoryAccounting`
-				// are enabled, as they are generally disabled by default for `user.slice`
-				// Note that we are not setting the `runtimeRoot` as if we are running with
-				// `--docker`, we will inadvertently move the cgroup `dockerd` lives in
-				// which is not ideal and causes dockerd to become unmanageable by systemd.
-				last := parts[len(parts)-1]
-				i := strings.LastIndex(last, ".scope")
-				if i > 0 {
-					kubeletRoot = "/" + version.Program
-				}
-			case controller == "cpu":
-				// It is common for this to show up multiple times in /sys/fs/cgroup if the controllers are comounted:
-				// as "cpu" and "cpuacct", symlinked to the actual hierarchy at "cpu,cpuacct". Unfortunately the order
-				// listed in /proc/self/cgroups may not be the same order used in /sys/fs/cgroup, so this check
-				// can fail if we use the comma-separated name. Instead, we check for the controller using the symlink.
-				p := filepath.Join("/sys/fs/cgroup", controller, parts[2], "cpu.cfs_period_us")
-				if _, err := os.Stat(p); err == nil {
-					hasCFS = true
-				}
-			case controller == "pids":
-				hasPIDs = true
-			}
-		}
-	}
-
-	// If we're running with v1 and didn't find a scope assigned by systemd, we need to create our own root cgroup to avoid
-	// just inheriting from the parent process. The kubelet will take care of moving us into it when we start it up later.
-	if kubeletRoot == "" {
-		// Examine process ID 1 to see if there is a cgroup assigned to it.
-		// When we are not in a container, process 1 is likely to be systemd or some other service manager.
-		// It either lives at `/` or `/init.scope` according to https://man7.org/linux/man-pages/man7/systemd.special.7.html
-		// When containerized, process 1 will be generally be in a cgroup, otherwise, we may be running in
-		// a host PID scenario but we don't support this.
-		g, err := os.Open("/proc/1/cgroup")
-		if err != nil {
-			return "", "", false, false
-		}
-		defer g.Close()
-		scan = bufio.NewScanner(g)
-		for scan.Scan() {
-			parts := strings.Split(scan.Text(), ":")
-			if len(parts) < 3 {
-				continue
-			}
-			controllers := strings.Split(parts[1], ",")
-			// For v1 or hybrid, controller can be a single value {"blkio"}, or a comounted set {"cpu","cpuacct"}
-			// For v2, controllers = {""} (only contains a single empty string)
-			for _, controller := range controllers {
-				switch {
-				case controller == "name=systemd" || cgroupsModeV2:
-					last := parts[len(parts)-1]
-					if last != "/" && last != "/init.scope" {
-						kubeletRoot = "/" + version.Program
-						runtimeRoot = "/" + version.Program
-					}
-				}
-			}
-		}
-	}
-	return kubeletRoot, runtimeRoot, hasCFS, hasPIDs
-}
diff --git a/pkg/daemons/agent/agent_linux.go b/pkg/daemons/agent/agent_linux.go
new file mode 100644
index 0000000000..3f1a6bf07b
--- /dev/null
+++ b/pkg/daemons/agent/agent_linux.go
@@ -0,0 +1,25 @@
+// +build linux
+
+package agent
+
+import (
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+)
+
+func createRootlessConfig(argsMap map[string]string, hasCFS, hasPIDs bool) {
+	// "/sys/fs/cgroup" is namespaced
+	cgroupfsWritable := unix.Access("/sys/fs/cgroup", unix.W_OK) == nil
+	if hasCFS && hasPIDs && cgroupfsWritable {
+		logrus.Info("cgroup v2 controllers are delegated for rootless.")
+		// cgroupfs v2, delegated for rootless by systemd
+		argsMap["cgroup-driver"] = "cgroupfs"
+	} else {
+		logrus.Warn("cgroup v2 controllers are not delegated for rootless. Setting cgroup driver to \"none\".")
+		// flags are from https://github.com/rootless-containers/usernetes/blob/v20190826.0/boot/kubelet.sh
+		argsMap["cgroup-driver"] = "none"
+		argsMap["feature-gates=SupportNoneCgroupDriver"] = "true"
+		argsMap["cgroups-per-qos"] = "false"
+		argsMap["enforce-node-allocatable"] = ""
+	}
+}
diff --git a/pkg/daemons/agent/agent_windows.go b/pkg/daemons/agent/agent_windows.go
new file mode 100644
index 0000000000..a995e8c707
--- /dev/null
+++ b/pkg/daemons/agent/agent_windows.go
@@ -0,0 +1,6 @@
+// +build windows
+
+package agent
+
+func createRootlessConfig(argsMap map[string]string, hasCfs, hasPIDs bool) {
+}
diff --git a/pkg/util/errors.go b/pkg/util/errors.go
new file mode 100644
index 0000000000..92a648be8a
--- /dev/null
+++ b/pkg/util/errors.go
@@ -0,0 +1,6 @@
+package util
+
+import "errors"
+
+var ErrCommandNoArgs = errors.New("this command does not take any arguments")
+var ErrUnsupportedPlatform = errors.New("unsupported platform")