update entrypoint

2022-05-10 14:35:32 -07:00 · 2022-05-10 14:35:32 -07:00 · 31c4c8bced
parent 04f627d5e7
commit 31c4c8bced
1 changed files with 136 additions and 34 deletions
--- a/deploy/kicbase/entrypoint
+++ b/deploy/kicbase/entrypoint
@ -28,6 +28,11 @@ if grep -Eqv "0[[:space:]]+0[[:space:]]+4294967295" /proc/self/uid_map; then
  echo 'INFO: running in a user namespace (experimental)'
 fi

+grep_allow_nomatch() {
+  # grep exits 0 on match, 1 on no match, 2 on error
+  grep "$@" || [[ $? == 1 ]]
+}
+
 validate_userns() {
  if [[ -z "${userns}" ]]; then
    return
@ -40,22 +45,81 @@ validate_userns() {
    echo "WARN: UserNS: expected RLIMIT_NOFILE to be at least ${nofile_hard_expected}, got ${nofile_hard}" >&2
  fi

-  if [[ ! -f "/sys/fs/cgroup/cgroup.controllers" ]]; then
-    echo "ERROR: UserNS: cgroup v2 needs to be enabled, see https://rootlesscontaine.rs/getting-started/common/cgroup2/" >&2
-    exit 1
+  if [[ -f "/sys/fs/cgroup/cgroup.controllers" ]]; then
+    for f in cpu memory pids; do
+      if ! grep -qw $f /sys/fs/cgroup/cgroup.controllers; then
+        echo "ERROR: UserNS: $f controller needs to be delegated" >&2
+        exit 1
+      fi
+    done
  fi
-  for f in cpu memory pids; do
-    if ! grep -qw $f /sys/fs/cgroup/cgroup.controllers; then
-      echo "ERROR: UserNS: $f controller needs to be delegated, see https://rootlesscontaine.rs/getting-started/common/cgroup2/" >&2
-    exit 1
-    fi
-  done
+}
+
+overlayfs_preferrable() {
+	# Returns 0 if overlayfs should be used, 1 if it should be avoided.
+	# Outside userns we can always assume overlayfs is preferable.
+	local tmp
+	[[ -n "$userns" ]] || return 0
+
+	# Debian 10 and 11 support overlayfs in userns with a "permit_mounts_in_userns" kernel patch,
+	# but it is known to be unstable, so we avoid using it https://github.com/moby/moby/issues/42302
+	if [[ -e "/sys/module/overlay/parameters/permit_mounts_in_userns" ]]; then
+		echo "INFO: UserNS: kernel seems supporting overlayfs with permit_mounts_in_userns, but avoiding due to instability."
+		return 1
+	fi
+
+	# Check overlayfs availability, by attempting to mount it on a scratch directory.
+	# mktemp is checked explicitly: errexit is suspended when this runs inside a condition.
+	# Overlayfs inside userns is known to be available for the following environments:
+	# - Kernel >= 5.11 (but 5.11 and 5.12 have issues on SELinux hosts. Fixed in 5.13.)
+	# - Ubuntu kernel
+	# - Debian kernel (but avoided due to instability, see the /sys/module/overlay/... check above)
+	# - Sysbox
+	tmp=$(mktemp -d) || return 1
+	mkdir -p "${tmp}/l" "${tmp}/u" "${tmp}/w" "${tmp}/m"
+	if ! mount -t overlay -o lowerdir="${tmp}/l,upperdir=${tmp}/u,workdir=${tmp}/w" overlay "${tmp}/m"; then
+		echo "INFO: UserNS: kernel does not seem to support overlayfs."
+		rm -rf -- "${tmp}"
+		return 1
+	fi
+	umount "${tmp}/m"
+	rm -rf -- "${tmp}"
+
+	# Detect whether SELinux is Enforcing (or Permissive) by grepping /proc/self/attr/current .
+	# Note that we cannot use `getenforce` command here because /sys/fs/selinux is typically not mounted for containers.
+	if grep -q "_t:" "/proc/self/attr/current"; then
+		# When the kernel is before v5.13 and SELinux is enforced, fuse-overlayfs might be safer, so we print a warning (but not an error).
+		# https://github.com/torvalds/linux/commit/7fa2e79a6bb924fa4b2de5766dab31f0f47b5ab6
+		echo "WARN: UserNS: SELinux might be Enforcing. If you see an error related to overlayfs, try setting \`KIND_EXPERIMENTAL_CONTAINERD_SNAPSHOTTER=fuse-overlayfs\` ." >&2
+	fi
+	return 0
 }

 configure_containerd() {
-  # we need to switch to the 'native' snapshotter on zfs
-  if [[ "$(stat -f -c %T /kind)" == 'zfs' ]]; then
-    sed -i 's/snapshotter = "overlayfs"/snapshotter = "native"/' /etc/containerd/config.toml
+  # Selects the containerd snapshotter and rewrites /etc/containerd/config.toml accordingly.
+  # KIND_EXPERIMENTAL_CONTAINERD_SNAPSHOTTER, when set, overrides the automatic choice below.
+  local snapshotter=${KIND_EXPERIMENTAL_CONTAINERD_SNAPSHOTTER:-}
+  if [[ -n "$userns" ]]; then
+    # userns (rootless) configs: adjust oomScoreAdj
+    sed -i 's/restrict_oom_score_adj = false/restrict_oom_score_adj = true/' /etc/containerd/config.toml
+
+    # Use fuse-overlayfs if overlayfs is not preferrable: https://github.com/kubernetes-sigs/kind/issues/2275
+    if [[ -z "$snapshotter" ]] && ! overlayfs_preferrable; then
+      snapshotter="fuse-overlayfs"
+    fi
+  else
+    # we need to switch to the 'native' snapshotter on zfs
+    if [[ -z "$snapshotter" ]] && [[ "$(stat -f -c %T /kind)" == 'zfs' ]]; then
+      snapshotter="native"
+    fi
+  fi
+  if [[ -n "$snapshotter" ]]; then
+    echo "INFO: changing snapshotter from \"overlayfs\" to \"$snapshotter\""
+    sed -i "s/snapshotter = \"overlayfs\"/snapshotter = \"$snapshotter\"/" /etc/containerd/config.toml
+    if [[ "$snapshotter" = "fuse-overlayfs" ]]; then
+      echo 'INFO: enabling containerd-fuse-overlayfs service'
+      systemctl enable containerd-fuse-overlayfs
+    fi
  fi
 }

@ -106,15 +170,19 @@ fix_mount() {
    sync
  fi

-  if [[ -z "${userns}" ]]; then
-    echo 'INFO: remounting /sys read-only'
-    # systemd-in-a-container should have read only /sys
-    # https://systemd.io/CONTAINER_INTERFACE/
-    # however, we need other things from `docker run --privileged` ...
-    # and this flag also happens to make /sys rw, amongst other things
-    #
-    # This step is skipped when running inside UserNS, because it fails with EACCES.
-    mount -o remount,ro /sys
+  echo 'INFO: remounting /sys read-only'
+  # systemd-in-a-container should have read only /sys
+  # https://systemd.io/CONTAINER_INTERFACE/
+  # however, we need other things from `docker run --privileged` ...
+  # and this flag also happens to make /sys rw, amongst other things
+  #
+  # A failure of this step is ignored when running inside UserNS, because the remount fails with EACCES there.
+  if ! mount -o remount,ro /sys; then
+    if [[ -n "$userns" ]]; then
+      echo 'INFO: UserNS: ignoring mount fail' >&2
+    else
+      exit 1
+    fi
  fi

  echo 'INFO: making mounts shared' >&2
@ -163,20 +231,55 @@ fix_cgroup() {
    return
  fi
  echo 'INFO: detected cgroup v1'
-  echo 'INFO: fix cgroup mounts for all subsystems'
+  local current_cgroup
+  current_cgroup=$(grep -E '^[^:]*:([^:]*,)?cpu(,[^,:]*)?:.*' /proc/self/cgroup | cut -d: -f3)
+  if [ "$current_cgroup" = "/" ]; then
+    echo "INFO: cgroupns detected, no need to fix cgroups"
+    return
+  fi
+
+  # NOTE The rest of this function deals with the unfortunate situation of
+  # cgroup v1 with no cgroupns enabled. One fine day every user will have
+  # cgroupns enabled (or switch to cgroup v2, which has it enabled by default).
+  # Once that happens, this function can be removed completely.
+
+  echo 'WARN: cgroupns not enabled! Please use cgroup v2, or cgroup v1 with cgroupns enabled.'
+
  # See: https://d2iq.com/blog/running-kind-inside-a-kubernetes-cluster-for-continuous-integration
  # Capture initial state before modifying
  #
  # Basically we're looking for the cgroup-path for the cpu controller for the
  # current process. this tells us what cgroup-path the container is in.
  # Then we collect the subsystems that are active on this path.
-  # We assume the cpu controller is in use on all node containers.
+  # We assume the cpu controller is in use on all node containers,
+  # and other controllers use the same sub-path.
  #
  # See: https://man7.org/linux/man-pages/man7/cgroups.7.html
-  local current_cgroup
-  current_cgroup=$(grep -E '^[^:]*:([^:]*,)?cpu(,[^,:]*)?:.*' /proc/self/cgroup | cut -d: -f3)
+  echo 'INFO: fix cgroup mounts for all subsystems'
  local cgroup_subsystems
  cgroup_subsystems=$(findmnt -lun -o source,target -t cgroup | grep "${current_cgroup}" | awk '{print $2}')
+  # Unmount the cgroup subsystems that are not known to the runtime used to
+  # run the container we are in. Those subsystems are not properly scoped
+  # (i.e. the root cgroup is exposed, rather than something like docker/xxxx).
+  # In case a runtime (which is aware of more subsystems -- such as rdma,
+  # misc, or unified) is used inside the container, it may create cgroups for
+  # these subsystems, and as they are not scoped, they will leak to the host
+  # and thus will become non-removable.
+  #
+  # See https://github.com/kubernetes/kubernetes/issues/109182
+  local unsupported_cgroups
+  unsupported_cgroups=$(findmnt -lun -o source,target -t cgroup | grep_allow_nomatch -v "${current_cgroup}" | awk '{print $2}')
+  if [ -n "$unsupported_cgroups" ]; then
+    local mnt
+    echo "$unsupported_cgroups" |
+    while IFS= read -r mnt; do
+      echo "INFO: unmounting and removing $mnt"
+      umount "$mnt" || true
+      rmdir "$mnt" || true
+    done
+  fi
+
+
  # For each cgroup subsystem, Docker does a bind mount from the current
  # cgroup to the root of the cgroup subsystem. For instance:
  #   /sys/fs/cgroup/memory/docker/<cid> -> /sys/fs/cgroup/memory
@ -214,10 +317,12 @@ fix_cgroup() {
  # "nesting" clusters, unless we instruct it to use a different cgroup root.
  # We do this, and when doing so we must fixup this alternative root
  # currently this is hardcoded to be /kubelet
+  # under systemd cgroup driver, kubelet appends .slice
  mount --make-rprivate /sys/fs/cgroup
  echo "${cgroup_subsystems}" |
  while IFS= read -r subsystem; do
    mount_kubelet_cgroup_root "/kubelet" "${subsystem}"
+    mount_kubelet_cgroup_root "/kubelet.slice" "${subsystem}"
  done
 }

@ -287,18 +392,15 @@ fix_kmsg() {
 }

 select_iptables() {
-  # based on: https://github.com/kubernetes/kubernetes/blob/ffe93b3979486feb41a0f85191bdd189cbd56ccc/build/debian-iptables/iptables-wrapper
-  local mode=nft
-  num_legacy_lines=$( (iptables-legacy-save || true; ip6tables-legacy-save || true) 2>/dev/null | grep '^-' | wc -l || true)
-  if [ "${num_legacy_lines}" -ge 10 ]; then
+  # based on: https://github.com/kubernetes-sigs/iptables-wrappers/blob/97b01f43a8e8db07840fc4b95e833a37c0d36b12/iptables-wrapper-installer.sh
+  local mode num_legacy_lines num_nft_lines
+  num_legacy_lines=$( (iptables-legacy-save || true; ip6tables-legacy-save || true) 2>/dev/null | grep -c '^-' || true)
+  num_nft_lines=$( (timeout 5 sh -c "iptables-nft-save; ip6tables-nft-save" || true) 2>/dev/null | grep -c '^-' || true)
+  if [ "${num_legacy_lines}" -ge "${num_nft_lines}" ]; then
    mode=legacy
  else
-    num_nft_lines=$( (timeout 5 sh -c "iptables-nft-save; ip6tables-nft-save" || true) 2>/dev/null | grep '^-' | wc -l || true)
-    if [ "${num_legacy_lines}" -ge "${num_nft_lines}" ]; then
-      mode=legacy
-    fi
+    mode=nft
  fi
-
  echo "INFO: setting iptables to detected mode: ${mode}" >&2
  update-alternatives --set iptables "/usr/sbin/iptables-${mode}" > /dev/null
  update-alternatives --set ip6tables "/usr/sbin/ip6tables-${mode}" > /dev/null