From 9709d019667ee195f5b9e48fba88429d757c74f2 Mon Sep 17 00:00:00 2001 From: David Leadbeater Date: Tue, 13 Feb 2024 02:35:42 +0000 Subject: [PATCH] Make /proc/sys read-only with carve-outs for some sysctls This mounts a read-write version of /proc and /sys under /kind/private, which allows bind mounting and also makes use cases that need an unmasked proc or sys possible. /proc/sys is bind mounted read only per the systemd container interface[1]. Then some sysctls are made writable again by bind mounting across from the private /proc which was mounted. This may cause issues for privileged daemonsets which set sysctls which aren't namespaced (this may work anyway as often they set them to the same value on multiple nodes). That can be worked around by adding additional bind mounts via docker exec, making it clear kind can't support such interfaces and they might leak from the container. [1]: https://systemd.io/CONTAINER_INTERFACE/ --- images/base/Dockerfile | 5 ++++ images/base/files/usr/local/bin/entrypoint | 30 +++++++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/images/base/Dockerfile b/images/base/Dockerfile index e9440cb1f1..8d8ed8d7f6 100644 --- a/images/base/Dockerfile +++ b/images/base/Dockerfile @@ -97,6 +97,11 @@ RUN echo "Enabling / Disabling services ... " \ RUN echo "Ensuring /etc/kubernetes/manifests" \ && mkdir -p /etc/kubernetes/manifests +# Used as mount points for private copies of proc and sys filesystems in entrypoint. 
+RUN echo "Ensuring /kind/private" \ + && mkdir -p /kind/private/proc /kind/private/sys \ + && chmod 0700 /kind/private /kind/private/proc /kind/private/sys + # shared stage to setup go version for building binaries # NOTE we will be cross-compiling for performance reasons # This is also why we start again FROM the same base image but a different diff --git a/images/base/files/usr/local/bin/entrypoint b/images/base/files/usr/local/bin/entrypoint index 015036481c..c819527c98 100755 --- a/images/base/files/usr/local/bin/entrypoint +++ b/images/base/files/usr/local/bin/entrypoint @@ -172,13 +172,24 @@ fix_mount() { sync fi + # Mount sysfs and proc read-write at a known, kind-specific location. + # This allows the bind mounts below, and is also required to run some workloads + # which need to mount proc and sysfs themselves (this avoids the proc and + # sysfs mounts being "masked", as far as the kernel is concerned). + # TODO: find a better reference for fs_fully_visible than this kernel commit: + # https://github.com/torvalds/linux/commit/1b852bceb0d1 + log_info 'mounting /kind/private filesystems' + mount -t sysfs -o rw sysfs /kind/private/sys + mount -t proc -o rw proc /kind/private/proc + log_info 'remounting /sys read-only' # systemd-in-a-container should have read only /sys # https://systemd.io/CONTAINER_INTERFACE/ # however, we need other things from `docker run --privileged` ... # and this flag also happens to make /sys rw, amongst other things # - # This step is ignored when running inside UserNS, because it fails with EACCES. + # A failure of this step is ignored when running inside UserNS, because it can + # fail with EACCES. if ! 
mount -o remount,ro /sys; then if [[ -n "$userns" ]]; then log_info 'UserNS: ignoring mount fail' @@ -187,6 +198,23 @@ fix_mount() { fi fi + log_info 'making /proc/sys read-only, with known sysctls read-write' + mount --rbind -o ro /proc/sys /proc/sys + # These are the sysctls known to be namespaced in the kernel, list taken from Kubernetes: + # https://github.com/kubernetes/kubernetes/blob/master/staging/src/k8s.io/component-helpers/node/util/sysctl/namespace.go + # In addition, the kubelet attempts to set some sysctls to particular values; we allow those: + # https://github.com/search?q=repo%3Akubernetes/kubernetes%20setupKernelTunables&type=code + for mount_point in \ + kernel/shmall kernel/shmmax kernel/shmmni kernel/shm_rmid_forced kernel/msgmax kernel/msgmnb kernel/msgmni \ + fs/mqueue \ + net \ + vm/overcommit_memory vm/panic_on_oom kernel/panic kernel/panic_on_oops \ + kernel/keys/root_maxkeys kernel/keys/root_maxbytes; do + if [[ -e /kind/private/proc/sys/"${mount_point}" ]]; then # -e rather than -f: fs/mqueue and net are directories, which -f would skip + mount --bind -o rw /kind/private/proc/sys/"${mount_point}" /proc/sys/"${mount_point}" + fi + done + log_info 'making mounts shared' # for mount propagation mount --make-rshared /