diff --git a/images/base/Dockerfile b/images/base/Dockerfile
index dd1fd03577..807455a65b 100644
--- a/images/base/Dockerfile
+++ b/images/base/Dockerfile
@@ -96,6 +96,7 @@ RUN echo "Ensuring scripts are executable ..." \
       libseccomp2 pigz \
       bash ca-certificates curl rsync \
       nfs-common \
+      jq \
     && find /lib/systemd/system/sysinit.target.wants/ -name "systemd-tmpfiles-setup.service" -delete \
     && rm -f /lib/systemd/system/multi-user.target.wants/* \
     && rm -f /etc/systemd/system/*.wants/* \
diff --git a/images/base/files/etc/containerd/config.toml b/images/base/files/etc/containerd/config.toml
index 7fa009b89f..d300800147 100644
--- a/images/base/files/etc/containerd/config.toml
+++ b/images/base/files/etc/containerd/config.toml
@@ -6,6 +6,8 @@ version = 2
   default_runtime_name = "runc"
 [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
   runtime_type = "io.containerd.runc.v2"
+[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
+  BinaryName = "runc"
 
 # Setup a runtime with the magic name ("test-handler") used for Kubernetes
 # runtime class tests ...
@@ -20,3 +22,5 @@ version = 2
   tolerate_missing_hugepages_controller = true
   # explicitly use default snapshotter so we can sed it in entrypoint
   snapshotter = "overlayfs"
+  # restrict_oom_score_adj needs to be true when running inside UserNS (rootless)
+  restrict_oom_score_adj = false
diff --git a/images/base/files/usr/local/bin/entrypoint b/images/base/files/usr/local/bin/entrypoint
index 1eb80b2caa..e205dbee3c 100755
--- a/images/base/files/usr/local/bin/entrypoint
+++ b/images/base/files/usr/local/bin/entrypoint
@@ -18,11 +18,129 @@ set -o errexit
 set -o nounset
 set -o pipefail
 
+# If /proc/self/uid_map contains the single mapping "0 0 4294967295", we are in the initial user namespace, i.e. the host.
+# Otherwise we are in a non-initial user namespace.
+# https://github.com/opencontainers/runc/blob/v1.0.0-rc92/libcontainer/system/linux.go#L109-L118
+userns=""
+if grep -Eqv "0[[:space:]]+0[[:space:]]+4294967295" /proc/self/uid_map; then
+  userns="1"
+  echo 'INFO: running in a user namespace (experimental)'
+fi
+
+# Fail fast when running in a user namespace on a host that does not meet the
+# requirements checked below (rlimit, sysctls, cgroup v2 with delegated controllers).
+validate_userns() {
+  if [[ -z "${userns}" ]]; then
+    return
+  fi
+
+  local nofile_hard
+  nofile_hard="$(ulimit -Hn)"
+  local nofile_hard_expected="64000"
+  if [[ "${nofile_hard}" -lt "${nofile_hard_expected}" ]]; then
+    # This ERROR can be demoted to WARNING when k/k PR gets merged: https://github.com/kubernetes/kubernetes/pull/92863
+    echo "ERROR: UserNS: expected RLIMIT_NOFILE to be at least ${nofile_hard_expected}, got ${nofile_hard}" >&2
+    exit 1
+  fi
+
+  if ! [ -f "/proc/sys/net/netfilter/nf_conntrack_max" ]; then
+    echo "ERROR: UserNS: /proc/sys/net/netfilter/nf_conntrack_max does not exist (needs kernel 5.7 or later)" >&2
+    exit 1
+  fi
+  local nf_conntrack_max
+  nf_conntrack_max="$(cat /proc/sys/net/netfilter/nf_conntrack_max)"
+  local nf_conntrack_max_expected="131072"
+  if [[ "${nf_conntrack_max}" != "${nf_conntrack_max_expected}" ]]; then
+    # This ERROR can be demoted to WARNING when k/k PR gets merged: https://github.com/kubernetes/kubernetes/pull/92863
+    echo "ERROR: UserNS: expected net.netfilter.nf_conntrack_max to be ${nf_conntrack_max_expected}, got ${nf_conntrack_max}" >&2
+    exit 1
+  fi
+
+  local dmesg_restrict
+  dmesg_restrict="$(cat /proc/sys/kernel/dmesg_restrict)"
+  if [[ "${dmesg_restrict}" != "0" ]]; then
+    # This ERROR can be probably demoted to WARNING after analysis of this issue: https://github.com/rootless-containers/usernetes/issues/204
+    echo "ERROR: UserNS: expected kernel.dmesg_restrict to be 0, got ${dmesg_restrict}" >&2
+    exit 1
+  fi
+  if [[ ! -f "/sys/fs/cgroup/cgroup.controllers" ]]; then
+    echo "ERROR: UserNS: cgroup v2 needs to be enabled" >&2
+    exit 1
+  fi
+  local f
+  for f in cpu memory pids; do
+    if ! grep -qw "$f" /sys/fs/cgroup/cgroup.controllers; then
+      echo "ERROR: UserNS: $f controller needs to be delegated" >&2
+      exit 1
+    fi
+  done
+}
+
+# Write content ($2) to a file under /run/fake and bind-mount it over path ($1).
+fake_file_with_content(){
+  local path="$1"
+  local content="$2"
+  local base="/run/fake"
+  local fake_path="${base}/${path}"
+  mkdir -p "$(dirname "${fake_path}")"
+  echo "INFO: UserNS: faking ${path} to be \"${content}\" (writable)"
+  echo "${content}" > "${fake_path}"
+  mount --bind "${fake_path}" "${path}"
+}
+
+# Replace the sysctl named $1 (dotted form, e.g. "kernel.panic") with a fake file
+# holding its current value; a sysctl whose /proc/sys file does not exist is skipped.
+fake_sysctl() {
+  local key="$1"
+  local key_slash
+  # shellcheck disable=SC2001
+  key_slash="$(echo "${key}" | sed -e s@\\.@/@g)"
+  local path="/proc/sys/${key_slash}"
+  if [[ -f "${path}" ]]; then
+    local content
+    content="$(cat "${path}")"
+    fake_file_with_content "${path}" "${content}"
+  fi
+}
+
 configure_containerd() {
   # we need to switch to the 'native' snapshotter on zfs
   if [[ "$(stat -f -c %T /kind)" == 'zfs' ]]; then
     sed -i 's/snapshotter = "overlayfs"/snapshotter = "native"/' /etc/containerd/config.toml
   fi
+
+  # userns (rootless) configs
+  if [[ -n "$userns" ]]; then
+    # Adjust oomScoreAdj
+    sed -i 's/restrict_oom_score_adj = false/restrict_oom_score_adj = true/' /etc/containerd/config.toml
+
+    # mounting overlayfs inside userns requires patching kernel.
+    # Ubuntu kernel is patched by default.
+    # Debian kernel is patched by default as well, but Debian needs `sudo modprobe overlay permit_mounts_in_userns=1`.
+    local tmp
+    tmp=$(mktemp -d)
+    mkdir -p "${tmp}"/{l,u,w,m}
+    if mount -t overlay overlay -o "lowerdir=${tmp}/l,upperdir=${tmp}/u,workdir=${tmp}/w" "${tmp}/m"; then
+      umount "${tmp}/m"
+    else
+      echo 'INFO: UserNS: this kernel does not support mounting overlayfs inside userns. Disabling overlayfs'
+      sed -i 's/snapshotter = "overlayfs"/snapshotter = "native"/' /etc/containerd/config.toml
+    fi
+    rm -rf "${tmp}"
+
+    # To run vanilla kubelet inside UserNS, we need to fake several unwritable sysctl to be writable.
+    # Workaround until https://github.com/kubernetes/kubernetes/pull/92863 gets merged in the upstream.
+    fake_sysctl "vm.overcommit_memory"
+    fake_sysctl "vm.panic_on_oom"
+    fake_sysctl "kernel.panic"
+    fake_sysctl "kernel.panic_on_oops"
+    fake_sysctl "kernel.keys.root_maxkeys"
+    fake_sysctl "kernel.keys.root_maxbytes"
+
+    # Wrap runc to mount fake "/sys/module/nf_conntrack/parameters/hashsize" for kube-proxy.
+    # Workaround until https://github.com/kubernetes/kubernetes/pull/92863 gets merged in the upstream.
+    sed -i 's/BinaryName = "runc"/BinaryName = "userns-ociwrapper"/' /etc/containerd/config.toml
+  fi
 }
 
 configure_proxy() {
@@ -50,12 +168,16 @@ fix_mount() {
     sync
   fi
 
-  echo 'INFO: remounting /sys read-only'
-  # systemd-in-a-container should have read only /sys
-  # https://systemd.io/CONTAINER_INTERFACE/
-  # however, we need other things from `docker run --privileged` ...
-  # and this flag also happens to make /sys rw, amongst other things
-  mount -o remount,ro /sys
+  if [[ -z "${userns}" ]]; then
+    echo 'INFO: remounting /sys read-only'
+    # systemd-in-a-container should have read only /sys
+    # https://systemd.io/CONTAINER_INTERFACE/
+    # however, we need other things from `docker run --privileged` ...
+    # and this flag also happens to make /sys rw, amongst other things
+    #
+    # This step is skipped when running inside UserNS, because it fails with EACCES.
+    mount -o remount,ro /sys
+  fi
 
   echo 'INFO: making mounts shared' >&2
   # for mount propagation
@@ -299,6 +421,9 @@ enable_network_magic(){
   fi
 }
 
+# validate state
+validate_userns
+
 # run pre-init fixups
 # NOTE: it's important that we do configure* first in this order to avoid races
 configure_containerd
diff --git a/images/base/files/usr/local/bin/userns-ociwrapper b/images/base/files/usr/local/bin/userns-ociwrapper
new file mode 100755
index 0000000000..0b03412b38
--- /dev/null
+++ b/images/base/files/usr/local/bin/userns-ociwrapper
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Copyright 2020 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -o errexit
+set -o nounset
+set -o pipefail
+
+RUNTIME="runc"
+
+bundle="."
+bundle_flag=""
+# Find the bundle directory: it is the argument following -b / --bundle (default ".").
+for f in "$@"; do
+  if [[ -n $bundle_flag ]]; then
+    bundle=$f
+    break
+  else
+    # FIXME: support `--bundle=STRING` as well
+    case "$f" in
+    -b | --bundle)
+      bundle_flag=$f
+      ;;
+    esac
+  fi
+done
+
+if [ -f "${bundle}/config.json" ]; then
+  # kube-proxy wants to read "/sys/module/nf_conntrack/parameters/hashsize", but it fails with EACCES when running inside userns.
+  # So we bind-mount a fake file.
+  # Workaround until https://github.com/kubernetes/kubernetes/pull/92863 gets merged
+  echo "65536" >"/run/nf_conntrack_fake_hashsize"
+  q='.mounts += [{"destination": "/sys/module/nf_conntrack/parameters/hashsize", "source": "/run/nf_conntrack_fake_hashsize", "type": "none", "options": ["bind"]}]'
+  tmp="$(mktemp "${bundle}/config.json.XXXXXXXX")"
+  jq "$q" <"${bundle}/config.json" >"$tmp"
+  mv "$tmp" "${bundle}/config.json"
+fi
+
+exec "$RUNTIME" "$@"
diff --git a/site/content/docs/user/rootless.md b/site/content/docs/user/rootless.md
new file mode 100644
index 0000000000..fa9766fab7
--- /dev/null
+++ b/site/content/docs/user/rootless.md
@@ -0,0 +1,54 @@
+---
+title: "Running kind with Rootless Docker"
+menu:
+  main:
+    parent: "user"
+    identifier: "rootless"
+    weight: 3
+---
+Starting with kind 0.11.0 and Docker 20.10, Rootless Docker can be used as the node provider of kind.
+
+Rootless Podman is not supported at the moment.
+
+## Host requirements
+### Kernel
+The kernel needs to be 5.7 or later currently.
+In the future, we may be able to support a broader range of kernel versions.
+
+### cgroup v2
+The host needs to be running with cgroup v2.
+
+cgroup v2 is enabled by default on Fedora.
+On other distros, cgroup v2 can typically be enabled by adding `GRUB_CMDLINE_LINUX="systemd.unified_cgroup_hierarchy=1"` to `/etc/default/grub` and
+running `sudo update-grub`.
+
+Also, depending on the host configuration, the following steps might be needed:
+
+- Create `/etc/systemd/system/user@.service.d/delegate.conf` with the following content, and then run `sudo systemctl daemon-reload`:
+```ini
+[Service]
+Delegate=yes
+```
+
+- Create `/etc/sysctl.d/99-rootless.conf` with the following content, and then run `sudo sysctl --system`:
+```
+net.netfilter.nf_conntrack_max=131072
+kernel.dmesg_restrict=0
+```
+
+## Restrictions
+
+The restrictions of Rootless Docker apply to kind clusters as well.
+
+e.g.
+- OverlayFS cannot be used unless the host is using kernel >= 5.11, or Ubuntu/Debian kernel
+- Cannot mount block storage
+- Cannot mount NFS
+
+## Creating a kind cluster with Rootless Docker
+
+To create a kind cluster with Rootless Docker, just run:
+```console
+$ export DOCKER_HOST=unix://${XDG_RUNTIME_DIR}/docker.sock
+$ kind create cluster
+```