diff --git a/images/base/Dockerfile b/images/base/Dockerfile index c68ed05088..2358cdb5cd 100644 --- a/images/base/Dockerfile +++ b/images/base/Dockerfile @@ -80,6 +80,7 @@ RUN echo "Ensuring scripts are executable ..." \ libseccomp2 pigz \ bash ca-certificates curl rsync \ nfs-common \ + jq \ && find /lib/systemd/system/sysinit.target.wants/ -name "systemd-tmpfiles-setup.service" -delete \ && rm -f /lib/systemd/system/multi-user.target.wants/* \ && rm -f /etc/systemd/system/*.wants/* \ diff --git a/images/base/files/etc/containerd/config.toml b/images/base/files/etc/containerd/config.toml index 7fa009b89f..d300800147 100644 --- a/images/base/files/etc/containerd/config.toml +++ b/images/base/files/etc/containerd/config.toml @@ -6,6 +6,8 @@ version = 2 default_runtime_name = "runc" [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] runtime_type = "io.containerd.runc.v2" +[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] + BinaryName = "runc" # Setup a runtime with the magic name ("test-handler") used for Kubernetes # runtime class tests ... 
@@ -20,3 +22,5 @@ version = 2
   tolerate_missing_hugepages_controller = true
   # explicitly use default snapshotter so we can sed it in entrypoint
   snapshotter = "overlayfs"
+  # restrict_oom_score_adj needs to be true when running inside UserNS (rootless)
+  restrict_oom_score_adj = false
diff --git a/images/base/files/usr/local/bin/entrypoint b/images/base/files/usr/local/bin/entrypoint
index 13839718a6..fcf8c82dea 100755
--- a/images/base/files/usr/local/bin/entrypoint
+++ b/images/base/files/usr/local/bin/entrypoint
@@ -18,11 +18,108 @@ set -o errexit
 set -o nounset
 set -o pipefail
 
+# Detect whether we are running inside a user namespace (e.g. Rootless Docker).
+# The initial user namespace maps the whole uid range onto itself ("0 0 4294967295");
+# any uid_map line that differs from that means we are inside a user namespace.
+userns=""
+if grep -Eqv "0[[:space:]]+0[[:space:]]+4294967295" /proc/self/uid_map; then
+  userns="1"
+  echo 'INFO: running in user namespace (experimental)'
+fi
+
+# validate_userns checks the host settings that are required when running inside a
+# user namespace, and exits with status 1 if they are not satisfied.
+# It is a no-op outside a user namespace.
+validate_userns() {
+  if [[ -z "${userns}" ]]; then
+    return
+  fi
+  local nofile_hard
+  nofile_hard="$(ulimit -Hn)"
+  local nofile_hard_expected="64000"
+  if [[ "${nofile_hard}" -lt "${nofile_hard_expected}" ]]; then
+    # This ERROR can be demoted to WARNING when k/k PR gets merged: https://github.com/kubernetes/kubernetes/pull/92863
+    echo "ERROR: expected RLIMIT_NOFILE to be at least ${nofile_hard_expected}, got ${nofile_hard}" >&2
+    exit 1
+  fi
+  local dmesg_restrict
+  dmesg_restrict="$(cat /proc/sys/kernel/dmesg_restrict)"
+  if [[ "${dmesg_restrict}" != "0" ]]; then
+    echo "ERROR: expected kernel.dmesg_restrict to be 0, got ${dmesg_restrict}" >&2
+    exit 1
+  fi
+}
+
+# fake_file_with_content bind-mounts a writable file containing $2 over the path $1.
+# The backing file is kept under /run/fake, mirroring the original path.
+fake_file_with_content() {
+  local path="$1"
+  local content="$2"
+  local base="/run/fake"
+  local fake_path="${base}/${path}"
+  mkdir -p "$(dirname "${fake_path}")"
+  echo "INFO: faking ${path} to be \"${content}\" (writable)"
+  printf '%s\n' "${content}" > "${fake_path}"
+  mount --bind "${fake_path}" "${path}"
+}
+
+# fake_sysctl makes the sysctl key $1 (e.g. "vm.overcommit_memory") appear writable by
+# bind-mounting a regular file with the current value over its /proc/sys entry.
+# Keys that do not exist on this kernel are skipped.
+fake_sysctl() {
+  local key="$1"
+  # sysctl keys use "." where /proc/sys uses "/"
+  local path="/proc/sys/${key//.//}"
+  if [[ -f "${path}" ]]; then
+    local content
+    content="$(cat "${path}")"
+    fake_file_with_content "${path}" "${content}"
+  fi
+}
+
 configure_containerd() {
   # we need to switch to the 'native' snapshotter on zfs
   if [[ "$(stat -f -c %T /kind)" == 'zfs' ]]; then
     sed -i 's/snapshotter = "overlayfs"/snapshotter = "native"/' /etc/containerd/config.toml
   fi
+
+  # userns (rootless) configs
+  if [[ -n "${userns}" ]]; then
+    # Adjust oomScoreAdj
+    sed -i 's/restrict_oom_score_adj = false/restrict_oom_score_adj = true/' /etc/containerd/config.toml
+
+    # mounting overlayfs inside userns requires patching kernel.
+    # Ubuntu kernel is patched by default.
+    # Debian kernel is patched by default as well, but Debian needs `sudo modprobe overlay permit_mounts_in_userns=1`.
+    # Probe by mounting a throwaway overlay. tmp is declared and assigned separately so that a
+    # mktemp failure aborts (errexit) instead of leaving tmp empty and creating the probe dirs under /.
+    local tmp
+    tmp="$(mktemp -d)"
+    mkdir -p "${tmp}"/{l,u,w,m}
+    if mount -t overlay overlay -o "lowerdir=${tmp}/l,upperdir=${tmp}/u,workdir=${tmp}/w" "${tmp}/m"; then
+      umount "${tmp}/m"
+    else
+      echo 'INFO: this kernel does not support mounting overlayfs inside userns. Disabling overlayfs'
+      sed -i 's/snapshotter = "overlayfs"/snapshotter = "native"/' /etc/containerd/config.toml
+    fi
+    rm -rf -- "${tmp:?}"
+
+    # To run vanilla kubelet and kube-proxy inside UserNS, we need to fake several unwritable sysctl to be writable.
+    # Workaround until https://github.com/kubernetes/kubernetes/pull/92863 gets merged in the upstream.
+    fake_sysctl "vm.overcommit_memory"
+    fake_sysctl "vm.panic_on_oom"
+    fake_sysctl "kernel.panic"
+    fake_sysctl "kernel.panic_on_oops"
+    fake_sysctl "kernel.keys.root_maxkeys"
+    fake_sysctl "kernel.keys.root_maxbytes"
+    fake_sysctl "net.netfilter.nf_conntrack_max"
+    fake_sysctl "net.netfilter.nf_conntrack_tcp_timeout_established"
+    fake_sysctl "net.netfilter.nf_conntrack_tcp_timeout_close_wait"
+
+    # Wrap runc to mount fake "/sys/module/nf_conntrack/parameters/hashsize" for kube-proxy.
+    # Workaround until https://github.com/kubernetes/kubernetes/pull/92863 gets merged in the upstream.
+    sed -i 's/BinaryName = "runc"/BinaryName = "userns-ociwrapper"/' /etc/containerd/config.toml
+  fi
 }
 
 configure_proxy() {
@@ -50,12 +147,16 @@ fix_mount() {
     sync
   fi
 
-  echo 'INFO: remounting /sys read-only'
-  # systemd-in-a-container should have read only /sys
-  # https://systemd.io/CONTAINER_INTERFACE/
-  # however, we need other things from `docker run --privileged` ...
-  # and this flag also happens to make /sys rw, amongst other things
-  mount -o remount,ro /sys
+  if [[ -z "${userns}" ]]; then
+    echo 'INFO: remounting /sys read-only'
+    # systemd-in-a-container should have read only /sys
+    # https://systemd.io/CONTAINER_INTERFACE/
+    # however, we need other things from `docker run --privileged` ...
+    # and this flag also happens to make /sys rw, amongst other things
+    #
+    # This step is skipped when running inside UserNS, because it fails with EACCES.
+    mount -o remount,ro /sys
+  fi
 
   echo 'INFO: making mounts shared' >&2
   # for mount propagation
@@ -239,6 +340,9 @@ enable_network_magic(){
   fi
 }
 
+# validate state
+validate_userns
+
 # run pre-init fixups
 # NOTE: it's important that we do configure* first in this order to avoid races
 configure_containerd
diff --git a/images/base/files/usr/local/bin/userns-ociwrapper b/images/base/files/usr/local/bin/userns-ociwrapper
new file mode 100755
index 0000000000..c06efdc0de
--- /dev/null
+++ b/images/base/files/usr/local/bin/userns-ociwrapper
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Copyright 2020 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -o errexit
+set -o nounset
+set -o pipefail
+
+RUNTIME="runc"
+
+# The bundle defaults to the current directory when no bundle option is given.
+bundle="."
+bundle_flag=""
+# Find the OCI bundle directory: "-b DIR", "--bundle DIR" or "--bundle=DIR".
+# "$@" is quoted so that arguments containing whitespace are not split.
+for f in "$@"; do
+  if [[ -n "${bundle_flag}" ]]; then
+    bundle="${f}"
+    break
+  fi
+  case "${f}" in
+    -b | --bundle) bundle_flag="${f}" ;;
+    --bundle=*) bundle="${f#--bundle=}"; break ;;
+  esac
+done
+
+if [[ -f "${bundle}/config.json" ]]; then
+  # kube-proxy wants to read "/sys/module/nf_conntrack/parameters/hashsize", but it fails with EACCES when running inside userns.
+  # So we bind-mount a fake file. The temp dir is created inside the bundle so that the final mv is a rename on the same filesystem.
+  # Workaround until https://github.com/kubernetes/kubernetes/pull/92863 gets merged
+  echo "65536" >"/run/nf_conntrack_fake_hashsize"
+  q='.mounts += [{"destination": "/sys/module/nf_conntrack/parameters/hashsize", "source": "/run/nf_conntrack_fake_hashsize", "type": "none", "options": ["bind"]}]'
+  tmp=$(mktemp -d "${bundle}/ociwrapper.XXXXXXXX")
+  jq "$q" <"${bundle}/config.json" >"${tmp}/config.json"
+  mv -- "${tmp}/config.json" "${bundle}/config.json"
+  rm -rf -- "${tmp:?}"
+fi
+
+exec "$RUNTIME" "$@"
diff --git a/site/content/docs/user/quick-start.md b/site/content/docs/user/quick-start.md
index acaa2f8499..d1a6ec6c65 100644
--- a/site/content/docs/user/quick-start.md
+++ b/site/content/docs/user/quick-start.md
@@ -425,6 +425,50 @@ The structure of the logs will look more or less like this:
 
 The logs contain information about the Docker host, the containers running kind, the Kubernetes cluster itself, etc.
 
+### Rootless Docker
+
+Starting with kind 0.10.0 and Docker 20.10, Rootless Docker can be used as the node provider of kind.
+
+#### Host requirements
+The host needs to be running with cgroup v2.
+
+cgroup v2 is enabled by default on Fedora.
+On other distros, cgroup v2 can be typically enabled by adding `GRUB_CMDLINE_LINUX="systemd.unified_cgroup_hierarchy=1"` to `/etc/default/grub` and
+running `sudo update-grub`.
+
+Also, depending on the host configuration, the following steps might be needed:
+
+- Create `/etc/systemd/system/user@.service.d/delegate.conf` with the following content,
+  and then run `sudo systemctl daemon-reload`:
+```
+[Service]
+Delegate=yes
+```
+
+- Create `/etc/sysctl.d/99-rootless.conf` with the following content, and then run `sudo sysctl --system`:
+```
+kernel.dmesg_restrict=0
+```
+
+#### Restrictions
+
+The restrictions of Rootless Docker apply to kind clusters as well.
+
+e.g.
+- OverlayFS cannot be used unless the host is Ubuntu or Debian
+- Cannot mount block storage
+- Cannot mount NFS
+
+#### Creating a kind cluster with Rootless Docker
+
+To create a kind cluster with Rootless Docker, just run the `kind create cluster` command with
+`DOCKER_HOST=unix://${XDG_RUNTIME_DIR}/docker.sock`.
+
+```console
+$ export DOCKER_HOST=unix://${XDG_RUNTIME_DIR}/docker.sock
+$ kind create cluster
+```
+
 [go-supported]: https://golang.org/doc/devel/release.html#policy
 [known issues]: /docs/user/known-issues
 [releases]: https://github.com/kubernetes-sigs/kind/releases