Skip to content

Commit

Permalink
Support Rootless Docker, with vanilla Kubernetes
Browse files Browse the repository at this point in the history
Tested with vanilla Kubernetes v1.20.4

Signed-off-by: Akihiro Suda <[email protected]>
  • Loading branch information
AkihiroSuda committed Mar 3, 2021
1 parent 0cd1834 commit d45dd3e
Show file tree
Hide file tree
Showing 5 changed files with 234 additions and 6 deletions.
1 change: 1 addition & 0 deletions images/base/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ RUN echo "Ensuring scripts are executable ..." \
libseccomp2 pigz \
bash ca-certificates curl rsync \
nfs-common \
jq \
&& find /lib/systemd/system/sysinit.target.wants/ -name "systemd-tmpfiles-setup.service" -delete \
&& rm -f /lib/systemd/system/multi-user.target.wants/* \
&& rm -f /etc/systemd/system/*.wants/* \
Expand Down
4 changes: 4 additions & 0 deletions images/base/files/etc/containerd/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ version = 2
default_runtime_name = "runc"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
BinaryName = "runc"

# Setup a runtime with the magic name ("test-handler") used for Kubernetes
# runtime class tests ...
Expand All @@ -20,3 +22,5 @@ version = 2
tolerate_missing_hugepages_controller = true
# explicitly use default snapshotter so we can sed it in entrypoint
snapshotter = "overlayfs"
# restrict_oom_score_adj needs to be true when running inside UserNS (rootless)
restrict_oom_score_adj = false
130 changes: 124 additions & 6 deletions images/base/files/usr/local/bin/entrypoint
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,122 @@ set -o errexit
set -o nounset
set -o pipefail

# If /proc/self/uid_map 4294967295 mappings, we are in the initial user namespace, i.e. the host.
# Otherwise we are in a non-initial user namespace.
# https://github.com/opencontainers/runc/blob/v1.0.0-rc92/libcontainer/system/linux.go#L109-L118
userns=""
if grep -Eqv "0[[:space:]]+0[[:space:]]+4294967295" /proc/self/uid_map; then
userns="1"
echo 'INFO: running in a user namespace (experimental)'
fi

validate_userns() {
if [[ -z "${userns}" ]]; then
return
fi

local nofile_hard
nofile_hard="$(ulimit -Hn)"
local nofile_hard_expected="64000"
if [[ "${nofile_hard}" -lt "${nofile_hard_expected}" ]]; then
# This ERROR can be demoted to WARNING when k/k PR gets merged: https://github.com/kubernetes/kubernetes/pull/92863
echo "ERROR: UserNS: expected RLIMIT_NOFILE to be at least ${nofile_hard_expected}, got ${nofile_hard}" >&2
exit 1
fi

if ! [ -f "/proc/sys/net/netfilter/nf_conntrack_max" ]; then
echo "ERROR: UserNS: /proc/sys/net/netfilter/nf_conntrack_max does not exist (needs kernel 5.7 or later)" >&2
fi
local nf_conntrack_max
nf_conntrack_max="$(cat /proc/sys/net/netfilter/nf_conntrack_max)"
local nf_conntrack_max_expected="131072"
if [[ "${nf_conntrack_max}" != "${nf_conntrack_max_expected}" ]]; then
# This ERROR can be demoted to WARNING when k/k PR gets merged: https://github.com/kubernetes/kubernetes/pull/92863
echo "ERROR: UserNS: expected net.netfilter.nf_conntrack_max to be ${nf_conntrack_max_expected}, got ${nf_conntrack_max}" >&2
exit 1
fi

local dmesg_restrict
dmesg_restrict="$(cat /proc/sys/kernel/dmesg_restrict)"
if [[ "${dmesg_restrict}" != "0" ]]; then
# This ERROR can be probably demoted to WARNING after analysis of this issue: https://github.com/rootless-containers/usernetes/issues/204
echo "ERROR: UserNS: expected kernel.dmesg_restrict to be 0, got ${dmesg_restrict}" >&2
exit 1
fi
if [[ ! -f "/sys/fs/cgroup/cgroup.controllers" ]]; then
echo "ERROR: UserNS: cgroup v2 needs to be enabled" >&2
exit 1
fi
for f in cpu memory pids; do
if ! grep -qw $f /sys/fs/cgroup/cgroup.controllers; then
echo "ERROR: UserNS: $f controller needs to be delegated" >&2
exit 1
fi
done
}

fake_file_with_content(){
local path="$1"
local content="$2"
local base="/run/fake"
local fake_path="${base}/${path}"
mkdir -p "$(dirname "${fake_path}")"
echo "INFO: UserNS: faking ${path} to be \"${content}\" (writable)"
echo "${content}" > "${fake_path}"
mount --bind "${fake_path}" "${path}"
}

fake_sysctl() {
local key="$1"
local key_slash
# shellcheck disable=SC2001
key_slash="$(echo "${key}" | sed -e s@\\.@/@g)"
local path="/proc/sys/${key_slash}"
if [[ -f "${path}" ]]; then
local content
content="$(cat "${path}")"
fake_file_with_content "${path}" "${content}"
fi
}

configure_containerd() {
# we need to switch to the 'native' snapshotter on zfs
if [[ "$(stat -f -c %T /kind)" == 'zfs' ]]; then
sed -i 's/snapshotter = "overlayfs"/snapshotter = "native"/' /etc/containerd/config.toml
fi

# userns (rootless) configs
if [[ -n "$userns" ]]; then
# Adjust oomScoreAdj
sed -i 's/restrict_oom_score_adj = false/restrict_oom_score_adj = true/' /etc/containerd/config.toml

# mounting overlayfs inside userns requires patching kernel.
# Ubuntu kernel is patched by default.
# Debian kernel is patched by default as well, but Debian needs `sudo modprobe overlay permit_mounts_in_userns=1`.
local tmp
tmp=$(mktemp -d)
mkdir -p "${tmp}"/{l,u,w,m}
if mount -t overlay overlay -o "lowerdir=${tmp}/l,upperdir=${tmp}/u,workdir=${tmp}/w" "${tmp}/m"; then
umount "${tmp}/m"
else
echo 'INFO: UserNS: this kernel does not support mounting overlayfs inside userns. Disabling overlayfs'
sed -i 's/snapshotter = "overlayfs"/snapshotter = "native"/' /etc/containerd/config.toml
fi
rm -rf "${tmp}"

# To run vanilla kubelet inside UserNS, we need to fake several unwritable sysctl to be writable.
# Workaround until https://github.com/kubernetes/kubernetes/pull/92863 gets merged in the upstream.
fake_sysctl "vm.overcommit_memory"
fake_sysctl "vm.panic_on_oom"
fake_sysctl "kernel.panic"
fake_sysctl "kernel.panic_on_oops"
fake_sysctl "kernel.keys.root_maxkeys"
fake_sysctl "kernel.keys.root_maxbytes"

# Wrap runc to mount fake "/sys/module/nf_conntrack/parameters/hashsize" for kube-proxy.
# Workaround until https://github.com/kubernetes/kubernetes/pull/92863 gets merged in the upstream.
sed -i 's/BinaryName = "runc"/BinaryName = "userns-ociwrapper"/' /etc/containerd/config.toml
fi
}

configure_proxy() {
Expand Down Expand Up @@ -50,12 +161,16 @@ fix_mount() {
sync
fi

echo 'INFO: remounting /sys read-only'
# systemd-in-a-container should have read only /sys
# https://systemd.io/CONTAINER_INTERFACE/
# however, we need other things from `docker run --privileged` ...
# and this flag also happens to make /sys rw, amongst other things
mount -o remount,ro /sys
if [[ -z "${userns}" ]]; then
echo 'INFO: remounting /sys read-only'
# systemd-in-a-container should have read only /sys
# https://systemd.io/CONTAINER_INTERFACE/
# however, we need other things from `docker run --privileged` ...
# and this flag also happens to make /sys rw, amongst other things
#
# This step is skipped when running inside UserNS, because it fails with EACCES.
mount -o remount,ro /sys
fi

echo 'INFO: making mounts shared' >&2
# for mount propagation
Expand Down Expand Up @@ -299,6 +414,9 @@ enable_network_magic(){
fi
}

# validate state
validate_userns

# run pre-init fixups
# NOTE: it's important that we do configure* first in this order to avoid races
configure_containerd
Expand Down
51 changes: 51 additions & 0 deletions images/base/files/usr/local/bin/userns-ociwrapper
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/bin/bash

# Copyright 2020 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -o errexit
set -o nounset
set -o pipefail

RUNTIME="runc"

bundle="."
bundle_flag=""
# shellcheck disable=SC2068
for f in $@; do
if [[ -n $bundle_flag ]]; then
bundle=$f
break
else
# FIXME: support `--bundle=STRING` as well
case $f in
-b | --bundle)
bundle_flag=$f
;;
esac
fi
done

if [ -f "${bundle}/config.json" ]; then
# kube-proxy wants to read "/sys/module/nf_conntrack/parameters/hashsize", but it fails with EACCES when running inside userns.
# So we bind-mount a fake file.
# Workaround until https://github.com/kubernetes/kubernetes/pull/92863 gets merged
echo "65536" >"/run/nf_conntrack_fake_hashsize"
q='.mounts += [{"destination": "/sys/module/nf_conntrack/parameters/hashsize", "source": "/run/nf_conntrack_fake_hashsize", "type": "none", "options": ["bind"]}]'
tmp="$(mktemp ociwrapper.XXXXXXXX)"
jq "$q" <"${bundle}/config.json" >"$tmp"
mv "$tmp" "${bundle}/config.json"
fi

exec "$RUNTIME" "$@"
54 changes: 54 additions & 0 deletions site/content/docs/user/rootless.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
---
title: "Running kind with Rootless Docker"
menu:
main:
parent: "user"
identifier: "rootless"
weight: 3
---
Starting with kind 0.11.0 and Docker 20.10, Rootless Docker can be used as the node provider of kind.

Rootless Podman is not supported at the moment.

## Host requirements
### Kernel
The kernel needs to be 5.7 or later currently.
In future, we may be able to support a broader range of the kernel version.

### cgroup v2
The host needs to be running with cgroup v2.

cgroup v2 is enabled by default on Fedora.
On other distros, cgroup v2 can be typically enabled by adding `GRUB_CMDLINE_LINUX="systemd.unified_cgroup_hierarchy=1"` to `/etc/default/grub` and
running `sudo update-grub`.

Also, depending on the host configuration, the following steps might be needed:

- Create `/etc/systemd/system/[email protected]/delegate.conf` with the following content, and then run `sudo systemctl daemon-reload`:
```ini
[Service]
Delegate=yes
```

- Create `/etc/sysctl.d/99-rootless.conf` with the following content, and then run `sudo sysctl --system`:
```
net.netfilter.nf_conntrack_max=131072
kernel.dmesg_restrict=0
```

## Restrictions

The restrictions of Rootless Docker apply to kind clusters as well.

e.g.
- OverlayFS cannot be used unless the host is using kernel >= 5.11, or Ubuntu/Debian kernel
- Cannot mount block storages
- Cannot mount NFS

## Creating a kind cluster with Rootless Docker

To create a kind cluster with Rootless Docker, just run:
```console
$ export DOCKER_HOST=unix://${XDG_RUNTIME_DIR}/docker.sock
$ kind create cluster
```

0 comments on commit d45dd3e

Please sign in to comment.