# Detect whether we are running inside a non-initial user namespace (rootless).
#
# If /proc/self/uid_map contains the full "0 0 4294967295" mapping, we are in
# the initial user namespace, i.e. the host.
# Otherwise we are in a non-initial user namespace.
# https://github.com/opencontainers/runc/blob/v1.0.0-rc92/libcontainer/system/linux.go#L109-L118
#
# userns is "1" inside a user namespace and empty otherwise; the functions
# below read it as a global.
userns=""
if grep -Eqv "0[[:space:]]+0[[:space:]]+4294967295" /proc/self/uid_map; then
  userns="1"
  echo 'INFO: running in a user namespace (experimental)'
fi

# validate_userns checks the prerequisites for running inside a user namespace.
# It is a no-op when ${userns} is empty. Otherwise it:
#   - warns on stderr when the hard RLIMIT_NOFILE is below 64000
#   - exits 1 when /sys/fs/cgroup/cgroup.controllers is missing (no cgroup v2)
#   - exits 1 when any of the cpu, memory, pids controllers is not listed there
validate_userns() {
  if [[ -z "${userns}" ]]; then
    return
  fi

  local nofile_hard
  nofile_hard="$(ulimit -Hn)"
  local nofile_hard_expected="64000"
  # ulimit may print "unlimited", which is not a number; -lt would then try to
  # evaluate it as a variable name and abort under `set -o nounset`.
  if [[ "${nofile_hard}" != "unlimited" && "${nofile_hard}" -lt "${nofile_hard_expected}" ]]; then
    echo "WARN: UserNS: expected RLIMIT_NOFILE to be at least ${nofile_hard_expected}, got ${nofile_hard}" >&2
  fi

  if [[ ! -f "/sys/fs/cgroup/cgroup.controllers" ]]; then
    echo "ERROR: UserNS: cgroup v2 needs to be enabled" >&2
    exit 1
  fi

  # keep the loop variable local so it does not leak into the rest of the script
  local controller
  for controller in cpu memory pids; do
    if ! grep -qw "${controller}" /sys/fs/cgroup/cgroup.controllers; then
      echo "ERROR: UserNS: ${controller} controller needs to be delegated" >&2
      exit 1
    fi
  done
}

# fake_file_with_content bind-mounts a regular file holding the given content
# over the given path.
#   $1: absolute path to cover
#   $2: content to write into the replacement file
# The replacement file is created under /run/fake, mirroring the original path.
fake_file_with_content(){
  local path="$1"
  local content="$2"
  local base="/run/fake"
  local fake_path="${base}/${path}"
  mkdir -p "$(dirname "${fake_path}")"
  echo "INFO: UserNS: faking ${path} to be \"${content}\" (writable)"
  echo "${content}" > "${fake_path}"
  mount --bind "${fake_path}" "${path}"
}

# fake_sysctl covers /proc/sys/<key> with a bind-mounted copy of its current
# value (see fake_file_with_content).
#   $1: sysctl key in dotted form, e.g. "vm.overcommit_memory"
# Keys whose /proc/sys file does not exist are silently skipped.
fake_sysctl() {
  local key="$1"
  # translate the dotted key into its /proc/sys path ("a.b.c" -> "a/b/c")
  local path="/proc/sys/${key//.//}"
  if [[ -f "${path}" ]]; then
    local content
    content="$(cat "${path}")"
    fake_file_with_content "${path}" "${content}"
  fi
}

# configure_containerd adjusts /etc/containerd/config.toml (and, in a user
# namespace, a handful of sysctl files) before containerd is started.
configure_containerd() {
  # we need to switch to the 'native' snapshotter on zfs
  if [[ "$(stat -f -c %T /kind)" == 'zfs' ]]; then
    sed -i 's/snapshotter = "overlayfs"/snapshotter = "native"/' /etc/containerd/config.toml
  fi

  # userns (rootless) configs
  if [[ -n "${userns}" ]]; then
    # Adjust oomScoreAdj
    sed -i 's/restrict_oom_score_adj = false/restrict_oom_score_adj = true/' /etc/containerd/config.toml

    # mounting overlayfs inside userns requires patching kernel.
    # Ubuntu kernel is patched by default.
    # Debian kernel is patched by default as well, but Debian needs `sudo modprobe overlay permit_mounts_in_userns=1`.
    # Probe for support by attempting a throw-away overlay mount.
    local tmp
    tmp=$(mktemp -d)
    mkdir -p "${tmp}"/{l,u,w,m}
    if mount -t overlay overlay -o "lowerdir=${tmp}/l,upperdir=${tmp}/u,workdir=${tmp}/w" "${tmp}/m"; then
      umount "${tmp}/m"
    else
      echo 'INFO: UserNS: this kernel does not support mounting overlayfs inside userns. Disabling overlayfs'
      sed -i 's/snapshotter = "overlayfs"/snapshotter = "native"/' /etc/containerd/config.toml
    fi
    rm -rf "${tmp}"

    # To run vanilla kubelet inside UserNS, we need to fake several unwritable sysctl to be writable.
    # Workaround until https://github.com/kubernetes/kubernetes/pull/92863 gets merged in the upstream.
    fake_sysctl "vm.overcommit_memory"
    fake_sysctl "vm.panic_on_oom"
    fake_sysctl "kernel.panic"
    fake_sysctl "kernel.panic_on_oops"
    fake_sysctl "kernel.keys.root_maxkeys"
    fake_sysctl "kernel.keys.root_maxbytes"
  fi
}
>&2 fi + elif [[ -n "${userns}" ]]; then + if [[ -f "/proc/sys/kernel/dmesg_restrict" ]]; then + if [[ "$(cat /proc/sys/kernel/dmesg_restrict)" = "1" ]]; then + echo 'WARN: UserNS: /dev/kmsg is not readable, faking with /dev/null (hint: set sysctl value "kernel.dmesg_restrict" to 0)' >&2 + mount --bind /dev/null /dev/kmsg + fi + fi fi } @@ -299,6 +396,9 @@ enable_network_magic(){ fi } +# validate state +validate_userns + # run pre-init fixups # NOTE: it's important that we do configure* first in this order to avoid races configure_containerd diff --git a/pkg/cluster/internal/create/actions/config/config.go b/pkg/cluster/internal/create/actions/config/config.go index cfe970f388..638261be47 100644 --- a/pkg/cluster/internal/create/actions/config/config.go +++ b/pkg/cluster/internal/create/actions/config/config.go @@ -48,6 +48,11 @@ func (a *Action) Execute(ctx *actions.ActionContext) error { ctx.Status.Start("Writing configuration 📜") defer ctx.Status.End(false) + providerInfo, err := ctx.Provider.Info() + if err != nil { + return err + } + allNodes, err := ctx.Nodes() if err != nil { return err @@ -76,6 +81,7 @@ func (a *Action) Execute(ctx *actions.ActionContext) error { IPv6: ctx.Config.Networking.IPFamily == "ipv6", FeatureGates: ctx.Config.FeatureGates, RuntimeConfig: ctx.Config.RuntimeConfig, + RootlessProvider: providerInfo.Rootless, } kubeadmConfigPlusPatches := func(node nodes.Node, data kubeadm.ConfigData) func() error { diff --git a/pkg/cluster/internal/kubeadm/config.go b/pkg/cluster/internal/kubeadm/config.go index 21b142f512..6fe4df1cad 100644 --- a/pkg/cluster/internal/kubeadm/config.go +++ b/pkg/cluster/internal/kubeadm/config.go @@ -74,6 +74,10 @@ type ConfigData struct { // These auto-generated fields are available to Config templates, // but not meant to be set by hand DerivedConfigData + + // Provider is running with rootless mode, so kube-proxy needs to be configured + // not to fail on sysctl error. 
+ RootlessProvider bool } // DerivedConfigData fields are automatically derived by @@ -385,7 +389,14 @@ mode: "{{ .KubeProxyMode }}" {{end}}{{end}} iptables: minSyncPeriod: 1s -{{end}} +{{if .RootlessProvider}}conntrack: +# Skip setting sysctl value "net.netfilter.nf_conntrack_max" + maxPerCore: 0 +# Skip setting "net.netfilter.nf_conntrack_tcp_timeout_established" + tcpEstablishedTimeout: 0s +# Skip setting "net.netfilter.nf_conntrack_tcp_timeout_close" + tcpCloseWaitTimeout: 0s +{{end}}{{end}} ` // Config returns a kubeadm config generated from config data, in particular @@ -404,6 +415,9 @@ func Config(data ConfigData) (config string, err error) { // assume the latest API version, then fallback if the k8s version is too low templateSource := ConfigTemplateBetaV2 if ver.LessThan(version.MustParseSemantic("v1.15.0")) { + if data.RootlessProvider { + return "", errors.Errorf("version %q is not compatible with rootless provider", ver) + } templateSource = ConfigTemplateBetaV1 } diff --git a/pkg/cluster/internal/providers/docker/provider.go b/pkg/cluster/internal/providers/docker/provider.go index 0a012a265e..680b2181b1 100644 --- a/pkg/cluster/internal/providers/docker/provider.go +++ b/pkg/cluster/internal/providers/docker/provider.go @@ -17,6 +17,8 @@ limitations under the License. package docker import ( + "encoding/csv" + "encoding/json" "fmt" "net" "os" @@ -281,3 +283,33 @@ func (p *provider) CollectLogs(dir string, nodes []nodes.Node) error { errs = append(errs, errors.AggregateConcurrent(fns)) return errors.NewAggregate(errs) } + +// Info returns the provider info. 
+func (p *provider) Info() (*providers.ProviderInfo, error) { + cmd := exec.Command("docker", "info", "--format", "{{json .SecurityOptions}}") + out, err := exec.Output(cmd) + if err != nil { + return nil, errors.Wrap(err, "failed to get docker info") + } + var securityOptions []string + if err := json.Unmarshal(out, &securityOptions); err != nil { + return nil, err + } + var info providers.ProviderInfo + for _, o := range securityOptions { + // o is like "name=seccomp,profile=default", or "name=rootless", + csvReader := csv.NewReader(strings.NewReader(o)) + sliceSlice, err := csvReader.ReadAll() + if err != nil { + return nil, err + } + for _, f := range sliceSlice { + for _, ff := range f { + if ff == "name=rootless" { + info.Rootless = true + } + } + } + } + return &info, nil +} diff --git a/pkg/cluster/internal/providers/podman/provider.go b/pkg/cluster/internal/providers/podman/provider.go index f775ab52ea..2e54e07f73 100644 --- a/pkg/cluster/internal/providers/podman/provider.go +++ b/pkg/cluster/internal/providers/podman/provider.go @@ -68,12 +68,6 @@ func (p *provider) Provision(status *cli.Status, cfg *config.Cluster) (err error return err } - // kind doesn't work with podman rootless, surface an error - if os.Geteuid() != 0 { - p.logger.Errorf("podman provider does not work properly in rootless mode") - os.Exit(1) - } - // TODO: validate cfg // ensure node images are pulled before actually provisioning if err := ensureNodeImages(p.logger, status, cfg); err != nil { @@ -350,3 +344,11 @@ func (p *provider) CollectLogs(dir string, nodes []nodes.Node) error { errs = append(errs, errors.AggregateConcurrent(fns)) return errors.NewAggregate(errs) } + +// Info returns the provider info. 
+func (p *provider) Info() (*providers.ProviderInfo, error) { + info := &providers.ProviderInfo{ + Rootless: os.Geteuid() != 0, + } + return info, nil +} diff --git a/pkg/cluster/internal/providers/provider.go b/pkg/cluster/internal/providers/provider.go index 6e28c4dc79..82e3d60408 100644 --- a/pkg/cluster/internal/providers/provider.go +++ b/pkg/cluster/internal/providers/provider.go @@ -45,4 +45,11 @@ type Provider interface { GetAPIServerInternalEndpoint(cluster string) (string, error) // CollectLogs will populate dir with cluster logs and other debug files CollectLogs(dir string, nodes []nodes.Node) error + // Info returns the provider info + Info() (*ProviderInfo, error) +} + +// ProviderInfo is the info of the provider +type ProviderInfo struct { + Rootless bool } diff --git a/site/content/docs/user/rootless.md b/site/content/docs/user/rootless.md new file mode 100644 index 0000000000..ecd3af349a --- /dev/null +++ b/site/content/docs/user/rootless.md @@ -0,0 +1,59 @@ +--- +title: "Running kind with Rootless Docker" +menu: + main: + parent: "user" + identifier: "rootless" + weight: 3 +--- +Starting with kind 0.11.0, [Rootless Docker](https://docs.docker.com/go/rootless/) and [Rootless Podman](https://github.com/containers/podman/blob/master/docs/tutorials/rootless_tutorial.md) can be used as the node provider of kind. + +## Provider requirements +- Docker: 20.10 or later +- Podman: 3.0 or later + +## Host requirements +The host needs to be running with cgroup v2. + +cgroup v2 is enabled by default on Fedora. +On other distros, cgroup v2 can be typically enabled by adding `GRUB_CMDLINE_LINUX="systemd.unified_cgroup_hierarchy=1"` to `/etc/default/grub` and +running `sudo update-grub`. 
+ +Also, depending on the host configuration, the following steps might be needed: + +- Create `/etc/systemd/system/user@.service.d/delegate.conf` with the following content, and then run `sudo systemctl daemon-reload`: +```ini +[Service] +Delegate=yes +``` + +- Create `/etc/modules-load.d/iptables.conf` with the following content: +``` +iptable_nat +ip6table_nat +``` + +## Restrictions + +The restrictions of Rootless Docker apply to kind clusters as well. + +e.g. +- OverlayFS cannot be used unless the host is using kernel >= 5.11, or Ubuntu/Debian kernel +- Cannot mount block storage +- Cannot mount NFS + +## Creating a kind cluster with Rootless Docker + +To create a kind cluster with Rootless Docker, just run: +```console +$ export DOCKER_HOST=unix://${XDG_RUNTIME_DIR}/docker.sock +$ kind create cluster +``` + +To create a kind cluster with Rootless Podman, just run: +```console +$ KIND_EXPERIMENTAL_PROVIDER=podman kind create cluster +``` + +## Tips +- To enable OOM watching, allow `dmesg` by running `sudo sysctl -w kernel.dmesg_restrict=0`.