diff --git a/images/base/Dockerfile b/images/base/Dockerfile index 807455a65b..dd1fd03577 100644 --- a/images/base/Dockerfile +++ b/images/base/Dockerfile @@ -96,7 +96,6 @@ RUN echo "Ensuring scripts are executable ..." \ libseccomp2 pigz \ bash ca-certificates curl rsync \ nfs-common \ - jq \ && find /lib/systemd/system/sysinit.target.wants/ -name "systemd-tmpfiles-setup.service" -delete \ && rm -f /lib/systemd/system/multi-user.target.wants/* \ && rm -f /etc/systemd/system/*.wants/* \ diff --git a/images/base/files/etc/containerd/config.toml b/images/base/files/etc/containerd/config.toml index d300800147..c19f158e2c 100644 --- a/images/base/files/etc/containerd/config.toml +++ b/images/base/files/etc/containerd/config.toml @@ -6,8 +6,6 @@ version = 2 default_runtime_name = "runc" [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] runtime_type = "io.containerd.runc.v2" -[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] - BinaryName = "runc" # Setup a runtime with the magic name ("test-handler") used for Kubernetes # runtime class tests ... diff --git a/images/base/files/usr/local/bin/entrypoint b/images/base/files/usr/local/bin/entrypoint index eb6d9e5405..3b60acd0a0 100755 --- a/images/base/files/usr/local/bin/entrypoint +++ b/images/base/files/usr/local/bin/entrypoint @@ -41,18 +41,6 @@ validate_userns() { exit 1 fi - if ! 
[ -f "/proc/sys/net/netfilter/nf_conntrack_max" ]; then - echo "ERROR: UserNS: /proc/sys/net/netfilter/nf_conntrack_max does not exist (needs kernel 5.7 or later)" >&2 - fi - local nf_conntrack_max - nf_conntrack_max="$(cat /proc/sys/net/netfilter/nf_conntrack_max)" - local nf_conntrack_max_expected="$((32768 * $(nproc)))" - if [[ "${nf_conntrack_max}" != "${nf_conntrack_max_expected}" ]]; then - # This ERROR can be demoted to WARNING when k/k PR gets merged: https://github.com/kubernetes/kubernetes/pull/92863 - echo "ERROR: UserNS: expected net.netfilter.nf_conntrack_max to be ${nf_conntrack_max_expected}, got ${nf_conntrack_max}" >&2 - exit 1 - fi - local dmesg_restrict dmesg_restrict="$(cat /proc/sys/kernel/dmesg_restrict)" if [[ "${dmesg_restrict}" != "0" ]]; then @@ -129,10 +117,6 @@ configure_containerd() { fake_sysctl "kernel.panic_on_oops" fake_sysctl "kernel.keys.root_maxkeys" fake_sysctl "kernel.keys.root_maxbytes" - - # Wrap runc to mount fake "/sys/module/nf_conntrack/parameters/hashsize" for kube-proxy. - # Workaround until https://github.com/kubernetes/kubernetes/pull/92863 gets merged in the upstream. - sed -i 's/BinaryName = "runc"/BinaryName = "userns-ociwrapper"/' /etc/containerd/config.toml fi } diff --git a/images/base/files/usr/local/bin/userns-ociwrapper b/images/base/files/usr/local/bin/userns-ociwrapper deleted file mode 100755 index 0b03412b38..0000000000 --- a/images/base/files/usr/local/bin/userns-ociwrapper +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/bash - -# Copyright 2020 The Kubernetes Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -set -o errexit -set -o nounset -set -o pipefail - -RUNTIME="runc" - -bundle="." -bundle_flag="" -# shellcheck disable=SC2068 -for f in $@; do - if [[ -n $bundle_flag ]]; then - bundle=$f - break - else - # FIXME: support `--bundle=STRING` as well - case $f in - -b | --bundle) - bundle_flag=$f - ;; - esac - fi -done - -if [ -f "${bundle}/config.json" ]; then - # kube-proxy wants to read "/sys/module/nf_conntrack/parameters/hashsize", but it fails with EACCES when running inside userns. - # So we bind-mount a fake file. - # Workaround until https://github.com/kubernetes/kubernetes/pull/92863 gets merged - echo "65536" >"/run/nf_conntrack_fake_hashsize" - q='.mounts += [{"destination": "/sys/module/nf_conntrack/parameters/hashsize", "source": "/run/nf_conntrack_fake_hashsize", "type": "none", "options": ["bind"]}]' - tmp="$(mktemp ociwrapper.XXXXXXXX)" - jq "$q" <"${bundle}/config.json" >"$tmp" - mv "$tmp" "${bundle}/config.json" -fi - -exec "$RUNTIME" "$@" diff --git a/pkg/cluster/internal/create/actions/config/config.go b/pkg/cluster/internal/create/actions/config/config.go index cfe970f388..638261be47 100644 --- a/pkg/cluster/internal/create/actions/config/config.go +++ b/pkg/cluster/internal/create/actions/config/config.go @@ -48,6 +48,11 @@ func (a *Action) Execute(ctx *actions.ActionContext) error { ctx.Status.Start("Writing configuration 📜") defer ctx.Status.End(false) + providerInfo, err := ctx.Provider.Info() + if err != nil { + return err + } + allNodes, err := ctx.Nodes() if err != nil { return err @@ -76,6 +81,7 @@ func (a *Action) Execute(ctx *actions.ActionContext) error { IPv6: ctx.Config.Networking.IPFamily == "ipv6", FeatureGates: ctx.Config.FeatureGates, RuntimeConfig: ctx.Config.RuntimeConfig, + RootlessProvider: providerInfo.Rootless, } kubeadmConfigPlusPatches := func(node nodes.Node, data kubeadm.ConfigData) func() error { diff --git 
a/pkg/cluster/internal/kubeadm/config.go b/pkg/cluster/internal/kubeadm/config.go index 3c02378f4f..34abe6fb40 100644 --- a/pkg/cluster/internal/kubeadm/config.go +++ b/pkg/cluster/internal/kubeadm/config.go @@ -74,6 +74,10 @@ type ConfigData struct { // These auto-generated fields are available to Config templates, // but not meant to be set by hand DerivedConfigData + + // Provider is running with rootless mode, so kube-proxy needs to be configured + // not to fail on sysctl error. + RootlessProvider bool } // DerivedConfigData fields are automatically derived by @@ -382,6 +386,14 @@ mode: "{{ .KubeProxyMode }}" {{end}}{{end}} iptables: minSyncPeriod: 1s +{{if .RootlessProvider}}conntrack: +# Skip setting sysctl value "net.netfilter.nf_conntrack_max" + maxPerCore: 0 +# Skip setting "net.netfilter.nf_conntrack_tcp_timeout_established" + tcpEstablishedTimeout: 0s +# Skip setting "net.netfilter.nf_conntrack_tcp_timeout_close" + tcpCloseWaitTimeout: 0s +{{end}} ` // Config returns a kubeadm config generated from config data, in particular @@ -400,6 +412,9 @@ func Config(data ConfigData) (config string, err error) { // assume the latest API version, then fallback if the k8s version is too low templateSource := ConfigTemplateBetaV2 if ver.LessThan(version.MustParseSemantic("v1.15.0")) { + if data.RootlessProvider { + return "", errors.Errorf("version %q is not compatible with rootless provider", ver) + } templateSource = ConfigTemplateBetaV1 } diff --git a/pkg/cluster/internal/providers/docker/provider.go b/pkg/cluster/internal/providers/docker/provider.go index 0a012a265e..680b2181b1 100644 --- a/pkg/cluster/internal/providers/docker/provider.go +++ b/pkg/cluster/internal/providers/docker/provider.go @@ -17,6 +17,8 @@ limitations under the License. 
package docker import ( + "encoding/csv" + "encoding/json" "fmt" "net" "os" @@ -281,3 +283,33 @@ func (p *provider) CollectLogs(dir string, nodes []nodes.Node) error { errs = append(errs, errors.AggregateConcurrent(fns)) return errors.NewAggregate(errs) } + +// Info returns the provider info. +func (p *provider) Info() (*providers.ProviderInfo, error) { + cmd := exec.Command("docker", "info", "--format", "{{json .SecurityOptions}}") + out, err := exec.Output(cmd) + if err != nil { + return nil, errors.Wrap(err, "failed to get docker info") + } + var securityOptions []string + if err := json.Unmarshal(out, &securityOptions); err != nil { + return nil, err + } + var info providers.ProviderInfo + for _, o := range securityOptions { + // o is like "name=seccomp,profile=default", or "name=rootless", + csvReader := csv.NewReader(strings.NewReader(o)) + sliceSlice, err := csvReader.ReadAll() + if err != nil { + return nil, err + } + for _, f := range sliceSlice { + for _, ff := range f { + if ff == "name=rootless" { + info.Rootless = true + } + } + } + } + return &info, nil +} diff --git a/pkg/cluster/internal/providers/podman/provider.go b/pkg/cluster/internal/providers/podman/provider.go index f775ab52ea..a3f326d334 100644 --- a/pkg/cluster/internal/providers/podman/provider.go +++ b/pkg/cluster/internal/providers/podman/provider.go @@ -350,3 +350,11 @@ func (p *provider) CollectLogs(dir string, nodes []nodes.Node) error { errs = append(errs, errors.AggregateConcurrent(fns)) return errors.NewAggregate(errs) } + +// Info returns the provider info. 
+func (p *provider) Info() (*providers.ProviderInfo, error) {
+	info := &providers.ProviderInfo{
+		Rootless: os.Geteuid() != 0, // any non-root effective UID is treated as rootless
+	}
+	return info, nil
+}
diff --git a/pkg/cluster/internal/providers/provider.go b/pkg/cluster/internal/providers/provider.go
index 6e28c4dc79..82e3d60408 100644
--- a/pkg/cluster/internal/providers/provider.go
+++ b/pkg/cluster/internal/providers/provider.go
@@ -45,4 +45,11 @@ type Provider interface {
 	GetAPIServerInternalEndpoint(cluster string) (string, error)
 	// CollectLogs will populate dir with cluster logs and other debug files
 	CollectLogs(dir string, nodes []nodes.Node) error
+	// Info returns information about the provider, such as whether it is rootless
+	Info() (*ProviderInfo, error)
+}
+
+// ProviderInfo describes properties of a provider, as returned by Provider.Info
+type ProviderInfo struct {
+	Rootless bool // true when the provider is running in rootless mode
 }
diff --git a/site/content/docs/user/rootless.md b/site/content/docs/user/rootless.md
index e48eceefb8..3ef0692945 100644
--- a/site/content/docs/user/rootless.md
+++ b/site/content/docs/user/rootless.md
@@ -11,11 +11,6 @@ Starting with kind 0.11.0 and Docker 20.10, Rootless Docker can be used as the n
 Rootless Podman is not supported at the moment.
 
 ## Host requirements
-### Kernel
-The kernel needs to be 5.7 or later currently.
-In future, we may be able to support a broader range of the kernel version.
-
-### cgroup v2
 The host needs to be running with cgroup v2.
 
 cgroup v2 is enabled by default on Fedora.
@@ -32,12 +27,9 @@ Delegate=yes
 
 - Create `/etc/sysctl.d/99-rootless.conf` with the following content, and then run `sudo sysctl --system`:
 ```
-net.netfilter.nf_conntrack_max=<32768 * the number of CPUs>
 kernel.dmesg_restrict=0
 ```
 
-e.g, When the number of CPUs (`nproc`) is 4, `net.netfilter.nf_conntrack_max=131072`.
-
 ## Restrictions
 The restrictions of Rootless Docker apply to kind clusters as well.
 