diff --git a/go.mod b/go.mod index cf153e067..45dbd9f0a 100644 --- a/go.mod +++ b/go.mod @@ -186,4 +186,4 @@ require ( tags.cncf.io/container-device-interface/specs-go v0.6.0 // indirect ) -replace github.com/opencontainers/runc => github.com/opencontainers/runc v1.1.1-0.20230904132852-a0466dd76f23 +replace github.com/opencontainers/runc => github.com/opencontainers/runc v1.1.1-0.20240131200429-02120488a4c0 diff --git a/go.sum b/go.sum index e36960c73..be6f4681e 100644 --- a/go.sum +++ b/go.sum @@ -373,8 +373,8 @@ github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8 github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug= github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM= -github.com/opencontainers/runc v1.1.1-0.20230904132852-a0466dd76f23 h1:CjJqzUWt07IJR6gO+Ron5qEcXomyLZLgURiSCXN6vXM= -github.com/opencontainers/runc v1.1.1-0.20230904132852-a0466dd76f23/go.mod h1:UkHdGiHfjdRR/suiePnXB844WcjZ0RcfGm2mQS/V5jM= +github.com/opencontainers/runc v1.1.1-0.20240131200429-02120488a4c0 h1:NwSQ/5rex97Rum/xZOMjlDQbbZ8YJKOTihf9sxqHxtE= +github.com/opencontainers/runc v1.1.1-0.20240131200429-02120488a4c0/go.mod h1:tBsQqk9ETVlXxzXjk2Xh/1VjxC/U3Gaq5ps/rC/cadE= github.com/opencontainers/runtime-spec v1.2.0 h1:z97+pHb3uELt/yiAWD691HNHQIF07bE7dzrbT927iTk= github.com/opencontainers/runtime-spec v1.2.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/runtime-tools v0.9.1-0.20230914150019-408c51e934dc h1:d2hUh5O6MRBvStV55MQ8we08t42zSTqBbscoQccWmMc= diff --git a/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor_linux.go index 8b1483c7d..17d36ed15 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/apparmor/apparmor_linux.go @@ -26,14 +26,19 @@ func isEnabled() bool { } func setProcAttr(attr, value string) error { - // Under AppArmor you can only change your own attr, so use /proc/self/ - // instead of /proc// like libapparmor does - attrPath := "/proc/self/attr/apparmor/" + attr - if _, err := os.Stat(attrPath); errors.Is(err, os.ErrNotExist) { + attr = utils.CleanPath(attr) + attrSubPath := "attr/apparmor/" + attr + if _, err := os.Stat("/proc/self/" + attrSubPath); errors.Is(err, os.ErrNotExist) { // fall back to the old convention - attrPath = "/proc/self/attr/" + attr + attrSubPath = "attr/" + attr } + // Under AppArmor you can only change your own attr, so there's no reason + // to not use /proc/thread-self/ (instead of /proc//, like libapparmor + // does). + attrPath, closer := utils.ProcThreadSelf(attrSubPath) + defer closer() + f, err := os.OpenFile(attrPath, os.O_WRONLY, 0) if err != nil { return err diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/file.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/file.go index 048a0ca4b..16aae5a3b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/file.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/file.go @@ -10,6 +10,7 @@ import ( "strings" "sync" + "github.com/opencontainers/runc/libcontainer/utils" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -49,24 +50,13 @@ func WriteFile(dir, file, data string) error { return err } defer fd.Close() - if err := retryingWriteFile(fd, data); err != nil { + if _, err := fd.WriteString(data); err != nil { // Having data in the error message helps in debugging. return fmt.Errorf("failed to write %q: %w", data, err) } return nil } -func retryingWriteFile(fd *os.File, data string) error { - for { - _, err := fd.Write([]byte(data)) - if errors.Is(err, unix.EINTR) { - logrus.Infof("interrupted while writing %s to %s", data, fd.Name()) - continue - } - return err - } -} - const ( cgroupfsDir = "/sys/fs/cgroup" cgroupfsPrefix = cgroupfsDir + "/" @@ -76,16 +66,16 @@ var ( // TestMode is set to true by unit tests that need "fake" cgroupfs. TestMode bool - cgroupFd int = -1 - prepOnce sync.Once - prepErr error - resolveFlags uint64 + cgroupRootHandle *os.File + prepOnce sync.Once + prepErr error + resolveFlags uint64 ) func prepareOpenat2() error { prepOnce.Do(func() { fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{ - Flags: unix.O_DIRECTORY | unix.O_PATH, + Flags: unix.O_DIRECTORY | unix.O_PATH | unix.O_CLOEXEC, }) if err != nil { prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err} @@ -96,15 +86,16 @@ func prepareOpenat2() error { } return } + file := os.NewFile(uintptr(fd), cgroupfsDir) + var st unix.Statfs_t - if err = unix.Fstatfs(fd, &st); err != nil { + if err := unix.Fstatfs(int(file.Fd()), &st); err != nil { prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err} logrus.Warnf("falling back to securejoin: %s", prepErr) return } - cgroupFd = fd - + cgroupRootHandle = file resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS if st.Type == unix.CGROUP2_SUPER_MAGIC { // cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks @@ -122,7 +113,7 @@ func openFile(dir, file string, flags int) (*os.File, error) { flags |= os.O_TRUNC | os.O_CREATE mode = 0o600 } - path := path.Join(dir, file) + path := path.Join(dir, utils.CleanPath(file)) if prepareOpenat2() != nil { return openFallback(path, flags, mode) } @@ -131,7 +122,7 @@ func openFile(dir, file string, flags int) (*os.File, error) { return openFallback(path, flags, mode) } - fd, err := unix.Openat2(cgroupFd, relPath, + fd, err := unix.Openat2(int(cgroupRootHandle.Fd()), relPath, &unix.OpenHow{ Resolve: resolveFlags, Flags: uint64(flags) | unix.O_CLOEXEC, @@ -139,20 +130,21 @@ func openFile(dir, file string, flags int) (*os.File, error) { }) if err != nil { err = &os.PathError{Op: "openat2", Path: path, Err: err} - // Check if cgroupFd is still opened to cgroupfsDir + // Check if cgroupRootHandle is still opened to cgroupfsDir // (happens when this package is incorrectly used // across the chroot/pivot_root/mntns boundary, or // when /sys/fs/cgroup is remounted). // // TODO: if such usage will ever be common, amend this - // to reopen cgroupFd and retry openat2. - fdStr := strconv.Itoa(cgroupFd) - fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr) + // to reopen cgroupRootHandle and retry openat2. + fdPath, closer := utils.ProcThreadSelf("fd/" + strconv.Itoa(int(cgroupRootHandle.Fd()))) + defer closer() + fdDest, _ := os.Readlink(fdPath) if fdDest != cgroupfsDir { - // Wrap the error so it is clear that cgroupFd + // Wrap the error so it is clear that cgroupRootHandle // is opened to an unexpected/wrong directory. - err = fmt.Errorf("cgroupFd %s unexpectedly opened to %s != %s: %w", - fdStr, fdDest, cgroupfsDir, err) + err = fmt.Errorf("cgroupRootHandle %d unexpectedly opened to %s != %s: %w", + cgroupRootHandle.Fd(), fdDest, cgroupfsDir, err) } return nil, err } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go index 26a9d7c0f..727f7f918 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go @@ -84,6 +84,28 @@ func (s *CpuGroup) Set(path string, r *configs.Resources) error { period = "" } } + + var burst string + if r.CpuBurst != nil { + burst = strconv.FormatUint(*r.CpuBurst, 10) + if err := cgroups.WriteFile(path, "cpu.cfs_burst_us", burst); err != nil { + // this is a special trick for burst feature, the current systemd and low version of kernel will not support it. + // So, an `no such file or directory` error would be raised, and we can ignore it . + if !errors.Is(err, unix.ENOENT) { + // Sometimes when the burst to be set is larger + // than the current one, it is rejected by the kernel + // (EINVAL) as old_quota/new_burst exceeds the parent + // cgroup quota limit. If this happens and the quota is + // going to be set, ignore the error for now and retry + // after setting the quota. + if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 { + return err + } + } + } else { + burst = "" + } + } if r.CpuQuota != 0 { if err := cgroups.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(r.CpuQuota, 10)); err != nil { return err @@ -93,6 +115,13 @@ func (s *CpuGroup) Set(path string, r *configs.Resources) error { return err } } + if burst != "" { + if err := cgroups.WriteFile(path, "cpu.cfs_burst_us", burst); err != nil { + if !errors.Is(err, unix.ENOENT) { + return err + } + } + } } if r.CPUIdle != nil { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go index 8ddd6fdd8..50f8f30cd 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go @@ -1,6 +1,8 @@ package fs import ( + "errors" + "os" "strconv" "github.com/opencontainers/runc/libcontainer/cgroups" @@ -19,8 +21,23 @@ func (s *HugetlbGroup) Apply(path string, _ *configs.Resources, pid int) error { } func (s *HugetlbGroup) Set(path string, r *configs.Resources) error { + const suffix = ".limit_in_bytes" + skipRsvd := false + for _, hugetlb := range r.HugetlbLimit { - if err := cgroups.WriteFile(path, "hugetlb."+hugetlb.Pagesize+".limit_in_bytes", strconv.FormatUint(hugetlb.Limit, 10)); err != nil { + prefix := "hugetlb." + hugetlb.Pagesize + val := strconv.FormatUint(hugetlb.Limit, 10) + if err := cgroups.WriteFile(path, prefix+suffix, val); err != nil { + return err + } + if skipRsvd { + continue + } + if err := cgroups.WriteFile(path, prefix+".rsvd"+suffix, val); err != nil { + if errors.Is(err, os.ErrNotExist) { + skipRsvd = true + continue + } return err } } @@ -32,24 +49,29 @@ func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error { if !cgroups.PathExists(path) { return nil } + rsvd := ".rsvd" hugetlbStats := cgroups.HugetlbStats{} for _, pageSize := range cgroups.HugePageSizes() { - usage := "hugetlb." + pageSize + ".usage_in_bytes" - value, err := fscommon.GetCgroupParamUint(path, usage) + again: + prefix := "hugetlb." + pageSize + rsvd + + value, err := fscommon.GetCgroupParamUint(path, prefix+".usage_in_bytes") if err != nil { + if rsvd != "" && errors.Is(err, os.ErrNotExist) { + rsvd = "" + goto again + } return err } hugetlbStats.Usage = value - maxUsage := "hugetlb." + pageSize + ".max_usage_in_bytes" - value, err = fscommon.GetCgroupParamUint(path, maxUsage) + value, err = fscommon.GetCgroupParamUint(path, prefix+".max_usage_in_bytes") if err != nil { return err } hugetlbStats.MaxUsage = value - failcnt := "hugetlb." + pageSize + ".failcnt" - value, err = fscommon.GetCgroupParamUint(path, failcnt) + value, err = fscommon.GetCgroupParamUint(path, prefix+".failcnt") if err != nil { return err } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go index b7c75f941..783566d68 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go @@ -170,6 +170,10 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error { return err } stats.MemoryStats.SwapUsage = swapUsage + stats.MemoryStats.SwapOnlyUsage = cgroups.MemoryData{ + Usage: swapUsage.Usage - memoryUsage.Usage, + Failcnt: swapUsage.Failcnt - memoryUsage.Failcnt, + } kernelUsage, err := getMemoryData(path, "kmem") if err != nil { return err @@ -234,6 +238,12 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) { memoryData.Failcnt = value value, err = fscommon.GetCgroupParamUint(path, limit) if err != nil { + if name == "kmem" && os.IsNotExist(err) { + // Ignore ENOENT as kmem.limit_in_bytes has + // been removed in newer kernels. + return memoryData, nil + } + return cgroups.MemoryData{}, err } memoryData.Limit = value diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpu.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpu.go index 3f8deb0ad..8ee49d499 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpu.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpu.go @@ -2,16 +2,19 @@ package fs2 import ( "bufio" + "errors" "os" "strconv" + "golang.org/x/sys/unix" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" ) func isCpuSet(r *configs.Resources) bool { - return r.CpuWeight != 0 || r.CpuQuota != 0 || r.CpuPeriod != 0 || r.CPUIdle != nil + return r.CpuWeight != 0 || r.CpuQuota != 0 || r.CpuPeriod != 0 || r.CPUIdle != nil || r.CpuBurst != nil } func setCpu(dirPath string, r *configs.Resources) error { @@ -32,6 +35,23 @@ func setCpu(dirPath string, r *configs.Resources) error { } } + var burst string + if r.CpuBurst != nil { + burst = strconv.FormatUint(*r.CpuBurst, 10) + if err := cgroups.WriteFile(dirPath, "cpu.max.burst", burst); err != nil { + // Sometimes when the burst to be set is larger + // than the current one, it is rejected by the kernel + // (EINVAL) as old_quota/new_burst exceeds the parent + // cgroup quota limit. If this happens and the quota is + // going to be set, ignore the error for now and retry + // after setting the quota. + if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 { + return err + } + } else { + burst = "" + } + } if r.CpuQuota != 0 || r.CpuPeriod != 0 { str := "max" if r.CpuQuota > 0 { @@ -47,6 +67,11 @@ func setCpu(dirPath string, r *configs.Resources) error { if err := cgroups.WriteFile(dirPath, "cpu.max", str); err != nil { return err } + if burst != "" { + if err := cgroups.WriteFile(dirPath, "cpu.max.burst", burst); err != nil { + return err + } + } } return nil diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/defaultpath.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/defaultpath.go index 9c949c91f..8ac831201 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/defaultpath.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/defaultpath.go @@ -55,6 +55,9 @@ func _defaultDirPath(root, cgPath, cgParent, cgName string) (string, error) { return filepath.Join(root, innerPath), nil } + // we don't need to use /proc/thread-self here because runc always runs + // with every thread in the same cgroup. This lets us avoid having to do + // runtime.LockOSThread. ownCgroup, err := parseCgroupFile("/proc/self/cgroup") if err != nil { return "", err diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/fs2.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/fs2.go index 47b67afc2..0760be74b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/fs2.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/fs2.go @@ -133,6 +133,10 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) { if err := fscommon.RdmaGetStats(m.dirPath, st); err != nil && !os.IsNotExist(err) { errs = append(errs, err) } + // misc (since kernel 5.13) + if err := statMisc(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } if len(errs) > 0 && !m.config.Rootless { return st, fmt.Errorf("error while statting cgroup v2: %+v", errs) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/hugetlb.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/hugetlb.go index c92a7e64a..2ce2631e1 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/hugetlb.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/hugetlb.go @@ -1,6 +1,8 @@ package fs2 import ( + "errors" + "os" "strconv" "github.com/opencontainers/runc/libcontainer/cgroups" @@ -16,8 +18,22 @@ func setHugeTlb(dirPath string, r *configs.Resources) error { if !isHugeTlbSet(r) { return nil } + const suffix = ".max" + skipRsvd := false for _, hugetlb := range r.HugetlbLimit { - if err := cgroups.WriteFile(dirPath, "hugetlb."+hugetlb.Pagesize+".max", strconv.FormatUint(hugetlb.Limit, 10)); err != nil { + prefix := "hugetlb." + hugetlb.Pagesize + val := strconv.FormatUint(hugetlb.Limit, 10) + if err := cgroups.WriteFile(dirPath, prefix+suffix, val); err != nil { + return err + } + if skipRsvd { + continue + } + if err := cgroups.WriteFile(dirPath, prefix+".rsvd"+suffix, val); err != nil { + if errors.Is(err, os.ErrNotExist) { + skipRsvd = true + continue + } return err } } @@ -27,15 +43,21 @@ func setHugeTlb(dirPath string, r *configs.Resources) error { func statHugeTlb(dirPath string, stats *cgroups.Stats) error { hugetlbStats := cgroups.HugetlbStats{} + rsvd := ".rsvd" for _, pagesize := range cgroups.HugePageSizes() { - value, err := fscommon.GetCgroupParamUint(dirPath, "hugetlb."+pagesize+".current") + again: + prefix := "hugetlb." + pagesize + rsvd + value, err := fscommon.GetCgroupParamUint(dirPath, prefix+".current") if err != nil { + if rsvd != "" && errors.Is(err, os.ErrNotExist) { + rsvd = "" + goto again + } return err } hugetlbStats.Usage = value - fileName := "hugetlb." + pagesize + ".events" - value, err = fscommon.GetValueByKey(dirPath, fileName, "max") + value, err = fscommon.GetValueByKey(dirPath, prefix+".events", "max") if err != nil { return err } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/memory.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/memory.go index 85e96b1ce..296565974 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/memory.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/memory.go @@ -105,7 +105,7 @@ func statMemory(dirPath string, stats *cgroups.Stats) error { memoryUsage, err := getMemoryDataV2(dirPath, "") if err != nil { if errors.Is(err, unix.ENOENT) && dirPath == UnifiedMountpoint { - // The root cgroup does not have memory.{current,max} + // The root cgroup does not have memory.{current,max,peak} // so emulate those using data from /proc/meminfo and // /sys/fs/cgroup/memory.stat return rootStatsFromMeminfo(stats) @@ -113,10 +113,12 @@ func statMemory(dirPath string, stats *cgroups.Stats) error { return err } stats.MemoryStats.Usage = memoryUsage - swapUsage, err := getMemoryDataV2(dirPath, "swap") + swapOnlyUsage, err := getMemoryDataV2(dirPath, "swap") if err != nil { return err } + stats.MemoryStats.SwapOnlyUsage = swapOnlyUsage + swapUsage := swapOnlyUsage // As cgroup v1 reports SwapUsage values as mem+swap combined, // while in cgroup v2 swap values do not include memory, // report combined mem+swap for v1 compatibility. @@ -124,6 +126,9 @@ func statMemory(dirPath string, stats *cgroups.Stats) error { if swapUsage.Limit != math.MaxUint64 { swapUsage.Limit += memoryUsage.Limit } + // The `MaxUsage` of mem+swap cannot simply combine mem with + // swap. So set it to 0 for v1 compatibility. + swapUsage.MaxUsage = 0 stats.MemoryStats.SwapUsage = swapUsage return nil @@ -138,6 +143,7 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) { } usage := moduleName + ".current" limit := moduleName + ".max" + maxUsage := moduleName + ".peak" value, err := fscommon.GetCgroupParamUint(path, usage) if err != nil { @@ -157,6 +163,14 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) { } memoryData.Limit = value + // `memory.peak` since kernel 5.19 + // `memory.swap.peak` since kernel 6.5 + value, err = fscommon.GetCgroupParamUint(path, maxUsage) + if err != nil && !os.IsNotExist(err) { + return cgroups.MemoryData{}, err + } + memoryData.MaxUsage = value + return memoryData, nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/misc.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/misc.go new file mode 100644 index 000000000..f0b292aa0 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/misc.go @@ -0,0 +1,52 @@ +package fs2 + +import ( + "bufio" + "os" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" +) + +func statMisc(dirPath string, stats *cgroups.Stats) error { + for _, file := range []string{"current", "events"} { + fd, err := cgroups.OpenFile(dirPath, "misc."+file, os.O_RDONLY) + if err != nil { + return err + } + + s := bufio.NewScanner(fd) + for s.Scan() { + key, value, err := fscommon.ParseKeyValue(s.Text()) + if err != nil { + fd.Close() + return err + } + + key = strings.TrimSuffix(key, ".max") + + if _, ok := stats.MiscStats[key]; !ok { + stats.MiscStats[key] = cgroups.MiscStats{} + } + + tmp := stats.MiscStats[key] + + switch file { + case "current": + tmp.Usage = value + case "events": + tmp.Events = value + } + + stats.MiscStats[key] = tmp + } + fd.Close() + + if err := s.Err(); err != nil { + return err + } + } + + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go index 8ff1fbb52..b475567d8 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go @@ -91,6 +91,8 @@ type MemoryStats struct { Usage MemoryData `json:"usage,omitempty"` // usage of memory + swap SwapUsage MemoryData `json:"swap_usage,omitempty"` + // usage of swap only + SwapOnlyUsage MemoryData `json:"swap_only_usage,omitempty"` // usage of kernel memory KernelUsage MemoryData `json:"kernel_usage,omitempty"` // usage of kernel TCP memory @@ -170,6 +172,13 @@ type RdmaStats struct { RdmaCurrent []RdmaEntry `json:"rdma_current,omitempty"` } +type MiscStats struct { + // current resource usage for a key in misc + Usage uint64 `json:"usage,omitempty"` + // number of times the resource usage was about to go over the max boundary + Events uint64 `json:"events,omitempty"` +} + type Stats struct { CpuStats CpuStats `json:"cpu_stats,omitempty"` CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"` @@ -179,10 +188,13 @@ type Stats struct { // the map is in the format "size of hugepage: stats of the hugepage" HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"` RdmaStats RdmaStats `json:"rdma_stats,omitempty"` + // the map is in the format "misc resource name: stats of the key" + MiscStats map[string]MiscStats `json:"misc_stats,omitempty"` } func NewStats() *Stats { memoryStats := MemoryStats{Stats: make(map[string]uint64)} hugetlbStats := make(map[string]HugetlbStats) - return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats} + miscStats := make(map[string]MiscStats) + return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats, MiscStats: miscStats} } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go index 965c607c6..186cbc641 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go @@ -217,10 +217,26 @@ func PathExists(path string) bool { return true } -func rmdir(path string) error { +// rmdir tries to remove a directory, optionally retrying on EBUSY. +func rmdir(path string, retry bool) error { + delay := time.Millisecond + tries := 10 + +again: err := unix.Rmdir(path) - if err == nil || err == unix.ENOENT { + switch err { // nolint:errorlint // unix errors are bare + case nil, unix.ENOENT: return nil + case unix.EINTR: + goto again + case unix.EBUSY: + if retry && tries > 0 { + time.Sleep(delay) + delay *= 2 + tries-- + goto again + + } } return &os.PathError{Op: "rmdir", Path: path, Err: err} } @@ -228,68 +244,42 @@ func rmdir(path string) error { // RemovePath aims to remove cgroup path. It does so recursively, // by removing any subdirectories (sub-cgroups) first. func RemovePath(path string) error { - // try the fast path first - if err := rmdir(path); err == nil { + // Try the fast path first. + if err := rmdir(path, false); err == nil { return nil } infos, err := os.ReadDir(path) - if err != nil { - if os.IsNotExist(err) { - err = nil - } + if err != nil && !os.IsNotExist(err) { return err } for _, info := range infos { if info.IsDir() { - // We should remove subcgroups dir first + // We should remove subcgroup first. if err = RemovePath(filepath.Join(path, info.Name())); err != nil { break } } } if err == nil { - err = rmdir(path) + err = rmdir(path, true) } return err } // RemovePaths iterates over the provided paths removing them. -// We trying to remove all paths five times with increasing delay between tries. -// If after all there are not removed cgroups - appropriate error will be -// returned. func RemovePaths(paths map[string]string) (err error) { - const retries = 5 - delay := 10 * time.Millisecond - for i := 0; i < retries; i++ { - if i != 0 { - time.Sleep(delay) - delay *= 2 - } - for s, p := range paths { - if err := RemovePath(p); err != nil { - // do not log intermediate iterations - switch i { - case 0: - logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)") - case retries - 1: - logrus.WithError(err).Error("Failed to remove cgroup") - } - } - _, err := os.Stat(p) - // We need this strange way of checking cgroups existence because - // RemoveAll almost always returns error, even on already removed - // cgroups - if os.IsNotExist(err) { - delete(paths, s) - } - } - if len(paths) == 0 { - //nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506 - paths = make(map[string]string) - return nil + for s, p := range paths { + if err := RemovePath(p); err == nil { + delete(paths, s) } } + if len(paths) == 0 { + //nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506 + // TODO: switch to clear once Go < 1.21 is not supported. + paths = make(map[string]string) + return nil + } return fmt.Errorf("Failed to remove paths: %v", paths) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go index 8524c4684..81193e209 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go @@ -99,11 +99,12 @@ func tryDefaultPath(cgroupPath, subsystem string) string { // expensive), so it is assumed that cgroup mounts are not being changed. func readCgroupMountinfo() ([]*mountinfo.Info, error) { readMountinfoOnce.Do(func() { + // mountinfo.GetMounts uses /proc/thread-self, so we can use it without + // issues. cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts( mountinfo.FSTypeFilter("cgroup"), ) }) - return cgroupMountinfo, readMountinfoErr } @@ -196,6 +197,9 @@ func getCgroupMountsV1(all bool) ([]Mount, error) { return nil, err } + // We don't need to use /proc/thread-self here because runc always runs + // with every thread in the same cgroup. This lets us avoid having to do + // runtime.LockOSThread. allSubsystems, err := ParseCgroupFile("/proc/self/cgroup") if err != nil { return nil, err @@ -214,6 +218,10 @@ func GetOwnCgroup(subsystem string) (string, error) { if IsCgroup2UnifiedMode() { return "", errUnified } + + // We don't need to use /proc/thread-self here because runc always runs + // with every thread in the same cgroup. This lets us avoid having to do + // runtime.LockOSThread. cgroups, err := ParseCgroupFile("/proc/self/cgroup") if err != nil { return "", err diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go index 1664304be..4a34cf76f 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go @@ -69,6 +69,9 @@ type Resources struct { // CPU hardcap limit (in usecs). Allowed cpu time in a given period. CpuQuota int64 `json:"cpu_quota"` + // CPU hardcap burst limit (in usecs). Allowed accumulated cpu time additionally for burst in a given period. + CpuBurst *uint64 `json:"cpu_burst"` //nolint:revive + // CPU period to be used for hardcapping (in usecs). 0 to use system default. CpuPeriod uint64 `json:"cpu_period"` diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go index 19541293b..a0a79d19d 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go @@ -8,6 +8,7 @@ import ( "time" "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" "github.com/opencontainers/runc/libcontainer/devices" "github.com/opencontainers/runtime-spec/specs-go" @@ -21,9 +22,9 @@ type Rlimit struct { // IDMap represents UID/GID Mappings for User Namespaces. type IDMap struct { - ContainerID int `json:"container_id"` - HostID int `json:"host_id"` - Size int `json:"size"` + ContainerID int64 `json:"container_id"` + HostID int64 `json:"host_id"` + Size int64 `json:"size"` } // Seccomp represents syscall restrictions @@ -213,12 +214,73 @@ type Config struct { // When RootlessCgroups is set, cgroups errors are ignored. RootlessCgroups bool `json:"rootless_cgroups,omitempty"` - // Do not try to remount a bind mount again after the first attempt failed on source - // filesystems that have nodev, noexec, nosuid, noatime, relatime, strictatime, nodiratime set - NoMountFallback bool `json:"no_mount_fallback,omitempty"` - // TimeOffsets specifies the offset for supporting time namespaces. TimeOffsets map[string]specs.LinuxTimeOffset `json:"time_offsets,omitempty"` + + // Scheduler represents the scheduling attributes for a process. + Scheduler *Scheduler `json:"scheduler,omitempty"` + + // Personality contains configuration for the Linux personality syscall. + Personality *LinuxPersonality `json:"personality,omitempty"` +} + +// Scheduler is based on the Linux sched_setattr(2) syscall. +type Scheduler = specs.Scheduler + +// ToSchedAttr is to convert *configs.Scheduler to *unix.SchedAttr. +func ToSchedAttr(scheduler *Scheduler) (*unix.SchedAttr, error) { + var policy uint32 + switch scheduler.Policy { + case specs.SchedOther: + policy = 0 + case specs.SchedFIFO: + policy = 1 + case specs.SchedRR: + policy = 2 + case specs.SchedBatch: + policy = 3 + case specs.SchedISO: + policy = 4 + case specs.SchedIdle: + policy = 5 + case specs.SchedDeadline: + policy = 6 + default: + return nil, fmt.Errorf("invalid scheduler policy: %s", scheduler.Policy) + } + + var flags uint64 + for _, flag := range scheduler.Flags { + switch flag { + case specs.SchedFlagResetOnFork: + flags |= 0x01 + case specs.SchedFlagReclaim: + flags |= 0x02 + case specs.SchedFlagDLOverrun: + flags |= 0x04 + case specs.SchedFlagKeepPolicy: + flags |= 0x08 + case specs.SchedFlagKeepParams: + flags |= 0x10 + case specs.SchedFlagUtilClampMin: + flags |= 0x20 + case specs.SchedFlagUtilClampMax: + flags |= 0x40 + default: + return nil, fmt.Errorf("invalid scheduler flag: %s", flag) + } + } + + return &unix.SchedAttr{ + Size: unix.SizeofSchedAttr, + Policy: policy, + Flags: flags, + Nice: scheduler.Nice, + Priority: uint32(scheduler.Priority), + Runtime: scheduler.Runtime, + Deadline: scheduler.Deadline, + Period: scheduler.Period, + }, nil } type ( diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go index 4e58bb396..e401f5331 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go @@ -1,14 +1,29 @@ package configs -import "errors" +import ( + "errors" + "fmt" + "math" +) var ( - errNoUIDMap = errors.New("User namespaces enabled, but no uid mappings found.") - errNoUserMap = errors.New("User namespaces enabled, but no user mapping found.") - errNoGIDMap = errors.New("User namespaces enabled, but no gid mappings found.") - errNoGroupMap = errors.New("User namespaces enabled, but no group mapping found.") + errNoUIDMap = errors.New("user namespaces enabled, but no uid mappings found") + errNoGIDMap = errors.New("user namespaces enabled, but no gid mappings found") +) + +// Please check https://man7.org/linux/man-pages/man2/personality.2.html for const details. +// https://raw.githubusercontent.com/torvalds/linux/master/include/uapi/linux/personality.h +const ( + PerLinux = 0x0000 + PerLinux32 = 0x0008 ) +type LinuxPersonality struct { + // Domain for the personality + // can only contain values "LINUX" and "LINUX32" + Domain int `json:"domain"` +} + // HostUID gets the translated uid for the process on host which could be // different when user namespaces are enabled. func (c Config) HostUID(containerId int) (int, error) { @@ -16,11 +31,18 @@ func (c Config) HostUID(containerId int) (int, error) { if len(c.UIDMappings) == 0 { return -1, errNoUIDMap } - id, found := c.hostIDFromMapping(containerId, c.UIDMappings) + id, found := c.hostIDFromMapping(int64(containerId), c.UIDMappings) if !found { - return -1, errNoUserMap + return -1, fmt.Errorf("user namespaces enabled, but no mapping found for uid %d", containerId) + } + // If we are a 32-bit binary running on a 64-bit system, it's possible + // the mapped user is too large to store in an int, which means we + // cannot do the mapping. We can't just return an int64, because + // os.Setuid() takes an int. + if id > math.MaxInt { + return -1, fmt.Errorf("mapping for uid %d (host id %d) is larger than native integer size (%d)", containerId, id, math.MaxInt) } - return id, nil + return int(id), nil } // Return unchanged id. return containerId, nil @@ -39,11 +61,18 @@ func (c Config) HostGID(containerId int) (int, error) { if len(c.GIDMappings) == 0 { return -1, errNoGIDMap } - id, found := c.hostIDFromMapping(containerId, c.GIDMappings) + id, found := c.hostIDFromMapping(int64(containerId), c.GIDMappings) if !found { - return -1, errNoGroupMap + return -1, fmt.Errorf("user namespaces enabled, but no mapping found for gid %d", containerId) + } + // If we are a 32-bit binary running on a 64-bit system, it's possible + // the mapped user is too large to store in an int, which means we + // cannot do the mapping. We can't just return an int64, because + // os.Setgid() takes an int. + if id > math.MaxInt { + return -1, fmt.Errorf("mapping for gid %d (host id %d) is larger than native integer size (%d)", containerId, id, math.MaxInt) } - return id, nil + return int(id), nil } // Return unchanged id. return containerId, nil @@ -57,7 +86,7 @@ func (c Config) HostRootGID() (int, error) { // Utility function that gets a host ID for a container ID from user namespace map // if that ID is present in the map. -func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) { +func (c Config) hostIDFromMapping(containerID int64, uMap []IDMap) (int64, bool) { for _, m := range uMap { if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) { hostID := m.HostID + (containerID - m.ContainerID) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go index e39440fc0..bfd356e49 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go @@ -3,5 +3,5 @@ package configs const ( // EXT_COPYUP is a directive to copy up the contents of a directory when // a tmpfs is mounted over it. - EXT_COPYUP = 1 << iota //nolint:golint // ignore "don't use ALL_CAPS" warning + EXT_COPYUP = 1 << iota //nolint:golint,revive // ignore "don't use ALL_CAPS" warning ) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount_linux.go index 6d4106de0..b69e9ab23 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount_linux.go @@ -2,6 +2,24 @@ package configs import "golang.org/x/sys/unix" +type MountIDMapping struct { + // Recursive indicates if the mapping needs to be recursive. + Recursive bool `json:"recursive"` + + // UserNSPath is a path to a user namespace that indicates the necessary + // id-mappings for MOUNT_ATTR_IDMAP. If set to non-"", UIDMappings and + // GIDMappings must be set to nil. + UserNSPath string `json:"userns_path,omitempty"` + + // UIDMappings is the uid mapping set for this mount, to be used with + // MOUNT_ATTR_IDMAP. + UIDMappings []IDMap `json:"uid_mappings,omitempty"` + + // GIDMappings is the gid mapping set for this mount, to be used with + // MOUNT_ATTR_IDMAP. + GIDMappings []IDMap `json:"gid_mappings,omitempty"` +} + type Mount struct { // Source path for the mount. Source string `json:"source"` @@ -15,6 +33,10 @@ type Mount struct { // Mount flags. Flags int `json:"flags"` + // Mount flags that were explicitly cleared in the configuration (meaning + // the user explicitly requested that these flags *not* be set). + ClearedFlags int `json:"cleared_flags"` + // Propagation Flags PropagationFlags []int `json:"propagation_flags"` @@ -30,17 +52,9 @@ type Mount struct { // Extensions are additional flags that are specific to runc. Extensions int `json:"extensions"` - // UIDMappings is used to changing file user owners w/o calling chown. - // Note that, the underlying filesystem should support this feature to be - // used. - // Every mount point could have its own mapping. - UIDMappings []IDMap `json:"uid_mappings,omitempty"` - - // GIDMappings is used to changing file group owners w/o calling chown. - // Note that, the underlying filesystem should support this feature to be - // used. - // Every mount point could have its own mapping. - GIDMappings []IDMap `json:"gid_mappings,omitempty"` + // Mapping is the MOUNT_ATTR_IDMAP configuration for the mount. If non-nil, + // the mount is configured to use MOUNT_ATTR_IDMAP-style id mappings. + IDMapping *MountIDMapping `json:"id_mapping,omitempty"` } func (m *Mount) IsBind() bool { @@ -48,5 +62,5 @@ func (m *Mount) IsBind() bool { } func (m *Mount) IsIDMapped() bool { - return len(m.UIDMappings) > 0 || len(m.GIDMappings) > 0 + return m.IDMapping != nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go index 5062432f8..898f96fd0 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go @@ -59,6 +59,9 @@ func IsNamespaceSupported(ns NamespaceType) bool { if nsFile == "" { return false } + // We don't need to use /proc/thread-self here because the list of + // namespace types is unrelated to the thread. This lets us avoid having to + // do runtime.LockOSThread. _, err := os.Stat("/proc/self/ns/" + nsFile) // a namespace is supported if it exists and we have permissions to read it supported = err == nil diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_deprecated.go b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_deprecated.go new file mode 100644 index 000000000..c6cd44345 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_deprecated.go @@ -0,0 +1,81 @@ +package user + +import ( + "io" + + "github.com/moby/sys/user" +) + +// LookupUser looks up a user by their username in /etc/passwd. If the user +// cannot be found (or there is no /etc/passwd file on the filesystem), then +// LookupUser returns an error. +func LookupUser(username string) (user.User, error) { + return user.LookupUser(username) +} + +// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot +// be found (or there is no /etc/passwd file on the filesystem), then LookupId +// returns an error. +func LookupUid(uid int) (user.User, error) { //nolint:revive // ignore var-naming: func LookupUid should be LookupUID + return user.LookupUid(uid) +} + +// LookupGroup looks up a group by its name in /etc/group. If the group cannot +// be found (or there is no /etc/group file on the filesystem), then LookupGroup +// returns an error. +func LookupGroup(groupname string) (user.Group, error) { + return user.LookupGroup(groupname) +} + +// LookupGid looks up a group by its group id in /etc/group. If the group cannot +// be found (or there is no /etc/group file on the filesystem), then LookupGid +// returns an error. +func LookupGid(gid int) (user.Group, error) { + return user.LookupGid(gid) +} + +func GetPasswdPath() (string, error) { + return user.GetPasswdPath() +} + +func GetPasswd() (io.ReadCloser, error) { + return user.GetPasswd() +} + +func GetGroupPath() (string, error) { + return user.GetGroupPath() +} + +func GetGroup() (io.ReadCloser, error) { + return user.GetGroup() +} + +// CurrentUser looks up the current user by their user id in /etc/passwd. If the +// user cannot be found (or there is no /etc/passwd file on the filesystem), +// then CurrentUser returns an error. +func CurrentUser() (user.User, error) { + return user.CurrentUser() +} + +// CurrentGroup looks up the current user's group by their primary group id's +// entry in /etc/passwd. If the group cannot be found (or there is no +// /etc/group file on the filesystem), then CurrentGroup returns an error. +func CurrentGroup() (user.Group, error) { + return user.CurrentGroup() +} + +func CurrentUserSubUIDs() ([]user.SubID, error) { + return user.CurrentUserSubUIDs() +} + +func CurrentUserSubGIDs() ([]user.SubID, error) { + return user.CurrentUserSubGIDs() +} + +func CurrentProcessUIDMap() ([]user.IDMap, error) { + return user.CurrentProcessUIDMap() +} + +func CurrentProcessGIDMap() ([]user.IDMap, error) { + return user.CurrentProcessGIDMap() +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go deleted file mode 100644 index f95c1409f..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go +++ /dev/null @@ -1,157 +0,0 @@ -//go:build darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris -// +build darwin dragonfly freebsd linux netbsd openbsd solaris - -package user - -import ( - "io" - "os" - "strconv" - - "golang.org/x/sys/unix" -) - -// Unix-specific path to the passwd and group formatted files. -const ( - unixPasswdPath = "/etc/passwd" - unixGroupPath = "/etc/group" -) - -// LookupUser looks up a user by their username in /etc/passwd. If the user -// cannot be found (or there is no /etc/passwd file on the filesystem), then -// LookupUser returns an error. -func LookupUser(username string) (User, error) { - return lookupUserFunc(func(u User) bool { - return u.Name == username - }) -} - -// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot -// be found (or there is no /etc/passwd file on the filesystem), then LookupId -// returns an error. -func LookupUid(uid int) (User, error) { - return lookupUserFunc(func(u User) bool { - return u.Uid == uid - }) -} - -func lookupUserFunc(filter func(u User) bool) (User, error) { - // Get operating system-specific passwd reader-closer. - passwd, err := GetPasswd() - if err != nil { - return User{}, err - } - defer passwd.Close() - - // Get the users. - users, err := ParsePasswdFilter(passwd, filter) - if err != nil { - return User{}, err - } - - // No user entries found. - if len(users) == 0 { - return User{}, ErrNoPasswdEntries - } - - // Assume the first entry is the "correct" one. - return users[0], nil -} - -// LookupGroup looks up a group by its name in /etc/group. If the group cannot -// be found (or there is no /etc/group file on the filesystem), then LookupGroup -// returns an error. -func LookupGroup(groupname string) (Group, error) { - return lookupGroupFunc(func(g Group) bool { - return g.Name == groupname - }) -} - -// LookupGid looks up a group by its group id in /etc/group. If the group cannot -// be found (or there is no /etc/group file on the filesystem), then LookupGid -// returns an error. -func LookupGid(gid int) (Group, error) { - return lookupGroupFunc(func(g Group) bool { - return g.Gid == gid - }) -} - -func lookupGroupFunc(filter func(g Group) bool) (Group, error) { - // Get operating system-specific group reader-closer. - group, err := GetGroup() - if err != nil { - return Group{}, err - } - defer group.Close() - - // Get the users. - groups, err := ParseGroupFilter(group, filter) - if err != nil { - return Group{}, err - } - - // No user entries found. - if len(groups) == 0 { - return Group{}, ErrNoGroupEntries - } - - // Assume the first entry is the "correct" one. - return groups[0], nil -} - -func GetPasswdPath() (string, error) { - return unixPasswdPath, nil -} - -func GetPasswd() (io.ReadCloser, error) { - return os.Open(unixPasswdPath) -} - -func GetGroupPath() (string, error) { - return unixGroupPath, nil -} - -func GetGroup() (io.ReadCloser, error) { - return os.Open(unixGroupPath) -} - -// CurrentUser looks up the current user by their user id in /etc/passwd. If the -// user cannot be found (or there is no /etc/passwd file on the filesystem), -// then CurrentUser returns an error. -func CurrentUser() (User, error) { - return LookupUid(unix.Getuid()) -} - -// CurrentGroup looks up the current user's group by their primary group id's -// entry in /etc/passwd. If the group cannot be found (or there is no -// /etc/group file on the filesystem), then CurrentGroup returns an error. -func CurrentGroup() (Group, error) { - return LookupGid(unix.Getgid()) -} - -func currentUserSubIDs(fileName string) ([]SubID, error) { - u, err := CurrentUser() - if err != nil { - return nil, err - } - filter := func(entry SubID) bool { - return entry.Name == u.Name || entry.Name == strconv.Itoa(u.Uid) - } - return ParseSubIDFileFilter(fileName, filter) -} - -func CurrentUserSubUIDs() ([]SubID, error) { - return currentUserSubIDs("/etc/subuid") -} - -func CurrentUserSubGIDs() ([]SubID, error) { - return currentUserSubIDs("/etc/subgid") -} - -func CurrentProcessUIDMap() ([]IDMap, error) { - return ParseIDMapFile("/proc/self/uid_map") -} - -func CurrentProcessGIDMap() ([]IDMap, error) { - return ParseIDMapFile("/proc/self/gid_map") -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go deleted file mode 100644 index 984466d1a..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go +++ /dev/null @@ -1,605 +0,0 @@ -package user - -import ( - "bufio" - "bytes" - "errors" - "fmt" - "io" - "os" - "strconv" - "strings" -) - -const ( - minID = 0 - maxID = 1<<31 - 1 // for 32-bit systems compatibility -) - -var ( - // ErrNoPasswdEntries is returned if no matching entries were found in /etc/group. - ErrNoPasswdEntries = errors.New("no matching entries in passwd file") - // ErrNoGroupEntries is returned if no matching entries were found in /etc/passwd. - ErrNoGroupEntries = errors.New("no matching entries in group file") - // ErrRange is returned if a UID or GID is outside of the valid range. - ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minID, maxID) -) - -type User struct { - Name string - Pass string - Uid int - Gid int - Gecos string - Home string - Shell string -} - -type Group struct { - Name string - Pass string - Gid int - List []string -} - -// SubID represents an entry in /etc/sub{u,g}id -type SubID struct { - Name string - SubID int64 - Count int64 -} - -// IDMap represents an entry in /proc/PID/{u,g}id_map -type IDMap struct { - ID int64 - ParentID int64 - Count int64 -} - -func parseLine(line []byte, v ...interface{}) { - parseParts(bytes.Split(line, []byte(":")), v...) -} - -func parseParts(parts [][]byte, v ...interface{}) { - if len(parts) == 0 { - return - } - - for i, p := range parts { - // Ignore cases where we don't have enough fields to populate the arguments. - // Some configuration files like to misbehave. - if len(v) <= i { - break - } - - // Use the type of the argument to figure out how to parse it, scanf() style. - // This is legit. - switch e := v[i].(type) { - case *string: - *e = string(p) - case *int: - // "numbers", with conversion errors ignored because of some misbehaving configuration files. - *e, _ = strconv.Atoi(string(p)) - case *int64: - *e, _ = strconv.ParseInt(string(p), 10, 64) - case *[]string: - // Comma-separated lists. - if len(p) != 0 { - *e = strings.Split(string(p), ",") - } else { - *e = []string{} - } - default: - // Someone goof'd when writing code using this function. Scream so they can hear us. - panic(fmt.Sprintf("parseLine only accepts {*string, *int, *int64, *[]string} as arguments! %#v is not a pointer!", e)) - } - } -} - -func ParsePasswdFile(path string) ([]User, error) { - passwd, err := os.Open(path) - if err != nil { - return nil, err - } - defer passwd.Close() - return ParsePasswd(passwd) -} - -func ParsePasswd(passwd io.Reader) ([]User, error) { - return ParsePasswdFilter(passwd, nil) -} - -func ParsePasswdFileFilter(path string, filter func(User) bool) ([]User, error) { - passwd, err := os.Open(path) - if err != nil { - return nil, err - } - defer passwd.Close() - return ParsePasswdFilter(passwd, filter) -} - -func ParsePasswdFilter(r io.Reader, filter func(User) bool) ([]User, error) { - if r == nil { - return nil, errors.New("nil source for passwd-formatted data") - } - - var ( - s = bufio.NewScanner(r) - out = []User{} - ) - - for s.Scan() { - line := bytes.TrimSpace(s.Bytes()) - if len(line) == 0 { - continue - } - - // see: man 5 passwd - // name:password:UID:GID:GECOS:directory:shell - // Name:Pass:Uid:Gid:Gecos:Home:Shell - // root:x:0:0:root:/root:/bin/bash - // adm:x:3:4:adm:/var/adm:/bin/false - p := User{} - parseLine(line, &p.Name, &p.Pass, &p.Uid, &p.Gid, &p.Gecos, &p.Home, &p.Shell) - - if filter == nil || filter(p) { - out = append(out, p) - } - } - if err := s.Err(); err != nil { - return nil, err - } - - return out, nil -} - -func ParseGroupFile(path string) ([]Group, error) { - group, err := os.Open(path) - if err != nil { - return nil, err - } - - defer group.Close() - return ParseGroup(group) -} - -func ParseGroup(group io.Reader) ([]Group, error) { - return ParseGroupFilter(group, nil) -} - -func ParseGroupFileFilter(path string, filter func(Group) bool) ([]Group, error) { - group, err := os.Open(path) - if err != nil { - return nil, err - } - defer group.Close() - return ParseGroupFilter(group, filter) -} - -func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) { - if r == nil { - return nil, errors.New("nil source for group-formatted data") - } - rd := bufio.NewReader(r) - out := []Group{} - - // Read the file line-by-line. - for { - var ( - isPrefix bool - wholeLine []byte - err error - ) - - // Read the next line. We do so in chunks (as much as reader's - // buffer is able to keep), check if we read enough columns - // already on each step and store final result in wholeLine. - for { - var line []byte - line, isPrefix, err = rd.ReadLine() - - if err != nil { - // We should return no error if EOF is reached - // without a match. - if err == io.EOF { - err = nil - } - return out, err - } - - // Simple common case: line is short enough to fit in a - // single reader's buffer. - if !isPrefix && len(wholeLine) == 0 { - wholeLine = line - break - } - - wholeLine = append(wholeLine, line...) - - // Check if we read the whole line already. - if !isPrefix { - break - } - } - - // There's no spec for /etc/passwd or /etc/group, but we try to follow - // the same rules as the glibc parser, which allows comments and blank - // space at the beginning of a line. - wholeLine = bytes.TrimSpace(wholeLine) - if len(wholeLine) == 0 || wholeLine[0] == '#' { - continue - } - - // see: man 5 group - // group_name:password:GID:user_list - // Name:Pass:Gid:List - // root:x:0:root - // adm:x:4:root,adm,daemon - p := Group{} - parseLine(wholeLine, &p.Name, &p.Pass, &p.Gid, &p.List) - - if filter == nil || filter(p) { - out = append(out, p) - } - } -} - -type ExecUser struct { - Uid int - Gid int - Sgids []int - Home string -} - -// GetExecUserPath is a wrapper for GetExecUser. It reads data from each of the -// given file paths and uses that data as the arguments to GetExecUser. If the -// files cannot be opened for any reason, the error is ignored and a nil -// io.Reader is passed instead. -func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath string) (*ExecUser, error) { - var passwd, group io.Reader - - if passwdFile, err := os.Open(passwdPath); err == nil { - passwd = passwdFile - defer passwdFile.Close() - } - - if groupFile, err := os.Open(groupPath); err == nil { - group = groupFile - defer groupFile.Close() - } - - return GetExecUser(userSpec, defaults, passwd, group) -} - -// GetExecUser parses a user specification string (using the passwd and group -// readers as sources for /etc/passwd and /etc/group data, respectively). In -// the case of blank fields or missing data from the sources, the values in -// defaults is used. -// -// GetExecUser will return an error if a user or group literal could not be -// found in any entry in passwd and group respectively. -// -// Examples of valid user specifications are: -// - "" -// - "user" -// - "uid" -// - "user:group" -// - "uid:gid -// - "user:gid" -// - "uid:group" -// -// It should be noted that if you specify a numeric user or group id, they will -// not be evaluated as usernames (only the metadata will be filled). So attempting -// to parse a user with user.Name = "1337" will produce the user with a UID of -// 1337. -func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (*ExecUser, error) { - if defaults == nil { - defaults = new(ExecUser) - } - - // Copy over defaults. - user := &ExecUser{ - Uid: defaults.Uid, - Gid: defaults.Gid, - Sgids: defaults.Sgids, - Home: defaults.Home, - } - - // Sgids slice *cannot* be nil. - if user.Sgids == nil { - user.Sgids = []int{} - } - - // Allow for userArg to have either "user" syntax, or optionally "user:group" syntax - var userArg, groupArg string - parseLine([]byte(userSpec), &userArg, &groupArg) - - // Convert userArg and groupArg to be numeric, so we don't have to execute - // Atoi *twice* for each iteration over lines. - uidArg, uidErr := strconv.Atoi(userArg) - gidArg, gidErr := strconv.Atoi(groupArg) - - // Find the matching user. - users, err := ParsePasswdFilter(passwd, func(u User) bool { - if userArg == "" { - // Default to current state of the user. - return u.Uid == user.Uid - } - - if uidErr == nil { - // If the userArg is numeric, always treat it as a UID. - return uidArg == u.Uid - } - - return u.Name == userArg - }) - - // If we can't find the user, we have to bail. - if err != nil && passwd != nil { - if userArg == "" { - userArg = strconv.Itoa(user.Uid) - } - return nil, fmt.Errorf("unable to find user %s: %w", userArg, err) - } - - var matchedUserName string - if len(users) > 0 { - // First match wins, even if there's more than one matching entry. - matchedUserName = users[0].Name - user.Uid = users[0].Uid - user.Gid = users[0].Gid - user.Home = users[0].Home - } else if userArg != "" { - // If we can't find a user with the given username, the only other valid - // option is if it's a numeric username with no associated entry in passwd. - - if uidErr != nil { - // Not numeric. - return nil, fmt.Errorf("unable to find user %s: %w", userArg, ErrNoPasswdEntries) - } - user.Uid = uidArg - - // Must be inside valid uid range. - if user.Uid < minID || user.Uid > maxID { - return nil, ErrRange - } - - // Okay, so it's numeric. We can just roll with this. - } - - // On to the groups. If we matched a username, we need to do this because of - // the supplementary group IDs. - if groupArg != "" || matchedUserName != "" { - groups, err := ParseGroupFilter(group, func(g Group) bool { - // If the group argument isn't explicit, we'll just search for it. - if groupArg == "" { - // Check if user is a member of this group. - for _, u := range g.List { - if u == matchedUserName { - return true - } - } - return false - } - - if gidErr == nil { - // If the groupArg is numeric, always treat it as a GID. - return gidArg == g.Gid - } - - return g.Name == groupArg - }) - if err != nil && group != nil { - return nil, fmt.Errorf("unable to find groups for spec %v: %w", matchedUserName, err) - } - - // Only start modifying user.Gid if it is in explicit form. - if groupArg != "" { - if len(groups) > 0 { - // First match wins, even if there's more than one matching entry. - user.Gid = groups[0].Gid - } else { - // If we can't find a group with the given name, the only other valid - // option is if it's a numeric group name with no associated entry in group. - - if gidErr != nil { - // Not numeric. - return nil, fmt.Errorf("unable to find group %s: %w", groupArg, ErrNoGroupEntries) - } - user.Gid = gidArg - - // Must be inside valid gid range. - if user.Gid < minID || user.Gid > maxID { - return nil, ErrRange - } - - // Okay, so it's numeric. We can just roll with this. - } - } else if len(groups) > 0 { - // Supplementary group ids only make sense if in the implicit form. - user.Sgids = make([]int, len(groups)) - for i, group := range groups { - user.Sgids[i] = group.Gid - } - } - } - - return user, nil -} - -// GetAdditionalGroups looks up a list of groups by name or group id -// against the given /etc/group formatted data. If a group name cannot -// be found, an error will be returned. If a group id cannot be found, -// or the given group data is nil, the id will be returned as-is -// provided it is in the legal range. -func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, error) { - groups := []Group{} - if group != nil { - var err error - groups, err = ParseGroupFilter(group, func(g Group) bool { - for _, ag := range additionalGroups { - if g.Name == ag || strconv.Itoa(g.Gid) == ag { - return true - } - } - return false - }) - if err != nil { - return nil, fmt.Errorf("Unable to find additional groups %v: %w", additionalGroups, err) - } - } - - gidMap := make(map[int]struct{}) - for _, ag := range additionalGroups { - var found bool - for _, g := range groups { - // if we found a matched group either by name or gid, take the - // first matched as correct - if g.Name == ag || strconv.Itoa(g.Gid) == ag { - if _, ok := gidMap[g.Gid]; !ok { - gidMap[g.Gid] = struct{}{} - found = true - break - } - } - } - // we asked for a group but didn't find it. let's check to see - // if we wanted a numeric group - if !found { - gid, err := strconv.ParseInt(ag, 10, 64) - if err != nil { - // Not a numeric ID either. - return nil, fmt.Errorf("Unable to find group %s: %w", ag, ErrNoGroupEntries) - } - // Ensure gid is inside gid range. - if gid < minID || gid > maxID { - return nil, ErrRange - } - gidMap[int(gid)] = struct{}{} - } - } - gids := []int{} - for gid := range gidMap { - gids = append(gids, gid) - } - return gids, nil -} - -// GetAdditionalGroupsPath is a wrapper around GetAdditionalGroups -// that opens the groupPath given and gives it as an argument to -// GetAdditionalGroups. -func GetAdditionalGroupsPath(additionalGroups []string, groupPath string) ([]int, error) { - var group io.Reader - - if groupFile, err := os.Open(groupPath); err == nil { - group = groupFile - defer groupFile.Close() - } - return GetAdditionalGroups(additionalGroups, group) -} - -func ParseSubIDFile(path string) ([]SubID, error) { - subid, err := os.Open(path) - if err != nil { - return nil, err - } - defer subid.Close() - return ParseSubID(subid) -} - -func ParseSubID(subid io.Reader) ([]SubID, error) { - return ParseSubIDFilter(subid, nil) -} - -func ParseSubIDFileFilter(path string, filter func(SubID) bool) ([]SubID, error) { - subid, err := os.Open(path) - if err != nil { - return nil, err - } - defer subid.Close() - return ParseSubIDFilter(subid, filter) -} - -func ParseSubIDFilter(r io.Reader, filter func(SubID) bool) ([]SubID, error) { - if r == nil { - return nil, errors.New("nil source for subid-formatted data") - } - - var ( - s = bufio.NewScanner(r) - out = []SubID{} - ) - - for s.Scan() { - line := bytes.TrimSpace(s.Bytes()) - if len(line) == 0 { - continue - } - - // see: man 5 subuid - p := SubID{} - parseLine(line, &p.Name, &p.SubID, &p.Count) - - if filter == nil || filter(p) { - out = append(out, p) - } - } - if err := s.Err(); err != nil { - return nil, err - } - - return out, nil -} - -func ParseIDMapFile(path string) ([]IDMap, error) { - r, err := os.Open(path) - if err != nil { - return nil, err - } - defer r.Close() - return ParseIDMap(r) -} - -func ParseIDMap(r io.Reader) ([]IDMap, error) { - return ParseIDMapFilter(r, nil) -} - -func ParseIDMapFileFilter(path string, filter func(IDMap) bool) ([]IDMap, error) { - r, err := os.Open(path) - if err != nil { - return nil, err - } - defer r.Close() - return ParseIDMapFilter(r, filter) -} - -func ParseIDMapFilter(r io.Reader, filter func(IDMap) bool) ([]IDMap, error) { - if r == nil { - return nil, errors.New("nil source for idmap-formatted data") - } - - var ( - s = bufio.NewScanner(r) - out = []IDMap{} - ) - - for s.Scan() { - line := bytes.TrimSpace(s.Bytes()) - if len(line) == 0 { - continue - } - - // see: man 7 user_namespaces - p := IDMap{} - parseParts(bytes.Fields(line), &p.ID, &p.ParentID, &p.Count) - - if filter == nil || filter(p) { - out = append(out, p) - } - } - if err := s.Err(); err != nil { - return nil, err - } - - return out, nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/user_deprecated.go b/vendor/github.com/opencontainers/runc/libcontainer/user/user_deprecated.go new file mode 100644 index 000000000..3c29f3d1d --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/user/user_deprecated.go @@ -0,0 +1,146 @@ +// Package user is an alias for [github.com/moby/sys/user]. +// +// Deprecated: use [github.com/moby/sys/user]. +package user + +import ( + "io" + + "github.com/moby/sys/user" +) + +var ( + // ErrNoPasswdEntries is returned if no matching entries were found in /etc/group. + ErrNoPasswdEntries = user.ErrNoPasswdEntries + // ErrNoGroupEntries is returned if no matching entries were found in /etc/passwd. + ErrNoGroupEntries = user.ErrNoGroupEntries + // ErrRange is returned if a UID or GID is outside of the valid range. + ErrRange = user.ErrRange +) + +type ( + User = user.User + + Group = user.Group + + // SubID represents an entry in /etc/sub{u,g}id. + SubID = user.SubID + + // IDMap represents an entry in /proc/PID/{u,g}id_map. + IDMap = user.IDMap + + ExecUser = user.ExecUser +) + +func ParsePasswdFile(path string) ([]user.User, error) { + return user.ParsePasswdFile(path) +} + +func ParsePasswd(passwd io.Reader) ([]user.User, error) { + return user.ParsePasswd(passwd) +} + +func ParsePasswdFileFilter(path string, filter func(user.User) bool) ([]user.User, error) { + return user.ParsePasswdFileFilter(path, filter) +} + +func ParsePasswdFilter(r io.Reader, filter func(user.User) bool) ([]user.User, error) { + return user.ParsePasswdFilter(r, filter) +} + +func ParseGroupFile(path string) ([]user.Group, error) { + return user.ParseGroupFile(path) +} + +func ParseGroup(group io.Reader) ([]user.Group, error) { + return user.ParseGroup(group) +} + +func ParseGroupFileFilter(path string, filter func(user.Group) bool) ([]user.Group, error) { + return user.ParseGroupFileFilter(path, filter) +} + +func ParseGroupFilter(r io.Reader, filter func(user.Group) bool) ([]user.Group, error) { + return user.ParseGroupFilter(r, filter) +} + +// GetExecUserPath is a wrapper for GetExecUser. It reads data from each of the +// given file paths and uses that data as the arguments to GetExecUser. If the +// files cannot be opened for any reason, the error is ignored and a nil +// io.Reader is passed instead. +func GetExecUserPath(userSpec string, defaults *user.ExecUser, passwdPath, groupPath string) (*user.ExecUser, error) { + return user.GetExecUserPath(userSpec, defaults, passwdPath, groupPath) +} + +// GetExecUser parses a user specification string (using the passwd and group +// readers as sources for /etc/passwd and /etc/group data, respectively). In +// the case of blank fields or missing data from the sources, the values in +// defaults is used. +// +// GetExecUser will return an error if a user or group literal could not be +// found in any entry in passwd and group respectively. +// +// Examples of valid user specifications are: +// - "" +// - "user" +// - "uid" +// - "user:group" +// - "uid:gid +// - "user:gid" +// - "uid:group" +// +// It should be noted that if you specify a numeric user or group id, they will +// not be evaluated as usernames (only the metadata will be filled). So attempting +// to parse a user with user.Name = "1337" will produce the user with a UID of +// 1337. +func GetExecUser(userSpec string, defaults *user.ExecUser, passwd, group io.Reader) (*user.ExecUser, error) { + return user.GetExecUser(userSpec, defaults, passwd, group) +} + +// GetAdditionalGroups looks up a list of groups by name or group id +// against the given /etc/group formatted data. If a group name cannot +// be found, an error will be returned. If a group id cannot be found, +// or the given group data is nil, the id will be returned as-is +// provided it is in the legal range. +func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, error) { + return user.GetAdditionalGroups(additionalGroups, group) +} + +// GetAdditionalGroupsPath is a wrapper around GetAdditionalGroups +// that opens the groupPath given and gives it as an argument to +// GetAdditionalGroups. +func GetAdditionalGroupsPath(additionalGroups []string, groupPath string) ([]int, error) { + return user.GetAdditionalGroupsPath(additionalGroups, groupPath) +} + +func ParseSubIDFile(path string) ([]user.SubID, error) { + return user.ParseSubIDFile(path) +} + +func ParseSubID(subid io.Reader) ([]user.SubID, error) { + return user.ParseSubID(subid) +} + +func ParseSubIDFileFilter(path string, filter func(user.SubID) bool) ([]user.SubID, error) { + return user.ParseSubIDFileFilter(path, filter) +} + +func ParseSubIDFilter(r io.Reader, filter func(user.SubID) bool) ([]user.SubID, error) { + return user.ParseSubIDFilter(r, filter) +} + +func ParseIDMapFile(path string) ([]user.IDMap, error) { + return user.ParseIDMapFile(path) +} + +func ParseIDMap(r io.Reader) ([]user.IDMap, error) { + return user.ParseIDMap(r) +} + +func ParseIDMapFileFilter(path string, filter func(user.IDMap) bool) ([]user.IDMap, error) { + return user.ParseIDMapFileFilter(path, filter) +} + +func ParseIDMapFilter(r io.Reader, filter func(user.IDMap) bool) ([]user.IDMap, error) { + return user.ParseIDMapFilter(r, filter) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/user_fuzzer.go b/vendor/github.com/opencontainers/runc/libcontainer/user/user_fuzzer.go deleted file mode 100644 index e018eae61..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/user_fuzzer.go +++ /dev/null @@ -1,43 +0,0 @@ -//go:build gofuzz -// +build gofuzz - -package user - -import ( - "io" - "strings" -) - -func IsDivisbleBy(n int, divisibleby int) bool { - return (n % divisibleby) == 0 -} - -func FuzzUser(data []byte) int { - if len(data) == 0 { - return -1 - } - if !IsDivisbleBy(len(data), 5) { - return -1 - } - - var divided [][]byte - - chunkSize := len(data) / 5 - - for i := 0; i < len(data); i += chunkSize { - end := i + chunkSize - - divided = append(divided, data[i:end]) - } - - _, _ = ParsePasswdFilter(strings.NewReader(string(divided[0])), nil) - - var passwd, group io.Reader - - group = strings.NewReader(string(divided[1])) - _, _ = GetAdditionalGroups([]string{string(divided[2])}, group) - - passwd = strings.NewReader(string(divided[3])) - _, _ = GetExecUser(string(divided[4]), nil, passwd, group) - return 1 -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_maps.c b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_maps.c new file mode 100644 index 000000000..84f2c6188 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_maps.c @@ -0,0 +1,79 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +/* + * All of the code here is run inside an aync-signal-safe context, so we need + * to be careful to not call any functions that could cause issues. In theory, + * since we are a Go program, there are fewer restrictions in practice, it's + * better to be safe than sorry. + * + * The only exception is exit, which we need to call to make sure we don't + * return into runc. + */ + +void bail(int pipefd, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vdprintf(pipefd, fmt, args); + va_end(args); + + exit(1); +} + +int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd) +{ + char buffer[4096] = { 0 }; + + pid_t child = fork(); + if (child != 0) + return child; + /* in child */ + + /* Join the target userns. */ + int nsfd = open(userns_path, O_RDONLY); + if (nsfd < 0) + bail(errfd, "open userns path %s failed: %m", userns_path); + + int err = setns(nsfd, CLONE_NEWUSER); + if (err < 0) + bail(errfd, "setns %s failed: %m", userns_path); + + close(nsfd); + + /* Pipe the requested file contents. */ + int fd = open(path, O_RDONLY); + if (fd < 0) + bail(errfd, "open %s in userns %s failed: %m", path, userns_path); + + int nread, ntotal = 0; + while ((nread = read(fd, buffer, sizeof(buffer))) != 0) { + if (nread < 0) + bail(errfd, "read bytes from %s failed (after %d total bytes read): %m", path, ntotal); + ntotal += nread; + + int nwritten = 0; + while (nwritten < nread) { + int n = write(outfd, buffer, nread - nwritten); + if (n < 0) + bail(errfd, "write %d bytes from %s failed (after %d bytes written): %m", + nread - nwritten, path, nwritten); + nwritten += n; + } + if (nread != nwritten) + bail(errfd, "mismatch for bytes read and written: %d read != %d written", nread, nwritten); + } + + close(fd); + close(outfd); + close(errfd); + + /* We must exit here, otherwise we would return into a forked runc. */ + exit(0); +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_maps_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_maps_linux.go new file mode 100644 index 000000000..7a8c2b023 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_maps_linux.go @@ -0,0 +1,186 @@ +//go:build linux + +package userns + +import ( + "bufio" + "bytes" + "fmt" + "io" + "os" + "unsafe" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/sirupsen/logrus" +) + +/* +#include +extern int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd); +*/ +import "C" + +func parseIdmapData(data []byte) (ms []configs.IDMap, err error) { + scanner := bufio.NewScanner(bytes.NewReader(data)) + for scanner.Scan() { + var m configs.IDMap + line := scanner.Text() + if _, err := fmt.Sscanf(line, "%d %d %d", &m.ContainerID, &m.HostID, &m.Size); err != nil { + return nil, fmt.Errorf("parsing id map failed: invalid format in line %q: %w", line, err) + } + ms = append(ms, m) + } + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("parsing id map failed: %w", err) + } + return ms, nil +} + +// Do something equivalent to nsenter --user= cat , but more +// efficiently. Returns the contents of the requested file from within the user +// namespace. +func spawnUserNamespaceCat(nsPath string, path string) ([]byte, error) { + rdr, wtr, err := os.Pipe() + if err != nil { + return nil, fmt.Errorf("create pipe for userns spawn failed: %w", err) + } + defer rdr.Close() + defer wtr.Close() + + errRdr, errWtr, err := os.Pipe() + if err != nil { + return nil, fmt.Errorf("create error pipe for userns spawn failed: %w", err) + } + defer errRdr.Close() + defer errWtr.Close() + + cNsPath := C.CString(nsPath) + defer C.free(unsafe.Pointer(cNsPath)) + cPath := C.CString(path) + defer C.free(unsafe.Pointer(cPath)) + + childPid := C.spawn_userns_cat(cNsPath, cPath, C.int(wtr.Fd()), C.int(errWtr.Fd())) + + if childPid < 0 { + return nil, fmt.Errorf("failed to spawn fork for userns") + } else if childPid == 0 { + // this should never happen + panic("runc executing inside fork child -- unsafe state!") + } + + // We are in the parent -- close the write end of the pipe before reading. + wtr.Close() + output, err := io.ReadAll(rdr) + rdr.Close() + if err != nil { + return nil, fmt.Errorf("reading from userns spawn failed: %w", err) + } + + // Ditto for the error pipe. + errWtr.Close() + errOutput, err := io.ReadAll(errRdr) + errRdr.Close() + if err != nil { + return nil, fmt.Errorf("reading from userns spawn error pipe failed: %w", err) + } + errOutput = bytes.TrimSpace(errOutput) + + // Clean up the child. + child, err := os.FindProcess(int(childPid)) + if err != nil { + return nil, fmt.Errorf("could not find userns spawn process: %w", err) + } + state, err := child.Wait() + if err != nil { + return nil, fmt.Errorf("failed to wait for userns spawn process: %w", err) + } + if !state.Success() { + errStr := string(errOutput) + if errStr == "" { + errStr = fmt.Sprintf("unknown error (status code %d)", state.ExitCode()) + } + return nil, fmt.Errorf("userns spawn: %s", errStr) + } else if len(errOutput) > 0 { + // We can just ignore weird output in the error pipe if the process + // didn't bail(), but for completeness output for debugging. + logrus.Debugf("userns spawn succeeded but unexpected error message found: %s", string(errOutput)) + } + // The subprocess succeeded, return whatever it wrote to the pipe. + return output, nil +} + +func GetUserNamespaceMappings(nsPath string) (uidMap, gidMap []configs.IDMap, err error) { + var ( + pid int + extra rune + tryFastPath bool + ) + + // nsPath is usually of the form /proc//ns/user, which means that we + // already have a pid that is part of the user namespace and thus we can + // just use the pid to read from /proc//*id_map. + // + // Note that Sscanf doesn't consume the whole input, so we check for any + // trailing data with %c. That way, we can be sure the pattern matched + // /proc/$pid/ns/user _exactly_ iff n === 1. + if n, _ := fmt.Sscanf(nsPath, "/proc/%d/ns/user%c", &pid, &extra); n == 1 { + tryFastPath = pid > 0 + } + + for _, mapType := range []struct { + name string + idMap *[]configs.IDMap + }{ + {"uid_map", &uidMap}, + {"gid_map", &gidMap}, + } { + var mapData []byte + + if tryFastPath { + path := fmt.Sprintf("/proc/%d/%s", pid, mapType.name) + data, err := os.ReadFile(path) + if err != nil { + // Do not error out here -- we need to try the slow path if the + // fast path failed. + logrus.Debugf("failed to use fast path to read %s from userns %s (error: %s), falling back to slow userns-join path", mapType.name, nsPath, err) + } else { + mapData = data + } + } else { + logrus.Debugf("cannot use fast path to read %s from userns %s, falling back to slow userns-join path", mapType.name, nsPath) + } + + if mapData == nil { + // We have to actually join the namespace if we cannot take the + // fast path. The path is resolved with respect to the child + // process, so just use /proc/self. + data, err := spawnUserNamespaceCat(nsPath, "/proc/self/"+mapType.name) + if err != nil { + return nil, nil, err + } + mapData = data + } + idMap, err := parseIdmapData(mapData) + if err != nil { + return nil, nil, fmt.Errorf("failed to parse %s of userns %s: %w", mapType.name, nsPath, err) + } + *mapType.idMap = idMap + } + + return uidMap, gidMap, nil +} + +// IsSameMapping returns whether or not the two id mappings are the same. Note +// that if the order of the mappings is different, or a mapping has been split, +// the mappings will be considered different. +func IsSameMapping(a, b []configs.IDMap) bool { + if len(a) != len(b) { + return false + } + for idx := range a { + if a[idx] != b[idx] { + return false + } + } + return true +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/userns/usernsfd_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/userns/usernsfd_linux.go new file mode 100644 index 000000000..2eb64cf76 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/usernsfd_linux.go @@ -0,0 +1,156 @@ +package userns + +import ( + "fmt" + "os" + "sort" + "strings" + "sync" + "syscall" + + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +type Mapping struct { + UIDMappings []configs.IDMap + GIDMappings []configs.IDMap +} + +func (m Mapping) toSys() (uids, gids []syscall.SysProcIDMap) { + for _, uid := range m.UIDMappings { + uids = append(uids, syscall.SysProcIDMap{ + ContainerID: int(uid.ContainerID), + HostID: int(uid.HostID), + Size: int(uid.Size), + }) + } + for _, gid := range m.GIDMappings { + gids = append(gids, syscall.SysProcIDMap{ + ContainerID: int(gid.ContainerID), + HostID: int(gid.HostID), + Size: int(gid.Size), + }) + } + return +} + +// id returns a unique identifier for this mapping, agnostic of the order of +// the uid and gid mappings (because the order doesn't matter to the kernel). +// The set of userns handles is indexed using this ID. +func (m Mapping) id() string { + var uids, gids []string + for _, idmap := range m.UIDMappings { + uids = append(uids, fmt.Sprintf("%d:%d:%d", idmap.ContainerID, idmap.HostID, idmap.Size)) + } + for _, idmap := range m.GIDMappings { + gids = append(gids, fmt.Sprintf("%d:%d:%d", idmap.ContainerID, idmap.HostID, idmap.Size)) + } + // We don't care about the sort order -- just sort them. + sort.Strings(uids) + sort.Strings(gids) + return "uid=" + strings.Join(uids, ",") + ";gid=" + strings.Join(gids, ",") +} + +type Handles struct { + m sync.Mutex + maps map[string]*os.File +} + +// Release all resources associated with this Handle. All existing files +// returned from Get() will continue to work even after calling Release(). The +// same Handles can be re-used after calling Release(). +func (hs *Handles) Release() { + hs.m.Lock() + defer hs.m.Unlock() + + // Close the files for good measure, though GC will do that for us anyway. + for _, file := range hs.maps { + _ = file.Close() + } + hs.maps = nil +} + +func spawnProc(req Mapping) (*os.Process, error) { + // We need to spawn a subprocess with the requested mappings, which is + // unfortunately quite expensive. The "safe" way of doing this is natively + // with Go (and then spawning something like "sleep infinity"), but + // execve() is a waste of cycles because we just need some process to have + // the right mapping, we don't care what it's executing. The "unsafe" + // option of doing a clone() behind the back of Go is probably okay in + // theory as long as we just do kill(getpid(), SIGSTOP). However, if we + // tell Go to put the new process into PTRACE_TRACEME mode, we can avoid + // the exec and not have to faff around with the mappings. + // + // Note that Go's stdlib does not support newuidmap, but in the case of + // id-mapped mounts, it seems incredibly unlikely that the user will be + // requesting us to do a remapping as an unprivileged user with mappings + // they have privileges over. + logrus.Debugf("spawning dummy process for id-mapping %s", req.id()) + uidMappings, gidMappings := req.toSys() + // We don't need to use /proc/thread-self here because the exe mm of a + // thread-group is guaranteed to be the same for all threads by definition. + // This lets us avoid having to do runtime.LockOSThread. + return os.StartProcess("/proc/self/exe", []string{"runc", "--help"}, &os.ProcAttr{ + Sys: &syscall.SysProcAttr{ + Cloneflags: unix.CLONE_NEWUSER, + UidMappings: uidMappings, + GidMappings: gidMappings, + GidMappingsEnableSetgroups: false, + // Put the process into PTRACE_TRACEME mode to allow us to get the + // userns without having a proper execve() target. + Ptrace: true, + }, + }) +} + +func dupFile(f *os.File) (*os.File, error) { + newFd, err := unix.FcntlInt(f.Fd(), unix.F_DUPFD_CLOEXEC, 0) + if err != nil { + return nil, os.NewSyscallError("fcntl(F_DUPFD_CLOEXEC)", err) + } + return os.NewFile(uintptr(newFd), f.Name()), nil +} + +// Get returns a handle to a /proc/$pid/ns/user nsfs file with the requested +// mapping. The processes spawned to produce userns nsfds are cached, so if +// equivalent user namespace mappings are requested, the same user namespace +// will be returned. The caller is responsible for closing the returned file +// descriptor. +func (hs *Handles) Get(req Mapping) (file *os.File, err error) { + hs.m.Lock() + defer hs.m.Unlock() + + if hs.maps == nil { + hs.maps = make(map[string]*os.File) + } + + file, ok := hs.maps[req.id()] + if !ok { + proc, err := spawnProc(req) + if err != nil { + return nil, fmt.Errorf("failed to spawn dummy process for map %s: %w", req.id(), err) + } + // Make sure we kill the helper process. We ignore errors because + // there's not much we can do about them anyway, and ultimately + defer func() { + _ = proc.Kill() + _, _ = proc.Wait() + }() + + // Stash away a handle to the userns file. This is neater than keeping + // the process alive, because Go's GC can handle files much better than + // leaked processes, and having long-living useless processes seems + // less than ideal. + file, err = os.Open(fmt.Sprintf("/proc/%d/ns/user", proc.Pid)) + if err != nil { + return nil, err + } + hs.maps[req.id()] = file + } + // Duplicate the file, to make sure the lifecycle of each *os.File we + // return is independent. + return dupFile(file) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go index dbd435341..1b523d8ac 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go @@ -3,15 +3,12 @@ package utils import ( "encoding/binary" "encoding/json" - "fmt" "io" "os" "path/filepath" - "strconv" "strings" "unsafe" - securejoin "github.com/cyphar/filepath-securejoin" "golang.org/x/sys/unix" ) @@ -43,6 +40,9 @@ func ExitStatus(status unix.WaitStatus) int { } // WriteJSON writes the provided struct v to w using standard json marshaling +// without a trailing newline. This is used instead of json.Encoder because +// there might be a problem in json decoder in some cases, see: +// https://github.com/docker/docker/issues/14203#issuecomment-174177790 func WriteJSON(w io.Writer, v interface{}) error { data, err := json.Marshal(v) if err != nil { @@ -99,39 +99,6 @@ func stripRoot(root, path string) string { return CleanPath("/" + path) } -// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...) -// corresponding to the unsafePath resolved within the root. Before passing the -// fd, this path is verified to have been inside the root -- so operating on it -// through the passed fdpath should be safe. Do not access this path through -// the original path strings, and do not attempt to use the pathname outside of -// the passed closure (the file handle will be freed once the closure returns). -func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { - // Remove the root then forcefully resolve inside the root. - unsafePath = stripRoot(root, unsafePath) - path, err := securejoin.SecureJoin(root, unsafePath) - if err != nil { - return fmt.Errorf("resolving path inside rootfs failed: %w", err) - } - - // Open the target path. - fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0) - if err != nil { - return fmt.Errorf("open o_path procfd: %w", err) - } - defer fh.Close() - - // Double-check the path is the one we expected. - procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd())) - if realpath, err := os.Readlink(procfd); err != nil { - return fmt.Errorf("procfd verification failed: %w", err) - } else if realpath != path { - return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath) - } - - // Run the closure. - return fn(procfd) -} - // SearchLabels searches through a list of key=value pairs for a given key, // returning its value, and the binary flag telling whether the key exist. func SearchLabels(labels []string, key string) (string, bool) { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go index ca520b63b..f57f0874a 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go @@ -7,9 +7,14 @@ import ( "fmt" "math" "os" + "path/filepath" + "runtime" "strconv" "sync" + _ "unsafe" // for go:linkname + securejoin "github.com/cyphar/filepath-securejoin" + "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -49,15 +54,15 @@ func haveCloseRangeCloexec() bool { return haveCloseRangeCloexecBool } -// CloseExecFrom applies O_CLOEXEC to all file descriptors currently open for -// the process (except for those below the given fd value). -func CloseExecFrom(minFd int) error { - if haveCloseRangeCloexec() { - err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC) - return os.NewSyscallError("close_range", err) - } +type fdFunc func(fd int) + +// fdRangeFrom calls the passed fdFunc for each file descriptor that is open in +// the current process. +func fdRangeFrom(minFd int, fn fdFunc) error { + procSelfFd, closer := ProcThreadSelf("fd") + defer closer() - fdDir, err := os.Open("/proc/self/fd") + fdDir, err := os.Open(procSelfFd) if err != nil { return err } @@ -81,16 +86,68 @@ func CloseExecFrom(minFd int) error { if fd < minFd { continue } - // Intentionally ignore errors from unix.CloseOnExec -- the cases where - // this might fail are basically file descriptors that have already - // been closed (including and especially the one that was created when - // os.ReadDir did the "opendir" syscall). - unix.CloseOnExec(fd) + // Ignore the file descriptor we used for readdir, as it will be closed + // when we return. + if uintptr(fd) == fdDir.Fd() { + continue + } + // Run the closure. + fn(fd) } return nil } -// NewSockPair returns a new unix socket pair +// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or +// equal to minFd in the current process. +func CloseExecFrom(minFd int) error { + // Use close_range(CLOSE_RANGE_CLOEXEC) if possible. + if haveCloseRangeCloexec() { + err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC) + return os.NewSyscallError("close_range", err) + } + // Otherwise, fall back to the standard loop. + return fdRangeFrom(minFd, unix.CloseOnExec) +} + +//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor + +// In order to make sure we do not close the internal epoll descriptors the Go +// runtime uses, we need to ensure that we skip descriptors that match +// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing, +// unfortunately there's no other way to be sure we're only keeping the file +// descriptors the Go runtime needs. Hopefully nothing blows up doing this... +func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive + +// UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the +// current process, except for those critical to Go's runtime (such as the +// netpoll management descriptors). +// +// NOTE: That this function is incredibly dangerous to use in most Go code, as +// closing file descriptors from underneath *os.File handles can lead to very +// bad behaviour (the closed file descriptor can be re-used and then any +// *os.File operations would apply to the wrong file). This function is only +// intended to be called from the last stage of runc init. +func UnsafeCloseFrom(minFd int) error { + // We cannot use close_range(2) even if it is available, because we must + // not close some file descriptors. + return fdRangeFrom(minFd, func(fd int) { + if runtime_IsPollDescriptor(uintptr(fd)) { + // These are the Go runtimes internal netpoll file descriptors. + // These file descriptors are operated on deep in the Go scheduler, + // and closing those files from underneath Go can result in panics. + // There is no issue with keeping them because they are not + // executable and are not useful to an attacker anyway. Also we + // don't have any choice. + return + } + // There's nothing we can do about errors from close(2), and the + // only likely error to be seen is EBADF which indicates the fd was + // already closed (in which case, we got what we wanted). + _ = unix.Close(fd) + }) +} + +// NewSockPair returns a new SOCK_STREAM unix socket pair. func NewSockPair(name string) (parent, child *os.File, err error) { fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) if err != nil { @@ -98,3 +155,109 @@ func NewSockPair(name string) (parent, child *os.File, err error) { } return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil } + +// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...) +// corresponding to the unsafePath resolved within the root. Before passing the +// fd, this path is verified to have been inside the root -- so operating on it +// through the passed fdpath should be safe. Do not access this path through +// the original path strings, and do not attempt to use the pathname outside of +// the passed closure (the file handle will be freed once the closure returns). +func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { + // Remove the root then forcefully resolve inside the root. + unsafePath = stripRoot(root, unsafePath) + path, err := securejoin.SecureJoin(root, unsafePath) + if err != nil { + return fmt.Errorf("resolving path inside rootfs failed: %w", err) + } + + procSelfFd, closer := ProcThreadSelf("fd/") + defer closer() + + // Open the target path. + fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0) + if err != nil { + return fmt.Errorf("open o_path procfd: %w", err) + } + defer fh.Close() + + procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd()))) + // Double-check the path is the one we expected. + if realpath, err := os.Readlink(procfd); err != nil { + return fmt.Errorf("procfd verification failed: %w", err) + } else if realpath != path { + return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath) + } + + return fn(procfd) +} + +type ProcThreadSelfCloser func() + +var ( + haveProcThreadSelf bool + haveProcThreadSelfOnce sync.Once +) + +// ProcThreadSelf returns a string that is equivalent to +// /proc/thread-self/, with a graceful fallback on older kernels where +// /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin, +// meaning that the passed string needs to be trusted. The caller _must_ call +// the returned procThreadSelfCloser function (which is runtime.UnlockOSThread) +// *only once* after it has finished using the returned path string. +func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) { + haveProcThreadSelfOnce.Do(func() { + if _, err := os.Stat("/proc/thread-self/"); err == nil { + haveProcThreadSelf = true + } else { + logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/", err) + } + }) + + // We need to lock our thread until the caller is done with the path string + // because any non-atomic operation on the path (such as opening a file, + // then reading it) could be interrupted by the Go runtime where the + // underlying thread is swapped out and the original thread is killed, + // resulting in pull-your-hair-out-hard-to-debug issues in the caller. In + // addition, the pre-3.17 fallback makes everything non-atomic because the + // same thing could happen between unix.Gettid() and the path operations. + // + // In theory, we don't need to lock in the atomic user case when using + // /proc/thread-self/, but it's better to be safe than sorry (and there are + // only one or two truly atomic users of /proc/thread-self/). + runtime.LockOSThread() + + threadSelf := "/proc/thread-self/" + if !haveProcThreadSelf { + // Pre-3.17 kernels did not have /proc/thread-self, so do it manually. + threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/" + if _, err := os.Stat(threadSelf); err != nil { + // Unfortunately, this code is called from rootfs_linux.go where we + // are running inside the pid namespace of the container but /proc + // is the host's procfs. Unfortunately there is no real way to get + // the correct tid to use here (the kernel age means we cannot do + // things like set up a private fsopen("proc") -- even scanning + // NSpid in all of the tasks in /proc/self/task/*/status requires + // Linux 4.1). + // + // So, we just have to assume that /proc/self is acceptable in this + // one specific case. + if os.Getpid() == 1 { + logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err) + } else { + // This should never happen, but the fallback should work in most cases... + logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err) + } + threadSelf = "/proc/self/" + } + } + return threadSelf + subpath, runtime.UnlockOSThread +} + +// ProcThreadSelfFd is small wrapper around ProcThreadSelf to make it easier to +// create a /proc/thread-self handle for given file descriptor. +// +// It is basically equivalent to ProcThreadSelf(fmt.Sprintf("fd/%d", fd)), but +// without using fmt.Sprintf to avoid unneeded overhead. +func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) { + return ProcThreadSelf("fd/" + strconv.FormatUint(uint64(fd), 10)) +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 3650e9f98..891fd0933 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -806,7 +806,7 @@ github.com/opencontainers/go-digest ## explicit; go 1.18 github.com/opencontainers/image-spec/specs-go github.com/opencontainers/image-spec/specs-go/v1 -# github.com/opencontainers/runc v1.1.12 => github.com/opencontainers/runc v1.1.1-0.20230904132852-a0466dd76f23 +# github.com/opencontainers/runc v1.1.12 => github.com/opencontainers/runc v1.1.1-0.20240131200429-02120488a4c0 ## explicit; go 1.20 github.com/opencontainers/runc/libcontainer/apparmor github.com/opencontainers/runc/libcontainer/cgroups @@ -1250,4 +1250,4 @@ tags.cncf.io/container-device-interface/pkg/parser # tags.cncf.io/container-device-interface/specs-go v0.6.0 ## explicit; go 1.19 tags.cncf.io/container-device-interface/specs-go -# github.com/opencontainers/runc => github.com/opencontainers/runc v1.1.1-0.20230904132852-a0466dd76f23 +# github.com/opencontainers/runc => github.com/opencontainers/runc v1.1.1-0.20240131200429-02120488a4c0