Skip to content

Commit

Permalink
refactor: move some c code to go
Browse files Browse the repository at this point in the history
Move all of the stage-1 C code and some of the stage-2
C code to Go. This code is not related to namespaces,
so it should be implemented in Go.

Signed-off-by: lifubang <[email protected]>
  • Loading branch information
lifubang committed Sep 30, 2024
1 parent c2486cc commit b70c4af
Show file tree
Hide file tree
Showing 10 changed files with 356 additions and 800 deletions.
5 changes: 5 additions & 0 deletions libcontainer/configs/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ type IDMap struct {
Size int64 `json:"size"`
}

// ToString renders the mapping as three space-separated decimal numbers:
// the container ID, the host ID and the size, in that order.
func (i IDMap) ToString() string {
	const layout = "%d %d %d"
	return fmt.Sprintf(layout, i.ContainerID, i.HostID, i.Size)
}

// Seccomp represents syscall restrictions
// By default, only the native architecture of the kernel is allowed to be used
// for syscalls. Additional architectures can be added by specifying them in
Expand Down
79 changes: 6 additions & 73 deletions libcontainer/container_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ import (
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"github.com/vishvananda/netlink/nl"
"golang.org/x/sys/execabs"
"golang.org/x/sys/unix"

"github.com/opencontainers/runc/libcontainer/cgroups"
Expand Down Expand Up @@ -593,6 +592,10 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
)
cmd.ExtraFiles = append(cmd.ExtraFiles, comm.stage1SockChild)
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_STAGE1PIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
)
cmd.ExtraFiles = append(cmd.ExtraFiles, comm.syncSockChild.File())
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_SYNCPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
Expand Down Expand Up @@ -1052,17 +1055,6 @@ func (c *Container) orderNamespacePaths(namespaces map[configs.NamespaceType]str
return paths, nil
}

// encodeIDMapping serializes the given ID mappings as text, one mapping per
// line in the form "<container-id> <host-id> <size>\n". An empty input yields
// an empty result and a nil error.
func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
	var out bytes.Buffer
	for idx := range idMap {
		entry := idMap[idx]
		if _, err := fmt.Fprintf(&out, "%d %d %d\n", entry.ContainerID, entry.HostID, entry.Size); err != nil {
			return nil, err
		}
	}
	return out.Bytes(), nil
}

// netlinkError is an error wrapper type for use by custom netlink message
// types. Panics with errors are wrapped in netlinkError so that the recover
// in bootstrapData can distinguish intentional panics.
Expand Down Expand Up @@ -1109,59 +1101,6 @@ func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Namespa
})
}

// write namespace paths only when we are not joining an existing user ns
_, joinExistingUser := nsMaps[configs.NEWUSER]
if !joinExistingUser {
// write uid mappings
if len(c.config.UIDMappings) > 0 {
if c.config.RootlessEUID {
// We resolve the paths for new{u,g}idmap from
// the context of runc to avoid doing a path
// lookup in the nsexec context.
if path, err := execabs.LookPath("newuidmap"); err == nil {
r.AddData(&Bytemsg{
Type: UidmapPathAttr,
Value: []byte(path),
})
}
}
b, err := encodeIDMapping(c.config.UIDMappings)
if err != nil {
return nil, err
}
r.AddData(&Bytemsg{
Type: UidmapAttr,
Value: b,
})
}

// write gid mappings
if len(c.config.GIDMappings) > 0 {
b, err := encodeIDMapping(c.config.GIDMappings)
if err != nil {
return nil, err
}
r.AddData(&Bytemsg{
Type: GidmapAttr,
Value: b,
})
if c.config.RootlessEUID {
if path, err := execabs.LookPath("newgidmap"); err == nil {
r.AddData(&Bytemsg{
Type: GidmapPathAttr,
Value: []byte(path),
})
}
}
if requiresRootOrMappingTool(c.config) {
r.AddData(&Boolmsg{
Type: SetgroupAttr,
Value: true,
})
}
}
}

if c.config.OomScoreAdj != nil {
// write oom_score_adj
r.AddData(&Bytemsg{
Expand All @@ -1170,12 +1109,6 @@ func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Namespa
})
}

// write rootless
r.AddData(&Boolmsg{
Type: RootlessEUIDAttr,
Value: c.config.RootlessEUID,
})

// write boottime and monotonic time ns offsets.
if c.config.TimeOffsets != nil {
var offsetSpec bytes.Buffer
Expand Down Expand Up @@ -1216,9 +1149,9 @@ func ignoreTerminateErrors(err error) error {
return err
}

func requiresRootOrMappingTool(c *configs.Config) bool {
func requiresRootOrMappingTool(gidMappings []configs.IDMap) bool {
gidMap := []configs.IDMap{
{ContainerID: 0, HostID: int64(os.Getegid()), Size: 1},
}
return !reflect.DeepEqual(c.GIDMappings, gidMap)
return !reflect.DeepEqual(gidMappings, gidMap)
}
150 changes: 150 additions & 0 deletions libcontainer/container_setup.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
package libcontainer

import (
"encoding/binary"
"fmt"
"io"
"os"

"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/sirupsen/logrus"
"github.com/vishvananda/netlink/nl"
"golang.org/x/sys/execabs"
"golang.org/x/sys/unix"
)

// NsExecSyncMsg is used for communication between the parent and child during
// container setup. Each message is a single uint32 value.
type NsExecSyncMsg uint32

// Sync message values. They start at 0x40 and increase by one in declaration
// order, so every "Pls" request is immediately followed by its "Ack" reply.
const (
	// syncUsermapPls (0x40) requests that the user namespace mappings be set up.
	syncUsermapPls NsExecSyncMsg = iota + 0x40
	// syncUsermapAck (0x41) acknowledges syncUsermapPls.
	syncUsermapAck
	// syncRecvPidPls (0x42) announces that a pid follows on the same stream.
	syncRecvPidPls
	// syncRecvPidAck (0x43) acknowledges syncRecvPidPls.
	syncRecvPidAck
	// syncTimeOffsetsPls (0x44) requests that the time namespace offsets be configured.
	syncTimeOffsetsPls
	// syncTimeOffsetsAck (0x45) acknowledges syncTimeOffsetsPls.
	syncTimeOffsetsAck
)

// NsExecSetup carries the containerProcess whose nsexec handshake is being
// serviced from the parent side.
type NsExecSetup struct {
	process *containerProcess
}

// bufSize is the number of bytes used to encode one NsExecSyncMsg (a uint32).
const bufSize int = 4

// parseNsExecSync reads fixed-size (bufSize bytes) messages from r, decodes
// each one as a native-endian uint32 and hands it to fn. The loop ends after
// fn has successfully handled syncRecvPidPls. The first error returned by the
// read or by fn aborts the loop and is returned unchanged.
func parseNsExecSync(r io.Reader, fn func(NsExecSyncMsg) error) error {
	logrus.Debugf("start to communicate with the nsexec\n")

	order := nl.NativeEndian()
	raw := make([]byte, bufSize)
	for done := false; !done; {
		// A short read is an error: every message is exactly bufSize bytes.
		if _, err := io.ReadFull(r, raw); err != nil {
			return err
		}
		received := NsExecSyncMsg(order.Uint32(raw))
		if err := fn(received); err != nil {
			return err
		}
		// The pid report is the last message of the exchange.
		done = received == syncRecvPidPls
	}

	logrus.Debugf("finished communicating with the nsexec\n")
	return nil
}

// ackSyncMsg writes msg to f as a bufSize-byte native-endian uint32. A write
// failure is logged at debug level and returned to the caller.
//
// NOTE(review): the write goes through the raw descriptor obtained from
// f.Fd() rather than f.Write, and the number of bytes written is not checked
// — confirm both are intended.
func ackSyncMsg(f *os.File, msg NsExecSyncMsg) error {
	var wire [bufSize]byte
	nl.NativeEndian().PutUint32(wire[:], uint32(msg))

	fd := int(f.Fd())
	_, err := unix.Write(fd, wire[:])
	if err != nil {
		logrus.Debugf("failed to write message to nsexec: %v", err)
	}
	return err
}

// helpDoingNsExec services the requests arriving on the stage-1 socket until
// the pid has been reported (or an error occurs), replying to each handled
// request with the matching ack. The socket is closed before returning,
// whatever the outcome.
func (s *NsExecSetup) helpDoingNsExec() error {
	sock := s.process.comm.stage1SockParent

	// handle processes one request and sends the corresponding ack.
	handle := func(msg NsExecSyncMsg) error {
		switch msg {
		case syncUsermapPls:
			logrus.Debugf("stage-1 requested userns mappings")
			if err := s.setupUsermap(); err != nil {
				return err
			}
			return ackSyncMsg(sock, syncUsermapAck)

		case syncRecvPidPls:
			logrus.Debugf("stage-1 reports pid")
			// The pid follows the request on the same stream as a
			// native-endian uint32.
			var reported uint32
			if err := binary.Read(sock, nl.NativeEndian(), &reported); err != nil {
				return err
			}
			s.process.childPid = int(reported)
			return ackSyncMsg(sock, syncRecvPidAck)

		case syncTimeOffsetsPls:
			logrus.Debugf("stage-1 request to configure timens offsets")
			pid := s.process.cmd.Process.Pid
			offsets := s.process.container.config.TimeOffsets
			if err := system.UpdateTimeNsOffsets(pid, offsets); err != nil {
				return err
			}
			return ackSyncMsg(sock, syncTimeOffsetsAck)

		default:
			return fmt.Errorf("unexpected message %d", msg)
		}
	}

	result := parseNsExecSync(sock, handle)
	// The close error is deliberately dropped; only the handshake result is reported.
	_ = sock.Close()
	return result
}

// setupUsermap writes the uid and gid mappings of the started command's
// process. The uid mappings are written first; the gid mappings are only
// attempted if that succeeds.
func (s *NsExecSetup) setupUsermap() error {
	proc := s.process

	// For a rootless container whose gid mapping does not need root or a
	// mapping tool, setgroups(2) is set to "deny" before the maps are written.
	// NOTE(review): the error from UpdateSetgroups is discarded — confirm this
	// is intentional.
	// NOTE(review): this check reads proc.config while everything below reads
	// proc.container.config — confirm both carry the same values.
	if proc.config.RootlessEUID && !requiresRootOrMappingTool(proc.config.Config.GIDMappings) {
		_ = system.UpdateSetgroups(proc.cmd.Process.Pid, system.SetgroupsDeny)
	}

	cfg := proc.container.config

	// A user namespace entry that carries a path means an existing namespace
	// is being joined rather than a new one created.
	joinExistingUser := false
	for _, ns := range cfg.Namespaces {
		if ns.Type == configs.NEWUSER && ns.Path != "" {
			joinExistingUser = true
		}
	}

	// The helper binaries are looked up only for rootless containers that
	// create their own user namespace and actually have mappings of that kind.
	// A failed lookup is not an error: the path simply stays empty.
	var uidMapPath, gidMapPath string
	if !joinExistingUser && cfg.RootlessEUID {
		if len(cfg.UIDMappings) > 0 {
			if found, err := execabs.LookPath("newuidmap"); err == nil {
				uidMapPath = found
			}
		}
		if len(cfg.GIDMappings) > 0 {
			if found, err := execabs.LookPath("newgidmap"); err == nil {
				gidMapPath = found
			}
		}
	}

	pid := proc.cmd.Process.Pid
	if err := system.UpdateUidmap(uidMapPath, pid, cfg.UIDMappings); err != nil {
		return err
	}
	return system.UpdateGidmap(gidMapPath, pid, cfg.GIDMappings)
}
29 changes: 24 additions & 5 deletions libcontainer/init_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"strconv"
"strings"
"syscall"
"unsafe"

"github.com/containerd/console"
"github.com/moby/sys/user"
Expand All @@ -35,11 +36,6 @@ const (
initStandard initType = "standard"
)

// pid pairs two process IDs. The JSON field names label Pid as the stage-2
// pid and PidFirstChild as the stage-1 pid.
type pid struct {
	Pid           int `json:"stage2_pid"`
	PidFirstChild int `json:"stage1_pid"`
}

// network is an internal struct used to setup container networks.
type network struct {
configs.Network
Expand Down Expand Up @@ -151,6 +147,11 @@ func startInitialization() (retErr error) {

logrus.SetOutput(logPipe)
logrus.SetFormatter(new(logrus.JSONFormatter))

/* For debugging. */
procName := append([]byte("runc:[2:INIT]"), 0)
_ = unix.Prctl(unix.PR_SET_NAME, uintptr(unsafe.Pointer(&procName[0])), 0, 0, 0)

logrus.Debug("child process in init()")

// Only init processes have FIFOFD.
Expand Down Expand Up @@ -215,6 +216,24 @@ func startInitialization() (retErr error) {
return err
}

if _, err := unix.Setsid(); err != nil {
return fmt.Errorf("setsid failed: %w", err)
}

if err := unix.Setuid(0); err != nil {
return fmt.Errorf("setuid failed %w", err)
}

if err := unix.Setgid(0); err != nil {
return fmt.Errorf("setgid failed %w", err)
}

if !config.RootlessEUID && requiresRootOrMappingTool(config.Config.GIDMappings) {
if err := unix.Setgroups([]int{0}); err != nil {
return fmt.Errorf("setgroups failed %w", err)
}
}

// If init succeeds, it will not return, hence none of the defers will be called.
return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifoFile, logPipe, dmzExe)
}
Expand Down
16 changes: 5 additions & 11 deletions libcontainer/message_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,11 @@ import (
// list of known message types we want to send to bootstrap program
// The number is randomly chosen to not conflict with known netlink types
const (
InitMsg uint16 = 62000
CloneFlagsAttr uint16 = 27281
NsPathsAttr uint16 = 27282
UidmapAttr uint16 = 27283
GidmapAttr uint16 = 27284
SetgroupAttr uint16 = 27285
OomScoreAdjAttr uint16 = 27286
RootlessEUIDAttr uint16 = 27287
UidmapPathAttr uint16 = 27288
GidmapPathAttr uint16 = 27289
TimeOffsetsAttr uint16 = 27290
InitMsg uint16 = 62000
CloneFlagsAttr uint16 = 27281
NsPathsAttr uint16 = 27282
OomScoreAdjAttr uint16 = 27286
TimeOffsetsAttr uint16 = 27290
)

type Int32msg struct {
Expand Down
2 changes: 1 addition & 1 deletion libcontainer/nsenter/log.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ void write_log(int level, const char *format, ...)
if (stage == NULL)
goto out;
} else {
ret = asprintf(&stage, "nsexec-%d", current_stage);
ret = asprintf(&stage, "nsexec-%d", current_stage + 1);
if (ret < 0) {
stage = NULL;
goto out;
Expand Down
Loading

0 comments on commit b70c4af

Please sign in to comment.