From 56202d888645aa38a9632d57e5bbd5d67e8458dd Mon Sep 17 00:00:00 2001 From: Yashodhan Joshi Date: Mon, 9 Oct 2023 11:58:28 +0530 Subject: [PATCH] Put clone3 behind a feature Signed-off-by: Jorge Prendes --- crates/libcontainer/Cargo.toml | 1 + crates/libcontainer/src/process/fork.rs | 106 ++++++++++-------------- 2 files changed, 44 insertions(+), 63 deletions(-) diff --git a/crates/libcontainer/Cargo.toml b/crates/libcontainer/Cargo.toml index e4b4628405..e536a69aa9 100644 --- a/crates/libcontainer/Cargo.toml +++ b/crates/libcontainer/Cargo.toml @@ -18,6 +18,7 @@ systemd = ["libcgroups/systemd", "v2"] v2 = ["libcgroups/v2"] v1 = ["libcgroups/v1"] cgroupsv2_devices = ["libcgroups/cgroupsv2_devices"] +no-clone3 = [] [dependencies] bitflags = "2.4.0" diff --git a/crates/libcontainer/src/process/fork.rs b/crates/libcontainer/src/process/fork.rs index 114a22440c..704a18a6f8 100644 --- a/crates/libcontainer/src/process/fork.rs +++ b/crates/libcontainer/src/process/fork.rs @@ -1,10 +1,5 @@ -use std::{ffi::c_int, num::NonZeroUsize}; - use libc::SIGCHLD; -use nix::{ - sys::{mman, resource}, - unistd::Pid, -}; +use nix::unistd::Pid; #[derive(Debug, thiserror::Error)] pub enum CloneError { @@ -25,15 +20,13 @@ pub enum CloneError { } /// The callback function used in clone system call. The return value is i32 -/// which is consistent with C functions return code. The trait has to be -/// `FnMut` because we need to be able to call the closure multiple times, once -/// in clone3 and once in clone if fallback is required. The closure is boxed +/// which is consistent with C functions return code. The closure is boxed /// because we need to store the closure on heap, not stack in the case of /// `clone`. Unlike `fork` or `clone3`, the `clone` glibc wrapper requires us to /// pass in a child stack, which is empty. By storing the closure in heap, we /// can then in the new process to re-box the heap memory back to a closure /// correctly. -pub type CloneCb = Box i32>; +pub type CloneCb = Box i32>; // Clone a sibling process that shares the same parent as the calling // process. This is used to launch the container init process so the parent @@ -48,39 +41,26 @@ pub fn container_clone_sibling(cb: CloneCb) -> Result { // The older `clone` will not return EINVAL in this case. Instead it ignores // the exit signal bits in the glibc wrapper. Therefore, we explicitly set // the exit_signal to None here, so this works for both version of clone. - clone_internal(cb, libc::CLONE_PARENT as u64, None) + clone_internal(cb, libc::CLONE_PARENT as u64, 0) } // Clone a child process and execute the callback. pub fn container_clone(cb: CloneCb) -> Result { - clone_internal(cb, 0, Some(SIGCHLD as u64)) + clone_internal(cb, 0, SIGCHLD as u64) } -// An internal wrapper to manage the clone3 vs clone fallback logic. -fn clone_internal( - mut cb: CloneCb, - flags: u64, - exit_signal: Option, -) -> Result { - match clone3(&mut cb, flags, exit_signal) { - Ok(pid) => Ok(pid), - // For now, we decide to only fallback on ENOSYS - Err(CloneError::Clone(nix::Error::ENOSYS)) => { - tracing::debug!("clone3 is not supported, fallback to clone"); - let pid = clone(cb, flags, exit_signal)?; - - Ok(pid) - } - Err(err) => Err(err), - } +#[cfg(feature = "no-clone3")] +// We should not use clone3 as required by the no-clone3 feature. Call clone directly. +fn clone_internal(cb: CloneCb, flags: u64, exit_signal: u64) -> Result { + clone(cb, flags, exit_signal) } -// Unlike the clone call, clone3 is currently using the kernel syscall, mimicking -// the interface of fork. There is not need to explicitly manage the memory, so -// we can safely passing the callback closure as reference. -fn clone3(cb: &mut CloneCb, flags: u64, exit_signal: Option) -> Result { +#[cfg(not(feature = "no-clone3"))] +// Try to use clone3, and fallback to clone in case of ENOSYS. +// Unlike the clone call, clone3 is currently using the kernel syscall. +fn clone_internal(cb: CloneCb, flags: u64, exit_signal: u64) -> Result { #[repr(C)] - struct clone3_args { + struct Clone3Args { flags: u64, pidfd: u64, child_tid: u64, @@ -93,12 +73,12 @@ fn clone3(cb: &mut CloneCb, flags: u64, exit_signal: Option) -> Result) -> Result(); + let args_ptr = &mut args as *mut Clone3Args; + let args_size = std::mem::size_of::(); // For now, we can only use clone3 as a kernel syscall. Libc wrapper is not // available yet. This can have undefined behavior because libc authors do // not like people calling kernel syscall to directly create processes. Libc - // does perform additional bookkeeping when calling clone or fork. So far, - // we have not observed any issues with calling clone3 directly, but we - // should keep an eye on it. + // does perform additional bookkeeping when calling clone or fork. + // So far we've observed issues when the main process uses threads. In such + // cases enable the `no-clone3` feature. + // See https://github.com/containerd/runwasi/issues/347 match unsafe { libc::syscall(libc::SYS_clone3, args_ptr, args_size) } { + -1 if nix::Error::last() == nix::Error::ENOSYS => clone(cb, flags, exit_signal), -1 => Err(CloneError::Clone(nix::Error::last())), 0 => { // Inside the cloned process, we execute the callback and exit with @@ -126,7 +108,10 @@ fn clone3(cb: &mut CloneCb, flags: u64, exit_signal: Option) -> Result) -> Result { +fn clone(cb: CloneCb, flags: u64, exit_signal: u64) -> Result { + use nix::sys::{mman, resource}; + use std::{ffi::c_int, num::NonZeroUsize}; + const DEFAULT_STACK_SIZE: usize = 8 * 1024 * 1024; // 8M const DEFAULT_PAGE_SIZE: usize = 4 * 1024; // 4K @@ -140,33 +125,28 @@ fn clone(cb: CloneCb, flags: u64, exit_signal: Option) -> Result) -> Result