From dd324aad42f2285553e2a356b599e6e40796c4b9 Mon Sep 17 00:00:00 2001 From: Yashodhan Joshi Date: Mon, 9 Oct 2023 11:58:28 +0530 Subject: [PATCH] Put clone3 behind a feature Signed-off-by: Jorge Prendes --- crates/libcontainer/Cargo.toml | 1 + crates/libcontainer/src/process/fork.rs | 88 ++++++++++--------------- 2 files changed, 36 insertions(+), 53 deletions(-) diff --git a/crates/libcontainer/Cargo.toml b/crates/libcontainer/Cargo.toml index 3a1005445..ab301e486 100644 --- a/crates/libcontainer/Cargo.toml +++ b/crates/libcontainer/Cargo.toml @@ -18,6 +18,7 @@ systemd = ["libcgroups/systemd", "v2"] v2 = ["libcgroups/v2"] v1 = ["libcgroups/v1"] cgroupsv2_devices = ["libcgroups/cgroupsv2_devices"] +no-clone3 = [] [dependencies] bitflags = "2.4.0" diff --git a/crates/libcontainer/src/process/fork.rs b/crates/libcontainer/src/process/fork.rs index c0e055396..7eae03301 100644 --- a/crates/libcontainer/src/process/fork.rs +++ b/crates/libcontainer/src/process/fork.rs @@ -25,15 +25,13 @@ pub enum CloneError { } /// The callback function used in clone system call. The return value is i32 -/// which is consistent with C functions return code. The trait has to be -/// `FnMut` because we need to be able to call the closure multiple times, once -/// in clone3 and once in clone if fallback is required. The closure is boxed +/// which is consistent with C functions return code. The closure is boxed /// because we need to store the closure on heap, not stack in the case of /// `clone`. Unlike `fork` or `clone3`, the `clone` glibc wrapper requires us to /// pass in a child stack, which is empty. By storing the closure in heap, we /// can then in the new process to re-box the heap memory back to a closure /// correctly. -pub type CloneCb = Box i32>; +pub type CloneCb = Box i32>; // Clone a sibling process that shares the same parent as the calling // process. This is used to launch the container init process so the parent @@ -56,31 +54,18 @@ pub fn container_clone(cb: CloneCb) -> Result { clone_internal(cb, 0, Some(SIGCHLD as u64)) } -// An internal wrapper to manage the clone3 vs clone fallback logic. -fn clone_internal( - mut cb: CloneCb, - flags: u64, - exit_signal: Option, -) -> Result { - match clone3(&mut cb, flags, exit_signal) { - Ok(pid) => Ok(pid), - // For now, we decide to only fallback on ENOSYS - Err(CloneError::Clone(nix::Error::ENOSYS)) => { - tracing::debug!("clone3 is not supported, fallback to clone"); - let pid = clone(cb, flags, exit_signal)?; - - Ok(pid) - } - Err(err) => Err(err), - } +#[cfg(feature = "no-clone3")] +// We should not use clone3 as required by the no-clone3 feature. Call clone directly. +fn clone_internal(cb: CloneCb, flags: u64, exit_signal: Option) -> Result { + clone(cb, flags, exit_signal) } -// Unlike the clone call, clone3 is currently using the kernel syscall, mimicking -// the interface of fork. There is not need to explicitly manage the memory, so -// we can safely passing the callback closure as reference. -fn clone3(cb: &mut CloneCb, flags: u64, exit_signal: Option) -> Result { +#[cfg(not(feature = "no-clone3"))] +// Try to use clone3, and fallback to clone in case of ENOSYS. +// Unlike the clone call, clone3 is currently using the kernel syscall. +fn clone_internal(cb: CloneCb, flags: u64, exit_signal: Option) -> Result { #[repr(C)] - struct clone3_args { + struct Clone3Args { flags: u64, pidfd: u64, child_tid: u64, @@ -93,7 +78,7 @@ fn clone3(cb: &mut CloneCb, flags: u64, exit_signal: Option) -> Result) -> Result(); + let args_ptr = &mut args as *mut Clone3Args; + let args_size = std::mem::size_of::(); // For now, we can only use clone3 as a kernel syscall. Libc wrapper is not // available yet. This can have undefined behavior because libc authors do // not like people calling kernel syscall to directly create processes. Libc - // does perform additional bookkeeping when calling clone or fork. So far, - // we have not observed any issues with calling clone3 directly, but we - // should keep an eye on it. + // does perform additional bookkeeping when calling clone or fork. + // So far we've observed issues when the main process uses threads. In such + // cases enable the `no-clone3` feature. + // See https://github.com/containerd/runwasi/issues/347 match unsafe { libc::syscall(libc::SYS_clone3, args_ptr, args_size) } { + -1 if nix::Error::last() == nix::Error::ENOSYS => clone(cb, flags, exit_signal), -1 => Err(CloneError::Clone(nix::Error::last())), 0 => { // Inside the cloned process, we execute the callback and exit with @@ -140,35 +127,30 @@ fn clone(cb: CloneCb, flags: u64, exit_signal: Option) -> Result` doesn't have any meaning because we won't use it. mman::mmap::( None, - NonZeroUsize::new(default_stack_size).ok_or(CloneError::ZeroStackSize)?, + NonZeroUsize::new(stack_size).ok_or(CloneError::ZeroStackSize)?, mman::ProtFlags::PROT_READ | mman::ProtFlags::PROT_WRITE, mman::MapFlags::MAP_PRIVATE | mman::MapFlags::MAP_ANONYMOUS | mman::MapFlags::MAP_STACK, None, @@ -187,7 +169,7 @@ fn clone(cb: CloneCb, flags: u64, exit_signal: Option) -> Result) -> Result libc::c_int { + extern "C" fn main(data: *mut libc::c_void) -> c_int { unsafe { Box::from_raw(data as *mut CloneCb)() } }