Skip to content

Commit

Permalink
limit allocated stack in clone
Browse files Browse the repository at this point in the history
Signed-off-by: Jorge Prendes <[email protected]>
  • Loading branch information
jprendes committed Oct 9, 2023
1 parent 0e07a1e commit ca431fd
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 26 deletions.
2 changes: 1 addition & 1 deletion crates/libcontainer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ systemd = ["libcgroups/systemd", "v2"]
v2 = ["libcgroups/v2"]
v1 = ["libcgroups/v1"]
cgroupsv2_devices = ["libcgroups/cgroupsv2_devices"]
clone3 = []
no-clone3 = []

[dependencies]
bitflags = "2.4.0"
Expand Down
43 changes: 18 additions & 25 deletions crates/libcontainer/src/process/fork.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,13 @@ pub enum CloneError {
}

/// The callback function used in clone system call. The return value is i32
/// which is consistent with C functions return code. The trait has to be
/// `FnMut` because we need to be able to call the closure multiple times, once
/// in clone3 and once in clone if fallback is required. The closure is boxed
/// which is consistent with C functions return code. The closure is boxed
/// because we need to store the closure on heap, not stack in the case of
/// `clone`. Unlike `fork` or `clone3`, the `clone` glibc wrapper requires us to
/// pass in a child stack, which is empty. By storing the closure in heap, we
/// can then in the new process to re-box the heap memory back to a closure
/// correctly.
pub type CloneCb = Box<dyn FnMut() -> i32>;
pub type CloneCb = Box<dyn FnOnce() -> i32>;

// Clone a sibling process that shares the same parent as the calling
// process. This is used to launch the container init process so the parent
Expand Down Expand Up @@ -58,10 +56,12 @@ fn clone_internal(
flags: u64,
exit_signal: Option<u64>,
) -> Result<Pid, CloneError> {
#[cfg(feature = "clone3")]
let res = clone3(&mut cb, flags, exit_signal);
let exit_signal = exit_signal.unwrap_or(0);

#[cfg(not(feature = "clone3"))]
#[cfg(not(feature = "no-clone3"))]
let res = clone3(cb, flags, exit_signal);

#[cfg(feature = "no-clone3")]
let res = clone(cb, flags, exit_signal);

res
Expand All @@ -70,10 +70,10 @@ fn clone_internal(
// Unlike the clone call, clone3 is currently using the kernel syscall, mimicking
// the interface of fork. There is not need to explicitly manage the memory, so
// we can safely passing the callback closure as reference.
#[cfg(feature = "clone3")]
fn clone3(cb: &mut CloneCb, flags: u64, exit_signal: Option<u64>) -> Result<Pid, CloneError> {
#[cfg(not(feature = "no-clone3"))]
fn clone3(cb: CloneCb, flags: u64, exit_signal: u64) -> Result<Pid, CloneError> {
#[repr(C)]
struct clone3_args {
struct Clone3Args {
flags: u64,
pidfd: u64,
child_tid: u64,
Expand All @@ -86,28 +86,29 @@ fn clone3(cb: &mut CloneCb, flags: u64, exit_signal: Option<u64>) -> Result<Pid,
set_tid_size: u64,
cgroup: u64,
}
let mut args = clone3_args {
let mut args = Clone3Args {
flags,
pidfd: 0,
child_tid: 0,
parent_tid: 0,
exit_signal: exit_signal.unwrap_or(0),
exit_signal,
stack: 0,
stack_size: 0,
tls: 0,
set_tid: 0,
set_tid_size: 0,
cgroup: 0,
};
let args_ptr = &mut args as *mut clone3_args;
let args_size = std::mem::size_of::<clone3_args>();
let args_ptr = &mut args as *mut Clone3Args;
let args_size = std::mem::size_of::<Clone3Args>();
// For now, we can only use clone3 as a kernel syscall. Libc wrapper is not
// available yet. This can have undefined behavior because libc authors do
// not like people calling kernel syscall to directly create processes. Libc
// does perform additional bookkeeping when calling clone or fork. So far,
// we have not observed any issues with calling clone3 directly, but we
// should keep an eye on it.
match unsafe { libc::syscall(libc::SYS_clone3, args_ptr, args_size) } {
-1 if nix::Error::last() == nix::Error::ENOSYS => clone(cb, flags, exit_signal),
-1 => Err(CloneError::Clone(nix::Error::last())),
0 => {
// Inside the cloned process, we execute the callback and exit with
Expand All @@ -119,8 +120,7 @@ fn clone3(cb: &mut CloneCb, flags: u64, exit_signal: Option<u64>) -> Result<Pid,
}
}

#[cfg(not(feature = "clone3"))]
fn clone(cb: CloneCb, flags: u64, exit_signal: Option<u64>) -> Result<Pid, CloneError> {
fn clone(cb: CloneCb, flags: u64, exit_signal: u64) -> Result<Pid, CloneError> {
use nix::sys::{mman, resource};
use std::{ffi::c_int, num::NonZeroUsize};

Expand All @@ -139,14 +139,7 @@ fn clone(cb: CloneCb, flags: u64, exit_signal: Option<u64>) -> Result<Pid, Clone
resource::getrlimit(resource::Resource::RLIMIT_STACK).map_err(CloneError::ResourceLimit)?;
// mmap will return ENOMEM if stack size is unlimited when we create the
// child stack, so we need to set a reasonable default stack size.
let default_stack_size = if rlim_cur != u64::MAX {
rlim_cur as usize
} else {
tracing::debug!(
"stack size returned by getrlimit() is unlimited, use DEFAULT_STACK_SIZE(8MB)"
);
DEFAULT_STACK_SIZE
};
let default_stack_size = rlim_cur.min(DEFAULT_STACK_SIZE as u64) as usize;

// Using the clone syscall requires us to create the stack space for the
// child process instead of taken cared for us like fork call. We use mmap
Expand Down Expand Up @@ -185,7 +178,7 @@ fn clone(cb: CloneCb, flags: u64, exit_signal: Option<u64>) -> Result<Pid, Clone
let child_stack_top = unsafe { child_stack.add(default_stack_size) };

// Combine the clone flags with exit signals.
let combined_flags = (flags | exit_signal.unwrap_or(0)) as c_int;
let combined_flags = (flags | exit_signal) as c_int;

// We are passing the boxed closure "cb" into the clone function as the a
// function pointer in C. The box closure in Rust is both a function pointer
Expand Down

0 comments on commit ca431fd

Please sign in to comment.