materialized: attempt to produce nice backtraces on stack overflow

benesch · benesch · commit f11f74976f0c · 2021-02-01T22:47:06.000-05:00
Rust produces bad error messages on stack overflow, like "thread 'foo' has overflowed its stack" which provides very little insight into where the recursion that caused the stack to overflow occurred. See rust-lang/rust#51405 for details. This commit adds a SIGSEGV handler that attempts to print a backtrace, following the approach in the backtrace-on-stack-overflow crate. I copied the code from that crate into Materialize and tweaked it because it's a very small amount of code that we'll likely need to modify, and I wanted to improve its error handling. In my manual testing this produces a nice backtrace when Materialize overflows its stack.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/src/materialized/Cargo.toml b/src/materialized/Cargo.toml
@@ -48,8 +48,10 @@ include_dir = "0.6.0"
 itertools = "0.9.0"
 krb5-src = { version = "0.2.3", features = ["binaries"] }
 lazy_static = "1.4.0"
+libc = "0.2.84"
 log = "0.4.13"
 mz-process-collector = { path = "../mz-process-collector" }
+nix = "0.19.1"
 num_cpus = "1.0.0"
 openssl = { version = "0.10.32", features = ["vendored"] }
 openssl-sys = { version = "0.9.59", features = ["vendored"] }
diff --git a/src/materialized/src/bin/materialized/main.rs b/src/materialized/src/bin/materialized/main.rs
@@ -213,6 +213,7 @@ fn main() {
 
 fn run(args: Args) -> Result<(), anyhow::Error> {
     panic::set_hook(Box::new(handle_panic));
+    sys::enable_sigsegv_backtraces()?;
 
     if args.version > 0 {
         println!("materialized {}", materialized::BUILD_INFO.human_version());
diff --git a/src/materialized/src/bin/materialized/sys.rs b/src/materialized/src/bin/materialized/sys.rs
@@ -9,7 +9,13 @@
 
 //! System support functions.
 
+use std::alloc::{self, Layout};
+use std::ptr;
+
+use anyhow::{bail, Context};
 use log::{trace, warn};
+use nix::errno;
+use nix::sys::signal;
 
 #[cfg(not(any(target_os = "macos", target_os = "linux", target_os = "ios")))]
 pub fn adjust_rlimits() {
@@ -90,3 +96,79 @@ pub fn adjust_rlimits() {
         )
     }
 }
+
+/// Attempts to enable backtraces when SIGSEGV occurs.
+///
+/// In particular, this means producing backtraces on stack overflow, as
+/// stack overflow raises SIGSEGV. The approach here involves making system
+/// calls to handle SIGSEGV on an alternate signal stack, which seems to work
+/// well in practice but may technically be undefined behavior.
+///
+/// Rust may someday do this by default.
+/// Follow: https://github.com/rust-lang/rust/issues/51405.
+pub fn enable_sigsegv_backtraces() -> Result<(), anyhow::Error> {
+    // This code is derived from the code in the backtrace-on-stack-overflow
+    // crate, which is freely available under the terms of the Apache 2.0
+    // license. The modifications here provide better error messages if any of
+    // the various system calls fail.
+    //
+    // See: https://github.com/matklad/backtrace-on-stack-overflow
+
+    // NOTE(benesch): The stack size was chosen to match the default Rust thread
+    // stack size of 2MiB. Probably overkill, but we'd much rather have
+    // backtraces on stack overflow than squabble over a few megabytes. Using
+    // libc::SIGSTKSZ is tempting, but its default of 8KiB on my system makes me
+    // nervous. Rust code isn't used to running with a stack that small.
+    const STACK_SIZE: usize = 2 << 20;
+
+    // x86_64 and aarch64 require 16-byte alignment. Its hard to imagine other
+    // platforms that would have more stringent requirements.
+    const STACK_ALIGN: usize = 16;
+
+    // Allocate a stack.
+    let buf_layout =
+        Layout::from_size_align(STACK_SIZE, STACK_ALIGN).expect("layout known to be valid");
+    // SAFETY: layout has non-zero size and the uninitialized memory that is
+    // returned is never read (at least, not by Rust).
+    let buf = unsafe { alloc::alloc(buf_layout) };
+
+    // Request that signals be delivered to this alternate stack.
+    let stack = libc::stack_t {
+        ss_sp: buf as *mut libc::c_void,
+        ss_flags: 0,
+        ss_size: STACK_SIZE,
+    };
+    // SAFETY: `stack` is a valid pointer to a `stack_t` object and the second
+    // parameter, `old_ss`, is permitted to be `NULL` according to POSIX.
+    let ret = unsafe { libc::sigaltstack(&stack, ptr::null_mut()) };
+    if ret == -1 {
+        let errno = errno::from_i32(errno::errno());
+        bail!("failed to configure alternate signal stack: {}", errno);
+    }
+
+    // Install a handler for SIGSEGV.
+    let action = signal::SigAction::new(
+        signal::SigHandler::Handler(handle_sigsegv),
+        signal::SaFlags::SA_NODEFER | signal::SaFlags::SA_ONSTACK,
+        signal::SigSet::empty(),
+    );
+    // SAFETY: see `handle_sigsegv`.
+    unsafe { signal::sigaction(signal::SIGSEGV, &action) }
+        .context("failed to install SIGSEGV handler")?;
+
+    Ok(())
+}
+
+extern "C" fn handle_sigsegv(_: i32) {
+    // SAFETY: this is is a signal handler function and technically must be
+    // "async-signal safe" [0]. That typically means no memory allocation, which
+    // means no panics or backtraces... but if we're here, we're already doomed
+    // by a segfault. So there is little harm to ignoring the rules and
+    // panicking. If we're successful, as we often are, the panic will be caught
+    // by our panic handler and displayed nicely with a backtrace that traces
+    // *through* the signal handler and includes the frames that led to the
+    // SIGSEGV.
+    //
+    // [0]: https://man7.org/linux/man-pages/man7/signal-safety.7.html
+    panic!("stack overflow");
+}