From 3c327f000c30810c0e580bed463ec2c1c02b1e0c Mon Sep 17 00:00:00 2001 From: Levi Morrison Date: Mon, 10 Feb 2025 00:08:51 -0700 Subject: [PATCH] fix(prof): avoid dlclose if threads did not join (#3075) * fix(prof): avoid dlclose if threads did not join If the PHP engine dlclose's the handle and the shared object is unloaded while another thread is running, we've hit undefined behavior. This also probably results in a crash (on platforms that unload instead of no-op). This may be the source of a crash when php-fpm does a log rotate. When doing the rotate, php-fpm shuts down all workers. If a worker is slow to process an upload and the timeout is hit, then we could hit this issue. * refactor to panic::resume_unwind immediately * style: reconnect safety comment to unsafe block --- Cargo.lock | 53 +++++++++++++++++-------- profiling/Cargo.toml | 1 + profiling/src/lib.rs | 21 +++++++--- profiling/src/php_ffi.c | 1 + profiling/src/profiling/mod.rs | 40 ++++++++++++------- profiling/src/profiling/thread_utils.rs | 32 +++++++++------ 6 files changed, 99 insertions(+), 49 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8068b2b6e6..32efa4af9e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -702,7 +702,7 @@ dependencies = [ "serde_json", "serde_repr", "serde_urlencoded", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", "tower-service", @@ -789,7 +789,7 @@ dependencies = [ "semver", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -965,7 +965,7 @@ dependencies = [ "mime", "mime_guess", "rand 0.8.5", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -1471,6 +1471,7 @@ dependencies = [ "rand_distr", "rustc-hash", "serde_json", + "thiserror 2.0.11", "uuid", ] @@ -2088,7 +2089,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" dependencies = [ "libc 0.2.169", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -3169,7 +3170,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" dependencies = [ "cfg-if", - "windows-targets 0.48.5", + "windows-targets 0.52.6", ] [[package]] @@ -3539,7 +3540,7 @@ dependencies = [ "percent-encoding", "pin-project", "rand 0.8.5", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", ] @@ -3554,7 +3555,7 @@ dependencies = [ "lazy_static", "opentelemetry", "opentelemetry-semantic-conventions", - "thiserror", + "thiserror 1.0.69", "thrift", "tokio", ] @@ -3877,7 +3878,7 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "059a34f111a9dee2ce1ac2826a68b24601c4298cfeb1a587c3cb493d5ab46f52" dependencies = [ - "libc 0.1.12", + "libc 0.2.169", "nix 0.29.0", ] @@ -4268,7 +4269,7 @@ checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ "getrandom", "libredox", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -4437,7 +4438,7 @@ dependencies = [ "errno", "libc 0.2.169", "linux-raw-sys", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -4559,7 +4560,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a15e661f0f9dac21f3494fe5d23a6338c0ac116a2d22c2b63010acd89467ffe" dependencies = [ "byteorder", - "thiserror", + "thiserror 1.0.69", "twox-hash", ] @@ -5113,7 +5114,7 @@ dependencies = [ "serde_bytes", "static_assertions", "tarpc-plugins", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-serde", "tokio-util", @@ -5157,7 +5158,7 @@ dependencies = [ "getrandom", "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -5221,7 +5222,7 @@ dependencies = [ "serde", "serde_json", "serde_with", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", "tokio-util", @@ -5234,7 +5235,16 @@ version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc" +dependencies = [ + "thiserror-impl 2.0.11", ] [[package]] @@ -5248,6 +5258,17 @@ dependencies = [ "syn 2.0.96", ] +[[package]] +name = "thiserror-impl" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.96", +] + [[package]] name = "thread_local" version = "1.1.8" @@ -5968,7 +5989,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] [[package]] diff --git a/profiling/Cargo.toml b/profiling/Cargo.toml index 5bd0cb1c65..ef19028f97 100644 --- a/profiling/Cargo.toml +++ b/profiling/Cargo.toml @@ -33,6 +33,7 @@ serde_json = {version = "1.0"} rand = { version = "0.8.5" } rand_distr = { version = "0.4.3" } rustc-hash = "1.1.0" +thiserror = "2" uuid = { version = "1.0", features = ["v4"] } [dev-dependencies] diff --git a/profiling/src/lib.rs b/profiling/src/lib.rs index 2bb8929546..eb26685031 100644 --- a/profiling/src/lib.rs +++ b/profiling/src/lib.rs @@ -22,6 +22,7 @@ mod exception; mod timeline; use crate::config::{SystemSettings, INITIAL_SYSTEM_SETTINGS}; +use crate::zend::datadog_sapi_globals_request_info; use bindings::{ self as zend, ddog_php_prof_php_version, ddog_php_prof_php_version_id, ZendExtension, ZendResult, @@ -47,8 +48,6 @@ use std::sync::{Arc, Once}; use std::time::{Duration, Instant}; use uuid::Uuid; -use crate::zend::datadog_sapi_globals_request_info; - /// Name of the profiling module and zend_extension. Must not contain any /// interior null bytes and must be null terminated. static PROFILER_NAME: &[u8] = b"datadog-profiling\0"; @@ -890,11 +889,21 @@ extern "C" fn startup(extension: *mut ZendExtension) -> ZendResult { ZendResult::Success } -extern "C" fn shutdown(_extension: *mut ZendExtension) { +extern "C" fn shutdown(extension: *mut ZendExtension) { #[cfg(debug_assertions)] - trace!("shutdown({:p})", _extension); - - Profiler::shutdown(Duration::from_secs(2)); + trace!("shutdown({:p})", extension); + + // If a timeout was reached, then the thread is possibly alive. + // This means the engine cannot unload our handle, or else we'd hit + // immediate undefined behavior (and likely crash). + if let Err(err) = Profiler::shutdown(Duration::from_secs(2)) { + let num_failures = err.num_failures; + error!("{num_failures} thread(s) failed to join, intentionally leaking the extension's handle to prevent unloading"); + // SAFETY: during mshutdown, we have ownership of the extension struct. + // Our threads (which failed to join) do not mutate this struct at all + // either, providing no races. + unsafe { (*extension).handle = ptr::null_mut() } + } // SAFETY: calling in shutdown before zai config is shutdown, and after // all configuration is done being accessed. Well... in the happy-path, diff --git a/profiling/src/php_ffi.c b/profiling/src/php_ffi.c index 25b8b494a5..934ee2685f 100644 --- a/profiling/src/php_ffi.c +++ b/profiling/src/php_ffi.c @@ -265,6 +265,7 @@ void datadog_php_profiling_copy_string_view_into_zval(zval *dest, zai_str view, #ifdef CFG_TEST (void)dest; (void)view; + (void)persistent; ZEND_ASSERT(0); #else if (view.len == 0) { diff --git a/profiling/src/profiling/mod.rs b/profiling/src/profiling/mod.rs index 2f151e2bc5..e2ba644f2f 100644 --- a/profiling/src/profiling/mod.rs +++ b/profiling/src/profiling/mod.rs @@ -19,8 +19,6 @@ use crate::bindings::{datadog_php_profiling_get_profiling_context, zend_execute_ use crate::config::SystemSettings; use crate::{CLOCKS, TAGS}; use chrono::Utc; -#[cfg(feature = "timeline")] -use core::{ptr, str}; use crossbeam_channel::{Receiver, Sender, TrySendError}; use datadog_profiling::api::{ Function, Label as ApiLabel, Location, Period, Sample, ValueType as ApiValueType, @@ -40,6 +38,8 @@ use std::sync::{Arc, Barrier}; use std::thread::JoinHandle; use std::time::{Duration, Instant, SystemTime}; +#[cfg(feature = "timeline")] +use core::{ptr, str}; #[cfg(feature = "timeline")] use std::time::UNIX_EPOCH; @@ -689,33 +689,39 @@ impl Profiler { /// Completes the shutdown process; to start it, call [Profiler::stop] /// before calling [Profiler::shutdown]. /// Note the timeout is per thread, and there may be multiple threads. + /// Returns Ok(true) if any thread hit a timeout. /// /// Safety: only safe to be called in `SHUTDOWN`/`MSHUTDOWN` phase - pub fn shutdown(timeout: Duration) { + pub fn shutdown(timeout: Duration) -> Result<(), JoinError> { // SAFETY: the `take` access is a thread-safe API, and the PROFILER is // not being mutated outside single-threaded phases such as minit and // mshutdown. if let Some(profiler) = unsafe { PROFILER.take() } { - profiler.join_collector_and_uploader(timeout); + profiler.join_collector_and_uploader(timeout) + } else { + Ok(()) } } - pub fn join_collector_and_uploader(self, timeout: Duration) { + fn join_collector_and_uploader(self, timeout: Duration) -> Result<(), JoinError> { if self.should_join.load(Ordering::SeqCst) { - thread_utils::join_timeout( - self.time_collector_handle, - timeout, - "Recent samples may be lost.", - ); + let result1 = thread_utils::join_timeout(self.time_collector_handle, timeout); + if let Err(err) = &result1 { + warn!("{err}, recent samples may be lost"); + } // Wait for the time_collector to join, since that will drop // the sender half of the channel that the uploader is // holding, allowing it to finish. - thread_utils::join_timeout( - self.uploader_handle, - timeout, - "Recent samples are most likely lost.", - ); + let result2 = thread_utils::join_timeout(self.uploader_handle, timeout); + if let Err(err) = &result2 { + warn!("{err}, recent samples are most likely lost"); + } + + let num_failures = result1.is_err() as usize + result2.is_err() as usize; + result2.or(result1).map_err(|_| JoinError { num_failures }) + } else { + Ok(()) } } @@ -1288,6 +1294,10 @@ impl Profiler { } } +pub struct JoinError { + pub num_failures: usize, +} + #[cfg(test)] mod tests { use super::*; diff --git a/profiling/src/profiling/thread_utils.rs b/profiling/src/profiling/thread_utils.rs index 0848ba33d5..0542f1c3d9 100644 --- a/profiling/src/profiling/thread_utils.rs +++ b/profiling/src/profiling/thread_utils.rs @@ -1,15 +1,15 @@ -#[cfg(php_zts)] -use crate::sapi::Sapi; use crate::SAPI; -#[cfg(php_zts)] -use libc::c_char; use libc::sched_yield; -use log::warn; use once_cell::sync::OnceCell; use std::mem::MaybeUninit; use std::thread::JoinHandle; use std::time::{Duration, Instant}; +#[cfg(php_zts)] +use crate::sapi::Sapi; +#[cfg(php_zts)] +use libc::c_char; + /// Spawns a thread and masks off the signals that the Zend Engine uses. pub fn spawn(name: &str, f: F) -> JoinHandle where @@ -50,11 +50,18 @@ where } } +#[derive(thiserror::Error, Debug)] +#[error("timeout of {timeout_ms} ms reached when joining thread {thread}")] +pub struct TimeoutError { + thread: String, + timeout_ms: u128, +} + /// Waits for the handle to be finished. If finished, it will join the handle. -/// Otherwise, it will leak the handle. +/// Otherwise, it will leak the handle and return an error. /// # Panics -/// Panics if the thread being joined has panic'd. -pub fn join_timeout(handle: JoinHandle<()>, timeout: Duration, impact: &str) { +/// If the thread being joined has panic'd, this will resume the panic. +pub fn join_timeout(handle: JoinHandle<()>, timeout: Duration) -> Result<(), TimeoutError> { // After notifying the other threads, it's likely they'll need some time // to respond adequately. Joining on the JoinHandle is supposed to be the // correct way to do this, but we've observed this can panic: @@ -65,15 +72,16 @@ pub fn join_timeout(handle: JoinHandle<()>, timeout: Duration, impact: &str) { while !handle.is_finished() { unsafe { sched_yield() }; if start.elapsed() >= timeout { - let name = handle.thread().name().unwrap_or("{unknown}"); - warn!("Timeout of {timeout:?} reached when joining thread '{name}'. {impact}"); - return; + let thread = handle.thread().name().unwrap_or("{unknown}").to_string(); + let timeout_ms = timeout.as_millis(); + return Err(TimeoutError { thread, timeout_ms }); } } if let Err(err) = handle.join() { - std::panic::resume_unwind(err) + std::panic::resume_unwind(err); } + Ok(()) } thread_local! {