diff --git a/Cargo.lock b/Cargo.lock index 8068b2b6e6..32efa4af9e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -702,7 +702,7 @@ dependencies = [ "serde_json", "serde_repr", "serde_urlencoded", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", "tower-service", @@ -789,7 +789,7 @@ dependencies = [ "semver", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -965,7 +965,7 @@ dependencies = [ "mime", "mime_guess", "rand 0.8.5", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -1471,6 +1471,7 @@ dependencies = [ "rand_distr", "rustc-hash", "serde_json", + "thiserror 2.0.11", "uuid", ] @@ -2088,7 +2089,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" dependencies = [ "libc 0.2.169", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -3169,7 +3170,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" dependencies = [ "cfg-if", - "windows-targets 0.48.5", + "windows-targets 0.52.6", ] [[package]] @@ -3539,7 +3540,7 @@ dependencies = [ "percent-encoding", "pin-project", "rand 0.8.5", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", ] @@ -3554,7 +3555,7 @@ dependencies = [ "lazy_static", "opentelemetry", "opentelemetry-semantic-conventions", - "thiserror", + "thiserror 1.0.69", "thrift", "tokio", ] @@ -3877,7 +3878,7 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "059a34f111a9dee2ce1ac2826a68b24601c4298cfeb1a587c3cb493d5ab46f52" dependencies = [ - "libc 0.1.12", + "libc 0.2.169", "nix 0.29.0", ] @@ -4268,7 +4269,7 @@ checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ "getrandom", "libredox", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -4437,7 +4438,7 @@ dependencies = [ "errno", "libc 0.2.169", "linux-raw-sys", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -4559,7 +4560,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a15e661f0f9dac21f3494fe5d23a6338c0ac116a2d22c2b63010acd89467ffe" dependencies = [ "byteorder", - "thiserror", + "thiserror 1.0.69", "twox-hash", ] @@ -5113,7 +5114,7 @@ dependencies = [ "serde_bytes", "static_assertions", "tarpc-plugins", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-serde", "tokio-util", @@ -5157,7 +5158,7 @@ dependencies = [ "getrandom", "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -5221,7 +5222,7 @@ dependencies = [ "serde", "serde_json", "serde_with", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", "tokio-util", @@ -5234,7 +5235,16 @@ version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc" +dependencies = [ + "thiserror-impl 2.0.11", ] [[package]] @@ -5248,6 +5258,17 @@ dependencies = [ "syn 2.0.96", ] +[[package]] +name = "thiserror-impl" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.96", +] + [[package]] name = "thread_local" version = "1.1.8" @@ -5968,7 +5989,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] [[package]] diff --git a/profiling/Cargo.toml b/profiling/Cargo.toml index 5bd0cb1c65..ef19028f97 100644 --- a/profiling/Cargo.toml +++ b/profiling/Cargo.toml @@ -33,6 +33,7 @@ serde_json = {version = "1.0"} rand = { version = "0.8.5" } rand_distr = { version = "0.4.3" } rustc-hash = "1.1.0" +thiserror = "2" uuid = { version = "1.0", features = ["v4"] } [dev-dependencies] diff --git a/profiling/src/lib.rs b/profiling/src/lib.rs index 982d595e31..e24580b857 100644 --- a/profiling/src/lib.rs +++ b/profiling/src/lib.rs @@ -22,6 +22,7 @@ mod exception; mod timeline; use crate::config::{SystemSettings, INITIAL_SYSTEM_SETTINGS}; +use crate::zend::datadog_sapi_globals_request_info; use bindings::{ self as zend, ddog_php_prof_php_version, ddog_php_prof_php_version_id, ZendExtension, ZendResult, @@ -46,8 +47,6 @@ use std::sync::{Arc, Once}; use std::time::{Duration, Instant}; use uuid::Uuid; -use crate::zend::datadog_sapi_globals_request_info; - /// Name of the profiling module and zend_extension. Must not contain any /// interior null bytes and must be null terminated. static PROFILER_NAME: &[u8] = b"datadog-profiling\0"; @@ -889,11 +888,21 @@ extern "C" fn startup(extension: *mut ZendExtension) -> ZendResult { ZendResult::Success } -extern "C" fn shutdown(_extension: *mut ZendExtension) { +extern "C" fn shutdown(extension: *mut ZendExtension) { #[cfg(debug_assertions)] - trace!("shutdown({:p})", _extension); - - Profiler::shutdown(Duration::from_secs(2)); + trace!("shutdown({:p})", extension); + + // If a timeout was reached, then the thread is possibly alive. + // This means the engine cannot unload our handle, or else we'd hit + // immediate undefined behavior (and likely crash). + if let Err(err) = Profiler::shutdown(Duration::from_secs(2)) { + let num_failures = err.num_failures; + error!("{num_failures} thread(s) failed to join, intentionally leaking the extension's handle to prevent unloading"); + // SAFETY: during mshutdown, we have ownership of the extension struct. + // Our threads (which failed to join) do not mutate this struct at all + // either, providing no races. + unsafe { (*extension).handle = ptr::null_mut() } + } // SAFETY: calling in shutdown before zai config is shutdown, and after // all configuration is done being accessed. Well... in the happy-path, diff --git a/profiling/src/php_ffi.c b/profiling/src/php_ffi.c index 25b8b494a5..934ee2685f 100644 --- a/profiling/src/php_ffi.c +++ b/profiling/src/php_ffi.c @@ -265,6 +265,7 @@ void datadog_php_profiling_copy_string_view_into_zval(zval *dest, zai_str view, #ifdef CFG_TEST (void)dest; (void)view; + (void)persistent; ZEND_ASSERT(0); #else if (view.len == 0) { diff --git a/profiling/src/profiling/mod.rs b/profiling/src/profiling/mod.rs index 2f151e2bc5..e2ba644f2f 100644 --- a/profiling/src/profiling/mod.rs +++ b/profiling/src/profiling/mod.rs @@ -19,8 +19,6 @@ use crate::bindings::{datadog_php_profiling_get_profiling_context, zend_execute_ use crate::config::SystemSettings; use crate::{CLOCKS, TAGS}; use chrono::Utc; -#[cfg(feature = "timeline")] -use core::{ptr, str}; use crossbeam_channel::{Receiver, Sender, TrySendError}; use datadog_profiling::api::{ Function, Label as ApiLabel, Location, Period, Sample, ValueType as ApiValueType, @@ -40,6 +38,8 @@ use std::sync::{Arc, Barrier}; use std::thread::JoinHandle; use std::time::{Duration, Instant, SystemTime}; +#[cfg(feature = "timeline")] +use core::{ptr, str}; #[cfg(feature = "timeline")] use std::time::UNIX_EPOCH; @@ -689,33 +689,39 @@ impl Profiler { /// Completes the shutdown process; to start it, call [Profiler::stop] /// before calling [Profiler::shutdown]. /// Note the timeout is per thread, and there may be multiple threads. + /// Returns Ok(true) if any thread hit a timeout. /// /// Safety: only safe to be called in `SHUTDOWN`/`MSHUTDOWN` phase - pub fn shutdown(timeout: Duration) { + pub fn shutdown(timeout: Duration) -> Result<(), JoinError> { // SAFETY: the `take` access is a thread-safe API, and the PROFILER is // not being mutated outside single-threaded phases such as minit and // mshutdown. if let Some(profiler) = unsafe { PROFILER.take() } { - profiler.join_collector_and_uploader(timeout); + profiler.join_collector_and_uploader(timeout) + } else { + Ok(()) } } - pub fn join_collector_and_uploader(self, timeout: Duration) { + fn join_collector_and_uploader(self, timeout: Duration) -> Result<(), JoinError> { if self.should_join.load(Ordering::SeqCst) { - thread_utils::join_timeout( - self.time_collector_handle, - timeout, - "Recent samples may be lost.", - ); + let result1 = thread_utils::join_timeout(self.time_collector_handle, timeout); + if let Err(err) = &result1 { + warn!("{err}, recent samples may be lost"); + } // Wait for the time_collector to join, since that will drop // the sender half of the channel that the uploader is // holding, allowing it to finish. - thread_utils::join_timeout( - self.uploader_handle, - timeout, - "Recent samples are most likely lost.", - ); + let result2 = thread_utils::join_timeout(self.uploader_handle, timeout); + if let Err(err) = &result2 { + warn!("{err}, recent samples are most likely lost"); + } + + let num_failures = result1.is_err() as usize + result2.is_err() as usize; + result2.or(result1).map_err(|_| JoinError { num_failures }) + } else { + Ok(()) } } @@ -1288,6 +1294,10 @@ impl Profiler { } } +pub struct JoinError { + pub num_failures: usize, +} + #[cfg(test)] mod tests { use super::*; diff --git a/profiling/src/profiling/thread_utils.rs b/profiling/src/profiling/thread_utils.rs index 0848ba33d5..0542f1c3d9 100644 --- a/profiling/src/profiling/thread_utils.rs +++ b/profiling/src/profiling/thread_utils.rs @@ -1,15 +1,15 @@ -#[cfg(php_zts)] -use crate::sapi::Sapi; use crate::SAPI; -#[cfg(php_zts)] -use libc::c_char; use libc::sched_yield; -use log::warn; use once_cell::sync::OnceCell; use std::mem::MaybeUninit; use std::thread::JoinHandle; use std::time::{Duration, Instant}; +#[cfg(php_zts)] +use crate::sapi::Sapi; +#[cfg(php_zts)] +use libc::c_char; + /// Spawns a thread and masks off the signals that the Zend Engine uses. pub fn spawn(name: &str, f: F) -> JoinHandle where @@ -50,11 +50,18 @@ where } } +#[derive(thiserror::Error, Debug)] +#[error("timeout of {timeout_ms} ms reached when joining thread {thread}")] +pub struct TimeoutError { + thread: String, + timeout_ms: u128, +} + /// Waits for the handle to be finished. If finished, it will join the handle. -/// Otherwise, it will leak the handle. +/// Otherwise, it will leak the handle and return an error. /// # Panics -/// Panics if the thread being joined has panic'd. -pub fn join_timeout(handle: JoinHandle<()>, timeout: Duration, impact: &str) { +/// If the thread being joined has panic'd, this will resume the panic. +pub fn join_timeout(handle: JoinHandle<()>, timeout: Duration) -> Result<(), TimeoutError> { // After notifying the other threads, it's likely they'll need some time // to respond adequately. Joining on the JoinHandle is supposed to be the // correct way to do this, but we've observed this can panic: @@ -65,15 +72,16 @@ pub fn join_timeout(handle: JoinHandle<()>, timeout: Duration, impact: &str) { while !handle.is_finished() { unsafe { sched_yield() }; if start.elapsed() >= timeout { - let name = handle.thread().name().unwrap_or("{unknown}"); - warn!("Timeout of {timeout:?} reached when joining thread '{name}'. {impact}"); - return; + let thread = handle.thread().name().unwrap_or("{unknown}").to_string(); + let timeout_ms = timeout.as_millis(); + return Err(TimeoutError { thread, timeout_ms }); } } if let Err(err) = handle.join() { - std::panic::resume_unwind(err) + std::panic::resume_unwind(err); } + Ok(()) } thread_local! {