cloudflare · inikulin · Aug 13, 2024 · Jul 17, 2024 · Jul 17, 2024 · Jul 18, 2024
diff --git a/foundations/src/telemetry/memory_profiler.rs b/foundations/src/telemetry/memory_profiler.rs
@@ -1,24 +1,16 @@
 use super::settings::MemoryProfilerSettings;
-use crate::utils::feature_use;
 use crate::{BootstrapError, BootstrapResult, Result};
 use anyhow::bail;
-use once_cell::sync::{Lazy, OnceCell};
+use once_cell::sync::OnceCell;
 use std::ffi::{CStr, CString};
 use std::fs::File;
 use std::io::Read;
 use std::os::raw::c_char;
-use std::thread;
+use std::sync::mpsc::{self};
 use tempfile::NamedTempFile;
-use tokio::sync::Mutex as AsyncMutex;
-use tokio::task::spawn_blocking;
-
-feature_use!(cfg(feature = "security"), {
-    use crate::security::common_syscall_allow_lists::SERVICE_BASICS;
-    use crate::security::{allow_list, enable_syscall_sandboxing, ViolationAction};
-});
+use tokio::sync::oneshot;
 
 static PROFILER: OnceCell<Option<MemoryProfiler>> = OnceCell::new();
-static PROFILING_IN_PROGRESS_LOCK: Lazy<AsyncMutex<()>> = Lazy::new(Default::default);
 
 mod control {
     use super::*;
@@ -49,19 +41,12 @@ mod control {
     }
 }
 
-// NOTE: prevent direct construction by the external code.
-#[derive(Copy, Clone)]
-struct Seal;
-
 /// A safe interface for [jemalloc]'s memory profiling functionality.
 ///
 /// [jemalloc]: https://github.com/jemalloc/jemalloc
-#[derive(Copy, Clone)]
+#[derive(Clone)]
 pub struct MemoryProfiler {
-    _seal: Seal,
-
-    #[cfg(feature = "security")]
-    sandbox_profiling_syscalls: bool,
+    request_heap_profile: mpsc::Sender<oneshot::Sender<Result<String>>>,
 }
 
 impl MemoryProfiler {
@@ -73,6 +58,9 @@ impl MemoryProfiler {
     /// Note that profiling needs to be explicitly enabled by setting `_RJEM_MALLOC_CONF=prof:true`
     /// environment variable for the binary and with [`MemoryProfilerSettings::enabled`] being set
     /// to `true`. Otherwise, this method will return `None`.
+    ///
+    /// If syscall sandboxing is being used (see [`crate::security`] for more details), the first
+    /// call to this method must happen before sandboxing is enabled: it spawns a worker thread.
     pub fn get_or_init_with(settings: &MemoryProfilerSettings) -> BootstrapResult<Option<Self>> {
         const MAX_SAMPLE_INTERVAL: u8 = 64;
 
@@ -83,7 +71,7 @@ impl MemoryProfiler {
 
         PROFILER
             .get_or_try_init(|| init_profiler(settings))
-            .copied()
+            .cloned()
     }
 
     /// Returns a heap profile.
@@ -110,29 +98,10 @@ impl MemoryProfiler {
     /// }
     /// ```
     pub async fn heap_profile(&self) -> Result<String> {
-        // NOTE: we use tokio mutex here, so we can hold the lock across `await` points.
-        let Ok(_lock) = PROFILING_IN_PROGRESS_LOCK.try_lock() else {
-            return Err("profiling is already in progress".into());
-        };
-
-        #[cfg(feature = "security")]
-        let sandbox_profiling_syscalls = self.sandbox_profiling_syscalls;
+        let (response_sender, response_receiver) = oneshot::channel();
+        self.request_heap_profile.send(response_sender)?;
 
-        let collector_thread = thread::spawn(move || {
-            #[cfg(feature = "security")]
-            if sandbox_profiling_syscalls {
-                sandbox_jemalloc_syscalls()?;
-            }
-
-            collect_heap_profile()
-        });
-
-        spawn_blocking(move || {
-            collector_thread
-                .join()
-                .map_err(|_| "heap profile collector thread panicked")?
-        })
-        .await?
+        response_receiver.await?
     }
 
     /// Returns heap statistics.
@@ -169,6 +138,10 @@ fn init_profiler(settings: &MemoryProfilerSettings) -> BootstrapResult<Option<Me
         return Ok(None);
     }
 
+    let (request_sender, request_receiver) = mpsc::channel();
+
+    std::thread::spawn(move || heap_profile_thread(request_receiver));
+
     control::write(control::BACKGROUND_THREAD, true).map_err(|e| {
         BootstrapError::new(e).context("failed to activate background thread collection")
     })?;
@@ -180,13 +153,20 @@ fn init_profiler(settings: &MemoryProfilerSettings) -> BootstrapResult<Option<Me
         .map_err(|e| BootstrapError::new(e).context("failed to activate profiling"))?;
 
     Ok(Some(MemoryProfiler {
-        _seal: Seal,
-
-        #[cfg(feature = "security")]
-        sandbox_profiling_syscalls: settings.sandbox_profiling_syscalls,
+        request_heap_profile: request_sender,
     }))
 }
 
+/// Serves heap profile requests on a dedicated thread. The loop ends once `recv` fails, i.e.
+/// after every request sender has been dropped.
+fn heap_profile_thread(receive_request: mpsc::Receiver<oneshot::Sender<Result<String>>>) {
+    while let Ok(send_response) = receive_request.recv() {
+        // A failed send only means this requester dropped its receiver (e.g. the `heap_profile`
+        // future was cancelled). Keep serving: returning here would fail all later requests.
+        let _ = send_response.send(collect_heap_profile());
+    }
+}
+
 fn collect_heap_profile() -> Result<String> {
     let out_file = NamedTempFile::new()?;
 
@@ -212,50 +192,50 @@ fn collect_heap_profile() -> Result<String> {
     Ok(String::from_utf8(profile)?)
 }
 
-#[cfg(feature = "security")]
-fn sandbox_jemalloc_syscalls() -> Result<()> {
-    #[cfg(target_arch = "x86_64")]
-    allow_list! {
-        static ALLOWED_SYSCALLS = [
-            ..SERVICE_BASICS,
-            // PXY-41: Required to call Instant::now from parking-lot.
-            clock_gettime,
-            openat,
-            creat,
-            unlink
-        ]
-    }
-
-    #[cfg(target_arch = "aarch64")]
-    allow_list! {
-        static ALLOWED_SYSCALLS = [
-            ..SERVICE_BASICS,
-            clock_gettime,
-            openat,
-            unlinkat
-        ]
-    }
-
-    enable_syscall_sandboxing(ViolationAction::KillProcess, &ALLOWED_SYSCALLS)?;
-
-    Ok(())
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
+    #[cfg(feature = "security")]
+    use crate::security::{
+        allow_list,
+        common_syscall_allow_lists::{ASYNC, SERVICE_BASICS},
+        enable_syscall_sandboxing, ViolationAction,
+    };
     use crate::telemetry::settings::MemoryProfilerSettings;
 
     #[test]
     fn sample_interval_out_of_bounds() {
         assert!(MemoryProfiler::get_or_init_with(&MemoryProfilerSettings {
             enabled: true,
             sample_interval: 128,
-            ..Default::default()
         })
         .is_err());
     }
 
+    /// Checks that a heap profile can still be collected once seccomp is active, as long as the
+    /// profiler was initialized (and its worker thread spawned) before sandboxing was enabled.
+    #[cfg(feature = "security")]
+    #[tokio::test]
+    async fn profile_heap_after_seccomp_initialized() {
+        let profiler = MemoryProfiler::get_or_init_with(&MemoryProfilerSettings {
+            enabled: true,
+            ..Default::default()
+        })
+        .unwrap()
+        .expect("profiling should be enabled via `_RJEM_MALLOC_CONF=prof:true` env var");
+
+        allow_list! {
+            static ALLOW_PROFILING = [
+                ..SERVICE_BASICS,
+                ..ASYNC
+            ]
+        }
+        enable_syscall_sandboxing(ViolationAction::KillProcess, &ALLOW_PROFILING).unwrap();
+
+        let profile = profiler.heap_profile().await.unwrap();
+        assert!(!profile.is_empty());
+    }
+
     // NOTE: `heap_profile` uses raw pointers, the test ensures that it doesn't affect the returned future
     fn _assert_heap_profile_fut_is_send() {
         fn is_send<T: Send>(_t: T) {}

diff --git a/foundations/src/telemetry/mod.rs b/foundations/src/telemetry/mod.rs
@@ -17,6 +17,10 @@
 //! begining of the `main` function) with the [`init`] function for it to be collected by the
 //! external sinks.
 //!
+//! If syscall sandboxing is also being used (see [`crate::security`] for more details), telemetry
+//! must be initialized prior to syscall sandboxing, since it uses syscalls during initialization
+//! that it will not use later.
+//!
 //! # Telemetry context
 //!
 //! Foundations' telemetry is designed to not interfere with the production code, so you usually don't
@@ -688,7 +692,7 @@ pub struct TelemetryConfig<'c> {
 /// doesn't need to be called in tests and any specified settings will be ignored in test
 /// environments. Instead, all the telemetry will be collected in the [`TestTelemetryContext`].
 ///
-/// The function should be called once on service initialization. Consequent calls to the function
+/// The function should be called once on service initialization (prior to any [syscall sandboxing]). Subsequent calls to the function
 /// don't have any effect.
 ///
 /// # Telemetry server
@@ -707,6 +711,7 @@ pub struct TelemetryConfig<'c> {
 /// [Prometheus text format]: https://prometheus.io/docs/instrumenting/exposition_formats/#text-based-format
 /// [jemalloc]: https://github.com/jemalloc/jemalloc
 /// [`TelemetryServerSettings::enabled`]: `crate::telemetry::settings::TelemetryServerSettings::enabled`
+/// [syscall sandboxing]: `crate::security`
 pub fn init(config: TelemetryConfig) -> BootstrapResult<TelemetryDriver> {
     let tele_futures: FuturesUnordered<_> = Default::default();
 

diff --git a/foundations/src/telemetry/server.rs b/foundations/src/telemetry/server.rs
@@ -47,6 +47,11 @@ pub(super) fn init(
     }
 
     let settings = Arc::new(settings);
+
+    // Eagerly init the memory profiler so it gets set up before syscalls are sandboxed with seccomp.
+    #[cfg(all(target_os = "linux", feature = "memory-profiling"))]
+    memory_profiling::profiler(Arc::clone(&settings)).map_err(|err| anyhow!(err))?;
+
     let router = create_router(&settings, custom_routes)?;
     let addr = settings.server.addr;
 
@@ -162,7 +167,7 @@ mod memory_profiling {
     use super::*;
     use crate::telemetry::MemoryProfiler;
 
-    fn profiler(settings: Arc<TelemetrySettings>) -> Result<MemoryProfiler> {
+    pub(super) fn profiler(settings: Arc<TelemetrySettings>) -> Result<MemoryProfiler> {
         MemoryProfiler::get_or_init_with(&settings.memory_profiler)?.ok_or_else(|| {
             "profiling should be enabled via `_RJEM_MALLOC_CONF=prof:true` env var".into()
         })

diff --git a/foundations/src/telemetry/settings/memory_profiler.rs b/foundations/src/telemetry/settings/memory_profiler.rs
@@ -17,14 +17,6 @@ pub struct MemoryProfilerSettings {
     /// The default is `19` (2 ^ 19 = 512KiB).
     #[serde(default = "MemoryProfilerSettings::default_sample_interval")]
     pub sample_interval: u8,
-
-    /// Enables [seccomp] sandboxing of syscalls made by [jemalloc] during heap profile collection.
-    ///
-    /// [seccomp]: https://en.wikipedia.org/wiki/Seccomp
-    /// [jemalloc]: https://github.com/jemalloc/jemalloc
-    #[cfg(feature = "security")]
-    #[serde(default = "MemoryProfilerSettings::default_sandbox_profiling_syscalls")]
-    pub sandbox_profiling_syscalls: bool,
 }
 
 #[cfg(not(feature = "settings"))]
@@ -33,10 +25,6 @@ impl Default for MemoryProfilerSettings {
         Self {
             enabled: false,
             sample_interval: MemoryProfilerSettings::default_sample_interval(),
-
-            #[cfg(feature = "security")]
-            sandbox_profiling_syscalls: MemoryProfilerSettings::default_sandbox_profiling_syscalls(
-            ),
         }
     }
 }
@@ -45,9 +33,4 @@ impl MemoryProfilerSettings {
     fn default_sample_interval() -> u8 {
         19
     }
-
-    #[cfg(feature = "security")]
-    fn default_sandbox_profiling_syscalls() -> bool {
-        true
-    }
 }