Skip to content

Commit

Permalink
utils: symbolize heap profiles (#10153)
Browse files Browse the repository at this point in the history
## Problem

Jemalloc heap profiles aren't symbolized. This is inconvenient, and
doesn't work with Grafana Cloud Profiles.

Resolves #9964.

## Summary of changes

Symbolize the heap profiles in-process, and strip unnecessary cruft.

This uses about 100 MB additional memory to cache the DWARF information,
but I believe this is already the case with CPU profiles, which use the
same library for symbolization. With cached DWARF information, the
symbolization CPU overhead is negligible.

Example profiles:

*
[pageserver.pb.gz](https://github.com/user-attachments/files/18141395/pageserver.pb.gz)
*
[safekeeper.pb.gz](https://github.com/user-attachments/files/18141396/safekeeper.pb.gz)
  • Loading branch information
erikgrinaker authored Dec 17, 2024
1 parent 007b13b commit a55853f
Show file tree
Hide file tree
Showing 6 changed files with 298 additions and 51 deletions.
105 changes: 65 additions & 40 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
atomic-take = "1.1.0"
backtrace = "0.3.74"
flate2 = "1.0.26"
async-stream = "0.3"
async-trait = "0.1"
Expand Down
5 changes: 4 additions & 1 deletion libs/utils/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,20 @@ arc-swap.workspace = true
sentry.workspace = true
async-compression.workspace = true
anyhow.workspace = true
backtrace.workspace = true
bincode.workspace = true
bytes.workspace = true
camino.workspace = true
chrono.workspace = true
diatomic-waker.workspace = true
flate2.workspace = true
git-version.workspace = true
hex = { workspace = true, features = ["serde"] }
humantime.workspace = true
hyper0 = { workspace = true, features = ["full"] }
itertools.workspace = true
fail.workspace = true
futures = { workspace = true}
futures = { workspace = true }
jemalloc_pprof.workspace = true
jsonwebtoken.workspace = true
nix.workspace = true
Expand Down
46 changes: 36 additions & 10 deletions libs/utils/src/http/endpoint.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,22 @@
use crate::auth::{AuthError, Claims, SwappableJwtAuth};
use crate::http::error::{api_error_handler, route_error_handler, ApiError};
use crate::http::request::{get_query_param, parse_query_param};
use crate::pprof;
use ::pprof::protos::Message as _;
use ::pprof::ProfilerGuardBuilder;
use anyhow::{anyhow, Context};
use bytes::{Bytes, BytesMut};
use hyper::header::{HeaderName, AUTHORIZATION, CONTENT_DISPOSITION};
use hyper::http::HeaderValue;
use hyper::Method;
use hyper::{header::CONTENT_TYPE, Body, Request, Response};
use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
use once_cell::sync::Lazy;
use regex::Regex;
use routerify::ext::RequestExt;
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
use tokio::sync::{mpsc, Mutex};
use tokio_stream::wrappers::ReceiverStream;
use tokio_util::io::ReaderStream;
use tracing::{debug, info, info_span, warn, Instrument};

Expand All @@ -18,11 +25,6 @@ use std::io::Write as _;
use std::str::FromStr;
use std::time::Duration;

use bytes::{Bytes, BytesMut};
use pprof::protos::Message as _;
use tokio::sync::{mpsc, Mutex};
use tokio_stream::wrappers::ReceiverStream;

static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"libmetrics_metric_handler_requests_total",
Expand Down Expand Up @@ -365,7 +367,7 @@ pub async fn profile_cpu_handler(req: Request<Body>) -> Result<Response<Body>, A

// Take the profile.
let report = tokio::task::spawn_blocking(move || {
let guard = pprof::ProfilerGuardBuilder::default()
let guard = ProfilerGuardBuilder::default()
.frequency(frequency_hz)
.blocklist(&["libc", "libgcc", "pthread", "vdso"])
.build()?;
Expand Down Expand Up @@ -457,10 +459,34 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
}

Format::Pprof => {
let data = tokio::task::spawn_blocking(move || prof_ctl.dump_pprof())
.await
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
.map_err(ApiError::InternalServerError)?;
let data = tokio::task::spawn_blocking(move || {
let bytes = prof_ctl.dump_pprof()?;
// Symbolize the profile.
// TODO: consider moving this upstream to jemalloc_pprof and avoiding the
// serialization roundtrip.
static STRIP_FUNCTIONS: Lazy<Vec<(Regex, bool)>> = Lazy::new(|| {
// Functions to strip from profiles. If true, also remove child frames.
vec![
(Regex::new("^__rust").unwrap(), false),
(Regex::new("^_start$").unwrap(), false),
(Regex::new("^irallocx_prof").unwrap(), true),
(Regex::new("^prof_alloc_prep").unwrap(), true),
(Regex::new("^std::rt::lang_start").unwrap(), false),
(Regex::new("^std::sys::backtrace::__rust").unwrap(), false),
]
});
let profile = pprof::decode(&bytes)?;
let profile = pprof::symbolize(profile)?;
let profile = pprof::strip_locations(
profile,
&["libc", "libgcc", "pthread", "vdso"],
&STRIP_FUNCTIONS,
);
pprof::encode(&profile)
})
.await
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
.map_err(ApiError::InternalServerError)?;
Response::builder()
.status(200)
.header(CONTENT_TYPE, "application/octet-stream")
Expand Down
Loading

1 comment on commit a55853f

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

7095 tests run: 6796 passed, 1 failed, 298 skipped (full report)


Failures on Postgres 17

# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_idle_checkpoints[debug-pg17]"
Flaky tests (7)

Postgres 16

  • test_pgdata_import_smoke[8-1024-RelBlockSize.MULTIPLE_RELATION_SEGMENTS]: release-arm64
  • test_pgdata_import_smoke[None-1024-RelBlockSize.MULTIPLE_RELATION_SEGMENTS]: release-arm64
  • test_lr_with_slow_safekeeper: release-arm64

Postgres 15

  • test_pgdata_import_smoke[8-1024-RelBlockSize.MULTIPLE_RELATION_SEGMENTS]: release-arm64
  • test_pgdata_import_smoke[None-1024-RelBlockSize.MULTIPLE_RELATION_SEGMENTS]: release-arm64

Postgres 14

  • test_pgdata_import_smoke[8-1024-RelBlockSize.MULTIPLE_RELATION_SEGMENTS]: release-arm64
  • test_pgdata_import_smoke[None-1024-RelBlockSize.MULTIPLE_RELATION_SEGMENTS]: release-arm64

Test coverage report is not available

The comment gets automatically updated with the latest test results
a55853f at 2024-12-17T17:58:58.858Z :recycle:

Please sign in to comment.