Skip to content

Commit

Permalink
WIP support for using PERF_RECORD_SWITCH
Browse files Browse the repository at this point in the history
  • Loading branch information
rocallahan committed Dec 15, 2023
1 parent 20efc0d commit c2eff3a
Show file tree
Hide file tree
Showing 9 changed files with 94 additions and 28 deletions.
5 changes: 0 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1728,11 +1728,6 @@ if(BUILD_TESTS)
\$ENV{DESTDIR}\${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/rr/testsuite/obj/bin_dir)")
endif(INSTALL_TESTSUITE)

add_test(check_environment
bash source_dir/src/test/check_environment_test.run)
set_tests_properties(check_environment
PROPERTIES FAIL_REGULAR_EXPRESSION "rr needs /proc/sys/kernel/perf_event_paranoid <= 1")

foreach(test ${BASIC_TESTS} ${TESTS_WITH_PROGRAM})
if (NOT x86ish AND ${test} MATCHES "^x86/.*")
continue()
Expand Down
6 changes: 2 additions & 4 deletions src/PerfCounters.cc
Original file line number Diff line number Diff line change
Expand Up @@ -683,8 +683,7 @@ static void start_pt(pid_t tid, PerfCounters::PTState& state) {
init_perf_event_attr(&attr, event_type, 0);
state.pt_perf_event_fd = start_counter(tid, -1, &attr);

size_t page_size = sysconf(_SC_PAGESIZE);
void* base = mmap(NULL, page_size + PT_PERF_DATA_SIZE,
void* base = mmap(NULL, page_size() + PT_PERF_DATA_SIZE,
PROT_READ | PROT_WRITE, MAP_SHARED, state.pt_perf_event_fd, 0);
if (base == MAP_FAILED) {
FATAL() << "Can't allocate memory for PT DATA area";
Expand Down Expand Up @@ -770,10 +769,9 @@ PTData PerfCounters::extract_intel_pt_data() {
void PerfCounters::PTState::stop() {
pt_perf_event_fd.close();
if (mmap_aux_buffer) {
size_t page_size = sysconf(_SC_PAGESIZE);
munmap(mmap_aux_buffer, mmap_header->aux_size);
mmap_aux_buffer = nullptr;
munmap(mmap_header, page_size + PT_PERF_DATA_SIZE);
munmap(mmap_header, page_size() + PT_PERF_DATA_SIZE);
mmap_header = nullptr;
}
}
Expand Down
8 changes: 4 additions & 4 deletions src/RecordSession.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2390,11 +2390,11 @@ static string lookup_by_path(const string& name) {
if (size >= 0) {
buf[size] = 0;
int val = atoi(buf);
if (val > 1) {
if (val > 2) {
fprintf(stderr,
"rr needs /proc/sys/kernel/perf_event_paranoid <= 1, but it is %d.\n"
"Change it to 1, or use 'rr record -n' (slow).\n"
"Consider putting 'kernel.perf_event_paranoid = 1' in /etc/sysctl.d/10-rr.conf.\n"
"rr needs /proc/sys/kernel/perf_event_paranoid <= 2, but it is %d.\n"
"Change it to 2.\n"
"Consider putting 'kernel.perf_event_paranoid = 2' in /etc/sysctl.d/10-rr.conf.\n"
"See 'man 8 sysctl', 'man 5 sysctl.d' (systemd systems)\n"
"and 'man 5 sysctl.conf' (non-systemd systems) for more details.\n",
val);
Expand Down
18 changes: 18 additions & 0 deletions src/RecordTask.cc
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ RecordTask::RecordTask(RecordSession& session, pid_t _tid, uint32_t serial,
emulated_stop_type(NOT_STOPPED),
blocked_sigs_dirty(true),
syscallbuf_blocked_sigs_generation(0),
desched_event_buf(nullptr),
flushed_num_rec_bytes(0),
flushed_syscallbuf(false),
delay_syscallbuf_reset_for_desched(false),
Expand Down Expand Up @@ -218,6 +219,10 @@ RecordTask::RecordTask(RecordSession& session, pid_t _tid, uint32_t serial,
}
}

/**
 * Size in bytes of the mapping used for a task's desched perf event
 * buffer (`desched_event_buf`): twice the system page size. The same
 * value must be used for both the mmap() and the matching munmap().
 */
static size_t desched_sample_buf_size() {
  return 2 * page_size();
}

RecordTask::~RecordTask() {
if (emulated_ptracer) {
emulated_ptracer->emulated_ptrace_tracees.erase(this);
Expand Down Expand Up @@ -261,6 +266,10 @@ RecordTask::~RecordTask() {

// If this was stopped, notify the scheduler.
set_stopped(false);

if (desched_event_buf) {
munmap(desched_event_buf, desched_sample_buf_size());
}
}

void RecordTask::record_exit_event(WriteChildTid write_child_tid) {
Expand Down Expand Up @@ -519,6 +528,15 @@ template <typename Arch> void RecordTask::init_buffers_arch() {
fds->add_monitor(this, desched_fd_child, new PreserveFileMonitor());
desched_fd = remote.retrieve_fd(desched_fd_child);

if (desched_event_buf) {
munmap(desched_event_buf, desched_sample_buf_size());
}
desched_event_buf = static_cast<struct perf_event_mmap_page*>(
mmap(NULL, desched_sample_buf_size(), PROT_READ | PROT_WRITE, MAP_SHARED, desched_fd, 0));
if (desched_event_buf == MAP_FAILED) {
FATAL() << "Can't allocate memory for desched perf event buffer";
}

if (trace_writer().supports_file_data_cloning() &&
session().use_read_cloning()) {
cloned_file_data_fname = trace_writer().file_data_clone_file_name(tuid());
Expand Down
1 change: 1 addition & 0 deletions src/RecordTask.h
Original file line number Diff line number Diff line change
Expand Up @@ -751,6 +751,7 @@ class RecordTask final : public Task {

SyscallbufCodeLayout syscallbuf_code_layout;
ScopedFd desched_fd;
struct perf_event_mmap_page* desched_event_buf;
/* Value of hdr->num_rec_bytes when the buffer was flushed */
uint32_t flushed_num_rec_bytes;
/* Nonzero after the trace recorder has flushed the
Expand Down
26 changes: 26 additions & 0 deletions src/kernel_metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <linux/shm.h>
#include <signal.h>
#include <syscall.h>
#include <sys/resource.h>

#include "kernel_abi.h"
#include "kernel_supplement.h"
Expand Down Expand Up @@ -560,4 +561,29 @@ NativeArch::siginfo_t convert_to_native_siginfo(SupportedArch arch,
RR_ARCH_FUNCTION(convert_to_native_siginfo_arch, arch, data, size);
}

/**
 * Produce a human-readable name for an rlimit resource identifier
 * (one of the RLIMIT_* constants). Identifiers that are not listed
 * below yield the text "Unknown RLIMIT_ <number>".
 */
string rlimit_resource_name(int resource) {
  switch (resource) {
    CASE(RLIMIT_AS);
    CASE(RLIMIT_CORE);
    CASE(RLIMIT_CPU);
    CASE(RLIMIT_DATA);
    CASE(RLIMIT_FSIZE);
    CASE(RLIMIT_LOCKS);
    CASE(RLIMIT_MEMLOCK);
    CASE(RLIMIT_MSGQUEUE);
    CASE(RLIMIT_NICE);
    CASE(RLIMIT_NOFILE);
    CASE(RLIMIT_NPROC);
    CASE(RLIMIT_RSS);
    CASE(RLIMIT_RTPRIO);
    CASE(RLIMIT_RTTIME);
    CASE(RLIMIT_SIGPENDING);
    CASE(RLIMIT_STACK);
    default:
      break;
  }

  // Not one of the listed resources: report the raw number instead.
  char fallback[100];
  snprintf(fallback, sizeof(fallback), "Unknown RLIMIT_ %d", resource);
  return string(fallback);
}

} // namespace rr
5 changes: 5 additions & 0 deletions src/kernel_metadata.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,11 @@ bool is_coredumping_signal(int signo);
NativeArch::siginfo_t convert_to_native_siginfo(SupportedArch arch,
const void* data, size_t size);

/**
* Convert an rlimit resource identifier (an RLIMIT_* constant) to its name
* as a string.
*/
std::string rlimit_resource_name(int resource);

} // namespace rr

#endif
9 changes: 7 additions & 2 deletions src/preload/syscallbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -636,9 +636,14 @@ static int open_desched_event_counter(pid_t tid) {
local_memset(&attr, 0, sizeof(attr));
attr.size = sizeof(attr);
attr.type = PERF_TYPE_SOFTWARE;
attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES;
attr.disabled = 1;
attr.config = PERF_COUNT_SW_DUMMY;
attr.sample_period = 1;
attr.disabled = 1;
attr.watermark = 1;
attr.context_switch = 1;
attr.wakeup_watermark = 1;
attr.exclude_kernel = 1;
attr.exclude_guest = 1;

tmp_fd = privileged_traced_perf_event_open(&attr, 0 /*self*/, -1 /*any cpu*/,
-1, 0);
Expand Down
44 changes: 31 additions & 13 deletions src/util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2262,32 +2262,50 @@ ssize_t read_to_end(const ScopedFd& fd, size_t offset, void* buf, size_t size) {
return ret;
}

static struct rlimit initial_fd_limit;
static struct rlimit raise_resource_limit(int resource, rlim_t max_value) {
struct rlimit initial;

void raise_resource_limits() {
if (getrlimit(RLIMIT_NOFILE, &initial_fd_limit) < 0) {
FATAL() << "Can't get RLIMIT_NOFILE";
if (getrlimit(resource, &initial) < 0) {
FATAL() << "Can't get rlimit " << rlimit_resource_name(resource);
}

struct rlimit new_limit = initial_fd_limit;
// Try raising fd limit to 65536
new_limit.rlim_cur = max<rlim_t>(new_limit.rlim_cur, 65536);
struct rlimit new_limit = initial;
new_limit.rlim_cur = max<rlim_t>(new_limit.rlim_cur, max_value);
if (new_limit.rlim_max != RLIM_INFINITY) {
new_limit.rlim_cur = min<rlim_t>(new_limit.rlim_cur, new_limit.rlim_max);
}
if (new_limit.rlim_cur != initial_fd_limit.rlim_cur) {
if (setrlimit(RLIMIT_NOFILE, &new_limit) < 0) {
LOG(warn) << "Failed to raise file descriptor limit";
if (new_limit.rlim_cur != initial.rlim_cur) {
if (setrlimit(resource, &new_limit) < 0) {
LOG(warn) << "Failed to raise rlimit " << rlimit_resource_name(resource)
<< " to " << new_limit.rlim_cur;
}
}

return initial;
}

void restore_initial_resource_limits() {
if (setrlimit(RLIMIT_NOFILE, &initial_fd_limit) < 0) {
LOG(warn) << "Failed to reset file descriptor limit";
static void restore_resource_limit(int resource, const struct rlimit& old_limit) {
if (setrlimit(resource, &old_limit) < 0) {
LOG(warn) << "Failed to reset rlimit " << rlimit_resource_name(resource);
}
}

// Number of tracee tasks used to size the resource limits requested in
// raise_resource_limits().
static const int MAX_TRACEE_TASKS = 65536;
// RLIMIT_NOFILE value as it was before raise_resource_limits() ran; used by
// restore_initial_resource_limits() to put the limit back.
static struct rlimit initial_fd_limit;
// RLIMIT_MEMLOCK value as it was before raise_resource_limits() ran; used by
// restore_initial_resource_limits() to put the limit back.
static struct rlimit initial_memlock_limit;

/**
 * Raise the soft RLIMIT_NOFILE and RLIMIT_MEMLOCK limits to values sized
 * for MAX_TRACEE_TASKS tracee tasks, remembering the previous limits so
 * that restore_initial_resource_limits() can put them back.
 */
void raise_resource_limits() {
  // File descriptors: a base allowance of 1024, plus up to 5 perf event
  // counters for every tracee task.
  const rlim_t wanted_fds = 1024 + 5 * MAX_TRACEE_TASKS;
  initial_fd_limit = raise_resource_limit(RLIMIT_NOFILE, wanted_fds);

  // Locked memory: two pages for every tracee task.
  // NOTE(review): the earlier note here said one page per task is typically
  // needed; the factor of 2 presumably matches a two-page per-task perf
  // buffer mapping or leaves headroom -- confirm.
  const rlim_t wanted_locked_bytes = page_size() * 2 * MAX_TRACEE_TASKS;
  initial_memlock_limit =
      raise_resource_limit(RLIMIT_MEMLOCK, wanted_locked_bytes);
}

/**
 * Undo raise_resource_limits(): reinstate the RLIMIT_NOFILE and
 * RLIMIT_MEMLOCK values that were saved when the limits were raised.
 */
void restore_initial_resource_limits() {
  // Each entry pairs a resource with the limit saved for it; they are
  // restored in the same order in which they were raised.
  const struct {
    int resource;
    const struct rlimit* saved;
  } saved_limits[] = {
    { RLIMIT_NOFILE, &initial_fd_limit },
    { RLIMIT_MEMLOCK, &initial_memlock_limit },
  };

  for (const auto& entry : saved_limits) {
    restore_resource_limit(entry.resource, *entry.saved);
  }
}

/**
 * Size in bytes of a machine word for architecture `Arch`, taken to be the
 * size of that architecture's `signed_long` type.
 */
template <typename Arch> static size_t word_size_arch() {
  typedef typename Arch::signed_long word_type;
  return sizeof(word_type);
}
Expand Down

0 comments on commit c2eff3a

Please sign in to comment.