From 5e46012bbb3be4906faf5775f3cf13735722f6b0 Mon Sep 17 00:00:00 2001 From: d-netto Date: Wed, 25 Sep 2024 15:24:18 -0300 Subject: [PATCH] all tasks profiler --- src/julia_internal.h | 8 ++ src/signal-handling.c | 38 +++--- src/signals-mach.c | 177 ++++++++++++++------------ src/signals-unix.c | 226 +++++++++++++++++++++++----------- src/signals-win.c | 20 +-- src/stackwalk.c | 42 +++++-- src/task.c | 34 +++++ src/threading.c | 2 +- stdlib/Profile/src/Profile.jl | 71 ++++++++--- 9 files changed, 407 insertions(+), 211 deletions(-) diff --git a/src/julia_internal.h b/src/julia_internal.h index 94b1f85112d7df..df94c2c272915c 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -206,6 +206,14 @@ JL_DLLEXPORT void jl_unlock_profile_wr(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEA int jl_lock_stackwalk(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER; void jl_unlock_stackwalk(int lockret) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE; +arraylist_t jl_get_all_tasks_arraylist(void) JL_NOTSAFEPOINT; +int jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT; +extern volatile struct _jl_bt_element_t *profile_bt_data_prof; +extern volatile size_t profile_bt_size_max; +extern volatile size_t profile_bt_size_cur; +extern volatile int profile_running; +extern volatile int profile_all_tasks; + // number of cycles since power-on static inline uint64_t cycleclock(void) JL_NOTSAFEPOINT { diff --git a/src/signal-handling.c b/src/signal-handling.c index c2a344a7525472..d630c615b7d0e7 100644 --- a/src/signal-handling.c +++ b/src/signal-handling.c @@ -18,16 +18,16 @@ extern "C" { #include // Profiler control variables -// Note: these "static" variables are also used in "signals-*.c" -static volatile jl_bt_element_t *bt_data_prof = NULL; -static volatile size_t bt_size_max = 0; -static volatile size_t bt_size_cur = 0; +volatile jl_bt_element_t *profile_bt_data_prof = NULL; +volatile size_t profile_bt_size_max = 0; +volatile size_t profile_bt_size_cur = 0; static volatile uint64_t nsecprof = 0; -static volatile int running = 0; -static const uint64_t GIGA = 1000000000ULL; +volatile int profile_running = 0; +volatile int profile_all_tasks = 0; +static const uint64_t GIGA = 1000000000ULL; // Timers to take samples at intervals JL_DLLEXPORT void jl_profile_stop_timer(void); -JL_DLLEXPORT int jl_profile_start_timer(void); +JL_DLLEXPORT int jl_profile_start_timer(uint8_t); // File-descriptor for safe logging on signal handling int jl_sig_fd; @@ -36,30 +36,30 @@ int jl_sig_fd; /////////////////////// JL_DLLEXPORT int jl_profile_init(size_t maxsize, uint64_t delay_nsec) { - bt_size_max = maxsize; + profile_bt_size_max = maxsize; nsecprof = delay_nsec; - if (bt_data_prof != NULL) - free((void*)bt_data_prof); - bt_data_prof = (jl_bt_element_t*) calloc(maxsize, sizeof(jl_bt_element_t)); - if (bt_data_prof == NULL && maxsize > 0) + if (profile_bt_data_prof != NULL) + free((void*)profile_bt_data_prof); + profile_bt_data_prof = (jl_bt_element_t*) calloc(maxsize, sizeof(jl_bt_element_t)); + if (profile_bt_data_prof == NULL && maxsize > 0) return -1; - bt_size_cur = 0; + profile_bt_size_cur = 0; return 0; } JL_DLLEXPORT uint8_t *jl_profile_get_data(void) { - return (uint8_t*) bt_data_prof; + return (uint8_t*) profile_bt_data_prof; } JL_DLLEXPORT size_t jl_profile_len_data(void) { - return bt_size_cur; + return profile_bt_size_cur; } JL_DLLEXPORT size_t jl_profile_maxlen_data(void) { - return bt_size_max; + return profile_bt_size_max; } JL_DLLEXPORT uint64_t jl_profile_delay_nsec(void) @@ -69,12 +69,12 @@ JL_DLLEXPORT uint64_t jl_profile_delay_nsec(void) JL_DLLEXPORT void jl_profile_clear_data(void) { - bt_size_cur = 0; + profile_bt_size_cur = 0; } JL_DLLEXPORT int jl_profile_is_running(void) { - return running; + return profile_running; } // Any function that acquires this lock must be either a unmanaged thread @@ -187,7 +187,7 @@ JL_DLLEXPORT int jl_profile_is_buffer_full(void) // Declare buffer full if there isn't enough room to sample even just the // thread metadata and one max-sized frame. The `+ 6` is for the two block // terminator `0`'s plus the 4 metadata entries. - return bt_size_cur + ((JL_BT_MAX_ENTRY_SIZE + 1) + 6) > bt_size_max; + return profile_bt_size_cur + ((JL_BT_MAX_ENTRY_SIZE + 1) + 6) > profile_bt_size_max; } static uint64_t jl_last_sigint_trigger = 0; diff --git a/src/signals-mach.c b/src/signals-mach.c index 6ec8f95570f177..dcb5b7f7eb4991 100644 --- a/src/signals-mach.c +++ b/src/signals-mach.c @@ -603,6 +603,85 @@ void jl_unlock_stackwalk(int lockret) jl_unlock_profile_mach(1, lockret); } +// assumes holding `jl_lock_profile_mach` +void jl_profile_thread_mach(int tid) +{ + // if there is no space left, return early + if (jl_profile_is_buffer_full()) { + jl_profile_stop_timer(); + return; + } + if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL) + _dyld_dlopen_atfork_prepare(); + if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) + _dyld_atfork_prepare(); // briefly acquire the dlsym lock + host_thread_state_t state; + int valid_thread = jl_thread_suspend_and_get_state2(tid, &state); + unw_context_t *uc = (unw_context_t*)&state; + if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) + _dyld_atfork_parent(); // quickly release the dlsym lock + if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL) + _dyld_dlopen_atfork_parent(); + if (!valid_thread) + return; + if (profile_running) { +#ifdef LLVMLIBUNWIND + /* + * Unfortunately compact unwind info is incorrectly generated for quite a number of + * libraries by quite a large number of compilers. We can fall back to DWARF unwind info + * in some cases, but in quite a number of cases (especially libraries not compiled in debug + * mode, only the compact unwind info may be available). Even more unfortunately, there is no + * way to detect such bogus compact unwind info (other than noticing the resulting segfault). + * What we do here is ugly, but necessary until the compact unwind info situation improves. + * We try to use the compact unwind info and if that results in a segfault, we retry with DWARF info. + * Note that in a small number of cases this may result in bogus stack traces, but at least the topmost + * entry will always be correct, and the number of cases in which this is an issue is rather small. + * Other than that, this implementation is not incorrect as the other thread is paused while we are profiling + * and during stack unwinding we only ever read memory, but never write it. + */ + + forceDwarf = 0; + unw_getcontext(&profiler_uc); // will resume from this point if the next lines segfault at any point + + if (forceDwarf == 0) { + // Save the backtrace + profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur, profile_bt_size_max - profile_bt_size_cur - 1, uc, NULL); + } + else if (forceDwarf == 1) { + profile_bt_size_cur += rec_backtrace_ctx_dwarf((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur, profile_bt_size_max - profile_bt_size_cur - 1, uc, NULL); + } + else if (forceDwarf == -1) { + jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n"); + } + + forceDwarf = -2; +#else + profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur, profile_bt_size_max - profile_bt_size_cur - 1, uc, NULL); +#endif + jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; + + // store threadid but add 1 as 0 is preserved to indicate end of block + profile_bt_data_prof[profile_bt_size_cur++].uintptr = ptls->tid + 1; + + // store task id (never null) + profile_bt_data_prof[profile_bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task); + + // store cpu cycle clock + profile_bt_data_prof[profile_bt_size_cur++].uintptr = cycleclock(); + + // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block + profile_bt_data_prof[profile_bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1; + + // Mark the end of this block with two 0's + profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0; + profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0; + } + // We're done! Resume the thread. + jl_thread_resume(tid); +} + +void jl_profile_task_unix(void); + void *mach_profile_listener(void *arg) { (void)arg; @@ -620,88 +699,21 @@ void *mach_profile_listener(void *arg) // sample each thread, round-robin style in reverse order // (so that thread zero gets notified last) int keymgr_locked = jl_lock_profile_mach(0); - int nthreads = jl_atomic_load_acquire(&jl_n_threads); - int *randperm = profile_get_randperm(nthreads); - for (int idx = nthreads; idx-- > 0; ) { - // Stop the threads in the random or reverse round-robin order. - int i = randperm[idx]; - // if there is no space left, break early - if (jl_profile_is_buffer_full()) { - jl_profile_stop_timer(); - break; - } - - if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL) - _dyld_dlopen_atfork_prepare(); - if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) - _dyld_atfork_prepare(); // briefly acquire the dlsym lock - host_thread_state_t state; - int valid_thread = jl_thread_suspend_and_get_state2(i, &state); - unw_context_t *uc = (unw_context_t*)&state; - if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) - _dyld_atfork_parent(); // quickly release the dlsym lock - if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL) - _dyld_dlopen_atfork_parent(); - if (!valid_thread) - continue; - if (running) { -#ifdef LLVMLIBUNWIND - /* - * Unfortunately compact unwind info is incorrectly generated for quite a number of - * libraries by quite a large number of compilers. We can fall back to DWARF unwind info - * in some cases, but in quite a number of cases (especially libraries not compiled in debug - * mode, only the compact unwind info may be available). Even more unfortunately, there is no - * way to detect such bogus compact unwind info (other than noticing the resulting segfault). - * What we do here is ugly, but necessary until the compact unwind info situation improves. - * We try to use the compact unwind info and if that results in a segfault, we retry with DWARF info. - * Note that in a small number of cases this may result in bogus stack traces, but at least the topmost - * entry will always be correct, and the number of cases in which this is an issue is rather small. - * Other than that, this implementation is not incorrect as the other thread is paused while we are profiling - * and during stack unwinding we only ever read memory, but never write it. - */ - - forceDwarf = 0; - unw_getcontext(&profiler_uc); // will resume from this point if the next lines segfault at any point - - if (forceDwarf == 0) { - // Save the backtrace - bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL); - } - else if (forceDwarf == 1) { - bt_size_cur += rec_backtrace_ctx_dwarf((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL); - } - else if (forceDwarf == -1) { - jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n"); - } - - forceDwarf = -2; -#else - bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL); -#endif - jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[i]; - - // store threadid but add 1 as 0 is preserved to indicate end of block - bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1; - - // store task id (never null) - bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task); - - // store cpu cycle clock - bt_data_prof[bt_size_cur++].uintptr = cycleclock(); - - // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block - bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1; - - // Mark the end of this block with two 0's - bt_data_prof[bt_size_cur++].uintptr = 0; - bt_data_prof[bt_size_cur++].uintptr = 0; + if (profile_all_tasks) { + // Don't take the stackwalk lock here since it's already taken in `jl_rec_backtrace` + jl_profile_task_unix(); + } + else { + int *randperm = profile_get_randperm(nthreads); + for (int idx = nthreads; idx-- > 0; ) { + // Stop the threads in random order. + int i = randperm[idx]; + jl_profile_thread_mach(i); } - // We're done! Resume the thread. - jl_thread_resume(i); } jl_unlock_profile_mach(0, keymgr_locked); - if (running) { + if (profile_running) { jl_check_profile_autostop(); // Reset the alarm kern_return_t ret = clock_alarm(clk, TIME_RELATIVE, timerprof, profile_port); @@ -710,7 +722,8 @@ void *mach_profile_listener(void *arg) } } -JL_DLLEXPORT int jl_profile_start_timer(void) + +JL_DLLEXPORT int jl_profile_start_timer(uint8_t all_tasks) { kern_return_t ret; if (!profile_started) { @@ -739,7 +752,8 @@ JL_DLLEXPORT int jl_profile_start_timer(void) timerprof.tv_sec = nsecprof/GIGA; timerprof.tv_nsec = nsecprof%GIGA; - running = 1; + profile_running = 1; + profile_all_tasks = all_tasks; // ensure the alarm is running ret = clock_alarm(clk, TIME_RELATIVE, timerprof, profile_port); HANDLE_MACH_ERROR("clock_alarm", ret); @@ -749,5 +763,6 @@ JL_DLLEXPORT int jl_profile_start_timer(void) JL_DLLEXPORT void jl_profile_stop_timer(void) { - running = 0; + profile_running = 0; + profile_all_tasks = 0; } diff --git a/src/signals-unix.c b/src/signals-unix.c index 3ebf7954dccfc7..f62808f063b726 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -9,6 +9,10 @@ #include #include #include + +#include "julia.h" +#include "julia_internal.h" + #if defined(_OS_DARWIN_) && !defined(MAP_ANONYMOUS) #define MAP_ANONYMOUS MAP_ANON #endif @@ -562,7 +566,7 @@ int timer_graceperiod_elapsed(void) static timer_t timerprof; static struct itimerspec itsprof; -JL_DLLEXPORT int jl_profile_start_timer(void) +JL_DLLEXPORT int jl_profile_start_timer(uint8_t all_tasks) { struct sigevent sigprof; @@ -571,10 +575,12 @@ JL_DLLEXPORT int jl_profile_start_timer(void) sigprof.sigev_notify = SIGEV_SIGNAL; sigprof.sigev_signo = SIGUSR1; sigprof.sigev_value.sival_ptr = &timerprof; - // Because SIGUSR1 is multipurpose, set `running` before so that we know that the first SIGUSR1 came from the timer - running = 1; + // Because SIGUSR1 is multipurpose, set `profile_running` before so that we know that the first SIGUSR1 came from the timer + profile_running = 1; + profile_all_tasks = all_tasks; if (timer_create(CLOCK_REALTIME, &sigprof, &timerprof) == -1) { - running = 0; + profile_running = 0; + profile_all_tasks = 0; return -2; } @@ -584,7 +590,8 @@ JL_DLLEXPORT int jl_profile_start_timer(void) itsprof.it_value.tv_sec = nsecprof / GIGA; itsprof.it_value.tv_nsec = nsecprof % GIGA; if (timer_settime(timerprof, 0, &itsprof, NULL) == -1) { - running = 0; + profile_running = 0; + profile_all_tasks = 0; return -3; } return 0; @@ -592,10 +599,10 @@ JL_DLLEXPORT int jl_profile_start_timer(void) JL_DLLEXPORT void jl_profile_stop_timer(void) { - if (running) { + if (profile_running) { timer_delete(timerprof); last_timer_delete_time = jl_hrtime(); - running = 0; + profile_running = 0; } } @@ -691,7 +698,7 @@ void trigger_profile_peek(void) jl_safe_printf("\n======================================================================================\n"); jl_safe_printf("Information request received. A stacktrace will print followed by a %.1f second profile\n", profile_peek_duration); jl_safe_printf("======================================================================================\n"); - if (bt_size_max == 0){ + if (profile_bt_size_max == 0){ // If the buffer hasn't been initialized, initialize with default size // Keep these values synchronized with Profile.default_init() if (jl_profile_init(10000000, 1000000) == -1) { @@ -699,13 +706,120 @@ void trigger_profile_peek(void) return; } } - bt_size_cur = 0; // clear profile buffer - if (jl_profile_start_timer() < 0) + profile_bt_size_cur = 0; // clear profile buffer + if (jl_profile_start_timer(0) < 0) jl_safe_printf("ERROR: Could not start profile timer\n"); else profile_autostop_time = jl_hrtime() + (profile_peek_duration * 1e9); } +void jl_profile_task_unix(void) +{ + if (jl_profile_is_buffer_full()) { + // Buffer full: Delete the timer + jl_profile_stop_timer(); + return; + } + + // Get a random task for which we know that we won't + // drop a sample in the profiler + // TODO(Diogo, Nick): should we put an upper bound on the number of iterations? + arraylist_t tasks = jl_get_all_tasks_arraylist(); + jl_task_t *t = NULL; + uint64_t seed = jl_rand(); + while (1) { + t = tasks.items[cong(tasks.len, UINT64_MAX, &seed)]; + assert(t == NULL || jl_is_task(t)); + if (t == NULL) { + continue; + } + int t_state = jl_atomic_load_relaxed(&t->_state); + if (t_state == JL_TASK_STATE_DONE) { + continue; + } + break; + } + arraylist_free(&tasks); + assert(t != NULL); + + int tid = jl_rec_backtrace(t); + // we failed to get a backtrace + // TODO(Diogo, Nick): should we add a special value to the buffer to indicate this? + // Should we retry? Note that dropping a backtraces affects the effective sample rate... + if (tid == INT16_MAX) { + return; + } + // TODO(Diogo, Nick): we were running the profiler on a Julia session with --threads=8 ans saw same samples + // from thread 16. Investigate whether this makes sense... + + // store threadid but add 1 as 0 is preserved to indicate end of block + profile_bt_data_prof[profile_bt_size_cur++].uintptr = tid + 1; + + // store task id (never null) + profile_bt_data_prof[profile_bt_size_cur++].jlvalue = (jl_value_t*)t; + + // store cpu cycle clock + // TODO(Diogo, Nick): why are we storing the cycle clock here? + profile_bt_data_prof[profile_bt_size_cur++].uintptr = cycleclock(); + + // the thread profiler uses this block to record whether the thread is not sleeping (1) or sleeping (2) + // let's use a dummy value which is not 1 or 2 to + // indicate that we are profiling a task, and therefore, this block is not about the thread state + profile_bt_data_prof[profile_bt_size_cur++].uintptr = 3; + + // Mark the end of this block with two 0's + profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0; + profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0; +} + +// assumes holding `jl_lock_stackwalk` +void jl_profile_thread_unix(int tid, bt_context_t *signal_context) +{ + if (jl_profile_is_buffer_full()) { + // Buffer full: Delete the timer + jl_profile_stop_timer(); + return; + } + // notify thread to stop + if (!jl_thread_suspend_and_get_state(tid, 1, signal_context)) + return; + // unwinding can fail, so keep track of the current state + // and restore from the SEGV handler if anything happens. + jl_jmp_buf *old_buf = jl_get_safe_restore(); + jl_jmp_buf buf; + + jl_set_safe_restore(&buf); + if (jl_setjmp(buf, 0)) { + jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n"); + } else { + // Get backtrace data + profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur, + profile_bt_size_max - profile_bt_size_cur - 1, signal_context, NULL); + } + jl_set_safe_restore(old_buf); + + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; + + // store threadid but add 1 as 0 is preserved to indicate end of block + profile_bt_data_prof[profile_bt_size_cur++].uintptr = ptls2->tid + 1; + + // store task id (never null) + profile_bt_data_prof[profile_bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls2->current_task); + + // store cpu cycle clock + profile_bt_data_prof[profile_bt_size_cur++].uintptr = cycleclock(); + + // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block + profile_bt_data_prof[profile_bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls2->sleep_check_state) + 1; + + // Mark the end of this block with two 0's + profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0; + profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0; + + // notify thread to resume + jl_thread_resume(tid); +} + static void *signal_listener(void *arg) { static jl_bt_element_t bt_data[JL_MAX_BT_SIZE + 1]; @@ -805,13 +919,13 @@ static void *signal_listener(void *arg) int doexit = critical; #ifdef SIGINFO if (sig == SIGINFO) { - if (running != 1) + if (profile_running != 1) trigger_profile_peek(); doexit = 0; } #else if (sig == SIGUSR1) { - if (running != 1 && timer_graceperiod_elapsed()) + if (profile_running != 1 && timer_graceperiod_elapsed()) trigger_profile_peek(); doexit = 0; } @@ -845,78 +959,46 @@ static void *signal_listener(void *arg) bt_size = 0; #if !defined(JL_DISABLE_LIBUNWIND) bt_context_t signal_context; - // sample each thread, round-robin style in reverse order - // (so that thread zero gets notified last) - if (critical || profile) { + if (critical) { int lockret = jl_lock_stackwalk(); - int *randperm; - if (profile) - randperm = profile_get_randperm(nthreads); - for (int idx = nthreads; idx-- > 0; ) { - // Stop the threads in the random or reverse round-robin order. - int i = profile ? randperm[idx] : idx; + // sample each thread, round-robin style in reverse order + // (so that thread zero gets notified last) + for (int i = nthreads; i-- > 0; ) { // notify thread to stop if (!jl_thread_suspend_and_get_state(i, 1, &signal_context)) continue; // do backtrace on thread contexts for critical signals // this part must be signal-handler safe - if (critical) { - bt_size += rec_backtrace_ctx(bt_data + bt_size, - JL_MAX_BT_SIZE / nthreads - 1, - &signal_context, NULL); - bt_data[bt_size++].uintptr = 0; - } - - // do backtrace for profiler - if (profile && running) { - if (jl_profile_is_buffer_full()) { - // Buffer full: Delete the timer - jl_profile_stop_timer(); - } - else { - // unwinding can fail, so keep track of the current state - // and restore from the SEGV handler if anything happens. - jl_jmp_buf *old_buf = jl_get_safe_restore(); - jl_jmp_buf buf; - - jl_set_safe_restore(&buf); - if (jl_setjmp(buf, 0)) { - jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n"); - } else { - // Get backtrace data - bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, - bt_size_max - bt_size_cur - 1, &signal_context, NULL); - } - jl_set_safe_restore(old_buf); - - jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[i]; - - // store threadid but add 1 as 0 is preserved to indicate end of block - bt_data_prof[bt_size_cur++].uintptr = ptls2->tid + 1; - - // store task id (never null) - bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls2->current_task); - - // store cpu cycle clock - bt_data_prof[bt_size_cur++].uintptr = cycleclock(); - - // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block - bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls2->sleep_check_state) + 1; - - // Mark the end of this block with two 0's - bt_data_prof[bt_size_cur++].uintptr = 0; - bt_data_prof[bt_size_cur++].uintptr = 0; - } - } - - // notify thread to resume + bt_size += rec_backtrace_ctx(bt_data + bt_size, + JL_MAX_BT_SIZE / nthreads - 1, + &signal_context, NULL); + bt_data[bt_size++].uintptr = 0; jl_thread_resume(i); } jl_unlock_stackwalk(lockret); } + else if (profile) { + if (profile_all_tasks) { + // Don't take the stackwalk lock here since it's already taken in `jl_rec_backtrace` + jl_profile_task_unix(); + } + else { + int lockret = jl_lock_stackwalk(); + int *randperm = profile_get_randperm(nthreads); + for (int idx = nthreads; idx-- > 0; ) { + // Stop the threads in the random order. + int i = randperm[idx]; + // do backtrace for profiler + if (profile && profile_running) { + jl_profile_thread_unix(i, &signal_context); + } + } + jl_unlock_stackwalk(lockret); + } + } #ifndef HAVE_MACH - if (profile && running) { + if (profile && profile_running) { jl_check_profile_autostop(); #if defined(HAVE_TIMER) timer_settime(timerprof, 0, &itsprof, NULL); diff --git a/src/signals-win.c b/src/signals-win.c index bcb3a1fd246f0e..c6cc556f41db2d 100644 --- a/src/signals-win.c +++ b/src/signals-win.c @@ -415,26 +415,26 @@ static DWORD WINAPI profile_bt( LPVOID lparam ) break; } // Get backtrace data - bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, - bt_size_max - bt_size_cur - 1, &ctxThread, NULL); + profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur, + profile_bt_size_max - profile_bt_size_cur - 1, &ctxThread, NULL); jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; // given only profiling hMainThread // store threadid but add 1 as 0 is preserved to indicate end of block - bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1; + profile_bt_data_prof[profile_bt_size_cur++].uintptr = ptls->tid + 1; // store task id (never null) - bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task); + profile_bt_data_prof[profile_bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task); // store cpu cycle clock - bt_data_prof[bt_size_cur++].uintptr = cycleclock(); + profile_bt_data_prof[profile_bt_size_cur++].uintptr = cycleclock(); // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block - bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1; + profile_bt_data_prof[profile_bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1; // Mark the end of this block with two 0's - bt_data_prof[bt_size_cur++].uintptr = 0; - bt_data_prof[bt_size_cur++].uintptr = 0; + profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0; + profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0; jl_unlock_stackwalk(lockret); jl_thread_resume(0); jl_check_profile_autostop(); @@ -449,7 +449,7 @@ static DWORD WINAPI profile_bt( LPVOID lparam ) static volatile TIMECAPS timecaps; -JL_DLLEXPORT int jl_profile_start_timer(void) +JL_DLLEXPORT int jl_profile_start_timer(uint8_t all_tasks) { if (hBtThread == NULL) { @@ -483,6 +483,7 @@ JL_DLLEXPORT int jl_profile_start_timer(void) if (TIMERR_NOERROR != timeBeginPeriod(timecaps.wPeriodMin)) timecaps.wPeriodMin = 0; } + profile_all_tasks = all_tasks; running = 1; // set `running` finally return 0; } @@ -491,6 +492,7 @@ JL_DLLEXPORT void jl_profile_stop_timer(void) if (running && timecaps.wPeriodMin) timeEndPeriod(timecaps.wPeriodMin); running = 0; + profile_all_tasks = 0; } void jl_install_default_signal_handlers(void) diff --git a/src/stackwalk.c b/src/stackwalk.c index ab3f13c8e97468..5c8893d28638f2 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -868,26 +868,35 @@ _os_ptr_munge(uintptr_t ptr) #endif -extern bt_context_t *jl_to_bt_context(void *sigctx); +STATIC_INLINE int all_tasks_profile_running(void) +{ + return profile_running && profile_all_tasks; +} -static void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT +int jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT { - jl_task_t *ct = jl_current_task; - jl_ptls_t ptls = ct->ptls; - ptls->bt_size = 0; + jl_task_t *ct = NULL; + jl_ptls_t ptls = NULL; + int16_t tid = INT16_MAX; + if (!all_tasks_profile_running()) { + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + ptls->bt_size = 0; + tid = ptls->tid; + } if (t == ct) { ptls->bt_size = rec_backtrace(ptls->bt_data, JL_MAX_BT_SIZE, 0); - return; + return ptls->tid; } bt_context_t *context = NULL; bt_context_t c; int16_t old = -1; - while (!jl_atomic_cmpswap(&t->tid, &old, ptls->tid) && old != ptls->tid) { + while (!jl_atomic_cmpswap(&t->tid, &old, tid) && old != tid) { int lockret = jl_lock_stackwalk(); // if this task is already running somewhere, we need to stop the thread it is running on and query its state if (!jl_thread_suspend_and_get_state(old, 0, &c)) { jl_unlock_stackwalk(lockret); - return; + return INT16_MAX; // to indicate a failure to get the backtrace } jl_unlock_stackwalk(lockret); if (jl_atomic_load_relaxed(&t->tid) == old) { @@ -1109,12 +1118,23 @@ static void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT #pragma message("jl_rec_backtrace not defined for unknown task system") #endif } - if (context) - ptls->bt_size = rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE, context, t->gcstack); + if (context) { + // Record into the profile buffer + if (all_tasks_profile_running()) { + profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur, + profile_bt_size_max - profile_bt_size_cur - 1, context, NULL); + } + // Record into the buffer owned by the threads's TLS + else { + ptls->bt_size = rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE, + context, t->gcstack); + } + } if (old == -1) jl_atomic_store_relaxed(&t->tid, old); - else if (old != ptls->tid) + else if (old != tid) jl_thread_resume(old); + return old; } //-------------------------------------------------- diff --git a/src/task.c b/src/task.c index 86033a81ddf412..c45f1168d406ae 100644 --- a/src/task.c +++ b/src/task.c @@ -1115,6 +1115,40 @@ JL_DLLEXPORT jl_task_t *jl_get_current_task(void) return pgcstack == NULL ? NULL : container_of(pgcstack, jl_task_t, gcstack); } +extern int gc_first_tid; + +// Select a task at random to profile. Racy: `live_tasks` can change at any time. +arraylist_t jl_get_all_tasks_arraylist(void) JL_NOTSAFEPOINT +{ + arraylist_t tasks; + arraylist_new(&tasks, 0); + size_t nthreads = jl_atomic_load_acquire(&jl_n_threads); + jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states); + for (size_t i = 0; i < nthreads; i++) { + // skip GC threads... + if (gc_first_tid <= i && i < gc_first_tid + jl_n_gcthreads) { + continue; + } + jl_ptls_t ptls2 = allstates[i]; + if (ptls2 == NULL) { + continue; + } + jl_task_t *t = ptls2->root_task; + if (t->stkbuf != NULL) { + arraylist_push(&tasks, t); + } + small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks; + size_t n = mtarraylist_length(live_tasks); + for (size_t i = 0; i < n; i++) { + jl_task_t *t = (jl_task_t*)mtarraylist_get(live_tasks, i); + if (t->stkbuf != NULL) { + arraylist_push(&tasks, t); + } + } + } + return tasks; +} + #ifdef JL_HAVE_ASYNCIFY JL_DLLEXPORT jl_ucontext_t *task_ctx_ptr(jl_task_t *t) diff --git a/src/threading.c b/src/threading.c index d89e8d09745fc3..d0f25de374462c 100644 --- a/src/threading.c +++ b/src/threading.c @@ -1035,7 +1035,7 @@ JL_DLLEXPORT int jl_heartbeat_resume(void) if (uv_sem_trywait(&heartbeat_off_sem) != 0) { return -1; } - + // reset state as we've been paused n_hbs_missed = 0; n_hbs_recvd = 0; diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl index a0a1ecd1964ed9..f00afefd5a3c13 100644 --- a/stdlib/Profile/src/Profile.jl +++ b/stdlib/Profile/src/Profile.jl @@ -31,6 +31,25 @@ macro profile(ex) end end +export @profile_tasks + +""" + @profile_tasks + +`@profile_tasks ` runs your expression while taking periodic backtraces of a sample of all live tasks (both running and not running). +These are appended to an internal buffer of backtraces. +""" +macro profile_tasks(ex) + return quote + try + start_timer(true) + $(esc(ex)) + finally + stop_timer() + end + end +end + # An internal function called to show the report after an information request (SIGINFO or SIGUSR1). function _peek_report() iob = IOBuffer() @@ -369,9 +388,9 @@ end function has_meta(data) for i in 6:length(data) - data[i] == 0 || continue # first block end null - data[i - 1] == 0 || continue # second block end null - data[i - META_OFFSET_SLEEPSTATE] in 1:2 || continue + data[i] == 0 || continue # first block end null + data[i - 1] == 0 || continue # second block end null + data[i - META_OFFSET_SLEEPSTATE] in 1:3 || continue # 1 for not sleeping, 2 for sleeping, 3 for task profiler fake state data[i - META_OFFSET_CPUCYCLECLOCK] != 0 || continue data[i - META_OFFSET_TASKID] != 0 || continue data[i - META_OFFSET_THREADID] != 0 || continue @@ -562,9 +581,9 @@ Julia, and examine the resulting `*.mem` files. clear_malloc_data() = ccall(:jl_clear_malloc_data, Cvoid, ()) # C wrappers -function start_timer() +function start_timer(all_tasks::Bool=false) check_init() # if the profile buffer hasn't been initialized, initialize with default size - status = ccall(:jl_profile_start_timer, Cint, ()) + status = ccall(:jl_profile_start_timer, Cint, (Bool,), all_tasks) if status < 0 error(error_codes[status]) end @@ -676,12 +695,16 @@ function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict, startframe = length(data) skip = false nsleeping = 0 + task_profiler = false for i in startframe:-1:1 (startframe - 1) >= i >= (startframe - (nmeta + 1)) && continue # skip metadata (its read ahead below) and extra block end NULL IP ip = data[i] if is_block_end(data, i) # read metadata - thread_sleeping = data[i - META_OFFSET_SLEEPSTATE] - 1 # subtract 1 as state is incremented to avoid being equal to 0 + thread_sleeping_state = data[i - META_OFFSET_SLEEPSTATE] - 1 # subtract 1 as state is incremented to avoid being equal to 0 + if thread_sleeping_state == 2 + task_profiler = true + end # cpu_cycle_clock = data[i - META_OFFSET_CPUCYCLECLOCK] taskid = data[i - META_OFFSET_TASKID] threadid = data[i - META_OFFSET_THREADID] @@ -689,7 +712,7 @@ function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict, skip = true continue end - if thread_sleeping == 1 + if thread_sleeping_state == 1 nsleeping += 1 end skip = false @@ -723,12 +746,12 @@ function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict, end end @assert length(lilist) == length(n) == length(m) == length(lilist_idx) - return (lilist, n, m, totalshots, nsleeping) + return (lilist, n, m, totalshots, nsleeping, task_profiler) end function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfoFlatDict}, cols::Int, fmt::ProfileFormat, threads::Union{Int,AbstractVector{Int}}, tasks::Union{UInt,AbstractVector{UInt}}, is_subsection::Bool) - lilist, n, m, totalshots, nsleeping = parse_flat(fmt.combine ? StackFrame : UInt64, data, lidict, fmt.C, threads, tasks) + lilist, n, m, totalshots, nsleeping, task_profiler = parse_flat(fmt.combine ? StackFrame : UInt64, data, lidict, fmt.C, threads, tasks) if false # optional: drop the "non-interpretable" ones keep = map(frame -> frame != UNKNOWN && frame.line != 0, lilist) lilist = lilist[keep] @@ -748,11 +771,15 @@ function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfo return true end is_subsection || print_flat(io, lilist, n, m, cols, filenamemap, fmt) - Base.print(io, "Total snapshots: ", totalshots, ". Utilization: ", round(Int, util_perc), "%") + if task_profiler + Base.print(io, "Total snapshots: ", totalshots, "\n") + else + Base.print(io, "Total snapshots: ", totalshots, ". Utilization: ", round(Int, util_perc), "%") + end if is_subsection println(io) print_flat(io, lilist, n, m, cols, filenamemap, fmt) - else + elseif !task_profiler Base.print(io, " across all threads and tasks. Use the `groupby` kwarg to break down by thread and/or task.\n") end return false @@ -927,12 +954,16 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI startframe = length(all) skip = false nsleeping = 0 + task_profiler = false for i in startframe:-1:1 (startframe - 1) >= i >= (startframe - (nmeta + 1)) && continue # skip metadata (it's read ahead below) and extra block end NULL IP ip = all[i] if is_block_end(all, i) # read metadata - thread_sleeping = all[i - META_OFFSET_SLEEPSTATE] - 1 # subtract 1 as state is incremented to avoid being equal to 0 + thread_sleeping_state = all[i - META_OFFSET_SLEEPSTATE] - 1 # subtract 1 as state is incremented to avoid being equal to 0 + if thread_sleeping_state == 2 + task_profiler = true + end # cpu_cycle_clock = all[i - META_OFFSET_CPUCYCLECLOCK] taskid = all[i - META_OFFSET_TASKID] threadid = all[i - META_OFFSET_THREADID] @@ -941,7 +972,7 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI skip = true continue end - if thread_sleeping == 1 + if thread_sleeping_state == 1 nsleeping += 1 end skip = false @@ -1047,7 +1078,7 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI nothing end cleanup!(root) - return root, nsleeping + return root, nsleeping, task_profiler end function maxstats(root::StackFrameTree) @@ -1116,9 +1147,9 @@ end function tree(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoFlatDict, LineInfoDict}, cols::Int, fmt::ProfileFormat, threads::Union{Int,AbstractVector{Int}}, tasks::Union{UInt,AbstractVector{UInt}}, is_subsection::Bool) if fmt.combine - root, nsleeping = tree!(StackFrameTree{StackFrame}(), data, lidict, fmt.C, fmt.recur, threads, tasks) + root, nsleeping, task_profiler = tree!(StackFrameTree{StackFrame}(), data, lidict, fmt.C, fmt.recur, threads, tasks) else - root, nsleeping = tree!(StackFrameTree{UInt64}(), data, lidict, fmt.C, fmt.recur, threads, tasks) + root, nsleeping, task_profiler = tree!(StackFrameTree{UInt64}(), data, lidict, fmt.C, fmt.recur, threads, tasks) end util_perc = (1 - (nsleeping / root.count)) * 100 is_subsection || print_tree(io, root, cols, fmt, is_subsection) @@ -1132,11 +1163,15 @@ function tree(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoFlatDict, Line end return true end - Base.print(io, "Total snapshots: ", root.count, ". Utilization: ", round(Int, util_perc), "%") + if task_profiler + Base.print(io, "Total snapshots: ", root.count, "\n") + else + Base.print(io, "Total snapshots: ", root.count, ". Utilization: ", round(Int, util_perc), "%") + end if is_subsection Base.println(io) print_tree(io, root, cols, fmt, is_subsection) - else + elseif !task_profiler Base.print(io, " across all threads and tasks. Use the `groupby` kwarg to break down by thread and/or task.\n") end return false