diff --git a/src/gc.c b/src/gc.c index dad57687325450..3701df108a0f46 100644 --- a/src/gc.c +++ b/src/gc.c @@ -3922,7 +3922,6 @@ void jl_gc_init(void) JL_MUTEX_INIT(&heapsnapshot_lock, "heapsnapshot_lock"); JL_MUTEX_INIT(&finalizers_lock, "finalizers_lock"); uv_mutex_init(&page_profile_lock); - uv_mutex_init(&gc_perm_lock); uv_mutex_init(&gc_threads_lock); uv_cond_init(&gc_threads_cond); uv_sem_init(&gc_sweep_assists_needed, 0); diff --git a/src/init.c b/src/init.c index e482f8b77ee9ba..37eb08f172be53 100644 --- a/src/init.c +++ b/src/init.c @@ -738,6 +738,8 @@ static void init_global_mutexes(void) { JL_MUTEX_INIT(&typecache_lock, "typecache_lock"); } +extern uv_mutex_t array_to_string_print_lock; + JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel) { // initialize many things, in no particular order @@ -747,6 +749,10 @@ JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel) // Make sure we finalize the tls callback before starting any threads. (void)jl_get_pgcstack(); + // Initialize a few locks... 
+ uv_mutex_init(&gc_perm_lock); + uv_mutex_init(&array_to_string_print_lock); + // initialize backtraces jl_init_profile_lock(); #ifdef _OS_WINDOWS_ @@ -773,6 +779,7 @@ JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel) jl_io_loop = uv_default_loop(); // this loop will internal events (spawning process etc.), // best to call this first, since it also initializes libuv jl_init_uv(); + jl_init_threading(); init_stdio(); restore_fp_env(); if (jl_options.handle_signals == JL_OPTIONS_HANDLE_SIGNALS_ON) @@ -818,7 +825,6 @@ JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel) jl_init_rand(); jl_init_runtime_ccall(); jl_init_tasks(); - jl_init_threading(); jl_init_threadinginfra(); if (jl_options.handle_signals == JL_OPTIONS_HANDLE_SIGNALS_ON) jl_install_default_signal_handlers(); @@ -855,8 +861,6 @@ JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel) void jl_init_heartbeat(void); -extern uv_mutex_t array_to_string_print_lock; - static NOINLINE void _finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_task_t *ct) { JL_TIMING(JULIA_INIT, JULIA_INIT); @@ -892,8 +896,9 @@ static NOINLINE void _finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_ } if (jl_base_module == NULL) { - // nthreads > 1 requires code in Base - jl_atomic_store_relaxed(&jl_n_threads, 1); + const int num_min_mutator_threads = 1; // main thread + // nthreads > num_min_mutator_threads requires code in Base + jl_atomic_store_relaxed(&jl_n_threads, num_min_mutator_threads); jl_n_markthreads = 0; jl_n_sweepthreads = 0; jl_n_gcthreads = 0; @@ -904,8 +909,6 @@ static NOINLINE void _finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_ jl_start_gc_threads(); uv_barrier_wait(&thread_init_done); - uv_mutex_init(&array_to_string_print_lock); - jl_init_heartbeat(); jl_gc_enable(1); diff --git a/src/julia_internal.h b/src/julia_internal.h index 94b1f85112d7df..079661a59b8c3e 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -206,6 +206,14 @@ JL_DLLEXPORT void jl_unlock_profile_wr(void) 
JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEA int jl_lock_stackwalk(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER; void jl_unlock_stackwalk(int lockret) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE; +jl_task_t *jl_get_random_task(void) JL_NOTSAFEPOINT; +void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT; +extern volatile struct _jl_bt_element_t *bt_data_prof; +extern volatile size_t bt_size_max; +extern volatile size_t bt_size_cur; +extern volatile int running; +extern volatile int profile_all_tasks; + // number of cycles since power-on static inline uint64_t cycleclock(void) JL_NOTSAFEPOINT { diff --git a/src/partr.c b/src/partr.c index 33631dc83c05a8..03ab1d5ff688e7 100644 --- a/src/partr.c +++ b/src/partr.c @@ -90,6 +90,23 @@ JL_DLLEXPORT uint32_t jl_rand_ptls(uint32_t max, uint32_t unbias) return cong(max, -(uint64_t)-unbias, &ptls->rngseed); } +jl_ptls_t jl_threadfun_preamble(void *arg, uint8_t state) +{ + jl_threadarg_t *targ = (jl_threadarg_t*)arg; + // initialize this thread (set tid and create heap) + jl_ptls_t ptls = jl_init_threadtls(targ->tid); + void *stack_lo, *stack_hi; + jl_init_stack_limits(0, &stack_lo, &stack_hi); + // warning: this changes `jl_current_task`, so be careful not to call that from this function + jl_task_t *ct = jl_init_root_task(ptls, stack_lo, stack_hi); + JL_GC_PROMISE_ROOTED(ct); + // wait for all threads + jl_gc_state_set(ptls, state, 0); + uv_barrier_wait(targ->barrier); + free(targ); + return ptls; +} + // initialize the threading infrastructure // (called only by the main thread) void jl_init_threadinginfra(void) @@ -123,19 +140,7 @@ void jl_parallel_gc_threadfun(void *arg) { jl_threadarg_t *targ = (jl_threadarg_t*)arg; - // initialize this thread (set tid and create heap) - jl_ptls_t ptls = jl_init_threadtls(targ->tid); - void *stack_lo, *stack_hi; - jl_init_stack_limits(0, &stack_lo, &stack_hi); - // warning: this changes `jl_current_task`, so be careful not to call that from this function - jl_task_t *ct = jl_init_root_task(ptls, stack_lo, 
stack_hi); - JL_GC_PROMISE_ROOTED(ct); - // wait for all threads - jl_gc_state_set(ptls, JL_GC_PARALLEL_COLLECTOR_THREAD, 0); - uv_barrier_wait(targ->barrier); - - // free the thread argument here - free(targ); + jl_ptls_t ptls = jl_threadfun_preamble(targ, JL_GC_PARALLEL_COLLECTOR_THREAD); while (1) { uv_mutex_lock(&gc_threads_lock); @@ -158,19 +163,8 @@ void jl_concurrent_gc_threadfun(void *arg) { jl_threadarg_t *targ = (jl_threadarg_t*)arg; - // initialize this thread (set tid and create heap) - jl_ptls_t ptls = jl_init_threadtls(targ->tid); - void *stack_lo, *stack_hi; - jl_init_stack_limits(0, &stack_lo, &stack_hi); - // warning: this changes `jl_current_task`, so be careful not to call that from this function - jl_task_t *ct = jl_init_root_task(ptls, stack_lo, stack_hi); - JL_GC_PROMISE_ROOTED(ct); - // wait for all threads - jl_gc_state_set(ptls, JL_GC_CONCURRENT_COLLECTOR_THREAD, 0); - uv_barrier_wait(targ->barrier); - - // free the thread argument here - free(targ); + jl_ptls_t ptls = jl_threadfun_preamble(targ, JL_GC_CONCURRENT_COLLECTOR_THREAD); + (void)ptls; while (1) { assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_CONCURRENT_COLLECTOR_THREAD); @@ -184,20 +178,8 @@ void jl_threadfun(void *arg) { jl_threadarg_t *targ = (jl_threadarg_t*)arg; - // initialize this thread (set tid, create heap, set up root task) - jl_ptls_t ptls = jl_init_threadtls(targ->tid); - void *stack_lo, *stack_hi; - jl_init_stack_limits(0, &stack_lo, &stack_hi); - // warning: this changes `jl_current_task`, so be careful not to call that from this function - jl_task_t *ct = jl_init_root_task(ptls, stack_lo, stack_hi); - JL_GC_PROMISE_ROOTED(ct); - - // wait for all threads - jl_gc_state_set(ptls, JL_GC_STATE_SAFE, 0); - uv_barrier_wait(targ->barrier); - - // free the thread argument here - free(targ); + jl_ptls_t ptls = jl_threadfun_preamble(targ, JL_GC_STATE_SAFE); + jl_task_t *ct = jl_current_task; (void)jl_gc_unsafe_enter(ptls); jl_finish_task(ct); // noreturn diff --git 
a/src/signal-handling.c b/src/signal-handling.c index c2a344a7525472..dfd025da65a5b1 100644 --- a/src/signal-handling.c +++ b/src/signal-handling.c @@ -18,16 +18,16 @@ extern "C" { #include // Profiler control variables -// Note: these "static" variables are also used in "signals-*.c" -static volatile jl_bt_element_t *bt_data_prof = NULL; -static volatile size_t bt_size_max = 0; -static volatile size_t bt_size_cur = 0; +volatile jl_bt_element_t *bt_data_prof = NULL; +volatile size_t bt_size_max = 0; +volatile size_t bt_size_cur = 0; static volatile uint64_t nsecprof = 0; -static volatile int running = 0; -static const uint64_t GIGA = 1000000000ULL; +volatile int running = 0; +volatile int profile_all_tasks = 0; +static const uint64_t GIGA = 1000000000ULL; // Timers to take samples at intervals JL_DLLEXPORT void jl_profile_stop_timer(void); -JL_DLLEXPORT int jl_profile_start_timer(void); +JL_DLLEXPORT int jl_profile_start_timer(uint8_t); // File-descriptor for safe logging on signal handling int jl_sig_fd; diff --git a/src/signals-mach.c b/src/signals-mach.c index 6ec8f95570f177..5d6cd0d405a13a 100644 --- a/src/signals-mach.c +++ b/src/signals-mach.c @@ -603,6 +603,85 @@ void jl_unlock_stackwalk(int lockret) jl_unlock_profile_mach(1, lockret); } +// assumes holding `jl_lock_profile_mach` +void jl_profile_thread_mach(int tid) +{ + // if there is no space left, return early + if (jl_profile_is_buffer_full()) { + jl_profile_stop_timer(); + return; + } + if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL) + _dyld_dlopen_atfork_prepare(); + if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) + _dyld_atfork_prepare(); // briefly acquire the dlsym lock + host_thread_state_t state; + int valid_thread = jl_thread_suspend_and_get_state2(tid, &state); + unw_context_t *uc = (unw_context_t*)&state; + if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) + _dyld_atfork_parent(); // quickly release the dlsym lock + if 
(_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL) + _dyld_dlopen_atfork_parent(); + if (!valid_thread) + return; + if (running) { +#ifdef LLVMLIBUNWIND + /* + * Unfortunately compact unwind info is incorrectly generated for quite a number of + * libraries by quite a large number of compilers. We can fall back to DWARF unwind info + * in some cases, but in quite a number of cases (especially libraries not compiled in debug + * mode, only the compact unwind info may be available). Even more unfortunately, there is no + * way to detect such bogus compact unwind info (other than noticing the resulting segfault). + * What we do here is ugly, but necessary until the compact unwind info situation improves. + * We try to use the compact unwind info and if that results in a segfault, we retry with DWARF info. + * Note that in a small number of cases this may result in bogus stack traces, but at least the topmost + * entry will always be correct, and the number of cases in which this is an issue is rather small. + * Other than that, this implementation is not incorrect as the other thread is paused while we are profiling + * and during stack unwinding we only ever read memory, but never write it. 
+ */ + + forceDwarf = 0; + unw_getcontext(&profiler_uc); // will resume from this point if the next lines segfault at any point + + if (forceDwarf == 0) { + // Save the backtrace + bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL); + } + else if (forceDwarf == 1) { + bt_size_cur += rec_backtrace_ctx_dwarf((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL); + } + else if (forceDwarf == -1) { + jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n"); + } + + forceDwarf = -2; +#else + bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL); +#endif + jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; + + // store threadid but add 1 as 0 is preserved to indicate end of block + bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1; + + // store task id (never null) + bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task); + + // store cpu cycle clock + bt_data_prof[bt_size_cur++].uintptr = cycleclock(); + + // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block + bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1; + + // Mark the end of this block with two 0's + bt_data_prof[bt_size_cur++].uintptr = 0; + bt_data_prof[bt_size_cur++].uintptr = 0; + } + // We're done! Resume the thread. 
+ jl_thread_resume(tid); +} + +void jl_profile_task_unix(size_t nthreads); + void *mach_profile_listener(void *arg) { (void)arg; @@ -620,85 +699,19 @@ void *mach_profile_listener(void *arg) // sample each thread, round-robin style in reverse order // (so that thread zero gets notified last) int keymgr_locked = jl_lock_profile_mach(0); int nthreads = jl_atomic_load_acquire(&jl_n_threads); - int *randperm = profile_get_randperm(nthreads); - for (int idx = nthreads; idx-- > 0; ) { - // Stop the threads in the random or reverse round-robin order. - int i = randperm[idx]; - // if there is no space left, break early - if (jl_profile_is_buffer_full()) { - jl_profile_stop_timer(); - break; - } - - if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL) - _dyld_dlopen_atfork_prepare(); - if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) - _dyld_atfork_prepare(); // briefly acquire the dlsym lock - host_thread_state_t state; - int valid_thread = jl_thread_suspend_and_get_state2(i, &state); - unw_context_t *uc = (unw_context_t*)&state; - if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) - _dyld_atfork_parent(); // quickly release the dlsym lock - if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL) - _dyld_dlopen_atfork_parent(); - if (!valid_thread) - continue; - if (running) { -#ifdef LLVMLIBUNWIND - /* - * Unfortunately compact unwind info is incorrectly generated for quite a number of - * libraries by quite a large number of compilers. We can fall back to DWARF unwind info - * in some cases, but in quite a number of cases (especially libraries not compiled in debug - * mode, only the compact unwind info may be available). Even more unfortunately, there is no - * way to detect such bogus compact unwind info (other than noticing the resulting segfault). - * What we do here is ugly, but necessary until the compact unwind info situation improves. 
- * We try to use the compact unwind info and if that results in a segfault, we retry with DWARF info. - * Note that in a small number of cases this may result in bogus stack traces, but at least the topmost - * entry will always be correct, and the number of cases in which this is an issue is rather small. - * Other than that, this implementation is not incorrect as the other thread is paused while we are profiling - * and during stack unwinding we only ever read memory, but never write it. - */ - - forceDwarf = 0; - unw_getcontext(&profiler_uc); // will resume from this point if the next lines segfault at any point - - if (forceDwarf == 0) { - // Save the backtrace - bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL); - } - else if (forceDwarf == 1) { - bt_size_cur += rec_backtrace_ctx_dwarf((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL); - } - else if (forceDwarf == -1) { - jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n"); - } - - forceDwarf = -2; -#else - bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL); -#endif - jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[i]; - - // store threadid but add 1 as 0 is preserved to indicate end of block - bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1; - - // store task id (never null) - bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task); - - // store cpu cycle clock - bt_data_prof[bt_size_cur++].uintptr = cycleclock(); - - // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block - bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1; - - // Mark the end of this block with two 0's - bt_data_prof[bt_size_cur++].uintptr = 0; - bt_data_prof[bt_size_cur++].uintptr = 0; + if 
(profile_all_tasks) { + // Don't take the stackwalk lock here since it's already taken in `jl_rec_backtrace` + jl_profile_task_unix(nthreads); + } + else { + int *randperm = profile_get_randperm(nthreads); + for (int idx = nthreads; idx-- > 0; ) { + // Stop the threads in random order. + int i = randperm[idx]; + jl_profile_thread_mach(i); } - // We're done! Resume the thread. - jl_thread_resume(i); } jl_unlock_profile_mach(0, keymgr_locked); if (running) { @@ -710,7 +722,8 @@ void *mach_profile_listener(void *arg) } } -JL_DLLEXPORT int jl_profile_start_timer(void) + +JL_DLLEXPORT int jl_profile_start_timer(uint8_t all_tasks) { kern_return_t ret; if (!profile_started) { @@ -740,6 +753,7 @@ JL_DLLEXPORT int jl_profile_start_timer(void) timerprof.tv_nsec = nsecprof%GIGA; running = 1; + profile_all_tasks = all_tasks; // ensure the alarm is running ret = clock_alarm(clk, TIME_RELATIVE, timerprof, profile_port); HANDLE_MACH_ERROR("clock_alarm", ret); @@ -750,4 +764,5 @@ JL_DLLEXPORT int jl_profile_start_timer(void) JL_DLLEXPORT void jl_profile_stop_timer(void) { running = 0; + profile_all_tasks = 0; } diff --git a/src/signals-unix.c b/src/signals-unix.c index 3ebf7954dccfc7..db425a082c72e3 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -9,6 +9,10 @@ #include #include #include + +#include "julia.h" +#include "julia_internal.h" + #if defined(_OS_DARWIN_) && !defined(MAP_ANONYMOUS) #define MAP_ANONYMOUS MAP_ANON #endif @@ -562,7 +566,7 @@ int timer_graceperiod_elapsed(void) static timer_t timerprof; static struct itimerspec itsprof; -JL_DLLEXPORT int jl_profile_start_timer(void) +JL_DLLEXPORT int jl_profile_start_timer(uint8_t all_tasks) { struct sigevent sigprof; @@ -573,8 +577,10 @@ JL_DLLEXPORT int jl_profile_start_timer(void) sigprof.sigev_value.sival_ptr = &timerprof; // Because SIGUSR1 is multipurpose, set `running` before so that we know that the first SIGUSR1 came from the timer running = 1; + profile_all_tasks = all_tasks; if 
(timer_create(CLOCK_REALTIME, &sigprof, &timerprof) == -1) { running = 0; + profile_all_tasks = 0; return -2; } @@ -585,6 +591,7 @@ JL_DLLEXPORT int jl_profile_start_timer(void) itsprof.it_value.tv_nsec = nsecprof % GIGA; if (timer_settime(timerprof, 0, &itsprof, NULL) == -1) { running = 0; + profile_all_tasks = 0; return -3; } return 0; @@ -700,12 +707,97 @@ void trigger_profile_peek(void) } } bt_size_cur = 0; // clear profile buffer - if (jl_profile_start_timer() < 0) + if (jl_profile_start_timer(0) < 0) jl_safe_printf("ERROR: Could not start profile timer\n"); else profile_autostop_time = jl_hrtime() + (profile_peek_duration * 1e9); } +void jl_profile_task_unix(size_t nthreads) +{ + if (jl_profile_is_buffer_full()) { + // Buffer full: Delete the timer + jl_profile_stop_timer(); + return; + } + + jl_task_t *t = jl_get_random_task(); + assert(t == NULL || jl_is_task(t)); + if (t == NULL) { + return; + } + int t_state = jl_atomic_load_relaxed(&t->_state); + if (t_state == JL_TASK_STATE_DONE) { + return; + } + + jl_rec_backtrace(t); + + // store threadid but add 1 as 0 is preserved to indicate end of block + bt_data_prof[bt_size_cur++].uintptr = 1; // dummy value for now... Is this ever used when outputting the profile? + + // store task id (never null) + bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)t; + + // store cpu cycle clock. XXX(Diogo, Nick): why are we recording the cycleclock here? + bt_data_prof[bt_size_cur++].uintptr = cycleclock(); + + // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block + bt_data_prof[bt_size_cur++].uintptr = 1; // dummy value for now... Is this ever used when outputting the profile? 
+ + // Mark the end of this block with two 0's + bt_data_prof[bt_size_cur++].uintptr = 0; + bt_data_prof[bt_size_cur++].uintptr = 0; +} + +// assumes holding `jl_lock_stackwalk` +void jl_profile_thread_unix(int tid, bt_context_t *signal_context) +{ + if (jl_profile_is_buffer_full()) { + // Buffer full: Delete the timer + jl_profile_stop_timer(); + return; + } + // notify thread to stop + if (!jl_thread_suspend_and_get_state(tid, 1, signal_context)) + return; + // unwinding can fail, so keep track of the current state + // and restore from the SEGV handler if anything happens. + jl_jmp_buf *old_buf = jl_get_safe_restore(); + jl_jmp_buf buf; + + jl_set_safe_restore(&buf); + if (jl_setjmp(buf, 0)) { + jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n"); + } else { + // Get backtrace data + bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, + bt_size_max - bt_size_cur - 1, signal_context, NULL); + } + jl_set_safe_restore(old_buf); + + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; + + // store threadid but add 1 as 0 is preserved to indicate end of block + bt_data_prof[bt_size_cur++].uintptr = ptls2->tid + 1; + + // store task id (never null) + bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls2->current_task); + + // store cpu cycle clock + bt_data_prof[bt_size_cur++].uintptr = cycleclock(); + + // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block + bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls2->sleep_check_state) + 1; + + // Mark the end of this block with two 0's + bt_data_prof[bt_size_cur++].uintptr = 0; + bt_data_prof[bt_size_cur++].uintptr = 0; + + // notify thread to resume + jl_thread_resume(tid); +} + static void *signal_listener(void *arg) { static jl_bt_element_t bt_data[JL_MAX_BT_SIZE + 1]; @@ -845,76 +937,44 @@ static void *signal_listener(void *arg) bt_size = 0; #if 
!defined(JL_DISABLE_LIBUNWIND) bt_context_t signal_context; - // sample each thread, round-robin style in reverse order - // (so that thread zero gets notified last) - if (critical || profile) { + if (critical) { int lockret = jl_lock_stackwalk(); - int *randperm; - if (profile) - randperm = profile_get_randperm(nthreads); - for (int idx = nthreads; idx-- > 0; ) { - // Stop the threads in the random or reverse round-robin order. - int i = profile ? randperm[idx] : idx; + // sample each thread, round-robin style in reverse order + // (so that thread zero gets notified last) + for (int i = nthreads; i-- > 0; ) { // notify thread to stop if (!jl_thread_suspend_and_get_state(i, 1, &signal_context)) continue; // do backtrace on thread contexts for critical signals // this part must be signal-handler safe - if (critical) { - bt_size += rec_backtrace_ctx(bt_data + bt_size, - JL_MAX_BT_SIZE / nthreads - 1, - &signal_context, NULL); - bt_data[bt_size++].uintptr = 0; - } - - // do backtrace for profiler - if (profile && running) { - if (jl_profile_is_buffer_full()) { - // Buffer full: Delete the timer - jl_profile_stop_timer(); - } - else { - // unwinding can fail, so keep track of the current state - // and restore from the SEGV handler if anything happens. 
- jl_jmp_buf *old_buf = jl_get_safe_restore(); - jl_jmp_buf buf; - - jl_set_safe_restore(&buf); - if (jl_setjmp(buf, 0)) { - jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n"); - } else { - // Get backtrace data - bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, - bt_size_max - bt_size_cur - 1, &signal_context, NULL); - } - jl_set_safe_restore(old_buf); - - jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[i]; - - // store threadid but add 1 as 0 is preserved to indicate end of block - bt_data_prof[bt_size_cur++].uintptr = ptls2->tid + 1; - - // store task id (never null) - bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls2->current_task); - - // store cpu cycle clock - bt_data_prof[bt_size_cur++].uintptr = cycleclock(); - - // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block - bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls2->sleep_check_state) + 1; - - // Mark the end of this block with two 0's - bt_data_prof[bt_size_cur++].uintptr = 0; - bt_data_prof[bt_size_cur++].uintptr = 0; - } - } - - // notify thread to resume + bt_size += rec_backtrace_ctx(bt_data + bt_size, + JL_MAX_BT_SIZE / nthreads - 1, + &signal_context, NULL); + bt_data[bt_size++].uintptr = 0; jl_thread_resume(i); } jl_unlock_stackwalk(lockret); } + else if (profile) { + if (profile_all_tasks) { + // Don't take the stackwalk lock here since it's already taken in `jl_rec_backtrace` + jl_profile_task_unix(nthreads); + } + else { + int lockret = jl_lock_stackwalk(); + int *randperm = profile_get_randperm(nthreads); + for (int idx = nthreads; idx-- > 0; ) { + // Stop the threads in the random order. 
+ int i = randperm[idx]; + // do backtrace for profiler + if (profile && running) { + jl_profile_thread_unix(i, &signal_context); + } + } + jl_unlock_stackwalk(lockret); + } + } #ifndef HAVE_MACH if (profile && running) { jl_check_profile_autostop(); diff --git a/src/signals-win.c b/src/signals-win.c index bcb3a1fd246f0e..ff0b1269050ffd 100644 --- a/src/signals-win.c +++ b/src/signals-win.c @@ -449,7 +449,7 @@ static DWORD WINAPI profile_bt( LPVOID lparam ) static volatile TIMECAPS timecaps; -JL_DLLEXPORT int jl_profile_start_timer(void) +JL_DLLEXPORT int jl_profile_start_timer(uint8_t all_tasks) { if (hBtThread == NULL) { @@ -483,6 +483,7 @@ JL_DLLEXPORT int jl_profile_start_timer(void) if (TIMERR_NOERROR != timeBeginPeriod(timecaps.wPeriodMin)) timecaps.wPeriodMin = 0; } + profile_all_tasks = all_tasks; running = 1; // set `running` finally return 0; } @@ -491,6 +492,7 @@ JL_DLLEXPORT void jl_profile_stop_timer(void) if (running && timecaps.wPeriodMin) timeEndPeriod(timecaps.wPeriodMin); running = 0; + profile_all_tasks = 0; } void jl_install_default_signal_handlers(void) diff --git a/src/stackwalk.c b/src/stackwalk.c index 8a12fb2a281431..664e2ff9b3a48b 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -868,21 +868,38 @@ _os_ptr_munge(uintptr_t ptr) #endif -extern bt_context_t *jl_to_bt_context(void *sigctx); +STATIC_INLINE int all_tasks_profile_running(void) +{ + return running && profile_all_tasks; +} -static void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT +void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT { - jl_task_t *ct = jl_current_task; - jl_ptls_t ptls = ct->ptls; - ptls->bt_size = 0; + jl_task_t *ct = NULL; + jl_ptls_t ptls = NULL; + int16_t tid = INT16_MAX; + if (!all_tasks_profile_running()) { + ct = jl_current_task; // assign the outer `ct`; redeclaring it here shadowed it and left it NULL + ptls = ct->ptls; // assign the outer `ptls`, which the thread-local recording paths below dereference + ptls->bt_size = 0; + tid = ptls->tid; + } if (t == ct) { - ptls->bt_size = rec_backtrace(ptls->bt_data, JL_MAX_BT_SIZE, 0); - return; + // Record into the profile buffer + if 
(all_tasks_profile_running()) { + bt_size_cur += rec_backtrace((jl_bt_element_t*)bt_data_prof + bt_size_cur, + bt_size_max - bt_size_cur - 1, 0); + return; + } + else { + ptls->bt_size = rec_backtrace(ptls->bt_data, JL_MAX_BT_SIZE, 0); + return; + } } bt_context_t *context = NULL; bt_context_t c; int16_t old = -1; - while (!jl_atomic_cmpswap(&t->tid, &old, ptls->tid) && old != ptls->tid) { + while (!jl_atomic_cmpswap(&t->tid, &old, tid) && old != tid) { int lockret = jl_lock_stackwalk(); // if this task is already running somewhere, we need to stop the thread it is running on and query its state if (!jl_thread_suspend_and_get_state(old, 0, &c)) { @@ -1109,11 +1126,21 @@ static void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT #pragma message("jl_rec_backtrace not defined for unknown task system") #endif } - if (context) - ptls->bt_size = rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE, context, t->gcstack); + if (context) { + // Record into the profile buffer + if (all_tasks_profile_running()) { + bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, + bt_size_max - bt_size_cur - 1, context, NULL); + } + // Record into the buffer owned by the thread's TLS + else { + ptls->bt_size = rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE, + context, t->gcstack); + } + } if (old == -1) jl_atomic_store_relaxed(&t->tid, old); - else if (old != ptls->tid) + else if (old != tid) jl_thread_resume(old); } diff --git a/src/task.c b/src/task.c index 86033a81ddf412..c9eb5f14efb999 100644 --- a/src/task.c +++ b/src/task.c @@ -1115,6 +1115,46 @@ JL_DLLEXPORT jl_task_t *jl_get_current_task(void) return pgcstack == NULL ? NULL : container_of(pgcstack, jl_task_t, gcstack); } +extern int gc_first_tid; + +// Select a task at random to profile. Racy: `live_tasks` can change at any time. 
+jl_task_t *jl_get_random_task(void) JL_NOTSAFEPOINT +{ + arraylist_t tasks; + arraylist_new(&tasks, 0); + size_t nthreads = jl_atomic_load_acquire(&jl_n_threads); + jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states); + for (size_t i = 0; i < nthreads; i++) { + // skip GC threads... + if (gc_first_tid <= i && i < gc_first_tid + jl_n_gcthreads) { + continue; + } + jl_ptls_t ptls2 = allstates[i]; + if (ptls2 == NULL) { + continue; + } + jl_task_t *t = ptls2->root_task; + if (t->stkbuf != NULL) { + arraylist_push(&tasks, t); + } + small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks; + size_t n = mtarraylist_length(live_tasks); + for (size_t j = 0; j < n; j++) { + jl_task_t *t2 = (jl_task_t*)mtarraylist_get(live_tasks, j); + if (t2->stkbuf != NULL) { + arraylist_push(&tasks, t2); + } + } + } + size_t n = tasks.len; + jl_task_t *t = NULL; // stays NULL when no candidate task was collected + if (n != 0) { + t = (jl_task_t*)tasks.items[jl_rand() % n]; + } + arraylist_free(&tasks); // single exit: the scratch list is released on every path, including the empty one + return t; +} + #ifdef JL_HAVE_ASYNCIFY JL_DLLEXPORT jl_ucontext_t *task_ctx_ptr(jl_task_t *t) diff --git a/src/threading.c b/src/threading.c index 8f350d41f64b18..245cfb6ce19e33 100644 --- a/src/threading.c +++ b/src/threading.c @@ -710,6 +710,8 @@ void jl_init_threading(void) jl_atomic_store_release(&jl_n_threads, jl_all_tls_states_size); jl_n_gcthreads = ngcthreads; gc_first_tid = nthreads + nthreadsi; + + uv_barrier_init(&thread_init_done, jl_all_tls_states_size); } uv_barrier_t thread_init_done; @@ -747,16 +749,12 @@ void jl_start_threads(void) mask[0] = 0; } - // create threads - uv_barrier_init(&thread_init_done, nthreads); - + // Create threads // GC/System threads need to be after the worker threads. 
int nmutator_threads = nthreads - ngcthreads; for (i = 1; i < nmutator_threads; ++i) { - jl_threadarg_t *t = (jl_threadarg_t *)malloc_s(sizeof(jl_threadarg_t)); // ownership will be passed to the thread - t->tid = i; - t->barrier = &thread_init_done; + jl_threadarg_t *t = jl_threadarg_new(i, &thread_init_done, NULL); uv_thread_create(&uvtid, jl_threadfun, t); if (exclusive) { mask[i] = 1; diff --git a/src/threading.h b/src/threading.h index cb265376997132..22b267b73ed381 100644 --- a/src/threading.h +++ b/src/threading.h @@ -22,10 +22,20 @@ typedef struct _jl_threadarg_t { void *arg; } jl_threadarg_t; +STATIC_INLINE jl_threadarg_t *jl_threadarg_new(int16_t tid, uv_barrier_t *barrier, void *arg) +{ + jl_threadarg_t *targ = (jl_threadarg_t*)malloc_s(sizeof(jl_threadarg_t)); + targ->tid = tid; + targ->barrier = barrier; + targ->arg = arg; + return targ; +} + // each thread must initialize its TLS jl_ptls_t jl_init_threadtls(int16_t tid) JL_NOTSAFEPOINT; // provided by a threading infrastructure +jl_ptls_t jl_threadfun_preamble(void *arg, uint8_t state); void jl_init_threadinginfra(void); void jl_parallel_gc_threadfun(void *arg); void jl_concurrent_gc_threadfun(void *arg); diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl index a0a1ecd1964ed9..63a1f3a4517d92 100644 --- a/stdlib/Profile/src/Profile.jl +++ b/stdlib/Profile/src/Profile.jl @@ -31,6 +31,25 @@ macro profile(ex) end end +export @profile_all + +""" + @profile_all + +`@profile_all <expression>` runs your expression while taking periodic backtraces of a sample of all live tasks (both running and not running). +These are appended to an internal buffer of backtraces. +""" +macro profile_all(ex) + return quote + try + start_timer(true) + $(esc(ex)) + finally + stop_timer() + end + end +end + # An internal function called to show the report after an information request (SIGINFO or SIGUSR1). function _peek_report() iob = IOBuffer() @@ -562,9 +581,9 @@ Julia, and examine the resulting `*.mem` files. 
clear_malloc_data() = ccall(:jl_clear_malloc_data, Cvoid, ()) # C wrappers -function start_timer() +function start_timer(all_tasks::Bool=false) check_init() # if the profile buffer hasn't been initialized, initialize with default size - status = ccall(:jl_profile_start_timer, Cint, ()) + status = ccall(:jl_profile_start_timer, Cint, (Bool,), all_tasks) if status < 0 error(error_codes[status]) end