diff --git a/src/gc-stacks.c b/src/gc-stacks.c index 1c1e2869607620..69e37db22c89fc 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -190,7 +190,7 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO return stk; } -void sweep_stack_pools(void) +void sweep_stack_pool_loop(void) { // Stack sweeping algorithm: // // deallocate stacks if we have too many sitting around unused @@ -203,8 +203,11 @@ void sweep_stack_pools(void) // bufsz = t->bufsz // if (stkbuf) // push(free_stacks[sz], stkbuf) - assert(gc_n_threads); - for (int i = 0; i < gc_n_threads; i++) { + jl_atomic_fetch_add(&gc_n_threads_sweeping, 1); + while (1) { + int i = jl_atomic_fetch_add_relaxed(&gc_ptls_sweep_idx, -1); + if (i < 0) + break; jl_ptls_t ptls2 = gc_all_tls_states[i]; // free half of stacks that remain unused since last sweep @@ -264,6 +267,7 @@ void sweep_stack_pools(void) } live_tasks->len -= ndel; } + jl_atomic_fetch_add(&gc_n_threads_sweeping, -1); } JL_DLLEXPORT jl_array_t *jl_live_tasks(void) diff --git a/src/gc-tls.h b/src/gc-tls.h index a9f711198e9142..6c9b814ca5661f 100644 --- a/src/gc-tls.h +++ b/src/gc-tls.h @@ -82,6 +82,7 @@ typedef struct { jl_gc_markqueue_t mark_queue; jl_gc_mark_cache_t gc_cache; _Atomic(size_t) gc_sweeps_requested; + _Atomic(uint8_t) gc_stack_sweep_requested; arraylist_t sweep_objs; } jl_gc_tls_states_t; diff --git a/src/gc.c b/src/gc.c index dad57687325450..9b174df9e15b68 100644 --- a/src/gc.c +++ b/src/gc.c @@ -26,6 +26,10 @@ _Atomic(int) gc_n_threads_sweeping; _Atomic(jl_gc_padded_page_stack_t *) gc_allocd_scratch; // `tid` of mutator thread that triggered GC _Atomic(int) gc_master_tid; +// counter for sharing work when sweeping stacks +_Atomic(int) gc_ptls_sweep_idx; +// counter for round robin of giving back stack pages to the OS +_Atomic(int) gc_stack_free_idx; // `tid` of first GC thread int gc_first_tid; // Mutex/cond used to synchronize wakeup of GC threads on parallel marking @@ -1525,6 +1529,44 @@ static void gc_sweep_other(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT gc_num.total_sweep_free_mallocd_memory_time += t_free_mallocd_memory_end - t_free_mallocd_memory_start; } +// wake up all threads to sweep the stacks +void gc_sweep_wake_all_stacks(jl_ptls_t ptls) JL_NOTSAFEPOINT +{ + uv_mutex_lock(&gc_threads_lock); + int first = gc_first_parallel_collector_thread_id(); + int last = gc_last_parallel_collector_thread_id(); + for (int i = first; i <= last; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + gc_check_ptls_of_parallel_collector_thread(ptls2); + jl_atomic_fetch_add(&ptls2->gc_tls.gc_stack_sweep_requested, 1); + } + uv_cond_broadcast(&gc_threads_cond); + uv_mutex_unlock(&gc_threads_lock); + return; +} + +void gc_sweep_wait_for_all_stacks(void) JL_NOTSAFEPOINT +{ + while ((jl_atomic_load_acquire(&gc_ptls_sweep_idx)>= 0 ) || jl_atomic_load_acquire(&gc_n_threads_sweeping) != 0) { + jl_cpu_pause(); + } +} + +void sweep_stack_pools(jl_ptls_t ptls) JL_NOTSAFEPOINT +{ + // initialize ptls index for parallel sweeping of stack pools + assert(gc_n_threads); + int stack_free_idx = jl_atomic_load_relaxed(&gc_stack_free_idx); + if (stack_free_idx + 1 == gc_n_threads) + jl_atomic_store_relaxed(&gc_stack_free_idx, 0); + else + jl_atomic_store_relaxed(&gc_stack_free_idx, stack_free_idx + 1); + jl_atomic_store_release(&gc_ptls_sweep_idx, gc_n_threads - 1); // idx == gc_n_threads = release stacks to the OS so it's serial + gc_sweep_wake_all_stacks(ptls); + sweep_stack_pool_loop(); + gc_sweep_wait_for_all_stacks(); +} + static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_NOTSAFEPOINT { assert(pg->fl_begin_offset != UINT16_MAX); @@ -3604,7 +3646,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) #endif current_sweep_full = sweep_full; sweep_weak_refs(); - sweep_stack_pools(); + sweep_stack_pools(ptls); gc_sweep_foreign_objs(); gc_sweep_other(ptls, sweep_full); gc_scrub(); diff --git a/src/gc.h b/src/gc.h index b06deec9d72389..e67fbeebc3d1f8 100644 --- a/src/gc.h +++ b/src/gc.h @@ -565,6 +565,8 @@ extern uv_cond_t gc_threads_cond; extern uv_sem_t gc_sweep_assists_needed; extern _Atomic(int) gc_n_threads_marking; extern _Atomic(int) gc_n_threads_sweeping; +extern _Atomic(int) gc_ptls_sweep_idx; +extern _Atomic(int) gc_stack_free_idx; extern uv_barrier_t thread_init_done; void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq); void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t *fl_parent, jl_value_t **fl_begin, jl_value_t **fl_end) JL_NOTSAFEPOINT; @@ -574,7 +576,7 @@ void gc_mark_loop_serial(jl_ptls_t ptls); void gc_mark_loop_parallel(jl_ptls_t ptls, int master); void gc_sweep_pool_parallel(jl_ptls_t ptls); void gc_free_pages(void); -void sweep_stack_pools(void); +void sweep_stack_pool_loop(void); void jl_gc_debug_init(void); // GC pages