From 04914eb9c90c7d9651788cd1ed01203abb0332db Mon Sep 17 00:00:00 2001 From: Diogo Netto <61364108+d-netto@users.noreply.github.com> Date: Tue, 9 Jul 2024 17:57:24 -0300 Subject: [PATCH 1/5] drop jl_gc_pool_alloc in favor of instrumented version (#55085) And do some re-naming. --- src/gc.c | 23 ++++------------------- src/jl_exported_funcs.inc | 2 -- src/llvm-pass-helpers.cpp | 4 ++-- test/llvmpasses/alloc-opt-gcframe.ll | 10 +++++----- test/llvmpasses/final-lower-gc.ll | 4 ++-- test/llvmpasses/julia-licm-fail.ll | 2 +- test/llvmpasses/julia-licm-missed.ll | 2 +- test/llvmpasses/julia-licm.ll | 2 +- 8 files changed, 16 insertions(+), 33 deletions(-) diff --git a/src/gc.c b/src/gc.c index ec43e6b6db600..3b83ce13c4732 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1027,15 +1027,9 @@ STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) return jl_valueof(&v->header); } -// Deprecated version, supported for legacy code. -JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz) -{ - jl_value_t *val = jl_gc_big_alloc_inner(ptls, sz); - maybe_record_alloc_to_profile(val, sz, jl_gc_unknown_type_tag); - return val; -} + // Instrumented version of jl_gc_big_alloc_inner, called into by LLVM-generated code. -JL_DLLEXPORT jl_value_t *jl_gc_big_alloc_instrumented(jl_ptls_t ptls, size_t sz, jl_value_t *type) +JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz, jl_value_t *type) { jl_value_t *val = jl_gc_big_alloc_inner(ptls, sz); maybe_record_alloc_to_profile(val, sz, (jl_datatype_t*)type); @@ -1299,7 +1293,7 @@ STATIC_INLINE jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset jl_gc_pool_t *p = (jl_gc_pool_t*)((char*)ptls + pool_offset); assert(jl_atomic_load_relaxed(&ptls->gc_state) == 0); #ifdef MEMDEBUG - return jl_gc_big_alloc(ptls, osize); + return jl_gc_big_alloc(ptls, osize, NULL); #endif maybe_collect(ptls); jl_atomic_store_relaxed(&ptls->gc_num.allocd, @@ -1347,17 +1341,8 @@ STATIC_INLINE jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset return jl_valueof(v); } -// Deprecated version, supported for legacy code. -JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset, - int osize) -{ - jl_value_t *val = jl_gc_pool_alloc_inner(ptls, pool_offset, osize); - maybe_record_alloc_to_profile(val, osize, jl_gc_unknown_type_tag); - return val; -} // Instrumented version of jl_gc_pool_alloc_inner, called into by LLVM-generated code. -JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc_instrumented(jl_ptls_t ptls, int pool_offset, - int osize, jl_value_t* type) +JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset, int osize, jl_value_t* type) { jl_value_t *val = jl_gc_pool_alloc_inner(ptls, pool_offset, osize); maybe_record_alloc_to_profile(val, osize, (jl_datatype_t*)type); diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc index a98158f3d3e5e..ad49bfd7b3b9f 100644 --- a/src/jl_exported_funcs.inc +++ b/src/jl_exported_funcs.inc @@ -154,7 +154,6 @@ XX(jl_gc_allocobj) \ XX(jl_gc_alloc_typed) \ XX(jl_gc_big_alloc) \ - XX(jl_gc_big_alloc_instrumented) \ XX(jl_gc_collect) \ XX(jl_gc_conservative_gc_support_enabled) \ XX(jl_gc_counted_calloc) \ @@ -183,7 +182,6 @@ XX(jl_gc_new_weakref_th) \ XX(jl_gc_num) \ XX(jl_gc_pool_alloc) \ - XX(jl_gc_pool_alloc_instrumented) \ XX(jl_gc_queue_multiroot) \ XX(jl_gc_queue_root) \ XX(jl_gc_safepoint) \ diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp index 39d37cee3928b..005b17186f10f 100644 --- a/src/llvm-pass-helpers.cpp +++ b/src/llvm-pass-helpers.cpp @@ -238,8 +238,8 @@ namespace jl_intrinsics { } namespace jl_well_known { - static const char *GC_BIG_ALLOC_NAME = XSTR(jl_gc_big_alloc_instrumented); - static const char *GC_POOL_ALLOC_NAME = XSTR(jl_gc_pool_alloc_instrumented); + static const char *GC_BIG_ALLOC_NAME = XSTR(jl_gc_big_alloc); + static const char *GC_POOL_ALLOC_NAME = XSTR(jl_gc_pool_alloc); static const char *GC_QUEUE_ROOT_NAME = XSTR(jl_gc_queue_root); static const char *GC_ALLOC_TYPED_NAME = XSTR(jl_gc_alloc_typed); diff --git a/test/llvmpasses/alloc-opt-gcframe.ll b/test/llvmpasses/alloc-opt-gcframe.ll index f600399ac2a7a..91d687cac1eb3 100644 --- a/test/llvmpasses/alloc-opt-gcframe.ll +++ b/test/llvmpasses/alloc-opt-gcframe.ll @@ -24,7 +24,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; OPAQUE: %current_task = getelementptr inbounds ptr, ptr %gcstack, i64 -12 ; OPAQUE: [[ptls_field:%.*]] = getelementptr inbounds ptr, ptr %current_task, i64 16 ; OPAQUE-NEXT: [[ptls_load:%.*]] = load ptr, ptr [[ptls_field]], align 8, !tbaa !0 -; OPAQUE-NEXT: %v = call noalias nonnull dereferenceable({{[0-9]+}}) ptr addrspace(10) @ijl_gc_pool_alloc_instrumented(ptr [[ptls_load]], i32 [[SIZE_T:[0-9]+]], i32 16, i64 {{.*}} @tag {{.*}}) +; OPAQUE-NEXT: %v = call noalias nonnull dereferenceable({{[0-9]+}}) ptr addrspace(10) @ijl_gc_pool_alloc(ptr [[ptls_load]], i32 [[SIZE_T:[0-9]+]], i32 16, i64 {{.*}} @tag {{.*}}) ; OPAQUE: store atomic ptr addrspace(10) @tag, ptr addrspace(10) {{.*}} unordered, align 8, !tbaa !4 define {} addrspace(10)* @return_obj() { @@ -270,11 +270,11 @@ L3: } ; CHECK-LABEL: }{{$}} -; TYPED: declare noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc_instrumented(i8*, -; TYPED: declare noalias nonnull {} addrspace(10)* @ijl_gc_big_alloc_instrumented(i8*, +; TYPED: declare noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc(i8*, +; TYPED: declare noalias nonnull {} addrspace(10)* @ijl_gc_big_alloc(i8*, -; OPAQUE: declare noalias nonnull ptr addrspace(10) @ijl_gc_pool_alloc_instrumented(ptr, -; OPAQUE: declare noalias nonnull ptr addrspace(10) @ijl_gc_big_alloc_instrumented(ptr, +; OPAQUE: declare noalias nonnull ptr addrspace(10) @ijl_gc_pool_alloc(ptr, +; OPAQUE: declare noalias nonnull ptr addrspace(10) @ijl_gc_big_alloc(ptr, declare void @external_function() declare {}*** @julia.get_pgcstack() declare noalias nonnull {} addrspace(10)* @julia.gc_alloc_obj({}**, i64, {} addrspace(10)*) diff --git a/test/llvmpasses/final-lower-gc.ll b/test/llvmpasses/final-lower-gc.ll index 64d2c06e534b2..493f1d501cb8e 100644 --- a/test/llvmpasses/final-lower-gc.ll +++ b/test/llvmpasses/final-lower-gc.ll @@ -80,8 +80,8 @@ top: %pgcstack = call {}*** @julia.get_pgcstack() %ptls = call {}*** @julia.ptls_states() %ptls_i8 = bitcast {}*** %ptls to i8* -; TYPED: %v = call noalias nonnull dereferenceable({{[0-9]+}}) {} addrspace(10)* @ijl_gc_pool_alloc_instrumented -; OPAQUE: %v = call noalias nonnull dereferenceable({{[0-9]+}}) ptr addrspace(10) @ijl_gc_pool_alloc_instrumented +; TYPED: %v = call noalias nonnull dereferenceable({{[0-9]+}}) {} addrspace(10)* @ijl_gc_pool_alloc +; OPAQUE: %v = call noalias nonnull dereferenceable({{[0-9]+}}) ptr addrspace(10) @ijl_gc_pool_alloc %v = call {} addrspace(10)* @julia.gc_alloc_bytes(i8* %ptls_i8, i64 8, i64 12341234) %0 = bitcast {} addrspace(10)* %v to {} addrspace(10)* addrspace(10)* %1 = getelementptr {} addrspace(10)*, {} addrspace(10)* addrspace(10)* %0, i64 -1 diff --git a/test/llvmpasses/julia-licm-fail.ll b/test/llvmpasses/julia-licm-fail.ll index 464a96f1413d9..78640a28aeb01 100644 --- a/test/llvmpasses/julia-licm-fail.ll +++ b/test/llvmpasses/julia-licm-fail.ll @@ -82,7 +82,7 @@ declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #2 declare void @ijl_gc_queue_root({} addrspace(10)*) #3 ; Function Attrs: allocsize(1) -declare noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc(i8*, i32, i32) #1 +declare noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc(i8*, i32, i32, i8*) #1 ; Function Attrs: allocsize(1) declare noalias nonnull {} addrspace(10)* @ijl_gc_big_alloc(i8*, i64) #1 diff --git a/test/llvmpasses/julia-licm-missed.ll b/test/llvmpasses/julia-licm-missed.ll index 941b2d072a1cc..990dc02442360 100644 --- a/test/llvmpasses/julia-licm-missed.ll +++ b/test/llvmpasses/julia-licm-missed.ll @@ -96,7 +96,7 @@ declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #2 declare void @ijl_gc_queue_root({} addrspace(10)*) #3 ; Function Attrs: allocsize(1) -declare noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc(i8*, i32, i32) #1 +declare noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc(i8*, i32, i32, i8*) #1 ; Function Attrs: allocsize(1) declare noalias nonnull {} addrspace(10)* @ijl_gc_big_alloc(i8*, i64) #1 diff --git a/test/llvmpasses/julia-licm.ll b/test/llvmpasses/julia-licm.ll index 8bedc5db75d96..886e0c9716f61 100644 --- a/test/llvmpasses/julia-licm.ll +++ b/test/llvmpasses/julia-licm.ll @@ -164,7 +164,7 @@ declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #2 declare void @ijl_gc_queue_root({} addrspace(10)*) #3 ; Function Attrs: allocsize(1) -declare noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc(i8*, i32, i32) #1 +declare noalias nonnull {} addrspace(10)* @ijl_gc_pool_alloc(i8*, i32, i32, i8*) #1 ; Function Attrs: allocsize(1) declare noalias nonnull {} addrspace(10)* @ijl_gc_big_alloc(i8*, i64) #1 From 48db26900f8376f8c26c59ad842d17c608fd0372 Mon Sep 17 00:00:00 2001 From: Diogo Netto <61364108+d-netto@users.noreply.github.com> Date: Wed, 10 Jul 2024 16:54:31 -0300 Subject: [PATCH 2/5] create GC TLS (#55086) Encapsulates all relevant GC thread-local-state into a separate structure. Motivation is that MMTk will have its own version of GC thread-local-state, so doesn't need all of the Julia GC TLS. In the future, folks who would be using MMTk would be setting a pre-processor flag which would lead to either the stock Julia GC TLS or MMTk's GC TLS to be included in `julia_threads.h`. I.e., we would have something like: ```C jl_gc_mmtk_tls_states mmtk_gc_tls; jl_gc_tls_states gc_tls; ``` --- src/Makefile | 2 +- src/array.c | 2 +- src/gc-debug.c | 73 +++------ src/gc-stacks.c | 16 +- src/gc-tls.h | 103 +++++++++++++ src/gc.c | 344 ++++++++++++++++++++++--------------------- src/julia_internal.h | 2 +- src/julia_threads.h | 83 +---------- src/partr.c | 4 +- src/stackwalk.c | 2 +- 10 files changed, 314 insertions(+), 317 deletions(-) create mode 100644 src/gc-tls.h diff --git a/src/Makefile b/src/Makefile index 0127e513ce144..fbbd7461e3c9c 100644 --- a/src/Makefile +++ b/src/Makefile @@ -99,7 +99,7 @@ ifeq ($(USE_SYSTEM_LIBUV),0) UV_HEADERS += uv.h UV_HEADERS += uv/*.h endif -PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) +PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h gc-tls.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) ifeq ($(OS),WINNT) PUBLIC_HEADERS += $(addprefix $(SRCDIR)/,win32_ucontext.h) endif diff --git a/src/array.c b/src/array.c index c7f71c9695e87..bfa62e0f75273 100644 --- a/src/array.c +++ b/src/array.c @@ -499,7 +499,7 @@ JL_DLLEXPORT jl_value_t *jl_alloc_string(size_t len) const size_t allocsz = sz + sizeof(jl_taggedvalue_t); if (sz <= GC_MAX_SZCLASS) { int pool_id = jl_gc_szclass_align8(allocsz); - jl_gc_pool_t *p = &ptls->heap.norm_pools[pool_id]; + jl_gc_pool_t *p = &ptls->gc_tls.heap.norm_pools[pool_id]; int osize = jl_gc_sizeclasses[pool_id]; // We call `jl_gc_pool_alloc_noinline` instead of `jl_gc_pool_alloc` to avoid double-counting in // the Allocations Profiler. (See https://github.com/JuliaLang/julia/pull/43868 for more details.) diff --git a/src/gc-debug.c b/src/gc-debug.c index 6faee6b1de97f..5b90ce7cd955b 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -97,7 +97,7 @@ static arraylist_t bits_save[4]; static void gc_clear_mark_page(jl_gc_pagemeta_t *pg, int bits) { jl_ptls_t ptls2 = gc_all_tls_states[pg->thread_n]; - jl_gc_pool_t *pool = &ptls2->heap.norm_pools[pg->pool_n]; + jl_gc_pool_t *pool = &ptls2->gc_tls.heap.norm_pools[pg->pool_n]; jl_taggedvalue_t *pv = (jl_taggedvalue_t*)(pg->data + GC_PAGE_OFFSET); char *lim = (char*)pv + GC_PAGE_SZ - GC_PAGE_OFFSET - pool->osize; while ((char*)pv <= lim) { @@ -112,7 +112,7 @@ static void gc_clear_mark_outer(int bits) { for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom); + jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->gc_tls.page_metadata_allocd.bottom); while (pg != NULL) { gc_clear_mark_page(pg, bits); pg = pg->next; @@ -132,7 +132,7 @@ static void clear_mark(int bits) } bigval_t *v; for (int i = 0; i < gc_n_threads; i++) { - v = gc_all_tls_states[i]->heap.big_objects; + v = gc_all_tls_states[i]->gc_tls.heap.big_objects; while (v != NULL) { void *gcv = &v->header; if (!gc_verifying) @@ -170,7 +170,7 @@ static void gc_verify_track(jl_ptls_t ptls) return; do { jl_gc_markqueue_t mq; - jl_gc_markqueue_t *mq2 = &ptls->mark_queue; + jl_gc_markqueue_t *mq2 = &ptls->gc_tls.mark_queue; ws_queue_t *cq = &mq.chunk_queue; ws_queue_t *q = &mq.ptr_queue; jl_atomic_store_relaxed(&cq->top, 0); @@ -230,7 +230,7 @@ void gc_verify(jl_ptls_t ptls) return; } jl_gc_markqueue_t mq; - jl_gc_markqueue_t *mq2 = &ptls->mark_queue; + jl_gc_markqueue_t *mq2 = &ptls->gc_tls.mark_queue; ws_queue_t *cq = &mq.chunk_queue; ws_queue_t *q = &mq.ptr_queue; jl_atomic_store_relaxed(&cq->top, 0); @@ -289,7 +289,7 @@ static void gc_verify_tags_page(jl_gc_pagemeta_t *pg) int p_n = pg->pool_n; int t_n = pg->thread_n; jl_ptls_t ptls2 = gc_all_tls_states[t_n]; - jl_gc_pool_t *p = &ptls2->heap.norm_pools[p_n]; + jl_gc_pool_t *p = &ptls2->gc_tls.heap.norm_pools[p_n]; int osize = pg->osize; char *data = pg->data; char *page_begin = data + GC_PAGE_OFFSET; @@ -349,42 +349,13 @@ static void gc_verify_tags_page(jl_gc_pagemeta_t *pg) static void gc_verify_tags_pagetable0(pagetable0_t *pagetable0) { - for (int pg_i = 0; pg_i < REGION0_PG_COUNT / 32; pg_i++) { - uint32_t line = pagetable0->allocmap[pg_i]; - if (line) { - for (int j = 0; j < 32; j++) { - if ((line >> j) & 1) { - gc_verify_tags_page(pagetable0->meta[pg_i * 32 + j]); - } - } - } - } -} - -static void gc_verify_tags_pagetable1(pagetable1_t *pagetable1) -{ - for (int pg_i = 0; pg_i < REGION1_PG_COUNT / 32; pg_i++) { - uint32_t line = pagetable1->allocmap0[pg_i]; - if (line) { - for (int j = 0; j < 32; j++) { - if ((line >> j) & 1) { - gc_verify_tags_pagetable0(pagetable1->meta0[pg_i * 32 + j]); - } - } - } - } -} - -static void gc_verify_tags_pagetable(void) -{ - for (int pg_i = 0; pg_i < (REGION2_PG_COUNT + 31) / 32; pg_i++) { - uint32_t line = memory_map.allocmap1[pg_i]; - if (line) { - for (int j = 0; j < 32; j++) { - if ((line >> j) & 1) { - gc_verify_tags_pagetable1(memory_map.meta1[pg_i * 32 + j]); - } - } + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + jl_gc_page_stack_t *pgstk = &ptls2->gc_tls.page_metadata_allocd; + jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&pgstk->bottom); + while (pg != NULL) { + gc_verify_tags_page(pg); + pg = pg->next; } } } @@ -396,7 +367,7 @@ void gc_verify_tags(void) jl_ptls_t ptls2 = gc_all_tls_states[t_i]; for (int i = 0; i < JL_GC_N_POOLS; i++) { // for all pools, iterate its freelist - jl_gc_pool_t *p = &ptls2->heap.norm_pools[i]; + jl_gc_pool_t *p = &ptls2->gc_tls.heap.norm_pools[i]; jl_taggedvalue_t *next = p->freelist; jl_taggedvalue_t *last = NULL; char *allocating = gc_page_data(next); @@ -837,8 +808,8 @@ void gc_time_mark_pause(int64_t t0, int64_t scanned_bytes, int64_t remset_nptr = 0; for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - last_remset_len += ptls2->heap.last_remset->len; - remset_nptr = ptls2->heap.remset_nptr; + last_remset_len += ptls2->gc_tls.heap.last_remset->len; + remset_nptr = ptls2->gc_tls.heap.remset_nptr; } jl_safe_printf("GC mark pause %.2f ms | " "scanned %" PRId64 " kB = %" PRId64 " + %" PRId64 " | " @@ -969,13 +940,13 @@ void gc_stats_all_pool(void) for (int i = 0; i < JL_GC_N_POOLS; i++) { for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - size_t b = pool_stats(&ptls2->heap.norm_pools[i], &w, &np, &nol); + size_t b = pool_stats(&ptls2->gc_tls.heap.norm_pools[i], &w, &np, &nol); nb += b; - no += (b / ptls2->heap.norm_pools[i].osize); + no += (b / ptls2->gc_tls.heap.norm_pools[i].osize); tw += w; tp += np; nold += nol; - noldbytes += nol * ptls2->heap.norm_pools[i].osize; + noldbytes += nol * ptls2->gc_tls.heap.norm_pools[i].osize; } } jl_safe_printf("%lld objects (%lld%% old), %lld kB (%lld%% old) total allocated, " @@ -994,7 +965,7 @@ void gc_stats_big_obj(void) size_t nused=0, nbytes=0, nused_old=0, nbytes_old=0; for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - bigval_t *v = ptls2->heap.big_objects; + bigval_t *v = ptls2->gc_tls.heap.big_objects; while (v != NULL) { if (gc_marked(v->bits.gc)) { nused++; @@ -1011,7 +982,7 @@ void gc_stats_big_obj(void) v = v->next; } - mallocarray_t *ma = ptls2->heap.mallocarrays; + mallocarray_t *ma = ptls2->gc_tls.heap.mallocarrays; while (ma != NULL) { if (gc_marked(jl_astaggedvalue(ma->a)->bits.gc)) { nused++; @@ -1057,7 +1028,7 @@ static void gc_count_pool_pagetable(void) { for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom); + jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->gc_tls.page_metadata_allocd.bottom); while (pg != NULL) { if (gc_alloc_map_is_set(pg->data)) { gc_count_pool_page(pg); diff --git a/src/gc-stacks.c b/src/gc-stacks.c index 693cb8d0eadf0..1c1e286960762 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -119,7 +119,7 @@ static void _jl_free_stack(jl_ptls_t ptls, void *stkbuf, size_t bufsz) if (bufsz <= pool_sizes[JL_N_STACK_POOLS - 1]) { unsigned pool_id = select_pool(bufsz); if (pool_sizes[pool_id] == bufsz) { - small_arraylist_push(&ptls->heap.free_stacks[pool_id], stkbuf); + small_arraylist_push(&ptls->gc_tls.heap.free_stacks[pool_id], stkbuf); return; } } @@ -148,7 +148,7 @@ void jl_release_task_stack(jl_ptls_t ptls, jl_task_t *task) #ifdef _COMPILER_ASAN_ENABLED_ __asan_unpoison_stack_memory((uintptr_t)stkbuf, bufsz); #endif - small_arraylist_push(&ptls->heap.free_stacks[pool_id], stkbuf); + small_arraylist_push(&ptls->gc_tls.heap.free_stacks[pool_id], stkbuf); } } } @@ -163,7 +163,7 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO if (ssize <= pool_sizes[JL_N_STACK_POOLS - 1]) { unsigned pool_id = select_pool(ssize); ssize = pool_sizes[pool_id]; - small_arraylist_t *pool = &ptls->heap.free_stacks[pool_id]; + small_arraylist_t *pool = &ptls->gc_tls.heap.free_stacks[pool_id]; if (pool->len > 0) { stk = small_arraylist_pop(pool); } @@ -184,7 +184,7 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO } *bufsz = ssize; if (owner) { - small_arraylist_t *live_tasks = &ptls->heap.live_tasks; + small_arraylist_t *live_tasks = &ptls->gc_tls.heap.live_tasks; mtarraylist_push(live_tasks, owner); } return stk; @@ -209,7 +209,7 @@ void sweep_stack_pools(void) // free half of stacks that remain unused since last sweep for (int p = 0; p < JL_N_STACK_POOLS; p++) { - small_arraylist_t *al = &ptls2->heap.free_stacks[p]; + small_arraylist_t *al = &ptls2->gc_tls.heap.free_stacks[p]; size_t n_to_free; if (al->len > MIN_STACK_MAPPINGS_PER_POOL) { n_to_free = al->len / 2; @@ -225,7 +225,7 @@ void sweep_stack_pools(void) } } - small_arraylist_t *live_tasks = &ptls2->heap.live_tasks; + small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks; size_t n = 0; size_t ndel = 0; size_t l = live_tasks->len; @@ -280,7 +280,7 @@ JL_DLLEXPORT jl_array_t *jl_live_tasks(void) jl_ptls_t ptls2 = allstates[i]; if (ptls2 == NULL) continue; - small_arraylist_t *live_tasks = &ptls2->heap.live_tasks; + small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks; size_t n = mtarraylist_length(live_tasks); l += n + (ptls2->root_task->stkbuf != NULL); } @@ -303,7 +303,7 @@ JL_DLLEXPORT jl_array_t *jl_live_tasks(void) goto restart; ((void**)jl_array_data(a))[j++] = t; } - small_arraylist_t *live_tasks = &ptls2->heap.live_tasks; + small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks; size_t n = mtarraylist_length(live_tasks); for (size_t i = 0; i < n; i++) { jl_task_t *t = (jl_task_t*)mtarraylist_get(live_tasks, i); diff --git a/src/gc-tls.h b/src/gc-tls.h new file mode 100644 index 0000000000000..3f8662b467e30 --- /dev/null +++ b/src/gc-tls.h @@ -0,0 +1,103 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +// Meant to be included in "julia_threads.h" +#ifndef JL_GC_TLS_H +#define JL_GC_TLS_H + +#include "julia_atomics.h" +#include "work-stealing-queue.h" +// GC threading ------------------------------------------------------------------ + +#include "arraylist.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + struct _jl_taggedvalue_t *freelist; // root of list of free objects + struct _jl_taggedvalue_t *newpages; // root of list of chunks of free objects + uint16_t osize; // size of objects in this pool +} jl_gc_pool_t; + +typedef struct { + // variable for tracking weak references + small_arraylist_t weak_refs; + // live tasks started on this thread + // that are holding onto a stack from the pool + small_arraylist_t live_tasks; + + // variables for tracking malloc'd arrays + struct _mallocarray_t *mallocarrays; + struct _mallocarray_t *mafreelist; + + // variables for tracking big objects + struct _bigval_t *big_objects; + + // lower bound of the number of pointers inside remembered values + int remset_nptr; + // remembered set + arraylist_t remset; + + // variables for allocating objects from pools +#define JL_GC_N_MAX_POOLS 51 // conservative. must be kept in sync with `src/julia_internal.h` + jl_gc_pool_t norm_pools[JL_GC_N_MAX_POOLS]; + +#define JL_N_STACK_POOLS 16 + small_arraylist_t free_stacks[JL_N_STACK_POOLS]; +} jl_thread_heap_t; + +typedef struct { + _Atomic(int64_t) allocd; + _Atomic(int64_t) pool_live_bytes; + _Atomic(int64_t) freed; + _Atomic(uint64_t) malloc; + _Atomic(uint64_t) realloc; + _Atomic(uint64_t) poolalloc; + _Atomic(uint64_t) bigalloc; + _Atomic(uint64_t) freecall; +} jl_thread_gc_num_t; + +typedef struct { + ws_queue_t chunk_queue; + ws_queue_t ptr_queue; + arraylist_t reclaim_set; +} jl_gc_markqueue_t; + +typedef struct { + // thread local increment of `perm_scanned_bytes` + size_t perm_scanned_bytes; + // thread local increment of `scanned_bytes` + size_t scanned_bytes; + // Number of queued big objects (<= 1024) + size_t nbig_obj; + // Array of queued big objects to be moved between the young list + // and the old list. + // A set low bit means that the object should be moved from the old list + // to the young list (`mark_reset_age`). + // Objects can only be put into this list when the mark bit is flipped to + // `1` (atomically). Combining with the sync after marking, + // this makes sure that a single objects can only appear once in + // the lists (the mark bit cannot be flipped to `0` without sweeping) + void *big_obj[1024]; +} jl_gc_mark_cache_t; + +typedef struct { + _Atomic(struct _jl_gc_pagemeta_t *) bottom; +} jl_gc_page_stack_t; + +typedef struct { + jl_thread_heap_t heap; + jl_gc_page_stack_t page_metadata_allocd; + jl_thread_gc_num_t gc_num; + jl_gc_markqueue_t mark_queue; + jl_gc_mark_cache_t gc_cache; + _Atomic(size_t) gc_sweeps_requested; + arraylist_t sweep_objs; +} jl_gc_tls_states_t; + +#ifdef __cplusplus +} +#endif + +#endif // JL_GC_TLS_H diff --git a/src/gc.c b/src/gc.c index 3b83ce13c4732..e96fc9637d6f9 100644 --- a/src/gc.c +++ b/src/gc.c @@ -21,7 +21,7 @@ int jl_n_sweepthreads; _Atomic(int) gc_n_threads_marking; // Number of threads sweeping _Atomic(int) gc_n_threads_sweeping; -// Temporary for the `ptls->page_metadata_allocd` used during parallel sweeping (padded to avoid false sharing) +// Temporary for the `ptls->gc_tls.page_metadata_allocd` used during parallel sweeping (padded to avoid false sharing) _Atomic(jl_gc_padded_page_stack_t *) gc_allocd_scratch; // `tid` of mutator thread that triggered GC _Atomic(int) gc_master_tid; @@ -670,7 +670,7 @@ static void gc_sweep_foreign_objs(void) for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; if (ptls2 != NULL) - gc_sweep_foreign_objs_in_list(&ptls2->sweep_objs); + gc_sweep_foreign_objs_in_list(&ptls2->gc_tls.sweep_objs); } } @@ -767,7 +767,7 @@ static void gc_sync_cache_nolock(jl_ptls_t ptls, jl_gc_mark_cache_t *gc_cache) J bigval_t *hdr = (bigval_t*)gc_ptr_clear_tag(ptr, 1); gc_big_object_unlink(hdr); if (gc_ptr_tag(ptr, 1)) { - gc_big_object_link(hdr, &ptls->heap.big_objects); + gc_big_object_link(hdr, &ptls->gc_tls.heap.big_objects); } else { // Move hdr from `big_objects` list to `big_objects_marked list` @@ -784,7 +784,7 @@ static void gc_sync_cache_nolock(jl_ptls_t ptls, jl_gc_mark_cache_t *gc_cache) J static void gc_sync_cache(jl_ptls_t ptls) JL_NOTSAFEPOINT { uv_mutex_lock(&gc_cache_lock); - gc_sync_cache_nolock(ptls, &ptls->gc_cache); + gc_sync_cache_nolock(ptls, &ptls->gc_tls.gc_cache); uv_mutex_unlock(&gc_cache_lock); } @@ -795,22 +795,22 @@ static void gc_sync_all_caches_nolock(jl_ptls_t ptls) for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; if (ptls2 != NULL) - gc_sync_cache_nolock(ptls, &ptls2->gc_cache); + gc_sync_cache_nolock(ptls, &ptls2->gc_tls.gc_cache); } } STATIC_INLINE void gc_queue_big_marked(jl_ptls_t ptls, bigval_t *hdr, int toyoung) JL_NOTSAFEPOINT { - const int nentry = sizeof(ptls->gc_cache.big_obj) / sizeof(void*); - size_t nobj = ptls->gc_cache.nbig_obj; + const int nentry = sizeof(ptls->gc_tls.gc_cache.big_obj) / sizeof(void*); + size_t nobj = ptls->gc_tls.gc_cache.nbig_obj; if (__unlikely(nobj >= nentry)) { gc_sync_cache(ptls); nobj = 0; } uintptr_t v = (uintptr_t)hdr; - ptls->gc_cache.big_obj[nobj] = (void*)(toyoung ? (v | 1) : v); - ptls->gc_cache.nbig_obj = nobj + 1; + ptls->gc_tls.gc_cache.big_obj[nobj] = (void*)(toyoung ? (v | 1) : v); + ptls->gc_tls.gc_cache.nbig_obj = nobj + 1; } // Atomically set the mark bit for object and return whether it was previously unmarked @@ -848,11 +848,11 @@ STATIC_INLINE void gc_setmark_big(jl_ptls_t ptls, jl_taggedvalue_t *o, assert(!gc_alloc_map_is_set((char*)o)); bigval_t *hdr = bigval_header(o); if (mark_mode == GC_OLD_MARKED) { - ptls->gc_cache.perm_scanned_bytes += hdr->sz; + ptls->gc_tls.gc_cache.perm_scanned_bytes += hdr->sz; gc_queue_big_marked(ptls, hdr, 0); } else { - ptls->gc_cache.scanned_bytes += hdr->sz; + ptls->gc_tls.gc_cache.scanned_bytes += hdr->sz; // We can't easily tell if the object is old or being promoted // from the gc bits but if the `age` is `0` then the object // must be already on a young list. @@ -872,12 +872,12 @@ STATIC_INLINE void gc_setmark_pool_(jl_ptls_t ptls, jl_taggedvalue_t *o, gc_setmark_big(ptls, o, mark_mode); #else if (mark_mode == GC_OLD_MARKED) { - ptls->gc_cache.perm_scanned_bytes += page->osize; + ptls->gc_tls.gc_cache.perm_scanned_bytes += page->osize; static_assert(sizeof(_Atomic(uint16_t)) == sizeof(page->nold), ""); jl_atomic_fetch_add_relaxed((_Atomic(uint16_t)*)&page->nold, 1); } else { - ptls->gc_cache.scanned_bytes += page->osize; + ptls->gc_tls.gc_cache.scanned_bytes += page->osize; if (mark_reset_age) { page->has_young = 1; } @@ -930,7 +930,7 @@ void gc_setmark_buf(jl_ptls_t ptls, void *o, uint8_t mark_mode, size_t minsz) JL STATIC_INLINE void maybe_collect(jl_ptls_t ptls) { - if (jl_atomic_load_relaxed(&ptls->gc_num.allocd) >= 0 || jl_gc_debug_check_other()) { + if (jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) >= 0 || jl_gc_debug_check_other()) { jl_gc_collect(JL_GC_AUTO); } else { @@ -946,7 +946,7 @@ JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, jl_weakref_t *wr = (jl_weakref_t*)jl_gc_alloc(ptls, sizeof(void*), jl_weakref_type); wr->value = value; // NOTE: wb not needed here - small_arraylist_push(&ptls->heap.weak_refs, wr); + small_arraylist_push(&ptls->gc_tls.heap.weak_refs, wr); return wr; } @@ -956,8 +956,8 @@ static void clear_weak_refs(void) for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; if (ptls2 != NULL) { - size_t n, l = ptls2->heap.weak_refs.len; - void **lst = ptls2->heap.weak_refs.items; + size_t n, l = ptls2->gc_tls.heap.weak_refs.len; + void **lst = ptls2->gc_tls.heap.weak_refs.items; for (n = 0; n < l; n++) { jl_weakref_t *wr = (jl_weakref_t*)lst[n]; if (!gc_marked(jl_astaggedvalue(wr->value)->bits.gc)) @@ -975,8 +975,8 @@ static void sweep_weak_refs(void) if (ptls2 != NULL) { size_t n = 0; size_t ndel = 0; - size_t l = ptls2->heap.weak_refs.len; - void **lst = ptls2->heap.weak_refs.items; + size_t l = ptls2->gc_tls.heap.weak_refs.len; + void **lst = ptls2->gc_tls.heap.weak_refs.items; if (l == 0) continue; while (1) { @@ -991,7 +991,7 @@ static void sweep_weak_refs(void) lst[n] = lst[n + ndel]; lst[n + ndel] = tmp; } - ptls2->heap.weak_refs.len -= ndel; + ptls2->gc_tls.heap.weak_refs.len -= ndel; } } } @@ -1015,15 +1015,15 @@ STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) jl_throw(jl_memory_exception); gc_invoke_callbacks(jl_gc_cb_notify_external_alloc_t, gc_cblist_notify_external_alloc, (v, allocsz)); - jl_atomic_store_relaxed(&ptls->gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz); - jl_atomic_store_relaxed(&ptls->gc_num.bigalloc, - jl_atomic_load_relaxed(&ptls->gc_num.bigalloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + allocsz); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.bigalloc, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.bigalloc) + 1); #ifdef MEMDEBUG memset(v, 0xee, allocsz); #endif v->sz = allocsz; - gc_big_object_link(v, &ptls->heap.big_objects); + gc_big_object_link(v, &ptls->gc_tls.heap.big_objects); return jl_valueof(&v->header); } @@ -1085,17 +1085,17 @@ static void sweep_big(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; if (ptls2 != NULL) - sweep_big_list(sweep_full, &ptls2->heap.big_objects); + sweep_big_list(sweep_full, &ptls2->gc_tls.heap.big_objects); } if (sweep_full) { bigval_t **last_next = sweep_big_list(sweep_full, &big_objects_marked); // Move all survivors from big_objects_marked list to the big_objects list of this thread. - if (ptls->heap.big_objects) - ptls->heap.big_objects->prev = last_next; - *last_next = ptls->heap.big_objects; - ptls->heap.big_objects = big_objects_marked; - if (ptls->heap.big_objects) - ptls->heap.big_objects->prev = &ptls->heap.big_objects; + if (ptls->gc_tls.heap.big_objects) + ptls->gc_tls.heap.big_objects->prev = last_next; + *last_next = ptls->gc_tls.heap.big_objects; + ptls->gc_tls.heap.big_objects = big_objects_marked; + if (ptls->gc_tls.heap.big_objects) + ptls->gc_tls.heap.big_objects->prev = &ptls->gc_tls.heap.big_objects; big_objects_marked = NULL; } gc_time_big_end(); @@ -1107,30 +1107,30 @@ void jl_gc_track_malloced_array(jl_ptls_t ptls, jl_array_t *a) JL_NOTSAFEPOINT { // This is **NOT** a GC safe point. mallocarray_t *ma; - if (ptls->heap.mafreelist == NULL) { + if (ptls->gc_tls.heap.mafreelist == NULL) { ma = (mallocarray_t*)malloc_s(sizeof(mallocarray_t)); } else { - ma = ptls->heap.mafreelist; - ptls->heap.mafreelist = ma->next; + ma = ptls->gc_tls.heap.mafreelist; + ptls->gc_tls.heap.mafreelist = ma->next; } ma->a = a; - ma->next = ptls->heap.mallocarrays; - ptls->heap.mallocarrays = ma; + ma->next = ptls->gc_tls.heap.mallocarrays; + ptls->gc_tls.heap.mallocarrays = ma; } void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT { jl_ptls_t ptls = jl_current_task->ptls; - jl_atomic_store_relaxed(&ptls->gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + sz); } void jl_gc_count_freed(size_t sz) JL_NOTSAFEPOINT { jl_ptls_t ptls = jl_current_task->ptls; - jl_atomic_store_relaxed(&ptls->gc_num.freed, - jl_atomic_load_relaxed(&ptls->gc_num.freed) + sz); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.freed, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.freed) + sz); } static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT @@ -1142,13 +1142,13 @@ static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls = gc_all_tls_states[i]; if (ptls) { - dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_num.allocd) + gc_num.interval); - dest->freed += jl_atomic_load_relaxed(&ptls->gc_num.freed); - dest->malloc += jl_atomic_load_relaxed(&ptls->gc_num.malloc); - dest->realloc += jl_atomic_load_relaxed(&ptls->gc_num.realloc); - dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_num.poolalloc); - dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_num.bigalloc); - dest->freecall += jl_atomic_load_relaxed(&ptls->gc_num.freecall); + dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + gc_num.interval); + dest->freed += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.freed); + dest->malloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc); + dest->realloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.realloc); + dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.poolalloc); + dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.bigalloc); + dest->freecall += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.freecall); } } } @@ -1163,13 +1163,13 @@ static void reset_thread_gc_counts(void) JL_NOTSAFEPOINT jl_ptls_t ptls = gc_all_tls_states[i]; if (ptls != NULL) { // don't reset `pool_live_bytes` here - jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); - jl_atomic_store_relaxed(&ptls->gc_num.freed, 0); - jl_atomic_store_relaxed(&ptls->gc_num.malloc, 0); - jl_atomic_store_relaxed(&ptls->gc_num.realloc, 0); - jl_atomic_store_relaxed(&ptls->gc_num.poolalloc, 0); - jl_atomic_store_relaxed(&ptls->gc_num.bigalloc, 0); - jl_atomic_store_relaxed(&ptls->gc_num.freecall, 0); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.freed, 0); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.realloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.poolalloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.bigalloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.freecall, 0); } } } @@ -1223,8 +1223,8 @@ static void sweep_malloced_arrays(void) JL_NOTSAFEPOINT for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; if (ptls2 != NULL) { - mallocarray_t *ma = ptls2->heap.mallocarrays; - mallocarray_t **pma = &ptls2->heap.mallocarrays; + mallocarray_t *ma = ptls2->gc_tls.heap.mallocarrays; + mallocarray_t **pma = &ptls2->gc_tls.heap.mallocarrays; while (ma != NULL) { mallocarray_t *nxt = ma->next; int bits = jl_astaggedvalue(ma->a)->bits.gc; @@ -1235,8 +1235,8 @@ static void sweep_malloced_arrays(void) JL_NOTSAFEPOINT *pma = nxt; assert(ma->a->flags.how == 2); jl_gc_free_array(ma->a); - ma->next = ptls2->heap.mafreelist; - ptls2->heap.mafreelist = ma; + ma->next = ptls2->gc_tls.heap.mafreelist; + ptls2->gc_tls.heap.mafreelist = ma; } gc_time_count_mallocd_array(bits); ma = nxt; @@ -1251,7 +1251,7 @@ STATIC_INLINE jl_taggedvalue_t *gc_reset_page(jl_ptls_t ptls2, const jl_gc_pool_ { assert(GC_PAGE_OFFSET >= sizeof(void*)); pg->nfree = (GC_PAGE_SZ - GC_PAGE_OFFSET) / p->osize; - pg->pool_n = p - ptls2->heap.norm_pools; + pg->pool_n = p - ptls2->gc_tls.heap.norm_pools; jl_taggedvalue_t *beg = (jl_taggedvalue_t*)(pg->data + GC_PAGE_OFFSET); pg->has_young = 0; pg->has_marked = 0; @@ -1277,7 +1277,7 @@ static NOINLINE jl_taggedvalue_t *gc_add_page(jl_gc_pool_t *p) JL_NOTSAFEPOINT pg->osize = p->osize; pg->thread_n = ptls->tid; set_page_metadata(pg); - push_lf_back(&ptls->page_metadata_allocd, pg); + push_lf_back(&ptls->gc_tls.page_metadata_allocd, pg); jl_taggedvalue_t *fl = gc_reset_page(ptls, p, pg); p->newpages = fl; return fl; @@ -1296,12 +1296,12 @@ STATIC_INLINE jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset return jl_gc_big_alloc(ptls, osize, NULL); #endif maybe_collect(ptls); - jl_atomic_store_relaxed(&ptls->gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_num.allocd) + osize); - jl_atomic_store_relaxed(&ptls->gc_num.pool_live_bytes, - jl_atomic_load_relaxed(&ptls->gc_num.pool_live_bytes) + osize); - jl_atomic_store_relaxed(&ptls->gc_num.poolalloc, - jl_atomic_load_relaxed(&ptls->gc_num.poolalloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + osize); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.pool_live_bytes, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.pool_live_bytes) + osize); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.poolalloc, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.poolalloc) + 1); // first try to use the freelist jl_taggedvalue_t *v = p->freelist; if (v != NULL) { @@ -1363,7 +1363,7 @@ int jl_gc_classify_pools(size_t sz, int *osize) size_t allocsz = sz + sizeof(jl_taggedvalue_t); int klass = jl_gc_szclass(allocsz); *osize = jl_gc_sizeclasses[klass]; - return (int)(intptr_t)(&((jl_ptls_t)0)->heap.norm_pools[klass]); + return (int)(intptr_t)(&((jl_ptls_t)0)->gc_tls.heap.norm_pools[klass]); } // sweep phase @@ -1516,8 +1516,8 @@ static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_ // instead of adding it to the thread that originally allocated the page, so we can avoid // an atomic-fetch-add here. size_t delta = (GC_PAGE_SZ - GC_PAGE_OFFSET - nfree * osize); - jl_atomic_store_relaxed(&ptls->gc_num.pool_live_bytes, - jl_atomic_load_relaxed(&ptls->gc_num.pool_live_bytes) + delta); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.pool_live_bytes, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.pool_live_bytes) + delta); jl_atomic_fetch_add_relaxed((_Atomic(int64_t) *)&gc_num.freed, (nfree - old_nfree) * osize); } @@ -1527,7 +1527,7 @@ STATIC_INLINE void gc_sweep_pool_page(gc_page_profiler_serializer_t *s, jl_gc_pa int p_n = pg->pool_n; int t_n = pg->thread_n; jl_ptls_t ptls2 = gc_all_tls_states[t_n]; - jl_gc_pool_t *p = &ptls2->heap.norm_pools[p_n]; + jl_gc_pool_t *p = &ptls2->gc_tls.heap.norm_pools[p_n]; int osize = pg->osize; gc_sweep_page(s, p, allocd, pg, osize); } @@ -1573,7 +1573,7 @@ int gc_sweep_prescan(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_sc jl_gc_pagemeta_t *tail = NULL; memset(&tmp, 0, sizeof(tmp)); while (1) { - jl_gc_pagemeta_t *pg = pop_lf_back_nosync(&ptls2->page_metadata_allocd); + jl_gc_pagemeta_t *pg = pop_lf_back_nosync(&ptls2->gc_tls.page_metadata_allocd); if (pg == NULL) { break; } @@ -1602,9 +1602,9 @@ int gc_sweep_prescan(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_sc } } if (tail != NULL) { - tail->next = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom); + tail->next = jl_atomic_load_relaxed(&ptls2->gc_tls.page_metadata_allocd.bottom); } - ptls2->page_metadata_allocd = tmp; + ptls2->gc_tls.page_metadata_allocd = tmp; if (n_pages_to_scan >= n_pages_worth_parallel_sweep) { break; } @@ -1625,7 +1625,7 @@ void gc_sweep_wake_all(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_ for (int i = first; i <= last; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; gc_check_ptls_of_parallel_collector_thread(ptls2); - jl_atomic_fetch_add(&ptls2->gc_sweeps_requested, 1); + jl_atomic_fetch_add(&ptls2->gc_tls.gc_sweeps_requested, 1); } uv_cond_broadcast(&gc_threads_cond); uv_mutex_unlock(&gc_threads_lock); @@ -1641,7 +1641,7 @@ void gc_sweep_wake_all(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_ for (int i = first; i <= last; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; gc_check_ptls_of_parallel_collector_thread(ptls2); - while (jl_atomic_load_acquire(&ptls2->gc_sweeps_requested) != 0) { + while (jl_atomic_load_acquire(&ptls2->gc_tls.gc_sweeps_requested) != 0) { jl_cpu_pause(); } } @@ -1675,7 +1675,7 @@ void gc_sweep_pool_parallel(jl_ptls_t ptls) continue; } jl_gc_page_stack_t *dest = &allocd_scratch[ptls2->tid].stack; - jl_gc_pagemeta_t *pg = try_pop_lf_back(&ptls2->page_metadata_allocd); + jl_gc_pagemeta_t *pg = try_pop_lf_back(&ptls2->gc_tls.page_metadata_allocd); // failed steal attempt if (pg == NULL) { continue; @@ -1692,7 +1692,7 @@ void gc_sweep_pool_parallel(jl_ptls_t ptls) if (ptls2 == NULL) { continue; } - jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom); + jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->gc_tls.page_metadata_allocd.bottom); if (pg != NULL) { no_more_work = 0; break; @@ -1770,9 +1770,9 @@ static void gc_sweep_pool(void) } continue; } - jl_atomic_store_relaxed(&ptls2->gc_num.pool_live_bytes, 0); + jl_atomic_store_relaxed(&ptls2->gc_tls.gc_num.pool_live_bytes, 0); for (int i = 0; i < JL_GC_N_POOLS; i++) { - jl_gc_pool_t *p = &ptls2->heap.norm_pools[i]; + jl_gc_pool_t *p = &ptls2->gc_tls.heap.norm_pools[i]; jl_taggedvalue_t *last = p->freelist; if (last != NULL) { jl_gc_pagemeta_t *pg = jl_assume(page_metadata_unsafe(last)); @@ -1804,9 +1804,9 @@ static void gc_sweep_pool(void) for (int t_i = 0; t_i < n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; if (ptls2 != NULL) { - ptls2->page_metadata_allocd = new_gc_allocd_scratch[t_i].stack; + ptls2->gc_tls.page_metadata_allocd = new_gc_allocd_scratch[t_i].stack; for (int i = 0; i < JL_GC_N_POOLS; i++) { - jl_gc_pool_t *p = &ptls2->heap.norm_pools[i]; + jl_gc_pool_t *p = &ptls2->gc_tls.heap.norm_pools[i]; p->newpages = NULL; } } @@ -1818,7 +1818,7 @@ static void gc_sweep_pool(void) if (ptls2 == NULL) { continue; } - jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom); + jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->gc_tls.page_metadata_allocd.bottom); while (pg != NULL) { jl_gc_pagemeta_t *pg2 = pg->next; if (pg->fl_begin_offset != UINT16_MAX) { @@ -1880,8 +1880,8 @@ JL_DLLEXPORT void jl_gc_queue_root(const jl_value_t *ptr) // which is not idempotent. See comments in https://github.com/JuliaLang/julia/issues/50419 uintptr_t header = jl_atomic_fetch_and_relaxed((_Atomic(uintptr_t) *)&o->header, ~GC_OLD); if (header & GC_OLD) { // write barrier has not been triggered in this object yet - arraylist_push(&ptls->heap.remset, (jl_value_t*)ptr); - ptls->heap.remset_nptr++; // conservative + arraylist_push(&ptls->gc_tls.heap.remset, (jl_value_t*)ptr); + ptls->gc_tls.heap.remset_nptr++; // conservative } } @@ -1986,8 +1986,8 @@ STATIC_INLINE void gc_mark_push_remset(jl_ptls_t ptls, jl_value_t *obj, uintptr_t nptr) JL_NOTSAFEPOINT { if (__unlikely((nptr & 0x3) == 0x3)) { - ptls->heap.remset_nptr += nptr >> 2; - arraylist_t *remset = &ptls->heap.remset; + ptls->gc_tls.heap.remset_nptr += nptr >> 2; + arraylist_t *remset = &ptls->gc_tls.heap.remset; size_t len = remset->len; if (__unlikely(len >= remset->max)) { arraylist_push(remset, obj); @@ -2051,7 +2051,7 @@ JL_NORETURN NOINLINE void gc_dump_queue_and_abort(jl_ptls_t ptls, jl_datatype_t if (jl_n_gcthreads == 0) { jl_safe_printf("\n"); jl_value_t *new_obj; - jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_gc_markqueue_t *mq = &ptls->gc_tls.mark_queue; jl_safe_printf("thread %d ptr queue:\n", ptls->tid); jl_safe_printf("~~~~~~~~~~ ptr queue top ~~~~~~~~~~\n"); while ((new_obj = gc_ptr_queue_steal_from(mq)) != NULL) { @@ -2090,7 +2090,7 @@ STATIC_INLINE jl_value_t *gc_mark_obj8(jl_ptls_t ptls, char *obj8_parent, uint8_ uint8_t *obj8_end, uintptr_t nptr) JL_NOTSAFEPOINT { (void)jl_assume(obj8_begin < obj8_end); - jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_gc_markqueue_t *mq = &ptls->gc_tls.mark_queue; jl_value_t **slot = NULL; jl_value_t *new_obj = NULL; for (; obj8_begin < obj8_end; obj8_begin++) { @@ -2122,7 +2122,7 @@ STATIC_INLINE jl_value_t *gc_mark_obj16(jl_ptls_t ptls, char *obj16_parent, uint uint16_t *obj16_end, uintptr_t nptr) JL_NOTSAFEPOINT { (void)jl_assume(obj16_begin < obj16_end); - jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_gc_markqueue_t *mq = &ptls->gc_tls.mark_queue; jl_value_t **slot = NULL; jl_value_t *new_obj = NULL; for (; obj16_begin < obj16_end; obj16_begin++) { @@ -2154,7 +2154,7 @@ STATIC_INLINE jl_value_t *gc_mark_obj32(jl_ptls_t ptls, char *obj32_parent, uint uint32_t *obj32_end, uintptr_t nptr) JL_NOTSAFEPOINT { (void)jl_assume(obj32_begin < obj32_end); - jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_gc_markqueue_t *mq = &ptls->gc_tls.mark_queue; jl_value_t **slot = NULL; jl_value_t *new_obj = NULL; for (; obj32_begin < obj32_end; obj32_begin++) { @@ -2185,7 +2185,7 @@ STATIC_INLINE jl_value_t *gc_mark_obj32(jl_ptls_t ptls, char *obj32_parent, uint STATIC_INLINE void gc_mark_objarray(jl_ptls_t ptls, jl_value_t *obj_parent, jl_value_t **obj_begin, jl_value_t **obj_end, uint32_t step, uintptr_t nptr) JL_NOTSAFEPOINT { - jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_gc_markqueue_t *mq = &ptls->gc_tls.mark_queue; jl_value_t *new_obj; // Decide whether need to chunk objary (void)jl_assume(step > 0); @@ -2252,7 +2252,7 @@ STATIC_INLINE void gc_mark_array8(jl_ptls_t ptls, jl_value_t *ary8_parent, jl_va jl_value_t **ary8_end, uint8_t *elem_begin, uint8_t *elem_end, uintptr_t nptr) JL_NOTSAFEPOINT { - jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_gc_markqueue_t *mq = &ptls->gc_tls.mark_queue; jl_value_t *new_obj; size_t elsize = ((jl_array_t *)ary8_parent)->elsize / sizeof(jl_value_t *); assert(elsize > 0); @@ -2329,7 +2329,7 @@ STATIC_INLINE void gc_mark_array16(jl_ptls_t ptls, jl_value_t *ary16_parent, jl_ jl_value_t **ary16_end, uint16_t *elem_begin, uint16_t *elem_end, uintptr_t nptr) JL_NOTSAFEPOINT { - jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_gc_markqueue_t *mq = &ptls->gc_tls.mark_queue; jl_value_t *new_obj; size_t elsize = ((jl_array_t *)ary16_parent)->elsize / sizeof(jl_value_t *); assert(elsize > 0); @@ -2456,7 +2456,7 @@ STATIC_INLINE void gc_mark_chunk(jl_ptls_t ptls, jl_gc_markqueue_t *mq, jl_gc_ch STATIC_INLINE void gc_mark_stack(jl_ptls_t ptls, jl_gcframe_t *s, uint32_t nroots, uintptr_t offset, uintptr_t lb, uintptr_t ub) JL_NOTSAFEPOINT { - jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_gc_markqueue_t *mq = &ptls->gc_tls.mark_queue; jl_value_t *new_obj; uint32_t nr = nroots >> 2; while (1) { @@ -2501,7 +2501,7 @@ STATIC_INLINE void gc_mark_stack(jl_ptls_t ptls, jl_gcframe_t *s, uint32_t nroot // Mark exception stack STATIC_INLINE void gc_mark_excstack(jl_ptls_t ptls, jl_excstack_t *excstack, size_t itr) JL_NOTSAFEPOINT { - jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_gc_markqueue_t *mq = &ptls->gc_tls.mark_queue; jl_value_t *new_obj; while (itr > 0) { size_t bt_size = jl_excstack_bt_size(excstack, itr); @@ -2532,7 +2532,7 @@ STATIC_INLINE void gc_mark_excstack(jl_ptls_t ptls, jl_excstack_t *excstack, siz STATIC_INLINE void gc_mark_module_binding(jl_ptls_t ptls, jl_module_t *mb_parent, uintptr_t nptr, uint8_t bits) JL_NOTSAFEPOINT { - jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_gc_markqueue_t *mq = &ptls->gc_tls.mark_queue; jl_value_t *bindings = (jl_value_t *)jl_atomic_load_relaxed(&mb_parent->bindings); gc_assert_parent_validity((jl_value_t *)mb_parent, bindings); gc_try_claim_and_push(mq, bindings, &nptr); @@ -2607,7 +2607,7 @@ JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj) { int may_claim = gc_try_setmark_tag(jl_astaggedvalue(obj), GC_MARKED); if (may_claim) - gc_ptr_queue_push(&ptls->mark_queue, obj); + gc_ptr_queue_push(&ptls->gc_tls.mark_queue, obj); return may_claim; } @@ -2783,10 +2783,10 @@ FORCE_INLINE void gc_mark_outrefs(jl_ptls_t ptls, jl_gc_markqueue_t *mq, void *_ if (update_meta || foreign_alloc) { gc_heap_snapshot_record_hidden_edge(new_obj, a->data, jl_array_nbytes(a), flags.pooled); if (bits == GC_OLD_MARKED) { - ptls->gc_cache.perm_scanned_bytes += jl_array_nbytes(a); + ptls->gc_tls.gc_cache.perm_scanned_bytes += jl_array_nbytes(a); } else { - ptls->gc_cache.scanned_bytes += jl_array_nbytes(a); + ptls->gc_tls.gc_cache.scanned_bytes += jl_array_nbytes(a); } } } @@ -2912,7 +2912,7 @@ FORCE_INLINE void gc_mark_outrefs(jl_ptls_t ptls, jl_gc_markqueue_t *mq, void *_ void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq) { while (1) { - void *new_obj = (void *)gc_ptr_queue_pop(&ptls->mark_queue); + void *new_obj = (void *)gc_ptr_queue_pop(&ptls->gc_tls.mark_queue); // No more objects to mark if (__unlikely(new_obj == NULL)) { return; @@ -2940,17 +2940,16 @@ void gc_drain_own_chunkqueue(jl_ptls_t ptls, jl_gc_markqueue_t *mq) // makes it easier to implement parallel marking via work-stealing JL_EXTENSION NOINLINE void gc_mark_loop_serial(jl_ptls_t ptls) { - gc_mark_loop_serial_(ptls, &ptls->mark_queue); - gc_drain_own_chunkqueue(ptls, &ptls->mark_queue); + gc_mark_loop_serial_(ptls, &ptls->gc_tls.mark_queue); + gc_drain_own_chunkqueue(ptls, &ptls->gc_tls.mark_queue); } void gc_mark_and_steal(jl_ptls_t ptls) { - jl_gc_markqueue_t *mq = &ptls->mark_queue; - jl_gc_markqueue_t *mq_master = NULL; int master_tid = jl_atomic_load(&gc_master_tid); assert(master_tid != -1); - mq_master = &gc_all_tls_states[master_tid]->mark_queue; + jl_gc_markqueue_t *mq = &ptls->gc_tls.mark_queue; + jl_gc_markqueue_t *mq_master = &gc_all_tls_states[master_tid]->gc_tls.mark_queue; void *new_obj; jl_gc_chunk_t c; pop : { @@ -2981,7 +2980,7 @@ void gc_mark_and_steal(jl_ptls_t ptls) int v = gc_random_parallel_collector_thread_id(ptls); jl_ptls_t ptls2 = gc_all_tls_states[v]; gc_check_ptls_of_parallel_collector_thread(ptls2); - jl_gc_markqueue_t *mq2 = &ptls2->mark_queue; + jl_gc_markqueue_t *mq2 = &ptls2->gc_tls.mark_queue; c = gc_chunkqueue_steal_from(mq2); if (c.cid != GC_empty_chunk) { gc_mark_chunk(ptls, mq, &c); @@ -2992,7 +2991,7 @@ void gc_mark_and_steal(jl_ptls_t ptls) for (int i = first; i <= last; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; gc_check_ptls_of_parallel_collector_thread(ptls2); - jl_gc_markqueue_t *mq2 = &ptls2->mark_queue; + jl_gc_markqueue_t *mq2 = &ptls2->gc_tls.mark_queue; c = gc_chunkqueue_steal_from(mq2); if (c.cid != GC_empty_chunk) { gc_mark_chunk(ptls, mq, &c); @@ -3012,7 +3011,7 @@ void gc_mark_and_steal(jl_ptls_t ptls) int v = gc_random_parallel_collector_thread_id(ptls); jl_ptls_t ptls2 = gc_all_tls_states[v]; gc_check_ptls_of_parallel_collector_thread(ptls2); - jl_gc_markqueue_t *mq2 = &ptls2->mark_queue; + jl_gc_markqueue_t *mq2 = &ptls2->gc_tls.mark_queue; new_obj = gc_ptr_queue_steal_from(mq2); if (new_obj != NULL) goto mark; @@ -3021,7 +3020,7 @@ void gc_mark_and_steal(jl_ptls_t ptls) for (int i = first; i <= last; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; gc_check_ptls_of_parallel_collector_thread(ptls2); - jl_gc_markqueue_t *mq2 = &ptls2->mark_queue; + jl_gc_markqueue_t *mq2 = &ptls2->gc_tls.mark_queue; new_obj = gc_ptr_queue_steal_from(mq2); if (new_obj != NULL) goto mark; @@ -3039,10 +3038,10 @@ size_t gc_count_work_in_queue(jl_ptls_t ptls) JL_NOTSAFEPOINT { // assume each chunk is worth 256 units of work and each pointer // is worth 1 unit of work - size_t work = 256 * (jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.bottom) - - jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.top)); - work += (jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.bottom) - - jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.top)); + size_t work = 256 * (jl_atomic_load_relaxed(&ptls->gc_tls.mark_queue.chunk_queue.bottom) - + jl_atomic_load_relaxed(&ptls->gc_tls.mark_queue.chunk_queue.top)); + work += (jl_atomic_load_relaxed(&ptls->gc_tls.mark_queue.ptr_queue.bottom) - + jl_atomic_load_relaxed(&ptls->gc_tls.mark_queue.ptr_queue.top)); return work; } @@ -3152,7 +3151,10 @@ void gc_mark_clean_reclaim_sets(void) // Clean up `reclaim-sets` for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - arraylist_t *reclaim_set2 = &ptls2->mark_queue.reclaim_set; + if (ptls2 == NULL) { + continue; + } + arraylist_t *reclaim_set2 = &ptls2->gc_tls.mark_queue.reclaim_set; ws_array_t *a = NULL; while ((a = (ws_array_t *)arraylist_pop(reclaim_set2)) != NULL) { free(a->buffer); @@ -3165,10 +3167,10 @@ void gc_mark_clean_reclaim_sets(void) if (ptls2 == NULL) { continue; } - jl_atomic_store_relaxed(&ptls2->mark_queue.ptr_queue.bottom, 0); - jl_atomic_store_relaxed(&ptls2->mark_queue.ptr_queue.top, 0); - jl_atomic_store_relaxed(&ptls2->mark_queue.chunk_queue.bottom, 0); - jl_atomic_store_relaxed(&ptls2->mark_queue.chunk_queue.top, 0); + jl_atomic_store_relaxed(&ptls2->gc_tls.mark_queue.ptr_queue.bottom, 0); + jl_atomic_store_relaxed(&ptls2->gc_tls.mark_queue.ptr_queue.top, 0); + jl_atomic_store_relaxed(&ptls2->gc_tls.mark_queue.chunk_queue.bottom, 0); + jl_atomic_store_relaxed(&ptls2->gc_tls.mark_queue.chunk_queue.top, 0); } } @@ -3217,8 +3219,8 @@ static void gc_queue_bt_buf(jl_gc_markqueue_t *mq, jl_ptls_t ptls2) static void gc_queue_remset(jl_gc_markqueue_t *mq, jl_ptls_t ptls2) { - void **items = ptls2->heap.remset.items; - size_t len = ptls2->heap.remset.len; + void **items = ptls2->gc_tls.heap.remset.items; + size_t len = ptls2->gc_tls.heap.remset.len; for (size_t i = 0; i < len; i++) { void *_v = items[i]; jl_astaggedvalue(_v)->bits.gc = GC_OLD_MARKED; @@ -3226,8 +3228,8 @@ static void gc_queue_remset(jl_gc_markqueue_t *mq, jl_ptls_t ptls2) gc_ptr_queue_push(mq, v); } // Don't forget to clear the remset - ptls2->heap.remset.len = 0; - ptls2->heap.remset_nptr = 0; + ptls2->gc_tls.heap.remset.len = 0; + ptls2->gc_tls.heap.remset_nptr = 0; } static void gc_check_all_remsets_are_empty(void) @@ -3235,8 +3237,8 @@ static void gc_check_all_remsets_are_empty(void) for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; if (ptls2 != NULL) { - assert(ptls2->heap.remset.len == 0); - assert(ptls2->heap.remset_nptr == 0); + assert(ptls2->gc_tls.heap.remset.len == 0); + assert(ptls2->gc_tls.heap.remset_nptr == 0); } } } @@ -3409,7 +3411,7 @@ JL_DLLEXPORT int64_t jl_gc_pool_live_bytes(void) for (int i = 0; i < n_threads; i++) { jl_ptls_t ptls2 = all_tls_states[i]; if (ptls2 != NULL) { - pool_live_bytes += jl_atomic_load_relaxed(&ptls2->gc_num.pool_live_bytes); + pool_live_bytes += jl_atomic_load_relaxed(&ptls2->gc_tls.gc_num.pool_live_bytes); } } return pool_live_bytes; @@ -3431,7 +3433,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // so that the sweep shows a downward trend in memory usage. jl_timing_counter_inc(JL_TIMING_COUNTER_HeapSize, gc_num.allocd); - jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_gc_markqueue_t *mq = &ptls->gc_tls.mark_queue; uint64_t gc_start_time = jl_hrtime(); int64_t last_perm_scanned_bytes = perm_scanned_bytes; @@ -3448,7 +3450,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) if (!single_threaded_mark) { int dest_tid = gc_ith_parallel_collector_thread_id(t_i % jl_n_markthreads); ptls_dest = gc_all_tls_states[dest_tid]; - mq_dest = &ptls_dest->mark_queue; + mq_dest = &ptls_dest->gc_tls.mark_queue; } if (ptls2 != NULL) { // 1.1. mark every thread local root @@ -3542,7 +3544,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; if (ptls2 != NULL) - nptr += ptls2->heap.remset_nptr; + nptr += ptls2->gc_tls.heap.remset_nptr; } // many pointers in the intergen frontier => "quick" mark is not quick @@ -3641,13 +3643,13 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) if (ptls2 == NULL) continue; if (!sweep_full) { - for (int i = 0; i < ptls2->heap.remset.len; i++) { - void *ptr = ptls2->heap.remset.items[i]; + for (int i = 0; i < ptls2->gc_tls.heap.remset.len; i++) { + void *ptr = ptls2->gc_tls.heap.remset.items[i]; jl_astaggedvalue(ptr)->bits.gc = GC_MARKED; } } else { - ptls2->heap.remset.len = 0; + ptls2->gc_tls.heap.remset.len = 0; } } @@ -3735,8 +3737,8 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) jl_task_t *ct = jl_current_task; jl_ptls_t ptls = ct->ptls; if (jl_atomic_load_acquire(&jl_gc_disable_counter)) { - size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_num.allocd) + gc_num.interval; - jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); + size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + gc_num.interval; + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); static_assert(sizeof(_Atomic(uint64_t)) == sizeof(gc_num.deferred_alloc), ""); jl_atomic_fetch_add((_Atomic(uint64_t)*)&gc_num.deferred_alloc, localbytes); return; @@ -3848,7 +3850,7 @@ JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) // Per-thread initialization void jl_init_thread_heap(jl_ptls_t ptls) { - jl_thread_heap_t *heap = &ptls->heap; + jl_thread_heap_t *heap = &ptls->gc_tls.heap; jl_gc_pool_t *p = heap->norm_pools; for (int i = 0; i < JL_GC_N_POOLS; i++) { p[i].osize = jl_gc_sizeclasses[i]; @@ -3864,15 +3866,15 @@ void jl_init_thread_heap(jl_ptls_t ptls) heap->big_objects = NULL; arraylist_new(&heap->remset, 0); arraylist_new(&ptls->finalizers, 0); - arraylist_new(&ptls->sweep_objs, 0); + arraylist_new(&ptls->gc_tls.sweep_objs, 0); - jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; + jl_gc_mark_cache_t *gc_cache = &ptls->gc_tls.gc_cache; gc_cache->perm_scanned_bytes = 0; gc_cache->scanned_bytes = 0; gc_cache->nbig_obj = 0; // Initialize GC mark-queue - jl_gc_markqueue_t *mq = &ptls->mark_queue; + jl_gc_markqueue_t *mq = &ptls->gc_tls.mark_queue; ws_queue_t *cq = &mq->chunk_queue; ws_array_t *wsa = create_ws_array(GC_CHUNK_QUEUE_INIT_SIZE, sizeof(jl_gc_chunk_t)); jl_atomic_store_relaxed(&cq->top, 0); @@ -3885,8 +3887,8 @@ void jl_init_thread_heap(jl_ptls_t ptls) jl_atomic_store_relaxed(&q->array, wsa2); arraylist_new(&mq->reclaim_set, 32); - memset(&ptls->gc_num, 0, sizeof(ptls->gc_num)); - jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); + memset(&ptls->gc_tls.gc_num, 0, sizeof(ptls->gc_tls.gc_num)); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); } // System-wide initializations @@ -3964,10 +3966,10 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) if (data != NULL && pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); - jl_atomic_store_relaxed(&ptls->gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz); - jl_atomic_store_relaxed(&ptls->gc_num.malloc, - jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + sz); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc) + 1); } return data; } @@ -3980,10 +3982,10 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) if (data != NULL && pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); - jl_atomic_store_relaxed(&ptls->gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_num.allocd) + nm*sz); - jl_atomic_store_relaxed(&ptls->gc_num.malloc, - jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + nm*sz); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc) + 1); } return data; } @@ -3995,10 +3997,10 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) free(p); if (pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; - jl_atomic_store_relaxed(&ptls->gc_num.freed, - jl_atomic_load_relaxed(&ptls->gc_num.freed) + sz); - jl_atomic_store_relaxed(&ptls->gc_num.freecall, - jl_atomic_load_relaxed(&ptls->gc_num.freecall) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.freed, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.freed) + sz); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.freecall, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.freecall) + 1); } } @@ -4011,13 +4013,13 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); if (sz < old) - jl_atomic_store_relaxed(&ptls->gc_num.freed, - jl_atomic_load_relaxed(&ptls->gc_num.freed) + (old - sz)); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.freed, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.freed) + (old - sz)); else - jl_atomic_store_relaxed(&ptls->gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (sz - old)); - jl_atomic_store_relaxed(&ptls->gc_num.realloc, - jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + (sz - old)); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.realloc, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.realloc) + 1); } return data; } @@ -4097,10 +4099,10 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) if (b == NULL) jl_throw(jl_memory_exception); - jl_atomic_store_relaxed(&ptls->gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz); - jl_atomic_store_relaxed(&ptls->gc_num.malloc, - jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + allocsz); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc) + 1); #ifdef _OS_WINDOWS_ SetLastError(last_error); #endif @@ -4137,17 +4139,17 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds errno = last_errno; // gc_managed_realloc_ is currently used exclusively for resizing array buffers. if (is_old_marked) { - ptls->gc_cache.perm_scanned_bytes += allocsz - oldsz; + ptls->gc_tls.gc_cache.perm_scanned_bytes += allocsz - oldsz; inc_live_bytes(allocsz - oldsz); } else if (allocsz < oldsz) - jl_atomic_store_relaxed(&ptls->gc_num.freed, - jl_atomic_load_relaxed(&ptls->gc_num.freed) + (oldsz - allocsz)); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.freed, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.freed) + (oldsz - allocsz)); else - jl_atomic_store_relaxed(&ptls->gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (allocsz - oldsz)); - jl_atomic_store_relaxed(&ptls->gc_num.realloc, - jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + (allocsz - oldsz)); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.realloc, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.realloc) + 1); if (allocsz > oldsz) { maybe_record_alloc_to_profile((jl_value_t*)b, allocsz - oldsz, (jl_datatype_t*)jl_buff_tag); } @@ -4338,7 +4340,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p) goto valid_object; } jl_gc_pool_t *pool = - gc_all_tls_states[meta->thread_n]->heap.norm_pools + + gc_all_tls_states[meta->thread_n]->gc_tls.heap.norm_pools + meta->pool_n; if (meta->fl_begin_offset == UINT16_MAX) { // case 2: this is a page on the newpages list @@ -4418,7 +4420,7 @@ JL_DLLEXPORT void * jl_gc_alloc_typed(jl_ptls_t ptls, size_t sz, void *ty) JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) { - arraylist_push(&ptls->sweep_objs, obj); + arraylist_push(&ptls->gc_tls.sweep_objs, obj); } #ifdef __cplusplus diff --git a/src/julia_internal.h b/src/julia_internal.h index 1ea84c0dc0d4f..75ebad73a4546 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -498,7 +498,7 @@ STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) const size_t allocsz = sz + sizeof(jl_taggedvalue_t); if (sz <= GC_MAX_SZCLASS) { int pool_id = jl_gc_szclass(allocsz); - jl_gc_pool_t *p = &ptls->heap.norm_pools[pool_id]; + jl_gc_pool_t *p = &ptls->gc_tls.heap.norm_pools[pool_id]; int osize = jl_gc_sizeclasses[pool_id]; // We call `jl_gc_pool_alloc_noinline` instead of `jl_gc_pool_alloc` to avoid double-counting in // the Allocations Profiler. (See https://github.com/JuliaLang/julia/pull/43868 for more details.) diff --git a/src/julia_threads.h b/src/julia_threads.h index 9bca500604b0e..3d06d380d10c2 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -4,8 +4,8 @@ #ifndef JL_THREADS_H #define JL_THREADS_H +#include "gc-tls.h" #include "julia_atomics.h" -#include "work-stealing-queue.h" #ifndef _OS_WINDOWS_ #include "pthread.h" #endif @@ -122,80 +122,7 @@ typedef struct { uint32_t count; } jl_mutex_t; -typedef struct { - jl_taggedvalue_t *freelist; // root of list of free objects - jl_taggedvalue_t *newpages; // root of list of chunks of free objects - uint16_t osize; // size of objects in this pool -} jl_gc_pool_t; - -typedef struct { - _Atomic(int64_t) allocd; - _Atomic(int64_t) pool_live_bytes; - _Atomic(int64_t) freed; - _Atomic(uint64_t) malloc; - _Atomic(uint64_t) realloc; - _Atomic(uint64_t) poolalloc; - _Atomic(uint64_t) bigalloc; - _Atomic(uint64_t) freecall; -} jl_thread_gc_num_t; - -typedef struct { - // variable for tracking weak references - small_arraylist_t weak_refs; - // live tasks started on this thread - // that are holding onto a stack from the pool - small_arraylist_t live_tasks; - - // variables for tracking malloc'd arrays - struct _mallocarray_t *mallocarrays; - struct _mallocarray_t *mafreelist; - - // variables for tracking big objects - struct _bigval_t *big_objects; - - // lower bound of the number of pointers inside remembered values - int remset_nptr; - // remembered set - arraylist_t remset; - - // variables for allocating objects from pools -#define JL_GC_N_MAX_POOLS 51 // conservative. must be kept in sync with `src/julia_internal.h` - jl_gc_pool_t norm_pools[JL_GC_N_MAX_POOLS]; - -#define JL_N_STACK_POOLS 16 - small_arraylist_t free_stacks[JL_N_STACK_POOLS]; -} jl_thread_heap_t; - -typedef struct { - ws_queue_t chunk_queue; - ws_queue_t ptr_queue; - arraylist_t reclaim_set; -} jl_gc_markqueue_t; - -typedef struct { - // thread local increment of `perm_scanned_bytes` - size_t perm_scanned_bytes; - // thread local increment of `scanned_bytes` - size_t scanned_bytes; - // Number of queued big objects (<= 1024) - size_t nbig_obj; - // Array of queued big objects to be moved between the young list - // and the old list. - // A set low bit means that the object should be moved from the old list - // to the young list (`mark_reset_age`). - // Objects can only be put into this list when the mark bit is flipped to - // `1` (atomically). Combining with the sync after marking, - // this makes sure that a single objects can only appear once in - // the lists (the mark bit cannot be flipped to `0` without sweeping) - void *big_obj[1024]; -} jl_gc_mark_cache_t; - struct _jl_bt_element_t; -struct _jl_gc_pagemeta_t; - -typedef struct { - _Atomic(struct _jl_gc_pagemeta_t *) bottom; -} jl_gc_page_stack_t; // This includes all the thread local states we care about for a thread. // Changes to TLS field types must be reflected in codegen. @@ -227,8 +154,7 @@ typedef struct _jl_tls_states_t { int8_t disable_gc; // Counter to disable finalizer **on the current thread** int finalizers_inhibited; - jl_thread_heap_t heap; // this is very large, and the offset is baked into codegen - jl_thread_gc_num_t gc_num; + jl_gc_tls_states_t gc_tls; // this is very large, and the offset of the first member is baked into codegen volatile sig_atomic_t defer_signal; _Atomic(struct _jl_task_t*) current_task; struct _jl_task_t *next_task; @@ -262,11 +188,6 @@ typedef struct _jl_tls_states_t { #endif jl_thread_t system_id; arraylist_t finalizers; - jl_gc_page_stack_t page_metadata_allocd; - jl_gc_markqueue_t mark_queue; - jl_gc_mark_cache_t gc_cache; - arraylist_t sweep_objs; - _Atomic(int64_t) gc_sweeps_requested; // Saved exception for previous *external* API call or NULL if cleared. // Access via jl_exception_occurred(). struct _jl_value_t *previous_exception; diff --git a/src/partr.c b/src/partr.c index b0b4ee77a9ec5..33631dc83c05a 100644 --- a/src/partr.c +++ b/src/partr.c @@ -115,7 +115,7 @@ static inline int may_mark(void) JL_NOTSAFEPOINT static inline int may_sweep(jl_ptls_t ptls) JL_NOTSAFEPOINT { - return (jl_atomic_load(&ptls->gc_sweeps_requested) > 0); + return (jl_atomic_load(&ptls->gc_tls.gc_sweeps_requested) > 0); } // parallel gc thread function @@ -148,7 +148,7 @@ void jl_parallel_gc_threadfun(void *arg) if (may_sweep(ptls)) { assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_PARALLEL_COLLECTOR_THREAD); gc_sweep_pool_parallel(ptls); - jl_atomic_fetch_add(&ptls->gc_sweeps_requested, -1); + jl_atomic_fetch_add(&ptls->gc_tls.gc_sweeps_requested, -1); } } } diff --git a/src/stackwalk.c b/src/stackwalk.c index ebf56931c2c65..8a12fb2a28143 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -1187,7 +1187,7 @@ JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT if (ptls2 == NULL) { continue; } - small_arraylist_t *live_tasks = &ptls2->heap.live_tasks; + small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks; size_t n = mtarraylist_length(live_tasks); int t_state = JL_TASK_STATE_DONE; jl_task_t *t = ptls2->root_task; From 237325e4ccdeedf9a1570313a5db364af35726c0 Mon Sep 17 00:00:00 2001 From: Diogo Netto <61364108+d-netto@users.noreply.github.com> Date: Mon, 15 Jul 2024 19:20:23 -0300 Subject: [PATCH 3/5] Simplify sweeping of big values (#54936) Simplifies the layout of the doubly linked list of big objects to make it a bit more canonical: let's just store a pointer to the previous element, instead of storing a "pointer to the next element of the previous element". This should make the implementation a bit easier to understand without incurring any memory overhead. I ran the serial and multithreaded benchmarks from GCBenchmarks and this seems fairly close to performance neutral on my machine. We also ran our internal benchmarks on it at RAI and it looks fine from a correctness and performance point of view. --------- Co-authored-by: Kiran Pamnany --- src/gc-debug.c | 8 +-- src/gc-tls.h | 4 +- src/gc.c | 158 ++++++++++++++++++++++++------------------------- src/gc.h | 35 +++++++---- 4 files changed, 105 insertions(+), 100 deletions(-) diff --git a/src/gc-debug.c b/src/gc-debug.c index 5b90ce7cd955b..91e8b7fa9f63d 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -132,7 +132,7 @@ static void clear_mark(int bits) } bigval_t *v; for (int i = 0; i < gc_n_threads; i++) { - v = gc_all_tls_states[i]->gc_tls.heap.big_objects; + v = gc_all_tls_states[i]->gc_tls.heap.young_generation_of_bigvals; while (v != NULL) { void *gcv = &v->header; if (!gc_verifying) @@ -142,7 +142,7 @@ static void clear_mark(int bits) } } - v = big_objects_marked; + v = oldest_generation_of_bigvals; while (v != NULL) { void *gcv = &v->header; if (!gc_verifying) @@ -965,7 +965,7 @@ void gc_stats_big_obj(void) size_t nused=0, nbytes=0, nused_old=0, nbytes_old=0; for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; - bigval_t *v = ptls2->gc_tls.heap.big_objects; + bigval_t *v = ptls2->gc_tls.heap.young_generation_of_bigvals; while (v != NULL) { if (gc_marked(v->bits.gc)) { nused++; @@ -973,7 +973,7 @@ void gc_stats_big_obj(void) } v = v->next; } - v = big_objects_marked; + v = oldest_generation_of_bigvals; while (v != NULL) { if (gc_marked(v->bits.gc)) { nused_old++; diff --git a/src/gc-tls.h b/src/gc-tls.h index 3f8662b467e30..dc15cda48fa58 100644 --- a/src/gc-tls.h +++ b/src/gc-tls.h @@ -31,8 +31,8 @@ typedef struct { struct _mallocarray_t *mallocarrays; struct _mallocarray_t *mafreelist; - // variables for tracking big objects - struct _bigval_t *big_objects; + // variable for tracking young (i.e. not in `GC_OLD_MARKED`/last generation) large objects + struct _bigval_t *young_generation_of_bigvals; // lower bound of the number of pointers inside remembered values int remset_nptr; diff --git a/src/gc.c b/src/gc.c index e96fc9637d6f9..c70eee6905ee9 100644 --- a/src/gc.c +++ b/src/gc.c @@ -34,6 +34,8 @@ uv_cond_t gc_threads_cond; uv_sem_t gc_sweep_assists_needed; // Mutex used to coordinate entry of GC threads in the mark loop uv_mutex_t gc_queue_observer_lock; +// Tag for sentinel nodes in bigval list +uintptr_t gc_bigval_sentinel_tag; // Linked list of callback functions @@ -150,7 +152,6 @@ JL_DLLEXPORT void jl_gc_set_cb_notify_gc_pressure(jl_gc_cb_notify_gc_pressure_t // is going to realloc the buffer (of its own list) or accessing the // list of another thread static jl_mutex_t finalizers_lock; -static uv_mutex_t gc_cache_lock; // mutex for gc-heap-snapshot. jl_mutex_t heapsnapshot_lock; @@ -201,8 +202,8 @@ JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT return jl_buff_tag; } -// List of marked big objects. Not per-thread. Accessed only by master thread. -bigval_t *big_objects_marked = NULL; +// List of big objects in oldest generation (`GC_OLD_MARKED`). Not per-thread. Accessed only by master thread. +bigval_t *oldest_generation_of_bigvals = NULL; // -- Finalization -- // `ptls->finalizers` and `finalizer_list_marked` might have tagged pointers. @@ -759,60 +760,25 @@ static int64_t t_start = 0; // Time GC starts; static int64_t last_trim_maxrss = 0; #endif -static void gc_sync_cache_nolock(jl_ptls_t ptls, jl_gc_mark_cache_t *gc_cache) JL_NOTSAFEPOINT +static void gc_sync_cache(jl_ptls_t ptls, jl_gc_mark_cache_t *gc_cache) JL_NOTSAFEPOINT { - const int nbig = gc_cache->nbig_obj; - for (int i = 0; i < nbig; i++) { - void *ptr = gc_cache->big_obj[i]; - bigval_t *hdr = (bigval_t*)gc_ptr_clear_tag(ptr, 1); - gc_big_object_unlink(hdr); - if (gc_ptr_tag(ptr, 1)) { - gc_big_object_link(hdr, &ptls->gc_tls.heap.big_objects); - } - else { - // Move hdr from `big_objects` list to `big_objects_marked list` - gc_big_object_link(hdr, &big_objects_marked); - } - } - gc_cache->nbig_obj = 0; perm_scanned_bytes += gc_cache->perm_scanned_bytes; scanned_bytes += gc_cache->scanned_bytes; gc_cache->perm_scanned_bytes = 0; gc_cache->scanned_bytes = 0; } -static void gc_sync_cache(jl_ptls_t ptls) JL_NOTSAFEPOINT -{ - uv_mutex_lock(&gc_cache_lock); - gc_sync_cache_nolock(ptls, &ptls->gc_tls.gc_cache); - uv_mutex_unlock(&gc_cache_lock); -} - // No other threads can be running marking at the same time -static void gc_sync_all_caches_nolock(jl_ptls_t ptls) +static void gc_sync_all_caches(jl_ptls_t ptls) { assert(gc_n_threads); for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; if (ptls2 != NULL) - gc_sync_cache_nolock(ptls, &ptls2->gc_tls.gc_cache); + gc_sync_cache(ptls, &ptls2->gc_tls.gc_cache); } } -STATIC_INLINE void gc_queue_big_marked(jl_ptls_t ptls, bigval_t *hdr, - int toyoung) JL_NOTSAFEPOINT -{ - const int nentry = sizeof(ptls->gc_tls.gc_cache.big_obj) / sizeof(void*); - size_t nobj = ptls->gc_tls.gc_cache.nbig_obj; - if (__unlikely(nobj >= nentry)) { - gc_sync_cache(ptls); - nobj = 0; - } - uintptr_t v = (uintptr_t)hdr; - ptls->gc_tls.gc_cache.big_obj[nobj] = (void*)(toyoung ? (v | 1) : v); - ptls->gc_tls.gc_cache.nbig_obj = nobj + 1; -} - // Atomically set the mark bit for object and return whether it was previously unmarked FORCE_INLINE int gc_try_setmark_tag(jl_taggedvalue_t *o, uint8_t mark_mode) JL_NOTSAFEPOINT { @@ -849,16 +815,14 @@ STATIC_INLINE void gc_setmark_big(jl_ptls_t ptls, jl_taggedvalue_t *o, bigval_t *hdr = bigval_header(o); if (mark_mode == GC_OLD_MARKED) { ptls->gc_tls.gc_cache.perm_scanned_bytes += hdr->sz; - gc_queue_big_marked(ptls, hdr, 0); } else { ptls->gc_tls.gc_cache.scanned_bytes += hdr->sz; - // We can't easily tell if the object is old or being promoted - // from the gc bits but if the `age` is `0` then the object - // must be already on a young list. if (mark_reset_age) { + assert(jl_atomic_load(&gc_n_threads_marking) == 0); // `mark_reset_age` is only used during single-threaded marking // Reset the object as if it was just allocated - gc_queue_big_marked(ptls, hdr, 1); + gc_big_object_unlink(hdr); + gc_big_object_link(ptls->gc_tls.heap.young_generation_of_bigvals, hdr); } } } @@ -1023,7 +987,7 @@ STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) memset(v, 0xee, allocsz); #endif v->sz = allocsz; - gc_big_object_link(v, &ptls->gc_tls.heap.big_objects); + gc_big_object_link(ptls->gc_tls.heap.young_generation_of_bigvals, v); return jl_valueof(&v->header); } @@ -1043,60 +1007,85 @@ jl_value_t *jl_gc_big_alloc_noinline(jl_ptls_t ptls, size_t sz) { return jl_gc_big_alloc_inner(ptls, sz); } -// Sweep list rooted at *pv, removing and freeing any unmarked objects. -// Return pointer to last `next` field in the culled list. -static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) JL_NOTSAFEPOINT +FORCE_INLINE void sweep_unlink_and_free(bigval_t *v) JL_NOTSAFEPOINT +{ + gc_big_object_unlink(v); + gc_num.freed += v->sz; +#ifdef MEMDEBUG + memset(v, 0xbb, v->sz); +#endif + gc_invoke_callbacks(jl_gc_cb_notify_external_free_t, gc_cblist_notify_external_free, (v)); + jl_free_aligned(v); +} + +static bigval_t *sweep_list_of_young_bigvals(bigval_t *young) JL_NOTSAFEPOINT { - bigval_t *v = *pv; + bigval_t *last_node = young; + bigval_t *v = young->next; // skip the sentinel + bigval_t *old = oldest_generation_of_bigvals; + int sweep_full = current_sweep_full; // don't load the global in the hot loop while (v != NULL) { bigval_t *nxt = v->next; int bits = v->bits.gc; int old_bits = bits; if (gc_marked(bits)) { - pv = &v->next; if (sweep_full || bits == GC_MARKED) { bits = GC_OLD; + last_node = v; + } + else { // `bits == GC_OLD_MARKED` + assert(bits == GC_OLD_MARKED); + // reached oldest generation, move from young list to old list + gc_big_object_unlink(v); + gc_big_object_link(old, v); } v->bits.gc = bits; } else { - // Remove v from list and free it - *pv = nxt; - if (nxt) - nxt->prev = pv; - gc_num.freed += v->sz; -#ifdef MEMDEBUG - memset(v, 0xbb, v->sz); -#endif - gc_invoke_callbacks(jl_gc_cb_notify_external_free_t, - gc_cblist_notify_external_free, (v)); - jl_free_aligned(v); + sweep_unlink_and_free(v); } gc_time_count_big(old_bits, bits); v = nxt; } - return pv; + return last_node; +} + +static void sweep_list_of_oldest_bigvals(bigval_t *young) JL_NOTSAFEPOINT +{ + bigval_t *v = oldest_generation_of_bigvals->next; // skip the sentinel + while (v != NULL) { + bigval_t *nxt = v->next; + assert(v->bits.gc == GC_OLD_MARKED); + v->bits.gc = GC_OLD; + gc_time_count_big(GC_OLD_MARKED, GC_OLD); + v = nxt; + } } -static void sweep_big(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT +static void sweep_big(jl_ptls_t ptls) JL_NOTSAFEPOINT { gc_time_big_start(); assert(gc_n_threads); + bigval_t *last_node_in_my_list = NULL; for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 != NULL) - sweep_big_list(sweep_full, &ptls2->gc_tls.heap.big_objects); + if (ptls2 != NULL) { + bigval_t *last_node = sweep_list_of_young_bigvals(ptls2->gc_tls.heap.young_generation_of_bigvals); + if (ptls == ptls2) { + last_node_in_my_list = last_node; + } + } } - if (sweep_full) { - bigval_t **last_next = sweep_big_list(sweep_full, &big_objects_marked); - // Move all survivors from big_objects_marked list to the big_objects list of this thread. - if (ptls->gc_tls.heap.big_objects) - ptls->gc_tls.heap.big_objects->prev = last_next; - *last_next = ptls->gc_tls.heap.big_objects; - ptls->gc_tls.heap.big_objects = big_objects_marked; - if (ptls->gc_tls.heap.big_objects) - ptls->gc_tls.heap.big_objects->prev = &ptls->gc_tls.heap.big_objects; - big_objects_marked = NULL; + if (current_sweep_full) { + sweep_list_of_oldest_bigvals(ptls->gc_tls.heap.young_generation_of_bigvals); + // move all nodes in `oldest_generation_of_bigvals` to my list of bigvals + assert(last_node_in_my_list != NULL); + assert(last_node_in_my_list->next == NULL); + last_node_in_my_list->next = oldest_generation_of_bigvals->next; // skip the sentinel + if (oldest_generation_of_bigvals->next != NULL) { + oldest_generation_of_bigvals->next->prev = last_node_in_my_list; + } + oldest_generation_of_bigvals->next = NULL; } gc_time_big_end(); } @@ -1536,7 +1525,7 @@ STATIC_INLINE void gc_sweep_pool_page(gc_page_profiler_serializer_t *s, jl_gc_pa static void gc_sweep_other(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT { sweep_malloced_arrays(); - sweep_big(ptls, sweep_full); + sweep_big(ptls); } static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_NOTSAFEPOINT @@ -3524,7 +3513,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // marking is over // Flush everything in mark cache - gc_sync_all_caches_nolock(ptls); + gc_sync_all_caches(ptls); int64_t live_sz_ub = live_bytes + actual_allocd; int64_t live_sz_est = scanned_bytes + perm_scanned_bytes; @@ -3863,7 +3852,9 @@ void jl_init_thread_heap(jl_ptls_t ptls) small_arraylist_new(&heap->free_stacks[i], 0); heap->mallocarrays = NULL; heap->mafreelist = NULL; - heap->big_objects = NULL; + heap->young_generation_of_bigvals = (bigval_t*)calloc_s(sizeof(bigval_t)); // sentinel + assert(gc_bigval_sentinel_tag != 0); // make sure the sentinel is initialized + heap->young_generation_of_bigvals->header = gc_bigval_sentinel_tag; arraylist_new(&heap->remset, 0); arraylist_new(&ptls->finalizers, 0); arraylist_new(&ptls->gc_tls.sweep_objs, 0); @@ -3871,7 +3862,6 @@ void jl_init_thread_heap(jl_ptls_t ptls) jl_gc_mark_cache_t *gc_cache = &ptls->gc_tls.gc_cache; gc_cache->perm_scanned_bytes = 0; gc_cache->scanned_bytes = 0; - gc_cache->nbig_obj = 0; // Initialize GC mark-queue jl_gc_markqueue_t *mq = &ptls->gc_tls.mark_queue; @@ -3897,12 +3887,16 @@ void jl_gc_init(void) JL_MUTEX_INIT(&heapsnapshot_lock, "heapsnapshot_lock"); JL_MUTEX_INIT(&finalizers_lock, "finalizers_lock"); uv_mutex_init(&page_profile_lock); - uv_mutex_init(&gc_cache_lock); uv_mutex_init(&gc_perm_lock); uv_mutex_init(&gc_threads_lock); uv_cond_init(&gc_threads_cond); uv_sem_init(&gc_sweep_assists_needed, 0); uv_mutex_init(&gc_queue_observer_lock); + void *_addr = (void*)calloc_s(1); // dummy allocation to get the sentinel tag + uintptr_t addr = (uintptr_t)_addr; + gc_bigval_sentinel_tag = addr; + oldest_generation_of_bigvals = (bigval_t*)calloc_s(sizeof(bigval_t)); // sentinel + oldest_generation_of_bigvals->header = gc_bigval_sentinel_tag; jl_gc_init_page(); jl_gc_debug_init(); diff --git a/src/gc.h b/src/gc.h index 1a74d334c98f4..2ec249d322d94 100644 --- a/src/gc.h +++ b/src/gc.h @@ -120,9 +120,11 @@ typedef struct _jl_gc_chunk_t { // layout for big (>2k) objects +extern uintptr_t gc_bigval_sentinel_tag; + JL_EXTENSION typedef struct _bigval_t { struct _bigval_t *next; - struct _bigval_t **prev; // pointer to the next field of the prev entry + struct _bigval_t *prev; size_t sz; #ifdef _P64 // Add padding so that the value is 64-byte aligned // (8 pointers of 8 bytes each) - (4 other pointers in struct) @@ -431,7 +433,7 @@ STATIC_INLINE unsigned ffs_u32(uint32_t bitvec) #endif extern jl_gc_num_t gc_num; -extern bigval_t *big_objects_marked; +extern bigval_t *oldest_generation_of_bigvals; extern arraylist_t finalizer_list_marked; extern arraylist_t to_finalize; extern int64_t buffered_pages; @@ -529,21 +531,30 @@ STATIC_INLINE void *gc_ptr_clear_tag(void *v, uintptr_t mask) JL_NOTSAFEPOINT NOINLINE uintptr_t gc_get_stack_ptr(void); -STATIC_INLINE void gc_big_object_unlink(const bigval_t *hdr) JL_NOTSAFEPOINT +FORCE_INLINE void gc_big_object_unlink(const bigval_t *node) JL_NOTSAFEPOINT { - *hdr->prev = hdr->next; - if (hdr->next) { - hdr->next->prev = hdr->prev; + assert(node != oldest_generation_of_bigvals); + assert(node->header != gc_bigval_sentinel_tag); + assert(node->prev != NULL); + if (node->next != NULL) { + node->next->prev = node->prev; } + node->prev->next = node->next; } -STATIC_INLINE void gc_big_object_link(bigval_t *hdr, bigval_t **list) JL_NOTSAFEPOINT +FORCE_INLINE void gc_big_object_link(bigval_t *sentinel_node, bigval_t *node) JL_NOTSAFEPOINT { - hdr->next = *list; - hdr->prev = list; - if (*list) - (*list)->prev = &hdr->next; - *list = hdr; + assert(sentinel_node != NULL); + assert(sentinel_node->header == gc_bigval_sentinel_tag); + assert(sentinel_node->prev == NULL); + assert(node->header != gc_bigval_sentinel_tag); + // a new node gets linked in at the head of the list + node->next = sentinel_node->next; + node->prev = sentinel_node; + if (sentinel_node->next != NULL) { + sentinel_node->next->prev = node; + } + sentinel_node->next = node; } extern uv_mutex_t gc_threads_lock; From a0d187233a843b2c3a462b3ac1576f8d047552bc Mon Sep 17 00:00:00 2001 From: Diogo Netto <61364108+d-netto@users.noreply.github.com> Date: Mon, 15 Jul 2024 23:42:14 -0300 Subject: [PATCH 4/5] delete some unused fields of jl_gc_mark_cache_t (#55138) They should have been deleted in https://github.com/JuliaLang/julia/pull/54936, but were not. --- src/gc-tls.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/gc-tls.h b/src/gc-tls.h index dc15cda48fa58..a9f711198e914 100644 --- a/src/gc-tls.h +++ b/src/gc-tls.h @@ -69,17 +69,6 @@ typedef struct { size_t perm_scanned_bytes; // thread local increment of `scanned_bytes` size_t scanned_bytes; - // Number of queued big objects (<= 1024) - size_t nbig_obj; - // Array of queued big objects to be moved between the young list - // and the old list. - // A set low bit means that the object should be moved from the old list - // to the young list (`mark_reset_age`). - // Objects can only be put into this list when the mark bit is flipped to - // `1` (atomically). Combining with the sync after marking, - // this makes sure that a single objects can only appear once in - // the lists (the mark bit cannot be flipped to `0` without sweeping) - void *big_obj[1024]; } jl_gc_mark_cache_t; typedef struct { From 23a8e36fb95e7af0b4cb20e7f766b7dfb29c9acc Mon Sep 17 00:00:00 2001 From: Diogo Netto <61364108+d-netto@users.noreply.github.com> Date: Tue, 16 Jul 2024 19:00:06 -0300 Subject: [PATCH 5/5] create separate function to spawn GC threads (#55108) Third-party GCs (e.g. MMTk) will probably have their own function to spawn GC threads. --- src/gc.c | 20 ++++++++++++++++++++ src/gc.h | 1 + src/init.c | 2 ++ src/julia_internal.h | 2 ++ src/threading.c | 26 ++++++++------------------ src/threading.h | 2 ++ 6 files changed, 35 insertions(+), 18 deletions(-) diff --git a/src/gc.c b/src/gc.c index c70eee6905ee9..15d48f600ed80 100644 --- a/src/gc.c +++ b/src/gc.c @@ -3881,6 +3881,26 @@ void jl_init_thread_heap(jl_ptls_t ptls) jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); } +void jl_start_gc_threads(void) +{ + int nthreads = jl_atomic_load_relaxed(&jl_n_threads); + int ngcthreads = jl_n_gcthreads; + int nmutator_threads = nthreads - ngcthreads; + uv_thread_t uvtid; + for (int i = nmutator_threads; i < nthreads; ++i) { + jl_threadarg_t *t = (jl_threadarg_t *)malloc_s(sizeof(jl_threadarg_t)); // ownership will be passed to the thread + t->tid = i; + t->barrier = &thread_init_done; + if (i == nthreads - 1 && jl_n_sweepthreads == 1) { + uv_thread_create(&uvtid, jl_concurrent_gc_threadfun, t); + } + else { + uv_thread_create(&uvtid, jl_parallel_gc_threadfun, t); + } + uv_thread_detach(&uvtid); + } +} + // System-wide initializations void jl_gc_init(void) { diff --git a/src/gc.h b/src/gc.h index 2ec249d322d94..b4d421c708547 100644 --- a/src/gc.h +++ b/src/gc.h @@ -562,6 +562,7 @@ extern uv_cond_t gc_threads_cond; extern uv_sem_t gc_sweep_assists_needed; extern _Atomic(int) gc_n_threads_marking; extern _Atomic(int) gc_n_threads_sweeping; +extern uv_barrier_t thread_init_done; void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq); void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t *fl_parent, jl_value_t **fl_begin, jl_value_t **fl_end) JL_NOTSAFEPOINT; void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start) JL_NOTSAFEPOINT; diff --git a/src/init.c b/src/init.c index 83f528b35ac11..0a0878a081e28 100644 --- a/src/init.c +++ b/src/init.c @@ -899,6 +899,8 @@ static NOINLINE void _finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_ jl_n_threads_per_pool[1] = 0; } jl_start_threads(); + jl_start_gc_threads(); + uv_barrier_wait(&thread_init_done); jl_init_heartbeat(); diff --git a/src/julia_internal.h b/src/julia_internal.h index 75ebad73a4546..9ced95a8f027a 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -942,8 +942,10 @@ extern JL_DLLEXPORT ssize_t jl_tls_offset; extern JL_DLLEXPORT const int jl_tls_elf_support; void jl_init_threading(void); void jl_start_threads(void); +void jl_start_gc_threads(void); // Whether the GC is running +extern uv_mutex_t safepoint_lock; extern char *jl_safepoint_pages; STATIC_INLINE int jl_addr_is_safepoint(uintptr_t addr) { diff --git a/src/threading.c b/src/threading.c index 05607c12085f2..8f350d41f64b1 100644 --- a/src/threading.c +++ b/src/threading.c @@ -712,7 +712,7 @@ void jl_init_threading(void) gc_first_tid = nthreads + nthreadsi; } -static uv_barrier_t thread_init_done; +uv_barrier_t thread_init_done; void jl_start_threads(void) { @@ -751,30 +751,20 @@ void jl_start_threads(void) uv_barrier_init(&thread_init_done, nthreads); // GC/System threads need to be after the worker threads. - int nworker_threads = nthreads - ngcthreads; + int nmutator_threads = nthreads - ngcthreads; - for (i = 1; i < nthreads; ++i) { + for (i = 1; i < nmutator_threads; ++i) { jl_threadarg_t *t = (jl_threadarg_t *)malloc_s(sizeof(jl_threadarg_t)); // ownership will be passed to the thread t->tid = i; t->barrier = &thread_init_done; - if (i < nworker_threads) { - uv_thread_create(&uvtid, jl_threadfun, t); - if (exclusive) { - mask[i] = 1; - uv_thread_setaffinity(&uvtid, mask, NULL, cpumasksize); - mask[i] = 0; - } - } - else if (i == nthreads - 1 && jl_n_sweepthreads == 1) { - uv_thread_create(&uvtid, jl_concurrent_gc_threadfun, t); - } - else { - uv_thread_create(&uvtid, jl_parallel_gc_threadfun, t); + uv_thread_create(&uvtid, jl_threadfun, t); + if (exclusive) { + mask[i] = 1; + uv_thread_setaffinity(&uvtid, mask, NULL, cpumasksize); + mask[i] = 0; } uv_thread_detach(&uvtid); } - - uv_barrier_wait(&thread_init_done); } _Atomic(unsigned) _threadedregion; // HACK: keep track of whether to prioritize IO or threading diff --git a/src/threading.h b/src/threading.h index 260ecffa30dd5..cb26537699713 100644 --- a/src/threading.h +++ b/src/threading.h @@ -12,6 +12,8 @@ extern "C" { #define PROFILE_JL_THREADING 0 +extern uv_barrier_t thread_init_done; + extern _Atomic(jl_ptls_t*) jl_all_tls_states JL_GLOBALLY_ROOTED; /* thread local storage */ typedef struct _jl_threadarg_t {