diff --git a/base/timing.jl b/base/timing.jl
index 5fefd75c158522..5cc9ebc43996fc 100644
--- a/base/timing.jl
+++ b/base/timing.jl
@@ -1,6 +1,6 @@
 # This file is a part of Julia. License is MIT: https://julialang.org/license

-# This type must be kept in sync with the C struct in src/gc.h
+# This type must be kept in sync with the C struct in src/gc-interface.h
 struct GC_Num
     allocd          ::Int64 # GC internal
     deferred_alloc  ::Int64 # GC internal
@@ -47,7 +47,7 @@ gc_total_bytes(gc_num::GC_Num) =
     gc_num.allocd + gc_num.deferred_alloc + gc_num.total_allocd

 function GC_Diff(new::GC_Num, old::GC_Num)
-    # logic from `src/gc.c:jl_gc_total_bytes`
+    # logic from `jl_gc_total_bytes`
     old_allocd = gc_total_bytes(old)
     new_allocd = gc_total_bytes(new)
     return GC_Diff(new_allocd - old_allocd,
diff --git a/doc/src/devdocs/object.md b/doc/src/devdocs/object.md
index a2f72d623ab21a..8134132d6ee753 100644
--- a/doc/src/devdocs/object.md
+++ b/doc/src/devdocs/object.md
@@ -92,7 +92,7 @@ The corresponding global `jl_datatype_t` objects are created by [`jl_init_types`

 The garbage collector uses several bits from the metadata portion of the `jl_typetag_t` to
 track each object in the system. Further details about this algorithm can be found in the
 comments of
-the [garbage collector implementation in `gc.c`](https://github.com/JuliaLang/julia/blob/master/src/gc.c).
+the [garbage collector implementation in `gc-stock.c`](https://github.com/JuliaLang/julia/blob/master/src/gc-stock.c).

 ## Object allocation
@@ -179,7 +179,7 @@ jl_value_t *newstruct(jl_value_t *type);
 jl_value_t *newobj(jl_value_t *type, size_t nfields);
 ```

-And at the lowest level, memory is getting allocated by a call to the garbage collector (in `gc.c`),
+And at the lowest level, memory is allocated by a call to the garbage collector (in `gc-stock.c`),
 then tagged with its type:

 ```c
diff --git a/src/Makefile b/src/Makefile
index 4da44a8cc8d816..6f78f4a8b6aa15 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -44,7 +44,7 @@ SRCS := \
 	jltypes gf typemap smallintset ast builtins module interpreter symbol \
 	dlload sys init task array genericmemory staticdata toplevel jl_uv datatype \
 	simplevector runtime_intrinsics precompile jloptions mtarraylist \
-	threading scheduler stackwalk gc gc-debug gc-pages gc-stacks gc-alloc-profiler gc-page-profiler method \
+	threading scheduler stackwalk gc-common gc-stock gc-debug gc-pages gc-stacks gc-alloc-profiler gc-page-profiler method \
 	jlapi signal-handling safepoint timing subtype rtutils gc-heap-snapshot \
 	crc32c APInt-C processor ircode opaque_closure codegen-stubs coverage runtime_ccall engine
@@ -103,7 +103,7 @@ ifeq ($(USE_SYSTEM_LIBUV),0)
 UV_HEADERS += uv.h
 UV_HEADERS += uv/*.h
 endif
-PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h gc-tls.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h)
+PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h gc-interface.h gc-tls.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h)
 ifeq ($(OS),WINNT)
 PUBLIC_HEADERS += $(addprefix $(SRCDIR)/,win32_ucontext.h)
 endif
@@ -316,11 +316,11 @@ $(BUILDDIR)/codegen.o $(BUILDDIR)/codegen.dbg.obj: $(addprefix $(SRCDIR)/,\
 $(BUILDDIR)/datatype.o $(BUILDDIR)/datatype.dbg.obj: $(SRCDIR)/support/htable.h $(SRCDIR)/support/htable.inc
 $(BUILDDIR)/debuginfo.o $(BUILDDIR)/debuginfo.dbg.obj: $(addprefix $(SRCDIR)/,debuginfo.h processor.h jitlayers.h debug-registry.h)
 $(BUILDDIR)/disasm.o $(BUILDDIR)/disasm.dbg.obj: $(SRCDIR)/debuginfo.h $(SRCDIR)/processor.h
-$(BUILDDIR)/gc-debug.o $(BUILDDIR)/gc-debug.dbg.obj: $(SRCDIR)/gc.h
-$(BUILDDIR)/gc-pages.o $(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc.h
-$(BUILDDIR)/gc.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h $(SRCDIR)/gc-page-profiler.h
-$(BUILDDIR)/gc-heap-snapshot.o $(BUILDDIR)/gc-heap-snapshot.dbg.obj: $(SRCDIR)/gc.h $(SRCDIR)/gc-heap-snapshot.h
-$(BUILDDIR)/gc-alloc-profiler.o $(BUILDDIR)/gc-alloc-profiler.dbg.obj: $(SRCDIR)/gc.h $(SRCDIR)/gc-alloc-profiler.h
+$(BUILDDIR)/gc-debug.o $(BUILDDIR)/gc-debug.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h
+$(BUILDDIR)/gc-pages.o $(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h
+$(BUILDDIR)/gc-stock.o $(BUILDDIR)/gc-stock.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h $(SRCDIR)/gc-page-profiler.h
+$(BUILDDIR)/gc-heap-snapshot.o $(BUILDDIR)/gc-heap-snapshot.dbg.obj: $(SRCDIR)/gc-heap-snapshot.h
+$(BUILDDIR)/gc-alloc-profiler.o $(BUILDDIR)/gc-alloc-profiler.dbg.obj: $(SRCDIR)/gc-alloc-profiler.h
 $(BUILDDIR)/gc-page-profiler.o $(BUILDDIR)/gc-page-profiler.dbg.obj: $(SRCDIR)/gc-page-profiler.h
 $(BUILDDIR)/init.o $(BUILDDIR)/init.dbg.obj: $(SRCDIR)/builtin_proto.h
 $(BUILDDIR)/interpreter.o $(BUILDDIR)/interpreter.dbg.obj: $(SRCDIR)/builtin_proto.h
@@ -331,10 +331,10 @@ $(BUILDDIR)/llvm-alloc-helpers.o $(BUILDDIR)/llvm-alloc-helpers.dbg.obj: $(SRCDI
 $(BUILDDIR)/llvm-alloc-opt.o $(BUILDDIR)/llvm-alloc-opt.dbg.obj: $(SRCDIR)/llvm-codegen-shared.h $(SRCDIR)/llvm-pass-helpers.h $(SRCDIR)/llvm-alloc-helpers.h
 $(BUILDDIR)/llvm-cpufeatures.o $(BUILDDIR)/llvm-cpufeatures.dbg.obj: $(SRCDIR)/jitlayers.h
 $(BUILDDIR)/llvm-demote-float16.o $(BUILDDIR)/llvm-demote-float16.dbg.obj: $(SRCDIR)/jitlayers.h
-$(BUILDDIR)/llvm-final-gc-lowering.o $(BUILDDIR)/llvm-final-gc-lowering.dbg.obj: $(SRCDIR)/llvm-pass-helpers.h $(SRCDIR)/llvm-codegen-shared.h
+$(BUILDDIR)/llvm-final-gc-lowering.o $(BUILDDIR)/llvm-final-gc-lowering.dbg.obj: $(SRCDIR)/llvm-gc-interface-passes.h
 $(BUILDDIR)/llvm-gc-invariant-verifier.o $(BUILDDIR)/llvm-gc-invariant-verifier.dbg.obj: $(SRCDIR)/llvm-codegen-shared.h
 $(BUILDDIR)/llvm-julia-licm.o $(BUILDDIR)/llvm-julia-licm.dbg.obj: $(SRCDIR)/llvm-codegen-shared.h $(SRCDIR)/llvm-alloc-helpers.h $(SRCDIR)/llvm-pass-helpers.h
-$(BUILDDIR)/llvm-late-gc-lowering.o $(BUILDDIR)/llvm-late-gc-lowering.dbg.obj: $(SRCDIR)/llvm-pass-helpers.h $(SRCDIR)/llvm-codegen-shared.h
+$(BUILDDIR)/llvm-late-gc-lowering.o $(BUILDDIR)/llvm-late-gc-lowering.dbg.obj: $(SRCDIR)/llvm-gc-interface-passes.h
 $(BUILDDIR)/llvm-lower-handlers.o $(BUILDDIR)/llvm-lower-handlers.dbg.obj: $(SRCDIR)/llvm-codegen-shared.h
 $(BUILDDIR)/llvm-multiversioning.o $(BUILDDIR)/llvm-multiversioning.dbg.obj: $(SRCDIR)/llvm-codegen-shared.h $(SRCDIR)/processor.h
 $(BUILDDIR)/llvm-pass-helpers.o $(BUILDDIR)/llvm-pass-helpers.dbg.obj: $(SRCDIR)/llvm-pass-helpers.h $(SRCDIR)/llvm-codegen-shared.h
@@ -348,7 +348,7 @@ $(BUILDDIR)/toplevel.o $(BUILDDIR)/toplevel.dbg.obj: $(SRCDIR)/builtin_proto.h
 $(BUILDDIR)/ircode.o $(BUILDDIR)/ircode.dbg.obj: $(SRCDIR)/serialize.h $(SRCDIR)/common_symbols1.inc $(SRCDIR)/common_symbols2.inc
 $(BUILDDIR)/pipeline.o $(BUILDDIR)/pipeline.dbg.obj: $(SRCDIR)/passes.h $(SRCDIR)/jitlayers.h
-$(addprefix $(BUILDDIR)/,threading.o threading.dbg.obj gc.o gc.dbg.obj init.c init.dbg.obj task.o task.dbg.obj): $(addprefix $(SRCDIR)/,threading.h)
+$(addprefix $(BUILDDIR)/,threading.o threading.dbg.obj gc-common.o gc-common.dbg.obj gc-stock.o gc-stock.dbg.obj init.c init.dbg.obj task.o task.dbg.obj): $(addprefix $(SRCDIR)/,threading.h)
 $(addprefix $(BUILDDIR)/,APInt-C.o APInt-C.dbg.obj runtime_intrinsics.o runtime_intrinsics.dbg.obj): $(SRCDIR)/APInt-C.h

 # archive library file rules
diff --git a/src/gc-alloc-profiler.cpp b/src/gc-alloc-profiler.cpp
index c7ee32269138a4..5b462d48cd2def 100644
--- a/src/gc-alloc-profiler.cpp
+++ b/src/gc-alloc-profiler.cpp
@@ -3,7 +3,6 @@
 #include "gc-alloc-profiler.h"

 #include "julia_internal.h"
-#include "gc.h"

 #include "llvm/ADT/SmallVector.h"
diff --git a/src/gc-common.c b/src/gc-common.c
new file mode 100644
index 00000000000000..ee461b576ea9e7
--- /dev/null
+++ b/src/gc-common.c
@@ -0,0 +1,506 @@
+// This file is a part of Julia. License is MIT: https://julialang.org/license
+
+#include "gc-common.h"
+#include "julia.h"
+#include "julia_atomics.h"
+#include "julia_gcext.h"
+#include "julia_assert.h"
+#include "threading.h"
+#ifdef __GLIBC__
+#include <malloc.h> // for malloc_trim
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// =========================================================================== //
+// GC Metrics
+// =========================================================================== //
+
+jl_gc_num_t gc_num = {0};
+
+// =========================================================================== //
+// GC Callbacks
+// =========================================================================== //
+
+jl_gc_callback_list_t *gc_cblist_root_scanner;
+jl_gc_callback_list_t *gc_cblist_task_scanner;
+jl_gc_callback_list_t *gc_cblist_pre_gc;
+jl_gc_callback_list_t *gc_cblist_post_gc;
+jl_gc_callback_list_t *gc_cblist_notify_external_alloc;
+jl_gc_callback_list_t *gc_cblist_notify_external_free;
+jl_gc_callback_list_t *gc_cblist_notify_gc_pressure;
+
+static void jl_gc_register_callback(jl_gc_callback_list_t **list,
+                                    jl_gc_cb_func_t func)
+{
+    while (*list != NULL) {
+        if ((*list)->func == func)
+            return;
+        list = &((*list)->next);
+    }
+    *list = (jl_gc_callback_list_t *)malloc_s(sizeof(jl_gc_callback_list_t));
+    (*list)->next = NULL;
+    (*list)->func = func;
+}
+
+static void jl_gc_deregister_callback(jl_gc_callback_list_t **list,
+                                      jl_gc_cb_func_t func)
+{
+    while (*list != NULL) {
+        if ((*list)->func == func) {
+            jl_gc_callback_list_t *tmp = *list;
+            (*list) = (*list)->next;
+            free(tmp);
+            return;
+        }
+        list = &((*list)->next);
+    }
+}
+
+JL_DLLEXPORT void jl_gc_set_cb_root_scanner(jl_gc_cb_root_scanner_t cb, int enable)
+{
+    if (enable)
+        jl_gc_register_callback(&gc_cblist_root_scanner, (jl_gc_cb_func_t)cb);
+    else
+        jl_gc_deregister_callback(&gc_cblist_root_scanner, (jl_gc_cb_func_t)cb);
+}
+
+JL_DLLEXPORT void jl_gc_set_cb_task_scanner(jl_gc_cb_task_scanner_t cb, int enable)
+{
+    if (enable)
+        jl_gc_register_callback(&gc_cblist_task_scanner, (jl_gc_cb_func_t)cb);
+    else
+        jl_gc_deregister_callback(&gc_cblist_task_scanner, (jl_gc_cb_func_t)cb);
+}
+
+JL_DLLEXPORT void jl_gc_set_cb_pre_gc(jl_gc_cb_pre_gc_t cb, int enable)
+{
+    if (enable)
+        jl_gc_register_callback(&gc_cblist_pre_gc, (jl_gc_cb_func_t)cb);
+    else
+        jl_gc_deregister_callback(&gc_cblist_pre_gc, (jl_gc_cb_func_t)cb);
+}
+
+JL_DLLEXPORT void jl_gc_set_cb_post_gc(jl_gc_cb_post_gc_t cb, int enable)
+{
+    if (enable)
+        jl_gc_register_callback(&gc_cblist_post_gc, (jl_gc_cb_func_t)cb);
+    else
+        jl_gc_deregister_callback(&gc_cblist_post_gc, (jl_gc_cb_func_t)cb);
+}
+
+JL_DLLEXPORT void
jl_gc_set_cb_notify_external_alloc(jl_gc_cb_notify_external_alloc_t cb, int enable) +{ + if (enable) + jl_gc_register_callback(&gc_cblist_notify_external_alloc, (jl_gc_cb_func_t)cb); + else + jl_gc_deregister_callback(&gc_cblist_notify_external_alloc, (jl_gc_cb_func_t)cb); +} + +JL_DLLEXPORT void jl_gc_set_cb_notify_external_free(jl_gc_cb_notify_external_free_t cb, int enable) +{ + if (enable) + jl_gc_register_callback(&gc_cblist_notify_external_free, (jl_gc_cb_func_t)cb); + else + jl_gc_deregister_callback(&gc_cblist_notify_external_free, (jl_gc_cb_func_t)cb); +} + +JL_DLLEXPORT void jl_gc_set_cb_notify_gc_pressure(jl_gc_cb_notify_gc_pressure_t cb, int enable) +{ + if (enable) + jl_gc_register_callback(&gc_cblist_notify_gc_pressure, (jl_gc_cb_func_t)cb); + else + jl_gc_deregister_callback(&gc_cblist_notify_gc_pressure, (jl_gc_cb_func_t)cb); +} + +// =========================================================================== // +// Finalization +// =========================================================================== // + +jl_mutex_t finalizers_lock; +arraylist_t finalizer_list_marked; +arraylist_t to_finalize; +JL_DLLEXPORT _Atomic(int) jl_gc_have_pending_finalizers = 0; + +void schedule_finalization(void *o, void *f) JL_NOTSAFEPOINT +{ + arraylist_push(&to_finalize, o); + arraylist_push(&to_finalize, f); + // doesn't need release, since we'll keep checking (on the reader) until we see the work and + // release our lock, and that will have a release barrier by then + jl_atomic_store_relaxed(&jl_gc_have_pending_finalizers, 1); +} + +void run_finalizer(jl_task_t *ct, void *o, void *ff) +{ + int ptr_finalizer = gc_ptr_tag(o, 1); + o = gc_ptr_clear_tag(o, 3); + if (ptr_finalizer) { + ((void (*)(void*))ff)((void*)o); + return; + } + JL_TRY { + size_t last_age = ct->world_age; + ct->world_age = jl_atomic_load_acquire(&jl_world_counter); + jl_apply_generic((jl_value_t*)ff, (jl_value_t**)&o, 1); + ct->world_age = last_age; + } + JL_CATCH { + jl_printf((JL_STREAM*)STDERR_FILENO, "error in running finalizer: "); + jl_static_show((JL_STREAM*)STDERR_FILENO, jl_current_exception(ct)); + jl_printf((JL_STREAM*)STDERR_FILENO, "\n"); + jlbacktrace(); // written to STDERR_FILENO + } +} + +// if `need_sync` is true, the `list` is the `finalizers` list of another +// thread and we need additional synchronizations +static void finalize_object(arraylist_t *list, jl_value_t *o, + arraylist_t *copied_list, int need_sync) JL_NOTSAFEPOINT +{ + // The acquire load makes sure that the first `len` objects are valid. + // If `need_sync` is true, all mutations of the content should be limited + // to the first `oldlen` elements and no mutation is allowed after the + // new length is published with the `cmpxchg` at the end of the function. + // This way, the mutation should not conflict with the owning thread, + // which only writes to locations later than `len` + // and will not resize the buffer without acquiring the lock. + size_t len = need_sync ? 
jl_atomic_load_acquire((_Atomic(size_t)*)&list->len) : list->len;
+    size_t oldlen = len;
+    void **items = list->items;
+    size_t j = 0;
+    for (size_t i = 0; i < len; i += 2) {
+        void *v = items[i];
+        int move = 0;
+        if (o == (jl_value_t*)gc_ptr_clear_tag(v, 1)) {
+            void *f = items[i + 1];
+            move = 1;
+            arraylist_push(copied_list, v);
+            arraylist_push(copied_list, f);
+        }
+        if (move || __unlikely(!v)) {
+            // remove item
+        }
+        else {
+            if (j < i) {
+                items[j] = items[i];
+                items[j+1] = items[i+1];
+            }
+            j += 2;
+        }
+    }
+    len = j;
+    if (oldlen == len)
+        return;
+    if (need_sync) {
+        // The memset needs to be unconditional since the thread might have
+        // already read the length.
+        // The `memset` (like any other content mutation) has to be done
+        // **before** the `cmpxchg` which publishes the length.
+        memset(&items[len], 0, (oldlen - len) * sizeof(void*));
+        jl_atomic_cmpswap((_Atomic(size_t)*)&list->len, &oldlen, len);
+    }
+    else {
+        list->len = len;
+    }
+}
+
+// The first two entries are assumed to be empty and the rest are assumed to
+// be pointers to `jl_value_t` objects
+static void jl_gc_push_arraylist(jl_task_t *ct, arraylist_t *list) JL_NOTSAFEPOINT
+{
+    void **items = list->items;
+    items[0] = (void*)JL_GC_ENCODE_PUSHARGS(list->len - 2);
+    items[1] = ct->gcstack;
+    ct->gcstack = (jl_gcframe_t*)items;
+}
+
+// Same assumption as `jl_gc_push_arraylist`. Requires the finalizers lock
+// to be held for the current thread and will release the lock when the
+// function returns.
+static void jl_gc_run_finalizers_in_list(jl_task_t *ct, arraylist_t *list) JL_NOTSAFEPOINT_LEAVE
+{
+    // Avoid marking `ct` as non-migratable via an `@async` task (as noted in the docstring
+    // of `finalizer`) in a finalizer:
+    uint8_t sticky = ct->sticky;
+    // empty out the first two entries for the GC frame
+    arraylist_push(list, list->items[0]);
+    arraylist_push(list, list->items[1]);
+    jl_gc_push_arraylist(ct, list);
+    void **items = list->items;
+    size_t len = list->len;
+    JL_UNLOCK_NOGC(&finalizers_lock);
+    // run finalizers in reverse order they were added, so lower-level finalizers run last
+    for (size_t i = len-4; i >= 2; i -= 2)
+        run_finalizer(ct, items[i], items[i + 1]);
+    // first entries were moved last to make room for GC frame metadata
+    run_finalizer(ct, items[len-2], items[len-1]);
+    // matches the jl_gc_push_arraylist above
+    JL_GC_POP();
+    ct->sticky = sticky;
+}
+
+static uint64_t finalizer_rngState[JL_RNG_SIZE];
+
+void jl_rng_split(uint64_t dst[JL_RNG_SIZE], uint64_t src[JL_RNG_SIZE]) JL_NOTSAFEPOINT;
+
+JL_DLLEXPORT void jl_gc_init_finalizer_rng_state(void)
+{
+    jl_rng_split(finalizer_rngState, jl_current_task->rngState);
+}
+
+void run_finalizers(jl_task_t *ct, int finalizers_thread)
+{
+    // Racy fast path:
+    // The race here should be OK since the race can only happen if
+    // another thread is writing to it with the lock held. In such case,
+    // we don't need to run pending finalizers since the writer thread
+    // will flush it.
+ if (to_finalize.len == 0) + return; + JL_LOCK_NOGC(&finalizers_lock); + if (to_finalize.len == 0) { + JL_UNLOCK_NOGC(&finalizers_lock); + return; + } + arraylist_t copied_list; + memcpy(&copied_list, &to_finalize, sizeof(copied_list)); + if (to_finalize.items == to_finalize._space) { + copied_list.items = copied_list._space; + } + jl_atomic_store_relaxed(&jl_gc_have_pending_finalizers, 0); + arraylist_new(&to_finalize, 0); + + uint64_t save_rngState[JL_RNG_SIZE]; + memcpy(&save_rngState[0], &ct->rngState[0], sizeof(save_rngState)); + jl_rng_split(ct->rngState, finalizer_rngState); + + // This releases the finalizers lock. + int8_t was_in_finalizer = ct->ptls->in_finalizer; + ct->ptls->in_finalizer = !finalizers_thread; + jl_gc_run_finalizers_in_list(ct, &copied_list); + ct->ptls->in_finalizer = was_in_finalizer; + arraylist_free(&copied_list); + + memcpy(&ct->rngState[0], &save_rngState[0], sizeof(save_rngState)); +} + +JL_DLLEXPORT void jl_gc_run_pending_finalizers(jl_task_t *ct) +{ + if (ct == NULL) + ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + if (!ptls->in_finalizer && ptls->locks.len == 0 && ptls->finalizers_inhibited == 0 && ptls->engine_nqueued == 0) { + run_finalizers(ct, 0); + } +} + +JL_DLLEXPORT int jl_gc_get_finalizers_inhibited(jl_ptls_t ptls) +{ + if (ptls == NULL) + ptls = jl_current_task->ptls; + return ptls->finalizers_inhibited; +} + +JL_DLLEXPORT void jl_gc_disable_finalizers_internal(void) +{ + jl_ptls_t ptls = jl_current_task->ptls; + ptls->finalizers_inhibited++; +} + +JL_DLLEXPORT void jl_gc_enable_finalizers_internal(void) +{ + jl_task_t *ct = jl_current_task; +#ifdef NDEBUG + ct->ptls->finalizers_inhibited--; +#else + jl_gc_enable_finalizers(ct, 1); +#endif +} + +JL_DLLEXPORT void jl_gc_enable_finalizers(jl_task_t *ct, int on) +{ + if (ct == NULL) + ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + int old_val = ptls->finalizers_inhibited; + int new_val = old_val + (on ? 
-1 : 1);
+    if (new_val < 0) {
+        JL_TRY {
+            jl_error(""); // get a backtrace
+        }
+        JL_CATCH {
+            jl_printf((JL_STREAM*)STDERR_FILENO, "WARNING: GC finalizers already enabled on this thread.\n");
+            // Only print the backtrace once, to avoid spamming the logs
+            static int backtrace_printed = 0;
+            if (backtrace_printed == 0) {
+                backtrace_printed = 1;
+                jlbacktrace(); // written to STDERR_FILENO
+            }
+        }
+        return;
+    }
+    ptls->finalizers_inhibited = new_val;
+    if (jl_atomic_load_relaxed(&jl_gc_have_pending_finalizers)) {
+        jl_gc_run_pending_finalizers(ct);
+    }
+}
+
+JL_DLLEXPORT int8_t jl_gc_is_in_finalizer(void)
+{
+    return jl_current_task->ptls->in_finalizer;
+}
+
+static void schedule_all_finalizers(arraylist_t *flist) JL_NOTSAFEPOINT
+{
+    void **items = flist->items;
+    size_t len = flist->len;
+    for(size_t i = 0; i < len; i+=2) {
+        void *v = items[i];
+        void *f = items[i + 1];
+        if (__unlikely(!v))
+            continue;
+        schedule_finalization(v, f);
+    }
+    flist->len = 0;
+}
+
+void jl_gc_run_all_finalizers(jl_task_t *ct)
+{
+    int gc_n_threads;
+    jl_ptls_t* gc_all_tls_states;
+    gc_n_threads = jl_atomic_load_acquire(&jl_n_threads);
+    gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states);
+    // this is called from `jl_atexit_hook`; threads could still be running
+    // so we have to guard the finalizers' lists
+    JL_LOCK_NOGC(&finalizers_lock);
+    schedule_all_finalizers(&finalizer_list_marked);
+    for (int i = 0; i < gc_n_threads; i++) {
+        jl_ptls_t ptls2 = gc_all_tls_states[i];
+        if (ptls2 != NULL)
+            schedule_all_finalizers(&ptls2->finalizers);
+    }
+    // unlock here because `run_finalizers` locks this
+    JL_UNLOCK_NOGC(&finalizers_lock);
+    run_finalizers(ct, 1);
+}
+
+void jl_gc_add_finalizer_(jl_ptls_t ptls, void *v, void *f) JL_NOTSAFEPOINT
+{
+    assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_STATE_UNSAFE);
+    arraylist_t *a = &ptls->finalizers;
+    // This acquire load and the release store at the end are used to
+    // synchronize with `finalize_object` on another thread. Apart from the GC,
+    // which is blocked by entering an unsafe region, there might be only
+    // one other thread accessing our list in `finalize_object`
+    // (only one thread since it needs to acquire the finalizer lock).
+    // Similar to `finalize_object`, all content mutation has to be done
+    // between the acquire and the release of the length.
+    size_t oldlen = jl_atomic_load_acquire((_Atomic(size_t)*)&a->len);
+    if (__unlikely(oldlen + 2 > a->max)) {
+        JL_LOCK_NOGC(&finalizers_lock);
+        // `a->len` might have been modified.
+ // Another possibility is to always grow the array to `oldlen + 2` but + // it's simpler this way and uses slightly less memory =) + oldlen = a->len; + arraylist_grow(a, 2); + a->len = oldlen; + JL_UNLOCK_NOGC(&finalizers_lock); + } + void **items = a->items; + items[oldlen] = v; + items[oldlen + 1] = f; + jl_atomic_store_release((_Atomic(size_t)*)&a->len, oldlen + 2); +} + +JL_DLLEXPORT void jl_gc_add_ptr_finalizer(jl_ptls_t ptls, jl_value_t *v, void *f) JL_NOTSAFEPOINT +{ + jl_gc_add_finalizer_(ptls, (void*)(((uintptr_t)v) | 1), f); +} + +// schedule f(v) to call at the next quiescent interval (aka after the next safepoint/region on all threads) +JL_DLLEXPORT void jl_gc_add_quiescent(jl_ptls_t ptls, void **v, void *f) JL_NOTSAFEPOINT +{ + assert(!gc_ptr_tag(v, 3)); + jl_gc_add_finalizer_(ptls, (void*)(((uintptr_t)v) | 3), f); +} + +JL_DLLEXPORT void jl_gc_add_finalizer_th(jl_ptls_t ptls, jl_value_t *v, jl_function_t *f) JL_NOTSAFEPOINT +{ + if (__unlikely(jl_typetagis(f, jl_voidpointer_type))) { + jl_gc_add_ptr_finalizer(ptls, v, jl_unbox_voidpointer(f)); + } + else { + jl_gc_add_finalizer_(ptls, v, f); + } +} + +JL_DLLEXPORT void jl_gc_add_finalizer(jl_value_t *v, jl_function_t *f) +{ + jl_ptls_t ptls = jl_current_task->ptls; + jl_gc_add_finalizer_th(ptls, v, f); +} + +JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o) +{ + JL_LOCK_NOGC(&finalizers_lock); + // Copy the finalizers into a temporary list so that code in the finalizer + // won't change the list as we loop through them. + // This list is also used as the GC frame when we are running the finalizers + arraylist_t copied_list; + arraylist_new(&copied_list, 0); + // No need to check the to_finalize list since the user is apparently + // still holding a reference to the object + int gc_n_threads; + jl_ptls_t* gc_all_tls_states; + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 != NULL) + finalize_object(&ptls2->finalizers, o, &copied_list, jl_atomic_load_relaxed(&ct->tid) != i); + } + finalize_object(&finalizer_list_marked, o, &copied_list, 0); + if (copied_list.len > 0) { + // This releases the finalizers lock. + jl_gc_run_finalizers_in_list(ct, &copied_list); + } + else { + JL_UNLOCK_NOGC(&finalizers_lock); + } + arraylist_free(&copied_list); +} + +JL_DLLEXPORT void jl_finalize(jl_value_t *o) +{ + jl_finalize_th(jl_current_task, o); +} + +// =========================================================================== // +// Threading +// =========================================================================== // + +int gc_n_threads; +jl_ptls_t* gc_all_tls_states; + +// =========================================================================== // +// MISC +// =========================================================================== // + +const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 +JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT +{ + return jl_buff_tag; +} + +// callback for passing OOM errors from gmp +JL_DLLEXPORT void jl_throw_out_of_memory_error(void) +{ + jl_throw(jl_memory_exception); +} + +#ifdef __cplusplus +} +#endif diff --git a/src/gc-common.h b/src/gc-common.h new file mode 100644 index 00000000000000..4d53830442a7dc --- /dev/null +++ b/src/gc-common.h @@ -0,0 +1,176 @@ +// This file is a part of Julia. 
License is MIT: https://julialang.org/license
+
+#ifndef JL_GC_COMMON_H
+#define JL_GC_COMMON_H
+
+#include "julia.h"
+#include "julia_internal.h"
+#ifndef _OS_WINDOWS_
+#include <sys/mman.h>
+#if defined(_OS_DARWIN_) && !defined(MAP_ANONYMOUS)
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// =========================================================================== //
+// GC Callbacks
+// =========================================================================== //
+
+typedef void (*jl_gc_cb_func_t)(void);
+
+typedef struct _jl_gc_callback_list_t {
+    struct _jl_gc_callback_list_t *next;
+    jl_gc_cb_func_t func;
+} jl_gc_callback_list_t;
+
+extern jl_gc_callback_list_t *gc_cblist_root_scanner;
+extern jl_gc_callback_list_t *gc_cblist_task_scanner;
+extern jl_gc_callback_list_t *gc_cblist_pre_gc;
+extern jl_gc_callback_list_t *gc_cblist_post_gc;
+extern jl_gc_callback_list_t *gc_cblist_notify_external_alloc;
+extern jl_gc_callback_list_t *gc_cblist_notify_external_free;
+extern jl_gc_callback_list_t *gc_cblist_notify_gc_pressure;
+
+#define gc_invoke_callbacks(ty, list, args) \
+    do { \
+        for (jl_gc_callback_list_t *cb = list; \
+             cb != NULL; \
+             cb = cb->next) \
+        { \
+            ((ty)(cb->func)) args; \
+        } \
+    } while (0)
+
+#ifdef __cplusplus
+}
+#endif
+
+// =========================================================================== //
+// malloc wrappers, aligned allocation
+// =========================================================================== //
+
+#if defined(_OS_WINDOWS_)
+STATIC_INLINE void *jl_malloc_aligned(size_t sz, size_t align)
+{
+    return _aligned_malloc(sz ? sz : 1, align);
+}
+STATIC_INLINE void *jl_realloc_aligned(void *p, size_t sz, size_t oldsz,
+                                       size_t align)
+{
+    (void)oldsz;
+    return _aligned_realloc(p, sz ? sz : 1, align);
+}
+STATIC_INLINE void jl_free_aligned(void *p) JL_NOTSAFEPOINT
+{
+    _aligned_free(p);
+}
+#else
+STATIC_INLINE void *jl_malloc_aligned(size_t sz, size_t align)
+{
+#if defined(_P64) || defined(__APPLE__)
+    if (align <= 16)
+        return malloc(sz);
+#endif
+    void *ptr;
+    if (posix_memalign(&ptr, align, sz))
+        return NULL;
+    return ptr;
+}
+STATIC_INLINE void *jl_realloc_aligned(void *d, size_t sz, size_t oldsz,
+                                       size_t align)
+{
+#if defined(_P64) || defined(__APPLE__)
+    if (align <= 16)
+        return realloc(d, sz);
+#endif
+    void *b = jl_malloc_aligned(sz, align);
+    if (b != NULL) {
+        memcpy(b, d, oldsz > sz ? sz : oldsz);
+        free(d);
+    }
+    return b;
+}
+STATIC_INLINE void jl_free_aligned(void *p) JL_NOTSAFEPOINT
+{
+    free(p);
+}
+#endif
+#define malloc_cache_align(sz) jl_malloc_aligned(sz, JL_CACHE_BYTE_ALIGNMENT)
+#define realloc_cache_align(p, sz, oldsz) jl_realloc_aligned(p, sz, oldsz, JL_CACHE_BYTE_ALIGNMENT)
+
+// =========================================================================== //
+// Pointer tagging
+// =========================================================================== //
+
+STATIC_INLINE int gc_marked(uintptr_t bits) JL_NOTSAFEPOINT
+{
+    return (bits & GC_MARKED) != 0;
+}
+
+STATIC_INLINE int gc_old(uintptr_t bits) JL_NOTSAFEPOINT
+{
+    return (bits & GC_OLD) != 0;
+}
+
+STATIC_INLINE uintptr_t gc_set_bits(uintptr_t tag, int bits) JL_NOTSAFEPOINT
+{
+    return (tag & ~(uintptr_t)3) | bits;
+}
+
+STATIC_INLINE uintptr_t gc_ptr_tag(void *v, uintptr_t mask) JL_NOTSAFEPOINT
+{
+    return ((uintptr_t)v) & mask;
+}
+
+STATIC_INLINE void *gc_ptr_clear_tag(void *v, uintptr_t mask) JL_NOTSAFEPOINT
+{
+    return (void*)(((uintptr_t)v) & ~mask);
+}
+
+// =========================================================================== //
+// GC Metrics
+// =========================================================================== //
+
+extern jl_gc_num_t gc_num;
+
+// =========================================================================== //
+// Stop-the-world for GC
+// =========================================================================== //
+void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads);
+
+// =========================================================================== //
+// Finalization
+// =========================================================================== //
+
+// Protect all access to `finalizer_list_marked` and `to_finalize`.
+// For accessing `ptls->finalizers`, the lock is needed if a thread
+// is going to realloc the buffer (of its own list) or access the
+// list of another thread
+extern jl_mutex_t finalizers_lock;
+// `ptls->finalizers` and `finalizer_list_marked` might have tagged pointers.
+// If an object pointer has the lowest bit set, the next pointer is an unboxed c function pointer.
+// If an object pointer has the second lowest bit set, the current pointer is a c object pointer.
+// It must be aligned to at least 4, and it is finalized immediately (at "quiescence").
+// `to_finalize` should not have tagged pointers.
+extern arraylist_t finalizer_list_marked;
+extern arraylist_t to_finalize;
+
+void schedule_finalization(void *o, void *f) JL_NOTSAFEPOINT;
+void run_finalizer(jl_task_t *ct, void *o, void *ff);
+void run_finalizers(jl_task_t *ct, int finalizers_thread);
+JL_DLLEXPORT void jl_gc_add_finalizer_th(jl_ptls_t ptls, jl_value_t *v, jl_function_t *f) JL_NOTSAFEPOINT;
+JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o);
+
+
+// =========================================================================== //
+// Threading
+// =========================================================================== //
+
+extern int gc_n_threads;
+extern jl_ptls_t* gc_all_tls_states;
+
+#endif // JL_GC_COMMON_H
diff --git a/src/gc-debug.c b/src/gc-debug.c
index a7699cc3d0168c..ec3c8d731edd89 100644
--- a/src/gc-debug.c
+++ b/src/gc-debug.c
@@ -1,6 +1,7 @@
 // This file is a part of Julia.
License is MIT: https://julialang.org/license -#include "gc.h" +#include "gc-common.h" +#include "gc-stock.h" #include "julia.h" #include #include diff --git a/src/gc-heap-snapshot.cpp b/src/gc-heap-snapshot.cpp index 77a6e70a127e6e..b84d1f96f273ce 100644 --- a/src/gc-heap-snapshot.cpp +++ b/src/gc-heap-snapshot.cpp @@ -2,9 +2,9 @@ #include "gc-heap-snapshot.h" +#include "julia.h" #include "julia_internal.h" #include "julia_assert.h" -#include "gc.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" @@ -183,7 +183,8 @@ struct HeapSnapshot { // when snapshotting is on. int gc_heap_snapshot_enabled = 0; HeapSnapshot *g_snapshot = nullptr; -extern jl_mutex_t heapsnapshot_lock; +// mutex for gc-heap-snapshot. +jl_mutex_t heapsnapshot_lock; void final_serialize_heap_snapshot(ios_t *json, ios_t *strings, HeapSnapshot &snapshot, char all_one); void serialize_heap_snapshot(ios_t *stream, HeapSnapshot &snapshot, char all_one); diff --git a/src/gc-heap-snapshot.h b/src/gc-heap-snapshot.h index 70884f5f62d6a5..a58f58aba54586 100644 --- a/src/gc-heap-snapshot.h +++ b/src/gc-heap-snapshot.h @@ -35,6 +35,7 @@ void _gc_heap_snapshot_record_finlist(jl_value_t *finlist, size_t index) JL_NOTS extern int gc_heap_snapshot_enabled; extern int prev_sweep_full; +extern jl_mutex_t heapsnapshot_lock; int gc_slot_to_fieldidx(void *_obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT; int gc_slot_to_arrayidx(void *_obj, void *begin) JL_NOTSAFEPOINT; diff --git a/src/gc-interface.h b/src/gc-interface.h new file mode 100644 index 00000000000000..bcbe3c4bea6dbf --- /dev/null +++ b/src/gc-interface.h @@ -0,0 +1,235 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +/* + Garbage Collection interface that must be implemented by third-party GCs +*/ + +#ifndef JL_GC_INTERFACE_H +#define JL_GC_INTERFACE_H + +#include "dtypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct _jl_tls_states_t; +struct _jl_value_t; +struct _jl_weakref_t; +struct _jl_datatype_t; + +// ========================================================================= // +// GC Metrics +// ========================================================================= // + +// This struct must be kept in sync with the Julia type of the same name in base/timing.jl +typedef struct { + int64_t allocd; + int64_t deferred_alloc; + int64_t freed; + uint64_t malloc; + uint64_t realloc; + uint64_t poolalloc; + uint64_t bigalloc; + uint64_t freecall; + uint64_t total_time; + uint64_t total_allocd; + size_t interval; + int pause; + int full_sweep; + uint64_t max_pause; + uint64_t max_memory; + uint64_t time_to_safepoint; + uint64_t max_time_to_safepoint; + uint64_t total_time_to_safepoint; + uint64_t sweep_time; + uint64_t mark_time; + uint64_t total_sweep_time; + uint64_t total_mark_time; + uint64_t last_full_sweep; + uint64_t last_incremental_sweep; +} jl_gc_num_t; + +// ========================================================================= // +// System-wide Initialization +// ========================================================================= // + +// System-wide initialization function. Responsible for initializing global locks as well as +// global memory parameters (e.g. target heap size) used by the collector. +void jl_gc_init(void); +// Spawns GC threads. 
+void jl_start_gc_threads(void);
+
+// ========================================================================= //
+// Per-thread Initialization
+// ========================================================================= //
+
+// Initializes thread-local data structures such as thread-local object pools,
+// thread-local remembered sets and thread-local allocation counters.
+// Should be called exactly once per Julia thread.
+void jl_init_thread_heap(struct _jl_tls_states_t *ptls) JL_NOTSAFEPOINT;
+// Deallocates any memory previously used for thread-local GC data structures.
+// Mostly used to ensure that we perform this memory cleanup for foreign threads that are
+// about to leave Julia.
+void jl_free_thread_gc_state(struct _jl_tls_states_t *ptls);
+
+// ========================================================================= //
+// Controls
+// ========================================================================= //
+
+typedef enum {
+    JL_GC_AUTO = 0,        // use heuristics to determine the collection type
+    JL_GC_FULL = 1,        // force a full collection
+    JL_GC_INCREMENTAL = 2, // force an incremental collection
+} jl_gc_collection_t;
+// Enables or disables (depending on the value of the argument) the collector. Returns
+// whether GC was previously enabled.
+JL_DLLEXPORT int jl_gc_enable(int on);
+// Returns whether the collector is enabled.
+JL_DLLEXPORT int jl_gc_is_enabled(void);
+// Sets a soft limit on Julia's heap.
+JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem);
+// Runs a GC cycle. This function's parameter determines whether we're running an
+// incremental, full, or automatic (i.e. heuristic driven) collection. A collection may be
+// rerun internally (e.g. a full mark right after a full sweep to ensure we do a full heap
+// traversal).
+JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection);
+
+// ========================================================================= //
+// Metrics
+// ========================================================================= //
+
+// Retrieves Julia's `GC_Num` (structure that stores GC statistics).
+JL_DLLEXPORT jl_gc_num_t jl_gc_num(void);
+// Returns the difference between the current total live bytes (live bytes at the last
+// collection plus the number of bytes allocated since then) and the value recorded the
+// last time this function was called.
+JL_DLLEXPORT int64_t jl_gc_diff_total_bytes(void) JL_NOTSAFEPOINT;
+// Returns the difference between the current total live bytes (live bytes at the last
+// collection plus the number of bytes allocated since then) and the value recorded the
+// last time this function was called, minus the offset parameter.
+JL_DLLEXPORT int64_t jl_gc_sync_total_bytes(int64_t offset) JL_NOTSAFEPOINT;
+// Returns the number of pool-allocated bytes. GC implementations that do not use pools
+// may always return 0.
+JL_DLLEXPORT int64_t jl_gc_pool_live_bytes(void);
+// Returns the number of live bytes at the end of the last collection cycle
+// (doesn't include the number of allocated bytes since then).
+JL_DLLEXPORT int64_t jl_gc_live_bytes(void);
+// Stores the number of live bytes at the end of the last collection cycle plus the number
+// of bytes we allocated since then into the 64-bit integer pointer passed as an argument.
+JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT;
+// Retrieves the value of Julia's soft heap limit.
+JL_DLLEXPORT uint64_t jl_gc_get_max_memory(void);
+// High-resolution (nano-seconds) value of total time spent in GC.
+JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void);
+
+// ========================================================================= //
+// Allocation
+// ========================================================================= //
+
+// Allocates small objects and increments Julia allocation counters.
+// The (possibly unused in some implementations) offset to the arena in which we're
+// allocating is passed in the second parameter, and the object size in the third parameter.
+// If thread-local allocators are used, then this function should allocate in the
+// thread-local allocator of the thread referenced by the jl_ptls_t argument. An additional
+// (last) parameter containing information about the type of the object being allocated may
+// be used to record an allocation of that type in the allocation profiler.
+JL_DLLEXPORT struct _jl_value_t *jl_gc_small_alloc(struct _jl_tls_states_t *ptls,
+                                                   int offset, int osize,
+                                                   struct _jl_value_t *type);
+// Allocates large objects and increments Julia allocation counters.
+// If thread-local allocators are used, then this function should allocate in the
+// thread-local allocator of the thread referenced by the jl_ptls_t argument. An additional
+// (last) parameter containing information about the type of the object being allocated may
+// be used to record an allocation of that type in the allocation profiler.
+JL_DLLEXPORT struct _jl_value_t *jl_gc_big_alloc(struct _jl_tls_states_t *ptls, size_t sz,
+                                                 struct _jl_value_t *type);
+// Wrapper around Libc malloc that updates Julia allocation counters.
+JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz);
+// Wrapper around Libc calloc that updates Julia allocation counters.
+JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz);
+// Wrapper around Libc free that updates Julia allocation counters.
+JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz);
+// Wrapper around Libc realloc that updates Julia allocation counters.
+JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz);
+// Wrapper around Libc malloc that allocates a memory region with a few additional machine
+// words before the actual payload that are used to record the size of the requested
+// allocation. Also updates Julia allocation counters.
+JL_DLLEXPORT void *jl_malloc(size_t sz);
+// Wrapper around Libc calloc that allocates a memory region with a few additional machine
+// words before the actual payload that are used to record the size of the requested
+// allocation. Also updates Julia allocation counters.
+JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz);
+// Wrapper around Libc free that takes a memory region allocated with jl_malloc or
+// jl_calloc, and uses the size information stored in the first machine words of the memory
+// buffer to update Julia allocation counters, freeing the corresponding memory buffer in
+// the end.
+JL_DLLEXPORT void jl_free(void *p);
+// Wrapper around Libc realloc that takes a memory region allocated with jl_malloc or
+// jl_calloc, and uses the size information stored in the first machine words of the memory
+// buffer to update Julia allocation counters, reallocating the corresponding memory buffer
+// in the end.
+JL_DLLEXPORT void *jl_realloc(void *p, size_t sz);
+// Wrapper around Libc malloc that's used to dynamically allocate memory for Arrays and
+// Strings.
It increments Julia allocation counters and should check whether we're close to +// the Julia heap target, and therefore, whether we should run a collection. Note that this +// doesn't record the size of the allocation request in a side metadata (i.e. a few words in +// front of the memory payload): this function is used for Julia object allocations, and we +// assume that there is already a field in the Julia object being allocated that we may use +// to store the size of the memory buffer. +JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz); +// Allocates a new weak-reference, assigns its value and increments Julia allocation +// counters. If thread-local allocators are used, then this function should allocate in the +// thread-local allocator of the thread referenced by the first jl_ptls_t argument. +JL_DLLEXPORT struct _jl_weakref_t *jl_gc_new_weakref_th(struct _jl_tls_states_t *ptls, + struct _jl_value_t *value); +// Allocates a new weak-reference, assigns its value and increments Julia allocation +// counters. If thread-local allocators are used, then this function should allocate in the +// thread-local allocator of the current thread. +JL_DLLEXPORT struct _jl_weakref_t *jl_gc_new_weakref(struct _jl_value_t *value); +// Allocates an object whose size is specified by the function argument and increments Julia +// allocation counters. If thread-local allocators are used, then this function should +// allocate in the thread-local allocator of the current thread. +JL_DLLEXPORT struct _jl_value_t *jl_gc_allocobj(size_t sz); +// Permanently allocates a memory slot whose size is specified by the first parameter. +// The second parameter specifies whether the memory should be filled with zeros, +// and the third and fourth parameters specify alignment and offset of the corresponding +// payload compared to the start of the memory block. +JL_DLLEXPORT void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, + unsigned offset) JL_NOTSAFEPOINT; +// TODO: Document this function. +struct _jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT; + +// ========================================================================= // +// Runtime Write-Barriers +// ========================================================================= // + +// Write barrier slow-path. If a generational collector is used, +// it may enqueue an old object into the remembered set of the calling thread. +JL_DLLEXPORT void jl_gc_queue_root(const struct _jl_value_t *ptr); +// TODO: Document this function. +JL_DLLEXPORT void jl_gc_queue_multiroot(const struct _jl_value_t *root, const void *stored, + struct _jl_datatype_t *dt) JL_NOTSAFEPOINT; +// If a generational collector is used, checks whether the function argument points to an +// old object, and if so, calls the write barrier slow path above. In most cases, this +// function is used when its caller has verified that there is a young reference in the +// object that's being passed as an argument to this function. +STATIC_INLINE void jl_gc_wb_back(const void *ptr) JL_NOTSAFEPOINT; +// Write barrier function that must be used after pointer writes to heap-allocated objects – +// the value of the field being written must also point to a heap-allocated object. +// If a generational collector is used, it may check whether the two function arguments are +// in different GC generations (i.e. if the first argument points to an old object and the +// second argument points to a young object), and if so, call the write barrier slow-path. 
+STATIC_INLINE void jl_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT;
+// Write-barrier function that must be used after copying multiple fields of an object into
+// another. It should be semantically equivalent to triggering multiple write barriers – one
+// per field of the object being copied, but may be special-cased for performance reasons.
+STATIC_INLINE void jl_gc_multi_wb(const void *parent,
+                                  const struct _jl_value_t *ptr) JL_NOTSAFEPOINT;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/gc-page-profiler.c b/src/gc-page-profiler.c
index 2e876e4b7b4d6c..2625fa812781a7 100644
--- a/src/gc-page-profiler.c
+++ b/src/gc-page-profiler.c
@@ -1,6 +1,7 @@
 // This file is a part of Julia. License is MIT: https://julialang.org/license

 #include "gc-page-profiler.h"
+#include "julia.h"

 #ifdef __cplusplus
 extern "C" {
diff --git a/src/gc-page-profiler.h b/src/gc-page-profiler.h
index 28989f8f8e206f..0dd72ad072fa91 100644
--- a/src/gc-page-profiler.h
+++ b/src/gc-page-profiler.h
@@ -3,7 +3,7 @@
 #ifndef GC_PAGE_PROFILER_H
 #define GC_PAGE_PROFILER_H

-#include "gc.h"
+#include "gc-stock.h"

 #ifdef __cplusplus
 extern "C" {
diff --git a/src/gc-pages.c b/src/gc-pages.c
index 971dbe92d7fac6..71d59de29166f5 100644
--- a/src/gc-pages.c
+++ b/src/gc-pages.c
@@ -1,6 +1,7 @@
 // This file is a part of Julia. License is MIT: https://julialang.org/license

-#include "gc.h"
+#include "gc-common.h"
+#include "gc-stock.h"
 #ifndef _OS_WINDOWS_
 #  include <sys/mman.h>
 #endif
diff --git a/src/gc-stacks.c b/src/gc-stacks.c
index 2f6075b56b8ef1..5706f4ce67c1dd 100644
--- a/src/gc-stacks.c
+++ b/src/gc-stacks.c
@@ -1,6 +1,7 @@
 // This file is a part of Julia. License is MIT: https://julialang.org/license

-#include "gc.h"
+#include "gc-common.h"
+#include "threading.h"
 #ifndef _OS_WINDOWS_
 #  include <sys/mman.h>
 #endif
diff --git a/src/gc.c b/src/gc-stock.c
similarity index 89%
rename from src/gc.c
rename to src/gc-stock.c
index 3ab0aa4b1bf1b4..fa6a9325d4983e 100644
--- a/src/gc.c
+++ b/src/gc-stock.c
@@ -1,6 +1,9 @@
 // This file is a part of Julia.
License is MIT: https://julialang.org/license -#include "gc.h" +#include "gc-common.h" +#include "gc-stock.h" +#include "gc-alloc-profiler.h" +#include "gc-heap-snapshot.h" #include "gc-page-profiler.h" #include "julia.h" #include "julia_atomics.h" @@ -38,125 +41,6 @@ uv_mutex_t gc_queue_observer_lock; // Tag for sentinel nodes in bigval list uintptr_t gc_bigval_sentinel_tag; -// Linked list of callback functions - -typedef void (*jl_gc_cb_func_t)(void); - -typedef struct jl_gc_callback_list_t { - struct jl_gc_callback_list_t *next; - jl_gc_cb_func_t func; -} jl_gc_callback_list_t; - -static jl_gc_callback_list_t *gc_cblist_root_scanner; -static jl_gc_callback_list_t *gc_cblist_task_scanner; -static jl_gc_callback_list_t *gc_cblist_pre_gc; -static jl_gc_callback_list_t *gc_cblist_post_gc; -static jl_gc_callback_list_t *gc_cblist_notify_external_alloc; -static jl_gc_callback_list_t *gc_cblist_notify_external_free; -static jl_gc_callback_list_t *gc_cblist_notify_gc_pressure; - -#define gc_invoke_callbacks(ty, list, args) \ - do { \ - for (jl_gc_callback_list_t *cb = list; \ - cb != NULL; \ - cb = cb->next) \ - { \ - ((ty)(cb->func)) args; \ - } \ - } while (0) - -static void jl_gc_register_callback(jl_gc_callback_list_t **list, - jl_gc_cb_func_t func) -{ - while (*list != NULL) { - if ((*list)->func == func) - return; - list = &((*list)->next); - } - *list = (jl_gc_callback_list_t *)malloc_s(sizeof(jl_gc_callback_list_t)); - (*list)->next = NULL; - (*list)->func = func; -} - -static void jl_gc_deregister_callback(jl_gc_callback_list_t **list, - jl_gc_cb_func_t func) -{ - while (*list != NULL) { - if ((*list)->func == func) { - jl_gc_callback_list_t *tmp = *list; - (*list) = (*list)->next; - free(tmp); - return; - } - list = &((*list)->next); - } -} - -JL_DLLEXPORT void jl_gc_set_cb_root_scanner(jl_gc_cb_root_scanner_t cb, int enable) -{ - if (enable) - jl_gc_register_callback(&gc_cblist_root_scanner, (jl_gc_cb_func_t)cb); - else - jl_gc_deregister_callback(&gc_cblist_root_scanner, (jl_gc_cb_func_t)cb); -} - -JL_DLLEXPORT void jl_gc_set_cb_task_scanner(jl_gc_cb_task_scanner_t cb, int enable) -{ - if (enable) - jl_gc_register_callback(&gc_cblist_task_scanner, (jl_gc_cb_func_t)cb); - else - jl_gc_deregister_callback(&gc_cblist_task_scanner, (jl_gc_cb_func_t)cb); -} - -JL_DLLEXPORT void jl_gc_set_cb_pre_gc(jl_gc_cb_pre_gc_t cb, int enable) -{ - if (enable) - jl_gc_register_callback(&gc_cblist_pre_gc, (jl_gc_cb_func_t)cb); - else - jl_gc_deregister_callback(&gc_cblist_pre_gc, (jl_gc_cb_func_t)cb); -} - -JL_DLLEXPORT void jl_gc_set_cb_post_gc(jl_gc_cb_post_gc_t cb, int enable) -{ - if (enable) - jl_gc_register_callback(&gc_cblist_post_gc, (jl_gc_cb_func_t)cb); - else - jl_gc_deregister_callback(&gc_cblist_post_gc, (jl_gc_cb_func_t)cb); -} - -JL_DLLEXPORT void jl_gc_set_cb_notify_external_alloc(jl_gc_cb_notify_external_alloc_t cb, int enable) -{ - if (enable) - jl_gc_register_callback(&gc_cblist_notify_external_alloc, (jl_gc_cb_func_t)cb); - else - jl_gc_deregister_callback(&gc_cblist_notify_external_alloc, (jl_gc_cb_func_t)cb); -} - -JL_DLLEXPORT void jl_gc_set_cb_notify_external_free(jl_gc_cb_notify_external_free_t cb, int enable) -{ - if (enable) - jl_gc_register_callback(&gc_cblist_notify_external_free, (jl_gc_cb_func_t)cb); - else - jl_gc_deregister_callback(&gc_cblist_notify_external_free, (jl_gc_cb_func_t)cb); -} - -JL_DLLEXPORT void jl_gc_set_cb_notify_gc_pressure(jl_gc_cb_notify_gc_pressure_t cb, int enable) -{ - if (enable) - jl_gc_register_callback(&gc_cblist_notify_gc_pressure, 
(jl_gc_cb_func_t)cb); - else - jl_gc_deregister_callback(&gc_cblist_notify_gc_pressure, (jl_gc_cb_func_t)cb); -} - -// Protect all access to `finalizer_list_marked` and `to_finalize`. -// For accessing `ptls->finalizers`, the lock is needed if a thread -// is going to realloc the buffer (of its own list) or accessing the -// list of another thread -static jl_mutex_t finalizers_lock; - -// mutex for gc-heap-snapshot. -jl_mutex_t heapsnapshot_lock; - // Flag that tells us whether we need to support conservative marking // of objects. static _Atomic(int) support_conservative_marking = 0; @@ -193,406 +77,11 @@ static _Atomic(int) support_conservative_marking = 0; * finalizers in unmanaged (GC safe) mode. */ -jl_gc_num_t gc_num = {0}; -static size_t last_long_collect_interval; -int gc_n_threads; -jl_ptls_t* gc_all_tls_states; gc_heapstatus_t gc_heap_stats = {0}; -int next_sweep_full = 0; -const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 -JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT -{ - return jl_buff_tag; -} // List of big objects in oldest generation (`GC_OLD_MARKED`). Not per-thread. Accessed only by master thread. bigval_t *oldest_generation_of_bigvals = NULL; -// -- Finalization -- -// `ptls->finalizers` and `finalizer_list_marked` might have tagged pointers. -// If an object pointer has the lowest bit set, the next pointer is an unboxed c function pointer. -// If an object pointer has the second lowest bit set, the current pointer is a c object pointer. -// It must be aligned at least 4, and it finalized immediately (at "quiescence"). -// `to_finalize` should not have tagged pointers. -arraylist_t finalizer_list_marked; -arraylist_t to_finalize; -JL_DLLEXPORT _Atomic(int) jl_gc_have_pending_finalizers = 0; - -void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads); - -// malloc wrappers, aligned allocation - -#if defined(_OS_WINDOWS_) -STATIC_INLINE void *jl_malloc_aligned(size_t sz, size_t align) -{ - return _aligned_malloc(sz ? 
sz : 1, align); -} -STATIC_INLINE void jl_free_aligned(void *p) JL_NOTSAFEPOINT -{ - _aligned_free(p); -} -#else -STATIC_INLINE void *jl_malloc_aligned(size_t sz, size_t align) -{ -#if defined(_P64) || defined(__APPLE__) - if (align <= 16) - return malloc(sz); -#endif - void *ptr; - if (posix_memalign(&ptr, align, sz)) - return NULL; - return ptr; -} -STATIC_INLINE void jl_free_aligned(void *p) JL_NOTSAFEPOINT -{ - free(p); -} -#endif -#define malloc_cache_align(sz) jl_malloc_aligned(sz, JL_CACHE_BYTE_ALIGNMENT) - -static void schedule_finalization(void *o, void *f) JL_NOTSAFEPOINT -{ - arraylist_push(&to_finalize, o); - arraylist_push(&to_finalize, f); - // doesn't need release, since we'll keep checking (on the reader) until we see the work and - // release our lock, and that will have a release barrier by then - jl_atomic_store_relaxed(&jl_gc_have_pending_finalizers, 1); -} - -static void run_finalizer(jl_task_t *ct, void *o, void *ff) -{ - int ptr_finalizer = gc_ptr_tag(o, 1); - o = gc_ptr_clear_tag(o, 3); - if (ptr_finalizer) { - ((void (*)(void*))ff)((void*)o); - return; - } - JL_TRY { - size_t last_age = ct->world_age; - ct->world_age = jl_atomic_load_acquire(&jl_world_counter); - jl_apply_generic((jl_value_t*)ff, (jl_value_t**)&o, 1); - ct->world_age = last_age; - } - JL_CATCH { - jl_printf((JL_STREAM*)STDERR_FILENO, "error in running finalizer: "); - jl_static_show((JL_STREAM*)STDERR_FILENO, jl_current_exception(ct)); - jl_printf((JL_STREAM*)STDERR_FILENO, "\n"); - jlbacktrace(); // written to STDERR_FILENO - } -} - -// if `need_sync` is true, the `list` is the `finalizers` list of another -// thread and we need additional synchronizations -static void finalize_object(arraylist_t *list, jl_value_t *o, - arraylist_t *copied_list, int need_sync) JL_NOTSAFEPOINT -{ - // The acquire load makes sure that the first `len` objects are valid. - // If `need_sync` is true, all mutations of the content should be limited - // to the first `oldlen` elements and no mutation is allowed after the - // new length is published with the `cmpxchg` at the end of the function. - // This way, the mutation should not conflict with the owning thread, - // which only writes to locations later than `len` - // and will not resize the buffer without acquiring the lock. - size_t len = need_sync ? jl_atomic_load_acquire((_Atomic(size_t)*)&list->len) : list->len; - size_t oldlen = len; - void **items = list->items; - size_t j = 0; - for (size_t i = 0; i < len; i += 2) { - void *v = items[i]; - int move = 0; - if (o == (jl_value_t*)gc_ptr_clear_tag(v, 1)) { - void *f = items[i + 1]; - move = 1; - arraylist_push(copied_list, v); - arraylist_push(copied_list, f); - } - if (move || __unlikely(!v)) { - // remove item - } - else { - if (j < i) { - items[j] = items[i]; - items[j+1] = items[i+1]; - } - j += 2; - } - } - len = j; - if (oldlen == len) - return; - if (need_sync) { - // The memset needs to be unconditional since the thread might have - // already read the length. - // The `memset` (like any other content mutation) has to be done - // **before** the `cmpxchg` which publishes the length. 
- memset(&items[len], 0, (oldlen - len) * sizeof(void*)); - jl_atomic_cmpswap((_Atomic(size_t)*)&list->len, &oldlen, len); - } - else { - list->len = len; - } -} - -// The first two entries are assumed to be empty and the rest are assumed to -// be pointers to `jl_value_t` objects -static void jl_gc_push_arraylist(jl_task_t *ct, arraylist_t *list) JL_NOTSAFEPOINT -{ - void **items = list->items; - items[0] = (void*)JL_GC_ENCODE_PUSHARGS(list->len - 2); - items[1] = ct->gcstack; - ct->gcstack = (jl_gcframe_t*)items; -} - -// Same assumption as `jl_gc_push_arraylist`. Requires the finalizers lock -// to be hold for the current thread and will release the lock when the -// function returns. -static void jl_gc_run_finalizers_in_list(jl_task_t *ct, arraylist_t *list) JL_NOTSAFEPOINT_LEAVE -{ - // Avoid marking `ct` as non-migratable via an `@async` task (as noted in the docstring - // of `finalizer`) in a finalizer: - uint8_t sticky = ct->sticky; - // empty out the first two entries for the GC frame - arraylist_push(list, list->items[0]); - arraylist_push(list, list->items[1]); - jl_gc_push_arraylist(ct, list); - void **items = list->items; - size_t len = list->len; - JL_UNLOCK_NOGC(&finalizers_lock); - // run finalizers in reverse order they were added, so lower-level finalizers run last - for (size_t i = len-4; i >= 2; i -= 2) - run_finalizer(ct, items[i], items[i + 1]); - // first entries were moved last to make room for GC frame metadata - run_finalizer(ct, items[len-2], items[len-1]); - // matches the jl_gc_push_arraylist above - JL_GC_POP(); - ct->sticky = sticky; -} - -static uint64_t finalizer_rngState[JL_RNG_SIZE]; - -void jl_rng_split(uint64_t dst[JL_RNG_SIZE], uint64_t src[JL_RNG_SIZE]) JL_NOTSAFEPOINT; - -JL_DLLEXPORT void jl_gc_init_finalizer_rng_state(void) -{ - jl_rng_split(finalizer_rngState, jl_current_task->rngState); -} - -static void run_finalizers(jl_task_t *ct, int finalizers_thread) -{ - // Racy fast path: - // The race here should be OK since the race can only happen if - // another thread is writing to it with the lock held. In such case, - // we don't need to run pending finalizers since the writer thread - // will flush it. - if (to_finalize.len == 0) - return; - JL_LOCK_NOGC(&finalizers_lock); - if (to_finalize.len == 0) { - JL_UNLOCK_NOGC(&finalizers_lock); - return; - } - arraylist_t copied_list; - memcpy(&copied_list, &to_finalize, sizeof(copied_list)); - if (to_finalize.items == to_finalize._space) { - copied_list.items = copied_list._space; - } - jl_atomic_store_relaxed(&jl_gc_have_pending_finalizers, 0); - arraylist_new(&to_finalize, 0); - - uint64_t save_rngState[JL_RNG_SIZE]; - memcpy(&save_rngState[0], &ct->rngState[0], sizeof(save_rngState)); - jl_rng_split(ct->rngState, finalizer_rngState); - - // This releases the finalizers lock. 
- int8_t was_in_finalizer = ct->ptls->in_finalizer; - ct->ptls->in_finalizer = !finalizers_thread; - jl_gc_run_finalizers_in_list(ct, &copied_list); - ct->ptls->in_finalizer = was_in_finalizer; - arraylist_free(&copied_list); - - memcpy(&ct->rngState[0], &save_rngState[0], sizeof(save_rngState)); -} - -JL_DLLEXPORT void jl_gc_run_pending_finalizers(jl_task_t *ct) -{ - if (ct == NULL) - ct = jl_current_task; - jl_ptls_t ptls = ct->ptls; - if (!ptls->in_finalizer && ptls->locks.len == 0 && ptls->finalizers_inhibited == 0 && ptls->engine_nqueued == 0) { - run_finalizers(ct, 0); - } -} - -JL_DLLEXPORT int jl_gc_get_finalizers_inhibited(jl_ptls_t ptls) -{ - if (ptls == NULL) - ptls = jl_current_task->ptls; - return ptls->finalizers_inhibited; -} - -JL_DLLEXPORT void jl_gc_disable_finalizers_internal(void) -{ - jl_ptls_t ptls = jl_current_task->ptls; - ptls->finalizers_inhibited++; -} - -JL_DLLEXPORT void jl_gc_enable_finalizers_internal(void) -{ - jl_task_t *ct = jl_current_task; -#ifdef NDEBUG - ct->ptls->finalizers_inhibited--; -#else - jl_gc_enable_finalizers(ct, 1); -#endif -} - -JL_DLLEXPORT void jl_gc_enable_finalizers(jl_task_t *ct, int on) -{ - if (ct == NULL) - ct = jl_current_task; - jl_ptls_t ptls = ct->ptls; - int old_val = ptls->finalizers_inhibited; - int new_val = old_val + (on ? -1 : 1); - if (new_val < 0) { - JL_TRY { - jl_error(""); // get a backtrace - } - JL_CATCH { - jl_printf((JL_STREAM*)STDERR_FILENO, "WARNING: GC finalizers already enabled on this thread.\n"); - // Only print the backtrace once, to avoid spamming the logs - static int backtrace_printed = 0; - if (backtrace_printed == 0) { - backtrace_printed = 1; - jlbacktrace(); // written to STDERR_FILENO - } - } - return; - } - ptls->finalizers_inhibited = new_val; - if (jl_atomic_load_relaxed(&jl_gc_have_pending_finalizers)) { - jl_gc_run_pending_finalizers(ct); - } -} - -JL_DLLEXPORT int8_t jl_gc_is_in_finalizer(void) -{ - return jl_current_task->ptls->in_finalizer; -} - -static void schedule_all_finalizers(arraylist_t *flist) JL_NOTSAFEPOINT -{ - void **items = flist->items; - size_t len = flist->len; - for(size_t i = 0; i < len; i+=2) { - void *v = items[i]; - void *f = items[i + 1]; - if (__unlikely(!v)) - continue; - schedule_finalization(v, f); - } - flist->len = 0; -} - -void jl_gc_run_all_finalizers(jl_task_t *ct) -{ - int gc_n_threads; - jl_ptls_t* gc_all_tls_states; - gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); - gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); - // this is called from `jl_atexit_hook`; threads could still be running - // so we have to guard the finalizers' lists - JL_LOCK_NOGC(&finalizers_lock); - schedule_all_finalizers(&finalizer_list_marked); - for (int i = 0; i < gc_n_threads; i++) { - jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 != NULL) - schedule_all_finalizers(&ptls2->finalizers); - } - // unlock here because `run_finalizers` locks this - JL_UNLOCK_NOGC(&finalizers_lock); - run_finalizers(ct, 1); -} - -void jl_gc_add_finalizer_(jl_ptls_t ptls, void *v, void *f) JL_NOTSAFEPOINT -{ - assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_STATE_UNSAFE); - arraylist_t *a = &ptls->finalizers; - // This acquire load and the release store at the end are used to - // synchronize with `finalize_object` on another thread. Apart from the GC, - // which is blocked by entering a unsafe region, there might be only - // one other thread accessing our list in `finalize_object` - // (only one thread since it needs to acquire the finalizer lock). 
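`jl_gc_enable_finalizers` above treats `finalizers_inhibited` as a nesting depth rather than a boolean: disable/enable pairs can nest, pending finalizers only run once the depth returns to zero, and an unbalanced enable is reported instead of driving the counter negative. A stripped-down, single-threaded model of just that counter logic (the real field is per-thread state):

```c
#include <stdio.h>

static int finalizers_inhibited = 0;  /* per-thread in the real runtime */
static int have_pending = 1;

static void run_pending(void) { printf("running pending finalizers\n"); have_pending = 0; }

static void disable_finalizers(void) { finalizers_inhibited++; }

static void enable_finalizers(void)
{
    if (finalizers_inhibited == 0) {  /* underflow: already fully enabled */
        fprintf(stderr, "WARNING: GC finalizers already enabled on this thread.\n");
        return;
    }
    if (--finalizers_inhibited == 0 && have_pending)
        run_pending();                /* depth hit zero: flush pending work */
}

int main(void)
{
    disable_finalizers();
    disable_finalizers();  /* nesting is fine */
    enable_finalizers();   /* still inhibited: depth is 1 */
    enable_finalizers();   /* depth 0: pending finalizers run here */
    enable_finalizers();   /* one too many: warns instead of going negative */
    return 0;
}
```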
- // Similar to `finalize_object`, all content mutation has to be done - // between the acquire and the release of the length. - size_t oldlen = jl_atomic_load_acquire((_Atomic(size_t)*)&a->len); - if (__unlikely(oldlen + 2 > a->max)) { - JL_LOCK_NOGC(&finalizers_lock); - // `a->len` might have been modified. - // Another possibility is to always grow the array to `oldlen + 2` but - // it's simpler this way and uses slightly less memory =) - oldlen = a->len; - arraylist_grow(a, 2); - a->len = oldlen; - JL_UNLOCK_NOGC(&finalizers_lock); - } - void **items = a->items; - items[oldlen] = v; - items[oldlen + 1] = f; - jl_atomic_store_release((_Atomic(size_t)*)&a->len, oldlen + 2); -} - -JL_DLLEXPORT void jl_gc_add_ptr_finalizer(jl_ptls_t ptls, jl_value_t *v, void *f) JL_NOTSAFEPOINT -{ - jl_gc_add_finalizer_(ptls, (void*)(((uintptr_t)v) | 1), f); -} - -// schedule f(v) to call at the next quiescent interval (aka after the next safepoint/region on all threads) -JL_DLLEXPORT void jl_gc_add_quiescent(jl_ptls_t ptls, void **v, void *f) JL_NOTSAFEPOINT -{ - assert(!gc_ptr_tag(v, 3)); - jl_gc_add_finalizer_(ptls, (void*)(((uintptr_t)v) | 3), f); -} - -JL_DLLEXPORT void jl_gc_add_finalizer_th(jl_ptls_t ptls, jl_value_t *v, jl_function_t *f) JL_NOTSAFEPOINT -{ - if (__unlikely(jl_typetagis(f, jl_voidpointer_type))) { - jl_gc_add_ptr_finalizer(ptls, v, jl_unbox_voidpointer(f)); - } - else { - jl_gc_add_finalizer_(ptls, v, f); - } -} - -JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o) -{ - JL_LOCK_NOGC(&finalizers_lock); - // Copy the finalizers into a temporary list so that code in the finalizer - // won't change the list as we loop through them. - // This list is also used as the GC frame when we are running the finalizers - arraylist_t copied_list; - arraylist_new(&copied_list, 0); - // No need to check the to_finalize list since the user is apparently - // still holding a reference to the object - int gc_n_threads; - jl_ptls_t* gc_all_tls_states; - gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); - gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); - for (int i = 0; i < gc_n_threads; i++) { - jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 != NULL) - finalize_object(&ptls2->finalizers, o, &copied_list, jl_atomic_load_relaxed(&ct->tid) != i); - } - finalize_object(&finalizer_list_marked, o, &copied_list, 0); - if (copied_list.len > 0) { - // This releases the finalizers lock. 
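The acquire/release discipline described in the comment above reduces to a small C11 pattern: the owning thread fills both new slots before release-storing the length, and a scanning thread that acquire-loads the length may safely read exactly that many slots. A sketch with the growth path and the finalizers lock omitted:

```c
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

#define CAP 64
static void *items[CAP];
static _Atomic size_t list_len;

/* Owner thread only: write both slots first, then publish the new length
   with a release store, as jl_gc_add_finalizer_ does above. */
static void append_pair(void *v, void *f)
{
    size_t n = atomic_load_explicit(&list_len, memory_order_relaxed);
    items[n] = v;
    items[n + 1] = f;
    atomic_store_explicit(&list_len, n + 2, memory_order_release);
}

/* Any other thread: an acquire load of the length guarantees the first n
   slots are fully written (the counterpart of finalize_object's load). */
static size_t valid_prefix(void)
{
    return atomic_load_explicit(&list_len, memory_order_acquire);
}

int main(void)
{
    int obj;
    append_pair(&obj, (void *)"finalizer");
    printf("%zu entries are safe to scan\n", valid_prefix());
    return 0;
}
```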
- jl_gc_run_finalizers_in_list(ct, &copied_list); - } - else { - JL_UNLOCK_NOGC(&finalizers_lock); - } - arraylist_free(&copied_list); -} - // explicitly scheduled objects for the sweepfunc callback static void gc_sweep_foreign_objs_in_list(arraylist_t *objs) JL_NOTSAFEPOINT { @@ -705,13 +194,13 @@ static int64_t scanned_bytes; // young bytes scanned while marking static int64_t perm_scanned_bytes; // old bytes scanned while marking int prev_sweep_full = 1; int current_sweep_full = 0; +int next_sweep_full = 0; int under_pressure = 0; // Full collection heuristics static int64_t live_bytes = 0; static int64_t promoted_bytes = 0; static int64_t last_live_bytes = 0; // live_bytes at last collection -static int64_t t_start = 0; // Time GC starts; #ifdef __GLIBC__ // maxrss at last malloc_trim static int64_t last_trim_maxrss = 0; @@ -861,8 +350,7 @@ STATIC_INLINE void maybe_collect(jl_ptls_t ptls) // weak references -JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, - jl_value_t *value) +JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, jl_value_t *value) { jl_weakref_t *wr = (jl_weakref_t*)jl_gc_alloc(ptls, sizeof(void*), jl_weakref_type); @@ -3751,7 +3239,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) live_bytes += -gc_num.freed + gc_num.allocd; jl_timing_counter_dec(JL_TIMING_COUNTER_HeapSize, gc_num.freed); - gc_time_summary(sweep_full, t_start, gc_end_time, gc_num.freed, + gc_time_summary(sweep_full, gc_start_time, gc_end_time, gc_num.freed, live_bytes, gc_num.interval, pause, gc_num.time_to_safepoint, gc_num.mark_time, gc_num.sweep_time); @@ -3972,6 +3460,79 @@ void jl_start_gc_threads(void) } } +STATIC_INLINE int may_mark(void) JL_NOTSAFEPOINT +{ + return (jl_atomic_load(&gc_n_threads_marking) > 0); +} + +STATIC_INLINE int may_sweep(jl_ptls_t ptls) JL_NOTSAFEPOINT +{ + return (jl_atomic_load(&ptls->gc_tls.gc_sweeps_requested) > 0); +} + +// parallel gc thread function +void jl_parallel_gc_threadfun(void *arg) +{ + jl_threadarg_t *targ = (jl_threadarg_t*)arg; + + // initialize this thread (set tid and create heap) + jl_ptls_t ptls = jl_init_threadtls(targ->tid); + void *stack_lo, *stack_hi; + jl_init_stack_limits(0, &stack_lo, &stack_hi); + // warning: this changes `jl_current_task`, so be careful not to call that from this function + jl_task_t *ct = jl_init_root_task(ptls, stack_lo, stack_hi); + JL_GC_PROMISE_ROOTED(ct); + (void)jl_atomic_fetch_add_relaxed(&nrunning, -1); + // wait for all threads + jl_gc_state_set(ptls, JL_GC_PARALLEL_COLLECTOR_THREAD, JL_GC_STATE_UNSAFE); + uv_barrier_wait(targ->barrier); + + // free the thread argument here + free(targ); + + while (1) { + uv_mutex_lock(&gc_threads_lock); + while (!may_mark() && !may_sweep(ptls)) { + uv_cond_wait(&gc_threads_cond, &gc_threads_lock); + } + uv_mutex_unlock(&gc_threads_lock); + assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_PARALLEL_COLLECTOR_THREAD); + gc_mark_loop_parallel(ptls, 0); + if (may_sweep(ptls)) { + assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_PARALLEL_COLLECTOR_THREAD); + gc_sweep_pool_parallel(ptls); + jl_atomic_fetch_add(&ptls->gc_tls.gc_sweeps_requested, -1); + } + } +} + +// concurrent gc thread function +void jl_concurrent_gc_threadfun(void *arg) +{ + jl_threadarg_t *targ = (jl_threadarg_t*)arg; + + // initialize this thread (set tid and create heap) + jl_ptls_t ptls = jl_init_threadtls(targ->tid); + void *stack_lo, *stack_hi; + jl_init_stack_limits(0, &stack_lo, &stack_hi); + // warning: this changes `jl_current_task`, so be 
careful not to call that from this function + jl_task_t *ct = jl_init_root_task(ptls, stack_lo, stack_hi); + JL_GC_PROMISE_ROOTED(ct); + (void)jl_atomic_fetch_add_relaxed(&nrunning, -1); + // wait for all threads + jl_gc_state_set(ptls, JL_GC_CONCURRENT_COLLECTOR_THREAD, JL_GC_STATE_UNSAFE); + uv_barrier_wait(targ->barrier); + + // free the thread argument here + free(targ); + + while (1) { + assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_CONCURRENT_COLLECTOR_THREAD); + uv_sem_wait(&gc_sweep_assists_needed); + gc_free_pages(); + } +} + // System-wide initializations void jl_gc_init(void) { @@ -3997,7 +3558,6 @@ void jl_gc_init(void) arraylist_new(&to_finalize, 0); jl_atomic_store_relaxed(&gc_heap_stats.heap_target, default_collect_interval); gc_num.interval = default_collect_interval; - last_long_collect_interval = default_collect_interval; gc_num.allocd = 0; gc_num.max_pause = 0; gc_num.max_memory = 0; @@ -4018,8 +3578,6 @@ void jl_gc_init(void) hint = min_heap_size_hint; jl_gc_set_max_memory(hint - mem_reserve); } - - t_start = jl_hrtime(); } JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem) @@ -4035,12 +3593,6 @@ JL_DLLEXPORT uint64_t jl_gc_get_max_memory(void) return max_total_memory; } -// callback for passing OOM errors from gmp -JL_DLLEXPORT void jl_throw_out_of_memory_error(void) -{ - jl_throw(jl_memory_exception); -} - // allocation wrappers that track allocation and let collection run JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) @@ -4290,15 +3842,16 @@ void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, unsigned offset) return p; } -JL_DLLEXPORT void jl_gc_add_finalizer(jl_value_t *v, jl_function_t *f) -{ - jl_ptls_t ptls = jl_current_task->ptls; - jl_gc_add_finalizer_th(ptls, v, f); -} - -JL_DLLEXPORT void jl_finalize(jl_value_t *o) +jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT { - jl_finalize_th(jl_current_task, o); + const size_t allocsz = sz + sizeof(jl_taggedvalue_t); + unsigned align = (sz == 0 ? sizeof(void*) : (allocsz <= sizeof(void*) * 2 ? 
+ sizeof(void*) * 2 : 16)); + jl_taggedvalue_t *o = (jl_taggedvalue_t*)jl_gc_perm_alloc(allocsz, 0, align, + sizeof(void*) % align); + uintptr_t tag = (uintptr_t)ty; + o->header = tag | GC_OLD_MARKED; + return jl_valueof(o); } JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) diff --git a/src/gc.h b/src/gc-stock.h similarity index 92% rename from src/gc.h rename to src/gc-stock.h index 0d8421912dbc7e..592bd25630ab7e 100644 --- a/src/gc.h +++ b/src/gc-stock.h @@ -18,16 +18,8 @@ #include "julia.h" #include "julia_threads.h" #include "julia_internal.h" -#include "threading.h" -#ifndef _OS_WINDOWS_ -#include -#if defined(_OS_DARWIN_) && !defined(MAP_ANONYMOUS) -#define MAP_ANONYMOUS MAP_ANON -#endif -#endif #include "julia_assert.h" -#include "gc-heap-snapshot.h" -#include "gc-alloc-profiler.h" +#include "threading.h" #ifdef __cplusplus extern "C" { @@ -41,9 +33,6 @@ extern "C" { #define GC_PAGE_SZ (1 << GC_PAGE_LG2) #define GC_PAGE_OFFSET (JL_HEAP_ALIGNMENT - (sizeof(jl_taggedvalue_t) % JL_HEAP_ALIGNMENT)) -#define jl_malloc_tag ((void*)0xdeadaa01) -#define jl_singleton_tag ((void*)0xdeadaa02) - // Used by GC_DEBUG_ENV typedef struct { uint64_t num; @@ -62,34 +51,6 @@ typedef struct { jl_alloc_num_t print; } jl_gc_debug_env_t; -// This struct must be kept in sync with the Julia type of the same name in base/timing.jl -typedef struct { - int64_t allocd; - int64_t deferred_alloc; - int64_t freed; - uint64_t malloc; - uint64_t realloc; - uint64_t poolalloc; - uint64_t bigalloc; - uint64_t freecall; - uint64_t total_time; - uint64_t total_allocd; - size_t interval; - int pause; - int full_sweep; - uint64_t max_pause; - uint64_t max_memory; - uint64_t time_to_safepoint; - uint64_t max_time_to_safepoint; - uint64_t total_time_to_safepoint; - uint64_t sweep_time; - uint64_t mark_time; - uint64_t total_sweep_time; - uint64_t total_mark_time; - uint64_t last_full_sweep; - uint64_t last_incremental_sweep; -} jl_gc_num_t; - // Array chunks (work items representing suffixes of // large arrays of pointers left to be marked) @@ -440,14 +401,9 @@ STATIC_INLINE unsigned ffs_u32(uint32_t bitvec) } #endif -extern jl_gc_num_t gc_num; extern bigval_t *oldest_generation_of_bigvals; -extern arraylist_t finalizer_list_marked; -extern arraylist_t to_finalize; extern int64_t buffered_pages; extern int gc_first_tid; -extern int gc_n_threads; -extern jl_ptls_t* gc_all_tls_states; extern gc_heapstatus_t gc_heap_stats; STATIC_INLINE int gc_first_parallel_collector_thread_id(void) JL_NOTSAFEPOINT @@ -523,31 +479,6 @@ STATIC_INLINE jl_taggedvalue_t *page_pfl_end(jl_gc_pagemeta_t *p) JL_NOTSAFEPOIN return (jl_taggedvalue_t*)(p->data + p->fl_end_offset); } -STATIC_INLINE int gc_marked(uintptr_t bits) JL_NOTSAFEPOINT -{ - return (bits & GC_MARKED) != 0; -} - -STATIC_INLINE int gc_old(uintptr_t bits) JL_NOTSAFEPOINT -{ - return (bits & GC_OLD) != 0; -} - -STATIC_INLINE uintptr_t gc_set_bits(uintptr_t tag, int bits) JL_NOTSAFEPOINT -{ - return (tag & ~(uintptr_t)3) | bits; -} - -STATIC_INLINE uintptr_t gc_ptr_tag(void *v, uintptr_t mask) JL_NOTSAFEPOINT -{ - return ((uintptr_t)v) & mask; -} - -STATIC_INLINE void *gc_ptr_clear_tag(void *v, uintptr_t mask) JL_NOTSAFEPOINT -{ - return (void*)(((uintptr_t)v) & ~mask); -} - FORCE_INLINE void gc_big_object_unlink(const bigval_t *node) JL_NOTSAFEPOINT { assert(node != oldest_generation_of_bigvals); @@ -580,6 +511,7 @@ extern uv_cond_t gc_threads_cond; extern uv_sem_t gc_sweep_assists_needed; extern _Atomic(int) gc_n_threads_marking; extern _Atomic(int) 
gc_n_threads_sweeping; +extern _Atomic(int) nrunning; extern uv_barrier_t thread_init_done; void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq); void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t *fl_parent, jl_value_t **fl_begin, jl_value_t **fl_end) JL_NOTSAFEPOINT; diff --git a/src/julia.h b/src/julia.h index 4574c47518d81e..5ff894af6e1d5e 100644 --- a/src/julia.h +++ b/src/julia.h @@ -82,6 +82,7 @@ typedef struct _jl_tls_states_t *jl_ptls_t; #ifdef JL_LIBRARY_EXPORTS #include "uv.h" #endif +#include "gc-interface.h" #include "julia_atomics.h" #include "julia_threads.h" #include "julia_assert.h" @@ -1046,35 +1047,14 @@ extern void JL_GC_POP() JL_NOTSAFEPOINT; #endif -JL_DLLEXPORT int jl_gc_enable(int on); -JL_DLLEXPORT int jl_gc_is_enabled(void); - -typedef enum { - JL_GC_AUTO = 0, // use heuristics to determine the collection type - JL_GC_FULL = 1, // force a full collection - JL_GC_INCREMENTAL = 2, // force an incremental collection -} jl_gc_collection_t; - -JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t); - JL_DLLEXPORT void jl_gc_add_finalizer(jl_value_t *v, jl_function_t *f) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_gc_add_ptr_finalizer(jl_ptls_t ptls, jl_value_t *v, void *f) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_gc_add_quiescent(jl_ptls_t ptls, void **v, void *f) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_finalize(jl_value_t *o); -JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value); -JL_DLLEXPORT jl_value_t *jl_gc_allocobj(size_t sz); JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, struct _jl_task_t *owner) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_free_stack(void *stkbuf, size_t bufsz); -JL_DLLEXPORT void jl_gc_use(jl_value_t *a); -// Set GC memory trigger in bytes for greedy memory collecting -JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem); -JL_DLLEXPORT uint64_t jl_gc_get_max_memory(void); - -JL_DLLEXPORT void jl_clear_malloc_data(void); // GC write barriers -JL_DLLEXPORT void jl_gc_queue_root(const jl_value_t *root) JL_NOTSAFEPOINT; -JL_DLLEXPORT void jl_gc_queue_multiroot(const jl_value_t *root, const void *stored, jl_datatype_t *dt) JL_NOTSAFEPOINT; STATIC_INLINE void jl_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT { @@ -1106,7 +1086,6 @@ STATIC_INLINE void jl_gc_multi_wb(const void *parent, const jl_value_t *ptr) JL_ jl_gc_queue_multiroot((jl_value_t*)parent, ptr, dt); } -JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz); JL_DLLEXPORT void jl_gc_safepoint(void); JL_DLLEXPORT int jl_safepoint_suspend_thread(int tid, int waitstate); JL_DLLEXPORT void jl_safepoint_suspend_all_threads(struct _jl_task_t *ct); diff --git a/src/julia_internal.h b/src/julia_internal.h index 392fe8fd9a1fb8..2ec4c9da7452b0 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -311,7 +311,7 @@ static inline void memassign_safe(int hasptr, char *dst, const jl_value_t *src, memcpy(dst, jl_assume_aligned(src, sizeof(void*)), nb); } -// -- gc.c -- // +// -- GC -- // #define GC_CLEAN 0 // freshly allocated #define GC_MARKED 1 // reachable and young @@ -350,8 +350,6 @@ jl_value_t *jl_gc_small_alloc_noinline(jl_ptls_t ptls, int offset, int osize); jl_value_t *jl_gc_big_alloc_noinline(jl_ptls_t ptls, size_t allocsz); JL_DLLEXPORT int jl_gc_classify_pools(size_t sz, int *osize) JL_NOTSAFEPOINT; -JL_DLLEXPORT void *jl_gc_perm_alloc(size_t sz, int zero, - unsigned align, unsigned offset) JL_NOTSAFEPOINT; void gc_sweep_sysimg(void); @@ -544,17 +542,6 @@ STATIC_INLINE jl_gc_tracked_buffer_t *jl_gc_alloc_buf(jl_ptls_t ptls, size_t sz) return 
jl_gc_alloc(ptls, sz, (void*)jl_buff_tag); } -STATIC_INLINE jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT -{ - const size_t allocsz = sz + sizeof(jl_taggedvalue_t); - unsigned align = (sz == 0 ? sizeof(void*) : (allocsz <= sizeof(void*) * 2 ? - sizeof(void*) * 2 : 16)); - jl_taggedvalue_t *o = (jl_taggedvalue_t*)jl_gc_perm_alloc(allocsz, 0, align, - sizeof(void*) % align); - uintptr_t tag = (uintptr_t)ty; - o->header = tag | GC_OLD_MARKED; - return jl_valueof(o); -} jl_value_t *jl_permbox8(jl_datatype_t *t, uintptr_t tag, uint8_t x); jl_value_t *jl_permbox32(jl_datatype_t *t, uintptr_t tag, uint32_t x); jl_svec_t *jl_perm_symsvec(size_t n, ...); @@ -590,14 +577,6 @@ jl_svec_t *jl_perm_symsvec(size_t n, ...); #endif #endif -JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz); - -JL_DLLEXPORT void JL_NORETURN jl_throw_out_of_memory_error(void); - - -JL_DLLEXPORT int64_t jl_gc_diff_total_bytes(void) JL_NOTSAFEPOINT; -JL_DLLEXPORT int64_t jl_gc_sync_total_bytes(int64_t offset) JL_NOTSAFEPOINT; -void jl_gc_track_malloced_array(jl_ptls_t ptls, jl_array_t *a) JL_NOTSAFEPOINT; void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned) JL_NOTSAFEPOINT; size_t jl_genericmemory_nbytes(jl_genericmemory_t *a) JL_NOTSAFEPOINT; void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT; @@ -945,9 +924,7 @@ void jl_init_tasks(void) JL_GC_DISABLED; void jl_init_stack_limits(int ismaster, void **stack_hi, void **stack_lo) JL_NOTSAFEPOINT; jl_task_t *jl_init_root_task(jl_ptls_t ptls, void *stack_lo, void *stack_hi); void jl_init_serializer(void); -void jl_gc_init(void); void jl_init_uv(void); -void jl_init_thread_heap(jl_ptls_t ptls) JL_NOTSAFEPOINT; void jl_init_int32_int64_cache(void); JL_DLLEXPORT void jl_init_options(void); @@ -957,7 +934,6 @@ extern JL_DLLEXPORT ssize_t jl_tls_offset; extern JL_DLLEXPORT const int jl_tls_elf_support; void jl_init_threading(void); void jl_start_threads(void); -void jl_start_gc_threads(void); // Whether the GC is running extern uv_mutex_t safepoint_lock; @@ -1301,6 +1277,9 @@ void jl_push_excstack(jl_task_t *ct, jl_excstack_t **stack JL_REQUIRE_ROOTED_SLO jl_value_t *exception JL_ROOTED_ARGUMENT, jl_bt_element_t *bt_data, size_t bt_size); +// System util to get maximum RSS +JL_DLLEXPORT size_t jl_maxrss(void); + //-------------------------------------------------- // congruential random number generator // for a small amount of thread-local randomness diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index fe32e6d09a8565..0605098bec3616 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -1,22 +1,6 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license -#include "llvm-version.h" -#include "passes.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "llvm-codegen-shared.h" -#include "julia.h" -#include "julia_internal.h" -#include "llvm-pass-helpers.h" +#include "llvm-gc-interface-passes.h" #define DEBUG_TYPE "final_gc_lowering" STATISTIC(NewGCFrameCount, "Number of lowered newGCFrameFunc intrinsics"); @@ -27,50 +11,6 @@ STATISTIC(GCAllocBytesCount, "Number of lowered GCAllocBytesFunc intrinsics"); STATISTIC(QueueGCRootCount, "Number of lowered queueGCRootFunc intrinsics"); STATISTIC(SafepointCount, "Number of lowered safepoint intrinsics"); -using namespace llvm; - -// The final GC lowering pass. This pass lowers platform-agnostic GC -// intrinsics to platform-dependent instruction sequences. 
The -// intrinsics it targets are those produced by the late GC frame -// lowering pass. -// -// This pass targets typical back-ends for which the standard Julia -// runtime library is available. Atypical back-ends should supply -// their own lowering pass. - -struct FinalLowerGC: private JuliaPassContext { - bool runOnFunction(Function &F); - -private: - Function *queueRootFunc; - Function *smallAllocFunc; - Function *bigAllocFunc; - Function *allocTypedFunc; - Instruction *pgcstack; - Type *T_size; - - // Lowers a `julia.new_gc_frame` intrinsic. - void lowerNewGCFrame(CallInst *target, Function &F); - - // Lowers a `julia.push_gc_frame` intrinsic. - void lowerPushGCFrame(CallInst *target, Function &F); - - // Lowers a `julia.pop_gc_frame` intrinsic. - void lowerPopGCFrame(CallInst *target, Function &F); - - // Lowers a `julia.get_gc_frame_slot` intrinsic. - void lowerGetGCFrameSlot(CallInst *target, Function &F); - - // Lowers a `julia.gc_alloc_bytes` intrinsic. - void lowerGCAllocBytes(CallInst *target, Function &F); - - // Lowers a `julia.queue_gc_root` intrinsic. - void lowerQueueGCRoot(CallInst *target, Function &F); - - // Lowers a `julia.safepoint` intrinsic. - void lowerSafepoint(CallInst *target, Function &F); -}; - void FinalLowerGC::lowerNewGCFrame(CallInst *target, Function &F) { ++NewGCFrameCount; diff --git a/src/llvm-gc-interface-passes.h b/src/llvm-gc-interface-passes.h new file mode 100644 index 00000000000000..cb485751d407b6 --- /dev/null +++ b/src/llvm-gc-interface-passes.h @@ -0,0 +1,413 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +/* + LLVM passes that may be partially modified by a third-party GC implementation. +*/ + +#include "llvm-version.h" +#include "passes.h" + +#include "llvm/IR/DerivedTypes.h" +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "llvm-codegen-shared.h" +#include "julia.h" +#include "julia_internal.h" +#include "julia_assert.h" +#include "llvm-pass-helpers.h" +#include +#include + +#ifndef LLVM_GC_PASSES_H +#define LLVM_GC_PASSES_H + +using namespace llvm; + +/* Julia GC Root Placement pass. For a general overview of the design of GC + root lowering, see the devdocs. This file is the actual implementation. + + The actual algorithm is fairly straightforward. First recall the goal of this + pass: + + Minimize the number of needed gc roots/stores to them subject to the constraint + that at every safepoint, any live gc-tracked pointer (i.e. for which there is + a path after this point that contains a use of this pointer) is in some gc slot. + + In particular, in order to understand this algorithm, it is important to + realize that the only places where rootedness matters is at safepoints. + + Now, the primary phases of the algorithm are: + + 1. Local Scan + + During this step, each Basic Block is inspected and analyzed for local + properties. In particular, we want to determine the ordering of any of + the following activities: + + - Any Def of a gc-tracked pointer. In general Defs are the results of + calls or loads from appropriate memory locations. Phi nodes and + selects do complicate this story slightly as described below. + - Any use of a gc-tracked or derived pointer. 
As described in the + devdocs, a use is in general one of + a) a load from a tracked/derived value + b) a store to a tracked/derived value + c) a store OF a tracked/derived value + d) a use of a value as a call operand (including operand bundles) + - Any safepoint + + Crucially, we also perform pointer numbering during the local scan, + assigning every Def a unique integer and caching the integer for each + derived pointer. This allows us to operate only on the set of Defs ( + represented by these integers) for the rest of the algorithm. We also + maintain some local utility information that is needed by later passes + (see the BBState struct for details). + + 2. Dataflow Computation + + This computation operates entirely over the function's control flow graph + and does not look into a basic block. The algorithm is essentially + textbook iterative data flow for liveness computation. However, the + data flow equations are slightly more complicated because we also + forward propagate rootedness information in addition to backpropagating + liveness. + + 3. Live Set Computation + + With the liveness information from the previous step, we can now compute, + for every safepoint, the set of values live at that particular safepoint. + There are three pieces of information being combined here: + i. Values that needed to be live due to local analysis (e.g. there + was a def, then a safepoint, then a use). This was computed during + local analysis. + ii. Values that are live across the basic block (i.e. they are live + at every safepoint within the basic block). This relies entirely + on the liveness information. + iii. Values that are now live-out from the basic block (i.e. they are + live at every safepoint following their def). During local + analysis, we keep, for every safepoint, those values that would + be live if they were live out. Here we can check if they are + actually live-out and make the appropriate additions to the live + set. + + Lastly, we also explicitly compute, for each value, the list of values + that are simultaneously live at some safepoint. This is known as an + "interference graph" and is the input to the next step. + + 4. GC Root coloring + + Two values which are not simultaneously live at a safepoint can share the + same slot. This is an important optimization, because otherwise long + functions would have exceptionally large GC slots, reducing performance + and bloating the size of the stack. Assigning values to these slots is + equivalent to doing graph coloring on the interference graph - the graph + where nodes are values and two values have an edge if they are + simultaneously live at a safepoint - which we computed in the previous + step. Now graph coloring in general is a hard problem. However, for SSA + form programs, (and most programs in general, by virtue of their + structure), the resulting interference graphs are chordal and can be + colored optimally in linear time by performing greedy coloring in a + perfect elimination order. Now, our interference graphs are likely not + entirely chordal due to some non-SSA corner cases. However, using the same + algorithm should still give a very good coloring while having sufficiently + low runtime. + + 5. JLCall frame optimizations + + Unlike earlier iterations of the gc root placement logic, jlcall frames + are no longer treated as a special case and need not necessarily be sunk + into the gc frame. 
Additionally, we now emit lifetime + intrinsics, so regular stack slot coloring will merge any jlcall frames + not sunk into the gc frame. Nevertheless, performing such sinking can still + be profitable. Since all arguments to a jlcall are guaranteed to be live + at that call in some gc slot, we can attempt to rearrange the slots within + the gc-frame, or reuse slots not assigned at that particular location + for the gcframe. However, even without this optimization, stack frames + are at most two times larger than optimal (because regular stack coloring + can merge the jlcall allocas). + + N.B.: This step is not yet implemented. + + 6. Root placement + + This performs the actual insertion of the GCFrame pushes/pops, zeros out + the gc frame and creates the stores to the gc frame according to the + stack slot assignment computed in the previous step. GC frame stores + are generally sunk right before the first safepoint that uses them + (this is beneficial for code where the primary path does not have + safepoints, but some other path - e.g. the error path - does). However, + if the first safepoint is not dominated by the definition (this can + happen due to the non-SSA corner cases), the store is inserted right after + the definition. + + 7. Cleanup + + This step performs necessary cleanup before passing the IR to codegen. In + particular, it removes any calls to julia_from_objref intrinsics and + removes the extra operand bundles from ccalls. In the future it could + also strip the addrspace information from all values, as this + information is no longer needed. + + + There are a couple of important special cases that deserve special attention: + + A. PHIs and Selects + + In general, PHIs and selects are treated as separate defs for the purposes + of the algorithm and their operands as uses of those values. It is + important to consider, however, WHERE the uses of a PHI's operands are + located. It is neither at the start of the basic block, because the values + do not dominate the block (so we can't really consider them live-in), nor + at the end of the predecessor (because they are actually live out). + Instead, it is best to think of those uses as living on the edge between + the appropriate predecessor and the block containing the PHI. + + Another concern is PHIs of derived values. Since we cannot simply root + these values by storing them to a GC slot, we need to insert a new, + artificial PHI that tracks the base pointers for the derived values. E.g. + in: + + A: + %Abase = load addrspace(10) *... + %Aderived = addrspacecast %Abase to addrspace(11) + B: + %Bbase = load addrspace(10) *... + %Bderived = addrspacecast %Bbase to addrspace(11) + C: + %phi = phi [%Aderived, %A + %Bderived, %B] + + we will insert another phi in C to track the relevant base pointers: + + %philift = phi [%Abase, %A + %Bbase, %B] + + We then pretend, for the purposes of numbering, that %phi was derived from + %philift. Note that in order to be able to do this, we need to be able to + perform this lifting either during numbering or instruction scanning. + + B. Vectors of pointers/Union representations + + Since this pass runs very late in the pass pipeline, it runs after the + various vectorization passes. As a result, we have to potentially deal + with vectors of gc-tracked pointers. For the purposes of most of the + algorithm, we simply assign every element of the vector a separate number + and no changes are needed. 
However, those parts of the algorithm that + look at IR need to be aware of the possibility of encountering vectors of + pointers. + + Similarly, unions (e.g. in call returns) are represented as a struct of + a gc-tracked value and an argument selector. We simply assign a single + number to this struct and proceed as if it were a single pointer. However, + this again requires care at the IR level. + + C. Non mem2reg'd allocas + + Under some circumstances, allocas will still be present in the IR when + we get to this pass. We don't try very hard to handle this case, and + simply sink the alloca into the GCFrame. +*/ + +// 4096 bits == 64 words (64 bit words). Larger bit numbers are faster and doing something +// substantially smaller here doesn't actually save much memory because of malloc overhead. +// Too large is bad also though - 4096 was found to be a reasonable middle ground. +using LargeSparseBitVector = SparseBitVector<4096>; + +struct BBState { + // Uses in this BB + // These do not get updated after local analysis + LargeSparseBitVector Defs; + LargeSparseBitVector PhiOuts; + LargeSparseBitVector UpExposedUses; + // These get updated during dataflow + LargeSparseBitVector LiveIn; + LargeSparseBitVector LiveOut; + SmallVector Safepoints; + int TopmostSafepoint = -1; + bool HasSafepoint = false; + // Have we gone through this basic block in our local scan yet? + bool Done = false; +}; + +struct State { + Function *const F; + DominatorTree *DT; + + // The maximum assigned value number + int MaxPtrNumber; + // The maximum assigned safepoint number + int MaxSafepointNumber; + // Cache of numbers assigned to IR values. This includes caching of numbers + // for derived values + std::map AllPtrNumbering; + std::map> AllCompositeNumbering; + // The reverse of the previous maps + std::map ReversePtrNumbering; + // Neighbors in the coloring interference graph. I.e. for each value, the + // indices of other values that are used simultaneously at some safe point. + SmallVector Neighbors; + // The result of the local analysis + std::map BBStates; + + // Refinement map. If all of the values are rooted + // (-1 means an externally rooted value and -2 means a globally/permanently rooted value), + // the key is already rooted (but not the other way around). + // A value that can be refined to -2 never needs any rooting or write barrier. + // A value that can be refined to -1 doesn't need a local root but still needs a write barrier. + // At the end of `LocalScan` this map has a few properties + // 1. Values are either < 0 or dominate the key + // 2. Therefore this is a DAG + std::map> Refinements; + + // GC preserves map. All safepoints dominated by the map key, but not by any + // of its uses, need to preserve the values listed in the map value. + std::map> GCPreserves; + + // The assignment of numbers to safepoints. The indices in the map + // are indices into the next three maps which store safepoint properties + std::map SafepointNumbering; + + // Reverse mapping index -> safepoint + SmallVector ReverseSafepointNumbering; + + // Instructions that can return twice. For now, all values live at these + // instructions will get their own, dedicated GC frame slots, because they + // have unobservable control flow, so we can't be sure where they're + // actually live. All of these are also considered safepoints. 
+ SmallVector ReturnsTwice; + + // The set of values live at a particular safepoint + SmallVector< LargeSparseBitVector , 0> LiveSets; + // Those values that - if live out from our parent basic block - are live + // at this safepoint. + SmallVector> LiveIfLiveOut; + // The set of values that are kept alive by the callee. + SmallVector> CalleeRoots; + // We don't bother doing liveness on Allocas that were not mem2reg'ed. + // they just get directly sunk into the root array. + SmallVector Allocas; + DenseMap ArrayAllocas; + DenseMap ShadowAllocas; + SmallVector, 0> TrackedStores; + State(Function &F) : F(&F), DT(nullptr), MaxPtrNumber(-1), MaxSafepointNumber(-1) {} +}; + + +struct LateLowerGCFrame: private JuliaPassContext { + function_ref GetDT; + LateLowerGCFrame(function_ref GetDT) : GetDT(GetDT) {} + +public: + bool runOnFunction(Function &F, bool *CFGModified = nullptr); + +private: + CallInst *pgcstack; + + void MaybeNoteDef(State &S, BBState &BBS, Value *Def, const ArrayRef &SafepointsSoFar, + SmallVector &&RefinedPtr = SmallVector()); + void NoteUse(State &S, BBState &BBS, Value *V, LargeSparseBitVector &Uses); + void NoteUse(State &S, BBState &BBS, Value *V) { + NoteUse(S, BBS, V, BBS.UpExposedUses); + } + + void LiftPhi(State &S, PHINode *Phi); + void LiftSelect(State &S, SelectInst *SI); + Value *MaybeExtractScalar(State &S, std::pair ValExpr, Instruction *InsertBefore); + SmallVector MaybeExtractVector(State &S, Value *BaseVec, Instruction *InsertBefore); + Value *GetPtrForNumber(State &S, unsigned Num, Instruction *InsertBefore); + + int Number(State &S, Value *V); + int NumberBase(State &S, Value *Base); + SmallVector NumberAll(State &S, Value *V); + SmallVector NumberAllBase(State &S, Value *Base); + + void NoteOperandUses(State &S, BBState &BBS, User &UI); + void MaybeTrackDst(State &S, MemTransferInst *MI); + void MaybeTrackStore(State &S, StoreInst *I); + State LocalScan(Function &F); + void ComputeLiveness(State &S); + void ComputeLiveSets(State &S); + SmallVector ColorRoots(const State &S); + void PlaceGCFrameStore(State &S, unsigned R, unsigned MinColorRoot, ArrayRef Colors, Value *GCFrame, Instruction *InsertBefore); + void PlaceGCFrameStores(State &S, unsigned MinColorRoot, ArrayRef Colors, Value *GCFrame); + void PlaceRootsAndUpdateCalls(SmallVectorImpl &Colors, State &S, std::map>); + void CleanupWriteBarriers(Function &F, State *S, const SmallVector &WriteBarriers, bool *CFGModified); + bool CleanupIR(Function &F, State *S, bool *CFGModified); + void NoteUseChain(State &S, BBState &BBS, User *TheUser); + SmallVector GetPHIRefinements(PHINode *phi, State &S); + void FixUpRefinements(ArrayRef PHINumbers, State &S); + void RefineLiveSet(LargeSparseBitVector &LS, State &S, ArrayRef CalleeRoots); + Value *EmitTagPtr(IRBuilder<> &builder, Type *T, Type *T_size, Value *V); + Value *EmitLoadTag(IRBuilder<> &builder, Type *T_size, Value *V); +}; + +// The final GC lowering pass. This pass lowers platform-agnostic GC +// intrinsics to platform-dependent instruction sequences. The +// intrinsics it targets are those produced by the late GC frame +// lowering pass. +// +// This pass targets typical back-ends for which the standard Julia +// runtime library is available. Atypical back-ends should supply +// their own lowering pass. 
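For reference, the `julia.new_gc_frame` / `julia.push_gc_frame` / `julia.pop_gc_frame` intrinsics lowered by `FinalLowerGC` below materialize a stack-allocated frame linked into a per-task chain, with the root slots assigned by the coloring step at its tail. A C sketch of that shape, with illustrative field names (the runtime's actual layout is `jl_gcframe_t`, whose header encodes the root count shifted left by two):

```c
#include <stddef.h>
#include <stdint.h>

typedef struct gcframe {
    uintptr_t nroots;        /* encoded as (n << 2); the low bits are flags */
    struct gcframe *prev;    /* link to the caller's frame */
    void *roots[2];          /* slot count chosen by root coloring */
} gcframe_t;

static gcframe_t *gcstack;   /* per-task pointer in the real runtime */

void example(void *a, void *b)
{
    /* what a push lowers to: link a stack-allocated frame and fill slots */
    gcframe_t frame = { 2u << 2, gcstack, { a, b } };
    gcstack = &frame;

    /* ... a collection at any safepoint here can reach a and b by
       walking the frame chain ... */

    gcstack = gcstack->prev;  /* what julia.pop_gc_frame lowers to */
}

int main(void) { int x, y; example(&x, &y); return 0; }
```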
+ +struct FinalLowerGC: private JuliaPassContext { + bool runOnFunction(Function &F); + +private: + Function *queueRootFunc; + Function *smallAllocFunc; + Function *bigAllocFunc; + Function *allocTypedFunc; + Instruction *pgcstack; + Type *T_size; + + // Lowers a `julia.new_gc_frame` intrinsic. + void lowerNewGCFrame(CallInst *target, Function &F); + + // Lowers a `julia.push_gc_frame` intrinsic. + void lowerPushGCFrame(CallInst *target, Function &F); + + // Lowers a `julia.pop_gc_frame` intrinsic. + void lowerPopGCFrame(CallInst *target, Function &F); + + // Lowers a `julia.get_gc_frame_slot` intrinsic. + void lowerGetGCFrameSlot(CallInst *target, Function &F); + + // Lowers a `julia.gc_alloc_bytes` intrinsic. + void lowerGCAllocBytes(CallInst *target, Function &F); + + // Lowers a `julia.queue_gc_root` intrinsic. + void lowerQueueGCRoot(CallInst *target, Function &F); + + // Lowers a `julia.safepoint` intrinsic. + void lowerSafepoint(CallInst *target, Function &F); +}; + +#endif // LLVM_GC_PASSES_H diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index 65b8cdc5c7c055..e08f08860dfaf3 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -1,367 +1,9 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license -#include "llvm-version.h" -#include "passes.h" - -#include "llvm/IR/DerivedTypes.h" -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "llvm-codegen-shared.h" -#include "julia.h" -#include "julia_internal.h" -#include "julia_assert.h" -#include "llvm-pass-helpers.h" -#include -#include +#include "llvm-gc-interface-passes.h" #define DEBUG_TYPE "late_lower_gcroot" -using namespace llvm; - -/* Julia GC Root Placement pass. For a general overview of the design of GC - root lowering, see the devdocs. This file is the actual implementation. - - The actual algorithm is fairly straightforward. First recall the goal of this - pass: - - Minimize the number of needed gc roots/stores to them subject to the constraint - that at every safepoint, any live gc-tracked pointer (i.e. for which there is - a path after this point that contains a use of this pointer) is in some gc slot. - - In particular, in order to understand this algorithm, it is important to - realize that the only places where rootedness matters is at safepoints. - - Now, the primary phases of the algorithm are: - - 1. Local Scan - - During this step, each Basic Block is inspected and analyzed for local - properties. In particular, we want to determine the ordering of any of - the following activities: - - - Any Def of a gc-tracked pointer. In general Defs are the results of - calls or loads from appropriate memory locations. Phi nodes and - selects do complicate this story slightly as described below. - - Any use of a gc-tracked or derived pointer. As described in the - devdocs, a use is in general one of - a) a load from a tracked/derived value - b) a store to a tracked/derived value - c) a store OF a tracked/derived value - d) a use of a value as a call operand (including operand bundles) - - Any safepoint - - Crucially, we also perform pointer numbering during the local scan, - assigning every Def a unique integer and caching the integer for each - derived pointer. 
This allows us to operate only on the set of Defs ( - represented by these integers) for the rest of the algorithm. We also - maintain some local utility information that is needed by later passes - (see the BBState struct for details). - - 2. Dataflow Computation - - This computation operates entirely over the function's control flow graph - and does not look into a basic block. The algorithm is essentially - textbook iterative data flow for liveness computation. However, the - data flow equations are slightly more complicated because we also - forward propagate rootedness information in addition to backpropagating - liveness. - - 3. Live Set Computation - - With the liveness information from the previous step, we can now compute, - for every safepoint, the set of values live at that particular safepoint. - There are three pieces of information being combined here: - i. Values that needed to be live due to local analysis (e.g. there - was a def, then a safepoint, then a use). This was computed during - local analysis. - ii. Values that are live across the basic block (i.e. they are live - at every safepoint within the basic block). This relies entirely - on the liveness information. - iii. Values that are now live-out from the basic block (i.e. they are - live at every safepoint following their def). During local - analysis, we keep, for every safepoint, those values that would - be live if they were live out. Here we can check if they are - actually live-out and make the appropriate additions to the live - set. - - Lastly, we also explicitly compute, for each value, the list of values - that are simultaneously live at some safepoint. This is known as an - "interference graph" and is the input to the next step. - - 4. GC Root coloring - - Two values which are not simultaneously live at a safepoint can share the - same slot. This is an important optimization, because otherwise long - functions would have exceptionally large GC slots, reducing performance - and bloating the size of the stack. Assigning values to these slots is - equivalent to doing graph coloring on the interference graph - the graph - where nodes are values and two values have an edge if they are - simultaneously live at a safepoint - which we computed in the previous - step. Now graph coloring in general is a hard problem. However, for SSA - form programs, (and most programs in general, by virtue of their - structure), the resulting interference graphs are chordal and can be - colored optimally in linear time by performing greedy coloring in a - perfect elimination order. Now, our interference graphs are likely not - entirely chordal due to some non-SSA corner cases. However, using the same - algorithm should still give a very good coloring while having sufficiently - low runtime. - - 5. JLCall frame optimizations - - Unlike earlier iterations of the gc root placement logic, jlcall frames - are no longer treated as a special case and need not necessarily be sunk - into the gc frame. Additionally, we now emit lifetime - intrinsics, so regular stack slot coloring will merge any jlcall frames - not sunk into the gc frame. Nevertheless performing such sinking can still - be profitable. Since all arguments to a jlcall are guaranteed to be live - at that call in some gc slot, we can attempt to rearrange the slots within - the gc-frame, or reuse slots not assigned at that particular location - for the gcframe. 
However, even without this optimization, stack frames - are at most two times larger than optimal (because regular stack coloring - can merge the jlcall allocas). - - N.B.: This step is not yet implemented. - - 6. Root placement - - This performs the actual insertion of the GCFrame pushes/pops, zeros out - the gc frame and creates the stores to the gc frame according to the - stack slot assignment computed in the previous step. GC frames stores - are generally sunk right before the first safe point that use them - (this is beneficial for code where the primary path does not have - safepoints, but some other path - e.g. the error path does). However, - if the first safepoint is not dominated by the definition (this can - happen due to the non-ssa corner cases), the store is inserted right after - the definition. - - 7. Cleanup - - This step performs necessary cleanup before passing the IR to codegen. In - particular, it removes any calls to julia_from_objref intrinsics and - removes the extra operand bundles from ccalls. In the future it could - also strip the addrspace information from all values as this - information is no longer needed. - - - There are a couple important special cases that deserve special attention: - - A. PHIs and Selects - - In general PHIs and selects are treated as separate defs for the purposes - of the algorithm and their operands as uses of those values. It is - important to consider however WHERE the uses of PHI's operands are - located. It is neither at the start of the basic block, because the values - do not dominate the block (so can't really consider them live-in), nor - at the end of the predecessor (because they are actually live out). - Instead it is best to think of those uses as living on the edge between - the appropriate predecessor and the block containing the PHI. - - Another concern is PHIs of derived values. Since we cannot simply root - these values by storing them to a GC slot, we need to insert a new, - artificial PHI that tracks the base pointers for the derived values. E.g. - in: - - A: - %Abase = load addrspace(10) *... - %Aderived = addrspacecast %Abase to addrspace(11) - B: - %Bbase = load addrspace(10) *... - %Bderived = addrspacecast %Bbase to addrspace(11) - C: - %phi = phi [%Aderived, %A - %Bderived, %B] - - we will insert another phi in C to track the relevant base pointers: - - %philift = phi [%Abase, %A - %Bbase, %B] - - We then pretend, for the purposes of numbering that %phi was derived from - %philift. Note that in order to be able to do this, we need to be able to - perform this lifting either during numbering or instruction scanning. - - B. Vectors of pointers/Union representations - - Since this pass runs very late in the pass pipeline, it runs after the - various vectorization passes. As a result, we have to potentially deal - with vectors of gc-tracked pointers. For the purposes of most of the - algorithm, we simply assign every element of the vector a separate number - and no changes are needed. However, those parts of the algorithm that - look at IR need to be aware of the possibility of encountering vectors of - pointers. - - Similarly, unions (e.g. in call returns) are represented as a struct of - a gc-tracked value and an argument selector. We simply assign a single - number to this struct and proceed as if it was a single pointer. However, - this again requires care at the IR level. - - C. Non mem2reg'd allocas - - Under some circumstances, allocas will still be present in the IR when - we get to this pass. 
We don't try very hard to handle this case, and - simply sink the alloca into the GCFrame. -*/ - -// 4096 bits == 64 words (64 bit words). Larger bit numbers are faster and doing something -// substantially smaller here doesn't actually save much memory because of malloc overhead. -// Too large is bad also though - 4096 was found to be a reasonable middle ground. -using LargeSparseBitVector = SparseBitVector<4096>; - -struct BBState { - // Uses in this BB - // These do not get updated after local analysis - LargeSparseBitVector Defs; - LargeSparseBitVector PhiOuts; - LargeSparseBitVector UpExposedUses; - // These get updated during dataflow - LargeSparseBitVector LiveIn; - LargeSparseBitVector LiveOut; - SmallVector Safepoints; - int TopmostSafepoint = -1; - bool HasSafepoint = false; - // Have we gone through this basic block in our local scan yet? - bool Done = false; -}; - -struct State { - Function *const F; - DominatorTree *DT; - - // The maximum assigned value number - int MaxPtrNumber; - // The maximum assigned safepoint number - int MaxSafepointNumber; - // Cache of numbers assigned to IR values. This includes caching of numbers - // for derived values - std::map AllPtrNumbering; - std::map> AllCompositeNumbering; - // The reverse of the previous maps - std::map ReversePtrNumbering; - // Neighbors in the coloring interference graph. I.e. for each value, the - // indices of other values that are used simultaneously at some safe point. - SmallVector Neighbors; - // The result of the local analysis - std::map BBStates; - - // Refinement map. If all of the values are rooted - // (-1 means an externally rooted value and -2 means a globally/permanently rooted value), - // the key is already rooted (but not the other way around). - // A value that can be refined to -2 never need any rooting or write barrier. - // A value that can be refined to -1 don't need local root but still need write barrier. - // At the end of `LocalScan` this map has a few properties - // 1. Values are either < 0 or dominates the key - // 2. Therefore this is a DAG - std::map> Refinements; - - // GC preserves map. All safepoints dominated by the map key, but not any - // of its uses need to preserve the values listed in the map value. - std::map> GCPreserves; - - // The assignment of numbers to safepoints. The indices in the map - // are indices into the next three maps which store safepoint properties - std::map SafepointNumbering; - - // Reverse mapping index -> safepoint - SmallVector ReverseSafepointNumbering; - - // Instructions that can return twice. For now, all values live at these - // instructions will get their own, dedicated GC frame slots, because they - // have unobservable control flow, so we can't be sure where they're - // actually live. All of these are also considered safepoints. - SmallVector ReturnsTwice; - - // The set of values live at a particular safepoint - SmallVector< LargeSparseBitVector , 0> LiveSets; - // Those values that - if live out from our parent basic block - are live - // at this safepoint. - SmallVector> LiveIfLiveOut; - // The set of values that are kept alive by the callee. - SmallVector> CalleeRoots; - // We don't bother doing liveness on Allocas that were not mem2reg'ed. - // they just get directly sunk into the root array. 
- SmallVector Allocas; - DenseMap ArrayAllocas; - DenseMap ShadowAllocas; - SmallVector, 0> TrackedStores; - State(Function &F) : F(&F), DT(nullptr), MaxPtrNumber(-1), MaxSafepointNumber(-1) {} -}; - - -struct LateLowerGCFrame: private JuliaPassContext { - function_ref GetDT; - LateLowerGCFrame(function_ref GetDT) : GetDT(GetDT) {} - -public: - bool runOnFunction(Function &F, bool *CFGModified = nullptr); - -private: - CallInst *pgcstack; - - void MaybeNoteDef(State &S, BBState &BBS, Value *Def, const ArrayRef &SafepointsSoFar, - SmallVector &&RefinedPtr = SmallVector()); - void NoteUse(State &S, BBState &BBS, Value *V, LargeSparseBitVector &Uses); - void NoteUse(State &S, BBState &BBS, Value *V) { - NoteUse(S, BBS, V, BBS.UpExposedUses); - } - - void LiftPhi(State &S, PHINode *Phi); - void LiftSelect(State &S, SelectInst *SI); - Value *MaybeExtractScalar(State &S, std::pair ValExpr, Instruction *InsertBefore); - SmallVector MaybeExtractVector(State &S, Value *BaseVec, Instruction *InsertBefore); - Value *GetPtrForNumber(State &S, unsigned Num, Instruction *InsertBefore); - - int Number(State &S, Value *V); - int NumberBase(State &S, Value *Base); - SmallVector NumberAll(State &S, Value *V); - SmallVector NumberAllBase(State &S, Value *Base); - - void NoteOperandUses(State &S, BBState &BBS, User &UI); - void MaybeTrackDst(State &S, MemTransferInst *MI); - void MaybeTrackStore(State &S, StoreInst *I); - State LocalScan(Function &F); - void ComputeLiveness(State &S); - void ComputeLiveSets(State &S); - SmallVector ColorRoots(const State &S); - void PlaceGCFrameStore(State &S, unsigned R, unsigned MinColorRoot, ArrayRef Colors, Value *GCFrame, Instruction *InsertBefore); - void PlaceGCFrameStores(State &S, unsigned MinColorRoot, ArrayRef Colors, Value *GCFrame); - void PlaceRootsAndUpdateCalls(SmallVectorImpl &Colors, State &S, std::map>); - void CleanupWriteBarriers(Function &F, State *S, const SmallVector &WriteBarriers, bool *CFGModified); - bool CleanupIR(Function &F, State *S, bool *CFGModified); - void NoteUseChain(State &S, BBState &BBS, User *TheUser); - SmallVector GetPHIRefinements(PHINode *phi, State &S); - void FixUpRefinements(ArrayRef PHINumbers, State &S); - void RefineLiveSet(LargeSparseBitVector &LS, State &S, ArrayRef CalleeRoots); - Value *EmitTagPtr(IRBuilder<> &builder, Type *T, Type *T_size, Value *V); - Value *EmitLoadTag(IRBuilder<> &builder, Type *T_size, Value *V); -}; - static unsigned getValueAddrSpace(Value *V) { return V->getType()->getPointerAddressSpace(); } diff --git a/src/scheduler.c b/src/scheduler.c index 2c7dbd63ef4a44..ab1d671ebf7a7b 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -7,7 +7,6 @@ #include "julia.h" #include "julia_internal.h" -#include "gc.h" #include "threading.h" #ifdef __cplusplus @@ -32,7 +31,7 @@ static const int16_t sleeping_like_the_dead JL_UNUSED = 2; // a running count of how many threads are currently not_sleeping // plus a running count of the number of in-flight wake-ups // n.b. this may temporarily exceed jl_n_threads -static _Atomic(int) nrunning = 0; +_Atomic(int) nrunning = 0; // invariant: No thread is ever asleep unless sleep_check_state is sleeping (or we have a wakeup signal pending). // invariant: Any particular thread is not asleep unless that thread's sleep_check_state is sleeping. 
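The worker loop removed from scheduler.c in the hunk below (and re-added as `jl_parallel_gc_threadfun` in gc-stock.c earlier in this diff) is the standard condition-variable idiom: recheck every wake-up predicate under the lock, since condvar waits can wake spuriously. A pthreads rendering of the same shape, with the libuv primitives swapped for POSIX ones and the actual mark/sweep work elided:

```c
#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

static pthread_mutex_t gc_threads_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  gc_threads_cond = PTHREAD_COND_INITIALIZER;
static atomic_int n_threads_marking;   /* stand-in for gc_n_threads_marking */
static atomic_int sweeps_requested;    /* stand-in for gc_sweeps_requested */

void *gc_worker(void *arg)
{
    (void)arg;
    for (;;) {
        pthread_mutex_lock(&gc_threads_lock);
        while (atomic_load(&n_threads_marking) == 0 &&
               atomic_load(&sweeps_requested) == 0)
            pthread_cond_wait(&gc_threads_cond, &gc_threads_lock);
        pthread_mutex_unlock(&gc_threads_lock);

        /* ... help with parallel marking here ... */
        if (atomic_load(&sweeps_requested) > 0) {
            /* ... help sweep pools, then acknowledge the request ... */
            atomic_fetch_add(&sweeps_requested, -1);
        }
    }
    return NULL;
}
```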
@@ -112,79 +111,6 @@ void jl_init_threadinginfra(void) void JL_NORETURN jl_finish_task(jl_task_t *ct); -static inline int may_mark(void) JL_NOTSAFEPOINT -{ - return (jl_atomic_load(&gc_n_threads_marking) > 0); -} - -static inline int may_sweep(jl_ptls_t ptls) JL_NOTSAFEPOINT -{ - return (jl_atomic_load(&ptls->gc_tls.gc_sweeps_requested) > 0); -} - -// parallel gc thread function -void jl_parallel_gc_threadfun(void *arg) -{ - jl_threadarg_t *targ = (jl_threadarg_t*)arg; - - // initialize this thread (set tid and create heap) - jl_ptls_t ptls = jl_init_threadtls(targ->tid); - void *stack_lo, *stack_hi; - jl_init_stack_limits(0, &stack_lo, &stack_hi); - // warning: this changes `jl_current_task`, so be careful not to call that from this function - jl_task_t *ct = jl_init_root_task(ptls, stack_lo, stack_hi); - JL_GC_PROMISE_ROOTED(ct); - (void)jl_atomic_fetch_add_relaxed(&nrunning, -1); - // wait for all threads - jl_gc_state_set(ptls, JL_GC_PARALLEL_COLLECTOR_THREAD, JL_GC_STATE_UNSAFE); - uv_barrier_wait(targ->barrier); - - // free the thread argument here - free(targ); - - while (1) { - uv_mutex_lock(&gc_threads_lock); - while (!may_mark() && !may_sweep(ptls)) { - uv_cond_wait(&gc_threads_cond, &gc_threads_lock); - } - uv_mutex_unlock(&gc_threads_lock); - assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_PARALLEL_COLLECTOR_THREAD); - gc_mark_loop_parallel(ptls, 0); - if (may_sweep(ptls)) { - assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_PARALLEL_COLLECTOR_THREAD); - gc_sweep_pool_parallel(ptls); - jl_atomic_fetch_add(&ptls->gc_tls.gc_sweeps_requested, -1); - } - } -} - -// concurrent gc thread function -void jl_concurrent_gc_threadfun(void *arg) -{ - jl_threadarg_t *targ = (jl_threadarg_t*)arg; - - // initialize this thread (set tid and create heap) - jl_ptls_t ptls = jl_init_threadtls(targ->tid); - void *stack_lo, *stack_hi; - jl_init_stack_limits(0, &stack_lo, &stack_hi); - // warning: this changes `jl_current_task`, so be careful not to call that from this function - jl_task_t *ct = jl_init_root_task(ptls, stack_lo, stack_hi); - JL_GC_PROMISE_ROOTED(ct); - (void)jl_atomic_fetch_add_relaxed(&nrunning, -1); - // wait for all threads - jl_gc_state_set(ptls, JL_GC_CONCURRENT_COLLECTOR_THREAD, JL_GC_STATE_UNSAFE); - uv_barrier_wait(targ->barrier); - - // free the thread argument here - free(targ); - - while (1) { - assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_CONCURRENT_COLLECTOR_THREAD); - uv_sem_wait(&gc_sweep_assists_needed); - gc_free_pages(); - } -} - // thread function: used by all mutator threads except the main thread void jl_threadfun(void *arg) { diff --git a/src/stackwalk.c b/src/stackwalk.c index 7e4a04f6b77e43..a63694e7c3b0c9 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -5,7 +5,7 @@ utilities for walking the stack and looking up information about code addresses */ #include -#include "gc.h" +#include "gc-stock.h" #include "julia.h" #include "julia_internal.h" #include "threading.h" diff --git a/src/threading.c b/src/threading.c index 0c4e1ccf70eb02..9291642f1992c2 100644 --- a/src/threading.c +++ b/src/threading.c @@ -459,8 +459,6 @@ void jl_safepoint_resume_all_threads(jl_task_t *ct) void jl_task_frame_noreturn(jl_task_t *ct) JL_NOTSAFEPOINT; void scheduler_delete_thread(jl_ptls_t ptls) JL_NOTSAFEPOINT; -void jl_free_thread_gc_state(jl_ptls_t ptls); - static void jl_delete_thread(void *value) JL_NOTSAFEPOINT_ENTER { #ifndef _OS_WINDOWS_