From dc89cedff2709dd9193eeaec6688c4a0be59a228 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Fri, 25 Oct 2024 03:18:02 +0000 Subject: [PATCH] Adding support for MMTk (non-moving Immix) --- Make.inc | 44 + base/timing.jl | 17 +- src/Makefile | 56 +- src/gc-common.c | 50 + src/gc-common.h | 35 + src/gc-interface.h | 12 +- src/gc-mmtk.c | 1180 +++++++++++++++++ src/gc-stacks.c | 97 -- src/gc-stock.c | 193 ++- src/gc-stock.h | 26 +- src/gc-tls-mmtk.h | 23 + src/{gc-tls.h => gc-tls-stock.h} | 0 src/julia_internal.h | 2 +- src/julia_locks.h | 2 +- src/julia_threads.h | 6 +- src/llvm-gc-interface-passes.h | 3 + src/llvm-late-gc-lowering-mmtk.cpp | 96 ++ src/llvm-late-gc-lowering-stock.cpp | 9 + src/llvm-late-gc-lowering.cpp | 26 + src/staticdata.c | 2 + src/threading.c | 4 + .../InteractiveUtils/src/InteractiveUtils.jl | 1 + 22 files changed, 1674 insertions(+), 210 deletions(-) create mode 100644 src/gc-mmtk.c create mode 100644 src/gc-tls-mmtk.h rename src/{gc-tls.h => gc-tls-stock.h} (100%) create mode 100644 src/llvm-late-gc-lowering-mmtk.cpp create mode 100644 src/llvm-late-gc-lowering-stock.cpp diff --git a/Make.inc b/Make.inc index a60a95d21c3db..feb0c0be733ff 100644 --- a/Make.inc +++ b/Make.inc @@ -80,6 +80,9 @@ HAVE_SSP := 0 WITH_GC_VERIFY := 0 WITH_GC_DEBUG_ENV := 0 +# Use stock if MMTK_PLAN hasn't been defined +MMTK_PLAN ?= None + # Enable DTrace support WITH_DTRACE := 0 @@ -829,6 +832,41 @@ JCXXFLAGS += -DGC_DEBUG_ENV JCFLAGS += -DGC_DEBUG_ENV endif +ifneq (${MMTK_PLAN},None) +ifeq (${MMTK_JULIA_DIR},) +$(error MMTK_JULIA_DIR must be set to use MMTk) +endif +JCXXFLAGS += -DMMTK_GC +JCFLAGS += -DMMTK_GC +ifeq (${MMTK_BUILD},) +ifeq (debug,$(findstring debug,$(MAKECMDGOALS))) +MMTK_BUILD = debug +else +MMTK_BUILD = release +endif +endif +ifeq (${MMTK_PLAN},Immix) +JCXXFLAGS += -DMMTK_PLAN_IMMIX +JCFLAGS += -DMMTK_PLAN_IMMIX +else +$(error "Unsupported MMTk plan: $(MMTK_PLAN)") +endif +MMTK_DIR = ${MMTK_JULIA_DIR}/mmtk +MMTK_API_INC = $(MMTK_DIR)/api +ifeq ($(OS),Linux) +MMTK_LIB_NAME := libmmtk_julia.so +else +$(error "Unsupported OS for MMTk") +endif +MMTK_LIB_SRC := $(MMTK_DIR)/target/$(MMTK_BUILD)/$(MMTK_LIB_NAME) +MMTK_LIB_DST := $(BUILDROOT)/usr/lib/$(MMTK_LIB_NAME) +MMTK_LIB := -lmmtk_julia +LDFLAGS += -Wl,-rpath=$(MMTK_DIR)/target/$(MMTK_BUILD)/ +else +MMTK_JULIA_INC := +MMTK_LIB := +endif + ifeq ($(WITH_DTRACE), 1) JCXXFLAGS += -DUSE_DTRACE JCFLAGS += -DUSE_DTRACE @@ -1823,6 +1861,9 @@ PRINT_PERL = printf ' %b %b\n' $(PERLCOLOR)PERL$(ENDCOLOR) $(BINCOLOR)$(GOAL) PRINT_FLISP = printf ' %b %b\n' $(FLISPCOLOR)FLISP$(ENDCOLOR) $(BINCOLOR)$(GOAL)$(ENDCOLOR); $(1) PRINT_JULIA = printf ' %b %b\n' $(JULIACOLOR)JULIA$(ENDCOLOR) $(BINCOLOR)$(GOAL)$(ENDCOLOR); $(1) PRINT_DTRACE = printf ' %b %b\n' $(DTRACECOLOR)DTRACE$(ENDCOLOR) $(BINCOLOR)$(GOAL)$(ENDCOLOR); $(1) +ifneq (${MMTK_PLAN},None) +PRINT_MMTK = printf ' %b %b\n' $(LINKCOLOR)MMTK$(ENDCOLOR) $(BINCOLOR)$(GOAL)$(ENDCOLOR); $(1) +endif else QUIET_MAKE = @@ -1833,6 +1874,9 @@ PRINT_PERL = echo '$(subst ','\'',$(1))'; $(1) PRINT_FLISP = echo '$(subst ','\'',$(1))'; $(1) PRINT_JULIA = echo '$(subst ','\'',$(1))'; $(1) PRINT_DTRACE = echo '$(subst ','\'',$(1))'; $(1) +ifneq (${MMTK_PLAN},None) +PRINT_MMTK = echo '$(subst ','\'',$(1))'; $(1) +endif endif # VERBOSE diff --git a/base/timing.jl b/base/timing.jl index 1de3727756829..0088f8bb77eca 100644 --- a/base/timing.jl +++ b/base/timing.jl @@ -106,9 +106,14 @@ function gc_page_utilization_data() return Base.unsafe_wrap(Array, page_utilization_raw, JL_GC_N_MAX_POOLS, own=false) end + +const USING_STOCK_GC = occursin("stock", unsafe_string(ccall(:jl_gc_active_impl, Ptr{UInt8}, ()))) +# Full sweep reasons are currently only available for the stock GC +@static if USING_STOCK_GC # must be kept in sync with `src/gc-stock.h`` const FULL_SWEEP_REASONS = [:FULL_SWEEP_REASON_SWEEP_ALWAYS_FULL, :FULL_SWEEP_REASON_FORCED_FULL_SWEEP, :FULL_SWEEP_REASON_USER_MAX_EXCEEDED, :FULL_SWEEP_REASON_LARGE_PROMOTION_RATE] +end """ Base.full_sweep_reasons() @@ -124,11 +129,15 @@ The reasons are: Note that the set of reasons is not guaranteed to be stable across minor versions of Julia. """ function full_sweep_reasons() - reason = cglobal(:jl_full_sweep_reasons, UInt64) - reasons_as_array = Base.unsafe_wrap(Vector{UInt64}, reason, length(FULL_SWEEP_REASONS), own=false) d = Dict{Symbol, Int64}() - for (i, r) in enumerate(FULL_SWEEP_REASONS) - d[r] = reasons_as_array[i] + # populate the dictionary according to the reasons above for the stock GC + # otherwise return an empty dictionary for now + @static if USING_STOCK_GC + reason = cglobal(:jl_full_sweep_reasons, UInt64) + reasons_as_array = Base.unsafe_wrap(Vector{UInt64}, reason, length(FULL_SWEEP_REASONS), own=false) + for (i, r) in enumerate(FULL_SWEEP_REASONS) + d[r] = reasons_as_array[i] + end end return d end diff --git a/src/Makefile b/src/Makefile index 9355ca2c4c675..f5dd46741c6e9 100644 --- a/src/Makefile +++ b/src/Makefile @@ -29,6 +29,10 @@ ifeq ($(USECLANG),1) FLAGS += -Wno-return-type-c-linkage -Wno-atomic-alignment endif +ifeq ($(WITH_MMTK), 1) +FLAGS += -I$(MMTK_API_INC) +endif + FLAGS += -DJL_BUILD_ARCH='"$(ARCH)"' ifeq ($(OS),WINNT) FLAGS += -DJL_BUILD_UNAME='"NT"' @@ -40,23 +44,41 @@ ifeq ($(OS),FreeBSD) FLAGS += -I$(LOCALBASE)/include endif +# GC source code. It depends on which GC implementation to use. +GC_SRCS := gc-common gc-stacks gc-alloc-profiler gc-heap-snapshot +ifneq (${MMTK_PLAN},None) +GC_SRCS += gc-mmtk +else +GC_SRCS += gc-stock gc-debug gc-pages gc-page-profiler +endif + SRCS := \ jltypes gf typemap smallintset ast builtins module interpreter symbol \ dlload sys init task array genericmemory staticdata toplevel jl_uv datatype \ simplevector runtime_intrinsics precompile jloptions mtarraylist \ - threading scheduler stackwalk gc-common gc-stock gc-debug gc-pages gc-stacks gc-alloc-profiler gc-page-profiler method \ - jlapi signal-handling safepoint timing subtype rtutils gc-heap-snapshot \ - crc32c APInt-C processor ircode opaque_closure codegen-stubs coverage runtime_ccall engine + threading scheduler stackwalk \ + method jlapi signal-handling safepoint timing subtype rtutils \ + crc32c APInt-C processor ircode opaque_closure codegen-stubs coverage runtime_ccall engine \ + $(GC_SRCS) RT_LLVMLINK := CG_LLVMLINK := ifeq ($(JULIACODEGEN),LLVM) +# Currently these files are used by both GCs. But we should make the list specific to stock, and MMTk should have its own implementation. +GC_CODEGEN_SRCS := llvm-final-gc-lowering llvm-late-gc-lowering llvm-gc-invariant-verifier +ifneq (${MMTK_PLAN},None) +FLAGS += -I$(MMTK_API_INC) +GC_CODEGEN_SRCS += llvm-late-gc-lowering-mmtk +else +GC_CODEGEN_SRCS += llvm-late-gc-lowering-stock +endif CODEGEN_SRCS := codegen jitlayers aotcompile debuginfo disasm llvm-simdloop \ - llvm-final-gc-lowering llvm-pass-helpers llvm-late-gc-lowering llvm-ptls \ - llvm-lower-handlers llvm-gc-invariant-verifier llvm-propagate-addrspaces \ + llvm-pass-helpers llvm-ptls \ + llvm-lower-handlers llvm-propagate-addrspaces \ llvm-multiversioning llvm-alloc-opt llvm-alloc-helpers cgmemmgr llvm-remove-addrspaces \ - llvm-remove-ni llvm-julia-licm llvm-demote-float16 llvm-cpufeatures pipeline llvm_api + llvm-remove-ni llvm-julia-licm llvm-demote-float16 llvm-cpufeatures pipeline llvm_api \ + $(GC_CODEGEN_SRCS) FLAGS += -I$(shell $(LLVM_CONFIG_HOST) --includedir) CG_LLVM_LIBS := all ifeq ($(USE_POLLY),1) @@ -99,7 +121,12 @@ ifeq ($(USE_SYSTEM_LIBUV),0) UV_HEADERS += uv.h UV_HEADERS += uv/*.h endif -PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h gc-interface.h gc-tls.h gc-tls-common.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) +PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h gc-interface.h gc-tls-common.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) +ifneq (${MMTK_PLAN},None) + PUBLIC_HEADERS += $(addprefix $(SRCDIR)/,gc-tls-mmtk.h) +else + PUBLIC_HEADERS += $(addprefix $(SRCDIR)/,gc-tls-stock.h) +endif ifeq ($(OS),WINNT) PUBLIC_HEADERS += $(addprefix $(SRCDIR)/,win32_ucontext.h) endif @@ -164,8 +191,8 @@ LIBJULIA_PATH_REL := libjulia endif COMMON_LIBPATHS := -L$(build_libdir) -L$(build_shlibdir) -RT_LIBS := $(WHOLE_ARCHIVE) $(LIBUV) $(WHOLE_ARCHIVE) $(LIBUTF8PROC) $(NO_WHOLE_ARCHIVE) $(LIBUNWIND) $(RT_LLVMLINK) $(OSLIBS) $(LIBTRACYCLIENT) $(LIBITTAPI) -CG_LIBS := $(LIBUNWIND) $(CG_LLVMLINK) $(OSLIBS) $(LIBTRACYCLIENT) $(LIBITTAPI) +RT_LIBS := $(WHOLE_ARCHIVE) $(LIBUV) $(WHOLE_ARCHIVE) $(LIBUTF8PROC) $(NO_WHOLE_ARCHIVE) $(LIBUNWIND) $(RT_LLVMLINK) $(OSLIBS) $(LIBTRACYCLIENT) $(LIBITTAPI) $(MMTK_LIB) +CG_LIBS := $(LIBUNWIND) $(CG_LLVMLINK) $(OSLIBS) $(LIBTRACYCLIENT) $(LIBITTAPI) $(MMTK_LIB) RT_DEBUG_LIBS := $(COMMON_LIBPATHS) $(WHOLE_ARCHIVE) $(BUILDDIR)/flisp/libflisp-debug.a $(WHOLE_ARCHIVE) $(BUILDDIR)/support/libsupport-debug.a -ljulia-debug $(RT_LIBS) CG_DEBUG_LIBS := $(COMMON_LIBPATHS) $(CG_LIBS) -ljulia-debug -ljulia-internal-debug RT_RELEASE_LIBS := $(COMMON_LIBPATHS) $(WHOLE_ARCHIVE) $(BUILDDIR)/flisp/libflisp.a $(WHOLE_ARCHIVE) $(BUILDDIR)/support/libsupport.a -ljulia $(RT_LIBS) @@ -222,6 +249,12 @@ $(BUILDDIR)/%.h.gen : $(SRCDIR)/%.d sed 's/JULIA_/JL_PROBE_/' $@ > $@.tmp mv $@.tmp $@ +# Compile files from the binding side and copy so file into lib folder +ifneq (${MMTK_PLAN},None) +$(MMTK_LIB_DST): $(MMTK_LIB_SRC) + @$(call PRINT_MMTK, cp $< $@) +endif + $(BUILDDIR)/jl_internal_funcs.inc: $(SRCDIR)/jl_exported_funcs.inc # Generate `.inc` file that contains a list of `#define` macros to rename functions defined in `libjulia-internal` # to have a `ijl_` prefix instead of `jl_`, to denote that they are coming from `libjulia-internal`. This avoids @@ -314,6 +347,7 @@ $(BUILDDIR)/debuginfo.o $(BUILDDIR)/debuginfo.dbg.obj: $(addprefix $(SRCDIR)/,de $(BUILDDIR)/disasm.o $(BUILDDIR)/disasm.dbg.obj: $(SRCDIR)/debuginfo.h $(SRCDIR)/processor.h $(BUILDDIR)/gc-debug.o $(BUILDDIR)/gc-debug.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h $(BUILDDIR)/gc-pages.o $(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h +$(BUILDDIR)/gc-mmtk.o $(BUILDDIR)/gc-mmtk.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h $(BUILDDIR)/gc-stacks.o $(BUILDDIR)/gc-stacks.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h $(BUILDDIR)/gc-stock.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h $(SRCDIR)/gc-page-profiler.h $(BUILDDIR)/gc-heap-snapshot.o $(BUILDDIR)/gc-heap-snapshot.dbg.obj: $(SRCDIR)/gc-heap-snapshot.h @@ -386,13 +420,13 @@ $(BUILDDIR)/julia.expmap: $(SRCDIR)/julia.expmap.in $(JULIAHOME)/VERSION $(LLVM_ sed <'$<' >'$@' -e "s/@JULIA_SHLIB_SYMBOL_VERSION@/JL_LIBJULIA_$(SOMAJOR)/" \ -e "s/@LLVM_SHLIB_SYMBOL_VERSION@/$(LLVM_SHLIB_SYMBOL_VERSION)/" -$(build_shlibdir)/libjulia-internal.$(JL_MAJOR_MINOR_SHLIB_EXT): $(BUILDDIR)/julia.expmap $(OBJS) $(BUILDDIR)/flisp/libflisp.a $(BUILDDIR)/support/libsupport.a $(LIBUV) +$(build_shlibdir)/libjulia-internal.$(JL_MAJOR_MINOR_SHLIB_EXT): $(BUILDDIR)/julia.expmap $(OBJS) $(MMTK_LIB_DST) $(BUILDDIR)/flisp/libflisp.a $(BUILDDIR)/support/libsupport.a $(LIBUV) @$(call PRINT_LINK, $(CXXLD) $(call IMPLIB_FLAGS,$@) $(JCXXFLAGS) $(JL_CXXFLAGS) $(CXXLDFLAGS) $(SHIPFLAGS) $(OBJS) $(RPATH_LIB) -o $@ \ $(JLDFLAGS) $(BOLT_LDFLAGS) $(JLIBLDFLAGS) $(RT_RELEASE_LIBS) $(call SONAME_FLAGS,libjulia-internal.$(JL_MAJOR_SHLIB_EXT))) @$(INSTALL_NAME_CMD)libjulia-internal.$(SHLIB_EXT) $@ $(DSYMUTIL) $@ -$(build_shlibdir)/libjulia-internal-debug.$(JL_MAJOR_MINOR_SHLIB_EXT): $(BUILDDIR)/julia.expmap $(DOBJS) $(BUILDDIR)/flisp/libflisp-debug.a $(BUILDDIR)/support/libsupport-debug.a $(LIBUV) +$(build_shlibdir)/libjulia-internal-debug.$(JL_MAJOR_MINOR_SHLIB_EXT): $(BUILDDIR)/julia.expmap $(DOBJS) $(MMTK_LIB_DST) $(BUILDDIR)/flisp/libflisp-debug.a $(BUILDDIR)/support/libsupport-debug.a $(LIBUV) @$(call PRINT_LINK, $(CXXLD) $(call IMPLIB_FLAGS,$@) $(JCXXFLAGS) $(JL_CXXFLAGS) $(CXXLDFLAGS) $(DEBUGFLAGS) $(DOBJS) $(RPATH_LIB) -o $@ \ $(JLDFLAGS) $(JLIBLDFLAGS) $(RT_DEBUG_LIBS) $(call SONAME_FLAGS,libjulia-internal-debug.$(JL_MAJOR_SHLIB_EXT))) @$(INSTALL_NAME_CMD)libjulia-internal-debug.$(SHLIB_EXT) $@ diff --git a/src/gc-common.c b/src/gc-common.c index 3d578b81578b1..c07b707b17709 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -540,6 +540,38 @@ JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) return jl_gc_alloc_(ptls, sz, ty); } +JL_DLLEXPORT void *jl_malloc(size_t sz) +{ + return jl_gc_counted_malloc(sz); +} + +//_unchecked_calloc does not check for potential overflow of nm*sz +STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) { + size_t nmsz = nm*sz; + return jl_gc_counted_calloc(nmsz, 1); +} + +JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz) +{ + if (nm > SSIZE_MAX/sz) + return NULL; + return _unchecked_calloc(nm, sz); +} + +JL_DLLEXPORT void jl_free(void *p) +{ + if (p != NULL) { + size_t sz = memory_block_usable_size(p, 0); + return jl_gc_counted_free_with_size(p, sz); + } +} + +JL_DLLEXPORT void *jl_realloc(void *p, size_t sz) +{ + size_t old = p ? memory_block_usable_size(p, 0) : 0; + return jl_gc_counted_realloc_with_old_size(p, old, sz); +} + // =========================================================================== // // Generic Memory // =========================================================================== // @@ -668,6 +700,24 @@ JL_DLLEXPORT void jl_throw_out_of_memory_error(void) jl_throw(jl_memory_exception); } +// Sweeping mtarraylist_buffers: +// These buffers are made unreachable via `mtarraylist_resizeto` from mtarraylist.c +// and are freed at the end of GC via jl_gc_sweep_stack_pools_and_mtarraylist_buffers +void sweep_mtarraylist_buffers(void) JL_NOTSAFEPOINT +{ + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls = gc_all_tls_states[i]; + if (ptls == NULL) { + continue; + } + small_arraylist_t *buffers = &ptls->lazily_freed_mtarraylist_buffers; + void *buf; + while ((buf = small_arraylist_pop(buffers)) != NULL) { + free(buf); + } + } +} + #ifdef __cplusplus } #endif diff --git a/src/gc-common.h b/src/gc-common.h index 1745df17950f9..ca5e21084209e 100644 --- a/src/gc-common.h +++ b/src/gc-common.h @@ -24,6 +24,31 @@ extern "C" { #endif +// =========================================================================== // +// GC Big objects +// =========================================================================== // + +JL_EXTENSION typedef struct _bigval_t { + struct _bigval_t *next; + struct _bigval_t *prev; + size_t sz; +#ifdef _P64 // Add padding so that the value is 64-byte aligned + // (8 pointers of 8 bytes each) - (4 other pointers in struct) + void *_padding[8 - 4]; +#else + // (16 pointers of 4 bytes each) - (4 other pointers in struct) + void *_padding[16 - 4]; +#endif + //struct jl_taggedvalue_t <>; + union { + uintptr_t header; + struct { + uintptr_t gc:2; + } bits; + }; + // must be 64-byte aligned here, in 32 & 64 bit modes +} bigval_t; + // =========================================================================== // // GC Callbacks // =========================================================================== // @@ -187,4 +212,14 @@ extern jl_ptls_t* gc_all_tls_states; extern int gc_logging_enabled; +// =========================================================================== // +// MISC +// =========================================================================== // + +// number of stacks to always keep available per pool +#define MIN_STACK_MAPPINGS_PER_POOL 5 + +void _jl_free_stack(jl_ptls_t ptls, void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT; +void sweep_mtarraylist_buffers(void) JL_NOTSAFEPOINT; + #endif // JL_GC_COMMON_H diff --git a/src/gc-interface.h b/src/gc-interface.h index eb6687d52d9ab..e4a27782f7520 100644 --- a/src/gc-interface.h +++ b/src/gc-interface.h @@ -98,6 +98,13 @@ JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem); JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection); // Returns whether the thread with `tid` is a collector thread JL_DLLEXPORT int gc_is_collector_thread(int tid) JL_NOTSAFEPOINT; +// Returns which GC implementation is being used and possibly its version according to the list of supported GCs +// NB: it should clearly identify the GC by including e.g. ‘stock’ or ‘mmtk’ as a substring. +JL_DLLEXPORT const char* jl_gc_active_impl(void); +// Sweep Julia's stack pools and mtarray buffers. Note that this function has been added to the interface as +// each GC should implement it but it will most likely not be used by other code in the runtime. +// It still needs to be annotated with JL_DLLEXPORT since it is called from Rust by MMTk. +JL_DLLEXPORT void jl_gc_sweep_stack_pools_and_mtarraylist_buffers(jl_ptls_t ptls) JL_NOTSAFEPOINT; // ========================================================================= // // Metrics @@ -138,7 +145,6 @@ JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void); // **must** also set the type of the returning object to be `ty`. The type `ty` may also be used to record // an allocation of that type in the allocation profiler. struct _jl_value_t *jl_gc_alloc_(struct _jl_tls_states_t * ptls, size_t sz, void *ty); - // Allocates small objects and increments Julia allocation counterst. Size of the object // header must be included in the object size. The (possibly unused in some implementations) // offset to the arena in which we're allocating is passed in the second parameter, and the @@ -198,6 +204,10 @@ JL_DLLEXPORT void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, // the allocated object. All objects stored in fields of this object // must be either permanently allocated or have other roots. struct _jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT; +// This function notifies the GC about memory addresses that are set when loading the boot image. +// The GC may use that information to, for instance, determine that such objects should +// be treated as marked and belonged to the old generation in nursery collections. +void jl_gc_notify_image_load(const char* img_data, size_t len); // ========================================================================= // // Runtime Write-Barriers diff --git a/src/gc-mmtk.c b/src/gc-mmtk.c new file mode 100644 index 0000000000000..4aab56d008ca3 --- /dev/null +++ b/src/gc-mmtk.c @@ -0,0 +1,1180 @@ +#include "gc-common.h" +#include "gc-tls-mmtk.h" +#include "mmtkMutator.h" +#include "threading.h" + +// File exists in the binding +#include "mmtk.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// ========================================================================= // +// Julia specific +// ========================================================================= // + +extern jl_value_t *cmpswap_names JL_GLOBALLY_ROOTED; +extern const unsigned pool_sizes[]; +extern jl_mutex_t finalizers_lock; + +// FIXME: Should the values below be shared between both GC's? +// Note that MMTk uses a hard max heap limit, which is set by default +// as 70% of the free available memory. The min heap is set as the +// default_collect_interval variable below. + +// max_total_memory is a suggestion. We try very hard to stay +// under this limit, but we will go above it rather than halting. +#ifdef _P64 +typedef uint64_t memsize_t; +static const size_t default_collect_interval = 5600 * 1024 * sizeof(void*); +// We expose this to the user/ci as jl_gc_set_max_memory +static memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024 * 1024 * 1024; +#else +typedef uint32_t memsize_t; +static const size_t default_collect_interval = 3200 * 1024 * sizeof(void*); +// Work really hard to stay within 2GB +// Alternative is to risk running out of address space +// on 32 bit architectures. +#define MAX32HEAP 1536 * 1024 * 1024 +static memsize_t max_total_memory = (memsize_t) MAX32HEAP; +#endif + +// ========================================================================= // +// Defined by the binding +// ========================================================================= // + +extern void mmtk_julia_copy_stack_check(int copy_stack); +extern void mmtk_gc_init(uintptr_t min_heap_size, uintptr_t max_heap_size, uintptr_t n_gcthreads, uintptr_t header_size, uintptr_t tag); +extern void mmtk_object_reference_write_post(void* mutator, const void* parent, const void* ptr); +extern void mmtk_object_reference_write_slow(void* mutator, const void* parent, const void* ptr); +extern void* mmtk_alloc(void* mutator, size_t size, size_t align, size_t offset, int allocator); +extern void mmtk_post_alloc(void* mutator, void* refer, size_t bytes, int allocator); +extern void mmtk_store_obj_size_c(void* obj, size_t size); +extern const void* MMTK_SIDE_LOG_BIT_BASE_ADDRESS; +extern const void* MMTK_SIDE_VO_BIT_BASE_ADDRESS; + +// ========================================================================= // +// GC Initialization and Control +// ========================================================================= // + +void jl_gc_init(void) { + // TODO: use jl_options.heap_size_hint to set MMTk's fixed heap size? (see issue: https://github.com/mmtk/mmtk-julia/issues/167) + JL_MUTEX_INIT(&finalizers_lock, "finalizers_lock"); + + arraylist_new(&to_finalize, 0); + arraylist_new(&finalizer_list_marked, 0); + + gc_num.allocd = 0; + gc_num.max_pause = 0; + gc_num.max_memory = 0; + + long long min_heap_size; + long long max_heap_size; + char* min_size_def = getenv("MMTK_MIN_HSIZE"); + char* min_size_gb = getenv("MMTK_MIN_HSIZE_G"); + + char* max_size_def = getenv("MMTK_MAX_HSIZE"); + char* max_size_gb = getenv("MMTK_MAX_HSIZE_G"); + + // default min heap currently set as Julia's default_collect_interval + if (min_size_def != NULL) { + char *p; + double min_size = strtod(min_size_def, &p); + min_heap_size = (long) 1024 * 1024 * min_size; + } else if (min_size_gb != NULL) { + char *p; + double min_size = strtod(min_size_gb, &p); + min_heap_size = (long) 1024 * 1024 * 1024 * min_size; + } else { + min_heap_size = default_collect_interval; + } + + // default max heap currently set as 70% the free memory in the system + if (max_size_def != NULL) { + char *p; + double max_size = strtod(max_size_def, &p); + max_heap_size = (long) 1024 * 1024 * max_size; + } else if (max_size_gb != NULL) { + char *p; + double max_size = strtod(max_size_gb, &p); + max_heap_size = (long) 1024 * 1024 * 1024 * max_size; + } else { + max_heap_size = uv_get_free_memory() * 70 / 100; + } + + // Assert that the number of stock GC threads is 0; MMTK uses the number of threads in jl_options.ngcthreads + assert(jl_n_gcthreads == 0); + + // Check that the julia_copy_stack rust feature has been defined when the COPY_STACK has been defined + int copy_stacks; + +#ifdef COPY_STACKS + copy_stacks = 1; +#else + copy_stacks = 0; +#endif + + mmtk_julia_copy_stack_check(copy_stacks); + + // if only max size is specified initialize MMTk with a fixed size heap + // TODO: We just assume mark threads means GC threads, and ignore the number of concurrent sweep threads. + // If the two values are the same, we can use either. Otherwise, we need to be careful. + uintptr_t gcthreads = jl_options.nmarkthreads; + if (max_size_def != NULL || (max_size_gb != NULL && (min_size_def == NULL && min_size_gb == NULL))) { + mmtk_gc_init(0, max_heap_size, gcthreads, (sizeof(jl_taggedvalue_t)), jl_buff_tag); + } else { + mmtk_gc_init(min_heap_size, max_heap_size, gcthreads, (sizeof(jl_taggedvalue_t)), jl_buff_tag); + } +} + +void jl_start_gc_threads(void) { + jl_ptls_t ptls = jl_current_task->ptls; + mmtk_initialize_collection((void *)ptls); +} + +void jl_init_thread_heap(struct _jl_tls_states_t *ptls) JL_NOTSAFEPOINT { + jl_thread_heap_common_t *heap = &ptls->gc_tls_common.heap; + small_arraylist_new(&heap->weak_refs, 0); + small_arraylist_new(&heap->live_tasks, 0); + for (int i = 0; i < JL_N_STACK_POOLS; i++) + small_arraylist_new(&heap->free_stacks[i], 0); + small_arraylist_new(&heap->mallocarrays, 0); + arraylist_new(&ptls->finalizers, 0); + // Initialize `lazily_freed_mtarraylist_buffers` + small_arraylist_new(&ptls->lazily_freed_mtarraylist_buffers, 0); + // Clear the malloc sz count + jl_atomic_store_relaxed(&ptls->gc_tls.malloc_sz_since_last_poll, 0); + // Create mutator + MMTk_Mutator mmtk_mutator = mmtk_bind_mutator((void *)ptls, ptls->tid); + // Copy the mutator to the thread local storage + memcpy(&ptls->gc_tls.mmtk_mutator, mmtk_mutator, sizeof(MMTkMutatorContext)); + // Call post_bind to maintain a list of active mutators and to reclaim the old mutator (which is no longer needed) + mmtk_post_bind_mutator(&ptls->gc_tls.mmtk_mutator, mmtk_mutator); + memset(&ptls->gc_tls_common.gc_num, 0, sizeof(ptls->gc_tls_common.gc_num)); +} + +void jl_free_thread_gc_state(struct _jl_tls_states_t *ptls) { + mmtk_destroy_mutator(&ptls->gc_tls.mmtk_mutator); +} + +JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem) { + // MMTk currently does not allow setting the heap size at runtime +} + +inline void maybe_collect(jl_ptls_t ptls) +{ + // Just do a safe point for general maybe_collect + jl_gc_safepoint_(ptls); +} + +// This is only used for malloc. We need to know if we need to do GC. However, keeping checking with MMTk (mmtk_gc_poll), +// is expensive. So we only check for every few allocations. +static inline void malloc_maybe_collect(jl_ptls_t ptls, size_t sz) +{ + // We do not need to carefully maintain malloc_sz_since_last_poll. We just need to + // avoid using mmtk_gc_poll too frequently, and try to be precise on our heap usage + // as much as we can. + if (ptls->gc_tls.malloc_sz_since_last_poll > 4096) { + jl_atomic_store_relaxed(&ptls->gc_tls.malloc_sz_since_last_poll, 0); + mmtk_gc_poll(ptls); + } else { + size_t curr = jl_atomic_load_relaxed(&ptls->gc_tls.malloc_sz_since_last_poll); + jl_atomic_store_relaxed(&ptls->gc_tls.malloc_sz_since_last_poll, curr + sz); + jl_gc_safepoint_(ptls); + } +} + +// This is called when the user calls for a GC with Gc.gc() +JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) { + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + if (jl_atomic_load_acquire(&jl_gc_disable_counter)) { + size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + gc_num.interval; + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, -(int64_t)gc_num.interval); + static_assert(sizeof(_Atomic(uint64_t)) == sizeof(gc_num.deferred_alloc), ""); + jl_atomic_fetch_add_relaxed((_Atomic(uint64_t)*)&gc_num.deferred_alloc, localbytes); + return; + } + mmtk_handle_user_collection_request(ptls, collection); +} + + +// Based on jl_gc_collect from gc-stock.c +// called when stopping the thread in `mmtk_block_for_gc` +JL_DLLEXPORT void jl_gc_prepare_to_collect(void) +{ + // FIXME: set to JL_GC_AUTO since we're calling it from mmtk + // maybe just remove this? + JL_PROBE_GC_BEGIN(JL_GC_AUTO); + + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + if (jl_atomic_load_acquire(&jl_gc_disable_counter)) { + size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + gc_num.interval; + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, -(int64_t)gc_num.interval); + static_assert(sizeof(_Atomic(uint64_t)) == sizeof(gc_num.deferred_alloc), ""); + jl_atomic_fetch_add_relaxed((_Atomic(uint64_t)*)&gc_num.deferred_alloc, localbytes); + return; + } + + int8_t old_state = jl_atomic_load_relaxed(&ptls->gc_state); + jl_atomic_store_release(&ptls->gc_state, JL_GC_STATE_WAITING); + // `jl_safepoint_start_gc()` makes sure only one thread can run the GC. + uint64_t t0 = jl_hrtime(); + if (!jl_safepoint_start_gc(ct)) { + jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING); + jl_safepoint_wait_thread_resume(ct); // block in thread-suspend now if requested, after clearing the gc_state + return; + } + + JL_TIMING_SUSPEND_TASK(GC, ct); + JL_TIMING(GC, GC); + + int last_errno = errno; +#ifdef _OS_WINDOWS_ + DWORD last_error = GetLastError(); +#endif + // Now we are ready to wait for other threads to hit the safepoint, + // we can do a few things that doesn't require synchronization. + // + // We must sync here with the tls_lock operations, so that we have a + // seq-cst order between these events now we know that either the new + // thread must run into our safepoint flag or we must observe the + // existence of the thread in the jl_n_threads count. + // + // TODO: concurrently queue objects + jl_fence(); + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + jl_gc_wait_for_the_world(gc_all_tls_states, gc_n_threads); + JL_PROBE_GC_STOP_THE_WORLD(); + + uint64_t t1 = jl_hrtime(); + uint64_t duration = t1 - t0; + if (duration > gc_num.max_time_to_safepoint) + gc_num.max_time_to_safepoint = duration; + gc_num.time_to_safepoint = duration; + gc_num.total_time_to_safepoint += duration; + + if (!jl_atomic_load_acquire(&jl_gc_disable_counter)) { + JL_LOCK_NOGC(&finalizers_lock); // all the other threads are stopped, so this does not make sense, right? otherwise, failing that, this seems like plausibly a deadlock +#ifndef __clang_gcanalyzer__ + mmtk_block_thread_for_gc(); +#endif + JL_UNLOCK_NOGC(&finalizers_lock); + } + + gc_n_threads = 0; + gc_all_tls_states = NULL; + jl_safepoint_end_gc(); + jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING); + JL_PROBE_GC_END(); + jl_safepoint_wait_thread_resume(ct); // block in thread-suspend now if requested, after clearing the gc_state + + // Only disable finalizers on current thread + // Doing this on all threads is racy (it's impossible to check + // or wait for finalizers on other threads without dead lock). + if (!ptls->finalizers_inhibited && ptls->locks.len == 0) { + JL_TIMING(GC, GC_Finalizers); + run_finalizers(ct, 0); + } + JL_PROBE_GC_FINALIZER(); + +#ifdef _OS_WINDOWS_ + SetLastError(last_error); +#endif + errno = last_errno; +} + +// ========================================================================= // +// GC Statistics +// ========================================================================= // + +JL_DLLEXPORT const char* jl_gc_active_impl(void) { + const char* mmtk_version = get_mmtk_version(); + return mmtk_version; +} + +int64_t last_gc_total_bytes = 0; +int64_t last_live_bytes = 0; // live_bytes at last collection +int64_t live_bytes = 0; + +// FIXME: The functions combine_thread_gc_counts and reset_thread_gc_counts +// are currently nearly identical for mmtk and for stock. However, the stats +// are likely different (e.g., MMTk doesn't track the bytes allocated in the fastpath, +// but only when the slowpath is called). We might need to adapt these later so that +// the statistics are the same or as close as possible for each GC. +static void combine_thread_gc_counts(jl_gc_num_t *dest, int update_heap) JL_NOTSAFEPOINT +{ + int gc_n_threads; + jl_ptls_t* gc_all_tls_states; + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls = gc_all_tls_states[i]; + if (ptls) { + dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + gc_num.interval); + dest->malloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc); + dest->realloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.realloc); + dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.poolalloc); + dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.bigalloc); + dest->freed += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.free_acc); + if (update_heap) { + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.free_acc, 0); + } + } + } +} + +void reset_thread_gc_counts(void) JL_NOTSAFEPOINT +{ + int gc_n_threads; + jl_ptls_t* gc_all_tls_states; + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls = gc_all_tls_states[i]; + if (ptls != NULL) { + // don't reset `pool_live_bytes` here + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, -(int64_t)gc_num.interval); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.realloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.poolalloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.bigalloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.free_acc, 0); + } + } +} + +// Retrieves Julia's `GC_Num` (structure that stores GC statistics). +JL_DLLEXPORT jl_gc_num_t jl_gc_num(void) { + jl_gc_num_t num = gc_num; + combine_thread_gc_counts(&num, 0); + return num; +} + +JL_DLLEXPORT int64_t jl_gc_diff_total_bytes(void) JL_NOTSAFEPOINT { + int64_t oldtb = last_gc_total_bytes; + int64_t newtb; + jl_gc_get_total_bytes(&newtb); + last_gc_total_bytes = newtb; + return newtb - oldtb; +} + +JL_DLLEXPORT int64_t jl_gc_sync_total_bytes(int64_t offset) JL_NOTSAFEPOINT +{ + int64_t oldtb = last_gc_total_bytes; + int64_t newtb; + jl_gc_get_total_bytes(&newtb); + last_gc_total_bytes = newtb - offset; + return newtb - oldtb; +} + +JL_DLLEXPORT int64_t jl_gc_pool_live_bytes(void) { + return 0; +} + +void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT +{ + jl_ptls_t ptls = jl_current_task->ptls; + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + sz); +} + +void jl_gc_count_freed(size_t sz) JL_NOTSAFEPOINT +{ +} + +int64_t inc_live_bytes(int64_t inc) JL_NOTSAFEPOINT +{ + jl_timing_counter_inc(JL_TIMING_COUNTER_HeapSize, inc); + return live_bytes += inc; +} + +void jl_gc_reset_alloc_count(void) JL_NOTSAFEPOINT +{ + combine_thread_gc_counts(&gc_num, 0); + inc_live_bytes(gc_num.deferred_alloc + gc_num.allocd); + gc_num.allocd = 0; + gc_num.deferred_alloc = 0; + reset_thread_gc_counts(); +} + +JL_DLLEXPORT int64_t jl_gc_live_bytes(void) { + return last_live_bytes; +} + +JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT +{ + jl_gc_num_t num = gc_num; + combine_thread_gc_counts(&num, 0); + // Sync this logic with `base/util.jl:GC_Diff` + *bytes = (num.total_allocd + num.deferred_alloc + num.allocd); +} + +JL_DLLEXPORT uint64_t jl_gc_get_max_memory(void) +{ + // FIXME: should probably return MMTk's heap size + return max_total_memory; +} + +// These are needed to collect MMTk statistics from a Julia program using ccall +JL_DLLEXPORT void (jl_mmtk_harness_begin)(void) +{ + jl_ptls_t ptls = jl_current_task->ptls; + mmtk_harness_begin(ptls); +} + +JL_DLLEXPORT void (jl_mmtk_harness_end)(void) +{ + mmtk_harness_end(); +} + +// ========================================================================= // +// Root Processing, Object Scanning and Julia-specific sweeping +// ========================================================================= // + +static void add_node_to_roots_buffer(RootsWorkClosure* closure, RootsWorkBuffer* buf, size_t* buf_len, void* root) { + if (root == NULL) + return; + + buf->ptr[*buf_len] = root; + *buf_len += 1; + if (*buf_len >= buf->cap) { + RootsWorkBuffer new_buf = (closure->report_nodes_func)(buf->ptr, *buf_len, buf->cap, closure->data, true); + *buf = new_buf; + *buf_len = 0; + } +} + +static void add_node_to_tpinned_roots_buffer(RootsWorkClosure* closure, RootsWorkBuffer* buf, size_t* buf_len, void* root) { + if (root == NULL) + return; + + buf->ptr[*buf_len] = root; + *buf_len += 1; + if (*buf_len >= buf->cap) { + RootsWorkBuffer new_buf = (closure->report_tpinned_nodes_func)(buf->ptr, *buf_len, buf->cap, closure->data, true); + *buf = new_buf; + *buf_len = 0; + } +} + +JL_DLLEXPORT void jl_gc_scan_vm_specific_roots(RootsWorkClosure* closure) +{ + // Create a new buf + RootsWorkBuffer buf = (closure->report_nodes_func)((void**)0, 0, 0, closure->data, true); + size_t len = 0; + + // add module + add_node_to_roots_buffer(closure, &buf, &len, jl_main_module); + + // buildin values + add_node_to_roots_buffer(closure, &buf, &len, jl_an_empty_vec_any); + add_node_to_roots_buffer(closure, &buf, &len, jl_module_init_order); + for (size_t i = 0; i < jl_current_modules.size; i += 2) { + if (jl_current_modules.table[i + 1] != HT_NOTFOUND) { + add_node_to_roots_buffer(closure, &buf, &len, jl_current_modules.table[i]); + } + } + add_node_to_roots_buffer(closure, &buf, &len, jl_anytuple_type_type); + for (size_t i = 0; i < N_CALL_CACHE; i++) { + jl_typemap_entry_t *v = jl_atomic_load_relaxed(&call_cache[i]); + add_node_to_roots_buffer(closure, &buf, &len, v); + } + add_node_to_roots_buffer(closure, &buf, &len, _jl_debug_method_invalidation); + + // constants + add_node_to_roots_buffer(closure, &buf, &len, jl_emptytuple_type); + add_node_to_roots_buffer(closure, &buf, &len, cmpswap_names); + + // jl_global_roots_table must be transitively pinned + RootsWorkBuffer tpinned_buf = (closure->report_tpinned_nodes_func)((void**)0, 0, 0, closure->data, true); + size_t tpinned_len = 0; + add_node_to_tpinned_roots_buffer(closure, &tpinned_buf, &tpinned_len, jl_global_roots_list); + add_node_to_tpinned_roots_buffer(closure, &tpinned_buf, &tpinned_len, jl_global_roots_keyset); + + // Push the result of the work. + (closure->report_nodes_func)(buf.ptr, len, buf.cap, closure->data, false); + (closure->report_tpinned_nodes_func)(tpinned_buf.ptr, tpinned_len, tpinned_buf.cap, closure->data, false); +} + +JL_DLLEXPORT void jl_gc_scan_julia_exc_obj(void* obj_raw, void* closure, ProcessSlotFn process_slot) { + jl_task_t *ta = (jl_task_t*)obj_raw; + + if (ta->excstack) { // inlining label `excstack` from mark_loop + + // the excstack should always be a heap object + assert(mmtk_object_is_managed_by_mmtk(ta->excstack)); + + process_slot(closure, &ta->excstack); + jl_excstack_t *excstack = ta->excstack; + size_t itr = ta->excstack->top; + size_t bt_index = 0; + size_t jlval_index = 0; + while (itr > 0) { + size_t bt_size = jl_excstack_bt_size(excstack, itr); + jl_bt_element_t *bt_data = jl_excstack_bt_data(excstack, itr); + for (; bt_index < bt_size; bt_index += jl_bt_entry_size(bt_data + bt_index)) { + jl_bt_element_t *bt_entry = bt_data + bt_index; + if (jl_bt_is_native(bt_entry)) + continue; + // Found an extended backtrace entry: iterate over any + // GC-managed values inside. + size_t njlvals = jl_bt_num_jlvals(bt_entry); + while (jlval_index < njlvals) { + jl_value_t** new_obj_slot = &bt_entry[2 + jlval_index].jlvalue; + jlval_index += 1; + process_slot(closure, new_obj_slot); + } + jlval_index = 0; + } + + jl_bt_element_t *stack_raw = (jl_bt_element_t *)(excstack+1); + jl_value_t** stack_obj_slot = &stack_raw[itr-1].jlvalue; + + itr = jl_excstack_next(excstack, itr); + bt_index = 0; + jlval_index = 0; + process_slot(closure, stack_obj_slot); + } + } +} + +// This is used in mmtk_sweep_malloced_memory and it is slightly different +// from jl_gc_free_memory from gc-stock.c as the stock GC updates the +// information in the global variable gc_heap_stats (which is specific to the stock GC) +static void jl_gc_free_memory(jl_genericmemory_t *m, int isaligned) JL_NOTSAFEPOINT +{ + assert(jl_is_genericmemory(m)); + assert(jl_genericmemory_how(m) == 1 || jl_genericmemory_how(m) == 2); + char *d = (char*)m->ptr; + size_t freed_bytes = memory_block_usable_size(d, isaligned); + assert(freed_bytes != 0); + if (isaligned) + jl_free_aligned(d); + else + free(d); + gc_num.freed += freed_bytes; + gc_num.freecall++; +} + +JL_DLLEXPORT void jl_gc_mmtk_sweep_malloced_memory(void) JL_NOTSAFEPOINT +{ + void* iter = mmtk_new_mutator_iterator(); + jl_ptls_t ptls2 = (jl_ptls_t)mmtk_get_next_mutator_tls(iter); + while(ptls2 != NULL) { + size_t n = 0; + size_t l = ptls2->gc_tls_common.heap.mallocarrays.len; + void **lst = ptls2->gc_tls_common.heap.mallocarrays.items; + // filter without preserving order + while (n < l) { + jl_genericmemory_t *m = (jl_genericmemory_t*)((uintptr_t)lst[n] & ~1); + if (mmtk_is_live_object(m)) { + n++; + } + else { + int isaligned = (uintptr_t)lst[n] & 1; + jl_gc_free_memory(m, isaligned); + l--; + lst[n] = lst[l]; + } + } + ptls2->gc_tls_common.heap.mallocarrays.len = l; + ptls2 = (jl_ptls_t)mmtk_get_next_mutator_tls(iter); + } + mmtk_close_mutator_iterator(iter); +} + +#define jl_genericmemory_elsize(a) (((jl_datatype_t*)jl_typetagof(a))->layout->size) + +// if data is inlined inside the genericmemory object --- to->ptr needs to be updated when copying the array +JL_DLLEXPORT void jl_gc_update_inlined_array(void* from, void* to) { + jl_value_t* jl_from = (jl_value_t*) from; + jl_value_t* jl_to = (jl_value_t*) to; + + uintptr_t tag_to = (uintptr_t)jl_typeof(jl_to); + jl_datatype_t *vt = (jl_datatype_t*)tag_to; + + if(vt->name == jl_genericmemory_typename) { + jl_genericmemory_t *a = (jl_genericmemory_t*)jl_from; + jl_genericmemory_t *b = (jl_genericmemory_t*)jl_to; + int how = jl_genericmemory_how(a); + + if (how == 0 && mmtk_object_is_managed_by_mmtk(a->ptr)) { // a is inlined (a->ptr points into the mmtk object) + size_t offset_of_data = ((size_t)a->ptr - (size_t)a); + if (offset_of_data > 0) { + b->ptr = (void*)((size_t) b + offset_of_data); + } + } + } +} + +// modified sweep_stack_pools from gc-stacks.c +JL_DLLEXPORT void jl_gc_mmtk_sweep_stack_pools(void) +{ + // Stack sweeping algorithm: + // // deallocate stacks if we have too many sitting around unused + // for (stk in halfof(free_stacks)) + // free_stack(stk, pool_sz); + // // then sweep the task stacks + // for (t in live_tasks) + // if (!gc-marked(t)) + // stkbuf = t->stkbuf + // bufsz = t->bufsz + // if (stkbuf) + // push(free_stacks[sz], stkbuf) + assert(gc_n_threads); + for (int i = 0; i < jl_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) + continue; + + // free half of stacks that remain unused since last sweep + for (int p = 0; p < JL_N_STACK_POOLS; p++) { + small_arraylist_t *al = &ptls2->gc_tls_common.heap.free_stacks[p]; + size_t n_to_free; + if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { + n_to_free = al->len; // not alive yet or dead, so it does not need these anymore + } + else if (al->len > MIN_STACK_MAPPINGS_PER_POOL) { + n_to_free = al->len / 2; + if (n_to_free > (al->len - MIN_STACK_MAPPINGS_PER_POOL)) + n_to_free = al->len - MIN_STACK_MAPPINGS_PER_POOL; + } + else { + n_to_free = 0; + } + for (int n = 0; n < n_to_free; n++) { + void *stk = small_arraylist_pop(al); + free_stack(stk, pool_sizes[p]); + } + if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { + small_arraylist_free(al); + } + } + if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { + small_arraylist_free(ptls2->gc_tls_common.heap.free_stacks); + } + + small_arraylist_t *live_tasks = &ptls2->gc_tls_common.heap.live_tasks; + size_t n = 0; + size_t ndel = 0; + size_t l = live_tasks->len; + void **lst = live_tasks->items; + if (l == 0) + continue; + while (1) { + jl_task_t *t = (jl_task_t*)lst[n]; + if (mmtk_is_live_object(t)) { + jl_task_t *maybe_forwarded = (jl_task_t*)mmtk_get_possibly_forwarded(t); + live_tasks->items[n] = maybe_forwarded; + t = maybe_forwarded; + assert(jl_is_task(t)); + if (t->ctx.stkbuf == NULL) + ndel++; // jl_release_task_stack called + else + n++; + } else { + ndel++; + void *stkbuf = t->ctx.stkbuf; + size_t bufsz = t->ctx.bufsz; + if (stkbuf) { + t->ctx.stkbuf = NULL; + _jl_free_stack(ptls2, stkbuf, bufsz); + } +#ifdef _COMPILER_TSAN_ENABLED_ + if (t->ctx.tsan_state) { + __tsan_destroy_fiber(t->ctx.tsan_state); + t->ctx.tsan_state = NULL; + } +#endif + } + if (n >= l - ndel) + break; + void *tmp = lst[n]; + lst[n] = lst[n + ndel]; + lst[n + ndel] = tmp; + } + live_tasks->len -= ndel; + } +} + +JL_DLLEXPORT void jl_gc_sweep_stack_pools_and_mtarraylist_buffers(jl_ptls_t ptls) JL_NOTSAFEPOINT +{ + jl_gc_mmtk_sweep_stack_pools(); + sweep_mtarraylist_buffers(); +} + +JL_DLLEXPORT void* jl_gc_get_stackbase(int16_t tid) { + assert(tid >= 0); + jl_ptls_t ptls2 = jl_all_tls_states[tid]; + return ptls2->stackbase; +} + +JL_DLLEXPORT void jl_gc_update_stats(uint64_t inc, size_t mmtk_live_bytes, bool is_nursery_gc) { + gc_num.total_time += inc; + gc_num.pause += 1; + gc_num.full_sweep += !(is_nursery_gc); + gc_num.total_allocd += gc_num.allocd; + gc_num.allocd = 0; + live_bytes = mmtk_live_bytes; +} + +#define jl_genericmemory_data_owner_field_addr(a) ((jl_value_t**)((jl_genericmemory_t*)(a) + 1)) + +JL_DLLEXPORT void* jl_gc_get_owner_address_to_mmtk(void* m) { + return (void*)jl_genericmemory_data_owner_field_addr(m); +} + +// same as jl_genericmemory_how but with JL_DLLEXPORT +// we should probably inline this in Rust +JL_DLLEXPORT size_t jl_gc_genericmemory_how(void *arg) JL_NOTSAFEPOINT +{ + jl_genericmemory_t* m = (jl_genericmemory_t*)arg; + if (m->ptr == (void*)((char*)m + 16)) // JL_SMALL_BYTE_ALIGNMENT (from julia_internal.h) + return 0; + jl_value_t *owner = jl_genericmemory_data_owner_field(m); + if (owner == (jl_value_t*)m) + return 1; + if (owner == NULL) + return 2; + return 3; +} + +// ========================================================================= // +// Weak References and Finalizers +// ========================================================================= // + +JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, jl_value_t *value) +{ + jl_weakref_t *wr = (jl_weakref_t*)jl_gc_alloc(ptls, sizeof(void*), jl_weakref_type); + wr->value = value; // NOTE: wb not needed here + mmtk_add_weak_candidate(wr); + return wr; +} + +JL_DLLEXPORT void* jl_gc_get_thread_finalizer_list(void* ptls_raw) { + jl_ptls_t ptls = (jl_ptls_t) ptls_raw; + return (void*)&ptls->finalizers; +} + +JL_DLLEXPORT void* jl_gc_get_to_finalize_list(void) { + return (void*)&to_finalize; +} + +JL_DLLEXPORT void* jl_gc_get_marked_finalizers_list(void) { + return (void*)&finalizer_list_marked; +} + +JL_DLLEXPORT int* jl_gc_get_have_pending_finalizers(void) { + return (int*)&jl_gc_have_pending_finalizers; +} + +// ========================================================================= // +// Allocation +// ========================================================================= // + +#define MMTK_DEFAULT_IMMIX_ALLOCATOR (0) +#define MMTK_IMMORTAL_BUMP_ALLOCATOR (0) + +int jl_gc_classify_pools(size_t sz, int *osize) +{ + if (sz > GC_MAX_SZCLASS) + return -1; // call big alloc function + size_t allocsz = sz + sizeof(jl_taggedvalue_t); + *osize = LLT_ALIGN(allocsz, 16); + return 0; // use MMTk's fastpath logic +} + +#define MMTK_MIN_ALIGNMENT 4 +// MMTk assumes allocation size is aligned to min alignment. +inline size_t mmtk_align_alloc_sz(size_t sz) JL_NOTSAFEPOINT +{ + return (sz + MMTK_MIN_ALIGNMENT - 1) & ~(MMTK_MIN_ALIGNMENT - 1); +} + +inline void* bump_alloc_fast(MMTkMutatorContext* mutator, uintptr_t* cursor, uintptr_t limit, size_t size, size_t align, size_t offset, int allocator) { + intptr_t delta = (-offset - *cursor) & (align - 1); + uintptr_t result = *cursor + (uintptr_t)delta; + + if (__unlikely(result + size > limit)) { + return (void*) mmtk_alloc(mutator, size, align, offset, allocator); + } else{ + *cursor = result + size; + return (void*)result; + } +} + +inline void* mmtk_immix_alloc_fast(MMTkMutatorContext* mutator, size_t size, size_t align, size_t offset) { + ImmixAllocator* allocator = &mutator->allocators.immix[MMTK_DEFAULT_IMMIX_ALLOCATOR]; + return bump_alloc_fast(mutator, (uintptr_t*)&allocator->cursor, (intptr_t)allocator->limit, size, align, offset, 0); +} + +inline void mmtk_immix_post_alloc_slow(MMTkMutatorContext* mutator, void* obj, size_t size) { + mmtk_post_alloc(mutator, obj, size, 0); +} + +inline void mmtk_immix_post_alloc_fast(MMTkMutatorContext* mutator, void* obj, size_t size) { + // FIXME: for now, we do nothing + // but when supporting moving, this is where we set the valid object (VO) bit +} + +inline void* mmtk_immortal_alloc_fast(MMTkMutatorContext* mutator, size_t size, size_t align, size_t offset) { + BumpAllocator* allocator = &mutator->allocators.bump_pointer[MMTK_IMMORTAL_BUMP_ALLOCATOR]; + return bump_alloc_fast(mutator, (uintptr_t*)&allocator->cursor, (uintptr_t)allocator->limit, size, align, offset, 1); +} + +inline void mmtk_immortal_post_alloc_fast(MMTkMutatorContext* mutator, void* obj, size_t size) { + // FIXME: Similarly, for now, we do nothing + // but when supporting moving, this is where we set the valid object (VO) bit + // and log (old gen) bit +} + +JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int osize, size_t align, void *ty) +{ + // safepoint + jl_gc_safepoint_(ptls); + + jl_value_t *v; + if ((uintptr_t)ty != jl_buff_tag) { + // v needs to be 16 byte aligned, therefore v_tagged needs to be offset accordingly to consider the size of header + jl_taggedvalue_t *v_tagged = (jl_taggedvalue_t *)mmtk_immix_alloc_fast(&ptls->gc_tls.mmtk_mutator, LLT_ALIGN(osize, align), align, sizeof(jl_taggedvalue_t)); + v = jl_valueof(v_tagged); + mmtk_immix_post_alloc_fast(&ptls->gc_tls.mmtk_mutator, v, LLT_ALIGN(osize, align)); + } else { + // allocating an extra word to store the size of buffer objects + jl_taggedvalue_t *v_tagged = (jl_taggedvalue_t *)mmtk_immix_alloc_fast(&ptls->gc_tls.mmtk_mutator, LLT_ALIGN(osize+sizeof(jl_taggedvalue_t), align), align, 0); + jl_value_t* v_tagged_aligned = ((jl_value_t*)((char*)(v_tagged) + sizeof(jl_taggedvalue_t))); + v = jl_valueof(v_tagged_aligned); + mmtk_store_obj_size_c(v, LLT_ALIGN(osize+sizeof(jl_taggedvalue_t), align)); + mmtk_immix_post_alloc_fast(&ptls->gc_tls.mmtk_mutator, v, LLT_ALIGN(osize+sizeof(jl_taggedvalue_t), align)); + } + + ptls->gc_tls_common.gc_num.allocd += osize; + ptls->gc_tls_common.gc_num.poolalloc++; + + return v; +} + +JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_big(jl_ptls_t ptls, size_t sz) +{ + // safepoint + jl_gc_safepoint_(ptls); + + size_t offs = offsetof(bigval_t, header); + assert(sz >= sizeof(jl_taggedvalue_t) && "sz must include tag"); + static_assert(offsetof(bigval_t, header) >= sizeof(void*), "Empty bigval header?"); + static_assert(sizeof(bigval_t) % JL_HEAP_ALIGNMENT == 0, ""); + size_t allocsz = LLT_ALIGN(sz + offs, JL_CACHE_BYTE_ALIGNMENT); + if (allocsz < sz) { // overflow in adding offs, size was "negative" + assert(0 && "Error when allocating big object"); + jl_throw(jl_memory_exception); + } + + bigval_t *v = (bigval_t*)mmtk_alloc_large(&ptls->gc_tls.mmtk_mutator, allocsz, JL_CACHE_BYTE_ALIGNMENT, 0, 2); + + if (v == NULL) { + assert(0 && "Allocation failed"); + jl_throw(jl_memory_exception); + } + v->sz = allocsz; + + ptls->gc_tls_common.gc_num.allocd += allocsz; + ptls->gc_tls_common.gc_num.bigalloc++; + + jl_value_t *result = jl_valueof(&v->header); + mmtk_post_alloc(&ptls->gc_tls.mmtk_mutator, result, allocsz, 2); + + return result; +} + +// Instrumented version of jl_gc_small_alloc_inner, called into by LLVM-generated code. +JL_DLLEXPORT jl_value_t *jl_gc_small_alloc(jl_ptls_t ptls, int offset, int osize, jl_value_t* type) +{ + assert(jl_atomic_load_relaxed(&ptls->gc_state) == 0); + + jl_value_t *val = jl_mmtk_gc_alloc_default(ptls, osize, 16, NULL); + maybe_record_alloc_to_profile(val, osize, (jl_datatype_t*)type); + return val; +} + +// Instrumented version of jl_gc_big_alloc_inner, called into by LLVM-generated code. +JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz, jl_value_t *type) +{ + // TODO: assertion needed here? + assert(jl_atomic_load_relaxed(&ptls->gc_state) == 0); + + jl_value_t *val = jl_mmtk_gc_alloc_big(ptls, sz); + maybe_record_alloc_to_profile(val, sz, (jl_datatype_t*)type); + return val; +} + +inline jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) +{ + jl_value_t *v; + const size_t allocsz = sz + sizeof(jl_taggedvalue_t); + if (sz <= GC_MAX_SZCLASS) { + v = jl_mmtk_gc_alloc_default(ptls, allocsz, 16, ty); + } + else { + if (allocsz < sz) // overflow in adding offs, size was "negative" + jl_throw(jl_memory_exception); + v = jl_mmtk_gc_alloc_big(ptls, allocsz); + } + jl_set_typeof(v, ty); + maybe_record_alloc_to_profile(v, sz, (jl_datatype_t*)ty); + return v; +} + +// allocation wrappers that track allocation and let collection run +JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) +{ + jl_gcframe_t **pgcstack = jl_get_pgcstack(); + jl_task_t *ct = jl_current_task; + void *data = malloc(sz); + if (data != NULL && pgcstack != NULL && ct->world_age) { + jl_ptls_t ptls = ct->ptls; + malloc_maybe_collect(ptls, sz); + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, sz); + } + return data; +} + +JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) +{ + jl_gcframe_t **pgcstack = jl_get_pgcstack(); + jl_task_t *ct = jl_current_task; + void *data = calloc(nm, sz); + if (data != NULL && pgcstack != NULL && ct->world_age) { + jl_ptls_t ptls = ct->ptls; + malloc_maybe_collect(ptls, nm * sz); + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, nm * sz); + } + return data; +} + +JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) +{ + jl_gcframe_t **pgcstack = jl_get_pgcstack(); + jl_task_t *ct = jl_current_task; + free(p); + if (pgcstack != NULL && ct->world_age) { + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, -sz); + } +} + +JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz) +{ + jl_gcframe_t **pgcstack = jl_get_pgcstack(); + jl_task_t *ct = jl_current_task; + if (pgcstack && ct->world_age) { + jl_ptls_t ptls = ct->ptls; + malloc_maybe_collect(ptls, sz); + if (sz < old) + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, old - sz); + else + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, sz - old); + } + return realloc(p, sz); +} + +void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offset) +{ + jl_ptls_t ptls = jl_current_task->ptls; + size_t allocsz = mmtk_align_alloc_sz(sz); + void* addr = mmtk_immortal_alloc_fast(&ptls->gc_tls.mmtk_mutator, allocsz, align, offset); + return addr; +} + +void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, unsigned offset) +{ + return jl_gc_perm_alloc_nolock(sz, zero, align, offset); +} + +jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT +{ + const size_t allocsz = sz + sizeof(jl_taggedvalue_t); + unsigned align = (sz == 0 ? sizeof(void*) : (allocsz <= sizeof(void*) * 2 ? + sizeof(void*) * 2 : 16)); + jl_taggedvalue_t *o = (jl_taggedvalue_t*)jl_gc_perm_alloc(allocsz, 0, align, + sizeof(void*) % align); + + jl_ptls_t ptls = jl_current_task->ptls; + mmtk_immortal_post_alloc_fast(&ptls->gc_tls.mmtk_mutator, jl_valueof(o), allocsz); + o->header = (uintptr_t)ty; + return jl_valueof(o); +} + +JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) +{ + jl_ptls_t ptls = jl_current_task->ptls; + maybe_collect(ptls); + size_t allocsz = LLT_ALIGN(sz, JL_CACHE_BYTE_ALIGNMENT); + if (allocsz < sz) // overflow in adding offs, size was "negative" + jl_throw(jl_memory_exception); + + int last_errno = errno; +#ifdef _OS_WINDOWS_ + DWORD last_error = GetLastError(); +#endif + void *b = malloc_cache_align(allocsz); + if (b == NULL) + jl_throw(jl_memory_exception); + + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + allocsz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc) + 1); + // FIXME: Should these be part of mmtk's heap? + // malloc_maybe_collect(ptls, sz); + // jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, allocsz); +#ifdef _OS_WINDOWS_ + SetLastError(last_error); +#endif + errno = last_errno; + // jl_gc_managed_malloc is currently always used for allocating array buffers. + maybe_record_alloc_to_profile((jl_value_t*)b, sz, (jl_datatype_t*)jl_buff_tag); + return b; +} + +void jl_gc_notify_image_load(const char* img_data, size_t len) +{ + mmtk_set_vm_space((void*)img_data, len); +} + +// ========================================================================= // +// Code specific to stock that is not supported by MMTk +// ========================================================================= // + +// mutex for page profile +uv_mutex_t page_profile_lock; + +JL_DLLEXPORT void jl_gc_take_page_profile(ios_t *stream) +{ + uv_mutex_lock(&page_profile_lock); + const char *str = "Page profiler in unsupported in MMTk."; + ios_write(stream, str, strlen(str)); + uv_mutex_unlock(&page_profile_lock); +} + +// this seems to be needed by the gc tests +#define JL_GC_N_MAX_POOLS 51 +JL_DLLEXPORT double jl_gc_page_utilization_stats[JL_GC_N_MAX_POOLS]; + +STATIC_INLINE void gc_dump_page_utilization_data(void) JL_NOTSAFEPOINT +{ + // FIXME: MMTk would have to provide its own stats +} + +#define MMTK_GC_PAGE_SZ (1 << 12) // MMTk's page size is defined in mmtk-core constants + +JL_DLLEXPORT uint64_t jl_get_pg_size(void) +{ + return MMTK_GC_PAGE_SZ; +} + +// Not used by mmtk +// Number of GC threads that may run parallel marking +int jl_n_markthreads; +// Number of GC threads that may run concurrent sweeping (0 or 1) +int jl_n_sweepthreads; +// `tid` of first GC thread +int gc_first_tid; +// Number of threads sweeping stacks +_Atomic(int) gc_n_threads_sweeping_stacks; +// counter for sharing work when sweeping stacks +_Atomic(int) gc_ptls_sweep_idx; +// counter for round robin of giving back stack pages to the OS +_Atomic(int) gc_stack_free_idx = 0; + +JL_DLLEXPORT void jl_gc_queue_root(const struct _jl_value_t *ptr) JL_NOTSAFEPOINT +{ + mmtk_unreachable(); +} + +JL_DLLEXPORT void jl_gc_queue_multiroot(const struct _jl_value_t *root, const void *stored, + struct _jl_datatype_t *dt) JL_NOTSAFEPOINT +{ + mmtk_unreachable(); +} + +JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj) +{ + mmtk_unreachable(); + return 0; +} + +JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent, + jl_value_t **objs, size_t nobjs) +{ + mmtk_unreachable(); +} + +JL_DLLEXPORT size_t jl_gc_max_internal_obj_size(void) +{ + // TODO: meaningful for MMTk? + return GC_MAX_SZCLASS; +} + +JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) +{ + // FIXME: do we need to implement this? +} + +// gc-debug functions +JL_DLLEXPORT jl_taggedvalue_t *jl_gc_find_taggedvalue_pool(char *p, size_t *osize_p) +{ + return NULL; +} + +void jl_gc_debug_critical_error(void) JL_NOTSAFEPOINT +{ +} + +int gc_is_collector_thread(int tid) JL_NOTSAFEPOINT +{ + return 0; +} + +void jl_gc_debug_print_status(void) JL_NOTSAFEPOINT +{ + // May not be accurate but should be helpful enough + uint64_t pool_count = gc_num.poolalloc; + uint64_t big_count = gc_num.bigalloc; + jl_safe_printf("Allocations: %" PRIu64 " " + "(Pool: %" PRIu64 "; Big: %" PRIu64 "); GC: %d\n", + pool_count + big_count, pool_count, big_count, gc_num.pause); +} + +JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) +{ + return sizeof(bigval_t); +} + +void jl_print_gc_stats(JL_STREAM *s) +{ +} + +JL_DLLEXPORT int jl_gc_enable_conservative_gc_support(void) +{ + return 0; +} + +JL_DLLEXPORT int jl_gc_conservative_gc_support_enabled(void) +{ + return 0; +} + +// TODO: if this is needed, it can be added in MMTk +JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p) +{ + return NULL; +} + +#ifdef __cplusplus +} +#endif diff --git a/src/gc-stacks.c b/src/gc-stacks.c index a0ca2561c5cf9..9387c7fb065ec 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -1,7 +1,6 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license #include "gc-common.h" -#include "gc-stock.h" #include "threading.h" #ifndef _OS_WINDOWS_ # include @@ -21,9 +20,6 @@ # endif #endif -// number of stacks to always keep available per pool -#define MIN_STACK_MAPPINGS_PER_POOL 5 - const size_t jl_guard_size = (4096 * 8); static _Atomic(uint32_t) num_stack_mappings = 0; @@ -203,99 +199,6 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO return stk; } -void sweep_stack_pool_loop(void) JL_NOTSAFEPOINT -{ - // Stack sweeping algorithm: - // // deallocate stacks if we have too many sitting around unused - // for (stk in halfof(free_stacks)) - // free_stack(stk, pool_sz); - // // then sweep the task stacks - // for (t in live_tasks) - // if (!gc-marked(t)) - // stkbuf = t->stkbuf - // bufsz = t->bufsz - // if (stkbuf) - // push(free_stacks[sz], stkbuf) - jl_atomic_fetch_add(&gc_n_threads_sweeping_stacks, 1); - while (1) { - int i = jl_atomic_fetch_add_relaxed(&gc_ptls_sweep_idx, -1); - if (i < 0) - break; - jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 == NULL) - continue; - assert(gc_n_threads); - // free half of stacks that remain unused since last sweep - if (i == jl_atomic_load_relaxed(&gc_stack_free_idx)) { - for (int p = 0; p < JL_N_STACK_POOLS; p++) { - small_arraylist_t *al = &ptls2->gc_tls_common.heap.free_stacks[p]; - size_t n_to_free; - if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { - n_to_free = al->len; // not alive yet or dead, so it does not need these anymore - } - else if (al->len > MIN_STACK_MAPPINGS_PER_POOL) { - n_to_free = al->len / 2; - if (n_to_free > (al->len - MIN_STACK_MAPPINGS_PER_POOL)) - n_to_free = al->len - MIN_STACK_MAPPINGS_PER_POOL; - } - else { - n_to_free = 0; - } - for (int n = 0; n < n_to_free; n++) { - void *stk = small_arraylist_pop(al); - free_stack(stk, pool_sizes[p]); - } - if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { - small_arraylist_free(al); - } - } - } - if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { - small_arraylist_free(ptls2->gc_tls_common.heap.free_stacks); - } - - small_arraylist_t *live_tasks = &ptls2->gc_tls_common.heap.live_tasks; - size_t n = 0; - size_t ndel = 0; - size_t l = live_tasks->len; - void **lst = live_tasks->items; - if (l == 0) - continue; - while (1) { - jl_task_t *t = (jl_task_t*)lst[n]; - assert(jl_is_task(t)); - if (gc_marked(jl_astaggedvalue(t)->bits.gc)) { - if (t->ctx.stkbuf == NULL) - ndel++; // jl_release_task_stack called - else - n++; - } - else { - ndel++; - void *stkbuf = t->ctx.stkbuf; - size_t bufsz = t->ctx.bufsz; - if (stkbuf) { - t->ctx.stkbuf = NULL; - _jl_free_stack(ptls2, stkbuf, bufsz); - } -#ifdef _COMPILER_TSAN_ENABLED_ - if (t->ctx.tsan_state) { - __tsan_destroy_fiber(t->ctx.tsan_state); - t->ctx.tsan_state = NULL; - } -#endif - } - if (n >= l - ndel) - break; - void *tmp = lst[n]; - lst[n] = lst[n + ndel]; - lst[n + ndel] = tmp; - } - live_tasks->len -= ndel; - } - jl_atomic_fetch_add(&gc_n_threads_sweeping_stacks, -1); -} - // Builds a list of the live tasks. Racy: `live_tasks` can expand at any time. arraylist_t *jl_get_all_tasks_arraylist(void) JL_NOTSAFEPOINT { diff --git a/src/gc-stock.c b/src/gc-stock.c index c1bc1d64ae199..1690029bb7385 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -1010,22 +1010,102 @@ void gc_sweep_wait_for_all_stacks(void) JL_NOTSAFEPOINT } } -void sweep_mtarraylist_buffers(void) JL_NOTSAFEPOINT -{ - for (int i = 0; i < gc_n_threads; i++) { - jl_ptls_t ptls = gc_all_tls_states[i]; - if (ptls == NULL) { +extern const unsigned pool_sizes[]; + +void sweep_stack_pool_loop(void) JL_NOTSAFEPOINT +{ + // Stack sweeping algorithm: + // // deallocate stacks if we have too many sitting around unused + // for (stk in halfof(free_stacks)) + // free_stack(stk, pool_sz); + // // then sweep the task stacks + // for (t in live_tasks) + // if (!gc-marked(t)) + // stkbuf = t->stkbuf + // bufsz = t->bufsz + // if (stkbuf) + // push(free_stacks[sz], stkbuf) + jl_atomic_fetch_add(&gc_n_threads_sweeping_stacks, 1); + while (1) { + int i = jl_atomic_fetch_add_relaxed(&gc_ptls_sweep_idx, -1); + if (i < 0) + break; + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) continue; + assert(gc_n_threads); + // free half of stacks that remain unused since last sweep + if (i == jl_atomic_load_relaxed(&gc_stack_free_idx)) { + for (int p = 0; p < JL_N_STACK_POOLS; p++) { + small_arraylist_t *al = &ptls2->gc_tls_common.heap.free_stacks[p]; + size_t n_to_free; + if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { + n_to_free = al->len; // not alive yet or dead, so it does not need these anymore + } + else if (al->len > MIN_STACK_MAPPINGS_PER_POOL) { + n_to_free = al->len / 2; + if (n_to_free > (al->len - MIN_STACK_MAPPINGS_PER_POOL)) + n_to_free = al->len - MIN_STACK_MAPPINGS_PER_POOL; + } + else { + n_to_free = 0; + } + for (int n = 0; n < n_to_free; n++) { + void *stk = small_arraylist_pop(al); + free_stack(stk, pool_sizes[p]); + } + if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { + small_arraylist_free(al); + } + } } - small_arraylist_t *buffers = &ptls->lazily_freed_mtarraylist_buffers; - void *buf; - while ((buf = small_arraylist_pop(buffers)) != NULL) { - free(buf); + if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { + small_arraylist_free(ptls2->gc_tls_common.heap.free_stacks); } + + small_arraylist_t *live_tasks = &ptls2->gc_tls_common.heap.live_tasks; + size_t n = 0; + size_t ndel = 0; + size_t l = live_tasks->len; + void **lst = live_tasks->items; + if (l == 0) + continue; + while (1) { + jl_task_t *t = (jl_task_t*)lst[n]; + assert(jl_is_task(t)); + if (gc_marked(jl_astaggedvalue(t)->bits.gc)) { + if (t->ctx.stkbuf == NULL) + ndel++; // jl_release_task_stack called + else + n++; + } + else { + ndel++; + void *stkbuf = t->ctx.stkbuf; + size_t bufsz = t->ctx.bufsz; + if (stkbuf) { + t->ctx.stkbuf = NULL; + _jl_free_stack(ptls2, stkbuf, bufsz); + } +#ifdef _COMPILER_TSAN_ENABLED_ + if (t->ctx.tsan_state) { + __tsan_destroy_fiber(t->ctx.tsan_state); + t->ctx.tsan_state = NULL; + } +#endif + } + if (n >= l - ndel) + break; + void *tmp = lst[n]; + lst[n] = lst[n + ndel]; + lst[n + ndel] = tmp; + } + live_tasks->len -= ndel; } + jl_atomic_fetch_add(&gc_n_threads_sweeping_stacks, -1); } -void sweep_stack_pools_and_mtarraylist_buffers(jl_ptls_t ptls) JL_NOTSAFEPOINT +JL_DLLEXPORT void jl_gc_sweep_stack_pools_and_mtarraylist_buffers(jl_ptls_t ptls) JL_NOTSAFEPOINT { // initialize ptls index for parallel sweeping of stack pools assert(gc_n_threads); @@ -3087,7 +3167,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) current_sweep_full = sweep_full; sweep_weak_refs(); uint64_t stack_pool_time = jl_hrtime(); - sweep_stack_pools_and_mtarraylist_buffers(ptls); + jl_gc_sweep_stack_pools_and_mtarraylist_buffers(ptls); stack_pool_time = jl_hrtime() - stack_pool_time; gc_num.total_stack_pool_sweep_time += stack_pool_time; gc_num.stack_pool_sweep_time = stack_pool_time; @@ -3645,61 +3725,6 @@ JL_DLLEXPORT uint64_t jl_gc_get_max_memory(void) // allocation wrappers that add to gc pressure -JL_DLLEXPORT void *jl_malloc(size_t sz) -{ - return jl_gc_counted_malloc(sz); -} - -//_unchecked_calloc does not check for potential overflow of nm*sz -STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) { - size_t nmsz = nm*sz; - return jl_gc_counted_calloc(nmsz, 1); -} - -JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz) -{ - if (nm > SSIZE_MAX/sz) - return NULL; - return _unchecked_calloc(nm, sz); -} - -JL_DLLEXPORT void jl_free(void *p) -{ - if (p != NULL) { - size_t sz = memory_block_usable_size(p, 0); - free(p); - jl_task_t *ct = jl_get_current_task(); - if (ct != NULL) - jl_batch_accum_free_size(ct->ptls, sz); - } -} - -JL_DLLEXPORT void *jl_realloc(void *p, size_t sz) -{ - size_t old = p ? memory_block_usable_size(p, 0) : 0; - void *data = realloc(p, sz); - jl_task_t *ct = jl_get_current_task(); - if (data != NULL && ct != NULL) { - sz = memory_block_usable_size(data, 0); - jl_ptls_t ptls = ct->ptls; - maybe_collect(ptls); - if (!(sz < old)) - jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + (sz - old)); - jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.realloc, - jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.realloc) + 1); - - int64_t diff = sz - old; - if (diff < 0) { - jl_batch_accum_free_size(ptls, -diff); - } - else { - jl_batch_accum_heap_size(ptls, diff); - } - } - return data; -} - JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) { void *data = malloc(sz); @@ -3736,12 +3761,34 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) { - return jl_free(p); + free(p); + jl_task_t *ct = jl_get_current_task(); + if (ct != NULL) + jl_batch_accum_free_size(ct->ptls, sz); } JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz) { - return jl_realloc(p, sz); + void *data = realloc(p, sz); + jl_task_t *ct = jl_get_current_task(); + if (data != NULL && ct != NULL) { + sz = memory_block_usable_size(data, 0); + jl_ptls_t ptls = ct->ptls; + maybe_collect(ptls); + if (!(sz < old)) + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + (sz - old)); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.realloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.realloc) + 1); + int64_t diff = sz - old; + if (diff < 0) { + jl_batch_accum_free_size(ptls, -diff); + } + else { + jl_batch_accum_heap_size(ptls, diff); + } + } + return data; } // allocating blocks for Arrays and Strings @@ -4005,12 +4052,20 @@ JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) return sizeof(bigval_t); } - JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) { arraylist_push(&ptls->gc_tls.sweep_objs, obj); } +void jl_gc_notify_image_load(const char* img_data, size_t len) +{ + // Do nothing +} + +JL_DLLEXPORT const char* jl_gc_active_impl(void) { + return "Built with stock GC"; +} + #ifdef __cplusplus } #endif diff --git a/src/gc-stock.h b/src/gc-stock.h index b9a2e720f120a..d478ee1366da0 100644 --- a/src/gc-stock.h +++ b/src/gc-stock.h @@ -5,7 +5,6 @@ . non-moving, precise mark and sweep collector . pool-allocates small objects, keeps big objects on a simple list */ - #ifndef JL_GC_H #define JL_GC_H @@ -20,6 +19,7 @@ #include "julia_internal.h" #include "julia_assert.h" #include "threading.h" +#include "gc-common.h" #ifdef __cplusplus extern "C" { @@ -85,27 +85,6 @@ typedef struct _jl_gc_chunk_t { extern uintptr_t gc_bigval_sentinel_tag; -JL_EXTENSION typedef struct _bigval_t { - struct _bigval_t *next; - struct _bigval_t *prev; - size_t sz; -#ifdef _P64 // Add padding so that the value is 64-byte aligned - // (8 pointers of 8 bytes each) - (4 other pointers in struct) - void *_padding[8 - 4]; -#else - // (16 pointers of 4 bytes each) - (4 other pointers in struct) - void *_padding[16 - 4]; -#endif - //struct jl_taggedvalue_t <>; - union { - uintptr_t header; - struct { - uintptr_t gc:2; - } bits; - }; - // must be 64-byte aligned here, in 32 & 64 bit modes -} bigval_t; - // pool page metadata typedef struct _jl_gc_pagemeta_t { // next metadata structure in per-thread list @@ -519,9 +498,6 @@ extern uv_cond_t gc_threads_cond; extern uv_sem_t gc_sweep_assists_needed; extern _Atomic(int) gc_n_threads_marking; extern _Atomic(int) gc_n_threads_sweeping_pools; -extern _Atomic(int) gc_n_threads_sweeping_stacks; -extern _Atomic(int) gc_ptls_sweep_idx; -extern _Atomic(int) gc_stack_free_idx; extern _Atomic(int) n_threads_running; extern uv_barrier_t thread_init_done; void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq); diff --git a/src/gc-tls-mmtk.h b/src/gc-tls-mmtk.h new file mode 100644 index 0000000000000..5b69aef5d55fb --- /dev/null +++ b/src/gc-tls-mmtk.h @@ -0,0 +1,23 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +#ifndef JL_GC_TLS_H +#define JL_GC_TLS_H + +#include +#include "mmtkMutator.h" +#include "julia_atomics.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + MMTkMutatorContext mmtk_mutator; + _Atomic(size_t) malloc_sz_since_last_poll; +} jl_gc_tls_states_t; + +#ifdef __cplusplus +} +#endif + +#endif // JL_GC_TLS_H diff --git a/src/gc-tls.h b/src/gc-tls-stock.h similarity index 100% rename from src/gc-tls.h rename to src/gc-tls-stock.h diff --git a/src/julia_internal.h b/src/julia_internal.h index e0e8158f8e0d9..8d241502287eb 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -226,7 +226,7 @@ extern volatile int profile_all_tasks; // Ensures that we can safely read the `live_tasks`field of every TLS when profiling. // We want to avoid the case that a GC gets interleaved with `jl_profile_task` and shrinks // the `live_tasks` array while we are reading it or frees tasks that are being profiled. -// Because of that, this lock must be held in `jl_profile_task` and `sweep_stack_pools_and_mtarraylist_buffers`. +// Because of that, this lock must be held in `jl_profile_task` and `jl_gc_sweep_stack_pools_and_mtarraylist_buffers`. extern uv_mutex_t live_tasks_lock; // Ensures that we can safely write to `profile_bt_data_prof` and `profile_bt_size_cur`. // We want to avoid the case that: diff --git a/src/julia_locks.h b/src/julia_locks.h index 35bcf7dd97322..92d67b34b1692 100644 --- a/src/julia_locks.h +++ b/src/julia_locks.h @@ -116,7 +116,7 @@ class jl_unique_gcsafe_lock { { jl_task_t *ct = jl_current_task; gc_state = jl_gc_safe_enter(ct->ptls); // contains jl_gc_safepoint after enter - this->native = std::unique_lock(native); + this->native = std::unique_lock(native); ct->ptls->engine_nqueued++; // disables finalizers until inference is finished on this method graph } jl_unique_gcsafe_lock(jl_unique_gcsafe_lock &&native) = delete; diff --git a/src/julia_threads.h b/src/julia_threads.h index faa8ab9e0aaf4..b6ef65dc7fe52 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -4,7 +4,11 @@ #ifndef JL_THREADS_H #define JL_THREADS_H -#include "gc-tls.h" +#ifndef MMTK_GC +#include "gc-tls-stock.h" +#else +#include "gc-tls-mmtk.h" +#endif #include "gc-tls-common.h" #include "julia_atomics.h" #ifndef _OS_WINDOWS_ diff --git a/src/llvm-gc-interface-passes.h b/src/llvm-gc-interface-passes.h index 278987858eab7..7b2a4bb033203 100644 --- a/src/llvm-gc-interface-passes.h +++ b/src/llvm-gc-interface-passes.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -328,6 +329,7 @@ struct LateLowerGCFrame: private JuliaPassContext { private: CallInst *pgcstack; + Function *smallAllocFunc; void MaybeNoteDef(State &S, BBState &BBS, Value *Def, const ArrayRef &SafepointsSoFar, SmallVector &&RefinedPtr = SmallVector()); @@ -366,6 +368,7 @@ struct LateLowerGCFrame: private JuliaPassContext { void RefineLiveSet(LargeSparseBitVector &LS, State &S, ArrayRef CalleeRoots); Value *EmitTagPtr(IRBuilder<> &builder, Type *T, Type *T_size, Value *V); Value *EmitLoadTag(IRBuilder<> &builder, Type *T_size, Value *V); + Value* lowerGCAllocBytesLate(CallInst *target, Function &F); }; // The final GC lowering pass. This pass lowers platform-agnostic GC diff --git a/src/llvm-late-gc-lowering-mmtk.cpp b/src/llvm-late-gc-lowering-mmtk.cpp new file mode 100644 index 0000000000000..5539c8dbcf153 --- /dev/null +++ b/src/llvm-late-gc-lowering-mmtk.cpp @@ -0,0 +1,96 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +#include "llvm-gc-interface-passes.h" + +Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F) +{ + assert(target->arg_size() == 3); + + IRBuilder<> builder(target); + auto ptls = target->getArgOperand(0); + auto type = target->getArgOperand(2); + if (auto CI = dyn_cast(target->getArgOperand(1))) { + size_t sz = (size_t)CI->getZExtValue(); + // This is strongly architecture and OS dependent + int osize; + int offset = jl_gc_classify_pools(sz, &osize); + if (offset >= 0) { + // In this case instead of lowering julia.gc_alloc_bytes to jl_gc_small_alloc + // We do a slowpath/fastpath check and lower it only on the slowpath, returning + // the cursor and updating it in the fastpath. + auto pool_osize_i32 = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize); + auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize); + + // Should we generate fastpath allocation sequence here? We should always generate fastpath here for MMTk. + // Setting this to false will increase allocation overhead a lot, and should only be used for debugging. + const bool INLINE_FASTPATH_ALLOCATION = true; + + if (INLINE_FASTPATH_ALLOCATION) { + // Assuming we use the first immix allocator. + // FIXME: We should get the allocator index and type from MMTk. + auto allocator_offset = offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix); + + auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor)); + auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit)); + + auto cursor_ptr = builder.CreateInBoundsGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos); + auto cursor = builder.CreateAlignedLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, Align(sizeof(void *)), "cursor"); + + // offset = 8 + auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8)); + auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor); + auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor); + // alignment 16 (15 = 16 - 1) + auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta"); + auto result = builder.CreateNSWAdd(cursor, delta, "result"); + + auto new_cursor = builder.CreateNSWAdd(result, pool_osize); + + auto limit_ptr = builder.CreateInBoundsGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos); + auto limit = builder.CreateAlignedLoad(Type::getInt64Ty(target->getContext()), limit_ptr, Align(sizeof(void *)), "limit"); + + auto gt_limit = builder.CreateICmpSGT(new_cursor, limit); + + auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction()); + auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction()); + + auto next_instr = target->getNextNode(); + SmallVector Weights{1, 9}; + + MDBuilder MDB(F.getContext()); + SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, MDB.createBranchWeights(Weights)); + + builder.SetInsertPoint(next_instr); + auto phiNode = builder.CreatePHI(target->getCalledFunction()->getReturnType(), 2, "phi_fast_slow"); + + // slowpath + builder.SetInsertPoint(slowpath); + auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1); + auto new_call = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize_i32, type }); + new_call->setAttributes(new_call->getCalledFunction()->getAttributes()); + builder.CreateBr(next_instr->getParent()); + + // fastpath + builder.SetInsertPoint(fastpath); + builder.CreateStore(new_cursor, cursor_ptr); + + // ptls->gc_tls.gc_num.allocd += osize; + auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls_common) + offsetof(jl_gc_tls_states_common_t, gc_num)); + auto pool_alloc_tls = builder.CreateInBoundsGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos); + auto pool_allocd = builder.CreateAlignedLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls, Align(sizeof(void *))); + auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize); + builder.CreateStore(pool_allocd_total, pool_alloc_tls); + + auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t))); + auto v_as_ptr = builder.CreateIntToPtr(v_raw, smallAllocFunc->getReturnType()); + builder.CreateBr(next_instr->getParent()); + + phiNode->addIncoming(new_call, slowpath); + phiNode->addIncoming(v_as_ptr, fastpath); + phiNode->takeName(target); + return phiNode; + } + } + } + return target; +} diff --git a/src/llvm-late-gc-lowering-stock.cpp b/src/llvm-late-gc-lowering-stock.cpp new file mode 100644 index 0000000000000..2a11487773396 --- /dev/null +++ b/src/llvm-late-gc-lowering-stock.cpp @@ -0,0 +1,9 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +#include "llvm-gc-interface-passes.h" + +Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F) +{ + // Do nothing for the stock GC + return target; +} diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index ff2cac6e49406..7d6fba65a79e7 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2446,6 +2446,7 @@ void LateLowerGCFrame::PlaceRootsAndUpdateCalls(ArrayRef Colors, int PreAss bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) { initAll(*F.getParent()); + smallAllocFunc = getOrDeclare(jl_well_known::GCSmallAlloc); LLVM_DEBUG(dbgs() << "GC ROOT PLACEMENT: Processing function " << F.getName() << "\n"); if (!pgcstack_getter && !adoptthread_func) return CleanupIR(F, nullptr, CFGModified); @@ -2460,6 +2461,31 @@ bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) { std::map> CallFrames; // = OptimizeCallFrames(S, Ordering); PlaceRootsAndUpdateCalls(Colors.first, Colors.second, S, CallFrames); CleanupIR(F, &S, CFGModified); + + + // We lower the julia.gc_alloc_bytes intrinsic in this pass to insert slowpath/fastpath blocks for MMTk + // For now, we do nothing for the Stock GC + auto GCAllocBytes = getOrNull(jl_intrinsics::GCAllocBytes); + + if (GCAllocBytes) { + for (auto it = GCAllocBytes->user_begin(); it != GCAllocBytes->user_end(); ) { + if (auto *CI = dyn_cast(*it)) { + *CFGModified = true; + + assert(CI->getCalledOperand() == GCAllocBytes); + + auto newI = lowerGCAllocBytesLate(CI, F); + if (newI != CI) { + ++it; + CI->replaceAllUsesWith(newI); + CI->eraseFromParent(); + continue; + } + } + ++it; + } + } + return true; } diff --git a/src/staticdata.c b/src/staticdata.c index a0d56f3e38f54..05ad0565f8ac3 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -658,6 +658,7 @@ static void jl_load_sysimg_so(void) plen = (size_t *)&jl_system_image_size; else jl_dlsym(jl_sysimg_handle, "jl_system_image_size", (void **)&plen, 1); + jl_gc_notify_image_load(sysimg_data, *plen); jl_restore_system_image_data(sysimg_data, *plen); } @@ -4225,6 +4226,7 @@ JL_DLLEXPORT jl_value_t *jl_restore_package_image_from_file(const char *fname, j jl_dlsym(pkgimg_handle, "jl_system_image_data", (void **)&pkgimg_data, 1); size_t *plen; jl_dlsym(pkgimg_handle, "jl_system_image_size", (void **)&plen, 1); + jl_gc_notify_image_load(pkgimg_data, *plen); jl_image_t pkgimage = jl_init_processor_pkgimg(pkgimg_handle); diff --git a/src/threading.c b/src/threading.c index ac9cc276d613a..77956786af3f4 100644 --- a/src/threading.c +++ b/src/threading.c @@ -773,6 +773,10 @@ void jl_init_threading(void) } int16_t ngcthreads = jl_n_markthreads + jl_n_sweepthreads; + if (strstr(jl_gc_active_impl(), "MMTk")) { + ngcthreads = 0; + } + jl_all_tls_states_size = nthreads + nthreadsi + ngcthreads; jl_n_threads_per_pool = (int*)malloc_s(2 * sizeof(int)); jl_n_threads_per_pool[0] = nthreadsi; diff --git a/stdlib/InteractiveUtils/src/InteractiveUtils.jl b/stdlib/InteractiveUtils/src/InteractiveUtils.jl index aa13fa3cdd31d..4a320282610cd 100644 --- a/stdlib/InteractiveUtils/src/InteractiveUtils.jl +++ b/stdlib/InteractiveUtils/src/InteractiveUtils.jl @@ -166,6 +166,7 @@ function versioninfo(io::IO=stdout; verbose::Bool=false) end println(io, " WORD_SIZE: ", Sys.WORD_SIZE) println(io, " LLVM: libLLVM-",Base.libllvm_version," (", Sys.JIT, ", ", Sys.CPU_NAME, ")") + println(io, " GC: ", unsafe_string(ccall(:jl_gc_active_impl, Ptr{UInt8}, ()))) println(io, """Threads: $(Threads.nthreads(:default)) default, $(Threads.nthreads(:interactive)) interactive, \ $(Threads.ngcthreads()) GC (on $(Sys.CPU_THREADS) virtual cores)""")