diff --git a/base/util.jl b/base/util.jl index 82a4396c581bc..37421ec255b94 100644 --- a/base/util.jl +++ b/base/util.jl @@ -7,7 +7,7 @@ # high-resolution relative time, in nanoseconds time_ns() = ccall(:jl_hrtime, UInt64, ()) -# This type must be kept in sync with the C struct in src/gc.c +# This type must be kept in sync with the C struct in src/gc.h immutable GC_Num allocd ::Int64 # GC internal freed ::Int64 # GC internal @@ -40,7 +40,7 @@ immutable GC_Diff end function GC_Diff(new::GC_Num, old::GC_Num) - # logic from gc.c:jl_gc_total_bytes + # logic from `src/gc.c:jl_gc_total_bytes` old_allocd = old.allocd + Int64(old.collect) + Int64(old.total_allocd) new_allocd = new.allocd + Int64(new.collect) + Int64(new.total_allocd) return GC_Diff(new_allocd - old_allocd, diff --git a/src/Makefile b/src/Makefile index 28e4022716686..1ccbee2c1afa3 100644 --- a/src/Makefile +++ b/src/Makefile @@ -28,7 +28,7 @@ SRCS := \ jltypes gf typemap ast builtins module interpreter \ alloc dlload sys init task array dump toplevel jl_uv jlapi signal-handling \ simplevector APInt-C runtime_intrinsics runtime_ccall \ - threadgroup threading stackwalk gc safepoint + threadgroup threading stackwalk gc gc-debug gc-pages safepoint ifeq ($(JULIACODEGEN),LLVM) SRCS += codegen disasm debuginfo llvm-simdloop llvm-gcroot @@ -117,7 +117,9 @@ $(BUILDDIR)/anticodegen.o $(BUILDDIR)/anticodegen.dbg.obj: $(SRCDIR)/intrinsics. $(BUILDDIR)/debuginfo.o $(BUILDDIR)/debuginfo.dbg.obj: $(SRCDIR)/codegen_internal.h $(BUILDDIR)/disasm.o $(BUILDDIR)/disasm.dbg.obj: $(SRCDIR)/codegen_internal.h $(BUILDDIR)/builtins.o $(BUILDDIR)/builtins.dbg.obj: $(SRCDIR)/table.c -$(BUILDDIR)/gc.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc-debug.c +$(BUILDDIR)/gc.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc.h +$(BUILDDIR)/gc-debug.o $(BUILDDIR)/gc-debug.dbg.obj: $(SRCDIR)/gc.h +$(BUILDDIR)/gc-pages.o $(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc.h $(BUILDDIR)/signal-handling.o $(BUILDDIR)/signal-handling.dbg.obj: $(addprefix $(SRCDIR)/,signals-*.c) $(BUILDDIR)/dump.o $(BUILDDIR)/dump.dbg.obj: $(addprefix $(SRCDIR)/,common_symbols1.inc common_symbols2.inc) $(addprefix $(BUILDDIR)/,threading.o threading.dbg.obj gc.o gc.dbg.obj init.c init.dbg.obj task.o task.dbg.obj): $(addprefix $(SRCDIR)/,threading.h threadgroup.h) diff --git a/src/gc-debug.c b/src/gc-debug.c index c382048f37a8b..fd3bcf4324c9d 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -1,7 +1,15 @@ // This file is a part of Julia. 
License is MIT: http://julialang.org/license
 
+#include "gc.h"
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 // Useful function in debugger to find page/region metadata
-gcpage_t *jl_gc_page_metadata(void *data)
+jl_gc_pagemeta_t *jl_gc_page_metadata(void *data)
 {
     return page_metadata(data);
 }
@@ -22,42 +30,37 @@ JL_DLLEXPORT jl_taggedvalue_t *jl_gc_find_taggedvalue_pool(char *p, size_t *osize_p)
     // Not in the pool
     if (!r)
         return NULL;
-    char *page_begin = GC_PAGE_DATA(p) + GC_PAGE_OFFSET;
+    char *page_begin = gc_page_data(p) + GC_PAGE_OFFSET;
     // In the page header
     if (p < page_begin)
         return NULL;
     size_t ofs = p - page_begin;
-    int pg_idx = PAGE_INDEX(r, p);
+    int pg_idx = page_index(r, page_begin);
     // Check if this is a free page
-    if (r->freemap[pg_idx / 32] & (uint32_t)(1 << (pg_idx % 32)))
+    if (!(r->allocmap[pg_idx / 32] & (uint32_t)(1 << (pg_idx % 32))))
         return NULL;
-    gcpage_t *pagemeta = &r->meta[pg_idx];
+    jl_gc_pagemeta_t *pagemeta = &r->meta[pg_idx];
     int osize = pagemeta->osize;
     // Shouldn't be needed, just in case
     if (osize == 0)
         return NULL;
     char *tag = (char*)p - ofs % osize;
     // Points to an "object" that gets into the next page
-    if (tag + osize > GC_PAGE_DATA(p) + GC_PAGE_SZ)
+    if (tag + osize > gc_page_data(p) + GC_PAGE_SZ)
         return NULL;
     if (osize_p)
         *osize_p = osize;
     return (jl_taggedvalue_t*)tag;
 }
 
-#ifdef GC_DEBUG_ENV
-#include <inttypes.h>
-#include <stdio.h>
-#endif
-
 // mark verification
 #ifdef GC_VERIFY
-static jl_value_t *lostval = 0;
+jl_value_t *lostval = NULL;
 static arraylist_t lostval_parents;
 static arraylist_t lostval_parents_done;
-static int verifying;
+int gc_verifying;
 
-static void add_lostval_parent(jl_value_t *parent)
+void add_lostval_parent(jl_value_t *parent)
 {
     for(int i = 0; i < lostval_parents_done.len; i++) {
         if ((jl_value_t*)lostval_parents_done.items[i] == parent)
@@ -70,35 +73,6 @@ static void add_lostval_parent(jl_value_t *parent)
     arraylist_push(&lostval_parents, parent);
 }
 
-#define verify_val(v) do { \
-        if (lostval == (jl_value_t*)(v) && (v) != 0) { \
-            jl_printf(JL_STDOUT, \
-                      "Found lostval %p at %s:%d oftype: ", \
-                      (void*)(lostval), __FILE__, __LINE__); \
-            jl_static_show(JL_STDOUT, jl_typeof(v)); \
-            jl_printf(JL_STDOUT, "\n"); \
-        } \
-    } while(0);
-
-
-#define verify_parent(ty, obj, slot, args...) do { \
-        if (*(jl_value_t**)(slot) == lostval && \
-            (jl_value_t*)(obj) != lostval) { \
-            jl_printf(JL_STDOUT, "Found parent %p %p at %s:%d\n", \
-                      (void*)(ty), (void*)(obj), __FILE__, __LINE__); \
-            jl_printf(JL_STDOUT, "\tloc %p : ", (void*)(slot)); \
-            jl_printf(JL_STDOUT, args); \
-            jl_printf(JL_STDOUT, "\n"); \
-            jl_printf(JL_STDOUT, "\ttype: "); \
-            jl_static_show(JL_STDOUT, jl_typeof(obj)); \
-            jl_printf(JL_STDOUT, "\n"); \
-            add_lostval_parent((jl_value_t*)(obj)); \
-        } \
-    } while(0);
-
-#define verify_parent1(ty,obj,slot,arg1) verify_parent(ty,obj,slot,arg1)
-#define verify_parent2(ty,obj,slot,arg1,arg2) verify_parent(ty,obj,slot,arg1,arg2)
-
 /*
   How to debug a missing write barrier :
   (or rather how I do it, if you know of a better way update this)
@@ -131,17 +105,17 @@ static arraylist_t bits_save[4];
 static void clear_mark(int bits)
 {
     gcval_t *pv;
-    if (!verifying) {
+    if (!gc_verifying) {
         for (int i = 0; i < 4; i++) {
             bits_save[i].len = 0;
         }
     }
     bigval_t *v;
-    FOR_EACH_HEAP () {
-        v = big_objects;
+    for (int i = 0;i < jl_n_threads;i++) {
+        v = jl_all_task_states[i].ptls->heap.big_objects;
         while (v != NULL) {
             void *gcv = &v->header;
-            if (!verifying) arraylist_push(&bits_save[gc_bits(gcv)], gcv);
+            if (!gc_verifying) arraylist_push(&bits_save[gc_bits(gcv)], gcv);
             gc_bits(gcv) = bits;
             v = v->next;
         }
@@ -150,27 +124,28 @@ static void clear_mark(int bits)
     v = big_objects_marked;
     while (v != NULL) {
         void *gcv = &v->header;
-        if (!verifying) arraylist_push(&bits_save[gc_bits(gcv)], gcv);
+        if (!gc_verifying) arraylist_push(&bits_save[gc_bits(gcv)], gcv);
         gc_bits(gcv) = bits;
         v = v->next;
     }
     for (int h = 0; h < REGION_COUNT; h++) {
-        region_t *region = regions[h];
-        if (!region) break;
-        for (int pg_i = 0; pg_i < REGION_PG_COUNT/32; pg_i++) {
-            uint32_t line = region->freemap[pg_i];
-            if (!!~line) {
+        region_t *region = &regions[h];
+        if (!region->pages)
+            break;
+        for (int pg_i = 0; pg_i < region->pg_cnt / 32; pg_i++) {
+            uint32_t line = region->allocmap[pg_i];
+            if (line) {
                 for (int j = 0; j < 32; j++) {
-                    if (!((line >> j) & 1)) {
-                        gcpage_t *pg = page_metadata(&region->pages[pg_i*32 + j][0] + GC_PAGE_OFFSET);
-                        pool_t *pool;
-                        FOR_HEAP (pg->thread_n)
-                            pool = &pools[pg->pool_n];
+                    if ((line >> j) & 1) {
+                        jl_gc_pagemeta_t *pg = page_metadata(region->pages[pg_i*32 + j].data + GC_PAGE_OFFSET);
+                        jl_tls_states_t *ptls =
+                            jl_all_task_states[pg->thread_n].ptls;
+                        jl_gc_pool_t *pool = &ptls->heap.norm_pools[pg->pool_n];
                         pv = (gcval_t*)(pg->data + GC_PAGE_OFFSET);
                         char *lim = (char*)pv + GC_PAGE_SZ - GC_PAGE_OFFSET - pool->osize;
                         while ((char*)pv <= lim) {
-                            if (!verifying) arraylist_push(&bits_save[gc_bits(pv)], pv);
+                            if (!gc_verifying) arraylist_push(&bits_save[gc_bits(pv)], pv);
                             gc_bits(pv) = bits;
                             pv = (gcval_t*)((char*)pv + pool->osize);
                         }
@@ -230,13 +205,13 @@ static void gc_verify_track(void)
     } while(lostval != NULL);
 }
 
-static void gc_verify(void)
+void gc_verify(void)
 {
     lostval = NULL;
     lostval_parents.len = 0;
     lostval_parents_done.len = 0;
     clear_mark(GC_CLEAN);
-    verifying = 1;
+    gc_verifying = 1;
     pre_mark();
     post_mark(&finalizer_list, 1);
     post_mark(&finalizer_list_marked, 1);
@@ -254,7 +229,7 @@
         }
     }
     if (lostval == NULL) {
-        verifying = 0;
+        gc_verifying = 0;
         restore();  // we did not miss anything
         return;
     }
@@ -264,34 +239,9 @@
     gc_debug_critical_error();
     abort();
 }
-
-#else
-#define gc_verify()
-#define verify_val(v)
-#define verify_parent1(ty,obj,slot,arg1)
-#define verify_parent2(ty,obj,slot,arg1,arg2)
 #endif
 
 #ifdef GC_DEBUG_ENV
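The hunks above also flip the sense of the per-region page bitmap: the old `freemap` set a bit for every *free* page, while the new `allocmap` sets a bit for every *allocated* page, which is why tests like `!((line >> j) & 1)` become `(line >> j) & 1`. A minimal standalone sketch of the two-level lookup both versions rely on (hypothetical helper name, not part of the patch):

    #include <stdint.h>

    // One uint32_t word covers 32 pages; page pg_idx lives at
    // bit (pg_idx % 32) of word (pg_idx / 32).
    static int page_is_allocated(const uint32_t *allocmap, int pg_idx)
    {
        return (allocmap[pg_idx / 32] >> (pg_idx % 32)) & 1u;
    }
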
- -typedef struct { - uint64_t num; - uint64_t next; - - uint64_t min; - uint64_t interv; - uint64_t max; - unsigned short random[3]; -} jl_alloc_num_t; - -typedef struct { - int sweep_mask; - int wait_for_debugger; - jl_alloc_num_t pool; - jl_alloc_num_t other; - jl_alloc_num_t print; -} jl_gc_debug_env_t; - JL_DLLEXPORT jl_gc_debug_env_t jl_gc_debug_env = { GC_MARKED_NOESC, 0, @@ -299,6 +249,7 @@ JL_DLLEXPORT jl_gc_debug_env_t jl_gc_debug_env = { {0, UINT64_MAX, 0, 0, 0, {0, 0, 0}}, {0, UINT64_MAX, 0, 0, 0, {0, 0, 0}} }; +static char *gc_stack_lo; static void gc_debug_alloc_setnext(jl_alloc_num_t *num) { @@ -349,26 +300,12 @@ static int gc_debug_alloc_check(jl_alloc_num_t *num) return 1; } -static char *gc_stack_lo; -static void gc_debug_init(void) -{ - gc_stack_lo = (char*)gc_get_stack_ptr(); - char *env = getenv("JULIA_GC_NO_GENERATIONAL"); - if (env && strcmp(env, "0") != 0) - jl_gc_debug_env.sweep_mask = GC_MARKED; - env = getenv("JULIA_GC_WAIT_FOR_DEBUGGER"); - jl_gc_debug_env.wait_for_debugger = env && strcmp(env, "0") != 0; - gc_debug_alloc_init(&jl_gc_debug_env.pool, "POOL"); - gc_debug_alloc_init(&jl_gc_debug_env.other, "OTHER"); - gc_debug_alloc_init(&jl_gc_debug_env.print, "PRINT"); -} - -static inline int gc_debug_check_pool(void) +int gc_debug_check_pool(void) { return gc_debug_alloc_check(&jl_gc_debug_env.pool); } -static inline int gc_debug_check_other(void) +int gc_debug_check_other(void) { return gc_debug_alloc_check(&jl_gc_debug_env.other); } @@ -379,7 +316,7 @@ void gc_debug_print_status(void) uint64_t other_count = jl_gc_debug_env.other.num; jl_safe_printf("Allocations: %" PRIu64 " " "(Pool: %" PRIu64 "; Other: %" PRIu64 "); GC: %d\n", - pool_count + other_count, pool_count, other_count, n_pause); + pool_count + other_count, pool_count, other_count, gc_num.pause); } void gc_debug_critical_error(void) @@ -393,7 +330,7 @@ void gc_debug_critical_error(void) } } -static inline void gc_debug_print(void) +void gc_debug_print(void) { if (!gc_debug_alloc_check(&jl_gc_debug_env.print)) return; @@ -410,12 +347,12 @@ static void gc_scrub_range(char *stack_lo, char *stack_hi) jl_taggedvalue_t *tag = jl_gc_find_taggedvalue_pool(p, &osize); if (osize <= sizeof_jl_taggedvalue_t || !tag || gc_marked(tag)) continue; - gcpage_t *pg = page_metadata(tag); + jl_gc_pagemeta_t *pg = page_metadata(tag); // Make sure the sweep rebuild the freelist pg->allocd = 1; pg->gc_bits = 0x3; // Find the age bit - char *page_begin = GC_PAGE_DATA(tag) + GC_PAGE_OFFSET; + char *page_begin = gc_page_data(tag) + GC_PAGE_OFFSET; int obj_id = (((char*)tag) - page_begin) / osize; uint8_t *ages = pg->ages + obj_id / 8; // Force this to be a young object to save some memory @@ -426,21 +363,13 @@ static void gc_scrub_range(char *stack_lo, char *stack_hi) } } -static void gc_scrub(char *stack_hi) +void gc_scrub(char *stack_hi) { gc_scrub_range(gc_stack_lo, stack_hi); } - #else - -static inline int gc_debug_check_other(void) -{ - return 0; -} - -static inline int gc_debug_check_pool(void) +void gc_debug_critical_error(void) { - return 0; } void gc_debug_print_status(void) @@ -450,36 +379,16 @@ void gc_debug_print_status(void) uint64_t big_count = gc_num.bigalloc; jl_safe_printf("Allocations: %" PRIu64 " " "(Pool: %" PRIu64 "; Big: %" PRIu64 "); GC: %d\n", - pool_count + big_count, pool_count, big_count, n_pause); -} - -void gc_debug_critical_error(void) -{ -} - -static inline void gc_debug_print(void) -{ + pool_count + big_count, pool_count, big_count, gc_num.pause); } - -static inline void gc_debug_init(void) -{ 
-}
-
-static void gc_scrub(char *stack_hi)
-{
-    (void)stack_hi;
-}
-
 #endif
 
 #ifdef OBJPROFILE
 static htable_t obj_counts[3];
 static htable_t obj_sizes[3];
-static inline void objprofile_count(void *ty, int old, int sz)
+void objprofile_count(void *ty, int old, int sz)
 {
-#ifdef GC_VERIFY
-    if (verifying) return;
-#endif
+    if (gc_verifying) return;
     if ((intptr_t)ty <= 0x10) {
         ty = (void*)jl_buff_tag;
     }
@@ -500,7 +409,7 @@ static inline void objprofile_count(void *ty, int old, int sz)
     *((intptr_t*)bp) += sz;
 }
 
-static void objprofile_reset(void)
+void objprofile_reset(void)
 {
     for(int g=0; g < 3; g++) {
         htable_reset(&obj_counts[g], 0);
@@ -548,7 +457,7 @@ static void objprofile_print(htable_t nums, htable_t sizes)
     }
 }
 
-static void objprofile_printall(void)
+void objprofile_printall(void)
 {
     jl_printf(JL_STDERR, "Transient mark :\n");
     objprofile_print(obj_counts[0], obj_sizes[0]);
@@ -557,28 +466,37 @@
     jl_printf(JL_STDERR, "Remset :\n");
     objprofile_print(obj_counts[2], obj_sizes[2]);
 }
+#endif
 
-static void objprofile_init(void)
+void gc_debug_init(void)
 {
+#ifdef GC_DEBUG_ENV
+    gc_stack_lo = (char*)gc_get_stack_ptr();
+    char *env = getenv("JULIA_GC_NO_GENERATIONAL");
+    if (env && strcmp(env, "0") != 0)
+        jl_gc_debug_env.sweep_mask = GC_MARKED;
+    env = getenv("JULIA_GC_WAIT_FOR_DEBUGGER");
+    jl_gc_debug_env.wait_for_debugger = env && strcmp(env, "0") != 0;
+    gc_debug_alloc_init(&jl_gc_debug_env.pool, "POOL");
+    gc_debug_alloc_init(&jl_gc_debug_env.other, "OTHER");
+    gc_debug_alloc_init(&jl_gc_debug_env.print, "PRINT");
+#endif
+
+#ifdef GC_VERIFY
+    for (int i = 0; i < 4; i++)
+        arraylist_new(&bits_save[i], 0);
+    arraylist_new(&lostval_parents, 0);
+    arraylist_new(&lostval_parents_done, 0);
+#endif
+
+#ifdef OBJPROFILE
     for (int g = 0;g < 3;g++) {
         htable_new(&obj_counts[g], 0);
         htable_new(&obj_sizes[g], 0);
     }
-}
-#else
-static inline void objprofile_count(void *ty, int old, int sz)
-{
-}
-
-static inline void objprofile_printall(void)
-{
-}
-
-static inline void objprofile_reset(void)
-{
+#endif
 }
 
-static void objprofile_init(void)
-{
+#ifdef __cplusplus
 }
 #endif
diff --git a/src/gc-pages.c b/src/gc-pages.c
new file mode 100644
index 0000000000000..f3b301c6a36cb
--- /dev/null
+++ b/src/gc-pages.c
@@ -0,0 +1,209 @@
+// This file is a part of Julia. License is MIT: http://julialang.org/license
+
+#include "gc.h"
+#ifndef _OS_WINDOWS_
+# include <sys/resource.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// A region is contiguous storage for up to DEFAULT_REGION_PG_COUNT naturally aligned GC_PAGE_SZ pages
+// It uses a very naive allocator (see jl_gc_alloc_page & jl_gc_free_page)
+#if defined(_P64)
+#define DEFAULT_REGION_PG_COUNT (16 * 8 * 4096) // 8 GB
+#else
+#define DEFAULT_REGION_PG_COUNT (8 * 4096) // 512 MB
+#endif
+#define MIN_REGION_PG_COUNT 64 // 1 MB
+
+static int region_pg_cnt = DEFAULT_REGION_PG_COUNT;
+static jl_mutex_t pagealloc_lock;
+static size_t current_pg_count = 0;
+
+void jl_gc_init_page(void)
+{
+#ifndef _OS_WINDOWS_
+    struct rlimit rl;
+    if (getrlimit(RLIMIT_AS, &rl) == 0) {
+        // This is not 100% precise and not the most efficient implementation
+        // but should be close enough and fast enough for the normal case.
+        while (rl.rlim_cur < region_pg_cnt * sizeof(jl_gc_page_t) * 2 &&
+               region_pg_cnt >= MIN_REGION_PG_COUNT) {
+            region_pg_cnt /= 2;
+        }
+    }
+#endif
+}
+
+// Try to allocate a memory block for a region with `pg_cnt` pages.
+// Return `NULL` if allocation failed. Result is aligned to `GC_PAGE_SZ`.
+static char *jl_gc_try_alloc_region(int pg_cnt)
+{
+    const size_t pages_sz = sizeof(jl_gc_page_t) * pg_cnt;
+    const size_t freemap_sz = sizeof(uint32_t) * pg_cnt / 32;
+    const size_t meta_sz = sizeof(jl_gc_pagemeta_t) * pg_cnt;
+    size_t alloc_size = pages_sz + freemap_sz + meta_sz;
+#ifdef _OS_WINDOWS_
+    char *mem = (char*)VirtualAlloc(NULL, alloc_size + GC_PAGE_SZ,
+                                    MEM_RESERVE, PAGE_READWRITE);
+    if (mem == NULL)
+        return NULL;
+#else
+    if (GC_PAGE_SZ > jl_page_size)
+        alloc_size += GC_PAGE_SZ;
+    char *mem = (char*)mmap(0, alloc_size, PROT_READ | PROT_WRITE,
+                            MAP_NORESERVE | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (mem == MAP_FAILED)
+        return NULL;
+#endif
+    if (GC_PAGE_SZ > jl_page_size) {
+        // round data pointer up to the nearest gc_page_data-aligned
+        // boundary if mmap didn't already do so.
+        mem = (char*)gc_page_data(mem + GC_PAGE_SZ - 1);
+    }
+    return mem;
+}
+
+// Allocate the memory for a `region_t`. Starts with `region_pg_cnt` number
+// of pages. Decreases the count 4x every time so that there is enough space
+// for a few more regions (or other allocations). The final page count is
+// recorded and will be used as the starting count next time. If the page
+// count is smaller than `MIN_REGION_PG_COUNT` a `jl_memory_exception` is
+// thrown. Assumes `pagealloc_lock` is acquired; the lock is released before
+// the exception is thrown.
+static void jl_gc_alloc_region(region_t *region)
+{
+    int pg_cnt = region_pg_cnt;
+    const size_t pages_sz = sizeof(jl_gc_page_t) * pg_cnt;
+    const size_t allocmap_sz = sizeof(uint32_t) * pg_cnt / 32;
+    char *mem = NULL;
+    while (1) {
+        if (__likely((mem = jl_gc_try_alloc_region(pg_cnt))))
+            break;
+        if (pg_cnt >= MIN_REGION_PG_COUNT * 4) {
+            pg_cnt /= 4;
+            region_pg_cnt = pg_cnt;
+        }
+        else if (pg_cnt > MIN_REGION_PG_COUNT) {
+            region_pg_cnt = pg_cnt = MIN_REGION_PG_COUNT;
+        }
+        else {
+            JL_UNLOCK_NOGC(&pagealloc_lock);
+            jl_throw(jl_memory_exception);
+        }
+    }
+    region->pages = (jl_gc_page_t*)mem;
+    region->allocmap = (uint32_t*)(mem + pages_sz);
+    region->meta = (jl_gc_pagemeta_t*)(mem + pages_sz + allocmap_sz);
+    region->lb = 0;
+    region->ub = 0;
+    region->pg_cnt = pg_cnt;
+#ifdef _OS_WINDOWS_
+    VirtualAlloc(region->allocmap, pg_cnt / 8, MEM_COMMIT, PAGE_READWRITE);
+    VirtualAlloc(region->meta, pg_cnt * sizeof(jl_gc_pagemeta_t),
+                 MEM_COMMIT, PAGE_READWRITE);
+#endif
+}
+
+NOINLINE void *jl_gc_alloc_page(void)
+{
+    int i;
+    region_t *region;
+    int region_i = 0;
+    JL_LOCK_NOGC(&pagealloc_lock);
+    while (region_i < REGION_COUNT) {
+        region = &regions[region_i];
+        if (region->pages == NULL)
+            jl_gc_alloc_region(region);
+        for (i = region->lb; i < region->pg_cnt / 32; i++) {
+            if (~region->allocmap[i])
+                break;
+        }
+        if (i == region->pg_cnt / 32) {
+            // region full
+            region_i++;
+            continue;
+        }
+        break;
+    }
+    if (__unlikely(region_i >= REGION_COUNT)) {
+        JL_UNLOCK_NOGC(&pagealloc_lock);
+        jl_throw(jl_memory_exception);
+    }
+    if (region->lb < i)
+        region->lb = i;
+    if (region->ub < i)
+        region->ub = i;
+
+#if defined(_COMPILER_MINGW_)
+    int j = __builtin_ffs(~region->allocmap[i]) - 1;
+#elif defined(_COMPILER_MICROSOFT_)
+    unsigned long j;
+    _BitScanForward(&j, ~region->allocmap[i]);
+#else
+    int j = ffs(~region->allocmap[i]) - 1;
+#endif
+
+    region->allocmap[i] |= (uint32_t)(1 << j);
+    void *ptr = region->pages[i * 32 + j].data;
+#ifdef _OS_WINDOWS_
+    VirtualAlloc(ptr, GC_PAGE_SZ, MEM_COMMIT, PAGE_READWRITE);
+#endif
+    current_pg_count++;
+#ifdef GC_FINAL_STATS
+    max_pg_count = max_pg_count < current_pg_count ?
+        current_pg_count : max_pg_count;
+#endif
+    JL_UNLOCK_NOGC(&pagealloc_lock);
+    return ptr;
+}
+
+void jl_gc_free_page(void *p)
+{
+    int pg_idx = -1;
+    int i;
+    region_t *region = regions;
+    for (i = 0; i < REGION_COUNT && regions[i].pages != NULL; i++) {
+        region = &regions[i];
+        pg_idx = page_index(region, p);
+        if (pg_idx >= 0 && pg_idx < region->pg_cnt) {
+            break;
+        }
+    }
+    assert(i < REGION_COUNT && region->pages != NULL);
+    uint32_t msk = (uint32_t)(1 << (pg_idx % 32));
+    assert(region->allocmap[pg_idx/32] & msk);
+    region->allocmap[pg_idx/32] ^= msk;
+    free(region->meta[pg_idx].ages);
+    // tell the OS we don't need these pages right now
+    size_t decommit_size = GC_PAGE_SZ;
+    if (GC_PAGE_SZ < jl_page_size) {
+        // ensure so we don't release more memory than intended
+        size_t n_pages = (GC_PAGE_SZ + jl_page_size - 1) / GC_PAGE_SZ;
+        decommit_size = jl_page_size;
+        p = (void*)((uintptr_t)region->pages[pg_idx].data & ~(jl_page_size - 1)); // round down to the nearest page
+        pg_idx = page_index(region, p);
+        if (pg_idx + n_pages > region->pg_cnt)
+            goto no_decommit;
+        for (; n_pages--; pg_idx++) {
+            msk = (uint32_t)(1 << ((pg_idx % 32)));
+            if (region->allocmap[pg_idx / 32] & msk) {
+                goto no_decommit;
+            }
+        }
+    }
+#ifdef _OS_WINDOWS_
+    VirtualFree(p, decommit_size, MEM_DECOMMIT);
+#else
+    madvise(p, decommit_size, MADV_DONTNEED);
+#endif
+no_decommit:
+    if (region->lb > pg_idx / 32)
+        region->lb = pg_idx / 32;
+    current_pg_count--;
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/gc.c b/src/gc.c
index a1d0851a959b4..e06069438cb05 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -1,38 +1,11 @@
 // This file is a part of Julia. License is MIT: http://julialang.org/license
 
-/*
-  allocation and garbage collection
-  . non-moving, precise mark and sweep collector
-  . pool-allocates small objects, keeps big objects on a simple list
-*/
-// use mmap instead of malloc to allocate pages. default = off.
-//#define USE_MMAP
-
-// free pages as soon as they are empty. if not defined, then we
-// will wait for the next GC, to allow the space to be reused more
-// efficiently. default = on.
-#include <stdlib.h>
-#include <string.h>
-#ifndef _MSC_VER
-#include <strings.h>
-#endif
-#include <assert.h>
-#include <inttypes.h>
-#include "julia.h"
-#include "julia_internal.h"
-#include "threading.h"
-#ifndef _OS_WINDOWS_
-#include <sys/mman.h>
-#if defined(_OS_DARWIN_) && !defined(MAP_ANONYMOUS)
-#define MAP_ANONYMOUS MAP_ANON
-#endif
-#endif
+#include "gc.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-static jl_mutex_t pagealloc_lock;
 // Protect all access to `finalizer_list`, `finalizer_list_marked` and
 // `to_finalize`.
 static jl_mutex_t finalizers_lock;
@@ -69,283 +42,20 @@ static jl_mutex_t finalizers_lock;
  * finalizers in unmanaged (GC safe) mode.
*/ -// manipulating mark bits - -#define GC_CLEAN 0 // freshly allocated -#define GC_MARKED 1 // reachable and old -#define GC_QUEUED 2 // if it is reachable it will be marked as old -#define GC_MARKED_NOESC (GC_MARKED | GC_QUEUED) // reachable and young - -// This struct must be kept in sync with the Julia type of the same name in base/util.jl -typedef struct { - int64_t allocd; - int64_t freed; - uint64_t malloc; - uint64_t realloc; - uint64_t poolalloc; - uint64_t bigalloc; - uint64_t freecall; - uint64_t total_time; - uint64_t total_allocd; - uint64_t since_sweep; - size_t collect; - int pause; - int full_sweep; -} GC_Num; - -static GC_Num gc_num = {0,0,0,0,0,0,0,0,0,0,0,0,0}; - -#define collect_interval gc_num.collect -#define n_pause gc_num.pause -#define n_full_sweep gc_num.full_sweep -#define allocd_bytes gc_num.allocd -#define freed_bytes gc_num.freed -#define total_gc_time gc_num.total_time -#define total_allocd_bytes gc_num.total_allocd -#define allocd_bytes_since_sweep gc_num.since_sweep +jl_gc_num_t gc_num = {0,0,0,0,0,0,0,0,0,0,0,0,0}; static size_t last_long_collect_interval; -typedef struct _buff_t { - union { - uintptr_t header; - struct _buff_t *next; - uintptr_t flags; - jl_value_t *type; // 16-bytes aligned - struct { - uintptr_t gc_bits:2; - uintptr_t pooled:1; - }; - }; - // Work around a bug affecting gcc up to (at least) version 4.4.7 - // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=36839 -#if !defined(_COMPILER_MICROSOFT_) - int _dummy[0]; -#endif - char data[]; -} buff_t; -typedef buff_t gcval_t; - - -// layout for big (>2k) objects - -typedef struct _bigval_t { - struct _bigval_t *next; - struct _bigval_t **prev; // pointer to the next field of the prev entry - union { - size_t sz; - uintptr_t age : 2; - }; - #ifdef _P64 // Add padding so that char data[] below is 64-byte aligned - // (8 pointers of 8 bytes each) - (4 other pointers in struct) - void *_padding[8 - 4]; - #else - // (16 pointers of 4 bytes each) - (4 other pointers in struct) - void *_padding[16 - 4]; - #endif - //struct buff_t <>; - union { - uintptr_t header; - uintptr_t flags; - uintptr_t gc_bits:2; - }; - // Work around a bug affecting gcc up to (at least) version 4.4.7 - // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=36839 -#if !defined(_COMPILER_MICROSOFT_) - int _dummy[0]; -#endif - // must be 64-byte aligned here, in 32 & 64 bit modes - char data[]; -} bigval_t; - -#define bigval_header(data) container_of((data), bigval_t, header) - -// data structure for tracking malloc'd arrays. 
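The `bigval_header` macro removed above is the classic `container_of` pattern: big objects hand out a pointer to their `header` field, and the GC recovers the enclosing `bigval_t` by subtracting the member's offset. A minimal sketch of that offset arithmetic with a toy struct (illustrative only, not the real `bigval_t` layout):

    #include <stddef.h>

    typedef struct { void *next; long header; } toy_big_t;

    // Recover the enclosing struct from a pointer to its `header` member
    // by subtracting the member's offset within the struct.
    #define toy_header(p) \
        ((toy_big_t*)((char*)(p) - offsetof(toy_big_t, header)))
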
- -typedef struct _mallocarray_t { - jl_array_t *a; - struct _mallocarray_t *next; -} mallocarray_t; - -typedef struct _pool_t { - gcval_t *freelist; // root of list of free objects - gcval_t *newpages; // root of list of chunks of free objects - uint16_t end_offset; // stored to avoid computing it at each allocation - uint16_t osize; // size of objects in this pool - uint16_t nfree; // number of free objects in page pointed into by free_list -} pool_t; - -// layout for small (<2k) objects - -#define GC_PAGE_LG2 14 // log2(size of a page) -#define GC_PAGE_SZ (1 << GC_PAGE_LG2) // 16k -#define GC_PAGE_OFFSET (JL_SMALL_BYTE_ALIGNMENT - (sizeof_jl_taggedvalue_t % JL_SMALL_BYTE_ALIGNMENT)) - -// pool page metadata -typedef struct _gcpage_t { - struct { - uint16_t pool_n : 8; // index (into norm_pool) of pool that owns this page - uint16_t allocd : 1; // true if an allocation happened in this page since last sweep - uint16_t gc_bits : 2; // this is a bitwise | of all gc_bits in this page - }; - uint16_t nfree; // number of free objects in this page. - // invalid if pool that owns this page is allocating objects from this page. - uint16_t osize; // size of each object in this page - uint16_t fl_begin_offset; // offset of first free object in this page - uint16_t fl_end_offset; // offset of last free object in this page - uint16_t thread_n; // index (into jl_thread_heap) of heap that owns this page - char *data; - uint8_t *ages; -} gcpage_t; - -#define PAGE_PFL_BEG(p) ((gcval_t**)((p->data) + (p)->fl_begin_offset)) -#define PAGE_PFL_END(p) ((gcval_t**)((p->data) + (p)->fl_end_offset)) -// round an address inside a gcpage's data to its beginning -#define GC_PAGE_DATA(x) ((char*)((uintptr_t)(x) >> GC_PAGE_LG2 << GC_PAGE_LG2)) - -// A region is contiguous storage for up to REGION_PG_COUNT naturally aligned GC_PAGE_SZ pages -// It uses a very naive allocator (see malloc_page & free_page) -#if defined(_P64) && !defined(_COMPILER_MICROSOFT_) -#define REGION_PG_COUNT 16*8*4096 // 8G because virtual memory is cheap -#else -#define REGION_PG_COUNT 8*4096 // 512M -#endif -#define REGION_COUNT 8 - -typedef struct { - // Page layout: - // Padding: GC_PAGE_OFFSET - // Blocks: osize * n - // Tag: sizeof_jl_taggedvalue_t - // Data: <= osize - sizeof_jl_taggedvalue_t - char pages[REGION_PG_COUNT][GC_PAGE_SZ]; // must be first, to preserve page alignment - uint32_t freemap[REGION_PG_COUNT/32]; - gcpage_t meta[REGION_PG_COUNT]; -} region_t -#if !defined(_COMPILER_MICROSOFT_) && !(defined(_COMPILER_MINGW_) && defined(_COMPILER_CLANG_)) -__attribute__((aligned(GC_PAGE_SZ))) -#endif -; - -static region_t *regions[REGION_COUNT] = {NULL}; -// store a lower bound of the first free page in each region -static int regions_lb[REGION_COUNT] = {0}; -// an upper bound of the last non-free page -static int regions_ub[REGION_COUNT] = {REGION_PG_COUNT/32-1}; - -// Variables that become fields of a thread-local struct in the thread-safe version. 
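Because every GC page is naturally aligned to `GC_PAGE_SZ`, the `GC_PAGE_DATA` macro above (renamed `gc_page_data` in the new `gc.h`) can round any interior pointer down to its page just by shifting away the low `GC_PAGE_LG2` bits, and `PAGE_INDEX`/`page_index` then derives the page's slot from the distance to the region's first page. A sketch of that arithmetic under the constants above (helper names are illustrative):

    #include <stdint.h>

    #define PAGE_LG2 14                // log2 of the 16k page size
    #define PAGE_SZ  (1 << PAGE_LG2)

    // Round an interior pointer down to the start of its GC page.
    static char *page_of(void *p)
    {
        return (char*)((uintptr_t)p >> PAGE_LG2 << PAGE_LG2);
    }

    // Index of the page containing `p`, relative to the first page
    // of a region whose page array starts at `region_begin`.
    static int page_index_of(char *region_begin, void *p)
    {
        return (int)((page_of(p) - region_begin) / PAGE_SZ);
    }
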
-typedef struct _jl_thread_heap_t { - // variable for tracking weak references - arraylist_t weak_refs; - - // variables for tracking malloc'd arrays - mallocarray_t *mallocarrays; - mallocarray_t *mafreelist; - - // variables for tracking big objects - bigval_t *big_objects; - - // variables for tracking "remembered set" - arraylist_t rem_bindings; - arraylist_t _remset[2]; // contains jl_value_t* - // lower bound of the number of pointers inside remembered values - int remset_nptr; - arraylist_t *remset; - arraylist_t *last_remset; - - // variables for allocating objects from pools -#ifdef _P64 -#define N_POOLS 41 -#else -#define N_POOLS 43 -#endif - pool_t norm_pools[N_POOLS]; -} jl_thread_heap_t; - -typedef struct { - int index; - jl_thread_heap_t *heap; -} jl_each_heap_index_t; - -typedef struct { - int i; - jl_thread_heap_t *heap; -} jl_single_heap_index_t; - -// Include before gc-debug for objprofile -static const uintptr_t jl_buff_tag = 0x4eade800; -static void *const jl_malloc_tag = (void*)0xdeadaa01; -#ifdef OBJPROFILE -static void *const jl_singleton_tag = (void*)0xdeadaa02; -#endif - -#define current_heap __current_heap_idx.heap -#define current_heap_index __current_heap_idx.index -// This chould trigger a false positive warning with both gcc and clang -// since the compiler couldn't figure out that the loop is executed at least -// once. -// gcc bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68336 -// clang bug: https://llvm.org/bugs/show_bug.cgi?id=25521 -#define _FOR_SINGLE_HEAP(heap) \ - for (jl_single_heap_index_t __current_heap_idx = {1, heap}; \ - --__current_heap_idx.i >= 0;) -#define FOR_CURRENT_HEAP() _FOR_SINGLE_HEAP(jl_thread_heap) - -// The following macros are used for accessing these variables. -// In the multi-threaded version, they establish the desired thread context. -// In the single-threaded version, they are essentially noops, but nonetheless -// serve to check that the thread context macros are being used. -#ifdef JULIA_ENABLE_THREADING -#define jl_thread_heap (jl_get_ptls_states()->heap) -#define FOR_EACH_HEAP() \ - for (jl_each_heap_index_t __current_heap_idx = {jl_n_threads, NULL}; \ - --current_heap_index >= 0 && \ - ((current_heap = jl_all_heaps[current_heap_index]), 1);) -#define FOR_HEAP(t_n) _FOR_SINGLE_HEAP(jl_all_heaps[t_n]) -/*}}*/ -#else -static jl_thread_heap_t _jl_thread_heap; -static jl_thread_heap_t *const jl_thread_heap = &_jl_thread_heap; -#define FOR_EACH_HEAP() \ - for (jl_each_heap_index_t __current_heap_idx = {1, jl_thread_heap}; \ - --current_heap_index >= 0;) -#define FOR_HEAP(t_n) _FOR_SINGLE_HEAP(jl_thread_heap) -#endif - -#define HEAP(x) (current_heap->x) -#define weak_refs HEAP(weak_refs) -#define big_objects HEAP(big_objects) -#define mallocarrays HEAP(mallocarrays) -#define mafreelist HEAP(mafreelist) -#define remset HEAP(remset) -#define remset_nptr HEAP(remset_nptr) -#define last_remset HEAP(last_remset) -#define rem_bindings HEAP(rem_bindings) -#define pools HEAP(norm_pools) +region_t regions[REGION_COUNT]; // List of marked big objects. Not per-thread. Accessed only by master thread. 
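From here on, the `FOR_EACH_HEAP`/`FOR_CURRENT_HEAP` macro machinery deleted above is replaced throughout the patch by explicit loops over the per-thread states. A sketch of the recurring shape, using the declarations that appear elsewhere in this diff (`jl_n_threads`, `jl_all_task_states`, and the `heap` field of `jl_tls_states_t`):

    // Replacement idiom: visit every thread's GC heap explicitly
    // instead of hiding the iteration behind FOR_EACH_HEAP().
    for (int t_i = 0; t_i < jl_n_threads; t_i++) {
        jl_tls_states_t *ptls = jl_all_task_states[t_i].ptls;
        // ... operate on ptls->heap.big_objects, ptls->heap.norm_pools, etc.
    }

Single-heap `FOR_CURRENT_HEAP()` blocks likewise collapse to direct accesses through `jl_thread_heap`, as in `jl_gc_new_weakref` and `alloc_big` below.
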
-static bigval_t *big_objects_marked = NULL;
+bigval_t *big_objects_marked = NULL;
 
 // finalization
-static arraylist_t finalizer_list;
-static arraylist_t finalizer_list_marked;
-static arraylist_t to_finalize;
-
-#define should_timeout() 0
-
-#define gc_bits(o) (((gcval_t*)(o))->gc_bits)
-#define gc_marked(o) (((gcval_t*)(o))->gc_bits & GC_MARKED)
-#define _gc_setmark(o, mark_mode) (((gcval_t*)(o))->gc_bits = mark_mode)
-
-static gcpage_t *page_metadata(void *data);
-static void pre_mark(void);
-static void post_mark(arraylist_t *list, int dryrun);
-static region_t *find_region(void *ptr, int maybe);
-
-#define PAGE_INDEX(region, data) \
-    ((GC_PAGE_DATA((data) - GC_PAGE_OFFSET) - \
-      &(region)->pages[0][0])/GC_PAGE_SZ)
+arraylist_t finalizer_list;
+arraylist_t finalizer_list_marked;
+arraylist_t to_finalize;
 
-NOINLINE static uintptr_t gc_get_stack_ptr(void)
+NOINLINE uintptr_t gc_get_stack_ptr(void)
 {
     void *dummy = NULL;
     // The mask is to suppress the compiler warning about returning
@@ -353,7 +63,7 @@ NOINLINE static uintptr_t gc_get_stack_ptr(void)
     return (uintptr_t)&dummy & ~(uintptr_t)15;
 }
 
-#include "gc-debug.c"
+#define should_timeout() 0
 
 #ifdef JULIA_ENABLE_THREADING
 static void jl_gc_wait_for_the_world(void)
@@ -538,35 +248,8 @@ JL_DLLEXPORT void jl_finalize(jl_value_t *o)
     arraylist_free(&copied_list);
 }
 
-static region_t *find_region(void *ptr, int maybe)
-{
-    // on 64bit systems we could probably use a single region and remove this loop
-    for (int i = 0; i < REGION_COUNT && regions[i]; i++) {
-        char *begin = &regions[i]->pages[0][0];
-        char *end = begin + sizeof(regions[i]->pages);
-        if ((char*)ptr >= begin && (char*)ptr <= end)
-            return regions[i];
-    }
-    (void)maybe;
-    assert(maybe && "find_region failed");
-    return NULL;
-}
-
-static gcpage_t *page_metadata(void *data)
-{
-    region_t *r = find_region(data, 0);
-    int pg_idx = PAGE_INDEX(r, (char*)data);
-    return &r->meta[pg_idx];
-}
-
-static uint8_t *page_age(gcpage_t *pg)
-{
-    return pg->ages;
-}
-
 #define GC_POOL_END_OFS(osize) ((((GC_PAGE_SZ - GC_PAGE_OFFSET)/(osize)) - 1)*(osize) + GC_PAGE_OFFSET)
-
 // GC knobs and self-measurement variables
 static int64_t last_gc_total_bytes = 0;
@@ -584,8 +267,6 @@ static size_t max_collect_interval = 500000000UL;
 #define NS2MS(t) ((double)(t/1000)/1000)
 static int64_t live_bytes = 0;
 static int64_t promoted_bytes = 0;
-static size_t current_pg_count = 0;
-static size_t max_pg_count = 0;
 
 JL_DLLEXPORT size_t jl_gc_total_freed_bytes=0;
 #ifdef GC_FINAL_STATS
@@ -647,12 +328,10 @@ static size_t array_nbytes(jl_array_t*);
 
 static inline int gc_setmark_big(void *o, int mark_mode)
 {
-#ifdef GC_VERIFY
-    if (verifying) {
+    if (gc_verifying) {
         _gc_setmark(o, mark_mode);
         return 0;
     }
-#endif
     assert(find_region(o,1) == NULL);
     bigval_t *hdr = bigval_header(o);
     int bits = gc_bits(o);
@@ -684,13 +363,11 @@ static inline int gc_setmark_big(void *o, int mark_mode)
 
 static inline int gc_setmark_pool(void *o, int mark_mode)
 {
-#ifdef GC_VERIFY
-    if (verifying) {
+    if (gc_verifying) {
         _gc_setmark(o, mark_mode);
         return mark_mode;
     }
-#endif
-    gcpage_t *page = page_metadata(o);
+    jl_gc_pagemeta_t *page = page_metadata(o);
     int bits = gc_bits(o);
     if (bits == GC_QUEUED || bits == GC_MARKED) {
         mark_mode = GC_MARKED;
@@ -739,125 +416,7 @@ inline void gc_setmark_buf(void *o, int mark_mode)
     gc_setmark_big(buf, mark_mode);
 }
 
-static NOINLINE void *malloc_page(void)
-{
-    void *ptr = (void*)0;
-    int i;
-    region_t *region;
-    int region_i = 0;
-    JL_LOCK_NOGC(&pagealloc_lock);
-    while(region_i < REGION_COUNT) {
-        region =
regions[region_i]; - if (region == NULL) { - size_t alloc_size = sizeof(region_t); -#ifdef _OS_WINDOWS_ - char *mem = (char*)VirtualAlloc(NULL, sizeof(region_t) + GC_PAGE_SZ, MEM_RESERVE, PAGE_READWRITE); -#else - if (GC_PAGE_SZ > jl_page_size) - alloc_size += GC_PAGE_SZ; - char *mem = (char*)mmap(0, alloc_size, PROT_READ | PROT_WRITE, MAP_NORESERVE | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - mem = mem == MAP_FAILED ? NULL : mem; -#endif - if (mem == NULL) { - jl_printf(JL_STDERR, "could not allocate pools\n"); - gc_debug_critical_error(); - abort(); - } - if (GC_PAGE_SZ > jl_page_size) { - // round data pointer up to the nearest GC_PAGE_DATA-aligned boundary - // if mmap didn't already do so - alloc_size += GC_PAGE_SZ; - region = (region_t*)((char*)GC_PAGE_DATA(mem + GC_PAGE_SZ - 1)); - } - else { - region = (region_t*)mem; - } -#ifdef _OS_WINDOWS_ - VirtualAlloc(region->freemap, REGION_PG_COUNT/8, MEM_COMMIT, PAGE_READWRITE); - VirtualAlloc(region->meta, REGION_PG_COUNT*sizeof(gcpage_t), MEM_COMMIT, PAGE_READWRITE); -#endif - memset(region->freemap, 0xff, REGION_PG_COUNT/8); - regions[region_i] = region; - } - for(i = regions_lb[region_i]; i < REGION_PG_COUNT/32; i++) { - if (region->freemap[i]) break; - } - if (i == REGION_PG_COUNT/32) { - // region full - region_i++; - continue; - } - break; - } - if (region_i >= REGION_COUNT) { - jl_printf(JL_STDERR, "increase REGION_COUNT or allocate less memory\n"); - gc_debug_critical_error(); - abort(); - } - if (regions_lb[region_i] < i) - regions_lb[region_i] = i; - if (regions_ub[region_i] < i) - regions_ub[region_i] = i; - -#if defined(_COMPILER_MINGW_) - int j = __builtin_ffs(region->freemap[i]) - 1; -#elif defined(_COMPILER_MICROSOFT_) - unsigned long j; - _BitScanForward(&j, region->freemap[i]); -#else - int j = ffs(region->freemap[i]) - 1; -#endif - - region->freemap[i] &= ~(uint32_t)(1 << j); - ptr = region->pages[i*32 + j]; -#ifdef _OS_WINDOWS_ - VirtualAlloc(ptr, GC_PAGE_SZ, MEM_COMMIT, PAGE_READWRITE); -#endif - current_pg_count++; - max_pg_count = max_pg_count < current_pg_count ? 
current_pg_count : max_pg_count;
-    JL_UNLOCK_NOGC(&pagealloc_lock);
-    return ptr;
-}
-
-static void free_page(void *p)
-{
-    int pg_idx = -1;
-    int i;
-    for(i = 0; i < REGION_COUNT && regions[i] != NULL; i++) {
-        pg_idx = PAGE_INDEX(regions[i], (char*)p+GC_PAGE_OFFSET);
-        if (pg_idx >= 0 && pg_idx < REGION_PG_COUNT) break;
-    }
-    assert(i < REGION_COUNT && regions[i] != NULL);
-    region_t *region = regions[i];
-    uint32_t msk = (uint32_t)(1 << (pg_idx % 32));
-    assert(!(region->freemap[pg_idx/32] & msk));
-    region->freemap[pg_idx/32] ^= msk;
-    free(region->meta[pg_idx].ages);
-    // tell the OS we don't need these pages right now
-    size_t decommit_size = GC_PAGE_SZ;
-    if (GC_PAGE_SZ < jl_page_size) {
-        // ensure so we don't release more memory than intended
-        size_t n_pages = (GC_PAGE_SZ + jl_page_size - 1) / GC_PAGE_SZ;
-        decommit_size = jl_page_size;
-        p = (void*)((uintptr_t)&region->pages[pg_idx][0] & ~(jl_page_size - 1)); // round down to the nearest page
-        pg_idx = PAGE_INDEX(region, (char*)p+GC_PAGE_OFFSET);
-        if (pg_idx + n_pages > REGION_PG_COUNT) goto no_decommit;
-        for (; n_pages--; pg_idx++) {
-            msk = (uint32_t)(1 << ((pg_idx % 32)));
-            if (!(region->freemap[pg_idx/32] & msk)) goto no_decommit;
-        }
-    }
-#ifdef _OS_WINDOWS_
-    VirtualFree(p, decommit_size, MEM_DECOMMIT);
-#else
-    madvise(p, decommit_size, MADV_DONTNEED);
-#endif
-no_decommit:
-    if (regions_lb[i] > pg_idx/32) regions_lb[i] = pg_idx/32;
-    current_pg_count--;
-}
-
-#define should_collect() (__unlikely(allocd_bytes>0))
+#define should_collect() (__unlikely(gc_num.allocd>0))
 
 static inline int maybe_collect(void)
 {
@@ -876,17 +435,19 @@ JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value)
     jl_weakref_t *wr = (jl_weakref_t*)jl_gc_alloc_1w();
     jl_set_typeof(wr, jl_weakref_type);
     wr->value = value; // NOTE: wb not needed here
-    FOR_CURRENT_HEAP ()
-        arraylist_push(&weak_refs, wr);
+    arraylist_push(&jl_thread_heap->weak_refs, wr);
     return wr;
 }
 
 static void sweep_weak_refs(void)
 {
-    FOR_EACH_HEAP () {
-        size_t n=0, ndel=0, l=weak_refs.len;
+    for (int i = 0;i < jl_n_threads;i++) {
+        jl_tls_states_t *ptls = jl_all_task_states[i].ptls;
+        size_t n = 0;
+        size_t ndel = 0;
+        size_t l = ptls->heap.weak_refs.len;
         jl_weakref_t *wr;
-        void **lst = weak_refs.items;
+        void **lst = ptls->heap.weak_refs.items;
         void *tmp;
 #define SWAP_wr(a,b) (tmp=a,a=b,b=tmp,1)
         if (l == 0)
@@ -904,7 +465,7 @@ static void sweep_weak_refs(void)
             }
         } while ((n < l-ndel) && SWAP_wr(lst[n],lst[n+ndel]));
 
-        weak_refs.len -= ndel;
+        ptls->heap.weak_refs.len -= ndel;
     }
 }
 
@@ -920,7 +481,7 @@ static NOINLINE void *alloc_big(size_t sz)
     bigval_t *v = (bigval_t*)malloc_cache_align(allocsz);
     if (v == NULL)
         jl_throw(jl_memory_exception);
-    jl_atomic_fetch_add(&allocd_bytes, allocsz);
+    jl_atomic_fetch_add(&gc_num.allocd, allocsz);
     gc_num.bigalloc++;
 #ifdef MEMDEBUG
     memset(v, 0xee, allocsz);
 #endif
     v->sz = allocsz;
     v->flags = 0;
     v->age = 0;
-    FOR_CURRENT_HEAP () {
-        v->next = big_objects;
-        v->prev = &big_objects;
-        if (v->next)
-            v->next->prev = &v->next;
-        big_objects = v;
-    }
+    v->next = jl_thread_heap->big_objects;
+    v->prev = &jl_thread_heap->big_objects;
+    if (v->next)
+        v->next->prev = &v->next;
+    jl_thread_heap->big_objects = v;
     return (void*)&v->header;
 }
 
@@ -973,7 +532,7 @@ static bigval_t **sweep_big_list(int sweep_mask, bigval_t **pv)
             *pv = nxt;
             if (nxt)
                 nxt->prev = pv;
-            freed_bytes += v->sz&~3;
+            gc_num.freed += v->sz&~3;
 #ifdef MEMDEBUG
             memset(v, 0xbb, v->sz&~3);
 #endif
@@ -988,19 +547,18 @@ static
bigval_t **sweep_big_list(int sweep_mask, bigval_t **pv) static void sweep_big(int sweep_mask) { - FOR_EACH_HEAP () - sweep_big_list(sweep_mask, &big_objects); + for (int i = 0;i < jl_n_threads;i++) + sweep_big_list(sweep_mask, + &jl_all_task_states[i].ptls->heap.big_objects); if (sweep_mask == GC_MARKED) { bigval_t **last_next = sweep_big_list(sweep_mask, &big_objects_marked); // Move all survivors from big_objects_marked list to big_objects list. - FOR_CURRENT_HEAP () { - if (big_objects) - big_objects->prev = last_next; - *last_next = big_objects; - big_objects = big_objects_marked; - if (big_objects) - big_objects->prev = &big_objects; - } + if (jl_thread_heap->big_objects) + jl_thread_heap->big_objects->prev = last_next; + *last_next = jl_thread_heap->big_objects; + jl_thread_heap->big_objects = big_objects_marked; + if (jl_thread_heap->big_objects) + jl_thread_heap->big_objects->prev = &jl_thread_heap->big_objects; big_objects_marked = NULL; } } @@ -1009,24 +567,22 @@ static void sweep_big(int sweep_mask) void jl_gc_track_malloced_array(jl_array_t *a) { - FOR_CURRENT_HEAP () { - mallocarray_t *ma; - if (mafreelist == NULL) { - ma = (mallocarray_t*)malloc(sizeof(mallocarray_t)); - } - else { - ma = mafreelist; - mafreelist = ma->next; - } - ma->a = a; - ma->next = mallocarrays; - mallocarrays = ma; + mallocarray_t *ma; + if (jl_thread_heap->mafreelist == NULL) { + ma = (mallocarray_t*)malloc(sizeof(mallocarray_t)); } + else { + ma = jl_thread_heap->mafreelist; + jl_thread_heap->mafreelist = ma->next; + } + ma->a = a; + ma->next = jl_thread_heap->mallocarrays; + jl_thread_heap->mallocarrays = ma; } void jl_gc_count_allocd(size_t sz) { - allocd_bytes += sz; + gc_num.allocd += sz; } static size_t array_nbytes(jl_array_t *a) @@ -1047,7 +603,7 @@ static void jl_gc_free_array(jl_array_t *a) jl_free_aligned(d); else free(d); - freed_bytes += array_nbytes(a); + gc_num.freed += array_nbytes(a); } } @@ -1057,9 +613,10 @@ static int mallocd_array_freed; static void sweep_malloced_arrays(void) { - FOR_EACH_HEAP () { - mallocarray_t *ma = mallocarrays; - mallocarray_t **pma = &mallocarrays; + for (int t_i = 0;t_i < jl_n_threads;t_i++) { + jl_tls_states_t *ptls = jl_all_task_states[t_i].ptls; + mallocarray_t *ma = ptls->heap.mallocarrays; + mallocarray_t **pma = &ptls->heap.mallocarrays; while (ma != NULL) { mallocarray_t *nxt = ma->next; if (gc_marked(jl_astaggedvalue(ma->a))) { @@ -1069,8 +626,8 @@ static void sweep_malloced_arrays(void) *pma = nxt; assert(ma->a->flags.how == 2); jl_gc_free_array(ma->a); - ma->next = mafreelist; - mafreelist = ma; + ma->next = ptls->heap.mafreelist; + ptls->heap.mafreelist = ma; mallocd_array_freed++; } mallocd_array_total++; @@ -1080,13 +637,13 @@ static void sweep_malloced_arrays(void) } // pool allocation -static inline gcval_t *reset_page(pool_t *p, gcpage_t *pg, gcval_t *fl) +static inline gcval_t *reset_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, gcval_t *fl) { pg->gc_bits = 0; pg->nfree = (GC_PAGE_SZ - GC_PAGE_OFFSET) / p->osize; - FOR_HEAP (pg->thread_n) - pg->pool_n = p - pools; - memset(page_age(pg), 0, LLT_ALIGN(GC_PAGE_SZ / p->osize, 8)); + jl_tls_states_t *ptls = jl_all_task_states[pg->thread_n].ptls; + pg->pool_n = p - ptls->heap.norm_pools; + memset(pg->ages, 0, GC_PAGE_SZ / 8 / p->osize + 1); gcval_t *beg = (gcval_t*)(pg->data + GC_PAGE_OFFSET); gcval_t *end = (gcval_t*)((char*)beg + (pg->nfree - 1)*p->osize); end->next = fl; @@ -1096,31 +653,31 @@ static inline gcval_t *reset_page(pool_t *p, gcpage_t *pg, gcval_t *fl) return beg; } -static 
NOINLINE void add_page(pool_t *p) +static NOINLINE void add_page(jl_gc_pool_t *p) { - char *data = (char*)malloc_page(); + char *data = (char*)jl_gc_alloc_page(); if (data == NULL) jl_throw(jl_memory_exception); - gcpage_t *pg = page_metadata(data + GC_PAGE_OFFSET); + jl_gc_pagemeta_t *pg = page_metadata(data + GC_PAGE_OFFSET); pg->data = data; pg->osize = p->osize; - pg->ages = (uint8_t*)malloc(LLT_ALIGN(GC_PAGE_SZ / p->osize, 8)); + pg->ages = (uint8_t*)malloc(GC_PAGE_SZ / 8 / p->osize + 1); pg->thread_n = ti_tid; gcval_t *fl = reset_page(p, pg, p->newpages); p->newpages = fl; } -static inline void *__pool_alloc(pool_t *p, int osize, int end_offset) +static inline void *__pool_alloc(jl_gc_pool_t *p, int osize, int end_offset) { #ifdef MEMDEBUG assert(0 && "Should not be using pools in MEMDEBUG mode"); #endif gcval_t *v, *end; // FIXME - need JL_ATOMIC_FETCH_AND_ADD here - if (__unlikely((allocd_bytes += osize) >= 0) || gc_debug_check_pool()) { - //allocd_bytes -= osize; + if (__unlikely((gc_num.allocd += osize) >= 0) || gc_debug_check_pool()) { + //gc_num.allocd -= osize; jl_gc_collect(0); - //allocd_bytes += osize; + //gc_num.allocd += osize; } else { jl_gc_safepoint(); @@ -1133,10 +690,10 @@ static inline void *__pool_alloc(pool_t *p, int osize, int end_offset) v->flags = 0; p->nfree--; p->freelist = next; - if (__unlikely(GC_PAGE_DATA(v) != GC_PAGE_DATA(next))) { + if (__unlikely(gc_page_data(v) != gc_page_data(next))) { // we only update pg's fields when the freelist changes page // since pg's metadata is likely not in cache - gcpage_t *pg = page_metadata(v); + jl_gc_pagemeta_t *pg = page_metadata(v); assert(pg->osize == p->osize); pg->nfree = 0; pg->allocd = 1; @@ -1151,13 +708,13 @@ static inline void *__pool_alloc(pool_t *p, int osize, int end_offset) add_page(p); v = p->newpages; } - end = (gcval_t*)&(GC_PAGE_DATA(v)[end_offset]); + end = (gcval_t*)&(gc_page_data(v)[end_offset]); if (__likely(v != end)) { p->newpages = (gcval_t*)((char*)v + osize); } else { // like the freelist case, but only update the page metadata when it is full - gcpage_t *pg = page_metadata(v); + jl_gc_pagemeta_t *pg = page_metadata(v); assert(pg->osize == p->osize); pg->nfree = 0; pg->allocd = 1; @@ -1170,18 +727,18 @@ static inline void *__pool_alloc(pool_t *p, int osize, int end_offset) // use this variant when osize is statically known // and is definitely in sizeclasses // GC_POOL_END_OFS uses an integer division -static inline void *_pool_alloc(pool_t *p, int osize) +static inline void *_pool_alloc(jl_gc_pool_t *p, int osize) { return __pool_alloc(p, osize, GC_POOL_END_OFS(osize)); } -static inline void *pool_alloc(pool_t *p) +static inline void *pool_alloc(jl_gc_pool_t *p) { return __pool_alloc(p, p->osize, p->end_offset); } // pools are 16376 bytes large (GC_POOL_SZ - GC_PAGE_OFFSET) -static const int sizeclasses[N_POOLS] = { +static const int sizeclasses[JL_GC_N_POOLS] = { #ifdef _P64 8, #else @@ -1229,7 +786,7 @@ static inline int szclass(size_t sz) return 16 - 16376 / 4 / LLT_ALIGN(sz, 16 * 4) + 16 + N; if (sz <= 1008) return 16 - 16376 / 2 / LLT_ALIGN(sz, 16 * 2) + 24 + N; - assert(sz <= GC_MAX_SZCLASS + sizeof(buff_t) && sizeclasses[N_POOLS-1] == GC_MAX_SZCLASS + sizeof(buff_t)); + assert(sz <= GC_MAX_SZCLASS + sizeof(buff_t) && sizeclasses[JL_GC_N_POOLS-1] == GC_MAX_SZCLASS + sizeof(buff_t)); return 16 - 16376 / 1 / LLT_ALIGN(sz, 16 * 1) + 32 + N; } @@ -1240,28 +797,27 @@ static int total_pages = 0; static int freed_pages = 0; static int lazy_freed_pages = 0; static int page_done = 0; -static 
gcval_t **sweep_page(pool_t *p, gcpage_t *pg, gcval_t **pfl,int,int);
+static gcval_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, gcval_t **pfl,int,int);
 
 static void sweep_pool_region(gcval_t ***pfl, int region_i, int sweep_mask)
 {
-    region_t *region = regions[region_i];
+    region_t *region = &regions[region_i];
 
     // the actual sweeping
     int ub = 0;
-    int lb = regions_lb[region_i];
-    for (int pg_i = 0; pg_i <= regions_ub[region_i]; pg_i++) {
-        uint32_t line = region->freemap[pg_i];
-        if (!!~line) {
+    int lb = region->lb;
+    for (int pg_i = 0; pg_i <= region->ub; pg_i++) {
+        uint32_t line = region->allocmap[pg_i];
+        if (line) {
             ub = pg_i;
             for (int j = 0; j < 32; j++) {
-                if (!((line >> j) & 1)) {
-                    gcpage_t *pg = &region->meta[pg_i*32 + j];
+                if ((line >> j) & 1) {
+                    jl_gc_pagemeta_t *pg = &region->meta[pg_i*32 + j];
                     int p_n = pg->pool_n;
                     int t_n = pg->thread_n;
-                    pool_t *p = NULL;
-                    FOR_HEAP (t_n)
-                        p = &pools[p_n];
+                    jl_tls_states_t *ptls = jl_all_task_states[t_n].ptls;
+                    jl_gc_pool_t *p = &ptls->heap.norm_pools[p_n];
                     int osize = pg->osize;
-                    pfl[t_n * N_POOLS + p_n] = sweep_page(p, pg, pfl[t_n * N_POOLS + p_n], sweep_mask, osize);
+                    pfl[t_n * JL_GC_N_POOLS + p_n] = sweep_page(p, pg, pfl[t_n * JL_GC_N_POOLS + p_n], sweep_mask, osize);
                 }
             }
         }
@@ -1269,12 +825,12 @@ static void sweep_pool_region(gcval_t ***pfl, int region_i, int sweep_mask)
             lb = pg_i;
         }
     }
-    regions_ub[region_i] = ub;
-    regions_lb[region_i] = lb;
+    region->ub = ub;
+    region->lb = lb;
 }
 
 // Returns pointer to terminal pointer of list rooted at *pfl.
-static gcval_t **sweep_page(pool_t *p, gcpage_t *pg, gcval_t **pfl, int sweep_mask, int osize)
+static gcval_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, gcval_t **pfl, int sweep_mask, int osize)
 {
     int freedall;
     gcval_t **prev_pfl = pfl;
@@ -1283,7 +839,7 @@ static gcval_t **sweep_page(pool_t *p, gcpage_t *pg, gcval_t **pfl, int sweep_ma
     int pg_freedall = 0, pg_total = 0, pg_skpd = 0;
     int obj_per_page = (GC_PAGE_SZ - GC_PAGE_OFFSET)/osize;
     char *data = pg->data;
-    uint8_t *ages = page_age(pg);
+    uint8_t *ages = pg->ages;
     v = (gcval_t*)(data + GC_PAGE_OFFSET);
     char *lim = (char*)v + GC_PAGE_SZ - GC_PAGE_OFFSET - osize;
     freedall = 1;
@@ -1296,8 +852,8 @@ static gcval_t **sweep_page(pool_t *p, gcpage_t *pg, gcval_t **pfl, int sweep_ma
     if (sweep_mask == GC_MARKED_NOESC && !pg->allocd) {
         // the position of the freelist begin/end in this page is stored in its metadata
         if (pg->fl_begin_offset != (uint16_t)-1) {
-            *pfl = (gcval_t*)PAGE_PFL_BEG(pg);
-            pfl = prev_pfl = PAGE_PFL_END(pg);
+            *pfl = page_pfl_beg(pg);
+            pfl = prev_pfl = (gcval_t**)page_pfl_end(pg);
         }
         pg_skpd++;
         freedall = 0;
@@ -1373,9 +929,9 @@ static gcval_t **sweep_page(pool_t *p, gcpage_t *pg, gcval_t **pfl, int sweep_ma
 #ifdef MEMDEBUG
         memset(pg->data, 0xbb, GC_PAGE_SZ);
 #endif
-        free_page(data);
+        jl_gc_free_page(data);
 #ifdef MEMDEBUG
-        memset(pg, 0xbb, sizeof(gcpage_t));
+        memset(pg, 0xbb, sizeof(jl_gc_pagemeta_t));
 #endif
     }
     freed_pages++;
@@ -1391,7 +947,7 @@ static gcval_t **sweep_page(pool_t *p, gcpage_t *pg, gcval_t **pfl, int sweep_ma
     skipped_pages += pg_skpd;
     total_pages += pg_total;
 
-    freed_bytes += (nfree - old_nfree)*osize;
+    gc_num.freed += (nfree - old_nfree)*osize;
     return pfl;
 }
 
@@ -1429,26 +985,27 @@ static int gc_sweep_inc(int sweep_mask)
     page_done = 0;
     int finished = 1;
 
-    gcval_t ***pfl = (gcval_t ***) alloca(jl_n_threads * N_POOLS * sizeof(gcval_t**));
+    gcval_t ***pfl = (gcval_t ***) alloca(jl_n_threads * JL_GC_N_POOLS * sizeof(gcval_t**));
 
     // update metadata of pages that were pointed to by freelist
or newpages from a pool // i.e. pages being the current allocation target - FOR_EACH_HEAP () { - for (int i = 0; i < N_POOLS; i++) { - pool_t *p = &pools[i]; + for (int t_i = 0;t_i < jl_n_threads;t_i++) { + jl_tls_states_t *ptls = jl_all_task_states[t_i].ptls; + for (int i = 0; i < JL_GC_N_POOLS; i++) { + jl_gc_pool_t *p = &ptls->heap.norm_pools[i]; gcval_t *last = p->freelist; if (last) { - gcpage_t *pg = page_metadata(last); + jl_gc_pagemeta_t *pg = page_metadata(last); pg->allocd = 1; pg->nfree = p->nfree; } p->freelist = NULL; - pfl[current_heap_index * N_POOLS + i] = &p->freelist; + pfl[t_i * JL_GC_N_POOLS + i] = &p->freelist; last = p->newpages; if (last) { - gcpage_t *pg = page_metadata(last); - pg->nfree = (GC_PAGE_SZ - ((char*)last - GC_PAGE_DATA(last))) / p->osize; + jl_gc_pagemeta_t *pg = page_metadata(last); + pg->nfree = (GC_PAGE_SZ - ((char*)last - gc_page_data(last))) / p->osize; pg->allocd = 1; } p->newpages = NULL; @@ -1456,16 +1013,17 @@ static int gc_sweep_inc(int sweep_mask) } for (int i = 0; i < REGION_COUNT; i++) { - if (regions[i]) + if (regions[i].pages) /*finished &= */sweep_pool_region(pfl, i, sweep_mask); } - // null out terminal pointers of free lists and cache back pg->nfree in the pool_t - FOR_EACH_HEAP () { - for (int i = 0; i < N_POOLS; i++) { - pool_t *p = &pools[i]; - *pfl[current_heap_index * N_POOLS + i] = NULL; + // null out terminal pointers of free lists and cache back pg->nfree in the jl_gc_pool_t + for (int t_i = 0;t_i < jl_n_threads;t_i++) { + jl_tls_states_t *ptls = jl_all_task_states[t_i].ptls; + for (int i = 0; i < JL_GC_N_POOLS; i++) { + jl_gc_pool_t *p = &ptls->heap.norm_pools[i]; + *pfl[t_i * JL_GC_N_POOLS + i] = NULL; if (p->freelist) { p->nfree = page_metadata(p->freelist)->nfree; } @@ -1504,46 +1062,43 @@ static void grow_mark_stack(void) static void reset_remset(void) { - FOR_EACH_HEAP () { - arraylist_t *tmp = remset; - remset = last_remset; - last_remset = tmp; - remset->len = 0; - remset_nptr = 0; + for (int t_i = 0;t_i < jl_n_threads;t_i++) { + jl_tls_states_t *ptls = jl_all_task_states[t_i].ptls; + arraylist_t *tmp = ptls->heap.remset; + ptls->heap.remset = ptls->heap.last_remset; + ptls->heap.last_remset = tmp; + ptls->heap.remset->len = 0; + ptls->heap.remset_nptr = 0; } } JL_DLLEXPORT void jl_gc_queue_root(jl_value_t *ptr) { - FOR_CURRENT_HEAP () { - jl_taggedvalue_t *o = jl_astaggedvalue(ptr); + jl_taggedvalue_t *o = jl_astaggedvalue(ptr); #ifndef JULIA_ENABLE_THREADING - // Disable this assert since it can happen with multithreading (same - // with the ones in gc_queue_binding) when two threads are writing - // to the same object. - assert(gc_bits(o) != GC_QUEUED); + // Disable this assert since it can happen with multithreading (same + // with the ones in gc_queue_binding) when two threads are writing + // to the same object. + assert(gc_bits(o) != GC_QUEUED); #endif - // The modification of the `gc_bits` is not atomic but it - // should be safe here since GC is not allowed to run here and we only - // write GC_QUEUED to the GC bits outside GC. This could cause - // duplicated objects in the remset but that shouldn't be a problem. - gc_bits(o) = GC_QUEUED; - arraylist_push(remset, ptr); - remset_nptr++; // conservative - } + // The modification of the `gc_bits` is not atomic but it + // should be safe here since GC is not allowed to run here and we only + // write GC_QUEUED to the GC bits outside GC. This could cause + // duplicated objects in the remset but that shouldn't be a problem. 
+ gc_bits(o) = GC_QUEUED; + arraylist_push(jl_thread_heap->remset, ptr); + jl_thread_heap->remset_nptr++; // conservative } void gc_queue_binding(jl_binding_t *bnd) { - FOR_CURRENT_HEAP () { - buff_t *buf = gc_val_buf(bnd); + buff_t *buf = gc_val_buf(bnd); #ifndef JULIA_ENABLE_THREADING - // Will fail for multithreading. See `jl_gc_queue_root` - assert(gc_bits(buf) != GC_QUEUED); + // Will fail for multithreading. See `jl_gc_queue_root` + assert(gc_bits(buf) != GC_QUEUED); #endif - gc_bits(buf) = GC_QUEUED; - arraylist_push(&rem_bindings, bnd); - } + gc_bits(buf) = GC_QUEUED; + arraylist_push(&jl_thread_heap->rem_bindings, bnd); } static int push_root(jl_value_t *v, int d, int); @@ -1589,10 +1144,9 @@ NOINLINE static int gc_mark_module(jl_module_t *m, int d) if (table[i] != HT_NOTFOUND) { jl_binding_t *b = (jl_binding_t*)table[i]; gc_setmark_buf(b, gc_bits(jl_astaggedvalue(m))); -#ifdef GC_VERIFY void *vb = gc_val_buf(b); verify_parent1("module", m, &vb, "binding_buff"); -#endif + (void)vb; if (b->value != NULL) { verify_parent2("module", m, &b->value, "binding(%s)", jl_symbol_name(b->name)); @@ -1766,10 +1320,9 @@ static int push_root(jl_value_t *v, int d, int bits) goto ret; } else if (a->flags.how == 1) { -#ifdef GC_VERIFY void *val_buf = gc_val_buf((char*)a->data - a->offset*a->elsize); verify_parent1("array", v, &val_buf, "buffer ('loc' addr is meaningless)"); -#endif + (void)val_buf; gc_setmark_buf((char*)a->data - a->offset*a->elsize, gc_bits(o)); } if (a->flags.ptrarray && a->data!=NULL) { @@ -1855,15 +1408,12 @@ static int push_root(jl_value_t *v, int d, int bits) } ret: -#ifdef GC_VERIFY - if (verifying) return bits; -#endif + if (gc_verifying) + return bits; if ((bits == GC_MARKED) && (refyoung == GC_MARKED_NOESC)) { - FOR_CURRENT_HEAP () { - remset_nptr += nptr; - // v is an old object referencing young objects - arraylist_push(remset, v); - } + jl_thread_heap->remset_nptr += nptr; + // v is an old object referencing young objects + arraylist_push(jl_thread_heap->remset, v); } return bits; @@ -1904,7 +1454,7 @@ extern jl_array_t *jl_module_init_order; static int inc_count = 0; // mark the initial root set -static void pre_mark(void) +void pre_mark(void) { // modules gc_push_root(jl_main_module, 0); @@ -1953,7 +1503,7 @@ static void pre_mark(void) // find unmarked objects that need to be finalized from the finalizer list "list". // this must happen last in the mark phase. // if dryrun == 1, it does not schedule any actual finalization and only marks finalizers -static void post_mark(arraylist_t *list, int dryrun) +void post_mark(arraylist_t *list, int dryrun) { for(size_t i=0; i < list->len; i+=2) { jl_value_t *v = (jl_value_t*)list->items[i]; @@ -2014,9 +1564,19 @@ JL_DLLEXPORT int jl_gc_is_enabled(void) return !jl_get_ptls_states()->disable_gc; } -JL_DLLEXPORT int64_t jl_gc_total_bytes(void) { return total_allocd_bytes + allocd_bytes + collect_interval; } -JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void) { return total_gc_time; } -JL_DLLEXPORT GC_Num jl_gc_num(void) { return gc_num; } +JL_DLLEXPORT int64_t jl_gc_total_bytes(void) +{ + // Sync this logic with `base/util.jl:GC_Diff` + return gc_num.total_allocd + gc_num.allocd + gc_num.interval; +} +JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void) +{ + return gc_num.total_time; +} +JL_DLLEXPORT jl_gc_num_t jl_gc_num(void) +{ + return gc_num; +} JL_DLLEXPORT int64_t jl_gc_diff_total_bytes(void) { @@ -2053,52 +1613,55 @@ static void _jl_gc_collect(int full, char *stack_hi) // 1. 
@@ -2053,52 +1613,55 @@ static void _jl_gc_collect(int full, char *stack_hi)
     // 1. mark every object in the remset
     reset_remset();
-    FOR_EACH_HEAP () {
+    for (int t_i = 0;t_i < jl_n_threads;t_i++) {
+        jl_tls_states_t *ptls = jl_all_task_states[t_i].ptls;
         // avoid counting remembered objects & bindings twice in perm_scanned_bytes
-        for(int i = 0; i < last_remset->len; i++) {
-            jl_value_t *item = (jl_value_t*)last_remset->items[i];
+        for(int i = 0; i < ptls->heap.last_remset->len; i++) {
+            jl_value_t *item = (jl_value_t*)ptls->heap.last_remset->items[i];
             objprofile_count(jl_typeof(item), 2, 0);
             gc_bits(jl_astaggedvalue(item)) = GC_MARKED;
         }
-        for (int i = 0; i < rem_bindings.len; i++) {
-            void *ptr = rem_bindings.items[i];
+        for (int i = 0; i < ptls->heap.rem_bindings.len; i++) {
+            void *ptr = ptls->heap.rem_bindings.items[i];
             gc_bits(gc_val_buf(ptr)) = GC_MARKED;
         }

-        for (int i = 0; i < last_remset->len; i++) {
-            jl_value_t *item = (jl_value_t*)last_remset->items[i];
+        for (int i = 0; i < ptls->heap.last_remset->len; i++) {
+            jl_value_t *item = (jl_value_t*)ptls->heap.last_remset->items[i];
             push_root(item, 0, GC_MARKED);
         }
     }

     // 2. mark every object in a remembered binding
-    FOR_EACH_HEAP () {
+    for (int t_i = 0;t_i < jl_n_threads;t_i++) {
+        jl_tls_states_t *ptls = jl_all_task_states[t_i].ptls;
         int n_bnd_refyoung = 0;
-        for (int i = 0; i < rem_bindings.len; i++) {
-            jl_binding_t *ptr = (jl_binding_t*)rem_bindings.items[i];
+        for (int i = 0; i < ptls->heap.rem_bindings.len; i++) {
+            jl_binding_t *ptr = (jl_binding_t*)ptls->heap.rem_bindings.items[i];
             // A null pointer can happen here when the binding is cleaned up
             // as an exception is thrown after it was already queued (#10221)
             if (!ptr->value) continue;
             if (gc_push_root(ptr->value, 0) == GC_MARKED_NOESC) {
-                rem_bindings.items[n_bnd_refyoung] = ptr;
+                ptls->heap.rem_bindings.items[n_bnd_refyoung] = ptr;
                 n_bnd_refyoung++;
             }
        }
-        rem_bindings.len = n_bnd_refyoung;
+        ptls->heap.rem_bindings.len = n_bnd_refyoung;
     }
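Aside (illustration only, not part of the patch): the loop above filters rem_bindings in place, keeping only bindings that still reference young values and shrinking len afterwards. The generic idiom, with hypothetical names:

    /* Filter-in-place: keep items satisfying `keep`, preserve order, no allocation. */
    #include <stddef.h>

    static size_t filter_in_place(void **items, size_t len, int (*keep)(void *))
    {
        size_t n = 0;
        for (size_t i = 0; i < len; i++) {
            if (keep(items[i]))
                items[n++] = items[i]; /* compact survivors toward the front */
        }
        return n; /* caller stores this back, like rem_bindings.len = n_bnd_refyoung */
    }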

     // 3. walk roots
     pre_mark();
     visit_mark_stack(GC_MARKED_NOESC);

-    allocd_bytes_since_sweep += allocd_bytes + (int64_t)collect_interval;
+    gc_num.since_sweep += gc_num.allocd + (int64_t)gc_num.interval;

 #if defined(GC_TIME) || defined(GC_FINAL_STATS)
     uint64_t mark_pause = jl_hrtime() - t0;
 #endif
 #ifdef GC_TIME
-    FOR_EACH_HEAP () {
-        jl_printf(JL_STDOUT, "GC mark pause %.2f ms | scanned %ld kB = %ld + %ld | stack %d -> %d (wb %d) | remset %d %d\n", NS2MS(mark_pause), (scanned_bytes + perm_scanned_bytes)/1024, scanned_bytes/1024, perm_scanned_bytes/1024, saved_mark_sp, mark_sp, wb_activations, last_remset->len, remset_nptr);
+    for (int t_i = 0;t_i < jl_n_threads;t_i++) {
+        jl_tls_states_t *ptls = jl_all_task_states[t_i].ptls;
+        jl_printf(JL_STDOUT, "GC mark pause %.2f ms | scanned %ld kB = %ld + %ld | stack %d -> %d (wb %d) | remset %d %d\n", NS2MS(mark_pause), (scanned_bytes + perm_scanned_bytes)/1024, scanned_bytes/1024, perm_scanned_bytes/1024, saved_mark_sp, mark_sp, wb_activations, ptls->heap.last_remset->len, ptls->heap.remset_nptr);
     }
     saved_mark_sp = mark_sp;
 #endif
@@ -2118,7 +1681,7 @@ static void _jl_gc_collect(int full, char *stack_hi)
 #if defined(GC_TIME) || defined(GC_FINAL_STATS)
     uint64_t sweep_t0 = jl_hrtime();
 #endif
-    int64_t actual_allocd = allocd_bytes_since_sweep;
+    int64_t actual_allocd = gc_num.since_sweep;
     if (!sweeping) {
         // marking is over
 #if defined(GC_TIME) || defined(GC_FINAL_STATS)
@@ -2142,39 +1705,35 @@
 #endif
         objprofile_printall();
         objprofile_reset();
-        total_allocd_bytes += allocd_bytes_since_sweep;
+        gc_num.total_allocd += gc_num.since_sweep;
         if (prev_sweep_mask == GC_MARKED_NOESC)
             promoted_bytes += perm_scanned_bytes - last_perm_scanned_bytes;
         // 5. next collection decision
         int not_freed_enough = estimate_freed < (7*(actual_allocd/10));
         int nptr = 0;
-        FOR_EACH_HEAP ()
-            nptr += remset_nptr;
+        for (int i = 0;i < jl_n_threads;i++)
+            nptr += jl_all_task_states[i].ptls->heap.remset_nptr;
         int large_frontier = nptr*sizeof(void*) >= default_collect_interval; // many pointers in the intergen frontier => "quick" mark is not quick
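Aside (illustration only, not part of the patch): written out with hypothetical numbers, the two triggers computed above behave as follows. A sweep counts as not freeing enough when it reclaims less than 70% of what was allocated since the last sweep, and the intergenerational frontier counts as large once its pointers alone outweigh the default collection interval:

    /* Worked example of the trigger predicates; all numbers are made up. */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        int64_t actual_allocd  = 10 << 20;  /* 10 MB allocated since last sweep */
        int64_t estimate_freed = 6 << 20;   /* sweep expects to free 6 MB */
        int64_t nptr           = 1 << 20;   /* pointers in the intergen frontier */
        int64_t default_collect_interval = 5600 * 1024;

        int not_freed_enough = estimate_freed < (7 * (actual_allocd / 10));
        int large_frontier = nptr * (int64_t)sizeof(void*) >= default_collect_interval;

        printf("not_freed_enough=%d large_frontier=%d\n",
               not_freed_enough, large_frontier); /* prints 1 1 on 64-bit */
        return 0;
    }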
-        if ((full || large_frontier || ((not_freed_enough || promoted_bytes >= collect_interval) && (promoted_bytes >= default_collect_interval || prev_sweep_mask == GC_MARKED))) && n_pause > 1) {
+        if ((full || large_frontier || ((not_freed_enough || promoted_bytes >= gc_num.interval) && (promoted_bytes >= default_collect_interval || prev_sweep_mask == GC_MARKED))) && gc_num.pause > 1) {
             if (prev_sweep_mask != GC_MARKED || full) {
                 if (full) recollect = 1; // TODO enable this?
             }
             if (large_frontier)
-                collect_interval = last_long_collect_interval;
+                gc_num.interval = last_long_collect_interval;
             if (not_freed_enough || large_frontier) {
-                if (collect_interval < default_collect_interval)
-                    collect_interval = default_collect_interval;
-                else if (collect_interval <= 2*(max_collect_interval/5)) {
-                    collect_interval = 5*(collect_interval/2);
+                if (gc_num.interval < default_collect_interval)
+                    gc_num.interval = default_collect_interval;
+                else if (gc_num.interval <= 2*(max_collect_interval/5)) {
+                    gc_num.interval = 5*(gc_num.interval/2);
                 }
             }
-            last_long_collect_interval = collect_interval;
+            last_long_collect_interval = gc_num.interval;
             sweep_mask = GC_MARKED;
             promoted_bytes = 0;
         }
         else {
-            collect_interval = default_collect_interval/2;
-#ifdef GC_DEBUG_ENV
-            sweep_mask = jl_gc_debug_env.sweep_mask;
-#else
-            sweep_mask = GC_MARKED_NOESC;
-#endif
+            gc_num.interval = default_collect_interval/2;
+            sweep_mask = gc_quick_sweep_mask;
         }
         if (sweep_mask == GC_MARKED)
             perm_scanned_bytes = 0;
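Aside (illustration only, not part of the patch): the schedule above grows the collection interval by 2.5x per long collection (5*(interval/2)) until it nears the cap, and resets it to half the default when a quick sweep suffices. A standalone sketch of the long-collection branch, with hypothetical constants:

    /* Interval growth schedule sketch; constants are made up. */
    #include <stdint.h>

    #define DEFAULT_INTERVAL (5600 * 1024)
    #define MAX_INTERVAL     (1250LL * 1024 * 1024)

    static int64_t next_interval(int64_t interval)
    {
        if (interval < DEFAULT_INTERVAL)
            return DEFAULT_INTERVAL;      /* never ratchet below the default */
        if (interval <= 2 * (MAX_INTERVAL / 5))
            return 5 * (interval / 2);    /* grow by 2.5x, capped near the max */
        return interval;
    }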
@@ -2189,38 +1748,39 @@ static void _jl_gc_collect(int full, char *stack_hi)
             // sweeping is over
             // 6. if it is a quick sweep, put back the remembered objects in queued state
             // so that we don't trigger the barrier again on them.
-            FOR_EACH_HEAP () {
+            for (int t_i = 0;t_i < jl_n_threads;t_i++) {
+                jl_tls_states_t *ptls = jl_all_task_states[t_i].ptls;
                 if (sweep_mask == GC_MARKED_NOESC) {
-                    for (int i = 0; i < remset->len; i++) {
-                        gc_bits(jl_astaggedvalue(remset->items[i])) = GC_QUEUED;
+                    for (int i = 0; i < ptls->heap.remset->len; i++) {
+                        gc_bits(jl_astaggedvalue(ptls->heap.remset->items[i])) = GC_QUEUED;
                     }
-                    for (int i = 0; i < rem_bindings.len; i++) {
-                        void *ptr = rem_bindings.items[i];
+                    for (int i = 0; i < ptls->heap.rem_bindings.len; i++) {
+                        void *ptr = ptls->heap.rem_bindings.items[i];
                         gc_bits(gc_val_buf(ptr)) = GC_QUEUED;
                     }
                 }
                 else {
-                    remset->len = 0;
-                    rem_bindings.len = 0;
-                    n_full_sweep++;
+                    ptls->heap.remset->len = 0;
+                    ptls->heap.rem_bindings.len = 0;
                 }
             }
+            gc_num.full_sweep += sweep_mask != GC_MARKED_NOESC;
             sweeping = 0;
 #ifdef GC_TIME
-            SAVE2 = freed_bytes;
-            SAVE3 = allocd_bytes_since_sweep;
-            pct = actual_allocd ? (freed_bytes*100)/actual_allocd : -1;
+            SAVE2 = gc_num.freed;
+            SAVE3 = gc_num.since_sweep;
+            pct = actual_allocd ? (gc_num.freed*100)/actual_allocd : -1;
 #endif
             prev_sweep_mask = sweep_mask;

-            allocd_bytes = -(int64_t)collect_interval;
+            gc_num.allocd = -(int64_t)gc_num.interval;
             inc_count = 0;
-            live_bytes += -freed_bytes + allocd_bytes_since_sweep;
-            allocd_bytes_since_sweep = 0;
-            jl_gc_total_freed_bytes += freed_bytes;
-            freed_bytes = 0;
+            live_bytes += -gc_num.freed + gc_num.since_sweep;
+            gc_num.since_sweep = 0;
+            jl_gc_total_freed_bytes += gc_num.freed;
+            gc_num.freed = 0;
         }
 #if defined(GC_FINAL_STATS) || defined(GC_TIME)
         uint64_t sweep_pause = jl_hrtime() - sweep_t0;
@@ -2230,12 +1790,12 @@ static void _jl_gc_collect(int full, char *stack_hi)
         total_fin_time += + post_time;
 #endif
 #ifdef GC_TIME
-        jl_printf(JL_STDOUT, "GC sweep pause %.2f ms live %ld kB (freed %d kB EST %d kB [error %d] = %d%% of allocd %d kB b/r %ld/%ld) (%.2f ms in post_mark) (marked in %d inc) mask %d | next in %d kB\n", NS2MS(sweep_pause), live_bytes/1024, SAVE2/1024, estimate_freed/1024, (SAVE2 - estimate_freed), pct, SAVE3/1024, bonus/1024, SAVE/1024, NS2MS(post_time), inc_count, sweep_mask, -allocd_bytes/1024);
+        jl_printf(JL_STDOUT, "GC sweep pause %.2f ms live %ld kB (freed %d kB EST %d kB [error %d] = %d%% of allocd %d kB b/r %ld/%ld) (%.2f ms in post_mark) (marked in %d inc) mask %d | next in %d kB\n", NS2MS(sweep_pause), live_bytes/1024, SAVE2/1024, estimate_freed/1024, (SAVE2 - estimate_freed), pct, SAVE3/1024, bonus/1024, SAVE/1024, NS2MS(post_time), inc_count, sweep_mask, -gc_num.allocd/1024);
 #endif
     }
-    n_pause++;
+    gc_num.pause++;
     uint64_t pause = jl_hrtime() - t0;
-    total_gc_time += pause;
+    gc_num.total_time += pause;
 #ifdef GC_FINAL_STATS
     max_pause = max_pause < pause ? pause : max_pause;
 #endif
@@ -2247,7 +1807,7 @@ static void _jl_gc_collect(int full, char *stack_hi)
     }
 #endif
     if (recollect) {
-        n_pause--;
+        gc_num.pause--;
         _jl_gc_collect(0, stack_hi);
     }
 }
@@ -2308,8 +1868,7 @@ void *allocb(size_t sz)
         b->pooled = 0;
     }
     else {
-        FOR_CURRENT_HEAP ()
-            b = (buff_t*)pool_alloc(&pools[szclass(allocsz)]);
+        b = (buff_t*)pool_alloc(&jl_thread_heap->norm_pools[szclass(allocsz)]);
         b->header = jl_buff_tag;
         b->pooled = 1;
     }
@@ -2349,9 +1908,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_allocobj(size_t sz)
     return jl_valueof(alloc_big(allocsz));
 #endif
     if (allocsz <= GC_MAX_SZCLASS + sizeof(buff_t)) {
-        FOR_CURRENT_HEAP () {
-            return jl_valueof(pool_alloc(&pools[szclass(allocsz)]));
-        }
+        return jl_valueof(pool_alloc(&jl_thread_heap->norm_pools[szclass(allocsz)]));
     }
     return jl_valueof(alloc_big(allocsz));
 }
@@ -2363,8 +1920,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_alloc_0w(void)
 #ifdef MEMDEBUG
     tag = alloc_big(sz);
 #else
-    FOR_CURRENT_HEAP ()
-        tag = _pool_alloc(&pools[szclass(sz)], sz);
+    tag = _pool_alloc(&jl_thread_heap->norm_pools[szclass(sz)], sz);
 #endif
     return jl_valueof(tag);
 }
@@ -2376,8 +1932,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_alloc_1w(void)
 #ifdef MEMDEBUG
     tag = alloc_big(sz);
 #else
-    FOR_CURRENT_HEAP ()
-        tag = _pool_alloc(&pools[szclass(sz)], sz);
+    tag = _pool_alloc(&jl_thread_heap->norm_pools[szclass(sz)], sz);
 #endif
     return jl_valueof(tag);
 }
@@ -2389,8 +1944,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_alloc_2w(void)
 #ifdef MEMDEBUG
     tag = alloc_big(sz);
 #else
-    FOR_CURRENT_HEAP ()
-        tag = _pool_alloc(&pools[szclass(sz)], sz);
+    tag = _pool_alloc(&jl_thread_heap->norm_pools[szclass(sz)], sz);
 #endif
     return jl_valueof(tag);
 }
@@ -2402,33 +1956,33 @@ JL_DLLEXPORT jl_value_t *jl_gc_alloc_3w(void)
 #ifdef MEMDEBUG
     tag = alloc_big(sz);
 #else
-    FOR_CURRENT_HEAP ()
-        tag = _pool_alloc(&pools[szclass(sz)], sz);
+    tag = _pool_alloc(&jl_thread_heap->norm_pools[szclass(sz)], sz);
 #endif
     return jl_valueof(tag);
 }
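Aside (illustration only, not part of the patch): every pooled allocation path above now indexes the calling thread's norm_pools array by size class. A simplified, hypothetical version of the lookup (the real szclass() uses a precomputed table rather than a scan):

    /* Size-class pool lookup, simplified model with made-up classes. */
    #include <stddef.h>

    #define N_CLASSES 4
    static const size_t sizeclasses[N_CLASSES] = { 16, 32, 64, 128 };

    typedef struct { void *freelist; size_t osize; } pool_t;
    typedef struct { pool_t norm_pools[N_CLASSES]; } thread_heap_t;

    /* smallest pool whose object size fits `sz` */
    static int szclass(size_t sz)
    {
        for (int i = 0; i < N_CLASSES; i++)
            if (sz <= sizeclasses[i])
                return i;
        return -1; /* too big: would go to the big-object list instead */
    }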

 #ifdef GC_FINAL_STATS
 static double process_t0;
+size_t max_pg_count = 0;
 #include <malloc.h>
 void jl_print_gc_stats(JL_STREAM *s)
 {
-    double gct = total_gc_time/1e9;
+    double gct = gc_num.total_time/1e9;
     malloc_stats();
     double ptime = jl_clock_now()-process_t0;
     jl_printf(s, "exec time\t%.5f sec\n", ptime);
-    if (n_pause > 0) {
+    if (gc_num.pause > 0) {
         jl_printf(s, "gc time \t%.5f sec (%2.1f%%) in %d (%d full) collections\n",
-                  NS_TO_S(total_gc_time), (NS_TO_S(total_gc_time)/ptime)*100, n_pause, n_full_sweep);
+                  NS_TO_S(gc_num.total_time), (NS_TO_S(gc_num.total_time)/ptime)*100, gc_num.pause, gc_num.full_sweep);
         jl_printf(s, "gc pause \t%.2f ms avg\n\t\t%2.0f ms max\n",
-                  NS2MS(total_gc_time)/n_pause, NS2MS(max_pause));
+                  NS2MS(gc_num.total_time)/gc_num.pause, NS2MS(max_pause));
         jl_printf(s, "\t\t(%2d%% mark, %2d%% sweep, %2d%% finalizers)\n",
-                  (int)(total_mark_time * 100 / total_gc_time),
-                  (int)(total_sweep_time * 100 / total_gc_time),
-                  (int)(total_fin_time * 100 / total_gc_time));
+                  (int)(total_mark_time * 100 / gc_num.total_time),
+                  (int)(total_sweep_time * 100 / gc_num.total_time),
+                  (int)(total_fin_time * 100 / gc_num.total_time));
     }
     int i = 0;
-    while (i < REGION_COUNT && regions[i]) i++;
+    while (i < REGION_COUNT && regions[i].pages) i++;
     jl_printf(s, "max allocated regions : %d\n", i);
     struct mallinfo mi = mallinfo();
     jl_printf(s, "malloc size\t%d MB\n", mi.uordblks/1024/1024);
@@ -2439,58 +1993,43 @@ void jl_print_gc_stats(JL_STREAM *s)
 #endif

 // Per-thread initialization (when threading is fully implemented)
-jl_thread_heap_t *jl_mk_thread_heap(void)
-{
-#ifdef JULIA_ENABLE_THREADING
-    // Cache-aligned malloc
-    jl_thread_heap =
-        (jl_thread_heap_t*)jl_malloc_aligned(sizeof(jl_thread_heap_t), JL_CACHE_BYTE_ALIGNMENT);
-#endif
-    FOR_CURRENT_HEAP () {
-        const int *szc = sizeclasses;
-        pool_t *p = pools;
-        for(int i=0; i < N_POOLS; i++) {
-            assert((szc[i] < 16 && szc[i] % sizeof(void*) == 0) ||
-                   (szc[i] % 16 == 0));
-            p[i].osize = szc[i];
-            p[i].freelist = NULL;
-            p[i].newpages = NULL;
-            p[i].end_offset = GC_POOL_END_OFS(szc[i]);
-        }
-        arraylist_new(&weak_refs, 0);
-        mallocarrays = NULL;
-        mafreelist = NULL;
-        big_objects = NULL;
-        arraylist_new(&rem_bindings, 0);
-        remset = &HEAP(_remset)[0];
-        last_remset = &HEAP(_remset)[1];
-        arraylist_new(remset, 0);
-        arraylist_new(last_remset, 0);
-    }
-    return jl_thread_heap;
+void jl_mk_thread_heap(jl_thread_heap_t *heap)
+{
+    const int *szc = sizeclasses;
+    jl_gc_pool_t *p = heap->norm_pools;
+    for(int i=0; i < JL_GC_N_POOLS; i++) {
+        assert((szc[i] < 16 && szc[i] % sizeof(void*) == 0) ||
+               (szc[i] % 16 == 0));
+        p[i].osize = szc[i];
+        p[i].freelist = NULL;
+        p[i].newpages = NULL;
+        p[i].end_offset = GC_POOL_END_OFS(szc[i]);
+    }
+    arraylist_new(&heap->weak_refs, 0);
+    heap->mallocarrays = NULL;
+    heap->mafreelist = NULL;
+    heap->big_objects = NULL;
+    arraylist_new(&heap->rem_bindings, 0);
+    heap->remset = &heap->_remset[0];
+    heap->last_remset = &heap->_remset[1];
+    arraylist_new(heap->remset, 0);
+    arraylist_new(heap->last_remset, 0);
 }
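Aside (illustration only, not part of the patch): the assert in jl_mk_thread_heap encodes the pool alignment invariant: classes below 16 bytes only need pointer alignment, everything larger must be a multiple of 16. Checked in isolation:

    /* Size-class alignment invariant, standalone check. */
    #include <assert.h>
    #include <stddef.h>

    static int valid_size_class(size_t sz)
    {
        return (sz < 16 && sz % sizeof(void*) == 0) || (sz % 16 == 0);
    }

    int main(void)
    {
        assert(valid_size_class(8));   /* pointer-sized, allowed below 16 bytes */
        assert(valid_size_class(64));  /* multiple of 16 */
        assert(!valid_size_class(24)); /* neither < 16 nor a multiple of 16 */
        return 0;
    }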

 // System-wide initializations
 void jl_gc_init(void)
 {
+    jl_gc_init_page();
     gc_debug_init();

     arraylist_new(&finalizer_list, 0);
     arraylist_new(&finalizer_list_marked, 0);
     arraylist_new(&to_finalize, 0);

-    collect_interval = default_collect_interval;
+    gc_num.interval = default_collect_interval;
     last_long_collect_interval = default_collect_interval;
-    allocd_bytes = -default_collect_interval;
-
-#ifdef GC_VERIFY
-    for(int i = 0; i < 4; i++)
-        arraylist_new(&bits_save[i], 0);
-    arraylist_new(&lostval_parents, 0);
-    arraylist_new(&lostval_parents_done, 0);
-#endif
+    gc_num.allocd = -default_collect_interval;

-    objprofile_init();
 #ifdef GC_FINAL_STATS
     process_t0 = jl_clock_now();
 #endif
@@ -2507,10 +2046,10 @@ void jl_gc_init(void)

 #if defined(MEMPROFILE)
 // TODO repair this
-static size_t pool_stats(pool_t *p, size_t *pwaste, size_t *np, size_t *pnold)
+static size_t pool_stats(jl_gc_pool_t *p, size_t *pwaste, size_t *np, size_t *pnold)
 {
     gcval_t *v;
-    gcpage_t *pg = p->pages;
+    jl_gc_pagemeta_t *pg = p->pages;
     size_t osize = p->osize;
     size_t nused=0, nfree=0, npgs=0, nold = 0;

@@ -2532,7 +2071,7 @@ static size_t pool_stats(pool_t *p, size_t *pwaste, size_t *np, size_t *pnold)
             v = (gcval_t*)((char*)v + osize);
             i++;
         }
-        gcpage_t *nextpg = NULL;
+        jl_gc_pagemeta_t *nextpg = NULL;
         pg = nextpg;
     }
     *pwaste = npgs * GC_PAGE_SZ - (nused * p->osize);
@@ -2554,17 +2093,17 @@ static size_t pool_stats(pool_t *p, size_t *pwaste, size_t *np, size_t *pnold)

 static void all_pool_stats(void)
 {
-    int i;
-    size_t nb=0, w, tw=0, no=0,tp=0, nold=0,noldbytes=0, b, np, nol;
-    for(i=0; i < N_POOLS; i++) {
-        FOR_EACH_HEAP () {
-            b = pool_stats(&pools[i], &w, &np, &nol);
+    size_t nb=0, w, tw=0, no=0,tp=0, nold=0,noldbytes=0, np, nol;
+    for (int i = 0; i < JL_GC_N_POOLS; i++) {
+        for (int t_i = 0;t_i < jl_n_threads;t_i++) {
+            jl_tls_states_t *ptls = jl_all_task_states[t_i].ptls;
+            size_t b = pool_stats(&ptls->heap.norm_pools[i], &w, &np, &nol);
             nb += b;
-            no += (b/pools[i].osize);
+            no += (b / ptls->heap.norm_pools[i].osize);
             tw += w;
             tp += np;
             nold += nol;
-            noldbytes += nol*pools[i].osize;
+            noldbytes += nol * ptls->heap.norm_pools[i].osize;
         }
     }
     jl_printf(JL_STDOUT,
@@ -2574,7 +2113,7 @@ static void all_pool_stats(void)

 static void big_obj_stats(void)
 {
-    bigval_t *v = big_objects;
+    bigval_t *v = current_heap->big_objects;
     size_t nused=0, nbytes=0;
     while (v != NULL) {
         if (gc_marked(&v->_data)) {
@@ -2593,7 +2132,7 @@ static void big_obj_stats(void)
         v = v->next;
     }

-    mallocarray_t *ma = mallocarrays;
+    mallocarray_t *ma = current_heap->mallocarrays;
     while (ma != NULL) {
         if (gc_marked(jl_astaggedvalue(ma->a))) {
             nused++;
@@ -2610,7 +2149,7 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
 {
     sz += JL_SMALL_BYTE_ALIGNMENT;
     maybe_collect();
-    allocd_bytes += sz;
+    gc_num.allocd += sz;
     gc_num.malloc++;
     void *b = malloc(sz);
     if (b == NULL)
@@ -2622,7 +2161,7 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
 {
     nm += JL_SMALL_BYTE_ALIGNMENT;
     maybe_collect();
-    allocd_bytes += nm*sz;
+    gc_num.allocd += nm*sz;
     gc_num.malloc++;
     void *b = calloc(nm, sz);
     if (b == NULL)
@@ -2633,7 +2172,7 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
 JL_DLLEXPORT void jl_gc_counted_free(void *p, size_t sz)
 {
     free(p);
-    freed_bytes += sz + JL_SMALL_BYTE_ALIGNMENT;
+    gc_num.freed += sz + JL_SMALL_BYTE_ALIGNMENT;
     gc_num.freecall++;
 }
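Aside (illustration only, not part of the patch): the counted_* wrappers fold foreign malloc/free traffic into the same byte counters that drive collection; the realloc hunk just below credits or debits only the size delta. A condensed standalone sketch of the idea, with hypothetical names:

    /* Counted-allocation sketch: external allocations also push toward GC. */
    #include <stdint.h>
    #include <stdlib.h>

    static int64_t allocd, freed;

    static void *counted_malloc(size_t sz)
    {
        allocd += sz;           /* counts toward the next collection trigger */
        return malloc(sz);
    }

    static void counted_free(void *p, size_t sz)
    {
        free(p);
        freed += sz;            /* credited back at the next sweep accounting */
    }

    static void *counted_realloc(void *p, size_t old, size_t sz)
    {
        if (sz < old)
            freed += old - sz;  /* shrinking frees only the difference */
        else
            allocd += sz - old;
        return realloc(p, sz);
    }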
@@ -2643,9 +2182,9 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size
     sz += JL_SMALL_BYTE_ALIGNMENT;
     maybe_collect();
     if (sz < old)
-        freed_bytes += (old - sz);
+        gc_num.freed += (old - sz);
     else
-        allocd_bytes += (sz - old);
+        gc_num.allocd += (sz - old);
     gc_num.realloc++;
     void *b = realloc(p, sz);
     if (b == NULL)
@@ -2699,7 +2238,7 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz)
     size_t allocsz = LLT_ALIGN(sz, JL_CACHE_BYTE_ALIGNMENT);
     if (allocsz < sz) // overflow in adding offs, size was "negative"
         jl_throw(jl_memory_exception);
-    allocd_bytes += allocsz;
+    gc_num.allocd += allocsz;
     gc_num.malloc++;
     void *b = malloc_cache_align(allocsz);
     if (b == NULL)
@@ -2721,9 +2260,9 @@ JL_DLLEXPORT void *jl_gc_managed_realloc(void *d, size_t sz, size_t oldsz,
         live_bytes += allocsz - oldsz;
     }
     else if (allocsz < oldsz)
-        freed_bytes += (oldsz - allocsz);
+        gc_num.freed += (oldsz - allocsz);
     else
-        allocd_bytes += (allocsz - oldsz);
+        gc_num.allocd += (allocsz - oldsz);
     gc_num.realloc++;

     void *b;
diff --git a/src/gc.h b/src/gc.h
new file mode 100644
index 0000000000000..8a4a5a1ed4837
--- /dev/null
+++ b/src/gc.h
@@ -0,0 +1,345 @@
+// This file is a part of Julia. License is MIT: http://julialang.org/license
+
+/*
+  allocation and garbage collection
+  . non-moving, precise mark and sweep collector
+  . pool-allocates small objects, keeps big objects on a simple list
+*/
+
+#ifndef JULIA_GC_H
+#define JULIA_GC_H
+
+#include <stdlib.h>
+#include <string.h>
+#ifndef _MSC_VER
+#include <strings.h>
+#endif
+#include <inttypes.h>
+#include <assert.h>
+#include "julia.h"
+#include "julia_internal.h"
+#include "threading.h"
+#ifndef _OS_WINDOWS_
+#include <sys/mman.h>
+#if defined(_OS_DARWIN_) && !defined(MAP_ANONYMOUS)
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// manipulating mark bits
+
+#define GC_CLEAN 0 // freshly allocated
+#define GC_MARKED 1 // reachable and old
+#define GC_QUEUED 2 // if it is reachable it will be marked as old
+#define GC_MARKED_NOESC (GC_MARKED | GC_QUEUED) // reachable and young
+
+#define GC_PAGE_LG2 14 // log2(size of a page)
+#define GC_PAGE_SZ (1 << GC_PAGE_LG2) // 16k
+#define GC_PAGE_OFFSET (JL_SMALL_BYTE_ALIGNMENT - (sizeof_jl_taggedvalue_t % JL_SMALL_BYTE_ALIGNMENT))
+
+// 8G * 32768 = 2^48
+// It's really unlikely that we'll actually allocate that much though...
+#define REGION_COUNT 32768
+
+#define jl_buff_tag ((uintptr_t)0x4eade800)
+#define jl_malloc_tag ((void*)0xdeadaa01)
+#define jl_singleton_tag ((void*)0xdeadaa02)
+
+// Used by GC_DEBUG_ENV
+typedef struct {
+    uint64_t num;
+    uint64_t next;
+
+    uint64_t min;
+    uint64_t interv;
+    uint64_t max;
+    unsigned short random[3];
+} jl_alloc_num_t;
+
+typedef struct {
+    int sweep_mask;
+    int wait_for_debugger;
+    jl_alloc_num_t pool;
+    jl_alloc_num_t other;
+    jl_alloc_num_t print;
+} jl_gc_debug_env_t;
+
+// This struct must be kept in sync with the Julia type of the same name in base/util.jl
+typedef struct {
+    int64_t allocd;
+    int64_t freed;
+    uint64_t malloc;
+    uint64_t realloc;
+    uint64_t poolalloc;
+    uint64_t bigalloc;
+    uint64_t freecall;
+    uint64_t total_time;
+    uint64_t total_allocd;
+    uint64_t since_sweep;
+    size_t interval;
+    int pause;
+    int full_sweep;
+} jl_gc_num_t;
+
+// layout for small (<2k) objects
+
+typedef struct _buff_t {
+    union {
+        uintptr_t header;
+        struct _buff_t *next;
+        uintptr_t flags;
+        jl_value_t *type; // 16-bytes aligned
+        struct {
+            uintptr_t gc_bits:2;
+            uintptr_t pooled:1;
+        };
+    };
+    char data[];
+} buff_t;
+typedef buff_t gcval_t;
+
+// layout for big (>2k) objects
+
+typedef struct _bigval_t {
+    struct _bigval_t *next;
+    struct _bigval_t **prev; // pointer to the next field of the prev entry
+    union {
+        size_t sz;
+        uintptr_t age : 2;
+    };
+    #ifdef _P64 // Add padding so that char data[] below is 64-byte aligned
+        // (8 pointers of 8 bytes each) - (4 other pointers in struct)
+        void *_padding[8 - 4];
+    #else
+        // (16 pointers of 4 bytes each) - (4 other pointers in struct)
+        void *_padding[16 - 4];
+    #endif
+    //struct buff_t <>;
+    union {
+        uintptr_t header;
+        uintptr_t flags;
+        uintptr_t gc_bits:2;
+    };
+    // must be 64-byte aligned here, in 32 & 64 bit modes
+    char data[];
+} bigval_t;
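Aside (illustration only, not part of the patch): overlaying sz with a 2-bit age in bigval_t works because big-object sizes here are allocated with their low bits clear, so the age can live in those bits. Equivalent, well-defined bit arithmetic:

    /* Model of the sz/age overlay using plain masking. */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uintptr_t szfield = 4096;  /* aligned size: low two bits are zero */
        szfield |= 2;              /* stash age=2 in the low bits */
        printf("size=%lu age=%lu\n",
               (unsigned long)(szfield & ~(uintptr_t)3),
               (unsigned long)(szfield & 3)); /* size=4096 age=2 */
        return 0;
    }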
+
+// data structure for tracking malloc'd arrays.
+
+typedef struct _mallocarray_t {
+    jl_array_t *a;
+    struct _mallocarray_t *next;
+} mallocarray_t;
+
+// pool page metadata
+typedef struct {
+    struct {
+        uint16_t pool_n : 8; // index (into norm_pool) of pool that owns this page
+        uint16_t allocd : 1; // true if an allocation happened in this page since last sweep
+        uint16_t gc_bits : 2; // this is a bitwise | of all gc_bits in this page
+    };
+    uint16_t nfree; // number of free objects in this page.
+                    // invalid if pool that owns this page is allocating objects from this page.
+    uint16_t osize; // size of each object in this page
+    uint16_t fl_begin_offset; // offset of first free object in this page
+    uint16_t fl_end_offset; // offset of last free object in this page
+    uint16_t thread_n; // index (into jl_thread_heap) of heap that owns this page
+    char *data;
+    uint8_t *ages;
+} jl_gc_pagemeta_t;
+
+typedef struct {
+    char data[GC_PAGE_SZ];
+} jl_gc_page_t
+#if !defined(_COMPILER_MICROSOFT_) && !(defined(_COMPILER_MINGW_) && defined(_COMPILER_CLANG_))
+__attribute__((aligned(GC_PAGE_SZ)))
+#endif
+;
+
+typedef struct {
+    // Page layout:
+    //  Padding: GC_PAGE_OFFSET
+    //  Blocks: osize * n
+    //    Tag: sizeof_jl_taggedvalue_t
+    //    Data: <= osize - sizeof_jl_taggedvalue_t
+    jl_gc_page_t *pages; // [pg_cnt]; must be first, to preserve page alignment
+    uint32_t *allocmap; // [pg_cnt / 32]
+    jl_gc_pagemeta_t *meta; // [pg_cnt]
+    int pg_cnt;
+    // store a lower bound of the first free page in each region
+    int lb;
+    // an upper bound of the last non-free page
+    int ub;
+} region_t;
+
+extern jl_gc_num_t gc_num;
+extern region_t regions[REGION_COUNT];
+extern bigval_t *big_objects_marked;
+extern arraylist_t finalizer_list;
+extern arraylist_t finalizer_list_marked;
+extern arraylist_t to_finalize;
+
+// Counters
+// GC_FINAL_STATS only
+extern size_t max_pg_count;
+
+#define bigval_header(data) container_of((data), bigval_t, header)
+
+// round an address inside a gcpage's data to its beginning
+STATIC_INLINE char *gc_page_data(void *x)
+{
+    return (char*)(((uintptr_t)x >> GC_PAGE_LG2) << GC_PAGE_LG2);
+}
+
+STATIC_INLINE gcval_t *page_pfl_beg(jl_gc_pagemeta_t *p)
+{
+    return (gcval_t*)(p->data + p->fl_begin_offset);
+}
+
+STATIC_INLINE gcval_t *page_pfl_end(jl_gc_pagemeta_t *p)
+{
+    return (gcval_t*)(p->data + p->fl_end_offset);
+}
+
+STATIC_INLINE int page_index(region_t *region, void *data)
+{
+    return (gc_page_data(data) - region->pages->data) / GC_PAGE_SZ;
+}
+
+#define gc_bits(o) (((gcval_t*)(o))->gc_bits)
+#define gc_marked(o) (((gcval_t*)(o))->gc_bits & GC_MARKED)
+#define _gc_setmark(o, mark_mode) (((gcval_t*)(o))->gc_bits = mark_mode)
+
+NOINLINE uintptr_t gc_get_stack_ptr(void);
+
+STATIC_INLINE region_t *find_region(void *ptr, int maybe)
+{
+    // on 64bit systems we could probably use a single region and remove this loop
+    for (int i = 0; i < REGION_COUNT && regions[i].pages; i++) {
+        region_t *region = &regions[i];
+        char *begin = region->pages->data;
+        char *end = begin + region->pg_cnt * sizeof(jl_gc_page_t);
+        if ((char*)ptr >= begin && (char*)ptr <= end) {
+            return region;
+        }
+    }
+    (void)maybe;
+    assert(maybe && "find_region failed");
+    return NULL;
+}
+
+STATIC_INLINE jl_gc_pagemeta_t *page_metadata(void *data)
+{
+    region_t *r = find_region(data, 0);
+    int pg_idx = page_index(r, (char*)data - GC_PAGE_OFFSET);
+    return &r->meta[pg_idx];
+}
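Aside (illustration only, not part of the patch): gc_page_data() and page_index() are pure address arithmetic on the 2^14-byte page size, so any interior pointer can be mapped to its page without a table lookup. For example:

    /* Address-to-page arithmetic, standalone. */
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_LG2 14
    #define PAGE_SZ  (1 << PAGE_LG2)

    int main(void)
    {
        uintptr_t obj = 0x7f0000012345u;                 /* some interior pointer */
        uintptr_t page = (obj >> PAGE_LG2) << PAGE_LG2;  /* round down to page start */
        printf("page base = %#llx, offset = %llu\n",
               (unsigned long long)page, (unsigned long long)(obj - page));
        return 0;
    }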
+
+void pre_mark(void);
+void post_mark(arraylist_t *list, int dryrun);
+void gc_debug_init(void);
+
+#define jl_thread_heap (&jl_get_ptls_states()->heap)
+
+// GC pages
+
+void jl_gc_init_page(void);
+NOINLINE void *jl_gc_alloc_page(void);
+void jl_gc_free_page(void *p);
+
+// GC debug
+
+#ifdef GC_VERIFY
+extern jl_value_t *lostval;
+void gc_verify(void);
+void add_lostval_parent(jl_value_t *parent);
+#define verify_val(v) do {                                              \
+        if (lostval == (jl_value_t*)(v) && (v) != 0) {                  \
+            jl_printf(JL_STDOUT,                                        \
+                      "Found lostval %p at %s:%d oftype: ",             \
+                      (void*)(lostval), __FILE__, __LINE__);            \
+            jl_static_show(JL_STDOUT, jl_typeof(v));                    \
+            jl_printf(JL_STDOUT, "\n");                                 \
+        }                                                               \
+    } while(0);
+
+#define verify_parent(ty, obj, slot, args...) do {                      \
+        if (*(jl_value_t**)(slot) == lostval &&                         \
+            (jl_value_t*)(obj) != lostval) {                            \
+            jl_printf(JL_STDOUT, "Found parent %p %p at %s:%d\n",       \
+                      (void*)(ty), (void*)(obj), __FILE__, __LINE__);   \
+            jl_printf(JL_STDOUT, "\tloc %p : ", (void*)(slot));         \
+            jl_printf(JL_STDOUT, args);                                 \
+            jl_printf(JL_STDOUT, "\n");                                 \
+            jl_printf(JL_STDOUT, "\ttype: ");                           \
+            jl_static_show(JL_STDOUT, jl_typeof(obj));                  \
+            jl_printf(JL_STDOUT, "\n");                                 \
+            add_lostval_parent((jl_value_t*)(obj));                     \
+        }                                                               \
+    } while(0);
+
+#define verify_parent1(ty,obj,slot,arg1) verify_parent(ty,obj,slot,arg1)
+#define verify_parent2(ty,obj,slot,arg1,arg2) verify_parent(ty,obj,slot,arg1,arg2)
+extern int gc_verifying;
+#else
+#define gc_verify()
+#define verify_val(v)
+#define verify_parent1(ty,obj,slot,arg1)
+#define verify_parent2(ty,obj,slot,arg1,arg2)
+#define gc_verifying (0)
+#endif
+
+#ifdef GC_DEBUG_ENV
+JL_DLLEXPORT extern jl_gc_debug_env_t jl_gc_debug_env;
+#define gc_quick_sweep_mask jl_gc_debug_env.sweep_mask
+int gc_debug_check_other(void);
+int gc_debug_check_pool(void);
+void gc_debug_print(void);
+void gc_scrub(char *stack_hi);
+#else
+#define gc_quick_sweep_mask GC_MARKED_NOESC
+static inline int gc_debug_check_other(void)
+{
+    return 0;
+}
+static inline int gc_debug_check_pool(void)
+{
+    return 0;
+}
+static inline void gc_debug_print(void)
+{
+}
+static inline void gc_scrub(char *stack_hi)
+{
+    (void)stack_hi;
+}
+#endif
+
+#ifdef OBJPROFILE
+void objprofile_count(void *ty, int old, int sz);
+void objprofile_printall(void);
+void objprofile_reset(void);
+#else
+static inline void objprofile_count(void *ty, int old, int sz)
+{
+}
+
+static inline void objprofile_printall(void)
+{
+}
+
+static inline void objprofile_reset(void)
+{
+}
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
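Aside (design note, illustration only): gc.h compiles the optional debug facilities away by pairing each #ifdef'd declaration with a static inline no-op (or an empty macro), so call sites like objprofile_count() need no conditional compilation of their own. The generic pattern, with a hypothetical feature flag:

    /* No-op stub pattern for optional features. */
    #ifdef MY_FEATURE
    void my_feature_hook(int arg);    /* real definition lives in another TU */
    #else
    static inline void my_feature_hook(int arg)
    {
        (void)arg;                    /* no-op; calls vanish after inlining */
    }
    #endif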
diff --git a/src/julia_internal.h b/src/julia_internal.h
index fa1451928cbac..45762c2874eac 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -259,6 +259,7 @@ void jl_init_restored_modules(jl_array_t *init_order);
 void jl_init_signal_async(void);
 void jl_init_debuginfo(void);
 void jl_init_runtime_ccall(void);
+void jl_mk_thread_heap(jl_thread_heap_t *heap);

 void _julia_init(JL_IMAGE_SEARCH rel);
 #ifdef COPY_STACKS
diff --git a/src/julia_threads.h b/src/julia_threads.h
index 0179f4a898c8c..d44a623b49ac1 100644
--- a/src/julia_threads.h
+++ b/src/julia_threads.h
@@ -28,6 +28,42 @@
 #endif
 #include <signal.h>

+typedef struct {
+    struct _buff_t *freelist; // root of list of free objects
+    struct _buff_t *newpages; // root of list of chunks of free objects
+    uint16_t end_offset; // stored to avoid computing it at each allocation
+    uint16_t osize; // size of objects in this pool
+    uint16_t nfree; // number of free objects in page pointed into by free_list
+} jl_gc_pool_t;
+
+typedef struct {
+    // variable for tracking weak references
+    arraylist_t weak_refs;
+
+    // variables for tracking malloc'd arrays
+    struct _mallocarray_t *mallocarrays;
+    struct _mallocarray_t *mafreelist;
+
+    // variables for tracking big objects
+    struct _bigval_t *big_objects;
+
+    // variables for tracking "remembered set"
+    arraylist_t rem_bindings;
+    arraylist_t _remset[2]; // contains jl_value_t*
+    // lower bound of the number of pointers inside remembered values
+    int remset_nptr;
+    arraylist_t *remset;
+    arraylist_t *last_remset;
+
+    // variables for allocating objects from pools
+#ifdef _P64
+#define JL_GC_N_POOLS 41
+#else
+#define JL_GC_N_POOLS 43
+#endif
+    jl_gc_pool_t norm_pools[JL_GC_N_POOLS];
+} jl_thread_heap_t;
+
 // This includes all the thread local states we care about for a thread.
 #define JL_MAX_BT_SIZE 80000
 typedef struct _jl_tls_states_t {
@@ -45,7 +81,6 @@ typedef struct _jl_tls_states_t {
     volatile int8_t in_finalizer;
     int8_t disable_gc;
     volatile sig_atomic_t defer_signal;
-    struct _jl_thread_heap_t *heap;
     struct _jl_module_t *current_module;
     struct _jl_task_t *volatile current_task;
     struct _jl_task_t *root_task;
@@ -66,6 +101,7 @@ typedef struct _jl_tls_states_t {
     // this is limited to the few places we do synchronous IO
     // we can make this more general (similar to defer_signal) if necessary
     volatile sig_atomic_t io_wait;
+    jl_thread_heap_t heap;
 } jl_tls_states_t;

 #ifdef __MIC__
diff --git a/src/threading.c b/src/threading.c
index bfc47c8e3676f..13a612ecdeb9f 100644
--- a/src/threading.c
+++ b/src/threading.c
@@ -95,9 +95,6 @@ jl_thread_task_state_t *jl_all_task_states;
 // type of the thread id.
 JL_DLLEXPORT int16_t jl_threadid(void) { return ti_tid; }

-struct _jl_thread_heap_t *jl_mk_thread_heap(void);
-// must be called by each thread at startup
-
 static void ti_initthread(int16_t tid)
 {
     jl_tls_states_t *ptls = jl_get_ptls_states();
@@ -122,11 +119,7 @@ static void ti_initthread(int16_t tid)
         abort();
     }
     ptls->bt_data = (uintptr_t*)bt_data;
-#ifdef JULIA_ENABLE_THREADING
-    jl_all_heaps[tid] = jl_mk_thread_heap();
-#else
-    jl_mk_thread_heap();
-#endif
+    jl_mk_thread_heap(&ptls->heap);
     jl_all_task_states[tid].ptls = ptls;
     jl_all_task_states[tid].signal_stack = jl_install_thread_signal_handler();

@@ -167,9 +160,6 @@ jl_mutex_t typecache_lock;

 #ifdef JULIA_ENABLE_THREADING

-// thread heap
-struct _jl_thread_heap_t **jl_all_heaps;
-
 // only one thread group for now
 ti_threadgroup_t *tgworld;

@@ -290,8 +280,6 @@ void jl_init_threading(void)
     if (jl_n_threads > max_threads)
         jl_n_threads = max_threads;

-    // set up space for per-thread heaps
-    jl_all_heaps = (struct _jl_thread_heap_t **)malloc(jl_n_threads * sizeof(void*));
     jl_all_task_states = (jl_thread_task_state_t *)malloc(jl_n_threads * sizeof(jl_thread_task_state_t));

 #if PROFILE_JL_THREADING
diff --git a/src/threading.h b/src/threading.h
index ac28ba35bd67b..7eab52cc8e9bd 100644
--- a/src/threading.h
+++ b/src/threading.h
@@ -18,11 +18,6 @@ extern "C" {
 extern jl_thread_task_state_t *jl_all_task_states;
 extern JL_DLLEXPORT int jl_n_threads;  // # threads we're actually using

-#ifdef JULIA_ENABLE_THREADING
-// GC
-extern struct _jl_thread_heap_t **jl_all_heaps;
-#endif
-
 // thread state
 enum {
     TI_THREAD_INIT,