Skip to content

Commit

Permalink
add a compile-time option to enable 4k page sizes (JuliaLang#52229) (#…
Browse files Browse the repository at this point in the history
…111)

We're suffering from heavy fragmentation in some of our workloads.

Add a build-time option to enable 4k pages (instead of 16k) in the GC,
since that improves memory utilization considerably for us.

The drawback is that this may increase the number of `madvise` system
calls in the sweeping phase by a factor of 4, but concurrent page
sweeping should help with some of that.
  • Loading branch information
d-netto authored and Drvi committed Dec 7, 2023
1 parent d6b858b commit e1971d4
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 18 deletions.
24 changes: 23 additions & 1 deletion src/gc.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,12 @@
extern "C" {
#endif

// GC page geometry.
// GC_PAGE_LG2 is log2 of the page size in bytes; defining GC_SMALL_PAGE
// selects 4k pages instead of the default 16k pages.
#ifdef GC_SMALL_PAGE
#define GC_PAGE_LG2 12 // log2(size of a page): 4k
#else
#define GC_PAGE_LG2 14 // log2(size of a page): 16k
#endif
// Page size in bytes. Defined exactly once, after GC_PAGE_LG2 has been
// selected, so that both configurations share a single definition.
#define GC_PAGE_SZ (1 << GC_PAGE_LG2)
// Number of bytes from sizeof(jl_taggedvalue_t) up to the next multiple of
// JL_HEAP_ALIGNMENT (a full JL_HEAP_ALIGNMENT when the size is already a multiple).
// NOTE(review): presumably the offset of the first object within a page -- confirm at the use sites.
#define GC_PAGE_OFFSET (JL_HEAP_ALIGNMENT - (sizeof(jl_taggedvalue_t) % JL_HEAP_ALIGNMENT))

#define jl_malloc_tag ((void*)0xdeadaa01)
Expand Down Expand Up @@ -241,6 +245,23 @@ typedef struct {
_Atomic(size_t) n_pages_allocd;
} gc_fragmentation_stat_t;

#ifdef GC_SMALL_PAGE
// Address decomposition used when GC_SMALL_PAGE is defined (GC_PAGE_LG2 == 12).
// The low 12 bits of an address are the offset inside a page; the remaining
// bits are split into three indices, one per *_INDEX macro below.
// Each *_PG_COUNT is the number of distinct values of the matching index
// (mask + 1). NOTE(review): REGION2_PG_COUNT appears to pair with
// REGION_INDEX (the highest bits) -- confirm against the page-table structs.
#ifdef _P64
// 64-bit pointers: 12 (offset) + 16 + 18 + 18 = 64 bits
#define REGION0_PG_COUNT (1 << 16)
#define REGION1_PG_COUNT (1 << 18)
#define REGION2_PG_COUNT (1 << 18)
#define REGION0_INDEX(p) (((uintptr_t)(p) >> 12) & 0xFFFF) // shift by GC_PAGE_LG2
#define REGION1_INDEX(p) (((uintptr_t)(p) >> 28) & 0x3FFFF) // bits 28..45 (28 = 12 + 16)
#define REGION_INDEX(p) (((uintptr_t)(p) >> 46) & 0x3FFFF) // bits 46..63 (46 = 28 + 18)
#else
// 32-bit pointers: 12 (offset) + 10 + 10 = 32 bits; a single top-level entry
#define REGION0_PG_COUNT (1 << 10)
#define REGION1_PG_COUNT (1 << 10)
#define REGION2_PG_COUNT (1 << 0)
#define REGION0_INDEX(p) (((uintptr_t)(p) >> 12) & 0x3FF) // shift by GC_PAGE_LG2
#define REGION1_INDEX(p) (((uintptr_t)(p) >> 22) & 0x3FF) // bits 22..31 (22 = 12 + 10)
#define REGION_INDEX(p) (0) // only one top-level entry, so the index is always 0
#endif
#else
#ifdef _P64
#define REGION0_PG_COUNT (1 << 16)
#define REGION1_PG_COUNT (1 << 16)
Expand All @@ -256,6 +277,7 @@ typedef struct {
#define REGION1_INDEX(p) (((uintptr_t)(p) >> 22) & 0x3FF)
#define REGION_INDEX(p) (0)
#endif
#endif

// define the representation of the levels of the page-table (0 to 2)
typedef struct {
Expand Down
45 changes: 37 additions & 8 deletions src/julia_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -359,24 +359,48 @@ static const int jl_gc_sizeclasses[] = {
144, 160, 176, 192, 208, 224, 240, 256,

// the following tables are computed for maximum packing efficiency via the formula:
// pg = 2^14
// pg = GC_SMALL_PAGE ? 2^12 : 2^14
// sz = (div.(pg-8, rng).÷16)*16; hcat(sz, (pg-8).÷sz, pg .- (pg-8).÷sz.*sz)'

#ifdef GC_SMALL_PAGE
// rng = 15:-1:2 (14 pools)
272, 288, 304, 336, 368, 400, 448, 496, 576, 672, 816, 1008, 1360, 2032
// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, /pool
// 16, 64, 144, 64, 48, 96, 64, 128, 64, 64, 16, 64, 16, 32, bytes lost
#else
// rng = 60:-4:32 (8 pools)
272, 288, 304, 336, 368, 400, 448, 496,
// 60, 56, 53, 48, 44, 40, 36, 33, /pool
// 64, 256, 272, 256, 192, 384, 256, 16, bytes lost
// 60, 56, 53, 48, 44, 40, 36, 33, /pool
// 64, 256, 272, 256, 192, 384, 256, 16, bytes lost

// rng = 30:-2:16 (8 pools)
544, 576, 624, 672, 736, 816, 896, 1008,
// 30, 28, 26, 24, 22, 20, 18, 16, /pool
// 64, 256, 160, 256, 192, 64, 256, 256, bytes lost
// 30, 28, 26, 24, 22, 20, 18, 16, /pool
// 64, 256, 160, 256, 192, 64, 256, 256, bytes lost

// rng = 15:-1:8 (8 pools)
1088, 1168, 1248, 1360, 1488, 1632, 1808, 2032
// 15, 14, 13, 12, 11, 10, 9, 8, /pool
// 64, 32, 160, 64, 16, 64, 112, 128, bytes lost
// 15, 14, 13, 12, 11, 10, 9, 8, /pool
// 64, 32, 160, 64, 16, 64, 112, 128, bytes lost
#endif
};
// JL_GC_N_POOLS: number of entries in `jl_gc_sizeclasses`.
// The count depends on the pointer size / MAX_ALIGN and on whether
// GC_SMALL_PAGE is defined; in every configuration the small-page value is
// 10 less than the default one.
#ifdef _P64
#  ifdef GC_SMALL_PAGE
#    define JL_GC_N_POOLS 39
#  else
#    define JL_GC_N_POOLS 49
#  endif
#elif MAX_ALIGN == 8
#  ifdef GC_SMALL_PAGE
#    define JL_GC_N_POOLS 40
#  else
#    define JL_GC_N_POOLS 50
#  endif
#else
#  ifdef GC_SMALL_PAGE
#    define JL_GC_N_POOLS 41
#  else
#    define JL_GC_N_POOLS 51
#  endif
#endif
// compile-time check that the table length and the constant stay in sync
static_assert(sizeof(jl_gc_sizeclasses) == JL_GC_N_POOLS * sizeof(jl_gc_sizeclasses[0]), "");

STATIC_INLINE int jl_gc_alignment(size_t sz)
Expand All @@ -403,7 +427,12 @@ JL_DLLEXPORT int jl_alignment(size_t sz);

// the following table is computed as:
// [searchsortedfirst(jl_gc_sizeclasses, i) - 1 for i = 0:16:jl_gc_sizeclasses[end]]
//
// It has one entry per 16-byte step from 0 to 2032 (128 entries) in both
// configurations. The first 32 entries (sizes 0..496) do not depend on
// GC_SMALL_PAGE and are shared; only the remaining 96 entries differ.
// The table is defined exactly once: a second, unconditional initializer for
// the same array would be a redefinition error.
static const uint8_t szclass_table[] = {
    // entries 0..31: common to both page sizes
    0, 1, 3, 5, 7, 9, 11, 13, 15, 17, 18, 19, 20, 21, 22, 23,
    24, 25, 26, 27, 28, 28, 29, 29, 30, 30, 31, 31, 31, 32, 32, 32,
#ifdef GC_SMALL_PAGE
    // entries 32..127 for 4k pages (size classes 33..38)
    33, 33, 33, 33, 33,
    34, 34, 34, 34, 34, 34,
    35, 35, 35, 35, 35, 35, 35, 35, 35,
    36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
    37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
    37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
    38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
    38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
    38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38
#else
    // entries 32..127 for 16k pages (size classes 33..48)
    33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 37,
    38, 38, 38, 38, 38, 39, 39, 39, 39, 39,
    40, 40, 40, 40, 40, 40, 40,
    41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 43, 43, 43, 43, 43,
    44, 44, 44, 44, 44, 44, 44,
    45, 45, 45, 45, 45, 45, 45, 45,
    46, 46, 46, 46, 46, 46, 46, 46, 46,
    47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47,
    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
#endif
};
// uint8_t entries, so sizeof is the entry count
static_assert(sizeof(szclass_table) == 128, "");

STATIC_INLINE uint8_t JL_CONST_FUNC jl_gc_szclass(unsigned sz)
Expand Down
12 changes: 3 additions & 9 deletions src/julia_threads.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
#ifndef JL_THREADS_H
#define JL_THREADS_H

#include "work-stealing-queue.h"
#include "julia_atomics.h"
#include "work-stealing-queue.h"
#ifndef _OS_WINDOWS_
#include "pthread.h"
#endif
Expand Down Expand Up @@ -160,14 +160,8 @@ typedef struct {
arraylist_t *last_remset;

// variables for allocating objects from pools
#ifdef _P64
# define JL_GC_N_POOLS 49
#elif MAX_ALIGN == 8
# define JL_GC_N_POOLS 50
#else
# define JL_GC_N_POOLS 51
#endif
jl_gc_pool_t norm_pools[JL_GC_N_POOLS];
#define JL_GC_N_MAX_POOLS 51 // conservative. must be kept in sync with `src/julia_internal.h`
jl_gc_pool_t norm_pools[JL_GC_N_MAX_POOLS];

#define JL_N_STACK_POOLS 16
small_arraylist_t free_stacks[JL_N_STACK_POOLS];
Expand Down
5 changes: 5 additions & 0 deletions src/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,11 @@
// Automatic Instrumenting Profiler
//#define ENABLE_TIMINGS

// pool allocator configuration options

// GC_SMALL_PAGE makes the GC allocate objects in 4k pages instead of the default 16k pages
// #define GC_SMALL_PAGE


// method dispatch profiling --------------------------------------------------

Expand Down

0 comments on commit e1971d4

Please sign in to comment.