Skip to content

Commit

Permalink
expose metric to report reasons why full GCs were triggered (JuliaLan…
Browse files Browse the repository at this point in the history
…g#55826)

Additional GC observability tool.

This will help us to diagnose why some of our servers are triggering so
many full GCs in certain circumstances.
  • Loading branch information
d-netto committed Sep 27, 2024
1 parent 8e9f7de commit 29d8d93
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 1 deletion.
33 changes: 33 additions & 0 deletions base/timing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,39 @@ function gc_page_utilization_data()
return Base.unsafe_wrap(Array, page_utilization_raw, JL_GC_N_MAX_POOLS, own=false)
end

# must be kept in sync with `src/gc-stock.h``
const FULL_SWEEP_REASONS = [:FULL_SWEEP_REASON_SWEEP_ALWAYS_FULL, :FULL_SWEEP_REASON_FORCED_FULL_SWEEP,
:FULL_SWEEP_REASON_ALLOCATION_INTERVAL_ABOVE_MAXMEM, :FULL_SWEEP_REASON_LIVE_BYTES_ABOVE_MAX_TOTAL_MEMORY,
:FULL_SWEEP_REASON_LARGE_INTERGEN_FRONTIER]

"""
Base.full_sweep_reasons()
Return a dictionary of the number of times each full sweep reason has occurred.
The reasons are:
- `:FULL_SWEEP_REASON_SWEEP_ALWAYS_FULL`: Full sweep was caused due to `always_full` being set in the GC debug environment
- `:FULL_SWEEP_REASON_FORCED_FULL_SWEEP`: Full sweep was forced by `GC.gc(true)`
- `:FULL_SWEEP_REASON_ALLOCATION_INTERVAL_ABOVE_MAXMEM`: Full sweep was forced by the allocation interval being above the total
memory in the machine (as returned by LibUV) divided by the number of mutator threads
- `:FULL_SWEEP_REASON_LIVE_BYTES_ABOVE_MAX_TOTAL_MEMORY`: Full sweep was caused due to live bytes being above the
soft heap limit size (which is either automatically computed at initialization based on the total memory provided by LibUV,
or set by the user via `--heap-size-hint`)
- `:FULL_SWEEP_REASON_LARGE_INTERGEN_FRONTIER`: Full sweep was forced by the intergenerational frontier being too large
(i.e. too many pointers in the remembered set)
Note that the set of reasons is not guaranteed to be stable across minor versions of Julia.
"""
function full_sweep_reasons()
reason = cglobal(:jl_full_sweep_reasons, UInt64)
reasons_as_array = Base.unsafe_wrap(Vector{UInt64}, reason, length(FULL_SWEEP_REASONS), own=false)
d = Dict{Symbol, Int64}()
for (i, r) in enumerate(FULL_SWEEP_REASONS)
d[r] = reasons_as_array[i]
end
return d
end

"""
Base.jit_total_bytes()
Expand Down
7 changes: 7 additions & 0 deletions src/gc.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ uv_sem_t gc_sweep_assists_needed;
uv_mutex_t gc_queue_observer_lock;
// Tag for sentinel nodes in bigval list
uintptr_t gc_bigval_sentinel_tag;
// Table recording number of full GCs due to each reason
JL_DLLEXPORT uint64_t jl_full_sweep_reasons[FULL_SWEEP_NUM_REASONS];

// Linked list of callback functions

Expand Down Expand Up @@ -3551,6 +3553,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
if (large_frontier) {
sweep_full = 1;
gc_num.interval = last_long_collect_interval;
gc_count_full_sweep_reason(FULL_SWEEP_REASON_LARGE_INTERGEN_FRONTIER);
}
if (not_freed_enough || large_frontier) {
gc_num.interval = gc_num.interval * 2;
Expand All @@ -3566,6 +3569,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
if (gc_num.interval > maxmem) {
sweep_full = 1;
gc_num.interval = maxmem;
gc_count_full_sweep_reason(FULL_SWEEP_REASON_ALLOCATION_INTERVAL_ABOVE_MAXMEM);
}
}

Expand All @@ -3574,13 +3578,16 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
if (live_bytes > max_total_memory) {
under_memory_pressure = 1;
sweep_full = 1;
gc_count_full_sweep_reason(FULL_SWEEP_REASON_LIVE_BYTES_ABOVE_MAX_TOTAL_MEMORY);
}
if (gc_sweep_always_full) {
sweep_full = 1;
gc_count_full_sweep_reason(FULL_SWEEP_REASON_SWEEP_ALWAYS_FULL);
}
if (collection == JL_GC_FULL && !prev_sweep_full) {
sweep_full = 1;
recollect = 1;
gc_count_full_sweep_reason(FULL_SWEEP_REASON_FORCED_FULL_SWEEP);
}
if (sweep_full) {
// these are the difference between the number of gc-perm bytes scanned
Expand Down
16 changes: 16 additions & 0 deletions src/gc.h
Original file line number Diff line number Diff line change
Expand Up @@ -560,6 +560,22 @@ FORCE_INLINE void gc_big_object_link(bigval_t *sentinel_node, bigval_t *node) JL
sentinel_node->next = node;
}

// Must be kept in sync with `base/timing.jl`
#define FULL_SWEEP_REASON_SWEEP_ALWAYS_FULL (0)
#define FULL_SWEEP_REASON_FORCED_FULL_SWEEP (1)
#define FULL_SWEEP_REASON_ALLOCATION_INTERVAL_ABOVE_MAXMEM (2)
#define FULL_SWEEP_REASON_LIVE_BYTES_ABOVE_MAX_TOTAL_MEMORY (3)
#define FULL_SWEEP_REASON_LARGE_INTERGEN_FRONTIER (4)
#define FULL_SWEEP_NUM_REASONS (5)

extern JL_DLLEXPORT uint64_t jl_full_sweep_reasons[FULL_SWEEP_NUM_REASONS];
STATIC_INLINE void gc_count_full_sweep_reason(int reason) JL_NOTSAFEPOINT
{
assert(reason >= 0 && reason < FULL_SWEEP_NUM_REASONS);
jl_full_sweep_reasons[reason]++;
}

extern uv_mutex_t gc_perm_lock;
extern uv_mutex_t gc_threads_lock;
extern uv_cond_t gc_threads_cond;
extern uv_sem_t gc_sweep_assists_needed;
Expand Down
2 changes: 1 addition & 1 deletion src/threading.c
Original file line number Diff line number Diff line change
Expand Up @@ -1035,7 +1035,7 @@ JL_DLLEXPORT int jl_heartbeat_resume(void)
if (uv_sem_trywait(&heartbeat_off_sem) != 0) {
return -1;
}

// reset state as we've been paused
n_hbs_missed = 0;
n_hbs_recvd = 0;
Expand Down
11 changes: 11 additions & 0 deletions test/gc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,13 @@ function run_pg_size_test()
@test page_size == (1 << 12) || page_size == (1 << 14)
end

function full_sweep_reasons_test()
GC.gc()
reasons = Base.full_sweep_reasons()
@test reasons[:FULL_SWEEP_REASON_FORCED_FULL_SWEEP] >= 1
@test keys(reasons) == Set(Base.FULL_SWEEP_REASONS)
end

# !!! note:
# Since we run our tests on 32bit OS as well we confine ourselves
# to parameters that allocate about 512MB of objects. Max RSS is lower
Expand All @@ -43,3 +50,7 @@ end
run_nonzero_page_utilization_test()
run_pg_size_test()
end

@testset "Full GC reasons" begin
full_sweep_reasons_test()
end

0 comments on commit 29d8d93

Please sign in to comment.