diff --git a/benchmarks/profiler_memory_sample_serialize.rb b/benchmarks/profiler_memory_sample_serialize.rb
index 6c849ee9934..a3d3006dc6a 100644
--- a/benchmarks/profiler_memory_sample_serialize.rb
+++ b/benchmarks/profiler_memory_sample_serialize.rb
@@ -48,6 +48,7 @@ def setup
     @heap_size_enabled = ENV['HEAP_SIZE'] == 'true'
     @heap_sample_every = (ENV['HEAP_SAMPLE_EVERY'] || '1').to_i
     @retain_every = (ENV['RETAIN_EVERY'] || '10').to_i
+    @skip_end_gc = ENV['SKIP_END_GC'] == 'true'
     @recorder_factory = proc {
       Datadog::Profiling::StackRecorder.new(
         cpu_time_enabled: false,
@@ -68,7 +69,7 @@ def run_benchmark
       suite: report_to_dogstatsd_if_enabled_via_environment_variable(benchmark_name: 'profiler_memory_sample_serialize')
     )

-    x.report("sample+serialize #{ENV['CONFIG']} retain_every=#{@retain_every} heap_samples=#{@heap_samples_enabled} heap_size=#{@heap_size_enabled} heap_sample_every=#{@heap_sample_every}") do
+    x.report("sample+serialize #{ENV['CONFIG']} retain_every=#{@retain_every} heap_samples=#{@heap_samples_enabled} heap_size=#{@heap_size_enabled} heap_sample_every=#{@heap_sample_every} skip_end_gc=#{@skip_end_gc}") do
       recorder = @recorder_factory.call
       samples_per_second = 100
       simulate_seconds = 60
@@ -79,7 +80,7 @@ def run_benchmark
         retained_objs << obj if (i % @retain_every).zero?
       end

-      GC.start
+      GC.start unless @skip_end_gc
       recorder.serialize
     end

diff --git a/ext/datadog_profiling_native_extension/heap_recorder.c b/ext/datadog_profiling_native_extension/heap_recorder.c
index 26b6ebe8873..a4e476c4121 100644
--- a/ext/datadog_profiling_native_extension/heap_recorder.c
+++ b/ext/datadog_profiling_native_extension/heap_recorder.c
@@ -10,6 +10,13 @@
   #define CAN_APPLY_GC_FORCE_RECYCLE_BUG_WORKAROUND
 #endif

+// Minimum age (in GC generations) of heap objects we want to include in heap
+// recorder iterations. Objects with age 0 have yet to undergo a GC and, thus,
+// may just be noise/trash at the instant of iteration; they are usually not
+// relevant for heap profiles, as the great majority should be trivially
+// reclaimed during the next GC.
+#define ITERATION_MIN_AGE 1
+
 // A compact representation of a stacktrace frame for a heap allocation.
 typedef struct {
   char *name;
@@ -137,6 +144,11 @@ struct heap_recorder {
   // mutation of the data so iteration can occur without acquiring a lock.
   // NOTE: Contrary to object_records, this table has no ownership of its data.
   st_table *object_records_snapshot;
+  // The GC gen/epoch/count in which we prepared the current iteration.
+  //
+  // This enables us to calculate the age of iterated objects in the above snapshot by
+  // comparing it against an object's alloc_gen.
+  size_t iteration_gen;

   // Data for a heap recording that was started but not yet ended
   recording active_recording;
@@ -353,6 +365,8 @@ void heap_recorder_prepare_iteration(heap_recorder *heap_recorder) {
     return;
   }

+  heap_recorder->iteration_gen = rb_gc_count();
+
   if (heap_recorder->object_records_snapshot != NULL) {
     // we could trivially handle this but we raise to highlight and catch unexpected usages.
     rb_raise(rb_eRuntimeError, "New heap recorder iteration prepared without the previous one having been finished.");
@@ -459,6 +473,13 @@ static int st_object_record_entry_free(DDTRACE_UNUSED st_data_t key, st_data_t v
   return ST_DELETE;
 }

+// Check to see if an object should not be included in a heap recorder iteration.
+// This centralizes the checking logic to ensure it's equally applied between
+// preparation and iteration codepaths.
+static inline bool should_exclude_from_iteration(object_record *obj_record) {
+  return obj_record->object_data.gen_age < ITERATION_MIN_AGE;
+}
+
 static int st_object_record_update(st_data_t key, st_data_t value, st_data_t extra_arg) {
   long obj_id = (long) key;
   object_record *record = (object_record*) value;
@@ -466,6 +487,19 @@ static int st_object_record_update(st_data_t key, st_data_t value, st_data_t ext

   VALUE ref;

+  size_t iteration_gen = recorder->iteration_gen;
+  size_t alloc_gen = record->object_data.alloc_gen;
+  // Guard against potential overflows given unsigned types here.
+  record->object_data.gen_age = alloc_gen < iteration_gen ? iteration_gen - alloc_gen : 0;
+
+  if (should_exclude_from_iteration(record)) {
+    // If an object won't be included in the current iteration, there's
+    // no point checking for liveness or updating its size, so exit early.
+    // NOTE: This means that there should be an equivalent check during actual
+    // iteration; otherwise we'd iterate over/expose stale object data.
+    return ST_CONTINUE;
+  }
+
   if (!ruby_ref_from_id(LONG2NUM(obj_id), &ref)) {
     // Id no longer associated with a valid ref. Need to delete this object record!
     on_committed_object_record_cleanup(recorder, record);
@@ -525,8 +559,16 @@ static int st_object_records_iterate(DDTRACE_UNUSED st_data_t key, st_data_t val
   const heap_stack *stack = record->heap_record->stack;
   iteration_context *context = (iteration_context*) extra;

-  ddog_prof_Location *locations = context->heap_recorder->reusable_locations;
+  const heap_recorder *recorder = context->heap_recorder;
+
+  if (should_exclude_from_iteration(record)) {
+    // Skip objects that should not be included in the iteration.
+    // NOTE: This matches the short-circuiting condition in st_object_record_update
+    // and prevents iteration over stale objects.
+    return ST_CONTINUE;
+  }

+  ddog_prof_Location *locations = recorder->reusable_locations;
   for (uint16_t i = 0; i < stack->frames_len; i++) {
     const heap_frame *frame = &stack->frames[i];
     ddog_prof_Location *location = &locations[i];
@@ -725,9 +767,9 @@ void object_record_free(object_record *record) {

 VALUE object_record_inspect(object_record *record) {
   heap_frame top_frame = record->heap_record->stack->frames[0];
-  VALUE inspect = rb_sprintf("obj_id=%ld weight=%d size=%zu location=%s:%d alloc_gen=%zu ",
+  VALUE inspect = rb_sprintf("obj_id=%ld weight=%d size=%zu location=%s:%d alloc_gen=%zu gen_age=%zu ",
       record->obj_id, record->object_data.weight, record->object_data.size, top_frame.filename,
-      (int) top_frame.line, record->object_data.alloc_gen);
+      (int) top_frame.line, record->object_data.alloc_gen, record->object_data.gen_age);

   const char *class = record->object_data.class;
   if (class != NULL) {
diff --git a/ext/datadog_profiling_native_extension/heap_recorder.h b/ext/datadog_profiling_native_extension/heap_recorder.h
index 6e2289aff1c..1eaaf559baa 100644
--- a/ext/datadog_profiling_native_extension/heap_recorder.h
+++ b/ext/datadog_profiling_native_extension/heap_recorder.h
@@ -27,7 +27,9 @@ typedef struct live_object_data {
   // could be seen as being representative of 50 objects.
   unsigned int weight;

-  // Size of this object on last flush/update.
+  // Size of this object in memory.
+  // NOTE: This only gets updated during heap_recorder_prepare_iteration and only
+  // for those objects that meet the minimum iteration age requirements.
   size_t size;

   // The class of the object that we're tracking.
@@ -39,6 +41,10 @@ typedef struct live_object_data {
   // This enables us to calculate the age of this object in terms of GC executions.
   size_t alloc_gen;

+  // The age of this object in terms of GC generations.
+  // NOTE: This only gets updated during heap_recorder_prepare_iteration.
+  size_t gen_age;
+
   // Whether this object was previously seen as being frozen. If this is the case,
   // we'll skip any further size updates since frozen objects are supposed to be
   // immutable.
diff --git a/ext/datadog_profiling_native_extension/stack_recorder.c b/ext/datadog_profiling_native_extension/stack_recorder.c
index cb84217f022..ea4bb9bf79c 100644
--- a/ext/datadog_profiling_native_extension/stack_recorder.c
+++ b/ext/datadog_profiling_native_extension/stack_recorder.c
@@ -196,7 +196,6 @@ struct call_serialize_without_gvl_arguments {
   // Set by caller
   struct stack_recorder_state *state;
   ddog_Timespec finish_timestamp;
-  size_t gc_count_before_serialize;

   // Set by callee
   ddog_prof_Profile *profile;
@@ -489,7 +488,6 @@ static VALUE _native_serialize(DDTRACE_UNUSED VALUE _self, VALUE recorder_instan
   struct call_serialize_without_gvl_arguments args = {
     .state = state,
     .finish_timestamp = finish_timestamp,
-    .gc_count_before_serialize = rb_gc_count(),
     .serialize_ran = false
   };

@@ -613,8 +611,6 @@ typedef struct heap_recorder_iteration_context {

   bool error;
   char error_msg[MAX_LEN_HEAP_ITERATION_ERROR_MSG];
-
-  size_t profile_gen;
 } heap_recorder_iteration_context;

 static bool add_heap_sample_to_active_profile_without_gvl(heap_recorder_iteration_data iteration_data, void *extra_arg) {
@@ -643,7 +639,7 @@ static bool add_heap_sample_to_active_profile_without_gvl(heap_recorder_iteratio
   }
   labels[label_offset++] = (ddog_prof_Label) {
     .key = DDOG_CHARSLICE_C("gc gen age"),
-    .num = context->profile_gen - object_data->alloc_gen,
+    .num = object_data->gen_age,
   };

   ddog_prof_Profile_Result result = ddog_prof_Profile_add(
@@ -670,13 +666,12 @@ static bool add_heap_sample_to_active_profile_without_gvl(heap_recorder_iteratio
   return true;
 }

-static void build_heap_profile_without_gvl(struct stack_recorder_state *state, ddog_prof_Profile *profile, size_t gc_count_before_serialize) {
+static void build_heap_profile_without_gvl(struct stack_recorder_state *state, ddog_prof_Profile *profile) {
   heap_recorder_iteration_context iteration_context = {
     .state = state,
     .profile = profile,
     .error = false,
     .error_msg = {0},
-    .profile_gen = gc_count_before_serialize,
   };
   bool iterated = heap_recorder_for_each_live_object(state->heap_recorder, add_heap_sample_to_active_profile_without_gvl, (void*) &iteration_context);
   // We wait until we're out of the iteration to grab the gvl and raise. This is important because during
@@ -698,7 +693,7 @@ static void *call_serialize_without_gvl(void *call_args) {

   // Now that we have the inactive profile with all but heap samples, lets fill it with heap data
   // without needing to race with the active sampler
-  build_heap_profile_without_gvl(args->state, args->profile, args->gc_count_before_serialize);
+  build_heap_profile_without_gvl(args->state, args->profile);

   // Note: The profile gets reset by the serialize call
   args->result = ddog_prof_Profile_serialize(args->profile, &args->finish_timestamp, NULL /* duration_nanos is optional */, NULL /* start_time is optional */);
diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
index 4bc75a724e2..eda3fe84f5b 100644
--- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
+++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb
@@ -637,6 +637,10 @@
       test_num_allocated_object.times { |i| live_objects[i] = CpuAndWallTimeWorkerSpec::TestStruct.new }
       allocation_line = __LINE__ - 1

+      # Force a GC to happen here to ensure all the live_objects have age > 0.
+      # Otherwise they wouldn't show up in the serialized pprof below.
+      GC.start
+
       cpu_and_wall_time_worker.stop

       test_struct_heap_sample = lambda { |sample|
diff --git a/spec/datadog/profiling/stack_recorder_spec.rb b/spec/datadog/profiling/stack_recorder_spec.rb
index 5b25c0fb8e4..bbff9b59fb3 100644
--- a/spec/datadog/profiling/stack_recorder_spec.rb
+++ b/spec/datadog/profiling/stack_recorder_spec.rb
@@ -510,6 +510,48 @@ def sample_allocation(obj)
       expect(summed_values).to eq(expected_summed_values)
     end

+    it 'does not include samples with age = 0' do
+      test_num_allocated_objects = 123
+      test_num_age_bigger_0 = 57
+      live_objects = Array.new(test_num_allocated_objects)
+
+      allocator_proc = proc { |i|
+        live_objects[i] = "this is string number #{i}"
+        sample_allocation(live_objects[i])
+      }
+
+      sample_line = __LINE__ - 3
+
+      # First allocate a batch of objects that will end up with age > 0. We
+      # expect to see these in the profile at the end.
+      test_num_age_bigger_0.times(&allocator_proc)
+      # Force the above allocations to have GC age > 0
+      GC.start
+
+      begin
+        # Need to disable GC during this entire stretch to ensure rb_gc_count is
+        # the same between sample_allocation and pprof serialization.
+        GC.disable
+
+        # Allocate another set of objects that will necessarily have age = 0 since
+        # we disabled GC immediately before and will only enable it at the test's end.
+        (test_num_age_bigger_0...test_num_allocated_objects).each(&allocator_proc)
+
+        # Grab all exported heap samples and sum their values
+        sum_exported_heap_samples = heap_samples
+          .select { |s| s.has_location?(path: __FILE__, line: sample_line) }
+          .map { |s| s.values[:'heap-live-samples'] }
+          .reduce(:+)
+
+        # Multiply the expectation by sample_rate to be able to compare with weighted samples.
+        # We expect the total exported sum to match the weighted samples with age > 0.
+        expect(sum_exported_heap_samples).to be test_num_age_bigger_0 * sample_rate
+      ensure
+        # Whatever happens, make sure we re-enable GC
+        GC.enable
+      end
+    end
+
     it "aren't lost when they happen concurrently with a long serialization" do
       described_class::Testing._native_start_fake_slow_heap_serialization(stack_recorder)

@@ -525,6 +567,8 @@ def sample_allocation(obj)

       described_class::Testing._native_end_fake_slow_heap_serialization(stack_recorder)

+      GC.start # Force a GC so the live_objects above have age > 0 and show up in heap samples
+
       relevant_sample = heap_samples.find { |s| s.has_location?(path: __FILE__, line: sample_line) }
       expect(relevant_sample).not_to be nil
       expect(relevant_sample.values[:'heap-live-samples']).to eq test_num_allocated_object * sample_rate
@@ -642,6 +686,8 @@ def create_obj_in_recycled_slot(should_sample_original: false)
       sample_allocation(recycled_obj)
       sample_line = __LINE__ - 1

+      GC.start # Ensure the recycled sample has age > 0 so it shows up in the serialized profile
+
       recycled_sample = heap_samples.find { |s| s.has_location?(path: __FILE__, line: sample_line) }
       expect(recycled_sample).not_to be nil
     end
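The age-gating behavior this patch implements in C can be observed from plain Ruby, which is why the specs above need the extra GC.start calls. The following is an illustrative sketch, not part of the patch: it assumes GC.count as the Ruby-level counterpart of the rb_gc_count() call the heap recorder uses for alloc_gen and iteration_gen, with ITERATION_MIN_AGE being the constant (1) defined in heap_recorder.c.

    # Illustrative sketch (not part of the patch): how gen_age behaves.
    # GC.count plays the role of rb_gc_count(); the heap recorder computes
    # gen_age = iteration_gen - alloc_gen and skips objects whose gen_age
    # is below ITERATION_MIN_AGE.

    GC.start               # settle into a known GC generation
    alloc_gen = GC.count   # what the recorder would store as alloc_gen at sampling time
    obj = Object.new       # freshly allocated object

    # Before any further GC, iteration_gen == alloc_gen, so gen_age is 0 and the
    # object would be excluded from the serialized heap profile.
    puts GC.count - alloc_gen # => 0 (assuming no GC ran in between)

    GC.start               # one GC generation passes and obj survives it
    # Now gen_age >= 1, so obj meets ITERATION_MIN_AGE; this is why the specs
    # above call GC.start before serializing.
    puts GC.count - alloc_gen # => 1 or more

Capturing iteration_gen once in heap_recorder_prepare_iteration, rather than calling rb_gc_count() per object, also keeps ages consistent across the whole iteration even if a GC runs while the profile is being serialized without the GVL.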