Skip to content

Commit

Permalink
Merge pull request #2783 from DataDog/ivoanjo/fix-disabling-dynamic-s…
Browse files Browse the repository at this point in the history
…ampling-rate
  • Loading branch information
marcotc authored Apr 14, 2023
2 parents d2c13d8 + cdf76c4 commit 6f31af6
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,19 @@ struct cpu_and_wall_time_worker_state {
unsigned int signal_handler_wrong_thread;
// How many times we actually sampled (except GC samples)
unsigned int sampled;
// How many times we skipped a sample because of the dynamic sampling rate mechanism
unsigned int skipped_sample_because_of_dynamic_sampling_rate;

// Stats for the results of calling rb_postponed_job_register_one
// The same function was already waiting to be executed
unsigned int postponed_job_skipped_already_existed;
// The function was added to the queue successfully
unsigned int postponed_job_success;
// The queue was full
unsigned int postponed_job_full;
// The function returned an unknown result code
unsigned int postponed_job_unknown_result;

// Min/max/total wall-time spent sampling (except GC samples)
uint64_t sampling_time_ns_min;
uint64_t sampling_time_ns_max;
Expand Down Expand Up @@ -438,8 +451,20 @@ static void handle_sampling_signal(DDTRACE_UNUSED int _signal, DDTRACE_UNUSED si

// Note: If we ever want to get rid of rb_postponed_job_register_one, remember not to clobber Ruby exceptions, as
// this function does this helpful job for us now -- https://github.com/ruby/ruby/commit/a98e343d39c4d7bf1e2190b076720f32d9f298b3.
/*int result =*/ rb_postponed_job_register_one(0, sample_from_postponed_job, NULL);
// TODO: Do something with result (potentially update tracking counters?)
int result = rb_postponed_job_register_one(0, sample_from_postponed_job, NULL);

// Officially, the result of rb_postponed_job_register_one is documented as being opaque, but in practice it does not
// seem to have changed between Ruby 2.3 and 3.2, and so we track it as a debugging mechanism
switch (result) {
case 0:
state->stats.postponed_job_full++; break;
case 1:
state->stats.postponed_job_success++; break;
case 2:
state->stats.postponed_job_skipped_already_existed++; break;
default:
state->stats.postponed_job_unknown_result++;
}
}

// The actual sampling trigger loop always runs **without** the global vm lock.
Expand All @@ -451,9 +476,6 @@ static void *run_sampling_trigger_loop(void *state_ptr) {
while (atomic_load(&state->should_run)) {
state->stats.trigger_sample_attempts++;

// TODO: This is still a placeholder for a more complex mechanism. In particular:
// * We want to do more than having a fixed sampling rate

current_gvl_owner owner = gvl_owner();
if (owner.valid) {
// Note that reading the GVL owner and sending them a signal is a race -- the Ruby VM keeps on executing while
Expand Down Expand Up @@ -519,8 +541,8 @@ static VALUE rescued_sample_from_postponed_job(VALUE self_instance) {

long wall_time_ns_before_sample = monotonic_wall_time_now_ns(RAISE_ON_FAILURE);

if (!dynamic_sampling_rate_should_sample(&state->dynamic_sampling_rate, wall_time_ns_before_sample)) {
// TODO: Add a counter for this
if (state->dynamic_sampling_rate_enabled && !dynamic_sampling_rate_should_sample(&state->dynamic_sampling_rate, wall_time_ns_before_sample)) {
state->stats.skipped_sample_because_of_dynamic_sampling_rate++;
return Qnil;
}

Expand Down Expand Up @@ -777,6 +799,11 @@ static VALUE _native_stats(DDTRACE_UNUSED VALUE self, VALUE instance) {
ID2SYM(rb_intern("signal_handler_enqueued_sample")), /* => */ UINT2NUM(state->stats.signal_handler_enqueued_sample),
ID2SYM(rb_intern("signal_handler_wrong_thread")), /* => */ UINT2NUM(state->stats.signal_handler_wrong_thread),
ID2SYM(rb_intern("sampled")), /* => */ UINT2NUM(state->stats.sampled),
ID2SYM(rb_intern("skipped_sample_because_of_dynamic_sampling_rate")), /* => */ UINT2NUM(state->stats.skipped_sample_because_of_dynamic_sampling_rate),
ID2SYM(rb_intern("postponed_job_skipped_already_existed")), /* => */ UINT2NUM(state->stats.postponed_job_skipped_already_existed),
ID2SYM(rb_intern("postponed_job_success")), /* => */ UINT2NUM(state->stats.postponed_job_success),
ID2SYM(rb_intern("postponed_job_full")), /* => */ UINT2NUM(state->stats.postponed_job_full),
ID2SYM(rb_intern("postponed_job_unknown_result")), /* => */ UINT2NUM(state->stats.postponed_job_unknown_result),
ID2SYM(rb_intern("sampling_time_ns_min")), /* => */ pretty_sampling_time_ns_min,
ID2SYM(rb_intern("sampling_time_ns_max")), /* => */ pretty_sampling_time_ns_max,
ID2SYM(rb_intern("sampling_time_ns_total")), /* => */ pretty_sampling_time_ns_total,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,9 @@
# Sanity checking

# We're currently targeting 100 samples per second, so 5 in 100ms is a conservative approximation that hopefully
# will not cause flakiness
# will not cause flakiness.
# If this turns out to be flaky due to the dynamic sampling rate mechanism, it can be disabled like we do for
# the test below.
expect(sample_count).to be >= 5, "sample_count: #{sample_count}, stats: #{stats}"
expect(trigger_sample_attempts).to be >= sample_count
end
Expand All @@ -341,8 +343,6 @@
before { expect(Datadog.logger).to receive(:warn).with(/dynamic sampling rate disabled/) }

it 'is able to sample even when all threads are sleeping' do
skip('Test is flaky -- @ivoanjo is investigating')

start
wait_until_running

Expand Down Expand Up @@ -559,6 +559,11 @@
signal_handler_enqueued_sample: 0,
signal_handler_wrong_thread: 0,
sampled: 0,
skipped_sample_because_of_dynamic_sampling_rate: 0,
postponed_job_skipped_already_existed: 0,
postponed_job_success: 0,
postponed_job_full: 0,
postponed_job_unknown_result: 0,
sampling_time_ns_min: nil,
sampling_time_ns_max: nil,
sampling_time_ns_total: nil,
Expand Down

0 comments on commit 6f31af6

Please sign in to comment.