Skip to content

Commit

Permalink
Merge pull request #3310 from zendesk/ktsanaktsidis/overhead_percenta…
Browse files Browse the repository at this point in the history
…ge_dynamic

Allow the dynamic sampling rate overhead target to be set
  • Loading branch information
ivoanjo authored Dec 11, 2023
2 parents 22c0370 + 43ef2a9 commit 0fa4a67
Show file tree
Hide file tree
Showing 16 changed files with 247 additions and 42 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ static VALUE _native_initialize(
VALUE allocation_counting_enabled,
VALUE no_signals_workaround_enabled,
VALUE dynamic_sampling_rate_enabled,
VALUE dynamic_sampling_rate_overhead_target_percentage,
VALUE allocation_sample_every
);
static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr);
Expand Down Expand Up @@ -226,7 +227,7 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
// https://bugs.ruby-lang.org/issues/18007 for a discussion around this.
rb_define_alloc_func(collectors_cpu_and_wall_time_worker_class, _native_new);

rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 8);
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 9);
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_sampling_loop", _native_sampling_loop, 1);
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stop", _native_stop, 2);
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_reset_after_fork", _native_reset_after_fork, 1);
Expand Down Expand Up @@ -295,13 +296,15 @@ static VALUE _native_initialize(
VALUE allocation_counting_enabled,
VALUE no_signals_workaround_enabled,
VALUE dynamic_sampling_rate_enabled,
VALUE dynamic_sampling_rate_overhead_target_percentage,
VALUE allocation_sample_every
) {
ENFORCE_BOOLEAN(gc_profiling_enabled);
ENFORCE_BOOLEAN(allocation_counting_enabled);
ENFORCE_BOOLEAN(no_signals_workaround_enabled);
ENFORCE_BOOLEAN(dynamic_sampling_rate_enabled);
ENFORCE_TYPE(allocation_sample_every, T_FIXNUM);
ENFORCE_TYPE(dynamic_sampling_rate_overhead_target_percentage, T_FLOAT);

struct cpu_and_wall_time_worker_state *state;
TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
Expand All @@ -310,6 +313,7 @@ static VALUE _native_initialize(
state->allocation_counting_enabled = (allocation_counting_enabled == Qtrue);
state->no_signals_workaround_enabled = (no_signals_workaround_enabled == Qtrue);
state->dynamic_sampling_rate_enabled = (dynamic_sampling_rate_enabled == Qtrue);
dynamic_sampling_rate_set_overhead_target_percentage(&state->dynamic_sampling_rate, NUM2DBL(dynamic_sampling_rate_overhead_target_percentage));
state->allocation_sample_every = NUM2INT(allocation_sample_every);

if (state->allocation_sample_every < 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
//
// Instead of sampling at a fixed sample rate, the actual sampling rate should be decided by also observing the impact
// that running the profiler is having. This protects against issues such as the profiler being deployed in very busy
//machines or containers with unrealistic CPU restrictions.
// machines or containers with unrealistic CPU restrictions.
//
// ### Implementation
//
Expand All @@ -35,20 +35,25 @@
// sample. If it's not, it will skip sampling.
//
// Finally, as an additional optimization, there's a `dynamic_sampling_rate_get_sleep()` which, given the current
// wall-time, will return the time remaining (*there's an exception, check below) until the next sample.
// wall-time, will return the time remaining (*there's an exception, see that function for details) until the next sample.
//
// ---

// This is the wall-time overhead we're targeting. E.g. we target to spend no more than 2%, or 1.2 seconds per minute,
// taking profiling samples.
#define WALL_TIME_OVERHEAD_TARGET_PERCENTAGE 2.0 // %
// taking profiling samples by default.
#define DEFAULT_WALL_TIME_OVERHEAD_TARGET_PERCENTAGE 2.0 // %
// See `dynamic_sampling_rate_get_sleep()` for details
#define MAX_SLEEP_TIME_NS MILLIS_AS_NS(100)
// See `dynamic_sampling_rate_after_sample()` for details
#define MAX_TIME_UNTIL_NEXT_SAMPLE_NS SECONDS_AS_NS(10)

// Puts a `dynamic_sampling_rate_state` into its starting configuration: the overhead target is set to
// `DEFAULT_WALL_TIME_OVERHEAD_TARGET_PERCENTAGE` and the next-sample threshold is initialized to zero.
//
// Callers that want a different overhead target can call `dynamic_sampling_rate_set_overhead_target_percentage()`
// afterwards.
void dynamic_sampling_rate_init(dynamic_sampling_rate_state *state) {
  // The two fields are independent of each other, so the order of these statements does not matter.
  dynamic_sampling_rate_set_overhead_target_percentage(state, DEFAULT_WALL_TIME_OVERHEAD_TARGET_PERCENTAGE);
  atomic_init(&state->next_sample_after_monotonic_wall_time_ns, 0);
}

// Stores the wall-time overhead target (a percentage, e.g. 2.0 means 2%) in `state`. This is the value that
// `dynamic_sampling_rate_after_sample()` reads as its overhead target.
//
// No validation is done here: the value is stored exactly as given.
void dynamic_sampling_rate_set_overhead_target_percentage(dynamic_sampling_rate_state *state, double overhead_target_percentage) {
state->overhead_target_percentage = overhead_target_percentage;
}

void dynamic_sampling_rate_reset(dynamic_sampling_rate_state *state) {
Expand Down Expand Up @@ -76,7 +81,7 @@ bool dynamic_sampling_rate_should_sample(dynamic_sampling_rate_state *state, lon
}

void dynamic_sampling_rate_after_sample(dynamic_sampling_rate_state *state, long wall_time_ns_after_sample, uint64_t sampling_time_ns) {
double overhead_target = (double) WALL_TIME_OVERHEAD_TARGET_PERCENTAGE;
double overhead_target = state->overhead_target_percentage;

// The idea here is that we're targeting a maximum % of wall-time spent sampling.
// So for instance, if sampling_time_ns is 2% of the time we spend working, how much is the 98% we should spend
Expand All @@ -93,48 +98,51 @@ void dynamic_sampling_rate_after_sample(dynamic_sampling_rate_state *state, long
// ---
// Below here is boilerplate to expose the above code to Ruby so that we can test it with RSpec as usual.

VALUE _native_get_sleep(DDTRACE_UNUSED VALUE self, VALUE simulated_next_sample_after_monotonic_wall_time_ns, VALUE current_monotonic_wall_time_ns);
VALUE _native_should_sample(DDTRACE_UNUSED VALUE self, VALUE simulated_next_sample_after_monotonic_wall_time_ns, VALUE wall_time_ns_before_sample);
VALUE _native_after_sample(DDTRACE_UNUSED VALUE self, VALUE wall_time_ns_after_sample, VALUE sampling_time_ns);
VALUE _native_get_sleep(DDTRACE_UNUSED VALUE self, VALUE overhead_target_percentage, VALUE simulated_next_sample_after_monotonic_wall_time_ns, VALUE current_monotonic_wall_time_ns);
VALUE _native_should_sample(DDTRACE_UNUSED VALUE self, VALUE overhead_target_percentage, VALUE simulated_next_sample_after_monotonic_wall_time_ns, VALUE wall_time_ns_before_sample);
VALUE _native_after_sample(DDTRACE_UNUSED VALUE self, VALUE overhead_target_percentage, VALUE wall_time_ns_after_sample, VALUE sampling_time_ns);

void collectors_dynamic_sampling_rate_init(VALUE profiling_module) {
VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors");
VALUE dynamic_sampling_rate_module = rb_define_module_under(collectors_module, "DynamicSamplingRate");
VALUE testing_module = rb_define_module_under(dynamic_sampling_rate_module, "Testing");

rb_define_singleton_method(testing_module, "_native_get_sleep", _native_get_sleep, 2);
rb_define_singleton_method(testing_module, "_native_should_sample", _native_should_sample, 2);
rb_define_singleton_method(testing_module, "_native_after_sample", _native_after_sample, 2);
rb_define_singleton_method(testing_module, "_native_get_sleep", _native_get_sleep, 3);
rb_define_singleton_method(testing_module, "_native_should_sample", _native_should_sample, 3);
rb_define_singleton_method(testing_module, "_native_after_sample", _native_after_sample, 3);
}

VALUE _native_get_sleep(DDTRACE_UNUSED VALUE self, VALUE simulated_next_sample_after_monotonic_wall_time_ns, VALUE current_monotonic_wall_time_ns) {
// Testing-only entry point: builds a throwaway state with the requested overhead target and a simulated
// next-sample threshold, then returns whatever `dynamic_sampling_rate_get_sleep()` reports for the given
// current wall-time, converted to a Ruby integer.
VALUE _native_get_sleep(DDTRACE_UNUSED VALUE self, VALUE overhead_target_percentage, VALUE simulated_next_sample_after_monotonic_wall_time_ns, VALUE current_monotonic_wall_time_ns) {
  ENFORCE_TYPE(simulated_next_sample_after_monotonic_wall_time_ns, T_FIXNUM);
  ENFORCE_TYPE(current_monotonic_wall_time_ns, T_FIXNUM);

  // Convert the Ruby arguments up front so the calls below read as plain C.
  double target_percentage = NUM2DBL(overhead_target_percentage);
  long next_sample_after_ns = NUM2LONG(simulated_next_sample_after_monotonic_wall_time_ns);
  long now_ns = NUM2LONG(current_monotonic_wall_time_ns);

  dynamic_sampling_rate_state simulated_state;
  dynamic_sampling_rate_init(&simulated_state);
  dynamic_sampling_rate_set_overhead_target_percentage(&simulated_state, target_percentage);
  atomic_store(&simulated_state.next_sample_after_monotonic_wall_time_ns, next_sample_after_ns);

  uint64_t sleep_time = dynamic_sampling_rate_get_sleep(&simulated_state, now_ns);
  return ULL2NUM(sleep_time);
}

VALUE _native_should_sample(DDTRACE_UNUSED VALUE self, VALUE simulated_next_sample_after_monotonic_wall_time_ns, VALUE wall_time_ns_before_sample) {
// Testing-only entry point: builds a throwaway state with the requested overhead target and a simulated
// next-sample threshold, then returns (as a Ruby boolean) the answer of `dynamic_sampling_rate_should_sample()`
// for the given wall-time.
VALUE _native_should_sample(DDTRACE_UNUSED VALUE self, VALUE overhead_target_percentage, VALUE simulated_next_sample_after_monotonic_wall_time_ns, VALUE wall_time_ns_before_sample) {
  ENFORCE_TYPE(simulated_next_sample_after_monotonic_wall_time_ns, T_FIXNUM);
  ENFORCE_TYPE(wall_time_ns_before_sample, T_FIXNUM);

  // Convert the Ruby arguments up front so the calls below read as plain C.
  double target_percentage = NUM2DBL(overhead_target_percentage);
  long next_sample_after_ns = NUM2LONG(simulated_next_sample_after_monotonic_wall_time_ns);
  long before_sample_ns = NUM2LONG(wall_time_ns_before_sample);

  dynamic_sampling_rate_state simulated_state;
  dynamic_sampling_rate_init(&simulated_state);
  dynamic_sampling_rate_set_overhead_target_percentage(&simulated_state, target_percentage);
  atomic_store(&simulated_state.next_sample_after_monotonic_wall_time_ns, next_sample_after_ns);

  if (dynamic_sampling_rate_should_sample(&simulated_state, before_sample_ns)) {
    return Qtrue;
  }

  return Qfalse;
}

VALUE _native_after_sample(DDTRACE_UNUSED VALUE self, VALUE wall_time_ns_after_sample, VALUE sampling_time_ns) {
VALUE _native_after_sample(DDTRACE_UNUSED VALUE self, VALUE overhead_target_percentage, VALUE wall_time_ns_after_sample, VALUE sampling_time_ns) {
ENFORCE_TYPE(wall_time_ns_after_sample, T_FIXNUM);
ENFORCE_TYPE(sampling_time_ns, T_FIXNUM);

dynamic_sampling_rate_state state;
dynamic_sampling_rate_init(&state);
dynamic_sampling_rate_set_overhead_target_percentage(&state, NUM2DBL(overhead_target_percentage));

dynamic_sampling_rate_after_sample(&state, NUM2LONG(wall_time_ns_after_sample), NUM2ULL(sampling_time_ns));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,14 @@
#include <stdbool.h>

// State used by the `dynamic_sampling_rate_*` functions declared in this header.
typedef struct {
// This is the wall-time overhead we're targeting. E.g. by default, we target to spend no more than 2%, or 1.2 seconds
// per minute, taking profiling samples.
double overhead_target_percentage;
// Accessed with atomic operations. NOTE(review): judging by the name, this is the monotonic wall-time (in
// nanoseconds) after which the next sample should be taken -- confirm against the implementation.
atomic_long next_sample_after_monotonic_wall_time_ns;
} dynamic_sampling_rate_state;

void dynamic_sampling_rate_init(dynamic_sampling_rate_state *state);
void dynamic_sampling_rate_set_overhead_target_percentage(dynamic_sampling_rate_state *state, double overhead_target_percentage);
void dynamic_sampling_rate_reset(dynamic_sampling_rate_state *state);
uint64_t dynamic_sampling_rate_get_sleep(dynamic_sampling_rate_state *state, long current_monotonic_wall_time_ns);
bool dynamic_sampling_rate_should_sample(dynamic_sampling_rate_state *state, long wall_time_ns_before_sample);
Expand Down
28 changes: 28 additions & 0 deletions lib/datadog/core/configuration/settings.rb
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,34 @@ def initialize(*_)
end
end
end

# Configures how much wall-time overhead the profiler targets. The profiler will dynamically adjust the
# interval between samples it takes so as to try to spend no more than this percentage of wall-clock time
# profiling. For example, with the default value of 2%, the profiler will try to cause no more than
# 1.2 seconds per minute of overhead. Decreasing this value will reduce the accuracy of the data collected.
# Increasing it will increase the impact on the application.
#
# We do not recommend tweaking this value.
#
# This value should be a percentage i.e. a number between 0 and 100, not 0 and 1. Note that only values
# greater than 0 and up to 20 are accepted when the profiler component is built; any other value is
# logged as an error and replaced with the default.
#
# @default `DD_PROFILING_OVERHEAD_TARGET_PERCENTAGE` as a float, otherwise 2.0
option :overhead_target_percentage do |o|
o.type :float
o.env 'DD_PROFILING_OVERHEAD_TARGET_PERCENTAGE'
o.default 2.0
end

# Controls how often the profiler reports data, in seconds. Cannot be lower than 60 seconds: lower values
# are raised to 60 when the profiler component is built.
#
# We do not recommend tweaking this value.
#
# @default `DD_PROFILING_UPLOAD_PERIOD` environment variable, otherwise 60
option :upload_period_seconds do |o|
o.type :int
o.env 'DD_PROFILING_UPLOAD_PERIOD'
o.default 60
end
end

# @public_api
Expand Down
2 changes: 2 additions & 0 deletions lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def initialize(
allocation_counting_enabled:,
no_signals_workaround_enabled:,
thread_context_collector:,
dynamic_sampling_rate_overhead_target_percentage:,
idle_sampling_helper: IdleSamplingHelper.new,
# **NOTE**: This should only be used for testing; disabling the dynamic sampling rate will increase the
# profiler overhead!
Expand Down Expand Up @@ -45,6 +46,7 @@ def initialize(
allocation_counting_enabled,
no_signals_workaround_enabled,
dynamic_sampling_rate_enabled,
dynamic_sampling_rate_overhead_target_percentage,
allocation_sample_every,
)
@worker_thread = nil
Expand Down
20 changes: 18 additions & 2 deletions lib/datadog/profiling/component.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ module Component
# Passing in a `nil` tracer is supported and will disable the following profiling features:
# * Code Hotspots panel in the trace viewer, as well as scoping a profile down to a span
# * Endpoint aggregation in the profiler UX, including normalization (resource per endpoint call)
def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
def self.build_profiler_component(settings:, agent_settings:, optional_tracer:) # rubocop:disable Metrics/MethodLength
require_relative '../profiling/diagnostics/environment_logger'

Profiling::Diagnostics::EnvironmentLogger.collect_and_log!
Expand Down Expand Up @@ -41,6 +41,8 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)

no_signals_workaround_enabled = no_signals_workaround_enabled?(settings)
timeline_enabled = settings.profiling.advanced.experimental_timeline_enabled
overhead_target_percentage = valid_overhead_target(settings.profiling.advanced.overhead_target_percentage)
upload_period_seconds = [60, settings.profiling.advanced.upload_period_seconds].max

recorder = Datadog::Profiling::StackRecorder.new(
cpu_time_enabled: RUBY_PLATFORM.include?('linux'), # Only supported on Linux currently
Expand All @@ -58,6 +60,7 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
allocation_counting_enabled: settings.profiling.advanced.allocation_counting_enabled,
no_signals_workaround_enabled: no_signals_workaround_enabled,
thread_context_collector: thread_context_collector,
dynamic_sampling_rate_overhead_target_percentage: overhead_target_percentage,
allocation_sample_every: 0,
)

Expand All @@ -68,7 +71,7 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)

exporter = build_profiler_exporter(settings, recorder, internal_metadata: internal_metadata)
transport = build_profiler_transport(settings, agent_settings)
scheduler = Profiling::Scheduler.new(exporter: exporter, transport: transport)
scheduler = Profiling::Scheduler.new(exporter: exporter, transport: transport, interval: upload_period_seconds)

Profiling::Profiler.new(worker: worker, scheduler: scheduler)
end
Expand Down Expand Up @@ -245,6 +248,19 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
true
end
end

# Returns the given overhead target percentage when it is greater than 0 and at most 20. Any other value is
# reported through the logger as an error and replaced with 2.0.
private_class_method def self.valid_overhead_target(overhead_target_percentage)
  within_accepted_range = overhead_target_percentage > 0 && overhead_target_percentage <= 20
  return overhead_target_percentage if within_accepted_range

  Datadog.logger.error(
    'Ignoring invalid value for profiling overhead_target_percentage setting: ' \
    "#{overhead_target_percentage.inspect}. Falling back to default value."
  )

  2.0
end
end
end
end
10 changes: 4 additions & 6 deletions lib/datadog/profiling/scheduler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,11 @@

module Datadog
module Profiling
# Periodically (every DEFAULT_INTERVAL_SECONDS) takes a profile from the `Exporter` and reports it using the
# Periodically (every interval, 60 seconds by default) takes a profile from the `Exporter` and reports it using the
# configured transport. Runs on its own background thread.
class Scheduler < Core::Worker
include Core::Workers::Polling

DEFAULT_INTERVAL_SECONDS = 60
MINIMUM_INTERVAL_SECONDS = 0

# We sleep for at most this duration seconds before reporting data to avoid multi-process applications all
Expand All @@ -28,8 +27,7 @@ class Scheduler < Core::Worker
def initialize(
exporter:,
transport:,
fork_policy: Core::Workers::Async::Thread::FORK_POLICY_RESTART, # Restart in forks by default
interval: DEFAULT_INTERVAL_SECONDS,
interval:, fork_policy: Core::Workers::Async::Thread::FORK_POLICY_RESTART, # Restart in forks by default, # seconds
enabled: true
)
@exporter = exporter
Expand Down Expand Up @@ -115,8 +113,8 @@ def flush_events
#
# During PR review (https://github.com/DataDog/dd-trace-rb/pull/1807) we discussed the possible alternative of
# just sleeping before starting the scheduler loop. We ended up not going with that option to avoid the first
# profile containing up to DEFAULT_INTERVAL_SECONDS + DEFAULT_FLUSH_JITTER_MAXIMUM_SECONDS instead of the
# usual DEFAULT_INTERVAL_SECONDS size.
# profile containing up to interval + DEFAULT_FLUSH_JITTER_MAXIMUM_SECONDS instead of the
# usual interval seconds.
if run_loop?
jitter_seconds = rand * DEFAULT_FLUSH_JITTER_MAXIMUM_SECONDS # floating point number between (0.0...maximum)
sleep(jitter_seconds)
Expand Down
2 changes: 2 additions & 0 deletions sig/datadog/profiling/collectors/cpu_and_wall_time_worker.rbs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ module Datadog
allocation_counting_enabled: bool,
no_signals_workaround_enabled: bool,
thread_context_collector: Datadog::Profiling::Collectors::ThreadContext,
dynamic_sampling_rate_overhead_target_percentage: Float,
?idle_sampling_helper: Datadog::Profiling::Collectors::IdleSamplingHelper,
?dynamic_sampling_rate_enabled: bool,
?allocation_sample_every: Integer,
Expand All @@ -25,6 +26,7 @@ module Datadog
bool allocation_counting_enabled,
bool no_signals_workaround_enabled,
bool dynamic_sampling_rate_enabled,
Float dynamic_sampling_rate_overhead_target_percentage,
::Integer allocation_sample_every,
) -> true

Expand Down
2 changes: 2 additions & 0 deletions sig/datadog/profiling/component.rbs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ module Datadog

def self.incompatible_libmysqlclient_version?: (untyped settings) -> bool
def self.incompatible_passenger_version?: () -> bool
def self.flush_interval: (untyped settings) -> ::Numeric
def self.valid_overhead_target: (::Float overhead_target_percentage) -> ::Float
end
end
end
2 changes: 2 additions & 0 deletions sig/datadog/profiling/ext.rbs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ module Datadog
ENV_MAX_FRAMES: "DD_PROFILING_MAX_FRAMES"
ENV_AGENTLESS: "DD_PROFILING_AGENTLESS"
ENV_ENDPOINT_COLLECTION_ENABLED: "DD_PROFILING_ENDPOINT_COLLECTION_ENABLED"
ENV_DYNAMIC_SAMPLING_RATE_OVERHEAD_TARGET_PERCENTAGE: "DD_PROFILING_DYNAMIC_SAMPLING_RATE_OVERHEAD_TARGET_PERCENTAGE"
DEFAULT_DYNAMIC_SAMPLING_RATE_OVERHEAD_TARGET_PERCENTAGE: Float

module Transport
module HTTP
Expand Down
2 changes: 1 addition & 1 deletion sig/datadog/profiling/scheduler.rbs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ module Datadog
exporter: Datadog::Profiling::Exporter,
transport: Datadog::Profiling::HttpTransport,
?fork_policy: untyped,
?interval: ::Integer,
?interval: ::Numeric,
?enabled: bool,
) -> void

Expand Down
Loading

0 comments on commit 0fa4a67

Please sign in to comment.