Merge pull request #3310 from zendesk/ktsanaktsidis/overhead_percentage_dynamic

Allow the dynamic sampling rate overhead target to be set
DataDog · Dec 11, 2023 · 0fa4a67 · 0fa4a67
2 parents 22c0370 + 43ef2a9
commit 0fa4a67
Show file tree

Hide file tree

Showing 16 changed files with 247 additions and 42 deletions.
diff --git a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
@@ -152,6 +152,7 @@ static VALUE _native_initialize(
   VALUE allocation_counting_enabled,
   VALUE no_signals_workaround_enabled,
   VALUE dynamic_sampling_rate_enabled,
+  VALUE dynamic_sampling_rate_overhead_target_percentage,
   VALUE allocation_sample_every
 );
 static void cpu_and_wall_time_worker_typed_data_mark(void *state_ptr);
@@ -226,7 +227,7 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
   // https://bugs.ruby-lang.org/issues/18007 for a discussion around this.
   rb_define_alloc_func(collectors_cpu_and_wall_time_worker_class, _native_new);
 
-  rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 8);
+  rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_initialize", _native_initialize, 9);
   rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_sampling_loop", _native_sampling_loop, 1);
   rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_stop", _native_stop, 2);
   rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_reset_after_fork", _native_reset_after_fork, 1);
@@ -295,13 +296,15 @@ static VALUE _native_initialize(
   VALUE allocation_counting_enabled,
   VALUE no_signals_workaround_enabled,
   VALUE dynamic_sampling_rate_enabled,
+  VALUE dynamic_sampling_rate_overhead_target_percentage,
   VALUE allocation_sample_every
 ) {
   ENFORCE_BOOLEAN(gc_profiling_enabled);
   ENFORCE_BOOLEAN(allocation_counting_enabled);
   ENFORCE_BOOLEAN(no_signals_workaround_enabled);
   ENFORCE_BOOLEAN(dynamic_sampling_rate_enabled);
   ENFORCE_TYPE(allocation_sample_every, T_FIXNUM);
+  ENFORCE_TYPE(dynamic_sampling_rate_overhead_target_percentage, T_FLOAT);
 
   struct cpu_and_wall_time_worker_state *state;
   TypedData_Get_Struct(self_instance, struct cpu_and_wall_time_worker_state, &cpu_and_wall_time_worker_typed_data, state);
@@ -310,6 +313,7 @@ static VALUE _native_initialize(
   state->allocation_counting_enabled = (allocation_counting_enabled == Qtrue);
   state->no_signals_workaround_enabled = (no_signals_workaround_enabled == Qtrue);
   state->dynamic_sampling_rate_enabled = (dynamic_sampling_rate_enabled == Qtrue);
+  dynamic_sampling_rate_set_overhead_target_percentage(&state->dynamic_sampling_rate, NUM2DBL(dynamic_sampling_rate_overhead_target_percentage));
   state->allocation_sample_every = NUM2INT(allocation_sample_every);
 
   if (state->allocation_sample_every < 0) {

diff --git a/ext/ddtrace_profiling_native_extension/collectors_dynamic_sampling_rate.c b/ext/ddtrace_profiling_native_extension/collectors_dynamic_sampling_rate.c
@@ -19,7 +19,7 @@
 //
 // Instead of sampling at a fixed sample rate, the actual sampling rate should be decided by also observing the impact
 // that running the profiler is having. This protects against issues such as the profiler being deployed in very busy
-//machines or containers with unrealistic CPU restrictions.
+// machines or containers with unrealistic CPU restrictions.
 //
 // ### Implementation
 //
@@ -35,20 +35,25 @@
 // sample. If it's not, it will skip sampling.
 //
 // Finally, as an additional optimization, there's a `dynamic_sampling_rate_get_sleep()` which, given the current
-// wall-time, will return the time remaining (*there's an exception, check below) until the next sample.
+// wall-time, will return the time remaining (*there's an exception, check that function) until the next sample.
 //
 // ---
 
 // This is the wall-time overhead we're targeting. E.g. we target to spend no more than 2%, or 1.2 seconds per minute,
-// taking profiling samples.
-#define WALL_TIME_OVERHEAD_TARGET_PERCENTAGE 2.0 // %
+// taking profiling samples by default.
+#define DEFAULT_WALL_TIME_OVERHEAD_TARGET_PERCENTAGE 2.0 // %
 // See `dynamic_sampling_rate_get_sleep()` for details
 #define MAX_SLEEP_TIME_NS MILLIS_AS_NS(100)
 // See `dynamic_sampling_rate_after_sample()` for details
 #define MAX_TIME_UNTIL_NEXT_SAMPLE_NS SECONDS_AS_NS(10)
 
 void dynamic_sampling_rate_init(dynamic_sampling_rate_state *state) {
   atomic_init(&state->next_sample_after_monotonic_wall_time_ns, 0);
+  dynamic_sampling_rate_set_overhead_target_percentage(state, DEFAULT_WALL_TIME_OVERHEAD_TARGET_PERCENTAGE);
+}
+
+void dynamic_sampling_rate_set_overhead_target_percentage(dynamic_sampling_rate_state *state, double overhead_target_percentage) {
+  state->overhead_target_percentage = overhead_target_percentage;
 }
 
 void dynamic_sampling_rate_reset(dynamic_sampling_rate_state *state) {
@@ -76,7 +81,7 @@ bool dynamic_sampling_rate_should_sample(dynamic_sampling_rate_state *state, lon
 }
 
 void dynamic_sampling_rate_after_sample(dynamic_sampling_rate_state *state, long wall_time_ns_after_sample, uint64_t sampling_time_ns) {
-  double overhead_target = (double) WALL_TIME_OVERHEAD_TARGET_PERCENTAGE;
+  double overhead_target = state->overhead_target_percentage;
 
   // The idea here is that we're targeting a maximum % of wall-time spent sampling.
   // So for instance, if sampling_time_ns is 2% of the time we spend working, how much is the 98% we should spend
@@ -93,48 +98,51 @@ void dynamic_sampling_rate_after_sample(dynamic_sampling_rate_state *state, long
 // ---
 // Below here is boilerplate to expose the above code to Ruby so that we can test it with RSpec as usual.
 
-VALUE _native_get_sleep(DDTRACE_UNUSED VALUE self, VALUE simulated_next_sample_after_monotonic_wall_time_ns, VALUE current_monotonic_wall_time_ns);
-VALUE _native_should_sample(DDTRACE_UNUSED VALUE self, VALUE simulated_next_sample_after_monotonic_wall_time_ns, VALUE wall_time_ns_before_sample);
-VALUE _native_after_sample(DDTRACE_UNUSED VALUE self, VALUE wall_time_ns_after_sample, VALUE sampling_time_ns);
+VALUE _native_get_sleep(DDTRACE_UNUSED VALUE self, VALUE overhead_target_percentage, VALUE simulated_next_sample_after_monotonic_wall_time_ns, VALUE current_monotonic_wall_time_ns);
+VALUE _native_should_sample(DDTRACE_UNUSED VALUE self, VALUE overhead_target_percentage, VALUE simulated_next_sample_after_monotonic_wall_time_ns, VALUE wall_time_ns_before_sample);
+VALUE _native_after_sample(DDTRACE_UNUSED VALUE self, VALUE overhead_target_percentage, VALUE wall_time_ns_after_sample, VALUE sampling_time_ns);
 
 void collectors_dynamic_sampling_rate_init(VALUE profiling_module) {
   VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors");
   VALUE dynamic_sampling_rate_module = rb_define_module_under(collectors_module, "DynamicSamplingRate");
   VALUE testing_module = rb_define_module_under(dynamic_sampling_rate_module, "Testing");
 
-  rb_define_singleton_method(testing_module, "_native_get_sleep", _native_get_sleep, 2);
-  rb_define_singleton_method(testing_module, "_native_should_sample", _native_should_sample, 2);
-  rb_define_singleton_method(testing_module, "_native_after_sample", _native_after_sample, 2);
+  rb_define_singleton_method(testing_module, "_native_get_sleep", _native_get_sleep, 3);
+  rb_define_singleton_method(testing_module, "_native_should_sample", _native_should_sample, 3);
+  rb_define_singleton_method(testing_module, "_native_after_sample", _native_after_sample, 3);
 }
 
-VALUE _native_get_sleep(DDTRACE_UNUSED VALUE self, VALUE simulated_next_sample_after_monotonic_wall_time_ns, VALUE current_monotonic_wall_time_ns) {
+VALUE _native_get_sleep(DDTRACE_UNUSED VALUE self, VALUE overhead_target_percentage, VALUE simulated_next_sample_after_monotonic_wall_time_ns, VALUE current_monotonic_wall_time_ns) {
   ENFORCE_TYPE(simulated_next_sample_after_monotonic_wall_time_ns, T_FIXNUM);
   ENFORCE_TYPE(current_monotonic_wall_time_ns, T_FIXNUM);
 
   dynamic_sampling_rate_state state;
   dynamic_sampling_rate_init(&state);
+  dynamic_sampling_rate_set_overhead_target_percentage(&state, NUM2DBL(overhead_target_percentage));
   atomic_store(&state.next_sample_after_monotonic_wall_time_ns, NUM2LONG(simulated_next_sample_after_monotonic_wall_time_ns));
 
   return ULL2NUM(dynamic_sampling_rate_get_sleep(&state, NUM2LONG(current_monotonic_wall_time_ns)));
 }
 
-VALUE _native_should_sample(DDTRACE_UNUSED VALUE self, VALUE simulated_next_sample_after_monotonic_wall_time_ns, VALUE wall_time_ns_before_sample) {
+VALUE _native_should_sample(DDTRACE_UNUSED VALUE self, VALUE overhead_target_percentage, VALUE simulated_next_sample_after_monotonic_wall_time_ns, VALUE wall_time_ns_before_sample) {
   ENFORCE_TYPE(simulated_next_sample_after_monotonic_wall_time_ns, T_FIXNUM);
   ENFORCE_TYPE(wall_time_ns_before_sample, T_FIXNUM);
 
   dynamic_sampling_rate_state state;
   dynamic_sampling_rate_init(&state);
+  dynamic_sampling_rate_set_overhead_target_percentage(&state, NUM2DBL(overhead_target_percentage));
   atomic_store(&state.next_sample_after_monotonic_wall_time_ns, NUM2LONG(simulated_next_sample_after_monotonic_wall_time_ns));
 
   return dynamic_sampling_rate_should_sample(&state, NUM2LONG(wall_time_ns_before_sample)) ? Qtrue : Qfalse;
 }
 
-VALUE _native_after_sample(DDTRACE_UNUSED VALUE self, VALUE wall_time_ns_after_sample, VALUE sampling_time_ns) {
+VALUE _native_after_sample(DDTRACE_UNUSED VALUE self, VALUE overhead_target_percentage, VALUE wall_time_ns_after_sample, VALUE sampling_time_ns) {
   ENFORCE_TYPE(wall_time_ns_after_sample, T_FIXNUM);
   ENFORCE_TYPE(sampling_time_ns, T_FIXNUM);
 
   dynamic_sampling_rate_state state;
   dynamic_sampling_rate_init(&state);
+  dynamic_sampling_rate_set_overhead_target_percentage(&state, NUM2DBL(overhead_target_percentage));
 
   dynamic_sampling_rate_after_sample(&state, NUM2LONG(wall_time_ns_after_sample), NUM2ULL(sampling_time_ns));
 

diff --git a/ext/ddtrace_profiling_native_extension/collectors_dynamic_sampling_rate.h b/ext/ddtrace_profiling_native_extension/collectors_dynamic_sampling_rate.h
@@ -4,10 +4,14 @@
 #include <stdbool.h>
 
 typedef struct {
+  // This is the wall-time overhead we're targeting. E.g. by default, we target to spend no more than 2%, or 1.2 seconds
+  // per minute, taking profiling samples.
+  double overhead_target_percentage;
   atomic_long next_sample_after_monotonic_wall_time_ns;
 } dynamic_sampling_rate_state;
 
 void dynamic_sampling_rate_init(dynamic_sampling_rate_state *state);
+void dynamic_sampling_rate_set_overhead_target_percentage(dynamic_sampling_rate_state *state, double overhead_target_percentage);
 void dynamic_sampling_rate_reset(dynamic_sampling_rate_state *state);
 uint64_t dynamic_sampling_rate_get_sleep(dynamic_sampling_rate_state *state, long current_monotonic_wall_time_ns);
 bool dynamic_sampling_rate_should_sample(dynamic_sampling_rate_state *state, long wall_time_ns_before_sample);

diff --git a/lib/datadog/core/configuration/settings.rb b/lib/datadog/core/configuration/settings.rb
@@ -391,6 +391,34 @@ def initialize(*_)
                 end
               end
             end
+
+            # Configures how much wall-time overhead the profiler targets. The profiler will dynamically adjust the
+            # interval between samples it takes so as to try and maintain the property that it spends no longer than
+            # this amount of wall-clock time profiling. For example, with the default value of 2%, the profiler will
+            # try and cause no more than 1.2 seconds per minute of overhead. Decreasing this value will reduce the
+            # accuracy of the data collected. Increasing will impact the application.
+            #
+            # We do not recommend tweaking this value.
+            #
+            # This value is a percentage (0-100 scale, not 0-1); values outside (0, 20] fall back to the default.
+            #
+            # @default `DD_PROFILING_OVERHEAD_TARGET_PERCENTAGE` as a float, otherwise 2.0
+            option :overhead_target_percentage do |o|
+              o.type :float
+              o.env 'DD_PROFILING_OVERHEAD_TARGET_PERCENTAGE'
+              o.default 2.0
+            end
+
+            # Controls how often the profiler reports data, in seconds. Cannot be lower than 60 seconds.
+            #
+            # We do not recommend tweaking this value.
+            #
+            # @default `DD_PROFILING_UPLOAD_PERIOD` environment variable, otherwise 60
+            option :upload_period_seconds do |o|
+              o.type :int
+              o.env 'DD_PROFILING_UPLOAD_PERIOD'
+              o.default 60
+            end
           end
 
           # @public_api

diff --git a/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb b/lib/datadog/profiling/collectors/cpu_and_wall_time_worker.rb
@@ -18,6 +18,7 @@ def initialize(
           allocation_counting_enabled:,
           no_signals_workaround_enabled:,
           thread_context_collector:,
+          dynamic_sampling_rate_overhead_target_percentage:,
           idle_sampling_helper: IdleSamplingHelper.new,
           # **NOTE**: This should only be used for testing; disabling the dynamic sampling rate will increase the
           # profiler overhead!
@@ -45,6 +46,7 @@ def initialize(
             allocation_counting_enabled,
             no_signals_workaround_enabled,
             dynamic_sampling_rate_enabled,
+            dynamic_sampling_rate_overhead_target_percentage,
             allocation_sample_every,
           )
           @worker_thread = nil

diff --git a/lib/datadog/profiling/component.rb b/lib/datadog/profiling/component.rb
@@ -7,7 +7,7 @@ module Component
       # Passing in a `nil` tracer is supported and will disable the following profiling features:
       # * Code Hotspots panel in the trace viewer, as well as scoping a profile down to a span
       # * Endpoint aggregation in the profiler UX, including normalization (resource per endpoint call)
-      def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
+      def self.build_profiler_component(settings:, agent_settings:, optional_tracer:) # rubocop:disable Metrics/MethodLength
         require_relative '../profiling/diagnostics/environment_logger'
 
         Profiling::Diagnostics::EnvironmentLogger.collect_and_log!
@@ -41,6 +41,8 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
 
         no_signals_workaround_enabled = no_signals_workaround_enabled?(settings)
         timeline_enabled = settings.profiling.advanced.experimental_timeline_enabled
+        overhead_target_percentage = valid_overhead_target(settings.profiling.advanced.overhead_target_percentage)
+        upload_period_seconds = [60, settings.profiling.advanced.upload_period_seconds].max
 
         recorder = Datadog::Profiling::StackRecorder.new(
           cpu_time_enabled: RUBY_PLATFORM.include?('linux'), # Only supported on Linux currently
@@ -58,6 +60,7 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
           allocation_counting_enabled: settings.profiling.advanced.allocation_counting_enabled,
           no_signals_workaround_enabled: no_signals_workaround_enabled,
           thread_context_collector: thread_context_collector,
+          dynamic_sampling_rate_overhead_target_percentage: overhead_target_percentage,
           allocation_sample_every: 0,
         )
 
@@ -68,7 +71,7 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
 
         exporter = build_profiler_exporter(settings, recorder, internal_metadata: internal_metadata)
         transport = build_profiler_transport(settings, agent_settings)
-        scheduler = Profiling::Scheduler.new(exporter: exporter, transport: transport)
+        scheduler = Profiling::Scheduler.new(exporter: exporter, transport: transport, interval: upload_period_seconds)
 
         Profiling::Profiler.new(worker: worker, scheduler: scheduler)
       end
@@ -245,6 +248,19 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
           true
         end
       end
+
+      private_class_method def self.valid_overhead_target(overhead_target_percentage)
+        if overhead_target_percentage > 0 && overhead_target_percentage <= 20
+          overhead_target_percentage
+        else
+          Datadog.logger.error(
+            'Ignoring invalid value for profiling overhead_target_percentage setting: ' \
+            "#{overhead_target_percentage.inspect}. Falling back to default value."
+          )
+
+          2.0
+        end
+      end
     end
   end
 end
diff --git a/lib/datadog/profiling/scheduler.rb b/lib/datadog/profiling/scheduler.rb
@@ -5,12 +5,11 @@
 
 module Datadog
   module Profiling
-    # Periodically (every DEFAULT_INTERVAL_SECONDS) takes a profile from the `Exporter` and reports it using the
+    # Periodically (every interval, 60 seconds by default) takes a profile from the `Exporter` and reports it using the
     # configured transport. Runs on its own background thread.
     class Scheduler < Core::Worker
       include Core::Workers::Polling
 
-      DEFAULT_INTERVAL_SECONDS = 60
       MINIMUM_INTERVAL_SECONDS = 0
 
       # We sleep for at most this duration seconds before reporting data to avoid multi-process applications all
@@ -28,8 +27,7 @@ class Scheduler < Core::Worker
       def initialize(
         exporter:,
         transport:,
-        fork_policy: Core::Workers::Async::Thread::FORK_POLICY_RESTART, # Restart in forks by default
-        interval: DEFAULT_INTERVAL_SECONDS,
+        interval:, fork_policy: Core::Workers::Async::Thread::FORK_POLICY_RESTART, # `interval` is in seconds; restart in forks by default
         enabled: true
       )
         @exporter = exporter
@@ -115,8 +113,8 @@ def flush_events
         #
         # During PR review (https://github.com/DataDog/dd-trace-rb/pull/1807) we discussed the possible alternative of
         # just sleeping before starting the scheduler loop. We ended up not going with that option to avoid the first
-        # profile containing up to DEFAULT_INTERVAL_SECONDS + DEFAULT_FLUSH_JITTER_MAXIMUM_SECONDS instead of the
-        # usual DEFAULT_INTERVAL_SECONDS size.
+        # profile containing up to interval + DEFAULT_FLUSH_JITTER_MAXIMUM_SECONDS instead of the
+        # usual interval seconds.
         if run_loop?
           jitter_seconds = rand * DEFAULT_FLUSH_JITTER_MAXIMUM_SECONDS # floating point number between (0.0...maximum)
           sleep(jitter_seconds)

diff --git a/sig/datadog/profiling/collectors/cpu_and_wall_time_worker.rbs b/sig/datadog/profiling/collectors/cpu_and_wall_time_worker.rbs
@@ -12,6 +12,7 @@ module Datadog
           allocation_counting_enabled: bool,
           no_signals_workaround_enabled: bool,
           thread_context_collector: Datadog::Profiling::Collectors::ThreadContext,
+          dynamic_sampling_rate_overhead_target_percentage: Float,
           ?idle_sampling_helper: Datadog::Profiling::Collectors::IdleSamplingHelper,
           ?dynamic_sampling_rate_enabled: bool,
           ?allocation_sample_every: Integer,
@@ -25,6 +26,7 @@ module Datadog
           bool allocation_counting_enabled,
           bool no_signals_workaround_enabled,
           bool dynamic_sampling_rate_enabled,
+          Float dynamic_sampling_rate_overhead_target_percentage,
           ::Integer allocation_sample_every,
         ) -> true
 

diff --git a/sig/datadog/profiling/component.rbs b/sig/datadog/profiling/component.rbs
@@ -24,6 +24,8 @@ module Datadog
 
       def self.incompatible_libmysqlclient_version?: (untyped settings) -> bool
       def self.incompatible_passenger_version?: () -> bool
+      def self.flush_interval: (untyped settings) -> ::Numeric
+      def self.valid_overhead_target: (::Float overhead_target_percentage) -> ::Float
     end
   end
 end
diff --git a/sig/datadog/profiling/ext.rbs b/sig/datadog/profiling/ext.rbs
@@ -6,6 +6,8 @@ module Datadog
       ENV_MAX_FRAMES: "DD_PROFILING_MAX_FRAMES"
       ENV_AGENTLESS: "DD_PROFILING_AGENTLESS"
       ENV_ENDPOINT_COLLECTION_ENABLED: "DD_PROFILING_ENDPOINT_COLLECTION_ENABLED"
+      ENV_DYNAMIC_SAMPLING_RATE_OVERHEAD_TARGET_PERCENTAGE: "DD_PROFILING_DYNAMIC_SAMPLING_RATE_OVERHEAD_TARGET_PERCENTAGE"
+      DEFAULT_DYNAMIC_SAMPLING_RATE_OVERHEAD_TARGET_PERCENTAGE: Float
 
       module Transport
         module HTTP

diff --git a/sig/datadog/profiling/scheduler.rbs b/sig/datadog/profiling/scheduler.rbs
@@ -7,7 +7,7 @@ module Datadog
         exporter: Datadog::Profiling::Exporter,
         transport: Datadog::Profiling::HttpTransport,
         ?fork_policy: untyped,
-        ?interval: ::Integer,
+        ?interval: ::Numeric,
         ?enabled: bool,
       ) -> void