From 7df5858ae66f55e5dfa509ab2a44b3f4e60203bc Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Tue, 12 Mar 2024 16:13:49 +0000 Subject: [PATCH 1/4] [PROF-9136] Fix rare remote configuration worker thread leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit **What does this PR do?** This PR fixes the remote configuration worker thread leak that is discussed in more detail in the internal "dd-trace-rb Remote::Worker thread leak investigation" doc. In a nutshell, there's a potential race between threads that call `Datadog::Core::Remote::Tie.boot` and threads that call `Datadog.configure` that can, in really unlucky situations, mean that the `Remote::Worker thread gets leaked during library reconfiguration. This rare issue is O(number of times Datadog.configure gets called) while requests are being processed in a web-app. I was only able to reproduce it with the weird reproducer below, and by adding `sleep` to some parts of our code, so while we have observed it in production, we expect it to be rare. Being more specific, this issue is related to a race between the remote configuration thread being shut down, and other parts of the code base trying to restart it. In practice, the remote configuration would previously always get started by calls to `Datadog::Core::Remote.active_remote.barrier(:once)`. During dd-trace-rb reconfiguration, it’s possible for the remote configuration thread to be shut down, and before the new configuration kicks in, for any other thread to call `Datadog::Core::Remote.active_remote.barrier(:once)`, restarting the remote configuration thread that belonged to the old configuration. This PR fixes this issue introducing a `@stop_requested` variable that flags a "terminal state" for the remote configuration worker. Thus, the default behavior of `Remote::Worker.start` is changed from always starting the thread to not starting the thread if the same worker was stopped before. **Motivation:** Fix issue. **Additional Notes:** We are also working to remove cases where `Datadog.configure` gets called during request handling, e.g. #3515 . **How to test the change?** I've added unit test coverage for the worker not restarting itself again. To validate that this fixes the worker thread leak, the following reproducer can be used: ```diff diff --git a/lib/datadog/core/configuration.rb b/lib/datadog/core/configuration.rb index 7a20444784..fe0ca8ba0f 100644 --- a/lib/datadog/core/configuration.rb +++ b/lib/datadog/core/configuration.rb @@ -254,6 +254,12 @@ module Datadog components = Components.new(settings) old.shutdown!(components) + puts "After shutdown, sleeping!" + + sleep 1 + + puts "Woke up after shutdown!" + components.startup!(settings) components end diff --git a/remote-worker-thread-leak.rb b/remote-worker-thread-leak.rb new file mode 100644 index 0000000000..3552c654b5 --- /dev/null +++ b/remote-worker-thread-leak.rb @@ -0,0 +1,22 @@ +require 'ddtrace' + +configure_thread = Thread.new do + 15.times { + Datadog.configure { Thread.pass } + Thread.pass + } +end + +trigger_rc_thread = Thread.new do + loop { + sleep 0.5 + Datadog::Core::Remote.active_remote.barrier(:once) + Thread.pass + } +end + +configure_thread.join +trigger_rc_thread.kill +trigger_rc_thread.join + +puts Thread.list ``` Before the fix, the list of threads printed would show a lot of leaked remote worker configuration threads. With the fix, the threads no longer leak, and if you enable debug logging you will see that the restart is being prevented and the debug log I added will show up ``` D, [2024-03-12T16:27:02.951950 #742185] DEBUG -- ddtrace: [ddtrace] (dd-trace-rb/lib/datadog/core/remote/worker.rb:28:in `start') remote worker: refusing to restart after previous stop ``` This debug message is harmless and shows the fix is working fine. --- lib/datadog/core/remote/worker.rb | 13 ++++++++----- sig/datadog/core/remote/worker.rbs | 2 +- spec/datadog/core/remote/worker_spec.rb | 22 ++++++++++++++++++++++ 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/lib/datadog/core/remote/worker.rb b/lib/datadog/core/remote/worker.rb index da6f9379d75..e8be1231f50 100644 --- a/lib/datadog/core/remote/worker.rb +++ b/lib/datadog/core/remote/worker.rb @@ -10,8 +10,8 @@ def initialize(interval:, &block) @thr = nil @starting = false - @stopping = false @started = false + @stop_requested = false @interval = interval raise ArgumentError, 'can not initialize a worker without a block' unless block @@ -19,11 +19,16 @@ def initialize(interval:, &block) @block = block end - def start + def start(allow_restart: false) Datadog.logger.debug { 'remote worker starting' } acquire_lock + if @stop_requested && !allow_restart + Datadog.logger.debug('remote worker: refusing to restart after previous stop') + return + end + return if @starting || @started @starting = true @@ -46,8 +51,6 @@ def stop acquire_lock - @stopping = true - thread = @thr if thread @@ -56,8 +59,8 @@ def stop end @started = false - @stopping = false @thr = nil + @stop_requested = true Datadog.logger.debug { 'remote worker stopped' } ensure diff --git a/sig/datadog/core/remote/worker.rbs b/sig/datadog/core/remote/worker.rbs index 76767d399bb..bfe9d7f18a2 100644 --- a/sig/datadog/core/remote/worker.rbs +++ b/sig/datadog/core/remote/worker.rbs @@ -12,7 +12,7 @@ module Datadog def initialize: (interval: ::Float) { () -> void } -> void - def start: () -> void + def start: (?allow_restart: bool) -> void def stop: () -> void diff --git a/spec/datadog/core/remote/worker_spec.rb b/spec/datadog/core/remote/worker_spec.rb index 7ed70c5be26..040ebd5e586 100644 --- a/spec/datadog/core/remote/worker_spec.rb +++ b/spec/datadog/core/remote/worker_spec.rb @@ -67,6 +67,28 @@ expect(worker.instance_variable_get(:@thr).thread_variable_get(:fork_safe)).to be true end + + it 'does not restart the worker after being stopped once' do + worker.start + expect(worker.instance_variable_get(:@started)).to be true + + worker.stop + + worker.start + expect(worker.instance_variable_get(:@started)).to be false + end + + context 'when calling start with allow_restart: true' do + it 'restarts the worker after being stopped once' do + worker.start + expect(worker.instance_variable_get(:@started)).to be true + + worker.stop + + worker.start(allow_restart: true) + expect(worker.instance_variable_get(:@started)).to be true + end + end end describe '#stop' do From 760e084bdf5267ec962f66da7a62815330a4d70c Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Wed, 13 Mar 2024 09:16:43 +0000 Subject: [PATCH 2/4] Remove `allow_restart` argument I initially added this thinking it may make it easier for testing and/or experimentation, but after feedback from review, let's go without it for now -- we can always re-add later. --- lib/datadog/core/remote/worker.rb | 4 ++-- spec/datadog/core/remote/worker_spec.rb | 12 ------------ 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/lib/datadog/core/remote/worker.rb b/lib/datadog/core/remote/worker.rb index e8be1231f50..86868562ca7 100644 --- a/lib/datadog/core/remote/worker.rb +++ b/lib/datadog/core/remote/worker.rb @@ -19,12 +19,12 @@ def initialize(interval:, &block) @block = block end - def start(allow_restart: false) + def start Datadog.logger.debug { 'remote worker starting' } acquire_lock - if @stop_requested && !allow_restart + if @stop_requested Datadog.logger.debug('remote worker: refusing to restart after previous stop') return end diff --git a/spec/datadog/core/remote/worker_spec.rb b/spec/datadog/core/remote/worker_spec.rb index 040ebd5e586..20cf0f087cd 100644 --- a/spec/datadog/core/remote/worker_spec.rb +++ b/spec/datadog/core/remote/worker_spec.rb @@ -77,18 +77,6 @@ worker.start expect(worker.instance_variable_get(:@started)).to be false end - - context 'when calling start with allow_restart: true' do - it 'restarts the worker after being stopped once' do - worker.start - expect(worker.instance_variable_get(:@started)).to be true - - worker.stop - - worker.start(allow_restart: true) - expect(worker.instance_variable_get(:@started)).to be true - end - end end describe '#stop' do From 22f33301baac1da4fc555e22a12cef66c85260fc Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Wed, 13 Mar 2024 09:19:05 +0000 Subject: [PATCH 3/4] Rename @stop_requested to @stopped --- lib/datadog/core/remote/worker.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/datadog/core/remote/worker.rb b/lib/datadog/core/remote/worker.rb index 86868562ca7..19bc18b2264 100644 --- a/lib/datadog/core/remote/worker.rb +++ b/lib/datadog/core/remote/worker.rb @@ -11,7 +11,7 @@ def initialize(interval:, &block) @starting = false @started = false - @stop_requested = false + @stopped = false @interval = interval raise ArgumentError, 'can not initialize a worker without a block' unless block @@ -24,7 +24,7 @@ def start acquire_lock - if @stop_requested + if @stopped Datadog.logger.debug('remote worker: refusing to restart after previous stop') return end @@ -60,7 +60,7 @@ def stop @started = false @thr = nil - @stop_requested = true + @stopped = true Datadog.logger.debug { 'remote worker stopped' } ensure From e20cbaf4a21f948106f087311ace4760b2072c6b Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Wed, 13 Mar 2024 09:20:42 +0000 Subject: [PATCH 4/4] Update type signatures for worker after removal of argument --- sig/datadog/core/remote/worker.rbs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sig/datadog/core/remote/worker.rbs b/sig/datadog/core/remote/worker.rbs index bfe9d7f18a2..76767d399bb 100644 --- a/sig/datadog/core/remote/worker.rbs +++ b/sig/datadog/core/remote/worker.rbs @@ -12,7 +12,7 @@ module Datadog def initialize: (interval: ::Float) { () -> void } -> void - def start: (?allow_restart: bool) -> void + def start: () -> void def stop: () -> void