Skip to content

Commit d5d1c90

Browse files
committed
Add support for worker state callbacks
1 parent 802d4f8 commit d5d1c90

File tree

4 files changed

+173
-10
lines changed

4 files changed

+173
-10
lines changed

docs/src/_changelog.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ This documents notable changes in DistributedNext.jl. The format is based on
1616
- A watcher mechanism has been added to detect when both the Distributed stdlib
1717
and DistributedNext may be active and adding workers. This should help prevent
1818
incompatibilities from both libraries being used simultaneously ([#10]).
19+
- Implemented callback support for workers being added/removed, etc. ([#17]).
1920

2021
## [v1.0.0] - 2024-12-02
2122

docs/src/index.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,17 @@ DistributedNext.cluster_cookie()
5050
DistributedNext.cluster_cookie(::Any)
5151
```
5252

53+
## Callbacks
54+
55+
```@docs
56+
DistributedNext.add_worker_added_callback
57+
DistributedNext.remove_worker_added_callback
58+
DistributedNext.add_worker_exiting_callback
59+
DistributedNext.remove_worker_exiting_callback
60+
DistributedNext.add_worker_exited_callback
61+
DistributedNext.remove_worker_exited_callback
62+
```
63+
5364
## Cluster Manager Interface
5465

5566
This interface provides a mechanism to launch and manage Julia workers on different cluster environments.

src/cluster.jl

Lines changed: 119 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -461,12 +461,14 @@ function addprocs(manager::ClusterManager; kwargs...)
461461

462462
cluster_mgmt_from_master_check()
463463

464-
lock(worker_lock)
465-
try
466-
addprocs_locked(manager::ClusterManager; kwargs...)
467-
finally
468-
unlock(worker_lock)
464+
new_workers = @lock worker_lock addprocs_locked(manager::ClusterManager; kwargs...)
465+
for worker in new_workers
466+
for callback in values(worker_added_callbacks)
467+
callback(worker)
468+
end
469469
end
470+
471+
return new_workers
470472
end
471473

472474
function addprocs_locked(manager::ClusterManager; kwargs...)
@@ -855,13 +857,96 @@ const HDR_COOKIE_LEN=16
855857
const map_pid_wrkr = Dict{Int, Union{Worker, LocalProcess}}()
856858
const map_sock_wrkr = IdDict()
857859
const map_del_wrkr = Set{Int}()
860+
# Registries for worker state callbacks, keyed by a caller-supplied key (or a
# gensym'd one generated at registration time) so individual callbacks can be
# removed later. Each callback is invoked with a worker ID (`Int`).
const worker_added_callbacks = Dict{Any, Base.Callable}()
const worker_exiting_callbacks = Dict{Any, Base.Callable}()
const worker_exited_callbacks = Dict{Any, Base.Callable}()
858863

859864
# whether process is a master or worker in a distributed setup
860865
myrole() = LPROCROLE[]
861866
function myrole!(proctype::Symbol)
862867
LPROCROLE[] = proctype
863868
end
864869

870+
# Callbacks

# Shared implementation behind the six public add/remove functions below; the
# docstrings for the public names are attached to their stubs further down.
# Validates that `f` can accept a worker ID and stores it in `registry` under
# `key`, generating a unique key when the caller did not supply one.
function _register_worker_callback(registry::Dict{Any, Base.Callable}, f::Base.Callable, key)
    if !hasmethod(f, Tuple{Int})
        throw(ArgumentError("Callback function is invalid, it must be able to accept a single Int argument"))
    end

    if isnothing(key)
        # Prefix with a gensym so distinct registrations of the same function
        # never collide.
        key = Symbol(gensym(), nameof(f))
    end

    registry[key] = f
    return key
end

add_worker_added_callback(f::Base.Callable; key=nothing) =
    _register_worker_callback(worker_added_callbacks, f, key)
remove_worker_added_callback(key) = delete!(worker_added_callbacks, key)

add_worker_exiting_callback(f::Base.Callable; key=nothing) =
    _register_worker_callback(worker_exiting_callbacks, f, key)
remove_worker_exiting_callback(key) = delete!(worker_exiting_callbacks, key)

add_worker_exited_callback(f::Base.Callable; key=nothing) =
    _register_worker_callback(worker_exited_callbacks, f, key)
remove_worker_exited_callback(key) = delete!(worker_exited_callbacks, key)
896+
897+
"""
898+
add_worker_added_callback(f::Base.Callable; key=nothing)
899+
900+
Register a callback to be called on the master process whenever a worker is
901+
added. The callback will be called with the added worker ID,
902+
e.g. `f(w::Int)`. Returns a unique key for the callback.
903+
"""
904+
function add_worker_added_callback end
905+
906+
"""
907+
remove_worker_added_callback(key)
908+
909+
Remove the callback for `key`.
910+
"""
911+
function remove_worker_added_callback end
912+
913+
"""
914+
add_worker_exiting_callback(f::Base.Callable; key=nothing)
915+
916+
Register a callback to be called on the master process immediately before a
917+
worker is removed with [`rmprocs()`](@ref). The callback will be called with the
918+
worker ID, e.g. `f(w::Int)`. Returns a unique key for the callback.
919+
920+
All callbacks will be executed asynchronously and if they don't all finish
921+
before the `callback_timeout` passed to `rmprocs()` then the process will be
922+
removed anyway.
923+
"""
924+
function add_worker_exiting_callback end
925+
926+
"""
927+
remove_worker_exiting_callback(key)
928+
929+
Remove the callback for `key`.
930+
"""
931+
function remove_worker_exiting_callback end
932+
933+
"""
934+
add_worker_exited_callback(f::Base.Callable; key=nothing)
935+
936+
Register a callback to be called on the master process when a worker has exited
937+
for any reason (i.e. not only because of [`rmprocs()`](@ref) but also the worker
938+
segfaulting etc). The callback will be called with the worker ID,
939+
e.g. `f(w::Int)`. Returns a unique key for the callback.
940+
"""
941+
function add_worker_exited_callback end
942+
943+
"""
944+
remove_worker_exited_callback(key)
945+
946+
Remove the callback for `key`.
947+
"""
948+
function remove_worker_exited_callback end
949+
865950
# cluster management related API
866951
"""
867952
myid()
@@ -1025,7 +1110,7 @@ function cluster_mgmt_from_master_check()
10251110
end
10261111

10271112
"""
1028-
rmprocs(pids...; waitfor=typemax(Int))
1113+
rmprocs(pids...; waitfor=typemax(Int), callback_timeout=10)
10291114
10301115
Remove the specified workers. Note that only process 1 can add or remove
10311116
workers.
@@ -1039,6 +1124,10 @@ Argument `waitfor` specifies how long to wait for the workers to shut down:
10391124
returned. The user should call [`wait`](@ref) on the task before invoking any other
10401125
parallel calls.
10411126
1127+
The `callback_timeout` specifies how long to wait for any callbacks to execute
1128+
before continuing to remove the workers (see
1129+
[`add_worker_exiting_callback()`](@ref)).
1130+
10421131
# Examples
10431132
```julia-repl
10441133
\$ julia -p 5
@@ -1055,24 +1144,36 @@ julia> workers()
10551144
6
10561145
```
10571146
"""
1058-
function rmprocs(pids...; waitfor=typemax(Int))
1147+
function rmprocs(pids...; waitfor=typemax(Int), callback_timeout=10)
    cluster_mgmt_from_master_check()

    # Flatten the varargs: callers may pass individual IDs, vectors of IDs, or
    # a mix of both.
    worker_ids = vcat(pids...)

    if waitfor != 0
        # Synchronous removal. Still return a (trivially finished) task so
        # user code can uniformly `wait` on the result.
        _rmprocs(worker_ids, waitfor, callback_timeout)
        return @async nothing
    end

    # `waitfor == 0`: run the removal in the background and hand the task back
    # to the caller, yielding once so it gets a chance to start.
    removal_task = @async _rmprocs(worker_ids, typemax(Int), callback_timeout)
    yield()
    return removal_task
end
10721161

1073-
function _rmprocs(pids, waitfor)
1162+
function _rmprocs(pids, waitfor, callback_timeout)
10741163
lock(worker_lock)
10751164
try
1165+
# Run the callbacks
1166+
callback_tasks = Task[]
1167+
for pid in pids
1168+
for callback in values(worker_exiting_callbacks)
1169+
push!(callback_tasks, Threads.@spawn callback(pid))
1170+
end
1171+
end
1172+
1173+
if timedwait(() -> all(istaskdone.(callback_tasks)), callback_timeout) === :timed_out
1174+
@warn "Some callbacks timed out, continuing to remove workers anyway"
1175+
end
1176+
10761177
rmprocset = Union{LocalProcess, Worker}[]
10771178
for p in pids
10781179
if p == 1
@@ -1218,6 +1319,14 @@ function deregister_worker(pg, pid)
12181319
delete!(pg.refs, id)
12191320
end
12201321
end
1322+
1323+
# Call callbacks on the master
1324+
if myid() == 1
1325+
for callback in values(worker_exited_callbacks)
1326+
callback(pid)
1327+
end
1328+
end
1329+
12211330
return
12221331
end
12231332

test/distributed_exec.jl

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# This file is a part of Julia. License is MIT: https://julialang.org/license
22

33
using DistributedNext, Random, Serialization, Sockets
4+
import DistributedNext
45
import DistributedNext: launch, manage
56

67

@@ -1927,6 +1928,47 @@ include("splitrange.jl")
19271928
end
19281929
end
19291930

1931+
@testset "Worker state callbacks" begin
1932+
if nprocs() > 1
1933+
rmprocs(workers())
1934+
end
1935+
1936+
# Smoke test to ensure that all the callbacks are executed
1937+
added_workers = Int[]
1938+
exiting_workers = Int[]
1939+
exited_workers = Int[]
1940+
added_key = DistributedNext.add_worker_added_callback(pid -> push!(added_workers, pid))
1941+
exiting_key = DistributedNext.add_worker_exiting_callback(pid -> push!(exiting_workers, pid))
1942+
exited_key = DistributedNext.add_worker_exited_callback(pid -> push!(exited_workers, pid))
1943+
1944+
pid = only(addprocs(1))
1945+
@test added_workers == [pid]
1946+
rmprocs(workers())
1947+
@test exiting_workers == [pid]
1948+
@test exited_workers == [pid]
1949+
1950+
# Remove the callbacks
1951+
DistributedNext.remove_worker_added_callback(added_key)
1952+
DistributedNext.remove_worker_exiting_callback(exiting_key)
1953+
DistributedNext.remove_worker_exited_callback(exited_key)
1954+
1955+
# Test that the `callback_timeout` option works
1956+
event = Base.Event()
1957+
callback_task = nothing
1958+
exiting_key = DistributedNext.add_worker_exiting_callback(_ -> (callback_task = current_task(); wait(event)))
1959+
addprocs(1)
1960+
1961+
@test_logs (:warn, r"Some callbacks timed out.+") rmprocs(workers(); callback_timeout=0.5)
1962+
1963+
notify(event)
1964+
wait(callback_task)
1965+
1966+
# Test that the previous callbacks were indeed removed
1967+
@test length(added_workers) == 1
1968+
@test length(exiting_workers) == 1
1969+
@test length(exited_workers) == 1
1970+
end
1971+
19301972
# Run topology tests last after removing all workers, since a given
19311973
# cluster at any time only supports a single topology.
19321974
if nprocs() > 1

0 commit comments

Comments
 (0)