@@ -505,52 +505,72 @@ function workers()
505
505
end
506
506
507
507
"""
508
- rmprocs(pids...; waitfor=0.0 )
508
+ rmprocs(pids...; waitfor=typemax(Int) )
509
509
510
- Removes the specified workers. Note that only
511
- process 1 can add or remove workers - if another
512
- worker tries to call `rmprocs`, an error will be
513
- thrown. The optional argument `waitfor` determines
514
- how long the first process will wait for the workers
515
- to shut down.
510
+ Removes the specified workers. Note that only process 1 can add or remove
511
+ workers.
512
+
513
+ Argument `waitfor` specifies how long to wait for the workers to shut down:
514
+ - If unspecified, `rmprocs` will wait until all requested `pids` are removed.
515
+ - An `ErrorException` is raised if all workers cannot be terminated before
516
+ the requested `waitfor` seconds.
517
+ - With a `waitfor` value of 0, the call returns immediately with the workers
518
+ scheduled for removal in a different task. The scheduled `Task` object is
519
+ returned. The user should call `wait` on the task before invoking any other
520
+ parallel calls.
516
521
"""
517
- function rmprocs (pids... ; waitfor = 0.0 )
522
+ function rmprocs (pids... ; waitfor= typemax (Int) )
518
523
# Only pid 1 can add and remove processes
519
524
if myid () != 1
520
- error ( " only process 1 can add and remove processes" )
525
+ throw ( ErrorException ( " only process 1 can add and remove processes" ) )
521
526
end
522
527
528
+ pids = vcat (pids... )
529
+ if waitfor == 0
530
+ t = @schedule _rmprocs (pids, typemax (Int))
531
+ yield ()
532
+ return t
533
+ else
534
+ _rmprocs (pids, waitfor)
535
+ # return a dummy task object that user code can wait on.
536
+ return @schedule nothing
537
+ end
538
+ end
539
+
540
+ function _rmprocs (pids, waitfor)
523
541
lock (worker_lock)
524
542
try
525
543
rmprocset = []
526
- for i in vcat (pids... )
527
- if i == 1
544
+ for p in vcat (pids... )
545
+ if p == 1
528
546
warn (" rmprocs: process 1 not removed" )
529
547
else
530
- if haskey (map_pid_wrkr, i )
531
- w = map_pid_wrkr[i ]
548
+ if haskey (map_pid_wrkr, p )
549
+ w = map_pid_wrkr[p ]
532
550
set_worker_state (w, W_TERMINATING)
533
- kill (w. manager, i , w. config)
551
+ kill (w. manager, p , w. config)
534
552
push! (rmprocset, w)
535
553
end
536
554
end
537
555
end
538
556
539
557
start = time ()
540
558
while (time () - start) < waitfor
541
- if all (w -> w. state == W_TERMINATED, rmprocset)
542
- break
543
- else
544
- sleep (0.1 )
545
- end
559
+ all (w -> w. state == W_TERMINATED, rmprocset) && break
560
+ sleep (min (0.1 , waitfor - (time () - start)))
546
561
end
547
562
548
- ((waitfor > 0 ) && any (w -> w. state != W_TERMINATED, rmprocset)) ? :timed_out : :ok
563
+ unremoved = [wrkr. id for wrkr in filter (w -> w. state != W_TERMINATED, rmprocset)]
564
+ if length (unremoved) > 0
565
+ estr = string (" rmprocs: pids " , unremoved, " not terminated after " , waitfor, " seconds." )
566
+ throw (ErrorException (estr))
567
+ end
549
568
finally
550
569
unlock (worker_lock)
551
570
end
552
571
end
553
572
573
+
554
574
"""
555
575
ProcessExitedException()
556
576
@@ -2238,18 +2258,18 @@ function check_same_host(pids)
2238
2258
end
2239
2259
2240
2260
function terminate_all_workers ()
2241
- if myid () != 1
2242
- return
2243
- end
2261
+ myid () != 1 && return
2244
2262
2245
2263
if nprocs () > 1
2246
- ret = rmprocs (workers (); waitfor= 0.5 )
2247
- if ret != = :ok
2264
+ try
2265
+ rmprocs (workers (); waitfor= 5.0 )
2266
+ catch _ex
2248
2267
warn (" Forcibly interrupting busy workers" )
2249
2268
# Might be computation bound, interrupt them and try again
2250
2269
interrupt (workers ())
2251
- ret = rmprocs (workers (); waitfor= 0.5 )
2252
- if ret != = :ok
2270
+ try
2271
+ rmprocs (workers (); waitfor= 5.0 )
2272
+ catch _ex2
2253
2273
warn (" Unable to terminate all workers" )
2254
2274
end
2255
2275
end
0 commit comments