@@ -457,22 +457,34 @@ end
457
457
```
458
458
"""
459
459
function addprocs(manager::ClusterManager; kwargs...)
    # Resolve the effective parameters up front: both the locked launch step and
    # the callback-warning loop below read from them.
    params = merge(default_addprocs_params(manager), Dict{Symbol, Any}(kwargs))

    init_multi()

    cluster_mgmt_from_master_check()

    # Only the launch itself runs under `worker_lock`; the worker-added
    # callbacks below are run after the lock has been released.
    new_workers = @lock worker_lock addprocs_locked(manager, params)

    # One task per (callback key, worker) pair. Keying on the callback key alone
    # would let a later worker's task overwrite an earlier worker's task for the
    # same callback, so the earlier task would never be polled or waited on and
    # any exception it threw would be silently lost.
    callback_tasks = Dict{Any, Task}()
    for worker in new_workers
        for (name, callback) in worker_added_callbacks
            callback_tasks[(name, worker)] = Threads.@spawn callback(worker)
        end
    end

    # Quoted keys of the callbacks that still have at least one unfinished task.
    # De-duplicated so a callback that is pending for several workers is listed
    # only once in the warning.
    running_callbacks = () -> unique([
        "'$(name)'" for ((name, _), task) in callback_tasks if !istaskdone(task)
    ])

    # Re-issue the warning every `callback_warning_interval` seconds until all
    # callback tasks are done.
    interval = params[:callback_warning_interval]
    while timedwait(() -> isempty(running_callbacks()), interval) === :timed_out
        callbacks_str = join(running_callbacks(), ", ")
        @warn "Waiting for these worker-added callbacks to finish: $(callbacks_str)"
    end

    # Wait on the tasks so that exceptions bubble up
    foreach(wait, values(callback_tasks))

    return new_workers
end
473
486
474
- function addprocs_locked (manager:: ClusterManager ; kwargs... )
475
- params = merge (default_addprocs_params (manager), Dict {Symbol,Any} (kwargs))
487
+ function addprocs_locked (manager:: ClusterManager , params)
476
488
topology (Symbol (params[:topology ]))
477
489
478
490
if PGRP. topology != = :all_to_all
@@ -559,7 +571,8 @@ default_addprocs_params() = Dict{Symbol,Any}(
559
571
:exeflags => ` ` ,
560
572
:env => [],
561
573
:enable_threaded_blas => false ,
562
- :lazy => true )
574
+ :lazy => true ,
575
+ :callback_warning_interval => 10 )
563
576
564
577
565
578
function setup_launched_worker (manager, wconfig, launched_q)
872
885
function _add_callback (f, key, dict)
873
886
if ! hasmethod (f, Tuple{Int})
874
887
throw (ArgumentError (" Callback function is invalid, it must be able to accept a single Int argument" ))
888
+ elseif haskey (dict, key)
889
+ throw (ArgumentError (" A callback function with key '$(key) ' already exists" ))
875
890
end
876
891
877
892
if isnothing (key)
@@ -889,14 +904,23 @@ _remove_callback(key, dict) = delete!(dict, key)
889
904
890
905
Register a callback to be called on the master process whenever a worker is
891
906
added. The callback will be called with the added worker ID,
892
- e.g. `f(w::Int)`. Returns a unique key for the callback.
907
+ e.g. `f(w::Int)`. Chooses and returns a unique key for the callback if `key` is
908
+ not specified.
909
+
910
+ The worker-added callbacks will be executed concurrently. If one throws an
911
+ exception it will not be caught and will bubble up through [`addprocs()`](@ref).
912
+
913
+ Keep in mind that the callbacks will add to the time taken to launch workers, so
914
+ try to either keep the callbacks fast to execute, or do the actual
915
+ initialization asynchronously by spawning a task in the callback (beware of race
916
+ conditions if you do this).
893
917
"""
894
918
function add_worker_added_callback(f::Base.Callable; key=nothing)
    # Registration (including validation of `f` and `key`) is handled by the
    # shared helper; this wrapper only selects the worker-added registry.
    return _add_callback(f, key, worker_added_callbacks)
end
895
919
896
920
"""
897
921
remove_worker_added_callback(key)
898
922
899
- Remove the callback for `key`.
923
+ Remove the callback for `key` that was added with [`add_worker_added_callback()`](@ref).
900
924
"""
901
925
function remove_worker_added_callback(key)
    # Drop the entry for `key` from the worker-added registry.
    return _remove_callback(key, worker_added_callbacks)
end
902
926
@@ -905,18 +929,19 @@ remove_worker_added_callback(key) = _remove_callback(key, worker_added_callbacks
905
929
906
930
Register a callback to be called on the master process immediately before a
907
931
worker is removed with [`rmprocs()`](@ref). The callback will be called with the
908
- worker ID, e.g. `f(w::Int)`. Returns a unique key for the callback.
932
+ worker ID, e.g. `f(w::Int)`. Chooses and returns a unique key for the callback
933
+ if `key` is not specified.
909
934
910
- All callbacks will be executed asynchronously and if they don't all finish
911
- before the `callback_timeout` passed to `rmprocs()` then the process will be
912
- removed anyway.
935
+ All worker-exiting callbacks will be executed concurrently and if they don't
936
+ all finish before the `callback_timeout` passed to `rmprocs()` then the process
937
+ will be removed anyway.
913
938
"""
914
939
function add_worker_exiting_callback(f::Base.Callable; key=nothing)
    # Registration (including validation of `f` and `key`) is handled by the
    # shared helper; this wrapper only selects the worker-exiting registry.
    return _add_callback(f, key, worker_exiting_callbacks)
end
915
940
916
941
"""
917
942
remove_worker_exiting_callback(key)
918
943
919
- Remove the callback for `key`.
944
+ Remove the callback for `key` that was added with [`add_worker_exiting_callback()`](@ref).
920
945
"""
921
946
function remove_worker_exiting_callback(key)
    # Drop the entry for `key` from the worker-exiting registry.
    return _remove_callback(key, worker_exiting_callbacks)
end
922
947
@@ -926,14 +951,17 @@ remove_worker_exiting_callback(key) = _remove_callback(key, worker_exiting_callb
926
951
Register a callback to be called on the master process when a worker has exited
927
952
for any reason (i.e. not only because of [`rmprocs()`](@ref) but also the worker
928
953
segfaulting etc). The callback will be called with the worker ID,
929
- e.g. `f(w::Int)`. Returns a unique key for the callback.
954
+ e.g. `f(w::Int)`. Chooses and returns a unique key for the callback if `key` is
955
+ not specified.
956
+
957
+ If the callback throws an exception it will be caught and printed.
930
958
"""
931
959
function add_worker_exited_callback(f::Base.Callable; key=nothing)
    # Registration (including validation of `f` and `key`) is handled by the
    # shared helper; this wrapper only selects the worker-exited registry.
    return _add_callback(f, key, worker_exited_callbacks)
end
932
960
933
961
"""
934
962
remove_worker_exited_callback(key)
935
963
936
- Remove the callback for `key`.
964
+ Remove the callback for `key` that was added with [`add_worker_exited_callback()`](@ref).
937
965
"""
938
966
function remove_worker_exited_callback(key)
    # Drop the entry for `key` from the worker-exited registry.
    return _remove_callback(key, worker_exited_callbacks)
end
939
967
@@ -1176,15 +1204,17 @@ function _rmprocs(pids, waitfor, callback_timeout)
1176
1204
lock (worker_lock)
1177
1205
try
1178
1206
# Run the callbacks
1179
- callback_tasks = Task[]
1207
+ callback_tasks = Dict {Any, Task} ()
1180
1208
for pid in pids
1181
- for callback in values ( worker_exiting_callbacks)
1182
- push! ( callback_tasks, Threads. @spawn callback (pid) )
1209
+ for (name, callback) in worker_exiting_callbacks
1210
+ callback_tasks[name] = Threads. @spawn callback (pid)
1183
1211
end
1184
1212
end
1185
1213
1186
- if timedwait (() -> all (istaskdone .(callback_tasks)), callback_timeout) === :timed_out
1187
- @warn " Some callbacks timed out, continuing to remove workers anyway"
1214
+ if timedwait (() -> all (istaskdone .(values (callback_tasks))), callback_timeout) === :timed_out
1215
+ timedout_callbacks = [" '$(key) '" for (key, task) in callback_tasks if ! istaskdone (task)]
1216
+ callbacks_str = join (timedout_callbacks, " , " )
1217
+ @warn " Some worker-exiting callbacks have not yet finished, continuing to remove workers anyway. These are the callbacks still running: $(callbacks_str) "
1188
1218
end
1189
1219
1190
1220
rmprocset = Union{LocalProcess, Worker}[]
@@ -1335,8 +1365,12 @@ function deregister_worker(pg, pid)
1335
1365
1336
1366
# Call callbacks on the master
1337
1367
if myid () == 1
1338
- for callback in values (worker_exited_callbacks)
1339
- callback (pid)
1368
+ for (name, callback) in worker_exited_callbacks
1369
+ try
1370
+ callback (pid)
1371
+ catch ex
1372
+ @error " Error when running worker-exited callback '$(name) '" exception= (ex, catch_backtrace ())
1373
+ end
1340
1374
end
1341
1375
end
1342
1376
0 commit comments