@@ -3,6 +3,8 @@ using Logging
 
 export SlurmManager, PBSManager, set_worker_loggers
 
+worker_timeout() = parse(Float64, get(ENV, "JULIA_WORKER_TIMEOUT", "300.0"))
+
 get_worker_pool() = workers() == [1] ? WorkerPool() : default_worker_pool()
 
 function run_worker_iteration(
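The new `worker_timeout()` helper reads `JULIA_WORKER_TIMEOUT` from the environment (defaulting to 300 s) and is used by the `Distributed.create_worker` method added later in this diff. A minimal usage sketch, assuming `SlurmManager` from this file is in scope; raising the value helps when queued Slurm/PBS jobs take longer than five minutes to start:

```julia
using Distributed

# JULIA_WORKER_TIMEOUT is read by worker_timeout() each time a worker is
# created, so set it before calling addprocs. "900.0" is an arbitrary example.
ENV["JULIA_WORKER_TIMEOUT"] = "900.0"

addprocs(SlurmManager(ntasks = 4))  # each worker now gets up to 900 s to connect
```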
@@ -21,7 +23,7 @@ function run_worker_iteration(
             remotecall_wait(forward_model, w, iter, m)
         end
     end
-
+    isempty(all_known_workers.workers) && @info "No workers currently available"
     @sync while !isempty(work_to_do)
         # Add new workers to worker_pool
         all_workers = get_worker_pool()
@@ -40,7 +42,7 @@ function run_worker_iteration(
                 push!(worker_pool, worker)
             end
         else
-            println("no workers available")
+            @debug "no workers available"
             sleep(10) # Wait for workers to become available
         end
     end
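Because the message is now emitted with `@debug` instead of `println`, it is hidden unless debug logging is enabled for the module that owns `run_worker_iteration`. A quick sketch of how to surface it while diagnosing an idle pool (the module name below is illustrative, not taken from this diff):

```julia
# Enable debug-level messages via the environment variable...
ENV["JULIA_DEBUG"] = "MyCalibrationModule"  # illustrative module name

# ...or programmatically, with the Logging stdlib already used by this file.
using Logging
global_logger(ConsoleLogger(stderr, Logging.Debug))
```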
@@ -75,7 +77,7 @@ addprocs(SlurmManager(ntasks=4))
 
 # Pass additional arguments to `srun`
 addprocs(SlurmManager(ntasks=4), gpus_per_task=1)
-
+```
 # Related functions
 - `calibrate(WorkerBackend, ...)`: Perform calibration using workers
 - `remotecall(func, worker_id, args...)`: Execute functions on specific workers
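As a companion to the related-functions list above, a short sketch of the `remotecall` workflow, assuming the `SlurmManager` usage shown in the docstring (`calibrate(WorkerBackend, ...)` is omitted since its full signature is not shown here):

```julia
using Distributed
addprocs(SlurmManager(ntasks = 4))

# Execute a function on specific workers and collect the results.
futures = [remotecall(myid, w) for w in workers()]
worker_ids = fetch.(futures)   # == workers()
```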
@@ -100,7 +102,6 @@ function Distributed.manage(
 )
     if op == :register
         set_worker_logger(id)
-        evaluate_initial_expression(id, manager.expr)
     end
 end
 
@@ -313,8 +314,6 @@ Workers inherit the current Julia environment by default.
 # Related Functions
 - `calibrate(WorkerBackend, ...)`: Perform worker calibration
 - `remotecall(func, worker_id, args...)`: Execute functions on specific workers
-
-See also: [`addprocs`](@ref), [`Distributed`](@ref), [`SlurmManager`](@ref)
 """
 struct PBSManager <: ClusterManager
     ntasks::Integer
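For symmetry with the Slurm example earlier, a hedged sketch of PBS usage; the keyword-style constructor is assumed to mirror `SlurmManager(ntasks=4)` and is not shown in this hunk:

```julia
using Distributed

# Assumed to mirror the SlurmManager constructor shown earlier in this file.
addprocs(PBSManager(ntasks = 4))
remotecall_fetch(myid, workers()[1])
```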
@@ -478,3 +477,148 @@ function set_worker_loggers(workers = workers())
         end
     end
 end
+
+# Copied from Distributed.jl in order to evaluate the manager's expression on worker initialization
+function Distributed.create_worker(
+    manager::Union{SlurmManager, PBSManager},
+    wconfig,
+)
+    # only node 1 can add new nodes, since nobody else has the full list of address:port
+    @assert Distributed.LPROC.id == 1
+    timeout = worker_timeout()
+
+    # initiate a connect. Does not wait for connection completion in case of TCP.
+    w = Distributed.Worker()
+    local r_s, w_s
+    try
+        (r_s, w_s) = Distributed.connect(manager, w.id, wconfig)
+    catch ex
+        try
+            Distributed.deregister_worker(w.id)
+            kill(manager, w.id, wconfig)
+        finally
+            rethrow(ex)
+        end
+    end
+
+    w = Distributed.Worker(w.id, r_s, w_s, manager; config = wconfig)
+    # install a finalizer to perform cleanup if necessary
+    finalizer(w) do w
+        if myid() == 1
+            Distributed.manage(w.manager, w.id, w.config, :finalize)
+        end
+    end
+
+    # set when the new worker has finished connections with all other workers
+    ntfy_oid = Distributed.RRID()
+    rr_ntfy_join = Distributed.lookup_ref(ntfy_oid)
+    rr_ntfy_join.waitingfor = myid()
+
+    # Start a new task to handle inbound messages from connected worker in master.
+    # Also calls `wait_connected` on TCP streams.
+    Distributed.process_messages(w.r_stream, w.w_stream, false)
+
+    # send address information of all workers to the new worker.
+    # Cluster managers set the address of each worker in `WorkerConfig.connect_at`.
+    # A new worker uses this to setup an all-to-all network if topology :all_to_all is specified.
+    # Workers with higher pids connect to workers with lower pids. Except process 1 (master) which
+    # initiates connections to all workers.
+
+    # Connection Setup Protocol:
+    # - Master sends 16-byte cookie followed by 16-byte version string and a JoinPGRP message to all workers
+    # - On each worker
+    #   - Worker responds with a 16-byte version followed by a JoinCompleteMsg
+    #   - Connects to all workers less than its pid. Sends the cookie, version and an IdentifySocket message
+    #   - Workers with incoming connection requests write back their Version and an IdentifySocketAckMsg message
+    # - On master, receiving a JoinCompleteMsg triggers rr_ntfy_join (signifies that worker setup is complete)
+
+    join_list = []
+    if Distributed.PGRP.topology === :all_to_all
+        # need to wait for lower worker pids to have completed connecting, since the numerical value
+        # of pids is relevant to the connection process, i.e., higher pids connect to lower pids and they
+        # require the value of config.connect_at which is set only upon connection completion
+        for jw in Distributed.PGRP.workers
+            if (jw.id != 1) && (jw.id < w.id)
+                # wait for wl to join
+                # We should access this atomically using (@atomic jw.state)
+                # but this is only recently supported
+                if jw.state === Distributed.W_CREATED
+                    lock(jw.c_state) do
+                        wait(jw.c_state)
+                    end
+                end
+                push!(join_list, jw)
+            end
+        end
+
+    elseif Distributed.PGRP.topology === :custom
+        # wait for requested workers to be up before connecting to them.
+        filterfunc(x) =
+            (x.id != 1) &&
+            isdefined(x, :config) &&
+            (
+                notnothing(x.config.ident) in
+                something(wconfig.connect_idents, [])
+            )
+
+        wlist = filter(filterfunc, Distributed.PGRP.workers)
+        waittime = 0
+        while wconfig.connect_idents !== nothing &&
+              length(wlist) < length(wconfig.connect_idents)
+            if waittime >= timeout
+                error("peer workers did not connect within $timeout seconds")
+            end
+            sleep(1.0)
+            waittime += 1
+            wlist = filter(filterfunc, Distributed.PGRP.workers)
+        end
+
+        for wl in wlist
+            lock(wl.c_state) do
+                if (@atomic wl.state) === Distributed.W_CREATED
+                    # wait for wl to join
+                    wait(wl.c_state)
+                end
+            end
+            push!(join_list, wl)
+        end
+    end
+
+    all_locs = Base.mapany(
+        x ->
+            isa(x, Distributed.Worker) ?
+            (something(x.config.connect_at, ()), x.id) : ((), x.id, true),
+        join_list,
+    )
+    Distributed.send_connection_hdr(w, true)
+    enable_threaded_blas = something(wconfig.enable_threaded_blas, false)
+
+    join_message = Distributed.JoinPGRPMsg(
+        w.id,
+        all_locs,
+        Distributed.PGRP.topology,
+        enable_threaded_blas,
+        Distributed.isclusterlazy(),
+    )
+    Distributed.send_msg_now(
+        w,
+        Distributed.MsgHeader(Distributed.RRID(0, 0), ntfy_oid),
+        join_message,
+    )
+
+    # Ensure the initial expression is evaluated before any other code
+    @info "Evaluating initial expression on worker $(w.id)"
+    evaluate_initial_expression(w.id, manager.expr)
+
+    @async Distributed.manage(w.manager, w.id, w.config, :register)
+
+    # wait for rr_ntfy_join with timeout
+    if timedwait(() -> isready(rr_ntfy_join), timeout) === :timed_out
+        error("worker did not connect within $timeout seconds")
+    end
+    lock(Distributed.client_refs) do
+        delete!(Distributed.PGRP.refs, ntfy_oid)
+    end
+
+    return w.id
+end
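As a standalone illustration of the join-timeout mechanism at the end of the copied `create_worker`, here is a minimal sketch of the same `timedwait` pattern; `setup_done` stands in for `isready(rr_ntfy_join)`:

```julia
# Poll a readiness condition and abort if it does not become true in time,
# mirroring how create_worker waits for the worker's JoinCompleteMsg.
timeout = parse(Float64, get(ENV, "JULIA_WORKER_TIMEOUT", "300.0"))

setup_done = Threads.Atomic{Bool}(false)
@async begin
    sleep(2.0)              # stand-in for the worker finishing its handshake
    setup_done[] = true
end

if timedwait(() -> setup_done[], timeout) === :timed_out
    error("worker did not connect within $timeout seconds")
end
```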