@@ -3,6 +3,8 @@ using Logging
3
3
4
4
export SlurmManager, PBSManager, set_worker_loggers
5
5
6
"""
    worker_timeout()

Return the worker connection timeout in seconds, taken from the
`JULIA_WORKER_TIMEOUT` environment variable (default `300.0`).
"""
function worker_timeout()
    timeout_str = get(ENV, "JULIA_WORKER_TIMEOUT", "300.0")
    return parse(Float64, timeout_str)
end
7
+
6
8
"""
    get_worker_pool()

Return the pool of available workers: an empty `WorkerPool` when only the
driver process (pid 1) exists, otherwise the `default_worker_pool()`.
"""
function get_worker_pool()
    no_workers = workers() == [1]
    return no_workers ? WorkerPool() : default_worker_pool()
end
7
9
8
10
function run_worker_iteration (
@@ -21,7 +23,7 @@ function run_worker_iteration(
21
23
remotecall_wait (forward_model, w, iter, m)
22
24
end
23
25
end
24
-
26
+ isempty (all_known_workers . workers) && @info " No workers currently available "
25
27
@sync while ! isempty (work_to_do)
26
28
# Add new workers to worker_pool
27
29
all_workers = get_worker_pool ()
@@ -40,7 +42,7 @@ function run_worker_iteration(
40
42
push! (worker_pool, worker)
41
43
end
42
44
else
43
- println ( " no workers available" )
45
+ @debug " no workers available"
44
46
sleep (10 ) # Wait for workers to become available
45
47
end
46
48
end
@@ -75,7 +77,7 @@ addprocs(SlurmManager(ntasks=4))
75
77
76
78
# Pass additional arguments to `srun`
77
79
addprocs(SlurmManager(ntasks=4), gpus_per_task=1)
78
-
80
+ ```
79
81
# Related functions
80
82
- `calibrate(WorkerBackend, ...)`: Perform calibration using workers
81
83
- `remotecall(func, worker_id, args...)`: Execute functions on specific workers
@@ -100,7 +102,6 @@ function Distributed.manage(
100
102
)
101
103
if op == :register
102
104
set_worker_logger (id)
103
- evaluate_initial_expression (id, manager. expr)
104
105
end
105
106
end
106
107
@@ -313,8 +314,6 @@ Workers inherit the current Julia environment by default.
313
314
# Related Functions
314
315
- `calibrate(WorkerBackend, ...)`: Perform worker calibration
315
316
- `remotecall(func, worker_id, args...)`: Execute functions on specific workers
316
-
317
- See also: [`addprocs`](@ref), [`Distributed`](@ref), [`SlurmManager`](@ref)
318
317
"""
319
318
struct PBSManager <: ClusterManager
320
319
ntasks:: Integer
@@ -457,6 +456,7 @@ This function should be called from the worker process.
457
456
"""
458
457
function set_worker_logger ()
459
458
@eval Main using Logging
459
+ redirect_stderr (stdout )
460
460
io = open (" worker_$(myid ()) .log" , " w" )
461
461
logger = SimpleLogger (io)
462
462
Base. global_logger (logger)
@@ -478,3 +478,148 @@ function set_worker_loggers(workers = workers())
478
478
end
479
479
end
480
480
end
481
+
482
# Copied from Distributed.jl in order to evaluate the manager's expression on
# worker initialization: `manager.expr` is run on the worker right after the
# join handshake is sent and *before* the `:register` callback, so it precedes
# any other code dispatched to the worker.
"""
    Distributed.create_worker(manager::Union{SlurmManager, PBSManager}, wconfig)

Connect to and set up a newly launched worker process and return its pid.

Override of the generic `Distributed.create_worker` for our cluster managers.
Behaviorally identical to the upstream implementation except that
`evaluate_initial_expression(w.id, manager.expr)` is called immediately after
the `JoinPGRPMsg` is sent, guaranteeing the initial expression is evaluated
before any other remote calls reach the worker.

Must be called from process 1 — only the master holds the full
address:port list of all workers.
"""
function Distributed.create_worker(
    manager::Union{SlurmManager, PBSManager},
    wconfig,
)
    # only node 1 can add new nodes, since nobody else has the full list of address:port
    @assert Distributed.LPROC.id == 1
    timeout = worker_timeout()

    # initiate a connect. Does not wait for connection completion in case of TCP.
    w = Distributed.Worker()
    local r_s, w_s
    try
        (r_s, w_s) = Distributed.connect(manager, w.id, wconfig)
    catch ex
        try
            Distributed.deregister_worker(w.id)
            kill(manager, w.id, wconfig)
        finally
            rethrow(ex)
        end
    end

    w = Distributed.Worker(w.id, r_s, w_s, manager; config = wconfig)
    # install a finalizer to perform cleanup if necessary
    finalizer(w) do w
        if myid() == 1
            Distributed.manage(w.manager, w.id, w.config, :finalize)
        end
    end

    # set when the new worker has finished connections with all other workers
    ntfy_oid = Distributed.RRID()
    rr_ntfy_join = Distributed.lookup_ref(ntfy_oid)
    rr_ntfy_join.waitingfor = myid()

    # Start a new task to handle inbound messages from connected worker in master.
    # Also calls `wait_connected` on TCP streams.
    Distributed.process_messages(w.r_stream, w.w_stream, false)

    # send address information of all workers to the new worker.
    # Cluster managers set the address of each worker in `WorkerConfig.connect_at`.
    # A new worker uses this to setup an all-to-all network if topology :all_to_all is specified.
    # Workers with higher pids connect to workers with lower pids. Except process 1 (master) which
    # initiates connections to all workers.

    # Connection Setup Protocol:
    # - Master sends 16-byte cookie followed by 16-byte version string and a JoinPGRP message to all workers
    # - On each worker
    #   - Worker responds with a 16-byte version followed by a JoinCompleteMsg
    #   - Connects to all workers less than its pid. Sends the cookie, version and an IdentifySocket message
    #   - Workers with incoming connection requests write back their Version and an IdentifySocketAckMsg message
    # - On master, receiving a JoinCompleteMsg triggers rr_ntfy_join (signifies that worker setup is complete)

    join_list = []
    if Distributed.PGRP.topology === :all_to_all
        # need to wait for lower worker pids to have completed connecting, since the numerical value
        # of pids is relevant to the connection process, i.e., higher pids connect to lower pids and they
        # require the value of config.connect_at which is set only upon connection completion
        for jw in Distributed.PGRP.workers
            if (jw.id != 1) && (jw.id < w.id)
                # Check the state while holding `c_state`'s lock, matching the
                # `:custom` branch below: checking before locking raced with
                # the state transition and could miss the `c_state` notify.
                lock(jw.c_state) do
                    if (@atomic jw.state) === Distributed.W_CREATED
                        # wait for jw to join
                        wait(jw.c_state)
                    end
                end
                push!(join_list, jw)
            end
        end

    elseif Distributed.PGRP.topology === :custom
        # wait for requested workers to be up before connecting to them.
        # NOTE: `notnothing` is a Distributed-internal helper and is not
        # exported, so it must be qualified here.
        filterfunc(x) =
            (x.id != 1) &&
            isdefined(x, :config) &&
            (
                Distributed.notnothing(x.config.ident) in
                something(wconfig.connect_idents, [])
            )

        wlist = filter(filterfunc, Distributed.PGRP.workers)
        waittime = 0
        while wconfig.connect_idents !== nothing &&
              length(wlist) < length(wconfig.connect_idents)
            if waittime >= timeout
                error("peer workers did not connect within $timeout seconds")
            end
            sleep(1.0)
            waittime += 1
            wlist = filter(filterfunc, Distributed.PGRP.workers)
        end

        for wl in wlist
            lock(wl.c_state) do
                if (@atomic wl.state) === Distributed.W_CREATED
                    # wait for wl to join
                    wait(wl.c_state)
                end
            end
            push!(join_list, wl)
        end
    end

    all_locs = Base.mapany(
        x ->
            isa(x, Distributed.Worker) ?
            (something(x.config.connect_at, ()), x.id) : ((), x.id, true),
        join_list,
    )
    Distributed.send_connection_hdr(w, true)
    enable_threaded_blas = something(wconfig.enable_threaded_blas, false)

    join_message = Distributed.JoinPGRPMsg(
        w.id,
        all_locs,
        Distributed.PGRP.topology,
        enable_threaded_blas,
        Distributed.isclusterlazy(),
    )
    Distributed.send_msg_now(
        w,
        Distributed.MsgHeader(Distributed.RRID(0, 0), ntfy_oid),
        join_message,
    )

    # Ensure the initial expression is evaluated before any other code
    @info "Evaluating initial expression on worker $(w.id)"
    evaluate_initial_expression(w.id, manager.expr)

    @async Distributed.manage(w.manager, w.id, w.config, :register)

    # wait for rr_ntfy_join with timeout
    if timedwait(() -> isready(rr_ntfy_join), timeout) === :timed_out
        error("worker did not connect within $timeout seconds")
    end
    lock(Distributed.client_refs) do
        delete!(Distributed.PGRP.refs, ntfy_oid)
    end

    return w.id
end
0 commit comments