Skip to content

Commit 0247db6

Browse files
committed
Revert "revert src changes for later merge with #1138 after testing"
This reverts commit a56bed6.
1 parent b223d8e commit 0247db6

1 file changed

Lines changed: 61 additions & 3 deletions

File tree

src/esm_runscripts/slurm.py

Lines changed: 61 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,21 @@ def get_jobid():
6363
return os.environ.get("SLURM_JOB_ID")
6464

6565
def prepare_launcher(self, config, cluster):
66+
# which launcher are we using?
67+
launcher = config["computer"].get("launcher",None)
68+
# friendly check that you are using a launcher that we support
69+
if launcher not in ["srun", "mpirun"]:
70+
print(" The launcher %s is not compatible with ESM-Tools in SLURM " % (launcher,))
71+
print(" Supported launchers for SLURM are srun and mpirun ")
72+
6673
# MA: not sure how this will play with heterogeneous parallelization
6774
if "multi_srun" in config["general"]:
6875
for run_type in list(config["general"]["multi_srun"]):
6976
current_hostfile = self.path + "_" + run_type
70-
write_one_hostfile(current_hostfile, config)
77+
if launcher == "srun":
78+
write_one_hostfile_srun(current_hostfile, config)
79+
elif launcher == "mpirun":
80+
write_one_hostfile_mpirun(current_hostfile, config)
7181

7282
if config["computer"].get(
7383
"heterogeneous_parallelization", False
@@ -76,7 +86,11 @@ def prepare_launcher(self, config, cluster):
7686
config["general"]["batch"].het_par_launcher_lines(config, cluster)
7787
else:
7888
# Standard/old way of running jobs with slurm
79-
self.write_one_hostfile(self.path, config)
89+
if launcher == "srun":
90+
self.write_one_hostfile_srun(self.path, config)
91+
elif launcher == "mpirun":
92+
# JK: Need to think about how to handle heterogeneous parallelisation here...
93+
self.write_one_hostfile_mpirun(self.path, config)
8094

8195
hostfile_in_work = (
8296
config["general"]["work_dir"] + "/" + os.path.basename(self.path)
@@ -85,10 +99,11 @@ def prepare_launcher(self, config, cluster):
8599

86100
return config
87101

88-
def write_one_hostfile(self, hostfile, config):
102+
def write_one_hostfile_srun(self, hostfile, config):
89103
"""
90104
Gathers previously prepared requirements
91105
(batch_system.calculate_requirements) and writes them to ``self.path``.
106+
Suitable for srun
92107
"""
93108

94109
with open(hostfile, "w") as hostfile:
@@ -112,7 +127,50 @@ def write_one_hostfile(self, hostfile, config):
112127
hostfile.write(
113128
str(start_proc) + "-" + str(end_proc) + " " + command + "\n"
114129
)
130+
131+
def write_one_hostfile_mpirun(self, hostfile, config):
    """
    Gathers previously prepared requirements
    (batch_system.calculate_requirements) and writes them to ``hostfile``.
    Suitable for mpirun launcher

    For every model listed in ``config["general"]["valid_model_names"]``
    that defines both ``start_proc`` and ``end_proc``, one
    ``-np <number of cores> ./<command>`` segment is produced. The segments
    are joined with ``:`` and written on a single line, without a trailing
    newline.

    Parameters
    ----------
    hostfile : str
        Path of the file to (over)write.
    config : dict
        Run configuration. Reads ``config["general"]["valid_model_names"]``
        and, per model, the keys ``start_proc``, ``end_proc`` and either
        ``execution_command`` (preferred) or ``executable``.
    """
    # one " -np N ./cmd " segment per model that actually needs cores
    segments = []

    for model in config["general"]["valid_model_names"]:
        model_config = config[model]
        start_proc = model_config.get("start_proc")
        end_proc = model_config.get("end_proc")

        # a model component like oasis3mct does not need cores
        # since it's technically a library.
        # So start_proc and end_proc will be None. Skip it
        if start_proc is None or end_proc is None:
            continue

        # number of cores needed (the processor range is inclusive)
        no_cpus = end_proc - start_proc + 1

        # execution_command takes precedence over executable
        if "execution_command" in model_config:
            command = "./" + model_config["execution_command"]
        elif "executable" in model_config:
            command = "./" + model_config["executable"]
        else:
            print('warning: the executable or execution_command could not be detemined for %s' % (model,))
            continue

        segments.append(" -np %d %s " % (no_cpus, command))

    # joining with ":" places the separator only *between* segments, so
    # there is no trailing ":" to strip afterwards; an empty list of
    # segments yields an empty string
    mpirun_options = ":".join(segments)

    # use a distinct name for the handle so the ``hostfile`` path argument
    # is not shadowed
    with open(hostfile, "w") as hostfile_handle:
        hostfile_handle.write(mpirun_options)
172+
173+
116174
@staticmethod
117175
def get_job_state(jobid):
118176
"""

0 commit comments

Comments
 (0)