Skip to content

Commit 7a92892

Browse files
authored
Merge pull request #312 from casparvl/adapt_arch_target_map
Adapt arch target map
2 parents 5065c16 + 2f3c0ae commit 7a92892

File tree

12 files changed

+595
-154
lines changed

12 files changed

+595
-154
lines changed

app.cfg.example

Lines changed: 55 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -305,19 +305,59 @@ signing =
305305

306306

307307
[architecturetargets]
308-
# defines for which architectures the bot will build and what job submission
309-
# parameters shall be used to allocate a compute node with the correct
310-
arch_target_map = {
311-
"linux/x86_64/generic": "--partition x86-64-generic-node",
312-
"linux/x86_64/amd/zen2": "--partition x86-64-amd-zen2-node" }
308+
# arch_target_map has been replaced by node_type_map
309+
# arch_target_map = {
310+
# }
313311

312+
# Each entry in the node_type_map dictionary describes a build node type. The key is a (descriptive) name for this build node, and its value is a dictionary containing the following build node properties as key-value pairs:
313+
#   - os: its operating system (os)
314+
#   - cpu_subdir: its CPU architecture
315+
#   - slurm_params: the Slurm parameters that need to be passed to submit jobs to it
316+
#   - repo_targets: supported repository targets for this node type
317+
#   - accel (optional): which accelerators this node has
318+
# All are strings, except repo_targets, which is a list of strings.
319+
# Note that the Slurm parameters should typically be chosen such that a single type of node (with one specific type of
320+
# CPU and one specific type of GPU) is allocated.
321+
# Below is an example configuration for a system that contains 4 types of nodes: zen2 CPU nodes, zen4 CPU nodes,
322+
# GPU nodes with an icelake CPU and A100 GPU, GPU nodes with a zen4 CPU and an H100 GPU.
323+
# The 'on:' argument to the bot build command determines which node type will be allocated for the build job,
324+
# e.g. 'bot:build on:arch=zen4,accel=nvidia/cc90 for:...' will match the gpu_h100 node type below.
325+
# If no 'on:' argument is passed to the build command, the 'for:' argument is used instead,
326+
# e.g. 'bot:build for:arch=icelake,accel=nvidia/cc80' will match the gpu_a100 node type below.
327+
node_type_map = {
328+
"cpu_zen2": {
329+
"os": "linux",
330+
"cpu_subdir": "x86_64/amd/zen2",
331+
"slurm_params": "-p rome --nodes 1 --ntasks-per-node 16 --cpus-per-task 1",
332+
"repo_targets": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"]
333+
},
334+
"cpu_zen4": {
335+
"os": "linux",
336+
"cpu_subdir": "x86_64/amd/zen4",
337+
"accel": "None",
338+
"slurm_params": "-p genoa --nodes 1 --ntasks-per-node 24 --cpus-per-task 1",
339+
"repo_targets": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"]
340+
},
341+
"gpu_a100": {
342+
"os": "linux",
343+
"cpu_subdir": "x86_64/intel/icelake",
344+
"accel": "nvidia/cc80",
345+
"slurm_params": "-p gpu_a100 --nodes 1 --tasks-per-node 18 --cpus-per-task 1 --gpus-per-node 1",
346+
"repo_targets": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"]
347+
},
348+
"gpu_h100": {
349+
"os": "linux",
350+
"cpu_subdir": "x86_64/amd/zen4",
351+
"accel": "nvidia/cc90",
352+
"slurm_params": "-p gpu_h100 --nodes 1 --tasks-per-node 16 --cpus-per-task 1 --gpus-per-node 1",
353+
"repo_targets": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"]
354+
}}
314355

315356
[repo_targets]
316-
# defines for which repository a arch_target should be build for
317-
#
318-
# EESSI/2023.06 and EESSI/2025.06
319-
repo_target_map = {
320-
"linux/x86_64/amd/zen2" : ["eessi.io-2023.06-software","eessi.io-2025.06-software"] }
357+
358+
# No longer used, repo targets are now specified per node type in the node_type_map
359+
# repo_target_map = {
360+
# "linux/x86_64/amd/zen2" : ["eessi.io-2023.06-software","eessi.io-2025.06-software"] }
321361

322362
# points to definition of repositories (default repository defined by build container)
323363
repos_cfg_dir = PATH_TO_SHARED_DIRECTORY/repos
@@ -360,8 +400,12 @@ scontrol_command = /usr/bin/scontrol
360400
# awaits_release = job id `{job_id}` awaits release by job manager
361401
awaits_release_delayed_begin_msg = job id `{job_id}` will be eligible to start in about {delay_seconds} seconds
362402
awaits_release_hold_release_msg = job id `{job_id}` awaits release by job manager
363-
initial_comment = New job on instance `{app_name}` for CPU micro-architecture `{arch_name}`{accelerator_spec} for repository `{repo_id}` in job dir `{symlink}`
403+
new_job_instance_repo = New job on instance `{app_name}` for repository `{repo_id}`
404+
build_on_arch = Building on: `{on_arch}`{on_accelerator}
405+
build_for_arch = Building for: `{for_arch}`{for_accelerator}
406+
jobdir = Job dir: `{symlink}`
364407
with_accelerator =  and accelerator `{accelerator}`
408+
# initial_comment = New job on instance `{app_name}` for repository `{repo_id}`\nBuilding on: `{on_arch}`{on_accelerator}\nBuilding for: `{for_arch}`{for_accelerator}\nJob dir: `{symlink}` # no longer used
365409

366410

367411
[new_job_comments]

eessi_bot_event_handler.py

Lines changed: 35 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@
2929

3030
# Local application imports (anything from EESSI/eessi-bot-software-layer)
3131
from connections import github
32-
from tasks.build import check_build_permission, get_architecture_targets, get_repo_cfg, \
33-
request_bot_build_issue_comments, submit_build_jobs
32+
from tasks.build import check_build_permission, get_node_types, request_bot_build_issue_comments, \
33+
submit_build_jobs
3434
from tasks.deploy import deploy_built_artefacts, determine_job_dirs
3535
from tasks.clean_up import move_to_trash_bin
3636
from tools import config
@@ -43,7 +43,7 @@
4343

4444
REQUIRED_CONFIG = {
4545
config.SECTION_ARCHITECTURETARGETS: [
46-
config.ARCHITECTURETARGETS_SETTING_ARCH_TARGET_MAP], # required
46+
config.NODE_TYPE_MAP], # required
4747
config.SECTION_BOT_CONTROL: [
4848
# config.BOT_CONTROL_SETTING_CHATLEVEL, # optional
4949
config.BOT_CONTROL_SETTING_COMMAND_PERMISSION, # required
@@ -104,10 +104,12 @@
104104
config.SECTION_JOB_MANAGER: [
105105
config.JOB_MANAGER_SETTING_POLL_INTERVAL], # required
106106
config.SECTION_REPO_TARGETS: [
107-
config.REPO_TARGETS_SETTING_REPO_TARGET_MAP, # required
108107
config.REPO_TARGETS_SETTING_REPOS_CFG_DIR], # required
109108
config.SECTION_SUBMITTED_JOB_COMMENTS: [
110-
config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT, # required
109+
config.SUBMITTED_JOB_COMMENTS_SETTING_INSTANCE_REPO, # required
110+
config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_ON_ARCH, # required
111+
config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_FOR_ARCH, # required
112+
config.SUBMITTED_JOB_COMMENTS_SETTING_JOBDIR, # required
111113
# config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE, # optional
112114
config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_DELAYED_BEGIN_MSG, # required
113115
config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_HOLD_RELEASE_MSG, # required
@@ -411,23 +413,21 @@ def handle_pull_request_opened_event(self, event_info, pr, req_chatlevel=ChatLev
411413
app_name = self.cfg[config.SECTION_GITHUB][config.GITHUB_SETTING_APP_NAME]
412414
# TODO check if PR already has a comment with arch targets and
413415
# repositories
414-
arch_map = get_architecture_targets(self.cfg)
415-
repo_cfg = get_repo_cfg(self.cfg)
416-
417-
comment = f"Instance `{app_name}` is configured to build for:"
418-
architectures = ['/'.join(arch.split('/')[1:]) for arch in arch_map.keys()]
419-
comment += "\n- architectures: "
420-
if len(architectures) > 0:
421-
comment += f"{', '.join([f'`{arch}`' for arch in architectures])}"
422-
else:
423-
comment += "none"
424-
repositories = list(set([repo_id for repo_ids in repo_cfg[config.REPO_TARGETS_SETTING_REPO_TARGET_MAP].values()
425-
for repo_id in repo_ids]))
426-
comment += "\n- repositories: "
427-
if len(repositories) > 0:
428-
comment += f"{', '.join([f'`{repo_id}`' for repo_id in repositories])}"
429-
else:
430-
comment += "none"
416+
node_map = get_node_types(self.cfg)
417+
418+
comment = f"Instance `{app_name}` is configured to build on:"
419+
for node in node_map:
420+
comment += f"\n- Node type `{node}`:"
421+
current_node_type = node_map[node]
422+
if "os" in current_node_type:
423+
comment += f"\n - OS: `{current_node_type['os']}`"
424+
if "cpu_subdir" in current_node_type:
425+
comment += f"\n - CPU architecture: `{current_node_type['cpu_subdir']}`"
426+
if "repo_targets" in current_node_type:
427+
comment += f"\n - Repositories: `{current_node_type['repo_targets']}`"
428+
if "accel" in current_node_type:
429+
comment += f"\n - Accelerators: `{current_node_type['accel']}`"
430+
comment += "\n"
431431

432432
self.log(f"PR opened: comment '{comment}'")
433433

@@ -532,7 +532,7 @@ def handle_bot_command_build(self, event_info, bot_command):
532532
build_msg = ''
533533
if check_build_permission(pr, event_info):
534534
# use filter from command
535-
submitted_jobs = submit_build_jobs(pr, event_info, bot_command.action_filters)
535+
submitted_jobs = submit_build_jobs(pr, event_info, bot_command.action_filters, bot_command.build_params)
536536
if submitted_jobs is None or len(submitted_jobs) == 0:
537537
build_msg = "\n - no jobs were submitted"
538538
else:
@@ -578,8 +578,8 @@ def handle_bot_command_status(self, event_info, bot_command):
578578
bot_command (EESSIBotCommand): command to be handled
579579
580580
Returns:
581-
github.IssueComment.IssueComment (note, github refers to
582-
PyGithub, not the github from the internal connections module)
581+
(string): list item with a link to the issue comment that was created
582+
containing the status overview
583583
"""
584584
self.log("processing bot command 'status'")
585585
repo_name = event_info['raw_request_body']['repository']['full_name']
@@ -588,18 +588,23 @@ def handle_bot_command_status(self, event_info, bot_command):
588588

589589
comment_status = ''
590590
comment_status += "\nThis is the status of all the `bot: build` commands:"
591-
comment_status += "\n|arch|result|date|status|url|"
592-
comment_status += "\n|----|------|----|------|---|"
591+
comment_status += "\n|on|for|repo|result|date|status|url|"
592+
comment_status += "\n|----|----|----|------|----|------|---|"
593593
for x in range(0, len(status_table['date'])):
594-
comment_status += f"\n|{status_table['arch'][x]}|"
594+
comment_status += f"\n|{status_table['on arch'][x]}|"
595+
comment_status += f"{status_table['for arch'][x]}|"
596+
comment_status += f"{status_table['for repo'][x]}|"
595597
comment_status += f"{status_table['result'][x]}|"
596598
comment_status += f"{status_table['date'][x]}|"
597599
comment_status += f"{status_table['status'][x]}|"
598600
comment_status += f"{status_table['url'][x]}|"
599601

600602
self.log(f"Overview of finished builds: comment '{comment_status}'")
601603
issue_comment = create_comment(repo_name, pr_number, comment_status, ChatLevels.MINIMAL)
602-
return issue_comment
604+
if issue_comment:
605+
return f"\n - added status comment {issue_comment.html_url}"
606+
else:
607+
return "\n - failed to create status comment"
603608

604609
def start(self, app, port=3000):
605610
"""
@@ -692,7 +697,7 @@ def main():
692697
opts = event_handler_parse()
693698

694699
# config is read and checked for settings to raise an exception early when the event_handler starts.
695-
if config.check_required_cfg_settings(REQUIRED_CONFIG):
700+
if config.check_cfg_settings(REQUIRED_CONFIG):
696701
print("Configuration check: PASSED")
697702
else:
698703
print("Configuration check: FAILED")

eessi_bot_job_manager.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -623,7 +623,7 @@ def main():
623623

624624
# config is read and checked for settings to raise an exception early when
625625
# the job_manager runs
626-
if config.check_required_cfg_settings(REQUIRED_CONFIG):
626+
if config.check_cfg_settings(REQUIRED_CONFIG):
627627
print("Configuration check: PASSED")
628628
else:
629629
print("Configuration check: FAILED")

0 commit comments

Comments
 (0)