From 38e5fd341ae8d8d279f3acbb7d8063477cfe361a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 8 May 2025 20:54:46 +0000 Subject: [PATCH 01/10] PoC of automating partition/nodegroup config --- .../inventory/group_vars/all/z_partitions.yml | 18 ------------------ .../inventory/group_vars/all/openhpc.yml | 17 +++++++++++++++-- .../tofu/inventory.tf | 9 --------- .../tofu/inventory.tpl | 1 + .../tofu/partitions.tpl | 5 ----- requirements.yml | 2 +- 6 files changed, 17 insertions(+), 35 deletions(-) delete mode 100755 environments/.stackhpc/inventory/group_vars/all/z_partitions.yml delete mode 100644 environments/skeleton/{{cookiecutter.environment}}/tofu/partitions.tpl diff --git a/environments/.stackhpc/inventory/group_vars/all/z_partitions.yml b/environments/.stackhpc/inventory/group_vars/all/z_partitions.yml deleted file mode 100755 index ea489770e..000000000 --- a/environments/.stackhpc/inventory/group_vars/all/z_partitions.yml +++ /dev/null @@ -1,18 +0,0 @@ -# override tofu-generated file: -openhpc_slurm_partitions: - - name: extra - - name: standard - - name: rebuild - groups: - - name: extra - - name: standard - default: NO - maxtime: 30 - partition_params: - PriorityJobFactor: 65533 - Hidden: YES - RootOnly: YES - DisableRootJobs: NO - PreemptMode: 'OFF' - OverSubscribe: EXCLUSIVE - diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index bcda89b56..f93aa605a 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -15,8 +15,21 @@ openhpc_slurmdbd_mysql_password: "{{ vault_mysql_slurm_password }}" openhpc_slurmdbd_mysql_username: slurm openhpc_slurm_control_host: "{{ groups['control'] | first }}" # avoid using hostvars for compute-init openhpc_slurmdbd_host: "{{ openhpc_slurm_control_host }}" -openhpc_slurm_partitions: - - name: "compute" +openhpc_rebuild_partition: # not a role var - could actually 
add more indirection here for things we're expecting to be modified, e.g. groups and maxtime + name: rebuild + groups: "{{ cluster_compute_groups }}" + default: NO + maxtime: 30 + partition_params: + PriorityJobFactor: 65533 + Hidden: YES + RootOnly: YES + DisableRootJobs: NO + PreemptMode: 'OFF' + OverSubscribe: EXCLUSIVE +openhpc_nodegroups: "{{ cluster_compute_groups | map('community.general.dict_kv', 'name') }}" # create nodegroup for each compute group +openhpc_user_partitions: "{{ openhpc_nodegroups }}" # create partition for each nodegroup (actually role default) - this is what we'd expect to be changed +openhpc_partitions: "{{ openhpc_user_partitions + [openhpc_rebuild_partition] if groups['rebuild'] | length > 0 else [] }}" # auto-create rebuild partition if reqd. openhpc_packages_default: # system packages - podman diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tf index 0af7eb30b..2259d0415 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tf @@ -11,12 +11,3 @@ resource "local_file" "hosts" { ) filename = "../inventory/hosts.yml" } - -resource "local_file" "partitions" { - content = templatefile("${path.module}/partitions.tpl", - { - "compute_groups": module.compute, - }, - ) - filename = "../inventory/group_vars/all/partitions.yml" # as all/ is created by skeleton -} diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tpl b/environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tpl index ae3bcbc40..1e0b46f40 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tpl +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tpl @@ -2,6 +2,7 @@ all: vars: openhpc_cluster_name: ${cluster_name} cluster_domain_suffix: ${cluster_domain_suffix} + cluster_compute_groups: 
${jsonencode(keys(compute_groups))} control: hosts: diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/partitions.tpl b/environments/skeleton/{{cookiecutter.environment}}/tofu/partitions.tpl deleted file mode 100644 index 6fa1bae39..000000000 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/partitions.tpl +++ /dev/null @@ -1,5 +0,0 @@ -# Generated by terraform -openhpc_slurm_partitions: -%{ for group_name in keys(compute_groups) ~} - - name: ${group_name} -%{ endfor ~} diff --git a/requirements.yml b/requirements.yml index 87b2a6263..bd470f3e1 100644 --- a/requirements.yml +++ b/requirements.yml @@ -4,7 +4,7 @@ roles: version: v25.3.2 name: stackhpc.nfs - src: https://github.com/stackhpc/ansible-role-openhpc.git - version: v0.28.0 + version: feat/nodegroups # TODO: bump to release name: stackhpc.openhpc - src: https://github.com/stackhpc/ansible-node-exporter.git version: stackhpc From 3bf7db9d19190caba08b4725e7ec7f5775f09e71 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 9 May 2025 08:43:50 +0000 Subject: [PATCH 02/10] update rebuild docs --- docs/experimental/slurm-controlled-rebuild.md | 43 ++++--------------- 1 file changed, 9 insertions(+), 34 deletions(-) diff --git a/docs/experimental/slurm-controlled-rebuild.md b/docs/experimental/slurm-controlled-rebuild.md index 30754efa7..7f9efa22c 100644 --- a/docs/experimental/slurm-controlled-rebuild.md +++ b/docs/experimental/slurm-controlled-rebuild.md @@ -107,42 +107,17 @@ The configuration of this is complex and involves: defined in the `compute` or `login` variables, to override the default image for specific node groups. -5. Modify `openhpc_slurm_partitions` to add a new partition covering rebuildable - nodes to use for for rebuild jobs. If using the default OpenTofu - configurations, this variable is contained in an OpenTofu-templated file - `environments/$ENV/group_vars/all/partitions.yml` which must be overriden - by copying it to e.g. 
a `z_partitions.yml` file in the same directory. - However production sites will probably be overriding this file anyway to - customise it. - - An example partition definition, given the two node groups "general" and - "gpu" shown in Step 2, is: - - ```yaml - openhpc_slurm_partitions: - ... - - name: rebuild - groups: - - name: general - - name: gpu - default: NO - maxtime: 30 - partition_params: - PriorityJobFactor: 65533 - Hidden: YES - RootOnly: YES - DisableRootJobs: NO - PreemptMode: 'OFF' - OverSubscribe: EXCLUSIVE - ``` - - Which has parameters as follows: +5. Ensure `openhpc_partitions` contains a partition covering the nodes to run + rebuild jobs. The default definition in `environments/common/inventory/group_vars/all/openhpc.yml` + will automatically include this via `openhpc_rebuild_partition` also in that + file. If modifying this, note the important parameters are: + - `name`: Partition name matching `rebuild` role variable `rebuild_partitions`, default `rebuild`. - - `groups`: A list of node group names, matching keys in the OpenTofu - `compute` variable (see example in step 2 above). Normally every compute - node group should be listed here, unless Slurm-controlled rebuild is not - required for certain node groups. + - `nodegroups`: A list of nodegroup names, matching `openhpc_nodegroups` and + keys in the OpenTofu `compute` variable (see example in step 2 above). + Normally every compute node group should be listed here, unless + Slurm-controlled rebuild is not required for certain node groups. - `default`: Must be set to `NO` so that it is not the default partition. - `maxtime`: Maximum time to allow for rebuild jobs, in [slurm.conf format](https://slurm.schedmd.com/slurm.conf.html#OPT_MaxTime). 
From d730da6735b792a9b27a44d66d2f4aa0945d2ea8 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 9 May 2025 11:58:43 +0000 Subject: [PATCH 03/10] fixup ondemand partitions for openhpc_partitions --- ansible/roles/openondemand/README.md | 4 ++-- environments/common/inventory/group_vars/all/openondemand.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ansible/roles/openondemand/README.md b/ansible/roles/openondemand/README.md index 365265df0..099276c7e 100644 --- a/ansible/roles/openondemand/README.md +++ b/ansible/roles/openondemand/README.md @@ -59,10 +59,10 @@ This role enables SSL on the Open Ondemand server, using the following self-sign - `new_window`: Optional. Whether to open link in new window. Bool, default `false`. - `app_name`: Optional. Unique name for app appended to `/var/www/ood/apps/sys/`. Default is `name`, useful if that is not unique or not suitable as a path component. - `openondemand_dashboard_support_url`: Optional. URL or email etc to show as support contact under Help in dashboard. Default `(undefined)`. -- `openondemand_desktop_partition`: Optional. Name of Slurm partition to use for remote desktops. Requires a corresponding group named "openondemand_desktop" and entry in openhpc_slurm_partitions. +- `openondemand_desktop_partition`: Optional. Name of Slurm partition to use for remote desktops. Requires a corresponding group named "openondemand_desktop" and entry in openhpc_partitions. - `openondemand_desktop_screensaver`: Optional. Whether to enable screen locking/screensaver. **NB:** Users must have passwords if this is enabled. Bool, default `false`. - `openondemand_filesapp_paths`: List of paths (in addition to $HOME, which is always added) to include shortcuts to within the Files dashboard app. -- `openondemand_jupyter_partition`: Required. Name of Slurm partition to use for Jupyter Notebook servers. Requires a corresponding group named "openondemand_jupyter" and entry in openhpc_slurm_partitions. 
+- `openondemand_jupyter_partition`: Required. Name of Slurm partition to use for Jupyter Notebook servers. Requires a corresponding group named "openondemand_jupyter" and entry in openhpc_partitions. ### Monitoring - `openondemand_exporter`: Optional. Install the Prometheus [ondemand_exporter](https://github.com/OSC/ondemand_exporter) on the `openondemand` node to export metrics about Open Ondemand itself. Default `true`. diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml index 38b2b3a67..2df138072 100644 --- a/environments/common/inventory/group_vars/all/openondemand.yml +++ b/environments/common/inventory/group_vars/all/openondemand.yml @@ -12,8 +12,8 @@ openondemand_servername: "{{ hostvars[groups['openondemand'].0].ansible_host if openondemand_auth: basic_pam -openondemand_jupyter_partition: "{{ openhpc_slurm_partitions[0]['name'] }}" -openondemand_desktop_partition: "{{ openhpc_slurm_partitions[0]['name'] }}" +openondemand_jupyter_partition: "{{ openhpc_partitions[0]['name'] }}" +openondemand_desktop_partition: "{{ openhpc_partitions[0]['name'] }}" # Regex defining hosts which openondemand can proxy; the default regex is compute nodes (for apps) and grafana host, # e.g. if the group `compute` has hosts `compute-{0,1,2,..}` this will be '(compute-\d+)|(control)'. From 4e641d15850670c4cd4cc30f6372c577bd054314 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 9 May 2025 12:00:50 +0000 Subject: [PATCH 04/10] fixup rebuild role docs for openhpc_partitions --- ansible/roles/rebuild/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/rebuild/README.md b/ansible/roles/rebuild/README.md index 58cb26502..4e4e87a8e 100644 --- a/ansible/roles/rebuild/README.md +++ b/ansible/roles/rebuild/README.md @@ -23,7 +23,7 @@ The below are only used by this role's `rebuild.yml` task file, i.e. 
when running the `ansible/adhoc/rebuild-via-slurm.yml` playbook: - `rebuild_job_partitions`: Optional. Comma-separated list of names of rebuild - partitions defined in `openhpc_slurm_partitions`. Useful as an extra-var for + partitions defined in `openhpc_partitions`. Useful as an extra-var for limiting rebuilds. Default `rebuild`. - `rebuild_job_name`: Optional. Name of rebuild jobs. Default is `rebuild-` From 5e5b389f9a0fec25ad1e9b2bccfdb6e4a61792db Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 9 May 2025 12:03:39 +0000 Subject: [PATCH 05/10] fix caas for openhpc_partitions/openhpc_nodegroups --- ansible/roles/cluster_infra/templates/outputs.tf.j2 | 2 +- ansible/roles/cluster_infra/templates/resources.tf.j2 | 4 ++-- environments/.caas/inventory/group_vars/all/openhpc.yml | 4 ++-- environments/.caas/inventory/group_vars/all/openondemand.yml | 4 ++-- environments/.caas/inventory/group_vars/openstack.yml | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ansible/roles/cluster_infra/templates/outputs.tf.j2 b/ansible/roles/cluster_infra/templates/outputs.tf.j2 index 4d894a1dd..33af5d5b8 100644 --- a/ansible/roles/cluster_infra/templates/outputs.tf.j2 +++ b/ansible/roles/cluster_infra/templates/outputs.tf.j2 @@ -32,7 +32,7 @@ output "cluster_nodes" { } } ], - {% for partition in openhpc_slurm_partitions %} + {% for partition in openhpc_partitions %} [ for compute in openstack_compute_instance_v2.{{ partition.name }}: { name = compute.name diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2 index 69d001105..7b2e70e65 100644 --- a/ansible/roles/cluster_infra/templates/resources.tf.j2 +++ b/ansible/roles/cluster_infra/templates/resources.tf.j2 @@ -282,7 +282,7 @@ resource "openstack_networking_port_v2" "control_storage" { ### # Workers ### -{% for partition in openhpc_slurm_partitions %} +{% for partition in openhpc_partitions %} # Primary network resource 
"openstack_networking_port_v2" "{{ partition.name }}" { count = {{ partition.count }} @@ -499,7 +499,7 @@ resource "openstack_compute_instance_v2" "control" { } } -{% for partition in openhpc_slurm_partitions %} +{% for partition in openhpc_partitions %} resource "openstack_compute_instance_v2" "{{ partition.name }}" { count = {{ partition.count }} diff --git a/environments/.caas/inventory/group_vars/all/openhpc.yml b/environments/.caas/inventory/group_vars/all/openhpc.yml index 624402f9f..74f196c6f 100644 --- a/environments/.caas/inventory/group_vars/all/openhpc.yml +++ b/environments/.caas/inventory/group_vars/all/openhpc.yml @@ -1,5 +1,5 @@ openhpc_cluster_name: "{{ cluster_name }}" -# Provision a single "standard" compute partition using the supplied +# Provision a single "standard" compute nodegroup using the supplied # node count and flavor -openhpc_slurm_partitions: "{{ hostvars[groups['openstack'][0]]['openhpc_slurm_partitions'] }}" +openhpc_nodegroups: "{{ hostvars[groups['openstack'][0]]['openhpc_nodegroups'] }}" diff --git a/environments/.caas/inventory/group_vars/all/openondemand.yml b/environments/.caas/inventory/group_vars/all/openondemand.yml index 60461bd61..4dc0b9337 100644 --- a/environments/.caas/inventory/group_vars/all/openondemand.yml +++ b/environments/.caas/inventory/group_vars/all/openondemand.yml @@ -1,7 +1,7 @@ --- openondemand_auth: basic_pam -openondemand_jupyter_partition: "{{ openhpc_slurm_partitions[0]['name'] }}" -openondemand_desktop_partition: "{{ openhpc_slurm_partitions[0]['name'] }}" +openondemand_jupyter_partition: "{{ openhpc_partitions[0]['name'] }}" +openondemand_desktop_partition: "{{ openhpc_partitions[0]['name'] }}" httpd_listen_addr_port: - 80 diff --git a/environments/.caas/inventory/group_vars/openstack.yml b/environments/.caas/inventory/group_vars/openstack.yml index 89445b394..f76c05033 100644 --- a/environments/.caas/inventory/group_vars/openstack.yml +++ b/environments/.caas/inventory/group_vars/openstack.yml @@ 
-16,9 +16,9 @@ terraform_project_path: "{{ playbook_dir }}/terraform" terraform_state: "{{ cluster_state | default('present') }}" cluster_ssh_user: rocky -# Provision a single "standard" compute partition using the supplied +# Provision a single "standard" compute nodegroup using the supplied # node count and flavor -openhpc_slurm_partitions: +openhpc_nodegroups: - name: "standard" count: "{{ compute_count }}" flavor: "{{ compute_flavor }}" From 1f99e4acb1895730245f70d48b5173aa56642ef1 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 9 May 2025 12:53:15 +0000 Subject: [PATCH 06/10] make caas provisioning less confusing --- .../cluster_infra/templates/outputs.tf.j2 | 6 ++-- .../cluster_infra/templates/resources.tf.j2 | 32 +++++++++---------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/ansible/roles/cluster_infra/templates/outputs.tf.j2 b/ansible/roles/cluster_infra/templates/outputs.tf.j2 index 33af5d5b8..a5f5c8fd7 100644 --- a/ansible/roles/cluster_infra/templates/outputs.tf.j2 +++ b/ansible/roles/cluster_infra/templates/outputs.tf.j2 @@ -32,12 +32,12 @@ output "cluster_nodes" { } } ], - {% for partition in openhpc_partitions %} + {% for nodegroup in openhpc_nodegroups %} [ - for compute in openstack_compute_instance_v2.{{ partition.name }}: { + for compute in openstack_compute_instance_v2.{{ nodegroup.name }}: { name = compute.name ip = compute.network[0].fixed_ip_v4 - groups = ["compute", "{{ cluster_name }}_compute", "{{ cluster_name }}_{{ partition.name }}"], + groups = ["compute", "{{ cluster_name }}_compute", "{{ cluster_name }}_{{ nodegroup.name }}"], facts = { openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id } diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2 index 7b2e70e65..179ea8e25 100644 --- a/ansible/roles/cluster_infra/templates/resources.tf.j2 +++ b/ansible/roles/cluster_infra/templates/resources.tf.j2 @@ -282,11 +282,11 
@@ resource "openstack_networking_port_v2" "control_storage" { ### # Workers ### -{% for partition in openhpc_partitions %} +{% for nodegroup in openhpc_nodegroup %} # Primary network -resource "openstack_networking_port_v2" "{{ partition.name }}" { - count = {{ partition.count }} - name = "{{ cluster_name }}-compute-{{ partition.name }}-${count.index}" +resource "openstack_networking_port_v2" "{{ nodegroup.name }}" { + count = {{ nodegroup.count }} + name = "{{ cluster_name }}-compute-{{ nodegroup.name }}-${count.index}" network_id = "${data.openstack_networking_network_v2.cluster_network.id}" admin_state_up = "true" @@ -305,9 +305,9 @@ resource "openstack_networking_port_v2" "{{ partition.name }}" { # Storage network {% if cluster_storage_network is defined %} -resource "openstack_networking_port_v2" "{{ partition.name }}_storage" { - count = {{ partition.count }} - name = "{{ cluster_name }}-compute-{{ partition.name }}-storage-${count.index}" +resource "openstack_networking_port_v2" "{{ nodegroup.name }}_storage" { + count = {{ nodegroup.count }} + name = "{{ cluster_name }}-compute-{{ nodegroup.name }}-storage-${count.index}" network_id = data.openstack_networking_network_v2.cluster_storage.id admin_state_up = "true" @@ -499,25 +499,25 @@ resource "openstack_compute_instance_v2" "control" { } } -{% for partition in openhpc_partitions %} -resource "openstack_compute_instance_v2" "{{ partition.name }}" { - count = {{ partition.count }} +{% for nodegroup in openhpc_nodegroup %} +resource "openstack_compute_instance_v2" "{{ nodegroup.name }}" { + count = {{ nodegroup.count }} - name = "{{ cluster_name }}-compute-{{ partition.name }}-${count.index}" + name = "{{ cluster_name }}-compute-{{ nodegroup.name }}-${count.index}" image_id = "{{ cluster_image }}" - {% if 'flavor_name' in partition %} - flavor_name = "{{ partition.flavor_name }}" + {% if 'flavor_name' in nodegroup %} + flavor_name = "{{ nodegroup.flavor_name }}" {% else %} - flavor_id = "{{ partition.flavor 
}}" + flavor_id = "{{ nodegroup.flavor }}" {% endif %} network { - port = openstack_networking_port_v2.{{ partition.name }}[count.index].id + port = openstack_networking_port_v2.{{ nodegroup.name }}[count.index].id } {% if cluster_storage_network is defined %} network { - port = openstack_networking_port_v2.{{ partition.name }}_storage[count.index].id + port = openstack_networking_port_v2.{{ nodegroup.name }}_storage[count.index].id } {% endif %} From 2b752bab3dd7dc31a56f32f38dd618c1ffdd72c3 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 9 May 2025 13:00:47 +0000 Subject: [PATCH 07/10] fix openhpc_partition config for stackhpc.openhpc groups->nodegroups change --- environments/common/inventory/group_vars/all/openhpc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index f93aa605a..b6a4a0eb4 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -17,7 +17,7 @@ openhpc_slurm_control_host: "{{ groups['control'] | first }}" # avoid using host openhpc_slurmdbd_host: "{{ openhpc_slurm_control_host }}" openhpc_rebuild_partition: # not a role var - could actually add more indirection here for things we're expecting to be modified, e.g. 
groups and maxtime name: rebuild - groups: "{{ cluster_compute_groups }}" + nodegroups: "{{ cluster_compute_groups }}" default: NO maxtime: 30 partition_params: From 7ce38c69fc1f146e3937840342450be31c21625d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 9 May 2025 13:29:42 +0000 Subject: [PATCH 08/10] run stackhpc_openhpc validation --- ansible/validate.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ansible/validate.yml b/ansible/validate.yml index ca0a95854..e307ec649 100644 --- a/ansible/validate.yml +++ b/ansible/validate.yml @@ -23,12 +23,17 @@ gather_facts: false tags: openhpc tasks: + - import_role: + name: stackhpc.openhpc + tasks_from: validate.yml - assert: that: "'enable_configless' in openhpc_config.SlurmctldParameters | default([])" fail_msg: | 'enable_configless' not found in openhpc_config.SlurmctldParameters - is variable openhpc_config overridden? Additional slurm.conf parameters should be provided using variable openhpc_config_extra. success_msg: Checked Slurm will be configured for configless operation + delegate_to: localhost + run_once: true - name: Validate filebeat configuration hosts: filebeat From c8af52ef3d571f7ef517555ec44530bd75965498 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 9 May 2025 14:34:04 +0000 Subject: [PATCH 09/10] fix caas nodegroups typo --- ansible/roles/cluster_infra/templates/resources.tf.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2 index 179ea8e25..f3423712e 100644 --- a/ansible/roles/cluster_infra/templates/resources.tf.j2 +++ b/ansible/roles/cluster_infra/templates/resources.tf.j2 @@ -282,7 +282,7 @@ resource "openstack_networking_port_v2" "control_storage" { ### # Workers ### -{% for nodegroup in openhpc_nodegroup %} +{% for nodegroup in openhpc_nodegroups %} # Primary network resource "openstack_networking_port_v2" "{{ nodegroup.name }}" { count = {{ 
nodegroup.count }} @@ -499,7 +499,7 @@ resource "openstack_compute_instance_v2" "control" { } } -{% for nodegroup in openhpc_nodegroup %} +{% for nodegroup in openhpc_nodegroups %} resource "openstack_compute_instance_v2" "{{ nodegroup.name }}" { count = {{ nodegroup.count }} From e471458dc8b6248e88b65d8c2d38f73a7b9f20dc Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 9 May 2025 15:13:51 +0000 Subject: [PATCH 10/10] fix partitions for caas and non-rebuilt-enabled clusters --- environments/common/inventory/group_vars/all/openhpc.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index b6a4a0eb4..ce3394d63 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -17,7 +17,7 @@ openhpc_slurm_control_host: "{{ groups['control'] | first }}" # avoid using host openhpc_slurmdbd_host: "{{ openhpc_slurm_control_host }}" openhpc_rebuild_partition: # not a role var - could actually add more indirection here for things we're expecting to be modified, e.g. groups and maxtime name: rebuild - nodegroups: "{{ cluster_compute_groups }}" + nodegroups: "{{ cluster_compute_groups | default([]) }}" default: NO maxtime: 30 partition_params: @@ -29,7 +29,7 @@ openhpc_rebuild_partition: # not a role var - could actually add more indirectio OverSubscribe: EXCLUSIVE openhpc_nodegroups: "{{ cluster_compute_groups | map('community.general.dict_kv', 'name') }}" # create nodegroup for each compute group openhpc_user_partitions: "{{ openhpc_nodegroups }}" # create partition for each nodegroup (actually role default) - this is what we'd expect to be changed -openhpc_partitions: "{{ openhpc_user_partitions + [openhpc_rebuild_partition] if groups['rebuild'] | length > 0 else [] }}" # auto-create rebuild partition if reqd. 
+openhpc_partitions: "{{ openhpc_user_partitions + ([openhpc_rebuild_partition] if groups['rebuild'] | length > 0 else []) }}" # auto-create rebuild partition if reqd. openhpc_packages_default: # system packages - podman