
Commit b99b1e9

Update appliance for stackhpc.openhpc nodegroup/partition changes (#666)
* PoC of automating partition/nodegroup config
* update rebuild docs
* fixup ondemand partitions for openhpc_partitions
* fixup rebuild role docs for openhpc_partitions
* fix caas for openhpc_partitions/openhpc_nodegroups
* make caas provisioning less confusing
* fix openhpc_partition config for stackhpc.openhpc groups->nodegroups change
* run stackhpc_openhpc validation
* fix caas nodegroups typo
* fix partitions for caas and non-rebuilt-enabled clusters
* bump openhpc role to release
1 parent 6545905 commit b99b1e9

File tree

16 files changed: +61 -99 lines changed


ansible/roles/cluster_infra/templates/outputs.tf.j2

Lines changed: 3 additions & 3 deletions
@@ -32,12 +32,12 @@ output "cluster_nodes" {
         }
       }
     ],
-    {% for partition in openhpc_slurm_partitions %}
+    {% for nodegroup in openhpc_nodegroups %}
     [
-      for compute in openstack_compute_instance_v2.{{ partition.name }}: {
+      for compute in openstack_compute_instance_v2.{{ nodegroup.name }}: {
         name = compute.name
         ip = compute.network[0].fixed_ip_v4
-        groups = ["compute", "{{ cluster_name }}_compute", "{{ cluster_name }}_{{ partition.name }}"],
+        groups = ["compute", "{{ cluster_name }}_compute", "{{ cluster_name }}_{{ nodegroup.name }}"],
         facts = {
           openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id
         }

ansible/roles/cluster_infra/templates/resources.tf.j2

Lines changed: 16 additions & 16 deletions
@@ -282,11 +282,11 @@ resource "openstack_networking_port_v2" "control_storage" {
 ###
 # Workers
 ###
-{% for partition in openhpc_slurm_partitions %}
+{% for nodegroup in openhpc_nodegroups %}
 # Primary network
-resource "openstack_networking_port_v2" "{{ partition.name }}" {
-  count = {{ partition.count }}
-  name = "{{ cluster_name }}-compute-{{ partition.name }}-${count.index}"
+resource "openstack_networking_port_v2" "{{ nodegroup.name }}" {
+  count = {{ nodegroup.count }}
+  name = "{{ cluster_name }}-compute-{{ nodegroup.name }}-${count.index}"
   network_id = "${data.openstack_networking_network_v2.cluster_network.id}"
   admin_state_up = "true"

@@ -305,9 +305,9 @@ resource "openstack_networking_port_v2" "{{ partition.name }}" {

 # Storage network
 {% if cluster_storage_network is defined %}
-resource "openstack_networking_port_v2" "{{ partition.name }}_storage" {
-  count = {{ partition.count }}
-  name = "{{ cluster_name }}-compute-{{ partition.name }}-storage-${count.index}"
+resource "openstack_networking_port_v2" "{{ nodegroup.name }}_storage" {
+  count = {{ nodegroup.count }}
+  name = "{{ cluster_name }}-compute-{{ nodegroup.name }}-storage-${count.index}"
   network_id = data.openstack_networking_network_v2.cluster_storage.id
   admin_state_up = "true"

@@ -499,25 +499,25 @@ resource "openstack_compute_instance_v2" "control" {
   }
 }

-{% for partition in openhpc_slurm_partitions %}
-resource "openstack_compute_instance_v2" "{{ partition.name }}" {
-  count = {{ partition.count }}
+{% for nodegroup in openhpc_nodegroups %}
+resource "openstack_compute_instance_v2" "{{ nodegroup.name }}" {
+  count = {{ nodegroup.count }}

-  name = "{{ cluster_name }}-compute-{{ partition.name }}-${count.index}"
+  name = "{{ cluster_name }}-compute-{{ nodegroup.name }}-${count.index}"
   image_id = "{{ cluster_image }}"
-  {% if 'flavor_name' in partition %}
-  flavor_name = "{{ partition.flavor_name }}"
+  {% if 'flavor_name' in nodegroup %}
+  flavor_name = "{{ nodegroup.flavor_name }}"
   {% else %}
-  flavor_id = "{{ partition.flavor }}"
+  flavor_id = "{{ nodegroup.flavor }}"
   {% endif %}

   network {
-    port = openstack_networking_port_v2.{{ partition.name }}[count.index].id
+    port = openstack_networking_port_v2.{{ nodegroup.name }}[count.index].id
   }

   {% if cluster_storage_network is defined %}
   network {
-    port = openstack_networking_port_v2.{{ partition.name }}_storage[count.index].id
+    port = openstack_networking_port_v2.{{ nodegroup.name }}_storage[count.index].id
   }
   {% endif %}

ansible/roles/openondemand/README.md

Lines changed: 2 additions & 2 deletions
@@ -59,10 +59,10 @@ This role enables SSL on the Open Ondemand server, using the following self-sign
 - `new_window`: Optional. Whether to open link in new window. Bool, default `false`.
 - `app_name`: Optional. Unique name for app appended to `/var/www/ood/apps/sys/`. Default is `name`, useful if that is not unique or not suitable as a path component.
 - `openondemand_dashboard_support_url`: Optional. URL or email etc to show as support contact under Help in dashboard. Default `(undefined)`.
-- `openondemand_desktop_partition`: Optional. Name of Slurm partition to use for remote desktops. Requires a corresponding group named "openondemand_desktop" and entry in openhpc_slurm_partitions.
+- `openondemand_desktop_partition`: Optional. Name of Slurm partition to use for remote desktops. Requires a corresponding group named "openondemand_desktop" and entry in openhpc_partitions.
 - `openondemand_desktop_screensaver`: Optional. Whether to enable screen locking/screensaver. **NB:** Users must have passwords if this is enabled. Bool, default `false`.
 - `openondemand_filesapp_paths`: List of paths (in addition to $HOME, which is always added) to include shortcuts to within the Files dashboard app.
-- `openondemand_jupyter_partition`: Required. Name of Slurm partition to use for Jupyter Notebook servers. Requires a corresponding group named "openondemand_jupyter" and entry in openhpc_slurm_partitions.
+- `openondemand_jupyter_partition`: Required. Name of Slurm partition to use for Jupyter Notebook servers. Requires a corresponding group named "openondemand_jupyter" and entry in openhpc_partitions.

 ### Monitoring
 - `openondemand_exporter`: Optional. Install the Prometheus [ondemand_exporter](https://github.com/OSC/ondemand_exporter) on the `openondemand` node to export metrics about Open Ondemand itself. Default `true`.
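
For context, a minimal sketch of what such an `openhpc_partitions` entry could look like under the new variable scheme. The names and nodegroup layout below are illustrative assumptions, not part of this commit; in this appliance the entry would normally be added via `openhpc_user_partitions`, from which `openhpc_partitions` is composed:

```yaml
# hypothetical environment override
openondemand_desktop_partition: openondemand_desktop

openhpc_user_partitions:
  - name: openondemand_desktop    # Slurm partition referenced above
    nodegroups:
      - openondemand_desktop      # corresponding compute nodegroup / inventory group
```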

ansible/roles/rebuild/README.md

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ The below are only used by this role's `rebuild.yml` task file, i.e. when
 running the `ansible/adhoc/rebuild-via-slurm.yml` playbook:

 - `rebuild_job_partitions`: Optional. Comma-separated list of names of rebuild
-  partitions defined in `openhpc_slurm_partitions`. Useful as an extra-var for
+  partitions defined in `openhpc_partitions`. Useful as an extra-var for
   limiting rebuilds. Default `rebuild`.

 - `rebuild_job_name`: Optional. Name of rebuild jobs. Default is `rebuild-`

ansible/validate.yml

Lines changed: 5 additions & 0 deletions
@@ -23,12 +23,17 @@
   gather_facts: false
   tags: openhpc
   tasks:
+    - import_role:
+        name: stackhpc.openhpc
+        tasks_from: validate.yml
     - assert:
         that: "'enable_configless' in openhpc_config.SlurmctldParameters | default([])"
         fail_msg: |
          'enable_configless' not found in openhpc_config.SlurmctldParameters - is variable openhpc_config overridden?
          Additional slurm.conf parameters should be provided using variable openhpc_config_extra.
         success_msg: Checked Slurm will be configured for configless operation
+      delegate_to: localhost
+      run_once: true

 - name: Validate filebeat configuration
   hosts: filebeat

docs/experimental/slurm-controlled-rebuild.md

Lines changed: 9 additions & 34 deletions
@@ -107,42 +107,17 @@ The configuration of this is complex and involves:
    defined in the `compute` or `login` variables, to override the default
    image for specific node groups.

-5. Modify `openhpc_slurm_partitions` to add a new partition covering rebuildable
-   nodes to use for for rebuild jobs. If using the default OpenTofu
-   configurations, this variable is contained in an OpenTofu-templated file
-   `environments/$ENV/group_vars/all/partitions.yml` which must be overriden
-   by copying it to e.g. a `z_partitions.yml` file in the same directory.
-   However production sites will probably be overriding this file anyway to
-   customise it.
-
-   An example partition definition, given the two node groups "general" and
-   "gpu" shown in Step 2, is:
-
-   ```yaml
-   openhpc_slurm_partitions:
-     ...
-     - name: rebuild
-       groups:
-         - name: general
-         - name: gpu
-       default: NO
-       maxtime: 30
-       partition_params:
-         PriorityJobFactor: 65533
-         Hidden: YES
-         RootOnly: YES
-         DisableRootJobs: NO
-         PreemptMode: 'OFF'
-         OverSubscribe: EXCLUSIVE
-   ```
-
-   Which has parameters as follows:
+5. Ensure `openhpc_partitions` contains a partition covering the nodes to run
+   rebuild jobs. The default definition in `environments/common/inventory/group_vars/all/openhpc.yml`
+   will automatically include this via `openhpc_rebuild_partition` also in that
+   file. If modifying this, note the important parameters are:
+
    - `name`: Partition name matching `rebuild` role variable `rebuild_partitions`,
      default `rebuild`.
-   - `groups`: A list of node group names, matching keys in the OpenTofu
-     `compute` variable (see example in step 2 above). Normally every compute
-     node group should be listed here, unless Slurm-controlled rebuild is not
-     required for certain node groups.
+   - `groups`: A list of nodegroup names, matching `openhpc_nodegroup` and
+     keys in the OpenTofu `compute` variable (see example in step 2 above).
+     Normally every compute node group should be listed here, unless
+     Slurm-controlled rebuild is not required for certain node groups.
    - `default`: Must be set to `NO` so that it is not the default partition.
    - `maxtime`: Maximum time to allow for rebuild jobs, in
     [slurm.conf format](https://slurm.schedmd.com/slurm.conf.html#OPT_MaxTime).
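
Under this scheme, a site customising its partitions would typically override `openhpc_user_partitions` rather than `openhpc_partitions`, so the rebuild partition is still appended automatically. A minimal sketch, assuming the "general" and "gpu" node groups used as examples in the removed documentation above (names are illustrative only):

```yaml
# e.g. environments/$ENV/inventory/group_vars/all/openhpc.yml (illustrative override)
openhpc_user_partitions:
  - name: general
    nodegroups: [general]
  - name: gpu
    nodegroups: [gpu]
# The common environment's openhpc_partitions definition appends
# openhpc_rebuild_partition to this list when the "rebuild" group is non-empty.
```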
Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 openhpc_cluster_name: "{{ cluster_name }}"

-# Provision a single "standard" compute partition using the supplied
+# Provision a single "standard" compute nodegroup using the supplied
 # node count and flavor
-openhpc_slurm_partitions: "{{ hostvars[groups['openstack'][0]]['openhpc_slurm_partitions'] }}"
+openhpc_nodegroups: "{{ hostvars[groups['openstack'][0]]['openhpc_nodegroups'] }}"

environments/.caas/inventory/group_vars/all/openondemand.yml

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 ---
 openondemand_auth: basic_pam
-openondemand_jupyter_partition: "{{ openhpc_slurm_partitions[0]['name'] }}"
-openondemand_desktop_partition: "{{ openhpc_slurm_partitions[0]['name'] }}"
+openondemand_jupyter_partition: "{{ openhpc_partitions[0]['name'] }}"
+openondemand_desktop_partition: "{{ openhpc_partitions[0]['name'] }}"

 httpd_listen_addr_port:
   - 80

environments/.caas/inventory/group_vars/openstack.yml

Lines changed: 2 additions & 2 deletions
@@ -16,9 +16,9 @@ terraform_project_path: "{{ playbook_dir }}/terraform"
 terraform_state: "{{ cluster_state | default('present') }}"
 cluster_ssh_user: rocky

-# Provision a single "standard" compute partition using the supplied
+# Provision a single "standard" compute nodegroup using the supplied
 # node count and flavor
-openhpc_slurm_partitions:
+openhpc_nodegroups:
   - name: "standard"
     count: "{{ compute_count }}"
     flavor: "{{ compute_flavor }}"

environments/.stackhpc/inventory/group_vars/all/z_partitions.yml

Lines changed: 0 additions & 18 deletions
This file was deleted.

environments/common/inventory/group_vars/all/openhpc.yml

Lines changed: 15 additions & 2 deletions
@@ -15,8 +15,21 @@ openhpc_slurmdbd_mysql_password: "{{ vault_mysql_slurm_password }}"
 openhpc_slurmdbd_mysql_username: slurm
 openhpc_slurm_control_host: "{{ groups['control'] | first }}" # avoid using hostvars for compute-init
 openhpc_slurmdbd_host: "{{ openhpc_slurm_control_host }}"
-openhpc_slurm_partitions:
-  - name: "compute"
+openhpc_rebuild_partition: # not a role var - could actually add more indirection here for things we're expecting to be modified, e.g. groups and maxtime
+  name: rebuild
+  nodegroups: "{{ cluster_compute_groups | default([]) }}"
+  default: NO
+  maxtime: 30
+  partition_params:
+    PriorityJobFactor: 65533
+    Hidden: YES
+    RootOnly: YES
+    DisableRootJobs: NO
+    PreemptMode: 'OFF'
+    OverSubscribe: EXCLUSIVE
+openhpc_nodegroups: "{{ cluster_compute_groups | map('community.general.dict_kv', 'name') }}" # create nodegroup for each compute group
+openhpc_user_partitions: "{{ openhpc_nodegroups }}" # create partition for each nodegroup (actually role default) - this is what we'd expect to be changed
+openhpc_partitions: "{{ openhpc_user_partitions + ([openhpc_rebuild_partition] if groups['rebuild'] | length > 0 else []) }}" # auto-create rebuild partition if reqd.
 openhpc_packages_default:
   # system packages
   - podman
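
To make the layering concrete, a sketch of what these templated variables would evaluate to for a hypothetical cluster with compute groups `general` and `gpu` and a non-empty `rebuild` inventory group (group names are illustrative only):

```yaml
cluster_compute_groups: ["general", "gpu"]   # templated in by OpenTofu (see inventory.tpl below)

openhpc_nodegroups:                          # one nodegroup per compute group
  - name: general
  - name: gpu

openhpc_user_partitions:                     # defaults to one partition per nodegroup
  - name: general
  - name: gpu

openhpc_partitions:                          # rebuild partition appended automatically
  - name: general
  - name: gpu
  - name: rebuild
    nodegroups: ["general", "gpu"]
    default: NO
    maxtime: 30
    partition_params:
      PriorityJobFactor: 65533
      Hidden: YES
      RootOnly: YES
      DisableRootJobs: NO
      PreemptMode: 'OFF'
      OverSubscribe: EXCLUSIVE
```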

environments/common/inventory/group_vars/all/openondemand.yml

Lines changed: 2 additions & 2 deletions
@@ -12,8 +12,8 @@ openondemand_servername: "{{ hostvars[groups['openondemand'].0].ansible_host if

 openondemand_auth: basic_pam

-openondemand_jupyter_partition: "{{ openhpc_slurm_partitions[0]['name'] }}"
-openondemand_desktop_partition: "{{ openhpc_slurm_partitions[0]['name'] }}"
+openondemand_jupyter_partition: "{{ openhpc_partitions[0]['name'] }}"
+openondemand_desktop_partition: "{{ openhpc_partitions[0]['name'] }}"

 # Regex defining hosts which openondemand can proxy; the default regex is compute nodes (for apps) and grafana host,
 # e.g. if the group `compute` has hosts `compute-{0,1,2,..}` this will be '(compute-\d+)|(control)'.

environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tf

Lines changed: 0 additions & 9 deletions
@@ -11,12 +11,3 @@ resource "local_file" "hosts" {
   )
   filename = "../inventory/hosts.yml"
 }
-
-resource "local_file" "partitions" {
-  content = templatefile("${path.module}/partitions.tpl",
-    {
-      "compute_groups": module.compute,
-    },
-  )
-  filename = "../inventory/group_vars/all/partitions.yml" # as all/ is created by skeleton
-}

environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tpl

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@ all:
   vars:
     openhpc_cluster_name: ${cluster_name}
     cluster_domain_suffix: ${cluster_domain_suffix}
+    cluster_compute_groups: ${jsonencode(keys(compute_groups))}

 control:
   hosts:
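
For illustration, if the OpenTofu `compute` variable defined (hypothetical) groups `general` and `gpu`, the added line would render into the generated `inventory/hosts.yml` roughly as:

```yaml
all:
  vars:
    cluster_compute_groups: ["general", "gpu"]   # from jsonencode(keys(compute_groups))
```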

environments/skeleton/{{cookiecutter.environment}}/tofu/partitions.tpl

Lines changed: 0 additions & 5 deletions
This file was deleted.

requirements.yml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ roles:
     version: v25.3.2
     name: stackhpc.nfs
   - src: https://github.com/stackhpc/ansible-role-openhpc.git
-    version: v0.30.0
+    version: v1.0.0
     name: stackhpc.openhpc
   - src: https://github.com/stackhpc/ansible-node-exporter.git
     version: stackhpc
