Update appliance for stackhpc.openhpc nodegroup/partition changes #666

Open · wants to merge 12 commits into main
6 changes: 3 additions & 3 deletions ansible/roles/cluster_infra/templates/outputs.tf.j2
@@ -32,12 +32,12 @@ output "cluster_nodes" {
}
}
],
- {% for partition in openhpc_slurm_partitions %}
+ {% for nodegroup in openhpc_nodegroups %}
[
- for compute in openstack_compute_instance_v2.{{ partition.name }}: {
+ for compute in openstack_compute_instance_v2.{{ nodegroup.name }}: {
name = compute.name
ip = compute.network[0].fixed_ip_v4
groups = ["compute", "{{ cluster_name }}_compute", "{{ cluster_name }}_{{ partition.name }}"],
groups = ["compute", "{{ cluster_name }}_compute", "{{ cluster_name }}_{{ nodegroup.name }}"],
facts = {
openstack_project_id = data.openstack_identity_auth_scope_v3.scope.project_id
}
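The practical effect of this rename on the generated inventory: each compute node's entry now carries a group derived from its nodegroup name rather than its partition name. An illustrative rendering for one node, using hypothetical cluster and nodegroup names, would be:

```yaml
# Illustrative output entry (hypothetical "mycluster" cluster, "general" nodegroup):
name: mycluster-compute-general-0
ip: 10.0.0.11
groups: ["compute", "mycluster_compute", "mycluster_general"]
```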
32 changes: 16 additions & 16 deletions ansible/roles/cluster_infra/templates/resources.tf.j2
@@ -282,11 +282,11 @@ resource "openstack_networking_port_v2" "control_storage" {
###
# Workers
###
- {% for partition in openhpc_slurm_partitions %}
+ {% for nodegroup in openhpc_nodegroups %}
# Primary network
resource "openstack_networking_port_v2" "{{ partition.name }}" {
count = {{ partition.count }}
name = "{{ cluster_name }}-compute-{{ partition.name }}-${count.index}"
resource "openstack_networking_port_v2" "{{ nodegroup.name }}" {
count = {{ nodegroup.count }}
name = "{{ cluster_name }}-compute-{{ nodegroup.name }}-${count.index}"
network_id = "${data.openstack_networking_network_v2.cluster_network.id}"
admin_state_up = "true"

@@ -305,9 +305,9 @@ resource "openstack_networking_port_v2" "{{ partition.name }}" {

# Storage network
{% if cluster_storage_network is defined %}
resource "openstack_networking_port_v2" "{{ partition.name }}_storage" {
count = {{ partition.count }}
name = "{{ cluster_name }}-compute-{{ partition.name }}-storage-${count.index}"
resource "openstack_networking_port_v2" "{{ nodegroup.name }}_storage" {
count = {{ nodegroup.count }}
name = "{{ cluster_name }}-compute-{{ nodegroup.name }}-storage-${count.index}"
network_id = data.openstack_networking_network_v2.cluster_storage.id
admin_state_up = "true"

@@ -499,25 +499,25 @@ resource "openstack_compute_instance_v2" "control" {
}
}

- {% for partition in openhpc_slurm_partitions %}
- resource "openstack_compute_instance_v2" "{{ partition.name }}" {
- count = {{ partition.count }}
+ {% for nodegroup in openhpc_nodegroups %}
+ resource "openstack_compute_instance_v2" "{{ nodegroup.name }}" {
+ count = {{ nodegroup.count }}

name = "{{ cluster_name }}-compute-{{ partition.name }}-${count.index}"
name = "{{ cluster_name }}-compute-{{ nodegroup.name }}-${count.index}"
image_id = "{{ cluster_image }}"
- {% if 'flavor_name' in partition %}
- flavor_name = "{{ partition.flavor_name }}"
+ {% if 'flavor_name' in nodegroup %}
+ flavor_name = "{{ nodegroup.flavor_name }}"
{% else %}
flavor_id = "{{ partition.flavor }}"
flavor_id = "{{ nodegroup.flavor }}"
{% endif %}

network {
- port = openstack_networking_port_v2.{{ partition.name }}[count.index].id
+ port = openstack_networking_port_v2.{{ nodegroup.name }}[count.index].id
}

{% if cluster_storage_network is defined %}
network {
- port = openstack_networking_port_v2.{{ partition.name }}_storage[count.index].id
+ port = openstack_networking_port_v2.{{ nodegroup.name }}_storage[count.index].id
}
{% endif %}

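For reference, each `openhpc_nodegroups` entry consumed by this template is expected to provide the keys used above: `name`, `count`, and either `flavor_name` or `flavor`. A hypothetical example:

```yaml
# Hypothetical nodegroup definition driving the templated port and instance resources:
openhpc_nodegroups:
  - name: general
    count: 4
    flavor_name: general.v1.small  # alternatively supply "flavor: <flavor id>"
```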
4 changes: 2 additions & 2 deletions ansible/roles/openondemand/README.md
@@ -59,10 +59,10 @@ This role enables SSL on the Open Ondemand server, using the following self-sign
- `new_window`: Optional. Whether to open link in new window. Bool, default `false`.
- `app_name`: Optional. Unique name for app appended to `/var/www/ood/apps/sys/`. Default is `name`, useful if that is not unique or not suitable as a path component.
- `openondemand_dashboard_support_url`: Optional. URL or email etc to show as support contact under Help in dashboard. Default `(undefined)`.
- - `openondemand_desktop_partition`: Optional. Name of Slurm partition to use for remote desktops. Requires a corresponding group named "openondemand_desktop" and entry in openhpc_slurm_partitions.
+ - `openondemand_desktop_partition`: Optional. Name of Slurm partition to use for remote desktops. Requires a corresponding group named "openondemand_desktop" and entry in openhpc_partitions.
- `openondemand_desktop_screensaver`: Optional. Whether to enable screen locking/screensaver. **NB:** Users must have passwords if this is enabled. Bool, default `false`.
- `openondemand_filesapp_paths`: List of paths (in addition to $HOME, which is always added) to include shortcuts to within the Files dashboard app.
- - `openondemand_jupyter_partition`: Required. Name of Slurm partition to use for Jupyter Notebook servers. Requires a corresponding group named "openondemand_jupyter" and entry in openhpc_slurm_partitions.
+ - `openondemand_jupyter_partition`: Required. Name of Slurm partition to use for Jupyter Notebook servers. Requires a corresponding group named "openondemand_jupyter" and entry in openhpc_partitions.

### Monitoring
- `openondemand_exporter`: Optional. Install the Prometheus [ondemand_exporter](https://github.com/OSC/ondemand_exporter) on the `openondemand` node to export metrics about Open Ondemand itself. Default `true`.
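As an aside on the two partition variables above: they simply name a partition expected to exist in `openhpc_partitions`. A minimal illustrative override, using a hypothetical partition name and not part of this PR, would be:

```yaml
# Hypothetical environment override: point both apps at a partition named
# "interactive", assumed to be defined in openhpc_partitions.
openondemand_desktop_partition: interactive
openondemand_jupyter_partition: interactive
```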
2 changes: 1 addition & 1 deletion ansible/roles/rebuild/README.md
@@ -23,7 +23,7 @@ The below are only used by this role's `rebuild.yml` task file, i.e. when
running the `ansible/adhoc/rebuild-via-slurm.yml` playbook:

- `rebuild_job_partitions`: Optional. Comma-separated list of names of rebuild
- partitions defined in `openhpc_slurm_partitions`. Useful as an extra-var for
+ partitions defined in `openhpc_partitions`. Useful as an extra-var for
limiting rebuilds. Default `rebuild`.

- `rebuild_job_name`: Optional. Name of rebuild jobs. Default is `rebuild-`
5 changes: 5 additions & 0 deletions ansible/validate.yml
@@ -23,12 +23,17 @@
  gather_facts: false
  tags: openhpc
  tasks:
    - import_role:
        name: stackhpc.openhpc
        tasks_from: validate.yml
    - assert:
        that: "'enable_configless' in openhpc_config.SlurmctldParameters | default([])"
        fail_msg: |
          'enable_configless' not found in openhpc_config.SlurmctldParameters - is variable openhpc_config overridden?
          Additional slurm.conf parameters should be provided using variable openhpc_config_extra.
        success_msg: Checked Slurm will be configured for configless operation
      delegate_to: localhost
      run_once: true

- name: Validate filebeat configuration
  hosts: filebeat
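To illustrate the guidance in that assertion's failure message (a hypothetical site override, not part of this diff): additional `slurm.conf` settings are supplied via `openhpc_config_extra`, leaving the appliance's `openhpc_config`, and hence `enable_configless`, intact:

```yaml
# Hypothetical environment override: extra slurm.conf parameters go here rather
# than in openhpc_config, so enable_configless is preserved.
openhpc_config_extra:
  SlurmctldDebug: info
```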
43 changes: 9 additions & 34 deletions docs/experimental/slurm-controlled-rebuild.md
@@ -107,42 +107,17 @@ The configuration of this is complex and involves:
defined in the `compute` or `login` variables, to override the default
image for specific node groups.

- 5. Modify `openhpc_slurm_partitions` to add a new partition covering rebuildable
- nodes to use for for rebuild jobs. If using the default OpenTofu
- configurations, this variable is contained in an OpenTofu-templated file
- `environments/$ENV/group_vars/all/partitions.yml` which must be overriden
- by copying it to e.g. a `z_partitions.yml` file in the same directory.
- However production sites will probably be overriding this file anyway to
- customise it.
-
- An example partition definition, given the two node groups "general" and
- "gpu" shown in Step 2, is:
-
- ```yaml
- openhpc_slurm_partitions:
-   ...
-   - name: rebuild
-     groups:
-       - name: general
-       - name: gpu
-     default: NO
-     maxtime: 30
-     partition_params:
-       PriorityJobFactor: 65533
-       Hidden: YES
-       RootOnly: YES
-       DisableRootJobs: NO
-       PreemptMode: 'OFF'
-       OverSubscribe: EXCLUSIVE
- ```
-
- Which has parameters as follows:
+ 5. Ensure `openhpc_partitions` contains a partition covering the nodes to run
+ rebuild jobs. The default definition in `environments/common/inventory/group_vars/all/openhpc.yml`
+ will automatically include this via `openhpc_rebuild_partition` also in that
+ file. If modifying this, note the important parameters are:

- `name`: Partition name matching `rebuild` role variable `rebuild_partitions`,
default `rebuild`.
- - `groups`: A list of node group names, matching keys in the OpenTofu
- `compute` variable (see example in step 2 above). Normally every compute
- node group should be listed here, unless Slurm-controlled rebuild is not
- required for certain node groups.
+ - `groups`: A list of nodegroup names, matching `openhpc_nodegroup` and
+ keys in the OpenTofu `compute` variable (see example in step 2 above).
+ Normally every compute node group should be listed here, unless
+ Slurm-controlled rebuild is not required for certain node groups.
- `default`: Must be set to `NO` so that it is not the default partition.
- `maxtime`: Maximum time to allow for rebuild jobs, in
[slurm.conf format](https://slurm.schedmd.com/slurm.conf.html#OPT_MaxTime).
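To make the new layering concrete: a hypothetical site override of `openhpc_user_partitions` (the variable the common environment expects to be customised) for the "general" and "gpu" node groups from step 2 could look like the following; the hidden rebuild partition is then appended to `openhpc_partitions` automatically when the `rebuild` group is populated:

```yaml
# Hypothetical override in an environment's group_vars: user-facing partitions
# only; the rebuild partition is added automatically via openhpc_partitions.
openhpc_user_partitions:
  - name: general
  - name: gpu
```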
4 changes: 2 additions & 2 deletions environments/.caas/inventory/group_vars/all/openhpc.yml
@@ -1,5 +1,5 @@
openhpc_cluster_name: "{{ cluster_name }}"

- # Provision a single "standard" compute partition using the supplied
+ # Provision a single "standard" compute nodegroup using the supplied
# node count and flavor
openhpc_slurm_partitions: "{{ hostvars[groups['openstack'][0]]['openhpc_slurm_partitions'] }}"
openhpc_nodegroups: "{{ hostvars[groups['openstack'][0]]['openhpc_nodegroups'] }}"
@@ -1,7 +1,7 @@
---
openondemand_auth: basic_pam
openondemand_jupyter_partition: "{{ openhpc_slurm_partitions[0]['name'] }}"
openondemand_desktop_partition: "{{ openhpc_slurm_partitions[0]['name'] }}"
openondemand_jupyter_partition: "{{ openhpc_partitions[0]['name'] }}"
openondemand_desktop_partition: "{{ openhpc_partitions[0]['name'] }}"

httpd_listen_addr_port:
- 80
4 changes: 2 additions & 2 deletions environments/.caas/inventory/group_vars/openstack.yml
@@ -16,9 +16,9 @@ terraform_project_path: "{{ playbook_dir }}/terraform"
terraform_state: "{{ cluster_state | default('present') }}"
cluster_ssh_user: rocky

- # Provision a single "standard" compute partition using the supplied
+ # Provision a single "standard" compute nodegroup using the supplied
# node count and flavor
- openhpc_slurm_partitions:
+ openhpc_nodegroups:
    - name: "standard"
      count: "{{ compute_count }}"
      flavor: "{{ compute_flavor }}"

This file was deleted.

17 changes: 15 additions & 2 deletions environments/common/inventory/group_vars/all/openhpc.yml
@@ -15,8 +15,21 @@ openhpc_slurmdbd_mysql_password: "{{ vault_mysql_slurm_password }}"
openhpc_slurmdbd_mysql_username: slurm
openhpc_slurm_control_host: "{{ groups['control'] | first }}" # avoid using hostvars for compute-init
openhpc_slurmdbd_host: "{{ openhpc_slurm_control_host }}"
- openhpc_slurm_partitions:
-   - name: "compute"
+ openhpc_rebuild_partition: # not a role var - could actually add more indirection here for things we're expecting to be modified, e.g. groups and maxtime
+   name: rebuild
+   nodegroups: "{{ cluster_compute_groups | default([]) }}"
+   default: NO
+   maxtime: 30
+   partition_params:
+     PriorityJobFactor: 65533
+     Hidden: YES
+     RootOnly: YES
+     DisableRootJobs: NO
+     PreemptMode: 'OFF'
+     OverSubscribe: EXCLUSIVE
+ openhpc_nodegroups: "{{ cluster_compute_groups | map('community.general.dict_kv', 'name') }}" # create nodegroup for each compute group
+ openhpc_user_partitions: "{{ openhpc_nodegroups }}" # create partition for each nodegroup (actually role default) - this is what we'd expect to be changed
+ openhpc_partitions: "{{ openhpc_user_partitions + ([openhpc_rebuild_partition] if groups['rebuild'] | length > 0 else []) }}" # auto-create rebuild partition if reqd.
openhpc_packages_default:
# system packages
- podman
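For orientation, an illustrative evaluation of the templated variables above, assuming a cluster whose OpenTofu `compute` variable defines groups "general" and "gpu" and whose `rebuild` inventory group is populated:

```yaml
# Illustrative (hypothetical) values after templating:
cluster_compute_groups: ["general", "gpu"]  # written into hosts.yml by OpenTofu

openhpc_nodegroups:   # one nodegroup per compute group, via community.general.dict_kv
  - name: general
  - name: gpu

openhpc_partitions:   # user partitions plus the auto-appended rebuild partition
  - name: general
  - name: gpu
  - name: rebuild
    nodegroups: ["general", "gpu"]
    default: NO
    maxtime: 30
    partition_params:
      PriorityJobFactor: 65533
      Hidden: YES
      RootOnly: YES
      DisableRootJobs: NO
      PreemptMode: 'OFF'
      OverSubscribe: EXCLUSIVE
```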
@@ -12,8 +12,8 @@ openondemand_servername: "{{ hostvars[groups['openondemand'].0].ansible_host if

openondemand_auth: basic_pam

openondemand_jupyter_partition: "{{ openhpc_slurm_partitions[0]['name'] }}"
openondemand_desktop_partition: "{{ openhpc_slurm_partitions[0]['name'] }}"
openondemand_jupyter_partition: "{{ openhpc_partitions[0]['name'] }}"
openondemand_desktop_partition: "{{ openhpc_partitions[0]['name'] }}"

# Regex defining hosts which openondemand can proxy; the default regex is compute nodes (for apps) and grafana host,
# e.g. if the group `compute` has hosts `compute-{0,1,2,..}` this will be '(compute-\d+)|(control)'.
@@ -11,12 +11,3 @@ resource "local_file" "hosts" {
)
filename = "../inventory/hosts.yml"
}

resource "local_file" "partitions" {
content = templatefile("${path.module}/partitions.tpl",
{
"compute_groups": module.compute,
},
)
filename = "../inventory/group_vars/all/partitions.yml" # as all/ is created by skeleton
}
@@ -2,6 +2,7 @@ all:
vars:
openhpc_cluster_name: ${cluster_name}
cluster_domain_suffix: ${cluster_domain_suffix}
+ cluster_compute_groups: ${jsonencode(keys(compute_groups))}

control:
hosts:

This file was deleted.

2 changes: 1 addition & 1 deletion requirements.yml
@@ -4,7 +4,7 @@ roles:
version: v25.3.2
name: stackhpc.nfs
- src: https://github.com/stackhpc/ansible-role-openhpc.git
- version: v0.30.0
+ version: 5ceb9e1 # on feat/nodegroups-v2 TODO: bump to release
name: stackhpc.openhpc
- src: https://github.com/stackhpc/ansible-node-exporter.git
version: stackhpc