diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c9f1a6e..27e3a0d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -51,7 +51,6 @@ jobs: - test2 - test3 - test4 - - test5 - test6 - test8 - test9 diff --git a/README.md b/README.md index 841c15b..b3d3b97 100644 --- a/README.md +++ b/README.md @@ -40,8 +40,6 @@ each list element: `openhpc_slurmdbd_host`: Optional. Where to deploy slurmdbd if are using this role to deploy slurmdbd, otherwise where an existing slurmdbd is running. This should be the name of a host in your inventory. Set this to `none` to prevent the role from managing slurmdbd. Defaults to `openhpc_slurm_control_host`. -`openhpc_slurm_configless`: Optional, default false. If true then slurm's ["configless" mode](https://slurm.schedmd.com/configless_slurm.html) is used. - `openhpc_munge_key_b64`: Optional. A base-64 encoded munge key. If not provided then the one generated on package install is used, but the `openhpc_slurm_control_host` must be in the play. `openhpc_login_only_nodes`: Optional. If using "configless" mode specify the name of an ansible group containing nodes which are login-only nodes (i.e. not also control nodes), if required. These nodes will run `slurmd` to contact the control node for config. @@ -50,6 +48,9 @@ each list element: ### slurm.conf +Note that this role always operates in Slurm's [configless mode](https://slurm.schedmd.com/configless_slurm.html), +where the `slurm.conf` configuration file is only present on the control node. + `openhpc_nodegroups`: Optional, default `[]`. List of mappings, each defining a unique set of homogenous nodes: * `name`: Required. Name of node group. 
diff --git a/defaults/main.yml b/defaults/main.yml index bb06672..2de06aa 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -43,7 +43,6 @@ openhpc_default_config: openhpc_config: {} openhpc_gres_template: gres.conf.j2 -openhpc_slurm_configless: "{{ 'enable_configless' in openhpc_config.get('SlurmctldParameters', []) }}" openhpc_state_save_location: /var/spool/slurm diff --git a/molecule/README.md b/molecule/README.md index d96f669..b9085b5 100644 --- a/molecule/README.md +++ b/molecule/README.md @@ -6,24 +6,23 @@ Test options in "Other" column flow down through table unless changed. Test | # Partitions | Groups in partitions? | Other --- | --- | --- | --- -test1 | 1 | N | 2x compute node, sequential names (default test), config on all nodes +test1 | 1 | N | 2x compute node, sequential names (default test) test1b | 1 | N | 1x compute node test1c | 1 | N | 2x compute nodes, nonsequential names test2 | 2 | N | 4x compute node, sequential names test3 | 1 | Y | 4x compute nodes in 2x groups, single partition test4 | 1 | N | 2x compute node, accounting enabled -test5 | 1 | N | As for #1 but configless -test6 | 1 | N | 0x compute nodes, configless +test5 | - | - | [removed, now always configless] +test6 | 1 | N | 0x compute nodes test7 | 1 | N | [removed, image build should just run install.yml task, this is not expected to work] -test8 | 1 | N | 2x compute node, 2x login-only nodes, configless +test8 | 1 | N | 2x compute node, 2x login-only nodes test9 | 1 | N | As test8 but uses `--limit=testohpc-control,testohpc-compute-0` and checks login nodes still end up in slurm.conf -test10 | 1 | N | As for #5 but then tries to add an additional node -test11 | 1 | N | As for #5 but then deletes a node (actually changes the partition due to molecule/ansible limitations) -test12 | 1 | N | As for #5 but enabling job completion and testing `sacct -c` -test13 | 1 | N | As for #5 but tests `openhpc_config` variable. 
-test14 | 1 | N | [removed, extra_nodes removed] -test15 | 1 | Y | As for #5 but also tests `partitions with different name but with the same NodeName`. - +test10 | 1 | N | As for #1 but then tries to add an additional node +test11 | 1 | N | As for #1 but then deletes a node (actually changes the partition due to molecule/ansible limitations) +test12 | 1 | N | As for #1 but enabling job completion and testing `sacct -c` +test13 | 1 | N | As for #1 but tests `openhpc_config` variable. +test14 | - | - | [removed, extra_nodes removed] +test15 | 1 | Y | As for #1 but also tests partitions with different name but with the same NodeName. # Local Installation & Running diff --git a/molecule/test10/converge.yml b/molecule/test10/converge.yml index a11f0b4..9cd234b 100644 --- a/molecule/test10/converge.yml +++ b/molecule/test10/converge.yml @@ -10,7 +10,6 @@ openhpc_nodegroups: - name: "compute" openhpc_cluster_name: testohpc - openhpc_slurm_configless: true tasks: - name: "Include ansible-role-openhpc" include_role: diff --git a/molecule/test10/verify.yml b/molecule/test10/verify.yml index 612b05c..0863879 100644 --- a/molecule/test10/verify.yml +++ b/molecule/test10/verify.yml @@ -32,7 +32,6 @@ openhpc_nodegroups: - name: "compute" openhpc_cluster_name: testohpc - openhpc_slurm_configless: true - name: Check modified cluster has 3x nodes hosts: testohpc_login diff --git a/molecule/test11/converge.yml b/molecule/test11/converge.yml index 77d7342..7f9ec64 100644 --- a/molecule/test11/converge.yml +++ b/molecule/test11/converge.yml @@ -14,4 +14,3 @@ openhpc_nodegroups: - name: "compute_orig" openhpc_cluster_name: testohpc - openhpc_slurm_configless: true diff --git a/molecule/test11/verify.yml b/molecule/test11/verify.yml index 71debb3..a0e4e0b 100644 --- a/molecule/test11/verify.yml +++ b/molecule/test11/verify.yml @@ -29,7 +29,6 @@ openhpc_nodegroups: - name: "compute_new" openhpc_cluster_name: testohpc - openhpc_slurm_configless: true - name: Check modified cluster has 1x 
nodes hosts: testohpc_login diff --git a/molecule/test12/converge.yml b/molecule/test12/converge.yml index 348460b..e08e289 100644 --- a/molecule/test12/converge.yml +++ b/molecule/test12/converge.yml @@ -14,5 +14,4 @@ openhpc_nodegroups: - name: "compute" openhpc_cluster_name: testohpc - openhpc_slurm_configless: true openhpc_slurm_job_comp_type: jobcomp/filetxt diff --git a/molecule/test13/converge.yml b/molecule/test13/converge.yml index 5270c06..1d3c2ff 100644 --- a/molecule/test13/converge.yml +++ b/molecule/test13/converge.yml @@ -10,7 +10,6 @@ openhpc_nodegroups: - name: "compute" openhpc_cluster_name: testohpc - openhpc_slurm_configless: true openhpc_login_only_nodes: 'testohpc_login' openhpc_config: FirstJobId: 13 diff --git a/molecule/test15/converge.yml b/molecule/test15/converge.yml index 28e496c..1b7a4de 100644 --- a/molecule/test15/converge.yml +++ b/molecule/test15/converge.yml @@ -20,7 +20,6 @@ Default: false AllowAccounts: Group_own_thePartition openhpc_cluster_name: testohpc - openhpc_slurm_configless: true tasks: - name: "Include ansible-role-openhpc" include_role: diff --git a/molecule/test5/converge.yml b/molecule/test5/converge.yml deleted file mode 100644 index 58a465d..0000000 --- a/molecule/test5/converge.yml +++ /dev/null @@ -1,17 +0,0 @@ ---- -- name: Converge - hosts: all - vars: - openhpc_enable: - control: "{{ inventory_hostname in groups['testohpc_login'] }}" - batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" - runtime: true - openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}" - openhpc_nodegroups: - - name: "compute" - openhpc_cluster_name: testohpc - openhpc_slurm_configless: true - tasks: - - name: "Include ansible-role-openhpc" - include_role: - name: "{{ lookup('env', 'MOLECULE_PROJECT_DIRECTORY') | basename }}" diff --git a/molecule/test5/molecule.yml b/molecule/test5/molecule.yml deleted file mode 100644 index 4f7c357..0000000 --- a/molecule/test5/molecule.yml +++ /dev/null @@ -1,44 +0,0 @@ ---- 
-driver: - name: podman -platforms: - - name: testohpc-login-0 - image: ${MOLECULE_IMAGE} - pre_build_image: true - groups: - - testohpc_login - command: /sbin/init - tmpfs: - /run: rw - /tmp: rw - volumes: - - /sys/fs/cgroup:/sys/fs/cgroup:ro - network: net1 - - name: testohpc-compute-0 - image: ${MOLECULE_IMAGE} - pre_build_image: true - groups: - - testohpc_compute - command: /sbin/init - tmpfs: - /run: rw - /tmp: rw - volumes: - - /sys/fs/cgroup:/sys/fs/cgroup:ro - network: net1 - - name: testohpc-compute-1 - image: ${MOLECULE_IMAGE} - pre_build_image: true - groups: - - testohpc_compute - command: /sbin/init - tmpfs: - /run: rw - /tmp: rw - volumes: - - /sys/fs/cgroup:/sys/fs/cgroup:ro - network: net1 -provisioner: - name: ansible -verifier: - name: ansible diff --git a/molecule/test5/verify.yml b/molecule/test5/verify.yml deleted file mode 100644 index e3f2476..0000000 --- a/molecule/test5/verify.yml +++ /dev/null @@ -1,12 +0,0 @@ ---- - -- name: Check slurm hostlist - hosts: testohpc_login - tasks: - - name: Get slurm partition info - command: sinfo --noheader --format="%P,%a,%l,%D,%t,%N" # using --format ensures we control whitespace - register: sinfo - - name: - assert: # PARTITION AVAIL TIMELIMIT NODES STATE NODELIST - that: "sinfo.stdout_lines == ['compute*,up,60-00:00:00,2,idle,testohpc-compute-[0-1]']" - fail_msg: "FAILED - actual value: {{ sinfo.stdout_lines }}" diff --git a/molecule/test6/converge.yml b/molecule/test6/converge.yml index 8904633..c023c64 100644 --- a/molecule/test6/converge.yml +++ b/molecule/test6/converge.yml @@ -9,7 +9,6 @@ openhpc_nodegroups: - name: "n/a" openhpc_cluster_name: testohpc - openhpc_slurm_configless: true tasks: - name: "Include ansible-role-openhpc" include_role: diff --git a/molecule/test8/converge.yml b/molecule/test8/converge.yml index 4aad845..5e12f80 100644 --- a/molecule/test8/converge.yml +++ b/molecule/test8/converge.yml @@ -10,7 +10,6 @@ openhpc_nodegroups: - name: "compute" openhpc_cluster_name: testohpc - 
openhpc_slurm_configless: true openhpc_login_only_nodes: 'testohpc_login' tasks: - name: "Include ansible-role-openhpc" diff --git a/molecule/test9/converge.yml b/molecule/test9/converge.yml index 4aad845..5e12f80 100644 --- a/molecule/test9/converge.yml +++ b/molecule/test9/converge.yml @@ -10,7 +10,6 @@ openhpc_nodegroups: - name: "compute" openhpc_cluster_name: testohpc - openhpc_slurm_configless: true openhpc_login_only_nodes: 'testohpc_login' tasks: - name: "Include ansible-role-openhpc" diff --git a/tasks/pre.yml b/tasks/pre.yml index 3c0341c..07d6a62 100644 --- a/tasks/pre.yml +++ b/tasks/pre.yml @@ -1,6 +1,6 @@ -- name: Enable batch on configless login-only nodes +- name: Enable batch on login-only nodes + # TODO: check whether this task can be removed by setting openhpc_enable.batch: true for appliance login nodes instead set_fact: openhpc_enable: "{{ openhpc_enable | combine({'batch': true}) }}" when: - - openhpc_slurm_configless - openhpc_login_only_nodes in group_names diff --git a/tasks/runtime.yml b/tasks/runtime.yml index 0ba2b12..b83c44a 100644 --- a/tasks/runtime.yml +++ b/tasks/runtime.yml @@ -63,7 +63,7 @@ owner: root group: root mode: 0644 - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless | bool + when: openhpc_enable.control | default(false) notify: - Restart slurmctld service register: ohpc_slurm_conf @@ -76,7 +76,7 @@ mode: "0600" owner: slurm group: slurm - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless | bool + when: openhpc_enable.control | default(false) notify: - Restart slurmctld service register: ohpc_gres_conf @@ -90,7 +90,7 @@ mode: "0644" # perms/ownership based off src from ohpc package owner: root group: root - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless | bool + when: openhpc_enable.control | default(false) - name: Remove local tempfile for slurm.conf templating ansible.builtin.file: @@ -132,10 +132,9 @@ - name: Configure slurmd command line options 
vars: slurmd_options_configless: "--conf-server {{ openhpc_slurm_control_host_address | default(openhpc_slurm_control_host) }}" - slurmd_options: "" lineinfile: path: /etc/sysconfig/slurmd - line: "SLURMD_OPTIONS='{{ slurmd_options_configless if openhpc_slurm_configless | bool else slurmd_options }}'" + line: "SLURMD_OPTIONS='{{ slurmd_options_configless }}'" regexp: "^SLURMD_OPTIONS=" create: yes owner: root diff --git a/templates/slurm.conf.j2 b/templates/slurm.conf.j2 index 3ffeff5..ffd4057 100644 --- a/templates/slurm.conf.j2 +++ b/templates/slurm.conf.j2 @@ -3,16 +3,14 @@ ClusterName={{ openhpc_cluster_name }} # PARAMETERS {% for k, v in openhpc_default_config | combine(openhpc_config) | items %} {% if v != "omit" %}{# allow removing items using setting key: null #} -{% if k != 'SlurmctldParameters' %}{# handled separately due to openhpc_slurm_configless #} +{% if k != 'SlurmctldParameters' %}{# handled separately due to configless mode #} {{ k }}={{ v | join(',') if (v is sequence and v is not string) else v }} {% endif %} {% endif %} {% endfor %} -{% set slurmctldparameters = ((openhpc_config.get('SlurmctldParameters', []) + (['enable_configless'] if openhpc_slurm_configless | bool else [])) | unique) %} -{% if slurmctldparameters | length > 0 %} +{% set slurmctldparameters = ((openhpc_config.get('SlurmctldParameters', []) + ['enable_configless']) | unique) %} SlurmctldParameters={{ slurmctldparameters | join(',') }} -{% endif %} # LOGIN-ONLY NODES # Define slurmd nodes not in partitions for login-only nodes in "configless" mode: