diff --git a/README.md b/README.md index e5c472c..90dbbb0 100644 --- a/README.md +++ b/README.md @@ -137,6 +137,18 @@ Default: `true` Type: `bool` +### hpc_azure_disable_predictable_net_names + +Whether to disable predictable network interface names by adding `net.ifnames=0` +to the kernel command line (via the bootloader system role). + +This keeps kernel names such as `ib0`, `ib1`, ... instead of `ibP...` on IPoIB, +but it also affects Ethernet naming (e.g. `eth0` instead of `enP...`). + +Default: `true` + +Type: `bool` + ### hpc_install_system_openmpi Whether to install OpenMPI that comes from AppStream repositories and does not have Nvidia GPU support. diff --git a/defaults/main.yml b/defaults/main.yml index 1b3f67b..d06c878 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -23,6 +23,7 @@ hpc_install_nvidia_fabric_manager: true hpc_install_nvidia_imex: true hpc_install_rdma: true hpc_enable_azure_persistent_rdma_naming: true +hpc_azure_disable_predictable_net_names: true hpc_install_system_openmpi: true hpc_build_openmpi_w_nvidia_gpu_support: true hpc_install_moneo: true diff --git a/tasks/main.yml b/tasks/main.yml index 3eff8fd..72ade5b 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -578,6 +578,21 @@ name: nvidia-imex.service enabled: true +- name: Disable predictable network interface names on Azure (net.ifnames=0) + when: + - hpc_azure_disable_predictable_net_names + block: + - name: Configure net.ifnames via bootloader role + include_role: + name: fedora.linux_system_roles.bootloader + vars: + bootloader_settings: + - kernel: ALL + options: + - name: net.ifnames + value: "0" + state: present + - name: Install RDMA packages when: hpc_install_rdma block: diff --git a/templates/test-azure-health-checks.sh.j2 b/templates/test-azure-health-checks.sh.j2 index 161dae8..69a3cec 100644 --- a/templates/test-azure-health-checks.sh.j2 +++ b/templates/test-azure-health-checks.sh.j2 @@ -206,10 +206,10 @@ test_health_log() { pass "health.log file exists" echo "Checking: health.log for errors" - if grep -Ei "fail|fault|error" "$log_file" | grep -Eiv "success" > /dev/null 2>&1; then + if grep -Eiu '^(fail|fault|error)\b' "$log_file" | grep -Eiv "success" > /dev/null 2>&1; then echo "" echo "--- Error excerpts from health.log ---" - grep -Ei "fail|fault|error" "$log_file" | grep -Eiv "success" | head -20 + grep -Eiu '^(fail|fault|error)\b' "$log_file" | grep -Eiv "success" | head -20 echo "--- End of excerpts ---" echo "" fail "FAIL/FAULT/ERROR found in health.log" diff --git a/tests/tests_default.yml b/tests/tests_default.yml index d5689da..79b28b8 100644 --- a/tests/tests_default.yml +++ b/tests/tests_default.yml @@ -20,6 +20,7 @@ hpc_install_docker: "{{ hpc_install_nvidia_container_toolkit }}" hpc_install_azurehpc_health_checks: "{{ hpc_install_nvidia_container_toolkit }}" hpc_install_diagnostics: false + hpc_azure_disable_predictable_net_names: false tasks: - name: Skip unsupported architectures include_tasks: tasks/skip_unsupported_archs.yml diff --git a/tests/tests_include_vars_from_parent.yml b/tests/tests_include_vars_from_parent.yml index 1b54ad1..3f440ae 100644 --- a/tests/tests_include_vars_from_parent.yml +++ b/tests/tests_include_vars_from_parent.yml @@ -65,6 +65,7 @@ hpc_install_docker: "{{ hpc_install_nvidia_container_toolkit }}" hpc_install_azurehpc_health_checks: "{{ hpc_install_nvidia_container_toolkit }}" hpc_install_diagnostics: false + hpc_azure_disable_predictable_net_names: false - name: Cleanup file: diff --git a/tests/tests_skip_toolkit.yml b/tests/tests_skip_toolkit.yml index 8f603ed..255a422 100644 --- a/tests/tests_skip_toolkit.yml +++ b/tests/tests_skip_toolkit.yml @@ -23,6 +23,7 @@ hpc_install_docker: "{{ hpc_install_nvidia_container_toolkit }}" hpc_install_azurehpc_health_checks: "{{ hpc_install_nvidia_container_toolkit }}" hpc_install_diagnostics: false + hpc_azure_disable_predictable_net_names: false tags: - tests::reboot tasks: