diff --git a/README.md b/README.md
index f5c3b1e..d45d395 100644
--- a/README.md
+++ b/README.md
@@ -115,6 +115,18 @@ Default: `true`
 
 Type: `bool`
 
+### hpc_install_nvidia_dcgm
+
+Whether to install the NVIDIA Data Center GPU Manager (DCGM) and enable its `nvidia-dcgm` service.
+
+NVIDIA DCGM is a GPU monitoring and management toolkit for large-scale GPU deployments. Install DCGM on all GPU nodes in an HPC cluster to maintain reliability and monitor GPU health.
+
+Run `dcgmi` on the GPU nodes, e.g. `dcgmi discovery -l` to list the GPUs on the node.
+
+Default: `true`
+
+Type: `bool`
+
 ### hpc_install_rdma
 
 Whether to install the NVIDIA RDMA package.
diff --git a/defaults/main.yml b/defaults/main.yml
index 3332de3..5e93ac0 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -21,6 +21,7 @@ hpc_install_cuda_toolkit: true
 hpc_install_hpc_nvidia_nccl: true
 hpc_install_nvidia_fabric_manager: true
 hpc_install_nvidia_imex: true
+hpc_install_nvidia_dcgm: true
 hpc_install_rdma: true
 hpc_enable_azure_persistent_rdma_naming: true
 hpc_azure_disable_predictable_net_names: true
diff --git a/tasks/main.yml b/tasks/main.yml
index 4e0dd9d..22091f1 100644
--- a/tasks/main.yml
+++ b/tasks/main.yml
@@ -593,6 +593,25 @@
         value: "0"
         state: present
 
+- name: Install NVIDIA datacenter GPU manager and enable service
+  when: hpc_install_nvidia_dcgm
+  block:
+    - name: Install NVIDIA datacenter GPU manager
+      package:
+        name: "{{ __hpc_nvidia_dcgm }}"
+        state: present
+        use: "{{ (__hpc_server_is_ostree | d(false)) |
+          ternary('ansible.posix.rhel_rpm_ostree', omit) }}"
+      register: __hpc_nvidia_dcgm_install
+      until: __hpc_nvidia_dcgm_install is success
+
+    # Only `enabled` is set (no `state`), so the unit is enabled for boot
+    # but is not started by this task.
+    - name: Ensure dcgm service is enabled
+      service:
+        name: nvidia-dcgm
+        enabled: true
+
 - name: Install RDMA packages
   when: hpc_install_rdma
   block:
diff --git a/vars/RedHat_9.yml b/vars/RedHat_9.yml
index e82beed..d3e6657 100644
--- a/vars/RedHat_9.yml
+++ b/vars/RedHat_9.yml
@@ -34,6 +34,8 @@ __hpc_nvidia_nccl_packages:
 __hpc_docker_packages:
   - moby-engine-29.1.4-1.el9
   - moby-cli-29.1.4-1.el9
+__hpc_nvidia_dcgm:
+  - datacenter-gpu-manager-4-cuda12
 
 # Vars related to building packages from source
 __hpc_gdrcopy_info: