From 195b8b552bc2b1ce76a842a7b2ed61b88cbd91df Mon Sep 17 00:00:00 2001 From: Yaju Cao Date: Tue, 17 Mar 2026 11:05:38 +0800 Subject: [PATCH] feat: Add NVIDIA DCGM installation NVIDIA DCGM is a GPU monitoring and management toolkit for large-scale GPU deployments. Install DCGM on all GPU nodes in an HPC cluster to maintain reliability and monitor GPU health. Based on the current CUDA 12.9 environment, install the corresponding version: datacenter-gpu-manager-4-cuda12. This package requires NVIDIA GPU drivers to be installed. After the package is installed, it configures the nvidia-dcgm system service to monitor GPU status and provides the 'dcgmi' command line tool for users. Signed-off-by: Yaju Cao --- README.md | 12 ++++++++++++ defaults/main.yml | 1 + tasks/main.yml | 17 +++++++++++++++++ vars/RedHat_9.yml | 2 ++ 4 files changed, 32 insertions(+) diff --git a/README.md b/README.md index f5c3b1e..d45d395 100644 --- a/README.md +++ b/README.md @@ -115,6 +115,18 @@ Default: `true` Type: `bool` +### hpc_install_nvidia_dcgm + +Whether to install the NVIDIA Data Center GPU Manager (DCGM) and enable its `nvidia-dcgm` service. + +NVIDIA DCGM is a GPU monitoring and management toolkit for large-scale GPU deployments. Install DCGM on all GPU nodes in an HPC cluster to maintain reliability and monitor GPU health. + +Run `dcgmi` on the GPU nodes, e.g. `dcgmi discovery -l`, to list the GPUs on a node. + +Default: `true` + +Type: `bool` + ### hpc_install_rdma Whether to install the NVIDIA RDMA package. 
diff --git a/defaults/main.yml b/defaults/main.yml index 3332de3..5e93ac0 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -21,6 +21,7 @@ hpc_install_cuda_toolkit: true hpc_install_hpc_nvidia_nccl: true hpc_install_nvidia_fabric_manager: true hpc_install_nvidia_imex: true +hpc_install_nvidia_dcgm: true hpc_install_rdma: true hpc_enable_azure_persistent_rdma_naming: true hpc_azure_disable_predictable_net_names: true diff --git a/tasks/main.yml b/tasks/main.yml index 4e0dd9d..22091f1 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -593,6 +593,23 @@ value: "0" state: present +- name: Install NVIDIA datacenter GPU manager and enable service + when: hpc_install_nvidia_dcgm + block: + - name: Install NVIDIA datacenter GPU manager + package: + name: "{{ __hpc_nvidia_dcgm_packages }}" + state: present + use: "{{ (__hpc_server_is_ostree | d(false)) | + ternary('ansible.posix.rhel_rpm_ostree', omit) }}" + register: __hpc_nvidia_dcgm_packages_install + until: __hpc_nvidia_dcgm_packages_install is success + + - name: Ensure dcgm service is enabled + service: + name: nvidia-dcgm + enabled: true + - name: Install RDMA packages when: hpc_install_rdma block: diff --git a/vars/RedHat_9.yml b/vars/RedHat_9.yml index e82beed..d3e6657 100644 --- a/vars/RedHat_9.yml +++ b/vars/RedHat_9.yml @@ -34,6 +34,8 @@ __hpc_nvidia_nccl_packages: __hpc_docker_packages: - moby-engine-29.1.4-1.el9 - moby-cli-29.1.4-1.el9 +__hpc_nvidia_dcgm_packages: + - datacenter-gpu-manager-4-cuda12 # Vars related to building packages from source __hpc_gdrcopy_info: