From 9c0081d83b05daffb8339a5e293195f1d3210e26 Mon Sep 17 00:00:00 2001
From: Aurelien Bouteiller
Date: Wed, 31 Jan 2024 18:11:36 -0500
Subject: [PATCH 1/2] Prevent CI from running OOM when oversubscribing GPUs

Set PARSEC_MCA_device_cuda_memory_use and PARSEC_MCA_device_hip_memory_use
to 10 in both test steps before invoking ctest.

The variables are exported: a plain `NAME=value` assignment in the run
script only creates a shell variable, which bash does not place in the
environment of the ctest child process (unless the sourced
spack_setup.sh happens to turn on allexport).
---
Note (ignored by git am): the leading whitespace of the hunk lines was
lost in the copy of this patch under review. The indentation below is a
reconstruction and must be checked against the workflow file before
applying.
Note (ignored by git am): the "index" blob ids are carried over from the
original patch and no longer match the edited content; regenerate the
series with git format-patch once the change is committed.

 .github/workflows/build_cmake.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/build_cmake.yml b/.github/workflows/build_cmake.yml
index c6b292f12..75795fcd8 100644
--- a/.github/workflows/build_cmake.yml
+++ b/.github/workflows/build_cmake.yml
@@ -94,6 +94,8 @@ jobs:
       # run: ctest -C $BUILD_TYPE
       run: |
         source ${{github.workspace}}/.github/CI/spack_setup.sh
+        export PARSEC_MCA_device_cuda_memory_use=10
+        export PARSEC_MCA_device_hip_memory_use=10
         ctest --output-on-failure
 
     - name: Save Artifact
@@ -196,6 +198,8 @@ jobs:
       # run: ctest -C $BUILD_TYPE
       run: |
         source ${{github.workspace}}/.github/CI/spack_setup.sh
+        export PARSEC_MCA_device_cuda_memory_use=10
+        export PARSEC_MCA_device_hip_memory_use=10
         ctest --output-on-failure
 
     - name: Save Testing Artifact

From 18a52187463b2f15f2d3ea7f52967a257cc34dc1 Mon Sep 17 00:00:00 2001
From: Aurelien Bouteiller
Date: Thu, 1 Feb 2024 19:06:45 -0500
Subject: [PATCH 2/2] Do not enable devices without explicit parameters doing so in the ctest files

Set PARSEC_MCA_device_cuda_enabled and PARSEC_MCA_device_hip_enabled to 0
in both test steps, so that devices are enabled only in tests that
explicitly require them.

As with the memory_use settings, the variables are exported so that they
are part of the environment ctest passes on to the tests.
---
Note (ignored by git am): the leading whitespace of the hunk lines was
lost in the copy of this patch under review. The indentation below is a
reconstruction and must be checked against the workflow file before
applying.
Note (ignored by git am): the "index" blob ids are carried over from the
original patch and no longer match the edited content; regenerate the
series with git format-patch once the change is committed.

 .github/workflows/build_cmake.yml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.github/workflows/build_cmake.yml b/.github/workflows/build_cmake.yml
index 75795fcd8..ad3552e53 100644
--- a/.github/workflows/build_cmake.yml
+++ b/.github/workflows/build_cmake.yml
@@ -94,6 +94,10 @@ jobs:
       # run: ctest -C $BUILD_TYPE
       run: |
         source ${{github.workspace}}/.github/CI/spack_setup.sh
+        # enable devices only in tests that explicitly require them
+        export PARSEC_MCA_device_cuda_enabled=0
+        export PARSEC_MCA_device_hip_enabled=0
+        # restrict memory use for oversubscribed runners
         export PARSEC_MCA_device_cuda_memory_use=10
         export PARSEC_MCA_device_hip_memory_use=10
         ctest --output-on-failure
@@ -198,6 +202,10 @@ jobs:
       # run: ctest -C $BUILD_TYPE
       run: |
         source ${{github.workspace}}/.github/CI/spack_setup.sh
+        # enable devices only in tests that explicitly require them
+        export PARSEC_MCA_device_cuda_enabled=0
+        export PARSEC_MCA_device_hip_enabled=0
+        # restrict memory use for oversubscribed runners
         export PARSEC_MCA_device_cuda_memory_use=10
         export PARSEC_MCA_device_hip_memory_use=10
         ctest --output-on-failure