Skip to content

Commit

Permalink
GH200 CI (#148)
Browse files Browse the repository at this point in the history
* sync spack recipes from spack@develop

* ci on gh200

* remove handwritten compilers.yaml
  • Loading branch information
simonpintarelli authored and Mathieu Taillefumier committed Dec 11, 2024
1 parent 09fc581 commit 7e2df12
Show file tree
Hide file tree
Showing 6 changed files with 237 additions and 18 deletions.
17 changes: 6 additions & 11 deletions ci/baseimage.cuda.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
FROM ubuntu:22.04 as builder

ARG CUDA_ARCH=60
ARG CUDA_ARCH=90

ENV DEBIAN_FRONTEND noninteractive

Expand All @@ -10,7 +10,7 @@ ENV PATH="/spack/bin:${PATH}"

ENV MPICH_VERSION=3.4.3

ENV CMAKE_VERSION=3.27.9
ENV CMAKE_VERSION=3.30.3

RUN apt-get -y update

Expand All @@ -23,11 +23,12 @@ RUN apt-get install -y --no-install-recommends gcc g++ gfortran clang libomp-14-
liblzma-dev libbz2-dev

# install CMake
RUN wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz -O cmake.tar.gz && \
RUN wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz -O cmake.tar.gz && \
tar zxvf cmake.tar.gz --strip-components=1 -C /usr

#
# get a pinned release of spack
RUN git clone -b v0.21.0 https://github.com/spack/spack.git
RUN git clone -b v0.23.0 https://github.com/spack/spack.git

# set the location of packages built by spack
RUN spack config add config:install_tree:root:/opt/local
Expand All @@ -45,13 +46,7 @@ RUN spack external find --all --exclude python
RUN spack compiler find

# install yq (utility to manipulate the yaml files)
RUN wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_386 && chmod a+x /usr/local/bin/yq

# change the fortran compilers: for gcc the gfortran is already properly set and the change has no effect; add it for clang
RUN yq -i '.compilers[0].compiler.paths.f77 = "/usr/bin/gfortran"' /root/.spack/linux/compilers.yaml && \
yq -i '.compilers[0].compiler.paths.fc = "/usr/bin/gfortran"' /root/.spack/linux/compilers.yaml && \
yq -i '.compilers[1].compiler.paths.f77 = "/usr/bin/gfortran"' /root/.spack/linux/compilers.yaml && \
yq -i '.compilers[1].compiler.paths.fc = "/usr/bin/gfortran"' /root/.spack/linux/compilers.yaml
RUN wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_arm64 && chmod a+x /usr/local/bin/yq

# install MPICH
RUN spack install mpich@${MPICH_VERSION} %gcc
Expand Down
95 changes: 95 additions & 0 deletions ci/daint-alps.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
include:
- remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'

stages:
- baseimage
- build
- test

build base image:
extends: [.dynamic-image-name, .container-builder-cscs-gh200]
stage: baseimage
timeout: 2h
variables:
SLURM_RESERVATION: 'NCCL'
DOCKERFILE: ci/baseimage.cuda.Dockerfile
WATCH_FILECHANGES: ci/baseimage.cuda.Dockerfile
PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/base/cosma-ci

build tiled-mm:
extends: .container-builder-cscs-gh200
needs: ["build base image"]
stage: build
variables:
SLURM_RESERVATION: 'NCCL'
DOCKERFILE: ci/build.Dockerfile
PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/cosma/cosma-ci:$CI_COMMIT_SHA
ENVPATH: "/cosma-env-cuda"
DOCKER_BUILD_ARGS: '["BASE_IMAGE=${BASE_IMAGE}", "ENVPATH=$ENVPATH"]'

.run_tests:
extends: [.container-runner-todi-gh200]
needs: ["build tiled-mm"]
stage: test
image: $CSCS_REGISTRY_PATH/cosma/cosma-ci:$CI_COMMIT_SHA
variables:
GIT_STRATEGY: none
MPICH_MAX_THREAD_SAFETY: multiple
CSCS_REGISTRY_LOGIN: 'YES'
PULL_IMAGE: 'YES'
SLURM_HINT: nomultithread
SLURM_UNBUFFEREDIO: ''
SLURM_RESERVATION: 'NCCL'
SLURM_CPU_BIND: 'socket'
SLURM_MPI: "pmi2"
CRAY_CUDA_MPS: 'YES'
# Workaround after update until hooks are fixed
ENROOT_LIBRARY_PATH: /capstor/scratch/cscs/fmohamed/enrootlibn
# SLURM_WAIT: 0
COSMA_GPU_MAX_TILE_K: 100
COSMA_GPU_MAX_TILE_M: 100
COSMA_GPU_MAX_TILE_N: 100

mapper:
extends: .run_tests
stage: test
script: /cosma-env-cuda/.spack-env/view/bin/test.mapper
variables:
SLURM_JOB_NUM_NODES: 1
SLURM_NTASKS: 1
USE_MPI: 'YES'

pdgemm:
extends: .run_tests
stage: test
script: /cosma-env-cuda/.spack-env/view/bin/test.pdgemm
variables:
SLURM_JOB_NUM_NODES: 2
SLURM_NTASKS: 16
USE_MPI: 'YES'

multiply:
extends: .run_tests
stage: test
script: /cosma-env-cuda/.spack-env/view/bin/test.multiply
variables:
SLURM_JOB_NUM_NODES: 2
SLURM_NTASKS: 16
USE_MPI: 'YES'

scalar_matmul:
extends: .run_tests
stage: test
script: /cosma-env-cuda/.spack-env/view/bin/test.scalar_matmul
variables:
SLURM_JOB_NUM_NODES: 1
SLURM_NTASKS: 8
USE_MPI: 'YES'

# Runs the test.multiply_using_layout binary on 1 node with 4 Slurm tasks.
# NOTE(review): unlike the other test jobs in this file, this job does not set
# USE_MPI: 'YES' — confirm whether that omission is intentional.
multiply_using_layout:
extends: .run_tests
stage: test
script: /cosma-env-cuda/.spack-env/view/bin/test.multiply_using_layout
variables:
SLURM_JOB_NUM_NODES: 1
SLURM_NTASKS: 4
16 changes: 16 additions & 0 deletions ci/mps-wrapper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash
# Start the NVIDIA MPS control daemon once per node, then run the given command
# with its visible CUDA device and memory binding derived from the NUMA node(s)
# this task's CPU affinity intersects.
#
# Example mps-wrapper.sh usage:
# > srun --cpu-bind=socket [...] mps-wrapper.sh <cmd>

export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps
export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log

# Launch MPS from a single rank per node. SLURM_LOCALID is quoted and defaulted
# so the test stays well-formed when the variable is unset or empty (e.g. the
# wrapper is run outside of srun); in that case this process acts as rank 0.
if [ "${SLURM_LOCALID:-0}" -eq 0 ]; then
    CUDA_VISIBLE_DEVICES=0,1,2,3 nvidia-cuda-mps-control -d
fi

# Set the CUDA device: take the affinity mask of this shell (6th field of the
# `taskset -p` output, prefixed with 0x) and ask hwloc which physical NUMA
# node index(es) it intersects.
affinity_mask=$(taskset -p $$ | awk '{print "0x"$6}')
numa_nodes=$(hwloc-calc --physical --intersect NUMAnode "$affinity_mask")
export CUDA_VISIBLE_DEVICES=$numa_nodes

# Run the command with memory bound to the same NUMA node(s); exec replaces
# this shell with the command.
exec numactl --membind="$numa_nodes" "$@"
104 changes: 104 additions & 0 deletions spack/packages/cosma/fj-ssl2.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1fd1e55..41a041b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,7 +19,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS "YES") # always write compile_commands.json

set(COSMA_GPU_BACKENDS_LIST "CUDA" "ROCM")
set(COSMA_SCALAPACK_LIST "OFF" "MKL" "CRAY_LIBSCI" "CUSTOM")
-set(COSMA_BLAS_LIST "auto" "MKL" "OPENBLAS" "CRAY_LIBSCI" "CUSTOM" "BLIS" "ATLAS" "CUDA" "ROCM" "OFF")
+set(COSMA_BLAS_LIST "auto" "MKL" "SSL2" "OPENBLAS" "CRAY_LIBSCI" "CUSTOM" "BLIS" "ATLAS" "CUDA" "ROCM" "OFF")
option(COSMA_WITH_TESTS "Generate the test target." ON)
option(COSMA_WITH_APPS "Generate the miniapp targets." ON)
option(COSMA_WITH_BENCHMARKS "Generate the benchmark targets." ON)
@@ -45,7 +45,7 @@ if (COSMA_BLAS MATCHES "CUDA|ROCM")
set(COSMA_GPU_BACKEND ${COSMA_BLAS})
else()
if(COSMA_BLAS STREQUAL "OFF")
- message(FATAL_ERROR "A Blas implementation is needed when running on CPU only: choices are : auto, MKL, OPENBLAS, CRAY_LIBSCI, CUSTOM, BLIS, ATLAS, FLEXIBLAS, ARMPL, GenericBLAS")
+ message(FATAL_ERROR "A Blas implementation is needed when running on CPU only: choices are : auto, MKL, SSL2, OPENBLAS, CRAY_LIBSCI, CUSTOM, BLIS, ATLAS, FLEXIBLAS, ARMPL, GenericBLAS")
else()
set(COSMA_BLAS_VENDOR ${COSMA_BLAS})
endif()
@@ -190,6 +190,7 @@ install(FILES "${cosma_BINARY_DIR}/cosmaConfig.cmake"
"${cosma_BINARY_DIR}/cosmaConfigVersion.cmake"
"${cosma_BINARY_DIR}/cosmaConfigVersion.cmake"
"${cosma_SOURCE_DIR}/cmake/FindMKL.cmake"
+ "${cosma_SOURCE_DIR}/cmake/FindSSL2.cmake"
"${cosma_SOURCE_DIR}/cmake/FindBlas.cmake"
"${cosma_SOURCE_DIR}/cmake/FindSCALAPACK.cmake"
"${cosma_SOURCE_DIR}/cmake/FindOPENBLAS.cmake"
diff --git a/cmake/FindBlas.cmake b/cmake/FindBlas.cmake
index aef956c..3c47561 100644
--- a/cmake/FindBlas.cmake
+++ b/cmake/FindBlas.cmake
@@ -14,6 +14,7 @@ endif()
set(COSMA_BLAS_VENDOR_LIST
"auto"
"MKL"
+ "SSL2"
"OPENBLAS"
"FLEXIBLAS"
"ARMPL"
diff --git a/cmake/FindSSL2.cmake b/cmake/FindSSL2.cmake
new file mode 100644
index 0000000..f0e11bf
--- /dev/null
+++ b/cmake/FindSSL2.cmake
@@ -0,0 +1,56 @@
+#.rst:
+# FindSSL2
+# -----------
+#
+# This module tries to find the SSL2 library.
+#
+# The following variables are set
+#
+# ::
+#
+# SSL2_FOUND - True if ssl2 is found
+# COSMA_SSL2_LINK_LIBRARIES - The required libraries
+# COSMA_SSL2_INCLUDE_DIRS - The required include directory
+#
+# The following import target is created
+#
+# ::
+#
+# cosma::BLAS::SSL2::ssl2
+
+# Set paths to look for the library from the ROOT variables. If the new policy is set, find_library() automatically uses them.
+# if(NOT POLICY CMP0074)
+set(_SSL2_PATHS ${SSL2_ROOT}
+ $ENV{SSL2_ROOT}
+ $ENV{SSL2ROOT}
+ $ENV{SSL2_DIR}
+ $ENV{SSL2DIR})
+# endif()
+
+find_library(
+ COSMA_SSL2_LINK_LIBRARIES
+ NAMES "fjlapackex"
+ HINTS ${_SSL2_PATHS}
+ PATH_SUFFIXES "lib64"
+)
+find_path(
+ COSMA_SSL2_INCLUDE_DIRS
+ NAMES "cblas.h"
+ HINTS ${_SSL2_PATHS}
+ PATH_SUFFIXES "include"
+)
+
+# check if found
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(SSL2 REQUIRED_VARS COSMA_SSL2_INCLUDE_DIRS COSMA_SSL2_LINK_LIBRARIES)
+
+# add target to link against
+if(NOT TARGET cosma::BLAS::SSL2::ssl2)
+ add_library(cosma::BLAS::SSL2::ssl2 INTERFACE IMPORTED)
+ add_library(cosma::BLAS::SSL2::blas ALIAS cosma::BLAS::SSL2::ssl2)
+endif()
+set_property(TARGET cosma::BLAS::SSL2::ssl2 PROPERTY INTERFACE_LINK_LIBRARIES ${COSMA_SSL2_LINK_LIBRARIES})
+set_property(TARGET cosma::BLAS::SSL2::ssl2 PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${COSMA_SSL2_INCLUDE_DIRS})
+
+# prevent clutter in cache: hide the entries created by find_library()/find_path() above
+mark_as_advanced(COSMA_SSL2_LINK_LIBRARIES COSMA_SSL2_INCLUDE_DIRS)
21 changes: 14 additions & 7 deletions spack/packages/cosma/package.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2013-2023 Lawrence Livermore National Security, LLC and other
# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other
# Spack Project Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
Expand All @@ -17,13 +17,15 @@ class Cosma(CMakePackage):
url = "https://github.com/eth-cscs/COSMA/archive/refs/tags/v2.6.6.tar.gz"
git = "https://github.com/eth-cscs/COSMA.git"

license("BSD-3-Clause")

# note: The default archives produced with github do not have the archives
# of the submodules.
version("master", branch="master", submodules=False)
version("2.6.6", sha256="1604be101e77192fbcc5551236bc87888d336e402f5409bbdd9dea900401cc37")
version("2.6.5", sha256="10d9b7ecc1ce44ec5b9e0c0bf89278a63029912ec3ea99661be8576b553ececf")
version("2.6.4", sha256="6d7bd5e3005874af9542a329c93e7ccd29ca1a5573dae27618fac2704fa2b6ab")
version("2.6.3", sha256="8ca96ca41458f1e9d0da70d524c5a03c677dba7238d23a578f852163b6d45ac9")
version("2.6.3", sha256="c2a3735ea8f860930bea6706d968497d72a1be0498c689b5bc4a951ffc2d1146")
version("2.6.2", sha256="2debb5123cc35aeebc5fd2f8a46cfd6356d1e27618c9bb57129ecd09aa400940")
version("2.6.1", sha256="69aa6634a030674f0d9be61e7b0bf0dc17acf0fc9e7a90b40e3179e2254c8d67")
version("2.5.1", sha256="085b7787597374244bbb1eb89bc69bf58c35f6c85be805e881e1c0b25166c3ce")
Expand All @@ -34,12 +36,14 @@ class Cosma(CMakePackage):
version("2.0.7", sha256="8d70bfcbda6239b6a8fbeaca138790bbe58c0c3aa576879480d2632d4936cf7e")
version("2.0.2", sha256="4f3354828bc718f3eef2f0098c3bdca3499297497a220da32db1acd57920c68d")

depends_on("cxx", type="build") # generated

# We just need the libraries of cuda and rocm, so no need to extend
# CudaPackage or ROCmPackage.
variant("cuda", default=False, description="Build with cuBLAS support")
variant("rocm", default=False, description="Build with rocBLAS support")
variant("scalapack", default=False, description="Build with ScaLAPACK API")
variant("shared", default=False, description="Build the shared library version")
variant("shared", default=True, description="Build the shared library version")
variant("tests", default=False, description="Build tests")
variant("apps", default=False, description="Build miniapp")
variant("profiling", default=False, description="Enable profiling")
Expand Down Expand Up @@ -76,8 +80,10 @@ class Cosma(CMakePackage):
depends_on("semiprof", when="+profiling")
depends_on("costa+profiling", when="+profiling")

patch("fj-ssl2.patch", when="^fujitsu-ssl2")

def setup_build_environment(self, env):
if "+cuda" in self.spec:
if self.spec.satisfies("+cuda"):
env.set("CUDA_PATH", self.spec["cuda"].prefix)

def cosma_blas_cmake_arg(self):
Expand All @@ -89,6 +95,7 @@ def cosma_blas_cmake_arg(self):
("^cray-libsci", "CRAY_LIBSCI"),
("^netlib-lapack", "CUSTOM"),
("^openblas", "OPENBLAS"),
("^fujitsu-ssl2", "SSL2"),
]

if self.version >= Version("2.4.0"):
Expand All @@ -105,11 +112,11 @@ def cosma_blas_cmake_arg(self):
def cosma_scalapack_cmake_arg(self):
spec = self.spec

if "~scalapack" in spec:
if spec.satisfies("~scalapack"):
return "OFF"
elif "^intel-mkl" in spec or "^intel-oneapi-mkl" in spec:
elif spec.satisfies("^intel-mkl") or spec.satisfies("^intel-oneapi-mkl"):
return "MKL"
elif "^cray-libsci" in spec:
elif spec.satisfies("^cray-libsci"):
return "CRAY_LIBSCI"

return "CUSTOM"
Expand Down
2 changes: 2 additions & 0 deletions spack/packages/tiled-mm/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ class TiledMm(CMakePackage, CudaPackage, ROCmPackage):
version("2.2", sha256="6d0b49c9588ece744166822fd44a7bc5bec3dc666b836de8bf4bf1a7bb675aac")
version("2.0", sha256="ea554aea8c53d7c8e40044e6d478c0e8137d7e8b09d7cb9650703430d92cf32e")

depends_on("cxx", type="build") # generated

variant("shared", default=True, description="Build shared libraries")
variant("examples", default=False, description="Enable examples")
variant("tests", default=False, description="Enable tests")
Expand Down

0 comments on commit 7e2df12

Please sign in to comment.