Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] new CUDA CI+development Docker container #1162

Draft
wants to merge 30 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
9f8f365
update CI Docker container
BenWibking Aug 27, 2024
6581306
update OpenMPI
BenWibking Aug 27, 2024
5add7b7
revert to OpenMPI 4.1.4
BenWibking Aug 27, 2024
d8b2fdc
add ADIOS2+openPMD
BenWibking Aug 27, 2024
8acc717
add c-blosc ubuntu package
BenWibking Aug 27, 2024
7a39059
fix Dockerfile.nvcc
BenWibking Aug 27, 2024
e6bcad2
fix dockerfile
BenWibking Aug 27, 2024
cec0c64
install python headers
BenWibking Aug 27, 2024
e36f245
install cmake from apt (required for aarch64)
BenWibking Aug 27, 2024
fb8e957
remove duplicate cmake dep
BenWibking Aug 27, 2024
ebe3f9b
disable ascent build
BenWibking Aug 27, 2024
18640dc
add newer ascent version
BenWibking Aug 28, 2024
494048b
disable ascent build; fix openpmd build
BenWibking Aug 28, 2024
9b73434
downgrade to CUDA 12.0
BenWibking Aug 28, 2024
71416f5
fix ascent build path
BenWibking Aug 28, 2024
93c2045
fix bug in build_ascent.sh
BenWibking Aug 29, 2024
804faf4
remove unneeded patches
BenWibking Aug 29, 2024
ce13748
ascent complains if MFEM is not built
BenWibking Aug 29, 2024
cb08a39
control cuda support for ascent with env var
BenWibking Aug 29, 2024
16e4751
add MAKEOPTS=--output-sync=target
BenWibking Aug 29, 2024
3eae3ef
add comment to Dockerfile
BenWibking Aug 29, 2024
d700e1c
Merge branch 'develop' into BenWibking/update-cuda-ci-container
BenWibking Aug 29, 2024
f4dbf1a
Downgrade numpy
pgrete Aug 30, 2024
e9fe3cc
Fix ADIOS2 and OpenPMD versions
pgrete Aug 30, 2024
4d2bf95
Directly use Ascent script with small patch
pgrete Aug 30, 2024
3ef7b5c
Use Cuda12.1 container and drop to local user
pgrete Aug 30, 2024
79595f2
add emacs and vi
BenWibking Aug 31, 2024
d05d48c
set build_jobs=`nproc` to avoid OOM kill
BenWibking Aug 31, 2024
ba7827d
add developer tools for Codespaces/VSCode
BenWibking Aug 31, 2024
652daf3
add devcontainer.json
BenWibking Aug 31, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// devcontainer.json
// Dev container definition for the "parthenon-dev" environment: selects the
// container image, states host requirements, and configures VS Code.
{
"name": "parthenon-dev",
// NOTE(review): the image name still says "cuda11.6" — confirm this is the
// name the container is actually published under.
"image": "ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent",
// Host resources requested for this container (4 CPUs).
"hostRequirements": {
"cpus": 4
},
"customizations": {
"vscode": {
"settings": {},
// VS Code extensions for the container.
// NOTE(review): the leading "-" on ms-vscode.cpptools presumably opts out of
// that extension (clangd is listed instead) — confirm against the Dev
// Containers documentation.
"extensions": [
"-ms-vscode.cpptools",
"llvm-vs-code-extensions.vscode-clangd",
"github.vscode-pull-request-github",
"ms-python.python",
"ms-toolsai.jupyter",
"ms-vscode.live-server",
"ms-azuretools.vscode-docker",
"swyddfa.esbonio",
"tomoki1207.pdf",
"ms-vscode.cmake-tools",
"ms-vsliveshare.vsliveshare"
]
}
},
// Environment for processes started by the editor/tooling:
// - PATH: the container's PATH with /usr/local/hdf5/parallel/bin appended.
// - OMPI_MCA_opal_warn_on_missing_libcuda: presumably silences Open MPI's
//   warning when libcuda is unavailable (e.g. hosts without a GPU) — confirm.
"remoteEnv": {
"PATH": "${containerEnv:PATH}:/usr/local/hdf5/parallel/bin",
"OMPI_MCA_opal_warn_on_missing_libcuda": "0"
},
// Disabled: explicit remote user selection.
//"remoteUser": "ubuntu",
// we need to manually checkout the submodules,
// but VSCode may try to configure CMake before they are fully checked-out.
// workaround TBD
"postCreateCommand": "git submodule update --init"
}
72 changes: 57 additions & 15 deletions scripts/docker/Dockerfile.nvcc
Original file line number Diff line number Diff line change
@@ -1,23 +1,36 @@
FROM nvidia/cuda:11.6.1-devel-ubuntu20.04
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04

RUN apt-get clean && apt-get update -y && \
DEBIAN_FRONTEND="noninteractive" TZ=America/New_York apt-get install -y --no-install-recommends git python3-minimal libpython3-stdlib bc hwloc wget openssh-client python3-numpy python3-h5py python3-matplotlib python3-scipy python3-pip lcov curl cuda-nsight-systems-11-6 cmake ninja-build
DEBIAN_FRONTEND="noninteractive" TZ=America/New_York apt-get install -y --no-install-recommends git python3-minimal libpython3-stdlib bc hwloc wget openssh-client python3-numpy python3-h5py python3-matplotlib python3-scipy python3-pip lcov curl cuda-nsight-systems-12-6 cmake ninja-build libpython3-dev gcc-11 g++-11 emacs nvi sphinx-doc python3-sphinx-rtd-theme python3-sphinxcontrib.bibtex python3-sphinx-copybutton && \
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 10 && \
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 10

# Print the version of the g++ found on PATH into the build log
# (presumably a sanity check that the gcc-11/g++-11 alternatives took effect).
RUN g++ --version

# Python packages installed with pip.
# --no-cache-dir keeps pip's download/wheel cache out of the image layers.
# The installs stay in separate RUN steps, in this order, so the numpy pin
# below is applied last.
RUN pip3 install --no-cache-dir unyt

RUN pip3 install --no-cache-dir blosc2

# for Codespaces/VSCode Sphinx support
RUN pip3 install --no-cache-dir esbonio

# h5py from the repo is incompatible with the default numpy 2.1.0
# Downgrading is not the cleanest solution, but it works...
# see https://stackoverflow.com/questions/78634235/numpy-dtype-size-changed-may-indicate-binary-incompatibility-expected-96-from
RUN pip3 install --no-cache-dir numpy==1.26.4

RUN wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key| apt-key add - && \
echo "deb http://apt.llvm.org/focal/ llvm-toolchain-focal-15 main" > /etc/apt/sources.list.d/llvm.list
echo "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-19 main" > /etc/apt/sources.list.d/llvm.list

RUN apt-get clean && apt-get update -y && \
DEBIAN_FRONTEND="noninteractive" TZ=America/New_York apt-get install -y --no-install-recommends clang-15 llvm-15 libomp-15-dev && \
DEBIAN_FRONTEND="noninteractive" TZ=America/New_York apt-get install -y --no-install-recommends clang-19 llvm-19 libomp-19-dev clangd-19 && \
rm -rf /var/lib/apt/lists/*


RUN cd /tmp && \
wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.4.tar.bz2 && \
tar xjf openmpi-4.1.4.tar.bz2 && \
cd openmpi-4.1.4 && \
./configure --prefix=/opt/openmpi --enable-mpi-cxx --with-cuda && \
./configure --prefix=/opt/openmpi --disable-mpi-fortran --disable-oshmem --with-cuda && \
make -j16 && \
make install && \
cd / && \
Expand All @@ -36,19 +49,48 @@ RUN cd /tmp && \
cd / && \
rm -rf /tmp/hdf5-1.12.2*

RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 10
# Build and install ADIOS2 v2.10.1 from the GitHub release tarball.
# Configured with Blosc2 support enabled and Fortran bindings disabled; no
# install prefix is passed, so CMake's default prefix is used.
# Built with 16 parallel make jobs. The scratch directory is removed in the
# same RUN step so the sources and build tree do not end up in the layer.
RUN mkdir /tmp/build-adios2 && cd /tmp/build-adios2 && \
wget https://github.com/ornladios/ADIOS2/archive/refs/tags/v2.10.1.tar.gz && \
tar xzf v2.10.1.tar.gz && \
mkdir adios2-build && cd adios2-build && \
cmake ../ADIOS2-2.10.1 -DADIOS2_USE_Blosc2=ON -DADIOS2_USE_Fortran=OFF && \
make -j 16 && make install && \
cd / && \
rm -rf /tmp/build-adios2

# Build and install openPMD-api from a pinned commit archive.
# commit version is dev branch on 2024-08-30
# Configured with the Python bindings enabled (using the python3 found on
# PATH) and with ADIOS2 support; no install prefix is passed, so CMake's
# default prefix is used. Built with 16 parallel jobs. The scratch directory
# is removed in the same RUN step so it does not end up in the layer.
RUN mkdir /tmp/build-openpmd && cd /tmp/build-openpmd && \
wget https://github.com/openPMD/openPMD-api/archive/1c7d7ff.tar.gz && \
tar xzf 1c7d7ff.tar.gz && \
mkdir openPMD-api-build && cd openPMD-api-build && \
cmake ../openPMD-api-1c7d7ffc5ef501e1d2dcbd5169b3e5eff677b399 -DopenPMD_USE_PYTHON=ON -DPython_EXECUTABLE=$(which python3) -DopenPMD_USE_ADIOS2=ON && \
cmake --build . -j 16 && \
cmake --build . --target install && \
cd / && \
rm -rf /tmp/build-openpmd

RUN curl -L https://github.com/Kitware/CMake/releases/download/v3.23.2/cmake-3.23.2-linux-x86_64.tar.gz -o cmake-3.23.2-linux-x86_64.tar.gz && \
tar -xzf cmake-3.23.2-linux-x86_64.tar.gz -C /opt
RUN mkdir /tmp/build-ascent

ENV PATH=/opt/cmake-3.23.2-linux-x86_64/bin:$PATH
COPY ascent_build.patch /tmp/build-ascent

COPY build_ascent_cuda.sh /tmp/build-ascent/build_ascent_cuda.sh
## NOTE: with enable_cuda=ON, you need a Docker VM with a LARGE amount of RAM (at least 15 GB RAM, 4 GB swap)

# commit version is dev branch on 2024-08-30
RUN cd /tmp/build-ascent && \
bash build_ascent_cuda.sh && \
wget https://github.com/Alpine-DAV/ascent/archive/dc2ec9c.tar.gz && \
tar xzf dc2ec9c.tar.gz -C . --strip-components=1 && \
wget https://github.com/LLNL/blt/archive/9ff7734.tar.gz && \
tar xzf 9ff7734.tar.gz -C ./src/blt --strip-components=1 && \
cd ./scripts/build_ascent && \
patch -p1 build_ascent.sh /tmp/build-ascent/ascent_build.patch && \
env enable_cuda=ON enable_mpi=ON build_hdf5=false build_silo=false bash build_ascent.sh && \
cd / && \
rm -rf /tmp/build-ascent
rm -rf /tmp/ascent_build

# Create the unprivileged `ci` user and switch to it.
# uid 1000 maps to the one running the container on the CI host
# NOTE(review): gid 109 for `render` presumably mirrors the CI host's render
# group (GPU device access) — confirm against the host.
RUN groupadd -g 109 render
# `ci` gets a home directory and a bash login shell, and is added to the
# supplementary groups `render` and `sudo`.
RUN useradd --create-home --shell /bin/bash -u 1000 -G render,sudo ci

# Subsequent instructions, and by default containers started from this
# image, run as `ci` instead of root.
USER ci

# manually downgrade numpy as deprecated `typeDict` is still used by h5py
RUN pip install numpy==1.21
WORKDIR /home/ci
49 changes: 49 additions & 0 deletions scripts/docker/ascent_build.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
--- build_ascent.sh 2024-08-29 21:00:24.000000000 +0000
+++ build_ascent_parthenon.sh 2024-08-30 09:55:58.976365723 +0000
@@ -21,6 +21,8 @@
# Build Options
##############################################################################

+export MAKEFLAGS="--output-sync=target"
+
# shared options
enable_cuda="${enable_cuda:=OFF}"
enable_hip="${enable_hip:=OFF}"
@@ -31,7 +33,7 @@
enable_find_mpi="${enable_find_mpi:=ON}"
enable_tests="${enable_tests:=OFF}"
enable_verbose="${enable_verbose:=ON}"
-build_jobs="${build_jobs:=6}"
+build_jobs=`nproc`
build_config="${build_config:=Release}"
build_shared_libs="${build_shared_libs:=ON}"

@@ -126,8 +128,8 @@
root_dir=$(ospath ${root_dir})
root_dir=$(abs_path ${root_dir})
script_dir=$(abs_path "$(dirname "${BASH_SOURCE[0]}")")
-build_dir=$(ospath ${root_dir}/build)
-source_dir=$(ospath ${root_dir}/source)
+build_dir=$(ospath build)
+source_dir=$(ospath source)


# root_dir is where we will build and install
@@ -140,7 +142,7 @@

# install_dir is where we will install
# override with `prefix` env var
-install_dir="${install_dir:=$root_dir/install}"
+install_dir=/usr/local

echo "*** prefix: ${root_dir}"
echo "*** build root: ${build_dir}"
@@ -231,7 +233,7 @@
hdf5_short_version=1.14
hdf5_src_dir=$(ospath ${source_dir}/hdf5-${hdf5_version})
hdf5_build_dir=$(ospath ${build_dir}/hdf5-${hdf5_version}/)
-hdf5_install_dir=$(ospath ${install_dir}/hdf5-${hdf5_version}/)
+hdf5_install_dir=/usr/local/hdf5/serial
hdf5_tarball=$(ospath ${source_dir}/hdf5-${hdf5_version}.tar.gz)

# build only if install doesn't exist
Loading
Loading