3
3
4
4
# ###################################################################################################
5
5
# This is a sample Dockerfile, with optional stanzas. Please read through this Dockerfile,
6
- # understand what it does, then create your own Dockerfile.
6
+ # understand what it does, then create your own Dockerfile. Software versions are provided for
7
+ # illustration only.
7
8
#
8
9
# Sample build instructions:
9
10
#
19
20
# # Load image to local docker registry -> on head node, or new compute/build node.
20
21
# docker load < /fsx/nvidia-pt-od__latest.tar
21
22
# ###################################################################################################
22
- FROM nvcr.io/nvidia/pytorch:23.12-py3
23
+ FROM nvcr.io/nvidia/pytorch:24.03-py3
23
24
ENV DEBIAN_FRONTEND=noninteractive
24
25
25
26
# The three must-be-built packages.
26
27
# Efa-installer>=1.29.1 required for nccl>=2.19.0 to avoid libfabric NCCL error.
27
- ENV EFA_INSTALLER_VERSION=1.30.0
28
- ENV AWS_OFI_NCCL_VERSION=1.8.1-aws
28
+ ENV EFA_INSTALLER_VERSION=1.32.0
29
+ ENV AWS_OFI_NCCL_VERSION=1.9.1-aws
29
30
ENV NCCL_TESTS_VERSION=master
30
31
31
32
# # Uncomment below when this Dockerfile builds a container image with efa-installer<1.29.1 and
@@ -88,10 +89,13 @@ ENV PATH=/opt/amazon/efa/bin:/opt/amazon/openmpi/bin:$PATH
88
89
# [CUSTOM_NCCL_OPTION_1] Uncomment below stanza to install another NCCL version using the official
89
90
# binaries.
90
91
#
92
+ # Please consult https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html to
93
+ # find out the prebuilt nccl version in the parent image.
94
+ #
91
95
# NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the
92
96
# aws-ofi-nccl.
93
97
# ###################################################################################################
94
- # ENV NCCL_VERSION=2.19.3-1
98
+ # ENV NCCL_VERSION=2.21.5-1
95
99
# RUN cd /opt && \
96
100
# wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb && \
97
101
# dpkg -i cuda-keyring_1.0-1_all.deb && \
@@ -101,17 +105,21 @@ ENV PATH=/opt/amazon/efa/bin:/opt/amazon/openmpi/bin:$PATH
101
105
102
106
103
107
# ###################################################################################################
104
- # [CUSTOM_NCCL_OPTION_2] Install NCCL from source to the same location as the built-in ones. The
105
- # benefits of installing to the same location as the built-in version are:
108
+ # [CUSTOM_NCCL_OPTION_2] Install NCCL from source to the same location as the built-in ones.
109
+ #
110
+ # Please consult https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html to
111
+ # find out the prebuilt nccl version in the parent image.
112
+ #
113
+ # Installation mechanics:
106
114
#
107
- # 1. There's only ever a single libnccl version offered by this image, preventing application from
108
- # mistakenly chooses a wrong version.
109
- # 2. No longer needing extra settings for LD_LIBRARY_PATH or LD_PRELOAD.
115
+ # 1. Remove pre-installed nccl to ensure there's only ever a single libnccl version offered by this
116
+ # image, preventing applications from mistakenly choosing a wrong version.
117
+ # 2. Install to the default location, so no extra LD_LIBRARY_PATH or LD_PRELOAD settings are needed.
110
118
#
111
119
# NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the
112
120
# aws-ofi-nccl.
113
121
# ###################################################################################################
114
- ENV NCCL_VERSION=2.19.3-1
122
+ ENV NCCL_VERSION=2.21.5-1
115
123
RUN apt-get remove -y libnccl2 libnccl-dev \
116
124
&& cd /tmp \
117
125
&& git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \
0 commit comments