Skip to content

Commit a6695cb

Browse files
authored
Templatized build & start from TF 2.6 base image. (#1078)
* Templatize build & start from TF base image. - Build CPU & GPU images in parallel (faster build time) - The GPU image is no longer layered on top of the CPU image (small GPU image, simpler because no need to install CUDA ourselves) - Upgrade TensorFlow to 2.6 http://b/167268016 * remove base-tag, re-org Jenkinsfile * fix jenkinsfile * remove blank lines * Increase GPU build time * install torchaudio/torchtext on GPU build * Turn off KMP_AFFINITY logs * remove horovod * Move uninstall statement after clean-layer.sh has been added * remove uninstall lightgbm statement for gpu now that it's templatized * Remove CPU & GPU Dockerfiles * Remove duplicated code & resolved TODO
1 parent 760f305 commit a6695cb

File tree

6 files changed

+191
-217
lines changed

6 files changed

+191
-217
lines changed

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
*.pyc
22
.idea/
33
.vscode
4-
.mypy_cache
4+
.mypy_cache
5+
.generated

Dockerfile renamed to Dockerfile.tmpl

+83-27
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,26 @@
1-
ARG BASE_TAG=m78
2-
ARG TENSORFLOW_VERSION=2.4.1
3-
4-
FROM gcr.io/deeplearning-platform-release/base-cpu:${BASE_TAG}
5-
6-
# We need to redefine TENSORFLOW_VERSION here to get the default ARG value defined above the FROM instruction.
7-
# See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact
8-
ARG TENSORFLOW_VERSION
1+
{{ if eq .Accelerator "gpu" }}
2+
FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m78
3+
ENV CUDA_MAJOR_VERSION=11
4+
ENV CUDA_MINOR_VERSION=0
5+
{{ else }}
6+
FROM gcr.io/deeplearning-platform-release/tf2-cpu.2-6:m78
7+
{{ end }}
8+
# Keep these variables in sync if base image is updated.
9+
ENV TENSORFLOW_VERSION=2.6.0
10+
# Disable pesky logs like: KMP_AFFINITY: pid 6121 tid 6121 thread 0 bound to OS proc set 0
11+
# See: https://stackoverflow.com/questions/57385766/disable-tensorflow-log-information
12+
ENV KMP_WARNINGS=0
913

1014
ADD clean-layer.sh /tmp/clean-layer.sh
1115
ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl
1216
ADD patches/template_conf.json /opt/kaggle/conf.json
1317

18+
{{ if eq .Accelerator "gpu" }}
19+
# b/200968891 Keeps horovod once torch is upgraded.
20+
RUN pip uninstall -y horovod && \
21+
/tmp/clean-layer.sh
22+
{{ end }}
23+
1424
# Use a fixed apt-get repo to stop intermittent failures due to flaky httpredir connections,
1525
# as described by Lionel Chan at http://stackoverflow.com/a/37426929/5881346
1626
RUN sed -i "s/httpredir.debian.org/debian.uchicago.edu/" /etc/apt/sources.list && \
@@ -24,8 +34,6 @@ RUN sed -i "s/httpredir.debian.org/debian.uchicago.edu/" /etc/apt/sources.list &
2434
apt-get install -y openssh-client && \
2535
/tmp/clean-layer.sh
2636

27-
# Make sure the dynamic linker finds the right libstdc++
28-
ENV LD_LIBRARY_PATH=/opt/conda/lib
2937
# b/128333086: Set PROJ_LIB to points to the proj4 cartographic library.
3038
ENV PROJ_LIB=/opt/conda/share/proj
3139

@@ -39,8 +47,71 @@ RUN conda config --add channels nvidia && \
3947
conda install mkl cartopy=0.19 imagemagick=7.1 pyproj==3.1.0 && \
4048
/tmp/clean-layer.sh
4149

50+
{{ if eq .Accelerator "gpu" }}
51+
RUN conda install cudf=21.08 cuml=21.08 cudatoolkit=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION && \
52+
/tmp/clean-layer.sh
53+
{{ end }}
54+
55+
# Install PyTorch
56+
{{ if eq .Accelerator "gpu" }}
57+
RUN pip install torch==1.7.1+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION torchvision==0.8.2+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION torchaudio==0.7.2 torchtext==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html && \
58+
/tmp/clean-layer.sh
59+
{{ else }}
4260
RUN pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 torchtext==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html && \
4361
/tmp/clean-layer.sh
62+
{{ end }}
63+
64+
# Install LightGBM
65+
ENV LIGHTGBM_VERSION=3.2.1
66+
{{ if eq .Accelerator "gpu" }}
67+
# Install OpenCL & libboost (required by LightGBM GPU version)
68+
RUN apt-get install -y ocl-icd-libopencl1 clinfo libboost-all-dev && \
69+
mkdir -p /etc/OpenCL/vendors && \
70+
echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \
71+
cd /usr/local/src && \
72+
git clone --recursive https://github.com/microsoft/LightGBM && \
73+
cd LightGBM && \
74+
git checkout tags/v$LIGHTGBM_VERSION && \
75+
mkdir build && cd build && \
76+
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ .. && \
77+
make -j$(nproc) && \
78+
cd /usr/local/src/LightGBM/python-package && \
79+
python setup.py install --precompile && \
80+
/tmp/clean-layer.sh
81+
{{ else }}
82+
RUN pip install lightgbm==$LIGHTGBM_VERSION && \
83+
/tmp/clean-layer.sh
84+
{{ end }}
85+
86+
# Install JAX
87+
ENV JAX_VERSION=0.2.19
88+
{{ if eq .Accelerator "gpu" }}
89+
RUN pip install jax[cuda$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION]==$JAX_VERSION -f https://storage.googleapis.com/jax-releases/jax_releases.html && \
90+
/tmp/clean-layer.sh
91+
{{ else }}
92+
RUN pip install jax[cpu]==$JAX_VERSION && \
93+
/tmp/clean-layer.sh
94+
{{ end }}
95+
96+
# Install mxnet
97+
{{ if eq .Accelerator "gpu" }}
98+
RUN pip install mxnet-cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION && \
99+
/tmp/clean-layer.sh
100+
{{ else }}
101+
RUN pip install mxnet && \
102+
/tmp/clean-layer.sh
103+
{{ end}}
104+
105+
# Install GPU specific packages
106+
{{ if eq .Accelerator "gpu" }}
107+
# Install GPU-only packages
108+
RUN pip install pycuda && \
109+
pip install pynvrtc && \
110+
# b/190622765 latest version is causing issue. nnabla fixed it in https://github.com/sony/nnabla/issues/892, waiting for new release before we can remove this pin.
111+
pip install pynvml==8.0.4 && \
112+
pip install nnabla-ext-cuda$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION && \
113+
/tmp/clean-layer.sh
114+
{{ end }}
44115

45116
RUN pip install pysal && \
46117
pip install seaborn python-dateutil dask python-igraph && \
@@ -50,12 +121,8 @@ RUN pip install pysal && \
50121
# Use `conda install -c h2oai h2o` once Python 3.7 version is released to conda.
51122
apt-get install -y default-jre-headless && \
52123
pip install -f https://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o && \
53-
/tmp/clean-layer.sh
54-
55-
RUN pip install tensorflow==${TENSORFLOW_VERSION} && \
56-
pip install tensorflow-gcs-config==2.4.0 && \
57-
pip install tensorflow-addons==0.12.1 && \
58-
pip install tensorflow_probability==0.12.2 && \
124+
pip install tensorflow-gcs-config==2.6.0 && \
125+
pip install tensorflow-addons==0.14.0 && \
59126
/tmp/clean-layer.sh
60127

61128
RUN apt-get install -y libfreetype6-dev && \
@@ -65,10 +132,7 @@ RUN apt-get install -y libfreetype6-dev && \
65132
pip install textblob && \
66133
pip install wordcloud && \
67134
pip install xgboost && \
68-
# Pinned to match GPU version. Update version together.
69-
pip install lightgbm==3.2.1 && \
70135
pip install pydot && \
71-
pip install keras-tuner && \
72136
pip install flake8 && \
73137
# Pinned because it breaks theano test with the latest version (b/178107003).
74138
pip install theano-pymc==1.0.11 && \
@@ -99,7 +163,6 @@ RUN apt-get install -y libfreetype6-dev && \
99163
/tmp/clean-layer.sh
100164

101165
RUN pip install ibis-framework && \
102-
pip install mxnet && \
103166
pip install gluonnlp && \
104167
pip install gluoncv && \
105168
/tmp/clean-layer.sh
@@ -384,11 +447,6 @@ RUN pip install flashtext && \
384447
pip install geopandas && \
385448
pip install nnabla && \
386449
pip install vowpalwabbit && \
387-
# papermill can replace nbconvert for executing notebooks
388-
pip install cloud-tpu-client && \
389-
# b/188429515#comment7 tensorflow-cloud >= 0.1.14 installs tensorflow-transform which install apache-beam which downgrades the google.cloud library to 1.x.
390-
pip install tensorflow-cloud==0.1.13 && \
391-
pip install tensorflow-datasets && \
392450
pip install pydub && \
393451
pip install pydegensac && \
394452
# b/198635596 latest versions of torchmetrics & pytorch-lightning are failing at runtime.
@@ -401,8 +459,6 @@ RUN pip install flashtext && \
401459
# pycrypto is used by competitions team.
402460
pip install pycrypto && \
403461
pip install easyocr && \
404-
# Keep JAX version in sync with GPU image.
405-
pip install jax[cpu]==0.2.19 && \
406462
# ipympl adds interactive widget support for matplotlib
407463
pip install ipympl==0.7.0 && \
408464
pip install pandarallel && \

Jenkinsfile

+77-74
Original file line numberDiff line numberDiff line change
@@ -20,46 +20,7 @@ pipeline {
2020
}
2121

2222
stages {
23-
stage('Docker CPU Build') {
24-
options {
25-
timeout(time: 120, unit: 'MINUTES')
26-
}
27-
steps {
28-
sh '''#!/bin/bash
29-
set -exo pipefail
30-
31-
./build | ts
32-
./push ${PRETEST_TAG}
33-
'''
34-
}
35-
}
36-
37-
stage('Test CPU Image') {
38-
options {
39-
timeout(time: 5, unit: 'MINUTES')
40-
}
41-
steps {
42-
sh '''#!/bin/bash
43-
set -exo pipefail
44-
45-
date
46-
docker pull gcr.io/kaggle-images/python:${PRETEST_TAG}
47-
./test --image gcr.io/kaggle-images/python:${PRETEST_TAG}
48-
'''
49-
}
50-
}
51-
52-
stage('Docker GPU Build') {
53-
// A GPU is not required to build this image. However, in our current setup,
54-
// the default runtime is set to nvidia (as opposed to runc) and there
55-
// is no option to specify a runtime for the `docker build` command.
56-
//
57-
// TODO(rosbo) don't set `nvidia` as the default runtime and use the
58-
// `--runtime=nvidia` flag for the `docker run` command when GPU support is needed.
59-
agent { label 'ephemeral-linux-gpu' }
60-
options {
61-
timeout(time: 60, unit: 'MINUTES')
62-
}
23+
stage('Clean Images') {
6324
steps {
6425
sh '''#!/bin/bash
6526
set -exo pipefail
@@ -70,51 +31,93 @@ pipeline {
7031
# will untag the previously built image which is safe to do. Builds for a single branch are performed
7132
# serially.
7233
docker image prune -f
73-
./build --gpu --base-image-tag ${PRETEST_TAG} | ts
74-
./push --gpu ${PRETEST_TAG}
7534
'''
7635
}
7736
}
37+
stage('Build/Test/Diff') {
38+
parallel {
39+
stage('CPU') {
40+
stages {
41+
stage('Build CPU Image') {
42+
options {
43+
timeout(time: 120, unit: 'MINUTES')
44+
}
45+
steps {
46+
sh '''#!/bin/bash
47+
set -exo pipefail
7848
79-
stage('Test GPU Image') {
80-
agent { label 'ephemeral-linux-gpu' }
81-
options {
82-
timeout(time: 20, unit: 'MINUTES')
83-
}
84-
steps {
85-
sh '''#!/bin/bash
86-
set -exo pipefail
87-
88-
date
89-
docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
90-
./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
91-
'''
92-
}
93-
}
49+
./build | ts
50+
./push ${PRETEST_TAG}
51+
'''
52+
}
53+
}
54+
stage('Test CPU Image') {
55+
options {
56+
timeout(time: 5, unit: 'MINUTES')
57+
}
58+
steps {
59+
sh '''#!/bin/bash
60+
set -exo pipefail
9461
95-
stage('Package Versions') {
96-
parallel {
97-
stage('CPU Diff') {
98-
steps {
99-
sh '''#!/bin/bash
100-
set -exo pipefail
62+
date
63+
docker pull gcr.io/kaggle-images/python:${PRETEST_TAG}
64+
./test --image gcr.io/kaggle-images/python:${PRETEST_TAG}
65+
'''
66+
}
67+
}
68+
stage('Diff CPU image') {
69+
steps {
70+
sh '''#!/bin/bash
71+
set -exo pipefail
10172
102-
docker pull gcr.io/kaggle-images/python:${PRETEST_TAG}
103-
./diff --target gcr.io/kaggle-images/python:${PRETEST_TAG}
104-
'''
73+
docker pull gcr.io/kaggle-images/python:${PRETEST_TAG}
74+
./diff --target gcr.io/kaggle-images/python:${PRETEST_TAG}
75+
'''
76+
}
77+
}
10578
}
10679
}
107-
stage('GPU Diff') {
80+
stage('GPU') {
10881
agent { label 'ephemeral-linux-gpu' }
109-
steps {
110-
sh '''#!/bin/bash
111-
set -exo pipefail
82+
stages {
83+
stage('Build GPU Image') {
84+
options {
85+
timeout(time: 120, unit: 'MINUTES')
86+
}
87+
steps {
88+
sh '''#!/bin/bash
89+
set -exo pipefail
90+
./build --gpu | ts
91+
./push --gpu ${PRETEST_TAG}
92+
'''
93+
}
94+
}
95+
stage('Test GPU Image') {
96+
options {
97+
timeout(time: 20, unit: 'MINUTES')
98+
}
99+
steps {
100+
sh '''#!/bin/bash
101+
set -exo pipefail
102+
103+
date
104+
docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
105+
./test --gpu --image gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
106+
'''
107+
}
108+
}
109+
stage('Diff GPU Image') {
110+
steps {
111+
sh '''#!/bin/bash
112+
set -exo pipefail
112113
113-
docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
114-
./diff --gpu --target gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
115-
'''
114+
docker pull gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
115+
./diff --gpu --target gcr.io/kaggle-private-byod/python:${PRETEST_TAG}
116+
'''
117+
}
118+
}
116119
}
117-
}
120+
}
118121
}
119122
}
120123

0 commit comments

Comments
 (0)