From 8516598640c49cf6857b821b9499d9297ab0d26c Mon Sep 17 00:00:00 2001 From: Etienne Perot Date: Sun, 15 Dec 2024 21:33:08 -0800 Subject: [PATCH] Re-add PyTorch benchmarks into PyTorch image. Update CUDA version. The PyTorch benchmarks were removed in https://github.com/google/gvisor/commit/9304ed401fd9604bc36c0436a1132a8b4e3f0851#diff-fd8d6db82d75e1038ed6136c9930c17d6985ff5d22f2ed9e5e8910661de14228 but the Kubernetes PyTorch benchmarks actually depended on them. PiperOrigin-RevId: 706569220 --- images/gpu/pytorch/Dockerfile.x86_64 | 94 +++++++++++++++++----- test/kubernetes/benchmarks/pytorch.go | 64 +++------------ test/kubernetes/benchmarks/pytorch_test.go | 6 -- 3 files changed, 81 insertions(+), 83 deletions(-) diff --git a/images/gpu/pytorch/Dockerfile.x86_64 b/images/gpu/pytorch/Dockerfile.x86_64 index fed65ef7e5..7890f9f6fb 100644 --- a/images/gpu/pytorch/Dockerfile.x86_64 +++ b/images/gpu/pytorch/Dockerfile.x86_64 @@ -1,29 +1,42 @@ -FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 - -RUN apt-get update && apt-get install --yes \ - python3 \ - python3-distutils \ - python3-pip \ - clang \ - wget \ - vim \ - git - -RUN python3 -m pip install --ignore-installed \ - "clang~=$(clang --version | grep -oP 'clang version [.0-9]+' | cut -d' ' -f3)" \ - torch \ - torchvision \ - lightning \ - numpy \ - memory_profiler +FROM nvidia/cuda:12.4.0-devel-ubuntu22.04 + +# Used for determining the correct pip index URL below. 
+ENV CUDA_VERSION=12.4 ENV PYTORCH_DATASETS_DIR=/pytorch-data ENV TORCH_HOME=/pytorch-home +RUN mkdir -p "$TORCH_HOME" && \ + mkdir -p "$PYTORCH_DATASETS_DIR" + +RUN apt-get update && \ + apt-get install --yes \ + libgl1-mesa-glx libglib2.0-0 \ + pkg-config \ + python3 \ + python3-distutils \ + python3-pip \ + clang \ + wget \ + vim \ + git + +RUN PIP_INDEX_URL="https://download.pytorch.org/whl/cu$(echo "$CUDA_VERSION" | sed 's~\.~~g')" && \ + python3 -m pip install --ignore-installed \ + boto3 \ + "clang~=$(clang --version | grep -oP 'clang version [.0-9]+' | cut -d' ' -f3)" \ + lightning \ + matplotlib \ + memory_profiler \ + numba && \ + python3 -m pip install --ignore-installed \ + torch \ + torchvision \ + torchaudio \ + numpy \ + --index-url "$PIP_INDEX_URL" + COPY download_pytorch_datasets.py /tmp/ -# Some PyTorch examples hardcode the data directory to "data", so -# make a symlink for that too. -RUN mkdir "$PYTORCH_DATASETS_DIR" && \ - python3 /tmp/download_pytorch_datasets.py && \ +RUN python3 /tmp/download_pytorch_datasets.py && \ rm /tmp/download_pytorch_datasets.py RUN PYTORCH_EXAMPLES_COMMIT=30b310a977a82dbfc3d8e4a820f3b14d876d3bd2 && \ @@ -38,3 +51,40 @@ RUN PYTORCH_EXAMPLES_COMMIT=30b310a977a82dbfc3d8e4a820f3b14d876d3bd2 && \ COPY *.py / RUN rm /download_pytorch_datasets.py + +RUN PYTORCH_BENCHMARKS_COMMIT=675fb8f537d302a4fef3ed2a67349209e65046ac && \ + mkdir /pytorch-benchmark && \ + cd /pytorch-benchmark && \ + git init && \ + git remote add origin https://github.com/pytorch/benchmark.git && \ + git fetch --depth 1 origin "$PYTORCH_BENCHMARKS_COMMIT" && \ + git checkout FETCH_HEAD + +# Note that mobilenet_v2 does not have a requirements.txt file. 
+RUN cd /pytorch-benchmark && \ + python3 -m pip install --ignore-installed \ + -r requirements.txt \ + -r torchbenchmark/models/LearningToPaint/requirements.txt \ + -r torchbenchmark/models/fastNLP_Bert/requirements.txt \ + -r torchbenchmark/models/hf_BigBird/requirements.txt \ + -r torchbenchmark/models/speech_transformer/requirements.txt + +# These benchmarks are chosen based on diversity of the type of model and their +# profile with respect to using the GPU and moving data. For more context, see +# this paper: https://arxiv.org/pdf/2304.14226.pdf +RUN cd /pytorch-benchmark && \ + python3 install.py \ + LearningToPaint \ + fastNLP_Bert \ + hf_BigBird \ + speech_transformer \ + mobilenet_v2 + +# Some of these benchmarks download a dataset at runtime. +# Run them once on CPU just to get this predownloaded into the image. +RUN cd /pytorch-benchmark && \ + python3 run.py LearningToPaint --device cpu && \ + python3 run.py fastNLP_Bert --device cpu && \ + python3 run.py hf_BigBird --device cpu && \ + python3 run.py speech_transformer --device cpu && \ + python3 run.py mobilenet_v2 --device cpu diff --git a/test/kubernetes/benchmarks/pytorch.go b/test/kubernetes/benchmarks/pytorch.go index 92fe7e45ef..2ddc55709c 100644 --- a/test/kubernetes/benchmarks/pytorch.go +++ b/test/kubernetes/benchmarks/pytorch.go @@ -53,19 +53,9 @@ const ( pytorchImage = k8s.ImageRepoPrefix + "gpu/pytorch_x86_64:latest" ) -type pytorchMode string - -// pytorchMode is the pytorch mode used, either script mode (jit) or eager mode. -// See: https://towardsdatascience.com/pytorch-jit-and-torchscript-c2a77bac0fff -const ( - jit = pytorchMode("jit") - eager = pytorchMode("eager") -) - type pytorchTest struct { module string test pytorchTestType - mode pytorchMode } // Sets of tests. 
@@ -81,12 +71,10 @@ var ( { module: "fastNLP_Bert", test: train, - mode: eager, }, { module: "fastNLP_Bert", test: eval, - mode: eager, }, } @@ -100,12 +88,10 @@ var ( { module: "hf_BigBird", test: train, - mode: eager, }, { module: "hf_BigBird", test: eval, - mode: eager, }, } @@ -119,12 +105,10 @@ var ( { module: "speech_transformer", test: train, - mode: eager, }, { module: "speech_transformer", test: eval, - mode: eager, }, } @@ -138,12 +122,10 @@ var ( { module: "LearningToPaint", test: train, - mode: jit, }, { module: "LearningToPaint", test: eval, - mode: jit, }, } @@ -156,29 +138,10 @@ var ( { module: "mobilenet_v2", test: train, - mode: jit, }, { module: "mobilenet_v2", test: eval, - mode: jit, - }, - } - - // BackgroundMatting uses the Background_Matting module classified as "Computer Vision: Pattern Recognition". - // BackgroundMatting has a lot of GPU idle time. See Figure 2 on page 5: https://arxiv.org/pdf/2304.14226.pdf - // - // https://github.com/pytorch/benchmark/tree/main/torchbenchmark/models/Background_Matting (see README) - BackgroundMatting = []pytorchTest{ - { - module: "Background_Matting", - test: train, - mode: eager, - }, - { - module: "Background_Matting", - test: eval, - mode: eager, }, } ) @@ -188,7 +151,7 @@ var ( func (p pytorchTest) Name() string { // Kubernetes pod names cannot contain "_". 
module := strings.ReplaceAll(strings.ToLower(p.module), "_", "-") - return fmt.Sprintf("%s-%s-%s", module, p.test, p.mode) + return fmt.Sprintf("%s-%s", module, p.test) } var snakeCase = regexp.MustCompile("_.") @@ -206,16 +169,7 @@ func (p pytorchTest) BenchName() string { return strings.ToUpper(strings.TrimPrefix(s, "_")) }) test := strings.ToUpper(string(p.test)[:1]) + string(p.test[1:]) - var mode string - switch p.mode { - case eager: - mode = "Eager" - case jit: - mode = "JIT" - default: - panic(fmt.Sprintf("Unknown mode: %v", p.mode)) - } - return fmt.Sprintf("%s/%s/%s", moduleName, test, mode) + return fmt.Sprintf("%s/%s", moduleName, test) } func (p pytorchTest) toPod(namespace *testcluster.Namespace, image string) (*v13.Pod, error) { @@ -235,12 +189,12 @@ func (p pytorchTest) toPod(namespace *testcluster.Namespace, image string) (*v13 func (p pytorchTest) command() []string { return []string{ - "python3", - "run.py", - p.module, - "--device", "cuda", - "--test", string(p.test), - "--mode", string(p.mode), + "sh", + "-c", + strings.Join([]string{ + "cd /pytorch-benchmark", + fmt.Sprintf("python3 run.py %s --device cuda --test %s", p.module, p.test), + }, " && "), } } @@ -350,7 +304,7 @@ func parseStandardOutput(output string) ([]benchmetric.MetricValue, error) { }, nil } -var gpuTimeRegex = regexp.MustCompile(`GPU\sTime:\s*(\d+\.\d+)\smilliseconds`) +var gpuTimeRegex = regexp.MustCompile(`GPU\sTime\sper\sbatch:\s*(\d+\.\d+)\smilliseconds`) func parseGPUTime(output string) (float64, error) { match := gpuTimeRegex.FindStringSubmatch(output) diff --git a/test/kubernetes/benchmarks/pytorch_test.go b/test/kubernetes/benchmarks/pytorch_test.go index ea43ab323b..d1787aba7b 100644 --- a/test/kubernetes/benchmarks/pytorch_test.go +++ b/test/kubernetes/benchmarks/pytorch_test.go @@ -47,11 +47,6 @@ func TestMobileNetV2(t *testing.T) { runTests(ctx, t, MobileNetV2) } -func TestBackgroundMatting(t *testing.T) { - ctx := context.Background() - runTests(ctx, t, 
BackgroundMatting) -} - func runTests(ctx context.Context, t *testing.T, tests []pytorchTest) { k8sCtx, err := k8sctx.Context(ctx) if err != nil { @@ -72,6 +67,5 @@ func TestMain(m *testing.M) { "TestSpeechTransformer": TestSpeechTransformer, "TestLearningToPaint": TestLearningToPaint, "TestMobileNetV2": TestMobileNetV2, - "TestBackgroundMatting": TestBackgroundMatting, }) }