Re-add PyTorch benchmarks into PyTorch image. Update CUDA version. #11286

Open: wants to merge 1 commit into base: master
94 changes: 73 additions & 21 deletions images/gpu/pytorch/Dockerfile.x86_64
@@ -1,29 +1,38 @@
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
FROM nvidia/cuda:12.4.0-devel-ubuntu22.04

RUN apt-get update && apt-get install --yes \
python3 \
python3-distutils \
python3-pip \
clang \
wget \
vim \
git
ENV PYTORCH_DATASETS_DIR=/pytorch-data
ENV TORCH_HOME=/pytorch-home
RUN mkdir -p "$TORCH_HOME" && \
mkdir -p "$PYTORCH_DATASETS_DIR"

RUN apt-get update && \
apt-get install --yes \
libgl1-mesa-glx libglib2.0-0 \
pkg-config \
python3 \
python3-distutils \
python3-pip \
clang \
wget \
vim \
git

RUN python3 -m pip install --ignore-installed \
"clang~=$(clang --version | grep -oP 'clang version [.0-9]+' | cut -d' ' -f3)" \
torch \
torchvision \
lightning \
numpy \
memory_profiler
boto3 \
"clang~=$(clang --version | grep -oP 'clang version [.0-9]+' | cut -d' ' -f3)" \
lightning \
matplotlib \
memory_profiler \
numba && \
python3 -m pip install --ignore-installed \
torch \
torchvision \
torchaudio \
numpy \
--index-url https://download.pytorch.org/whl/cu124

ENV PYTORCH_DATASETS_DIR=/pytorch-data
ENV TORCH_HOME=/pytorch-home
COPY download_pytorch_datasets.py /tmp/
# Some PyTorch examples hardcode the data directory to "data", so
# make a symlink for that too.
RUN mkdir "$PYTORCH_DATASETS_DIR" && \
python3 /tmp/download_pytorch_datasets.py && \
RUN python3 /tmp/download_pytorch_datasets.py && \
rm /tmp/download_pytorch_datasets.py

RUN PYTORCH_EXAMPLES_COMMIT=30b310a977a82dbfc3d8e4a820f3b14d876d3bd2 && \
@@ -38,3 +47,46 @@ RUN PYTORCH_EXAMPLES_COMMIT=30b310a977a82dbfc3d8e4a820f3b14d876d3bd2 && \

COPY *.py /
RUN rm /download_pytorch_datasets.py

RUN PYTORCH_BENCHMARKS_COMMIT=675fb8f537d302a4fef3ed2a67349209e65046ac && \
mkdir /pytorch-benchmark && \
cd /pytorch-benchmark && \
git init && \
git remote add origin https://github.com/pytorch/benchmark.git && \
git fetch --depth 1 origin "$PYTORCH_BENCHMARKS_COMMIT" && \
git checkout FETCH_HEAD

# Note that mobilenet_v2 does not have a requirements.txt file.
RUN cd /pytorch-benchmark && \
python3 -m pip install --ignore-installed \
-r requirements.txt \
-r torchbenchmark/models/LearningToPaint/requirements.txt \
-r torchbenchmark/models/fastNLP_Bert/requirements.txt \
-r torchbenchmark/models/hf_BigBird/requirements.txt \
-r torchbenchmark/models/speech_transformer/requirements.txt \
-r torchbenchmark/models/Background_Matting/requirements.txt

# These benchmarks are chosen based on diversity of the type of model and their
# profile with respect to using the GPU and moving data. For more context, see
# this paper: https://arxiv.org/pdf/2304.14226.pdf
RUN cd /pytorch-benchmark && \
python3 install.py \
LearningToPaint \
fastNLP_Bert \
hf_BigBird \
speech_transformer \
mobilenet_v2 \
Background_Matting

# Some of these benchmarks download a dataset at runtime.
# Run them once on CPU just to get this predownloaded into the image.
# Background_Matting will throw a NotImplementedError when running on
# CPU, but this is after having downloaded its dataset, so we run it
# anyway and verify that its output contains NotImplementedError.
RUN cd /pytorch-benchmark && \
python3 run.py LearningToPaint --device cpu && \
python3 run.py fastNLP_Bert --device cpu && \
python3 run.py hf_BigBird --device cpu && \
python3 run.py speech_transformer --device cpu && \
python3 run.py mobilenet_v2 --device cpu && \
( ( python3 run.py Background_Matting --device cpu 2>&1 || true) | grep -q NotImplementedError)
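
For context on that last step: the CPU run of Background_Matting is expected to fail, but only after it has fetched its dataset, so the build treats NotImplementedError in the output as success. Below is a minimal Go sketch of the same check; the path and benchmark name come from the Dockerfile above, everything else is illustrative.

package main

import (
	"bytes"
	"log"
	"os/exec"
)

func main() {
	// Run the benchmark on CPU; the command itself is allowed to fail.
	cmd := exec.Command("python3", "run.py", "Background_Matting", "--device", "cpu")
	cmd.Dir = "/pytorch-benchmark"
	out, _ := cmd.CombinedOutput()
	// The dataset download happens before the failure, so require the
	// expected error text rather than a zero exit status.
	if !bytes.Contains(out, []byte("NotImplementedError")) {
		log.Fatalf("expected NotImplementedError in output, got:\n%s", out)
	}
	log.Print("Background_Matting dataset predownloaded; CPU run failed as expected")
}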
56 changes: 39 additions & 17 deletions test/kubernetes/benchmarks/httpbench/httpbench.go
@@ -105,16 +105,18 @@ type HTTPBenchmark struct {
// Run runs the HTTP-based benchmark.
func (h *HTTPBenchmark) Run(ctx context.Context, t *testing.T) {
t.Helper()
if err := h.Cluster.WaitForServiceReady(ctx, h.Service); err != nil {
serverWaitCtx, serverWaitCancel := context.WithTimeout(ctx, 10*time.Minute)
if err := h.Cluster.WaitForServiceReady(serverWaitCtx, h.Service); err != nil {
t.Fatalf("Failed to wait for service: %v", err)
}
ip := testcluster.GetIPFromService(h.Service)
if ip == "" {
t.Fatalf("did not get valid ip: %s", ip)
}
if err := h.waitForServer(ctx, ip); err != nil {
if err := h.waitForServer(serverWaitCtx, ip); err != nil {
t.Fatalf("Failed to wait for server: %v", err)
}
serverWaitCancel()
for _, round := range h.Rounds {
qpsText := fmt.Sprintf("%d", round.TargetQPS)
if round.TargetQPS == InfiniteQPS {
@@ -146,7 +148,10 @@ func (h *HTTPBenchmark) runRound(ctx context.Context, t *testing.T, round Round,
}
defer h.Cluster.DeletePod(ctx, client)

if err := h.Cluster.WaitForPodCompleted(ctx, client); err != nil {
waitCtx, waitCancel := context.WithTimeout(ctx, round.Duration+2*time.Minute)
err = h.Cluster.WaitForPodCompleted(waitCtx, client)
waitCancel()
if err != nil {
t.Fatalf("failed to wait for wrk2 pod: %v", err)
}
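
The wrk2 wait above is now bounded by the round length plus fixed slack instead of inheriting the caller's whole context. A standalone sketch of that pattern follows; waitBounded and its callback are stand-ins for Cluster.WaitForPodCompleted, and all names here are illustrative.

package main

import (
	"context"
	"fmt"
	"time"
)

// waitBounded derives a deadline from the expected duration of the work plus
// two minutes of slack, then releases the timer as soon as the wait returns.
func waitBounded(ctx context.Context, expected time.Duration, wait func(context.Context) error) error {
	waitCtx, waitCancel := context.WithTimeout(ctx, expected+2*time.Minute)
	err := wait(waitCtx)
	waitCancel()
	if err != nil {
		return fmt.Errorf("did not complete within %v plus slack: %w", expected, err)
	}
	return nil
}

func main() {
	err := waitBounded(context.Background(), 55*time.Second, func(ctx context.Context) error {
		select {
		case <-time.After(time.Second): // simulated pod completion
			return nil
		case <-ctx.Done():
			return ctx.Err()
		}
	})
	fmt.Println(err) // <nil>
}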

@@ -243,21 +248,38 @@ func (h *HTTPBenchmark) getWgetPod(ip string) *v13.Pod {
// waitForServer waits for an HTTP server to start responding on the given
// IP and port.
func (h *HTTPBenchmark) waitForServer(ctx context.Context, ip string) error {
wget, err := h.Cluster.ConfigurePodForClientNodepool(ctx, h.getWgetPod(ip))
if err != nil {
return fmt.Errorf("failed to configure wget pod for client nodepool: %v", err)
}
wget, err = h.Cluster.CreatePod(ctx, wget)
if err != nil {
return fmt.Errorf("failed to create wget pod: %v", err)
}
defer h.Cluster.DeletePod(ctx, wget)
waitCtx, waitCancel := context.WithTimeout(ctx, 1*time.Minute)
defer waitCancel()
if err := h.Cluster.WaitForPodCompleted(waitCtx, wget); err != nil {
return fmt.Errorf("failed to wait for HTTP server %s:%d%s: %v", ip, h.Port, h.Path, err)
lastPhase := v13.PodUnknown
var lastLogs string
for ctx.Err() == nil {
wget, err := h.Cluster.ConfigurePodForClientNodepool(ctx, h.getWgetPod(ip))
if err != nil {
return fmt.Errorf("failed to configure wget pod for client nodepool: %w", err)
}
wget, err = h.Cluster.CreatePod(ctx, wget)
if err != nil {
return fmt.Errorf("failed to create wget pod: %w", err)
}
phase, waitErr := h.Cluster.WaitForPodTerminated(ctx, wget)
lastPhase = phase // Record for the error message below.
if phase != v13.PodSucceeded {
logs, err := h.Cluster.ReadPodLogs(ctx, wget)
if err != nil {
_ = h.Cluster.DeletePod(ctx, wget) // Best-effort delete.
return fmt.Errorf("failed to read wget pod logs: %w", err)
}
lastLogs = logs
}
deleteErr := h.Cluster.DeletePod(ctx, wget)
if waitErr != nil {
return fmt.Errorf("failed to wait for wget pod: %w", waitErr)
}
if deleteErr != nil {
return fmt.Errorf("failed to delete wget pod: %w", deleteErr)
}
if phase == v13.PodSucceeded {
return nil
}
}
return nil
return fmt.Errorf("wget pod still fails after context expiry (last phase: %v; last logs: %q)", lastPhase, lastLogs)
}
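
The rewritten waitForServer keeps creating a short-lived wget pod until one succeeds or the caller's context (capped at 10 minutes in Run above) expires, reporting the last pod phase and logs on failure. Below is a reduced sketch of that retry shape with the pod lifecycle replaced by a generic probe; the probe, interval, and names are illustrative, not part of the PR.

package main

import (
	"context"
	"fmt"
	"time"
)

// pollUntil retries probe until it reports success, returns a hard error, or
// ctx expires. The real code creates, waits on, and deletes a wget pod on
// each iteration instead of calling a local function.
func pollUntil(ctx context.Context, interval time.Duration, probe func(context.Context) (bool, error)) error {
	for ctx.Err() == nil {
		ok, err := probe(ctx)
		if err != nil {
			return err // setup failure: stop retrying
		}
		if ok {
			return nil
		}
		select {
		case <-ctx.Done():
		case <-time.After(interval):
		}
	}
	return fmt.Errorf("server never became ready: %w", ctx.Err())
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	attempts := 0
	err := pollUntil(ctx, 100*time.Millisecond, func(context.Context) (bool, error) {
		attempts++
		return attempts >= 3, nil // succeed on the third probe
	})
	fmt.Println(err, attempts) // <nil> 3
}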

/*
22 changes: 4 additions & 18 deletions test/kubernetes/benchmarks/nginx.go
@@ -33,7 +33,7 @@ import (

const (
nginxPort = 80
nginxBenchmarkDuration = 70 * time.Second
nginxBenchmarkDuration = 55 * time.Second
nginxRequestTimeout = 3 * time.Second
nginxServingDir = "/tmp/html"

@@ -48,9 +48,9 @@ var (
// The test expects that it contains the files to be served at /local,
// and will serve files out of `nginxServingDir`.
nginxCommand = []string{"nginx", "-c", "/etc/nginx/nginx.conf"}
nginxDocKibibytes = []int{1, 10, 100, 10240}
threads = []int{1, 8, 64, 1000}
targetQPS = []int{1, 8, 64, httpbench.InfiniteQPS}
nginxDocKibibytes = []int{1, 10240}
threads = []int{1, 8, 1000}
targetQPS = []int{1, 64, httpbench.InfiniteQPS}
wantPercentiles = []int{50, 95, 99}
)

@@ -212,20 +212,6 @@ func BenchmarkNginx(ctx context.Context, t *testing.T, k8sCtx k8sctx.KubernetesC
benchmark.Run(ctx, t)
})
}
t.Run("HTTP404", func(t *testing.T) {
benchmark := &httpbench.HTTPBenchmark{
Name: fmt.Sprintf("nginx/%s/HTTP404", test.name),
Cluster: cluster,
Namespace: benchmarkNS,
Service: service,
Port: nginxPort,
Path: "/404-this-page-does-not-exist.html",
Rounds: rounds,
Timeout: nginxRequestTimeout,
WantPercentiles: wantPercentiles,
}
benchmark.Run(ctx, t)
})
})
if t.Failed() {
break
2 changes: 1 addition & 1 deletion test/kubernetes/benchmarks/postgresql.go
@@ -46,7 +46,7 @@ const (
)

var (
numConnections = []int{1, 2, 6, 16, 32, 64}
numConnections = []int{1, 2, 12, 64}
)

// BenchmarkPostgresPGBench runs a PostgreSQL pgbench test.
47 changes: 8 additions & 39 deletions test/kubernetes/benchmarks/pytorch.go
@@ -53,19 +53,9 @@ const (
pytorchImage = k8s.ImageRepoPrefix + "gpu/pytorch_x86_64:latest"
)

type pytorchMode string

// pytorchMode is the pytorch mode used, either script mode (jit) or eager mode.
// See: https://towardsdatascience.com/pytorch-jit-and-torchscript-c2a77bac0fff
const (
jit = pytorchMode("jit")
eager = pytorchMode("eager")
)

type pytorchTest struct {
module string
test pytorchTestType
mode pytorchMode
}

// Sets of tests.
@@ -81,12 +71,10 @@ var (
{
module: "fastNLP_Bert",
test: train,
mode: eager,
},
{
module: "fastNLP_Bert",
test: eval,
mode: eager,
},
}

@@ -100,12 +88,10 @@ var (
{
module: "hf_BigBird",
test: train,
mode: eager,
},
{
module: "hf_BigBird",
test: eval,
mode: eager,
},
}

@@ -119,12 +105,10 @@ var (
{
module: "speech_transformer",
test: train,
mode: eager,
},
{
module: "speech_transformer",
test: eval,
mode: eager,
},
}

@@ -138,12 +122,10 @@ var (
{
module: "LearningToPaint",
test: train,
mode: jit,
},
{
module: "LearningToPaint",
test: eval,
mode: jit,
},
}

@@ -156,12 +138,10 @@ var (
{
module: "mobilenet_v2",
test: train,
mode: jit,
},
{
module: "mobilenet_v2",
test: eval,
mode: jit,
},
}

@@ -173,12 +153,10 @@ var (
{
module: "Background_Matting",
test: train,
mode: eager,
},
{
module: "Background_Matting",
test: eval,
mode: eager,
},
}
)
@@ -188,7 +166,7 @@ var (
func (p pytorchTest) Name() string {
// Kubernetes pod names cannot contain "_".
module := strings.ReplaceAll(strings.ToLower(p.module), "_", "-")
return fmt.Sprintf("%s-%s-%s", module, p.test, p.mode)
return fmt.Sprintf("%s-%s", module, p.test)
}

var snakeCase = regexp.MustCompile("_.")
@@ -206,16 +184,7 @@ func (p pytorchTest) BenchName() string {
return strings.ToUpper(strings.TrimPrefix(s, "_"))
})
test := strings.ToUpper(string(p.test)[:1]) + string(p.test[1:])
var mode string
switch p.mode {
case eager:
mode = "Eager"
case jit:
mode = "JIT"
default:
panic(fmt.Sprintf("Unknown mode: %v", p.mode))
}
return fmt.Sprintf("%s/%s/%s", moduleName, test, mode)
return fmt.Sprintf("%s/%s", moduleName, test)
}
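
BenchName turns the snake_case module name into a CamelCase benchmark path segment; with the mode component removed, the result is just Module/Test. A self-contained sketch of that conversion follows; it uses the same regexp approach as the code above but is not the PR's exact code.

package main

import (
	"fmt"
	"regexp"
	"strings"
)

var snakeCase = regexp.MustCompile("_.")

// snakeToCamel upper-cases the first letter and folds "_x" into "X",
// e.g. "speech_transformer" -> "SpeechTransformer".
func snakeToCamel(s string) string {
	s = strings.ToUpper(s[:1]) + s[1:]
	return snakeCase.ReplaceAllStringFunc(s, func(m string) string {
		return strings.ToUpper(strings.TrimPrefix(m, "_"))
	})
}

func main() {
	fmt.Println(snakeToCamel("speech_transformer") + "/Train") // SpeechTransformer/Train
	fmt.Println(snakeToCamel("hf_BigBird") + "/Eval")          // HfBigBird/Eval
}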

func (p pytorchTest) toPod(namespace *testcluster.Namespace, image string) (*v13.Pod, error) {
@@ -235,12 +204,12 @@ func (p pytorchTest) toPod(namespace *testcluster.Namespace, image string) (*v13

func (p pytorchTest) command() []string {
return []string{
"python3",
"run.py",
p.module,
"--device", "cuda",
"--test", string(p.test),
"--mode", string(p.mode),
"sh",
"-c",
strings.Join([]string{
"cd /pytorch-benchmark",
fmt.Sprintf("python3 run.py %s --device cuda --test %s", p.module, p.test),
}, " && "),
}
}
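
Because the benchmark now runs from the /pytorch-benchmark checkout, the pod command wraps the invocation in sh -c so it can cd there first. A sketch of the resulting argv for one module; the module and test names are just examples, and the helper below merely mirrors the method above.

package main

import (
	"fmt"
	"strings"
)

func command(module, test string) []string {
	return []string{
		"sh", "-c",
		strings.Join([]string{
			"cd /pytorch-benchmark",
			fmt.Sprintf("python3 run.py %s --device cuda --test %s", module, test),
		}, " && "),
	}
}

func main() {
	fmt.Printf("%q\n", command("fastNLP_Bert", "eval"))
	// ["sh" "-c" "cd /pytorch-benchmark && python3 run.py fastNLP_Bert --device cuda --test eval"]
}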
