Re-add PyTorch benchmarks into PyTorch image. Update CUDA version. #11286

Open: wants to merge 1 commit into base: master
94 changes: 73 additions & 21 deletions images/gpu/pytorch/Dockerfile.x86_64
@@ -1,29 +1,38 @@
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
FROM nvidia/cuda:12.4.0-devel-ubuntu22.04

RUN apt-get update && apt-get install --yes \
python3 \
python3-distutils \
python3-pip \
clang \
wget \
vim \
git
ENV PYTORCH_DATASETS_DIR=/pytorch-data
ENV TORCH_HOME=/pytorch-home
RUN mkdir -p "$TORCH_HOME" && \
mkdir -p "$PYTORCH_DATASETS_DIR"

RUN apt-get update && \
apt-get install --yes \
libgl1-mesa-glx libglib2.0-0 \
pkg-config \
python3 \
python3-distutils \
python3-pip \
clang \
wget \
vim \
git

RUN python3 -m pip install --ignore-installed \
"clang~=$(clang --version | grep -oP 'clang version [.0-9]+' | cut -d' ' -f3)" \
torch \
torchvision \
lightning \
numpy \
memory_profiler
boto3 \
"clang~=$(clang --version | grep -oP 'clang version [.0-9]+' | cut -d' ' -f3)" \
lightning \
matplotlib \
memory_profiler \
numba && \
python3 -m pip install --ignore-installed \
torch \
torchvision \
torchaudio \
numpy \
--index-url https://download.pytorch.org/whl/cu124

ENV PYTORCH_DATASETS_DIR=/pytorch-data
ENV TORCH_HOME=/pytorch-home
COPY download_pytorch_datasets.py /tmp/
# Some PyTorch examples hardcode the data directory to "data", so
# make a symlink for that too.
RUN mkdir "$PYTORCH_DATASETS_DIR" && \
python3 /tmp/download_pytorch_datasets.py && \
RUN python3 /tmp/download_pytorch_datasets.py && \
rm /tmp/download_pytorch_datasets.py

RUN PYTORCH_EXAMPLES_COMMIT=30b310a977a82dbfc3d8e4a820f3b14d876d3bd2 && \
@@ -38,3 +47,46 @@ RUN PYTORCH_EXAMPLES_COMMIT=30b310a977a82dbfc3d8e4a820f3b14d876d3bd2 && \

COPY *.py /
RUN rm /download_pytorch_datasets.py

RUN PYTORCH_BENCHMARKS_COMMIT=675fb8f537d302a4fef3ed2a67349209e65046ac && \
mkdir /pytorch-benchmark && \
cd /pytorch-benchmark && \
git init && \
git remote add origin https://github.com/pytorch/benchmark.git && \
git fetch --depth 1 origin "$PYTORCH_BENCHMARKS_COMMIT" && \
git checkout FETCH_HEAD

# Note that mobilenet_v2 does not have a requirements.txt file.
RUN cd /pytorch-benchmark && \
python3 -m pip install --ignore-installed \
-r requirements.txt \
-r torchbenchmark/models/LearningToPaint/requirements.txt \
-r torchbenchmark/models/fastNLP_Bert/requirements.txt \
-r torchbenchmark/models/hf_BigBird/requirements.txt \
-r torchbenchmark/models/speech_transformer/requirements.txt \
-r torchbenchmark/models/Background_Matting/requirements.txt

# These benchmarks are chosen based on diversity of the type of model and their
# profile with respect to using the GPU and moving data. For more context, see
# this paper: https://arxiv.org/pdf/2304.14226.pdf
RUN cd /pytorch-benchmark && \
python3 install.py \
LearningToPaint \
fastNLP_Bert \
hf_BigBird \
speech_transformer \
mobilenet_v2 \
Background_Matting

# Some of these benchmarks download a dataset at runtime.
# Run them once on CPU just to get this predownloaded into the image.
# Background_Matting will throw a NotImplementedError when running on
# CPU, but this is after having downloaded its dataset, so we run it
# anyway and verify that its output contains NotImplementedError.
RUN cd /pytorch-benchmark && \
python3 run.py LearningToPaint --device cpu && \
python3 run.py fastNLP_Bert --device cpu && \
python3 run.py hf_BigBird --device cpu && \
python3 run.py speech_transformer --device cpu && \
python3 run.py mobilenet_v2 --device cpu && \
( ( python3 run.py Background_Matting --device cpu 2>&1 || true) | grep -q NotImplementedError)
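
For context on that last step: the CPU run of Background_Matting is expected to fail, but only after it has fetched its dataset, so the build treats NotImplementedError in the output as success. Below is a minimal Go sketch of the same check; the path and benchmark name come from the Dockerfile above, everything else is illustrative.

package main

import (
	"bytes"
	"log"
	"os/exec"
)

func main() {
	// Run the benchmark on CPU; the command itself is allowed to fail.
	cmd := exec.Command("python3", "run.py", "Background_Matting", "--device", "cpu")
	cmd.Dir = "/pytorch-benchmark"
	out, _ := cmd.CombinedOutput()
	// The dataset download happens before the failure, so require the
	// expected error text rather than a zero exit status.
	if !bytes.Contains(out, []byte("NotImplementedError")) {
		log.Fatalf("expected NotImplementedError in output, got:\n%s", out)
	}
	log.Print("Background_Matting dataset predownloaded; CPU run failed as expected")
}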
56 changes: 39 additions & 17 deletions test/kubernetes/benchmarks/httpbench/httpbench.go
@@ -105,16 +105,18 @@ type HTTPBenchmark struct {
// Run runs the HTTP-based benchmark.
func (h *HTTPBenchmark) Run(ctx context.Context, t *testing.T) {
t.Helper()
if err := h.Cluster.WaitForServiceReady(ctx, h.Service); err != nil {
serverWaitCtx, serverWaitCancel := context.WithTimeout(ctx, 10*time.Minute)
if err := h.Cluster.WaitForServiceReady(serverWaitCtx, h.Service); err != nil {
t.Fatalf("Failed to wait for service: %v", err)
}
ip := testcluster.GetIPFromService(h.Service)
if ip == "" {
t.Fatalf("did not get valid ip: %s", ip)
}
if err := h.waitForServer(ctx, ip); err != nil {
if err := h.waitForServer(serverWaitCtx, ip); err != nil {
t.Fatalf("Failed to wait for server: %v", err)
}
serverWaitCancel()
for _, round := range h.Rounds {
qpsText := fmt.Sprintf("%d", round.TargetQPS)
if round.TargetQPS == InfiniteQPS {
@@ -146,7 +148,10 @@ func (h *HTTPBenchmark) runRound(ctx context.Context, t *testing.T, round Round,
}
defer h.Cluster.DeletePod(ctx, client)

if err := h.Cluster.WaitForPodCompleted(ctx, client); err != nil {
waitCtx, waitCancel := context.WithTimeout(ctx, round.Duration+2*time.Minute)
err = h.Cluster.WaitForPodCompleted(waitCtx, client)
waitCancel()
if err != nil {
t.Fatalf("failed to wait for wrk2 pod: %v", err)
}
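
The wrk2 wait above is now bounded by the round length plus fixed slack instead of inheriting the caller's whole context. A standalone sketch of that pattern follows; waitBounded and its callback are stand-ins for Cluster.WaitForPodCompleted, and all names here are illustrative.

package main

import (
	"context"
	"fmt"
	"time"
)

// waitBounded derives a deadline from the expected duration of the work plus
// two minutes of slack, then releases the timer as soon as the wait returns.
func waitBounded(ctx context.Context, expected time.Duration, wait func(context.Context) error) error {
	waitCtx, waitCancel := context.WithTimeout(ctx, expected+2*time.Minute)
	err := wait(waitCtx)
	waitCancel()
	if err != nil {
		return fmt.Errorf("did not complete within %v plus slack: %w", expected, err)
	}
	return nil
}

func main() {
	err := waitBounded(context.Background(), 55*time.Second, func(ctx context.Context) error {
		select {
		case <-time.After(time.Second): // simulated pod completion
			return nil
		case <-ctx.Done():
			return ctx.Err()
		}
	})
	fmt.Println(err) // <nil>
}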

@@ -243,21 +248,38 @@ func (h *HTTPBenchmark) getWgetPod(ip string) *v13.Pod {
// waitForServer waits for an HTTP server to start responding on the given
// IP and port.
func (h *HTTPBenchmark) waitForServer(ctx context.Context, ip string) error {
wget, err := h.Cluster.ConfigurePodForClientNodepool(ctx, h.getWgetPod(ip))
if err != nil {
return fmt.Errorf("failed to configure wget pod for client nodepool: %v", err)
}
wget, err = h.Cluster.CreatePod(ctx, wget)
if err != nil {
return fmt.Errorf("failed to create wget pod: %v", err)
}
defer h.Cluster.DeletePod(ctx, wget)
waitCtx, waitCancel := context.WithTimeout(ctx, 1*time.Minute)
defer waitCancel()
if err := h.Cluster.WaitForPodCompleted(waitCtx, wget); err != nil {
return fmt.Errorf("failed to wait for HTTP server %s:%d%s: %v", ip, h.Port, h.Path, err)
lastPhase := v13.PodUnknown
var lastLogs string
for ctx.Err() == nil {
wget, err := h.Cluster.ConfigurePodForClientNodepool(ctx, h.getWgetPod(ip))
if err != nil {
return fmt.Errorf("failed to configure wget pod for client nodepool: %w", err)
}
wget, err = h.Cluster.CreatePod(ctx, wget)
if err != nil {
return fmt.Errorf("failed to create wget pod: %w", err)
}
phase, waitErr := h.Cluster.WaitForPodTerminated(ctx, wget)
lastPhase = phase // Record for the error message below.
if phase != v13.PodSucceeded {
logs, err := h.Cluster.ReadPodLogs(ctx, wget)
if err != nil {
_ = h.Cluster.DeletePod(ctx, wget) // Best-effort delete.
return fmt.Errorf("failed to read wget pod logs: %w", err)
}
lastLogs = logs
}
deleteErr := h.Cluster.DeletePod(ctx, wget)
if waitErr != nil {
return fmt.Errorf("failed to wait for wget pod: %w", waitErr)
}
if deleteErr != nil {
return fmt.Errorf("failed to delete wget pod: %w", deleteErr)
}
if phase == v13.PodSucceeded {
return nil
}
}
return nil
return fmt.Errorf("wget pod still fails after context expiry (last phase: %v; last logs: %q)", lastPhase, lastLogs)
}
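
The rewritten waitForServer keeps creating a short-lived wget pod until one succeeds or the caller's context (capped at 10 minutes in Run above) expires, reporting the last pod phase and logs on failure. Below is a reduced sketch of that retry shape with the pod lifecycle replaced by a generic probe; the probe, interval, and names are illustrative, not part of the PR.

package main

import (
	"context"
	"fmt"
	"time"
)

// pollUntil retries probe until it reports success, returns a hard error, or
// ctx expires. The real code creates, waits on, and deletes a wget pod on
// each iteration instead of calling a local function.
func pollUntil(ctx context.Context, interval time.Duration, probe func(context.Context) (bool, error)) error {
	for ctx.Err() == nil {
		ok, err := probe(ctx)
		if err != nil {
			return err // setup failure: stop retrying
		}
		if ok {
			return nil
		}
		select {
		case <-ctx.Done():
		case <-time.After(interval):
		}
	}
	return fmt.Errorf("server never became ready: %w", ctx.Err())
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	attempts := 0
	err := pollUntil(ctx, 100*time.Millisecond, func(context.Context) (bool, error) {
		attempts++
		return attempts >= 3, nil // succeed on the third probe
	})
	fmt.Println(err, attempts) // <nil> 3
}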

/*
22 changes: 4 additions & 18 deletions test/kubernetes/benchmarks/nginx.go
@@ -33,7 +33,7 @@ import (

const (
nginxPort = 80
nginxBenchmarkDuration = 70 * time.Second
nginxBenchmarkDuration = 55 * time.Second
nginxRequestTimeout = 3 * time.Second
nginxServingDir = "/tmp/html"

@@ -48,9 +48,9 @@ var (
// The test expects that it contains the files to be served at /local,
// and will serve files out of `nginxServingDir`.
nginxCommand = []string{"nginx", "-c", "/etc/nginx/nginx.conf"}
nginxDocKibibytes = []int{1, 10, 100, 10240}
threads = []int{1, 8, 64, 1000}
targetQPS = []int{1, 8, 64, httpbench.InfiniteQPS}
nginxDocKibibytes = []int{1, 10240}
threads = []int{1, 8, 1000}
targetQPS = []int{1, 64, httpbench.InfiniteQPS}
wantPercentiles = []int{50, 95, 99}
)

@@ -212,20 +212,6 @@ func BenchmarkNginx(ctx context.Context, t *testing.T, k8sCtx k8sctx.KubernetesC
benchmark.Run(ctx, t)
})
}
t.Run("HTTP404", func(t *testing.T) {
benchmark := &httpbench.HTTPBenchmark{
Name: fmt.Sprintf("nginx/%s/HTTP404", test.name),
Cluster: cluster,
Namespace: benchmarkNS,
Service: service,
Port: nginxPort,
Path: "/404-this-page-does-not-exist.html",
Rounds: rounds,
Timeout: nginxRequestTimeout,
WantPercentiles: wantPercentiles,
}
benchmark.Run(ctx, t)
})
})
if t.Failed() {
break
2 changes: 1 addition & 1 deletion test/kubernetes/benchmarks/postgresql.go
@@ -46,7 +46,7 @@ const (
)

var (
numConnections = []int{1, 2, 6, 16, 32, 64}
numConnections = []int{1, 2, 12, 64}
)

// BenchmarkPostgresPGBench runs a PostgreSQL pgbench test.
47 changes: 8 additions & 39 deletions test/kubernetes/benchmarks/pytorch.go
@@ -53,19 +53,9 @@ const (
pytorchImage = k8s.ImageRepoPrefix + "gpu/pytorch_x86_64:latest"
)

type pytorchMode string

// pytorchMode is the pytorch mode used, either script mode (jit) or eager mode.
// See: https://towardsdatascience.com/pytorch-jit-and-torchscript-c2a77bac0fff
const (
jit = pytorchMode("jit")
eager = pytorchMode("eager")
)

type pytorchTest struct {
module string
test pytorchTestType
mode pytorchMode
}

// Sets of tests.
@@ -81,12 +71,10 @@ var (
{
module: "fastNLP_Bert",
test: train,
mode: eager,
},
{
module: "fastNLP_Bert",
test: eval,
mode: eager,
},
}

@@ -100,12 +88,10 @@ var (
{
module: "hf_BigBird",
test: train,
mode: eager,
},
{
module: "hf_BigBird",
test: eval,
mode: eager,
},
}

@@ -119,12 +105,10 @@ var (
{
module: "speech_transformer",
test: train,
mode: eager,
},
{
module: "speech_transformer",
test: eval,
mode: eager,
},
}

@@ -138,12 +122,10 @@ var (
{
module: "LearningToPaint",
test: train,
mode: jit,
},
{
module: "LearningToPaint",
test: eval,
mode: jit,
},
}

@@ -156,12 +138,10 @@ var (
{
module: "mobilenet_v2",
test: train,
mode: jit,
},
{
module: "mobilenet_v2",
test: eval,
mode: jit,
},
}

@@ -173,12 +153,10 @@ var (
{
module: "Background_Matting",
test: train,
mode: eager,
},
{
module: "Background_Matting",
test: eval,
mode: eager,
},
}
)
@@ -188,7 +166,7 @@ var (
func (p pytorchTest) Name() string {
// Kubernetes pod names cannot contain "_".
module := strings.ReplaceAll(strings.ToLower(p.module), "_", "-")
return fmt.Sprintf("%s-%s-%s", module, p.test, p.mode)
return fmt.Sprintf("%s-%s", module, p.test)
}

var snakeCase = regexp.MustCompile("_.")
@@ -206,16 +184,7 @@ func (p pytorchTest) BenchName() string {
return strings.ToUpper(strings.TrimPrefix(s, "_"))
})
test := strings.ToUpper(string(p.test)[:1]) + string(p.test[1:])
var mode string
switch p.mode {
case eager:
mode = "Eager"
case jit:
mode = "JIT"
default:
panic(fmt.Sprintf("Unknown mode: %v", p.mode))
}
return fmt.Sprintf("%s/%s/%s", moduleName, test, mode)
return fmt.Sprintf("%s/%s", moduleName, test)
}
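
BenchName turns the snake_case module name into a CamelCase benchmark path segment; with the mode component removed, the result is just Module/Test. A self-contained sketch of that conversion follows; it uses the same regexp approach as the code above but is not the PR's exact code.

package main

import (
	"fmt"
	"regexp"
	"strings"
)

var snakeCase = regexp.MustCompile("_.")

// snakeToCamel upper-cases the first letter and folds "_x" into "X",
// e.g. "speech_transformer" -> "SpeechTransformer".
func snakeToCamel(s string) string {
	s = strings.ToUpper(s[:1]) + s[1:]
	return snakeCase.ReplaceAllStringFunc(s, func(m string) string {
		return strings.ToUpper(strings.TrimPrefix(m, "_"))
	})
}

func main() {
	fmt.Println(snakeToCamel("speech_transformer") + "/Train") // SpeechTransformer/Train
	fmt.Println(snakeToCamel("hf_BigBird") + "/Eval")          // HfBigBird/Eval
}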

func (p pytorchTest) toPod(namespace *testcluster.Namespace, image string) (*v13.Pod, error) {
@@ -235,12 +204,12 @@ func (p pytorchTest) toPod(namespace *testcluster.Namespace, image string) (*v13

func (p pytorchTest) command() []string {
return []string{
"python3",
"run.py",
p.module,
"--device", "cuda",
"--test", string(p.test),
"--mode", string(p.mode),
"sh",
"-c",
strings.Join([]string{
"cd /pytorch-benchmark",
fmt.Sprintf("python3 run.py %s --device cuda --test %s", p.module, p.test),
}, " && "),
}
}
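
Because the benchmark now runs from the /pytorch-benchmark checkout, the pod command wraps the invocation in sh -c so it can cd there first. A sketch of the resulting argv for one module; the module and test names are just examples, and the helper below merely mirrors the method above.

package main

import (
	"fmt"
	"strings"
)

func command(module, test string) []string {
	return []string{
		"sh", "-c",
		strings.Join([]string{
			"cd /pytorch-benchmark",
			fmt.Sprintf("python3 run.py %s --device cuda --test %s", module, test),
		}, " && "),
	}
}

func main() {
	fmt.Printf("%q\n", command("fastNLP_Bert", "eval"))
	// ["sh" "-c" "cd /pytorch-benchmark && python3 run.py fastNLP_Bert --device cuda --test eval"]
}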
