3 changes: 2 additions & 1 deletion Makefile
@@ -313,12 +313,13 @@ cos-gpu-smoke-tests: gpu-smoke-images $(RUNTIME_BIN)
gpu-images: gpu-smoke-images load-gpu_pytorch load-gpu_ollama load-gpu_ollama_client load-basic_busybox load-basic_alpine load-basic_python load-gpu_stable-diffusion-xl load-gpu_vllm load-gpu_nccl-tests load-benchmarks_ffmpeg
.PHONY: gpu-images

l4-gpu-images: load-gpu_sglang load-gpu_sglang_client
l4-gpu-images: load-gpu_sglang load-gpu_sglang_client load-gpu_triton load-gpu_triton_client
.PHONY: l4-gpu-images

l4-gpu-tests: l4-gpu-images $(RUNTIME_BIN)
@$(call install_runtime,$(RUNTIME),--nvproxy=true --nvproxy-docker=true --nvproxy-allowed-driver-capabilities=all)
@$(call sudo,test/gpu:sglang_test,--runtime=$(RUNTIME) -test.v $(ARGS))
@$(call sudo,test/gpu:triton_test,--runtime=$(RUNTIME) -test.v $(ARGS))
.PHONY: l4-gpu-tests

gpu-all-tests: gpu-images gpu-smoke-tests $(RUNTIME_BIN)
79 changes: 79 additions & 0 deletions images/gpu/triton/Dockerfile.x86_64
@@ -0,0 +1,79 @@
# --- Downloader Stage ---
# Fetches model/tokenizer assets from GCS
FROM google/cloud-sdk:541.0.0-slim AS downloader
RUN gcloud config set auth/disable_credentials true
RUN gsutil -m cp -r gs://gvisor/tests/models/llama-2-7b-chat-hf /
RUN mkdir -p /engines
RUN gsutil -m cp -r gs://gvisor/tests/l4/engines/llama-2-7b-chat-hf /engines/

# --- Builder Stage for TensorRT-LLM ---
# This stage uses 'git sparse-checkout' to download *only* the
# files we need, which is much faster than a full 'git clone'.
FROM nvcr.io/nvidia/tritonserver:25.08-trtllm-python-py3 AS trtllm_builder

WORKDIR /

# 1. Make a blob-less clone with no checkout. This is very fast.
RUN git clone --filter=blob:none --no-checkout --depth 1 \
https://github.com/NVIDIA/TensorRT-LLM.git /TensorRT-LLM
WORKDIR /TensorRT-LLM

# 2. Set up sparse checkout to define *only* the paths we need
RUN git sparse-checkout init --cone && \
git sparse-checkout set \
"triton_backend/all_models/inflight_batcher_llm/" \
"triton_backend/tools/"

# 3. Check out the commit pinned to the v1.2.0rc1 tag.
# This downloads *only* the files in the two directories above.
RUN git checkout 796891ba2a6959bad58c0da9645416c7264349e9

# --- Final Stage ---
# This is the final runtime image. The COPY commands below work unchanged
# because the builder stage produced the identical paths.
FROM nvcr.io/nvidia/tritonserver:25.08-trtllm-python-py3

# --- Build Arguments ---
ARG TOKENIZER_DIR=/llama-2-7b-chat-hf
ARG ENGINE_DIR=/engines/llama-2-7b-chat-hf/fp8/1-gpu
ARG MAX_BATCH_SIZE=1
ARG INSTANCE_COUNT=1
ARG TOKENIZER_TYPE=auto
ARG DECOUPLED_MODE=true
ARG MODEL_FOLDER=/models/
# Queue delay is expressed in microseconds (10000 us = 10 ms).
ARG MAX_QUEUE_DELAY_MICROSECONDS=10000
ARG TRITON_BACKEND=tensorrtllm
ARG LOGITS_DATATYPE="TYPE_FP32"
ARG FILL_TEMPLATE_SCRIPT=/TensorRT-LLM/triton_backend/tools/fill_template.py

# --- Asset Copying ---

# Copy only the tokenizer (needed for config)
COPY --from=downloader ${TOKENIZER_DIR} ${TOKENIZER_DIR}

# Copy *only* the model templates from the trtllm_builder stage
COPY --from=trtllm_builder /TensorRT-LLM/triton_backend/all_models/inflight_batcher_llm ${MODEL_FOLDER}

# Copy *only* the build script we need from the trtllm_builder stage
COPY --from=trtllm_builder ${FILL_TEMPLATE_SCRIPT} /usr/local/bin/fill_template.py
# Point FILL_TEMPLATE_SCRIPT at the script's new location.
ARG FILL_TEMPLATE_SCRIPT=/usr/local/bin/fill_template.py

# Copy *only* the specific engine directory we need, directly
# from the downloader into the final model repository path.
COPY --from=downloader ${ENGINE_DIR} ${MODEL_FOLDER}/tensorrt_llm/1/

# --- Model Configuration ---
# Run the template-filling commands to generate the Triton model configs
RUN python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/preprocessing/config.pbtxt \
tokenizer_dir:${TOKENIZER_DIR},triton_max_batch_size:${MAX_BATCH_SIZE},preprocessing_instance_count:${INSTANCE_COUNT} && \
python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/postprocessing/config.pbtxt \
tokenizer_dir:${TOKENIZER_DIR},triton_max_batch_size:${MAX_BATCH_SIZE},postprocessing_instance_count:${INSTANCE_COUNT} && \
python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/tensorrt_llm_bls/config.pbtxt \
prompt_embedding_table_data_type:TYPE_FP16,triton_max_batch_size:${MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},bls_instance_count:${INSTANCE_COUNT},logits_datatype:${LOGITS_DATATYPE} && \
python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/ensemble/config.pbtxt \
triton_max_batch_size:${MAX_BATCH_SIZE},logits_datatype:${LOGITS_DATATYPE} && \
python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/tensorrt_llm/config.pbtxt \
prompt_embedding_table_data_type:TYPE_FP16,triton_backend:${TRITON_BACKEND},triton_max_batch_size:${MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},engine_dir:${MODEL_FOLDER}/tensorrt_llm/1,max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},batching_strategy:inflight_fused_batching,encoder_input_features_data_type:TYPE_FP16,logits_datatype:${LOGITS_DATATYPE}

CMD ["tritonserver", "--model-repository=/models/"]
11 changes: 11 additions & 0 deletions images/gpu/triton/client/BUILD
@@ -0,0 +1,11 @@
load("//tools:defs.bzl", "go_binary")

package(
default_applicable_licenses = ["//:license"],
licenses = ["notice"],
)

go_binary(
name = "client",
srcs = ["client.go"],
)
8 changes: 8 additions & 0 deletions images/gpu/triton/client/Dockerfile
@@ -0,0 +1,8 @@
FROM golang:1.22 AS builder

COPY client.go /client.go
RUN CGO_ENABLED=0 go build -o /httpclient /client.go

FROM alpine:latest
COPY --from=builder /httpclient /usr/bin/
CMD ["/usr/bin/httpclient"]
155 changes: 155 additions & 0 deletions images/gpu/triton/client/client.go
@@ -0,0 +1,155 @@
// Copyright 2025 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// A simple `curl`-like HTTP client that prints metrics after the request.
// All of its output is structured to be unambiguous even if stdout/stderr
// is combined, as is the case for Kubernetes logs.
// Useful for communicating with the Triton inference server.
package main

import (
"bufio"
"bytes"
"encoding/base64"
"encoding/json"
"flag"
"fmt"
"net/http"
"os"
"sort"
"strings"
"time"
)

// LINT.IfChange

// Flags.
var (
url = flag.String("url", "", "HTTP request URL.")
method = flag.String("method", "GET", "HTTP request method (GET or POST).")
postDataBase64 = flag.String("post_base64", "", "HTTP request POST data in base64 format; ignored for GET requests.")
timeout = flag.Duration("timeout", 0, "HTTP request timeout; 0 for no timeout.")
)

// bufSize is the size of buffers used for HTTP requests and responses.
const bufSize = 1024 * 1024 // 1MiB

// fatalf crashes the program with a given error message.
func fatalf(format string, values ...any) {
fmt.Fprintf(os.Stderr, "FATAL: "+format+"\n", values...)
os.Exit(1)
}

// Metrics contains the request metrics to export to JSON.
// This is parsed by the GPU test code that drives this client (the
// test/gpu:triton_test target added in the Makefile).
type Metrics struct {
// ProgramStarted is the time when the program started.
ProgramStarted time.Time `json:"program_started"`
// RequestSent is the time when the HTTP request was sent.
RequestSent time.Time `json:"request_sent"`
// ResponseReceived is the time when the HTTP response headers were received.
ResponseReceived time.Time `json:"response_received"`
// FirstByteRead is the time when the first HTTP response body byte was read.
FirstByteRead time.Time `json:"first_byte_read"`
// LastByteRead is the time when the last HTTP response body byte was read.
LastByteRead time.Time `json:"last_byte_read"`
}

func main() {
var metrics Metrics
metrics.ProgramStarted = time.Now()
flag.Parse()
if *url == "" {
fatalf("--url is required")
}
client := http.Client{
Transport: &http.Transport{
MaxIdleConns: 1,
IdleConnTimeout: *timeout,
ReadBufferSize: bufSize,
WriteBufferSize: bufSize,
},
Timeout: *timeout,
}
var request *http.Request
var err error
switch *method {
case "GET":
request, err = http.NewRequest("GET", *url, nil)
case "POST":
postData, postDataErr := base64.StdEncoding.DecodeString(*postDataBase64)
if postDataErr != nil {
fatalf("cannot decode POST data: %v", postDataErr)
}
request, err = http.NewRequest("POST", *url, bytes.NewBuffer(postData))
default:
err = fmt.Errorf("unknown method %q", *method)
}
if err != nil {
fatalf("cannot create request: %v", err)
}
orderedReqHeaders := make([]string, 0, len(request.Header))
for k := range request.Header {
orderedReqHeaders = append(orderedReqHeaders, k)
}
sort.Strings(orderedReqHeaders)
for _, k := range orderedReqHeaders {
for _, v := range request.Header[k] {
fmt.Fprintf(os.Stderr, "REQHEADER: %s: %s\n", k, v)
}
}
metrics.RequestSent = time.Now()
resp, err := client.Do(request)
metrics.ResponseReceived = time.Now()
if err != nil {
fatalf("cannot make request: %v", err)
}
gotFirstByte := false
scanner := bufio.NewScanner(resp.Body)
for scanner.Scan() {
if !gotFirstByte {
metrics.FirstByteRead = time.Now()
gotFirstByte = true
}
if scanner.Text() == "" {
continue
}
fmt.Printf("BODY: %q\n", strings.TrimPrefix(scanner.Text(), "data: "))
}
// Check for any errors that may have occurred during scanning
if err := scanner.Err(); err != nil {
fatalf("error reading response body: %v", err)
}
metrics.LastByteRead = time.Now()
if err := resp.Body.Close(); err != nil {
fatalf("cannot close response body: %v", err)
}
orderedRespHeaders := make([]string, 0, len(resp.Header))
for k := range resp.Header {
orderedRespHeaders = append(orderedRespHeaders, k)
}
sort.Strings(orderedRespHeaders)
for _, k := range orderedRespHeaders {
for _, v := range resp.Header[k] {
fmt.Fprintf(os.Stderr, "RESPHEADER: %s: %s\n", k, v)
}
}
metricsBytes, err := json.Marshal(&metrics)
if err != nil {
fatalf("cannot marshal metrics: %v", err)
}
fmt.Fprintf(os.Stderr, "STATS: %s\n", string(metricsBytes))
}

// LINT.ThenChange(../../ollama/client/client.go)
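Since the STATS line above is a single JSON object, the test code that drives this client can recover it with a simple prefix scan over the combined output. The sketch below is illustrative only and is not the actual gVisor test code; the struct mirrors the Metrics type defined above.

// parse_stats.go: illustrative sketch of recovering the Metrics JSON from the
// client's combined output; not the actual gVisor test code.
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"os"
	"strings"
	"time"
)

// metrics mirrors the Metrics struct emitted by the client above.
type metrics struct {
	ProgramStarted   time.Time `json:"program_started"`
	RequestSent      time.Time `json:"request_sent"`
	ResponseReceived time.Time `json:"response_received"`
	FirstByteRead    time.Time `json:"first_byte_read"`
	LastByteRead     time.Time `json:"last_byte_read"`
}

func main() {
	// Read the client's log lines from stdin and pick out the STATS line.
	scanner := bufio.NewScanner(os.Stdin)
	for scanner.Scan() {
		line := scanner.Text()
		if !strings.HasPrefix(line, "STATS: ") {
			continue
		}
		var m metrics
		if err := json.Unmarshal([]byte(strings.TrimPrefix(line, "STATS: ")), &m); err != nil {
			fmt.Fprintf(os.Stderr, "bad STATS line: %v\n", err)
			os.Exit(1)
		}
		fmt.Printf("time to first byte: %v\n", m.FirstByteRead.Sub(m.RequestSent))
	}
}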
53 changes: 53 additions & 0 deletions images/gpu/triton/tensorrt/Dockerfile.llama-2-7b-chat-hf
@@ -0,0 +1,53 @@
# Use the official NVIDIA CUDA image as the base.
FROM nvidia/cuda:12.8.1-devel-ubuntu22.04

# Set the default shell to bash.
SHELL ["/bin/bash", "-c"]

# Consolidate system dependency installation into a single RUN command
# to reduce the number of layers in the final image.
RUN apt-get update && apt-get install -y \
neovim \
git \
openmpi-bin \
libopenmpi-dev \
python3.10 \
python3.10-dev \
python3-pip \
python3-venv \
python-is-python3 && \
# Clean up the apt cache to reduce image size.
rm -rf /var/lib/apt/lists/*

# Download TensorRT-LLM from the specified version tag.
ARG TENSORRT_LLM_VERSION="1.0.0"
ARG TENSORRT_LLM_DIR="/TensorRT-LLM-${TENSORRT_LLM_VERSION}"
RUN git clone --depth 1 --branch "v${TENSORRT_LLM_VERSION}" https://github.com/NVIDIA/TensorRT-LLM.git "${TENSORRT_LLM_DIR}"

# Create a Python virtual environment and add its bin directory to the system's PATH.
# This makes commands from the venv (like pip, huggingface-cli) available in all subsequent layers.
ENV VENV_PATH="/opt/venv"
RUN python3 -m venv "${VENV_PATH}"
ENV PATH="${VENV_PATH}/bin:${PATH}"

# Upgrade pip and install the huggingface_hub library.
RUN pip install --upgrade pip
RUN pip install huggingface_hub

# Download the model from Hugging Face.
# HF_TOKEN must be passed as a build argument because the Llama 2 repository
# is gated; note that build-arg values end up recorded in the image history.
ARG HF_TOKEN=""
ARG REPO_ID="meta-llama/Llama-2-7b-chat-hf"
ARG MODEL_DIR="/llama-2-7b-chat-hf"
RUN huggingface-cli download \
"${REPO_ID}" \
--local-dir "${MODEL_DIR}" \
--local-dir-use-symlinks False \
--token "${HF_TOKEN}"

# Set the working directory to the Llama example within the TensorRT-LLM repository.
WORKDIR "${TENSORRT_LLM_DIR}/examples/models/core/llama"

# Install the Python dependencies required for the Llama example.
# This command will use the pip from the virtual environment we added to the PATH.
RUN pip install -r requirements.txt