3 changes: 2 additions & 1 deletion Makefile
@@ -313,12 +313,13 @@ cos-gpu-smoke-tests: gpu-smoke-images $(RUNTIME_BIN)
gpu-images: gpu-smoke-images load-gpu_pytorch load-gpu_ollama load-gpu_ollama_client load-basic_busybox load-basic_alpine load-basic_python load-gpu_stable-diffusion-xl load-gpu_vllm load-gpu_nccl-tests load-benchmarks_ffmpeg
.PHONY: gpu-images

l4-gpu-images: load-gpu_sglang load-gpu_sglang_client
l4-gpu-images: load-gpu_sglang load-gpu_sglang_client load-gpu_triton load-gpu_triton_client
.PHONY: l4-gpu-images

l4-gpu-tests: l4-gpu-images $(RUNTIME_BIN)
@$(call install_runtime,$(RUNTIME),--nvproxy=true --nvproxy-docker=true --nvproxy-allowed-driver-capabilities=all)
@$(call sudo,test/gpu:sglang_test,--runtime=$(RUNTIME) -test.v $(ARGS))
@$(call sudo,test/gpu:triton_test,--runtime=$(RUNTIME) -test.v $(ARGS))
.PHONY: l4-gpu-tests

gpu-all-tests: gpu-images gpu-smoke-tests $(RUNTIME_BIN)
79 changes: 79 additions & 0 deletions images/gpu/triton/Dockerfile.x86_64
@@ -0,0 +1,79 @@
# --- Downloader Stage ---
# Fetches model/tokenizer assets from GCS
FROM google/cloud-sdk:541.0.0-slim AS downloader
RUN gcloud config set auth/disable_credentials true
RUN gsutil -m cp -r gs://gvisor/tests/models/llama-2-7b-chat-hf /
RUN mkdir -p /engines
RUN gsutil -m cp -r gs://gvisor/tests/l4/engines/llama-2-7b-chat-hf /engines/

# --- Builder Stage for TensorRT-LLM ---
# This stage uses 'git sparse-checkout' to download *only* the
# files we need, which is much faster than a full 'git clone'.
FROM nvcr.io/nvidia/tritonserver:25.08-trtllm-python-py3 AS trtllm_builder

WORKDIR /

# 1. Make a blob-less clone with no checkout. This is very fast.
RUN git clone --filter=blob:none --no-checkout --depth 1 \
https://github.com/NVIDIA/TensorRT-LLM.git /TensorRT-LLM
WORKDIR /TensorRT-LLM

# 2. Set up sparse checkout to define *only* the paths we need
RUN git sparse-checkout init --cone && \
git sparse-checkout set \
"triton_backend/all_models/inflight_batcher_llm/" \
"triton_backend/tools/"

# 3. Check out the commit pinned to the v1.2.0rc1 tag.
# This downloads *only* the files in the two directories above.
RUN git checkout 796891ba2a6959bad58c0da9645416c7264349e9

# --- Final Stage ---
# This is the final runtime image. The COPY commands below work unchanged
# because the builder stage produced the identical paths.
FROM nvcr.io/nvidia/tritonserver:25.08-trtllm-python-py3

# --- Build Arguments ---
ARG TOKENIZER_DIR=/llama-2-7b-chat-hf
ARG ENGINE_DIR=/engines/llama-2-7b-chat-hf/fp8/1-gpu
ARG MAX_BATCH_SIZE=1
ARG INSTANCE_COUNT=1
ARG TOKENIZER_TYPE=auto
ARG DECOUPLED_MODE=true
ARG MODEL_FOLDER=/models/
# Queue delay is expressed in microseconds (10000 us = 10 ms).
ARG MAX_QUEUE_DELAY_MICROSECONDS=10000
ARG TRITON_BACKEND=tensorrtllm
ARG LOGITS_DATATYPE="TYPE_FP32"
ARG FILL_TEMPLATE_SCRIPT=/TensorRT-LLM/triton_backend/tools/fill_template.py

# --- Asset Copying ---

# Copy only the tokenizer (needed for config)
COPY --from=downloader ${TOKENIZER_DIR} ${TOKENIZER_DIR}

# Copy *only* the model templates from the trtllm_builder stage
COPY --from=trtllm_builder /TensorRT-LLM/triton_backend/all_models/inflight_batcher_llm ${MODEL_FOLDER}

# Copy *only* the build script we need from the trtllm_builder stage
COPY --from=trtllm_builder ${FILL_TEMPLATE_SCRIPT} /usr/local/bin/fill_template.py
# Point FILL_TEMPLATE_SCRIPT at the script's new location.
ARG FILL_TEMPLATE_SCRIPT=/usr/local/bin/fill_template.py

# Copy *only* the specific engine directory we need, directly
# from the downloader into the final model repository path.
COPY --from=downloader ${ENGINE_DIR} ${MODEL_FOLDER}/tensorrt_llm/1/

# --- Model Configuration ---
# Run the template-filling commands to generate the Triton model configs
RUN python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/preprocessing/config.pbtxt \
tokenizer_dir:${TOKENIZER_DIR},triton_max_batch_size:${MAX_BATCH_SIZE},preprocessing_instance_count:${INSTANCE_COUNT} && \
python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/postprocessing/config.pbtxt \
tokenizer_dir:${TOKENIZER_DIR},triton_max_batch_size:${MAX_BATCH_SIZE},postprocessing_instance_count:${INSTANCE_COUNT} && \
python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/tensorrt_llm_bls/config.pbtxt \
prompt_embedding_table_data_type:TYPE_FP16,triton_max_batch_size:${MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},bls_instance_count:${INSTANCE_COUNT},logits_datatype:${LOGITS_DATATYPE} && \
python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/ensemble/config.pbtxt \
triton_max_batch_size:${MAX_BATCH_SIZE},logits_datatype:${LOGITS_DATATYPE} && \
python3 ${FILL_TEMPLATE_SCRIPT} -i ${MODEL_FOLDER}/tensorrt_llm/config.pbtxt \
prompt_embedding_table_data_type:TYPE_FP16,triton_backend:${TRITON_BACKEND},triton_max_batch_size:${MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},engine_dir:${MODEL_FOLDER}/tensorrt_llm/1,max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},batching_strategy:inflight_fused_batching,encoder_input_features_data_type:TYPE_FP16,logits_datatype:${LOGITS_DATATYPE}

CMD ["tritonserver", "--model-repository=/models/"]
11 changes: 11 additions & 0 deletions images/gpu/triton/client/BUILD
@@ -0,0 +1,11 @@
load("//tools:defs.bzl", "go_binary")

package(
default_applicable_licenses = ["//:license"],
licenses = ["notice"],
)

go_binary(
name = "client",
srcs = ["client.go"],
)
8 changes: 8 additions & 0 deletions images/gpu/triton/client/Dockerfile
@@ -0,0 +1,8 @@
FROM golang:1.22 AS builder

COPY client.go /client.go
RUN CGO_ENABLED=0 go build -o /httpclient /client.go

FROM alpine:latest
COPY --from=builder /httpclient /usr/bin/
CMD ["/usr/bin/httpclient"]
155 changes: 155 additions & 0 deletions images/gpu/triton/client/client.go
@@ -0,0 +1,155 @@
// Copyright 2025 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// A simple `curl`-like HTTP client that prints metrics after the request.
// All of its output is structured to be unambiguous even if stdout/stderr
// is combined, as is the case for Kubernetes logs.
// Useful for communicating with the Triton inference server.
package main

import (
"bufio"
"bytes"
"encoding/base64"
"encoding/json"
"flag"
"fmt"
"net/http"
"os"
"sort"
"strings"
"time"
)

// LINT.IfChange

// Flags.
var (
url = flag.String("url", "", "HTTP request URL.")
method = flag.String("method", "GET", "HTTP request method (GET or POST).")
postDataBase64 = flag.String("post_base64", "", "HTTP request POST data in base64 format; ignored for GET requests.")
timeout = flag.Duration("timeout", 0, "HTTP request timeout; 0 for no timeout.")
)

// bufSize is the size of buffers used for HTTP requests and responses.
const bufSize = 1024 * 1024 // 1MiB

// fatalf crashes the program with a given error message.
func fatalf(format string, values ...any) {
fmt.Fprintf(os.Stderr, "FATAL: "+format+"\n", values...)
os.Exit(1)
}

// Metrics contains the request metrics to export to JSON.
// This is parsed by the GPU test code that drives this client (the
// test/gpu:triton_test target added in the Makefile).
type Metrics struct {
// ProgramStarted is the time when the program started.
ProgramStarted time.Time `json:"program_started"`
// RequestSent is the time when the HTTP request was sent.
RequestSent time.Time `json:"request_sent"`
// ResponseReceived is the time when the HTTP response headers were received.
ResponseReceived time.Time `json:"response_received"`
// FirstByteRead is the time when the first HTTP response body byte was read.
FirstByteRead time.Time `json:"first_byte_read"`
// LastByteRead is the time when the last HTTP response body byte was read.
LastByteRead time.Time `json:"last_byte_read"`
}

func main() {
var metrics Metrics
metrics.ProgramStarted = time.Now()
flag.Parse()
if *url == "" {
fatalf("--url is required")
}
client := http.Client{
Transport: &http.Transport{
MaxIdleConns: 1,
IdleConnTimeout: *timeout,
ReadBufferSize: bufSize,
WriteBufferSize: bufSize,
},
Timeout: *timeout,
}
var request *http.Request
var err error
switch *method {
case "GET":
request, err = http.NewRequest("GET", *url, nil)
case "POST":
postData, postDataErr := base64.StdEncoding.DecodeString(*postDataBase64)
if postDataErr != nil {
fatalf("cannot decode POST data: %v", postDataErr)
}
request, err = http.NewRequest("POST", *url, bytes.NewBuffer(postData))
default:
err = fmt.Errorf("unknown method %q", *method)
}
if err != nil {
fatalf("cannot create request: %v", err)
}
orderedReqHeaders := make([]string, 0, len(request.Header))
for k := range request.Header {
orderedReqHeaders = append(orderedReqHeaders, k)
}
sort.Strings(orderedReqHeaders)
for _, k := range orderedReqHeaders {
for _, v := range request.Header[k] {
fmt.Fprintf(os.Stderr, "REQHEADER: %s: %s\n", k, v)
}
}
metrics.RequestSent = time.Now()
resp, err := client.Do(request)
metrics.ResponseReceived = time.Now()
if err != nil {
fatalf("cannot make request: %v", err)
}
gotFirstByte := false
scanner := bufio.NewScanner(resp.Body)
for scanner.Scan() {
if !gotFirstByte {
metrics.FirstByteRead = time.Now()
gotFirstByte = true
}
if scanner.Text() == "" {
continue
}
fmt.Printf("BODY: %q\n", strings.TrimPrefix(scanner.Text(), "data: "))
}
// Check for any errors that may have occurred during scanning
if err := scanner.Err(); err != nil {
fatalf("error reading response body: %v", err)
}
metrics.LastByteRead = time.Now()
if err := resp.Body.Close(); err != nil {
fatalf("cannot close response body: %v", err)
}
orderedRespHeaders := make([]string, 0, len(resp.Header))
for k := range resp.Header {
orderedRespHeaders = append(orderedRespHeaders, k)
}
sort.Strings(orderedRespHeaders)
for _, k := range orderedRespHeaders {
for _, v := range resp.Header[k] {
fmt.Fprintf(os.Stderr, "RESPHEADER: %s: %s\n", k, v)
}
}
metricsBytes, err := json.Marshal(&metrics)
if err != nil {
fatalf("cannot marshal metrics: %v", err)
}
fmt.Fprintf(os.Stderr, "STATS: %s\n", string(metricsBytes))
}

// LINT.ThenChange(../../ollama/client/client.go)
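Since the STATS line above is a single JSON object, the test code that drives this client can recover it with a simple prefix scan over the combined output. The sketch below is illustrative only and is not the actual gVisor test code; the struct mirrors the Metrics type defined above.

// parse_stats.go: illustrative sketch of recovering the Metrics JSON from the
// client's combined output; not the actual gVisor test code.
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"os"
	"strings"
	"time"
)

// metrics mirrors the Metrics struct emitted by the client above.
type metrics struct {
	ProgramStarted   time.Time `json:"program_started"`
	RequestSent      time.Time `json:"request_sent"`
	ResponseReceived time.Time `json:"response_received"`
	FirstByteRead    time.Time `json:"first_byte_read"`
	LastByteRead     time.Time `json:"last_byte_read"`
}

func main() {
	// Read the client's log lines from stdin and pick out the STATS line.
	scanner := bufio.NewScanner(os.Stdin)
	for scanner.Scan() {
		line := scanner.Text()
		if !strings.HasPrefix(line, "STATS: ") {
			continue
		}
		var m metrics
		if err := json.Unmarshal([]byte(strings.TrimPrefix(line, "STATS: ")), &m); err != nil {
			fmt.Fprintf(os.Stderr, "bad STATS line: %v\n", err)
			os.Exit(1)
		}
		fmt.Printf("time to first byte: %v\n", m.FirstByteRead.Sub(m.RequestSent))
	}
}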
53 changes: 53 additions & 0 deletions images/gpu/triton/tensorrt/Dockerfile.llama-2-7b-chat-hf
@@ -0,0 +1,53 @@
# Use the official NVIDIA CUDA image as the base.
FROM nvidia/cuda:12.8.1-devel-ubuntu22.04

# Set the default shell to bash.
SHELL ["/bin/bash", "-c"]

# Consolidate system dependency installation into a single RUN command
# to reduce the number of layers in the final image.
RUN apt-get update && apt-get install -y \
neovim \
git \
openmpi-bin \
libopenmpi-dev \
python3.10 \
python3.10-dev \
python3-pip \
python3-venv \
python-is-python3 && \
# Clean up the apt cache to reduce image size.
rm -rf /var/lib/apt/lists/*

# Download TensorRT-LLM from the specified version tag.
ARG TENSORRT_LLM_VERSION="1.0.0"
ARG TENSORRT_LLM_DIR="/TensorRT-LLM-${TENSORRT_LLM_VERSION}"
RUN git clone --depth 1 --branch "v${TENSORRT_LLM_VERSION}" https://github.com/NVIDIA/TensorRT-LLM.git "${TENSORRT_LLM_DIR}"

# Create a Python virtual environment and add its bin directory to the system's PATH.
# This makes commands from the venv (like pip, huggingface-cli) available in all subsequent layers.
ENV VENV_PATH="/opt/venv"
RUN python3 -m venv "${VENV_PATH}"
ENV PATH="${VENV_PATH}/bin:${PATH}"

# Upgrade pip and install the huggingface_hub library.
RUN pip install --upgrade pip
RUN pip install huggingface_hub

# Download the model from Hugging Face.
# HF_TOKEN must be passed as a build argument because the Llama 2 repository
# is gated; note that build-arg values end up recorded in the image history.
ARG HF_TOKEN=""
ARG REPO_ID="meta-llama/Llama-2-7b-chat-hf"
ARG MODEL_DIR="/llama-2-7b-chat-hf"
RUN huggingface-cli download \
"${REPO_ID}" \
--local-dir "${MODEL_DIR}" \
--local-dir-use-symlinks False \
--token "${HF_TOKEN}"

# Set the working directory to the Llama example within the TensorRT-LLM repository.
WORKDIR "${TENSORRT_LLM_DIR}/examples/models/core/llama"

# Install the Python dependencies required for the Llama example.
# This command will use the pip from the virtual environment we added to the PATH.
RUN pip install -r requirements.txt