
Commit 027a99f

feat: support cuda sglang
Signed-off-by: thxCode <[email protected]>
1 parent 796089a commit 027a99f

File tree

- pack/.post_operation/README.md
- pack/cann/Dockerfile
- pack/cuda/Dockerfile
- pack/matrix.yaml

4 files changed: +263, -5 lines changed

pack/.post_operation/README.md

Lines changed: 1 addition & 1 deletion
@@ -17,4 +17,4 @@ We leverage the matrix expansion feature of GPUStack Runner to achieve this, and
 - [x] 2025-10-29: Reinstall `ray[client] ray[default]` packages for CANN released images.
 - [x] 2025-11-03: Refresh MindIE entrypoint for CANN released images.
 - [x] 2025-11-05: Polish NVIDIA HPC-X configuration for CUDA released images.
-- [ ] 2025-11-06: Install EP kernel for CUDA released images.
+- [x] 2025-11-06: Install EP kernel for CUDA released images.

pack/cann/Dockerfile

Lines changed: 3 additions & 2 deletions
@@ -39,6 +39,7 @@
 # - VLLM_ASCEND_VERSION: Version of vLLM Ascend to use,
 #   if not specified, it will fetch from the vLLM Ascend PyPi RSS.
 # - VLLM_TORCH_VERSION: Version of Torch for vLLM to use.
+# - SGLANG_BASE_IMAGE: Base image for SGLang.
 # - SGLANG_VERSION: Version of SGLang to use.
 # - SGLANG_VLLM_VERSION: Version of vLLM for SGLang to use.
 # - SGLANG_TORCH_VERSION: Version of Torch for SGLang to use.
@@ -986,14 +987,14 @@ RUN <<EOF
     && rm -rf /root/.rustup
 EOF
 
-## Install DeepEP
+## Install SGLANG Kernel
 
 ARG SGLANG_KERNEL_VERSION
 
 ENV SGLANG_KERNEL_VERSION=${SGLANG_KERNEL_VERSION}
 
 RUN <<EOF
-# DeepEP
+# SGLANG Kernel
 
 CMAKE_MAX_JOBS="${CMAKE_MAX_JOBS}"
 if [[ -z "${CMAKE_MAX_JOBS}" ]]; then

pack/cuda/Dockerfile

Lines changed: 251 additions & 2 deletions
@@ -74,6 +74,10 @@
 # - VLLM_FLASHINFER_VERSION: Version of FlashInfer to use,
 #   which is used to build the FlashInfer wheel.
 # - VLLM_LMCACHE_VERSION: Version of lmcache to use.
+# - SGLANG_BASE_IMAGE: Base image for SGLang.
+# - SGLANG_VERSION: Version of SGLang to use.
+# - SGLANG_TORCH_VERSION: Version of Torch for SGLang to use.
+# - SGLANG_KERNEL_VERSION: Version of SGLang Kernel to use.
 ARG PYTHON_VERSION=3.12
 ARG CMAKE_MAX_JOBS
 ARG CUDA_VERSION=12.8.1
@@ -97,6 +101,11 @@ ARG VLLM_DEEPGEMM_VERSION=2.1.1.post3
 ARG VLLM_FLASHINFER_REPOSITORY=https://github.com/flashinfer-ai/flashinfer.git
 ARG VLLM_FLASHINFER_VERSION=0.3.1
 ARG VLLM_LMCACHE_VERSION=0.3.8
+ARG SGLANG_BASE_IMAGE=vllm
+ARG SGLANG_VERSION=0.5.4.post2
+ARG SGLANG_TORCH_VERSION=${VLLM_TORCH_VERSION}
+ARG SGLANG_TORCH_CUDA_VERSION=${VLLM_TORCH_CUDA_VERSION}
+ARG SGLANG_KERNEL_VERSION=0.3.16.post4
 
 #
 # Stage Bake Runtime
@@ -1125,7 +1134,7 @@ RUN <<EOF
     CMAKE_MAX_JOBS="8"
 fi
 VL_CUDA_ARCHS="${CUDA_ARCHS}"
-if [[ -z "${LC_CUDA_ARCHS}" ]]; then
+if [[ -z "${VL_CUDA_ARCHS}" ]]; then
     if (( $(echo "${CUDA_MAJOR} < 12" | bc -l) )); then
         VL_CUDA_ARCHS="7.5 8.0+PTX 8.9"
     elif (( $(echo "${CUDA_MAJOR}.${CUDA_MINOR} < 12.8" | bc -l) )); then
@@ -1177,7 +1186,8 @@ RUN --mount=type=bind,from=vllm-build-flashinfer,source=/,target=/flashinfer,rw
 fi
 
 # Download pre-compiled cubins
-FLASHINFER_CUBIN_DOWNLOAD_THREADS="${CMAKE_MAX_JOBS}"
+export FLASHINFER_CUBIN_DOWNLOAD_THREADS="${CMAKE_MAX_JOBS}"
+export FLASHINFER_LOGGING_LEVEL=warning
 python -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
 
 # Cleanup
@@ -1352,6 +1362,7 @@ RUN <<EOF
     --package vllm \
     --package flashinfer-python \
     --package torch \
+    --package triton \
     --package pplx-kernels \
     --package deep-gemm \
     --package deep-ep \
@@ -1364,3 +1375,241 @@ ENV RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES=1
 
 WORKDIR /
 ENTRYPOINT [ "tini", "--" ]
+
+# Stage SGLang
+#
+# Example build command:
+# docker build --progress=plain --platform=linux/amd64 --file=pack/cuda/Dockerfile --tag=gpustack/runner:cuda${CUDA_VERSION%.*}-sglang-linux-amd64 --target=sglang pack/cuda
+#
+
+FROM ${SGLANG_BASE_IMAGE} AS sglang
+SHELL ["/bin/bash", "-eo", "pipefail", "-c"]
+
+ARG TARGETPLATFORM
+ARG TARGETOS
+ARG TARGETARCH
+
+ENV UV_SYSTEM_PYTHON=1 \
+    UV_PRERELEASE=allow
+
+## Install Protobuf
+
+RUN <<EOF
+# Protobuf
+
+# Install
+apt-get update -y && apt-get install -y --no-install-recommends \
+    libgrpc-dev \
+    libgrpc++-dev \
+    libprotobuf-dev \
+    protobuf-compiler \
+    protobuf-compiler-grpc
+
+# Cleanup
+rm -rf /var/tmp/* \
+    && rm -rf /tmp/* \
+    && rm -rf /var/cache/apt
+EOF
+
+## Install Torch
+## It's recommended to use the same Torch version as vLLM for compatibility.
+## So we pass the Torch version from vLLM build stage.
+
+ARG SGLANG_TORCH_VERSION
+ARG SGLANG_TORCH_CUDA_VERSION
+
+ENV SGLANG_TORCH_VERSION=${SGLANG_TORCH_VERSION} \
+    SGLANG_TORCH_CUDA_VERSION=${SGLANG_TORCH_CUDA_VERSION}
+
+RUN <<EOF
+# Torch
+
+if [[ "${SGLANG_TORCH_VERSION}" == "${VLLM_TORCH_VERSION}" ]]; then
+    echo "Using the same Torch version as vLLM: ${SGLANG_TORCH_VERSION}"
+    exit 0
+fi
+
+# Install
+cat <<EOT >/tmp/requirements.txt
+torch==${SGLANG_TORCH_VERSION}
+torchvision
+torchaudio
+EOT
+IFS="." read -r CUDA_MAJOR CUDA_MINOR CUDA_PATCH <<< "${SGLANG_TORCH_CUDA_VERSION}"
+if [[ "${TARGETARCH}" == "amd64" ]]; then
+    uv pip install --index-url https://download.pytorch.org/whl/cu${CUDA_MAJOR}${CUDA_MINOR} \
+        -r /tmp/requirements.txt
+else
+    uv pip install --extra-index-url https://download.pytorch.org/whl/cpu/ \
+        -r /tmp/requirements.txt
+fi
+uv pip install \
+    numpy scipy
+
+# Cleanup
+rm -rf /var/tmp/* \
+    && rm -rf /tmp/*
+EOF
+
+## Install SGLang
+
+ARG CMAKE_MAX_JOBS
+ARG SGLANG_VERSION
+ARG SGLANG_VLLM_VERSION
+
+ENV SGLANG_VERSION=${SGLANG_VERSION} \
+    SGLANG_VLLM_VERSION=${SGLANG_VLLM_VERSION}
+
+RUN <<EOF
+# SGLang
+
+CMAKE_MAX_JOBS="${CMAKE_MAX_JOBS}"
+if [[ -z "${CMAKE_MAX_JOBS}" ]]; then
+    CMAKE_MAX_JOBS="$(( $(nproc) / 2 ))"
+fi
+if (( $(echo "${CMAKE_MAX_JOBS} > 8" | bc -l) )); then
+    CMAKE_MAX_JOBS="8"
+fi
+SG_CUDA_ARCHS="${CUDA_ARCHS}"
+if [[ -z "${SG_CUDA_ARCHS}" ]]; then
+    if (( $(echo "${CUDA_MAJOR} < 12" | bc -l) )); then
+        SG_CUDA_ARCHS="7.5 8.0+PTX 8.9"
+    elif (( $(echo "${CUDA_MAJOR}.${CUDA_MINOR} < 12.8" | bc -l) )); then
+        SG_CUDA_ARCHS="7.5 8.0+PTX 8.9 9.0"
+    else
+        SG_CUDA_ARCHS="7.5 8.0+PTX 8.9 9.0 10.0+PTX 12.0+PTX"
+    fi
+fi
+export MAX_JOBS="${CMAKE_MAX_JOBS}"
+export TORCH_CUDA_ARCH_LIST="${SG_CUDA_ARCHS}"
+export COMPILE_CUSTOM_KERNELS=1
+export NVCC_THREADS=1
+
+# Install SGLang
+git -C /tmp clone --recursive --shallow-submodules \
+    --depth 1 --branch v${SGLANG_VERSION} --single-branch \
+    https://github.com/sgl-project/sglang.git sglang-${SGLANG_VERSION}
+pushd /tmp/sglang-${SGLANG_VERSION}/python \
+    && uv pip install --verbose .[all]
+
+# Download pre-compiled cubins
+export FLASHINFER_CUBIN_DOWNLOAD_THREADS="${CMAKE_MAX_JOBS}"
+export FLASHINFER_LOGGING_LEVEL=warning
+python -m flashinfer --download-cubin
+
+# Cleanup
+rm -rf /var/tmp/* \
+    && rm -rf /tmp/*
+EOF
+
+## Install SGLang Router
+
+RUN <<EOF
+# SGlang Router
+
+# Install Rust
+curl --retry 3 --retry-connrefused --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+export PATH="/root/.cargo/bin:${PATH}" \
+    && rustc --version \
+    && cargo --version
+
+# Install build tools
+uv pip install \
+    setuptools-rust maturin
+
+# Install SGLang Router
+git -C /tmp clone --recursive --shallow-submodules \
+    --depth 1 --branch v${SGLANG_VERSION} --single-branch \
+    https://github.com/sgl-project/sglang.git sglang
+pushd /tmp/sglang/sgl-router \
+    && ulimit -n 65536 && maturin build --release --features vendored-openssl --out dist \
+    && tree -hs /tmp/sglang/sgl-router/dist \
+    && uv pip install --force-reinstall /tmp/sglang/sgl-router/dist/*.whl
+
+# Cleanup
+rm -rf /var/tmp/* \
+    && rm -rf /tmp/* \
+    && rm -rf /root/.cache \
+    && rm -rf /root/.cargo \
+    && rm -rf /root/.rustup
+EOF
+
+## Install SGLANG Kernel
+
+ARG SGLANG_KERNEL_VERSION
+
+ENV SGLANG_KERNEL_VERSION=${SGLANG_KERNEL_VERSION}
+
+RUN <<EOF
+# SGLANG Kernel
+
+IFS="." read -r CUDA_MAJOR CUDA_MINOR CUDA_PATCH <<< "${SGLANG_TORCH_CUDA_VERSION}"
+
+CMAKE_MAX_JOBS="${CMAKE_MAX_JOBS}"
+if [[ -z "${CMAKE_MAX_JOBS}" ]]; then
+    CMAKE_MAX_JOBS="$(( $(nproc) / 2 ))"
+fi
+if (( $(echo "${CMAKE_MAX_JOBS} > 8" | bc -l) )); then
+    CMAKE_MAX_JOBS="8"
+fi
+export MAX_JOBS="${CMAKE_MAX_JOBS}"
+
+# Download
+if (( $(echo "${CUDA_MAJOR}.${CUDA_MINOR} < 12.7" | bc -l) )); then
+    uv pip install --force-reinstall --no-deps \
+        https://github.com/sgl-project/whl/releases/download/v${SGLANG_KERNEL_VERSION}/sgl_kernel-${SGLANG_KERNEL_VERSION}+cu124-cp310-abi3-manylinux2014_$(uname -m).whl
+elif (( $(echo "${CUDA_MAJOR}.${CUDA_MINOR} < 12.9" | bc -l) )); then
+    uv pip install \
+        sgl-kernel==${SGLANG_KERNEL_VERSION}
+else
+    uv pip install --force-reinstall --no-deps \
+        https://github.com/sgl-project/whl/releases/download/v${SGLANG_KERNEL_VERSION}/sgl_kernel-${SGLANG_KERNEL_VERSION}+cu130-cp310-abi3-manylinux2014_$(uname -m).whl
+fi
+
+# Cleanup
+rm -rf /var/tmp/* \
+    && rm -rf /tmp/*
+EOF
+
+## Install Dependencies
+
+RUN <<EOF
+# Dependencies
+
+# Install Dependencies,
+# see https://github.com/sgl-project/sglang/blob/41c10e67fcae6ac50dfe283655bdf545d224cba9/docker/Dockerfile#L181-L209.
+cat <<EOT >/tmp/requirements.txt
+nvidia-cutlass-dsl==4.3.0.dev0
+datamodel_code_generator
+mooncake-transfer-engine==0.3.6.post1
+nixl
+EOT
+uv pip install \
+    -r /tmp/requirements.txt
+
+# Cleanup
+rm -rf /var/tmp/* \
+    && rm -rf /tmp/*
EOF
+
+## Postprocess
+
+RUN <<EOF
+# Postprocess
+
+# Review
+uv pip tree \
+    --package sglang \
+    --package sglang-router \
+    --package sgl-kernel \
+    --package flashinfer-python \
+    --package triton \
+    --package vllm \
+    --package torch \
+    --package deep-ep
+EOF
+
+## Entrypoint
+
+WORKDIR /
+ENTRYPOINT [ "tini", "--" ]

pack/matrix.yaml

Lines changed: 8 additions & 0 deletions
@@ -70,35 +70,43 @@ rules:
   services:
     - "voxbox"
     - "vllm"
+    - "sglang"
   args:
     - "CUDA_VERSION=12.4.1"
     - "VOXBOX_TORCH_CUDA_VERSION=12.6.3"
    - "VLLM_TORCH_CUDA_VERSION=12.6.3"
     - "VLLM_NVIDIA_GDRCOPY_VERSION=2.4.1"
     - "VLLM_NVIDIA_HPCX_VERSION=2.21.3"
     - "VLLM_AWS_EFA_VERSION=1.43.3"
+    - "SGLANG_BASE_IMAGE=gpustack/runner:cuda12.4-vllm0.11.0"
+    - "SGLANG_KERNEL_VERSION=0.3.12"
 ## NVIDIA CUDA 12.6.3, using PyTorch +cu126 in linux/amd64.
 ##
 - backend: "cuda"
   services:
     - "voxbox"
     - "vllm"
+    - "sglang"
   args:
     - "CUDA_VERSION=12.6.3"
     - "VLLM_NVIDIA_GDRCOPY_VERSION=2.4.1"
     - "VLLM_NVIDIA_HPCX_VERSION=2.21.3"
     - "VLLM_AWS_EFA_VERSION=1.43.3"
+    - "SGLANG_BASE_IMAGE=gpustack/runner:cuda12.6-vllm0.11.0"
+    - "SGLANG_KERNEL_VERSION=0.3.12"
 ## NVIDIA CUDA 12.8.1, using PyTorch +cu128 in both linux/amd64 and linux/arm64.
 ##
 - backend: "cuda"
   services:
     - "voxbox"
     - "vllm"
+    - "sglang"
   args:
     - "CUDA_VERSION=12.8.1"
     - "VLLM_NVIDIA_GDRCOPY_VERSION=2.4.1"
     - "VLLM_NVIDIA_HPCX_VERSION=2.22.1rc4"
     - "VLLM_AWS_EFA_VERSION=1.43.3"
+    - "SGLANG_BASE_IMAGE=gpustack/runner:cuda12.8-vllm0.11.0"
 
 #
 # Hygon DTK
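
To make the matrix addition concrete: per the matrix expansion described in pack/.post_operation/README.md, each rule is expanded into one build per listed service, with the rule's args passed through as build arguments. A hand-expanded sketch for the new "sglang" service of the CUDA 12.6.3 rule is shown below; the exact invocation the runner generates may differ, and the tag follows the naming pattern documented in pack/cuda/Dockerfile.

docker build --platform=linux/amd64 --file=pack/cuda/Dockerfile --target=sglang \
  --build-arg CUDA_VERSION=12.6.3 \
  --build-arg SGLANG_BASE_IMAGE=gpustack/runner:cuda12.6-vllm0.11.0 \
  --build-arg SGLANG_KERNEL_VERSION=0.3.12 \
  --tag=gpustack/runner:cuda12.6-sglang-linux-amd64 pack/cuda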
