 # - VLLM_FLASHINFER_VERSION: Version of FlashInfer to use,
 #   which is used to build the FlashInfer wheel.
 # - VLLM_LMCACHE_VERSION: Version of lmcache to use.
+# - SGLANG_BASE_IMAGE: Base image for SGLang.
+# - SGLANG_VERSION: Version of SGLang to use.
+# - SGLANG_TORCH_VERSION: Version of Torch for SGLang to use.
+# - SGLANG_TORCH_CUDA_VERSION: CUDA version of the Torch build for SGLang to use.
+# - SGLANG_KERNEL_VERSION: Version of SGLang Kernel to use.
 ARG PYTHON_VERSION=3.12
 ARG CMAKE_MAX_JOBS
 ARG CUDA_VERSION=12.8.1
@@ -97,6 +101,11 @@ ARG VLLM_DEEPGEMM_VERSION=2.1.1.post3
 ARG VLLM_FLASHINFER_REPOSITORY=https://github.com/flashinfer-ai/flashinfer.git
 ARG VLLM_FLASHINFER_VERSION=0.3.1
 ARG VLLM_LMCACHE_VERSION=0.3.8
+ARG SGLANG_BASE_IMAGE=vllm
+ARG SGLANG_VERSION=0.5.4.post2
+ARG SGLANG_TORCH_VERSION=${VLLM_TORCH_VERSION}
+ARG SGLANG_TORCH_CUDA_VERSION=${VLLM_TORCH_CUDA_VERSION}
+ARG SGLANG_KERNEL_VERSION=0.3.16.post4

 #
 # Stage Bake Runtime
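
A sketch of overriding the new build arguments at build time; the values shown simply restate the defaults above, and the target/tag follow the SGLang stage's example build command later in this diff:

    docker build --progress=plain --platform=linux/amd64 \
      --file=pack/cuda/Dockerfile \
      --build-arg SGLANG_VERSION=0.5.4.post2 \
      --build-arg SGLANG_KERNEL_VERSION=0.3.16.post4 \
      --target=sglang \
      --tag=gpustack/runner:cuda12.8-sglang-linux-amd64 \
      pack/cuda
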
@@ -1125,7 +1134,7 @@ RUN <<EOF
     CMAKE_MAX_JOBS="8"
 fi
 VL_CUDA_ARCHS="${CUDA_ARCHS}"
-if [[ -z "${LC_CUDA_ARCHS}" ]]; then
+if [[ -z "${VL_CUDA_ARCHS}" ]]; then
     if (( $(echo "${CUDA_MAJOR} < 12" | bc -l) )); then
         VL_CUDA_ARCHS="7.5 8.0+PTX 8.9"
     elif (( $(echo "${CUDA_MAJOR}.${CUDA_MINOR} < 12.8" | bc -l) )); then
@@ -1177,7 +1186,8 @@ RUN --mount=type=bind,from=vllm-build-flashinfer,source=/,target=/flashinfer,rw
 fi

 # Download pre-compiled cubins
-FLASHINFER_CUBIN_DOWNLOAD_THREADS="${CMAKE_MAX_JOBS}"
+export FLASHINFER_CUBIN_DOWNLOAD_THREADS="${CMAKE_MAX_JOBS}"
+export FLASHINFER_LOGGING_LEVEL=warning
 python -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."

 # Cleanup
@@ -1352,6 +1362,7 @@ RUN <<EOF
     --package vllm \
     --package flashinfer-python \
     --package torch \
+    --package triton \
     --package pplx-kernels \
     --package deep-gemm \
     --package deep-ep \
@@ -1364,3 +1375,241 @@ ENV RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES=1

 WORKDIR /
 ENTRYPOINT [ "tini", "--" ]
+
+#
+# Stage SGLang
+#
+# Example build command:
+#   docker build --progress=plain --platform=linux/amd64 --file=pack/cuda/Dockerfile --tag=gpustack/runner:cuda${CUDA_VERSION%.*}-sglang-linux-amd64 --target=sglang pack/cuda
+#
+
+FROM ${SGLANG_BASE_IMAGE} AS sglang
+SHELL ["/bin/bash", "-eo", "pipefail", "-c"]
+
+ARG TARGETPLATFORM
+ARG TARGETOS
+ARG TARGETARCH
+
+ENV UV_SYSTEM_PYTHON=1 \
+    UV_PRERELEASE=allow
+## Install Protobuf
+
+RUN <<EOF
+# Protobuf
+
+# Install
+apt-get update -y && apt-get install -y --no-install-recommends \
+    libgrpc-dev \
+    libgrpc++-dev \
+    libprotobuf-dev \
+    protobuf-compiler \
+    protobuf-compiler-grpc
+
+# Cleanup
+rm -rf /var/tmp/* \
+    && rm -rf /tmp/* \
+    && rm -rf /var/cache/apt
+EOF
+
+## Install Torch
+## It's recommended to use the same Torch version as vLLM for compatibility,
+## so we pass the Torch version through from the vLLM build stage.
+
+ARG SGLANG_TORCH_VERSION
+ARG SGLANG_TORCH_CUDA_VERSION
+
+ENV SGLANG_TORCH_VERSION=${SGLANG_TORCH_VERSION} \
+    SGLANG_TORCH_CUDA_VERSION=${SGLANG_TORCH_CUDA_VERSION}
+
+RUN <<EOF
+# Torch
+
+if [[ "${SGLANG_TORCH_VERSION}" == "${VLLM_TORCH_VERSION}" ]]; then
+    echo "Using the same Torch version as vLLM: ${SGLANG_TORCH_VERSION}"
+    exit 0
+fi
+
+# Install
+cat <<EOT >/tmp/requirements.txt
+torch==${SGLANG_TORCH_VERSION}
+torchvision
+torchaudio
+EOT
+IFS="." read -r CUDA_MAJOR CUDA_MINOR CUDA_PATCH <<< "${SGLANG_TORCH_CUDA_VERSION}"
+if [[ "${TARGETARCH}" == "amd64" ]]; then
+    uv pip install --index-url https://download.pytorch.org/whl/cu${CUDA_MAJOR}${CUDA_MINOR} \
+        -r /tmp/requirements.txt
+else
+    uv pip install --extra-index-url https://download.pytorch.org/whl/cpu/ \
+        -r /tmp/requirements.txt
+fi
+uv pip install \
+    numpy scipy
+
+# Cleanup
+rm -rf /var/tmp/* \
+    && rm -rf /tmp/*
+EOF
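+
+# For reference: with the default SGLANG_TORCH_CUDA_VERSION=12.8.1, the read
+# above yields CUDA_MAJOR=12 and CUDA_MINOR=8, so the amd64 branch installs
+# from https://download.pytorch.org/whl/cu128.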
+
+## Install SGLang
+
+ARG CMAKE_MAX_JOBS
+ARG SGLANG_VERSION
+ARG SGLANG_VLLM_VERSION
+
+ENV SGLANG_VERSION=${SGLANG_VERSION} \
+    SGLANG_VLLM_VERSION=${SGLANG_VLLM_VERSION}
+
+RUN <<EOF
+# SGLang
+
+CMAKE_MAX_JOBS="${CMAKE_MAX_JOBS}"
+if [[ -z "${CMAKE_MAX_JOBS}" ]]; then
+    CMAKE_MAX_JOBS="$(( $(nproc) / 2 ))"
+fi
+if (( $(echo "${CMAKE_MAX_JOBS} > 8" | bc -l) )); then
+    CMAKE_MAX_JOBS="8"
+fi
+# Parse the CUDA version so the arch-list branches below have
+# CUDA_MAJOR/CUDA_MINOR set, as the SGLang Kernel stage does.
+IFS="." read -r CUDA_MAJOR CUDA_MINOR CUDA_PATCH <<< "${SGLANG_TORCH_CUDA_VERSION}"
+SG_CUDA_ARCHS="${CUDA_ARCHS}"
+if [[ -z "${SG_CUDA_ARCHS}" ]]; then
+    if (( $(echo "${CUDA_MAJOR} < 12" | bc -l) )); then
+        SG_CUDA_ARCHS="7.5 8.0+PTX 8.9"
+    elif (( $(echo "${CUDA_MAJOR}.${CUDA_MINOR} < 12.8" | bc -l) )); then
+        SG_CUDA_ARCHS="7.5 8.0+PTX 8.9 9.0"
+    else
+        SG_CUDA_ARCHS="7.5 8.0+PTX 8.9 9.0 10.0+PTX 12.0+PTX"
+    fi
+fi
+export MAX_JOBS="${CMAKE_MAX_JOBS}"
+export TORCH_CUDA_ARCH_LIST="${SG_CUDA_ARCHS}"
+export COMPILE_CUSTOM_KERNELS=1
+export NVCC_THREADS=1
+
+# Install SGLang
+git -C /tmp clone --recursive --shallow-submodules \
+    --depth 1 --branch v${SGLANG_VERSION} --single-branch \
+    https://github.com/sgl-project/sglang.git sglang-${SGLANG_VERSION}
+pushd /tmp/sglang-${SGLANG_VERSION}/python \
+    && uv pip install --verbose ".[all]"
+
+# Download pre-compiled cubins
+export FLASHINFER_CUBIN_DOWNLOAD_THREADS="${CMAKE_MAX_JOBS}"
+export FLASHINFER_LOGGING_LEVEL=warning
+python -m flashinfer --download-cubin
+
+# Cleanup
+rm -rf /var/tmp/* \
+    && rm -rf /tmp/*
+EOF
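+
+# For reference: with the default CUDA 12.8 toolchain the else branch above
+# applies, so TORCH_CUDA_ARCH_LIST is "7.5 8.0+PTX 8.9 9.0 10.0+PTX 12.0+PTX".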
+
+## Install SGLang Router
+
+RUN <<EOF
+# SGLang Router
+
+# Install Rust
+curl --retry 3 --retry-connrefused --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+export PATH="/root/.cargo/bin:${PATH}" \
+    && rustc --version \
+    && cargo --version
+
+# Install build tools
+uv pip install \
+    setuptools-rust maturin
+
+# Install SGLang Router
+git -C /tmp clone --recursive --shallow-submodules \
+    --depth 1 --branch v${SGLANG_VERSION} --single-branch \
+    https://github.com/sgl-project/sglang.git sglang
+pushd /tmp/sglang/sgl-router \
+    && ulimit -n 65536 && maturin build --release --features vendored-openssl --out dist \
+    && tree -hs /tmp/sglang/sgl-router/dist \
+    && uv pip install --force-reinstall /tmp/sglang/sgl-router/dist/*.whl
+
+# Cleanup
+rm -rf /var/tmp/* \
+    && rm -rf /tmp/* \
+    && rm -rf /root/.cache \
+    && rm -rf /root/.cargo \
+    && rm -rf /root/.rustup
+EOF
+
+## Install SGLang Kernel
+
+ARG SGLANG_KERNEL_VERSION
+
+ENV SGLANG_KERNEL_VERSION=${SGLANG_KERNEL_VERSION}
+
+RUN <<EOF
+# SGLang Kernel
+
+IFS="." read -r CUDA_MAJOR CUDA_MINOR CUDA_PATCH <<< "${SGLANG_TORCH_CUDA_VERSION}"
+
+CMAKE_MAX_JOBS="${CMAKE_MAX_JOBS}"
+if [[ -z "${CMAKE_MAX_JOBS}" ]]; then
+    CMAKE_MAX_JOBS="$(( $(nproc) / 2 ))"
+fi
+if (( $(echo "${CMAKE_MAX_JOBS} > 8" | bc -l) )); then
+    CMAKE_MAX_JOBS="8"
+fi
+export MAX_JOBS="${CMAKE_MAX_JOBS}"
+
+# Download
+if (( $(echo "${CUDA_MAJOR}.${CUDA_MINOR} < 12.7" | bc -l) )); then
+    uv pip install --force-reinstall --no-deps \
+        https://github.com/sgl-project/whl/releases/download/v${SGLANG_KERNEL_VERSION}/sgl_kernel-${SGLANG_KERNEL_VERSION}+cu124-cp310-abi3-manylinux2014_$(uname -m).whl
+elif (( $(echo "${CUDA_MAJOR}.${CUDA_MINOR} < 12.9" | bc -l) )); then
+    uv pip install \
+        sgl-kernel==${SGLANG_KERNEL_VERSION}
+else
+    uv pip install --force-reinstall --no-deps \
+        https://github.com/sgl-project/whl/releases/download/v${SGLANG_KERNEL_VERSION}/sgl_kernel-${SGLANG_KERNEL_VERSION}+cu130-cp310-abi3-manylinux2014_$(uname -m).whl
+fi
+
+# Cleanup
+rm -rf /var/tmp/* \
+    && rm -rf /tmp/*
+EOF
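+
+# For reference: with the default SGLANG_TORCH_CUDA_VERSION=12.8.1, 12.8 falls
+# in the [12.7, 12.9) branch, so sgl-kernel is installed from PyPI rather than
+# from a GitHub release wheel.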
+
+## Install Dependencies
+
+RUN <<EOF
+# Dependencies
+
+# Install Dependencies,
+# see https://github.com/sgl-project/sglang/blob/41c10e67fcae6ac50dfe283655bdf545d224cba9/docker/Dockerfile#L181-L209.
+cat <<EOT >/tmp/requirements.txt
+nvidia-cutlass-dsl==4.3.0.dev0
+datamodel_code_generator
+mooncake-transfer-engine==0.3.6.post1
+nixl
+EOT
+uv pip install \
+    -r /tmp/requirements.txt
+
+# Cleanup
+rm -rf /var/tmp/* \
+    && rm -rf /tmp/*
+EOF
+
+## Postprocess
+
+RUN <<EOF
+# Postprocess
+
+# Review
+uv pip tree \
+    --package sglang \
+    --package sglang-router \
+    --package sgl-kernel \
+    --package flashinfer-python \
+    --package triton \
+    --package vllm \
+    --package torch \
+    --package deep-ep
+EOF
+
+## Entrypoint
+
+WORKDIR /
+ENTRYPOINT [ "tini", "--" ]
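
A usage sketch for the resulting image (the model path and port are illustrative, not part of this change); the server command runs under the tini entrypoint above:

    docker run --rm --gpus all \
      gpustack/runner:cuda12.8-sglang-linux-amd64 \
      python -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --port 30000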