1 change: 1 addition & 0 deletions .github/workflows/pack.yml
@@ -49,6 +49,7 @@ on:
- voxbox
- mindie
- vllm
- sglang
# Since a specific Backend and Target can still produce many tags,
# we can leverage this input to pack one specific tag, or even one os/arch.
tag:
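Once this option lands, a pack run can be dispatched for SGLang alone. A minimal sketch, assuming the choice input shown above is named `service` (its actual name sits outside this hunk) and using the `tag` input to narrow the build:

# Hypothetical manual dispatch; the `service` input name is an assumption.
gh workflow run pack.yml \
  -f service=sglang \
  -f tag=cuda12.8.1-sglang0.5.3.post3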
2 changes: 1 addition & 1 deletion gpustack_runner/runner.py
@@ -11,7 +11,7 @@
from dataclasses_json import dataclass_json

_RE_DOCKER_IMAGE = re.compile(
r"(?:(?P<prefix>[\w\\.\-]+(?:/[\w\\.\-]+)*)/)?gpustack/runner:(?P<backend>(Host|cann|corex|cuda|dtk|maca|rocm))(?P<backend_version>[XY\d\\.]+)(?:-(?P<backend_variant>\w+))?-(?P<service>(vllm|voxbox|mindie))(?P<service_version>[\w\\.]+)(?:-(?P<suffix>\w+))?",
r"(?:(?P<prefix>[\w\\.\-]+(?:/[\w\\.\-]+)*)/)?gpustack/runner:(?P<backend>(Host|cann|corex|cuda|dtk|maca|rocm))(?P<backend_version>[XY\d\\.]+)(?:-(?P<backend_variant>\w+))?-(?P<service>(vllm|voxbox|sglang|mindie))(?P<service_version>[\w\\.]+)(?:-(?P<suffix>\w+))?",
)
"""
Regex for Docker image parsing,
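A quick way to confirm what the widened pattern accepts, sketched under the assumption that `gpustack_runner` is importable locally and that the private `_RE_DOCKER_IMAGE` is applied via `match` as in the module:

# Regex smoke check; assumes a local editable install of gpustack_runner.
python - <<'PY'
from gpustack_runner.runner import _RE_DOCKER_IMAGE

m = _RE_DOCKER_IMAGE.match("gpustack/runner:cuda12.8-sglang0.5.3.post3")
# Expected: backend=cuda, backend_version=12.8,
# service=sglang, service_version=0.5.3.post3
print(m.groupdict() if m else "no match")
PY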
130 changes: 130 additions & 0 deletions pack/cann/Dockerfile
@@ -17,6 +17,12 @@
# - Install vLLM-Ascend from source.
# - Install dependencies.
# - Postprocess, review installation.
# 5. sglang target.
# - Build SGLang from source (Ascend/NPU), including sgl-kernel-npu and deep-ep.
# - Install sglang with NPU extras.
# - Ecosystem install: MemFabric and Triton Ascend.
# - Optional: Install BiSheng toolkit.
# - Postprocess, review installation.

# Argument usage:
# - PYTHON_VERSION: Version of Python to use.
@@ -33,6 +39,8 @@
# - VLLM_ASCEND_VERSION: Version of vLLM Ascend to use,
# if not specified, it will fetch from the vLLM Ascend PyPi RSS.
# - VLLM_TORCH_VERSION: Version of Torch for vLLM to use.
# - SGLANG_VERSION: Version of SGLang to use.

ARG PYTHON_VERSION=3.11
ARG CMAKE_MAX_JOBS
ARG CANN_VERSION=8.2.rc2
@@ -737,3 +745,125 @@ ENV RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1

WORKDIR /
ENTRYPOINT [ "tini", "--" ]

# Stage SGLang (inherits vLLM)
#
# Example build command:
# docker build --progress=plain --platform=linux/arm64 \
#   --file=pack/cann/Dockerfile \
#   --tag=gpustack/runner:cann${CANN_VERSION%.*}-sglang${SGLANG_VERSION}-linux-arm64 \
#   --target=sglang pack/cann
#
FROM vllm AS sglang
SHELL ["/bin/bash", "-eo", "pipefail", "-c"]

ARG TARGETPLATFORM
ARG TARGETOS
ARG TARGETARCH

ENV UV_SYSTEM_PYTHON=1 \
UV_PRERELEASE=allow

## Build args for SGLang
ARG SGL_REPO="https://github.com/sgl-project/sglang.git"
ARG SGL_DEFAULT="main"
ARG SGL_BRANCH=${SGL_DEFAULT}
ARG BUILD_TYPE=srt
ARG NO_DEPS_FLAG=""
ARG SGLANG_VERSION=0.5.3.post3
ENV SGLANG_VERSION=${SGLANG_VERSION}

## Build args for sgl-kernel-npu
ARG SGL_KERNEL_NPU_REPO="https://github.com/sgl-project/sgl-kernel-npu.git"
ARG SGL_KERNEL_NPU_BRANCH=${SGL_DEFAULT}
## NPU ecosystem components
ARG MEMFABRIC_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl"
ARG TRITON_ASCEND_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/triton_ascend-3.2.0%2Bgitb0ea0850-cp311-cp311-linux_aarch64.whl"
ARG BISHENG_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/Ascend-BiSheng-toolkit_aarch64.run"

## Ascend toolkit path
ENV ASCEND_CANN_PATH="${CANN_HOME}/ascend-toolkit"

## Install SGLang and NPU components
RUN <<EOF
# Prepare Python build deps and utilities
uv pip install --verbose wheel build IPython orjson python-multipart pybind11

# Clean any previous installs
pip uninstall -y sgl_kernel_npu deep-ep sglang || true

# Ecosystem: MemFabric and Triton Ascend
uv pip install --no-cache-dir wheel==0.45.1
uv pip install --no-cache-dir ${MEMFABRIC_URL}
uv pip install --no-cache-dir ${TRITON_ASCEND_URL}

# Clone SGLang and install Python package (NPU extras)
mkdir -p /sgl-workspace && pushd /sgl-workspace
git clone ${SGL_REPO}
cd sglang
# Prefer version tag if provided, otherwise fall back to branch selection
if [[ -n "${SGLANG_VERSION}" ]]; then
git fetch --tags --depth=1
if git rev-parse -q --verify "refs/tags/v${SGLANG_VERSION}" >/dev/null; then
echo "Checking out tag v${SGLANG_VERSION}"; git checkout -q "tags/v${SGLANG_VERSION}"
elif git rev-parse -q --verify "refs/tags/${SGLANG_VERSION}" >/dev/null; then
echo "Checking out tag ${SGLANG_VERSION}"; git checkout -q "tags/${SGLANG_VERSION}"
elif git rev-parse -q --verify "${SGLANG_VERSION}" >/dev/null; then
echo "Checking out commit/branch ${SGLANG_VERSION}"; git checkout -q "${SGLANG_VERSION}"
elif [[ "${SGL_BRANCH}" != "${SGL_DEFAULT}" ]]; then
echo "Checking out branch ${SGL_BRANCH}"; git checkout -q "${SGL_BRANCH}"
else
echo "Using ${SGL_DEFAULT} default branch"
fi
else
if [[ "${SGL_BRANCH}" != "${SGL_DEFAULT}" ]]; then
echo "Checking out branch ${SGL_BRANCH}"; git checkout -q "${SGL_BRANCH}"
fi
fi
rm -f python/pyproject.toml
mv python/pyproject_other.toml python/pyproject.toml
if [[ "${BUILD_TYPE}" == "srt" ]]; then
python -m pip --no-cache-dir install -e "python[srt_npu]" ${NO_DEPS_FLAG}
else
python -m pip --no-cache-dir install -e "python[all_npu]" ${NO_DEPS_FLAG}
fi
popd

# Build sgl-kernel-npu and deep-ep wheels
git -C /sgl-workspace clone --depth 1 ${SGL_KERNEL_NPU_REPO} ${SGL_KERNEL_NPU_BRANCH:+--branch ${SGL_KERNEL_NPU_BRANCH}}
export LD_LIBRARY_PATH=${ASCEND_CANN_PATH}/latest/runtime/lib64/stub:$LD_LIBRARY_PATH
source ${ASCEND_CANN_PATH}/set_env.sh
pushd /sgl-workspace/sgl-kernel-npu
bash build.sh
pip install output/deep_ep*.whl output/sgl_kernel_npu*.whl --no-cache-dir
popd

# Link deep_ep cpp .so to package root for runtime discovery
cd "$(pip show deep-ep | awk '/^Location:/ {print $2}')" && ln -sf deep_ep/deep_ep_cpp*.so .

# Install BiSheng toolkit (Ascend)
wget ${BISHENG_URL} && chmod a+x Ascend-BiSheng-toolkit_aarch64.run && ./Ascend-BiSheng-toolkit_aarch64.run --install && rm Ascend-BiSheng-toolkit_aarch64.run

# Cleanup
rm -rf /var/tmp/* \
&& rm -rf /tmp/*
EOF

## Postprocess review
RUN <<EOF
uv pip tree \
--package sglang \
--package torch \
--package torch-npu \
--package deep-ep
EOF

## Performance environment variables
ENV PYTORCH_NPU_ALLOC_CONF=expandable_segments:True \
SGLANG_SET_CPU_AFFINITY=1 \
SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 \
HCCL_BUFFSIZE=200 \
SGLANG_NPU_USE_MLAPO=1

WORKDIR /
ENTRYPOINT [ "tini", "--" ]
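A smoke test of the resulting image could look like the following; the Ascend device mappings, driver mount, and model are illustrative placeholders, not part of this diff:

# Illustrative run only; adjust devices, mounts, and model for the host.
docker run --rm \
  --device=/dev/davinci0 \
  --device=/dev/davinci_manager \
  --device=/dev/devmm_svm \
  --device=/dev/hisi_hdc \
  -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
  -p 30000:30000 \
  gpustack/runner:cann8.2-sglang0.5.3.post3-linux-arm64 \
  python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000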
91 changes: 91 additions & 0 deletions pack/cuda/Dockerfile
@@ -19,6 +19,9 @@
# - Install FlashInfer if existed.
# - Install dependencies.
# - Postprocess, review installation.
# 4. sglang target.
# - Install SGLang from PyPI (version controlled via SGLANG_VERSION).
# - Postprocess, review installation.

# Argument usage:
# - PYTHON_VERSION: Version of Python to use.
@@ -54,6 +57,8 @@
# - VLLM_FLASHINFER_VERSION: Version of FlashInfer to use,
# which is used to build the FlashInfer wheel.
# - VLLM_LMCACHE_VERSION: Version of lmcache to use.
# - SGLANG_VERSION: Version of SGLang to install (PyPI). Defaults to 0.5.3.post3.

ARG PYTHON_VERSION=3.12
ARG CMAKE_MAX_JOBS
ARG CUDA_VERSION=12.8.1
@@ -74,6 +79,10 @@ ARG VLLM_FLASHINFER_REPOSITORY=https://github.com/flashinfer-ai/flashinfer.git
ARG VLLM_FLASHINFER_VERSION=0.3.1
ARG VLLM_LMCACHE_VERSION=0.3.8

# SGLang build args (mirroring vLLM)
ARG SGLANG_VERSION=0.5.3.post3


#
# Stage Bake Runtime
#
@@ -954,3 +963,85 @@ ENV RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES=1

WORKDIR /
ENTRYPOINT [ "tini", "--" ]

# Stage SGLang (inherits vLLM)
#
# Example build command:
# docker build --progress=plain --platform=linux/amd64 \
#   --file=pack/cuda/Dockerfile \
#   --tag=gpustack/runner:cuda${CUDA_VERSION%.*}-sglang${SGLANG_VERSION}-linux-amd64 \
#   --target=sglang pack/cuda
#
FROM vllm AS sglang
SHELL ["/bin/bash", "-eo", "pipefail", "-c"]

ARG TARGETPLATFORM
ARG TARGETOS
ARG TARGETARCH

ENV UV_SYSTEM_PYTHON=1 \
UV_PRERELEASE=allow

## Install SGLang

ARG SGLANG_VERSION

ENV SGLANG_VERSION=${SGLANG_VERSION}

RUN <<EOF
# SGLang

# Install
uv pip install --verbose \
sglang==${SGLANG_VERSION}

# Cleanup
rm -rf /var/tmp/* \
&& rm -rf /tmp/*
EOF

## Install Dependencies

RUN <<EOF
# Dependencies

cat <<EOT >/tmp/requirements.txt
requests
pyyaml
httpx<1.0
fastapi
uvicorn
EOT
uv pip install \
-r /tmp/requirements.txt

# Review
uv pip tree \
--package sglang \
--package vllm \
--package torch
EOF

## Runtime Enhancements

# Build-time switches
ARG NCCL_ENABLE=1
ARG NCCL_PACKAGE=nvidia-nccl-cu12
ARG NCCL_VERSION=2.27.6
ARG FLASHINFER_PREFETCH_CUBIN=1

RUN <<EOF
# Runtime accelerators

# NCCL: configurable install via build args
if [[ "${NCCL_ENABLE}" == "1" ]]; then
uv pip install --no-cache-dir ${NCCL_PACKAGE}==${NCCL_VERSION} --force-reinstall --no-deps
fi

# FlashInfer cubin prefetch: only if package is present
if [[ "${FLASHINFER_PREFETCH_CUBIN}" == "1" ]]; then
python -c "import importlib.util,sys; sys.exit(0 if importlib.util.find_spec('flashinfer') else 1)" \
&& python -m flashinfer --download-cubin || true
fi

# Cleanup
rm -rf /var/tmp/* \
&& rm -rf /tmp/*
EOF
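Because both enhancements are gated on build args, they can be toggled per build without editing the Dockerfile; for example (tag and platform illustrative):

# Skip the NCCL pin and the FlashInfer cubin prefetch for one build.
docker build --progress=plain --platform=linux/amd64 \
  --file=pack/cuda/Dockerfile \
  --build-arg NCCL_ENABLE=0 \
  --build-arg FLASHINFER_PREFETCH_CUBIN=0 \
  --tag=gpustack/runner:cuda12.8-sglang0.5.3.post3-linux-amd64 \
  --target=sglang pack/cuda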
14 changes: 13 additions & 1 deletion pack/expand_matrix.sh
@@ -238,6 +238,18 @@ EOT
--arg tag "${TAG}${TAG_SUFFIX}" \
--arg platform_tag "${PLATFORM_TAG}" \
'.[$tag] += [$platform_tag]')"
PLATFORM_TAG_CACHE="[\"${PLATFORM_TAG}\",\"${PLATFORM_TAG_XY}\",\"${PLATFORM_TAG_X}\"]"
if [[ "${SERVICE}" == "sglang" ]]; then
IFS="." read -r V_MAJOR V_MINOR V_PATCH V_POST <<<"${VLLM_VERSION}"
if [[ -z "${V_PATCH}" ]]; then V_PATCH=0; fi
VLLM_TAG="${TAG_PREFIX}vllm${V_MAJOR}.${V_MINOR}.${V_PATCH}"
VLLM_TAG_X="${TAG_PREFIX}vllm${V_MAJOR}"
VLLM_TAG_XY="${TAG_PREFIX}vllm${V_MAJOR}.${V_MINOR}"
VLLM_PLATFORM_TAG="${VLLM_TAG}-${OS}-${ARCH}"
VLLM_PLATFORM_TAG_X="${VLLM_TAG_X}-${OS}-${ARCH}"
VLLM_PLATFORM_TAG_XY="${VLLM_TAG_XY}-${OS}-${ARCH}"
PLATFORM_TAG_CACHE="[\"${PLATFORM_TAG}\",\"${PLATFORM_TAG_XY}\",\"${PLATFORM_TAG_X}\",\"${VLLM_PLATFORM_TAG}\",\"${VLLM_PLATFORM_TAG_XY}\",\"${VLLM_PLATFORM_TAG_X}\"]"
fi
BUILD_JOBS="$(echo "${BUILD_JOBS}" | jq -cr \
--arg backend "${BACKEND}" \
--arg backend_version "${BACKEND_VERSION}" \
@@ -249,7 +261,7 @@ EOT
--arg tag "${TAG}${TAG_SUFFIX}" \
--argjson args "${ARGS}" \
--arg runner "${RUNNER}" \
--argjson platform_tag_cache "[\"${PLATFORM_TAG}\",\"${PLATFORM_TAG_XY}\",\"${PLATFORM_TAG_X}\"]" \
--argjson platform_tag_cache "${PLATFORM_TAG_CACHE}" \
--arg original_backend_version "${ORIGINAL_BACKEND_VERSION}" \
'[{
backend: $backend,
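The sglang branch above exists so that sglang builds can seed their Docker layer cache from the matching vLLM tags. A standalone sketch of the version split and the tags it produces, with illustrative values:

# Illustrative values; in the script these come from the matrix loop.
TAG_PREFIX="cuda12.8-" OS="linux" ARCH="amd64" VLLM_VERSION="0.11.0"
IFS="." read -r V_MAJOR V_MINOR V_PATCH V_POST <<<"${VLLM_VERSION}"
if [[ -z "${V_PATCH}" ]]; then V_PATCH=0; fi
echo "${TAG_PREFIX}vllm${V_MAJOR}.${V_MINOR}.${V_PATCH}-${OS}-${ARCH}"  # cuda12.8-vllm0.11.0-linux-amd64
echo "${TAG_PREFIX}vllm${V_MAJOR}.${V_MINOR}-${OS}-${ARCH}"             # cuda12.8-vllm0.11-linux-amd64
echo "${TAG_PREFIX}vllm${V_MAJOR}-${OS}-${ARCH}"                        # cuda12.8-vllm0-linux-amd64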
26 changes: 23 additions & 3 deletions pack/matrix.yaml
@@ -4,7 +4,7 @@
# used to select the packing rules and Dockerfile below `pack` directory.
# - services: (Optional) The inference service to pack for,
# used to select the Docker build phase described in `pack/${backend}/Dockerfile`.
# Default to `voxbox` and `vllm`.
# Default to `voxbox`, `vllm`, and `sglang`.
# - platforms: (Optional) The platforms to build for,
# used to select the Docker Linux build platforms.
# Default to `linux/amd64` and `linux/arm64`.
@@ -20,9 +20,12 @@ rules:
## Ascend CANN 8.2.rc2, using CANN Kernel for A3.
##
- backend: "cann"
platforms:
- "linux/arm64"
services:
- "mindie"
- "vllm"
- "sglang"
args:
- "CANN_VERSION=8.2.rc2"
- "CANN_ARCHS=a3"
@@ -38,9 +41,12 @@
## Ascend CANN 8.2.rc2, using CANN Kernel for 310P.
##
- backend: "cann"
platforms:
- "linux/arm64"
services:
- "mindie"
- "vllm"
- "sglang"
args:
- "CANN_VERSION=8.2.rc2"
- "CANN_ARCHS=310p"
@@ -66,6 +72,10 @@
## NVIDIA CUDA 12.4.1, using PyTorch +cu126 in linux/amd64.
##
- backend: "cuda"
services:
- "voxbox"
- "vllm"
- "sglang"
args:
- "CUDA_VERSION=12.4.1"
- "VOXBOX_TORCH_CUDA_VERSION=12.6.3"
@@ -76,6 +86,10 @@
## NVIDIA CUDA 12.6.3, using PyTorch +cu126 in linux/amd64.
##
- backend: "cuda"
services:
- "voxbox"
- "vllm"
- "sglang"
args:
- "CUDA_VERSION=12.6.3"
- "VLLM_NVIDIA_GDRCOPY_VERSION=2.4.1"
@@ -84,6 +98,10 @@
## NVIDIA CUDA 12.8.1, using PyTorch +cu128 in both linux/amd64 and linux/arm64.
##
- backend: "cuda"
services:
- "voxbox"
- "vllm"
- "sglang"
args:
- "CUDA_VERSION=12.8.1"
- "VLLM_NVIDIA_GDRCOPY_VERSION=2.4.1"
@@ -125,9 +143,11 @@ rules:
## AMD ROCm 7.0.2, using PyTorch +rocm7.0 in linux/amd64.
##
- backend: "rocm"
services:
- "vllm"
platforms:
- "linux/amd64"
services:
- "voxbox"
- "vllm"
- "sglang"
args:
- "ROCM_VERSION=7.0.2"
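To see which jobs a given rule expands into, the matrix can be inspected locally; a sketch assuming `pack/expand_matrix.sh` prints the build-job JSON assembled above (its exact CLI and output shape are not shown in this diff):

# Hypothetical local check; adjust to the script's real interface.
bash pack/expand_matrix.sh | jq -r '.. | objects | select(.service? == "sglang") | .tag'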