Skip to content

Commit 898cdf0

Browse files
authored
[CI] Fix neuron CI and run offline tests (vllm-project#11779)
Signed-off-by: Liangfu Chen <[email protected]>
1 parent 0f3f3c8 commit 898cdf0

File tree

3 files changed

+35
-37
lines changed

3 files changed

+35
-37
lines changed

.buildkite/run-neuron-test.sh

Lines changed: 27 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,18 @@
33
# This script builds the Neuron docker image and runs the API server inside the container.
44
# It serves as a sanity check for compilation and basic model usage.
55
set -e
6+
set -v
7+
8+
image_name="neuron/vllm-ci"
9+
container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
10+
11+
HF_CACHE="$(realpath ~)/huggingface"
12+
mkdir -p "${HF_CACHE}"
13+
HF_MOUNT="/root/.cache/huggingface"
14+
15+
NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
16+
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
17+
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
618

719
# Try building the docker image
820
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
@@ -13,41 +25,30 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
1325
last_build=$(cat /tmp/neuron-docker-build-timestamp)
1426
current_time=$(date +%s)
1527
if [ $((current_time - last_build)) -gt 86400 ]; then
28+
docker image prune -f
1629
docker system prune -f
30+
# Empty the host-side Hugging Face cache (the directory bind-mounted into the container). The glob must sit outside the quotes, otherwise it is taken literally and nothing is removed.
rm -rf -- "${HF_CACHE:?}"/*
31+
# Empty the host-side Neuron compile cache (the directory bind-mounted into the container). The glob must sit outside the quotes, otherwise it is taken literally and nothing is removed.
rm -rf -- "${NEURON_COMPILE_CACHE_URL:?}"/*
1732
echo "$current_time" > /tmp/neuron-docker-build-timestamp
1833
fi
1934
else
2035
date "+%s" > /tmp/neuron-docker-build-timestamp
2136
fi
2237

23-
docker build -t neuron -f Dockerfile.neuron .
38+
docker build -t "${image_name}" -f Dockerfile.neuron .
2439

2540
# Setup cleanup
26-
remove_docker_container() { docker rm -f neuron || true; }
41+
remove_docker_container() {
42+
docker image rm -f "${image_name}" || true;
43+
}
2744
trap remove_docker_container EXIT
28-
remove_docker_container
2945

3046
# Run the image
31-
docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
32-
--model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
33-
34-
# Wait for the server to start
35-
wait_for_server_to_start() {
36-
timeout=300
37-
counter=0
38-
39-
while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
40-
sleep 1
41-
counter=$((counter + 1))
42-
if [ $counter -ge $timeout ]; then
43-
echo "Timeout after $timeout seconds"
44-
break
45-
fi
46-
done
47-
}
48-
wait_for_server_to_start
49-
50-
# Test a simple prompt
51-
curl -X POST -H "Content-Type: application/json" \
52-
localhost:8000/generate \
53-
-d '{"prompt": "San Francisco is a"}'
47+
docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
48+
-v "${HF_CACHE}:${HF_MOUNT}" \
49+
-e "HF_HOME=${HF_MOUNT}" \
50+
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
51+
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
52+
--name "${container_name}" \
53+
${image_name} \
54+
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference_neuron.py"

Dockerfile.neuron

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ RUN apt-get update && \
1515
ffmpeg libsm6 libxext6 libgl1
1616

1717
### Mount Point ###
18-
# When launching the container, mount the code directory to /app
19-
ARG APP_MOUNT=/app
18+
# When launching the container, mount the code directory to /workspace
19+
ARG APP_MOUNT=/workspace
2020
VOLUME [ ${APP_MOUNT} ]
2121
WORKDIR ${APP_MOUNT}/vllm
2222

@@ -25,6 +25,7 @@ RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
2525
RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
2626
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
2727
RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
28+
RUN python3 -m pip install pytest
2829

2930
COPY . .
3031
ARG GIT_REPO_CHECK=0
@@ -42,4 +43,7 @@ RUN --mount=type=bind,source=.git,target=.git \
4243
# install development dependencies (for testing)
4344
RUN python3 -m pip install -e tests/vllm_test_utils
4445

46+
# overwrite entrypoint to run bash script
47+
RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py
48+
4549
CMD ["/bin/bash"]

examples/offline_inference_neuron.py

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,5 @@
1-
import os
2-
31
from vllm import LLM, SamplingParams
42

5-
# creates XLA hlo graphs for all the context length buckets.
6-
os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048"
7-
# creates XLA hlo graphs for all the token gen buckets.
8-
os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048"
9-
103
# Sample prompts.
114
prompts = [
125
"Hello, my name is",
@@ -26,8 +19,8 @@
2619
# Currently, this is a known limitation in continuous batching support
2720
# in transformers-neuronx.
2821
# TODO(liangfu): Support paged-attention in transformers-neuronx.
29-
max_model_len=2048,
30-
block_size=2048,
22+
max_model_len=1024,
23+
block_size=1024,
3124
# The device can be automatically detected when AWS Neuron SDK is installed.
3225
# The device argument can be either unspecified for automated detection,
3326
# or explicitly assigned.

0 commit comments

Comments
 (0)