@@ -3,6 +3,18 @@
# This script builds the Neuron docker image and runs the API server inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -e
+ set -v
+
+ image_name="neuron/vllm-ci"
+ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+
+ HF_CACHE="$(realpath ~)/huggingface"
+ mkdir -p "${HF_CACHE}"
+ HF_MOUNT="/root/.cache/huggingface"
+
+ NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
+ mkdir -p "${NEURON_COMPILE_CACHE_URL}"
+ NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"

# Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
@@ -13,41 +25,30 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
    last_build=$(cat /tmp/neuron-docker-build-timestamp)
    current_time=$(date +%s)
    if [ $((current_time - last_build)) -gt 86400 ]; then
+         docker image prune -f
        docker system prune -f
+         rm -rf "${HF_MOUNT:?}/*"
+         rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*"
        echo "$current_time" > /tmp/neuron-docker-build-timestamp
    fi
else
    date "+%s" > /tmp/neuron-docker-build-timestamp
fi

- docker build -t neuron -f Dockerfile.neuron .
+ docker build -t "${image_name}" -f Dockerfile.neuron .

# Setup cleanup
- remove_docker_container() { docker rm -f neuron || true; }
+ remove_docker_container() {
+     docker image rm -f "${image_name}" || true;
+ }
trap remove_docker_container EXIT
- remove_docker_container

# Run the image
- docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
-     --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
-
- # Wait for the server to start
- wait_for_server_to_start() {
-     timeout=300
-     counter=0
-
-     while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
-         sleep 1
-         counter=$((counter + 1))
-         if [ $counter -ge $timeout ]; then
-             echo "Timeout after $timeout seconds"
-             break
-         fi
-     done
- }
- wait_for_server_to_start
-
- # Test a simple prompt
- curl -X POST -H "Content-Type: application/json" \
-     localhost:8000/generate \
-     -d '{"prompt": "San Francisco is a"}'
+ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
+     -v "${HF_CACHE}:${HF_MOUNT}" \
+     -e "HF_HOME=${HF_MOUNT}" \
+     -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
+     -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
+     --name "${container_name}" \
+     ${image_name} \
+     /bin/bash -c "python3 /workspace/vllm/examples/offline_inference_neuron.py"