From be18fb7504f4cf75513bdc22abd48eac647d0bf8 Mon Sep 17 00:00:00 2001
From: Hossein Sarshar
Date: Fri, 17 Oct 2025 06:00:44 +0000
Subject: [PATCH 1/3] fixed the docker image for the guide

Signed-off-by: Hossein Sarshar
---
 docs/getting_started/quickstart.md | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md
index 227313f4c..430e59bb3 100644
--- a/docs/getting_started/quickstart.md
+++ b/docs/getting_started/quickstart.md
@@ -82,20 +82,32 @@ To install vLLM TPU, you can either install using `pip` (see section [Install us
 ### Run vllm-tpu as a Docker image
 
-1. Include the `--privileged`, `--net host`, and `--shm-size=16G` options to enable TPU interaction and shared memory.
+1. Include the `--privileged`, `--net host`, and `--shm-size=150gb` options to enable TPU interaction and shared memory.
 
     ```shell
-    docker run --privileged --net host --shm-size=16G -it vllm/vllm-tpu
+    export DOCKER_URI=vllm/vllm-tpu:latest
+
+    sudo docker run -it --rm --name $USER-vllm --privileged --net=host \
+    -v /dev/shm:/dev/shm \
+    --shm-size 150gb \
+    -p 8000:8000 \
+    --entrypoint /bin/bash ${DOCKER_URI}
     ```
 
 1. Start the vLLM OpenAI API server (inside the container):
 
     ```shell
-    python -m vllm.entrypoints.openai.api_server \
-    --model meta-llama/Llama-3.1-8B \
-    --tensor-parallel-size 1 \
-    --host 0.0.0.0 \
-    --port 8000
+    export MAX_MODEL_LEN=4096
+    export TP=1 # number of chips
+
+    vllm serve meta-llama/Meta-Llama-3.1-8B \
+    --seed 42 \
+    --disable-log-requests \
+    --gpu-memory-utilization 0.98 \
+    --max-num-batched-tokens 2048 \
+    --max-num-seqs 256 \
+    --tensor-parallel-size $TP \
+    --max-model-len $MAX_MODEL_LEN
     ```
 
 Note: Adjust `--model` if you’re using a different model and `--tensor-parallel-size` if you want to use a different number of tensor parallel replicas.

From edb383dd811baf8be85928bb5ecf28ac22a40266 Mon Sep 17 00:00:00 2001
From: Hossein Sarshar
Date: Fri, 17 Oct 2025 14:53:07 +0000
Subject: [PATCH 2/3] removed the white spaces

Signed-off-by: Hossein Sarshar
---
 docs/getting_started/quickstart.md | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md
index 430e59bb3..0f91fba5c 100644
--- a/docs/getting_started/quickstart.md
+++ b/docs/getting_started/quickstart.md
@@ -82,11 +82,10 @@ To install vLLM TPU, you can either install using `pip` (see section [Install us
 ### Run vllm-tpu as a Docker image
 
-1. Include the `--privileged`, `--net host`, and `--shm-size=150gb` options to enable TPU interaction and shared memory. 
+1. Include the `--privileged`, `--net=host`, and `--shm-size=150gb` options to enable TPU interaction and shared memory.
 
     ```shell
     export DOCKER_URI=vllm/vllm-tpu:latest
-
     sudo docker run -it --rm --name $USER-vllm --privileged --net=host \
     -v /dev/shm:/dev/shm \
     --shm-size 150gb \
     -p 8000:8000 \
     --entrypoint /bin/bash ${DOCKER_URI}
     ```
@@ -99,7 +98,6 @@ To install vLLM TPU, you can either install using `pip` (see section [Install us
     ```shell
     export MAX_MODEL_LEN=4096
     export TP=1 # number of chips
-
     vllm serve meta-llama/Meta-Llama-3.1-8B \
     --seed 42 \
     --disable-log-requests \
     --gpu-memory-utilization 0.98 \
     --max-num-batched-tokens 2048 \
     --max-num-seqs 256 \
     --tensor-parallel-size $TP \
     --max-model-len $MAX_MODEL_LEN
     ```
 
 Note: Adjust `--model` if you’re using a different model and `--tensor-parallel-size` if you want to use a different number of tensor parallel replicas.

From b28eb71439ab367b9227d902ca2512e35dd22846 Mon Sep 17 00:00:00 2001
From: Hossein Sarshar
Date: Fri, 17 Oct 2025 19:26:24 +0000
Subject: [PATCH 3/3] added the token and the HF home path variables

Signed-off-by: Hossein Sarshar
---
 docs/getting_started/quickstart.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md
index 0f91fba5c..e3a926d9c 100644
--- a/docs/getting_started/quickstart.md
+++ b/docs/getting_started/quickstart.md
@@ -96,6 +96,8 @@ To install vLLM TPU, you can either install using `pip` (see section [Install us
 1. Start the vLLM OpenAI API server (inside the container):
 
     ```shell
+    export HF_HOME=/dev/shm/vllm
+    export HF_TOKEN=
     export MAX_MODEL_LEN=4096
     export TP=1 # number of chips
     vllm serve meta-llama/Meta-Llama-3.1-8B \
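
To sanity-check the series end to end, one option is to query the OpenAI-compatible endpoint that `vllm serve` exposes. This is a minimal sketch, not part of the patches above; it assumes the container publishes port 8000 on the host (as in the `docker run` command) and that the server was started with `meta-llama/Meta-Llama-3.1-8B`.

```shell
# Query the OpenAI-compatible completions endpoint exposed by `vllm serve`.
# The "model" field must match the model the server was launched with.
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta-llama/Meta-Llama-3.1-8B",
    "prompt": "San Francisco is a",
    "max_tokens": 32,
    "temperature": 0
  }'
```

A JSON response with a `choices` array confirms the server loaded the model and is accepting requests.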