Commit 0314244

Document Sync by Tina

committed
1 parent 001cc0d commit 0314244

1 file changed: +5 -85 lines changed


docs/stable/store/quickstart.md

+5 -85
@@ -133,93 +133,13 @@ Our api aims to be compatible with the `sharded_state` load format in vLLM. Thus
 Thus, for first-time users, you have to load the model from other backends and then convert it to the ServerlessLLM format.
 
 1. Download the model from HuggingFace and save it in the ServerlessLLM format:
-``` python
-import os
-import shutil
-from typing import Optional
-
-class VllmModelDownloader:
-    def __init__(self):
-        pass
-
-    def download_vllm_model(
-        self,
-        model_name: str,
-        torch_dtype: str,
-        tensor_parallel_size: int = 1,
-        pattern: Optional[str] = None,
-        max_size: Optional[int] = None,
-    ):
-        import gc
-        import shutil
-        from tempfile import TemporaryDirectory
-
-        import torch
-        from huggingface_hub import snapshot_download
-        from vllm import LLM
-        from vllm.config import LoadFormat
-
-        # set the model storage path
-        storage_path = os.getenv("STORAGE_PATH", "./models")
-
-        def _run_writer(input_dir, model_name):
-            # load models from the input directory
-            llm_writer = LLM(
-                model=input_dir,
-                download_dir=input_dir,
-                dtype=torch_dtype,
-                tensor_parallel_size=tensor_parallel_size,
-                num_gpu_blocks_override=1,
-                enforce_eager=True,
-                max_model_len=1,
-            )
-            model_path = os.path.join(storage_path, model_name)
-            model_executer = llm_writer.llm_engine.model_executor
-            # save the models in the ServerlessLLM format
-            model_executer.save_serverless_llm_state(
-                path=model_path, pattern=pattern, max_size=max_size
-            )
-            for file in os.listdir(input_dir):
-                # Copy the metadata files into the output directory
-                if os.path.splitext(file)[1] not in (
-                    ".bin",
-                    ".pt",
-                    ".safetensors",
-                ):
-                    src_path = os.path.join(input_dir, file)
-                    dest_path = os.path.join(model_path, file)
-                    if os.path.isdir(src_path):
-                        shutil.copytree(src_path, dest_path)
-                    else:
-                        shutil.copy(src_path, dest_path)
-            del model_executer
-            del llm_writer
-            gc.collect()
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-                torch.cuda.synchronize()
-
-        try:
-            with TemporaryDirectory() as cache_dir:
-                # download from huggingface
-                input_dir = snapshot_download(
-                    model_name,
-                    cache_dir=cache_dir,
-                    allow_patterns=["*.safetensors", "*.bin", "*.json", "*.txt"],
-                )
-                _run_writer(input_dir, model_name)
-        except Exception as e:
-            print(f"An error occurred while saving the model: {e}")
-            # remove the output dir
-            shutil.rmtree(os.path.join(storage_path, model_name))
-            raise RuntimeError(
-                f"Failed to save {model_name} for vllm backend: {e}"
-            )
-
-downloader = VllmModelDownloader()
-downloader.download_vllm_model("facebook/opt-1.3b", "float16", 1)
+``` bash
+python3 examples/sllm_store/save_vllm_model.py --model_name facebook/opt-1.3b --storage_path $PWD/models --tensor_parallel_size 1
+
 ```
 
+You can also load the model from a local path instead of downloading it from the network by passing the `--local_model_path` argument.
+
 After downloading the model, you can launch the checkpoint store server and load the model in vLLM through `sllm` load format.
 
 2. Launch the checkpoint store server in a separate process:
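Not part of the commit itself, but since the added doc line above names a `--local_model_path` flag without showing it in use, here is a minimal hedged sketch of that variant; the local path below is illustrative, and the remaining flags simply mirror the command in the diff:

```bash
# Hedged example (not from this commit): save a model that already exists on disk
# instead of downloading it from HuggingFace. /path/to/facebook/opt-1.3b is a
# placeholder; the other flags match the command shown in the diff above.
python3 examples/sllm_store/save_vllm_model.py \
  --model_name facebook/opt-1.3b \
  --local_model_path /path/to/facebook/opt-1.3b \
  --storage_path $PWD/models \
  --tensor_parallel_size 1
```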

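The doc excerpt ends by pointing to the next step: loading the saved checkpoint in vLLM through the `sllm` load format. A rough sketch of what that step typically looks like follows, with hedged assumptions: the `load_format` string and the `storage_path/model_name` layout are inferred from the doc wording and the removed downloader code, not from this commit, so verify both against the full quickstart for your installed version.

```python
# Hedged sketch (not from this commit): load the converted checkpoint with the
# ServerlessLLM-patched vLLM. ASSUMPTIONS: the load_format value and the
# ./models/facebook/opt-1.3b path layout are inferred, not confirmed by this diff.
from vllm import LLM

llm = LLM(
    model="./models/facebook/opt-1.3b",  # storage_path/model_name from the save step
    load_format="serverless_llm",        # the docs call this the `sllm` load format
    dtype="float16",
)
```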