@@ -133,93 +133,13 @@ Our API aims to be compatible with the `sharded_state` load format in vLLM.
Thus, for first-time users, you have to load the model from another backend and then convert it to the ServerlessLLM format.

1. Download the model from HuggingFace and save it in the ServerlessLLM format:
- ```python
- import os
- import shutil
- from typing import Optional
-
- class VllmModelDownloader:
-     def __init__(self):
-         pass
-
-     def download_vllm_model(
-         self,
-         model_name: str,
-         torch_dtype: str,
-         tensor_parallel_size: int = 1,
-         pattern: Optional[str] = None,
-         max_size: Optional[int] = None,
-     ):
-         import gc
-         import shutil
-         from tempfile import TemporaryDirectory
-
-         import torch
-         from huggingface_hub import snapshot_download
-         from vllm import LLM
-         from vllm.config import LoadFormat
-
-         # set the model storage path
-         storage_path = os.getenv("STORAGE_PATH", "./models")
-
-         def _run_writer(input_dir, model_name):
-             # load models from the input directory
-             llm_writer = LLM(
-                 model=input_dir,
-                 download_dir=input_dir,
-                 dtype=torch_dtype,
-                 tensor_parallel_size=tensor_parallel_size,
-                 num_gpu_blocks_override=1,
-                 enforce_eager=True,
-                 max_model_len=1,
-             )
-             model_path = os.path.join(storage_path, model_name)
-             model_executer = llm_writer.llm_engine.model_executor
-             # save the models in the ServerlessLLM format
-             model_executer.save_serverless_llm_state(
-                 path=model_path, pattern=pattern, max_size=max_size
-             )
-             for file in os.listdir(input_dir):
-                 # Copy the metadata files into the output directory
-                 if os.path.splitext(file)[1] not in (
-                     ".bin",
-                     ".pt",
-                     ".safetensors",
-                 ):
-                     src_path = os.path.join(input_dir, file)
-                     dest_path = os.path.join(model_path, file)
-                     if os.path.isdir(src_path):
-                         shutil.copytree(src_path, dest_path)
-                     else:
-                         shutil.copy(src_path, dest_path)
-             del model_executer
-             del llm_writer
-             gc.collect()
-             if torch.cuda.is_available():
-                 torch.cuda.empty_cache()
-                 torch.cuda.synchronize()
-
-         try:
-             with TemporaryDirectory() as cache_dir:
-                 # download from huggingface
-                 input_dir = snapshot_download(
-                     model_name,
-                     cache_dir=cache_dir,
-                     allow_patterns=["*.safetensors", "*.bin", "*.json", "*.txt"],
-                 )
-                 _run_writer(input_dir, model_name)
-         except Exception as e:
-             print(f"An error occurred while saving the model: {e}")
-             # remove the output dir
-             shutil.rmtree(os.path.join(storage_path, model_name))
-             raise RuntimeError(
-                 f"Failed to save {model_name} for vllm backend: {e}"
-             )
-
- downloader = VllmModelDownloader()
- downloader.download_vllm_model("facebook/opt-1.3b", "float16", 1)
+ ```bash
+ python3 examples/sllm_store/save_vllm_model.py --model_name facebook/opt-1.3b --storage_path $PWD/models --tensor_parallel_size 1
+
```

+ You can also save the model from a local path instead of downloading it from the network by passing the `--local_model_path` argument.
+
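+ For example, assuming the model weights already exist in a local directory such as `./local_models/facebook/opt-1.3b` (an illustrative path), the command might look like:
+
+ ```bash
+ python3 examples/sllm_store/save_vllm_model.py --model_name facebook/opt-1.3b --local_model_path ./local_models/facebook/opt-1.3b --storage_path $PWD/models --tensor_parallel_size 1
+ ```
+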
After downloading the model, you can launch the checkpoint store server and load the model in vLLM through the `sllm` load format.

2. Launch the checkpoint store server in a separate process: