From 68bbf779557f83d8e408380e5a45c9dcd8447853 Mon Sep 17 00:00:00 2001
From: kerthcet
Date: Fri, 6 Sep 2024 00:11:11 +0800
Subject: [PATCH] Prepare for v0.0.5

Signed-off-by: kerthcet
---
 docs/examples/README.md                           |  2 --
 .../speculative-decoding/vllm/playground.yaml     | 10 +++++--
 llmaz/model_loader/model_hub/huggingface.py       | 16 +++++++---
 llmaz/model_loader/model_hub/modelscope.py        | 11 +++++--
 llmaz/model_loader/model_hub/util.py              | 30 +++++++++++++++++++
 pkg/defaults.go                                   |  2 +-
 6 files changed, 59 insertions(+), 12 deletions(-)
 create mode 100644 llmaz/model_loader/model_hub/util.py

diff --git a/docs/examples/README.md b/docs/examples/README.md
index 726e392..b690c22 100644
--- a/docs/examples/README.md
+++ b/docs/examples/README.md
@@ -43,6 +43,4 @@ By default, we use [vLLM](https://github.com/vllm-project/vllm) as the inference
 
 ### Speculative Decoding with vLLM
 
-> Not supported yet because llama.cpp doesn't support speculative decoding in the server side, see https://github.com/ggerganov/llama.cpp/issues/5877.
-
 [Speculative Decoding](https://arxiv.org/abs/2211.17192) can improve inference performance efficiently, see [example](./speculative-decoding/vllm/) here.
diff --git a/docs/examples/speculative-decoding/vllm/playground.yaml b/docs/examples/speculative-decoding/vllm/playground.yaml
index 152f08d..0be6c61 100644
--- a/docs/examples/speculative-decoding/vllm/playground.yaml
+++ b/docs/examples/speculative-decoding/vllm/playground.yaml
@@ -13,5 +13,11 @@ spec:
   backendConfig:
     args:
       - --use-v2-block-manager
-      - --num_speculative_tokens 5
-      - -tp 1
+      - --num_speculative_tokens
+      - "5"
+      - -tp
+      - "1"
+    resources:
+      limits:
+        cpu: 8
+        memory: "16Gi"
diff --git a/llmaz/model_loader/model_hub/huggingface.py b/llmaz/model_loader/model_hub/huggingface.py
index 678957a..6b32936 100644
--- a/llmaz/model_loader/model_hub/huggingface.py
+++ b/llmaz/model_loader/model_hub/huggingface.py
@@ -26,6 +26,7 @@
     ModelHub,
 )
 from llmaz.util.logger import Logger
+from llmaz.model_loader.model_hub.util import get_folder_total_size
 
 from typing import Optional
 
@@ -50,14 +51,18 @@ def load_model(
                 local_dir=MODEL_LOCAL_DIR,
                 revision=revision,
             )
+            file_size = os.path.getsize(os.path.join(MODEL_LOCAL_DIR, filename)) / (1024**3)
+            Logger.info(
+                f"The size of {os.path.join(MODEL_LOCAL_DIR, filename)} is {file_size:.2f} GB"
+            )
             return
 
+        local_dir = os.path.join(
+            MODEL_LOCAL_DIR, f"models--{model_id.replace('/','--')}"
+        )
+        # TODO: Should we verify the download is finished?
         with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
-            local_dir = os.path.join(
-                MODEL_LOCAL_DIR, f"models--{model_id.replace('/','--')}"
-            )
-
             futures = []
 
             for file in list_repo_files(repo_id=model_id):
                 # TODO: support version management, right now we didn't distinguish with them.
@@ -71,6 +76,9 @@ def load_model(
                 ).add_done_callback(handle_completion)
             )
 
+        total_size = get_folder_total_size(local_dir)
+        Logger.info(f"The total size of {local_dir} is {total_size:.2f} GB")
+
 
 def handle_completion(future):
     filename = future.result()
diff --git a/llmaz/model_loader/model_hub/modelscope.py b/llmaz/model_loader/model_hub/modelscope.py
index d7eaeac..66c5553 100644
--- a/llmaz/model_loader/model_hub/modelscope.py
+++ b/llmaz/model_loader/model_hub/modelscope.py
@@ -27,6 +27,7 @@
     ModelHub,
 )
 from llmaz.util.logger import Logger
+from llmaz.model_loader.model_hub.util import get_folder_total_size
 
 
 class ModelScope(ModelHub):
@@ -43,11 +44,12 @@ def load_model(
             f"Start to download, model_id: {model_id}, filename: {filename}, revision: {revision}"
         )
 
+        local_dir = os.path.join(
+            MODEL_LOCAL_DIR, f"models--{model_id.replace('/','--')}"
+        )
+
         with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
             futures = []
-            local_dir = os.path.join(
-                MODEL_LOCAL_DIR, f"models--{model_id.replace('/','--')}"
-            )
             futures.append(
                 executor.submit(
                     snapshot_download,
@@ -57,6 +59,9 @@
                 ).add_done_callback(handle_completion)
             )
 
+        total_size = get_folder_total_size(local_dir)
+        Logger.info(f"The total size of {local_dir} is {total_size:.2f} GB")
+
 
 def handle_completion(future):
     filename = future.result()
diff --git a/llmaz/model_loader/model_hub/util.py b/llmaz/model_loader/model_hub/util.py
new file mode 100644
index 0000000..315d50a
--- /dev/null
+++ b/llmaz/model_loader/model_hub/util.py
@@ -0,0 +1,30 @@
+"""
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""

+import os
+
+
+def get_folder_total_size(folder_path: str) -> float:
+    total_size = 0
+
+    for dirpath, _, filenames in os.walk(folder_path):
+        for filename in filenames:
+            file_path = os.path.join(dirpath, filename)
+            if os.path.exists(file_path):
+                total_size += os.path.getsize(file_path)
+
+    total_size_gb = total_size / (1024**3)
+    return total_size_gb
diff --git a/pkg/defaults.go b/pkg/defaults.go
index f854aef..f350c17 100644
--- a/pkg/defaults.go
+++ b/pkg/defaults.go
@@ -17,5 +17,5 @@ limitations under the License.
 package pkg
 
 const (
-	LOADER_IMAGE = "inftyai/model-loader:v0.0.7"
+	LOADER_IMAGE = "inftyai/model-loader:v0.0.8"
 )
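
The new helper can be exercised on its own: get_folder_total_size walks a
folder with os.walk, sums the sizes of the regular files it finds, and returns
the total in GiB. Since ThreadPoolExecutor waits for all submitted futures
when its "with" block exits, logging the size after that block reflects the
fully downloaded snapshot. A minimal sketch of the helper in isolation (the
temporary folder and the "weights.bin" file name below are illustrative only,
not part of this patch):

    import os
    import tempfile

    from llmaz.model_loader.model_hub.util import get_folder_total_size

    # Build a throwaway folder containing a single 1 MiB file.
    with tempfile.TemporaryDirectory() as folder:
        with open(os.path.join(folder, "weights.bin"), "wb") as f:
            f.write(b"\0" * (1024**2))

        size_gb = get_folder_total_size(folder)
        print(f"{size_gb:.6f} GB")  # ~0.000977, since 1 MiB = 1/1024 GiB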