From 68bbf779557f83d8e408380e5a45c9dcd8447853 Mon Sep 17 00:00:00 2001
From: kerthcet
Date: Fri, 6 Sep 2024 00:11:11 +0800
Subject: [PATCH] Prepare for v0.0.5

Signed-off-by: kerthcet
---
 docs/examples/README.md                           |  2 --
 .../speculative-decoding/vllm/playground.yaml     | 10 +++++--
 llmaz/model_loader/model_hub/huggingface.py       | 16 +++++++---
 llmaz/model_loader/model_hub/modelscope.py        | 11 +++++--
 llmaz/model_loader/model_hub/util.py              | 30 +++++++++++++++++++
 pkg/defaults.go                                   |  2 +-
 6 files changed, 59 insertions(+), 12 deletions(-)
 create mode 100644 llmaz/model_loader/model_hub/util.py

diff --git a/docs/examples/README.md b/docs/examples/README.md
index 726e392..b690c22 100644
--- a/docs/examples/README.md
+++ b/docs/examples/README.md
@@ -43,6 +43,4 @@ By default, we use [vLLM](https://github.com/vllm-project/vllm) as the inference
 
 ### Speculative Decoding with vLLM
 
-> Not supported yet because llama.cpp doesn't support speculative decoding in the server side, see https://github.com/ggerganov/llama.cpp/issues/5877.
-
 [Speculative Decoding](https://arxiv.org/abs/2211.17192) can improve inference performance efficiently, see [example](./speculative-decoding/vllm/) here.
diff --git a/docs/examples/speculative-decoding/vllm/playground.yaml b/docs/examples/speculative-decoding/vllm/playground.yaml
index 152f08d..0be6c61 100644
--- a/docs/examples/speculative-decoding/vllm/playground.yaml
+++ b/docs/examples/speculative-decoding/vllm/playground.yaml
@@ -13,5 +13,11 @@ spec:
   backendConfig:
     args:
       - --use-v2-block-manager
-      - --num_speculative_tokens 5
-      - -tp 1
+      - --num_speculative_tokens
+      - "5"
+      - -tp
+      - "1"
+    resources:
+      limits:
+        cpu: 8
+        memory: "16Gi"
diff --git a/llmaz/model_loader/model_hub/huggingface.py b/llmaz/model_loader/model_hub/huggingface.py
index 678957a..6b32936 100644
--- a/llmaz/model_loader/model_hub/huggingface.py
+++ b/llmaz/model_loader/model_hub/huggingface.py
@@ -26,6 +26,7 @@
     ModelHub,
 )
 from llmaz.util.logger import Logger
+from llmaz.model_loader.model_hub.util import get_folder_total_size
 
 from typing import Optional
 
@@ -50,14 +51,18 @@ def load_model(
                 local_dir=MODEL_LOCAL_DIR,
                 revision=revision,
             )
+            file_size = os.path.getsize(os.path.join(MODEL_LOCAL_DIR, filename)) / (1024**3)
+            Logger.info(
+                f"The size of {os.path.join(MODEL_LOCAL_DIR, filename)} is {file_size:.2f} GB"
+            )
             return
 
+        local_dir = os.path.join(
+            MODEL_LOCAL_DIR, f"models--{model_id.replace('/','--')}"
+        )
+        # TODO: Should we verify the download is finished?
         with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
-            local_dir = os.path.join(
-                MODEL_LOCAL_DIR, f"models--{model_id.replace('/','--')}"
-            )
-
             futures = []
 
             for file in list_repo_files(repo_id=model_id):
                 # TODO: support version management, right now we didn't distinguish with them.
@@ -71,6 +76,9 @@ def load_model(
                 ).add_done_callback(handle_completion)
             )
 
+        total_size = get_folder_total_size(local_dir)
+        Logger.info(f"The total size of {local_dir} is {total_size:.2f} GB")
+
 
 def handle_completion(future):
     filename = future.result()
diff --git a/llmaz/model_loader/model_hub/modelscope.py b/llmaz/model_loader/model_hub/modelscope.py
index d7eaeac..66c5553 100644
--- a/llmaz/model_loader/model_hub/modelscope.py
+++ b/llmaz/model_loader/model_hub/modelscope.py
@@ -27,6 +27,7 @@
     ModelHub,
 )
 from llmaz.util.logger import Logger
+from llmaz.model_loader.model_hub.util import get_folder_total_size
 
 
 class ModelScope(ModelHub):
@@ -43,11 +44,12 @@ def load_model(
             f"Start to download, model_id: {model_id}, filename: {filename}, revision: {revision}"
         )
 
+        local_dir = os.path.join(
+            MODEL_LOCAL_DIR, f"models--{model_id.replace('/','--')}"
+        )
+
         with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
             futures = []
-            local_dir = os.path.join(
-                MODEL_LOCAL_DIR, f"models--{model_id.replace('/','--')}"
-            )
             futures.append(
                 executor.submit(
                     snapshot_download,
@@ -57,6 +59,9 @@
                 ).add_done_callback(handle_completion)
             )
 
+        total_size = get_folder_total_size(local_dir)
+        Logger.info(f"The total size of {local_dir} is {total_size:.2f} GB")
+
 
 def handle_completion(future):
     filename = future.result()
diff --git a/llmaz/model_loader/model_hub/util.py b/llmaz/model_loader/model_hub/util.py
new file mode 100644
index 0000000..315d50a
--- /dev/null
+++ b/llmaz/model_loader/model_hub/util.py
@@ -0,0 +1,30 @@
+"""
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""

+import os
+
+
+def get_folder_total_size(folder_path: str) -> float:
+    total_size = 0
+
+    for dirpath, _, filenames in os.walk(folder_path):
+        for filename in filenames:
+            file_path = os.path.join(dirpath, filename)
+            if os.path.exists(file_path):
+                total_size += os.path.getsize(file_path)
+
+    total_size_gb = total_size / (1024**3)
+    return total_size_gb
diff --git a/pkg/defaults.go b/pkg/defaults.go
index f854aef..f350c17 100644
--- a/pkg/defaults.go
+++ b/pkg/defaults.go
@@ -17,5 +17,5 @@ limitations under the License.
 package pkg
 
 const (
-	LOADER_IMAGE = "inftyai/model-loader:v0.0.7"
+	LOADER_IMAGE = "inftyai/model-loader:v0.0.8"
 )
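
The new helper can be exercised on its own: get_folder_total_size walks a
folder with os.walk, sums the sizes of the regular files it finds, and returns
the total in GiB. Since ThreadPoolExecutor waits for all submitted futures
when its "with" block exits, logging the size after that block reflects the
fully downloaded snapshot. A minimal sketch of the helper in isolation (the
temporary folder and the "weights.bin" file name below are illustrative only,
not part of this patch):

    import os
    import tempfile

    from llmaz.model_loader.model_hub.util import get_folder_total_size

    # Build a throwaway folder containing a single 1 MiB file.
    with tempfile.TemporaryDirectory() as folder:
        with open(os.path.join(folder, "weights.bin"), "wb") as f:
            f.write(b"\0" * (1024**2))

        size_gb = get_folder_total_size(folder)
        print(f"{size_gb:.6f} GB")  # ~0.000977, since 1 MiB = 1/1024 GiB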