From 1e91bb482db3e33b0bdcc65e234a7a8a68d8cf86 Mon Sep 17 00:00:00 2001 From: Keiven Chang Date: Fri, 24 Oct 2025 23:31:23 +0000 Subject: [PATCH 1/3] feat: add HuggingFace cache checking to sanity_check.py Signed-off-by: Keiven Chang --- deploy/dynamo_check.py | 1 - deploy/sanity_check.py | 202 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 197 insertions(+), 6 deletions(-) delete mode 120000 deploy/dynamo_check.py diff --git a/deploy/dynamo_check.py b/deploy/dynamo_check.py deleted file mode 120000 index bf0591926a..0000000000 --- a/deploy/dynamo_check.py +++ /dev/null @@ -1 +0,0 @@ -sanity_check.py \ No newline at end of file diff --git a/deploy/sanity_check.py b/deploy/sanity_check.py index 1d2c800d7d..67a7b04334 100755 --- a/deploy/sanity_check.py +++ b/deploy/sanity_check.py @@ -13,8 +13,17 @@ - LLM frameworks (vllm, sglang, tensorrt_llm) - Dynamo runtime and framework components - File system (permissions and disk space, detailed with --thorough-check) +- HuggingFace model cache (detailed with --thorough-check) - Installation status and component availability +IMPORTANT: This script is STANDALONE and uses only Python stdlib (no Dynamo components). + +Why: Must work before Dynamo is built/installed (CI, fresh containers, build failures). +This tool is for pre-deployment validation; dynamo.common.config_dump is for runtime. + +Hard-coded paths: Uses defaults (e.g., ~/.cache/huggingface/hub) for predictable +behavior even when environment variables are misconfigured. See class docs for details. + The output uses status indicators: - ✅ Component found and working - ❌ Component missing or error @@ -35,6 +44,9 @@ ├─ OS Ubuntu 24.04.1 LTS (Noble Numbat) (Linux 6.11.0-28-generic x86_64), Memory=26.7/125.5 GiB, Cores=32 ├─ User info: user=ubuntu, uid=1000, gid=1000 ├─ ✅ NVIDIA GPU NVIDIA RTX 6000 Ada Generation, driver 570.133.07, CUDA 12.8, Power=26.14/300.00 W, Memory=289/49140 MiB +├─ 🤖Framework +│ ├─ ✅ vLLM: 0.10.1.1, module=/opt/vllm/vllm/__init__.py, exec=/opt/dynamo/venv/bin/vllm +│ └─ ✅ Sglang: 0.3.0, module=/opt/sglang/sglang/__init__.py ├─ File System │ ├─ ✅ Dynamo workspace ($HOME/dynamo) writable │ ├─ ✅ Dynamo .git directory writable @@ -42,6 +54,7 @@ │ ├─ ✅ Cargo home ($HOME/.cargo) writable │ ├─ ✅ Cargo target ($HOME/dynamo/.build/target) writable │ └─ ✅ Python site-packages ($HOME/dynamo/venv/lib/python3.12/site-packages) writable +├─ ✅ Hugging Face Cache 3 models in ~/.cache/huggingface/hub ├─ ✅ Cargo $HOME/.cargo/bin/cargo, cargo 1.89.0 (c24e10642 2025-06-23) │ ├─ Cargo home directory CARGO_HOME=$HOME/.cargo │ └─ Cargo target directory CARGO_TARGET_DIR=$HOME/dynamo/.build/target @@ -52,9 +65,6 @@ ├─ ✅ Python 3.12.3, /opt/dynamo/venv/bin/python │ ├─ ✅ PyTorch 2.7.1+cu128, ✅torch.cuda.is_available │ └─ PYTHONPATH not set -├─ 🤖Framework -│ ├─ ✅ vLLM: 0.10.1.1, module=/opt/vllm/vllm/__init__.py, exec=/opt/dynamo/venv/bin/vllm -│ └─ ✅ Sglang: 0.3.0, module=/opt/sglang/sglang/__init__.py └─ Dynamo $HOME/dynamo, SHA: a03d29066, Date: 2025-08-30 16:22:29 PDT ├─ ✅ Runtime components ai-dynamo-runtime 0.4.1 │ │ /opt/dynamo/venv/lib/python3.12/site-packages/ai_dynamo_runtime-0.4.1.dist-info: created=2025-08-30 19:14:29 PDT @@ -79,8 +89,8 @@ python deploy/sanity_check.py [--thorough-check] [--terse] Options: - --thorough-check Enable thorough checking (file permissions, directory sizes, etc.) - --terse Enable terse output mode + --thorough-check Enable thorough checking (file permissions, directory sizes, HuggingFace model details) + --terse Enable terse output mode (show only essential info and errors) """ import datetime @@ -324,6 +334,9 @@ def __init__( # Add file permissions check self.add_child(FilePermissionsInfo(thorough_check=self.thorough_check)) + # Add HuggingFace cache check + self.add_child(HuggingFaceInfo(thorough_check=self.thorough_check)) + # Add Cargo (always show, even if not found) self.add_child(CargoInfo(thorough_check=self.thorough_check)) @@ -1227,6 +1240,185 @@ def format_bytes(bytes_val): return "", None +class HuggingFaceInfo(NodeInfo): + """Hugging Face models cache information (follows standalone requirement) + + HARD-CODED PATH: ~/.cache/huggingface/hub + + ENV VARIABLES (checked by HuggingFace transformers library, not this tool): + - HF_HOME: Base directory for Hugging Face cache + - HUGGINGFACE_HUB_CACHE: Direct path to hub cache + - HF_TOKEN: Authentication token (checked and displayed if set) + + This class directly uses ~/.cache/huggingface/hub instead of reading environment + variables because this tool must work reliably in all environments, including when + environment variables are misconfigured or not set. For dynamic configuration that + respects all HF environment variables, use dynamo.common.config_dump at runtime. + """ + + def __init__(self, thorough_check: bool = False): + # HARD-CODED PATH: ~/.cache/huggingface/hub (not reading HF_HOME or HUGGINGFACE_HUB_CACHE) + hf_cache_path = os.path.expanduser("~/.cache/huggingface/hub") + + if os.path.exists(hf_cache_path): + models = self._get_cached_models(hf_cache_path) + if models: + self._init_with_models(hf_cache_path, models, thorough_check) + else: + self._init_no_models_found(hf_cache_path) + else: + self._init_cache_not_available() + + # Add HF_TOKEN info if set (common to all cases) + self._add_hf_token_info() + + def _init_with_models( + self, hf_cache_path: str, models: List[tuple], thorough_check: bool + ): + """Initialize when models are found in cache.""" + model_count = len(models) + display_path = self._replace_home_with_var(hf_cache_path) + super().__init__( + label="Hugging Face Cache", + desc=f"{model_count} models in {display_path}", + status=NodeStatus.OK, + ) + + # Only show detailed model list in thorough mode + if thorough_check: + self._add_model_details(models) + + def _init_no_models_found(self, hf_cache_path: str): + """Initialize when cache exists but no models found.""" + display_path = self._replace_home_with_var(hf_cache_path) + super().__init__( + label="Hugging Face Cache", + desc=f"directory exists but no models found in {display_path}", + status=NodeStatus.WARNING, + ) + + def _init_cache_not_available(self): + """Initialize when cache directory doesn't exist.""" + super().__init__( + label="Hugging Face Cache", + desc="~/.cache/huggingface/hub not available", + status=NodeStatus.WARNING, + ) + + def _add_model_details(self, models: List[tuple]): + """Add detailed model information as child nodes.""" + # Add all models as children (no limit) + for i, model_info in enumerate(models): + model_name, download_date, size_str = model_info + model_node = NodeInfo( + label=f"Model {i+1}", + desc=f"{model_name}, downloaded={download_date}, size={size_str}", + status=NodeStatus.INFO, + ) + self.add_child(model_node) + + def _add_hf_token_info(self): + """Add HF_TOKEN information if the environment variable is set.""" + if os.environ.get("HF_TOKEN"): + token_node = NodeInfo( + label="HF_TOKEN", + desc="", + status=NodeStatus.INFO, + ) + self.add_child(token_node) + + def _get_cached_models(self, cache_path: str) -> List[tuple]: + """Get list of cached Hugging Face models with metadata. + + Returns: + List of tuples: (model_name, download_date, size_str) + """ + models = [] + try: + if os.path.exists(cache_path): + for item in os.listdir(cache_path): + item_path = os.path.join(cache_path, item) + if os.path.isdir(item_path): + # Get model name + if item.startswith("models--"): + # Convert "models--org--model-name" to "org/model-name" + parts = item.split("--") + if len(parts) >= 3: + org = parts[1] + model_name = "--".join( + parts[2:] + ) # Handle model names with dashes + display_name = f"{org}/{model_name}" + else: + display_name = item # Fallback to raw name + elif not item.startswith("."): # Skip hidden files/dirs + display_name = item + else: + continue # Skip hidden directories + + # Get download date (directory creation/modification time) + try: + stat_info = os.stat(item_path) + # Use the earlier of creation time or modification time + download_time = min(stat_info.st_ctime, stat_info.st_mtime) + download_date = self._format_timestamp_pdt(download_time) + except Exception: + download_date = "unknown" + + # Get directory size + try: + size_bytes = self._get_directory_size_bytes(item_path) + size_str = self._format_size(size_bytes) + except Exception: + size_str = "unknown" + + models.append((display_name, download_date, size_str)) + except Exception: + pass + + # Sort by model name + return sorted(models, key=lambda x: x[0]) + + def _get_directory_size_bytes(self, directory: str) -> int: + """Get the total size of a directory in bytes.""" + total_size = 0 + try: + for dirpath, dirnames, filenames in os.walk(directory): + for filename in filenames: + filepath = os.path.join(dirpath, filename) + try: + if not os.path.islink(filepath): # Skip symbolic links + total_size += os.path.getsize(filepath) + except (OSError, FileNotFoundError): + pass # Skip files that can't be accessed + except Exception: + pass + return total_size + + def _format_size(self, size_bytes: int) -> str: + """Format size in bytes to human readable format.""" + if size_bytes == 0: + return "0 B" + + units = ["B", "KB", "MB", "GB", "TB"] + size = float(size_bytes) + unit_index = 0 + + while size >= 1024.0 and unit_index < len(units) - 1: + size /= 1024.0 + unit_index += 1 + + # Format with appropriate precision + if unit_index == 0: # Bytes + return f"{int(size)} {units[unit_index]}" + elif size >= 100: + return f"{size:.0f} {units[unit_index]}" + elif size >= 10: + return f"{size:.1f} {units[unit_index]}" + else: + return f"{size:.2f} {units[unit_index]}" + + class CargoInfo(NodeInfo): """Cargo tool information""" From 730b2653ae93367e3ae4ce30eba381f8b952ab6f Mon Sep 17 00:00:00 2001 From: Keiven Chang Date: Sat, 25 Oct 2025 02:53:13 +0000 Subject: [PATCH 2/3] fix: correct HuggingFace cache model filtering and performance - Only count models--* directories, excluding datasets--, spaces--, blobs - Gate size calculation on thorough_check flag to keep default mode fast - Add compute_sizes parameter with documentation Signed-off-by: Keiven Chang --- deploy/sanity_check.py | 62 ++++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/deploy/sanity_check.py b/deploy/sanity_check.py index 67a7b04334..eba419282e 100755 --- a/deploy/sanity_check.py +++ b/deploy/sanity_check.py @@ -1261,7 +1261,9 @@ def __init__(self, thorough_check: bool = False): hf_cache_path = os.path.expanduser("~/.cache/huggingface/hub") if os.path.exists(hf_cache_path): - models = self._get_cached_models(hf_cache_path) + models = self._get_cached_models( + hf_cache_path, compute_sizes=thorough_check + ) if models: self._init_with_models(hf_cache_path, models, thorough_check) else: @@ -1327,9 +1329,13 @@ def _add_hf_token_info(self): ) self.add_child(token_node) - def _get_cached_models(self, cache_path: str) -> List[tuple]: + def _get_cached_models(self, cache_path: str, compute_sizes: bool) -> List[tuple]: """Get list of cached Hugging Face models with metadata. + Args: + cache_path: Path to HuggingFace cache directory + compute_sizes: Whether to compute directory sizes (slow operation) + Returns: List of tuples: (model_name, download_date, size_str) """ @@ -1338,41 +1344,37 @@ def _get_cached_models(self, cache_path: str) -> List[tuple]: if os.path.exists(cache_path): for item in os.listdir(cache_path): item_path = os.path.join(cache_path, item) - if os.path.isdir(item_path): - # Get model name - if item.startswith("models--"): - # Convert "models--org--model-name" to "org/model-name" - parts = item.split("--") - if len(parts) >= 3: - org = parts[1] - model_name = "--".join( - parts[2:] - ) # Handle model names with dashes - display_name = f"{org}/{model_name}" - else: - display_name = item # Fallback to raw name - elif not item.startswith("."): # Skip hidden files/dirs - display_name = item - else: - continue # Skip hidden directories - - # Get download date (directory creation/modification time) - try: - stat_info = os.stat(item_path) - # Use the earlier of creation time or modification time - download_time = min(stat_info.st_ctime, stat_info.st_mtime) - download_date = self._format_timestamp_pdt(download_time) - except Exception: - download_date = "unknown" + # Only count model repos; ignore datasets--, spaces--, blobs, etc. + if not (os.path.isdir(item_path) and item.startswith("models--")): + continue + # Convert "models--org--repo-name" to "org/repo-name" + parts = item.split("--") + if len(parts) >= 3: + org = parts[1] + model_name = "--".join(parts[2:]) # Preserve dashes + display_name = f"{org}/{model_name}" + else: + display_name = item # Fallback to raw dir name + + # Get download date (directory creation/modification time) + try: + stat_info = os.stat(item_path) + # Use the earlier of creation time or modification time + download_time = min(stat_info.st_ctime, stat_info.st_mtime) + download_date = self._format_timestamp_pdt(download_time) + except Exception: + download_date = "unknown" - # Get directory size + # Get directory size (only when requested) + size_str = "-" + if compute_sizes: try: size_bytes = self._get_directory_size_bytes(item_path) size_str = self._format_size(size_bytes) except Exception: size_str = "unknown" - models.append((display_name, download_date, size_str)) + models.append((display_name, download_date, size_str)) except Exception: pass From 8c045e07c1377262278bcc6ab36f2756d6298b81 Mon Sep 17 00:00:00 2001 From: Keiven Chang Date: Tue, 28 Oct 2025 12:21:33 -0700 Subject: [PATCH 3/3] minor, fix site-packages permission check to avoid false failures When virtualenv site-packages is writable, downgrade non-writable system directories from ERROR to WARNING. Signed-off-by: Keiven Chang --- deploy/sanity_check.py | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/deploy/sanity_check.py b/deploy/sanity_check.py index eba419282e..9d1fe84e07 100755 --- a/deploy/sanity_check.py +++ b/deploy/sanity_check.py @@ -1116,7 +1116,14 @@ def _check_dynamo_directory_permissions(self): ) def _check_site_packages_permissions(self): - """Check site-packages directory writability""" + """Check site-packages directory writability + + Logic: + - If running in a virtualenv and its site-packages is writable: PASS + (system site-packages being read-only is expected and shown as WARNING) + - If no virtualenv and no writable site-packages: ERROR + (can't install packages anywhere) + """ try: import site @@ -1126,15 +1133,33 @@ def _check_site_packages_permissions(self): if user_site: site_packages_dirs.append(user_site) - # Check each existing site-packages directory + # First pass: check which directories are writable + writable_dirs = [] + all_results = [] recursive = self.thorough_check + for site_dir in site_packages_dirs: if os.path.exists(site_dir): results = self._check_permissions_unified( [site_dir], "site-packages", recursive=recursive ) - for result in results: - self.add_child(result) + all_results.append((site_dir, results)) + + # Check if this directory is writable + if results and results[0].status == NodeStatus.OK: + writable_dirs.append(site_dir) + + # Determine if we have at least one writable site-packages + has_writable_site_packages = len(writable_dirs) > 0 + + # Second pass: add results with adjusted status + for site_dir, results in all_results: + for result in results: + # If we have at least one writable site-packages, + # downgrade ERROR to WARNING for non-writable ones + if has_writable_site_packages and result.status == NodeStatus.ERROR: + result.status = NodeStatus.WARNING + self.add_child(result) except Exception as e: self.add_child(