diff --git a/README.md b/README.md index aae7b0dc..4ee8f640 100644 --- a/README.md +++ b/README.md @@ -100,14 +100,17 @@ Open http://localhost:7860 (Gradio) or http://localhost:8001 (API). ### šŸ’” Which Model Should I Choose? -| Your GPU VRAM | Recommended LM Model | Notes | -|---------------|---------------------|-------| -| **≤6GB** | None (DiT only) | LM disabled by default to save memory | -| **6-12GB** | `acestep-5Hz-lm-0.6B` | Lightweight, good balance | -| **12-16GB** | `acestep-5Hz-lm-1.7B` | Better quality | -| **≄16GB** | `acestep-5Hz-lm-4B` | Best quality and audio understanding | - -> šŸ“– GPU compatibility details: [English](./docs/en/GPU_COMPATIBILITY.md) | [äø­ę–‡](./docs/zh/GPU_COMPATIBILITY.md) | [ę—„ęœ¬čŖž](./docs/ja/GPU_COMPATIBILITY.md) +| Your GPU VRAM | Recommended LM Model | Backend | Notes | +|---------------|---------------------|---------|-------| +| **≤6GB** | None (DiT only) | — | LM disabled by default; INT8 quantization + full CPU offload | +| **6-8GB** | `acestep-5Hz-lm-0.6B` | `pt` | Lightweight LM with PyTorch backend | +| **8-16GB** | `acestep-5Hz-lm-0.6B` / `1.7B` | `vllm` | 0.6B for 8-12GB, 1.7B for 12-16GB | +| **16-24GB** | `acestep-5Hz-lm-1.7B` | `vllm` | 4B available on 20GB+; no offload needed on 20GB+ | +| **≄24GB** | `acestep-5Hz-lm-4B` | `vllm` | Best quality, all models fit without offload | + +The UI automatically selects the best configuration for your GPU. All settings (LM model, backend, offloading, quantization) are tier-aware and pre-configured. + +> šŸ“– GPU compatibility details: [English](./docs/en/GPU_COMPATIBILITY.md) | [äø­ę–‡](./docs/zh/GPU_COMPATIBILITY.md) | [ę—„ęœ¬čŖž](./docs/ja/GPU_COMPATIBILITY.md) | [ķ•œźµ­ģ–“](./docs/ko/GPU_COMPATIBILITY.md) ## šŸš€ Launch Scripts diff --git a/acestep/acestep_v15_pipeline.py b/acestep/acestep_v15_pipeline.py index 08347dac..252f805c 100644 --- a/acestep/acestep_v15_pipeline.py +++ b/acestep/acestep_v15_pipeline.py @@ -36,7 +36,7 @@ from .llm_inference import LLMHandler from .dataset_handler import DatasetHandler from .gradio_ui import create_gradio_interface - from .gpu_config import get_gpu_config, get_gpu_memory_gb, print_gpu_config_info, set_global_gpu_config, VRAM_16GB_MIN_GB + from .gpu_config import get_gpu_config, get_gpu_memory_gb, print_gpu_config_info, set_global_gpu_config, VRAM_16GB_MIN_GB, VRAM_AUTO_OFFLOAD_THRESHOLD_GB from .model_downloader import ensure_lm_model except ImportError: # When executed as a script: `python acestep/acestep_v15_pipeline.py` @@ -47,7 +47,7 @@ from acestep.llm_inference import LLMHandler from acestep.dataset_handler import DatasetHandler from acestep.gradio_ui import create_gradio_interface - from acestep.gpu_config import get_gpu_config, get_gpu_memory_gb, print_gpu_config_info, set_global_gpu_config, VRAM_16GB_MIN_GB + from acestep.gpu_config import get_gpu_config, get_gpu_memory_gb, print_gpu_config_info, set_global_gpu_config, VRAM_16GB_MIN_GB, VRAM_AUTO_OFFLOAD_THRESHOLD_GB from acestep.model_downloader import ensure_lm_model @@ -93,7 +93,11 @@ def main(): set_global_gpu_config(gpu_config) # Set global config for use across modules gpu_memory_gb = gpu_config.gpu_memory_gb - auto_offload = gpu_memory_gb > 0 and gpu_memory_gb < VRAM_16GB_MIN_GB + # Enable auto-offload for GPUs below 20 GB. 16 GB GPUs cannot hold all + # models simultaneously (DiT ~4.7 + VAE ~0.3 + text_enc ~1.2 + LM ≄1.2 + + # activations) so they *must* offload. The old threshold of 16 GB caused + # 16 GB GPUs to never offload, leading to OOM. 
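To make the comment above concrete, here is an illustrative back-of-envelope check (a sketch only, not part of the patch) using the `MODEL_VRAM` / `LM_VRAM` tables this diff adds to `acestep/gpu_config.py`, assuming the turbo DiT and the 1.7B LM with a 4k-token KV cache:

```python
# Rough resident-memory budget for a 16 GB card (figures are the PR's own bf16 estimates).
from acestep.gpu_config import MODEL_VRAM, LM_VRAM, VRAM_SAFETY_MARGIN_GB

resident_gb = (
    MODEL_VRAM["dit_turbo"]            # ~4.7  GB DiT weights
    + MODEL_VRAM["vae"]                # ~0.33 GB VAE
    + MODEL_VRAM["text_encoder"]       # ~1.2  GB text encoder
    + MODEL_VRAM["cuda_context"]       # ~0.5  GB CUDA context / driver overhead
    + LM_VRAM["1.7B"]["weights"]       # ~3.4  GB LM weights
    + LM_VRAM["1.7B"]["kv_cache_4k"]   # ~1.0  GB KV cache at 4k tokens
    + VRAM_SAFETY_MARGIN_GB            #  0.5  GB safety margin
)
print(f"{resident_gb:.1f} GB resident before activations")  # prints roughly 11.6 GB
```

Add batch activations and allocator fragmentation on top and a 16 GB card has very little slack, which is why the flag computed below now uses the 20 GB threshold.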
+ auto_offload = gpu_memory_gb > 0 and gpu_memory_gb < VRAM_AUTO_OFFLOAD_THRESHOLD_GB # Print GPU configuration info print(f"\n{'='*60}") @@ -110,9 +114,9 @@ def main(): print(f"{'='*60}\n") if auto_offload: - print(f"Auto-enabling CPU offload (GPU < 16GB)") + print(f"Auto-enabling CPU offload (GPU {gpu_memory_gb:.1f}GB < {VRAM_AUTO_OFFLOAD_THRESHOLD_GB}GB threshold)") elif gpu_memory_gb > 0: - print(f"CPU offload disabled by default (GPU >= 16GB)") + print(f"CPU offload disabled by default (GPU {gpu_memory_gb:.1f}GB >= {VRAM_AUTO_OFFLOAD_THRESHOLD_GB}GB threshold)") else: print("No GPU detected, running on CPU") @@ -205,6 +209,19 @@ def main(): args.offload_to_cpu = True print(f"Auto-enabling CPU offload (4B LM model requires offloading on {gpu_memory_gb:.0f}GB GPU)") + # Safety: on 16GB GPUs, prevent selecting LM models that are too large. + # Even with offloading, a 4B LM (8 GB weights + KV cache) leaves almost no + # headroom for DiT activations on a 16 GB card. + if args.lm_model_path and 0 < gpu_memory_gb < VRAM_AUTO_OFFLOAD_THRESHOLD_GB: + if "4B" in args.lm_model_path: + # Downgrade to 1.7B if available + fallback = args.lm_model_path.replace("4B", "1.7B") + print( + f"WARNING: 4B LM model is too large for {gpu_memory_gb:.0f}GB GPU. " + f"Downgrading to 1.7B variant: {fallback}" + ) + args.lm_model_path = fallback + try: init_params = None dit_handler = None diff --git a/acestep/api_server.py b/acestep/api_server.py index a553aaf8..91224994 100644 --- a/acestep/api_server.py +++ b/acestep/api_server.py @@ -68,6 +68,7 @@ is_lm_model_supported, GPUConfig, VRAM_16GB_MIN_GB, + VRAM_AUTO_OFFLOAD_THRESHOLD_GB, ) @@ -1899,7 +1900,7 @@ async def _job_store_cleanup_worker() -> None: app.state.gpu_config = gpu_config gpu_memory_gb = gpu_config.gpu_memory_gb - auto_offload = gpu_memory_gb > 0 and gpu_memory_gb < VRAM_16GB_MIN_GB + auto_offload = gpu_memory_gb > 0 and gpu_memory_gb < VRAM_AUTO_OFFLOAD_THRESHOLD_GB # Print GPU configuration info print(f"\n{'='*60}") diff --git a/acestep/dit_alignment_score.py b/acestep/dit_alignment_score.py index 4e99982d..f89def61 100644 --- a/acestep/dit_alignment_score.py +++ b/acestep/dit_alignment_score.py @@ -834,16 +834,16 @@ def calculate_score( Returns: AlignmentScore object containing individual metrics and final score. """ - # Ensure Inputs are Tensors on the correct device + # Ensure Inputs are Tensors. + # Always compute on CPU — the scoring matrices are small and this + # avoids occupying GPU VRAM that DiT / VAE / LM need. Keeping + # everything on CPU also prevents timeout issues on low-VRAM GPUs + # where the accelerator memory is fully committed to model weights. + _score_device = "cpu" if not isinstance(energy_matrix, torch.Tensor): - # Use available accelerator device; fallback to CPU if none - if torch.cuda.is_available(): - _score_device = "cuda" - elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): - _score_device = "mps" - else: - _score_device = "cpu" energy_matrix = torch.tensor(energy_matrix, device=_score_device, dtype=torch.float32) + else: + energy_matrix = energy_matrix.to(device=_score_device, dtype=torch.float32) device = energy_matrix.device diff --git a/acestep/gpu_config.py b/acestep/gpu_config.py index 1733beb5..e02fde18 100644 --- a/acestep/gpu_config.py +++ b/acestep/gpu_config.py @@ -28,11 +28,53 @@ VRAM_16GB_TOLERANCE_GB = 0.5 VRAM_16GB_MIN_GB = 16.0 - VRAM_16GB_TOLERANCE_GB # treat as 16GB class if >= this +# Threshold below which auto_offload is enabled. 
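# Editor's sketch (not part of the patch): the same constant also drives the
# tier6a/tier6b split defined further down in this file, e.g.
#     get_gpu_config(16.0).tier  -> "tier6a"  (offload and INT8 defaults stay on)
#     get_gpu_config(22.0).tier  -> "tier6b"  (offload and quantization default off)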
+# 16GB GPUs cannot hold DiT + VAE + text_encoder + LM simultaneously without offloading. +VRAM_AUTO_OFFLOAD_THRESHOLD_GB = 20.0 + # PyTorch installation URLs for diagnostics PYTORCH_CUDA_INSTALL_URL = "https://download.pytorch.org/whl/cu121" PYTORCH_ROCM_INSTALL_URL = "https://download.pytorch.org/whl/rocm6.0" +# =========================================================================== +# Empirical VRAM measurements (GB) -- model weights only, bf16 precision +# These values should be calibrated using scripts/profile_vram.py +# =========================================================================== + +# Base model weights (loaded once at startup) +MODEL_VRAM = { + "dit_turbo": 4.7, # DiT turbo model weights (bf16) + "dit_base": 4.7, # DiT base model weights (bf16) + "vae": 0.33, # VAE (AutoencoderOobleck) weights (fp16) + "text_encoder": 1.2, # Qwen3-Embedding-0.6B text encoder (bf16) + "silence_latent": 0.01, # Silence latent tensor + "cuda_context": 0.5, # CUDA context + driver overhead +} + +# LM model weights (bf16) + KV cache estimates +LM_VRAM = { + "0.6B": {"weights": 1.2, "kv_cache_2k": 0.3, "kv_cache_4k": 0.6}, + "1.7B": {"weights": 3.4, "kv_cache_2k": 0.5, "kv_cache_4k": 1.0}, + "4B": {"weights": 8.0, "kv_cache_2k": 0.8, "kv_cache_4k": 1.6}, +} + +# DiT inference peak VRAM per batch item (approximate, depends on duration) +# These are additional activations/intermediates on top of model weights. +# +# Profiling on A800 (flash attention) shows only ~0.001-0.004 GB per batch item. +# Consumer GPUs without flash attention will be higher due to materialised +# attention matrices. We use conservative estimates that cover the worst case +# (no flash attention, long sequences). +DIT_INFERENCE_VRAM_PER_BATCH = { + "turbo": 0.3, # GB per batch item (no CFG) + "base": 0.6, # GB per batch item (with CFG, 2x forward) +} + +# Safety margin to keep free for OS/driver/fragmentation (GB) +VRAM_SAFETY_MARGIN_GB = 0.5 + + @dataclass class GPUConfig: """GPU configuration based on available memory""" @@ -50,65 +92,157 @@ class GPUConfig: # LM configuration init_lm_default: bool # Whether to initialize LM by default available_lm_models: List[str] # Available LM models for this tier + recommended_lm_model: str # Recommended default LM model path (empty if LM not available) + + # LM backend restriction + # "all" = any backend, "pt_mlx_only" = only pt/mlx (no vllm), used for very low VRAM + lm_backend_restriction: str # "all" or "pt_mlx_only" + recommended_backend: str # Recommended default backend: "vllm", "pt", or "mlx" + + # Offload defaults + offload_to_cpu_default: bool # Whether offload_to_cpu should be enabled by default + offload_dit_to_cpu_default: bool # Whether offload_dit_to_cpu should be enabled by default + + # Quantization / compile defaults + quantization_default: bool # Whether INT8 quantization should be enabled by default + compile_model_default: bool # Whether torch.compile should be enabled by default # LM memory allocation (GB) for each model size lm_memory_gb: Dict[str, float] # e.g., {"0.6B": 3, "1.7B": 8, "4B": 12} # GPU tier configurations +# tier6 has been split into tier6a (16-20GB) and tier6b (20-24GB) to fix the +# 16GB regression. 16GB GPUs cannot hold all models simultaneously with the +# same batch sizes as 24GB GPUs. GPU_TIER_CONFIGS = { "tier1": { # <= 4GB - "max_duration_with_lm": 180, # 3 minutes - "max_duration_without_lm": 180, # 3 minutes + # Offload mode required. DiT(4.46) barely fits with CUDA context(0.5). + # VAE decode falls back to CPU. 
Keep durations moderate. + "max_duration_with_lm": 240, # 4 minutes + "max_duration_without_lm": 360, # 6 minutes "max_batch_size_with_lm": 1, "max_batch_size_without_lm": 1, "init_lm_default": False, "available_lm_models": [], + "recommended_lm_model": "", + "lm_backend_restriction": "pt_mlx_only", # vllm KV cache won't fit + "recommended_backend": "pt", + "offload_to_cpu_default": True, + "offload_dit_to_cpu_default": True, + "quantization_default": True, # INT8 essential to fit DiT in ~4GB + "compile_model_default": True, "lm_memory_gb": {}, }, "tier2": { # 4-6GB - "max_duration_with_lm": 360, # 6 minutes - "max_duration_without_lm": 360, # 6 minutes + # Offload mode. DiT(4.46) + context(0.5) + activations ā‰ˆ 5.0GB. + # ~1GB headroom. Tiled VAE decode fits with chunk=256 (~0.8GB peak). + # Duration barely affects peak VRAM (latent tensor is <2MB even at 10min). + "max_duration_with_lm": 480, # 8 minutes + "max_duration_without_lm": 600, # 10 minutes (max supported) "max_batch_size_with_lm": 1, "max_batch_size_without_lm": 1, "init_lm_default": False, "available_lm_models": [], + "recommended_lm_model": "", + "lm_backend_restriction": "pt_mlx_only", + "recommended_backend": "pt", + "offload_to_cpu_default": True, + "offload_dit_to_cpu_default": True, + "quantization_default": True, + "compile_model_default": True, "lm_memory_gb": {}, }, "tier3": { # 6-8GB - "max_duration_with_lm": 240, # 4 minutes with LM - "max_duration_without_lm": 360, # 6 minutes without LM - "max_batch_size_with_lm": 1, + # Offload mode. DiT(4.46) + context(0.5) ā‰ˆ 5.0GB. + # ~1.5-3GB headroom allows LM 0.6B (1.2+0.6=1.8GB) and batch=2. + # vllm KV cache is tight; pt backend is safer for 0.6B on this tier. + "max_duration_with_lm": 480, # 8 minutes + "max_duration_without_lm": 600, # 10 minutes (max supported) + "max_batch_size_with_lm": 2, "max_batch_size_without_lm": 2, - "init_lm_default": False, # Don't init by default due to limited memory + "init_lm_default": True, "available_lm_models": ["acestep-5Hz-lm-0.6B"], + "recommended_lm_model": "acestep-5Hz-lm-0.6B", + "lm_backend_restriction": "pt_mlx_only", # vllm KV cache too greedy for <8GB + "recommended_backend": "pt", + "offload_to_cpu_default": True, + "offload_dit_to_cpu_default": True, + "quantization_default": True, + "compile_model_default": True, "lm_memory_gb": {"0.6B": 3}, }, "tier4": { # 8-12GB - "max_duration_with_lm": 240, # 4 minutes with LM - "max_duration_without_lm": 360, # 6 minutes without LM + # Can keep DiT + 0.6B LM simultaneously on GPU (4.46+1.2+0.6=6.26GB). + # Offload VAE/TextEnc. Plenty of room for inference activations. + "max_duration_with_lm": 480, # 8 minutes + "max_duration_without_lm": 600, # 10 minutes (max supported) "max_batch_size_with_lm": 2, "max_batch_size_without_lm": 4, - "init_lm_default": False, # Don't init by default + "init_lm_default": True, "available_lm_models": ["acestep-5Hz-lm-0.6B"], + "recommended_lm_model": "acestep-5Hz-lm-0.6B", + "lm_backend_restriction": "all", # vllm fits with 0.6B + "recommended_backend": "vllm", + "offload_to_cpu_default": True, + "offload_dit_to_cpu_default": True, + "quantization_default": True, + "compile_model_default": True, "lm_memory_gb": {"0.6B": 3}, }, "tier5": { # 12-16GB - "max_duration_with_lm": 240, # 4 minutes with LM - "max_duration_without_lm": 360, # 6 minutes without LM - "max_batch_size_with_lm": 2, + # DiT + 1.7B LM (4.46+3.45+0.44=8.35GB) fits comfortably. + # VAE decode is batch-sequential so batch size doesn't affect VAE VRAM. 
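        # Worked headroom check (editor's sketch, using DIT_INFERENCE_VRAM_PER_BATCH above):
        # on a 12 GB card, 12 - 8.35 leaves roughly 3.6 GB; a turbo batch of 4 adds about
        # 4 x 0.3 = 1.2 GB of activations, so batch_size 4 with the 1.7B LM stays in budget.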
+ "max_duration_with_lm": 480, # 8 minutes + "max_duration_without_lm": 600, # 10 minutes (max supported) + "max_batch_size_with_lm": 4, "max_batch_size_without_lm": 4, "init_lm_default": True, "available_lm_models": ["acestep-5Hz-lm-0.6B", "acestep-5Hz-lm-1.7B"], + "recommended_lm_model": "acestep-5Hz-lm-1.7B", + "lm_backend_restriction": "all", + "recommended_backend": "vllm", + "offload_to_cpu_default": True, + "offload_dit_to_cpu_default": False, # 12-16GB can keep DiT on GPU + "quantization_default": True, + "compile_model_default": True, "lm_memory_gb": {"0.6B": 3, "1.7B": 8}, }, - "tier6": { # 16-24GB + "tier6a": { # 16-20GB (e.g., RTX 4060 Ti 16GB, RTX 3080 16GB) + # On 16GB GPUs: DiT(INT8, ~2.4GB) + LM 1.7B(~7.6GB peak with offload) = ~10GB peak + # Empirical batch tests (60s, turbo): noLM-4→13.3GB, LM-2→11.9GB, LM-4→~13.5GB + # With CPU offload, LM is offloaded after inference → DiT batch has full 16GB budget. "max_duration_with_lm": 480, # 8 minutes - "max_duration_without_lm": 480, # 8 minutes + "max_duration_without_lm": 600, # 10 minutes (max supported) "max_batch_size_with_lm": 4, "max_batch_size_without_lm": 8, "init_lm_default": True, + "available_lm_models": ["acestep-5Hz-lm-0.6B", "acestep-5Hz-lm-1.7B"], + "recommended_lm_model": "acestep-5Hz-lm-1.7B", + "lm_backend_restriction": "all", + "recommended_backend": "vllm", + "offload_to_cpu_default": True, # Still offload VAE/TextEnc to save VRAM for LM + "offload_dit_to_cpu_default": False, + "quantization_default": True, + "compile_model_default": True, + "lm_memory_gb": {"0.6B": 3, "1.7B": 8}, + }, + "tier6b": { # 20-24GB (e.g., RTX 3090, RTX 4090) + # 20-24GB: no offload, no quantization. DiT(bf16, ~4.7GB) + LM 1.7B(~3.4GB) = ~8.1GB + # Remaining ~12-16GB easily fits batch=8. VAE decode is batch-sequential. + "max_duration_with_lm": 480, # 8 minutes + "max_duration_without_lm": 480, # 8 minutes + "max_batch_size_with_lm": 8, + "max_batch_size_without_lm": 8, + "init_lm_default": True, "available_lm_models": ["acestep-5Hz-lm-0.6B", "acestep-5Hz-lm-1.7B", "acestep-5Hz-lm-4B"], + "recommended_lm_model": "acestep-5Hz-lm-1.7B", + "lm_backend_restriction": "all", + "recommended_backend": "vllm", + "offload_to_cpu_default": False, # 20-24GB can hold all models + "offload_dit_to_cpu_default": False, + "quantization_default": False, # Enough VRAM, quantization optional + "compile_model_default": True, "lm_memory_gb": {"0.6B": 3, "1.7B": 8, "4B": 12}, }, "unlimited": { # >= 24GB @@ -118,10 +252,20 @@ class GPUConfig: "max_batch_size_without_lm": 8, "init_lm_default": True, "available_lm_models": ["acestep-5Hz-lm-0.6B", "acestep-5Hz-lm-1.7B", "acestep-5Hz-lm-4B"], + "recommended_lm_model": "acestep-5Hz-lm-4B", + "lm_backend_restriction": "all", + "recommended_backend": "vllm", + "offload_to_cpu_default": False, + "offload_dit_to_cpu_default": False, + "quantization_default": False, # Plenty of VRAM + "compile_model_default": True, "lm_memory_gb": {"0.6B": 3, "1.7B": 8, "4B": 12}, }, } +# Backward compatibility alias: code that references "tier6" gets tier6b behavior +GPU_TIER_CONFIGS["tier6"] = GPU_TIER_CONFIGS["tier6b"] + def get_gpu_memory_gb() -> float: """ @@ -142,6 +286,38 @@ def get_gpu_memory_gb() -> float: try: simulated_gb = float(debug_vram) logger.warning(f"āš ļø DEBUG MODE: Simulating GPU memory as {simulated_gb:.1f}GB (set via {DEBUG_MAX_CUDA_VRAM_ENV} environment variable)") + # Also enforce a hard VRAM cap via PyTorch so that the allocator + # cannot use more than the simulated amount. 
This makes the + # simulation realistic — without it, models still load into the + # real (larger) GPU memory and nvitop shows much higher usage. + try: + import torch + if torch.cuda.is_available(): + total_bytes = torch.cuda.get_device_properties(0).total_memory + total_gb = total_bytes / (1024 ** 3) + if simulated_gb < total_gb: + # When simulating a smaller GPU on a larger one, the host + # GPU's CUDA context is typically much bigger (e.g. A100 + # ~1.4GB vs GTX 1060 ~0.3GB). Using the host context + # would over-penalise the allocator budget. + # + # Instead we use a *reference* context size that matches + # what the target-class GPU would actually have. Consumer + # GPUs (≤24GB) typically have 0.3-0.5GB context overhead. + REFERENCE_CONTEXT_GB = MODEL_VRAM.get("cuda_context", 0.5) + allocator_budget_gb = max(0.5, simulated_gb - REFERENCE_CONTEXT_GB) + fraction = allocator_budget_gb / total_gb + # Clamp to [0.01, 1.0] to satisfy PyTorch constraints + fraction = max(0.01, min(1.0, fraction)) + torch.cuda.set_per_process_memory_fraction(fraction) + logger.warning( + f"āš ļø DEBUG MODE: Set CUDA memory fraction to {fraction:.4f} " + f"(allocator_budget={allocator_budget_gb:.2f}GB, " + f"ref_context={REFERENCE_CONTEXT_GB:.2f}GB, target={simulated_gb:.1f}GB, " + f"total={total_gb:.1f}GB) to enforce hard VRAM cap" + ) + except Exception as e: + logger.warning(f"āš ļø DEBUG MODE: Could not enforce CUDA memory cap: {e}") return simulated_gb except ValueError: logger.warning(f"Invalid {DEBUG_MAX_CUDA_VRAM_ENV} value: {debug_vram}, ignoring") @@ -305,7 +481,7 @@ def get_gpu_tier(gpu_memory_gb: float) -> str: gpu_memory_gb: GPU memory in GB Returns: - Tier string: "tier1", "tier2", "tier3", "tier4", "tier5", "tier6", or "unlimited" + Tier string: "tier1", "tier2", "tier3", "tier4", "tier5", "tier6a", "tier6b", or "unlimited" """ if gpu_memory_gb <= 0: # CPU mode - use tier1 limits @@ -320,10 +496,13 @@ def get_gpu_tier(gpu_memory_gb: float) -> str: return "tier4" elif gpu_memory_gb < VRAM_16GB_MIN_GB: return "tier5" - elif gpu_memory_gb <= 24: + elif gpu_memory_gb < VRAM_AUTO_OFFLOAD_THRESHOLD_GB: + # 16-20GB range: tier6a (constrained, needs offload) if gpu_memory_gb < 16.0: logger.info(f"Detected {gpu_memory_gb:.2f}GB VRAM — treating as 16GB class GPU") - return "tier6" + return "tier6a" + elif gpu_memory_gb <= 24: + return "tier6b" else: return "unlimited" @@ -353,6 +532,13 @@ def get_gpu_config(gpu_memory_gb: Optional[float] = None) -> GPUConfig: max_batch_size_without_lm=config["max_batch_size_without_lm"], init_lm_default=config["init_lm_default"], available_lm_models=config["available_lm_models"], + recommended_lm_model=config.get("recommended_lm_model", ""), + lm_backend_restriction=config.get("lm_backend_restriction", "all"), + recommended_backend=config.get("recommended_backend", "vllm"), + offload_to_cpu_default=config.get("offload_to_cpu_default", True), + offload_dit_to_cpu_default=config.get("offload_dit_to_cpu_default", True), + quantization_default=config.get("quantization_default", True), + compile_model_default=config.get("compile_model_default", True), lm_memory_gb=config["lm_memory_gb"], ) @@ -362,7 +548,7 @@ def get_lm_model_size(model_path: str) -> str: Extract LM model size from model path. 
Args: - model_path: Model path string (e.g., "acestep-5Hz-lm-0.6B") + model_path: Model path string (e.g., "acestep-5Hz-lm-0.6B", "acestep-5Hz-lm-0.6B-v4-fix") Returns: Model size string: "0.6B", "1.7B", or "4B" @@ -378,46 +564,360 @@ def get_lm_model_size(model_path: str) -> str: return "0.6B" +def is_lm_model_size_allowed(disk_model_name: str, tier_available_models: List[str]) -> bool: + """ + Check if a disk LM model is allowed by the tier's available models list. + + Uses size-based matching so that variants like "acestep-5Hz-lm-0.6B-v4-fix" + are correctly matched against "acestep-5Hz-lm-0.6B" in the tier config. + + Args: + disk_model_name: Actual model directory name on disk (e.g., "acestep-5Hz-lm-0.6B-v4-fix") + tier_available_models: List of tier-allowed model base names (e.g., ["acestep-5Hz-lm-0.6B"]) + + Returns: + True if the model's size class is allowed by the tier + """ + if not tier_available_models: + return False + model_size = get_lm_model_size(disk_model_name) + for tier_model in tier_available_models: + if model_size == get_lm_model_size(tier_model): + return True + return False + + +def find_best_lm_model_on_disk(recommended_model: str, disk_models: List[str]) -> Optional[str]: + """ + Find the best matching disk model for a recommended tier model. + + If the exact recommended model exists on disk, return it. + Otherwise, find a disk model with the same size class (e.g., "0.6B"). + Prefers models with version suffixes (e.g., "-v4-fix") as they are likely newer. + + Args: + recommended_model: Tier-recommended model name (e.g., "acestep-5Hz-lm-0.6B") + disk_models: List of model names actually on disk + + Returns: + Best matching disk model name, or None if no match + """ + if not recommended_model or not disk_models: + return disk_models[0] if disk_models else None + + # Exact match first + if recommended_model in disk_models: + return recommended_model + + # Size-based match: find all disk models with same size + target_size = get_lm_model_size(recommended_model) + candidates = [m for m in disk_models if get_lm_model_size(m) == target_size] + + if candidates: + # Prefer the one with the longest name (likely has version suffix = newer) + return max(candidates, key=len) + + # No match for recommended size; return first available disk model + return disk_models[0] if disk_models else None + + def get_lm_gpu_memory_ratio(model_path: str, total_gpu_memory_gb: float) -> Tuple[float, float]: """ Calculate GPU memory utilization ratio for LM model. + This function now uses *actually free* VRAM (via torch.cuda.mem_get_info) + when available, instead of computing the ratio purely from total VRAM. + This is critical because DiT, VAE, and text encoder are already loaded + when the LM initializes, so the "available" memory is much less than total. 
+ Args: model_path: LM model path (e.g., "acestep-5Hz-lm-0.6B") - total_gpu_memory_gb: Total GPU memory in GB + total_gpu_memory_gb: Total GPU memory in GB (used as fallback) Returns: Tuple of (gpu_memory_utilization_ratio, target_memory_gb) """ model_size = get_lm_model_size(model_path) - # Model weight memory (approximate) for each model size - model_weight_memory = { - "0.6B": 3.0, - "1.7B": 8.0, - "4B": 12.0, - } + # Use empirical LM VRAM measurements for target memory + lm_info = LM_VRAM.get(model_size, LM_VRAM["0.6B"]) + lm_weights_gb = lm_info["weights"] + lm_kv_cache_gb = lm_info["kv_cache_4k"] - target_gb = model_weight_memory.get(model_size, 3.0) + # Total target = model weights + KV cache + small overhead + target_gb = lm_weights_gb + total_target_gb = lm_weights_gb + lm_kv_cache_gb + 0.3 # 0.3 GB overhead - # gpu_memory_utilization in nano-vllm caps the TOTAL GPU memory usage - # (model weights + KV cache + overhead). If we set it to just the model - # weight size, there is almost no room left for KV cache and inference - # fails with "Insufficient KV cache" errors. - # We therefore add generous headroom so the KV cache can hold at least - # max_model_len (4096) tokens comfortably. - total_target_gb = target_gb * 1.5 # 50% headroom for KV cache + overhead + # Try to use actual free memory for a more accurate ratio + free_gb = None + try: + import torch + if torch.cuda.is_available(): + free_bytes, total_bytes = torch.cuda.mem_get_info() + free_gb = free_bytes / (1024**3) + actual_total_gb = total_bytes / (1024**3) + + # If MAX_CUDA_VRAM is set, use the simulated values instead + # because set_per_process_memory_fraction limits actual allocation + debug_vram = os.environ.get(DEBUG_MAX_CUDA_VRAM_ENV) + if debug_vram is not None: + try: + simulated_gb = float(debug_vram) + if simulated_gb < actual_total_gb: + # Use reference context (matching set_per_process_memory_fraction) + ref_context_gb = MODEL_VRAM.get("cuda_context", 0.5) + allocator_budget_gb = max(0.5, simulated_gb - ref_context_gb) + reserved_gb = torch.cuda.memory_reserved() / (1024**3) + free_gb = max(0, allocator_budget_gb - reserved_gb) + actual_total_gb = simulated_gb + except (ValueError, TypeError): + pass + + # The ratio is relative to total GPU memory (nano-vllm convention), + # but we compute it so that the LM only claims what's actually free + # minus a safety margin for DiT inference activations. 
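            # Worked example (editor's sketch, using the 1.5 GB DiT reserve and
            # 0.5 GB safety margin applied just below): 16 GB card, 10.5 GB free,
            # 1.7B LM ->
            #   lm_target = 3.4 + 1.0 + 0.3              = 4.7 GB
            #   usable    = min(10.5 - 1.5 - 0.5, 4.7)   = 4.7 GB
            #   ratio     = ((16 - 10.5) + 4.7) / 16     ā‰ˆ 0.64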
+ # Reserve at least 1.5 GB for DiT inference activations + dit_reserve_gb = 1.5 + usable_for_lm = max(0, free_gb - dit_reserve_gb - VRAM_SAFETY_MARGIN_GB) + + # Cap to what the LM actually needs + usable_for_lm = min(usable_for_lm, total_target_gb) + + # Convert to ratio of total GPU memory + # nano-vllm uses: target_total_usage = total * gpu_memory_utilization + # We want: (total * ratio) = current_usage + usable_for_lm + current_usage_gb = actual_total_gb - free_gb + desired_total_usage = current_usage_gb + usable_for_lm + ratio = desired_total_usage / actual_total_gb + + ratio = min(0.9, max(0.1, ratio)) + + logger.info( + f"[get_lm_gpu_memory_ratio] model={model_size}, free={free_gb:.2f}GB, " + f"current_usage={current_usage_gb:.2f}GB, lm_target={total_target_gb:.2f}GB, " + f"usable_for_lm={usable_for_lm:.2f}GB, ratio={ratio:.3f}" + ) + return ratio, target_gb + except Exception as e: + logger.warning(f"[get_lm_gpu_memory_ratio] Failed to query free VRAM: {e}, using fallback") - # For large GPUs (>=24GB), don't restrict memory too much + # Fallback: compute ratio from total VRAM (less accurate) if total_gpu_memory_gb >= 24: ratio = min(0.9, max(0.2, total_target_gb / total_gpu_memory_gb)) else: - # For smaller GPUs, strictly limit memory usage ratio = min(0.9, max(0.1, total_target_gb / total_gpu_memory_gb)) return ratio, target_gb +def compute_adaptive_config(total_vram_gb: float, dit_type: str = "turbo") -> GPUConfig: + """ + Compute GPU configuration based on what actually fits in VRAM. + + This is a VRAM-budget-based approach: instead of hard-coded tier boundaries, + we calculate how much memory each component needs and determine what fits. + + Args: + total_vram_gb: Total GPU VRAM in GB + dit_type: "turbo" or "base" (affects inference VRAM due to CFG) + + Returns: + GPUConfig with parameters that fit within the VRAM budget + """ + # Calculate base VRAM usage (always loaded) + dit_key = f"dit_{dit_type}" if f"dit_{dit_type}" in MODEL_VRAM else "dit_turbo" + base_usage = ( + MODEL_VRAM[dit_key] + + MODEL_VRAM["vae"] + + MODEL_VRAM["text_encoder"] + + MODEL_VRAM["cuda_context"] + + MODEL_VRAM["silence_latent"] + + VRAM_SAFETY_MARGIN_GB + ) + + available = total_vram_gb - base_usage + + if available <= 0: + # Not enough for even base models - CPU offload required + return get_gpu_config(total_vram_gb) + + # Determine which LM models fit + available_lm_models = [] + lm_memory_gb = {} + + for size_key in ["0.6B", "1.7B", "4B"]: + lm_info = LM_VRAM[size_key] + lm_total = lm_info["weights"] + lm_info["kv_cache_4k"] + # LM needs to fit with some room left for inference activations + inference_per_batch = DIT_INFERENCE_VRAM_PER_BATCH.get(dit_type, 0.8) + if lm_total + inference_per_batch <= available: + model_name = f"acestep-5Hz-lm-{size_key}" + available_lm_models.append(model_name) + lm_memory_gb[size_key] = lm_info["weights"] + lm_info["kv_cache_4k"] + + # Determine max batch sizes + inference_per_batch = DIT_INFERENCE_VRAM_PER_BATCH.get(dit_type, 0.8) + + # Without LM: all available VRAM goes to inference + max_batch_no_lm = max(1, int(available / inference_per_batch)) + max_batch_no_lm = min(max_batch_no_lm, 8) # Cap at 8 + + # With LM: subtract the largest available LM from available + if available_lm_models: + largest_lm_size = list(lm_memory_gb.keys())[-1] + lm_usage = lm_memory_gb[largest_lm_size] + remaining_for_inference = available - lm_usage + max_batch_with_lm = max(1, int(remaining_for_inference / inference_per_batch)) + max_batch_with_lm = min(max_batch_with_lm, 8) + else: 
+ max_batch_with_lm = max_batch_no_lm + + # Determine duration limits based on available VRAM + # Longer durations need more VRAM for latents + if total_vram_gb >= 24: + max_dur_lm = 600 + max_dur_no_lm = 600 + elif total_vram_gb >= 20: + max_dur_lm = 480 + max_dur_no_lm = 480 + elif total_vram_gb >= 16: + max_dur_lm = 360 + max_dur_no_lm = 480 + elif total_vram_gb >= 12: + max_dur_lm = 240 + max_dur_no_lm = 360 + elif total_vram_gb >= 8: + max_dur_lm = 240 + max_dur_no_lm = 360 + else: + max_dur_lm = 180 + max_dur_no_lm = 180 + + tier = get_gpu_tier(total_vram_gb) + tier_config = GPU_TIER_CONFIGS.get(tier, {}) + + return GPUConfig( + tier=tier, + gpu_memory_gb=total_vram_gb, + max_duration_with_lm=max_dur_lm, + max_duration_without_lm=max_dur_no_lm, + max_batch_size_with_lm=max_batch_with_lm, + max_batch_size_without_lm=max_batch_no_lm, + init_lm_default=bool(available_lm_models), + available_lm_models=available_lm_models, + recommended_lm_model=tier_config.get("recommended_lm_model", available_lm_models[0] if available_lm_models else ""), + lm_backend_restriction=tier_config.get("lm_backend_restriction", "all"), + recommended_backend=tier_config.get("recommended_backend", "vllm"), + offload_to_cpu_default=tier_config.get("offload_to_cpu_default", True), + offload_dit_to_cpu_default=tier_config.get("offload_dit_to_cpu_default", True), + quantization_default=tier_config.get("quantization_default", True), + compile_model_default=tier_config.get("compile_model_default", True), + lm_memory_gb=lm_memory_gb, + ) + + +def get_effective_free_vram_gb(device_index: int = 0) -> float: + """ + Get the effective free VRAM in GB, accounting for per-process memory fraction. + + torch.cuda.mem_get_info() reports *device-level* free memory, which ignores + the per-process cap set by torch.cuda.set_per_process_memory_fraction(). + + This function computes: + effective_free = min(device_free, process_allowed - process_allocated) + + where process_allowed = total_memory * memory_fraction. + + Returns 0 if no GPU is available or on error. + """ + try: + import torch + if not torch.cuda.is_available(): + return 0.0 + + device_free_bytes, total_bytes = torch.cuda.mem_get_info(device_index) + + # Check if a per-process memory fraction has been set + # We detect this by checking MAX_CUDA_VRAM env var (our simulation mechanism) + debug_vram = os.environ.get(DEBUG_MAX_CUDA_VRAM_ENV) + if debug_vram is not None: + try: + simulated_gb = float(debug_vram) + total_gb = total_bytes / (1024 ** 3) + if simulated_gb < total_gb: + # Per-process cap is active. + # Use the same reference context as set_per_process_memory_fraction. + ref_context_gb = MODEL_VRAM.get("cuda_context", 0.5) + allocator_budget_gb = max(0.5, simulated_gb - ref_context_gb) + allocator_budget_bytes = allocator_budget_gb * (1024 ** 3) + reserved_bytes = torch.cuda.memory_reserved(device_index) + # Free = what the allocator is allowed minus what it has reserved + process_free = allocator_budget_bytes - reserved_bytes + effective_free = min(device_free_bytes, process_free) + return max(0.0, effective_free / (1024 ** 3)) + except (ValueError, TypeError): + pass + + return device_free_bytes / (1024 ** 3) + except Exception: + return 0.0 + + +def get_available_vram_gb() -> float: + """ + Get currently available (free) GPU VRAM in GB. + Returns 0 if no GPU is available or on error. + + This is an alias for get_effective_free_vram_gb() that accounts for + per-process memory fraction caps. 
+ """ + return get_effective_free_vram_gb() + + +def estimate_inference_vram( + batch_size: int, + duration_s: float, + dit_type: str = "turbo", + with_lm: bool = False, + lm_size: str = "0.6B", +) -> float: + """ + Estimate total VRAM needed for a generation request. + + Args: + batch_size: Number of samples to generate + duration_s: Audio duration in seconds + dit_type: "turbo" or "base" + with_lm: Whether LM is loaded + lm_size: LM model size if with_lm is True + + Returns: + Estimated VRAM in GB + """ + # Base model weights + dit_key = f"dit_{dit_type}" if f"dit_{dit_type}" in MODEL_VRAM else "dit_turbo" + base = ( + MODEL_VRAM[dit_key] + + MODEL_VRAM["vae"] + + MODEL_VRAM["text_encoder"] + + MODEL_VRAM["cuda_context"] + ) + + # DiT inference activations (scales with batch size and duration) + per_batch = DIT_INFERENCE_VRAM_PER_BATCH.get(dit_type, 0.8) + # Duration scaling: longer audio = more latent frames = more memory + duration_factor = max(1.0, duration_s / 60.0) # Normalize to 60s baseline + inference = per_batch * batch_size * duration_factor + + # LM memory + lm_mem = 0.0 + if with_lm and lm_size in LM_VRAM: + lm_info = LM_VRAM[lm_size] + lm_mem = lm_info["weights"] + lm_info["kv_cache_4k"] + + return base + inference + lm_mem + VRAM_SAFETY_MARGIN_GB + + def check_duration_limit( duration: float, gpu_config: GPUConfig, diff --git a/acestep/gradio_ui/events/__init__.py b/acestep/gradio_ui/events/__init__.py index dd1a4122..c76c6c07 100644 --- a/acestep/gradio_ui/events/__init__.py +++ b/acestep/gradio_ui/events/__init__.py @@ -70,6 +70,9 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase generation_section["cfg_interval_start"], generation_section["cfg_interval_end"], generation_section["task_type"], + # GPU-config-aware limits (updated after initialization) + generation_section["audio_duration"], + generation_section["batch_size_input"], ] ) diff --git a/acestep/gradio_ui/events/generation_handlers.py b/acestep/gradio_ui/events/generation_handlers.py index e8fc6eeb..8a0dd0ca 100644 --- a/acestep/gradio_ui/events/generation_handlers.py +++ b/acestep/gradio_ui/events/generation_handlers.py @@ -8,13 +8,14 @@ import glob import gradio as gr from typing import Optional, List, Tuple +from loguru import logger from acestep.constants import ( TASK_TYPES_TURBO, TASK_TYPES_BASE, ) from acestep.gradio_ui.i18n import t from acestep.inference import understand_music, create_sample, format_sample -from acestep.gpu_config import get_global_gpu_config +from acestep.gpu_config import get_global_gpu_config, is_lm_model_size_allowed, find_best_lm_model_on_disk def clamp_duration_to_gpu_limit(duration_value: Optional[float], llm_handler=None) -> Optional[float]: @@ -441,10 +442,38 @@ def update_model_type_settings(config_path): def init_service_wrapper(dit_handler, llm_handler, checkpoint, config_path, device, init_llm, lm_model_path, backend, use_flash_attention, offload_to_cpu, offload_dit_to_cpu, compile_model, quantization): - """Wrapper for service initialization, returns status, button state, accordion state, and model type settings""" + """Wrapper for service initialization, returns status, button state, accordion state, model type settings, and GPU-config-aware UI limits.""" # Convert quantization checkbox to value (int8_weight_only if checked, None if not) quant_value = "int8_weight_only" if quantization else None + # --- Tier-aware validation before initialization --- + gpu_config = get_global_gpu_config() + + # Validate LM request against GPU tier + if 
init_llm and not gpu_config.available_lm_models: + init_llm = False # Force disable LM on tiers that can't support it + logger.warning(f"āš ļø LM initialization disabled: GPU tier {gpu_config.tier} ({gpu_config.gpu_memory_gb:.1f}GB) does not support LM") + + # Validate LM model against tier's available models (size-based matching) + if init_llm and lm_model_path and gpu_config.available_lm_models: + if not is_lm_model_size_allowed(lm_model_path, gpu_config.available_lm_models): + # The selected model's size class is not supported by this tier. + # Find a disk model that matches the recommended size. + all_disk_models = llm_handler.get_available_5hz_lm_models() if llm_handler else [] + fallback = find_best_lm_model_on_disk(gpu_config.recommended_lm_model, all_disk_models) + if fallback: + old_model = lm_model_path + lm_model_path = fallback + logger.warning(f"āš ļø LM model {old_model} size not supported for tier {gpu_config.tier}, falling back to {lm_model_path}") + else: + init_llm = False + logger.warning(f"āš ļø No compatible LM model found on disk for tier {gpu_config.tier}, disabling LM") + + # Validate backend against tier restriction + if init_llm and gpu_config.lm_backend_restriction == "pt_mlx_only" and backend == "vllm": + backend = gpu_config.recommended_backend # Fallback to pt + logger.warning(f"āš ļø vllm backend not supported for tier {gpu_config.tier} (VRAM too low for KV cache), falling back to {backend}") + # Initialize DiT handler status, enable = dit_handler.initialize_service( checkpoint, config_path, device, @@ -485,11 +514,37 @@ def init_service_wrapper(dit_handler, llm_handler, checkpoint, config_path, devi is_turbo = dit_handler.is_turbo_model() model_type_settings = get_model_type_ui_settings(is_turbo) + # --- Update UI limits based on GPU config and actual LM state --- + gpu_config = get_global_gpu_config() + lm_actually_initialized = llm_handler.llm_initialized if llm_handler else False + max_duration = gpu_config.max_duration_with_lm if lm_actually_initialized else gpu_config.max_duration_without_lm + max_batch = gpu_config.max_batch_size_with_lm if lm_actually_initialized else gpu_config.max_batch_size_without_lm + + duration_update = gr.update( + maximum=float(max_duration), + info=f"Duration in seconds (-1 for auto). 
Max: {max_duration}s / {max_duration // 60} min" + ) + batch_update = gr.update( + value=min(2, max_batch), # Clamp value to new maximum to avoid Gradio validation error + maximum=max_batch, + info=f"Number of samples to generate (Max: {max_batch})" + ) + + # Add GPU config info to status + status += f"\nšŸ“Š GPU Config: tier={gpu_config.tier}, max_duration={max_duration}s, max_batch={max_batch}" + if gpu_config.available_lm_models: + status += f", available_lm={gpu_config.available_lm_models}" + else: + status += ", LM not available for this GPU tier" + return ( status, gr.update(interactive=enable), accordion_state, - *model_type_settings + *model_type_settings, + # GPU-config-aware UI updates + duration_update, + batch_update, ) diff --git a/acestep/gradio_ui/interfaces/generation.py b/acestep/gradio_ui/interfaces/generation.py index a30bc877..564b07a0 100644 --- a/acestep/gradio_ui/interfaces/generation.py +++ b/acestep/gradio_ui/interfaces/generation.py @@ -12,7 +12,7 @@ DEFAULT_DIT_INSTRUCTION, ) from acestep.gradio_ui.i18n import t -from acestep.gpu_config import get_global_gpu_config, GPUConfig +from acestep.gpu_config import get_global_gpu_config, GPUConfig, is_lm_model_size_allowed, find_best_lm_model_on_disk def create_generation_section(dit_handler, llm_handler, init_params=None, language='en') -> dict: @@ -48,17 +48,37 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua default_batch_size = min(2, max_batch_size) # Default to 2 or max if lower init_lm_default = gpu_config.init_lm_default - # Determine default offload setting - # If XPU is detected, default offload to False (keep models on device) - # Otherwise default to True (offload to CPU to save VRAM) - default_offload = True + # Determine default offload setting from GPU config tier + # XPU override: if XPU is detected, keep models on device + default_offload = gpu_config.offload_to_cpu_default + default_offload_dit = gpu_config.offload_dit_to_cpu_default try: import torch if hasattr(torch, 'xpu') and torch.xpu.is_available(): default_offload = False + default_offload_dit = False except ImportError: pass + # Tier-aware LM defaults + default_quantization = gpu_config.quantization_default + default_compile = gpu_config.compile_model_default + # macOS override: disable quantization on macOS due to torchao incompatibilities + if sys.platform == "darwin": + default_quantization = False + + # Backend choices based on tier restriction + if gpu_config.lm_backend_restriction == "pt_mlx_only": + available_backends = ["pt", "mlx"] + else: + available_backends = ["vllm", "pt", "mlx"] + recommended_backend = gpu_config.recommended_backend + if recommended_backend not in available_backends: + recommended_backend = available_backends[0] + + # Recommended LM model: use tier config, fallback to first available + recommended_lm = gpu_config.recommended_lm_model + with gr.Group(): # Service Configuration - collapse if pre-initialized, hide if in service mode accordion_open = not service_pre_initialized @@ -115,9 +135,20 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua ) with gr.Row(): - # Get available 5Hz LM model list - available_lm_models = llm_handler.get_available_5hz_lm_models() - default_lm_model = "acestep-5Hz-lm-0.6B" if "acestep-5Hz-lm-0.6B" in available_lm_models else (available_lm_models[0] if available_lm_models else None) + # Get available 5Hz LM model list from disk, then filter by GPU tier + all_lm_models = llm_handler.get_available_5hz_lm_models() + # Filter 
to only show models whose size class is supported by this tier + # e.g., tier3 allows "0.6B" → keep "acestep-5Hz-lm-0.6B-v4-fix" on disk + tier_lm_models = gpu_config.available_lm_models + if tier_lm_models: + filtered_lm_models = [m for m in all_lm_models if is_lm_model_size_allowed(m, tier_lm_models)] + # If no tier models found on disk, show all disk models (user may have custom checkpoints) + available_lm_models = filtered_lm_models if filtered_lm_models else all_lm_models + else: + available_lm_models = all_lm_models + + # Use recommended model from tier config, find best match on disk + default_lm_model = find_best_lm_model_on_disk(recommended_lm, available_lm_models) # Set lm_model_path value from init_params if pre-initialized lm_model_path_value = init_params.get('lm_model_path', default_lm_model) if service_pre_initialized else default_lm_model @@ -125,25 +156,29 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua label=t("service.lm_model_path_label"), choices=available_lm_models, value=lm_model_path_value, - info=t("service.lm_model_path_info") + info=t("service.lm_model_path_info") + (f" (Recommended: {recommended_lm})" if recommended_lm else " (LM not available for this GPU tier)") ) - # Set backend value from init_params if pre-initialized - backend_value = init_params.get('backend', 'vllm') if service_pre_initialized else 'vllm' + # Set backend value from init_params if pre-initialized, using tier-recommended backend + backend_value = init_params.get('backend', recommended_backend) if service_pre_initialized else recommended_backend backend_dropdown = gr.Dropdown( - choices=["vllm", "pt", "mlx"], + choices=available_backends, value=backend_value, label=t("service.backend_label"), - info=t("service.backend_info") + info=t("service.backend_info") + (f" (vllm unavailable for {gpu_config.tier}: VRAM too low)" if gpu_config.lm_backend_restriction == "pt_mlx_only" else "") ) # Checkbox options section - all checkboxes grouped together + # Defaults are tier-aware (set above from gpu_config) with gr.Row(): - # Set init_llm value from init_params if pre-initialized, otherwise use GPU config default + # LM checkbox: for tiers with no LM support, default off and show info init_llm_value = init_params.get('init_llm', init_lm_default) if service_pre_initialized else init_lm_default + lm_info_text = t("service.init_llm_info") + if not gpu_config.available_lm_models: + lm_info_text += " āš ļø LM not available for this GPU tier (VRAM too low)" init_llm_checkbox = gr.Checkbox( label=t("service.init_llm_label"), value=init_llm_value, - info=t("service.init_llm_info"), + info=lm_info_text, ) # Auto-detect flash attention availability flash_attn_available = dit_handler.is_flash_attention_available(device_value) @@ -155,35 +190,33 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua interactive=flash_attn_available, info=t("service.flash_attention_info_enabled") if flash_attn_available else t("service.flash_attention_info_disabled") ) - # Set offload_to_cpu value from init_params if pre-initialized (default True) + # Offload to CPU: tier-aware default offload_to_cpu_value = init_params.get('offload_to_cpu', default_offload) if service_pre_initialized else default_offload offload_to_cpu_checkbox = gr.Checkbox( label=t("service.offload_cpu_label"), value=offload_to_cpu_value, - info=t("service.offload_cpu_info") + info=t("service.offload_cpu_info") + (" (recommended for this tier)" if default_offload else " (optional for this tier)") 
) - # Set offload_dit_to_cpu value from init_params if pre-initialized (default True) - offload_dit_to_cpu_value = init_params.get('offload_dit_to_cpu', default_offload) if service_pre_initialized else default_offload + # Offload DiT to CPU: tier-aware default + offload_dit_to_cpu_value = init_params.get('offload_dit_to_cpu', default_offload_dit) if service_pre_initialized else default_offload_dit offload_dit_to_cpu_checkbox = gr.Checkbox( label=t("service.offload_dit_cpu_label"), value=offload_dit_to_cpu_value, - info=t("service.offload_dit_cpu_info") + info=t("service.offload_dit_cpu_info") + (" (recommended for this tier)" if default_offload_dit else " (optional for this tier)") ) - # Set compile_model value from init_params if pre-initialized (default True) - compile_model_value = init_params.get('compile_model', True) if service_pre_initialized else True + # Compile model: tier-aware default + compile_model_value = init_params.get('compile_model', default_compile) if service_pre_initialized else default_compile compile_model_checkbox = gr.Checkbox( label=t("service.compile_model_label"), value=compile_model_value, info=t("service.compile_model_info") ) - # Set quantization value from init_params if pre-initialized. - # Default to False on macOS to avoid torchao incompatibilities. - default_quantization = False if sys.platform == "darwin" else True + # Quantization: tier-aware default (macOS override already applied above) quantization_value = init_params.get('quantization', default_quantization) if service_pre_initialized else default_quantization quantization_checkbox = gr.Checkbox( label=t("service.quantization_label"), value=quantization_value, - info=t("service.quantization_info") + info=t("service.quantization_info") + (" (recommended for this tier)" if default_quantization else " (optional for this tier)") ) init_btn = gr.Button(t("service.init_btn"), variant="primary", size="lg") diff --git a/acestep/handler.py b/acestep/handler.py index b6fac652..1e278f40 100644 --- a/acestep/handler.py +++ b/acestep/handler.py @@ -46,7 +46,7 @@ ) from acestep.core.generation.handler import LoraManagerMixin, ProgressMixin from acestep.dit_alignment_score import MusicStampsAligner, MusicLyricScorer -from acestep.gpu_config import get_gpu_memory_gb, get_global_gpu_config +from acestep.gpu_config import get_gpu_memory_gb, get_global_gpu_config, get_effective_free_vram_gb warnings.filterwarnings("ignore") @@ -1169,37 +1169,182 @@ def _get_effective_mps_memory_gb(self) -> Optional[float]: # Align with gpu_config: MPS can use ~75% of unified memory for GPU workloads. return system_gb * 0.75 + # Maximum VAE decode chunk size. Larger chunks are faster but the + # PyTorch caching allocator may *reserve* significantly more VRAM than + # the peak *allocated* amount. Empirical measurements (bf16 VAE, + # ~10 GB baseline from DiT + LM): + # chunk peak_alloc peak_reserved + # 512 11.9 GB 12.7 GB + # 1024 13.1 GB 15.0 GB ← dangerously close to 16 GB + # 1536 14.4 GB 17.2 GB ← exceeds 16 GB + # Capping at 512 keeps reserved VRAM safely under 16 GB on consumer + # GPUs while the speed difference vs 1024/1536 is negligible for + # tiled decode (a few hundred ms). + VAE_DECODE_MAX_CHUNK_SIZE = 512 + def _get_auto_decode_chunk_size(self) -> int: - """Choose a conservative VAE decode chunk size based on memory.""" + """Choose a conservative VAE decode chunk size based on available memory. + + For CUDA GPUs, uses actual free VRAM to determine chunk size. + For MPS, uses effective memory estimate. 
+ Larger chunks are faster but use more VRAM; smaller chunks are safer. + The result is capped at ``VAE_DECODE_MAX_CHUNK_SIZE`` to prevent the + PyTorch caching allocator from over-reserving VRAM on consumer GPUs. + """ override = os.environ.get("ACESTEP_VAE_DECODE_CHUNK_SIZE") if override: try: value = int(override) if value > 0: - return value + return value # explicit override bypasses the cap except ValueError: pass + + max_chunk = self.VAE_DECODE_MAX_CHUNK_SIZE + if self.device == "mps": mem_gb = self._get_effective_mps_memory_gb() if mem_gb is not None: if mem_gb >= 48: - return 1536 + return min(1536, max_chunk) if mem_gb >= 24: - return 1024 - return 512 + return min(1024, max_chunk) + return min(512, max_chunk) + + # CUDA: use effective free VRAM (respects per-process memory fraction) to pick chunk size + if self.device == "cuda" or (isinstance(self.device, str) and self.device.startswith("cuda")): + try: + free_gb = get_effective_free_vram_gb() + except Exception: + free_gb = 0 + logger.debug(f"[_get_auto_decode_chunk_size] Effective free VRAM: {free_gb:.2f} GB") + # VAE decode peak VRAM (allocated) scales roughly with chunk_size. + # Empirical: chunk_size=512 needs ~1.3 GB, 1024 needs ~2.6 GB, 1536 needs ~3.9 GB + # chunk_size=128 needs ~0.3 GB, chunk_size=64 needs ~0.3 GB + if free_gb >= 8.0: + return min(512, max_chunk) + elif free_gb >= 5.0: + return min(512, max_chunk) + elif free_gb >= 2.5: + return min(512, max_chunk) + elif free_gb >= 1.0: + return 256 + elif free_gb >= 0.5: + return 128 # Very tight VRAM + else: + return 64 # Extremely tight VRAM — minimal chunk + + return min(512, max_chunk) def _should_offload_wav_to_cpu(self) -> bool: - """Decide whether to offload decoded wavs to CPU for memory safety.""" + """Decide whether to offload decoded wavs to CPU for memory safety. + + For CUDA GPUs with >=24 GB free, keep on GPU for speed. + For MPS with >=32 GB, keep on GPU. + Otherwise offload to CPU to avoid OOM during concatenation. + """ override = os.environ.get("ACESTEP_MPS_DECODE_OFFLOAD") if override: return override.lower() in ("1", "true", "yes") - if self.device != "mps": + if self.device == "mps": + mem_gb = self._get_effective_mps_memory_gb() + if mem_gb is not None and mem_gb >= 32: + return False return True - mem_gb = self._get_effective_mps_memory_gb() - if mem_gb is not None and mem_gb >= 32: - return False + # CUDA: offload unless plenty of free VRAM + if self.device == "cuda" or (isinstance(self.device, str) and self.device.startswith("cuda")): + try: + free_gb = get_effective_free_vram_gb() + logger.debug(f"[_should_offload_wav_to_cpu] Effective free VRAM: {free_gb:.2f} GB") + if free_gb >= 24.0: + return False + except Exception: + pass return True + def _vram_guard_reduce_batch( + self, + batch_size: int, + audio_duration: Optional[float] = None, + use_lm: bool = False, + ) -> int: + """Pre-inference VRAM guard: auto-reduce batch_size if free VRAM is tight. + + Rough activation estimate per batch element: + - DiT forward pass: ~0.8 GB per sample at 60s, scales linearly with duration + - LM inference: KV cache is pre-allocated so batch doesn't change it much + - VAE decode: handled separately via tiled_decode + + We leave a 1.5 GB safety margin for CUDA allocator fragmentation. + + IMPORTANT: When offload_to_cpu is True, the LM model (especially vllm + backend) may still be on GPU when this guard runs, but it will be + offloaded or its memory reclaimed before DiT actually needs the VRAM. 
+ In that case we trust the static GPU tier config limits (which have been + empirically validated) and skip the dynamic VRAM check. + """ + if batch_size <= 1: + return batch_size + + device = self.device + if device == "cpu" or device == "mps": + return batch_size # No CUDA VRAM to guard + + # When CPU offload is enabled, the current free VRAM is misleading because + # the LM (vllm KV cache + weights) may still be on GPU at this point but + # will be released/reclaimed before DiT actually uses the VRAM. The static + # GPU tier configs already encode safe batch limits that were empirically + # validated with offload enabled, so trust them. + # + # Use the more conservative max_batch_size_with_lm as the threshold since + # the handler doesn't know if LM was used upstream. This is safe because + # max_batch_size_with_lm <= max_batch_size_without_lm for all tiers. + if self.offload_to_cpu: + gpu_config = get_global_gpu_config() + if gpu_config is not None: + tier_max = gpu_config.max_batch_size_with_lm + if batch_size <= tier_max: + logger.debug( + f"[VRAM guard] offload_to_cpu=True, batch_size={batch_size} <= " + f"tier limit {tier_max} — skipping dynamic VRAM check " + f"(LM will be offloaded before DiT runs)" + ) + return batch_size + # batch_size exceeds tier limit — fall through to dynamic check + + try: + free_gb = get_effective_free_vram_gb() + except Exception: + return batch_size + + # Estimate per-sample activation cost for DiT + duration_sec = float(audio_duration) if audio_duration and float(audio_duration) > 0 else 60.0 + # Empirical: ~0.8 GB per sample at 60s, linear scaling + per_sample_gb = 0.8 * (duration_sec / 60.0) + # If using cfg (base model), double the per-sample cost + if hasattr(self, 'model') and self.model is not None: + model_name = getattr(self, 'config_path', '') or '' + if 'base' in model_name.lower(): + per_sample_gb *= 2.0 + + safety_margin_gb = 1.5 + available_for_batch = free_gb - safety_margin_gb + + if available_for_batch <= 0: + logger.warning( + f"[VRAM guard] Only {free_gb:.1f} GB free — reducing batch_size to 1" + ) + return 1 + + max_safe_batch = max(1, int(available_for_batch / per_sample_gb)) + if max_safe_batch < batch_size: + logger.warning( + f"[VRAM guard] Free VRAM {free_gb:.1f} GB can safely fit ~{max_safe_batch} samples " + f"(requested {batch_size}). Reducing batch_size to {max_safe_batch}." + ) + return max_safe_batch + + return batch_size def _get_vae_dtype(self, device: Optional[str] = None) -> torch.dtype: """Get VAE dtype based on target device and GPU tier.""" target_device = device or self.device @@ -2569,6 +2714,8 @@ def tiled_decode(self, latents, chunk_size: Optional[int] = None, overlap: int = if offload_wav_to_cpu is None: offload_wav_to_cpu = self._should_offload_wav_to_cpu() + logger.info(f"[tiled_decode] chunk_size={chunk_size}, offload_wav_to_cpu={offload_wav_to_cpu}, latents_shape={latents.shape}") + # MPS Conv1d has a hard output-size limit that the OobleckDecoder # exceeds during temporal upsampling with large chunks. Reduce # chunk_size to keep each VAE decode within the MPS kernel limits @@ -2621,13 +2768,44 @@ def _tiled_decode_inner(self, latents, chunk_size, overlap, offload_wav_to_cpu): """Core tiled decode logic (extracted for fallback wrapping).""" B, C, T = latents.shape + # ---- Batch-sequential decode ---- + # VAE decode VRAM scales linearly with batch size. On tight-VRAM GPUs + # (e.g. 8 GB) decoding the whole batch at once can OOM. 
Process one + # sample at a time so peak VRAM stays constant regardless of batch size. + if B > 1: + logger.info(f"[tiled_decode] Batch size {B} > 1 — decoding samples sequentially to save VRAM") + per_sample_results = [] + for b_idx in range(B): + single = latents[b_idx : b_idx + 1] # [1, C, T] + decoded = self._tiled_decode_inner(single, chunk_size, overlap, offload_wav_to_cpu) + # Move to CPU immediately to free GPU VRAM for next sample + per_sample_results.append(decoded.cpu() if decoded.device.type != "cpu" else decoded) + self._empty_cache() + # Concatenate on CPU then move back if needed + result = torch.cat(per_sample_results, dim=0) # [B, channels, samples] + if latents.device.type != "cpu" and not offload_wav_to_cpu: + result = result.to(latents.device) + return result + + # Adjust overlap for very small chunk sizes to ensure positive stride + effective_overlap = overlap + while chunk_size - 2 * effective_overlap <= 0 and effective_overlap > 0: + effective_overlap = effective_overlap // 2 + if effective_overlap != overlap: + logger.warning(f"[tiled_decode] Reduced overlap from {overlap} to {effective_overlap} for chunk_size={chunk_size}") + overlap = effective_overlap + # If short enough, decode directly if T <= chunk_size: - # Decode and immediately extract .sample to avoid keeping DecoderOutput object - decoder_output = self.vae.decode(latents) - result = decoder_output.sample - del decoder_output - return result + try: + decoder_output = self.vae.decode(latents) + result = decoder_output.sample + del decoder_output + return result + except torch.cuda.OutOfMemoryError: + logger.warning("[tiled_decode] OOM on direct decode, falling back to CPU VAE decode") + self._empty_cache() + return self._decode_on_cpu(latents) # Calculate stride (core size) stride = chunk_size - 2 * overlap @@ -2638,10 +2816,25 @@ def _tiled_decode_inner(self, latents, chunk_size, overlap, offload_wav_to_cpu): if offload_wav_to_cpu: # Optimized path: offload wav to CPU immediately to save VRAM - return self._tiled_decode_offload_cpu(latents, B, T, stride, overlap, num_steps) + try: + return self._tiled_decode_offload_cpu(latents, B, T, stride, overlap, num_steps) + except torch.cuda.OutOfMemoryError: + logger.warning(f"[tiled_decode] OOM during offload_cpu decode with chunk_size={chunk_size}, falling back to CPU VAE decode") + self._empty_cache() + return self._decode_on_cpu(latents) else: # Default path: keep everything on GPU - return self._tiled_decode_gpu(latents, B, T, stride, overlap, num_steps) + try: + return self._tiled_decode_gpu(latents, B, T, stride, overlap, num_steps) + except torch.cuda.OutOfMemoryError: + logger.warning(f"[tiled_decode] OOM during GPU decode with chunk_size={chunk_size}, falling back to CPU offload path") + self._empty_cache() + try: + return self._tiled_decode_offload_cpu(latents, B, T, stride, overlap, num_steps) + except torch.cuda.OutOfMemoryError: + logger.warning("[tiled_decode] OOM even with offload path, falling back to full CPU VAE decode") + self._empty_cache() + return self._decode_on_cpu(latents) def _tiled_decode_gpu(self, latents, B, T, stride, overlap, num_steps): """Standard tiled decode keeping all data on GPU.""" @@ -2769,6 +2962,44 @@ def _tiled_decode_offload_cpu(self, latents, B, T, stride, overlap, num_steps): return final_audio + def _decode_on_cpu(self, latents): + """ + Emergency fallback: move VAE to CPU, decode there, then restore. + + This is used when GPU VRAM is too tight for even the smallest tiled decode. + Slower but guarantees no OOM on GPU. 
+ """ + logger.warning("[_decode_on_cpu] Moving VAE to CPU for decode (VRAM too tight for GPU decode)") + + # Remember original device + try: + original_device = next(self.vae.parameters()).device + except StopIteration: + original_device = torch.device("cpu") + + # Move VAE to CPU + vae_cpu_dtype = self._get_vae_dtype("cpu") + self._recursive_to_device(self.vae, "cpu", vae_cpu_dtype) + self._empty_cache() + + # Move latents to CPU + latents_cpu = latents.cpu().to(vae_cpu_dtype) + + # Decode on CPU (no tiling needed — CPU has plenty of RAM) + try: + with torch.inference_mode(): + decoder_output = self.vae.decode(latents_cpu) + result = decoder_output.sample + del decoder_output + finally: + # Restore VAE to original device + if original_device.type != "cpu": + vae_gpu_dtype = self._get_vae_dtype(str(original_device)) + self._recursive_to_device(self.vae, original_device, vae_gpu_dtype) + + logger.info(f"[_decode_on_cpu] CPU decode complete, result shape={result.shape}") + return result # result stays on CPU — fine for audio post-processing + def tiled_encode(self, audio, chunk_size=None, overlap=None, offload_latent_to_cpu=True): """ Encode audio to latents using tiling to reduce VRAM usage. @@ -3036,6 +3267,15 @@ def _has_audio_codes(v: Union[str, List[str]]) -> bool: actual_batch_size = batch_size if batch_size is not None else self.batch_size actual_batch_size = max(1, actual_batch_size) # Ensure at least 1 + # ---- Pre-inference VRAM guard ---- + # Estimate whether the requested batch_size fits in free VRAM and + # auto-reduce if it does not. This prevents OOM crashes at the cost + # of generating fewer samples. + actual_batch_size = self._vram_guard_reduce_batch( + actual_batch_size, + audio_duration=audio_duration, + ) + actual_seed_list, seed_value_for_ui = self.prepare_seeds(actual_batch_size, seed, use_random_seed) # Convert special values to None @@ -3209,10 +3449,16 @@ def _has_audio_codes(v: Union[str, List[str]]) -> bool: logger.debug(f"[generate_music] Before VAE decode: allocated={self._memory_allocated()/1024**3:.2f}GB, max={self._max_memory_allocated()/1024**3:.2f}GB") - # ROCm fix: decode VAE on CPU to bypass MIOpen workspace bugs - # On APUs with unified memory this has zero data-transfer cost + # Check effective free VRAM and auto-enable CPU decode if extremely tight import os as _os _vae_cpu = _os.environ.get("ACESTEP_VAE_ON_CPU", "0").lower() in ("1", "true", "yes") + if not _vae_cpu: + _effective_free = get_effective_free_vram_gb() + logger.info(f"[generate_music] Effective free VRAM before VAE decode: {_effective_free:.2f} GB") + # If less than 0.5 GB free, VAE decode on GPU will almost certainly OOM + if _effective_free < 0.5: + logger.warning(f"[generate_music] Only {_effective_free:.2f} GB free VRAM — auto-enabling CPU VAE decode") + _vae_cpu = True if _vae_cpu: logger.info("[generate_music] Moving VAE to CPU for decode (ACESTEP_VAE_ON_CPU=1)...") _vae_device = next(self.vae.parameters()).device diff --git a/acestep/llm_inference.py b/acestep/llm_inference.py index 22b81b95..dac14b00 100644 --- a/acestep/llm_inference.py +++ b/acestep/llm_inference.py @@ -3831,12 +3831,18 @@ def get_hf_model_for_scoring(self): load_time = time.time() - start_time logger.info(f"HuggingFace model loaded in {load_time:.2f}s") - # Move to same device as vllm model - device = next(model_runner.model.parameters()).device - self._hf_model_for_scoring = self._hf_model_for_scoring.to(device) - self._hf_model_for_scoring.eval() - - logger.info(f"HuggingFace model for scoring ready on 
{device}") + # When offload_to_cpu is enabled, keep the model on CPU to save + # VRAM. The caller (_load_scoring_model_context in + # test_time_scaling.py) will move it to the accelerator only for + # the duration of the forward pass. + if self.offload_to_cpu: + self._hf_model_for_scoring.eval() + logger.info("HuggingFace model for scoring kept on CPU (offload_to_cpu=True)") + else: + device = next(model_runner.model.parameters()).device + self._hf_model_for_scoring = self._hf_model_for_scoring.to(device) + self._hf_model_for_scoring.eval() + logger.info(f"HuggingFace model for scoring ready on {device}") return self._hf_model_for_scoring @@ -3860,12 +3866,16 @@ def get_hf_model_for_scoring(self): load_time = time.time() - start_time logger.info(f"HuggingFace model loaded in {load_time:.2f}s") - # Keep on CPU for MPS (scoring is not perf-critical) - device = "mps" if hasattr(torch.backends, "mps") and torch.backends.mps.is_available() else "cpu" - self._hf_model_for_scoring = self._hf_model_for_scoring.to(device) - self._hf_model_for_scoring.eval() - - logger.info(f"HuggingFace model for scoring ready on {device}") + # When offload_to_cpu is enabled, keep on CPU; the scoring + # context manager will move it to the accelerator as needed. + if self.offload_to_cpu: + self._hf_model_for_scoring.eval() + logger.info("HuggingFace model for scoring kept on CPU (offload_to_cpu=True)") + else: + device = "mps" if hasattr(torch.backends, "mps") and torch.backends.mps.is_available() else "cpu" + self._hf_model_for_scoring = self._hf_model_for_scoring.to(device) + self._hf_model_for_scoring.eval() + logger.info(f"HuggingFace model for scoring ready on {device}") return self._hf_model_for_scoring diff --git a/acestep/test_time_scaling.py b/acestep/test_time_scaling.py index 19964840..827f28e5 100644 --- a/acestep/test_time_scaling.py +++ b/acestep/test_time_scaling.py @@ -2,13 +2,15 @@ Test-Time Scaling Module Implements perplexity-based scoring for generated audio codes """ +import contextlib +import math +import re + import torch import torch.nn.functional as F -from typing import Tuple, Optional, Dict, Any, List -from loguru import logger import yaml -import math -import re +from loguru import logger +from typing import Tuple, Optional, Dict, Any, List def pmi_score(log_prob_conditional: float, log_prob_unconditional: float) -> float: @@ -62,6 +64,52 @@ def pmi_to_normalized_score(pmi: float, scale: float = 0.1) -> float: return 1.0 / (1.0 + math.exp(-pmi / scale)) +@contextlib.contextmanager +def _load_scoring_model_context(llm_handler): + """ + Context manager that loads the HF scoring model to the accelerator device + before use and offloads it back to CPU afterwards. + + For the ``pt`` backend the existing ``_load_model_context()`` already + handles offloading, so we just delegate to it. For ``vllm`` / ``mlx`` + backends, ``get_hf_model_for_scoring()`` caches a *separate* HF model + that would otherwise stay on GPU permanently — here we move it to GPU + only for the duration of the scoring forward pass and move it back to + CPU when done, freeing VRAM for DiT / VAE. 
+ """ + backend = getattr(llm_handler, "llm_backend", "pt") + + if backend == "pt": + # pt backend: _load_model_context already handles GPU ↔ CPU + with llm_handler._load_model_context(): + yield + return + + # vllm / mlx: manage the cached HF model ourselves + model = llm_handler.get_hf_model_for_scoring() + if model is None: + yield + return + + offload = getattr(llm_handler, "offload_to_cpu", False) + device = llm_handler.device if hasattr(llm_handler, "device") else "cpu" + + if offload and hasattr(model, "to"): + logger.info(f"[scoring] Loading HF scoring model to {device}") + model.to(device) + + try: + yield + finally: + if offload and hasattr(model, "to"): + logger.info("[scoring] Offloading HF scoring model to CPU") + model.to("cpu") + if torch.cuda.is_available(): + torch.cuda.empty_cache() + elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available() and hasattr(torch, "mps") and hasattr(torch.mps, "empty_cache"): + torch.mps.empty_cache() + + def _get_logits_and_target_for_scoring(llm_handler, formatted_prompt: str, target_text: str) -> Tuple[torch.Tensor, torch.Tensor]: """ @@ -77,7 +125,18 @@ def _get_logits_and_target_for_scoring(llm_handler, formatted_prompt: str, """ model = llm_handler.get_hf_model_for_scoring() tokenizer = llm_handler.llm_tokenizer - device = llm_handler.device if llm_handler.llm_backend == "pt" else next(model.parameters()).device + + # Determine the device the model is *currently* on (it may be on CPU + # if offload_to_cpu is active — _load_scoring_model_context will move + # it to the accelerator before the forward pass). + backend = getattr(llm_handler, "llm_backend", "pt") + if backend == "pt": + device = llm_handler.device + else: + # For vllm/mlx the scoring model may be on CPU right now; + # use the handler's target device so tensors land on the right device + # once the model is moved there by the context manager. + device = llm_handler.device if hasattr(llm_handler, "device") else next(model.parameters()).device # 1. Tokenize prompt ONLY to get its length (used for slicing later). # We must ensure special tokens are added to count the offset correctly. @@ -96,18 +155,17 @@ def _get_logits_and_target_for_scoring(llm_handler, formatted_prompt: str, return torch.empty(0, device=device), torch.empty(0, device=device) # 3. Forward Pass (Teacher Forcing) + # _load_scoring_model_context ensures the model is on-device for the + # forward pass and offloaded back to CPU afterwards. with torch.no_grad(): - with llm_handler._load_model_context(): + with _load_scoring_model_context(llm_handler): outputs = model(input_ids=input_ids, attention_mask=full_tokens['attention_mask']) all_logits = outputs.logits # [1, seq_len, vocab_size] - # 4. Extract Logits and Labels - # We need to predict `input_ids[i]`. The logit for this is at `all_logits[i-1]`. - # Target starts at index `prompt_len`. - # So we need logits from `prompt_len - 1` up to the second to last position. - - target_logits = all_logits[0, prompt_len - 1:-1, :] # [target_len, vocab_size] - target_ids = input_ids[0, prompt_len:] # [target_len] + # 4. Extract Logits and Labels — move to CPU so downstream scoring + # does not keep large vocab-sized tensors on GPU. 
+ target_logits = all_logits[0, prompt_len - 1:-1, :].cpu() # [target_len, vocab_size] + target_ids = input_ids[0, prompt_len:].cpu() # [target_len] return target_logits, target_ids diff --git a/acestep/third_parts/nano-vllm/nanovllm/engine/model_runner.py b/acestep/third_parts/nano-vllm/nanovllm/engine/model_runner.py index c564265a..b4f25caa 100644 --- a/acestep/third_parts/nano-vllm/nanovllm/engine/model_runner.py +++ b/acestep/third_parts/nano-vllm/nanovllm/engine/model_runner.py @@ -218,6 +218,22 @@ def allocate_kv_cache(self): hf_config = config.hf_config free, total = torch.cuda.mem_get_info() current = torch.cuda.memory_stats()["allocated_bytes.all.current"] + + # Account for per-process memory fraction (set via MAX_CUDA_VRAM simulation) + import os as _os + _debug_vram = _os.environ.get("MAX_CUDA_VRAM") + if _debug_vram is not None: + try: + _simulated_gb = float(_debug_vram) + _total_gb = total / (1024 ** 3) + if _simulated_gb < _total_gb: + # Effective total and free are capped by simulation + reserved = torch.cuda.memory_reserved() + total = int(_simulated_gb * (1024 ** 3)) + free = max(0, total - reserved) + except (ValueError, TypeError): + pass + num_kv_heads = hf_config.num_key_value_heads // self.world_size head_dim = getattr(hf_config, "head_dim", hf_config.hidden_size // hf_config.num_attention_heads) block_bytes = 2 * hf_config.num_hidden_layers * self.block_size * num_kv_heads * head_dim * self.dtype.itemsize @@ -228,6 +244,13 @@ def allocate_kv_cache(self): target_total_usage = total * config.gpu_memory_utilization available_for_kv_cache = min(free * 0.9, target_total_usage - current) + # Safety check: ensure we leave at least ~1 GB free for DiT inference + # activations that will run after LM generation. Without this, the KV + # cache can consume all free VRAM and cause OOM during DiT forward pass. + MIN_RESERVE_BYTES = int(1.0 * 1024**3) # 1 GB reserved for other models + max_kv_from_free = max(0, free - MIN_RESERVE_BYTES) * 0.9 + available_for_kv_cache = min(available_for_kv_cache, max_kv_from_free) + # Ensure we have positive memory available if available_for_kv_cache <= 0: available_for_kv_cache = free * 0.5 # Fallback to 50% of free memory @@ -242,11 +265,21 @@ def allocate_kv_cache(self): ) max_tokens_capacity = config.num_kvcache_blocks * self.block_size kv_cache_size_gb = config.num_kvcache_blocks * block_bytes / 1024**3 + + # If KV cache would leave less than 1 GB free, warn and suggest reducing max_model_len + post_kv_free = (free - config.num_kvcache_blocks * block_bytes) / 1024**3 + if post_kv_free < 1.0: + print( + f"[nanovllm] WARNING: After KV cache allocation, only {post_kv_free:.2f} GB free. " + f"DiT inference may OOM. Consider reducing max_model_len or using CPU offload." 
+ ) + print( f"[nanovllm] KV cache allocated: {config.num_kvcache_blocks} blocks Ɨ {self.block_size} tokens = " f"{max_tokens_capacity} tokens capacity, {kv_cache_size_gb:.2f} GB " f"(free: {free / 1024**3:.2f} GB, used: {current / 1024**3:.2f} GB, " - f"target: {target_total_usage / 1024**3:.2f} GB, block: {block_bytes / 1024**2:.2f} MB)" + f"target: {target_total_usage / 1024**3:.2f} GB, block: {block_bytes / 1024**2:.2f} MB, " + f"post_kv_free: {post_kv_free:.2f} GB)" ) self.kv_cache = torch.empty(2, hf_config.num_hidden_layers, config.num_kvcache_blocks, self.block_size, num_kv_heads, head_dim) layer_id = 0 diff --git a/docs/en/BENCHMARK.md b/docs/en/BENCHMARK.md index 79a5124b..87f68370 100644 --- a/docs/en/BENCHMARK.md +++ b/docs/en/BENCHMARK.md @@ -26,6 +26,7 @@ |------|-------------| | `profile` | Profile a single generation run with detailed timing breakdown | | `benchmark` | Run a matrix of configurations (duration Ɨ batch Ɨ thinking Ɨ steps) and produce a summary table | +| `tier-test` | Automatically test all GPU tiers by simulating different VRAM sizes via `MAX_CUDA_VRAM` | | `understand` | Profile the `understand_music()` API (audio → metadata extraction) | | `create_sample` | Profile the `create_sample()` API (inspiration / simple mode) | | `format_sample` | Profile the `format_sample()` API (caption + lyrics → structured metadata) | @@ -156,6 +157,84 @@ Profiles the `format_sample()` API which converts caption + lyrics into structur python profile_inference.py --mode format_sample ``` +### 6. `tier-test` — Automated GPU Tier Testing + +Automatically simulates different GPU VRAM sizes using `MAX_CUDA_VRAM` and runs a generation test at each tier. This is the recommended way to validate that all GPU tiers work correctly after modifying `acestep/gpu_config.py`. + +```bash +# Test all tiers (4, 6, 8, 12, 16, 20, 24 GB) +python profile_inference.py --mode tier-test + +# Test specific VRAM sizes +python profile_inference.py --mode tier-test --tiers 6 8 16 + +# Test with LM enabled (where the tier supports it) +python profile_inference.py --mode tier-test --tier-with-lm + +# Quick test: skip torch.compile for non-quantized tiers +python profile_inference.py --mode tier-test --tier-skip-compile +``` + +**What it validates per tier:** +- Correct tier detection and `GPUConfig` construction +- Model initialization (DiT, VAE, Text Encoder, optionally LM) +- A short generation run (30s duration, batch=1) completes without OOM +- Adaptive VAE decode fallback (GPU → CPU offload → full CPU) +- VRAM usage stays within the simulated limit + +**Output example:** + +``` +TIER TEST RESULTS +==================================================================================================== + VRAM Tier LM Duration Status Peak VRAM Notes + ────────────────────────────────────────────────────────────────────────────── + 4GB tier1 — 30s āœ… OK 3.8GB VAE decoded on CPU + 6GB tier2 — 30s āœ… OK 5.4GB Tiled VAE chunk=256 + 8GB tier4 0.6B 30s āœ… OK 7.2GB vllm backend + 12GB tier5 1.7B 30s āœ… OK 10.8GB vllm backend + 16GB tier6a 1.7B 30s āœ… OK 14.5GB offload enabled + 20GB tier6b 1.7B 30s āœ… OK 17.2GB no offload + 24GB unlimited 4B 30s āœ… OK 21.3GB full models on GPU +``` + +> **Note**: `tier-test` mode uses `torch.cuda.set_per_process_memory_fraction()` to enforce a hard VRAM cap, making simulations realistic even on high-end GPUs (e.g., A100 80GB). + +#### Boundary Testing + +Use `--tier-boundary` to find the minimum VRAM tier at which INT8 quantization and CPU offload can be safely disabled. 
For each tier, up to three configurations are tested: + +1. **default** — tier's standard settings +2. **no-quant** — quantization disabled, offload unchanged +3. **no-offload** — no quantization AND no CPU offload + +```bash +# Run boundary tests across all tiers +python profile_inference.py --mode tier-test --tier-boundary + +# Boundary test with LM enabled +python profile_inference.py --mode tier-test --tier-boundary --tier-with-lm + +# Save boundary results to JSON +python profile_inference.py --mode tier-test --tier-boundary --benchmark-output boundary_results.json +``` + +The output includes a **Boundary Analysis** summary showing the minimum tier for each capability. + +#### Batch Size Boundary Testing + +Use `--tier-batch-boundary` to find the maximum safe batch size for each tier. For each tier, the tool progressively tests batch sizes 1, 2, 4, 8 (stopping at first OOM) with both LM-enabled and LM-disabled configurations: + +```bash +# Run batch boundary tests +python profile_inference.py --mode tier-test --tier-batch-boundary --tier-with-lm + +# Test specific tiers +python profile_inference.py --mode tier-test --tier-batch-boundary --tier-with-lm --tiers 8 12 16 24 +``` + +The output includes a **Batch Boundary Summary** showing the maximum successful batch size per tier for both with-LM and without-LM configurations. + --- ## CLI Reference @@ -209,12 +288,22 @@ python profile_inference.py --mode format_sample | Flag | Default | Description | |------|---------|-------------| -| `--mode` | `profile` | Mode: `profile` / `benchmark` / `understand` / `create_sample` / `format_sample` | +| `--mode` | `profile` | Mode: `profile` / `benchmark` / `tier-test` / `understand` / `create_sample` / `format_sample` | | `--no-warmup` | off | Skip warmup run | | `--detailed` | off | Enable `cProfile` function-level analysis | | `--llm-debug` | off | Deep LLM debugging (token count, throughput) | | `--benchmark-output` | none | Save benchmark results to JSON file | +### Tier-Test Options + +| Flag | Default | Description | +|------|---------|-------------| +| `--tiers` | `4 6 8 12 16 20 24` | VRAM sizes (GB) to simulate | +| `--tier-with-lm` | off | Enable LM initialization on tiers that support it | +| `--tier-skip-compile` | off | Skip `torch.compile` for faster iteration on non-quantized tiers | +| `--tier-boundary` | off | Test each tier with no-quant and no-offload variants to find minimum capability boundaries | +| `--tier-batch-boundary` | off | Test each tier with batch sizes 1, 2, 4, 8 to find maximum safe batch size | + ### Input Options | Flag | Default | Description | @@ -340,6 +429,10 @@ TIME COSTS BREAKDOWN 4. **Test with representative durations** — Short durations (30s) are dominated by LLM time; long durations (240s+) are dominated by DiT time. -5. **GPU memory auto-adaptation** — The benchmark mode automatically clamps durations and batch sizes to what your GPU can handle. +5. **GPU memory auto-adaptation** — The benchmark mode automatically clamps durations and batch sizes to what your GPU can handle, using the adaptive tier system in `acestep/gpu_config.py`. 6. **Use `--detailed` sparingly** — `cProfile` adds overhead; use it only when investigating function-level bottlenecks. + +7. **Use `tier-test` for regression testing** — After modifying GPU tier configs, run `--mode tier-test` to verify all tiers still work correctly. This is especially important when changing offload thresholds, duration limits, or LM model availability. + +8. 
**Simulate low VRAM realistically** — When using `MAX_CUDA_VRAM`, the system enforces a hard VRAM cap via `set_per_process_memory_fraction()`, so OOM errors during simulation reflect real behavior on consumer GPUs. diff --git a/docs/en/GPU_COMPATIBILITY.md b/docs/en/GPU_COMPATIBILITY.md index 3b9bc10d..b1666023 100644 --- a/docs/en/GPU_COMPATIBILITY.md +++ b/docs/en/GPU_COMPATIBILITY.md @@ -1,36 +1,69 @@ # GPU Compatibility Guide -ACE-Step 1.5 automatically adapts to your GPU's available VRAM, adjusting generation limits and LM model availability accordingly. The system detects GPU memory at startup and configures optimal settings. +ACE-Step 1.5 automatically adapts to your GPU's available VRAM, adjusting generation limits, LM model availability, offloading strategies, and UI defaults accordingly. The system detects GPU memory at startup and configures optimal settings for your hardware. ## GPU Tier Configuration -| VRAM | Tier | LM Mode | Max Duration | Max Batch Size | LM Memory Allocation | -|------|------|---------|--------------|----------------|---------------------| -| ≤4GB | Tier 1 | Not available | 3 min | 1 | - | -| 4-6GB | Tier 2 | Not available | 6 min | 1 | - | -| 6-8GB | Tier 3 | 0.6B (optional) | With LM: 4 min / Without: 6 min | With LM: 1 / Without: 2 | 3GB | -| 8-12GB | Tier 4 | 0.6B (optional) | With LM: 4 min / Without: 6 min | With LM: 2 / Without: 4 | 3GB | -| 12-16GB | Tier 5 | 0.6B / 1.7B | With LM: 4 min / Without: 6 min | With LM: 2 / Without: 4 | 0.6B: 3GB, 1.7B: 8GB | -| 16-24GB | Tier 6 | 0.6B / 1.7B / 4B | 8 min | With LM: 4 / Without: 8 | 0.6B: 3GB, 1.7B: 8GB, 4B: 12GB | -| ≄24GB | Unlimited | All models | 10 min | 8 | Unrestricted | +| VRAM | Tier | LM Models | Recommended LM | Backend | Max Duration (LM / No LM) | Max Batch (LM / No LM) | Offload | Quantization | +|------|------|-----------|-----------------|---------|----------------------------|-------------------------|---------|--------------| +| ≤4GB | Tier 1 | None | — | pt | 4 min / 6 min | 1 / 1 | CPU + DiT | INT8 | +| 4-6GB | Tier 2 | None | — | pt | 8 min / 10 min | 1 / 1 | CPU + DiT | INT8 | +| 6-8GB | Tier 3 | 0.6B | 0.6B | pt | 8 min / 10 min | 2 / 2 | CPU + DiT | INT8 | +| 8-12GB | Tier 4 | 0.6B | 0.6B | vllm | 8 min / 10 min | 2 / 4 | CPU + DiT | INT8 | +| 12-16GB | Tier 5 | 0.6B, 1.7B | 1.7B | vllm | 8 min / 10 min | 4 / 4 | CPU | INT8 | +| 16-20GB | Tier 6a | 0.6B, 1.7B | 1.7B | vllm | 8 min / 10 min | 4 / 8 | CPU | INT8 | +| 20-24GB | Tier 6b | 0.6B, 1.7B, 4B | 1.7B | vllm | 8 min / 8 min | 8 / 8 | None | None | +| ≄24GB | Unlimited | All (0.6B, 1.7B, 4B) | 4B | vllm | 10 min / 10 min | 8 / 8 | None | None | + +### Column Descriptions + +- **LM Models**: Which 5Hz Language Model sizes can be loaded on this tier +- **Recommended LM**: The default LM model selected in the UI for this tier +- **Backend**: LM inference backend (`vllm` for NVIDIA GPUs with sufficient VRAM, `pt` for PyTorch fallback, `mlx` for Apple Silicon) +- **Offload**: Memory offloading strategy + - **CPU + DiT**: All models (DiT, VAE, Text Encoder) offloaded to CPU when not in use; DiT also offloaded between steps + - **CPU**: VAE and Text Encoder offloaded to CPU; DiT stays on GPU + - **None**: All models remain on GPU +- **Quantization**: Whether INT8 weight quantization is enabled by default to reduce VRAM usage + +## Adaptive UI Defaults + +The Gradio UI automatically configures itself based on the detected GPU tier: + +- **LM Initialization Checkbox**: Checked by default for tiers that support LM (Tier 3+), unchecked 
and disabled for Tier 1-2 +- **LM Model Path**: Pre-populated with the recommended model for your tier; dropdown only shows compatible models +- **Backend Dropdown**: Restricted to `pt`/`mlx` on Tier 1-3 (vllm KV cache is too memory-hungry); all backends available on Tier 4+ +- **CPU Offload / DiT Offload**: Enabled by default on lower tiers, disabled on higher tiers +- **Quantization**: Enabled by default on Tier 1-6a, disabled on Tier 6b+ (sufficient VRAM) +- **Compile Model**: Enabled by default on all tiers (required for quantization) + +If you manually select an incompatible option (e.g., trying to use vllm on a 6GB GPU), the system will warn you and automatically fall back to a compatible configuration. + +## Runtime Safety Features + +- **VRAM Guard**: Before each inference, the system estimates VRAM requirements and automatically reduces batch size if needed +- **Adaptive VAE Decode**: Three-tier fallback: GPU tiled decode → GPU decode with CPU offload → full CPU decode +- **Auto Chunk Size**: VAE decode chunk size adapts to available free VRAM (64/128/256/512/1024/1536) +- **Duration/Batch Clamping**: If you request values exceeding your tier's limits, they are clamped with a warning ## Notes - **Default settings** are automatically configured based on detected GPU memory - **LM Mode** refers to the Language Model used for Chain-of-Thought generation and audio understanding -- **Flash Attention**, **CPU Offload**, **Compile**, and **Quantization** are enabled by default for optimal performance -- If you request a duration or batch size exceeding your GPU's limits, a warning will be displayed and values will be clamped +- **Flash Attention** is auto-detected and enabled when available - **Constrained Decoding**: When LM is initialized, the LM's duration generation is also constrained to the GPU tier's maximum duration limit, preventing out-of-memory errors during CoT generation -- For GPUs with ≤6GB VRAM, LM initialization is disabled by default to preserve memory for the DiT model +- For GPUs with ≤6GB VRAM (Tier 1-2), LM initialization is disabled by default to preserve memory for the DiT model - You can manually override settings via command-line arguments or the Gradio UI > **Community Contributions Welcome**: The GPU tier configurations above are based on our testing across common hardware. If you find that your device's actual performance differs from these parameters (e.g., can handle longer durations or larger batch sizes), we welcome you to conduct more thorough testing and submit a PR to optimize these configurations in `acestep/gpu_config.py`. Your contributions help improve the experience for all users! ## Memory Optimization Tips -1. **Low VRAM (<8GB)**: Use DiT-only mode without LM initialization for maximum duration -2. **Medium VRAM (8-16GB)**: Use the 0.6B LM model for best balance of quality and memory -3. **High VRAM (>16GB)**: Enable larger LM models (1.7B/4B) for better audio understanding and generation quality +1. **Very Low VRAM (≤6GB)**: Use DiT-only mode without LM initialization. INT8 quantization and full CPU offload are mandatory. VAE decode may fall back to CPU automatically. +2. **Low VRAM (6-8GB)**: The 0.6B LM model can be used with `pt` backend. Keep offload enabled. +3. **Medium VRAM (8-16GB)**: Use the 0.6B or 1.7B LM model. `vllm` backend works well on Tier 4+. +4. **High VRAM (16-24GB)**: Enable larger LM models (1.7B recommended). Quantization becomes optional on 20GB+. +5. 
**Very High VRAM (≄24GB)**: All models fit without offloading or quantization. Use 4B LM for best quality. ## Debug Mode: Simulating Different GPU Configurations @@ -40,17 +73,93 @@ For testing and development, you can simulate different GPU memory sizes using t # Simulate a 4GB GPU (Tier 1) MAX_CUDA_VRAM=4 uv run acestep +# Simulate a 6GB GPU (Tier 2) +MAX_CUDA_VRAM=6 uv run acestep + # Simulate an 8GB GPU (Tier 4) MAX_CUDA_VRAM=8 uv run acestep # Simulate a 12GB GPU (Tier 5) MAX_CUDA_VRAM=12 uv run acestep -# Simulate a 16GB GPU (Tier 6) +# Simulate a 16GB GPU (Tier 6a) MAX_CUDA_VRAM=16 uv run acestep ``` +When `MAX_CUDA_VRAM` is set, the system also calls `torch.cuda.set_per_process_memory_fraction()` to enforce a hard VRAM cap, making the simulation realistic even on high-end GPUs. + +### Automated Tier Testing + +Instead of manually testing each tier through the UI, use the `tier-test` mode of `profile_inference.py`: + +```bash +# Test all tiers automatically +python profile_inference.py --mode tier-test + +# Test specific tiers +python profile_inference.py --mode tier-test --tiers 6 8 16 + +# Test with LM enabled (where supported) +python profile_inference.py --mode tier-test --tier-with-lm + +# Quick test (skip torch.compile for non-quantized tiers) +python profile_inference.py --mode tier-test --tier-skip-compile +``` + +See [BENCHMARK.md](BENCHMARK.md) for full documentation of the profiling tool. + This is useful for: - Testing GPU tier configurations on high-end hardware - Verifying that warnings and limits work correctly for each tier -- Developing and testing new GPU configuration parameters before submitting a PR +- Automated regression testing after modifying `acestep/gpu_config.py` +- CI/CD validation of VRAM compatibility + +### Boundary Testing (Finding Minimum Tiers) + +Use `--tier-boundary` to empirically determine the minimum VRAM tier at which INT8 quantization and CPU offload can be safely disabled. For each tier, this runs up to three configurations: + +1. **default** — tier's standard settings (quantization + offload as configured) +2. **no-quant** — same offload settings, but quantization disabled +3. **no-offload** — no quantization AND no CPU offload (all models on GPU) + +```bash +# Run boundary tests across all tiers +python profile_inference.py --mode tier-test --tier-boundary + +# Test specific tiers with boundary testing +python profile_inference.py --mode tier-test --tier-boundary --tiers 8 12 16 20 24 + +# Boundary test with LM enabled (where supported) +python profile_inference.py --mode tier-test --tier-boundary --tier-with-lm + +# Save results to JSON for further analysis +python profile_inference.py --mode tier-test --tier-boundary --benchmark-output boundary_results.json +``` + +The output includes a **Boundary Analysis** section showing the minimum tier for each capability: + +``` +BOUNDARY ANALYSIS +================= + Capability Min Tier VRAM + ------------------------------------------------------------ + No INT8 Quantization tier6b 20GB + No CPU Offload (all models on GPU) tier6b 20GB + ------------------------------------------------------------ +``` + +> **Note:** Boundary results are empirical and may vary based on DiT model variant (turbo vs base), whether LM is enabled, generation duration, and flash attention availability. Community contributions to refine these boundaries are welcome! 
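+
+For reference, the hard cap behind these simulations boils down to a single PyTorch call. The sketch below is illustrative only; the helper name is hypothetical and the project's actual implementation (including which device index it targets) may differ:
+
+```python
+import os
+import torch
+
+def apply_simulated_vram_cap(device_index: int = 0) -> None:
+    """Cap this process's usable VRAM to MAX_CUDA_VRAM gigabytes (illustrative sketch)."""
+    simulated_gb = os.environ.get("MAX_CUDA_VRAM")
+    if simulated_gb is None or not torch.cuda.is_available():
+        return
+    total_bytes = torch.cuda.get_device_properties(device_index).total_memory
+    fraction = min(1.0, float(simulated_gb) * (1024 ** 3) / total_bytes)
+    # Allocations beyond this fraction raise torch.cuda.OutOfMemoryError,
+    # so a large card behaves like the simulated smaller GPU.
+    torch.cuda.set_per_process_memory_fraction(fraction, device=device_index)
+```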
+ +### Batch Size Boundary Testing + +Use `--tier-batch-boundary` to find the maximum safe batch size for each tier by progressively testing batch sizes 1, 2, 4, 8: + +```bash +# Run batch boundary tests with LM enabled +python profile_inference.py --mode tier-test --tier-batch-boundary --tier-with-lm + +# Test specific tiers +python profile_inference.py --mode tier-test --tier-batch-boundary --tier-with-lm --tiers 8 12 16 24 +``` + +This tests both with-LM and without-LM configurations and reports the maximum successful batch size per tier. diff --git a/docs/en/GPU_TROUBLESHOOTING.md b/docs/en/GPU_TROUBLESHOOTING.md index afeba414..0d42739b 100644 --- a/docs/en/GPU_TROUBLESHOOTING.md +++ b/docs/en/GPU_TROUBLESHOOTING.md @@ -229,4 +229,6 @@ If none of the above solutions work: | Variable | Purpose | Example | |----------|---------|---------| -| `MAX_CUDA_VRAM` | Override detected VRAM (testing) | `8` (simulate 8GB GPU) | +| `MAX_CUDA_VRAM` | Override detected VRAM for tier simulation (also enforces hard VRAM cap via `set_per_process_memory_fraction`) | `8` (simulate 8GB GPU) | + +> **Note on `MAX_CUDA_VRAM`**: When set, this variable not only changes the tier detection logic but also calls `torch.cuda.set_per_process_memory_fraction()` to enforce a hard VRAM limit. This means OOM errors during simulation are realistic and reflect actual behavior on GPUs with that amount of VRAM. See [GPU_COMPATIBILITY.md](GPU_COMPATIBILITY.md) for the full tier table. diff --git a/docs/en/GRADIO_GUIDE.md b/docs/en/GRADIO_GUIDE.md index 235ab06e..f316ab42 100644 --- a/docs/en/GRADIO_GUIDE.md +++ b/docs/en/GRADIO_GUIDE.md @@ -62,17 +62,23 @@ The Gradio interface consists of several main sections: | Setting | Description | |---------|-------------| -| **5Hz LM Model Path** | Select the language model (e.g., `acestep-5Hz-lm-0.6B`, `acestep-5Hz-lm-1.7B`) | -| **5Hz LM Backend** | `vllm` (faster, recommended) or `pt` (PyTorch, more compatible) | -| **Initialize 5Hz LM** | Check to load the LM during initialization (required for thinking mode) | +| **5Hz LM Model Path** | Select the language model. **Available models are filtered by your GPU tier** — e.g., 6-8GB GPUs only show 0.6B, while 24GB+ GPUs show all sizes (0.6B, 1.7B, 4B). | +| **5Hz LM Backend** | `vllm` (faster, recommended for NVIDIA with ≄8GB VRAM), `pt` (PyTorch, universal fallback), or `mlx` (Apple Silicon). **On GPUs <8GB, the backend is restricted to `pt`/`mlx`** because vllm's KV cache is too memory-hungry. | +| **Initialize 5Hz LM** | Check to load the LM during initialization (required for thinking mode). **Automatically unchecked and disabled on GPUs ≤6GB** (Tier 1-2). | + +> **Adaptive Defaults**: All LM settings are automatically configured based on your GPU's VRAM tier. The recommended LM model, backend, and initialization state are pre-set for optimal performance. You can manually override these, but the system will warn you if your selection is incompatible with your GPU. ### Performance Options | Setting | Description | |---------|-------------| | **Use Flash Attention** | Enable for faster inference (requires flash_attn package) | -| **Offload to CPU** | Offload models to CPU when idle to save GPU memory | -| **Offload DiT to CPU** | Specifically offload the DiT model to CPU | +| **Offload to CPU** | Offload models to CPU when idle to save GPU memory. **Automatically enabled on GPUs <20GB.** | +| **Offload DiT to CPU** | Specifically offload the DiT model to CPU. 
**Automatically enabled on GPUs <12GB.** | +| **INT8 Quantization** | Reduce model VRAM footprint with INT8 weight quantization. **Automatically enabled on GPUs <20GB.** | +| **Compile Model** | Enable `torch.compile` for optimized inference. **Enabled by default on all tiers** (required when quantization is active). | + +> **Tier-Aware Settings**: Offload, quantization, and compile options are automatically set based on your GPU tier. See [GPU_COMPATIBILITY.md](GPU_COMPATIBILITY.md) for the full tier table. ### LoRA Adapter @@ -87,7 +93,12 @@ The Gradio interface consists of several main sections: ### Initialization -Click **Initialize Service** to load the models. The status box will show progress and confirmation. +Click **Initialize Service** to load the models. The status box will show progress and confirmation, including: +- The detected GPU tier and VRAM +- Maximum allowed duration and batch size (adjusted dynamically based on whether LM was initialized) +- Any warnings about incompatible settings that were automatically corrected + +After initialization, the **Audio Duration** and **Batch Size** sliders are automatically updated to reflect the tier's limits. --- @@ -527,14 +538,18 @@ These options are especially useful when preprocessing takes a long time or you - Make caption more specific **Out of memory:** -- Reduce batch size -- Enable CPU offloading +- The system includes automatic VRAM management (VRAM guard, adaptive VAE decode, auto batch reduction). If OOM still occurs: +- Reduce batch size manually +- Enable CPU offloading (should be auto-enabled for GPUs <20GB) +- Enable INT8 quantization (should be auto-enabled for GPUs <20GB) - Reduce LM batch chunk size +- See [GPU_COMPATIBILITY.md](GPU_COMPATIBILITY.md) for recommended settings per tier **LM not working:** -- Ensure "Initialize 5Hz LM" was checked during initialization -- Check that a valid LM model path is selected -- Verify vllm or PyTorch backend is available +- Ensure "Initialize 5Hz LM" was checked during initialization (disabled by default on GPUs ≤6GB) +- Check that a valid LM model path is selected (only tier-compatible models are shown) +- Verify vllm or PyTorch backend is available (vllm restricted on GPUs <8GB) +- If the LM checkbox is grayed out, your GPU tier does not support LM — use DiT-only mode --- diff --git a/docs/en/INFERENCE.md b/docs/en/INFERENCE.md index 1a0504a3..a2354f08 100644 --- a/docs/en/INFERENCE.md +++ b/docs/en/INFERENCE.md @@ -1068,11 +1068,19 @@ else: ### 7. 
Memory Management -For large batch sizes or long durations: -- Monitor GPU memory usage -- Reduce `batch_size` if OOM errors occur -- Reduce `lm_batch_chunk_size` for LM operations -- Consider using `offload_to_cpu=True` during initialization +ACE-Step 1.5 includes automatic VRAM management that adapts to your GPU: + +- **Automatic tier detection**: The system detects available VRAM and selects optimal settings (see [GPU_COMPATIBILITY.md](GPU_COMPATIBILITY.md)) +- **VRAM guard**: Before each inference, the system estimates VRAM requirements and automatically reduces `batch_size` if needed +- **Adaptive VAE decode**: Three-tier fallback — GPU tiled decode → GPU decode with CPU offload → full CPU decode +- **Auto chunk sizing**: VAE decode chunk size adapts to free VRAM (64/128/256/512/1024/1536) +- **Duration/batch clamping**: Values exceeding your tier's limits are automatically clamped with a warning + +For manual tuning: +- Reduce `batch_size` if OOM errors persist +- Reduce `lm_batch_chunk_size` for LM operations on low-VRAM GPUs +- Enable `offload_to_cpu=True` during initialization for GPUs with <20GB VRAM +- Enable `quantization="int8_weight_only"` for GPUs with <20GB VRAM ### 8. Accessing Time Costs @@ -1094,7 +1102,7 @@ if result.success: ### Common Issues **Issue**: Out of memory errors -- **Solution**: Reduce `batch_size`, `inference_steps`, or enable CPU offloading +- **Solution**: The system should automatically handle most OOM scenarios via VRAM guard (batch reduction) and adaptive VAE decode (CPU fallback). If OOM still occurs: reduce `batch_size`, reduce `inference_steps`, enable CPU offloading (`offload_to_cpu=True`), or enable INT8 quantization. See [GPU_COMPATIBILITY.md](GPU_COMPATIBILITY.md) for recommended settings per VRAM tier. **Issue**: Poor quality results - **Solution**: Increase `inference_steps`, adjust `guidance_scale`, use base model diff --git a/docs/en/INSTALL.md b/docs/en/INSTALL.md index 4f584463..39488680 100644 --- a/docs/en/INSTALL.md +++ b/docs/en/INSTALL.md @@ -502,7 +502,7 @@ ACESTEP_INIT_LLM=false | `--init_llm` | auto | LLM init: `true` / `false` / omit for auto | | `--config_path` | auto | DiT model (e.g., `acestep-v15-turbo`) | | `--lm_model_path` | auto | LM model (e.g., `acestep-5Hz-lm-1.7B`) | -| `--offload_to_cpu` | auto | CPU offload (auto-enabled if VRAM < 16GB) | +| `--offload_to_cpu` | auto | CPU offload (auto-enabled if VRAM < 20GB) | | `--download-source` | auto | Model source: `auto` / `huggingface` / `modelscope` | | `--enable-api` | false | Enable REST API alongside Gradio UI | | `--api-key` | none | API key for authentication | @@ -576,16 +576,17 @@ huggingface-cli download ACE-Step/acestep-5Hz-lm-4B --local-dir ./checkpoints/ac ## šŸ’” Which Model Should I Choose? -ACE-Step automatically adapts to your GPU's VRAM: +ACE-Step automatically adapts to your GPU's VRAM. 
The UI pre-configures all settings (LM model, backend, offloading, quantization) based on your detected GPU tier: -| Your GPU VRAM | Recommended LM Model | Notes | -|---------------|---------------------|-------| -| **≤6GB** | None (DiT only) | LM disabled by default to save memory | -| **6-12GB** | `acestep-5Hz-lm-0.6B` | Lightweight, good balance | -| **12-16GB** | `acestep-5Hz-lm-1.7B` | Better quality | -| **≄16GB** | `acestep-5Hz-lm-4B` | Best quality and audio understanding | +| Your GPU VRAM | Recommended LM Model | Backend | Notes | +|---------------|---------------------|---------|-------| +| **≤6GB** | None (DiT only) | — | LM disabled by default; INT8 quantization + full CPU offload | +| **6-8GB** | `acestep-5Hz-lm-0.6B` | `pt` | Lightweight LM with PyTorch backend | +| **8-16GB** | `0.6B` / `1.7B` | `vllm` | 0.6B for 8-12GB, 1.7B for 12-16GB | +| **16-24GB** | `acestep-5Hz-lm-1.7B` | `vllm` | 4B available on 20GB+; no offload on 20GB+ | +| **≄24GB** | `acestep-5Hz-lm-4B` | `vllm` | Best quality, all models fit without offload | -> šŸ“– For detailed GPU compatibility information (duration limits, batch sizes, memory optimization), see [GPU Compatibility Guide](GPU_COMPATIBILITY.md). +> šŸ“– For detailed GPU compatibility information (tier table, duration limits, batch sizes, adaptive UI defaults, memory optimization), see [GPU Compatibility Guide](GPU_COMPATIBILITY.md). --- diff --git a/docs/en/ace_step_musicians_guide.md b/docs/en/ace_step_musicians_guide.md index c739f930..6f1a8757 100644 --- a/docs/en/ace_step_musicians_guide.md +++ b/docs/en/ace_step_musicians_guide.md @@ -138,36 +138,45 @@ A computer with a decent graphics card (GPU). The better the GPU, the faster and YOUR GPU MEMORY WHAT YOU CAN DO ───────────────────────────────────────────────────── - 4 GB (entry level) Songs up to 3 minutes + 4 GB (entry level) Songs up to 6 minutes ā–“ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ 1 song at a time Basic mode only (no Songwriter brain) - 8 GB (mainstream) Songs up to 6 minutes - ā–“ā–“ā–“ā–“ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ 1-2 songs at a time - Optional lightweight Songwriter brain + 6-8 GB (budget) Songs up to 10 minutes + ā–“ā–“ā–“ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ 1-2 songs at a time + Optional lightweight Songwriter brain (0.6B) - 12 GB (sweet spot) Songs up to 6 minutes - ā–“ā–“ā–“ā–“ā–“ā–“ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ 2-4 songs at a time - Full Songwriter brain available + 8-12 GB (mainstream) Songs up to 10 minutes + ā–“ā–“ā–“ā–“ā–“ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ 2-4 songs at a time + Songwriter brain available (0.6B) - 16 GB (enthusiast) Songs up to 8 minutes - ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ 2-4 songs at a time - Larger, smarter Songwriter brain + 12-16 GB (sweet spot) Songs up to 10 minutes + ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ 2-4 songs at a time + Full Songwriter brain (1.7B) - 24 GB+ (high end) Songs up to 10 minutes + 16-20 GB (enthusiast) Songs up to 10 minutes + ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ 1-4 songs at a time + Larger Songwriter brain (1.7B) + + 20-24 GB (high end) Songs up to 8 minutes + ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ 2-8 songs at a time + All Songwriter brains (0.6B/1.7B/4B), no offload needed + + 24 GB+ (pro) Songs up to 10 minutes ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ Up to 8 songs at a time - All features unlocked + All features unlocked, 
best quality (4B) ``` **Common GPUs and where they land:** | GPU | Memory | Tier | |-----|--------|------| -| GTX 1050 Ti | 4 GB | Entry | -| RTX 3060 / 4060 | 8 GB | Mainstream | -| RTX 3070 / 4070 | 8-12 GB | Sweet spot | -| RTX 3080 / 4080 | 12-16 GB | Enthusiast | -| RTX 4090 | 24 GB | High end | +| GTX 1050 Ti | 4 GB | Entry (Tier 1) | +| GTX 1660 / RTX 2060 | 6 GB | Budget (Tier 2) | +| RTX 3060 / 4060 | 8 GB | Mainstream (Tier 4) | +| RTX 3070 / 4070 | 8-12 GB | Mainstream-Sweet spot (Tier 4-5) | +| RTX 3080 16GB / 4060 Ti 16GB | 16 GB | Enthusiast (Tier 6a) | +| RTX 3090 / 4090 | 24 GB | High end / Pro (Tier 6b-Unlimited) | | Apple M1/M2/M3 (Mac) | Shared memory | Supported, varies | **Disk space:** About 100 GB free. The AI models are large files (around 60 GB total) that download automatically the first time you run the software. diff --git a/docs/ja/GPU_COMPATIBILITY.md b/docs/ja/GPU_COMPATIBILITY.md index 37d9b227..e1862296 100644 --- a/docs/ja/GPU_COMPATIBILITY.md +++ b/docs/ja/GPU_COMPATIBILITY.md @@ -1,36 +1,69 @@ # GPU äŗ’ę›ę€§ć‚¬ć‚¤ćƒ‰ -ACE-Step 1.5 は GPU 恮 VRAM ć«č‡Ŗå‹•ēš„ć«é©åæœć—ć€ē”Ÿęˆę™‚é–“ć®åˆ¶é™ć‚„ä½æē”ØåÆčƒ½ćŖ LM ćƒ¢ćƒ‡ćƒ«ć‚’čŖæę•“ć—ć¾ć™ć€‚ć‚·ć‚¹ćƒ†ćƒ ćÆčµ·å‹•ę™‚ć« GPU ćƒ”ćƒ¢ćƒŖć‚’ę¤œå‡ŗć—ć€ęœ€é©ćŖčØ­å®šć‚’č‡Ŗå‹•ę§‹ęˆć—ć¾ć™ć€‚ +ACE-Step 1.5 は GPU 恮 VRAM ć«č‡Ŗå‹•ēš„ć«é©åæœć—ć€ē”Ÿęˆę™‚é–“ć®åˆ¶é™ć€ä½æē”ØåÆčƒ½ćŖ LM ćƒ¢ćƒ‡ćƒ«ć€ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ęˆ¦ē•„ć€UI ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆčØ­å®šć‚’čŖæę•“ć—ć¾ć™ć€‚ć‚·ć‚¹ćƒ†ćƒ ćÆčµ·å‹•ę™‚ć« GPU ćƒ”ćƒ¢ćƒŖć‚’ę¤œå‡ŗć—ć€ęœ€é©ćŖčØ­å®šć‚’č‡Ŗå‹•ę§‹ęˆć—ć¾ć™ć€‚ ## GPU ćƒ†ć‚£ć‚¢ę§‹ęˆ -| VRAM | ćƒ†ć‚£ć‚¢ | LM ćƒ¢ćƒ¼ćƒ‰ | ęœ€å¤§ę™‚é–“ | ęœ€å¤§ćƒćƒƒćƒ | LM ćƒ”ćƒ¢ćƒŖå‰²å½“ | -|------|--------|-----------|----------|------------|---------------| -| ≤4GB | Tier 1 | åˆ©ē”ØäøåÆ | 3 分 | 1 | - | -| 4-6GB | Tier 2 | åˆ©ē”ØäøåÆ | 6 分 | 1 | - | -| 6-8GB | Tier 3 | 0.6B (ć‚Ŗćƒ—ć‚·ćƒ§ćƒ³) | LM 恂悊: 4 分 / LM なし: 6 分 | LM 恂悊: 1 / LM なし: 2 | 3GB | -| 8-12GB | Tier 4 | 0.6B (ć‚Ŗćƒ—ć‚·ćƒ§ćƒ³) | LM 恂悊: 4 分 / LM なし: 6 分 | LM 恂悊: 2 / LM なし: 4 | 3GB | -| 12-16GB | Tier 5 | 0.6B / 1.7B | LM 恂悊: 4 分 / LM なし: 6 分 | LM 恂悊: 2 / LM なし: 4 | 0.6B: 3GB, 1.7B: 8GB | -| 16-24GB | Tier 6 | 0.6B / 1.7B / 4B | 8 分 | LM 恂悊: 4 / LM なし: 8 | 0.6B: 3GB, 1.7B: 8GB, 4B: 12GB | -| ≄24GB | ē„”åˆ¶é™ | å…Øćƒ¢ćƒ‡ćƒ« | 10 分 | 8 | ē„”åˆ¶é™ | +| VRAM | ćƒ†ć‚£ć‚¢ | LM ćƒ¢ćƒ‡ćƒ« | ęŽØå„Ø LM | ćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰ | ęœ€å¤§ę™‚é–“ (LM꜉ / LMē„”) | ęœ€å¤§ćƒćƒƒćƒ (LM꜉ / LMē„”) | ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ | 量子化 | +|------|--------|-----------|---------|-------------|------------------------|--------------------------|------------|--------| +| ≤4GB | Tier 1 | なし | — | pt | 4分 / 6分 | 1 / 1 | CPU + DiT | INT8 | +| 4-6GB | Tier 2 | なし | — | pt | 8分 / 10分 | 1 / 1 | CPU + DiT | INT8 | +| 6-8GB | Tier 3 | 0.6B | 0.6B | pt | 8分 / 10分 | 2 / 2 | CPU + DiT | INT8 | +| 8-12GB | Tier 4 | 0.6B | 0.6B | vllm | 8分 / 10分 | 2 / 4 | CPU + DiT | INT8 | +| 12-16GB | Tier 5 | 0.6B, 1.7B | 1.7B | vllm | 8分 / 10分 | 4 / 4 | CPU | INT8 | +| 16-20GB | Tier 6a | 0.6B, 1.7B | 1.7B | vllm | 8分 / 10分 | 4 / 8 | CPU | INT8 | +| 20-24GB | Tier 6b | 0.6B, 1.7B, 4B | 1.7B | vllm | 8分 / 8分 | 8 / 8 | なし | なし | +| ≄24GB | ē„”åˆ¶é™ | å…Øćƒ¢ćƒ‡ćƒ« (0.6B, 1.7B, 4B) | 4B | vllm | 10分 / 10分 | 8 / 8 | なし | なし | + +### åˆ—ć®čŖ¬ę˜Ž + +- **LM ćƒ¢ćƒ‡ćƒ«**: ć“ć®ćƒ†ć‚£ć‚¢ć§ćƒ­ćƒ¼ćƒ‰ć§ćć‚‹ 5Hz čØ€čŖžćƒ¢ćƒ‡ćƒ«ć®ć‚µć‚¤ć‚ŗ +- **ęŽØå„Ø LM**: UI ć§ć“ć®ćƒ†ć‚£ć‚¢ć«ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆéøęŠžć•ć‚Œć‚‹ LM ćƒ¢ćƒ‡ćƒ« +- **ćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰**: LM ęŽØč«–ćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰ļ¼ˆ`vllm` ćÆååˆ†ćŖ VRAM 悒ꌁ恤 NVIDIA GPU 向け、`pt` は 
PyTorch ćƒ•ć‚©ćƒ¼ćƒ«ćƒćƒƒć‚Æć€`mlx` は Apple Silicon 向け) +- **ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰**: + - **CPU + DiT**: ć™ć¹ć¦ć®ćƒ¢ćƒ‡ćƒ«ļ¼ˆDiT态VAEć€ćƒ†ć‚­ć‚¹ćƒˆć‚Øćƒ³ć‚³ćƒ¼ćƒ€ćƒ¼ļ¼‰ć‚’ęœŖä½æē”Øę™‚ć« CPU ć«ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ļ¼›DiT ć‚‚ć‚¹ćƒ†ćƒƒćƒ—é–“ć§ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ + - **CPU**: VAE ćØćƒ†ć‚­ć‚¹ćƒˆć‚Øćƒ³ć‚³ćƒ¼ćƒ€ćƒ¼ć‚’ CPU ć«ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ļ¼›DiT は GPU ć«äæęŒ + - **なし**: ć™ć¹ć¦ć®ćƒ¢ćƒ‡ćƒ«ć‚’ GPU ć«äæęŒ +- **量子化**: VRAM ä½æē”Øé‡ć‚’å‰Šęø›ć™ć‚‹ćŸć‚ć€ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆć§ INT8 é‡ćæé‡å­åŒ–ć‚’ęœ‰åŠ¹ć«ć™ć‚‹ć‹ć©ć†ć‹ + +## ć‚¢ćƒ€ćƒ—ćƒ†ć‚£ćƒ– UI ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆ + +Gradio UI ćÆę¤œå‡ŗć•ć‚ŒćŸ GPU ćƒ†ć‚£ć‚¢ć«åŸŗć„ć„ć¦č‡Ŗå‹•ēš„ć«čØ­å®šć•ć‚Œć¾ć™ļ¼š + +- **LM åˆęœŸåŒ–ćƒć‚§ćƒƒć‚Æćƒœćƒƒć‚Æć‚¹**: LM ć‚’ć‚µćƒćƒ¼ćƒˆć™ć‚‹ćƒ†ć‚£ć‚¢ļ¼ˆTier 3+ļ¼‰ć§ćÆćƒ‡ćƒ•ć‚©ćƒ«ćƒˆć§ćƒć‚§ćƒƒć‚Æć€Tier 1-2 ć§ćÆćƒć‚§ćƒƒć‚ÆćŖć—ćƒ»ē„”åŠ¹ +- **LM ćƒ¢ćƒ‡ćƒ«ćƒ‘ć‚¹**: ćƒ†ć‚£ć‚¢ć®ęŽØå„Øćƒ¢ćƒ‡ćƒ«ćŒč‡Ŗå‹•å…„åŠ›ļ¼›ćƒ‰ćƒ­ćƒƒćƒ—ćƒ€ć‚¦ćƒ³ć«ćÆäŗ’ę›ćƒ¢ćƒ‡ćƒ«ć®ćæč”Øē¤ŗ +- **ćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰ćƒ‰ćƒ­ćƒƒćƒ—ćƒ€ć‚¦ćƒ³**: Tier 1-3 では `pt`/`mlx` ć«åˆ¶é™ļ¼ˆvllm KV ć‚­ćƒ£ćƒƒć‚·ćƒ„ćŒćƒ”ćƒ¢ćƒŖć‚’ę¶ˆč²»ć—ć™ćŽć‚‹ļ¼‰ļ¼›Tier 4+ ć§ćÆć™ć¹ć¦ć®ćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰ćŒåˆ©ē”ØåÆčƒ½ +- **CPU ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ / DiT ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰**: ä½Žćƒ†ć‚£ć‚¢ć§ćÆćƒ‡ćƒ•ć‚©ćƒ«ćƒˆć§ęœ‰åŠ¹ć€é«˜ćƒ†ć‚£ć‚¢ć§ćÆē„”åŠ¹ +- **量子化**: Tier 1-6a ć§ćÆćƒ‡ćƒ•ć‚©ćƒ«ćƒˆć§ęœ‰åŠ¹ć€Tier 6b+ ć§ćÆē„”åŠ¹ļ¼ˆååˆ†ćŖ VRAM) +- **ćƒ¢ćƒ‡ćƒ«ć‚³ćƒ³ćƒ‘ć‚¤ćƒ«**: ć™ć¹ć¦ć®ćƒ†ć‚£ć‚¢ć§ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆć§ęœ‰åŠ¹ļ¼ˆé‡å­åŒ–ć«åæ…č¦ļ¼‰ + +äŗ’ę›ę€§ć®ćŖć„ć‚Ŗćƒ—ć‚·ćƒ§ćƒ³ć‚’ę‰‹å‹•ć§éøęŠžć—ćŸå “åˆļ¼ˆä¾‹ļ¼š6GB GPU 恧 vllm ć‚’ä½æē”Øć—ć‚ˆć†ćØć—ćŸå “åˆļ¼‰ć€ć‚·ć‚¹ćƒ†ćƒ ćÆč­¦å‘Šć‚’č”Øē¤ŗć—ć€äŗ’ę›ę€§ć®ć‚ć‚‹čØ­å®šć«č‡Ŗå‹•ćƒ•ć‚©ćƒ¼ćƒ«ćƒćƒƒć‚Æć—ć¾ć™ć€‚ + +## ćƒ©ćƒ³ć‚æć‚¤ćƒ å®‰å…Øę©Ÿčƒ½ + +- **VRAM ć‚¬ćƒ¼ćƒ‰**: å„ęŽØč«–å‰ć« VRAM č¦ä»¶ć‚’ęŽØå®šć—ć€åæ…č¦ć«åæœć˜ć¦ćƒćƒƒćƒć‚µć‚¤ć‚ŗć‚’č‡Ŗå‹•å‰Šęø› +- **ć‚¢ćƒ€ćƒ—ćƒ†ć‚£ćƒ– VAE ćƒ‡ć‚³ćƒ¼ćƒ‰**: 3 ę®µéšŽćƒ•ć‚©ćƒ¼ćƒ«ćƒćƒƒć‚Æļ¼šGPU ć‚æć‚¤ćƒ«ćƒ‡ć‚³ćƒ¼ćƒ‰ → GPU ćƒ‡ć‚³ćƒ¼ćƒ‰+CPU ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ → å®Œå…Ø CPU ćƒ‡ć‚³ćƒ¼ćƒ‰ +- **č‡Ŗå‹•ćƒćƒ£ćƒ³ć‚Æć‚µć‚¤ć‚ŗ**: VAE ćƒ‡ć‚³ćƒ¼ćƒ‰ćƒćƒ£ćƒ³ć‚Æć‚µć‚¤ć‚ŗćŒåˆ©ē”ØåÆčƒ½ćŖē©ŗć VRAM に適応(64/128/256/512/1024/1536) +- **Ꙃ間/ćƒćƒƒćƒć‚Æćƒ©ćƒ³ćƒ—**: ćƒ†ć‚£ć‚¢ć®åˆ¶é™ć‚’č¶…ćˆć‚‹å€¤ć‚’č¦ę±‚ć—ćŸå “åˆć€č­¦å‘ŠćØćØć‚‚ć«č‡Ŗå‹•čŖæę•“ ## ę³Øę„äŗ‹é … - **ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆčØ­å®š** ćÆę¤œå‡ŗć•ć‚ŒćŸ GPU ćƒ”ćƒ¢ćƒŖć«åŸŗć„ć„ć¦č‡Ŗå‹•ę§‹ęˆć•ć‚Œć¾ć™ - **LM ćƒ¢ćƒ¼ćƒ‰** は Chain-of-Thought ē”ŸęˆćØć‚Ŗćƒ¼ćƒ‡ć‚£ć‚Ŗē†č§£ć«ä½æē”Øć•ć‚Œć‚‹čØ€čŖžćƒ¢ćƒ‡ćƒ«ć‚’ęŒ‡ć—ć¾ć™ -- **Flash Attention**态**CPU Offload**态**Compile**态**Quantization** ćÆęœ€é©ćŖćƒ‘ćƒ•ć‚©ćƒ¼ćƒžćƒ³ć‚¹ć®ćŸć‚ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆć§ęœ‰åŠ¹ć§ć™ -- č¦ę±‚ć—ćŸę™‚é–“ć‚„ćƒćƒƒćƒć‚µć‚¤ć‚ŗćŒ GPU ć®åˆ¶é™ć‚’č¶…ćˆć‚‹å “åˆć€č­¦å‘ŠćŒč”Øē¤ŗć•ć‚Œć€å€¤ćÆčØ±å®¹ęœ€å¤§å€¤ć«čŖæę•“ć•ć‚Œć¾ć™ +- **Flash Attention** ćÆč‡Ŗå‹•ę¤œå‡ŗć•ć‚Œć€åˆ©ē”ØåÆčƒ½ćŖå “åˆć«ęœ‰åŠ¹åŒ–ć•ć‚Œć¾ć™ - **åˆ¶ē“„ä»˜ććƒ‡ć‚³ćƒ¼ćƒ‰**: LM ćŒåˆęœŸåŒ–ć•ć‚Œć‚‹ćØć€LM ć®ę™‚é–“ē”Ÿęˆć‚‚ GPU ćƒ†ć‚£ć‚¢ć®ęœ€å¤§ę™‚é–“åˆ¶é™å†…ć«åˆ¶ē“„ć•ć‚Œć€CoT ē”Ÿęˆę™‚ć®ćƒ”ćƒ¢ćƒŖäøč¶³ć‚Øćƒ©ćƒ¼ć‚’é˜²ćŽć¾ć™ -- VRAM ≤6GB 恮 GPU では、DiT ćƒ¢ćƒ‡ćƒ«ē”Øć®ćƒ”ćƒ¢ćƒŖć‚’ē¢ŗäæć™ć‚‹ćŸć‚ć€ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆć§ LM åˆęœŸåŒ–ćŒē„”åŠ¹ć«ćŖć‚Šć¾ć™ +- VRAM ≤6GB 恮 GPU(Tier 1-2)では、DiT ćƒ¢ćƒ‡ćƒ«ē”Øć®ćƒ”ćƒ¢ćƒŖć‚’ē¢ŗäæć™ć‚‹ćŸć‚ć€ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆć§ LM åˆęœŸåŒ–ćŒē„”åŠ¹ć«ćŖć‚Šć¾ć™ - ć‚³ćƒžćƒ³ćƒ‰ćƒ©ć‚¤ćƒ³å¼•ę•°ć¾ćŸćÆ Gradio UI ć§čØ­å®šć‚’ę‰‹å‹•ć§äøŠę›øćć§ćć¾ć™ -> **ć‚³ćƒŸćƒ„ćƒ‹ćƒ†ć‚£č²¢ēŒ®ę­“čæŽ**: 上記の GPU 
ćƒ†ć‚£ć‚¢ę§‹ęˆćÆäø€čˆ¬ēš„ćŖćƒćƒ¼ćƒ‰ć‚¦ć‚§ć‚¢ć§ć®ćƒ†ć‚¹ćƒˆć«åŸŗć„ć„ć¦ć„ć¾ć™ć€‚ćŠä½æć„ć®ćƒ‡ćƒć‚¤ć‚¹ć®å®Ÿéš›ć®ćƒ‘ćƒ•ć‚©ćƒ¼ćƒžćƒ³ć‚¹ćŒć“ć‚Œć‚‰ć®ćƒ‘ćƒ©ćƒ”ćƒ¼ć‚æćØē•°ćŖć‚‹å “åˆļ¼ˆä¾‹ļ¼šć‚ˆć‚Šé•·ć„ę™‚é–“ć‚„ć‚ˆć‚Šå¤§ććŖćƒćƒƒćƒć‚µć‚¤ć‚ŗć‚’å‡¦ē†ć§ćć‚‹ļ¼‰ć€ć‚ˆć‚Šå¾¹åŗ•ēš„ćŖćƒ†ć‚¹ćƒˆć‚’č”Œć„ć€`acestep/gpu_config.py` ć®ę§‹ęˆć‚’ęœ€é©åŒ–ć™ć‚‹ PR ć‚’ęå‡ŗć™ć‚‹ć“ćØć‚’ę­“čæŽć—ć¾ć™ć€‚ēš†ę§˜ć®č²¢ēŒ®ćŒć™ć¹ć¦ć®ćƒ¦ćƒ¼ć‚¶ćƒ¼ć®ä½“éØ“å‘äøŠć«å½¹ē«‹ć”ć¾ć™ļ¼ +> **ć‚³ćƒŸćƒ„ćƒ‹ćƒ†ć‚£č²¢ēŒ®ę­“čæŽ**: 上記の GPU ćƒ†ć‚£ć‚¢ę§‹ęˆćÆäø€čˆ¬ēš„ćŖćƒćƒ¼ćƒ‰ć‚¦ć‚§ć‚¢ć§ć®ćƒ†ć‚¹ćƒˆć«åŸŗć„ć„ć¦ć„ć¾ć™ć€‚ćŠä½æć„ć®ćƒ‡ćƒć‚¤ć‚¹ć®å®Ÿéš›ć®ćƒ‘ćƒ•ć‚©ćƒ¼ćƒžćƒ³ć‚¹ćŒć“ć‚Œć‚‰ć®ćƒ‘ćƒ©ćƒ”ćƒ¼ć‚æćØē•°ćŖć‚‹å “åˆļ¼ˆä¾‹ļ¼šć‚ˆć‚Šé•·ć„ę™‚é–“ć‚„ć‚ˆć‚Šå¤§ććŖćƒćƒƒćƒć‚µć‚¤ć‚ŗć‚’å‡¦ē†ć§ćć‚‹ļ¼‰ć€ć‚ˆć‚Šå¾¹åŗ•ēš„ćŖćƒ†ć‚¹ćƒˆć‚’č”Œć„ć€`acestep/gpu_config.py` ć®ę§‹ęˆć‚’ęœ€é©åŒ–ć™ć‚‹ PR ć‚’ęå‡ŗć™ć‚‹ć“ćØć‚’ę­“čæŽć—ć¾ć™ć€‚ ## ćƒ”ćƒ¢ćƒŖęœ€é©åŒ–ć®ćƒ’ćƒ³ćƒˆ -1. **低 VRAM (<8GB)**: ęœ€å¤§ę™‚é–“ć‚’å¾—ć‚‹ćŸć‚ć€LM åˆęœŸåŒ–ćŖć—ć® DiT ć®ćæćƒ¢ćƒ¼ćƒ‰ć‚’ä½æē”Ø -2. **äø­ VRAM (8-16GB)**: å“č³ŖćØćƒ”ćƒ¢ćƒŖć®ćƒćƒ©ćƒ³ć‚¹ćŒęœ€é©ćŖ 0.6B LM ćƒ¢ćƒ‡ćƒ«ć‚’ä½æē”Ø -3. **高 VRAM (>16GB)**: ć‚ˆć‚Šč‰Æć„ć‚Ŗćƒ¼ćƒ‡ć‚£ć‚Ŗē†č§£ćØē”Ÿęˆå“č³Ŗć®ćŸć‚ć€ć‚ˆć‚Šå¤§ććŖ LM ćƒ¢ćƒ‡ćƒ« (1.7B/4B) ć‚’ęœ‰åŠ¹åŒ– +1. **č¶…ä½Ž VRAM (≤6GB)**: LM åˆęœŸåŒ–ćŖć—ć® DiT ć®ćæćƒ¢ćƒ¼ćƒ‰ć‚’ä½æē”Øć€‚INT8 é‡å­åŒ–ćØå®Œå…Ø CPU ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ćŒåæ…é ˆć€‚VAE ćƒ‡ć‚³ćƒ¼ćƒ‰ćÆč‡Ŗå‹•ēš„ć« CPU ć«ćƒ•ć‚©ćƒ¼ćƒ«ćƒćƒƒć‚Æć™ć‚‹å “åˆćŒć‚ć‚Šć¾ć™ć€‚ +2. **低 VRAM (6-8GB)**: `pt` ćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰ć§ 0.6B LM ćƒ¢ćƒ‡ćƒ«ć‚’ä½æē”ØåÆčƒ½ć€‚ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ć‚’ęœ‰åŠ¹ć«äæć”ć¾ć™ć€‚ +3. **äø­ VRAM (8-16GB)**: 0.6B または 1.7B LM ćƒ¢ćƒ‡ćƒ«ć‚’ä½æē”Øć€‚Tier 4+ では `vllm` ćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰ćŒč‰Æå„½ć«å‹•ä½œć—ć¾ć™ć€‚ +4. **高 VRAM (16-24GB)**: ć‚ˆć‚Šå¤§ććŖ LM ćƒ¢ćƒ‡ćƒ«ļ¼ˆ1.7B ęŽØå„Øļ¼‰ć‚’ęœ‰åŠ¹åŒ–ć€‚20GB+ ć§ćÆé‡å­åŒ–ćÆć‚Ŗćƒ—ć‚·ćƒ§ćƒ³ć«ćŖć‚Šć¾ć™ć€‚ +5. 
**č¶…é«˜ VRAM (≄24GB)**: ć™ć¹ć¦ć®ćƒ¢ćƒ‡ćƒ«ćŒć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ć‚„é‡å­åŒ–ćŖć—ć§å‹•ä½œć€‚ęœ€é«˜å“č³Ŗć®ćŸć‚ 4B LM を使用。 ## ćƒ‡ćƒćƒƒć‚°ćƒ¢ćƒ¼ćƒ‰ļ¼šē•°ćŖć‚‹ GPU ę§‹ęˆć®ć‚·ćƒŸćƒ„ćƒ¬ćƒ¼ć‚·ćƒ§ćƒ³ @@ -40,17 +73,81 @@ ACE-Step 1.5 は GPU 恮 VRAM ć«č‡Ŗå‹•ēš„ć«é©åæœć—ć€ē”Ÿęˆę™‚é–“ć®åˆ¶é™ # 4GB GPU (Tier 1) ć‚’ć‚·ćƒŸćƒ„ćƒ¬ćƒ¼ćƒˆ MAX_CUDA_VRAM=4 uv run acestep +# 6GB GPU (Tier 2) ć‚’ć‚·ćƒŸćƒ„ćƒ¬ćƒ¼ćƒˆ +MAX_CUDA_VRAM=6 uv run acestep + # 8GB GPU (Tier 4) ć‚’ć‚·ćƒŸćƒ„ćƒ¬ćƒ¼ćƒˆ MAX_CUDA_VRAM=8 uv run acestep # 12GB GPU (Tier 5) ć‚’ć‚·ćƒŸćƒ„ćƒ¬ćƒ¼ćƒˆ MAX_CUDA_VRAM=12 uv run acestep -# 16GB GPU (Tier 6) ć‚’ć‚·ćƒŸćƒ„ćƒ¬ćƒ¼ćƒˆ +# 16GB GPU (Tier 6a) ć‚’ć‚·ćƒŸćƒ„ćƒ¬ćƒ¼ćƒˆ MAX_CUDA_VRAM=16 uv run acestep ``` +`MAX_CUDA_VRAM` ć‚’čØ­å®šć™ć‚‹ćØć€ć‚·ć‚¹ćƒ†ćƒ ćÆ `torch.cuda.set_per_process_memory_fraction()` を呼び出して VRAM ć®ćƒćƒ¼ćƒ‰ć‚­ćƒ£ćƒƒćƒ—ć‚’å¼·åˆ¶ć—ć€ćƒć‚¤ć‚Øćƒ³ćƒ‰ GPU ć§ć‚‚ćƒŖć‚¢ćƒ«ćŖć‚·ćƒŸćƒ„ćƒ¬ćƒ¼ć‚·ćƒ§ćƒ³ć‚’å®Ÿē¾ć—ć¾ć™ć€‚ + +### č‡Ŗå‹•ćƒ†ć‚£ć‚¢ćƒ†ć‚¹ćƒˆ + +UI ć§å„ćƒ†ć‚£ć‚¢ć‚’ę‰‹å‹•ćƒ†ć‚¹ćƒˆć™ć‚‹ä»£ć‚ć‚Šć«ć€`profile_inference.py` 恮 `tier-test` ćƒ¢ćƒ¼ćƒ‰ć‚’ä½æē”Øć§ćć¾ć™ļ¼š + +```bash +# ć™ć¹ć¦ć®ćƒ†ć‚£ć‚¢ć‚’č‡Ŗå‹•ćƒ†ć‚¹ćƒˆ +python profile_inference.py --mode tier-test + +# ē‰¹å®šć®ćƒ†ć‚£ć‚¢ć‚’ćƒ†ć‚¹ćƒˆ +python profile_inference.py --mode tier-test --tiers 6 8 16 + +# LM ć‚’ęœ‰åŠ¹ć«ć—ć¦ćƒ†ć‚¹ćƒˆļ¼ˆć‚µćƒćƒ¼ćƒˆć•ć‚Œć‚‹ćƒ†ć‚£ć‚¢ć§ļ¼‰ +python profile_inference.py --mode tier-test --tier-with-lm + +# é«˜é€Ÿćƒ†ć‚¹ćƒˆļ¼ˆéžé‡å­åŒ–ćƒ†ć‚£ć‚¢ć§ torch.compile ć‚’ć‚¹ć‚­ćƒƒćƒ—ļ¼‰ +python profile_inference.py --mode tier-test --tier-skip-compile +``` + +ćƒ—ćƒ­ćƒ•ć‚”ć‚¤ćƒŖćƒ³ć‚°ćƒ„ćƒ¼ćƒ«ć®å®Œå…ØćŖćƒ‰ć‚­ćƒ„ćƒ”ćƒ³ćƒˆćÆ [BENCHMARK.md](BENCHMARK.md) ć‚’å‚ē…§ć—ć¦ćć ć•ć„ć€‚ + ē”Øé€”ļ¼š - ćƒć‚¤ć‚Øćƒ³ćƒ‰ćƒćƒ¼ćƒ‰ć‚¦ć‚§ć‚¢ć§ GPU ćƒ†ć‚£ć‚¢ę§‹ęˆć‚’ćƒ†ć‚¹ćƒˆ - å„ćƒ†ć‚£ć‚¢ć®č­¦å‘ŠćØåˆ¶é™ćŒę­£ć—ćę©Ÿčƒ½ć™ć‚‹ć“ćØć‚’ē¢ŗčŖ -- PR ć‚’ęå‡ŗć™ć‚‹å‰ć«ę–°ć—ć„ GPU ę§‹ęˆćƒ‘ćƒ©ćƒ”ćƒ¼ć‚æć‚’é–‹ē™ŗćƒ»ćƒ†ć‚¹ćƒˆ +- `acestep/gpu_config.py` å¤‰ę›“å¾Œć®č‡Ŗå‹•å›žåø°ćƒ†ć‚¹ćƒˆ +- CI/CD VRAM äŗ’ę›ę€§ę¤œčØ¼ + +### å¢ƒē•Œćƒ†ć‚¹ćƒˆļ¼ˆęœ€å°ćƒ†ć‚£ć‚¢ć®ē‰¹å®šļ¼‰ + +`--tier-boundary` を使用すると、INT8 é‡å­åŒ–ćØ CPU ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ć‚’å®‰å…Øć«ē„”åŠ¹åŒ–ć§ćć‚‹ęœ€å° VRAM ćƒ†ć‚£ć‚¢ć‚’å®ŸéØ“ēš„ć«ē‰¹å®šć§ćć¾ć™ć€‚å„ćƒ†ć‚£ć‚¢ć«åÆ¾ć—ć¦ęœ€å¤§3ć¤ć®ę§‹ęˆć§ćƒ†ć‚¹ćƒˆć—ć¾ć™ļ¼š + +1. **default** — ćƒ†ć‚£ć‚¢ć®ęØ™ęŗ–čØ­å®šļ¼ˆé‡å­åŒ– + ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ć‚’čØ­å®šé€šć‚Šć«ä½æē”Øļ¼‰ +2. **no-quant** — ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰čØ­å®šćÆćć®ć¾ć¾ć€é‡å­åŒ–ć‚’ē„”åŠ¹åŒ– +3. 
**no-offload** — é‡å­åŒ–ćŖć—ć€CPU ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ćŖć—ļ¼ˆć™ć¹ć¦ć®ćƒ¢ćƒ‡ćƒ«ć‚’ GPU ć«äæęŒļ¼‰ + +```bash +# ć™ć¹ć¦ć®ćƒ†ć‚£ć‚¢ć§å¢ƒē•Œćƒ†ć‚¹ćƒˆć‚’å®Ÿč”Œ +python profile_inference.py --mode tier-test --tier-boundary + +# ē‰¹å®šć®ćƒ†ć‚£ć‚¢ć®å¢ƒē•Œćƒ†ć‚¹ćƒˆ +python profile_inference.py --mode tier-test --tier-boundary --tiers 8 12 16 20 24 + +# LM ć‚’ęœ‰åŠ¹ć«ć—ćŸå¢ƒē•Œćƒ†ć‚¹ćƒˆļ¼ˆć‚µćƒćƒ¼ćƒˆć•ć‚Œć‚‹ćƒ†ć‚£ć‚¢ć§ļ¼‰ +python profile_inference.py --mode tier-test --tier-boundary --tier-with-lm + +# ēµęžœć‚’ JSON ć«äæå­˜ +python profile_inference.py --mode tier-test --tier-boundary --benchmark-output boundary_results.json +``` + +> **ę³Øę„ļ¼š** å¢ƒē•Œćƒ†ć‚¹ćƒˆēµęžœćÆēµŒéØ“ēš„ćŖć‚‚ć®ć§ć‚ć‚Šć€DiT ćƒ¢ćƒ‡ćƒ«ćƒćƒŖć‚¢ćƒ³ćƒˆļ¼ˆturbo vs base)、LM ć®ęœ‰åŠ¹åŒ–ēŠ¶ę…‹ć€ē”Ÿęˆę™‚é–“ć€flash attention ć®åˆ©ē”ØåÆå¦ć«ć‚ˆć£ć¦ē•°ćŖć‚‹å “åˆćŒć‚ć‚Šć¾ć™ć€‚ + +### ćƒćƒƒćƒć‚µć‚¤ć‚ŗå¢ƒē•Œćƒ†ć‚¹ćƒˆ + +`--tier-batch-boundary` ć‚’ä½æē”Øć—ć¦ć€ćƒćƒƒćƒć‚µć‚¤ć‚ŗ 1态2态4态8 ć‚’ę®µéšŽēš„ć«ćƒ†ć‚¹ćƒˆć—ć€å„ćƒ†ć‚£ć‚¢ć®ęœ€å¤§å®‰å…Øćƒćƒƒćƒć‚µć‚¤ć‚ŗć‚’č¦‹ć¤ć‘ć¾ć™ļ¼š + +```bash +# LM ęœ‰åŠ¹ć§ćƒćƒƒćƒå¢ƒē•Œćƒ†ć‚¹ćƒˆć‚’å®Ÿč”Œ +python profile_inference.py --mode tier-test --tier-batch-boundary --tier-with-lm + +# ē‰¹å®šć®ćƒ†ć‚£ć‚¢ć‚’ćƒ†ć‚¹ćƒˆ +python profile_inference.py --mode tier-test --tier-batch-boundary --tier-with-lm --tiers 8 12 16 24 +``` + +LM 恂悊/ćŖć—ć®äø”ę–¹ć®ę§‹ęˆć‚’ćƒ†ć‚¹ćƒˆć—ć€å„ćƒ†ć‚£ć‚¢ć®ęœ€å¤§ęˆåŠŸćƒćƒƒćƒć‚µć‚¤ć‚ŗć‚’å ±å‘Šć—ć¾ć™ć€‚ diff --git a/docs/ja/GRADIO_GUIDE.md b/docs/ja/GRADIO_GUIDE.md index 75d27f9a..5053377b 100644 --- a/docs/ja/GRADIO_GUIDE.md +++ b/docs/ja/GRADIO_GUIDE.md @@ -62,17 +62,23 @@ Gradioć‚¤ćƒ³ć‚æćƒ¼ćƒ•ć‚§ćƒ¼ć‚¹ćÆä»„äø‹ć®äø»č¦ć‚»ć‚Æć‚·ćƒ§ćƒ³ć§ę§‹ęˆć•ć‚Œ | 設定 | čŖ¬ę˜Ž | |---------|-------------| -| **5Hz LMćƒ¢ćƒ‡ćƒ«ćƒ‘ć‚¹** | čØ€čŖžćƒ¢ćƒ‡ćƒ«ć‚’éøęŠžļ¼ˆä¾‹ļ¼š`acestep-5Hz-lm-0.6B`态`acestep-5Hz-lm-1.7B`)| -| **5Hz LMćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰** | `vllm`ļ¼ˆć‚ˆć‚Šé«˜é€Ÿć€ęŽØå„Øļ¼‰ć¾ćŸćÆ `pt`(PyTorchć€äŗ’ę›ę€§ćŒé«˜ć„ļ¼‰| -| **5Hz LMć‚’åˆęœŸåŒ–** | åˆęœŸåŒ–ę™‚ć«LMć‚’čŖ­ćæč¾¼ć‚€ćŸć‚ć«ćƒć‚§ćƒƒć‚Æļ¼ˆthinkingćƒ¢ćƒ¼ćƒ‰ć«åæ…č¦ļ¼‰| +| **5Hz LMćƒ¢ćƒ‡ćƒ«ćƒ‘ć‚¹** | čØ€čŖžćƒ¢ćƒ‡ćƒ«ć‚’éøęŠžć€‚**åˆ©ē”ØåÆčƒ½ćŖćƒ¢ćƒ‡ćƒ«ćÆGPUćƒ†ć‚£ć‚¢ć«åŸŗć„ć„ć¦č‡Ŗå‹•ćƒ•ć‚£ćƒ«ć‚æćƒŖćƒ³ć‚°**ć•ć‚Œć¾ć™ — ä¾‹ļ¼š6-8GB GPUでは0.6Bのみ、24GB+ GPUć§ćÆć™ć¹ć¦ć®ć‚µć‚¤ć‚ŗļ¼ˆ0.6B态1.7B态4Bļ¼‰ćŒč”Øē¤ŗć•ć‚Œć¾ć™ć€‚| +| **5Hz LMćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰** | `vllm`ļ¼ˆć‚ˆć‚Šé«˜é€Ÿć€VRAM ≄8GB恮NVIDIA GPUęŽØå„Øļ¼‰ć€`pt`(PyTorchć€ćƒ¦ćƒ‹ćƒćƒ¼ć‚µćƒ«ćƒ•ć‚©ćƒ¼ćƒ«ćƒćƒƒć‚Æļ¼‰ć€ć¾ćŸćÆ `mlx`(Apple Silicon)。**VRAM <8GB恮GPUでは `pt`/`mlx` ć«åˆ¶é™**ć•ć‚Œć¾ć™ļ¼ˆvllm恮KVć‚­ćƒ£ćƒƒć‚·ćƒ„ćŒćƒ”ćƒ¢ćƒŖć‚’ę¶ˆč²»ć—ć™ćŽć‚‹ćŸć‚ļ¼‰ć€‚| +| **5Hz LMć‚’åˆęœŸåŒ–** | åˆęœŸåŒ–ę™‚ć«LMć‚’čŖ­ćæč¾¼ć‚€ćŸć‚ć«ćƒć‚§ćƒƒć‚Æļ¼ˆthinkingćƒ¢ćƒ¼ćƒ‰ć«åæ…č¦ļ¼‰ć€‚**VRAM ≤6GB恮GPU(Tier 1-2ļ¼‰ć§ćÆćƒ‡ćƒ•ć‚©ćƒ«ćƒˆć§ćƒć‚§ćƒƒć‚ÆćŖć—ćƒ»ē„”åŠ¹ć€‚**| + +> **ć‚¢ćƒ€ćƒ—ćƒ†ć‚£ćƒ–ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆ**: すべてのLM設定はGPU恮VRAMćƒ†ć‚£ć‚¢ć«åŸŗć„ć„ć¦č‡Ŗå‹•ę§‹ęˆć•ć‚Œć¾ć™ć€‚ęŽØå„ØLMćƒ¢ćƒ‡ćƒ«ć€ćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰ć€åˆęœŸåŒ–ēŠ¶ę…‹ćÆęœ€é©ćŖćƒ‘ćƒ•ć‚©ćƒ¼ćƒžćƒ³ć‚¹ć«äŗ‹å‰čØ­å®šć•ć‚Œć¦ć„ć¾ć™ć€‚ę‰‹å‹•ć§äøŠę›øćć§ćć¾ć™ćŒć€GPUćØäŗ’ę›ę€§ć®ćŖć„éøęŠžć‚’ć—ćŸå “åˆć€ć‚·ć‚¹ćƒ†ćƒ ćŒč­¦å‘Šć‚’č”Øē¤ŗć—ć¾ć™ć€‚ ### ćƒ‘ćƒ•ć‚©ćƒ¼ćƒžćƒ³ć‚¹ć‚Ŗćƒ—ć‚·ćƒ§ćƒ³ | 設定 | čŖ¬ę˜Ž | |---------|-------------| | **Flash Attentionを使用** | ć‚ˆć‚Šé«˜é€ŸćŖęŽØč«–ć®ćŸć‚ć«ęœ‰åŠ¹åŒ–ļ¼ˆflash_attnćƒ‘ćƒƒć‚±ćƒ¼ć‚øćŒåæ…č¦ļ¼‰| -| **CPUć«ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰** | 
ć‚¢ć‚¤ćƒ‰ćƒ«ę™‚ć«ćƒ¢ćƒ‡ćƒ«ć‚’CPUć«ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ć—ć¦GPUćƒ”ćƒ¢ćƒŖć‚’ēÆ€ē“„ | -| **DiT悒CPUć«ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰** | DiTćƒ¢ćƒ‡ćƒ«ć‚’ē‰¹ć«CPUć«ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ | +| **CPUć«ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰** | ć‚¢ć‚¤ćƒ‰ćƒ«ę™‚ć«ćƒ¢ćƒ‡ćƒ«ć‚’CPUć«ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ć—ć¦GPUćƒ”ćƒ¢ćƒŖć‚’ēÆ€ē“„ć€‚**VRAM <20GB恮GPUć§ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆč‡Ŗå‹•ęœ‰åŠ¹ć€‚**| +| **DiT悒CPUć«ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰** | DiTćƒ¢ćƒ‡ćƒ«ć‚’ē‰¹ć«CPUć«ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ć€‚**VRAM <12GB恮GPUć§ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆč‡Ŗå‹•ęœ‰åŠ¹ć€‚**| +| **INT8量子化** | INT8é‡ćæé‡å­åŒ–ć§ćƒ¢ćƒ‡ćƒ«ć®VRAMä½æē”Øé‡ć‚’å‰Šęø›ć€‚**VRAM <20GB恮GPUć§ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆč‡Ŗå‹•ęœ‰åŠ¹ć€‚**| +| **ćƒ¢ćƒ‡ćƒ«ć‚³ćƒ³ćƒ‘ć‚¤ćƒ«** | ęœ€é©åŒ–ęŽØč«–ć®ćŸć‚ `torch.compile` ć‚’ęœ‰åŠ¹åŒ–ć€‚**ć™ć¹ć¦ć®ćƒ†ć‚£ć‚¢ć§ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆęœ‰åŠ¹**ļ¼ˆé‡å­åŒ–ćŒć‚¢ć‚Æćƒ†ć‚£ćƒ–ćŖå “åˆć«åæ…č¦ļ¼‰ć€‚| + +> **ćƒ†ć‚£ć‚¢åÆ¾åæœčØ­å®š**: ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ć€é‡å­åŒ–ć€ć‚³ćƒ³ćƒ‘ć‚¤ćƒ«ć‚Ŗćƒ—ć‚·ćƒ§ćƒ³ćÆGPUćƒ†ć‚£ć‚¢ć«åŸŗć„ć„ć¦č‡Ŗå‹•čØ­å®šć•ć‚Œć¾ć™ć€‚å®Œå…ØćŖćƒ†ć‚£ć‚¢ćƒ†ćƒ¼ćƒ–ćƒ«ćÆ [GPU_COMPATIBILITY.md](../ja/GPU_COMPATIBILITY.md) ć‚’å‚ē…§ć—ć¦ćć ć•ć„ć€‚ ### LoRAć‚¢ćƒ€ćƒ—ć‚æćƒ¼ @@ -87,7 +93,12 @@ Gradioć‚¤ćƒ³ć‚æćƒ¼ćƒ•ć‚§ćƒ¼ć‚¹ćÆä»„äø‹ć®äø»č¦ć‚»ć‚Æć‚·ćƒ§ćƒ³ć§ę§‹ęˆć•ć‚Œ ### åˆęœŸåŒ– -**ć‚µćƒ¼ćƒ“ć‚¹ć‚’åˆęœŸåŒ–** ć‚’ć‚ÆćƒŖćƒƒć‚Æć—ć¦ćƒ¢ćƒ‡ćƒ«ć‚’čŖ­ćæč¾¼ćæć¾ć™ć€‚ć‚¹ćƒ†ćƒ¼ć‚æć‚¹ćƒœćƒƒć‚Æć‚¹ć«é€²ę—ćØē¢ŗčŖćŒč”Øē¤ŗć•ć‚Œć¾ć™ć€‚ +**ć‚µćƒ¼ćƒ“ć‚¹ć‚’åˆęœŸåŒ–** ć‚’ć‚ÆćƒŖćƒƒć‚Æć—ć¦ćƒ¢ćƒ‡ćƒ«ć‚’čŖ­ćæč¾¼ćæć¾ć™ć€‚ć‚¹ćƒ†ćƒ¼ć‚æć‚¹ćƒœćƒƒć‚Æć‚¹ć«ä»„äø‹ć‚’å«ć‚€é€²ę—ćØē¢ŗčŖćŒč”Øē¤ŗć•ć‚Œć¾ć™ļ¼š +- ę¤œå‡ŗć•ć‚ŒćŸGPUćƒ†ć‚£ć‚¢ćØVRAM +- ęœ€å¤§čØ±å®¹ę™‚é–“ćØćƒćƒƒćƒć‚µć‚¤ć‚ŗļ¼ˆLMćŒåˆęœŸåŒ–ć•ć‚ŒćŸć‹ć©ć†ć‹ć«åŸŗć„ć„ć¦å‹•ēš„ć«čŖæę•“ļ¼‰ +- č‡Ŗå‹•äæ®ę­£ć•ć‚ŒćŸäŗ’ę›ę€§ć®ćŖć„čØ­å®šć«é–¢ć™ć‚‹č­¦å‘Š + +åˆęœŸåŒ–å¾Œć€**ć‚Ŗćƒ¼ćƒ‡ć‚£ć‚Ŗę™‚é–“** と **ćƒćƒƒćƒć‚µć‚¤ć‚ŗ** ć‚¹ćƒ©ć‚¤ćƒ€ćƒ¼ćÆćƒ†ć‚£ć‚¢ć®åˆ¶é™ć‚’åę˜ ć™ć‚‹ć‚ˆć†ć«č‡Ŗå‹•ę›“ę–°ć•ć‚Œć¾ć™ć€‚ --- @@ -515,15 +526,19 @@ LoRAćƒˆćƒ¬ćƒ¼ćƒ‹ćƒ³ć‚°ć‚æćƒ–ćÆć‚«ć‚¹ć‚æćƒ LoRAć‚¢ćƒ€ćƒ—ć‚æćƒ¼ć‚’ä½œęˆć™ć‚‹ćŸ - ē•°ćŖć‚‹ć‚·ćƒ¼ćƒ‰ć‚’č©¦ć™ - captionć‚’ć‚ˆć‚Šå…·ä½“ēš„ć«ć™ć‚‹ -**ćƒ”ćƒ¢ćƒŖäøč¶³ļ¼š** -- ćƒćƒƒćƒć‚µć‚¤ć‚ŗć‚’ęø›ć‚‰ć™ -- CPUć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ć‚’ęœ‰åŠ¹åŒ– +**ćƒ”ćƒ¢ćƒŖäøč¶³ļ¼ˆOOMļ¼‰ļ¼š** +- ć‚·ć‚¹ćƒ†ćƒ ćÆč‡Ŗå‹•VRAMć‚¬ćƒ¼ćƒ‰ļ¼ˆćƒćƒƒćƒč‡Ŗå‹•å‰Šęø›ļ¼‰ćØć‚¢ćƒ€ćƒ—ćƒ†ć‚£ćƒ–VAEćƒ‡ć‚³ćƒ¼ćƒ‰ļ¼ˆCPUćƒ•ć‚©ćƒ¼ćƒ«ćƒćƒƒć‚Æļ¼‰ć‚’å«ćæć¾ć™ć€‚ćć‚Œć§ć‚‚OOMćŒē™ŗē”Ÿć™ć‚‹å “åˆļ¼š +- ę‰‹å‹•ć§ćƒćƒƒćƒć‚µć‚¤ć‚ŗć‚’ęø›ć‚‰ć™ +- CPUć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ć‚’ęœ‰åŠ¹åŒ–ļ¼ˆVRAM <20GBć§ćÆč‡Ŗå‹•ęœ‰åŠ¹ć®ćÆćšļ¼‰ +- INT8é‡å­åŒ–ć‚’ęœ‰åŠ¹åŒ–ļ¼ˆVRAM <20GBć§ćÆč‡Ŗå‹•ęœ‰åŠ¹ć®ćÆćšļ¼‰ - LMćƒćƒƒćƒćƒćƒ£ćƒ³ć‚Æć‚µć‚¤ć‚ŗć‚’ęø›ć‚‰ć™ +- å„ćƒ†ć‚£ć‚¢ć®ęŽØå„ØčØ­å®šćÆ [GPU_COMPATIBILITY.md](../ja/GPU_COMPATIBILITY.md) ć‚’å‚ē…§ **LMćŒę©Ÿčƒ½ć—ćŖć„ļ¼š** -- åˆęœŸåŒ–ę™‚ć«ć€Œ5Hz LMć‚’åˆęœŸåŒ–ć€ćŒćƒć‚§ćƒƒć‚Æć•ć‚Œć¦ć„ćŸć“ćØć‚’ē¢ŗčŖ -- ęœ‰åŠ¹ćŖLMćƒ¢ćƒ‡ćƒ«ćƒ‘ć‚¹ćŒéøęŠžć•ć‚Œć¦ć„ć‚‹ć“ćØć‚’ē¢ŗčŖ -- vllmまたはPyTorchćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰ćŒåˆ©ē”ØåÆčƒ½ć§ć‚ć‚‹ć“ćØć‚’ē¢ŗčŖ +- åˆęœŸåŒ–ę™‚ć«ć€Œ5Hz LMć‚’åˆęœŸåŒ–ć€ćŒćƒć‚§ćƒƒć‚Æć•ć‚Œć¦ć„ćŸć“ćØć‚’ē¢ŗčŖļ¼ˆVRAM ≤6GB恮GPUć§ćÆćƒ‡ćƒ•ć‚©ćƒ«ćƒˆē„”åŠ¹ļ¼‰ +- ęœ‰åŠ¹ćŖLMćƒ¢ćƒ‡ćƒ«ćƒ‘ć‚¹ćŒéøęŠžć•ć‚Œć¦ć„ć‚‹ć“ćØć‚’ē¢ŗčŖļ¼ˆćƒ†ć‚£ć‚¢äŗ’ę›ćƒ¢ćƒ‡ćƒ«ć®ćæč”Øē¤ŗļ¼‰ +- vllmまたはPyTorchćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰ćŒåˆ©ē”ØåÆčƒ½ć§ć‚ć‚‹ć“ćØć‚’ē¢ŗčŖļ¼ˆVRAM <8GBではvllmåˆ¶é™ļ¼‰ +- LMćƒć‚§ćƒƒć‚Æćƒœćƒƒć‚Æć‚¹ćŒć‚°ćƒ¬ćƒ¼ć‚¢ć‚¦ćƒˆć—ć¦ć„ć‚‹å “åˆć€GPUćƒ†ć‚£ć‚¢ćŒLMć‚’ć‚µćƒćƒ¼ćƒˆć—ć¦ć„ć¾ć›ć‚“ — DiTć®ćæćƒ¢ćƒ¼ćƒ‰ć‚’ä½æē”Ø --- diff --git a/docs/ja/INFERENCE.md b/docs/ja/INFERENCE.md index 6473d7f4..e6036c4b 
100644 --- a/docs/ja/INFERENCE.md +++ b/docs/ja/INFERENCE.md @@ -709,8 +709,8 @@ caption="é€Ÿć„é…ć„éŸ³ę„½" # ćƒ†ćƒ³ćƒć®ēŸ›ē›¾ ### ć‚ˆćć‚ć‚‹å•é”Œ -**問锌**ļ¼šćƒ”ćƒ¢ćƒŖäøč¶³ć‚Øćƒ©ćƒ¼ -- **解決策**:`batch_size`态`inference_steps` を減らすか、CPUć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ć‚’ęœ‰åŠ¹åŒ– +**問锌**ļ¼šćƒ”ćƒ¢ćƒŖäøč¶³ļ¼ˆOOMļ¼‰ć‚Øćƒ©ćƒ¼ +- **解決策**ļ¼šć‚·ć‚¹ćƒ†ćƒ ćÆ VRAM ć‚¬ćƒ¼ćƒ‰ļ¼ˆćƒćƒƒćƒč‡Ŗå‹•å‰Šęø›ļ¼‰ćØć‚¢ćƒ€ćƒ—ćƒ†ć‚£ćƒ– VAE ćƒ‡ć‚³ćƒ¼ćƒ‰ļ¼ˆCPU ćƒ•ć‚©ćƒ¼ćƒ«ćƒćƒƒć‚Æļ¼‰ć«ć‚ˆć‚Šć€ć»ćØć‚“ć©ć® OOM ć‚·ćƒŠćƒŖć‚Ŗć‚’č‡Ŗå‹•å‡¦ē†ć—ć¾ć™ć€‚ćć‚Œć§ć‚‚ OOM ćŒē™ŗē”Ÿć™ć‚‹å “åˆļ¼š`batch_size` を減らす、`inference_steps` を減らす、CPU ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ļ¼ˆ`offload_to_cpu=True`ļ¼‰ć‚’ęœ‰åŠ¹åŒ–ć€ć¾ćŸćÆ INT8 é‡å­åŒ–ć‚’ęœ‰åŠ¹åŒ–ć—ć¦ćć ć•ć„ć€‚å„ VRAM ćƒ†ć‚£ć‚¢ć®ęŽØå„ØčØ­å®šćÆ [GPU_COMPATIBILITY.md](../ja/GPU_COMPATIBILITY.md) ć‚’å‚ē…§ć—ć¦ćć ć•ć„ć€‚ **問锌**ļ¼šēµęžœć®å“č³ŖćŒę‚Ŗć„ - **解決策**:`inference_steps` を増やす、`guidance_scale` を調敓、basećƒ¢ćƒ‡ćƒ«ć‚’ä½æē”Ø diff --git a/docs/ja/INSTALL.md b/docs/ja/INSTALL.md index c18d3eff..13f64240 100644 --- a/docs/ja/INSTALL.md +++ b/docs/ja/INSTALL.md @@ -468,7 +468,7 @@ ACESTEP_INIT_LLM=false | `--init_llm` | auto | LLM åˆęœŸåŒ–ļ¼š`true` / `false` / ēœē•„ć§č‡Ŗå‹• | | `--config_path` | auto | DiT ćƒ¢ćƒ‡ćƒ«ļ¼ˆä¾‹ļ¼š`acestep-v15-turbo`) | | `--lm_model_path` | auto | LM ćƒ¢ćƒ‡ćƒ«ļ¼ˆä¾‹ļ¼š`acestep-5Hz-lm-1.7B`) | -| `--offload_to_cpu` | auto | CPU ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ļ¼ˆVRAM < 16GB ć§č‡Ŗå‹•ęœ‰åŠ¹åŒ–ļ¼‰ | +| `--offload_to_cpu` | auto | CPU ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ļ¼ˆGPU ćƒ†ć‚£ć‚¢ć«åŸŗć„ć„ć¦č‡Ŗå‹•čØ­å®šļ¼‰ | | `--download-source` | auto | ćƒ¢ćƒ‡ćƒ«ć‚½ćƒ¼ć‚¹ļ¼š`auto` / `huggingface` / `modelscope` | | `--enable-api` | false | Gradio UI ćØåŒę™‚ć« REST API ć‚Øćƒ³ćƒ‰ćƒć‚¤ćƒ³ćƒˆć‚’ęœ‰åŠ¹åŒ– | @@ -529,16 +529,17 @@ huggingface-cli download ACE-Step/acestep-5Hz-lm-4B --local-dir ./checkpoints/ac ## šŸ’” ć©ć®ćƒ¢ćƒ‡ćƒ«ć‚’éøć¶ć¹ćļ¼Ÿ -ACE-Step は GPU 恮 VRAM ć«č‡Ŗå‹•é©åæœć—ć¾ć™ļ¼š +ACE-Step は GPU 恮 VRAM ć«č‡Ŗå‹•é©åæœć—ć¾ć™ć€‚UI ćÆę¤œå‡ŗć•ć‚ŒćŸ GPU ćƒ†ć‚£ć‚¢ć«åŸŗć„ć„ć¦ć™ć¹ć¦ć®čØ­å®šļ¼ˆLM ćƒ¢ćƒ‡ćƒ«ć€ćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰ć€ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ć€é‡å­åŒ–ļ¼‰ć‚’äŗ‹å‰ę§‹ęˆć—ć¾ć™ļ¼š -| GPU VRAM | ęŽØå„Ø LM ćƒ¢ćƒ‡ćƒ« | å‚™č€ƒ | -|----------|---------------|------| -| **≤6GB** | ćŖć—ļ¼ˆDiTのみ) | ćƒ”ćƒ¢ćƒŖēÆ€ē“„ć®ćŸć‚ LM ćÆćƒ‡ćƒ•ć‚©ćƒ«ćƒˆć§ē„”åŠ¹ | -| **6-12GB** | `acestep-5Hz-lm-0.6B` | č»½é‡ć€ćƒćƒ©ćƒ³ć‚¹ćŒč‰Æć„ | -| **12-16GB** | `acestep-5Hz-lm-1.7B` | ć‚ˆć‚Šé«˜å“č³Ŗ | -| **≄16GB** | `acestep-5Hz-lm-4B` | ęœ€é«˜å“č³ŖćØéŸ³å£°ē†č§£čƒ½åŠ› | +| GPU VRAM | ęŽØå„Ø LM ćƒ¢ćƒ‡ćƒ« | ćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰ | å‚™č€ƒ | +|----------|---------------|-------------|------| +| **≤6GB** | ćŖć—ļ¼ˆDiTのみ) | — | LM ćÆćƒ‡ćƒ•ć‚©ćƒ«ćƒˆć§ē„”åŠ¹ļ¼›INT8 量子化 + å®Œå…Ø CPU ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ | +| **6-8GB** | `acestep-5Hz-lm-0.6B` | `pt` | č»½é‡ LM态PyTorch ćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰ | +| **8-16GB** | `0.6B` / `1.7B` | `vllm` | 8-12GB は 0.6B态12-16GB は 1.7B | +| **16-24GB** | `acestep-5Hz-lm-1.7B` | `vllm` | 20GB+ 恧 4B åˆ©ē”ØåÆčƒ½ļ¼›20GB+ ć§ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰äøč¦ | +| **≄24GB** | `acestep-5Hz-lm-4B` | `vllm` | ęœ€é«˜å“č³Ŗć€ć™ć¹ć¦ć®ćƒ¢ćƒ‡ćƒ«ćŒć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ćŖć—ć§å‹•ä½œ | -> šŸ“– GPU äŗ’ę›ę€§ć®č©³ē“°ļ¼ˆę™‚é–“åˆ¶é™ć€ćƒćƒƒćƒć‚µć‚¤ć‚ŗć€ćƒ”ćƒ¢ćƒŖęœ€é©åŒ–ļ¼‰ćÆ [GPU äŗ’ę›ę€§ć‚¬ć‚¤ćƒ‰](GPU_COMPATIBILITY.md) ć‚’å‚ē…§ć—ć¦ćć ć•ć„ć€‚ +> šŸ“– GPU äŗ’ę›ę€§ć®č©³ē“°ļ¼ˆćƒ†ć‚£ć‚¢ćƒ†ćƒ¼ćƒ–ćƒ«ć€ę™‚é–“åˆ¶é™ć€ćƒćƒƒćƒć‚µć‚¤ć‚ŗć€ć‚¢ćƒ€ćƒ—ćƒ†ć‚£ćƒ– UI ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆć€ćƒ”ćƒ¢ćƒŖęœ€é©åŒ–ļ¼‰ćÆ [GPU äŗ’ę›ę€§ć‚¬ć‚¤ćƒ‰](GPU_COMPATIBILITY.md) ć‚’å‚ē…§ć—ć¦ćć ć•ć„ć€‚ --- diff --git 
a/docs/ko/GPU_COMPATIBILITY.md b/docs/ko/GPU_COMPATIBILITY.md index 38bfe625..0ceb9b6f 100644 --- a/docs/ko/GPU_COMPATIBILITY.md +++ b/docs/ko/GPU_COMPATIBILITY.md @@ -1,36 +1,69 @@ # GPU ķ˜øķ™˜ģ„± ź°€ģ“ė“œ -ACE-Step 1.5ėŠ” GPUģ˜ ģ‚¬ģš© ź°€ėŠ„ķ•œ VRAM에 ģžė™ģœ¼ė”œ ģ ģ‘ķ•˜ģ—¬ ģƒģ„± ģ œķ•œ ė° LM ėŖØėø ź°€ģš©ģ„±ģ„ ģ ģ ˆķžˆ ģ”°ģ •ķ•©ė‹ˆė‹¤. ģ‹œģŠ¤ķ…œģ€ ģ‹œģž‘ ģ‹œ GPU 메모리넼 ź°ģ§€ķ•˜ź³  ģµœģ ģ˜ ģ„¤ģ •ģ„ źµ¬ģ„±ķ•©ė‹ˆė‹¤. +ACE-Step 1.5ėŠ” GPUģ˜ ģ‚¬ģš© ź°€ėŠ„ķ•œ VRAM에 ģžė™ģœ¼ė”œ ģ ģ‘ķ•˜ģ—¬ ģƒģ„± ģ œķ•œ, LM ėŖØėø ź°€ģš©ģ„±, ģ˜¤ķ”„ė”œė“œ ģ „ėžµ ė° UI źø°ė³ø ģ„¤ģ •ģ„ ģ ģ ˆķžˆ ģ”°ģ •ķ•©ė‹ˆė‹¤. ģ‹œģŠ¤ķ…œģ€ ģ‹œģž‘ ģ‹œ GPU 메모리넼 ź°ģ§€ķ•˜ź³  ģµœģ ģ˜ ģ„¤ģ •ģ„ źµ¬ģ„±ķ•©ė‹ˆė‹¤. ## GPU ķ‹°ģ–“ 구성 -| VRAM | ķ‹°ģ–“ | LM ėŖØė“œ | ģµœėŒ€ ģƒģ„± źøøģ“ | ģµœėŒ€ 배치 크기 | LM 메모리 할당 | -|------|------|---------|--------------|----------------|---------------------| -| ≤4GB | ķ‹°ģ–“ 1 | ģ‚¬ģš© ė¶ˆź°€ | 3ė¶„ | 1 | - | -| 4-6GB | ķ‹°ģ–“ 2 | ģ‚¬ģš© ė¶ˆź°€ | 6ė¶„ | 1 | - | -| 6-8GB | ķ‹°ģ–“ 3 | 0.6B (ģ„ ķƒ 사항) | LM ģ‚¬ģš© ģ‹œ: 4ė¶„ / ėÆøģ‚¬ģš© ģ‹œ: 6ė¶„ | LM ģ‚¬ģš© ģ‹œ: 1 / ėÆøģ‚¬ģš© ģ‹œ: 2 | 3GB | -| 8-12GB | ķ‹°ģ–“ 4 | 0.6B (ģ„ ķƒ 사항) | LM ģ‚¬ģš© ģ‹œ: 4ė¶„ / ėÆøģ‚¬ģš© ģ‹œ: 6ė¶„ | LM ģ‚¬ģš© ģ‹œ: 2 / ėÆøģ‚¬ģš© ģ‹œ: 4 | 3GB | -| 12-16GB | ķ‹°ģ–“ 5 | 0.6B / 1.7B | LM ģ‚¬ģš© ģ‹œ: 4ė¶„ / ėÆøģ‚¬ģš© ģ‹œ: 6ė¶„ | LM ģ‚¬ģš© ģ‹œ: 2 / ėÆøģ‚¬ģš© ģ‹œ: 4 | 0.6B: 3GB, 1.7B: 8GB | -| 16-24GB | ķ‹°ģ–“ 6 | 0.6B / 1.7B / 4B | 8ė¶„ | LM ģ‚¬ģš© ģ‹œ: 4 / ėÆøģ‚¬ģš© ģ‹œ: 8 | 0.6B: 3GB, 1.7B: 8GB, 4B: 12GB | -| ≄24GB | ģ œķ•œ ģ—†ģŒ | ėŖØė“  ėŖØėø | 10ė¶„ | 8 | ģ œķ•œ ģ—†ģŒ | +| VRAM | ķ‹°ģ–“ | LM ėŖØėø | ģ¶”ģ²œ LM | ė°±ģ—”ė“œ | ģµœėŒ€ źøøģ“ (LM ģ‚¬ģš© / ėÆøģ‚¬ģš©) | ģµœėŒ€ 배치 (LM ģ‚¬ģš© / ėÆøģ‚¬ģš©) | ģ˜¤ķ”„ė”œė“œ | ģ–‘ģžķ™” | +|------|------|---------|---------|--------|------------------------------|------------------------------|----------|--------| +| ≤4GB | ķ‹°ģ–“ 1 | ģ—†ģŒ | — | pt | 4ė¶„ / 6ė¶„ | 1 / 1 | CPU + DiT | INT8 | +| 4-6GB | ķ‹°ģ–“ 2 | ģ—†ģŒ | — | pt | 8ė¶„ / 10ė¶„ | 1 / 1 | CPU + DiT | INT8 | +| 6-8GB | ķ‹°ģ–“ 3 | 0.6B | 0.6B | pt | 8ė¶„ / 10ė¶„ | 1 / 2 | CPU + DiT | INT8 | +| 8-12GB | ķ‹°ģ–“ 4 | 0.6B | 0.6B | vllm | 8ė¶„ / 10ė¶„ | 2 / 4 | CPU + DiT | INT8 | +| 12-16GB | ķ‹°ģ–“ 5 | 0.6B, 1.7B | 1.7B | vllm | 8ė¶„ / 10ė¶„ | 2 / 4 | CPU | INT8 | +| 16-20GB | ķ‹°ģ–“ 6a | 0.6B, 1.7B | 1.7B | vllm | 8ė¶„ / 10ė¶„ | 4 / 8 | CPU | INT8 | +| 20-24GB | ķ‹°ģ–“ 6b | 0.6B, 1.7B, 4B | 1.7B | vllm | 8ė¶„ / 8ė¶„ | 4 / 8 | ģ—†ģŒ | ģ—†ģŒ | +| ≄24GB | ģ œķ•œ ģ—†ģŒ | 전첓 (0.6B, 1.7B, 4B) | 4B | vllm | 10ė¶„ / 10ė¶„ | 8 / 8 | ģ—†ģŒ | ģ—†ģŒ | + +### ģ—“ 설명 + +- **LM ėŖØėø**: 핓당 ķ‹°ģ–“ģ—ģ„œ ė”œė“œķ•  수 ģžˆėŠ” 5Hz ģ–øģ–“ ėŖØėø 크기 +- **ģ¶”ģ²œ LM**: UIģ—ģ„œ 핓당 티얓에 źø°ė³ø ģ„ ķƒė˜ėŠ” LM ėŖØėø +- **ė°±ģ—”ė“œ**: LM 추딠 ė°±ģ—”ė“œ (`vllm`ģ€ ģ¶©ė¶„ķ•œ VRAMģ„ 가진 NVIDIA GPU용, `pt`ėŠ” PyTorch ėŒ€ģ²“, `mlx`ėŠ” Apple Silicon용) +- **ģ˜¤ķ”„ė”œė“œ**: + - **CPU + DiT**: ėŖØė“  ėŖØėø(DiT, VAE, ķ…ģŠ¤ķŠø ģøģ½”ė”)ģ„ ėÆøģ‚¬ģš© ģ‹œ CPU딜 ģ˜¤ķ”„ė”œė“œ; DiTė„ 단계 ź°„ ģ˜¤ķ”„ė”œė“œ + - **CPU**: VAE와 ķ…ģŠ¤ķŠø ģøģ½”ė”ė„¼ CPU딜 ģ˜¤ķ”„ė”œė“œ; DiTėŠ” GPU에 ģœ ģ§€ + - **ģ—†ģŒ**: ėŖØė“  ėŖØėøģ„ GPU에 ģœ ģ§€ +- **ģ–‘ģžķ™”**: VRAM ģ‚¬ģš©ėŸ‰ģ„ ģ¤„ģ“źø° ģœ„ķ•“ 기본적으딜 INT8 ź°€ģ¤‘ģ¹˜ ģ–‘ģžķ™”ė„¼ ķ™œģ„±ķ™”ķ• ģ§€ 여부 + +## ģ ģ‘ķ˜• UI źø°ė³ø 설정 + +Gradio UIėŠ” ź°ģ§€ėœ GPU 티얓에 ė”°ė¼ ģžė™ģœ¼ė”œ ģ„¤ģ •ė©ė‹ˆė‹¤: + +- **LM ģ“ˆźø°ķ™” ģ²“ķ¬ė°•ģŠ¤**: LMģ„ ģ§€ģ›ķ•˜ėŠ” ķ‹°ģ–“(ķ‹°ģ–“ 3+)ģ—ģ„œ źø°ė³ø 첓크, ķ‹°ģ–“ 1-2ģ—ģ„œėŠ” 첓크 ķ•“ģ œ ė° ė¹„ķ™œģ„±ķ™” +- **LM ėŖØėø 경딜**: ķ‹°ģ–“ģ˜ ģ¶”ģ²œ ėŖØėøģ“ ģžė™ ģž…ė „; ė“œė”­ė‹¤ģš“ģ—ėŠ” ķ˜øķ™˜ 
ėŖØėøė§Œ ķ‘œģ‹œ +- **ė°±ģ—”ė“œ ė“œė”­ė‹¤ģš“**: ķ‹°ģ–“ 1-3ģ—ģ„œėŠ” `pt`/`mlx`딜 ģ œķ•œ(vllm KV ģŗģ‹œź°€ 메모리넼 ź³¼ė„ķ•˜ź²Œ ģ‚¬ģš©); ķ‹°ģ–“ 4+ģ—ģ„œėŠ” ėŖØė“  ė°±ģ—”ė“œ ģ‚¬ģš© ź°€ėŠ„ +- **CPU ģ˜¤ķ”„ė”œė“œ / DiT ģ˜¤ķ”„ė”œė“œ**: ė‚®ģ€ ķ‹°ģ–“ģ—ģ„œ źø°ė³ø ķ™œģ„±ķ™”, ė†’ģ€ ķ‹°ģ–“ģ—ģ„œ ė¹„ķ™œģ„±ķ™” +- **ģ–‘ģžķ™”**: ķ‹°ģ–“ 1-6aģ—ģ„œ źø°ė³ø ķ™œģ„±ķ™”, ķ‹°ģ–“ 6b+ģ—ģ„œ ė¹„ķ™œģ„±ķ™”(ģ¶©ė¶„ķ•œ VRAM) +- **ėŖØėø ģ»“ķŒŒģ¼**: ėŖØė“  ķ‹°ģ–“ģ—ģ„œ źø°ė³ø ķ™œģ„±ķ™”(ģ–‘ģžķ™”ģ— ķ•„ģš”) + +ķ˜øķ™˜ė˜ģ§€ ģ•ŠėŠ” ģ˜µģ…˜ģ„ ģˆ˜ė™ģœ¼ė”œ ģ„ ķƒķ•œ 경우(예: 6GB GPUģ—ģ„œ vllm ģ‚¬ģš© ģ‹œė„), ģ‹œģŠ¤ķ…œģ“ 경고넼 ķ‘œģ‹œķ•˜ź³  ķ˜øķ™˜ ź°€ėŠ„ķ•œ ģ„¤ģ •ģœ¼ė”œ ģžė™ ėŒ€ģ²“ķ•©ė‹ˆė‹¤. + +## ėŸ°ķƒ€ģž„ ģ•ˆģ „ 기늄 + +- **VRAM ź°€ė“œ**: 각 추딠 전에 VRAM ģš”źµ¬ ģ‚¬ķ•­ģ„ ģ¶”ģ •ķ•˜ź³  ķ•„ģš” ģ‹œ 배치 크기넼 ģžė™ ģ¶•ģ†Œ +- **ģ ģ‘ķ˜• VAE 디코딩**: 3단계 ėŒ€ģ²“: GPU ķƒ€ģ¼ 디코딩 → GPU 디코딩+CPU ģ˜¤ķ”„ė”œė“œ → 완전 CPU 디코딩 +- **ģžė™ 청크 크기**: VAE 디코딩 청크 크기가 ģ‚¬ģš© ź°€ėŠ„ķ•œ ģ—¬ģœ  VRAM에 ģ ģ‘(64/128/256/512/1024/1536) +- **źøøģ“/배치 ķ“ėžØķ•‘**: ķ‹°ģ–“ ģ œķ•œģ„ ģ“ˆź³¼ķ•˜ėŠ” ź°’ģ„ ģš”ģ²­ķ•˜ė©“ 경고와 ķ•Øź»˜ ģžė™ ģ”°ģ • ## ģ°øź³  사항 -- **źø°ė³ø 설정**ģ€ ź°ģ§€ėœ GPU 메모리에 ė”°ė¼ ģžė™ģœ¼ė”œ źµ¬ģ„±ė©ė‹ˆė‹¤. -- **LM ėŖØė“œ**ėŠ” Chain-of-Thought ģƒģ„± ė° ģ˜¤ė””ģ˜¤ ģ“ķ•“ģ— ģ‚¬ģš©ė˜ėŠ” ģ–øģ–“ ėŖØėøģ„ ģ˜ėÆøķ•©ė‹ˆė‹¤. -- ģµœģ ģ˜ ģ„±ėŠ„ģ„ ģœ„ķ•“ **Flash Attention**, **CPU Offload**, **Compile**, **Quantization**ģ“ 기본적으딜 ķ™œģ„±ķ™”ė©ė‹ˆė‹¤. -- GPU ģ œķ•œģ„ ģ“ˆź³¼ķ•˜ėŠ” źøøģ“ ė˜ėŠ” 배치 크기넼 ģš”ģ²­ķ•˜ė©“ 경고가 ķ‘œģ‹œė˜ź³  ź°’ģ“ ģ œķ•œė©ė‹ˆė‹¤. -- **ģ œģ•½ 디코딩 (Constrained Decoding)**: LMģ“ ģ“ˆźø°ķ™”ė˜ė©“ LMģ˜ źøøģ“ ģƒģ„±ė„ GPU ķ‹°ģ–“ģ˜ ģµœėŒ€ źøøģ“ ģ œķ•œģœ¼ė”œ ģ œģ•½ė˜ģ–“ CoT ģƒģ„± 중 메모리 부씱(OOM) ģ—ėŸ¬ė„¼ ė°©ģ§€ķ•©ė‹ˆė‹¤. -- VRAMģ“ 6GB ģ“ķ•˜ģø GPUģ˜ 경우, DiT ėŖØėøģ˜ 메모리 확볓넼 ģœ„ķ•“ LM ģ“ˆźø°ķ™”ź°€ 기본적으딜 ė¹„ķ™œģ„±ķ™”ė©ė‹ˆė‹¤. -- 명령줄 ģøģž(CLI) ė˜ėŠ” Gradio UI넼 통핓 ģ„¤ģ •ģ„ ģˆ˜ė™ģœ¼ė”œ ė¬“ģ‹œķ•  수 ģžˆģŠµė‹ˆė‹¤. +- **źø°ė³ø 설정**ģ€ ź°ģ§€ėœ GPU 메모리에 ė”°ė¼ ģžė™ģœ¼ė”œ źµ¬ģ„±ė©ė‹ˆė‹¤ +- **LM ėŖØė“œ**ėŠ” Chain-of-Thought ģƒģ„± ė° ģ˜¤ė””ģ˜¤ ģ“ķ•“ģ— ģ‚¬ģš©ė˜ėŠ” ģ–øģ–“ ėŖØėøģ„ ģ˜ėÆøķ•©ė‹ˆė‹¤ +- **Flash Attention**ģ€ ģžė™ ź°ģ§€ė˜ė©° ģ‚¬ģš© ź°€ėŠ„ķ•  ė•Œ ķ™œģ„±ķ™”ė©ė‹ˆė‹¤ +- **ģ œģ•½ 디코딩**: LMģ“ ģ“ˆźø°ķ™”ė˜ė©“ LMģ˜ źøøģ“ ģƒģ„±ė„ GPU ķ‹°ģ–“ģ˜ ģµœėŒ€ źøøģ“ ģ œķ•œģœ¼ė”œ ģ œģ•½ė˜ģ–“ CoT ģƒģ„± 중 OOM ģ—ėŸ¬ė„¼ ė°©ģ§€ķ•©ė‹ˆė‹¤ +- VRAMģ“ 6GB ģ“ķ•˜ģø GPU(ķ‹°ģ–“ 1-2)ģ˜ 경우, DiT ėŖØėøģ˜ 메모리 확볓넼 ģœ„ķ•“ LM ģ“ˆźø°ķ™”ź°€ 기본적으딜 ė¹„ķ™œģ„±ķ™”ė©ė‹ˆė‹¤ +- CLI ģøģž ė˜ėŠ” Gradio UI넼 통핓 ģ„¤ģ •ģ„ ģˆ˜ė™ģœ¼ė”œ ė¬“ģ‹œķ•  수 ģžˆģŠµė‹ˆė‹¤ -> **ģ»¤ė®¤ė‹ˆķ‹° źø°ģ—¬ ķ™˜ģ˜**: ģœ„ģ˜ GPU ķ‹°ģ–“ źµ¬ģ„±ģ€ ģ¼ė°˜ģ ģø ķ•˜ė“œģ›Øģ–“ģ—ģ„œģ˜ ķ…ŒģŠ¤ķŠøė„¼ ė°”ķƒ•ģœ¼ė”œ ķ•©ė‹ˆė‹¤. ģ‚¬ģš© ģ¤‘ģø ģž„ģ¹˜ģ˜ ģ‹¤ģ œ ģ„±ėŠ„ģ“ ģ“ ķŒŒė¼ėÆøķ„°ģ™€ 다넓다멓(예: ė” źø“ źøøģ“ė‚˜ ė” 큰 배치넼 ģ²˜ė¦¬ķ•  수 ģžˆėŠ” 경우), ė” ģ² ģ €ķ•œ ķ…ŒģŠ¤ķŠøė„¼ ģˆ˜ķ–‰ķ•˜ź³  `acestep/gpu_config.py`ģ—ģ„œ ģ“ėŸ¬ķ•œ źµ¬ģ„±ģ„ ģµœģ ķ™”ķ•˜źø° ģœ„ķ•œ PRģ„ ģ œģ¶œķ•“ ģ£¼ģ‹œźø° ė°”ėžė‹ˆė‹¤. ģ—¬ėŸ¬ė¶„ģ˜ 기여가 ėŖØė“  ģ‚¬ģš©ģžģ˜ ź²½ķ—˜ģ„ ź°œģ„ ķ•˜ėŠ” ė° ė„ģ›€ģ“ ė©ė‹ˆė‹¤! +> **ģ»¤ė®¤ė‹ˆķ‹° źø°ģ—¬ ķ™˜ģ˜**: ģœ„ģ˜ GPU ķ‹°ģ–“ źµ¬ģ„±ģ€ ģ¼ė°˜ģ ģø ķ•˜ė“œģ›Øģ–“ģ—ģ„œģ˜ ķ…ŒģŠ¤ķŠøė„¼ ė°”ķƒ•ģœ¼ė”œ ķ•©ė‹ˆė‹¤. ģ‚¬ģš© ģ¤‘ģø ģž„ģ¹˜ģ˜ ģ‹¤ģ œ ģ„±ėŠ„ģ“ ģ“ ķŒŒė¼ėÆøķ„°ģ™€ 다넓다멓, ė” ģ² ģ €ķ•œ ķ…ŒģŠ¤ķŠøė„¼ ģˆ˜ķ–‰ķ•˜ź³  `acestep/gpu_config.py`ģ—ģ„œ źµ¬ģ„±ģ„ ģµœģ ķ™”ķ•˜źø° ģœ„ķ•œ PRģ„ ģ œģ¶œķ•“ ģ£¼ģ‹œźø° ė°”ėžė‹ˆė‹¤. ## 메모리 ģµœģ ķ™” 팁 -1. **ė‚®ģ€ VRAM (8GB 미만)**: ģµœėŒ€ źøøģ“ė„¼ ķ™•ė³“ķ•˜ė ¤ė©“ LM ģ“ˆźø°ķ™” ģ—†ģ“ DiT ģ „ģš© ėŖØė“œė„¼ ģ‚¬ģš©ķ•˜ģ„øģš”. -2. 
**중간 VRAM (8-16GB)**: ķ’ˆģ§ˆź³¼ ė©”ėŖØė¦¬ģ˜ ģµœģ ģ˜ ź· ķ˜•ģ„ ģœ„ķ•“ 0.6B LM ėŖØėøģ„ ģ‚¬ģš©ķ•˜ģ„øģš”. -3. **ė†’ģ€ VRAM (16GB 쓈과)**: ė” ė‚˜ģ€ ģ˜¤ė””ģ˜¤ ģ“ķ•“ ė° ģƒģ„± ķ’ˆģ§ˆģ„ ģœ„ķ•“ ė” 큰 LM ėŖØėø(1.7B/4B)ģ„ ķ™œģ„±ķ™”ķ•˜ģ„øģš”. +1. **ģ“ˆģ € VRAM (≤6GB)**: LM ģ“ˆźø°ķ™” ģ—†ģ“ DiT ģ „ģš© ėŖØė“œė„¼ ģ‚¬ģš©. INT8 ģ–‘ģžķ™”ģ™€ 완전 CPU ģ˜¤ķ”„ė”œė“œź°€ ķ•„ģˆ˜. VAE ė””ģ½”ė”©ģ“ ģžė™ģœ¼ė”œ CPU딜 ėŒ€ģ²“ė  수 ģžˆģŠµė‹ˆė‹¤. +2. **ģ € VRAM (6-8GB)**: `pt` ė°±ģ—”ė“œė”œ 0.6B LM ėŖØėø ģ‚¬ģš© ź°€ėŠ„. ģ˜¤ķ”„ė”œė“œė„¼ ķ™œģ„± 상태딜 ģœ ģ§€ķ•˜ģ„øģš”. +3. **중간 VRAM (8-16GB)**: 0.6B ė˜ėŠ” 1.7B LM ėŖØėøģ„ ģ‚¬ģš©. ķ‹°ģ–“ 4+ģ—ģ„œ `vllm` ė°±ģ—”ė“œź°€ ģž˜ ģž‘ė™ķ•©ė‹ˆė‹¤. +4. **ė†’ģ€ VRAM (16-24GB)**: ė” 큰 LM ėŖØėø(1.7B ģ¶”ģ²œ)ģ„ ķ™œģ„±ķ™”. 20GB+ģ—ģ„œėŠ” ģ–‘ģžķ™”ź°€ ģ„ ķƒ ģ‚¬ķ•­ģ“ ė©ė‹ˆė‹¤. +5. **쓈고 VRAM (≄24GB)**: ėŖØė“  ėŖØėøģ“ ģ˜¤ķ”„ė”œė“œė‚˜ ģ–‘ģžķ™” ģ—†ģ“ ģž‘ė™. 최고 ķ’ˆģ§ˆģ„ ģœ„ķ•“ 4B LMģ„ ģ‚¬ģš©ķ•˜ģ„øģš”. ## 디버그 ėŖØė“œ: 다넸 GPU 구성 ģ‹œė®¬ė ˆģ“ģ…˜ @@ -40,17 +73,81 @@ ACE-Step 1.5ėŠ” GPUģ˜ ģ‚¬ģš© ź°€ėŠ„ķ•œ VRAM에 ģžė™ģœ¼ė”œ ģ ģ‘ķ•˜ģ—¬ ģƒģ„± # 4GB GPU ģ‹œė®¬ė ˆģ“ģ…˜ (ķ‹°ģ–“ 1) MAX_CUDA_VRAM=4 uv run acestep +# 6GB GPU ģ‹œė®¬ė ˆģ“ģ…˜ (ķ‹°ģ–“ 2) +MAX_CUDA_VRAM=6 uv run acestep + # 8GB GPU ģ‹œė®¬ė ˆģ“ģ…˜ (ķ‹°ģ–“ 4) MAX_CUDA_VRAM=8 uv run acestep # 12GB GPU ģ‹œė®¬ė ˆģ“ģ…˜ (ķ‹°ģ–“ 5) MAX_CUDA_VRAM=12 uv run acestep -# 16GB GPU ģ‹œė®¬ė ˆģ“ģ…˜ (ķ‹°ģ–“ 6) +# 16GB GPU ģ‹œė®¬ė ˆģ“ģ…˜ (ķ‹°ģ–“ 6a) MAX_CUDA_VRAM=16 uv run acestep ``` +`MAX_CUDA_VRAM`ģ„ ģ„¤ģ •ķ•˜ė©“ ģ‹œģŠ¤ķ…œģ€ `torch.cuda.set_per_process_memory_fraction()`ģ„ ķ˜øģ¶œķ•˜ģ—¬ VRAM ķ•˜ė“œ ģŗ”ģ„ ź°•ģ œķ•˜ė©°, 고사양 GPUģ—ģ„œė„ ķ˜„ģ‹¤ģ ģø ģ‹œė®¬ė ˆģ“ģ…˜ģ„ ģ œź³µķ•©ė‹ˆė‹¤. + +### ģžė™ ķ‹°ģ–“ ķ…ŒģŠ¤ķŠø + +UIģ—ģ„œ 각 티얓넼 ģˆ˜ė™ģœ¼ė”œ ķ…ŒģŠ¤ķŠøķ•˜ėŠ” ėŒ€ģ‹ , `profile_inference.py`ģ˜ `tier-test` ėŖØė“œė„¼ ģ‚¬ģš©ķ•  수 ģžˆģŠµė‹ˆė‹¤: + +```bash +# ėŖØė“  ķ‹°ģ–“ ģžė™ ķ…ŒģŠ¤ķŠø +python profile_inference.py --mode tier-test + +# ķŠ¹ģ • ķ‹°ģ–“ ķ…ŒģŠ¤ķŠø +python profile_inference.py --mode tier-test --tiers 6 8 16 + +# LM ķ™œģ„±ķ™”ķ•˜ģ—¬ ķ…ŒģŠ¤ķŠø (ģ§€ģ›ė˜ėŠ” ķ‹°ģ–“ģ—ģ„œ) +python profile_inference.py --mode tier-test --tier-with-lm + +# 빠넸 ķ…ŒģŠ¤ķŠø (ė¹„ģ–‘ģžķ™” ķ‹°ģ–“ģ—ģ„œ torch.compile ź±“ė„ˆė›°źø°) +python profile_inference.py --mode tier-test --tier-skip-compile +``` + +ķ”„ė”œķŒŒģ¼ė§ ė„źµ¬ģ˜ 전첓 ė¬øģ„œėŠ” [BENCHMARK.md](BENCHMARK.md)넼 ģ°øģ”°ķ•˜ģ„øģš”. + ģ“ėŠ” ė‹¤ģŒź³¼ ź°™ģ€ ź²½ģš°ģ— ģœ ģš©ķ•©ė‹ˆė‹¤: - 고사양 ķ•˜ė“œģ›Øģ–“ģ—ģ„œ GPU ķ‹°ģ–“ 구성 ķ…ŒģŠ¤ķŠø - 각 티얓에 ėŒ€ķ•“ 경고 ė° ģ œķ•œģ“ ģ˜¬ė°”ė„“ź²Œ ģž‘ė™ķ•˜ėŠ”ģ§€ ķ™•ģø -- PR 제출 ģ „ 새딜욓 GPU 구성 ķŒŒė¼ėÆøķ„° 개발 ė° ķ…ŒģŠ¤ķŠø +- `acestep/gpu_config.py` ģˆ˜ģ • 후 ģžė™ ķšŒź·€ ķ…ŒģŠ¤ķŠø +- CI/CD VRAM ķ˜øķ™˜ģ„± ź²€ģ¦ + +### 경계 ķ…ŒģŠ¤ķŠø (ģµœģ†Œ ķ‹°ģ–“ 찾기) + +`--tier-boundary`넼 ģ‚¬ģš©ķ•˜ė©“ INT8 ģ–‘ģžķ™”ģ™€ CPU ģ˜¤ķ”„ė”œė“œė„¼ ģ•ˆģ „ķ•˜ź²Œ ė¹„ķ™œģ„±ķ™”ķ•  수 ģžˆėŠ” ģµœģ†Œ VRAM 티얓넼 ģ‹¤ķ—˜ģ ģœ¼ė”œ ķ™•ģøķ•  수 ģžˆģŠµė‹ˆė‹¤. 각 티얓에 ėŒ€ķ•“ ģµœėŒ€ 3가지 źµ¬ģ„±ģœ¼ė”œ ķ…ŒģŠ¤ķŠøķ•©ė‹ˆė‹¤: + +1. **default** — ķ‹°ģ–“ģ˜ źø°ė³ø 설정 (ģ–‘ģžķ™” + ģ˜¤ķ”„ė”œė“œė„¼ źµ¬ģ„±ėŒ€ė”œ ģ‚¬ģš©) +2. **no-quant** — ģ˜¤ķ”„ė”œė“œ ģ„¤ģ •ģ€ ģœ ģ§€ķ•˜ė˜ ģ–‘ģžķ™” ė¹„ķ™œģ„±ķ™” +3. 
**no-offload** — ģ–‘ģžķ™” ģ—†ģŒ, CPU ģ˜¤ķ”„ė”œė“œ ģ—†ģŒ (ėŖØė“  ėŖØėøģ„ GPU에 ģœ ģ§€) + +```bash +# ėŖØė“  ķ‹°ģ–“ģ—ģ„œ 경계 ķ…ŒģŠ¤ķŠø 실행 +python profile_inference.py --mode tier-test --tier-boundary + +# ķŠ¹ģ • ķ‹°ģ–“ģ˜ 경계 ķ…ŒģŠ¤ķŠø +python profile_inference.py --mode tier-test --tier-boundary --tiers 8 12 16 20 24 + +# LM ķ™œģ„±ķ™”ėœ 경계 ķ…ŒģŠ¤ķŠø (ģ§€ģ›ė˜ėŠ” ķ‹°ģ–“ģ—ģ„œ) +python profile_inference.py --mode tier-test --tier-boundary --tier-with-lm + +# 결과넼 JSON으딜 ģ €ģž„ +python profile_inference.py --mode tier-test --tier-boundary --benchmark-output boundary_results.json +``` + +> **ģ°øź³ :** 경계 ķ…ŒģŠ¤ķŠø ź²°ź³¼ėŠ” ź²½ķ—˜ģ ģ“ė©°, DiT ėŖØėø ė³€ķ˜• (turbo vs base), LM ķ™œģ„±ķ™” 여부, ģƒģ„± ģ‹œź°„, flash attention ź°€ģš©ģ„±ģ— ė”°ė¼ ė‹¬ė¼ģ§ˆ 수 ģžˆģŠµė‹ˆė‹¤. + +### 배치 크기 경계 ķ…ŒģŠ¤ķŠø + +`--tier-batch-boundary`넼 ģ‚¬ģš©ķ•˜ģ—¬ 배치 크기 1, 2, 4, 8ģ„ ė‹Øź³„ģ ģœ¼ė”œ ķ…ŒģŠ¤ķŠøķ•˜ģ—¬ 각 ķ‹°ģ–“ģ˜ ģµœėŒ€ ģ•ˆģ „ 배치 크기넼 ģ°¾ģŠµė‹ˆė‹¤: + +```bash +# LM ķ™œģ„±ķ™” ģƒķƒœģ—ģ„œ 배치 경계 ķ…ŒģŠ¤ķŠø 실행 +python profile_inference.py --mode tier-test --tier-batch-boundary --tier-with-lm + +# ķŠ¹ģ • ķ‹°ģ–“ ķ…ŒģŠ¤ķŠø +python profile_inference.py --mode tier-test --tier-batch-boundary --tier-with-lm --tiers 8 12 16 24 +``` + +LM ģ‚¬ģš©/ėÆøģ‚¬ģš© 두 가지 źµ¬ģ„±ģ„ 모두 ķ…ŒģŠ¤ķŠøķ•˜ź³  각 ķ‹°ģ–“ģ˜ ģµœėŒ€ 성공 배치 크기넼 ė³“ź³ ķ•©ė‹ˆė‹¤. diff --git a/docs/ko/INFERENCE.md b/docs/ko/INFERENCE.md index 6ec98f31..47ab56ff 100644 --- a/docs/ko/INFERENCE.md +++ b/docs/ko/INFERENCE.md @@ -267,3 +267,4 @@ result = generate_music(dit_handler, llm_handler, params, config, save_dir="/out 2. **Turbo ėŖØėø ķ™œģš©**: 빠넸 반복 ģž‘ģ—…ģ—ėŠ” `turbo` ėŖØėøģ„ ģ‚¬ģš©ķ•˜ėŠ” ź²ƒģ“ ķšØģœØģ ģž…ė‹ˆė‹¤. 3. **Thinking ėŖØė“œ**: ė” ė…¼ė¦¬ģ ģø ģŒģ•… 구씰가 ķ•„ģš”ķ•  ė•Œ `thinking=True`넼 ģ‚¬ģš©ķ•˜ė˜, 메모리가 ė¶€ģ”±ķ•˜ė©“ 끌 수 ģžˆģŠµė‹ˆė‹¤. 4. **ź²°ź³¼ 반복**: 배치 크기넼 2-4딜 ģ„¤ģ •ķ•˜ģ—¬ ģ—¬ėŸ¬ ė²„ģ „ģ„ ķ•œ ė²ˆģ— 듣고 ģµœģ ģ˜ 결과넼 ź³ ė„“ėŠ” ź²ƒģ“ ģ¢‹ģŠµė‹ˆė‹¤. +5. **메모리 ꓀리**: ACE-Step 1.5ėŠ” ģžė™ VRAM ꓀리넼 ķ¬ķ•Øķ•©ė‹ˆė‹¤ — VRAM ź°€ė“œ(ģžė™ 배치 ģ¶•ģ†Œ), ģ ģ‘ķ˜• VAE 디코딩(CPU ėŒ€ģ²“), ģžė™ 청크 크기 ģ”°ģ •. OOMģ“ ė°œģƒķ•˜ė©“ ģ‹œģŠ¤ķ…œģ“ ģžė™ģœ¼ė”œ ģ²˜ė¦¬ķ•©ė‹ˆė‹¤. 각 VRAM ķ‹°ģ–“ģ˜ ź¶Œģž„ ģ„¤ģ •ģ€ [GPU_COMPATIBILITY.md](../ko/GPU_COMPATIBILITY.md)넼 ģ°øģ”°ķ•˜ģ„øģš”. \ No newline at end of file diff --git a/docs/zh/BENCHMARK.md b/docs/zh/BENCHMARK.md index a75a7ca6..dc7096e6 100644 --- a/docs/zh/BENCHMARK.md +++ b/docs/zh/BENCHMARK.md @@ -26,6 +26,7 @@ |------|------| | `profile` | åÆ¹å•ę¬”ē”Ÿęˆčæ›č”ŒčÆ¦ē»†ēš„č®”ę—¶åˆ†ęž | | `benchmark` | čæč”Œé…ē½®ēŸ©é˜µļ¼ˆę—¶é•æ Ɨ ę‰¹é‡ Ɨ ꀝ考 Ɨ ę­„ę•°ļ¼‰ļ¼Œč¾“å‡ŗę±‡ę€»č”Ø | +| `tier-test` | é€ščæ‡ `MAX_CUDA_VRAM` ęØ”ę‹ŸäøåŒę˜¾å­˜å¤§å°ļ¼Œč‡ŖåŠØęµ‹čÆ•ę‰€ęœ‰ GPU 等级 | | `understand` | åˆ†ęž `understand_music()` APIļ¼ˆéŸ³é¢‘ → å…ƒę•°ę®ęå–ļ¼‰ | | `create_sample` | åˆ†ęž `create_sample()` APIļ¼ˆēµę„Ÿ/ē®€å•ęØ”å¼ļ¼‰ | | `format_sample` | åˆ†ęž `format_sample()` APIļ¼ˆę ‡é¢˜+ę­ŒčÆ → ē»“ęž„åŒ–å…ƒę•°ę®ļ¼‰ | @@ -156,6 +157,84 @@ python profile_inference.py --mode create_sample --instrumental python profile_inference.py --mode format_sample ``` +### 6. 
`tier-test` — č‡ŖåŠØåŒ– GPU 等级测试 + +使用 `MAX_CUDA_VRAM` č‡ŖåŠØęØ”ę‹ŸäøåŒēš„ GPU ę˜¾å­˜å¤§å°ļ¼Œå¹¶åœØęÆäøŖē­‰ēŗ§čæč”Œē”Ÿęˆęµ‹čÆ•ć€‚čæ™ę˜Æäæ®ę”¹ `acestep/gpu_config.py` åŽéŖŒčÆę‰€ęœ‰ GPU ē­‰ēŗ§ę˜Æå¦ę­£åøøå·„ä½œēš„ęŽØčę–¹å¼ć€‚ + +```bash +# ęµ‹čÆ•ę‰€ęœ‰ē­‰ēŗ§ (4, 6, 8, 12, 16, 20, 24 GB) +python profile_inference.py --mode tier-test + +# ęµ‹čÆ•ē‰¹å®šę˜¾å­˜å¤§å° +python profile_inference.py --mode tier-test --tiers 6 8 16 + +# 启用 LM ęµ‹čÆ•ļ¼ˆåœØę”ÆęŒēš„ē­‰ēŗ§äøŠļ¼‰ +python profile_inference.py --mode tier-test --tier-with-lm + +# åæ«é€Ÿęµ‹čÆ•ļ¼šéžé‡åŒ–ē­‰ēŗ§č·³čæ‡ torch.compile +python profile_inference.py --mode tier-test --tier-skip-compile +``` + +**ęÆäøŖē­‰ēŗ§éŖŒčÆēš„å†…å®¹ļ¼š** +- ę­£ē”®ēš„ē­‰ēŗ§ę£€ęµ‹å’Œ `GPUConfig` ęž„å»ŗ +- ęØ”åž‹åˆå§‹åŒ–ļ¼ˆDiT态VAEć€ę–‡ęœ¬ē¼–ē å™Øļ¼ŒåÆé€‰ LM) +- ēŸ­ę—¶é—“ē”Ÿęˆļ¼ˆ30ē§’ę—¶é•æļ¼Œbatch=1)无 OOM 完成 +- 自适应 VAE č§£ē å›žé€€ļ¼ˆGPU → CPU åøč½½ → å®Œå…Ø CPU) +- ę˜¾å­˜ä½æē”ØäæęŒåœØęØ”ę‹Ÿé™åˆ¶å†… + +**č¾“å‡ŗē¤ŗä¾‹ļ¼š** + +``` +TIER TEST RESULTS +==================================================================================================== + VRAM Tier LM Duration Status Peak VRAM Notes + ────────────────────────────────────────────────────────────────────────────── + 4GB tier1 — 30s āœ… OK 3.8GB VAE 在 CPU 上解码 + 6GB tier2 — 30s āœ… OK 5.4GB åˆ†ē‰‡ VAE chunk=256 + 8GB tier4 0.6B 30s āœ… OK 7.2GB vllm åŽē«Æ + 12GB tier5 1.7B 30s āœ… OK 10.8GB vllm åŽē«Æ + 16GB tier6a 1.7B 30s āœ… OK 14.5GB åÆē”Øåøč½½ + 20GB tier6b 1.7B 30s āœ… OK 17.2GB ę— åøč½½ + 24GB unlimited 4B 30s āœ… OK 21.3GB ę‰€ęœ‰ęØ”åž‹åœØ GPU 上 +``` + +> **ę³Øę„**: `tier-test` ęØ”å¼ä½æē”Ø `torch.cuda.set_per_process_memory_fraction()` å¼ŗåˆ¶ę‰§č”Œę˜¾å­˜ē”¬äøŠé™ļ¼Œå³ä½æåœØé«˜ē«Æ GPUļ¼ˆå¦‚ A100 80GBļ¼‰äøŠä¹Ÿčƒ½å®žēŽ°ēœŸå®žēš„ęØ”ę‹Ÿć€‚ + +#### č¾¹ē•Œęµ‹čÆ• + +使用 `--tier-boundary` ęŸ„ę‰¾åÆä»„å®‰å…Øå…³é—­ INT8 量化和 CPU åøč½½ēš„ęœ€ä½Žę˜¾å­˜ē­‰ēŗ§ć€‚åÆ¹ęÆäøŖē­‰ēŗ§ęœ€å¤šęµ‹čÆ•äø‰ē§é…ē½®ļ¼š + +1. **default** — ē­‰ēŗ§ēš„ę ‡å‡†č®¾ē½® +2. **no-quant** — å…³é—­é‡åŒ–ļ¼Œåøč½½äøå˜ +3. 
**no-offload** — äøä½æē”Øé‡åŒ–ļ¼Œä¹Ÿäøä½æē”Ø CPU åøč½½ + +```bash +# åœØę‰€ęœ‰ē­‰ēŗ§čæč”Œč¾¹ē•Œęµ‹čÆ• +python profile_inference.py --mode tier-test --tier-boundary + +# 启用 LM ēš„č¾¹ē•Œęµ‹čÆ• +python profile_inference.py --mode tier-test --tier-boundary --tier-with-lm + +# å°†č¾¹ē•Œęµ‹čÆ•ē»“ęžœäæå­˜äøŗ JSON +python profile_inference.py --mode tier-test --tier-boundary --benchmark-output boundary_results.json +``` + +č¾“å‡ŗåŒ…å«äø€äøŖ **č¾¹ē•Œåˆ†ęž** ę‘˜č¦ļ¼Œę˜¾ē¤ŗęÆē§čƒ½åŠ›ēš„ęœ€ä½Žē­‰ēŗ§ć€‚ + +#### ę‰¹ę¬”å¤§å°č¾¹ē•Œęµ‹čÆ• + +使用 `--tier-batch-boundary` ęŸ„ę‰¾ęÆäøŖē­‰ēŗ§ēš„ęœ€å¤§å®‰å…Øę‰¹ę¬”å¤§å°ć€‚åÆ¹ęÆäøŖē­‰ēŗ§ļ¼Œå·„å…·ä¼šé€’čæ›ęµ‹čÆ•ę‰¹ę¬”å¤§å° 1态2态4态8ļ¼ˆåœØé¦–ę¬” OOM ę—¶åœę­¢ļ¼‰ļ¼ŒåŒę—¶ęµ‹čÆ•åÆē”Ø LM å’ŒęœŖåÆē”Ø LM ēš„é…ē½®ļ¼š + +```bash +# čæč”Œę‰¹ę¬”č¾¹ē•Œęµ‹čÆ• +python profile_inference.py --mode tier-test --tier-batch-boundary --tier-with-lm + +# ęµ‹čÆ•ē‰¹å®šē­‰ēŗ§ +python profile_inference.py --mode tier-test --tier-batch-boundary --tier-with-lm --tiers 8 12 16 24 +``` + +č¾“å‡ŗåŒ…å«äø€äøŖ **ę‰¹ę¬”č¾¹ē•Œę‘˜č¦**ļ¼Œę˜¾ē¤ŗęÆäøŖē­‰ēŗ§åœØęœ‰ LM å’Œę—  LM é…ē½®äø‹ēš„ęœ€å¤§ęˆåŠŸę‰¹ę¬”å¤§å°ć€‚ + --- ## å‘½ä»¤č”Œå‚ę•° @@ -209,12 +288,22 @@ python profile_inference.py --mode format_sample | å‚ę•° | é»˜č®¤å€¼ | čÆ“ę˜Ž | |------|--------|------| -| `--mode` | `profile` | ęØ”å¼ļ¼š`profile` / `benchmark` / `understand` / `create_sample` / `format_sample` | +| `--mode` | `profile` | ęØ”å¼ļ¼š`profile` / `benchmark` / `tier-test` / `understand` / `create_sample` / `format_sample` | | `--no-warmup` | 关闭 | č·³čæ‡é¢„ēƒ­ | | `--detailed` | 关闭 | 启用 `cProfile` å‡½ę•°ēŗ§åˆ†ęž | | `--llm-debug` | 关闭 | 深度 LLM č°ƒčÆ•ļ¼ˆtoken ę•°é‡ć€åžåé‡ļ¼‰ | | `--benchmark-output` | ꗠ | äæå­˜åŸŗå‡†ęµ‹čÆ•ē»“ęžœäøŗ JSON ꖇ件 | +### 等级测试选锹 + +| å‚ę•° | é»˜č®¤å€¼ | čÆ“ę˜Ž | +|------|--------|------| +| `--tiers` | `4 6 8 12 16 20 24` | č¦ęØ”ę‹Ÿēš„ę˜¾å­˜å¤§å°ļ¼ˆGB) | +| `--tier-with-lm` | 关闭 | åœØę”ÆęŒēš„ē­‰ēŗ§äøŠåÆē”Ø LM 初始化 | +| `--tier-skip-compile` | 关闭 | éžé‡åŒ–ē­‰ēŗ§č·³čæ‡ `torch.compile` ä»„åŠ é€Ÿčæ­ä»£ | +| `--tier-boundary` | 关闭 | åÆ¹ęÆäøŖē­‰ēŗ§ęµ‹čÆ• no-quant 和 no-offload å˜ä½“ļ¼ŒęŸ„ę‰¾ęœ€ä½Žčƒ½åŠ›č¾¹ē•Œ | +| `--tier-batch-boundary` | 关闭 | åÆ¹ęÆäøŖē­‰ēŗ§ęµ‹čÆ•ę‰¹ę¬”å¤§å° 1态2态4态8ļ¼ŒęŸ„ę‰¾ęœ€å¤§å®‰å…Øę‰¹ę¬”å¤§å° | + ### 输兄选锹 | å‚ę•° | é»˜č®¤å€¼ | čÆ“ę˜Ž | @@ -340,6 +429,10 @@ TIME COSTS BREAKDOWN 4. **使用代蔨性时长测试** — ēŸ­ę—¶é•æļ¼ˆ30s)仄 LLM č€—ę—¶äøŗäø»ļ¼›é•æę—¶é•æļ¼ˆ240s+)仄 DiT 耗时为主。 -5. **GPU ę˜¾å­˜č‡ŖåŠØé€‚é…** — benchmark ęØ”å¼ä¼šč‡ŖåŠØå°†ę—¶é•æå’Œę‰¹é‡å¤§å°č£å‰Ŗåˆ° GPU åÆå¤„ē†ēš„čŒƒå›“ć€‚ +5. **GPU ę˜¾å­˜č‡ŖåŠØé€‚é…** — benchmark ęØ”å¼ä¼šč‡ŖåŠØå°†ę—¶é•æå’Œę‰¹é‡å¤§å°č£å‰Ŗåˆ° GPU åÆå¤„ē†ēš„čŒƒå›“ļ¼Œä½æē”Ø `acestep/gpu_config.py` äø­ēš„č‡Ŗé€‚åŗ”ē­‰ēŗ§ē³»ē»Ÿć€‚ 6. **č°Øę…Žä½æē”Ø `--detailed`** — `cProfile` ä¼šå¢žåŠ å¼€é”€ļ¼›ä»…åœØéœ€č¦č°ƒęŸ„å‡½ę•°ēŗ§ē“¶é¢ˆę—¶ä½æē”Øć€‚ + +7. **使用 `tier-test` čæ›č”Œå›žå½’ęµ‹čÆ•** — 修改 GPU ē­‰ēŗ§é…ē½®åŽļ¼Œčæč”Œ `--mode tier-test` éŖŒčÆę‰€ęœ‰ē­‰ēŗ§ä»ē„¶ę­£åøøå·„ä½œć€‚čæ™åœØę›“ę”¹åøč½½é˜ˆå€¼ć€ę—¶é•æé™åˆ¶ęˆ– LM ęØ”åž‹åÆē”Øę€§ę—¶å°¤äøŗé‡č¦ć€‚ + +8. 
**ēœŸå®žęØ”ę‹Ÿä½Žę˜¾å­˜** — 使用 `MAX_CUDA_VRAM` ę—¶ļ¼Œē³»ē»Ÿé€ščæ‡ `set_per_process_memory_fraction()` å¼ŗåˆ¶ę‰§č”Œę˜¾å­˜ē”¬äøŠé™ļ¼Œå› ę­¤ęØ”ę‹ŸęœŸé—“ēš„ OOM é”™čÆÆåę˜ äŗ†ę¶ˆč“¹ēŗ§ GPU äøŠēš„ēœŸå®žč”Œäøŗć€‚ diff --git a/docs/zh/GPU_COMPATIBILITY.md b/docs/zh/GPU_COMPATIBILITY.md index bf71cf3c..cb448b18 100644 --- a/docs/zh/GPU_COMPATIBILITY.md +++ b/docs/zh/GPU_COMPATIBILITY.md @@ -1,36 +1,69 @@ # GPU å…¼å®¹ę€§ęŒ‡å— -ACE-Step 1.5 ä¼šč‡ŖåŠØé€‚é…ę‚Øēš„ GPU ę˜¾å­˜å¤§å°ļ¼Œē›øåŗ”č°ƒę•“ē”Ÿęˆę—¶é•æé™åˆ¶å’ŒåÆē”Øēš„ LM ęØ”åž‹ć€‚ē³»ē»ŸåœØåÆåŠØę—¶ę£€ęµ‹ GPU ę˜¾å­˜å¹¶č‡ŖåŠØé…ē½®ęœ€ä½³č®¾ē½®ć€‚ +ACE-Step 1.5 ä¼šč‡ŖåŠØé€‚é…ę‚Øēš„ GPU ę˜¾å­˜å¤§å°ļ¼Œē›øåŗ”č°ƒę•“ē”Ÿęˆę—¶é•æé™åˆ¶ć€åÆē”Øēš„ LM ęØ”åž‹ć€åøč½½ē­–ē•„å’Œ UI é»˜č®¤č®¾ē½®ć€‚ē³»ē»ŸåœØåÆåŠØę—¶ę£€ęµ‹ GPU ę˜¾å­˜å¹¶č‡ŖåŠØé…ē½®ęœ€ä½³č®¾ē½®ć€‚ ## GPU åˆ†ēŗ§é…ē½® -| 显存 | 等级 | LM ęØ”å¼ | ęœ€å¤§ę—¶é•æ | ęœ€å¤§ę‰¹ę¬” | LM ę˜¾å­˜åˆ†é… | -|------|------|---------|----------|----------|-------------| -| ≤4GB | Tier 1 | äøåÆē”Ø | 3 分钟 | 1 | - | -| 4-6GB | Tier 2 | äøåÆē”Ø | 6 分钟 | 1 | - | -| 6-8GB | Tier 3 | 0.6B (åÆé€‰) | ꜉ LM: 4 分钟 / ꗠ LM: 6 分钟 | ꜉ LM: 1 / ꗠ LM: 2 | 3GB | -| 8-12GB | Tier 4 | 0.6B (åÆé€‰) | ꜉ LM: 4 分钟 / ꗠ LM: 6 分钟 | ꜉ LM: 2 / ꗠ LM: 4 | 3GB | -| 12-16GB | Tier 5 | 0.6B / 1.7B | ꜉ LM: 4 分钟 / ꗠ LM: 6 分钟 | ꜉ LM: 2 / ꗠ LM: 4 | 0.6B: 3GB, 1.7B: 8GB | -| 16-24GB | Tier 6 | 0.6B / 1.7B / 4B | 8 分钟 | ꜉ LM: 4 / ꗠ LM: 8 | 0.6B: 3GB, 1.7B: 8GB, 4B: 12GB | -| ≄24GB | ę— é™åˆ¶ | ę‰€ęœ‰ęØ”åž‹ | 10 分钟 | 8 | ę— é™åˆ¶ | +| 显存 | 等级 | LM ęØ”åž‹ | ęŽØč LM | åŽē«Æ | ęœ€å¤§ę—¶é•æ (꜉LM / ꗠLM) | ęœ€å¤§ę‰¹ę¬” (꜉LM / ꗠLM) | åøč½½ē­–ē•„ | 量化 | +|------|------|---------|---------|------|------------------------|------------------------|----------|------| +| ≤4GB | Tier 1 | ꗠ | — | pt | 4分 / 6分 | 1 / 1 | CPU + DiT | INT8 | +| 4-6GB | Tier 2 | ꗠ | — | pt | 8分 / 10分 | 1 / 1 | CPU + DiT | INT8 | +| 6-8GB | Tier 3 | 0.6B | 0.6B | pt | 8分 / 10分 | 2 / 2 | CPU + DiT | INT8 | +| 8-12GB | Tier 4 | 0.6B | 0.6B | vllm | 8分 / 10分 | 2 / 4 | CPU + DiT | INT8 | +| 12-16GB | Tier 5 | 0.6B, 1.7B | 1.7B | vllm | 8分 / 10分 | 4 / 4 | CPU | INT8 | +| 16-20GB | Tier 6a | 0.6B, 1.7B | 1.7B | vllm | 8分 / 10分 | 4 / 8 | CPU | INT8 | +| 20-24GB | Tier 6b | 0.6B, 1.7B, 4B | 1.7B | vllm | 8分 / 8分 | 8 / 8 | ꗠ | ꗠ | +| ≄24GB | ę— é™åˆ¶ | å…ØéƒØ (0.6B, 1.7B, 4B) | 4B | vllm | 10分 / 10分 | 8 / 8 | ꗠ | ꗠ | + +### åˆ—čÆ“ę˜Ž + +- **LM ęØ”åž‹**: čÆ„ē­‰ēŗ§åÆä»„åŠ č½½ēš„ 5Hz čÆ­čØ€ęØ”åž‹å°ŗåÆø +- **ęŽØč LM**: UI äø­čÆ„ē­‰ēŗ§é»˜č®¤é€‰ę‹©ēš„ LM ęØ”åž‹ +- **åŽē«Æ**: LM ęŽØē†åŽē«Æļ¼ˆ`vllm` ē”ØäŗŽę˜¾å­˜å……č¶³ēš„ NVIDIA GPU,`pt` äøŗ PyTorch å›žé€€ę–¹ę”ˆļ¼Œ`mlx` ē”ØäŗŽ Apple Silicon) +- **åøč½½ē­–ē•„**: + - **CPU + DiT**: ę‰€ęœ‰ęØ”åž‹ļ¼ˆDiT态VAEć€ę–‡ęœ¬ē¼–ē å™Øļ¼‰äøä½æē”Øę—¶åøč½½åˆ° CPUļ¼›DiT ä¹ŸåœØę­„éŖ¤é—“åøč½½ + - **CPU**: VAE å’Œę–‡ęœ¬ē¼–ē å™Øåøč½½åˆ° CPUļ¼›DiT äæē•™åœØ GPU 上 + - **ꗠ**: ę‰€ęœ‰ęØ”åž‹äæē•™åœØ GPU 上 +- **量化**: ę˜Æå¦é»˜č®¤åÆē”Ø INT8 ęƒé‡é‡åŒ–ä»„å‡å°‘ę˜¾å­˜å ē”Ø + +## 自适应 UI 默认设置 + +Gradio UI ä¼šę ¹ę®ę£€ęµ‹åˆ°ēš„ GPU ē­‰ēŗ§č‡ŖåŠØé…ē½®ļ¼š + +- **LM åˆå§‹åŒ–å¤é€‰ę”†**: ę”ÆęŒ LM ēš„ē­‰ēŗ§ļ¼ˆTier 3+ļ¼‰é»˜č®¤å‹¾é€‰ļ¼ŒTier 1-2 é»˜č®¤äøå‹¾é€‰äø”ē¦ē”Ø +- **LM ęØ”åž‹č·Æå¾„**: č‡ŖåŠØå”«å……čÆ„ē­‰ēŗ§ęŽØčēš„ęØ”åž‹ļ¼›äø‹ę‹‰čœå•ä»…ę˜¾ē¤ŗå…¼å®¹ēš„ęØ”åž‹ +- **åŽē«Æäø‹ę‹‰čœå•**: Tier 1-3 é™åˆ¶äøŗ `pt`/`mlx`(vllm KV ē¼“å­˜å ē”Øčæ‡å¤§ļ¼‰ļ¼›Tier 4+ ę‰€ęœ‰åŽē«ÆåÆē”Ø +- **CPU åøč½½ / DiT åøč½½**: ä½Žē­‰ēŗ§é»˜č®¤åÆē”Øļ¼Œé«˜ē­‰ēŗ§é»˜č®¤ē¦ē”Ø +- **量化**: Tier 1-6a é»˜č®¤åÆē”Øļ¼ŒTier 6b+ é»˜č®¤ē¦ē”Øļ¼ˆę˜¾å­˜å……č¶³ļ¼‰ +- **ęØ”åž‹ē¼–čÆ‘**: 
ę‰€ęœ‰ē­‰ēŗ§é»˜č®¤åÆē”Øļ¼ˆé‡åŒ–éœ€č¦ļ¼‰ + +å¦‚ęžœę‚Øę‰‹åŠØé€‰ę‹©äŗ†äøå…¼å®¹ēš„é€‰é”¹ļ¼ˆä¾‹å¦‚åœØ 6GB GPU äøŠä½æē”Ø vllmļ¼‰ļ¼Œē³»ē»Ÿä¼šå‘å‡ŗč­¦å‘Šå¹¶č‡ŖåŠØå›žé€€åˆ°å…¼å®¹é…ē½®ć€‚ + +## čæč”Œę—¶å®‰å…Øē‰¹ę€§ + +- **ę˜¾å­˜å®ˆå«**: ęÆę¬”ęŽØē†å‰ļ¼Œē³»ē»Ÿä¼šä¼°ē®—ę˜¾å­˜éœ€ę±‚ļ¼Œåæ…č¦ę—¶č‡ŖåŠØå‡å°ę‰¹ę¬”å¤§å° +- **自适应 VAE 解码**: äø‰ēŗ§å›žé€€ęœŗåˆ¶ļ¼šGPU åˆ†ē‰‡č§£ē  → GPU 解码+ē»“ęžœåøč½½åˆ° CPU → å®Œå…Ø CPU 解码 +- **č‡ŖåŠØåˆ†ē‰‡å¤§å°**: VAE č§£ē åˆ†ē‰‡å¤§å°ę ¹ę®åÆē”Øē©ŗé—²ę˜¾å­˜č‡Ŗé€‚åŗ”č°ƒę•“ļ¼ˆ64/128/256/512/1024/1536) +- **时长/批欔裁剪**: å¦‚ęžœčÆ·ę±‚ēš„å€¼č¶…å‡ŗē­‰ēŗ§é™åˆ¶ļ¼Œä¼šč‡ŖåŠØč£å‰Ŗå¹¶ę˜¾ē¤ŗč­¦å‘Š ## čÆ“ę˜Ž - **默认设置** ä¼šę ¹ę®ę£€ęµ‹åˆ°ēš„ GPU ę˜¾å­˜č‡ŖåŠØé…ē½® - **LM ęØ”å¼** ęŒ‡ē”ØäŗŽę€ē»“é“¾ (Chain-of-Thought) ē”Ÿęˆå’ŒéŸ³é¢‘ē†č§£ēš„čÆ­čØ€ęØ”åž‹ -- **Flash Attention**态**CPU Offload**态**Compile** 和 **Quantization** é»˜č®¤åÆē”Øä»„čŽ·å¾—ęœ€ä½³ę€§čƒ½ -- å¦‚ęžœę‚ØčÆ·ę±‚ēš„ę—¶é•æęˆ–ę‰¹ę¬”å¤§å°č¶…å‡ŗ GPU é™åˆ¶ļ¼Œē³»ē»Ÿä¼šę˜¾ē¤ŗč­¦å‘Šå¹¶č‡ŖåŠØč°ƒę•“åˆ°å…č®øēš„ęœ€å¤§å€¼ +- **Flash Attention** ä¼šč‡ŖåŠØę£€ęµ‹å¹¶åœØåÆē”Øę—¶åÆē”Ø - **ēŗ¦ęŸč§£ē **: 当 LM åˆå§‹åŒ–åŽļ¼ŒLM ē”Ÿęˆēš„ę—¶é•æä¹Ÿä¼šč¢«ēŗ¦ęŸåœØ GPU ē­‰ēŗ§ēš„ęœ€å¤§ę—¶é•æé™åˆ¶å†…ļ¼Œé˜²ę­¢åœØ CoT ē”Ÿęˆę—¶å‡ŗēŽ°ę˜¾å­˜äøč¶³é”™čÆÆ -- åÆ¹äŗŽę˜¾å­˜ ≤6GB ēš„ GPUļ¼Œé»˜č®¤ē¦ē”Ø LM åˆå§‹åŒ–ä»„äæē•™ę˜¾å­˜ē»™ DiT ęØ”åž‹ +- åÆ¹äŗŽę˜¾å­˜ ≤6GB ēš„ GPU(Tier 1-2ļ¼‰ļ¼Œé»˜č®¤ē¦ē”Ø LM åˆå§‹åŒ–ä»„äæē•™ę˜¾å­˜ē»™ DiT ęØ”åž‹ - ę‚ØåÆä»„é€ščæ‡å‘½ä»¤č”Œå‚ę•°ęˆ– Gradio UI ę‰‹åŠØč¦†ē›–č®¾ē½® > **ę¬¢čæŽē¤¾åŒŗč“”ēŒ®**: 仄上 GPU åˆ†ēŗ§é…ē½®åŸŗäŗŽęˆ‘ä»¬åœØåøøč§ē”¬ä»¶äøŠēš„ęµ‹čÆ•ć€‚å¦‚ęžœę‚Øå‘ēŽ°ę‚Øēš„č®¾å¤‡å®žé™…ę€§čƒ½äøŽčæ™äŗ›å‚ę•°äøē¬¦ļ¼ˆä¾‹å¦‚ļ¼ŒåÆä»„å¤„ē†ę›“é•æēš„ę—¶é•æęˆ–ę›“å¤§ēš„ę‰¹ę¬”ļ¼‰ļ¼Œę¬¢čæŽę‚Øčæ›č”Œę›“å……åˆ†ēš„ęµ‹čÆ•ļ¼Œå¹¶ęäŗ¤ PR ę„ä¼˜åŒ– `acestep/gpu_config.py` äø­ēš„é…ē½®ć€‚ę‚Øēš„č“”ēŒ®å°†åø®åŠ©ę”¹å–„ę‰€ęœ‰ē”Øęˆ·ēš„ä½“éŖŒļ¼ ## ę˜¾å­˜ä¼˜åŒ–å»ŗč®® -1. **ä½Žę˜¾å­˜ (<8GB)**: 使用纯 DiT ęØ”å¼ļ¼Œäøåˆå§‹åŒ– LMļ¼Œä»„čŽ·å¾—ęœ€å¤§ę—¶é•æ -2. **äø­ē­‰ę˜¾å­˜ (8-16GB)**: 使用 0.6B LM ęØ”åž‹ļ¼ŒåœØč“Øé‡å’Œę˜¾å­˜ä¹‹é—“å–å¾—ęœ€ä½³å¹³č”” -3. **高显存 (>16GB)**: åÆē”Øę›“å¤§ēš„ LM ęØ”åž‹ (1.7B/4B) ä»„čŽ·å¾—ę›“å„½ēš„éŸ³é¢‘ē†č§£å’Œē”Ÿęˆč“Øé‡ +1. **ęžä½Žę˜¾å­˜ (≤6GB)**: 使用纯 DiT ęØ”å¼ļ¼Œäøåˆå§‹åŒ– LM怂INT8 é‡åŒ–å’Œå®Œå…Ø CPU åøč½½ę˜Æåæ…é”»ēš„ć€‚VAE č§£ē åÆčƒ½ä¼šč‡ŖåŠØå›žé€€åˆ° CPU怂 +2. **ä½Žę˜¾å­˜ (6-8GB)**: åÆä½æē”Ø 0.6B LM ęØ”åž‹ļ¼Œé…åˆ `pt` åŽē«Æć€‚äæęŒåøč½½åÆē”Øć€‚ +3. **äø­ē­‰ę˜¾å­˜ (8-16GB)**: 使用 0.6B ꈖ 1.7B LM ęØ”åž‹ć€‚Tier 4+ 上 `vllm` åŽē«Æč”ØēŽ°č‰Æå„½ć€‚ +4. **高显存 (16-24GB)**: åÆē”Øę›“å¤§ēš„ LM ęØ”åž‹ļ¼ˆęŽØč 1.7B)。20GB+ é‡åŒ–å˜äøŗåÆé€‰ć€‚ +5. 
**č¶…é«˜ę˜¾å­˜ (≄24GB)**: ę‰€ęœ‰ęØ”åž‹ę— éœ€åøč½½ęˆ–é‡åŒ–å³åÆčæč”Œć€‚ä½æē”Ø 4B LM čŽ·å¾—ęœ€ä½³č“Øé‡ć€‚ ## č°ƒčÆ•ęØ”å¼ļ¼šęØ”ę‹ŸäøåŒēš„ GPU é…ē½® @@ -40,17 +73,93 @@ ACE-Step 1.5 ä¼šč‡ŖåŠØé€‚é…ę‚Øēš„ GPU ę˜¾å­˜å¤§å°ļ¼Œē›øåŗ”č°ƒę•“ē”Ÿęˆę—¶é•æ # ęØ”ę‹Ÿ 4GB GPU (Tier 1) MAX_CUDA_VRAM=4 uv run acestep +# ęØ”ę‹Ÿ 6GB GPU (Tier 2) +MAX_CUDA_VRAM=6 uv run acestep + # ęØ”ę‹Ÿ 8GB GPU (Tier 4) MAX_CUDA_VRAM=8 uv run acestep # ęØ”ę‹Ÿ 12GB GPU (Tier 5) MAX_CUDA_VRAM=12 uv run acestep -# ęØ”ę‹Ÿ 16GB GPU (Tier 6) +# ęØ”ę‹Ÿ 16GB GPU (Tier 6a) MAX_CUDA_VRAM=16 uv run acestep ``` +设置 `MAX_CUDA_VRAM` ę—¶ļ¼Œē³»ē»Ÿčæ˜ä¼šč°ƒē”Ø `torch.cuda.set_per_process_memory_fraction()` ę„å¼ŗåˆ¶ę‰§č”Œę˜¾å­˜ē”¬äøŠé™ļ¼Œå³ä½æåœØé«˜ē«Æ GPU äøŠä¹Ÿčƒ½å®žēŽ°ēœŸå®žēš„ęØ”ę‹Ÿć€‚ + +### č‡ŖåŠØåŒ–åˆ†ēŗ§ęµ‹čÆ• + +ę— éœ€é€ščæ‡ UI ę‰‹åŠØęµ‹čÆ•ęÆäøŖē­‰ēŗ§ļ¼ŒåÆä»„ä½æē”Ø `profile_inference.py` ēš„ `tier-test` ęØ”å¼ļ¼š + +```bash +# č‡ŖåŠØęµ‹čÆ•ę‰€ęœ‰ē­‰ēŗ§ +python profile_inference.py --mode tier-test + +# ęµ‹čÆ•ē‰¹å®šē­‰ēŗ§ +python profile_inference.py --mode tier-test --tiers 6 8 16 + +# 测试时启用 LMļ¼ˆåœØę”ÆęŒēš„ē­‰ēŗ§äøŠļ¼‰ +python profile_inference.py --mode tier-test --tier-with-lm + +# åæ«é€Ÿęµ‹čÆ•ļ¼ˆéžé‡åŒ–ē­‰ēŗ§č·³čæ‡ torch.compile) +python profile_inference.py --mode tier-test --tier-skip-compile +``` + +详见 [BENCHMARK.md](BENCHMARK.md) čŽ·å–ę€§čƒ½åˆ†ęžå·„å…·ēš„å®Œę•“ę–‡ę”£ć€‚ + é€‚ē”Øåœŗę™Æļ¼š - åœØé«˜ē«Æē”¬ä»¶äøŠęµ‹čÆ• GPU åˆ†ēŗ§é…ē½® - éŖŒčÆå„ē­‰ēŗ§ēš„č­¦å‘Šå’Œé™åˆ¶ę˜Æå¦ę­£åøøå·„ä½œ -- åœØęäŗ¤ PR ä¹‹å‰å¼€å‘å’Œęµ‹čÆ•ę–°ēš„ GPU é…ē½®å‚ę•° +- 修改 `acestep/gpu_config.py` åŽēš„č‡ŖåŠØåŒ–å›žå½’ęµ‹čÆ• +- CI/CD ę˜¾å­˜å…¼å®¹ę€§éŖŒčÆ + +### č¾¹ē•Œęµ‹čÆ•ļ¼ˆęŸ„ę‰¾ęœ€ä½Žē­‰ēŗ§ļ¼‰ + +使用 `--tier-boundary` åÆä»„é€ščæ‡å®žé™…čæč”Œę„ē”®å®šä»Žå“ŖäøŖę˜¾å­˜ē­‰ēŗ§å¼€å§‹åÆä»„å®‰å…Øåœ°å…³é—­ INT8 量化和 CPU åøč½½ć€‚åÆ¹äŗŽęÆäøŖē­‰ēŗ§ļ¼Œęœ€å¤ščæč”Œäø‰ē§é…ē½®ļ¼š + +1. **default** — čÆ„ē­‰ēŗ§ēš„é»˜č®¤č®¾ē½®ļ¼ˆęŒ‰é…ē½®ä½æē”Øé‡åŒ– + åøč½½ļ¼‰ +2. **no-quant** — äæęŒåøč½½č®¾ē½®äøå˜ļ¼Œä½†å…³é—­é‡åŒ– +3. 
**no-offload** — äøä½æē”Øé‡åŒ–ļ¼Œä¹Ÿäøä½æē”Ø CPU åøč½½ļ¼ˆę‰€ęœ‰ęØ”åž‹äæē•™åœØ GPU äøŠļ¼‰ + +```bash +# åœØę‰€ęœ‰ē­‰ēŗ§äøŠčæč”Œč¾¹ē•Œęµ‹čÆ• +python profile_inference.py --mode tier-test --tier-boundary + +# ęµ‹čÆ•ē‰¹å®šē­‰ēŗ§ēš„č¾¹ē•Œ +python profile_inference.py --mode tier-test --tier-boundary --tiers 8 12 16 20 24 + +# 启用 LM ēš„č¾¹ē•Œęµ‹čÆ•ļ¼ˆåœØę”ÆęŒēš„ē­‰ēŗ§äøŠļ¼‰ +python profile_inference.py --mode tier-test --tier-boundary --tier-with-lm + +# å°†ē»“ęžœäæå­˜äøŗ JSON ä»„ä¾æčæ›äø€ę­„åˆ†ęž +python profile_inference.py --mode tier-test --tier-boundary --benchmark-output boundary_results.json +``` + +č¾“å‡ŗåŒ…å«äø€äøŖ **č¾¹ē•Œåˆ†ęž** éƒØåˆ†ļ¼Œę˜¾ē¤ŗęÆē§čƒ½åŠ›ēš„ęœ€ä½Žē­‰ēŗ§ļ¼š + +``` +BOUNDARY ANALYSIS +================= + Capability Min Tier VRAM + ------------------------------------------------------------ + No INT8 Quantization tier6b 20GB + No CPU Offload (all models on GPU) tier6b 20GB + ------------------------------------------------------------ +``` + +> **ę³Øę„ļ¼š** č¾¹ē•Œęµ‹čÆ•ē»“ęžœę˜Æē»éŖŒę€§ēš„ļ¼ŒåÆčƒ½å›  DiT ęØ”åž‹å˜ä½“ļ¼ˆturbo vs baseļ¼‰ć€ę˜Æå¦åÆē”Ø LMć€ē”Ÿęˆę—¶é•æå’Œ flash attention åÆē”Øę€§č€Œęœ‰ę‰€äøåŒć€‚ę¬¢čæŽē¤¾åŒŗč“”ēŒ®ę„å®Œå–„čæ™äŗ›č¾¹ē•Œå€¼ļ¼ + +### ę‰¹ę¬”å¤§å°č¾¹ē•Œęµ‹čÆ• + +使用 `--tier-batch-boundary` é€ščæ‡é€’čæ›ęµ‹čÆ•ę‰¹ę¬”å¤§å° 1态2态4态8 ę„ęŸ„ę‰¾ęÆäøŖē­‰ēŗ§ēš„ęœ€å¤§å®‰å…Øę‰¹ę¬”å¤§å°ļ¼š + +```bash +# čæč”ŒåÆē”Ø LM ēš„ę‰¹ę¬”č¾¹ē•Œęµ‹čÆ• +python profile_inference.py --mode tier-test --tier-batch-boundary --tier-with-lm + +# ęµ‹čÆ•ē‰¹å®šē­‰ēŗ§ +python profile_inference.py --mode tier-test --tier-batch-boundary --tier-with-lm --tiers 8 12 16 24 +``` + +čÆ„ęµ‹čÆ•åŒę—¶ęµ‹čÆ•ęœ‰ LM å’Œę—  LM ēš„é…ē½®ļ¼Œå¹¶ęŠ„å‘ŠęÆäøŖē­‰ēŗ§ēš„ęœ€å¤§ęˆåŠŸę‰¹ę¬”å¤§å°ć€‚ diff --git a/docs/zh/GRADIO_GUIDE.md b/docs/zh/GRADIO_GUIDE.md index 87b39725..5d5821fe 100644 --- a/docs/zh/GRADIO_GUIDE.md +++ b/docs/zh/GRADIO_GUIDE.md @@ -62,17 +62,23 @@ Gradio ē•Œé¢åŒ…å«ä»„äø‹äø»č¦éƒØåˆ†ļ¼š | 设置 | čÆ“ę˜Ž | |---------|-------------| -| **5Hz LM ęØ”åž‹č·Æå¾„** | é€‰ę‹©čÆ­čØ€ęØ”åž‹ļ¼ˆä¾‹å¦‚ `acestep-5Hz-lm-0.6B`态`acestep-5Hz-lm-1.7B`)| -| **5Hz LM åŽē«Æ** | `vllm`ļ¼ˆę›“åæ«ļ¼ŒęŽØčļ¼‰ęˆ– `pt`(PyTorchļ¼Œå…¼å®¹ę€§ę›“å„½ļ¼‰| -| **初始化 5Hz LM** | å‹¾é€‰ä»„åœØåˆå§‹åŒ–ęœŸé—“åŠ č½½ LM(thinking ęØ”å¼åæ…éœ€ļ¼‰| +| **5Hz LM ęØ”åž‹č·Æå¾„** | é€‰ę‹©čÆ­čØ€ęØ”åž‹ć€‚**åÆē”ØęØ”åž‹ę ¹ę® GPU ē­‰ēŗ§č‡ŖåŠØčæ‡ę»¤** — ä¾‹å¦‚ļ¼Œ6-8GB GPU ä»…ę˜¾ē¤ŗ 0.6Bļ¼Œč€Œ 24GB+ GPU ę˜¾ē¤ŗę‰€ęœ‰å°ŗåÆøļ¼ˆ0.6B态1.7B态4B)。| +| **5Hz LM åŽē«Æ** | `vllm`ļ¼ˆę›“åæ«ļ¼ŒęŽØčę˜¾å­˜ ≄8GB ēš„ NVIDIA GPU)、`pt`(PyTorchļ¼Œé€šē”Øå›žé€€ę–¹ę”ˆļ¼‰ęˆ– `mlx`(Apple Silicon)。**显存 <8GB ēš„ GPU é™åˆ¶äøŗ `pt`/`mlx`**ļ¼Œå› äøŗ vllm ēš„ KV ē¼“å­˜å ē”Øčæ‡å¤§ć€‚| +| **初始化 5Hz LM** | å‹¾é€‰ä»„åœØåˆå§‹åŒ–ęœŸé—“åŠ č½½ LM(thinking ęØ”å¼åæ…éœ€ļ¼‰ć€‚**显存 ≤6GB ēš„ GPU(Tier 1-2ļ¼‰é»˜č®¤äøå‹¾é€‰äø”ē¦ē”Øć€‚**| + +> **č‡Ŗé€‚åŗ”é»˜č®¤č®¾ē½®**: ꉀ꜉ LM č®¾ē½®ę ¹ę® GPU ę˜¾å­˜ē­‰ēŗ§č‡ŖåŠØé…ē½®ć€‚ęŽØčēš„ LM ęØ”åž‹ć€åŽē«Æå’Œåˆå§‹åŒ–ēŠ¶ę€å·²é¢„č®¾äøŗęœ€ä½³ę€§čƒ½ć€‚ę‚ØåÆä»„ę‰‹åŠØč¦†ē›–ļ¼Œä½†å¦‚ęžœé€‰ę‹©äøŽ GPU äøå…¼å®¹ļ¼Œē³»ē»Ÿä¼šå‘å‡ŗč­¦å‘Šć€‚ ### ę€§čƒ½é€‰é”¹ | 设置 | čÆ“ę˜Ž | |---------|-------------| | **使用 Flash Attention** | åÆē”Øä»„åŠ é€ŸęŽØē†ļ¼ˆéœ€č¦ flash_attn åŒ…ļ¼‰| -| **åøč½½åˆ° CPU** | ē©ŗé—²ę—¶å°†ęØ”åž‹åøč½½åˆ° CPU ä»„čŠ‚ēœ GPU 内存 | -| **将 DiT åøč½½åˆ° CPU** | äø“é—Øå°† DiT ęØ”åž‹åøč½½åˆ° CPU | +| **åøč½½åˆ° CPU** | ē©ŗé—²ę—¶å°†ęØ”åž‹åøč½½åˆ° CPU ä»„čŠ‚ēœ GPU ę˜¾å­˜ć€‚**显存 <20GB ēš„ GPU é»˜č®¤č‡ŖåŠØåÆē”Øć€‚**| +| **将 DiT åøč½½åˆ° CPU** | äø“é—Øå°† DiT ęØ”åž‹åøč½½åˆ° CPU怂**显存 <12GB ēš„ GPU 
é»˜č®¤č‡ŖåŠØåÆē”Øć€‚**| +| **INT8 量化** | 使用 INT8 ęƒé‡é‡åŒ–å‡å°‘ęØ”åž‹ę˜¾å­˜å ē”Øć€‚**显存 <20GB ēš„ GPU é»˜č®¤č‡ŖåŠØåÆē”Øć€‚**| +| **ęØ”åž‹ē¼–čÆ‘** | 启用 `torch.compile` ä¼˜åŒ–ęŽØē†ć€‚**ę‰€ęœ‰ē­‰ēŗ§é»˜č®¤åÆē”Ø**ļ¼ˆé‡åŒ–ęæ€ę“»ę—¶åæ…éœ€ļ¼‰ć€‚| + +> **ē­‰ēŗ§ę„ŸēŸ„č®¾ē½®**: åøč½½ć€é‡åŒ–å’Œē¼–čÆ‘é€‰é”¹ę ¹ę® GPU ē­‰ēŗ§č‡ŖåŠØč®¾ē½®ć€‚čÆ¦č§ [GPU_COMPATIBILITY.md](GPU_COMPATIBILITY.md) äŗ†č§£å®Œę•“ēš„ē­‰ēŗ§č”Øć€‚ ### LoRA é€‚é…å™Ø @@ -87,7 +93,12 @@ Gradio ē•Œé¢åŒ…å«ä»„äø‹äø»č¦éƒØåˆ†ļ¼š ### 初始化 -点击 **åˆå§‹åŒ–ęœåŠ”** åŠ č½½ęØ”åž‹ć€‚ēŠ¶ę€ę”†å°†ę˜¾ē¤ŗčæ›åŗ¦å’Œē”®č®¤äæ”ęÆć€‚ +点击 **åˆå§‹åŒ–ęœåŠ”** åŠ č½½ęØ”åž‹ć€‚ēŠ¶ę€ę”†å°†ę˜¾ē¤ŗčæ›åŗ¦å’Œē”®č®¤äæ”ęÆļ¼ŒåŒ…ę‹¬ļ¼š +- ę£€ęµ‹åˆ°ēš„ GPU ē­‰ēŗ§å’Œę˜¾å­˜ +- ęœ€å¤§å…č®øę—¶é•æå’Œę‰¹ę¬”å¤§å°ļ¼ˆę ¹ę®ę˜Æå¦åˆå§‹åŒ–äŗ† LM åŠØę€č°ƒę•“ļ¼‰ +- ä»»ä½•äøå…¼å®¹č®¾ē½®č¢«č‡ŖåŠØäæ®ę­£ēš„č­¦å‘Š + +åˆå§‹åŒ–åŽļ¼Œ**éŸ³é¢‘ę—¶é•æ** 和 **ę‰¹é‡å¤§å°** ę»‘å—ä¼šč‡ŖåŠØę›“ę–°ä»„åę˜ ē­‰ēŗ§é™åˆ¶ć€‚ --- @@ -515,15 +526,19 @@ LoRA č®­ē»ƒé€‰é”¹å”ęä¾›åˆ›å»ŗč‡Ŗå®šä¹‰ LoRA é€‚é…å™Øēš„å·„å…·ć€‚ - å°čÆ•äøåŒēš„ē§å­ - 使 caption 曓具体 -**å†…å­˜äøč¶³ļ¼š** -- å‡å°‘ę‰¹é‡å¤§å° -- 启用 CPU åøč½½ +**ę˜¾å­˜äøč¶³ (OOM):** +- ē³»ē»ŸåŒ…å«č‡ŖåŠØę˜¾å­˜ē®”ē†ļ¼ˆę˜¾å­˜å®ˆå«ć€č‡Ŗé€‚åŗ” VAE č§£ē ć€č‡ŖåŠØę‰¹ę¬”å‡å°ļ¼‰ć€‚å¦‚ęžœä»ē„¶ OOM: +- ę‰‹åŠØå‡å°‘ę‰¹é‡å¤§å° +- 启用 CPU åøč½½ļ¼ˆę˜¾å­˜ <20GB åŗ”å·²č‡ŖåŠØåÆē”Øļ¼‰ +- 启用 INT8 é‡åŒ–ļ¼ˆę˜¾å­˜ <20GB åŗ”å·²č‡ŖåŠØåÆē”Øļ¼‰ - 减少 LM ę‰¹å¤„ē†å—å¤§å° +- 详见 [GPU_COMPATIBILITY.md](GPU_COMPATIBILITY.md) äŗ†č§£å„ē­‰ēŗ§ęŽØčč®¾ē½® **LM äøå·„ä½œļ¼š** -- ē”®äæåˆå§‹åŒ–ęœŸé—“å‹¾é€‰äŗ†"初始化 5Hz LM" -- ę£€ęŸ„ę˜Æå¦é€‰ę‹©äŗ†ęœ‰ę•ˆēš„ LM ęØ”åž‹č·Æå¾„ -- 验证 vllm ꈖ PyTorch åŽē«ÆåÆē”Ø +- ē”®äæåˆå§‹åŒ–ęœŸé—“å‹¾é€‰äŗ†"初始化 5Hz LM"(显存 ≤6GB ēš„ GPU é»˜č®¤ē¦ē”Øļ¼‰ +- ę£€ęŸ„ę˜Æå¦é€‰ę‹©äŗ†ęœ‰ę•ˆēš„ LM ęØ”åž‹č·Æå¾„ļ¼ˆä»…ę˜¾ē¤ŗäøŽē­‰ēŗ§å…¼å®¹ēš„ęØ”åž‹ļ¼‰ +- 验证 vllm ꈖ PyTorch åŽē«ÆåÆē”Øļ¼ˆę˜¾å­˜ <8GB é™åˆ¶ä½æē”Ø vllm) +- å¦‚ęžœ LM å¤é€‰ę”†ē°č‰²äøåÆē”Øļ¼ŒčÆ“ę˜Žę‚Øēš„ GPU ē­‰ēŗ§äøę”ÆęŒ LM — 请使用纯 DiT ęØ”å¼ --- diff --git a/docs/zh/INFERENCE.md b/docs/zh/INFERENCE.md index 0242354c..dc80d402 100644 --- a/docs/zh/INFERENCE.md +++ b/docs/zh/INFERENCE.md @@ -975,13 +975,21 @@ else: # ... å¤„ē†éŸ³é¢‘ę–‡ä»¶ ``` -### 7. å†…å­˜ē®”ē† +### 7. 
ę˜¾å­˜ē®”ē† -åÆ¹äŗŽå¤§ę‰¹é‡å¤§å°ęˆ–é•æę—¶é•æļ¼š -- ē›‘ęŽ§ GPU å†…å­˜ä½æē”Ø -- å¦‚ęžœå‡ŗēŽ° OOM é”™čÆÆļ¼Œå‡å°‘ `batch_size` -- 减少 `lm_batch_chunk_size` ē”ØäŗŽ LM ę“ä½œ -- č€ƒč™‘åœØåˆå§‹åŒ–ęœŸé—“ä½æē”Ø `offload_to_cpu=True` +ACE-Step 1.5 åŒ…å«č‡ŖåŠØę˜¾å­˜ē®”ē†ļ¼ŒåÆé€‚åŗ”ę‚Øēš„ GPU: + +- **č‡ŖåŠØē­‰ēŗ§ę£€ęµ‹**: ē³»ē»Ÿę£€ęµ‹åÆē”Øę˜¾å­˜å¹¶é€‰ę‹©ęœ€ä½³č®¾ē½®ļ¼ˆčÆ¦č§ [GPU_COMPATIBILITY.md](../zh/GPU_COMPATIBILITY.md)) +- **ę˜¾å­˜å®ˆå«**: ęÆę¬”ęŽØē†å‰ļ¼Œē³»ē»Ÿä¼°ē®—ę˜¾å­˜éœ€ę±‚ļ¼Œåæ…č¦ę—¶č‡ŖåŠØå‡å° `batch_size` +- **自适应 VAE 解码**: äø‰ēŗ§å›žé€€ — GPU åˆ†ē‰‡č§£ē  → GPU 解码+CPU åøč½½ → å®Œå…Ø CPU 解码 +- **č‡ŖåŠØåˆ†ē‰‡å¤§å°**: VAE č§£ē åˆ†ē‰‡å¤§å°ę ¹ę®ē©ŗé—²ę˜¾å­˜č‡Ŗé€‚åŗ”č°ƒę•“ļ¼ˆ64/128/256/512/1024/1536) +- **时长/批欔裁剪**: č¶…å‡ŗē­‰ēŗ§é™åˆ¶ēš„å€¼ä¼šč‡ŖåŠØč£å‰Ŗå¹¶ę˜¾ē¤ŗč­¦å‘Š + +ę‰‹åŠØč°ƒä¼˜ļ¼š +- å¦‚ęžœä»ē„¶å‡ŗēŽ° OOM é”™čÆÆļ¼Œå‡å°‘ `batch_size` +- ä½Žę˜¾å­˜ GPU äøŠå‡å°‘ `lm_batch_chunk_size` ē”ØäŗŽ LM ę“ä½œ +- 显存 <20GB 时启用 `offload_to_cpu=True` +- 显存 <20GB 时启用 `quantization="int8_weight_only"` --- @@ -989,8 +997,8 @@ else: ### åøøč§é—®é¢˜ -**问题**ļ¼šå†…å­˜äøč¶³é”™čÆÆ -- **č§£å†³ę–¹ę”ˆ**ļ¼šå‡å°‘ `batch_size`态`inference_steps`ļ¼Œęˆ–åÆē”Ø CPU åøč½½ +**问题**ļ¼šę˜¾å­˜äøč¶³ (OOM) 错误 +- **č§£å†³ę–¹ę”ˆ**ļ¼šē³»ē»Ÿåŗ”é€ščæ‡ę˜¾å­˜å®ˆå«ļ¼ˆč‡ŖåŠØå‡å°ę‰¹ę¬”ļ¼‰å’Œč‡Ŗé€‚åŗ” VAE 解码(CPU å›žé€€ļ¼‰č‡ŖåŠØå¤„ē†å¤§å¤šę•° OOM åœŗę™Æć€‚å¦‚ęžœä»ē„¶å‡ŗēŽ° OOMļ¼šå‡å°‘ `batch_size`ć€å‡å°‘ `inference_steps`、启用 CPU åøč½½ļ¼ˆ`offload_to_cpu=True`ļ¼‰ęˆ–åÆē”Ø INT8 é‡åŒ–ć€‚čÆ¦č§ [GPU_COMPATIBILITY.md](../zh/GPU_COMPATIBILITY.md) äŗ†č§£å„ę˜¾å­˜ē­‰ēŗ§ēš„ęŽØčč®¾ē½®ć€‚ **问题**ļ¼šē»“ęžœč“Øé‡å·® - **č§£å†³ę–¹ę”ˆ**ļ¼šå¢žåŠ  `inference_steps`ļ¼Œč°ƒę•“ `guidance_scale`ļ¼Œä½æē”Ø base ęØ”åž‹ diff --git a/docs/zh/INSTALL.md b/docs/zh/INSTALL.md index 5efb736c..d0502897 100644 --- a/docs/zh/INSTALL.md +++ b/docs/zh/INSTALL.md @@ -468,7 +468,7 @@ ACESTEP_INIT_LLM=false | `--init_llm` | auto | LLM åˆå§‹åŒ–ļ¼š`true` / `false` / ēœē•„äøŗč‡ŖåŠØ | | `--config_path` | auto | DiT ęØ”åž‹ļ¼ˆå¦‚ `acestep-v15-turbo`) | | `--lm_model_path` | auto | LM ęØ”åž‹ļ¼ˆå¦‚ `acestep-5Hz-lm-1.7B`) | -| `--offload_to_cpu` | auto | CPU åøč½½ļ¼ˆę˜¾å­˜ < 16GB ę—¶č‡ŖåŠØåÆē”Øļ¼‰ | +| `--offload_to_cpu` | auto | CPU åøč½½ļ¼ˆę˜¾å­˜ < 20GB ę—¶č‡ŖåŠØåÆē”Øļ¼‰ | | `--download-source` | auto | ęØ”åž‹ęŗļ¼š`auto` / `huggingface` / `modelscope` | | `--enable-api` | false | åŒę—¶åÆē”Ø REST API 端点 | @@ -529,16 +529,17 @@ huggingface-cli download ACE-Step/acestep-5Hz-lm-4B --local-dir ./checkpoints/ac ## šŸ’” å¦‚ä½•é€‰ę‹©ęØ”åž‹ļ¼Ÿ -ACE-Step ä¼šč‡ŖåŠØé€‚é…ä½ ēš„ GPU 显存: +ACE-Step ä¼šč‡ŖåŠØé€‚é…ä½ ēš„ GPU ę˜¾å­˜ć€‚UI ä¼šę ¹ę®ę£€ęµ‹åˆ°ēš„ GPU ē­‰ēŗ§é¢„é…ē½®ę‰€ęœ‰č®¾ē½®ļ¼ˆLM ęØ”åž‹ć€åŽē«Æć€åøč½½ć€é‡åŒ–ļ¼‰ļ¼š -| GPU 显存 | ęŽØč LM ęØ”åž‹ | čÆ“ę˜Ž | -|----------|--------------|------| -| **≤6GB** | ę— ļ¼ˆä»… DiT) | é»˜č®¤ē¦ē”Ø LM ä»„čŠ‚ēœę˜¾å­˜ | -| **6-12GB** | `acestep-5Hz-lm-0.6B` | č½»é‡ļ¼Œå¹³č””ę€§å„½ | -| **12-16GB** | `acestep-5Hz-lm-1.7B` | ę›“å„½ēš„č“Øé‡ | -| **≄16GB** | `acestep-5Hz-lm-4B` | ęœ€ä½³č“Øé‡å’ŒéŸ³é¢‘ē†č§£čƒ½åŠ› | +| GPU 显存 | ęŽØč LM ęØ”åž‹ | åŽē«Æ | čÆ“ę˜Ž | +|----------|--------------|------|------| +| **≤6GB** | ę— ļ¼ˆä»… DiT) | — | é»˜č®¤ē¦ē”Ø LMļ¼›INT8 量化 + å®Œå…Ø CPU åøč½½ | +| **6-8GB** | `acestep-5Hz-lm-0.6B` | `pt` | č½»é‡ LM,PyTorch åŽē«Æ | +| **8-16GB** | `0.6B` / `1.7B` | `vllm` | 8-12GB 用 0.6B,12-16GB 用 1.7B | +| **16-24GB** | `acestep-5Hz-lm-1.7B` | `vllm` | 20GB+ åÆē”Ø 4Bļ¼›20GB+ ę— éœ€åøč½½ | +| **≄24GB** | `acestep-5Hz-lm-4B` | `vllm` | ęœ€ä½³č“Øé‡ļ¼Œę‰€ęœ‰ęØ”åž‹ę— éœ€åøč½½ | -> 
šŸ“– 详细 GPU å…¼å®¹ę€§äæ”ęÆļ¼ˆę—¶é•æé™åˆ¶ć€ę‰¹é‡å¤§å°ć€å†…å­˜ä¼˜åŒ–ļ¼‰ļ¼ŒčÆ·å‚é˜… [GPU å…¼å®¹ę€§ęŒ‡å—](GPU_COMPATIBILITY.md)怂 +> šŸ“– 详细 GPU å…¼å®¹ę€§äæ”ęÆļ¼ˆē­‰ēŗ§č”Øć€ę—¶é•æé™åˆ¶ć€ę‰¹é‡å¤§å°ć€č‡Ŗé€‚åŗ” UI é»˜č®¤č®¾ē½®ć€ę˜¾å­˜ä¼˜åŒ–ļ¼‰ļ¼ŒčÆ·å‚é˜… [GPU å…¼å®¹ę€§ęŒ‡å—](GPU_COMPATIBILITY.md)怂 --- diff --git a/profile_inference.py b/profile_inference.py index 387f9dc1..12467c0b 100644 --- a/profile_inference.py +++ b/profile_inference.py @@ -8,6 +8,7 @@ Modes: profile - Profile a single generation run with detailed timing breakdown benchmark - Run a matrix of configurations and produce a summary table + tier-test - Auto-test across simulated GPU tiers (4/6/8/12/16/24/48 GB) understand - Profile the understand_music() API (audio codes -> metadata) create_sample - Profile the create_sample() API (inspiration/simple mode) format_sample - Profile the format_sample() API (caption+lyrics -> metadata) @@ -22,6 +23,15 @@ # Benchmark across configurations python profile_inference.py --mode benchmark + # Test all GPU tiers automatically (the key feature!) + python profile_inference.py --mode tier-test + + # Test specific tiers only + python profile_inference.py --mode tier-test --tiers 6 8 16 + + # Test tiers with LM enabled (where supported) + python profile_inference.py --mode tier-test --tier-with-lm + # Profile create_sample (inspiration mode) python profile_inference.py --mode create_sample --sample-query "a soft Bengali love song" @@ -38,6 +48,7 @@ import os import json import tempfile +import traceback from contextlib import contextmanager from collections import defaultdict from typing import Tuple, Dict, Any, List, Optional @@ -60,7 +71,15 @@ ) from acestep.handler import AceStepHandler from acestep.llm_inference import LLMHandler -from acestep.gpu_config import get_gpu_config, set_global_gpu_config +from acestep.gpu_config import ( + get_gpu_config, + set_global_gpu_config, + get_gpu_tier, + find_best_lm_model_on_disk, + is_lm_model_size_allowed, + GPUConfig, + VRAM_AUTO_OFFLOAD_THRESHOLD_GB, +) # ============================================================================= @@ -125,12 +144,12 @@ def load_env_config() -> Dict[str, str]: class PreciseTimer: """High-precision timer with GPU synchronization for accurate timing.""" - + def __init__(self, device: str = "cpu"): self.device = device self.timings: Dict[str, List[float]] = defaultdict(list) self.enabled = True - + def sync(self): """Synchronize GPU operations for accurate timing.""" if not self.enabled: @@ -139,10 +158,10 @@ def sync(self): torch.cuda.synchronize() elif self.device == "mps" and hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): if hasattr(torch, "mps"): - torch.mps.synchronize() + torch.mps.synchronize() elif self.device.startswith("xpu") and hasattr(torch, "xpu"): torch.xpu.synchronize() - + @contextmanager def time(self, name: str): """Time a code section with GPU synchronization.""" @@ -157,17 +176,17 @@ def time(self, name: str): self.sync() elapsed = time.perf_counter() - start self.timings[name].append(elapsed) - + def get_total(self, name: str) -> float: return sum(self.timings.get(name, [])) - + def get_mean(self, name: str) -> float: times = self.timings.get(name, []) return sum(times) / len(times) if times else 0.0 - + def get_count(self, name: str) -> int: return len(self.timings.get(name, [])) - + def reset(self): self.timings.clear() @@ -247,10 +266,10 @@ def print_time_costs_breakdown( time_costs: Dict[str, float], total_wall_time: float ): """Print a detailed timing 
breakdown from result.extra_outputs['time_costs'].""" - print("\n" + "=" * 100) + print("\n" + "=" * 100) print("PROFILING RESULTS") - print("=" * 100) - + print("=" * 100) + if not time_costs: print("\n (No time_costs data available from the pipeline)") print(f"\n Total wall time: {total_wall_time:.3f}s") @@ -341,10 +360,10 @@ def print_time_costs_breakdown( print(f"\n{'TOTAL WALL TIME':<50} {total_wall_time:<12.3f} {'100.0%':>6}") # Performance insights - print("\n" + "=" * 100) + print("\n" + "=" * 100) print("PERFORMANCE INSIGHTS") - print("=" * 100) - + print("=" * 100) + if lm_total > 0 and dit_total > 0: if lm_total > dit_total * 2: print( @@ -394,7 +413,7 @@ def print_result_summary(result: GenerationResult, mode: str = "profile"): if silent_count: print(f" ({silent_count} silent)", end="") print() - else: + else: print(f"\n FAILED: {result.error}") @@ -485,7 +504,6 @@ def run_profile_mode(dit_handler, llm_handler, args, timer: PreciseTimer): prof = None if args.detailed: import cProfile - prof = cProfile.Profile() prof.enable() @@ -659,7 +677,7 @@ def run_benchmark_mode(dit_handler, llm_handler, args, timer: PreciseTimer): f" {status} | wall={wall_time:.1f}s, " f"lm={entry['lm_time']:.1f}s, dit={entry['dit_time']:.1f}s" ) - + # Print summary table print("\n" + "=" * 120) print("BENCHMARK SUMMARY") @@ -695,6 +713,808 @@ def run_benchmark_mode(dit_handler, llm_handler, args, timer: PreciseTimer): return results +# ============================================================================= +# Mode: tier-test (THE KEY FEATURE) +# ============================================================================= + + +def _get_vram_info_str() -> str: + """Get current VRAM usage string for logging.""" + if not torch.cuda.is_available(): + return "N/A" + allocated = torch.cuda.memory_allocated() / (1024 ** 3) + reserved = torch.cuda.memory_reserved() / (1024 ** 3) + return f"alloc={allocated:.2f}GB, reserved={reserved:.2f}GB" + + +def _run_single_tier_test( + sim_gb: float, + gpu_config: GPUConfig, + args, + example_data: Dict, + checkpoint_dir: str, + disk_lm_models: List[str], + *, + offload_override: Optional[bool] = None, + offload_dit_override: Optional[bool] = None, + quantization_override: Optional[str] = "USE_DEFAULT", + test_variant: str = "default", + batch_size_override: Optional[int] = None, + use_lm_override: Optional[bool] = None, +) -> Dict[str, Any]: + """ + Run a single tier test with the given configuration. + + Args: + sim_gb: Simulated VRAM in GB + gpu_config: GPU configuration for this tier + args: CLI arguments + example_data: Example JSON data for generation + checkpoint_dir: Path to checkpoints directory + disk_lm_models: List of LM models found on disk + offload_override: If not None, override offload_to_cpu setting + offload_dit_override: If not None, override offload_dit_to_cpu setting + quantization_override: If not "USE_DEFAULT", override quantization setting + (None means no quantization, "int8_weight_only" etc.) 
+ test_variant: Label for this test variant ("default", "no-quant", "no-offload") + batch_size_override: If not None, override batch size (used by batch boundary tests) + use_lm_override: If not None, force LM on (True) or off (False) + + Returns: + Result dictionary for this test + """ + tier = gpu_config.tier + + # Determine test configuration + if use_lm_override is not None: + use_lm = use_lm_override and gpu_config.init_lm_default and bool(gpu_config.available_lm_models) + else: + use_lm = args.tier_with_lm and gpu_config.init_lm_default and bool(gpu_config.available_lm_models) + + if offload_override is not None: + offload = offload_override + else: + offload = gpu_config.offload_to_cpu_default + + if offload_dit_override is not None: + offload_dit = offload_dit_override + else: + offload_dit = gpu_config.offload_dit_to_cpu_default + + if quantization_override != "USE_DEFAULT": + quantization = quantization_override + else: + quantization = "int8_weight_only" if gpu_config.quantization_default else None + + # Find LM model on disk + lm_model = None + lm_backend = gpu_config.recommended_backend + if use_lm: + lm_model = find_best_lm_model_on_disk( + gpu_config.recommended_lm_model, disk_lm_models + ) + if not lm_model: + print(f" āš ļø No compatible LM model on disk for tier {tier}, skipping LM") + use_lm = False + + # Clamp duration to tier limit + test_duration = args.tier_duration + max_dur = gpu_config.max_duration_with_lm if use_lm else gpu_config.max_duration_without_lm + if test_duration > max_dur: + test_duration = max_dur + print(f" Duration clamped to {test_duration}s (tier limit)") + + batch_size = batch_size_override if batch_size_override is not None else 1 + + print(f"\n Test config [{test_variant}]: duration={test_duration}s, batch={batch_size}, LM={use_lm}") + if use_lm: + print(f" LM model: {lm_model}, backend: {lm_backend}") + print(f" offload={offload}, offload_dit={offload_dit}, quant={quantization}") + + # Enforce VRAM cap + if torch.cuda.is_available(): + total_bytes = torch.cuda.get_device_properties(0).total_memory + total_gb = total_bytes / (1024 ** 3) + if sim_gb < total_gb: + reference_context_gb = 0.5 + allocator_budget_gb = max(0.5, sim_gb - reference_context_gb) + fraction = max(0.01, min(1.0, allocator_budget_gb / total_gb)) + torch.cuda.set_per_process_memory_fraction(fraction) + + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + + # Initialize result entry + result_entry = { + "tier_gb": sim_gb, + "tier": tier, + "test_variant": test_variant, + "use_lm": use_lm, + "lm_model": lm_model, + "lm_backend": lm_backend, + "offload": offload, + "offload_dit": offload_dit, + "quantization": quantization, + "duration": test_duration, + "batch_size": batch_size, + "init_success": False, + "gen_success": False, + "wall_time": 0.0, + "error": None, + "peak_vram_gb": 0.0, + } + + dit_handler = None + llm_handler = None + + try: + print(f"\n Initializing DiT handler... 
({_get_vram_info_str()})") + dit_handler = AceStepHandler() + + # Determine flash attention availability + use_flash_attention = False + try: + import flash_attn # noqa: F401 + use_flash_attention = True + except ImportError: + pass + + # compile_model must be True when quantization is used; + # --tier-skip-compile can skip it for non-quantized tiers to save time + if quantization: + compile_model = True + elif args.tier_skip_compile: + compile_model = False + else: + compile_model = gpu_config.compile_model_default + + status_dit, success_dit = dit_handler.initialize_service( + project_root=PROJECT_ROOT, + config_path=args.config_path, + device="auto", + use_flash_attention=use_flash_attention, + compile_model=compile_model, + offload_to_cpu=offload, + offload_dit_to_cpu=offload_dit, + quantization=quantization, + ) + + if not success_dit: + result_entry["error"] = f"DiT init failed: {status_dit}" + print(f" āŒ DiT init failed: {status_dit}") + _cleanup_handlers(dit_handler, None) + return result_entry + + print(f" āœ… DiT ready ({_get_vram_info_str()})") + + llm_handler = LLMHandler() + + if use_lm: + print(f" Initializing LLM handler (backend={lm_backend})... ({_get_vram_info_str()})") + status_llm, success_llm = llm_handler.initialize( + checkpoint_dir=checkpoint_dir, + lm_model_path=lm_model, + backend=lm_backend, + device="auto", + offload_to_cpu=offload, + dtype=None, + ) + if success_llm: + print(f" āœ… LLM ready ({_get_vram_info_str()})") + else: + print(f" āš ļø LLM init failed: {status_llm}") + use_lm = False + result_entry["use_lm"] = False + result_entry["error"] = f"LM init failed (non-fatal): {status_llm}" + + result_entry["init_success"] = True + + except torch.cuda.OutOfMemoryError as e: + result_entry["error"] = f"Init OOM: {e}" + print(f" āŒ Init OOM: {e}") + _cleanup_handlers(dit_handler, llm_handler) + return result_entry + except Exception as e: + result_entry["error"] = f"Init exception: {e}" + print(f" āŒ Init exception: {e}") + traceback.print_exc() + _cleanup_handlers(dit_handler, llm_handler) + return result_entry + + # Run generation + try: + print(f"\n Running generation... ({_get_vram_info_str()})") + save_dir = tempfile.mkdtemp(prefix=f"acestep_tier{int(sim_gb)}_{test_variant}_") + + params = GenerationParams( + caption=example_data.get("caption", ""), + lyrics=example_data.get("lyrics", ""), + bpm=example_data.get("bpm"), + keyscale=example_data.get("keyscale", ""), + timesignature=example_data.get("timesignature", ""), + vocal_language=example_data.get("language", "unknown"), + duration=test_duration, + thinking=use_lm, + use_cot_metas=use_lm, + use_cot_caption=False, + use_cot_language=False, + use_constrained_decoding=True, + inference_steps=8, + seed=42, + lm_temperature=0.85, + lm_cfg_scale=2.0, + guidance_scale=7.0, + ) + config = GenerationConfig( + batch_size=batch_size, + seeds=[42 + j for j in range(batch_size)], + use_random_seed=False, + audio_format="flac", + ) + + # When testing batch boundaries, temporarily override the GPU tier config's + # max_batch limits so that inference.py's clamping doesn't reduce our test + # batch size. We restore the original values after the test. 
+ _patched_tier_config = False + _orig_batch_with_lm = None + _orig_batch_without_lm = None + if batch_size_override is not None and batch_size_override > 1: + from acestep.gpu_config import GPU_TIER_CONFIGS as _tier_configs + tier = gpu_config.tier + if tier in _tier_configs: + _patched_tier_config = True + _orig_batch_with_lm = _tier_configs[tier]["max_batch_size_with_lm"] + _orig_batch_without_lm = _tier_configs[tier]["max_batch_size_without_lm"] + _tier_configs[tier]["max_batch_size_with_lm"] = max(batch_size_override, _orig_batch_with_lm) + _tier_configs[tier]["max_batch_size_without_lm"] = max(batch_size_override, _orig_batch_without_lm) + + t0 = time.perf_counter() + try: + result = generate_music( + dit_handler, llm_handler, params, config, save_dir=save_dir + ) + finally: + # Restore original tier config values + if _patched_tier_config: + _tier_configs[tier]["max_batch_size_with_lm"] = _orig_batch_with_lm + _tier_configs[tier]["max_batch_size_without_lm"] = _orig_batch_without_lm + wall_time = time.perf_counter() - t0 + + result_entry["wall_time"] = wall_time + result_entry["gen_success"] = result.success + + if result.success: + tc = result.extra_outputs.get("time_costs", {}) + result_entry["lm_time"] = tc.get("lm_total_time", 0.0) + result_entry["dit_time"] = tc.get("dit_total_time_cost", 0.0) + result_entry["vae_time"] = tc.get("dit_vae_decode_time_cost", 0.0) + n_audios = len(result.audios) + print(f" āœ… [{test_variant}] Generation OK: {n_audios} audio(s) in {wall_time:.1f}s") + else: + result_entry["error"] = result.error + print(f" āŒ [{test_variant}] Generation FAILED: {result.error}") + + _cleanup_dir(save_dir) + + except torch.cuda.OutOfMemoryError as e: + result_entry["error"] = f"OOM: {e}" + print(f" āŒ [{test_variant}] OOM ERROR: {e}") + except Exception as e: + result_entry["error"] = f"Generation exception: {e}" + print(f" āŒ [{test_variant}] Exception: {e}") + traceback.print_exc() + + # Record peak VRAM + if torch.cuda.is_available(): + peak_bytes = torch.cuda.max_memory_allocated() + result_entry["peak_vram_gb"] = peak_bytes / (1024 ** 3) + print(f" Peak VRAM: {result_entry['peak_vram_gb']:.2f}GB") + + # Cleanup + _cleanup_handlers(dit_handler, llm_handler) + + return result_entry + + +def run_tier_test_mode(args): + """ + Automatically test inference across multiple simulated GPU tiers. + + For each tier: + 1. Set MAX_CUDA_VRAM to simulate the VRAM limit + 2. Initialize gpu_config for that tier + 3. Initialize DiT + (optionally) LLM handlers with tier-appropriate settings + 4. Run a short generation and verify it completes without OOM + 5. 
Report results + + When --tier-boundary is enabled, each tier is tested with up to 3 configurations: + - default: tier's default settings (quantization + offload as configured) + - no-quant: same as default but with quantization disabled + - no-offload: no quantization AND no CPU offload (all models on GPU) + + This replaces the manual workflow of: + MAX_CUDA_VRAM=8 uv run acestep → click UI → wait → check + """ + # Determine which tiers to test + default_tiers = [4, 6, 8, 12, 16, 24, 48] + tiers_to_test = args.tiers if args.tiers else default_tiers + + # Load example for generation + example_file = os.path.join( + PROJECT_ROOT, "examples", "text2music", args.example + ) + if not os.path.exists(example_file): + print(f"\n Example not found: {example_file}") + sys.exit(1) + + with open(example_file, "r", encoding="utf-8") as f: + example_data = json.load(f) + + # Scan available LM models on disk + checkpoint_dir = os.path.join(PROJECT_ROOT, "checkpoints") + disk_lm_models = [] + if os.path.exists(checkpoint_dir): + for item in sorted(os.listdir(checkpoint_dir)): + if os.path.isdir(os.path.join(checkpoint_dir, item)) and item.startswith("acestep-5Hz-lm-"): + disk_lm_models.append(item) + + boundary_mode = getattr(args, "tier_boundary", False) + batch_boundary_mode = getattr(args, "tier_batch_boundary", False) + + print(f"\n Tiers to test: {tiers_to_test}") + print(f" LM models on disk: {disk_lm_models}") + print(f" Test with LM: {args.tier_with_lm}") + print(f" Test duration: {args.tier_duration}s") + print(f" Boundary testing: {boundary_mode}") + print(f" Batch boundary testing: {batch_boundary_mode}") + print(f" Example: {args.example}") + + # Results collector + all_results = [] + + for sim_gb in tiers_to_test: + print("\n" + "=" * 120) + print(f" TIER TEST: {sim_gb}GB simulated VRAM") + print("=" * 120) + + # Configure GPU simulation + os.environ["MAX_CUDA_VRAM"] = str(sim_gb) + + # Force re-detection of GPU config + gpu_config = get_gpu_config(gpu_memory_gb=float(sim_gb)) + set_global_gpu_config(gpu_config) + + tier = gpu_config.tier + print(f" Tier: {tier}") + print(f" init_lm_default: {gpu_config.init_lm_default}") + print(f" available_lm_models: {gpu_config.available_lm_models}") + print(f" recommended_lm_model: {gpu_config.recommended_lm_model}") + print(f" recommended_backend: {gpu_config.recommended_backend}") + print(f" lm_backend_restriction: {gpu_config.lm_backend_restriction}") + print(f" offload_to_cpu: {gpu_config.offload_to_cpu_default}") + print(f" offload_dit_to_cpu: {gpu_config.offload_dit_to_cpu_default}") + print(f" quantization: {gpu_config.quantization_default}") + print(f" max_duration_with_lm: {gpu_config.max_duration_with_lm}s") + print(f" max_duration_without_lm: {gpu_config.max_duration_without_lm}s") + print(f" max_batch_with_lm: {gpu_config.max_batch_size_with_lm}") + print(f" max_batch_without_lm: {gpu_config.max_batch_size_without_lm}") + + # ---- Test 1: Default configuration ---- + print(f"\n --- Variant: default ---") + result_default = _run_single_tier_test( + sim_gb, gpu_config, args, example_data, + checkpoint_dir, disk_lm_models, + test_variant="default", + ) + all_results.append(result_default) + + if boundary_mode: + # ---- Test 2: No quantization (keep offload as default) ---- + # Skip if the tier already doesn't use quantization (no point re-testing) + if gpu_config.quantization_default: + print(f"\n --- Variant: no-quant (offload={gpu_config.offload_to_cpu_default}) ---") + result_no_quant = _run_single_tier_test( + sim_gb, gpu_config, args, 
example_data, + checkpoint_dir, disk_lm_models, + quantization_override=None, + test_variant="no-quant", + ) + all_results.append(result_no_quant) + else: + print(f"\n --- Variant: no-quant — SKIPPED (tier already has quantization=False) ---") + + # ---- Test 3: No quantization AND no offload ---- + # Skip if the tier already has both disabled + # Also skip if simulated VRAM is too small — the unquantized DiT model + # alone needs ~6GB; without offload there is no room left for VAE decode, + # which causes a fallback to CPU VAE with tiny chunk_size and 20+ hour runs. + MIN_VRAM_FOR_NO_OFFLOAD = 8 # GB — DiT (~6GB) + VAE headroom (~2GB) + if sim_gb < MIN_VRAM_FOR_NO_OFFLOAD: + print(f"\n --- Variant: no-offload — SKIPPED (simulated {sim_gb}GB < {MIN_VRAM_FOR_NO_OFFLOAD}GB minimum for no-offload) ---") + elif gpu_config.quantization_default or gpu_config.offload_to_cpu_default: + print(f"\n --- Variant: no-offload (quant=None, offload=False) ---") + result_no_offload = _run_single_tier_test( + sim_gb, gpu_config, args, example_data, + checkpoint_dir, disk_lm_models, + offload_override=False, + offload_dit_override=False, + quantization_override=None, + test_variant="no-offload", + ) + all_results.append(result_no_offload) + else: + print(f"\n --- Variant: no-offload — SKIPPED (tier already has offload=False, quant=False) ---") + + if batch_boundary_mode: + # ---- Batch boundary tests: escalate batch size until OOM ---- + BATCH_SIZES_TO_TEST = [1, 2, 4, 8] + + # Test WITHOUT LM + print(f"\n --- Batch boundary: without LM ---") + for bs in BATCH_SIZES_TO_TEST: + print(f"\n --- Variant: batch-noLM-{bs} (batch_size={bs}, no LM) ---") + result_batch = _run_single_tier_test( + sim_gb, gpu_config, args, example_data, + checkpoint_dir, disk_lm_models, + test_variant=f"batch-noLM-{bs}", + batch_size_override=bs, + use_lm_override=False, + ) + all_results.append(result_batch) + if not result_batch["gen_success"]: + print(f" āš ļø Batch size {bs} failed without LM — stopping escalation") + break + + # Test WITH LM (if tier supports it) + if gpu_config.init_lm_default and bool(gpu_config.available_lm_models): + print(f"\n --- Batch boundary: with LM ---") + for bs in BATCH_SIZES_TO_TEST: + print(f"\n --- Variant: batch-LM-{bs} (batch_size={bs}, with LM) ---") + result_batch_lm = _run_single_tier_test( + sim_gb, gpu_config, args, example_data, + checkpoint_dir, disk_lm_models, + test_variant=f"batch-LM-{bs}", + batch_size_override=bs, + use_lm_override=True, + ) + all_results.append(result_batch_lm) + if not result_batch_lm["gen_success"]: + print(f" āš ļø Batch size {bs} failed with LM — stopping escalation") + break + + # ---- Print summary ---- + _print_tier_test_summary(all_results) + + if boundary_mode: + _print_boundary_summary(all_results) + + if batch_boundary_mode: + _print_batch_boundary_summary(all_results) + + # Save results + if args.benchmark_output: + with open(args.benchmark_output, "w", encoding="utf-8") as f: + json.dump(all_results, f, indent=2, default=str) + print(f"\n Results saved to: {args.benchmark_output}") + + return all_results + + +def _cleanup_handlers(dit_handler, llm_handler): + """Clean up handlers and free GPU memory.""" + try: + if dit_handler is not None: + if hasattr(dit_handler, 'model') and dit_handler.model is not None: + dit_handler.model = None + if hasattr(dit_handler, 'vae') and dit_handler.vae is not None: + dit_handler.vae = None + if hasattr(dit_handler, 'text_encoder') and dit_handler.text_encoder is not None: + dit_handler.text_encoder = None + del 
dit_handler + except Exception: + pass + + try: + if llm_handler is not None: + if hasattr(llm_handler, 'llm') and llm_handler.llm is not None: + llm_handler.llm = None + del llm_handler + except Exception: + pass + + import gc + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + + +def _print_tier_test_summary(results: List[Dict]): + """Print a summary table of all tier test results.""" + # Detect if any result has a test_variant (boundary mode) + has_variants = any(r.get("test_variant", "default") != "default" for r in results) + + print("\n" + "=" * 160) + print("TIER TEST SUMMARY") + print("=" * 160) + + if has_variants: + header = ( + f"{'VRAM':>6} {'Tier':<10} {'Variant':<12} {'LM':>4} {'LM Model':<24} {'Backend':<8} " + f"{'Offload':<8} {'Quant':<6} {'Init':>5} {'Gen':>5} " + f"{'Wall(s)':>8} {'Peak(GB)':>9} {'Status':<30}" + ) + else: + header = ( + f"{'VRAM':>6} {'Tier':<10} {'LM':>4} {'LM Model':<28} {'Backend':<8} " + f"{'Offload':<8} {'Quant':<6} {'Init':>5} {'Gen':>5} " + f"{'Wall(s)':>8} {'Peak(GB)':>9} {'Status':<30}" + ) + print(header) + print("-" * 160) + + pass_count = 0 + fail_count = 0 + + for r in results: + lm_model_short = (r.get("lm_model") or "-") + max_lm_len = 22 if has_variants else 26 + if len(lm_model_short) > max_lm_len: + lm_model_short = lm_model_short[:max_lm_len] + ".." + + init_ok = "āœ…" if r["init_success"] else "āŒ" + gen_ok = "āœ…" if r["gen_success"] else "āŒ" + status = "PASS" if r["gen_success"] else (r.get("error", "FAIL") or "FAIL") + if len(status) > 28: + status = status[:28] + ".." + + if r["gen_success"]: + pass_count += 1 + else: + fail_count += 1 + + quant = "int8" if r.get("quantization") else "-" + variant = r.get("test_variant", "default") + + if has_variants: + print( + f"{r['tier_gb']:5d}GB {r['tier']:<10} {variant:<12} " + f"{'Y' if r['use_lm'] else 'N':>4} {lm_model_short:<24} " + f"{r.get('lm_backend', '-'):<8} " + f"{'Y' if r['offload'] else 'N':<8} {quant:<6} " + f"{init_ok:>5} {gen_ok:>5} " + f"{r['wall_time']:>8.1f} {r.get('peak_vram_gb', 0):>9.2f} " + f"{status:<30}" + ) + else: + print( + f"{r['tier_gb']:5d}GB {r['tier']:<10} " + f"{'Y' if r['use_lm'] else 'N':>4} {lm_model_short:<28} " + f"{r.get('lm_backend', '-'):<8} " + f"{'Y' if r['offload'] else 'N':<8} {quant:<6} " + f"{init_ok:>5} {gen_ok:>5} " + f"{r['wall_time']:>8.1f} {r.get('peak_vram_gb', 0):>9.2f} " + f"{status:<30}" + ) + + print("-" * 160) + print(f" Total: {len(results)} tests run, {pass_count} PASSED, {fail_count} FAILED") + + +def _print_boundary_summary(results: List[Dict]): + """ + Print a boundary analysis summary showing the minimum tier for each capability. 
+ + Analyzes results from boundary testing to determine: + - Minimum tier that works WITHOUT INT8 quantization + - Minimum tier that works WITHOUT CPU offload (and without quantization) + """ + print("\n" + "=" * 100) + print("BOUNDARY ANALYSIS") + print("=" * 100) + print() + print(" This analysis shows the minimum VRAM tier at which each optimization") + print(" can be safely disabled while still completing inference successfully.") + print() + + # Collect results by variant + no_quant_results = [r for r in results if r.get("test_variant") == "no-quant"] + no_offload_results = [r for r in results if r.get("test_variant") == "no-offload"] + default_results = [r for r in results if r.get("test_variant") == "default"] + + # Also consider default results where the tier already has quant/offload disabled + # (e.g., tier6b default already has quantization=False) + for r in default_results: + if not r.get("quantization") and r not in no_quant_results: + # This tier's default already runs without quantization + no_quant_results.append(r) + if not r.get("offload") and not r.get("quantization") and r not in no_offload_results: + # This tier's default already runs without offload and without quantization + no_offload_results.append(r) + + # Sort by VRAM + no_quant_results.sort(key=lambda r: r["tier_gb"]) + no_offload_results.sort(key=lambda r: r["tier_gb"]) + + # Find minimum passing tier for each capability + def _find_min_passing(result_list, capability_name): + passing = [r for r in result_list if r.get("gen_success")] + failing = [r for r in result_list if not r.get("gen_success")] + + if passing: + min_pass = passing[0] + print(f" {capability_name}:") + print(f" Minimum tier: {min_pass['tier']} ({min_pass['tier_gb']}GB)") + print(f" Peak VRAM: {min_pass.get('peak_vram_gb', 0):.2f}GB") + if failing: + max_fail = failing[-1] + print(f" Last failure: {max_fail['tier']} ({max_fail['tier_gb']}GB) — {max_fail.get('error', 'unknown')[:60]}") + else: + if failing: + print(f" {capability_name}:") + print(f" āŒ No tier passed this test. 
All tested tiers failed.") + for r in failing: + err = (r.get("error") or "unknown")[:50] + print(f" {r['tier_gb']}GB ({r['tier']}): {err}") + else: + print(f" {capability_name}:") + print(f" āš ļø No test results available for this capability.") + print() + return passing[0] if passing else None + + min_no_quant = _find_min_passing(no_quant_results, "Without INT8 Quantization") + min_no_offload = _find_min_passing(no_offload_results, "Without CPU Offload (and no quantization)") + + # Print compact summary table + print(" " + "-" * 60) + print(f" {'Capability':<45} {'Min Tier':<10} {'VRAM':>6}") + print(" " + "-" * 60) + + if min_no_quant: + print(f" {'No INT8 Quantization':<45} {min_no_quant['tier']:<10} {min_no_quant['tier_gb']:>5}GB") + else: + print(f" {'No INT8 Quantization':<45} {'N/A':<10} {'N/A':>6}") + + if min_no_offload: + print(f" {'No CPU Offload (all models on GPU)':<45} {min_no_offload['tier']:<10} {min_no_offload['tier_gb']:>5}GB") + else: + print(f" {'No CPU Offload (all models on GPU)':<45} {'N/A':<10} {'N/A':>6}") + + print(" " + "-" * 60) + print() + print(" Note: These boundaries are empirical and may vary based on:") + print(" - DiT model variant (turbo vs base)") + print(" - Whether LM is enabled (--tier-with-lm)") + print(" - Generation duration and batch size") + print(" - Flash attention availability") + + +def _print_batch_boundary_summary(results: List[Dict]): + """ + Print a batch boundary analysis summary showing the maximum safe batch size per tier. + + Analyzes results from batch boundary testing to determine: + - Maximum batch size WITHOUT LM for each tier + - Maximum batch size WITH LM for each tier + """ + print("\n" + "=" * 120) + print("BATCH BOUNDARY ANALYSIS") + print("=" * 120) + print() + print(" This analysis shows the maximum batch size that completed successfully") + print(" for each simulated VRAM tier.") + print() + + # Collect batch boundary results + batch_no_lm = [r for r in results if r.get("test_variant", "").startswith("batch-noLM-")] + batch_with_lm = [r for r in results if r.get("test_variant", "").startswith("batch-LM-")] + + # Group by tier_gb + def _group_by_tier(result_list): + groups = {} + for r in result_list: + tier_gb = r["tier_gb"] + if tier_gb not in groups: + groups[tier_gb] = {"tier": r["tier"], "results": []} + groups[tier_gb]["results"].append(r) + return groups + + no_lm_groups = _group_by_tier(batch_no_lm) + with_lm_groups = _group_by_tier(batch_with_lm) + + # Find max passing batch per tier + def _max_passing_batch(group_results): + max_bs = 0 + peak_vram = 0.0 + for r in group_results: + if r.get("gen_success"): + bs = r.get("batch_size", 1) + if bs > max_bs: + max_bs = bs + peak_vram = r.get("peak_vram_gb", 0) + return max_bs, peak_vram + + # Collect all tier_gb values + all_tier_gbs = sorted(set(list(no_lm_groups.keys()) + list(with_lm_groups.keys()))) + + # Print table + print(f" {'VRAM':>6} {'Tier':<12} {'Max Batch (no LM)':>18} {'Peak VRAM':>10} {'Max Batch (with LM)':>20} {'Peak VRAM':>10}") + print(" " + "-" * 90) + + summary_rows = [] + for tier_gb in all_tier_gbs: + tier_name = no_lm_groups.get(tier_gb, with_lm_groups.get(tier_gb, {})).get("tier", "?") + + no_lm_max, no_lm_peak = (0, 0.0) + if tier_gb in no_lm_groups: + no_lm_max, no_lm_peak = _max_passing_batch(no_lm_groups[tier_gb]["results"]) + + with_lm_max, with_lm_peak = (0, 0.0) + if tier_gb in with_lm_groups: + with_lm_max, with_lm_peak = _max_passing_batch(with_lm_groups[tier_gb]["results"]) + + no_lm_str = str(no_lm_max) if no_lm_max > 0 else 
"FAIL" + with_lm_str = str(with_lm_max) if with_lm_max > 0 else ("N/A" if tier_gb not in with_lm_groups else "FAIL") + + no_lm_peak_str = f"{no_lm_peak:.2f}GB" if no_lm_max > 0 else "-" + with_lm_peak_str = f"{with_lm_peak:.2f}GB" if with_lm_max > 0 else "-" + + print( + f" {tier_gb:5d}GB {tier_name:<12} {no_lm_str:>18} {no_lm_peak_str:>10} " + f"{with_lm_str:>20} {with_lm_peak_str:>10}" + ) + + summary_rows.append({ + "tier_gb": tier_gb, + "tier": tier_name, + "max_batch_no_lm": no_lm_max, + "max_batch_with_lm": with_lm_max if tier_gb in with_lm_groups else None, + }) + + print(" " + "-" * 90) + print() + + # Print comparison with current GPU_TIER_CONFIGS + print(" Comparison with current GPU_TIER_CONFIGS:") + print(f" {'VRAM':>6} {'Tier':<12} {'Config (no LM)':>15} {'Tested (no LM)':>15} {'Config (LM)':>12} {'Tested (LM)':>12} {'Recommendation':<30}") + print(" " + "-" * 110) + + for row in summary_rows: + tier_gb = row["tier_gb"] + tier_name = row["tier"] + cfg = get_gpu_config(gpu_memory_gb=float(tier_gb)) + + cfg_no_lm = cfg.max_batch_size_without_lm + cfg_with_lm = cfg.max_batch_size_with_lm + tested_no_lm = row["max_batch_no_lm"] + tested_with_lm = row["max_batch_with_lm"] + + tested_no_lm_str = str(tested_no_lm) if tested_no_lm > 0 else "FAIL" + tested_with_lm_str = str(tested_with_lm) if tested_with_lm is not None and tested_with_lm > 0 else ("N/A" if tested_with_lm is None else "FAIL") + + # Recommendation + rec_parts = [] + if tested_no_lm > 0 and tested_no_lm != cfg_no_lm: + rec_parts.append(f"no_lm: {cfg_no_lm}→{tested_no_lm}") + if tested_with_lm is not None and tested_with_lm > 0 and tested_with_lm != cfg_with_lm: + rec_parts.append(f"lm: {cfg_with_lm}→{tested_with_lm}") + recommendation = ", ".join(rec_parts) if rec_parts else "OK" + + print( + f" {tier_gb:5d}GB {tier_name:<12} {cfg_no_lm:>15} {tested_no_lm_str:>15} " + f"{cfg_with_lm:>12} {tested_with_lm_str:>12} {recommendation:<30}" + ) + + print(" " + "-" * 110) + print() + print(" Note: Batch boundary results are empirical and depend on:") + print(" - DiT model variant (turbo vs base)") + print(" - Generation duration (longer = more VRAM per batch)") + print(" - Flash attention availability") + print(" - LM model size (0.6B vs 1.7B vs 4B)") + print(" - Quantization and offload settings") + + # ============================================================================= # Mode: understand # ============================================================================= @@ -763,7 +1583,7 @@ def run_create_sample_mode( print(f"\n Query: {query}") print(f" Instrumental: {args.instrumental}") - timer.sync() + timer.sync() t0 = time.perf_counter() result = create_sample( @@ -826,7 +1646,7 @@ def run_format_sample_mode( print(f"\n Caption: {caption[:80]}...") print(f" Lyrics: {lyrics[:80]}...") - timer.sync() + timer.sync() t0 = time.perf_counter() result = format_sample( @@ -864,23 +1684,23 @@ def run_format_sample_mode( def _print_cprofile(prof): """Print cProfile results and save to file.""" - import pstats - import io - - output_file = "profile_cprofile_detailed.txt" + import pstats + import io + + output_file = "profile_cprofile_detailed.txt" with open(output_file, "w") as f: - ps = pstats.Stats(prof, stream=f) + ps = pstats.Stats(prof, stream=f) ps.sort_stats("cumulative") - ps.print_stats(100) - - print("\n" + "=" * 100) + ps.print_stats(100) + + print("\n" + "=" * 100) print("TOP 20 FUNCTIONS BY CUMULATIVE TIME (cProfile)") - print("=" * 100) - s = io.StringIO() - ps = pstats.Stats(prof, stream=s) + print("=" * 100) + 
s = io.StringIO() + ps = pstats.Stats(prof, stream=s) ps.sort_stats("cumulative") - ps.print_stats(20) - print(s.getvalue()) + ps.print_stats(20) + print(s.getvalue()) print(f"Full report saved to: {output_file}") @@ -888,14 +1708,13 @@ def _cleanup_dir(path: str): """Remove temporary directory silently.""" try: import shutil - shutil.rmtree(path, ignore_errors=True) except Exception: pass # ============================================================================= -# Handler initialization +# Handler initialization (for non-tier-test modes) # ============================================================================= @@ -911,7 +1730,6 @@ def initialize_handlers( if device.startswith("cuda"): try: import flash_attn # noqa: F401 - use_flash_attention = True except ImportError: pass @@ -974,7 +1792,7 @@ def initialize_handlers( def build_parser() -> argparse.ArgumentParser: """Build the argument parser with all options.""" env_config = load_env_config() - + parser = argparse.ArgumentParser( description="ACE-Step 1.5 Inference Profiler & Benchmark", formatter_class=argparse.RawDescriptionHelpFormatter, @@ -983,6 +1801,9 @@ def build_parser() -> argparse.ArgumentParser: python profile_inference.py # Profile text2music python profile_inference.py --thinking --llm-debug # With LLM analysis python profile_inference.py --mode benchmark # Benchmark matrix + python profile_inference.py --mode tier-test # Test all GPU tiers + python profile_inference.py --mode tier-test --tiers 6 8 16 # Test specific tiers + python profile_inference.py --mode tier-test --tier-with-lm # Test tiers with LM python profile_inference.py --mode understand # Profile understand API python profile_inference.py --mode create_sample --sample-query "jazz ballad" python profile_inference.py --device mps --lm-backend mlx # Apple Silicon @@ -998,6 +1819,7 @@ def build_parser() -> argparse.ArgumentParser: choices=[ "profile", "benchmark", + "tier-test", "understand", "create_sample", "format_sample", @@ -1203,6 +2025,44 @@ def build_parser() -> argparse.ArgumentParser: help="Save benchmark results to JSON file", ) + # Tier-test options + parser.add_argument( + "--tiers", + type=int, + nargs="+", + default=None, + help="Specific VRAM tiers to test (e.g., --tiers 6 8 16). Default: all tiers", + ) + parser.add_argument( + "--tier-with-lm", + action="store_true", + help="Enable LM for tiers that support it (default: DiT-only test)", + ) + parser.add_argument( + "--tier-duration", + type=float, + default=240, + help="Test generation duration in seconds for tier-test (default: 240)", + ) + parser.add_argument( + "--tier-skip-compile", + action="store_true", + help="Skip torch.compile for non-quantized tiers (faster testing, less realistic)", + ) + parser.add_argument( + "--tier-boundary", + action="store_true", + help="Enable boundary testing: for each tier, also test without INT8 quantization " + "and without CPU offload to find the minimum VRAM tier for each capability", + ) + parser.add_argument( + "--tier-batch-boundary", + action="store_true", + help="Enable batch size boundary testing: for each tier, progressively test " + "batch sizes 1, 2, 4, 8 (stop at first OOM) to find the maximum safe batch " + "size. 
Tests both with-LM and without-LM configurations.", + ) + # create_sample / understand options parser.add_argument( "--sample-query", @@ -1238,6 +2098,17 @@ def main(): if args.no_constrained_decoding: args.use_constrained_decoding = False + # Tier-test mode has its own initialization flow + if args.mode == "tier-test": + print("=" * 120) + print("ACE-Step 1.5 Tier Compatibility Test") + print("=" * 120) + run_tier_test_mode(args) + print("\n" + "=" * 120) + print("DONE") + print("=" * 120) + return + # Resolve device device = resolve_device(args.device) @@ -1252,7 +2123,7 @@ def main(): # Auto-enable offload for small GPUs if ( gpu_config.gpu_memory_gb > 0 - and gpu_config.gpu_memory_gb < 16 + and gpu_config.gpu_memory_gb < VRAM_AUTO_OFFLOAD_THRESHOLD_GB and not args.offload_to_cpu ): args.offload_to_cpu = True diff --git a/scripts/profile_vram.py b/scripts/profile_vram.py new file mode 100644 index 00000000..5c543845 --- /dev/null +++ b/scripts/profile_vram.py @@ -0,0 +1,536 @@ +#!/usr/bin/env python3 +""" +VRAM Profiling Script for ACE-Step 1.5 + +Measures actual GPU memory consumption of each model component at different +configurations. Results are used to calibrate the empirical VRAM constants +in gpu_config.py. + +Usage: + python scripts/profile_vram.py # Profile all components + python scripts/profile_vram.py --component dit # Profile DiT only + python scripts/profile_vram.py --component lm # Profile LM only + python scripts/profile_vram.py --component vae # Profile VAE only + python scripts/profile_vram.py --output results.json # Save results to JSON + +Requirements: + - CUDA GPU with sufficient memory + - All model checkpoints downloaded +""" + +import argparse +import gc +import json +import os +import sys +import time +from typing import Dict, Any, Optional, List + +# Add project root to path +PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) +if PROJECT_ROOT not in sys.path: + sys.path.insert(0, PROJECT_ROOT) + +import torch + + +def get_memory_stats() -> Dict[str, float]: + """Get current CUDA memory statistics in GB.""" + if not torch.cuda.is_available(): + return {"allocated": 0, "reserved": 0, "free": 0, "total": 0, "max_allocated": 0} + + allocated = torch.cuda.memory_allocated() / (1024**3) + reserved = torch.cuda.memory_reserved() / (1024**3) + free, total = torch.cuda.mem_get_info() + free_gb = free / (1024**3) + total_gb = total / (1024**3) + max_allocated = torch.cuda.max_memory_allocated() / (1024**3) + + return { + "allocated": round(allocated, 3), + "reserved": round(reserved, 3), + "free": round(free_gb, 3), + "total": round(total_gb, 3), + "max_allocated": round(max_allocated, 3), + } + + +def reset_memory(): + """Reset CUDA memory stats and free caches.""" + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + gc.collect() + torch.cuda.empty_cache() + # Wait for GPU to settle + torch.cuda.synchronize() + + +def measure_cuda_context() -> Dict[str, float]: + """Measure CUDA context overhead.""" + print("\n" + "=" * 60) + print("Measuring CUDA context overhead...") + print("=" * 60) + + reset_memory() + before = get_memory_stats() + + # Force CUDA context initialization + _ = torch.zeros(1, device="cuda") + del _ + torch.cuda.synchronize() + + after = get_memory_stats() + + context_overhead = after["total"] - after["free"] - before.get("allocated", 0) + + result = { + "cuda_context_gb": round(context_overhead, 3), + "total_gpu_gb": after["total"], + "free_after_context_gb": after["free"], + } 
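+    # Note: "total - free" from torch.cuda.mem_get_info() is device-wide, so this
+    # estimate also includes memory held by other processes and any cached blocks,
+    # not just this process's CUDA context.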
+ + print(f" CUDA context overhead: {result['cuda_context_gb']:.3f} GB") + print(f" Total GPU memory: {result['total_gpu_gb']:.3f} GB") + print(f" Free after context: {result['free_after_context_gb']:.3f} GB") + + return result + + +def profile_dit(checkpoint_dir: str, config_path: str = "acestep-v15-turbo") -> Dict[str, Any]: + """Profile DiT model memory consumption.""" + print("\n" + "=" * 60) + print(f"Profiling DiT model: {config_path}") + print("=" * 60) + + from transformers import AutoModel + + model_path = os.path.join(checkpoint_dir, config_path) + if not os.path.exists(model_path): + print(f" Model not found: {model_path}") + return {} + + reset_memory() + before = get_memory_stats() + + # Load model weights + print(" Loading DiT model weights...") + model = AutoModel.from_pretrained( + model_path, + trust_remote_code=True, + attn_implementation="sdpa", + dtype=torch.bfloat16, + ) + model = model.to("cuda").to(torch.bfloat16) + model.eval() + torch.cuda.synchronize() + + after_load = get_memory_stats() + weights_gb = after_load["allocated"] - before["allocated"] + + print(f" DiT model weights: {weights_gb:.3f} GB") + + # Load silence latent + silence_path = os.path.join(model_path, "silence_latent.pt") + silence_latent = None + if os.path.exists(silence_path): + silence_latent = torch.load(silence_path, weights_only=True).transpose(1, 2) + silence_latent = silence_latent.to("cuda").to(torch.bfloat16) + + # Determine if model has CFG (base vs turbo) + has_cfg = "turbo" not in config_path.lower() + + # Profile inference at different batch sizes and durations + inference_results = [] + + # Duration -> latent_length mapping: 48000 Hz audio, 5 Hz latent = 9600 audio samples per latent frame + # Actually: latent_length = ceil(duration * 5) for 5Hz models + durations = [60, 120, 240] + batch_sizes = [1, 2, 4] + + for duration in durations: + for batch_size in batch_sizes: + reset_memory() + torch.cuda.reset_peak_memory_stats() + + # Reload model to GPU if needed + model = model.to("cuda") + torch.cuda.synchronize() + + mem_before_inference = get_memory_stats() + + latent_length = int(duration * 5) # 5 Hz + latent_dim = 64 # Standard latent dim + + try: + with torch.inference_mode(): + # Simulate DiT inference inputs + # Create dummy latent noise + noise = torch.randn(batch_size, latent_length, latent_dim, device="cuda", dtype=torch.bfloat16) + + # Simulate text encoder output + text_hidden = torch.randn(batch_size, 512, 768, device="cuda", dtype=torch.bfloat16) + text_mask = torch.ones(batch_size, 512, device="cuda", dtype=torch.long) + + # If has CFG, double the batch for classifier-free guidance + if has_cfg: + noise_cfg = torch.cat([noise, noise], dim=0) + text_hidden_cfg = torch.cat([text_hidden, text_hidden], dim=0) + text_mask_cfg = torch.cat([text_mask, text_mask], dim=0) + del noise_cfg, text_hidden_cfg, text_mask_cfg + + del noise, text_hidden, text_mask + torch.cuda.synchronize() + + mem_after_inference = get_memory_stats() + peak_gb = mem_after_inference["max_allocated"] - mem_before_inference["allocated"] + + result_entry = { + "duration_s": duration, + "batch_size": batch_size, + "has_cfg": has_cfg, + "peak_inference_gb": round(peak_gb, 3), + "latent_length": latent_length, + } + inference_results.append(result_entry) + + print(f" batch={batch_size}, dur={duration}s: peak={peak_gb:.3f} GB (cfg={has_cfg})") + + except RuntimeError as e: + if "out of memory" in str(e).lower(): + print(f" batch={batch_size}, dur={duration}s: OOM") + inference_results.append({ + "duration_s": 
duration, + "batch_size": batch_size, + "has_cfg": has_cfg, + "peak_inference_gb": -1, + "error": "OOM", + }) + torch.cuda.empty_cache() + else: + raise + + # Cleanup + del model + if silence_latent is not None: + del silence_latent + torch.cuda.empty_cache() + gc.collect() + + return { + "config_path": config_path, + "weights_gb": round(weights_gb, 3), + "has_cfg": has_cfg, + "inference_results": inference_results, + } + + +def profile_vae(checkpoint_dir: str) -> Dict[str, Any]: + """Profile VAE model memory consumption.""" + print("\n" + "=" * 60) + print("Profiling VAE model") + print("=" * 60) + + from diffusers.models import AutoencoderOobleck + + vae_path = os.path.join(checkpoint_dir, "vae") + if not os.path.exists(vae_path): + print(f" VAE not found: {vae_path}") + return {} + + reset_memory() + before = get_memory_stats() + + # Load VAE + print(" Loading VAE model weights...") + vae = AutoencoderOobleck.from_pretrained(vae_path) + vae = vae.to("cuda").to(torch.float16) + vae.eval() + torch.cuda.synchronize() + + after_load = get_memory_stats() + weights_gb = after_load["allocated"] - before["allocated"] + + print(f" VAE model weights: {weights_gb:.3f} GB") + + # Profile decode at different chunk sizes + decode_results = [] + chunk_sizes = [256, 512, 1024] + + for chunk_size in chunk_sizes: + reset_memory() + torch.cuda.reset_peak_memory_stats() + + vae = vae.to("cuda") + torch.cuda.synchronize() + + mem_before = get_memory_stats() + + try: + with torch.inference_mode(): + # Simulate latent input: [batch=1, channels=64, length=chunk_size] + latent = torch.randn(1, 64, chunk_size, device="cuda", dtype=torch.float16) + decoder_output = vae.decode(latent) + audio = decoder_output.sample + del decoder_output, audio, latent + torch.cuda.synchronize() + + mem_after = get_memory_stats() + peak_gb = mem_after["max_allocated"] - mem_before["allocated"] + + decode_results.append({ + "chunk_size": chunk_size, + "peak_decode_gb": round(peak_gb, 3), + }) + print(f" chunk_size={chunk_size}: peak={peak_gb:.3f} GB") + + except RuntimeError as e: + if "out of memory" in str(e).lower(): + print(f" chunk_size={chunk_size}: OOM") + decode_results.append({ + "chunk_size": chunk_size, + "peak_decode_gb": -1, + "error": "OOM", + }) + torch.cuda.empty_cache() + else: + raise + + # Cleanup + del vae + torch.cuda.empty_cache() + gc.collect() + + return { + "weights_gb": round(weights_gb, 3), + "decode_results": decode_results, + } + + +def profile_text_encoder(checkpoint_dir: str) -> Dict[str, Any]: + """Profile text encoder memory consumption.""" + print("\n" + "=" * 60) + print("Profiling Text Encoder") + print("=" * 60) + + from transformers import AutoModel, AutoTokenizer + + encoder_path = os.path.join(checkpoint_dir, "text_encoder") + if not os.path.exists(encoder_path): + print(f" Text encoder not found: {encoder_path}") + return {} + + reset_memory() + before = get_memory_stats() + + # Load text encoder + print(" Loading text encoder weights...") + tokenizer = AutoTokenizer.from_pretrained(encoder_path) + model = AutoModel.from_pretrained(encoder_path) + model = model.to("cuda").to(torch.bfloat16) + model.eval() + torch.cuda.synchronize() + + after_load = get_memory_stats() + weights_gb = after_load["allocated"] - before["allocated"] + + print(f" Text encoder weights: {weights_gb:.3f} GB") + + # Cleanup + del model, tokenizer + torch.cuda.empty_cache() + gc.collect() + + return { + "weights_gb": round(weights_gb, 3), + } + + +def profile_lm(checkpoint_dir: str, lm_models: Optional[List[str]] = None) 
-> Dict[str, Any]:
+    """Profile LM model memory consumption."""
+    print("\n" + "=" * 60)
+    print("Profiling 5Hz LM models")
+    print("=" * 60)
+
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+
+    if lm_models is None:
+        # Auto-detect available LM models
+        lm_models = []
+        for name in os.listdir(checkpoint_dir):
+            if "5Hz-lm" in name and os.path.isdir(os.path.join(checkpoint_dir, name)):
+                lm_models.append(name)
+
+    if not lm_models:
+        print("  No LM models found")
+        return {}
+
+    lm_models.sort()
+    results = {}
+
+    for lm_name in lm_models:
+        lm_path = os.path.join(checkpoint_dir, lm_name)
+        if not os.path.exists(lm_path):
+            print(f"  LM model not found: {lm_path}")
+            continue
+
+        print(f"\n  Profiling LM: {lm_name}")
+
+        reset_memory()
+        before = get_memory_stats()
+
+        # Load model weights
+        print(f"    Loading model weights...")
+        model = AutoModelForCausalLM.from_pretrained(
+            lm_path,
+            torch_dtype=torch.bfloat16,
+            trust_remote_code=True,
+        )
+        model = model.to("cuda")
+        model.eval()
+        torch.cuda.synchronize()
+
+        after_load = get_memory_stats()
+        weights_gb = after_load["allocated"] - before["allocated"]
+
+        print(f"    Model weights: {weights_gb:.3f} GB")
+
+        # Estimate KV cache memory for different max_model_len values
+        # KV cache formula: 2 * num_layers * max_tokens * num_kv_heads * head_dim * dtype_size
+        config = model.config
+        num_layers = config.num_hidden_layers
+        num_kv_heads = getattr(config, "num_key_value_heads", config.num_attention_heads)
+        head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        dtype_size = 2  # bfloat16 = 2 bytes
+
+        kv_cache_estimates = {}
+        for max_len in [2048, 4096]:
+            # Per-token KV cache size
+            per_token_bytes = 2 * num_layers * num_kv_heads * head_dim * dtype_size
+            total_bytes = per_token_bytes * max_len
+            total_gb = total_bytes / (1024**3)
+            kv_cache_estimates[str(max_len)] = round(total_gb, 3)
+            print(f"      KV cache ({max_len} tokens): {total_gb:.3f} GB")
+
+        results[lm_name] = {
+            "weights_gb": round(weights_gb, 3),
+            "kv_cache_estimates": kv_cache_estimates,
+            "num_layers": num_layers,
+            "num_kv_heads": num_kv_heads,
+            "head_dim": head_dim,
+        }
+
+        # Cleanup
+        del model
+        torch.cuda.empty_cache()
+        gc.collect()
+
+    return results
+
+
+def main():
+    parser = argparse.ArgumentParser(description="VRAM Profiling for ACE-Step 1.5")
+    parser.add_argument("--component", type=str, default="all",
+                        choices=["all", "cuda_context", "dit", "vae", "text_encoder", "lm"],
+                        help="Component to profile (default: all)")
+    parser.add_argument("--checkpoint-dir", type=str, default=None,
+                        help="Checkpoint directory (default: auto-detect)")
+    parser.add_argument("--dit-config", type=str, default="acestep-v15-turbo",
+                        help="DiT model config name (default: acestep-v15-turbo)")
+    parser.add_argument("--lm-models", type=str, nargs="*", default=None,
+                        help="LM models to profile (default: auto-detect)")
+    parser.add_argument("--output", type=str, default=None,
+                        help="Output JSON file path")
+
+    args = parser.parse_args()
+
+    if not torch.cuda.is_available():
+        print("ERROR: CUDA is not available. 
This script requires a CUDA GPU.")
+        sys.exit(1)
+
+    # Auto-detect checkpoint directory
+    if args.checkpoint_dir is None:
+        args.checkpoint_dir = os.path.join(PROJECT_ROOT, "checkpoints")
+
+    if not os.path.exists(args.checkpoint_dir):
+        print(f"ERROR: Checkpoint directory not found: {args.checkpoint_dir}")
+        sys.exit(1)
+
+    device_name = torch.cuda.get_device_name(0)
+    total_mem = torch.cuda.get_device_properties(0).total_memory / (1024**3)
+
+    print("=" * 60)
+    print("ACE-Step 1.5 VRAM Profiler")
+    print("=" * 60)
+    print(f"  GPU: {device_name}")
+    print(f"  Total VRAM: {total_mem:.2f} GB")
+    print(f"  Checkpoint dir: {args.checkpoint_dir}")
+    print(f"  Component: {args.component}")
+
+    results = {
+        "gpu_name": device_name,
+        "total_vram_gb": round(total_mem, 3),
+        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+    }
+
+    components = [args.component] if args.component != "all" else [
+        "cuda_context", "dit", "vae", "text_encoder", "lm"
+    ]
+
+    for component in components:
+        if component == "cuda_context":
+            results["cuda_context"] = measure_cuda_context()
+        elif component == "dit":
+            results["dit"] = profile_dit(args.checkpoint_dir, args.dit_config)
+        elif component == "vae":
+            results["vae"] = profile_vae(args.checkpoint_dir)
+        elif component == "text_encoder":
+            results["text_encoder"] = profile_text_encoder(args.checkpoint_dir)
+        elif component == "lm":
+            results["lm"] = profile_lm(args.checkpoint_dir, args.lm_models)
+
+    # Print summary
+    print("\n" + "=" * 60)
+    print("SUMMARY")
+    print("=" * 60)
+
+    if "cuda_context" in results:
+        print(f"  CUDA context: {results['cuda_context'].get('cuda_context_gb', 'N/A')} GB")
+    if "dit" in results and results["dit"]:
+        print(f"  DiT weights ({results['dit'].get('config_path', '')}): {results['dit'].get('weights_gb', 'N/A')} GB")
+    if "vae" in results and results["vae"]:
+        print(f"  VAE weights: {results['vae'].get('weights_gb', 'N/A')} GB")
+    if "text_encoder" in results and results["text_encoder"]:
+        print(f"  Text encoder weights: {results['text_encoder'].get('weights_gb', 'N/A')} GB")
+    if "lm" in results and results["lm"]:
+        for lm_name, lm_data in results["lm"].items():
+            print(f"  LM {lm_name} weights: {lm_data.get('weights_gb', 'N/A')} GB")
+
+    # Calculate total base VRAM (all models loaded simultaneously)
+    base_total = 0
+    if "cuda_context" in results:
+        base_total += results["cuda_context"].get("cuda_context_gb", 0)
+    if "dit" in results and results["dit"]:
+        base_total += results["dit"].get("weights_gb", 0)
+    if "vae" in results and results["vae"]:
+        base_total += results["vae"].get("weights_gb", 0)
+    if "text_encoder" in results and results["text_encoder"]:
+        base_total += results["text_encoder"].get("weights_gb", 0)
+
+    print(f"\n  Base VRAM (DiT+VAE+TextEnc+CUDA): {base_total:.3f} GB")
+    print(f"  Remaining for LM + inference: {total_mem - base_total:.3f} GB")
+
+    # Save results
+    if args.output:
+        output_path = args.output
+    else:
+        output_path = os.path.join(PROJECT_ROOT, "scripts", "vram_profile_results.json")
+
+    # Only create the parent directory when one is given; os.path.dirname()
+    # returns "" for a bare filename (e.g. --output results.json), and
+    # os.makedirs("") raises FileNotFoundError.
+    output_dir = os.path.dirname(output_path)
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+    with open(output_path, "w") as f:
+        json.dump(results, f, indent=2)
+    print(f"\n  Results saved to: {output_path}")
+
+
+if __name__ == "__main__":
+    main()