diff --git a/acestep/gpu_config.py b/acestep/gpu_config.py
index f1929ab0..6fb89df1 100644
--- a/acestep/gpu_config.py
+++ b/acestep/gpu_config.py
@@ -111,7 +111,7 @@ class GPUConfig:
     recommended_lm_model: str  # Recommended default LM model path (empty if LM not available)
     # LM backend restriction
-    # "all" = any backend, "pt_mlx_only" = only pt/mlx (no vllm), used for very low VRAM
+    # "all" = any backend, "pt_mlx_only" = only pt/mlx (no vllm), used for MPS (vllm requires CUDA)
     lm_backend_restriction: str  # "all" or "pt_mlx_only"
     recommended_backend: str  # Recommended default backend: "vllm", "pt", or "mlx"
@@ -142,8 +142,8 @@ class GPUConfig:
         "init_lm_default": False,
         "available_lm_models": [],
         "recommended_lm_model": "",
-        "lm_backend_restriction": "pt_mlx_only",  # vllm KV cache won't fit
-        "recommended_backend": "pt",
+        "lm_backend_restriction": "all",
+        "recommended_backend": "vllm",
         "offload_to_cpu_default": True,
         "offload_dit_to_cpu_default": True,
         "quantization_default": True,  # INT8 essential to fit DiT in ~4GB
@@ -161,8 +161,8 @@ class GPUConfig:
         "init_lm_default": False,
         "available_lm_models": [],
         "recommended_lm_model": "",
-        "lm_backend_restriction": "pt_mlx_only",
-        "recommended_backend": "pt",
+        "lm_backend_restriction": "all",
+        "recommended_backend": "vllm",
         "offload_to_cpu_default": True,
         "offload_dit_to_cpu_default": True,
         "quantization_default": True,
@@ -172,7 +172,7 @@ class GPUConfig:
     "tier3": {  # 6-8GB
         # Offload mode. DiT(4.46) + context(0.5) ≈ 5.0GB.
         # ~1.5-3GB headroom allows LM 0.6B (1.2+0.6=1.8GB) and batch=2.
-        # vllm KV cache is tight; pt backend is safer for 0.6B on this tier.
+        # With CPU offload, DiT is offloaded before LM runs → vllm can use freed VRAM.
         "max_duration_with_lm": 480,  # 8 minutes
         "max_duration_without_lm": 600,  # 10 minutes (max supported)
         "max_batch_size_with_lm": 2,
@@ -180,8 +180,8 @@ class GPUConfig:
         "init_lm_default": True,
         "available_lm_models": ["acestep-5Hz-lm-0.6B"],
         "recommended_lm_model": "acestep-5Hz-lm-0.6B",
-        "lm_backend_restriction": "pt_mlx_only",  # vllm KV cache too greedy for <8GB
-        "recommended_backend": "pt",
+        "lm_backend_restriction": "all",
+        "recommended_backend": "vllm",
         "offload_to_cpu_default": True,
         "offload_dit_to_cpu_default": True,
         "quantization_default": True,
@@ -1080,6 +1080,97 @@ def print_gpu_config_info(gpu_config: GPUConfig):
     logger.info(f" - Available LM Models: {gpu_config.available_lm_models or 'None'}")
 
 
+# Human-readable tier labels for UI display
+GPU_TIER_LABELS = {
+    "tier1": "tier1 (≤4GB)",
+    "tier2": "tier2 (4-6GB)",
+    "tier3": "tier3 (6-8GB)",
+    "tier4": "tier4 (8-12GB)",
+    "tier5": "tier5 (12-16GB)",
+    "tier6a": "tier6a (16-20GB)",
+    "tier6b": "tier6b (20-24GB)",
+    "unlimited": "unlimited (≥24GB)",
+}
+
+# Ordered list of tier keys for dropdown
+GPU_TIER_CHOICES = list(GPU_TIER_LABELS.items())  # [(value, label), ...]
+
+
+def get_gpu_device_name() -> str:
+    """
+    Get the GPU device name string.
+
+    Returns:
+        Human-readable GPU name, e.g. "NVIDIA GeForce RTX 4060 Ti",
+        "Apple M2 Pro (MPS)", "CPU only", etc.
+    """
+    try:
+        import torch
+        if torch.cuda.is_available():
+            return torch.cuda.get_device_name(0)
+        elif hasattr(torch, 'xpu') and torch.xpu.is_available():
+            props = torch.xpu.get_device_properties(0)
+            return getattr(props, 'name', 'Intel XPU')
+        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            # MPS doesn't expose a device name; use platform info
+            try:
+                import platform
+                chip = platform.processor() or "Apple Silicon"
+                return f"{chip} (MPS)"
+            except Exception:
+                return "Apple Silicon (MPS)"
+        else:
+            return "CPU only"
+    except ImportError:
+        return "Unknown (PyTorch not available)"
+
+
+def get_gpu_config_for_tier(tier: str) -> GPUConfig:
+    """
+    Create a GPUConfig for a specific tier, applying platform overrides.
+
+    This is used when the user manually selects a different tier in the UI.
+    The actual gpu_memory_gb is preserved from the real hardware detection,
+    but all tier-based settings come from the selected tier's config.
+
+    Args:
+        tier: Tier key, e.g. "tier3", "tier6a", "unlimited"
+
+    Returns:
+        GPUConfig with the selected tier's settings
+    """
+    if tier not in GPU_TIER_CONFIGS:
+        logger.warning(f"Unknown tier '{tier}', falling back to auto-detected config")
+        return get_gpu_config()
+
+    # Keep the real GPU memory for informational purposes
+    real_gpu_memory = get_gpu_memory_gb()
+    config = GPU_TIER_CONFIGS[tier]
+
+    _mps = is_mps_platform()
+    if _mps:
+        logger.info(f"Manual tier override to {tier} on macOS MPS — applying Apple Silicon overrides")
+
+    return GPUConfig(
+        tier=tier,
+        gpu_memory_gb=real_gpu_memory,
+        max_duration_with_lm=config["max_duration_with_lm"],
+        max_duration_without_lm=config["max_duration_without_lm"],
+        max_batch_size_with_lm=config["max_batch_size_with_lm"],
+        max_batch_size_without_lm=config["max_batch_size_without_lm"],
+        init_lm_default=config["init_lm_default"],
+        available_lm_models=config["available_lm_models"],
+        recommended_lm_model=config.get("recommended_lm_model", ""),
+        lm_backend_restriction="pt_mlx_only" if _mps else config.get("lm_backend_restriction", "all"),
+        recommended_backend="mlx" if _mps else config.get("recommended_backend", "vllm"),
+        offload_to_cpu_default=False if _mps else config.get("offload_to_cpu_default", True),
+        offload_dit_to_cpu_default=False if _mps else config.get("offload_dit_to_cpu_default", True),
+        quantization_default=False if _mps else config.get("quantization_default", True),
+        compile_model_default=False if _mps else config.get("compile_model_default", True),
+        lm_memory_gb=config["lm_memory_gb"],
+    )
+
+
 # Global GPU config instance (initialized lazily)
 _global_gpu_config: Optional[GPUConfig] = None
diff --git a/acestep/gradio_ui/events/__init__.py b/acestep/gradio_ui/events/__init__.py
index 90c6e126..00ed93db 100644
--- a/acestep/gradio_ui/events/__init__.py
+++ b/acestep/gradio_ui/events/__init__.py
@@ -43,6 +43,24 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
         ]
     )
 
+    # ========== Tier Override ==========
+    generation_section["tier_dropdown"].change(
+        fn=lambda tier: gen_h.on_tier_change(tier, llm_handler),
+        inputs=[generation_section["tier_dropdown"]],
+        outputs=[
+            generation_section["offload_to_cpu_checkbox"],
+            generation_section["offload_dit_to_cpu_checkbox"],
+            generation_section["compile_model_checkbox"],
+            generation_section["quantization_checkbox"],
+            generation_section["backend_dropdown"],
+            generation_section["lm_model_path"],
+            generation_section["init_llm_checkbox"],
+            generation_section["batch_size_input"],
generation_section["audio_duration"], + generation_section["gpu_info_display"], + ] + ) + generation_section["init_btn"].click( fn=lambda *args: gen_h.init_service_wrapper(dit_handler, llm_handler, *args), inputs=[ diff --git a/acestep/gradio_ui/events/generation_handlers.py b/acestep/gradio_ui/events/generation_handlers.py index 6f89bded..d7fb041a 100644 --- a/acestep/gradio_ui/events/generation_handlers.py +++ b/acestep/gradio_ui/events/generation_handlers.py @@ -16,7 +16,10 @@ ) from acestep.gradio_ui.i18n import t from acestep.inference import understand_music, create_sample, format_sample -from acestep.gpu_config import get_global_gpu_config, is_lm_model_size_allowed, find_best_lm_model_on_disk +from acestep.gpu_config import ( + get_global_gpu_config, is_lm_model_size_allowed, find_best_lm_model_on_disk, + get_gpu_config_for_tier, set_global_gpu_config, GPU_TIER_LABELS, GPU_TIER_CONFIGS, +) def clamp_duration_to_gpu_limit(duration_value: Optional[float], llm_handler=None) -> Optional[float]: @@ -559,6 +562,88 @@ def init_service_wrapper(dit_handler, llm_handler, checkpoint, config_path, devi ) +def on_tier_change(selected_tier, llm_handler=None): + """ + Handle manual tier override from the UI dropdown. + + Updates the global GPU config and returns gr.update() for all + affected UI components so they reflect the new tier's defaults. + + Returns a tuple of gr.update() objects for: + (offload_to_cpu, offload_dit_to_cpu, compile_model, quantization, + backend_dropdown, lm_model_path, init_llm, batch_size_input, + audio_duration, gpu_info_display) + """ + if not selected_tier or selected_tier not in GPU_TIER_CONFIGS: + logger.warning(f"Invalid tier selection: {selected_tier}") + return (gr.update(),) * 10 + + # Build new config for the selected tier and update global + new_config = get_gpu_config_for_tier(selected_tier) + set_global_gpu_config(new_config) + logger.info(f"🔄 Tier manually changed to {selected_tier} — updating UI defaults") + + # Backend choices + if new_config.lm_backend_restriction == "pt_mlx_only": + available_backends = ["pt", "mlx"] + else: + available_backends = ["vllm", "pt", "mlx"] + recommended_backend = new_config.recommended_backend + if recommended_backend not in available_backends: + recommended_backend = available_backends[0] + + # LM model choices — filter disk models by tier + tier_lm_models = new_config.available_lm_models + all_disk_models = llm_handler.get_available_5hz_lm_models() if llm_handler else [] + if tier_lm_models: + filtered = [m for m in all_disk_models if is_lm_model_size_allowed(m, tier_lm_models)] + available_lm_models = filtered if filtered else all_disk_models + else: + available_lm_models = all_disk_models + + recommended_lm = new_config.recommended_lm_model + default_lm_model = find_best_lm_model_on_disk(recommended_lm, available_lm_models) + + # Duration and batch limits (use without-LM limits as safe default; init will refine) + max_duration = new_config.max_duration_without_lm + max_batch = new_config.max_batch_size_without_lm + + # GPU info markdown update + tier_label = GPU_TIER_LABELS.get(selected_tier, selected_tier) + from acestep.gpu_config import get_gpu_device_name + _gpu_device_name = get_gpu_device_name() + gpu_info_text = f"🖥️ **{_gpu_device_name}** — {new_config.gpu_memory_gb:.1f} GB VRAM — {t('service.gpu_auto_tier')}: **{tier_label}**" + + return ( + # offload_to_cpu_checkbox + gr.update(value=new_config.offload_to_cpu_default, + info=t("service.offload_cpu_info") + (" (recommended for this tier)" if 
+                      new_config.offload_to_cpu_default else " (optional for this tier)")),
+        # offload_dit_to_cpu_checkbox
+        gr.update(value=new_config.offload_dit_to_cpu_default,
+                  info=t("service.offload_dit_cpu_info") + (" (recommended for this tier)" if new_config.offload_dit_to_cpu_default else " (optional for this tier)")),
+        # compile_model_checkbox
+        gr.update(value=new_config.compile_model_default),
+        # quantization_checkbox
+        gr.update(value=new_config.quantization_default,
+                  info=t("service.quantization_info") + (" (recommended for this tier)" if new_config.quantization_default else " (optional for this tier)")),
+        # backend_dropdown
+        gr.update(choices=available_backends, value=recommended_backend),
+        # lm_model_path
+        gr.update(choices=available_lm_models, value=default_lm_model,
+                  info=t("service.lm_model_path_info") + (f" (Recommended: {recommended_lm})" if recommended_lm else " (LM not available for this GPU tier)")),
+        # init_llm_checkbox
+        gr.update(value=new_config.init_lm_default),
+        # batch_size_input
+        gr.update(value=min(2, max_batch), maximum=max_batch,
+                  info=f"Number of samples to generate (Max: {max_batch})"),
+        # audio_duration
+        gr.update(maximum=float(max_duration),
+                  info=f"Duration in seconds (-1 for auto). Max: {max_duration}s / {max_duration // 60} min"),
+        # gpu_info_display
+        gr.update(value=gpu_info_text),
+    )
+
+
 def get_ui_control_config(is_turbo: bool) -> dict:
     """Return UI control configuration (values, limits, visibility) for model type.
     Used by both interactive init and service-mode startup so controls stay consistent.
@@ -821,7 +906,13 @@ def update_audio_components_visibility(batch_size):
         Row 2: Components 5-8 (batch_size 5-8)
     """
     # Clamp batch size to 1-8 range for UI
-    batch_size = min(max(int(batch_size), 1), 8)
+    if batch_size is None:
+        batch_size = 1
+    else:
+        try:
+            batch_size = min(max(int(batch_size), 1), 8)
+        except (TypeError, ValueError):
+            batch_size = 1
 
     # Row 1 columns (1-4)
     updates_row1 = (
diff --git a/acestep/gradio_ui/i18n/en.json b/acestep/gradio_ui/i18n/en.json
index 099711dc..875aea5c 100644
--- a/acestep/gradio_ui/i18n/en.json
+++ b/acestep/gradio_ui/i18n/en.json
@@ -59,7 +59,10 @@
     "init_btn": "Initialize Service",
     "status_label": "Status",
     "language_label": "UI Language",
-    "language_info": "Select interface language"
+    "language_info": "Select interface language",
+    "gpu_auto_tier": "Auto-detected tier",
+    "tier_label": "GPU Tier Override",
+    "tier_info": "Manually select a GPU tier to adjust optimization defaults (offload, quantization, backend, etc.)"
   },
   "generation": {
     "required_inputs": "📝 Required Inputs",
diff --git a/acestep/gradio_ui/i18n/he.json b/acestep/gradio_ui/i18n/he.json
index 6e0535d3..399d74cb 100644
--- a/acestep/gradio_ui/i18n/he.json
+++ b/acestep/gradio_ui/i18n/he.json
@@ -59,7 +59,10 @@
     "init_btn": "אתחול שירות",
     "status_label": "מצב",
     "language_label": "שפת ממשק",
-    "language_info": "בחר את שפת הממשק"
+    "language_info": "בחר את שפת הממשק",
+    "gpu_auto_tier": "שכבה שזוהתה אוטומטית",
+    "tier_label": "דריסת שכבת GPU",
+    "tier_info": "בחר שכבת GPU באופן ידני כדי להתאים ברירות מחדל של אופטימיזציה (העברה, קוונטיזציה, מנוע וכו')"
   },
   "generation": {
     "required_inputs": "📝 קלטים נדרשים",
diff --git a/acestep/gradio_ui/i18n/ja.json b/acestep/gradio_ui/i18n/ja.json
index e5b934cb..a9af9445 100644
--- a/acestep/gradio_ui/i18n/ja.json
+++ b/acestep/gradio_ui/i18n/ja.json
@@ -59,7 +59,10 @@
     "init_btn": "サービスを初期化",
     "status_label": "ステータス",
     "language_label": "UI言語",
-    "language_info": "インターフェース言語を選択"
+    "language_info": "インターフェース言語を選択",
+    "gpu_auto_tier": "自動検出ティア",
+    "tier_label": "GPU ティアの手動選択",
+    "tier_info": "GPUティアを手動で選択して最適化のデフォルト(オフロード、量子化、バックエンドなど)を調整します"
   },
   "generation": {
     "required_inputs": "📝 必須入力",
diff --git a/acestep/gradio_ui/i18n/zh.json b/acestep/gradio_ui/i18n/zh.json
index cd222e4c..9a4cd160 100644
--- a/acestep/gradio_ui/i18n/zh.json
+++ b/acestep/gradio_ui/i18n/zh.json
@@ -59,7 +59,10 @@
     "init_btn": "初始化服务",
     "status_label": "状态",
     "language_label": "界面语言",
-    "language_info": "选择界面语言"
+    "language_info": "选择界面语言",
+    "gpu_auto_tier": "自动检测层级",
+    "tier_label": "GPU 层级覆盖",
+    "tier_info": "手动选择 GPU 层级以调整优化默认值(卸载、量化、后端等)"
   },
   "generation": {
     "required_inputs": "📝 必需输入",
diff --git a/acestep/gradio_ui/interfaces/generation.py b/acestep/gradio_ui/interfaces/generation.py
index 2d0c18bf..2700af02 100644
--- a/acestep/gradio_ui/interfaces/generation.py
+++ b/acestep/gradio_ui/interfaces/generation.py
@@ -13,7 +13,7 @@
 )
 from acestep.gradio_ui.i18n import t
 from acestep.gradio_ui.events.generation_handlers import get_ui_control_config
-from acestep.gpu_config import get_global_gpu_config, GPUConfig, is_lm_model_size_allowed, find_best_lm_model_on_disk
+from acestep.gpu_config import get_global_gpu_config, GPUConfig, is_lm_model_size_allowed, find_best_lm_model_on_disk, get_gpu_device_name, GPU_TIER_LABELS, GPU_TIER_CHOICES
 
 
 def create_generation_section(dit_handler, llm_handler, init_params=None, language='en') -> dict:
@@ -101,6 +101,20 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
                 scale=1,
             )
 
+        # GPU info display and tier override
+        _gpu_device_name = get_gpu_device_name()
+        _gpu_info_text = f"🖥️ **{_gpu_device_name}** — {gpu_config.gpu_memory_gb:.1f} GB VRAM — {t('service.gpu_auto_tier')}: **{GPU_TIER_LABELS.get(gpu_config.tier, gpu_config.tier)}**"
+        with gr.Row():
+            gpu_info_display = gr.Markdown(value=_gpu_info_text)
+        with gr.Row():
+            tier_dropdown = gr.Dropdown(
+                choices=[(label, key) for key, label in GPU_TIER_LABELS.items()],
+                value=gpu_config.tier,
+                label=t("service.tier_label"),
+                info=t("service.tier_info"),
+                scale=1,
+            )
+
         # Dropdown options section - all dropdowns grouped together
         with gr.Row(equal_height=True):
             with gr.Column(scale=4):
@@ -874,4 +888,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
         "gpu_config": gpu_config,
         "max_duration": max_duration,
         "max_batch_size": max_batch_size,
+        # GPU info and tier override
+        "gpu_info_display": gpu_info_display,
+        "tier_dropdown": tier_dropdown,
     }
diff --git a/acestep/handler.py b/acestep/handler.py
index 50c1b6b3..a8e839b6 100644
--- a/acestep/handler.py
+++ b/acestep/handler.py
@@ -1451,8 +1451,16 @@ def _vram_guard_reduce_batch(
         # Estimate per-sample activation cost for DiT
         duration_sec = float(audio_duration) if audio_duration and float(audio_duration) > 0 else 60.0
-        # Empirical: ~0.8 GB per sample at 60s, linear scaling
-        per_sample_gb = 0.8 * (duration_sec / 60.0)
+        # Empirical observation: DiT activation memory per extra batch element is
+        # relatively modest because the latent is processed in a single forward pass
+        # and flash-attention keeps peak memory low. Measured values:
+        #   - 60s turbo, no LM, batch 4 → ~13.3 GB total on a 16GB GPU
+        #     (model ~8.5 GB + 4 × ~0.8 GB activations ≈ 11.7 GB + overhead)
+        #   - 208s turbo, batch 1 → peak 9.3 GB (model ~8.9 GB + ~0.4 GB activation)
+        # The old formula (0.8 * duration/60) heavily overestimates for long durations
+        # because activation memory scales sub-linearly with latent length (flash attn).
+        # Use a flatter formula that tracks these measurements: base 0.5 GB + 0.15 GB per 60s beyond 60s.
+        per_sample_gb = 0.5 + max(0.0, 0.15 * (duration_sec - 60.0) / 60.0)
 
         # If using cfg (base model), double the per-sample cost
         if hasattr(self, 'model') and self.model is not None:
             model_name = getattr(self, 'config_path', '') or ''
diff --git a/acestep/llm_inference.py b/acestep/llm_inference.py
index d7daf062..bcbb2042 100644
--- a/acestep/llm_inference.py
+++ b/acestep/llm_inference.py
@@ -25,8 +25,9 @@
 from acestep.constants import DEFAULT_LM_INSTRUCTION, DEFAULT_LM_UNDERSTAND_INSTRUCTION, DEFAULT_LM_INSPIRED_INSTRUCTION, DEFAULT_LM_REWRITE_INSTRUCTION
 from acestep.gpu_config import get_lm_gpu_memory_ratio, get_gpu_memory_gb, get_lm_model_size, get_global_gpu_config
 
-# VRAM thresholds for skipping vLLM/CUDA graphs on 16GB GPUs to avoid OOM/fragmentation
-VRAM_SAFE_TOTAL_GB = 16.0
+# Minimum free VRAM (GB) required to attempt vLLM initialization.
+# vLLM's KV cache allocator adapts to available memory, so we only need a
+# basic sanity check — not a hard total-VRAM gate.
 VRAM_SAFE_FREE_GB = 2.0
 
@@ -551,9 +552,9 @@ def initialize(
             free_gb = (total_bytes - torch.cuda.memory_reserved(0)) / (1024**3)
         except Exception:
             free_gb = 0.0
-        if device == "cuda" and (total_gb <= VRAM_SAFE_TOTAL_GB or free_gb < VRAM_SAFE_FREE_GB):
+        if device == "cuda" and free_gb < VRAM_SAFE_FREE_GB:
             logger.warning(
-                f"vLLM disabled due to VRAM safety constraints (total={total_gb:.2f}GB, free={free_gb:.2f}GB) — falling back to PyTorch backend"
+                f"vLLM disabled due to insufficient free VRAM (total={total_gb:.2f}GB, free={free_gb:.2f}GB, need>={VRAM_SAFE_FREE_GB}GB free) — falling back to PyTorch backend"
             )
             success, status_msg = self._load_pytorch_model(full_lm_model_path, device)
             if not success:
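
Reviewer note (not part of the patch): the UI wiring in gpu_config.py and generation_handlers.py reduces to a small core: build a GPUConfig for the chosen tier key, publish it via set_global_gpu_config, and let the change handler re-derive the widget defaults from it. A minimal sketch of that flow, using only names introduced or imported by this patch; the specific tier key and printed fields are illustrative:

    # Illustration only, mirrors what the tier_dropdown.change handler does.
    from acestep.gpu_config import get_gpu_config_for_tier, set_global_gpu_config

    cfg = get_gpu_config_for_tier("tier3")   # tier3 defaults; real VRAM reading is preserved
    set_global_gpu_config(cfg)               # later handlers read this global config
    print(cfg.recommended_backend, cfg.max_batch_size_without_lm, cfg.quantization_default)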
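
Reviewer note (not part of the patch): the handler.py hunk replaces the linear per-sample VRAM estimate with a flatter one. The difference is easiest to see by evaluating both expressions from the diff at a few durations; this is plain arithmetic and assumes nothing beyond the two formulas shown above:

    # old estimate: 0.8 GB per sample, scaled linearly with duration
    # new estimate: 0.5 GB base + 0.15 GB per extra 60 s beyond 60 s
    for duration_sec in (60.0, 208.0, 600.0):
        old = 0.8 * (duration_sec / 60.0)
        new = 0.5 + max(0.0, 0.15 * (duration_sec - 60.0) / 60.0)
        print(f"{duration_sec:5.0f}s  old={old:.2f} GB  new={new:.2f} GB")
    # 60s: 0.80 vs 0.50, 208s: 2.77 vs 0.87, 600s: 8.00 vs 1.85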