Merged
107 changes: 99 additions & 8 deletions acestep/gpu_config.py
@@ -111,7 +111,7 @@ class GPUConfig:
recommended_lm_model: str # Recommended default LM model path (empty if LM not available)

# LM backend restriction
# "all" = any backend, "pt_mlx_only" = only pt/mlx (no vllm), used for very low VRAM
# "all" = any backend, "pt_mlx_only" = only pt/mlx (no vllm), used for MPS (vllm requires CUDA)
lm_backend_restriction: str # "all" or "pt_mlx_only"
recommended_backend: str # Recommended default backend: "vllm", "pt", or "mlx"

@@ -142,8 +142,8 @@ class GPUConfig:
"init_lm_default": False,
"available_lm_models": [],
"recommended_lm_model": "",
"lm_backend_restriction": "pt_mlx_only", # vllm KV cache won't fit
"recommended_backend": "pt",
"lm_backend_restriction": "all",
"recommended_backend": "vllm",
"offload_to_cpu_default": True,
"offload_dit_to_cpu_default": True,
"quantization_default": True, # INT8 essential to fit DiT in ~4GB
@@ -161,8 +161,8 @@ class GPUConfig:
"init_lm_default": False,
"available_lm_models": [],
"recommended_lm_model": "",
"lm_backend_restriction": "pt_mlx_only",
"recommended_backend": "pt",
"lm_backend_restriction": "all",
"recommended_backend": "vllm",
"offload_to_cpu_default": True,
"offload_dit_to_cpu_default": True,
"quantization_default": True,
@@ -172,16 +172,16 @@ class GPUConfig:
"tier3": { # 6-8GB
# Offload mode. DiT(4.46) + context(0.5) ≈ 5.0GB.
# ~1.5-3GB headroom allows LM 0.6B (1.2+0.6=1.8GB) and batch=2.
# vllm KV cache is tight; pt backend is safer for 0.6B on this tier.
# With CPU offload, DiT is offloaded before LM runs → vllm can use freed VRAM.
"max_duration_with_lm": 480, # 8 minutes
"max_duration_without_lm": 600, # 10 minutes (max supported)
"max_batch_size_with_lm": 2,
"max_batch_size_without_lm": 2,
"init_lm_default": True,
"available_lm_models": ["acestep-5Hz-lm-0.6B"],
"recommended_lm_model": "acestep-5Hz-lm-0.6B",
"lm_backend_restriction": "pt_mlx_only", # vllm KV cache too greedy for <8GB
"recommended_backend": "pt",
"lm_backend_restriction": "all",
"recommended_backend": "vllm",
"offload_to_cpu_default": True,
"offload_dit_to_cpu_default": True,
"quantization_default": True,
@@ -1080,6 +1080,97 @@ def print_gpu_config_info(gpu_config: GPUConfig):
logger.info(f" - Available LM Models: {gpu_config.available_lm_models or 'None'}")


# Human-readable tier labels for UI display
GPU_TIER_LABELS = {
"tier1": "tier1 (≤4GB)",
"tier2": "tier2 (4-6GB)",
"tier3": "tier3 (6-8GB)",
"tier4": "tier4 (8-12GB)",
"tier5": "tier5 (12-16GB)",
"tier6a": "tier6a (16-20GB)",
"tier6b": "tier6b (20-24GB)",
"unlimited": "unlimited (≥24GB)",
}

# Ordered list of tier keys for dropdown
GPU_TIER_CHOICES = list(GPU_TIER_LABELS.items()) # [(value, label), ...]


def get_gpu_device_name() -> str:
"""
Get the GPU device name string.

Returns:
Human-readable GPU name, e.g. "NVIDIA GeForce RTX 4060 Ti",
"Apple M2 Pro (MPS)", "CPU only", etc.
"""
try:
import torch
if torch.cuda.is_available():
return torch.cuda.get_device_name(0)
elif hasattr(torch, 'xpu') and torch.xpu.is_available():
props = torch.xpu.get_device_properties(0)
return getattr(props, 'name', 'Intel XPU')
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
# MPS doesn't expose a device name; use platform info
try:
import platform
chip = platform.processor() or "Apple Silicon"
return f"{chip} (MPS)"
except Exception:
return "Apple Silicon (MPS)"
else:
return "CPU only"
except ImportError:
return "Unknown (PyTorch not available)"


def get_gpu_config_for_tier(tier: str) -> GPUConfig:
"""
Create a GPUConfig for a specific tier, applying platform overrides.

This is used when the user manually selects a different tier in the UI.
The actual gpu_memory_gb is preserved from the real hardware detection,
but all tier-based settings come from the selected tier's config.

Args:
tier: Tier key, e.g. "tier3", "tier6a", "unlimited"

Returns:
GPUConfig with the selected tier's settings
"""
if tier not in GPU_TIER_CONFIGS:
logger.warning(f"Unknown tier '{tier}', falling back to auto-detected config")
return get_gpu_config()

# Keep the real GPU memory for informational purposes
real_gpu_memory = get_gpu_memory_gb()
config = GPU_TIER_CONFIGS[tier]

_mps = is_mps_platform()
if _mps:
logger.info(f"Manual tier override to {tier} on macOS MPS — applying Apple Silicon overrides")

return GPUConfig(
tier=tier,
gpu_memory_gb=real_gpu_memory,
max_duration_with_lm=config["max_duration_with_lm"],
max_duration_without_lm=config["max_duration_without_lm"],
max_batch_size_with_lm=config["max_batch_size_with_lm"],
max_batch_size_without_lm=config["max_batch_size_without_lm"],
init_lm_default=config["init_lm_default"],
available_lm_models=config["available_lm_models"],
recommended_lm_model=config.get("recommended_lm_model", ""),
lm_backend_restriction="pt_mlx_only" if _mps else config.get("lm_backend_restriction", "all"),
recommended_backend="mlx" if _mps else config.get("recommended_backend", "vllm"),
offload_to_cpu_default=False if _mps else config.get("offload_to_cpu_default", True),
offload_dit_to_cpu_default=False if _mps else config.get("offload_dit_to_cpu_default", True),
quantization_default=False if _mps else config.get("quantization_default", True),
compile_model_default=False if _mps else config.get("compile_model_default", True),
lm_memory_gb=config["lm_memory_gb"],
)


# Global GPU config instance (initialized lazily)
_global_gpu_config: Optional[GPUConfig] = None
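For reference, here is a minimal sketch (not part of this diff) of how the new tier-override helper could be exercised from Python, assuming only the names introduced in this file (`get_gpu_config_for_tier`, `set_global_gpu_config`, `GPU_TIER_LABELS`):

```python
# Sketch only — assumes the module-level names added in acestep/gpu_config.py.
from acestep.gpu_config import (
    get_gpu_config_for_tier,
    set_global_gpu_config,
    GPU_TIER_LABELS,
)

# Build a config for a manually selected tier. The real detected VRAM is
# kept for display, but limits and defaults come from the chosen tier
# (with Apple Silicon overrides applied automatically on MPS).
config = get_gpu_config_for_tier("tier3")
print(GPU_TIER_LABELS[config.tier])       # "tier3 (6-8GB)"
print(config.max_duration_without_lm)     # 600
print(config.recommended_backend)         # "vllm" on CUDA, "mlx" on MPS

# Make it the process-wide default so later handlers pick it up.
set_global_gpu_config(config)
```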

18 changes: 18 additions & 0 deletions acestep/gradio_ui/events/__init__.py
@@ -43,6 +43,24 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
]
)

# ========== Tier Override ==========
generation_section["tier_dropdown"].change(
fn=lambda tier: gen_h.on_tier_change(tier, llm_handler),
inputs=[generation_section["tier_dropdown"]],
outputs=[
generation_section["offload_to_cpu_checkbox"],
generation_section["offload_dit_to_cpu_checkbox"],
generation_section["compile_model_checkbox"],
generation_section["quantization_checkbox"],
generation_section["backend_dropdown"],
generation_section["lm_model_path"],
generation_section["init_llm_checkbox"],
generation_section["batch_size_input"],
generation_section["audio_duration"],
generation_section["gpu_info_display"],
]
)

generation_section["init_btn"].click(
fn=lambda *args: gen_h.init_service_wrapper(dit_handler, llm_handler, *args),
inputs=[
95 changes: 93 additions & 2 deletions acestep/gradio_ui/events/generation_handlers.py
@@ -16,7 +16,10 @@
)
from acestep.gradio_ui.i18n import t
from acestep.inference import understand_music, create_sample, format_sample
from acestep.gpu_config import get_global_gpu_config, is_lm_model_size_allowed, find_best_lm_model_on_disk
from acestep.gpu_config import (
get_global_gpu_config, is_lm_model_size_allowed, find_best_lm_model_on_disk,
get_gpu_config_for_tier, set_global_gpu_config, GPU_TIER_LABELS, GPU_TIER_CONFIGS,
)


def clamp_duration_to_gpu_limit(duration_value: Optional[float], llm_handler=None) -> Optional[float]:
@@ -559,6 +562,88 @@ def init_service_wrapper(dit_handler, llm_handler, checkpoint, config_path, devi
)


def on_tier_change(selected_tier, llm_handler=None):
"""
Handle manual tier override from the UI dropdown.

Updates the global GPU config and returns gr.update() for all
affected UI components so they reflect the new tier's defaults.

Returns a tuple of gr.update() objects for:
(offload_to_cpu, offload_dit_to_cpu, compile_model, quantization,
backend_dropdown, lm_model_path, init_llm, batch_size_input,
audio_duration, gpu_info_display)
"""
if not selected_tier or selected_tier not in GPU_TIER_CONFIGS:
logger.warning(f"Invalid tier selection: {selected_tier}")
return (gr.update(),) * 10

# Build new config for the selected tier and update global
new_config = get_gpu_config_for_tier(selected_tier)
set_global_gpu_config(new_config)
logger.info(f"🔄 Tier manually changed to {selected_tier} — updating UI defaults")

# Backend choices
if new_config.lm_backend_restriction == "pt_mlx_only":
available_backends = ["pt", "mlx"]
else:
available_backends = ["vllm", "pt", "mlx"]
recommended_backend = new_config.recommended_backend
if recommended_backend not in available_backends:
recommended_backend = available_backends[0]

# LM model choices — filter disk models by tier
tier_lm_models = new_config.available_lm_models
all_disk_models = llm_handler.get_available_5hz_lm_models() if llm_handler else []
if tier_lm_models:
filtered = [m for m in all_disk_models if is_lm_model_size_allowed(m, tier_lm_models)]
available_lm_models = filtered if filtered else all_disk_models
else:
available_lm_models = all_disk_models

recommended_lm = new_config.recommended_lm_model
default_lm_model = find_best_lm_model_on_disk(recommended_lm, available_lm_models)

# Duration and batch limits (use without-LM limits as safe default; init will refine)
max_duration = new_config.max_duration_without_lm
max_batch = new_config.max_batch_size_without_lm

# GPU info markdown update
tier_label = GPU_TIER_LABELS.get(selected_tier, selected_tier)
from acestep.gpu_config import get_gpu_device_name
_gpu_device_name = get_gpu_device_name()
gpu_info_text = f"🖥️ **{_gpu_device_name}** — {new_config.gpu_memory_gb:.1f} GB VRAM — {t('service.gpu_auto_tier')}: **{tier_label}**"

return (
# offload_to_cpu_checkbox
gr.update(value=new_config.offload_to_cpu_default,
info=t("service.offload_cpu_info") + (" (recommended for this tier)" if new_config.offload_to_cpu_default else " (optional for this tier)")),
# offload_dit_to_cpu_checkbox
gr.update(value=new_config.offload_dit_to_cpu_default,
info=t("service.offload_dit_cpu_info") + (" (recommended for this tier)" if new_config.offload_dit_to_cpu_default else " (optional for this tier)")),
# compile_model_checkbox
gr.update(value=new_config.compile_model_default),
# quantization_checkbox
gr.update(value=new_config.quantization_default,
info=t("service.quantization_info") + (" (recommended for this tier)" if new_config.quantization_default else " (optional for this tier)")),
# backend_dropdown
gr.update(choices=available_backends, value=recommended_backend),
# lm_model_path
gr.update(choices=available_lm_models, value=default_lm_model,
info=t("service.lm_model_path_info") + (f" (Recommended: {recommended_lm})" if recommended_lm else " (LM not available for this GPU tier)")),
# init_llm_checkbox
gr.update(value=new_config.init_lm_default),
# batch_size_input
gr.update(value=min(2, max_batch), maximum=max_batch,
info=f"Number of samples to generate (Max: {max_batch})"),
# audio_duration
gr.update(maximum=float(max_duration),
info=f"Duration in seconds (-1 for auto). Max: {max_duration}s / {max_duration // 60} min"),
# gpu_info_display
gr.update(value=gpu_info_text),
)
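The 10-tuple returned above must line up positionally with the `outputs=` list wired in `acestep/gradio_ui/events/__init__.py`. As an illustration only (hypothetical component names, not part of this PR), the contract looks like this in isolation:

```python
# Sketch of the gr.update()/outputs ordering contract used by on_tier_change.
import gradio as gr

def on_choice_change(choice):
    # Element i of the returned tuple is applied to outputs[i].
    return (
        gr.update(value=(choice == "a")),  # -> checkbox
        gr.update(maximum=10),             # -> slider
    )

with gr.Blocks() as demo:
    dropdown = gr.Dropdown(choices=["a", "b"], value="a", label="Choice")
    checkbox = gr.Checkbox(label="Flag")
    slider = gr.Slider(1, 8, value=1, label="Count")
    dropdown.change(
        fn=on_choice_change,
        inputs=[dropdown],
        outputs=[checkbox, slider],  # order must match the returned tuple
    )
```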


def get_ui_control_config(is_turbo: bool) -> dict:
"""Return UI control configuration (values, limits, visibility) for model type.
Used by both interactive init and service-mode startup so controls stay consistent.
@@ -821,7 +906,13 @@ def update_audio_components_visibility(batch_size):
Row 2: Components 5-8 (batch_size 5-8)
"""
# Clamp batch size to 1-8 range for UI
batch_size = min(max(int(batch_size), 1), 8)
if batch_size is None:
batch_size = 1
else:
try:
batch_size = min(max(int(batch_size), 1), 8)
except (TypeError, ValueError):
batch_size = 1

# Row 1 columns (1-4)
updates_row1 = (
5 changes: 4 additions & 1 deletion acestep/gradio_ui/i18n/en.json
@@ -59,7 +59,10 @@
"init_btn": "Initialize Service",
"status_label": "Status",
"language_label": "UI Language",
"language_info": "Select interface language"
"language_info": "Select interface language",
"gpu_auto_tier": "Auto-detected tier",
"tier_label": "GPU Tier Override",
"tier_info": "Manually select a GPU tier to adjust optimization defaults (offload, quantization, backend, etc.)"
},
"generation": {
"required_inputs": "📝 Required Inputs",
5 changes: 4 additions & 1 deletion acestep/gradio_ui/i18n/he.json
@@ -59,7 +59,10 @@
"init_btn": "אתחול שירות",
"status_label": "מצב",
"language_label": "שפת ממשק",
"language_info": "בחר את שפת הממשק"
"language_info": "בחר את שפת הממשק",
"gpu_auto_tier": "שכבה שזוהתה אוטומטית",
"tier_label": "דריסת שכבת GPU",
"tier_info": "בחר שכבת GPU באופן ידני כדי להתאים ברירות מחדל של אופטימיזציה (העברה, קוונטיזציה, מנוע וכו')"
},
"generation": {
"required_inputs": "📝 קלטים נדרשים",
5 changes: 4 additions & 1 deletion acestep/gradio_ui/i18n/ja.json
@@ -59,7 +59,10 @@
"init_btn": "サービスを初期化",
"status_label": "ステータス",
"language_label": "UI言語",
"language_info": "インターフェース言語を選択"
"language_info": "インターフェース言語を選択",
"gpu_auto_tier": "自動検出ティア",
"tier_label": "GPU ティアの手動選択",
"tier_info": "GPUティアを手動で選択して最適化のデフォルト(オフロード、量子化、バックエンドなど)を調整します"
},
"generation": {
"required_inputs": "📝 必須入力",
5 changes: 4 additions & 1 deletion acestep/gradio_ui/i18n/zh.json
@@ -59,7 +59,10 @@
"init_btn": "初始化服务",
"status_label": "状态",
"language_label": "界面语言",
"language_info": "选择界面语言"
"language_info": "选择界面语言",
"gpu_auto_tier": "自动检测层级",
"tier_label": "GPU 层级覆盖",
"tier_info": "手动选择 GPU 层级以调整优化默认值(卸载、量化、后端等)"
},
"generation": {
"required_inputs": "📝 必需输入",
19 changes: 18 additions & 1 deletion acestep/gradio_ui/interfaces/generation.py
@@ -13,7 +13,7 @@
)
from acestep.gradio_ui.i18n import t
from acestep.gradio_ui.events.generation_handlers import get_ui_control_config
from acestep.gpu_config import get_global_gpu_config, GPUConfig, is_lm_model_size_allowed, find_best_lm_model_on_disk
from acestep.gpu_config import get_global_gpu_config, GPUConfig, is_lm_model_size_allowed, find_best_lm_model_on_disk, get_gpu_device_name, GPU_TIER_LABELS, GPU_TIER_CHOICES


def create_generation_section(dit_handler, llm_handler, init_params=None, language='en') -> dict:
@@ -101,6 +101,20 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
scale=1,
)

# GPU info display and tier override
_gpu_device_name = get_gpu_device_name()
_gpu_info_text = f"🖥️ **{_gpu_device_name}** — {gpu_config.gpu_memory_gb:.1f} GB VRAM — {t('service.gpu_auto_tier')}: **{GPU_TIER_LABELS.get(gpu_config.tier, gpu_config.tier)}**"
with gr.Row():
gpu_info_display = gr.Markdown(value=_gpu_info_text)
with gr.Row():
tier_dropdown = gr.Dropdown(
choices=[(label, key) for key, label in GPU_TIER_LABELS.items()],
value=gpu_config.tier,
label=t("service.tier_label"),
info=t("service.tier_info"),
scale=1,
)

# Dropdown options section - all dropdowns grouped together
with gr.Row(equal_height=True):
with gr.Column(scale=4):
@@ -874,4 +888,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
"gpu_config": gpu_config,
"max_duration": max_duration,
"max_batch_size": max_batch_size,
# GPU info and tier override
"gpu_info_display": gpu_info_display,
"tier_dropdown": tier_dropdown,
}
12 changes: 10 additions & 2 deletions acestep/handler.py
@@ -1451,8 +1451,16 @@ def _vram_guard_reduce_batch(

# Estimate per-sample activation cost for DiT
duration_sec = float(audio_duration) if audio_duration and float(audio_duration) > 0 else 60.0
# Empirical: ~0.8 GB per sample at 60s, linear scaling
per_sample_gb = 0.8 * (duration_sec / 60.0)
# Empirical observation: DiT activation memory per extra batch element is
# relatively modest because the latent is processed in a single forward pass
# and flash-attention keeps peak memory low. Measured values:
# - 60s turbo, noLM, batch 4 → ~13.3 GB total on 16GB GPU
# (model ~8.5 GB + 4 × ~0.8 GB activations ≈ 11.7 GB + overhead)
# - 208s turbo, batch 1 → peak 9.3 GB (model ~8.9 GB + ~0.4 GB activation)
# The old formula (0.8 * duration/60) heavily overestimates for long durations
# because activation memory scales sub-linearly with latent length (flash attn).
# Use a more conservative formula: base 0.5 GB + 0.15 GB per 60s beyond 60s.
per_sample_gb = 0.5 + max(0.0, 0.15 * (duration_sec - 60.0) / 60.0)
# If using cfg (base model), double the per-sample cost
if hasattr(self, 'model') and self.model is not None:
model_name = getattr(self, 'config_path', '') or ''
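As a quick sanity check (sketch only, not part of the PR), evaluating the revised heuristic at the operating points cited in the comment shows it stays conservative without the old formula's blow-up at long durations:

```python
def per_sample_gb(duration_sec: float) -> float:
    # Base 0.5 GB, plus 0.15 GB per additional 60 s beyond the first 60 s.
    return 0.5 + max(0.0, 0.15 * (duration_sec - 60.0) / 60.0)

print(per_sample_gb(60.0))    # 0.50  (old formula: 0.80)
print(per_sample_gb(208.0))   # ~0.87 (measured ~0.4 GB; deliberately conservative)
print(per_sample_gb(600.0))   # ~1.85 (old formula would give 8.0)
```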