README.md (11 additions & 8 deletions)
@@ -100,14 +100,17 @@ Open http://localhost:7860 (Gradio) or http://localhost:8001 (API).

### 💡 Which Model Should I Choose?

| Your GPU VRAM | Recommended LM Model | Notes |
|---------------|---------------------|-------|
| **≤6GB** | None (DiT only) | LM disabled by default to save memory |
| **6-12GB** | `acestep-5Hz-lm-0.6B` | Lightweight, good balance |
| **12-16GB** | `acestep-5Hz-lm-1.7B` | Better quality |
| **≥16GB** | `acestep-5Hz-lm-4B` | Best quality and audio understanding |

> 📖 GPU compatibility details: [English](./docs/en/GPU_COMPATIBILITY.md) | [中文](./docs/zh/GPU_COMPATIBILITY.md) | [日本語](./docs/ja/GPU_COMPATIBILITY.md)
| Your GPU VRAM | Recommended LM Model | Backend | Notes |
|---------------|---------------------|---------|-------|
| **≤6GB** | None (DiT only) | — | LM disabled by default; INT8 quantization + full CPU offload |
| **6-8GB** | `acestep-5Hz-lm-0.6B` | `pt` | Lightweight LM with PyTorch backend |
| **8-16GB** | `acestep-5Hz-lm-0.6B` / `1.7B` | `vllm` | 0.6B for 8-12GB, 1.7B for 12-16GB |
| **16-24GB** | `acestep-5Hz-lm-1.7B` | `vllm` | 4B available on 20GB+; no offload needed on 20GB+ |
| **≥24GB** | `acestep-5Hz-lm-4B` | `vllm` | Best quality, all models fit without offload |

The UI automatically selects the best configuration for your GPU. All settings (LM model, backend, offloading, quantization) are tier-aware and pre-configured.

> 📖 GPU compatibility details: [English](./docs/en/GPU_COMPATIBILITY.md) | [中文](./docs/zh/GPU_COMPATIBILITY.md) | [日本語](./docs/ja/GPU_COMPATIBILITY.md) | [한국어](./docs/ko/GPU_COMPATIBILITY.md)
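
As a rough illustration of the tier mapping in the table above, the sketch below picks an LM model and backend from detected VRAM. It is only a sketch: `recommend_lm_config` is a hypothetical helper, not part of this repo's API; the actual tier logic lives in `acestep/gpu_config.py` (see `get_gpu_config` in the diffs below).

```python
# Illustrative sketch of the VRAM -> model/backend mapping in the table above.
# recommend_lm_config is a hypothetical helper; the real logic lives in
# acestep/gpu_config.py (get_gpu_config).
import torch

def recommend_lm_config(vram_gb: float) -> dict:
    if vram_gb <= 6:
        return {"lm_model": None, "backend": None}  # DiT only, LM disabled
    if vram_gb <= 8:
        return {"lm_model": "acestep-5Hz-lm-0.6B", "backend": "pt"}
    if vram_gb <= 12:
        return {"lm_model": "acestep-5Hz-lm-0.6B", "backend": "vllm"}
    if vram_gb < 24:
        return {"lm_model": "acestep-5Hz-lm-1.7B", "backend": "vllm"}
    return {"lm_model": "acestep-5Hz-lm-4B", "backend": "vllm"}

# Read total VRAM of the first CUDA device, in GiB; 0.0 means CPU-only.
vram_gb = (
    torch.cuda.get_device_properties(0).total_memory / 1024**3
    if torch.cuda.is_available()
    else 0.0
)
print(recommend_lm_config(vram_gb))
```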

## 🚀 Launch Scripts

acestep/acestep_v15_pipeline.py (22 additions & 5 deletions)
@@ -36,7 +36,7 @@
from .llm_inference import LLMHandler
from .dataset_handler import DatasetHandler
from .gradio_ui import create_gradio_interface
from .gpu_config import get_gpu_config, get_gpu_memory_gb, print_gpu_config_info, set_global_gpu_config, VRAM_16GB_MIN_GB
from .gpu_config import get_gpu_config, get_gpu_memory_gb, print_gpu_config_info, set_global_gpu_config, VRAM_16GB_MIN_GB, VRAM_AUTO_OFFLOAD_THRESHOLD_GB
from .model_downloader import ensure_lm_model
except ImportError:
# When executed as a script: `python acestep/acestep_v15_pipeline.py`
@@ -47,7 +47,7 @@
from acestep.llm_inference import LLMHandler
from acestep.dataset_handler import DatasetHandler
from acestep.gradio_ui import create_gradio_interface
from acestep.gpu_config import get_gpu_config, get_gpu_memory_gb, print_gpu_config_info, set_global_gpu_config, VRAM_16GB_MIN_GB
from acestep.gpu_config import get_gpu_config, get_gpu_memory_gb, print_gpu_config_info, set_global_gpu_config, VRAM_16GB_MIN_GB, VRAM_AUTO_OFFLOAD_THRESHOLD_GB
from acestep.model_downloader import ensure_lm_model


@@ -93,7 +93,11 @@ def main():
set_global_gpu_config(gpu_config) # Set global config for use across modules

gpu_memory_gb = gpu_config.gpu_memory_gb
auto_offload = gpu_memory_gb > 0 and gpu_memory_gb < VRAM_16GB_MIN_GB
# Enable auto-offload for GPUs below 20 GB. 16 GB GPUs cannot hold all
# models simultaneously (DiT ~4.7 + VAE ~0.3 + text_enc ~1.2 + LM ≥1.2 +
# activations) so they *must* offload. The old threshold of 16 GB caused
# 16 GB GPUs to never offload, leading to OOM.
auto_offload = gpu_memory_gb > 0 and gpu_memory_gb < VRAM_AUTO_OFFLOAD_THRESHOLD_GB

# Print GPU configuration info
print(f"\n{'='*60}")
@@ -110,9 +110,9 @@
print(f"{'='*60}\n")

if auto_offload:
print(f"Auto-enabling CPU offload (GPU < 16GB)")
print(f"Auto-enabling CPU offload (GPU {gpu_memory_gb:.1f}GB < {VRAM_AUTO_OFFLOAD_THRESHOLD_GB}GB threshold)")
elif gpu_memory_gb > 0:
print(f"CPU offload disabled by default (GPU >= 16GB)")
print(f"CPU offload disabled by default (GPU {gpu_memory_gb:.1f}GB >= {VRAM_AUTO_OFFLOAD_THRESHOLD_GB}GB threshold)")
else:
print("No GPU detected, running on CPU")

@@ -205,6 +209,19 @@ def main():
args.offload_to_cpu = True
print(f"Auto-enabling CPU offload (4B LM model requires offloading on {gpu_memory_gb:.0f}GB GPU)")

# Safety: on 16GB GPUs, prevent selecting LM models that are too large.
# Even with offloading, a 4B LM (8 GB weights + KV cache) leaves almost no
# headroom for DiT activations on a 16 GB card.
if args.lm_model_path and 0 < gpu_memory_gb < VRAM_AUTO_OFFLOAD_THRESHOLD_GB:
if "4B" in args.lm_model_path:
# Downgrade to 1.7B if available
fallback = args.lm_model_path.replace("4B", "1.7B")
print(
f"WARNING: 4B LM model is too large for {gpu_memory_gb:.0f}GB GPU. "
f"Downgrading to 1.7B variant: {fallback}"
)
args.lm_model_path = fallback

try:
init_params = None
dit_handler = None
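
Taken together, the pipeline changes above amount to two startup decisions: enable CPU offload whenever the detected GPU falls below the new threshold, and refuse to keep a 4B LM on cards in that range. A condensed sketch, assuming `VRAM_AUTO_OFFLOAD_THRESHOLD_GB` resolves to 20 as the comment in the diff suggests; the function name is illustrative, not the repo's API:

```python
# Condensed restatement of the startup logic added above (sketch only; the
# 20 GB threshold value is taken from the comment in the diff).
VRAM_AUTO_OFFLOAD_THRESHOLD_GB = 20.0

def resolve_offload_and_lm(gpu_memory_gb: float, lm_model_path: str | None):
    """Return (offload_to_cpu, lm_model_path) for a detected GPU size."""
    auto_offload = 0 < gpu_memory_gb < VRAM_AUTO_OFFLOAD_THRESHOLD_GB
    if lm_model_path and auto_offload and "4B" in lm_model_path:
        # Even with offloading, an ~8 GB 4B LM leaves no headroom for DiT
        # activations on a 16 GB card, so fall back to the 1.7B variant.
        lm_model_path = lm_model_path.replace("4B", "1.7B")
    return auto_offload, lm_model_path

print(resolve_offload_and_lm(16.0, "acestep-5Hz-lm-4B"))
# -> (True, 'acestep-5Hz-lm-1.7B')
```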
acestep/api_server.py (2 additions & 1 deletion)
Expand Up @@ -68,6 +68,7 @@
is_lm_model_supported,
GPUConfig,
VRAM_16GB_MIN_GB,
VRAM_AUTO_OFFLOAD_THRESHOLD_GB,
)


@@ -1899,7 +1900,7 @@ async def _job_store_cleanup_worker() -> None:
app.state.gpu_config = gpu_config

gpu_memory_gb = gpu_config.gpu_memory_gb
auto_offload = gpu_memory_gb > 0 and gpu_memory_gb < VRAM_16GB_MIN_GB
auto_offload = gpu_memory_gb > 0 and gpu_memory_gb < VRAM_AUTO_OFFLOAD_THRESHOLD_GB

# Print GPU configuration info
print(f"\n{'='*60}")
acestep/dit_alignment_score.py (8 additions & 8 deletions)
@@ -834,16 +834,16 @@ def calculate_score(
Returns:
AlignmentScore object containing individual metrics and final score.
"""
# Ensure Inputs are Tensors on the correct device
# Ensure Inputs are Tensors.
# Always compute on CPU — the scoring matrices are small and this
# avoids occupying GPU VRAM that DiT / VAE / LM need. Keeping
# everything on CPU also prevents timeout issues on low-VRAM GPUs
# where the accelerator memory is fully committed to model weights.
_score_device = "cpu"
if not isinstance(energy_matrix, torch.Tensor):
# Use available accelerator device; fallback to CPU if none
if torch.cuda.is_available():
_score_device = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
_score_device = "mps"
else:
_score_device = "cpu"
energy_matrix = torch.tensor(energy_matrix, device=_score_device, dtype=torch.float32)
else:
energy_matrix = energy_matrix.to(device=_score_device, dtype=torch.float32)

device = energy_matrix.device
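
The scoring change above boils down to a single coercion step: whatever arrives (a NumPy array, a CUDA tensor, an MPS tensor), the score is computed from a CPU float32 copy. The matrices are small, so the round-trip is cheap, and it keeps VRAM free for the DiT, VAE, and LM weights, which is the rationale the diff's comment gives. A minimal sketch of that pattern, with an illustrative function name:

```python
# Minimal sketch of the CPU-only coercion the diff adopts for alignment
# scoring; to_cpu_score_tensor is an illustrative name, not the repo's API.
import numpy as np
import torch

def to_cpu_score_tensor(energy_matrix) -> torch.Tensor:
    if not isinstance(energy_matrix, torch.Tensor):
        return torch.tensor(energy_matrix, device="cpu", dtype=torch.float32)
    return energy_matrix.to(device="cpu", dtype=torch.float32)

print(to_cpu_score_tensor(np.ones((4, 4))).device)   # cpu
print(to_cpu_score_tensor(torch.ones(4, 4)).device)  # cpu
```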
