diff --git a/README.md b/README.md index aae7b0dc..4ee8f640 100644 --- a/README.md +++ b/README.md @@ -100,14 +100,17 @@ Open http://localhost:7860 (Gradio) or http://localhost:8001 (API). ### šŸ’” Which Model Should I Choose? -| Your GPU VRAM | Recommended LM Model | Notes | -|---------------|---------------------|-------| -| **≤6GB** | None (DiT only) | LM disabled by default to save memory | -| **6-12GB** | `acestep-5Hz-lm-0.6B` | Lightweight, good balance | -| **12-16GB** | `acestep-5Hz-lm-1.7B` | Better quality | -| **≄16GB** | `acestep-5Hz-lm-4B` | Best quality and audio understanding | - -> šŸ“– GPU compatibility details: [English](./docs/en/GPU_COMPATIBILITY.md) | [äø­ę–‡](./docs/zh/GPU_COMPATIBILITY.md) | [ę—„ęœ¬čŖž](./docs/ja/GPU_COMPATIBILITY.md) +| Your GPU VRAM | Recommended LM Model | Backend | Notes | +|---------------|---------------------|---------|-------| +| **≤6GB** | None (DiT only) | — | LM disabled by default; INT8 quantization + full CPU offload | +| **6-8GB** | `acestep-5Hz-lm-0.6B` | `pt` | Lightweight LM with PyTorch backend | +| **8-16GB** | `acestep-5Hz-lm-0.6B` / `1.7B` | `vllm` | 0.6B for 8-12GB, 1.7B for 12-16GB | +| **16-24GB** | `acestep-5Hz-lm-1.7B` | `vllm` | 4B available on 20GB+; no offload needed on 20GB+ | +| **≄24GB** | `acestep-5Hz-lm-4B` | `vllm` | Best quality, all models fit without offload | + +The UI automatically selects the best configuration for your GPU. All settings (LM model, backend, offloading, quantization) are tier-aware and pre-configured. + +> šŸ“– GPU compatibility details: [English](./docs/en/GPU_COMPATIBILITY.md) | [äø­ę–‡](./docs/zh/GPU_COMPATIBILITY.md) | [ę—„ęœ¬čŖž](./docs/ja/GPU_COMPATIBILITY.md) | [ķ•œźµ­ģ–“](./docs/ko/GPU_COMPATIBILITY.md) ## šŸš€ Launch Scripts diff --git a/acestep/acestep_v15_pipeline.py b/acestep/acestep_v15_pipeline.py index 08347dac..252f805c 100644 --- a/acestep/acestep_v15_pipeline.py +++ b/acestep/acestep_v15_pipeline.py @@ -36,7 +36,7 @@ from .llm_inference import LLMHandler from .dataset_handler import DatasetHandler from .gradio_ui import create_gradio_interface - from .gpu_config import get_gpu_config, get_gpu_memory_gb, print_gpu_config_info, set_global_gpu_config, VRAM_16GB_MIN_GB + from .gpu_config import get_gpu_config, get_gpu_memory_gb, print_gpu_config_info, set_global_gpu_config, VRAM_16GB_MIN_GB, VRAM_AUTO_OFFLOAD_THRESHOLD_GB from .model_downloader import ensure_lm_model except ImportError: # When executed as a script: `python acestep/acestep_v15_pipeline.py` @@ -47,7 +47,7 @@ from acestep.llm_inference import LLMHandler from acestep.dataset_handler import DatasetHandler from acestep.gradio_ui import create_gradio_interface - from acestep.gpu_config import get_gpu_config, get_gpu_memory_gb, print_gpu_config_info, set_global_gpu_config, VRAM_16GB_MIN_GB + from acestep.gpu_config import get_gpu_config, get_gpu_memory_gb, print_gpu_config_info, set_global_gpu_config, VRAM_16GB_MIN_GB, VRAM_AUTO_OFFLOAD_THRESHOLD_GB from acestep.model_downloader import ensure_lm_model @@ -93,7 +93,11 @@ def main(): set_global_gpu_config(gpu_config) # Set global config for use across modules gpu_memory_gb = gpu_config.gpu_memory_gb - auto_offload = gpu_memory_gb > 0 and gpu_memory_gb < VRAM_16GB_MIN_GB + # Enable auto-offload for GPUs below 20 GB. 16 GB GPUs cannot hold all + # models simultaneously (DiT ~4.7 + VAE ~0.3 + text_enc ~1.2 + LM ≄1.2 + + # activations) so they *must* offload. The old threshold of 16 GB caused + # 16 GB GPUs to never offload, leading to OOM. 
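To make the comment above concrete, here is an illustrative back-of-envelope check (a sketch only, not part of the patch) using the `MODEL_VRAM` / `LM_VRAM` tables this diff adds to `acestep/gpu_config.py`, assuming the turbo DiT and the 1.7B LM with a 4k-token KV cache:

```python
# Rough resident-memory budget for a 16 GB card (figures are the PR's own bf16 estimates).
from acestep.gpu_config import MODEL_VRAM, LM_VRAM, VRAM_SAFETY_MARGIN_GB

resident_gb = (
    MODEL_VRAM["dit_turbo"]            # ~4.7  GB DiT weights
    + MODEL_VRAM["vae"]                # ~0.33 GB VAE
    + MODEL_VRAM["text_encoder"]       # ~1.2  GB text encoder
    + MODEL_VRAM["cuda_context"]       # ~0.5  GB CUDA context / driver overhead
    + LM_VRAM["1.7B"]["weights"]       # ~3.4  GB LM weights
    + LM_VRAM["1.7B"]["kv_cache_4k"]   # ~1.0  GB KV cache at 4k tokens
    + VRAM_SAFETY_MARGIN_GB            #  0.5  GB safety margin
)
print(f"{resident_gb:.1f} GB resident before activations")  # prints roughly 11.6 GB
```

Add batch activations and allocator fragmentation on top and a 16 GB card has very little slack, which is why the flag computed below now uses the 20 GB threshold.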
+ auto_offload = gpu_memory_gb > 0 and gpu_memory_gb < VRAM_AUTO_OFFLOAD_THRESHOLD_GB # Print GPU configuration info print(f"\n{'='*60}") @@ -110,9 +114,9 @@ def main(): print(f"{'='*60}\n") if auto_offload: - print(f"Auto-enabling CPU offload (GPU < 16GB)") + print(f"Auto-enabling CPU offload (GPU {gpu_memory_gb:.1f}GB < {VRAM_AUTO_OFFLOAD_THRESHOLD_GB}GB threshold)") elif gpu_memory_gb > 0: - print(f"CPU offload disabled by default (GPU >= 16GB)") + print(f"CPU offload disabled by default (GPU {gpu_memory_gb:.1f}GB >= {VRAM_AUTO_OFFLOAD_THRESHOLD_GB}GB threshold)") else: print("No GPU detected, running on CPU") @@ -205,6 +209,19 @@ def main(): args.offload_to_cpu = True print(f"Auto-enabling CPU offload (4B LM model requires offloading on {gpu_memory_gb:.0f}GB GPU)") + # Safety: on 16GB GPUs, prevent selecting LM models that are too large. + # Even with offloading, a 4B LM (8 GB weights + KV cache) leaves almost no + # headroom for DiT activations on a 16 GB card. + if args.lm_model_path and 0 < gpu_memory_gb < VRAM_AUTO_OFFLOAD_THRESHOLD_GB: + if "4B" in args.lm_model_path: + # Downgrade to 1.7B if available + fallback = args.lm_model_path.replace("4B", "1.7B") + print( + f"WARNING: 4B LM model is too large for {gpu_memory_gb:.0f}GB GPU. " + f"Downgrading to 1.7B variant: {fallback}" + ) + args.lm_model_path = fallback + try: init_params = None dit_handler = None diff --git a/acestep/api_server.py b/acestep/api_server.py index a553aaf8..91224994 100644 --- a/acestep/api_server.py +++ b/acestep/api_server.py @@ -68,6 +68,7 @@ is_lm_model_supported, GPUConfig, VRAM_16GB_MIN_GB, + VRAM_AUTO_OFFLOAD_THRESHOLD_GB, ) @@ -1899,7 +1900,7 @@ async def _job_store_cleanup_worker() -> None: app.state.gpu_config = gpu_config gpu_memory_gb = gpu_config.gpu_memory_gb - auto_offload = gpu_memory_gb > 0 and gpu_memory_gb < VRAM_16GB_MIN_GB + auto_offload = gpu_memory_gb > 0 and gpu_memory_gb < VRAM_AUTO_OFFLOAD_THRESHOLD_GB # Print GPU configuration info print(f"\n{'='*60}") diff --git a/acestep/dit_alignment_score.py b/acestep/dit_alignment_score.py index 4e99982d..f89def61 100644 --- a/acestep/dit_alignment_score.py +++ b/acestep/dit_alignment_score.py @@ -834,16 +834,16 @@ def calculate_score( Returns: AlignmentScore object containing individual metrics and final score. """ - # Ensure Inputs are Tensors on the correct device + # Ensure Inputs are Tensors. + # Always compute on CPU — the scoring matrices are small and this + # avoids occupying GPU VRAM that DiT / VAE / LM need. Keeping + # everything on CPU also prevents timeout issues on low-VRAM GPUs + # where the accelerator memory is fully committed to model weights. + _score_device = "cpu" if not isinstance(energy_matrix, torch.Tensor): - # Use available accelerator device; fallback to CPU if none - if torch.cuda.is_available(): - _score_device = "cuda" - elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): - _score_device = "mps" - else: - _score_device = "cpu" energy_matrix = torch.tensor(energy_matrix, device=_score_device, dtype=torch.float32) + else: + energy_matrix = energy_matrix.to(device=_score_device, dtype=torch.float32) device = energy_matrix.device diff --git a/acestep/gpu_config.py b/acestep/gpu_config.py index 1733beb5..e02fde18 100644 --- a/acestep/gpu_config.py +++ b/acestep/gpu_config.py @@ -28,11 +28,53 @@ VRAM_16GB_TOLERANCE_GB = 0.5 VRAM_16GB_MIN_GB = 16.0 - VRAM_16GB_TOLERANCE_GB # treat as 16GB class if >= this +# Threshold below which auto_offload is enabled. 
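# Editor's sketch (not part of the patch): the same constant also drives the
# tier6a/tier6b split defined further down in this file, e.g.
#     get_gpu_config(16.0).tier  -> "tier6a"  (offload and INT8 defaults stay on)
#     get_gpu_config(22.0).tier  -> "tier6b"  (offload and quantization default off)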
+# 16GB GPUs cannot hold DiT + VAE + text_encoder + LM simultaneously without offloading. +VRAM_AUTO_OFFLOAD_THRESHOLD_GB = 20.0 + # PyTorch installation URLs for diagnostics PYTORCH_CUDA_INSTALL_URL = "https://download.pytorch.org/whl/cu121" PYTORCH_ROCM_INSTALL_URL = "https://download.pytorch.org/whl/rocm6.0" +# =========================================================================== +# Empirical VRAM measurements (GB) -- model weights only, bf16 precision +# These values should be calibrated using scripts/profile_vram.py +# =========================================================================== + +# Base model weights (loaded once at startup) +MODEL_VRAM = { + "dit_turbo": 4.7, # DiT turbo model weights (bf16) + "dit_base": 4.7, # DiT base model weights (bf16) + "vae": 0.33, # VAE (AutoencoderOobleck) weights (fp16) + "text_encoder": 1.2, # Qwen3-Embedding-0.6B text encoder (bf16) + "silence_latent": 0.01, # Silence latent tensor + "cuda_context": 0.5, # CUDA context + driver overhead +} + +# LM model weights (bf16) + KV cache estimates +LM_VRAM = { + "0.6B": {"weights": 1.2, "kv_cache_2k": 0.3, "kv_cache_4k": 0.6}, + "1.7B": {"weights": 3.4, "kv_cache_2k": 0.5, "kv_cache_4k": 1.0}, + "4B": {"weights": 8.0, "kv_cache_2k": 0.8, "kv_cache_4k": 1.6}, +} + +# DiT inference peak VRAM per batch item (approximate, depends on duration) +# These are additional activations/intermediates on top of model weights. +# +# Profiling on A800 (flash attention) shows only ~0.001-0.004 GB per batch item. +# Consumer GPUs without flash attention will be higher due to materialised +# attention matrices. We use conservative estimates that cover the worst case +# (no flash attention, long sequences). +DIT_INFERENCE_VRAM_PER_BATCH = { + "turbo": 0.3, # GB per batch item (no CFG) + "base": 0.6, # GB per batch item (with CFG, 2x forward) +} + +# Safety margin to keep free for OS/driver/fragmentation (GB) +VRAM_SAFETY_MARGIN_GB = 0.5 + + @dataclass class GPUConfig: """GPU configuration based on available memory""" @@ -50,65 +92,157 @@ class GPUConfig: # LM configuration init_lm_default: bool # Whether to initialize LM by default available_lm_models: List[str] # Available LM models for this tier + recommended_lm_model: str # Recommended default LM model path (empty if LM not available) + + # LM backend restriction + # "all" = any backend, "pt_mlx_only" = only pt/mlx (no vllm), used for very low VRAM + lm_backend_restriction: str # "all" or "pt_mlx_only" + recommended_backend: str # Recommended default backend: "vllm", "pt", or "mlx" + + # Offload defaults + offload_to_cpu_default: bool # Whether offload_to_cpu should be enabled by default + offload_dit_to_cpu_default: bool # Whether offload_dit_to_cpu should be enabled by default + + # Quantization / compile defaults + quantization_default: bool # Whether INT8 quantization should be enabled by default + compile_model_default: bool # Whether torch.compile should be enabled by default # LM memory allocation (GB) for each model size lm_memory_gb: Dict[str, float] # e.g., {"0.6B": 3, "1.7B": 8, "4B": 12} # GPU tier configurations +# tier6 has been split into tier6a (16-20GB) and tier6b (20-24GB) to fix the +# 16GB regression. 16GB GPUs cannot hold all models simultaneously with the +# same batch sizes as 24GB GPUs. GPU_TIER_CONFIGS = { "tier1": { # <= 4GB - "max_duration_with_lm": 180, # 3 minutes - "max_duration_without_lm": 180, # 3 minutes + # Offload mode required. DiT(4.46) barely fits with CUDA context(0.5). + # VAE decode falls back to CPU. 
Keep durations moderate. + "max_duration_with_lm": 240, # 4 minutes + "max_duration_without_lm": 360, # 6 minutes "max_batch_size_with_lm": 1, "max_batch_size_without_lm": 1, "init_lm_default": False, "available_lm_models": [], + "recommended_lm_model": "", + "lm_backend_restriction": "pt_mlx_only", # vllm KV cache won't fit + "recommended_backend": "pt", + "offload_to_cpu_default": True, + "offload_dit_to_cpu_default": True, + "quantization_default": True, # INT8 essential to fit DiT in ~4GB + "compile_model_default": True, "lm_memory_gb": {}, }, "tier2": { # 4-6GB - "max_duration_with_lm": 360, # 6 minutes - "max_duration_without_lm": 360, # 6 minutes + # Offload mode. DiT(4.46) + context(0.5) + activations ā‰ˆ 5.0GB. + # ~1GB headroom. Tiled VAE decode fits with chunk=256 (~0.8GB peak). + # Duration barely affects peak VRAM (latent tensor is <2MB even at 10min). + "max_duration_with_lm": 480, # 8 minutes + "max_duration_without_lm": 600, # 10 minutes (max supported) "max_batch_size_with_lm": 1, "max_batch_size_without_lm": 1, "init_lm_default": False, "available_lm_models": [], + "recommended_lm_model": "", + "lm_backend_restriction": "pt_mlx_only", + "recommended_backend": "pt", + "offload_to_cpu_default": True, + "offload_dit_to_cpu_default": True, + "quantization_default": True, + "compile_model_default": True, "lm_memory_gb": {}, }, "tier3": { # 6-8GB - "max_duration_with_lm": 240, # 4 minutes with LM - "max_duration_without_lm": 360, # 6 minutes without LM - "max_batch_size_with_lm": 1, + # Offload mode. DiT(4.46) + context(0.5) ā‰ˆ 5.0GB. + # ~1.5-3GB headroom allows LM 0.6B (1.2+0.6=1.8GB) and batch=2. + # vllm KV cache is tight; pt backend is safer for 0.6B on this tier. + "max_duration_with_lm": 480, # 8 minutes + "max_duration_without_lm": 600, # 10 minutes (max supported) + "max_batch_size_with_lm": 2, "max_batch_size_without_lm": 2, - "init_lm_default": False, # Don't init by default due to limited memory + "init_lm_default": True, "available_lm_models": ["acestep-5Hz-lm-0.6B"], + "recommended_lm_model": "acestep-5Hz-lm-0.6B", + "lm_backend_restriction": "pt_mlx_only", # vllm KV cache too greedy for <8GB + "recommended_backend": "pt", + "offload_to_cpu_default": True, + "offload_dit_to_cpu_default": True, + "quantization_default": True, + "compile_model_default": True, "lm_memory_gb": {"0.6B": 3}, }, "tier4": { # 8-12GB - "max_duration_with_lm": 240, # 4 minutes with LM - "max_duration_without_lm": 360, # 6 minutes without LM + # Can keep DiT + 0.6B LM simultaneously on GPU (4.46+1.2+0.6=6.26GB). + # Offload VAE/TextEnc. Plenty of room for inference activations. + "max_duration_with_lm": 480, # 8 minutes + "max_duration_without_lm": 600, # 10 minutes (max supported) "max_batch_size_with_lm": 2, "max_batch_size_without_lm": 4, - "init_lm_default": False, # Don't init by default + "init_lm_default": True, "available_lm_models": ["acestep-5Hz-lm-0.6B"], + "recommended_lm_model": "acestep-5Hz-lm-0.6B", + "lm_backend_restriction": "all", # vllm fits with 0.6B + "recommended_backend": "vllm", + "offload_to_cpu_default": True, + "offload_dit_to_cpu_default": True, + "quantization_default": True, + "compile_model_default": True, "lm_memory_gb": {"0.6B": 3}, }, "tier5": { # 12-16GB - "max_duration_with_lm": 240, # 4 minutes with LM - "max_duration_without_lm": 360, # 6 minutes without LM - "max_batch_size_with_lm": 2, + # DiT + 1.7B LM (4.46+3.45+0.44=8.35GB) fits comfortably. + # VAE decode is batch-sequential so batch size doesn't affect VAE VRAM. 
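        # Worked headroom check (editor's sketch, using DIT_INFERENCE_VRAM_PER_BATCH above):
        # on a 12 GB card, 12 - 8.35 leaves roughly 3.6 GB; a turbo batch of 4 adds about
        # 4 x 0.3 = 1.2 GB of activations, so batch_size 4 with the 1.7B LM stays in budget.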
+ "max_duration_with_lm": 480, # 8 minutes + "max_duration_without_lm": 600, # 10 minutes (max supported) + "max_batch_size_with_lm": 4, "max_batch_size_without_lm": 4, "init_lm_default": True, "available_lm_models": ["acestep-5Hz-lm-0.6B", "acestep-5Hz-lm-1.7B"], + "recommended_lm_model": "acestep-5Hz-lm-1.7B", + "lm_backend_restriction": "all", + "recommended_backend": "vllm", + "offload_to_cpu_default": True, + "offload_dit_to_cpu_default": False, # 12-16GB can keep DiT on GPU + "quantization_default": True, + "compile_model_default": True, "lm_memory_gb": {"0.6B": 3, "1.7B": 8}, }, - "tier6": { # 16-24GB + "tier6a": { # 16-20GB (e.g., RTX 4060 Ti 16GB, RTX 3080 16GB) + # On 16GB GPUs: DiT(INT8, ~2.4GB) + LM 1.7B(~7.6GB peak with offload) = ~10GB peak + # Empirical batch tests (60s, turbo): noLM-4→13.3GB, LM-2→11.9GB, LM-4→~13.5GB + # With CPU offload, LM is offloaded after inference → DiT batch has full 16GB budget. "max_duration_with_lm": 480, # 8 minutes - "max_duration_without_lm": 480, # 8 minutes + "max_duration_without_lm": 600, # 10 minutes (max supported) "max_batch_size_with_lm": 4, "max_batch_size_without_lm": 8, "init_lm_default": True, + "available_lm_models": ["acestep-5Hz-lm-0.6B", "acestep-5Hz-lm-1.7B"], + "recommended_lm_model": "acestep-5Hz-lm-1.7B", + "lm_backend_restriction": "all", + "recommended_backend": "vllm", + "offload_to_cpu_default": True, # Still offload VAE/TextEnc to save VRAM for LM + "offload_dit_to_cpu_default": False, + "quantization_default": True, + "compile_model_default": True, + "lm_memory_gb": {"0.6B": 3, "1.7B": 8}, + }, + "tier6b": { # 20-24GB (e.g., RTX 3090, RTX 4090) + # 20-24GB: no offload, no quantization. DiT(bf16, ~4.7GB) + LM 1.7B(~3.4GB) = ~8.1GB + # Remaining ~12-16GB easily fits batch=8. VAE decode is batch-sequential. + "max_duration_with_lm": 480, # 8 minutes + "max_duration_without_lm": 480, # 8 minutes + "max_batch_size_with_lm": 8, + "max_batch_size_without_lm": 8, + "init_lm_default": True, "available_lm_models": ["acestep-5Hz-lm-0.6B", "acestep-5Hz-lm-1.7B", "acestep-5Hz-lm-4B"], + "recommended_lm_model": "acestep-5Hz-lm-1.7B", + "lm_backend_restriction": "all", + "recommended_backend": "vllm", + "offload_to_cpu_default": False, # 20-24GB can hold all models + "offload_dit_to_cpu_default": False, + "quantization_default": False, # Enough VRAM, quantization optional + "compile_model_default": True, "lm_memory_gb": {"0.6B": 3, "1.7B": 8, "4B": 12}, }, "unlimited": { # >= 24GB @@ -118,10 +252,20 @@ class GPUConfig: "max_batch_size_without_lm": 8, "init_lm_default": True, "available_lm_models": ["acestep-5Hz-lm-0.6B", "acestep-5Hz-lm-1.7B", "acestep-5Hz-lm-4B"], + "recommended_lm_model": "acestep-5Hz-lm-4B", + "lm_backend_restriction": "all", + "recommended_backend": "vllm", + "offload_to_cpu_default": False, + "offload_dit_to_cpu_default": False, + "quantization_default": False, # Plenty of VRAM + "compile_model_default": True, "lm_memory_gb": {"0.6B": 3, "1.7B": 8, "4B": 12}, }, } +# Backward compatibility alias: code that references "tier6" gets tier6b behavior +GPU_TIER_CONFIGS["tier6"] = GPU_TIER_CONFIGS["tier6b"] + def get_gpu_memory_gb() -> float: """ @@ -142,6 +286,38 @@ def get_gpu_memory_gb() -> float: try: simulated_gb = float(debug_vram) logger.warning(f"āš ļø DEBUG MODE: Simulating GPU memory as {simulated_gb:.1f}GB (set via {DEBUG_MAX_CUDA_VRAM_ENV} environment variable)") + # Also enforce a hard VRAM cap via PyTorch so that the allocator + # cannot use more than the simulated amount. 
This makes the + # simulation realistic — without it, models still load into the + # real (larger) GPU memory and nvitop shows much higher usage. + try: + import torch + if torch.cuda.is_available(): + total_bytes = torch.cuda.get_device_properties(0).total_memory + total_gb = total_bytes / (1024 ** 3) + if simulated_gb < total_gb: + # When simulating a smaller GPU on a larger one, the host + # GPU's CUDA context is typically much bigger (e.g. A100 + # ~1.4GB vs GTX 1060 ~0.3GB). Using the host context + # would over-penalise the allocator budget. + # + # Instead we use a *reference* context size that matches + # what the target-class GPU would actually have. Consumer + # GPUs (≤24GB) typically have 0.3-0.5GB context overhead. + REFERENCE_CONTEXT_GB = MODEL_VRAM.get("cuda_context", 0.5) + allocator_budget_gb = max(0.5, simulated_gb - REFERENCE_CONTEXT_GB) + fraction = allocator_budget_gb / total_gb + # Clamp to [0.01, 1.0] to satisfy PyTorch constraints + fraction = max(0.01, min(1.0, fraction)) + torch.cuda.set_per_process_memory_fraction(fraction) + logger.warning( + f"āš ļø DEBUG MODE: Set CUDA memory fraction to {fraction:.4f} " + f"(allocator_budget={allocator_budget_gb:.2f}GB, " + f"ref_context={REFERENCE_CONTEXT_GB:.2f}GB, target={simulated_gb:.1f}GB, " + f"total={total_gb:.1f}GB) to enforce hard VRAM cap" + ) + except Exception as e: + logger.warning(f"āš ļø DEBUG MODE: Could not enforce CUDA memory cap: {e}") return simulated_gb except ValueError: logger.warning(f"Invalid {DEBUG_MAX_CUDA_VRAM_ENV} value: {debug_vram}, ignoring") @@ -305,7 +481,7 @@ def get_gpu_tier(gpu_memory_gb: float) -> str: gpu_memory_gb: GPU memory in GB Returns: - Tier string: "tier1", "tier2", "tier3", "tier4", "tier5", "tier6", or "unlimited" + Tier string: "tier1", "tier2", "tier3", "tier4", "tier5", "tier6a", "tier6b", or "unlimited" """ if gpu_memory_gb <= 0: # CPU mode - use tier1 limits @@ -320,10 +496,13 @@ def get_gpu_tier(gpu_memory_gb: float) -> str: return "tier4" elif gpu_memory_gb < VRAM_16GB_MIN_GB: return "tier5" - elif gpu_memory_gb <= 24: + elif gpu_memory_gb < VRAM_AUTO_OFFLOAD_THRESHOLD_GB: + # 16-20GB range: tier6a (constrained, needs offload) if gpu_memory_gb < 16.0: logger.info(f"Detected {gpu_memory_gb:.2f}GB VRAM — treating as 16GB class GPU") - return "tier6" + return "tier6a" + elif gpu_memory_gb <= 24: + return "tier6b" else: return "unlimited" @@ -353,6 +532,13 @@ def get_gpu_config(gpu_memory_gb: Optional[float] = None) -> GPUConfig: max_batch_size_without_lm=config["max_batch_size_without_lm"], init_lm_default=config["init_lm_default"], available_lm_models=config["available_lm_models"], + recommended_lm_model=config.get("recommended_lm_model", ""), + lm_backend_restriction=config.get("lm_backend_restriction", "all"), + recommended_backend=config.get("recommended_backend", "vllm"), + offload_to_cpu_default=config.get("offload_to_cpu_default", True), + offload_dit_to_cpu_default=config.get("offload_dit_to_cpu_default", True), + quantization_default=config.get("quantization_default", True), + compile_model_default=config.get("compile_model_default", True), lm_memory_gb=config["lm_memory_gb"], ) @@ -362,7 +548,7 @@ def get_lm_model_size(model_path: str) -> str: Extract LM model size from model path. 
Args: - model_path: Model path string (e.g., "acestep-5Hz-lm-0.6B") + model_path: Model path string (e.g., "acestep-5Hz-lm-0.6B", "acestep-5Hz-lm-0.6B-v4-fix") Returns: Model size string: "0.6B", "1.7B", or "4B" @@ -378,46 +564,360 @@ def get_lm_model_size(model_path: str) -> str: return "0.6B" +def is_lm_model_size_allowed(disk_model_name: str, tier_available_models: List[str]) -> bool: + """ + Check if a disk LM model is allowed by the tier's available models list. + + Uses size-based matching so that variants like "acestep-5Hz-lm-0.6B-v4-fix" + are correctly matched against "acestep-5Hz-lm-0.6B" in the tier config. + + Args: + disk_model_name: Actual model directory name on disk (e.g., "acestep-5Hz-lm-0.6B-v4-fix") + tier_available_models: List of tier-allowed model base names (e.g., ["acestep-5Hz-lm-0.6B"]) + + Returns: + True if the model's size class is allowed by the tier + """ + if not tier_available_models: + return False + model_size = get_lm_model_size(disk_model_name) + for tier_model in tier_available_models: + if model_size == get_lm_model_size(tier_model): + return True + return False + + +def find_best_lm_model_on_disk(recommended_model: str, disk_models: List[str]) -> Optional[str]: + """ + Find the best matching disk model for a recommended tier model. + + If the exact recommended model exists on disk, return it. + Otherwise, find a disk model with the same size class (e.g., "0.6B"). + Prefers models with version suffixes (e.g., "-v4-fix") as they are likely newer. + + Args: + recommended_model: Tier-recommended model name (e.g., "acestep-5Hz-lm-0.6B") + disk_models: List of model names actually on disk + + Returns: + Best matching disk model name, or None if no match + """ + if not recommended_model or not disk_models: + return disk_models[0] if disk_models else None + + # Exact match first + if recommended_model in disk_models: + return recommended_model + + # Size-based match: find all disk models with same size + target_size = get_lm_model_size(recommended_model) + candidates = [m for m in disk_models if get_lm_model_size(m) == target_size] + + if candidates: + # Prefer the one with the longest name (likely has version suffix = newer) + return max(candidates, key=len) + + # No match for recommended size; return first available disk model + return disk_models[0] if disk_models else None + + def get_lm_gpu_memory_ratio(model_path: str, total_gpu_memory_gb: float) -> Tuple[float, float]: """ Calculate GPU memory utilization ratio for LM model. + This function now uses *actually free* VRAM (via torch.cuda.mem_get_info) + when available, instead of computing the ratio purely from total VRAM. + This is critical because DiT, VAE, and text encoder are already loaded + when the LM initializes, so the "available" memory is much less than total. 
+ Args: model_path: LM model path (e.g., "acestep-5Hz-lm-0.6B") - total_gpu_memory_gb: Total GPU memory in GB + total_gpu_memory_gb: Total GPU memory in GB (used as fallback) Returns: Tuple of (gpu_memory_utilization_ratio, target_memory_gb) """ model_size = get_lm_model_size(model_path) - # Model weight memory (approximate) for each model size - model_weight_memory = { - "0.6B": 3.0, - "1.7B": 8.0, - "4B": 12.0, - } + # Use empirical LM VRAM measurements for target memory + lm_info = LM_VRAM.get(model_size, LM_VRAM["0.6B"]) + lm_weights_gb = lm_info["weights"] + lm_kv_cache_gb = lm_info["kv_cache_4k"] - target_gb = model_weight_memory.get(model_size, 3.0) + # Total target = model weights + KV cache + small overhead + target_gb = lm_weights_gb + total_target_gb = lm_weights_gb + lm_kv_cache_gb + 0.3 # 0.3 GB overhead - # gpu_memory_utilization in nano-vllm caps the TOTAL GPU memory usage - # (model weights + KV cache + overhead). If we set it to just the model - # weight size, there is almost no room left for KV cache and inference - # fails with "Insufficient KV cache" errors. - # We therefore add generous headroom so the KV cache can hold at least - # max_model_len (4096) tokens comfortably. - total_target_gb = target_gb * 1.5 # 50% headroom for KV cache + overhead + # Try to use actual free memory for a more accurate ratio + free_gb = None + try: + import torch + if torch.cuda.is_available(): + free_bytes, total_bytes = torch.cuda.mem_get_info() + free_gb = free_bytes / (1024**3) + actual_total_gb = total_bytes / (1024**3) + + # If MAX_CUDA_VRAM is set, use the simulated values instead + # because set_per_process_memory_fraction limits actual allocation + debug_vram = os.environ.get(DEBUG_MAX_CUDA_VRAM_ENV) + if debug_vram is not None: + try: + simulated_gb = float(debug_vram) + if simulated_gb < actual_total_gb: + # Use reference context (matching set_per_process_memory_fraction) + ref_context_gb = MODEL_VRAM.get("cuda_context", 0.5) + allocator_budget_gb = max(0.5, simulated_gb - ref_context_gb) + reserved_gb = torch.cuda.memory_reserved() / (1024**3) + free_gb = max(0, allocator_budget_gb - reserved_gb) + actual_total_gb = simulated_gb + except (ValueError, TypeError): + pass + + # The ratio is relative to total GPU memory (nano-vllm convention), + # but we compute it so that the LM only claims what's actually free + # minus a safety margin for DiT inference activations. 
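            # Worked example (editor's sketch, using the 1.5 GB DiT reserve and
            # 0.5 GB safety margin applied just below): 16 GB card, 10.5 GB free,
            # 1.7B LM ->
            #   lm_target = 3.4 + 1.0 + 0.3              = 4.7 GB
            #   usable    = min(10.5 - 1.5 - 0.5, 4.7)   = 4.7 GB
            #   ratio     = ((16 - 10.5) + 4.7) / 16     ā‰ˆ 0.64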
+ # Reserve at least 1.5 GB for DiT inference activations + dit_reserve_gb = 1.5 + usable_for_lm = max(0, free_gb - dit_reserve_gb - VRAM_SAFETY_MARGIN_GB) + + # Cap to what the LM actually needs + usable_for_lm = min(usable_for_lm, total_target_gb) + + # Convert to ratio of total GPU memory + # nano-vllm uses: target_total_usage = total * gpu_memory_utilization + # We want: (total * ratio) = current_usage + usable_for_lm + current_usage_gb = actual_total_gb - free_gb + desired_total_usage = current_usage_gb + usable_for_lm + ratio = desired_total_usage / actual_total_gb + + ratio = min(0.9, max(0.1, ratio)) + + logger.info( + f"[get_lm_gpu_memory_ratio] model={model_size}, free={free_gb:.2f}GB, " + f"current_usage={current_usage_gb:.2f}GB, lm_target={total_target_gb:.2f}GB, " + f"usable_for_lm={usable_for_lm:.2f}GB, ratio={ratio:.3f}" + ) + return ratio, target_gb + except Exception as e: + logger.warning(f"[get_lm_gpu_memory_ratio] Failed to query free VRAM: {e}, using fallback") - # For large GPUs (>=24GB), don't restrict memory too much + # Fallback: compute ratio from total VRAM (less accurate) if total_gpu_memory_gb >= 24: ratio = min(0.9, max(0.2, total_target_gb / total_gpu_memory_gb)) else: - # For smaller GPUs, strictly limit memory usage ratio = min(0.9, max(0.1, total_target_gb / total_gpu_memory_gb)) return ratio, target_gb +def compute_adaptive_config(total_vram_gb: float, dit_type: str = "turbo") -> GPUConfig: + """ + Compute GPU configuration based on what actually fits in VRAM. + + This is a VRAM-budget-based approach: instead of hard-coded tier boundaries, + we calculate how much memory each component needs and determine what fits. + + Args: + total_vram_gb: Total GPU VRAM in GB + dit_type: "turbo" or "base" (affects inference VRAM due to CFG) + + Returns: + GPUConfig with parameters that fit within the VRAM budget + """ + # Calculate base VRAM usage (always loaded) + dit_key = f"dit_{dit_type}" if f"dit_{dit_type}" in MODEL_VRAM else "dit_turbo" + base_usage = ( + MODEL_VRAM[dit_key] + + MODEL_VRAM["vae"] + + MODEL_VRAM["text_encoder"] + + MODEL_VRAM["cuda_context"] + + MODEL_VRAM["silence_latent"] + + VRAM_SAFETY_MARGIN_GB + ) + + available = total_vram_gb - base_usage + + if available <= 0: + # Not enough for even base models - CPU offload required + return get_gpu_config(total_vram_gb) + + # Determine which LM models fit + available_lm_models = [] + lm_memory_gb = {} + + for size_key in ["0.6B", "1.7B", "4B"]: + lm_info = LM_VRAM[size_key] + lm_total = lm_info["weights"] + lm_info["kv_cache_4k"] + # LM needs to fit with some room left for inference activations + inference_per_batch = DIT_INFERENCE_VRAM_PER_BATCH.get(dit_type, 0.8) + if lm_total + inference_per_batch <= available: + model_name = f"acestep-5Hz-lm-{size_key}" + available_lm_models.append(model_name) + lm_memory_gb[size_key] = lm_info["weights"] + lm_info["kv_cache_4k"] + + # Determine max batch sizes + inference_per_batch = DIT_INFERENCE_VRAM_PER_BATCH.get(dit_type, 0.8) + + # Without LM: all available VRAM goes to inference + max_batch_no_lm = max(1, int(available / inference_per_batch)) + max_batch_no_lm = min(max_batch_no_lm, 8) # Cap at 8 + + # With LM: subtract the largest available LM from available + if available_lm_models: + largest_lm_size = list(lm_memory_gb.keys())[-1] + lm_usage = lm_memory_gb[largest_lm_size] + remaining_for_inference = available - lm_usage + max_batch_with_lm = max(1, int(remaining_for_inference / inference_per_batch)) + max_batch_with_lm = min(max_batch_with_lm, 8) + else: 
+ max_batch_with_lm = max_batch_no_lm + + # Determine duration limits based on available VRAM + # Longer durations need more VRAM for latents + if total_vram_gb >= 24: + max_dur_lm = 600 + max_dur_no_lm = 600 + elif total_vram_gb >= 20: + max_dur_lm = 480 + max_dur_no_lm = 480 + elif total_vram_gb >= 16: + max_dur_lm = 360 + max_dur_no_lm = 480 + elif total_vram_gb >= 12: + max_dur_lm = 240 + max_dur_no_lm = 360 + elif total_vram_gb >= 8: + max_dur_lm = 240 + max_dur_no_lm = 360 + else: + max_dur_lm = 180 + max_dur_no_lm = 180 + + tier = get_gpu_tier(total_vram_gb) + tier_config = GPU_TIER_CONFIGS.get(tier, {}) + + return GPUConfig( + tier=tier, + gpu_memory_gb=total_vram_gb, + max_duration_with_lm=max_dur_lm, + max_duration_without_lm=max_dur_no_lm, + max_batch_size_with_lm=max_batch_with_lm, + max_batch_size_without_lm=max_batch_no_lm, + init_lm_default=bool(available_lm_models), + available_lm_models=available_lm_models, + recommended_lm_model=tier_config.get("recommended_lm_model", available_lm_models[0] if available_lm_models else ""), + lm_backend_restriction=tier_config.get("lm_backend_restriction", "all"), + recommended_backend=tier_config.get("recommended_backend", "vllm"), + offload_to_cpu_default=tier_config.get("offload_to_cpu_default", True), + offload_dit_to_cpu_default=tier_config.get("offload_dit_to_cpu_default", True), + quantization_default=tier_config.get("quantization_default", True), + compile_model_default=tier_config.get("compile_model_default", True), + lm_memory_gb=lm_memory_gb, + ) + + +def get_effective_free_vram_gb(device_index: int = 0) -> float: + """ + Get the effective free VRAM in GB, accounting for per-process memory fraction. + + torch.cuda.mem_get_info() reports *device-level* free memory, which ignores + the per-process cap set by torch.cuda.set_per_process_memory_fraction(). + + This function computes: + effective_free = min(device_free, process_allowed - process_allocated) + + where process_allowed = total_memory * memory_fraction. + + Returns 0 if no GPU is available or on error. + """ + try: + import torch + if not torch.cuda.is_available(): + return 0.0 + + device_free_bytes, total_bytes = torch.cuda.mem_get_info(device_index) + + # Check if a per-process memory fraction has been set + # We detect this by checking MAX_CUDA_VRAM env var (our simulation mechanism) + debug_vram = os.environ.get(DEBUG_MAX_CUDA_VRAM_ENV) + if debug_vram is not None: + try: + simulated_gb = float(debug_vram) + total_gb = total_bytes / (1024 ** 3) + if simulated_gb < total_gb: + # Per-process cap is active. + # Use the same reference context as set_per_process_memory_fraction. + ref_context_gb = MODEL_VRAM.get("cuda_context", 0.5) + allocator_budget_gb = max(0.5, simulated_gb - ref_context_gb) + allocator_budget_bytes = allocator_budget_gb * (1024 ** 3) + reserved_bytes = torch.cuda.memory_reserved(device_index) + # Free = what the allocator is allowed minus what it has reserved + process_free = allocator_budget_bytes - reserved_bytes + effective_free = min(device_free_bytes, process_free) + return max(0.0, effective_free / (1024 ** 3)) + except (ValueError, TypeError): + pass + + return device_free_bytes / (1024 ** 3) + except Exception: + return 0.0 + + +def get_available_vram_gb() -> float: + """ + Get currently available (free) GPU VRAM in GB. + Returns 0 if no GPU is available or on error. + + This is an alias for get_effective_free_vram_gb() that accounts for + per-process memory fraction caps. 
+ """ + return get_effective_free_vram_gb() + + +def estimate_inference_vram( + batch_size: int, + duration_s: float, + dit_type: str = "turbo", + with_lm: bool = False, + lm_size: str = "0.6B", +) -> float: + """ + Estimate total VRAM needed for a generation request. + + Args: + batch_size: Number of samples to generate + duration_s: Audio duration in seconds + dit_type: "turbo" or "base" + with_lm: Whether LM is loaded + lm_size: LM model size if with_lm is True + + Returns: + Estimated VRAM in GB + """ + # Base model weights + dit_key = f"dit_{dit_type}" if f"dit_{dit_type}" in MODEL_VRAM else "dit_turbo" + base = ( + MODEL_VRAM[dit_key] + + MODEL_VRAM["vae"] + + MODEL_VRAM["text_encoder"] + + MODEL_VRAM["cuda_context"] + ) + + # DiT inference activations (scales with batch size and duration) + per_batch = DIT_INFERENCE_VRAM_PER_BATCH.get(dit_type, 0.8) + # Duration scaling: longer audio = more latent frames = more memory + duration_factor = max(1.0, duration_s / 60.0) # Normalize to 60s baseline + inference = per_batch * batch_size * duration_factor + + # LM memory + lm_mem = 0.0 + if with_lm and lm_size in LM_VRAM: + lm_info = LM_VRAM[lm_size] + lm_mem = lm_info["weights"] + lm_info["kv_cache_4k"] + + return base + inference + lm_mem + VRAM_SAFETY_MARGIN_GB + + def check_duration_limit( duration: float, gpu_config: GPUConfig, diff --git a/acestep/gradio_ui/events/__init__.py b/acestep/gradio_ui/events/__init__.py index dd1a4122..c76c6c07 100644 --- a/acestep/gradio_ui/events/__init__.py +++ b/acestep/gradio_ui/events/__init__.py @@ -70,6 +70,9 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase generation_section["cfg_interval_start"], generation_section["cfg_interval_end"], generation_section["task_type"], + # GPU-config-aware limits (updated after initialization) + generation_section["audio_duration"], + generation_section["batch_size_input"], ] ) diff --git a/acestep/gradio_ui/events/generation_handlers.py b/acestep/gradio_ui/events/generation_handlers.py index e8fc6eeb..8a0dd0ca 100644 --- a/acestep/gradio_ui/events/generation_handlers.py +++ b/acestep/gradio_ui/events/generation_handlers.py @@ -8,13 +8,14 @@ import glob import gradio as gr from typing import Optional, List, Tuple +from loguru import logger from acestep.constants import ( TASK_TYPES_TURBO, TASK_TYPES_BASE, ) from acestep.gradio_ui.i18n import t from acestep.inference import understand_music, create_sample, format_sample -from acestep.gpu_config import get_global_gpu_config +from acestep.gpu_config import get_global_gpu_config, is_lm_model_size_allowed, find_best_lm_model_on_disk def clamp_duration_to_gpu_limit(duration_value: Optional[float], llm_handler=None) -> Optional[float]: @@ -441,10 +442,38 @@ def update_model_type_settings(config_path): def init_service_wrapper(dit_handler, llm_handler, checkpoint, config_path, device, init_llm, lm_model_path, backend, use_flash_attention, offload_to_cpu, offload_dit_to_cpu, compile_model, quantization): - """Wrapper for service initialization, returns status, button state, accordion state, and model type settings""" + """Wrapper for service initialization, returns status, button state, accordion state, model type settings, and GPU-config-aware UI limits.""" # Convert quantization checkbox to value (int8_weight_only if checked, None if not) quant_value = "int8_weight_only" if quantization else None + # --- Tier-aware validation before initialization --- + gpu_config = get_global_gpu_config() + + # Validate LM request against GPU tier + if 
init_llm and not gpu_config.available_lm_models: + init_llm = False # Force disable LM on tiers that can't support it + logger.warning(f"āš ļø LM initialization disabled: GPU tier {gpu_config.tier} ({gpu_config.gpu_memory_gb:.1f}GB) does not support LM") + + # Validate LM model against tier's available models (size-based matching) + if init_llm and lm_model_path and gpu_config.available_lm_models: + if not is_lm_model_size_allowed(lm_model_path, gpu_config.available_lm_models): + # The selected model's size class is not supported by this tier. + # Find a disk model that matches the recommended size. + all_disk_models = llm_handler.get_available_5hz_lm_models() if llm_handler else [] + fallback = find_best_lm_model_on_disk(gpu_config.recommended_lm_model, all_disk_models) + if fallback: + old_model = lm_model_path + lm_model_path = fallback + logger.warning(f"āš ļø LM model {old_model} size not supported for tier {gpu_config.tier}, falling back to {lm_model_path}") + else: + init_llm = False + logger.warning(f"āš ļø No compatible LM model found on disk for tier {gpu_config.tier}, disabling LM") + + # Validate backend against tier restriction + if init_llm and gpu_config.lm_backend_restriction == "pt_mlx_only" and backend == "vllm": + backend = gpu_config.recommended_backend # Fallback to pt + logger.warning(f"āš ļø vllm backend not supported for tier {gpu_config.tier} (VRAM too low for KV cache), falling back to {backend}") + # Initialize DiT handler status, enable = dit_handler.initialize_service( checkpoint, config_path, device, @@ -485,11 +514,37 @@ def init_service_wrapper(dit_handler, llm_handler, checkpoint, config_path, devi is_turbo = dit_handler.is_turbo_model() model_type_settings = get_model_type_ui_settings(is_turbo) + # --- Update UI limits based on GPU config and actual LM state --- + gpu_config = get_global_gpu_config() + lm_actually_initialized = llm_handler.llm_initialized if llm_handler else False + max_duration = gpu_config.max_duration_with_lm if lm_actually_initialized else gpu_config.max_duration_without_lm + max_batch = gpu_config.max_batch_size_with_lm if lm_actually_initialized else gpu_config.max_batch_size_without_lm + + duration_update = gr.update( + maximum=float(max_duration), + info=f"Duration in seconds (-1 for auto). 
Max: {max_duration}s / {max_duration // 60} min" + ) + batch_update = gr.update( + value=min(2, max_batch), # Clamp value to new maximum to avoid Gradio validation error + maximum=max_batch, + info=f"Number of samples to generate (Max: {max_batch})" + ) + + # Add GPU config info to status + status += f"\nšŸ“Š GPU Config: tier={gpu_config.tier}, max_duration={max_duration}s, max_batch={max_batch}" + if gpu_config.available_lm_models: + status += f", available_lm={gpu_config.available_lm_models}" + else: + status += ", LM not available for this GPU tier" + return ( status, gr.update(interactive=enable), accordion_state, - *model_type_settings + *model_type_settings, + # GPU-config-aware UI updates + duration_update, + batch_update, ) diff --git a/acestep/gradio_ui/interfaces/generation.py b/acestep/gradio_ui/interfaces/generation.py index a30bc877..564b07a0 100644 --- a/acestep/gradio_ui/interfaces/generation.py +++ b/acestep/gradio_ui/interfaces/generation.py @@ -12,7 +12,7 @@ DEFAULT_DIT_INSTRUCTION, ) from acestep.gradio_ui.i18n import t -from acestep.gpu_config import get_global_gpu_config, GPUConfig +from acestep.gpu_config import get_global_gpu_config, GPUConfig, is_lm_model_size_allowed, find_best_lm_model_on_disk def create_generation_section(dit_handler, llm_handler, init_params=None, language='en') -> dict: @@ -48,17 +48,37 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua default_batch_size = min(2, max_batch_size) # Default to 2 or max if lower init_lm_default = gpu_config.init_lm_default - # Determine default offload setting - # If XPU is detected, default offload to False (keep models on device) - # Otherwise default to True (offload to CPU to save VRAM) - default_offload = True + # Determine default offload setting from GPU config tier + # XPU override: if XPU is detected, keep models on device + default_offload = gpu_config.offload_to_cpu_default + default_offload_dit = gpu_config.offload_dit_to_cpu_default try: import torch if hasattr(torch, 'xpu') and torch.xpu.is_available(): default_offload = False + default_offload_dit = False except ImportError: pass + # Tier-aware LM defaults + default_quantization = gpu_config.quantization_default + default_compile = gpu_config.compile_model_default + # macOS override: disable quantization on macOS due to torchao incompatibilities + if sys.platform == "darwin": + default_quantization = False + + # Backend choices based on tier restriction + if gpu_config.lm_backend_restriction == "pt_mlx_only": + available_backends = ["pt", "mlx"] + else: + available_backends = ["vllm", "pt", "mlx"] + recommended_backend = gpu_config.recommended_backend + if recommended_backend not in available_backends: + recommended_backend = available_backends[0] + + # Recommended LM model: use tier config, fallback to first available + recommended_lm = gpu_config.recommended_lm_model + with gr.Group(): # Service Configuration - collapse if pre-initialized, hide if in service mode accordion_open = not service_pre_initialized @@ -115,9 +135,20 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua ) with gr.Row(): - # Get available 5Hz LM model list - available_lm_models = llm_handler.get_available_5hz_lm_models() - default_lm_model = "acestep-5Hz-lm-0.6B" if "acestep-5Hz-lm-0.6B" in available_lm_models else (available_lm_models[0] if available_lm_models else None) + # Get available 5Hz LM model list from disk, then filter by GPU tier + all_lm_models = llm_handler.get_available_5hz_lm_models() + # Filter 
to only show models whose size class is supported by this tier + # e.g., tier3 allows "0.6B" → keep "acestep-5Hz-lm-0.6B-v4-fix" on disk + tier_lm_models = gpu_config.available_lm_models + if tier_lm_models: + filtered_lm_models = [m for m in all_lm_models if is_lm_model_size_allowed(m, tier_lm_models)] + # If no tier models found on disk, show all disk models (user may have custom checkpoints) + available_lm_models = filtered_lm_models if filtered_lm_models else all_lm_models + else: + available_lm_models = all_lm_models + + # Use recommended model from tier config, find best match on disk + default_lm_model = find_best_lm_model_on_disk(recommended_lm, available_lm_models) # Set lm_model_path value from init_params if pre-initialized lm_model_path_value = init_params.get('lm_model_path', default_lm_model) if service_pre_initialized else default_lm_model @@ -125,25 +156,29 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua label=t("service.lm_model_path_label"), choices=available_lm_models, value=lm_model_path_value, - info=t("service.lm_model_path_info") + info=t("service.lm_model_path_info") + (f" (Recommended: {recommended_lm})" if recommended_lm else " (LM not available for this GPU tier)") ) - # Set backend value from init_params if pre-initialized - backend_value = init_params.get('backend', 'vllm') if service_pre_initialized else 'vllm' + # Set backend value from init_params if pre-initialized, using tier-recommended backend + backend_value = init_params.get('backend', recommended_backend) if service_pre_initialized else recommended_backend backend_dropdown = gr.Dropdown( - choices=["vllm", "pt", "mlx"], + choices=available_backends, value=backend_value, label=t("service.backend_label"), - info=t("service.backend_info") + info=t("service.backend_info") + (f" (vllm unavailable for {gpu_config.tier}: VRAM too low)" if gpu_config.lm_backend_restriction == "pt_mlx_only" else "") ) # Checkbox options section - all checkboxes grouped together + # Defaults are tier-aware (set above from gpu_config) with gr.Row(): - # Set init_llm value from init_params if pre-initialized, otherwise use GPU config default + # LM checkbox: for tiers with no LM support, default off and show info init_llm_value = init_params.get('init_llm', init_lm_default) if service_pre_initialized else init_lm_default + lm_info_text = t("service.init_llm_info") + if not gpu_config.available_lm_models: + lm_info_text += " āš ļø LM not available for this GPU tier (VRAM too low)" init_llm_checkbox = gr.Checkbox( label=t("service.init_llm_label"), value=init_llm_value, - info=t("service.init_llm_info"), + info=lm_info_text, ) # Auto-detect flash attention availability flash_attn_available = dit_handler.is_flash_attention_available(device_value) @@ -155,35 +190,33 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua interactive=flash_attn_available, info=t("service.flash_attention_info_enabled") if flash_attn_available else t("service.flash_attention_info_disabled") ) - # Set offload_to_cpu value from init_params if pre-initialized (default True) + # Offload to CPU: tier-aware default offload_to_cpu_value = init_params.get('offload_to_cpu', default_offload) if service_pre_initialized else default_offload offload_to_cpu_checkbox = gr.Checkbox( label=t("service.offload_cpu_label"), value=offload_to_cpu_value, - info=t("service.offload_cpu_info") + info=t("service.offload_cpu_info") + (" (recommended for this tier)" if default_offload else " (optional for this tier)") 
) - # Set offload_dit_to_cpu value from init_params if pre-initialized (default True) - offload_dit_to_cpu_value = init_params.get('offload_dit_to_cpu', default_offload) if service_pre_initialized else default_offload + # Offload DiT to CPU: tier-aware default + offload_dit_to_cpu_value = init_params.get('offload_dit_to_cpu', default_offload_dit) if service_pre_initialized else default_offload_dit offload_dit_to_cpu_checkbox = gr.Checkbox( label=t("service.offload_dit_cpu_label"), value=offload_dit_to_cpu_value, - info=t("service.offload_dit_cpu_info") + info=t("service.offload_dit_cpu_info") + (" (recommended for this tier)" if default_offload_dit else " (optional for this tier)") ) - # Set compile_model value from init_params if pre-initialized (default True) - compile_model_value = init_params.get('compile_model', True) if service_pre_initialized else True + # Compile model: tier-aware default + compile_model_value = init_params.get('compile_model', default_compile) if service_pre_initialized else default_compile compile_model_checkbox = gr.Checkbox( label=t("service.compile_model_label"), value=compile_model_value, info=t("service.compile_model_info") ) - # Set quantization value from init_params if pre-initialized. - # Default to False on macOS to avoid torchao incompatibilities. - default_quantization = False if sys.platform == "darwin" else True + # Quantization: tier-aware default (macOS override already applied above) quantization_value = init_params.get('quantization', default_quantization) if service_pre_initialized else default_quantization quantization_checkbox = gr.Checkbox( label=t("service.quantization_label"), value=quantization_value, - info=t("service.quantization_info") + info=t("service.quantization_info") + (" (recommended for this tier)" if default_quantization else " (optional for this tier)") ) init_btn = gr.Button(t("service.init_btn"), variant="primary", size="lg") diff --git a/acestep/handler.py b/acestep/handler.py index b6fac652..1e278f40 100644 --- a/acestep/handler.py +++ b/acestep/handler.py @@ -46,7 +46,7 @@ ) from acestep.core.generation.handler import LoraManagerMixin, ProgressMixin from acestep.dit_alignment_score import MusicStampsAligner, MusicLyricScorer -from acestep.gpu_config import get_gpu_memory_gb, get_global_gpu_config +from acestep.gpu_config import get_gpu_memory_gb, get_global_gpu_config, get_effective_free_vram_gb warnings.filterwarnings("ignore") @@ -1169,37 +1169,182 @@ def _get_effective_mps_memory_gb(self) -> Optional[float]: # Align with gpu_config: MPS can use ~75% of unified memory for GPU workloads. return system_gb * 0.75 + # Maximum VAE decode chunk size. Larger chunks are faster but the + # PyTorch caching allocator may *reserve* significantly more VRAM than + # the peak *allocated* amount. Empirical measurements (bf16 VAE, + # ~10 GB baseline from DiT + LM): + # chunk peak_alloc peak_reserved + # 512 11.9 GB 12.7 GB + # 1024 13.1 GB 15.0 GB ← dangerously close to 16 GB + # 1536 14.4 GB 17.2 GB ← exceeds 16 GB + # Capping at 512 keeps reserved VRAM safely under 16 GB on consumer + # GPUs while the speed difference vs 1024/1536 is negligible for + # tiled decode (a few hundred ms). + VAE_DECODE_MAX_CHUNK_SIZE = 512 + def _get_auto_decode_chunk_size(self) -> int: - """Choose a conservative VAE decode chunk size based on memory.""" + """Choose a conservative VAE decode chunk size based on available memory. + + For CUDA GPUs, uses actual free VRAM to determine chunk size. + For MPS, uses effective memory estimate. 
+ Larger chunks are faster but use more VRAM; smaller chunks are safer. + The result is capped at ``VAE_DECODE_MAX_CHUNK_SIZE`` to prevent the + PyTorch caching allocator from over-reserving VRAM on consumer GPUs. + """ override = os.environ.get("ACESTEP_VAE_DECODE_CHUNK_SIZE") if override: try: value = int(override) if value > 0: - return value + return value # explicit override bypasses the cap except ValueError: pass + + max_chunk = self.VAE_DECODE_MAX_CHUNK_SIZE + if self.device == "mps": mem_gb = self._get_effective_mps_memory_gb() if mem_gb is not None: if mem_gb >= 48: - return 1536 + return min(1536, max_chunk) if mem_gb >= 24: - return 1024 - return 512 + return min(1024, max_chunk) + return min(512, max_chunk) + + # CUDA: use effective free VRAM (respects per-process memory fraction) to pick chunk size + if self.device == "cuda" or (isinstance(self.device, str) and self.device.startswith("cuda")): + try: + free_gb = get_effective_free_vram_gb() + except Exception: + free_gb = 0 + logger.debug(f"[_get_auto_decode_chunk_size] Effective free VRAM: {free_gb:.2f} GB") + # VAE decode peak VRAM (allocated) scales roughly with chunk_size. + # Empirical: chunk_size=512 needs ~1.3 GB, 1024 needs ~2.6 GB, 1536 needs ~3.9 GB + # chunk_size=128 needs ~0.3 GB, chunk_size=64 needs ~0.3 GB + if free_gb >= 8.0: + return min(512, max_chunk) + elif free_gb >= 5.0: + return min(512, max_chunk) + elif free_gb >= 2.5: + return min(512, max_chunk) + elif free_gb >= 1.0: + return 256 + elif free_gb >= 0.5: + return 128 # Very tight VRAM + else: + return 64 # Extremely tight VRAM — minimal chunk + + return min(512, max_chunk) def _should_offload_wav_to_cpu(self) -> bool: - """Decide whether to offload decoded wavs to CPU for memory safety.""" + """Decide whether to offload decoded wavs to CPU for memory safety. + + For CUDA GPUs with >=24 GB free, keep on GPU for speed. + For MPS with >=32 GB, keep on GPU. + Otherwise offload to CPU to avoid OOM during concatenation. + """ override = os.environ.get("ACESTEP_MPS_DECODE_OFFLOAD") if override: return override.lower() in ("1", "true", "yes") - if self.device != "mps": + if self.device == "mps": + mem_gb = self._get_effective_mps_memory_gb() + if mem_gb is not None and mem_gb >= 32: + return False return True - mem_gb = self._get_effective_mps_memory_gb() - if mem_gb is not None and mem_gb >= 32: - return False + # CUDA: offload unless plenty of free VRAM + if self.device == "cuda" or (isinstance(self.device, str) and self.device.startswith("cuda")): + try: + free_gb = get_effective_free_vram_gb() + logger.debug(f"[_should_offload_wav_to_cpu] Effective free VRAM: {free_gb:.2f} GB") + if free_gb >= 24.0: + return False + except Exception: + pass return True + def _vram_guard_reduce_batch( + self, + batch_size: int, + audio_duration: Optional[float] = None, + use_lm: bool = False, + ) -> int: + """Pre-inference VRAM guard: auto-reduce batch_size if free VRAM is tight. + + Rough activation estimate per batch element: + - DiT forward pass: ~0.8 GB per sample at 60s, scales linearly with duration + - LM inference: KV cache is pre-allocated so batch doesn't change it much + - VAE decode: handled separately via tiled_decode + + We leave a 1.5 GB safety margin for CUDA allocator fragmentation. + + IMPORTANT: When offload_to_cpu is True, the LM model (especially vllm + backend) may still be on GPU when this guard runs, but it will be + offloaded or its memory reclaimed before DiT actually needs the VRAM. 
+ In that case we trust the static GPU tier config limits (which have been + empirically validated) and skip the dynamic VRAM check. + """ + if batch_size <= 1: + return batch_size + + device = self.device + if device == "cpu" or device == "mps": + return batch_size # No CUDA VRAM to guard + + # When CPU offload is enabled, the current free VRAM is misleading because + # the LM (vllm KV cache + weights) may still be on GPU at this point but + # will be released/reclaimed before DiT actually uses the VRAM. The static + # GPU tier configs already encode safe batch limits that were empirically + # validated with offload enabled, so trust them. + # + # Use the more conservative max_batch_size_with_lm as the threshold since + # the handler doesn't know if LM was used upstream. This is safe because + # max_batch_size_with_lm <= max_batch_size_without_lm for all tiers. + if self.offload_to_cpu: + gpu_config = get_global_gpu_config() + if gpu_config is not None: + tier_max = gpu_config.max_batch_size_with_lm + if batch_size <= tier_max: + logger.debug( + f"[VRAM guard] offload_to_cpu=True, batch_size={batch_size} <= " + f"tier limit {tier_max} — skipping dynamic VRAM check " + f"(LM will be offloaded before DiT runs)" + ) + return batch_size + # batch_size exceeds tier limit — fall through to dynamic check + + try: + free_gb = get_effective_free_vram_gb() + except Exception: + return batch_size + + # Estimate per-sample activation cost for DiT + duration_sec = float(audio_duration) if audio_duration and float(audio_duration) > 0 else 60.0 + # Empirical: ~0.8 GB per sample at 60s, linear scaling + per_sample_gb = 0.8 * (duration_sec / 60.0) + # If using cfg (base model), double the per-sample cost + if hasattr(self, 'model') and self.model is not None: + model_name = getattr(self, 'config_path', '') or '' + if 'base' in model_name.lower(): + per_sample_gb *= 2.0 + + safety_margin_gb = 1.5 + available_for_batch = free_gb - safety_margin_gb + + if available_for_batch <= 0: + logger.warning( + f"[VRAM guard] Only {free_gb:.1f} GB free — reducing batch_size to 1" + ) + return 1 + + max_safe_batch = max(1, int(available_for_batch / per_sample_gb)) + if max_safe_batch < batch_size: + logger.warning( + f"[VRAM guard] Free VRAM {free_gb:.1f} GB can safely fit ~{max_safe_batch} samples " + f"(requested {batch_size}). Reducing batch_size to {max_safe_batch}." + ) + return max_safe_batch + + return batch_size def _get_vae_dtype(self, device: Optional[str] = None) -> torch.dtype: """Get VAE dtype based on target device and GPU tier.""" target_device = device or self.device @@ -2569,6 +2714,8 @@ def tiled_decode(self, latents, chunk_size: Optional[int] = None, overlap: int = if offload_wav_to_cpu is None: offload_wav_to_cpu = self._should_offload_wav_to_cpu() + logger.info(f"[tiled_decode] chunk_size={chunk_size}, offload_wav_to_cpu={offload_wav_to_cpu}, latents_shape={latents.shape}") + # MPS Conv1d has a hard output-size limit that the OobleckDecoder # exceeds during temporal upsampling with large chunks. Reduce # chunk_size to keep each VAE decode within the MPS kernel limits @@ -2621,13 +2768,44 @@ def _tiled_decode_inner(self, latents, chunk_size, overlap, offload_wav_to_cpu): """Core tiled decode logic (extracted for fallback wrapping).""" B, C, T = latents.shape + # ---- Batch-sequential decode ---- + # VAE decode VRAM scales linearly with batch size. On tight-VRAM GPUs + # (e.g. 8 GB) decoding the whole batch at once can OOM. 
Process one + # sample at a time so peak VRAM stays constant regardless of batch size. + if B > 1: + logger.info(f"[tiled_decode] Batch size {B} > 1 — decoding samples sequentially to save VRAM") + per_sample_results = [] + for b_idx in range(B): + single = latents[b_idx : b_idx + 1] # [1, C, T] + decoded = self._tiled_decode_inner(single, chunk_size, overlap, offload_wav_to_cpu) + # Move to CPU immediately to free GPU VRAM for next sample + per_sample_results.append(decoded.cpu() if decoded.device.type != "cpu" else decoded) + self._empty_cache() + # Concatenate on CPU then move back if needed + result = torch.cat(per_sample_results, dim=0) # [B, channels, samples] + if latents.device.type != "cpu" and not offload_wav_to_cpu: + result = result.to(latents.device) + return result + + # Adjust overlap for very small chunk sizes to ensure positive stride + effective_overlap = overlap + while chunk_size - 2 * effective_overlap <= 0 and effective_overlap > 0: + effective_overlap = effective_overlap // 2 + if effective_overlap != overlap: + logger.warning(f"[tiled_decode] Reduced overlap from {overlap} to {effective_overlap} for chunk_size={chunk_size}") + overlap = effective_overlap + # If short enough, decode directly if T <= chunk_size: - # Decode and immediately extract .sample to avoid keeping DecoderOutput object - decoder_output = self.vae.decode(latents) - result = decoder_output.sample - del decoder_output - return result + try: + decoder_output = self.vae.decode(latents) + result = decoder_output.sample + del decoder_output + return result + except torch.cuda.OutOfMemoryError: + logger.warning("[tiled_decode] OOM on direct decode, falling back to CPU VAE decode") + self._empty_cache() + return self._decode_on_cpu(latents) # Calculate stride (core size) stride = chunk_size - 2 * overlap @@ -2638,10 +2816,25 @@ def _tiled_decode_inner(self, latents, chunk_size, overlap, offload_wav_to_cpu): if offload_wav_to_cpu: # Optimized path: offload wav to CPU immediately to save VRAM - return self._tiled_decode_offload_cpu(latents, B, T, stride, overlap, num_steps) + try: + return self._tiled_decode_offload_cpu(latents, B, T, stride, overlap, num_steps) + except torch.cuda.OutOfMemoryError: + logger.warning(f"[tiled_decode] OOM during offload_cpu decode with chunk_size={chunk_size}, falling back to CPU VAE decode") + self._empty_cache() + return self._decode_on_cpu(latents) else: # Default path: keep everything on GPU - return self._tiled_decode_gpu(latents, B, T, stride, overlap, num_steps) + try: + return self._tiled_decode_gpu(latents, B, T, stride, overlap, num_steps) + except torch.cuda.OutOfMemoryError: + logger.warning(f"[tiled_decode] OOM during GPU decode with chunk_size={chunk_size}, falling back to CPU offload path") + self._empty_cache() + try: + return self._tiled_decode_offload_cpu(latents, B, T, stride, overlap, num_steps) + except torch.cuda.OutOfMemoryError: + logger.warning("[tiled_decode] OOM even with offload path, falling back to full CPU VAE decode") + self._empty_cache() + return self._decode_on_cpu(latents) def _tiled_decode_gpu(self, latents, B, T, stride, overlap, num_steps): """Standard tiled decode keeping all data on GPU.""" @@ -2769,6 +2962,44 @@ def _tiled_decode_offload_cpu(self, latents, B, T, stride, overlap, num_steps): return final_audio + def _decode_on_cpu(self, latents): + """ + Emergency fallback: move VAE to CPU, decode there, then restore. + + This is used when GPU VRAM is too tight for even the smallest tiled decode. + Slower but guarantees no OOM on GPU. 
+ """ + logger.warning("[_decode_on_cpu] Moving VAE to CPU for decode (VRAM too tight for GPU decode)") + + # Remember original device + try: + original_device = next(self.vae.parameters()).device + except StopIteration: + original_device = torch.device("cpu") + + # Move VAE to CPU + vae_cpu_dtype = self._get_vae_dtype("cpu") + self._recursive_to_device(self.vae, "cpu", vae_cpu_dtype) + self._empty_cache() + + # Move latents to CPU + latents_cpu = latents.cpu().to(vae_cpu_dtype) + + # Decode on CPU (no tiling needed — CPU has plenty of RAM) + try: + with torch.inference_mode(): + decoder_output = self.vae.decode(latents_cpu) + result = decoder_output.sample + del decoder_output + finally: + # Restore VAE to original device + if original_device.type != "cpu": + vae_gpu_dtype = self._get_vae_dtype(str(original_device)) + self._recursive_to_device(self.vae, original_device, vae_gpu_dtype) + + logger.info(f"[_decode_on_cpu] CPU decode complete, result shape={result.shape}") + return result # result stays on CPU — fine for audio post-processing + def tiled_encode(self, audio, chunk_size=None, overlap=None, offload_latent_to_cpu=True): """ Encode audio to latents using tiling to reduce VRAM usage. @@ -3036,6 +3267,15 @@ def _has_audio_codes(v: Union[str, List[str]]) -> bool: actual_batch_size = batch_size if batch_size is not None else self.batch_size actual_batch_size = max(1, actual_batch_size) # Ensure at least 1 + # ---- Pre-inference VRAM guard ---- + # Estimate whether the requested batch_size fits in free VRAM and + # auto-reduce if it does not. This prevents OOM crashes at the cost + # of generating fewer samples. + actual_batch_size = self._vram_guard_reduce_batch( + actual_batch_size, + audio_duration=audio_duration, + ) + actual_seed_list, seed_value_for_ui = self.prepare_seeds(actual_batch_size, seed, use_random_seed) # Convert special values to None @@ -3209,10 +3449,16 @@ def _has_audio_codes(v: Union[str, List[str]]) -> bool: logger.debug(f"[generate_music] Before VAE decode: allocated={self._memory_allocated()/1024**3:.2f}GB, max={self._max_memory_allocated()/1024**3:.2f}GB") - # ROCm fix: decode VAE on CPU to bypass MIOpen workspace bugs - # On APUs with unified memory this has zero data-transfer cost + # Check effective free VRAM and auto-enable CPU decode if extremely tight import os as _os _vae_cpu = _os.environ.get("ACESTEP_VAE_ON_CPU", "0").lower() in ("1", "true", "yes") + if not _vae_cpu: + _effective_free = get_effective_free_vram_gb() + logger.info(f"[generate_music] Effective free VRAM before VAE decode: {_effective_free:.2f} GB") + # If less than 0.5 GB free, VAE decode on GPU will almost certainly OOM + if _effective_free < 0.5: + logger.warning(f"[generate_music] Only {_effective_free:.2f} GB free VRAM — auto-enabling CPU VAE decode") + _vae_cpu = True if _vae_cpu: logger.info("[generate_music] Moving VAE to CPU for decode (ACESTEP_VAE_ON_CPU=1)...") _vae_device = next(self.vae.parameters()).device diff --git a/acestep/llm_inference.py b/acestep/llm_inference.py index 22b81b95..dac14b00 100644 --- a/acestep/llm_inference.py +++ b/acestep/llm_inference.py @@ -3831,12 +3831,18 @@ def get_hf_model_for_scoring(self): load_time = time.time() - start_time logger.info(f"HuggingFace model loaded in {load_time:.2f}s") - # Move to same device as vllm model - device = next(model_runner.model.parameters()).device - self._hf_model_for_scoring = self._hf_model_for_scoring.to(device) - self._hf_model_for_scoring.eval() - - logger.info(f"HuggingFace model for scoring ready on 
{device}") + # When offload_to_cpu is enabled, keep the model on CPU to save + # VRAM. The caller (_load_scoring_model_context in + # test_time_scaling.py) will move it to the accelerator only for + # the duration of the forward pass. + if self.offload_to_cpu: + self._hf_model_for_scoring.eval() + logger.info("HuggingFace model for scoring kept on CPU (offload_to_cpu=True)") + else: + device = next(model_runner.model.parameters()).device + self._hf_model_for_scoring = self._hf_model_for_scoring.to(device) + self._hf_model_for_scoring.eval() + logger.info(f"HuggingFace model for scoring ready on {device}") return self._hf_model_for_scoring @@ -3860,12 +3866,16 @@ def get_hf_model_for_scoring(self): load_time = time.time() - start_time logger.info(f"HuggingFace model loaded in {load_time:.2f}s") - # Keep on CPU for MPS (scoring is not perf-critical) - device = "mps" if hasattr(torch.backends, "mps") and torch.backends.mps.is_available() else "cpu" - self._hf_model_for_scoring = self._hf_model_for_scoring.to(device) - self._hf_model_for_scoring.eval() - - logger.info(f"HuggingFace model for scoring ready on {device}") + # When offload_to_cpu is enabled, keep on CPU; the scoring + # context manager will move it to the accelerator as needed. + if self.offload_to_cpu: + self._hf_model_for_scoring.eval() + logger.info("HuggingFace model for scoring kept on CPU (offload_to_cpu=True)") + else: + device = "mps" if hasattr(torch.backends, "mps") and torch.backends.mps.is_available() else "cpu" + self._hf_model_for_scoring = self._hf_model_for_scoring.to(device) + self._hf_model_for_scoring.eval() + logger.info(f"HuggingFace model for scoring ready on {device}") return self._hf_model_for_scoring diff --git a/acestep/test_time_scaling.py b/acestep/test_time_scaling.py index 19964840..827f28e5 100644 --- a/acestep/test_time_scaling.py +++ b/acestep/test_time_scaling.py @@ -2,13 +2,15 @@ Test-Time Scaling Module Implements perplexity-based scoring for generated audio codes """ +import contextlib +import math +import re + import torch import torch.nn.functional as F -from typing import Tuple, Optional, Dict, Any, List -from loguru import logger import yaml -import math -import re +from loguru import logger +from typing import Tuple, Optional, Dict, Any, List def pmi_score(log_prob_conditional: float, log_prob_unconditional: float) -> float: @@ -62,6 +64,52 @@ def pmi_to_normalized_score(pmi: float, scale: float = 0.1) -> float: return 1.0 / (1.0 + math.exp(-pmi / scale)) +@contextlib.contextmanager +def _load_scoring_model_context(llm_handler): + """ + Context manager that loads the HF scoring model to the accelerator device + before use and offloads it back to CPU afterwards. + + For the ``pt`` backend the existing ``_load_model_context()`` already + handles offloading, so we just delegate to it. For ``vllm`` / ``mlx`` + backends, ``get_hf_model_for_scoring()`` caches a *separate* HF model + that would otherwise stay on GPU permanently — here we move it to GPU + only for the duration of the scoring forward pass and move it back to + CPU when done, freeing VRAM for DiT / VAE. 
+ """ + backend = getattr(llm_handler, "llm_backend", "pt") + + if backend == "pt": + # pt backend: _load_model_context already handles GPU ↔ CPU + with llm_handler._load_model_context(): + yield + return + + # vllm / mlx: manage the cached HF model ourselves + model = llm_handler.get_hf_model_for_scoring() + if model is None: + yield + return + + offload = getattr(llm_handler, "offload_to_cpu", False) + device = llm_handler.device if hasattr(llm_handler, "device") else "cpu" + + if offload and hasattr(model, "to"): + logger.info(f"[scoring] Loading HF scoring model to {device}") + model.to(device) + + try: + yield + finally: + if offload and hasattr(model, "to"): + logger.info("[scoring] Offloading HF scoring model to CPU") + model.to("cpu") + if torch.cuda.is_available(): + torch.cuda.empty_cache() + elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available() and hasattr(torch, "mps") and hasattr(torch.mps, "empty_cache"): + torch.mps.empty_cache() + + def _get_logits_and_target_for_scoring(llm_handler, formatted_prompt: str, target_text: str) -> Tuple[torch.Tensor, torch.Tensor]: """ @@ -77,7 +125,18 @@ def _get_logits_and_target_for_scoring(llm_handler, formatted_prompt: str, """ model = llm_handler.get_hf_model_for_scoring() tokenizer = llm_handler.llm_tokenizer - device = llm_handler.device if llm_handler.llm_backend == "pt" else next(model.parameters()).device + + # Determine the device the model is *currently* on (it may be on CPU + # if offload_to_cpu is active — _load_scoring_model_context will move + # it to the accelerator before the forward pass). + backend = getattr(llm_handler, "llm_backend", "pt") + if backend == "pt": + device = llm_handler.device + else: + # For vllm/mlx the scoring model may be on CPU right now; + # use the handler's target device so tensors land on the right device + # once the model is moved there by the context manager. + device = llm_handler.device if hasattr(llm_handler, "device") else next(model.parameters()).device # 1. Tokenize prompt ONLY to get its length (used for slicing later). # We must ensure special tokens are added to count the offset correctly. @@ -96,18 +155,17 @@ def _get_logits_and_target_for_scoring(llm_handler, formatted_prompt: str, return torch.empty(0, device=device), torch.empty(0, device=device) # 3. Forward Pass (Teacher Forcing) + # _load_scoring_model_context ensures the model is on-device for the + # forward pass and offloaded back to CPU afterwards. with torch.no_grad(): - with llm_handler._load_model_context(): + with _load_scoring_model_context(llm_handler): outputs = model(input_ids=input_ids, attention_mask=full_tokens['attention_mask']) all_logits = outputs.logits # [1, seq_len, vocab_size] - # 4. Extract Logits and Labels - # We need to predict `input_ids[i]`. The logit for this is at `all_logits[i-1]`. - # Target starts at index `prompt_len`. - # So we need logits from `prompt_len - 1` up to the second to last position. - - target_logits = all_logits[0, prompt_len - 1:-1, :] # [target_len, vocab_size] - target_ids = input_ids[0, prompt_len:] # [target_len] + # 4. Extract Logits and Labels — move to CPU so downstream scoring + # does not keep large vocab-sized tensors on GPU. 
+ target_logits = all_logits[0, prompt_len - 1:-1, :].cpu() # [target_len, vocab_size] + target_ids = input_ids[0, prompt_len:].cpu() # [target_len] return target_logits, target_ids diff --git a/acestep/third_parts/nano-vllm/nanovllm/engine/model_runner.py b/acestep/third_parts/nano-vllm/nanovllm/engine/model_runner.py index c564265a..b4f25caa 100644 --- a/acestep/third_parts/nano-vllm/nanovllm/engine/model_runner.py +++ b/acestep/third_parts/nano-vllm/nanovllm/engine/model_runner.py @@ -218,6 +218,22 @@ def allocate_kv_cache(self): hf_config = config.hf_config free, total = torch.cuda.mem_get_info() current = torch.cuda.memory_stats()["allocated_bytes.all.current"] + + # Account for per-process memory fraction (set via MAX_CUDA_VRAM simulation) + import os as _os + _debug_vram = _os.environ.get("MAX_CUDA_VRAM") + if _debug_vram is not None: + try: + _simulated_gb = float(_debug_vram) + _total_gb = total / (1024 ** 3) + if _simulated_gb < _total_gb: + # Effective total and free are capped by simulation + reserved = torch.cuda.memory_reserved() + total = int(_simulated_gb * (1024 ** 3)) + free = max(0, total - reserved) + except (ValueError, TypeError): + pass + num_kv_heads = hf_config.num_key_value_heads // self.world_size head_dim = getattr(hf_config, "head_dim", hf_config.hidden_size // hf_config.num_attention_heads) block_bytes = 2 * hf_config.num_hidden_layers * self.block_size * num_kv_heads * head_dim * self.dtype.itemsize @@ -228,6 +244,13 @@ def allocate_kv_cache(self): target_total_usage = total * config.gpu_memory_utilization available_for_kv_cache = min(free * 0.9, target_total_usage - current) + # Safety check: ensure we leave at least ~1 GB free for DiT inference + # activations that will run after LM generation. Without this, the KV + # cache can consume all free VRAM and cause OOM during DiT forward pass. + MIN_RESERVE_BYTES = int(1.0 * 1024**3) # 1 GB reserved for other models + max_kv_from_free = max(0, free - MIN_RESERVE_BYTES) * 0.9 + available_for_kv_cache = min(available_for_kv_cache, max_kv_from_free) + # Ensure we have positive memory available if available_for_kv_cache <= 0: available_for_kv_cache = free * 0.5 # Fallback to 50% of free memory @@ -242,11 +265,21 @@ def allocate_kv_cache(self): ) max_tokens_capacity = config.num_kvcache_blocks * self.block_size kv_cache_size_gb = config.num_kvcache_blocks * block_bytes / 1024**3 + + # If KV cache would leave less than 1 GB free, warn and suggest reducing max_model_len + post_kv_free = (free - config.num_kvcache_blocks * block_bytes) / 1024**3 + if post_kv_free < 1.0: + print( + f"[nanovllm] WARNING: After KV cache allocation, only {post_kv_free:.2f} GB free. " + f"DiT inference may OOM. Consider reducing max_model_len or using CPU offload." 
+ ) + print( f"[nanovllm] KV cache allocated: {config.num_kvcache_blocks} blocks Ɨ {self.block_size} tokens = " f"{max_tokens_capacity} tokens capacity, {kv_cache_size_gb:.2f} GB " f"(free: {free / 1024**3:.2f} GB, used: {current / 1024**3:.2f} GB, " - f"target: {target_total_usage / 1024**3:.2f} GB, block: {block_bytes / 1024**2:.2f} MB)" + f"target: {target_total_usage / 1024**3:.2f} GB, block: {block_bytes / 1024**2:.2f} MB, " + f"post_kv_free: {post_kv_free:.2f} GB)" ) self.kv_cache = torch.empty(2, hf_config.num_hidden_layers, config.num_kvcache_blocks, self.block_size, num_kv_heads, head_dim) layer_id = 0 diff --git a/docs/en/BENCHMARK.md b/docs/en/BENCHMARK.md index 79a5124b..87f68370 100644 --- a/docs/en/BENCHMARK.md +++ b/docs/en/BENCHMARK.md @@ -26,6 +26,7 @@ |------|-------------| | `profile` | Profile a single generation run with detailed timing breakdown | | `benchmark` | Run a matrix of configurations (duration Ɨ batch Ɨ thinking Ɨ steps) and produce a summary table | +| `tier-test` | Automatically test all GPU tiers by simulating different VRAM sizes via `MAX_CUDA_VRAM` | | `understand` | Profile the `understand_music()` API (audio → metadata extraction) | | `create_sample` | Profile the `create_sample()` API (inspiration / simple mode) | | `format_sample` | Profile the `format_sample()` API (caption + lyrics → structured metadata) | @@ -156,6 +157,84 @@ Profiles the `format_sample()` API which converts caption + lyrics into structur python profile_inference.py --mode format_sample ``` +### 6. `tier-test` — Automated GPU Tier Testing + +Automatically simulates different GPU VRAM sizes using `MAX_CUDA_VRAM` and runs a generation test at each tier. This is the recommended way to validate that all GPU tiers work correctly after modifying `acestep/gpu_config.py`. + +```bash +# Test all tiers (4, 6, 8, 12, 16, 20, 24 GB) +python profile_inference.py --mode tier-test + +# Test specific VRAM sizes +python profile_inference.py --mode tier-test --tiers 6 8 16 + +# Test with LM enabled (where the tier supports it) +python profile_inference.py --mode tier-test --tier-with-lm + +# Quick test: skip torch.compile for non-quantized tiers +python profile_inference.py --mode tier-test --tier-skip-compile +``` + +**What it validates per tier:** +- Correct tier detection and `GPUConfig` construction +- Model initialization (DiT, VAE, Text Encoder, optionally LM) +- A short generation run (30s duration, batch=1) completes without OOM +- Adaptive VAE decode fallback (GPU → CPU offload → full CPU) +- VRAM usage stays within the simulated limit + +**Output example:** + +``` +TIER TEST RESULTS +==================================================================================================== + VRAM Tier LM Duration Status Peak VRAM Notes + ────────────────────────────────────────────────────────────────────────────── + 4GB tier1 — 30s āœ… OK 3.8GB VAE decoded on CPU + 6GB tier2 — 30s āœ… OK 5.4GB Tiled VAE chunk=256 + 8GB tier4 0.6B 30s āœ… OK 7.2GB vllm backend + 12GB tier5 1.7B 30s āœ… OK 10.8GB vllm backend + 16GB tier6a 1.7B 30s āœ… OK 14.5GB offload enabled + 20GB tier6b 1.7B 30s āœ… OK 17.2GB no offload + 24GB unlimited 4B 30s āœ… OK 21.3GB full models on GPU +``` + +> **Note**: `tier-test` mode uses `torch.cuda.set_per_process_memory_fraction()` to enforce a hard VRAM cap, making simulations realistic even on high-end GPUs (e.g., A100 80GB). + +#### Boundary Testing + +Use `--tier-boundary` to find the minimum VRAM tier at which INT8 quantization and CPU offload can be safely disabled. 
For each tier, up to three configurations are tested: + +1. **default** — tier's standard settings +2. **no-quant** — quantization disabled, offload unchanged +3. **no-offload** — no quantization AND no CPU offload + +```bash +# Run boundary tests across all tiers +python profile_inference.py --mode tier-test --tier-boundary + +# Boundary test with LM enabled +python profile_inference.py --mode tier-test --tier-boundary --tier-with-lm + +# Save boundary results to JSON +python profile_inference.py --mode tier-test --tier-boundary --benchmark-output boundary_results.json +``` + +The output includes a **Boundary Analysis** summary showing the minimum tier for each capability. + +#### Batch Size Boundary Testing + +Use `--tier-batch-boundary` to find the maximum safe batch size for each tier. For each tier, the tool progressively tests batch sizes 1, 2, 4, 8 (stopping at first OOM) with both LM-enabled and LM-disabled configurations: + +```bash +# Run batch boundary tests +python profile_inference.py --mode tier-test --tier-batch-boundary --tier-with-lm + +# Test specific tiers +python profile_inference.py --mode tier-test --tier-batch-boundary --tier-with-lm --tiers 8 12 16 24 +``` + +The output includes a **Batch Boundary Summary** showing the maximum successful batch size per tier for both with-LM and without-LM configurations. + --- ## CLI Reference @@ -209,12 +288,22 @@ python profile_inference.py --mode format_sample | Flag | Default | Description | |------|---------|-------------| -| `--mode` | `profile` | Mode: `profile` / `benchmark` / `understand` / `create_sample` / `format_sample` | +| `--mode` | `profile` | Mode: `profile` / `benchmark` / `tier-test` / `understand` / `create_sample` / `format_sample` | | `--no-warmup` | off | Skip warmup run | | `--detailed` | off | Enable `cProfile` function-level analysis | | `--llm-debug` | off | Deep LLM debugging (token count, throughput) | | `--benchmark-output` | none | Save benchmark results to JSON file | +### Tier-Test Options + +| Flag | Default | Description | +|------|---------|-------------| +| `--tiers` | `4 6 8 12 16 20 24` | VRAM sizes (GB) to simulate | +| `--tier-with-lm` | off | Enable LM initialization on tiers that support it | +| `--tier-skip-compile` | off | Skip `torch.compile` for faster iteration on non-quantized tiers | +| `--tier-boundary` | off | Test each tier with no-quant and no-offload variants to find minimum capability boundaries | +| `--tier-batch-boundary` | off | Test each tier with batch sizes 1, 2, 4, 8 to find maximum safe batch size | + ### Input Options | Flag | Default | Description | @@ -340,6 +429,10 @@ TIME COSTS BREAKDOWN 4. **Test with representative durations** — Short durations (30s) are dominated by LLM time; long durations (240s+) are dominated by DiT time. -5. **GPU memory auto-adaptation** — The benchmark mode automatically clamps durations and batch sizes to what your GPU can handle. +5. **GPU memory auto-adaptation** — The benchmark mode automatically clamps durations and batch sizes to what your GPU can handle, using the adaptive tier system in `acestep/gpu_config.py`. 6. **Use `--detailed` sparingly** — `cProfile` adds overhead; use it only when investigating function-level bottlenecks. + +7. **Use `tier-test` for regression testing** — After modifying GPU tier configs, run `--mode tier-test` to verify all tiers still work correctly. This is especially important when changing offload thresholds, duration limits, or LM model availability. + +8. 
**Simulate low VRAM realistically** — When using `MAX_CUDA_VRAM`, the system enforces a hard VRAM cap via `set_per_process_memory_fraction()`, so OOM errors during simulation reflect real behavior on consumer GPUs. diff --git a/docs/en/GPU_COMPATIBILITY.md b/docs/en/GPU_COMPATIBILITY.md index 3b9bc10d..b1666023 100644 --- a/docs/en/GPU_COMPATIBILITY.md +++ b/docs/en/GPU_COMPATIBILITY.md @@ -1,36 +1,69 @@ # GPU Compatibility Guide -ACE-Step 1.5 automatically adapts to your GPU's available VRAM, adjusting generation limits and LM model availability accordingly. The system detects GPU memory at startup and configures optimal settings. +ACE-Step 1.5 automatically adapts to your GPU's available VRAM, adjusting generation limits, LM model availability, offloading strategies, and UI defaults accordingly. The system detects GPU memory at startup and configures optimal settings for your hardware. ## GPU Tier Configuration -| VRAM | Tier | LM Mode | Max Duration | Max Batch Size | LM Memory Allocation | -|------|------|---------|--------------|----------------|---------------------| -| ≤4GB | Tier 1 | Not available | 3 min | 1 | - | -| 4-6GB | Tier 2 | Not available | 6 min | 1 | - | -| 6-8GB | Tier 3 | 0.6B (optional) | With LM: 4 min / Without: 6 min | With LM: 1 / Without: 2 | 3GB | -| 8-12GB | Tier 4 | 0.6B (optional) | With LM: 4 min / Without: 6 min | With LM: 2 / Without: 4 | 3GB | -| 12-16GB | Tier 5 | 0.6B / 1.7B | With LM: 4 min / Without: 6 min | With LM: 2 / Without: 4 | 0.6B: 3GB, 1.7B: 8GB | -| 16-24GB | Tier 6 | 0.6B / 1.7B / 4B | 8 min | With LM: 4 / Without: 8 | 0.6B: 3GB, 1.7B: 8GB, 4B: 12GB | -| ≄24GB | Unlimited | All models | 10 min | 8 | Unrestricted | +| VRAM | Tier | LM Models | Recommended LM | Backend | Max Duration (LM / No LM) | Max Batch (LM / No LM) | Offload | Quantization | +|------|------|-----------|-----------------|---------|----------------------------|-------------------------|---------|--------------| +| ≤4GB | Tier 1 | None | — | pt | 4 min / 6 min | 1 / 1 | CPU + DiT | INT8 | +| 4-6GB | Tier 2 | None | — | pt | 8 min / 10 min | 1 / 1 | CPU + DiT | INT8 | +| 6-8GB | Tier 3 | 0.6B | 0.6B | pt | 8 min / 10 min | 2 / 2 | CPU + DiT | INT8 | +| 8-12GB | Tier 4 | 0.6B | 0.6B | vllm | 8 min / 10 min | 2 / 4 | CPU + DiT | INT8 | +| 12-16GB | Tier 5 | 0.6B, 1.7B | 1.7B | vllm | 8 min / 10 min | 4 / 4 | CPU | INT8 | +| 16-20GB | Tier 6a | 0.6B, 1.7B | 1.7B | vllm | 8 min / 10 min | 4 / 8 | CPU | INT8 | +| 20-24GB | Tier 6b | 0.6B, 1.7B, 4B | 1.7B | vllm | 8 min / 8 min | 8 / 8 | None | None | +| ≄24GB | Unlimited | All (0.6B, 1.7B, 4B) | 4B | vllm | 10 min / 10 min | 8 / 8 | None | None | + +### Column Descriptions + +- **LM Models**: Which 5Hz Language Model sizes can be loaded on this tier +- **Recommended LM**: The default LM model selected in the UI for this tier +- **Backend**: LM inference backend (`vllm` for NVIDIA GPUs with sufficient VRAM, `pt` for PyTorch fallback, `mlx` for Apple Silicon) +- **Offload**: Memory offloading strategy + - **CPU + DiT**: All models (DiT, VAE, Text Encoder) offloaded to CPU when not in use; DiT also offloaded between steps + - **CPU**: VAE and Text Encoder offloaded to CPU; DiT stays on GPU + - **None**: All models remain on GPU +- **Quantization**: Whether INT8 weight quantization is enabled by default to reduce VRAM usage + +## Adaptive UI Defaults + +The Gradio UI automatically configures itself based on the detected GPU tier: + +- **LM Initialization Checkbox**: Checked by default for tiers that support LM (Tier 3+), unchecked 
and disabled for Tier 1-2 +- **LM Model Path**: Pre-populated with the recommended model for your tier; dropdown only shows compatible models +- **Backend Dropdown**: Restricted to `pt`/`mlx` on Tier 1-3 (vllm KV cache is too memory-hungry); all backends available on Tier 4+ +- **CPU Offload / DiT Offload**: Enabled by default on lower tiers, disabled on higher tiers +- **Quantization**: Enabled by default on Tier 1-6a, disabled on Tier 6b+ (sufficient VRAM) +- **Compile Model**: Enabled by default on all tiers (required for quantization) + +If you manually select an incompatible option (e.g., trying to use vllm on a 6GB GPU), the system will warn you and automatically fall back to a compatible configuration. + +## Runtime Safety Features + +- **VRAM Guard**: Before each inference, the system estimates VRAM requirements and automatically reduces batch size if needed +- **Adaptive VAE Decode**: Three-tier fallback: GPU tiled decode → GPU decode with CPU offload → full CPU decode +- **Auto Chunk Size**: VAE decode chunk size adapts to available free VRAM (64/128/256/512/1024/1536) +- **Duration/Batch Clamping**: If you request values exceeding your tier's limits, they are clamped with a warning ## Notes - **Default settings** are automatically configured based on detected GPU memory - **LM Mode** refers to the Language Model used for Chain-of-Thought generation and audio understanding -- **Flash Attention**, **CPU Offload**, **Compile**, and **Quantization** are enabled by default for optimal performance -- If you request a duration or batch size exceeding your GPU's limits, a warning will be displayed and values will be clamped +- **Flash Attention** is auto-detected and enabled when available - **Constrained Decoding**: When LM is initialized, the LM's duration generation is also constrained to the GPU tier's maximum duration limit, preventing out-of-memory errors during CoT generation -- For GPUs with ≤6GB VRAM, LM initialization is disabled by default to preserve memory for the DiT model +- For GPUs with ≤6GB VRAM (Tier 1-2), LM initialization is disabled by default to preserve memory for the DiT model - You can manually override settings via command-line arguments or the Gradio UI > **Community Contributions Welcome**: The GPU tier configurations above are based on our testing across common hardware. If you find that your device's actual performance differs from these parameters (e.g., can handle longer durations or larger batch sizes), we welcome you to conduct more thorough testing and submit a PR to optimize these configurations in `acestep/gpu_config.py`. Your contributions help improve the experience for all users! ## Memory Optimization Tips -1. **Low VRAM (<8GB)**: Use DiT-only mode without LM initialization for maximum duration -2. **Medium VRAM (8-16GB)**: Use the 0.6B LM model for best balance of quality and memory -3. **High VRAM (>16GB)**: Enable larger LM models (1.7B/4B) for better audio understanding and generation quality +1. **Very Low VRAM (≤6GB)**: Use DiT-only mode without LM initialization. INT8 quantization and full CPU offload are mandatory. VAE decode may fall back to CPU automatically. +2. **Low VRAM (6-8GB)**: The 0.6B LM model can be used with `pt` backend. Keep offload enabled. +3. **Medium VRAM (8-16GB)**: Use the 0.6B or 1.7B LM model. `vllm` backend works well on Tier 4+. +4. **High VRAM (16-24GB)**: Enable larger LM models (1.7B recommended). Quantization becomes optional on 20GB+. +5. 
**Very High VRAM (≄24GB)**: All models fit without offloading or quantization. Use 4B LM for best quality. ## Debug Mode: Simulating Different GPU Configurations @@ -40,17 +73,93 @@ For testing and development, you can simulate different GPU memory sizes using t # Simulate a 4GB GPU (Tier 1) MAX_CUDA_VRAM=4 uv run acestep +# Simulate a 6GB GPU (Tier 2) +MAX_CUDA_VRAM=6 uv run acestep + # Simulate an 8GB GPU (Tier 4) MAX_CUDA_VRAM=8 uv run acestep # Simulate a 12GB GPU (Tier 5) MAX_CUDA_VRAM=12 uv run acestep -# Simulate a 16GB GPU (Tier 6) +# Simulate a 16GB GPU (Tier 6a) MAX_CUDA_VRAM=16 uv run acestep ``` +When `MAX_CUDA_VRAM` is set, the system also calls `torch.cuda.set_per_process_memory_fraction()` to enforce a hard VRAM cap, making the simulation realistic even on high-end GPUs. + +### Automated Tier Testing + +Instead of manually testing each tier through the UI, use the `tier-test` mode of `profile_inference.py`: + +```bash +# Test all tiers automatically +python profile_inference.py --mode tier-test + +# Test specific tiers +python profile_inference.py --mode tier-test --tiers 6 8 16 + +# Test with LM enabled (where supported) +python profile_inference.py --mode tier-test --tier-with-lm + +# Quick test (skip torch.compile for non-quantized tiers) +python profile_inference.py --mode tier-test --tier-skip-compile +``` + +See [BENCHMARK.md](BENCHMARK.md) for full documentation of the profiling tool. + This is useful for: - Testing GPU tier configurations on high-end hardware - Verifying that warnings and limits work correctly for each tier -- Developing and testing new GPU configuration parameters before submitting a PR +- Automated regression testing after modifying `acestep/gpu_config.py` +- CI/CD validation of VRAM compatibility + +### Boundary Testing (Finding Minimum Tiers) + +Use `--tier-boundary` to empirically determine the minimum VRAM tier at which INT8 quantization and CPU offload can be safely disabled. For each tier, this runs up to three configurations: + +1. **default** — tier's standard settings (quantization + offload as configured) +2. **no-quant** — same offload settings, but quantization disabled +3. **no-offload** — no quantization AND no CPU offload (all models on GPU) + +```bash +# Run boundary tests across all tiers +python profile_inference.py --mode tier-test --tier-boundary + +# Test specific tiers with boundary testing +python profile_inference.py --mode tier-test --tier-boundary --tiers 8 12 16 20 24 + +# Boundary test with LM enabled (where supported) +python profile_inference.py --mode tier-test --tier-boundary --tier-with-lm + +# Save results to JSON for further analysis +python profile_inference.py --mode tier-test --tier-boundary --benchmark-output boundary_results.json +``` + +The output includes a **Boundary Analysis** section showing the minimum tier for each capability: + +``` +BOUNDARY ANALYSIS +================= + Capability Min Tier VRAM + ------------------------------------------------------------ + No INT8 Quantization tier6b 20GB + No CPU Offload (all models on GPU) tier6b 20GB + ------------------------------------------------------------ +``` + +> **Note:** Boundary results are empirical and may vary based on DiT model variant (turbo vs base), whether LM is enabled, generation duration, and flash attention availability. Community contributions to refine these boundaries are welcome! 
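+
+For reference, the hard cap behind these simulations boils down to a single PyTorch call. The sketch below is illustrative only; the helper name is hypothetical and the project's actual implementation (including which device index it targets) may differ:
+
+```python
+import os
+import torch
+
+def apply_simulated_vram_cap(device_index: int = 0) -> None:
+    """Cap this process's usable VRAM to MAX_CUDA_VRAM gigabytes (illustrative sketch)."""
+    simulated_gb = os.environ.get("MAX_CUDA_VRAM")
+    if simulated_gb is None or not torch.cuda.is_available():
+        return
+    total_bytes = torch.cuda.get_device_properties(device_index).total_memory
+    fraction = min(1.0, float(simulated_gb) * (1024 ** 3) / total_bytes)
+    # Allocations beyond this fraction raise torch.cuda.OutOfMemoryError,
+    # so a large card behaves like the simulated smaller GPU.
+    torch.cuda.set_per_process_memory_fraction(fraction, device=device_index)
+```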
+ +### Batch Size Boundary Testing + +Use `--tier-batch-boundary` to find the maximum safe batch size for each tier by progressively testing batch sizes 1, 2, 4, 8: + +```bash +# Run batch boundary tests with LM enabled +python profile_inference.py --mode tier-test --tier-batch-boundary --tier-with-lm + +# Test specific tiers +python profile_inference.py --mode tier-test --tier-batch-boundary --tier-with-lm --tiers 8 12 16 24 +``` + +This tests both with-LM and without-LM configurations and reports the maximum successful batch size per tier. diff --git a/docs/en/GPU_TROUBLESHOOTING.md b/docs/en/GPU_TROUBLESHOOTING.md index afeba414..0d42739b 100644 --- a/docs/en/GPU_TROUBLESHOOTING.md +++ b/docs/en/GPU_TROUBLESHOOTING.md @@ -229,4 +229,6 @@ If none of the above solutions work: | Variable | Purpose | Example | |----------|---------|---------| -| `MAX_CUDA_VRAM` | Override detected VRAM (testing) | `8` (simulate 8GB GPU) | +| `MAX_CUDA_VRAM` | Override detected VRAM for tier simulation (also enforces hard VRAM cap via `set_per_process_memory_fraction`) | `8` (simulate 8GB GPU) | + +> **Note on `MAX_CUDA_VRAM`**: When set, this variable not only changes the tier detection logic but also calls `torch.cuda.set_per_process_memory_fraction()` to enforce a hard VRAM limit. This means OOM errors during simulation are realistic and reflect actual behavior on GPUs with that amount of VRAM. See [GPU_COMPATIBILITY.md](GPU_COMPATIBILITY.md) for the full tier table. diff --git a/docs/en/GRADIO_GUIDE.md b/docs/en/GRADIO_GUIDE.md index 235ab06e..f316ab42 100644 --- a/docs/en/GRADIO_GUIDE.md +++ b/docs/en/GRADIO_GUIDE.md @@ -62,17 +62,23 @@ The Gradio interface consists of several main sections: | Setting | Description | |---------|-------------| -| **5Hz LM Model Path** | Select the language model (e.g., `acestep-5Hz-lm-0.6B`, `acestep-5Hz-lm-1.7B`) | -| **5Hz LM Backend** | `vllm` (faster, recommended) or `pt` (PyTorch, more compatible) | -| **Initialize 5Hz LM** | Check to load the LM during initialization (required for thinking mode) | +| **5Hz LM Model Path** | Select the language model. **Available models are filtered by your GPU tier** — e.g., 6-8GB GPUs only show 0.6B, while 24GB+ GPUs show all sizes (0.6B, 1.7B, 4B). | +| **5Hz LM Backend** | `vllm` (faster, recommended for NVIDIA with ≄8GB VRAM), `pt` (PyTorch, universal fallback), or `mlx` (Apple Silicon). **On GPUs <8GB, the backend is restricted to `pt`/`mlx`** because vllm's KV cache is too memory-hungry. | +| **Initialize 5Hz LM** | Check to load the LM during initialization (required for thinking mode). **Automatically unchecked and disabled on GPUs ≤6GB** (Tier 1-2). | + +> **Adaptive Defaults**: All LM settings are automatically configured based on your GPU's VRAM tier. The recommended LM model, backend, and initialization state are pre-set for optimal performance. You can manually override these, but the system will warn you if your selection is incompatible with your GPU. ### Performance Options | Setting | Description | |---------|-------------| | **Use Flash Attention** | Enable for faster inference (requires flash_attn package) | -| **Offload to CPU** | Offload models to CPU when idle to save GPU memory | -| **Offload DiT to CPU** | Specifically offload the DiT model to CPU | +| **Offload to CPU** | Offload models to CPU when idle to save GPU memory. **Automatically enabled on GPUs <20GB.** | +| **Offload DiT to CPU** | Specifically offload the DiT model to CPU. 
**Automatically enabled on GPUs <12GB.** | +| **INT8 Quantization** | Reduce model VRAM footprint with INT8 weight quantization. **Automatically enabled on GPUs <20GB.** | +| **Compile Model** | Enable `torch.compile` for optimized inference. **Enabled by default on all tiers** (required when quantization is active). | + +> **Tier-Aware Settings**: Offload, quantization, and compile options are automatically set based on your GPU tier. See [GPU_COMPATIBILITY.md](GPU_COMPATIBILITY.md) for the full tier table. ### LoRA Adapter @@ -87,7 +93,12 @@ The Gradio interface consists of several main sections: ### Initialization -Click **Initialize Service** to load the models. The status box will show progress and confirmation. +Click **Initialize Service** to load the models. The status box will show progress and confirmation, including: +- The detected GPU tier and VRAM +- Maximum allowed duration and batch size (adjusted dynamically based on whether LM was initialized) +- Any warnings about incompatible settings that were automatically corrected + +After initialization, the **Audio Duration** and **Batch Size** sliders are automatically updated to reflect the tier's limits. --- @@ -527,14 +538,18 @@ These options are especially useful when preprocessing takes a long time or you - Make caption more specific **Out of memory:** -- Reduce batch size -- Enable CPU offloading +- The system includes automatic VRAM management (VRAM guard, adaptive VAE decode, auto batch reduction). If OOM still occurs: +- Reduce batch size manually +- Enable CPU offloading (should be auto-enabled for GPUs <20GB) +- Enable INT8 quantization (should be auto-enabled for GPUs <20GB) - Reduce LM batch chunk size +- See [GPU_COMPATIBILITY.md](GPU_COMPATIBILITY.md) for recommended settings per tier **LM not working:** -- Ensure "Initialize 5Hz LM" was checked during initialization -- Check that a valid LM model path is selected -- Verify vllm or PyTorch backend is available +- Ensure "Initialize 5Hz LM" was checked during initialization (disabled by default on GPUs ≤6GB) +- Check that a valid LM model path is selected (only tier-compatible models are shown) +- Verify vllm or PyTorch backend is available (vllm restricted on GPUs <8GB) +- If the LM checkbox is grayed out, your GPU tier does not support LM — use DiT-only mode --- diff --git a/docs/en/INFERENCE.md b/docs/en/INFERENCE.md index 1a0504a3..a2354f08 100644 --- a/docs/en/INFERENCE.md +++ b/docs/en/INFERENCE.md @@ -1068,11 +1068,19 @@ else: ### 7. 
Memory Management -For large batch sizes or long durations: -- Monitor GPU memory usage -- Reduce `batch_size` if OOM errors occur -- Reduce `lm_batch_chunk_size` for LM operations -- Consider using `offload_to_cpu=True` during initialization +ACE-Step 1.5 includes automatic VRAM management that adapts to your GPU: + +- **Automatic tier detection**: The system detects available VRAM and selects optimal settings (see [GPU_COMPATIBILITY.md](GPU_COMPATIBILITY.md)) +- **VRAM guard**: Before each inference, the system estimates VRAM requirements and automatically reduces `batch_size` if needed +- **Adaptive VAE decode**: Three-tier fallback — GPU tiled decode → GPU decode with CPU offload → full CPU decode +- **Auto chunk sizing**: VAE decode chunk size adapts to free VRAM (64/128/256/512/1024/1536) +- **Duration/batch clamping**: Values exceeding your tier's limits are automatically clamped with a warning + +For manual tuning: +- Reduce `batch_size` if OOM errors persist +- Reduce `lm_batch_chunk_size` for LM operations on low-VRAM GPUs +- Enable `offload_to_cpu=True` during initialization for GPUs with <20GB VRAM +- Enable `quantization="int8_weight_only"` for GPUs with <20GB VRAM ### 8. Accessing Time Costs @@ -1094,7 +1102,7 @@ if result.success: ### Common Issues **Issue**: Out of memory errors -- **Solution**: Reduce `batch_size`, `inference_steps`, or enable CPU offloading +- **Solution**: The system should automatically handle most OOM scenarios via VRAM guard (batch reduction) and adaptive VAE decode (CPU fallback). If OOM still occurs: reduce `batch_size`, reduce `inference_steps`, enable CPU offloading (`offload_to_cpu=True`), or enable INT8 quantization. See [GPU_COMPATIBILITY.md](GPU_COMPATIBILITY.md) for recommended settings per VRAM tier. **Issue**: Poor quality results - **Solution**: Increase `inference_steps`, adjust `guidance_scale`, use base model diff --git a/docs/en/INSTALL.md b/docs/en/INSTALL.md index 4f584463..39488680 100644 --- a/docs/en/INSTALL.md +++ b/docs/en/INSTALL.md @@ -502,7 +502,7 @@ ACESTEP_INIT_LLM=false | `--init_llm` | auto | LLM init: `true` / `false` / omit for auto | | `--config_path` | auto | DiT model (e.g., `acestep-v15-turbo`) | | `--lm_model_path` | auto | LM model (e.g., `acestep-5Hz-lm-1.7B`) | -| `--offload_to_cpu` | auto | CPU offload (auto-enabled if VRAM < 16GB) | +| `--offload_to_cpu` | auto | CPU offload (auto-enabled if VRAM < 20GB) | | `--download-source` | auto | Model source: `auto` / `huggingface` / `modelscope` | | `--enable-api` | false | Enable REST API alongside Gradio UI | | `--api-key` | none | API key for authentication | @@ -576,16 +576,17 @@ huggingface-cli download ACE-Step/acestep-5Hz-lm-4B --local-dir ./checkpoints/ac ## šŸ’” Which Model Should I Choose? -ACE-Step automatically adapts to your GPU's VRAM: +ACE-Step automatically adapts to your GPU's VRAM. 
The UI pre-configures all settings (LM model, backend, offloading, quantization) based on your detected GPU tier: -| Your GPU VRAM | Recommended LM Model | Notes | -|---------------|---------------------|-------| -| **≤6GB** | None (DiT only) | LM disabled by default to save memory | -| **6-12GB** | `acestep-5Hz-lm-0.6B` | Lightweight, good balance | -| **12-16GB** | `acestep-5Hz-lm-1.7B` | Better quality | -| **≄16GB** | `acestep-5Hz-lm-4B` | Best quality and audio understanding | +| Your GPU VRAM | Recommended LM Model | Backend | Notes | +|---------------|---------------------|---------|-------| +| **≤6GB** | None (DiT only) | — | LM disabled by default; INT8 quantization + full CPU offload | +| **6-8GB** | `acestep-5Hz-lm-0.6B` | `pt` | Lightweight LM with PyTorch backend | +| **8-16GB** | `0.6B` / `1.7B` | `vllm` | 0.6B for 8-12GB, 1.7B for 12-16GB | +| **16-24GB** | `acestep-5Hz-lm-1.7B` | `vllm` | 4B available on 20GB+; no offload on 20GB+ | +| **≄24GB** | `acestep-5Hz-lm-4B` | `vllm` | Best quality, all models fit without offload | -> šŸ“– For detailed GPU compatibility information (duration limits, batch sizes, memory optimization), see [GPU Compatibility Guide](GPU_COMPATIBILITY.md). +> šŸ“– For detailed GPU compatibility information (tier table, duration limits, batch sizes, adaptive UI defaults, memory optimization), see [GPU Compatibility Guide](GPU_COMPATIBILITY.md). --- diff --git a/docs/en/ace_step_musicians_guide.md b/docs/en/ace_step_musicians_guide.md index c739f930..6f1a8757 100644 --- a/docs/en/ace_step_musicians_guide.md +++ b/docs/en/ace_step_musicians_guide.md @@ -138,36 +138,45 @@ A computer with a decent graphics card (GPU). The better the GPU, the faster and YOUR GPU MEMORY WHAT YOU CAN DO ───────────────────────────────────────────────────── - 4 GB (entry level) Songs up to 3 minutes + 4 GB (entry level) Songs up to 6 minutes ā–“ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ 1 song at a time Basic mode only (no Songwriter brain) - 8 GB (mainstream) Songs up to 6 minutes - ā–“ā–“ā–“ā–“ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ 1-2 songs at a time - Optional lightweight Songwriter brain + 6-8 GB (budget) Songs up to 10 minutes + ā–“ā–“ā–“ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ 1-2 songs at a time + Optional lightweight Songwriter brain (0.6B) - 12 GB (sweet spot) Songs up to 6 minutes - ā–“ā–“ā–“ā–“ā–“ā–“ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ 2-4 songs at a time - Full Songwriter brain available + 8-12 GB (mainstream) Songs up to 10 minutes + ā–“ā–“ā–“ā–“ā–“ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ 2-4 songs at a time + Songwriter brain available (0.6B) - 16 GB (enthusiast) Songs up to 8 minutes - ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ 2-4 songs at a time - Larger, smarter Songwriter brain + 12-16 GB (sweet spot) Songs up to 10 minutes + ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ 2-4 songs at a time + Full Songwriter brain (1.7B) - 24 GB+ (high end) Songs up to 10 minutes + 16-20 GB (enthusiast) Songs up to 10 minutes + ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ 1-4 songs at a time + Larger Songwriter brain (1.7B) + + 20-24 GB (high end) Songs up to 8 minutes + ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ 2-8 songs at a time + All Songwriter brains (0.6B/1.7B/4B), no offload needed + + 24 GB+ (pro) Songs up to 10 minutes ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–“ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ā–‘ Up to 8 songs at a time - All features unlocked + All features unlocked, 
best quality (4B) ``` **Common GPUs and where they land:** | GPU | Memory | Tier | |-----|--------|------| -| GTX 1050 Ti | 4 GB | Entry | -| RTX 3060 / 4060 | 8 GB | Mainstream | -| RTX 3070 / 4070 | 8-12 GB | Sweet spot | -| RTX 3080 / 4080 | 12-16 GB | Enthusiast | -| RTX 4090 | 24 GB | High end | +| GTX 1050 Ti | 4 GB | Entry (Tier 1) | +| GTX 1660 / RTX 2060 | 6 GB | Budget (Tier 2) | +| RTX 3060 / 4060 | 8 GB | Mainstream (Tier 4) | +| RTX 3070 / 4070 | 8-12 GB | Mainstream-Sweet spot (Tier 4-5) | +| RTX 3080 16GB / 4060 Ti 16GB | 16 GB | Enthusiast (Tier 6a) | +| RTX 3090 / 4090 | 24 GB | High end / Pro (Tier 6b-Unlimited) | | Apple M1/M2/M3 (Mac) | Shared memory | Supported, varies | **Disk space:** About 100 GB free. The AI models are large files (around 60 GB total) that download automatically the first time you run the software. diff --git a/docs/ja/GPU_COMPATIBILITY.md b/docs/ja/GPU_COMPATIBILITY.md index 37d9b227..e1862296 100644 --- a/docs/ja/GPU_COMPATIBILITY.md +++ b/docs/ja/GPU_COMPATIBILITY.md @@ -1,36 +1,69 @@ # GPU äŗ’ę›ę€§ć‚¬ć‚¤ćƒ‰ -ACE-Step 1.5 は GPU 恮 VRAM ć«č‡Ŗå‹•ēš„ć«é©åæœć—ć€ē”Ÿęˆę™‚é–“ć®åˆ¶é™ć‚„ä½æē”ØåÆčƒ½ćŖ LM ćƒ¢ćƒ‡ćƒ«ć‚’čŖæę•“ć—ć¾ć™ć€‚ć‚·ć‚¹ćƒ†ćƒ ćÆčµ·å‹•ę™‚ć« GPU ćƒ”ćƒ¢ćƒŖć‚’ę¤œå‡ŗć—ć€ęœ€é©ćŖčØ­å®šć‚’č‡Ŗå‹•ę§‹ęˆć—ć¾ć™ć€‚ +ACE-Step 1.5 は GPU 恮 VRAM ć«č‡Ŗå‹•ēš„ć«é©åæœć—ć€ē”Ÿęˆę™‚é–“ć®åˆ¶é™ć€ä½æē”ØåÆčƒ½ćŖ LM ćƒ¢ćƒ‡ćƒ«ć€ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ęˆ¦ē•„ć€UI ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆčØ­å®šć‚’čŖæę•“ć—ć¾ć™ć€‚ć‚·ć‚¹ćƒ†ćƒ ćÆčµ·å‹•ę™‚ć« GPU ćƒ”ćƒ¢ćƒŖć‚’ę¤œå‡ŗć—ć€ęœ€é©ćŖčØ­å®šć‚’č‡Ŗå‹•ę§‹ęˆć—ć¾ć™ć€‚ ## GPU ćƒ†ć‚£ć‚¢ę§‹ęˆ -| VRAM | ćƒ†ć‚£ć‚¢ | LM ćƒ¢ćƒ¼ćƒ‰ | ęœ€å¤§ę™‚é–“ | ęœ€å¤§ćƒćƒƒćƒ | LM ćƒ”ćƒ¢ćƒŖå‰²å½“ | -|------|--------|-----------|----------|------------|---------------| -| ≤4GB | Tier 1 | åˆ©ē”ØäøåÆ | 3 分 | 1 | - | -| 4-6GB | Tier 2 | åˆ©ē”ØäøåÆ | 6 分 | 1 | - | -| 6-8GB | Tier 3 | 0.6B (ć‚Ŗćƒ—ć‚·ćƒ§ćƒ³) | LM 恂悊: 4 分 / LM なし: 6 分 | LM 恂悊: 1 / LM なし: 2 | 3GB | -| 8-12GB | Tier 4 | 0.6B (ć‚Ŗćƒ—ć‚·ćƒ§ćƒ³) | LM 恂悊: 4 分 / LM なし: 6 分 | LM 恂悊: 2 / LM なし: 4 | 3GB | -| 12-16GB | Tier 5 | 0.6B / 1.7B | LM 恂悊: 4 分 / LM なし: 6 分 | LM 恂悊: 2 / LM なし: 4 | 0.6B: 3GB, 1.7B: 8GB | -| 16-24GB | Tier 6 | 0.6B / 1.7B / 4B | 8 分 | LM 恂悊: 4 / LM なし: 8 | 0.6B: 3GB, 1.7B: 8GB, 4B: 12GB | -| ≄24GB | ē„”åˆ¶é™ | å…Øćƒ¢ćƒ‡ćƒ« | 10 分 | 8 | ē„”åˆ¶é™ | +| VRAM | ćƒ†ć‚£ć‚¢ | LM ćƒ¢ćƒ‡ćƒ« | ęŽØå„Ø LM | ćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰ | ęœ€å¤§ę™‚é–“ (LM꜉ / LMē„”) | ęœ€å¤§ćƒćƒƒćƒ (LM꜉ / LMē„”) | ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ | 量子化 | +|------|--------|-----------|---------|-------------|------------------------|--------------------------|------------|--------| +| ≤4GB | Tier 1 | なし | — | pt | 4分 / 6分 | 1 / 1 | CPU + DiT | INT8 | +| 4-6GB | Tier 2 | なし | — | pt | 8分 / 10分 | 1 / 1 | CPU + DiT | INT8 | +| 6-8GB | Tier 3 | 0.6B | 0.6B | pt | 8分 / 10分 | 2 / 2 | CPU + DiT | INT8 | +| 8-12GB | Tier 4 | 0.6B | 0.6B | vllm | 8分 / 10分 | 2 / 4 | CPU + DiT | INT8 | +| 12-16GB | Tier 5 | 0.6B, 1.7B | 1.7B | vllm | 8分 / 10分 | 4 / 4 | CPU | INT8 | +| 16-20GB | Tier 6a | 0.6B, 1.7B | 1.7B | vllm | 8分 / 10分 | 4 / 8 | CPU | INT8 | +| 20-24GB | Tier 6b | 0.6B, 1.7B, 4B | 1.7B | vllm | 8分 / 8分 | 8 / 8 | なし | なし | +| ≄24GB | ē„”åˆ¶é™ | å…Øćƒ¢ćƒ‡ćƒ« (0.6B, 1.7B, 4B) | 4B | vllm | 10分 / 10分 | 8 / 8 | なし | なし | + +### åˆ—ć®čŖ¬ę˜Ž + +- **LM ćƒ¢ćƒ‡ćƒ«**: ć“ć®ćƒ†ć‚£ć‚¢ć§ćƒ­ćƒ¼ćƒ‰ć§ćć‚‹ 5Hz čØ€čŖžćƒ¢ćƒ‡ćƒ«ć®ć‚µć‚¤ć‚ŗ +- **ęŽØå„Ø LM**: UI ć§ć“ć®ćƒ†ć‚£ć‚¢ć«ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆéøęŠžć•ć‚Œć‚‹ LM ćƒ¢ćƒ‡ćƒ« +- **ćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰**: LM ęŽØč«–ćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰ļ¼ˆ`vllm` ćÆååˆ†ćŖ VRAM 悒ꌁ恤 NVIDIA GPU 向け、`pt` は 
PyTorch ćƒ•ć‚©ćƒ¼ćƒ«ćƒćƒƒć‚Æć€`mlx` は Apple Silicon 向け) +- **ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰**: + - **CPU + DiT**: ć™ć¹ć¦ć®ćƒ¢ćƒ‡ćƒ«ļ¼ˆDiT态VAEć€ćƒ†ć‚­ć‚¹ćƒˆć‚Øćƒ³ć‚³ćƒ¼ćƒ€ćƒ¼ļ¼‰ć‚’ęœŖä½æē”Øę™‚ć« CPU ć«ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ļ¼›DiT ć‚‚ć‚¹ćƒ†ćƒƒćƒ—é–“ć§ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ + - **CPU**: VAE ćØćƒ†ć‚­ć‚¹ćƒˆć‚Øćƒ³ć‚³ćƒ¼ćƒ€ćƒ¼ć‚’ CPU ć«ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ļ¼›DiT は GPU ć«äæęŒ + - **なし**: ć™ć¹ć¦ć®ćƒ¢ćƒ‡ćƒ«ć‚’ GPU ć«äæęŒ +- **量子化**: VRAM ä½æē”Øé‡ć‚’å‰Šęø›ć™ć‚‹ćŸć‚ć€ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆć§ INT8 é‡ćæé‡å­åŒ–ć‚’ęœ‰åŠ¹ć«ć™ć‚‹ć‹ć©ć†ć‹ + +## ć‚¢ćƒ€ćƒ—ćƒ†ć‚£ćƒ– UI ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆ + +Gradio UI ćÆę¤œå‡ŗć•ć‚ŒćŸ GPU ćƒ†ć‚£ć‚¢ć«åŸŗć„ć„ć¦č‡Ŗå‹•ēš„ć«čØ­å®šć•ć‚Œć¾ć™ļ¼š + +- **LM åˆęœŸåŒ–ćƒć‚§ćƒƒć‚Æćƒœćƒƒć‚Æć‚¹**: LM ć‚’ć‚µćƒćƒ¼ćƒˆć™ć‚‹ćƒ†ć‚£ć‚¢ļ¼ˆTier 3+ļ¼‰ć§ćÆćƒ‡ćƒ•ć‚©ćƒ«ćƒˆć§ćƒć‚§ćƒƒć‚Æć€Tier 1-2 ć§ćÆćƒć‚§ćƒƒć‚ÆćŖć—ćƒ»ē„”åŠ¹ +- **LM ćƒ¢ćƒ‡ćƒ«ćƒ‘ć‚¹**: ćƒ†ć‚£ć‚¢ć®ęŽØå„Øćƒ¢ćƒ‡ćƒ«ćŒč‡Ŗå‹•å…„åŠ›ļ¼›ćƒ‰ćƒ­ćƒƒćƒ—ćƒ€ć‚¦ćƒ³ć«ćÆäŗ’ę›ćƒ¢ćƒ‡ćƒ«ć®ćæč”Øē¤ŗ +- **ćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰ćƒ‰ćƒ­ćƒƒćƒ—ćƒ€ć‚¦ćƒ³**: Tier 1-3 では `pt`/`mlx` ć«åˆ¶é™ļ¼ˆvllm KV ć‚­ćƒ£ćƒƒć‚·ćƒ„ćŒćƒ”ćƒ¢ćƒŖć‚’ę¶ˆč²»ć—ć™ćŽć‚‹ļ¼‰ļ¼›Tier 4+ ć§ćÆć™ć¹ć¦ć®ćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰ćŒåˆ©ē”ØåÆčƒ½ +- **CPU ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ / DiT ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰**: ä½Žćƒ†ć‚£ć‚¢ć§ćÆćƒ‡ćƒ•ć‚©ćƒ«ćƒˆć§ęœ‰åŠ¹ć€é«˜ćƒ†ć‚£ć‚¢ć§ćÆē„”åŠ¹ +- **量子化**: Tier 1-6a ć§ćÆćƒ‡ćƒ•ć‚©ćƒ«ćƒˆć§ęœ‰åŠ¹ć€Tier 6b+ ć§ćÆē„”åŠ¹ļ¼ˆååˆ†ćŖ VRAM) +- **ćƒ¢ćƒ‡ćƒ«ć‚³ćƒ³ćƒ‘ć‚¤ćƒ«**: ć™ć¹ć¦ć®ćƒ†ć‚£ć‚¢ć§ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆć§ęœ‰åŠ¹ļ¼ˆé‡å­åŒ–ć«åæ…č¦ļ¼‰ + +äŗ’ę›ę€§ć®ćŖć„ć‚Ŗćƒ—ć‚·ćƒ§ćƒ³ć‚’ę‰‹å‹•ć§éøęŠžć—ćŸå “åˆļ¼ˆä¾‹ļ¼š6GB GPU 恧 vllm ć‚’ä½æē”Øć—ć‚ˆć†ćØć—ćŸå “åˆļ¼‰ć€ć‚·ć‚¹ćƒ†ćƒ ćÆč­¦å‘Šć‚’č”Øē¤ŗć—ć€äŗ’ę›ę€§ć®ć‚ć‚‹čØ­å®šć«č‡Ŗå‹•ćƒ•ć‚©ćƒ¼ćƒ«ćƒćƒƒć‚Æć—ć¾ć™ć€‚ + +## ćƒ©ćƒ³ć‚æć‚¤ćƒ å®‰å…Øę©Ÿčƒ½ + +- **VRAM ć‚¬ćƒ¼ćƒ‰**: å„ęŽØč«–å‰ć« VRAM č¦ä»¶ć‚’ęŽØå®šć—ć€åæ…č¦ć«åæœć˜ć¦ćƒćƒƒćƒć‚µć‚¤ć‚ŗć‚’č‡Ŗå‹•å‰Šęø› +- **ć‚¢ćƒ€ćƒ—ćƒ†ć‚£ćƒ– VAE ćƒ‡ć‚³ćƒ¼ćƒ‰**: 3 ę®µéšŽćƒ•ć‚©ćƒ¼ćƒ«ćƒćƒƒć‚Æļ¼šGPU ć‚æć‚¤ćƒ«ćƒ‡ć‚³ćƒ¼ćƒ‰ → GPU ćƒ‡ć‚³ćƒ¼ćƒ‰+CPU ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ → å®Œå…Ø CPU ćƒ‡ć‚³ćƒ¼ćƒ‰ +- **č‡Ŗå‹•ćƒćƒ£ćƒ³ć‚Æć‚µć‚¤ć‚ŗ**: VAE ćƒ‡ć‚³ćƒ¼ćƒ‰ćƒćƒ£ćƒ³ć‚Æć‚µć‚¤ć‚ŗćŒåˆ©ē”ØåÆčƒ½ćŖē©ŗć VRAM に適応(64/128/256/512/1024/1536) +- **Ꙃ間/ćƒćƒƒćƒć‚Æćƒ©ćƒ³ćƒ—**: ćƒ†ć‚£ć‚¢ć®åˆ¶é™ć‚’č¶…ćˆć‚‹å€¤ć‚’č¦ę±‚ć—ćŸå “åˆć€č­¦å‘ŠćØćØć‚‚ć«č‡Ŗå‹•čŖæę•“ ## ę³Øę„äŗ‹é … - **ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆčØ­å®š** ćÆę¤œå‡ŗć•ć‚ŒćŸ GPU ćƒ”ćƒ¢ćƒŖć«åŸŗć„ć„ć¦č‡Ŗå‹•ę§‹ęˆć•ć‚Œć¾ć™ - **LM ćƒ¢ćƒ¼ćƒ‰** は Chain-of-Thought ē”ŸęˆćØć‚Ŗćƒ¼ćƒ‡ć‚£ć‚Ŗē†č§£ć«ä½æē”Øć•ć‚Œć‚‹čØ€čŖžćƒ¢ćƒ‡ćƒ«ć‚’ęŒ‡ć—ć¾ć™ -- **Flash Attention**态**CPU Offload**态**Compile**态**Quantization** ćÆęœ€é©ćŖćƒ‘ćƒ•ć‚©ćƒ¼ćƒžćƒ³ć‚¹ć®ćŸć‚ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆć§ęœ‰åŠ¹ć§ć™ -- č¦ę±‚ć—ćŸę™‚é–“ć‚„ćƒćƒƒćƒć‚µć‚¤ć‚ŗćŒ GPU ć®åˆ¶é™ć‚’č¶…ćˆć‚‹å “åˆć€č­¦å‘ŠćŒč”Øē¤ŗć•ć‚Œć€å€¤ćÆčØ±å®¹ęœ€å¤§å€¤ć«čŖæę•“ć•ć‚Œć¾ć™ +- **Flash Attention** ćÆč‡Ŗå‹•ę¤œå‡ŗć•ć‚Œć€åˆ©ē”ØåÆčƒ½ćŖå “åˆć«ęœ‰åŠ¹åŒ–ć•ć‚Œć¾ć™ - **åˆ¶ē“„ä»˜ććƒ‡ć‚³ćƒ¼ćƒ‰**: LM ćŒåˆęœŸåŒ–ć•ć‚Œć‚‹ćØć€LM ć®ę™‚é–“ē”Ÿęˆć‚‚ GPU ćƒ†ć‚£ć‚¢ć®ęœ€å¤§ę™‚é–“åˆ¶é™å†…ć«åˆ¶ē“„ć•ć‚Œć€CoT ē”Ÿęˆę™‚ć®ćƒ”ćƒ¢ćƒŖäøč¶³ć‚Øćƒ©ćƒ¼ć‚’é˜²ćŽć¾ć™ -- VRAM ≤6GB 恮 GPU では、DiT ćƒ¢ćƒ‡ćƒ«ē”Øć®ćƒ”ćƒ¢ćƒŖć‚’ē¢ŗäæć™ć‚‹ćŸć‚ć€ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆć§ LM åˆęœŸåŒ–ćŒē„”åŠ¹ć«ćŖć‚Šć¾ć™ +- VRAM ≤6GB 恮 GPU(Tier 1-2)では、DiT ćƒ¢ćƒ‡ćƒ«ē”Øć®ćƒ”ćƒ¢ćƒŖć‚’ē¢ŗäæć™ć‚‹ćŸć‚ć€ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆć§ LM åˆęœŸåŒ–ćŒē„”åŠ¹ć«ćŖć‚Šć¾ć™ - ć‚³ćƒžćƒ³ćƒ‰ćƒ©ć‚¤ćƒ³å¼•ę•°ć¾ćŸćÆ Gradio UI ć§čØ­å®šć‚’ę‰‹å‹•ć§äøŠę›øćć§ćć¾ć™ -> **ć‚³ćƒŸćƒ„ćƒ‹ćƒ†ć‚£č²¢ēŒ®ę­“čæŽ**: 上記の GPU 
ćƒ†ć‚£ć‚¢ę§‹ęˆćÆäø€čˆ¬ēš„ćŖćƒćƒ¼ćƒ‰ć‚¦ć‚§ć‚¢ć§ć®ćƒ†ć‚¹ćƒˆć«åŸŗć„ć„ć¦ć„ć¾ć™ć€‚ćŠä½æć„ć®ćƒ‡ćƒć‚¤ć‚¹ć®å®Ÿéš›ć®ćƒ‘ćƒ•ć‚©ćƒ¼ćƒžćƒ³ć‚¹ćŒć“ć‚Œć‚‰ć®ćƒ‘ćƒ©ćƒ”ćƒ¼ć‚æćØē•°ćŖć‚‹å “åˆļ¼ˆä¾‹ļ¼šć‚ˆć‚Šé•·ć„ę™‚é–“ć‚„ć‚ˆć‚Šå¤§ććŖćƒćƒƒćƒć‚µć‚¤ć‚ŗć‚’å‡¦ē†ć§ćć‚‹ļ¼‰ć€ć‚ˆć‚Šå¾¹åŗ•ēš„ćŖćƒ†ć‚¹ćƒˆć‚’č”Œć„ć€`acestep/gpu_config.py` ć®ę§‹ęˆć‚’ęœ€é©åŒ–ć™ć‚‹ PR ć‚’ęå‡ŗć™ć‚‹ć“ćØć‚’ę­“čæŽć—ć¾ć™ć€‚ēš†ę§˜ć®č²¢ēŒ®ćŒć™ć¹ć¦ć®ćƒ¦ćƒ¼ć‚¶ćƒ¼ć®ä½“éØ“å‘äøŠć«å½¹ē«‹ć”ć¾ć™ļ¼ +> **ć‚³ćƒŸćƒ„ćƒ‹ćƒ†ć‚£č²¢ēŒ®ę­“čæŽ**: 上記の GPU ćƒ†ć‚£ć‚¢ę§‹ęˆćÆäø€čˆ¬ēš„ćŖćƒćƒ¼ćƒ‰ć‚¦ć‚§ć‚¢ć§ć®ćƒ†ć‚¹ćƒˆć«åŸŗć„ć„ć¦ć„ć¾ć™ć€‚ćŠä½æć„ć®ćƒ‡ćƒć‚¤ć‚¹ć®å®Ÿéš›ć®ćƒ‘ćƒ•ć‚©ćƒ¼ćƒžćƒ³ć‚¹ćŒć“ć‚Œć‚‰ć®ćƒ‘ćƒ©ćƒ”ćƒ¼ć‚æćØē•°ćŖć‚‹å “åˆļ¼ˆä¾‹ļ¼šć‚ˆć‚Šé•·ć„ę™‚é–“ć‚„ć‚ˆć‚Šå¤§ććŖćƒćƒƒćƒć‚µć‚¤ć‚ŗć‚’å‡¦ē†ć§ćć‚‹ļ¼‰ć€ć‚ˆć‚Šå¾¹åŗ•ēš„ćŖćƒ†ć‚¹ćƒˆć‚’č”Œć„ć€`acestep/gpu_config.py` ć®ę§‹ęˆć‚’ęœ€é©åŒ–ć™ć‚‹ PR ć‚’ęå‡ŗć™ć‚‹ć“ćØć‚’ę­“čæŽć—ć¾ć™ć€‚ ## ćƒ”ćƒ¢ćƒŖęœ€é©åŒ–ć®ćƒ’ćƒ³ćƒˆ -1. **低 VRAM (<8GB)**: ęœ€å¤§ę™‚é–“ć‚’å¾—ć‚‹ćŸć‚ć€LM åˆęœŸåŒ–ćŖć—ć® DiT ć®ćæćƒ¢ćƒ¼ćƒ‰ć‚’ä½æē”Ø -2. **äø­ VRAM (8-16GB)**: å“č³ŖćØćƒ”ćƒ¢ćƒŖć®ćƒćƒ©ćƒ³ć‚¹ćŒęœ€é©ćŖ 0.6B LM ćƒ¢ćƒ‡ćƒ«ć‚’ä½æē”Ø -3. **高 VRAM (>16GB)**: ć‚ˆć‚Šč‰Æć„ć‚Ŗćƒ¼ćƒ‡ć‚£ć‚Ŗē†č§£ćØē”Ÿęˆå“č³Ŗć®ćŸć‚ć€ć‚ˆć‚Šå¤§ććŖ LM ćƒ¢ćƒ‡ćƒ« (1.7B/4B) ć‚’ęœ‰åŠ¹åŒ– +1. **č¶…ä½Ž VRAM (≤6GB)**: LM åˆęœŸåŒ–ćŖć—ć® DiT ć®ćæćƒ¢ćƒ¼ćƒ‰ć‚’ä½æē”Øć€‚INT8 é‡å­åŒ–ćØå®Œå…Ø CPU ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ćŒåæ…é ˆć€‚VAE ćƒ‡ć‚³ćƒ¼ćƒ‰ćÆč‡Ŗå‹•ēš„ć« CPU ć«ćƒ•ć‚©ćƒ¼ćƒ«ćƒćƒƒć‚Æć™ć‚‹å “åˆćŒć‚ć‚Šć¾ć™ć€‚ +2. **低 VRAM (6-8GB)**: `pt` ćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰ć§ 0.6B LM ćƒ¢ćƒ‡ćƒ«ć‚’ä½æē”ØåÆčƒ½ć€‚ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ć‚’ęœ‰åŠ¹ć«äæć”ć¾ć™ć€‚ +3. **äø­ VRAM (8-16GB)**: 0.6B または 1.7B LM ćƒ¢ćƒ‡ćƒ«ć‚’ä½æē”Øć€‚Tier 4+ では `vllm` ćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰ćŒč‰Æå„½ć«å‹•ä½œć—ć¾ć™ć€‚ +4. **高 VRAM (16-24GB)**: ć‚ˆć‚Šå¤§ććŖ LM ćƒ¢ćƒ‡ćƒ«ļ¼ˆ1.7B ęŽØå„Øļ¼‰ć‚’ęœ‰åŠ¹åŒ–ć€‚20GB+ ć§ćÆé‡å­åŒ–ćÆć‚Ŗćƒ—ć‚·ćƒ§ćƒ³ć«ćŖć‚Šć¾ć™ć€‚ +5. 
**č¶…é«˜ VRAM (≄24GB)**: ć™ć¹ć¦ć®ćƒ¢ćƒ‡ćƒ«ćŒć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ć‚„é‡å­åŒ–ćŖć—ć§å‹•ä½œć€‚ęœ€é«˜å“č³Ŗć®ćŸć‚ 4B LM を使用。 ## ćƒ‡ćƒćƒƒć‚°ćƒ¢ćƒ¼ćƒ‰ļ¼šē•°ćŖć‚‹ GPU ę§‹ęˆć®ć‚·ćƒŸćƒ„ćƒ¬ćƒ¼ć‚·ćƒ§ćƒ³ @@ -40,17 +73,81 @@ ACE-Step 1.5 は GPU 恮 VRAM ć«č‡Ŗå‹•ēš„ć«é©åæœć—ć€ē”Ÿęˆę™‚é–“ć®åˆ¶é™ # 4GB GPU (Tier 1) ć‚’ć‚·ćƒŸćƒ„ćƒ¬ćƒ¼ćƒˆ MAX_CUDA_VRAM=4 uv run acestep +# 6GB GPU (Tier 2) ć‚’ć‚·ćƒŸćƒ„ćƒ¬ćƒ¼ćƒˆ +MAX_CUDA_VRAM=6 uv run acestep + # 8GB GPU (Tier 4) ć‚’ć‚·ćƒŸćƒ„ćƒ¬ćƒ¼ćƒˆ MAX_CUDA_VRAM=8 uv run acestep # 12GB GPU (Tier 5) ć‚’ć‚·ćƒŸćƒ„ćƒ¬ćƒ¼ćƒˆ MAX_CUDA_VRAM=12 uv run acestep -# 16GB GPU (Tier 6) ć‚’ć‚·ćƒŸćƒ„ćƒ¬ćƒ¼ćƒˆ +# 16GB GPU (Tier 6a) ć‚’ć‚·ćƒŸćƒ„ćƒ¬ćƒ¼ćƒˆ MAX_CUDA_VRAM=16 uv run acestep ``` +`MAX_CUDA_VRAM` ć‚’čØ­å®šć™ć‚‹ćØć€ć‚·ć‚¹ćƒ†ćƒ ćÆ `torch.cuda.set_per_process_memory_fraction()` を呼び出して VRAM ć®ćƒćƒ¼ćƒ‰ć‚­ćƒ£ćƒƒćƒ—ć‚’å¼·åˆ¶ć—ć€ćƒć‚¤ć‚Øćƒ³ćƒ‰ GPU ć§ć‚‚ćƒŖć‚¢ćƒ«ćŖć‚·ćƒŸćƒ„ćƒ¬ćƒ¼ć‚·ćƒ§ćƒ³ć‚’å®Ÿē¾ć—ć¾ć™ć€‚ + +### č‡Ŗå‹•ćƒ†ć‚£ć‚¢ćƒ†ć‚¹ćƒˆ + +UI ć§å„ćƒ†ć‚£ć‚¢ć‚’ę‰‹å‹•ćƒ†ć‚¹ćƒˆć™ć‚‹ä»£ć‚ć‚Šć«ć€`profile_inference.py` 恮 `tier-test` ćƒ¢ćƒ¼ćƒ‰ć‚’ä½æē”Øć§ćć¾ć™ļ¼š + +```bash +# ć™ć¹ć¦ć®ćƒ†ć‚£ć‚¢ć‚’č‡Ŗå‹•ćƒ†ć‚¹ćƒˆ +python profile_inference.py --mode tier-test + +# ē‰¹å®šć®ćƒ†ć‚£ć‚¢ć‚’ćƒ†ć‚¹ćƒˆ +python profile_inference.py --mode tier-test --tiers 6 8 16 + +# LM ć‚’ęœ‰åŠ¹ć«ć—ć¦ćƒ†ć‚¹ćƒˆļ¼ˆć‚µćƒćƒ¼ćƒˆć•ć‚Œć‚‹ćƒ†ć‚£ć‚¢ć§ļ¼‰ +python profile_inference.py --mode tier-test --tier-with-lm + +# é«˜é€Ÿćƒ†ć‚¹ćƒˆļ¼ˆéžé‡å­åŒ–ćƒ†ć‚£ć‚¢ć§ torch.compile ć‚’ć‚¹ć‚­ćƒƒćƒ—ļ¼‰ +python profile_inference.py --mode tier-test --tier-skip-compile +``` + +ćƒ—ćƒ­ćƒ•ć‚”ć‚¤ćƒŖćƒ³ć‚°ćƒ„ćƒ¼ćƒ«ć®å®Œå…ØćŖćƒ‰ć‚­ćƒ„ćƒ”ćƒ³ćƒˆćÆ [BENCHMARK.md](BENCHMARK.md) ć‚’å‚ē…§ć—ć¦ćć ć•ć„ć€‚ + ē”Øé€”ļ¼š - ćƒć‚¤ć‚Øćƒ³ćƒ‰ćƒćƒ¼ćƒ‰ć‚¦ć‚§ć‚¢ć§ GPU ćƒ†ć‚£ć‚¢ę§‹ęˆć‚’ćƒ†ć‚¹ćƒˆ - å„ćƒ†ć‚£ć‚¢ć®č­¦å‘ŠćØåˆ¶é™ćŒę­£ć—ćę©Ÿčƒ½ć™ć‚‹ć“ćØć‚’ē¢ŗčŖ -- PR ć‚’ęå‡ŗć™ć‚‹å‰ć«ę–°ć—ć„ GPU ę§‹ęˆćƒ‘ćƒ©ćƒ”ćƒ¼ć‚æć‚’é–‹ē™ŗćƒ»ćƒ†ć‚¹ćƒˆ +- `acestep/gpu_config.py` å¤‰ę›“å¾Œć®č‡Ŗå‹•å›žåø°ćƒ†ć‚¹ćƒˆ +- CI/CD VRAM äŗ’ę›ę€§ę¤œčØ¼ + +### å¢ƒē•Œćƒ†ć‚¹ćƒˆļ¼ˆęœ€å°ćƒ†ć‚£ć‚¢ć®ē‰¹å®šļ¼‰ + +`--tier-boundary` を使用すると、INT8 é‡å­åŒ–ćØ CPU ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ć‚’å®‰å…Øć«ē„”åŠ¹åŒ–ć§ćć‚‹ęœ€å° VRAM ćƒ†ć‚£ć‚¢ć‚’å®ŸéØ“ēš„ć«ē‰¹å®šć§ćć¾ć™ć€‚å„ćƒ†ć‚£ć‚¢ć«åÆ¾ć—ć¦ęœ€å¤§3ć¤ć®ę§‹ęˆć§ćƒ†ć‚¹ćƒˆć—ć¾ć™ļ¼š + +1. **default** — ćƒ†ć‚£ć‚¢ć®ęØ™ęŗ–čØ­å®šļ¼ˆé‡å­åŒ– + ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ć‚’čØ­å®šé€šć‚Šć«ä½æē”Øļ¼‰ +2. **no-quant** — ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰čØ­å®šćÆćć®ć¾ć¾ć€é‡å­åŒ–ć‚’ē„”åŠ¹åŒ– +3. 
**no-offload** — é‡å­åŒ–ćŖć—ć€CPU ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ćŖć—ļ¼ˆć™ć¹ć¦ć®ćƒ¢ćƒ‡ćƒ«ć‚’ GPU ć«äæęŒļ¼‰ + +```bash +# ć™ć¹ć¦ć®ćƒ†ć‚£ć‚¢ć§å¢ƒē•Œćƒ†ć‚¹ćƒˆć‚’å®Ÿč”Œ +python profile_inference.py --mode tier-test --tier-boundary + +# ē‰¹å®šć®ćƒ†ć‚£ć‚¢ć®å¢ƒē•Œćƒ†ć‚¹ćƒˆ +python profile_inference.py --mode tier-test --tier-boundary --tiers 8 12 16 20 24 + +# LM ć‚’ęœ‰åŠ¹ć«ć—ćŸå¢ƒē•Œćƒ†ć‚¹ćƒˆļ¼ˆć‚µćƒćƒ¼ćƒˆć•ć‚Œć‚‹ćƒ†ć‚£ć‚¢ć§ļ¼‰ +python profile_inference.py --mode tier-test --tier-boundary --tier-with-lm + +# ēµęžœć‚’ JSON ć«äæå­˜ +python profile_inference.py --mode tier-test --tier-boundary --benchmark-output boundary_results.json +``` + +> **ę³Øę„ļ¼š** å¢ƒē•Œćƒ†ć‚¹ćƒˆēµęžœćÆēµŒéØ“ēš„ćŖć‚‚ć®ć§ć‚ć‚Šć€DiT ćƒ¢ćƒ‡ćƒ«ćƒćƒŖć‚¢ćƒ³ćƒˆļ¼ˆturbo vs base)、LM ć®ęœ‰åŠ¹åŒ–ēŠ¶ę…‹ć€ē”Ÿęˆę™‚é–“ć€flash attention ć®åˆ©ē”ØåÆå¦ć«ć‚ˆć£ć¦ē•°ćŖć‚‹å “åˆćŒć‚ć‚Šć¾ć™ć€‚ + +### ćƒćƒƒćƒć‚µć‚¤ć‚ŗå¢ƒē•Œćƒ†ć‚¹ćƒˆ + +`--tier-batch-boundary` ć‚’ä½æē”Øć—ć¦ć€ćƒćƒƒćƒć‚µć‚¤ć‚ŗ 1态2态4态8 ć‚’ę®µéšŽēš„ć«ćƒ†ć‚¹ćƒˆć—ć€å„ćƒ†ć‚£ć‚¢ć®ęœ€å¤§å®‰å…Øćƒćƒƒćƒć‚µć‚¤ć‚ŗć‚’č¦‹ć¤ć‘ć¾ć™ļ¼š + +```bash +# LM ęœ‰åŠ¹ć§ćƒćƒƒćƒå¢ƒē•Œćƒ†ć‚¹ćƒˆć‚’å®Ÿč”Œ +python profile_inference.py --mode tier-test --tier-batch-boundary --tier-with-lm + +# ē‰¹å®šć®ćƒ†ć‚£ć‚¢ć‚’ćƒ†ć‚¹ćƒˆ +python profile_inference.py --mode tier-test --tier-batch-boundary --tier-with-lm --tiers 8 12 16 24 +``` + +LM 恂悊/ćŖć—ć®äø”ę–¹ć®ę§‹ęˆć‚’ćƒ†ć‚¹ćƒˆć—ć€å„ćƒ†ć‚£ć‚¢ć®ęœ€å¤§ęˆåŠŸćƒćƒƒćƒć‚µć‚¤ć‚ŗć‚’å ±å‘Šć—ć¾ć™ć€‚ diff --git a/docs/ja/GRADIO_GUIDE.md b/docs/ja/GRADIO_GUIDE.md index 75d27f9a..5053377b 100644 --- a/docs/ja/GRADIO_GUIDE.md +++ b/docs/ja/GRADIO_GUIDE.md @@ -62,17 +62,23 @@ Gradioć‚¤ćƒ³ć‚æćƒ¼ćƒ•ć‚§ćƒ¼ć‚¹ćÆä»„äø‹ć®äø»č¦ć‚»ć‚Æć‚·ćƒ§ćƒ³ć§ę§‹ęˆć•ć‚Œ | 設定 | čŖ¬ę˜Ž | |---------|-------------| -| **5Hz LMćƒ¢ćƒ‡ćƒ«ćƒ‘ć‚¹** | čØ€čŖžćƒ¢ćƒ‡ćƒ«ć‚’éøęŠžļ¼ˆä¾‹ļ¼š`acestep-5Hz-lm-0.6B`态`acestep-5Hz-lm-1.7B`)| -| **5Hz LMćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰** | `vllm`ļ¼ˆć‚ˆć‚Šé«˜é€Ÿć€ęŽØå„Øļ¼‰ć¾ćŸćÆ `pt`(PyTorchć€äŗ’ę›ę€§ćŒé«˜ć„ļ¼‰| -| **5Hz LMć‚’åˆęœŸåŒ–** | åˆęœŸåŒ–ę™‚ć«LMć‚’čŖ­ćæč¾¼ć‚€ćŸć‚ć«ćƒć‚§ćƒƒć‚Æļ¼ˆthinkingćƒ¢ćƒ¼ćƒ‰ć«åæ…č¦ļ¼‰| +| **5Hz LMćƒ¢ćƒ‡ćƒ«ćƒ‘ć‚¹** | čØ€čŖžćƒ¢ćƒ‡ćƒ«ć‚’éøęŠžć€‚**åˆ©ē”ØåÆčƒ½ćŖćƒ¢ćƒ‡ćƒ«ćÆGPUćƒ†ć‚£ć‚¢ć«åŸŗć„ć„ć¦č‡Ŗå‹•ćƒ•ć‚£ćƒ«ć‚æćƒŖćƒ³ć‚°**ć•ć‚Œć¾ć™ — ä¾‹ļ¼š6-8GB GPUでは0.6Bのみ、24GB+ GPUć§ćÆć™ć¹ć¦ć®ć‚µć‚¤ć‚ŗļ¼ˆ0.6B态1.7B态4Bļ¼‰ćŒč”Øē¤ŗć•ć‚Œć¾ć™ć€‚| +| **5Hz LMćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰** | `vllm`ļ¼ˆć‚ˆć‚Šé«˜é€Ÿć€VRAM ≄8GB恮NVIDIA GPUęŽØå„Øļ¼‰ć€`pt`(PyTorchć€ćƒ¦ćƒ‹ćƒćƒ¼ć‚µćƒ«ćƒ•ć‚©ćƒ¼ćƒ«ćƒćƒƒć‚Æļ¼‰ć€ć¾ćŸćÆ `mlx`(Apple Silicon)。**VRAM <8GB恮GPUでは `pt`/`mlx` ć«åˆ¶é™**ć•ć‚Œć¾ć™ļ¼ˆvllm恮KVć‚­ćƒ£ćƒƒć‚·ćƒ„ćŒćƒ”ćƒ¢ćƒŖć‚’ę¶ˆč²»ć—ć™ćŽć‚‹ćŸć‚ļ¼‰ć€‚| +| **5Hz LMć‚’åˆęœŸåŒ–** | åˆęœŸåŒ–ę™‚ć«LMć‚’čŖ­ćæč¾¼ć‚€ćŸć‚ć«ćƒć‚§ćƒƒć‚Æļ¼ˆthinkingćƒ¢ćƒ¼ćƒ‰ć«åæ…č¦ļ¼‰ć€‚**VRAM ≤6GB恮GPU(Tier 1-2ļ¼‰ć§ćÆćƒ‡ćƒ•ć‚©ćƒ«ćƒˆć§ćƒć‚§ćƒƒć‚ÆćŖć—ćƒ»ē„”åŠ¹ć€‚**| + +> **ć‚¢ćƒ€ćƒ—ćƒ†ć‚£ćƒ–ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆ**: すべてのLM設定はGPU恮VRAMćƒ†ć‚£ć‚¢ć«åŸŗć„ć„ć¦č‡Ŗå‹•ę§‹ęˆć•ć‚Œć¾ć™ć€‚ęŽØå„ØLMćƒ¢ćƒ‡ćƒ«ć€ćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰ć€åˆęœŸåŒ–ēŠ¶ę…‹ćÆęœ€é©ćŖćƒ‘ćƒ•ć‚©ćƒ¼ćƒžćƒ³ć‚¹ć«äŗ‹å‰čØ­å®šć•ć‚Œć¦ć„ć¾ć™ć€‚ę‰‹å‹•ć§äøŠę›øćć§ćć¾ć™ćŒć€GPUćØäŗ’ę›ę€§ć®ćŖć„éøęŠžć‚’ć—ćŸå “åˆć€ć‚·ć‚¹ćƒ†ćƒ ćŒč­¦å‘Šć‚’č”Øē¤ŗć—ć¾ć™ć€‚ ### ćƒ‘ćƒ•ć‚©ćƒ¼ćƒžćƒ³ć‚¹ć‚Ŗćƒ—ć‚·ćƒ§ćƒ³ | 設定 | čŖ¬ę˜Ž | |---------|-------------| | **Flash Attentionを使用** | ć‚ˆć‚Šé«˜é€ŸćŖęŽØč«–ć®ćŸć‚ć«ęœ‰åŠ¹åŒ–ļ¼ˆflash_attnćƒ‘ćƒƒć‚±ćƒ¼ć‚øćŒåæ…č¦ļ¼‰| -| **CPUć«ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰** | 
ć‚¢ć‚¤ćƒ‰ćƒ«ę™‚ć«ćƒ¢ćƒ‡ćƒ«ć‚’CPUć«ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ć—ć¦GPUćƒ”ćƒ¢ćƒŖć‚’ēÆ€ē“„ | -| **DiT悒CPUć«ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰** | DiTćƒ¢ćƒ‡ćƒ«ć‚’ē‰¹ć«CPUć«ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ | +| **CPUć«ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰** | ć‚¢ć‚¤ćƒ‰ćƒ«ę™‚ć«ćƒ¢ćƒ‡ćƒ«ć‚’CPUć«ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ć—ć¦GPUćƒ”ćƒ¢ćƒŖć‚’ēÆ€ē“„ć€‚**VRAM <20GB恮GPUć§ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆč‡Ŗå‹•ęœ‰åŠ¹ć€‚**| +| **DiT悒CPUć«ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰** | DiTćƒ¢ćƒ‡ćƒ«ć‚’ē‰¹ć«CPUć«ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ć€‚**VRAM <12GB恮GPUć§ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆč‡Ŗå‹•ęœ‰åŠ¹ć€‚**| +| **INT8量子化** | INT8é‡ćæé‡å­åŒ–ć§ćƒ¢ćƒ‡ćƒ«ć®VRAMä½æē”Øé‡ć‚’å‰Šęø›ć€‚**VRAM <20GB恮GPUć§ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆč‡Ŗå‹•ęœ‰åŠ¹ć€‚**| +| **ćƒ¢ćƒ‡ćƒ«ć‚³ćƒ³ćƒ‘ć‚¤ćƒ«** | ęœ€é©åŒ–ęŽØč«–ć®ćŸć‚ `torch.compile` ć‚’ęœ‰åŠ¹åŒ–ć€‚**ć™ć¹ć¦ć®ćƒ†ć‚£ć‚¢ć§ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆęœ‰åŠ¹**ļ¼ˆé‡å­åŒ–ćŒć‚¢ć‚Æćƒ†ć‚£ćƒ–ćŖå “åˆć«åæ…č¦ļ¼‰ć€‚| + +> **ćƒ†ć‚£ć‚¢åÆ¾åæœčØ­å®š**: ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ć€é‡å­åŒ–ć€ć‚³ćƒ³ćƒ‘ć‚¤ćƒ«ć‚Ŗćƒ—ć‚·ćƒ§ćƒ³ćÆGPUćƒ†ć‚£ć‚¢ć«åŸŗć„ć„ć¦č‡Ŗå‹•čØ­å®šć•ć‚Œć¾ć™ć€‚å®Œå…ØćŖćƒ†ć‚£ć‚¢ćƒ†ćƒ¼ćƒ–ćƒ«ćÆ [GPU_COMPATIBILITY.md](../ja/GPU_COMPATIBILITY.md) ć‚’å‚ē…§ć—ć¦ćć ć•ć„ć€‚ ### LoRAć‚¢ćƒ€ćƒ—ć‚æćƒ¼ @@ -87,7 +93,12 @@ Gradioć‚¤ćƒ³ć‚æćƒ¼ćƒ•ć‚§ćƒ¼ć‚¹ćÆä»„äø‹ć®äø»č¦ć‚»ć‚Æć‚·ćƒ§ćƒ³ć§ę§‹ęˆć•ć‚Œ ### åˆęœŸåŒ– -**ć‚µćƒ¼ćƒ“ć‚¹ć‚’åˆęœŸåŒ–** ć‚’ć‚ÆćƒŖćƒƒć‚Æć—ć¦ćƒ¢ćƒ‡ćƒ«ć‚’čŖ­ćæč¾¼ćæć¾ć™ć€‚ć‚¹ćƒ†ćƒ¼ć‚æć‚¹ćƒœćƒƒć‚Æć‚¹ć«é€²ę—ćØē¢ŗčŖćŒč”Øē¤ŗć•ć‚Œć¾ć™ć€‚ +**ć‚µćƒ¼ćƒ“ć‚¹ć‚’åˆęœŸåŒ–** ć‚’ć‚ÆćƒŖćƒƒć‚Æć—ć¦ćƒ¢ćƒ‡ćƒ«ć‚’čŖ­ćæč¾¼ćæć¾ć™ć€‚ć‚¹ćƒ†ćƒ¼ć‚æć‚¹ćƒœćƒƒć‚Æć‚¹ć«ä»„äø‹ć‚’å«ć‚€é€²ę—ćØē¢ŗčŖćŒč”Øē¤ŗć•ć‚Œć¾ć™ļ¼š +- ę¤œå‡ŗć•ć‚ŒćŸGPUćƒ†ć‚£ć‚¢ćØVRAM +- ęœ€å¤§čØ±å®¹ę™‚é–“ćØćƒćƒƒćƒć‚µć‚¤ć‚ŗļ¼ˆLMćŒåˆęœŸåŒ–ć•ć‚ŒćŸć‹ć©ć†ć‹ć«åŸŗć„ć„ć¦å‹•ēš„ć«čŖæę•“ļ¼‰ +- č‡Ŗå‹•äæ®ę­£ć•ć‚ŒćŸäŗ’ę›ę€§ć®ćŖć„čØ­å®šć«é–¢ć™ć‚‹č­¦å‘Š + +åˆęœŸåŒ–å¾Œć€**ć‚Ŗćƒ¼ćƒ‡ć‚£ć‚Ŗę™‚é–“** と **ćƒćƒƒćƒć‚µć‚¤ć‚ŗ** ć‚¹ćƒ©ć‚¤ćƒ€ćƒ¼ćÆćƒ†ć‚£ć‚¢ć®åˆ¶é™ć‚’åę˜ ć™ć‚‹ć‚ˆć†ć«č‡Ŗå‹•ę›“ę–°ć•ć‚Œć¾ć™ć€‚ --- @@ -515,15 +526,19 @@ LoRAćƒˆćƒ¬ćƒ¼ćƒ‹ćƒ³ć‚°ć‚æćƒ–ćÆć‚«ć‚¹ć‚æćƒ LoRAć‚¢ćƒ€ćƒ—ć‚æćƒ¼ć‚’ä½œęˆć™ć‚‹ćŸ - ē•°ćŖć‚‹ć‚·ćƒ¼ćƒ‰ć‚’č©¦ć™ - captionć‚’ć‚ˆć‚Šå…·ä½“ēš„ć«ć™ć‚‹ -**ćƒ”ćƒ¢ćƒŖäøč¶³ļ¼š** -- ćƒćƒƒćƒć‚µć‚¤ć‚ŗć‚’ęø›ć‚‰ć™ -- CPUć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ć‚’ęœ‰åŠ¹åŒ– +**ćƒ”ćƒ¢ćƒŖäøč¶³ļ¼ˆOOMļ¼‰ļ¼š** +- ć‚·ć‚¹ćƒ†ćƒ ćÆč‡Ŗå‹•VRAMć‚¬ćƒ¼ćƒ‰ļ¼ˆćƒćƒƒćƒč‡Ŗå‹•å‰Šęø›ļ¼‰ćØć‚¢ćƒ€ćƒ—ćƒ†ć‚£ćƒ–VAEćƒ‡ć‚³ćƒ¼ćƒ‰ļ¼ˆCPUćƒ•ć‚©ćƒ¼ćƒ«ćƒćƒƒć‚Æļ¼‰ć‚’å«ćæć¾ć™ć€‚ćć‚Œć§ć‚‚OOMćŒē™ŗē”Ÿć™ć‚‹å “åˆļ¼š +- ę‰‹å‹•ć§ćƒćƒƒćƒć‚µć‚¤ć‚ŗć‚’ęø›ć‚‰ć™ +- CPUć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ć‚’ęœ‰åŠ¹åŒ–ļ¼ˆVRAM <20GBć§ćÆč‡Ŗå‹•ęœ‰åŠ¹ć®ćÆćšļ¼‰ +- INT8é‡å­åŒ–ć‚’ęœ‰åŠ¹åŒ–ļ¼ˆVRAM <20GBć§ćÆč‡Ŗå‹•ęœ‰åŠ¹ć®ćÆćšļ¼‰ - LMćƒćƒƒćƒćƒćƒ£ćƒ³ć‚Æć‚µć‚¤ć‚ŗć‚’ęø›ć‚‰ć™ +- å„ćƒ†ć‚£ć‚¢ć®ęŽØå„ØčØ­å®šćÆ [GPU_COMPATIBILITY.md](../ja/GPU_COMPATIBILITY.md) ć‚’å‚ē…§ **LMćŒę©Ÿčƒ½ć—ćŖć„ļ¼š** -- åˆęœŸåŒ–ę™‚ć«ć€Œ5Hz LMć‚’åˆęœŸåŒ–ć€ćŒćƒć‚§ćƒƒć‚Æć•ć‚Œć¦ć„ćŸć“ćØć‚’ē¢ŗčŖ -- ęœ‰åŠ¹ćŖLMćƒ¢ćƒ‡ćƒ«ćƒ‘ć‚¹ćŒéøęŠžć•ć‚Œć¦ć„ć‚‹ć“ćØć‚’ē¢ŗčŖ -- vllmまたはPyTorchćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰ćŒåˆ©ē”ØåÆčƒ½ć§ć‚ć‚‹ć“ćØć‚’ē¢ŗčŖ +- åˆęœŸåŒ–ę™‚ć«ć€Œ5Hz LMć‚’åˆęœŸåŒ–ć€ćŒćƒć‚§ćƒƒć‚Æć•ć‚Œć¦ć„ćŸć“ćØć‚’ē¢ŗčŖļ¼ˆVRAM ≤6GB恮GPUć§ćÆćƒ‡ćƒ•ć‚©ćƒ«ćƒˆē„”åŠ¹ļ¼‰ +- ęœ‰åŠ¹ćŖLMćƒ¢ćƒ‡ćƒ«ćƒ‘ć‚¹ćŒéøęŠžć•ć‚Œć¦ć„ć‚‹ć“ćØć‚’ē¢ŗčŖļ¼ˆćƒ†ć‚£ć‚¢äŗ’ę›ćƒ¢ćƒ‡ćƒ«ć®ćæč”Øē¤ŗļ¼‰ +- vllmまたはPyTorchćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰ćŒåˆ©ē”ØåÆčƒ½ć§ć‚ć‚‹ć“ćØć‚’ē¢ŗčŖļ¼ˆVRAM <8GBではvllmåˆ¶é™ļ¼‰ +- LMćƒć‚§ćƒƒć‚Æćƒœćƒƒć‚Æć‚¹ćŒć‚°ćƒ¬ćƒ¼ć‚¢ć‚¦ćƒˆć—ć¦ć„ć‚‹å “åˆć€GPUćƒ†ć‚£ć‚¢ćŒLMć‚’ć‚µćƒćƒ¼ćƒˆć—ć¦ć„ć¾ć›ć‚“ — DiTć®ćæćƒ¢ćƒ¼ćƒ‰ć‚’ä½æē”Ø --- diff --git a/docs/ja/INFERENCE.md b/docs/ja/INFERENCE.md index 6473d7f4..e6036c4b 
100644 --- a/docs/ja/INFERENCE.md +++ b/docs/ja/INFERENCE.md @@ -709,8 +709,8 @@ caption="é€Ÿć„é…ć„éŸ³ę„½" # ćƒ†ćƒ³ćƒć®ēŸ›ē›¾ ### ć‚ˆćć‚ć‚‹å•é”Œ -**問锌**ļ¼šćƒ”ćƒ¢ćƒŖäøč¶³ć‚Øćƒ©ćƒ¼ -- **解決策**:`batch_size`态`inference_steps` を減らすか、CPUć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ć‚’ęœ‰åŠ¹åŒ– +**問锌**ļ¼šćƒ”ćƒ¢ćƒŖäøč¶³ļ¼ˆOOMļ¼‰ć‚Øćƒ©ćƒ¼ +- **解決策**ļ¼šć‚·ć‚¹ćƒ†ćƒ ćÆ VRAM ć‚¬ćƒ¼ćƒ‰ļ¼ˆćƒćƒƒćƒč‡Ŗå‹•å‰Šęø›ļ¼‰ćØć‚¢ćƒ€ćƒ—ćƒ†ć‚£ćƒ– VAE ćƒ‡ć‚³ćƒ¼ćƒ‰ļ¼ˆCPU ćƒ•ć‚©ćƒ¼ćƒ«ćƒćƒƒć‚Æļ¼‰ć«ć‚ˆć‚Šć€ć»ćØć‚“ć©ć® OOM ć‚·ćƒŠćƒŖć‚Ŗć‚’č‡Ŗå‹•å‡¦ē†ć—ć¾ć™ć€‚ćć‚Œć§ć‚‚ OOM ćŒē™ŗē”Ÿć™ć‚‹å “åˆļ¼š`batch_size` を減らす、`inference_steps` を減らす、CPU ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ļ¼ˆ`offload_to_cpu=True`ļ¼‰ć‚’ęœ‰åŠ¹åŒ–ć€ć¾ćŸćÆ INT8 é‡å­åŒ–ć‚’ęœ‰åŠ¹åŒ–ć—ć¦ćć ć•ć„ć€‚å„ VRAM ćƒ†ć‚£ć‚¢ć®ęŽØå„ØčØ­å®šćÆ [GPU_COMPATIBILITY.md](../ja/GPU_COMPATIBILITY.md) ć‚’å‚ē…§ć—ć¦ćć ć•ć„ć€‚ **問锌**ļ¼šēµęžœć®å“č³ŖćŒę‚Ŗć„ - **解決策**:`inference_steps` を増やす、`guidance_scale` を調敓、basećƒ¢ćƒ‡ćƒ«ć‚’ä½æē”Ø diff --git a/docs/ja/INSTALL.md b/docs/ja/INSTALL.md index c18d3eff..13f64240 100644 --- a/docs/ja/INSTALL.md +++ b/docs/ja/INSTALL.md @@ -468,7 +468,7 @@ ACESTEP_INIT_LLM=false | `--init_llm` | auto | LLM åˆęœŸåŒ–ļ¼š`true` / `false` / ēœē•„ć§č‡Ŗå‹• | | `--config_path` | auto | DiT ćƒ¢ćƒ‡ćƒ«ļ¼ˆä¾‹ļ¼š`acestep-v15-turbo`) | | `--lm_model_path` | auto | LM ćƒ¢ćƒ‡ćƒ«ļ¼ˆä¾‹ļ¼š`acestep-5Hz-lm-1.7B`) | -| `--offload_to_cpu` | auto | CPU ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ļ¼ˆVRAM < 16GB ć§č‡Ŗå‹•ęœ‰åŠ¹åŒ–ļ¼‰ | +| `--offload_to_cpu` | auto | CPU ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ļ¼ˆGPU ćƒ†ć‚£ć‚¢ć«åŸŗć„ć„ć¦č‡Ŗå‹•čØ­å®šļ¼‰ | | `--download-source` | auto | ćƒ¢ćƒ‡ćƒ«ć‚½ćƒ¼ć‚¹ļ¼š`auto` / `huggingface` / `modelscope` | | `--enable-api` | false | Gradio UI ćØåŒę™‚ć« REST API ć‚Øćƒ³ćƒ‰ćƒć‚¤ćƒ³ćƒˆć‚’ęœ‰åŠ¹åŒ– | @@ -529,16 +529,17 @@ huggingface-cli download ACE-Step/acestep-5Hz-lm-4B --local-dir ./checkpoints/ac ## šŸ’” ć©ć®ćƒ¢ćƒ‡ćƒ«ć‚’éøć¶ć¹ćļ¼Ÿ -ACE-Step は GPU 恮 VRAM ć«č‡Ŗå‹•é©åæœć—ć¾ć™ļ¼š +ACE-Step は GPU 恮 VRAM ć«č‡Ŗå‹•é©åæœć—ć¾ć™ć€‚UI ćÆę¤œå‡ŗć•ć‚ŒćŸ GPU ćƒ†ć‚£ć‚¢ć«åŸŗć„ć„ć¦ć™ć¹ć¦ć®čØ­å®šļ¼ˆLM ćƒ¢ćƒ‡ćƒ«ć€ćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰ć€ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ć€é‡å­åŒ–ļ¼‰ć‚’äŗ‹å‰ę§‹ęˆć—ć¾ć™ļ¼š -| GPU VRAM | ęŽØå„Ø LM ćƒ¢ćƒ‡ćƒ« | å‚™č€ƒ | -|----------|---------------|------| -| **≤6GB** | ćŖć—ļ¼ˆDiTのみ) | ćƒ”ćƒ¢ćƒŖēÆ€ē“„ć®ćŸć‚ LM ćÆćƒ‡ćƒ•ć‚©ćƒ«ćƒˆć§ē„”åŠ¹ | -| **6-12GB** | `acestep-5Hz-lm-0.6B` | č»½é‡ć€ćƒćƒ©ćƒ³ć‚¹ćŒč‰Æć„ | -| **12-16GB** | `acestep-5Hz-lm-1.7B` | ć‚ˆć‚Šé«˜å“č³Ŗ | -| **≄16GB** | `acestep-5Hz-lm-4B` | ęœ€é«˜å“č³ŖćØéŸ³å£°ē†č§£čƒ½åŠ› | +| GPU VRAM | ęŽØå„Ø LM ćƒ¢ćƒ‡ćƒ« | ćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰ | å‚™č€ƒ | +|----------|---------------|-------------|------| +| **≤6GB** | ćŖć—ļ¼ˆDiTのみ) | — | LM ćÆćƒ‡ćƒ•ć‚©ćƒ«ćƒˆć§ē„”åŠ¹ļ¼›INT8 量子化 + å®Œå…Ø CPU ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ | +| **6-8GB** | `acestep-5Hz-lm-0.6B` | `pt` | č»½é‡ LM态PyTorch ćƒćƒƒć‚Æć‚Øćƒ³ćƒ‰ | +| **8-16GB** | `0.6B` / `1.7B` | `vllm` | 8-12GB は 0.6B态12-16GB は 1.7B | +| **16-24GB** | `acestep-5Hz-lm-1.7B` | `vllm` | 20GB+ 恧 4B åˆ©ē”ØåÆčƒ½ļ¼›20GB+ ć§ć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰äøč¦ | +| **≄24GB** | `acestep-5Hz-lm-4B` | `vllm` | ęœ€é«˜å“č³Ŗć€ć™ć¹ć¦ć®ćƒ¢ćƒ‡ćƒ«ćŒć‚Ŗćƒ•ćƒ­ćƒ¼ćƒ‰ćŖć—ć§å‹•ä½œ | -> šŸ“– GPU äŗ’ę›ę€§ć®č©³ē“°ļ¼ˆę™‚é–“åˆ¶é™ć€ćƒćƒƒćƒć‚µć‚¤ć‚ŗć€ćƒ”ćƒ¢ćƒŖęœ€é©åŒ–ļ¼‰ćÆ [GPU äŗ’ę›ę€§ć‚¬ć‚¤ćƒ‰](GPU_COMPATIBILITY.md) ć‚’å‚ē…§ć—ć¦ćć ć•ć„ć€‚ +> šŸ“– GPU äŗ’ę›ę€§ć®č©³ē“°ļ¼ˆćƒ†ć‚£ć‚¢ćƒ†ćƒ¼ćƒ–ćƒ«ć€ę™‚é–“åˆ¶é™ć€ćƒćƒƒćƒć‚µć‚¤ć‚ŗć€ć‚¢ćƒ€ćƒ—ćƒ†ć‚£ćƒ– UI ćƒ‡ćƒ•ć‚©ćƒ«ćƒˆć€ćƒ”ćƒ¢ćƒŖęœ€é©åŒ–ļ¼‰ćÆ [GPU äŗ’ę›ę€§ć‚¬ć‚¤ćƒ‰](GPU_COMPATIBILITY.md) ć‚’å‚ē…§ć—ć¦ćć ć•ć„ć€‚ --- diff --git 
a/docs/ko/GPU_COMPATIBILITY.md b/docs/ko/GPU_COMPATIBILITY.md index 38bfe625..0ceb9b6f 100644 --- a/docs/ko/GPU_COMPATIBILITY.md +++ b/docs/ko/GPU_COMPATIBILITY.md @@ -1,36 +1,69 @@ # GPU ķ˜øķ™˜ģ„± ź°€ģ“ė“œ -ACE-Step 1.5ėŠ” GPUģ˜ ģ‚¬ģš© ź°€ėŠ„ķ•œ VRAM에 ģžė™ģœ¼ė”œ ģ ģ‘ķ•˜ģ—¬ ģƒģ„± ģ œķ•œ ė° LM ėŖØėø ź°€ģš©ģ„±ģ„ ģ ģ ˆķžˆ ģ”°ģ •ķ•©ė‹ˆė‹¤. ģ‹œģŠ¤ķ…œģ€ ģ‹œģž‘ ģ‹œ GPU 메모리넼 ź°ģ§€ķ•˜ź³  ģµœģ ģ˜ ģ„¤ģ •ģ„ źµ¬ģ„±ķ•©ė‹ˆė‹¤. +ACE-Step 1.5ėŠ” GPUģ˜ ģ‚¬ģš© ź°€ėŠ„ķ•œ VRAM에 ģžė™ģœ¼ė”œ ģ ģ‘ķ•˜ģ—¬ ģƒģ„± ģ œķ•œ, LM ėŖØėø ź°€ģš©ģ„±, ģ˜¤ķ”„ė”œė“œ ģ „ėžµ ė° UI źø°ė³ø ģ„¤ģ •ģ„ ģ ģ ˆķžˆ ģ”°ģ •ķ•©ė‹ˆė‹¤. ģ‹œģŠ¤ķ…œģ€ ģ‹œģž‘ ģ‹œ GPU 메모리넼 ź°ģ§€ķ•˜ź³  ģµœģ ģ˜ ģ„¤ģ •ģ„ źµ¬ģ„±ķ•©ė‹ˆė‹¤. ## GPU ķ‹°ģ–“ 구성 -| VRAM | ķ‹°ģ–“ | LM ėŖØė“œ | ģµœėŒ€ ģƒģ„± źøøģ“ | ģµœėŒ€ 배치 크기 | LM 메모리 할당 | -|------|------|---------|--------------|----------------|---------------------| -| ≤4GB | ķ‹°ģ–“ 1 | ģ‚¬ģš© ė¶ˆź°€ | 3ė¶„ | 1 | - | -| 4-6GB | ķ‹°ģ–“ 2 | ģ‚¬ģš© ė¶ˆź°€ | 6ė¶„ | 1 | - | -| 6-8GB | ķ‹°ģ–“ 3 | 0.6B (ģ„ ķƒ 사항) | LM ģ‚¬ģš© ģ‹œ: 4ė¶„ / ėÆøģ‚¬ģš© ģ‹œ: 6ė¶„ | LM ģ‚¬ģš© ģ‹œ: 1 / ėÆøģ‚¬ģš© ģ‹œ: 2 | 3GB | -| 8-12GB | ķ‹°ģ–“ 4 | 0.6B (ģ„ ķƒ 사항) | LM ģ‚¬ģš© ģ‹œ: 4ė¶„ / ėÆøģ‚¬ģš© ģ‹œ: 6ė¶„ | LM ģ‚¬ģš© ģ‹œ: 2 / ėÆøģ‚¬ģš© ģ‹œ: 4 | 3GB | -| 12-16GB | ķ‹°ģ–“ 5 | 0.6B / 1.7B | LM ģ‚¬ģš© ģ‹œ: 4ė¶„ / ėÆøģ‚¬ģš© ģ‹œ: 6ė¶„ | LM ģ‚¬ģš© ģ‹œ: 2 / ėÆøģ‚¬ģš© ģ‹œ: 4 | 0.6B: 3GB, 1.7B: 8GB | -| 16-24GB | ķ‹°ģ–“ 6 | 0.6B / 1.7B / 4B | 8ė¶„ | LM ģ‚¬ģš© ģ‹œ: 4 / ėÆøģ‚¬ģš© ģ‹œ: 8 | 0.6B: 3GB, 1.7B: 8GB, 4B: 12GB | -| ≄24GB | ģ œķ•œ ģ—†ģŒ | ėŖØė“  ėŖØėø | 10ė¶„ | 8 | ģ œķ•œ ģ—†ģŒ | +| VRAM | ķ‹°ģ–“ | LM ėŖØėø | ģ¶”ģ²œ LM | ė°±ģ—”ė“œ | ģµœėŒ€ źøøģ“ (LM ģ‚¬ģš© / ėÆøģ‚¬ģš©) | ģµœėŒ€ 배치 (LM ģ‚¬ģš© / ėÆøģ‚¬ģš©) | ģ˜¤ķ”„ė”œė“œ | ģ–‘ģžķ™” | +|------|------|---------|---------|--------|------------------------------|------------------------------|----------|--------| +| ≤4GB | ķ‹°ģ–“ 1 | ģ—†ģŒ | — | pt | 4ė¶„ / 6ė¶„ | 1 / 1 | CPU + DiT | INT8 | +| 4-6GB | ķ‹°ģ–“ 2 | ģ—†ģŒ | — | pt | 8ė¶„ / 10ė¶„ | 1 / 1 | CPU + DiT | INT8 | +| 6-8GB | ķ‹°ģ–“ 3 | 0.6B | 0.6B | pt | 8ė¶„ / 10ė¶„ | 1 / 2 | CPU + DiT | INT8 | +| 8-12GB | ķ‹°ģ–“ 4 | 0.6B | 0.6B | vllm | 8ė¶„ / 10ė¶„ | 2 / 4 | CPU + DiT | INT8 | +| 12-16GB | ķ‹°ģ–“ 5 | 0.6B, 1.7B | 1.7B | vllm | 8ė¶„ / 10ė¶„ | 2 / 4 | CPU | INT8 | +| 16-20GB | ķ‹°ģ–“ 6a | 0.6B, 1.7B | 1.7B | vllm | 8ė¶„ / 10ė¶„ | 4 / 8 | CPU | INT8 | +| 20-24GB | ķ‹°ģ–“ 6b | 0.6B, 1.7B, 4B | 1.7B | vllm | 8ė¶„ / 8ė¶„ | 4 / 8 | ģ—†ģŒ | ģ—†ģŒ | +| ≄24GB | ģ œķ•œ ģ—†ģŒ | 전첓 (0.6B, 1.7B, 4B) | 4B | vllm | 10ė¶„ / 10ė¶„ | 8 / 8 | ģ—†ģŒ | ģ—†ģŒ | + +### ģ—“ 설명 + +- **LM ėŖØėø**: 핓당 ķ‹°ģ–“ģ—ģ„œ ė”œė“œķ•  수 ģžˆėŠ” 5Hz ģ–øģ–“ ėŖØėø 크기 +- **ģ¶”ģ²œ LM**: UIģ—ģ„œ 핓당 티얓에 źø°ė³ø ģ„ ķƒė˜ėŠ” LM ėŖØėø +- **ė°±ģ—”ė“œ**: LM 추딠 ė°±ģ—”ė“œ (`vllm`ģ€ ģ¶©ė¶„ķ•œ VRAMģ„ 가진 NVIDIA GPU용, `pt`ėŠ” PyTorch ėŒ€ģ²“, `mlx`ėŠ” Apple Silicon용) +- **ģ˜¤ķ”„ė”œė“œ**: + - **CPU + DiT**: ėŖØė“  ėŖØėø(DiT, VAE, ķ…ģŠ¤ķŠø ģøģ½”ė”)ģ„ ėÆøģ‚¬ģš© ģ‹œ CPU딜 ģ˜¤ķ”„ė”œė“œ; DiTė„ 단계 ź°„ ģ˜¤ķ”„ė”œė“œ + - **CPU**: VAE와 ķ…ģŠ¤ķŠø ģøģ½”ė”ė„¼ CPU딜 ģ˜¤ķ”„ė”œė“œ; DiTėŠ” GPU에 ģœ ģ§€ + - **ģ—†ģŒ**: ėŖØė“  ėŖØėøģ„ GPU에 ģœ ģ§€ +- **ģ–‘ģžķ™”**: VRAM ģ‚¬ģš©ėŸ‰ģ„ ģ¤„ģ“źø° ģœ„ķ•“ 기본적으딜 INT8 ź°€ģ¤‘ģ¹˜ ģ–‘ģžķ™”ė„¼ ķ™œģ„±ķ™”ķ• ģ§€ 여부 + +## ģ ģ‘ķ˜• UI źø°ė³ø 설정 + +Gradio UIėŠ” ź°ģ§€ėœ GPU 티얓에 ė”°ė¼ ģžė™ģœ¼ė”œ ģ„¤ģ •ė©ė‹ˆė‹¤: + +- **LM ģ“ˆźø°ķ™” ģ²“ķ¬ė°•ģŠ¤**: LMģ„ ģ§€ģ›ķ•˜ėŠ” ķ‹°ģ–“(ķ‹°ģ–“ 3+)ģ—ģ„œ źø°ė³ø 첓크, ķ‹°ģ–“ 1-2ģ—ģ„œėŠ” 첓크 ķ•“ģ œ ė° ė¹„ķ™œģ„±ķ™” +- **LM ėŖØėø 경딜**: ķ‹°ģ–“ģ˜ ģ¶”ģ²œ ėŖØėøģ“ ģžė™ ģž…ė „; ė“œė”­ė‹¤ģš“ģ—ėŠ” ķ˜øķ™˜ 
ėŖØėøė§Œ ķ‘œģ‹œ +- **ė°±ģ—”ė“œ ė“œė”­ė‹¤ģš“**: ķ‹°ģ–“ 1-3ģ—ģ„œėŠ” `pt`/`mlx`딜 ģ œķ•œ(vllm KV ģŗģ‹œź°€ 메모리넼 ź³¼ė„ķ•˜ź²Œ ģ‚¬ģš©); ķ‹°ģ–“ 4+ģ—ģ„œėŠ” ėŖØė“  ė°±ģ—”ė“œ ģ‚¬ģš© ź°€ėŠ„ +- **CPU ģ˜¤ķ”„ė”œė“œ / DiT ģ˜¤ķ”„ė”œė“œ**: ė‚®ģ€ ķ‹°ģ–“ģ—ģ„œ źø°ė³ø ķ™œģ„±ķ™”, ė†’ģ€ ķ‹°ģ–“ģ—ģ„œ ė¹„ķ™œģ„±ķ™” +- **ģ–‘ģžķ™”**: ķ‹°ģ–“ 1-6aģ—ģ„œ źø°ė³ø ķ™œģ„±ķ™”, ķ‹°ģ–“ 6b+ģ—ģ„œ ė¹„ķ™œģ„±ķ™”(ģ¶©ė¶„ķ•œ VRAM) +- **ėŖØėø ģ»“ķŒŒģ¼**: ėŖØė“  ķ‹°ģ–“ģ—ģ„œ źø°ė³ø ķ™œģ„±ķ™”(ģ–‘ģžķ™”ģ— ķ•„ģš”) + +ķ˜øķ™˜ė˜ģ§€ ģ•ŠėŠ” ģ˜µģ…˜ģ„ ģˆ˜ė™ģœ¼ė”œ ģ„ ķƒķ•œ 경우(예: 6GB GPUģ—ģ„œ vllm ģ‚¬ģš© ģ‹œė„), ģ‹œģŠ¤ķ…œģ“ 경고넼 ķ‘œģ‹œķ•˜ź³  ķ˜øķ™˜ ź°€ėŠ„ķ•œ ģ„¤ģ •ģœ¼ė”œ ģžė™ ėŒ€ģ²“ķ•©ė‹ˆė‹¤. + +## ėŸ°ķƒ€ģž„ ģ•ˆģ „ 기늄 + +- **VRAM ź°€ė“œ**: 각 추딠 전에 VRAM ģš”źµ¬ ģ‚¬ķ•­ģ„ ģ¶”ģ •ķ•˜ź³  ķ•„ģš” ģ‹œ 배치 크기넼 ģžė™ ģ¶•ģ†Œ +- **ģ ģ‘ķ˜• VAE 디코딩**: 3단계 ėŒ€ģ²“: GPU ķƒ€ģ¼ 디코딩 → GPU 디코딩+CPU ģ˜¤ķ”„ė”œė“œ → 완전 CPU 디코딩 +- **ģžė™ 청크 크기**: VAE 디코딩 청크 크기가 ģ‚¬ģš© ź°€ėŠ„ķ•œ ģ—¬ģœ  VRAM에 ģ ģ‘(64/128/256/512/1024/1536) +- **źøøģ“/배치 ķ“ėžØķ•‘**: ķ‹°ģ–“ ģ œķ•œģ„ ģ“ˆź³¼ķ•˜ėŠ” ź°’ģ„ ģš”ģ²­ķ•˜ė©“ 경고와 ķ•Øź»˜ ģžė™ ģ”°ģ • ## ģ°øź³  사항 -- **źø°ė³ø 설정**ģ€ ź°ģ§€ėœ GPU 메모리에 ė”°ė¼ ģžė™ģœ¼ė”œ źµ¬ģ„±ė©ė‹ˆė‹¤. -- **LM ėŖØė“œ**ėŠ” Chain-of-Thought ģƒģ„± ė° ģ˜¤ė””ģ˜¤ ģ“ķ•“ģ— ģ‚¬ģš©ė˜ėŠ” ģ–øģ–“ ėŖØėøģ„ ģ˜ėÆøķ•©ė‹ˆė‹¤. -- ģµœģ ģ˜ ģ„±ėŠ„ģ„ ģœ„ķ•“ **Flash Attention**, **CPU Offload**, **Compile**, **Quantization**ģ“ 기본적으딜 ķ™œģ„±ķ™”ė©ė‹ˆė‹¤. -- GPU ģ œķ•œģ„ ģ“ˆź³¼ķ•˜ėŠ” źøøģ“ ė˜ėŠ” 배치 크기넼 ģš”ģ²­ķ•˜ė©“ 경고가 ķ‘œģ‹œė˜ź³  ź°’ģ“ ģ œķ•œė©ė‹ˆė‹¤. -- **ģ œģ•½ 디코딩 (Constrained Decoding)**: LMģ“ ģ“ˆźø°ķ™”ė˜ė©“ LMģ˜ źøøģ“ ģƒģ„±ė„ GPU ķ‹°ģ–“ģ˜ ģµœėŒ€ źøøģ“ ģ œķ•œģœ¼ė”œ ģ œģ•½ė˜ģ–“ CoT ģƒģ„± 중 메모리 부씱(OOM) ģ—ėŸ¬ė„¼ ė°©ģ§€ķ•©ė‹ˆė‹¤. -- VRAMģ“ 6GB ģ“ķ•˜ģø GPUģ˜ 경우, DiT ėŖØėøģ˜ 메모리 확볓넼 ģœ„ķ•“ LM ģ“ˆźø°ķ™”ź°€ 기본적으딜 ė¹„ķ™œģ„±ķ™”ė©ė‹ˆė‹¤. -- 명령줄 ģøģž(CLI) ė˜ėŠ” Gradio UI넼 통핓 ģ„¤ģ •ģ„ ģˆ˜ė™ģœ¼ė”œ ė¬“ģ‹œķ•  수 ģžˆģŠµė‹ˆė‹¤. +- **źø°ė³ø 설정**ģ€ ź°ģ§€ėœ GPU 메모리에 ė”°ė¼ ģžė™ģœ¼ė”œ źµ¬ģ„±ė©ė‹ˆė‹¤ +- **LM ėŖØė“œ**ėŠ” Chain-of-Thought ģƒģ„± ė° ģ˜¤ė””ģ˜¤ ģ“ķ•“ģ— ģ‚¬ģš©ė˜ėŠ” ģ–øģ–“ ėŖØėøģ„ ģ˜ėÆøķ•©ė‹ˆė‹¤ +- **Flash Attention**ģ€ ģžė™ ź°ģ§€ė˜ė©° ģ‚¬ģš© ź°€ėŠ„ķ•  ė•Œ ķ™œģ„±ķ™”ė©ė‹ˆė‹¤ +- **ģ œģ•½ 디코딩**: LMģ“ ģ“ˆźø°ķ™”ė˜ė©“ LMģ˜ źøøģ“ ģƒģ„±ė„ GPU ķ‹°ģ–“ģ˜ ģµœėŒ€ źøøģ“ ģ œķ•œģœ¼ė”œ ģ œģ•½ė˜ģ–“ CoT ģƒģ„± 중 OOM ģ—ėŸ¬ė„¼ ė°©ģ§€ķ•©ė‹ˆė‹¤ +- VRAMģ“ 6GB ģ“ķ•˜ģø GPU(ķ‹°ģ–“ 1-2)ģ˜ 경우, DiT ėŖØėøģ˜ 메모리 확볓넼 ģœ„ķ•“ LM ģ“ˆźø°ķ™”ź°€ 기본적으딜 ė¹„ķ™œģ„±ķ™”ė©ė‹ˆė‹¤ +- CLI ģøģž ė˜ėŠ” Gradio UI넼 통핓 ģ„¤ģ •ģ„ ģˆ˜ė™ģœ¼ė”œ ė¬“ģ‹œķ•  수 ģžˆģŠµė‹ˆė‹¤ -> **ģ»¤ė®¤ė‹ˆķ‹° źø°ģ—¬ ķ™˜ģ˜**: ģœ„ģ˜ GPU ķ‹°ģ–“ źµ¬ģ„±ģ€ ģ¼ė°˜ģ ģø ķ•˜ė“œģ›Øģ–“ģ—ģ„œģ˜ ķ…ŒģŠ¤ķŠøė„¼ ė°”ķƒ•ģœ¼ė”œ ķ•©ė‹ˆė‹¤. ģ‚¬ģš© ģ¤‘ģø ģž„ģ¹˜ģ˜ ģ‹¤ģ œ ģ„±ėŠ„ģ“ ģ“ ķŒŒė¼ėÆøķ„°ģ™€ 다넓다멓(예: ė” źø“ źøøģ“ė‚˜ ė” 큰 배치넼 ģ²˜ė¦¬ķ•  수 ģžˆėŠ” 경우), ė” ģ² ģ €ķ•œ ķ…ŒģŠ¤ķŠøė„¼ ģˆ˜ķ–‰ķ•˜ź³  `acestep/gpu_config.py`ģ—ģ„œ ģ“ėŸ¬ķ•œ źµ¬ģ„±ģ„ ģµœģ ķ™”ķ•˜źø° ģœ„ķ•œ PRģ„ ģ œģ¶œķ•“ ģ£¼ģ‹œźø° ė°”ėžė‹ˆė‹¤. ģ—¬ėŸ¬ė¶„ģ˜ 기여가 ėŖØė“  ģ‚¬ģš©ģžģ˜ ź²½ķ—˜ģ„ ź°œģ„ ķ•˜ėŠ” ė° ė„ģ›€ģ“ ė©ė‹ˆė‹¤! +> **ģ»¤ė®¤ė‹ˆķ‹° źø°ģ—¬ ķ™˜ģ˜**: ģœ„ģ˜ GPU ķ‹°ģ–“ źµ¬ģ„±ģ€ ģ¼ė°˜ģ ģø ķ•˜ė“œģ›Øģ–“ģ—ģ„œģ˜ ķ…ŒģŠ¤ķŠøė„¼ ė°”ķƒ•ģœ¼ė”œ ķ•©ė‹ˆė‹¤. ģ‚¬ģš© ģ¤‘ģø ģž„ģ¹˜ģ˜ ģ‹¤ģ œ ģ„±ėŠ„ģ“ ģ“ ķŒŒė¼ėÆøķ„°ģ™€ 다넓다멓, ė” ģ² ģ €ķ•œ ķ…ŒģŠ¤ķŠøė„¼ ģˆ˜ķ–‰ķ•˜ź³  `acestep/gpu_config.py`ģ—ģ„œ źµ¬ģ„±ģ„ ģµœģ ķ™”ķ•˜źø° ģœ„ķ•œ PRģ„ ģ œģ¶œķ•“ ģ£¼ģ‹œźø° ė°”ėžė‹ˆė‹¤. ## 메모리 ģµœģ ķ™” 팁 -1. **ė‚®ģ€ VRAM (8GB 미만)**: ģµœėŒ€ źøøģ“ė„¼ ķ™•ė³“ķ•˜ė ¤ė©“ LM ģ“ˆźø°ķ™” ģ—†ģ“ DiT ģ „ģš© ėŖØė“œė„¼ ģ‚¬ģš©ķ•˜ģ„øģš”. -2. 
**중간 VRAM (8-16GB)**: ķ’ˆģ§ˆź³¼ ė©”ėŖØė¦¬ģ˜ ģµœģ ģ˜ ź· ķ˜•ģ„ ģœ„ķ•“ 0.6B LM ėŖØėøģ„ ģ‚¬ģš©ķ•˜ģ„øģš”. -3. **ė†’ģ€ VRAM (16GB 쓈과)**: ė” ė‚˜ģ€ ģ˜¤ė””ģ˜¤ ģ“ķ•“ ė° ģƒģ„± ķ’ˆģ§ˆģ„ ģœ„ķ•“ ė” 큰 LM ėŖØėø(1.7B/4B)ģ„ ķ™œģ„±ķ™”ķ•˜ģ„øģš”. +1. **ģ“ˆģ € VRAM (≤6GB)**: LM ģ“ˆźø°ķ™” ģ—†ģ“ DiT ģ „ģš© ėŖØė“œė„¼ ģ‚¬ģš©. INT8 ģ–‘ģžķ™”ģ™€ 완전 CPU ģ˜¤ķ”„ė”œė“œź°€ ķ•„ģˆ˜. VAE ė””ģ½”ė”©ģ“ ģžė™ģœ¼ė”œ CPU딜 ėŒ€ģ²“ė  수 ģžˆģŠµė‹ˆė‹¤. +2. **ģ € VRAM (6-8GB)**: `pt` ė°±ģ—”ė“œė”œ 0.6B LM ėŖØėø ģ‚¬ģš© ź°€ėŠ„. ģ˜¤ķ”„ė”œė“œė„¼ ķ™œģ„± 상태딜 ģœ ģ§€ķ•˜ģ„øģš”. +3. **중간 VRAM (8-16GB)**: 0.6B ė˜ėŠ” 1.7B LM ėŖØėøģ„ ģ‚¬ģš©. ķ‹°ģ–“ 4+ģ—ģ„œ `vllm` ė°±ģ—”ė“œź°€ ģž˜ ģž‘ė™ķ•©ė‹ˆė‹¤. +4. **ė†’ģ€ VRAM (16-24GB)**: ė” 큰 LM ėŖØėø(1.7B ģ¶”ģ²œ)ģ„ ķ™œģ„±ķ™”. 20GB+ģ—ģ„œėŠ” ģ–‘ģžķ™”ź°€ ģ„ ķƒ ģ‚¬ķ•­ģ“ ė©ė‹ˆė‹¤. +5. **쓈고 VRAM (≄24GB)**: ėŖØė“  ėŖØėøģ“ ģ˜¤ķ”„ė”œė“œė‚˜ ģ–‘ģžķ™” ģ—†ģ“ ģž‘ė™. 최고 ķ’ˆģ§ˆģ„ ģœ„ķ•“ 4B LMģ„ ģ‚¬ģš©ķ•˜ģ„øģš”. ## 디버그 ėŖØė“œ: 다넸 GPU 구성 ģ‹œė®¬ė ˆģ“ģ…˜ @@ -40,17 +73,81 @@ ACE-Step 1.5ėŠ” GPUģ˜ ģ‚¬ģš© ź°€ėŠ„ķ•œ VRAM에 ģžė™ģœ¼ė”œ ģ ģ‘ķ•˜ģ—¬ ģƒģ„± # 4GB GPU ģ‹œė®¬ė ˆģ“ģ…˜ (ķ‹°ģ–“ 1) MAX_CUDA_VRAM=4 uv run acestep +# 6GB GPU ģ‹œė®¬ė ˆģ“ģ…˜ (ķ‹°ģ–“ 2) +MAX_CUDA_VRAM=6 uv run acestep + # 8GB GPU ģ‹œė®¬ė ˆģ“ģ…˜ (ķ‹°ģ–“ 4) MAX_CUDA_VRAM=8 uv run acestep # 12GB GPU ģ‹œė®¬ė ˆģ“ģ…˜ (ķ‹°ģ–“ 5) MAX_CUDA_VRAM=12 uv run acestep -# 16GB GPU ģ‹œė®¬ė ˆģ“ģ…˜ (ķ‹°ģ–“ 6) +# 16GB GPU ģ‹œė®¬ė ˆģ“ģ…˜ (ķ‹°ģ–“ 6a) MAX_CUDA_VRAM=16 uv run acestep ``` +`MAX_CUDA_VRAM`ģ„ ģ„¤ģ •ķ•˜ė©“ ģ‹œģŠ¤ķ…œģ€ `torch.cuda.set_per_process_memory_fraction()`ģ„ ķ˜øģ¶œķ•˜ģ—¬ VRAM ķ•˜ė“œ ģŗ”ģ„ ź°•ģ œķ•˜ė©°, 고사양 GPUģ—ģ„œė„ ķ˜„ģ‹¤ģ ģø ģ‹œė®¬ė ˆģ“ģ…˜ģ„ ģ œź³µķ•©ė‹ˆė‹¤. + +### ģžė™ ķ‹°ģ–“ ķ…ŒģŠ¤ķŠø + +UIģ—ģ„œ 각 티얓넼 ģˆ˜ė™ģœ¼ė”œ ķ…ŒģŠ¤ķŠøķ•˜ėŠ” ėŒ€ģ‹ , `profile_inference.py`ģ˜ `tier-test` ėŖØė“œė„¼ ģ‚¬ģš©ķ•  수 ģžˆģŠµė‹ˆė‹¤: + +```bash +# ėŖØė“  ķ‹°ģ–“ ģžė™ ķ…ŒģŠ¤ķŠø +python profile_inference.py --mode tier-test + +# ķŠ¹ģ • ķ‹°ģ–“ ķ…ŒģŠ¤ķŠø +python profile_inference.py --mode tier-test --tiers 6 8 16 + +# LM ķ™œģ„±ķ™”ķ•˜ģ—¬ ķ…ŒģŠ¤ķŠø (ģ§€ģ›ė˜ėŠ” ķ‹°ģ–“ģ—ģ„œ) +python profile_inference.py --mode tier-test --tier-with-lm + +# 빠넸 ķ…ŒģŠ¤ķŠø (ė¹„ģ–‘ģžķ™” ķ‹°ģ–“ģ—ģ„œ torch.compile ź±“ė„ˆė›°źø°) +python profile_inference.py --mode tier-test --tier-skip-compile +``` + +ķ”„ė”œķŒŒģ¼ė§ ė„źµ¬ģ˜ 전첓 ė¬øģ„œėŠ” [BENCHMARK.md](BENCHMARK.md)넼 ģ°øģ”°ķ•˜ģ„øģš”. + ģ“ėŠ” ė‹¤ģŒź³¼ ź°™ģ€ ź²½ģš°ģ— ģœ ģš©ķ•©ė‹ˆė‹¤: - 고사양 ķ•˜ė“œģ›Øģ–“ģ—ģ„œ GPU ķ‹°ģ–“ 구성 ķ…ŒģŠ¤ķŠø - 각 티얓에 ėŒ€ķ•“ 경고 ė° ģ œķ•œģ“ ģ˜¬ė°”ė„“ź²Œ ģž‘ė™ķ•˜ėŠ”ģ§€ ķ™•ģø -- PR 제출 ģ „ 새딜욓 GPU 구성 ķŒŒė¼ėÆøķ„° 개발 ė° ķ…ŒģŠ¤ķŠø +- `acestep/gpu_config.py` ģˆ˜ģ • 후 ģžė™ ķšŒź·€ ķ…ŒģŠ¤ķŠø +- CI/CD VRAM ķ˜øķ™˜ģ„± ź²€ģ¦ + +### 경계 ķ…ŒģŠ¤ķŠø (ģµœģ†Œ ķ‹°ģ–“ 찾기) + +`--tier-boundary`넼 ģ‚¬ģš©ķ•˜ė©“ INT8 ģ–‘ģžķ™”ģ™€ CPU ģ˜¤ķ”„ė”œė“œė„¼ ģ•ˆģ „ķ•˜ź²Œ ė¹„ķ™œģ„±ķ™”ķ•  수 ģžˆėŠ” ģµœģ†Œ VRAM 티얓넼 ģ‹¤ķ—˜ģ ģœ¼ė”œ ķ™•ģøķ•  수 ģžˆģŠµė‹ˆė‹¤. 각 티얓에 ėŒ€ķ•“ ģµœėŒ€ 3가지 źµ¬ģ„±ģœ¼ė”œ ķ…ŒģŠ¤ķŠøķ•©ė‹ˆė‹¤: + +1. **default** — ķ‹°ģ–“ģ˜ źø°ė³ø 설정 (ģ–‘ģžķ™” + ģ˜¤ķ”„ė”œė“œė„¼ źµ¬ģ„±ėŒ€ė”œ ģ‚¬ģš©) +2. **no-quant** — ģ˜¤ķ”„ė”œė“œ ģ„¤ģ •ģ€ ģœ ģ§€ķ•˜ė˜ ģ–‘ģžķ™” ė¹„ķ™œģ„±ķ™” +3. 
**no-offload** — ģ–‘ģžķ™” ģ—†ģŒ, CPU ģ˜¤ķ”„ė”œė“œ ģ—†ģŒ (ėŖØė“  ėŖØėøģ„ GPU에 ģœ ģ§€) + +```bash +# ėŖØė“  ķ‹°ģ–“ģ—ģ„œ 경계 ķ…ŒģŠ¤ķŠø 실행 +python profile_inference.py --mode tier-test --tier-boundary + +# ķŠ¹ģ • ķ‹°ģ–“ģ˜ 경계 ķ…ŒģŠ¤ķŠø +python profile_inference.py --mode tier-test --tier-boundary --tiers 8 12 16 20 24 + +# LM ķ™œģ„±ķ™”ėœ 경계 ķ…ŒģŠ¤ķŠø (ģ§€ģ›ė˜ėŠ” ķ‹°ģ–“ģ—ģ„œ) +python profile_inference.py --mode tier-test --tier-boundary --tier-with-lm + +# 결과넼 JSON으딜 ģ €ģž„ +python profile_inference.py --mode tier-test --tier-boundary --benchmark-output boundary_results.json +``` + +> **ģ°øź³ :** 경계 ķ…ŒģŠ¤ķŠø ź²°ź³¼ėŠ” ź²½ķ—˜ģ ģ“ė©°, DiT ėŖØėø ė³€ķ˜• (turbo vs base), LM ķ™œģ„±ķ™” 여부, ģƒģ„± ģ‹œź°„, flash attention ź°€ģš©ģ„±ģ— ė”°ė¼ ė‹¬ė¼ģ§ˆ 수 ģžˆģŠµė‹ˆė‹¤. + +### 배치 크기 경계 ķ…ŒģŠ¤ķŠø + +`--tier-batch-boundary`넼 ģ‚¬ģš©ķ•˜ģ—¬ 배치 크기 1, 2, 4, 8ģ„ ė‹Øź³„ģ ģœ¼ė”œ ķ…ŒģŠ¤ķŠøķ•˜ģ—¬ 각 ķ‹°ģ–“ģ˜ ģµœėŒ€ ģ•ˆģ „ 배치 크기넼 ģ°¾ģŠµė‹ˆė‹¤: + +```bash +# LM ķ™œģ„±ķ™” ģƒķƒœģ—ģ„œ 배치 경계 ķ…ŒģŠ¤ķŠø 실행 +python profile_inference.py --mode tier-test --tier-batch-boundary --tier-with-lm + +# ķŠ¹ģ • ķ‹°ģ–“ ķ…ŒģŠ¤ķŠø +python profile_inference.py --mode tier-test --tier-batch-boundary --tier-with-lm --tiers 8 12 16 24 +``` + +LM ģ‚¬ģš©/ėÆøģ‚¬ģš© 두 가지 źµ¬ģ„±ģ„ 모두 ķ…ŒģŠ¤ķŠøķ•˜ź³  각 ķ‹°ģ–“ģ˜ ģµœėŒ€ 성공 배치 크기넼 ė³“ź³ ķ•©ė‹ˆė‹¤. diff --git a/docs/ko/INFERENCE.md b/docs/ko/INFERENCE.md index 6ec98f31..47ab56ff 100644 --- a/docs/ko/INFERENCE.md +++ b/docs/ko/INFERENCE.md @@ -267,3 +267,4 @@ result = generate_music(dit_handler, llm_handler, params, config, save_dir="/out 2. **Turbo ėŖØėø ķ™œģš©**: 빠넸 반복 ģž‘ģ—…ģ—ėŠ” `turbo` ėŖØėøģ„ ģ‚¬ģš©ķ•˜ėŠ” ź²ƒģ“ ķšØģœØģ ģž…ė‹ˆė‹¤. 3. **Thinking ėŖØė“œ**: ė” ė…¼ė¦¬ģ ģø ģŒģ•… 구씰가 ķ•„ģš”ķ•  ė•Œ `thinking=True`넼 ģ‚¬ģš©ķ•˜ė˜, 메모리가 ė¶€ģ”±ķ•˜ė©“ 끌 수 ģžˆģŠµė‹ˆė‹¤. 4. **ź²°ź³¼ 반복**: 배치 크기넼 2-4딜 ģ„¤ģ •ķ•˜ģ—¬ ģ—¬ėŸ¬ ė²„ģ „ģ„ ķ•œ ė²ˆģ— 듣고 ģµœģ ģ˜ 결과넼 ź³ ė„“ėŠ” ź²ƒģ“ ģ¢‹ģŠµė‹ˆė‹¤. +5. **메모리 ꓀리**: ACE-Step 1.5ėŠ” ģžė™ VRAM ꓀리넼 ķ¬ķ•Øķ•©ė‹ˆė‹¤ — VRAM ź°€ė“œ(ģžė™ 배치 ģ¶•ģ†Œ), ģ ģ‘ķ˜• VAE 디코딩(CPU ėŒ€ģ²“), ģžė™ 청크 크기 ģ”°ģ •. OOMģ“ ė°œģƒķ•˜ė©“ ģ‹œģŠ¤ķ…œģ“ ģžė™ģœ¼ė”œ ģ²˜ė¦¬ķ•©ė‹ˆė‹¤. 각 VRAM ķ‹°ģ–“ģ˜ ź¶Œģž„ ģ„¤ģ •ģ€ [GPU_COMPATIBILITY.md](../ko/GPU_COMPATIBILITY.md)넼 ģ°øģ”°ķ•˜ģ„øģš”. \ No newline at end of file diff --git a/docs/zh/BENCHMARK.md b/docs/zh/BENCHMARK.md index a75a7ca6..dc7096e6 100644 --- a/docs/zh/BENCHMARK.md +++ b/docs/zh/BENCHMARK.md @@ -26,6 +26,7 @@ |------|------| | `profile` | åÆ¹å•ę¬”ē”Ÿęˆčæ›č”ŒčÆ¦ē»†ēš„č®”ę—¶åˆ†ęž | | `benchmark` | čæč”Œé…ē½®ēŸ©é˜µļ¼ˆę—¶é•æ Ɨ ę‰¹é‡ Ɨ ꀝ考 Ɨ ę­„ę•°ļ¼‰ļ¼Œč¾“å‡ŗę±‡ę€»č”Ø | +| `tier-test` | é€ščæ‡ `MAX_CUDA_VRAM` ęØ”ę‹ŸäøåŒę˜¾å­˜å¤§å°ļ¼Œč‡ŖåŠØęµ‹čÆ•ę‰€ęœ‰ GPU 等级 | | `understand` | åˆ†ęž `understand_music()` APIļ¼ˆéŸ³é¢‘ → å…ƒę•°ę®ęå–ļ¼‰ | | `create_sample` | åˆ†ęž `create_sample()` APIļ¼ˆēµę„Ÿ/ē®€å•ęØ”å¼ļ¼‰ | | `format_sample` | åˆ†ęž `format_sample()` APIļ¼ˆę ‡é¢˜+ę­ŒčÆ → ē»“ęž„åŒ–å…ƒę•°ę®ļ¼‰ | @@ -156,6 +157,84 @@ python profile_inference.py --mode create_sample --instrumental python profile_inference.py --mode format_sample ``` +### 6. 
`tier-test` — č‡ŖåŠØåŒ– GPU 等级测试 + +使用 `MAX_CUDA_VRAM` č‡ŖåŠØęØ”ę‹ŸäøåŒēš„ GPU ę˜¾å­˜å¤§å°ļ¼Œå¹¶åœØęÆäøŖē­‰ēŗ§čæč”Œē”Ÿęˆęµ‹čÆ•ć€‚čæ™ę˜Æäæ®ę”¹ `acestep/gpu_config.py` åŽéŖŒčÆę‰€ęœ‰ GPU ē­‰ēŗ§ę˜Æå¦ę­£åøøå·„ä½œēš„ęŽØčę–¹å¼ć€‚ + +```bash +# ęµ‹čÆ•ę‰€ęœ‰ē­‰ēŗ§ (4, 6, 8, 12, 16, 20, 24 GB) +python profile_inference.py --mode tier-test + +# ęµ‹čÆ•ē‰¹å®šę˜¾å­˜å¤§å° +python profile_inference.py --mode tier-test --tiers 6 8 16 + +# 启用 LM ęµ‹čÆ•ļ¼ˆåœØę”ÆęŒēš„ē­‰ēŗ§äøŠļ¼‰ +python profile_inference.py --mode tier-test --tier-with-lm + +# åæ«é€Ÿęµ‹čÆ•ļ¼šéžé‡åŒ–ē­‰ēŗ§č·³čæ‡ torch.compile +python profile_inference.py --mode tier-test --tier-skip-compile +``` + +**ęÆäøŖē­‰ēŗ§éŖŒčÆēš„å†…å®¹ļ¼š** +- ę­£ē”®ēš„ē­‰ēŗ§ę£€ęµ‹å’Œ `GPUConfig` ęž„å»ŗ +- ęØ”åž‹åˆå§‹åŒ–ļ¼ˆDiT态VAEć€ę–‡ęœ¬ē¼–ē å™Øļ¼ŒåÆé€‰ LM) +- ēŸ­ę—¶é—“ē”Ÿęˆļ¼ˆ30ē§’ę—¶é•æļ¼Œbatch=1)无 OOM 完成 +- 自适应 VAE č§£ē å›žé€€ļ¼ˆGPU → CPU åøč½½ → å®Œå…Ø CPU) +- ę˜¾å­˜ä½æē”ØäæęŒåœØęØ”ę‹Ÿé™åˆ¶å†… + +**č¾“å‡ŗē¤ŗä¾‹ļ¼š** + +``` +TIER TEST RESULTS +==================================================================================================== + VRAM Tier LM Duration Status Peak VRAM Notes + ────────────────────────────────────────────────────────────────────────────── + 4GB tier1 — 30s āœ… OK 3.8GB VAE 在 CPU 上解码 + 6GB tier2 — 30s āœ… OK 5.4GB åˆ†ē‰‡ VAE chunk=256 + 8GB tier4 0.6B 30s āœ… OK 7.2GB vllm åŽē«Æ + 12GB tier5 1.7B 30s āœ… OK 10.8GB vllm åŽē«Æ + 16GB tier6a 1.7B 30s āœ… OK 14.5GB åÆē”Øåøč½½ + 20GB tier6b 1.7B 30s āœ… OK 17.2GB ę— åøč½½ + 24GB unlimited 4B 30s āœ… OK 21.3GB ę‰€ęœ‰ęØ”åž‹åœØ GPU 上 +``` + +> **ę³Øę„**: `tier-test` ęØ”å¼ä½æē”Ø `torch.cuda.set_per_process_memory_fraction()` å¼ŗåˆ¶ę‰§č”Œę˜¾å­˜ē”¬äøŠé™ļ¼Œå³ä½æåœØé«˜ē«Æ GPUļ¼ˆå¦‚ A100 80GBļ¼‰äøŠä¹Ÿčƒ½å®žēŽ°ēœŸå®žēš„ęØ”ę‹Ÿć€‚ + +#### č¾¹ē•Œęµ‹čÆ• + +使用 `--tier-boundary` ęŸ„ę‰¾åÆä»„å®‰å…Øå…³é—­ INT8 量化和 CPU åøč½½ēš„ęœ€ä½Žę˜¾å­˜ē­‰ēŗ§ć€‚åÆ¹ęÆäøŖē­‰ēŗ§ęœ€å¤šęµ‹čÆ•äø‰ē§é…ē½®ļ¼š + +1. **default** — ē­‰ēŗ§ēš„ę ‡å‡†č®¾ē½® +2. **no-quant** — å…³é—­é‡åŒ–ļ¼Œåøč½½äøå˜ +3. 
**no-offload** — äøä½æē”Øé‡åŒ–ļ¼Œä¹Ÿäøä½æē”Ø CPU åøč½½ + +```bash +# åœØę‰€ęœ‰ē­‰ēŗ§čæč”Œč¾¹ē•Œęµ‹čÆ• +python profile_inference.py --mode tier-test --tier-boundary + +# 启用 LM ēš„č¾¹ē•Œęµ‹čÆ• +python profile_inference.py --mode tier-test --tier-boundary --tier-with-lm + +# å°†č¾¹ē•Œęµ‹čÆ•ē»“ęžœäæå­˜äøŗ JSON +python profile_inference.py --mode tier-test --tier-boundary --benchmark-output boundary_results.json +``` + +č¾“å‡ŗåŒ…å«äø€äøŖ **č¾¹ē•Œåˆ†ęž** ę‘˜č¦ļ¼Œę˜¾ē¤ŗęÆē§čƒ½åŠ›ēš„ęœ€ä½Žē­‰ēŗ§ć€‚ + +#### ę‰¹ę¬”å¤§å°č¾¹ē•Œęµ‹čÆ• + +使用 `--tier-batch-boundary` ęŸ„ę‰¾ęÆäøŖē­‰ēŗ§ēš„ęœ€å¤§å®‰å…Øę‰¹ę¬”å¤§å°ć€‚åÆ¹ęÆäøŖē­‰ēŗ§ļ¼Œå·„å…·ä¼šé€’čæ›ęµ‹čÆ•ę‰¹ę¬”å¤§å° 1态2态4态8ļ¼ˆåœØé¦–ę¬” OOM ę—¶åœę­¢ļ¼‰ļ¼ŒåŒę—¶ęµ‹čÆ•åÆē”Ø LM å’ŒęœŖåÆē”Ø LM ēš„é…ē½®ļ¼š + +```bash +# čæč”Œę‰¹ę¬”č¾¹ē•Œęµ‹čÆ• +python profile_inference.py --mode tier-test --tier-batch-boundary --tier-with-lm + +# ęµ‹čÆ•ē‰¹å®šē­‰ēŗ§ +python profile_inference.py --mode tier-test --tier-batch-boundary --tier-with-lm --tiers 8 12 16 24 +``` + +č¾“å‡ŗåŒ…å«äø€äøŖ **ę‰¹ę¬”č¾¹ē•Œę‘˜č¦**ļ¼Œę˜¾ē¤ŗęÆäøŖē­‰ēŗ§åœØęœ‰ LM å’Œę—  LM é…ē½®äø‹ēš„ęœ€å¤§ęˆåŠŸę‰¹ę¬”å¤§å°ć€‚ + --- ## å‘½ä»¤č”Œå‚ę•° @@ -209,12 +288,22 @@ python profile_inference.py --mode format_sample | å‚ę•° | é»˜č®¤å€¼ | čÆ“ę˜Ž | |------|--------|------| -| `--mode` | `profile` | ęØ”å¼ļ¼š`profile` / `benchmark` / `understand` / `create_sample` / `format_sample` | +| `--mode` | `profile` | ęØ”å¼ļ¼š`profile` / `benchmark` / `tier-test` / `understand` / `create_sample` / `format_sample` | | `--no-warmup` | 关闭 | č·³čæ‡é¢„ēƒ­ | | `--detailed` | 关闭 | 启用 `cProfile` å‡½ę•°ēŗ§åˆ†ęž | | `--llm-debug` | 关闭 | 深度 LLM č°ƒčÆ•ļ¼ˆtoken ę•°é‡ć€åžåé‡ļ¼‰ | | `--benchmark-output` | ꗠ | äæå­˜åŸŗå‡†ęµ‹čÆ•ē»“ęžœäøŗ JSON ꖇ件 | +### 等级测试选锹 + +| å‚ę•° | é»˜č®¤å€¼ | čÆ“ę˜Ž | +|------|--------|------| +| `--tiers` | `4 6 8 12 16 20 24` | č¦ęØ”ę‹Ÿēš„ę˜¾å­˜å¤§å°ļ¼ˆGB) | +| `--tier-with-lm` | 关闭 | åœØę”ÆęŒēš„ē­‰ēŗ§äøŠåÆē”Ø LM 初始化 | +| `--tier-skip-compile` | 关闭 | éžé‡åŒ–ē­‰ēŗ§č·³čæ‡ `torch.compile` ä»„åŠ é€Ÿčæ­ä»£ | +| `--tier-boundary` | 关闭 | åÆ¹ęÆäøŖē­‰ēŗ§ęµ‹čÆ• no-quant 和 no-offload å˜ä½“ļ¼ŒęŸ„ę‰¾ęœ€ä½Žčƒ½åŠ›č¾¹ē•Œ | +| `--tier-batch-boundary` | 关闭 | åÆ¹ęÆäøŖē­‰ēŗ§ęµ‹čÆ•ę‰¹ę¬”å¤§å° 1态2态4态8ļ¼ŒęŸ„ę‰¾ęœ€å¤§å®‰å…Øę‰¹ę¬”å¤§å° | + ### 输兄选锹 | å‚ę•° | é»˜č®¤å€¼ | čÆ“ę˜Ž | @@ -340,6 +429,10 @@ TIME COSTS BREAKDOWN 4. **使用代蔨性时长测试** — ēŸ­ę—¶é•æļ¼ˆ30s)仄 LLM č€—ę—¶äøŗäø»ļ¼›é•æę—¶é•æļ¼ˆ240s+)仄 DiT 耗时为主。 -5. **GPU ę˜¾å­˜č‡ŖåŠØé€‚é…** — benchmark ęØ”å¼ä¼šč‡ŖåŠØå°†ę—¶é•æå’Œę‰¹é‡å¤§å°č£å‰Ŗåˆ° GPU åÆå¤„ē†ēš„čŒƒå›“ć€‚ +5. **GPU ę˜¾å­˜č‡ŖåŠØé€‚é…** — benchmark ęØ”å¼ä¼šč‡ŖåŠØå°†ę—¶é•æå’Œę‰¹é‡å¤§å°č£å‰Ŗåˆ° GPU åÆå¤„ē†ēš„čŒƒå›“ļ¼Œä½æē”Ø `acestep/gpu_config.py` äø­ēš„č‡Ŗé€‚åŗ”ē­‰ēŗ§ē³»ē»Ÿć€‚ 6. **č°Øę…Žä½æē”Ø `--detailed`** — `cProfile` ä¼šå¢žåŠ å¼€é”€ļ¼›ä»…åœØéœ€č¦č°ƒęŸ„å‡½ę•°ēŗ§ē“¶é¢ˆę—¶ä½æē”Øć€‚ + +7. **使用 `tier-test` čæ›č”Œå›žå½’ęµ‹čÆ•** — 修改 GPU ē­‰ēŗ§é…ē½®åŽļ¼Œčæč”Œ `--mode tier-test` éŖŒčÆę‰€ęœ‰ē­‰ēŗ§ä»ē„¶ę­£åøøå·„ä½œć€‚čæ™åœØę›“ę”¹åøč½½é˜ˆå€¼ć€ę—¶é•æé™åˆ¶ęˆ– LM ęØ”åž‹åÆē”Øę€§ę—¶å°¤äøŗé‡č¦ć€‚ + +8. 
**ēœŸå®žęØ”ę‹Ÿä½Žę˜¾å­˜** — 使用 `MAX_CUDA_VRAM` ę—¶ļ¼Œē³»ē»Ÿé€ščæ‡ `set_per_process_memory_fraction()` å¼ŗåˆ¶ę‰§č”Œę˜¾å­˜ē”¬äøŠé™ļ¼Œå› ę­¤ęØ”ę‹ŸęœŸé—“ēš„ OOM é”™čÆÆåę˜ äŗ†ę¶ˆč“¹ēŗ§ GPU äøŠēš„ēœŸå®žč”Œäøŗć€‚ diff --git a/docs/zh/GPU_COMPATIBILITY.md b/docs/zh/GPU_COMPATIBILITY.md index bf71cf3c..cb448b18 100644 --- a/docs/zh/GPU_COMPATIBILITY.md +++ b/docs/zh/GPU_COMPATIBILITY.md @@ -1,36 +1,69 @@ # GPU å…¼å®¹ę€§ęŒ‡å— -ACE-Step 1.5 ä¼šč‡ŖåŠØé€‚é…ę‚Øēš„ GPU ę˜¾å­˜å¤§å°ļ¼Œē›øåŗ”č°ƒę•“ē”Ÿęˆę—¶é•æé™åˆ¶å’ŒåÆē”Øēš„ LM ęØ”åž‹ć€‚ē³»ē»ŸåœØåÆåŠØę—¶ę£€ęµ‹ GPU ę˜¾å­˜å¹¶č‡ŖåŠØé…ē½®ęœ€ä½³č®¾ē½®ć€‚ +ACE-Step 1.5 ä¼šč‡ŖåŠØé€‚é…ę‚Øēš„ GPU ę˜¾å­˜å¤§å°ļ¼Œē›øåŗ”č°ƒę•“ē”Ÿęˆę—¶é•æé™åˆ¶ć€åÆē”Øēš„ LM ęØ”åž‹ć€åøč½½ē­–ē•„å’Œ UI é»˜č®¤č®¾ē½®ć€‚ē³»ē»ŸåœØåÆåŠØę—¶ę£€ęµ‹ GPU ę˜¾å­˜å¹¶č‡ŖåŠØé…ē½®ęœ€ä½³č®¾ē½®ć€‚ ## GPU åˆ†ēŗ§é…ē½® -| 显存 | 等级 | LM ęØ”å¼ | ęœ€å¤§ę—¶é•æ | ęœ€å¤§ę‰¹ę¬” | LM ę˜¾å­˜åˆ†é… | -|------|------|---------|----------|----------|-------------| -| ≤4GB | Tier 1 | äøåÆē”Ø | 3 分钟 | 1 | - | -| 4-6GB | Tier 2 | äøåÆē”Ø | 6 分钟 | 1 | - | -| 6-8GB | Tier 3 | 0.6B (åÆé€‰) | ꜉ LM: 4 分钟 / ꗠ LM: 6 分钟 | ꜉ LM: 1 / ꗠ LM: 2 | 3GB | -| 8-12GB | Tier 4 | 0.6B (åÆé€‰) | ꜉ LM: 4 分钟 / ꗠ LM: 6 分钟 | ꜉ LM: 2 / ꗠ LM: 4 | 3GB | -| 12-16GB | Tier 5 | 0.6B / 1.7B | ꜉ LM: 4 分钟 / ꗠ LM: 6 分钟 | ꜉ LM: 2 / ꗠ LM: 4 | 0.6B: 3GB, 1.7B: 8GB | -| 16-24GB | Tier 6 | 0.6B / 1.7B / 4B | 8 分钟 | ꜉ LM: 4 / ꗠ LM: 8 | 0.6B: 3GB, 1.7B: 8GB, 4B: 12GB | -| ≄24GB | ę— é™åˆ¶ | ę‰€ęœ‰ęØ”åž‹ | 10 分钟 | 8 | ę— é™åˆ¶ | +| 显存 | 等级 | LM ęØ”åž‹ | ęŽØč LM | åŽē«Æ | ęœ€å¤§ę—¶é•æ (꜉LM / ꗠLM) | ęœ€å¤§ę‰¹ę¬” (꜉LM / ꗠLM) | åøč½½ē­–ē•„ | 量化 | +|------|------|---------|---------|------|------------------------|------------------------|----------|------| +| ≤4GB | Tier 1 | ꗠ | — | pt | 4分 / 6分 | 1 / 1 | CPU + DiT | INT8 | +| 4-6GB | Tier 2 | ꗠ | — | pt | 8分 / 10分 | 1 / 1 | CPU + DiT | INT8 | +| 6-8GB | Tier 3 | 0.6B | 0.6B | pt | 8分 / 10分 | 2 / 2 | CPU + DiT | INT8 | +| 8-12GB | Tier 4 | 0.6B | 0.6B | vllm | 8分 / 10分 | 2 / 4 | CPU + DiT | INT8 | +| 12-16GB | Tier 5 | 0.6B, 1.7B | 1.7B | vllm | 8分 / 10分 | 4 / 4 | CPU | INT8 | +| 16-20GB | Tier 6a | 0.6B, 1.7B | 1.7B | vllm | 8分 / 10分 | 4 / 8 | CPU | INT8 | +| 20-24GB | Tier 6b | 0.6B, 1.7B, 4B | 1.7B | vllm | 8分 / 8分 | 8 / 8 | ꗠ | ꗠ | +| ≄24GB | ę— é™åˆ¶ | å…ØéƒØ (0.6B, 1.7B, 4B) | 4B | vllm | 10分 / 10分 | 8 / 8 | ꗠ | ꗠ | + +### åˆ—čÆ“ę˜Ž + +- **LM ęØ”åž‹**: čÆ„ē­‰ēŗ§åÆä»„åŠ č½½ēš„ 5Hz čÆ­čØ€ęØ”åž‹å°ŗåÆø +- **ęŽØč LM**: UI äø­čÆ„ē­‰ēŗ§é»˜č®¤é€‰ę‹©ēš„ LM ęØ”åž‹ +- **åŽē«Æ**: LM ęŽØē†åŽē«Æļ¼ˆ`vllm` ē”ØäŗŽę˜¾å­˜å……č¶³ēš„ NVIDIA GPU,`pt` äøŗ PyTorch å›žé€€ę–¹ę”ˆļ¼Œ`mlx` ē”ØäŗŽ Apple Silicon) +- **åøč½½ē­–ē•„**: + - **CPU + DiT**: ę‰€ęœ‰ęØ”åž‹ļ¼ˆDiT态VAEć€ę–‡ęœ¬ē¼–ē å™Øļ¼‰äøä½æē”Øę—¶åøč½½åˆ° CPUļ¼›DiT ä¹ŸåœØę­„éŖ¤é—“åøč½½ + - **CPU**: VAE å’Œę–‡ęœ¬ē¼–ē å™Øåøč½½åˆ° CPUļ¼›DiT äæē•™åœØ GPU 上 + - **ꗠ**: ę‰€ęœ‰ęØ”åž‹äæē•™åœØ GPU 上 +- **量化**: ę˜Æå¦é»˜č®¤åÆē”Ø INT8 ęƒé‡é‡åŒ–ä»„å‡å°‘ę˜¾å­˜å ē”Ø + +## 自适应 UI 默认设置 + +Gradio UI ä¼šę ¹ę®ę£€ęµ‹åˆ°ēš„ GPU ē­‰ēŗ§č‡ŖåŠØé…ē½®ļ¼š + +- **LM åˆå§‹åŒ–å¤é€‰ę”†**: ę”ÆęŒ LM ēš„ē­‰ēŗ§ļ¼ˆTier 3+ļ¼‰é»˜č®¤å‹¾é€‰ļ¼ŒTier 1-2 é»˜č®¤äøå‹¾é€‰äø”ē¦ē”Ø +- **LM ęØ”åž‹č·Æå¾„**: č‡ŖåŠØå”«å……čÆ„ē­‰ēŗ§ęŽØčēš„ęØ”åž‹ļ¼›äø‹ę‹‰čœå•ä»…ę˜¾ē¤ŗå…¼å®¹ēš„ęØ”åž‹ +- **åŽē«Æäø‹ę‹‰čœå•**: Tier 1-3 é™åˆ¶äøŗ `pt`/`mlx`(vllm KV ē¼“å­˜å ē”Øčæ‡å¤§ļ¼‰ļ¼›Tier 4+ ę‰€ęœ‰åŽē«ÆåÆē”Ø +- **CPU åøč½½ / DiT åøč½½**: ä½Žē­‰ēŗ§é»˜č®¤åÆē”Øļ¼Œé«˜ē­‰ēŗ§é»˜č®¤ē¦ē”Ø +- **量化**: Tier 1-6a é»˜č®¤åÆē”Øļ¼ŒTier 6b+ é»˜č®¤ē¦ē”Øļ¼ˆę˜¾å­˜å……č¶³ļ¼‰ +- **ęØ”åž‹ē¼–čÆ‘**: 
ę‰€ęœ‰ē­‰ēŗ§é»˜č®¤åÆē”Øļ¼ˆé‡åŒ–éœ€č¦ļ¼‰ + +å¦‚ęžœę‚Øę‰‹åŠØé€‰ę‹©äŗ†äøå…¼å®¹ēš„é€‰é”¹ļ¼ˆä¾‹å¦‚åœØ 6GB GPU äøŠä½æē”Ø vllmļ¼‰ļ¼Œē³»ē»Ÿä¼šå‘å‡ŗč­¦å‘Šå¹¶č‡ŖåŠØå›žé€€åˆ°å…¼å®¹é…ē½®ć€‚ + +## čæč”Œę—¶å®‰å…Øē‰¹ę€§ + +- **ę˜¾å­˜å®ˆå«**: ęÆę¬”ęŽØē†å‰ļ¼Œē³»ē»Ÿä¼šä¼°ē®—ę˜¾å­˜éœ€ę±‚ļ¼Œåæ…č¦ę—¶č‡ŖåŠØå‡å°ę‰¹ę¬”å¤§å° +- **自适应 VAE 解码**: äø‰ēŗ§å›žé€€ęœŗåˆ¶ļ¼šGPU åˆ†ē‰‡č§£ē  → GPU 解码+ē»“ęžœåøč½½åˆ° CPU → å®Œå…Ø CPU 解码 +- **č‡ŖåŠØåˆ†ē‰‡å¤§å°**: VAE č§£ē åˆ†ē‰‡å¤§å°ę ¹ę®åÆē”Øē©ŗé—²ę˜¾å­˜č‡Ŗé€‚åŗ”č°ƒę•“ļ¼ˆ64/128/256/512/1024/1536) +- **时长/批欔裁剪**: å¦‚ęžœčÆ·ę±‚ēš„å€¼č¶…å‡ŗē­‰ēŗ§é™åˆ¶ļ¼Œä¼šč‡ŖåŠØč£å‰Ŗå¹¶ę˜¾ē¤ŗč­¦å‘Š ## čÆ“ę˜Ž - **默认设置** ä¼šę ¹ę®ę£€ęµ‹åˆ°ēš„ GPU ę˜¾å­˜č‡ŖåŠØé…ē½® - **LM ęØ”å¼** ęŒ‡ē”ØäŗŽę€ē»“é“¾ (Chain-of-Thought) ē”Ÿęˆå’ŒéŸ³é¢‘ē†č§£ēš„čÆ­čØ€ęØ”åž‹ -- **Flash Attention**态**CPU Offload**态**Compile** 和 **Quantization** é»˜č®¤åÆē”Øä»„čŽ·å¾—ęœ€ä½³ę€§čƒ½ -- å¦‚ęžœę‚ØčÆ·ę±‚ēš„ę—¶é•æęˆ–ę‰¹ę¬”å¤§å°č¶…å‡ŗ GPU é™åˆ¶ļ¼Œē³»ē»Ÿä¼šę˜¾ē¤ŗč­¦å‘Šå¹¶č‡ŖåŠØč°ƒę•“åˆ°å…č®øēš„ęœ€å¤§å€¼ +- **Flash Attention** ä¼šč‡ŖåŠØę£€ęµ‹å¹¶åœØåÆē”Øę—¶åÆē”Ø - **ēŗ¦ęŸč§£ē **: 当 LM åˆå§‹åŒ–åŽļ¼ŒLM ē”Ÿęˆēš„ę—¶é•æä¹Ÿä¼šč¢«ēŗ¦ęŸåœØ GPU ē­‰ēŗ§ēš„ęœ€å¤§ę—¶é•æé™åˆ¶å†…ļ¼Œé˜²ę­¢åœØ CoT ē”Ÿęˆę—¶å‡ŗēŽ°ę˜¾å­˜äøč¶³é”™čÆÆ -- åÆ¹äŗŽę˜¾å­˜ ≤6GB ēš„ GPUļ¼Œé»˜č®¤ē¦ē”Ø LM åˆå§‹åŒ–ä»„äæē•™ę˜¾å­˜ē»™ DiT ęØ”åž‹ +- åÆ¹äŗŽę˜¾å­˜ ≤6GB ēš„ GPU(Tier 1-2ļ¼‰ļ¼Œé»˜č®¤ē¦ē”Ø LM åˆå§‹åŒ–ä»„äæē•™ę˜¾å­˜ē»™ DiT ęØ”åž‹ - ę‚ØåÆä»„é€ščæ‡å‘½ä»¤č”Œå‚ę•°ęˆ– Gradio UI ę‰‹åŠØč¦†ē›–č®¾ē½® > **ę¬¢čæŽē¤¾åŒŗč“”ēŒ®**: 仄上 GPU åˆ†ēŗ§é…ē½®åŸŗäŗŽęˆ‘ä»¬åœØåøøč§ē”¬ä»¶äøŠēš„ęµ‹čÆ•ć€‚å¦‚ęžœę‚Øå‘ēŽ°ę‚Øēš„č®¾å¤‡å®žé™…ę€§čƒ½äøŽčæ™äŗ›å‚ę•°äøē¬¦ļ¼ˆä¾‹å¦‚ļ¼ŒåÆä»„å¤„ē†ę›“é•æēš„ę—¶é•æęˆ–ę›“å¤§ēš„ę‰¹ę¬”ļ¼‰ļ¼Œę¬¢čæŽę‚Øčæ›č”Œę›“å……åˆ†ēš„ęµ‹čÆ•ļ¼Œå¹¶ęäŗ¤ PR ę„ä¼˜åŒ– `acestep/gpu_config.py` äø­ēš„é…ē½®ć€‚ę‚Øēš„č“”ēŒ®å°†åø®åŠ©ę”¹å–„ę‰€ęœ‰ē”Øęˆ·ēš„ä½“éŖŒļ¼ ## ę˜¾å­˜ä¼˜åŒ–å»ŗč®® -1. **ä½Žę˜¾å­˜ (<8GB)**: 使用纯 DiT ęØ”å¼ļ¼Œäøåˆå§‹åŒ– LMļ¼Œä»„čŽ·å¾—ęœ€å¤§ę—¶é•æ -2. **äø­ē­‰ę˜¾å­˜ (8-16GB)**: 使用 0.6B LM ęØ”åž‹ļ¼ŒåœØč“Øé‡å’Œę˜¾å­˜ä¹‹é—“å–å¾—ęœ€ä½³å¹³č”” -3. **高显存 (>16GB)**: åÆē”Øę›“å¤§ēš„ LM ęØ”åž‹ (1.7B/4B) ä»„čŽ·å¾—ę›“å„½ēš„éŸ³é¢‘ē†č§£å’Œē”Ÿęˆč“Øé‡ +1. **ęžä½Žę˜¾å­˜ (≤6GB)**: 使用纯 DiT ęØ”å¼ļ¼Œäøåˆå§‹åŒ– LM怂INT8 é‡åŒ–å’Œå®Œå…Ø CPU åøč½½ę˜Æåæ…é”»ēš„ć€‚VAE č§£ē åÆčƒ½ä¼šč‡ŖåŠØå›žé€€åˆ° CPU怂 +2. **ä½Žę˜¾å­˜ (6-8GB)**: åÆä½æē”Ø 0.6B LM ęØ”åž‹ļ¼Œé…åˆ `pt` åŽē«Æć€‚äæęŒåøč½½åÆē”Øć€‚ +3. **äø­ē­‰ę˜¾å­˜ (8-16GB)**: 使用 0.6B ꈖ 1.7B LM ęØ”åž‹ć€‚Tier 4+ 上 `vllm` åŽē«Æč”ØēŽ°č‰Æå„½ć€‚ +4. **高显存 (16-24GB)**: åÆē”Øę›“å¤§ēš„ LM ęØ”åž‹ļ¼ˆęŽØč 1.7B)。20GB+ é‡åŒ–å˜äøŗåÆé€‰ć€‚ +5. 
**č¶…é«˜ę˜¾å­˜ (≄24GB)**: ę‰€ęœ‰ęØ”åž‹ę— éœ€åøč½½ęˆ–é‡åŒ–å³åÆčæč”Œć€‚ä½æē”Ø 4B LM čŽ·å¾—ęœ€ä½³č“Øé‡ć€‚ ## č°ƒčÆ•ęØ”å¼ļ¼šęØ”ę‹ŸäøåŒēš„ GPU é…ē½® @@ -40,17 +73,93 @@ ACE-Step 1.5 ä¼šč‡ŖåŠØé€‚é…ę‚Øēš„ GPU ę˜¾å­˜å¤§å°ļ¼Œē›øåŗ”č°ƒę•“ē”Ÿęˆę—¶é•æ # ęØ”ę‹Ÿ 4GB GPU (Tier 1) MAX_CUDA_VRAM=4 uv run acestep +# ęØ”ę‹Ÿ 6GB GPU (Tier 2) +MAX_CUDA_VRAM=6 uv run acestep + # ęØ”ę‹Ÿ 8GB GPU (Tier 4) MAX_CUDA_VRAM=8 uv run acestep # ęØ”ę‹Ÿ 12GB GPU (Tier 5) MAX_CUDA_VRAM=12 uv run acestep -# ęØ”ę‹Ÿ 16GB GPU (Tier 6) +# ęØ”ę‹Ÿ 16GB GPU (Tier 6a) MAX_CUDA_VRAM=16 uv run acestep ``` +设置 `MAX_CUDA_VRAM` ę—¶ļ¼Œē³»ē»Ÿčæ˜ä¼šč°ƒē”Ø `torch.cuda.set_per_process_memory_fraction()` ę„å¼ŗåˆ¶ę‰§č”Œę˜¾å­˜ē”¬äøŠé™ļ¼Œå³ä½æåœØé«˜ē«Æ GPU äøŠä¹Ÿčƒ½å®žēŽ°ēœŸå®žēš„ęØ”ę‹Ÿć€‚ + +### č‡ŖåŠØåŒ–åˆ†ēŗ§ęµ‹čÆ• + +ę— éœ€é€ščæ‡ UI ę‰‹åŠØęµ‹čÆ•ęÆäøŖē­‰ēŗ§ļ¼ŒåÆä»„ä½æē”Ø `profile_inference.py` ēš„ `tier-test` ęØ”å¼ļ¼š + +```bash +# č‡ŖåŠØęµ‹čÆ•ę‰€ęœ‰ē­‰ēŗ§ +python profile_inference.py --mode tier-test + +# ęµ‹čÆ•ē‰¹å®šē­‰ēŗ§ +python profile_inference.py --mode tier-test --tiers 6 8 16 + +# 测试时启用 LMļ¼ˆåœØę”ÆęŒēš„ē­‰ēŗ§äøŠļ¼‰ +python profile_inference.py --mode tier-test --tier-with-lm + +# åæ«é€Ÿęµ‹čÆ•ļ¼ˆéžé‡åŒ–ē­‰ēŗ§č·³čæ‡ torch.compile) +python profile_inference.py --mode tier-test --tier-skip-compile +``` + +详见 [BENCHMARK.md](BENCHMARK.md) čŽ·å–ę€§čƒ½åˆ†ęžå·„å…·ēš„å®Œę•“ę–‡ę”£ć€‚ + é€‚ē”Øåœŗę™Æļ¼š - åœØé«˜ē«Æē”¬ä»¶äøŠęµ‹čÆ• GPU åˆ†ēŗ§é…ē½® - éŖŒčÆå„ē­‰ēŗ§ēš„č­¦å‘Šå’Œé™åˆ¶ę˜Æå¦ę­£åøøå·„ä½œ -- åœØęäŗ¤ PR ä¹‹å‰å¼€å‘å’Œęµ‹čÆ•ę–°ēš„ GPU é…ē½®å‚ę•° +- 修改 `acestep/gpu_config.py` åŽēš„č‡ŖåŠØåŒ–å›žå½’ęµ‹čÆ• +- CI/CD ę˜¾å­˜å…¼å®¹ę€§éŖŒčÆ + +### č¾¹ē•Œęµ‹čÆ•ļ¼ˆęŸ„ę‰¾ęœ€ä½Žē­‰ēŗ§ļ¼‰ + +使用 `--tier-boundary` åÆä»„é€ščæ‡å®žé™…čæč”Œę„ē”®å®šä»Žå“ŖäøŖę˜¾å­˜ē­‰ēŗ§å¼€å§‹åÆä»„å®‰å…Øåœ°å…³é—­ INT8 量化和 CPU åøč½½ć€‚åÆ¹äŗŽęÆäøŖē­‰ēŗ§ļ¼Œęœ€å¤ščæč”Œäø‰ē§é…ē½®ļ¼š + +1. **default** — čÆ„ē­‰ēŗ§ēš„é»˜č®¤č®¾ē½®ļ¼ˆęŒ‰é…ē½®ä½æē”Øé‡åŒ– + åøč½½ļ¼‰ +2. **no-quant** — äæęŒåøč½½č®¾ē½®äøå˜ļ¼Œä½†å…³é—­é‡åŒ– +3. 
**no-offload** — äøä½æē”Øé‡åŒ–ļ¼Œä¹Ÿäøä½æē”Ø CPU åøč½½ļ¼ˆę‰€ęœ‰ęØ”åž‹äæē•™åœØ GPU äøŠļ¼‰ + +```bash +# åœØę‰€ęœ‰ē­‰ēŗ§äøŠčæč”Œč¾¹ē•Œęµ‹čÆ• +python profile_inference.py --mode tier-test --tier-boundary + +# ęµ‹čÆ•ē‰¹å®šē­‰ēŗ§ēš„č¾¹ē•Œ +python profile_inference.py --mode tier-test --tier-boundary --tiers 8 12 16 20 24 + +# 启用 LM ēš„č¾¹ē•Œęµ‹čÆ•ļ¼ˆåœØę”ÆęŒēš„ē­‰ēŗ§äøŠļ¼‰ +python profile_inference.py --mode tier-test --tier-boundary --tier-with-lm + +# å°†ē»“ęžœäæå­˜äøŗ JSON ä»„ä¾æčæ›äø€ę­„åˆ†ęž +python profile_inference.py --mode tier-test --tier-boundary --benchmark-output boundary_results.json +``` + +č¾“å‡ŗåŒ…å«äø€äøŖ **č¾¹ē•Œåˆ†ęž** éƒØåˆ†ļ¼Œę˜¾ē¤ŗęÆē§čƒ½åŠ›ēš„ęœ€ä½Žē­‰ēŗ§ļ¼š + +``` +BOUNDARY ANALYSIS +================= + Capability Min Tier VRAM + ------------------------------------------------------------ + No INT8 Quantization tier6b 20GB + No CPU Offload (all models on GPU) tier6b 20GB + ------------------------------------------------------------ +``` + +> **ę³Øę„ļ¼š** č¾¹ē•Œęµ‹čÆ•ē»“ęžœę˜Æē»éŖŒę€§ēš„ļ¼ŒåÆčƒ½å›  DiT ęØ”åž‹å˜ä½“ļ¼ˆturbo vs baseļ¼‰ć€ę˜Æå¦åÆē”Ø LMć€ē”Ÿęˆę—¶é•æå’Œ flash attention åÆē”Øę€§č€Œęœ‰ę‰€äøåŒć€‚ę¬¢čæŽē¤¾åŒŗč“”ēŒ®ę„å®Œå–„čæ™äŗ›č¾¹ē•Œå€¼ļ¼ + +### ę‰¹ę¬”å¤§å°č¾¹ē•Œęµ‹čÆ• + +使用 `--tier-batch-boundary` é€ščæ‡é€’čæ›ęµ‹čÆ•ę‰¹ę¬”å¤§å° 1态2态4态8 ę„ęŸ„ę‰¾ęÆäøŖē­‰ēŗ§ēš„ęœ€å¤§å®‰å…Øę‰¹ę¬”å¤§å°ļ¼š + +```bash +# čæč”ŒåÆē”Ø LM ēš„ę‰¹ę¬”č¾¹ē•Œęµ‹čÆ• +python profile_inference.py --mode tier-test --tier-batch-boundary --tier-with-lm + +# ęµ‹čÆ•ē‰¹å®šē­‰ēŗ§ +python profile_inference.py --mode tier-test --tier-batch-boundary --tier-with-lm --tiers 8 12 16 24 +``` + +čÆ„ęµ‹čÆ•åŒę—¶ęµ‹čÆ•ęœ‰ LM å’Œę—  LM ēš„é…ē½®ļ¼Œå¹¶ęŠ„å‘ŠęÆäøŖē­‰ēŗ§ēš„ęœ€å¤§ęˆåŠŸę‰¹ę¬”å¤§å°ć€‚ diff --git a/docs/zh/GRADIO_GUIDE.md b/docs/zh/GRADIO_GUIDE.md index 87b39725..5d5821fe 100644 --- a/docs/zh/GRADIO_GUIDE.md +++ b/docs/zh/GRADIO_GUIDE.md @@ -62,17 +62,23 @@ Gradio ē•Œé¢åŒ…å«ä»„äø‹äø»č¦éƒØåˆ†ļ¼š | 设置 | čÆ“ę˜Ž | |---------|-------------| -| **5Hz LM ęØ”åž‹č·Æå¾„** | é€‰ę‹©čÆ­čØ€ęØ”åž‹ļ¼ˆä¾‹å¦‚ `acestep-5Hz-lm-0.6B`态`acestep-5Hz-lm-1.7B`)| -| **5Hz LM åŽē«Æ** | `vllm`ļ¼ˆę›“åæ«ļ¼ŒęŽØčļ¼‰ęˆ– `pt`(PyTorchļ¼Œå…¼å®¹ę€§ę›“å„½ļ¼‰| -| **初始化 5Hz LM** | å‹¾é€‰ä»„åœØåˆå§‹åŒ–ęœŸé—“åŠ č½½ LM(thinking ęØ”å¼åæ…éœ€ļ¼‰| +| **5Hz LM ęØ”åž‹č·Æå¾„** | é€‰ę‹©čÆ­čØ€ęØ”åž‹ć€‚**åÆē”ØęØ”åž‹ę ¹ę® GPU ē­‰ēŗ§č‡ŖåŠØčæ‡ę»¤** — ä¾‹å¦‚ļ¼Œ6-8GB GPU ä»…ę˜¾ē¤ŗ 0.6Bļ¼Œč€Œ 24GB+ GPU ę˜¾ē¤ŗę‰€ęœ‰å°ŗåÆøļ¼ˆ0.6B态1.7B态4B)。| +| **5Hz LM åŽē«Æ** | `vllm`ļ¼ˆę›“åæ«ļ¼ŒęŽØčę˜¾å­˜ ≄8GB ēš„ NVIDIA GPU)、`pt`(PyTorchļ¼Œé€šē”Øå›žé€€ę–¹ę”ˆļ¼‰ęˆ– `mlx`(Apple Silicon)。**显存 <8GB ēš„ GPU é™åˆ¶äøŗ `pt`/`mlx`**ļ¼Œå› äøŗ vllm ēš„ KV ē¼“å­˜å ē”Øčæ‡å¤§ć€‚| +| **初始化 5Hz LM** | å‹¾é€‰ä»„åœØåˆå§‹åŒ–ęœŸé—“åŠ č½½ LM(thinking ęØ”å¼åæ…éœ€ļ¼‰ć€‚**显存 ≤6GB ēš„ GPU(Tier 1-2ļ¼‰é»˜č®¤äøå‹¾é€‰äø”ē¦ē”Øć€‚**| + +> **č‡Ŗé€‚åŗ”é»˜č®¤č®¾ē½®**: ꉀ꜉ LM č®¾ē½®ę ¹ę® GPU ę˜¾å­˜ē­‰ēŗ§č‡ŖåŠØé…ē½®ć€‚ęŽØčēš„ LM ęØ”åž‹ć€åŽē«Æå’Œåˆå§‹åŒ–ēŠ¶ę€å·²é¢„č®¾äøŗęœ€ä½³ę€§čƒ½ć€‚ę‚ØåÆä»„ę‰‹åŠØč¦†ē›–ļ¼Œä½†å¦‚ęžœé€‰ę‹©äøŽ GPU äøå…¼å®¹ļ¼Œē³»ē»Ÿä¼šå‘å‡ŗč­¦å‘Šć€‚ ### ę€§čƒ½é€‰é”¹ | 设置 | čÆ“ę˜Ž | |---------|-------------| | **使用 Flash Attention** | åÆē”Øä»„åŠ é€ŸęŽØē†ļ¼ˆéœ€č¦ flash_attn åŒ…ļ¼‰| -| **åøč½½åˆ° CPU** | ē©ŗé—²ę—¶å°†ęØ”åž‹åøč½½åˆ° CPU ä»„čŠ‚ēœ GPU 内存 | -| **将 DiT åøč½½åˆ° CPU** | äø“é—Øå°† DiT ęØ”åž‹åøč½½åˆ° CPU | +| **åøč½½åˆ° CPU** | ē©ŗé—²ę—¶å°†ęØ”åž‹åøč½½åˆ° CPU ä»„čŠ‚ēœ GPU ę˜¾å­˜ć€‚**显存 <20GB ēš„ GPU é»˜č®¤č‡ŖåŠØåÆē”Øć€‚**| +| **将 DiT åøč½½åˆ° CPU** | äø“é—Øå°† DiT ęØ”åž‹åøč½½åˆ° CPU怂**显存 <12GB ēš„ GPU 
é»˜č®¤č‡ŖåŠØåÆē”Øć€‚**| +| **INT8 量化** | 使用 INT8 ęƒé‡é‡åŒ–å‡å°‘ęØ”åž‹ę˜¾å­˜å ē”Øć€‚**显存 <20GB ēš„ GPU é»˜č®¤č‡ŖåŠØåÆē”Øć€‚**| +| **ęØ”åž‹ē¼–čÆ‘** | 启用 `torch.compile` ä¼˜åŒ–ęŽØē†ć€‚**ę‰€ęœ‰ē­‰ēŗ§é»˜č®¤åÆē”Ø**ļ¼ˆé‡åŒ–ęæ€ę“»ę—¶åæ…éœ€ļ¼‰ć€‚| + +> **ē­‰ēŗ§ę„ŸēŸ„č®¾ē½®**: åøč½½ć€é‡åŒ–å’Œē¼–čÆ‘é€‰é”¹ę ¹ę® GPU ē­‰ēŗ§č‡ŖåŠØč®¾ē½®ć€‚čÆ¦č§ [GPU_COMPATIBILITY.md](GPU_COMPATIBILITY.md) äŗ†č§£å®Œę•“ēš„ē­‰ēŗ§č”Øć€‚ ### LoRA é€‚é…å™Ø @@ -87,7 +93,12 @@ Gradio ē•Œé¢åŒ…å«ä»„äø‹äø»č¦éƒØåˆ†ļ¼š ### 初始化 -点击 **åˆå§‹åŒ–ęœåŠ”** åŠ č½½ęØ”åž‹ć€‚ēŠ¶ę€ę”†å°†ę˜¾ē¤ŗčæ›åŗ¦å’Œē”®č®¤äæ”ęÆć€‚ +点击 **åˆå§‹åŒ–ęœåŠ”** åŠ č½½ęØ”åž‹ć€‚ēŠ¶ę€ę”†å°†ę˜¾ē¤ŗčæ›åŗ¦å’Œē”®č®¤äæ”ęÆļ¼ŒåŒ…ę‹¬ļ¼š +- ę£€ęµ‹åˆ°ēš„ GPU ē­‰ēŗ§å’Œę˜¾å­˜ +- ęœ€å¤§å…č®øę—¶é•æå’Œę‰¹ę¬”å¤§å°ļ¼ˆę ¹ę®ę˜Æå¦åˆå§‹åŒ–äŗ† LM åŠØę€č°ƒę•“ļ¼‰ +- ä»»ä½•äøå…¼å®¹č®¾ē½®č¢«č‡ŖåŠØäæ®ę­£ēš„č­¦å‘Š + +åˆå§‹åŒ–åŽļ¼Œ**éŸ³é¢‘ę—¶é•æ** 和 **ę‰¹é‡å¤§å°** ę»‘å—ä¼šč‡ŖåŠØę›“ę–°ä»„åę˜ ē­‰ēŗ§é™åˆ¶ć€‚ --- @@ -515,15 +526,19 @@ LoRA č®­ē»ƒé€‰é”¹å”ęä¾›åˆ›å»ŗč‡Ŗå®šä¹‰ LoRA é€‚é…å™Øēš„å·„å…·ć€‚ - å°čÆ•äøåŒēš„ē§å­ - 使 caption 曓具体 -**å†…å­˜äøč¶³ļ¼š** -- å‡å°‘ę‰¹é‡å¤§å° -- 启用 CPU åøč½½ +**ę˜¾å­˜äøč¶³ (OOM):** +- ē³»ē»ŸåŒ…å«č‡ŖåŠØę˜¾å­˜ē®”ē†ļ¼ˆę˜¾å­˜å®ˆå«ć€č‡Ŗé€‚åŗ” VAE č§£ē ć€č‡ŖåŠØę‰¹ę¬”å‡å°ļ¼‰ć€‚å¦‚ęžœä»ē„¶ OOM: +- ę‰‹åŠØå‡å°‘ę‰¹é‡å¤§å° +- 启用 CPU åøč½½ļ¼ˆę˜¾å­˜ <20GB åŗ”å·²č‡ŖåŠØåÆē”Øļ¼‰ +- 启用 INT8 é‡åŒ–ļ¼ˆę˜¾å­˜ <20GB åŗ”å·²č‡ŖåŠØåÆē”Øļ¼‰ - 减少 LM ę‰¹å¤„ē†å—å¤§å° +- 详见 [GPU_COMPATIBILITY.md](GPU_COMPATIBILITY.md) äŗ†č§£å„ē­‰ēŗ§ęŽØčč®¾ē½® **LM äøå·„ä½œļ¼š** -- ē”®äæåˆå§‹åŒ–ęœŸé—“å‹¾é€‰äŗ†"初始化 5Hz LM" -- ę£€ęŸ„ę˜Æå¦é€‰ę‹©äŗ†ęœ‰ę•ˆēš„ LM ęØ”åž‹č·Æå¾„ -- 验证 vllm ꈖ PyTorch åŽē«ÆåÆē”Ø +- ē”®äæåˆå§‹åŒ–ęœŸé—“å‹¾é€‰äŗ†"初始化 5Hz LM"(显存 ≤6GB ēš„ GPU é»˜č®¤ē¦ē”Øļ¼‰ +- ę£€ęŸ„ę˜Æå¦é€‰ę‹©äŗ†ęœ‰ę•ˆēš„ LM ęØ”åž‹č·Æå¾„ļ¼ˆä»…ę˜¾ē¤ŗäøŽē­‰ēŗ§å…¼å®¹ēš„ęØ”åž‹ļ¼‰ +- 验证 vllm ꈖ PyTorch åŽē«ÆåÆē”Øļ¼ˆę˜¾å­˜ <8GB é™åˆ¶ä½æē”Ø vllm) +- å¦‚ęžœ LM å¤é€‰ę”†ē°č‰²äøåÆē”Øļ¼ŒčÆ“ę˜Žę‚Øēš„ GPU ē­‰ēŗ§äøę”ÆęŒ LM — 请使用纯 DiT ęØ”å¼ --- diff --git a/docs/zh/INFERENCE.md b/docs/zh/INFERENCE.md index 0242354c..dc80d402 100644 --- a/docs/zh/INFERENCE.md +++ b/docs/zh/INFERENCE.md @@ -975,13 +975,21 @@ else: # ... å¤„ē†éŸ³é¢‘ę–‡ä»¶ ``` -### 7. å†…å­˜ē®”ē† +### 7. 
ę˜¾å­˜ē®”ē† -åÆ¹äŗŽå¤§ę‰¹é‡å¤§å°ęˆ–é•æę—¶é•æļ¼š -- ē›‘ęŽ§ GPU å†…å­˜ä½æē”Ø -- å¦‚ęžœå‡ŗēŽ° OOM é”™čÆÆļ¼Œå‡å°‘ `batch_size` -- 减少 `lm_batch_chunk_size` ē”ØäŗŽ LM ę“ä½œ -- č€ƒč™‘åœØåˆå§‹åŒ–ęœŸé—“ä½æē”Ø `offload_to_cpu=True` +ACE-Step 1.5 åŒ…å«č‡ŖåŠØę˜¾å­˜ē®”ē†ļ¼ŒåÆé€‚åŗ”ę‚Øēš„ GPU: + +- **č‡ŖåŠØē­‰ēŗ§ę£€ęµ‹**: ē³»ē»Ÿę£€ęµ‹åÆē”Øę˜¾å­˜å¹¶é€‰ę‹©ęœ€ä½³č®¾ē½®ļ¼ˆčÆ¦č§ [GPU_COMPATIBILITY.md](../zh/GPU_COMPATIBILITY.md)) +- **ę˜¾å­˜å®ˆå«**: ęÆę¬”ęŽØē†å‰ļ¼Œē³»ē»Ÿä¼°ē®—ę˜¾å­˜éœ€ę±‚ļ¼Œåæ…č¦ę—¶č‡ŖåŠØå‡å° `batch_size` +- **自适应 VAE 解码**: äø‰ēŗ§å›žé€€ — GPU åˆ†ē‰‡č§£ē  → GPU 解码+CPU åøč½½ → å®Œå…Ø CPU 解码 +- **č‡ŖåŠØåˆ†ē‰‡å¤§å°**: VAE č§£ē åˆ†ē‰‡å¤§å°ę ¹ę®ē©ŗé—²ę˜¾å­˜č‡Ŗé€‚åŗ”č°ƒę•“ļ¼ˆ64/128/256/512/1024/1536) +- **时长/批欔裁剪**: č¶…å‡ŗē­‰ēŗ§é™åˆ¶ēš„å€¼ä¼šč‡ŖåŠØč£å‰Ŗå¹¶ę˜¾ē¤ŗč­¦å‘Š + +ę‰‹åŠØč°ƒä¼˜ļ¼š +- å¦‚ęžœä»ē„¶å‡ŗēŽ° OOM é”™čÆÆļ¼Œå‡å°‘ `batch_size` +- ä½Žę˜¾å­˜ GPU äøŠå‡å°‘ `lm_batch_chunk_size` ē”ØäŗŽ LM ę“ä½œ +- 显存 <20GB 时启用 `offload_to_cpu=True` +- 显存 <20GB 时启用 `quantization="int8_weight_only"` --- @@ -989,8 +997,8 @@ else: ### åøøč§é—®é¢˜ -**问题**ļ¼šå†…å­˜äøč¶³é”™čÆÆ -- **č§£å†³ę–¹ę”ˆ**ļ¼šå‡å°‘ `batch_size`态`inference_steps`ļ¼Œęˆ–åÆē”Ø CPU åøč½½ +**问题**ļ¼šę˜¾å­˜äøč¶³ (OOM) 错误 +- **č§£å†³ę–¹ę”ˆ**ļ¼šē³»ē»Ÿåŗ”é€ščæ‡ę˜¾å­˜å®ˆå«ļ¼ˆč‡ŖåŠØå‡å°ę‰¹ę¬”ļ¼‰å’Œč‡Ŗé€‚åŗ” VAE 解码(CPU å›žé€€ļ¼‰č‡ŖåŠØå¤„ē†å¤§å¤šę•° OOM åœŗę™Æć€‚å¦‚ęžœä»ē„¶å‡ŗēŽ° OOMļ¼šå‡å°‘ `batch_size`ć€å‡å°‘ `inference_steps`、启用 CPU åøč½½ļ¼ˆ`offload_to_cpu=True`ļ¼‰ęˆ–åÆē”Ø INT8 é‡åŒ–ć€‚čÆ¦č§ [GPU_COMPATIBILITY.md](../zh/GPU_COMPATIBILITY.md) äŗ†č§£å„ę˜¾å­˜ē­‰ēŗ§ēš„ęŽØčč®¾ē½®ć€‚ **问题**ļ¼šē»“ęžœč“Øé‡å·® - **č§£å†³ę–¹ę”ˆ**ļ¼šå¢žåŠ  `inference_steps`ļ¼Œč°ƒę•“ `guidance_scale`ļ¼Œä½æē”Ø base ęØ”åž‹ diff --git a/docs/zh/INSTALL.md b/docs/zh/INSTALL.md index 5efb736c..d0502897 100644 --- a/docs/zh/INSTALL.md +++ b/docs/zh/INSTALL.md @@ -468,7 +468,7 @@ ACESTEP_INIT_LLM=false | `--init_llm` | auto | LLM åˆå§‹åŒ–ļ¼š`true` / `false` / ēœē•„äøŗč‡ŖåŠØ | | `--config_path` | auto | DiT ęØ”åž‹ļ¼ˆå¦‚ `acestep-v15-turbo`) | | `--lm_model_path` | auto | LM ęØ”åž‹ļ¼ˆå¦‚ `acestep-5Hz-lm-1.7B`) | -| `--offload_to_cpu` | auto | CPU åøč½½ļ¼ˆę˜¾å­˜ < 16GB ę—¶č‡ŖåŠØåÆē”Øļ¼‰ | +| `--offload_to_cpu` | auto | CPU åøč½½ļ¼ˆę˜¾å­˜ < 20GB ę—¶č‡ŖåŠØåÆē”Øļ¼‰ | | `--download-source` | auto | ęØ”åž‹ęŗļ¼š`auto` / `huggingface` / `modelscope` | | `--enable-api` | false | åŒę—¶åÆē”Ø REST API 端点 | @@ -529,16 +529,17 @@ huggingface-cli download ACE-Step/acestep-5Hz-lm-4B --local-dir ./checkpoints/ac ## šŸ’” å¦‚ä½•é€‰ę‹©ęØ”åž‹ļ¼Ÿ -ACE-Step ä¼šč‡ŖåŠØé€‚é…ä½ ēš„ GPU 显存: +ACE-Step ä¼šč‡ŖåŠØé€‚é…ä½ ēš„ GPU ę˜¾å­˜ć€‚UI ä¼šę ¹ę®ę£€ęµ‹åˆ°ēš„ GPU ē­‰ēŗ§é¢„é…ē½®ę‰€ęœ‰č®¾ē½®ļ¼ˆLM ęØ”åž‹ć€åŽē«Æć€åøč½½ć€é‡åŒ–ļ¼‰ļ¼š -| GPU 显存 | ęŽØč LM ęØ”åž‹ | čÆ“ę˜Ž | -|----------|--------------|------| -| **≤6GB** | ę— ļ¼ˆä»… DiT) | é»˜č®¤ē¦ē”Ø LM ä»„čŠ‚ēœę˜¾å­˜ | -| **6-12GB** | `acestep-5Hz-lm-0.6B` | č½»é‡ļ¼Œå¹³č””ę€§å„½ | -| **12-16GB** | `acestep-5Hz-lm-1.7B` | ę›“å„½ēš„č“Øé‡ | -| **≄16GB** | `acestep-5Hz-lm-4B` | ęœ€ä½³č“Øé‡å’ŒéŸ³é¢‘ē†č§£čƒ½åŠ› | +| GPU 显存 | ęŽØč LM ęØ”åž‹ | åŽē«Æ | čÆ“ę˜Ž | +|----------|--------------|------|------| +| **≤6GB** | ę— ļ¼ˆä»… DiT) | — | é»˜č®¤ē¦ē”Ø LMļ¼›INT8 量化 + å®Œå…Ø CPU åøč½½ | +| **6-8GB** | `acestep-5Hz-lm-0.6B` | `pt` | č½»é‡ LM,PyTorch åŽē«Æ | +| **8-16GB** | `0.6B` / `1.7B` | `vllm` | 8-12GB 用 0.6B,12-16GB 用 1.7B | +| **16-24GB** | `acestep-5Hz-lm-1.7B` | `vllm` | 20GB+ åÆē”Ø 4Bļ¼›20GB+ ę— éœ€åøč½½ | +| **≄24GB** | `acestep-5Hz-lm-4B` | `vllm` | ęœ€ä½³č“Øé‡ļ¼Œę‰€ęœ‰ęØ”åž‹ę— éœ€åøč½½ | -> 
šŸ“– 详细 GPU å…¼å®¹ę€§äæ”ęÆļ¼ˆę—¶é•æé™åˆ¶ć€ę‰¹é‡å¤§å°ć€å†…å­˜ä¼˜åŒ–ļ¼‰ļ¼ŒčÆ·å‚é˜… [GPU å…¼å®¹ę€§ęŒ‡å—](GPU_COMPATIBILITY.md)怂 +> šŸ“– 详细 GPU å…¼å®¹ę€§äæ”ęÆļ¼ˆē­‰ēŗ§č”Øć€ę—¶é•æé™åˆ¶ć€ę‰¹é‡å¤§å°ć€č‡Ŗé€‚åŗ” UI é»˜č®¤č®¾ē½®ć€ę˜¾å­˜ä¼˜åŒ–ļ¼‰ļ¼ŒčÆ·å‚é˜… [GPU å…¼å®¹ę€§ęŒ‡å—](GPU_COMPATIBILITY.md)怂 --- diff --git a/profile_inference.py b/profile_inference.py index 387f9dc1..12467c0b 100644 --- a/profile_inference.py +++ b/profile_inference.py @@ -8,6 +8,7 @@ Modes: profile - Profile a single generation run with detailed timing breakdown benchmark - Run a matrix of configurations and produce a summary table + tier-test - Auto-test across simulated GPU tiers (4/6/8/12/16/24/48 GB) understand - Profile the understand_music() API (audio codes -> metadata) create_sample - Profile the create_sample() API (inspiration/simple mode) format_sample - Profile the format_sample() API (caption+lyrics -> metadata) @@ -22,6 +23,15 @@ # Benchmark across configurations python profile_inference.py --mode benchmark + # Test all GPU tiers automatically (the key feature!) + python profile_inference.py --mode tier-test + + # Test specific tiers only + python profile_inference.py --mode tier-test --tiers 6 8 16 + + # Test tiers with LM enabled (where supported) + python profile_inference.py --mode tier-test --tier-with-lm + # Profile create_sample (inspiration mode) python profile_inference.py --mode create_sample --sample-query "a soft Bengali love song" @@ -38,6 +48,7 @@ import os import json import tempfile +import traceback from contextlib import contextmanager from collections import defaultdict from typing import Tuple, Dict, Any, List, Optional @@ -60,7 +71,15 @@ ) from acestep.handler import AceStepHandler from acestep.llm_inference import LLMHandler -from acestep.gpu_config import get_gpu_config, set_global_gpu_config +from acestep.gpu_config import ( + get_gpu_config, + set_global_gpu_config, + get_gpu_tier, + find_best_lm_model_on_disk, + is_lm_model_size_allowed, + GPUConfig, + VRAM_AUTO_OFFLOAD_THRESHOLD_GB, +) # ============================================================================= @@ -125,12 +144,12 @@ def load_env_config() -> Dict[str, str]: class PreciseTimer: """High-precision timer with GPU synchronization for accurate timing.""" - + def __init__(self, device: str = "cpu"): self.device = device self.timings: Dict[str, List[float]] = defaultdict(list) self.enabled = True - + def sync(self): """Synchronize GPU operations for accurate timing.""" if not self.enabled: @@ -139,10 +158,10 @@ def sync(self): torch.cuda.synchronize() elif self.device == "mps" and hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): if hasattr(torch, "mps"): - torch.mps.synchronize() + torch.mps.synchronize() elif self.device.startswith("xpu") and hasattr(torch, "xpu"): torch.xpu.synchronize() - + @contextmanager def time(self, name: str): """Time a code section with GPU synchronization.""" @@ -157,17 +176,17 @@ def time(self, name: str): self.sync() elapsed = time.perf_counter() - start self.timings[name].append(elapsed) - + def get_total(self, name: str) -> float: return sum(self.timings.get(name, [])) - + def get_mean(self, name: str) -> float: times = self.timings.get(name, []) return sum(times) / len(times) if times else 0.0 - + def get_count(self, name: str) -> int: return len(self.timings.get(name, [])) - + def reset(self): self.timings.clear() @@ -247,10 +266,10 @@ def print_time_costs_breakdown( time_costs: Dict[str, float], total_wall_time: float ): """Print a detailed timing 
breakdown from result.extra_outputs['time_costs'].""" - print("\n" + "=" * 100) + print("\n" + "=" * 100) print("PROFILING RESULTS") - print("=" * 100) - + print("=" * 100) + if not time_costs: print("\n (No time_costs data available from the pipeline)") print(f"\n Total wall time: {total_wall_time:.3f}s") @@ -341,10 +360,10 @@ def print_time_costs_breakdown( print(f"\n{'TOTAL WALL TIME':<50} {total_wall_time:<12.3f} {'100.0%':>6}") # Performance insights - print("\n" + "=" * 100) + print("\n" + "=" * 100) print("PERFORMANCE INSIGHTS") - print("=" * 100) - + print("=" * 100) + if lm_total > 0 and dit_total > 0: if lm_total > dit_total * 2: print( @@ -394,7 +413,7 @@ def print_result_summary(result: GenerationResult, mode: str = "profile"): if silent_count: print(f" ({silent_count} silent)", end="") print() - else: + else: print(f"\n FAILED: {result.error}") @@ -485,7 +504,6 @@ def run_profile_mode(dit_handler, llm_handler, args, timer: PreciseTimer): prof = None if args.detailed: import cProfile - prof = cProfile.Profile() prof.enable() @@ -659,7 +677,7 @@ def run_benchmark_mode(dit_handler, llm_handler, args, timer: PreciseTimer): f" {status} | wall={wall_time:.1f}s, " f"lm={entry['lm_time']:.1f}s, dit={entry['dit_time']:.1f}s" ) - + # Print summary table print("\n" + "=" * 120) print("BENCHMARK SUMMARY") @@ -695,6 +713,808 @@ def run_benchmark_mode(dit_handler, llm_handler, args, timer: PreciseTimer): return results +# ============================================================================= +# Mode: tier-test (THE KEY FEATURE) +# ============================================================================= + + +def _get_vram_info_str() -> str: + """Get current VRAM usage string for logging.""" + if not torch.cuda.is_available(): + return "N/A" + allocated = torch.cuda.memory_allocated() / (1024 ** 3) + reserved = torch.cuda.memory_reserved() / (1024 ** 3) + return f"alloc={allocated:.2f}GB, reserved={reserved:.2f}GB" + + +def _run_single_tier_test( + sim_gb: float, + gpu_config: GPUConfig, + args, + example_data: Dict, + checkpoint_dir: str, + disk_lm_models: List[str], + *, + offload_override: Optional[bool] = None, + offload_dit_override: Optional[bool] = None, + quantization_override: Optional[str] = "USE_DEFAULT", + test_variant: str = "default", + batch_size_override: Optional[int] = None, + use_lm_override: Optional[bool] = None, +) -> Dict[str, Any]: + """ + Run a single tier test with the given configuration. + + Args: + sim_gb: Simulated VRAM in GB + gpu_config: GPU configuration for this tier + args: CLI arguments + example_data: Example JSON data for generation + checkpoint_dir: Path to checkpoints directory + disk_lm_models: List of LM models found on disk + offload_override: If not None, override offload_to_cpu setting + offload_dit_override: If not None, override offload_dit_to_cpu setting + quantization_override: If not "USE_DEFAULT", override quantization setting + (None means no quantization, "int8_weight_only" etc.) 
+ test_variant: Label for this test variant ("default", "no-quant", "no-offload") + batch_size_override: If not None, override batch size (used by batch boundary tests) + use_lm_override: If not None, force LM on (True) or off (False) + + Returns: + Result dictionary for this test + """ + tier = gpu_config.tier + + # Determine test configuration + if use_lm_override is not None: + use_lm = use_lm_override and gpu_config.init_lm_default and bool(gpu_config.available_lm_models) + else: + use_lm = args.tier_with_lm and gpu_config.init_lm_default and bool(gpu_config.available_lm_models) + + if offload_override is not None: + offload = offload_override + else: + offload = gpu_config.offload_to_cpu_default + + if offload_dit_override is not None: + offload_dit = offload_dit_override + else: + offload_dit = gpu_config.offload_dit_to_cpu_default + + if quantization_override != "USE_DEFAULT": + quantization = quantization_override + else: + quantization = "int8_weight_only" if gpu_config.quantization_default else None + + # Find LM model on disk + lm_model = None + lm_backend = gpu_config.recommended_backend + if use_lm: + lm_model = find_best_lm_model_on_disk( + gpu_config.recommended_lm_model, disk_lm_models + ) + if not lm_model: + print(f" āš ļø No compatible LM model on disk for tier {tier}, skipping LM") + use_lm = False + + # Clamp duration to tier limit + test_duration = args.tier_duration + max_dur = gpu_config.max_duration_with_lm if use_lm else gpu_config.max_duration_without_lm + if test_duration > max_dur: + test_duration = max_dur + print(f" Duration clamped to {test_duration}s (tier limit)") + + batch_size = batch_size_override if batch_size_override is not None else 1 + + print(f"\n Test config [{test_variant}]: duration={test_duration}s, batch={batch_size}, LM={use_lm}") + if use_lm: + print(f" LM model: {lm_model}, backend: {lm_backend}") + print(f" offload={offload}, offload_dit={offload_dit}, quant={quantization}") + + # Enforce VRAM cap + if torch.cuda.is_available(): + total_bytes = torch.cuda.get_device_properties(0).total_memory + total_gb = total_bytes / (1024 ** 3) + if sim_gb < total_gb: + reference_context_gb = 0.5 + allocator_budget_gb = max(0.5, sim_gb - reference_context_gb) + fraction = max(0.01, min(1.0, allocator_budget_gb / total_gb)) + torch.cuda.set_per_process_memory_fraction(fraction) + + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + + # Initialize result entry + result_entry = { + "tier_gb": sim_gb, + "tier": tier, + "test_variant": test_variant, + "use_lm": use_lm, + "lm_model": lm_model, + "lm_backend": lm_backend, + "offload": offload, + "offload_dit": offload_dit, + "quantization": quantization, + "duration": test_duration, + "batch_size": batch_size, + "init_success": False, + "gen_success": False, + "wall_time": 0.0, + "error": None, + "peak_vram_gb": 0.0, + } + + dit_handler = None + llm_handler = None + + try: + print(f"\n Initializing DiT handler... 
({_get_vram_info_str()})") + dit_handler = AceStepHandler() + + # Determine flash attention availability + use_flash_attention = False + try: + import flash_attn # noqa: F401 + use_flash_attention = True + except ImportError: + pass + + # compile_model must be True when quantization is used; + # --tier-skip-compile can skip it for non-quantized tiers to save time + if quantization: + compile_model = True + elif args.tier_skip_compile: + compile_model = False + else: + compile_model = gpu_config.compile_model_default + + status_dit, success_dit = dit_handler.initialize_service( + project_root=PROJECT_ROOT, + config_path=args.config_path, + device="auto", + use_flash_attention=use_flash_attention, + compile_model=compile_model, + offload_to_cpu=offload, + offload_dit_to_cpu=offload_dit, + quantization=quantization, + ) + + if not success_dit: + result_entry["error"] = f"DiT init failed: {status_dit}" + print(f" āŒ DiT init failed: {status_dit}") + _cleanup_handlers(dit_handler, None) + return result_entry + + print(f" āœ… DiT ready ({_get_vram_info_str()})") + + llm_handler = LLMHandler() + + if use_lm: + print(f" Initializing LLM handler (backend={lm_backend})... ({_get_vram_info_str()})") + status_llm, success_llm = llm_handler.initialize( + checkpoint_dir=checkpoint_dir, + lm_model_path=lm_model, + backend=lm_backend, + device="auto", + offload_to_cpu=offload, + dtype=None, + ) + if success_llm: + print(f" āœ… LLM ready ({_get_vram_info_str()})") + else: + print(f" āš ļø LLM init failed: {status_llm}") + use_lm = False + result_entry["use_lm"] = False + result_entry["error"] = f"LM init failed (non-fatal): {status_llm}" + + result_entry["init_success"] = True + + except torch.cuda.OutOfMemoryError as e: + result_entry["error"] = f"Init OOM: {e}" + print(f" āŒ Init OOM: {e}") + _cleanup_handlers(dit_handler, llm_handler) + return result_entry + except Exception as e: + result_entry["error"] = f"Init exception: {e}" + print(f" āŒ Init exception: {e}") + traceback.print_exc() + _cleanup_handlers(dit_handler, llm_handler) + return result_entry + + # Run generation + try: + print(f"\n Running generation... ({_get_vram_info_str()})") + save_dir = tempfile.mkdtemp(prefix=f"acestep_tier{int(sim_gb)}_{test_variant}_") + + params = GenerationParams( + caption=example_data.get("caption", ""), + lyrics=example_data.get("lyrics", ""), + bpm=example_data.get("bpm"), + keyscale=example_data.get("keyscale", ""), + timesignature=example_data.get("timesignature", ""), + vocal_language=example_data.get("language", "unknown"), + duration=test_duration, + thinking=use_lm, + use_cot_metas=use_lm, + use_cot_caption=False, + use_cot_language=False, + use_constrained_decoding=True, + inference_steps=8, + seed=42, + lm_temperature=0.85, + lm_cfg_scale=2.0, + guidance_scale=7.0, + ) + config = GenerationConfig( + batch_size=batch_size, + seeds=[42 + j for j in range(batch_size)], + use_random_seed=False, + audio_format="flac", + ) + + # When testing batch boundaries, temporarily override the GPU tier config's + # max_batch limits so that inference.py's clamping doesn't reduce our test + # batch size. We restore the original values after the test. 
+ _patched_tier_config = False + _orig_batch_with_lm = None + _orig_batch_without_lm = None + if batch_size_override is not None and batch_size_override > 1: + from acestep.gpu_config import GPU_TIER_CONFIGS as _tier_configs + tier = gpu_config.tier + if tier in _tier_configs: + _patched_tier_config = True + _orig_batch_with_lm = _tier_configs[tier]["max_batch_size_with_lm"] + _orig_batch_without_lm = _tier_configs[tier]["max_batch_size_without_lm"] + _tier_configs[tier]["max_batch_size_with_lm"] = max(batch_size_override, _orig_batch_with_lm) + _tier_configs[tier]["max_batch_size_without_lm"] = max(batch_size_override, _orig_batch_without_lm) + + t0 = time.perf_counter() + try: + result = generate_music( + dit_handler, llm_handler, params, config, save_dir=save_dir + ) + finally: + # Restore original tier config values + if _patched_tier_config: + _tier_configs[tier]["max_batch_size_with_lm"] = _orig_batch_with_lm + _tier_configs[tier]["max_batch_size_without_lm"] = _orig_batch_without_lm + wall_time = time.perf_counter() - t0 + + result_entry["wall_time"] = wall_time + result_entry["gen_success"] = result.success + + if result.success: + tc = result.extra_outputs.get("time_costs", {}) + result_entry["lm_time"] = tc.get("lm_total_time", 0.0) + result_entry["dit_time"] = tc.get("dit_total_time_cost", 0.0) + result_entry["vae_time"] = tc.get("dit_vae_decode_time_cost", 0.0) + n_audios = len(result.audios) + print(f" āœ… [{test_variant}] Generation OK: {n_audios} audio(s) in {wall_time:.1f}s") + else: + result_entry["error"] = result.error + print(f" āŒ [{test_variant}] Generation FAILED: {result.error}") + + _cleanup_dir(save_dir) + + except torch.cuda.OutOfMemoryError as e: + result_entry["error"] = f"OOM: {e}" + print(f" āŒ [{test_variant}] OOM ERROR: {e}") + except Exception as e: + result_entry["error"] = f"Generation exception: {e}" + print(f" āŒ [{test_variant}] Exception: {e}") + traceback.print_exc() + + # Record peak VRAM + if torch.cuda.is_available(): + peak_bytes = torch.cuda.max_memory_allocated() + result_entry["peak_vram_gb"] = peak_bytes / (1024 ** 3) + print(f" Peak VRAM: {result_entry['peak_vram_gb']:.2f}GB") + + # Cleanup + _cleanup_handlers(dit_handler, llm_handler) + + return result_entry + + +def run_tier_test_mode(args): + """ + Automatically test inference across multiple simulated GPU tiers. + + For each tier: + 1. Set MAX_CUDA_VRAM to simulate the VRAM limit + 2. Initialize gpu_config for that tier + 3. Initialize DiT + (optionally) LLM handlers with tier-appropriate settings + 4. Run a short generation and verify it completes without OOM + 5. 
Report results + + When --tier-boundary is enabled, each tier is tested with up to 3 configurations: + - default: tier's default settings (quantization + offload as configured) + - no-quant: same as default but with quantization disabled + - no-offload: no quantization AND no CPU offload (all models on GPU) + + This replaces the manual workflow of: + MAX_CUDA_VRAM=8 uv run acestep → click UI → wait → check + """ + # Determine which tiers to test + default_tiers = [4, 6, 8, 12, 16, 24, 48] + tiers_to_test = args.tiers if args.tiers else default_tiers + + # Load example for generation + example_file = os.path.join( + PROJECT_ROOT, "examples", "text2music", args.example + ) + if not os.path.exists(example_file): + print(f"\n Example not found: {example_file}") + sys.exit(1) + + with open(example_file, "r", encoding="utf-8") as f: + example_data = json.load(f) + + # Scan available LM models on disk + checkpoint_dir = os.path.join(PROJECT_ROOT, "checkpoints") + disk_lm_models = [] + if os.path.exists(checkpoint_dir): + for item in sorted(os.listdir(checkpoint_dir)): + if os.path.isdir(os.path.join(checkpoint_dir, item)) and item.startswith("acestep-5Hz-lm-"): + disk_lm_models.append(item) + + boundary_mode = getattr(args, "tier_boundary", False) + batch_boundary_mode = getattr(args, "tier_batch_boundary", False) + + print(f"\n Tiers to test: {tiers_to_test}") + print(f" LM models on disk: {disk_lm_models}") + print(f" Test with LM: {args.tier_with_lm}") + print(f" Test duration: {args.tier_duration}s") + print(f" Boundary testing: {boundary_mode}") + print(f" Batch boundary testing: {batch_boundary_mode}") + print(f" Example: {args.example}") + + # Results collector + all_results = [] + + for sim_gb in tiers_to_test: + print("\n" + "=" * 120) + print(f" TIER TEST: {sim_gb}GB simulated VRAM") + print("=" * 120) + + # Configure GPU simulation + os.environ["MAX_CUDA_VRAM"] = str(sim_gb) + + # Force re-detection of GPU config + gpu_config = get_gpu_config(gpu_memory_gb=float(sim_gb)) + set_global_gpu_config(gpu_config) + + tier = gpu_config.tier + print(f" Tier: {tier}") + print(f" init_lm_default: {gpu_config.init_lm_default}") + print(f" available_lm_models: {gpu_config.available_lm_models}") + print(f" recommended_lm_model: {gpu_config.recommended_lm_model}") + print(f" recommended_backend: {gpu_config.recommended_backend}") + print(f" lm_backend_restriction: {gpu_config.lm_backend_restriction}") + print(f" offload_to_cpu: {gpu_config.offload_to_cpu_default}") + print(f" offload_dit_to_cpu: {gpu_config.offload_dit_to_cpu_default}") + print(f" quantization: {gpu_config.quantization_default}") + print(f" max_duration_with_lm: {gpu_config.max_duration_with_lm}s") + print(f" max_duration_without_lm: {gpu_config.max_duration_without_lm}s") + print(f" max_batch_with_lm: {gpu_config.max_batch_size_with_lm}") + print(f" max_batch_without_lm: {gpu_config.max_batch_size_without_lm}") + + # ---- Test 1: Default configuration ---- + print(f"\n --- Variant: default ---") + result_default = _run_single_tier_test( + sim_gb, gpu_config, args, example_data, + checkpoint_dir, disk_lm_models, + test_variant="default", + ) + all_results.append(result_default) + + if boundary_mode: + # ---- Test 2: No quantization (keep offload as default) ---- + # Skip if the tier already doesn't use quantization (no point re-testing) + if gpu_config.quantization_default: + print(f"\n --- Variant: no-quant (offload={gpu_config.offload_to_cpu_default}) ---") + result_no_quant = _run_single_tier_test( + sim_gb, gpu_config, args, 
example_data, + checkpoint_dir, disk_lm_models, + quantization_override=None, + test_variant="no-quant", + ) + all_results.append(result_no_quant) + else: + print(f"\n --- Variant: no-quant — SKIPPED (tier already has quantization=False) ---") + + # ---- Test 3: No quantization AND no offload ---- + # Skip if the tier already has both disabled + # Also skip if simulated VRAM is too small — the unquantized DiT model + # alone needs ~6GB; without offload there is no room left for VAE decode, + # which causes a fallback to CPU VAE with tiny chunk_size and 20+ hour runs. + MIN_VRAM_FOR_NO_OFFLOAD = 8 # GB — DiT (~6GB) + VAE headroom (~2GB) + if sim_gb < MIN_VRAM_FOR_NO_OFFLOAD: + print(f"\n --- Variant: no-offload — SKIPPED (simulated {sim_gb}GB < {MIN_VRAM_FOR_NO_OFFLOAD}GB minimum for no-offload) ---") + elif gpu_config.quantization_default or gpu_config.offload_to_cpu_default: + print(f"\n --- Variant: no-offload (quant=None, offload=False) ---") + result_no_offload = _run_single_tier_test( + sim_gb, gpu_config, args, example_data, + checkpoint_dir, disk_lm_models, + offload_override=False, + offload_dit_override=False, + quantization_override=None, + test_variant="no-offload", + ) + all_results.append(result_no_offload) + else: + print(f"\n --- Variant: no-offload — SKIPPED (tier already has offload=False, quant=False) ---") + + if batch_boundary_mode: + # ---- Batch boundary tests: escalate batch size until OOM ---- + BATCH_SIZES_TO_TEST = [1, 2, 4, 8] + + # Test WITHOUT LM + print(f"\n --- Batch boundary: without LM ---") + for bs in BATCH_SIZES_TO_TEST: + print(f"\n --- Variant: batch-noLM-{bs} (batch_size={bs}, no LM) ---") + result_batch = _run_single_tier_test( + sim_gb, gpu_config, args, example_data, + checkpoint_dir, disk_lm_models, + test_variant=f"batch-noLM-{bs}", + batch_size_override=bs, + use_lm_override=False, + ) + all_results.append(result_batch) + if not result_batch["gen_success"]: + print(f" āš ļø Batch size {bs} failed without LM — stopping escalation") + break + + # Test WITH LM (if tier supports it) + if gpu_config.init_lm_default and bool(gpu_config.available_lm_models): + print(f"\n --- Batch boundary: with LM ---") + for bs in BATCH_SIZES_TO_TEST: + print(f"\n --- Variant: batch-LM-{bs} (batch_size={bs}, with LM) ---") + result_batch_lm = _run_single_tier_test( + sim_gb, gpu_config, args, example_data, + checkpoint_dir, disk_lm_models, + test_variant=f"batch-LM-{bs}", + batch_size_override=bs, + use_lm_override=True, + ) + all_results.append(result_batch_lm) + if not result_batch_lm["gen_success"]: + print(f" āš ļø Batch size {bs} failed with LM — stopping escalation") + break + + # ---- Print summary ---- + _print_tier_test_summary(all_results) + + if boundary_mode: + _print_boundary_summary(all_results) + + if batch_boundary_mode: + _print_batch_boundary_summary(all_results) + + # Save results + if args.benchmark_output: + with open(args.benchmark_output, "w", encoding="utf-8") as f: + json.dump(all_results, f, indent=2, default=str) + print(f"\n Results saved to: {args.benchmark_output}") + + return all_results + + +def _cleanup_handlers(dit_handler, llm_handler): + """Clean up handlers and free GPU memory.""" + try: + if dit_handler is not None: + if hasattr(dit_handler, 'model') and dit_handler.model is not None: + dit_handler.model = None + if hasattr(dit_handler, 'vae') and dit_handler.vae is not None: + dit_handler.vae = None + if hasattr(dit_handler, 'text_encoder') and dit_handler.text_encoder is not None: + dit_handler.text_encoder = None + del 
dit_handler + except Exception: + pass + + try: + if llm_handler is not None: + if hasattr(llm_handler, 'llm') and llm_handler.llm is not None: + llm_handler.llm = None + del llm_handler + except Exception: + pass + + import gc + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + + +def _print_tier_test_summary(results: List[Dict]): + """Print a summary table of all tier test results.""" + # Detect if any result has a test_variant (boundary mode) + has_variants = any(r.get("test_variant", "default") != "default" for r in results) + + print("\n" + "=" * 160) + print("TIER TEST SUMMARY") + print("=" * 160) + + if has_variants: + header = ( + f"{'VRAM':>6} {'Tier':<10} {'Variant':<12} {'LM':>4} {'LM Model':<24} {'Backend':<8} " + f"{'Offload':<8} {'Quant':<6} {'Init':>5} {'Gen':>5} " + f"{'Wall(s)':>8} {'Peak(GB)':>9} {'Status':<30}" + ) + else: + header = ( + f"{'VRAM':>6} {'Tier':<10} {'LM':>4} {'LM Model':<28} {'Backend':<8} " + f"{'Offload':<8} {'Quant':<6} {'Init':>5} {'Gen':>5} " + f"{'Wall(s)':>8} {'Peak(GB)':>9} {'Status':<30}" + ) + print(header) + print("-" * 160) + + pass_count = 0 + fail_count = 0 + + for r in results: + lm_model_short = (r.get("lm_model") or "-") + max_lm_len = 22 if has_variants else 26 + if len(lm_model_short) > max_lm_len: + lm_model_short = lm_model_short[:max_lm_len] + ".." + + init_ok = "āœ…" if r["init_success"] else "āŒ" + gen_ok = "āœ…" if r["gen_success"] else "āŒ" + status = "PASS" if r["gen_success"] else (r.get("error", "FAIL") or "FAIL") + if len(status) > 28: + status = status[:28] + ".." + + if r["gen_success"]: + pass_count += 1 + else: + fail_count += 1 + + quant = "int8" if r.get("quantization") else "-" + variant = r.get("test_variant", "default") + + if has_variants: + print( + f"{r['tier_gb']:5d}GB {r['tier']:<10} {variant:<12} " + f"{'Y' if r['use_lm'] else 'N':>4} {lm_model_short:<24} " + f"{r.get('lm_backend', '-'):<8} " + f"{'Y' if r['offload'] else 'N':<8} {quant:<6} " + f"{init_ok:>5} {gen_ok:>5} " + f"{r['wall_time']:>8.1f} {r.get('peak_vram_gb', 0):>9.2f} " + f"{status:<30}" + ) + else: + print( + f"{r['tier_gb']:5d}GB {r['tier']:<10} " + f"{'Y' if r['use_lm'] else 'N':>4} {lm_model_short:<28} " + f"{r.get('lm_backend', '-'):<8} " + f"{'Y' if r['offload'] else 'N':<8} {quant:<6} " + f"{init_ok:>5} {gen_ok:>5} " + f"{r['wall_time']:>8.1f} {r.get('peak_vram_gb', 0):>9.2f} " + f"{status:<30}" + ) + + print("-" * 160) + print(f" Total: {len(results)} tests run, {pass_count} PASSED, {fail_count} FAILED") + + +def _print_boundary_summary(results: List[Dict]): + """ + Print a boundary analysis summary showing the minimum tier for each capability. 
+ + Analyzes results from boundary testing to determine: + - Minimum tier that works WITHOUT INT8 quantization + - Minimum tier that works WITHOUT CPU offload (and without quantization) + """ + print("\n" + "=" * 100) + print("BOUNDARY ANALYSIS") + print("=" * 100) + print() + print(" This analysis shows the minimum VRAM tier at which each optimization") + print(" can be safely disabled while still completing inference successfully.") + print() + + # Collect results by variant + no_quant_results = [r for r in results if r.get("test_variant") == "no-quant"] + no_offload_results = [r for r in results if r.get("test_variant") == "no-offload"] + default_results = [r for r in results if r.get("test_variant") == "default"] + + # Also consider default results where the tier already has quant/offload disabled + # (e.g., tier6b default already has quantization=False) + for r in default_results: + if not r.get("quantization") and r not in no_quant_results: + # This tier's default already runs without quantization + no_quant_results.append(r) + if not r.get("offload") and not r.get("quantization") and r not in no_offload_results: + # This tier's default already runs without offload and without quantization + no_offload_results.append(r) + + # Sort by VRAM + no_quant_results.sort(key=lambda r: r["tier_gb"]) + no_offload_results.sort(key=lambda r: r["tier_gb"]) + + # Find minimum passing tier for each capability + def _find_min_passing(result_list, capability_name): + passing = [r for r in result_list if r.get("gen_success")] + failing = [r for r in result_list if not r.get("gen_success")] + + if passing: + min_pass = passing[0] + print(f" {capability_name}:") + print(f" Minimum tier: {min_pass['tier']} ({min_pass['tier_gb']}GB)") + print(f" Peak VRAM: {min_pass.get('peak_vram_gb', 0):.2f}GB") + if failing: + max_fail = failing[-1] + print(f" Last failure: {max_fail['tier']} ({max_fail['tier_gb']}GB) — {max_fail.get('error', 'unknown')[:60]}") + else: + if failing: + print(f" {capability_name}:") + print(f" āŒ No tier passed this test. 
All tested tiers failed.") + for r in failing: + err = (r.get("error") or "unknown")[:50] + print(f" {r['tier_gb']}GB ({r['tier']}): {err}") + else: + print(f" {capability_name}:") + print(f" āš ļø No test results available for this capability.") + print() + return passing[0] if passing else None + + min_no_quant = _find_min_passing(no_quant_results, "Without INT8 Quantization") + min_no_offload = _find_min_passing(no_offload_results, "Without CPU Offload (and no quantization)") + + # Print compact summary table + print(" " + "-" * 60) + print(f" {'Capability':<45} {'Min Tier':<10} {'VRAM':>6}") + print(" " + "-" * 60) + + if min_no_quant: + print(f" {'No INT8 Quantization':<45} {min_no_quant['tier']:<10} {min_no_quant['tier_gb']:>5}GB") + else: + print(f" {'No INT8 Quantization':<45} {'N/A':<10} {'N/A':>6}") + + if min_no_offload: + print(f" {'No CPU Offload (all models on GPU)':<45} {min_no_offload['tier']:<10} {min_no_offload['tier_gb']:>5}GB") + else: + print(f" {'No CPU Offload (all models on GPU)':<45} {'N/A':<10} {'N/A':>6}") + + print(" " + "-" * 60) + print() + print(" Note: These boundaries are empirical and may vary based on:") + print(" - DiT model variant (turbo vs base)") + print(" - Whether LM is enabled (--tier-with-lm)") + print(" - Generation duration and batch size") + print(" - Flash attention availability") + + +def _print_batch_boundary_summary(results: List[Dict]): + """ + Print a batch boundary analysis summary showing the maximum safe batch size per tier. + + Analyzes results from batch boundary testing to determine: + - Maximum batch size WITHOUT LM for each tier + - Maximum batch size WITH LM for each tier + """ + print("\n" + "=" * 120) + print("BATCH BOUNDARY ANALYSIS") + print("=" * 120) + print() + print(" This analysis shows the maximum batch size that completed successfully") + print(" for each simulated VRAM tier.") + print() + + # Collect batch boundary results + batch_no_lm = [r for r in results if r.get("test_variant", "").startswith("batch-noLM-")] + batch_with_lm = [r for r in results if r.get("test_variant", "").startswith("batch-LM-")] + + # Group by tier_gb + def _group_by_tier(result_list): + groups = {} + for r in result_list: + tier_gb = r["tier_gb"] + if tier_gb not in groups: + groups[tier_gb] = {"tier": r["tier"], "results": []} + groups[tier_gb]["results"].append(r) + return groups + + no_lm_groups = _group_by_tier(batch_no_lm) + with_lm_groups = _group_by_tier(batch_with_lm) + + # Find max passing batch per tier + def _max_passing_batch(group_results): + max_bs = 0 + peak_vram = 0.0 + for r in group_results: + if r.get("gen_success"): + bs = r.get("batch_size", 1) + if bs > max_bs: + max_bs = bs + peak_vram = r.get("peak_vram_gb", 0) + return max_bs, peak_vram + + # Collect all tier_gb values + all_tier_gbs = sorted(set(list(no_lm_groups.keys()) + list(with_lm_groups.keys()))) + + # Print table + print(f" {'VRAM':>6} {'Tier':<12} {'Max Batch (no LM)':>18} {'Peak VRAM':>10} {'Max Batch (with LM)':>20} {'Peak VRAM':>10}") + print(" " + "-" * 90) + + summary_rows = [] + for tier_gb in all_tier_gbs: + tier_name = no_lm_groups.get(tier_gb, with_lm_groups.get(tier_gb, {})).get("tier", "?") + + no_lm_max, no_lm_peak = (0, 0.0) + if tier_gb in no_lm_groups: + no_lm_max, no_lm_peak = _max_passing_batch(no_lm_groups[tier_gb]["results"]) + + with_lm_max, with_lm_peak = (0, 0.0) + if tier_gb in with_lm_groups: + with_lm_max, with_lm_peak = _max_passing_batch(with_lm_groups[tier_gb]["results"]) + + no_lm_str = str(no_lm_max) if no_lm_max > 0 else 
"FAIL" + with_lm_str = str(with_lm_max) if with_lm_max > 0 else ("N/A" if tier_gb not in with_lm_groups else "FAIL") + + no_lm_peak_str = f"{no_lm_peak:.2f}GB" if no_lm_max > 0 else "-" + with_lm_peak_str = f"{with_lm_peak:.2f}GB" if with_lm_max > 0 else "-" + + print( + f" {tier_gb:5d}GB {tier_name:<12} {no_lm_str:>18} {no_lm_peak_str:>10} " + f"{with_lm_str:>20} {with_lm_peak_str:>10}" + ) + + summary_rows.append({ + "tier_gb": tier_gb, + "tier": tier_name, + "max_batch_no_lm": no_lm_max, + "max_batch_with_lm": with_lm_max if tier_gb in with_lm_groups else None, + }) + + print(" " + "-" * 90) + print() + + # Print comparison with current GPU_TIER_CONFIGS + print(" Comparison with current GPU_TIER_CONFIGS:") + print(f" {'VRAM':>6} {'Tier':<12} {'Config (no LM)':>15} {'Tested (no LM)':>15} {'Config (LM)':>12} {'Tested (LM)':>12} {'Recommendation':<30}") + print(" " + "-" * 110) + + for row in summary_rows: + tier_gb = row["tier_gb"] + tier_name = row["tier"] + cfg = get_gpu_config(gpu_memory_gb=float(tier_gb)) + + cfg_no_lm = cfg.max_batch_size_without_lm + cfg_with_lm = cfg.max_batch_size_with_lm + tested_no_lm = row["max_batch_no_lm"] + tested_with_lm = row["max_batch_with_lm"] + + tested_no_lm_str = str(tested_no_lm) if tested_no_lm > 0 else "FAIL" + tested_with_lm_str = str(tested_with_lm) if tested_with_lm is not None and tested_with_lm > 0 else ("N/A" if tested_with_lm is None else "FAIL") + + # Recommendation + rec_parts = [] + if tested_no_lm > 0 and tested_no_lm != cfg_no_lm: + rec_parts.append(f"no_lm: {cfg_no_lm}→{tested_no_lm}") + if tested_with_lm is not None and tested_with_lm > 0 and tested_with_lm != cfg_with_lm: + rec_parts.append(f"lm: {cfg_with_lm}→{tested_with_lm}") + recommendation = ", ".join(rec_parts) if rec_parts else "OK" + + print( + f" {tier_gb:5d}GB {tier_name:<12} {cfg_no_lm:>15} {tested_no_lm_str:>15} " + f"{cfg_with_lm:>12} {tested_with_lm_str:>12} {recommendation:<30}" + ) + + print(" " + "-" * 110) + print() + print(" Note: Batch boundary results are empirical and depend on:") + print(" - DiT model variant (turbo vs base)") + print(" - Generation duration (longer = more VRAM per batch)") + print(" - Flash attention availability") + print(" - LM model size (0.6B vs 1.7B vs 4B)") + print(" - Quantization and offload settings") + + # ============================================================================= # Mode: understand # ============================================================================= @@ -763,7 +1583,7 @@ def run_create_sample_mode( print(f"\n Query: {query}") print(f" Instrumental: {args.instrumental}") - timer.sync() + timer.sync() t0 = time.perf_counter() result = create_sample( @@ -826,7 +1646,7 @@ def run_format_sample_mode( print(f"\n Caption: {caption[:80]}...") print(f" Lyrics: {lyrics[:80]}...") - timer.sync() + timer.sync() t0 = time.perf_counter() result = format_sample( @@ -864,23 +1684,23 @@ def run_format_sample_mode( def _print_cprofile(prof): """Print cProfile results and save to file.""" - import pstats - import io - - output_file = "profile_cprofile_detailed.txt" + import pstats + import io + + output_file = "profile_cprofile_detailed.txt" with open(output_file, "w") as f: - ps = pstats.Stats(prof, stream=f) + ps = pstats.Stats(prof, stream=f) ps.sort_stats("cumulative") - ps.print_stats(100) - - print("\n" + "=" * 100) + ps.print_stats(100) + + print("\n" + "=" * 100) print("TOP 20 FUNCTIONS BY CUMULATIVE TIME (cProfile)") - print("=" * 100) - s = io.StringIO() - ps = pstats.Stats(prof, stream=s) + print("=" * 100) + 
s = io.StringIO() + ps = pstats.Stats(prof, stream=s) ps.sort_stats("cumulative") - ps.print_stats(20) - print(s.getvalue()) + ps.print_stats(20) + print(s.getvalue()) print(f"Full report saved to: {output_file}") @@ -888,14 +1708,13 @@ def _cleanup_dir(path: str): """Remove temporary directory silently.""" try: import shutil - shutil.rmtree(path, ignore_errors=True) except Exception: pass # ============================================================================= -# Handler initialization +# Handler initialization (for non-tier-test modes) # ============================================================================= @@ -911,7 +1730,6 @@ def initialize_handlers( if device.startswith("cuda"): try: import flash_attn # noqa: F401 - use_flash_attention = True except ImportError: pass @@ -974,7 +1792,7 @@ def initialize_handlers( def build_parser() -> argparse.ArgumentParser: """Build the argument parser with all options.""" env_config = load_env_config() - + parser = argparse.ArgumentParser( description="ACE-Step 1.5 Inference Profiler & Benchmark", formatter_class=argparse.RawDescriptionHelpFormatter, @@ -983,6 +1801,9 @@ def build_parser() -> argparse.ArgumentParser: python profile_inference.py # Profile text2music python profile_inference.py --thinking --llm-debug # With LLM analysis python profile_inference.py --mode benchmark # Benchmark matrix + python profile_inference.py --mode tier-test # Test all GPU tiers + python profile_inference.py --mode tier-test --tiers 6 8 16 # Test specific tiers + python profile_inference.py --mode tier-test --tier-with-lm # Test tiers with LM python profile_inference.py --mode understand # Profile understand API python profile_inference.py --mode create_sample --sample-query "jazz ballad" python profile_inference.py --device mps --lm-backend mlx # Apple Silicon @@ -998,6 +1819,7 @@ def build_parser() -> argparse.ArgumentParser: choices=[ "profile", "benchmark", + "tier-test", "understand", "create_sample", "format_sample", @@ -1203,6 +2025,44 @@ def build_parser() -> argparse.ArgumentParser: help="Save benchmark results to JSON file", ) + # Tier-test options + parser.add_argument( + "--tiers", + type=int, + nargs="+", + default=None, + help="Specific VRAM tiers to test (e.g., --tiers 6 8 16). Default: all tiers", + ) + parser.add_argument( + "--tier-with-lm", + action="store_true", + help="Enable LM for tiers that support it (default: DiT-only test)", + ) + parser.add_argument( + "--tier-duration", + type=float, + default=240, + help="Test generation duration in seconds for tier-test (default: 240)", + ) + parser.add_argument( + "--tier-skip-compile", + action="store_true", + help="Skip torch.compile for non-quantized tiers (faster testing, less realistic)", + ) + parser.add_argument( + "--tier-boundary", + action="store_true", + help="Enable boundary testing: for each tier, also test without INT8 quantization " + "and without CPU offload to find the minimum VRAM tier for each capability", + ) + parser.add_argument( + "--tier-batch-boundary", + action="store_true", + help="Enable batch size boundary testing: for each tier, progressively test " + "batch sizes 1, 2, 4, 8 (stop at first OOM) to find the maximum safe batch " + "size. 
Tests both with-LM and without-LM configurations.", + ) + # create_sample / understand options parser.add_argument( "--sample-query", @@ -1238,6 +2098,17 @@ def main(): if args.no_constrained_decoding: args.use_constrained_decoding = False + # Tier-test mode has its own initialization flow + if args.mode == "tier-test": + print("=" * 120) + print("ACE-Step 1.5 Tier Compatibility Test") + print("=" * 120) + run_tier_test_mode(args) + print("\n" + "=" * 120) + print("DONE") + print("=" * 120) + return + # Resolve device device = resolve_device(args.device) @@ -1252,7 +2123,7 @@ def main(): # Auto-enable offload for small GPUs if ( gpu_config.gpu_memory_gb > 0 - and gpu_config.gpu_memory_gb < 16 + and gpu_config.gpu_memory_gb < VRAM_AUTO_OFFLOAD_THRESHOLD_GB and not args.offload_to_cpu ): args.offload_to_cpu = True diff --git a/scripts/profile_vram.py b/scripts/profile_vram.py new file mode 100644 index 00000000..5c543845 --- /dev/null +++ b/scripts/profile_vram.py @@ -0,0 +1,536 @@ +#!/usr/bin/env python3 +""" +VRAM Profiling Script for ACE-Step 1.5 + +Measures actual GPU memory consumption of each model component at different +configurations. Results are used to calibrate the empirical VRAM constants +in gpu_config.py. + +Usage: + python scripts/profile_vram.py # Profile all components + python scripts/profile_vram.py --component dit # Profile DiT only + python scripts/profile_vram.py --component lm # Profile LM only + python scripts/profile_vram.py --component vae # Profile VAE only + python scripts/profile_vram.py --output results.json # Save results to JSON + +Requirements: + - CUDA GPU with sufficient memory + - All model checkpoints downloaded +""" + +import argparse +import gc +import json +import os +import sys +import time +from typing import Dict, Any, Optional, List + +# Add project root to path +PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) +if PROJECT_ROOT not in sys.path: + sys.path.insert(0, PROJECT_ROOT) + +import torch + + +def get_memory_stats() -> Dict[str, float]: + """Get current CUDA memory statistics in GB.""" + if not torch.cuda.is_available(): + return {"allocated": 0, "reserved": 0, "free": 0, "total": 0, "max_allocated": 0} + + allocated = torch.cuda.memory_allocated() / (1024**3) + reserved = torch.cuda.memory_reserved() / (1024**3) + free, total = torch.cuda.mem_get_info() + free_gb = free / (1024**3) + total_gb = total / (1024**3) + max_allocated = torch.cuda.max_memory_allocated() / (1024**3) + + return { + "allocated": round(allocated, 3), + "reserved": round(reserved, 3), + "free": round(free_gb, 3), + "total": round(total_gb, 3), + "max_allocated": round(max_allocated, 3), + } + + +def reset_memory(): + """Reset CUDA memory stats and free caches.""" + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + gc.collect() + torch.cuda.empty_cache() + # Wait for GPU to settle + torch.cuda.synchronize() + + +def measure_cuda_context() -> Dict[str, float]: + """Measure CUDA context overhead.""" + print("\n" + "=" * 60) + print("Measuring CUDA context overhead...") + print("=" * 60) + + reset_memory() + before = get_memory_stats() + + # Force CUDA context initialization + _ = torch.zeros(1, device="cuda") + del _ + torch.cuda.synchronize() + + after = get_memory_stats() + + context_overhead = after["total"] - after["free"] - before.get("allocated", 0) + + result = { + "cuda_context_gb": round(context_overhead, 3), + "total_gpu_gb": after["total"], + "free_after_context_gb": after["free"], + } 
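+    # Note: "total - free" from torch.cuda.mem_get_info() is device-wide, so this
+    # estimate also includes memory held by other processes and any cached blocks,
+    # not just this process's CUDA context.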
+ + print(f" CUDA context overhead: {result['cuda_context_gb']:.3f} GB") + print(f" Total GPU memory: {result['total_gpu_gb']:.3f} GB") + print(f" Free after context: {result['free_after_context_gb']:.3f} GB") + + return result + + +def profile_dit(checkpoint_dir: str, config_path: str = "acestep-v15-turbo") -> Dict[str, Any]: + """Profile DiT model memory consumption.""" + print("\n" + "=" * 60) + print(f"Profiling DiT model: {config_path}") + print("=" * 60) + + from transformers import AutoModel + + model_path = os.path.join(checkpoint_dir, config_path) + if not os.path.exists(model_path): + print(f" Model not found: {model_path}") + return {} + + reset_memory() + before = get_memory_stats() + + # Load model weights + print(" Loading DiT model weights...") + model = AutoModel.from_pretrained( + model_path, + trust_remote_code=True, + attn_implementation="sdpa", + dtype=torch.bfloat16, + ) + model = model.to("cuda").to(torch.bfloat16) + model.eval() + torch.cuda.synchronize() + + after_load = get_memory_stats() + weights_gb = after_load["allocated"] - before["allocated"] + + print(f" DiT model weights: {weights_gb:.3f} GB") + + # Load silence latent + silence_path = os.path.join(model_path, "silence_latent.pt") + silence_latent = None + if os.path.exists(silence_path): + silence_latent = torch.load(silence_path, weights_only=True).transpose(1, 2) + silence_latent = silence_latent.to("cuda").to(torch.bfloat16) + + # Determine if model has CFG (base vs turbo) + has_cfg = "turbo" not in config_path.lower() + + # Profile inference at different batch sizes and durations + inference_results = [] + + # Duration -> latent_length mapping: 48000 Hz audio, 5 Hz latent = 9600 audio samples per latent frame + # Actually: latent_length = ceil(duration * 5) for 5Hz models + durations = [60, 120, 240] + batch_sizes = [1, 2, 4] + + for duration in durations: + for batch_size in batch_sizes: + reset_memory() + torch.cuda.reset_peak_memory_stats() + + # Reload model to GPU if needed + model = model.to("cuda") + torch.cuda.synchronize() + + mem_before_inference = get_memory_stats() + + latent_length = int(duration * 5) # 5 Hz + latent_dim = 64 # Standard latent dim + + try: + with torch.inference_mode(): + # Simulate DiT inference inputs + # Create dummy latent noise + noise = torch.randn(batch_size, latent_length, latent_dim, device="cuda", dtype=torch.bfloat16) + + # Simulate text encoder output + text_hidden = torch.randn(batch_size, 512, 768, device="cuda", dtype=torch.bfloat16) + text_mask = torch.ones(batch_size, 512, device="cuda", dtype=torch.long) + + # If has CFG, double the batch for classifier-free guidance + if has_cfg: + noise_cfg = torch.cat([noise, noise], dim=0) + text_hidden_cfg = torch.cat([text_hidden, text_hidden], dim=0) + text_mask_cfg = torch.cat([text_mask, text_mask], dim=0) + del noise_cfg, text_hidden_cfg, text_mask_cfg + + del noise, text_hidden, text_mask + torch.cuda.synchronize() + + mem_after_inference = get_memory_stats() + peak_gb = mem_after_inference["max_allocated"] - mem_before_inference["allocated"] + + result_entry = { + "duration_s": duration, + "batch_size": batch_size, + "has_cfg": has_cfg, + "peak_inference_gb": round(peak_gb, 3), + "latent_length": latent_length, + } + inference_results.append(result_entry) + + print(f" batch={batch_size}, dur={duration}s: peak={peak_gb:.3f} GB (cfg={has_cfg})") + + except RuntimeError as e: + if "out of memory" in str(e).lower(): + print(f" batch={batch_size}, dur={duration}s: OOM") + inference_results.append({ + "duration_s": 
duration, + "batch_size": batch_size, + "has_cfg": has_cfg, + "peak_inference_gb": -1, + "error": "OOM", + }) + torch.cuda.empty_cache() + else: + raise + + # Cleanup + del model + if silence_latent is not None: + del silence_latent + torch.cuda.empty_cache() + gc.collect() + + return { + "config_path": config_path, + "weights_gb": round(weights_gb, 3), + "has_cfg": has_cfg, + "inference_results": inference_results, + } + + +def profile_vae(checkpoint_dir: str) -> Dict[str, Any]: + """Profile VAE model memory consumption.""" + print("\n" + "=" * 60) + print("Profiling VAE model") + print("=" * 60) + + from diffusers.models import AutoencoderOobleck + + vae_path = os.path.join(checkpoint_dir, "vae") + if not os.path.exists(vae_path): + print(f" VAE not found: {vae_path}") + return {} + + reset_memory() + before = get_memory_stats() + + # Load VAE + print(" Loading VAE model weights...") + vae = AutoencoderOobleck.from_pretrained(vae_path) + vae = vae.to("cuda").to(torch.float16) + vae.eval() + torch.cuda.synchronize() + + after_load = get_memory_stats() + weights_gb = after_load["allocated"] - before["allocated"] + + print(f" VAE model weights: {weights_gb:.3f} GB") + + # Profile decode at different chunk sizes + decode_results = [] + chunk_sizes = [256, 512, 1024] + + for chunk_size in chunk_sizes: + reset_memory() + torch.cuda.reset_peak_memory_stats() + + vae = vae.to("cuda") + torch.cuda.synchronize() + + mem_before = get_memory_stats() + + try: + with torch.inference_mode(): + # Simulate latent input: [batch=1, channels=64, length=chunk_size] + latent = torch.randn(1, 64, chunk_size, device="cuda", dtype=torch.float16) + decoder_output = vae.decode(latent) + audio = decoder_output.sample + del decoder_output, audio, latent + torch.cuda.synchronize() + + mem_after = get_memory_stats() + peak_gb = mem_after["max_allocated"] - mem_before["allocated"] + + decode_results.append({ + "chunk_size": chunk_size, + "peak_decode_gb": round(peak_gb, 3), + }) + print(f" chunk_size={chunk_size}: peak={peak_gb:.3f} GB") + + except RuntimeError as e: + if "out of memory" in str(e).lower(): + print(f" chunk_size={chunk_size}: OOM") + decode_results.append({ + "chunk_size": chunk_size, + "peak_decode_gb": -1, + "error": "OOM", + }) + torch.cuda.empty_cache() + else: + raise + + # Cleanup + del vae + torch.cuda.empty_cache() + gc.collect() + + return { + "weights_gb": round(weights_gb, 3), + "decode_results": decode_results, + } + + +def profile_text_encoder(checkpoint_dir: str) -> Dict[str, Any]: + """Profile text encoder memory consumption.""" + print("\n" + "=" * 60) + print("Profiling Text Encoder") + print("=" * 60) + + from transformers import AutoModel, AutoTokenizer + + encoder_path = os.path.join(checkpoint_dir, "text_encoder") + if not os.path.exists(encoder_path): + print(f" Text encoder not found: {encoder_path}") + return {} + + reset_memory() + before = get_memory_stats() + + # Load text encoder + print(" Loading text encoder weights...") + tokenizer = AutoTokenizer.from_pretrained(encoder_path) + model = AutoModel.from_pretrained(encoder_path) + model = model.to("cuda").to(torch.bfloat16) + model.eval() + torch.cuda.synchronize() + + after_load = get_memory_stats() + weights_gb = after_load["allocated"] - before["allocated"] + + print(f" Text encoder weights: {weights_gb:.3f} GB") + + # Cleanup + del model, tokenizer + torch.cuda.empty_cache() + gc.collect() + + return { + "weights_gb": round(weights_gb, 3), + } + + +def profile_lm(checkpoint_dir: str, lm_models: Optional[List[str]] = None) 
-> Dict[str, Any]:
+    """Profile LM model memory consumption."""
+    print("\n" + "=" * 60)
+    print("Profiling 5Hz LM models")
+    print("=" * 60)
+
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+
+    if lm_models is None:
+        # Auto-detect available LM models
+        lm_models = []
+        for name in os.listdir(checkpoint_dir):
+            if "5Hz-lm" in name and os.path.isdir(os.path.join(checkpoint_dir, name)):
+                lm_models.append(name)
+
+    if not lm_models:
+        print("  No LM models found")
+        return {}
+
+    lm_models.sort()
+    results = {}
+
+    for lm_name in lm_models:
+        lm_path = os.path.join(checkpoint_dir, lm_name)
+        if not os.path.exists(lm_path):
+            print(f"  LM model not found: {lm_path}")
+            continue
+
+        print(f"\n  Profiling LM: {lm_name}")
+
+        reset_memory()
+        before = get_memory_stats()
+
+        # Load model weights
+        print(f"    Loading model weights...")
+        model = AutoModelForCausalLM.from_pretrained(
+            lm_path,
+            torch_dtype=torch.bfloat16,
+            trust_remote_code=True,
+        )
+        model = model.to("cuda")
+        model.eval()
+        torch.cuda.synchronize()
+
+        after_load = get_memory_stats()
+        weights_gb = after_load["allocated"] - before["allocated"]
+
+        print(f"    Model weights: {weights_gb:.3f} GB")
+
+        # Estimate KV cache memory for different max_model_len values
+        # KV cache formula: 2 * num_layers * max_tokens * num_kv_heads * head_dim * dtype_size
+        config = model.config
+        num_layers = config.num_hidden_layers
+        num_kv_heads = getattr(config, "num_key_value_heads", config.num_attention_heads)
+        head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        dtype_size = 2  # bfloat16 = 2 bytes
+
+        kv_cache_estimates = {}
+        for max_len in [2048, 4096]:
+            # Per-token KV cache size
+            per_token_bytes = 2 * num_layers * num_kv_heads * head_dim * dtype_size
+            total_bytes = per_token_bytes * max_len
+            total_gb = total_bytes / (1024**3)
+            kv_cache_estimates[str(max_len)] = round(total_gb, 3)
+            print(f"      KV cache ({max_len} tokens): {total_gb:.3f} GB")
+
+        results[lm_name] = {
+            "weights_gb": round(weights_gb, 3),
+            "kv_cache_estimates": kv_cache_estimates,
+            "num_layers": num_layers,
+            "num_kv_heads": num_kv_heads,
+            "head_dim": head_dim,
+        }
+
+        # Cleanup
+        del model
+        torch.cuda.empty_cache()
+        gc.collect()
+
+    return results
+
+
+def main():
+    parser = argparse.ArgumentParser(description="VRAM Profiling for ACE-Step 1.5")
+    parser.add_argument("--component", type=str, default="all",
+                        choices=["all", "cuda_context", "dit", "vae", "text_encoder", "lm"],
+                        help="Component to profile (default: all)")
+    parser.add_argument("--checkpoint-dir", type=str, default=None,
+                        help="Checkpoint directory (default: auto-detect)")
+    parser.add_argument("--dit-config", type=str, default="acestep-v15-turbo",
+                        help="DiT model config name (default: acestep-v15-turbo)")
+    parser.add_argument("--lm-models", type=str, nargs="*", default=None,
+                        help="LM models to profile (default: auto-detect)")
+    parser.add_argument("--output", type=str, default=None,
+                        help="Output JSON file path")
+
+    args = parser.parse_args()
+
+    if not torch.cuda.is_available():
+        print("ERROR: CUDA is not available. 
This script requires a CUDA GPU.")
+        sys.exit(1)
+
+    # Auto-detect checkpoint directory
+    if args.checkpoint_dir is None:
+        args.checkpoint_dir = os.path.join(PROJECT_ROOT, "checkpoints")
+
+    if not os.path.exists(args.checkpoint_dir):
+        print(f"ERROR: Checkpoint directory not found: {args.checkpoint_dir}")
+        sys.exit(1)
+
+    device_name = torch.cuda.get_device_name(0)
+    total_mem = torch.cuda.get_device_properties(0).total_memory / (1024**3)
+
+    print("=" * 60)
+    print("ACE-Step 1.5 VRAM Profiler")
+    print("=" * 60)
+    print(f"  GPU: {device_name}")
+    print(f"  Total VRAM: {total_mem:.2f} GB")
+    print(f"  Checkpoint dir: {args.checkpoint_dir}")
+    print(f"  Component: {args.component}")
+
+    results = {
+        "gpu_name": device_name,
+        "total_vram_gb": round(total_mem, 3),
+        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+    }
+
+    components = [args.component] if args.component != "all" else [
+        "cuda_context", "dit", "vae", "text_encoder", "lm"
+    ]
+
+    for component in components:
+        if component == "cuda_context":
+            results["cuda_context"] = measure_cuda_context()
+        elif component == "dit":
+            results["dit"] = profile_dit(args.checkpoint_dir, args.dit_config)
+        elif component == "vae":
+            results["vae"] = profile_vae(args.checkpoint_dir)
+        elif component == "text_encoder":
+            results["text_encoder"] = profile_text_encoder(args.checkpoint_dir)
+        elif component == "lm":
+            results["lm"] = profile_lm(args.checkpoint_dir, args.lm_models)
+
+    # Print summary
+    print("\n" + "=" * 60)
+    print("SUMMARY")
+    print("=" * 60)
+
+    if "cuda_context" in results:
+        print(f"  CUDA context: {results['cuda_context'].get('cuda_context_gb', 'N/A')} GB")
+    if "dit" in results and results["dit"]:
+        print(f"  DiT weights ({results['dit'].get('config_path', '')}): {results['dit'].get('weights_gb', 'N/A')} GB")
+    if "vae" in results and results["vae"]:
+        print(f"  VAE weights: {results['vae'].get('weights_gb', 'N/A')} GB")
+    if "text_encoder" in results and results["text_encoder"]:
+        print(f"  Text encoder weights: {results['text_encoder'].get('weights_gb', 'N/A')} GB")
+    if "lm" in results and results["lm"]:
+        for lm_name, lm_data in results["lm"].items():
+            print(f"  LM {lm_name} weights: {lm_data.get('weights_gb', 'N/A')} GB")
+
+    # Calculate total base VRAM (all models loaded simultaneously)
+    base_total = 0
+    if "cuda_context" in results:
+        base_total += results["cuda_context"].get("cuda_context_gb", 0)
+    if "dit" in results and results["dit"]:
+        base_total += results["dit"].get("weights_gb", 0)
+    if "vae" in results and results["vae"]:
+        base_total += results["vae"].get("weights_gb", 0)
+    if "text_encoder" in results and results["text_encoder"]:
+        base_total += results["text_encoder"].get("weights_gb", 0)
+
+    print(f"\n  Base VRAM (DiT+VAE+TextEnc+CUDA): {base_total:.3f} GB")
+    print(f"  Remaining for LM + inference: {total_mem - base_total:.3f} GB")
+
+    # Save results
+    if args.output:
+        output_path = args.output
+    else:
+        output_path = os.path.join(PROJECT_ROOT, "scripts", "vram_profile_results.json")
+
+    # Only create the parent directory when one is given; os.path.dirname()
+    # returns "" for a bare filename (e.g. --output results.json), and
+    # os.makedirs("") raises FileNotFoundError.
+    output_dir = os.path.dirname(output_path)
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+    with open(output_path, "w") as f:
+        json.dump(results, f, indent=2)
+    print(f"\n  Results saved to: {output_path}")
+
+
+if __name__ == "__main__":
+    main()