README.md (11 additions & 8 deletions)
@@ -100,14 +100,17 @@ Open http://localhost:7860 (Gradio) or http://localhost:8001 (API).

### 💡 Which Model Should I Choose?

| Your GPU VRAM | Recommended LM Model | Notes |
|---------------|---------------------|-------|
| **≤6GB** | None (DiT only) | LM disabled by default to save memory |
| **6-12GB** | `acestep-5Hz-lm-0.6B` | Lightweight, good balance |
| **12-16GB** | `acestep-5Hz-lm-1.7B` | Better quality |
| **≥16GB** | `acestep-5Hz-lm-4B` | Best quality and audio understanding |

> 📖 GPU compatibility details: [English](./docs/en/GPU_COMPATIBILITY.md) | [中文](./docs/zh/GPU_COMPATIBILITY.md) | [日本語](./docs/ja/GPU_COMPATIBILITY.md)
| Your GPU VRAM | Recommended LM Model | Backend | Notes |
|---------------|---------------------|---------|-------|
| **≤6GB** | None (DiT only) | — | LM disabled by default; INT8 quantization + full CPU offload |
| **6-8GB** | `acestep-5Hz-lm-0.6B` | `pt` | Lightweight LM with PyTorch backend |
| **8-16GB** | `acestep-5Hz-lm-0.6B` / `1.7B` | `vllm` | 0.6B for 8-12GB, 1.7B for 12-16GB |
| **16-24GB** | `acestep-5Hz-lm-1.7B` | `vllm` | 4B available on 20GB+; no offload needed on 20GB+ |
| **≥24GB** | `acestep-5Hz-lm-4B` | `vllm` | Best quality, all models fit without offload |

The UI automatically selects the best configuration for your GPU. All settings (LM model, backend, offloading, quantization) are tier-aware and pre-configured.

> 📖 GPU compatibility details: [English](./docs/en/GPU_COMPATIBILITY.md) | [中文](./docs/zh/GPU_COMPATIBILITY.md) | [日本語](./docs/ja/GPU_COMPATIBILITY.md) | [한국어](./docs/ko/GPU_COMPATIBILITY.md)
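
As a rough illustration of the tier mapping in the table above, the sketch below picks an LM model and backend from detected VRAM. It is only a sketch: `recommend_lm_config` is a hypothetical helper, not part of this repo's API; the actual tier logic lives in `acestep/gpu_config.py` (see `get_gpu_config` in the diffs below).

```python
# Illustrative sketch of the VRAM -> model/backend mapping in the table above.
# recommend_lm_config is a hypothetical helper; the real logic lives in
# acestep/gpu_config.py (get_gpu_config).
import torch

def recommend_lm_config(vram_gb: float) -> dict:
    if vram_gb <= 6:
        return {"lm_model": None, "backend": None}  # DiT only, LM disabled
    if vram_gb <= 8:
        return {"lm_model": "acestep-5Hz-lm-0.6B", "backend": "pt"}
    if vram_gb <= 12:
        return {"lm_model": "acestep-5Hz-lm-0.6B", "backend": "vllm"}
    if vram_gb < 24:
        return {"lm_model": "acestep-5Hz-lm-1.7B", "backend": "vllm"}
    return {"lm_model": "acestep-5Hz-lm-4B", "backend": "vllm"}

# Read total VRAM of the first CUDA device, in GiB; 0.0 means CPU-only.
vram_gb = (
    torch.cuda.get_device_properties(0).total_memory / 1024**3
    if torch.cuda.is_available()
    else 0.0
)
print(recommend_lm_config(vram_gb))
```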

## 🚀 Launch Scripts

acestep/acestep_v15_pipeline.py (22 additions & 5 deletions)
@@ -36,7 +36,7 @@
from .llm_inference import LLMHandler
from .dataset_handler import DatasetHandler
from .gradio_ui import create_gradio_interface
from .gpu_config import get_gpu_config, get_gpu_memory_gb, print_gpu_config_info, set_global_gpu_config, VRAM_16GB_MIN_GB
from .gpu_config import get_gpu_config, get_gpu_memory_gb, print_gpu_config_info, set_global_gpu_config, VRAM_16GB_MIN_GB, VRAM_AUTO_OFFLOAD_THRESHOLD_GB
from .model_downloader import ensure_lm_model
except ImportError:
# When executed as a script: `python acestep/acestep_v15_pipeline.py`
@@ -47,7 +47,7 @@
from acestep.llm_inference import LLMHandler
from acestep.dataset_handler import DatasetHandler
from acestep.gradio_ui import create_gradio_interface
from acestep.gpu_config import get_gpu_config, get_gpu_memory_gb, print_gpu_config_info, set_global_gpu_config, VRAM_16GB_MIN_GB
from acestep.gpu_config import get_gpu_config, get_gpu_memory_gb, print_gpu_config_info, set_global_gpu_config, VRAM_16GB_MIN_GB, VRAM_AUTO_OFFLOAD_THRESHOLD_GB
from acestep.model_downloader import ensure_lm_model


@@ -93,7 +93,11 @@ def main():
set_global_gpu_config(gpu_config) # Set global config for use across modules

gpu_memory_gb = gpu_config.gpu_memory_gb
auto_offload = gpu_memory_gb > 0 and gpu_memory_gb < VRAM_16GB_MIN_GB
# Enable auto-offload for GPUs below 20 GB. 16 GB GPUs cannot hold all
# models simultaneously (DiT ~4.7 + VAE ~0.3 + text_enc ~1.2 + LM ≥1.2 +
# activations) so they *must* offload. The old threshold of 16 GB caused
# 16 GB GPUs to never offload, leading to OOM.
auto_offload = gpu_memory_gb > 0 and gpu_memory_gb < VRAM_AUTO_OFFLOAD_THRESHOLD_GB

# Print GPU configuration info
print(f"\n{'='*60}")
@@ -110,9 +110,9 @@
print(f"{'='*60}\n")

if auto_offload:
print(f"Auto-enabling CPU offload (GPU < 16GB)")
print(f"Auto-enabling CPU offload (GPU {gpu_memory_gb:.1f}GB < {VRAM_AUTO_OFFLOAD_THRESHOLD_GB}GB threshold)")
elif gpu_memory_gb > 0:
print(f"CPU offload disabled by default (GPU >= 16GB)")
print(f"CPU offload disabled by default (GPU {gpu_memory_gb:.1f}GB >= {VRAM_AUTO_OFFLOAD_THRESHOLD_GB}GB threshold)")
else:
print("No GPU detected, running on CPU")

@@ -205,6 +209,19 @@ def main():
args.offload_to_cpu = True
print(f"Auto-enabling CPU offload (4B LM model requires offloading on {gpu_memory_gb:.0f}GB GPU)")

# Safety: on 16GB GPUs, prevent selecting LM models that are too large.
# Even with offloading, a 4B LM (8 GB weights + KV cache) leaves almost no
# headroom for DiT activations on a 16 GB card.
if args.lm_model_path and 0 < gpu_memory_gb < VRAM_AUTO_OFFLOAD_THRESHOLD_GB:
if "4B" in args.lm_model_path:
# Downgrade to 1.7B if available
fallback = args.lm_model_path.replace("4B", "1.7B")
print(
f"WARNING: 4B LM model is too large for {gpu_memory_gb:.0f}GB GPU. "
f"Downgrading to 1.7B variant: {fallback}"
)
args.lm_model_path = fallback

try:
init_params = None
dit_handler = None
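
Taken together, the pipeline changes above amount to two startup decisions: enable CPU offload whenever the detected GPU falls below the new threshold, and refuse to keep a 4B LM on cards in that range. A condensed sketch, assuming `VRAM_AUTO_OFFLOAD_THRESHOLD_GB` resolves to 20 as the comment in the diff suggests; the function name is illustrative, not the repo's API:

```python
# Condensed restatement of the startup logic added above (sketch only; the
# 20 GB threshold value is taken from the comment in the diff).
VRAM_AUTO_OFFLOAD_THRESHOLD_GB = 20.0

def resolve_offload_and_lm(gpu_memory_gb: float, lm_model_path: str | None):
    """Return (offload_to_cpu, lm_model_path) for a detected GPU size."""
    auto_offload = 0 < gpu_memory_gb < VRAM_AUTO_OFFLOAD_THRESHOLD_GB
    if lm_model_path and auto_offload and "4B" in lm_model_path:
        # Even with offloading, an ~8 GB 4B LM leaves no headroom for DiT
        # activations on a 16 GB card, so fall back to the 1.7B variant.
        lm_model_path = lm_model_path.replace("4B", "1.7B")
    return auto_offload, lm_model_path

print(resolve_offload_and_lm(16.0, "acestep-5Hz-lm-4B"))
# -> (True, 'acestep-5Hz-lm-1.7B')
```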
acestep/api_server.py (2 additions & 1 deletion)
Expand Up @@ -68,6 +68,7 @@
is_lm_model_supported,
GPUConfig,
VRAM_16GB_MIN_GB,
VRAM_AUTO_OFFLOAD_THRESHOLD_GB,
)


@@ -1899,7 +1900,7 @@ async def _job_store_cleanup_worker() -> None:
app.state.gpu_config = gpu_config

gpu_memory_gb = gpu_config.gpu_memory_gb
auto_offload = gpu_memory_gb > 0 and gpu_memory_gb < VRAM_16GB_MIN_GB
auto_offload = gpu_memory_gb > 0 and gpu_memory_gb < VRAM_AUTO_OFFLOAD_THRESHOLD_GB

# Print GPU configuration info
print(f"\n{'='*60}")
acestep/dit_alignment_score.py (8 additions & 8 deletions)
@@ -834,16 +834,16 @@ def calculate_score(
Returns:
AlignmentScore object containing individual metrics and final score.
"""
# Ensure Inputs are Tensors on the correct device
# Ensure Inputs are Tensors.
# Always compute on CPU — the scoring matrices are small and this
# avoids occupying GPU VRAM that DiT / VAE / LM need. Keeping
# everything on CPU also prevents timeout issues on low-VRAM GPUs
# where the accelerator memory is fully committed to model weights.
_score_device = "cpu"
if not isinstance(energy_matrix, torch.Tensor):
# Use available accelerator device; fallback to CPU if none
if torch.cuda.is_available():
_score_device = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
_score_device = "mps"
else:
_score_device = "cpu"
energy_matrix = torch.tensor(energy_matrix, device=_score_device, dtype=torch.float32)
else:
energy_matrix = energy_matrix.to(device=_score_device, dtype=torch.float32)

device = energy_matrix.device
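
The scoring change above boils down to a single coercion step: whatever arrives (a NumPy array, a CUDA tensor, an MPS tensor), the score is computed from a CPU float32 copy. The matrices are small, so the round-trip is cheap, and it keeps VRAM free for the DiT, VAE, and LM weights, which is the rationale the diff's comment gives. A minimal sketch of that pattern, with an illustrative function name:

```python
# Minimal sketch of the CPU-only coercion the diff adopts for alignment
# scoring; to_cpu_score_tensor is an illustrative name, not the repo's API.
import numpy as np
import torch

def to_cpu_score_tensor(energy_matrix) -> torch.Tensor:
    if not isinstance(energy_matrix, torch.Tensor):
        return torch.tensor(energy_matrix, device="cpu", dtype=torch.float32)
    return energy_matrix.to(device="cpu", dtype=torch.float32)

print(to_cpu_score_tensor(np.ones((4, 4))).device)   # cpu
print(to_cpu_score_tensor(torch.ones(4, 4)).device)  # cpu
```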
