Merged
107 changes: 99 additions & 8 deletions acestep/gpu_config.py
@@ -111,7 +111,7 @@ class GPUConfig:
recommended_lm_model: str # Recommended default LM model path (empty if LM not available)

# LM backend restriction
# "all" = any backend, "pt_mlx_only" = only pt/mlx (no vllm), used for very low VRAM
# "all" = any backend, "pt_mlx_only" = only pt/mlx (no vllm), used for MPS (vllm requires CUDA)
lm_backend_restriction: str # "all" or "pt_mlx_only"
recommended_backend: str # Recommended default backend: "vllm", "pt", or "mlx"

@@ -142,8 +142,8 @@ class GPUConfig:
"init_lm_default": False,
"available_lm_models": [],
"recommended_lm_model": "",
"lm_backend_restriction": "pt_mlx_only", # vllm KV cache won't fit
"recommended_backend": "pt",
"lm_backend_restriction": "all",
"recommended_backend": "vllm",
"offload_to_cpu_default": True,
"offload_dit_to_cpu_default": True,
"quantization_default": True, # INT8 essential to fit DiT in ~4GB
@@ -161,8 +161,8 @@ class GPUConfig:
"init_lm_default": False,
"available_lm_models": [],
"recommended_lm_model": "",
"lm_backend_restriction": "pt_mlx_only",
"recommended_backend": "pt",
"lm_backend_restriction": "all",
"recommended_backend": "vllm",
"offload_to_cpu_default": True,
"offload_dit_to_cpu_default": True,
"quantization_default": True,
@@ -172,16 +172,16 @@ class GPUConfig:
"tier3": { # 6-8GB
# Offload mode. DiT(4.46) + context(0.5) ≈ 5.0GB.
# ~1.5-3GB headroom allows LM 0.6B (1.2+0.6=1.8GB) and batch=2.
# vllm KV cache is tight; pt backend is safer for 0.6B on this tier.
# With CPU offload, DiT is offloaded before LM runs → vllm can use freed VRAM.
"max_duration_with_lm": 480, # 8 minutes
"max_duration_without_lm": 600, # 10 minutes (max supported)
"max_batch_size_with_lm": 2,
"max_batch_size_without_lm": 2,
"init_lm_default": True,
"available_lm_models": ["acestep-5Hz-lm-0.6B"],
"recommended_lm_model": "acestep-5Hz-lm-0.6B",
"lm_backend_restriction": "pt_mlx_only", # vllm KV cache too greedy for <8GB
"recommended_backend": "pt",
"lm_backend_restriction": "all",
"recommended_backend": "vllm",
"offload_to_cpu_default": True,
"offload_dit_to_cpu_default": True,
"quantization_default": True,
@@ -1080,6 +1080,97 @@ def print_gpu_config_info(gpu_config: GPUConfig):
logger.info(f" - Available LM Models: {gpu_config.available_lm_models or 'None'}")


# Human-readable tier labels for UI display
GPU_TIER_LABELS = {
"tier1": "tier1 (≤4GB)",
"tier2": "tier2 (4-6GB)",
"tier3": "tier3 (6-8GB)",
"tier4": "tier4 (8-12GB)",
"tier5": "tier5 (12-16GB)",
"tier6a": "tier6a (16-20GB)",
"tier6b": "tier6b (20-24GB)",
"unlimited": "unlimited (≥24GB)",
}

# Ordered list of tier keys for dropdown
GPU_TIER_CHOICES = list(GPU_TIER_LABELS.items()) # [(value, label), ...]


def get_gpu_device_name() -> str:
"""
Get the GPU device name string.

Returns:
Human-readable GPU name, e.g. "NVIDIA GeForce RTX 4060 Ti",
"Apple M2 Pro (MPS)", "CPU only", etc.
"""
try:
import torch
if torch.cuda.is_available():
return torch.cuda.get_device_name(0)
elif hasattr(torch, 'xpu') and torch.xpu.is_available():
props = torch.xpu.get_device_properties(0)
return getattr(props, 'name', 'Intel XPU')
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
# MPS doesn't expose a device name; use platform info
try:
import platform
chip = platform.processor() or "Apple Silicon"
return f"{chip} (MPS)"
except Exception:
return "Apple Silicon (MPS)"
else:
return "CPU only"
except ImportError:
return "Unknown (PyTorch not available)"


def get_gpu_config_for_tier(tier: str) -> GPUConfig:
"""
Create a GPUConfig for a specific tier, applying platform overrides.

This is used when the user manually selects a different tier in the UI.
The actual gpu_memory_gb is preserved from the real hardware detection,
but all tier-based settings come from the selected tier's config.

Args:
tier: Tier key, e.g. "tier3", "tier6a", "unlimited"

Returns:
GPUConfig with the selected tier's settings
"""
if tier not in GPU_TIER_CONFIGS:
logger.warning(f"Unknown tier '{tier}', falling back to auto-detected config")
return get_gpu_config()

# Keep the real GPU memory for informational purposes
real_gpu_memory = get_gpu_memory_gb()
config = GPU_TIER_CONFIGS[tier]

_mps = is_mps_platform()
if _mps:
logger.info(f"Manual tier override to {tier} on macOS MPS — applying Apple Silicon overrides")

return GPUConfig(
tier=tier,
gpu_memory_gb=real_gpu_memory,
max_duration_with_lm=config["max_duration_with_lm"],
max_duration_without_lm=config["max_duration_without_lm"],
max_batch_size_with_lm=config["max_batch_size_with_lm"],
max_batch_size_without_lm=config["max_batch_size_without_lm"],
init_lm_default=config["init_lm_default"],
available_lm_models=config["available_lm_models"],
recommended_lm_model=config.get("recommended_lm_model", ""),
lm_backend_restriction="pt_mlx_only" if _mps else config.get("lm_backend_restriction", "all"),
recommended_backend="mlx" if _mps else config.get("recommended_backend", "vllm"),
offload_to_cpu_default=False if _mps else config.get("offload_to_cpu_default", True),
offload_dit_to_cpu_default=False if _mps else config.get("offload_dit_to_cpu_default", True),
quantization_default=False if _mps else config.get("quantization_default", True),
compile_model_default=False if _mps else config.get("compile_model_default", True),
lm_memory_gb=config["lm_memory_gb"],
)


# Global GPU config instance (initialized lazily)
_global_gpu_config: Optional[GPUConfig] = None
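For reference, here is a minimal sketch (not part of this diff) of how the new tier-override helper could be exercised from Python, assuming only the names introduced in this file (`get_gpu_config_for_tier`, `set_global_gpu_config`, `GPU_TIER_LABELS`):

```python
# Sketch only — assumes the module-level names added in acestep/gpu_config.py.
from acestep.gpu_config import (
    get_gpu_config_for_tier,
    set_global_gpu_config,
    GPU_TIER_LABELS,
)

# Build a config for a manually selected tier. The real detected VRAM is
# kept for display, but limits and defaults come from the chosen tier
# (with Apple Silicon overrides applied automatically on MPS).
config = get_gpu_config_for_tier("tier3")
print(GPU_TIER_LABELS[config.tier])       # "tier3 (6-8GB)"
print(config.max_duration_without_lm)     # 600
print(config.recommended_backend)         # "vllm" on CUDA, "mlx" on MPS

# Make it the process-wide default so later handlers pick it up.
set_global_gpu_config(config)
```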

18 changes: 18 additions & 0 deletions acestep/gradio_ui/events/__init__.py
@@ -43,6 +43,24 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
]
)

# ========== Tier Override ==========
generation_section["tier_dropdown"].change(
fn=lambda tier: gen_h.on_tier_change(tier, llm_handler),
inputs=[generation_section["tier_dropdown"]],
outputs=[
generation_section["offload_to_cpu_checkbox"],
generation_section["offload_dit_to_cpu_checkbox"],
generation_section["compile_model_checkbox"],
generation_section["quantization_checkbox"],
generation_section["backend_dropdown"],
generation_section["lm_model_path"],
generation_section["init_llm_checkbox"],
generation_section["batch_size_input"],
generation_section["audio_duration"],
generation_section["gpu_info_display"],
]
)

generation_section["init_btn"].click(
fn=lambda *args: gen_h.init_service_wrapper(dit_handler, llm_handler, *args),
inputs=[
95 changes: 93 additions & 2 deletions acestep/gradio_ui/events/generation_handlers.py
@@ -16,7 +16,10 @@
)
from acestep.gradio_ui.i18n import t
from acestep.inference import understand_music, create_sample, format_sample
from acestep.gpu_config import get_global_gpu_config, is_lm_model_size_allowed, find_best_lm_model_on_disk
from acestep.gpu_config import (
get_global_gpu_config, is_lm_model_size_allowed, find_best_lm_model_on_disk,
get_gpu_config_for_tier, set_global_gpu_config, GPU_TIER_LABELS, GPU_TIER_CONFIGS,
)


def clamp_duration_to_gpu_limit(duration_value: Optional[float], llm_handler=None) -> Optional[float]:
@@ -559,6 +562,88 @@ def init_service_wrapper(dit_handler, llm_handler, checkpoint, config_path, devi
)


def on_tier_change(selected_tier, llm_handler=None):
"""
Handle manual tier override from the UI dropdown.

Updates the global GPU config and returns gr.update() for all
affected UI components so they reflect the new tier's defaults.

Returns a tuple of gr.update() objects for:
(offload_to_cpu, offload_dit_to_cpu, compile_model, quantization,
backend_dropdown, lm_model_path, init_llm, batch_size_input,
audio_duration, gpu_info_display)
"""
if not selected_tier or selected_tier not in GPU_TIER_CONFIGS:
logger.warning(f"Invalid tier selection: {selected_tier}")
return (gr.update(),) * 10

# Build new config for the selected tier and update global
new_config = get_gpu_config_for_tier(selected_tier)
set_global_gpu_config(new_config)
logger.info(f"🔄 Tier manually changed to {selected_tier} — updating UI defaults")

# Backend choices
if new_config.lm_backend_restriction == "pt_mlx_only":
available_backends = ["pt", "mlx"]
else:
available_backends = ["vllm", "pt", "mlx"]
recommended_backend = new_config.recommended_backend
if recommended_backend not in available_backends:
recommended_backend = available_backends[0]

# LM model choices — filter disk models by tier
tier_lm_models = new_config.available_lm_models
all_disk_models = llm_handler.get_available_5hz_lm_models() if llm_handler else []
if tier_lm_models:
filtered = [m for m in all_disk_models if is_lm_model_size_allowed(m, tier_lm_models)]
available_lm_models = filtered if filtered else all_disk_models
else:
available_lm_models = all_disk_models

recommended_lm = new_config.recommended_lm_model
default_lm_model = find_best_lm_model_on_disk(recommended_lm, available_lm_models)

# Duration and batch limits (use without-LM limits as safe default; init will refine)
max_duration = new_config.max_duration_without_lm
max_batch = new_config.max_batch_size_without_lm

# GPU info markdown update
tier_label = GPU_TIER_LABELS.get(selected_tier, selected_tier)
from acestep.gpu_config import get_gpu_device_name
_gpu_device_name = get_gpu_device_name()
gpu_info_text = f"🖥️ **{_gpu_device_name}** — {new_config.gpu_memory_gb:.1f} GB VRAM — {t('service.gpu_auto_tier')}: **{tier_label}**"

return (
# offload_to_cpu_checkbox
gr.update(value=new_config.offload_to_cpu_default,
info=t("service.offload_cpu_info") + (" (recommended for this tier)" if new_config.offload_to_cpu_default else " (optional for this tier)")),
# offload_dit_to_cpu_checkbox
gr.update(value=new_config.offload_dit_to_cpu_default,
info=t("service.offload_dit_cpu_info") + (" (recommended for this tier)" if new_config.offload_dit_to_cpu_default else " (optional for this tier)")),
# compile_model_checkbox
gr.update(value=new_config.compile_model_default),
# quantization_checkbox
gr.update(value=new_config.quantization_default,
info=t("service.quantization_info") + (" (recommended for this tier)" if new_config.quantization_default else " (optional for this tier)")),
# backend_dropdown
gr.update(choices=available_backends, value=recommended_backend),
# lm_model_path
gr.update(choices=available_lm_models, value=default_lm_model,
info=t("service.lm_model_path_info") + (f" (Recommended: {recommended_lm})" if recommended_lm else " (LM not available for this GPU tier)")),
# init_llm_checkbox
gr.update(value=new_config.init_lm_default),
# batch_size_input
gr.update(value=min(2, max_batch), maximum=max_batch,
info=f"Number of samples to generate (Max: {max_batch})"),
# audio_duration
gr.update(maximum=float(max_duration),
info=f"Duration in seconds (-1 for auto). Max: {max_duration}s / {max_duration // 60} min"),
# gpu_info_display
gr.update(value=gpu_info_text),
)
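The 10-tuple returned above must line up positionally with the `outputs=` list wired in `acestep/gradio_ui/events/__init__.py`. As an illustration only (hypothetical component names, not part of this PR), the contract looks like this in isolation:

```python
# Sketch of the gr.update()/outputs ordering contract used by on_tier_change.
import gradio as gr

def on_choice_change(choice):
    # Element i of the returned tuple is applied to outputs[i].
    return (
        gr.update(value=(choice == "a")),  # -> checkbox
        gr.update(maximum=10),             # -> slider
    )

with gr.Blocks() as demo:
    dropdown = gr.Dropdown(choices=["a", "b"], value="a", label="Choice")
    checkbox = gr.Checkbox(label="Flag")
    slider = gr.Slider(1, 8, value=1, label="Count")
    dropdown.change(
        fn=on_choice_change,
        inputs=[dropdown],
        outputs=[checkbox, slider],  # order must match the returned tuple
    )
```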


def get_ui_control_config(is_turbo: bool) -> dict:
"""Return UI control configuration (values, limits, visibility) for model type.
Used by both interactive init and service-mode startup so controls stay consistent.
@@ -821,7 +906,13 @@ def update_audio_components_visibility(batch_size):
Row 2: Components 5-8 (batch_size 5-8)
"""
# Clamp batch size to 1-8 range for UI
batch_size = min(max(int(batch_size), 1), 8)
if batch_size is None:
batch_size = 1
else:
try:
batch_size = min(max(int(batch_size), 1), 8)
except (TypeError, ValueError):
batch_size = 1

# Row 1 columns (1-4)
updates_row1 = (
5 changes: 4 additions & 1 deletion acestep/gradio_ui/i18n/en.json
@@ -59,7 +59,10 @@
"init_btn": "Initialize Service",
"status_label": "Status",
"language_label": "UI Language",
"language_info": "Select interface language"
"language_info": "Select interface language",
"gpu_auto_tier": "Auto-detected tier",
"tier_label": "GPU Tier Override",
"tier_info": "Manually select a GPU tier to adjust optimization defaults (offload, quantization, backend, etc.)"
},
"generation": {
"required_inputs": "📝 Required Inputs",
5 changes: 4 additions & 1 deletion acestep/gradio_ui/i18n/he.json
@@ -59,7 +59,10 @@
"init_btn": "אתחול שירות",
"status_label": "מצב",
"language_label": "שפת ממשק",
"language_info": "בחר את שפת הממשק"
"language_info": "בחר את שפת הממשק",
"gpu_auto_tier": "שכבה שזוהתה אוטומטית",
"tier_label": "דריסת שכבת GPU",
"tier_info": "בחר שכבת GPU באופן ידני כדי להתאים ברירות מחדל של אופטימיזציה (העברה, קוונטיזציה, מנוע וכו')"
},
"generation": {
"required_inputs": "📝 קלטים נדרשים",
5 changes: 4 additions & 1 deletion acestep/gradio_ui/i18n/ja.json
@@ -59,7 +59,10 @@
"init_btn": "サービスを初期化",
"status_label": "ステータス",
"language_label": "UI言語",
"language_info": "インターフェース言語を選択"
"language_info": "インターフェース言語を選択",
"gpu_auto_tier": "自動検出ティア",
"tier_label": "GPU ティアの手動選択",
"tier_info": "GPUティアを手動で選択して最適化のデフォルト(オフロード、量子化、バックエンドなど)を調整します"
},
"generation": {
"required_inputs": "📝 必須入力",
5 changes: 4 additions & 1 deletion acestep/gradio_ui/i18n/zh.json
@@ -59,7 +59,10 @@
"init_btn": "初始化服务",
"status_label": "状态",
"language_label": "界面语言",
"language_info": "选择界面语言"
"language_info": "选择界面语言",
"gpu_auto_tier": "自动检测层级",
"tier_label": "GPU 层级覆盖",
"tier_info": "手动选择 GPU 层级以调整优化默认值(卸载、量化、后端等)"
},
"generation": {
"required_inputs": "📝 必需输入",
19 changes: 18 additions & 1 deletion acestep/gradio_ui/interfaces/generation.py
@@ -13,7 +13,7 @@
)
from acestep.gradio_ui.i18n import t
from acestep.gradio_ui.events.generation_handlers import get_ui_control_config
from acestep.gpu_config import get_global_gpu_config, GPUConfig, is_lm_model_size_allowed, find_best_lm_model_on_disk
from acestep.gpu_config import get_global_gpu_config, GPUConfig, is_lm_model_size_allowed, find_best_lm_model_on_disk, get_gpu_device_name, GPU_TIER_LABELS, GPU_TIER_CHOICES


def create_generation_section(dit_handler, llm_handler, init_params=None, language='en') -> dict:
@@ -101,6 +101,20 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
scale=1,
)

# GPU info display and tier override
_gpu_device_name = get_gpu_device_name()
_gpu_info_text = f"🖥️ **{_gpu_device_name}** — {gpu_config.gpu_memory_gb:.1f} GB VRAM — {t('service.gpu_auto_tier')}: **{GPU_TIER_LABELS.get(gpu_config.tier, gpu_config.tier)}**"
with gr.Row():
gpu_info_display = gr.Markdown(value=_gpu_info_text)
with gr.Row():
tier_dropdown = gr.Dropdown(
choices=[(label, key) for key, label in GPU_TIER_LABELS.items()],
value=gpu_config.tier,
label=t("service.tier_label"),
info=t("service.tier_info"),
scale=1,
)

# Dropdown options section - all dropdowns grouped together
with gr.Row(equal_height=True):
with gr.Column(scale=4):
@@ -874,4 +888,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
"gpu_config": gpu_config,
"max_duration": max_duration,
"max_batch_size": max_batch_size,
# GPU info and tier override
"gpu_info_display": gpu_info_display,
"tier_dropdown": tier_dropdown,
}
12 changes: 10 additions & 2 deletions acestep/handler.py
@@ -1451,8 +1451,16 @@ def _vram_guard_reduce_batch(

# Estimate per-sample activation cost for DiT
duration_sec = float(audio_duration) if audio_duration and float(audio_duration) > 0 else 60.0
# Empirical: ~0.8 GB per sample at 60s, linear scaling
per_sample_gb = 0.8 * (duration_sec / 60.0)
# Empirical observation: DiT activation memory per extra batch element is
# relatively modest because the latent is processed in a single forward pass
# and flash-attention keeps peak memory low. Measured values:
# - 60s turbo, noLM, batch 4 → ~13.3 GB total on 16GB GPU
# (model ~8.5 GB + 4 × ~0.8 GB activations ≈ 11.7 GB + overhead)
# - 208s turbo, batch 1 → peak 9.3 GB (model ~8.9 GB + ~0.4 GB activation)
# The old formula (0.8 * duration/60) heavily overestimates for long durations
# because activation memory scales sub-linearly with latent length (flash attn).
# Use a more conservative formula: base 0.5 GB + 0.15 GB per 60s beyond 60s.
per_sample_gb = 0.5 + max(0.0, 0.15 * (duration_sec - 60.0) / 60.0)
# If using cfg (base model), double the per-sample cost
if hasattr(self, 'model') and self.model is not None:
model_name = getattr(self, 'config_path', '') or ''
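As a quick sanity check (sketch only, not part of the PR), evaluating the revised heuristic at the operating points cited in the comment shows it stays conservative without the old formula's blow-up at long durations:

```python
def per_sample_gb(duration_sec: float) -> float:
    # Base 0.5 GB, plus 0.15 GB per additional 60 s beyond the first 60 s.
    return 0.5 + max(0.0, 0.15 * (duration_sec - 60.0) / 60.0)

print(per_sample_gb(60.0))    # 0.50  (old formula: 0.80)
print(per_sample_gb(208.0))   # ~0.87 (measured ~0.4 GB; deliberately conservative)
print(per_sample_gb(600.0))   # ~1.85 (old formula would give 8.0)
```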