"""ACE Music Studio — Gradio entrypoint. UI ARCHITECTURE (locked — read this before editing): The five "modes" (Generate / Cover / Extend / Edit / Lyrics) are NOT implemented via ``gr.Tabs``. The wireframes at ``docs/superpowers/specs/mockups/`` show a LEFT sidebar with mode pills + a session History section, and a single content column on the right. The implementation pattern is: gr.Row(elem_classes=["ams-body"]) ├── gr.Column(min_width=190, elem_classes=["ams-sidebar"]) │ ├── gr.Radio(label=None, elem_classes=["ams-side-radio"]) ← 5 mode choices │ └── gr.HTML(... "History · session" ...) └── gr.Column(elem_classes=["ams-content"]) ├── gr.Group(visible=True) ← pane_generate ├── gr.Group(visible=False) ← pane_cover ├── gr.Group(visible=False) ← pane_extend ├── gr.Group(visible=False) ← pane_edit └── gr.Group(visible=False) ← pane_lyrics The Radio's ``change`` event fires ``_switch_pane(mode)`` which returns visibility updates for the five Groups. The Radio's native ``:checked`` state gives us the sidebar "active item" highlight for free via CSS (see ``theme.CSS`` for ``.ams-side-radio`` selectors). DO NOT switch this back to ``gr.Tabs`` — that produces top-positioned horizontal tabs which contradicts the wireframes. On HF Spaces (``SPACE_ID`` env present), ``_bootstrap_spaces_cache()`` runs once on import to (a) hardlink-mirror the build-user-owned HF hub cache into a runtime-writable ``~/hf-cache-rw/`` and (b) symlink the preloaded snapshots into ``./models///`` so ACE-Step's checkpoint resolver finds them. On Mac/Linux locally, it's a no-op — local dev uses ``setup.sh``'s site-packages symlink instead. """ from __future__ import annotations import os import sys as _sys print("[ams] python process started", flush=True, file=_sys.stderr) # Set MPS fallback BEFORE any torch import path is taken. os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1") # Don't pin HF download source — let HF default for both Spaces and local cache. os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1") # On HF Spaces ZeroGPU, ~/.cache/huggingface/ is build-user-owned and read-only # at runtime. transformers.AutoModel.from_pretrained(trust_remote_code=True) # (used by the ACE-Step DiT loader) wants to write modeling_*.py shims into # ~/.cache/huggingface/modules/ → PermissionError. Redirect to /tmp which # is always writable. Off-Spaces this is harmless — transformers just uses # the redirected path. ~50 KB per model, fast re-download on cold starts. os.environ.setdefault("HF_MODULES_CACHE", "/tmp/hf-modules") # Vendored ace-step (git submodule at vendor/ace-step/) — added to sys.path # BEFORE any module that imports `from acestep import ...`. We vendor # instead of pip-installing because the upstream pyproject.toml declares # `nano-vllm; sys_platform != "darwin"`, a path-source dep not on PyPI # that breaks `pip install -r requirements.txt` on HF Spaces (Linux). import sys from pathlib import Path _VENDORED_ACE_STEP = Path(__file__).resolve().parent / "vendor" / "ace-step" if _VENDORED_ACE_STEP.exists() and str(_VENDORED_ACE_STEP) not in sys.path: sys.path.insert(0, str(_VENDORED_ACE_STEP)) print(f"[ams] sys.path patched (vendor exists: {_VENDORED_ACE_STEP.exists()})", flush=True, file=_sys.stderr) import hashlib import random import shutil # noqa: F401 (reserved for future cleanup paths) import gradio as gr print("[ams] gradio imported", flush=True, file=_sys.stderr) import ace_pipeline import backend as be import lora_stack import modes import post_process import theme import ui print("[ams] local modules imported", flush=True, file=_sys.stderr) _BACKEND: be.ACEStepStudioBackend | None = None def get_backend() -> be.ACEStepStudioBackend: global _BACKEND if _BACKEND is None: _BACKEND = be.ACEStepStudioBackend() return _BACKEND # Repos that are pre-downloaded by HF Spaces' ``preload_from_hub`` (see # README frontmatter). The two ACE-Step repos *must* be symlinked into # ``./models///`` so the fork's checkpoint resolver finds them # without an extra network round-trip. The LoRA repos and Qwen don't # strictly need the symlink — ``lora_stack.download_preset`` and the # ``transformers`` Auto* loaders resolve them via the HF cache directly # from ``hf_hub_download(repo_id, filename)`` / ``from_pretrained(repo_id)``. # Including them here is a belt-and-braces measure: the snapshot_download # call in ``_symlink_snapshots_into_models`` short-circuits when files are # already cached, so the only cost is one symlink each. _PRELOAD_REPOS = ( "ACE-Step/Ace-Step1.5", "ACE-Step/acestep-v15-xl-sft", "ACE-Step/ACE-Step-v1-chinese-rap-LoRA", "ACE-Step/ACE-Step-v1.5-chinese-new-year-LoRA", "Qwen/Qwen2.5-7B-Instruct", ) def _symlink_ace_step_checkpoints() -> None: """Pre-populate the fork's hardcoded checkpoint dir with symlinks to HF-preloaded snapshots so it doesn't trigger its built-in auto-download on first inference. The fork's AceStepHandler resolves checkpoints relative to its own install dir (here, vendor/ace-step/checkpoints/). Expected layout: vendor/ace-step/checkpoints/ ├── ← vae/, encoder/, 5Hz-lm/, … (flat) └── acestep-v15-xl-sft/ ← the XL SFT DiT variant Without this, initialize_service() kicks off an async auto-download, returns before it finishes, then generate_music() hits "Model not fully initialized" on the first user click. """ from huggingface_hub import snapshot_download checkpoints_dir = _VENDORED_ACE_STEP / "checkpoints" checkpoints_dir.mkdir(parents=True, exist_ok=True) # Umbrella repo → symlink each top-level entry flat into checkpoints/. # snapshot_download is a no-op when files are already preloaded into the # HF cache; it just returns the snapshot dir on disk. umbrella = Path(snapshot_download(repo_id="ACE-Step/Ace-Step1.5")) for child in umbrella.iterdir(): target = checkpoints_dir / child.name if target.exists() or target.is_symlink(): continue target.symlink_to(child) # XL SFT DiT variant → as the subdir name the fork looks for. xl_snap = Path(snapshot_download(repo_id="ACE-Step/acestep-v15-xl-sft")) xl_target = checkpoints_dir / "acestep-v15-xl-sft" if not (xl_target.exists() or xl_target.is_symlink()): xl_target.symlink_to(xl_snap) def _bootstrap_spaces_cache() -> None: """On HF Spaces, point the fork's checkpoint resolver at preloaded snapshots. Skipped locally — local dev uses setup.sh's site-packages symlink instead, since the apple-silicon fork hardcodes its checkpoint resolver to its own install dir. """ if not os.getenv("SPACE_ID"): return _symlink_ace_step_checkpoints() def _warm_demucs_on_spaces() -> None: """Pre-download Demucs htdemucs so first stem request is fast. Demucs hosts its weights on dl.fbaipublicfiles.com, not HF Hub, so preload_from_hub can't fetch them. We trigger the download at module load on Spaces (gated by SPACE_ID) so user-facing latency is minimal. Off-Spaces this is a no-op — local dev downloads on first user click. """ if not os.getenv("SPACE_ID"): return try: from demucs.pretrained import get_model # Calling get_model triggers the download + cache. Discard the result. get_model("htdemucs") except Exception as e: # Warmup is best-effort. Surface in the log but don't crash startup. print(f"[warmup] demucs htdemucs preload skipped: {e}", flush=True, file=_sys.stderr) _GPU_BASE_BY_MODE = { "generate": 30, "cover": 40, "extend": 30, "edit": 30, "lyrics": 15, # CPU-only typically — lyrics LM runs short on GPU too } _GPU_CLAMP_MIN = 60 _GPU_CLAMP_MAX = 300 def _estimate_gpu_duration(mode: str, params: dict, multiplier: float = 1.0) -> int: """Estimate per-call GPU duration in seconds. Inputs: mode: one of generate/cover/extend/edit/lyrics params: dict that may contain "duration_s" — the requested audio length multiplier: safety factor (1.0 = nominal, 1.5 = pessimistic) Returns int seconds, clamped to [60, 300]. """ base = _GPU_BASE_BY_MODE.get(mode, 30) duration_s = float(params.get("duration_s") or 30) # Roughly 2x realtime on a ZeroGPU L4 — generation > playback length. estimated = base + duration_s * 2.0 * float(multiplier) return max(_GPU_CLAMP_MIN, min(_GPU_CLAMP_MAX, int(estimated))) # Per-mode hints for where the duration is in the handler's call args. # Each entry: (positional_index, kwarg_name). # For "edit" mode, the duration is computed as (segment_end_s - segment_start_s). # For "lyrics", there's no audio duration; we just default. _GPU_DURATION_HINTS: dict[str, tuple[int, str] | str | None] = { "generate": (2, "duration_s"), "cover": (3, "duration_s"), "extend": (3, "extra_duration_s"), "edit": "segment_window", # special: end - start "lyrics": None, # no audio length } def _extract_duration_s(mode: str, args: tuple, kwargs: dict) -> float | None: """Pull the requested audio duration out of a handler's call args, mode-aware. Returns None when the mode has no audio duration concept (lyrics) or when the value can't be found. Caller falls back to a per-mode default. """ hint = _GPU_DURATION_HINTS.get(mode) if hint is None: return None if hint == "segment_window": # edit: (source_audio, sub_mode, source_lyrics, target_lyrics, segment_start_s, segment_end_s, ...) start = kwargs.get("segment_start_s") end = kwargs.get("segment_end_s") if start is None and len(args) > 4: start = args[4] if isinstance(args[4], (int, float)) else None if end is None and len(args) > 5: end = args[5] if isinstance(args[5], (int, float)) else None if start is not None and end is not None: window = float(end) - float(start) return window if window > 0 else None return None pos_idx, kw_name = hint if kw_name in kwargs and isinstance(kwargs[kw_name], (int, float)): return float(kwargs[kw_name]) if len(args) > pos_idx and isinstance(args[pos_idx], (int, float)): return float(args[pos_idx]) return None def _gpu_call_to_estimator(mode: str): """Bridge spaces.GPU's per-call (*args, **kwargs) → our (mode, params, multiplier) estimator. Per-mode duration extraction handles the different signatures of the five handlers. Falls back to a per-mode default when extraction fails so the estimator still produces a reasonable timeout. """ def from_call(*args, **kwargs): duration_s = _extract_duration_s(mode, args, kwargs) if duration_s is None: # Per-mode default when no duration found in call args. duration_s = { "generate": 30.0, "cover": 30.0, "extend": 20.0, "edit": 8.0, # typical edit segment window "lyrics": 0.0, # no audio; base alone }.get(mode, 30.0) return _estimate_gpu_duration(mode, {"duration_s": duration_s}) return from_call def _maybe_spaces_gpu(mode: str): """Return ``@spaces.GPU(duration=)`` on HF Spaces, otherwise a no-op decorator. The callable estimator gives long extends/edits the time they need (up to 300s) while keeping short clips fast (60s floor). Off-Spaces this returns identity. """ if os.getenv("SPACE_ID"): try: import spaces return spaces.GPU(duration=_gpu_call_to_estimator(mode)) except ImportError: pass def _noop(fn): return fn return _noop # Run cache bootstrap at module import so HF Spaces' startup analyzer sees # the symlinks before the lazy backend singleton is constructed on first click. print("[ams] calling _bootstrap_spaces_cache", flush=True, file=_sys.stderr) _bootstrap_spaces_cache() print("[ams] bootstrap done, calling _warm_demucs_on_spaces", flush=True, file=_sys.stderr) _warm_demucs_on_spaces() print("[ams] warm done", flush=True, file=_sys.stderr) def _safe_call(fn, *args, **kwargs): """Wrap a mode handler so all known exceptions become friendly gr.Error toasts. Centralising this here lets every on_*_click handler stay a single-line call into modes.* without each one repeating the try/except mosaic. The error classes mirror what each mode handler can actually raise: - ``lora_stack.LoRAValidationError`` — uploaded LoRA isn't compatible - ``ValueError`` — mode-handler param validation (missing prompt, etc.) - ``FileNotFoundError`` — user-supplied ref_audio path doesn't exist - ``RuntimeError`` — pipeline crash, including MPS op-fallback failures """ try: return fn(*args, **kwargs) except lora_stack.LoRAValidationError as e: raise gr.Error(str(e)) from e except ValueError as e: raise gr.Error(str(e)) from e except FileNotFoundError as e: raise gr.Error(f"File not found: {e}") from e except RuntimeError as e: msg = str(e) if "MPS" in msg or "mps" in msg: raise gr.Error(f"Apple Silicon op issue: {msg}. PYTORCH_ENABLE_MPS_FALLBACK is enabled.") from e raise gr.Error(f"Generation failed: {msg}") from e def _sha256(path: str) -> str: """Stream a file through SHA-256 in 64 KB chunks. Used to fingerprint the active LoRA so the generation metadata includes a provenance hash (useful when the user uploads variants of the same psytrance fine-tune with subtly different weights). """ h = hashlib.sha256() with open(path, "rb") as f: for chunk in iter(lambda: f.read(65536), b""): h.update(chunk) return h.hexdigest() def _active_md(name: str, scale: float, kind: str) -> str: """Format the 'Active: …' line shown under the strength slider.""" return f"**Active:** `{name}`  ·  scale `{scale:.2f}`  ·  {kind}" def on_lora_preset_change(preset_name: str, strength: float): """User picked a preset (or 'None'). Downloads + validates + sets state. Returns (state, active_markdown, upload_clear_value) — the third value clears any custom-upload widget so the two inputs stay mutually exclusive. """ if preset_name == "None" or not preset_name: return None, "_No LoRA active_", None try: local_path = lora_stack.download_preset(preset_name) except lora_stack.LoRAValidationError as e: raise gr.Error(str(e)) from e info = lora_stack.sniff(local_path) if not info.compatible: raise gr.Error( f"Preset {preset_name!r} is not compatible with ACE-Step 1.5 XL SFT: {info.diagnostic}" ) state = { "name": preset_name, "scale": float(strength), "path": str(local_path), "sha256": _sha256(str(local_path)), } return state, _active_md(preset_name, float(strength), "preset"), None def on_lora_upload(file_obj, strength: float): """User dropped a custom .safetensors. Replaces any active preset. Returns (state, active_markdown, preset_reset_value) — the third value resets the preset radio to 'None' so the two inputs stay mutually exclusive. """ if file_obj is None: return None, "_No LoRA active_", "None" path_str = file_obj.name if hasattr(file_obj, "name") else str(file_obj) try: info = lora_stack.sniff(path_str) except lora_stack.LoRAValidationError as e: raise gr.Error(str(e)) from e if not info.compatible: raise gr.Error(f"Uploaded LoRA isn't compatible with ACE-Step 1.5 XL SFT: {info.diagnostic}") name = Path(path_str).stem state = { "name": name, "scale": float(strength), "path": path_str, "sha256": _sha256(path_str), } return state, _active_md(name, float(strength), "custom"), "None" def on_lora_strength_change(state, strength: float): """User dragged the strength slider. Update scale on the active LoRA. No-op if no LoRA is active. """ if not state: return state, "_No LoRA active_" new_state = {**state, "scale": float(strength)} # Preserve the "preset" vs "custom" tag — presets resolve to a path # under the HF cache (~/.cache/huggingface/hub/…), uploads land # under /tmp/gradio/… or the user's pwd. Use the same heuristic # the upload/preset handlers used: a path inside the HF cache or # snapshot tree counts as preset, otherwise custom. path = str(new_state.get("path", "")) kind = "preset" if (".cache/huggingface" in path or "snapshots" in path) else "custom" return new_state, _active_md(new_state["name"], float(strength), kind) def _build_advanced_params( adv_inference_steps, adv_guidance_scale, adv_infer_method, adv_seed, adv_cfg_interval_start, adv_cfg_interval_end, adv_shift, adv_use_adg, adv_thinking, adv_use_cot_caption, adv_use_cot_metas, adv_use_cot_language, adv_lm_temperature, adv_lm_top_p, adv_lm_top_k, adv_lm_cfg_scale, adv_lm_negative_prompt, adv_bpm, adv_keyscale, adv_timesignature, adv_vocal_language, ): """Pack the 21 Advanced-accordion inputs into the ``advanced`` + ``lm`` dicts that ``ace_pipeline.ACEStepStudio.generate`` consumes. Centralising this avoids repeating the same dict-construction in each of the four song-mode click handlers. Returns ``(seed, advanced, lm)``. ``seed`` is the resolved seed (-1 / 0 / None → random 32-bit positive). """ seed_raw = int(adv_seed) if adv_seed is not None else -1 seed = seed_raw if seed_raw > 0 else random.randint(1, 2_147_483_647) advanced = { "inference_steps": int(adv_inference_steps), "guidance_scale": float(adv_guidance_scale), "infer_method": adv_infer_method, "cfg_interval_start": float(adv_cfg_interval_start), "cfg_interval_end": float(adv_cfg_interval_end), "shift": float(adv_shift), "use_adg": bool(adv_use_adg), "bpm": int(adv_bpm) if adv_bpm else None, "keyscale": adv_keyscale or "", "timesignature": adv_timesignature or "", "vocal_language": adv_vocal_language or "unknown", } lm = { "thinking": bool(adv_thinking), "use_cot_caption": bool(adv_use_cot_caption), "use_cot_metas": bool(adv_use_cot_metas), "use_cot_language": bool(adv_use_cot_language), "temperature": float(adv_lm_temperature), "top_p": float(adv_lm_top_p), "top_k": int(adv_lm_top_k) if adv_lm_top_k else 0, "cfg": float(adv_lm_cfg_scale), "negative_prompt": adv_lm_negative_prompt or "NO USER INPUT", } return seed, advanced, lm @_maybe_spaces_gpu("generate") def on_generate_click( prompt: str, lyrics: str, duration_s: float, instrumental_label: str, lora_state, adv_inference_steps, adv_guidance_scale, adv_infer_method, adv_seed, adv_cfg_interval_start, adv_cfg_interval_end, adv_shift, adv_use_adg, adv_thinking, adv_use_cot_caption, adv_use_cot_metas, adv_use_cot_language, adv_lm_temperature, adv_lm_top_p, adv_lm_top_k, adv_lm_cfg_scale, adv_lm_negative_prompt, adv_bpm, adv_keyscale, adv_timesignature, adv_vocal_language, progress=gr.Progress(track_tqdm=True), # noqa: B008 ): loras = [lora_state] if lora_state else [] seed, advanced, lm = _build_advanced_params( adv_inference_steps, adv_guidance_scale, adv_infer_method, adv_seed, adv_cfg_interval_start, adv_cfg_interval_end, adv_shift, adv_use_adg, adv_thinking, adv_use_cot_caption, adv_use_cot_metas, adv_use_cot_language, adv_lm_temperature, adv_lm_top_p, adv_lm_top_k, adv_lm_cfg_scale, adv_lm_negative_prompt, adv_bpm, adv_keyscale, adv_timesignature, adv_vocal_language, ) out_path, meta = _safe_call( modes.generate, get_backend(), params={ "prompt": prompt, "lyrics": lyrics, "duration_s": int(duration_s), "instrumental": instrumental_label == "Instrumental", "seed": seed, "loras": loras, "advanced": advanced, "lm": lm, "dcw": {}, }, ) new_history = _history_push("generate", prompt or "(no prompt)") return out_path, meta, new_history @_maybe_spaces_gpu("cover") def on_cover_click( ref_audio, prompt: str, lyrics: str, duration_s: float, audio_cover_strength: float, lora_state, adv_inference_steps, adv_guidance_scale, adv_infer_method, adv_seed, adv_cfg_interval_start, adv_cfg_interval_end, adv_shift, adv_use_adg, adv_thinking, adv_use_cot_caption, adv_use_cot_metas, adv_use_cot_language, adv_lm_temperature, adv_lm_top_p, adv_lm_top_k, adv_lm_cfg_scale, adv_lm_negative_prompt, adv_bpm, adv_keyscale, adv_timesignature, adv_vocal_language, progress=gr.Progress(track_tqdm=True), # noqa: B008 ): """Cover-mode click. ref_audio is a filepath from gr.Audio(type='filepath').""" loras = [lora_state] if lora_state else [] seed, advanced, lm = _build_advanced_params( adv_inference_steps, adv_guidance_scale, adv_infer_method, adv_seed, adv_cfg_interval_start, adv_cfg_interval_end, adv_shift, adv_use_adg, adv_thinking, adv_use_cot_caption, adv_use_cot_metas, adv_use_cot_language, adv_lm_temperature, adv_lm_top_p, adv_lm_top_k, adv_lm_cfg_scale, adv_lm_negative_prompt, adv_bpm, adv_keyscale, adv_timesignature, adv_vocal_language, ) out_path, meta = _safe_call( modes.cover, get_backend(), params={ "ref_audio": ref_audio, "prompt": prompt, "lyrics": lyrics, "duration_s": int(duration_s), "audio_cover_strength": float(audio_cover_strength), "seed": seed, "loras": loras, "advanced": advanced, "lm": lm, "dcw": {}, }, ) new_history = _history_push("cover", prompt or "(cover)") return out_path, meta, new_history @_maybe_spaces_gpu("extend") def on_extend_click( seed_audio, extra_prompt: str, extension_lyrics: str, extra_duration_s: float, wav_crossfade_s: float, repaint_mode: str, repaint_strength: float, latent_crossfade_frames: float, chunk_mask_mode: str, lora_state, adv_inference_steps, adv_guidance_scale, adv_infer_method, adv_seed, adv_cfg_interval_start, adv_cfg_interval_end, adv_shift, adv_use_adg, adv_thinking, adv_use_cot_caption, adv_use_cot_metas, adv_use_cot_language, adv_lm_temperature, adv_lm_top_p, adv_lm_top_k, adv_lm_cfg_scale, adv_lm_negative_prompt, adv_bpm, adv_keyscale, adv_timesignature, adv_vocal_language, progress=gr.Progress(track_tqdm=True), # noqa: B008 ): """Extend-mode click. seed_audio is a filepath from gr.Audio(type='filepath').""" loras = [lora_state] if lora_state else [] seed, advanced, lm = _build_advanced_params( adv_inference_steps, adv_guidance_scale, adv_infer_method, adv_seed, adv_cfg_interval_start, adv_cfg_interval_end, adv_shift, adv_use_adg, adv_thinking, adv_use_cot_caption, adv_use_cot_metas, adv_use_cot_language, adv_lm_temperature, adv_lm_top_p, adv_lm_top_k, adv_lm_cfg_scale, adv_lm_negative_prompt, adv_bpm, adv_keyscale, adv_timesignature, adv_vocal_language, ) out_path, meta = _safe_call( modes.extend, get_backend(), params={ "seed_audio": seed_audio, "extra_prompt": extra_prompt, "extension_lyrics": extension_lyrics, "extra_duration_s": int(extra_duration_s), "wav_crossfade_s": float(wav_crossfade_s), "repaint_mode": repaint_mode, "repaint_strength": float(repaint_strength), "latent_crossfade_frames": int(latent_crossfade_frames), "chunk_mask_mode": chunk_mask_mode, "seed": seed, "loras": loras, "advanced": advanced, "lm": lm, "dcw": {}, }, ) new_history = _history_push("extend", extra_prompt or "(extend)") return out_path, meta, new_history @_maybe_spaces_gpu("lyrics") def on_draft_lyrics( brief: str, structure: str, language: str, tone: str, verse_lines: float, chorus_lines: float, bridge_lines: float, rhyme: str, temperature: float, top_p: float, top_k: float, max_new_tokens: float, seed, progress=gr.Progress(track_tqdm=True), # noqa: B008 ): """Lyrics-mode click. Calls ``modes.lyrics(...)`` directly — no ACE-Step pipeline is touched. Qwen 2.5 7B is its own lazy singleton inside ``lyrics_lm``; the first click triggers a ~4 GB MLX download (cached afterwards) and ~30 s warm-up before the draft appears. """ lyrics_text, meta = _safe_call( modes.lyrics, get_backend(), params={ "brief": brief, "structure": structure, "language": language, "tone": tone, "verse_lines": int(verse_lines), "chorus_lines": int(chorus_lines), "bridge_lines": int(bridge_lines), "rhyme": rhyme, "temperature": float(temperature), "top_p": float(top_p), "top_k": int(top_k), "max_new_tokens": int(max_new_tokens), "seed": int(seed) if seed is not None else None, }, ) new_history = _history_push("lyrics", brief or "(brief)") return lyrics_text, meta, new_history def on_separate_stems(audio_path): """Run Demucs on the current Output audio and surface 4 stem files.""" if not audio_path: raise gr.Error("Generate a song first.") try: stems = post_process.separate_stems(audio_path) except Exception as e: raise gr.Error(f"Demucs failed: {e}") from e # gr.Files's pydantic FileData model only accepts str paths in Gradio # 6.14; PosixPath objects from separate_stems() trip its validator. return gr.Files(value=[str(p) for p in stems.values()], visible=True) def on_normalise(audio_path): """Run pyloudnorm at -14 LUFS and surface the normalised WAV.""" if not audio_path: raise gr.Error("Generate a song first.") try: out = post_process.normalise_lufs(audio_path, target_lufs=-14.0) except Exception as e: raise gr.Error(f"Normalisation failed: {e}") from e return gr.Audio(value=str(out), visible=True) def on_export_mp3(audio_path): """Encode the current Output to MP3 320 k via ffmpeg and surface the file.""" if not audio_path: raise gr.Error("Generate a song first.") try: out = post_process.to_mp3(audio_path, bitrate_kbps=320) except Exception as e: raise gr.Error(f"MP3 export failed: {e}") from e return gr.File(value=str(out), visible=True) @_maybe_spaces_gpu("edit") def on_edit_click( source_audio, sub_mode: str, source_lyrics: str, target_lyrics: str, segment_start_s: float, segment_end_s: float, repaint_strength: float, repaint_mode: str, flow_source_caption: str, flow_n_min: float, flow_n_max: float, flow_n_avg: float, lora_state, adv_inference_steps, adv_guidance_scale, adv_infer_method, adv_seed, adv_cfg_interval_start, adv_cfg_interval_end, adv_shift, adv_use_adg, adv_thinking, adv_use_cot_caption, adv_use_cot_metas, adv_use_cot_language, adv_lm_temperature, adv_lm_top_p, adv_lm_top_k, adv_lm_cfg_scale, adv_lm_negative_prompt, adv_bpm, adv_keyscale, adv_timesignature, adv_vocal_language, progress=gr.Progress(track_tqdm=True), # noqa: B008 ): """Edit-mode click. source_audio is a filepath from gr.Audio(type='filepath').""" loras = [lora_state] if lora_state else [] seed, advanced, lm = _build_advanced_params( adv_inference_steps, adv_guidance_scale, adv_infer_method, adv_seed, adv_cfg_interval_start, adv_cfg_interval_end, adv_shift, adv_use_adg, adv_thinking, adv_use_cot_caption, adv_use_cot_metas, adv_use_cot_language, adv_lm_temperature, adv_lm_top_p, adv_lm_top_k, adv_lm_cfg_scale, adv_lm_negative_prompt, adv_bpm, adv_keyscale, adv_timesignature, adv_vocal_language, ) out_path, meta = _safe_call( modes.edit, get_backend(), params={ "source_audio": source_audio, "sub_mode": sub_mode, "source_lyrics": source_lyrics, "target_lyrics": target_lyrics, "segment_start_s": float(segment_start_s), "segment_end_s": float(segment_end_s), "repaint_strength": float(repaint_strength), "repaint_mode": repaint_mode, "flow_source_caption": flow_source_caption, "flow_n_min": float(flow_n_min), "flow_n_max": float(flow_n_max), "flow_n_avg": int(flow_n_avg), "seed": seed, "loras": loras, "advanced": advanced, "lm": lm, "dcw": {}, }, ) new_history = _history_push("edit", target_lyrics or sub_mode or "(edit)") return out_path, meta, new_history HEADER_HTML = """
ACE Music Studio.
ready
""".strip() def _status_html(device: str) -> str: """Right-aligned status indicator in the header. Updated at boot only.""" return f"""
ACE Music Studio.
ready · {device.upper()}
""".strip() CTA_HTML = """
Built with . Drop a like at the top  ·  Follow @techfreakworm for what's next.
""".strip() HISTORY_HTML = """
History · session
No generations yet
""".strip() # --- In-memory history (M6/H2) ---------------------------------------------- # Per spec §13, persistent history is out of scope for v1. The sidebar block # is an in-process list that lives for the lifetime of the Gradio process and # resets on reload. Newest entries first; capped at _HISTORY_MAX so the # bordered sidebar stays compact at the desktop breakpoint. _HISTORY: list[dict] = [] _HISTORY_MAX = 12 def _history_render() -> str: """Render _HISTORY into the sidebar HTML block. Falls back to the empty-state HTML constant when no rows are present so the placeholder copy stays exactly aligned with the wireframe. """ if not _HISTORY: return HISTORY_HTML rows_html = "\n".join( f'
' f'{h["mode"]}' f'{h["label"]}' f"
" for h in _HISTORY ) return f'
History · session
{rows_html}
' def _history_push(mode: str, label: str) -> str: """Push a generation onto the history and return the new HTML.""" _HISTORY.insert(0, {"mode": mode, "label": (label or "").strip()[:30] or "(untitled)"}) while len(_HISTORY) > _HISTORY_MAX: _HISTORY.pop() return _history_render() MODE_CHOICES = [ ("🎵 Generate", "generate"), ("🎤 Cover", "cover"), ("⏩ Extend", "extend"), ("✏️ Edit", "edit"), ("✍️ Lyrics", "lyrics"), ] def build_app() -> gr.Blocks: device = ace_pipeline.detect_device() with gr.Blocks(theme=theme.build_theme(), css=theme.CSS, title="ACE Music Studio") as demo: gr.HTML(_status_html(device)) gr.HTML(CTA_HTML) with gr.Row(elem_classes=["ams-body"]): # --- Sidebar ---------------------------------------------------- with gr.Column(scale=0, min_width=190, elem_classes=["ams-sidebar"]): mode = gr.Radio( choices=MODE_CHOICES, value="generate", label=None, show_label=False, container=False, elem_classes=["ams-side-radio"], ) # Dynamic in-memory history (M6/H2). Initial value renders # the same "No generations yet" placeholder the static block # used to emit; each click handler refreshes the HTML via # _history_push(). history_html = gr.HTML(HISTORY_HTML, elem_classes=["ams-history-wrapper"]) # --- Content ---------------------------------------------------- with gr.Column(scale=10, elem_classes=["ams-content"]): with gr.Group(visible=True, elem_classes=["ams-tab-pane"]) as pane_generate: g = ui.build_generate_tab() g["lora_preset"].change( fn=on_lora_preset_change, inputs=[g["lora_preset"], g["lora_strength"]], outputs=[g["lora_state"], g["lora_active"], g["lora_upload"]], ) g["lora_upload"].change( fn=on_lora_upload, inputs=[g["lora_upload"], g["lora_strength"]], outputs=[g["lora_state"], g["lora_active"], g["lora_preset"]], ) g["lora_strength"].change( fn=on_lora_strength_change, inputs=[g["lora_state"], g["lora_strength"]], outputs=[g["lora_state"], g["lora_active"]], ) g["generate_btn"].click( fn=on_generate_click, inputs=[ g["prompt"], g["lyrics"], g["duration_s"], g["instrumental"], g["lora_state"], g["adv_inference_steps"], g["adv_guidance_scale"], g["adv_infer_method"], g["adv_seed"], g["adv_cfg_interval_start"], g["adv_cfg_interval_end"], g["adv_shift"], g["adv_use_adg"], g["adv_thinking"], g["adv_use_cot_caption"], g["adv_use_cot_metas"], g["adv_use_cot_language"], g["adv_lm_temperature"], g["adv_lm_top_p"], g["adv_lm_top_k"], g["adv_lm_cfg_scale"], g["adv_lm_negative_prompt"], g["adv_bpm"], g["adv_keyscale"], g["adv_timesignature"], g["adv_vocal_language"], ], outputs=[g["output_audio"], g["output_meta"], history_html], ) # Post-processing actions (M5/G2) g["separate_stems_btn"].click( fn=on_separate_stems, inputs=[g["output_audio"]], outputs=[g["stem_files"]], ) g["normalise_btn"].click( fn=on_normalise, inputs=[g["output_audio"]], outputs=[g["normalised_audio"]], ) g["mp3_btn"].click( fn=on_export_mp3, inputs=[g["output_audio"]], outputs=[g["mp3_file"]], ) with gr.Group(visible=False, elem_classes=["ams-tab-pane"]) as pane_cover: c = ui.build_cover_tab() c["lora_preset"].change( fn=on_lora_preset_change, inputs=[c["lora_preset"], c["lora_strength"]], outputs=[c["lora_state"], c["lora_active"], c["lora_upload"]], ) c["lora_upload"].change( fn=on_lora_upload, inputs=[c["lora_upload"], c["lora_strength"]], outputs=[c["lora_state"], c["lora_active"], c["lora_preset"]], ) c["lora_strength"].change( fn=on_lora_strength_change, inputs=[c["lora_state"], c["lora_strength"]], outputs=[c["lora_state"], c["lora_active"]], ) c["generate_btn"].click( fn=on_cover_click, inputs=[ c["ref_audio"], c["prompt"], c["lyrics"], c["duration_s"], c["audio_cover_strength"], c["lora_state"], c["adv_inference_steps"], c["adv_guidance_scale"], c["adv_infer_method"], c["adv_seed"], c["adv_cfg_interval_start"], c["adv_cfg_interval_end"], c["adv_shift"], c["adv_use_adg"], c["adv_thinking"], c["adv_use_cot_caption"], c["adv_use_cot_metas"], c["adv_use_cot_language"], c["adv_lm_temperature"], c["adv_lm_top_p"], c["adv_lm_top_k"], c["adv_lm_cfg_scale"], c["adv_lm_negative_prompt"], c["adv_bpm"], c["adv_keyscale"], c["adv_timesignature"], c["adv_vocal_language"], ], outputs=[c["output_audio"], c["output_meta"], history_html], ) # Post-processing actions (M5/G2) c["separate_stems_btn"].click( fn=on_separate_stems, inputs=[c["output_audio"]], outputs=[c["stem_files"]], ) c["normalise_btn"].click( fn=on_normalise, inputs=[c["output_audio"]], outputs=[c["normalised_audio"]], ) c["mp3_btn"].click( fn=on_export_mp3, inputs=[c["output_audio"]], outputs=[c["mp3_file"]], ) with gr.Group(visible=False, elem_classes=["ams-tab-pane"]) as pane_extend: x = ui.build_extend_tab() x["lora_preset"].change( fn=on_lora_preset_change, inputs=[x["lora_preset"], x["lora_strength"]], outputs=[x["lora_state"], x["lora_active"], x["lora_upload"]], ) x["lora_upload"].change( fn=on_lora_upload, inputs=[x["lora_upload"], x["lora_strength"]], outputs=[x["lora_state"], x["lora_active"], x["lora_preset"]], ) x["lora_strength"].change( fn=on_lora_strength_change, inputs=[x["lora_state"], x["lora_strength"]], outputs=[x["lora_state"], x["lora_active"]], ) x["generate_btn"].click( fn=on_extend_click, inputs=[ x["seed_audio"], x["extra_prompt"], x["extension_lyrics"], x["extra_duration_s"], x["wav_crossfade_s"], x["repaint_mode"], x["repaint_strength"], x["latent_crossfade_frames"], x["chunk_mask_mode"], x["lora_state"], x["adv_inference_steps"], x["adv_guidance_scale"], x["adv_infer_method"], x["adv_seed"], x["adv_cfg_interval_start"], x["adv_cfg_interval_end"], x["adv_shift"], x["adv_use_adg"], x["adv_thinking"], x["adv_use_cot_caption"], x["adv_use_cot_metas"], x["adv_use_cot_language"], x["adv_lm_temperature"], x["adv_lm_top_p"], x["adv_lm_top_k"], x["adv_lm_cfg_scale"], x["adv_lm_negative_prompt"], x["adv_bpm"], x["adv_keyscale"], x["adv_timesignature"], x["adv_vocal_language"], ], outputs=[x["output_audio"], x["output_meta"], history_html], ) # Post-processing actions (M5/G2) x["separate_stems_btn"].click( fn=on_separate_stems, inputs=[x["output_audio"]], outputs=[x["stem_files"]], ) x["normalise_btn"].click( fn=on_normalise, inputs=[x["output_audio"]], outputs=[x["normalised_audio"]], ) x["mp3_btn"].click( fn=on_export_mp3, inputs=[x["output_audio"]], outputs=[x["mp3_file"]], ) with gr.Group(visible=False, elem_classes=["ams-tab-pane"]) as pane_edit: e = ui.build_edit_tab() e["lora_preset"].change( fn=on_lora_preset_change, inputs=[e["lora_preset"], e["lora_strength"]], outputs=[e["lora_state"], e["lora_active"], e["lora_upload"]], ) e["lora_upload"].change( fn=on_lora_upload, inputs=[e["lora_upload"], e["lora_strength"]], outputs=[e["lora_state"], e["lora_active"], e["lora_preset"]], ) e["lora_strength"].change( fn=on_lora_strength_change, inputs=[e["lora_state"], e["lora_strength"]], outputs=[e["lora_state"], e["lora_active"]], ) e["generate_btn"].click( fn=on_edit_click, inputs=[ e["source_audio"], e["sub_mode"], e["source_lyrics"], e["target_lyrics"], e["segment_start_s"], e["segment_end_s"], e["repaint_strength"], e["repaint_mode"], e["flow_source_caption"], e["flow_n_min"], e["flow_n_max"], e["flow_n_avg"], e["lora_state"], e["adv_inference_steps"], e["adv_guidance_scale"], e["adv_infer_method"], e["adv_seed"], e["adv_cfg_interval_start"], e["adv_cfg_interval_end"], e["adv_shift"], e["adv_use_adg"], e["adv_thinking"], e["adv_use_cot_caption"], e["adv_use_cot_metas"], e["adv_use_cot_language"], e["adv_lm_temperature"], e["adv_lm_top_p"], e["adv_lm_top_k"], e["adv_lm_cfg_scale"], e["adv_lm_negative_prompt"], e["adv_bpm"], e["adv_keyscale"], e["adv_timesignature"], e["adv_vocal_language"], ], outputs=[e["output_audio"], e["output_meta"], history_html], ) # Post-processing actions (M5/G2) e["separate_stems_btn"].click( fn=on_separate_stems, inputs=[e["output_audio"]], outputs=[e["stem_files"]], ) e["normalise_btn"].click( fn=on_normalise, inputs=[e["output_audio"]], outputs=[e["normalised_audio"]], ) e["mp3_btn"].click( fn=on_export_mp3, inputs=[e["output_audio"]], outputs=[e["mp3_file"]], ) with gr.Group(visible=False, elem_classes=["ams-tab-pane"]) as pane_lyrics: lyr = ui.build_lyrics_tab() lyr["draft_btn"].click( fn=on_draft_lyrics, inputs=[ lyr["brief"], lyr["structure"], lyr["language"], lyr["tone"], lyr["verse_lines"], lyr["chorus_lines"], lyr["bridge_lines"], lyr["rhyme"], lyr["temperature"], lyr["top_p"], lyr["top_k"], lyr["max_new_tokens"], lyr["seed"], ], outputs=[lyr["lyrics_output"], lyr["meta_output"], history_html], ) # Cross-tab "Use these in Generate" — pipes the drafted # text straight into the Generate tab's lyrics textbox. # Both panes were declared inside the same gr.Blocks # context so referencing g["lyrics"] across panes works. lyr["use_in_generate_btn"].click( fn=lambda txt: txt, inputs=[lyr["lyrics_output"]], outputs=[g["lyrics"]], ) panes = [pane_generate, pane_cover, pane_extend, pane_edit, pane_lyrics] def _switch_pane(selected: str): order = ["generate", "cover", "extend", "edit", "lyrics"] return tuple(gr.Group(visible=(selected == name)) for name in order) mode.change(fn=_switch_pane, inputs=mode, outputs=panes) return demo if __name__ == "__main__": print("[ams] building app", flush=True, file=_sys.stderr) demo = build_app() print("[ams] queueing", flush=True, file=_sys.stderr) demo.queue(default_concurrency_limit=1) print( f"[ams] launching on port {int(os.environ.get('PORT', 7860))}", flush=True, file=_sys.stderr, ) demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))