techfreakworm's picture
feat(ui): add advanced controls accordion — inference steps, cfg, infer method, seed, lm cot, schedule, metadata
c287b6a unverified
"""Per-tab Gradio component builders + shared output panel.
Each builder returns a dict of components keyed by purpose so app.py wires
events without depending on Gradio's positional return order.
NOTE: builders DO NOT instantiate the surrounding gr.Group / pane — they
ONLY build the form + output components inside it. app.py wraps the
result in pane_generate / pane_cover / etc.
"""
from __future__ import annotations
import gradio as gr
import lora_stack
import tooltips
def _build_advanced_accordion(components: dict[str, gr.components.Component]) -> None:
    """Build the collapsed "Advanced" accordion shared by the four song modes.

    Background (from the original change note): users reported that the
    style barely moved whatever prompt they wrote. ``GenerationParams`` was
    running on its dataclass defaults — notably ``inference_steps=8``
    (ACE-Step turbo), described as too few for the XL SFT model to express
    prompt variation — and ``guidance_scale``, ``infer_method``, ``shift``,
    ``use_adg`` and the CoT flags were not reachable from the UI.

    The accordion exposes 21 controls in four groups. Each group is built
    by its own private helper so it can be read and edited in isolation:

    * ``_adv_diffusion_group``     — steps, CFG scale, infer method, seed
    * ``_adv_cfg_schedule_group``  — CFG interval, shift, ADG
    * ``_adv_lm_group``            — 5Hz LM chain-of-thought + sampling
    * ``_adv_metadata_group``      — manual BPM / key / time sig / language

    The Generate / Cover / Extend / Edit builders call this right after
    ``_build_lora_accordion(components)`` so the layout is consistent. The
    Lyrics tab does not call it; it has its own LM-parameters accordion.

    Args:
        components: Registry of components keyed by purpose. Mutated in
            place: one ``adv_*`` entry is added per control.
    """
    with gr.Accordion(
        label="Advanced",
        open=False,
        elem_classes=["ams-advanced"],
    ):
        # The helpers must run inside this ``with`` so the components they
        # create are attached to the accordion. Call order is render order.
        _adv_diffusion_group(components)
        _adv_cfg_schedule_group(components)
        _adv_lm_group(components)
        _adv_metadata_group(components)


def _adv_diffusion_group(components: dict[str, gr.components.Component]) -> None:
    """Group A: core diffusion controls (steps, CFG scale, method, seed)."""
    gr.Markdown("**Diffusion**", elem_classes=["ams-adv-section"])
    components["adv_inference_steps"] = gr.Slider(
        minimum=8,
        maximum=80,
        value=27,
        step=1,
        label="Inference steps",
        info="More steps → richer detail. 8 is turbo, 27-60 is the sweet spot for XL SFT.",
    )
    components["adv_guidance_scale"] = gr.Slider(
        minimum=1.0,
        maximum=15.0,
        value=7.0,
        step=0.5,
        label="Guidance scale (CFG)",
        info="Higher = follow the prompt more strictly. Lower = more creative / weirder.",
    )
    components["adv_infer_method"] = gr.Radio(
        choices=["ode", "sde"],
        value="ode",
        label="Inference method",
        info="ode = deterministic per seed. sde = injects stochastic noise per step → genuinely different outputs each run.",
    )
    # -1 is the "randomize" sentinel advertised in the info text.
    components["adv_seed"] = gr.Number(
        value=-1,
        precision=0,
        label="Seed",
        info="-1 = randomize each run. Set a number to lock-and-iterate.",
    )


def _adv_cfg_schedule_group(components: dict[str, gr.components.Component]) -> None:
    """Group B: CFG interval window, timestep shift and the ADG toggle."""
    gr.Markdown("**CFG schedule + shift**", elem_classes=["ams-adv-section"])
    components["adv_cfg_interval_start"] = gr.Slider(
        minimum=0.0,
        maximum=1.0,
        value=0.0,
        step=0.05,
        label="CFG interval start",
        info="Fraction of diffusion at which CFG kicks in.",
    )
    components["adv_cfg_interval_end"] = gr.Slider(
        minimum=0.0,
        maximum=1.0,
        value=1.0,
        step=0.05,
        label="CFG interval end",
        info="Fraction of diffusion at which CFG stops.",
    )
    components["adv_shift"] = gr.Slider(
        minimum=0.5,
        maximum=3.0,
        value=1.0,
        step=0.1,
        label="Shift",
        info="Timestep shift. Try 0.7-1.3 for different feel.",
    )
    components["adv_use_adg"] = gr.Checkbox(
        value=False,
        label="Use Adaptive Dual Guidance (ADG)",
        info="Experimental — sometimes improves base model output.",
    )


def _adv_lm_group(components: dict[str, gr.components.Component]) -> None:
    """Group C: 5Hz language-model chain-of-thought flags and sampling knobs."""
    gr.Markdown("**5Hz LM (CoT)**", elem_classes=["ams-adv-section"])
    components["adv_thinking"] = gr.Checkbox(
        value=True,
        label="Enable thinking (CoT)",
        info="Let the 5Hz LM reason before generating. Recommended ON.",
    )
    components["adv_use_cot_caption"] = gr.Checkbox(
        value=True,
        label="Let LM rewrite caption",
        info="LM expands/rephrases your prompt. Adds variety.",
    )
    components["adv_use_cot_metas"] = gr.Checkbox(
        value=True,
        label="Let LM infer metadata (bpm/key/time)",
        info="LM picks musical metadata. Turn off to force your manual values below.",
    )
    components["adv_use_cot_language"] = gr.Checkbox(
        value=True,
        label="Let LM detect vocal language",
        info="LM picks vocal language from caption + lyrics.",
    )
    components["adv_lm_temperature"] = gr.Slider(
        minimum=0.0,
        maximum=2.0,
        value=0.85,
        step=0.05,
        label="LM temperature",
        info="Higher = more creative metadata/structure.",
    )
    components["adv_lm_top_p"] = gr.Slider(
        minimum=0.0,
        maximum=1.0,
        value=0.9,
        step=0.05,
        label="LM top-p",
        info="Nucleus sampling.",
    )
    # 0 is the "disabled" sentinel advertised in the info text.
    components["adv_lm_top_k"] = gr.Number(
        value=0,
        precision=0,
        label="LM top-k",
        info="0 = disabled.",
    )
    components["adv_lm_cfg_scale"] = gr.Slider(
        minimum=1.0,
        maximum=10.0,
        value=2.0,
        step=0.5,
        label="LM CFG scale",
        info="5Hz LM classifier-free guidance.",
    )
    components["adv_lm_negative_prompt"] = gr.Textbox(
        value="NO USER INPUT",
        label="LM negative prompt",
        info="Steer the LM AWAY from these traits.",
    )


def _adv_metadata_group(components: dict[str, gr.components.Component]) -> None:
    """Group D: manual music-metadata overrides (empty / "unknown" = auto)."""
    gr.Markdown("**Music metadata**", elem_classes=["ams-adv-section"])
    components["adv_bpm"] = gr.Number(
        value=None,
        precision=0,
        label="BPM",
        info="Empty = auto. 30-300.",
    )
    components["adv_keyscale"] = gr.Textbox(
        value="",
        label="Key / scale",
        info="e.g. 'C Major', 'Am'. Empty = auto.",
    )
    components["adv_timesignature"] = gr.Dropdown(
        choices=["", "2", "3", "4", "6"],
        value="",
        label="Time signature",
        info="2=2/4, 3=3/4, 4=4/4, 6=6/8. Empty = auto.",
    )
    components["adv_vocal_language"] = gr.Dropdown(
        choices=["unknown", "en", "zh", "ja", "ko", "es", "fr", "de", "it", "pt", "ru"],
        value="unknown",
        label="Vocal language",
        info="Hint for the 5Hz LM. unknown = auto.",
    )
def _build_lora_accordion(components: dict[str, gr.components.Component]) -> None:
    """Add the collapsed "LoRA" accordion (a single adapter slot) to the pane.

    All four song modes (generate / cover / extend / edit) call this so the
    LoRA picker looks the same everywhere. Per the original author's note,
    the Apple-Silicon ACE-Step fork's AceStepHandler holds only one active
    adapter at a time (see ``lora_stack.apply_stack``), which is why the UI
    offers one slot: a preset radio or a custom upload, a strength slider,
    and a Markdown line showing the active LoRA.

    Args:
        components: Registry of components keyed by purpose. Mutated in
            place with ``lora_preset``, ``lora_upload``, ``lora_strength``,
            ``lora_active`` and ``lora_state``.
    """
    with gr.Accordion(label="LoRA", open=False, elem_classes=["ams-lora", "ams-lora-accordion"]):
        gr.Markdown(
            "_Only one LoRA at a time on this build. "
            "Picking a preset or uploading a custom file "
            "replaces the active LoRA._",
            elem_classes=["ams-lora-note"],
        )

        # "None" sentinel first, then every preset name reported by
        # lora_stack.load_presets(). The original comment says these come
        # from presets/manifest.json so the radio tracks the published
        # official ACE-Step LoRAs.
        preset_choices = ["None", *(preset["name"] for preset in lora_stack.load_presets())]
        components["lora_preset"] = gr.Radio(
            label="Preset",
            choices=preset_choices,
            value="None",
            interactive=True,
            elem_classes=["ams-lora-preset"],
        )

        components["lora_upload"] = gr.File(
            label="Custom LoRA (.safetensors)",
            file_count="single",
            file_types=[".safetensors"],
            elem_classes=["ams-lora-file"],
        )

        components["lora_strength"] = gr.Slider(
            label="Strength",
            info=tooltips.LORA_STRENGTH,
            minimum=0.0,
            maximum=1.5,
            value=0.95,
            step=0.05,
            elem_classes=["ams-lora-strength"],
        )

        components["lora_active"] = gr.Markdown("_No LoRA active_", elem_classes=["ams-lora-active"])

        # Invisible state slot for the resolved active-LoRA dict
        # ``{name, scale, path, sha256}``; the click handler forwards it
        # straight to backend.dispatch.
        components["lora_state"] = gr.State(None)
def _build_output_panel(components: dict[str, gr.components.Component]) -> None:
    """Build the shared output side: audio player, post-process actions, metadata.

    Every output component carries explicit ``elem_classes`` so the
    stylesheet can target stable names instead of Gradio's svelte-hashed
    classes, which may change between versions.

    Metadata uses ``gr.JSON`` because it renders a dict directly as an
    expandable tree. The original author notes that
    ``gr.Code(language="json")`` needs a pre-stringified blob and fails
    with "'dict' has no .strip()" when handed a dict.

    Under the player sit three secondary actions (M5/G2): stem separation,
    LUFS normalisation and MP3 export. Their result components
    (``stem_files`` / ``normalised_audio`` / ``mp3_file``) are created
    hidden; making them visible is left to the click handlers.

    Args:
        components: Registry of components keyed by purpose. Mutated in
            place.
    """
    components["output_audio"] = gr.Audio(
        label="Output",
        interactive=False,
        type="filepath",
        elem_classes=["ams-out", "ams-out-audio"],
    )

    # The three post-process buttons differ only in registry key and caption.
    post_actions = (
        ("separate_stems_btn", "↯ Separate stems"),
        ("normalise_btn", "▮ Normalise -14 LUFS"),
        ("mp3_btn", "↓ MP3 320k"),
    )
    with gr.Row(elem_classes=["ams-post-actions"]):
        for key, caption in post_actions:
            components[key] = gr.Button(
                caption,
                variant="secondary",
                elem_classes=["ams-post-btn"],
            )

    # Hidden result slots, one per post-process action.
    components["stem_files"] = gr.Files(
        label="Stems",
        elem_classes=["ams-stem-files"],
        visible=False,
    )
    components["normalised_audio"] = gr.Audio(
        label="Normalised (-14 LUFS)",
        interactive=False,
        type="filepath",
        elem_classes=["ams-out", "ams-out-normalised"],
        visible=False,
    )
    components["mp3_file"] = gr.File(
        label="MP3 download",
        elem_classes=["ams-mp3-file"],
        visible=False,
    )

    components["output_meta"] = gr.JSON(label="Metadata", elem_classes=["ams-out", "ams-out-meta"])
def build_generate_tab() -> dict[str, gr.components.Component]:
    """Build the Generate tab body: form on the left, output on the right.

    Form column, top to bottom: style prompt, lyrics, a row holding the
    duration slider and the vocal-mode radio, the collapsed LoRA accordion,
    the collapsed Advanced accordion, then the Generate button. The output
    column is filled by ``_build_output_panel``.

    Returns:
        Components keyed by purpose, so the caller can wire events by name.
    """
    ui: dict[str, gr.components.Component] = {}

    with gr.Row():
        # Left: form. Column scales 13 vs 10 give it roughly 60% of the width.
        with gr.Column(scale=13):
            ui["prompt"] = gr.Textbox(
                lines=2,
                label="Style prompt",
                info=tooltips.GENERATE_PROMPT,
                placeholder="psytrance, rolling triplet bassline, acid squelch, metallic leads",
            )
            ui["lyrics"] = gr.Textbox(
                lines=6,
                label="Lyrics",
                info=tooltips.GENERATE_LYRICS,
                placeholder="[intro] atmospheric pads\n[verse] ...",
            )

            with gr.Row():
                ui["duration_s"] = gr.Slider(
                    label="Duration (s)",
                    info=tooltips.GENERATE_DURATION,
                    minimum=5,
                    maximum=240,
                    value=30,
                    step=5,
                )
                ui["instrumental"] = gr.Radio(
                    label="Vocal mode",
                    info=tooltips.GENERATE_VOCAL,
                    choices=["With vocals", "Instrumental"],
                    value="With vocals",
                )

            # Shared accordions sit between the duration row and the button.
            _build_lora_accordion(ui)
            _build_advanced_accordion(ui)

            ui["generate_btn"] = gr.Button("▶ Generate", variant="primary")

        # Right: output, roughly 40% of the width.
        with gr.Column(scale=10):
            _build_output_panel(ui)

    return ui
def build_cover_tab() -> dict[str, gr.components.Component]:
    """Build the Cover tab body: reference audio + new lyrics -> cover.

    Per the original author's note, this form feeds ACE-Step's
    ``task_type="cover"``: the upload becomes ``reference_audio`` and the
    strength slider becomes ``audio_cover_strength`` (higher stays closer
    to the reference, lower lets the new prompt/lyrics drift the timbre).
    That wiring lives outside this builder.

    Returns:
        Components keyed by purpose, so the caller can wire events by name.
    """
    ui: dict[str, gr.components.Component] = {}

    with gr.Row():
        # Left: form column.
        with gr.Column(scale=13):
            ui["ref_audio"] = gr.Audio(
                label="Reference audio",
                sources=["upload"],
                type="filepath",
                elem_classes=["ams-input-audio"],
            )
            ui["prompt"] = gr.Textbox(
                lines=2,
                label="New style prompt (optional)",
                info=tooltips.COVER_PROMPT,
                placeholder="faster, more aggressive leads",
            )
            ui["lyrics"] = gr.Textbox(
                lines=5,
                label="New lyrics",
                info=tooltips.COVER_LYRICS,
                placeholder="[verse] new lyrics over the reference style",
            )

            with gr.Row():
                ui["duration_s"] = gr.Slider(
                    label="Duration (s)",
                    info=tooltips.COVER_DURATION,
                    minimum=5,
                    maximum=240,
                    value=30,
                    step=5,
                )
                ui["audio_cover_strength"] = gr.Slider(
                    label="Cover strength",
                    info=tooltips.COVER_STRENGTH,
                    minimum=0.0,
                    maximum=1.0,
                    value=0.93,
                    step=0.01,
                )

            # Shared accordions, same position as in the other song modes.
            _build_lora_accordion(ui)
            _build_advanced_accordion(ui)

            ui["generate_btn"] = gr.Button("▶ Generate cover", variant="primary")

        # Right: output column.
        with gr.Column(scale=10):
            _build_output_panel(ui)

    return ui
def build_extend_tab() -> dict[str, gr.components.Component]:
    """Build the Extend tab body: seed audio + extension prompt -> longer song.

    Per the original author's note, the backend runs this as ACE-Step's
    ``task_type="repaint"`` with ``src_audio`` set to the uploaded seed and
    the repaint window placed past the end of the seed, so new audio is
    painted after it. That wiring lives outside this builder.

    ``repaint_mode``, ``repaint_strength``, ``latent_crossfade_frames`` and
    ``chunk_mask_mode`` are collected in an "experimental" accordion, with
    ``wav_crossfade_s`` beside the duration slider. The original note says
    the installed ``GenerationParams`` dataclass does not expose these yet;
    the UI captures them so they can be plumbed through later.

    Returns:
        Components keyed by purpose, so the caller can wire events by name.
    """
    ui: dict[str, gr.components.Component] = {}

    with gr.Row():
        # Left: form column.
        with gr.Column(scale=13):
            ui["seed_audio"] = gr.Audio(
                label="Seed audio",
                sources=["upload"],
                type="filepath",
                elem_classes=["ams-input-audio"],
            )
            ui["extra_prompt"] = gr.Textbox(
                lines=2,
                label="Extension prompt",
                info=tooltips.EXTEND_PROMPT,
                placeholder="build to climax, layered acid leads",
            )
            ui["extension_lyrics"] = gr.Textbox(
                lines=4,
                label="Extension lyrics (optional)",
                info=tooltips.EXTEND_LYRICS,
                placeholder="[bridge] the drop is coming...",
            )

            with gr.Row():
                ui["extra_duration_s"] = gr.Slider(
                    label="Extra duration (s)",
                    info=tooltips.EXTEND_DURATION,
                    minimum=5,
                    maximum=120,
                    value=60,
                    step=5,
                )
                ui["wav_crossfade_s"] = gr.Slider(
                    label="WAV crossfade (s)",
                    info=tooltips.EXTEND_CROSSFADE,
                    minimum=0.0,
                    maximum=5.0,
                    value=2.0,
                    step=0.1,
                )

            # Captured-but-not-yet-plumbed repaint knobs.
            with gr.Accordion(
                "Repaint params (experimental)",
                elem_classes=["ams-experimental"],
                open=False,
            ):
                gr.Markdown(
                    "_These knobs are captured in the request but the installed "
                    "ACE-Step dataclass doesn't expose them yet._",
                    elem_classes=["ams-lora-note"],
                )
                ui["repaint_mode"] = gr.Dropdown(
                    label="Repaint mode",
                    choices=["balanced", "left", "right"],
                    value="balanced",
                )
                ui["repaint_strength"] = gr.Slider(
                    label="Repaint strength",
                    minimum=0.0,
                    maximum=1.0,
                    value=0.5,
                    step=0.05,
                )
                ui["latent_crossfade_frames"] = gr.Slider(
                    label="Latent crossfade frames",
                    minimum=0,
                    maximum=30,
                    value=10,
                    step=1,
                )
                ui["chunk_mask_mode"] = gr.Dropdown(
                    label="Chunk mask",
                    choices=["auto", "manual"],
                    value="auto",
                )

            # Shared accordions, same position as in the other song modes.
            _build_lora_accordion(ui)
            _build_advanced_accordion(ui)

            ui["generate_btn"] = gr.Button("▶ Extend", variant="primary")

        # Right: output column.
        with gr.Column(scale=10):
            _build_output_panel(ui)

    return ui
def build_edit_tab() -> dict[str, gr.components.Component]:
    """Build the Edit tab body: source audio + segment + target lyrics.

    The ``sub_mode`` radio offers two paths. Their backend behaviour is
    taken from the original author's note and is wired outside this
    builder:

    - ``repaint`` (default): repaint [segment_start_s, segment_end_s] via
      ACE-Step's repaint task type; the two segment fields map to
      ``repainting_start`` / ``repainting_end`` on the pipeline side.
    - ``flow_edit``: caption-to-caption morph. The installed
      ``GenerationParams`` has no ``flow_edit_*`` fields, so it currently
      falls back to a repaint pass with lower ``audio_cover_strength``.
      The flow knobs are still collected for when native support lands.

    Returns:
        Components keyed by purpose, so the caller can wire events by name.
    """
    ui: dict[str, gr.components.Component] = {}

    with gr.Row():
        # Left: form column.
        with gr.Column(scale=13):
            ui["source_audio"] = gr.Audio(
                label="Source audio",
                sources=["upload"],
                type="filepath",
                elem_classes=["ams-input-audio"],
            )
            ui["sub_mode"] = gr.Radio(
                label="Edit sub-mode",
                info=tooltips.EDIT_SUB_MODE,
                choices=["repaint", "flow_edit"],
                value="repaint",
            )
            ui["source_lyrics"] = gr.Textbox(
                lines=3,
                label="Source lyrics",
                info=tooltips.EDIT_SOURCE_LYRICS,
            )
            ui["target_lyrics"] = gr.Textbox(
                lines=3,
                label="Target lyrics",
                info=tooltips.EDIT_TARGET_LYRICS,
                placeholder="[chorus] new chorus replaces the old",
            )

            # Segment bounds, in seconds.
            with gr.Row():
                ui["segment_start_s"] = gr.Number(
                    label="Segment start (s)",
                    info=tooltips.EDIT_SEGMENT_START,
                    value=0.0,
                    precision=1,
                )
                ui["segment_end_s"] = gr.Number(
                    label="Segment end (s)",
                    info=tooltips.EDIT_SEGMENT_END,
                    value=30.0,
                    precision=1,
                )

            # Captured-but-not-yet-plumbed repaint knobs.
            with gr.Accordion(
                "Repaint options (experimental)",
                elem_classes=["ams-experimental"],
                open=False,
            ):
                gr.Markdown(
                    "_These knobs are captured in the request but the installed "
                    "ACE-Step dataclass doesn't expose them yet._",
                    elem_classes=["ams-lora-note"],
                )
                ui["repaint_strength"] = gr.Slider(
                    label="Repaint strength",
                    minimum=0.0,
                    maximum=1.0,
                    value=0.5,
                    step=0.05,
                )
                ui["repaint_mode"] = gr.Dropdown(
                    label="Repaint mode",
                    choices=["balanced", "left", "right"],
                    value="balanced",
                )

            # Captured-but-not-yet-wired flow-edit knobs.
            with gr.Accordion(
                "Flow-morph options (experimental)",
                elem_classes=["ams-experimental"],
                open=False,
            ):
                gr.Markdown(
                    "_flow_edit sub-mode currently falls back to a repaint pass with "
                    "lower audio_cover_strength. flow-specific params are captured "
                    "but not yet wired._",
                    elem_classes=["ams-lora-note"],
                )
                ui["flow_source_caption"] = gr.Textbox(
                    label="Source caption",
                    placeholder="acoustic ballad, gentle piano",
                )
                ui["flow_n_min"] = gr.Slider(
                    label="n_min",
                    minimum=0.0,
                    maximum=1.0,
                    value=0.0,
                    step=0.05,
                )
                ui["flow_n_max"] = gr.Slider(
                    label="n_max",
                    minimum=0.0,
                    maximum=1.0,
                    value=1.0,
                    step=0.05,
                )
                ui["flow_n_avg"] = gr.Slider(
                    label="n_avg",
                    minimum=1,
                    maximum=5,
                    value=1,
                    step=1,
                )

            # Shared accordions, same position as in the other song modes.
            _build_lora_accordion(ui)
            _build_advanced_accordion(ui)

            ui["generate_btn"] = gr.Button("▶ Apply edit", variant="primary")

        # Right: output column.
        with gr.Column(scale=10):
            _build_output_panel(ui)

    return ui
def build_lyrics_tab() -> dict[str, gr.components.Component]:
    """Build the Lyrics tab body (the original note names Qwen 2.5 7B as the drafter).

    Two columns. Left: brief, structure + language, per-section line
    counts, tone, rhyme, a collapsed "LM parameters" accordion and the
    Draft button. Right: a read-only draft textbox, the
    "Use these in Generate" cross-tab button and a JSON metadata panel.

    ``ams-lyrics-output`` and ``ams-lyrics-use-btn`` are styling hooks. The
    original note says ``theme.CSS`` gives the textbox a mono 12 px font
    with a 280 px min-height and gives the button a small top margin.

    Unlike the song-mode tabs, this builder calls neither the LoRA nor the
    Advanced accordion helper; it has its own LM-parameters accordion.

    Returns:
        Components keyed by purpose, so the caller can wire events by name.
    """
    ui: dict[str, gr.components.Component] = {}

    with gr.Row():
        # Left: form column.
        with gr.Column(scale=12):
            ui["brief"] = gr.Textbox(
                lines=4,
                label="Brief",
                info=tooltips.LYRICS_BRIEF,
                placeholder="Describe the song. Tone, mood, references, specific images, lines to avoid…",
            )

            with gr.Row():
                ui["structure"] = gr.Textbox(
                    label="Structure",
                    info=tooltips.LYRICS_STRUCTURE,
                    value="intro, verse, chorus, verse, chorus, bridge, chorus, outro",
                )
                ui["language"] = gr.Dropdown(
                    label="Language",
                    info=tooltips.LYRICS_LANGUAGE,
                    choices=["en", "zh", "ja", "ko", "es", "fr", "de"],
                    value="en",
                )

            # Line budget per section type.
            with gr.Row():
                ui["verse_lines"] = gr.Slider(
                    label="Verse lines",
                    minimum=2,
                    maximum=10,
                    value=6,
                    step=1,
                )
                ui["chorus_lines"] = gr.Slider(
                    label="Chorus lines",
                    minimum=2,
                    maximum=8,
                    value=4,
                    step=1,
                )
                ui["bridge_lines"] = gr.Slider(
                    label="Bridge lines",
                    minimum=1,
                    maximum=6,
                    value=2,
                    step=1,
                )

            ui["tone"] = gr.Textbox(
                label="Tone / mood",
                info=tooltips.LYRICS_TONE,
                placeholder="euphoric, hypnotic, transcendent, not cheesy",
            )
            ui["rhyme"] = gr.Radio(
                label="Rhyme",
                choices=["strict", "loose", "none"],
                value="loose",
            )

            # Sampling knobs for the lyrics LM, collapsed by default.
            with gr.Accordion(
                "LM parameters",
                elem_classes=["ams-lm-accordion"],
                open=False,
            ):
                ui["temperature"] = gr.Slider(
                    label="Temperature",
                    info=tooltips.LYRICS_TEMPERATURE,
                    minimum=0.0,
                    maximum=2.0,
                    value=0.85,
                    step=0.05,
                )
                ui["top_p"] = gr.Slider(
                    label="Top-p",
                    info=tooltips.LYRICS_TOP_P,
                    minimum=0.0,
                    maximum=1.0,
                    value=0.9,
                    step=0.05,
                )
                ui["top_k"] = gr.Slider(
                    label="Top-k",
                    info=tooltips.LYRICS_TOP_K,
                    minimum=0,
                    maximum=200,
                    value=40,
                    step=1,
                )
                ui["max_new_tokens"] = gr.Slider(
                    label="Max new tokens",
                    info=tooltips.LYRICS_MAX_TOKENS,
                    minimum=100,
                    maximum=2000,
                    value=600,
                    step=50,
                )
                ui["seed"] = gr.Number(label="Seed", value=42, precision=0)

            ui["draft_btn"] = gr.Button("▶ Draft lyrics", variant="primary")

        # Right: output column.
        with gr.Column(scale=10):
            # No copy button on purpose: the original author reports that
            # gr.Textbox in the installed Gradio (noted as 6.14) rejects
            # ``show_copy_button``. Users can still select-all + copy in
            # the browser.
            # NOTE(review): confirm against the installed Gradio version.
            ui["lyrics_output"] = gr.Textbox(
                lines=14,
                label="Draft",
                interactive=False,
                elem_classes=["ams-lyrics-output"],
            )
            ui["use_in_generate_btn"] = gr.Button(
                "↑ Use these in Generate",
                elem_classes=["ams-lyrics-use-btn"],
                variant="primary",
            )
            ui["meta_output"] = gr.JSON(label="Metadata", elem_classes=["ams-out", "ams-out-meta"])

    return ui