"""Per-tab Gradio component builders + shared output panel. Each builder returns a dict of components keyed by purpose so app.py wires events without depending on Gradio's positional return order. NOTE: builders DO NOT instantiate the surrounding gr.Group / pane — they ONLY build the form + output components inside it. app.py wraps the result in pane_generate / pane_cover / etc. """ from __future__ import annotations import gradio as gr import lora_stack import tooltips def _build_advanced_accordion(components: dict[str, gr.components.Component]) -> None: """Advanced controls accordion shared by all four song modes. User complaint: "no matter what prompt I write, style is not deviating by a lot". Root cause: ``GenerationParams.inference_steps`` defaults to 8 (ACE-Step turbo) — too few for the XL SFT model to actually express prompt variation. ``guidance_scale``, ``infer_method``, ``shift``, ``use_adg``, and the CoT flags were all left at dataclass defaults too. This accordion surfaces the ~21 most useful knobs in four logical groups so the user can lock-and-iterate. Each song-mode pane (Generate / Cover / Extend / Edit) calls this right after ``_build_lora_accordion(components)`` so the layout is consistent. The Lyrics tab does NOT get this — it's a Qwen path with its own LM-params accordion already. """ with gr.Accordion( label="Advanced", open=False, elem_classes=["ams-advanced"], ): # --- Group A — Diffusion (most impactful) --- gr.Markdown("**Diffusion**", elem_classes=["ams-adv-section"]) components["adv_inference_steps"] = gr.Slider( minimum=8, maximum=80, value=27, step=1, label="Inference steps", info="More steps → richer detail. 8 is turbo, 27-60 is the sweet spot for XL SFT.", ) components["adv_guidance_scale"] = gr.Slider( minimum=1.0, maximum=15.0, value=7.0, step=0.5, label="Guidance scale (CFG)", info="Higher = follow the prompt more strictly. Lower = more creative / weirder.", ) components["adv_infer_method"] = gr.Radio( choices=["ode", "sde"], value="ode", label="Inference method", info="ode = deterministic per seed. sde = injects stochastic noise per step → genuinely different outputs each run.", ) components["adv_seed"] = gr.Number( value=-1, precision=0, label="Seed", info="-1 = randomize each run. Set a number to lock-and-iterate.", ) # --- Group B — CFG schedule + shift + ADG --- gr.Markdown("**CFG schedule + shift**", elem_classes=["ams-adv-section"]) components["adv_cfg_interval_start"] = gr.Slider( minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="CFG interval start", info="Fraction of diffusion at which CFG kicks in.", ) components["adv_cfg_interval_end"] = gr.Slider( minimum=0.0, maximum=1.0, value=1.0, step=0.05, label="CFG interval end", info="Fraction of diffusion at which CFG stops.", ) components["adv_shift"] = gr.Slider( minimum=0.5, maximum=3.0, value=1.0, step=0.1, label="Shift", info="Timestep shift. Try 0.7-1.3 for different feel.", ) components["adv_use_adg"] = gr.Checkbox( value=False, label="Use Adaptive Dual Guidance (ADG)", info="Experimental — sometimes improves base model output.", ) # --- Group C — 5Hz Language Model (CoT reasoning) --- gr.Markdown("**5Hz LM (CoT)**", elem_classes=["ams-adv-section"]) components["adv_thinking"] = gr.Checkbox( value=True, label="Enable thinking (CoT)", info="Let the 5Hz LM reason before generating. Recommended ON.", ) components["adv_use_cot_caption"] = gr.Checkbox( value=True, label="Let LM rewrite caption", info="LM expands/rephrases your prompt. Adds variety.", ) components["adv_use_cot_metas"] = gr.Checkbox( value=True, label="Let LM infer metadata (bpm/key/time)", info="LM picks musical metadata. Turn off to force your manual values below.", ) components["adv_use_cot_language"] = gr.Checkbox( value=True, label="Let LM detect vocal language", info="LM picks vocal language from caption + lyrics.", ) components["adv_lm_temperature"] = gr.Slider( minimum=0.0, maximum=2.0, value=0.85, step=0.05, label="LM temperature", info="Higher = more creative metadata/structure.", ) components["adv_lm_top_p"] = gr.Slider( minimum=0.0, maximum=1.0, value=0.9, step=0.05, label="LM top-p", info="Nucleus sampling.", ) components["adv_lm_top_k"] = gr.Number( value=0, precision=0, label="LM top-k", info="0 = disabled.", ) components["adv_lm_cfg_scale"] = gr.Slider( minimum=1.0, maximum=10.0, value=2.0, step=0.5, label="LM CFG scale", info="5Hz LM classifier-free guidance.", ) components["adv_lm_negative_prompt"] = gr.Textbox( value="NO USER INPUT", label="LM negative prompt", info="Steer the LM AWAY from these traits.", ) # --- Group D — Music metadata (manual overrides) --- gr.Markdown("**Music metadata**", elem_classes=["ams-adv-section"]) components["adv_bpm"] = gr.Number( value=None, precision=0, label="BPM", info="Empty = auto. 30-300.", ) components["adv_keyscale"] = gr.Textbox( value="", label="Key / scale", info="e.g. 'C Major', 'Am'. Empty = auto.", ) components["adv_timesignature"] = gr.Dropdown( choices=["", "2", "3", "4", "6"], value="", label="Time signature", info="2=2/4, 3=3/4, 4=4/4, 6=6/8. Empty = auto.", ) components["adv_vocal_language"] = gr.Dropdown( choices=["unknown", "en", "zh", "ja", "ko", "es", "fr", "de", "it", "pt", "ru"], value="unknown", label="Vocal language", info="Hint for the 5Hz LM. unknown = auto.", ) def _build_lora_accordion(components: dict[str, gr.components.Component]) -> None: """LoRA accordion with single-LoRA semantics. Mutates ``components``. Each song mode (generate / cover / extend / edit) calls this so the form has a consistent LoRA picker. Apple-Silicon ACE-Step fork's AceStepHandler can only hold one active adapter at a time (see ``lora_stack.apply_stack``), so the UI surfaces a single slot — a preset radio OR a custom upload — and a strength slider, with a Markdown "active LoRA" display. """ with gr.Accordion( label="LoRA", open=False, elem_classes=["ams-lora", "ams-lora-accordion"], ): gr.Markdown( "_Only one LoRA at a time on this build. " "Picking a preset or uploading a custom file " "replaces the active LoRA._", elem_classes=["ams-lora-note"], ) # Preset choices are read from presets/manifest.json so the # radio stays in sync with whatever official ACE-Step LoRAs # are actually published on HuggingFace. _preset_names = ["None"] + [p["name"] for p in lora_stack.load_presets()] components["lora_preset"] = gr.Radio( choices=_preset_names, value="None", label="Preset", elem_classes=["ams-lora-preset"], interactive=True, ) components["lora_upload"] = gr.File( label="Custom LoRA (.safetensors)", file_types=[".safetensors"], file_count="single", elem_classes=["ams-lora-file"], ) components["lora_strength"] = gr.Slider( minimum=0.0, maximum=1.5, step=0.05, value=0.95, label="Strength", info=tooltips.LORA_STRENGTH, elem_classes=["ams-lora-strength"], ) components["lora_active"] = gr.Markdown( "_No LoRA active_", elem_classes=["ams-lora-active"], ) # Hidden state holding the resolved active LoRA dict # ``{name, scale, path, sha256}`` so the click handler can pass # it straight to backend.dispatch. components["lora_state"] = gr.State(None) def _build_output_panel(components: dict[str, gr.components.Component]) -> None: """Shared OUTPUT (gr.Audio) + post-process actions + METADATA (gr.JSON). elem_classes on each output component give CSS hooks for the Brutalist Mono treatment (uppercase mono labels + bordered empty-state panels). Without these we'd need to target svelte-hashed classes which can change across Gradio versions. gr.JSON renders a dict directly as a syntax-highlighted, expandable tree. gr.Code(language="json") refuses dicts — it requires a pre-stringified blob — and crashes with "'dict' has no .strip()". Below the Audio we expose three secondary post-process actions (M5/G2): Demucs stem separation, pyloudnorm LUFS normalisation, and ffmpeg MP3 export. Each emits to a hidden output (stem_files / normalised_audio / mp3_file) that becomes visible only once the click handler returns a populated value. """ components["output_audio"] = gr.Audio( label="Output", type="filepath", interactive=False, elem_classes=["ams-out", "ams-out-audio"], ) with gr.Row(elem_classes=["ams-post-actions"]): components["separate_stems_btn"] = gr.Button( "↯ Separate stems", variant="secondary", elem_classes=["ams-post-btn"], ) components["normalise_btn"] = gr.Button( "▮ Normalise -14 LUFS", variant="secondary", elem_classes=["ams-post-btn"], ) components["mp3_btn"] = gr.Button( "↓ MP3 320k", variant="secondary", elem_classes=["ams-post-btn"], ) components["stem_files"] = gr.Files( label="Stems", visible=False, elem_classes=["ams-stem-files"], ) components["normalised_audio"] = gr.Audio( label="Normalised (-14 LUFS)", type="filepath", interactive=False, visible=False, elem_classes=["ams-out", "ams-out-normalised"], ) components["mp3_file"] = gr.File( label="MP3 download", visible=False, elem_classes=["ams-mp3-file"], ) components["output_meta"] = gr.JSON( label="Metadata", elem_classes=["ams-out", "ams-out-meta"], ) def build_generate_tab() -> dict[str, gr.components.Component]: """Generate tab body: 2-column row (form left, output right). Includes a single-LoRA picker in a collapsed accordion between the duration/vocal-mode row and the Generate button. Advanced / LM-planner / DCW accordions are deferred to M2-M4 and will be added by extending this builder. """ components: dict[str, gr.components.Component] = {} with gr.Row(): # --- FORM column (left, ~60% width) --- with gr.Column(scale=13): components["prompt"] = gr.Textbox( label="Style prompt", placeholder="psytrance, rolling triplet bassline, acid squelch, metallic leads", lines=2, info=tooltips.GENERATE_PROMPT, ) components["lyrics"] = gr.Textbox( label="Lyrics", placeholder="[intro] atmospheric pads\n[verse] ...", lines=6, info=tooltips.GENERATE_LYRICS, ) with gr.Row(): components["duration_s"] = gr.Slider( minimum=5, maximum=240, step=5, value=30, label="Duration (s)", info=tooltips.GENERATE_DURATION, ) components["instrumental"] = gr.Radio( choices=["With vocals", "Instrumental"], value="With vocals", label="Vocal mode", info=tooltips.GENERATE_VOCAL, ) _build_lora_accordion(components) _build_advanced_accordion(components) components["generate_btn"] = gr.Button( "▶ Generate", variant="primary", ) # --- OUTPUT column (right, ~40% width) --- with gr.Column(scale=10): _build_output_panel(components) return components def build_cover_tab() -> dict[str, gr.components.Component]: """Cover tab body: reference audio + new lyrics -> cover in that style. Maps to ACE-Step's ``task_type="cover"`` with the uploaded reference feeding ``reference_audio`` and the strength slider controlling ``audio_cover_strength``. Higher strength clings to the reference; lower lets the new prompt/lyrics drift the timbre. """ components: dict[str, gr.components.Component] = {} with gr.Row(): with gr.Column(scale=13): components["ref_audio"] = gr.Audio( label="Reference audio", type="filepath", sources=["upload"], elem_classes=["ams-input-audio"], ) components["prompt"] = gr.Textbox( label="New style prompt (optional)", placeholder="faster, more aggressive leads", lines=2, info=tooltips.COVER_PROMPT, ) components["lyrics"] = gr.Textbox( label="New lyrics", placeholder="[verse] new lyrics over the reference style", lines=5, info=tooltips.COVER_LYRICS, ) with gr.Row(): components["duration_s"] = gr.Slider( minimum=5, maximum=240, step=5, value=30, label="Duration (s)", info=tooltips.COVER_DURATION, ) components["audio_cover_strength"] = gr.Slider( minimum=0.0, maximum=1.0, step=0.01, value=0.93, label="Cover strength", info=tooltips.COVER_STRENGTH, ) _build_lora_accordion(components) _build_advanced_accordion(components) components["generate_btn"] = gr.Button( "▶ Generate cover", variant="primary", ) with gr.Column(scale=10): _build_output_panel(components) return components def build_extend_tab() -> dict[str, gr.components.Component]: """Extend tab body: seed audio + extension prompt -> continued song. Maps to ACE-Step's ``task_type="repaint"`` with ``src_audio`` set to the uploaded seed and the repaint window pointing past the end of the seed so the model paints new audio after it. The repaint params (``repaint_mode``, ``repaint_strength``, ``latent_crossfade_frames``, ``chunk_mask_mode``, ``wav_crossfade_s``) are surfaced in an experimental accordion because the installed ACE-Step ``GenerationParams`` dataclass doesn't expose them yet — the UI captures them so they're ready to plumb through once upstream adds the fields. """ components: dict[str, gr.components.Component] = {} with gr.Row(): with gr.Column(scale=13): components["seed_audio"] = gr.Audio( label="Seed audio", type="filepath", sources=["upload"], elem_classes=["ams-input-audio"], ) components["extra_prompt"] = gr.Textbox( label="Extension prompt", placeholder="build to climax, layered acid leads", lines=2, info=tooltips.EXTEND_PROMPT, ) components["extension_lyrics"] = gr.Textbox( label="Extension lyrics (optional)", placeholder="[bridge] the drop is coming...", lines=4, info=tooltips.EXTEND_LYRICS, ) with gr.Row(): components["extra_duration_s"] = gr.Slider( minimum=5, maximum=120, step=5, value=60, label="Extra duration (s)", info=tooltips.EXTEND_DURATION, ) components["wav_crossfade_s"] = gr.Slider( minimum=0.0, maximum=5.0, step=0.1, value=2.0, label="WAV crossfade (s)", info=tooltips.EXTEND_CROSSFADE, ) with gr.Accordion( "Repaint params (experimental)", open=False, elem_classes=["ams-experimental"], ): gr.Markdown( "_These knobs are captured in the request but the installed " "ACE-Step dataclass doesn't expose them yet._", elem_classes=["ams-lora-note"], ) components["repaint_mode"] = gr.Dropdown( choices=["balanced", "left", "right"], value="balanced", label="Repaint mode", ) components["repaint_strength"] = gr.Slider( minimum=0.0, maximum=1.0, step=0.05, value=0.5, label="Repaint strength", ) components["latent_crossfade_frames"] = gr.Slider( minimum=0, maximum=30, step=1, value=10, label="Latent crossfade frames", ) components["chunk_mask_mode"] = gr.Dropdown( choices=["auto", "manual"], value="auto", label="Chunk mask", ) _build_lora_accordion(components) _build_advanced_accordion(components) components["generate_btn"] = gr.Button( "▶ Extend", variant="primary", ) with gr.Column(scale=10): _build_output_panel(components) return components def build_edit_tab() -> dict[str, gr.components.Component]: """Edit tab body: source audio + segment + target lyrics -> repaint/morph. Two sub-modes: - ``repaint`` (default): paint over [segment_start_s, segment_end_s] using ACE-Step's repaint task_type. ``segment_start_s`` and ``segment_end_s`` are wired through the params dict to ``repainting_start`` / ``repainting_end`` on the pipeline side. - ``flow_edit``: caption-to-caption morph. The installed ACE-Step ``GenerationParams`` has no ``flow_edit_*`` fields, so this sub-mode falls back to a repaint pass with lower ``audio_cover_strength``. The flow knobs are still captured so they're ready once upstream adds native support. """ components: dict[str, gr.components.Component] = {} with gr.Row(): with gr.Column(scale=13): components["source_audio"] = gr.Audio( label="Source audio", type="filepath", sources=["upload"], elem_classes=["ams-input-audio"], ) components["sub_mode"] = gr.Radio( choices=["repaint", "flow_edit"], value="repaint", label="Edit sub-mode", info=tooltips.EDIT_SUB_MODE, ) components["source_lyrics"] = gr.Textbox( label="Source lyrics", lines=3, info=tooltips.EDIT_SOURCE_LYRICS, ) components["target_lyrics"] = gr.Textbox( label="Target lyrics", placeholder="[chorus] new chorus replaces the old", lines=3, info=tooltips.EDIT_TARGET_LYRICS, ) with gr.Row(): components["segment_start_s"] = gr.Number( value=0.0, label="Segment start (s)", precision=1, info=tooltips.EDIT_SEGMENT_START, ) components["segment_end_s"] = gr.Number( value=30.0, label="Segment end (s)", precision=1, info=tooltips.EDIT_SEGMENT_END, ) with gr.Accordion( "Repaint options (experimental)", open=False, elem_classes=["ams-experimental"], ): gr.Markdown( "_These knobs are captured in the request but the installed " "ACE-Step dataclass doesn't expose them yet._", elem_classes=["ams-lora-note"], ) components["repaint_strength"] = gr.Slider( minimum=0.0, maximum=1.0, step=0.05, value=0.5, label="Repaint strength", ) components["repaint_mode"] = gr.Dropdown( choices=["balanced", "left", "right"], value="balanced", label="Repaint mode", ) with gr.Accordion( "Flow-morph options (experimental)", open=False, elem_classes=["ams-experimental"], ): gr.Markdown( "_flow_edit sub-mode currently falls back to a repaint pass with " "lower audio_cover_strength. flow-specific params are captured " "but not yet wired._", elem_classes=["ams-lora-note"], ) components["flow_source_caption"] = gr.Textbox( label="Source caption", placeholder="acoustic ballad, gentle piano", ) components["flow_n_min"] = gr.Slider( minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="n_min" ) components["flow_n_max"] = gr.Slider( minimum=0.0, maximum=1.0, value=1.0, step=0.05, label="n_max" ) components["flow_n_avg"] = gr.Slider(minimum=1, maximum=5, value=1, step=1, label="n_avg") _build_lora_accordion(components) _build_advanced_accordion(components) components["generate_btn"] = gr.Button( "▶ Apply edit", variant="primary", ) with gr.Column(scale=10): _build_output_panel(components) return components def build_lyrics_tab() -> dict[str, gr.components.Component]: """Lyrics tab body: Qwen 2.5 7B drafts structurally-tagged lyrics. Compact 2-column row: form on the left (brief / structure / language / line counts / tone / rhyme + collapsed LM-params accordion), output on the right (read-only multi-line textbox + ``Use these in Generate`` cross-tab CTA + bordered JSON metadata panel). The output textbox carries ``elem_classes=["ams-lyrics-output"]`` so the Brutalist Mono treatment in ``theme.CSS`` (mono font, 12 px, 280 px min-height) applies. The "Use in Generate" button is tagged ``ams-lyrics-use-btn`` so it gets a small top margin instead of sitting flush against the textbox. Does NOT include the LoRA accordion — Qwen-7B has no LoRA picker and the audio-mode LoRA semantics don't apply here. """ c: dict[str, gr.components.Component] = {} with gr.Row(): # --- FORM column (left) --- with gr.Column(scale=12): c["brief"] = gr.Textbox( label="Brief", lines=4, placeholder=("Describe the song. Tone, mood, references, specific images, lines to avoid…"), info=tooltips.LYRICS_BRIEF, ) with gr.Row(): c["structure"] = gr.Textbox( label="Structure", value="intro, verse, chorus, verse, chorus, bridge, chorus, outro", info=tooltips.LYRICS_STRUCTURE, ) c["language"] = gr.Dropdown( choices=["en", "zh", "ja", "ko", "es", "fr", "de"], value="en", label="Language", info=tooltips.LYRICS_LANGUAGE, ) with gr.Row(): c["verse_lines"] = gr.Slider( minimum=2, maximum=10, value=6, step=1, label="Verse lines", ) c["chorus_lines"] = gr.Slider( minimum=2, maximum=8, value=4, step=1, label="Chorus lines", ) c["bridge_lines"] = gr.Slider( minimum=1, maximum=6, value=2, step=1, label="Bridge lines", ) c["tone"] = gr.Textbox( label="Tone / mood", placeholder="euphoric, hypnotic, transcendent, not cheesy", info=tooltips.LYRICS_TONE, ) c["rhyme"] = gr.Radio( choices=["strict", "loose", "none"], value="loose", label="Rhyme", ) with gr.Accordion( "LM parameters", open=False, elem_classes=["ams-lm-accordion"], ): c["temperature"] = gr.Slider( minimum=0.0, maximum=2.0, value=0.85, step=0.05, label="Temperature", info=tooltips.LYRICS_TEMPERATURE, ) c["top_p"] = gr.Slider( minimum=0.0, maximum=1.0, value=0.9, step=0.05, label="Top-p", info=tooltips.LYRICS_TOP_P, ) c["top_k"] = gr.Slider( minimum=0, maximum=200, value=40, step=1, label="Top-k", info=tooltips.LYRICS_TOP_K, ) c["max_new_tokens"] = gr.Slider( minimum=100, maximum=2000, value=600, step=50, label="Max new tokens", info=tooltips.LYRICS_MAX_TOKENS, ) c["seed"] = gr.Number( value=42, precision=0, label="Seed", ) c["draft_btn"] = gr.Button( "▶ Draft lyrics", variant="primary", ) # --- OUTPUT column (right) --- with gr.Column(scale=10): # NOTE: gr.Textbox in Gradio 6.14 doesn't accept ``show_copy_button`` # (the kwarg landed in a later 6.x). The Brutalist Mono textbox already # exposes a native selection + browser copy via Cmd-A / Cmd-C; the # copy-button affordance is therefore a no-op miss here. c["lyrics_output"] = gr.Textbox( label="Draft", lines=14, interactive=False, elem_classes=["ams-lyrics-output"], ) c["use_in_generate_btn"] = gr.Button( "↑ Use these in Generate", variant="primary", elem_classes=["ams-lyrics-use-btn"], ) c["meta_output"] = gr.JSON( label="Metadata", elem_classes=["ams-out", "ams-out-meta"], ) return c